yosina 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +36 -0
- data/Gemfile +6 -0
- data/README.ja.md +229 -0
- data/README.md +229 -0
- data/Rakefile +30 -0
- data/codegen/dataset.rb +215 -0
- data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
- data/codegen/emitters/combined_transliterator_data.rb +28 -0
- data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
- data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
- data/codegen/emitters/simple_transliterator.rb +76 -0
- data/codegen/emitters/utils.rb +45 -0
- data/codegen/emitters.rb +8 -0
- data/codegen/main.rb +109 -0
- data/lib/yosina/char.rb +65 -0
- data/lib/yosina/chars.rb +152 -0
- data/lib/yosina/recipes.rb +359 -0
- data/lib/yosina/transliterator.rb +49 -0
- data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
- data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
- data/lib/yosina/transliterators/combined.rb +52 -0
- data/lib/yosina/transliterators/combined_data.rb +495 -0
- data/lib/yosina/transliterators/hira_kata.rb +106 -0
- data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
- data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
- data/lib/yosina/transliterators/hyphens.rb +83 -0
- data/lib/yosina/transliterators/hyphens_data.rb +60 -0
- data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
- data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
- data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
- data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
- data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
- data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
- data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
- data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
- data/lib/yosina/transliterators/radicals.rb +361 -0
- data/lib/yosina/transliterators/spaces.rb +79 -0
- data/lib/yosina/transliterators.rb +57 -0
- data/lib/yosina/version.rb +5 -0
- data/lib/yosina.rb +62 -0
- data/yosina.gemspec +41 -0
- metadata +159 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 79e70c5202973c9c7c6e2c5b5de4538422eb9323071dbaabc6563be241129a44
|
|
4
|
+
data.tar.gz: bf615675a3b77c5b330ac5da9132e2df90d92911d3763ae8c3bcc612aa894289
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 9acbf40e48148bd8cd6db88b02df70364a840c5b8a283df61256e838db9990bff8b73b1a30e95e892c20cbe5542a8d21d870284a7779cbc833043fd3aef612b7
|
|
7
|
+
data.tar.gz: 64e0f480547d54d14318f677ac459905209b6365096623cfe849a21740a1150a3c263c950b37f62dd30b82dce55b1a2e89e2c5def130736a1ac4b44d1c21acc4
|
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
plugins:
|
|
2
|
+
# - rubocop-rake
|
|
3
|
+
- rubocop-minitest
|
|
4
|
+
AllCops:
|
|
5
|
+
Include:
|
|
6
|
+
- lib/**/*.rb
|
|
7
|
+
- codegen/**/*.rb
|
|
8
|
+
- Gemfile
|
|
9
|
+
- "*.gemspec"
|
|
10
|
+
Exclude:
|
|
11
|
+
- lib/yosina/transliterators/hyphens_data.rb
|
|
12
|
+
- lib/yosina/transliterators/ivs_svs_base_data.rb
|
|
13
|
+
- lib/yosina/transliterators/combined_data.rb
|
|
14
|
+
- lib/yosina/transliterators/circled_or_squared_data.rb
|
|
15
|
+
- vendor/**/*
|
|
16
|
+
Naming/VariableNumber:
|
|
17
|
+
Exclude:
|
|
18
|
+
- codegen/**/*.rb
|
|
19
|
+
Metrics/ModuleLength:
|
|
20
|
+
Enabled: false
|
|
21
|
+
Metrics/MethodLength:
|
|
22
|
+
Enabled: false
|
|
23
|
+
Metrics/BlockLength:
|
|
24
|
+
Enabled: false
|
|
25
|
+
Metrics/ClassLength:
|
|
26
|
+
Max: 200
|
|
27
|
+
Metrics/AbcSize:
|
|
28
|
+
Enabled: false
|
|
29
|
+
Metrics/CyclomaticComplexity:
|
|
30
|
+
Enabled: false
|
|
31
|
+
Metrics/PerceivedComplexity:
|
|
32
|
+
Enabled: false
|
|
33
|
+
Metrics/BlockNesting:
|
|
34
|
+
Enabled: false
|
|
35
|
+
Style/WordArray:
|
|
36
|
+
Enabled: false
|
data/Gemfile
ADDED
data/README.ja.md
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# Yosina Ruby
|
|
2
|
+
|
|
3
|
+
Yosina日本語テキスト翻字ライブラリのRubyポート。
|
|
4
|
+
|
|
5
|
+
## 概要
|
|
6
|
+
|
|
7
|
+
Yosinaは、日本語テキスト処理でよく必要とされる様々なテキスト正規化および変換機能を提供する日本語テキスト翻字ライブラリです。
|
|
8
|
+
|
|
9
|
+
## 使用方法
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
require 'yosina'
|
|
13
|
+
|
|
14
|
+
# レシピを使用してトランスリテレータを作成
|
|
15
|
+
recipe = Yosina::TransliterationRecipe.new(
|
|
16
|
+
replace_spaces: true,
|
|
17
|
+
kanji_old_new: true,
|
|
18
|
+
replace_circled_or_squared_characters: true,
|
|
19
|
+
replace_combined_characters: true,
|
|
20
|
+
hira_kata: "hira-to-kata", # ひらがなをカタカナに変換
|
|
21
|
+
replace_japanese_iteration_marks: true, # 繰り返し記号を展開
|
|
22
|
+
to_fullwidth: true
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
transliterator = Yosina.make_transliterator(recipe)
|
|
26
|
+
|
|
27
|
+
# 様々な特殊文字でテキストを翻字
|
|
28
|
+
input = "①②③　ⒶⒷⒸ　㍿㍑㌠㋿" # 丸囲み数字、文字、表意文字空白、結合文字
|
|
29
|
+
result = transliterator.call(input)
|
|
30
|
+
puts result # "(1)(2)(3) (A)(B)(C) 株式会社リットルサンチーム令和"
|
|
31
|
+
|
|
32
|
+
# 旧字体を新字体に変換
|
|
33
|
+
old_kanji = "舊字體"
|
|
34
|
+
result = transliterator.call(old_kanji)
|
|
35
|
+
puts result # "旧字体"
|
|
36
|
+
|
|
37
|
+
# 半角カタカナを全角に変換
|
|
38
|
+
half_width = "ﾃｽﾄﾓｼﾞﾚﾂ"
|
|
39
|
+
result = transliterator.call(half_width)
|
|
40
|
+
puts result # "テストモジレツ"
|
|
41
|
+
|
|
42
|
+
# ひらがなからカタカナへの変換と繰り返し記号のデモ
|
|
43
|
+
mixed_text = "学問のすゝめ"
|
|
44
|
+
result = transliterator.call(mixed_text)
|
|
45
|
+
puts result # "学問ノススメ"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### 設定を使用した高度な使用方法
|
|
49
|
+
|
|
50
|
+
```ruby
|
|
51
|
+
require 'yosina'
|
|
52
|
+
|
|
53
|
+
# 特定の設定でトランスリテレータを作成
|
|
54
|
+
configs = [
|
|
55
|
+
Yosina::TransliteratorConfig.new('spaces'),
|
|
56
|
+
Yosina::TransliteratorConfig.new('kanji-old-new'),
|
|
57
|
+
Yosina::TransliteratorConfig.new('radicals'),
|
|
58
|
+
Yosina::TransliteratorConfig.new('circled-or-squared'),
|
|
59
|
+
Yosina::TransliteratorConfig.new('combined'),
|
|
60
|
+
Yosina::TransliteratorConfig.new('hira-kata', { mode: 'kata-to-hira' }), # カタカナをひらがなに変換
|
|
61
|
+
Yosina::TransliteratorConfig.new('japanese-iteration-marks') # 々、ゝゞ、ヽヾなどの繰り返し記号を展開
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
transliterator = Yosina.make_transliterator(configs)
|
|
65
|
+
|
|
66
|
+
# 新しい変換を含む様々な変換の例
|
|
67
|
+
input_text = "カタカナでの時々の佐々木さん"
|
|
68
|
+
result = transliterator.call(input_text)
|
|
69
|
+
puts result # "かたかなでの時時の佐佐木さん"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### 文字列名を使用
|
|
73
|
+
|
|
74
|
+
```ruby
|
|
75
|
+
require 'yosina'
|
|
76
|
+
|
|
77
|
+
# 文字列名での簡略化された設定
|
|
78
|
+
configs = ['spaces', 'kanji-old-new', 'radicals']
|
|
79
|
+
|
|
80
|
+
transliterator = Yosina.make_transliterator(configs)
|
|
81
|
+
result = transliterator.call("日本語のテキスト")
|
|
82
|
+
puts result
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### 個別のトランスリテレータを使用
|
|
86
|
+
|
|
87
|
+
```ruby
|
|
88
|
+
require 'yosina'
|
|
89
|
+
|
|
90
|
+
# 丸囲み・角囲みトランスリテレータを作成
|
|
91
|
+
circled_factory = Yosina::Transliterators::CircledOrSquared
|
|
92
|
+
circled_transliterator = circled_factory.call
|
|
93
|
+
|
|
94
|
+
chars = Yosina::Chars.build_char_array("①②③ⒶⒷⒸ")
|
|
95
|
+
result_chars = circled_transliterator.call(chars)
|
|
96
|
+
output = Yosina::Chars.from_chars(result_chars)
|
|
97
|
+
puts output # "(1)(2)(3)(A)(B)(C)"
|
|
98
|
+
|
|
99
|
+
# 結合トランスリテレータを作成
|
|
100
|
+
combined_factory = Yosina::Transliterators::Combined
|
|
101
|
+
combined_transliterator = combined_factory.call
|
|
102
|
+
|
|
103
|
+
chars2 = Yosina::Chars.build_char_array("㍿㍑㌠㋿") # 結合文字
|
|
104
|
+
result_chars2 = combined_transliterator.call(chars2)
|
|
105
|
+
output2 = Yosina::Chars.from_chars(result_chars2)
|
|
106
|
+
puts output2 # "株式会社リットルサンチーム令和"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## インストール
|
|
110
|
+
|
|
111
|
+
アプリケーションのGemfileに以下の行を追加:
|
|
112
|
+
|
|
113
|
+
```ruby
|
|
114
|
+
gem 'yosina'
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
その後、実行:
|
|
118
|
+
|
|
119
|
+
$ bundle install
|
|
120
|
+
|
|
121
|
+
または自分でインストール:
|
|
122
|
+
|
|
123
|
+
$ gem install yosina
|
|
124
|
+
|
|
125
|
+
## 利用可能なトランスリテレータ
|
|
126
|
+
|
|
127
|
+
### 1. **丸囲み・角囲み文字** (`circled-or-squared`)
|
|
128
|
+
丸囲みや角囲みの文字を通常の文字に変換します。
|
|
129
|
+
- オプション: `templates` (カスタムレンダリング)、`includeEmojis` (絵文字を含める)
|
|
130
|
+
- 例: `①②③` → `(1)(2)(3)`、`㊙㊗` → `(秘)(祝)`
|
|
131
|
+
|
|
132
|
+
### 2. **結合文字** (`combined`)
|
|
133
|
+
結合文字を個別の文字シーケンスに展開します。
|
|
134
|
+
- 例: `㍻` (平成) → `平成`、`㈱` → `(株)`
|
|
135
|
+
|
|
136
|
+
### 3. **ひらがな・カタカナ合成** (`hira-kata-composition`)
|
|
137
|
+
分解されたひらがなとカタカナを合成された等価文字に結合します。
|
|
138
|
+
- オプション: `composeNonCombiningMarks` (非結合マークを合成)
|
|
139
|
+
- 例: `か + ゙` → `が`、`ヘ + ゜` → `ペ`
|
|
140
|
+
|
|
141
|
+
### 4. **ひらがな・カタカナ** (`hira-kata`)
|
|
142
|
+
ひらがなとカタカナの間で双方向に変換します。
|
|
143
|
+
- オプション: `mode` ("hira-to-kata" または "kata-to-hira")
|
|
144
|
+
- 例: `ひらがな` → `ヒラガナ` (hira-to-kata)
|
|
145
|
+
|
|
146
|
+
### 5. **ハイフン** (`hyphens`)
|
|
147
|
+
様々なダッシュ・ハイフン記号を日本語で一般的に使用されるものに置き換えます。
|
|
148
|
+
- オプション: `precedence` (マッピング優先順位)
|
|
149
|
+
- 利用可能なマッピング: "ascii"、"jisx0201"、"jisx0208_90"、"jisx0208_90_windows"、"jisx0208_verbatim"
|
|
150
|
+
- 例: `2019—2020` (emダッシュ) → `2019-2020`
|
|
151
|
+
|
|
152
|
+
### 6. **表意文字注釈** (`ideographic-annotations`)
|
|
153
|
+
伝統的な中国語から日本語への翻訳で使用される表意文字注釈を置き換えます。
|
|
154
|
+
- 例: `㆖㆘` → `上下`
|
|
155
|
+
|
|
156
|
+
### 7. **IVS-SVSベース** (`ivs-svs-base`)
|
|
157
|
+
表意文字異体字セレクタ(IVS)と標準化異体字セレクタ(SVS)を処理します。
|
|
158
|
+
- オプション: `charset`、`mode` ("ivs-or-svs" または "base")、`preferSVS`、`dropSelectorsAltogether`
|
|
159
|
+
- 例: `葛󠄀` (葛 + IVS) → `葛`
|
|
160
|
+
|
|
161
|
+
### 8. **日本語繰り返し記号** (`japanese-iteration-marks`)
|
|
162
|
+
繰り返し記号を前の文字を繰り返すことで展開します。
|
|
163
|
+
- 例: `時々` → `時時`、`いすゞ` → `いすず`
|
|
164
|
+
|
|
165
|
+
### 9. **JIS X 0201および類似** (`jisx0201-and-alike`)
|
|
166
|
+
半角・全角文字変換を処理します。
|
|
167
|
+
- オプション: `fullwidthToHalfwidth`、`convertGL` (英数字/記号)、`convertGR` (カタカナ)、`u005cAsYenSign`
|
|
168
|
+
- 例: `ＡＢＣ１２３` → `ABC123`、`ｶﾀｶﾅ` → `カタカナ`
|
|
169
|
+
|
|
170
|
+
### 10. **旧字体・新字体** (`kanji-old-new`)
|
|
171
|
+
旧字体の漢字を新字体に変換します。
|
|
172
|
+
- 例: `舊字體の變換` → `旧字体の変換`
|
|
173
|
+
|
|
174
|
+
### 11. **数学英数記号** (`mathematical-alphanumerics`)
|
|
175
|
+
数学英数記号を通常のASCIIに正規化します。
|
|
176
|
+
- 例: `𝐀𝐁𝐂` (数学太字) → `ABC`
|
|
177
|
+
|
|
178
|
+
### 12. **長音記号** (`prolonged-sound-marks`)
|
|
179
|
+
ハイフンと長音記号の間の文脈的な変換を処理します。
|
|
180
|
+
- オプション: `skipAlreadyTransliteratedChars`、`allowProlongedHatsuon`、`allowProlongedSokuon`、`replaceProlongedMarksFollowingAlnums`
|
|
181
|
+
- 例: `イ−ハト−ヴォ` (ハイフン付き) → `イーハトーヴォ` (長音記号)
|
|
182
|
+
|
|
183
|
+
### 13. **部首** (`radicals`)
|
|
184
|
+
CJK部首文字を対応する表意文字に変換します。
|
|
185
|
+
- 例: `⾔⾨⾷` (康熙部首) → `言門食`
|
|
186
|
+
|
|
187
|
+
### 14. **空白** (`spaces`)
|
|
188
|
+
様々なUnicode空白文字を標準ASCII空白に正規化します。
|
|
189
|
+
- 例: `A　B` (表意文字空白) → `A B`
|
|
190
|
+
|
|
191
|
+
## 開発
|
|
192
|
+
|
|
193
|
+
リポジトリをチェックアウトした後、`bundle install`を実行して依存関係をインストールします。
|
|
194
|
+
|
|
195
|
+
### コード生成
|
|
196
|
+
|
|
197
|
+
一部のトランスリテレータはデータファイルから生成されます:
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
rake codegen
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
これにより、`../data`ディレクトリのJSONデータファイルからトランスリテレータが生成されます。
|
|
204
|
+
|
|
205
|
+
### テスト
|
|
206
|
+
|
|
207
|
+
テストスイートを実行:
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
rake test
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
または特定のテストを実行:
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
ruby test/test_basic.rb
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## 要件
|
|
220
|
+
|
|
221
|
+
- Ruby 3.0以降のバージョン
|
|
222
|
+
|
|
223
|
+
## 貢献
|
|
224
|
+
|
|
225
|
+
バグレポートとプルリクエストは、GitHubのhttps://github.com/yosina-lib/yosinaで歓迎します。
|
|
226
|
+
|
|
227
|
+
## ライセンス
|
|
228
|
+
|
|
229
|
+
gemは[MITライセンス](https://opensource.org/licenses/MIT)の条件の下でオープンソースとして利用可能です。
|
data/README.md
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# Yosina Ruby
|
|
2
|
+
|
|
3
|
+
A Ruby port of the Yosina Japanese text transliteration library.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Yosina is a library for Japanese text transliteration that provides various text normalization and conversion features commonly needed when processing Japanese text.
|
|
8
|
+
|
|
9
|
+
## Usage
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
require 'yosina'
|
|
13
|
+
|
|
14
|
+
# Create a transliterator using a recipe
|
|
15
|
+
recipe = Yosina::TransliterationRecipe.new(
|
|
16
|
+
replace_spaces: true,
|
|
17
|
+
kanji_old_new: true,
|
|
18
|
+
replace_circled_or_squared_characters: true,
|
|
19
|
+
replace_combined_characters: true,
|
|
20
|
+
hira_kata: "hira-to-kata", # Convert hiragana to katakana
|
|
21
|
+
replace_japanese_iteration_marks: true, # Expand iteration marks
|
|
22
|
+
to_fullwidth: true
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
transliterator = Yosina.make_transliterator(recipe)
|
|
26
|
+
|
|
27
|
+
# Use it to transliterate text with various special characters
|
|
28
|
+
input = "①②③　ⒶⒷⒸ　㍿㍑㌠㋿" # circled numbers, letters, ideographic space, combined characters
|
|
29
|
+
result = transliterator.call(input)
|
|
30
|
+
puts result # "(1)(2)(3) (A)(B)(C) 株式会社リットルサンチーム令和"
|
|
31
|
+
|
|
32
|
+
# Convert old kanji to new
|
|
33
|
+
old_kanji = "舊字體"
|
|
34
|
+
result = transliterator.call(old_kanji)
|
|
35
|
+
puts result # "旧字体"
|
|
36
|
+
|
|
37
|
+
# Convert half-width katakana to full-width
|
|
38
|
+
half_width = "ﾃｽﾄﾓｼﾞﾚﾂ"
|
|
39
|
+
result = transliterator.call(half_width)
|
|
40
|
+
puts result # "テストモジレツ"
|
|
41
|
+
|
|
42
|
+
# Demonstrate hiragana to katakana conversion with iteration marks
|
|
43
|
+
mixed_text = "学問のすゝめ"
|
|
44
|
+
result = transliterator.call(mixed_text)
|
|
45
|
+
puts result # "学問ノススメ"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Advanced Usage with Configs
|
|
49
|
+
|
|
50
|
+
```ruby
|
|
51
|
+
require 'yosina'
|
|
52
|
+
|
|
53
|
+
# Create transliterator with specific configurations
|
|
54
|
+
configs = [
|
|
55
|
+
Yosina::TransliteratorConfig.new('spaces'),
|
|
56
|
+
Yosina::TransliteratorConfig.new('kanji-old-new'),
|
|
57
|
+
Yosina::TransliteratorConfig.new('radicals'),
|
|
58
|
+
Yosina::TransliteratorConfig.new('circled-or-squared'),
|
|
59
|
+
Yosina::TransliteratorConfig.new('combined'),
|
|
60
|
+
Yosina::TransliteratorConfig.new('hira-kata', { mode: 'kata-to-hira' }), # Convert katakana to hiragana
|
|
61
|
+
Yosina::TransliteratorConfig.new('japanese-iteration-marks') # Expand iteration marks like 々, ゝゞ, ヽヾ
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
transliterator = Yosina.make_transliterator(configs)
|
|
65
|
+
|
|
66
|
+
# Example with various transformations including the new ones
|
|
67
|
+
input_text = "カタカナでの時々の佐々木さん"
|
|
68
|
+
result = transliterator.call(input_text)
|
|
69
|
+
puts result # "かたかなでの時時の佐佐木さん"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Using String Names
|
|
73
|
+
|
|
74
|
+
```ruby
|
|
75
|
+
require 'yosina'
|
|
76
|
+
|
|
77
|
+
# Simplified configuration with string names
|
|
78
|
+
configs = ['spaces', 'kanji-old-new', 'radicals']
|
|
79
|
+
|
|
80
|
+
transliterator = Yosina.make_transliterator(configs)
|
|
81
|
+
result = transliterator.call("some japanese text")
|
|
82
|
+
puts result
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Using Individual Transliterators
|
|
86
|
+
|
|
87
|
+
```ruby
|
|
88
|
+
require 'yosina'
|
|
89
|
+
|
|
90
|
+
# Create a circled-or-squared transliterator
|
|
91
|
+
circled_factory = Yosina::Transliterators::CircledOrSquared
|
|
92
|
+
circled_transliterator = circled_factory.call
|
|
93
|
+
|
|
94
|
+
chars = Yosina::Chars.build_char_array("①②③ⒶⒷⒸ")
|
|
95
|
+
result_chars = circled_transliterator.call(chars)
|
|
96
|
+
output = Yosina::Chars.from_chars(result_chars)
|
|
97
|
+
puts output # "(1)(2)(3)(A)(B)(C)"
|
|
98
|
+
|
|
99
|
+
# Create a combined transliterator
|
|
100
|
+
combined_factory = Yosina::Transliterators::Combined
|
|
101
|
+
combined_transliterator = combined_factory.call
|
|
102
|
+
|
|
103
|
+
chars2 = Yosina::Chars.build_char_array("㍿㍑㌠㋿") # combined characters
|
|
104
|
+
result_chars2 = combined_transliterator.call(chars2)
|
|
105
|
+
output2 = Yosina::Chars.from_chars(result_chars2)
|
|
106
|
+
puts output2 # "株式会社リットルサンチーム令和"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Requirements
|
|
110
|
+
|
|
111
|
+
- Ruby 3.0 or later
|
|
112
|
+
|
|
113
|
+
## Installation
|
|
114
|
+
|
|
115
|
+
Add this line to your application's Gemfile:
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
gem 'yosina'
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
And then execute:
|
|
122
|
+
|
|
123
|
+
$ bundle install
|
|
124
|
+
|
|
125
|
+
Or install it yourself as:
|
|
126
|
+
|
|
127
|
+
$ gem install yosina
|
|
128
|
+
|
|
129
|
+
## Available Transliterators
|
|
130
|
+
|
|
131
|
+
### 1. **Circled or Squared** (`circled-or-squared`)
|
|
132
|
+
Converts circled or squared characters to their plain equivalents.
|
|
133
|
+
- Options: `templates` (custom rendering), `includeEmojis` (include emoji characters)
|
|
134
|
+
- Example: `①②③` → `(1)(2)(3)`, `㊙㊗` → `(秘)(祝)`
|
|
135
|
+
|
|
136
|
+
### 2. **Combined** (`combined`)
|
|
137
|
+
Expands combined characters into their individual character sequences.
|
|
138
|
+
- Example: `㍻` (Heisei era) → `平成`, `㈱` → `(株)`
|
|
139
|
+
|
|
140
|
+
### 3. **Hiragana-Katakana Composition** (`hira-kata-composition`)
|
|
141
|
+
Combines decomposed hiragana and katakana sequences into their precomposed equivalents.
|
|
142
|
+
- Options: `composeNonCombiningMarks` (compose non-combining marks)
|
|
143
|
+
- Example: `か + ゙` → `が`, `ヘ + ゜` → `ペ`
|
|
144
|
+
|
|
145
|
+
### 4. **Hiragana-Katakana** (`hira-kata`)
|
|
146
|
+
Converts between hiragana and katakana scripts bidirectionally.
|
|
147
|
+
- Options: `mode` ("hira-to-kata" or "kata-to-hira")
|
|
148
|
+
- Example: `ひらがな` → `ヒラガナ` (hira-to-kata)
|
|
149
|
+
|
|
150
|
+
### 5. **Hyphens** (`hyphens`)
|
|
151
|
+
Replaces various dash/hyphen symbols with common ones used in Japanese.
|
|
152
|
+
- Options: `precedence` (mapping priority order)
|
|
153
|
+
- Available mappings: "ascii", "jisx0201", "jisx0208_90", "jisx0208_90_windows", "jisx0208_verbatim"
|
|
154
|
+
- Example: `2019—2020` (em dash) → `2019-2020`
|
|
155
|
+
|
|
156
|
+
### 6. **Ideographic Annotations** (`ideographic-annotations`)
|
|
157
|
+
Replaces ideographic annotations used in traditional Chinese-to-Japanese translation.
|
|
158
|
+
- Example: `㆖㆘` → `上下`
|
|
159
|
+
|
|
160
|
+
### 7. **IVS-SVS Base** (`ivs-svs-base`)
|
|
161
|
+
Handles Ideographic and Standardized Variation Selectors.
|
|
162
|
+
- Options: `charset`, `mode` ("ivs-or-svs" or "base"), `preferSVS`, `dropSelectorsAltogether`
|
|
163
|
+
- Example: `葛󠄀` (葛 + IVS) → `葛`
|
|
164
|
+
|
|
165
|
+
### 8. **Japanese Iteration Marks** (`japanese-iteration-marks`)
|
|
166
|
+
Expands iteration marks by repeating the preceding character.
|
|
167
|
+
- Example: `時々` → `時時`, `いすゞ` → `いすず`
|
|
168
|
+
|
|
169
|
+
### 9. **JIS X 0201 and Alike** (`jisx0201-and-alike`)
|
|
170
|
+
Handles half-width/full-width character conversion.
|
|
171
|
+
- Options: `fullwidthToHalfwidth`, `convertGL` (alphanumerics/symbols), `convertGR` (katakana), `u005cAsYenSign`
|
|
172
|
+
- Example: `ＡＢＣ１２３` → `ABC123`, `ｶﾀｶﾅ` → `カタカナ`
|
|
173
|
+
|
|
174
|
+
### 10. **Kanji Old-New** (`kanji-old-new`)
|
|
175
|
+
Converts old-style kanji (旧字体) to modern forms (新字体).
|
|
176
|
+
- Example: `舊字體の變換` → `旧字体の変換`
|
|
177
|
+
|
|
178
|
+
### 11. **Mathematical Alphanumerics** (`mathematical-alphanumerics`)
|
|
179
|
+
Normalizes mathematical alphanumeric symbols to plain ASCII.
|
|
180
|
+
- Example: `𝐀𝐁𝐂` (mathematical bold) → `ABC`
|
|
181
|
+
|
|
182
|
+
### 12. **Prolonged Sound Marks** (`prolonged-sound-marks`)
|
|
183
|
+
Handles contextual conversion between hyphens and prolonged sound marks.
|
|
184
|
+
- Options: `skipAlreadyTransliteratedChars`, `allowProlongedHatsuon`, `allowProlongedSokuon`, `replaceProlongedMarksFollowingAlnums`
|
|
185
|
+
- Example: `イ−ハト−ヴォ` (with hyphen) → `イーハトーヴォ` (prolonged mark)
|
|
186
|
+
|
|
187
|
+
### 13. **Radicals** (`radicals`)
|
|
188
|
+
Converts CJK radical characters to their corresponding ideographs.
|
|
189
|
+
- Example: `⾔⾨⾷` (Kangxi radicals) → `言門食`
|
|
190
|
+
|
|
191
|
+
### 14. **Spaces** (`spaces`)
|
|
192
|
+
Normalizes various Unicode space characters to standard ASCII space.
|
|
193
|
+
- Example: `A　B` (ideographic space) → `A B`
|
|
194
|
+
|
|
195
|
+
## Development
|
|
196
|
+
|
|
197
|
+
After checking out the repo, run `bundle install` to install dependencies.
|
|
198
|
+
|
|
199
|
+
### Code Generation
|
|
200
|
+
|
|
201
|
+
Some transliterators are generated from data files:
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
rake codegen
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
This generates transliterators from the JSON data files in the `../data` directory.
|
|
208
|
+
|
|
209
|
+
### Testing
|
|
210
|
+
|
|
211
|
+
Run the test suite with:
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
rake test
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Or run specific tests:
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
ruby test/test_basic.rb
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Contributing
|
|
224
|
+
|
|
225
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/yosina-lib/yosina.
|
|
226
|
+
|
|
227
|
+
## License
|
|
228
|
+
|
|
229
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'bundler/gem_tasks'
|
|
4
|
+
require 'rake/testtask'
|
|
5
|
+
require 'rdoc/task'
|
|
6
|
+
require 'rubocop/rake_task'
|
|
7
|
+
require 'yard'
|
|
8
|
+
|
|
9
|
+
RuboCop::RakeTask.new
|
|
10
|
+
|
|
11
|
+
Rake::TestTask.new(:test) do |t|
|
|
12
|
+
t.libs << 'test'
|
|
13
|
+
t.libs << 'lib'
|
|
14
|
+
t.test_files = FileList['test/**/test_*.rb']
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
RDoc::Task.new do |rd|
|
|
18
|
+
rd.main = 'README.md'
|
|
19
|
+
rd.rdoc_files.include('README.md', 'lib/**/*.rb')
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
YARD::Rake::YardocTask.new do |t|
|
|
23
|
+
t.files = ['lib/**/*.rb']
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
task :codegen do
|
|
27
|
+
ruby 'codegen/main.rb'
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
task default: :test
|