yosina 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e0feecfd822d02f104629ea622864a55c77771f2af69d54f19a564a06e45d6cf
4
- data.tar.gz: 04fba6b2b2db15a96f9574771fbe8e4c71debacfa62b9d7480ed178ee96c3868
3
+ metadata.gz: e2f55aa031748a0855edb5e5d75e8f0aa61f2c1636432bf1d2582aaff0acbed7
4
+ data.tar.gz: 7297e0aa25fd447303bc379e7206b76589bf1ffa970e767a11d82e04f92a480b
5
5
  SHA512:
6
- metadata.gz: 0aff88194c04e9316f69925fb4e6499d6e5dc7ae1c168136fdbc41ea01e38b2760cb9e2e2315879eb60eed18396f8405a2ac800e067469a1e9f6f015acc44010
7
- data.tar.gz: 9212dc8a91402ef11255a33303a09a371a0a5ec15ec1d6f4d2b86f3f4afbfa62255ace9ee649e29425890c1c6df0085356680408c6faab4968c57f4daaf32275
6
+ metadata.gz: 1f9b3046e6e9ecd17080793bff4499af828826fb47fc1990aa5576d246e3a00d1030ef0694698b4e4b954f2de28ef95e7ebcccdcce6fb01823506232149b17a2
7
+ data.tar.gz: 81abcc7866ccae0ad2966a7593f94cd5f4b4535709c752c369b20382e650186f8e85647e2a676ace9d586d97bd7a47ec36d2e7e264816df53ad6afe3a616d68f
data/README.ja.md CHANGED
@@ -188,6 +188,23 @@ CJK部首文字を対応する表意文字に変換します。
188
188
  様々なUnicode空白文字を標準ASCII空白に正規化します。
189
189
  - 例: `A B` (表意文字空白) → `A B`
190
190
 
191
+ ### 15. **ローマ数字** (`roman-numerals`)
192
+ Unicodeのローマ数字文字を対応するASCII文字に変換します。
193
+ - 例: `Ⅰ Ⅱ Ⅲ` → `I II III`、`ⅰ ⅱ ⅲ` → `i ii iii`
194
+
195
+ ### 16. **小書きひらがな・カタカナ** (`small-hirakatas`)
196
+ 小書きのひらがな・カタカナを通常サイズの等価文字に変換します。
197
+ - 例: `ぁぃぅ` → `あいう`、`ァィゥ` → `アイウ`
198
+
199
+ ### 17. **変体仮名** (`archaic-hirakatas`)
200
+ 変体仮名(古い仮名文字)を現代のひらがな・カタカナに変換します。
201
+ - 例: `𛀁` → `え`
202
+
203
+ ### 18. **歴史的仮名** (`historical-hirakatas`)
204
+ 歴史的なひらがな・カタカナを現代の等価文字に変換します。
205
+ - オプション: `hiraganas` ("simple"、"decompose"、"skip"。既定値は "simple")、`katakanas` ("simple"、"decompose"、"skip"。既定値は "simple")、`voiced_katakanas` ("decompose"、"skip"。既定値は "skip")
206
+ - 例: `ゐ` → `い` (simple)、`ゐ` → `うぃ` (decompose)、`ヰ` → `イ` (simple)
207
+
191
208
  ## 開発
192
209
 
193
210
  リポジトリをチェックアウトした後、`bundle install`を実行して依存関係をインストールします。
data/README.md CHANGED
@@ -196,6 +196,19 @@ Normalizes various Unicode space characters to standard ASCII space.
196
196
  Converts Unicode Roman numeral characters to their ASCII letter equivalents.
197
197
  - Example: `Ⅰ Ⅱ Ⅲ` → `I II III`, `ⅰ ⅱ ⅲ` → `i ii iii`
198
198
 
199
+ ### 16. **Small Hirakatas** (`small-hirakatas`)
200
+ Converts small hiragana and katakana characters to their ordinary-sized equivalents.
201
+ - Example: `ぁぃぅ` → `あいう`, `ァィゥ` → `アイウ`
202
+
203
+ ### 17. **Archaic Hirakatas** (`archaic-hirakatas`)
204
+ Converts archaic kana (hentaigana) to their modern hiragana or katakana equivalents.
205
+ - Example: `𛀁` → `え`
206
+
207
+ ### 18. **Historical Hirakatas** (`historical-hirakatas`)
208
+ Converts historical hiragana and katakana characters to their modern equivalents.
209
+ - Options: `hiraganas` ("simple", "decompose", or "skip"; default "simple"), `katakanas` ("simple", "decompose", or "skip"; default "simple"), `voiced_katakanas` ("decompose" or "skip"; default "skip")
210
+ - Example: `ゐ` → `い` (simple), `ゐ` → `うぃ` (decompose), `ヰ` → `イ` (simple)
211
+
199
212
  ## Development
200
213
 
201
214
  After checking out the repo, run `bundle install` to install dependencies.
data/codegen/dataset.rb CHANGED
@@ -14,6 +14,8 @@ DatasetSourceDefs = Struct.new(
14
14
  :combined,
15
15
  :circled_or_squared,
16
16
  :roman_numerals,
17
+ :archaic_hirakatas,
18
+ :small_hirakatas,
17
19
  keyword_init: true
18
20
  )
19
21
 
@@ -29,6 +31,8 @@ Dataset = Struct.new(
29
31
  :combined,
30
32
  :circled_or_squared,
31
33
  :roman_numerals,
34
+ :archaic_hirakatas,
35
+ :small_hirakatas,
32
36
  keyword_init: true
33
37
  )
34
38
 
@@ -241,6 +245,8 @@ def build_dataset_from_data_root(data_root, defs)
241
245
  kanji_old_new: load_kanji_old_new_data(data_root / defs.kanji_old_new),
242
246
  combined: load_combined_data(data_root / defs.combined),
243
247
  circled_or_squared: load_circled_or_squared_data(data_root / defs.circled_or_squared),
244
- roman_numerals: load_roman_numerals_data(data_root / defs.roman_numerals)
248
+ roman_numerals: load_roman_numerals_data(data_root / defs.roman_numerals),
249
+ archaic_hirakatas: load_simple_data(data_root / defs.archaic_hirakatas),
250
+ small_hirakatas: load_simple_data(data_root / defs.small_hirakatas)
245
251
  )
246
252
  end
data/codegen/main.rb CHANGED
@@ -32,7 +32,9 @@ def main
32
32
  kanji_old_new: 'kanji-old-new-form.json',
33
33
  combined: 'combined-chars.json',
34
34
  circled_or_squared: 'circled-or-squared.json',
35
- roman_numerals: 'roman-numerals.json'
35
+ roman_numerals: 'roman-numerals.json',
36
+ archaic_hirakatas: 'archaic-hirakatas.json',
37
+ small_hirakatas: 'small-hirakatas.json'
36
38
  )
37
39
 
38
40
  # Load the dataset
@@ -64,6 +66,16 @@ def main
64
66
  'kanji_old_new',
65
67
  'Replace old-style kanji with modern equivalents',
66
68
  dataset.kanji_old_new
69
+ ],
70
+ [
71
+ 'archaic_hirakatas',
72
+ 'Replaces archaic kana (hentaigana) with their modern equivalents.',
73
+ dataset.archaic_hirakatas
74
+ ],
75
+ [
76
+ 'small_hirakatas',
77
+ 'Replaces small hiragana/katakana with their ordinary-sized equivalents.',
78
+ dataset.small_hirakatas
67
79
  ]
68
80
  ]
69
81
 
@@ -51,12 +51,15 @@ module Yosina
51
51
  end
52
52
 
53
53
  # Configuration recipe for building transliterator chains
54
+ # rubocop:disable Metrics/ClassLength
54
55
  class TransliterationRecipe
55
56
  attr_accessor :kanji_old_new, :hira_kata, :replace_japanese_iteration_marks,
56
57
  :replace_suspicious_hyphens_to_prolonged_sound_marks,
57
58
  :replace_combined_characters, :replace_circled_or_squared_characters,
58
59
  :replace_ideographic_annotations, :replace_radicals, :replace_spaces,
59
60
  :replace_hyphens, :replace_mathematical_alphanumerics, :replace_roman_numerals,
61
+ :replace_archaic_hirakatas, :replace_small_hirakatas,
62
+ :convert_historical_hirakatas,
60
63
  :combine_decomposed_hiraganas_and_katakanas, :to_fullwidth, :to_halfwidth,
61
64
  :remove_ivs_svs, :charset
62
65
 
@@ -161,6 +164,8 @@ module Yosina
161
164
  replace_ideographic_annotations: false, replace_radicals: false,
162
165
  replace_spaces: false, replace_hyphens: false,
163
166
  replace_mathematical_alphanumerics: false, replace_roman_numerals: false,
167
+ replace_archaic_hirakatas: false, replace_small_hirakatas: false,
168
+ convert_historical_hirakatas: nil,
164
169
  combine_decomposed_hiraganas_and_katakanas: false,
165
170
  to_fullwidth: false, to_halfwidth: false, remove_ivs_svs: false,
166
171
  charset: 'unijis_2004')
@@ -176,6 +181,9 @@ module Yosina
176
181
  @replace_hyphens = replace_hyphens
177
182
  @replace_mathematical_alphanumerics = replace_mathematical_alphanumerics
178
183
  @replace_roman_numerals = replace_roman_numerals
184
+ @replace_archaic_hirakatas = replace_archaic_hirakatas
185
+ @replace_small_hirakatas = replace_small_hirakatas
186
+ @convert_historical_hirakatas = convert_historical_hirakatas
179
187
  @combine_decomposed_hiraganas_and_katakanas = combine_decomposed_hiraganas_and_katakanas
180
188
  @to_fullwidth = to_fullwidth
181
189
  @to_halfwidth = to_halfwidth
@@ -208,6 +216,9 @@ module Yosina
208
216
  ctx = apply_replace_hyphens(ctx)
209
217
  ctx = apply_replace_mathematical_alphanumerics(ctx)
210
218
  ctx = apply_replace_roman_numerals(ctx)
219
+ ctx = apply_replace_archaic_hirakatas(ctx)
220
+ ctx = apply_replace_small_hirakatas(ctx)
221
+ ctx = apply_convert_historical_hirakatas(ctx)
211
222
  ctx = apply_combine_decomposed_hiraganas_and_katakanas(ctx)
212
223
  ctx = apply_to_fullwidth(ctx)
213
224
  ctx = apply_hira_kata(ctx)
@@ -329,6 +340,35 @@ module Yosina
329
340
  end
330
341
  end
331
342
 
343
# Adds the :archaic_hirakatas transliterator (with an empty options hash) to the
# chain by calling ctx.insert_middle when @replace_archaic_hirakatas is truthy.
#
# @param ctx [Object] chain-building context; must respond to insert_middle
# @return [Object] whatever ctx.insert_middle returns, or ctx itself when the flag is off
+ def apply_replace_archaic_hirakatas(ctx)
344
+ if @replace_archaic_hirakatas
345
+ ctx.insert_middle([:archaic_hirakatas, {}])
346
+ else
347
+ ctx
348
+ end
349
+ end
350
+
351
# Adds the :small_hirakatas transliterator (with an empty options hash) to the
# chain by calling ctx.insert_middle when @replace_small_hirakatas is truthy.
#
# @param ctx [Object] chain-building context; must respond to insert_middle
# @return [Object] whatever ctx.insert_middle returns, or ctx itself when the flag is off
+ def apply_replace_small_hirakatas(ctx)
352
+ if @replace_small_hirakatas
353
+ ctx.insert_middle([:small_hirakatas, {}])
354
+ else
355
+ ctx
356
+ end
357
+ end
358
+
359
# Adds the :historical_hirakatas transliterator to the chain when
# @convert_historical_hirakatas is set.
#
# The single recipe-level mode is fanned out to the transliterator options:
# +hiraganas+ and +katakanas+ receive the mode unchanged, while
# +voiced_katakanas+ becomes 'decompose' only for the decompose mode and
# 'skip' for anything else.
#
# @param ctx [Object] chain-building context; must respond to insert_middle
# @return [Object] whatever ctx.insert_middle returns, or ctx itself when the
#   option is nil/false
def apply_convert_historical_hirakatas(ctx)
  return ctx unless @convert_historical_hirakatas

  mode = @convert_historical_hirakatas
  # Compare on the string form so that :decompose and 'decompose' agree.
  # Comparing the raw value against 'decompose' left voiced_katakanas at
  # 'skip' for a Symbol mode even though hiraganas/katakanas were decomposed.
  voiced_mode = mode.to_s == 'decompose' ? 'decompose' : 'skip'

  ctx.insert_middle(
    [:historical_hirakatas, {
      hiraganas: mode,
      katakanas: mode,
      voiced_katakanas: voiced_mode
    }]
  )
end
371
+
332
372
  def apply_combine_decomposed_hiraganas_and_katakanas(ctx)
333
373
  if @combine_decomposed_hiraganas_and_katakanas
334
374
  ctx.insert_head([:hira_kata_composition, { compose_non_combining_marks: true }])
@@ -373,3 +413,4 @@ module Yosina
373
413
  recipe.build_transliterator_configs
374
414
  end
375
415
  end
416
+ # rubocop:enable Metrics/ClassLength
@@ -0,0 +1,350 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Replaces archaic kana (hentaigana) with their modern equivalents.
6
+ module ArchaicHirakatas
7
+ # Generated mapping data from archaic_hirakatas.json
8
+ ARCHAIC_HIRAKATAS_MAPPINGS = {
9
+ "\u{1b000}" => "\u{30a8}",
10
+ "\u{1b001}" => "\u{3048}",
11
+ "\u{1b002}" => "\u{3042}",
12
+ "\u{1b003}" => "\u{3042}",
13
+ "\u{1b004}" => "\u{3042}",
14
+ "\u{1b005}" => "\u{3042}",
15
+ "\u{1b006}" => "\u{3044}",
16
+ "\u{1b007}" => "\u{3044}",
17
+ "\u{1b008}" => "\u{3044}",
18
+ "\u{1b009}" => "\u{3044}",
19
+ "\u{1b00a}" => "\u{3046}",
20
+ "\u{1b00b}" => "\u{3046}",
21
+ "\u{1b00c}" => "\u{3046}",
22
+ "\u{1b00d}" => "\u{3046}",
23
+ "\u{1b00e}" => "\u{3046}",
24
+ "\u{1b00f}" => "\u{3048}",
25
+ "\u{1b010}" => "\u{3048}",
26
+ "\u{1b011}" => "\u{3048}",
27
+ "\u{1b012}" => "\u{3048}",
28
+ "\u{1b013}" => "\u{3048}",
29
+ "\u{1b014}" => "\u{304a}",
30
+ "\u{1b015}" => "\u{304a}",
31
+ "\u{1b016}" => "\u{304a}",
32
+ "\u{1b017}" => "\u{304b}",
33
+ "\u{1b018}" => "\u{304b}",
34
+ "\u{1b019}" => "\u{304b}",
35
+ "\u{1b01a}" => "\u{304b}",
36
+ "\u{1b01b}" => "\u{304b}",
37
+ "\u{1b01c}" => "\u{304b}",
38
+ "\u{1b01d}" => "\u{304b}",
39
+ "\u{1b01e}" => "\u{304b}",
40
+ "\u{1b01f}" => "\u{304b}",
41
+ "\u{1b020}" => "\u{304b}",
42
+ "\u{1b021}" => "\u{304b}",
43
+ "\u{1b022}" => "\u{304b}",
44
+ "\u{1b023}" => "\u{304d}",
45
+ "\u{1b024}" => "\u{304d}",
46
+ "\u{1b025}" => "\u{304d}",
47
+ "\u{1b026}" => "\u{304d}",
48
+ "\u{1b027}" => "\u{304d}",
49
+ "\u{1b028}" => "\u{304d}",
50
+ "\u{1b029}" => "\u{304d}",
51
+ "\u{1b02a}" => "\u{304d}",
52
+ "\u{1b02b}" => "\u{304f}",
53
+ "\u{1b02c}" => "\u{304f}",
54
+ "\u{1b02d}" => "\u{304f}",
55
+ "\u{1b02e}" => "\u{304f}",
56
+ "\u{1b02f}" => "\u{304f}",
57
+ "\u{1b030}" => "\u{304f}",
58
+ "\u{1b031}" => "\u{304f}",
59
+ "\u{1b032}" => "\u{3051}",
60
+ "\u{1b033}" => "\u{3051}",
61
+ "\u{1b034}" => "\u{3051}",
62
+ "\u{1b035}" => "\u{3051}",
63
+ "\u{1b036}" => "\u{3051}",
64
+ "\u{1b037}" => "\u{3051}",
65
+ "\u{1b038}" => "\u{3053}",
66
+ "\u{1b039}" => "\u{3053}",
67
+ "\u{1b03a}" => "\u{3053}",
68
+ "\u{1b03b}" => "\u{3053}",
69
+ "\u{1b03c}" => "\u{3055}",
70
+ "\u{1b03d}" => "\u{3055}",
71
+ "\u{1b03e}" => "\u{3055}",
72
+ "\u{1b03f}" => "\u{3055}",
73
+ "\u{1b040}" => "\u{3055}",
74
+ "\u{1b041}" => "\u{3055}",
75
+ "\u{1b042}" => "\u{3055}",
76
+ "\u{1b043}" => "\u{3055}",
77
+ "\u{1b044}" => "\u{3057}",
78
+ "\u{1b045}" => "\u{3057}",
79
+ "\u{1b046}" => "\u{3057}",
80
+ "\u{1b047}" => "\u{3057}",
81
+ "\u{1b048}" => "\u{3057}",
82
+ "\u{1b049}" => "\u{3057}",
83
+ "\u{1b04a}" => "\u{3059}",
84
+ "\u{1b04b}" => "\u{3059}",
85
+ "\u{1b04c}" => "\u{3059}",
86
+ "\u{1b04d}" => "\u{3059}",
87
+ "\u{1b04e}" => "\u{3059}",
88
+ "\u{1b04f}" => "\u{3059}",
89
+ "\u{1b050}" => "\u{3059}",
90
+ "\u{1b051}" => "\u{3059}",
91
+ "\u{1b052}" => "\u{305b}",
92
+ "\u{1b053}" => "\u{305b}",
93
+ "\u{1b054}" => "\u{305b}",
94
+ "\u{1b055}" => "\u{305b}",
95
+ "\u{1b056}" => "\u{305b}",
96
+ "\u{1b057}" => "\u{305d}",
97
+ "\u{1b058}" => "\u{305d}",
98
+ "\u{1b059}" => "\u{305d}",
99
+ "\u{1b05a}" => "\u{305d}",
100
+ "\u{1b05b}" => "\u{305d}",
101
+ "\u{1b05c}" => "\u{305d}",
102
+ "\u{1b05d}" => "\u{305d}",
103
+ "\u{1b05e}" => "\u{305f}",
104
+ "\u{1b05f}" => "\u{305f}",
105
+ "\u{1b060}" => "\u{305f}",
106
+ "\u{1b061}" => "\u{305f}",
107
+ "\u{1b062}" => "\u{3061}",
108
+ "\u{1b063}" => "\u{3061}",
109
+ "\u{1b064}" => "\u{3061}",
110
+ "\u{1b065}" => "\u{3061}",
111
+ "\u{1b066}" => "\u{3061}",
112
+ "\u{1b067}" => "\u{3061}",
113
+ "\u{1b068}" => "\u{3061}",
114
+ "\u{1b069}" => "\u{3064}",
115
+ "\u{1b06a}" => "\u{3064}",
116
+ "\u{1b06b}" => "\u{3064}",
117
+ "\u{1b06c}" => "\u{3064}",
118
+ "\u{1b06d}" => "\u{3064}",
119
+ "\u{1b06e}" => "\u{3066}",
120
+ "\u{1b06f}" => "\u{3066}",
121
+ "\u{1b070}" => "\u{3066}",
122
+ "\u{1b071}" => "\u{3066}",
123
+ "\u{1b072}" => "\u{3066}",
124
+ "\u{1b073}" => "\u{3066}",
125
+ "\u{1b074}" => "\u{3066}",
126
+ "\u{1b075}" => "\u{3066}",
127
+ "\u{1b076}" => "\u{3066}",
128
+ "\u{1b077}" => "\u{3068}",
129
+ "\u{1b078}" => "\u{3068}",
130
+ "\u{1b079}" => "\u{3068}",
131
+ "\u{1b07a}" => "\u{3068}",
132
+ "\u{1b07b}" => "\u{3068}",
133
+ "\u{1b07c}" => "\u{3068}",
134
+ "\u{1b07d}" => "\u{3068}",
135
+ "\u{1b07e}" => "\u{306a}",
136
+ "\u{1b07f}" => "\u{306a}",
137
+ "\u{1b080}" => "\u{306a}",
138
+ "\u{1b081}" => "\u{306a}",
139
+ "\u{1b082}" => "\u{306a}",
140
+ "\u{1b083}" => "\u{306a}",
141
+ "\u{1b084}" => "\u{306a}",
142
+ "\u{1b085}" => "\u{306a}",
143
+ "\u{1b086}" => "\u{306a}",
144
+ "\u{1b087}" => "\u{306b}",
145
+ "\u{1b088}" => "\u{306b}",
146
+ "\u{1b089}" => "\u{306b}",
147
+ "\u{1b08a}" => "\u{306b}",
148
+ "\u{1b08b}" => "\u{306b}",
149
+ "\u{1b08c}" => "\u{306b}",
150
+ "\u{1b08d}" => "\u{306b}",
151
+ "\u{1b08e}" => "\u{306b}",
152
+ "\u{1b08f}" => "\u{306c}",
153
+ "\u{1b090}" => "\u{306c}",
154
+ "\u{1b091}" => "\u{306c}",
155
+ "\u{1b092}" => "\u{306d}",
156
+ "\u{1b093}" => "\u{306d}",
157
+ "\u{1b094}" => "\u{306d}",
158
+ "\u{1b095}" => "\u{306d}",
159
+ "\u{1b096}" => "\u{306d}",
160
+ "\u{1b097}" => "\u{306d}",
161
+ "\u{1b098}" => "\u{306d}",
162
+ "\u{1b099}" => "\u{306e}",
163
+ "\u{1b09a}" => "\u{306e}",
164
+ "\u{1b09b}" => "\u{306e}",
165
+ "\u{1b09c}" => "\u{306e}",
166
+ "\u{1b09d}" => "\u{306e}",
167
+ "\u{1b09e}" => "\u{306f}",
168
+ "\u{1b09f}" => "\u{306f}",
169
+ "\u{1b0a0}" => "\u{306f}",
170
+ "\u{1b0a1}" => "\u{306f}",
171
+ "\u{1b0a2}" => "\u{306f}",
172
+ "\u{1b0a3}" => "\u{306f}",
173
+ "\u{1b0a4}" => "\u{306f}",
174
+ "\u{1b0a5}" => "\u{306f}",
175
+ "\u{1b0a6}" => "\u{306f}",
176
+ "\u{1b0a7}" => "\u{306f}",
177
+ "\u{1b0a8}" => "\u{306f}",
178
+ "\u{1b0a9}" => "\u{3072}",
179
+ "\u{1b0aa}" => "\u{3072}",
180
+ "\u{1b0ab}" => "\u{3072}",
181
+ "\u{1b0ac}" => "\u{3072}",
182
+ "\u{1b0ad}" => "\u{3072}",
183
+ "\u{1b0ae}" => "\u{3072}",
184
+ "\u{1b0af}" => "\u{3072}",
185
+ "\u{1b0b0}" => "\u{3075}",
186
+ "\u{1b0b1}" => "\u{3075}",
187
+ "\u{1b0b2}" => "\u{3075}",
188
+ "\u{1b0b3}" => "\u{3078}",
189
+ "\u{1b0b4}" => "\u{3078}",
190
+ "\u{1b0b5}" => "\u{3078}",
191
+ "\u{1b0b6}" => "\u{3078}",
192
+ "\u{1b0b7}" => "\u{3078}",
193
+ "\u{1b0b8}" => "\u{3078}",
194
+ "\u{1b0b9}" => "\u{3078}",
195
+ "\u{1b0ba}" => "\u{307b}",
196
+ "\u{1b0bb}" => "\u{307b}",
197
+ "\u{1b0bc}" => "\u{307b}",
198
+ "\u{1b0bd}" => "\u{307b}",
199
+ "\u{1b0be}" => "\u{307b}",
200
+ "\u{1b0bf}" => "\u{307b}",
201
+ "\u{1b0c0}" => "\u{307b}",
202
+ "\u{1b0c1}" => "\u{307b}",
203
+ "\u{1b0c2}" => "\u{307e}",
204
+ "\u{1b0c3}" => "\u{307e}",
205
+ "\u{1b0c4}" => "\u{307e}",
206
+ "\u{1b0c5}" => "\u{307e}",
207
+ "\u{1b0c6}" => "\u{307e}",
208
+ "\u{1b0c7}" => "\u{307e}",
209
+ "\u{1b0c8}" => "\u{307e}",
210
+ "\u{1b0c9}" => "\u{307f}",
211
+ "\u{1b0ca}" => "\u{307f}",
212
+ "\u{1b0cb}" => "\u{307f}",
213
+ "\u{1b0cc}" => "\u{307f}",
214
+ "\u{1b0cd}" => "\u{307f}",
215
+ "\u{1b0ce}" => "\u{307f}",
216
+ "\u{1b0cf}" => "\u{307f}",
217
+ "\u{1b0d0}" => "\u{3080}",
218
+ "\u{1b0d1}" => "\u{3080}",
219
+ "\u{1b0d2}" => "\u{3080}",
220
+ "\u{1b0d3}" => "\u{3080}",
221
+ "\u{1b0d4}" => "\u{3081}",
222
+ "\u{1b0d5}" => "\u{3081}",
223
+ "\u{1b0d6}" => "\u{3081}",
224
+ "\u{1b0d7}" => "\u{3082}",
225
+ "\u{1b0d8}" => "\u{3082}",
226
+ "\u{1b0d9}" => "\u{3082}",
227
+ "\u{1b0da}" => "\u{3082}",
228
+ "\u{1b0db}" => "\u{3082}",
229
+ "\u{1b0dc}" => "\u{3082}",
230
+ "\u{1b0dd}" => "\u{3084}",
231
+ "\u{1b0de}" => "\u{3084}",
232
+ "\u{1b0df}" => "\u{3084}",
233
+ "\u{1b0e0}" => "\u{3084}",
234
+ "\u{1b0e1}" => "\u{3084}",
235
+ "\u{1b0e2}" => "\u{3084}",
236
+ "\u{1b0e3}" => "\u{3086}",
237
+ "\u{1b0e4}" => "\u{3086}",
238
+ "\u{1b0e5}" => "\u{3086}",
239
+ "\u{1b0e6}" => "\u{3086}",
240
+ "\u{1b0e7}" => "\u{3088}",
241
+ "\u{1b0e8}" => "\u{3088}",
242
+ "\u{1b0e9}" => "\u{3088}",
243
+ "\u{1b0ea}" => "\u{3088}",
244
+ "\u{1b0eb}" => "\u{3088}",
245
+ "\u{1b0ec}" => "\u{3088}",
246
+ "\u{1b0ed}" => "\u{3089}",
247
+ "\u{1b0ee}" => "\u{3089}",
248
+ "\u{1b0ef}" => "\u{3089}",
249
+ "\u{1b0f0}" => "\u{3089}",
250
+ "\u{1b0f1}" => "\u{308a}",
251
+ "\u{1b0f2}" => "\u{308a}",
252
+ "\u{1b0f3}" => "\u{308a}",
253
+ "\u{1b0f4}" => "\u{308a}",
254
+ "\u{1b0f5}" => "\u{308a}",
255
+ "\u{1b0f6}" => "\u{308a}",
256
+ "\u{1b0f7}" => "\u{308a}",
257
+ "\u{1b0f8}" => "\u{308b}",
258
+ "\u{1b0f9}" => "\u{308b}",
259
+ "\u{1b0fa}" => "\u{308b}",
260
+ "\u{1b0fb}" => "\u{308b}",
261
+ "\u{1b0fc}" => "\u{308b}",
262
+ "\u{1b0fd}" => "\u{308b}",
263
+ "\u{1b0fe}" => "\u{308c}",
264
+ "\u{1b0ff}" => "\u{308c}",
265
+ "\u{1b100}" => "\u{308c}",
266
+ "\u{1b101}" => "\u{308c}",
267
+ "\u{1b102}" => "\u{308d}",
268
+ "\u{1b103}" => "\u{308d}",
269
+ "\u{1b104}" => "\u{308d}",
270
+ "\u{1b105}" => "\u{308d}",
271
+ "\u{1b106}" => "\u{308d}",
272
+ "\u{1b107}" => "\u{308d}",
273
+ "\u{1b108}" => "\u{308f}",
274
+ "\u{1b109}" => "\u{308f}",
275
+ "\u{1b10a}" => "\u{308f}",
276
+ "\u{1b10b}" => "\u{308f}",
277
+ "\u{1b10c}" => "\u{308f}",
278
+ "\u{1b10d}" => "\u{3090}",
279
+ "\u{1b10e}" => "\u{3090}",
280
+ "\u{1b10f}" => "\u{3090}",
281
+ "\u{1b110}" => "\u{3090}",
282
+ "\u{1b111}" => "\u{3090}",
283
+ "\u{1b112}" => "\u{3091}",
284
+ "\u{1b113}" => "\u{3091}",
285
+ "\u{1b114}" => "\u{3091}",
286
+ "\u{1b115}" => "\u{3091}",
287
+ "\u{1b116}" => "\u{3092}",
288
+ "\u{1b117}" => "\u{3092}",
289
+ "\u{1b118}" => "\u{3092}",
290
+ "\u{1b119}" => "\u{3092}",
291
+ "\u{1b11a}" => "\u{3092}",
292
+ "\u{1b11b}" => "\u{3092}",
293
+ "\u{1b11c}" => "\u{3092}",
294
+ "\u{1b11d}" => "\u{3093}",
295
+ "\u{1b11e}" => "\u{3093}",
296
+ "\u{1b11f}" => "\u{3046}",
297
+ "\u{1b120}" => "\u{30a4}",
298
+ "\u{1b121}" => "\u{30a8}",
299
+ "\u{1b122}" => "\u{30a6}"
300
+ }.freeze
301
+
302
+ # Transliterator that replaces each character found in ARCHAIC_HIRAKATAS_MAPPINGS with its mapped kana
303
+ class Transliterator < Yosina::BaseTransliterator
304
+ # Initialize the transliterator; the options hash is accepted but never read
305
+ #
306
+ # @param _options [Hash] Configuration options (currently unused)
307
+ def initialize(_options = {})
308
+ # Nothing is stored from the options: this transliterator has no configurable behaviour
309
+ super()
310
+ end
311
+
312
+ # Replaces archaic kana (hentaigana) with their modern equivalents.
313
+ # Unmapped characters pass through via Char#with_offset; offsets are recomputed from 0 using String#length.
314
+ #
315
+ # @param input_chars [Enumerable<Char>] The characters to transliterate
316
+ # @return [Enumerable<Char>] The filter_map result, with Yosina::Chars included into its singleton class
317
+ def call(input_chars)
318
+ offset = 0
319
+
320
+ result = input_chars.filter_map do |char|
321
+ replacement = ARCHAIC_HIRAKATAS_MAPPINGS[char.c]
322
+ c = if replacement
323
+ # An empty replacement means removal: `next` yields nil, which filter_map drops (offset is not advanced)
324
+ next if replacement.empty?
325
+
326
+ Char.new(c: replacement, offset: offset, source: char)
327
+ else
328
+ char.with_offset(offset)
329
+ end
330
+ offset += c.c.length
331
+ c
332
+ end
333
+
334
+ class << result
335
+ include Yosina::Chars
336
+ end
337
+
338
+ result
339
+ end
340
+ end
340
+
341
+ # Factory method to create an archaic_hirakatas transliterator
342
+ #
343
+ # @param options [Hash] Configuration options; forwarded to Transliterator.new, which ignores them
344
+ # @return [Transliterator] A new archaic_hirakatas transliterator instance
345
+ def self.call(options = {})
346
+ Transliterator.new(options)
347
+ end
348
+ end
349
+ end
350
+ end
@@ -79,7 +79,11 @@ module Yosina
79
79
  ['ょ', 'ョ', 'ョ'],
80
80
  ['ゎ', 'ヮ', nil],
81
81
  ['ゕ', 'ヵ', nil],
82
- ['ゖ', 'ヶ', nil]
82
+ ['ゖ', 'ヶ', nil],
83
+ ["\u{1B132}", "\u{1B155}", nil],
84
+ ["\u{1B150}", "\u{1B164}", nil],
85
+ ["\u{1B151}", "\u{1B165}", nil],
86
+ ["\u{1B152}", "\u{1B166}", nil]
83
87
  ].freeze
84
88
 
85
89
  # Generate voiced character mappings
@@ -0,0 +1,153 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Convert historical hiragana/katakana characters to their modern equivalents
6
+ module HistoricalHirakatas
7
+ # Historical hiragana mappings: source => { simple:, decompose: }
8
+ HISTORICAL_HIRAGANA_MAPPINGS = {
9
+ "\u{3090}" => { simple: "\u{3044}", decompose: "\u{3046}\u{3043}" }, # ゐ → い / うぃ
10
+ "\u{3091}" => { simple: "\u{3048}", decompose: "\u{3046}\u{3047}" } # ゑ → え / うぇ
11
+ }.freeze
12
+
13
+ # Historical katakana mappings: source => { simple:, decompose: }
14
+ HISTORICAL_KATAKANA_MAPPINGS = {
15
+ "\u{30F0}" => { simple: "\u{30A4}", decompose: "\u{30A6}\u{30A3}" }, # ヰ → イ / ウィ
16
+ "\u{30F1}" => { simple: "\u{30A8}", decompose: "\u{30A6}\u{30A7}" } # ヱ → エ / ウェ
17
+ }.freeze
18
+
19
+ # Voiced historical katakana mappings: source => small vowel suffix
20
+ VOICED_HISTORICAL_KANA_MAPPINGS = {
21
+ "\u{30F7}" => "\u{30A1}", # ヷ → ァ
22
+ "\u{30F8}" => "\u{30A3}", # ヸ → ィ
23
+ "\u{30F9}" => "\u{30A7}", # ヹ → ェ
24
+ "\u{30FA}" => "\u{30A9}" # ヺ → ォ
25
+ }.freeze
26
+
27
+ VOICED_HISTORICAL_KANA_DECOMPOSED_MAPPINGS = {
28
+ "\u{30EF}" => "\u{30A1}", # ワ (base of decomposed ヷ = ワ + U+3099) → ァ
29
+ "\u{30F0}" => "\u{30A3}", # ヰ (base of decomposed ヸ = ヰ + U+3099) → ィ
30
+ "\u{30F1}" => "\u{30A7}", # ヱ (base of decomposed ヹ = ヱ + U+3099) → ェ
31
+ "\u{30F2}" => "\u{30A9}" # ヲ (base of decomposed ヺ = ヲ + U+3099) → ォ
32
+ }.freeze
33
+
34
+ COMBINING_DAKUTEN = "\u{3099}"
35
+ VU = "\u{30F4}"
36
+ U = "\u{30A6}"
37
+
38
+ # Transliterator for historical hiragana/katakana conversion
39
+ class Transliterator < Yosina::BaseTransliterator
40
+ # Initialize the transliterator with options
41
+ # Each mode is normalised with #to_sym, so Strings and Symbols are both accepted; values are not validated here.
42
+ # @param options [Hash] Configuration options
43
+ # @option options [String, Symbol] :hiraganas "simple" (default), "decompose", or "skip"
44
+ # @option options [String, Symbol] :katakanas "simple" (default), "decompose", or "skip"
45
+ # @option options [String, Symbol] :voiced_katakanas "decompose" or "skip" (default)
46
+ def initialize(options = {})
47
+ super()
48
+ @hiraganas = (options[:hiraganas] || :simple).to_sym
49
+ @katakanas = (options[:katakanas] || :simple).to_sym
50
+ @voiced_katakanas = (options[:voiced_katakanas] || :skip).to_sym
51
+ end
52
+
53
+ # Convert historical hiragana/katakana characters to modern equivalents
54
+ # One char is buffered in `pending` so that a base kana followed by U+3099 (combining dakuten) can be handled as a pair.
55
+ # @param input_chars [Enumerable<Char>] The characters to transliterate
56
+ # @return [Enumerable<Char>] The transliterated characters, as produced by Chars.enum
57
+ def call(input_chars)
58
+ Chars.enum do |y|
59
+ offset = 0
60
+ pending = nil
61
+ input_chars.each do |char|
# The sentinel flushes the buffered char, is yielded, and ends the loop.
# NOTE(review): it is yielded without with_offset, so it keeps its incoming offset even when
# earlier replacements changed the length — confirm that is intended.
62
+ if char.sentinel?
63
+ offset = emit_char(y, pending, offset) if pending
64
+ pending = nil
65
+ y << char
66
+ break
67
+ end
68
+
69
+ if pending.nil?
70
+ pending = char
71
+ next
72
+ end
73
+
74
+ if char.c == COMBINING_DAKUTEN
75
+ # pending + U+3099 may be a decomposed ヷ/ヸ/ヹ/ヺ; look up the small-vowel suffix for the base (nil if not such a base)
76
+ decomposed = VOICED_HISTORICAL_KANA_DECOMPOSED_MAPPINGS[pending.c]
77
+ if @voiced_katakanas == :skip || decomposed.nil?
# pending is yielded as-is here (not through emit_char), and the dakuten becomes the new pending char.
# NOTE(review): this also leaves e.g. a hiragana ゐ before a dakuten unconverted — confirm intended.
78
+ y << pending.with_offset(offset)
79
+ offset += pending.c.length
80
+ pending = char
81
+ next
82
+ end
# Decompose: yield ウ, then the original dakuten, then the small vowel; both new chars cite pending as source.
83
+ y << Char.new(c: U, offset: offset, source: pending)
84
+ offset += U.length
85
+ y << char.with_offset(offset)
86
+ offset += char.c.length
87
+ y << Char.new(c: decomposed, offset: offset, source: pending)
88
+ offset += decomposed.length
89
+ pending = nil
90
+ next
91
+ end
92
+
93
+ offset = emit_char(y, pending, offset)
94
+ pending = char
95
+ end
96
+ # Flush the last buffered char; only reachable with a non-nil pending when the input ended without a sentinel
97
+ emit_char(y, pending, offset) if pending
98
+ end
99
+ end
100
+
101
+ private
102
+
103
+ # Emit a single char through the normal mapping logic
104
+ # Branches are tried in order: historical hiragana, historical katakana, precomposed voiced katakana, then pass-through.
105
+ # @param y [Enumerator::Yielder] The yielder
106
+ # @param char [Char] The character to emit
107
+ # @param offset [Integer] The current offset
108
+ # @return [Integer] The new offset after emitting
109
+ # rubocop:disable Naming/MethodParameterName
110
+ def emit_char(y, char, offset)
111
+ # Historical hiragana (ゐ/ゑ): the replacement is yielded as ONE Char, even when "decompose" makes it two code points
112
+ hira_mapping = HISTORICAL_HIRAGANA_MAPPINGS[char.c]
113
+ if hira_mapping && @hiraganas != :skip
# A mode other than :simple/:decompose/:skip makes this lookup nil, and replacement.length below then raises NoMethodError.
114
+ replacement = hira_mapping[@hiraganas]
115
+ y << Char.new(c: replacement, offset: offset, source: char)
116
+ return offset + replacement.length
117
+ end
118
+
119
+ # Historical katakana (ヰ/ヱ): same handling as the hiragana branch, keyed by @katakanas
120
+ kata_mapping = HISTORICAL_KATAKANA_MAPPINGS[char.c]
121
+ if kata_mapping && @katakanas != :skip
122
+ replacement = kata_mapping[@katakanas]
123
+ y << Char.new(c: replacement, offset: offset, source: char)
124
+ return offset + replacement.length
125
+ end
126
+
127
+ # Voiced historical katakana (precomposed ヷ/ヸ/ヹ/ヺ): yielded as two Chars, ヴ followed by the small vowel
128
+ if @voiced_katakanas == :decompose
129
+ decomposed = VOICED_HISTORICAL_KANA_MAPPINGS[char.c]
130
+ if decomposed
131
+ y << Char.new(c: VU, offset: offset, source: char)
132
+ offset += VU.length
133
+ y << Char.new(c: decomposed, offset: offset, source: char)
134
+ return offset + decomposed.length
135
+ end
136
+ end
137
+
138
+ y << char.with_offset(offset)
139
+ offset + char.c.length
140
+ end
141
+ end
142
+ # rubocop:enable Naming/MethodParameterName
143
+
144
+ # Factory method to create a historical hirakatas transliterator
145
+ #
146
+ # @param options [Hash] Configuration options (:hiraganas, :katakanas, :voiced_katakanas), forwarded to Transliterator.new
147
+ # @return [Transliterator] A new historical hirakatas transliterator instance
148
+ def self.call(options = {})
149
+ Transliterator.new(options)
150
+ end
151
+ end
152
+ end
153
+ end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Replaces small hiragana/katakana with their ordinary-sized equivalents.
6
+ module SmallHirakatas
7
+ # Generated mapping data from small_hirakatas.json
8
+ SMALL_HIRAKATAS_MAPPINGS = {
9
+ "\u{3041}" => "\u{3042}",
10
+ "\u{3043}" => "\u{3044}",
11
+ "\u{3045}" => "\u{3046}",
12
+ "\u{3047}" => "\u{3048}",
13
+ "\u{3049}" => "\u{304a}",
14
+ "\u{3063}" => "\u{3064}",
15
+ "\u{3083}" => "\u{3084}",
16
+ "\u{3085}" => "\u{3086}",
17
+ "\u{3087}" => "\u{3088}",
18
+ "\u{308e}" => "\u{308f}",
19
+ "\u{3095}" => "\u{304b}",
20
+ "\u{3096}" => "\u{3051}",
21
+ "\u{30a1}" => "\u{30a2}",
22
+ "\u{30a3}" => "\u{30a4}",
23
+ "\u{30a5}" => "\u{30a6}",
24
+ "\u{30a7}" => "\u{30a8}",
25
+ "\u{30a9}" => "\u{30aa}",
26
+ "\u{30c3}" => "\u{30c4}",
27
+ "\u{30e3}" => "\u{30e4}",
28
+ "\u{30e5}" => "\u{30e6}",
29
+ "\u{30e7}" => "\u{30e8}",
30
+ "\u{30ee}" => "\u{30ef}",
31
+ "\u{30f5}" => "\u{30ab}",
32
+ "\u{30f6}" => "\u{30b1}",
33
+ "\u{31f0}" => "\u{30af}",
34
+ "\u{31f1}" => "\u{30b7}",
35
+ "\u{31f2}" => "\u{30b9}",
36
+ "\u{31f3}" => "\u{30c8}",
37
+ "\u{31f4}" => "\u{30cc}",
38
+ "\u{31f5}" => "\u{30cf}",
39
+ "\u{31f6}" => "\u{30d2}",
40
+ "\u{31f7}" => "\u{30d5}",
41
+ "\u{31f8}" => "\u{30d8}",
42
+ "\u{31f9}" => "\u{30db}",
43
+ "\u{31fa}" => "\u{30e0}",
44
+ "\u{31fb}" => "\u{30e9}",
45
+ "\u{31fc}" => "\u{30ea}",
46
+ "\u{31fd}" => "\u{30eb}",
47
+ "\u{31fe}" => "\u{30ec}",
48
+ "\u{31ff}" => "\u{30ed}",
49
+ "\u{ff67}" => "\u{ff71}",
50
+ "\u{ff68}" => "\u{ff72}",
51
+ "\u{ff69}" => "\u{ff73}",
52
+ "\u{ff6a}" => "\u{ff74}",
53
+ "\u{ff6b}" => "\u{ff75}",
54
+ "\u{ff6c}" => "\u{ff94}",
55
+ "\u{ff6d}" => "\u{ff95}",
56
+ "\u{ff6e}" => "\u{ff96}",
57
+ "\u{ff6f}" => "\u{ff82}",
58
+ "\u{1b132}" => "\u{3053}",
59
+ "\u{1b150}" => "\u{3090}",
60
+ "\u{1b151}" => "\u{3091}",
61
+ "\u{1b152}" => "\u{3092}",
62
+ "\u{1b155}" => "\u{30b3}",
63
+ "\u{1b164}" => "\u{30f0}",
64
+ "\u{1b165}" => "\u{30f1}",
65
+ "\u{1b166}" => "\u{30f2}",
66
+ "\u{1b167}" => "\u{30f3}"
67
+ }.freeze
68
+
69
+ # Transliterator that replaces each character found in SMALL_HIRAKATAS_MAPPINGS with its mapped kana
70
+ class Transliterator < Yosina::BaseTransliterator
71
+ # Initialize the transliterator; the options hash is accepted but never read
72
+ #
73
+ # @param _options [Hash] Configuration options (currently unused)
74
+ def initialize(_options = {})
75
+ # Nothing is stored from the options: this transliterator has no configurable behaviour
76
+ super()
77
+ end
78
+
79
+ # Replaces small hiragana/katakana with their ordinary-sized equivalents.
80
+ # Unmapped characters pass through via Char#with_offset; offsets are recomputed from 0 using String#length.
81
+ #
82
+ # @param input_chars [Enumerable<Char>] The characters to transliterate
83
+ # @return [Enumerable<Char>] The filter_map result, with Yosina::Chars included into its singleton class
84
+ def call(input_chars)
85
+ offset = 0
86
+
87
+ result = input_chars.filter_map do |char|
88
+ replacement = SMALL_HIRAKATAS_MAPPINGS[char.c]
89
+ c = if replacement
90
+ # An empty replacement means removal: `next` yields nil, which filter_map drops (offset is not advanced)
91
+ next if replacement.empty?
92
+
93
+ Char.new(c: replacement, offset: offset, source: char)
94
+ else
95
+ char.with_offset(offset)
96
+ end
97
+ offset += c.c.length
98
+ c
99
+ end
100
+
101
+ class << result
102
+ include Yosina::Chars
103
+ end
104
+
105
+ result
106
+ end
107
+ end
107
+
108
+ # Factory method to create a small_hirakatas transliterator
109
+ #
110
+ # @param options [Hash] Configuration options; forwarded to Transliterator.new, which ignores them
111
+ # @return [Transliterator] A new small_hirakatas transliterator instance
112
+ def self.call(options = {})
113
+ Transliterator.new(options)
114
+ end
115
+ end
116
+ end
117
+ end
@@ -15,6 +15,9 @@ require_relative 'transliterators/jisx0201_and_alike'
15
15
  require_relative 'transliterators/circled_or_squared'
16
16
  require_relative 'transliterators/combined'
17
17
  require_relative 'transliterators/japanese_iteration_marks'
18
+ require_relative 'transliterators/archaic_hirakatas'
19
+ require_relative 'transliterators/small_hirakatas'
20
+ require_relative 'transliterators/historical_hirakatas'
18
21
 
19
22
  module Yosina
20
23
  # Registry for transliterator factories
@@ -34,7 +37,10 @@ module Yosina
34
37
  jisx0201_and_alike: Transliterators::Jisx0201AndAlike,
35
38
  combined: Transliterators::Combined,
36
39
  circled_or_squared: CircledOrSquared,
37
- japanese_iteration_marks: Transliterators::JapaneseIterationMarks
40
+ japanese_iteration_marks: Transliterators::JapaneseIterationMarks,
41
+ archaic_hirakatas: Transliterators::ArchaicHirakatas,
42
+ small_hirakatas: Transliterators::SmallHirakatas,
43
+ historical_hirakatas: Transliterators::HistoricalHirakatas
38
44
  }.freeze
39
45
 
40
46
  # Get a transliterator factory by name
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Yosina
4
- VERSION = '1.0.0'
4
+ VERSION = '1.1.1'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yosina
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moriyoshi Koizumi
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-09-24 00:00:00.000000000 Z
11
+ date: 2026-03-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -109,6 +109,7 @@ files:
109
109
  - lib/yosina/recipes.rb
110
110
  - lib/yosina/transliterator.rb
111
111
  - lib/yosina/transliterators.rb
112
+ - lib/yosina/transliterators/archaic_hirakatas.rb
112
113
  - lib/yosina/transliterators/circled_or_squared.rb
113
114
  - lib/yosina/transliterators/circled_or_squared_data.rb
114
115
  - lib/yosina/transliterators/combined.rb
@@ -116,6 +117,7 @@ files:
116
117
  - lib/yosina/transliterators/hira_kata.rb
117
118
  - lib/yosina/transliterators/hira_kata_composition.rb
118
119
  - lib/yosina/transliterators/hira_kata_table.rb
120
+ - lib/yosina/transliterators/historical_hirakatas.rb
119
121
  - lib/yosina/transliterators/hyphens.rb
120
122
  - lib/yosina/transliterators/hyphens_data.rb
121
123
  - lib/yosina/transliterators/ideographic_annotations.rb
@@ -129,6 +131,7 @@ files:
129
131
  - lib/yosina/transliterators/radicals.rb
130
132
  - lib/yosina/transliterators/roman_numerals.rb
131
133
  - lib/yosina/transliterators/roman_numerals_data.rb
134
+ - lib/yosina/transliterators/small_hirakatas.rb
132
135
  - lib/yosina/transliterators/spaces.rb
133
136
  - lib/yosina/version.rb
134
137
  - yosina.gemspec