yosina 0.2.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4831a722a9b91da965e6aa838c2af758ba86cdc445577cd2ca153614e6f38d7b
4
- data.tar.gz: 85961c7d59c4d8e8032f93108c5201a47cfa3b8598bcb686dad1e4da34ac3787
3
+ metadata.gz: c3683e10291faa325375fe5c3b069cdc6069e1654a7aae8c9cc754d9cf33b17b
4
+ data.tar.gz: 891d3b94688ae9af9693cd61b8edc30b9c2e0388662586a33de39ebb900bb4bf
5
5
  SHA512:
6
- metadata.gz: 744642abe6c4cbf69848552953008697a1ddc98765331703800a91c5b504dbd3697a2de80992222c37130fe70aeb2b14d813b88a2ea76fde83a11caf04d901e6
7
- data.tar.gz: a307fb1706da402d99c0c865ad5e8ed293dfdf8f8bd69a59aab3aa9c9172b9cd1461b8075a6934e9d7f496cfe971a12d02a89cd00d2af90d78c55d98b7b04fa8
6
+ metadata.gz: 86a08510acaafae1a2afe8b72b194204839e6aa63a07e44b4ebd1219048722536e38d6bcc5972f3c3a543fd5f94648b6a98dbe9f6ee6c90000eacc0a0a6495d6
7
+ data.tar.gz: 0ce9a44af3df6dcb1f89bc6e3b2661da834008a0d3c972c69acac5c3a644ffe4455ac9ca70cb5556980456e6e7ce4e50b8beb324ad9134531e936141b4c06f4d
data/codegen/dataset.rb CHANGED
@@ -14,6 +14,8 @@ DatasetSourceDefs = Struct.new(
14
14
  :combined,
15
15
  :circled_or_squared,
16
16
  :roman_numerals,
17
+ :archaic_hirakatas,
18
+ :small_hirakatas,
17
19
  keyword_init: true
18
20
  )
19
21
 
@@ -29,6 +31,8 @@ Dataset = Struct.new(
29
31
  :combined,
30
32
  :circled_or_squared,
31
33
  :roman_numerals,
34
+ :archaic_hirakatas,
35
+ :small_hirakatas,
32
36
  keyword_init: true
33
37
  )
34
38
 
@@ -241,6 +245,8 @@ def build_dataset_from_data_root(data_root, defs)
241
245
  kanji_old_new: load_kanji_old_new_data(data_root / defs.kanji_old_new),
242
246
  combined: load_combined_data(data_root / defs.combined),
243
247
  circled_or_squared: load_circled_or_squared_data(data_root / defs.circled_or_squared),
244
- roman_numerals: load_roman_numerals_data(data_root / defs.roman_numerals)
248
+ roman_numerals: load_roman_numerals_data(data_root / defs.roman_numerals),
249
+ archaic_hirakatas: load_simple_data(data_root / defs.archaic_hirakatas),
250
+ small_hirakatas: load_simple_data(data_root / defs.small_hirakatas)
245
251
  )
246
252
  end
data/codegen/main.rb CHANGED
@@ -32,7 +32,9 @@ def main
32
32
  kanji_old_new: 'kanji-old-new-form.json',
33
33
  combined: 'combined-chars.json',
34
34
  circled_or_squared: 'circled-or-squared.json',
35
- roman_numerals: 'roman-numerals.json'
35
+ roman_numerals: 'roman-numerals.json',
36
+ archaic_hirakatas: 'archaic-hirakatas.json',
37
+ small_hirakatas: 'small-hirakatas.json'
36
38
  )
37
39
 
38
40
  # Load the dataset
@@ -64,6 +66,16 @@ def main
64
66
  'kanji_old_new',
65
67
  'Replace old-style kanji with modern equivalents',
66
68
  dataset.kanji_old_new
69
+ ],
70
+ [
71
+ 'archaic_hirakatas',
72
+ 'Replaces archaic kana (hentaigana) with their modern equivalents.',
73
+ dataset.archaic_hirakatas
74
+ ],
75
+ [
76
+ 'small_hirakatas',
77
+ 'Replaces small hiragana/katakana with their ordinary-sized equivalents.',
78
+ dataset.small_hirakatas
67
79
  ]
68
80
  ]
69
81
 
@@ -51,12 +51,15 @@ module Yosina
51
51
  end
52
52
 
53
53
  # Configuration recipe for building transliterator chains
54
+ # rubocop:disable Metrics/ClassLength
54
55
  class TransliterationRecipe
55
56
  attr_accessor :kanji_old_new, :hira_kata, :replace_japanese_iteration_marks,
56
57
  :replace_suspicious_hyphens_to_prolonged_sound_marks,
57
58
  :replace_combined_characters, :replace_circled_or_squared_characters,
58
59
  :replace_ideographic_annotations, :replace_radicals, :replace_spaces,
59
60
  :replace_hyphens, :replace_mathematical_alphanumerics, :replace_roman_numerals,
61
+ :replace_archaic_hirakatas, :replace_small_hirakatas,
62
+ :convert_historical_hirakatas,
60
63
  :combine_decomposed_hiraganas_and_katakanas, :to_fullwidth, :to_halfwidth,
61
64
  :remove_ivs_svs, :charset
62
65
 
@@ -161,6 +164,8 @@ module Yosina
161
164
  replace_ideographic_annotations: false, replace_radicals: false,
162
165
  replace_spaces: false, replace_hyphens: false,
163
166
  replace_mathematical_alphanumerics: false, replace_roman_numerals: false,
167
+ replace_archaic_hirakatas: false, replace_small_hirakatas: false,
168
+ convert_historical_hirakatas: nil,
164
169
  combine_decomposed_hiraganas_and_katakanas: false,
165
170
  to_fullwidth: false, to_halfwidth: false, remove_ivs_svs: false,
166
171
  charset: 'unijis_2004')
@@ -176,6 +181,9 @@ module Yosina
176
181
  @replace_hyphens = replace_hyphens
177
182
  @replace_mathematical_alphanumerics = replace_mathematical_alphanumerics
178
183
  @replace_roman_numerals = replace_roman_numerals
184
+ @replace_archaic_hirakatas = replace_archaic_hirakatas
185
+ @replace_small_hirakatas = replace_small_hirakatas
186
+ @convert_historical_hirakatas = convert_historical_hirakatas
179
187
  @combine_decomposed_hiraganas_and_katakanas = combine_decomposed_hiraganas_and_katakanas
180
188
  @to_fullwidth = to_fullwidth
181
189
  @to_halfwidth = to_halfwidth
@@ -208,6 +216,9 @@ module Yosina
208
216
  ctx = apply_replace_hyphens(ctx)
209
217
  ctx = apply_replace_mathematical_alphanumerics(ctx)
210
218
  ctx = apply_replace_roman_numerals(ctx)
219
+ ctx = apply_replace_archaic_hirakatas(ctx)
220
+ ctx = apply_replace_small_hirakatas(ctx)
221
+ ctx = apply_convert_historical_hirakatas(ctx)
211
222
  ctx = apply_combine_decomposed_hiraganas_and_katakanas(ctx)
212
223
  ctx = apply_to_fullwidth(ctx)
213
224
  ctx = apply_hira_kata(ctx)
@@ -241,7 +252,7 @@ module Yosina
241
252
 
242
253
  def apply_hira_kata(ctx)
243
254
  if @hira_kata
244
- ctx.insert_middle([:hira_kata, { mode: @hira_kata }])
255
+ ctx.insert_tail([:hira_kata, { mode: @hira_kata }])
245
256
  else
246
257
  ctx
247
258
  end
@@ -329,6 +340,35 @@ module Yosina
329
340
  end
330
341
  end
331
342
 
343
# Adds the :archaic_hirakatas transliterator to the chain when the
# replace_archaic_hirakatas flag is set.
#
# @param ctx [Object] chain builder responding to #insert_middle
# @return [Object] the result of ctx.insert_middle when the flag is truthy,
#   otherwise ctx itself, untouched
def apply_replace_archaic_hirakatas(ctx)
  return ctx unless @replace_archaic_hirakatas

  ctx.insert_middle([:archaic_hirakatas, {}])
end
350
+
351
# Adds the :small_hirakatas transliterator to the chain when the
# replace_small_hirakatas flag is set.
#
# @param ctx [Object] chain builder responding to #insert_middle
# @return [Object] the result of ctx.insert_middle when the flag is truthy,
#   otherwise ctx itself, untouched
def apply_replace_small_hirakatas(ctx)
  return ctx unless @replace_small_hirakatas

  ctx.insert_middle([:small_hirakatas, {}])
end
358
+
359
# Adds the :historical_hirakatas transliterator to the chain when a
# conversion mode has been configured.
#
# The configured mode is passed through unchanged for both hiraganas and
# katakanas. Voiced katakanas are decomposed only when the mode is
# "decompose"; every other mode maps to 'skip'.
#
# @param ctx [Object] chain builder responding to #insert_middle
# @return [Object] the result of ctx.insert_middle when a mode is set,
#   otherwise ctx itself, untouched
def apply_convert_historical_hirakatas(ctx)
  mode = @convert_historical_hirakatas
  return ctx unless mode

  # Compare via to_s so a Symbol mode (:decompose) is treated the same as
  # the String 'decompose' instead of silently falling back to 'skip'.
  voiced_mode = mode.to_s == 'decompose' ? 'decompose' : 'skip'

  ctx.insert_middle([:historical_hirakatas, {
                      hiraganas: mode,
                      katakanas: mode,
                      voiced_katakanas: voiced_mode
                    }])
end
371
+
332
372
  def apply_combine_decomposed_hiraganas_and_katakanas(ctx)
333
373
  if @combine_decomposed_hiraganas_and_katakanas
334
374
  ctx.insert_head([:hira_kata_composition, { compose_non_combining_marks: true }])
@@ -373,3 +413,4 @@ module Yosina
373
413
  recipe.build_transliterator_configs
374
414
  end
375
415
  end
416
+ # rubocop:enable Metrics/ClassLength
@@ -0,0 +1,350 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Replaces archaic kana (hentaigana) with their modern equivalents.
6
+ module ArchaicHirakatas
7
+ # Generated mapping data from archaic_hirakatas.json
8
+ ARCHAIC_HIRAKATAS_MAPPINGS = {
9
+ "\u{1b000}" => "\u{30a8}",
10
+ "\u{1b001}" => "\u{3048}",
11
+ "\u{1b002}" => "\u{3042}",
12
+ "\u{1b003}" => "\u{3042}",
13
+ "\u{1b004}" => "\u{3042}",
14
+ "\u{1b005}" => "\u{3042}",
15
+ "\u{1b006}" => "\u{3044}",
16
+ "\u{1b007}" => "\u{3044}",
17
+ "\u{1b008}" => "\u{3044}",
18
+ "\u{1b009}" => "\u{3044}",
19
+ "\u{1b00a}" => "\u{3046}",
20
+ "\u{1b00b}" => "\u{3046}",
21
+ "\u{1b00c}" => "\u{3046}",
22
+ "\u{1b00d}" => "\u{3046}",
23
+ "\u{1b00e}" => "\u{3046}",
24
+ "\u{1b00f}" => "\u{3048}",
25
+ "\u{1b010}" => "\u{3048}",
26
+ "\u{1b011}" => "\u{3048}",
27
+ "\u{1b012}" => "\u{3048}",
28
+ "\u{1b013}" => "\u{3048}",
29
+ "\u{1b014}" => "\u{304a}",
30
+ "\u{1b015}" => "\u{304a}",
31
+ "\u{1b016}" => "\u{304a}",
32
+ "\u{1b017}" => "\u{304b}",
33
+ "\u{1b018}" => "\u{304b}",
34
+ "\u{1b019}" => "\u{304b}",
35
+ "\u{1b01a}" => "\u{304b}",
36
+ "\u{1b01b}" => "\u{304b}",
37
+ "\u{1b01c}" => "\u{304b}",
38
+ "\u{1b01d}" => "\u{304b}",
39
+ "\u{1b01e}" => "\u{304b}",
40
+ "\u{1b01f}" => "\u{304b}",
41
+ "\u{1b020}" => "\u{304b}",
42
+ "\u{1b021}" => "\u{304b}",
43
+ "\u{1b022}" => "\u{304b}",
44
+ "\u{1b023}" => "\u{304d}",
45
+ "\u{1b024}" => "\u{304d}",
46
+ "\u{1b025}" => "\u{304d}",
47
+ "\u{1b026}" => "\u{304d}",
48
+ "\u{1b027}" => "\u{304d}",
49
+ "\u{1b028}" => "\u{304d}",
50
+ "\u{1b029}" => "\u{304d}",
51
+ "\u{1b02a}" => "\u{304d}",
52
+ "\u{1b02b}" => "\u{304f}",
53
+ "\u{1b02c}" => "\u{304f}",
54
+ "\u{1b02d}" => "\u{304f}",
55
+ "\u{1b02e}" => "\u{304f}",
56
+ "\u{1b02f}" => "\u{304f}",
57
+ "\u{1b030}" => "\u{304f}",
58
+ "\u{1b031}" => "\u{304f}",
59
+ "\u{1b032}" => "\u{3051}",
60
+ "\u{1b033}" => "\u{3051}",
61
+ "\u{1b034}" => "\u{3051}",
62
+ "\u{1b035}" => "\u{3051}",
63
+ "\u{1b036}" => "\u{3051}",
64
+ "\u{1b037}" => "\u{3051}",
65
+ "\u{1b038}" => "\u{3053}",
66
+ "\u{1b039}" => "\u{3053}",
67
+ "\u{1b03a}" => "\u{3053}",
68
+ "\u{1b03b}" => "\u{3053}",
69
+ "\u{1b03c}" => "\u{3055}",
70
+ "\u{1b03d}" => "\u{3055}",
71
+ "\u{1b03e}" => "\u{3055}",
72
+ "\u{1b03f}" => "\u{3055}",
73
+ "\u{1b040}" => "\u{3055}",
74
+ "\u{1b041}" => "\u{3055}",
75
+ "\u{1b042}" => "\u{3055}",
76
+ "\u{1b043}" => "\u{3055}",
77
+ "\u{1b044}" => "\u{3057}",
78
+ "\u{1b045}" => "\u{3057}",
79
+ "\u{1b046}" => "\u{3057}",
80
+ "\u{1b047}" => "\u{3057}",
81
+ "\u{1b048}" => "\u{3057}",
82
+ "\u{1b049}" => "\u{3057}",
83
+ "\u{1b04a}" => "\u{3059}",
84
+ "\u{1b04b}" => "\u{3059}",
85
+ "\u{1b04c}" => "\u{3059}",
86
+ "\u{1b04d}" => "\u{3059}",
87
+ "\u{1b04e}" => "\u{3059}",
88
+ "\u{1b04f}" => "\u{3059}",
89
+ "\u{1b050}" => "\u{3059}",
90
+ "\u{1b051}" => "\u{3059}",
91
+ "\u{1b052}" => "\u{305b}",
92
+ "\u{1b053}" => "\u{305b}",
93
+ "\u{1b054}" => "\u{305b}",
94
+ "\u{1b055}" => "\u{305b}",
95
+ "\u{1b056}" => "\u{305b}",
96
+ "\u{1b057}" => "\u{305d}",
97
+ "\u{1b058}" => "\u{305d}",
98
+ "\u{1b059}" => "\u{305d}",
99
+ "\u{1b05a}" => "\u{305d}",
100
+ "\u{1b05b}" => "\u{305d}",
101
+ "\u{1b05c}" => "\u{305d}",
102
+ "\u{1b05d}" => "\u{305d}",
103
+ "\u{1b05e}" => "\u{305f}",
104
+ "\u{1b05f}" => "\u{305f}",
105
+ "\u{1b060}" => "\u{305f}",
106
+ "\u{1b061}" => "\u{305f}",
107
+ "\u{1b062}" => "\u{3061}",
108
+ "\u{1b063}" => "\u{3061}",
109
+ "\u{1b064}" => "\u{3061}",
110
+ "\u{1b065}" => "\u{3061}",
111
+ "\u{1b066}" => "\u{3061}",
112
+ "\u{1b067}" => "\u{3061}",
113
+ "\u{1b068}" => "\u{3061}",
114
+ "\u{1b069}" => "\u{3064}",
115
+ "\u{1b06a}" => "\u{3064}",
116
+ "\u{1b06b}" => "\u{3064}",
117
+ "\u{1b06c}" => "\u{3064}",
118
+ "\u{1b06d}" => "\u{3064}",
119
+ "\u{1b06e}" => "\u{3066}",
120
+ "\u{1b06f}" => "\u{3066}",
121
+ "\u{1b070}" => "\u{3066}",
122
+ "\u{1b071}" => "\u{3066}",
123
+ "\u{1b072}" => "\u{3066}",
124
+ "\u{1b073}" => "\u{3066}",
125
+ "\u{1b074}" => "\u{3066}",
126
+ "\u{1b075}" => "\u{3066}",
127
+ "\u{1b076}" => "\u{3066}",
128
+ "\u{1b077}" => "\u{3068}",
129
+ "\u{1b078}" => "\u{3068}",
130
+ "\u{1b079}" => "\u{3068}",
131
+ "\u{1b07a}" => "\u{3068}",
132
+ "\u{1b07b}" => "\u{3068}",
133
+ "\u{1b07c}" => "\u{3068}",
134
+ "\u{1b07d}" => "\u{3068}",
135
+ "\u{1b07e}" => "\u{306a}",
136
+ "\u{1b07f}" => "\u{306a}",
137
+ "\u{1b080}" => "\u{306a}",
138
+ "\u{1b081}" => "\u{306a}",
139
+ "\u{1b082}" => "\u{306a}",
140
+ "\u{1b083}" => "\u{306a}",
141
+ "\u{1b084}" => "\u{306a}",
142
+ "\u{1b085}" => "\u{306a}",
143
+ "\u{1b086}" => "\u{306a}",
144
+ "\u{1b087}" => "\u{306b}",
145
+ "\u{1b088}" => "\u{306b}",
146
+ "\u{1b089}" => "\u{306b}",
147
+ "\u{1b08a}" => "\u{306b}",
148
+ "\u{1b08b}" => "\u{306b}",
149
+ "\u{1b08c}" => "\u{306b}",
150
+ "\u{1b08d}" => "\u{306b}",
151
+ "\u{1b08e}" => "\u{306b}",
152
+ "\u{1b08f}" => "\u{306c}",
153
+ "\u{1b090}" => "\u{306c}",
154
+ "\u{1b091}" => "\u{306c}",
155
+ "\u{1b092}" => "\u{306d}",
156
+ "\u{1b093}" => "\u{306d}",
157
+ "\u{1b094}" => "\u{306d}",
158
+ "\u{1b095}" => "\u{306d}",
159
+ "\u{1b096}" => "\u{306d}",
160
+ "\u{1b097}" => "\u{306d}",
161
+ "\u{1b098}" => "\u{306d}",
162
+ "\u{1b099}" => "\u{306e}",
163
+ "\u{1b09a}" => "\u{306e}",
164
+ "\u{1b09b}" => "\u{306e}",
165
+ "\u{1b09c}" => "\u{306e}",
166
+ "\u{1b09d}" => "\u{306e}",
167
+ "\u{1b09e}" => "\u{306f}",
168
+ "\u{1b09f}" => "\u{306f}",
169
+ "\u{1b0a0}" => "\u{306f}",
170
+ "\u{1b0a1}" => "\u{306f}",
171
+ "\u{1b0a2}" => "\u{306f}",
172
+ "\u{1b0a3}" => "\u{306f}",
173
+ "\u{1b0a4}" => "\u{306f}",
174
+ "\u{1b0a5}" => "\u{306f}",
175
+ "\u{1b0a6}" => "\u{306f}",
176
+ "\u{1b0a7}" => "\u{306f}",
177
+ "\u{1b0a8}" => "\u{306f}",
178
+ "\u{1b0a9}" => "\u{3072}",
179
+ "\u{1b0aa}" => "\u{3072}",
180
+ "\u{1b0ab}" => "\u{3072}",
181
+ "\u{1b0ac}" => "\u{3072}",
182
+ "\u{1b0ad}" => "\u{3072}",
183
+ "\u{1b0ae}" => "\u{3072}",
184
+ "\u{1b0af}" => "\u{3072}",
185
+ "\u{1b0b0}" => "\u{3075}",
186
+ "\u{1b0b1}" => "\u{3075}",
187
+ "\u{1b0b2}" => "\u{3075}",
188
+ "\u{1b0b3}" => "\u{3078}",
189
+ "\u{1b0b4}" => "\u{3078}",
190
+ "\u{1b0b5}" => "\u{3078}",
191
+ "\u{1b0b6}" => "\u{3078}",
192
+ "\u{1b0b7}" => "\u{3078}",
193
+ "\u{1b0b8}" => "\u{3078}",
194
+ "\u{1b0b9}" => "\u{3078}",
195
+ "\u{1b0ba}" => "\u{307b}",
196
+ "\u{1b0bb}" => "\u{307b}",
197
+ "\u{1b0bc}" => "\u{307b}",
198
+ "\u{1b0bd}" => "\u{307b}",
199
+ "\u{1b0be}" => "\u{307b}",
200
+ "\u{1b0bf}" => "\u{307b}",
201
+ "\u{1b0c0}" => "\u{307b}",
202
+ "\u{1b0c1}" => "\u{307b}",
203
+ "\u{1b0c2}" => "\u{307e}",
204
+ "\u{1b0c3}" => "\u{307e}",
205
+ "\u{1b0c4}" => "\u{307e}",
206
+ "\u{1b0c5}" => "\u{307e}",
207
+ "\u{1b0c6}" => "\u{307e}",
208
+ "\u{1b0c7}" => "\u{307e}",
209
+ "\u{1b0c8}" => "\u{307e}",
210
+ "\u{1b0c9}" => "\u{307f}",
211
+ "\u{1b0ca}" => "\u{307f}",
212
+ "\u{1b0cb}" => "\u{307f}",
213
+ "\u{1b0cc}" => "\u{307f}",
214
+ "\u{1b0cd}" => "\u{307f}",
215
+ "\u{1b0ce}" => "\u{307f}",
216
+ "\u{1b0cf}" => "\u{307f}",
217
+ "\u{1b0d0}" => "\u{3080}",
218
+ "\u{1b0d1}" => "\u{3080}",
219
+ "\u{1b0d2}" => "\u{3080}",
220
+ "\u{1b0d3}" => "\u{3080}",
221
+ "\u{1b0d4}" => "\u{3081}",
222
+ "\u{1b0d5}" => "\u{3081}",
223
+ "\u{1b0d6}" => "\u{3081}",
224
+ "\u{1b0d7}" => "\u{3082}",
225
+ "\u{1b0d8}" => "\u{3082}",
226
+ "\u{1b0d9}" => "\u{3082}",
227
+ "\u{1b0da}" => "\u{3082}",
228
+ "\u{1b0db}" => "\u{3082}",
229
+ "\u{1b0dc}" => "\u{3082}",
230
+ "\u{1b0dd}" => "\u{3084}",
231
+ "\u{1b0de}" => "\u{3084}",
232
+ "\u{1b0df}" => "\u{3084}",
233
+ "\u{1b0e0}" => "\u{3084}",
234
+ "\u{1b0e1}" => "\u{3084}",
235
+ "\u{1b0e2}" => "\u{3084}",
236
+ "\u{1b0e3}" => "\u{3086}",
237
+ "\u{1b0e4}" => "\u{3086}",
238
+ "\u{1b0e5}" => "\u{3086}",
239
+ "\u{1b0e6}" => "\u{3086}",
240
+ "\u{1b0e7}" => "\u{3088}",
241
+ "\u{1b0e8}" => "\u{3088}",
242
+ "\u{1b0e9}" => "\u{3088}",
243
+ "\u{1b0ea}" => "\u{3088}",
244
+ "\u{1b0eb}" => "\u{3088}",
245
+ "\u{1b0ec}" => "\u{3088}",
246
+ "\u{1b0ed}" => "\u{3089}",
247
+ "\u{1b0ee}" => "\u{3089}",
248
+ "\u{1b0ef}" => "\u{3089}",
249
+ "\u{1b0f0}" => "\u{3089}",
250
+ "\u{1b0f1}" => "\u{308a}",
251
+ "\u{1b0f2}" => "\u{308a}",
252
+ "\u{1b0f3}" => "\u{308a}",
253
+ "\u{1b0f4}" => "\u{308a}",
254
+ "\u{1b0f5}" => "\u{308a}",
255
+ "\u{1b0f6}" => "\u{308a}",
256
+ "\u{1b0f7}" => "\u{308a}",
257
+ "\u{1b0f8}" => "\u{308b}",
258
+ "\u{1b0f9}" => "\u{308b}",
259
+ "\u{1b0fa}" => "\u{308b}",
260
+ "\u{1b0fb}" => "\u{308b}",
261
+ "\u{1b0fc}" => "\u{308b}",
262
+ "\u{1b0fd}" => "\u{308b}",
263
+ "\u{1b0fe}" => "\u{308c}",
264
+ "\u{1b0ff}" => "\u{308c}",
265
+ "\u{1b100}" => "\u{308c}",
266
+ "\u{1b101}" => "\u{308c}",
267
+ "\u{1b102}" => "\u{308d}",
268
+ "\u{1b103}" => "\u{308d}",
269
+ "\u{1b104}" => "\u{308d}",
270
+ "\u{1b105}" => "\u{308d}",
271
+ "\u{1b106}" => "\u{308d}",
272
+ "\u{1b107}" => "\u{308d}",
273
+ "\u{1b108}" => "\u{308f}",
274
+ "\u{1b109}" => "\u{308f}",
275
+ "\u{1b10a}" => "\u{308f}",
276
+ "\u{1b10b}" => "\u{308f}",
277
+ "\u{1b10c}" => "\u{308f}",
278
+ "\u{1b10d}" => "\u{3090}",
279
+ "\u{1b10e}" => "\u{3090}",
280
+ "\u{1b10f}" => "\u{3090}",
281
+ "\u{1b110}" => "\u{3090}",
282
+ "\u{1b111}" => "\u{3090}",
283
+ "\u{1b112}" => "\u{3091}",
284
+ "\u{1b113}" => "\u{3091}",
285
+ "\u{1b114}" => "\u{3091}",
286
+ "\u{1b115}" => "\u{3091}",
287
+ "\u{1b116}" => "\u{3092}",
288
+ "\u{1b117}" => "\u{3092}",
289
+ "\u{1b118}" => "\u{3092}",
290
+ "\u{1b119}" => "\u{3092}",
291
+ "\u{1b11a}" => "\u{3092}",
292
+ "\u{1b11b}" => "\u{3092}",
293
+ "\u{1b11c}" => "\u{3092}",
294
+ "\u{1b11d}" => "\u{3093}",
295
+ "\u{1b11e}" => "\u{3093}",
296
+ "\u{1b11f}" => "\u{3046}",
297
+ "\u{1b120}" => "\u{30a4}",
298
+ "\u{1b121}" => "\u{30a8}",
299
+ "\u{1b122}" => "\u{30a6}"
300
+ }.freeze
301
+
302
# Transliterator that substitutes characters found in
# ARCHAIC_HIRAKATAS_MAPPINGS and passes every other character through.
class Transliterator < Yosina::BaseTransliterator
  # Build the transliterator.
  #
  # @param _options [Hash] Accepted for interface compatibility; ignored
  def initialize(_options = {})
    super()
  end

  # Replaces archaic kana (hentaigana) with their modern equivalents.
  #
  # Offsets of the emitted characters are renumbered from zero, advancing
  # by the length of each emitted character's string.
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    position = 0

    converted = input_chars.filter_map do |char|
      mapped = ARCHAIC_HIRAKATAS_MAPPINGS[char.c]
      # An empty replacement string means the character is dropped
      next if mapped&.empty?

      emitted = mapped ? Char.new(c: mapped, offset: position, source: char) : char.with_offset(position)
      position += emitted.c.length
      emitted
    end

    # Mix Yosina::Chars into this one result object only
    converted.singleton_class.include(Yosina::Chars)
    converted
  end
end

# Factory method to create an archaic_hirakatas transliterator
#
# @param options [Hash] Configuration options
# @return [Transliterator] A new archaic_hirakatas transliterator instance
def self.call(options = {})
  Transliterator.new(options)
end
348
+ end
349
+ end
350
+ end
@@ -79,7 +79,11 @@ module Yosina
79
79
  ['ょ', 'ョ', 'ョ'],
80
80
  ['ゎ', 'ヮ', nil],
81
81
  ['ゕ', 'ヵ', nil],
82
- ['ゖ', 'ヶ', nil]
82
+ ['ゖ', 'ヶ', nil],
83
+ ["\u{1B132}", "\u{1B155}", nil],
84
+ ["\u{1B150}", "\u{1B164}", nil],
85
+ ["\u{1B151}", "\u{1B165}", nil],
86
+ ["\u{1B152}", "\u{1B166}", nil]
83
87
  ].freeze
84
88
 
85
89
  # Generate voiced character mappings
@@ -0,0 +1,153 @@
1
+ # frozen_string_literal: true
2
+
3
module Yosina
  module Transliterators
    # Convert historical hiragana/katakana characters to their modern equivalents
    module HistoricalHirakatas
      # Historical hiragana mappings: source => { simple:, decompose: }
      HISTORICAL_HIRAGANA_MAPPINGS = {
        "\u{3090}" => { simple: "\u{3044}", decompose: "\u{3046}\u{3043}" }, # ゐ → い / うぃ
        "\u{3091}" => { simple: "\u{3048}", decompose: "\u{3046}\u{3047}" } # ゑ → え / うぇ
      }.freeze

      # Historical katakana mappings: source => { simple:, decompose: }
      HISTORICAL_KATAKANA_MAPPINGS = {
        "\u{30F0}" => { simple: "\u{30A4}", decompose: "\u{30A6}\u{30A3}" }, # ヰ → イ / ウィ
        "\u{30F1}" => { simple: "\u{30A8}", decompose: "\u{30A6}\u{30A7}" } # ヱ → エ / ウェ
      }.freeze

      # Precomposed voiced historical katakana: source => small vowel emitted after VU
      VOICED_HISTORICAL_KANA_MAPPINGS = {
        "\u{30F7}" => "\u{30A1}", # ヷ → ァ
        "\u{30F8}" => "\u{30A3}", # ヸ → ィ
        "\u{30F9}" => "\u{30A7}", # ヹ → ェ
        "\u{30FA}" => "\u{30A9}" # ヺ → ォ
      }.freeze

      # Unvoiced base katakana that, when followed by COMBINING_DAKUTEN, spell a
      # voiced historical katakana: base => small vowel emitted after U + dakuten
      VOICED_HISTORICAL_KANA_DECOMPOSED_MAPPINGS = {
        "\u{30EF}" => "\u{30A1}", # ワ (+ U+3099) → ァ
        "\u{30F0}" => "\u{30A3}", # ヰ (+ U+3099) → ィ
        "\u{30F1}" => "\u{30A7}", # ヱ (+ U+3099) → ェ
        "\u{30F2}" => "\u{30A9}" # ヲ (+ U+3099) → ォ
      }.freeze

      COMBINING_DAKUTEN = "\u{3099}"
      VU = "\u{30F4}"
      U = "\u{30A6}"

      # Modes accepted for the :hiraganas and :katakanas options
      KANA_MODES = %i[simple decompose skip].freeze
      # Modes accepted for the :voiced_katakanas option
      VOICED_KATAKANA_MODES = %i[decompose skip].freeze

      # Transliterator for historical hiragana/katakana conversion
      class Transliterator < Yosina::BaseTransliterator
        # Initialize the transliterator with options
        #
        # Each mode may be given as a String or a Symbol.
        #
        # @param options [Hash] Configuration options
        # @option options [String, Symbol] :hiraganas "simple" (default), "decompose", or "skip"
        # @option options [String, Symbol] :katakanas "simple" (default), "decompose", or "skip"
        # @option options [String, Symbol] :voiced_katakanas "decompose" or "skip" (default)
        # @raise [ArgumentError] if any mode is not one of the values listed above
        def initialize(options = {})
          super()
          @hiraganas = resolve_mode(options, :hiraganas, :simple, KANA_MODES)
          @katakanas = resolve_mode(options, :katakanas, :simple, KANA_MODES)
          @voiced_katakanas = resolve_mode(options, :voiced_katakanas, :skip, VOICED_KATAKANA_MODES)
        end

        # Convert historical hiragana/katakana characters to modern equivalents
        #
        # Works with one character of lookahead: each character is held as
        # `pending` until the next one shows whether it is a combining dakuten.
        #
        # @param input_chars [Enumerable<Char>] The characters to transliterate
        # @return [Enumerable<Char>] The transliterated characters
        def call(input_chars)
          Chars.enum do |y|
            offset = 0
            pending = nil
            input_chars.each do |char|
              if char.sentinel?
                # Flush the held character, pass the sentinel through and stop reading
                offset = emit_char(y, pending, offset) if pending
                pending = nil
                y << char
                break
              end

              if pending.nil?
                pending = char
                next
              end

              if char.c == COMBINING_DAKUTEN
                # Check if pending char could be a decomposed voiced base
                decomposed = VOICED_HISTORICAL_KANA_DECOMPOSED_MAPPINGS[pending.c]
                if @voiced_katakanas == :skip || decomposed.nil?
                  # The character before the dakuten is emitted as-is (it does not
                  # go through emit_char); the dakuten becomes the new pending char
                  y << pending.with_offset(offset)
                  offset += pending.c.length
                  pending = char
                  next
                end
                # Emit U, the original combining dakuten, then the small vowel
                y << Char.new(c: U, offset: offset, source: pending)
                offset += U.length
                y << char.with_offset(offset)
                offset += char.c.length
                y << Char.new(c: decomposed, offset: offset, source: pending)
                offset += decomposed.length
                pending = nil
                next
              end

              offset = emit_char(y, pending, offset)
              pending = char
            end
            # Flush any remaining pending char
            emit_char(y, pending, offset) if pending
          end
        end

        private

        # Read one mode option, apply its default and check it is supported
        #
        # @param options [Hash] The options passed to #initialize
        # @param key [Symbol] The option name
        # @param default [Symbol] The mode used when the option is absent
        # @param allowed [Array<Symbol>] The supported modes
        # @return [Symbol] The validated mode
        # @raise [ArgumentError] if the mode is not in +allowed+
        def resolve_mode(options, key, default, allowed)
          mode = (options[key] || default).to_sym
          return mode if allowed.include?(mode)

          raise ArgumentError,
                "invalid #{key} mode: #{mode.inspect} (expected one of: #{allowed.join(', ')})"
        end

        # Emit a single char through the normal mapping logic
        #
        # @param y [Enumerator::Yielder] The yielder
        # @param char [Char] The character to emit
        # @param offset [Integer] The current offset
        # @return [Integer] The new offset after emitting
        # rubocop:disable Naming/MethodParameterName
        def emit_char(y, char, offset)
          # Historical hiragana
          hira_mapping = HISTORICAL_HIRAGANA_MAPPINGS[char.c]
          if hira_mapping && @hiraganas != :skip
            replacement = hira_mapping[@hiraganas]
            y << Char.new(c: replacement, offset: offset, source: char)
            return offset + replacement.length
          end

          # Historical katakana
          kata_mapping = HISTORICAL_KATAKANA_MAPPINGS[char.c]
          if kata_mapping && @katakanas != :skip
            replacement = kata_mapping[@katakanas]
            y << Char.new(c: replacement, offset: offset, source: char)
            return offset + replacement.length
          end

          # Voiced historical katakana: emitted as VU followed by a small vowel
          if @voiced_katakanas == :decompose
            decomposed = VOICED_HISTORICAL_KANA_MAPPINGS[char.c]
            if decomposed
              y << Char.new(c: VU, offset: offset, source: char)
              offset += VU.length
              y << Char.new(c: decomposed, offset: offset, source: char)
              return offset + decomposed.length
            end
          end

          # Anything else passes through with only its offset updated
          y << char.with_offset(offset)
          offset + char.c.length
        end
        # rubocop:enable Naming/MethodParameterName
      end

      # Factory method to create a historical hirakatas transliterator
      #
      # @param options [Hash] Configuration options
      # @return [Transliterator] A new historical hirakatas transliterator instance
      def self.call(options = {})
        Transliterator.new(options)
      end
    end
  end
end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Replaces small hiragana/katakana with their ordinary-sized equivalents.
6
+ module SmallHirakatas
7
+ # Generated mapping data from small_hirakatas.json
8
+ SMALL_HIRAKATAS_MAPPINGS = {
9
+ "\u{3041}" => "\u{3042}",
10
+ "\u{3043}" => "\u{3044}",
11
+ "\u{3045}" => "\u{3046}",
12
+ "\u{3047}" => "\u{3048}",
13
+ "\u{3049}" => "\u{304a}",
14
+ "\u{3063}" => "\u{3064}",
15
+ "\u{3083}" => "\u{3084}",
16
+ "\u{3085}" => "\u{3086}",
17
+ "\u{3087}" => "\u{3088}",
18
+ "\u{308e}" => "\u{308f}",
19
+ "\u{3095}" => "\u{304b}",
20
+ "\u{3096}" => "\u{3051}",
21
+ "\u{30a1}" => "\u{30a2}",
22
+ "\u{30a3}" => "\u{30a4}",
23
+ "\u{30a5}" => "\u{30a6}",
24
+ "\u{30a7}" => "\u{30a8}",
25
+ "\u{30a9}" => "\u{30aa}",
26
+ "\u{30c3}" => "\u{30c4}",
27
+ "\u{30e3}" => "\u{30e4}",
28
+ "\u{30e5}" => "\u{30e6}",
29
+ "\u{30e7}" => "\u{30e8}",
30
+ "\u{30ee}" => "\u{30ef}",
31
+ "\u{30f5}" => "\u{30ab}",
32
+ "\u{30f6}" => "\u{30b1}",
33
+ "\u{31f0}" => "\u{30af}",
34
+ "\u{31f1}" => "\u{30b7}",
35
+ "\u{31f2}" => "\u{30b9}",
36
+ "\u{31f3}" => "\u{30c8}",
37
+ "\u{31f4}" => "\u{30cc}",
38
+ "\u{31f5}" => "\u{30cf}",
39
+ "\u{31f6}" => "\u{30d2}",
40
+ "\u{31f7}" => "\u{30d5}",
41
+ "\u{31f8}" => "\u{30d8}",
42
+ "\u{31f9}" => "\u{30db}",
43
+ "\u{31fa}" => "\u{30e0}",
44
+ "\u{31fb}" => "\u{30e9}",
45
+ "\u{31fc}" => "\u{30ea}",
46
+ "\u{31fd}" => "\u{30eb}",
47
+ "\u{31fe}" => "\u{30ec}",
48
+ "\u{31ff}" => "\u{30ed}",
49
+ "\u{ff67}" => "\u{ff71}",
50
+ "\u{ff68}" => "\u{ff72}",
51
+ "\u{ff69}" => "\u{ff73}",
52
+ "\u{ff6a}" => "\u{ff74}",
53
+ "\u{ff6b}" => "\u{ff75}",
54
+ "\u{ff6c}" => "\u{ff94}",
55
+ "\u{ff6d}" => "\u{ff95}",
56
+ "\u{ff6e}" => "\u{ff96}",
57
+ "\u{ff6f}" => "\u{ff82}",
58
+ "\u{1b132}" => "\u{3053}",
59
+ "\u{1b150}" => "\u{3090}",
60
+ "\u{1b151}" => "\u{3091}",
61
+ "\u{1b152}" => "\u{3092}",
62
+ "\u{1b155}" => "\u{30b3}",
63
+ "\u{1b164}" => "\u{30f0}",
64
+ "\u{1b165}" => "\u{30f1}",
65
+ "\u{1b166}" => "\u{30f2}",
66
+ "\u{1b167}" => "\u{30f3}"
67
+ }.freeze
68
+
69
# Transliterator that substitutes characters found in
# SMALL_HIRAKATAS_MAPPINGS and passes every other character through.
class Transliterator < Yosina::BaseTransliterator
  # Build the transliterator.
  #
  # @param _options [Hash] Accepted for interface compatibility; ignored
  def initialize(_options = {})
    super()
  end

  # Replaces small hiragana/katakana with their ordinary-sized equivalents.
  #
  # Offsets of the emitted characters are renumbered from zero, advancing
  # by the length of each emitted character's string.
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    position = 0

    converted = input_chars.filter_map do |char|
      mapped = SMALL_HIRAKATAS_MAPPINGS[char.c]
      # An empty replacement string means the character is dropped
      next if mapped&.empty?

      emitted = mapped ? Char.new(c: mapped, offset: position, source: char) : char.with_offset(position)
      position += emitted.c.length
      emitted
    end

    # Mix Yosina::Chars into this one result object only
    converted.singleton_class.include(Yosina::Chars)
    converted
  end
end

# Factory method to create a small_hirakatas transliterator
#
# @param options [Hash] Configuration options
# @return [Transliterator] A new small_hirakatas transliterator instance
def self.call(options = {})
  Transliterator.new(options)
end
115
+ end
116
+ end
117
+ end
@@ -15,6 +15,9 @@ require_relative 'transliterators/jisx0201_and_alike'
15
15
  require_relative 'transliterators/circled_or_squared'
16
16
  require_relative 'transliterators/combined'
17
17
  require_relative 'transliterators/japanese_iteration_marks'
18
+ require_relative 'transliterators/archaic_hirakatas'
19
+ require_relative 'transliterators/small_hirakatas'
20
+ require_relative 'transliterators/historical_hirakatas'
18
21
 
19
22
  module Yosina
20
23
  # Registry for transliterator factories
@@ -34,7 +37,10 @@ module Yosina
34
37
  jisx0201_and_alike: Transliterators::Jisx0201AndAlike,
35
38
  combined: Transliterators::Combined,
36
39
  circled_or_squared: CircledOrSquared,
37
- japanese_iteration_marks: Transliterators::JapaneseIterationMarks
40
+ japanese_iteration_marks: Transliterators::JapaneseIterationMarks,
41
+ archaic_hirakatas: Transliterators::ArchaicHirakatas,
42
+ small_hirakatas: Transliterators::SmallHirakatas,
43
+ historical_hirakatas: Transliterators::HistoricalHirakatas
38
44
  }.freeze
39
45
 
40
46
  # Get a transliterator factory by name
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Yosina
4
- VERSION = '0.2.0'
4
+ VERSION = '1.1.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yosina
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moriyoshi Koizumi
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-09-23 00:00:00.000000000 Z
11
+ date: 2026-03-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -109,6 +109,7 @@ files:
109
109
  - lib/yosina/recipes.rb
110
110
  - lib/yosina/transliterator.rb
111
111
  - lib/yosina/transliterators.rb
112
+ - lib/yosina/transliterators/archaic_hirakatas.rb
112
113
  - lib/yosina/transliterators/circled_or_squared.rb
113
114
  - lib/yosina/transliterators/circled_or_squared_data.rb
114
115
  - lib/yosina/transliterators/combined.rb
@@ -116,6 +117,7 @@ files:
116
117
  - lib/yosina/transliterators/hira_kata.rb
117
118
  - lib/yosina/transliterators/hira_kata_composition.rb
118
119
  - lib/yosina/transliterators/hira_kata_table.rb
120
+ - lib/yosina/transliterators/historical_hirakatas.rb
119
121
  - lib/yosina/transliterators/hyphens.rb
120
122
  - lib/yosina/transliterators/hyphens_data.rb
121
123
  - lib/yosina/transliterators/ideographic_annotations.rb
@@ -129,6 +131,7 @@ files:
129
131
  - lib/yosina/transliterators/radicals.rb
130
132
  - lib/yosina/transliterators/roman_numerals.rb
131
133
  - lib/yosina/transliterators/roman_numerals_data.rb
134
+ - lib/yosina/transliterators/small_hirakatas.rb
132
135
  - lib/yosina/transliterators/spaces.rb
133
136
  - lib/yosina/version.rb
134
137
  - yosina.gemspec
@@ -140,7 +143,7 @@ metadata:
140
143
  homepage_uri: https://github.com/yosina-lib/yosina
141
144
  source_code_uri: https://github.com/yosina-lib/yosina
142
145
  changelog_uri: https://github.com/yosina-lib/yosina/releases
143
- post_install_message:
146
+ post_install_message:
144
147
  rdoc_options: []
145
148
  require_paths:
146
149
  - lib
@@ -155,8 +158,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
155
158
  - !ruby/object:Gem::Version
156
159
  version: '0'
157
160
  requirements: []
158
- rubygems_version: 3.5.11
159
- signing_key:
161
+ rubygems_version: 3.0.3.1
162
+ signing_key:
160
163
  specification_version: 4
161
164
  summary: Japanese text transliteration library
162
165
  test_files: []