yosina 0.2.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/codegen/dataset.rb +7 -1
- data/codegen/main.rb +13 -1
- data/lib/yosina/recipes.rb +42 -1
- data/lib/yosina/transliterators/archaic_hirakatas.rb +350 -0
- data/lib/yosina/transliterators/hira_kata_table.rb +5 -1
- data/lib/yosina/transliterators/historical_hirakatas.rb +153 -0
- data/lib/yosina/transliterators/small_hirakatas.rb +117 -0
- data/lib/yosina/transliterators.rb +7 -1
- data/lib/yosina/version.rb +1 -1
- metadata +9 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c3683e10291faa325375fe5c3b069cdc6069e1654a7aae8c9cc754d9cf33b17b
|
|
4
|
+
data.tar.gz: 891d3b94688ae9af9693cd61b8edc30b9c2e0388662586a33de39ebb900bb4bf
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 86a08510acaafae1a2afe8b72b194204839e6aa63a07e44b4ebd1219048722536e38d6bcc5972f3c3a543fd5f94648b6a98dbe9f6ee6c90000eacc0a0a6495d6
|
|
7
|
+
data.tar.gz: 0ce9a44af3df6dcb1f89bc6e3b2661da834008a0d3c972c69acac5c3a644ffe4455ac9ca70cb5556980456e6e7ce4e50b8beb324ad9134531e936141b4c06f4d
|
data/codegen/dataset.rb
CHANGED
|
@@ -14,6 +14,8 @@ DatasetSourceDefs = Struct.new(
|
|
|
14
14
|
:combined,
|
|
15
15
|
:circled_or_squared,
|
|
16
16
|
:roman_numerals,
|
|
17
|
+
:archaic_hirakatas,
|
|
18
|
+
:small_hirakatas,
|
|
17
19
|
keyword_init: true
|
|
18
20
|
)
|
|
19
21
|
|
|
@@ -29,6 +31,8 @@ Dataset = Struct.new(
|
|
|
29
31
|
:combined,
|
|
30
32
|
:circled_or_squared,
|
|
31
33
|
:roman_numerals,
|
|
34
|
+
:archaic_hirakatas,
|
|
35
|
+
:small_hirakatas,
|
|
32
36
|
keyword_init: true
|
|
33
37
|
)
|
|
34
38
|
|
|
@@ -241,6 +245,8 @@ def build_dataset_from_data_root(data_root, defs)
|
|
|
241
245
|
kanji_old_new: load_kanji_old_new_data(data_root / defs.kanji_old_new),
|
|
242
246
|
combined: load_combined_data(data_root / defs.combined),
|
|
243
247
|
circled_or_squared: load_circled_or_squared_data(data_root / defs.circled_or_squared),
|
|
244
|
-
roman_numerals: load_roman_numerals_data(data_root / defs.roman_numerals)
|
|
248
|
+
roman_numerals: load_roman_numerals_data(data_root / defs.roman_numerals),
|
|
249
|
+
archaic_hirakatas: load_simple_data(data_root / defs.archaic_hirakatas),
|
|
250
|
+
small_hirakatas: load_simple_data(data_root / defs.small_hirakatas)
|
|
245
251
|
)
|
|
246
252
|
end
|
data/codegen/main.rb
CHANGED
|
@@ -32,7 +32,9 @@ def main
|
|
|
32
32
|
kanji_old_new: 'kanji-old-new-form.json',
|
|
33
33
|
combined: 'combined-chars.json',
|
|
34
34
|
circled_or_squared: 'circled-or-squared.json',
|
|
35
|
-
roman_numerals: 'roman-numerals.json'
|
|
35
|
+
roman_numerals: 'roman-numerals.json',
|
|
36
|
+
archaic_hirakatas: 'archaic-hirakatas.json',
|
|
37
|
+
small_hirakatas: 'small-hirakatas.json'
|
|
36
38
|
)
|
|
37
39
|
|
|
38
40
|
# Load the dataset
|
|
@@ -64,6 +66,16 @@ def main
|
|
|
64
66
|
'kanji_old_new',
|
|
65
67
|
'Replace old-style kanji with modern equivalents',
|
|
66
68
|
dataset.kanji_old_new
|
|
69
|
+
],
|
|
70
|
+
[
|
|
71
|
+
'archaic_hirakatas',
|
|
72
|
+
'Replaces archaic kana (hentaigana) with their modern equivalents.',
|
|
73
|
+
dataset.archaic_hirakatas
|
|
74
|
+
],
|
|
75
|
+
[
|
|
76
|
+
'small_hirakatas',
|
|
77
|
+
'Replaces small hiragana/katakana with their ordinary-sized equivalents.',
|
|
78
|
+
dataset.small_hirakatas
|
|
67
79
|
]
|
|
68
80
|
]
|
|
69
81
|
|
data/lib/yosina/recipes.rb
CHANGED
|
@@ -51,12 +51,15 @@ module Yosina
|
|
|
51
51
|
end
|
|
52
52
|
|
|
53
53
|
# Configuration recipe for building transliterator chains
|
|
54
|
+
# rubocop:disable Metrics/ClassLength
|
|
54
55
|
class TransliterationRecipe
|
|
55
56
|
attr_accessor :kanji_old_new, :hira_kata, :replace_japanese_iteration_marks,
|
|
56
57
|
:replace_suspicious_hyphens_to_prolonged_sound_marks,
|
|
57
58
|
:replace_combined_characters, :replace_circled_or_squared_characters,
|
|
58
59
|
:replace_ideographic_annotations, :replace_radicals, :replace_spaces,
|
|
59
60
|
:replace_hyphens, :replace_mathematical_alphanumerics, :replace_roman_numerals,
|
|
61
|
+
:replace_archaic_hirakatas, :replace_small_hirakatas,
|
|
62
|
+
:convert_historical_hirakatas,
|
|
60
63
|
:combine_decomposed_hiraganas_and_katakanas, :to_fullwidth, :to_halfwidth,
|
|
61
64
|
:remove_ivs_svs, :charset
|
|
62
65
|
|
|
@@ -161,6 +164,8 @@ module Yosina
|
|
|
161
164
|
replace_ideographic_annotations: false, replace_radicals: false,
|
|
162
165
|
replace_spaces: false, replace_hyphens: false,
|
|
163
166
|
replace_mathematical_alphanumerics: false, replace_roman_numerals: false,
|
|
167
|
+
replace_archaic_hirakatas: false, replace_small_hirakatas: false,
|
|
168
|
+
convert_historical_hirakatas: nil,
|
|
164
169
|
combine_decomposed_hiraganas_and_katakanas: false,
|
|
165
170
|
to_fullwidth: false, to_halfwidth: false, remove_ivs_svs: false,
|
|
166
171
|
charset: 'unijis_2004')
|
|
@@ -176,6 +181,9 @@ module Yosina
|
|
|
176
181
|
@replace_hyphens = replace_hyphens
|
|
177
182
|
@replace_mathematical_alphanumerics = replace_mathematical_alphanumerics
|
|
178
183
|
@replace_roman_numerals = replace_roman_numerals
|
|
184
|
+
@replace_archaic_hirakatas = replace_archaic_hirakatas
|
|
185
|
+
@replace_small_hirakatas = replace_small_hirakatas
|
|
186
|
+
@convert_historical_hirakatas = convert_historical_hirakatas
|
|
179
187
|
@combine_decomposed_hiraganas_and_katakanas = combine_decomposed_hiraganas_and_katakanas
|
|
180
188
|
@to_fullwidth = to_fullwidth
|
|
181
189
|
@to_halfwidth = to_halfwidth
|
|
@@ -208,6 +216,9 @@ module Yosina
|
|
|
208
216
|
ctx = apply_replace_hyphens(ctx)
|
|
209
217
|
ctx = apply_replace_mathematical_alphanumerics(ctx)
|
|
210
218
|
ctx = apply_replace_roman_numerals(ctx)
|
|
219
|
+
ctx = apply_replace_archaic_hirakatas(ctx)
|
|
220
|
+
ctx = apply_replace_small_hirakatas(ctx)
|
|
221
|
+
ctx = apply_convert_historical_hirakatas(ctx)
|
|
211
222
|
ctx = apply_combine_decomposed_hiraganas_and_katakanas(ctx)
|
|
212
223
|
ctx = apply_to_fullwidth(ctx)
|
|
213
224
|
ctx = apply_hira_kata(ctx)
|
|
@@ -241,7 +252,7 @@ module Yosina
|
|
|
241
252
|
|
|
242
253
|
def apply_hira_kata(ctx)
|
|
243
254
|
if @hira_kata
|
|
244
|
-
ctx.
|
|
255
|
+
ctx.insert_tail([:hira_kata, { mode: @hira_kata }])
|
|
245
256
|
else
|
|
246
257
|
ctx
|
|
247
258
|
end
|
|
@@ -329,6 +340,35 @@ module Yosina
|
|
|
329
340
|
end
|
|
330
341
|
end
|
|
331
342
|
|
|
343
|
+
def apply_replace_archaic_hirakatas(ctx)
|
|
344
|
+
if @replace_archaic_hirakatas
|
|
345
|
+
ctx.insert_middle([:archaic_hirakatas, {}])
|
|
346
|
+
else
|
|
347
|
+
ctx
|
|
348
|
+
end
|
|
349
|
+
end
|
|
350
|
+
|
|
351
|
+
def apply_replace_small_hirakatas(ctx)
|
|
352
|
+
if @replace_small_hirakatas
|
|
353
|
+
ctx.insert_middle([:small_hirakatas, {}])
|
|
354
|
+
else
|
|
355
|
+
ctx
|
|
356
|
+
end
|
|
357
|
+
end
|
|
358
|
+
|
|
359
|
+
def apply_convert_historical_hirakatas(ctx)
|
|
360
|
+
if @convert_historical_hirakatas
|
|
361
|
+
mode = @convert_historical_hirakatas
|
|
362
|
+
ctx.insert_middle([:historical_hirakatas, {
|
|
363
|
+
hiraganas: mode,
|
|
364
|
+
katakanas: mode,
|
|
365
|
+
voiced_katakanas: mode == 'decompose' ? 'decompose' : 'skip'
|
|
366
|
+
}])
|
|
367
|
+
else
|
|
368
|
+
ctx
|
|
369
|
+
end
|
|
370
|
+
end
|
|
371
|
+
|
|
332
372
|
def apply_combine_decomposed_hiraganas_and_katakanas(ctx)
|
|
333
373
|
if @combine_decomposed_hiraganas_and_katakanas
|
|
334
374
|
ctx.insert_head([:hira_kata_composition, { compose_non_combining_marks: true }])
|
|
@@ -373,3 +413,4 @@ module Yosina
|
|
|
373
413
|
recipe.build_transliterator_configs
|
|
374
414
|
end
|
|
375
415
|
end
|
|
416
|
+
# rubocop:enable Metrics/ClassLength
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Replaces archaic kana (hentaigana) with their modern equivalents.
|
|
6
|
+
module ArchaicHirakatas
|
|
7
|
+
# Generated mapping data from archaic_hirakatas.json
|
|
8
|
+
ARCHAIC_HIRAKATAS_MAPPINGS = {
|
|
9
|
+
"\u{1b000}" => "\u{30a8}",
|
|
10
|
+
"\u{1b001}" => "\u{3048}",
|
|
11
|
+
"\u{1b002}" => "\u{3042}",
|
|
12
|
+
"\u{1b003}" => "\u{3042}",
|
|
13
|
+
"\u{1b004}" => "\u{3042}",
|
|
14
|
+
"\u{1b005}" => "\u{3042}",
|
|
15
|
+
"\u{1b006}" => "\u{3044}",
|
|
16
|
+
"\u{1b007}" => "\u{3044}",
|
|
17
|
+
"\u{1b008}" => "\u{3044}",
|
|
18
|
+
"\u{1b009}" => "\u{3044}",
|
|
19
|
+
"\u{1b00a}" => "\u{3046}",
|
|
20
|
+
"\u{1b00b}" => "\u{3046}",
|
|
21
|
+
"\u{1b00c}" => "\u{3046}",
|
|
22
|
+
"\u{1b00d}" => "\u{3046}",
|
|
23
|
+
"\u{1b00e}" => "\u{3046}",
|
|
24
|
+
"\u{1b00f}" => "\u{3048}",
|
|
25
|
+
"\u{1b010}" => "\u{3048}",
|
|
26
|
+
"\u{1b011}" => "\u{3048}",
|
|
27
|
+
"\u{1b012}" => "\u{3048}",
|
|
28
|
+
"\u{1b013}" => "\u{3048}",
|
|
29
|
+
"\u{1b014}" => "\u{304a}",
|
|
30
|
+
"\u{1b015}" => "\u{304a}",
|
|
31
|
+
"\u{1b016}" => "\u{304a}",
|
|
32
|
+
"\u{1b017}" => "\u{304b}",
|
|
33
|
+
"\u{1b018}" => "\u{304b}",
|
|
34
|
+
"\u{1b019}" => "\u{304b}",
|
|
35
|
+
"\u{1b01a}" => "\u{304b}",
|
|
36
|
+
"\u{1b01b}" => "\u{304b}",
|
|
37
|
+
"\u{1b01c}" => "\u{304b}",
|
|
38
|
+
"\u{1b01d}" => "\u{304b}",
|
|
39
|
+
"\u{1b01e}" => "\u{304b}",
|
|
40
|
+
"\u{1b01f}" => "\u{304b}",
|
|
41
|
+
"\u{1b020}" => "\u{304b}",
|
|
42
|
+
"\u{1b021}" => "\u{304b}",
|
|
43
|
+
"\u{1b022}" => "\u{304b}",
|
|
44
|
+
"\u{1b023}" => "\u{304d}",
|
|
45
|
+
"\u{1b024}" => "\u{304d}",
|
|
46
|
+
"\u{1b025}" => "\u{304d}",
|
|
47
|
+
"\u{1b026}" => "\u{304d}",
|
|
48
|
+
"\u{1b027}" => "\u{304d}",
|
|
49
|
+
"\u{1b028}" => "\u{304d}",
|
|
50
|
+
"\u{1b029}" => "\u{304d}",
|
|
51
|
+
"\u{1b02a}" => "\u{304d}",
|
|
52
|
+
"\u{1b02b}" => "\u{304f}",
|
|
53
|
+
"\u{1b02c}" => "\u{304f}",
|
|
54
|
+
"\u{1b02d}" => "\u{304f}",
|
|
55
|
+
"\u{1b02e}" => "\u{304f}",
|
|
56
|
+
"\u{1b02f}" => "\u{304f}",
|
|
57
|
+
"\u{1b030}" => "\u{304f}",
|
|
58
|
+
"\u{1b031}" => "\u{304f}",
|
|
59
|
+
"\u{1b032}" => "\u{3051}",
|
|
60
|
+
"\u{1b033}" => "\u{3051}",
|
|
61
|
+
"\u{1b034}" => "\u{3051}",
|
|
62
|
+
"\u{1b035}" => "\u{3051}",
|
|
63
|
+
"\u{1b036}" => "\u{3051}",
|
|
64
|
+
"\u{1b037}" => "\u{3051}",
|
|
65
|
+
"\u{1b038}" => "\u{3053}",
|
|
66
|
+
"\u{1b039}" => "\u{3053}",
|
|
67
|
+
"\u{1b03a}" => "\u{3053}",
|
|
68
|
+
"\u{1b03b}" => "\u{3053}",
|
|
69
|
+
"\u{1b03c}" => "\u{3055}",
|
|
70
|
+
"\u{1b03d}" => "\u{3055}",
|
|
71
|
+
"\u{1b03e}" => "\u{3055}",
|
|
72
|
+
"\u{1b03f}" => "\u{3055}",
|
|
73
|
+
"\u{1b040}" => "\u{3055}",
|
|
74
|
+
"\u{1b041}" => "\u{3055}",
|
|
75
|
+
"\u{1b042}" => "\u{3055}",
|
|
76
|
+
"\u{1b043}" => "\u{3055}",
|
|
77
|
+
"\u{1b044}" => "\u{3057}",
|
|
78
|
+
"\u{1b045}" => "\u{3057}",
|
|
79
|
+
"\u{1b046}" => "\u{3057}",
|
|
80
|
+
"\u{1b047}" => "\u{3057}",
|
|
81
|
+
"\u{1b048}" => "\u{3057}",
|
|
82
|
+
"\u{1b049}" => "\u{3057}",
|
|
83
|
+
"\u{1b04a}" => "\u{3059}",
|
|
84
|
+
"\u{1b04b}" => "\u{3059}",
|
|
85
|
+
"\u{1b04c}" => "\u{3059}",
|
|
86
|
+
"\u{1b04d}" => "\u{3059}",
|
|
87
|
+
"\u{1b04e}" => "\u{3059}",
|
|
88
|
+
"\u{1b04f}" => "\u{3059}",
|
|
89
|
+
"\u{1b050}" => "\u{3059}",
|
|
90
|
+
"\u{1b051}" => "\u{3059}",
|
|
91
|
+
"\u{1b052}" => "\u{305b}",
|
|
92
|
+
"\u{1b053}" => "\u{305b}",
|
|
93
|
+
"\u{1b054}" => "\u{305b}",
|
|
94
|
+
"\u{1b055}" => "\u{305b}",
|
|
95
|
+
"\u{1b056}" => "\u{305b}",
|
|
96
|
+
"\u{1b057}" => "\u{305d}",
|
|
97
|
+
"\u{1b058}" => "\u{305d}",
|
|
98
|
+
"\u{1b059}" => "\u{305d}",
|
|
99
|
+
"\u{1b05a}" => "\u{305d}",
|
|
100
|
+
"\u{1b05b}" => "\u{305d}",
|
|
101
|
+
"\u{1b05c}" => "\u{305d}",
|
|
102
|
+
"\u{1b05d}" => "\u{305d}",
|
|
103
|
+
"\u{1b05e}" => "\u{305f}",
|
|
104
|
+
"\u{1b05f}" => "\u{305f}",
|
|
105
|
+
"\u{1b060}" => "\u{305f}",
|
|
106
|
+
"\u{1b061}" => "\u{305f}",
|
|
107
|
+
"\u{1b062}" => "\u{3061}",
|
|
108
|
+
"\u{1b063}" => "\u{3061}",
|
|
109
|
+
"\u{1b064}" => "\u{3061}",
|
|
110
|
+
"\u{1b065}" => "\u{3061}",
|
|
111
|
+
"\u{1b066}" => "\u{3061}",
|
|
112
|
+
"\u{1b067}" => "\u{3061}",
|
|
113
|
+
"\u{1b068}" => "\u{3061}",
|
|
114
|
+
"\u{1b069}" => "\u{3064}",
|
|
115
|
+
"\u{1b06a}" => "\u{3064}",
|
|
116
|
+
"\u{1b06b}" => "\u{3064}",
|
|
117
|
+
"\u{1b06c}" => "\u{3064}",
|
|
118
|
+
"\u{1b06d}" => "\u{3064}",
|
|
119
|
+
"\u{1b06e}" => "\u{3066}",
|
|
120
|
+
"\u{1b06f}" => "\u{3066}",
|
|
121
|
+
"\u{1b070}" => "\u{3066}",
|
|
122
|
+
"\u{1b071}" => "\u{3066}",
|
|
123
|
+
"\u{1b072}" => "\u{3066}",
|
|
124
|
+
"\u{1b073}" => "\u{3066}",
|
|
125
|
+
"\u{1b074}" => "\u{3066}",
|
|
126
|
+
"\u{1b075}" => "\u{3066}",
|
|
127
|
+
"\u{1b076}" => "\u{3066}",
|
|
128
|
+
"\u{1b077}" => "\u{3068}",
|
|
129
|
+
"\u{1b078}" => "\u{3068}",
|
|
130
|
+
"\u{1b079}" => "\u{3068}",
|
|
131
|
+
"\u{1b07a}" => "\u{3068}",
|
|
132
|
+
"\u{1b07b}" => "\u{3068}",
|
|
133
|
+
"\u{1b07c}" => "\u{3068}",
|
|
134
|
+
"\u{1b07d}" => "\u{3068}",
|
|
135
|
+
"\u{1b07e}" => "\u{306a}",
|
|
136
|
+
"\u{1b07f}" => "\u{306a}",
|
|
137
|
+
"\u{1b080}" => "\u{306a}",
|
|
138
|
+
"\u{1b081}" => "\u{306a}",
|
|
139
|
+
"\u{1b082}" => "\u{306a}",
|
|
140
|
+
"\u{1b083}" => "\u{306a}",
|
|
141
|
+
"\u{1b084}" => "\u{306a}",
|
|
142
|
+
"\u{1b085}" => "\u{306a}",
|
|
143
|
+
"\u{1b086}" => "\u{306a}",
|
|
144
|
+
"\u{1b087}" => "\u{306b}",
|
|
145
|
+
"\u{1b088}" => "\u{306b}",
|
|
146
|
+
"\u{1b089}" => "\u{306b}",
|
|
147
|
+
"\u{1b08a}" => "\u{306b}",
|
|
148
|
+
"\u{1b08b}" => "\u{306b}",
|
|
149
|
+
"\u{1b08c}" => "\u{306b}",
|
|
150
|
+
"\u{1b08d}" => "\u{306b}",
|
|
151
|
+
"\u{1b08e}" => "\u{306b}",
|
|
152
|
+
"\u{1b08f}" => "\u{306c}",
|
|
153
|
+
"\u{1b090}" => "\u{306c}",
|
|
154
|
+
"\u{1b091}" => "\u{306c}",
|
|
155
|
+
"\u{1b092}" => "\u{306d}",
|
|
156
|
+
"\u{1b093}" => "\u{306d}",
|
|
157
|
+
"\u{1b094}" => "\u{306d}",
|
|
158
|
+
"\u{1b095}" => "\u{306d}",
|
|
159
|
+
"\u{1b096}" => "\u{306d}",
|
|
160
|
+
"\u{1b097}" => "\u{306d}",
|
|
161
|
+
"\u{1b098}" => "\u{306d}",
|
|
162
|
+
"\u{1b099}" => "\u{306e}",
|
|
163
|
+
"\u{1b09a}" => "\u{306e}",
|
|
164
|
+
"\u{1b09b}" => "\u{306e}",
|
|
165
|
+
"\u{1b09c}" => "\u{306e}",
|
|
166
|
+
"\u{1b09d}" => "\u{306e}",
|
|
167
|
+
"\u{1b09e}" => "\u{306f}",
|
|
168
|
+
"\u{1b09f}" => "\u{306f}",
|
|
169
|
+
"\u{1b0a0}" => "\u{306f}",
|
|
170
|
+
"\u{1b0a1}" => "\u{306f}",
|
|
171
|
+
"\u{1b0a2}" => "\u{306f}",
|
|
172
|
+
"\u{1b0a3}" => "\u{306f}",
|
|
173
|
+
"\u{1b0a4}" => "\u{306f}",
|
|
174
|
+
"\u{1b0a5}" => "\u{306f}",
|
|
175
|
+
"\u{1b0a6}" => "\u{306f}",
|
|
176
|
+
"\u{1b0a7}" => "\u{306f}",
|
|
177
|
+
"\u{1b0a8}" => "\u{306f}",
|
|
178
|
+
"\u{1b0a9}" => "\u{3072}",
|
|
179
|
+
"\u{1b0aa}" => "\u{3072}",
|
|
180
|
+
"\u{1b0ab}" => "\u{3072}",
|
|
181
|
+
"\u{1b0ac}" => "\u{3072}",
|
|
182
|
+
"\u{1b0ad}" => "\u{3072}",
|
|
183
|
+
"\u{1b0ae}" => "\u{3072}",
|
|
184
|
+
"\u{1b0af}" => "\u{3072}",
|
|
185
|
+
"\u{1b0b0}" => "\u{3075}",
|
|
186
|
+
"\u{1b0b1}" => "\u{3075}",
|
|
187
|
+
"\u{1b0b2}" => "\u{3075}",
|
|
188
|
+
"\u{1b0b3}" => "\u{3078}",
|
|
189
|
+
"\u{1b0b4}" => "\u{3078}",
|
|
190
|
+
"\u{1b0b5}" => "\u{3078}",
|
|
191
|
+
"\u{1b0b6}" => "\u{3078}",
|
|
192
|
+
"\u{1b0b7}" => "\u{3078}",
|
|
193
|
+
"\u{1b0b8}" => "\u{3078}",
|
|
194
|
+
"\u{1b0b9}" => "\u{3078}",
|
|
195
|
+
"\u{1b0ba}" => "\u{307b}",
|
|
196
|
+
"\u{1b0bb}" => "\u{307b}",
|
|
197
|
+
"\u{1b0bc}" => "\u{307b}",
|
|
198
|
+
"\u{1b0bd}" => "\u{307b}",
|
|
199
|
+
"\u{1b0be}" => "\u{307b}",
|
|
200
|
+
"\u{1b0bf}" => "\u{307b}",
|
|
201
|
+
"\u{1b0c0}" => "\u{307b}",
|
|
202
|
+
"\u{1b0c1}" => "\u{307b}",
|
|
203
|
+
"\u{1b0c2}" => "\u{307e}",
|
|
204
|
+
"\u{1b0c3}" => "\u{307e}",
|
|
205
|
+
"\u{1b0c4}" => "\u{307e}",
|
|
206
|
+
"\u{1b0c5}" => "\u{307e}",
|
|
207
|
+
"\u{1b0c6}" => "\u{307e}",
|
|
208
|
+
"\u{1b0c7}" => "\u{307e}",
|
|
209
|
+
"\u{1b0c8}" => "\u{307e}",
|
|
210
|
+
"\u{1b0c9}" => "\u{307f}",
|
|
211
|
+
"\u{1b0ca}" => "\u{307f}",
|
|
212
|
+
"\u{1b0cb}" => "\u{307f}",
|
|
213
|
+
"\u{1b0cc}" => "\u{307f}",
|
|
214
|
+
"\u{1b0cd}" => "\u{307f}",
|
|
215
|
+
"\u{1b0ce}" => "\u{307f}",
|
|
216
|
+
"\u{1b0cf}" => "\u{307f}",
|
|
217
|
+
"\u{1b0d0}" => "\u{3080}",
|
|
218
|
+
"\u{1b0d1}" => "\u{3080}",
|
|
219
|
+
"\u{1b0d2}" => "\u{3080}",
|
|
220
|
+
"\u{1b0d3}" => "\u{3080}",
|
|
221
|
+
"\u{1b0d4}" => "\u{3081}",
|
|
222
|
+
"\u{1b0d5}" => "\u{3081}",
|
|
223
|
+
"\u{1b0d6}" => "\u{3081}",
|
|
224
|
+
"\u{1b0d7}" => "\u{3082}",
|
|
225
|
+
"\u{1b0d8}" => "\u{3082}",
|
|
226
|
+
"\u{1b0d9}" => "\u{3082}",
|
|
227
|
+
"\u{1b0da}" => "\u{3082}",
|
|
228
|
+
"\u{1b0db}" => "\u{3082}",
|
|
229
|
+
"\u{1b0dc}" => "\u{3082}",
|
|
230
|
+
"\u{1b0dd}" => "\u{3084}",
|
|
231
|
+
"\u{1b0de}" => "\u{3084}",
|
|
232
|
+
"\u{1b0df}" => "\u{3084}",
|
|
233
|
+
"\u{1b0e0}" => "\u{3084}",
|
|
234
|
+
"\u{1b0e1}" => "\u{3084}",
|
|
235
|
+
"\u{1b0e2}" => "\u{3084}",
|
|
236
|
+
"\u{1b0e3}" => "\u{3086}",
|
|
237
|
+
"\u{1b0e4}" => "\u{3086}",
|
|
238
|
+
"\u{1b0e5}" => "\u{3086}",
|
|
239
|
+
"\u{1b0e6}" => "\u{3086}",
|
|
240
|
+
"\u{1b0e7}" => "\u{3088}",
|
|
241
|
+
"\u{1b0e8}" => "\u{3088}",
|
|
242
|
+
"\u{1b0e9}" => "\u{3088}",
|
|
243
|
+
"\u{1b0ea}" => "\u{3088}",
|
|
244
|
+
"\u{1b0eb}" => "\u{3088}",
|
|
245
|
+
"\u{1b0ec}" => "\u{3088}",
|
|
246
|
+
"\u{1b0ed}" => "\u{3089}",
|
|
247
|
+
"\u{1b0ee}" => "\u{3089}",
|
|
248
|
+
"\u{1b0ef}" => "\u{3089}",
|
|
249
|
+
"\u{1b0f0}" => "\u{3089}",
|
|
250
|
+
"\u{1b0f1}" => "\u{308a}",
|
|
251
|
+
"\u{1b0f2}" => "\u{308a}",
|
|
252
|
+
"\u{1b0f3}" => "\u{308a}",
|
|
253
|
+
"\u{1b0f4}" => "\u{308a}",
|
|
254
|
+
"\u{1b0f5}" => "\u{308a}",
|
|
255
|
+
"\u{1b0f6}" => "\u{308a}",
|
|
256
|
+
"\u{1b0f7}" => "\u{308a}",
|
|
257
|
+
"\u{1b0f8}" => "\u{308b}",
|
|
258
|
+
"\u{1b0f9}" => "\u{308b}",
|
|
259
|
+
"\u{1b0fa}" => "\u{308b}",
|
|
260
|
+
"\u{1b0fb}" => "\u{308b}",
|
|
261
|
+
"\u{1b0fc}" => "\u{308b}",
|
|
262
|
+
"\u{1b0fd}" => "\u{308b}",
|
|
263
|
+
"\u{1b0fe}" => "\u{308c}",
|
|
264
|
+
"\u{1b0ff}" => "\u{308c}",
|
|
265
|
+
"\u{1b100}" => "\u{308c}",
|
|
266
|
+
"\u{1b101}" => "\u{308c}",
|
|
267
|
+
"\u{1b102}" => "\u{308d}",
|
|
268
|
+
"\u{1b103}" => "\u{308d}",
|
|
269
|
+
"\u{1b104}" => "\u{308d}",
|
|
270
|
+
"\u{1b105}" => "\u{308d}",
|
|
271
|
+
"\u{1b106}" => "\u{308d}",
|
|
272
|
+
"\u{1b107}" => "\u{308d}",
|
|
273
|
+
"\u{1b108}" => "\u{308f}",
|
|
274
|
+
"\u{1b109}" => "\u{308f}",
|
|
275
|
+
"\u{1b10a}" => "\u{308f}",
|
|
276
|
+
"\u{1b10b}" => "\u{308f}",
|
|
277
|
+
"\u{1b10c}" => "\u{308f}",
|
|
278
|
+
"\u{1b10d}" => "\u{3090}",
|
|
279
|
+
"\u{1b10e}" => "\u{3090}",
|
|
280
|
+
"\u{1b10f}" => "\u{3090}",
|
|
281
|
+
"\u{1b110}" => "\u{3090}",
|
|
282
|
+
"\u{1b111}" => "\u{3090}",
|
|
283
|
+
"\u{1b112}" => "\u{3091}",
|
|
284
|
+
"\u{1b113}" => "\u{3091}",
|
|
285
|
+
"\u{1b114}" => "\u{3091}",
|
|
286
|
+
"\u{1b115}" => "\u{3091}",
|
|
287
|
+
"\u{1b116}" => "\u{3092}",
|
|
288
|
+
"\u{1b117}" => "\u{3092}",
|
|
289
|
+
"\u{1b118}" => "\u{3092}",
|
|
290
|
+
"\u{1b119}" => "\u{3092}",
|
|
291
|
+
"\u{1b11a}" => "\u{3092}",
|
|
292
|
+
"\u{1b11b}" => "\u{3092}",
|
|
293
|
+
"\u{1b11c}" => "\u{3092}",
|
|
294
|
+
"\u{1b11d}" => "\u{3093}",
|
|
295
|
+
"\u{1b11e}" => "\u{3093}",
|
|
296
|
+
"\u{1b11f}" => "\u{3046}",
|
|
297
|
+
"\u{1b120}" => "\u{30a4}",
|
|
298
|
+
"\u{1b121}" => "\u{30a8}",
|
|
299
|
+
"\u{1b122}" => "\u{30a6}"
|
|
300
|
+
}.freeze
|
|
301
|
+
|
|
302
|
+
# Transliterator for archaic_hirakatas
|
|
303
|
+
class Transliterator < Yosina::BaseTransliterator
|
|
304
|
+
# Initialize the transliterator with options
|
|
305
|
+
#
|
|
306
|
+
# @param _options [Hash] Configuration options (currently unused)
|
|
307
|
+
def initialize(_options = {})
|
|
308
|
+
# Options currently unused for archaic_hirakatas transliterator
|
|
309
|
+
super()
|
|
310
|
+
end
|
|
311
|
+
|
|
312
|
+
# Replaces archaic kana (hentaigana) with their modern equivalents.
|
|
313
|
+
#
|
|
314
|
+
# @param input_chars [Enumerable<Char>] The characters to transliterate
|
|
315
|
+
# @return [Enumerable<Char>] The transliterated characters
|
|
316
|
+
def call(input_chars)
|
|
317
|
+
offset = 0
|
|
318
|
+
|
|
319
|
+
result = input_chars.filter_map do |char|
|
|
320
|
+
replacement = ARCHAIC_HIRAKATAS_MAPPINGS[char.c]
|
|
321
|
+
c = if replacement
|
|
322
|
+
# Skip empty replacements (character removal)
|
|
323
|
+
next if replacement.empty?
|
|
324
|
+
|
|
325
|
+
Char.new(c: replacement, offset: offset, source: char)
|
|
326
|
+
else
|
|
327
|
+
char.with_offset(offset)
|
|
328
|
+
end
|
|
329
|
+
offset += c.c.length
|
|
330
|
+
c
|
|
331
|
+
end
|
|
332
|
+
|
|
333
|
+
class << result
|
|
334
|
+
include Yosina::Chars
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
result
|
|
338
|
+
end
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
# Factory method to create an archaic_hirakatas transliterator
|
|
342
|
+
#
|
|
343
|
+
# @param options [Hash] Configuration options
|
|
344
|
+
# @return [Transliterator] A new archaic_hirakatas transliterator instance
|
|
345
|
+
def self.call(options = {})
|
|
346
|
+
Transliterator.new(options)
|
|
347
|
+
end
|
|
348
|
+
end
|
|
349
|
+
end
|
|
350
|
+
end
|
|
@@ -79,7 +79,11 @@ module Yosina
|
|
|
79
79
|
['ょ', 'ョ', 'ョ'],
|
|
80
80
|
['ゎ', 'ヮ', nil],
|
|
81
81
|
['ゕ', 'ヵ', nil],
|
|
82
|
-
['ゖ', 'ヶ', nil]
|
|
82
|
+
['ゖ', 'ヶ', nil],
|
|
83
|
+
["\u{1B132}", "\u{1B155}", nil],
|
|
84
|
+
["\u{1B150}", "\u{1B164}", nil],
|
|
85
|
+
["\u{1B151}", "\u{1B165}", nil],
|
|
86
|
+
["\u{1B152}", "\u{1B166}", nil]
|
|
83
87
|
].freeze
|
|
84
88
|
|
|
85
89
|
# Generate voiced character mappings
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Convert historical hiragana/katakana characters to their modern equivalents
|
|
6
|
+
module HistoricalHirakatas
|
|
7
|
+
# Historical hiragana mappings: source => { simple:, decompose: }
|
|
8
|
+
HISTORICAL_HIRAGANA_MAPPINGS = {
|
|
9
|
+
"\u{3090}" => { simple: "\u{3044}", decompose: "\u{3046}\u{3043}" }, # ゐ → い / うぃ
|
|
10
|
+
"\u{3091}" => { simple: "\u{3048}", decompose: "\u{3046}\u{3047}" } # ゑ → え / うぇ
|
|
11
|
+
}.freeze
|
|
12
|
+
|
|
13
|
+
# Historical katakana mappings: source => { simple:, decompose: }
|
|
14
|
+
HISTORICAL_KATAKANA_MAPPINGS = {
|
|
15
|
+
"\u{30F0}" => { simple: "\u{30A4}", decompose: "\u{30A6}\u{30A3}" }, # ヰ → イ / ウィ
|
|
16
|
+
"\u{30F1}" => { simple: "\u{30A8}", decompose: "\u{30A6}\u{30A7}" } # ヱ → エ / ウェ
|
|
17
|
+
}.freeze
|
|
18
|
+
|
|
19
|
+
# Precomposed voiced historical katakana mappings: source => small vowel emitted after ヴ
|
|
20
|
+
VOICED_HISTORICAL_KANA_MAPPINGS = {
|
|
21
|
+
"\u{30F7}" => "\u{30A1}", # ヷ → ァ
|
|
22
|
+
"\u{30F8}" => "\u{30A3}", # ヸ → ィ
|
|
23
|
+
"\u{30F9}" => "\u{30A7}", # ヹ → ェ
|
|
24
|
+
"\u{30FA}" => "\u{30A9}" # ヺ → ォ
|
|
25
|
+
}.freeze
|
|
26
|
+
|
|
27
|
+
VOICED_HISTORICAL_KANA_DECOMPOSED_MAPPINGS = {
|
|
28
|
+
"\u{30EF}" => "\u{30A1}", # ワ + U+3099 (decomposed ヷ) → ァ
|
|
29
|
+
"\u{30F0}" => "\u{30A3}", # ヰ + U+3099 (decomposed ヸ) → ィ
|
|
30
|
+
"\u{30F1}" => "\u{30A7}", # ヱ + U+3099 (decomposed ヹ) → ェ
|
|
31
|
+
"\u{30F2}" => "\u{30A9}" # ヲ + U+3099 (decomposed ヺ) → ォ
|
|
32
|
+
}.freeze
|
|
33
|
+
|
|
34
|
+
COMBINING_DAKUTEN = "\u{3099}"
|
|
35
|
+
VU = "\u{30F4}"
|
|
36
|
+
U = "\u{30A6}"
|
|
37
|
+
|
|
38
|
+
# Transliterator for historical hiragana/katakana conversion
|
|
39
|
+
class Transliterator < Yosina::BaseTransliterator
|
|
40
|
+
# Initialize the transliterator with options
|
|
41
|
+
#
|
|
42
|
+
# @param options [Hash] Configuration options
|
|
43
|
+
# @option options [String] :hiraganas "simple" (default), "decompose", or "skip"
|
|
44
|
+
# @option options [String] :katakanas "simple" (default), "decompose", or "skip"
|
|
45
|
+
# @option options [String] :voiced_katakanas "decompose" or "skip" (default)
|
|
46
|
+
def initialize(options = {})
|
|
47
|
+
super()
|
|
48
|
+
@hiraganas = (options[:hiraganas] || :simple).to_sym
|
|
49
|
+
@katakanas = (options[:katakanas] || :simple).to_sym
|
|
50
|
+
@voiced_katakanas = (options[:voiced_katakanas] || :skip).to_sym
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Convert historical hiragana/katakana characters to modern equivalents
|
|
54
|
+
#
|
|
55
|
+
# @param input_chars [Enumerable<Char>] The characters to transliterate
|
|
56
|
+
# @return [Enumerable<Char>] The transliterated characters
|
|
57
|
+
def call(input_chars)
|
|
58
|
+
Chars.enum do |y|
|
|
59
|
+
offset = 0
|
|
60
|
+
pending = nil
|
|
61
|
+
input_chars.each do |char|
|
|
62
|
+
if char.sentinel?
|
|
63
|
+
offset = emit_char(y, pending, offset) if pending
|
|
64
|
+
pending = nil
|
|
65
|
+
y << char
|
|
66
|
+
break
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
if pending.nil?
|
|
70
|
+
pending = char
|
|
71
|
+
next
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
if char.c == COMBINING_DAKUTEN
|
|
75
|
+
# Check if pending char could be a decomposed voiced base
|
|
76
|
+
decomposed = VOICED_HISTORICAL_KANA_DECOMPOSED_MAPPINGS[pending.c]
|
|
77
|
+
if @voiced_katakanas == :skip || decomposed.nil?
|
|
78
|
+
y << pending.with_offset(offset)
|
|
79
|
+
offset += pending.c.length
|
|
80
|
+
pending = char
|
|
81
|
+
next
|
|
82
|
+
end
|
|
83
|
+
y << Char.new(c: U, offset: offset, source: pending)
|
|
84
|
+
offset += U.length
|
|
85
|
+
y << char.with_offset(offset)
|
|
86
|
+
offset += char.c.length
|
|
87
|
+
y << Char.new(c: decomposed, offset: offset, source: pending)
|
|
88
|
+
offset += decomposed.length
|
|
89
|
+
pending = nil
|
|
90
|
+
next
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
offset = emit_char(y, pending, offset)
|
|
94
|
+
pending = char
|
|
95
|
+
end
|
|
96
|
+
# Flush any remaining pending char
|
|
97
|
+
emit_char(y, pending, offset) if pending
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
private
|
|
102
|
+
|
|
103
|
+
# Emit a single char through the normal mapping logic
|
|
104
|
+
#
|
|
105
|
+
# @param y [Enumerator::Yielder] The yielder
|
|
106
|
+
# @param char [Char] The character to emit
|
|
107
|
+
# @param offset [Integer] The current offset
|
|
108
|
+
# @return [Integer] The new offset after emitting
|
|
109
|
+
# rubocop:disable Naming/MethodParameterName
|
|
110
|
+
def emit_char(y, char, offset)
|
|
111
|
+
# Historical hiragana
|
|
112
|
+
hira_mapping = HISTORICAL_HIRAGANA_MAPPINGS[char.c]
|
|
113
|
+
if hira_mapping && @hiraganas != :skip
|
|
114
|
+
replacement = hira_mapping[@hiraganas]
|
|
115
|
+
y << Char.new(c: replacement, offset: offset, source: char)
|
|
116
|
+
return offset + replacement.length
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
# Historical katakana
|
|
120
|
+
kata_mapping = HISTORICAL_KATAKANA_MAPPINGS[char.c]
|
|
121
|
+
if kata_mapping && @katakanas != :skip
|
|
122
|
+
replacement = kata_mapping[@katakanas]
|
|
123
|
+
y << Char.new(c: replacement, offset: offset, source: char)
|
|
124
|
+
return offset + replacement.length
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
# Voiced historical katakana
|
|
128
|
+
if @voiced_katakanas == :decompose
|
|
129
|
+
decomposed = VOICED_HISTORICAL_KANA_MAPPINGS[char.c]
|
|
130
|
+
if decomposed
|
|
131
|
+
y << Char.new(c: VU, offset: offset, source: char)
|
|
132
|
+
offset += VU.length
|
|
133
|
+
y << Char.new(c: decomposed, offset: offset, source: char)
|
|
134
|
+
return offset + decomposed.length
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
y << char.with_offset(offset)
|
|
139
|
+
offset + char.c.length
|
|
140
|
+
end
|
|
141
|
+
end
|
|
142
|
+
# rubocop:enable Naming/MethodParameterName
|
|
143
|
+
|
|
144
|
+
# Factory method to create a historical hirakatas transliterator
|
|
145
|
+
#
|
|
146
|
+
# @param options [Hash] Configuration options
|
|
147
|
+
# @return [Transliterator] A new historical hirakatas transliterator instance
|
|
148
|
+
def self.call(options = {})
|
|
149
|
+
Transliterator.new(options)
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Replaces small hiragana/katakana with their ordinary-sized equivalents.
|
|
6
|
+
module SmallHirakatas
|
|
7
|
+
# Generated mapping data from small_hirakatas.json
|
|
8
|
+
SMALL_HIRAKATAS_MAPPINGS = {
|
|
9
|
+
"\u{3041}" => "\u{3042}",
|
|
10
|
+
"\u{3043}" => "\u{3044}",
|
|
11
|
+
"\u{3045}" => "\u{3046}",
|
|
12
|
+
"\u{3047}" => "\u{3048}",
|
|
13
|
+
"\u{3049}" => "\u{304a}",
|
|
14
|
+
"\u{3063}" => "\u{3064}",
|
|
15
|
+
"\u{3083}" => "\u{3084}",
|
|
16
|
+
"\u{3085}" => "\u{3086}",
|
|
17
|
+
"\u{3087}" => "\u{3088}",
|
|
18
|
+
"\u{308e}" => "\u{308f}",
|
|
19
|
+
"\u{3095}" => "\u{304b}",
|
|
20
|
+
"\u{3096}" => "\u{3051}",
|
|
21
|
+
"\u{30a1}" => "\u{30a2}",
|
|
22
|
+
"\u{30a3}" => "\u{30a4}",
|
|
23
|
+
"\u{30a5}" => "\u{30a6}",
|
|
24
|
+
"\u{30a7}" => "\u{30a8}",
|
|
25
|
+
"\u{30a9}" => "\u{30aa}",
|
|
26
|
+
"\u{30c3}" => "\u{30c4}",
|
|
27
|
+
"\u{30e3}" => "\u{30e4}",
|
|
28
|
+
"\u{30e5}" => "\u{30e6}",
|
|
29
|
+
"\u{30e7}" => "\u{30e8}",
|
|
30
|
+
"\u{30ee}" => "\u{30ef}",
|
|
31
|
+
"\u{30f5}" => "\u{30ab}",
|
|
32
|
+
"\u{30f6}" => "\u{30b1}",
|
|
33
|
+
"\u{31f0}" => "\u{30af}",
|
|
34
|
+
"\u{31f1}" => "\u{30b7}",
|
|
35
|
+
"\u{31f2}" => "\u{30b9}",
|
|
36
|
+
"\u{31f3}" => "\u{30c8}",
|
|
37
|
+
"\u{31f4}" => "\u{30cc}",
|
|
38
|
+
"\u{31f5}" => "\u{30cf}",
|
|
39
|
+
"\u{31f6}" => "\u{30d2}",
|
|
40
|
+
"\u{31f7}" => "\u{30d5}",
|
|
41
|
+
"\u{31f8}" => "\u{30d8}",
|
|
42
|
+
"\u{31f9}" => "\u{30db}",
|
|
43
|
+
"\u{31fa}" => "\u{30e0}",
|
|
44
|
+
"\u{31fb}" => "\u{30e9}",
|
|
45
|
+
"\u{31fc}" => "\u{30ea}",
|
|
46
|
+
"\u{31fd}" => "\u{30eb}",
|
|
47
|
+
"\u{31fe}" => "\u{30ec}",
|
|
48
|
+
"\u{31ff}" => "\u{30ed}",
|
|
49
|
+
"\u{ff67}" => "\u{ff71}",
|
|
50
|
+
"\u{ff68}" => "\u{ff72}",
|
|
51
|
+
"\u{ff69}" => "\u{ff73}",
|
|
52
|
+
"\u{ff6a}" => "\u{ff74}",
|
|
53
|
+
"\u{ff6b}" => "\u{ff75}",
|
|
54
|
+
"\u{ff6c}" => "\u{ff94}",
|
|
55
|
+
"\u{ff6d}" => "\u{ff95}",
|
|
56
|
+
"\u{ff6e}" => "\u{ff96}",
|
|
57
|
+
"\u{ff6f}" => "\u{ff82}",
|
|
58
|
+
"\u{1b132}" => "\u{3053}",
|
|
59
|
+
"\u{1b150}" => "\u{3090}",
|
|
60
|
+
"\u{1b151}" => "\u{3091}",
|
|
61
|
+
"\u{1b152}" => "\u{3092}",
|
|
62
|
+
"\u{1b155}" => "\u{30b3}",
|
|
63
|
+
"\u{1b164}" => "\u{30f0}",
|
|
64
|
+
"\u{1b165}" => "\u{30f1}",
|
|
65
|
+
"\u{1b166}" => "\u{30f2}",
|
|
66
|
+
"\u{1b167}" => "\u{30f3}"
|
|
67
|
+
}.freeze
|
|
68
|
+
|
|
69
|
+
# Transliterator for small_hirakatas
|
|
70
|
+
class Transliterator < Yosina::BaseTransliterator
|
|
71
|
+
# Initialize the transliterator with options
|
|
72
|
+
#
|
|
73
|
+
# @param _options [Hash] Configuration options (currently unused)
|
|
74
|
+
def initialize(_options = {})
|
|
75
|
+
# Options currently unused for small_hirakatas transliterator
|
|
76
|
+
super()
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Replaces small hiragana/katakana with their ordinary-sized equivalents.
|
|
80
|
+
#
|
|
81
|
+
# @param input_chars [Enumerable<Char>] The characters to transliterate
|
|
82
|
+
# @return [Enumerable<Char>] The transliterated characters
|
|
83
|
+
def call(input_chars)
|
|
84
|
+
offset = 0
|
|
85
|
+
|
|
86
|
+
result = input_chars.filter_map do |char|
|
|
87
|
+
replacement = SMALL_HIRAKATAS_MAPPINGS[char.c]
|
|
88
|
+
c = if replacement
|
|
89
|
+
# Skip empty replacements (character removal)
|
|
90
|
+
next if replacement.empty?
|
|
91
|
+
|
|
92
|
+
Char.new(c: replacement, offset: offset, source: char)
|
|
93
|
+
else
|
|
94
|
+
char.with_offset(offset)
|
|
95
|
+
end
|
|
96
|
+
offset += c.c.length
|
|
97
|
+
c
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
class << result
|
|
101
|
+
include Yosina::Chars
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
result
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# Factory method to create a small_hirakatas transliterator
|
|
109
|
+
#
|
|
110
|
+
# @param options [Hash] Configuration options
|
|
111
|
+
# @return [Transliterator] A new small_hirakatas transliterator instance
|
|
112
|
+
def self.call(options = {})
  # Pure delegation: the options hash is handed to the constructor unchanged.
  Transliterator.new(options)
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
@@ -15,6 +15,9 @@ require_relative 'transliterators/jisx0201_and_alike'
|
|
|
15
15
|
require_relative 'transliterators/circled_or_squared'
|
|
16
16
|
require_relative 'transliterators/combined'
|
|
17
17
|
require_relative 'transliterators/japanese_iteration_marks'
|
|
18
|
+
require_relative 'transliterators/archaic_hirakatas'
|
|
19
|
+
require_relative 'transliterators/small_hirakatas'
|
|
20
|
+
require_relative 'transliterators/historical_hirakatas'
|
|
18
21
|
|
|
19
22
|
module Yosina
|
|
20
23
|
# Registry for transliterator factories
|
|
@@ -34,7 +37,10 @@ module Yosina
|
|
|
34
37
|
jisx0201_and_alike: Transliterators::Jisx0201AndAlike,
|
|
35
38
|
combined: Transliterators::Combined,
|
|
36
39
|
circled_or_squared: CircledOrSquared,
|
|
37
|
-
japanese_iteration_marks: Transliterators::JapaneseIterationMarks
|
|
40
|
+
japanese_iteration_marks: Transliterators::JapaneseIterationMarks,
|
|
41
|
+
archaic_hirakatas: Transliterators::ArchaicHirakatas,
|
|
42
|
+
small_hirakatas: Transliterators::SmallHirakatas,
|
|
43
|
+
historical_hirakatas: Transliterators::HistoricalHirakatas
|
|
38
44
|
}.freeze
|
|
39
45
|
|
|
40
46
|
# Get a transliterator factory by name
|
data/lib/yosina/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: yosina
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 1.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Moriyoshi Koizumi
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-03-18 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: minitest
|
|
@@ -109,6 +109,7 @@ files:
|
|
|
109
109
|
- lib/yosina/recipes.rb
|
|
110
110
|
- lib/yosina/transliterator.rb
|
|
111
111
|
- lib/yosina/transliterators.rb
|
|
112
|
+
- lib/yosina/transliterators/archaic_hirakatas.rb
|
|
112
113
|
- lib/yosina/transliterators/circled_or_squared.rb
|
|
113
114
|
- lib/yosina/transliterators/circled_or_squared_data.rb
|
|
114
115
|
- lib/yosina/transliterators/combined.rb
|
|
@@ -116,6 +117,7 @@ files:
|
|
|
116
117
|
- lib/yosina/transliterators/hira_kata.rb
|
|
117
118
|
- lib/yosina/transliterators/hira_kata_composition.rb
|
|
118
119
|
- lib/yosina/transliterators/hira_kata_table.rb
|
|
120
|
+
- lib/yosina/transliterators/historical_hirakatas.rb
|
|
119
121
|
- lib/yosina/transliterators/hyphens.rb
|
|
120
122
|
- lib/yosina/transliterators/hyphens_data.rb
|
|
121
123
|
- lib/yosina/transliterators/ideographic_annotations.rb
|
|
@@ -129,6 +131,7 @@ files:
|
|
|
129
131
|
- lib/yosina/transliterators/radicals.rb
|
|
130
132
|
- lib/yosina/transliterators/roman_numerals.rb
|
|
131
133
|
- lib/yosina/transliterators/roman_numerals_data.rb
|
|
134
|
+
- lib/yosina/transliterators/small_hirakatas.rb
|
|
132
135
|
- lib/yosina/transliterators/spaces.rb
|
|
133
136
|
- lib/yosina/version.rb
|
|
134
137
|
- yosina.gemspec
|
|
@@ -140,7 +143,7 @@ metadata:
|
|
|
140
143
|
homepage_uri: https://github.com/yosina-lib/yosina
|
|
141
144
|
source_code_uri: https://github.com/yosina-lib/yosina
|
|
142
145
|
changelog_uri: https://github.com/yosina-lib/yosina/releases
|
|
143
|
-
post_install_message:
|
|
146
|
+
post_install_message:
|
|
144
147
|
rdoc_options: []
|
|
145
148
|
require_paths:
|
|
146
149
|
- lib
|
|
@@ -155,8 +158,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
155
158
|
- !ruby/object:Gem::Version
|
|
156
159
|
version: '0'
|
|
157
160
|
requirements: []
|
|
158
|
-
rubygems_version: 3.
|
|
159
|
-
signing_key:
|
|
161
|
+
rubygems_version: 3.0.3.1
|
|
162
|
+
signing_key:
|
|
160
163
|
specification_version: 4
|
|
161
164
|
summary: Japanese text transliteration library
|
|
162
165
|
test_files: []
|