yosina 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.ja.md +17 -0
- data/README.md +13 -0
- data/codegen/dataset.rb +7 -1
- data/codegen/main.rb +13 -1
- data/lib/yosina/recipes.rb +41 -0
- data/lib/yosina/transliterators/archaic_hirakatas.rb +350 -0
- data/lib/yosina/transliterators/hira_kata_table.rb +5 -1
- data/lib/yosina/transliterators/historical_hirakatas.rb +153 -0
- data/lib/yosina/transliterators/small_hirakatas.rb +117 -0
- data/lib/yosina/transliterators.rb +7 -1
- data/lib/yosina/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e2f55aa031748a0855edb5e5d75e8f0aa61f2c1636432bf1d2582aaff0acbed7
|
|
4
|
+
data.tar.gz: 7297e0aa25fd447303bc379e7206b76589bf1ffa970e767a11d82e04f92a480b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1f9b3046e6e9ecd17080793bff4499af828826fb47fc1990aa5576d246e3a00d1030ef0694698b4e4b954f2de28ef95e7ebcccdcce6fb01823506232149b17a2
|
|
7
|
+
data.tar.gz: 81abcc7866ccae0ad2966a7593f94cd5f4b4535709c752c369b20382e650186f8e85647e2a676ace9d586d97bd7a47ec36d2e7e264816df53ad6afe3a616d68f
|
data/README.ja.md
CHANGED
|
@@ -188,6 +188,23 @@ CJK部首文字を対応する表意文字に変換します。
|
|
|
188
188
|
様々なUnicode空白文字を標準ASCII空白に正規化します。
|
|
189
189
|
- 例: `A B` (表意文字空白) → `A B`
|
|
190
190
|
|
|
191
|
+
### 15. **ローマ数字** (`roman-numerals`)
|
|
192
|
+
Unicodeのローマ数字文字を対応するASCII文字に変換します。
|
|
193
|
+
- 例: `Ⅰ Ⅱ Ⅲ` → `I II III`、`ⅰ ⅱ ⅲ` → `i ii iii`
|
|
194
|
+
|
|
195
|
+
### 16. **小書きひらがな・カタカナ** (`small-hirakatas`)
|
|
196
|
+
小書きのひらがな・カタカナを通常サイズの等価文字に変換します。
|
|
197
|
+
- 例: `ぁぃぅ` → `あいう`、`ァィゥ` → `アイウ`
|
|
198
|
+
|
|
199
|
+
### 17. **変体仮名** (`archaic-hirakatas`)
|
|
200
|
+
変体仮名(古い仮名文字)を現代のひらがな・カタカナに変換します。
|
|
201
|
+
- 例: `𛀁` → `え`
|
|
202
|
+
|
|
203
|
+
### 18. **歴史的仮名** (`historical-hirakatas`)
|
|
204
|
+
歴史的なひらがな・カタカナを現代の等価文字に変換します。
|
|
205
|
+
- オプション: `hiraganas` ("simple"、"decompose"、"skip")、`katakanas` ("simple"、"decompose"、"skip")、`voiced_katakanas` ("decompose"、"skip")
|
|
206
|
+
- 例: `ゐ` → `い` (simple)、`ゐ` → `うぃ` (decompose)、`ヰ` → `イ` (simple)
|
|
207
|
+
|
|
191
208
|
## 開発
|
|
192
209
|
|
|
193
210
|
リポジトリをチェックアウトした後、`bundle install`を実行して依存関係をインストールします。
|
data/README.md
CHANGED
|
@@ -196,6 +196,19 @@ Normalizes various Unicode space characters to standard ASCII space.
|
|
|
196
196
|
Converts Unicode Roman numeral characters to their ASCII letter equivalents.
|
|
197
197
|
- Example: `Ⅰ Ⅱ Ⅲ` → `I II III`, `ⅰ ⅱ ⅲ` → `i ii iii`
|
|
198
198
|
|
|
199
|
+
### 16. **Small Hirakatas** (`small-hirakatas`)
|
|
200
|
+
Converts small hiragana and katakana characters to their ordinary-sized equivalents.
|
|
201
|
+
- Example: `ぁぃぅ` → `あいう`, `ァィゥ` → `アイウ`
|
|
202
|
+
|
|
203
|
+
### 17. **Archaic Hirakatas** (`archaic-hirakatas`)
|
|
204
|
+
Converts archaic kana (hentaigana) to their modern hiragana or katakana equivalents.
|
|
205
|
+
- Example: `𛀁` → `え`
|
|
206
|
+
|
|
207
|
+
### 18. **Historical Hirakatas** (`historical-hirakatas`)
|
|
208
|
+
Converts historical hiragana and katakana characters to their modern equivalents.
|
|
209
|
+
- Options: `hiraganas` ("simple", "decompose", or "skip"), `katakanas` ("simple", "decompose", or "skip"), `voiced_katakanas` ("decompose" or "skip")
|
|
210
|
+
- Example: `ゐ` → `い` (simple), `ゐ` → `うぃ` (decompose), `ヰ` → `イ` (simple)
|
|
211
|
+
|
|
199
212
|
## Development
|
|
200
213
|
|
|
201
214
|
After checking out the repo, run `bundle install` to install dependencies.
|
data/codegen/dataset.rb
CHANGED
|
@@ -14,6 +14,8 @@ DatasetSourceDefs = Struct.new(
|
|
|
14
14
|
:combined,
|
|
15
15
|
:circled_or_squared,
|
|
16
16
|
:roman_numerals,
|
|
17
|
+
:archaic_hirakatas,
|
|
18
|
+
:small_hirakatas,
|
|
17
19
|
keyword_init: true
|
|
18
20
|
)
|
|
19
21
|
|
|
@@ -29,6 +31,8 @@ Dataset = Struct.new(
|
|
|
29
31
|
:combined,
|
|
30
32
|
:circled_or_squared,
|
|
31
33
|
:roman_numerals,
|
|
34
|
+
:archaic_hirakatas,
|
|
35
|
+
:small_hirakatas,
|
|
32
36
|
keyword_init: true
|
|
33
37
|
)
|
|
34
38
|
|
|
@@ -241,6 +245,8 @@ def build_dataset_from_data_root(data_root, defs)
|
|
|
241
245
|
kanji_old_new: load_kanji_old_new_data(data_root / defs.kanji_old_new),
|
|
242
246
|
combined: load_combined_data(data_root / defs.combined),
|
|
243
247
|
circled_or_squared: load_circled_or_squared_data(data_root / defs.circled_or_squared),
|
|
244
|
-
roman_numerals: load_roman_numerals_data(data_root / defs.roman_numerals)
|
|
248
|
+
roman_numerals: load_roman_numerals_data(data_root / defs.roman_numerals),
|
|
249
|
+
archaic_hirakatas: load_simple_data(data_root / defs.archaic_hirakatas),
|
|
250
|
+
small_hirakatas: load_simple_data(data_root / defs.small_hirakatas)
|
|
245
251
|
)
|
|
246
252
|
end
|
data/codegen/main.rb
CHANGED
|
@@ -32,7 +32,9 @@ def main
|
|
|
32
32
|
kanji_old_new: 'kanji-old-new-form.json',
|
|
33
33
|
combined: 'combined-chars.json',
|
|
34
34
|
circled_or_squared: 'circled-or-squared.json',
|
|
35
|
-
roman_numerals: 'roman-numerals.json'
|
|
35
|
+
roman_numerals: 'roman-numerals.json',
|
|
36
|
+
archaic_hirakatas: 'archaic-hirakatas.json',
|
|
37
|
+
small_hirakatas: 'small-hirakatas.json'
|
|
36
38
|
)
|
|
37
39
|
|
|
38
40
|
# Load the dataset
|
|
@@ -64,6 +66,16 @@ def main
|
|
|
64
66
|
'kanji_old_new',
|
|
65
67
|
'Replace old-style kanji with modern equivalents',
|
|
66
68
|
dataset.kanji_old_new
|
|
69
|
+
],
|
|
70
|
+
[
|
|
71
|
+
'archaic_hirakatas',
|
|
72
|
+
'Replaces archaic kana (hentaigana) with their modern equivalents.',
|
|
73
|
+
dataset.archaic_hirakatas
|
|
74
|
+
],
|
|
75
|
+
[
|
|
76
|
+
'small_hirakatas',
|
|
77
|
+
'Replaces small hiragana/katakana with their ordinary-sized equivalents.',
|
|
78
|
+
dataset.small_hirakatas
|
|
67
79
|
]
|
|
68
80
|
]
|
|
69
81
|
|
data/lib/yosina/recipes.rb
CHANGED
|
@@ -51,12 +51,15 @@ module Yosina
|
|
|
51
51
|
end
|
|
52
52
|
|
|
53
53
|
# Configuration recipe for building transliterator chains
|
|
54
|
+
# rubocop:disable Metrics/ClassLength
|
|
54
55
|
class TransliterationRecipe
|
|
55
56
|
attr_accessor :kanji_old_new, :hira_kata, :replace_japanese_iteration_marks,
|
|
56
57
|
:replace_suspicious_hyphens_to_prolonged_sound_marks,
|
|
57
58
|
:replace_combined_characters, :replace_circled_or_squared_characters,
|
|
58
59
|
:replace_ideographic_annotations, :replace_radicals, :replace_spaces,
|
|
59
60
|
:replace_hyphens, :replace_mathematical_alphanumerics, :replace_roman_numerals,
|
|
61
|
+
:replace_archaic_hirakatas, :replace_small_hirakatas,
|
|
62
|
+
:convert_historical_hirakatas,
|
|
60
63
|
:combine_decomposed_hiraganas_and_katakanas, :to_fullwidth, :to_halfwidth,
|
|
61
64
|
:remove_ivs_svs, :charset
|
|
62
65
|
|
|
@@ -161,6 +164,8 @@ module Yosina
|
|
|
161
164
|
replace_ideographic_annotations: false, replace_radicals: false,
|
|
162
165
|
replace_spaces: false, replace_hyphens: false,
|
|
163
166
|
replace_mathematical_alphanumerics: false, replace_roman_numerals: false,
|
|
167
|
+
replace_archaic_hirakatas: false, replace_small_hirakatas: false,
|
|
168
|
+
convert_historical_hirakatas: nil,
|
|
164
169
|
combine_decomposed_hiraganas_and_katakanas: false,
|
|
165
170
|
to_fullwidth: false, to_halfwidth: false, remove_ivs_svs: false,
|
|
166
171
|
charset: 'unijis_2004')
|
|
@@ -176,6 +181,9 @@ module Yosina
|
|
|
176
181
|
@replace_hyphens = replace_hyphens
|
|
177
182
|
@replace_mathematical_alphanumerics = replace_mathematical_alphanumerics
|
|
178
183
|
@replace_roman_numerals = replace_roman_numerals
|
|
184
|
+
@replace_archaic_hirakatas = replace_archaic_hirakatas
|
|
185
|
+
@replace_small_hirakatas = replace_small_hirakatas
|
|
186
|
+
@convert_historical_hirakatas = convert_historical_hirakatas
|
|
179
187
|
@combine_decomposed_hiraganas_and_katakanas = combine_decomposed_hiraganas_and_katakanas
|
|
180
188
|
@to_fullwidth = to_fullwidth
|
|
181
189
|
@to_halfwidth = to_halfwidth
|
|
@@ -208,6 +216,9 @@ module Yosina
|
|
|
208
216
|
ctx = apply_replace_hyphens(ctx)
|
|
209
217
|
ctx = apply_replace_mathematical_alphanumerics(ctx)
|
|
210
218
|
ctx = apply_replace_roman_numerals(ctx)
|
|
219
|
+
ctx = apply_replace_archaic_hirakatas(ctx)
|
|
220
|
+
ctx = apply_replace_small_hirakatas(ctx)
|
|
221
|
+
ctx = apply_convert_historical_hirakatas(ctx)
|
|
211
222
|
ctx = apply_combine_decomposed_hiraganas_and_katakanas(ctx)
|
|
212
223
|
ctx = apply_to_fullwidth(ctx)
|
|
213
224
|
ctx = apply_hira_kata(ctx)
|
|
@@ -329,6 +340,35 @@ module Yosina
|
|
|
329
340
|
end
|
|
330
341
|
end
|
|
331
342
|
|
|
343
|
+
# Adds the archaic-kana (hentaigana) transliterator to the chain when the
# recipe's replace_archaic_hirakatas flag is set.
#
# @param ctx [Object] chain-building context; must respond to insert_middle
# @return [Object] the result of ctx.insert_middle when enabled, otherwise ctx itself
def apply_replace_archaic_hirakatas(ctx)
  return ctx unless @replace_archaic_hirakatas

  ctx.insert_middle([:archaic_hirakatas, {}])
end
|
|
350
|
+
|
|
351
|
+
# Adds the small-kana transliterator to the chain when the recipe's
# replace_small_hirakatas flag is set.
#
# @param ctx [Object] chain-building context; must respond to insert_middle
# @return [Object] the result of ctx.insert_middle when enabled, otherwise ctx itself
def apply_replace_small_hirakatas(ctx)
  return ctx unless @replace_small_hirakatas

  ctx.insert_middle([:small_hirakatas, {}])
end
|
|
358
|
+
|
|
359
|
+
# Adds the historical-kana transliterator to the chain when
# convert_historical_hirakatas holds a mode.
#
# The mode is passed through unchanged for :hiraganas and :katakanas.
# Voiced katakana are decomposed only when the mode is "decompose";
# every other mode maps to "skip". The mode is compared via to_s so that a
# Symbol (:decompose) behaves the same as the String ('decompose').
#
# @param ctx [Object] chain-building context; must respond to insert_middle
# @return [Object] the result of ctx.insert_middle when a mode is set, otherwise ctx itself
def apply_convert_historical_hirakatas(ctx)
  mode = @convert_historical_hirakatas
  return ctx unless mode

  voiced_mode = mode.to_s == 'decompose' ? 'decompose' : 'skip'
  ctx.insert_middle(
    [:historical_hirakatas, { hiraganas: mode, katakanas: mode, voiced_katakanas: voiced_mode }]
  )
end
|
|
371
|
+
|
|
332
372
|
def apply_combine_decomposed_hiraganas_and_katakanas(ctx)
|
|
333
373
|
if @combine_decomposed_hiraganas_and_katakanas
|
|
334
374
|
ctx.insert_head([:hira_kata_composition, { compose_non_combining_marks: true }])
|
|
@@ -373,3 +413,4 @@ module Yosina
|
|
|
373
413
|
recipe.build_transliterator_configs
|
|
374
414
|
end
|
|
375
415
|
end
|
|
416
|
+
# rubocop:enable Metrics/ClassLength
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Replaces archaic kana (hentaigana) with their modern equivalents.
|
|
6
|
+
module ArchaicHirakatas
|
|
7
|
+
# Generated mapping data from archaic_hirakatas.json
#
# Each row is [first, last, modern]: every code point in first..last maps to
# the single modern kana code point `modern`. Rows are listed in ascending
# source order, so the resulting Hash keeps keys ordered U+1B000..U+1B122.
ARCHAIC_HIRAKATAS_MAPPINGS = [
  [0x1b000, 0x1b000, 0x30a8],
  [0x1b001, 0x1b001, 0x3048],
  [0x1b002, 0x1b005, 0x3042],
  [0x1b006, 0x1b009, 0x3044],
  [0x1b00a, 0x1b00e, 0x3046],
  [0x1b00f, 0x1b013, 0x3048],
  [0x1b014, 0x1b016, 0x304a],
  [0x1b017, 0x1b022, 0x304b],
  [0x1b023, 0x1b02a, 0x304d],
  [0x1b02b, 0x1b031, 0x304f],
  [0x1b032, 0x1b037, 0x3051],
  [0x1b038, 0x1b03b, 0x3053],
  [0x1b03c, 0x1b043, 0x3055],
  [0x1b044, 0x1b049, 0x3057],
  [0x1b04a, 0x1b051, 0x3059],
  [0x1b052, 0x1b056, 0x305b],
  [0x1b057, 0x1b05d, 0x305d],
  [0x1b05e, 0x1b061, 0x305f],
  [0x1b062, 0x1b068, 0x3061],
  [0x1b069, 0x1b06d, 0x3064],
  [0x1b06e, 0x1b076, 0x3066],
  [0x1b077, 0x1b07d, 0x3068],
  [0x1b07e, 0x1b086, 0x306a],
  [0x1b087, 0x1b08e, 0x306b],
  [0x1b08f, 0x1b091, 0x306c],
  [0x1b092, 0x1b098, 0x306d],
  [0x1b099, 0x1b09d, 0x306e],
  [0x1b09e, 0x1b0a8, 0x306f],
  [0x1b0a9, 0x1b0af, 0x3072],
  [0x1b0b0, 0x1b0b2, 0x3075],
  [0x1b0b3, 0x1b0b9, 0x3078],
  [0x1b0ba, 0x1b0c1, 0x307b],
  [0x1b0c2, 0x1b0c8, 0x307e],
  [0x1b0c9, 0x1b0cf, 0x307f],
  [0x1b0d0, 0x1b0d3, 0x3080],
  [0x1b0d4, 0x1b0d6, 0x3081],
  [0x1b0d7, 0x1b0dc, 0x3082],
  [0x1b0dd, 0x1b0e2, 0x3084],
  [0x1b0e3, 0x1b0e6, 0x3086],
  [0x1b0e7, 0x1b0ec, 0x3088],
  [0x1b0ed, 0x1b0f0, 0x3089],
  [0x1b0f1, 0x1b0f7, 0x308a],
  [0x1b0f8, 0x1b0fd, 0x308b],
  [0x1b0fe, 0x1b101, 0x308c],
  [0x1b102, 0x1b107, 0x308d],
  [0x1b108, 0x1b10c, 0x308f],
  [0x1b10d, 0x1b111, 0x3090],
  [0x1b112, 0x1b115, 0x3091],
  [0x1b116, 0x1b11c, 0x3092],
  [0x1b11d, 0x1b11e, 0x3093],
  [0x1b11f, 0x1b11f, 0x3046],
  [0x1b120, 0x1b120, 0x30a4],
  [0x1b121, 0x1b121, 0x30a8],
  [0x1b122, 0x1b122, 0x30a6]
].each_with_object({}) do |(first, last, modern), table|
  # pack('U') yields a UTF-8 string for the code point, matching "\u{...}" literals
  target = [modern].pack('U').freeze
  (first..last).each { |codepoint| table[[codepoint].pack('U')] = target }
end.freeze
|
|
301
|
+
|
|
302
|
+
# Transliterator that rewrites characters found in ARCHAIC_HIRAKATAS_MAPPINGS
class Transliterator < Yosina::BaseTransliterator
  # Build the transliterator.
  #
  # @param _options [Hash] Configuration options (accepted but currently unused)
  def initialize(_options = {})
    # Nothing to configure for archaic_hirakatas; only the base class is initialized
    super()
  end

  # Replaces archaic kana (hentaigana) with their modern equivalents.
  #
  # Offsets of the emitted characters are recomputed from the lengths of the
  # characters emitted so far.
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    position = 0

    converted = input_chars.filter_map do |source_char|
      mapped = ARCHAIC_HIRAKATAS_MAPPINGS[source_char.c]
      # An empty replacement string means the character is removed
      next if mapped&.empty?

      emitted = if mapped
                  Char.new(c: mapped, offset: position, source: source_char)
                else
                  source_char.with_offset(position)
                end
      position += emitted.c.length
      emitted
    end

    # Mix Yosina::Chars into this one result object only
    converted.singleton_class.include(Yosina::Chars)
    converted
  end
end
|
|
340
|
+
|
|
341
|
+
# Factory entry point: builds an archaic_hirakatas transliterator
#
# @param options [Hash] Configuration options, forwarded to Transliterator.new
# @return [Transliterator] A new archaic_hirakatas transliterator instance
def self.call(options = {})
  Transliterator.new(options)
end
|
|
348
|
+
end
|
|
349
|
+
end
|
|
350
|
+
end
|
|
@@ -79,7 +79,11 @@ module Yosina
|
|
|
79
79
|
['ょ', 'ョ', 'ョ'],
|
|
80
80
|
['ゎ', 'ヮ', nil],
|
|
81
81
|
['ゕ', 'ヵ', nil],
|
|
82
|
-
['ゖ', 'ヶ', nil]
|
|
82
|
+
['ゖ', 'ヶ', nil],
|
|
83
|
+
["\u{1B132}", "\u{1B155}", nil],
|
|
84
|
+
["\u{1B150}", "\u{1B164}", nil],
|
|
85
|
+
["\u{1B151}", "\u{1B165}", nil],
|
|
86
|
+
["\u{1B152}", "\u{1B166}", nil]
|
|
83
87
|
].freeze
|
|
84
88
|
|
|
85
89
|
# Generate voiced character mappings
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Convert historical hiragana/katakana characters to their modern equivalents
|
|
6
|
+
module HistoricalHirakatas
|
|
7
|
+
# Historical hiragana mappings: source => { simple:, decompose: }
# The inner hashes are frozen so the whole table is immutable.
HISTORICAL_HIRAGANA_MAPPINGS = {
  "\u{3090}" => { simple: "\u{3044}", decompose: "\u{3046}\u{3043}" }.freeze, # ゐ → い / うぃ
  "\u{3091}" => { simple: "\u{3048}", decompose: "\u{3046}\u{3047}" }.freeze  # ゑ → え / うぇ
}.freeze

# Historical katakana mappings: source => { simple:, decompose: }
# The inner hashes are frozen so the whole table is immutable.
HISTORICAL_KATAKANA_MAPPINGS = {
  "\u{30F0}" => { simple: "\u{30A4}", decompose: "\u{30A6}\u{30A3}" }.freeze, # ヰ → イ / ウィ
  "\u{30F1}" => { simple: "\u{30A8}", decompose: "\u{30A6}\u{30A7}" }.freeze  # ヱ → エ / ウェ
}.freeze

# Precomposed voiced historical katakana: source => small vowel suffix.
# The transliterator emits VU followed by this suffix.
VOICED_HISTORICAL_KANA_MAPPINGS = {
  "\u{30F7}" => "\u{30A1}", # ヷ → (ヴ) ァ
  "\u{30F8}" => "\u{30A3}", # ヸ → (ヴ) ィ
  "\u{30F9}" => "\u{30A7}", # ヹ → (ヴ) ェ
  "\u{30FA}" => "\u{30A9}"  # ヺ → (ヴ) ォ
}.freeze

# Decomposed voiced historical katakana: unvoiced base => small vowel suffix.
# Keys are the base characters; they only count as voiced when the next
# character is COMBINING_DAKUTEN (e.g. ワ + U+3099 is the decomposed form of ヷ).
VOICED_HISTORICAL_KANA_DECOMPOSED_MAPPINGS = {
  "\u{30EF}" => "\u{30A1}", # ワ (+ dakuten = ヷ) → ァ
  "\u{30F0}" => "\u{30A3}", # ヰ (+ dakuten = ヸ) → ィ
  "\u{30F1}" => "\u{30A7}", # ヱ (+ dakuten = ヹ) → ェ
  "\u{30F2}" => "\u{30A9}"  # ヲ (+ dakuten = ヺ) → ォ
}.freeze

COMBINING_DAKUTEN = "\u{3099}" # combining voiced sound mark
VU = "\u{30F4}" # ヴ
U = "\u{30A6}" # ウ
|
|
37
|
+
|
|
38
|
+
# Transliterator for historical hiragana/katakana conversion
class Transliterator < Yosina::BaseTransliterator
  # Set up the per-script conversion modes.
  #
  # Each mode may be given as a String or a Symbol; it is stored as a Symbol.
  #
  # @param options [Hash] Configuration options
  # @option options [String] :hiraganas "simple" (default), "decompose", or "skip"
  # @option options [String] :katakanas "simple" (default), "decompose", or "skip"
  # @option options [String] :voiced_katakanas "decompose" or "skip" (default)
  def initialize(options = {})
    super()
    mode_for = ->(key, fallback) { (options[key] || fallback).to_sym }
    @hiraganas = mode_for.call(:hiraganas, :simple)
    @katakanas = mode_for.call(:katakanas, :simple)
    @voiced_katakanas = mode_for.call(:voiced_katakanas, :skip)
  end

  # Convert historical hiragana/katakana characters to modern equivalents
  #
  # One character is always held back so that a following combining dakuten
  # can be examined together with its base before anything is emitted.
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    Chars.enum do |y|
      position = 0
      held = nil
      input_chars.each do |current|
        if current.sentinel?
          # Flush whatever is held, then forward the sentinel and stop.
          # NOTE(review): the sentinel is forwarded with its incoming offset,
          # not re-based to `position` - confirm this is intended.
          position = emit_char(y, held, position) if held
          held = nil
          y << current
          break
        end

        if held.nil?
          held = current
        elsif current.c != COMBINING_DAKUTEN
          position = emit_char(y, held, position)
          held = current
        else
          # held + dakuten may be a decomposed voiced historical katakana
          suffix = VOICED_HISTORICAL_KANA_DECOMPOSED_MAPPINGS[held.c]
          if suffix && @voiced_katakanas != :skip
            # Emit U, the dakuten itself, then the small vowel suffix
            y << Char.new(c: U, offset: position, source: held)
            position += U.length
            y << current.with_offset(position)
            position += current.c.length
            y << Char.new(c: suffix, offset: position, source: held)
            position += suffix.length
            held = nil
          else
            # The base is passed through untouched (no mapping is applied to
            # it); the dakuten becomes the new held character
            y << held.with_offset(position)
            position += held.c.length
            held = current
          end
        end
      end
      # Flush any remaining held char
      emit_char(y, held, position) if held
    end
  end

  private

  # Emit a single char through the normal mapping logic
  #
  # Lookup order: historical hiragana, historical katakana, precomposed
  # voiced historical katakana; a table is skipped when its mode disables it.
  # A character that matches none is emitted unchanged at the given offset.
  #
  # @param y [Enumerator::Yielder] The yielder
  # @param char [Char] The character to emit
  # @param offset [Integer] The current offset
  # @return [Integer] The new offset after emitting
  # rubocop:disable Naming/MethodParameterName
  def emit_char(y, char, offset)
    pieces =
      if @hiraganas != :skip && HISTORICAL_HIRAGANA_MAPPINGS.key?(char.c)
        [HISTORICAL_HIRAGANA_MAPPINGS[char.c][@hiraganas]]
      elsif @katakanas != :skip && HISTORICAL_KATAKANA_MAPPINGS.key?(char.c)
        [HISTORICAL_KATAKANA_MAPPINGS[char.c][@katakanas]]
      elsif @voiced_katakanas == :decompose && VOICED_HISTORICAL_KANA_MAPPINGS.key?(char.c)
        [VU, VOICED_HISTORICAL_KANA_MAPPINGS[char.c]]
      end

    if pieces.nil?
      y << char.with_offset(offset)
      return offset + char.c.length
    end

    pieces.each do |text|
      y << Char.new(c: text, offset: offset, source: char)
      offset += text.length
    end
    offset
  end
end
|
|
142
|
+
# rubocop:enable Naming/MethodParameterName
|
|
143
|
+
|
|
144
|
+
# Factory entry point: builds a historical hirakatas transliterator
#
# @param options [Hash] Configuration options, forwarded to Transliterator.new
# @return [Transliterator] A new historical hirakatas transliterator instance
def self.call(options = {})
  Transliterator.new(options)
end
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Replaces small hiragana/katakana with their ordinary-sized equivalents.
|
|
6
|
+
module SmallHirakatas
|
|
7
|
+
# Generated mapping data from small_hirakatas.json
#
# Each row is [small, ordinary] as code points; rows keep the generated order.
SMALL_HIRAKATAS_MAPPINGS = [
  # Hiragana block
  [0x3041, 0x3042], [0x3043, 0x3044], [0x3045, 0x3046], [0x3047, 0x3048],
  [0x3049, 0x304a], [0x3063, 0x3064], [0x3083, 0x3084], [0x3085, 0x3086],
  [0x3087, 0x3088], [0x308e, 0x308f], [0x3095, 0x304b], [0x3096, 0x3051],
  # Katakana block
  [0x30a1, 0x30a2], [0x30a3, 0x30a4], [0x30a5, 0x30a6], [0x30a7, 0x30a8],
  [0x30a9, 0x30aa], [0x30c3, 0x30c4], [0x30e3, 0x30e4], [0x30e5, 0x30e6],
  [0x30e7, 0x30e8], [0x30ee, 0x30ef], [0x30f5, 0x30ab], [0x30f6, 0x30b1],
  # Katakana Phonetic Extensions (U+31F0..U+31FF)
  [0x31f0, 0x30af], [0x31f1, 0x30b7], [0x31f2, 0x30b9], [0x31f3, 0x30c8],
  [0x31f4, 0x30cc], [0x31f5, 0x30cf], [0x31f6, 0x30d2], [0x31f7, 0x30d5],
  [0x31f8, 0x30d8], [0x31f9, 0x30db], [0x31fa, 0x30e0], [0x31fb, 0x30e9],
  [0x31fc, 0x30ea], [0x31fd, 0x30eb], [0x31fe, 0x30ec], [0x31ff, 0x30ed],
  # Halfwidth katakana (targets stay halfwidth)
  [0xff67, 0xff71], [0xff68, 0xff72], [0xff69, 0xff73], [0xff6a, 0xff74],
  [0xff6b, 0xff75], [0xff6c, 0xff94], [0xff6d, 0xff95], [0xff6e, 0xff96],
  [0xff6f, 0xff82],
  # Small Kana Extension (U+1B130..U+1B16F)
  [0x1b132, 0x3053], [0x1b150, 0x3090], [0x1b151, 0x3091], [0x1b152, 0x3092],
  [0x1b155, 0x30b3], [0x1b164, 0x30f0], [0x1b165, 0x30f1], [0x1b166, 0x30f2],
  [0x1b167, 0x30f3]
].to_h do |small, ordinary|
  # pack('U') yields a UTF-8 string for the code point, matching "\u{...}" literals
  [[small].pack('U'), [ordinary].pack('U').freeze]
end.freeze
|
|
68
|
+
|
|
69
|
+
# Transliterator that rewrites characters found in SMALL_HIRAKATAS_MAPPINGS
class Transliterator < Yosina::BaseTransliterator
  # Build the transliterator.
  #
  # @param _options [Hash] Configuration options (accepted but currently unused)
  def initialize(_options = {})
    # Nothing to configure for small_hirakatas; only the base class is initialized
    super()
  end

  # Replaces small hiragana/katakana with their ordinary-sized equivalents.
  #
  # Offsets of the emitted characters are recomputed from the lengths of the
  # characters emitted so far.
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    position = 0

    converted = input_chars.filter_map do |source_char|
      mapped = SMALL_HIRAKATAS_MAPPINGS[source_char.c]
      # An empty replacement string means the character is removed
      next if mapped&.empty?

      emitted = if mapped
                  Char.new(c: mapped, offset: position, source: source_char)
                else
                  source_char.with_offset(position)
                end
      position += emitted.c.length
      emitted
    end

    # Mix Yosina::Chars into this one result object only
    converted.singleton_class.include(Yosina::Chars)
    converted
  end
end
|
|
107
|
+
|
|
108
|
+
# Factory entry point: builds a small_hirakatas transliterator.
#
# @param options [Hash] Configuration options, forwarded unchanged to
#   Transliterator.new
# @return [Transliterator] A freshly constructed transliterator instance
def self.call(options = {})
  Transliterator.new(options)
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
@@ -15,6 +15,9 @@ require_relative 'transliterators/jisx0201_and_alike'
|
|
|
15
15
|
require_relative 'transliterators/circled_or_squared'
|
|
16
16
|
require_relative 'transliterators/combined'
|
|
17
17
|
require_relative 'transliterators/japanese_iteration_marks'
|
|
18
|
+
require_relative 'transliterators/archaic_hirakatas'
|
|
19
|
+
require_relative 'transliterators/small_hirakatas'
|
|
20
|
+
require_relative 'transliterators/historical_hirakatas'
|
|
18
21
|
|
|
19
22
|
module Yosina
|
|
20
23
|
# Registry for transliterator factories
|
|
@@ -34,7 +37,10 @@ module Yosina
|
|
|
34
37
|
jisx0201_and_alike: Transliterators::Jisx0201AndAlike,
|
|
35
38
|
combined: Transliterators::Combined,
|
|
36
39
|
circled_or_squared: CircledOrSquared,
|
|
37
|
-
japanese_iteration_marks: Transliterators::JapaneseIterationMarks
|
|
40
|
+
japanese_iteration_marks: Transliterators::JapaneseIterationMarks,
|
|
41
|
+
archaic_hirakatas: Transliterators::ArchaicHirakatas,
|
|
42
|
+
small_hirakatas: Transliterators::SmallHirakatas,
|
|
43
|
+
historical_hirakatas: Transliterators::HistoricalHirakatas
|
|
38
44
|
}.freeze
|
|
39
45
|
|
|
40
46
|
# Get a transliterator factory by name
|
data/lib/yosina/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: yosina
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.0
|
|
4
|
+
version: 1.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Moriyoshi Koizumi
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-03-19 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: minitest
|
|
@@ -109,6 +109,7 @@ files:
|
|
|
109
109
|
- lib/yosina/recipes.rb
|
|
110
110
|
- lib/yosina/transliterator.rb
|
|
111
111
|
- lib/yosina/transliterators.rb
|
|
112
|
+
- lib/yosina/transliterators/archaic_hirakatas.rb
|
|
112
113
|
- lib/yosina/transliterators/circled_or_squared.rb
|
|
113
114
|
- lib/yosina/transliterators/circled_or_squared_data.rb
|
|
114
115
|
- lib/yosina/transliterators/combined.rb
|
|
@@ -116,6 +117,7 @@ files:
|
|
|
116
117
|
- lib/yosina/transliterators/hira_kata.rb
|
|
117
118
|
- lib/yosina/transliterators/hira_kata_composition.rb
|
|
118
119
|
- lib/yosina/transliterators/hira_kata_table.rb
|
|
120
|
+
- lib/yosina/transliterators/historical_hirakatas.rb
|
|
119
121
|
- lib/yosina/transliterators/hyphens.rb
|
|
120
122
|
- lib/yosina/transliterators/hyphens_data.rb
|
|
121
123
|
- lib/yosina/transliterators/ideographic_annotations.rb
|
|
@@ -129,6 +131,7 @@ files:
|
|
|
129
131
|
- lib/yosina/transliterators/radicals.rb
|
|
130
132
|
- lib/yosina/transliterators/roman_numerals.rb
|
|
131
133
|
- lib/yosina/transliterators/roman_numerals_data.rb
|
|
134
|
+
- lib/yosina/transliterators/small_hirakatas.rb
|
|
132
135
|
- lib/yosina/transliterators/spaces.rb
|
|
133
136
|
- lib/yosina/version.rb
|
|
134
137
|
- yosina.gemspec
|