yosina 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +36 -0
- data/Gemfile +6 -0
- data/README.ja.md +229 -0
- data/README.md +229 -0
- data/Rakefile +30 -0
- data/codegen/dataset.rb +215 -0
- data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
- data/codegen/emitters/combined_transliterator_data.rb +28 -0
- data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
- data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
- data/codegen/emitters/simple_transliterator.rb +76 -0
- data/codegen/emitters/utils.rb +45 -0
- data/codegen/emitters.rb +8 -0
- data/codegen/main.rb +109 -0
- data/lib/yosina/char.rb +65 -0
- data/lib/yosina/chars.rb +152 -0
- data/lib/yosina/recipes.rb +359 -0
- data/lib/yosina/transliterator.rb +49 -0
- data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
- data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
- data/lib/yosina/transliterators/combined.rb +52 -0
- data/lib/yosina/transliterators/combined_data.rb +495 -0
- data/lib/yosina/transliterators/hira_kata.rb +106 -0
- data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
- data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
- data/lib/yosina/transliterators/hyphens.rb +83 -0
- data/lib/yosina/transliterators/hyphens_data.rb +60 -0
- data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
- data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
- data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
- data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
- data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
- data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
- data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
- data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
- data/lib/yosina/transliterators/radicals.rb +361 -0
- data/lib/yosina/transliterators/spaces.rb +79 -0
- data/lib/yosina/transliterators.rb +57 -0
- data/lib/yosina/version.rb +5 -0
- data/lib/yosina.rb +62 -0
- data/yosina.gemspec +41 -0
- metadata +159 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Handle Japanese iteration marks transliterator
|
|
6
|
+
module JapaneseIterationMarks
|
|
7
|
+
# Iteration mark characters, one constant per code point.
# They are spelled as escapes so the exact code point is visible in the source.

# Horizontal kana marks
HIRAGANA_ITERATION_MARK        = "\u{309d}" # ゝ (U+309D)
HIRAGANA_VOICED_ITERATION_MARK = "\u{309e}" # ゞ (U+309E)
KATAKANA_ITERATION_MARK        = "\u{30fd}" # ヽ (U+30FD)
KATAKANA_VOICED_ITERATION_MARK = "\u{30fe}" # ヾ (U+30FE)

# Vertical-writing marks, handled like their horizontal counterparts
VERTICAL_HIRAGANA_ITERATION_MARK        = "\u{3031}" # 〱 (U+3031)
VERTICAL_HIRAGANA_VOICED_ITERATION_MARK = "\u{3032}" # 〲 (U+3032)
VERTICAL_KATAKANA_ITERATION_MARK        = "\u{3033}" # 〳 (U+3033)
VERTICAL_KATAKANA_VOICED_ITERATION_MARK = "\u{3034}" # 〴 (U+3034)

# Kanji mark
KANJI_ITERATION_MARK = "\u{3005}" # 々 (U+3005)
|
|
17
|
+
|
|
18
|
+
# Mix-in with the character classification predicates used when deciding
# whether (and how) an iteration mark can be expanded.
#
# Predicates taking +char_code+ expect an Integer code point; predicates
# taking +char+ expect the character itself as a String.
module CharType
  # Code points of the moraic nasal: hiragana, katakana and halfwidth katakana "n"
  HATSUON_CODES = [0x3093, 0x30f3, 0xff9d].freeze

  # Code points of the small "tsu": hiragana, katakana and halfwidth katakana
  SOKUON_CODES = [0x3063, 0x30c3, 0xff6f].freeze

  # Kana carrying a handakuten (hiragana first, then katakana)
  SEMI_VOICED_CHARS = %w[ぱ ぴ ぷ ぺ ぽ パ ピ プ ペ ポ].freeze

  # Code point ranges treated as kanji
  KANJI_RANGES = [
    (0x4e00..0x9fff),   # CJK Unified Ideographs
    (0x3400..0x4dbf),   # Extension A
    (0x20000..0x2a6df), # Extension B
    (0x2a700..0x2b73f), # Extension C
    (0x2b740..0x2b81f), # Extension D
    (0x2b820..0x2ceaf), # Extension E
    (0x2ceb0..0x2ebef), # Extension F
    (0x30000..0x3134f)  # Extension G
  ].freeze

  # Check if character is hiragana.
  #
  # The range U+3041..U+3096 includes the small kana (its first and last
  # code points are small forms) and excludes the iteration marks ゝ/ゞ.
  #
  # @param char_code [Integer] Code point to test
  # @return [Boolean]
  def hiragana?(char_code)
    (0x3041..0x3096).cover?(char_code)
  end

  # Check if character is katakana, fullwidth (U+30A1..U+30FA) or
  # halfwidth (U+FF66..U+FF9F).
  #
  # The halfwidth range also contains the halfwidth prolonged sound mark
  # (U+FF70) and the halfwidth (semi-)voiced sound marks (U+FF9E, U+FF9F).
  #
  # @param char_code [Integer] Code point to test
  # @return [Boolean]
  def katakana?(char_code)
    (0x30a1..0x30fa).cover?(char_code) || (0xff66..0xff9f).cover?(char_code)
  end

  # Check if character is kanji (any range listed in KANJI_RANGES)
  #
  # @param char_code [Integer] Code point to test
  # @return [Boolean]
  def kanji?(char_code)
    KANJI_RANGES.any? { |range| range.cover?(char_code) }
  end

  # Check if character is hatsuon (ん/ン, including the halfwidth form)
  #
  # @param char_code [Integer] Code point to test
  # @return [Boolean]
  def hatsuon?(char_code)
    HATSUON_CODES.include?(char_code)
  end

  # Check if character is sokuon (っ/ッ, including the halfwidth form)
  #
  # @param char_code [Integer] Code point to test
  # @return [Boolean]
  def sokuon?(char_code)
    SOKUON_CODES.include?(char_code)
  end

  # Check if character is one of the nine iteration marks
  #
  # @param char [String] Character to test
  # @return [Boolean]
  def iteration_mark?(char)
    # The mark constants live in the enclosing module and are resolved when
    # the method runs; a case avoids building a fresh array on every call.
    case char
    when HIRAGANA_ITERATION_MARK, HIRAGANA_VOICED_ITERATION_MARK,
         VERTICAL_HIRAGANA_ITERATION_MARK, VERTICAL_HIRAGANA_VOICED_ITERATION_MARK,
         KATAKANA_ITERATION_MARK, KATAKANA_VOICED_ITERATION_MARK,
         VERTICAL_KATAKANA_ITERATION_MARK, VERTICAL_KATAKANA_VOICED_ITERATION_MARK,
         KANJI_ITERATION_MARK
      true
    else
      false
    end
  end

  # Check if character is voiced (has dakuten), i.e. is listed in VOICED_CHARS
  #
  # @param char [String] Character to test
  # @return [Boolean]
  def voiced?(char)
    VOICED_CHARS.include?(char)
  end

  # Check if character is semi-voiced (has handakuten)
  #
  # @param char [String] Character to test
  # @return [Boolean]
  def semi_voiced?(char)
    SEMI_VOICED_CHARS.include?(char)
  end
end
|
|
82
|
+
|
|
83
|
+
# Dakuten table for hiragana: unvoiced kana => the same kana with dakuten.
# Kana without an entry here cannot be voiced by a voiced iteration mark.
#
# NOTE(review): KATAKANA_VOICING has an entry for the katakana "u" (=> "vu"),
# but there is no hiragana counterpart here - confirm this is intentional.
HIRAGANA_VOICING = {
  # k => g
  'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご',
  # s => z
  'さ' => 'ざ', 'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ',
  # t => d
  'た' => 'だ', 'ち' => 'ぢ', 'つ' => 'づ', 'て' => 'で', 'と' => 'ど',
  # h => b
  'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ', 'へ' => 'べ', 'ほ' => 'ぼ'
}.freeze
|
|
90
|
+
|
|
91
|
+
# Voicing mappings for katakana: unvoiced kana => the same kana with dakuten.
#
# Fullwidth entries map one character to one precomposed character.
# Halfwidth katakana have no precomposed voiced forms, so each halfwidth
# value is two characters: the base kana followed by the halfwidth voiced
# sound mark (U+FF9E).
#
# The halfwidth entries are written as \u escapes on purpose: if the source
# text is ever passed through compatibility normalization, literal halfwidth
# kana fold into their fullwidth forms, which silently turns this section
# into duplicates of the fullwidth keys above and leaves halfwidth kana
# without any voicing entry.
KATAKANA_VOICING = {
  'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ', 'ケ' => 'ゲ', 'コ' => 'ゴ',
  'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ', 'ソ' => 'ゾ',
  'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド',
  'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ',
  'ウ' => 'ヴ',
  # Halfwidth katakana
  "\u{ff76}" => "\u{ff76}\u{ff9e}", # ka => ga
  "\u{ff77}" => "\u{ff77}\u{ff9e}", # ki => gi
  "\u{ff78}" => "\u{ff78}\u{ff9e}", # ku => gu
  "\u{ff79}" => "\u{ff79}\u{ff9e}", # ke => ge
  "\u{ff7a}" => "\u{ff7a}\u{ff9e}", # ko => go
  "\u{ff7b}" => "\u{ff7b}\u{ff9e}", # sa => za
  "\u{ff7c}" => "\u{ff7c}\u{ff9e}", # shi => ji
  "\u{ff7d}" => "\u{ff7d}\u{ff9e}", # su => zu
  "\u{ff7e}" => "\u{ff7e}\u{ff9e}", # se => ze
  "\u{ff7f}" => "\u{ff7f}\u{ff9e}", # so => zo
  "\u{ff80}" => "\u{ff80}\u{ff9e}", # ta => da
  "\u{ff81}" => "\u{ff81}\u{ff9e}", # chi => di
  "\u{ff82}" => "\u{ff82}\u{ff9e}", # tsu => du
  "\u{ff83}" => "\u{ff83}\u{ff9e}", # te => de
  "\u{ff84}" => "\u{ff84}\u{ff9e}", # to => do
  "\u{ff8a}" => "\u{ff8a}\u{ff9e}", # ha => ba
  "\u{ff8b}" => "\u{ff8b}\u{ff9e}", # hi => bi
  "\u{ff8c}" => "\u{ff8c}\u{ff9e}", # fu => bu
  "\u{ff8d}" => "\u{ff8d}\u{ff9e}", # he => be
  "\u{ff8e}" => "\u{ff8e}\u{ff9e}", # ho => bo
  "\u{ff73}" => "\u{ff73}\u{ff9e}"  # u => vu
}.freeze
|
|
105
|
+
|
|
106
|
+
# Every voiced form the voicing tables can produce: the hiragana values
# first, followed by the katakana values
VOICED_CHARS = [*HIRAGANA_VOICING.values, *KATAKANA_VOICING.values].freeze

# Inverse lookups (voiced form => unvoiced kana), derived from the voicing
# tables so the two directions cannot drift apart
HIRAGANA_UNVOICING = HIRAGANA_VOICING.invert.freeze
KATAKANA_UNVOICING = KATAKANA_VOICING.invert.freeze
|
|
112
|
+
|
|
113
|
+
# Transliterator for Japanese iteration marks.
#
# Each iteration mark is replaced by the character it repeats, based on the
# closest preceding non-empty character:
#
# * an unvoiced kana mark repeats the kana as is, or its unvoiced form when
#   the kana is voiced;
# * a voiced kana mark repeats the voiced form of the kana (when the voicing
#   table has one), or the kana itself when it is already voiced;
# * the kanji mark repeats a preceding kanji.
#
# A mark is emitted unchanged when it directly follows another mark, when
# the script of the mark and of the preceding character differ, or when the
# preceding character is not repeatable (hatsuon, sokuon, semi-voiced kana,
# or anything that is not hiragana, katakana or kanji).
class Transliterator < Yosina::BaseTransliterator
  include CharType

  # Initialize the transliterator with options
  #
  # @param _options [Hash] Configuration options (currently unused but kept for consistency)
  def initialize(_options = {})
    super()
  end

  # Replace iteration marks with appropriate repeated characters
  #
  # Emitted characters are re-numbered with a running offset that advances
  # by the length of each emitted string. Empty (sentinel) characters are
  # passed through as they are and do not touch any state.
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    offset = 0
    prev_char_info = nil
    prev_was_iteration_mark = false

    Chars.enum do |y|
      input_chars.each do |char|
        # Skip empty/sentinel characters
        if char.c.empty?
          y << char
          next
        end

        current_char = char.c
        is_mark = iteration_mark?(current_char)

        # A mark that directly follows another mark is never expanded
        replacement = nil
        replacement = replacement_for(current_char, prev_char_info) if is_mark && !prev_was_iteration_mark

        if replacement
          y << Char.new(c: replacement, offset: offset, source: char)
          offset += replacement.length
        else
          y << char.with_offset(offset)
          offset += current_char.length
        end

        # A mark leaves prev_char_info alone. The kept value is never
        # consulted again: a directly following mark is not expanded, and the
        # next non-mark character overwrites it.
        prev_char_info = repeatable_info(current_char) unless is_mark
        prev_was_iteration_mark = is_mark
      end
    end
  end

  private

  # Work out what an iteration mark stands for
  #
  # @param mark [String] The iteration mark being processed
  # @param prev_info [Hash, nil] +{ char:, type: }+ of the preceding repeatable
  #   character, or nil when there is none
  # @return [String, nil] The replacement text, or nil when the mark must be
  #   left unchanged
  def replacement_for(mark, prev_info)
    return nil unless prev_info

    prev_char = prev_info[:char]
    prev_type = prev_info[:type]

    case mark
    when HIRAGANA_ITERATION_MARK, VERTICAL_HIRAGANA_ITERATION_MARK
      case prev_type
      when :hiragana then prev_char
      # Voiced character with unvoiced mark: unvoice it
      when :hiragana_voiced then HIRAGANA_UNVOICING[prev_char]
      end
    when HIRAGANA_VOICED_ITERATION_MARK, VERTICAL_HIRAGANA_VOICED_ITERATION_MARK
      case prev_type
      # nil when the kana has no voiced form
      when :hiragana then HIRAGANA_VOICING[prev_char]
      # Voiced character with voiced mark: keep it voiced
      when :hiragana_voiced then prev_char
      end
    when KATAKANA_ITERATION_MARK, VERTICAL_KATAKANA_ITERATION_MARK
      case prev_type
      when :katakana then prev_char
      # Voiced character with unvoiced mark: unvoice it
      when :katakana_voiced then KATAKANA_UNVOICING[prev_char]
      end
    when KATAKANA_VOICED_ITERATION_MARK, VERTICAL_KATAKANA_VOICED_ITERATION_MARK
      case prev_type
      # nil when the kana has no voiced form
      when :katakana then KATAKANA_VOICING[prev_char]
      # Voiced character with voiced mark: keep it voiced
      when :katakana_voiced then prev_char
      end
    when KANJI_ITERATION_MARK
      prev_char if prev_type == :kanji
    end
  end

  # Describe a non-mark character for a later iteration mark
  #
  # @param char [String] The character just emitted
  # @return [Hash, nil] +{ char:, type: }+ when the character is repeatable,
  #   nil otherwise
  def repeatable_info(char)
    char_type = get_char_type(char, char.ord)
    { char: char, type: char_type } if char_type && char_type != :other
  end

  # Get the character type
  #
  # @param char [String] The character
  # @param char_code [Integer] Code point of the character
  # @return [Symbol] One of :hiragana, :hiragana_voiced, :katakana,
  #   :katakana_voiced, :kanji or :other
  def get_char_type(char, char_code)
    # Check special cases first: these kana are never repeated by a mark
    return :other if hatsuon?(char_code) || sokuon?(char_code)
    return :other if semi_voiced?(char)

    # Check if it's a voiced character
    if voiced?(char)
      return :hiragana_voiced if hiragana?(char_code)
      return :katakana_voiced if katakana?(char_code)
    end

    # Check character type
    return :hiragana if hiragana?(char_code)
    return :katakana if katakana?(char_code)
    return :kanji if kanji?(char_code)

    :other
  end
end
|
|
251
|
+
|
|
252
|
+
# Build a transliterator that expands Japanese iteration marks
#
# @param options [Hash] Configuration options, handed to Transliterator.new
#   (which currently ignores them)
# @return [Transliterator] A newly constructed transliterator
def self.call(options = {})
  Transliterator.new(options)
end
|
|
259
|
+
end
|
|
260
|
+
end
|
|
261
|
+
end
|