yosina 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +36 -0
  3. data/Gemfile +6 -0
  4. data/README.ja.md +229 -0
  5. data/README.md +229 -0
  6. data/Rakefile +30 -0
  7. data/codegen/dataset.rb +215 -0
  8. data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
  9. data/codegen/emitters/combined_transliterator_data.rb +28 -0
  10. data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
  11. data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
  12. data/codegen/emitters/simple_transliterator.rb +76 -0
  13. data/codegen/emitters/utils.rb +45 -0
  14. data/codegen/emitters.rb +8 -0
  15. data/codegen/main.rb +109 -0
  16. data/lib/yosina/char.rb +65 -0
  17. data/lib/yosina/chars.rb +152 -0
  18. data/lib/yosina/recipes.rb +359 -0
  19. data/lib/yosina/transliterator.rb +49 -0
  20. data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
  21. data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
  22. data/lib/yosina/transliterators/combined.rb +52 -0
  23. data/lib/yosina/transliterators/combined_data.rb +495 -0
  24. data/lib/yosina/transliterators/hira_kata.rb +106 -0
  25. data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
  26. data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
  27. data/lib/yosina/transliterators/hyphens.rb +83 -0
  28. data/lib/yosina/transliterators/hyphens_data.rb +60 -0
  29. data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
  30. data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
  31. data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
  32. data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
  33. data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
  34. data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
  35. data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
  36. data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
  37. data/lib/yosina/transliterators/radicals.rb +361 -0
  38. data/lib/yosina/transliterators/spaces.rb +79 -0
  39. data/lib/yosina/transliterators.rb +57 -0
  40. data/lib/yosina/version.rb +5 -0
  41. data/lib/yosina.rb +62 -0
  42. data/yosina.gemspec +41 -0
  43. metadata +159 -0
@@ -0,0 +1,261 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Handle Japanese iteration marks transliterator
6
+ module JapaneseIterationMarks
7
# Iteration mark characters
HIRAGANA_ITERATION_MARK = "\u{309d}" # ゝ
HIRAGANA_VOICED_ITERATION_MARK = "\u{309e}" # ゞ
VERTICAL_HIRAGANA_ITERATION_MARK = "\u{3031}" # 〱
VERTICAL_HIRAGANA_VOICED_ITERATION_MARK = "\u{3032}" # 〲
KATAKANA_ITERATION_MARK = "\u{30fd}" # ヽ
KATAKANA_VOICED_ITERATION_MARK = "\u{30fe}" # ヾ
VERTICAL_KATAKANA_ITERATION_MARK = "\u{3033}" # 〳
VERTICAL_KATAKANA_VOICED_ITERATION_MARK = "\u{3034}" # 〴
KANJI_ITERATION_MARK = "\u{3005}" # 々

# Every iteration mark recognised by this transliterator. Built once and
# frozen so that membership checks do not allocate a new array for each
# input character.
ITERATION_MARKS = [
  HIRAGANA_ITERATION_MARK,
  HIRAGANA_VOICED_ITERATION_MARK,
  VERTICAL_HIRAGANA_ITERATION_MARK,
  VERTICAL_HIRAGANA_VOICED_ITERATION_MARK,
  KATAKANA_ITERATION_MARK,
  KATAKANA_VOICED_ITERATION_MARK,
  VERTICAL_KATAKANA_ITERATION_MARK,
  VERTICAL_KATAKANA_VOICED_ITERATION_MARK,
  KANJI_ITERATION_MARK
].freeze

# Mix-in for character type checks.
#
# Predicates taking +char_code+ expect an Integer code point; predicates
# taking +char+ expect the character as a String.
module CharType
  # Code points of hatsuon: ん, ン and halfwidth ﾝ
  HATSUON_CODES = [0x3093, 0x30f3, 0xff9d].freeze

  # Code points of sokuon: っ, ッ and halfwidth ｯ
  SOKUON_CODES = [0x3063, 0x30c3, 0xff6f].freeze

  # Hiragana and katakana carrying a handakuten
  SEMI_VOICED_CHARS = %w[ぱ ぴ ぷ ぺ ぽ パ ピ プ ペ ポ].freeze

  # Inclusive code point ranges treated as kanji
  KANJI_RANGES = [
    [0x4e00, 0x9fff],   # CJK Unified Ideographs
    [0x3400, 0x4dbf],   # Extension A
    [0x20000, 0x2a6df], # Extension B
    [0x2a700, 0x2b73f], # Extension C
    [0x2b740, 0x2b81f], # Extension D
    [0x2b820, 0x2ceaf], # Extension E
    [0x2ceb0, 0x2ebef], # Extension F
    [0x30000, 0x3134f]  # Extension G
  ].freeze

  # Check if the code point lies in the hiragana letter range U+3041..U+3096.
  #
  # The range is contiguous, so it also covers small kana such as the
  # sokuon っ (U+3063); callers that need to exclude those must combine this
  # with the other predicates. The iteration marks U+309D/U+309E are outside
  # the range.
  #
  # @param char_code [Integer] code point to test
  # @return [Boolean]
  def hiragana?(char_code)
    char_code.between?(0x3041, 0x3096)
  end

  # Check if the code point is katakana, either in U+30A1..U+30FA or in the
  # halfwidth range U+FF66..U+FF9F.
  #
  # @param char_code [Integer] code point to test
  # @return [Boolean]
  def katakana?(char_code)
    char_code.between?(0x30a1, 0x30fa) || char_code.between?(0xff66, 0xff9f)
  end

  # Check if the code point falls in one of the KANJI_RANGES.
  #
  # @param char_code [Integer] code point to test
  # @return [Boolean]
  def kanji?(char_code)
    KANJI_RANGES.any? { |first, last| char_code.between?(first, last) }
  end

  # Check if the code point is hatsuon (ん/ン)
  #
  # @param char_code [Integer] code point to test
  # @return [Boolean]
  def hatsuon?(char_code)
    HATSUON_CODES.include?(char_code)
  end

  # Check if the code point is sokuon (っ/ッ)
  #
  # @param char_code [Integer] code point to test
  # @return [Boolean]
  def sokuon?(char_code)
    SOKUON_CODES.include?(char_code)
  end

  # Check if the character is one of the ITERATION_MARKS
  #
  # @param char [String] character to test
  # @return [Boolean]
  def iteration_mark?(char)
    ITERATION_MARKS.include?(char)
  end

  # Check if the character is voiced (has dakuten), i.e. is listed in
  # VOICED_CHARS.
  #
  # @param char [String] character to test
  # @return [Boolean]
  def voiced?(char)
    VOICED_CHARS.include?(char)
  end

  # Check if the character is semi-voiced (has handakuten)
  #
  # @param char [String] character to test
  # @return [Boolean]
  def semi_voiced?(char)
    SEMI_VOICED_CHARS.include?(char)
  end
end
82
+
83
# Voicing mappings for hiragana (unvoiced => voiced)
HIRAGANA_VOICING = {
  'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご',
  'さ' => 'ざ', 'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ',
  'た' => 'だ', 'ち' => 'ぢ', 'つ' => 'づ', 'て' => 'で', 'と' => 'ど',
  'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ', 'へ' => 'べ', 'ほ' => 'ぼ'
}.freeze

# Voicing mappings for katakana (unvoiced => voiced).
#
# Each key appears exactly once: a repeated key in a hash literal is
# silently overwritten and makes Ruby emit a "key is duplicated" warning.
# Only fullwidth katakana are mapped, so a halfwidth katakana followed by a
# voiced iteration mark has no entry here and the lookup returns nil.
KATAKANA_VOICING = {
  'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ', 'ケ' => 'ゲ', 'コ' => 'ゴ',
  'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ', 'ソ' => 'ゾ',
  'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド',
  'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ',
  'ウ' => 'ヴ'
}.freeze

# All voiced characters, derived from the voicing mappings
VOICED_CHARS = (HIRAGANA_VOICING.values + KATAKANA_VOICING.values).freeze

# Reverse voicing mappings (voiced => unvoiced)
HIRAGANA_UNVOICING = HIRAGANA_VOICING.invert.freeze
KATAKANA_UNVOICING = KATAKANA_VOICING.invert.freeze
112
+
113
# Transliterator for Japanese iteration marks.
#
# Walks the input once, remembering the most recent repeatable character
# (hiragana, katakana or kanji), and substitutes an iteration mark that
# directly follows it with the character the mark stands for.
class Transliterator < Yosina::BaseTransliterator
  include CharType

  # Initialize the transliterator with options
  #
  # @param _options [Hash] Configuration options (currently unused but kept for consistency)
  def initialize(_options = {})
    super()
  end

  # Replace iteration marks with appropriate repeated characters
  #
  # Characters whose string is empty are passed through untouched. Only the
  # first mark of a run of iteration marks is a candidate for replacement;
  # every following mark in the run is emitted as-is. A mark that cannot be
  # resolved (no remembered character, or a type mismatch) is also emitted
  # as-is. Offsets of emitted characters are recomputed from the running
  # output length.
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    offset = 0
    prev_char_info = nil
    prev_was_iteration_mark = false

    Chars.enum do |y|
      input_chars.each do |char|
        current_char = char.c

        # Skip empty/sentinel characters
        if current_char.empty?
          y << char
          next
        end

        if iteration_mark?(current_char)
          # A mark right after another mark is never replaced.
          replacement = prev_was_iteration_mark ? nil : replacement_for(current_char, prev_char_info)

          if replacement
            y << Char.new(c: replacement, offset: offset, source: char)
            offset += replacement.length
          else
            y << char.with_offset(offset)
            offset += current_char.length
          end

          # prev_char_info is intentionally left untouched: it keeps
          # describing the last real character, not the substituted one.
          prev_was_iteration_mark = true
          next
        end

        # Not an iteration mark
        y << char.with_offset(offset)
        offset += current_char.length

        # Remember the character only if a later mark could repeat it;
        # anything else clears the memory.
        char_type = get_char_type(current_char, current_char.ord)
        prev_char_info = ({ char: current_char, type: char_type } unless char_type == :other)
        prev_was_iteration_mark = false
      end
    end
  end

  private

  # Work out what an iteration mark stands for, given the remembered
  # previous character.
  #
  # @param mark [String] the iteration mark being processed
  # @param prev_char_info [Hash, nil] +{ char:, type: }+ of the last
  #   repeatable character, or nil when there is none
  # @return [String, nil] the replacement, or nil when the mark must be kept
  def replacement_for(mark, prev_char_info)
    return nil unless prev_char_info

    prev_char = prev_char_info[:char]
    prev_type = prev_char_info[:type]

    case mark
    when HIRAGANA_ITERATION_MARK, VERTICAL_HIRAGANA_ITERATION_MARK
      # Unvoiced mark: repeat as-is, or strip the voicing of a voiced kana
      case prev_type
      when :hiragana then prev_char
      when :hiragana_voiced then HIRAGANA_UNVOICING[prev_char]
      end
    when HIRAGANA_VOICED_ITERATION_MARK, VERTICAL_HIRAGANA_VOICED_ITERATION_MARK
      # Voiced mark: add voicing when a mapping exists, keep a voiced kana
      case prev_type
      when :hiragana then HIRAGANA_VOICING[prev_char]
      when :hiragana_voiced then prev_char
      end
    when KATAKANA_ITERATION_MARK, VERTICAL_KATAKANA_ITERATION_MARK
      case prev_type
      when :katakana then prev_char
      when :katakana_voiced then KATAKANA_UNVOICING[prev_char]
      end
    when KATAKANA_VOICED_ITERATION_MARK, VERTICAL_KATAKANA_VOICED_ITERATION_MARK
      case prev_type
      when :katakana then KATAKANA_VOICING[prev_char]
      when :katakana_voiced then prev_char
      end
    when KANJI_ITERATION_MARK
      prev_char if prev_type == :kanji
    end
  end

  # Classify a character for iteration mark handling.
  #
  # Hatsuon, sokuon and semi-voiced kana are reported as :other, which
  # makes them non-repeatable.
  #
  # @param char [String] the character
  # @param char_code [Integer] the character's code point
  # @return [Symbol] :hiragana, :hiragana_voiced, :katakana,
  #   :katakana_voiced, :kanji or :other
  def get_char_type(char, char_code)
    return :other if hatsuon?(char_code) || sokuon?(char_code) || semi_voiced?(char)

    if hiragana?(char_code)
      voiced?(char) ? :hiragana_voiced : :hiragana
    elsif katakana?(char_code)
      voiced?(char) ? :katakana_voiced : :katakana
    elsif kanji?(char_code)
      :kanji
    else
      :other
    end
  end
end
251
+
252
class << self
  # Build a Japanese iteration marks transliterator.
  #
  # @param options [Hash] Configuration options, handed to Transliterator.new
  # @return [Transliterator] A new Japanese iteration marks transliterator instance
  def call(options = {})
    Transliterator.new(options)
  end
end
259
+ end
260
+ end
261
+ end