yosina 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +36 -0
  3. data/Gemfile +6 -0
  4. data/README.ja.md +229 -0
  5. data/README.md +229 -0
  6. data/Rakefile +30 -0
  7. data/codegen/dataset.rb +215 -0
  8. data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
  9. data/codegen/emitters/combined_transliterator_data.rb +28 -0
  10. data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
  11. data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
  12. data/codegen/emitters/simple_transliterator.rb +76 -0
  13. data/codegen/emitters/utils.rb +45 -0
  14. data/codegen/emitters.rb +8 -0
  15. data/codegen/main.rb +109 -0
  16. data/lib/yosina/char.rb +65 -0
  17. data/lib/yosina/chars.rb +152 -0
  18. data/lib/yosina/recipes.rb +359 -0
  19. data/lib/yosina/transliterator.rb +49 -0
  20. data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
  21. data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
  22. data/lib/yosina/transliterators/combined.rb +52 -0
  23. data/lib/yosina/transliterators/combined_data.rb +495 -0
  24. data/lib/yosina/transliterators/hira_kata.rb +106 -0
  25. data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
  26. data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
  27. data/lib/yosina/transliterators/hyphens.rb +83 -0
  28. data/lib/yosina/transliterators/hyphens_data.rb +60 -0
  29. data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
  30. data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
  31. data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
  32. data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
  33. data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
  34. data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
  35. data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
  36. data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
  37. data/lib/yosina/transliterators/radicals.rb +361 -0
  38. data/lib/yosina/transliterators/spaces.rb +79 -0
  39. data/lib/yosina/transliterators.rb +57 -0
  40. data/lib/yosina/version.rb +5 -0
  41. data/lib/yosina.rb +62 -0
  42. data/yosina.gemspec +41 -0
  43. metadata +159 -0
@@ -0,0 +1,451 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../transliterator'
4
+ require_relative 'hira_kata_table'
5
+
6
+ module Yosina
7
+ module Transliterators
8
+ # JIS X 0201 and alike transliterator for fullwidth/halfwidth conversion
9
+ module Jisx0201AndAlike
10
+ include HiraKataTable
11
+
12
# GL area mapping table (fullwidth to halfwidth).
#
# Each entry is a [fullwidth, halfwidth] pair:
# * U+3000 IDEOGRAPHIC SPACE maps to U+0020 SPACE.
# * Every code point in U+FF01..U+FF5D maps to the ASCII character exactly
#   0xFEE0 below it (U+FF01 -> U+0021 ... U+FF5D -> U+007D).
#
# U+FF3C (fullwidth reverse solidus) is left out of the range, and U+FF5E
# (fullwidth tilde) lies just past its end; both characters appear in
# JISX0201_GL_OVERRIDES instead, where they are controlled by options.
JISX0201_GL_TABLE = begin
  fullwidth_code_points = (0xff01..0xff5d).reject { |cp| cp == 0xff3c }
  ascii_pairs = fullwidth_code_points.map do |cp|
    [cp.chr(Encoding::UTF_8).freeze, (cp - 0xfee0).chr(Encoding::UTF_8).freeze]
  end
  [["\u3000", "\u0020"], *ascii_pairs].freeze
end
108
+
109
+ # Special GL overrides, keyed by option name. Each value is a list of [fullwidth, halfwidth] pairs that add_override_mappings applies only when the option of the same name is truthy.
110
+ JISX0201_GL_OVERRIDES = {
111
+ u005c_as_yen_sign: [["\uffe5", "\u005c"]], # U+FFE5 FULLWIDTH YEN SIGN paired with U+005C REVERSE SOLIDUS
112
+ u005c_as_backslash: [["\uff3c", "\u005c"]], # U+FF3C FULLWIDTH REVERSE SOLIDUS paired with U+005C REVERSE SOLIDUS
113
+ u007e_as_fullwidth_tilde: [["\uff5e", "\u007e"]], # U+FF5E FULLWIDTH TILDE paired with U+007E TILDE
114
+ u007e_as_wave_dash: [["\u301c", "\u007e"]], # U+301C WAVE DASH paired with U+007E TILDE
115
+ u007e_as_overline: [["\u203e", "\u007e"]], # U+203E OVERLINE paired with U+007E TILDE
116
+ u007e_as_fullwidth_macron: [["\uffe3", "\u007e"]], # U+FFE3 FULLWIDTH MACRON paired with U+007E TILDE
117
+ u00a5_as_yen_sign: [["\uffe5", "\u00a5"]] # U+FFE5 FULLWIDTH YEN SIGN paired with U+00A5 YEN SIGN
118
+ }.freeze
119
+
120
# Build the GR (katakana area) table of [fullwidth, halfwidth] pairs.
#
# The result starts with eight fixed punctuation / sound-mark pairs, followed by
# one pair per HIRAGANA_KATAKANA_TABLE row that has a halfwidth form (using the
# first katakana entry of the row), followed by one pair per
# HIRAGANA_KATAKANA_SMALL_TABLE row that has a halfwidth form.
#
# @return [Array<Array<String>>] a new, unfrozen array of pairs
def self.generate_gr_table
  fixed_pairs = [
    ["\u3002", "\uff61"], # ideographic full stop
    ["\u300c", "\uff62"], # left corner bracket
    ["\u300d", "\uff63"], # right corner bracket
    ["\u3001", "\uff64"], # ideographic comma
    ["\u30fb", "\uff65"], # katakana middle dot
    ["\u30fc", "\uff70"], # prolonged sound mark
    ["\u309b", "\uff9e"], # voiced sound mark
    ["\u309c", "\uff9f"]  # semi-voiced sound mark
  ]

  # Rows without a halfwidth form contribute nothing.
  regular_rows = HIRAGANA_KATAKANA_TABLE.select { |_, _, hw| hw }
  small_rows = HIRAGANA_KATAKANA_SMALL_TABLE.select { |_, _, hw| hw }

  fixed_pairs +
    regular_rows.map { |_, kata, hw| [kata[0], hw] } +
    small_rows.map { |_, kata, hw| [kata, hw] }
end
142
+
143
+ # GR area mapping table: [fullwidth, halfwidth] pairs, built once at load time by generate_gr_table
144
+ JISX0201_GR_TABLE = generate_gr_table.freeze
145
+
146
+ # Special punctuations: [fullwidth, halfwidth] pairs that are mapped only when convert_unsafe_specials is enabled
147
+ SPECIAL_PUNCTUATIONS_TABLE = [["\u30a0", "\u003d"]].freeze # U+30A0 KATAKANA-HIRAGANA DOUBLE HYPHEN to U+003D EQUALS SIGN
148
+
149
# Build the table of voiced / semi-voiced katakana.
#
# For every HIRAGANA_KATAKANA_TABLE row that has a halfwidth form, the voiced
# katakana (index 1) is paired with the halfwidth base followed by U+FF9E, and
# the semi-voiced katakana (index 2) with the halfwidth base followed by U+FF9F.
# Forms that are absent (nil) are skipped.
#
# @return [Array<Array<String>>] a new, unfrozen array of [fullwidth, halfwidth] pairs
def self.generate_voiced_letters_table
  sound_marks = { 1 => "\uff9e", 2 => "\uff9f" }

  HIRAGANA_KATAKANA_TABLE.flat_map do |_, kata, hw|
    next [] unless hw

    sound_marks.map { |index, mark| [kata[index], "#{hw}#{mark}"] if kata[index] }.compact
  end
end
160
+
161
+ # Voiced letters table: pairs of [voiced or semi-voiced fullwidth katakana, halfwidth base kana followed by a halfwidth sound mark]
162
+ VOICED_LETTERS_TABLE = generate_voiced_letters_table.freeze
163
+
164
# Build the hiragana-to-halfwidth-katakana table.
#
# Rows of HIRAGANA_KATAKANA_TABLE are used only when they have both a plain
# hiragana (index 0) and a halfwidth form. Each such row yields the plain pair
# and, when present, the voiced pair (halfwidth + U+FF9E) and the semi-voiced
# pair (halfwidth + U+FF9F). Rows of HIRAGANA_KATAKANA_SMALL_TABLE that have a
# halfwidth form are appended afterwards.
#
# @return [Array<Array<String>>] a new, unfrozen array of [hiragana, halfwidth] pairs
def self.generate_hiragana_mappings
  usable_rows = HIRAGANA_KATAKANA_TABLE.select { |hira, _, hw| hira[0] && hw }

  regular_pairs = usable_rows.flat_map do |hira, _, hw|
    row_pairs = [[hira[0], hw]]
    row_pairs << [hira[1], "#{hw}\uff9e"] if hira[1]
    row_pairs << [hira[2], "#{hw}\uff9f"] if hira[2]
    row_pairs
  end

  small_rows = HIRAGANA_KATAKANA_SMALL_TABLE.select { |_, _, hw| hw }
  regular_pairs + small_rows.map { |hira, _, hw| [hira, hw] }
end
181
+
182
+ # Hiragana mappings: [hiragana, halfwidth katakana] pairs, added to the forward mappings only when convert_hiraganas is enabled
183
+ HIRAGANA_MAPPINGS = generate_hiragana_mappings.freeze
184
+
185
+ # Transliterator for JIS X 0201 and alike
186
+ class Transliterator < Yosina::BaseTransliterator
187
+ attr_reader :fullwidth_to_halfwidth, :convert_gl, :convert_gr, :convert_unsafe_specials,
188
+ :convert_hiraganas, :combine_voiced_sound_marks,
189
+ :u005c_as_yen_sign, :u005c_as_backslash,
190
+ :u007e_as_fullwidth_tilde, :u007e_as_wave_dash,
191
+ :u007e_as_overline, :u007e_as_fullwidth_macron,
192
+ :u00a5_as_yen_sign
193
+
194
# Initialize the transliterator with options.
#
# Several defaults depend on the conversion direction. "Forward" means
# fullwidth to halfwidth (:fullwidth_to_halfwidth truthy), "reverse" means
# halfwidth to fullwidth. Each u005c_*/u007e_*/u00a5_* option enables the pair
# of the same name in JISX0201_GL_OVERRIDES.
#
# @param options [Hash] Configuration options (symbol keys)
# @option options [Boolean] :fullwidth_to_halfwidth Conversion direction (default: true)
# @option options [Boolean] :convert_gl Include the GL (ASCII range) mappings (default: true)
# @option options [Boolean] :convert_gr Include the GR (katakana range) mappings (default: true)
# @option options [Boolean] :convert_unsafe_specials Include SPECIAL_PUNCTUATIONS_TABLE
#   (default: true forward, false reverse)
# @option options [Boolean] :convert_hiraganas Also map hiragana; only consulted when
#   building the forward mappings (default: false)
# @option options [Boolean] :combine_voiced_sound_marks Combine a halfwidth kana with a
#   following halfwidth sound mark; only consulted in the reverse direction (default: true)
# @option options [Boolean] :u005c_as_yen_sign Pair U+FFE5 with U+005C (default: true, unless
#   the caller supplied :u00a5_as_yen_sign [forward] or :u005c_as_backslash [reverse])
# @option options [Boolean] :u005c_as_backslash Pair U+FF3C with U+005C (default: false)
# @option options [Boolean] :u007e_as_fullwidth_tilde Pair U+FF5E with U+007E (default: true
#   forward; in reverse true unless any other :u007e_* key was supplied)
# @option options [Boolean] :u007e_as_wave_dash Pair U+301C with U+007E
#   (default: true forward, false reverse)
# @option options [Boolean] :u007e_as_overline Pair U+203E with U+007E (default: false)
# @option options [Boolean] :u007e_as_fullwidth_macron Pair U+FFE3 with U+007E (default: false)
# @option options [Boolean] :u00a5_as_yen_sign Pair U+FFE5 with U+00A5
#   (default: false forward, true reverse)
# @raise [ArgumentError] if validate_options! rejects the option combination
def initialize(options = {})
  super()

  # Direction-independent options.
  @fullwidth_to_halfwidth = options.fetch(:fullwidth_to_halfwidth, true)
  @convert_gl = options.fetch(:convert_gl, true)
  @convert_gr = options.fetch(:convert_gr, true)
  @convert_hiraganas = options.fetch(:convert_hiraganas, false)
  @combine_voiced_sound_marks = options.fetch(:combine_voiced_sound_marks, true)
  @u005c_as_backslash = options.fetch(:u005c_as_backslash, false)
  @u007e_as_overline = options.fetch(:u007e_as_overline, false)
  @u007e_as_fullwidth_macron = options.fetch(:u007e_as_fullwidth_macron, false)

  # Options whose default flips with the direction.
  forward = @fullwidth_to_halfwidth ? true : false
  @convert_unsafe_specials = options.fetch(:convert_unsafe_specials, forward)
  @u007e_as_wave_dash = options.fetch(:u007e_as_wave_dash, forward)
  @u00a5_as_yen_sign = options.fetch(:u00a5_as_yen_sign, !forward)

  # Options whose default depends on which sibling keys the caller supplied
  # (mere presence of the key counts, regardless of its value).
  yen_rival = forward ? :u00a5_as_yen_sign : :u005c_as_backslash
  @u005c_as_yen_sign = options.fetch(:u005c_as_yen_sign) { !options.key?(yen_rival) }

  tilde_rivals = %i[u007e_as_wave_dash u007e_as_overline u007e_as_fullwidth_macron]
  @u007e_as_fullwidth_tilde = options.fetch(:u007e_as_fullwidth_tilde) do
    forward || tilde_rivals.none? { |rival| options.key?(rival) }
  end

  validate_options!
  build_mappings!
end
246
+
247
# Transliterate characters in the direction chosen at construction time.
#
# @param input_chars [Enumerable<Char>] The characters to transliterate
# @return [Enumerable<Char>] The transliterated characters
def call(input_chars)
  return convert_fullwidth_to_halfwidth(input_chars) if @fullwidth_to_halfwidth

  convert_halfwidth_to_fullwidth(input_chars)
end
258
+
259
+ private
260
+
261
# Reject option combinations that would make a mapping ambiguous.
#
# Forward (fullwidth to halfwidth): u005c_as_yen_sign and u00a5_as_yen_sign both
# take U+FFE5 as their fullwidth source, so only that combination is rejected.
#
# Reverse (halfwidth to fullwidth): the enabled overrides are grouped by the
# halfwidth character they convert from; two enabled overrides in the same
# group would compete for one source character, so they are rejected.
#
# @raise [ArgumentError] when mutually exclusive options are enabled together
# @return [void]
def validate_options!
  if @fullwidth_to_halfwidth
    if @u005c_as_yen_sign && @u00a5_as_yen_sign
      raise ArgumentError,
            'u005c_as_yen_sign and u00a5_as_yen_sign are mutually exclusive,' \
            ' and cannot be set to true at the same time.'
    end
    return
  end

  # halfwidth character => names of the enabled options that map from it
  groups = {}
  JISX0201_GL_OVERRIDES.each do |key, pairs|
    next unless instance_variable_get("@#{key}")

    # Each pair is [fullwidth, halfwidth]; the group key is the halfwidth side only.
    pairs.each do |_fw, hw|
      (groups[hw] ||= []) << key
    end
  end

  groups.each_value do |keys|
    next unless keys.size > 1

    *names, last = keys.map(&:to_s)
    raise ArgumentError,
          "#{names.join(', ')} and #{last} are mutually exclusive," \
          ' and cannot be set to true at the same time.'
  end
end
294
+
295
# Build the lookup tables needed for the configured direction.
#
# Forward only needs the forward table. Reverse needs the reverse table and,
# when both combine_voiced_sound_marks and convert_gr are enabled, the
# two-level table used to merge a kana with a following sound mark.
def build_mappings!
  return build_forward_mappings! if @fullwidth_to_halfwidth

  build_reverse_mappings!
  build_voiced_reverse_mappings! if @combine_voiced_sound_marks && @convert_gr
end
303
+
304
# Build @fwd_mappings, the fullwidth => halfwidth lookup table.
#
# Later sources overwrite earlier ones for the same key, so the order below
# (GL table, overrides, special punctuations, GR table, voiced letters,
# combining marks, hiragana) is significant.
def build_forward_mappings!
  table = {}
  @fwd_mappings = table

  if @convert_gl
    table.update(JISX0201_GL_TABLE.to_h)
    add_override_mappings(table, false)
    table.update(SPECIAL_PUNCTUATIONS_TABLE.to_h) if @convert_unsafe_specials
  end

  return unless @convert_gr

  table.update(JISX0201_GR_TABLE.to_h)
  table.update(VOICED_LETTERS_TABLE.to_h)

  # Combining sound marks map to their halfwidth counterparts.
  table["\u3099"] = "\uff9e" # combining dakuten
  table["\u309a"] = "\uff9f" # combining handakuten

  # Hiragana are only mapped together with the GR range, and only on request.
  return unless @convert_hiraganas

  table.update(HIRAGANA_MAPPINGS.to_h)
end
335
+
336
# Build @rev_mappings, the halfwidth => fullwidth lookup table.
#
# Every source table stores [fullwidth, halfwidth] pairs, so each pair is
# inserted with its sides swapped. Later sources overwrite earlier ones for the
# same halfwidth key (GL table, overrides, special punctuations, GR table).
def build_reverse_mappings!
  table = {}
  @rev_mappings = table
  insert_swapped = ->(pairs) { pairs.each { |fw, hw| table[hw] = fw } }

  if @convert_gl
    insert_swapped.call(JISX0201_GL_TABLE)
    add_override_mappings(table, true)
    insert_swapped.call(SPECIAL_PUNCTUATIONS_TABLE) if @convert_unsafe_specials
  end

  return unless @convert_gr

  insert_swapped.call(JISX0201_GR_TABLE)
end
355
+
356
# Build @voiced_rev_mappings, a two-level lookup used in the reverse direction:
# halfwidth base kana => { halfwidth sound mark => combined fullwidth kana }.
#
# The halfwidth side of each VOICED_LETTERS_TABLE pair is a two-character
# string: the base kana followed by the sound mark.
def build_voiced_reverse_mappings!
  @voiced_rev_mappings = VOICED_LETTERS_TABLE.each_with_object({}) do |(fw, hw), by_base|
    base = hw[0]
    mark = hw[1]
    (by_base[base] ||= {})[mark] = fw
  end
end
363
+
364
# Copy the pairs of every enabled override option into +mappings+.
#
# An override is enabled when the instance variable named after its key is
# truthy. Overrides are applied in the order they appear in
# JISX0201_GL_OVERRIDES, so a later one wins when two write the same key.
#
# @param mappings [Hash{String => String}] table to update in place
# @param reverse [Boolean] when truthy store halfwidth => fullwidth,
#   otherwise fullwidth => halfwidth
def add_override_mappings(mappings, reverse)
  JISX0201_GL_OVERRIDES.each do |option, pairs|
    next unless instance_variable_get("@#{option}")

    pairs.each do |fw, hw|
      source, target = reverse ? [hw, fw] : [fw, hw]
      mappings[source] = target
    end
  end
end
377
+
378
# Lazily convert fullwidth characters to their halfwidth counterparts.
#
# A mapped character is replaced by one output Char per character of its
# replacement string (a voiced kana becomes a base kana plus a sound mark), each
# pointing back at the input Char through +source+. Unmapped characters are
# passed through with a recomputed offset. Offsets advance by String#length of
# each emitted character.
#
# @param input_chars [Enumerable<Char>] The characters to convert
# @return [Enumerable<Char>] whatever Chars.enum returns for the generator block
def convert_fullwidth_to_halfwidth(input_chars)
  Chars.enum do |y|
    # The running offset lives inside the generator block so that every run of
    # the block starts from zero. Kept outside, a second enumeration of the
    # result would continue counting from where the first one stopped.
    # NOTE(review): assumes Chars.enum may run the block once per enumeration
    # (Enumerator-like) - confirm against Chars.
    offset = 0
    input_chars.each do |char|
      replacement = @fwd_mappings[char.c]
      if replacement
        replacement.each_char do |c|
          y << Char.new(c: c, offset: offset, source: char)
          offset += c.length
        end
      else
        y << char.with_offset(offset)
        offset += char.c.length
      end
    end
  end
end
394
+
395
# Lazily convert halfwidth characters to their fullwidth counterparts.
#
# When @voiced_rev_mappings is set, a halfwidth kana that can take a sound mark
# is held back until the next character is seen: if that character is a
# matching sound mark, both are replaced by a single combined fullwidth Char
# whose +source+ is the base kana; otherwise the held kana is converted on its
# own and the following character is processed normally. Sentinel characters
# are passed through with the current offset and never start a combination.
#
# @param input_chars [Enumerable<Char>] The characters to convert
# @return [Enumerable<Char>] whatever Chars.enum returns for the generator block
def convert_halfwidth_to_fullwidth(input_chars)
  Chars.enum do |y|
    # All running state lives inside the generator block so that every run of
    # the block starts fresh.
    # NOTE(review): assumes Chars.enum may run the block once per enumeration
    # (Enumerator-like) - confirm against Chars.
    offset = 0
    held = nil       # base kana waiting for a possible sound mark
    held_marks = nil # sound mark => combined character, for the held kana

    # Emit one character on its own, mapped through @rev_mappings when possible.
    emit_single = lambda do |char|
      if char.sentinel?
        y << char.with_offset(offset)
      elsif (mapped = @rev_mappings[char.c])
        y << Char.new(c: mapped, offset: offset, source: char)
        offset += mapped.length
      else
        y << char.with_offset(offset)
        offset += char.c.length
      end
    end

    input_chars.each do |char|
      if held
        base = held
        marks = held_marks
        held = held_marks = nil

        if (combined = marks[char.c])
          y << Char.new(c: combined, offset: offset, source: base)
          offset += combined.length
          next
        end

        # No sound mark followed: convert the base alone, then fall through so
        # the current character gets its own chance to start a combination.
        emit_single.call(base)
      end

      marks = @voiced_rev_mappings && !char.sentinel? && @voiced_rev_mappings[char.c]
      if marks
        held = char
        held_marks = marks
      else
        emit_single.call(char)
      end
    end

    # Input ended right after a combinable kana (no trailing sentinel): it must
    # still be emitted instead of being dropped.
    emit_single.call(held) if held
  end
end
440
+ end
441
+
442
+ # Factory method: builds a Transliterator for JIS X 0201 and alike conversion
443
+ #
444
+ # @param options [Hash] Configuration options, passed unchanged to Transliterator.new
445
+ # @return [Transliterator] A new JIS X 0201 and alike transliterator instance
446
+ def self.call(options = {})
447
+ Transliterator.new(options)
448
+ end
449
+ end
450
+ end
451
+ end