yosina 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +36 -0
  3. data/Gemfile +6 -0
  4. data/README.ja.md +229 -0
  5. data/README.md +229 -0
  6. data/Rakefile +30 -0
  7. data/codegen/dataset.rb +215 -0
  8. data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
  9. data/codegen/emitters/combined_transliterator_data.rb +28 -0
  10. data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
  11. data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
  12. data/codegen/emitters/simple_transliterator.rb +76 -0
  13. data/codegen/emitters/utils.rb +45 -0
  14. data/codegen/emitters.rb +8 -0
  15. data/codegen/main.rb +109 -0
  16. data/lib/yosina/char.rb +65 -0
  17. data/lib/yosina/chars.rb +152 -0
  18. data/lib/yosina/recipes.rb +359 -0
  19. data/lib/yosina/transliterator.rb +49 -0
  20. data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
  21. data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
  22. data/lib/yosina/transliterators/combined.rb +52 -0
  23. data/lib/yosina/transliterators/combined_data.rb +495 -0
  24. data/lib/yosina/transliterators/hira_kata.rb +106 -0
  25. data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
  26. data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
  27. data/lib/yosina/transliterators/hyphens.rb +83 -0
  28. data/lib/yosina/transliterators/hyphens_data.rb +60 -0
  29. data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
  30. data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
  31. data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
  32. data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
  33. data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
  34. data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
  35. data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
  36. data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
  37. data/lib/yosina/transliterators/radicals.rb +361 -0
  38. data/lib/yosina/transliterators/spaces.rb +79 -0
  39. data/lib/yosina/transliterators.rb +57 -0
  40. data/lib/yosina/version.rb +5 -0
  41. data/lib/yosina.rb +62 -0
  42. data/yosina.gemspec +41 -0
  43. metadata +159 -0
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
module Yosina
  # A single unit of text handled by the transliteration pipeline.
  #
  # Each instance holds its text (+c+), an +offset+, and an optional +source+
  # pointing at the Char it was derived from, so derived characters form a
  # chain leading back to the character they originated from.
  class Char
    attr_accessor :c, :offset, :source

    # Build a character.
    #
    # @param c [String] The text of this character (empty for the sentinel)
    # @param offset [Integer] The offset position in the original text
    # @param source [Char, nil] The character this one was derived from, if any
    # rubocop:disable Naming/MethodParameterName
    def initialize(c:, offset:, source: nil)
      @source = source
      @offset = offset
      @c = c
    end
    # rubocop:enable Naming/MethodParameterName

    # Tell whether this is the sentinel, i.e. a character with empty text.
    #
    # @return [Boolean] true when the text is empty
    def sentinel?
      @c.empty?
    end

    # Derive a copy of this character placed at another offset.
    #
    # The copy keeps the same text and records the receiver as its source.
    #
    # @param offset [Integer] The offset for the derived character
    # @return [Char] The derived character
    def with_offset(offset)
      Char.new(source: self, offset: offset, c: @c)
    end

    # Tell whether the text changed anywhere along the source chain.
    #
    # Walks from this character back through +source+ links and reports true
    # as soon as two adjacent links carry different text.
    #
    # @return [Boolean] true if any link differs from its source, false otherwise
    def transliterated?
      current = self
      until (origin = current.source).nil?
        return true unless current.c == origin.c

        current = origin
      end
      false
    end

    # Structural equality: same text, same offset and equal source chains.
    #
    # @param other [Object] The object to compare with
    # @return [Boolean] false for anything that is not a Char
    def ==(other)
      other.is_a?(Char) && c == other.c && offset == other.offset && source == other.source
    end

    # @return [String] The text of this character
    def to_s
      c
    end

    # @return [String] A debugging representation including the source chain
    def inspect
      format('#<Yosina::Char c=%<c>s offset=%<offset>s source=%<source>s>',
             c: c.inspect, offset: offset, source: source&.inspect)
    end
  end
end
@@ -0,0 +1,152 @@
1
+ # frozen_string_literal: true
2
+
3
module Yosina
  # Character array building and string conversion utilities.
  #
  # Besides the module-level helpers, this module is mixed into the singleton
  # class of the collections it produces. On such a collection +to_s+ returns
  # the joined text, and the Enumerable operations wrapped below hand back
  # results that are themselves extended with Chars.
  module Chars
    # Build a character array from a string, handling IVS/SVS sequences
    #
    # A variation selector (U+FE00-U+FE0F or U+E0100-U+E01EF) that directly
    # follows another character is merged with it into a single Char. Offsets
    # are counted in characters (String#length) of the input.
    #
    # @param input_str [String] The input string to convert to character array
    # @return [Chars] A list of Char objects representing the input string,
    #   with a sentinel empty character at the end
    def self.build_char_array(input_str)
      result = []
      offset = 0
      prev_char = nil

      input_str.each_char do |char|
        codepoint = char.ord

        if prev_char
          if (0xFE00..0xFE0F).cover?(codepoint) || (0xE0100..0xE01EF).cover?(codepoint)
            # Selector right after a pending character: emit both as one Char
            combined_char = prev_char + char
            result << Char.new(c: combined_char, offset: offset)
            offset += combined_char.length
            prev_char = nil
            next
          end

          # The pending character was not followed by a variation selector
          result << Char.new(c: prev_char, offset: offset)
          offset += prev_char.length
        end

        # Hold the current character until we know what follows it
        prev_char = char
      end

      # Flush the last pending character, if any
      if prev_char
        result << Char.new(c: prev_char, offset: offset)
        offset += prev_char.length
      end

      # Sentinel empty character marking the end of the input
      result << Char.new(c: '', offset: offset)

      result.extend(Chars)
    end

    # Convert an array of characters back to a string
    #
    # Sentinel characters (those with empty text) are skipped.
    #
    # @param chars [Enumerable<Char>] An array of Char objects
    # @return [String] A string composed of the non-empty characters
    def self.as_s(chars)
      chars.reject { |char| char.c.empty? }.map(&:c).join
    end

    # Create an enumerator that yields characters from the input
    #
    # The block is run again on every iteration of the returned enumerator.
    #
    # @param block [Proc] A block receiving the Enumerator::Yielder to push Char objects to
    # @return [Enumerator] An enumerator, extended with Chars, that yields Char objects
    def self.enum(&block)
      Enumerator.new { |y| block.call(y) }.extend(Chars)
    end

    # @return [String] The text of all non-sentinel characters in this collection
    def to_s
      Chars.as_s(self)
    end

    # Operations returning a collection of slices: extend every slice with Chars.
    %i[
      chunk_while
      partition
      slice_before
      slice_when
    ].each do |chunker|
      define_method(chunker) do |*args, &block|
        # `super` must be given its arguments explicitly inside define_method;
        # the implicit form raises RuntimeError there.
        super(*args, &block).map { |slice| slice.extend(Chars) }
      end
    end

    # Operations returning a single collection: extend the result with Chars.
    %i[
      chain
      find_all
      drop
      drop_while
      entries
      filter
      grep
      grep_v
      reject
      select
      sort
      sort_by
      take
      take_while
      to_a
    ].each do |method|
      define_method(method) do |*args, &block|
        super(*args, &block).extend(Chars)
      end
    end

    # Like Enumerable#chunk, but materialised into an array of
    # [key, slice] pairs whose slices are extended with Chars.
    #
    # @return [Array<Array>] Pairs of chunk key and Chars-extended slice
    def chunk(&block)
      super(&block).map { |g, slice| [g, slice.extend(Chars)] }
    end

    # Like Enumerable#group_by, with every group extended with Chars.
    #
    # @return [Hash] Groups keyed by the block's result
    def group_by(&block)
      super(&block).transform_values { |slice| slice.extend(Chars) }
    end
  end
end
@@ -0,0 +1,359 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Main module for Yosina text transliteration library
4
+ module Yosina
5
# Internal builder that assembles an ordered list of transliterator configs.
#
# Configs are [name, options] pairs kept in two lists, +head+ and +tail+; the
# built result is +head+ followed by +tail+. Within one list a name occurs at
# most once: inserting a config whose name is already present leaves the
# existing entry in place unless +force_replace+ is given.
class TransliteratorConfigListBuilder
  attr_reader :head, :tail

  # @param head [Array<Array>] Initial head configs (copied)
  # @param tail [Array<Array>] Initial tail configs (copied)
  def initialize(head: [], tail: [])
    @head = head.dup
    @tail = tail.dup
  end

  # Put a config at the front of the head list.
  #
  # @param config [Array] A [name, options] pair
  # @param force_replace [Boolean] Overwrite an existing entry of the same name
  # @return [self]
  def insert_head(config, force_replace: false)
    place(@head, config, force_replace) { |list| list.unshift(config) }
  end

  # Put a config at the front of the tail list (the "middle" of the chain).
  #
  # @param config [Array] A [name, options] pair
  # @param force_replace [Boolean] Overwrite an existing entry of the same name
  # @return [self]
  def insert_middle(config, force_replace: false)
    place(@tail, config, force_replace) { |list| list.unshift(config) }
  end

  # Put a config at the end of the tail list.
  #
  # @param config [Array] A [name, options] pair
  # @param force_replace [Boolean] Overwrite an existing entry of the same name
  # @return [self]
  def insert_tail(config, force_replace: false)
    place(@tail, config, force_replace) { |list| list.push(config) }
  end

  # @return [Array<Array>] A new array holding the head configs followed by the tail configs
  def build
    @head + @tail
  end

  private

  # Shared insertion logic: replace in place when the name is already listed
  # (only if forced), otherwise let the caller's block add the config.
  def place(list, config, force_replace)
    position = list.index { |existing| existing[0] == config[0] }
    if position.nil?
      yield list
    elsif force_replace
      list[position] = config
    end
    self
  end
end
52
+
53
# Declarative description of a transliteration chain.
#
# Each attribute switches one transformation on or off (some also accept a
# mode string); #build_transliterator_configs turns the settings into an
# ordered list of [name, options] transliterator configurations.
class TransliterationRecipe
  attr_accessor :kanji_old_new, :hira_kata, :replace_japanese_iteration_marks,
                :replace_suspicious_hyphens_to_prolonged_sound_marks,
                :replace_combined_characters, :replace_circled_or_squared_characters,
                :replace_ideographic_annotations, :replace_radicals, :replace_spaces,
                :replace_hyphens, :replace_mathematical_alphanumerics,
                :combine_decomposed_hiraganas_and_katakanas, :to_fullwidth, :to_halfwidth,
                :remove_ivs_svs, :charset

  # Private step methods, in the order they are applied to the builder.
  STEPS = %i[
    apply_kanji_old_new
    apply_replace_suspicious_hyphens_to_prolonged_sound_marks
    apply_replace_circled_or_squared_characters
    apply_replace_combined_characters
    apply_replace_ideographic_annotations
    apply_replace_radicals
    apply_replace_spaces
    apply_replace_hyphens
    apply_replace_mathematical_alphanumerics
    apply_combine_decomposed_hiraganas_and_katakanas
    apply_to_fullwidth
    apply_hira_kata
    apply_replace_japanese_iteration_marks
    apply_to_halfwidth
    apply_remove_ivs_svs
  ].freeze
  private_constant :STEPS

  # Create a recipe; every transformation is off unless enabled here.
  #
  # @param kanji_old_new [Boolean] Replace old-style kanji glyphs with modern
  #   equivalents, e.g. "舊字體の變換" becomes "旧字体の変換"
  # @param hira_kata [String, nil] Hiragana/katakana conversion mode, passed
  #   through as the :mode option, e.g. 'hira-to-kata' ("ひらがな" to "ヒラガナ")
  #   or 'kata-to-hira' ("カタカナ" to "かたかな")
  # @param replace_japanese_iteration_marks [Boolean] Replace Japanese
  #   iteration marks with the characters they stand for, e.g. "時々" becomes
  #   "時時" and "いすゞ" becomes "いすず"
  # @param replace_suspicious_hyphens_to_prolonged_sound_marks [Boolean]
  #   Replace suspicious hyphens with prolonged sound marks, e.g. a
  #   hyphen-minus ending "スーパ-" becomes the prolonged sound mark of "スーパー"
  # @param replace_combined_characters [Boolean] Replace combined characters
  #   with their corresponding characters, e.g. "㍻" becomes "平成"
  # @param replace_circled_or_squared_characters [Boolean, String] Replace
  #   circled or squared characters with templates, e.g. "①②③" becomes
  #   "(1)(2)(3)"; the string 'exclude-emojis' turns emoji handling off
  # @param replace_ideographic_annotations [Boolean] Replace ideographic
  #   annotations, e.g. "㆖㆘" becomes "上下"
  # @param replace_radicals [Boolean] Replace Kangxi radicals with CJK
  #   ideographs, e.g. "⾔⾨⾷" becomes "言門食"
  # @param replace_spaces [Boolean] Replace various space characters (such as
  #   ideographic space U+3000 or non-breaking space U+00A0) with a regular space
  # @param replace_hyphens [Boolean, Array] Replace various dash/hyphen
  #   symbols, e.g. an em dash or en dash becomes a hyphen-minus; an Array is
  #   used as the :precedence option, anything else truthy selects
  #   [:jisx0208_90_windows, :jisx0201]
  # @param replace_mathematical_alphanumerics [Boolean] Replace mathematical
  #   alphanumerics, e.g. "𝐀𝐁𝐂" becomes "ABC"
  # @param combine_decomposed_hiraganas_and_katakanas [Boolean] Combine a kana
  #   followed by a separate (semi-)voiced sound mark into one character
  # @param to_fullwidth [Boolean, String] Replace half-width characters with
  #   full-width ones; the string 'u005c-as-yen-sign' sets :u005c_as_yen_sign
  # @param to_halfwidth [Boolean, String] Replace full-width characters with
  #   half-width ones; the string 'hankaku-kana' sets :convert_gr
  # @param remove_ivs_svs [Boolean, String] Remove IVS/SVS selectors, e.g. "葛"
  #   followed by U+E0100 becomes plain "葛"; the string 'drop-all-selectors'
  #   sets :drop_selectors_altogether
  # @param charset [String] Charset for IVS/SVS transliteration
  # rubocop:disable Metrics/ParameterLists
  def initialize(kanji_old_new: false, hira_kata: nil, replace_japanese_iteration_marks: false,
                 replace_suspicious_hyphens_to_prolonged_sound_marks: false,
                 replace_combined_characters: false, replace_circled_or_squared_characters: false,
                 replace_ideographic_annotations: false, replace_radicals: false,
                 replace_spaces: false, replace_hyphens: false,
                 replace_mathematical_alphanumerics: false,
                 combine_decomposed_hiraganas_and_katakanas: false,
                 to_fullwidth: false, to_halfwidth: false, remove_ivs_svs: false,
                 charset: 'unijis_2004')
    @kanji_old_new = kanji_old_new
    @hira_kata = hira_kata
    @replace_japanese_iteration_marks = replace_japanese_iteration_marks
    @replace_suspicious_hyphens_to_prolonged_sound_marks = replace_suspicious_hyphens_to_prolonged_sound_marks
    @replace_combined_characters = replace_combined_characters
    @replace_circled_or_squared_characters = replace_circled_or_squared_characters
    @replace_ideographic_annotations = replace_ideographic_annotations
    @replace_radicals = replace_radicals
    @replace_spaces = replace_spaces
    @replace_hyphens = replace_hyphens
    @replace_mathematical_alphanumerics = replace_mathematical_alphanumerics
    @combine_decomposed_hiraganas_and_katakanas = combine_decomposed_hiraganas_and_katakanas
    @to_fullwidth = to_fullwidth
    @to_halfwidth = to_halfwidth
    @remove_ivs_svs = remove_ivs_svs
    @charset = charset
  end
  # rubocop:enable Metrics/ParameterLists

  # Translate this recipe into transliterator configurations.
  #
  # @return [Array<Array>] Array of [name, options] transliterator configurations
  # @raise [ArgumentError] If both to_fullwidth and to_halfwidth are enabled
  def build_transliterator_configs
    raise ArgumentError, 'to_fullwidth and to_halfwidth are mutually exclusive' if to_fullwidth && to_halfwidth

    builder = STEPS.reduce(TransliteratorConfigListBuilder.new) { |ctx, step| send(step, ctx) }
    builder.build
  end

  private

  # Register the IVS/SVS pair: 'ivs-or-svs' mode at the head of the chain and
  # 'base' mode at its tail, overwriting earlier entries of the same name.
  def remove_ivs_svs_helper(ctx, drop_all_selectors)
    ctx
      .insert_head([:ivs_svs_base, { mode: 'ivs-or-svs', charset: @charset }], force_replace: true)
      .insert_tail(
        [:ivs_svs_base, { mode: 'base', drop_selectors_altogether: drop_all_selectors, charset: @charset }],
        force_replace: true
      )
  end

  def apply_kanji_old_new(ctx)
    return ctx unless @kanji_old_new

    # Kanji replacement is bracketed by the IVS/SVS pair, keeping selectors
    remove_ivs_svs_helper(ctx, false).insert_middle([:kanji_old_new, {}])
  end

  def apply_hira_kata(ctx)
    return ctx unless @hira_kata

    ctx.insert_middle([:hira_kata, { mode: @hira_kata }])
  end

  def apply_replace_japanese_iteration_marks(ctx)
    return ctx unless @replace_japanese_iteration_marks

    # Composition goes to the head so iteration marks see composed forms
    ctx
      .insert_head([:hira_kata_composition, { compose_non_combining_marks: true }])
      .insert_middle([:japanese_iteration_marks, {}])
  end

  def apply_replace_suspicious_hyphens_to_prolonged_sound_marks(ctx)
    return ctx unless @replace_suspicious_hyphens_to_prolonged_sound_marks

    ctx.insert_middle([:prolonged_sound_marks, { replace_prolonged_marks_following_alnums: true }])
  end

  def apply_replace_combined_characters(ctx)
    return ctx unless @replace_combined_characters

    ctx.insert_middle([:combined, {}])
  end

  def apply_replace_circled_or_squared_characters(ctx)
    return ctx unless @replace_circled_or_squared_characters

    emojis_wanted = @replace_circled_or_squared_characters != 'exclude-emojis'
    ctx.insert_middle([:circled_or_squared, { include_emojis: emojis_wanted }])
  end

  def apply_replace_ideographic_annotations(ctx)
    return ctx unless @replace_ideographic_annotations

    ctx.insert_middle([:ideographic_annotations, {}])
  end

  def apply_replace_radicals(ctx)
    return ctx unless @replace_radicals

    ctx.insert_middle([:radicals, {}])
  end

  def apply_replace_spaces(ctx)
    return ctx unless @replace_spaces

    ctx.insert_middle([:spaces, {}])
  end

  def apply_replace_hyphens(ctx)
    return ctx unless @replace_hyphens

    order = @replace_hyphens.is_a?(Array) ? @replace_hyphens : %i[jisx0208_90_windows jisx0201]
    ctx.insert_middle([:hyphens, { precedence: order }])
  end

  def apply_replace_mathematical_alphanumerics(ctx)
    return ctx unless @replace_mathematical_alphanumerics

    ctx.insert_middle([:mathematical_alphanumerics, {}])
  end

  def apply_combine_decomposed_hiraganas_and_katakanas(ctx)
    return ctx unless @combine_decomposed_hiraganas_and_katakanas

    ctx.insert_head([:hira_kata_composition, { compose_non_combining_marks: true }])
  end

  def apply_to_fullwidth(ctx)
    return ctx unless @to_fullwidth

    yen_sign = @to_fullwidth == 'u005c-as-yen-sign'
    ctx.insert_tail([:jisx0201_and_alike, { fullwidth_to_halfwidth: false, u005c_as_yen_sign: yen_sign }])
  end

  def apply_to_halfwidth(ctx)
    return ctx unless @to_halfwidth

    kana_too = @to_halfwidth == 'hankaku-kana'
    ctx.insert_tail([:jisx0201_and_alike,
                     { fullwidth_to_halfwidth: true, convert_gl: true, convert_gr: kana_too }])
  end

  def apply_remove_ivs_svs(ctx)
    return ctx unless @remove_ivs_svs

    remove_ivs_svs_helper(ctx, @remove_ivs_svs == 'drop-all-selectors')
  end
end
351
+
352
# Produce the transliterator configuration list described by a recipe.
#
# This is a module-level convenience that simply asks the recipe to build
# its own configuration list.
#
# @param recipe [TransliterationRecipe] The recipe to convert
# @return [Array<Array>] Whatever recipe.build_transliterator_configs returns,
#   i.e. an array of transliterator configurations
def self.build_transliterator_configs_from_recipe(recipe)
  recipe.build_transliterator_configs
end
359
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module Yosina
  # Names a transliterator and carries the options to create it with.
  class TransliteratorConfig
    attr_accessor :name, :options

    # @param name [String, Symbol] The name of the transliterator
    # @param options [Hash, nil] Configuration options for the transliterator
    def initialize(name, options = nil)
      @options = options
      @name = name
    end
  end

  # Common ancestor of every transliterator; defines the #call contract.
  class BaseTransliterator
    # Transliterate a sequence of characters. Concrete transliterators
    # override this; the base implementation always raises.
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The transliterated characters
    # @raise [NotImplementedError] Always, unless overridden
    def call(input_chars)
      raise NotImplementedError, 'Subclasses must implement call method'
    end
  end

  # Runs several transliterators one after another as a single transliterator.
  class ChainedTransliterator < BaseTransliterator
    # @param transliterators [Array<BaseTransliterator>] The stages, in application order
    def initialize(transliterators)
      super()
      @transliterators = transliterators
    end

    # Feed the input through every stage, each stage receiving the output of
    # the previous one. With no stages the input is returned as is.
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The output of the last stage
    def call(input_chars)
      current = input_chars
      @transliterators.each { |stage| current = stage.call(current) }
      current
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'circled_or_squared_data'
4
+
5
module Yosina
  module Transliterators
    # Replace circled or squared characters with their corresponding templates
    module CircledOrSquared
      # Transliterator for circled or squared characters
      class Transliterator < Yosina::BaseTransliterator
        # Initialize the transliterator with options
        #
        # Each template is a string in which the first `?` is replaced by the
        # rendering of the matched character.
        #
        # @param options [Hash] Configuration options
        # @option options [Boolean] :include_emojis Whether mappings flagged as
        #   emoji are replaced too (default: false)
        # @option options [Hash] :templates Custom templates, looked up under the
        #   string keys 'circle' (default '(?)') and 'square' (default '[?]')
        def initialize(options = {})
          super()
          @include_emojis = options[:include_emojis] || false
          templates = options[:templates] || {}
          @templates = {
            'c' => templates['circle'] || '(?)',
            's' => templates['square'] || '[?]'
          }
        end

        # Replace circled or squared characters with their corresponding templates
        #
        # A matched character is expanded into one Char per character of the
        # filled-in template, each pointing back at the matched character as
        # its source. Every other character is re-emitted with a recomputed
        # offset.
        #
        # @param input_chars [Enumerable<Char>] The characters to transliterate
        # @return [Enumerable<Char>] The transliterated characters
        # @raise [IndexError] While iterating, if the template selected for a
        #   matched character contains no `?`
        def call(input_chars)
          Chars.enum do |y|
            # Start from zero on every run: an Enumerator re-executes this block
            # each time it is iterated, so the counter must not outlive one pass.
            offset = 0

            input_chars.each do |char|
              mapping = CIRCLED_OR_SQUARED_MAPPINGS[char.c]
              if mapping && (!mapping[:emoji] || @include_emojis)
                # Always work on a copy. Unary plus returns the receiver itself
                # for an unfrozen string, which would fill in (and thereby
                # destroy) a caller-supplied template on first use.
                replacement = @templates[mapping[:type]].dup
                replacement['?'] = mapping[:rendering]

                replacement.each_char do |replacement_char|
                  y << Char.new(c: replacement_char, offset: offset, source: char)
                  offset += replacement_char.length
                end
              else
                y << Char.new(c: char.c, offset: offset, source: char.source)
                offset += char.c.length
              end
            end
          end
        end
      end

      # Factory method to create a circled-or-squared transliterator
      #
      # @param options [Hash] Configuration options (see Transliterator#initialize)
      # @return [Transliterator] A new circled-or-squared transliterator instance
      def self.call(options = {})
        Transliterator.new(options)
      end
    end
  end
end