yosina 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +36 -0
  3. data/Gemfile +6 -0
  4. data/README.ja.md +229 -0
  5. data/README.md +229 -0
  6. data/Rakefile +30 -0
  7. data/codegen/dataset.rb +215 -0
  8. data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
  9. data/codegen/emitters/combined_transliterator_data.rb +28 -0
  10. data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
  11. data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
  12. data/codegen/emitters/simple_transliterator.rb +76 -0
  13. data/codegen/emitters/utils.rb +45 -0
  14. data/codegen/emitters.rb +8 -0
  15. data/codegen/main.rb +109 -0
  16. data/lib/yosina/char.rb +65 -0
  17. data/lib/yosina/chars.rb +152 -0
  18. data/lib/yosina/recipes.rb +359 -0
  19. data/lib/yosina/transliterator.rb +49 -0
  20. data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
  21. data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
  22. data/lib/yosina/transliterators/combined.rb +52 -0
  23. data/lib/yosina/transliterators/combined_data.rb +495 -0
  24. data/lib/yosina/transliterators/hira_kata.rb +106 -0
  25. data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
  26. data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
  27. data/lib/yosina/transliterators/hyphens.rb +83 -0
  28. data/lib/yosina/transliterators/hyphens_data.rb +60 -0
  29. data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
  30. data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
  31. data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
  32. data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
  33. data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
  34. data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
  35. data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
  36. data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
  37. data/lib/yosina/transliterators/radicals.rb +361 -0
  38. data/lib/yosina/transliterators/spaces.rb +79 -0
  39. data/lib/yosina/transliterators.rb +57 -0
  40. data/lib/yosina/version.rb +5 -0
  41. data/lib/yosina.rb +62 -0
  42. data/yosina.gemspec +41 -0
  43. metadata +159 -0
@@ -0,0 +1,206 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Handle prolonged sound marks transliterator
6
+ module ProlongedSoundMarks
7
# Mix-in with code point classification predicates used when deciding how a
# hyphen-like character should be rewritten. Every predicate takes an Integer
# code point (e.g. the result of String#ord) and returns a Boolean.
module CharType
  # Hyphen-like characters that can be converted to prolonged sound marks
  HYPHEN_LIKE_CHARS = [
    0x002d, # HYPHEN-MINUS
    0x2010, # HYPHEN
    0x2014, # EM DASH
    0x2015, # HORIZONTAL BAR
    0x2212, # MINUS SIGN
    0xff0d, # FULLWIDTH HYPHEN-MINUS
    0xff70, # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK (already converted)
    0x30fc  # KATAKANA-HIRAGANA PROLONGED SOUND MARK (already converted)
  ].freeze

  # Check if character is halfwidth: an ASCII letter/digit or a code point in
  # U+FF66..U+FF9F.
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def halfwidth?(char_code)
    halfwidth_alphanumeric?(char_code) || char_code.between?(0xff66, 0xff9f)
  end

  # Check if character is hiragana: U+3041..U+309C or U+309F, excluding the
  # sokuon U+3063 and hatsuon U+3093 (those are reported by sokuon?/hatsuon?).
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def hiragana?(char_code)
    (char_code.between?(0x3041, 0x309c) && char_code != 0x3063 && char_code != 0x3093) ||
      char_code == 0x309f
  end

  # Check if character is katakana, fullwidth or halfwidth, excluding the
  # sokuon (U+30C3, U+FF6F) and hatsuon (U+30F3, U+FF9D).
  #
  # The halfwidth range starts at U+FF66 so that it agrees with halfwidth? and
  # so that the U+FF6F exclusion actually falls inside the tested range; a
  # range starting at U+FF70 would leave U+FF66..U+FF6E unclassified.
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def katakana?(char_code)
    (char_code.between?(0x30a1, 0x30fa) && char_code != 0x30c3 && char_code != 0x30f3) ||
      char_code.between?(0x30fd, 0x30ff) ||
      (char_code.between?(0xff66, 0xff9f) && char_code != 0xff6f && char_code != 0xff9d)
  end

  # Check if character is an ASCII digit or ASCII letter.
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def halfwidth_alphanumeric?(char_code)
    char_code.between?(0x30, 0x39) ||
      char_code.between?(0x41, 0x5a) ||
      char_code.between?(0x61, 0x7a)
  end

  # Check if character is a fullwidth digit (U+FF10..U+FF19) or fullwidth
  # Latin letter (U+FF21..U+FF3A, U+FF41..U+FF5A).
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def fullwidth_alphanumeric?(char_code)
    char_code.between?(0xff10, 0xff19) ||
      char_code.between?(0xff21, 0xff3a) ||
      char_code.between?(0xff41, 0xff5a)
  end

  # Check if character is fullwidth Japanese
  #
  # NOTE(review): katakana? also accepts halfwidth katakana, so this predicate
  # returns true for those code points as well - confirm whether that is intended.
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def fullwidth?(char_code)
    char_code == 0x30fc || hiragana?(char_code) || katakana?(char_code) || fullwidth_alphanumeric?(char_code)
  end

  # Check if character is alphanumeric (halfwidth or fullwidth)
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def alphanumeric?(char_code)
    halfwidth_alphanumeric?(char_code) || fullwidth_alphanumeric?(char_code)
  end

  # Check if character is a hatsuon (U+3093, U+30F3 or U+FF9D).
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def hatsuon?(char_code)
    [0x3093, 0x30f3, 0xff9d].include?(char_code)
  end

  # Check if character is a sokuon (U+3063, U+30C3 or U+FF6F).
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def sokuon?(char_code)
    [0x3063, 0x30c3, 0xff6f].include?(char_code)
  end

  # Check if character is already a prolonged sound mark (U+30FC or U+FF70).
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def prolonged_sound_mark?(char_code)
    [0x30fc, 0xff70].include?(char_code)
  end

  # Check if character may be followed by a prolonged sound mark: an existing
  # prolonged sound mark, or any hiragana/katakana accepted by the predicates above.
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def prolongable?(char_code)
    prolonged_sound_mark?(char_code) || hiragana?(char_code) || katakana?(char_code)
  end

  # Check if character is listed in HYPHEN_LIKE_CHARS.
  #
  # @param char_code [Integer] The code point to classify
  # @return [Boolean]
  def hyphen_like?(char_code)
    HYPHEN_LIKE_CHARS.include?(char_code)
  end
end
82
+
83
# Transliterator for prolonged sound marks.
#
# Rewrites hyphen-like characters (see CharType::HYPHEN_LIKE_CHARS) depending
# on the character that precedes them: after kana they become a prolonged
# sound mark, and optionally, after alphanumerics, they become a plain hyphen.
class Transliterator < Yosina::BaseTransliterator
  include CharType

  attr_reader :skip_already_transliterated_chars, :allow_prolonged_hatsuon,
              :allow_prolonged_sokuon, :replace_prolonged_marks_following_alnums

  # Initialize the transliterator with options
  #
  # @param options [Hash] Configuration options
  # @option options [Boolean] :skip_already_transliterated_chars Skip chars that were already processed.
  #   Default: false.
  # @option options [Boolean] :allow_prolonged_hatsuon Allow prolonging a hatsuon
  #   (U+3093, U+30F3, U+FF9D). Default: false.
  # @option options [Boolean] :allow_prolonged_sokuon Allow prolonging a sokuon
  #   (U+3063, U+30C3, U+FF6F). Default: false.
  # @option options [Boolean] :replace_prolonged_marks_following_alnums Replace prolonged marks after
  #   alphanumerics with hyphens. Default: false.
  def initialize(options = {})
    super()
    @skip_already_transliterated_chars = options.fetch(:skip_already_transliterated_chars, false)
    @allow_prolonged_hatsuon = options.fetch(:allow_prolonged_hatsuon, false)
    @allow_prolonged_sokuon = options.fetch(:allow_prolonged_sokuon, false)
    @replace_prolonged_marks_following_alnums = options.fetch(:replace_prolonged_marks_following_alnums, false)
  end

  # Convert hyphen-like characters to appropriate prolonged sound marks
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    Chars.enum do |y|
      # Per-run state is created inside the block so that every execution of
      # the block starts from a clean slate (relevant if Chars.enum runs the
      # block more than once; its implementation is not visible here).
      offset = 0
      processed_char_in_lookahead = false
      lookahead_buf = []
      last_non_prolonged_char = nil

      result = input_chars.each do |char|
        unless lookahead_buf.empty?
          # Still inside a run of hyphen-like chars that follows an alphanumeric
          if hyphen_like_char?(char)
            processed_char_in_lookahead = true unless char.source.nil?
            lookahead_buf << char
            next
          end

          # The run ended: emit it, then pass the current char through unchanged
          offset = flush_lookahead(y, lookahead_buf, last_non_prolonged_char, processed_char_in_lookahead, offset)
          y << char.with_offset(offset)
          offset += char.c.length
          last_non_prolonged_char = char
          processed_char_in_lookahead = false
          next
        end

        if hyphen_like_char?(char)
          should_process = !@skip_already_transliterated_chars || !char.transliterated?
          if should_process && !last_non_prolonged_char.nil?
            if prolongable_char?(last_non_prolonged_char.c.ord)
              # Match the width of the preceding kana
              replacement = halfwidth?(last_non_prolonged_char.c.ord) ? "\uff70" : "\u30fc"
              y << Char.new(c: replacement, offset: offset, source: char)
              offset += replacement.length
              next
            elsif @replace_prolonged_marks_following_alnums && alphanumeric?(last_non_prolonged_char.c.ord)
              # Defer the decision until the whole run of hyphen-like chars is known
              lookahead_buf << char
              next
            end
          end
        else
          last_non_prolonged_char = char
        end
        y << char.with_offset(offset)
        offset += char.c.length
      end

      # If the input ended while hyphen-like chars were still buffered, emit
      # them instead of silently dropping them.
      unless lookahead_buf.empty?
        flush_lookahead(y, lookahead_buf, last_non_prolonged_char, processed_char_in_lookahead, offset)
      end

      # Keep the block's value identical to what it was before the final flush was added
      result
    end
  end

  private

  # Check whether a Char holds a hyphen-like character; chars with an empty
  # string are never hyphen-like (String#ord raises on an empty string).
  #
  # @param char [Char] The character to check
  # @return [Boolean]
  def hyphen_like_char?(char)
    !char.c.empty? && hyphen_like?(char.c.ord)
  end

  # Emit and clear the buffered run of hyphen-like characters.
  #
  # The buffer is only ever filled while the preceding character is a non-nil
  # alphanumeric, so +preceding_char+ is never nil here.
  #
  # @param yielder [#<<] Receiver of the output chars
  # @param buffered_chars [Array<Char>] The buffered run; emptied by this call
  # @param preceding_char [Char] The last non-hyphen char seen before the run
  # @param processed_in_lookahead [Boolean] Whether any char added to the run after its first had a source
  # @param offset [Integer] Output offset of the first emitted char
  # @return [Integer] The output offset following the emitted chars
  def flush_lookahead(yielder, buffered_chars, preceding_char, processed_in_lookahead, offset)
    preceding_code = preceding_char.c.ord
    as_hyphens = alphanumeric?(preceding_code) &&
                 (!@skip_already_transliterated_chars || !processed_in_lookahead)

    if as_hyphens
      # Match the width of the preceding alphanumeric
      replacement = halfwidth?(preceding_code) ? "\u002d" : "\uff0d"
      buffered_chars.each do |buffered_char|
        yielder << Char.new(c: replacement, offset: offset, source: buffered_char)
        offset += replacement.length
      end
    else
      buffered_chars.each do |buffered_char|
        yielder << buffered_char.with_offset(offset)
        offset += buffered_char.c.length
      end
    end

    buffered_chars.clear
    offset
  end

  # Check if a character can be prolonged
  #
  # @param char_code [Integer] The code point of the character to check
  # @return [Boolean] True if the character can be prolonged
  def prolongable_char?(char_code)
    # Default prolongable characters
    return true if prolongable?(char_code)

    # Hatsuon (if allowed)
    return true if @allow_prolonged_hatsuon && hatsuon?(char_code)

    # Sokuon (if allowed)
    return true if @allow_prolonged_sokuon && sokuon?(char_code)

    false
  end
end
196
+
197
class << self
  # Build a prolonged sound marks transliterator.
  #
  # @param options [Hash] Configuration options, handed to Transliterator.new as-is
  # @return [Transliterator] A new prolonged sound marks transliterator instance
  def call(options = {})
    Transliterator.new(options)
  end
end
204
+ end
205
+ end
206
+ end
@@ -0,0 +1,361 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Replace Kangxi radicals with equivalent CJK ideographs
6
+ module Radicals
7
+ # Generated mapping data from radicals.json
8
+ RADICALS_MAPPINGS = {
9
+ "\u{2f00}" => "\u{4e00}",
10
+ "\u{2f01}" => "\u{4e28}",
11
+ "\u{2f02}" => "\u{4e36}",
12
+ "\u{2f03}" => "\u{4e3f}",
13
+ "\u{2f04}" => "\u{4e59}",
14
+ "\u{2f05}" => "\u{4e85}",
15
+ "\u{2f06}" => "\u{4e8c}",
16
+ "\u{2f07}" => "\u{4ea0}",
17
+ "\u{2f08}" => "\u{4eba}",
18
+ "\u{2f09}" => "\u{513f}",
19
+ "\u{2f0a}" => "\u{5165}",
20
+ "\u{2f0b}" => "\u{516b}",
21
+ "\u{2f0c}" => "\u{5182}",
22
+ "\u{2f0d}" => "\u{5196}",
23
+ "\u{2f0e}" => "\u{51ab}",
24
+ "\u{2f0f}" => "\u{51e0}",
25
+ "\u{2f10}" => "\u{51f5}",
26
+ "\u{2f11}" => "\u{5200}",
27
+ "\u{2f12}" => "\u{529b}",
28
+ "\u{2f13}" => "\u{52f9}",
29
+ "\u{2f14}" => "\u{5315}",
30
+ "\u{2f15}" => "\u{531a}",
31
+ "\u{2f16}" => "\u{5338}",
32
+ "\u{2f17}" => "\u{5341}",
33
+ "\u{2f18}" => "\u{535c}",
34
+ "\u{2f19}" => "\u{5369}",
35
+ "\u{2f1a}" => "\u{5382}",
36
+ "\u{2f1b}" => "\u{53b6}",
37
+ "\u{2f1c}" => "\u{53c8}",
38
+ "\u{2f1d}" => "\u{53e3}",
39
+ "\u{2f1e}" => "\u{56d7}",
40
+ "\u{2f1f}" => "\u{571f}",
41
+ "\u{2f20}" => "\u{58eb}",
42
+ "\u{2f21}" => "\u{5902}",
43
+ "\u{2f22}" => "\u{590a}",
44
+ "\u{2f23}" => "\u{5915}",
45
+ "\u{2f24}" => "\u{5927}",
46
+ "\u{2f25}" => "\u{5973}",
47
+ "\u{2f26}" => "\u{5b50}",
48
+ "\u{2f27}" => "\u{5b80}",
49
+ "\u{2f28}" => "\u{5bf8}",
50
+ "\u{2f29}" => "\u{5c0f}",
51
+ "\u{2f2a}" => "\u{5c22}",
52
+ "\u{2f2b}" => "\u{5c38}",
53
+ "\u{2f2c}" => "\u{5c6e}",
54
+ "\u{2f2d}" => "\u{5c71}",
55
+ "\u{2f2e}" => "\u{5ddb}",
56
+ "\u{2f2f}" => "\u{5de5}",
57
+ "\u{2f30}" => "\u{5df1}",
58
+ "\u{2f31}" => "\u{5dfe}",
59
+ "\u{2f32}" => "\u{5e72}",
60
+ "\u{2f33}" => "\u{5e7a}",
61
+ "\u{2f34}" => "\u{5e7f}",
62
+ "\u{2f35}" => "\u{5ef4}",
63
+ "\u{2f36}" => "\u{5efe}",
64
+ "\u{2f37}" => "\u{5f0b}",
65
+ "\u{2f38}" => "\u{5f13}",
66
+ "\u{2f39}" => "\u{5f50}",
67
+ "\u{2f3a}" => "\u{5f61}",
68
+ "\u{2f3b}" => "\u{5f73}",
69
+ "\u{2f3c}" => "\u{5fc3}",
70
+ "\u{2f3d}" => "\u{6208}",
71
+ "\u{2f3e}" => "\u{6236}",
72
+ "\u{2f3f}" => "\u{624b}",
73
+ "\u{2f40}" => "\u{652f}",
74
+ "\u{2f41}" => "\u{6534}",
75
+ "\u{2f42}" => "\u{6587}",
76
+ "\u{2f43}" => "\u{6597}",
77
+ "\u{2f44}" => "\u{65a4}",
78
+ "\u{2f45}" => "\u{65b9}",
79
+ "\u{2f46}" => "\u{65e0}",
80
+ "\u{2f47}" => "\u{65e5}",
81
+ "\u{2f48}" => "\u{66f0}",
82
+ "\u{2f49}" => "\u{6708}",
83
+ "\u{2f4a}" => "\u{6728}",
84
+ "\u{2f4b}" => "\u{6b20}",
85
+ "\u{2f4c}" => "\u{6b62}",
86
+ "\u{2f4d}" => "\u{6b79}",
87
+ "\u{2f4e}" => "\u{6bb3}",
88
+ "\u{2f4f}" => "\u{6bcb}",
89
+ "\u{2f50}" => "\u{6bd4}",
90
+ "\u{2f51}" => "\u{6bdb}",
91
+ "\u{2f52}" => "\u{6c0f}",
92
+ "\u{2f53}" => "\u{6c14}",
93
+ "\u{2f54}" => "\u{6c34}",
94
+ "\u{2f55}" => "\u{706b}",
95
+ "\u{2f56}" => "\u{722a}",
96
+ "\u{2f57}" => "\u{7236}",
97
+ "\u{2f58}" => "\u{723b}",
98
+ "\u{2f59}" => "\u{723f}",
99
+ "\u{2f5a}" => "\u{7247}",
100
+ "\u{2f5b}" => "\u{7259}",
101
+ "\u{2f5c}" => "\u{725b}",
102
+ "\u{2f5d}" => "\u{72ac}",
103
+ "\u{2f5e}" => "\u{7384}",
104
+ "\u{2f5f}" => "\u{7389}",
105
+ "\u{2f60}" => "\u{74dc}",
106
+ "\u{2f61}" => "\u{74e6}",
107
+ "\u{2f62}" => "\u{7518}",
108
+ "\u{2f63}" => "\u{751f}",
109
+ "\u{2f64}" => "\u{7528}",
110
+ "\u{2f65}" => "\u{7530}",
111
+ "\u{2f66}" => "\u{758b}",
112
+ "\u{2f67}" => "\u{7592}",
113
+ "\u{2f68}" => "\u{7676}",
114
+ "\u{2f69}" => "\u{767d}",
115
+ "\u{2f6a}" => "\u{76ae}",
116
+ "\u{2f6b}" => "\u{76bf}",
117
+ "\u{2f6c}" => "\u{76ee}",
118
+ "\u{2f6d}" => "\u{77db}",
119
+ "\u{2f6e}" => "\u{77e2}",
120
+ "\u{2f6f}" => "\u{77f3}",
121
+ "\u{2f70}" => "\u{793a}",
122
+ "\u{2f71}" => "\u{79b8}",
123
+ "\u{2f72}" => "\u{79be}",
124
+ "\u{2f73}" => "\u{7a74}",
125
+ "\u{2f74}" => "\u{7acb}",
126
+ "\u{2f75}" => "\u{7af9}",
127
+ "\u{2f76}" => "\u{7c73}",
128
+ "\u{2f77}" => "\u{7cf8}",
129
+ "\u{2f78}" => "\u{7f36}",
130
+ "\u{2f79}" => "\u{7f51}",
131
+ "\u{2f7a}" => "\u{7f8a}",
132
+ "\u{2f7b}" => "\u{7fbd}",
133
+ "\u{2f7c}" => "\u{8001}",
134
+ "\u{2f7d}" => "\u{800c}",
135
+ "\u{2f7e}" => "\u{8012}",
136
+ "\u{2f7f}" => "\u{8033}",
137
+ "\u{2f80}" => "\u{807f}",
138
+ "\u{2f81}" => "\u{8089}",
139
+ "\u{2f82}" => "\u{81e3}",
140
+ "\u{2f83}" => "\u{81ea}",
141
+ "\u{2f84}" => "\u{81f3}",
142
+ "\u{2f85}" => "\u{81fc}",
143
+ "\u{2f86}" => "\u{820c}",
144
+ "\u{2f87}" => "\u{821b}",
145
+ "\u{2f88}" => "\u{821f}",
146
+ "\u{2f89}" => "\u{826e}",
147
+ "\u{2f8a}" => "\u{8272}",
148
+ "\u{2f8b}" => "\u{8278}",
149
+ "\u{2f8c}" => "\u{864d}",
150
+ "\u{2f8d}" => "\u{866b}",
151
+ "\u{2f8e}" => "\u{8840}",
152
+ "\u{2f8f}" => "\u{884c}",
153
+ "\u{2f90}" => "\u{8863}",
154
+ "\u{2f91}" => "\u{897e}",
155
+ "\u{2f92}" => "\u{898b}",
156
+ "\u{2f93}" => "\u{89d2}",
157
+ "\u{2f94}" => "\u{8a00}",
158
+ "\u{2f95}" => "\u{8c37}",
159
+ "\u{2f96}" => "\u{8c46}",
160
+ "\u{2f97}" => "\u{8c55}",
161
+ "\u{2f98}" => "\u{8c78}",
162
+ "\u{2f99}" => "\u{8c9d}",
163
+ "\u{2f9a}" => "\u{8d64}",
164
+ "\u{2f9b}" => "\u{8d70}",
165
+ "\u{2f9c}" => "\u{8db3}",
166
+ "\u{2f9d}" => "\u{8eab}",
167
+ "\u{2f9e}" => "\u{8eca}",
168
+ "\u{2f9f}" => "\u{8f9b}",
169
+ "\u{2fa0}" => "\u{8fb0}",
170
+ "\u{2fa1}" => "\u{8fb5}",
171
+ "\u{2fa2}" => "\u{9091}",
172
+ "\u{2fa3}" => "\u{9149}",
173
+ "\u{2fa4}" => "\u{91c6}",
174
+ "\u{2fa5}" => "\u{91cc}",
175
+ "\u{2fa6}" => "\u{91d1}",
176
+ "\u{2fa7}" => "\u{9577}",
177
+ "\u{2fa8}" => "\u{9580}",
178
+ "\u{2fa9}" => "\u{961c}",
179
+ "\u{2faa}" => "\u{96b6}",
180
+ "\u{2fab}" => "\u{96b9}",
181
+ "\u{2fac}" => "\u{96e8}",
182
+ "\u{2fad}" => "\u{9751}",
183
+ "\u{2fae}" => "\u{975e}",
184
+ "\u{2faf}" => "\u{9762}",
185
+ "\u{2fb0}" => "\u{9769}",
186
+ "\u{2fb1}" => "\u{97cb}",
187
+ "\u{2fb2}" => "\u{97ed}",
188
+ "\u{2fb3}" => "\u{97f3}",
189
+ "\u{2fb4}" => "\u{9801}",
190
+ "\u{2fb5}" => "\u{98a8}",
191
+ "\u{2fb6}" => "\u{98db}",
192
+ "\u{2fb7}" => "\u{98df}",
193
+ "\u{2fb8}" => "\u{9996}",
194
+ "\u{2fb9}" => "\u{9999}",
195
+ "\u{2fba}" => "\u{99ac}",
196
+ "\u{2fbb}" => "\u{9aa8}",
197
+ "\u{2fbc}" => "\u{9ad8}",
198
+ "\u{2fbd}" => "\u{9adf}",
199
+ "\u{2fbe}" => "\u{9b25}",
200
+ "\u{2fbf}" => "\u{9b2f}",
201
+ "\u{2fc0}" => "\u{9b32}",
202
+ "\u{2fc1}" => "\u{9b3c}",
203
+ "\u{2fc2}" => "\u{9b5a}",
204
+ "\u{2fc3}" => "\u{9ce5}",
205
+ "\u{2fc4}" => "\u{9e75}",
206
+ "\u{2fc5}" => "\u{9e7f}",
207
+ "\u{2fc6}" => "\u{9ea5}",
208
+ "\u{2fc7}" => "\u{9ebb}",
209
+ "\u{2fc8}" => "\u{9ec3}",
210
+ "\u{2fc9}" => "\u{9ecd}",
211
+ "\u{2fca}" => "\u{9ed1}",
212
+ "\u{2fcb}" => "\u{9ef9}",
213
+ "\u{2fcc}" => "\u{9efd}",
214
+ "\u{2fcd}" => "\u{9f0e}",
215
+ "\u{2fce}" => "\u{9f13}",
216
+ "\u{2fcf}" => "\u{9f20}",
217
+ "\u{2fd0}" => "\u{9f3b}",
218
+ "\u{2fd1}" => "\u{9f4a}",
219
+ "\u{2fd2}" => "\u{9f52}",
220
+ "\u{2fd3}" => "\u{9f8d}",
221
+ "\u{2fd4}" => "\u{9f9c}",
222
+ "\u{2fd5}" => "\u{9fa0}",
223
+ "\u{2e80}" => "\u{51ab}",
224
+ "\u{2e81}" => "\u{5382}",
225
+ "\u{2e82}" => "\u{4e5b}",
226
+ "\u{2e83}" => "\u{4e5a}",
227
+ "\u{2e84}" => "\u{4e59}",
228
+ "\u{2e85}" => "\u{4ebb}",
229
+ "\u{2e86}" => "\u{5182}",
230
+ "\u{2e89}" => "\u{5202}",
231
+ "\u{2e8a}" => "\u{535c}",
232
+ "\u{2e8b}" => "\u{353e}",
233
+ "\u{2e8e}" => "\u{5140}",
234
+ "\u{2e8f}" => "\u{5c23}",
235
+ "\u{2e90}" => "\u{5c22}",
236
+ "\u{2e92}" => "\u{5df3}",
237
+ "\u{2e93}" => "\u{5e7a}",
238
+ "\u{2e94}" => "\u{5f51}",
239
+ "\u{2e95}" => "\u{5f50}",
240
+ "\u{2e96}" => "\u{5fc4}",
241
+ "\u{2e97}" => "\u{38fa}",
242
+ "\u{2e98}" => "\u{624c}",
243
+ "\u{2e99}" => "\u{6535}",
244
+ "\u{2e9b}" => "\u{65e1}",
245
+ "\u{2e9d}" => "\u{6708}",
246
+ "\u{2e9e}" => "\u{6b7a}",
247
+ "\u{2e9f}" => "\u{6bcd}",
248
+ "\u{2ea0}" => "\u{6c11}",
249
+ "\u{2ea1}" => "\u{6c35}",
250
+ "\u{2ea2}" => "\u{6c3a}",
251
+ "\u{2ea3}" => "\u{706c}",
252
+ "\u{2ea5}" => "\u{722b}",
253
+ "\u{2ea6}" => "\u{4e2c}",
254
+ "\u{2ea8}" => "\u{72ad}",
255
+ "\u{2eab}" => "\u{7f52}",
256
+ "\u{2eac}" => "\u{793a}",
257
+ "\u{2ead}" => "\u{793b}",
258
+ "\u{2eaf}" => "\u{7cf9}",
259
+ "\u{2eb0}" => "\u{7e9f}",
260
+ "\u{2eb1}" => "\u{7f53}",
261
+ "\u{2eb3}" => "\u{34c1}",
262
+ "\u{2eb4}" => "\u{34c1}",
263
+ "\u{2eb9}" => "\u{8002}",
264
+ "\u{2eba}" => "\u{8080}",
265
+ "\u{2ebc}" => "\u{6708}",
266
+ "\u{2ebd}" => "\u{81fc}",
267
+ "\u{2ebe}" => "\u{8279}",
268
+ "\u{2ebf}" => "\u{8279}",
269
+ "\u{2ec0}" => "\u{8279}",
270
+ "\u{2ec1}" => "\u{864e}",
271
+ "\u{2ec2}" => "\u{8864}",
272
+ "\u{2ec3}" => "\u{8980}",
273
+ "\u{2ec4}" => "\u{897f}",
274
+ "\u{2ec5}" => "\u{89c1}",
275
+ "\u{2ec8}" => "\u{8ba0}",
276
+ "\u{2ec9}" => "\u{8d1d}",
277
+ "\u{2ecb}" => "\u{8f66}",
278
+ "\u{2ecd}" => "\u{8fb6}",
279
+ "\u{2ecf}" => "\u{961d}",
280
+ "\u{2ed0}" => "\u{9485}",
281
+ "\u{2ed1}" => "\u{9577}",
282
+ "\u{2ed2}" => "\u{9578}",
283
+ "\u{2ed3}" => "\u{957f}",
284
+ "\u{2ed6}" => "\u{961d}",
285
+ "\u{2ed8}" => "\u{9752}",
286
+ "\u{2ed9}" => "\u{97e6}",
287
+ "\u{2eda}" => "\u{9875}",
288
+ "\u{2edb}" => "\u{98ce}",
289
+ "\u{2edc}" => "\u{98de}",
290
+ "\u{2edd}" => "\u{98df}",
291
+ "\u{2edf}" => "\u{98e0}",
292
+ "\u{2ee0}" => "\u{9963}",
293
+ "\u{2ee2}" => "\u{9a6c}",
294
+ "\u{2ee3}" => "\u{9aa8}",
295
+ "\u{2ee4}" => "\u{9b3c}",
296
+ "\u{2ee5}" => "\u{9c7c}",
297
+ "\u{2ee6}" => "\u{9e1f}",
298
+ "\u{2ee7}" => "\u{5364}",
299
+ "\u{2ee8}" => "\u{9ea6}",
300
+ "\u{2ee9}" => "\u{9ec4}",
301
+ "\u{2eea}" => "\u{9efe}",
302
+ "\u{2eeb}" => "\u{6589}",
303
+ "\u{2eec}" => "\u{9f50}",
304
+ "\u{2eed}" => "\u{6b6f}",
305
+ "\u{2eee}" => "\u{9f7f}",
306
+ "\u{2eef}" => "\u{7adc}",
307
+ "\u{2ef0}" => "\u{9f99}",
308
+ "\u{2ef1}" => "\u{9f9c}",
309
+ "\u{2ef2}" => "\u{4e80}",
310
+ "\u{2ef3}" => "\u{9f9f}"
311
+ }.freeze
312
+
313
# Transliterator that swaps the radical characters listed in RADICALS_MAPPINGS
# for their mapped ideographs
class Transliterator < Yosina::BaseTransliterator
  # Build the transliterator. The options hash is accepted but ignored.
  #
  # @param _options [Hash] Configuration options (currently unused)
  def initialize(_options = {})
    super()
  end

  # Replace every character found in RADICALS_MAPPINGS with its mapped
  # ideograph, recomputing offsets for the output sequence.
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    position = 0

    converted = input_chars.filter_map do |char|
      mapped = RADICALS_MAPPINGS[char.c]
      # An empty mapping means the character is removed from the output
      next if mapped&.empty?

      emitted = mapped ? Char.new(c: mapped, offset: position, source: char) : char.with_offset(position)
      position += emitted.c.length
      emitted
    end

    # Mix Yosina::Chars into this one result object only
    converted.singleton_class.include(Yosina::Chars)
    converted
  end
end
351
+
352
class << self
  # Build a radicals transliterator.
  #
  # @param options [Hash] Configuration options, handed to Transliterator.new as-is
  # @return [Transliterator] A new radicals transliterator instance
  def call(options = {})
    Transliterator.new(options)
  end
end
359
+ end
360
+ end
361
+ end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Replace various space characters with plain whitespace
6
+ module Spaces
7
+ # Generated mapping data from spaces.json
8
+ SPACES_MAPPINGS = {
9
+ "\u{a0}" => ' ',
10
+ "\u{180e}" => '',
11
+ "\u{2000}" => ' ',
12
+ "\u{2001}" => ' ',
13
+ "\u{2002}" => ' ',
14
+ "\u{2003}" => ' ',
15
+ "\u{2004}" => ' ',
16
+ "\u{2005}" => ' ',
17
+ "\u{2006}" => ' ',
18
+ "\u{2007}" => ' ',
19
+ "\u{2008}" => ' ',
20
+ "\u{2009}" => ' ',
21
+ "\u{200a}" => ' ',
22
+ "\u{200b}" => ' ',
23
+ "\u{202f}" => ' ',
24
+ "\u{205f}" => ' ',
25
+ "\u{3000}" => ' ',
26
+ "\u{3164}" => ' ',
27
+ "\u{ffa0}" => ' ',
28
+ "\u{feff}" => ''
29
+ }.freeze
30
+
31
# Transliterator that rewrites the characters listed in SPACES_MAPPINGS
class Transliterator < Yosina::BaseTransliterator
  # Build the transliterator. The options hash is accepted but ignored.
  #
  # @param _options [Hash] Configuration options (currently unused)
  def initialize(_options = {})
    super()
  end

  # Replace every character found in SPACES_MAPPINGS with its mapped string,
  # dropping characters whose mapping is empty, and recompute output offsets.
  #
  # @param input_chars [Enumerable<Char>] The characters to transliterate
  # @return [Enumerable<Char>] The transliterated characters
  def call(input_chars)
    position = 0

    converted = input_chars.filter_map do |char|
      mapped = SPACES_MAPPINGS[char.c]
      # An empty mapping means the character is removed from the output
      next if mapped&.empty?

      emitted = mapped ? Char.new(c: mapped, offset: position, source: char) : char.with_offset(position)
      position += emitted.c.length
      emitted
    end

    # Mix Yosina::Chars into this one result object only
    converted.singleton_class.include(Yosina::Chars)
    converted
  end
end
69
+
70
class << self
  # Build a spaces transliterator.
  #
  # @param options [Hash] Configuration options, handed to Transliterator.new as-is
  # @return [Transliterator] A new spaces transliterator instance
  def call(options = {})
    Transliterator.new(options)
  end
end
77
+ end
78
+ end
79
+ end