yosina 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +36 -0
- data/Gemfile +6 -0
- data/README.ja.md +229 -0
- data/README.md +229 -0
- data/Rakefile +30 -0
- data/codegen/dataset.rb +215 -0
- data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
- data/codegen/emitters/combined_transliterator_data.rb +28 -0
- data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
- data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
- data/codegen/emitters/simple_transliterator.rb +76 -0
- data/codegen/emitters/utils.rb +45 -0
- data/codegen/emitters.rb +8 -0
- data/codegen/main.rb +109 -0
- data/lib/yosina/char.rb +65 -0
- data/lib/yosina/chars.rb +152 -0
- data/lib/yosina/recipes.rb +359 -0
- data/lib/yosina/transliterator.rb +49 -0
- data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
- data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
- data/lib/yosina/transliterators/combined.rb +52 -0
- data/lib/yosina/transliterators/combined_data.rb +495 -0
- data/lib/yosina/transliterators/hira_kata.rb +106 -0
- data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
- data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
- data/lib/yosina/transliterators/hyphens.rb +83 -0
- data/lib/yosina/transliterators/hyphens_data.rb +60 -0
- data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
- data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
- data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
- data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
- data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
- data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
- data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
- data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
- data/lib/yosina/transliterators/radicals.rb +361 -0
- data/lib/yosina/transliterators/spaces.rb +79 -0
- data/lib/yosina/transliterators.rb +57 -0
- data/lib/yosina/version.rb +5 -0
- data/lib/yosina.rb +62 -0
- data/yosina.gemspec +41 -0
- metadata +159 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Handle prolonged sound marks transliterator
|
|
6
|
+
module ProlongedSoundMarks
|
|
7
|
+
# Mix-in for character type checks
|
|
8
|
+
module CharType
|
|
9
|
+
# Code points regarded as "hyphen-like", i.e. candidates for conversion to a
# prolonged sound mark. Both prolonged sound marks are part of the list as
# well, so marks that are already in place are matched too.
HYPHEN_LIKE_CHARS = [
  # Hyphens, dashes and minus signs
  0x002D, # HYPHEN-MINUS
  0x2010, # HYPHEN
  0x2014, # EM DASH
  0x2015, # HORIZONTAL BAR
  0x2212, # MINUS SIGN
  0xFF0D, # FULLWIDTH HYPHEN-MINUS
  # Prolonged sound marks (already converted)
  0xFF70, # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
  0x30FC  # KATAKANA-HIRAGANA PROLONGED SOUND MARK
].freeze
|
|
20
|
+
|
|
21
|
+
# Tell whether a code point counts as halfwidth: either a halfwidth
# alphanumeric (see halfwidth_alphanumeric?) or any code point in
# U+FF66..U+FF9F.
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean] true when the code point is halfwidth
def halfwidth?(char_code)
  # U+FF66..U+FF6F and U+FF70..U+FF9F are adjacent, so one range covers both.
  halfwidth_alphanumeric?(char_code) || (0xff66..0xff9f).cover?(char_code)
end
|
|
27
|
+
|
|
28
|
+
# Tell whether a code point is a hiragana letter for the purposes of this
# module: U+3041..U+309C or U+309F, minus U+3063 and U+3093 (the code points
# matched by sokuon? and hatsuon?).
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def hiragana?(char_code)
  return true if char_code == 0x309f
  return false if [0x3063, 0x3093].include?(char_code)

  (0x3041..0x309c).cover?(char_code)
end
|
|
33
|
+
|
|
34
|
+
# Tell whether a code point is a katakana letter for the purposes of this
# module.
#
# Matches fullwidth katakana (U+30A1..U+30FA, U+30FD..U+30FF) and the
# halfwidth katakana range U+FF66..U+FF9F. The code points matched by
# sokuon? and hatsuon? (U+30C3, U+30F3, U+FF6F, U+FF9D) are excluded.
#
# The halfwidth range used to start at U+FF70, which left halfwidth WO and
# the halfwidth small kana (U+FF66..U+FF6E) unclassified and turned the
# U+FF6F exclusion into dead code.
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def katakana?(char_code)
  case char_code
  when 0x30c3, 0x30f3, 0xff6f, 0xff9d
    false # sokuon / hatsuon are handled by their own predicates
  when 0x30a1..0x30fa, 0x30fd..0x30ff, 0xff66..0xff9f
    true
  else
    false
  end
end
|
|
39
|
+
|
|
40
|
+
# Tell whether a code point is an ASCII digit (U+0030..U+0039), an ASCII
# uppercase letter (U+0041..U+005A) or an ASCII lowercase letter
# (U+0061..U+007A).
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def halfwidth_alphanumeric?(char_code)
  case char_code
  when 0x30..0x39, 0x41..0x5A, 0x61..0x7A then true
  else false
  end
end
|
|
45
|
+
|
|
46
|
+
# Tell whether a code point is a fullwidth digit (U+FF10..U+FF19), a
# fullwidth uppercase letter (U+FF21..U+FF3A) or a fullwidth lowercase
# letter (U+FF41..U+FF5A).
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def fullwidth_alphanumeric?(char_code)
  char_code.between?(0xff10, 0xff19) ||
    char_code.between?(0xff21, 0xff3a) ||
    char_code.between?(0xff41, 0xff5a)
end
|
|
51
|
+
|
|
52
|
+
# Tell whether a code point counts as fullwidth: the prolonged sound mark
# U+30FC, or anything accepted by hiragana?, katakana? or
# fullwidth_alphanumeric?.
#
# NOTE(review): katakana? in this module also accepts halfwidth katakana code
# points, so those are reported as fullwidth here too - confirm that this is
# intended.
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def fullwidth?(char_code)
  return true if char_code == 0x30fc

  hiragana?(char_code) || katakana?(char_code) || fullwidth_alphanumeric?(char_code)
end
|
|
56
|
+
|
|
57
|
+
# Tell whether a code point is a letter or digit in either width, i.e. it is
# accepted by halfwidth_alphanumeric? or by fullwidth_alphanumeric?.
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def alphanumeric?(char_code)
  return true if halfwidth_alphanumeric?(char_code)

  fullwidth_alphanumeric?(char_code)
end
|
|
61
|
+
|
|
62
|
+
# Tell whether a code point is one of the three hatsuon code points:
# U+3093 (hiragana N), U+30F3 (katakana N) or U+FF9D (halfwidth katakana N).
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def hatsuon?(char_code)
  case char_code
  when 0x3093, 0x30f3, 0xff9d then true
  else false
  end
end
|
|
65
|
+
|
|
66
|
+
# Tell whether a code point is one of the three sokuon code points:
# U+3063 (hiragana small TU), U+30C3 (katakana small TU) or U+FF6F
# (halfwidth katakana small TU).
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def sokuon?(char_code)
  case char_code
  when 0x3063, 0x30c3, 0xff6f then true
  else false
  end
end
|
|
69
|
+
|
|
70
|
+
# Tell whether a code point is a prolonged sound mark: U+30FC or its
# halfwidth form U+FF70.
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def prolonged_sound_mark?(char_code)
  char_code == 0x30fc || char_code == 0xff70
end
|
|
73
|
+
|
|
74
|
+
# Tell whether a code point can be followed by a prolonged sound mark by
# default: it is itself a prolonged sound mark, or it is accepted by
# hiragana? or katakana?.
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def prolongable?(char_code)
  return true if prolonged_sound_mark?(char_code)
  return true if hiragana?(char_code)

  katakana?(char_code)
end
|
|
77
|
+
|
|
78
|
+
# Tell whether a code point is listed in HYPHEN_LIKE_CHARS.
#
# @param char_code [Integer] Unicode code point to classify
# @return [Boolean]
def hyphen_like?(char_code)
  HYPHEN_LIKE_CHARS.any? { |candidate| candidate == char_code }
end
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# Transliterator for prolonged sound marks
|
|
84
|
+
class Transliterator < Yosina::BaseTransliterator
|
|
85
|
+
attr_reader :skip_already_transliterated_chars,
            :allow_prolonged_hatsuon,
            :allow_prolonged_sokuon,
            :replace_prolonged_marks_following_alnums

# Build the transliterator from an options hash.
#
# Each flag is read with a default of +false+, so an absent key leaves the
# corresponding behaviour switched off.
#
# @param options [Hash] Configuration options, keyed by Symbol
# @option options [Boolean] :skip_already_transliterated_chars Leave characters
#   that an earlier step already produced untouched. Default: false.
# @option options [Boolean] :allow_prolonged_hatsuon Also treat the hatsuon
#   code points (U+3093 / U+30F3 / U+FF9D) as prolongable. Default: false.
# @option options [Boolean] :allow_prolonged_sokuon Also treat the sokuon
#   code points (U+3063 / U+30C3 / U+FF6F) as prolongable. Default: false.
# @option options [Boolean] :replace_prolonged_marks_following_alnums Replace
#   prolonged marks that follow an alphanumeric with hyphens. Default: false.
def initialize(options = {})
  super()
  @skip_already_transliterated_chars,
    @allow_prolonged_hatsuon,
    @allow_prolonged_sokuon,
    @replace_prolonged_marks_following_alnums = %i[
      skip_already_transliterated_chars
      allow_prolonged_hatsuon
      allow_prolonged_sokuon
      replace_prolonged_marks_following_alnums
    ].map { |key| options.fetch(key, false) }
end
|
|
106
|
+
|
|
107
|
+
# Convert hyphen-like characters to the appropriate prolonged sound marks.
#
# * A hyphen-like character that follows a prolongable character becomes
#   U+30FC, or U+FF70 when that preceding character is halfwidth.
# * When +replace_prolonged_marks_following_alnums+ is enabled, a run of
#   hyphen-like characters that follows an alphanumeric is buffered and then
#   emitted as hyphen-minus characters (U+002D after a halfwidth
#   alphanumeric, U+FF0D otherwise). The run is emitted unchanged when
#   +skip_already_transliterated_chars+ is enabled and a character appended
#   to the run after its first one has a source.
# * Every other character is passed through with its offset recomputed.
#
# All scanning state is created inside the enumeration block, so each pass
# over the returned object starts from a clean slate. A run that is still
# buffered when the input ends is flushed instead of being dropped.
#
# @param input_chars [Enumerable<Char>] The characters to transliterate
# @return [Enumerable<Char>] The transliterated characters
def call(input_chars)
  Chars.enum do |y|
    offset = 0
    processed_char_in_lookahead = false
    lookahead_buf = []
    last_non_prolonged_char = nil

    # Emit the buffered run, rewritten as hyphens or untouched, then reset
    # the lookahead state. +prev_char+ is the last non-hyphen character seen
    # before the run; +next_char+ is the character that ended the run (nil at
    # end of input) and only serves as a width reference if +prev_char+ is nil.
    flush_lookahead = lambda do |prev_char, next_char|
      to_hyphen = (prev_char.nil? || alphanumeric?(prev_char.c.ord)) &&
                  (!@skip_already_transliterated_chars || !processed_char_in_lookahead)
      if to_hyphen
        width_reference = prev_char || next_char
        replacement = halfwidth?(width_reference.c.ord) ? "\u002d" : "\uff0d"
        lookahead_buf.each do |buffered_char|
          y << Char.new(c: replacement, offset: offset, source: buffered_char)
          offset += replacement.length
        end
      else
        lookahead_buf.each do |buffered_char|
          y << buffered_char.with_offset(offset)
          offset += buffered_char.c.length
        end
      end
      lookahead_buf.clear
      processed_char_in_lookahead = false
    end

    input_chars.each do |char|
      # An empty +c+ can never be hyphen-like (and has no code point to test).
      hyphen = !char.c.empty? && hyphen_like?(char.c.ord)

      unless lookahead_buf.empty?
        if hyphen
          # Keep extending the buffered run.
          processed_char_in_lookahead = true unless char.source.nil?
          lookahead_buf << char
          next
        end

        # A non-hyphen character ends the run: flush it, then emit the
        # character that ended it.
        flush_lookahead.call(last_non_prolonged_char, char)
        y << char.with_offset(offset)
        offset += char.c.length
        last_non_prolonged_char = char
        next
      end

      if hyphen
        should_process = !@skip_already_transliterated_chars || !char.transliterated?
        if should_process && !last_non_prolonged_char.nil?
          if prolongable_char?(last_non_prolonged_char.c.ord)
            replacement = halfwidth?(last_non_prolonged_char.c.ord) ? "\uff70" : "\u30fc"
            y << Char.new(c: replacement, offset: offset, source: char)
            offset += replacement.length
            next
          elsif @replace_prolonged_marks_following_alnums && alphanumeric?(last_non_prolonged_char.c.ord)
            # Decide once the whole run of hyphen-like characters is known.
            lookahead_buf << char
            next
          end
        end
      else
        last_non_prolonged_char = char
      end

      y << char.with_offset(offset)
      offset += char.c.length
    end

    # Input ended while a run was still buffered: emit it rather than lose it.
    flush_lookahead.call(last_non_prolonged_char, nil) unless lookahead_buf.empty?
  end
end
|
|
174
|
+
|
|
175
|
+
private
|
|
176
|
+
|
|
177
|
+
include CharType
|
|
178
|
+
|
|
179
|
+
# Decide whether a prolonged sound mark may follow the given code point,
# taking this instance's options into account.
#
# A code point qualifies when prolongable? accepts it, when it is a hatsuon
# and +allow_prolonged_hatsuon+ is enabled, or when it is a sokuon and
# +allow_prolonged_sokuon+ is enabled.
#
# @param char_code [Integer] Unicode code point of the preceding character
# @return [Boolean] True if the character can be prolonged
def prolongable_char?(char_code)
  prolongable?(char_code) ||
    (@allow_prolonged_hatsuon && hatsuon?(char_code)) ||
    (@allow_prolonged_sokuon && sokuon?(char_code)) ||
    false
end
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
# Factory entry point: builds a prolonged sound marks transliterator.
#
# @param options [Hash] Configuration options, handed to Transliterator.new as-is
# @return [Transliterator] A new prolonged sound marks transliterator instance
def self.call(options = {})
  Transliterator.new(options)
end
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
end
|
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Replace Kangxi radicals with equivalent CJK ideographs
|
|
6
|
+
module Radicals
|
|
7
|
+
# Generated mapping data from radicals.json
|
|
8
|
+
RADICALS_MAPPINGS = {
|
|
9
|
+
"\u{2f00}" => "\u{4e00}",
|
|
10
|
+
"\u{2f01}" => "\u{4e28}",
|
|
11
|
+
"\u{2f02}" => "\u{4e36}",
|
|
12
|
+
"\u{2f03}" => "\u{4e3f}",
|
|
13
|
+
"\u{2f04}" => "\u{4e59}",
|
|
14
|
+
"\u{2f05}" => "\u{4e85}",
|
|
15
|
+
"\u{2f06}" => "\u{4e8c}",
|
|
16
|
+
"\u{2f07}" => "\u{4ea0}",
|
|
17
|
+
"\u{2f08}" => "\u{4eba}",
|
|
18
|
+
"\u{2f09}" => "\u{513f}",
|
|
19
|
+
"\u{2f0a}" => "\u{5165}",
|
|
20
|
+
"\u{2f0b}" => "\u{516b}",
|
|
21
|
+
"\u{2f0c}" => "\u{5182}",
|
|
22
|
+
"\u{2f0d}" => "\u{5196}",
|
|
23
|
+
"\u{2f0e}" => "\u{51ab}",
|
|
24
|
+
"\u{2f0f}" => "\u{51e0}",
|
|
25
|
+
"\u{2f10}" => "\u{51f5}",
|
|
26
|
+
"\u{2f11}" => "\u{5200}",
|
|
27
|
+
"\u{2f12}" => "\u{529b}",
|
|
28
|
+
"\u{2f13}" => "\u{52f9}",
|
|
29
|
+
"\u{2f14}" => "\u{5315}",
|
|
30
|
+
"\u{2f15}" => "\u{531a}",
|
|
31
|
+
"\u{2f16}" => "\u{5338}",
|
|
32
|
+
"\u{2f17}" => "\u{5341}",
|
|
33
|
+
"\u{2f18}" => "\u{535c}",
|
|
34
|
+
"\u{2f19}" => "\u{5369}",
|
|
35
|
+
"\u{2f1a}" => "\u{5382}",
|
|
36
|
+
"\u{2f1b}" => "\u{53b6}",
|
|
37
|
+
"\u{2f1c}" => "\u{53c8}",
|
|
38
|
+
"\u{2f1d}" => "\u{53e3}",
|
|
39
|
+
"\u{2f1e}" => "\u{56d7}",
|
|
40
|
+
"\u{2f1f}" => "\u{571f}",
|
|
41
|
+
"\u{2f20}" => "\u{58eb}",
|
|
42
|
+
"\u{2f21}" => "\u{5902}",
|
|
43
|
+
"\u{2f22}" => "\u{590a}",
|
|
44
|
+
"\u{2f23}" => "\u{5915}",
|
|
45
|
+
"\u{2f24}" => "\u{5927}",
|
|
46
|
+
"\u{2f25}" => "\u{5973}",
|
|
47
|
+
"\u{2f26}" => "\u{5b50}",
|
|
48
|
+
"\u{2f27}" => "\u{5b80}",
|
|
49
|
+
"\u{2f28}" => "\u{5bf8}",
|
|
50
|
+
"\u{2f29}" => "\u{5c0f}",
|
|
51
|
+
"\u{2f2a}" => "\u{5c22}",
|
|
52
|
+
"\u{2f2b}" => "\u{5c38}",
|
|
53
|
+
"\u{2f2c}" => "\u{5c6e}",
|
|
54
|
+
"\u{2f2d}" => "\u{5c71}",
|
|
55
|
+
"\u{2f2e}" => "\u{5ddb}",
|
|
56
|
+
"\u{2f2f}" => "\u{5de5}",
|
|
57
|
+
"\u{2f30}" => "\u{5df1}",
|
|
58
|
+
"\u{2f31}" => "\u{5dfe}",
|
|
59
|
+
"\u{2f32}" => "\u{5e72}",
|
|
60
|
+
"\u{2f33}" => "\u{5e7a}",
|
|
61
|
+
"\u{2f34}" => "\u{5e7f}",
|
|
62
|
+
"\u{2f35}" => "\u{5ef4}",
|
|
63
|
+
"\u{2f36}" => "\u{5efe}",
|
|
64
|
+
"\u{2f37}" => "\u{5f0b}",
|
|
65
|
+
"\u{2f38}" => "\u{5f13}",
|
|
66
|
+
"\u{2f39}" => "\u{5f50}",
|
|
67
|
+
"\u{2f3a}" => "\u{5f61}",
|
|
68
|
+
"\u{2f3b}" => "\u{5f73}",
|
|
69
|
+
"\u{2f3c}" => "\u{5fc3}",
|
|
70
|
+
"\u{2f3d}" => "\u{6208}",
|
|
71
|
+
"\u{2f3e}" => "\u{6236}",
|
|
72
|
+
"\u{2f3f}" => "\u{624b}",
|
|
73
|
+
"\u{2f40}" => "\u{652f}",
|
|
74
|
+
"\u{2f41}" => "\u{6534}",
|
|
75
|
+
"\u{2f42}" => "\u{6587}",
|
|
76
|
+
"\u{2f43}" => "\u{6597}",
|
|
77
|
+
"\u{2f44}" => "\u{65a4}",
|
|
78
|
+
"\u{2f45}" => "\u{65b9}",
|
|
79
|
+
"\u{2f46}" => "\u{65e0}",
|
|
80
|
+
"\u{2f47}" => "\u{65e5}",
|
|
81
|
+
"\u{2f48}" => "\u{66f0}",
|
|
82
|
+
"\u{2f49}" => "\u{6708}",
|
|
83
|
+
"\u{2f4a}" => "\u{6728}",
|
|
84
|
+
"\u{2f4b}" => "\u{6b20}",
|
|
85
|
+
"\u{2f4c}" => "\u{6b62}",
|
|
86
|
+
"\u{2f4d}" => "\u{6b79}",
|
|
87
|
+
"\u{2f4e}" => "\u{6bb3}",
|
|
88
|
+
"\u{2f4f}" => "\u{6bcb}",
|
|
89
|
+
"\u{2f50}" => "\u{6bd4}",
|
|
90
|
+
"\u{2f51}" => "\u{6bdb}",
|
|
91
|
+
"\u{2f52}" => "\u{6c0f}",
|
|
92
|
+
"\u{2f53}" => "\u{6c14}",
|
|
93
|
+
"\u{2f54}" => "\u{6c34}",
|
|
94
|
+
"\u{2f55}" => "\u{706b}",
|
|
95
|
+
"\u{2f56}" => "\u{722a}",
|
|
96
|
+
"\u{2f57}" => "\u{7236}",
|
|
97
|
+
"\u{2f58}" => "\u{723b}",
|
|
98
|
+
"\u{2f59}" => "\u{723f}",
|
|
99
|
+
"\u{2f5a}" => "\u{7247}",
|
|
100
|
+
"\u{2f5b}" => "\u{7259}",
|
|
101
|
+
"\u{2f5c}" => "\u{725b}",
|
|
102
|
+
"\u{2f5d}" => "\u{72ac}",
|
|
103
|
+
"\u{2f5e}" => "\u{7384}",
|
|
104
|
+
"\u{2f5f}" => "\u{7389}",
|
|
105
|
+
"\u{2f60}" => "\u{74dc}",
|
|
106
|
+
"\u{2f61}" => "\u{74e6}",
|
|
107
|
+
"\u{2f62}" => "\u{7518}",
|
|
108
|
+
"\u{2f63}" => "\u{751f}",
|
|
109
|
+
"\u{2f64}" => "\u{7528}",
|
|
110
|
+
"\u{2f65}" => "\u{7530}",
|
|
111
|
+
"\u{2f66}" => "\u{758b}",
|
|
112
|
+
"\u{2f67}" => "\u{7592}",
|
|
113
|
+
"\u{2f68}" => "\u{7676}",
|
|
114
|
+
"\u{2f69}" => "\u{767d}",
|
|
115
|
+
"\u{2f6a}" => "\u{76ae}",
|
|
116
|
+
"\u{2f6b}" => "\u{76bf}",
|
|
117
|
+
"\u{2f6c}" => "\u{76ee}",
|
|
118
|
+
"\u{2f6d}" => "\u{77db}",
|
|
119
|
+
"\u{2f6e}" => "\u{77e2}",
|
|
120
|
+
"\u{2f6f}" => "\u{77f3}",
|
|
121
|
+
"\u{2f70}" => "\u{793a}",
|
|
122
|
+
"\u{2f71}" => "\u{79b8}",
|
|
123
|
+
"\u{2f72}" => "\u{79be}",
|
|
124
|
+
"\u{2f73}" => "\u{7a74}",
|
|
125
|
+
"\u{2f74}" => "\u{7acb}",
|
|
126
|
+
"\u{2f75}" => "\u{7af9}",
|
|
127
|
+
"\u{2f76}" => "\u{7c73}",
|
|
128
|
+
"\u{2f77}" => "\u{7cf8}",
|
|
129
|
+
"\u{2f78}" => "\u{7f36}",
|
|
130
|
+
"\u{2f79}" => "\u{7f51}",
|
|
131
|
+
"\u{2f7a}" => "\u{7f8a}",
|
|
132
|
+
"\u{2f7b}" => "\u{7fbd}",
|
|
133
|
+
"\u{2f7c}" => "\u{8001}",
|
|
134
|
+
"\u{2f7d}" => "\u{800c}",
|
|
135
|
+
"\u{2f7e}" => "\u{8012}",
|
|
136
|
+
"\u{2f7f}" => "\u{8033}",
|
|
137
|
+
"\u{2f80}" => "\u{807f}",
|
|
138
|
+
"\u{2f81}" => "\u{8089}",
|
|
139
|
+
"\u{2f82}" => "\u{81e3}",
|
|
140
|
+
"\u{2f83}" => "\u{81ea}",
|
|
141
|
+
"\u{2f84}" => "\u{81f3}",
|
|
142
|
+
"\u{2f85}" => "\u{81fc}",
|
|
143
|
+
"\u{2f86}" => "\u{820c}",
|
|
144
|
+
"\u{2f87}" => "\u{821b}",
|
|
145
|
+
"\u{2f88}" => "\u{821f}",
|
|
146
|
+
"\u{2f89}" => "\u{826e}",
|
|
147
|
+
"\u{2f8a}" => "\u{8272}",
|
|
148
|
+
"\u{2f8b}" => "\u{8278}",
|
|
149
|
+
"\u{2f8c}" => "\u{864d}",
|
|
150
|
+
"\u{2f8d}" => "\u{866b}",
|
|
151
|
+
"\u{2f8e}" => "\u{8840}",
|
|
152
|
+
"\u{2f8f}" => "\u{884c}",
|
|
153
|
+
"\u{2f90}" => "\u{8863}",
|
|
154
|
+
"\u{2f91}" => "\u{897e}",
|
|
155
|
+
"\u{2f92}" => "\u{898b}",
|
|
156
|
+
"\u{2f93}" => "\u{89d2}",
|
|
157
|
+
"\u{2f94}" => "\u{8a00}",
|
|
158
|
+
"\u{2f95}" => "\u{8c37}",
|
|
159
|
+
"\u{2f96}" => "\u{8c46}",
|
|
160
|
+
"\u{2f97}" => "\u{8c55}",
|
|
161
|
+
"\u{2f98}" => "\u{8c78}",
|
|
162
|
+
"\u{2f99}" => "\u{8c9d}",
|
|
163
|
+
"\u{2f9a}" => "\u{8d64}",
|
|
164
|
+
"\u{2f9b}" => "\u{8d70}",
|
|
165
|
+
"\u{2f9c}" => "\u{8db3}",
|
|
166
|
+
"\u{2f9d}" => "\u{8eab}",
|
|
167
|
+
"\u{2f9e}" => "\u{8eca}",
|
|
168
|
+
"\u{2f9f}" => "\u{8f9b}",
|
|
169
|
+
"\u{2fa0}" => "\u{8fb0}",
|
|
170
|
+
"\u{2fa1}" => "\u{8fb5}",
|
|
171
|
+
"\u{2fa2}" => "\u{9091}",
|
|
172
|
+
"\u{2fa3}" => "\u{9149}",
|
|
173
|
+
"\u{2fa4}" => "\u{91c6}",
|
|
174
|
+
"\u{2fa5}" => "\u{91cc}",
|
|
175
|
+
"\u{2fa6}" => "\u{91d1}",
|
|
176
|
+
"\u{2fa7}" => "\u{9577}",
|
|
177
|
+
"\u{2fa8}" => "\u{9580}",
|
|
178
|
+
"\u{2fa9}" => "\u{961c}",
|
|
179
|
+
"\u{2faa}" => "\u{96b6}",
|
|
180
|
+
"\u{2fab}" => "\u{96b9}",
|
|
181
|
+
"\u{2fac}" => "\u{96e8}",
|
|
182
|
+
"\u{2fad}" => "\u{9751}",
|
|
183
|
+
"\u{2fae}" => "\u{975e}",
|
|
184
|
+
"\u{2faf}" => "\u{9762}",
|
|
185
|
+
"\u{2fb0}" => "\u{9769}",
|
|
186
|
+
"\u{2fb1}" => "\u{97cb}",
|
|
187
|
+
"\u{2fb2}" => "\u{97ed}",
|
|
188
|
+
"\u{2fb3}" => "\u{97f3}",
|
|
189
|
+
"\u{2fb4}" => "\u{9801}",
|
|
190
|
+
"\u{2fb5}" => "\u{98a8}",
|
|
191
|
+
"\u{2fb6}" => "\u{98db}",
|
|
192
|
+
"\u{2fb7}" => "\u{98df}",
|
|
193
|
+
"\u{2fb8}" => "\u{9996}",
|
|
194
|
+
"\u{2fb9}" => "\u{9999}",
|
|
195
|
+
"\u{2fba}" => "\u{99ac}",
|
|
196
|
+
"\u{2fbb}" => "\u{9aa8}",
|
|
197
|
+
"\u{2fbc}" => "\u{9ad8}",
|
|
198
|
+
"\u{2fbd}" => "\u{9adf}",
|
|
199
|
+
"\u{2fbe}" => "\u{9b25}",
|
|
200
|
+
"\u{2fbf}" => "\u{9b2f}",
|
|
201
|
+
"\u{2fc0}" => "\u{9b32}",
|
|
202
|
+
"\u{2fc1}" => "\u{9b3c}",
|
|
203
|
+
"\u{2fc2}" => "\u{9b5a}",
|
|
204
|
+
"\u{2fc3}" => "\u{9ce5}",
|
|
205
|
+
"\u{2fc4}" => "\u{9e75}",
|
|
206
|
+
"\u{2fc5}" => "\u{9e7f}",
|
|
207
|
+
"\u{2fc6}" => "\u{9ea5}",
|
|
208
|
+
"\u{2fc7}" => "\u{9ebb}",
|
|
209
|
+
"\u{2fc8}" => "\u{9ec3}",
|
|
210
|
+
"\u{2fc9}" => "\u{9ecd}",
|
|
211
|
+
"\u{2fca}" => "\u{9ed1}",
|
|
212
|
+
"\u{2fcb}" => "\u{9ef9}",
|
|
213
|
+
"\u{2fcc}" => "\u{9efd}",
|
|
214
|
+
"\u{2fcd}" => "\u{9f0e}",
|
|
215
|
+
"\u{2fce}" => "\u{9f13}",
|
|
216
|
+
"\u{2fcf}" => "\u{9f20}",
|
|
217
|
+
"\u{2fd0}" => "\u{9f3b}",
|
|
218
|
+
"\u{2fd1}" => "\u{9f4a}",
|
|
219
|
+
"\u{2fd2}" => "\u{9f52}",
|
|
220
|
+
"\u{2fd3}" => "\u{9f8d}",
|
|
221
|
+
"\u{2fd4}" => "\u{9f9c}",
|
|
222
|
+
"\u{2fd5}" => "\u{9fa0}",
|
|
223
|
+
"\u{2e80}" => "\u{51ab}",
|
|
224
|
+
"\u{2e81}" => "\u{5382}",
|
|
225
|
+
"\u{2e82}" => "\u{4e5b}",
|
|
226
|
+
"\u{2e83}" => "\u{4e5a}",
|
|
227
|
+
"\u{2e84}" => "\u{4e59}",
|
|
228
|
+
"\u{2e85}" => "\u{4ebb}",
|
|
229
|
+
"\u{2e86}" => "\u{5182}",
|
|
230
|
+
"\u{2e89}" => "\u{5202}",
|
|
231
|
+
"\u{2e8a}" => "\u{535c}",
|
|
232
|
+
"\u{2e8b}" => "\u{353e}",
|
|
233
|
+
"\u{2e8e}" => "\u{5140}",
|
|
234
|
+
"\u{2e8f}" => "\u{5c23}",
|
|
235
|
+
"\u{2e90}" => "\u{5c22}",
|
|
236
|
+
"\u{2e92}" => "\u{5df3}",
|
|
237
|
+
"\u{2e93}" => "\u{5e7a}",
|
|
238
|
+
"\u{2e94}" => "\u{5f51}",
|
|
239
|
+
"\u{2e95}" => "\u{5f50}",
|
|
240
|
+
"\u{2e96}" => "\u{5fc4}",
|
|
241
|
+
"\u{2e97}" => "\u{38fa}",
|
|
242
|
+
"\u{2e98}" => "\u{624c}",
|
|
243
|
+
"\u{2e99}" => "\u{6535}",
|
|
244
|
+
"\u{2e9b}" => "\u{65e1}",
|
|
245
|
+
"\u{2e9d}" => "\u{6708}",
|
|
246
|
+
"\u{2e9e}" => "\u{6b7a}",
|
|
247
|
+
"\u{2e9f}" => "\u{6bcd}",
|
|
248
|
+
"\u{2ea0}" => "\u{6c11}",
|
|
249
|
+
"\u{2ea1}" => "\u{6c35}",
|
|
250
|
+
"\u{2ea2}" => "\u{6c3a}",
|
|
251
|
+
"\u{2ea3}" => "\u{706c}",
|
|
252
|
+
"\u{2ea5}" => "\u{722b}",
|
|
253
|
+
"\u{2ea6}" => "\u{4e2c}",
|
|
254
|
+
"\u{2ea8}" => "\u{72ad}",
|
|
255
|
+
"\u{2eab}" => "\u{7f52}",
|
|
256
|
+
"\u{2eac}" => "\u{793a}",
|
|
257
|
+
"\u{2ead}" => "\u{793b}",
|
|
258
|
+
"\u{2eaf}" => "\u{7cf9}",
|
|
259
|
+
"\u{2eb0}" => "\u{7e9f}",
|
|
260
|
+
"\u{2eb1}" => "\u{7f53}",
|
|
261
|
+
"\u{2eb3}" => "\u{34c1}",
|
|
262
|
+
"\u{2eb4}" => "\u{34c1}",
|
|
263
|
+
"\u{2eb9}" => "\u{8002}",
|
|
264
|
+
"\u{2eba}" => "\u{8080}",
|
|
265
|
+
"\u{2ebc}" => "\u{6708}",
|
|
266
|
+
"\u{2ebd}" => "\u{81fc}",
|
|
267
|
+
"\u{2ebe}" => "\u{8279}",
|
|
268
|
+
"\u{2ebf}" => "\u{8279}",
|
|
269
|
+
"\u{2ec0}" => "\u{8279}",
|
|
270
|
+
"\u{2ec1}" => "\u{864e}",
|
|
271
|
+
"\u{2ec2}" => "\u{8864}",
|
|
272
|
+
"\u{2ec3}" => "\u{8980}",
|
|
273
|
+
"\u{2ec4}" => "\u{897f}",
|
|
274
|
+
"\u{2ec5}" => "\u{89c1}",
|
|
275
|
+
"\u{2ec8}" => "\u{8ba0}",
|
|
276
|
+
"\u{2ec9}" => "\u{8d1d}",
|
|
277
|
+
"\u{2ecb}" => "\u{8f66}",
|
|
278
|
+
"\u{2ecd}" => "\u{8fb6}",
|
|
279
|
+
"\u{2ecf}" => "\u{961d}",
|
|
280
|
+
"\u{2ed0}" => "\u{9485}",
|
|
281
|
+
"\u{2ed1}" => "\u{9577}",
|
|
282
|
+
"\u{2ed2}" => "\u{9578}",
|
|
283
|
+
"\u{2ed3}" => "\u{957f}",
|
|
284
|
+
"\u{2ed6}" => "\u{961d}",
|
|
285
|
+
"\u{2ed8}" => "\u{9752}",
|
|
286
|
+
"\u{2ed9}" => "\u{97e6}",
|
|
287
|
+
"\u{2eda}" => "\u{9875}",
|
|
288
|
+
"\u{2edb}" => "\u{98ce}",
|
|
289
|
+
"\u{2edc}" => "\u{98de}",
|
|
290
|
+
"\u{2edd}" => "\u{98df}",
|
|
291
|
+
"\u{2edf}" => "\u{98e0}",
|
|
292
|
+
"\u{2ee0}" => "\u{9963}",
|
|
293
|
+
"\u{2ee2}" => "\u{9a6c}",
|
|
294
|
+
"\u{2ee3}" => "\u{9aa8}",
|
|
295
|
+
"\u{2ee4}" => "\u{9b3c}",
|
|
296
|
+
"\u{2ee5}" => "\u{9c7c}",
|
|
297
|
+
"\u{2ee6}" => "\u{9e1f}",
|
|
298
|
+
"\u{2ee7}" => "\u{5364}",
|
|
299
|
+
"\u{2ee8}" => "\u{9ea6}",
|
|
300
|
+
"\u{2ee9}" => "\u{9ec4}",
|
|
301
|
+
"\u{2eea}" => "\u{9efe}",
|
|
302
|
+
"\u{2eeb}" => "\u{6589}",
|
|
303
|
+
"\u{2eec}" => "\u{9f50}",
|
|
304
|
+
"\u{2eed}" => "\u{6b6f}",
|
|
305
|
+
"\u{2eee}" => "\u{9f7f}",
|
|
306
|
+
"\u{2eef}" => "\u{7adc}",
|
|
307
|
+
"\u{2ef0}" => "\u{9f99}",
|
|
308
|
+
"\u{2ef1}" => "\u{9f9c}",
|
|
309
|
+
"\u{2ef2}" => "\u{4e80}",
|
|
310
|
+
"\u{2ef3}" => "\u{9f9f}"
|
|
311
|
+
}.freeze
|
|
312
|
+
|
|
313
|
+
# Transliterator for radicals
|
|
314
|
+
class Transliterator < Yosina::BaseTransliterator
|
|
315
|
+
# Build a radicals transliterator.
#
# @param _options [Hash] Accepted so every transliterator can be built the
#   same way; no key is read here
def initialize(_options = {})
  # Explicit parentheses: call the parent initializer without arguments.
  super()
end
|
|
322
|
+
|
|
323
|
+
# Replace every character found in RADICALS_MAPPINGS with its mapped
# ideograph and renumber the offsets of the resulting sequence.
#
# Characters without a mapping are kept (with a fresh offset). A mapping to
# an empty string removes the character from the output.
#
# @param input_chars [Enumerable<Char>] The characters to transliterate
# @return [Enumerable<Char>] The transliterated characters, extended with
#   Yosina::Chars
def call(input_chars)
  position = 0

  converted = input_chars.filter_map do |char|
    mapped = RADICALS_MAPPINGS[char.c]
    # An empty mapping means "drop this character".
    next if mapped&.empty?

    emitted = mapped ? Char.new(c: mapped, offset: position, source: char) : char.with_offset(position)
    position += emitted.c.length
    emitted
  end

  # Mix Yosina::Chars into this one result object only.
  converted.singleton_class.include(Yosina::Chars)
  converted
end
|
|
350
|
+
end
|
|
351
|
+
|
|
352
|
+
# Factory entry point: builds a radicals transliterator.
#
# @param options [Hash] Configuration options, handed to Transliterator.new as-is
# @return [Transliterator] A new radicals transliterator instance
def self.call(options = {})
  Transliterator.new(options)
end
|
|
359
|
+
end
|
|
360
|
+
end
|
|
361
|
+
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
|
|
4
|
+
module Transliterators
|
|
5
|
+
# Replace various space characters with plain whitespace
|
|
6
|
+
module Spaces
|
|
7
|
+
# Generated mapping data from spaces.json
|
|
8
|
+
SPACES_MAPPINGS = {
|
|
9
|
+
"\u{a0}" => ' ',
|
|
10
|
+
"\u{180e}" => '',
|
|
11
|
+
"\u{2000}" => ' ',
|
|
12
|
+
"\u{2001}" => ' ',
|
|
13
|
+
"\u{2002}" => ' ',
|
|
14
|
+
"\u{2003}" => ' ',
|
|
15
|
+
"\u{2004}" => ' ',
|
|
16
|
+
"\u{2005}" => ' ',
|
|
17
|
+
"\u{2006}" => ' ',
|
|
18
|
+
"\u{2007}" => ' ',
|
|
19
|
+
"\u{2008}" => ' ',
|
|
20
|
+
"\u{2009}" => ' ',
|
|
21
|
+
"\u{200a}" => ' ',
|
|
22
|
+
"\u{200b}" => ' ',
|
|
23
|
+
"\u{202f}" => ' ',
|
|
24
|
+
"\u{205f}" => ' ',
|
|
25
|
+
"\u{3000}" => ' ',
|
|
26
|
+
"\u{3164}" => ' ',
|
|
27
|
+
"\u{ffa0}" => ' ',
|
|
28
|
+
"\u{feff}" => ''
|
|
29
|
+
}.freeze
|
|
30
|
+
|
|
31
|
+
# Transliterator for spaces
|
|
32
|
+
class Transliterator < Yosina::BaseTransliterator
|
|
33
|
+
# Build a spaces transliterator.
#
# @param _options [Hash] Accepted so every transliterator can be built the
#   same way; no key is read here
def initialize(_options = {})
  # Explicit parentheses: call the parent initializer without arguments.
  super()
end
|
|
40
|
+
|
|
41
|
+
# Replace every character found in SPACES_MAPPINGS with its mapped value and
# renumber the offsets of the resulting sequence.
#
# Characters without a mapping are kept (with a fresh offset). A mapping to
# an empty string removes the character from the output.
#
# @param input_chars [Enumerable<Char>] The characters to transliterate
# @return [Enumerable<Char>] The transliterated characters, extended with
#   Yosina::Chars
def call(input_chars)
  position = 0

  converted = input_chars.filter_map do |char|
    mapped = SPACES_MAPPINGS[char.c]
    # An empty mapping means "drop this character".
    next if mapped&.empty?

    emitted = mapped ? Char.new(c: mapped, offset: position, source: char) : char.with_offset(position)
    position += emitted.c.length
    emitted
  end

  # Mix Yosina::Chars into this one result object only.
  converted.singleton_class.include(Yosina::Chars)
  converted
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Factory entry point: builds a spaces transliterator.
#
# @param options [Hash] Configuration options, handed to Transliterator.new as-is
# @return [Transliterator] A new spaces transliterator instance
def self.call(options = {})
  Transliterator.new(options)
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|