yosina 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +36 -0
- data/Gemfile +6 -0
- data/README.ja.md +229 -0
- data/README.md +229 -0
- data/Rakefile +30 -0
- data/codegen/dataset.rb +215 -0
- data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
- data/codegen/emitters/combined_transliterator_data.rb +28 -0
- data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
- data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
- data/codegen/emitters/simple_transliterator.rb +76 -0
- data/codegen/emitters/utils.rb +45 -0
- data/codegen/emitters.rb +8 -0
- data/codegen/main.rb +109 -0
- data/lib/yosina/char.rb +65 -0
- data/lib/yosina/chars.rb +152 -0
- data/lib/yosina/recipes.rb +359 -0
- data/lib/yosina/transliterator.rb +49 -0
- data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
- data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
- data/lib/yosina/transliterators/combined.rb +52 -0
- data/lib/yosina/transliterators/combined_data.rb +495 -0
- data/lib/yosina/transliterators/hira_kata.rb +106 -0
- data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
- data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
- data/lib/yosina/transliterators/hyphens.rb +83 -0
- data/lib/yosina/transliterators/hyphens_data.rb +60 -0
- data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
- data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
- data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
- data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
- data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
- data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
- data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
- data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
- data/lib/yosina/transliterators/radicals.rb +361 -0
- data/lib/yosina/transliterators/spaces.rb +79 -0
- data/lib/yosina/transliterators.rb +57 -0
- data/lib/yosina/version.rb +5 -0
- data/lib/yosina.rb +62 -0
- data/yosina.gemspec +41 -0
- metadata +159 -0
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../transliterator'
|
|
4
|
+
require_relative 'hira_kata_table'
|
|
5
|
+
|
|
6
|
+
module Yosina
|
|
7
|
+
module Transliterators
|
|
8
|
+
# JIS X 0201 and alike transliterator for fullwidth/halfwidth conversion
|
|
9
|
+
module Jisx0201AndAlike
|
|
10
|
+
include HiraKataTable
|
|
11
|
+
|
|
12
|
+
# GL area mapping table: [fullwidth, halfwidth] pairs.
#
# U+3000 (ideographic space) pairs with U+0020; every code point in
# U+FF01..U+FF5D pairs with the ASCII character 0xFEE0 below it
# (U+0021..U+007D). U+FF3C (fullwidth reverse solidus) is deliberately left
# out of the range because it is only mapped through JISX0201_GL_OVERRIDES.
JISX0201_GL_TABLE = (
  [[0x3000, 0x0020]] +
  (0xff01..0xff5d).reject { |codepoint| codepoint == 0xff3c }
                  .map { |codepoint| [codepoint, codepoint - 0xfee0] }
).map { |pair| pair.map { |codepoint| codepoint.chr(Encoding::UTF_8).freeze } }.freeze
|
|
108
|
+
|
|
109
|
+
# Option-controlled GL overrides.
#
# Keys are the transliterator option names; each value is a list of
# [fullwidth, halfwidth] pairs that take effect only while the option of the
# same name is enabled. Rows below read: option, fullwidth, halfwidth.
JISX0201_GL_OVERRIDES = [
  [:u005c_as_yen_sign,         "\uffe5", "\u005c"], # U+FFE5 fullwidth yen sign       <-> U+005C
  [:u005c_as_backslash,        "\uff3c", "\u005c"], # U+FF3C fullwidth reverse solidus <-> U+005C
  [:u007e_as_fullwidth_tilde,  "\uff5e", "\u007e"], # U+FF5E fullwidth tilde          <-> U+007E
  [:u007e_as_wave_dash,        "\u301c", "\u007e"], # U+301C wave dash                <-> U+007E
  [:u007e_as_overline,         "\u203e", "\u007e"], # U+203E overline                 <-> U+007E
  [:u007e_as_fullwidth_macron, "\uffe3", "\u007e"], # U+FFE3 fullwidth macron         <-> U+007E
  [:u00a5_as_yen_sign,         "\uffe5", "\u00a5"]  # U+FFE5 fullwidth yen sign       <-> U+00A5
].each_with_object({}) do |(option, fullwidth, halfwidth), table|
  table[option] = [[fullwidth, halfwidth]]
end.freeze
|
|
119
|
+
|
|
120
|
+
# Builds the GR-area list of [fullwidth, halfwidth] pairs.
#
# The list starts with eight fixed punctuation / sound-mark pairs, followed by
# one pair per row of HIRAGANA_KATAKANA_TABLE and HIRAGANA_KATAKANA_SMALL_TABLE
# that has a halfwidth form. Rows are destructured as
# (hiragana, katakana, halfwidth); in the main table `katakana[0]` is used as
# the fullwidth side (presumably the unvoiced form -- confirm against
# hira_kata_table), in the small table the katakana entry is used as-is.
#
# @return [Array<Array(String, String)>] a new, unfrozen array
def self.generate_gr_table
  punctuation = [
    ["\u3002", "\uff61"], # ideographic full stop      -> halfwidth
    ["\u300c", "\uff62"], # left corner bracket        -> halfwidth
    ["\u300d", "\uff63"], # right corner bracket       -> halfwidth
    ["\u3001", "\uff64"], # ideographic comma          -> halfwidth
    ["\u30fb", "\uff65"], # katakana middle dot        -> halfwidth
    ["\u30fc", "\uff70"], # prolonged sound mark       -> halfwidth
    ["\u309b", "\uff9e"], # voiced sound mark          -> halfwidth
    ["\u309c", "\uff9f"]  # semi-voiced sound mark     -> halfwidth
  ]

  main_kana = HIRAGANA_KATAKANA_TABLE
              .select { |_hiragana, _katakana, halfwidth| halfwidth }
              .map { |_hiragana, katakana, halfwidth| [katakana[0], halfwidth] }

  small_kana = HIRAGANA_KATAKANA_SMALL_TABLE
               .select { |_hiragana, _katakana, halfwidth| halfwidth }
               .map { |_hiragana, katakana, halfwidth| [katakana, halfwidth] }

  punctuation + main_kana + small_kana
end
|
|
142
|
+
|
|
143
|
+
# GR area mapping table (fullwidth to halfwidth)
|
|
144
|
+
JISX0201_GR_TABLE = generate_gr_table.freeze
|
|
145
|
+
|
|
146
|
+
# Special punctuations
|
|
147
|
+
SPECIAL_PUNCTUATIONS_TABLE = [["\u30a0", "\u003d"]].freeze # ゠ to =
|
|
148
|
+
|
|
149
|
+
# Builds [fullwidth, halfwidth] pairs for katakana that carry a sound mark.
#
# For every HIRAGANA_KATAKANA_TABLE row with a halfwidth form, `katakana[1]`
# (when present) is paired with the halfwidth form followed by U+FF9E, and
# `katakana[2]` (when present) with the halfwidth form followed by U+FF9F.
#
# @return [Array<Array(String, String)>] a new, unfrozen array
def self.generate_voiced_letters_table
  HIRAGANA_KATAKANA_TABLE.flat_map do |_hiragana, katakana, halfwidth|
    next [] unless halfwidth

    [[katakana[1], "\uff9e"], [katakana[2], "\uff9f"]]
      .select { |fullwidth, _mark| fullwidth }
      .map { |fullwidth, mark| [fullwidth, "#{halfwidth}#{mark}"] }
  end
end
|
|
160
|
+
|
|
161
|
+
# Voiced letters table
|
|
162
|
+
VOICED_LETTERS_TABLE = generate_voiced_letters_table.freeze
|
|
163
|
+
|
|
164
|
+
# Builds [hiragana, halfwidth katakana] pairs.
#
# Main-table rows contribute only when both `hiragana[0]` and a halfwidth form
# exist: the plain pair first, then `hiragana[1]` with U+FF9E appended to the
# halfwidth form and `hiragana[2]` with U+FF9F appended, each only when
# present. Small-table rows with a halfwidth form are appended afterwards.
#
# @return [Array<Array(String, String)>] a new, unfrozen array
def self.generate_hiragana_mappings
  main_kana = HIRAGANA_KATAKANA_TABLE.flat_map do |hiragana, _katakana, halfwidth|
    next [] unless hiragana[0] && halfwidth

    marked = [[hiragana[1], "\uff9e"], [hiragana[2], "\uff9f"]]
             .select { |kana, _mark| kana }
             .map { |kana, mark| [kana, "#{halfwidth}#{mark}"] }
    [[hiragana[0], halfwidth]] + marked
  end

  small_kana = HIRAGANA_KATAKANA_SMALL_TABLE
               .select { |_hiragana, _katakana, halfwidth| halfwidth }
               .map { |hiragana, _katakana, halfwidth| [hiragana, halfwidth] }

  main_kana + small_kana
end
|
|
181
|
+
|
|
182
|
+
# Hiragana mappings
|
|
183
|
+
HIRAGANA_MAPPINGS = generate_hiragana_mappings.freeze
|
|
184
|
+
|
|
185
|
+
# Transliterator for JIS X 0201 and alike
|
|
186
|
+
class Transliterator < Yosina::BaseTransliterator
|
|
187
|
+
attr_reader :fullwidth_to_halfwidth, :convert_gl, :convert_gr, :convert_unsafe_specials,
|
|
188
|
+
:convert_hiraganas, :combine_voiced_sound_marks,
|
|
189
|
+
:u005c_as_yen_sign, :u005c_as_backslash,
|
|
190
|
+
:u007e_as_fullwidth_tilde, :u007e_as_wave_dash,
|
|
191
|
+
:u007e_as_overline, :u007e_as_fullwidth_macron,
|
|
192
|
+
:u00a5_as_yen_sign
|
|
193
|
+
|
|
194
|
+
# Builds a transliterator from an options hash, validates the option
# combination and precomputes the lookup tables.
#
# Several defaults depend on the direction (:fullwidth_to_halfwidth).
#
# @param options [Hash] Configuration options
# @option options [Boolean] :fullwidth_to_halfwidth Direction; true converts
#   fullwidth to halfwidth, false the reverse (default: true)
# @option options [Boolean] :convert_gl Use the GL (ASCII range) tables (default: true)
# @option options [Boolean] :convert_gr Use the GR (halfwidth kana) tables (default: true)
# @option options [Boolean] :convert_unsafe_specials Include SPECIAL_PUNCTUATIONS_TABLE
#   (default: true forward, false reverse)
# @option options [Boolean] :convert_hiraganas Also map hiragana (default: false)
# @option options [Boolean] :combine_voiced_sound_marks Combine a halfwidth kana with a
#   following sound mark in the reverse direction (default: true)
# @option options [Boolean] :u005c_as_yen_sign Pair U+005C with U+FFE5 (default: true unless
#   :u00a5_as_yen_sign [forward] / :u005c_as_backslash [reverse] is given)
# @option options [Boolean] :u005c_as_backslash Pair U+005C with U+FF3C (default: false)
# @option options [Boolean] :u007e_as_fullwidth_tilde Pair U+007E with U+FF5E (default: true
#   forward; reverse: true unless one of the other :u007e_* options is given)
# @option options [Boolean] :u007e_as_wave_dash Pair U+007E with U+301C
#   (default: true forward, false reverse)
# @option options [Boolean] :u007e_as_overline Pair U+007E with U+203E (default: false)
# @option options [Boolean] :u007e_as_fullwidth_macron Pair U+007E with U+FFE3 (default: false)
# @option options [Boolean] :u00a5_as_yen_sign Pair U+00A5 with U+FFE5
#   (default: false forward, true reverse)
# @raise [ArgumentError] if validate_options! rejects the combination
def initialize(options = {})
  super()
  @fullwidth_to_halfwidth = options.fetch(:fullwidth_to_halfwidth, true)
  @convert_gl = options.fetch(:convert_gl, true)
  @convert_gr = options.fetch(:convert_gr, true)
  @convert_hiraganas = options.fetch(:convert_hiraganas, false)
  @combine_voiced_sound_marks = options.fetch(:combine_voiced_sound_marks, true)

  # Strict boolean so direction-dependent defaults are exactly true/false.
  forward = @fullwidth_to_halfwidth ? true : false

  @convert_unsafe_specials = options.fetch(:convert_unsafe_specials, forward)

  # The yen-sign reading of U+005C is the default only while its
  # direction-specific rival option has not been mentioned at all.
  yen_rival = forward ? :u00a5_as_yen_sign : :u005c_as_backslash
  @u005c_as_yen_sign = options.fetch(:u005c_as_yen_sign) { !options.key?(yen_rival) }
  @u005c_as_backslash = options.fetch(:u005c_as_backslash, false)

  tilde_rivals = %i[u007e_as_wave_dash u007e_as_overline u007e_as_fullwidth_macron]
  @u007e_as_fullwidth_tilde = options.fetch(:u007e_as_fullwidth_tilde) do
    forward || tilde_rivals.none? { |rival| options.key?(rival) }
  end
  @u007e_as_wave_dash = options.fetch(:u007e_as_wave_dash, forward)
  @u007e_as_overline = options.fetch(:u007e_as_overline, false)
  @u007e_as_fullwidth_macron = options.fetch(:u007e_as_fullwidth_macron, false)
  @u00a5_as_yen_sign = options.fetch(:u00a5_as_yen_sign, !forward)

  validate_options!
  build_mappings!
end
|
|
246
|
+
|
|
247
|
+
# Transliterates a character stream in the direction chosen at construction.
#
# @param input_chars [Enumerable<Char>] The characters to transliterate
# @return [Enumerable<Char>] The transliterated characters
def call(input_chars)
  return convert_fullwidth_to_halfwidth(input_chars) if @fullwidth_to_halfwidth

  convert_halfwidth_to_fullwidth(input_chars)
end
|
|
258
|
+
|
|
259
|
+
private
|
|
260
|
+
|
|
261
|
+
# Rejects option combinations that would give one source character two targets.
#
# Forward (fullwidth to halfwidth): U+FFE5 is the only fullwidth character
# claimed by two overrides in JISX0201_GL_OVERRIDES (u005c_as_yen_sign and
# u00a5_as_yen_sign), so only that pair is checked.
#
# Reverse (halfwidth to fullwidth): every enabled override is grouped by the
# halfwidth character it converts from; more than one enabled override for
# the same halfwidth character is a conflict.
#
# @raise [ArgumentError] when conflicting options are enabled together
# @return [void]
def validate_options!
  if @fullwidth_to_halfwidth
    return unless @u005c_as_yen_sign && @u00a5_as_yen_sign

    raise ArgumentError,
          'u005c_as_yen_sign and u00a5_as_yen_sign are mutually exclusive,' \
          ' and cannot be set to true at the same time.'
  end

  enabled_by_halfwidth = Hash.new { |groups, halfwidth| groups[halfwidth] = [] }
  JISX0201_GL_OVERRIDES.each do |option, pairs|
    # Override keys double as the names of the option instance variables.
    next unless instance_variable_get("@#{option}")

    # Each pair is [fullwidth, halfwidth]; the group key must be the halfwidth
    # character alone, otherwise rival options never land in the same group.
    pairs.each { |_fullwidth, halfwidth| enabled_by_halfwidth[halfwidth] << option }
  end

  enabled_by_halfwidth.each_value do |options|
    next unless options.size > 1

    *names, last = options.map(&:to_s)
    raise ArgumentError,
          "#{names.join(', ')} and #{last} are mutually exclusive," \
          ' and cannot be set to true at the same time.'
  end
end
|
|
294
|
+
|
|
295
|
+
# Precomputes the lookup tables for the configured direction.
#
# Forward only needs the forward table. Reverse needs the plain reverse table
# and, when both combine_voiced_sound_marks and convert_gr are enabled, the
# two-level table used to merge a kana with a following sound mark.
def build_mappings!
  return build_forward_mappings! if @fullwidth_to_halfwidth

  build_reverse_mappings!
  build_voiced_reverse_mappings! if @combine_voiced_sound_marks && @convert_gr
end
|
|
303
|
+
|
|
304
|
+
# Fills @fwd_mappings (fullwidth string => halfwidth string).
#
# GL section (convert_gl): base table, then enabled overrides, then the
# special punctuation table when convert_unsafe_specials is set.
# GR section (convert_gr): kana/punctuation table, voiced letters, the two
# combining sound marks, and hiragana when convert_hiraganas is set.
def build_forward_mappings!
  table = @fwd_mappings = {}

  if @convert_gl
    table.update(JISX0201_GL_TABLE.to_h)
    add_override_mappings(table, false)
    table.update(SPECIAL_PUNCTUATIONS_TABLE.to_h) if @convert_unsafe_specials
  end

  return unless @convert_gr

  table.update(JISX0201_GR_TABLE.to_h)
  table.update(VOICED_LETTERS_TABLE.to_h)

  # Combining (non-spacing) sound marks share the halfwidth mark targets.
  table["\u3099"] = "\uff9e" # combining dakuten
  table["\u309a"] = "\uff9f" # combining handakuten

  return unless @convert_hiraganas

  table.update(HIRAGANA_MAPPINGS.to_h)
end
|
|
335
|
+
|
|
336
|
+
# Fills @rev_mappings (halfwidth string => fullwidth string).
#
# Mirrors the forward builder with keys and values swapped: GL base table,
# enabled overrides and (optionally) special punctuation when convert_gl is
# set, then the GR table when convert_gr is set. Later entries replace earlier
# ones that share a halfwidth key.
def build_reverse_mappings!
  table = @rev_mappings = {}
  flip_into_table = ->(pairs) { pairs.each { |fullwidth, halfwidth| table[halfwidth] = fullwidth } }

  if @convert_gl
    flip_into_table.call(JISX0201_GL_TABLE)
    add_override_mappings(table, true)
    flip_into_table.call(SPECIAL_PUNCTUATIONS_TABLE) if @convert_unsafe_specials
  end

  flip_into_table.call(JISX0201_GR_TABLE) if @convert_gr
end
|
|
355
|
+
|
|
356
|
+
# Fills @voiced_rev_mappings, a two-level lookup used to merge a halfwidth
# kana with the sound mark that follows it:
#
#   first halfwidth char => { second halfwidth char => fullwidth string }
#
# Built from VOICED_LETTERS_TABLE, whose halfwidth side is indexed here as a
# two-character string (base followed by mark).
def build_voiced_reverse_mappings!
  @voiced_rev_mappings = VOICED_LETTERS_TABLE.each_with_object({}) do |(fullwidth, halfwidth), index|
    base = halfwidth[0]
    mark = halfwidth[1]
    (index[base] ||= {})[mark] = fullwidth
  end
end
|
|
363
|
+
|
|
364
|
+
# Copies the pairs of every enabled override into +mappings+.
#
# @param mappings [Hash] table to update in place
# @param reverse [Boolean] false stores fullwidth => halfwidth,
#   true stores halfwidth => fullwidth
def add_override_mappings(mappings, reverse)
  JISX0201_GL_OVERRIDES.each do |option, pairs|
    # An override applies only while the same-named option ivar is truthy.
    next unless instance_variable_get("@#{option}")

    pairs.each do |fullwidth, halfwidth|
      from, to = reverse ? [halfwidth, fullwidth] : [fullwidth, halfwidth]
      mappings[from] = to
    end
  end
end
|
|
377
|
+
|
|
378
|
+
# Converts a stream of fullwidth characters to halfwidth using @fwd_mappings.
#
# A mapped character may expand to several output characters (the mapped
# string is emitted one character at a time); each of them keeps the input
# character as its source. Unmapped characters pass through with only their
# offset rewritten.
#
# @param input_chars [Enumerable<Char>] characters to convert
# @return [Enumerable<Char>] whatever Chars.enum builds from the block
def convert_fullwidth_to_halfwidth(input_chars)
  Chars.enum do |y|
    # The running offset is local to one run of this block, so every pass
    # starts from 0. NOTE(review): this matters if Chars.enum evaluates the
    # block lazily / more than once -- confirm against Chars.enum.
    offset = 0
    input_chars.each do |char|
      mapped = @fwd_mappings[char.c]
      unless mapped
        y << char.with_offset(offset)
        offset += char.c.length
        next
      end

      mapped.each_char do |c|
        y << Char.new(c: c, offset: offset, source: char)
        offset += c.length
      end
    end
  end
end
|
|
394
|
+
|
|
395
|
+
# Converts a stream of halfwidth characters to fullwidth.
#
# When @voiced_rev_mappings is present, a character that can start a
# kana + sound-mark combination is held back for one step: if the next
# character completes the combination, a single combined character is emitted
# (sourced from the held character); otherwise the held character goes through
# the plain @rev_mappings path and the next character is processed normally.
# Sentinel characters are passed through with only their offset rewritten.
#
# A held character is flushed when the input ends, so the last character is
# emitted even if the stream is not terminated by a sentinel.
#
# @param input_chars [Enumerable<Char>] characters to convert
# @return [Enumerable<Char>] whatever Chars.enum builds from the block
def convert_halfwidth_to_fullwidth(input_chars)
  Chars.enum do |y|
    # State is local to one run of the block so every pass starts clean.
    offset = 0
    held = nil # combination-capable character waiting for its lookahead

    input_chars.each do |char|
      if held
        combined = @voiced_rev_mappings[held.c][char.c]
        if combined
          y << Char.new(c: combined, offset: offset, source: held)
          offset += combined.length
          held = nil
          next
        end

        # No combination: release the held character, then handle +char+.
        offset = emit_reverse_mapped(y, held, offset)
        held = nil
      end

      if char.sentinel?
        y << char.with_offset(offset)
        next
      end

      if @voiced_rev_mappings&.key?(char.c)
        held = char
        next
      end

      offset = emit_reverse_mapped(y, char, offset)
    end

    # Input ended while a character was still waiting for its lookahead.
    emit_reverse_mapped(y, held, offset) if held
  end
end

# Emits +char+ through the plain reverse table (or unchanged when unmapped)
# and returns the offset that follows the emitted character.
def emit_reverse_mapped(yielder, char, offset)
  mapped = @rev_mappings[char.c]
  if mapped
    yielder << Char.new(c: mapped, offset: offset, source: char)
    offset + mapped.length
  else
    yielder << char.with_offset(offset)
    offset + char.c.length
  end
end
|
|
440
|
+
end
|
|
441
|
+
|
|
442
|
+
# Module-level factory: builds a Transliterator from the given options.
#
# @param options [Hash] Configuration options, forwarded unchanged to
#   Transliterator.new
# @return [Transliterator] A new JIS X 0201 and alike transliterator instance
def self.call(options = {})
  Transliterator.new(options)
end
|
|
449
|
+
end
|
|
450
|
+
end
|
|
451
|
+
end
|