yosina 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +36 -0
- data/Gemfile +6 -0
- data/README.ja.md +229 -0
- data/README.md +229 -0
- data/Rakefile +30 -0
- data/codegen/dataset.rb +215 -0
- data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
- data/codegen/emitters/combined_transliterator_data.rb +28 -0
- data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
- data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
- data/codegen/emitters/simple_transliterator.rb +76 -0
- data/codegen/emitters/utils.rb +45 -0
- data/codegen/emitters.rb +8 -0
- data/codegen/main.rb +109 -0
- data/lib/yosina/char.rb +65 -0
- data/lib/yosina/chars.rb +152 -0
- data/lib/yosina/recipes.rb +359 -0
- data/lib/yosina/transliterator.rb +49 -0
- data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
- data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
- data/lib/yosina/transliterators/combined.rb +52 -0
- data/lib/yosina/transliterators/combined_data.rb +495 -0
- data/lib/yosina/transliterators/hira_kata.rb +106 -0
- data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
- data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
- data/lib/yosina/transliterators/hyphens.rb +83 -0
- data/lib/yosina/transliterators/hyphens_data.rb +60 -0
- data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
- data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
- data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
- data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
- data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
- data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
- data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
- data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
- data/lib/yosina/transliterators/radicals.rb +361 -0
- data/lib/yosina/transliterators/spaces.rb +79 -0
- data/lib/yosina/transliterators.rb +57 -0
- data/lib/yosina/version.rb +5 -0
- data/lib/yosina.rb +62 -0
- data/yosina.gemspec +41 -0
- metadata +159 -0
data/codegen/dataset.rb
ADDED
@@ -0,0 +1,215 @@
+# frozen_string_literal: true
+
+require 'json'
+
+# Dataset source definitions
+DatasetSourceDefs = Struct.new(
+  :spaces,
+  :radicals,
+  :mathematical_alphanumerics,
+  :ideographic_annotations,
+  :hyphens,
+  :ivs_svs_base,
+  :kanji_old_new,
+  :combined,
+  :circled_or_squared,
+  keyword_init: true
+)
+
+# Dataset container
+Dataset = Struct.new(
+  :spaces,
+  :radicals,
+  :mathematical_alphanumerics,
+  :ideographic_annotations,
+  :hyphens,
+  :ivs_svs_base,
+  :kanji_old_new,
+  :combined,
+  :circled_or_squared,
+  keyword_init: true
+)
+
+# Convert Unicode codepoint string to character
+def unicode_to_char(unicode_str)
+  return '' if unicode_str.nil?
+
+  # Remove U+ prefix and convert to integer
+  codepoint = unicode_str.sub(/^U\+/, '').to_i(16)
+  [codepoint].pack('U*')
+end
+
+# Load simple mapping data (key -> value or key -> null for removal)
+def load_simple_data(filepath)
+  data = JSON.parse(File.read(filepath))
+  mappings = {}
+
+  data.each do |key, value|
+    char_key = unicode_to_char(key)
+    char_value = value.nil? ? '' : unicode_to_char(value)
+    mappings[char_key] = char_value
+  end
+
+  mappings
+end
+
+# Load kanji old-new form data (array of pairs)
+def load_kanji_old_new_data(filepath)
+  data = JSON.parse(File.read(filepath))
+  mappings = {}
+
+  data.each do |pair|
+    next unless pair.is_a?(Array) && pair.length == 2
+
+    old_form = pair[0]
+    new_form = pair[1]
+
+    # Convert IVS sequences to characters
+    old_chars = []
+    new_chars = []
+
+    if old_form['ivs']
+      old_chars = old_form['ivs'].map { |cp| unicode_to_char(cp) }
+    elsif old_form['svs']
+      old_chars = old_form['svs'].map { |cp| unicode_to_char(cp) }
+    end
+
+    if new_form['ivs']
+      new_chars = new_form['ivs'].map { |cp| unicode_to_char(cp) }
+    elsif new_form['svs']
+      new_chars = new_form['svs'].map { |cp| unicode_to_char(cp) }
+    end
+
+    next if old_chars.empty? || new_chars.empty?
+
+    old_key = old_chars.join
+    new_value = new_chars.join
+
+    mappings[old_key] = new_value
+  end
+
+  mappings
+end
+
+# Load hyphens data (array of hyphen records)
+def load_hyphens_data(filepath)
+  data = JSON.parse(File.read(filepath))
+  records = []
+
+  data.each do |record|
+    hyphen_char = unicode_to_char(record['code'])
+
+    # Extract mappings from the record
+    hyphens_record = {
+      hyphen: hyphen_char,
+      ascii: nil,
+      jisx0201: nil,
+      jisx0208_90: nil,
+      jisx0208_90_windows: nil,
+      jisx0208_verbatim: nil
+    }
+
+    # Map ASCII field
+    if record['ascii'] && !record['ascii'].empty?
+      hyphens_record[:ascii] = record['ascii'].map { |cp| unicode_to_char(cp) }.join
+    end
+
+    # Map JIS X 0201 field
+    if record['jisx0201'] && !record['jisx0201'].empty?
+      hyphens_record[:jisx0201] = record['jisx0201'].map { |cp| unicode_to_char(cp) }.join
+    end
+
+    # Map JIS X 0208-1978 field (use as jisx0208_90)
+    if record['jisx0208-1978'] && !record['jisx0208-1978'].empty?
+      hyphens_record[:jisx0208_90] = record['jisx0208-1978'].map { |cp| unicode_to_char(cp) }.join
+    end
+
+    # Map JIS X 0208-1978 Windows field
+    if record['jisx0208-1978-windows'] && !record['jisx0208-1978-windows'].empty?
+      hyphens_record[:jisx0208_90_windows] = record['jisx0208-1978-windows'].map { |cp| unicode_to_char(cp) }.join
+    end
+
+    records << hyphens_record
+  end
+
+  records
+end
+
+# Load IVS/SVS base data (array of mapping records)
+def load_ivs_svs_base_data(filepath)
+  data = JSON.parse(File.read(filepath))
+  records = []
+
+  data.each do |record|
+    # Build IVS sequence
+    ivs_chars = []
+    ivs_chars = record['ivs'].map { |cp| unicode_to_char(cp) } if record['ivs'].is_a?(Array)
+
+    # Build SVS sequence
+    svs_chars = []
+    svs_chars = record['svs'].map { |cp| unicode_to_char(cp) } if record['svs'].is_a?(Array)
+
+    # Get base characters
+    base90_char = record['base90'] ? unicode_to_char(record['base90']) : nil
+    base2004_char = record['base2004'] ? unicode_to_char(record['base2004']) : nil
+
+    # Only add record if we have IVS data
+    next if ivs_chars.empty?
+
+    records << {
+      ivs: ivs_chars.join,
+      svs: svs_chars.empty? ? nil : svs_chars.join,
+      base90: base90_char,
+      base2004: base2004_char
+    }
+  end
+
+  records
+end
+
+# Load combined characters data (key -> string to be split into characters)
+def load_combined_data(filepath)
+  data = JSON.parse(File.read(filepath))
+  mappings = {}
+
+  data.each do |key, value|
+    char_key = unicode_to_char(key)
+    char_list = value.chars
+    mappings[char_key] = char_list
+  end
+
+  mappings
+end
+
+# Load circled or squared characters data
+def load_circled_or_squared_data(filepath)
+  data = JSON.parse(File.read(filepath))
+  mappings = {}
+
+  data.each do |key, record|
+    char_key = unicode_to_char(key)
+    record_data = {
+      rendering: record['rendering'],
+      type: record['type'],
+      emoji: record['emoji']
+    }
+    mappings[char_key] = record_data
+  end
+
+  mappings
+end
+
+# Build dataset from data root directory
+def build_dataset_from_data_root(data_root, defs)
+  Dataset.new(
+    spaces: load_simple_data(data_root / defs.spaces),
+    radicals: load_simple_data(data_root / defs.radicals),
+    mathematical_alphanumerics: load_simple_data(data_root / defs.mathematical_alphanumerics),
+    ideographic_annotations: load_simple_data(data_root / defs.ideographic_annotations),
+    hyphens: load_hyphens_data(data_root / defs.hyphens),
+    ivs_svs_base: load_ivs_svs_base_data(data_root / defs.ivs_svs_base),
+    kanji_old_new: load_kanji_old_new_data(data_root / defs.kanji_old_new),
+    combined: load_combined_data(data_root / defs.combined),
+    circled_or_squared: load_circled_or_squared_data(data_root / defs.circled_or_squared)
+  )
+end
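Note (not part of the package diff): a minimal sketch of what the loaders above produce, assuming the file is loaded as codegen/dataset.rb; the JSON entry shown is illustrative.

    require_relative 'codegen/dataset'

    unicode_to_char('U+3000')  # => "　" (U+3000 IDEOGRAPHIC SPACE)
    unicode_to_char(nil)       # => ""

    # Given a spaces.json body like {"U+3000": "U+0020"}, load_simple_data
    # returns {"　" => " "}; a null value becomes "" and marks the source
    # character for removal at transliteration time.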
data/codegen/emitters/circled_or_squared_transliterator_data.rb
ADDED
@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+
+require_relative 'utils'
+
+# Render a circled or squared character transliterator
+def render_circled_or_squared_transliterator_data(mappings)
+  # Generate mapping entries for circled/squared characters
+  mapping_entries = mappings.map do |key, record|
+    type_abbrev = record[:type] == 'circle' ? 'c' : 's'
+    record_repr = "{ rendering: #{to_string_literal(record[:rendering])}" \
+                  ", type: #{to_string_literal(type_abbrev)}, emoji: #{record[:emoji]} }"
+    " #{to_string_literal(key)} => #{record_repr}"
+  end.join(",\n")
+
+  dedent 4, <<~RUBY
+    # frozen_string_literal: true
+
+    module Yosina
+      module Transliterators
+        # Replace circled or squared characters with their corresponding templates
+        module CircledOrSquared
+          # Generated mapping data from circled-or-squared.json
+          CIRCLED_OR_SQUARED_MAPPINGS = {
+    #{mapping_entries}
+          }.freeze
+        end
+      end
+    end
+  RUBY
+end
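Note (not part of the package diff): a hedged sketch of the emitter above on a single illustrative record; the rendering/emoji values are assumptions, real data comes from circled-or-squared.json.

    require_relative 'codegen/emitters/circled_or_squared_transliterator_data'

    puts render_circled_or_squared_transliterator_data(
      "\u{2460}" => { rendering: '1', type: 'circle', emoji: false }
    )
    # The generated CIRCLED_OR_SQUARED_MAPPINGS entry is shaped like:
    #   "\u{2460}" => { rendering: '1', type: 'c', emoji: false }
    # 'circle' is abbreviated to 'c'; any other type becomes 's'.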
data/codegen/emitters/combined_transliterator_data.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+require_relative 'utils'
+
+# Render a combined character transliterator
+def render_combined_transliterator_data(mappings)
+  # Generate mapping entries for combined characters -> character arrays
+  mapping_entries = mappings.map do |key, value_array|
+    value_repr = "[#{value_array.map { |c| to_string_literal(c) }.join(', ')}]"
+    " #{to_string_literal(key)} => #{value_repr}"
+  end.join(",\n")
+
+  dedent 4, <<~RUBY
+    # frozen_string_literal: true
+
+    module Yosina
+      module Transliterators
+        # Replace each combined character with its corresponding individual characters
+        module Combined
+          # Generated mapping data from combined-chars.json
+          COMBINED_MAPPINGS = {
+    #{mapping_entries}
+          }.freeze
+        end
+      end
+    end
+  RUBY
+end
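Note (not part of the package diff): the same pattern for the combined-character emitter; ㍿ (U+337F) decomposing to 株式会社 is an illustrative entry, not taken from combined-chars.json.

    require_relative 'codegen/emitters/combined_transliterator_data'

    puts render_combined_transliterator_data("\u{337F}" => %w[株 式 会 社])
    # Generates a COMBINED_MAPPINGS entry shaped like:
    #   "\u{337f}" => ["\u{682a}", "\u{5f0f}", "\u{4f1a}", "\u{793e}"]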
data/codegen/emitters/hyphens_transliterator_data.rb
ADDED
@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+
+require_relative 'utils'
+
+# Render hyphens transliterator data
+def render_hyphens_transliterator_data(records)
+  # Generate records array - now using proper hyphens data structure
+  records_data = records.map do |record|
+    # Build HyphensRecord fields
+    fields = []
+    fields << "ascii: #{to_string_literal(record[:ascii])}" if record[:ascii]
+    fields << "jisx0201: #{to_string_literal(record[:jisx0201])}" if record[:jisx0201]
+    fields << "jisx0208_90: #{to_string_literal(record[:jisx0208_90])}" if record[:jisx0208_90]
+    fields << "jisx0208_90_windows: #{to_string_literal(record[:jisx0208_90_windows])}" if record[:jisx0208_90_windows]
+    fields << "jisx0208_verbatim: #{to_string_literal(record[:jisx0208_verbatim])}" if record[:jisx0208_verbatim]
+
+    record_repr = if fields.empty?
+                    'HyphensRecord.new'
+                  else
+                    "HyphensRecord.new(#{fields.join(', ')})"
+                  end
+
+    " #{to_string_literal(record[:hyphen])} => #{record_repr}"
+  end.join(",\n")
+
+  <<~RUBY
+    # frozen_string_literal: true
+
+    module Yosina
+      module Transliterators
+        # Generated hyphens data
+        module HyphensData
+          # Record for hyphen transliteration data
+          HyphensRecord = Struct.new(:ascii, :jisx0201, :jisx0208_90, :jisx0208_90_windows, :jisx0208_verbatim, keyword_init: true) do
+            def initialize(ascii: nil, jisx0201: nil, jisx0208_90: nil, jisx0208_90_windows: nil, jisx0208_verbatim: nil)
+              super
+            end
+          end
+
+          # Generated mapping data
+          HYPHENS_MAPPINGS = {
+    #{records_data}
+          }.freeze
+        end
+      end
+    end
+  RUBY
+end
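Note (not part of the package diff): a sketch of the hyphens emitter on one illustrative record for U+2010 HYPHEN (field values are assumptions; real ones come from hyphens.json). Fields left nil are omitted from the generated constructor call.

    require_relative 'codegen/emitters/hyphens_transliterator_data'

    record = { hyphen: "\u{2010}", ascii: '-', jisx0201: '-',
               jisx0208_90: nil, jisx0208_90_windows: nil, jisx0208_verbatim: nil }
    puts render_hyphens_transliterator_data([record])
    # Generates a HYPHENS_MAPPINGS entry shaped like:
    #   "\u{2010}" => HyphensRecord.new(ascii: '-', jisx0201: '-')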
data/codegen/emitters/ivs_svs_base_transliterator_data.rb
ADDED
@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+
+require_relative 'utils'
+
+# Render IVS/SVS base transliterator data
+def render_ivs_svs_base_transliterator_data(records)
+  # Build compressed data similar to Python implementation
+  compressed_parts = []
+  records.each do |record|
+    compressed_parts << (record[:ivs] || '')
+    compressed_parts << (record[:svs] || '')
+    compressed_parts << (record[:base90] || '')
+    compressed_parts << (record[:base2004] || '')
+  end
+  compressed_data = compressed_parts.join("\0")
+  compressed_data_escaped = to_string_literal(compressed_data)
+
+  dedent(4, <<~RUBY
+    # frozen_string_literal: true
+
+    module Yosina
+      module Transliterators
+        # Generated IVS/SVS base data
+        module IvsSvsBaseData
+          # Record for IVS/SVS base transliteration data
+          IvsSvsBaseRecord = Struct.new(:ivs, :svs, :base90, :base2004, keyword_init: true) do
+            def initialize(ivs:, svs: nil, base90: nil, base2004: nil)
+              super
+            end
+          end
+
+          # Compressed data table - 4 strings per record: [ivs, svs, base90, base2004, ...]
+          COMPRESSED_DATA = #{compressed_data_escaped}.freeze
+          RECORDS_COUNT = #{records.length}
+
+          # Expand compressed data into a mapping dictionary
+          def self.expand_compressed_data
+            mappings = {}
+            # Split by null bytes to get all fields
+            fields = COMPRESSED_DATA.split("\\0")
+            # Process 4 fields at a time (ivs, svs, base90, base2004)
+            (0...fields.length).step(4) do |i|
+              next unless i + 3 < fields.length
+              ivs = fields[i]
+              svs = fields[i + 1].empty? ? nil : fields[i + 1]
+              base90 = fields[i + 2].empty? ? nil : fields[i + 2]
+              base2004 = fields[i + 3].empty? ? nil : fields[i + 3]
+              # Only add if ivs is not empty
+              if !ivs.empty?
+                mappings[ivs] = IvsSvsBaseRecord.new(
+                  ivs: ivs,
+                  svs: svs,
+                  base90: base90,
+                  base2004: base2004
+                )
+              end
+            end
+            mappings
+          end
+
+          # Lazy-loaded mappings cache
+          @mappings_cache = nil
+          @base_to_variants_cache = nil
+          @variants_to_base_cache = nil
+
+          # Get the IVS/SVS mappings dictionary, loading it if necessary
+          def self.get_ivs_svs_mappings
+            @mappings_cache ||= expand_compressed_data
+          end
+
+          # Build optimized lookup tables for base-to-variants and variants-to-base mappings
+          def self.populate_lookup_tables
+            return if @base_to_variants_cache && @variants_to_base_cache
+
+            mappings = get_ivs_svs_mappings
+
+            # For base->IVS/SVS lookup (used in "ivs-or-svs" mode)
+            base_to_variants_2004 = {}
+            base_to_variants_90 = {}
+
+            # For IVS/SVS->base lookup (used in "base" mode)
+            variants_to_base = {}
+
+            mappings.each do |variant_seq, record|
+              # Map base characters to their IVS/SVS variants
+              if record.base2004 && !base_to_variants_2004.key?(record.base2004)
+                base_to_variants_2004[record.base2004] = record
+              end
+
+              if record.base90 && !base_to_variants_90.key?(record.base90)
+                base_to_variants_90[record.base90] = record
+              end
+
+              # Map IVS/SVS variants back to base characters
+              variants_to_base[variant_seq] = record
+            end
+
+            @base_to_variants_cache = {
+              'unijis_2004' => base_to_variants_2004,
+              'unijis_90' => base_to_variants_90
+            }
+            @variants_to_base_cache = variants_to_base
+          end
+
+          # Get base character to variants mapping for the specified charset
+          def self.get_base_to_variants_mappings(charset = 'unijis_2004')
+            populate_lookup_tables
+            @base_to_variants_cache[charset]
+          end
+
+          # Get variants to base character mapping
+          def self.get_variants_to_base_mappings
+            populate_lookup_tables
+            @variants_to_base_cache
+          end
+        end
+      end
+    end
+  RUBY
+  )
+end
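Note (not part of the package diff): a toy round trip of the NUL-separated, four-fields-per-record compression used above (the kanji values are illustrative). String#split drops trailing empty fields, which is presumably why the generated expander bounds-checks `i + 3` before reading a record.

    parts = ["\u{845B}\u{E0100}", '', '', "\u{845B}"]  # [ivs, svs, base90, base2004]
    compressed = parts.join("\0")
    compressed.split("\0")  # => ["\u{845B}\u{E0100}", "", "", "\u{845B}"]
    # Empty fields decode back to nil in expand_compressed_data.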
data/codegen/emitters/simple_transliterator.rb
ADDED
@@ -0,0 +1,76 @@
+# frozen_string_literal: true
+
+require_relative 'utils'
+
+# Render a simple transliterator with a mapping table
+def render_simple_transliterator(name, description, mappings)
+  class_name = name.split('_').map(&:capitalize).join
+  constant_name = "#{name.upcase}_MAPPINGS"
+
+  # Generate mapping entries
+  mapping_entries = mappings.map do |key, value|
+    " #{to_string_literal(key)} => #{to_string_literal(value)}"
+  end.join(",\n")
+
+  dedent 4, <<~RUBY
+    # frozen_string_literal: true
+
+    module Yosina
+      module Transliterators
+        # #{description}
+        module #{class_name}
+          # Generated mapping data from #{name}.json
+          #{constant_name} = {
+    #{mapping_entries}
+          }.freeze
+
+          # Transliterator for #{name}
+          class Transliterator < Yosina::BaseTransliterator
+            # Initialize the transliterator with options
+            #
+            # @param _options [Hash] Configuration options (currently unused)
+            def initialize(_options = {})
+              # Options currently unused for #{name} transliterator
+              super()
+            end
+
+            # #{description}
+            #
+            # @param input_chars [Enumerable<Char>] The characters to transliterate
+            # @return [Enumerable<Char>] The transliterated characters
+            def call(input_chars)
+              offset = 0
+
+              result = input_chars.filter_map do |char|
+                replacement = #{constant_name}[char.c]
+                c = if replacement
+                      # Skip empty replacements (character removal)
+                      next if replacement.empty?
+                      Char.new(c: replacement, offset: offset, source: char)
+                    else
+                      char.with_offset(offset)
+                    end
+                offset += c.c.length
+                c
+              end
+
+              class << result
+                include Yosina::Chars
+              end
+
+              result
+            end
+          end
+
+          # Factory method to create a #{name} transliterator
+          #
+          # @param options [Hash] Configuration options
+          # @return [Transliterator] A new #{name} transliterator instance
+          def self.call(options = {})
+            Transliterator.new(options)
+          end
+        end
+      end
+    end
+  RUBY
+end
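Note (not part of the package diff): the emitter above can be exercised directly with a one-entry table (illustrative; real tables come from the dataset JSON).

    require_relative 'codegen/emitters/simple_transliterator'

    puts render_simple_transliterator(
      'spaces',
      'Replace various space characters with plain whitespace',
      "\u{3000}" => ' '
    )
    # In the generated call(), a mapping to '' removes the character
    # (the `next if replacement.empty?` inside filter_map yields nil),
    # and offsets are recomputed from the replacement lengths.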
data/codegen/emitters/utils.rb
ADDED
@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+
+def _to_unicode_escapes_inner(str)
+  special_char_exists = false
+  [
+    str.codepoints.map do |codepoint|
+      if codepoint > 127
+        special_char_exists = true
+        # rubocop:disable Style/FormatString
+        '\\u{%x}' % codepoint
+        # rubocop:enable Style/FormatString
+      elsif [34, 39, 92].include?(codepoint)
+        special_char_exists = true
+        "\\#{codepoint.chr}"
+      else
+        codepoint.chr
+      end
+    end.join,
+    special_char_exists
+  ]
+end
+
+# Convert a string to use Unicode escape sequences for non-ASCII characters
+def to_unicode_escapes(str)
+  _to_unicode_escapes_inner(str)
+end
+
+def to_string_literal(str)
+  result, special_char_exists = _to_unicode_escapes_inner(str)
+  if special_char_exists
+    "\"#{result}\""
+  else
+    "'#{result}'"
+  end
+end
+
+def dedent(shift, lines)
+  lines.lines.map do |l|
+    if l[0, shift].each_char.all? { |c| c == ' ' }
+      l[shift..]
+    else
+      l
+    end
+  end.join
+end
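Note (not part of the package diff): concrete outputs of the helpers above.

    require_relative 'codegen/emitters/utils'

    to_string_literal('abc')       # => "'abc'" (plain ASCII stays single-quoted)
    to_string_literal("\u{3000}")  # => "\"\\u{3000}\"" (non-ASCII is escaped and
                                   #    double-quoted so \u{...} interpolates)
    dedent(2, "  a\nb\n")          # => "a\nb\n" (only lines that begin with the
                                   #    given run of spaces are shifted left)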
data/codegen/emitters.rb
ADDED
@@ -0,0 +1,8 @@
+# frozen_string_literal: true
+
+# Main emitters file that requires all individual emitter modules
+require_relative 'emitters/simple_transliterator'
+require_relative 'emitters/hyphens_transliterator_data'
+require_relative 'emitters/ivs_svs_base_transliterator_data'
+require_relative 'emitters/combined_transliterator_data'
+require_relative 'emitters/circled_or_squared_transliterator_data'
data/codegen/main.rb
ADDED
@@ -0,0 +1,109 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'json'
+require 'pathname'
+
+require_relative 'dataset'
+require_relative 'emitters'
+
+# Main code generation entry point
+def main
+  # Determine project paths
+  current_dir = Pathname(__FILE__).parent
+  project_root = current_dir.parent
+  data_root = project_root.parent / 'data'
+  dest_root = project_root / 'lib' / 'yosina' / 'transliterators'
+
+  puts "Loading dataset from: #{data_root}"
+  puts "Writing output to: #{dest_root}"
+
+  # Ensure destination directory exists
+  dest_root.mkpath
+
+  # Define dataset source definitions
+  defs = DatasetSourceDefs.new(
+    spaces: 'spaces.json',
+    radicals: 'radicals.json',
+    mathematical_alphanumerics: 'mathematical-alphanumerics.json',
+    ideographic_annotations: 'ideographic-annotation-marks.json',
+    hyphens: 'hyphens.json',
+    ivs_svs_base: 'ivs-svs-base-mappings.json',
+    kanji_old_new: 'kanji-old-new-form.json',
+    combined: 'combined-chars.json',
+    circled_or_squared: 'circled-or-squared.json'
+  )
+
+  # Load the dataset
+  dataset = build_dataset_from_data_root(data_root, defs)
+
+  # Generate simple transliterators
+  simple_transliterators = [
+    [
+      'spaces',
+      'Replace various space characters with plain whitespace',
+      dataset.spaces
+    ],
+    [
+      'radicals',
+      'Replace Kangxi radicals with equivalent CJK ideographs',
+      dataset.radicals
+    ],
+    [
+      'mathematical_alphanumerics',
+      'Replace mathematical alphanumeric symbols with plain characters',
+      dataset.mathematical_alphanumerics
+    ],
+    [
+      'ideographic_annotations',
+      'Replace ideographic annotation marks used in traditional translation',
+      dataset.ideographic_annotations
+    ],
+    [
+      'kanji_old_new',
+      'Replace old-style kanji with modern equivalents',
+      dataset.kanji_old_new
+    ]
+  ]
+
+  simple_transliterators.each do |name, description, data|
+    output = render_simple_transliterator(name, description, data)
+    filename = "#{snake_case(name)}.rb"
+    filepath = dest_root / filename
+    puts "Generating: #{filename}"
+    filepath.write(output)
+  end
+
+  # Generate hyphens data
+  output = render_hyphens_transliterator_data(dataset.hyphens)
+  filepath = dest_root / 'hyphens_data.rb'
+  puts 'Generating: hyphens_data.rb'
+  filepath.write(output)
+
+  # Generate IVS/SVS base data
+  output = render_ivs_svs_base_transliterator_data(dataset.ivs_svs_base)
+  filepath = dest_root / 'ivs_svs_base_data.rb'
+  puts 'Generating: ivs_svs_base_data.rb'
+  filepath.write(output)
+
+  # Generate combined transliterator
+  output = render_combined_transliterator_data(dataset.combined)
+  filepath = dest_root / 'combined_data.rb'
+  puts 'Generating: combined_data.rb'
+  filepath.write(output)
+
+  # Generate circled or squared transliterator
+  output = render_circled_or_squared_transliterator_data(dataset.circled_or_squared)
+  filepath = dest_root / 'circled_or_squared_data.rb'
+  puts 'Generating: circled_or_squared_data.rb'
+  filepath.write(output)
+
+  puts 'Code generation complete!'
+end
+
+# Convert camelCase to snake_case
+def snake_case(str)
+  str.gsub(/([A-Z])/, '_\1').downcase.sub(/^_/, '')
+end
+
+main if $PROGRAM_NAME == __FILE__
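Note (not part of the package diff): the generator is run as `ruby codegen/main.rb`; per the Pathname arithmetic above it reads JSON from a `data` directory next to the project root and writes into lib/yosina/transliterators. snake_case is effectively a no-op for the already-snake_case names passed to it:

    require_relative 'codegen/main'  # safe: main runs only when executed directly

    snake_case('KanjiOldNew')  # => "kanji_old_new"
    snake_case('spaces')       # => "spaces"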