yosina 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +36 -0
  3. data/Gemfile +6 -0
  4. data/README.ja.md +229 -0
  5. data/README.md +229 -0
  6. data/Rakefile +30 -0
  7. data/codegen/dataset.rb +215 -0
  8. data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
  9. data/codegen/emitters/combined_transliterator_data.rb +28 -0
  10. data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
  11. data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
  12. data/codegen/emitters/simple_transliterator.rb +76 -0
  13. data/codegen/emitters/utils.rb +45 -0
  14. data/codegen/emitters.rb +8 -0
  15. data/codegen/main.rb +109 -0
  16. data/lib/yosina/char.rb +65 -0
  17. data/lib/yosina/chars.rb +152 -0
  18. data/lib/yosina/recipes.rb +359 -0
  19. data/lib/yosina/transliterator.rb +49 -0
  20. data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
  21. data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
  22. data/lib/yosina/transliterators/combined.rb +52 -0
  23. data/lib/yosina/transliterators/combined_data.rb +495 -0
  24. data/lib/yosina/transliterators/hira_kata.rb +106 -0
  25. data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
  26. data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
  27. data/lib/yosina/transliterators/hyphens.rb +83 -0
  28. data/lib/yosina/transliterators/hyphens_data.rb +60 -0
  29. data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
  30. data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
  31. data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
  32. data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
  33. data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
  34. data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
  35. data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
  36. data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
  37. data/lib/yosina/transliterators/radicals.rb +361 -0
  38. data/lib/yosina/transliterators/spaces.rb +79 -0
  39. data/lib/yosina/transliterators.rb +57 -0
  40. data/lib/yosina/version.rb +5 -0
  41. data/lib/yosina.rb +62 -0
  42. data/yosina.gemspec +41 -0
  43. metadata +159 -0
@@ -0,0 +1,215 @@
1
# frozen_string_literal: true

require 'json'

# Both structs below carry exactly the same nine members — one per data
# file consumed by code generation — so declare the member list once.
DATASET_MEMBERS = %i[
  spaces
  radicals
  mathematical_alphanumerics
  ideographic_annotations
  hyphens
  ivs_svs_base
  kanji_old_new
  combined
  circled_or_squared
].freeze

# Dataset source definitions: file names of each JSON source, relative
# to the data root directory.
DatasetSourceDefs = Struct.new(*DATASET_MEMBERS, keyword_init: true)

# Dataset container: the parsed/loaded mapping data for each source.
Dataset = Struct.new(*DATASET_MEMBERS, keyword_init: true)
32
+
33
# Convert a Unicode codepoint notation such as "U+3042" into the
# corresponding character.
#
# @param unicode_str [String, nil] codepoint notation, optionally "U+"-prefixed
# @return [String] the single character, or '' when the input is nil
def unicode_to_char(unicode_str)
  return '' unless unicode_str

  # Strip the optional "U+" prefix, parse the hex digits, and build the
  # character from the resulting codepoint.
  unicode_str.delete_prefix('U+').to_i(16).chr(Encoding::UTF_8)
end
41
+
42
# Load a simple mapping file: each codepoint key maps to a replacement
# codepoint, or to null when the character should be removed.
#
# @param filepath [Pathname, String] path to the JSON mapping file
# @return [Hash{String => String}] character -> replacement ('' means removal)
def load_simple_data(filepath)
  parsed = JSON.parse(File.read(filepath))
  parsed.each_with_object({}) do |(from, to), acc|
    # A null value in the JSON marks the character for deletion,
    # represented here as an empty replacement string.
    acc[unicode_to_char(from)] = to.nil? ? '' : unicode_to_char(to)
  end
end
55
+
56
# Resolve one kanji form record into a character string. The record's
# codepoint list is taken from its 'ivs' key, falling back to 'svs';
# returns '' when neither key yields any characters.
def kanji_form_chars(form)
  codepoints = form['ivs'] || form['svs']
  return '' unless codepoints

  codepoints.map { |cp| unicode_to_char(cp) }.join
end

# Load kanji old-new form data: an array of [old_form, new_form] pairs,
# each form holding IVS/SVS codepoint sequences.
#
# @param filepath [Pathname, String] path to the JSON file
# @return [Hash{String => String}] old-form string -> new-form string
def load_kanji_old_new_data(filepath)
  pairs = JSON.parse(File.read(filepath))
  result = {}

  pairs.each do |pair|
    # Ignore malformed entries that are not two-element pairs.
    next unless pair.is_a?(Array) && pair.length == 2

    old_key = kanji_form_chars(pair[0])
    new_value = kanji_form_chars(pair[1])
    # Skip pairs where either side resolved to nothing.
    next if old_key.empty? || new_value.empty?

    result[old_key] = new_value
  end

  result
end
93
+
94
# Load hyphens data: an array of per-hyphen records, each mapping the
# hyphen character to its representation in several target charsets.
#
# The four charset fields were previously handled by four copy-pasted
# branches; they are now driven by a declarative field map. Note that the
# upstream JSON uses "jisx0208-1978" keys, emitted here under the
# jisx0208_90 names.
#
# @param filepath [Pathname, String] path to the JSON file
# @return [Array<Hash>] one record per hyphen, with :hyphen plus the
#   charset fields (nil when absent)
def load_hyphens_data(filepath)
  # JSON key => record field symbol for the charset mappings.
  field_map = {
    'ascii' => :ascii,
    'jisx0201' => :jisx0201,
    'jisx0208-1978' => :jisx0208_90,
    'jisx0208-1978-windows' => :jisx0208_90_windows
  }

  data = JSON.parse(File.read(filepath))

  data.map do |record|
    hyphens_record = {
      hyphen: unicode_to_char(record['code']),
      ascii: nil,
      jisx0201: nil,
      jisx0208_90: nil,
      jisx0208_90_windows: nil,
      # Never populated by this loader — the source data has no
      # corresponding key. Kept so every record has the same shape.
      jisx0208_verbatim: nil
    }

    field_map.each do |json_key, field|
      codepoints = record[json_key]
      # Only populate fields that are present and non-empty in the source.
      hyphens_record[field] = codepoints.map { |cp| unicode_to_char(cp) }.join if codepoints && !codepoints.empty?
    end

    hyphens_record
  end
end
137
+
138
# Load IVS/SVS base data: an array of records each carrying an IVS
# sequence, an optional SVS sequence, and optional base characters for
# the Unicode-90 and Unicode-2004 JIS charsets.
#
# @param filepath [Pathname, String] path to the JSON file
# @return [Array<Hash>] records with :ivs (String), :svs, :base90,
#   :base2004 (each String or nil); entries without IVS data are dropped
def load_ivs_svs_base_data(filepath)
  JSON.parse(File.read(filepath)).filter_map do |entry|
    # The IVS sequence is mandatory — skip records without one.
    ivs = entry['ivs'].is_a?(Array) ? entry['ivs'].map { |cp| unicode_to_char(cp) } : []
    next if ivs.empty?

    svs = entry['svs'].is_a?(Array) ? entry['svs'].map { |cp| unicode_to_char(cp) } : []

    {
      ivs: ivs.join,
      svs: svs.empty? ? nil : svs.join,
      base90: entry['base90'] ? unicode_to_char(entry['base90']) : nil,
      base2004: entry['base2004'] ? unicode_to_char(entry['base2004']) : nil
    }
  end
end
169
+
170
# Load combined characters data: each codepoint key maps to a string
# whose individual characters form the replacement sequence.
#
# @param filepath [Pathname, String] path to the JSON file
# @return [Hash{String => Array<String>}] character -> replacement chars
def load_combined_data(filepath)
  JSON.parse(File.read(filepath)).to_h do |key, value|
    [unicode_to_char(key), value.chars]
  end
end
183
+
184
# Load circled-or-squared characters data: each codepoint key maps to a
# record describing how the character should be rendered.
#
# @param filepath [Pathname, String] path to the JSON file
# @return [Hash{String => Hash}] character -> { rendering:, type:, emoji: }
def load_circled_or_squared_data(filepath)
  JSON.parse(File.read(filepath)).to_h do |key, rec|
    [
      unicode_to_char(key),
      { rendering: rec['rendering'], type: rec['type'], emoji: rec['emoji'] }
    ]
  end
end
201
+
202
# Build a Dataset by loading every source file defined in +defs+ from
# the given data root directory.
#
# @param data_root [Pathname] directory containing the JSON source files
# @param defs [DatasetSourceDefs] file names for each dataset member
# @return [Dataset] fully loaded dataset
def build_dataset_from_data_root(data_root, defs)
  # Four of the members share the simple key->value loader.
  simple = ->(field) { load_simple_data(data_root / defs[field]) }

  Dataset.new(
    spaces: simple[:spaces],
    radicals: simple[:radicals],
    mathematical_alphanumerics: simple[:mathematical_alphanumerics],
    ideographic_annotations: simple[:ideographic_annotations],
    hyphens: load_hyphens_data(data_root / defs.hyphens),
    ivs_svs_base: load_ivs_svs_base_data(data_root / defs.ivs_svs_base),
    kanji_old_new: load_kanji_old_new_data(data_root / defs.kanji_old_new),
    combined: load_combined_data(data_root / defs.combined),
    circled_or_squared: load_circled_or_squared_data(data_root / defs.circled_or_squared)
  )
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'utils'
4
+
5
# Render the generated Ruby source for the circled-or-squared character
# data module (circled_or_squared_data.rb).
#
# @param mappings [Hash{String => Hash}] source character => record with
#   :rendering (String), :type ('circle' or 'square'), :emoji (true/false)
# @return [String] Ruby source text
def render_circled_or_squared_transliterator_data(mappings)
  # Generate one hash-literal entry per character. :type is abbreviated
  # to 'c'/'s'; :emoji is interpolated bare, so the generated source
  # contains a true/false literal, not a string.
  mapping_entries = mappings.map do |key, record|
    type_abbrev = record[:type] == 'circle' ? 'c' : 's'
    record_repr = "{ rendering: #{to_string_literal(record[:rendering])}" \
                  ", type: #{to_string_literal(type_abbrev)}, emoji: #{record[:emoji]} }"
    # NOTE(review): the leading whitespace in this entry string (and in
    # the heredoc below) appears to have been collapsed by the rendering
    # this file was recovered from — verify the emitted layout against a
    # previously generated circled_or_squared_data.rb.
    " #{to_string_literal(key)} => #{record_repr}"
  end.join(",\n")

  dedent 4, <<~RUBY
    # frozen_string_literal: true

    module Yosina
    module Transliterators
    # Replace circled or squared characters with their corresponding templates
    module CircledOrSquared
    # Generated mapping data from circled-or-squared.json
    CIRCLED_OR_SQUARED_MAPPINGS = {
    #{mapping_entries}
    }.freeze
    end
    end
    end
  RUBY
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'utils'
4
+
5
+ # Render a combined character transliterator
6
+ def render_combined_transliterator_data(mappings)
7
+ # Generate mapping entries for combined characters -> character arrays
8
+ mapping_entries = mappings.map do |key, value_array|
9
+ value_repr = "[#{value_array.map { |c| to_string_literal(c) }.join(', ')}]"
10
+ " #{to_string_literal(key)} => #{value_repr}"
11
+ end.join(",\n")
12
+
13
+ dedent 4, <<~RUBY
14
+ # frozen_string_literal: true
15
+
16
+ module Yosina
17
+ module Transliterators
18
+ # Replace each combined character with its corresponding individual characters
19
+ module Combined
20
+ # Generated mapping data from combined-chars.json
21
+ COMBINED_MAPPINGS = {
22
+ #{mapping_entries}
23
+ }.freeze
24
+ end
25
+ end
26
+ end
27
+ RUBY
28
+ end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'utils'
4
+
5
# Render the generated Ruby source for the hyphens data module
# (hyphens_data.rb).
#
# @param records [Array<Hash>] records produced by load_hyphens_data,
#   each with :hyphen plus optional charset fields
# @return [String] Ruby source text
def render_hyphens_transliterator_data(records)
  # Generate one HyphensRecord.new(...) entry per hyphen. Only the
  # fields present on the record are emitted as keyword arguments.
  records_data = records.map do |record|
    # Build HyphensRecord fields
    fields = []
    fields << "ascii: #{to_string_literal(record[:ascii])}" if record[:ascii]
    fields << "jisx0201: #{to_string_literal(record[:jisx0201])}" if record[:jisx0201]
    fields << "jisx0208_90: #{to_string_literal(record[:jisx0208_90])}" if record[:jisx0208_90]
    fields << "jisx0208_90_windows: #{to_string_literal(record[:jisx0208_90_windows])}" if record[:jisx0208_90_windows]
    fields << "jisx0208_verbatim: #{to_string_literal(record[:jisx0208_verbatim])}" if record[:jisx0208_verbatim]

    record_repr = if fields.empty?
                    'HyphensRecord.new'
                  else
                    "HyphensRecord.new(#{fields.join(', ')})"
                  end

    # NOTE(review): leading whitespace in this entry string (and in the
    # heredoc below) appears collapsed by the source rendering — verify
    # against a previously generated hyphens_data.rb.
    " #{to_string_literal(record[:hyphen])} => #{record_repr}"
  end.join(",\n")

  <<~RUBY
    # frozen_string_literal: true

    module Yosina
    module Transliterators
    # Generated hyphens data
    module HyphensData
    # Record for hyphen transliteration data
    HyphensRecord = Struct.new(:ascii, :jisx0201, :jisx0208_90, :jisx0208_90_windows, :jisx0208_verbatim, keyword_init: true) do
    def initialize(ascii: nil, jisx0201: nil, jisx0208_90: nil, jisx0208_90_windows: nil, jisx0208_verbatim: nil)
    super
    end
    end

    # Generated mapping data
    HYPHENS_MAPPINGS = {
    #{records_data}
    }.freeze
    end
    end
    end
  RUBY
end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'utils'
4
+
5
# Render the generated Ruby source for the IVS/SVS base data module
# (ivs_svs_base_data.rb).
#
# The record table is flattened into one NUL-separated string of
# 4-tuples (ivs, svs, base90, base2004) to keep the generated file
# compact; the generated module re-expands it lazily at load time.
#
# @param records [Array<Hash>] records produced by load_ivs_svs_base_data
# @return [String] Ruby source text
def render_ivs_svs_base_transliterator_data(records)
  # Build compressed data: 4 consecutive fields per record, missing
  # fields represented as empty strings.
  compressed_parts = []
  records.each do |record|
    compressed_parts << (record[:ivs] || '')
    compressed_parts << (record[:svs] || '')
    compressed_parts << (record[:base90] || '')
    compressed_parts << (record[:base2004] || '')
  end
  compressed_data = compressed_parts.join("\0")
  # NOTE(review): compressed_data contains raw NUL separators — make
  # sure to_string_literal escapes control characters, otherwise the
  # generated source would carry literal NUL bytes.
  compressed_data_escaped = to_string_literal(compressed_data)

  # NOTE(review): the indentation inside this heredoc appears to have
  # been collapsed by the rendering this file was recovered from —
  # verify against a previously generated ivs_svs_base_data.rb.
  # The "\\0" below deliberately emits "\0" into the generated source,
  # so the generated module splits on NUL at runtime.
  dedent(4, <<~RUBY
    # frozen_string_literal: true

    module Yosina
    module Transliterators
    # Generated IVS/SVS base data
    module IvsSvsBaseData
    # Record for IVS/SVS base transliteration data
    IvsSvsBaseRecord = Struct.new(:ivs, :svs, :base90, :base2004, keyword_init: true) do
    def initialize(ivs:, svs: nil, base90: nil, base2004: nil)
    super
    end
    end

    # Compressed data table - 4 strings per record: [ivs, svs, base90, base2004, ...]
    COMPRESSED_DATA = #{compressed_data_escaped}.freeze
    RECORDS_COUNT = #{records.length}

    # Expand compressed data into a mapping dictionary
    def self.expand_compressed_data
    mappings = {}
    # Split by null bytes to get all fields
    fields = COMPRESSED_DATA.split("\\0")
    # Process 4 fields at a time (ivs, svs, base90, base2004)
    (0...fields.length).step(4) do |i|
    next unless i + 3 < fields.length
    ivs = fields[i]
    svs = fields[i + 1].empty? ? nil : fields[i + 1]
    base90 = fields[i + 2].empty? ? nil : fields[i + 2]
    base2004 = fields[i + 3].empty? ? nil : fields[i + 3]
    # Only add if ivs is not empty
    if !ivs.empty?
    mappings[ivs] = IvsSvsBaseRecord.new(
    ivs: ivs,
    svs: svs,
    base90: base90,
    base2004: base2004
    )
    end
    end
    mappings
    end

    # Lazy-loaded mappings cache
    @mappings_cache = nil
    @base_to_variants_cache = nil
    @variants_to_base_cache = nil

    # Get the IVS/SVS mappings dictionary, loading it if necessary
    def self.get_ivs_svs_mappings
    @mappings_cache ||= expand_compressed_data
    end

    # Build optimized lookup tables for base-to-variants and variants-to-base mappings
    def self.populate_lookup_tables
    return if @base_to_variants_cache && @variants_to_base_cache

    mappings = get_ivs_svs_mappings

    # For base->IVS/SVS lookup (used in "ivs-or-svs" mode)
    base_to_variants_2004 = {}
    base_to_variants_90 = {}

    # For IVS/SVS->base lookup (used in "base" mode)
    variants_to_base = {}

    mappings.each do |variant_seq, record|
    # Map base characters to their IVS/SVS variants
    if record.base2004 && !base_to_variants_2004.key?(record.base2004)
    base_to_variants_2004[record.base2004] = record
    end

    if record.base90 && !base_to_variants_90.key?(record.base90)
    base_to_variants_90[record.base90] = record
    end

    # Map IVS/SVS variants back to base characters
    variants_to_base[variant_seq] = record
    end

    @base_to_variants_cache = {
    'unijis_2004' => base_to_variants_2004,
    'unijis_90' => base_to_variants_90
    }
    @variants_to_base_cache = variants_to_base
    end

    # Get base character to variants mapping for the specified charset
    def self.get_base_to_variants_mappings(charset = 'unijis_2004')
    populate_lookup_tables
    @base_to_variants_cache[charset]
    end

    # Get variants to base character mapping
    def self.get_variants_to_base_mappings
    populate_lookup_tables
    @variants_to_base_cache
    end
    end
    end
    end
  RUBY
  )
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'utils'
4
+
5
# Render the generated Ruby source for a simple mapping-table
# transliterator (e.g. spaces.rb, radicals.rb).
#
# @param name [String] snake_case transliterator name; drives the module
#   name, constant name, and comments in the generated file
# @param description [String] one-line description used in doc comments
# @param mappings [Hash{String => String}] character -> replacement
#   ('' marks the character for removal)
# @return [String] Ruby source text
def render_simple_transliterator(name, description, mappings)
  # Derive CamelCase module name and SCREAMING_SNAKE constant name.
  class_name = name.split('_').map(&:capitalize).join
  constant_name = "#{name.upcase}_MAPPINGS"

  # Generate mapping entries.
  # NOTE(review): leading whitespace in this entry string (and in the
  # heredoc below) appears collapsed by the rendering this file was
  # recovered from — verify against a previously generated file.
  mapping_entries = mappings.map do |key, value|
    " #{to_string_literal(key)} => #{to_string_literal(value)}"
  end.join(",\n")

  dedent 4, <<~RUBY
    # frozen_string_literal: true

    module Yosina
    module Transliterators
    # #{description}
    module #{class_name}
    # Generated mapping data from #{name}.json
    #{constant_name} = {
    #{mapping_entries}
    }.freeze

    # Transliterator for #{name}
    class Transliterator < Yosina::BaseTransliterator
    # Initialize the transliterator with options
    #
    # @param _options [Hash] Configuration options (currently unused)
    def initialize(_options = {})
    # Options currently unused for #{name} transliterator
    super()
    end

    # #{description}
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The transliterated characters
    def call(input_chars)
    offset = 0

    result = input_chars.filter_map do |char|
    replacement = #{constant_name}[char.c]
    c = if replacement
    # Skip empty replacements (character removal)
    next if replacement.empty?
    Char.new(c: replacement, offset: offset, source: char)
    else
    char.with_offset(offset)
    end
    offset += c.c.length
    c
    end

    class << result
    include Yosina::Chars
    end

    result
    end
    end

    # Factory method to create a #{name} transliterator
    #
    # @param options [Hash] Configuration options
    # @return [Transliterator] A new #{name} transliterator instance
    def self.call(options = {})
    Transliterator.new(options)
    end
    end
    end
    end
  RUBY
end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
# Escape a string for embedding in generated Ruby source.
#
# Non-ASCII characters, control characters, quotes, and backslashes are
# escaped; everything else passes through verbatim. Control characters
# must never be emitted raw: MRI's parser treats a raw NUL (and ^D/^Z)
# in source text as end-of-input, which would truncate a generated file
# such as the NUL-separated IVS/SVS compressed data table.
#
# @param str [String] the raw text
# @return [Array(String, Boolean)] the escaped text, and true when the
#   text requires a double-quoted literal for the escapes to work
def _to_unicode_escapes_inner(str)
  special_char_exists = false
  escaped = str.codepoints.map do |codepoint|
    if codepoint > 127 || codepoint < 0x20 || codepoint == 0x7f
      # Non-ASCII and control characters become \u{...} escapes. The
      # evaluated string value is unchanged; only the literal's text is.
      special_char_exists = true
      # rubocop:disable Style/FormatString
      '\\u{%x}' % codepoint
      # rubocop:enable Style/FormatString
    elsif [34, 39, 92].include?(codepoint) # " ' \
      special_char_exists = true
      "\\#{codepoint.chr}"
    else
      codepoint.chr
    end
  end.join
  [escaped, special_char_exists]
end
22
+
23
# Convert a string to use Unicode escape sequences for non-ASCII
# characters.
#
# BUG FIX: this previously returned the internal [text, flag] pair from
# _to_unicode_escapes_inner rather than the converted string promised by
# its name and documentation; it now returns just the escaped text.
#
# @param str [String] the raw text
# @return [String] the text with special characters escaped
def to_unicode_escapes(str)
  _to_unicode_escapes_inner(str).first
end
27
+
28
# Render +str+ as a Ruby string literal: single-quoted when it is plain
# ASCII with nothing to escape, double-quoted when escapes are needed
# (so \u{...} and backslash escapes are interpreted).
#
# @param str [String] the raw text
# @return [String] a quoted Ruby literal evaluating to +str+
def to_string_literal(str)
  escaped, needs_double_quotes = _to_unicode_escapes_inner(str)
  quote = needs_double_quotes ? '"' : "'"
  "#{quote}#{escaped}#{quote}"
end
36
+
37
# Remove +shift+ leading spaces from each line of +lines+. A line is
# only shifted when its first +shift+ characters are all spaces;
# otherwise it is kept untouched.
#
# @param shift [Integer] number of leading spaces to strip per line
# @param lines [String] possibly multi-line text
# @return [String] the dedented text
def dedent(shift, lines)
  lines.lines.map do |line|
    leading = line[0, shift]
    leading.each_char.all? { |ch| ch == ' ' } ? line[shift..] : line
  end.join
end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Main emitters file that requires all individual emitter modules
4
+ require_relative 'emitters/simple_transliterator'
5
+ require_relative 'emitters/hyphens_transliterator_data'
6
+ require_relative 'emitters/ivs_svs_base_transliterator_data'
7
+ require_relative 'emitters/combined_transliterator_data'
8
+ require_relative 'emitters/circled_or_squared_transliterator_data'
data/codegen/main.rb ADDED
@@ -0,0 +1,109 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

require 'json'
require 'pathname'

require_relative 'dataset'
require_relative 'emitters'

# Main code generation entry point: loads the JSON datasets and writes
# the generated transliterator data files into
# lib/yosina/transliterators.
def main
  # Determine project paths relative to this script's location.
  current_dir = Pathname(__FILE__).parent
  project_root = current_dir.parent
  data_root = project_root.parent / 'data'
  dest_root = project_root / 'lib' / 'yosina' / 'transliterators'

  puts "Loading dataset from: #{data_root}"
  puts "Writing output to: #{dest_root}"

  # Ensure destination directory exists
  dest_root.mkpath

  # Dataset source file names, relative to data_root.
  defs = DatasetSourceDefs.new(
    spaces: 'spaces.json',
    radicals: 'radicals.json',
    mathematical_alphanumerics: 'mathematical-alphanumerics.json',
    ideographic_annotations: 'ideographic-annotation-marks.json',
    hyphens: 'hyphens.json',
    ivs_svs_base: 'ivs-svs-base-mappings.json',
    kanji_old_new: 'kanji-old-new-form.json',
    combined: 'combined-chars.json',
    circled_or_squared: 'circled-or-squared.json'
  )

  # Load the dataset
  dataset = build_dataset_from_data_root(data_root, defs)

  # Simple key->value transliterators: [name, description, mapping data].
  simple_transliterators = [
    [
      'spaces',
      'Replace various space characters with plain whitespace',
      dataset.spaces
    ],
    [
      'radicals',
      'Replace Kangxi radicals with equivalent CJK ideographs',
      dataset.radicals
    ],
    [
      'mathematical_alphanumerics',
      'Replace mathematical alphanumeric symbols with plain characters',
      dataset.mathematical_alphanumerics
    ],
    [
      'ideographic_annotations',
      'Replace ideographic annotation marks used in traditional translation',
      dataset.ideographic_annotations
    ],
    [
      'kanji_old_new',
      'Replace old-style kanji with modern equivalents',
      dataset.kanji_old_new
    ]
  ]

  simple_transliterators.each do |name, description, data|
    output = render_simple_transliterator(name, description, data)
    filename = "#{snake_case(name)}.rb"
    filepath = dest_root / filename
    # BUG FIX: the progress message previously printed corrupted text
    # instead of interpolating the generated file name.
    puts "Generating: #{filename}"
    filepath.write(output)
  end

  # Generate hyphens data
  output = render_hyphens_transliterator_data(dataset.hyphens)
  filepath = dest_root / 'hyphens_data.rb'
  puts 'Generating: hyphens_data.rb'
  filepath.write(output)

  # Generate IVS/SVS base data
  output = render_ivs_svs_base_transliterator_data(dataset.ivs_svs_base)
  filepath = dest_root / 'ivs_svs_base_data.rb'
  puts 'Generating: ivs_svs_base_data.rb'
  filepath.write(output)

  # Generate combined transliterator
  output = render_combined_transliterator_data(dataset.combined)
  filepath = dest_root / 'combined_data.rb'
  puts 'Generating: combined_data.rb'
  filepath.write(output)

  # Generate circled or squared transliterator
  output = render_circled_or_squared_transliterator_data(dataset.circled_or_squared)
  filepath = dest_root / 'circled_or_squared_data.rb'
  puts 'Generating: circled_or_squared_data.rb'
  filepath.write(output)

  puts 'Code generation complete!'
end
103
+
104
# Convert camelCase/CamelCase to snake_case. Strings that are already
# snake_case pass through unchanged.
#
# @param str [String] the name to convert
# @return [String] the snake_case form
def snake_case(str)
  str.gsub(/[A-Z]/) { |uc| "_#{uc.downcase}" }.delete_prefix('_')
end
108
+
109
+ main if $PROGRAM_NAME == __FILE__