yosina 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 79e70c5202973c9c7c6e2c5b5de4538422eb9323071dbaabc6563be241129a44
4
- data.tar.gz: bf615675a3b77c5b330ac5da9132e2df90d92911d3763ae8c3bcc612aa894289
3
+ metadata.gz: e0feecfd822d02f104629ea622864a55c77771f2af69d54f19a564a06e45d6cf
4
+ data.tar.gz: 04fba6b2b2db15a96f9574771fbe8e4c71debacfa62b9d7480ed178ee96c3868
5
5
  SHA512:
6
- metadata.gz: 9acbf40e48148bd8cd6db88b02df70364a840c5b8a283df61256e838db9990bff8b73b1a30e95e892c20cbe5542a8d21d870284a7779cbc833043fd3aef612b7
7
- data.tar.gz: 64e0f480547d54d14318f677ac459905209b6365096623cfe849a21740a1150a3c263c950b37f62dd30b82dce55b1a2e89e2c5def130736a1ac4b44d1c21acc4
6
+ metadata.gz: 0aff88194c04e9316f69925fb4e6499d6e5dc7ae1c168136fdbc41ea01e38b2760cb9e2e2315879eb60eed18396f8405a2ac800e067469a1e9f6f015acc44010
7
+ data.tar.gz: 9212dc8a91402ef11255a33303a09a371a0a5ec15ec1d6f4d2b86f3f4afbfa62255ace9ee649e29425890c1c6df0085356680408c6faab4968c57f4daaf32275
data/README.md CHANGED
@@ -192,6 +192,10 @@ Converts CJK radical characters to their corresponding ideographs.
192
192
  Normalizes various Unicode space characters to standard ASCII space.
193
193
  - Example: `A　B` (ideographic space, U+3000) → `A B`
194
194
 
195
+ ### 15. **Roman Numerals** (`roman-numerals`)
196
+ Converts Unicode Roman numeral characters to their ASCII letter equivalents.
197
+ - Example: `Ⅰ Ⅱ Ⅲ` → `I II III`, `ⅰ ⅱ ⅲ` → `i ii iii`
198
+
195
199
  ## Development
196
200
 
197
201
  After checking out the repo, run `bundle install` to install dependencies.
data/codegen/dataset.rb CHANGED
@@ -13,6 +13,7 @@ DatasetSourceDefs = Struct.new(
13
13
  :kanji_old_new,
14
14
  :combined,
15
15
  :circled_or_squared,
16
+ :roman_numerals,
16
17
  keyword_init: true
17
18
  )
18
19
 
@@ -27,6 +28,7 @@ Dataset = Struct.new(
27
28
  :kanji_old_new,
28
29
  :combined,
29
30
  :circled_or_squared,
31
+ :roman_numerals,
30
32
  keyword_init: true
31
33
  )
32
34
 
@@ -199,6 +201,34 @@ def load_circled_or_squared_data(filepath)
199
201
  mappings
200
202
  end
201
203
 
204
# Read the roman numerals JSON file and index it by numeral character.
#
# Each record contributes two entries: one keyed by the character for its
# upper-case code and one keyed by the character for its lower-case code.
# Both entries carry the record's value plus the decomposed characters for
# that letter case.
#
# @param filepath [String, Pathname] location of the JSON file to read
# @return [Hash{String => Hash}] numeral character => { value:, decomposed: }
def load_roman_numerals_data(filepath)
  records = JSON.parse(File.read(filepath))

  records.each_with_object({}) do |record, mappings|
    codes = record['codes']
    decomposed = record['decomposed']

    # Upper first, then lower, so entries are inserted in the same order
    # for every record.
    %w[upper lower].each do |letter_case|
      numeral = unicode_to_char(codes[letter_case])
      mappings[numeral] = {
        value: record['value'],
        decomposed: decomposed[letter_case].map { |cp| unicode_to_char(cp) }
      }
    end
  end
end
231
+
202
232
  # Build dataset from data root directory
203
233
  def build_dataset_from_data_root(data_root, defs)
204
234
  Dataset.new(
@@ -210,6 +240,7 @@ def build_dataset_from_data_root(data_root, defs)
210
240
  ivs_svs_base: load_ivs_svs_base_data(data_root / defs.ivs_svs_base),
211
241
  kanji_old_new: load_kanji_old_new_data(data_root / defs.kanji_old_new),
212
242
  combined: load_combined_data(data_root / defs.combined),
213
- circled_or_squared: load_circled_or_squared_data(data_root / defs.circled_or_squared)
243
+ circled_or_squared: load_circled_or_squared_data(data_root / defs.circled_or_squared),
244
+ roman_numerals: load_roman_numerals_data(data_root / defs.roman_numerals)
214
245
  )
215
246
  end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'utils'
4
+
5
# Renders the Ruby source text for the generated roman numerals data module.
#
# For every (key, record) pair in `mappings`, one `key => [chars...]` hash
# entry is built from record[:decomposed]; the entries are joined with ",\n"
# and interpolated into the heredoc template below, which declares
# Yosina::Transliterators::RomanNumerals::ROMAN_NUMERAL_MAPPINGS.
#
# `to_string_literal` and `dedent` are defined outside this block
# (presumably in the required 'utils' file -- confirm).
#
# @param mappings [Hash] numeral character => record that responds to [:decomposed]
# @return the value returned by `dedent 4, <template>` (the rendered source)
+ # Render a roman numerals transliterator
6
+ def render_roman_numerals_transliterator_data(mappings)
7
+ # Generate mapping entries for roman numerals -> decomposed arrays
8
+ mapping_entries = mappings.map do |key, record|
9
+ decomposed_array = record[:decomposed]
10
+ value_repr = "[#{decomposed_array.map { |c| to_string_literal(c) }.join(', ')}]"
11
+ " #{to_string_literal(key)} => #{value_repr}"
12
+ end.join(",\n")
13
+
14
+ dedent 4, <<~RUBY
15
+ # frozen_string_literal: true
16
+
17
+ module Yosina
18
+ module Transliterators
19
+ # Replace roman numeral characters with their ASCII letter equivalents
20
+ module RomanNumerals
21
+ # Generated mapping data from roman-numerals.json
22
+ ROMAN_NUMERAL_MAPPINGS = {
23
+ #{mapping_entries}
24
+ }.freeze
25
+ end
26
+ end
27
+ end
28
+ RUBY
29
+ end
data/codegen/emitters.rb CHANGED
@@ -6,3 +6,4 @@ require_relative 'emitters/hyphens_transliterator_data'
6
6
  require_relative 'emitters/ivs_svs_base_transliterator_data'
7
7
  require_relative 'emitters/combined_transliterator_data'
8
8
  require_relative 'emitters/circled_or_squared_transliterator_data'
9
+ require_relative 'emitters/roman_numerals_transliterator_data'
data/codegen/main.rb CHANGED
@@ -31,7 +31,8 @@ def main
31
31
  ivs_svs_base: 'ivs-svs-base-mappings.json',
32
32
  kanji_old_new: 'kanji-old-new-form.json',
33
33
  combined: 'combined-chars.json',
34
- circled_or_squared: 'circled-or-squared.json'
34
+ circled_or_squared: 'circled-or-squared.json',
35
+ roman_numerals: 'roman-numerals.json'
35
36
  )
36
37
 
37
38
  # Load the dataset
@@ -98,6 +99,12 @@ def main
98
99
  puts 'Generating: circled_or_squared_data.rb'
99
100
  filepath.write(output)
100
101
 
102
+ # Generate roman numerals transliterator
103
+ output = render_roman_numerals_transliterator_data(dataset.roman_numerals)
104
+ filepath = dest_root / 'roman_numerals_data.rb'
105
+ puts 'Generating: roman_numerals_data.rb'
106
+ filepath.write(output)
107
+
101
108
  puts 'Code generation complete!'
102
109
  end
103
110
 
@@ -56,7 +56,7 @@ module Yosina
56
56
  :replace_suspicious_hyphens_to_prolonged_sound_marks,
57
57
  :replace_combined_characters, :replace_circled_or_squared_characters,
58
58
  :replace_ideographic_annotations, :replace_radicals, :replace_spaces,
59
- :replace_hyphens, :replace_mathematical_alphanumerics,
59
+ :replace_hyphens, :replace_mathematical_alphanumerics, :replace_roman_numerals,
60
60
  :combine_decomposed_hiraganas_and_katakanas, :to_fullwidth, :to_halfwidth,
61
61
  :remove_ivs_svs, :charset
62
62
 
@@ -123,6 +123,12 @@ module Yosina
123
123
  # # Output: "ABC"
124
124
  # # Input: "𝟏𝟐𝟑" (mathematical bold digits)
125
125
  # # Output: "123"
126
+ # @param replace_roman_numerals [Boolean] Replace roman numeral characters
127
+ # @example
128
+ # # Input: "Ⅲ" (Roman numeral III)
129
+ # # Output: "III"
130
+ # # Input: "ⅻ" (Roman numeral xii)
131
+ # # Output: "xii"
126
132
  # @param combine_decomposed_hiraganas_and_katakanas [Boolean] Combine decomposed hiraganas/katakanas
127
133
  # @example
128
134
  # # Input: "が" (か + ゙)
@@ -154,7 +160,7 @@ module Yosina
154
160
  replace_combined_characters: false, replace_circled_or_squared_characters: false,
155
161
  replace_ideographic_annotations: false, replace_radicals: false,
156
162
  replace_spaces: false, replace_hyphens: false,
157
- replace_mathematical_alphanumerics: false,
163
+ replace_mathematical_alphanumerics: false, replace_roman_numerals: false,
158
164
  combine_decomposed_hiraganas_and_katakanas: false,
159
165
  to_fullwidth: false, to_halfwidth: false, remove_ivs_svs: false,
160
166
  charset: 'unijis_2004')
@@ -169,6 +175,7 @@ module Yosina
169
175
  @replace_spaces = replace_spaces
170
176
  @replace_hyphens = replace_hyphens
171
177
  @replace_mathematical_alphanumerics = replace_mathematical_alphanumerics
178
+ @replace_roman_numerals = replace_roman_numerals
172
179
  @combine_decomposed_hiraganas_and_katakanas = combine_decomposed_hiraganas_and_katakanas
173
180
  @to_fullwidth = to_fullwidth
174
181
  @to_halfwidth = to_halfwidth
@@ -200,6 +207,7 @@ module Yosina
200
207
  ctx = apply_replace_spaces(ctx)
201
208
  ctx = apply_replace_hyphens(ctx)
202
209
  ctx = apply_replace_mathematical_alphanumerics(ctx)
210
+ ctx = apply_replace_roman_numerals(ctx)
203
211
  ctx = apply_combine_decomposed_hiraganas_and_katakanas(ctx)
204
212
  ctx = apply_to_fullwidth(ctx)
205
213
  ctx = apply_hira_kata(ctx)
@@ -233,7 +241,7 @@ module Yosina
233
241
 
234
242
  def apply_hira_kata(ctx)
235
243
  if @hira_kata
236
- ctx.insert_middle([:hira_kata, { mode: @hira_kata }])
244
+ ctx.insert_tail([:hira_kata, { mode: @hira_kata }])
237
245
  else
238
246
  ctx
239
247
  end
@@ -313,6 +321,14 @@ module Yosina
313
321
  end
314
322
  end
315
323
 
324
# Add the roman numerals transliterator to the recipe context when the
# replace_roman_numerals option is set; otherwise hand the context back
# untouched.
def apply_replace_roman_numerals(ctx)
  return ctx unless @replace_roman_numerals

  ctx.insert_middle([:roman_numerals, {}])
end
331
+
316
332
  def apply_combine_decomposed_hiraganas_and_katakanas(ctx)
317
333
  if @combine_decomposed_hiraganas_and_katakanas
318
334
  ctx.insert_head([:hira_kata_composition, { compose_non_combining_marks: true }])
@@ -132,11 +132,13 @@ module Yosina
132
132
  "\u{2791}" => { rendering: '8', type: 'c', emoji: false },
133
133
  "\u{2792}" => { rendering: '9', type: 'c', emoji: false },
134
134
  "\u{2793}" => { rendering: '10', type: 'c', emoji: false },
135
+ "\u{3036}" => { rendering: "\u{3012}", type: 'c', emoji: false },
135
136
  "\u{3244}" => { rendering: "\u{554f}", type: 'c', emoji: false },
136
137
  "\u{3245}" => { rendering: "\u{5e7c}", type: 'c', emoji: false },
137
138
  "\u{3246}" => { rendering: "\u{6587}", type: 'c', emoji: false },
138
139
  "\u{3247}" => { rendering: "\u{7b8f}", type: 'c', emoji: false },
139
- "\u{3248}" => { rendering: '21', type: 'c', emoji: false },
140
+ "\u{3248}" => { rendering: '10', type: 'c', emoji: false },
141
+ "\u{3251}" => { rendering: '21', type: 'c', emoji: false },
140
142
  "\u{3252}" => { rendering: '22', type: 'c', emoji: false },
141
143
  "\u{3253}" => { rendering: '23', type: 'c', emoji: false },
142
144
  "\u{3254}" => { rendering: '24', type: 'c', emoji: false },
@@ -386,7 +388,7 @@ module Yosina
386
388
  "\u{1f1aa}" => { rendering: 'SHV', type: 's', emoji: false },
387
389
  "\u{1f1ab}" => { rendering: 'UHD', type: 's', emoji: false },
388
390
  "\u{1f1ac}" => { rendering: 'VOD', type: 's', emoji: false },
389
- "\u{1f1ad}" => { rendering: 'VOD', type: 's', emoji: false },
391
+ "\u{1f1ad}" => { rendering: 'M', type: 'c', emoji: false },
390
392
  "\u{1f1e6}" => { rendering: 'A', type: 's', emoji: false },
391
393
  "\u{1f1e7}" => { rendering: 'B', type: 's', emoji: false },
392
394
  "\u{1f1e8}" => { rendering: 'C', type: 's', emoji: false },
@@ -27,6 +27,8 @@ module Yosina
27
27
  "\u{2014}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2014}", jisx0208_90_windows: "\u{2015}"),
28
28
  "\u{2015}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
29
29
  "\u{2016}" => HyphensRecord.new(jisx0208_90: "\u{2016}", jisx0208_90_windows: "\u{2225}"),
30
+ "\u{2032}" => HyphensRecord.new(ascii: "\'", jisx0201: "\'", jisx0208_90: "\u{2032}", jisx0208_90_windows: "\u{2032}"),
31
+ "\u{2033}" => HyphensRecord.new(ascii: "\"", jisx0201: "\"", jisx0208_90: "\u{2033}", jisx0208_90_windows: "\u{2033}"),
30
32
  "\u{203e}" => HyphensRecord.new(jisx0201: '~', jisx0208_90: "\u{ffe3}", jisx0208_90_windows: "\u{ffe3}"),
31
33
  "\u{2043}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
32
34
  "\u{2053}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{301c}"),
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'roman_numerals_data'
4
+
5
module Yosina
  module Transliterators
    # Replace roman numeral characters with their ASCII letter equivalents
    module RomanNumerals
      # Transliterator that expands every character found in
      # ROMAN_NUMERAL_MAPPINGS into its ASCII letters and passes all other
      # characters through unchanged.
      class Transliterator < Yosina::BaseTransliterator
        # Initialize the transliterator
        #
        # @param _options [Hash, nil] accepted so the factory can pass its
        #   options through; not used
        def initialize(_options = nil)
          super()
        end

        # Convert roman numeral characters to ASCII equivalents
        #
        # Each emitted Char keeps the input char it was derived from as its
        # source. Offsets are renumbered from zero, advancing by the length
        # of every emitted string.
        #
        # @param input_chars [Enumerable<Char>] The characters to transliterate
        # @return [Enumerable<Char>] The transliterated characters
        def call(input_chars)
          Chars.enum do |y|
            # Keep the counter local to the block so a previous run cannot
            # leak its final offset into the next one.
            # NOTE(review): assumes Chars.enum may execute this block once
            # per enumeration (as Enumerator.new does) -- confirm.
            offset = 0

            input_chars.each do |char|
              replacement = RomanNumerals::ROMAN_NUMERAL_MAPPINGS[char.c]
              if replacement
                # One output Char per ASCII letter, all pointing back at the
                # same source char.
                replacement.each do |c|
                  y << Char.new(c: c, offset: offset, source: char)
                  offset += c.length
                end
              else
                y << Char.new(c: char.c, offset: offset, source: char)
                offset += char.c.length
              end
            end
          end
        end
      end

      # Factory method to create a transliterator
      #
      # @param options [Hash] Configuration options (currently unused)
      # @return [Transliterator] A new transliterator instance
      def self.call(options = {})
        Transliterator.new(options)
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
module Yosina
  module Transliterators
    # Replace roman numeral characters with their ASCII letter equivalents
    module RomanNumerals
      # Generated mapping data from roman-numerals.json
      #
      # Keys are the numeral characters U+2160..U+217F; each value lists the
      # ASCII letters that spell the numeral, one letter per element.
      # Entries come in pairs: the upper-case form, then the lower-case form.
      ROMAN_NUMERAL_MAPPINGS = {
        "\u2160" => ['I'],                "\u2170" => ['i'],
        "\u2161" => ['I', 'I'],           "\u2171" => ['i', 'i'],
        "\u2162" => ['I', 'I', 'I'],      "\u2172" => ['i', 'i', 'i'],
        "\u2163" => ['I', 'V'],           "\u2173" => ['i', 'v'],
        "\u2164" => ['V'],                "\u2174" => ['v'],
        "\u2165" => ['V', 'I'],           "\u2175" => ['v', 'i'],
        "\u2166" => ['V', 'I', 'I'],      "\u2176" => ['v', 'i', 'i'],
        "\u2167" => ['V', 'I', 'I', 'I'], "\u2177" => ['v', 'i', 'i', 'i'],
        "\u2168" => ['I', 'X'],           "\u2178" => ['i', 'x'],
        "\u2169" => ['X'],                "\u2179" => ['x'],
        "\u216a" => ['X', 'I'],           "\u217a" => ['x', 'i'],
        "\u216b" => ['X', 'I', 'I'],      "\u217b" => ['x', 'i', 'i'],
        "\u216c" => ['L'],                "\u217c" => ['l'],
        "\u216d" => ['C'],                "\u217d" => ['c'],
        "\u216e" => ['D'],                "\u217e" => ['d'],
        "\u216f" => ['M'],                "\u217f" => ['m']
      }.freeze
    end
  end
end
@@ -3,6 +3,7 @@
3
3
  require_relative 'transliterators/spaces'
4
4
  require_relative 'transliterators/kanji_old_new'
5
5
  require_relative 'transliterators/radicals'
6
+ require_relative 'transliterators/roman_numerals'
6
7
  require_relative 'transliterators/ideographic_annotations'
7
8
  require_relative 'transliterators/mathematical_alphanumerics'
8
9
  require_relative 'transliterators/prolonged_sound_marks'
@@ -22,6 +23,7 @@ module Yosina
22
23
  spaces: Transliterators::Spaces,
23
24
  kanji_old_new: Transliterators::KanjiOldNew,
24
25
  radicals: Transliterators::Radicals,
26
+ roman_numerals: Transliterators::RomanNumerals,
25
27
  ideographic_annotations: Transliterators::IdeographicAnnotations,
26
28
  mathematical_alphanumerics: Transliterators::MathematicalAlphanumerics,
27
29
  prolonged_sound_marks: Transliterators::ProlongedSoundMarks,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Yosina
4
- VERSION = '0.1.0'
4
+ VERSION = '1.0.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yosina
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Moriyoshi Koizumi
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-08-19 00:00:00.000000000 Z
11
+ date: 2025-09-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -99,6 +99,7 @@ files:
99
99
  - codegen/emitters/combined_transliterator_data.rb
100
100
  - codegen/emitters/hyphens_transliterator_data.rb
101
101
  - codegen/emitters/ivs_svs_base_transliterator_data.rb
102
+ - codegen/emitters/roman_numerals_transliterator_data.rb
102
103
  - codegen/emitters/simple_transliterator.rb
103
104
  - codegen/emitters/utils.rb
104
105
  - codegen/main.rb
@@ -126,6 +127,8 @@ files:
126
127
  - lib/yosina/transliterators/mathematical_alphanumerics.rb
127
128
  - lib/yosina/transliterators/prolonged_sound_marks.rb
128
129
  - lib/yosina/transliterators/radicals.rb
130
+ - lib/yosina/transliterators/roman_numerals.rb
131
+ - lib/yosina/transliterators/roman_numerals_data.rb
129
132
  - lib/yosina/transliterators/spaces.rb
130
133
  - lib/yosina/version.rb
131
134
  - yosina.gemspec