yosina 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +36 -0
  3. data/Gemfile +6 -0
  4. data/README.ja.md +229 -0
  5. data/README.md +229 -0
  6. data/Rakefile +30 -0
  7. data/codegen/dataset.rb +215 -0
  8. data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
  9. data/codegen/emitters/combined_transliterator_data.rb +28 -0
  10. data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
  11. data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
  12. data/codegen/emitters/simple_transliterator.rb +76 -0
  13. data/codegen/emitters/utils.rb +45 -0
  14. data/codegen/emitters.rb +8 -0
  15. data/codegen/main.rb +109 -0
  16. data/lib/yosina/char.rb +65 -0
  17. data/lib/yosina/chars.rb +152 -0
  18. data/lib/yosina/recipes.rb +359 -0
  19. data/lib/yosina/transliterator.rb +49 -0
  20. data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
  21. data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
  22. data/lib/yosina/transliterators/combined.rb +52 -0
  23. data/lib/yosina/transliterators/combined_data.rb +495 -0
  24. data/lib/yosina/transliterators/hira_kata.rb +106 -0
  25. data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
  26. data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
  27. data/lib/yosina/transliterators/hyphens.rb +83 -0
  28. data/lib/yosina/transliterators/hyphens_data.rb +60 -0
  29. data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
  30. data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
  31. data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
  32. data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
  33. data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
  34. data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
  35. data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
  36. data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
  37. data/lib/yosina/transliterators/radicals.rb +361 -0
  38. data/lib/yosina/transliterators/spaces.rb +79 -0
  39. data/lib/yosina/transliterators.rb +57 -0
  40. data/lib/yosina/version.rb +5 -0
  41. data/lib/yosina.rb +62 -0
  42. data/yosina.gemspec +41 -0
  43. metadata +159 -0
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'hira_kata_table'
4
+
5
+ module Yosina
6
+ module Transliterators
7
# Hiragana/katakana composition transliterator.
#
# Merges a base kana (or iteration mark) that is immediately followed by a
# voiced / semi-voiced sound mark into the single precomposed character
# listed in the HiraKataTable pair tables.
module HiraKataComposition
  include HiraKataTable

  # base character => voiced form
  VOICED_SOUND_MARK_MAPPINGS = Hash[VOICED_CHARACTERS].freeze
  # base character => semi-voiced form
  SEMI_VOICED_SOUND_MARK_MAPPINGS = Hash[SEMI_VOICED_CHARACTERS].freeze

  # Combining mark mappings for hiragana and katakana
  # (mark character => table used to look up the preceding character).
  COMBINING_MARKS = {
    "\u3099" => VOICED_SOUND_MARK_MAPPINGS,
    "\u309A" => SEMI_VOICED_SOUND_MARK_MAPPINGS
  }.freeze

  # Mark mappings used when non-combining marks are composed as well:
  # the two combining marks plus U+309B / U+309C.
  NON_COMBINING_MARKS = {
    "\u3099" => VOICED_SOUND_MARK_MAPPINGS,
    "\u309A" => SEMI_VOICED_SOUND_MARK_MAPPINGS,
    "\u309B" => VOICED_SOUND_MARK_MAPPINGS,
    "\u309C" => SEMI_VOICED_SOUND_MARK_MAPPINGS
  }.freeze

  # Transliterator for hiragana/katakana composition
  class Transliterator < Yosina::BaseTransliterator
    attr_reader :compose_non_combining_marks

    # Initialize the transliterator with options
    #
    # @param options [Hash] Configuration options
    # @option options [Boolean] :compose_non_combining_marks Whether to compose non-combining
    #   marks (U+309B and U+309C) too. Defaults to false.
    def initialize(options = {})
      super()
      @compose_non_combining_marks = options[:compose_non_combining_marks] || false
      @mappings = @compose_non_combining_marks ? NON_COMBINING_MARKS : COMBINING_MARKS
    end

    # Combine decomposed hiragana and katakana characters with their marks
    #
    # One character is held back in +prev+ so that the character after it can
    # be inspected; when that next character is a mark that composes with
    # +prev+, the pair is emitted as one composed Char, otherwise +prev+ is
    # emitted unchanged. Offsets are renumbered from 0 as characters are emitted.
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The transliterated characters
    def call(input_chars)
      e = input_chars.each
      offset = 0

      Chars.enum do |y|
        # NOTE: the early exits below use `next`, not `break`. `break` inside
        # this block targets the `Chars.enum` call itself: it would either make
        # `enum` return nil or, when the block is run after `enum` has already
        # returned, raise LocalJumpError. `next` simply finishes the block.
        begin
          prev = e.next
        rescue StopIteration
          # Empty input: nothing to emit.
          next
        end

        if prev.sentinel?
          # The first element is already the sentinel: pass it through and stop.
          y << prev
          next
        end

        loop do
          begin
            char = e.next
          rescue StopIteration
            break
          end

          if prev
            # Check for combining marks
            if (mark_mapping = @mappings[char.c]) && (composed = mark_mapping[prev.c])
              # Found a composable combination
              y << Char.new(c: composed, offset: offset, source: char)
              offset += composed.length
              # Both characters are consumed; nothing is held back any more.
              prev = nil
              next
            end

            # No composition possible, keep original character
            y << prev.with_offset(offset)
            offset += prev.c.length
          end
          prev = char
        end

        # Flush the character that was still being held back.
        if prev
          y << prev.with_offset(offset)
          offset += prev.c.length
        end
      end
    end
  end

  # Factory method to create a hiragana/katakana composition transliterator
  #
  # @param options [Hash] Configuration options
  # @return [Transliterator] A new composition transliterator instance
  def self.call(options = {})
    Transliterator.new(options)
  end
end
102
+ end
103
+ end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Hiragana-Katakana mapping table
6
+ module HiraKataTable
7
+ # Main hiragana-katakana table with [hiragana, katakana, halfwidth] structure
8
+ HIRAGANA_KATAKANA_TABLE = [
9
+ # Vowels
10
+ [['あ', nil, nil], ['ア', nil, nil], 'ア'],
11
+ [['い', nil, nil], ['イ', nil, nil], 'イ'],
12
+ [['う', 'ゔ', nil], ['ウ', 'ヴ', nil], 'ウ'],
13
+ [['え', nil, nil], ['エ', nil, nil], 'エ'],
14
+ [['お', nil, nil], ['オ', nil, nil], 'オ'],
15
+ # K-row
16
+ [['か', 'が', nil], ['カ', 'ガ', nil], 'カ'],
17
+ [['き', 'ぎ', nil], ['キ', 'ギ', nil], 'キ'],
18
+ [['く', 'ぐ', nil], ['ク', 'グ', nil], 'ク'],
19
+ [['け', 'げ', nil], ['ケ', 'ゲ', nil], 'ケ'],
20
+ [['こ', 'ご', nil], ['コ', 'ゴ', nil], 'コ'],
21
+ # S-row
22
+ [['さ', 'ざ', nil], ['サ', 'ザ', nil], 'サ'],
23
+ [['し', 'じ', nil], ['シ', 'ジ', nil], 'シ'],
24
+ [['す', 'ず', nil], ['ス', 'ズ', nil], 'ス'],
25
+ [['せ', 'ぜ', nil], ['セ', 'ゼ', nil], 'セ'],
26
+ [['そ', 'ぞ', nil], ['ソ', 'ゾ', nil], 'ソ'],
27
+ # T-row
28
+ [['た', 'だ', nil], ['タ', 'ダ', nil], 'タ'],
29
+ [['ち', 'ぢ', nil], ['チ', 'ヂ', nil], 'チ'],
30
+ [['つ', 'づ', nil], ['ツ', 'ヅ', nil], 'ツ'],
31
+ [['て', 'で', nil], ['テ', 'デ', nil], 'テ'],
32
+ [['と', 'ど', nil], ['ト', 'ド', nil], 'ト'],
33
+ # N-row
34
+ [['な', nil, nil], ['ナ', nil, nil], 'ナ'],
35
+ [['に', nil, nil], ['ニ', nil, nil], 'ニ'],
36
+ [['ぬ', nil, nil], ['ヌ', nil, nil], 'ヌ'],
37
+ [['ね', nil, nil], ['ネ', nil, nil], 'ネ'],
38
+ [['の', nil, nil], ['ノ', nil, nil], 'ノ'],
39
+ # H-row
40
+ [['は', 'ば', 'ぱ'], ['ハ', 'バ', 'パ'], 'ハ'],
41
+ [['ひ', 'び', 'ぴ'], ['ヒ', 'ビ', 'ピ'], 'ヒ'],
42
+ [['ふ', 'ぶ', 'ぷ'], ['フ', 'ブ', 'プ'], 'フ'],
43
+ [['へ', 'べ', 'ぺ'], ['ヘ', 'ベ', 'ペ'], 'ヘ'],
44
+ [['ほ', 'ぼ', 'ぽ'], ['ホ', 'ボ', 'ポ'], 'ホ'],
45
+ # M-row
46
+ [['ま', nil, nil], ['マ', nil, nil], 'マ'],
47
+ [['み', nil, nil], ['ミ', nil, nil], 'ミ'],
48
+ [['む', nil, nil], ['ム', nil, nil], 'ム'],
49
+ [['め', nil, nil], ['メ', nil, nil], 'メ'],
50
+ [['も', nil, nil], ['モ', nil, nil], 'モ'],
51
+ # Y-row
52
+ [['や', nil, nil], ['ヤ', nil, nil], 'ヤ'],
53
+ [['ゆ', nil, nil], ['ユ', nil, nil], 'ユ'],
54
+ [['よ', nil, nil], ['ヨ', nil, nil], 'ヨ'],
55
+ # R-row
56
+ [['ら', nil, nil], ['ラ', nil, nil], 'ラ'],
57
+ [['り', nil, nil], ['リ', nil, nil], 'リ'],
58
+ [['る', nil, nil], ['ル', nil, nil], 'ル'],
59
+ [['れ', nil, nil], ['レ', nil, nil], 'レ'],
60
+ [['ろ', nil, nil], ['ロ', nil, nil], 'ロ'],
61
+ # W-row
62
+ [['わ', nil, nil], ['ワ', 'ヷ', nil], 'ワ'],
63
+ [['ゐ', nil, nil], ['ヰ', 'ヸ', nil], nil],
64
+ [['ゑ', nil, nil], ['ヱ', 'ヹ', nil], nil],
65
+ [['を', nil, nil], ['ヲ', 'ヺ', nil], 'ヲ'],
66
+ [['ん', nil, nil], ['ン', nil, nil], 'ン']
67
+ ].freeze
68
+
69
+ # Small kana table
70
+ HIRAGANA_KATAKANA_SMALL_TABLE = [
71
+ ['ぁ', 'ァ', 'ァ'],
72
+ ['ぃ', 'ィ', 'ィ'],
73
+ ['ぅ', 'ゥ', 'ゥ'],
74
+ ['ぇ', 'ェ', 'ェ'],
75
+ ['ぉ', 'ォ', 'ォ'],
76
+ ['っ', 'ッ', 'ッ'],
77
+ ['ゃ', 'ャ', 'ャ'],
78
+ ['ゅ', 'ュ', 'ュ'],
79
+ ['ょ', 'ョ', 'ョ'],
80
+ ['ゎ', 'ヮ', nil],
81
+ ['ゕ', 'ヵ', nil],
82
+ ['ゖ', 'ヶ', nil]
83
+ ].freeze
84
+
85
# Build the list of [base, voiced] pairs.
#
# For each row of HIRAGANA_KATAKANA_TABLE the hiragana pair comes first, then
# the katakana pair; a pair is only produced when both the base and the voiced
# form are present. The four iteration-mark pairs are appended at the end.
#
# @return [Array<Array<String>>] two-element [base, voiced] arrays
def self.generate_voiced_characters
  kana_pairs = HIRAGANA_KATAKANA_TABLE.flat_map do |hiragana, katakana, _halfwidth|
    [hiragana, katakana].filter_map do |forms|
      [forms[0], forms[1]] if forms[0] && forms[1]
    end
  end

  # Iteration marks
  iteration_mark_pairs = [
    ['ゝ', 'ゞ'],
    ['ヽ', 'ヾ'],
    ['〱', '〲'], # U+3031 -> U+3032 (vertical hiragana)
    ['〳', '〴'] # U+3033 -> U+3034 (vertical katakana)
  ]

  kana_pairs + iteration_mark_pairs
end
101
+
102
# Build the list of [base, semi-voiced] pairs.
#
# For each row of HIRAGANA_KATAKANA_TABLE the hiragana pair comes first, then
# the katakana pair; a pair is only produced when both the base and the
# semi-voiced form are present.
#
# @return [Array<Array<String>>] two-element [base, semi-voiced] arrays
def self.generate_semi_voiced_characters
  HIRAGANA_KATAKANA_TABLE.flat_map do |hiragana, katakana, _halfwidth|
    [hiragana, katakana].filter_map do |forms|
      [forms[0], forms[2]] if forms[0] && forms[2]
    end
  end
end
111
+
112
+ VOICED_CHARACTERS = generate_voiced_characters.freeze
113
+ SEMI_VOICED_CHARACTERS = generate_semi_voiced_characters.freeze
114
+ end
115
+ end
116
+ end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'hyphens_data'
4
+
5
+ module Yosina
6
+ module Transliterators
7
# Handle hyphen replacement with precedence logic.
#
# Each input character that has an entry in HyphensData::HYPHENS_MAPPINGS is
# replaced by the first non-nil variant found when walking the configured
# precedence list.
module Hyphens
  # Default precedence of mappings (matching JavaScript default)
  # rubocop:disable Naming/VariableNumber
  DEFAULT_PRECEDENCE = [:jisx0208_90].freeze
  # rubocop:enable Naming/VariableNumber

  # Transliterator for hyphens
  class Transliterator < Yosina::BaseTransliterator
    attr_reader :precedence

    # Initialize the transliterator with options
    #
    # @param options [Hash, nil] Configuration options; nil is treated as no options
    # @option options [Array<Symbol>] :precedence List of mapping variants to apply in order.
    #   Available options: :ascii, :jisx0201, :jisx0208_90, :jisx0208_90_windows, :jisx0208_verbatim
    #   Defaults to [:jisx0208_90]
    def initialize(options = nil)
      super()
      # The parameter defaults to nil, so guard before indexing; otherwise
      # `Transliterator.new` without arguments raises NoMethodError.
      @precedence = (options || {})[:precedence] || DEFAULT_PRECEDENCE
    end

    # Normalize hyphen characters based on precedence
    #
    # A replacement may be longer than one character; each of its characters
    # is emitted as its own Char whose source is the original character.
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The transliterated characters
    def call(input_chars)
      offset = 0

      Chars.enum do |y|
        input_chars.each do |char|
          record = HyphensData::HYPHENS_MAPPINGS[char.c]
          replacement = record && get_replacement(record)

          if replacement && replacement != char.c
            replacement.each_char do |c|
              y << Char.new(c: c, offset: offset, source: char)
              # Advance by the character just emitted, not by the whole
              # replacement, so multi-character replacements keep
              # consecutive offsets.
              offset += c.length
            end
          else
            # No mapping, no variant for the configured precedence, or the
            # variant equals the input: emit the character as-is.
            y << Char.new(c: char.c, offset: offset, source: char)
            offset += char.c.length
          end
        end
      end
    end

    private

    # Get the replacement character based on precedence
    #
    # Only names that are members of the record struct are looked up, so an
    # unknown entry in the precedence list is skipped instead of being
    # dispatched as an arbitrary method call.
    #
    # @param record [HyphensData::HyphensRecord] The hyphen record containing mapping options
    # @return [String, nil] The replacement character or nil if no mapping found
    def get_replacement(record)
      @precedence.each do |mapping_type|
        key = mapping_type.to_sym
        next unless record.members.include?(key)

        replacement = record[key]
        return replacement if replacement
      end

      nil
    end
  end

  # Factory method to create a hyphens transliterator
  #
  # @param options [Hash] Configuration options
  # @return [Transliterator] A new hyphens transliterator instance
  def self.call(options = {})
    Transliterator.new(options)
  end
end
82
+ end
83
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
+ # Generated hyphens data
6
+ module HyphensData
7
# Record for hyphen transliteration data.
#
# One optional String per target variant; every member defaults to nil when
# it is not given.
HyphensRecord = Struct.new(
  :ascii,
  :jisx0201,
  :jisx0208_90,
  :jisx0208_90_windows,
  :jisx0208_verbatim,
  keyword_init: true
) do
  def initialize(ascii: nil, jisx0201: nil, jisx0208_90: nil, jisx0208_90_windows: nil, jisx0208_verbatim: nil)
    super(
      ascii: ascii,
      jisx0201: jisx0201,
      jisx0208_90: jisx0208_90,
      jisx0208_90_windows: jisx0208_90_windows,
      jisx0208_verbatim: jisx0208_verbatim
    )
  end
end
13
+
14
+ # Generated mapping data
15
+ HYPHENS_MAPPINGS = {
16
+ '-' => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{2212}"),
17
+ '|' => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
18
+ '~' => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
19
+ "\u{a2}" => HyphensRecord.new(jisx0208_90: "\u{a2}", jisx0208_90_windows: "\u{ffe0}"),
20
+ "\u{a3}" => HyphensRecord.new(jisx0208_90: "\u{a3}", jisx0208_90_windows: "\u{ffe1}"),
21
+ "\u{a6}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
22
+ "\u{2d7}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{ff0d}"),
23
+ "\u{2010}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
24
+ "\u{2011}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
25
+ "\u{2012}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
26
+ "\u{2013}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
27
+ "\u{2014}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2014}", jisx0208_90_windows: "\u{2015}"),
28
+ "\u{2015}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
29
+ "\u{2016}" => HyphensRecord.new(jisx0208_90: "\u{2016}", jisx0208_90_windows: "\u{2225}"),
30
+ "\u{203e}" => HyphensRecord.new(jisx0201: '~', jisx0208_90: "\u{ffe3}", jisx0208_90_windows: "\u{ffe3}"),
31
+ "\u{2043}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
32
+ "\u{2053}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{301c}"),
33
+ "\u{2212}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{ff0d}"),
34
+ "\u{2225}" => HyphensRecord.new(jisx0208_90: "\u{2016}", jisx0208_90_windows: "\u{2225}"),
35
+ "\u{223c}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
36
+ "\u{223d}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
37
+ "\u{2500}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
38
+ "\u{2501}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
39
+ "\u{2502}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
40
+ "\u{2796}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{ff0d}"),
41
+ "\u{29ff}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{ff0d}"),
42
+ "\u{2e3a}" => HyphensRecord.new(ascii: '--', jisx0201: '--', jisx0208_90: "\u{2014}\u{2014}", jisx0208_90_windows: "\u{2015}\u{2015}"),
43
+ "\u{2e3b}" => HyphensRecord.new(ascii: '---', jisx0201: '---', jisx0208_90: "\u{2014}\u{2014}\u{2014}", jisx0208_90_windows: "\u{2015}\u{2015}\u{2015}"),
44
+ "\u{301c}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
45
+ "\u{30a0}" => HyphensRecord.new(ascii: '=', jisx0201: '=', jisx0208_90: "\u{ff1d}", jisx0208_90_windows: "\u{ff1d}"),
46
+ "\u{30fb}" => HyphensRecord.new(jisx0201: "\u{ff65}", jisx0208_90: "\u{30fb}", jisx0208_90_windows: "\u{30fb}"),
47
+ "\u{30fc}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{30fc}", jisx0208_90_windows: "\u{30fc}"),
48
+ "\u{fe31}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
49
+ "\u{fe58}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
50
+ "\u{fe63}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
51
+ "\u{ff0d}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{ff0d}"),
52
+ "\u{ff5c}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
53
+ "\u{ff5e}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
54
+ "\u{ffe4}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ffe4}"),
55
+ "\u{ff70}" => HyphensRecord.new(ascii: '-', jisx0201: "\u{ff70}", jisx0208_90: "\u{30fc}", jisx0208_90_windows: "\u{30fc}"),
56
+ "\u{ffe8}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}")
57
+ }.freeze
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Yosina
4
+ module Transliterators
5
# Replace ideographic annotation marks used in traditional translation
# (U+3192..U+319F) with the corresponding ordinary ideographs.
module IdeographicAnnotations
  # Generated mapping data from ideographic_annotations.json
  # (annotation mark => plain ideograph).
  IDEOGRAPHIC_ANNOTATIONS_MAPPINGS = {
    "\u{3192}" => "\u{4e00}",
    "\u{3193}" => "\u{4e8c}",
    "\u{3194}" => "\u{4e09}",
    "\u{3195}" => "\u{56db}",
    "\u{3196}" => "\u{4e0a}",
    "\u{3197}" => "\u{4e2d}",
    "\u{3198}" => "\u{4e0b}",
    "\u{3199}" => "\u{7532}",
    "\u{319a}" => "\u{4e59}",
    "\u{319b}" => "\u{4e19}",
    "\u{319c}" => "\u{4e01}",
    "\u{319d}" => "\u{5929}",
    "\u{319e}" => "\u{5730}",
    "\u{319f}" => "\u{4eba}"
  }.freeze

  # Transliterator for ideographic_annotations
  class Transliterator < Yosina::BaseTransliterator
    # Initialize the transliterator with options
    #
    # @param _options [Hash] Configuration options (currently unused)
    def initialize(_options = {})
      # No option is read by this transliterator.
      super()
    end

    # Replace ideographic annotation marks used in traditional translation
    #
    # Offsets of the returned characters are renumbered from 0. The result is
    # built eagerly and then given the Yosina::Chars methods through its
    # singleton class.
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The transliterated characters
    def call(input_chars)
      offset = 0

      converted = input_chars.filter_map do |char|
        mapped = IDEOGRAPHIC_ANNOTATIONS_MAPPINGS[char.c]
        # An empty replacement means "remove the character"; filter_map
        # drops the nil produced by `next`.
        next if mapped && mapped.empty?

        emitted =
          if mapped
            Char.new(c: mapped, offset: offset, source: char)
          else
            char.with_offset(offset)
          end
        offset += emitted.c.length
        emitted
      end

      converted.singleton_class.include(Yosina::Chars)
      converted
    end
  end

  # Factory method to create a ideographic_annotations transliterator
  #
  # @param options [Hash] Configuration options
  # @return [Transliterator] A new ideographic_annotations transliterator instance
  def self.call(options = {})
    Transliterator.new(options)
  end
end
72
+ end
73
+ end
@@ -0,0 +1,169 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'ivs_svs_base_data'
4
+
5
+ module Yosina
6
+ module Transliterators
7
# IVS/SVS base handling with proper forward and reverse transliteration
module IvsSvsBase
  # Forward transliterator to add IVS/SVS selectors to base characters
  class ForwardTransliterator
    attr_reader :base_to_variants, :prefer_svs

    # Initialize the forward transliterator with options
    #
    # @param base_to_variants [Hash] Mapping of base characters to their IVS/SVS variants
    # @param prefer_svs [Boolean] Whether to prefer SVS over IVS when both exist
    def initialize(base_to_variants, prefer_svs)
      @base_to_variants = base_to_variants
      @prefer_svs = prefer_svs
    end

    # Add IVS/SVS selectors to base characters
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The transliterated characters
    def call(input_chars)
      offset = 0

      Chars.enum do |y|
        input_chars.each do |char|
          variant = variant_for(char.c)

          if variant
            y << Char.new(c: variant, offset: offset, source: char)
            offset += variant.length
          else
            y << char.with_offset(offset)
            offset += char.c.length
          end
        end
      end
    end

    private

    # Pick the variant sequence for a base character.
    #
    # The SVS form is used only when SVS is preferred and the record has one;
    # in every other case the IVS form (possibly absent) is used.
    #
    # @param text [String] The character string to look up
    # @return [String, nil] The variant sequence, or nil when there is none
    def variant_for(text)
      record = @base_to_variants[text]
      return nil unless record
      return record.svs if @prefer_svs && record.svs

      record.ivs || nil
    end
  end

  # Reverse transliterator to remove IVS/SVS selectors and get base characters
  class ReverseTransliterator
    attr_reader :variants_to_base, :charset, :drop_selectors_altogether

    # Initialize the reverse transliterator with options
    #
    # @param variants_to_base [Hash] Mapping of IVS/SVS characters to their base forms
    # @param charset [String] The charset to use for base mappings ("unijis_90" or "unijis_2004")
    # @param drop_selectors_altogether [Boolean] Whether to drop all selectors
    def initialize(variants_to_base, charset, drop_selectors_altogether)
      @variants_to_base = variants_to_base
      @charset = charset
      @drop_selectors_altogether = drop_selectors_altogether
    end

    # Remove IVS/SVS selectors to get base characters
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The transliterated characters
    def call(input_chars)
      offset = 0

      Chars.enum do |y|
        input_chars.each do |char|
          base = base_for(char.c)
          # Fall back to stripping the selector by hand only when the table
          # gave nothing and the caller asked for selectors to be dropped.
          base ||= without_selector(char.c) if @drop_selectors_altogether

          if base
            y << Char.new(c: base, offset: offset, source: char)
            offset += base.length
          else
            y << char.with_offset(offset)
            offset += char.c.length
          end
        end
      end
    end

    private

    # Look up the base form of a variant sequence for the configured charset.
    #
    # @param text [String] The character string to look up
    # @return [String, nil] The base form, or nil when the table has no entry
    #   for this charset
    def base_for(text)
      record = @variants_to_base[text]
      return nil unless record

      case @charset
      when 'unijis_2004' then record.base2004 || nil
      when 'unijis_90' then record.base90 || nil
      end
    end

    # Drop a trailing variation selector from a multi-character string.
    #
    # @param text [String] The character string to inspect
    # @return [String, nil] The first character when the second one is a
    #   variation selector (U+FE00-U+FE0F or U+E0100-U+E01EF), otherwise nil
    def without_selector(text)
      return nil unless text.length > 1

      code_point = text[1].ord
      return nil unless code_point.between?(0xFE00, 0xFE0F) || code_point.between?(0xE0100, 0xE01EF)

      text[0]
    end
  end

  # Main IVS/SVS base transliterator
  class Transliterator < Yosina::BaseTransliterator
    attr_reader :mode, :drop_selectors_altogether, :charset, :prefer_svs, :inner

    # Initialize the transliterator with options
    #
    # @param options [Hash] Configuration options
    # @option options [String] :mode The mode of operation ("ivs-or-svs", "base"). Defaults to "base".
    #   - "ivs-or-svs": Add IVS/SVS selectors to kanji characters
    #   - "base": Remove IVS/SVS selectors to get base characters
    # @option options [Boolean] :drop_selectors_altogether Whether to drop all selectors when mode is "base".
    #   Defaults to false.
    # @option options [String] :charset The charset to use for base mappings ("unijis_90" or "unijis_2004").
    #   Defaults to "unijis_2004".
    # @option options [Boolean] :prefer_svs When mode is "ivs-or-svs", prefer SVS over IVS if both exist.
    #   Defaults to false.
    def initialize(options = {})
      super()
      @mode = options[:mode] || 'base'
      @drop_selectors_altogether = options[:drop_selectors_altogether] || false
      @charset = options[:charset] || 'unijis_2004'
      @prefer_svs = options[:prefer_svs] || false
      @inner = build_inner
    end

    # Handle IVS/SVS sequences by delegating to the transliterator selected
    # at construction time.
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The transliterated characters
    def call(input_chars)
      @inner.call(input_chars)
    end

    private

    # Create the direction-specific transliterator: forward for mode
    # "ivs-or-svs", reverse for any other mode.
    #
    # @return [ForwardTransliterator, ReverseTransliterator]
    def build_inner
      unless @mode == 'ivs-or-svs'
        return ReverseTransliterator.new(
          IvsSvsBaseData.get_variants_to_base_mappings,
          @charset,
          @drop_selectors_altogether
        )
      end

      ForwardTransliterator.new(
        IvsSvsBaseData.get_base_to_variants_mappings(@charset),
        @prefer_svs
      )
    end
  end

  # Factory method to create an IVS/SVS base transliterator
  #
  # @param options [Hash] Configuration options
  # @return [Transliterator] A new IVS/SVS base transliterator instance
  def self.call(options = {})
    Transliterator.new(options)
  end
end
168
+ end
169
+ end