yosina 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +36 -0
- data/Gemfile +6 -0
- data/README.ja.md +229 -0
- data/README.md +229 -0
- data/Rakefile +30 -0
- data/codegen/dataset.rb +215 -0
- data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
- data/codegen/emitters/combined_transliterator_data.rb +28 -0
- data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
- data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
- data/codegen/emitters/simple_transliterator.rb +76 -0
- data/codegen/emitters/utils.rb +45 -0
- data/codegen/emitters.rb +8 -0
- data/codegen/main.rb +109 -0
- data/lib/yosina/char.rb +65 -0
- data/lib/yosina/chars.rb +152 -0
- data/lib/yosina/recipes.rb +359 -0
- data/lib/yosina/transliterator.rb +49 -0
- data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
- data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
- data/lib/yosina/transliterators/combined.rb +52 -0
- data/lib/yosina/transliterators/combined_data.rb +495 -0
- data/lib/yosina/transliterators/hira_kata.rb +106 -0
- data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
- data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
- data/lib/yosina/transliterators/hyphens.rb +83 -0
- data/lib/yosina/transliterators/hyphens_data.rb +60 -0
- data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
- data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
- data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
- data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
- data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
- data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
- data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
- data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
- data/lib/yosina/transliterators/radicals.rb +361 -0
- data/lib/yosina/transliterators/spaces.rb +79 -0
- data/lib/yosina/transliterators.rb +57 -0
- data/lib/yosina/version.rb +5 -0
- data/lib/yosina.rb +62 -0
- data/yosina.gemspec +41 -0
- metadata +159 -0

data/lib/yosina/transliterators/hira_kata_composition.rb
@@ -0,0 +1,103 @@
+# frozen_string_literal: true
+
+require_relative 'hira_kata_table'
+
+module Yosina
+  module Transliterators
+    # Hiragana/katakana composition transliterator
+    module HiraKataComposition
+      include HiraKataTable
+
+      VOICED_SOUND_MARK_MAPPINGS = Hash[VOICED_CHARACTERS].freeze
+      SEMI_VOICED_SOUND_MARK_MAPPINGS = Hash[SEMI_VOICED_CHARACTERS].freeze
+
+      # Combining mark mappings for hiragana and katakana
+      COMBINING_MARKS = {
+        "\u3099" => VOICED_SOUND_MARK_MAPPINGS,
+        "\u309A" => SEMI_VOICED_SOUND_MARK_MAPPINGS
+      }.freeze
+
+      # Non-combining mark mappings
+      NON_COMBINING_MARKS = {
+        "\u3099" => VOICED_SOUND_MARK_MAPPINGS,
+        "\u309A" => SEMI_VOICED_SOUND_MARK_MAPPINGS,
+        "\u309B" => VOICED_SOUND_MARK_MAPPINGS,
+        "\u309C" => SEMI_VOICED_SOUND_MARK_MAPPINGS
+      }.freeze
+
+      # Transliterator for hiragana/katakana composition
+      class Transliterator < Yosina::BaseTransliterator
+        attr_reader :compose_non_combining_marks
+
+        # Initialize the transliterator with options
+        #
+        # @param options [Hash] Configuration options
+        # @option options [Boolean] :compose_non_combining_marks Whether to compose non-combining
+        #   marks (゛ and ゜) too. Defaults to false.
+        def initialize(options = {})
+          super()
+          @compose_non_combining_marks = options[:compose_non_combining_marks] || false
+          @mappings = @compose_non_combining_marks ? NON_COMBINING_MARKS : COMBINING_MARKS
+        end
+
+        # Combine decomposed hiragana and katakana characters with their marks
+        #
+        # @param input_chars [Enumerable<Char>] The characters to transliterate
+        # @return [Enumerable<Char>] The transliterated characters
+        def call(input_chars)
+          e = input_chars.each
+          offset = 0
+
+          Chars.enum do |y|
+            begin
+              prev = e.next
+            rescue StopIteration
+              break
+            end
+
+            if prev.sentinel?
+              y << prev
+              break
+            end
+
+            loop do
+              begin
+                char = e.next
+              rescue StopIteration
+                break
+              end
+
+              if prev
+                # Check for combining marks
+                if (mark_mapping = @mappings[char.c]) && (composed = mark_mapping[prev.c])
+                  # Found a composable combination
+                  y << Char.new(c: composed, offset: offset, source: char)
+                  offset += composed.length
+                  prev = nil
+                  next
+                end
+
+                # No composition possible, keep original character
+                y << prev.with_offset(offset)
+                offset += prev.c.length
+              end
+              prev = char
+            end
+            if prev
+              y << prev.with_offset(offset)
+              offset += prev.c.length
+            end
+          end
+        end
+      end
+
+      # Factory method to create a hiragana/katakana composition transliterator
+      #
+      # @param options [Hash] Configuration options
+      # @return [Transliterator] A new composition transliterator instance
+      def self.call(options = {})
+        Transliterator.new(options)
+      end
+    end
+  end
+end
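
The call method above is a pairwise pass: it keeps one pending character, and when the next character is a combining voiced or semi-voiced sound mark that the pending kana can absorb, it emits the precomposed code point instead of the pair. Below is a minimal, self-contained sketch of that same lookahead on plain strings; it uses a hand-picked subset of the pairs that HiraKataTable provides and deliberately avoids the gem's Char/Chars types, so it is an illustration rather than the library's API.

# Hand-picked subset of the voiced / semi-voiced pairs; the real transliterator
# builds these hashes from HiraKataTable::VOICED_CHARACTERS and
# HiraKataTable::SEMI_VOICED_CHARACTERS.
COMPOSE = {
  "\u3099" => { 'か' => 'が', 'ウ' => 'ヴ' }, # combining voiced sound mark
  "\u309A" => { 'は' => 'ぱ', 'ホ' => 'ポ' }  # combining semi-voiced sound mark
}.freeze

# One-character lookahead: emit the precomposed kana when the current character
# is a mark that the pending character can absorb; otherwise flush the pending one.
def compose_kana(text)
  out = +''
  prev = nil
  text.each_char do |ch|
    if prev && (table = COMPOSE[ch]) && (composed = table[prev])
      out << composed
      prev = nil
    else
      out << prev if prev
      prev = ch
    end
  end
  out << prev if prev
  out
end

puts compose_kana("か\u3099ウ\u3099") # => "がヴ"
puts compose_kana("は\u309Aん")       # => "ぱん"

Plain NFC normalization (String#unicode_normalize(:nfc)) would also compose the combining marks U+3099/U+309A, but not the standalone marks U+309B/U+309C; handling those is exactly what the compose_non_combining_marks option switches on.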

data/lib/yosina/transliterators/hira_kata_table.rb
@@ -0,0 +1,116 @@
+# frozen_string_literal: true
+
+module Yosina
+  module Transliterators
+    # Hiragana-Katakana mapping table
+    module HiraKataTable
+      # Main hiragana-katakana table with [hiragana, katakana, halfwidth] structure
+      HIRAGANA_KATAKANA_TABLE = [
+        # Vowels
+        [['あ', nil, nil], ['ア', nil, nil], 'ア'],
+        [['い', nil, nil], ['イ', nil, nil], 'イ'],
+        [['う', 'ゔ', nil], ['ウ', 'ヴ', nil], 'ウ'],
+        [['え', nil, nil], ['エ', nil, nil], 'エ'],
+        [['お', nil, nil], ['オ', nil, nil], 'オ'],
+        # K-row
+        [['か', 'が', nil], ['カ', 'ガ', nil], 'カ'],
+        [['き', 'ぎ', nil], ['キ', 'ギ', nil], 'キ'],
+        [['く', 'ぐ', nil], ['ク', 'グ', nil], 'ク'],
+        [['け', 'げ', nil], ['ケ', 'ゲ', nil], 'ケ'],
+        [['こ', 'ご', nil], ['コ', 'ゴ', nil], 'コ'],
+        # S-row
+        [['さ', 'ざ', nil], ['サ', 'ザ', nil], 'サ'],
+        [['し', 'じ', nil], ['シ', 'ジ', nil], 'シ'],
+        [['す', 'ず', nil], ['ス', 'ズ', nil], 'ス'],
+        [['せ', 'ぜ', nil], ['セ', 'ゼ', nil], 'セ'],
+        [['そ', 'ぞ', nil], ['ソ', 'ゾ', nil], 'ソ'],
+        # T-row
+        [['た', 'だ', nil], ['タ', 'ダ', nil], 'タ'],
+        [['ち', 'ぢ', nil], ['チ', 'ヂ', nil], 'チ'],
+        [['つ', 'づ', nil], ['ツ', 'ヅ', nil], 'ツ'],
+        [['て', 'で', nil], ['テ', 'デ', nil], 'テ'],
+        [['と', 'ど', nil], ['ト', 'ド', nil], 'ト'],
+        # N-row
+        [['な', nil, nil], ['ナ', nil, nil], 'ナ'],
+        [['に', nil, nil], ['ニ', nil, nil], 'ニ'],
+        [['ぬ', nil, nil], ['ヌ', nil, nil], 'ヌ'],
+        [['ね', nil, nil], ['ネ', nil, nil], 'ネ'],
+        [['の', nil, nil], ['ノ', nil, nil], 'ノ'],
+        # H-row
+        [['は', 'ば', 'ぱ'], ['ハ', 'バ', 'パ'], 'ハ'],
+        [['ひ', 'び', 'ぴ'], ['ヒ', 'ビ', 'ピ'], 'ヒ'],
+        [['ふ', 'ぶ', 'ぷ'], ['フ', 'ブ', 'プ'], 'フ'],
+        [['へ', 'べ', 'ぺ'], ['ヘ', 'ベ', 'ペ'], 'ヘ'],
+        [['ほ', 'ぼ', 'ぽ'], ['ホ', 'ボ', 'ポ'], 'ホ'],
+        # M-row
+        [['ま', nil, nil], ['マ', nil, nil], 'マ'],
+        [['み', nil, nil], ['ミ', nil, nil], 'ミ'],
+        [['む', nil, nil], ['ム', nil, nil], 'ム'],
+        [['め', nil, nil], ['メ', nil, nil], 'メ'],
+        [['も', nil, nil], ['モ', nil, nil], 'モ'],
+        # Y-row
+        [['や', nil, nil], ['ヤ', nil, nil], 'ヤ'],
+        [['ゆ', nil, nil], ['ユ', nil, nil], 'ユ'],
+        [['よ', nil, nil], ['ヨ', nil, nil], 'ヨ'],
+        # R-row
+        [['ら', nil, nil], ['ラ', nil, nil], 'ラ'],
+        [['り', nil, nil], ['リ', nil, nil], 'リ'],
+        [['る', nil, nil], ['ル', nil, nil], 'ル'],
+        [['れ', nil, nil], ['レ', nil, nil], 'レ'],
+        [['ろ', nil, nil], ['ロ', nil, nil], 'ロ'],
+        # W-row
+        [['わ', nil, nil], ['ワ', 'ヷ', nil], 'ワ'],
+        [['ゐ', nil, nil], ['ヰ', 'ヸ', nil], nil],
+        [['ゑ', nil, nil], ['ヱ', 'ヹ', nil], nil],
+        [['を', nil, nil], ['ヲ', 'ヺ', nil], 'ヲ'],
+        [['ん', nil, nil], ['ン', nil, nil], 'ン']
+      ].freeze
+
+      # Small kana table
+      HIRAGANA_KATAKANA_SMALL_TABLE = [
+        ['ぁ', 'ァ', 'ァ'],
+        ['ぃ', 'ィ', 'ィ'],
+        ['ぅ', 'ゥ', 'ゥ'],
+        ['ぇ', 'ェ', 'ェ'],
+        ['ぉ', 'ォ', 'ォ'],
+        ['っ', 'ッ', 'ッ'],
+        ['ゃ', 'ャ', 'ャ'],
+        ['ゅ', 'ュ', 'ュ'],
+        ['ょ', 'ョ', 'ョ'],
+        ['ゎ', 'ヮ', nil],
+        ['ゕ', 'ヵ', nil],
+        ['ゖ', 'ヶ', nil]
+      ].freeze
+
+      # Generate voiced character mappings
+      def self.generate_voiced_characters
+        result = []
+        HIRAGANA_KATAKANA_TABLE.each do |hiragana, katakana, _|
+          result << [hiragana[0], hiragana[1]] if hiragana[0] && hiragana[1]
+          result << [katakana[0], katakana[1]] if katakana[0] && katakana[1]
+        end
+        # Add iteration marks
+        result.concat([
+          ['ゝ', 'ゞ'],
+          ['ヽ', 'ヾ'],
+          ['〱', '〲'], # U+3031 -> U+3032 (vertical hiragana)
+          ['〳', '〴'] # U+3033 -> U+3034 (vertical katakana)
+        ])
+        result
+      end
+
+      # Generate semi-voiced character mappings
+      def self.generate_semi_voiced_characters
+        result = []
+        HIRAGANA_KATAKANA_TABLE.each do |hiragana, katakana, _|
+          result << [hiragana[0], hiragana[2]] if hiragana[0] && hiragana[2]
+          result << [katakana[0], katakana[2]] if katakana[0] && katakana[2]
+        end
+        result
+      end
+
+      VOICED_CHARACTERS = generate_voiced_characters.freeze
+      SEMI_VOICED_CHARACTERS = generate_semi_voiced_characters.freeze
+    end
+  end
+end
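
VOICED_CHARACTERS and SEMI_VOICED_CHARACTERS, consumed by the composition module above, are derived from the second and third elements of each [hiragana, katakana, halfwidth] row; generate_voiced_characters additionally appends the kana and vertical iteration marks. A small standalone sketch of that derivation, using a two-row copy of the table (illustration only, not the gem's code):

# Two rows copied from HIRAGANA_KATAKANA_TABLE; each row is
# [[hira, hira_voiced, hira_semivoiced], [kata, kata_voiced, kata_semivoiced], halfwidth].
TABLE_SUBSET = [
  [['か', 'が', nil], ['カ', 'ガ', nil], 'カ'],
  [['は', 'ば', 'ぱ'], ['ハ', 'バ', 'パ'], 'ハ']
].freeze

# Same shape as generate_voiced_characters: keep [base, voiced] pairs where a
# voiced form exists, for both the hiragana and the katakana column of each row.
voiced_pairs = TABLE_SUBSET.flat_map do |hira, kata, _half|
  [[hira[0], hira[1]], [kata[0], kata[1]]].select { |base, voiced| base && voiced }
end

p Hash[voiced_pairs] # => {"か"=>"が", "カ"=>"ガ", "は"=>"ば", "ハ"=>"バ"}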

data/lib/yosina/transliterators/hyphens.rb
@@ -0,0 +1,83 @@
+# frozen_string_literal: true
+
+require_relative 'hyphens_data'
+
+module Yosina
+  module Transliterators
+    # Handle hyphen replacement with precedence logic
+    module Hyphens
+      # Default precedence of mappings (matching JavaScript default)
+      # rubocop:disable Naming/VariableNumber
+      DEFAULT_PRECEDENCE = [:jisx0208_90].freeze
+      # rubocop:enable Naming/VariableNumber
+
+      # Transliterator for hyphens
+      class Transliterator < Yosina::BaseTransliterator
+        attr_reader :precedence
+
+        # Initialize the transliterator with options
+        #
+        # @param options [Hash] Configuration options
+        # @option options [Array<Symbol>] :precedence List of mapping variants to apply in order.
+        #   Available options: :ascii, :jisx0201, :jisx0208_90, :jisx0208_90_windows, :jisx0208_verbatim
+        #   Defaults to [:jisx0208_90]
+        def initialize(options = nil)
+          super()
+          @precedence = options[:precedence] || DEFAULT_PRECEDENCE
+        end
+
+        # Normalize hyphen characters based on precedence
+        #
+        # @param input_chars [Enumerable<Char>] The characters to transliterate
+        # @return [Enumerable<Char>] The transliterated characters
+        def call(input_chars)
+          offset = 0
+
+          Chars.enum do |y|
+            input_chars.each do |char|
+              record = HyphensData::HYPHENS_MAPPINGS[char.c]
+              if record
+                replacement = get_replacement(record)
+                if replacement && replacement != char.c
+                  replacement.each_char do |c|
+                    y << Char.new(c: c, offset: offset, source: char)
+                    offset += replacement.length
+                  end
+                else
+                  y << Char.new(c: char.c, offset: offset, source: char)
+                  offset += char.c.length
+                end
+              else
+                y << Char.new(c: char.c, offset: offset, source: char)
+                offset += char.c.length
+              end
+            end
+          end
+        end
+
+        private
+
+        # Get the replacement character based on precedence
+        #
+        # @param record [HyphensData::HyphensRecord] The hyphen record containing mapping options
+        # @return [String, nil] The replacement character or nil if no mapping found
+        def get_replacement(record)
+          @precedence.each do |mapping_type|
+            replacement = (record.send mapping_type if record.respond_to?(mapping_type))
+            return replacement if replacement
+          end
+
+          nil
+        end
+      end
+
+      # Factory method to create a hyphens transliterator
+      #
+      # @param options [Hash] Configuration options
+      # @return [Transliterator] A new hyphens transliterator instance
+      def self.call(options = {})
+        Transliterator.new(options)
+      end
+    end
+  end
+end
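
get_replacement walks the configured precedence list and returns the first variant the record actually defines, so the same input can be normalized toward JIS X 0208-90 code points or toward the Windows-flavoured ones. Note that initialize declares options = nil but immediately indexes into it, so construction is only safe through the self.call factory, which always passes a hash. The standalone sketch below mirrors the precedence lookup with a one-entry copy of the mapping; the DemoRecord/DEMO_MAPPINGS names are illustrative, not part of the gem.

# Field names mirror HyphensRecord; the single entry is copied from HYPHENS_MAPPINGS
# above ('~' maps to WAVE DASH or FULLWIDTH TILDE depending on the chosen variant).
DemoRecord = Struct.new(:ascii, :jisx0201, :jisx0208_90, :jisx0208_90_windows,
                        :jisx0208_verbatim, keyword_init: true)

DEMO_MAPPINGS = {
  '~' => DemoRecord.new(ascii: '~', jisx0201: '~',
                        jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}")
}.freeze

# First non-nil field in precedence order wins; unmapped characters pass through.
def replace_hyphens(text, precedence: [:jisx0208_90])
  text.each_char.map do |ch|
    record = DEMO_MAPPINGS[ch]
    next ch unless record

    precedence.filter_map { |kind| record[kind] }.first || ch
  end.join
end

puts replace_hyphens('A~B')                                     # => "A〜B" (U+301C WAVE DASH)
puts replace_hyphens('A~B', precedence: [:jisx0208_90_windows]) # => "A～B" (U+FF5E FULLWIDTH TILDE)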

data/lib/yosina/transliterators/hyphens_data.rb
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+
+module Yosina
+  module Transliterators
+    # Generated hyphens data
+    module HyphensData
+      # Record for hyphen transliteration data
+      HyphensRecord = Struct.new(:ascii, :jisx0201, :jisx0208_90, :jisx0208_90_windows, :jisx0208_verbatim, keyword_init: true) do
+        def initialize(ascii: nil, jisx0201: nil, jisx0208_90: nil, jisx0208_90_windows: nil, jisx0208_verbatim: nil)
+          super
+        end
+      end
+
+      # Generated mapping data
+      HYPHENS_MAPPINGS = {
+        '-' => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{2212}"),
+        '|' => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
+        '~' => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
+        "\u{a2}" => HyphensRecord.new(jisx0208_90: "\u{a2}", jisx0208_90_windows: "\u{ffe0}"),
+        "\u{a3}" => HyphensRecord.new(jisx0208_90: "\u{a3}", jisx0208_90_windows: "\u{ffe1}"),
+        "\u{a6}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
+        "\u{2d7}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{ff0d}"),
+        "\u{2010}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
+        "\u{2011}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
+        "\u{2012}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
+        "\u{2013}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
+        "\u{2014}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2014}", jisx0208_90_windows: "\u{2015}"),
+        "\u{2015}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
+        "\u{2016}" => HyphensRecord.new(jisx0208_90: "\u{2016}", jisx0208_90_windows: "\u{2225}"),
+        "\u{203e}" => HyphensRecord.new(jisx0201: '~', jisx0208_90: "\u{ffe3}", jisx0208_90_windows: "\u{ffe3}"),
+        "\u{2043}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
+        "\u{2053}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{301c}"),
+        "\u{2212}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{ff0d}"),
+        "\u{2225}" => HyphensRecord.new(jisx0208_90: "\u{2016}", jisx0208_90_windows: "\u{2225}"),
+        "\u{223c}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
+        "\u{223d}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
+        "\u{2500}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
+        "\u{2501}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2015}", jisx0208_90_windows: "\u{2015}"),
+        "\u{2502}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
+        "\u{2796}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{ff0d}"),
+        "\u{29ff}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{ff0d}"),
+        "\u{2e3a}" => HyphensRecord.new(ascii: '--', jisx0201: '--', jisx0208_90: "\u{2014}\u{2014}", jisx0208_90_windows: "\u{2015}\u{2015}"),
+        "\u{2e3b}" => HyphensRecord.new(ascii: '---', jisx0201: '---', jisx0208_90: "\u{2014}\u{2014}\u{2014}", jisx0208_90_windows: "\u{2015}\u{2015}\u{2015}"),
+        "\u{301c}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
+        "\u{30a0}" => HyphensRecord.new(ascii: '=', jisx0201: '=', jisx0208_90: "\u{ff1d}", jisx0208_90_windows: "\u{ff1d}"),
+        "\u{30fb}" => HyphensRecord.new(jisx0201: "\u{ff65}", jisx0208_90: "\u{30fb}", jisx0208_90_windows: "\u{30fb}"),
+        "\u{30fc}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{30fc}", jisx0208_90_windows: "\u{30fc}"),
+        "\u{fe31}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
+        "\u{fe58}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
+        "\u{fe63}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2010}", jisx0208_90_windows: "\u{2010}"),
+        "\u{ff0d}" => HyphensRecord.new(ascii: '-', jisx0201: '-', jisx0208_90: "\u{2212}", jisx0208_90_windows: "\u{ff0d}"),
+        "\u{ff5c}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}"),
+        "\u{ff5e}" => HyphensRecord.new(ascii: '~', jisx0201: '~', jisx0208_90: "\u{301c}", jisx0208_90_windows: "\u{ff5e}"),
+        "\u{ffe4}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ffe4}"),
+        "\u{ff70}" => HyphensRecord.new(ascii: '-', jisx0201: "\u{ff70}", jisx0208_90: "\u{30fc}", jisx0208_90_windows: "\u{30fc}"),
+        "\u{ffe8}" => HyphensRecord.new(ascii: '|', jisx0201: '|', jisx0208_90: "\u{ff5c}", jisx0208_90_windows: "\u{ff5c}")
+      }.freeze
+    end
+  end
+end

data/lib/yosina/transliterators/ideographic_annotations.rb
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+
+module Yosina
+  module Transliterators
+    # Replace ideographic annotation marks used in traditional translation
+    module IdeographicAnnotations
+      # Generated mapping data from ideographic_annotations.json
+      IDEOGRAPHIC_ANNOTATIONS_MAPPINGS = {
+        "\u{3192}" => "\u{4e00}",
+        "\u{3193}" => "\u{4e8c}",
+        "\u{3194}" => "\u{4e09}",
+        "\u{3195}" => "\u{56db}",
+        "\u{3196}" => "\u{4e0a}",
+        "\u{3197}" => "\u{4e2d}",
+        "\u{3198}" => "\u{4e0b}",
+        "\u{3199}" => "\u{7532}",
+        "\u{319a}" => "\u{4e59}",
+        "\u{319b}" => "\u{4e19}",
+        "\u{319c}" => "\u{4e01}",
+        "\u{319d}" => "\u{5929}",
+        "\u{319e}" => "\u{5730}",
+        "\u{319f}" => "\u{4eba}"
+      }.freeze
+
+      # Transliterator for ideographic_annotations
+      class Transliterator < Yosina::BaseTransliterator
+        # Initialize the transliterator with options
+        #
+        # @param _options [Hash] Configuration options (currently unused)
+        def initialize(_options = {})
+          # Options currently unused for ideographic_annotations transliterator
+          super()
+        end
+
+        # Replace ideographic annotation marks used in traditional translation
+        #
+        # @param input_chars [Enumerable<Char>] The characters to transliterate
+        # @return [Enumerable<Char>] The transliterated characters
+        def call(input_chars)
+          offset = 0
+
+          result = input_chars.filter_map do |char|
+            replacement = IDEOGRAPHIC_ANNOTATIONS_MAPPINGS[char.c]
+            c = if replacement
+                  # Skip empty replacements (character removal)
+                  next if replacement.empty?
+
+                  Char.new(c: replacement, offset: offset, source: char)
+                else
+                  char.with_offset(offset)
+                end
+            offset += c.c.length
+            c
+          end
+
+          class << result
+            include Yosina::Chars
+          end
+
+          result
+        end
+      end
+
+      # Factory method to create a ideographic_annotations transliterator
+      #
+      # @param options [Hash] Configuration options
+      # @return [Transliterator] A new ideographic_annotations transliterator instance
+      def self.call(options = {})
+        Transliterator.new(options)
+      end
+    end
+  end
+end

data/lib/yosina/transliterators/ivs_svs_base.rb
@@ -0,0 +1,169 @@
+# frozen_string_literal: true
+
+require_relative 'ivs_svs_base_data'
+
+module Yosina
+  module Transliterators
+    # IVS/SVS base handling with proper forward and reverse transliteration
+    module IvsSvsBase
+      # Forward transliterator to add IVS/SVS selectors to base characters
+      class ForwardTransliterator
+        attr_reader :base_to_variants, :prefer_svs
+
+        # Initialize the forward transliterator with options
+        #
+        # @param base_to_variants [Hash] Mapping of base characters to their IVS/SVS variants
+        # @param prefer_svs [Boolean] Whether to prefer SVS over IVS when both exist
+        def initialize(base_to_variants, prefer_svs)
+          @base_to_variants = base_to_variants
+          @prefer_svs = prefer_svs
+        end
+
+        # Add IVS/SVS selectors to base characters
+        #
+        # @param input_chars [Enumerable<Char>] The characters to transliterate
+        # @return [Enumerable<Char>] The transliterated characters
+        def call(input_chars)
+          offset = 0
+
+          Chars.enum do |y|
+            input_chars.each do |char|
+              # Try to add IVS/SVS selectors to base characters
+              record = @base_to_variants[char.c]
+              replacement = nil
+              if record
+                if @prefer_svs && record.svs
+                  replacement = record.svs
+                elsif record.ivs
+                  replacement = record.ivs
+                end
+              end
+
+              if replacement
+                y << Char.new(c: replacement, offset: offset, source: char)
+                offset += replacement.length
+              else
+                y << char.with_offset(offset)
+                offset += char.c.length
+              end
+            end
+          end
+        end
+      end
+
+      # Reverse transliterator to remove IVS/SVS selectors and get base characters
+      class ReverseTransliterator
+        attr_reader :variants_to_base, :charset, :drop_selectors_altogether
+
+        # Initialize the reverse transliterator with options
+        #
+        # @param variants_to_base [Hash] Mapping of IVS/SVS characters to their base forms
+        # @param charset [String] The charset to use for base mappings ("unijis_90" or "unijis_2004")
+        # @param drop_selectors_altogether [Boolean] Whether to drop all selectors
+        def initialize(variants_to_base, charset, drop_selectors_altogether)
+          @variants_to_base = variants_to_base
+          @charset = charset
+          @drop_selectors_altogether = drop_selectors_altogether
+        end
+
+        # Remove IVS/SVS selectors to get base characters
+        #
+        # @param input_chars [Enumerable<Char>] The characters to transliterate
+        # @return [Enumerable<Char>] The transliterated characters
+        def call(input_chars)
+          offset = 0
+
+          Chars.enum do |y|
+            input_chars.each do |char|
+              replacement = nil
+
+              # Try to remove IVS/SVS selectors
+              record = @variants_to_base[char.c]
+              if record
+                if @charset == 'unijis_2004' && record.base2004
+                  replacement = record.base2004
+                elsif @charset == 'unijis_90' && record.base90
+                  replacement = record.base90
+                end
+              end
+
+              # If no replacement found and drop_selectors_altogether is true,
+              # try to remove variation selectors manually
+              if !replacement && @drop_selectors_altogether && char.c.length > 1
+                second_char = char.c[1]
+                second_char_ord = second_char.ord
+                # Check for variation selectors: U+FE00-U+FE0F or U+E0100-U+E01EF
+                if (second_char_ord >= 0xFE00 && second_char_ord <= 0xFE0F) ||
+                   (second_char_ord >= 0xE0100 && second_char_ord <= 0xE01EF)
+                  replacement = char.c[0]
+                end
+              end
+
+              if replacement
+                y << Char.new(c: replacement, offset: offset, source: char)
+                offset += replacement.length
+              else
+                y << char.with_offset(offset)
+                offset += char.c.length
+              end
+            end
+          end
+        end
+      end
+
+      # Main IVS/SVS base transliterator
+      class Transliterator < Yosina::BaseTransliterator
+        attr_reader :mode, :drop_selectors_altogether, :charset, :prefer_svs, :inner
+
+        # Initialize the transliterator with options
+        #
+        # @param options [Hash] Configuration options
+        # @option options [String] :mode The mode of operation ("ivs-or-svs", "base"). Defaults to "base".
+        #   - "ivs-or-svs": Add IVS/SVS selectors to kanji characters
+        #   - "base": Remove IVS/SVS selectors to get base characters
+        # @option options [Boolean] :drop_selectors_altogether Whether to drop all selectors when mode is "base".
+        #   Defaults to false.
+        # @option options [String] :charset The charset to use for base mappings ("unijis_90" or "unijis_2004").
+        #   Defaults to "unijis_2004".
+        # @option options [Boolean] :prefer_svs When mode is "ivs-or-svs", prefer SVS over IVS if both exist.
+        #   Defaults to false.
+        def initialize(options = {})
+          super()
+          @mode = options[:mode] || 'base'
+          @drop_selectors_altogether = options[:drop_selectors_altogether] || false
+          @charset = options[:charset] || 'unijis_2004'
+          @prefer_svs = options[:prefer_svs] || false
+
+          @inner = if @mode == 'ivs-or-svs'
+                     ForwardTransliterator.new(
+                       IvsSvsBaseData.get_base_to_variants_mappings(@charset),
+                       @prefer_svs
+                     )
+                   else
+                     ReverseTransliterator.new(
+                       IvsSvsBaseData.get_variants_to_base_mappings,
+                       @charset,
+                       @drop_selectors_altogether
+                     )
+                   end
+        end
+
+        # Handle IVS/SVS sequences
+        #
+        # @param input_chars [Enumerable<Char>] The characters to transliterate
+        # @return [Enumerable<Char>] The transliterated characters
+        def call(input_chars)
+          @inner.call(input_chars)
+        end
+      end
+
+      # Factory method to create an IVS/SVS base transliterator
+      #
+      # @param options [Hash] Configuration options
+      # @return [Transliterator] A new IVS/SVS base transliterator instance
+      def self.call(options = {})
+        Transliterator.new(options)
+      end
+    end
+  end
+end
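
In 'base' mode the ReverseTransliterator first consults the generated variants-to-base table, and only when drop_selectors_altogether is set does it fall back to stripping any trailing code point in the variation-selector ranges U+FE00..U+FE0F and U+E0100..U+E01EF. The gem performs that check on Char values that already bundle a base character with its selector; the sketch below does the equivalent range test on a raw string and is not the gem's API.

VARIATION_SELECTOR_RANGES = [0xFE00..0xFE0F, 0xE0100..0xE01EF].freeze

# Drop every code point that falls in the variation selector ranges,
# leaving the base characters untouched.
def drop_variation_selectors(text)
  text.each_char.reject { |ch| VARIATION_SELECTOR_RANGES.any? { |r| r.cover?(ch.ord) } }.join
end

with_ivs = "\u{845B}\u{E0100}\u{98FE}\u{533A}" # 葛 + VS17 (U+E0100), then 飾区
puts drop_variation_selectors(with_ivs)        # => "葛飾区"

In the opposite 'ivs-or-svs' mode, ForwardTransliterator reattaches a selector from the generated base-to-variants table (preferring SVS when prefer_svs is set), which is why initialize picks one of the two inner classes up front.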

data/lib/yosina/transliterators/ivs_svs_base_data.rb: Binary file