yosina 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +36 -0
- data/Gemfile +6 -0
- data/README.ja.md +229 -0
- data/README.md +229 -0
- data/Rakefile +30 -0
- data/codegen/dataset.rb +215 -0
- data/codegen/emitters/circled_or_squared_transliterator_data.rb +30 -0
- data/codegen/emitters/combined_transliterator_data.rb +28 -0
- data/codegen/emitters/hyphens_transliterator_data.rb +48 -0
- data/codegen/emitters/ivs_svs_base_transliterator_data.rb +121 -0
- data/codegen/emitters/simple_transliterator.rb +76 -0
- data/codegen/emitters/utils.rb +45 -0
- data/codegen/emitters.rb +8 -0
- data/codegen/main.rb +109 -0
- data/lib/yosina/char.rb +65 -0
- data/lib/yosina/chars.rb +152 -0
- data/lib/yosina/recipes.rb +359 -0
- data/lib/yosina/transliterator.rb +49 -0
- data/lib/yosina/transliterators/circled_or_squared.rb +67 -0
- data/lib/yosina/transliterators/circled_or_squared_data.rb +469 -0
- data/lib/yosina/transliterators/combined.rb +52 -0
- data/lib/yosina/transliterators/combined_data.rb +495 -0
- data/lib/yosina/transliterators/hira_kata.rb +106 -0
- data/lib/yosina/transliterators/hira_kata_composition.rb +103 -0
- data/lib/yosina/transliterators/hira_kata_table.rb +116 -0
- data/lib/yosina/transliterators/hyphens.rb +83 -0
- data/lib/yosina/transliterators/hyphens_data.rb +60 -0
- data/lib/yosina/transliterators/ideographic_annotations.rb +73 -0
- data/lib/yosina/transliterators/ivs_svs_base.rb +169 -0
- data/lib/yosina/transliterators/ivs_svs_base_data.rb +0 -0
- data/lib/yosina/transliterators/japanese_iteration_marks.rb +261 -0
- data/lib/yosina/transliterators/jisx0201_and_alike.rb +451 -0
- data/lib/yosina/transliterators/kanji_old_new.rb +1137 -0
- data/lib/yosina/transliterators/mathematical_alphanumerics.rb +799 -0
- data/lib/yosina/transliterators/prolonged_sound_marks.rb +206 -0
- data/lib/yosina/transliterators/radicals.rb +361 -0
- data/lib/yosina/transliterators/spaces.rb +79 -0
- data/lib/yosina/transliterators.rb +57 -0
- data/lib/yosina/version.rb +5 -0
- data/lib/yosina.rb +62 -0
- data/yosina.gemspec +41 -0
- metadata +159 -0
data/lib/yosina/char.rb
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
  # A unit of text travelling through the transliteration pipeline, together
  # with its position and, optionally, the Char it was derived from.
  class Char
    attr_accessor :c, :offset, :source

    # Initialize a new character
    #
    # @param c [String] The character string (empty for the sentinel)
    # @param offset [Integer] The offset position in the text
    # @param source [Char, nil] Optional reference to the character this one was derived from
    # rubocop:disable Naming/MethodParameterName
    def initialize(c:, offset:, source: nil)
      @c = c
      @offset = offset
      @source = source
    end
    # rubocop:enable Naming/MethodParameterName

    # Check if the character is a sentinel (empty character)
    #
    # @return [Boolean] true if the character string is empty, false otherwise
    def sentinel?
      @c.empty?
    end

    # Create a new Char with a different offset
    #
    # The returned Char carries the same string and records the receiver as
    # its source.
    #
    # @param offset [Integer] The new offset for the character
    # @return [Char] A new Char instance with the updated offset
    def with_offset(offset)
      Char.new(c: @c, offset: offset, source: self)
    end

    # Check if the character has been transliterated
    #
    # Walks the chain of sources and reports whether the string changed at any
    # link; a Char that was merely re-offset is not considered transliterated.
    #
    # @return [Boolean] true if any Char in the source chain has a different
    #   string than its own source, false otherwise
    def transliterated?
      current = self
      until (origin = current.source).nil?
        return true if current.c != origin.c

        current = origin
      end
      false
    end

    # Value equality: same string, same offset and equal source chains.
    #
    # @param other [Object] The object to compare with
    # @return [Boolean]
    def ==(other)
      return false unless other.is_a?(Char)

      c == other.c && offset == other.offset && source == other.source
    end

    # Keep eql?/hash in agreement with ==, so that equal Chars collapse in
    # Array#uniq and work as Hash keys / Set members.
    alias eql? ==

    # @return [Integer] Hash code derived from the same fields that == compares
    def hash
      [c, offset, source].hash
    end

    # @return [String] The character string
    def to_s
      c
    end

    # @return [String] Debug representation including the whole source chain
    def inspect
      "#<Yosina::Char c=#{c.inspect} offset=#{offset} source=#{source&.inspect}>"
    end
  end
end
|
data/lib/yosina/chars.rb
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
  # Character array building and string conversion utilities
  #
  # Besides the module-level helpers, Chars is mixed into the singleton class
  # of the collections it hands out (arrays and enumerators of Char), so that
  # the results of common Enumerable operations keep responding to #to_s with
  # the joined text.
  module Chars
    # Build a character array from a string, handling IVS/SVS sequences
    #
    # A variation selector (U+FE00-U+FE0F or U+E0100-U+E01EF) that directly
    # follows another character is merged with it into a single Char. Offsets
    # are counted in characters (String#length) of the input.
    #
    # @param input_str [String] The input string to convert to character array
    # @return [Chars] A list of Char objects representing the input string,
    #   with a sentinel empty character at the end
    def self.build_char_array(input_str)
      result = []
      offset = 0
      pending = nil # character waiting to see whether a selector follows it

      input_str.each_char do |char|
        if pending
          codepoint = char.ord
          if (0xFE00..0xFE0F).cover?(codepoint) || (0xE0100..0xE01EF).cover?(codepoint)
            # Combine the pending character with its variation selector
            combined = pending + char
            result << Char.new(c: combined, offset: offset)
            offset += combined.length
            pending = nil
            next
          end

          # The pending character was not followed by a variation selector
          result << Char.new(c: pending, offset: offset)
          offset += pending.length
        end

        pending = char
      end

      # Flush the last character, if any
      if pending
        result << Char.new(c: pending, offset: offset)
        offset += pending.length
      end

      # Add sentinel empty character
      result << Char.new(c: '', offset: offset)

      result.extend(Chars)
    end

    # Convert an array of characters back to a string
    #
    # Sentinel characters (empty strings) used internally by the
    # transliteration system are dropped.
    #
    # @param chars [Enumerable<Char>] An array of Char objects
    # @return [String] A string composed of the non-empty characters
    def self.as_s(chars)
      chars.reject { |char| char.c.empty? }.map(&:c).join
    end

    # Create an enumerator that yields characters from the input
    #
    # @param block [Proc] A block receiving the Enumerator::Yielder to push Char objects into
    # @return [Enumerator] An enumerator, extended with Chars, that yields Char objects
    def self.enum(&block)
      Enumerator.new { |y| block.call(y) }.extend(Chars)
    end

    # @return [String] The text of the collection (see Chars.as_s)
    def to_s
      Chars.as_s(self)
    end

    # Methods producing a collection of slices: every slice is extended with
    # Chars and the slices are returned as an Array.
    %i[
      chunk_while
      partition
      slice_before
      slice_when
    ].each do |chunker|
      define_method(chunker) do |*args, &block|
        # `super` inside define_method needs its arguments spelled out;
        # the implicit form raises RuntimeError.
        slices = super(*args, &block)
        # A call with neither block nor arguments (e.g. bare `partition`)
        # yields an enumerator over elements, not slices; return it untouched.
        next slices if block.nil? && args.empty?

        slices.map { |slice| slice.extend(Chars) }
      end
    end

    # Methods producing a single collection: the result itself is extended
    # with Chars.
    %i[
      chain
      find_all
      drop
      drop_while
      entries
      filter
      grep
      grep_v
      reject
      select
      sort
      sort_by
      take
      take_while
      to_a
    ].each do |method|
      define_method(method) do |*args, &block|
        super(*args, &block).extend(Chars)
      end
    end

    # Like Enumerable#chunk, but returns an Array of [key, slice] pairs whose
    # slices are extended with Chars.
    def chunk(&block)
      super(&block).map { |key, slice| [key, slice.extend(Chars)] }
    end

    # Like Enumerable#group_by, but every group is extended with Chars.
    def group_by(&block)
      super(&block).transform_values { |slice| slice.extend(Chars) }
    end
  end
end
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Main module for Yosina text transliteration library
|
|
4
|
+
module Yosina
|
|
5
|
+
# Internal builder for creating lists of transliterator configurations
|
|
6
|
+
class TransliteratorConfigListBuilder
  attr_reader :head, :tail

  # @param head [Array<Array>] Initial configs for the front section (copied)
  # @param tail [Array<Array>] Initial configs for the back section (copied)
  def initialize(head: [], tail: [])
    @head = head.dup
    @tail = tail.dup
  end

  # Put a config at the very front of the chain.
  #
  # If a config with the same name (first element) is already in the head
  # section it is left alone, or overwritten in place when force_replace is set.
  #
  # @return [TransliteratorConfigListBuilder] self, for chaining
  def insert_head(config, force_replace: false)
    place(@head, config, force_replace) { @head.unshift(config) }
  end

  # Put a config in the middle of the chain, i.e. at the front of the tail
  # section. Same-name handling as #insert_head, applied to the tail section.
  #
  # @return [TransliteratorConfigListBuilder] self, for chaining
  def insert_middle(config, force_replace: false)
    place(@tail, config, force_replace) { @tail.unshift(config) }
  end

  # Put a config at the very end of the chain. Same-name handling as
  # #insert_head, applied to the tail section.
  #
  # @return [TransliteratorConfigListBuilder] self, for chaining
  def insert_tail(config, force_replace: false)
    place(@tail, config, force_replace) { @tail.push(config) }
  end

  # @return [Array<Array>] Head section followed by tail section, as a new array
  def build
    [*@head, *@tail]
  end

  private

  # Shared insertion policy: run the given block when no config with the same
  # name exists in +list+; otherwise overwrite the existing slot only when
  # +force_replace+ is truthy.
  def place(list, config, force_replace)
    slot = list.index { |entry| entry[0] == config[0] }
    if slot.nil?
      yield
    elsif force_replace
      list[slot] = config
    end
    self
  end
end
|
|
52
|
+
|
|
53
|
+
# Configuration recipe for building transliterator chains
|
|
54
|
+
class TransliterationRecipe
  attr_accessor :kanji_old_new, :hira_kata, :replace_japanese_iteration_marks,
                :replace_suspicious_hyphens_to_prolonged_sound_marks,
                :replace_combined_characters, :replace_circled_or_squared_characters,
                :replace_ideographic_annotations, :replace_radicals, :replace_spaces,
                :replace_hyphens, :replace_mathematical_alphanumerics,
                :combine_decomposed_hiraganas_and_katakanas, :to_fullwidth, :to_halfwidth,
                :remove_ivs_svs, :charset

  # Create a recipe. Every transformation is off unless requested.
  #
  # @param kanji_old_new [Boolean] Replace old-style kanji glyphs with modern equivalents
  #   @example
  #     # Input:  "舊字體の變換"
  #     # Output: "旧字体の変換"
  # @param hira_kata [String, nil] Convert between hiragana and katakana scripts
  #   @example
  #     # Input:  "ひらがな" (with 'hira-to-kata')
  #     # Output: "ヒラガナ"
  #     # Input:  "カタカナ" (with 'kata-to-hira')
  #     # Output: "かたかな"
  # @param replace_japanese_iteration_marks [Boolean] Replace Japanese iteration marks with the
  #   characters they represent
  #   @example
  #     # Input:  "時々"
  #     # Output: "時時"
  #     # Input:  "いすゞ"
  #     # Output: "いすず"
  # @param replace_suspicious_hyphens_to_prolonged_sound_marks [Boolean] Replace suspicious hyphens
  #   with prolonged sound marks
  #   @example
  #     # Input:  "スーパ-" (with hyphen-minus)
  #     # Output: "スーパー" (becomes prolonged sound mark)
  # @param replace_combined_characters [Boolean] Replace combined characters with their
  #   corresponding characters
  #   @example
  #     # Input:  "㍻" (single character for Heisei era)
  #     # Output: "平成"
  #     # Input:  "㈱"
  #     # Output: "(株)"
  # @param replace_circled_or_squared_characters [Boolean, String] Replace circled or squared
  #   characters with templates; the string 'exclude-emojis' turns emoji handling off
  #   @example
  #     # Input:  "①②③"
  #     # Output: "(1)(2)(3)"
  #     # Input:  "㊙㊗"
  #     # Output: "(秘)(祝)"
  # @param replace_ideographic_annotations [Boolean] Replace ideographic annotations
  #   @example
  #     # Input:  "㆖㆘" (ideographic annotations)
  #     # Output: "上下"
  # @param replace_radicals [Boolean] Replace Kangxi radicals with CJK ideographs
  #   @example
  #     # Input:  "⾔⾨⾷" (Kangxi radicals)
  #     # Output: "言門食" (CJK ideographs)
  # @param replace_spaces [Boolean] Replace various space characters
  #   @example
  #     # Input:  "A\u3000B" (ideographic space U+3000)
  #     # Output: "A B" (half-width space)
  #     # Input:  "A\u00A0B" (non-breaking space U+00A0)
  #     # Output: "A B" (regular space)
  # @param replace_hyphens [Boolean, Array] Replace various dash/hyphen symbols; an Array is
  #   passed through as the precedence list
  #   @example
  #     # Input:  "2019—2020" (em dash)
  #     # Output: "2019-2020" (hyphen-minus)
  #     # Input:  "A–B" (en dash)
  #     # Output: "A-B"
  # @param replace_mathematical_alphanumerics [Boolean] Replace mathematical alphanumerics
  #   @example
  #     # Input:  "𝐀𝐁𝐂" (mathematical bold)
  #     # Output: "ABC"
  #     # Input:  "𝟏𝟐𝟑" (mathematical bold digits)
  #     # Output: "123"
  # @param combine_decomposed_hiraganas_and_katakanas [Boolean] Combine decomposed hiraganas/katakanas
  #   @example
  #     # Input:  か followed by a voiced sound mark (two characters)
  #     # Output: "が" (single character)
  #     # Input:  ヘ followed by a semi-voiced sound mark (two characters)
  #     # Output: "ペ" (single character)
  # @param to_fullwidth [Boolean, String] Replace half-width with fullwidth characters; the string
  #   'u005c-as-yen-sign' enables the matching transliterator option
  #   @example
  #     # Input:  "ABC123"
  #     # Output: "ＡＢＣ１２３"
  #     # Input:  "ｶﾀｶﾅ"
  #     # Output: "カタカナ"
  # @param to_halfwidth [Boolean, String] Replace full-width with half-width characters; the string
  #   'hankaku-kana' also converts katakana
  #   @example
  #     # Input:  "ＡＢＣ１２３"
  #     # Output: "ABC123"
  #     # Input:  "カタカナ" (with 'hankaku-kana')
  #     # Output: "ｶﾀｶﾅ"
  # @param remove_ivs_svs [Boolean, String] Remove IVS/SVS selectors; the string
  #   'drop-all-selectors' drops selectors altogether
  #   @example
  #     # Input:  "葛\u{E0100}" (葛 + IVS U+E0100)
  #     # Output: "葛" (without selector)
  #     # Input:  "辻\u{E0100}" (辻 + IVS)
  #     # Output: "辻"
  # @param charset [String] Charset for IVS/SVS transliteration
  # rubocop:disable Metrics/ParameterLists
  def initialize(kanji_old_new: false, hira_kata: nil, replace_japanese_iteration_marks: false,
                 replace_suspicious_hyphens_to_prolonged_sound_marks: false,
                 replace_combined_characters: false, replace_circled_or_squared_characters: false,
                 replace_ideographic_annotations: false, replace_radicals: false,
                 replace_spaces: false, replace_hyphens: false,
                 replace_mathematical_alphanumerics: false,
                 combine_decomposed_hiraganas_and_katakanas: false,
                 to_fullwidth: false, to_halfwidth: false, remove_ivs_svs: false,
                 charset: 'unijis_2004')
    @kanji_old_new = kanji_old_new
    @hira_kata = hira_kata
    @replace_japanese_iteration_marks = replace_japanese_iteration_marks
    @replace_suspicious_hyphens_to_prolonged_sound_marks = replace_suspicious_hyphens_to_prolonged_sound_marks
    @replace_combined_characters = replace_combined_characters
    @replace_circled_or_squared_characters = replace_circled_or_squared_characters
    @replace_ideographic_annotations = replace_ideographic_annotations
    @replace_radicals = replace_radicals
    @replace_spaces = replace_spaces
    @replace_hyphens = replace_hyphens
    @replace_mathematical_alphanumerics = replace_mathematical_alphanumerics
    @combine_decomposed_hiraganas_and_katakanas = combine_decomposed_hiraganas_and_katakanas
    @to_fullwidth = to_fullwidth
    @to_halfwidth = to_halfwidth
    @remove_ivs_svs = remove_ivs_svs
    @charset = charset
  end
  # rubocop:enable Metrics/ParameterLists

  # Translate this recipe into an ordered list of transliterator configs
  #
  # @return [Array<Array>] Array of [name, options] transliterator configurations
  # @raise [ArgumentError] If the recipe contains mutually exclusive options
  def build_transliterator_configs
    problems = []
    problems << 'to_fullwidth and to_halfwidth are mutually exclusive' if to_fullwidth && to_halfwidth
    raise ArgumentError, problems.join('; ') unless problems.empty?

    # The order of the steps matters: each step inserts relative to what the
    # earlier steps have already placed in the builder.
    steps = %i[
      apply_kanji_old_new
      apply_replace_suspicious_hyphens_to_prolonged_sound_marks
      apply_replace_circled_or_squared_characters
      apply_replace_combined_characters
      apply_replace_ideographic_annotations
      apply_replace_radicals
      apply_replace_spaces
      apply_replace_hyphens
      apply_replace_mathematical_alphanumerics
      apply_combine_decomposed_hiraganas_and_katakanas
      apply_to_fullwidth
      apply_hira_kata
      apply_replace_japanese_iteration_marks
      apply_to_halfwidth
      apply_remove_ivs_svs
    ]
    steps.reduce(TransliteratorConfigListBuilder.new) { |ctx, step| send(step, ctx) }.build
  end

  private

  # Register the IVS/SVS pair: an 'ivs-or-svs' pass at the head of the chain
  # and a 'base' pass at its tail, overwriting earlier registrations.
  def remove_ivs_svs_helper(ctx, drop_all_selectors)
    leading = [:ivs_svs_base, { mode: 'ivs-or-svs', charset: @charset }]
    trailing = [:ivs_svs_base,
                { mode: 'base', drop_selectors_altogether: drop_all_selectors, charset: @charset }]
    ctx.insert_head(leading, force_replace: true).insert_tail(trailing, force_replace: true)
  end

  def apply_kanji_old_new(ctx)
    return ctx unless @kanji_old_new

    remove_ivs_svs_helper(ctx, false).insert_middle([:kanji_old_new, {}])
  end

  def apply_hira_kata(ctx)
    return ctx unless @hira_kata

    ctx.insert_middle([:hira_kata, { mode: @hira_kata }])
  end

  def apply_replace_japanese_iteration_marks(ctx)
    return ctx unless @replace_japanese_iteration_marks

    # Composition goes to the head first; the iteration-mark pass itself
    # is placed in the middle.
    ctx.insert_head([:hira_kata_composition, { compose_non_combining_marks: true }])
       .insert_middle([:japanese_iteration_marks, {}])
  end

  def apply_replace_suspicious_hyphens_to_prolonged_sound_marks(ctx)
    return ctx unless @replace_suspicious_hyphens_to_prolonged_sound_marks

    ctx.insert_middle([:prolonged_sound_marks, { replace_prolonged_marks_following_alnums: true }])
  end

  def apply_replace_combined_characters(ctx)
    ctx.tap { |builder| builder.insert_middle([:combined, {}]) if @replace_combined_characters }
  end

  def apply_replace_circled_or_squared_characters(ctx)
    return ctx unless @replace_circled_or_squared_characters

    with_emojis = @replace_circled_or_squared_characters != 'exclude-emojis'
    ctx.insert_middle([:circled_or_squared, { include_emojis: with_emojis }])
  end

  def apply_replace_ideographic_annotations(ctx)
    return ctx unless @replace_ideographic_annotations

    ctx.insert_middle([:ideographic_annotations, {}])
  end

  def apply_replace_radicals(ctx)
    return ctx unless @replace_radicals

    ctx.insert_middle([:radicals, {}])
  end

  def apply_replace_spaces(ctx)
    return ctx unless @replace_spaces

    ctx.insert_middle([:spaces, {}])
  end

  def apply_replace_hyphens(ctx)
    return ctx unless @replace_hyphens

    order = @replace_hyphens.is_a?(Array) ? @replace_hyphens : %i[jisx0208_90_windows jisx0201]
    ctx.insert_middle([:hyphens, { precedence: order }])
  end

  def apply_replace_mathematical_alphanumerics(ctx)
    return ctx unless @replace_mathematical_alphanumerics

    ctx.insert_middle([:mathematical_alphanumerics, {}])
  end

  def apply_combine_decomposed_hiraganas_and_katakanas(ctx)
    return ctx unless @combine_decomposed_hiraganas_and_katakanas

    ctx.insert_head([:hira_kata_composition, { compose_non_combining_marks: true }])
  end

  def apply_to_fullwidth(ctx)
    ctx.tap do |builder|
      next unless @to_fullwidth

      yen = @to_fullwidth == 'u005c-as-yen-sign'
      builder.insert_tail([:jisx0201_and_alike, { fullwidth_to_halfwidth: false, u005c_as_yen_sign: yen }])
    end
  end

  def apply_to_halfwidth(ctx)
    return ctx unless @to_halfwidth

    kana_too = @to_halfwidth == 'hankaku-kana'
    ctx.insert_tail([:jisx0201_and_alike,
                     { fullwidth_to_halfwidth: true, convert_gl: true, convert_gr: kana_too }])
  end

  def apply_remove_ivs_svs(ctx)
    return ctx unless @remove_ivs_svs

    remove_ivs_svs_helper(ctx, @remove_ivs_svs == 'drop-all-selectors')
  end
end
|
|
351
|
+
|
|
352
|
+
# Build an array of transliterator configs from a recipe object
#
# Thin convenience wrapper: the work is delegated entirely to the recipe's
# own #build_transliterator_configs, and whatever that call returns or raises
# is passed through unchanged.
#
# @param recipe [TransliterationRecipe] A TransliterationRecipe object
# @return [Array<Array>] Array of transliterator configurations
def self.build_transliterator_configs_from_recipe(recipe)
  recipe.build_transliterator_configs
end
|
|
359
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Yosina
  # Pairs the name of a transliterator with the options used to build it
  class TransliteratorConfig
    attr_accessor :name, :options

    # @param name [String, Symbol] The name of the transliterator
    # @param options [Hash, nil] Configuration options for the transliterator
    def initialize(name, options = nil)
      @name, @options = name, options
    end
  end

  # Common ancestor of every transliterator; defines the #call contract
  class BaseTransliterator
    # Transliterate a sequence of characters. Abstract here.
    #
    # @param _input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The transliterated characters
    # @raise [NotImplementedError] Always, unless overridden by a subclass
    def call(_input_chars)
      raise NotImplementedError, 'Subclasses must implement call method'
    end
  end

  # Runs several transliterators one after another
  class ChainedTransliterator < BaseTransliterator
    # @param transliterators [Array<BaseTransliterator>] The stages, in execution order
    def initialize(transliterators)
      super()
      @transliterators = transliterators
    end

    # Feed the input through every stage; each stage receives the previous
    # stage's output. With no stages the input is returned as is.
    #
    # @param input_chars [Enumerable<Char>] The characters to transliterate
    # @return [Enumerable<Char>] The output of the last stage
    def call(input_chars)
      current = input_chars
      @transliterators.each { |stage| current = stage.call(current) }
      current
    end
  end
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'circled_or_squared_data'
|
|
4
|
+
|
|
5
|
+
module Yosina
  module Transliterators
    # Replace circled or squared characters with their corresponding templates
    module CircledOrSquared
      # Transliterator for circled or squared characters
      class Transliterator < Yosina::BaseTransliterator
        # Initialize the transliterator with options
        #
        # @param options [Hash] Configuration options
        # @option options [Boolean] :include_emojis Whether to also replace
        #   mappings flagged as emoji (default: false)
        # @option options [Hash] :templates Custom templates, keyed by
        #   'circle' / 'square' (string or symbol keys). The first '?' of a
        #   template is replaced by the rendering of the character.
        #   Defaults: '(?)' for circle and '[?]' for square.
        def initialize(options = {})
          super()
          @include_emojis = options[:include_emojis] || false
          templates = options[:templates] || {}
          @templates = {
            'c' => templates['circle'] || templates[:circle] || '(?)',
            's' => templates['square'] || templates[:square] || '[?]'
          }
        end

        # Replace circled or squared characters with their corresponding templates
        #
        # Characters found in CIRCLED_OR_SQUARED_MAPPINGS are expanded to the
        # template of their type, one output Char per character of the
        # expansion, each pointing at the input Char as its source. All other
        # characters are copied with a recomputed offset.
        #
        # @param input_chars [Enumerable<Char>] The characters to transliterate
        # @return [Enumerable<Char>] The transliterated characters
        def call(input_chars)
          Chars.enum do |y|
            # Reset per enumeration so that iterating the result more than
            # once does not continue counting from the previous pass.
            offset = 0

            input_chars.each do |char|
              mapping = CIRCLED_OR_SQUARED_MAPPINGS[char.c]
              if mapping && (!mapping[:emoji] || @include_emojis)
                # Work on a copy: a caller-supplied template is usually not
                # frozen, and substituting in place would destroy its '?'
                # placeholder for every later character.
                replacement = @templates[mapping[:type]].dup
                replacement['?'] = mapping[:rendering]

                replacement.each_char do |replacement_char|
                  y << Char.new(c: replacement_char, offset: offset, source: char)
                  offset += replacement_char.length
                end
              else
                y << Char.new(c: char.c, offset: offset, source: char.source)
                offset += char.c.length
              end
            end
          end
        end
      end

      # Factory method to create a circled-or-squared transliterator
      #
      # @param options [Hash] Configuration options (see Transliterator#initialize)
      # @return [Transliterator] A new circled-or-squared transliterator instance
      def self.call(options = {})
        Transliterator.new(options)
      end
    end
  end
end
|