unicode_madness 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,307 @@
|
|
1
|
+
class JapaneseString < UnicodeString
|
2
|
+
# Character-class body matching any kanji character, built from the three
# codepoint ranges listed below. Interpolate it between square brackets in a
# regular expression. (Example: <tt>/[#{KANJI_CLASS}]/</tt>)
KANJI_CLASS = [
  [0x4e00, 0x9fbf],
  [0x3400, 0x4dbf],
  [0x20000, 0x2a6df]
].map { |first, last| "#{UCSCodepoint.new(first)}-#{UCSCodepoint.new(last)}" }.join
|
8
|
+
|
9
|
+
# A string that can be used in a regular expression character class to match
# any katakana character. (Example: <tt>/[#{KATAKANA_CLASS}]/</tt>)
#
# The range starts at U+30A1 (small a) rather than U+30A2 so that the small
# kana that KATAKANA_TO_HIRAGANA already treats as katakana is matched too.
KATAKANA_CLASS = "#{UCSCodepoint.new(0x30a1)}-#{UCSCodepoint.new(0x30ff)}"
|
12
|
+
|
13
|
+
# Character-class body matching any hiragana or katakana character, built
# from the two codepoint ranges listed below. Interpolate it between square
# brackets in a regular expression. (Example: <tt>/[#{KANA_CLASS}]/</tt>)
KANA_CLASS = [
  [0x3040, 0x30ff],
  [0x31f0, 0x31ff]
].map { |first, last| "#{UCSCodepoint.new(first)}-#{UCSCodepoint.new(last)}" }.join
|
18
|
+
|
19
|
+
# Lookup table from each katakana character to its hiragana counterpart.
# Entries are grouped one kana row per line.
KATAKANA_TO_HIRAGANA = {
  # Plain kana.
  'ア' => 'あ', 'イ' => 'い', 'ウ' => 'う', 'エ' => 'え', 'オ' => 'お',
  'カ' => 'か', 'キ' => 'き', 'ク' => 'く', 'ケ' => 'け', 'コ' => 'こ',
  'サ' => 'さ', 'シ' => 'し', 'ス' => 'す', 'セ' => 'せ', 'ソ' => 'そ',
  'タ' => 'た', 'チ' => 'ち', 'ツ' => 'つ', 'テ' => 'て', 'ト' => 'と',
  'ナ' => 'な', 'ニ' => 'に', 'ヌ' => 'ぬ', 'ネ' => 'ね', 'ノ' => 'の',
  'ハ' => 'は', 'ヒ' => 'ひ', 'フ' => 'ふ', 'ヘ' => 'へ', 'ホ' => 'ほ',
  'マ' => 'ま', 'ミ' => 'み', 'ム' => 'む', 'メ' => 'め', 'モ' => 'も',
  'ヤ' => 'や', 'ユ' => 'ゆ', 'ヨ' => 'よ',
  'ラ' => 'ら', 'リ' => 'り', 'ル' => 'る', 'レ' => 'れ', 'ロ' => 'ろ',
  'ワ' => 'わ', 'ヰ' => 'ゐ', 'ヱ' => 'ゑ', 'ヲ' => 'を', 'ン' => 'ん',
  # Kana carrying a voicing mark.
  'ガ' => 'が', 'ギ' => 'ぎ', 'グ' => 'ぐ', 'ゲ' => 'げ', 'ゴ' => 'ご',
  'ザ' => 'ざ', 'ジ' => 'じ', 'ズ' => 'ず', 'ゼ' => 'ぜ', 'ゾ' => 'ぞ',
  'ダ' => 'だ', 'ヂ' => 'ぢ', 'ヅ' => 'づ', 'デ' => 'で', 'ド' => 'ど',
  'バ' => 'ば', 'ビ' => 'び', 'ブ' => 'ぶ', 'ベ' => 'べ', 'ボ' => 'ぼ',
  'パ' => 'ぱ', 'ピ' => 'ぴ', 'プ' => 'ぷ', 'ペ' => 'ぺ', 'ポ' => 'ぽ',
  # Small kana.
  'ァ' => 'ぁ', 'ィ' => 'ぃ', 'ゥ' => 'ぅ', 'ェ' => 'ぇ', 'ォ' => 'ぉ',
  'ャ' => 'ゃ', 'ュ' => 'ゅ', 'ョ' => 'ょ', 'ッ' => 'っ'
}
|
36
|
+
|
37
|
+
# Lookup table from each voiced (or p-row) hiragana and katakana character to
# the plain kana it is derived from. Entries are grouped one kana row per line.
UNVOICED_KANA = {
  # Hiragana.
  'が' => 'か', 'ぎ' => 'き', 'ぐ' => 'く', 'げ' => 'け', 'ご' => 'こ',
  'ざ' => 'さ', 'じ' => 'し', 'ず' => 'す', 'ぜ' => 'せ', 'ぞ' => 'そ',
  'だ' => 'た', 'ぢ' => 'ち', 'づ' => 'つ', 'で' => 'て', 'ど' => 'と',
  'ば' => 'は', 'び' => 'ひ', 'ぶ' => 'ふ', 'べ' => 'へ', 'ぼ' => 'ほ',
  'ぱ' => 'は', 'ぴ' => 'ひ', 'ぷ' => 'ふ', 'ぺ' => 'へ', 'ぽ' => 'ほ',
  # Katakana.
  'ヴ' => 'ウ',
  'ガ' => 'カ', 'ギ' => 'キ', 'グ' => 'ク', 'ゲ' => 'ケ', 'ゴ' => 'コ',
  'ザ' => 'サ', 'ジ' => 'シ', 'ズ' => 'ス', 'ゼ' => 'セ', 'ゾ' => 'ソ',
  'ダ' => 'タ', 'ヂ' => 'チ', 'ヅ' => 'ツ', 'デ' => 'テ', 'ド' => 'ト',
  'バ' => 'ハ', 'ビ' => 'ヒ', 'ブ' => 'フ', 'ベ' => 'ヘ', 'ボ' => 'ホ',
  'パ' => 'ハ', 'ピ' => 'ヒ', 'プ' => 'フ', 'ペ' => 'ヘ', 'ポ' => 'ホ'
}
|
49
|
+
|
50
|
+
# Table for converting unvoiced hiragana and katakana to their voiced forms.
#
# Each plain kana appears exactly once. The h-row maps to the b-row (ba, bi,
# bu, be, bo); the p-row forms are deliberately absent, because listing them
# under the same keys would silently replace the b-row entries (in a Hash
# literal the last duplicate key wins).
VOICED_KANA = {
  # Hiragana.
  'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご',
  'さ' => 'ざ', 'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ',
  'た' => 'だ', 'ち' => 'ぢ', 'つ' => 'づ', 'て' => 'で', 'と' => 'ど',
  'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ', 'へ' => 'べ', 'ほ' => 'ぼ',
  # Katakana.
  'ウ' => 'ヴ',
  'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ', 'ケ' => 'ゲ', 'コ' => 'ゴ',
  'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ', 'ソ' => 'ゾ',
  'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド',
  'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ'
}
|
62
|
+
|
63
|
+
# Maps kana to their romanized equivalents. Also maps full-width Latin
# characters (digits, punctuation and letters in the U+FF01..U+FF5D range) to
# their ASCII equivalents.
#
# The full-width keys must stay full-width: if they are normalized to ASCII
# the Latin entries become no-op identity mappings and the quote and
# backslash keys stop being valid string literals.
KANA_ROMAJI_MAP = {
  # Hiragana.
  "あ" => "a", "い" => "i", "う" => "u", "え" => "e", "お" => "o",
  "か" => "ka", "き" => "ki", "く" => "ku", "け" => "ke", "こ" => "ko",
  "さ" => "sa", "し" => "shi", "す" => "su", "せ" => "se", "そ" => "so",
  "た" => "ta", "ち" => "chi", "つ" => "tsu", "て" => "te", "と" => "to",
  "な" => "na", "に" => "ni", "ぬ" => "nu", "ね" => "ne", "の" => "no",
  "は" => "ha", "ひ" => "hi", "ふ" => "fu", "へ" => "he", "ほ" => "ho",
  "ま" => "ma", "み" => "mi", "む" => "mu", "め" => "me", "も" => "mo",
  "や" => "ya", "ゆ" => "yu", "よ" => "yo",
  "ら" => "ra", "り" => "ri", "る" => "ru", "れ" => "re", "ろ" => "ro",
  "わ" => "wa", "ゐ" => "wi", "ゑ" => "we", "を" => "wo", "ん" => "n",
  "が" => "ga", "ぎ" => "gi", "ぐ" => "gu", "げ" => "ge", "ご" => "go",
  "ざ" => "za", "じ" => "ji", "ず" => "zu", "ぜ" => "ze", "ぞ" => "zo",
  "だ" => "da", "ぢ" => "ji", "づ" => "zu", "で" => "de", "ど" => "do",
  "ば" => "ba", "び" => "bi", "ぶ" => "bu", "べ" => "be", "ぼ" => "bo",
  "ぱ" => "pa", "ぴ" => "pi", "ぷ" => "pu", "ぺ" => "pe", "ぽ" => "po",
  # Katakana.
  "ア" => "a", "イ" => "i", "ウ" => "u", "エ" => "e", "オ" => "o",
  "カ" => "ka", "キ" => "ki", "ク" => "ku", "ケ" => "ke", "コ" => "ko",
  "サ" => "sa", "シ" => "shi", "ス" => "su", "セ" => "se", "ソ" => "so",
  "タ" => "ta", "チ" => "chi", "ツ" => "tsu", "テ" => "te", "ト" => "to",
  "ナ" => "na", "ニ" => "ni", "ヌ" => "nu", "ネ" => "ne", "ノ" => "no",
  "ハ" => "ha", "ヒ" => "hi", "フ" => "fu", "ヘ" => "he", "ホ" => "ho",
  "マ" => "ma", "ミ" => "mi", "ム" => "mu", "メ" => "me", "モ" => "mo",
  "ヤ" => "ya", "ユ" => "yu", "ヨ" => "yo",
  "ラ" => "ra", "リ" => "ri", "ル" => "ru", "レ" => "re", "ロ" => "ro",
  "ワ" => "wa", "ヰ" => "wi", "ヱ" => "we", "ヲ" => "wo", "ン" => "n",
  "ガ" => "ga", "ギ" => "gi", "グ" => "gu", "ゲ" => "ge", "ゴ" => "go",
  "ザ" => "za", "ジ" => "ji", "ズ" => "zu", "ゼ" => "ze", "ゾ" => "zo",
  "ダ" => "da", "ヂ" => "ji", "ヅ" => "zu", "デ" => "de", "ド" => "do",
  "バ" => "ba", "ビ" => "bi", "ブ" => "bu", "ベ" => "be", "ボ" => "bo",
  "パ" => "pa", "ピ" => "pi", "プ" => "pu", "ペ" => "pe", "ポ" => "po",
  "ヴ" => "vu",
  # Katakana middle dot becomes a word separator.
  "・" => " ",
  # Full-width digits.
  "０" => "0", "１" => "1", "２" => "2", "３" => "3", "４" => "4",
  "５" => "5", "６" => "6", "７" => "7", "８" => "8", "９" => "9",
  # Full-width punctuation.
  # NOTE(review): the full-width comma maps to "." (not ","); this is kept
  # exactly as found - confirm it is intended.
  "！" => "!", "＂" => "\"", "＃" => "#", "＄" => "\$", "％" => "%",
  "＆" => "&", "＇" => "'", "（" => "(", "）" => ")", "＊" => "*",
  "＋" => "+", "，" => ".", "－" => "-", "．" => ".", "／" => "/",
  "：" => ":", "；" => ";", "＜" => "<", "＝" => "=", "＞" => ">",
  "？" => "?", "＠" => "\@",
  # Full-width upper-case letters.
  "Ａ" => "A", "Ｂ" => "B", "Ｃ" => "C", "Ｄ" => "D", "Ｅ" => "E", "Ｆ" => "F",
  "Ｇ" => "G", "Ｈ" => "H", "Ｉ" => "I", "Ｊ" => "J", "Ｋ" => "K", "Ｌ" => "L",
  "Ｍ" => "M", "Ｎ" => "N", "Ｏ" => "O", "Ｐ" => "P", "Ｑ" => "Q", "Ｒ" => "R",
  "Ｓ" => "S", "Ｔ" => "T", "Ｕ" => "U", "Ｖ" => "V", "Ｗ" => "W", "Ｘ" => "X",
  "Ｙ" => "Y", "Ｚ" => "Z",
  "［" => "[", "＼" => "\\", "］" => "]", "＾" => "^", "＿" => "_", "｀" => "`",
  # Full-width lower-case letters.
  "ａ" => "a", "ｂ" => "b", "ｃ" => "c", "ｄ" => "d", "ｅ" => "e", "ｆ" => "f",
  "ｇ" => "g", "ｈ" => "h", "ｉ" => "i", "ｊ" => "j", "ｋ" => "k", "ｌ" => "l",
  "ｍ" => "m", "ｎ" => "n", "ｏ" => "o", "ｐ" => "p", "ｑ" => "q", "ｒ" => "r",
  "ｓ" => "s", "ｔ" => "t", "ｕ" => "u", "ｖ" => "v", "ｗ" => "w", "ｘ" => "x",
  "ｙ" => "y", "ｚ" => "z",
  "｛" => "{", "｜" => "|", "｝" => "}",
  # Wave dash is written as a hyphen.
  "〜" => "-"
}
|
112
|
+
|
113
|
+
# Returns a new string with this string's katakana replaced with equivalent
# hiragana. Characters that have no entry in KATAKANA_TO_HIRAGANA are copied
# through unchanged. The result is built with the receiver's own class.
def to_hiragana
  # Translate per character and join once, instead of growing a string with
  # += (which allocates a new string on every iteration).
  converted = split('').map { |ch| KATAKANA_TO_HIRAGANA.fetch(ch, ch) }
  self.class.new(converted.join)
end
|
126
|
+
|
127
|
+
# Returns a new string with this string's voiced hiragana and katakana
# replaced with their unvoiced forms. Characters that have no entry in
# UNVOICED_KANA are copied through unchanged. The result is built with the
# receiver's own class.
def unvoice_kana
  # Translate per character and join once, instead of growing a string with
  # += (which allocates a new string on every iteration).
  converted = split('').map { |ch| UNVOICED_KANA.fetch(ch, ch) }
  self.class.new(converted.join)
end
|
140
|
+
|
141
|
+
# Returns a new string with this string's unvoiced hiragana and katakana
# replaced with their voiced forms. Characters that have no entry in
# VOICED_KANA are copied through unchanged. The result is built with the
# receiver's own class.
def voice_kana
  # Translate per character and join once, instead of growing a string with
  # += (which allocates a new string on every iteration).
  converted = split('').map { |ch| VOICED_KANA.fetch(ch, ch) }
  self.class.new(converted.join)
end
|
154
|
+
|
155
|
+
# Contracted sounds (a kana followed by a small ya/yu/yo/e) as
# [hiragana, katakana, romaji] rows. Used by romanize before the
# per-character table is applied.
YOON_ROMAJI = [
  ['きゃ', 'キャ', 'kya'], ['きゅ', 'キュ', 'kyu'], ['きょ', 'キョ', 'kyo'],
  ['しゃ', 'シャ', 'sha'], ['しゅ', 'シュ', 'shu'], ['しぇ', 'シェ', 'she'],
  ['しょ', 'ショ', 'sho'],
  ['ちゃ', 'チャ', 'cha'], ['ちゅ', 'チュ', 'chu'], ['ちょ', 'チョ', 'cho'],
  ['にゃ', 'ニャ', 'nya'], ['にゅ', 'ニュ', 'nyu'], ['にょ', 'ニョ', 'nyo'],
  ['ひゃ', 'ヒャ', 'hya'], ['ひゅ', 'ヒュ', 'hyu'], ['ひょ', 'ヒョ', 'hyo'],
  ['みゃ', 'ミャ', 'mya'], ['みゅ', 'ミュ', 'myu'], ['みょ', 'ミョ', 'myo'],
  ['りゃ', 'リャ', 'rya'], ['りゅ', 'リュ', 'ryu'], ['りょ', 'リョ', 'ryo'],
  ['ぎゃ', 'ギャ', 'gya'], ['ぎゅ', 'ギュ', 'gyu'], ['ぎょ', 'ギョ', 'gyo'],
  ['じゃ', 'ジャ', 'ja'], ['じゅ', 'ジュ', 'ju'], ['じょ', 'ジョ', 'jo'],
  ['ぢゃ', 'ヂャ', 'ja'], ['ぢゅ', 'ヂュ', 'ju'], ['ぢょ', 'ヂョ', 'jo'],
  ['びゃ', 'ビャ', 'bya'], ['びゅ', 'ビュ', 'byu'], ['びょ', 'ビョ', 'byo'],
  ['ぴゃ', 'ピャ', 'pya'], ['ぴゅ', 'ピュ', 'pyu'], ['ぴょ', 'ピョ', 'pyo']
]

# Extended kana (a kana followed by a small vowel or small yu) as
# [kana, romaji] pairs. Each sequence is listed once; entries that could
# never match because an earlier substitution already consumes the same
# sequence have been dropped.
EXTENDED_KANA_ROMAJI = [
  ['ふぁ', 'fa'], ['でぃ', 'di'],
  ['イェ', 'ye'], ['ウィ', 'wi'], ['ウェ', 'we'], ['ウォ', 'wo'],
  ['ヴァ', 'va'], ['ヴィ', 'vi'], ['ヴゥ', 'vu'], ['ヴェ', 've'], ['ヴォ', 'vo'],
  ['ジェ', 'je'], ['チェ', 'che'],
  ['ティ', 'ti'], ['トゥ', 'tu'], ['ディ', 'di'], ['ドゥ', 'du'], ['デュ', 'dyu'],
  ['ツァ', 'tsa'], ['ツェ', 'tse'], ['ツォ', 'tso'],
  ['ファ', 'fa'], ['フィ', 'fi'], ['フェ', 'fe'], ['フォ', 'fo'], ['フュ', 'fyu'],
  ['スィ', 'si'], ['ゲィ', 'gei'], ['ワァ', 'waa'], ['ツィ', 'tsui'],
  ['シィ', 'shii'], ['ウァ', 'ua'], ['ヴュ', 'vyu'], ['クォ', 'quo'],
  ['テュ', 'tu'], ['グィ', 'gui'], ['クェ', 'que'], ['ビィ', 'bii'],
  ['ズィ', 'zi'], ['リィ', 'rii']
]

# Creates a new string by romanizing the kana in this string. Full-width
# Latin characters are also converted to their ASCII equivalents. If
# +warnings+ is true (the default), a message is printed on +STDERR+ if an
# un-romanizable character is encountered.
#
# Conversion runs in three passes: two-kana sequences (YOON_ROMAJI and
# EXTENDED_KANA_ROMAJI), then single characters via KANA_ROMAJI_MAP, then
# the letter-doublers (small tsu and the katakana long-vowel dash). Returns a
# plain String; the receiver is not modified.
def romanize(warnings = true)
  romanized = String.new(self)

  # Two-kana sequences must be replaced before the per-character pass,
  # otherwise each half would be romanized on its own.
  YOON_ROMAJI.each do |hiragana, katakana, romaji|
    romanized.gsub!(hiragana, romaji)
    romanized.gsub!(katakana, romaji)
  end
  EXTENDED_KANA_ROMAJI.each { |kana, romaji| romanized.gsub!(kana, romaji) }

  # Do simple conversions.
  chars = romanized.split('').map do |ch|
    converted = KANA_ROMAJI_MAP.fetch(ch, ch)
    # Letter-doublers are allowed through here; they are resolved below.
    if warnings && converted !~ /\A[A-Za-z&\d\.\-ッっー ]+\Z/
      STDERR.puts "Couldn't romanize #{ch} in #{self}"
    end
    converted
  end
  romanized = chars.join('')

  # Convert letter-doublers (small tsu and katakana dash).
  romanized.gsub!(/[ッっ](.)/, '\1\1')
  romanized.gsub!(/(.)ー/, '\1\1')

  romanized
end
|
241
|
+
|
242
|
+
# Creates a 7-bit-safe string that can be used to sort strings containing
# kana and/or English text.
#
# The receiver is downcased and each character is replaced by one ASCII
# character:
#
# * digits 0-9 become <tt>!</tt> through <tt>*</tt>
# * letters a-z become <tt>+</tt> through <tt>D</tt>
# * kana ranked 0..47 in KANA_SORT_MAP become <tt>F</tt> through <tt>u</tt>
# * characters that KANA_SORT_MAP maps to a String (full-width digits and
#   letters) are first replaced by that string and then ranked as above
#
# Every other character is left out of the key.
def kana_sort_key
  pieces = downcase.split('').map do |ch|
    mapped = KANA_SORT_MAP[ch]
    # A String value is an ASCII stand-in for the character; rank that instead.
    ch = mapped if mapped.kind_of?(String)

    # Work on byte values with unpack/chr so this does not depend on what
    # String#[] returns for an integer index.
    code =
      if ch =~ /[0-9]/
        ch.unpack('C')[0] - 15 # produces ! through *
      elsif ch =~ /[a-z]/
        ch.unpack('C')[0] - 54 # produces + through D
      elsif mapped.kind_of?(Numeric)
        mapped + 70            # produces F through u
      end
    code ? code.chr : ''
  end
  pieces.join
end
|
267
|
+
|
268
|
+
private
|
269
|
+
|
270
|
+
# Table for creating kana sort keys. See kana_sort_key.
#
# Kana map to an Integer rank (0..47); a voiced or p-row kana shares the rank
# of its plain form, and katakana share the rank of the matching hiragana.
# Full-width digits and letters map to a String holding the ASCII character
# they should be ranked as (letters are folded to lower case).
#
# The full-width keys must stay full-width: if they are normalized to ASCII
# the entries become identity mappings and full-width text is dropped from
# sort keys.
KANA_SORT_MAP = {
  # Hiragana.
  "あ" => 0, "い" => 1, "う" => 2, "え" => 3, "お" => 4,
  "か" => 5, "き" => 6, "く" => 7, "け" => 8, "こ" => 9,
  "さ" => 10, "し" => 11, "す" => 12, "せ" => 13, "そ" => 14,
  "た" => 15, "ち" => 16, "つ" => 17, "て" => 18, "と" => 19,
  "な" => 20, "に" => 21, "ぬ" => 22, "ね" => 23, "の" => 24,
  "は" => 25, "ひ" => 26, "ふ" => 27, "へ" => 28, "ほ" => 29,
  "ま" => 30, "み" => 31, "む" => 32, "め" => 33, "も" => 34,
  "や" => 35, "ゆ" => 36, "よ" => 37,
  "ら" => 38, "り" => 39, "る" => 40, "れ" => 41, "ろ" => 42,
  "わ" => 43, "ゐ" => 44, "ゑ" => 45, "を" => 46, "ん" => 47,
  "が" => 5, "ぎ" => 6, "ぐ" => 7, "げ" => 8, "ご" => 9,
  "ざ" => 10, "じ" => 11, "ず" => 12, "ぜ" => 13, "ぞ" => 14,
  "だ" => 15, "ぢ" => 16, "づ" => 17, "で" => 18, "ど" => 19,
  "ば" => 25, "び" => 26, "ぶ" => 27, "べ" => 28, "ぼ" => 29,
  "ぱ" => 25, "ぴ" => 26, "ぷ" => 27, "ぺ" => 28, "ぽ" => 29,
  # Katakana.
  "ア" => 0, "イ" => 1, "ウ" => 2, "エ" => 3, "オ" => 4,
  "カ" => 5, "キ" => 6, "ク" => 7, "ケ" => 8, "コ" => 9,
  "サ" => 10, "シ" => 11, "ス" => 12, "セ" => 13, "ソ" => 14,
  "タ" => 15, "チ" => 16, "ツ" => 17, "テ" => 18, "ト" => 19,
  "ナ" => 20, "ニ" => 21, "ヌ" => 22, "ネ" => 23, "ノ" => 24,
  "ハ" => 25, "ヒ" => 26, "フ" => 27, "ヘ" => 28, "ホ" => 29,
  "マ" => 30, "ミ" => 31, "ム" => 32, "メ" => 33, "モ" => 34,
  "ヤ" => 35, "ユ" => 36, "ヨ" => 37,
  "ラ" => 38, "リ" => 39, "ル" => 40, "レ" => 41, "ロ" => 42,
  "ワ" => 43, "ヰ" => 44, "ヱ" => 45, "ヲ" => 46, "ン" => 47,
  "ガ" => 5, "ギ" => 6, "グ" => 7, "ゲ" => 8, "ゴ" => 9,
  "ザ" => 10, "ジ" => 11, "ズ" => 12, "ゼ" => 13, "ゾ" => 14,
  "ダ" => 15, "ヂ" => 16, "ヅ" => 17, "デ" => 18, "ド" => 19,
  "バ" => 25, "ビ" => 26, "ブ" => 27, "ベ" => 28, "ボ" => 29,
  "パ" => 25, "ピ" => 26, "プ" => 27, "ペ" => 28, "ポ" => 29,
  "ヴ" => 2,
  # Full-width digits.
  "０" => "0", "１" => "1", "２" => "2", "３" => "3", "４" => "4",
  "５" => "5", "６" => "6", "７" => "7", "８" => "8", "９" => "9",
  # Full-width upper-case letters (folded to lower case).
  "Ａ" => "a", "Ｂ" => "b", "Ｃ" => "c", "Ｄ" => "d", "Ｅ" => "e", "Ｆ" => "f",
  "Ｇ" => "g", "Ｈ" => "h", "Ｉ" => "i", "Ｊ" => "j", "Ｋ" => "k", "Ｌ" => "l",
  "Ｍ" => "m", "Ｎ" => "n", "Ｏ" => "o", "Ｐ" => "p", "Ｑ" => "q", "Ｒ" => "r",
  "Ｓ" => "s", "Ｔ" => "t", "Ｕ" => "u", "Ｖ" => "v", "Ｗ" => "w", "Ｘ" => "x",
  "Ｙ" => "y", "Ｚ" => "z",
  # Full-width lower-case letters.
  "ａ" => "a", "ｂ" => "b", "ｃ" => "c", "ｄ" => "d", "ｅ" => "e", "ｆ" => "f",
  "ｇ" => "g", "ｈ" => "h", "ｉ" => "i", "ｊ" => "j", "ｋ" => "k", "ｌ" => "l",
  "ｍ" => "m", "ｎ" => "n", "ｏ" => "o", "ｐ" => "p", "ｑ" => "q", "ｒ" => "r",
  "ｓ" => "s", "ｔ" => "t", "ｕ" => "u", "ｖ" => "v", "ｗ" => "w", "ｘ" => "x",
  "ｙ" => "y", "ｚ" => "z"
}
|
307
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'delegate'
|
2
|
+
|
3
|
+
# An Integer wrapper (via DelegateClass) representing one UCS codepoint, with
# a few script predicates and conversion to an encoded character.
class UCSCodepoint < DelegateClass(Integer)
  # Returns a Boolean indicating whether this UCS codepoint represents a kanji
  # character, i.e. lies in U+4E00..U+9FBF, U+3400..U+4DBF or
  # U+20000..U+2A6DF.
  def kanji?
    value = to_i
    value.between?(0x4e00, 0x9fbf) ||
      value.between?(0x3400, 0x4dbf) ||
      value.between?(0x20000, 0x2a6df)
  end

  # Returns a Boolean indicating whether this UCS codepoint represents a
  # hiragana or katakana character, i.e. lies in U+3040..U+30FF or
  # U+31F0..U+31FF.
  def kana?
    value = to_i
    value.between?(0x3040, 0x30ff) || value.between?(0x31f0, 0x31ff)
  end

  # Returns a Boolean indicating whether this UCS codepoint represents a
  # full-width latin character, i.e. lies in U+FF10..U+FF5A.
  def wide_latin?
    to_i.between?(0xff10, 0xff5a)
  end

  # Returns an encoded string containing the character represented by this UCS
  # codepoint. Currently only UTF-8 encoding is supported.
  #
  # Raises ArgumentError on Ruby 1.8 when $KCODE is not set to UTF-8. Later
  # Rubies no longer use $KCODE, so the guard is skipped there.
  def to_s
    if RUBY_VERSION < '1.9' && $KCODE !~ /^u/i
      raise ArgumentError, 'unrecognized encoding (only UTF-8 is supported at the moment)'
    end

    # The 'U' directive packs an integer as one UTF-8 encoded character.
    [to_i].pack('U')
  end

  # Returns a debugging representation showing the class, the codepoint in
  # hexadecimal, and the encoded character.
  def inspect
    "#<#{self.class}:0x#{to_i.to_s(16)} #{to_s.inspect}>"
  end
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
class UnicodeString < String
|
2
|
+
# Tells whether this string's single character is a kanji, by asking the
# object returned by #codepoint. (This string must contain only one
# character.)
def kanji?
  point = codepoint
  point.kanji?
end
|
7
|
+
|
8
|
+
# Tells whether this string's single character is a hiragana or katakana
# character, by asking the object returned by #codepoint. (This string must
# contain only one character.)
def kana?
  point = codepoint
  point.kana?
end
|
13
|
+
|
14
|
+
# Tells whether this string's single character is a full-width latin
# character, by asking the object returned by #codepoint. (This string must
# contain only one character.)
def wide_latin?
  point = codepoint
  point.wide_latin?
end
|
19
|
+
|
20
|
+
# Returns the UCS codepoint of this character as a UCSCodepoint. (This string
# must contain only one character.) Currently only UTF8 encoding is
# supported.
#
# Raises ArgumentError on Ruby 1.8 when $KCODE is not set to UTF-8 (later
# Rubies no longer use $KCODE, so the guard is skipped there), and whenever
# the string is not well-formed UTF-8. Raises RangeError unless the string
# holds exactly one character.
def codepoint
  if RUBY_VERSION < '1.9' && $KCODE !~ /^u/i
    raise ArgumentError, "unsupported encoding (#{$KCODE})"
  end

  # The 'U*' directive decodes every UTF-8 character into its codepoint, so
  # one call both counts the characters and decodes the one we want.
  points = unpack('U*')
  unless points.length == 1
    raise RangeError, "string must be exactly one character long"
  end

  UCSCodepoint.new(points.first)
end
|
53
|
+
|
54
|
+
# Character-offset counterpart of index: +uoffset+ (where the search starts)
# is given in characters, and the position returned is in characters too.
# The offsets are translated with uindex_to_index and index_to_uindex around
# a regular index call.
def uindex(substr, uoffset = 0)
  search_from = uindex_to_index(uoffset)
  found_at = index(substr, search_from)
  index_to_uindex(found_at)
end
|
60
|
+
|
61
|
+
# Character-offset counterpart of slice: returns up to +ulength+ characters
# starting at character offset +uoffset+, joined into one string. Negative
# lengths are not supported.
def uslice(uoffset, ulength)
  start = uindex_to_index(uoffset)
  # Take everything from the start offset onward, then keep the first
  # +ulength+ characters of that tail.
  tail = slice(start, length)
  tail.split('')[0, ulength].join('')
end
|
68
|
+
|
69
|
+
# Translates a byte offset into a character offset by walking the string one
# character at a time. The byte offset must lie between zero and the string's
# length (inclusive), otherwise RangeError is raised. Returns +nil+ when given
# +nil+, and also when the offset falls in the middle of a character.
def index_to_uindex(byte_index)
  return nil if byte_index.nil?
  raise RangeError, 'index out of range' if byte_index < 0 || byte_index > length

  bytes_left = byte_index
  chars_seen = 0
  split('').each do |ch|
    return chars_seen if bytes_left == 0
    bytes_left -= ch.length
    # Overshooting means the offset pointed inside this character.
    return nil if bytes_left < 0
    chars_seen += 1
  end
  chars_seen
end
|
88
|
+
|
89
|
+
# Translates a character offset into a byte offset by summing the lengths of
# the characters that precede it. The character offset must lie between zero
# and the string's character length (inclusive), otherwise RangeError is
# raised. Returns +nil+ when given +nil+.
def uindex_to_index(char_index)
  return nil if char_index.nil?
  raise RangeError, 'index out of range' if char_index < 0 || char_index > jlength

  chars = split('')
  (0...char_index).inject(0) { |bytes, i| bytes + chars[i].length }
end
|
105
|
+
end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: unicode_madness
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 1
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
version: 1.0.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Dana Contreras
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-03-27 00:00:00 -04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description:
|
22
|
+
email:
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- lib/unicode_madness.rb
|
31
|
+
- lib/unicode_madness/ucs_codepoint.rb
|
32
|
+
- lib/unicode_madness/unicode_string.rb
|
33
|
+
- lib/unicode_madness/japanese_string.rb
|
34
|
+
has_rdoc: true
|
35
|
+
homepage: http://github.com/DanaDanger/unicode_madness
|
36
|
+
licenses: []
|
37
|
+
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
segments:
|
48
|
+
- 0
|
49
|
+
version: "0"
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 1.3.6
|
61
|
+
signing_key:
|
62
|
+
specification_version: 3
|
63
|
+
summary: Madness? THIS.IS.UNICODE! (Plus some goodies for Japanese.)
|
64
|
+
test_files: []
|
65
|
+
|