unicode_madness 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,307 @@
|
|
1
|
+
class JapaneseString < UnicodeString
|
2
|
+
# A string that can be used in a regular expression character class to match
|
3
|
+
# any kanji character. (Example: <tt>/[#{KANJI_CLASS}]/</tt>)
|
4
|
+
KANJI_CLASS =
|
5
|
+
"#{UCSCodepoint.new(0x4e00)}-#{UCSCodepoint.new(0x9fbf)}" +
|
6
|
+
"#{UCSCodepoint.new(0x3400)}-#{UCSCodepoint.new(0x4dbf)}" +
|
7
|
+
"#{UCSCodepoint.new(0x20000)}-#{UCSCodepoint.new(0x2a6df)}"
|
8
|
+
|
9
|
+
# A string that can be used in a regular expression character class to match
# any katakana character. (Example: <tt>/[#{KATAKANA_CLASS}]/</tt>)
#
# The range starts at U+30A1 (small "ァ"), the first katakana letter of the
# Unicode Katakana block; starting at U+30A2 ("ア") would leave "ァ" unmatched.
KATAKANA_CLASS = "#{UCSCodepoint.new(0x30a1)}-#{UCSCodepoint.new(0x30ff)}"
|
12
|
+
|
13
|
+
# A string that can be used in a regular expression character class to match
|
14
|
+
# any hiragana or katakana character. (Example: <tt>/[#{KANA_CLASS}]/</tt>)
|
15
|
+
KANA_CLASS =
|
16
|
+
"#{UCSCodepoint.new(0x3040)}-#{UCSCodepoint.new(0x30ff)}" +
|
17
|
+
"#{UCSCodepoint.new(0x31f0)}-#{UCSCodepoint.new(0x31ff)}"
|
18
|
+
|
19
|
+
# Table for converting katakana to their equivalent hiragana.
|
20
|
+
KATAKANA_TO_HIRAGANA = {
|
21
|
+
'ア' => 'あ', 'イ' => 'い', 'ウ' => 'う', 'エ' => 'え', 'オ' => 'お', 'カ' => 'か',
|
22
|
+
'キ' => 'き', 'ク' => 'く', 'ケ' => 'け', 'コ' => 'こ', 'サ' => 'さ', 'シ' => 'し',
|
23
|
+
'ス' => 'す', 'セ' => 'せ', 'ソ' => 'そ', 'タ' => 'た', 'チ' => 'ち', 'ツ' => 'つ',
|
24
|
+
'テ' => 'て', 'ト' => 'と', 'ナ' => 'な', 'ニ' => 'に', 'ヌ' => 'ぬ', 'ネ' => 'ね',
|
25
|
+
'ノ' => 'の', 'ハ' => 'は', 'ヒ' => 'ひ', 'フ' => 'ふ', 'ヘ' => 'へ', 'ホ' => 'ほ',
|
26
|
+
'マ' => 'ま', 'ミ' => 'み', 'ム' => 'む', 'メ' => 'め', 'モ' => 'も', 'ヤ' => 'や',
|
27
|
+
'ユ' => 'ゆ', 'ヨ' => 'よ', 'ラ' => 'ら', 'リ' => 'り', 'ル' => 'る', 'レ' => 'れ',
|
28
|
+
'ロ' => 'ろ', 'ワ' => 'わ', 'ヰ' => 'ゐ', 'ヱ' => 'ゑ', 'ヲ' => 'を', 'ン' => 'ん',
|
29
|
+
'ガ' => 'が', 'ギ' => 'ぎ', 'グ' => 'ぐ', 'ゲ' => 'げ', 'ゴ' => 'ご', 'ザ' => 'ざ',
|
30
|
+
'ジ' => 'じ', 'ズ' => 'ず', 'ゼ' => 'ぜ', 'ゾ' => 'ぞ', 'ダ' => 'だ', 'ヂ' => 'ぢ',
|
31
|
+
'ヅ' => 'づ', 'デ' => 'で', 'ド' => 'ど', 'バ' => 'ば', 'ビ' => 'び', 'ブ' => 'ぶ',
|
32
|
+
'ベ' => 'べ', 'ボ' => 'ぼ', 'パ' => 'ぱ', 'ピ' => 'ぴ', 'プ' => 'ぷ', 'ペ' => 'ぺ',
|
33
|
+
'ポ' => 'ぽ', 'ァ' => 'ぁ', 'ィ' => 'ぃ', 'ゥ' => 'ぅ', 'ェ' => 'ぇ', 'ォ' => 'ぉ',
|
34
|
+
'ャ' => 'ゃ', 'ュ' => 'ゅ', 'ョ' => 'ょ', 'ッ' => 'っ'
|
35
|
+
}
|
36
|
+
|
37
|
+
# Table for converting voiced hiragana and katakana to their unvoiced forms.
|
38
|
+
UNVOICED_KANA = {
|
39
|
+
'が' => 'か', 'ぎ' => 'き', 'ぐ' => 'く', 'げ' => 'け', 'ご' => 'こ', 'ざ' => 'さ',
|
40
|
+
'じ' => 'し', 'ず' => 'す', 'ぜ' => 'せ', 'ぞ' => 'そ', 'だ' => 'た', 'ぢ' => 'ち',
|
41
|
+
'づ' => 'つ', 'で' => 'て', 'ど' => 'と', 'ば' => 'は', 'び' => 'ひ', 'ぶ' => 'ふ',
|
42
|
+
'べ' => 'へ', 'ぼ' => 'ほ', 'ぱ' => 'は', 'ぴ' => 'ひ', 'ぷ' => 'ふ', 'ぺ' => 'へ',
|
43
|
+
'ぽ' => 'ほ', 'ヴ' => 'ウ', 'ガ' => 'カ', 'ギ' => 'キ', 'グ' => 'ク', 'ゲ' => 'ケ',
|
44
|
+
'ゴ' => 'コ', 'ザ' => 'サ', 'ジ' => 'シ', 'ズ' => 'ス', 'ゼ' => 'セ', 'ゾ' => 'ソ',
|
45
|
+
'ダ' => 'タ', 'ヂ' => 'チ', 'ヅ' => 'ツ', 'デ' => 'テ', 'ド' => 'ト', 'バ' => 'ハ',
|
46
|
+
'ビ' => 'ヒ', 'ブ' => 'フ', 'ベ' => 'ヘ', 'ボ' => 'ホ', 'パ' => 'ハ', 'ピ' => 'ヒ',
|
47
|
+
'プ' => 'フ', 'ペ' => 'ヘ', 'ポ' => 'ホ'
|
48
|
+
}
|
49
|
+
|
50
|
+
# Table for converting unvoiced hiragana and katakana to their voiced
# (dakuten) forms.
#
# Every key appears exactly once. The h-row maps to the b-row only: a Hash
# literal keeps the last value given for a repeated key, so also listing the
# semi-voiced p-row under the same keys would turn 'は' into 'ぱ' rather than
# 'ば'.
VOICED_KANA = {
  'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご', 'さ' => 'ざ',
  'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ', 'た' => 'だ', 'ち' => 'ぢ',
  'つ' => 'づ', 'て' => 'で', 'と' => 'ど', 'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ',
  'へ' => 'べ', 'ほ' => 'ぼ', 'ウ' => 'ヴ', 'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ',
  'ケ' => 'ゲ', 'コ' => 'ゴ', 'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ',
  'ソ' => 'ゾ', 'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド',
  'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ'
}
|
62
|
+
|
63
|
+
# Maps kana to their romanized equivalents. Also maps full-width Latin
|
64
|
+
# characters to their ASCII equivalents.
|
65
|
+
KANA_ROMAJI_MAP = {
|
66
|
+
"あ" => "a", "い" => "i", "う" => "u", "え" => "e", "お" => "o", "か" => "ka",
|
67
|
+
"き" => "ki", "く" => "ku", "け" => "ke", "こ" => "ko", "さ" => "sa",
|
68
|
+
"し" => "shi", "す" => "su", "せ" => "se", "そ" => "so", "た" => "ta",
|
69
|
+
"ち" => "chi", "つ" => "tsu", "て" => "te", "と" => "to", "な" => "na",
|
70
|
+
"に" => "ni", "ぬ" => "nu", "ね" => "ne", "の" => "no", "は" => "ha",
|
71
|
+
"ひ" => "hi", "ふ" => "fu", "へ" => "he", "ほ" => "ho", "ま" => "ma",
|
72
|
+
"み" => "mi", "む" => "mu", "め" => "me", "も" => "mo", "や" => "ya",
|
73
|
+
"ゆ" => "yu", "よ" => "yo", "ら" => "ra", "り" => "ri", "る" => "ru",
|
74
|
+
"れ" => "re", "ろ" => "ro", "わ" => "wa", "ゐ" => "wi", "ゑ" => "we",
|
75
|
+
"を" => "wo", "ん" => "n", "が" => "ga", "ぎ" => "gi", "ぐ" => "gu",
|
76
|
+
"げ" => "ge", "ご" => "go", "ざ" => "za", "じ" => "ji", "ず" => "zu",
|
77
|
+
"ぜ" => "ze", "ぞ" => "zo", "だ" => "da", "ぢ" => "ji", "づ" => "zu",
|
78
|
+
"で" => "de", "ど" => "do", "ば" => "ba", "び" => "bi", "ぶ" => "bu",
|
79
|
+
"べ" => "be", "ぼ" => "bo", "ぱ" => "pa", "ぴ" => "pi", "ぷ" => "pu",
|
80
|
+
"ぺ" => "pe", "ぽ" => "po", "ア" => "a", "イ" => "i", "ウ" => "u", "エ" => "e",
|
81
|
+
"オ" => "o", "カ" => "ka", "キ" => "ki", "ク" => "ku", "ケ" => "ke",
|
82
|
+
"コ" => "ko", "サ" => "sa", "シ" => "shi", "ス" => "su", "セ" => "se",
|
83
|
+
"ソ" => "so", "タ" => "ta", "チ" => "chi", "ツ" => "tsu", "テ" => "te",
|
84
|
+
"ト" => "to", "ナ" => "na", "ニ" => "ni", "ヌ" => "nu", "ネ" => "ne",
|
85
|
+
"ノ" => "no", "ハ" => "ha", "ヒ" => "hi", "フ" => "fu", "ヘ" => "he",
|
86
|
+
"ホ" => "ho", "マ" => "ma", "ミ" => "mi", "ム" => "mu", "メ" => "me",
|
87
|
+
"モ" => "mo", "ヤ" => "ya", "ユ" => "yu", "ヨ" => "yo", "ラ" => "ra",
|
88
|
+
"リ" => "ri", "ル" => "ru", "レ" => "re", "ロ" => "ro", "ワ" => "wa",
|
89
|
+
"ヰ" => "wi", "ヱ" => "we", "ヲ" => "wo", "ン" => "n", "ガ" => "ga",
|
90
|
+
"ギ" => "gi", "グ" => "gu", "ゲ" => "ge", "ゴ" => "go", "ザ" => "za",
|
91
|
+
"ジ" => "ji", "ズ" => "zu", "ゼ" => "ze", "ゾ" => "zo", "ダ" => "da",
|
92
|
+
"ヂ" => "ji", "ヅ" => "zu", "デ" => "de", "ド" => "do", "バ" => "ba",
|
93
|
+
"ビ" => "bi", "ブ" => "bu", "ベ" => "be", "ボ" => "bo", "パ" => "pa",
|
94
|
+
"ピ" => "pi", "プ" => "pu", "ペ" => "pe", "ポ" => "po", "ヴ" => "vu",
|
95
|
+
"・" => " ", "0" => "0", "1" => "1", "2" => "2", "3" => "3", "4" => "4",
|
96
|
+
"5" => "5", "6" => "6", "7" => "7", "8" => "8", "9" => "9", "!" => "!",
|
97
|
+
""" => "\"", "#" => "#", "$" => "\$", "%" => "%", "&" => "&", "'" => "'",
|
98
|
+
"(" => "(", ")" => ")", "*" => "*", "+" => "+", "," => ".", "-" => "-",
|
99
|
+
"." => ".", "/" => "/", ":" => ":", ";" => ";", "<" => "<", "=" => "=",
|
100
|
+
">" => ">", "?" => "?", "@" => "\@", "A" => "A", "B" => "B", "C" => "C",
|
101
|
+
"D" => "D", "E" => "E", "F" => "F", "G" => "G", "H" => "H", "I" => "I",
|
102
|
+
"J" => "J", "K" => "K", "L" => "L", "M" => "M", "N" => "N", "O" => "O",
|
103
|
+
"P" => "P", "Q" => "Q", "R" => "R", "S" => "S", "T" => "T", "U" => "U",
|
104
|
+
"V" => "V", "W" => "W", "X" => "X", "Y" => "Y", "Z" => "Z", "[" => "[",
|
105
|
+
"\" => "\\", "]" => "]", "^" => "^", "_" => "_", "`" => "`", "a" => "a",
|
106
|
+
"b" => "b", "c" => "c", "d" => "d", "e" => "e", "f" => "f", "g" => "g",
|
107
|
+
"h" => "h", "i" => "i", "j" => "j", "k" => "k", "l" => "l", "m" => "m",
|
108
|
+
"n" => "n", "o" => "o", "p" => "p", "q" => "q", "r" => "r", "s" => "s",
|
109
|
+
"t" => "t", "u" => "u", "v" => "v", "w" => "w", "x" => "x", "y" => "y",
|
110
|
+
"z" => "z", "{" => "{", "|" => "|", "}" => "}", "〜" => "-"
|
111
|
+
}
|
112
|
+
|
113
|
+
# Returns a new string (an instance of the receiver's class) in which every
# katakana listed in KATAKANA_TO_HIRAGANA is replaced with its hiragana
# equivalent. All other characters are copied through unchanged.
def to_hiragana
  # Hash#fetch with a default does the lookup once per character, and a single
  # join avoids re-allocating the accumulated string on every iteration.
  converted = split('').map { |ch| KATAKANA_TO_HIRAGANA.fetch(ch, ch) }
  self.class.new(converted.join(''))
end
|
126
|
+
|
127
|
+
# Returns a new string (an instance of the receiver's class) in which every
# voiced hiragana or katakana listed in UNVOICED_KANA is replaced with its
# unvoiced form. All other characters are copied through unchanged.
def unvoice_kana
  # Hash#fetch with a default does the lookup once per character, and a single
  # join avoids re-allocating the accumulated string on every iteration.
  converted = split('').map { |ch| UNVOICED_KANA.fetch(ch, ch) }
  self.class.new(converted.join(''))
end
|
140
|
+
|
141
|
+
# Returns a new string (an instance of the receiver's class) in which every
# unvoiced hiragana or katakana listed in VOICED_KANA is replaced with its
# voiced form. All other characters are copied through unchanged.
def voice_kana
  # Hash#fetch with a default does the lookup once per character, and a single
  # join avoids re-allocating the accumulated string on every iteration.
  converted = split('').map { |ch| VOICED_KANA.fetch(ch, ch) }
  self.class.new(converted.join(''))
end
|
154
|
+
|
155
|
+
# Two-character kana sequences (a full-size kana followed by a small kana)
# and their romanizations. They are replaced before the per-character pass in
# romanize. No key overlaps another: the second character of every key is a
# small kana, and no key starts with one, so the replacement order does not
# matter. Each key is listed exactly once.
ROMAJI_DIGRAPHS = {
  # Diphthongs.
  'きゃ' => 'kya', 'キャ' => 'kya', 'きゅ' => 'kyu', 'キュ' => 'kyu',
  'きょ' => 'kyo', 'キョ' => 'kyo', 'しゃ' => 'sha', 'シャ' => 'sha',
  'しゅ' => 'shu', 'シュ' => 'shu', 'しぇ' => 'she', 'シェ' => 'she',
  'しょ' => 'sho', 'ショ' => 'sho', 'ちゃ' => 'cha', 'チャ' => 'cha',
  'ちゅ' => 'chu', 'チュ' => 'chu', 'ちょ' => 'cho', 'チョ' => 'cho',
  'にゃ' => 'nya', 'ニャ' => 'nya', 'にゅ' => 'nyu', 'ニュ' => 'nyu',
  'にょ' => 'nyo', 'ニョ' => 'nyo', 'ひゃ' => 'hya', 'ヒャ' => 'hya',
  'ひゅ' => 'hyu', 'ヒュ' => 'hyu', 'ひょ' => 'hyo', 'ヒョ' => 'hyo',
  'みゃ' => 'mya', 'ミャ' => 'mya', 'みゅ' => 'myu', 'ミュ' => 'myu',
  'みょ' => 'myo', 'ミョ' => 'myo', 'りゃ' => 'rya', 'リャ' => 'rya',
  'りゅ' => 'ryu', 'リュ' => 'ryu', 'りょ' => 'ryo', 'リョ' => 'ryo',
  'ぎゃ' => 'gya', 'ギャ' => 'gya', 'ぎゅ' => 'gyu', 'ギュ' => 'gyu',
  'ぎょ' => 'gyo', 'ギョ' => 'gyo', 'じゃ' => 'ja', 'ジャ' => 'ja',
  'じゅ' => 'ju', 'ジュ' => 'ju', 'じょ' => 'jo', 'ジョ' => 'jo',
  'ぢゃ' => 'ja', 'ヂャ' => 'ja', 'ぢゅ' => 'ju', 'ヂュ' => 'ju',
  'ぢょ' => 'jo', 'ヂョ' => 'jo', 'びゃ' => 'bya', 'ビャ' => 'bya',
  'びゅ' => 'byu', 'ビュ' => 'byu', 'びょ' => 'byo', 'ビョ' => 'byo',
  'ぴゃ' => 'pya', 'ピャ' => 'pya', 'ぴゅ' => 'pyu', 'ピュ' => 'pyu',
  'ぴょ' => 'pyo', 'ピョ' => 'pyo',
  # Extended kana. 'でぃ' is the hiragana spelling of 'ディ', so both give "di".
  'ふぁ' => 'fa', 'でぃ' => 'di', 'イェ' => 'ye', 'ウィ' => 'wi',
  'ウェ' => 'we', 'ウォ' => 'wo', 'ヴァ' => 'va', 'ヴィ' => 'vi',
  'ヴゥ' => 'vu', 'ヴェ' => 've', 'ヴォ' => 'vo', 'ジェ' => 'je',
  'チェ' => 'che', 'ティ' => 'ti', 'トゥ' => 'tu', 'ディ' => 'di',
  'ドゥ' => 'du', 'デュ' => 'dyu', 'ツァ' => 'tsa', 'ツェ' => 'tse',
  'ツォ' => 'tso', 'ファ' => 'fa', 'フィ' => 'fi', 'フェ' => 'fe',
  'フォ' => 'fo', 'フュ' => 'fyu', 'スィ' => 'si', 'ゲィ' => 'gei',
  'ワァ' => 'waa', 'ツィ' => 'tsui', 'シィ' => 'shii', 'ウァ' => 'ua',
  'ヴュ' => 'vyu', 'クォ' => 'quo', 'テュ' => 'tu', 'グィ' => 'gui',
  'クェ' => 'que', 'ビィ' => 'bii', 'ズィ' => 'zi', 'リィ' => 'rii'
}

# Creates a new String by romanizing the kana in this string. Characters that
# have an entry in KANA_ROMAJI_MAP are replaced by that entry as well.
#
# The conversion runs in three passes:
# 1. two-character sequences from ROMAJI_DIGRAPHS,
# 2. single characters from KANA_ROMAJI_MAP,
# 3. letter-doublers: a small tsu doubles the following character and the
#    katakana long-vowel dash doubles the preceding one.
#
# If +warnings+ is true (the default), a message is printed on +STDERR+ for
# every character that is still not romanized after the second pass.
#
# Returns a plain String, not an instance of the receiver's class.
def romanize(warnings = true)
  romanized = String.new(self)

  ROMAJI_DIGRAPHS.each { |kana, romaji| romanized.gsub!(kana, romaji) }

  converted = romanized.split('').map do |ch|
    out = KANA_ROMAJI_MAP.fetch(ch, ch)
    # The letter-doublers are allowed here because they are resolved below.
    if warnings && out !~ /\A[A-Za-z&\d.\-ッっー ]+\Z/
      STDERR.puts "Couldn't romanize #{ch} in #{self}"
    end
    out
  end
  romanized = converted.join('')

  # Convert letter-doublers (small tsu and katakana dash).
  romanized.gsub!(/[ッっ](.)/, '\1\1')
  romanized.gsub!(/(.)ー/, '\1\1')

  romanized
end
|
241
|
+
|
242
|
+
# Creates a 7-bit-safe string that can be used to sort strings containing
# kana and/or English text.
#
# The receiver is downcased and each character contributes at most one ASCII
# character to the key:
# * '0'..'9' become '!'..'*'
# * 'a'..'z' become '+'..'D'
# * characters whose KANA_SORT_MAP entry is a number (kana ranks 0..47)
#   become 'F'..'u'
# * characters whose KANA_SORT_MAP entry is a string are keyed like that
#   string
# * everything else is skipped
def kana_sort_key
  codes = []
  downcase.split('').each do |ch|
    entry = KANA_SORT_MAP[ch]

    # Substitute the table's ASCII equivalent without touching the table
    # entry itself; modifying the looked-up String in place would corrupt
    # KANA_SORT_MAP for every later call.
    ch = entry if entry.kind_of?(String) && ch !~ /[0-9a-z]/

    # unpack('C') reads the character's byte value on every Ruby version
    # (String#[] only returns an Integer on Ruby 1.8).
    if ch =~ /[0-9]/
      codes << (ch.unpack('C').first - 15) # produces ! through *
    elsif ch =~ /[a-z]/
      codes << (ch.unpack('C').first - 54) # produces + through D
    elsif entry.kind_of?(Numeric)
      codes << (entry + 70)                # produces F through u
    end
  end
  codes.map { |code| code.chr }.join('')
end
|
267
|
+
|
268
|
+
private
|
269
|
+
|
270
|
+
# Table for creating kana sort keys. See kana_sort_key.
|
271
|
+
KANA_SORT_MAP = {
|
272
|
+
"あ" => 0, "い" => 1, "う" => 2, "え" => 3, "お" => 4, "か" => 5, "き" => 6,
|
273
|
+
"く" => 7, "け" => 8, "こ" => 9, "さ" => 10, "し" => 11, "す" => 12, "せ" => 13,
|
274
|
+
"そ" => 14, "た" => 15, "ち" => 16, "つ" => 17, "て" => 18, "と" => 19,
|
275
|
+
"な" => 20, "に" => 21, "ぬ" => 22, "ね" => 23, "の" => 24, "は" => 25,
|
276
|
+
"ひ" => 26, "ふ" => 27, "へ" => 28, "ほ" => 29, "ま" => 30, "み" => 31,
|
277
|
+
"む" => 32, "め" => 33, "も" => 34, "や" => 35, "ゆ" => 36, "よ" => 37,
|
278
|
+
"ら" => 38, "り" => 39, "る" => 40, "れ" => 41, "ろ" => 42, "わ" => 43,
|
279
|
+
"ゐ" => 44, "ゑ" => 45, "を" => 46, "ん" => 47, "が" => 5, "ぎ" => 6, "ぐ" => 7,
|
280
|
+
"げ" => 8, "ご" => 9, "ざ" => 10, "じ" => 11, "ず" => 12, "ぜ" => 13, "ぞ" => 14,
|
281
|
+
"だ" => 15, "ぢ" => 16, "づ" => 17, "で" => 18, "ど" => 19, "ば" => 25,
|
282
|
+
"び" => 26, "ぶ" => 27, "べ" => 28, "ぼ" => 29, "ぱ" => 25, "ぴ" => 26,
|
283
|
+
"ぷ" => 27, "ぺ" => 28, "ぽ" => 29, "ア" => 0, "イ" => 1, "ウ" => 2, "エ" => 3,
|
284
|
+
"オ" => 4, "カ" => 5, "キ" => 6, "ク" => 7, "ケ" => 8, "コ" => 9, "サ" => 10,
|
285
|
+
"シ" => 11, "ス" => 12, "セ" => 13, "ソ" => 14, "タ" => 15, "チ" => 16,
|
286
|
+
"ツ" => 17, "テ" => 18, "ト" => 19, "ナ" => 20, "ニ" => 21, "ヌ" => 22,
|
287
|
+
"ネ" => 23, "ノ" => 24, "ハ" => 25, "ヒ" => 26, "フ" => 27, "ヘ" => 28,
|
288
|
+
"ホ" => 29, "マ" => 30, "ミ" => 31, "ム" => 32, "メ" => 33, "モ" => 34,
|
289
|
+
"ヤ" => 35, "ユ" => 36, "ヨ" => 37, "ラ" => 38, "リ" => 39, "ル" => 40,
|
290
|
+
"レ" => 41, "ロ" => 42, "ワ" => 43, "ヰ" => 44, "ヱ" => 45, "ヲ" => 46,
|
291
|
+
"ン" => 47, "ガ" => 5, "ギ" => 6, "グ" => 7, "ゲ" => 8, "ゴ" => 9, "ザ" => 10,
|
292
|
+
"ジ" => 11, "ズ" => 12, "ゼ" => 13, "ゾ" => 14, "ダ" => 15, "ヂ" => 16,
|
293
|
+
"ヅ" => 17, "デ" => 18, "ド" => 19, "バ" => 25, "ビ" => 26, "ブ" => 27,
|
294
|
+
"ベ" => 28, "ボ" => 29, "パ" => 25, "ピ" => 26, "プ" => 27, "ペ" => 28,
|
295
|
+
"ポ" => 29, "ヴ" => 2, "0" => "0", "1" => "1", "2" => "2", "3" => "3",
|
296
|
+
"4" => "4", "5" => "5", "6" => "6", "7" => "7", "8" => "8", "9" => "9",
|
297
|
+
"A" => "a", "B" => "b", "C" => "c", "D" => "d", "E" => "e", "F" => "f",
|
298
|
+
"G" => "g", "H" => "h", "I" => "i", "J" => "j", "K" => "k", "L" => "l",
|
299
|
+
"M" => "m", "N" => "n", "O" => "o", "P" => "p", "Q" => "q", "R" => "r",
|
300
|
+
"S" => "s", "T" => "t", "U" => "u", "V" => "v", "W" => "w", "X" => "x",
|
301
|
+
"Y" => "y", "Z" => "z", "a" => "a", "b" => "b", "c" => "c", "d" => "d",
|
302
|
+
"e" => "e", "f" => "f", "g" => "g", "h" => "h", "i" => "i", "j" => "j",
|
303
|
+
"k" => "k", "l" => "l", "m" => "m", "n" => "n", "o" => "o", "p" => "p",
|
304
|
+
"q" => "q", "r" => "r", "s" => "s", "t" => "t", "u" => "u", "v" => "v",
|
305
|
+
"w" => "w", "x" => "x", "y" => "y", "z" => "z"
|
306
|
+
}
|
307
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'delegate'
|
2
|
+
|
3
|
+
class UCSCodepoint < DelegateClass(Integer)
|
4
|
+
# Returns a Boolean indicating whether this UCS codepoint represents a kanji
|
5
|
+
# character.
|
6
|
+
def kanji?
|
7
|
+
(self >= 0x4e00 && self <= 0x9fbf) ||
|
8
|
+
(self >= 0x3400 && self <= 0x4dbf) ||
|
9
|
+
(self >= 0x20000 && self <= 0x2a6df)
|
10
|
+
end
|
11
|
+
|
12
|
+
# Returns a Boolean indicating whether this UCS codepoint represents a
|
13
|
+
# hiragana or katakana character.
|
14
|
+
def kana?
|
15
|
+
(self >= 0x3040 && self <= 0x30ff) ||
|
16
|
+
(self >= 0x31f0 && self <= 0x31ff)
|
17
|
+
end
|
18
|
+
|
19
|
+
# Returns a Boolean indicating whether this UCS codepoint represents a
|
20
|
+
# full-width latin character.
|
21
|
+
def wide_latin?
|
22
|
+
self >= 0xff10 && self <= 0xff5a
|
23
|
+
end
|
24
|
+
|
25
|
+
# Returns an encoded string containing the character represented by this UCS
|
26
|
+
# codepoint. Currently only UTF-8 encoding is supported.
|
27
|
+
def to_s
|
28
|
+
unless $KCODE =~ /^u/i
|
29
|
+
raise ArgumentError, 'unrecognized encoding (only UTF-8 is supported at the moment)'
|
30
|
+
end
|
31
|
+
|
32
|
+
if self <= 0x7f
|
33
|
+
ch = ' '
|
34
|
+
ch[0] = to_i
|
35
|
+
elsif self <= 0x7ff
|
36
|
+
ch = ' '
|
37
|
+
ch[0] = ((self & 0x7c0) >> 6) | 0xc0
|
38
|
+
ch[1] = self & 0x3f | 0x80
|
39
|
+
elsif self <= 0xffff
|
40
|
+
ch = ' '
|
41
|
+
ch[0] = ((self & 0xf000) >> 12) | 0xe0
|
42
|
+
ch[1] = ((self & 0xfc0) >> 6) | 0x80
|
43
|
+
ch[2] = self & 0x3f | 0x80
|
44
|
+
else
|
45
|
+
ch = ' '
|
46
|
+
ch[0] = ((self & 0x1c0000) >> 18) | 0xf0
|
47
|
+
ch[1] = ((self & 0x3f000) >> 12) | 0x80
|
48
|
+
ch[2] = ((self & 0xfc0) >> 6) | 0x80
|
49
|
+
ch[3] = (self & 0x3f) | 0x80
|
50
|
+
end
|
51
|
+
return ch
|
52
|
+
end
|
53
|
+
|
54
|
+
def inspect
|
55
|
+
"#<#{self.class}:0x#{self.to_i.to_s(16)} #{self.to_s.inspect}>"
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
# A String with helpers for working in whole characters.
#
# "Native offset" below means an offset in the units used by String#index and
# String#length on the running Ruby: bytes on Ruby 1.8, characters on later
# versions.
class UnicodeString < String
  # Returns a Boolean indicating whether this character is a kanji character.
  # (This string must contain only one character.)
  def kanji?
    codepoint.kanji?
  end

  # Returns a Boolean indicating whether this character is a hiragana or
  # katakana character. (This string must contain only one character.)
  def kana?
    codepoint.kana?
  end

  # Returns a Boolean indicating whether this character is a full-width latin
  # character. (This string must contain only one character.)
  def wide_latin?
    codepoint.wide_latin?
  end

  # Returns the UCS codepoint of this character as a UCSCodepoint. Currently
  # only UTF-8 encoding is supported.
  #
  # Raises RangeError unless the string is exactly one character long. On
  # Ruby 1.8 (where no Encoding class exists) raises ArgumentError unless
  # $KCODE selects UTF-8.
  def codepoint
    unless defined?(Encoding)
      unless $KCODE =~ /^u/i
        raise ArgumentError, "unsupported encoding (#{$KCODE})"
      end
    end

    # The 'U*' unpack directive decodes UTF-8 sequences of any length, so no
    # per-length bit twiddling is needed.
    points = unpack('U*')
    unless points.length == 1
      raise RangeError, "string must be exactly one character long"
    end

    UCSCodepoint.new(points.first)
  end

  # Like index, but returns a character offset instead of a native offset.
  # The starting offset is also in characters. Returns +nil+ if +substr+ is
  # not found.
  def uindex(substr, uoffset = 0)
    index_to_uindex(index(substr, uindex_to_index(uoffset)))
  end

  # Like slice, but takes a character offset and length. Can't handle
  # negative lengths. Returns a plain String.
  def uslice(uoffset, ulength)
    offset = uindex_to_index(uoffset)
    # Take everything from the offset to the end, then keep +ulength+ chars.
    tail = slice(offset, length)
    tail.split('')[0, ulength].join('')
  end

  # Converts a native offset to a character offset. The offset must be
  # greater than or equal to zero and less than or equal to the length of the
  # string, otherwise RangeError is raised. Returns +nil+ if the offset is
  # +nil+ or falls in the middle of a character.
  def index_to_uindex(byte_index)
    return nil if byte_index.nil?
    if byte_index < 0 || byte_index > length
      raise RangeError, 'index out of range'
    end

    remaining = byte_index
    char_index = 0
    split('').each do |ch|
      break if remaining == 0
      remaining -= ch.length
      return nil if remaining < 0
      char_index += 1
    end
    char_index
  end

  # Converts a character offset to a native offset. The character offset must
  # be greater than or equal to zero and less than or equal to the character
  # length of the string, otherwise RangeError is raised. Returns +nil+ if
  # the offset is +nil+.
  def uindex_to_index(char_index)
    return nil if char_index.nil?

    # The character count comes from the same split used for the conversion,
    # so no separate character-length helper is required.
    chars = split('')
    if char_index < 0 || char_index > chars.length
      raise RangeError, 'index out of range'
    end

    chars[0, char_index].inject(0) { |total, ch| total + ch.length }
  end
end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: unicode_madness
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 1
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
version: 1.0.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Dana Contreras
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-03-27 00:00:00 -04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description:
|
22
|
+
email:
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- lib/unicode_madness.rb
|
31
|
+
- lib/unicode_madness/ucs_codepoint.rb
|
32
|
+
- lib/unicode_madness/unicode_string.rb
|
33
|
+
- lib/unicode_madness/japanese_string.rb
|
34
|
+
has_rdoc: true
|
35
|
+
homepage: http://github.com/DanaDanger/unicode_madness
|
36
|
+
licenses: []
|
37
|
+
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
segments:
|
48
|
+
- 0
|
49
|
+
version: "0"
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 1.3.6
|
61
|
+
signing_key:
|
62
|
+
specification_version: 3
|
63
|
+
summary: Madness? THIS.IS.UNICODE! (Plus some goodies for Japanese.)
|
64
|
+
test_files: []
|
65
|
+
|