unicode_madness 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,6 @@
1
# Entry point for the unicode_madness library.
#
# $KCODE and the 'jcode' extension exist only on Ruby 1.8. On Ruby 1.9 and
# later 'jcode' cannot be loaded (LoadError), so the 1.8-only setup is skipped
# whenever the Encoding class (introduced in 1.9) is present.
unless defined?(::Encoding)
  # Switch multibyte handling to UTF-8 unless the application already chose
  # an encoding of its own.
  $KCODE = 'UTF8' if $KCODE == 'NONE'
  require 'jcode'
end

require 'unicode_madness/ucs_codepoint'
require 'unicode_madness/unicode_string'
require 'unicode_madness/japanese_string'
@@ -0,0 +1,307 @@
1
# A UnicodeString with helpers for Japanese text: regular-expression character
# classes for kanji and kana, katakana-to-hiragana conversion, voicing and
# unvoicing of kana, romanization, and generation of ASCII sort keys.
#
# All tables are keyed by single UTF-8 characters.
class JapaneseString < UnicodeString
  # A string that can be used in a regular expression character class to match
  # any kanji character. (Example: <tt>/[#{KANJI_CLASS}]/</tt>)
  #
  # Covers U+4E00-U+9FBF, U+3400-U+4DBF and U+20000-U+2A6DF, the same ranges
  # UCSCodepoint#kanji? tests.
  KANJI_CLASS = (
    "#{UCSCodepoint.new(0x4e00)}-#{UCSCodepoint.new(0x9fbf)}" +
    "#{UCSCodepoint.new(0x3400)}-#{UCSCodepoint.new(0x4dbf)}" +
    "#{UCSCodepoint.new(0x20000)}-#{UCSCodepoint.new(0x2a6df)}"
  ).freeze

  # A string that can be used in a regular expression character class to match
  # any katakana character. (Example: <tt>/[#{KATAKANA_CLASS}]/</tt>)
  #
  # Starts at U+30A1 (small a). The range previously started at U+30A2 and so
  # failed to match that character.
  KATAKANA_CLASS =
    "#{UCSCodepoint.new(0x30a1)}-#{UCSCodepoint.new(0x30ff)}".freeze

  # A string that can be used in a regular expression character class to match
  # any hiragana or katakana character. (Example: <tt>/[#{KANA_CLASS}]/</tt>)
  KANA_CLASS = (
    "#{UCSCodepoint.new(0x3040)}-#{UCSCodepoint.new(0x30ff)}" +
    "#{UCSCodepoint.new(0x31f0)}-#{UCSCodepoint.new(0x31ff)}"
  ).freeze

  # Table for converting katakana to their equivalent hiragana.
  KATAKANA_TO_HIRAGANA = {
    'ア' => 'あ', 'イ' => 'い', 'ウ' => 'う', 'エ' => 'え', 'オ' => 'お', 'カ' => 'か',
    'キ' => 'き', 'ク' => 'く', 'ケ' => 'け', 'コ' => 'こ', 'サ' => 'さ', 'シ' => 'し',
    'ス' => 'す', 'セ' => 'せ', 'ソ' => 'そ', 'タ' => 'た', 'チ' => 'ち', 'ツ' => 'つ',
    'テ' => 'て', 'ト' => 'と', 'ナ' => 'な', 'ニ' => 'に', 'ヌ' => 'ぬ', 'ネ' => 'ね',
    'ノ' => 'の', 'ハ' => 'は', 'ヒ' => 'ひ', 'フ' => 'ふ', 'ヘ' => 'へ', 'ホ' => 'ほ',
    'マ' => 'ま', 'ミ' => 'み', 'ム' => 'む', 'メ' => 'め', 'モ' => 'も', 'ヤ' => 'や',
    'ユ' => 'ゆ', 'ヨ' => 'よ', 'ラ' => 'ら', 'リ' => 'り', 'ル' => 'る', 'レ' => 'れ',
    'ロ' => 'ろ', 'ワ' => 'わ', 'ヰ' => 'ゐ', 'ヱ' => 'ゑ', 'ヲ' => 'を', 'ン' => 'ん',
    'ガ' => 'が', 'ギ' => 'ぎ', 'グ' => 'ぐ', 'ゲ' => 'げ', 'ゴ' => 'ご', 'ザ' => 'ざ',
    'ジ' => 'じ', 'ズ' => 'ず', 'ゼ' => 'ぜ', 'ゾ' => 'ぞ', 'ダ' => 'だ', 'ヂ' => 'ぢ',
    'ヅ' => 'づ', 'デ' => 'で', 'ド' => 'ど', 'バ' => 'ば', 'ビ' => 'び', 'ブ' => 'ぶ',
    'ベ' => 'べ', 'ボ' => 'ぼ', 'パ' => 'ぱ', 'ピ' => 'ぴ', 'プ' => 'ぷ', 'ペ' => 'ぺ',
    'ポ' => 'ぽ', 'ァ' => 'ぁ', 'ィ' => 'ぃ', 'ゥ' => 'ぅ', 'ェ' => 'ぇ', 'ォ' => 'ぉ',
    'ャ' => 'ゃ', 'ュ' => 'ゅ', 'ョ' => 'ょ', 'ッ' => 'っ'
  }.freeze

  # Table for converting voiced (and semi-voiced) hiragana and katakana to
  # their unvoiced forms.
  UNVOICED_KANA = {
    'が' => 'か', 'ぎ' => 'き', 'ぐ' => 'く', 'げ' => 'け', 'ご' => 'こ', 'ざ' => 'さ',
    'じ' => 'し', 'ず' => 'す', 'ぜ' => 'せ', 'ぞ' => 'そ', 'だ' => 'た', 'ぢ' => 'ち',
    'づ' => 'つ', 'で' => 'て', 'ど' => 'と', 'ば' => 'は', 'び' => 'ひ', 'ぶ' => 'ふ',
    'べ' => 'へ', 'ぼ' => 'ほ', 'ぱ' => 'は', 'ぴ' => 'ひ', 'ぷ' => 'ふ', 'ぺ' => 'へ',
    'ぽ' => 'ほ', 'ヴ' => 'ウ', 'ガ' => 'カ', 'ギ' => 'キ', 'グ' => 'ク', 'ゲ' => 'ケ',
    'ゴ' => 'コ', 'ザ' => 'サ', 'ジ' => 'シ', 'ズ' => 'ス', 'ゼ' => 'セ', 'ゾ' => 'ソ',
    'ダ' => 'タ', 'ヂ' => 'チ', 'ヅ' => 'ツ', 'デ' => 'テ', 'ド' => 'ト', 'バ' => 'ハ',
    'ビ' => 'ヒ', 'ブ' => 'フ', 'ベ' => 'ヘ', 'ボ' => 'ホ', 'パ' => 'ハ', 'ピ' => 'ヒ',
    'プ' => 'フ', 'ペ' => 'ヘ', 'ポ' => 'ホ'
  }.freeze

  # Table for converting unvoiced hiragana and katakana to their voiced forms.
  #
  # Each key appears exactly once. The table used to list the h-row twice
  # (once with the voiced b-row value, once with the semi-voiced p-row value);
  # in a Hash literal the later duplicate wins, so the h-row was converted to
  # the p-row instead of the voiced b-row. Only the voiced values are kept.
  VOICED_KANA = {
    'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご', 'さ' => 'ざ',
    'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ', 'た' => 'だ', 'ち' => 'ぢ',
    'つ' => 'づ', 'て' => 'で', 'と' => 'ど', 'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ',
    'へ' => 'べ', 'ほ' => 'ぼ',
    'ウ' => 'ヴ', 'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ', 'ケ' => 'ゲ',
    'コ' => 'ゴ', 'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ', 'ソ' => 'ゾ',
    'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド', 'ハ' => 'バ',
    'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ'
  }.freeze

  # Builds the full-width form of a printable ASCII codepoint. The full-width
  # forms U+FF01.. mirror ASCII 0x21.. at a fixed offset of 0xFEE0. Generating
  # the keys from codepoints keeps them immune to editors or tools that
  # normalize full-width characters to plain ASCII.
  fullwidth = lambda { |ascii_code| [ascii_code + 0xfee0].pack('U') }

  # Full-width "!" through "}" mapped to their ASCII equivalents.
  fullwidth_to_ascii = (0x21..0x7d).inject({}) do |map, ascii_code|
    map[fullwidth.call(ascii_code)] = ascii_code.chr
    map
  end
  # The table has always mapped the full-width comma to a period.
  # NOTE(review): possibly intentional (romanize does not accept ","); confirm.
  fullwidth_to_ascii[fullwidth.call(0x2c)] = '.'

  # Maps kana to their romanized equivalents. Also maps full-width Latin
  # characters to their ASCII equivalents.
  KANA_ROMAJI_MAP = {
    "あ" => "a", "い" => "i", "う" => "u", "え" => "e", "お" => "o", "か" => "ka",
    "き" => "ki", "く" => "ku", "け" => "ke", "こ" => "ko", "さ" => "sa",
    "し" => "shi", "す" => "su", "せ" => "se", "そ" => "so", "た" => "ta",
    "ち" => "chi", "つ" => "tsu", "て" => "te", "と" => "to", "な" => "na",
    "に" => "ni", "ぬ" => "nu", "ね" => "ne", "の" => "no", "は" => "ha",
    "ひ" => "hi", "ふ" => "fu", "へ" => "he", "ほ" => "ho", "ま" => "ma",
    "み" => "mi", "む" => "mu", "め" => "me", "も" => "mo", "や" => "ya",
    "ゆ" => "yu", "よ" => "yo", "ら" => "ra", "り" => "ri", "る" => "ru",
    "れ" => "re", "ろ" => "ro", "わ" => "wa", "ゐ" => "wi", "ゑ" => "we",
    "を" => "wo", "ん" => "n", "が" => "ga", "ぎ" => "gi", "ぐ" => "gu",
    "げ" => "ge", "ご" => "go", "ざ" => "za", "じ" => "ji", "ず" => "zu",
    "ぜ" => "ze", "ぞ" => "zo", "だ" => "da", "ぢ" => "ji", "づ" => "zu",
    "で" => "de", "ど" => "do", "ば" => "ba", "び" => "bi", "ぶ" => "bu",
    "べ" => "be", "ぼ" => "bo", "ぱ" => "pa", "ぴ" => "pi", "ぷ" => "pu",
    "ぺ" => "pe", "ぽ" => "po", "ア" => "a", "イ" => "i", "ウ" => "u", "エ" => "e",
    "オ" => "o", "カ" => "ka", "キ" => "ki", "ク" => "ku", "ケ" => "ke",
    "コ" => "ko", "サ" => "sa", "シ" => "shi", "ス" => "su", "セ" => "se",
    "ソ" => "so", "タ" => "ta", "チ" => "chi", "ツ" => "tsu", "テ" => "te",
    "ト" => "to", "ナ" => "na", "ニ" => "ni", "ヌ" => "nu", "ネ" => "ne",
    "ノ" => "no", "ハ" => "ha", "ヒ" => "hi", "フ" => "fu", "ヘ" => "he",
    "ホ" => "ho", "マ" => "ma", "ミ" => "mi", "ム" => "mu", "メ" => "me",
    "モ" => "mo", "ヤ" => "ya", "ユ" => "yu", "ヨ" => "yo", "ラ" => "ra",
    "リ" => "ri", "ル" => "ru", "レ" => "re", "ロ" => "ro", "ワ" => "wa",
    "ヰ" => "wi", "ヱ" => "we", "ヲ" => "wo", "ン" => "n", "ガ" => "ga",
    "ギ" => "gi", "グ" => "gu", "ゲ" => "ge", "ゴ" => "go", "ザ" => "za",
    "ジ" => "ji", "ズ" => "zu", "ゼ" => "ze", "ゾ" => "zo", "ダ" => "da",
    "ヂ" => "ji", "ヅ" => "zu", "デ" => "de", "ド" => "do", "バ" => "ba",
    "ビ" => "bi", "ブ" => "bu", "ベ" => "be", "ボ" => "bo", "パ" => "pa",
    "ピ" => "pi", "プ" => "pu", "ペ" => "pe", "ポ" => "po", "ヴ" => "vu",
    "・" => " ", "〜" => "-"
  }.merge(fullwidth_to_ascii).freeze

  # Ordered [kana, romaji] pairs for two-character kana combinations. romanize
  # applies them in this order, before any single-character conversion.
  #
  # The first group is written as [hiragana, katakana, romaji] triples and is
  # expanded into two pairs each.
  ROMAJI_DIGRAPHS = (
    [
      %w[きゃ キャ kya], %w[きゅ キュ kyu], %w[きょ キョ kyo],
      %w[しゃ シャ sha], %w[しゅ シュ shu], %w[しぇ シェ she],
      %w[しょ ショ sho],
      %w[ちゃ チャ cha], %w[ちゅ チュ chu], %w[ちょ チョ cho],
      %w[にゃ ニャ nya], %w[にゅ ニュ nyu], %w[にょ ニョ nyo],
      %w[ひゃ ヒャ hya], %w[ひゅ ヒュ hyu], %w[ひょ ヒョ hyo],
      %w[みゃ ミャ mya], %w[みゅ ミュ myu], %w[みょ ミョ myo],
      %w[りゃ リャ rya], %w[りゅ リュ ryu], %w[りょ リョ ryo],
      %w[ぎゃ ギャ gya], %w[ぎゅ ギュ gyu], %w[ぎょ ギョ gyo],
      %w[じゃ ジャ ja], %w[じゅ ジュ ju], %w[じょ ジョ jo],
      %w[ぢゃ ヂャ ja], %w[ぢゅ ヂュ ju], %w[ぢょ ヂョ jo],
      %w[びゃ ビャ bya], %w[びゅ ビュ byu], %w[びょ ビョ byo],
      %w[ぴゃ ピャ pya], %w[ぴゅ ピュ pyu], %w[ぴょ ピョ pyo]
    ].inject([]) do |pairs, (hiragana, katakana, romaji)|
      pairs << [hiragana, romaji] << [katakana, romaji]
    end +
    # Extended kana. Entries that could never fire (because an earlier entry
    # had already consumed the same kana pair) have been dropped; the
    # effective result for every input is unchanged, except that the hiragana
    # "de + small i" now yields 'di' like its katakana twin (it used to yield
    # the unvoiced 'ti').
    # NOTE(review): "tsu + small i" yields 'tsui' because that entry came
    # first; a later, unreachable entry said 'tsi'. Confirm which is wanted.
    [
      %w[ふぁ fa], %w[でぃ di], %w[イェ ye], %w[ウィ wi], %w[ウェ we],
      %w[ウォ wo], %w[ヴァ va], %w[ヴィ vi], %w[ヴゥ vu], %w[ヴェ ve],
      %w[ヴォ vo], %w[ジェ je], %w[チェ che], %w[ティ ti], %w[トゥ tu],
      %w[ディ di], %w[ドゥ du], %w[デュ dyu], %w[ツァ tsa], %w[ツェ tse],
      %w[ツォ tso], %w[ファ fa], %w[フィ fi], %w[フェ fe], %w[フォ fo],
      %w[フュ fyu], %w[スィ si], %w[ゲィ gei], %w[ワァ waa], %w[ツィ tsui],
      %w[シィ shii], %w[ウァ ua], %w[ヴュ vyu], %w[クォ quo], %w[テュ tu],
      %w[グィ gui], %w[クェ que], %w[ビィ bii], %w[ズィ zi], %w[リィ rii]
    ]
  ).freeze

  # Full-width digits and letters folded to plain (lowercase) ASCII, for
  # KANA_SORT_MAP.
  fullwidth_sort_folds = {}
  (0x30..0x39).each { |code| fullwidth_sort_folds[fullwidth.call(code)] = code.chr }
  (0x41..0x5a).each { |code| fullwidth_sort_folds[fullwidth.call(code)] = code.chr.downcase }
  (0x61..0x7a).each { |code| fullwidth_sort_folds[fullwidth.call(code)] = code.chr }

  # Table for creating kana sort keys. See kana_sort_key. Kana map to their
  # position in gojuon order (voiced, semi-voiced and katakana forms share the
  # position of the plain hiragana); full-width digits and letters map to the
  # ASCII character they should be sorted as.
  KANA_SORT_MAP = {
    "あ" => 0, "い" => 1, "う" => 2, "え" => 3, "お" => 4, "か" => 5, "き" => 6,
    "く" => 7, "け" => 8, "こ" => 9, "さ" => 10, "し" => 11, "す" => 12, "せ" => 13,
    "そ" => 14, "た" => 15, "ち" => 16, "つ" => 17, "て" => 18, "と" => 19,
    "な" => 20, "に" => 21, "ぬ" => 22, "ね" => 23, "の" => 24, "は" => 25,
    "ひ" => 26, "ふ" => 27, "へ" => 28, "ほ" => 29, "ま" => 30, "み" => 31,
    "む" => 32, "め" => 33, "も" => 34, "や" => 35, "ゆ" => 36, "よ" => 37,
    "ら" => 38, "り" => 39, "る" => 40, "れ" => 41, "ろ" => 42, "わ" => 43,
    "ゐ" => 44, "ゑ" => 45, "を" => 46, "ん" => 47, "が" => 5, "ぎ" => 6, "ぐ" => 7,
    "げ" => 8, "ご" => 9, "ざ" => 10, "じ" => 11, "ず" => 12, "ぜ" => 13, "ぞ" => 14,
    "だ" => 15, "ぢ" => 16, "づ" => 17, "で" => 18, "ど" => 19, "ば" => 25,
    "び" => 26, "ぶ" => 27, "べ" => 28, "ぼ" => 29, "ぱ" => 25, "ぴ" => 26,
    "ぷ" => 27, "ぺ" => 28, "ぽ" => 29, "ア" => 0, "イ" => 1, "ウ" => 2, "エ" => 3,
    "オ" => 4, "カ" => 5, "キ" => 6, "ク" => 7, "ケ" => 8, "コ" => 9, "サ" => 10,
    "シ" => 11, "ス" => 12, "セ" => 13, "ソ" => 14, "タ" => 15, "チ" => 16,
    "ツ" => 17, "テ" => 18, "ト" => 19, "ナ" => 20, "ニ" => 21, "ヌ" => 22,
    "ネ" => 23, "ノ" => 24, "ハ" => 25, "ヒ" => 26, "フ" => 27, "ヘ" => 28,
    "ホ" => 29, "マ" => 30, "ミ" => 31, "ム" => 32, "メ" => 33, "モ" => 34,
    "ヤ" => 35, "ユ" => 36, "ヨ" => 37, "ラ" => 38, "リ" => 39, "ル" => 40,
    "レ" => 41, "ロ" => 42, "ワ" => 43, "ヰ" => 44, "ヱ" => 45, "ヲ" => 46,
    "ン" => 47, "ガ" => 5, "ギ" => 6, "グ" => 7, "ゲ" => 8, "ゴ" => 9, "ザ" => 10,
    "ジ" => 11, "ズ" => 12, "ゼ" => 13, "ゾ" => 14, "ダ" => 15, "ヂ" => 16,
    "ヅ" => 17, "デ" => 18, "ド" => 19, "バ" => 25, "ビ" => 26, "ブ" => 27,
    "ベ" => 28, "ボ" => 29, "パ" => 25, "ピ" => 26, "プ" => 27, "ペ" => 28,
    "ポ" => 29, "ヴ" => 2
  }.merge(fullwidth_sort_folds).freeze

  # Returns a new string with this string's katakana replaced with equivalent
  # hiragana.
  def to_hiragana
    translate_chars(KATAKANA_TO_HIRAGANA)
  end

  # Returns a new string with this string's voiced hiragana and katakana
  # replaced with their unvoiced forms.
  def unvoice_kana
    translate_chars(UNVOICED_KANA)
  end

  # Returns a new string with this string's unvoiced hiragana and katakana
  # replaced with their voiced forms.
  def voice_kana
    translate_chars(VOICED_KANA)
  end

  # Creates a new string by romanizing the kana in this string. Full-width
  # Latin characters are also converted to their ASCII equivalents. If
  # +warnings+ is true (the default), a message is printed on +STDERR+ if an
  # un-romanizable character is encountered.
  #
  # Returns a plain String, not a JapaneseString.
  def romanize(warnings = true)
    romanized = String.new(self)

    # Two-character combinations first, so that their second (small) kana is
    # consumed before the per-character pass below sees it.
    ROMAJI_DIGRAPHS.each { |kana, romaji| romanized.gsub!(kana, romaji) }

    # Per-character conversions. Anything that is still not a letter, digit,
    # one of "&", ".", "-", " ", or a doubling mark after the lookup is
    # reported as un-romanizable, but is kept in the output.
    converted = romanized.split('').map do |ch|
      replacement = KANA_ROMAJI_MAP.fetch(ch, ch)
      if warnings && replacement !~ /\A[A-Za-z&\d\.\-ッっー ]+\z/
        STDERR.puts "Couldn't romanize #{ch} in #{self}"
      end
      replacement
    end
    romanized = converted.join('')

    # Convert letter-doublers (small tsu and katakana dash): small tsu doubles
    # the character after it, the long-vowel dash doubles the one before it.
    romanized.gsub!(/[ッっ](.)/, '\1\1')
    romanized.gsub!(/(.)ー/, '\1\1')

    romanized
  end

  # Creates a 7-bit-safe string that can be used to sort strings containing
  # kana and/or English text.
  #
  # The string is downcased, full-width digits and letters are folded to
  # ASCII, and each character becomes one key byte:
  # * digits "0".."9" become "!" through "*" (byte value minus 15)
  # * letters "a".."z" become "+" through "D" (byte value minus 54)
  # * kana become "F" through "u" (KANA_SORT_MAP position plus 70)
  # Characters in none of these groups contribute nothing to the key.
  def kana_sort_key
    key_chars = downcase.split('').map do |ch|
      mapped = KANA_SORT_MAP[ch]
      # A String entry is a full-width character's ASCII stand-in; continue
      # with that as if it had been typed directly.
      ch = mapped if mapped.kind_of?(String)

      if ch =~ /[0-9]/
        (ch.unpack('C').first - 15).chr
      elsif ch =~ /[a-z]/
        (ch.unpack('C').first - 54).chr
      elsif mapped.kind_of?(Numeric)
        (mapped + 70).chr
      end
    end
    key_chars.compact.join('')
  end

  private

  # Returns a new string of this string's class in which every character that
  # is a key of +table+ has been replaced by the table's value; all other
  # characters are copied unchanged.
  def translate_chars(table)
    self.class.new(split('').map { |ch| table.fetch(ch, ch) }.join(''))
  end
end
@@ -0,0 +1,57 @@
1
+ require 'delegate'
2
+
3
# An Integer (wrapped with DelegateClass) that represents a single UCS
# codepoint. Adds character-class predicates and conversion to the encoded
# character.
class UCSCodepoint < DelegateClass(Integer)
  # Returns a Boolean indicating whether this UCS codepoint represents a kanji
  # character (U+4E00-U+9FBF, U+3400-U+4DBF or U+20000-U+2A6DF).
  def kanji?
    code = to_i
    (code >= 0x4e00 && code <= 0x9fbf) ||
      (code >= 0x3400 && code <= 0x4dbf) ||
      (code >= 0x20000 && code <= 0x2a6df)
  end

  # Returns a Boolean indicating whether this UCS codepoint represents a
  # hiragana or katakana character (U+3040-U+30FF or U+31F0-U+31FF).
  def kana?
    code = to_i
    (code >= 0x3040 && code <= 0x30ff) ||
      (code >= 0x31f0 && code <= 0x31ff)
  end

  # Returns a Boolean indicating whether this UCS codepoint represents a
  # full-width latin character (U+FF10-U+FF5A).
  def wide_latin?
    code = to_i
    code >= 0xff10 && code <= 0xff5a
  end

  # Returns an encoded string containing the character represented by this UCS
  # codepoint. Currently only UTF-8 encoding is supported.
  #
  # Raises ArgumentError on Ruby 1.8 when $KCODE is not set to UTF-8. On Ruby
  # 1.9 and later $KCODE no longer exists, so the check is skipped there.
  def to_s
    if !defined?(::Encoding) && $KCODE !~ /^u/i
      raise ArgumentError, 'unrecognized encoding (only UTF-8 is supported at the moment)'
    end

    # Array#pack('U') produces the UTF-8 encoding of a codepoint on every Ruby
    # version. The previous hand-rolled version stored Integers into a String
    # with []=, which only Ruby 1.8 accepts.
    [to_i].pack('U')
  end

  # Returns a debugging representation showing the codepoint in hex and the
  # character itself, e.g. <tt>#<UCSCodepoint:0x41 "A"></tt>.
  def inspect
    "#<#{self.class}:0x#{to_i.to_s(16)} #{to_s.inspect}>"
  end
end
@@ -0,0 +1,105 @@
1
# A String that is treated as UTF-8 text and can be addressed by character as
# well as by byte.
#
# The implementation does not depend on $KCODE/jcode or on String#[] returning
# Integers, so it behaves the same on Ruby 1.8 (where String methods count
# bytes) and on Ruby 1.9 and later (where they count characters).
class UnicodeString < String
  # Returns a Boolean indicating whether this character is a kanji character.
  # (This string must contain only one character.)
  def kanji?
    codepoint.kanji?
  end

  # Returns a Boolean indicating whether this character is a hiragana or
  # katakana character. (This string must contain only one character.)
  def kana?
    codepoint.kana?
  end

  # Returns a Boolean indicating whether this character is a full-width latin
  # character. (This string must contain only one character.)
  def wide_latin?
    codepoint.wide_latin?
  end

  # Returns the UCS codepoint of this character as a UCSCodepoint. (This
  # string must contain only one character.) Currently only UTF8 encoding is
  # supported.
  #
  # Raises ArgumentError on Ruby 1.8 when $KCODE is not UTF-8, and RangeError
  # when the string is not exactly one character long.
  def codepoint
    if !defined?(::Encoding) && $KCODE !~ /^u/i
      raise ArgumentError, "unsupported encoding (#{$KCODE})"
    end

    # unpack('U*') decodes the bytes as UTF-8 on every Ruby version.
    codepoints = unpack('U*')
    unless codepoints.length == 1
      raise RangeError, "string must be exactly one character long"
    end

    UCSCodepoint.new(codepoints.first)
  end

  # Like index, but returns a character offset instead of a byte offset. The
  # starting offset is also in characters instead of bytes. Returns +nil+ if
  # +substr+ is not found. Raises RangeError if +uoffset+ is out of range.
  def uindex(substr, uoffset = 0)
    byte_offset = uindex_to_index(uoffset) # also range-checks uoffset
    # When String#index already works in characters there is nothing to
    # convert.
    return index(substr, uoffset) if character_strings?

    index_to_uindex(index(substr, byte_offset))
  end

  # Like slice, but takes a character offset and length (instead of bytes).
  # Can't handle negative lengths. Raises RangeError if +uoffset+ is out of
  # range. Returns a plain String.
  def uslice(uoffset, ulength)
    uindex_to_index(uoffset) # range check only
    uchars[uoffset, ulength].join('')
  end

  # Converts a byte offset to a character offset. The byte offset must be
  # greater than or equal to zero and less than or equal to the byte length of
  # the string. Returns +nil+ if the offset is in the middle of a character.
  def index_to_uindex(byte_index)
    return nil if byte_index.nil?
    if byte_index < 0 || byte_index > byte_length(self)
      raise RangeError, 'index out of range'
    end

    chars = uchars
    remaining = byte_index
    chars.each_with_index do |ch, char_index|
      return char_index if remaining == 0
      remaining -= byte_length(ch)
      # Overshooting means the offset pointed inside this character.
      return nil if remaining < 0
    end
    # Only reachable when the offset is exactly the end of the string.
    chars.length
  end

  # Converts a character offset to a byte offset. The character offset must be
  # greater than or equal to zero and less than or equal to the character
  # length of the string.
  def uindex_to_index(char_index)
    return nil if char_index.nil?

    chars = uchars
    if char_index < 0 || char_index > chars.length
      raise RangeError, 'index out of range'
    end

    chars[0, char_index].inject(0) { |bytes, ch| bytes + byte_length(ch) }
  end

  private

  # Returns this string's characters as an array of one-character strings.
  # The explicit /u flag makes the split UTF-8 aware regardless of $KCODE.
  def uchars
    split(//u)
  end

  # Returns the number of bytes in +str+ on any Ruby version.
  def byte_length(str)
    str.unpack('C*').length
  end

  # True when the built-in String methods (index, length, slice) count
  # characters rather than bytes, i.e. on Ruby 1.9 and later.
  def character_strings?
    defined?(::Encoding) ? true : false
  end
end
metadata ADDED
@@ -0,0 +1,65 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicode_madness
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 1
7
+ - 0
8
+ - 2
9
+ version: 1.0.2
10
+ platform: ruby
11
+ authors:
12
+ - Dana Contreras
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-27 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description:
22
+ email:
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - lib/unicode_madness.rb
31
+ - lib/unicode_madness/ucs_codepoint.rb
32
+ - lib/unicode_madness/unicode_string.rb
33
+ - lib/unicode_madness/japanese_string.rb
34
+ has_rdoc: true
35
+ homepage: http://github.com/DanaDanger/unicode_madness
36
+ licenses: []
37
+
38
+ post_install_message:
39
+ rdoc_options: []
40
+
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ segments:
48
+ - 0
49
+ version: "0"
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ requirements: []
58
+
59
+ rubyforge_project:
60
+ rubygems_version: 1.3.6
61
+ signing_key:
62
+ specification_version: 3
63
+ summary: Madness? THIS.IS.UNICODE! (Plus some goodies for Japanese.)
64
+ test_files: []
65
+