unicode_madness 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
# Ruby 1.8 has no per-string encodings: multibyte handling is switched on
# globally through $KCODE, and the jcode library adds String#jlength. On
# Ruby 1.9+ jcode no longer exists (requiring it raises LoadError) and $KCODE
# has no effect, so this setup is only performed when the interpreter lacks
# the Encoding class.
unless defined?(::Encoding)
  $KCODE = 'UTF8' if $KCODE == 'NONE'
  require 'jcode'
end

require 'unicode_madness/ucs_codepoint'
require 'unicode_madness/unicode_string'
require 'unicode_madness/japanese_string'
@@ -0,0 +1,307 @@
1
+ class JapaneseString < UnicodeString
2
# Fragment meant to be interpolated between the square brackets of a regular
# expression character class so that it matches any kanji character.
# (Example: <tt>/[#{KANJI_CLASS}]/</tt>)
KANJI_CLASS = [
  [0x4e00, 0x9fbf],    # CJK Unified Ideographs
  [0x3400, 0x4dbf],    # CJK Unified Ideographs Extension A
  [0x20000, 0x2a6df]   # CJK Unified Ideographs Extension B
].map { |first, last| "#{UCSCodepoint.new(first)}-#{UCSCodepoint.new(last)}" }.join
8
+
9
# A string that can be used in a regular expression character class to match
# any katakana character. (Example: <tt>/[#{KATAKANA_CLASS}]/</tt>)
#
# Covers the Katakana block from its first codepoint (U+30A0) so that small
# "a" (U+30A1) is matched too, plus the Katakana Phonetic Extensions
# (U+31F0-U+31FF), mirroring the ranges KANA_CLASS treats as kana.
KATAKANA_CLASS =
  "#{UCSCodepoint.new(0x30a0)}-#{UCSCodepoint.new(0x30ff)}" +
  "#{UCSCodepoint.new(0x31f0)}-#{UCSCodepoint.new(0x31ff)}"
12
+
13
# Fragment meant to be interpolated between the square brackets of a regular
# expression character class so that it matches any hiragana or katakana
# character. (Example: <tt>/[#{KANA_CLASS}]/</tt>)
KANA_CLASS = [
  [0x3040, 0x30ff],   # Hiragana and Katakana blocks (contiguous)
  [0x31f0, 0x31ff]    # Katakana Phonetic Extensions
].map { |first, last| "#{UCSCodepoint.new(first)}-#{UCSCodepoint.new(last)}" }.join
18
+
19
# Lookup table from each katakana to the hiragana with the same reading.
#
# The keys are the codepoints U+30A1 (small a) through U+30F3 (n), except
# U+30EE (small wa), which the table does not convert. Every hiragana value
# sits exactly 0x60 codepoints below its katakana key, so the table is
# generated rather than spelled out.
KATAKANA_TO_HIRAGANA = (0x30a1..0x30f3).inject({}) do |table, katakana|
  unless katakana == 0x30ee
    table[[katakana].pack('U')] = [katakana - 0x60].pack('U')
  end
  table
end
36
+
37
# Lookup table from voiced (and p-sound) hiragana and katakana to the plain
# kana they are derived from. The word list below is read as consecutive
# "voiced plain" pairs.
UNVOICED_KANA = Hash[*%w[
  が か  ぎ き  ぐ く  げ け  ご こ
  ざ さ  じ し  ず す  ぜ せ  ぞ そ
  だ た  ぢ ち  づ つ  で て  ど と
  ば は  び ひ  ぶ ふ  べ へ  ぼ ほ
  ぱ は  ぴ ひ  ぷ ふ  ぺ へ  ぽ ほ
  ヴ ウ
  ガ カ  ギ キ  グ ク  ゲ ケ  ゴ コ
  ザ サ  ジ シ  ズ ス  ゼ セ  ゾ ソ
  ダ タ  ヂ チ  ヅ ツ  デ テ  ド ト
  バ ハ  ビ ヒ  ブ フ  ベ ヘ  ボ ホ
  パ ハ  ピ ヒ  プ フ  ペ ヘ  ポ ホ
]]
49
+
50
# Table for converting unvoiced hiragana and katakana to their voiced forms.
#
# Each plain kana appears exactly once. The ha-row maps to its voiced b-sound
# (dakuten) form; the p-sound (handakuten) forms are deliberately not listed,
# because a Hash keeps only the last value for a repeated key and listing
# both would silently replace the voiced form.
VOICED_KANA = {
  'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご',
  'さ' => 'ざ', 'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ',
  'た' => 'だ', 'ち' => 'ぢ', 'つ' => 'づ', 'て' => 'で', 'と' => 'ど',
  'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ', 'へ' => 'べ', 'ほ' => 'ぼ',
  'ウ' => 'ヴ',
  'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ', 'ケ' => 'ゲ', 'コ' => 'ゴ',
  'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ', 'ソ' => 'ゾ',
  'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド',
  'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ'
}
62
+
63
# Maps kana to their romanized equivalents. Also maps full-width Latin
# characters to their ASCII equivalents.
#
# The table is assembled in three steps:
# 1. the hiragana readings listed below;
# 2. the katakana readings, derived from the hiragana ones (a katakana sits
#    0x60 codepoints above the hiragana with the same reading);
# 3. the full-width forms U+FF01-U+FF5D, each of which sits 0xFEE0 above the
#    ASCII character it stands for.
hiragana_romaji = {
  'あ' => 'a',  'い' => 'i',   'う' => 'u',   'え' => 'e',  'お' => 'o',
  'か' => 'ka', 'き' => 'ki',  'く' => 'ku',  'け' => 'ke', 'こ' => 'ko',
  'さ' => 'sa', 'し' => 'shi', 'す' => 'su',  'せ' => 'se', 'そ' => 'so',
  'た' => 'ta', 'ち' => 'chi', 'つ' => 'tsu', 'て' => 'te', 'と' => 'to',
  'な' => 'na', 'に' => 'ni',  'ぬ' => 'nu',  'ね' => 'ne', 'の' => 'no',
  'は' => 'ha', 'ひ' => 'hi',  'ふ' => 'fu',  'へ' => 'he', 'ほ' => 'ho',
  'ま' => 'ma', 'み' => 'mi',  'む' => 'mu',  'め' => 'me', 'も' => 'mo',
  'や' => 'ya', 'ゆ' => 'yu',  'よ' => 'yo',
  'ら' => 'ra', 'り' => 'ri',  'る' => 'ru',  'れ' => 're', 'ろ' => 'ro',
  'わ' => 'wa', 'ゐ' => 'wi',  'ゑ' => 'we',  'を' => 'wo', 'ん' => 'n',
  'が' => 'ga', 'ぎ' => 'gi',  'ぐ' => 'gu',  'げ' => 'ge', 'ご' => 'go',
  'ざ' => 'za', 'じ' => 'ji',  'ず' => 'zu',  'ぜ' => 'ze', 'ぞ' => 'zo',
  'だ' => 'da', 'ぢ' => 'ji',  'づ' => 'zu',  'で' => 'de', 'ど' => 'do',
  'ば' => 'ba', 'び' => 'bi',  'ぶ' => 'bu',  'べ' => 'be', 'ぼ' => 'bo',
  'ぱ' => 'pa', 'ぴ' => 'pi',  'ぷ' => 'pu',  'ぺ' => 'pe', 'ぽ' => 'po'
}

KANA_ROMAJI_MAP = {}
hiragana_romaji.each do |hiragana, romaji|
  katakana = [hiragana.unpack('U').first + 0x60].pack('U')
  KANA_ROMAJI_MAP[hiragana] = romaji
  KANA_ROMAJI_MAP[katakana] = romaji
end

# Katakana vu (U+30F4) has no hiragana entry in the table above.
KANA_ROMAJI_MAP[[0x30f4].pack('U')] = 'vu'

# Punctuation: the middle dot becomes a space, the wave dash a hyphen.
KANA_ROMAJI_MAP['・'] = ' '
KANA_ROMAJI_MAP['〜'] = '-'

# Full-width digits, letters and punctuation -> plain ASCII.
(0xff01..0xff5d).each do |wide|
  KANA_ROMAJI_MAP[[wide].pack('U')] = [wide - 0xfee0].pack('U')
end
# The table has always mapped the full-width comma to a period rather than to
# an ASCII comma. NOTE(review): possibly a typo in the original table; kept
# as-is to preserve romanize's output - confirm intent.
KANA_ROMAJI_MAP[[0xff0c].pack('U')] = '.'
112
+
113
# Returns a new string with this string's katakana replaced with equivalent
# hiragana. Characters without an entry in KATAKANA_TO_HIRAGANA are copied
# unchanged. The result is an instance of the receiver's class.
def to_hiragana
  # Convert character by character and join once, instead of growing a
  # string with += (which copies the whole string on every step).
  converted = split('').map { |ch| KATAKANA_TO_HIRAGANA.fetch(ch, ch) }
  self.class.new(converted.join(''))
end
126
+
127
# Returns a new string with this string's voiced hiragana and katakana
# replaced with their unvoiced forms. Characters without an entry in
# UNVOICED_KANA are copied unchanged. The result is an instance of the
# receiver's class.
def unvoice_kana
  # Convert character by character and join once, instead of growing a
  # string with += (which copies the whole string on every step).
  converted = split('').map { |ch| UNVOICED_KANA.fetch(ch, ch) }
  self.class.new(converted.join(''))
end
140
+
141
# Returns a new string with this string's unvoiced hiragana and katakana
# replaced with their voiced forms. Characters without an entry in
# VOICED_KANA are copied unchanged. The result is an instance of the
# receiver's class.
def voice_kana
  # Convert character by character and join once, instead of growing a
  # string with += (which copies the whole string on every step).
  converted = split('').map { |ch| VOICED_KANA.fetch(ch, ch) }
  self.class.new(converted.join(''))
end
154
+
155
# Kana followed by a small ya/yu/yo/e, which romanize as a single syllable.
# Columns: hiragana spelling, katakana spelling, romanization.
ROMAJI_DIGRAPHS = [
  %w[きゃ キャ kya], %w[きゅ キュ kyu], %w[きょ キョ kyo],
  %w[しゃ シャ sha], %w[しゅ シュ shu], %w[しぇ シェ she], %w[しょ ショ sho],
  %w[ちゃ チャ cha], %w[ちゅ チュ chu], %w[ちょ チョ cho],
  %w[にゃ ニャ nya], %w[にゅ ニュ nyu], %w[にょ ニョ nyo],
  %w[ひゃ ヒャ hya], %w[ひゅ ヒュ hyu], %w[ひょ ヒョ hyo],
  %w[みゃ ミャ mya], %w[みゅ ミュ myu], %w[みょ ミョ myo],
  %w[りゃ リャ rya], %w[りゅ リュ ryu], %w[りょ リョ ryo],
  %w[ぎゃ ギャ gya], %w[ぎゅ ギュ gyu], %w[ぎょ ギョ gyo],
  %w[じゃ ジャ ja],  %w[じゅ ジュ ju],  %w[じょ ジョ jo],
  %w[ぢゃ ヂャ ja],  %w[ぢゅ ヂュ ju],  %w[ぢょ ヂョ jo],
  %w[びゃ ビャ bya], %w[びゅ ビュ byu], %w[びょ ビョ byo],
  %w[ぴゃ ピャ pya], %w[ぴゅ ピュ pyu], %w[ぴょ ピョ pyo]
]

# Extended kana combinations (mostly used for loan words). Columns: kana
# spelling, romanization. Each spelling is listed once: the earlier list of
# substitutions repeated a few spellings with a different romanization
# (e.g. a second entry for ツィ), but only the first one could ever apply, so
# the first romanization is the one kept here.
ROMAJI_EXTENDED_KANA = [
  %w[ふぁ fa],  %w[でぃ di],
  %w[イェ ye],  %w[ウィ wi],  %w[ウェ we],  %w[ウォ wo],
  %w[ヴァ va],  %w[ヴィ vi],  %w[ヴゥ vu],  %w[ヴェ ve],  %w[ヴォ vo],
  %w[ジェ je],  %w[チェ che], %w[ティ ti],  %w[トゥ tu],
  %w[ディ di],  %w[ドゥ du],  %w[デュ dyu],
  %w[ツァ tsa], %w[ツェ tse], %w[ツォ tso],
  %w[ファ fa],  %w[フィ fi],  %w[フェ fe],  %w[フォ fo],  %w[フュ fyu],
  %w[スィ si],  %w[ゲィ gei], %w[ワァ waa], %w[ツィ tsui], %w[シィ shii],
  %w[ウァ ua],  %w[ヴュ vyu], %w[クォ quo], %w[テュ tu],  %w[グィ gui],
  %w[クェ que], %w[ビィ bii], %w[ズィ zi],  %w[リィ rii]
]

# Creates a new string by romanizing the kana in this string. Full-width
# Latin characters are also converted to their ASCII equivalents. If
# +warnings+ is true (the default), a message is printed on +STDERR+ if an
# un-romanizable character is encountered.
#
# Returns a plain String. Characters that cannot be romanized are left in
# the result unchanged.
def romanize(warnings = true)
  romanized = String.new(self)

  # Convert diphthongs and extended kana first, while the two kana are still
  # adjacent. No spelling overlaps another, so the order is not significant.
  ROMAJI_DIGRAPHS.each do |hiragana, katakana, romaji|
    romanized.gsub!(hiragana, romaji)
    romanized.gsub!(katakana, romaji)
  end
  ROMAJI_EXTENDED_KANA.each do |kana, romaji|
    romanized.gsub!(kana, romaji)
  end

  # Do simple conversions, one character at a time.
  pieces = romanized.split('').map do |ch|
    converted = KANA_ROMAJI_MAP.fetch(ch, ch)
    # Anything that is not Latin text, a digit, one of & . - or space, or a
    # letter-doubler handled below, counts as un-romanizable.
    if converted !~ /\A[A-Za-z&\d.\- ッっー]+\Z/
      STDERR.puts "Couldn't romanize #{ch} in #{self}" if warnings
    end
    converted
  end
  romanized = pieces.join('')

  # Convert letter-doublers (small tsu and katakana dash): a small tsu
  # doubles the character after it, a dash doubles the character before it.
  romanized.gsub!(/[ッっ](.)/, '\1\1')
  romanized.gsub!(/(.)ー/, '\1\1')

  romanized
end
241
+
242
# Creates a 7-bit-safe string that can be used to sort strings containing
# kana and/or English text.
#
# Each character of the downcased string contributes one ASCII character:
# * digits 0-9 become ! through *
# * letters a-z become + through D
# * kana become F through u (their numeric rank in KANA_SORT_MAP plus 70)
# * characters that KANA_SORT_MAP maps to a string (full-width digits and
#   letters) are treated as that ASCII string
# * everything else is dropped
def kana_sort_key
  key = ''
  downcase.split('').each do |ch|
    unless ch =~ /[0-9a-z]/
      entry = KANA_SORT_MAP[ch]
      if entry.kind_of?(Numeric)
        key << (entry + 70).chr
        next
      end
      # Neither kana nor a full-width form: contributes nothing to the key.
      next unless entry.kind_of?(String)
      # Full-width form: continue below with its ASCII replacement. The
      # replacement is looked up only once, so a table entry that maps to a
      # non-alphanumeric string cannot cause an endless loop.
      ch = entry
    end

    # unpack('C') yields the byte value on every Ruby version (String#[]
    # only returns an Integer on Ruby 1.8).
    byte = ch.unpack('C').first
    if ch =~ /[0-9]/
      key << (byte - 15).chr   # produces ! through *
    elsif ch =~ /[a-z]/
      key << (byte - 54).chr   # produces + through D
    end
  end
  key
end
267
+
268
+ private
269
+
270
# Table for creating kana sort keys. See kana_sort_key.
#
# * Kana map to an Integer rank (0-47) in gojuon order. Voiced and p-sound
#   kana share the rank of their plain form, and a katakana has the same
#   rank as the hiragana with the same reading.
# * Full-width digits and letters map to the ASCII String they stand for
#   (letters are lower-cased), which kana_sort_key then sorts as ASCII.
KANA_SORT_MAP = {}

# One word per rank; a word lists every hiragana that shares that rank.
%w[
  あ い う え お
  かが きぎ くぐ けげ こご
  さざ しじ すず せぜ そぞ
  ただ ちぢ つづ てで とど
  な に ぬ ね の
  はばぱ ひびぴ ふぶぷ へべぺ ほぼぽ
  ま み む め も
  や ゆ よ
  ら り る れ ろ
  わ ゐ ゑ を ん
].each_with_index do |group, rank|
  group.unpack('U*').each do |hiragana|
    KANA_SORT_MAP[[hiragana].pack('U')] = rank
    # The matching katakana sits 0x60 codepoints above the hiragana.
    KANA_SORT_MAP[[hiragana + 0x60].pack('U')] = rank
  end
end

# Katakana vu (U+30F4) sorts together with u.
KANA_SORT_MAP[[0x30f4].pack('U')] = 2

# Full-width forms sit 0xFEE0 above the ASCII character they stand for.
(0xff10..0xff19).each do |wide|   # full-width digits
  KANA_SORT_MAP[[wide].pack('U')] = [wide - 0xfee0].pack('U')
end
(0xff21..0xff3a).each do |wide|   # full-width capitals -> lower-case ASCII
  KANA_SORT_MAP[[wide].pack('U')] = [wide - 0xfee0 + 0x20].pack('U')
end
(0xff41..0xff5a).each do |wide|   # full-width lower-case letters
  KANA_SORT_MAP[[wide].pack('U')] = [wide - 0xfee0].pack('U')
end
307
+ end
@@ -0,0 +1,57 @@
1
+ require 'delegate'
2
+
3
# An Integer UCS (Unicode) codepoint with a few character-classification
# helpers and the ability to render itself as an encoded character. All
# Integer methods are delegated to the wrapped number.
class UCSCodepoint < DelegateClass(Integer)
  # Returns a Boolean indicating whether this UCS codepoint represents a kanji
  # character.
  def kanji?
    value = to_i
    value.between?(0x4e00, 0x9fbf) ||
      value.between?(0x3400, 0x4dbf) ||
      value.between?(0x20000, 0x2a6df)
  end

  # Returns a Boolean indicating whether this UCS codepoint represents a
  # hiragana or katakana character.
  def kana?
    value = to_i
    value.between?(0x3040, 0x30ff) || value.between?(0x31f0, 0x31ff)
  end

  # Returns a Boolean indicating whether this UCS codepoint represents a
  # full-width latin character.
  def wide_latin?
    to_i.between?(0xff10, 0xff5a)
  end

  # Returns an encoded string containing the character represented by this UCS
  # codepoint. Currently only UTF-8 encoding is supported.
  #
  # Raises ArgumentError on Ruby 1.8 when $KCODE is not set to UTF-8. On
  # Ruby 1.9+ ($KCODE is obsolete there) the string is always UTF-8.
  def to_s
    unless defined?(::Encoding) || $KCODE =~ /^u/i
      raise ArgumentError, 'unrecognized encoding (only UTF-8 is supported at the moment)'
    end

    # The 'U' pack directive emits the UTF-8 byte sequence for a codepoint,
    # whatever its length, so no per-length bit twiddling is needed.
    [to_i].pack('U')
  end

  # Shows the codepoint in hexadecimal followed by the character itself.
  def inspect
    "#<#{self.class}:0x#{to_i.to_s(16)} #{to_s.inspect}>"
  end
end
@@ -0,0 +1,105 @@
1
# A String with helpers that work in whole characters.
#
# Several methods convert between "byte offsets" and "character offsets".
# A byte offset here means an offset in the unit used by String#index,
# String#slice and String#length. On Ruby 1.8 that unit is the byte; on
# Ruby 1.9+ those methods already count characters, so both kinds of offset
# are the same number there.
class UnicodeString < String
  # Returns a Boolean indicating whether this character is a kanji character.
  # (This string must contain only one character.)
  def kanji?
    codepoint.kanji?
  end

  # Returns a Boolean indicating whether this character is a hiragana or
  # katakana character. (This string must contain only one character.)
  def kana?
    codepoint.kana?
  end

  # Returns a Boolean indicating whether this character is a full-width latin
  # character. (This string must contain only one character.)
  def wide_latin?
    codepoint.wide_latin?
  end

  # Returns the UCS codepoint of this character. (This string must contain only
  # one character.) Currently only UTF8 encoding is supported.
  #
  # Raises ArgumentError on Ruby 1.8 when $KCODE is not UTF-8, and RangeError
  # when the string does not hold exactly one character.
  def codepoint
    unless defined?(::Encoding) || $KCODE =~ /^u/i
      raise ArgumentError, "unsupported encoding (#{$KCODE})"
    end

    # The 'U' unpack directive decodes UTF-8 sequences of any length, so the
    # number of decoded values is the number of characters.
    decoded = unpack('U*')
    unless decoded.length == 1
      raise RangeError, "string must be exactly one character long"
    end
    UCSCodepoint.new(decoded.first)
  end

  # Like index, but returns a character offset instead of a byte offset. The
  # starting offset is also in characters instead of bytes. Returns +nil+
  # when +substr+ is not found; raises RangeError when +uoffset+ lies outside
  # the string.
  def uindex(substr, uoffset = 0)
    start = uindex_to_index(uoffset)
    index_to_uindex(index(substr, start))
  end

  # Like slice, but takes a character offset and length (instead of bytes).
  # Returns an instance of the receiver's class, or +nil+ for a negative
  # length (as String#slice does). Raises RangeError when +uoffset+ lies
  # outside the string.
  def uslice(uoffset, ulength)
    start = uindex_to_index(uoffset)
    return nil if ulength < 0

    # Take everything from the start offset, then keep the first +ulength+
    # characters of it.
    remainder = slice(start, length)
    self.class.new(remainder.split('')[0, ulength].join(''))
  end

  # Converts a byte offset to a character offset. The byte offset must be
  # greater than or equal to zero and less than or equal to the byte length of
  # the string. Returns +nil+ if the offset is in the middle of a character.
  def index_to_uindex(byte_index)
    return nil if byte_index.nil?
    if byte_index < 0 || byte_index > length
      raise RangeError, 'index out of range'
    end

    # Walk the characters, consuming each one's length from the remaining
    # offset; landing exactly on zero means a character boundary.
    char_index = 0
    remaining = byte_index
    split('').each do |ch|
      break if remaining == 0
      remaining -= ch.length
      return nil if remaining < 0
      char_index += 1
    end
    char_index
  end

  # Converts a character offset to a byte offset. The character offset must be
  # greater than or equal to zero and less than or equal to the character
  # length of the string.
  def uindex_to_index(char_index)
    return nil if char_index.nil?

    chars = split('')
    if char_index < 0 || char_index > chars.length
      raise RangeError, 'index out of range'
    end

    chars[0, char_index].inject(0) { |offset, ch| offset + ch.length }
  end
end
metadata ADDED
@@ -0,0 +1,65 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicode_madness
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 1
7
+ - 0
8
+ - 2
9
+ version: 1.0.2
10
+ platform: ruby
11
+ authors:
12
+ - Dana Contreras
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-27 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description:
22
+ email:
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - lib/unicode_madness.rb
31
+ - lib/unicode_madness/ucs_codepoint.rb
32
+ - lib/unicode_madness/unicode_string.rb
33
+ - lib/unicode_madness/japanese_string.rb
34
+ has_rdoc: true
35
+ homepage: http://github.com/DanaDanger/unicode_madness
36
+ licenses: []
37
+
38
+ post_install_message:
39
+ rdoc_options: []
40
+
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ segments:
48
+ - 0
49
+ version: "0"
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ requirements: []
58
+
59
+ rubyforge_project:
60
+ rubygems_version: 1.3.6
61
+ signing_key:
62
+ specification_version: 3
63
+ summary: Madness? THIS.IS.UNICODE! (Plus some goodies for Japanese.)
64
+ test_files: []
65
+