babosa 1.0.4 → 2.0.0.beta
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Changelog.md +12 -0
- data/README.md +80 -117
- data/Rakefile +9 -8
- data/lib/babosa.rb +2 -4
- data/lib/babosa/identifier.rb +82 -121
- data/lib/babosa/transliterator/base.rb +57 -56
- data/lib/babosa/transliterator/bulgarian.rb +3 -2
- data/lib/babosa/transliterator/cyrillic.rb +5 -5
- data/lib/babosa/transliterator/danish.rb +3 -3
- data/lib/babosa/transliterator/german.rb +3 -2
- data/lib/babosa/transliterator/greek.rb +4 -3
- data/lib/babosa/transliterator/hindi.rb +3 -2
- data/lib/babosa/transliterator/latin.rb +5 -5
- data/lib/babosa/transliterator/macedonian.rb +3 -2
- data/lib/babosa/transliterator/norwegian.rb +3 -3
- data/lib/babosa/transliterator/romanian.rb +3 -2
- data/lib/babosa/transliterator/russian.rb +3 -2
- data/lib/babosa/transliterator/serbian.rb +29 -27
- data/lib/babosa/transliterator/spanish.rb +2 -2
- data/lib/babosa/transliterator/swedish.rb +3 -3
- data/lib/babosa/transliterator/turkish.rb +8 -8
- data/lib/babosa/transliterator/ukrainian.rb +5 -4
- data/lib/babosa/transliterator/vietnamese.rb +4 -3
- data/lib/babosa/version.rb +3 -1
- data/spec/{babosa_spec.rb → identifier_spec.rb} +9 -10
- data/spec/spec_helper.rb +6 -6
- data/spec/transliterators/base_spec.rb +5 -6
- data/spec/transliterators/bulgarian_spec.rb +4 -5
- data/spec/transliterators/danish_spec.rb +5 -6
- data/spec/transliterators/german_spec.rb +4 -5
- data/spec/transliterators/greek_spec.rb +7 -7
- data/spec/transliterators/hindi_spec.rb +7 -7
- data/spec/transliterators/latin_spec.rb +3 -4
- data/spec/transliterators/macedonian_spec.rb +3 -4
- data/spec/transliterators/norwegian_spec.rb +4 -4
- data/spec/transliterators/polish_spec.rb +3 -5
- data/spec/transliterators/romanian_spec.rb +5 -6
- data/spec/transliterators/russian_spec.rb +3 -4
- data/spec/transliterators/serbian_spec.rb +6 -7
- data/spec/transliterators/spanish_spec.rb +4 -5
- data/spec/transliterators/swedish_spec.rb +7 -7
- data/spec/transliterators/turkish_spec.rb +24 -24
- data/spec/transliterators/ukrainian_spec.rb +74 -75
- data/spec/transliterators/vietnamese_spec.rb +10 -10
- metadata +17 -38
- data/lib/babosa/utf8/active_support_proxy.rb +0 -38
- data/lib/babosa/utf8/dumb_proxy.rb +0 -49
- data/lib/babosa/utf8/java_proxy.rb +0 -22
- data/lib/babosa/utf8/mappings.rb +0 -193
- data/lib/babosa/utf8/proxy.rb +0 -125
- data/lib/babosa/utf8/unicode_proxy.rb +0 -23
- data/spec/utf8_proxy_spec.rb +0 -52
data/lib/babosa/utf8/java_proxy.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
module Babosa
|
2
|
-
module UTF8
|
3
|
-
# A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+.
|
4
|
-
module JavaProxy
|
5
|
-
extend Proxy
|
6
|
-
extend self
|
7
|
-
java_import java.text.Normalizer
|
8
|
-
|
9
|
-
def downcase(string)
|
10
|
-
string.to_java.to_lower_case.to_s
|
11
|
-
end
|
12
|
-
|
13
|
-
def upcase(string)
|
14
|
-
string.to_java.to_upper_case.to_s
|
15
|
-
end
|
16
|
-
|
17
|
-
def normalize_utf8(string)
|
18
|
-
Normalizer.normalize(string, Normalizer::Form::NFC).to_s
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
data/lib/babosa/utf8/mappings.rb
DELETED
@@ -1,193 +0,0 @@
|
|
1
|
-
module Babosa
|
2
|
-
module UTF8
|
3
|
-
|
4
|
-
# A small subset of the mappings provided by Unicode.org, limited to Latin
|
5
|
-
# characters. This is used for Babosa's default "dumb" UTF-8 support.
|
6
|
-
module Mappings
|
7
|
-
DOWNCASE = Hash[65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102,
|
8
|
-
71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110,
|
9
|
-
79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118,
|
10
|
-
87, 119, 88, 120, 89, 121, 90, 122, 181, 956, 192, 224, 193, 225, 194,
|
11
|
-
226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201,
|
12
|
-
233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208,
|
13
|
-
240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 216,
|
14
|
-
248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223,
|
15
|
-
[115, 115], 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267,
|
16
|
-
268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
|
17
|
-
282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295,
|
18
|
-
296, 297, 298, 299, 300, 301, 302, 303, 304, [105, 775], 306, 307, 308,
|
19
|
-
309, 310, 311, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
|
20
|
-
324, 325, 326, 327, 328, 329, [700, 110], 330, 331, 332, 333, 334, 335,
|
21
|
-
336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
|
22
|
-
350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
|
23
|
-
364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 255,
|
24
|
-
377, 378, 379, 380, 381, 382]
|
25
|
-
|
26
|
-
UPCASE = DOWNCASE.invert
|
27
|
-
|
28
|
-
COMPOSITION = {
|
29
|
-
[65,768] => 192,
|
30
|
-
[65,769] => 193,
|
31
|
-
[65,770] => 194,
|
32
|
-
[65,771] => 195,
|
33
|
-
[65,776] => 196,
|
34
|
-
[65,778] => 197,
|
35
|
-
[67,807] => 199,
|
36
|
-
[69,768] => 200,
|
37
|
-
[69,769] => 201,
|
38
|
-
[69,770] => 202,
|
39
|
-
[69,776] => 203,
|
40
|
-
[73,768] => 204,
|
41
|
-
[73,769] => 205,
|
42
|
-
[73,770] => 206,
|
43
|
-
[73,776] => 207,
|
44
|
-
[78,771] => 209,
|
45
|
-
[79,768] => 210,
|
46
|
-
[79,769] => 211,
|
47
|
-
[79,770] => 212,
|
48
|
-
[79,771] => 213,
|
49
|
-
[79,776] => 214,
|
50
|
-
[85,768] => 217,
|
51
|
-
[85,769] => 218,
|
52
|
-
[85,770] => 219,
|
53
|
-
[85,776] => 220,
|
54
|
-
[89,769] => 221,
|
55
|
-
[97,768] => 224,
|
56
|
-
[97,769] => 225,
|
57
|
-
[97,770] => 226,
|
58
|
-
[97,771] => 227,
|
59
|
-
[97,776] => 228,
|
60
|
-
[97,778] => 229,
|
61
|
-
[99,807] => 231,
|
62
|
-
[101,768] => 232,
|
63
|
-
[101,769] => 233,
|
64
|
-
[101,770] => 234,
|
65
|
-
[101,776] => 235,
|
66
|
-
[105,768] => 236,
|
67
|
-
[105,769] => 237,
|
68
|
-
[105,770] => 238,
|
69
|
-
[105,776] => 239,
|
70
|
-
[110,771] => 241,
|
71
|
-
[111,768] => 242,
|
72
|
-
[111,769] => 243,
|
73
|
-
[111,770] => 244,
|
74
|
-
[111,771] => 245,
|
75
|
-
[111,776] => 246,
|
76
|
-
[117,768] => 249,
|
77
|
-
[117,769] => 250,
|
78
|
-
[117,770] => 251,
|
79
|
-
[117,776] => 252,
|
80
|
-
[121,769] => 253,
|
81
|
-
[121,776] => 255,
|
82
|
-
[65,772] => 256,
|
83
|
-
[97,772] => 257,
|
84
|
-
[65,774] => 258,
|
85
|
-
[97,774] => 259,
|
86
|
-
[65,808] => 260,
|
87
|
-
[97,808] => 261,
|
88
|
-
[67,769] => 262,
|
89
|
-
[99,769] => 263,
|
90
|
-
[67,770] => 264,
|
91
|
-
[99,770] => 265,
|
92
|
-
[67,775] => 266,
|
93
|
-
[99,775] => 267,
|
94
|
-
[67,780] => 268,
|
95
|
-
[99,780] => 269,
|
96
|
-
[68,780] => 270,
|
97
|
-
[100,780] => 271,
|
98
|
-
[69,772] => 274,
|
99
|
-
[101,772] => 275,
|
100
|
-
[69,774] => 276,
|
101
|
-
[101,774] => 277,
|
102
|
-
[69,775] => 278,
|
103
|
-
[101,775] => 279,
|
104
|
-
[69,808] => 280,
|
105
|
-
[101,808] => 281,
|
106
|
-
[69,780] => 282,
|
107
|
-
[101,780] => 283,
|
108
|
-
[71,770] => 284,
|
109
|
-
[103,770] => 285,
|
110
|
-
[71,774] => 286,
|
111
|
-
[103,774] => 287,
|
112
|
-
[71,775] => 288,
|
113
|
-
[103,775] => 289,
|
114
|
-
[71,807] => 290,
|
115
|
-
[103,807] => 291,
|
116
|
-
[72,770] => 292,
|
117
|
-
[104,770] => 293,
|
118
|
-
[73,771] => 296,
|
119
|
-
[105,771] => 297,
|
120
|
-
[73,772] => 298,
|
121
|
-
[105,772] => 299,
|
122
|
-
[73,774] => 300,
|
123
|
-
[105,774] => 301,
|
124
|
-
[73,808] => 302,
|
125
|
-
[105,808] => 303,
|
126
|
-
[73,775] => 304,
|
127
|
-
[74,770] => 308,
|
128
|
-
[106,770] => 309,
|
129
|
-
[75,807] => 310,
|
130
|
-
[107,807] => 311,
|
131
|
-
[76,769] => 313,
|
132
|
-
[108,769] => 314,
|
133
|
-
[76,807] => 315,
|
134
|
-
[108,807] => 316,
|
135
|
-
[76,780] => 317,
|
136
|
-
[108,780] => 318,
|
137
|
-
[78,769] => 323,
|
138
|
-
[110,769] => 324,
|
139
|
-
[78,807] => 325,
|
140
|
-
[110,807] => 326,
|
141
|
-
[78,780] => 327,
|
142
|
-
[110,780] => 328,
|
143
|
-
[79,772] => 332,
|
144
|
-
[111,772] => 333,
|
145
|
-
[79,774] => 334,
|
146
|
-
[111,774] => 335,
|
147
|
-
[79,779] => 336,
|
148
|
-
[111,779] => 337,
|
149
|
-
[82,769] => 340,
|
150
|
-
[114,769] => 341,
|
151
|
-
[82,807] => 342,
|
152
|
-
[114,807] => 343,
|
153
|
-
[82,780] => 344,
|
154
|
-
[114,780] => 345,
|
155
|
-
[83,769] => 346,
|
156
|
-
[115,769] => 347,
|
157
|
-
[83,770] => 348,
|
158
|
-
[115,770] => 349,
|
159
|
-
[83,807] => 350,
|
160
|
-
[115,807] => 351,
|
161
|
-
[83,780] => 352,
|
162
|
-
[115,780] => 353,
|
163
|
-
[84,807] => 354,
|
164
|
-
[116,807] => 355,
|
165
|
-
[84,780] => 356,
|
166
|
-
[116,780] => 357,
|
167
|
-
[85,771] => 360,
|
168
|
-
[117,771] => 361,
|
169
|
-
[85,772] => 362,
|
170
|
-
[117,772] => 363,
|
171
|
-
[85,774] => 364,
|
172
|
-
[117,774] => 365,
|
173
|
-
[85,778] => 366,
|
174
|
-
[117,778] => 367,
|
175
|
-
[85,779] => 368,
|
176
|
-
[117,779] => 369,
|
177
|
-
[85,808] => 370,
|
178
|
-
[117,808] => 371,
|
179
|
-
[87,770] => 372,
|
180
|
-
[119,770] => 373,
|
181
|
-
[89,770] => 374,
|
182
|
-
[121,770] => 375,
|
183
|
-
[89,776] => 376,
|
184
|
-
[90,769] => 377,
|
185
|
-
[122,769] => 378,
|
186
|
-
[90,775] => 379,
|
187
|
-
[122,775] => 380,
|
188
|
-
[90,780] => 381,
|
189
|
-
[122,780] => 382
|
190
|
-
}
|
191
|
-
end
|
192
|
-
end
|
193
|
-
end
|
data/lib/babosa/utf8/proxy.rb
DELETED
@@ -1,125 +0,0 @@
|
|
1
|
-
module Babosa
|
2
|
-
module UTF8
|
3
|
-
|
4
|
-
autoload :JavaProxy, "babosa/utf8/java_proxy"
|
5
|
-
autoload :UnicodeProxy, "babosa/utf8/unicode_proxy"
|
6
|
-
autoload :ActiveSupportProxy, "babosa/utf8/active_support_proxy"
|
7
|
-
autoload :DumbProxy, "babosa/utf8/dumb_proxy"
|
8
|
-
|
9
|
-
# A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
|
10
|
-
# The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
|
11
|
-
module Proxy
|
12
|
-
CP1252 = {
|
13
|
-
128 => [226, 130, 172],
|
14
|
-
129 => nil,
|
15
|
-
130 => [226, 128, 154],
|
16
|
-
131 => [198, 146],
|
17
|
-
132 => [226, 128, 158],
|
18
|
-
133 => [226, 128, 166],
|
19
|
-
134 => [226, 128, 160],
|
20
|
-
135 => [226, 128, 161],
|
21
|
-
136 => [203, 134],
|
22
|
-
137 => [226, 128, 176],
|
23
|
-
138 => [197, 160],
|
24
|
-
139 => [226, 128, 185],
|
25
|
-
140 => [197, 146],
|
26
|
-
141 => nil,
|
27
|
-
142 => [197, 189],
|
28
|
-
143 => nil,
|
29
|
-
144 => nil,
|
30
|
-
145 => [226, 128, 152],
|
31
|
-
146 => [226, 128, 153],
|
32
|
-
147 => [226, 128, 156],
|
33
|
-
148 => [226, 128, 157],
|
34
|
-
149 => [226, 128, 162],
|
35
|
-
150 => [226, 128, 147],
|
36
|
-
151 => [226, 128, 148],
|
37
|
-
152 => [203, 156],
|
38
|
-
153 => [226, 132, 162],
|
39
|
-
154 => [197, 161],
|
40
|
-
155 => [226, 128, 186],
|
41
|
-
156 => [197, 147],
|
42
|
-
157 => nil,
|
43
|
-
158 => [197, 190],
|
44
|
-
159 => [197, 184]
|
45
|
-
}
|
46
|
-
|
47
|
-
# This is a stub for a method that should return a Unicode-aware
|
48
|
-
# downcased version of the given string.
|
49
|
-
def downcase(string)
|
50
|
-
raise NotImplementedError
|
51
|
-
end
|
52
|
-
|
53
|
-
# This is a stub for a method that should return a Unicode-aware
|
54
|
-
# upcased version of the given string.
|
55
|
-
def upcase(string)
|
56
|
-
raise NotImplementedError
|
57
|
-
end
|
58
|
-
|
59
|
-
# This is a stub for a method that should return the Unicode NFC
|
60
|
-
# normalization of the given string.
|
61
|
-
def normalize_utf8(string)
|
62
|
-
raise NotImplementedError
|
63
|
-
end
|
64
|
-
|
65
|
-
if ''.respond_to?(:scrub) && !defined?(Rubinius)
|
66
|
-
# Attempt to replace invalid UTF-8 bytes with valid ones. This method
|
67
|
-
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
68
|
-
# CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
|
69
|
-
# always work.
|
70
|
-
def tidy_bytes(string)
|
71
|
-
string.scrub do |bad|
|
72
|
-
tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*')
|
73
|
-
end
|
74
|
-
end
|
75
|
-
else
|
76
|
-
def tidy_bytes(string)
|
77
|
-
bytes = string.unpack("C*")
|
78
|
-
conts_expected = 0
|
79
|
-
last_lead = 0
|
80
|
-
|
81
|
-
bytes.each_index do |i|
|
82
|
-
byte = bytes[i]
|
83
|
-
is_cont = byte > 127 && byte < 192
|
84
|
-
is_lead = byte > 191 && byte < 245
|
85
|
-
is_unused = byte > 240
|
86
|
-
is_restricted = byte > 244
|
87
|
-
|
88
|
-
# Impossible or highly unlikely byte? Clean it.
|
89
|
-
if is_unused || is_restricted
|
90
|
-
bytes[i] = tidy_byte(byte)
|
91
|
-
elsif is_cont
|
92
|
-
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
93
|
-
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
|
94
|
-
else
|
95
|
-
if conts_expected > 0
|
96
|
-
# Expected continuation, but got ASCII or leading? Clean backwards up to
|
97
|
-
# the leading byte.
|
98
|
-
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
99
|
-
conts_expected = 0
|
100
|
-
end
|
101
|
-
if is_lead
|
102
|
-
# Final byte is leading? Clean it.
|
103
|
-
if i == bytes.length - 1
|
104
|
-
bytes[i] = tidy_byte(bytes.last)
|
105
|
-
else
|
106
|
-
# Valid leading byte? Expect continuations determined by position of
|
107
|
-
# first zero bit, with max of 3.
|
108
|
-
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
|
109
|
-
last_lead = i
|
110
|
-
end
|
111
|
-
end
|
112
|
-
end
|
113
|
-
end
|
114
|
-
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
115
|
-
end
|
116
|
-
end
|
117
|
-
|
118
|
-
private
|
119
|
-
|
120
|
-
def tidy_byte(byte)
|
121
|
-
byte < 160 ? CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
125
|
-
end
|
data/lib/babosa/utf8/unicode_proxy.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'unicode'
|
2
|
-
|
3
|
-
module Babosa
|
4
|
-
module UTF8
|
5
|
-
# A UTF-8 proxy using the Unicode gem.
|
6
|
-
# @see http://github.com/blackwinter/unicode
|
7
|
-
module UnicodeProxy
|
8
|
-
extend Proxy
|
9
|
-
extend self
|
10
|
-
def downcase(string)
|
11
|
-
Unicode.downcase(string)
|
12
|
-
end
|
13
|
-
|
14
|
-
def upcase(string)
|
15
|
-
Unicode.upcase(string)
|
16
|
-
end
|
17
|
-
|
18
|
-
def normalize_utf8(string)
|
19
|
-
Unicode.normalize_C(string)
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
data/spec/utf8_proxy_spec.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require File.expand_path("../spec_helper", __FILE__)
|
3
|
-
|
4
|
-
PROXIES = [Babosa::UTF8::DumbProxy, Babosa::UTF8::ActiveSupportProxy, Babosa::UTF8::UnicodeProxy]
|
5
|
-
PROXIES << Babosa::UTF8::JavaProxy if Babosa.jruby15?
|
6
|
-
|
7
|
-
PROXIES.each do |proxy|
|
8
|
-
|
9
|
-
describe proxy do
|
10
|
-
|
11
|
-
around do |example|
|
12
|
-
begin
|
13
|
-
old_proxy = Babosa::Identifier.utf8_proxy
|
14
|
-
Babosa::Identifier.utf8_proxy = proxy
|
15
|
-
example.run
|
16
|
-
ensure
|
17
|
-
Babosa::Identifier.utf8_proxy = old_proxy
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
describe "#normalize_utf8" do
|
22
|
-
it "should normalize to canonical composed" do
|
23
|
-
# ÅÉÎØÜ
|
24
|
-
uncomposed_bytes = [65, 204, 138, 69, 204, 129, 73, 204, 130, 195, 152, 85, 204, 136]
|
25
|
-
composed_bytes = [195, 133, 195, 137, 195, 142, 195, 152, 195, 156]
|
26
|
-
uncomposed_string = uncomposed_bytes.pack("C*").unpack("U*").pack("U*")
|
27
|
-
expect(proxy.normalize_utf8(uncomposed_string).unpack("C*")).to eql(composed_bytes)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
describe "#upcase" do
|
32
|
-
it "should upcase the string" do
|
33
|
-
expect(proxy.upcase("åéîøü")).to eql("ÅÉÎØÜ")
|
34
|
-
expect("åéîøü".to_identifier.upcase).to eql("ÅÉÎØÜ")
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
describe "#downcase" do
|
39
|
-
it "should downcase the string" do
|
40
|
-
expect(proxy.downcase("ÅÉÎØÜ")).to eql("åéîøü")
|
41
|
-
expect("ÅÉÎØÜ".to_identifier.downcase).to eql("åéîøü")
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
describe 'tidy_bytes' do
|
46
|
-
it 'should fix invalid UTF-8 strings' do
|
47
|
-
expect(proxy.tidy_bytes("\x93abc")).to eq('“abc')
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
end
|