babosa 1.0.4 → 2.0.0.beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Changelog.md +12 -0
- data/README.md +80 -117
- data/Rakefile +9 -8
- data/lib/babosa.rb +2 -4
- data/lib/babosa/identifier.rb +82 -121
- data/lib/babosa/transliterator/base.rb +57 -56
- data/lib/babosa/transliterator/bulgarian.rb +3 -2
- data/lib/babosa/transliterator/cyrillic.rb +5 -5
- data/lib/babosa/transliterator/danish.rb +3 -3
- data/lib/babosa/transliterator/german.rb +3 -2
- data/lib/babosa/transliterator/greek.rb +4 -3
- data/lib/babosa/transliterator/hindi.rb +3 -2
- data/lib/babosa/transliterator/latin.rb +5 -5
- data/lib/babosa/transliterator/macedonian.rb +3 -2
- data/lib/babosa/transliterator/norwegian.rb +3 -3
- data/lib/babosa/transliterator/romanian.rb +3 -2
- data/lib/babosa/transliterator/russian.rb +3 -2
- data/lib/babosa/transliterator/serbian.rb +29 -27
- data/lib/babosa/transliterator/spanish.rb +2 -2
- data/lib/babosa/transliterator/swedish.rb +3 -3
- data/lib/babosa/transliterator/turkish.rb +8 -8
- data/lib/babosa/transliterator/ukrainian.rb +5 -4
- data/lib/babosa/transliterator/vietnamese.rb +4 -3
- data/lib/babosa/version.rb +3 -1
- data/spec/{babosa_spec.rb → identifier_spec.rb} +9 -10
- data/spec/spec_helper.rb +6 -6
- data/spec/transliterators/base_spec.rb +5 -6
- data/spec/transliterators/bulgarian_spec.rb +4 -5
- data/spec/transliterators/danish_spec.rb +5 -6
- data/spec/transliterators/german_spec.rb +4 -5
- data/spec/transliterators/greek_spec.rb +7 -7
- data/spec/transliterators/hindi_spec.rb +7 -7
- data/spec/transliterators/latin_spec.rb +3 -4
- data/spec/transliterators/macedonian_spec.rb +3 -4
- data/spec/transliterators/norwegian_spec.rb +4 -4
- data/spec/transliterators/polish_spec.rb +3 -5
- data/spec/transliterators/romanian_spec.rb +5 -6
- data/spec/transliterators/russian_spec.rb +3 -4
- data/spec/transliterators/serbian_spec.rb +6 -7
- data/spec/transliterators/spanish_spec.rb +4 -5
- data/spec/transliterators/swedish_spec.rb +7 -7
- data/spec/transliterators/turkish_spec.rb +24 -24
- data/spec/transliterators/ukrainian_spec.rb +74 -75
- data/spec/transliterators/vietnamese_spec.rb +10 -10
- metadata +17 -38
- data/lib/babosa/utf8/active_support_proxy.rb +0 -38
- data/lib/babosa/utf8/dumb_proxy.rb +0 -49
- data/lib/babosa/utf8/java_proxy.rb +0 -22
- data/lib/babosa/utf8/mappings.rb +0 -193
- data/lib/babosa/utf8/proxy.rb +0 -125
- data/lib/babosa/utf8/unicode_proxy.rb +0 -23
- data/spec/utf8_proxy_spec.rb +0 -52
data/lib/babosa/utf8/java_proxy.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
module Babosa
|
2
|
-
module UTF8
|
3
|
-
# A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+.
|
4
|
-
module JavaProxy
|
5
|
-
extend Proxy
|
6
|
-
extend self
|
7
|
-
java_import java.text.Normalizer
|
8
|
-
|
9
|
-
def downcase(string)
|
10
|
-
string.to_java.to_lower_case.to_s
|
11
|
-
end
|
12
|
-
|
13
|
-
def upcase(string)
|
14
|
-
string.to_java.to_upper_case.to_s
|
15
|
-
end
|
16
|
-
|
17
|
-
def normalize_utf8(string)
|
18
|
-
Normalizer.normalize(string, Normalizer::Form::NFC).to_s
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
data/lib/babosa/utf8/mappings.rb
DELETED
@@ -1,193 +0,0 @@
|
|
1
|
-
module Babosa
|
2
|
-
module UTF8
|
3
|
-
|
4
|
-
# A small subset of the mappings provided by Unicode.org, limited to Latin
|
5
|
-
# characters. This is used for Babosa's default "dumb" UTF-8 support.
|
6
|
-
module Mappings
|
7
|
-
DOWNCASE = Hash[65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102,
|
8
|
-
71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110,
|
9
|
-
79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118,
|
10
|
-
87, 119, 88, 120, 89, 121, 90, 122, 181, 956, 192, 224, 193, 225, 194,
|
11
|
-
226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201,
|
12
|
-
233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208,
|
13
|
-
240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 216,
|
14
|
-
248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223,
|
15
|
-
[115, 115], 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267,
|
16
|
-
268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
|
17
|
-
282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295,
|
18
|
-
296, 297, 298, 299, 300, 301, 302, 303, 304, [105, 775], 306, 307, 308,
|
19
|
-
309, 310, 311, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
|
20
|
-
324, 325, 326, 327, 328, 329, [700, 110], 330, 331, 332, 333, 334, 335,
|
21
|
-
336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
|
22
|
-
350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
|
23
|
-
364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 255,
|
24
|
-
377, 378, 379, 380, 381, 382]
|
25
|
-
|
26
|
-
UPCASE = DOWNCASE.invert
|
27
|
-
|
28
|
-
COMPOSITION = {
|
29
|
-
[65,768] => 192,
|
30
|
-
[65,769] => 193,
|
31
|
-
[65,770] => 194,
|
32
|
-
[65,771] => 195,
|
33
|
-
[65,776] => 196,
|
34
|
-
[65,778] => 197,
|
35
|
-
[67,807] => 199,
|
36
|
-
[69,768] => 200,
|
37
|
-
[69,769] => 201,
|
38
|
-
[69,770] => 202,
|
39
|
-
[69,776] => 203,
|
40
|
-
[73,768] => 204,
|
41
|
-
[73,769] => 205,
|
42
|
-
[73,770] => 206,
|
43
|
-
[73,776] => 207,
|
44
|
-
[78,771] => 209,
|
45
|
-
[79,768] => 210,
|
46
|
-
[79,769] => 211,
|
47
|
-
[79,770] => 212,
|
48
|
-
[79,771] => 213,
|
49
|
-
[79,776] => 214,
|
50
|
-
[85,768] => 217,
|
51
|
-
[85,769] => 218,
|
52
|
-
[85,770] => 219,
|
53
|
-
[85,776] => 220,
|
54
|
-
[89,769] => 221,
|
55
|
-
[97,768] => 224,
|
56
|
-
[97,769] => 225,
|
57
|
-
[97,770] => 226,
|
58
|
-
[97,771] => 227,
|
59
|
-
[97,776] => 228,
|
60
|
-
[97,778] => 229,
|
61
|
-
[99,807] => 231,
|
62
|
-
[101,768] => 232,
|
63
|
-
[101,769] => 233,
|
64
|
-
[101,770] => 234,
|
65
|
-
[101,776] => 235,
|
66
|
-
[105,768] => 236,
|
67
|
-
[105,769] => 237,
|
68
|
-
[105,770] => 238,
|
69
|
-
[105,776] => 239,
|
70
|
-
[110,771] => 241,
|
71
|
-
[111,768] => 242,
|
72
|
-
[111,769] => 243,
|
73
|
-
[111,770] => 244,
|
74
|
-
[111,771] => 245,
|
75
|
-
[111,776] => 246,
|
76
|
-
[117,768] => 249,
|
77
|
-
[117,769] => 250,
|
78
|
-
[117,770] => 251,
|
79
|
-
[117,776] => 252,
|
80
|
-
[121,769] => 253,
|
81
|
-
[121,776] => 255,
|
82
|
-
[65,772] => 256,
|
83
|
-
[97,772] => 257,
|
84
|
-
[65,774] => 258,
|
85
|
-
[97,774] => 259,
|
86
|
-
[65,808] => 260,
|
87
|
-
[97,808] => 261,
|
88
|
-
[67,769] => 262,
|
89
|
-
[99,769] => 263,
|
90
|
-
[67,770] => 264,
|
91
|
-
[99,770] => 265,
|
92
|
-
[67,775] => 266,
|
93
|
-
[99,775] => 267,
|
94
|
-
[67,780] => 268,
|
95
|
-
[99,780] => 269,
|
96
|
-
[68,780] => 270,
|
97
|
-
[100,780] => 271,
|
98
|
-
[69,772] => 274,
|
99
|
-
[101,772] => 275,
|
100
|
-
[69,774] => 276,
|
101
|
-
[101,774] => 277,
|
102
|
-
[69,775] => 278,
|
103
|
-
[101,775] => 279,
|
104
|
-
[69,808] => 280,
|
105
|
-
[101,808] => 281,
|
106
|
-
[69,780] => 282,
|
107
|
-
[101,780] => 283,
|
108
|
-
[71,770] => 284,
|
109
|
-
[103,770] => 285,
|
110
|
-
[71,774] => 286,
|
111
|
-
[103,774] => 287,
|
112
|
-
[71,775] => 288,
|
113
|
-
[103,775] => 289,
|
114
|
-
[71,807] => 290,
|
115
|
-
[103,807] => 291,
|
116
|
-
[72,770] => 292,
|
117
|
-
[104,770] => 293,
|
118
|
-
[73,771] => 296,
|
119
|
-
[105,771] => 297,
|
120
|
-
[73,772] => 298,
|
121
|
-
[105,772] => 299,
|
122
|
-
[73,774] => 300,
|
123
|
-
[105,774] => 301,
|
124
|
-
[73,808] => 302,
|
125
|
-
[105,808] => 303,
|
126
|
-
[73,775] => 304,
|
127
|
-
[74,770] => 308,
|
128
|
-
[106,770] => 309,
|
129
|
-
[75,807] => 310,
|
130
|
-
[107,807] => 311,
|
131
|
-
[76,769] => 313,
|
132
|
-
[108,769] => 314,
|
133
|
-
[76,807] => 315,
|
134
|
-
[108,807] => 316,
|
135
|
-
[76,780] => 317,
|
136
|
-
[108,780] => 318,
|
137
|
-
[78,769] => 323,
|
138
|
-
[110,769] => 324,
|
139
|
-
[78,807] => 325,
|
140
|
-
[110,807] => 326,
|
141
|
-
[78,780] => 327,
|
142
|
-
[110,780] => 328,
|
143
|
-
[79,772] => 332,
|
144
|
-
[111,772] => 333,
|
145
|
-
[79,774] => 334,
|
146
|
-
[111,774] => 335,
|
147
|
-
[79,779] => 336,
|
148
|
-
[111,779] => 337,
|
149
|
-
[82,769] => 340,
|
150
|
-
[114,769] => 341,
|
151
|
-
[82,807] => 342,
|
152
|
-
[114,807] => 343,
|
153
|
-
[82,780] => 344,
|
154
|
-
[114,780] => 345,
|
155
|
-
[83,769] => 346,
|
156
|
-
[115,769] => 347,
|
157
|
-
[83,770] => 348,
|
158
|
-
[115,770] => 349,
|
159
|
-
[83,807] => 350,
|
160
|
-
[115,807] => 351,
|
161
|
-
[83,780] => 352,
|
162
|
-
[115,780] => 353,
|
163
|
-
[84,807] => 354,
|
164
|
-
[116,807] => 355,
|
165
|
-
[84,780] => 356,
|
166
|
-
[116,780] => 357,
|
167
|
-
[85,771] => 360,
|
168
|
-
[117,771] => 361,
|
169
|
-
[85,772] => 362,
|
170
|
-
[117,772] => 363,
|
171
|
-
[85,774] => 364,
|
172
|
-
[117,774] => 365,
|
173
|
-
[85,778] => 366,
|
174
|
-
[117,778] => 367,
|
175
|
-
[85,779] => 368,
|
176
|
-
[117,779] => 369,
|
177
|
-
[85,808] => 370,
|
178
|
-
[117,808] => 371,
|
179
|
-
[87,770] => 372,
|
180
|
-
[119,770] => 373,
|
181
|
-
[89,770] => 374,
|
182
|
-
[121,770] => 375,
|
183
|
-
[89,776] => 376,
|
184
|
-
[90,769] => 377,
|
185
|
-
[122,769] => 378,
|
186
|
-
[90,775] => 379,
|
187
|
-
[122,775] => 380,
|
188
|
-
[90,780] => 381,
|
189
|
-
[122,780] => 382
|
190
|
-
}
|
191
|
-
end
|
192
|
-
end
|
193
|
-
end
|
data/lib/babosa/utf8/proxy.rb
DELETED
@@ -1,125 +0,0 @@
|
|
1
|
-
module Babosa
|
2
|
-
module UTF8
|
3
|
-
|
4
|
-
autoload :JavaProxy, "babosa/utf8/java_proxy"
|
5
|
-
autoload :UnicodeProxy, "babosa/utf8/unicode_proxy"
|
6
|
-
autoload :ActiveSupportProxy, "babosa/utf8/active_support_proxy"
|
7
|
-
autoload :DumbProxy, "babosa/utf8/dumb_proxy"
|
8
|
-
|
9
|
-
# A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
|
10
|
-
# The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
|
11
|
-
module Proxy
|
12
|
-
CP1252 = {
|
13
|
-
128 => [226, 130, 172],
|
14
|
-
129 => nil,
|
15
|
-
130 => [226, 128, 154],
|
16
|
-
131 => [198, 146],
|
17
|
-
132 => [226, 128, 158],
|
18
|
-
133 => [226, 128, 166],
|
19
|
-
134 => [226, 128, 160],
|
20
|
-
135 => [226, 128, 161],
|
21
|
-
136 => [203, 134],
|
22
|
-
137 => [226, 128, 176],
|
23
|
-
138 => [197, 160],
|
24
|
-
139 => [226, 128, 185],
|
25
|
-
140 => [197, 146],
|
26
|
-
141 => nil,
|
27
|
-
142 => [197, 189],
|
28
|
-
143 => nil,
|
29
|
-
144 => nil,
|
30
|
-
145 => [226, 128, 152],
|
31
|
-
146 => [226, 128, 153],
|
32
|
-
147 => [226, 128, 156],
|
33
|
-
148 => [226, 128, 157],
|
34
|
-
149 => [226, 128, 162],
|
35
|
-
150 => [226, 128, 147],
|
36
|
-
151 => [226, 128, 148],
|
37
|
-
152 => [203, 156],
|
38
|
-
153 => [226, 132, 162],
|
39
|
-
154 => [197, 161],
|
40
|
-
155 => [226, 128, 186],
|
41
|
-
156 => [197, 147],
|
42
|
-
157 => nil,
|
43
|
-
158 => [197, 190],
|
44
|
-
159 => [197, 184]
|
45
|
-
}
|
46
|
-
|
47
|
-
# This is a stub for a method that should return a Unicode-aware
|
48
|
-
# downcased version of the given string.
|
49
|
-
def downcase(string)
|
50
|
-
raise NotImplementedError
|
51
|
-
end
|
52
|
-
|
53
|
-
# This is a stub for a method that should return a Unicode-aware
|
54
|
-
# upcased version of the given string.
|
55
|
-
def upcase(string)
|
56
|
-
raise NotImplementedError
|
57
|
-
end
|
58
|
-
|
59
|
-
# This is a stub for a method that should return the Unicode NFC
|
60
|
-
# normalization of the given string.
|
61
|
-
def normalize_utf8(string)
|
62
|
-
raise NotImplementedError
|
63
|
-
end
|
64
|
-
|
65
|
-
if ''.respond_to?(:scrub) && !defined?(Rubinius)
|
66
|
-
# Attempt to replace invalid UTF-8 bytes with valid ones. This method
|
67
|
-
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
68
|
-
# CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
|
69
|
-
# always work.
|
70
|
-
def tidy_bytes(string)
|
71
|
-
string.scrub do |bad|
|
72
|
-
tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*')
|
73
|
-
end
|
74
|
-
end
|
75
|
-
else
|
76
|
-
def tidy_bytes(string)
|
77
|
-
bytes = string.unpack("C*")
|
78
|
-
conts_expected = 0
|
79
|
-
last_lead = 0
|
80
|
-
|
81
|
-
bytes.each_index do |i|
|
82
|
-
byte = bytes[i]
|
83
|
-
is_cont = byte > 127 && byte < 192
|
84
|
-
is_lead = byte > 191 && byte < 245
|
85
|
-
is_unused = byte > 240
|
86
|
-
is_restricted = byte > 244
|
87
|
-
|
88
|
-
# Impossible or highly unlikely byte? Clean it.
|
89
|
-
if is_unused || is_restricted
|
90
|
-
bytes[i] = tidy_byte(byte)
|
91
|
-
elsif is_cont
|
92
|
-
# Not expecting contination byte? Clean up. Otherwise, now expect one less.
|
93
|
-
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
|
94
|
-
else
|
95
|
-
if conts_expected > 0
|
96
|
-
# Expected continuation, but got ASCII or leading? Clean backwards up to
|
97
|
-
# the leading byte.
|
98
|
-
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
99
|
-
conts_expected = 0
|
100
|
-
end
|
101
|
-
if is_lead
|
102
|
-
# Final byte is leading? Clean it.
|
103
|
-
if i == bytes.length - 1
|
104
|
-
bytes[i] = tidy_byte(bytes.last)
|
105
|
-
else
|
106
|
-
# Valid leading byte? Expect continuations determined by position of
|
107
|
-
# first zero bit, with max of 3.
|
108
|
-
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
|
109
|
-
last_lead = i
|
110
|
-
end
|
111
|
-
end
|
112
|
-
end
|
113
|
-
end
|
114
|
-
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
115
|
-
end
|
116
|
-
end
|
117
|
-
|
118
|
-
private
|
119
|
-
|
120
|
-
def tidy_byte(byte)
|
121
|
-
byte < 160 ? CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
125
|
-
end
|
data/lib/babosa/utf8/unicode_proxy.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'unicode'
|
2
|
-
|
3
|
-
module Babosa
|
4
|
-
module UTF8
|
5
|
-
# A UTF-8 proxy using the Unicode gem.
|
6
|
-
# @see http://github.com/blackwinter/unicode
|
7
|
-
module UnicodeProxy
|
8
|
-
extend Proxy
|
9
|
-
extend self
|
10
|
-
def downcase(string)
|
11
|
-
Unicode.downcase(string)
|
12
|
-
end
|
13
|
-
|
14
|
-
def upcase(string)
|
15
|
-
Unicode.upcase(string)
|
16
|
-
end
|
17
|
-
|
18
|
-
def normalize_utf8(string)
|
19
|
-
Unicode.normalize_C(string)
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
data/spec/utf8_proxy_spec.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require File.expand_path("../spec_helper", __FILE__)
|
3
|
-
|
4
|
-
PROXIES = [Babosa::UTF8::DumbProxy, Babosa::UTF8::ActiveSupportProxy, Babosa::UTF8::UnicodeProxy]
|
5
|
-
PROXIES << Babosa::UTF8::JavaProxy if Babosa.jruby15?
|
6
|
-
|
7
|
-
PROXIES.each do |proxy|
|
8
|
-
|
9
|
-
describe proxy do
|
10
|
-
|
11
|
-
around do |example|
|
12
|
-
begin
|
13
|
-
old_proxy = Babosa::Identifier.utf8_proxy
|
14
|
-
Babosa::Identifier.utf8_proxy = proxy
|
15
|
-
example.run
|
16
|
-
ensure
|
17
|
-
Babosa::Identifier.utf8_proxy = old_proxy
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
describe "#normalize_utf8" do
|
22
|
-
it "should normalize to canonical composed" do
|
23
|
-
# ÅÉÎØÜ
|
24
|
-
uncomposed_bytes = [65, 204, 138, 69, 204, 129, 73, 204, 130, 195, 152, 85, 204, 136]
|
25
|
-
composed_bytes = [195, 133, 195, 137, 195, 142, 195, 152, 195, 156]
|
26
|
-
uncomposed_string = uncomposed_bytes.pack("C*").unpack("U*").pack("U*")
|
27
|
-
expect(proxy.normalize_utf8(uncomposed_string).unpack("C*")).to eql(composed_bytes)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
describe "#upcase" do
|
32
|
-
it "should upcase the string" do
|
33
|
-
expect(proxy.upcase("åéîøü")).to eql("ÅÉÎØÜ")
|
34
|
-
expect("åéîøü".to_identifier.upcase).to eql("ÅÉÎØÜ")
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
describe "#downcase" do
|
39
|
-
it "should downcase the string" do
|
40
|
-
expect(proxy.downcase("ÅÉÎØÜ")).to eql("åéîøü")
|
41
|
-
expect("ÅÉÎØÜ".to_identifier.downcase).to eql("åéîøü")
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
describe 'tidy_bytes' do
|
46
|
-
it 'should fix invalid UTF-8 strings' do
|
47
|
-
expect(proxy.tidy_bytes("\x93abc")).to eq('“abc')
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
end
|