babosa 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/README.md +59 -28
- data/Rakefile +14 -8
- data/lib/babosa.rb +11 -1
- data/lib/babosa/identifier.rb +26 -16
- data/lib/babosa/transliterator/base.rb +89 -0
- data/lib/babosa/transliterator/bulgarian.rb +27 -0
- data/lib/babosa/transliterator/cyrillic.rb +111 -0
- data/lib/babosa/transliterator/danish.rb +15 -0
- data/lib/babosa/transliterator/german.rb +15 -0
- data/lib/babosa/transliterator/latin.rb +199 -0
- data/lib/babosa/transliterator/russian.rb +22 -0
- data/lib/babosa/transliterator/serbian.rb +34 -0
- data/lib/babosa/transliterator/spanish.rb +9 -0
- data/lib/babosa/transliterator/ukranian.rb +11 -0
- data/lib/babosa/utf8/dumb_proxy.rb +1 -0
- data/lib/babosa/version.rb +1 -1
- data/spec/babosa_spec.rb +131 -0
- data/spec/spec_helper.rb +33 -0
- data/spec/transliterators/base_spec.rb +16 -0
- data/spec/transliterators/bulgarian_spec.rb +20 -0
- data/spec/transliterators/danish_spec.rb +17 -0
- data/spec/transliterators/german_spec.rb +17 -0
- data/spec/transliterators/russian_spec.rb +9 -0
- data/spec/transliterators/serbian_spec.rb +25 -0
- data/spec/transliterators/spanish_spec.rb +13 -0
- data/spec/transliterators/ukranian_spec.rb +9 -0
- data/spec/utf8_proxy_spec.rb +48 -0
- metadata +63 -19
- data/lib/babosa/characters.rb +0 -80
- data/test/babosa_test.rb +0 -198
@@ -0,0 +1,199 @@
|
|
1
|
+
# encoding: utf-8
module Babosa
  module Transliterator
    # Transliterator for Latin-script text.
    #
    # Contributes an APPROXIMATIONS table that pairs accented and extended
    # Latin letters (Latin-1 Supplement and Latin Extended-A) with plain ASCII
    # replacement strings. How the table is consumed is decided by Base, which
    # is defined elsewhere.
    class Latin < Base

      # Character => ASCII replacement, listed in code point order: upper and
      # lower case Latin-1 letters first, then upper and lower case Latin
      # Extended-A letters.
      #
      # Every key must be exactly ONE code point. The keys for U+0132
      # (LATIN CAPITAL LIGATURE IJ), U+0133 (LATIN SMALL LIGATURE IJ) and
      # U+0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) had been expanded
      # into the multi-character strings "IJ", "ij" and "ʼn"; they are
      # restored to the single ligature characters here.
      # NOTE(review): presumably Base looks keys up per character, in which
      # case a multi-character key either never matches or shadows the plain
      # ASCII letter it starts with - confirm against Base.
      #
      # NOTE(review): "Œ" => "OE" is all upper case while the other digraphs
      # ("Ae", "Ij", "Ng", "Th") are title case; left unchanged because
      # callers may depend on the current output.
      APPROXIMATIONS = {
        "À" => "A",
        "Á" => "A",
        "Â" => "A",
        "Ã" => "A",
        "Ä" => "A",
        "Å" => "A",
        "Æ" => "Ae",
        "Ç" => "C",
        "È" => "E",
        "É" => "E",
        "Ê" => "E",
        "Ë" => "E",
        "Ì" => "I",
        "Í" => "I",
        "Î" => "I",
        "Ï" => "I",
        "Ð" => "D",
        "Ñ" => "N",
        "Ò" => "O",
        "Ó" => "O",
        "Ô" => "O",
        "Õ" => "O",
        "Ö" => "O",
        "Ø" => "O",
        "Ù" => "U",
        "Ú" => "U",
        "Û" => "U",
        "Ü" => "U",
        "Ý" => "Y",
        "Þ" => "Th",
        "ß" => "ss",
        "à" => "a",
        "á" => "a",
        "â" => "a",
        "ã" => "a",
        "ä" => "a",
        "å" => "a",
        "æ" => "ae",
        "ç" => "c",
        "è" => "e",
        "é" => "e",
        "ê" => "e",
        "ë" => "e",
        "ì" => "i",
        "í" => "i",
        "î" => "i",
        "ï" => "i",
        "ð" => "d",
        "ñ" => "n",
        "ò" => "o",
        "ó" => "o",
        "ô" => "o",
        "õ" => "o",
        "ö" => "o",
        "ø" => "o",
        "ù" => "u",
        "ú" => "u",
        "û" => "u",
        "ü" => "u",
        "ý" => "y",
        "þ" => "th",
        "ÿ" => "y",
        "Ā" => "A",
        "Ă" => "A",
        "Ą" => "A",
        "Ć" => "C",
        "Ĉ" => "C",
        "Ċ" => "C",
        "Č" => "C",
        "Ď" => "D",
        "Đ" => "D",
        "Ē" => "E",
        "Ĕ" => "E",
        "Ė" => "E",
        "Ę" => "E",
        "Ě" => "E",
        "Ĝ" => "G",
        "Ğ" => "G",
        "Ġ" => "G",
        "Ģ" => "G",
        "Ĥ" => "H",
        "Ħ" => "H",
        "Ĩ" => "I",
        "Ī" => "I",
        "Ĭ" => "I",
        "Į" => "I",
        "İ" => "I",
        "Ĳ" => "Ij", # U+0132, a single ligature character
        "Ĵ" => "J",
        "Ķ" => "K",
        "Ĺ" => "L",
        "Ļ" => "L",
        "Ľ" => "L",
        "Ŀ" => "L",
        "Ł" => "L",
        "Ń" => "N",
        "Ņ" => "N",
        "Ň" => "N",
        "Ŋ" => "Ng",
        "Ō" => "O",
        "Ŏ" => "O",
        "Ő" => "O",
        "Œ" => "OE",
        "Ŕ" => "R",
        "Ŗ" => "R",
        "Ř" => "R",
        "Ś" => "S",
        "Ŝ" => "S",
        "Ş" => "S",
        "Š" => "S",
        "Ţ" => "T",
        "Ť" => "T",
        "Ŧ" => "T",
        "Ũ" => "U",
        "Ū" => "U",
        "Ŭ" => "U",
        "Ů" => "U",
        "Ű" => "U",
        "Ų" => "U",
        "Ŵ" => "W",
        "Ŷ" => "Y",
        "Ÿ" => "Y",
        "Ź" => "Z",
        "Ż" => "Z",
        "Ž" => "Z",
        "ā" => "a",
        "ă" => "a",
        "ą" => "a",
        "ć" => "c",
        "ĉ" => "c",
        "ċ" => "c",
        "č" => "c",
        "ď" => "d",
        "đ" => "d",
        "ē" => "e",
        "ĕ" => "e",
        "ė" => "e",
        "ę" => "e",
        "ě" => "e",
        "ĝ" => "g",
        "ğ" => "g",
        "ġ" => "g",
        "ģ" => "g",
        "ĥ" => "h",
        "ħ" => "h",
        "ĩ" => "i",
        "ī" => "i",
        "ĭ" => "i",
        "į" => "i",
        "ı" => "i",
        "ĳ" => "ij", # U+0133, a single ligature character
        "ĵ" => "j",
        "ķ" => "k",
        "ĸ" => "k",
        "ĺ" => "l",
        "ļ" => "l",
        "ľ" => "l",
        "ŀ" => "l",
        "ł" => "l",
        "ń" => "n",
        "ņ" => "n",
        "ň" => "n",
        "ŉ" => "n", # U+0149, a single character
        "ŋ" => "ng",
        "ō" => "o",
        "ŏ" => "o",
        "ő" => "o",
        "œ" => "oe",
        "ŕ" => "r",
        "ŗ" => "r",
        "ř" => "r",
        "ś" => "s",
        "ŝ" => "s",
        "ş" => "s",
        "š" => "s",
        "ţ" => "t",
        "ť" => "t",
        "ŧ" => "t",
        "ũ" => "u",
        "ū" => "u",
        "ŭ" => "u",
        "ů" => "u",
        "ű" => "u",
        "ų" => "u",
        "ŵ" => "w",
        "ŷ" => "y",
        "ž" => "z",
        "ź" => "z",
        "ż" => "z"
      }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: utf-8
module Babosa
  module Transliterator
    # Transliterator for Russian text.
    #
    # Subclasses Cyrillic (defined elsewhere) and supplies Russian-specific
    # replacements for a handful of letters.
    class Russian < Cyrillic
      # Russian-specific character => ASCII replacement pairs.
      #
      # Each letter is listed in both cases so that upper and lower case input
      # romanize the same way. The lower-case "м", "ш" and "я" entries were
      # missing even though their upper-case forms are listed here.
      # NOTE(review): if Cyrillic already maps these three lower-case letters
      # to the same values, the added entries are redundant but harmless -
      # confirm against Cyrillic::APPROXIMATIONS.
      APPROXIMATIONS = {
        "Й" => "I",
        "М" => "M",
        "Х" => "H",
        "Ц" => "Ts",
        "Ш" => "Sh",
        "Щ" => "Sch",
        "Ю" => "U",
        "Я" => "Ya",
        "й" => "i",
        "м" => "m",
        "х" => "h",
        "ц" => "ts",
        "ш" => "sh",
        "щ" => "sch",
        "ю" => "u",
        "я" => "ya"
      }
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: utf-8

module Babosa
  module Transliterator
    # Transliterator for Serbian text.
    #
    # Subclasses Latin, and builds its table by starting from the Cyrillic
    # table and overriding or adding Serbian-specific entries for both Latin
    # and Cyrillic letters.
    class Serbian < Latin
      # Cyrillic's APPROXIMATIONS merged with the Serbian overrides below;
      # entries given here win over same-key entries from Cyrillic.
      APPROXIMATIONS = Cyrillic.const_get(:APPROXIMATIONS).merge({
        # U+00D0 (Eth). Kept for backward compatibility; it is visually close
        # to, but a different code point from, the letter on the next line.
        "Ð" => "Dj",
        # U+0110 (D with stroke), the upper-case partner of the "đ" entry
        # below. It was missing, so only the lower-case letter was overridden.
        "Đ" => "Dj",
        "Č" => "Ch",
        "Š" => "Sh",
        "č" => "ch",
        "đ" => "dj",
        "š" => "sh",
        "Ћ" => "C",
        "Ц" => "C",
        "Ч" => "Ch",
        "Ђ" => "Dj",
        "Џ" => "Dz",
        "Х" => "H",
        "Ј" => "J",
        "Љ" => "Lj",
        "Њ" => "Nj",
        "ц" => "c",
        "ћ" => "c",
        "ч" => "ch",
        "ђ" => "dj",
        "џ" => "dz",
        "х" => "h",
        "ј" => "j",
        "љ" => "lj",
        "њ" => "nj"
      })
    end
  end
end
|
data/lib/babosa/version.rb
CHANGED
data/spec/babosa_spec.rb
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
# encoding: utf-8
require File.expand_path("../spec_helper", __FILE__)

# Specs for Babosa::Identifier, exercised through String#to_slug.
describe Babosa::Identifier do

  it "should respond_to :empty?" do
    "".to_slug.should respond_to(:empty?)
  end

  # Each of these methods must tolerate a string containing an invalid UTF-8
  # byte sequence ("\x93" is a lone continuation byte).
  %w[approximate_ascii clean downcase word_chars normalize to_ascii upcase with_dashes].each do |method|
    describe "##{method}" do
      it "should work with invalid UTF-8 strings" do
        expect { "\x93abc".to_slug.send(method) }.not_to raise_exception
      end
    end
  end

  describe "#word_chars" do
    it "word_chars! should leave only letters and spaces" do
      string = "a*$%^$@!@b$%^&*()*!c"
      # Anchored on both ends: an unanchored /[a-z ]*/ matches the empty
      # string and therefore passes for any input at all.
      string.to_slug.word_chars.should match(/\A[a-z ]*\z/i)
    end
  end

  describe "#transliterate" do
    it "should transliterate to ascii" do
      # U+00C0..U+017E: the accented Latin-1 letters through the end of
      # Latin Extended-A (minus U+017F).
      (0xC0..0x17E).each do |codepoint|
        ss = [codepoint].pack("U*").to_slug
        # Anchored so the WHOLE result must be ASCII, not merely contain
        # one ASCII character somewhere.
        ss.approximate_ascii.should match(/\A[\x00-\x7f]+\z/)
      end
    end

    it "should transliterate uncomposed utf8" do
      string = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS
      string.to_slug.approximate_ascii.should eql("u")
    end
  end

  describe "#downcase" do
    it "should lowercase strings" do
      "FELIZ AÑO".to_slug.downcase.should eql("feliz año")
    end
  end

  describe "#upcase" do
    it "should uppercase strings" do
      "feliz año".to_slug.upcase.should eql("FELIZ AÑO")
    end
  end

  describe "#normalize" do
    it "should replace whitespace with dashes" do
      "a b".to_slug.clean.normalize.should eql("a-b")
    end

    it "should replace multiple spaces with 1 dash" do
      # Two spaces on purpose; a single space would only repeat the
      # previous example.
      "a  b".to_slug.clean.normalize.should eql("a-b")
    end

    it "should replace multiple dashes with 1 dash" do
      "male - female".to_slug.normalize.should eql("male-female")
    end

    it "should strip trailing space" do
      "ab ".to_slug.normalize.should eql("ab")
    end

    it "should strip leading space" do
      " ab".to_slug.normalize.should eql("ab")
    end

    it "should strip trailing dashes" do
      "ab-".to_slug.normalize.should eql("ab")
    end

    it "should strip leading dashes" do
      "-ab".to_slug.normalize.should eql("ab")
    end

    it "should not modify valid name strings" do
      "a-b-c-d".to_slug.normalize.should eql("a-b-c-d")
    end

    it "should work with non roman chars" do
      "検 索".to_slug.normalize.should eql("検-索")
    end

    context "with to_ascii option" do
      it "should approximate and strip non ascii" do
        ss = "カタカナ: katakana is über cool".to_slug
        ss.normalize(:to_ascii => true).should eql("katakana-is-uber-cool")
      end
    end
  end

  describe "#truncate_bytes" do
    it "should truncate by byte length" do
      # "ü" is two bytes in UTF-8, so a limit that falls inside a character
      # drops that character entirely.
      "üa".to_slug.truncate_bytes(2).should eql("ü")
      "üa".to_slug.truncate_bytes(1).should eql("")
      "üa".to_slug.truncate_bytes(100).should eql("üa")
      "üéøá".to_slug.truncate_bytes(3).should eql("ü")
    end
  end

  describe "#truncate" do
    it "should truncate by char length" do
      "üa".to_slug.truncate(2).should eql("üa")
      "üa".to_slug.truncate(1).should eql("ü")
      "üa".to_slug.truncate(100).should eql("üa")
    end
  end

  describe "#with_dashes" do
    it "should not change byte size when replacing spaces" do
      "".to_slug.with_dashes.bytesize.should eql(0)
      " ".to_slug.with_dashes.bytesize.should eql(1)
      "-abc-".to_slug.with_dashes.bytesize.should eql(5)
      " abc ".to_slug.with_dashes.bytesize.should eql(5)
      # Seven bytes: two spaces between "a" and "bc".
      " a  bc ".to_slug.with_dashes.bytesize.should eql(7)
    end
  end

  describe "#to_ruby_method" do
    it "should get a string suitable for use as a ruby method" do
      "¿¿¿hello... world???".to_slug.to_ruby_method.should eql("hello_world?")
      "カタカナ: katakana is über cool".to_slug.to_ruby_method.should eql("katakana_is_uber_cool")
      "カタカナ: katakana is über cool!".to_slug.to_ruby_method.should eql("katakana_is_uber_cool!")
      "カタカナ: katakana is über cool".to_slug.to_ruby_method(false).should eql("katakana_is_uber_cool")
    end
  end
end
|