babosa 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/README.md +59 -28
- data/Rakefile +14 -8
- data/lib/babosa.rb +11 -1
- data/lib/babosa/identifier.rb +26 -16
- data/lib/babosa/transliterator/base.rb +89 -0
- data/lib/babosa/transliterator/bulgarian.rb +27 -0
- data/lib/babosa/transliterator/cyrillic.rb +111 -0
- data/lib/babosa/transliterator/danish.rb +15 -0
- data/lib/babosa/transliterator/german.rb +15 -0
- data/lib/babosa/transliterator/latin.rb +199 -0
- data/lib/babosa/transliterator/russian.rb +22 -0
- data/lib/babosa/transliterator/serbian.rb +34 -0
- data/lib/babosa/transliterator/spanish.rb +9 -0
- data/lib/babosa/transliterator/ukranian.rb +11 -0
- data/lib/babosa/utf8/dumb_proxy.rb +1 -0
- data/lib/babosa/version.rb +1 -1
- data/spec/babosa_spec.rb +131 -0
- data/spec/spec_helper.rb +33 -0
- data/spec/transliterators/base_spec.rb +16 -0
- data/spec/transliterators/bulgarian_spec.rb +20 -0
- data/spec/transliterators/danish_spec.rb +17 -0
- data/spec/transliterators/german_spec.rb +17 -0
- data/spec/transliterators/russian_spec.rb +9 -0
- data/spec/transliterators/serbian_spec.rb +25 -0
- data/spec/transliterators/spanish_spec.rb +13 -0
- data/spec/transliterators/ukranian_spec.rb +9 -0
- data/spec/utf8_proxy_spec.rb +48 -0
- metadata +63 -19
- data/lib/babosa/characters.rb +0 -80
- data/test/babosa_test.rb +0 -198
@@ -0,0 +1,199 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Babosa
  module Transliterator
    # Transliteration table for Latin-script letters with diacritics.
    #
    # APPROXIMATIONS maps characters from the Latin-1 Supplement and
    # Latin Extended-A ranges to a plain ASCII replacement string. The
    # table is grouped as: Latin-1 letters, then Extended-A capitals, then
    # Extended-A small letters.
    #
    # Every key is meant to be exactly ONE character. The ligatures
    # U+0132, U+0133 and U+0149 are easily turned into their multi-character
    # look-alikes ("IJ", "ij", "'n") by Unicode compatibility normalization,
    # so they are annotated below to make any such corruption obvious.
    # NOTE(review): how Base consumes this constant is not visible from this
    # file - confirm against base.rb.
    class Latin < Base

      APPROXIMATIONS = {
        # Latin-1 Supplement
        "À" => "A",
        "Á" => "A",
        "Â" => "A",
        "Ã" => "A",
        "Ä" => "A",
        "Å" => "A",
        "Æ" => "Ae",
        "Ç" => "C",
        "È" => "E",
        "É" => "E",
        "Ê" => "E",
        "Ë" => "E",
        "Ì" => "I",
        "Í" => "I",
        "Î" => "I",
        "Ï" => "I",
        "Ð" => "D",
        "Ñ" => "N",
        "Ò" => "O",
        "Ó" => "O",
        "Ô" => "O",
        "Õ" => "O",
        "Ö" => "O",
        "Ø" => "O",
        "Ù" => "U",
        "Ú" => "U",
        "Û" => "U",
        "Ü" => "U",
        "Ý" => "Y",
        "Þ" => "Th",
        "ß" => "ss",
        "à" => "a",
        "á" => "a",
        "â" => "a",
        "ã" => "a",
        "ä" => "a",
        "å" => "a",
        "æ" => "ae",
        "ç" => "c",
        "è" => "e",
        "é" => "e",
        "ê" => "e",
        "ë" => "e",
        "ì" => "i",
        "í" => "i",
        "î" => "i",
        "ï" => "i",
        "ð" => "d",
        "ñ" => "n",
        "ò" => "o",
        "ó" => "o",
        "ô" => "o",
        "õ" => "o",
        "ö" => "o",
        "ø" => "o",
        "ù" => "u",
        "ú" => "u",
        "û" => "u",
        "ü" => "u",
        "ý" => "y",
        "þ" => "th",
        "ÿ" => "y",
        # Latin Extended-A, capital letters
        "Ā" => "A",
        "Ă" => "A",
        "Ą" => "A",
        "Ć" => "C",
        "Ĉ" => "C",
        "Ċ" => "C",
        "Č" => "C",
        "Ď" => "D",
        "Đ" => "D",
        "Ē" => "E",
        "Ĕ" => "E",
        "Ė" => "E",
        "Ę" => "E",
        "Ě" => "E",
        "Ĝ" => "G",
        "Ğ" => "G",
        "Ġ" => "G",
        "Ģ" => "G",
        "Ĥ" => "H",
        "Ħ" => "H",
        "Ĩ" => "I",
        "Ī" => "I",
        "Ĭ" => "I",
        "Į" => "I",
        "İ" => "I",
        "Ĳ" => "Ij", # U+0132 LATIN CAPITAL LIGATURE IJ - one codepoint, not ASCII "IJ"
        "Ĵ" => "J",
        "Ķ" => "K",
        "Ĺ" => "L",
        "Ļ" => "L",
        "Ľ" => "L",
        "Ŀ" => "L",
        "Ł" => "L",
        "Ń" => "N",
        "Ņ" => "N",
        "Ň" => "N",
        "Ŋ" => "Ng",
        "Ō" => "O",
        "Ŏ" => "O",
        "Ő" => "O",
        "Œ" => "OE",
        "Ŕ" => "R",
        "Ŗ" => "R",
        "Ř" => "R",
        "Ś" => "S",
        "Ŝ" => "S",
        "Ş" => "S",
        "Š" => "S",
        "Ţ" => "T",
        "Ť" => "T",
        "Ŧ" => "T",
        "Ũ" => "U",
        "Ū" => "U",
        "Ŭ" => "U",
        "Ů" => "U",
        "Ű" => "U",
        "Ų" => "U",
        "Ŵ" => "W",
        "Ŷ" => "Y",
        "Ÿ" => "Y",
        "Ź" => "Z",
        "Ż" => "Z",
        "Ž" => "Z",
        # Latin Extended-A, small letters
        "ā" => "a",
        "ă" => "a",
        "ą" => "a",
        "ć" => "c",
        "ĉ" => "c",
        "ċ" => "c",
        "č" => "c",
        "ď" => "d",
        "đ" => "d",
        "ē" => "e",
        "ĕ" => "e",
        "ė" => "e",
        "ę" => "e",
        "ě" => "e",
        "ĝ" => "g",
        "ğ" => "g",
        "ġ" => "g",
        "ģ" => "g",
        "ĥ" => "h",
        "ħ" => "h",
        "ĩ" => "i",
        "ī" => "i",
        "ĭ" => "i",
        "į" => "i",
        "ı" => "i",
        "ĳ" => "ij", # U+0133 LATIN SMALL LIGATURE IJ - one codepoint, not ASCII "ij"
        "ĵ" => "j",
        "ķ" => "k",
        "ĸ" => "k",
        "ĺ" => "l",
        "ļ" => "l",
        "ľ" => "l",
        "ŀ" => "l",
        "ł" => "l",
        "ń" => "n",
        "ņ" => "n",
        "ň" => "n",
        "ŉ" => "n", # U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE - one codepoint
        "ŋ" => "ng",
        "ō" => "o",
        "ŏ" => "o",
        "ő" => "o",
        "œ" => "oe",
        "ŕ" => "r",
        "ŗ" => "r",
        "ř" => "r",
        "ś" => "s",
        "ŝ" => "s",
        "ş" => "s",
        "š" => "s",
        "ţ" => "t",
        "ť" => "t",
        "ŧ" => "t",
        "ũ" => "u",
        "ū" => "u",
        "ŭ" => "u",
        "ů" => "u",
        "ű" => "u",
        "ų" => "u",
        "ŵ" => "w",
        "ŷ" => "y",
        "ž" => "z",
        "ź" => "z",
        "ż" => "z"
      }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Babosa
  module Transliterator
    # Russian transliteration table.
    #
    # Declares only the entries listed here; the class inherits from Cyrillic.
    # NOTE(review): how these entries combine with Cyrillic's own table is
    # decided by code outside this file - confirm in base.rb / cyrillic.rb.
    class Russian < Cyrillic
      APPROXIMATIONS = {
        # Capital letters
        "Й" => "I",
        "М" => "M",
        "Х" => "H",
        "Ц" => "Ts",
        "Ш" => "Sh",
        "Щ" => "Sch",
        "Ю" => "U",
        "Я" => "Ya",

        # Small letters
        # NOTE(review): "м", "ш" and "я" have no entry here although their
        # capitals do - presumably the inherited Cyrillic values are the
        # intended ones; verify against cyrillic.rb.
        "й" => "i",
        "х" => "h",
        "ц" => "ts",
        "щ" => "sch",
        "ю" => "u"
      }
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Babosa
  module Transliterator
    # Serbian transliteration table, covering both scripts used for Serbian.
    #
    # The class inherits from Latin, and its APPROXIMATIONS constant starts
    # from a copy of Cyrillic's table (Hash#merge returns a new hash, so the
    # Cyrillic constant itself is not modified) and then overrides or adds
    # the Serbian-specific Latin and Cyrillic letters listed below.
    class Serbian < Latin
      APPROXIMATIONS = Cyrillic::APPROXIMATIONS.merge({
        # Latin script
        "Ð" => "Dj", # U+00D0 (eth): a look-alike of U+0110, kept so existing input still maps
        "Đ" => "Dj", # U+0110 D WITH STROKE: the real capital of "đ" (U+0111) below
        "Č" => "Ch",
        "Š" => "Sh",
        "č" => "ch",
        "đ" => "dj",
        "š" => "sh",
        # Cyrillic script, capital letters
        "Ћ" => "C",
        "Ц" => "C",
        "Ч" => "Ch",
        "Ђ" => "Dj",
        "Џ" => "Dz",
        "Х" => "H",
        "Ј" => "J",
        "Љ" => "Lj",
        "Њ" => "Nj",
        # Cyrillic script, small letters
        "ц" => "c",
        "ћ" => "c",
        "ч" => "ch",
        "ђ" => "dj",
        "џ" => "dz",
        "х" => "h",
        "ј" => "j",
        "љ" => "lj",
        "њ" => "nj"
      })
    end
  end
end
|
data/lib/babosa/version.rb
CHANGED
data/spec/babosa_spec.rb
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require File.expand_path("../spec_helper", __FILE__)
|
3
|
+
|
4
|
+
# Specs for Babosa::Identifier. Every example builds its subject by calling
# #to_slug on a plain String literal.
describe Babosa::Identifier do

  it "should respond_to :empty?" do
    "".to_slug.should respond_to(:empty?)
  end

  # "\x93" is a lone continuation byte, so the input is not valid UTF-8.
  # Each listed method only has to survive it without raising.
  %w[approximate_ascii clean downcase word_chars normalize to_ascii upcase with_dashes].each do |method|
    describe "##{method}" do
      it "should work with invalid UTF-8 strings" do
        expect {"\x93abc".to_slug.send method}.not_to raise_exception
      end
    end
  end

  describe "#word_chars" do
    it "word_chars! should leave only letters and spaces" do
      string = "a*$%^$@!@b$%^&*()*!c"
      # NOTE(review): this pattern is unanchored and uses "*", so it matches
      # any string at all. Consider anchoring it (\A...\z) once the exact
      # output of word_chars for this input has been confirmed.
      string.to_slug.word_chars.should match(/[a-z ]*/i)
    end
  end

  describe "#transliterate" do
    it "should transliterate to ascii" do
      # Walk the Latin-1 letters and Latin Extended-A one codepoint at a time.
      (0xC0..0x17E).each do |codepoint|
        ss = [codepoint].pack("U*").to_slug
        # NOTE(review): this only requires at least one ASCII character in
        # the result, not that the whole result is ASCII.
        ss.approximate_ascii.should match(/[\x0-\x7f]/)
      end
    end

    it "should transliterate uncomposed utf8" do
      string = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS
      string.to_slug.approximate_ascii.should eql("u")
    end
  end

  describe "#downcase" do
    it "should lowercase strings" do
      "FELIZ AÑO".to_slug.downcase.should eql("feliz año")
    end
  end

  describe "#upcase" do
    it "should uppercase strings" do
      "feliz año".to_slug.upcase.should eql("FELIZ AÑO")
    end
  end

  describe "#normalize" do
    it "should replace whitespace with dashes" do
      "a b".to_slug.clean.normalize.should eql("a-b")
    end

    it "should replace multiple spaces with 1 dash" do
      # Two spaces on purpose: a single space would merely repeat the
      # example above.
      "a  b".to_slug.clean.normalize.should eql("a-b")
    end

    it "should replace multiple dashes with 1 dash" do
      # NOTE(review): the input holds a single dash between spaces; the
      # original may have had runs of spaces/dashes that were collapsed -
      # confirm against the upstream spec.
      "male - female".to_slug.normalize.should eql("male-female")
    end

    it "should strip trailing space" do
      "ab ".to_slug.normalize.should eql("ab")
    end

    it "should strip leading space" do
      " ab".to_slug.normalize.should eql("ab")
    end

    it "should strip trailing dashes" do
      "ab-".to_slug.normalize.should eql("ab")
    end

    it "should strip leading dashes" do
      "-ab".to_slug.normalize.should eql("ab")
    end

    it "should not modify valid name strings" do
      "a-b-c-d".to_slug.normalize.should eql("a-b-c-d")
    end

    it "should work with non roman chars" do
      "検 索".to_slug.normalize.should eql("検-索")
    end

    context "with to_ascii option" do
      it "should approximate and strip non ascii" do
        ss = "カタカナ: katakana is über cool".to_slug
        ss.normalize(:to_ascii => true).should eql("katakana-is-uber-cool")
      end
    end
  end

  describe "#truncate_bytes" do
    it "should truncate by byte length" do
      # "ü" is two bytes in UTF-8, so a limit that falls inside a character
      # is expected to drop that whole character.
      "üa".to_slug.truncate_bytes(2).should eql("ü")
      "üa".to_slug.truncate_bytes(1).should eql("")
      "üa".to_slug.truncate_bytes(100).should eql("üa")
      "üéøá".to_slug.truncate_bytes(3).should eql("ü")
    end
  end

  describe "#truncate" do
    it "should truncate by char length" do
      "üa".to_slug.truncate(2).should eql("üa")
      "üa".to_slug.truncate(1).should eql("ü")
      "üa".to_slug.truncate(100).should eql("üa")
    end
  end

  describe "#with_dashes" do
    it "should not change byte size when replacing spaces" do
      "".to_slug.with_dashes.bytesize.should eql(0)
      " ".to_slug.with_dashes.bytesize.should eql(1)
      "-abc-".to_slug.with_dashes.bytesize.should eql(5)
      " abc ".to_slug.with_dashes.bytesize.should eql(5)
      # Seven bytes in, seven bytes out: note the two spaces between "a"
      # and "bc".
      " a  bc ".to_slug.with_dashes.bytesize.should eql(7)
    end
  end

  describe "#to_ruby_method" do
    it "should get a string suitable for use as a ruby method" do
      "¿¿¿hello... world???".to_slug.to_ruby_method.should eql("hello_world?")
      "カタカナ: katakana is über cool".to_slug.to_ruby_method.should eql("katakana_is_uber_cool")
      "カタカナ: katakana is über cool!".to_slug.to_ruby_method.should eql("katakana_is_uber_cool!")
      "カタカナ: katakana is über cool".to_slug.to_ruby_method(false).should eql("katakana_is_uber_cool")
    end
  end
end
|