babosa 1.0.4 → 2.0.0.beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +4 -4
  2. data/Changelog.md +12 -0
  3. data/README.md +80 -117
  4. data/Rakefile +9 -8
  5. data/lib/babosa.rb +2 -4
  6. data/lib/babosa/identifier.rb +82 -121
  7. data/lib/babosa/transliterator/base.rb +57 -56
  8. data/lib/babosa/transliterator/bulgarian.rb +3 -2
  9. data/lib/babosa/transliterator/cyrillic.rb +5 -5
  10. data/lib/babosa/transliterator/danish.rb +3 -3
  11. data/lib/babosa/transliterator/german.rb +3 -2
  12. data/lib/babosa/transliterator/greek.rb +4 -3
  13. data/lib/babosa/transliterator/hindi.rb +3 -2
  14. data/lib/babosa/transliterator/latin.rb +5 -5
  15. data/lib/babosa/transliterator/macedonian.rb +3 -2
  16. data/lib/babosa/transliterator/norwegian.rb +3 -3
  17. data/lib/babosa/transliterator/romanian.rb +3 -2
  18. data/lib/babosa/transliterator/russian.rb +3 -2
  19. data/lib/babosa/transliterator/serbian.rb +29 -27
  20. data/lib/babosa/transliterator/spanish.rb +2 -2
  21. data/lib/babosa/transliterator/swedish.rb +3 -3
  22. data/lib/babosa/transliterator/turkish.rb +8 -8
  23. data/lib/babosa/transliterator/ukrainian.rb +5 -4
  24. data/lib/babosa/transliterator/vietnamese.rb +4 -3
  25. data/lib/babosa/version.rb +3 -1
  26. data/spec/{babosa_spec.rb → identifier_spec.rb} +9 -10
  27. data/spec/spec_helper.rb +6 -6
  28. data/spec/transliterators/base_spec.rb +5 -6
  29. data/spec/transliterators/bulgarian_spec.rb +4 -5
  30. data/spec/transliterators/danish_spec.rb +5 -6
  31. data/spec/transliterators/german_spec.rb +4 -5
  32. data/spec/transliterators/greek_spec.rb +7 -7
  33. data/spec/transliterators/hindi_spec.rb +7 -7
  34. data/spec/transliterators/latin_spec.rb +3 -4
  35. data/spec/transliterators/macedonian_spec.rb +3 -4
  36. data/spec/transliterators/norwegian_spec.rb +4 -4
  37. data/spec/transliterators/polish_spec.rb +3 -5
  38. data/spec/transliterators/romanian_spec.rb +5 -6
  39. data/spec/transliterators/russian_spec.rb +3 -4
  40. data/spec/transliterators/serbian_spec.rb +6 -7
  41. data/spec/transliterators/spanish_spec.rb +4 -5
  42. data/spec/transliterators/swedish_spec.rb +7 -7
  43. data/spec/transliterators/turkish_spec.rb +24 -24
  44. data/spec/transliterators/ukrainian_spec.rb +74 -75
  45. data/spec/transliterators/vietnamese_spec.rb +10 -10
  46. metadata +17 -38
  47. data/lib/babosa/utf8/active_support_proxy.rb +0 -38
  48. data/lib/babosa/utf8/dumb_proxy.rb +0 -49
  49. data/lib/babosa/utf8/java_proxy.rb +0 -22
  50. data/lib/babosa/utf8/mappings.rb +0 -193
  51. data/lib/babosa/utf8/proxy.rb +0 -125
  52. data/lib/babosa/utf8/unicode_proxy.rb +0 -23
  53. data/spec/utf8_proxy_spec.rb +0 -52
@@ -1,22 +0,0 @@
1
- module Babosa
2
- module UTF8
3
- # A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+.
4
- module JavaProxy
5
- extend Proxy
6
- extend self
7
- java_import java.text.Normalizer
8
-
9
- def downcase(string)
10
- string.to_java.to_lower_case.to_s
11
- end
12
-
13
- def upcase(string)
14
- string.to_java.to_upper_case.to_s
15
- end
16
-
17
- def normalize_utf8(string)
18
- Normalizer.normalize(string, Normalizer::Form::NFC).to_s
19
- end
20
- end
21
- end
22
- end
@@ -1,193 +0,0 @@
1
- module Babosa
2
- module UTF8
3
-
4
- # A small subset of the mappings provided by Unicode.org, limited to Latin
5
- # characters. This is used for Babosa's default "dumb" UTF-8 support.
6
- module Mappings
7
- DOWNCASE = Hash[65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102,
8
- 71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110,
9
- 79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118,
10
- 87, 119, 88, 120, 89, 121, 90, 122, 181, 956, 192, 224, 193, 225, 194,
11
- 226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201,
12
- 233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208,
13
- 240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 216,
14
- 248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223,
15
- [115, 115], 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267,
16
- 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
17
- 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295,
18
- 296, 297, 298, 299, 300, 301, 302, 303, 304, [105, 775], 306, 307, 308,
19
- 309, 310, 311, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
20
- 324, 325, 326, 327, 328, 329, [700, 110], 330, 331, 332, 333, 334, 335,
21
- 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
22
- 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
23
- 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 255,
24
- 377, 378, 379, 380, 381, 382]
25
-
26
- UPCASE = DOWNCASE.invert
27
-
28
- COMPOSITION = {
29
- [65,768] => 192,
30
- [65,769] => 193,
31
- [65,770] => 194,
32
- [65,771] => 195,
33
- [65,776] => 196,
34
- [65,778] => 197,
35
- [67,807] => 199,
36
- [69,768] => 200,
37
- [69,769] => 201,
38
- [69,770] => 202,
39
- [69,776] => 203,
40
- [73,768] => 204,
41
- [73,769] => 205,
42
- [73,770] => 206,
43
- [73,776] => 207,
44
- [78,771] => 209,
45
- [79,768] => 210,
46
- [79,769] => 211,
47
- [79,770] => 212,
48
- [79,771] => 213,
49
- [79,776] => 214,
50
- [85,768] => 217,
51
- [85,769] => 218,
52
- [85,770] => 219,
53
- [85,776] => 220,
54
- [89,769] => 221,
55
- [97,768] => 224,
56
- [97,769] => 225,
57
- [97,770] => 226,
58
- [97,771] => 227,
59
- [97,776] => 228,
60
- [97,778] => 229,
61
- [99,807] => 231,
62
- [101,768] => 232,
63
- [101,769] => 233,
64
- [101,770] => 234,
65
- [101,776] => 235,
66
- [105,768] => 236,
67
- [105,769] => 237,
68
- [105,770] => 238,
69
- [105,776] => 239,
70
- [110,771] => 241,
71
- [111,768] => 242,
72
- [111,769] => 243,
73
- [111,770] => 244,
74
- [111,771] => 245,
75
- [111,776] => 246,
76
- [117,768] => 249,
77
- [117,769] => 250,
78
- [117,770] => 251,
79
- [117,776] => 252,
80
- [121,769] => 253,
81
- [121,776] => 255,
82
- [65,772] => 256,
83
- [97,772] => 257,
84
- [65,774] => 258,
85
- [97,774] => 259,
86
- [65,808] => 260,
87
- [97,808] => 261,
88
- [67,769] => 262,
89
- [99,769] => 263,
90
- [67,770] => 264,
91
- [99,770] => 265,
92
- [67,775] => 266,
93
- [99,775] => 267,
94
- [67,780] => 268,
95
- [99,780] => 269,
96
- [68,780] => 270,
97
- [100,780] => 271,
98
- [69,772] => 274,
99
- [101,772] => 275,
100
- [69,774] => 276,
101
- [101,774] => 277,
102
- [69,775] => 278,
103
- [101,775] => 279,
104
- [69,808] => 280,
105
- [101,808] => 281,
106
- [69,780] => 282,
107
- [101,780] => 283,
108
- [71,770] => 284,
109
- [103,770] => 285,
110
- [71,774] => 286,
111
- [103,774] => 287,
112
- [71,775] => 288,
113
- [103,775] => 289,
114
- [71,807] => 290,
115
- [103,807] => 291,
116
- [72,770] => 292,
117
- [104,770] => 293,
118
- [73,771] => 296,
119
- [105,771] => 297,
120
- [73,772] => 298,
121
- [105,772] => 299,
122
- [73,774] => 300,
123
- [105,774] => 301,
124
- [73,808] => 302,
125
- [105,808] => 303,
126
- [73,775] => 304,
127
- [74,770] => 308,
128
- [106,770] => 309,
129
- [75,807] => 310,
130
- [107,807] => 311,
131
- [76,769] => 313,
132
- [108,769] => 314,
133
- [76,807] => 315,
134
- [108,807] => 316,
135
- [76,780] => 317,
136
- [108,780] => 318,
137
- [78,769] => 323,
138
- [110,769] => 324,
139
- [78,807] => 325,
140
- [110,807] => 326,
141
- [78,780] => 327,
142
- [110,780] => 328,
143
- [79,772] => 332,
144
- [111,772] => 333,
145
- [79,774] => 334,
146
- [111,774] => 335,
147
- [79,779] => 336,
148
- [111,779] => 337,
149
- [82,769] => 340,
150
- [114,769] => 341,
151
- [82,807] => 342,
152
- [114,807] => 343,
153
- [82,780] => 344,
154
- [114,780] => 345,
155
- [83,769] => 346,
156
- [115,769] => 347,
157
- [83,770] => 348,
158
- [115,770] => 349,
159
- [83,807] => 350,
160
- [115,807] => 351,
161
- [83,780] => 352,
162
- [115,780] => 353,
163
- [84,807] => 354,
164
- [116,807] => 355,
165
- [84,780] => 356,
166
- [116,780] => 357,
167
- [85,771] => 360,
168
- [117,771] => 361,
169
- [85,772] => 362,
170
- [117,772] => 363,
171
- [85,774] => 364,
172
- [117,774] => 365,
173
- [85,778] => 366,
174
- [117,778] => 367,
175
- [85,779] => 368,
176
- [117,779] => 369,
177
- [85,808] => 370,
178
- [117,808] => 371,
179
- [87,770] => 372,
180
- [119,770] => 373,
181
- [89,770] => 374,
182
- [121,770] => 375,
183
- [89,776] => 376,
184
- [90,769] => 377,
185
- [122,769] => 378,
186
- [90,775] => 379,
187
- [122,775] => 380,
188
- [90,780] => 381,
189
- [122,780] => 382
190
- }
191
- end
192
- end
193
- end
@@ -1,125 +0,0 @@
1
- module Babosa
2
- module UTF8
3
-
4
- autoload :JavaProxy, "babosa/utf8/java_proxy"
5
- autoload :UnicodeProxy, "babosa/utf8/unicode_proxy"
6
- autoload :ActiveSupportProxy, "babosa/utf8/active_support_proxy"
7
- autoload :DumbProxy, "babosa/utf8/dumb_proxy"
8
-
9
- # A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
10
- # The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
11
- module Proxy
12
- CP1252 = {
13
- 128 => [226, 130, 172],
14
- 129 => nil,
15
- 130 => [226, 128, 154],
16
- 131 => [198, 146],
17
- 132 => [226, 128, 158],
18
- 133 => [226, 128, 166],
19
- 134 => [226, 128, 160],
20
- 135 => [226, 128, 161],
21
- 136 => [203, 134],
22
- 137 => [226, 128, 176],
23
- 138 => [197, 160],
24
- 139 => [226, 128, 185],
25
- 140 => [197, 146],
26
- 141 => nil,
27
- 142 => [197, 189],
28
- 143 => nil,
29
- 144 => nil,
30
- 145 => [226, 128, 152],
31
- 146 => [226, 128, 153],
32
- 147 => [226, 128, 156],
33
- 148 => [226, 128, 157],
34
- 149 => [226, 128, 162],
35
- 150 => [226, 128, 147],
36
- 151 => [226, 128, 148],
37
- 152 => [203, 156],
38
- 153 => [226, 132, 162],
39
- 154 => [197, 161],
40
- 155 => [226, 128, 186],
41
- 156 => [197, 147],
42
- 157 => nil,
43
- 158 => [197, 190],
44
- 159 => [197, 184]
45
- }
46
-
47
- # This is a stub for a method that should return a Unicode-aware
48
- # downcased version of the given string.
49
- def downcase(string)
50
- raise NotImplementedError
51
- end
52
-
53
- # This is a stub for a method that should return a Unicode-aware
54
- # upcased version of the given string.
55
- def upcase(string)
56
- raise NotImplementedError
57
- end
58
-
59
- # This is a stub for a method that should return the Unicode NFC
60
- # normalization of the given string.
61
- def normalize_utf8(string)
62
- raise NotImplementedError
63
- end
64
-
65
- if ''.respond_to?(:scrub) && !defined?(Rubinius)
66
- # Attempt to replace invalid UTF-8 bytes with valid ones. This method
67
- # naively assumes if you have invalid UTF8 bytes, they are either Windows
68
- # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
69
- # always work.
70
- def tidy_bytes(string)
71
- string.scrub do |bad|
72
- tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*')
73
- end
74
- end
75
- else
76
- def tidy_bytes(string)
77
- bytes = string.unpack("C*")
78
- conts_expected = 0
79
- last_lead = 0
80
-
81
- bytes.each_index do |i|
82
- byte = bytes[i]
83
- is_cont = byte > 127 && byte < 192
84
- is_lead = byte > 191 && byte < 245
85
- is_unused = byte > 240
86
- is_restricted = byte > 244
87
-
88
- # Impossible or highly unlikely byte? Clean it.
89
- if is_unused || is_restricted
90
- bytes[i] = tidy_byte(byte)
91
- elsif is_cont
92
- # Not expecting contination byte? Clean up. Otherwise, now expect one less.
93
- conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
94
- else
95
- if conts_expected > 0
96
- # Expected continuation, but got ASCII or leading? Clean backwards up to
97
- # the leading byte.
98
- (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
99
- conts_expected = 0
100
- end
101
- if is_lead
102
- # Final byte is leading? Clean it.
103
- if i == bytes.length - 1
104
- bytes[i] = tidy_byte(bytes.last)
105
- else
106
- # Valid leading byte? Expect continuations determined by position of
107
- # first zero bit, with max of 3.
108
- conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
109
- last_lead = i
110
- end
111
- end
112
- end
113
- end
114
- bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
115
- end
116
- end
117
-
118
- private
119
-
120
- def tidy_byte(byte)
121
- byte < 160 ? CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
122
- end
123
- end
124
- end
125
- end
@@ -1,23 +0,0 @@
1
- require 'unicode'
2
-
3
- module Babosa
4
- module UTF8
5
- # A UTF-8 proxy using the Unicode gem.
6
- # @see http://github.com/blackwinter/unicode
7
- module UnicodeProxy
8
- extend Proxy
9
- extend self
10
- def downcase(string)
11
- Unicode.downcase(string)
12
- end
13
-
14
- def upcase(string)
15
- Unicode.upcase(string)
16
- end
17
-
18
- def normalize_utf8(string)
19
- Unicode.normalize_C(string)
20
- end
21
- end
22
- end
23
- end
@@ -1,52 +0,0 @@
1
- # encoding: utf-8
2
- require File.expand_path("../spec_helper", __FILE__)
3
-
4
- PROXIES = [Babosa::UTF8::DumbProxy, Babosa::UTF8::ActiveSupportProxy, Babosa::UTF8::UnicodeProxy]
5
- PROXIES << Babosa::UTF8::JavaProxy if Babosa.jruby15?
6
-
7
- PROXIES.each do |proxy|
8
-
9
- describe proxy do
10
-
11
- around do |example|
12
- begin
13
- old_proxy = Babosa::Identifier.utf8_proxy
14
- Babosa::Identifier.utf8_proxy = proxy
15
- example.run
16
- ensure
17
- Babosa::Identifier.utf8_proxy = old_proxy
18
- end
19
- end
20
-
21
- describe "#normalize_utf8" do
22
- it "should normalize to canonical composed" do
23
- # ÅÉÎØÜ
24
- uncomposed_bytes = [65, 204, 138, 69, 204, 129, 73, 204, 130, 195, 152, 85, 204, 136]
25
- composed_bytes = [195, 133, 195, 137, 195, 142, 195, 152, 195, 156]
26
- uncomposed_string = uncomposed_bytes.pack("C*").unpack("U*").pack("U*")
27
- expect(proxy.normalize_utf8(uncomposed_string).unpack("C*")).to eql(composed_bytes)
28
- end
29
- end
30
-
31
- describe "#upcase" do
32
- it "should upcase the string" do
33
- expect(proxy.upcase("åéîøü")).to eql("ÅÉÎØÜ")
34
- expect("åéîøü".to_identifier.upcase).to eql("ÅÉÎØÜ")
35
- end
36
- end
37
-
38
- describe "#downcase" do
39
- it "should downcase the string" do
40
- expect(proxy.downcase("ÅÉÎØÜ")).to eql("åéîøü")
41
- expect("ÅÉÎØÜ".to_identifier.downcase).to eql("åéîøü")
42
- end
43
- end
44
-
45
- describe 'tidy_bytes' do
46
- it 'should fix invalid UTF-8 strings' do
47
- expect(proxy.tidy_bytes("\x93abc")).to eq('“abc')
48
- end
49
- end
50
-
51
- end
52
- end