babosa 1.0.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/Changelog.md +12 -0
  5. data/README.md +81 -119
  6. data/Rakefile +9 -8
  7. data/lib/babosa.rb +2 -4
  8. data/lib/babosa/identifier.rb +104 -129
  9. data/lib/babosa/transliterator/base.rb +57 -56
  10. data/lib/babosa/transliterator/bulgarian.rb +3 -2
  11. data/lib/babosa/transliterator/cyrillic.rb +5 -5
  12. data/lib/babosa/transliterator/danish.rb +3 -3
  13. data/lib/babosa/transliterator/german.rb +3 -2
  14. data/lib/babosa/transliterator/greek.rb +4 -3
  15. data/lib/babosa/transliterator/hindi.rb +3 -2
  16. data/lib/babosa/transliterator/latin.rb +5 -5
  17. data/lib/babosa/transliterator/macedonian.rb +3 -2
  18. data/lib/babosa/transliterator/norwegian.rb +3 -3
  19. data/lib/babosa/transliterator/romanian.rb +3 -2
  20. data/lib/babosa/transliterator/russian.rb +3 -2
  21. data/lib/babosa/transliterator/serbian.rb +29 -27
  22. data/lib/babosa/transliterator/spanish.rb +2 -2
  23. data/lib/babosa/transliterator/swedish.rb +3 -3
  24. data/lib/babosa/transliterator/turkish.rb +8 -8
  25. data/lib/babosa/transliterator/ukrainian.rb +5 -4
  26. data/lib/babosa/transliterator/vietnamese.rb +4 -3
  27. data/lib/babosa/version.rb +3 -1
  28. data/spec/{babosa_spec.rb → identifier_spec.rb} +13 -14
  29. data/spec/spec_helper.rb +6 -6
  30. data/spec/transliterators/base_spec.rb +5 -6
  31. data/spec/transliterators/bulgarian_spec.rb +4 -5
  32. data/spec/transliterators/danish_spec.rb +5 -6
  33. data/spec/transliterators/german_spec.rb +4 -5
  34. data/spec/transliterators/greek_spec.rb +7 -7
  35. data/spec/transliterators/hindi_spec.rb +7 -7
  36. data/spec/transliterators/latin_spec.rb +3 -4
  37. data/spec/transliterators/macedonian_spec.rb +3 -4
  38. data/spec/transliterators/norwegian_spec.rb +4 -4
  39. data/spec/transliterators/polish_spec.rb +3 -5
  40. data/spec/transliterators/romanian_spec.rb +5 -6
  41. data/spec/transliterators/russian_spec.rb +3 -4
  42. data/spec/transliterators/serbian_spec.rb +6 -7
  43. data/spec/transliterators/spanish_spec.rb +4 -5
  44. data/spec/transliterators/swedish_spec.rb +7 -7
  45. data/spec/transliterators/turkish_spec.rb +24 -24
  46. data/spec/transliterators/ukrainian_spec.rb +74 -75
  47. data/spec/transliterators/vietnamese_spec.rb +10 -10
  48. metadata +44 -38
  49. metadata.gz.sig +2 -0
  50. data/lib/babosa/utf8/active_support_proxy.rb +0 -38
  51. data/lib/babosa/utf8/dumb_proxy.rb +0 -49
  52. data/lib/babosa/utf8/java_proxy.rb +0 -22
  53. data/lib/babosa/utf8/mappings.rb +0 -193
  54. data/lib/babosa/utf8/proxy.rb +0 -125
  55. data/lib/babosa/utf8/unicode_proxy.rb +0 -23
  56. data/spec/utf8_proxy_spec.rb +0 -52
@@ -1,49 +0,0 @@
1
- require File.expand_path("../mappings", __FILE__)
2
-
3
- module Babosa
4
- module UTF8
5
-
6
- # This module provides fallback UTF-8 support when nothing else is
7
- # available. It does case folding for Roman alphabet-based characters
8
- # commonly used by Western European languages and little else, making it
9
- # useless for Russian, Bulgarian, Greek, etc. If at all possible, Unicode
10
- # or ActiveSupport should be used instead because they support the full
11
- # UTF-8 character range.
12
- module DumbProxy
13
- extend Proxy
14
- extend self
15
-
16
- def downcase(string)
17
- string.downcase.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
18
- end
19
-
20
- def upcase(string)
21
- string.upcase.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
22
- end
23
-
24
- if ''.respond_to?(:unicode_normalize)
25
- def normalize_utf8(string)
26
- string.unicode_normalize
27
- end
28
- else
29
- # On Ruby 2.2+, this uses the native Unicode normalize method. On all
30
- # other Rubies, it does a very naive Unicode normalization, which should
31
- # work for this library's purposes (i.e., Roman-based codepoints, up to
32
- # U+017E). Do not reuse this as a general solution! Use a real
33
- # library like Unicode or ActiveSupport instead.
34
- def normalize_utf8(string)
35
- codepoints = string.unpack("U*")
36
- new = []
37
- until codepoints.empty? do
38
- if Mappings::COMPOSITION[codepoints[0..1]]
39
- new << Mappings::COMPOSITION[codepoints.slice!(0,2)]
40
- else
41
- new << codepoints.shift
42
- end
43
- end
44
- new.compact.flatten.pack("U*")
45
- end
46
- end
47
- end
48
- end
49
- end
@@ -1,22 +0,0 @@
1
- module Babosa
2
- module UTF8
3
- # A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+.
4
- module JavaProxy
5
- extend Proxy
6
- extend self
7
- java_import java.text.Normalizer
8
-
9
- def downcase(string)
10
- string.to_java.to_lower_case.to_s
11
- end
12
-
13
- def upcase(string)
14
- string.to_java.to_upper_case.to_s
15
- end
16
-
17
- def normalize_utf8(string)
18
- Normalizer.normalize(string, Normalizer::Form::NFC).to_s
19
- end
20
- end
21
- end
22
- end
@@ -1,193 +0,0 @@
1
- module Babosa
2
- module UTF8
3
-
4
- # A small subset of the mappings provided by Unicode.org, limited to Latin
5
- # characters. This is used for Babosa's default "dumb" UTF-8 support.
6
- module Mappings
7
- DOWNCASE = Hash[65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102,
8
- 71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110,
9
- 79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118,
10
- 87, 119, 88, 120, 89, 121, 90, 122, 181, 956, 192, 224, 193, 225, 194,
11
- 226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201,
12
- 233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208,
13
- 240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 216,
14
- 248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223,
15
- [115, 115], 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267,
16
- 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
17
- 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295,
18
- 296, 297, 298, 299, 300, 301, 302, 303, 304, [105, 775], 306, 307, 308,
19
- 309, 310, 311, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
20
- 324, 325, 326, 327, 328, 329, [700, 110], 330, 331, 332, 333, 334, 335,
21
- 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
22
- 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
23
- 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 255,
24
- 377, 378, 379, 380, 381, 382]
25
-
26
- UPCASE = DOWNCASE.invert
27
-
28
- COMPOSITION = {
29
- [65,768] => 192,
30
- [65,769] => 193,
31
- [65,770] => 194,
32
- [65,771] => 195,
33
- [65,776] => 196,
34
- [65,778] => 197,
35
- [67,807] => 199,
36
- [69,768] => 200,
37
- [69,769] => 201,
38
- [69,770] => 202,
39
- [69,776] => 203,
40
- [73,768] => 204,
41
- [73,769] => 205,
42
- [73,770] => 206,
43
- [73,776] => 207,
44
- [78,771] => 209,
45
- [79,768] => 210,
46
- [79,769] => 211,
47
- [79,770] => 212,
48
- [79,771] => 213,
49
- [79,776] => 214,
50
- [85,768] => 217,
51
- [85,769] => 218,
52
- [85,770] => 219,
53
- [85,776] => 220,
54
- [89,769] => 221,
55
- [97,768] => 224,
56
- [97,769] => 225,
57
- [97,770] => 226,
58
- [97,771] => 227,
59
- [97,776] => 228,
60
- [97,778] => 229,
61
- [99,807] => 231,
62
- [101,768] => 232,
63
- [101,769] => 233,
64
- [101,770] => 234,
65
- [101,776] => 235,
66
- [105,768] => 236,
67
- [105,769] => 237,
68
- [105,770] => 238,
69
- [105,776] => 239,
70
- [110,771] => 241,
71
- [111,768] => 242,
72
- [111,769] => 243,
73
- [111,770] => 244,
74
- [111,771] => 245,
75
- [111,776] => 246,
76
- [117,768] => 249,
77
- [117,769] => 250,
78
- [117,770] => 251,
79
- [117,776] => 252,
80
- [121,769] => 253,
81
- [121,776] => 255,
82
- [65,772] => 256,
83
- [97,772] => 257,
84
- [65,774] => 258,
85
- [97,774] => 259,
86
- [65,808] => 260,
87
- [97,808] => 261,
88
- [67,769] => 262,
89
- [99,769] => 263,
90
- [67,770] => 264,
91
- [99,770] => 265,
92
- [67,775] => 266,
93
- [99,775] => 267,
94
- [67,780] => 268,
95
- [99,780] => 269,
96
- [68,780] => 270,
97
- [100,780] => 271,
98
- [69,772] => 274,
99
- [101,772] => 275,
100
- [69,774] => 276,
101
- [101,774] => 277,
102
- [69,775] => 278,
103
- [101,775] => 279,
104
- [69,808] => 280,
105
- [101,808] => 281,
106
- [69,780] => 282,
107
- [101,780] => 283,
108
- [71,770] => 284,
109
- [103,770] => 285,
110
- [71,774] => 286,
111
- [103,774] => 287,
112
- [71,775] => 288,
113
- [103,775] => 289,
114
- [71,807] => 290,
115
- [103,807] => 291,
116
- [72,770] => 292,
117
- [104,770] => 293,
118
- [73,771] => 296,
119
- [105,771] => 297,
120
- [73,772] => 298,
121
- [105,772] => 299,
122
- [73,774] => 300,
123
- [105,774] => 301,
124
- [73,808] => 302,
125
- [105,808] => 303,
126
- [73,775] => 304,
127
- [74,770] => 308,
128
- [106,770] => 309,
129
- [75,807] => 310,
130
- [107,807] => 311,
131
- [76,769] => 313,
132
- [108,769] => 314,
133
- [76,807] => 315,
134
- [108,807] => 316,
135
- [76,780] => 317,
136
- [108,780] => 318,
137
- [78,769] => 323,
138
- [110,769] => 324,
139
- [78,807] => 325,
140
- [110,807] => 326,
141
- [78,780] => 327,
142
- [110,780] => 328,
143
- [79,772] => 332,
144
- [111,772] => 333,
145
- [79,774] => 334,
146
- [111,774] => 335,
147
- [79,779] => 336,
148
- [111,779] => 337,
149
- [82,769] => 340,
150
- [114,769] => 341,
151
- [82,807] => 342,
152
- [114,807] => 343,
153
- [82,780] => 344,
154
- [114,780] => 345,
155
- [83,769] => 346,
156
- [115,769] => 347,
157
- [83,770] => 348,
158
- [115,770] => 349,
159
- [83,807] => 350,
160
- [115,807] => 351,
161
- [83,780] => 352,
162
- [115,780] => 353,
163
- [84,807] => 354,
164
- [116,807] => 355,
165
- [84,780] => 356,
166
- [116,780] => 357,
167
- [85,771] => 360,
168
- [117,771] => 361,
169
- [85,772] => 362,
170
- [117,772] => 363,
171
- [85,774] => 364,
172
- [117,774] => 365,
173
- [85,778] => 366,
174
- [117,778] => 367,
175
- [85,779] => 368,
176
- [117,779] => 369,
177
- [85,808] => 370,
178
- [117,808] => 371,
179
- [87,770] => 372,
180
- [119,770] => 373,
181
- [89,770] => 374,
182
- [121,770] => 375,
183
- [89,776] => 376,
184
- [90,769] => 377,
185
- [122,769] => 378,
186
- [90,775] => 379,
187
- [122,775] => 380,
188
- [90,780] => 381,
189
- [122,780] => 382
190
- }
191
- end
192
- end
193
- end
@@ -1,125 +0,0 @@
1
- module Babosa
2
- module UTF8
3
-
4
- autoload :JavaProxy, "babosa/utf8/java_proxy"
5
- autoload :UnicodeProxy, "babosa/utf8/unicode_proxy"
6
- autoload :ActiveSupportProxy, "babosa/utf8/active_support_proxy"
7
- autoload :DumbProxy, "babosa/utf8/dumb_proxy"
8
-
9
- # A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
10
- # The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
11
- module Proxy
12
- CP1252 = {
13
- 128 => [226, 130, 172],
14
- 129 => nil,
15
- 130 => [226, 128, 154],
16
- 131 => [198, 146],
17
- 132 => [226, 128, 158],
18
- 133 => [226, 128, 166],
19
- 134 => [226, 128, 160],
20
- 135 => [226, 128, 161],
21
- 136 => [203, 134],
22
- 137 => [226, 128, 176],
23
- 138 => [197, 160],
24
- 139 => [226, 128, 185],
25
- 140 => [197, 146],
26
- 141 => nil,
27
- 142 => [197, 189],
28
- 143 => nil,
29
- 144 => nil,
30
- 145 => [226, 128, 152],
31
- 146 => [226, 128, 153],
32
- 147 => [226, 128, 156],
33
- 148 => [226, 128, 157],
34
- 149 => [226, 128, 162],
35
- 150 => [226, 128, 147],
36
- 151 => [226, 128, 148],
37
- 152 => [203, 156],
38
- 153 => [226, 132, 162],
39
- 154 => [197, 161],
40
- 155 => [226, 128, 186],
41
- 156 => [197, 147],
42
- 157 => nil,
43
- 158 => [197, 190],
44
- 159 => [197, 184]
45
- }
46
-
47
- # This is a stub for a method that should return a Unicode-aware
48
- # downcased version of the given string.
49
- def downcase(string)
50
- raise NotImplementedError
51
- end
52
-
53
- # This is a stub for a method that should return a Unicode-aware
54
- # upcased version of the given string.
55
- def upcase(string)
56
- raise NotImplementedError
57
- end
58
-
59
- # This is a stub for a method that should return the Unicode NFC
60
- # normalization of the given string.
61
- def normalize_utf8(string)
62
- raise NotImplementedError
63
- end
64
-
65
- if ''.respond_to?(:scrub) && !defined?(Rubinius)
66
- # Attempt to replace invalid UTF-8 bytes with valid ones. This method
67
- # naively assumes that if you have invalid UTF-8 bytes, they are either Windows
68
- # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
69
- # always work.
70
- def tidy_bytes(string)
71
- string.scrub do |bad|
72
- tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*')
73
- end
74
- end
75
- else
76
- def tidy_bytes(string)
77
- bytes = string.unpack("C*")
78
- conts_expected = 0
79
- last_lead = 0
80
-
81
- bytes.each_index do |i|
82
- byte = bytes[i]
83
- is_cont = byte > 127 && byte < 192
84
- is_lead = byte > 191 && byte < 245
85
- is_unused = byte > 240
86
- is_restricted = byte > 244
87
-
88
- # Impossible or highly unlikely byte? Clean it.
89
- if is_unused || is_restricted
90
- bytes[i] = tidy_byte(byte)
91
- elsif is_cont
92
- # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
93
- conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
94
- else
95
- if conts_expected > 0
96
- # Expected continuation, but got ASCII or leading? Clean backwards up to
97
- # the leading byte.
98
- (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
99
- conts_expected = 0
100
- end
101
- if is_lead
102
- # Final byte is leading? Clean it.
103
- if i == bytes.length - 1
104
- bytes[i] = tidy_byte(bytes.last)
105
- else
106
- # Valid leading byte? Expect continuations determined by position of
107
- # first zero bit, with max of 3.
108
- conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
109
- last_lead = i
110
- end
111
- end
112
- end
113
- end
114
- bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
115
- end
116
- end
117
-
118
- private
119
-
120
- def tidy_byte(byte)
121
- byte < 160 ? CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
122
- end
123
- end
124
- end
125
- end
@@ -1,23 +0,0 @@
1
- require 'unicode'
2
-
3
- module Babosa
4
- module UTF8
5
- # A UTF-8 proxy using the Unicode gem.
6
- # @see http://github.com/blackwinter/unicode
7
- module UnicodeProxy
8
- extend Proxy
9
- extend self
10
- def downcase(string)
11
- Unicode.downcase(string)
12
- end
13
-
14
- def upcase(string)
15
- Unicode.upcase(string)
16
- end
17
-
18
- def normalize_utf8(string)
19
- Unicode.normalize_C(string)
20
- end
21
- end
22
- end
23
- end
@@ -1,52 +0,0 @@
1
- # encoding: utf-8
2
- require File.expand_path("../spec_helper", __FILE__)
3
-
4
- PROXIES = [Babosa::UTF8::DumbProxy, Babosa::UTF8::ActiveSupportProxy, Babosa::UTF8::UnicodeProxy]
5
- PROXIES << Babosa::UTF8::JavaProxy if Babosa.jruby15?
6
-
7
- PROXIES.each do |proxy|
8
-
9
- describe proxy do
10
-
11
- around do |example|
12
- begin
13
- old_proxy = Babosa::Identifier.utf8_proxy
14
- Babosa::Identifier.utf8_proxy = proxy
15
- example.run
16
- ensure
17
- Babosa::Identifier.utf8_proxy = old_proxy
18
- end
19
- end
20
-
21
- describe "#normalize_utf8" do
22
- it "should normalize to canonical composed" do
23
- # ÅÉÎØÜ
24
- uncomposed_bytes = [65, 204, 138, 69, 204, 129, 73, 204, 130, 195, 152, 85, 204, 136]
25
- composed_bytes = [195, 133, 195, 137, 195, 142, 195, 152, 195, 156]
26
- uncomposed_string = uncomposed_bytes.pack("C*").unpack("U*").pack("U*")
27
- expect(proxy.normalize_utf8(uncomposed_string).unpack("C*")).to eql(composed_bytes)
28
- end
29
- end
30
-
31
- describe "#upcase" do
32
- it "should upcase the string" do
33
- expect(proxy.upcase("åéîøü")).to eql("ÅÉÎØÜ")
34
- expect("åéîøü".to_identifier.upcase).to eql("ÅÉÎØÜ")
35
- end
36
- end
37
-
38
- describe "#downcase" do
39
- it "should downcase the string" do
40
- expect(proxy.downcase("ÅÉÎØÜ")).to eql("åéîøü")
41
- expect("ÅÉÎØÜ".to_identifier.downcase).to eql("åéîøü")
42
- end
43
- end
44
-
45
- describe 'tidy_bytes' do
46
- it 'should fix invalid UTF-8 strings' do
47
- expect(proxy.tidy_bytes("\x93abc")).to eq('“abc')
48
- end
49
- end
50
-
51
- end
52
- end