kebab 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +7 -0
  2. data/.gemtest +0 -0
  3. data/Changelog.md +99 -0
  4. data/MIT-LICENSE +19 -0
  5. data/README.md +26 -0
  6. data/Rakefile +34 -0
  7. data/lib/kebab.rb +18 -0
  8. data/lib/kebab/identifier.rb +294 -0
  9. data/lib/kebab/transliterator/base.rb +110 -0
  10. data/lib/kebab/transliterator/bulgarian.rb +27 -0
  11. data/lib/kebab/transliterator/cyrillic.rb +108 -0
  12. data/lib/kebab/transliterator/danish.rb +15 -0
  13. data/lib/kebab/transliterator/german.rb +15 -0
  14. data/lib/kebab/transliterator/greek.rb +77 -0
  15. data/lib/kebab/transliterator/hindi.rb +137 -0
  16. data/lib/kebab/transliterator/latin.rb +199 -0
  17. data/lib/kebab/transliterator/macedonian.rb +29 -0
  18. data/lib/kebab/transliterator/norwegian.rb +14 -0
  19. data/lib/kebab/transliterator/romanian.rb +13 -0
  20. data/lib/kebab/transliterator/russian.rb +22 -0
  21. data/lib/kebab/transliterator/serbian.rb +34 -0
  22. data/lib/kebab/transliterator/spanish.rb +9 -0
  23. data/lib/kebab/transliterator/swedish.rb +16 -0
  24. data/lib/kebab/transliterator/turkish.rb +8 -0
  25. data/lib/kebab/transliterator/ukrainian.rb +30 -0
  26. data/lib/kebab/transliterator/vietnamese.rb +143 -0
  27. data/lib/kebab/utf8/active_support_proxy.rb +26 -0
  28. data/lib/kebab/utf8/dumb_proxy.rb +49 -0
  29. data/lib/kebab/utf8/java_proxy.rb +22 -0
  30. data/lib/kebab/utf8/mappings.rb +193 -0
  31. data/lib/kebab/utf8/proxy.rb +125 -0
  32. data/lib/kebab/utf8/unicode_proxy.rb +23 -0
  33. data/lib/kebab/version.rb +5 -0
  34. data/spec/kebab_spec.rb +155 -0
  35. data/spec/spec_helper.rb +45 -0
  36. data/spec/transliterators/base_spec.rb +16 -0
  37. data/spec/transliterators/bulgarian_spec.rb +20 -0
  38. data/spec/transliterators/danish_spec.rb +17 -0
  39. data/spec/transliterators/german_spec.rb +17 -0
  40. data/spec/transliterators/greek_spec.rb +17 -0
  41. data/spec/transliterators/hindi_spec.rb +17 -0
  42. data/spec/transliterators/latin_spec.rb +9 -0
  43. data/spec/transliterators/macedonian_spec.rb +9 -0
  44. data/spec/transliterators/norwegian_spec.rb +18 -0
  45. data/spec/transliterators/polish_spec.rb +14 -0
  46. data/spec/transliterators/romanian_spec.rb +19 -0
  47. data/spec/transliterators/russian_spec.rb +9 -0
  48. data/spec/transliterators/serbian_spec.rb +25 -0
  49. data/spec/transliterators/spanish_spec.rb +13 -0
  50. data/spec/transliterators/swedish_spec.rb +18 -0
  51. data/spec/transliterators/turkish_spec.rb +24 -0
  52. data/spec/transliterators/ukrainian_spec.rb +88 -0
  53. data/spec/transliterators/vietnamese_spec.rb +18 -0
  54. data/spec/utf8_proxy_spec.rb +53 -0
  55. metadata +167 -0
@@ -0,0 +1,26 @@
1
+ require 'active_support'
2
+ require 'active_support/multibyte/unicode'
3
+
4
module Kebab
  module UTF8
    # UTF-8 proxy backed by Active Support's multibyte Unicode helpers.
    #
    # The module extends +ActiveSupport::Multibyte::Unicode+ and itself, so
    # every method is callable directly on +ActiveSupportProxy+.
    module ActiveSupportProxy
      extend ActiveSupport::Multibyte::Unicode
      extend self

      class << self
        # Normalizes the given string by delegating to +normalize+ with the
        # +:c+ form (presumably NFC, as supplied by Active Support - confirm
        # against the Active Support version in use).
        #
        # @param string [String] the text to normalize
        # @return [String] the normalized text
        def normalize_utf8(string)
          normalize(string, :c)
        end
      end

      # Explicit case-mapping methods are only defined when running against
      # Active Support 3; on other major versions whatever the extended
      # Unicode module provides is used instead.
      if ActiveSupport::VERSION::MAJOR == 3
        # @param string [String] the text to downcase
        # @return [String] the downcased text
        def downcase(string)
          chars = ActiveSupport::Multibyte::Chars.new(string)
          chars.downcase.to_s
        end

        # @param string [String] the text to upcase
        # @return [String] the upcased text
        def upcase(string)
          chars = ActiveSupport::Multibyte::Chars.new(string)
          chars.upcase.to_s
        end
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ require File.expand_path("../mappings", __FILE__)
2
+
3
module Kebab
  module UTF8

    # This module provides fallback UTF-8 support when nothing else is
    # available. It does case folding for Roman alphabet-based characters
    # commonly used by Western European languages and little else, making it
    # useless for Russian, Bulgarian, Greek, etc. If at all possible, Unicode
    # or ActiveSupport should be used instead because they support the full
    # UTF-8 character range.
    module DumbProxy
      extend Proxy
      extend self

      # Downcases the string with String#downcase, then maps each codepoint
      # through Mappings::DOWNCASE. Codepoints without a mapping are kept
      # as-is; a mapping may expand to several codepoints.
      #
      # @param string [String] the text to downcase
      # @return [String] the downcased text
      def downcase(string)
        string.downcase.unpack("U*").flat_map { |codepoint| Mappings::DOWNCASE[codepoint] || codepoint }.pack("U*")
      end

      # Upcases the string with String#upcase, then maps each codepoint
      # through Mappings::UPCASE. Codepoints without a mapping are kept as-is.
      #
      # @param string [String] the text to upcase
      # @return [String] the upcased text
      def upcase(string)
        string.upcase.unpack("U*").flat_map { |codepoint| Mappings::UPCASE[codepoint] || codepoint }.pack("U*")
      end

      # Rubies whose String responds to #unicode_normalize use that native
      # method. On all other Rubies a very naive Unicode normalization is
      # done, which should work for this library's purposes (i.e., Roman-based
      # codepoints, up to U+017E). Do not reuse this as a general solution!
      # Use a real library like Unicode or ActiveSupport instead.
      if ''.respond_to?(:unicode_normalize)
        # @param string [String] the text to normalize
        # @return [String] the normalized text
        def normalize_utf8(string)
          string.unicode_normalize
        end
      else
        # Walks the codepoints left to right, replacing every adjacent pair
        # found in Mappings::COMPOSITION by its single composed codepoint.
        #
        # @param string [String] the text to normalize
        # @return [String] the text with known pairs composed
        def normalize_utf8(string)
          codepoints = string.unpack("U*")
          composed = []
          until codepoints.empty?
            composite = Mappings::COMPOSITION[codepoints[0..1]]
            if composite
              # Consume both members of the pair.
              codepoints.shift(2)
              composed << composite
            else
              composed << codepoints.shift
            end
          end
          composed.pack("U*")
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Kebab
  module UTF8
    # UTF-8 proxy built on Java's own Unicode support. Requires JRuby 1.5+.
    module JavaProxy
      extend Proxy
      extend self
      java_import java.text.Normalizer

      # @param string [String] the text to downcase
      # @return [String] the result of the Java string's lower-casing
      def downcase(string)
        java_string = string.to_java
        java_string.to_lower_case.to_s
      end

      # @param string [String] the text to upcase
      # @return [String] the result of the Java string's upper-casing
      def upcase(string)
        java_string = string.to_java
        java_string.to_upper_case.to_s
      end

      # Normalizes the string to form NFC using java.text.Normalizer.
      #
      # @param string [String] the text to normalize
      # @return [String] the NFC-normalized text
      def normalize_utf8(string)
        nfc = Normalizer::Form::NFC
        Normalizer.normalize(string, nfc).to_s
      end
    end
  end
end
@@ -0,0 +1,193 @@
1
module Kebab
  module UTF8

    # A small subset of the mappings provided by Unicode.org, limited to Latin
    # characters. This is used for Kebab's default "dumb" UTF-8 support.
    #
    # All tables (and the arrays nested inside them) are frozen: they are
    # shared lookup data and must never be mutated at runtime.
    module Mappings
      # Codepoint => downcased codepoint. The list is a flat sequence of
      # key/value pairs; a value is an Array when the mapping expands to
      # several codepoints.
      DOWNCASE = Hash[65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102,
        71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110,
        79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118,
        87, 119, 88, 120, 89, 121, 90, 122, 181, 956, 192, 224, 193, 225, 194,
        226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201,
        233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208,
        240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 216,
        248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223,
        [115, 115], 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267,
        268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
        282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295,
        296, 297, 298, 299, 300, 301, 302, 303, 304, [105, 775], 306, 307, 308,
        309, 310, 311, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
        324, 325, 326, 327, 328, 329, [700, 110], 330, 331, 332, 333, 334, 335,
        336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
        350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
        364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 255,
        377, 378, 379, 380, 381, 382].each_value(&:freeze).freeze

      # The inverse of DOWNCASE (downcased value => original codepoint).
      UPCASE = DOWNCASE.invert.freeze

      # [base codepoint, combining codepoint] => precomposed codepoint.
      COMPOSITION = {
        [65,768] => 192,
        [65,769] => 193,
        [65,770] => 194,
        [65,771] => 195,
        [65,776] => 196,
        [65,778] => 197,
        [67,807] => 199,
        [69,768] => 200,
        [69,769] => 201,
        [69,770] => 202,
        [69,776] => 203,
        [73,768] => 204,
        [73,769] => 205,
        [73,770] => 206,
        [73,776] => 207,
        [78,771] => 209,
        [79,768] => 210,
        [79,769] => 211,
        [79,770] => 212,
        [79,771] => 213,
        [79,776] => 214,
        [85,768] => 217,
        [85,769] => 218,
        [85,770] => 219,
        [85,776] => 220,
        [89,769] => 221,
        [97,768] => 224,
        [97,769] => 225,
        [97,770] => 226,
        [97,771] => 227,
        [97,776] => 228,
        [97,778] => 229,
        [99,807] => 231,
        [101,768] => 232,
        [101,769] => 233,
        [101,770] => 234,
        [101,776] => 235,
        [105,768] => 236,
        [105,769] => 237,
        [105,770] => 238,
        [105,776] => 239,
        [110,771] => 241,
        [111,768] => 242,
        [111,769] => 243,
        [111,770] => 244,
        [111,771] => 245,
        [111,776] => 246,
        [117,768] => 249,
        [117,769] => 250,
        [117,770] => 251,
        [117,776] => 252,
        [121,769] => 253,
        [121,776] => 255,
        [65,772] => 256,
        [97,772] => 257,
        [65,774] => 258,
        [97,774] => 259,
        [65,808] => 260,
        [97,808] => 261,
        [67,769] => 262,
        [99,769] => 263,
        [67,770] => 264,
        [99,770] => 265,
        [67,775] => 266,
        [99,775] => 267,
        [67,780] => 268,
        [99,780] => 269,
        [68,780] => 270,
        [100,780] => 271,
        [69,772] => 274,
        [101,772] => 275,
        [69,774] => 276,
        [101,774] => 277,
        [69,775] => 278,
        [101,775] => 279,
        [69,808] => 280,
        [101,808] => 281,
        [69,780] => 282,
        [101,780] => 283,
        [71,770] => 284,
        [103,770] => 285,
        [71,774] => 286,
        [103,774] => 287,
        [71,775] => 288,
        [103,775] => 289,
        [71,807] => 290,
        [103,807] => 291,
        [72,770] => 292,
        [104,770] => 293,
        [73,771] => 296,
        [105,771] => 297,
        [73,772] => 298,
        [105,772] => 299,
        [73,774] => 300,
        [105,774] => 301,
        [73,808] => 302,
        [105,808] => 303,
        [73,775] => 304,
        [74,770] => 308,
        [106,770] => 309,
        [75,807] => 310,
        [107,807] => 311,
        [76,769] => 313,
        [108,769] => 314,
        [76,807] => 315,
        [108,807] => 316,
        [76,780] => 317,
        [108,780] => 318,
        [78,769] => 323,
        [110,769] => 324,
        [78,807] => 325,
        [110,807] => 326,
        [78,780] => 327,
        [110,780] => 328,
        [79,772] => 332,
        [111,772] => 333,
        [79,774] => 334,
        [111,774] => 335,
        [79,779] => 336,
        [111,779] => 337,
        [82,769] => 340,
        [114,769] => 341,
        [82,807] => 342,
        [114,807] => 343,
        [82,780] => 344,
        [114,780] => 345,
        [83,769] => 346,
        [115,769] => 347,
        [83,770] => 348,
        [115,770] => 349,
        [83,807] => 350,
        [115,807] => 351,
        [83,780] => 352,
        [115,780] => 353,
        [84,807] => 354,
        [116,807] => 355,
        [84,780] => 356,
        [116,780] => 357,
        [85,771] => 360,
        [117,771] => 361,
        [85,772] => 362,
        [117,772] => 363,
        [85,774] => 364,
        [117,774] => 365,
        [85,778] => 366,
        [117,778] => 367,
        [85,779] => 368,
        [117,779] => 369,
        [85,808] => 370,
        [117,808] => 371,
        [87,770] => 372,
        [119,770] => 373,
        [89,770] => 374,
        [121,770] => 375,
        [89,776] => 376,
        [90,769] => 377,
        [122,769] => 378,
        [90,775] => 379,
        [122,775] => 380,
        [90,780] => 381,
        [122,780] => 382
      }.each_key(&:freeze).freeze
    end
  end
end
@@ -0,0 +1,125 @@
1
module Kebab
  module UTF8

    autoload :JavaProxy, "kebab/utf8/java_proxy"
    autoload :UnicodeProxy, "kebab/utf8/unicode_proxy"
    autoload :ActiveSupportProxy, "kebab/utf8/active_support_proxy"
    autoload :DumbProxy, "kebab/utf8/dumb_proxy"

    # A UTF-8 proxy for Kebab can be any object which responds to the methods in this module.
    # The following proxies are provided by Kebab: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
    module Proxy
      # Byte value (128..159) => UTF-8 byte sequence used as its replacement
      # when the byte is treated as Windows CP-1252. Entries mapped to nil
      # have no replacement and are dropped by the callers.
      CP1252 = {
        128 => [226, 130, 172],
        129 => nil,
        130 => [226, 128, 154],
        131 => [198, 146],
        132 => [226, 128, 158],
        133 => [226, 128, 166],
        134 => [226, 128, 160],
        135 => [226, 128, 161],
        136 => [203, 134],
        137 => [226, 128, 176],
        138 => [197, 160],
        139 => [226, 128, 185],
        140 => [197, 146],
        141 => nil,
        142 => [197, 189],
        143 => nil,
        144 => nil,
        145 => [226, 128, 152],
        146 => [226, 128, 153],
        147 => [226, 128, 156],
        148 => [226, 128, 157],
        149 => [226, 128, 162],
        150 => [226, 128, 147],
        151 => [226, 128, 148],
        152 => [203, 156],
        153 => [226, 132, 162],
        154 => [197, 161],
        155 => [226, 128, 186],
        156 => [197, 147],
        157 => nil,
        158 => [197, 190],
        159 => [197, 184]
      }.freeze

      # This is a stub for a method that should return a Unicode-aware
      # downcased version of the given string.
      #
      # @raise [NotImplementedError] always; concrete proxies override it
      def downcase(string)
        raise NotImplementedError
      end

      # This is a stub for a method that should return a Unicode-aware
      # upcased version of the given string.
      #
      # @raise [NotImplementedError] always; concrete proxies override it
      def upcase(string)
        raise NotImplementedError
      end

      # This is a stub for a method that should return the Unicode NFC
      # normalization of the given string.
      #
      # @raise [NotImplementedError] always; concrete proxies override it
      def normalize_utf8(string)
        raise NotImplementedError
      end

      if ''.respond_to?(:scrub) && !defined?(Rubinius)
        # Attempt to replace invalid UTF-8 bytes with valid ones. This method
        # naively assumes if you have invalid UTF8 bytes, they are either Windows
        # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
        # always work.
        #
        # @param string [String] text that may contain invalid UTF-8 bytes
        # @return [String] the text with each invalid byte replaced
        def tidy_bytes(string)
          string.scrub do |bad|
            # String#scrub may hand over several invalid bytes in one chunk
            # (e.g. a truncated multi-byte sequence), so every byte is tidied
            # on its own; bytes without a replacement (nil) are dropped.
            bad.bytes.flat_map { |byte| tidy_byte(byte) }.compact.pack('C*').unpack('U*').pack('U*')
          end
        end
      else
        # Fallback for Rubies without a usable String#scrub: scans the raw
        # bytes, tracking how many continuation bytes the last leading byte
        # still expects, and tidies every byte that does not fit.
        #
        # @param string [String] text that may contain invalid UTF-8 bytes
        # @return [String] the text with each invalid byte replaced
        def tidy_bytes(string)
          bytes = string.unpack("C*")
          conts_expected = 0
          last_lead = 0

          bytes.each_index do |i|
            byte = bytes[i]
            is_cont = byte > 127 && byte < 192
            is_lead = byte > 191 && byte < 245
            is_unused = byte > 240
            is_restricted = byte > 244

            # Impossible or highly unlikely byte? Clean it.
            if is_unused || is_restricted
              bytes[i] = tidy_byte(byte)
            elsif is_cont
              # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
              conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
            else
              if conts_expected > 0
                # Expected continuation, but got ASCII or leading? Clean backwards up to
                # the leading byte.
                (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
                conts_expected = 0
              end
              if is_lead
                # Final byte is leading? Clean it.
                if i == bytes.length - 1
                  bytes[i] = tidy_byte(bytes.last)
                else
                  # Valid leading byte? Expect continuations determined by position of
                  # first zero bit, with max of 3.
                  conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
                  last_lead = i
                end
              end
            end
          end
          bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
        end
      end

      private

      # Returns the UTF-8 byte sequence that replaces a single invalid byte,
      # or nil when CP1252 holds no replacement for it.
      #
      # @param byte [Integer] one byte value
      # @return [Array<Integer>, nil] replacement bytes
      def tidy_byte(byte)
        if byte < 160
          CP1252[byte]
        elsif byte < 192
          # Two-byte UTF-8 form of the Latin-1 codepoint with the same value.
          [194, byte]
        else
          [195, byte - 64]
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'unicode'
2
+
3
module Kebab
  module UTF8
    # UTF-8 proxy that forwards every operation to the Unicode gem.
    # @see http://github.com/blackwinter/unicode
    module UnicodeProxy
      extend Proxy
      extend self

      # @param string [String] the text to downcase
      # @return [String] whatever +Unicode.downcase+ returns for it
      def downcase(string)
        lowered = Unicode.downcase(string)
        lowered
      end

      # @param string [String] the text to upcase
      # @return [String] whatever +Unicode.upcase+ returns for it
      def upcase(string)
        raised = Unicode.upcase(string)
        raised
      end

      # @param string [String] the text to normalize
      # @return [String] whatever +Unicode.normalize_C+ returns for it
      def normalize_utf8(string)
        composed = Unicode.normalize_C(string)
        composed
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Kebab
  # Holds the gem's version information.
  module Version
    # The gem version as a string; frozen so it cannot be mutated in place.
    STRING = '1.0.2'.freeze
  end
end