babosa 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2010 Norman Clarke
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,91 @@
1
+ # Babosa
2
+
3
+ Babosa is a library for creating slugs. It is an extraction and improvement of
4
+ the string code from [FriendlyId](http://github.com/norman/friendly_id),
5
+ intended to help developers create libraries similar to FriendlyId.
6
+
7
+ ## Features / Usage
8
+
9
+ ### ASCII transliteration
10
+
11
+ "Gölcük, Turkey".to_slug.approximate_ascii.to_s #=> "Golcuk, Turkey"
12
+
13
+ ### Special cases for German and Spanish
14
+
15
+ "Jürgen Müller".to_slug.approximate_ascii.to_s #=> "Jurgen Muller"
16
+ "Jürgen Müller".to_slug.approximate_ascii(:german).to_s #=> "Juergen Mueller"
17
+ "feliz año".to_slug.approximate_ascii.to_s #=> "feliz ano"
18
+ "feliz año".to_slug.approximate_ascii(:spanish).to_s #=> "feliz anio"
19
+
20
+ ### Non-ASCII removal
21
+
22
+ "Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
23
+
24
+ ### Truncate by characters
25
+
26
+ "üüü".to_slug.truncate(2).to_s #=> "üü"
27
+
28
+ ### Truncate by bytes
29
+
30
+ This can be useful to ensure the generated slug will fit in a database column
31
+ whose length is limited by bytes rather than UTF-8 characters.
32
+
33
+ "üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
34
+
35
+ ### All-in-one
36
+
37
+ "Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
38
+
39
+ There are many more features; check the API docs and source code to find out
40
+ more.
41
+
42
+ ## Getting it
43
+
44
+ Babosa can be installed via RubyGems:
45
+
46
+ gem install babosa
47
+
48
+ You can get the source code from its [GitHub repository](http://github.com/norman/babosa).
49
+
50
+ ## Reporting bugs
51
+
52
+ Please use Babosa's [GitHub issue tracker](http://github.com/norman/babosa/issues).
53
+
54
+
55
+ ## Misc
56
+
57
+ The speed and quality of Babosa's UTF-8 support depend on which Ruby and which
58
+ gems you are using.
59
+
60
+ On JRuby 1.5 and above, Babosa uses Java's native UTF-8 support. If you require
61
+ [Unicode](http://github.com/blackwinter/unicode) or ActiveSupport before
62
+ Babosa, it will use the support provided by those libraries. Otherwise, Babosa
63
+ defaults to very basic UTF-8 support for Latin characters only.
64
+
65
+ "Babosa" means slug in Spanish.
66
+
67
+ ## Author
68
+
69
+ [Norman Clarke](http://njclarke.com)
70
+
71
+ ## Copyright
72
+
73
+ Copyright (c) 2010 Norman Clarke
74
+
75
+ Permission is hereby granted, free of charge, to any person obtaining a copy
76
+ of this software and associated documentation files (the "Software"), to deal
77
+ in the Software without restriction, including without limitation the rights
78
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
79
+ copies of the Software, and to permit persons to whom the Software is
80
+ furnished to do so, subject to the following conditions:
81
+
82
+ The above copyright notice and this permission notice shall be included in all
83
+ copies or substantial portions of the Software.
84
+
85
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
86
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
87
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
88
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
89
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
90
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
91
+ SOFTWARE.
@@ -0,0 +1,27 @@
1
require "rake/testtask"
require "rake/clean"
require "rake/gempackagetask"

# Running plain `rake` runs the test suite.
task :default => :test

# Generated directories removed by `rake clean`.
%w[pkg doc coverage .yardoc].each { |path| CLEAN << path }

# The gemspec file evaluates to the specification object used for packaging.
babosa_spec = eval(File.read("babosa.gemspec"))
Rake::GemPackageTask.new(babosa_spec) { |pkg| }

Rake::TestTask.new(:test) do |t|
  t.pattern = "test/**/*_test.rb"
end

# The YARD task is defined only when the yard library can be loaded.
begin
  require "yard"
  YARD::Rake::YardocTask.new do |t|
    t.options = ["--output-dir=doc"]
  end
rescue LoadError
  # yard is not available; skip the documentation task.
end

# The rcov task is defined only when the rcov library can be loaded.
begin
  require "rcov/rcovtask"
  Rcov::RcovTask.new do |r|
    r.test_files = FileList["test/**/*_test.rb"]
    r.verbose = true
    r.rcov_opts << "--exclude gems/*"
  end
rescue LoadError
  # rcov is not available; skip the coverage task.
end
data/init.rb ADDED
@@ -0,0 +1,3 @@
1
# Put this directory's lib/ folder on the load path (without duplicates),
# then load the library.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH << lib_dir
$LOAD_PATH.uniq!
require "babosa"
@@ -0,0 +1,22 @@
1
module Babosa
  # Reports whether the code is running on JRuby 1.5 or newer.
  #
  # The version is compared numerically, component by component, so that a
  # version such as "1.10.0" is correctly treated as newer than "1.5".
  #
  # @return [Boolean] false when +JRUBY_VERSION+ is not defined or is older
  #   than 1.5, true otherwise.
  def self.jruby15?
    return false unless defined?(JRUBY_VERSION)
    components = JRUBY_VERSION.split(".").map { |part| part.to_i }
    (components <=> [1, 5]) >= 0
  end
end
6
+
7
class String
  # Wrap this string in a Babosa::SlugString.
  # @return [Babosa::SlugString]
  def to_slug
    Babosa::SlugString.new(self)
  end

  # Compatibility with 1.8.6: provide String#bytesize only when the running
  # Ruby does not already define it.
  unless public_method_defined?(:bytesize)
    def bytesize
      unpack("C*").length
    end
  end
end
19
+
20
+ require "babosa/characters"
21
+ require "babosa/utf8/proxy"
22
+ require "babosa/slug_string"
@@ -0,0 +1,75 @@
1
+ # encoding: utf-8
2
module Babosa

  # This module provides sets of characters needed for various UTF-8 aware
  # string operations.
  module Characters
    extend self

    # Hash of UTF-8 - ASCII approximations, keyed by approximation set name.
    # Each set maps a Unicode codepoint (Integer) to either a single
    # replacement codepoint or an Array of replacement codepoints.
    attr_reader :approximations
    # Punctuation and control characters to remove from slug strings, as an
    # Array of codepoints.
    attr_reader :strippable

    @strippable = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39,
      40, 41, 42, 43, 44, 45, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94,
      95, 96, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
      136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
      151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166,
      167, 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183,
      184, 185, 187, 188, 189, 190, 191, 215, 247]

    # Adds (or replaces) a named hash of approximations.
    #
    # Only the first codepoint of each key is used, so every key must be a
    # single character. Replacement values are decoded as UTF-8 codepoints, so
    # a replacement may itself contain non-ASCII characters.
    #
    # @example
    #   add_approximations :spanish, "ñ" => "ni"
    # @param [#to_sym] name The name of the approximations to add.
    # @param [Hash] hash The approximations to add, as character => replacement.
    # @return [Hash] The stored codepoint map for +name+.
    def add_approximations(name, hash)
      @approximations ||= {}
      @approximations[name.to_sym] = hash.inject({}) do |memo, (char, replacement)|
        key = char.unpack("U").first
        # Decode as codepoints ("U*"), not bytes: the result is later packed
        # with "U*", so raw bytes would corrupt multibyte replacements.
        value = replacement.unpack("U*")
        memo[key] = value.length == 1 ? value[0] : value
        memo
      end
    end

    add_approximations :spanish, "ñ" => "ni"
    add_approximations :german, "ä" => "ae", "ö" => "oe", "ü" => "ue"
    add_approximations :latin, {
      "À" => "A", "Á" => "A", "Â" => "A", "Ã" => "A", "Ä" => "A", "Å" => "A",
      "Æ" => "AE", "Ç" => "C", "È" => "E", "É" => "E", "Ê" => "E", "Ë" => "E",
      "Ì" => "I", "Í" => "I", "Î" => "I", "Ï" => "I", "Ð" => "D", "Ñ" => "N",
      "Ò" => "O", "Ó" => "O", "Ô" => "O", "Õ" => "O", "Ö" => "O", "Ø" => "O",
      "Ù" => "U", "Ú" => "U", "Û" => "U", "Ü" => "U", "Ý" => "Y", "Þ" => "Th",
      "ß" => "ss", "à" => "a" , "á" => "a", "â" => "a", "ã" => "a", "ä" => "a",
      "å" => "a", "æ" => "ae", "ç" => "c" , "è" => "e", "é" => "e", "ê" => "e",
      "ë" => "e", "ì" => "i", "í" => "i", "î" => "i", "ï" => "i", "ð" => "d",
      "ñ" => "n", "ò" => "o", "ó" => "o", "ô" => "o", "õ" => "o", "ö" => "o",
      "ø" => "o", "ù" => "u", "ú" => "u", "û" => "u", "ü" => "u", "ý" => "y",
      "þ" => "th", "ÿ" => "y", "Ā" => "A", "ā" => "a", "Ă" => "A", "ă" => "a",
      "Ą" => "A", "ą" => "a", "Ć" => "C", "ć" => "c", "Ĉ" => "C", "ĉ" => "c",
      "Ċ" => "C", "ċ" => "c", "Č" => "C", "č" => "c", "Ď" => "D", "ď" => "d",
      "Đ" => "D", "đ" => "d", "Ē" => "E", "ē" => "e", "Ĕ" => "E", "ĕ" => "e",
      "Ė" => "E", "ė" => "e", "Ę" => "E", "ę" => "e", "Ě" => "E", "ě" => "e",
      "Ĝ" => "G", "ĝ" => "g", "Ğ" => "G", "ğ" => "g", "Ġ" => "G", "ġ" => "g",
      "Ģ" => "G", "ģ" => "g", "Ĥ" => "H", "ĥ" => "h", "Ħ" => "H", "ħ" => "h",
      "Ĩ" => "I", "ĩ" => "i", "Ī" => "I", "ī" => "i", "Ĭ" => "I", "ĭ" => "i",
      # The ligature keys below are the single codepoints U+0132 and U+0133,
      # not the two-letter sequences "IJ" and "ij".
      "Į" => "I", "į" => "i", "İ" => "I", "ı" => "i", "Ĳ" => "IJ", "ĳ" => "ij",
      "Ĵ" => "J", "ĵ" => "j", "Ķ" => "K", "ķ" => "k", "ĸ" => "k", "Ĺ" => "L",
      "ĺ" => "l", "Ļ" => "L", "ļ" => "l", "Ľ" => "L", "ľ" => "l", "Ŀ" => "L",
      "ŀ" => "l", "Ł" => "L", "ł" => "l", "Ń" => "N", "ń" => "n", "Ņ" => "N",
      # "ŉ" is the single codepoint U+0149, not an apostrophe followed by "n".
      "ņ" => "n", "Ň" => "N", "ň" => "n", "ŉ" => "n", "Ŋ" => "NG", "ŋ" => "ng",
      "Ō" => "O", "ō" => "o", "Ŏ" => "O", "ŏ" => "o", "Ő" => "O", "ő" => "o",
      "Œ" => "OE", "œ" => "oe", "Ŕ" => "R", "ŕ" => "r", "Ŗ" => "R", "ŗ" => "r",
      "Ř" => "R", "ř" => "r", "Ś" => "S", "ś" => "s", "Ŝ" => "S", "ŝ" => "s",
      "Ş" => "S", "ş" => "s", "Š" => "S", "š" => "s", "Ţ" => "T", "ţ" => "t",
      "Ť" => "T", "ť" => "t", "Ŧ" => "T", "ŧ" => "t", "Ũ" => "U", "ũ" => "u",
      "Ū" => "U", "ū" => "u", "Ŭ" => "U", "ŭ" => "u", "Ů" => "U", "ů" => "u",
      "Ű" => "U", "ű" => "u", "Ų" => "U", "ų" => "u", "Ŵ" => "W", "ŵ" => "w",
      "Ŷ" => "Y", "ŷ" => "y", "Ÿ" => "Y", "Ź" => "Z", "ź" => "z", "Ż" => "Z",
      "ż" => "z", "Ž" => "Z", "ž" => "z", "×" => "x", "÷" => "/"
    }
  end
end
@@ -0,0 +1,223 @@
1
+ # encoding: utf-8
2
+
3
module Babosa

  # This class provides some string-manipulation methods specific to slugs.
  #
  # Note that this class includes many "bang methods" such as {#clean!} and
  # {#normalize!} that perform actions on the string in-place. Each of these
  # methods has a corresponding "bangless" method (i.e., +SlugString#clean!+
  # and +SlugString#clean+) which does not appear in the documentation because
  # it is generated dynamically.
  #
  # All of the bang methods return an instance of String, while the bangless
  # versions return an instance of Babosa::SlugString, so that calls to methods
  # specific to this class can be chained:
  #
  #   string = SlugString.new("hello world")
  #   string.with_dashes! # => "hello-world"
  #   string.with_dashes  # => <Babosa::SlugString:0x000001013e1590 @wrapped_string="hello-world">
  #
  # Any method not defined here is forwarded to the wrapped String.
  #
  # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
  class SlugString

    attr_reader :wrapped_string
    alias to_s wrapped_string

    # The proxy is chosen once, when this class is loaded.
    @@utf8_proxy = if Babosa.jruby15?
      UTF8::JavaProxy
    elsif defined? Unicode
      UTF8::UnicodeProxy
    elsif defined? ActiveSupport
      UTF8::ActiveSupportProxy
    else
      UTF8::DumbProxy
    end

    # Return the proxy used for UTF-8 support.
    # @see Babosa::UTF8::UTF8Proxy
    def self.utf8_proxy
      @@utf8_proxy
    end

    # Set a proxy object used for UTF-8 support.
    # @see Babosa::UTF8::UTF8Proxy
    def self.utf8_proxy=(obj)
      @@utf8_proxy = obj
    end

    # Forward any unknown method to the wrapped string.
    def method_missing(symbol, *args, &block)
      @wrapped_string.__send__(symbol, *args, &block)
    end

    # Keep +respond_to?+ consistent with the forwarding done by
    # {#method_missing}.
    def respond_to_missing?(symbol, include_private = false)
      @wrapped_string.respond_to?(symbol, include_private) || super
    end

    # The given string is passed through {#tidy_bytes!} and {#normalize_utf8!}.
    # @param string [#to_s] The string to use as the basis of the SlugString.
    def initialize(string)
      @wrapped_string = string.to_s
      tidy_bytes!
      normalize_utf8!
    end

    # Approximate an ASCII string. This works only for Western strings using
    # characters that are Roman-alphabet characters + diacritics. Non-letter
    # characters are left unmodified.
    #
    #   string = SlugString.new "Łódź, Poland"
    #   string.approximate_ascii                 # => "Lodz, Poland"
    #   string = SlugString.new "日本"
    #   string.approximate_ascii                 # => "日本"
    #
    # You can pass any key from +Characters.approximations+ as the argument.
    # This allows for contextual approximations. By default, +:spanish+ and
    # +:german+ are provided:
    #
    #   string = SlugString.new "Jürgen Müller"
    #   string.approximate_ascii                 # => "Jurgen Muller"
    #   string.approximate_ascii :german         # => "Juergen Mueller"
    #   string = SlugString.new "¡Feliz año!"
    #   string.approximate_ascii                 # => "¡Feliz ano!"
    #   string.approximate_ascii :spanish        # => "¡Feliz anio!"
    #
    # You can modify the built-in approximations, or add your own:
    #
    #   # Make Spanish use "nh" rather than "ni"
    #   Babosa::Characters.add_approximations(:spanish, "ñ" => "nh")
    #
    # Notice that this method does not simply convert to ASCII; if you want
    # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
    #
    #   string.approximate_ascii!(:spanish)       # => "¡Feliz anio!"
    #   string.to_ascii!                          # => "Feliz anio!"
    #
    # @param overrides [Symbol, Hash] Either the name of a set in
    #   +Characters.approximations+, or a Hash mapping codepoints to
    #   replacement codepoints. An unknown name (or nil) applies no overrides.
    # @return String
    def approximate_ascii!(overrides = {})
      overrides = Characters.approximations[overrides] if overrides.kind_of? Symbol
      # An unregistered name looks up as nil; fall back to "no overrides"
      # instead of failing inside approx_char.
      overrides ||= {}
      @wrapped_string = unpack("U*").map { |char| approx_char(char, overrides) }.flatten.pack("U*")
    end

    # Converts dashes to spaces, collapses runs of repeated spaces into a
    # single space, and removes leading and trailing whitespace.
    # @return String
    def clean!
      @wrapped_string = @wrapped_string.gsub("-", " ").squeeze(" ").strip
    end

    # Remove any non-word characters, i.e. every codepoint listed in
    # +Characters.strippable+.
    # @return String
    def word_chars!
      @wrapped_string = (unpack("U*") - Characters.strippable).pack("U*")
    end

    # Normalize the string for use as a slug. Note that in this context,
    # +normalize+ means, strip, remove non-letters/numbers, downcasing,
    # truncating to 255 bytes and converting whitespace to dashes.
    # @param Boolean ascii If true, approximate ASCII and then remove any non-ASCII characters.
    # @return String
    def normalize!(ascii = false)
      if ascii
        approximate_ascii!
        to_ascii!
      end
      clean!
      word_chars!
      # Clean again: stripping characters may leave adjacent or edge spaces.
      clean!
      downcase!
      truncate_bytes!(255)
      with_dashes!
    end

    # Delete any non-ascii characters.
    # @return String
    def to_ascii!
      @wrapped_string = @wrapped_string.gsub(/[^\x00-\x7f]/u, '')
    end

    # Truncate the string to +max+ characters.
    # @example
    #   "üéøá".to_slug.truncate(3) #=> "üéø"
    # @return String
    def truncate!(max)
      @wrapped_string = unpack("U*")[0...max].pack("U*")
    end

    # Truncate the string to +max+ bytes. This can be useful for ensuring that
    # a UTF-8 string will always fit into a database column with a certain max
    # byte length. The resulting string may be less than +max+ if the string must
    # be truncated at a multibyte character boundary.
    # @example
    #   "üéøá".to_slug.truncate_bytes(3) #=> "ü"
    # @return String
    def truncate_bytes!(max)
      return @wrapped_string if @wrapped_string.bytesize <= max
      curr = 0
      kept = []
      unpack("U*").each do |char|
        # Stop at the first character that did not fit.
        break if curr > max
        char = [char].pack("U")
        curr += char.bytesize
        kept << char if curr <= max
      end
      @wrapped_string = kept.join
    end

    # Replaces whitespace with dashes ("-").
    # @return String
    def with_dashes!
      @wrapped_string = @wrapped_string.gsub(/\s/u, "-")
    end

    # Perform UTF-8 sensitive upcasing.
    # @return String
    def upcase!
      @wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
    end

    # Perform UTF-8 sensitive downcasing.
    # @return String
    def downcase!
      @wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
    end

    # Perform Unicode composition on the wrapped string.
    # @return String
    def normalize_utf8!
      @wrapped_string = @@utf8_proxy.normalize_utf8(@wrapped_string)
    end

    # Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
    # UTF-8.
    # @return String
    def tidy_bytes!
      @wrapped_string = @@utf8_proxy.tidy_bytes(@wrapped_string)
    end

    # Generate the "bangless" variant of each bang method. File and line are
    # passed so that backtraces point at this definition.
    %w[approximate_ascii clean downcase word_chars normalize normalize_utf8
      tidy_bytes to_ascii truncate truncate_bytes upcase with_dashes].each do |method|
      class_eval(<<-EOM, __FILE__, __LINE__ + 1)
        def #{method}(*args)
          send_to_new_instance(:#{method}!, *args)
        end
      EOM
    end

    # @return [SlugString] self
    def to_slug
      self
    end

    private

    # Look up the character's approximation in the configured maps: first in
    # +overrides+, then in the +:latin+ set, otherwise the codepoint itself.
    def approx_char(char, overrides = {})
      overrides[char] || Characters.approximations[:latin][char] || char
    end

    # Used as the basis of the bangless methods: applies the given bang
    # method to a new SlugString and returns that new instance.
    def send_to_new_instance(*args)
      string = SlugString.new self
      string.send(*args)
      string
    end

  end
end
@@ -0,0 +1,20 @@
1
module Babosa
  module UTF8
    # A UTF-8 proxy using Active Support's multibyte support.
    module ActiveSupportProxy
      extend UTF8Proxy
      extend self

      # @return [String] +string+ downcased via ActiveSupport::Multibyte::Chars.
      def downcase(string)
        chars = ActiveSupport::Multibyte::Chars.new(string)
        chars.downcase.to_s
      end

      # @return [String] +string+ upcased via ActiveSupport::Multibyte::Chars.
      def upcase(string)
        chars = ActiveSupport::Multibyte::Chars.new(string)
        chars.upcase.to_s
      end

      # @return [String] +string+ normalized with the +:c+ form.
      def normalize_utf8(string)
        chars = ActiveSupport::Multibyte::Chars.new(string)
        chars.normalize(:c).to_s
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ require File.expand_path("../mappings", __FILE__)
2
module Babosa
  module UTF8

    # This module provides fallback UTF-8 support when nothing else is
    # available. It does case folding for Roman alphabet-based characters
    # commonly used by Western European languages and little else, making it
    # useless for Russian, Bulgarian, Greek, etc. If at all possible, Unicode
    # or ActiveSupport should be used instead because they support the full
    # UTF-8 character range.
    module DumbProxy
      extend UTF8Proxy
      extend self

      # Replace each codepoint with its entry in Mappings::DOWNCASE, leaving
      # codepoints without an entry untouched.
      def downcase(string)
        folded = string.unpack("U*").map do |codepoint|
          Mappings::DOWNCASE[codepoint] || codepoint
        end
        folded.flatten.pack("U*")
      end

      # Replace each codepoint with its entry in Mappings::UPCASE, leaving
      # codepoints without an entry untouched.
      def upcase(string)
        raised = string.unpack("U*").map do |codepoint|
          Mappings::UPCASE[codepoint] || codepoint
        end
        raised.flatten.pack("U*")
      end

      # This does a very naive Unicode normalization, which should work for
      # this library's purposes (i.e., Roman-based codepoints, up to U+017E).
      # Do not reuse this as a general solution! Use a real library like
      # Unicode or ActiveSupport instead.
      def normalize_utf8(string)
        codepoints = string.unpack("U*")
        composed = []
        index = 0
        while index < codepoints.length
          # Look at the current codepoint together with the one after it.
          replacement = Mappings::COMPOSITION[codepoints[index, 2]]
          if replacement
            composed << replacement
            index += 2
          else
            composed << codepoints[index]
            index += 1
          end
        end
        composed.compact.flatten.pack("U*")
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ include Java
2
+
3
module Babosa
  module UTF8
    # A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+.
    module JavaProxy
      extend UTF8Proxy
      extend self
      import java.text.Normalizer

      # @return [String] +string+ lowercased by its Java string counterpart.
      def downcase(string)
        java_string = string.to_java
        java_string.to_lower_case.to_s
      end

      # @return [String] +string+ uppercased by its Java string counterpart.
      def upcase(string)
        java_string = string.to_java
        java_string.to_upper_case.to_s
      end

      # @return [String] +string+ normalized to NFC by java.text.Normalizer.
      def normalize_utf8(string)
        form = Normalizer::Form::NFC
        Normalizer.normalize(string, form).to_s
      end
    end
  end
end
@@ -0,0 +1,193 @@
1
+ module Babosa
2
+ module UTF8
3
+
4
+ # A small subset of the mappings provided by Unicode.org, limited to Latin
5
+ # characters. This is used for Babosa's default "dumb" UTF-8 support.
6
+ module Mappings
7
+ DOWNCASE = Hash[65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102,
8
+ 71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110,
9
+ 79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118,
10
+ 87, 119, 88, 120, 89, 121, 90, 122, 181, 956, 192, 224, 193, 225, 194,
11
+ 226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201,
12
+ 233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208,
13
+ 240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 216,
14
+ 248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223,
15
+ [115, 115], 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267,
16
+ 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
17
+ 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295,
18
+ 296, 297, 298, 299, 300, 301, 302, 303, 304, [105, 775], 306, 307, 308,
19
+ 309, 310, 311, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
20
+ 324, 325, 326, 327, 328, 329, [700, 110], 330, 331, 332, 333, 334, 335,
21
+ 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
22
+ 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
23
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 255,
24
+ 377, 378, 379, 380, 381, 382]
25
+
26
# Reverse of DOWNCASE, restricted to pairs that are valid in both directions.
# Entries whose value is an Array (one codepoint folding to several) are
# dropped because an Array key can never match a single-codepoint lookup.
# The micro sign (181) is dropped because it folds to Greek small mu (956)
# but is not that letter's uppercase form.
UPCASE = DOWNCASE.reject { |upper, lower| lower.kind_of?(Array) || upper == 181 }.invert
27
+
28
+ COMPOSITION = {
29
+ [65,768] => 192,
30
+ [65,769] => 193,
31
+ [65,770] => 194,
32
+ [65,771] => 195,
33
+ [65,776] => 196,
34
+ [65,778] => 197,
35
+ [67,807] => 199,
36
+ [69,768] => 200,
37
+ [69,769] => 201,
38
+ [69,770] => 202,
39
+ [69,776] => 203,
40
+ [73,768] => 204,
41
+ [73,769] => 205,
42
+ [73,770] => 206,
43
+ [73,776] => 207,
44
+ [78,771] => 209,
45
+ [79,768] => 210,
46
+ [79,769] => 211,
47
+ [79,770] => 212,
48
+ [79,771] => 213,
49
+ [79,776] => 214,
50
+ [85,768] => 217,
51
+ [85,769] => 218,
52
+ [85,770] => 219,
53
+ [85,776] => 220,
54
+ [89,769] => 221,
55
+ [97,768] => 224,
56
+ [97,769] => 225,
57
+ [97,770] => 226,
58
+ [97,771] => 227,
59
+ [97,776] => 228,
60
+ [97,778] => 229,
61
+ [99,807] => 231,
62
+ [101,768] => 232,
63
+ [101,769] => 233,
64
+ [101,770] => 234,
65
+ [101,776] => 235,
66
+ [105,768] => 236,
67
+ [105,769] => 237,
68
+ [105,770] => 238,
69
+ [105,776] => 239,
70
+ [110,771] => 241,
71
+ [111,768] => 242,
72
+ [111,769] => 243,
73
+ [111,770] => 244,
74
+ [111,771] => 245,
75
+ [111,776] => 246,
76
+ [117,768] => 249,
77
+ [117,769] => 250,
78
+ [117,770] => 251,
79
+ [117,776] => 252,
80
+ [121,769] => 253,
81
+ [121,776] => 255,
82
+ [65,772] => 256,
83
+ [97,772] => 257,
84
+ [65,774] => 258,
85
+ [97,774] => 259,
86
+ [65,808] => 260,
87
+ [97,808] => 261,
88
+ [67,769] => 262,
89
+ [99,769] => 263,
90
+ [67,770] => 264,
91
+ [99,770] => 265,
92
+ [67,775] => 266,
93
+ [99,775] => 267,
94
+ [67,780] => 268,
95
+ [99,780] => 269,
96
+ [68,780] => 270,
97
+ [100,780] => 271,
98
+ [69,772] => 274,
99
+ [101,772] => 275,
100
+ [69,774] => 276,
101
+ [101,774] => 277,
102
+ [69,775] => 278,
103
+ [101,775] => 279,
104
+ [69,808] => 280,
105
+ [101,808] => 281,
106
+ [69,780] => 282,
107
+ [101,780] => 283,
108
+ [71,770] => 284,
109
+ [103,770] => 285,
110
+ [71,774] => 286,
111
+ [103,774] => 287,
112
+ [71,775] => 288,
113
+ [103,775] => 289,
114
+ [71,807] => 290,
115
+ [103,807] => 291,
116
+ [72,770] => 292,
117
+ [104,770] => 293,
118
+ [73,771] => 296,
119
+ [105,771] => 297,
120
+ [73,772] => 298,
121
+ [105,772] => 299,
122
+ [73,774] => 300,
123
+ [105,774] => 301,
124
+ [73,808] => 302,
125
+ [105,808] => 303,
126
+ [73,775] => 304,
127
+ [74,770] => 308,
128
+ [106,770] => 309,
129
+ [75,807] => 310,
130
+ [107,807] => 311,
131
+ [76,769] => 313,
132
+ [108,769] => 314,
133
+ [76,807] => 315,
134
+ [108,807] => 316,
135
+ [76,780] => 317,
136
+ [108,780] => 318,
137
+ [78,769] => 323,
138
+ [110,769] => 324,
139
+ [78,807] => 325,
140
+ [110,807] => 326,
141
+ [78,780] => 327,
142
+ [110,780] => 328,
143
+ [79,772] => 332,
144
+ [111,772] => 333,
145
+ [79,774] => 334,
146
+ [111,774] => 335,
147
+ [79,779] => 336,
148
+ [111,779] => 337,
149
+ [82,769] => 340,
150
+ [114,769] => 341,
151
+ [82,807] => 342,
152
+ [114,807] => 343,
153
+ [82,780] => 344,
154
+ [114,780] => 345,
155
+ [83,769] => 346,
156
+ [115,769] => 347,
157
+ [83,770] => 348,
158
+ [115,770] => 349,
159
+ [83,807] => 350,
160
+ [115,807] => 351,
161
+ [83,780] => 352,
162
+ [115,780] => 353,
163
+ [84,807] => 354,
164
+ [116,807] => 355,
165
+ [84,780] => 356,
166
+ [116,780] => 357,
167
+ [85,771] => 360,
168
+ [117,771] => 361,
169
+ [85,772] => 362,
170
+ [117,772] => 363,
171
+ [85,774] => 364,
172
+ [117,774] => 365,
173
+ [85,778] => 366,
174
+ [117,778] => 367,
175
+ [85,779] => 368,
176
+ [117,779] => 369,
177
+ [85,808] => 370,
178
+ [117,808] => 371,
179
+ [87,770] => 372,
180
+ [119,770] => 373,
181
+ [89,770] => 374,
182
+ [121,770] => 375,
183
+ [89,776] => 376,
184
+ [90,769] => 377,
185
+ [122,769] => 378,
186
+ [90,775] => 379,
187
+ [122,775] => 380,
188
+ [90,780] => 381,
189
+ [122,780] => 382
190
+ }
191
+ end
192
+ end
193
+ end
@@ -0,0 +1,118 @@
1
module Babosa
  module UTF8

    autoload :JavaProxy,          "babosa/utf8/java_proxy"
    autoload :UnicodeProxy,       "babosa/utf8/unicode_proxy"
    autoload :ActiveSupportProxy, "babosa/utf8/active_support_proxy"
    autoload :DumbProxy,          "babosa/utf8/dumb_proxy"

    # A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
    # The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
    module UTF8Proxy
      # Maps each CP1252 byte in 128..159 to the UTF-8 bytes of the same
      # character; nil marks a byte with no replacement, which is dropped.
      CP1252 = {
        128 => [226, 130, 172],
        129 => nil,
        130 => [226, 128, 154],
        131 => [198, 146],
        132 => [226, 128, 158],
        133 => [226, 128, 166],
        134 => [226, 128, 160],
        135 => [226, 128, 161],
        136 => [203, 134],
        137 => [226, 128, 176],
        138 => [197, 160],
        139 => [226, 128, 185],
        140 => [197, 146],
        141 => nil,
        142 => [197, 189],
        143 => nil,
        144 => nil,
        145 => [226, 128, 152],
        146 => [226, 128, 153],
        147 => [226, 128, 156],
        148 => [226, 128, 157],
        149 => [226, 128, 162],
        150 => [226, 128, 147],
        151 => [226, 128, 148],
        152 => [203, 156],
        153 => [226, 132, 162],
        154 => [197, 161],
        155 => [226, 128, 186],
        156 => [197, 147],
        157 => nil,
        158 => [197, 190],
        159 => [197, 184]
      }

      # This is a stub for a method that should return a Unicode-aware
      # downcased version of the given string.
      def downcase(string)
        raise NotImplementedError
      end

      # This is a stub for a method that should return a Unicode-aware
      # upcased version of the given string.
      def upcase(string)
        raise NotImplementedError
      end

      # This is a stub for a method that should return the Unicode NFC
      # normalization of the given string.
      def normalize_utf8(string)
        raise NotImplementedError
      end

      # Attempt to replace invalid UTF-8 bytes with valid ones. This method
      # naively assumes if you have invalid UTF8 bytes, they are either Windows
      # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
      # always work.
      #
      # A multibyte sequence that is cut short (by the end of the string or by
      # any non-continuation byte) is cleaned byte by byte, so the result can
      # always be decoded as UTF-8.
      #
      # @param string [String] The possibly malformed input.
      # @return [String] A valid UTF-8 string.
      def tidy_bytes(string)
        bytes = string.unpack("C*")
        conts_expected = 0
        last_lead = 0

        bytes.each_index do |i|
          byte = bytes[i]
          is_cont       = byte > 127 && byte < 192
          is_lead       = byte > 191 && byte < 245
          is_unused     = byte > 240
          is_restricted = byte > 244

          if is_cont
            # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
            conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
          else
            if conts_expected > 0
              # Expected continuation, but got something else? Clean backwards
              # up to the leading byte.
              (last_lead...i).each { |j| bytes[j] = tidy_byte(bytes[j]) }
              conts_expected = 0
            end
            if is_unused || is_restricted
              # Impossible or highly unlikely byte? Clean it.
              bytes[i] = tidy_byte(byte)
            elsif is_lead
              if i == bytes.length - 1
                # Final byte is leading? Clean it.
                bytes[i] = tidy_byte(bytes.last)
              else
                # Valid leading byte? Expect continuations determined by position of
                # first zero bit, with max of 3.
                conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
                last_lead = i
              end
            end
          end
        end

        # The string ended in the middle of a multibyte sequence: clean the
        # dangling leading byte and whatever continuations followed it.
        if conts_expected > 0
          (last_lead...bytes.length).each { |j| bytes[j] = tidy_byte(bytes[j]) }
        end

        bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      private

      # Re-encode a single invalid byte as UTF-8 bytes: 128..159 through the
      # CP1252 table, anything higher as the ISO-8859-1 character with the
      # same value. Only called with bytes above 127.
      def tidy_byte(byte)
        byte < 160 ? CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Babosa
  module UTF8
    # A UTF-8 proxy using the Unicode gem.
    # @see http://github.com/blackwinter/unicode
    module UnicodeProxy
      extend UTF8Proxy
      extend self

      # @return [String] the result of Unicode.downcase for +string+.
      def downcase(string)
        lowered = Unicode.downcase(string)
        lowered
      end

      # @return [String] the result of Unicode.upcase for +string+.
      def upcase(string)
        raised = Unicode.upcase(string)
        raised
      end

      # @return [String] the result of Unicode.normalize_C for +string+.
      def normalize_utf8(string)
        composed = Unicode.normalize_C(string)
        composed
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Babosa
  module Version
    # The library version. Frozen so it cannot be modified in place.
    STRING = "0.1.0".freeze
  end
end
@@ -0,0 +1,160 @@
1
+ # encoding: utf-8
2
+ $KCODE = 'UTF8' if RUBY_VERSION < '1.9'
3
+ $LOAD_PATH << File.expand_path("../../lib", __FILE__)
4
+ $LOAD_PATH.uniq!
5
+
6
+ require "rubygems"
7
+ require "bundler"
8
+ Bundler.setup
9
+ require "test/unit"
10
+ require "babosa"
11
+
12
# Gives every class and module a declarative
#
#   test "description" do ... end
#
# macro. The description is turned into a method name by prefixing "test_"
# and replacing every character other than a letter, digit or underscore
# with "_"; the block becomes the method body.
#
# Two descriptions that sanitize to the same method name would silently
# replace one another, so the earlier test would never run. That case now
# raises an ArgumentError instead.
declarative_tests = Module.new do
  # @param name [String] human-readable description of the test
  # @param block the test body
  # @raise [ArgumentError] if a method with the generated name already exists
  def test(name, &block)
    method_name = "test_#{name.gsub(/[^a-z0-9_]/i, "_")}"
    if method_defined?(method_name)
      raise ArgumentError, "#{method_name} is already defined in #{self}"
    end
    define_method(method_name.to_sym, &block)
  end
end

# Module#include is private on older Rubies, hence the use of send.
Module.send :include, declarative_tests
17
+
18
# Shared test cases for the UTF-8 proxies. A class that includes this module
# must define a +proxy+ method returning the proxy object under test.
module UTF8ProxyTest
  test "should downcase strings" do
    assert_equal "åéîøü", proxy.downcase("ÅÉÎØÜ")
  end

  test "should upcase strings" do
    assert_equal "ÅÉÎØÜ", proxy.upcase("åéîøü")
  end

  test "should compose UTF-8" do
    # "ÅÉÎØÜ" written as base letters followed by combining marks (the Ø is
    # already a single code point in the input) ...
    uncomposed_bytes = [65, 204, 138, 69, 204, 129, 73, 204, 130, 195, 152, 85, 204, 136]
    # ... and the same text with one code point per character.
    composed_bytes = [195, 133, 195, 137, 195, 142, 195, 152, 195, 156]
    uncomposed_string = uncomposed_bytes.pack("C*").unpack("U*").pack("U*")
    assert_equal composed_bytes, proxy.normalize_utf8(uncomposed_string).unpack("C*")
  end
end
36
+
37
# Runs the shared UTF8ProxyTest cases against the Java proxy. The class is
# only defined when Babosa.jruby15? returns a truthy value.
if Babosa.jruby15?
  class JavaProxyTest < Test::Unit::TestCase
    include UTF8ProxyTest

    # @return the proxy exercised by the shared test cases
    def proxy
      Babosa::UTF8::JavaProxy
    end
  end
end
45
+
46
# Runs the shared UTF8ProxyTest cases against Babosa::UTF8::DumbProxy.
class DumbProxyTest < Test::Unit::TestCase
  include UTF8ProxyTest

  # @return the proxy exercised by the shared test cases
  def proxy
    Babosa::UTF8::DumbProxy
  end
end
52
+
53
# Tests for the slug string objects returned by String#to_slug.
class BabosaTest < Test::Unit::TestCase
  test "word_chars! should leave only letters and spaces" do
    string = "a*$%^$@!@b$%^&*()*!c"
    # The pattern is anchored on both ends: an unanchored /[a-z ]*/ matches
    # the empty string at the start of any input and therefore can never fail.
    assert_match(/\A[a-z ]*\z/i, string.to_slug.word_chars!)
  end

  test "approximate_ascii should transliterate to ascii" do
    (0xC0..0x17E).each do |codepoint|
      ss = [codepoint].pack("U*").to_slug
      approx = ss.approximate_ascii
      # NOTE(review): this only requires at least one ASCII character in the
      # result; it does not prove the whole result is ASCII.
      assert_match(/[\x0-\x7f]/, approx.to_s)
    end
  end

  test "should lowercase strings" do
    assert_equal "feliz año", "FELIZ AÑO".to_slug.downcase!
  end

  test "should uppercase strings" do
    assert_equal "FELIZ AÑO", "feliz año".to_slug.upcase!
  end

  test "should replace whitespace with dashes" do
    assert_equal "a-b", "a b".to_slug.clean.normalize!
  end

  test "should replace multiple spaces with 1 dash" do
    # Two spaces, so this case differs from the single-space test above.
    assert_equal "a-b", "a  b".to_slug.clean.normalize!
  end

  test "should replace multiple dashes with 1 dash" do
    assert_equal "male-female", "male - female".to_slug.normalize!
  end

  test "should strip trailing space" do
    assert_equal "ab", "ab ".to_slug.normalize!
  end

  test "should strip leading space" do
    assert_equal "ab", " ab".to_slug.normalize!
  end

  test "should strip trailing dashes" do
    assert_equal "ab", "ab-".to_slug.normalize!
  end

  test "should strip leading dashes" do
    assert_equal "ab", "-ab".to_slug.normalize!
  end

  test "should not modify valid name strings" do
    assert_equal "a-b-c-d", "a-b-c-d".to_slug.normalize!
  end

  test "should do special approximations for German" do
    assert_equal "Juergen", "Jürgen".to_slug.approximate_ascii!(:german)
  end

  test "should do special approximations for Spanish" do
    assert_equal "anio", "año".to_slug.approximate_ascii!(:spanish)
  end

  test "should work with non roman chars" do
    assert_equal "検-索", "検 索".to_slug.normalize!
  end

  test "should work with invalid UTF-8 strings" do
    # Every method listed here is called without arguments. The previous
    # special case for "truncate" could never run because "truncate" is not
    # in the list.
    %w[approximate_ascii clean downcase word_chars normalize to_ascii upcase with_dashes].each do |method|
      string = "\x93abc".to_slug
      assert_nothing_raised do
        string.send(method)
      end
    end
  end

  test "should truncate string by byte length" do
    assert_equal "ü", "üa".to_slug.truncate_bytes!(2)
    assert_equal "", "üa".to_slug.truncate_bytes!(1)
    assert_equal "üa", "üa".to_slug.truncate_bytes!(100)
    assert_equal "ü", "üéøá".to_slug.truncate_bytes!(3)
  end

  test "should truncate string by char length" do
    assert_equal "üa", "üa".to_slug.truncate!(2)
    assert_equal "ü", "üa".to_slug.truncate!(1)
    assert_equal "üa", "üa".to_slug.truncate!(100)
  end

  test "should transliterate uncomposed utf8" do
    string = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS
    assert_equal "u", string.to_slug.approximate_ascii!
  end

  test "with_dashes should not change byte size when replacing spaces" do
    assert_equal "".bytesize, "".to_slug.with_dashes.bytesize
    assert_equal " ".bytesize, " ".to_slug.with_dashes.bytesize
    assert_equal "-abc-".bytesize, "-abc-".to_slug.with_dashes.bytesize
    assert_equal " abc ".bytesize, " abc ".to_slug.with_dashes.bytesize
    assert_equal " a bc ".bytesize, " a bc ".to_slug.with_dashes.bytesize
  end

  test "normalize! with ascii should approximate and strip non ascii" do
    ss = "カタカナ: katakana is über cool".to_slug
    assert_equal "katakana-is-uber-cool", ss.normalize!(true)
  end
end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: babosa
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Norman Clarke
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-07-12 00:00:00 -03:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: " A library for creating slugs. Babosa is an extraction and improvement of the\n string code from FriendlyId, intended to help developers create similar\n libraries or plugins.\n"
22
+ email: norman@njclarke.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - lib/babosa/characters.rb
31
+ - lib/babosa/slug_string.rb
32
+ - lib/babosa/utf8/active_support_proxy.rb
33
+ - lib/babosa/utf8/dumb_proxy.rb
34
+ - lib/babosa/utf8/java_proxy.rb
35
+ - lib/babosa/utf8/mappings.rb
36
+ - lib/babosa/utf8/proxy.rb
37
+ - lib/babosa/utf8/unicode_proxy.rb
38
+ - lib/babosa/version.rb
39
+ - lib/babosa.rb
40
+ - README.md
41
+ - MIT-LICENSE
42
+ - Rakefile
43
+ - init.rb
44
+ - test/babosa_test.rb
45
+ has_rdoc: false
46
+ homepage: http://norman.github.com/babosa
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options: []
51
+
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ segments:
60
+ - 0
61
+ version: "0"
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ requirements: []
71
+
72
+ rubyforge_project: "[none]"
73
+ rubygems_version: 1.3.7
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: A library for creating slugs.
77
+ test_files:
78
+ - test/babosa_test.rb