babosa 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2010 Norman Clarke
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,91 @@
1
+ # Babosa
2
+
3
+ Babosa is a library for creating slugs. It is an extraction and improvement of
4
+ the string code from [FriendlyId](http://github.com/norman/friendly_id),
5
+ intended to help developers create libraries similar to FriendlyId.
6
+
7
+ ## Features / Usage
8
+
9
+ ### ASCII transliteration
10
+
11
+ "Gölcük, Turkey".to_slug.approximate_ascii.to_s #=> "Golcuk, Turkey"
12
+
13
+ ### Special cases for German and Spanish
14
+
15
+ "Jürgen Müller".to_slug.approximate_ascii.to_s #=> "Jurgen Muller"
16
+ "Jürgen Müller".to_slug.approximate_ascii(:german).to_s #=> "Juergen Mueller"
17
+ "feliz año".to_slug.approximate_ascii.to_s #=> "feliz ano"
18
+ "feliz año".to_slug.approximate_ascii(:spanish).to_s #=> "feliz anio"
19
+
20
+ ### Non-ASCII removal
21
+
22
+ "Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
23
+
24
+ ### Truncate by characters
25
+
26
+ "üüü".to_slug.truncate(2).to_s #=> "üü"
27
+
28
+ ### Truncate by bytes
29
+
30
+ This can be useful to ensure the generated slug will fit in a database column
31
+ whose length is limited by bytes rather than UTF-8 characters.
32
+
33
+ "üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
34
+
35
+ ### All-in-one
36
+
37
+ "Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
38
+
39
+ There are many more features; check the API docs and source code to find out
40
+ more.
41
+
42
+ ## Getting it
43
+
44
+ Babosa can be installed via RubyGems:
45
+
46
+ gem install babosa
47
+
48
+ You can get the source code from its [GitHub repository](http://github.com/norman/babosa).
49
+
50
+ ## Reporting bugs
51
+
52
+ Please use Babosa's [GitHub issue tracker](http://github.com/norman/babosa/issues).
53
+
54
+
55
+ ## Misc
56
+
57
+ The speed and quality of Babosa's UTF-8 support depend on which Ruby and which
58
+ gems you are using.
59
+
60
+ On JRuby 1.5 and above, Babosa uses Java's native UTF-8 support. If you require
61
+ [Unicode](http://github.com/blackwinter/unicode) or ActiveSupport before
62
+ Babosa, it will use the support provided by those libraries. Otherwise, Babosa
63
+ defaults to very basic UTF-8 support for Latin characters only.
64
+
65
+ "Babosa" means slug in Spanish.
66
+
67
+ ## Author
68
+
69
+ [Norman Clarke](http://njclarke.com)
70
+
71
+ ## Copyright
72
+
73
+ Copyright (c) 2010 Norman Clarke
74
+
75
+ Permission is hereby granted, free of charge, to any person obtaining a copy
76
+ of this software and associated documentation files (the "Software"), to deal
77
+ in the Software without restriction, including without limitation the rights
78
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
79
+ copies of the Software, and to permit persons to whom the Software is
80
+ furnished to do so, subject to the following conditions:
81
+
82
+ The above copyright notice and this permission notice shall be included in all
83
+ copies or substantial portions of the Software.
84
+
85
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
86
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
87
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
88
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
89
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
90
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
91
+ SOFTWARE.
@@ -0,0 +1,27 @@
1
+ require "rake/testtask"
2
+ require "rake/clean"
3
+ require "rake/gempackagetask"
4
+
5
+ task :default => :test
6
+
7
+ CLEAN << "pkg" << "doc" << "coverage" << ".yardoc"
8
+ Rake::GemPackageTask.new(eval(File.read("babosa.gemspec"))) { |pkg| }
9
+ Rake::TestTask.new(:test) { |t| t.pattern = "test/**/*_test.rb" }
10
+
11
+ begin
12
+ require "yard"
13
+ YARD::Rake::YardocTask.new do |t|
14
+ t.options = ["--output-dir=doc"]
15
+ end
16
+ rescue LoadError
17
+ end
18
+
19
+ begin
20
+ require "rcov/rcovtask"
21
+ Rcov::RcovTask.new do |r|
22
+ r.test_files = FileList["test/**/*_test.rb"]
23
+ r.verbose = true
24
+ r.rcov_opts << "--exclude gems/*"
25
+ end
26
+ rescue LoadError
27
+ end
data/init.rb ADDED
@@ -0,0 +1,3 @@
1
+ $LOAD_PATH << File.expand_path("../lib", __FILE__)
2
+ $LOAD_PATH.uniq!
3
+ require "babosa"
@@ -0,0 +1,22 @@
1
# Top-level namespace for the Babosa slug library.
module Babosa
  # Reports whether the code is running on JRuby 1.5 or newer.
  #
  # The version is compared numerically, component by component, because a
  # plain string comparison would order a release such as "1.10" before "1.5".
  # The constant is probed with +defined?+ instead of rescuing, so unrelated
  # errors are no longer silently swallowed.
  #
  # @return [Boolean] false when JRUBY_VERSION is not defined.
  def self.jruby15?
    return false unless defined?(JRUBY_VERSION)
    (JRUBY_VERSION.split(".").map { |part| part.to_i } <=> [1, 5]) >= 0
  end
end
6
+
7
# Core extensions that let any String be turned into a slug.
class String
  # Wraps the receiver in a Babosa::SlugString.
  def to_slug
    Babosa::SlugString.new(self)
  end

  # Ruby 1.8.6 has no String#bytesize; supply one there by counting the
  # receiver's bytes. Rubies that already define it are left alone.
  unless public_method_defined?(:bytesize)
    def bytesize
      unpack("C*").length
    end
  end
end
19
+
20
+ require "babosa/characters"
21
+ require "babosa/utf8/proxy"
22
+ require "babosa/slug_string"
@@ -0,0 +1,75 @@
1
+ # encoding: utf-8
2
module Babosa

  # This module provides sets of characters needed for various UTF-8 aware
  # string operations.
  module Characters
    extend self

    # Hash of approximation tables, keyed by table name (a Symbol). Each table
    # maps a Unicode codepoint (Integer) to its replacement: a single
    # codepoint, or an Array of codepoints when the replacement is longer than
    # one character.
    attr_reader :approximations
    # Codepoints of punctuation and control characters to remove from slug
    # strings.
    attr_reader :strippable

    @strippable = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39,
      40, 41, 42, 43, 44, 45, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94,
      95, 96, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
      136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
      151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166,
      167, 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183,
      184, 185, 187, 188, 189, 190, 191, 215, 247]

    # Adds (or replaces) a named table of approximations.
    #
    # Only the FIRST codepoint of each key is used, so every key must be a
    # single character. The replacement is decoded as UTF-8 codepoints (not raw
    # bytes), because callers pack the stored values back with "U*".
    #
    # @example
    #   add_approximations :spanish, "ñ" => "ni"
    # @param [#to_sym] name The name of the approximations to add.
    # @param [Hash] hash The approximations to add (character => replacement).
    # @return [Hash] The stored table of codepoint => codepoint(s).
    def add_approximations(name, hash)
      @approximations ||= {}
      @approximations[name.to_sym] = hash.inject({}) do |memo, (char, replacement)|
        key = char.unpack("U").shift
        value = replacement.unpack("U*")
        memo[key] = value.length == 1 ? value[0] : value
        memo
      end
    end

    add_approximations :spanish, "ñ" => "ni"
    add_approximations :german, "ä" => "ae", "ö" => "oe", "ü" => "ue"
    # U+0132, U+0133, U+013F, U+0140 and U+0149 are written as codepoints: they
    # have compatibility decompositions, so any tool that normalizes this file
    # would turn the literal characters into multi-character keys, and only the
    # first codepoint of a key is kept.
    add_approximations :latin, {
      "À" => "A", "Á" => "A", "Â" => "A", "Ã" => "A", "Ä" => "A", "Å" => "A",
      "Æ" => "AE", "Ç" => "C", "È" => "E", "É" => "E", "Ê" => "E", "Ë" => "E",
      "Ì" => "I", "Í" => "I", "Î" => "I", "Ï" => "I", "Ð" => "D", "Ñ" => "N",
      "Ò" => "O", "Ó" => "O", "Ô" => "O", "Õ" => "O", "Ö" => "O", "Ø" => "O",
      "Ù" => "U", "Ú" => "U", "Û" => "U", "Ü" => "U", "Ý" => "Y", "Þ" => "Th",
      "ß" => "ss", "à" => "a" , "á" => "a", "â" => "a", "ã" => "a", "ä" => "a",
      "å" => "a", "æ" => "ae", "ç" => "c" , "è" => "e", "é" => "e", "ê" => "e",
      "ë" => "e", "ì" => "i", "í" => "i", "î" => "i", "ï" => "i", "ð" => "d",
      "ñ" => "n", "ò" => "o", "ó" => "o", "ô" => "o", "õ" => "o", "ö" => "o",
      "ø" => "o", "ù" => "u", "ú" => "u", "û" => "u", "ü" => "u", "ý" => "y",
      "þ" => "th", "ÿ" => "y", "Ā" => "A", "ā" => "a", "Ă" => "A", "ă" => "a",
      "Ą" => "A", "ą" => "a", "Ć" => "C", "ć" => "c", "Ĉ" => "C", "ĉ" => "c",
      "Ċ" => "C", "ċ" => "c", "Č" => "C", "č" => "c", "Ď" => "D", "ď" => "d",
      "Đ" => "D", "đ" => "d", "Ē" => "E", "ē" => "e", "Ĕ" => "E", "ĕ" => "e",
      "Ė" => "E", "ė" => "e", "Ę" => "E", "ę" => "e", "Ě" => "E", "ě" => "e",
      "Ĝ" => "G", "ĝ" => "g", "Ğ" => "G", "ğ" => "g", "Ġ" => "G", "ġ" => "g",
      "Ģ" => "G", "ģ" => "g", "Ĥ" => "H", "ĥ" => "h", "Ħ" => "H", "ħ" => "h",
      "Ĩ" => "I", "ĩ" => "i", "Ī" => "I", "ī" => "i", "Ĭ" => "I", "ĭ" => "i",
      "Į" => "I", "į" => "i", "İ" => "I", "ı" => "i",
      [0x0132].pack("U") => "IJ", [0x0133].pack("U") => "ij",
      "Ĵ" => "J", "ĵ" => "j", "Ķ" => "K", "ķ" => "k", "ĸ" => "k", "Ĺ" => "L",
      "ĺ" => "l", "Ļ" => "L", "ļ" => "l", "Ľ" => "L", "ľ" => "l",
      [0x013F].pack("U") => "L", [0x0140].pack("U") => "l",
      "Ł" => "L", "ł" => "l", "Ń" => "N", "ń" => "n", "Ņ" => "N",
      "ņ" => "n", "Ň" => "N", "ň" => "n",
      [0x0149].pack("U") => "n", "Ŋ" => "NG", "ŋ" => "ng",
      "Ō" => "O", "ō" => "o", "Ŏ" => "O", "ŏ" => "o", "Ő" => "O", "ő" => "o",
      "Œ" => "OE", "œ" => "oe", "Ŕ" => "R", "ŕ" => "r", "Ŗ" => "R", "ŗ" => "r",
      "Ř" => "R", "ř" => "r", "Ś" => "S", "ś" => "s", "Ŝ" => "S", "ŝ" => "s",
      "Ş" => "S", "ş" => "s", "Š" => "S", "š" => "s", "Ţ" => "T", "ţ" => "t",
      "Ť" => "T", "ť" => "t", "Ŧ" => "T", "ŧ" => "t", "Ũ" => "U", "ũ" => "u",
      "Ū" => "U", "ū" => "u", "Ŭ" => "U", "ŭ" => "u", "Ů" => "U", "ů" => "u",
      "Ű" => "U", "ű" => "u", "Ų" => "U", "ų" => "u", "Ŵ" => "W", "ŵ" => "w",
      "Ŷ" => "Y", "ŷ" => "y", "Ÿ" => "Y", "Ź" => "Z", "ź" => "z", "Ż" => "Z",
      "ż" => "z", "Ž" => "Z", "ž" => "z", "×" => "x", "÷" => "/"
    }
  end
end
@@ -0,0 +1,223 @@
1
+ # encoding: utf-8
2
+
3
module Babosa

  # This class provides some string-manipulation methods specific to slugs.
  #
  # Note that this class includes many "bang methods" such as {#clean!} and
  # {#normalize!} that perform actions on the string in-place. Each of these
  # methods has a corresponding "bangless" method (i.e., +SlugString#clean!+
  # and +SlugString#clean+) which does not appear in the documentation because
  # it is generated dynamically.
  #
  # All of the bang methods return an instance of String, while the bangless
  # versions return an instance of Babosa::SlugString, so that calls to methods
  # specific to this class can be chained:
  #
  #   string = SlugString.new("hello world")
  #   string.with_dashes! # => "hello-world"
  #   string.with_dashes  # => <Babosa::SlugString:0x000001013e1590 @wrapped_string="hello-world">
  #
  # Any method not defined here is forwarded to the wrapped String.
  #
  # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
  class SlugString

    attr_reader :wrapped_string
    alias to_s wrapped_string

    # Chosen once, when this class is loaded: Java on JRuby 1.5+, otherwise
    # whichever of Unicode / ActiveSupport is already defined, otherwise the
    # built-in Latin-only fallback.
    @@utf8_proxy = if Babosa.jruby15?
      UTF8::JavaProxy
    elsif defined? Unicode
      UTF8::UnicodeProxy
    elsif defined? ActiveSupport
      UTF8::ActiveSupportProxy
    else
      UTF8::DumbProxy
    end

    # Return the proxy used for UTF-8 support.
    # @see Babosa::UTF8::UTF8Proxy
    def self.utf8_proxy
      @@utf8_proxy
    end

    # Set a proxy object used for UTF-8 support.
    # @see Babosa::UTF8::UTF8Proxy
    def self.utf8_proxy=(obj)
      @@utf8_proxy = obj
    end

    # Forwards any unknown method to the wrapped String.
    def method_missing(symbol, *args, &block)
      @wrapped_string.__send__(symbol, *args, &block)
    end

    # Keeps +respond_to?+ and +method+ truthful about the methods that
    # {#method_missing} forwards to the wrapped String.
    def respond_to_missing?(symbol, include_private = false)
      @wrapped_string.respond_to?(symbol, include_private) || super
    end

    # @param string [#to_s] The string to use as the basis of the SlugString.
    #   Its bytes are tidied and the result is Unicode-composed immediately.
    def initialize(string)
      @wrapped_string = string.to_s
      tidy_bytes!
      normalize_utf8!
    end

    # Approximate an ASCII string. This works only for Western strings using
    # characters that are Roman-alphabet characters + diacritics. Characters
    # without an approximation are left unmodified.
    #
    #   string = SlugString.new "Łódź, Poland"
    #   string.approximate_ascii                 # => "Lodz, Poland"
    #   string = SlugString.new "日本"
    #   string.approximate_ascii                 # => "日本"
    #
    # You can pass any key from +Characters.approximations+ as the argument.
    # This allows for contextual approximations. By default, +:spanish+ and
    # +:german+ are provided:
    #
    #   string = SlugString.new "Jürgen Müller"
    #   string.approximate_ascii                 # => "Jurgen Muller"
    #   string.approximate_ascii :german         # => "Juergen Mueller"
    #   string = SlugString.new "¡Feliz año!"
    #   string.approximate_ascii                 # => "¡Feliz ano!"
    #   string.approximate_ascii :spanish        # => "¡Feliz anio!"
    #
    # You can modify the built-in approximations, or add your own:
    #
    #   # Make Spanish use "nh" rather than "ni"
    #   Babosa::Characters.add_approximations(:spanish, "ñ" => "nh")
    #
    # Notice that this method does not simply convert to ASCII; if you want
    # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
    #
    #   string.approximate_ascii!(:spanish)      # => "¡Feliz anio!"
    #   string.to_ascii!                         # => "Feliz anio!"
    #
    # @param [Symbol, Hash] overrides Either the name of a table registered in
    #   +Characters.approximations+, or a Hash mapping codepoints (Integer) to
    #   replacement codepoint(s). It is consulted before the +:latin+ table.
    #   An unknown name (or nil) falls back to the +:latin+ table alone.
    # @return [String]
    def approximate_ascii!(overrides = {})
      overrides = Characters.approximations[overrides] if overrides.kind_of? Symbol
      # An unregistered name yields nil; treat that as "no overrides" rather
      # than failing on the lookup in approx_char.
      overrides ||= {}
      @wrapped_string = unpack("U*").map { |char| approx_char(char, overrides) }.flatten.pack("U*")
    end

    # Converts dashes to spaces, collapses runs of spaces into a single space,
    # and removes leading and trailing whitespace.
    # @return [String]
    def clean!
      @wrapped_string = @wrapped_string.gsub("-", " ").squeeze(" ").strip
    end

    # Remove any non-word characters, i.e. every codepoint listed in
    # +Characters.strippable+.
    # @return [String]
    def word_chars!
      @wrapped_string = (unpack("U*") - Characters.strippable).pack("U*")
    end

    # Normalize the string for use as a slug. Note that in this context,
    # +normalize+ means: strip, remove non-word characters, downcase,
    # truncate to 255 bytes and convert whitespace to dashes.
    # @param [Boolean] ascii If true, approximate ASCII and then remove any non-ASCII characters.
    # @return [String]
    def normalize!(ascii = false)
      if ascii
        approximate_ascii!
        to_ascii!
      end
      clean!
      word_chars!
      # Removing characters can leave doubled or edge spaces behind, so clean
      # a second time.
      clean!
      downcase!
      truncate_bytes!(255)
      with_dashes!
    end

    # Delete any non-ascii characters.
    # @return [String]
    def to_ascii!
      @wrapped_string = @wrapped_string.gsub(/[^\x00-\x7f]/u, '')
    end

    # Truncate the string to +max+ characters.
    # @example
    #   "üéøá".to_slug.truncate(3) #=> "üéø"
    # @param [Integer] max The maximum number of characters to keep.
    # @return [String]
    def truncate!(max)
      @wrapped_string = unpack("U*")[0...max].pack("U*")
    end

    # Truncate the string to +max+ bytes. This can be useful for ensuring that
    # a UTF-8 string will always fit into a database column with a certain max
    # byte length. The resulting string may be less than +max+ if the string must
    # be truncated at a multibyte character boundary.
    # @example
    #   "üéøá".to_slug.truncate_bytes(3) #=> "ü"
    # @param [Integer] max The maximum number of bytes to keep.
    # @return [String]
    def truncate_bytes!(max)
      return @wrapped_string if @wrapped_string.bytesize <= max
      used = 0
      kept = []
      unpack("U*").each do |codepoint|
        char = [codepoint].pack("U")
        used += char.bytesize
        # Stop at the first character that would overflow the byte budget.
        break if used > max
        kept << char
      end
      @wrapped_string = kept.join
    end

    # Replaces whitespace with dashes ("-").
    # @return [String]
    def with_dashes!
      @wrapped_string = @wrapped_string.gsub(/\s/u, "-")
    end

    # Perform UTF-8 sensitive upcasing.
    # @return [String]
    def upcase!
      @wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
    end

    # Perform UTF-8 sensitive downcasing.
    # @return [String]
    def downcase!
      @wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
    end

    # Perform Unicode composition on the wrapped string.
    # @return [String]
    def normalize_utf8!
      @wrapped_string = @@utf8_proxy.normalize_utf8(@wrapped_string)
    end

    # Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
    # UTF-8.
    # @return [String]
    def tidy_bytes!
      @wrapped_string = @@utf8_proxy.tidy_bytes(@wrapped_string)
    end

    # Generate the bangless variant of every bang method. File and line are
    # passed so that backtraces point at this definition.
    %w[approximate_ascii clean downcase word_chars normalize normalize_utf8
      tidy_bytes to_ascii truncate truncate_bytes upcase with_dashes].each do |method|
      class_eval(<<-EOM, __FILE__, __LINE__ + 1)
        def #{method}(*args)
          send_to_new_instance(:#{method}!, *args)
        end
      EOM
    end

    # @return [Babosa::SlugString] self, so that +to_slug+ can be called on
    #   strings and slug strings alike.
    def to_slug
      self
    end

    private

    # Look up the character's approximation in the configured maps: first in
    # +overrides+, then in the +:latin+ table; the codepoint itself is returned
    # when neither has an entry.
    def approx_char(char, overrides = {})
      overrides[char] or Characters.approximations[:latin][char] or char
    end

    # Used as the basis of the bangless methods: wraps the receiver in a new
    # SlugString, applies the given bang method to that copy and returns it.
    def send_to_new_instance(*args)
      string = SlugString.new self
      string.send(*args)
      string
    end

  end
end
@@ -0,0 +1,20 @@
1
module Babosa
  module UTF8
    # UTF-8 proxy that delegates to ActiveSupport::Multibyte::Chars.
    module ActiveSupportProxy
      extend UTF8Proxy
      extend self

      # Returns +string+ lowercased by ActiveSupport's multibyte support.
      def downcase(string)
        multibyte = ActiveSupport::Multibyte::Chars.new(string)
        multibyte.downcase.to_s
      end

      # Returns +string+ uppercased by ActiveSupport's multibyte support.
      def upcase(string)
        multibyte = ActiveSupport::Multibyte::Chars.new(string)
        multibyte.upcase.to_s
      end

      # Returns +string+ normalized with ActiveSupport's +:c+ form.
      def normalize_utf8(string)
        multibyte = ActiveSupport::Multibyte::Chars.new(string)
        multibyte.normalize(:c).to_s
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ require File.expand_path("../mappings", __FILE__)
2
module Babosa
  module UTF8

    # Fallback UTF-8 support for when nothing better is available. Case
    # mapping and composition rely solely on the small tables in Mappings, so
    # this covers the Roman alphabet-based characters commonly used by Western
    # European languages and little else, making it useless for Russian,
    # Bulgarian, Greek, etc. If at all possible, Unicode or ActiveSupport
    # should be used instead because they support the full UTF-8 character
    # range.
    module DumbProxy
      extend UTF8Proxy
      extend self

      # Lowercases +string+ via Mappings::DOWNCASE; codepoints without an
      # entry pass through unchanged.
      def downcase(string)
        lowered = string.unpack("U*").map do |codepoint|
          Mappings::DOWNCASE[codepoint] || codepoint
        end
        lowered.flatten.pack("U*")
      end

      # Uppercases +string+ via Mappings::UPCASE; codepoints without an entry
      # pass through unchanged.
      def upcase(string)
        raised = string.unpack("U*").map do |codepoint|
          Mappings::UPCASE[codepoint] || codepoint
        end
        raised.flatten.pack("U*")
      end

      # This does a very naive Unicode normalization, which should work for
      # this library's purposes (i.e., Roman-based codepoints, up to U+017E).
      # Do not reuse this as a general solution! Use a real library like
      # Unicode or ActiveSupport instead.
      def normalize_utf8(string)
        codepoints = string.unpack("U*")
        composed = []
        index = 0
        while index < codepoints.length
          # Try the current codepoint together with the one following it.
          precomposed = Mappings::COMPOSITION[codepoints[index, 2]]
          if precomposed
            composed << precomposed
            index += 2
          else
            composed << codepoints[index]
            index += 1
          end
        end
        composed.compact.flatten.pack("U*")
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ include Java
2
+
3
module Babosa
  module UTF8
    # UTF-8 proxy built on Java's own Unicode support. Requires JRuby 1.5+.
    module JavaProxy
      extend UTF8Proxy
      extend self
      import java.text.Normalizer

      # Returns +string+ lowercased by the Java string implementation.
      def downcase(string)
        java_string = string.to_java
        java_string.to_lower_case.to_s
      end

      # Returns +string+ uppercased by the Java string implementation.
      def upcase(string)
        java_string = string.to_java
        java_string.to_upper_case.to_s
      end

      # Returns +string+ normalized to NFC by java.text.Normalizer.
      def normalize_utf8(string)
        composed = Normalizer.normalize(string, Normalizer::Form::NFC)
        composed.to_s
      end
    end
  end
end
@@ -0,0 +1,193 @@
1
+ module Babosa
2
+ module UTF8
3
+
4
+ # A small subset of the mappings provided by Unicode.org, limited to Latin
5
+ # characters. This is used for Babosa's default "dumb" UTF-8 support.
6
+ module Mappings
7
+ DOWNCASE = Hash[65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102,
8
+ 71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110,
9
+ 79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118,
10
+ 87, 119, 88, 120, 89, 121, 90, 122, 181, 956, 192, 224, 193, 225, 194,
11
+ 226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201,
12
+ 233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208,
13
+ 240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 216,
14
+ 248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223,
15
+ [115, 115], 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267,
16
+ 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
17
+ 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295,
18
+ 296, 297, 298, 299, 300, 301, 302, 303, 304, [105, 775], 306, 307, 308,
19
+ 309, 310, 311, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
20
+ 324, 325, 326, 327, 328, 329, [700, 110], 330, 331, 332, 333, 334, 335,
21
+ 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
22
+ 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
23
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 255,
24
+ 377, 378, 379, 380, 381, 382]
25
+
26
+ UPCASE = DOWNCASE.invert
27
+
28
+ COMPOSITION = {
29
+ [65,768] => 192,
30
+ [65,769] => 193,
31
+ [65,770] => 194,
32
+ [65,771] => 195,
33
+ [65,776] => 196,
34
+ [65,778] => 197,
35
+ [67,807] => 199,
36
+ [69,768] => 200,
37
+ [69,769] => 201,
38
+ [69,770] => 202,
39
+ [69,776] => 203,
40
+ [73,768] => 204,
41
+ [73,769] => 205,
42
+ [73,770] => 206,
43
+ [73,776] => 207,
44
+ [78,771] => 209,
45
+ [79,768] => 210,
46
+ [79,769] => 211,
47
+ [79,770] => 212,
48
+ [79,771] => 213,
49
+ [79,776] => 214,
50
+ [85,768] => 217,
51
+ [85,769] => 218,
52
+ [85,770] => 219,
53
+ [85,776] => 220,
54
+ [89,769] => 221,
55
+ [97,768] => 224,
56
+ [97,769] => 225,
57
+ [97,770] => 226,
58
+ [97,771] => 227,
59
+ [97,776] => 228,
60
+ [97,778] => 229,
61
+ [99,807] => 231,
62
+ [101,768] => 232,
63
+ [101,769] => 233,
64
+ [101,770] => 234,
65
+ [101,776] => 235,
66
+ [105,768] => 236,
67
+ [105,769] => 237,
68
+ [105,770] => 238,
69
+ [105,776] => 239,
70
+ [110,771] => 241,
71
+ [111,768] => 242,
72
+ [111,769] => 243,
73
+ [111,770] => 244,
74
+ [111,771] => 245,
75
+ [111,776] => 246,
76
+ [117,768] => 249,
77
+ [117,769] => 250,
78
+ [117,770] => 251,
79
+ [117,776] => 252,
80
+ [121,769] => 253,
81
+ [121,776] => 255,
82
+ [65,772] => 256,
83
+ [97,772] => 257,
84
+ [65,774] => 258,
85
+ [97,774] => 259,
86
+ [65,808] => 260,
87
+ [97,808] => 261,
88
+ [67,769] => 262,
89
+ [99,769] => 263,
90
+ [67,770] => 264,
91
+ [99,770] => 265,
92
+ [67,775] => 266,
93
+ [99,775] => 267,
94
+ [67,780] => 268,
95
+ [99,780] => 269,
96
+ [68,780] => 270,
97
+ [100,780] => 271,
98
+ [69,772] => 274,
99
+ [101,772] => 275,
100
+ [69,774] => 276,
101
+ [101,774] => 277,
102
+ [69,775] => 278,
103
+ [101,775] => 279,
104
+ [69,808] => 280,
105
+ [101,808] => 281,
106
+ [69,780] => 282,
107
+ [101,780] => 283,
108
+ [71,770] => 284,
109
+ [103,770] => 285,
110
+ [71,774] => 286,
111
+ [103,774] => 287,
112
+ [71,775] => 288,
113
+ [103,775] => 289,
114
+ [71,807] => 290,
115
+ [103,807] => 291,
116
+ [72,770] => 292,
117
+ [104,770] => 293,
118
+ [73,771] => 296,
119
+ [105,771] => 297,
120
+ [73,772] => 298,
121
+ [105,772] => 299,
122
+ [73,774] => 300,
123
+ [105,774] => 301,
124
+ [73,808] => 302,
125
+ [105,808] => 303,
126
+ [73,775] => 304,
127
+ [74,770] => 308,
128
+ [106,770] => 309,
129
+ [75,807] => 310,
130
+ [107,807] => 311,
131
+ [76,769] => 313,
132
+ [108,769] => 314,
133
+ [76,807] => 315,
134
+ [108,807] => 316,
135
+ [76,780] => 317,
136
+ [108,780] => 318,
137
+ [78,769] => 323,
138
+ [110,769] => 324,
139
+ [78,807] => 325,
140
+ [110,807] => 326,
141
+ [78,780] => 327,
142
+ [110,780] => 328,
143
+ [79,772] => 332,
144
+ [111,772] => 333,
145
+ [79,774] => 334,
146
+ [111,774] => 335,
147
+ [79,779] => 336,
148
+ [111,779] => 337,
149
+ [82,769] => 340,
150
+ [114,769] => 341,
151
+ [82,807] => 342,
152
+ [114,807] => 343,
153
+ [82,780] => 344,
154
+ [114,780] => 345,
155
+ [83,769] => 346,
156
+ [115,769] => 347,
157
+ [83,770] => 348,
158
+ [115,770] => 349,
159
+ [83,807] => 350,
160
+ [115,807] => 351,
161
+ [83,780] => 352,
162
+ [115,780] => 353,
163
+ [84,807] => 354,
164
+ [116,807] => 355,
165
+ [84,780] => 356,
166
+ [116,780] => 357,
167
+ [85,771] => 360,
168
+ [117,771] => 361,
169
+ [85,772] => 362,
170
+ [117,772] => 363,
171
+ [85,774] => 364,
172
+ [117,774] => 365,
173
+ [85,778] => 366,
174
+ [117,778] => 367,
175
+ [85,779] => 368,
176
+ [117,779] => 369,
177
+ [85,808] => 370,
178
+ [117,808] => 371,
179
+ [87,770] => 372,
180
+ [119,770] => 373,
181
+ [89,770] => 374,
182
+ [121,770] => 375,
183
+ [89,776] => 376,
184
+ [90,769] => 377,
185
+ [122,769] => 378,
186
+ [90,775] => 379,
187
+ [122,775] => 380,
188
+ [90,780] => 381,
189
+ [122,780] => 382
190
+ }
191
+ end
192
+ end
193
+ end
@@ -0,0 +1,118 @@
1
module Babosa
  module UTF8

    autoload :JavaProxy,          "babosa/utf8/java_proxy"
    autoload :UnicodeProxy,       "babosa/utf8/unicode_proxy"
    autoload :ActiveSupportProxy, "babosa/utf8/active_support_proxy"
    autoload :DumbProxy,          "babosa/utf8/dumb_proxy"

    # A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
    # The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
    module UTF8Proxy
      # Maps each byte of the Windows CP-1252 range 128..159 to the bytes of its
      # UTF-8 encoding; bytes with no CP-1252 character map to nil and are
      # dropped by {#tidy_bytes}.
      CP1252 = {
        128 => [226, 130, 172],
        129 => nil,
        130 => [226, 128, 154],
        131 => [198, 146],
        132 => [226, 128, 158],
        133 => [226, 128, 166],
        134 => [226, 128, 160],
        135 => [226, 128, 161],
        136 => [203, 134],
        137 => [226, 128, 176],
        138 => [197, 160],
        139 => [226, 128, 185],
        140 => [197, 146],
        141 => nil,
        142 => [197, 189],
        143 => nil,
        144 => nil,
        145 => [226, 128, 152],
        146 => [226, 128, 153],
        147 => [226, 128, 156],
        148 => [226, 128, 157],
        149 => [226, 128, 162],
        150 => [226, 128, 147],
        151 => [226, 128, 148],
        152 => [203, 156],
        153 => [226, 132, 162],
        154 => [197, 161],
        155 => [226, 128, 186],
        156 => [197, 147],
        157 => nil,
        158 => [197, 190],
        159 => [197, 184]
      }

      # This is a stub for a method that should return a Unicode-aware
      # downcased version of the given string.
      def downcase(string)
        raise NotImplementedError
      end

      # This is a stub for a method that should return a Unicode-aware
      # upcased version of the given string.
      def upcase(string)
        raise NotImplementedError
      end

      # This is a stub for a method that should return the Unicode NFC
      # normalization of the given string.
      def normalize_utf8(string)
        raise NotImplementedError
      end

      # Attempt to replace invalid UTF-8 bytes with valid ones. This method
      # naively assumes if you have invalid UTF8 bytes, they are either Windows
      # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
      # always work.
      #
      # A multi-byte sequence that is cut short — by an ASCII byte, a new
      # leading byte, an unused byte, or the end of the string — is cleaned
      # byte by byte from its leading byte onwards.
      #
      # NOTE(review): sequences that are structurally complete but still not
      # valid UTF-8 (e.g. overlong encodings) are not detected here and may
      # make the final unpack raise — confirm whether callers need that handled.
      #
      # @param [String] string The bytes to tidy.
      # @return [String] The tidied string.
      def tidy_bytes(string)
        bytes = string.unpack("C*")
        conts_expected = 0
        last_lead = 0

        bytes.each_index do |i|
          byte      = bytes[i]
          is_cont   = byte > 127 && byte < 192
          is_lead   = byte > 191 && byte < 245
          # Bytes above 240 are treated as impossible or highly unlikely; this
          # also covers the restricted range above 244.
          is_unused = byte > 240

          if is_cont && conts_expected > 0
            # Expected continuation byte: now expect one less.
            conts_expected -= 1
          else
            if conts_expected > 0
              # Expected a continuation but got something else? Clean the
              # pending sequence from its leading byte up to (not including)
              # this byte. Everything in that range is still a raw byte.
              (last_lead...i).each { |j| bytes[j] = tidy_byte(bytes[j]) }
              conts_expected = 0
            end
            if is_unused || is_cont
              # Unused byte, or a continuation byte nobody asked for: clean it.
              bytes[i] = tidy_byte(byte)
            elsif is_lead
              if i == bytes.length - 1
                # Final byte is leading? Clean it.
                bytes[i] = tidy_byte(byte)
              else
                # Valid leading byte? Expect continuations determined by position of
                # first zero bit, with max of 3.
                conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
                last_lead = i
              end
            end
          end
        end

        if conts_expected > 0
          # The string ended in the middle of a sequence: clean what is there.
          (last_lead...bytes.length).each { |j| bytes[j] = tidy_byte(bytes[j]) }
        end

        # Cleaned bytes are arrays (or nil when dropped), hence flatten/compact.
        bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      private

      # Returns the UTF-8 bytes for a single byte read as CP-1252 (below 160)
      # or ISO-8859-1 (160 and above); nil when CP-1252 has no such character.
      def tidy_byte(byte)
        byte < 160 ? CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Babosa
  module UTF8
    # UTF-8 proxy that delegates to the Unicode gem.
    # @see http://github.com/blackwinter/unicode
    module UnicodeProxy
      extend UTF8Proxy
      extend self

      # Returns +string+ lowercased by Unicode.downcase.
      def downcase(string)
        lowered = Unicode.downcase(string)
        lowered
      end

      # Returns +string+ uppercased by Unicode.upcase.
      def upcase(string)
        raised = Unicode.upcase(string)
        raised
      end

      # Returns +string+ composed by Unicode.normalize_C.
      def normalize_utf8(string)
        composed = Unicode.normalize_C(string)
        composed
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Babosa
  # Holds the version of the Babosa library.
  module Version
    # The library version. Frozen so the constant cannot be mutated in place.
    STRING = "0.1.0".freeze
  end
end
@@ -0,0 +1,160 @@
1
+ # encoding: utf-8
2
+ $KCODE = 'UTF8' if RUBY_VERSION < '1.9'
3
+ $LOAD_PATH << File.expand_path("../../lib", __FILE__)
4
+ $LOAD_PATH.uniq!
5
+
6
+ require "rubygems"
7
+ require "bundler"
8
+ Bundler.setup
9
+ require "test/unit"
10
+ require "babosa"
11
+
12
# Adds a declarative +test+ macro to every Module and Class:
#
#   test "should do something" do
#     ...
#   end
#
# defines an instance method named "test_" followed by the description, with
# every character other than a letter, digit or underscore replaced by "_".
#
# Two descriptions that differ only in such characters produce the same
# method name. Previously the second definition silently replaced the first,
# so a test could vanish without notice; now an ArgumentError is raised.
Module.send :include, Module.new {
  # @param name [String] human-readable description of the test
  # @param block the test body; becomes the body of the generated method
  # @raise [ArgumentError] if this module/class already defines a method
  #   with the generated name
  def test(name, &block)
    method_name = "test_#{name.gsub(/[^a-z0-9_]/i, "_")}"
    # Only look at methods defined directly on the receiver, so inherited or
    # mixed-in methods never trigger a false collision.
    if instance_methods(false).map { |m| m.to_s }.include?(method_name)
      raise ArgumentError, "#{method_name} is already defined in #{self}"
    end
    define_method(method_name.to_sym, &block)
  end
}
17
+
18
# Shared tests for UTF-8 proxy implementations. A test case that includes
# this module must define a +proxy+ method returning the proxy under test.
module UTF8ProxyTest
  test "should downcase strings" do
    assert_equal "åéîøü", proxy.downcase("ÅÉÎØÜ")
  end

  test "should upcase strings" do
    assert_equal "ÅÉÎØÜ", proxy.upcase("åéîøü")
  end

  test "should compose UTF-8" do
    # ÅÉÎØÜ, first as base letters followed by combining marks, then as the
    # precomposed characters the proxy is expected to produce.
    uncomposed_bytes  = [65, 204, 138, 69, 204, 129, 73, 204, 130, 195, 152, 85, 204, 136]
    composed_bytes    = [195, 133, 195, 137, 195, 142, 195, 152, 195, 156]
    # Round-trip through "U*" to build the string from the raw bytes.
    uncomposed_string = uncomposed_bytes.pack("C*").unpack("U*").pack("U*")
    assert_equal composed_bytes, proxy.normalize_utf8(uncomposed_string).unpack("C*")
  end
end
36
+
37
# Runs the shared proxy tests against JavaProxy. The test case is only
# defined when Babosa.jruby15? is truthy.
if Babosa.jruby15?
  class JavaProxyTest < Test::Unit::TestCase
    include UTF8ProxyTest

    # The proxy exercised by the shared tests.
    def proxy
      Babosa::UTF8::JavaProxy
    end
  end
end
45
+
46
# Runs the shared proxy tests against DumbProxy.
class DumbProxyTest < Test::Unit::TestCase
  include UTF8ProxyTest

  # The proxy exercised by the shared tests.
  def proxy
    Babosa::UTF8::DumbProxy
  end
end
52
+
53
# Tests for the slug-string operations reached through String#to_slug.
class BabosaTest < Test::Unit::TestCase

  test "word_chars! should leave only letters and spaces" do
    string = "a*$%^$@!@b$%^&*()*!c"
    # Anchored on both ends: an unanchored /[a-z ]*/ also matches the empty
    # string and therefore could never fail.
    assert_match(/\A[a-z ]*\z/i, string.to_slug.word_chars!)
  end

  test "approximate_ascii should transliterate to ascii" do
    (0xC0..0x17E).each do |codepoint|
      ss = [codepoint].pack("U*").to_slug
      approx = ss.approximate_ascii
      # The whole result must be ASCII, not merely contain one ASCII character.
      assert_match(/\A[\x00-\x7f]+\z/, approx.to_s)
    end
  end

  test "should lowercase strings" do
    assert_equal "feliz año", "FELIZ AÑO".to_slug.downcase!
  end

  test "should uppercase strings" do
    assert_equal "FELIZ AÑO", "feliz año".to_slug.upcase!
  end

  test "should replace whitespace with dashes" do
    assert_equal "a-b", "a b".to_slug.clean.normalize!
  end

  test "should replace multiple spaces with 1 dash" do
    # Two spaces, so this differs from the single-space test above.
    assert_equal "a-b", "a  b".to_slug.clean.normalize!
  end

  test "should replace multiple dashes with 1 dash" do
    assert_equal "male-female", "male - female".to_slug.normalize!
  end

  test "should strip trailing space" do
    assert_equal "ab", "ab ".to_slug.normalize!
  end

  test "should strip leading space" do
    assert_equal "ab", " ab".to_slug.normalize!
  end

  test "should strip trailing dashes" do
    assert_equal "ab", "ab-".to_slug.normalize!
  end

  test "should strip leading dashes" do
    assert_equal "ab", "-ab".to_slug.normalize!
  end

  test "should not modify valid name strings" do
    assert_equal "a-b-c-d", "a-b-c-d".to_slug.normalize!
  end

  test "should do special approximations for German" do
    assert_equal "Juergen", "Jürgen".to_slug.approximate_ascii!(:german)
  end

  test "should do special approximations for Spanish" do
    assert_equal "anio", "año".to_slug.approximate_ascii!(:spanish)
  end

  test "should work with non roman chars" do
    assert_equal "検-索", "検 索".to_slug.normalize!
  end

  test "should work with invalid UTF-8 strings" do
    # NOTE(review): truncate takes a length argument and is not in this list,
    # so it is not covered here; the former `method == "truncate"` branch was
    # unreachable and has been removed.
    %w[approximate_ascii clean downcase word_chars normalize to_ascii upcase with_dashes].each do |method|
      string = "\x93abc".to_slug
      assert_nothing_raised do
        string.send(method)
      end
    end
  end

  test "should truncate string by byte length" do
    assert_equal "ü", "üa".to_slug.truncate_bytes!(2)
    assert_equal "", "üa".to_slug.truncate_bytes!(1)
    assert_equal "üa", "üa".to_slug.truncate_bytes!(100)
    assert_equal "ü", "üéøá".to_slug.truncate_bytes!(3)
  end

  test "should truncate string by char length" do
    assert_equal "üa", "üa".to_slug.truncate!(2)
    assert_equal "ü", "üa".to_slug.truncate!(1)
    assert_equal "üa", "üa".to_slug.truncate!(100)
  end

  test "should transliterate uncomposed utf8" do
    string = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS
    assert_equal "u", string.to_slug.approximate_ascii!
  end

  test "with_dashes should not change byte size when replacing spaces" do
    assert_equal "".bytesize, "".to_slug.with_dashes.bytesize
    assert_equal " ".bytesize, " ".to_slug.with_dashes.bytesize
    assert_equal "-abc-".bytesize, "-abc-".to_slug.with_dashes.bytesize
    assert_equal " abc ".bytesize, " abc ".to_slug.with_dashes.bytesize
    assert_equal " a bc ".bytesize, " a bc ".to_slug.with_dashes.bytesize
  end

  test "normalize! with ascii should approximate and strip non ascii" do
    ss = "カタカナ: katakana is über cool".to_slug
    assert_equal "katakana-is-uber-cool", ss.normalize!(true)
  end

end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: babosa
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Norman Clarke
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-07-12 00:00:00 -03:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: " A library for creating slugs. Babosa is an extraction and improvement of the\n string code from FriendlyId, intended to help developers create similar\n libraries or plugins.\n"
22
+ email: norman@njclarke.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - lib/babosa/characters.rb
31
+ - lib/babosa/slug_string.rb
32
+ - lib/babosa/utf8/active_support_proxy.rb
33
+ - lib/babosa/utf8/dumb_proxy.rb
34
+ - lib/babosa/utf8/java_proxy.rb
35
+ - lib/babosa/utf8/mappings.rb
36
+ - lib/babosa/utf8/proxy.rb
37
+ - lib/babosa/utf8/unicode_proxy.rb
38
+ - lib/babosa/version.rb
39
+ - lib/babosa.rb
40
+ - README.md
41
+ - MIT-LICENSE
42
+ - Rakefile
43
+ - init.rb
44
+ - test/babosa_test.rb
45
+ has_rdoc: false
46
+ homepage: http://norman.github.com/babosa
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options: []
51
+
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ segments:
60
+ - 0
61
+ version: "0"
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ requirements: []
71
+
72
+ rubyforge_project: "[none]"
73
+ rubygems_version: 1.3.7
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: A library for creating slugs.
77
+ test_files:
78
+ - test/babosa_test.rb