babosa 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +19 -0
- data/README.md +91 -0
- data/Rakefile +27 -0
- data/init.rb +3 -0
- data/lib/babosa.rb +22 -0
- data/lib/babosa/characters.rb +75 -0
- data/lib/babosa/slug_string.rb +223 -0
- data/lib/babosa/utf8/active_support_proxy.rb +20 -0
- data/lib/babosa/utf8/dumb_proxy.rb +41 -0
- data/lib/babosa/utf8/java_proxy.rb +24 -0
- data/lib/babosa/utf8/mappings.rb +193 -0
- data/lib/babosa/utf8/proxy.rb +118 -0
- data/lib/babosa/utf8/unicode_proxy.rb +21 -0
- data/lib/babosa/version.rb +5 -0
- data/test/babosa_test.rb +160 -0
- metadata +78 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2010 Norman Clarke
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# Babosa
|
2
|
+
|
3
|
+
Babosa is a library for creating slugs. It is an extraction and improvement of
|
4
|
+
the string code from [FriendlyId](http://github.com/norman/friendly_id),
|
5
|
+
intended to help developers create libraries similar to FriendlyId.
|
6
|
+
|
7
|
+
## Features / Usage
|
8
|
+
|
9
|
+
### ASCII transliteration
|
10
|
+
|
11
|
+
"Gölcük, Turkey".to_slug.approximate_ascii.to_s #=> "Golcuk, Turkey"
|
12
|
+
|
13
|
+
### Special cases for German and Spanish
|
14
|
+
|
15
|
+
"Jürgen Müller".to_slug.approximate_ascii.to_s #=> "Jurgen Muller"
|
16
|
+
"Jürgen Müller".to_slug.approximate_ascii(:german).to_s #=> "Juergen Mueller"
|
17
|
+
"feliz año".to_slug.approximate_ascii.to_s #=> "feliz ano"
|
18
|
+
"feliz año".to_slug.approximate_ascii(:spanish).to_s #=> "feliz anio"
|
19
|
+
|
20
|
+
### Non-ASCII removal
|
21
|
+
|
22
|
+
"Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
|
23
|
+
|
24
|
+
### Truncate by characters
|
25
|
+
|
26
|
+
"üüü".to_slug.truncate(2).to_s #=> "üü"
|
27
|
+
|
28
|
+
### Truncate by bytes
|
29
|
+
|
30
|
+
This can be useful to ensure the generated slug will fit in a database column
|
31
|
+
whose length is limited by bytes rather than UTF-8 characters.
|
32
|
+
|
33
|
+
"üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
|
34
|
+
|
35
|
+
### All-in-one
|
36
|
+
|
37
|
+
"Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
|
38
|
+
|
39
|
+
There are many more features; check the API docs and source code to find out
|
40
|
+
more.
|
41
|
+
|
42
|
+
## Getting it
|
43
|
+
|
44
|
+
Babosa can be installed via Rubygems:
|
45
|
+
|
46
|
+
gem install babosa
|
47
|
+
|
48
|
+
You can get the source code from its [Github repository](http://github.com/norman/babosa).
|
49
|
+
|
50
|
+
## Reporting bugs
|
51
|
+
|
52
|
+
Please use Babosa's [Github issue tracker](http://github.com/norman/babosa/issues).
|
53
|
+
|
54
|
+
|
55
|
+
## Misc
|
56
|
+
|
57
|
+
The speed and quality of Babosa's UTF-8 support depend on which Ruby and which
|
58
|
+
gems you are using.
|
59
|
+
|
60
|
+
On JRuby 1.5 and above, Babosa uses Java's native UTF-8 support. If you require
|
61
|
+
[Unicode](http://github.com/blackwinter/unicode) or ActiveSupport before
|
62
|
+
Babosa, it will use the support provided by those libraries. Otherwise, Babosa
|
63
|
+
defaults to very basic UTF-8 support for Latin characters only.
|
64
|
+
|
65
|
+
"Babosa" means slug in Spanish.
|
66
|
+
|
67
|
+
## Author
|
68
|
+
|
69
|
+
[Norman Clarke](http://njclarke.com)
|
70
|
+
|
71
|
+
## Copyright
|
72
|
+
|
73
|
+
Copyright (c) 2010 Norman Clarke
|
74
|
+
|
75
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
76
|
+
of this software and associated documentation files (the "Software"), to deal
|
77
|
+
in the Software without restriction, including without limitation the rights
|
78
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
79
|
+
copies of the Software, and to permit persons to whom the Software is
|
80
|
+
furnished to do so, subject to the following conditions:
|
81
|
+
|
82
|
+
The above copyright notice and this permission notice shall be included in all
|
83
|
+
copies or substantial portions of the Software.
|
84
|
+
|
85
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
86
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
87
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
88
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
89
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
90
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
91
|
+
SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require "rake/testtask"
|
2
|
+
require "rake/clean"
|
3
|
+
require "rake/gempackagetask"
|
4
|
+
|
5
|
+
# Running plain `rake` runs the test suite.
task :default => :test

# Generated artifacts that `rake clean` removes.
%w[pkg doc coverage .yardoc].each { |path| CLEAN << path }

# Gem packaging tasks, driven by the specification in babosa.gemspec.
babosa_spec = eval(File.read("babosa.gemspec"))
Rake::GemPackageTask.new(babosa_spec) { |pkg| }

# Unit tests: every *_test.rb file below test/.
Rake::TestTask.new(:test) do |t|
  t.pattern = "test/**/*_test.rb"
end

# API documentation task; silently skipped when YARD is not installed.
begin
  require "yard"
  YARD::Rake::YardocTask.new { |t| t.options = ["--output-dir=doc"] }
rescue LoadError
end

# Coverage task; silently skipped when rcov is not installed.
begin
  require "rcov/rcovtask"
  Rcov::RcovTask.new do |r|
    r.test_files = FileList["test/**/*_test.rb"]
    r.verbose    = true
    r.rcov_opts << "--exclude gems/*"
  end
rescue LoadError
end
|
data/init.rb
ADDED
data/lib/babosa.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Babosa
  # Whether the code is running on JRuby 1.5 or newer.
  #
  # Version segments are compared numerically rather than as strings, so a
  # version such as "1.10.0" is correctly treated as newer than "1.5".
  #
  # @return [Boolean] true on JRuby >= 1.5; false on older JRuby releases and
  #   on any interpreter that does not define +JRUBY_VERSION+.
  def self.jruby15?
    return false unless defined?(JRUBY_VERSION)
    # Non-numeric segments (e.g. "RC1") become 0 via to_i.
    segments = JRUBY_VERSION.split(".").map { |segment| segment.to_i }
    (segments <=> [1, 5]) >= 0
  end
end
|
6
|
+
|
7
|
+
class String
  # Wraps the receiver in a Babosa::SlugString so slug-specific methods can
  # be chained on it.
  def to_slug
    Babosa::SlugString.new(self)
  end

  # Ruby 1.8.6 has no String#bytesize; supply one only when it is missing.
  unless public_method_defined?(:bytesize)
    # Number of bytes (not characters) in the string.
    def bytesize
      unpack("C*").size
    end
  end
end
|
19
|
+
|
20
|
+
require "babosa/characters"
|
21
|
+
require "babosa/utf8/proxy"
|
22
|
+
require "babosa/slug_string"
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Babosa
|
3
|
+
|
4
|
+
# This module provides sets of characters needed for various UTF-8 aware
|
5
|
+
# string operations.
|
6
|
+
module Characters
  extend self

  # Hash of UTF-8 - ASCII approximations, keyed by approximation-set name
  # (+:latin+, +:german+, +:spanish+, ...). Each set maps a Unicode codepoint
  # to either a single ASCII byte or an Array of ASCII bytes.
  attr_reader :approximations
  # Punctuation and control characters (as codepoints) to remove from slug
  # strings.
  attr_reader :strippable

  @strippable = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94,
    95, 96, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
    136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
    151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166,
    167, 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183,
    184, 185, 187, 188, 189, 190, 191, 215, 247]

  # Adds (or replaces) a named hash of approximations.
  #
  # Only the FIRST codepoint of each key is used, so every key must be a
  # single character. The replacement is stored as its raw bytes, so it is
  # expected to be plain ASCII.
  #
  # @example
  #   add_approximations :spanish, "ñ" => "ni"
  # @param name [#to_sym] The name of the approximations to add.
  # @param hash [Hash{String => String}] Character => ASCII replacement.
  # @return [Hash{Integer => Integer, Array<Integer>}] The stored mapping.
  def add_approximations(name, hash)
    @approximations ||= {}
    @approximations[name.to_sym] = hash.inject({}) do |memo, object|
      key = object[0].unpack("U").shift
      value = object[1].unpack("C*")
      # Single-byte replacements are stored bare; longer ones as an Array
      # that the caller flattens into the output.
      memo[key] = value.length == 1 ? value[0] : value
      memo
    end
  end

  add_approximations :spanish, "ñ" => "ni"
  add_approximations :german, "ä" => "ae", "ö" => "oe", "ü" => "ue"
  add_approximations :latin, {
    "À" => "A", "Á" => "A", "Â" => "A", "Ã" => "A", "Ä" => "A", "Å" => "A",
    "Æ" => "AE", "Ç" => "C", "È" => "E", "É" => "E", "Ê" => "E", "Ë" => "E",
    "Ì" => "I", "Í" => "I", "Î" => "I", "Ï" => "I", "Ð" => "D", "Ñ" => "N",
    "Ò" => "O", "Ó" => "O", "Ô" => "O", "Õ" => "O", "Ö" => "O", "Ø" => "O",
    "Ù" => "U", "Ú" => "U", "Û" => "U", "Ü" => "U", "Ý" => "Y", "Þ" => "Th",
    "ß" => "ss", "à" => "a" , "á" => "a", "â" => "a", "ã" => "a", "ä" => "a",
    "å" => "a", "æ" => "ae", "ç" => "c" , "è" => "e", "é" => "e", "ê" => "e",
    "ë" => "e", "ì" => "i", "í" => "i", "î" => "i", "ï" => "i", "ð" => "d",
    "ñ" => "n", "ò" => "o", "ó" => "o", "ô" => "o", "õ" => "o", "ö" => "o",
    "ø" => "o", "ù" => "u", "ú" => "u", "û" => "u", "ü" => "u", "ý" => "y",
    "þ" => "th", "ÿ" => "y", "Ā" => "A", "ā" => "a", "Ă" => "A", "ă" => "a",
    "Ą" => "A", "ą" => "a", "Ć" => "C", "ć" => "c", "Ĉ" => "C", "ĉ" => "c",
    "Ċ" => "C", "ċ" => "c", "Č" => "C", "č" => "c", "Ď" => "D", "ď" => "d",
    "Đ" => "D", "đ" => "d", "Ē" => "E", "ē" => "e", "Ĕ" => "E", "ĕ" => "e",
    "Ė" => "E", "ė" => "e", "Ę" => "E", "ę" => "e", "Ě" => "E", "ě" => "e",
    "Ĝ" => "G", "ĝ" => "g", "Ğ" => "G", "ğ" => "g", "Ġ" => "G", "ġ" => "g",
    "Ģ" => "G", "ģ" => "g", "Ĥ" => "H", "ĥ" => "h", "Ħ" => "H", "ħ" => "h",
    "Ĩ" => "I", "ĩ" => "i", "Ī" => "I", "ī" => "i", "Ĭ" => "I", "ĭ" => "i",
    "Į" => "I", "į" => "i", "İ" => "I", "ı" => "i",
    "Ĵ" => "J", "ĵ" => "j", "Ķ" => "K", "ķ" => "k", "ĸ" => "k", "Ĺ" => "L",
    "ĺ" => "l", "Ļ" => "L", "ļ" => "l", "Ľ" => "L", "ľ" => "l", "Ŀ" => "L",
    "ŀ" => "l", "Ł" => "L", "ł" => "l", "Ń" => "N", "ń" => "n", "Ņ" => "N",
    "ņ" => "n", "Ň" => "N", "ň" => "n", "Ŋ" => "NG", "ŋ" => "ng",
    "Ō" => "O", "ō" => "o", "Ŏ" => "O", "ŏ" => "o", "Ő" => "O", "ő" => "o",
    "Œ" => "OE", "œ" => "oe", "Ŕ" => "R", "ŕ" => "r", "Ŗ" => "R", "ŗ" => "r",
    "Ř" => "R", "ř" => "r", "Ś" => "S", "ś" => "s", "Ŝ" => "S", "ŝ" => "s",
    "Ş" => "S", "ş" => "s", "Š" => "S", "š" => "s", "Ţ" => "T", "ţ" => "t",
    "Ť" => "T", "ť" => "t", "Ŧ" => "T", "ŧ" => "t", "Ũ" => "U", "ũ" => "u",
    "Ū" => "U", "ū" => "u", "Ŭ" => "U", "ŭ" => "u", "Ů" => "U", "ů" => "u",
    "Ű" => "U", "ű" => "u", "Ų" => "U", "ų" => "u", "Ŵ" => "W", "ŵ" => "w",
    "Ŷ" => "Y", "ŷ" => "y", "Ÿ" => "Y", "Ź" => "Z", "ź" => "z", "Ż" => "Z",
    "ż" => "z", "Ž" => "Z", "ž" => "z", "×" => "x", "÷" => "/",
    # These three keys are built from codepoints on purpose: compatibility
    # normalization folds the literal characters into ASCII look-alikes
    # ("IJ", "ij", "'n"), which would make plain "I" and "i" map to "IJ"/"ij"
    # because only the first codepoint of a key is used.
    [0x0132].pack("U") => "IJ", # LATIN CAPITAL LIGATURE IJ
    [0x0133].pack("U") => "ij", # LATIN SMALL LIGATURE IJ
    [0x0149].pack("U") => "n"   # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
  }
end
|
75
|
+
end
|
@@ -0,0 +1,223 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Babosa
|
4
|
+
|
5
|
+
# This class provides some string-manipulation methods specific to slugs.
|
6
|
+
#
|
7
|
+
# Note that this class includes many "bang methods" such as {#clean!} and
|
8
|
+
# {#normalize!} that perform actions on the string in-place. Each of these
|
9
|
+
# methods has a corresponding "bangless" method (i.e., +SlugString#clean!+
|
10
|
+
# and +SlugString#clean+) which does not appear in the documentation because
|
11
|
+
# it is generated dynamically.
|
12
|
+
#
|
13
|
+
# All of the bang methods return an instance of String, while the bangless
|
14
|
+
# versions return an instance of Babosa::SlugString, so that calls to methods
|
15
|
+
# specific to this class can be chained:
|
16
|
+
#
|
17
|
+
# string = SlugString.new("hello world")
|
18
|
+
# string.with_dashes! # => "hello-world"
|
19
|
+
# string.with_dashes # => <Babosa::SlugString:0x000001013e1590 @wrapped_string="hello-world">
|
20
|
+
#
|
21
|
+
# @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
|
22
|
+
class SlugString

  # The plain String being manipulated; also available as +to_s+.
  attr_reader :wrapped_string
  alias to_s wrapped_string

  # Choose the UTF-8 proxy once, when the class is loaded: Java on JRuby
  # 1.5+, otherwise the Unicode gem or ActiveSupport if either was already
  # required, otherwise the built-in Latin-only fallback.
  @@utf8_proxy = if Babosa.jruby15?
    UTF8::JavaProxy
  elsif defined? Unicode
    UTF8::UnicodeProxy
  elsif defined? ActiveSupport
    UTF8::ActiveSupportProxy
  else
    UTF8::DumbProxy
  end

  # Return the proxy used for UTF-8 support.
  # @see Babosa::UTF8::UTF8Proxy
  def self.utf8_proxy
    @@utf8_proxy
  end

  # Set a proxy object used for UTF-8 support.
  # @see Babosa::UTF8::UTF8Proxy
  def self.utf8_proxy=(obj)
    @@utf8_proxy = obj
  end

  # Forward any method not defined here to the wrapped String.
  def method_missing(symbol, *args, &block)
    @wrapped_string.__send__(symbol, *args, &block)
  end

  # Invalid bytes are tidied and the result is Unicode-composed immediately,
  # so every other method can assume composed UTF-8.
  # @param string [#to_s] The string to use as the basis of the SlugString.
  def initialize(string)
    @wrapped_string = string.to_s
    tidy_bytes!
    normalize_utf8!
  end

  # Approximate an ASCII string. This works only for Western strings using
  # characters that are Roman-alphabet characters + diacritics. Non-letter
  # characters are left unmodified.
  #
  #   string = SlugString.new "Łódź, Poland"
  #   string.approximate_ascii                 # => "Lodz, Poland"
  #   string = SlugString.new "日本"
  #   string.approximate_ascii                 # => "日本"
  #
  # You can pass any key from +Characters.approximations+ as the argument.
  # This allows for contextual approximations. By default, +:spanish+ and
  # +:german+ are provided:
  #
  #   string = SlugString.new "Jürgen Müller"
  #   string.approximate_ascii                 # => "Jurgen Muller"
  #   string.approximate_ascii :german         # => "Juergen Mueller"
  #   string = SlugString.new "¡Feliz año!"
  #   string.approximate_ascii                 # => "¡Feliz ano!"
  #   string.approximate_ascii :spanish        # => "¡Feliz anio!"
  #
  # You can modify the built-in approximations, or add your own:
  #
  #   # Make Spanish use "nh" rather than "ni"
  #   Babosa::Characters.add_approximations(:spanish, "ñ" => "nh")
  #
  # Notice that this method does not simply convert to ASCII; if you want
  # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
  #
  #   string.approximate_ascii!(:spanish)      # => "¡Feliz anio!"
  #   string.to_ascii!                         # => "Feliz anio!"
  # @param overrides [Symbol, Hash, nil] The name of a set registered in
  #   +Characters.approximations+, or a Hash of codepoint => replacement
  #   byte(s) consulted before the +:latin+ set. +nil+ means no overrides.
  # @raise [ArgumentError] if +overrides+ is a Symbol that names no
  #   registered approximation set.
  # @return String
  def approximate_ascii!(overrides = {})
    if overrides.kind_of? Symbol
      name = overrides
      overrides = Characters.approximations[name]
      # An unknown name used to surface as NoMethodError on nil deep inside
      # the character loop; fail early with a message naming the culprit.
      raise ArgumentError, "unknown approximations: #{name.inspect}" unless overrides
    end
    overrides ||= {}
    codepoints = @wrapped_string.unpack("U*")
    # Multi-character replacements are Arrays, hence the flatten.
    @wrapped_string = codepoints.map { |char| approx_char(char, overrides) }.flatten.pack("U*")
  end

  # Converts dashes to spaces, collapses runs of spaces into a single space,
  # and removes leading and trailing whitespace.
  # @return String
  def clean!
    @wrapped_string = @wrapped_string.gsub("-", " ").squeeze(" ").strip
  end

  # Remove any non-word characters, as listed in +Characters.strippable+.
  # @return String
  def word_chars!
    @wrapped_string = (@wrapped_string.unpack("U*") - Characters.strippable).pack("U*")
  end

  # Normalize the string for use as a slug. Note that in this context,
  # +normalize+ means, strip, remove non-letters/numbers, downcasing,
  # truncating to 255 bytes and converting whitespace to dashes.
  # @param ascii [Boolean] If true, approximate ASCII and then remove any
  #   non-ASCII characters.
  # @return String
  def normalize!(ascii = false)
    if ascii
      approximate_ascii!
      to_ascii!
    end
    clean!
    word_chars!
    # Stripping characters can leave new runs of spaces; clean again.
    clean!
    downcase!
    truncate_bytes!(255)
    with_dashes!
  end

  # Delete any non-ascii characters.
  # @return String
  def to_ascii!
    @wrapped_string = @wrapped_string.gsub(/[^\x00-\x7f]/u, '')
  end

  # Truncate the string to +max+ characters.
  # @example
  #   "üéøá".to_slug.truncate(3) #=> "üéø"
  # @param max [Integer] The maximum number of characters to keep.
  # @return String
  def truncate!(max)
    @wrapped_string = @wrapped_string.unpack("U*")[0...max].pack("U*")
  end

  # Truncate the string to +max+ bytes. This can be useful for ensuring that
  # a UTF-8 string will always fit into a database column with a certain max
  # byte length. The resulting string may be less than +max+ if the string must
  # be truncated at a multibyte character boundary.
  # @example
  #   "üéøá".to_slug.truncate_bytes(3) #=> "ü"
  # @param max [Integer] The maximum number of bytes to keep.
  # @return String
  def truncate_bytes!(max)
    return @wrapped_string if @wrapped_string.bytesize <= max
    curr = 0
    new = []
    @wrapped_string.unpack("U*").each do |char|
      char = [char].pack("U")
      curr += char.bytesize
      # Stop at the first character that would overflow the byte budget.
      break if curr > max
      new << char
    end
    @wrapped_string = new.join
  end

  # Replaces whitespace with dashes ("-").
  # @return String
  def with_dashes!
    @wrapped_string = @wrapped_string.gsub(/\s/u, "-")
  end

  # Perform UTF-8 sensitive upcasing.
  # @return String
  def upcase!
    @wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
  end

  # Perform UTF-8 sensitive downcasing.
  # @return String
  def downcase!
    @wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
  end

  # Perform Unicode composition on the wrapped string.
  # @return String
  def normalize_utf8!
    @wrapped_string = @@utf8_proxy.normalize_utf8(@wrapped_string)
  end

  # Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
  # UTF-8.
  # @return String
  def tidy_bytes!
    @wrapped_string = @@utf8_proxy.tidy_bytes(@wrapped_string)
  end

  # Generate the "bangless" counterpart of every bang method: it applies the
  # bang method to a fresh SlugString and returns that new instance.
  # __FILE__/__LINE__ are passed so backtraces point at this file.
  %w[approximate_ascii clean downcase word_chars normalize normalize_utf8
    tidy_bytes to_ascii truncate truncate_bytes upcase with_dashes].each do |method|
    class_eval(<<-EOM, __FILE__, __LINE__ + 1)
      def #{method}(*args)
        send_to_new_instance(:#{method}!, *args)
      end
    EOM
  end

  # A SlugString is already a slug; mirrors String#to_slug.
  def to_slug
    self
  end

  private

  # Look up the character's approximation in the configured maps: the
  # overrides first, then the +:latin+ set, otherwise the character itself.
  def approx_char(char, overrides = {})
    overrides[char] || Characters.approximations[:latin][char] || char
  end

  # Used as the basis of the bangless methods.
  def send_to_new_instance(*args)
    string = SlugString.new self
    string.send(*args)
    string
  end

end
|
223
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Babosa
|
2
|
+
module UTF8
|
3
|
+
# A UTF-8 proxy using Active Support's multibyte support.
|
4
|
+
module ActiveSupportProxy
  extend UTF8Proxy
  extend self

  # Downcase +string+ through ActiveSupport's multibyte wrapper.
  def downcase(string)
    multibyte(string).downcase.to_s
  end

  # Upcase +string+ through ActiveSupport's multibyte wrapper.
  def upcase(string)
    multibyte(string).upcase.to_s
  end

  # Normalize +string+ using ActiveSupport's +:c+ normalization form.
  def normalize_utf8(string)
    multibyte(string).normalize(:c).to_s
  end

  private

  # Wrap a plain String in ActiveSupport::Multibyte::Chars.
  def multibyte(string)
    ActiveSupport::Multibyte::Chars.new(string)
  end
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require File.expand_path("../mappings", __FILE__)
|
2
|
+
module Babosa
|
3
|
+
module UTF8
|
4
|
+
|
5
|
+
# This module provides fallback UTF-8 support when nothing else is
|
6
|
+
# available. It does case folding for Roman alphabet-based characters
|
7
|
+
# commonly used by Western European languages and little else, making it
|
8
|
+
# useless for Russian, Bulgarian, Greek, etc. If at all possible, Unicode
|
9
|
+
# or ActiveSupport should be used instead because they support the full
|
10
|
+
# UTF-8 character range.
|
11
|
+
module DumbProxy
  extend UTF8Proxy
  extend self

  # Downcase using the Latin-only Mappings::DOWNCASE table; codepoints
  # without an entry pass through unchanged.
  def downcase(string)
    map_codepoints(string, Mappings::DOWNCASE)
  end

  # Upcase using the Latin-only Mappings::UPCASE table; codepoints without
  # an entry pass through unchanged.
  def upcase(string)
    map_codepoints(string, Mappings::UPCASE)
  end

  # This does a very naive Unicode normalization, which should work for
  # this library's purposes (i.e., Roman-based codepoints, up to U+017E).
  # Do not reuse this as a general solution! Use a real library like
  # Unicode or ActiveSupport instead.
  def normalize_utf8(string)
    codepoints = string.unpack("U*")
    composed = []
    index = 0
    while index < codepoints.length
      # Try to compose the current codepoint with the one that follows it.
      composite = Mappings::COMPOSITION[codepoints[index, 2]]
      if composite
        composed << composite
        index += 2
      else
        composed << codepoints[index]
        index += 1
      end
    end
    composed.compact.flatten.pack("U*")
  end

  private

  # Replace each codepoint that has an entry in +table+; entries may be a
  # single codepoint or an Array of codepoints, hence the flatten.
  def map_codepoints(string, table)
    string.unpack("U*").map { |codepoint| table[codepoint] || codepoint }.flatten.pack("U*")
  end
end
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
include Java
|
2
|
+
|
3
|
+
module Babosa
|
4
|
+
module UTF8
|
5
|
+
# A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+.
|
6
|
+
module JavaProxy
  extend UTF8Proxy
  extend self
  import java.text.Normalizer

  # Downcase via the Java string's lower-casing method.
  def downcase(string)
    java_string = string.to_java
    java_string.to_lower_case.to_s
  end

  # Upcase via the Java string's upper-casing method.
  def upcase(string)
    java_string = string.to_java
    java_string.to_upper_case.to_s
  end

  # Compose the string to NFC with java.text.Normalizer.
  def normalize_utf8(string)
    composed = Normalizer.normalize(string, Normalizer::Form::NFC)
    composed.to_s
  end
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,193 @@
|
|
1
|
+
module Babosa
|
2
|
+
module UTF8
|
3
|
+
|
4
|
+
# A small subset of the mappings provided by Unicode.org, limited to Latin
|
5
|
+
# characters. This is used for Babosa's default "dumb" UTF-8 support.
|
6
|
+
module Mappings
|
7
|
+
DOWNCASE = Hash[65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102,
|
8
|
+
71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110,
|
9
|
+
79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118,
|
10
|
+
87, 119, 88, 120, 89, 121, 90, 122, 181, 956, 192, 224, 193, 225, 194,
|
11
|
+
226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201,
|
12
|
+
233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208,
|
13
|
+
240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 216,
|
14
|
+
248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223,
|
15
|
+
[115, 115], 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267,
|
16
|
+
268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
|
17
|
+
282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295,
|
18
|
+
296, 297, 298, 299, 300, 301, 302, 303, 304, [105, 775], 306, 307, 308,
|
19
|
+
309, 310, 311, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
|
20
|
+
324, 325, 326, 327, 328, 329, [700, 110], 330, 331, 332, 333, 334, 335,
|
21
|
+
336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
|
22
|
+
350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
|
23
|
+
364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 255,
|
24
|
+
377, 378, 379, 380, 381, 382]
|
25
|
+
|
26
|
+
UPCASE = DOWNCASE.invert
|
27
|
+
|
28
|
+
COMPOSITION = {
|
29
|
+
[65,768] => 192,
|
30
|
+
[65,769] => 193,
|
31
|
+
[65,770] => 194,
|
32
|
+
[65,771] => 195,
|
33
|
+
[65,776] => 196,
|
34
|
+
[65,778] => 197,
|
35
|
+
[67,807] => 199,
|
36
|
+
[69,768] => 200,
|
37
|
+
[69,769] => 201,
|
38
|
+
[69,770] => 202,
|
39
|
+
[69,776] => 203,
|
40
|
+
[73,768] => 204,
|
41
|
+
[73,769] => 205,
|
42
|
+
[73,770] => 206,
|
43
|
+
[73,776] => 207,
|
44
|
+
[78,771] => 209,
|
45
|
+
[79,768] => 210,
|
46
|
+
[79,769] => 211,
|
47
|
+
[79,770] => 212,
|
48
|
+
[79,771] => 213,
|
49
|
+
[79,776] => 214,
|
50
|
+
[85,768] => 217,
|
51
|
+
[85,769] => 218,
|
52
|
+
[85,770] => 219,
|
53
|
+
[85,776] => 220,
|
54
|
+
[89,769] => 221,
|
55
|
+
[97,768] => 224,
|
56
|
+
[97,769] => 225,
|
57
|
+
[97,770] => 226,
|
58
|
+
[97,771] => 227,
|
59
|
+
[97,776] => 228,
|
60
|
+
[97,778] => 229,
|
61
|
+
[99,807] => 231,
|
62
|
+
[101,768] => 232,
|
63
|
+
[101,769] => 233,
|
64
|
+
[101,770] => 234,
|
65
|
+
[101,776] => 235,
|
66
|
+
[105,768] => 236,
|
67
|
+
[105,769] => 237,
|
68
|
+
[105,770] => 238,
|
69
|
+
[105,776] => 239,
|
70
|
+
[110,771] => 241,
|
71
|
+
[111,768] => 242,
|
72
|
+
[111,769] => 243,
|
73
|
+
[111,770] => 244,
|
74
|
+
[111,771] => 245,
|
75
|
+
[111,776] => 246,
|
76
|
+
[117,768] => 249,
|
77
|
+
[117,769] => 250,
|
78
|
+
[117,770] => 251,
|
79
|
+
[117,776] => 252,
|
80
|
+
[121,769] => 253,
|
81
|
+
[121,776] => 255,
|
82
|
+
[65,772] => 256,
|
83
|
+
[97,772] => 257,
|
84
|
+
[65,774] => 258,
|
85
|
+
[97,774] => 259,
|
86
|
+
[65,808] => 260,
|
87
|
+
[97,808] => 261,
|
88
|
+
[67,769] => 262,
|
89
|
+
[99,769] => 263,
|
90
|
+
[67,770] => 264,
|
91
|
+
[99,770] => 265,
|
92
|
+
[67,775] => 266,
|
93
|
+
[99,775] => 267,
|
94
|
+
[67,780] => 268,
|
95
|
+
[99,780] => 269,
|
96
|
+
[68,780] => 270,
|
97
|
+
[100,780] => 271,
|
98
|
+
[69,772] => 274,
|
99
|
+
[101,772] => 275,
|
100
|
+
[69,774] => 276,
|
101
|
+
[101,774] => 277,
|
102
|
+
[69,775] => 278,
|
103
|
+
[101,775] => 279,
|
104
|
+
[69,808] => 280,
|
105
|
+
[101,808] => 281,
|
106
|
+
[69,780] => 282,
|
107
|
+
[101,780] => 283,
|
108
|
+
[71,770] => 284,
|
109
|
+
[103,770] => 285,
|
110
|
+
[71,774] => 286,
|
111
|
+
[103,774] => 287,
|
112
|
+
[71,775] => 288,
|
113
|
+
[103,775] => 289,
|
114
|
+
[71,807] => 290,
|
115
|
+
[103,807] => 291,
|
116
|
+
[72,770] => 292,
|
117
|
+
[104,770] => 293,
|
118
|
+
[73,771] => 296,
|
119
|
+
[105,771] => 297,
|
120
|
+
[73,772] => 298,
|
121
|
+
[105,772] => 299,
|
122
|
+
[73,774] => 300,
|
123
|
+
[105,774] => 301,
|
124
|
+
[73,808] => 302,
|
125
|
+
[105,808] => 303,
|
126
|
+
[73,775] => 304,
|
127
|
+
[74,770] => 308,
|
128
|
+
[106,770] => 309,
|
129
|
+
[75,807] => 310,
|
130
|
+
[107,807] => 311,
|
131
|
+
[76,769] => 313,
|
132
|
+
[108,769] => 314,
|
133
|
+
[76,807] => 315,
|
134
|
+
[108,807] => 316,
|
135
|
+
[76,780] => 317,
|
136
|
+
[108,780] => 318,
|
137
|
+
[78,769] => 323,
|
138
|
+
[110,769] => 324,
|
139
|
+
[78,807] => 325,
|
140
|
+
[110,807] => 326,
|
141
|
+
[78,780] => 327,
|
142
|
+
[110,780] => 328,
|
143
|
+
[79,772] => 332,
|
144
|
+
[111,772] => 333,
|
145
|
+
[79,774] => 334,
|
146
|
+
[111,774] => 335,
|
147
|
+
[79,779] => 336,
|
148
|
+
[111,779] => 337,
|
149
|
+
[82,769] => 340,
|
150
|
+
[114,769] => 341,
|
151
|
+
[82,807] => 342,
|
152
|
+
[114,807] => 343,
|
153
|
+
[82,780] => 344,
|
154
|
+
[114,780] => 345,
|
155
|
+
[83,769] => 346,
|
156
|
+
[115,769] => 347,
|
157
|
+
[83,770] => 348,
|
158
|
+
[115,770] => 349,
|
159
|
+
[83,807] => 350,
|
160
|
+
[115,807] => 351,
|
161
|
+
[83,780] => 352,
|
162
|
+
[115,780] => 353,
|
163
|
+
[84,807] => 354,
|
164
|
+
[116,807] => 355,
|
165
|
+
[84,780] => 356,
|
166
|
+
[116,780] => 357,
|
167
|
+
[85,771] => 360,
|
168
|
+
[117,771] => 361,
|
169
|
+
[85,772] => 362,
|
170
|
+
[117,772] => 363,
|
171
|
+
[85,774] => 364,
|
172
|
+
[117,774] => 365,
|
173
|
+
[85,778] => 366,
|
174
|
+
[117,778] => 367,
|
175
|
+
[85,779] => 368,
|
176
|
+
[117,779] => 369,
|
177
|
+
[85,808] => 370,
|
178
|
+
[117,808] => 371,
|
179
|
+
[87,770] => 372,
|
180
|
+
[119,770] => 373,
|
181
|
+
[89,770] => 374,
|
182
|
+
[121,770] => 375,
|
183
|
+
[89,776] => 376,
|
184
|
+
[90,769] => 377,
|
185
|
+
[122,769] => 378,
|
186
|
+
[90,775] => 379,
|
187
|
+
[122,775] => 380,
|
188
|
+
[90,780] => 381,
|
189
|
+
[122,780] => 382
|
190
|
+
}
|
191
|
+
end
|
192
|
+
end
|
193
|
+
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
module Babosa
|
2
|
+
module UTF8
|
3
|
+
|
4
|
+
autoload :JavaProxy, "babosa/utf8/java_proxy"
|
5
|
+
autoload :UnicodeProxy, "babosa/utf8/unicode_proxy"
|
6
|
+
autoload :ActiveSupportProxy, "babosa/utf8/active_support_proxy"
|
7
|
+
autoload :DumbProxy, "babosa/utf8/dumb_proxy"
|
8
|
+
|
9
|
+
# A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
|
10
|
+
# The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
|
11
|
+
module UTF8Proxy
  # UTF-8 byte sequences for the characters Windows CP1252 assigns to bytes
  # 128-159. A nil entry marks a byte CP1252 leaves undefined; such bytes are
  # dropped by #tidy_bytes.
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }

  # This is a stub for a method that should return a Unicode-aware
  # downcased version of the given string.
  def downcase(string)
    raise NotImplementedError
  end

  # This is a stub for a method that should return a Unicode-aware
  # upcased version of the given string.
  def upcase(string)
    raise NotImplementedError
  end

  # This is a stub for a method that should return the Unicode NFC
  # normalization of the given string.
  def normalize_utf8(string)
    raise NotImplementedError
  end

  # Attempt to replace invalid UTF-8 bytes with valid ones. This method
  # naively assumes if you have invalid UTF8 bytes, they are either Windows
  # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
  # always work.
  #
  # Multibyte sequences that are cut short (by the end of the string, by an
  # ASCII/leading byte, or by an unused byte) or that Ruby refuses to decode
  # (e.g. overlong encodings) are re-read byte by byte as CP1252/ISO-8859-1,
  # so the final decode never raises.
  #
  # @param string [String] Bytes that are expected to be mostly UTF-8.
  # @return [String] A UTF-8 string.
  def tidy_bytes(string)
    bytes = string.unpack("C*")
    conts_expected = 0
    last_lead = 0

    bytes.each_index do |i|
      byte = bytes[i]
      is_cont = byte > 127 && byte < 192

      if byte > 240
        # Impossible or highly unlikely byte? Clean it. Bytes 241-244 can
        # lead valid 4-byte sequences but are deliberately treated as
        # unlikely here, as in the original implementation.
        if conts_expected > 0
          # The pending sequence can no longer complete; clean it first.
          tidy_range(bytes, last_lead, i)
          conts_expected = 0
        end
        bytes[i] = tidy_byte(byte)
      elsif is_cont
        if conts_expected == 0
          # Not expecting continuation byte? Clean up.
          bytes[i] = tidy_byte(byte)
        else
          # Otherwise, now expect one less.
          conts_expected -= 1
          # Sequence complete: keep it only if Ruby can actually decode it.
          if conts_expected == 0 && !decodable?(bytes[last_lead..i])
            tidy_range(bytes, last_lead, i + 1)
          end
        end
      else
        if conts_expected > 0
          # Expected continuation, but got ASCII or leading? Clean backwards
          # up to the leading byte.
          tidy_range(bytes, last_lead, i)
          conts_expected = 0
        end
        if byte > 191
          # Leading byte: expect continuations determined by position of
          # first zero bit, with max of 3.
          conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
          last_lead = i
        end
      end
    end

    # Input ended in the middle of a sequence (including a lone trailing
    # leading byte)? Clean what is left of it.
    tidy_range(bytes, last_lead, bytes.length) if conts_expected > 0

    bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
  end

  private

  # Reinterpret a single non-ASCII byte as CP1252 (128-159) or ISO-8859-1
  # (160-255) and return the UTF-8 bytes of that character, or nil when
  # CP1252 leaves the byte undefined.
  def tidy_byte(byte)
    byte < 160 ? CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
  end

  # Apply #tidy_byte in place to bytes[from...to].
  def tidy_range(bytes, from, to)
    (from...to).each { |k| bytes[k] = tidy_byte(bytes[k]) }
  end

  # Whether the given bytes decode as UTF-8 with the same decoder used for
  # the final result.
  def decodable?(sequence)
    sequence.pack("C*").unpack("U*")
    true
  rescue ArgumentError
    false
  end
end
|
117
|
+
end
|
118
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Babosa
  module UTF8
    # A UTF-8 proxy whose case mapping and normalization are delegated to the
    # Unicode gem.
    # @see http://github.com/blackwinter/unicode
    module UnicodeProxy
      # Pull in the shared proxy behavior first, then expose this module's own
      # methods as module-level methods so they take precedence over the stubs.
      extend UTF8Proxy
      extend self

      # Delegates to Unicode.downcase.
      def downcase(string)
        Unicode.downcase(string)
      end

      # Delegates to Unicode.upcase.
      def upcase(string)
        Unicode.upcase(string)
      end

      # Delegates to Unicode.normalize_C.
      def normalize_utf8(string)
        Unicode.normalize_C(string)
      end
    end
  end
end
|
data/test/babosa_test.rb
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$KCODE = 'UTF8' if RUBY_VERSION < '1.9'
|
3
|
+
$LOAD_PATH << File.expand_path("../../lib", __FILE__)
|
4
|
+
$LOAD_PATH.uniq!
|
5
|
+
|
6
|
+
require "rubygems"
|
7
|
+
require "bundler"
|
8
|
+
Bundler.setup
|
9
|
+
require "test/unit"
|
10
|
+
require "babosa"
|
11
|
+
|
12
|
+
Module.send :include, Module.new {
  # Declarative test macro: defines an instance method named "test_" plus the
  # description with every character outside [a-z0-9_] (case-insensitive)
  # replaced by an underscore, using the given block as its body.
  #
  # @param name [String] human-readable description of the test.
  # @raise [ArgumentError] if this module/class already defines a method with
  #   the resulting name. Two descriptions that differ only in punctuation
  #   would otherwise silently replace one another.
  def test(name, &block)
    method_name = "test_#{name.gsub(/[^a-z0-9_]/i, "_")}"
    if instance_methods(false).any? { |existing| existing.to_s == method_name }
      raise ArgumentError, "#{method_name} is already defined in #{self}"
    end
    define_method(method_name.to_sym, &block)
  end
}
|
17
|
+
|
18
|
+
# Shared tests for UTF-8 proxies. The including test case must define a
# +proxy+ method returning the proxy under test.
module UTF8ProxyTest
  test "should downcase strings" do
    assert_equal "åéîøü", proxy.downcase("ÅÉÎØÜ")
  end

  test "should upcase strings" do
    assert_equal "ÅÉÎØÜ", proxy.upcase("åéîøü")
  end

  test "should compose UTF-8" do
    # ÅÉÎØÜ, first as base letters followed by combining marks, then precomposed.
    uncomposed_bytes = [65, 204, 138, 69, 204, 129, 73, 204, 130, 195, 152, 85, 204, 136]
    composed_bytes = [195, 133, 195, 137, 195, 142, 195, 152, 195, 156]
    uncomposed_string = uncomposed_bytes.pack("C*").unpack("U*").pack("U*")
    assert_equal composed_bytes, proxy.normalize_utf8(uncomposed_string).unpack("C*")
  end
end
|
36
|
+
|
37
|
+
# The Java proxy is only exercised when Babosa.jruby15? is true.
if Babosa.jruby15?
  class JavaProxyTest < Test::Unit::TestCase
    include UTF8ProxyTest

    # Proxy under test for the shared UTF8ProxyTest cases.
    def proxy; Babosa::UTF8::JavaProxy; end
  end
end
|
45
|
+
|
46
|
+
# Runs the shared UTF8ProxyTest cases against the dependency-free DumbProxy.
class DumbProxyTest < Test::Unit::TestCase
  include UTF8ProxyTest

  # Proxy under test for the shared UTF8ProxyTest cases.
  def proxy; Babosa::UTF8::DumbProxy; end
end
|
52
|
+
|
53
|
+
# Tests for the slug string methods reached through String#to_slug.
class BabosaTest < Test::Unit::TestCase

  test "word_chars! should leave only letters and spaces" do
    string = "a*$%^$@!@b$%^&*()*!c"
    # Anchored on both ends: an unanchored /[a-z ]*/ matches every string.
    assert_match(/\A[a-z ]*\z/i, string.to_slug.word_chars!.to_s)
  end

  test "approximate_ascii should transliterate to ascii" do
    (0xC0..0x17E).each do |codepoint|
      ss = [codepoint].pack("U*").to_slug
      approx = ss.approximate_ascii
      # The whole result must be ASCII, not merely contain one ASCII character.
      assert_match(/\A[\x00-\x7f]+\z/, approx.to_s)
    end
  end

  test "should lowercase strings" do
    assert_equal "feliz año", "FELIZ AÑO".to_slug.downcase!
  end

  test "should uppercase strings" do
    assert_equal "FELIZ AÑO", "feliz año".to_slug.upcase!
  end

  test "should replace whitespace with dashes" do
    assert_equal "a-b", "a b".to_slug.clean.normalize!
  end

  test "should replace multiple spaces with 1 dash" do
    # Several consecutive spaces, so this differs from the single-space test above.
    assert_equal "a-b", "a   b".to_slug.clean.normalize!
  end

  test "should replace multiple dashes with 1 dash" do
    assert_equal "male-female", "male - female".to_slug.normalize!
  end

  test "should strip trailing space" do
    assert_equal "ab", "ab ".to_slug.normalize!
  end

  test "should strip leading space" do
    assert_equal "ab", " ab".to_slug.normalize!
  end

  test "should strip trailing dashes" do
    assert_equal "ab", "ab-".to_slug.normalize!
  end

  test "should strip leading dashes" do
    assert_equal "ab", "-ab".to_slug.normalize!
  end

  test "should not modify valid name strings" do
    assert_equal "a-b-c-d", "a-b-c-d".to_slug.normalize!
  end

  test "should do special approximations for German" do
    assert_equal "Juergen", "Jürgen".to_slug.approximate_ascii!(:german)
  end

  test "should do special approximations for Spanish" do
    assert_equal "anio", "año".to_slug.approximate_ascii!(:spanish)
  end

  test "should work with non roman chars" do
    assert_equal "検-索", "検 索".to_slug.normalize!
  end

  test "should work with invalid UTF-8 strings" do
    # Every method listed here is called without arguments.
    %w[approximate_ascii clean downcase word_chars normalize to_ascii upcase with_dashes].each do |method|
      string = "\x93abc".to_slug
      assert_nothing_raised do
        string.send(method)
      end
    end
  end

  test "should truncate string by byte length" do
    assert_equal "ü", "üa".to_slug.truncate_bytes!(2)
    assert_equal "", "üa".to_slug.truncate_bytes!(1)
    assert_equal "üa", "üa".to_slug.truncate_bytes!(100)
    assert_equal "ü", "üéøá".to_slug.truncate_bytes!(3)
  end

  test "should truncate string by char length" do
    assert_equal "üa", "üa".to_slug.truncate!(2)
    assert_equal "ü", "üa".to_slug.truncate!(1)
    assert_equal "üa", "üa".to_slug.truncate!(100)
  end

  test "should transliterate uncomposed utf8" do
    string = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS
    assert_equal "u", string.to_slug.approximate_ascii!
  end

  test "with_dashes should not change byte size when replacing spaces" do
    assert_equal "".bytesize, "".to_slug.with_dashes.bytesize
    assert_equal " ".bytesize, " ".to_slug.with_dashes.bytesize
    assert_equal "-abc-".bytesize, "-abc-".to_slug.with_dashes.bytesize
    assert_equal " abc ".bytesize, " abc ".to_slug.with_dashes.bytesize
    assert_equal " a bc ".bytesize, " a bc ".to_slug.with_dashes.bytesize
  end

  test "normalize! with ascii should approximate and strip non ascii" do
    ss = "カタカナ: katakana is über cool".to_slug
    assert_equal "katakana-is-uber-cool", ss.normalize!(true)
  end

end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: babosa
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Norman Clarke
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-07-12 00:00:00 -03:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: " A library for creating slugs. Babosa is an extraction and improvement of the\n string code from FriendlyId, intended to help developers create similar\n libraries or plugins.\n"
|
22
|
+
email: norman@njclarke.com
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- lib/babosa/characters.rb
|
31
|
+
- lib/babosa/slug_string.rb
|
32
|
+
- lib/babosa/utf8/active_support_proxy.rb
|
33
|
+
- lib/babosa/utf8/dumb_proxy.rb
|
34
|
+
- lib/babosa/utf8/java_proxy.rb
|
35
|
+
- lib/babosa/utf8/mappings.rb
|
36
|
+
- lib/babosa/utf8/proxy.rb
|
37
|
+
- lib/babosa/utf8/unicode_proxy.rb
|
38
|
+
- lib/babosa/version.rb
|
39
|
+
- lib/babosa.rb
|
40
|
+
- README.md
|
41
|
+
- MIT-LICENSE
|
42
|
+
- Rakefile
|
43
|
+
- init.rb
|
44
|
+
- test/babosa_test.rb
|
45
|
+
has_rdoc: false
|
46
|
+
homepage: http://norman.github.com/babosa
|
47
|
+
licenses: []
|
48
|
+
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
version: "0"
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project: "[none]"
|
73
|
+
rubygems_version: 1.3.7
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: A library for creating slugs.
|
77
|
+
test_files:
|
78
|
+
- test/babosa_test.rb
|