babosa 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +19 -0
- data/README.md +91 -0
- data/Rakefile +27 -0
- data/init.rb +3 -0
- data/lib/babosa.rb +22 -0
- data/lib/babosa/characters.rb +75 -0
- data/lib/babosa/slug_string.rb +223 -0
- data/lib/babosa/utf8/active_support_proxy.rb +20 -0
- data/lib/babosa/utf8/dumb_proxy.rb +41 -0
- data/lib/babosa/utf8/java_proxy.rb +24 -0
- data/lib/babosa/utf8/mappings.rb +193 -0
- data/lib/babosa/utf8/proxy.rb +118 -0
- data/lib/babosa/utf8/unicode_proxy.rb +21 -0
- data/lib/babosa/version.rb +5 -0
- data/test/babosa_test.rb +160 -0
- metadata +78 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2010 Norman Clarke
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# Babosa
|
2
|
+
|
3
|
+
Babosa is a library for creating slugs. It is an extraction and improvement of
|
4
|
+
the string code from [FriendlyId](http://github.com/norman/friendly_id),
|
5
|
+
intended to help developers create libraries similar to FriendlyId.
|
6
|
+
|
7
|
+
## Features / Usage
|
8
|
+
|
9
|
+
### ASCII transliteration
|
10
|
+
|
11
|
+
"Gölcük, Turkey".to_slug.approximate_ascii.to_s #=> "Golcuk, Turkey"
|
12
|
+
|
13
|
+
### Special cases for German and Spanish
|
14
|
+
|
15
|
+
"Jürgen Müller".to_slug.approximate_ascii.to_s #=> "Jurgen Muller"
|
16
|
+
"Jürgen Müller".to_slug.approximate_ascii(:german).to_s #=> "Juergen Mueller"
|
17
|
+
"feliz año".to_slug.approximate_ascii.to_s #=> "feliz ano"
|
18
|
+
"feliz año".to_slug.approximate_ascii(:spanish).to_s #=> "feliz anio"
|
19
|
+
|
20
|
+
### Non-ASCII removal
|
21
|
+
|
22
|
+
"Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
|
23
|
+
|
24
|
+
### Truncate by characters
|
25
|
+
|
26
|
+
"üüü".to_slug.truncate(2).to_s #=> "üü"
|
27
|
+
|
28
|
+
### Truncate by bytes
|
29
|
+
|
30
|
+
This can be useful to ensure the generated slug will fit in a database column
|
31
|
+
whose length is limited by bytes rather than UTF-8 characters.
|
32
|
+
|
33
|
+
"üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
|
34
|
+
|
35
|
+
### All-in-one
|
36
|
+
|
37
|
+
"Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
|
38
|
+
|
39
|
+
There are many more features; check the API docs and source code to find out
|
40
|
+
more.
|
41
|
+
|
42
|
+
## Getting it
|
43
|
+
|
44
|
+
Babosa can be installed via Rubygems:
|
45
|
+
|
46
|
+
gem install babosa
|
47
|
+
|
48
|
+
You can get the source code from its [Github repository](http://github.com/norman/babosa).
|
49
|
+
|
50
|
+
## Reporting bugs
|
51
|
+
|
52
|
+
Please use Babosa's [Github issue tracker](http://github.com/norman/babosa/issues).
|
53
|
+
|
54
|
+
|
55
|
+
## Misc
|
56
|
+
|
57
|
+
The speed and quality of Babosa's UTF-8 support depend on which Ruby and which
|
58
|
+
gems you are using.
|
59
|
+
|
60
|
+
On JRuby 1.5 and above, Babosa uses Java's native UTF-8 support. If you require
|
61
|
+
[Unicode](http://github.com/blackwinter/unicode) or ActiveSupport before
|
62
|
+
Babosa, it will use the support provided by those libraries. Otherwise, Babosa
|
63
|
+
defaults to very basic UTF-8 support for Latin characters only.
|
64
|
+
|
65
|
+
"Babosa" means slug in Spanish.
|
66
|
+
|
67
|
+
## Author
|
68
|
+
|
69
|
+
[Norman Clarke](http://njclarke.com)
|
70
|
+
|
71
|
+
## Copyright
|
72
|
+
|
73
|
+
Copyright (c) 2010 Norman Clarke
|
74
|
+
|
75
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
76
|
+
of this software and associated documentation files (the "Software"), to deal
|
77
|
+
in the Software without restriction, including without limitation the rights
|
78
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
79
|
+
copies of the Software, and to permit persons to whom the Software is
|
80
|
+
furnished to do so, subject to the following conditions:
|
81
|
+
|
82
|
+
The above copyright notice and this permission notice shall be included in all
|
83
|
+
copies or substantial portions of the Software.
|
84
|
+
|
85
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
86
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
87
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
88
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
89
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
90
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
91
|
+
SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require "rake/testtask"
require "rake/clean"

# Rake::GemPackageTask was deprecated in Rake 0.9 and later removed; RubyGems
# ships the replacement as Gem::PackageTask. Prefer the RubyGems task and only
# fall back to the legacy Rake one on toolchains that predate it.
begin
  require "rubygems/package_task"
  package_task_class = Gem::PackageTask
rescue LoadError
  require "rake/gempackagetask"
  package_task_class = Rake::GemPackageTask
end

task :default => :test

# Directories produced by the package, doc and coverage tasks.
CLEAN << "pkg" << "doc" << "coverage" << ".yardoc"

# Gem::Specification.load evaluates the gemspec for us (instead of a raw
# eval of the file contents).
package_task_class.new(Gem::Specification.load("babosa.gemspec")) { |pkg| }
Rake::TestTask.new(:test) { |t| t.pattern = "test/**/*_test.rb" }

# The documentation task is optional: it is only defined when YARD is installed.
begin
  require "yard"
  YARD::Rake::YardocTask.new do |t|
    t.options = ["--output-dir=doc"]
  end
rescue LoadError
end

# The coverage task is optional: it is only defined when rcov is installed.
begin
  require "rcov/rcovtask"
  Rcov::RcovTask.new do |r|
    r.test_files = FileList["test/**/*_test.rb"]
    r.verbose = true
    r.rcov_opts << "--exclude gems/*"
  end
rescue LoadError
end
|
data/init.rb
ADDED
data/lib/babosa.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Babosa
  # Tells whether the code is running on JRuby 1.5 or newer.
  #
  # The version is compared numerically, component by component, so that
  # e.g. "1.10.0" is correctly treated as newer than "1.5" (a plain string
  # comparison would get this wrong).
  #
  # @param version [String, nil] The JRuby version to check. Defaults to
  #   +JRUBY_VERSION+ when that constant is defined, and to +nil+ otherwise.
  # @return [Boolean] +false+ when no version is available (i.e., not JRuby).
  def self.jruby15?(version = (defined?(JRUBY_VERSION) ? JRUBY_VERSION : nil))
    return false unless version
    components = version.to_s.split(".").map { |part| part.to_i }
    (components <=> [1, 5]) >= 0
  end
end
|
6
|
+
|
7
|
+
class String
  # Wraps the receiver in a Babosa::SlugString so that slug-specific
  # methods can be called on it.
  def to_slug
    Babosa::SlugString.new(self)
  end

  # Ruby 1.8.6 has no String#bytesize; supply one by counting unpacked bytes.
  unless public_method_defined?(:bytesize)
    def bytesize
      raw_bytes = unpack("C*")
      raw_bytes.length
    end
  end
end
|
19
|
+
|
20
|
+
require "babosa/characters"
|
21
|
+
require "babosa/utf8/proxy"
|
22
|
+
require "babosa/slug_string"
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Babosa

  # This module provides sets of characters needed for various UTF-8 aware
  # string operations.
  module Characters
    extend self

    # Hash of named approximation tables. Each table maps a Unicode codepoint
    # (Integer) to either a single ASCII byte (Integer) or an Array of ASCII
    # bytes.
    attr_reader :approximations
    # Codepoints of punctuation and control characters to remove from slug
    # strings.
    attr_reader :strippable

    @strippable = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39,
      40, 41, 42, 43, 44, 45, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94,
      95, 96, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
      136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
      151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166,
      167, 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183,
      184, 185, 187, 188, 189, 190, 191, 215, 247]

    # Adds (or replaces) a named table of approximations.
    #
    # Only the FIRST codepoint of each key is used, so every key must be a
    # single precomposed character.
    #
    # @example
    #   add_approximations :spanish, "ñ" => "ni"
    # @param [#to_sym] name The name of the approximations to add.
    # @param [Hash] hash The approximations to add, as character => ASCII
    #   replacement string.
    # @return [Hash] The stored table (codepoint => byte or Array of bytes).
    def add_approximations(name, hash)
      @approximations ||= {}
      @approximations[name.to_sym] = hash.inject({}) do |memo, object|
        key = object[0].unpack("U").shift
        value = object[1].unpack("C*")
        # Single-byte replacements are stored unwrapped.
        memo[key] = value.length == 1 ? value[0] : value
        memo
      end
    end

    add_approximations :spanish, "ñ" => "ni"
    add_approximations :german, "ä" => "ae", "ö" => "oe", "ü" => "ue"
    add_approximations :latin, {
      "À" => "A", "Á" => "A", "Â" => "A", "Ã" => "A", "Ä" => "A", "Å" => "A",
      "Æ" => "AE", "Ç" => "C", "È" => "E", "É" => "E", "Ê" => "E", "Ë" => "E",
      "Ì" => "I", "Í" => "I", "Î" => "I", "Ï" => "I", "Ð" => "D", "Ñ" => "N",
      "Ò" => "O", "Ó" => "O", "Ô" => "O", "Õ" => "O", "Ö" => "O", "Ø" => "O",
      "Ù" => "U", "Ú" => "U", "Û" => "U", "Ü" => "U", "Ý" => "Y", "Þ" => "Th",
      "ß" => "ss", "à" => "a" , "á" => "a", "â" => "a", "ã" => "a", "ä" => "a",
      "å" => "a", "æ" => "ae", "ç" => "c" , "è" => "e", "é" => "e", "ê" => "e",
      "ë" => "e", "ì" => "i", "í" => "i", "î" => "i", "ï" => "i", "ð" => "d",
      "ñ" => "n", "ò" => "o", "ó" => "o", "ô" => "o", "õ" => "o", "ö" => "o",
      "ø" => "o", "ù" => "u", "ú" => "u", "û" => "u", "ü" => "u", "ý" => "y",
      "þ" => "th", "ÿ" => "y", "Ā" => "A", "ā" => "a", "Ă" => "A", "ă" => "a",
      "Ą" => "A", "ą" => "a", "Ć" => "C", "ć" => "c", "Ĉ" => "C", "ĉ" => "c",
      "Ċ" => "C", "ċ" => "c", "Č" => "C", "č" => "c", "Ď" => "D", "ď" => "d",
      "Đ" => "D", "đ" => "d", "Ē" => "E", "ē" => "e", "Ĕ" => "E", "ĕ" => "e",
      "Ė" => "E", "ė" => "e", "Ę" => "E", "ę" => "e", "Ě" => "E", "ě" => "e",
      "Ĝ" => "G", "ĝ" => "g", "Ğ" => "G", "ğ" => "g", "Ġ" => "G", "ġ" => "g",
      "Ģ" => "G", "ģ" => "g", "Ĥ" => "H", "ĥ" => "h", "Ħ" => "H", "ħ" => "h",
      "Ĩ" => "I", "ĩ" => "i", "Ī" => "I", "ī" => "i", "Ĭ" => "I", "ĭ" => "i",
      "Į" => "I", "į" => "i", "İ" => "I", "ı" => "i",
      # The IJ/ij ligatures (U+0132, U+0133) are built from their codepoints:
      # if they get flattened to the ASCII pairs "IJ"/"ij", the key would
      # become plain "I"/"i" and corrupt every ordinary I.
      [0x0132].pack("U") => "IJ", [0x0133].pack("U") => "ij",
      "Ĵ" => "J", "ĵ" => "j", "Ķ" => "K", "ķ" => "k", "ĸ" => "k", "Ĺ" => "L",
      "ĺ" => "l", "Ļ" => "L", "ļ" => "l", "Ľ" => "L", "ľ" => "l", "Ŀ" => "L",
      "ŀ" => "l", "Ł" => "L", "ł" => "l", "Ń" => "N", "ń" => "n", "Ņ" => "N",
      "ņ" => "n", "Ň" => "N", "ň" => "n",
      # U+0149 (n preceded by apostrophe), built from its codepoint for the
      # same reason as the ligatures above.
      [0x0149].pack("U") => "n",
      "Ŋ" => "NG", "ŋ" => "ng",
      "Ō" => "O", "ō" => "o", "Ŏ" => "O", "ŏ" => "o", "Ő" => "O", "ő" => "o",
      "Œ" => "OE", "œ" => "oe", "Ŕ" => "R", "ŕ" => "r", "Ŗ" => "R", "ŗ" => "r",
      "Ř" => "R", "ř" => "r", "Ś" => "S", "ś" => "s", "Ŝ" => "S", "ŝ" => "s",
      "Ş" => "S", "ş" => "s", "Š" => "S", "š" => "s", "Ţ" => "T", "ţ" => "t",
      "Ť" => "T", "ť" => "t", "Ŧ" => "T", "ŧ" => "t", "Ũ" => "U", "ũ" => "u",
      "Ū" => "U", "ū" => "u", "Ŭ" => "U", "ŭ" => "u", "Ů" => "U", "ů" => "u",
      "Ű" => "U", "ű" => "u", "Ų" => "U", "ų" => "u", "Ŵ" => "W", "ŵ" => "w",
      "Ŷ" => "Y", "ŷ" => "y", "Ÿ" => "Y", "Ź" => "Z", "ź" => "z", "Ż" => "Z",
      "ż" => "z", "Ž" => "Z", "ž" => "z", "×" => "x", "÷" => "/"
    }
  end
end
|
@@ -0,0 +1,223 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Babosa

  # This class provides some string-manipulation methods specific to slugs.
  #
  # Note that this class includes many "bang methods" such as {#clean!} and
  # {#normalize!} that perform actions on the string in-place. Each of these
  # methods has a corresponding "bangless" method (i.e., +SlugString#clean!+
  # and +SlugString#clean+) which does not appear in the documentation because
  # it is generated dynamically.
  #
  # All of the bang methods return an instance of String, while the bangless
  # versions return an instance of Babosa::SlugString, so that calls to methods
  # specific to this class can be chained:
  #
  #   string = SlugString.new("hello world")
  #   string.with_dashes! # => "hello-world"
  #   string.with_dashes  # => <Babosa::SlugString:0x000001013e1590 @wrapped_string="hello-world">
  #
  # Any method not defined here is forwarded to the wrapped String.
  #
  # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
  class SlugString

    attr_reader :wrapped_string
    alias to_s wrapped_string

    # Pick the best available UTF-8 proxy once, when the class is loaded.
    @@utf8_proxy = if Babosa.jruby15?
      UTF8::JavaProxy
    elsif defined? Unicode
      UTF8::UnicodeProxy
    elsif defined? ActiveSupport
      UTF8::ActiveSupportProxy
    else
      UTF8::DumbProxy
    end

    # Return the proxy used for UTF-8 support.
    # @see Babosa::UTF8::UTF8Proxy
    def self.utf8_proxy
      @@utf8_proxy
    end

    # Set a proxy object used for UTF-8 support.
    # @see Babosa::UTF8::UTF8Proxy
    def self.utf8_proxy=(obj)
      @@utf8_proxy = obj
    end

    # Forward any unknown message to the wrapped String.
    def method_missing(symbol, *args, &block)
      @wrapped_string.__send__(symbol, *args, &block)
    end

    # Keep +respond_to?+ consistent with {#method_missing}: the object also
    # responds to everything the wrapped String responds to.
    # @return Boolean
    def respond_to?(symbol, include_private = false)
      super || @wrapped_string.respond_to?(symbol, include_private)
    end

    # The wrapped string is run through the UTF-8 proxy's byte tidying and
    # Unicode composition as soon as it is wrapped.
    # @param string [#to_s] The string to use as the basis of the SlugString.
    def initialize(string)
      @wrapped_string = string.to_s
      tidy_bytes!
      normalize_utf8!
    end

    # Approximate an ASCII string. This works only for Western strings using
    # characters that are Roman-alphabet characters + diacritics. Non-letter
    # characters are left unmodified.
    #
    #   string = SlugString.new "Łódź, Poland"
    #   string.approximate_ascii                 # => "Lodz, Poland"
    #   string = SlugString.new "日本"
    #   string.approximate_ascii                 # => "日本"
    #
    # You can pass a key from +Characters.approximations+ as the argument.
    # This allows for contextual approximations. By default, +:spanish+ and
    # +:german+ are provided:
    #
    #   string = SlugString.new "Jürgen Müller"
    #   string.approximate_ascii                 # => "Jurgen Muller"
    #   string.approximate_ascii :german         # => "Juergen Mueller"
    #   string = SlugString.new "¡Feliz año!"
    #   string.approximate_ascii                 # => "¡Feliz ano!"
    #   string.approximate_ascii :spanish        # => "¡Feliz anio!"
    #
    # You can modify the built-in approximations, or add your own:
    #
    #   # Make Spanish use "nh" rather than "ni"
    #   Babosa::Characters.add_approximations(:spanish, "ñ" => "nh")
    #
    # Notice that this method does not simply convert to ASCII; if you want
    # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
    #
    #   string.approximate_ascii!(:spanish)      # => "¡Feliz anio!"
    #   string.to_ascii!                         # => "Feliz anio!"
    # @param overrides [Symbol, Hash, nil] Either the name of a table in
    #   +Characters.approximations+, or a Hash of codepoint => replacement
    #   that takes precedence over the +:latin+ table. +nil+ means no overrides.
    # @raise [ArgumentError] if a Symbol is given that names no known table.
    # @return String
    def approximate_ascii!(overrides = {})
      if overrides.kind_of? Symbol
        name = overrides
        overrides = Characters.approximations[name]
        # Fail clearly instead of crashing later on nil[...].
        raise ArgumentError, "unknown approximations: #{name.inspect}" unless overrides
      end
      overrides ||= {}
      codepoints = @wrapped_string.unpack("U*")
      # Replacements may be Arrays of bytes, hence the flatten before packing.
      @wrapped_string = codepoints.map { |char| approx_char(char, overrides) }.flatten.pack("U*")
    end

    # Converts dashes to spaces, removes leading and trailing whitespace, and
    # replaces runs of spaces with a single space.
    # @return String
    def clean!
      @wrapped_string = @wrapped_string.gsub("-", " ").squeeze(" ").strip
    end

    # Remove any non-word characters, i.e. every codepoint listed in
    # +Characters.strippable+.
    # @return String
    def word_chars!
      @wrapped_string = (@wrapped_string.unpack("U*") - Characters.strippable).pack("U*")
    end

    # Normalize the string for use as a slug. Note that in this context,
    # +normalize+ means stripping, removing non-letters/numbers, downcasing,
    # truncating to 255 bytes and converting whitespace to dashes.
    # @param ascii [Boolean] If true, approximate ASCII and then remove any non-ASCII characters.
    # @return String
    def normalize!(ascii = false)
      if ascii
        approximate_ascii!
        to_ascii!
      end
      clean!
      word_chars!
      # Stripping characters can leave doubled or dangling spaces behind.
      clean!
      downcase!
      truncate_bytes!(255)
      with_dashes!
    end

    # Delete any non-ascii characters.
    # @return String
    def to_ascii!
      @wrapped_string = @wrapped_string.gsub(/[^\x00-\x7f]/u, '')
    end

    # Truncate the string to +max+ characters.
    # @example
    #   "üéøá".to_slug.truncate(3) #=> "üéø"
    # @param max [Integer] The maximum number of characters to keep.
    # @return String
    def truncate!(max)
      @wrapped_string = @wrapped_string.unpack("U*")[0...max].pack("U*")
    end

    # Truncate the string to +max+ bytes. This can be useful for ensuring that
    # a UTF-8 string will always fit into a database column with a certain max
    # byte length. The resulting string may be less than +max+ if the string must
    # be truncated at a multibyte character boundary.
    # @example
    #   "üéøá".to_slug.truncate_bytes(3) #=> "ü"
    # @param max [Integer] The maximum number of bytes to keep.
    # @return String
    def truncate_bytes!(max)
      return @wrapped_string if @wrapped_string.bytesize <= max
      used = 0
      kept = []
      @wrapped_string.unpack("U*").each do |codepoint|
        char = [codepoint].pack("U")
        # Stop at the first character that would not fit entirely.
        break if used + char.bytesize > max
        used += char.bytesize
        kept << char
      end
      @wrapped_string = kept.join
    end

    # Replaces whitespace with dashes ("-").
    # @return String
    def with_dashes!
      @wrapped_string = @wrapped_string.gsub(/\s/u, "-")
    end

    # Perform UTF-8 sensitive upcasing.
    # @return String
    def upcase!
      @wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
    end

    # Perform UTF-8 sensitive downcasing.
    # @return String
    def downcase!
      @wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
    end

    # Perform Unicode composition on the wrapped string.
    # @return String
    def normalize_utf8!
      @wrapped_string = @@utf8_proxy.normalize_utf8(@wrapped_string)
    end

    # Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
    # UTF-8.
    # @return String
    def tidy_bytes!
      @wrapped_string = @@utf8_proxy.tidy_bytes(@wrapped_string)
    end

    # Generate the "bangless" counterpart of each bang method. Each one runs
    # the bang method on a fresh SlugString and returns that new instance.
    %w[approximate_ascii clean downcase word_chars normalize normalize_utf8
      tidy_bytes to_ascii truncate truncate_bytes upcase with_dashes].each do |method|
      define_method(method) do |*args|
        send_to_new_instance(:"#{method}!", *args)
      end
    end

    # A SlugString is already a slug string.
    # @return [SlugString] self
    def to_slug
      self
    end

    private

    # Look up the character's approximation in the configured maps: the
    # overrides first, then the +:latin+ table, else the character itself.
    def approx_char(char, overrides = {})
      overrides[char] || Characters.approximations[:latin][char] || char
    end

    # Used as the basis of the bangless methods.
    def send_to_new_instance(*args)
      string = SlugString.new self
      string.send(*args)
      string
    end

  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Babosa
  module UTF8
    # A UTF-8 proxy using Active Support's multibyte support.
    module ActiveSupportProxy
      extend UTF8Proxy
      extend self

      # @param string [String]
      # @return [String] A Unicode-aware downcased copy of +string+.
      def downcase(string)
        ActiveSupport::Multibyte::Chars.new(string).downcase.to_s
      end

      # @param string [String]
      # @return [String] A Unicode-aware upcased copy of +string+.
      def upcase(string)
        ActiveSupport::Multibyte::Chars.new(string).upcase.to_s
      end

      # @param string [String]
      # @return [String] The NFC (composed) normalization of +string+.
      def normalize_utf8(string)
        if ActiveSupport::Multibyte::Chars.method_defined?(:normalize)
          ActiveSupport::Multibyte::Chars.new(string).normalize(:c).to_s
        else
          # Chars#normalize is gone from newer Active Support releases; Ruby's
          # own String#unicode_normalize is the replacement there.
          string.unicode_normalize(:nfc)
        end
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require File.expand_path("../mappings", __FILE__)
|
2
|
+
module Babosa
  module UTF8

    # Fallback UTF-8 support, used when no better library is available. Case
    # folding is limited to the Roman alphabet-based characters commonly used
    # by Western European languages, which makes this proxy useless for
    # Russian, Bulgarian, Greek, etc. Prefer Unicode or ActiveSupport whenever
    # possible, because they cover the full UTF-8 character range.
    module DumbProxy
      extend UTF8Proxy
      extend self

      # Downcase +string+ using the Mappings::DOWNCASE table; codepoints that
      # are not in the table are passed through unchanged.
      def downcase(string)
        lowered = string.unpack("U*").map do |codepoint|
          Mappings::DOWNCASE[codepoint] || codepoint
        end
        lowered.flatten.pack("U*")
      end

      # Upcase +string+ using the Mappings::UPCASE table; codepoints that are
      # not in the table are passed through unchanged.
      def upcase(string)
        raised = string.unpack("U*").map do |codepoint|
          Mappings::UPCASE[codepoint] || codepoint
        end
        raised.flatten.pack("U*")
      end

      # A very naive Unicode normalization that is good enough for this
      # library's purposes (i.e., Roman-based codepoints, up to U+017E): each
      # adjacent pair of codepoints found in Mappings::COMPOSITION is replaced
      # by its precomposed codepoint. Do not reuse this as a general solution!
      # Use a real library like Unicode or ActiveSupport instead.
      def normalize_utf8(string)
        pending = string.unpack("U*")
        composed = []
        until pending.empty?
          precomposed = Mappings::COMPOSITION[pending[0..1]]
          if precomposed
            pending.slice!(0, 2)
            composed << precomposed
          else
            composed << pending.shift
          end
        end
        composed.compact.flatten.pack("U*")
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
include Java

module Babosa
  module UTF8
    # UTF-8 proxy backed by the Unicode support built into Java.
    # Requires JRuby 1.5+.
    module JavaProxy
      extend UTF8Proxy
      extend self
      import java.text.Normalizer

      # Downcase +string+ through java.lang.String.
      def downcase(string)
        java_string = string.to_java
        java_string.to_lower_case.to_s
      end

      # Upcase +string+ through java.lang.String.
      def upcase(string)
        java_string = string.to_java
        java_string.to_upper_case.to_s
      end

      # Return the NFC normalization of +string+ using java.text.Normalizer.
      def normalize_utf8(string)
        composed = Normalizer.normalize(string, Normalizer::Form::NFC)
        composed.to_s
      end
    end
  end
end
|
@@ -0,0 +1,193 @@
|
|
1
|
+
module Babosa
  module UTF8

    # A small, Latin-only subset of the mappings published by Unicode.org.
    # Babosa's default "dumb" UTF-8 support is built on these tables.
    module Mappings
      # Codepoint => downcased codepoint (or Array of codepoints).
      DOWNCASE = {}
      # A-Z and the Latin-1 capitals sit 32 below their lowercase forms;
      # 215 (the multiplication sign) is not a letter.
      (65..90).each { |code| DOWNCASE[code] = code + 32 }
      DOWNCASE[181] = 956
      (192..222).each { |code| DOWNCASE[code] = code + 32 unless code == 215 }
      DOWNCASE[223] = [115, 115]
      # In Latin Extended-A capitals and smalls alternate, so the lowercase
      # form is the next codepoint.
      256.step(302, 2) { |code| DOWNCASE[code] = code + 1 }
      DOWNCASE[304] = [105, 775]
      306.step(310, 2) { |code| DOWNCASE[code] = code + 1 }
      313.step(327, 2) { |code| DOWNCASE[code] = code + 1 }
      DOWNCASE[329] = [700, 110]
      330.step(374, 2) { |code| DOWNCASE[code] = code + 1 }
      DOWNCASE[376] = 255
      377.step(381, 2) { |code| DOWNCASE[code] = code + 1 }

      UPCASE = DOWNCASE.invert

      # [base codepoint, combining mark codepoint] => precomposed codepoint.
      COMPOSITION = {
        # Latin-1 Supplement, capitals
        [65,768] => 192, [65,769] => 193, [65,770] => 194, [65,771] => 195,
        [65,776] => 196, [65,778] => 197,
        [67,807] => 199,
        [69,768] => 200, [69,769] => 201, [69,770] => 202, [69,776] => 203,
        [73,768] => 204, [73,769] => 205, [73,770] => 206, [73,776] => 207,
        [78,771] => 209,
        [79,768] => 210, [79,769] => 211, [79,770] => 212, [79,771] => 213,
        [79,776] => 214,
        [85,768] => 217, [85,769] => 218, [85,770] => 219, [85,776] => 220,
        [89,769] => 221,
        # Latin-1 Supplement, smalls
        [97,768] => 224, [97,769] => 225, [97,770] => 226, [97,771] => 227,
        [97,776] => 228, [97,778] => 229,
        [99,807] => 231,
        [101,768] => 232, [101,769] => 233, [101,770] => 234, [101,776] => 235,
        [105,768] => 236, [105,769] => 237, [105,770] => 238, [105,776] => 239,
        [110,771] => 241,
        [111,768] => 242, [111,769] => 243, [111,770] => 244, [111,771] => 245,
        [111,776] => 246,
        [117,768] => 249, [117,769] => 250, [117,770] => 251, [117,776] => 252,
        [121,769] => 253, [121,776] => 255,
        # Latin Extended-A, capital/small pairs
        [65,772] => 256, [97,772] => 257,
        [65,774] => 258, [97,774] => 259,
        [65,808] => 260, [97,808] => 261,
        [67,769] => 262, [99,769] => 263,
        [67,770] => 264, [99,770] => 265,
        [67,775] => 266, [99,775] => 267,
        [67,780] => 268, [99,780] => 269,
        [68,780] => 270, [100,780] => 271,
        [69,772] => 274, [101,772] => 275,
        [69,774] => 276, [101,774] => 277,
        [69,775] => 278, [101,775] => 279,
        [69,808] => 280, [101,808] => 281,
        [69,780] => 282, [101,780] => 283,
        [71,770] => 284, [103,770] => 285,
        [71,774] => 286, [103,774] => 287,
        [71,775] => 288, [103,775] => 289,
        [71,807] => 290, [103,807] => 291,
        [72,770] => 292, [104,770] => 293,
        [73,771] => 296, [105,771] => 297,
        [73,772] => 298, [105,772] => 299,
        [73,774] => 300, [105,774] => 301,
        [73,808] => 302, [105,808] => 303,
        [73,775] => 304,
        [74,770] => 308, [106,770] => 309,
        [75,807] => 310, [107,807] => 311,
        [76,769] => 313, [108,769] => 314,
        [76,807] => 315, [108,807] => 316,
        [76,780] => 317, [108,780] => 318,
        [78,769] => 323, [110,769] => 324,
        [78,807] => 325, [110,807] => 326,
        [78,780] => 327, [110,780] => 328,
        [79,772] => 332, [111,772] => 333,
        [79,774] => 334, [111,774] => 335,
        [79,779] => 336, [111,779] => 337,
        [82,769] => 340, [114,769] => 341,
        [82,807] => 342, [114,807] => 343,
        [82,780] => 344, [114,780] => 345,
        [83,769] => 346, [115,769] => 347,
        [83,770] => 348, [115,770] => 349,
        [83,807] => 350, [115,807] => 351,
        [83,780] => 352, [115,780] => 353,
        [84,807] => 354, [116,807] => 355,
        [84,780] => 356, [116,780] => 357,
        [85,771] => 360, [117,771] => 361,
        [85,772] => 362, [117,772] => 363,
        [85,774] => 364, [117,774] => 365,
        [85,778] => 366, [117,778] => 367,
        [85,779] => 368, [117,779] => 369,
        [85,808] => 370, [117,808] => 371,
        [87,770] => 372, [119,770] => 373,
        [89,770] => 374, [121,770] => 375,
        [89,776] => 376,
        [90,769] => 377, [122,769] => 378,
        [90,775] => 379, [122,775] => 380,
        [90,780] => 381, [122,780] => 382
      }
    end
  end
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
module Babosa
  module UTF8

    autoload :JavaProxy,          "babosa/utf8/java_proxy"
    autoload :UnicodeProxy,       "babosa/utf8/unicode_proxy"
    autoload :ActiveSupportProxy, "babosa/utf8/active_support_proxy"
    autoload :DumbProxy,          "babosa/utf8/dumb_proxy"

    # A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
    # The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
    module UTF8Proxy
      # CP-1252 byte => bytes of the UTF-8 encoding of the same character.
      # +nil+ marks bytes that are unassigned in CP-1252; they are dropped.
      CP1252 = {
        128 => [226, 130, 172],
        129 => nil,
        130 => [226, 128, 154],
        131 => [198, 146],
        132 => [226, 128, 158],
        133 => [226, 128, 166],
        134 => [226, 128, 160],
        135 => [226, 128, 161],
        136 => [203, 134],
        137 => [226, 128, 176],
        138 => [197, 160],
        139 => [226, 128, 185],
        140 => [197, 146],
        141 => nil,
        142 => [197, 189],
        143 => nil,
        144 => nil,
        145 => [226, 128, 152],
        146 => [226, 128, 153],
        147 => [226, 128, 156],
        148 => [226, 128, 157],
        149 => [226, 128, 162],
        150 => [226, 128, 147],
        151 => [226, 128, 148],
        152 => [203, 156],
        153 => [226, 132, 162],
        154 => [197, 161],
        155 => [226, 128, 186],
        156 => [197, 147],
        157 => nil,
        158 => [197, 190],
        159 => [197, 184]
      }

      # This is a stub for a method that should return a Unicode-aware
      # downcased version of the given string.
      def downcase(string)
        raise NotImplementedError
      end

      # This is a stub for a method that should return a Unicode-aware
      # upcased version of the given string.
      def upcase(string)
        raise NotImplementedError
      end

      # This is a stub for a method that should return the Unicode NFC
      # normalization of the given string.
      def normalize_utf8(string)
        raise NotImplementedError
      end

      # Attempt to replace invalid UTF-8 bytes with valid ones. This method
      # naively assumes if you have invalid UTF8 bytes, they are either Windows
      # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
      # always work.
      #
      # A multibyte sequence that is cut short, whether by the end of the
      # string or by an unused/restricted byte, is cleaned back to its lead
      # byte rather than being passed through half-finished.
      #
      # @param string [String] The possibly mis-encoded input.
      # @return [String] The tidied string.
      def tidy_bytes(string)
        bytes = string.unpack("C*")
        conts_expected = 0
        last_lead = 0

        bytes.each_index do |i|
          byte          = bytes[i]
          is_cont       = byte > 127 && byte < 192
          is_lead       = byte > 191 && byte < 245
          is_unused     = byte > 240
          is_restricted = byte > 244

          # A continuation byte announced by a lead byte is valid as it is.
          if is_cont && conts_expected > 0
            conts_expected -= 1
            next
          end

          # Expected continuation, but got something else? Clean backwards up
          # to the leading byte.
          if conts_expected > 0
            tidy_range(bytes, last_lead, i)
            conts_expected = 0
          end

          if is_unused || is_restricted || is_cont
            # Impossible or highly unlikely byte, or a continuation byte
            # nobody asked for? Clean it.
            bytes[i] = tidy_byte(byte)
          elsif is_lead
            if i == bytes.length - 1
              # Final byte is leading? Clean it.
              bytes[i] = tidy_byte(byte)
            else
              # Valid leading byte? Expect continuations determined by position of
              # first zero bit, with max of 3.
              conts_expected = byte < 224 ? 1 : (byte < 240 ? 2 : 3)
              last_lead = i
            end
          end
        end

        # String ended in the middle of a multibyte sequence? Clean what is
        # there.
        tidy_range(bytes, last_lead, bytes.length) if conts_expected > 0

        # Cleaned entries are Arrays (or nil for dropped bytes), hence the
        # flatten and compact before repacking.
        bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      private

      # Re-encode a single byte as UTF-8, reading it as CP-1252 below 160 and
      # as ISO-8859-1 from 160 up.
      # @return [Array<Integer>, nil] The UTF-8 bytes, or nil to drop the byte.
      def tidy_byte(byte)
        if byte < 160
          CP1252[byte]
        elsif byte < 192
          [194, byte]
        else
          [195, byte - 64]
        end
      end

      # Clean, in place, every byte of +bytes+ from index +from+ up to (but not
      # including) index +to+.
      def tidy_range(bytes, from, to)
        (from...to).each { |k| bytes[k] = tidy_byte(bytes[k]) }
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Babosa
  module UTF8
    # A UTF-8 proxy that delegates case conversion and normalization to the
    # Unicode gem.
    # @see http://github.com/blackwinter/unicode
    module UnicodeProxy
      extend UTF8Proxy
      extend self

      # @return the result of Unicode.downcase for the given string
      def downcase(string)
        Unicode.downcase(string)
      end

      # @return the result of Unicode.upcase for the given string
      def upcase(string)
        Unicode.upcase(string)
      end

      # @return the result of Unicode.normalize_C for the given string
      def normalize_utf8(string)
        Unicode.normalize_C(string)
      end
    end
  end
end
|
data/test/babosa_test.rb
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$KCODE = 'UTF8' if RUBY_VERSION < '1.9'
|
3
|
+
$LOAD_PATH << File.expand_path("../../lib", __FILE__)
|
4
|
+
$LOAD_PATH.uniq!
|
5
|
+
|
6
|
+
require "rubygems"
|
7
|
+
require "bundler"
|
8
|
+
Bundler.setup
|
9
|
+
require "test/unit"
|
10
|
+
require "babosa"
|
11
|
+
|
12
|
+
# Gives every class and module a `test "description" do ... end` macro. The
# description is turned into a method name by prefixing "test_" and replacing
# each character that is not a letter, digit or underscore with "_".
Module.send :include, Module.new {
  def test(name, &block)
    method_name = "test_#{name.gsub(/[^a-z0-9_]/i, "_")}"
    define_method(method_name.to_sym, &block)
  end
}
|
17
|
+
|
18
|
+
# Shared tests for UTF-8 proxies. The including test case must define a
# `proxy` method returning the proxy under test.
module UTF8ProxyTest
  test "should downcase strings" do
    assert_equal "åéîøü", proxy.downcase("ÅÉÎØÜ")
  end

  test "should upcase strings" do
    assert_equal "ÅÉÎØÜ", proxy.upcase("åéîøü")
  end

  test "should compose UTF-8" do
    # ÅÉÎØÜ, first as base letters followed by combining marks, then as the
    # precomposed characters that normalization is expected to produce.
    uncomposed_bytes = [65, 204, 138, 69, 204, 129, 73, 204, 130, 195, 152, 85, 204, 136]
    composed_bytes = [195, 133, 195, 137, 195, 142, 195, 152, 195, 156]
    uncomposed_string = uncomposed_bytes.pack("C*").unpack("U*").pack("U*")
    assert_equal composed_bytes, proxy.normalize_utf8(uncomposed_string).unpack("C*")
  end
end
|
36
|
+
|
37
|
+
# Only defined when Babosa.jruby15? is true.
if Babosa.jruby15?
  # Runs the shared UTF8ProxyTest tests against JavaProxy.
  class JavaProxyTest < Test::Unit::TestCase
    include UTF8ProxyTest

    # The proxy exercised by the shared tests.
    def proxy
      Babosa::UTF8::JavaProxy
    end
  end
end
|
45
|
+
|
46
|
+
# Runs the shared UTF8ProxyTest tests against DumbProxy.
class DumbProxyTest < Test::Unit::TestCase
  include UTF8ProxyTest

  # The proxy exercised by the shared tests.
  def proxy
    Babosa::UTF8::DumbProxy
  end
end
|
52
|
+
|
53
|
+
# Tests for the slug string returned by String#to_slug.
class BabosaTest < Test::Unit::TestCase

  test "word_chars! should leave only letters and spaces" do
    string = "a*$%^$@!@b$%^&*()*!c"
    # Anchored so the whole result is checked; an unanchored /[a-z ]*/ matches
    # the empty string and therefore could never fail.
    assert_match(/\A[a-z ]*\z/i, string.to_slug.word_chars!)
  end

  test "approximate_ascii should transliterate to ascii" do
    (0xC0..0x17E).each do |codepoint|
      ss = [codepoint].pack("U*").to_slug
      approx = ss.approximate_ascii
      # Anchored so every character of the result must be ASCII.
      assert_match(/\A[\x00-\x7f]+\z/, approx.to_s)
    end
  end

  test "should lowercase strings" do
    assert_equal "feliz año", "FELIZ AÑO".to_slug.downcase!
  end

  test "should uppercase strings" do
    assert_equal "FELIZ AÑO", "feliz año".to_slug.upcase!
  end

  test "should replace whitespace with dashes" do
    assert_equal "a-b", "a b".to_slug.clean.normalize!
  end

  test "should replace multiple spaces with 1 dash" do
    assert_equal "a-b", "a   b".to_slug.clean.normalize!
  end

  test "should replace multiple dashes with 1 dash" do
    assert_equal "male-female", "male - female".to_slug.normalize!
  end

  test "should strip trailing space" do
    assert_equal "ab", "ab ".to_slug.normalize!
  end

  test "should strip leading space" do
    assert_equal "ab", " ab".to_slug.normalize!
  end

  test "should strip trailing dashes" do
    assert_equal "ab", "ab-".to_slug.normalize!
  end

  test "should strip leading dashes" do
    assert_equal "ab", "-ab".to_slug.normalize!
  end

  test "should not modify valid name strings" do
    assert_equal "a-b-c-d", "a-b-c-d".to_slug.normalize!
  end

  test "should do special approximations for German" do
    assert_equal "Juergen", "Jürgen".to_slug.approximate_ascii!(:german)
  end

  test "should do special approximations for Spanish" do
    assert_equal "anio", "año".to_slug.approximate_ascii!(:spanish)
  end

  test "should work with non roman chars" do
    assert_equal "検-索", "検 索".to_slug.normalize!
  end

  test "should work with invalid UTF-8 strings" do
    # "\x93" is a CP-1252 left double quotation mark and is not valid UTF-8.
    %w[approximate_ascii clean downcase word_chars normalize to_ascii upcase with_dashes].each do |method|
      string = "\x93abc".to_slug
      assert_nothing_raised do
        string.send(method)
      end
    end
  end

  test "should truncate string by byte length" do
    assert_equal "ü", "üa".to_slug.truncate_bytes!(2)
    assert_equal "", "üa".to_slug.truncate_bytes!(1)
    assert_equal "üa", "üa".to_slug.truncate_bytes!(100)
    assert_equal "ü", "üéøá".to_slug.truncate_bytes!(3)
  end

  test "should truncate string by char length" do
    assert_equal "üa", "üa".to_slug.truncate!(2)
    assert_equal "ü", "üa".to_slug.truncate!(1)
    assert_equal "üa", "üa".to_slug.truncate!(100)
  end

  test "should transliterate uncomposed utf8" do
    string = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS
    assert_equal "u", string.to_slug.approximate_ascii!
  end

  test "with_dashes should not change byte size when replacing spaces" do
    assert_equal "".bytesize, "".to_slug.with_dashes.bytesize
    assert_equal " ".bytesize, " ".to_slug.with_dashes.bytesize
    assert_equal "-abc-".bytesize, "-abc-".to_slug.with_dashes.bytesize
    assert_equal " abc ".bytesize, " abc ".to_slug.with_dashes.bytesize
    assert_equal " a bc ".bytesize, " a bc ".to_slug.with_dashes.bytesize
  end

  test "normalize! with ascii should approximate and strip non ascii" do
    ss = "カタカナ: katakana is über cool".to_slug
    assert_equal "katakana-is-uber-cool", ss.normalize!(true)
  end

end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: babosa
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Norman Clarke
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-07-12 00:00:00 -03:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: " A library for creating slugs. Babosa is an extraction and improvement of the\n string code from FriendlyId, intended to help developers create similar\n libraries or plugins.\n"
|
22
|
+
email: norman@njclarke.com
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- lib/babosa/characters.rb
|
31
|
+
- lib/babosa/slug_string.rb
|
32
|
+
- lib/babosa/utf8/active_support_proxy.rb
|
33
|
+
- lib/babosa/utf8/dumb_proxy.rb
|
34
|
+
- lib/babosa/utf8/java_proxy.rb
|
35
|
+
- lib/babosa/utf8/mappings.rb
|
36
|
+
- lib/babosa/utf8/proxy.rb
|
37
|
+
- lib/babosa/utf8/unicode_proxy.rb
|
38
|
+
- lib/babosa/version.rb
|
39
|
+
- lib/babosa.rb
|
40
|
+
- README.md
|
41
|
+
- MIT-LICENSE
|
42
|
+
- Rakefile
|
43
|
+
- init.rb
|
44
|
+
- test/babosa_test.rb
|
45
|
+
has_rdoc: false
|
46
|
+
homepage: http://norman.github.com/babosa
|
47
|
+
licenses: []
|
48
|
+
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
version: "0"
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project: "[none]"
|
73
|
+
rubygems_version: 1.3.7
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: A library for creating slugs.
|
77
|
+
test_files:
|
78
|
+
- test/babosa_test.rb
|