string_normalizr 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +24 -0
- data/Manifest +1 -1
- data/README.rdoc +7 -5
- data/Rakefile +1 -1
- data/lib/string_normalizr.rb +96 -6
- data/string_normalizr.gemspec +4 -4
- data/test/string_normalizr_test.rb +72 -5
- metadata +7 -5
data/LICENSE
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
Copyright (c) 2010, Carsten Zimmermann
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
5
|
+
modification, are permitted provided that the following conditions are met:
|
6
|
+
* Redistributions of source code must retain the above copyright
|
7
|
+
notice, this list of conditions and the following disclaimer.
|
8
|
+
* Redistributions in binary form must reproduce the above copyright
|
9
|
+
notice, this list of conditions and the following disclaimer in the
|
10
|
+
documentation and/or other materials provided with the distribution.
|
11
|
+
* Neither the name of the original author / copyright holder nor the
|
12
|
+
names of its contributors may be used to endorse or promote products
|
13
|
+
derived from this software without specific prior written permission.
|
14
|
+
|
15
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
16
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
17
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
18
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
|
19
|
+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
20
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
21
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
22
|
+
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
23
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
24
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/Manifest
CHANGED
data/README.rdoc
CHANGED
@@ -6,7 +6,7 @@ of your String instance.
|
|
6
6
|
|
7
7
|
|
8
8
|
=== Installation
|
9
|
-
$ gem install
|
9
|
+
$ gem install string_normalizr
|
10
10
|
|
11
11
|
Or use it as a plugin with your Rails app:
|
12
12
|
|
@@ -15,12 +15,14 @@ Or use it as a plugin with your Rails app:
|
|
15
15
|
=== Usage
|
16
16
|
"hellö world".normalize
|
17
17
|
|
18
|
-
=== Caveats
|
19
|
-
* Tests seem to have difficulties with multibyte characters right now.
|
20
|
-
|
21
18
|
=== .plan
|
22
|
-
*
|
19
|
+
* support custom collation hash
|
20
|
+
* support more special chars by default
|
21
|
+
* handle punctuation marks
|
23
22
|
|
23
|
+
=== Changelog
|
24
|
+
* 0.2: Normalization can now be customized via an options hash.
|
25
|
+
* 0.1: Initial version
|
24
26
|
|
25
27
|
---
|
26
28
|
|
data/Rakefile
CHANGED
@@ -9,7 +9,7 @@ Rake::TestTask.new("test") do |t|
|
|
9
9
|
t.verbose = false
|
10
10
|
end
|
11
11
|
|
12
|
-
Echoe.new('string_normalizr', '0.
|
12
|
+
Echoe.new('string_normalizr', '0.2') do |p|
|
13
13
|
p.description = "Let String instances be conviently normalized"
|
14
14
|
p.url = "http://github.com/carpodaster/string_normalizr"
|
15
15
|
p.author = "Carsten Zimmermann"
|
data/lib/string_normalizr.rb
CHANGED
@@ -1,20 +1,110 @@
|
|
1
|
-
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require "rubygems"
|
3
|
+
module AegisNet # :nodoc:
|
2
4
|
module StringNormalizr
|
3
5
|
|
6
|
+
COLLATION = {
|
7
|
+
'Ä' => 'Ae',
|
8
|
+
'Æ' => 'Ae',
|
9
|
+
'Å' => 'A',
|
10
|
+
'À' => 'A',
|
11
|
+
'Á' => 'A',
|
12
|
+
'Â' => 'A',
|
13
|
+
'Ç' => 'C',
|
14
|
+
'È' => "E",
|
15
|
+
'É' => "E",
|
16
|
+
'Ê' => "E",
|
17
|
+
'Ë' => 'E',
|
18
|
+
'Í' => 'I',
|
19
|
+
'Ì' => 'I',
|
20
|
+
'Î' => 'I',
|
21
|
+
'Ï' => 'I',
|
22
|
+
'Ñ' => 'N',
|
23
|
+
'Ö' => 'Oe',
|
24
|
+
'Œ' => 'Oe',
|
25
|
+
'Ø' => 'O',
|
26
|
+
'Ô' => 'O',
|
27
|
+
'Ó' => 'O',
|
28
|
+
'Ò' => 'O',
|
29
|
+
'Ü' => 'Ue',
|
30
|
+
'Ú' => 'U',
|
31
|
+
'Ù' => 'U',
|
32
|
+
'Ÿ' => 'Y',
|
33
|
+
'ä' => 'ae',
|
34
|
+
'æ' => 'ae',
|
35
|
+
'å' => 'a',
|
36
|
+
'à' => 'a',
|
37
|
+
'á' => 'a',
|
38
|
+
'â' => 'a',
|
39
|
+
'ç' => 'c',
|
40
|
+
'è' => 'e',
|
41
|
+
'é' => 'e',
|
42
|
+
'ê' => 'e',
|
43
|
+
'ë' => 'e',
|
44
|
+
'í' => 'i',
|
45
|
+
'ì' => 'i',
|
46
|
+
'î' => 'i',
|
47
|
+
'ï' => 'i',
|
48
|
+
'ñ' => 'n',
|
49
|
+
'ö' => 'oe',
|
50
|
+
'œ' => 'oe',
|
51
|
+
'ø' => 'o',
|
52
|
+
'ô' => 'o',
|
53
|
+
'ó' => 'o',
|
54
|
+
'ò' => 'o',
|
55
|
+
'ü' => 'ue',
|
56
|
+
'ú' => 'u',
|
57
|
+
'ù' => 'u',
|
58
|
+
'ÿ' => 'y',
|
59
|
+
'ß' => 'ss',
|
60
|
+
}
|
61
|
+
|
4
62
|
def self.included(base)
|
5
63
|
base.send(:include, InstanceMethods)
|
6
64
|
end
|
7
65
|
|
8
|
-
|
9
66
|
module InstanceMethods
|
10
|
-
|
11
|
-
|
67
|
+
|
68
|
+
# Returns a new String based on pre-defined normalization rules
|
69
|
+
#
|
70
|
+
# == Parameters
|
71
|
+
# * +options+: optional Hash for normalization customization
|
72
|
+
#
|
73
|
+
# == Available options
|
74
|
+
# * <tt>:strip</tt> - trim leading and trailing whitespaces (true|false, default: true)
|
75
|
+
# * <tt>:replace_whitespaces</tt> - replace whitespaces within the string with +str+
|
76
|
+
# or set to +false+ to leave whitespaces alone. Makes little
|
77
|
+
# sense w/o :strip => true (str|false, default: "-")
|
78
|
+
#
|
79
|
+
# == Examples
|
80
|
+
# "This is án exåmple".normalize
|
81
|
+
# => "This-is-an-example"
|
82
|
+
#
|
83
|
+
# "Tëst string with träiling whitespaces ".normalize(:replace_whitespaces => false)
|
84
|
+
# => "Test string with traeiling whitespaces"
|
85
|
+
#
|
86
|
+
def normalize(options = {})
|
87
|
+
# shamelessly taken from ActiveSupport's Hash#assert_valid_keys
|
88
|
+
valid_keys = [:replace_whitespaces, :strip]
|
89
|
+
unknown_keys = options.keys - [valid_keys].flatten
|
90
|
+
raise(ArgumentError, "Unknown key(s): #{unknown_keys.join(", ")}") unless unknown_keys.empty?
|
91
|
+
|
92
|
+
# Default options
|
93
|
+
options = {
|
94
|
+
:downcase => false,
|
95
|
+
:strip => true,
|
96
|
+
:replace_whitespaces => "-"
|
97
|
+
}.merge(options)
|
98
|
+
|
99
|
+
n_str = AegisNet::StringNormalizr::COLLATION.inject(dup) {|str, (collate_from, collate_to)| str.gsub(collate_from, collate_to)}
|
100
|
+
n_str.strip! if options[:strip]
|
101
|
+
n_str.gsub!(/\s+/, options[:replace_whitespaces]) if options[:replace_whitespaces]
|
102
|
+
n_str
|
12
103
|
end
|
13
104
|
end
|
14
|
-
|
15
105
|
end
|
16
106
|
end
|
17
107
|
|
18
|
-
class String
|
108
|
+
class String # :nodoc:
|
19
109
|
include AegisNet::StringNormalizr
|
20
110
|
end
|
data/string_normalizr.gemspec
CHANGED
@@ -2,15 +2,15 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{string_normalizr}
|
5
|
-
s.version = "0.
|
5
|
+
s.version = "0.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Carsten Zimmermann"]
|
9
|
-
s.date = %q{2010-09-
|
9
|
+
s.date = %q{2010-09-20}
|
10
10
|
s.description = %q{Let String instances be conviently normalized}
|
11
11
|
s.email = %q{carp@hacksocke.de}
|
12
|
-
s.extra_rdoc_files = ["README.rdoc", "lib/string_normalizr.rb"]
|
13
|
-
s.files = ["README.rdoc", "Rakefile", "init.rb", "lib/string_normalizr.rb", "nbproject/private/rake-d.txt", "
|
12
|
+
s.extra_rdoc_files = ["LICENSE", "README.rdoc", "lib/string_normalizr.rb"]
|
13
|
+
s.files = ["LICENSE", "README.rdoc", "Rakefile", "init.rb", "lib/string_normalizr.rb", "nbproject/private/rake-d.txt", "test/string_normalizr_test.rb", "Manifest", "string_normalizr.gemspec"]
|
14
14
|
s.homepage = %q{http://github.com/carpodaster/string_normalizr}
|
15
15
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "String_normalizr", "--main", "README.rdoc"]
|
16
16
|
s.require_paths = ["lib"]
|
@@ -1,22 +1,89 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'test/unit'
|
2
3
|
require File.dirname(__FILE__) + "/../lib/string_normalizr"
|
3
4
|
|
4
5
|
class StringNormalizrTest < Test::Unit::TestCase
|
5
|
-
|
6
|
+
|
6
7
|
def test_string_integration
|
7
8
|
assert "some string".respond_to?(:normalize)
|
8
9
|
end
|
9
10
|
|
10
11
|
def test_whitespaces
|
12
|
+
assert_equal "This-is-an-example", "This is an example".normalize
|
13
|
+
assert_equal "This is an example", "This is an example".normalize(:replace_whitespaces => false)
|
14
|
+
assert_equal "This=is=an=example", "This is an example".normalize(:replace_whitespaces => "=")
|
15
|
+
|
11
16
|
assert_equal "foo", " foo \n \t".normalize
|
17
|
+
assert_equal "foo \n \t", "foo \n \t".normalize(:strip => false, :replace_whitespaces => false)
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_accents
|
21
|
+
assert_equal "a", "á".normalize
|
22
|
+
assert_equal "a", "à".normalize
|
23
|
+
assert_equal "a", "â".normalize
|
24
|
+
assert_equal "A", "Á".normalize
|
25
|
+
assert_equal "A", "À".normalize
|
26
|
+
assert_equal "A", "Â".normalize
|
27
|
+
assert_equal "e", "é".normalize
|
28
|
+
assert_equal "e", "è".normalize
|
29
|
+
assert_equal "E", "É".normalize
|
30
|
+
assert_equal "E", "È".normalize
|
31
|
+
assert_equal "i", "í".normalize
|
32
|
+
assert_equal "i", "ì".normalize
|
33
|
+
assert_equal "i", "î".normalize
|
34
|
+
assert_equal "I", "Í".normalize
|
35
|
+
assert_equal "I", "Ì".normalize
|
36
|
+
assert_equal "I", "Î".normalize
|
37
|
+
assert_equal "o", "ó".normalize
|
38
|
+
assert_equal "o", "ò".normalize
|
39
|
+
assert_equal "o", "ô".normalize
|
40
|
+
assert_equal "O", "Ó".normalize
|
41
|
+
assert_equal "O", "Ò".normalize
|
42
|
+
assert_equal "O", "Ô".normalize
|
43
|
+
assert_equal "u", "ú".normalize
|
44
|
+
assert_equal "u", "ù".normalize
|
45
|
+
assert_equal "U", "Ú".normalize
|
46
|
+
assert_equal "U", "Ù".normalize
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_umlauts
|
50
|
+
assert_equal "ae", "ä".normalize
|
51
|
+
assert_equal "Ae", "Ä".normalize
|
52
|
+
assert_equal "oe", "ö".normalize
|
53
|
+
assert_equal "Oe", "Ö".normalize
|
54
|
+
assert_equal "ue", "ü".normalize
|
55
|
+
assert_equal "Ue", "Ü".normalize
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_spanish_chars
|
59
|
+
assert_equal "c", "ç".normalize
|
60
|
+
assert_equal "C", "Ç".normalize
|
61
|
+
assert_equal "n", "ñ".normalize
|
62
|
+
assert_equal "N", "Ñ".normalize
|
63
|
+
end
|
64
|
+
|
65
|
+
def test_scandinavian_chars
|
66
|
+
assert_equal "a", "å".normalize
|
67
|
+
assert_equal "A", "Å".normalize
|
68
|
+
assert_equal "o", "ø".normalize
|
69
|
+
assert_equal "O", "Ø".normalize
|
12
70
|
end
|
13
71
|
|
14
|
-
def
|
15
|
-
assert_equal "
|
72
|
+
def test_ligatures
|
73
|
+
assert_equal "ae", "æ".normalize
|
74
|
+
assert_equal "Ae", "Æ".normalize
|
75
|
+
assert_equal "oe", "œ".normalize
|
76
|
+
assert_equal 'Oe', 'Œ'.normalize
|
77
|
+
assert_equal "ss", "ß".normalize
|
16
78
|
end
|
17
79
|
|
18
|
-
def
|
19
|
-
assert_equal "
|
80
|
+
def test_diaresises
|
81
|
+
assert_equal "e", "ë".normalize
|
82
|
+
assert_equal "E", "Ë".normalize
|
83
|
+
assert_equal "i", "ï".normalize
|
84
|
+
assert_equal "I", "Ï".normalize
|
85
|
+
assert_equal "y", "ÿ".normalize
|
86
|
+
assert_equal "Y", "Ÿ".normalize
|
20
87
|
end
|
21
88
|
|
22
89
|
end
|
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string_normalizr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: "0.
|
8
|
+
- 2
|
9
|
+
version: "0.2"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Carsten Zimmermann
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-09-
|
17
|
+
date: 2010-09-20 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -25,17 +25,19 @@ executables: []
|
|
25
25
|
extensions: []
|
26
26
|
|
27
27
|
extra_rdoc_files:
|
28
|
+
- LICENSE
|
28
29
|
- README.rdoc
|
29
30
|
- lib/string_normalizr.rb
|
30
31
|
files:
|
32
|
+
- LICENSE
|
31
33
|
- README.rdoc
|
32
34
|
- Rakefile
|
33
35
|
- init.rb
|
34
36
|
- lib/string_normalizr.rb
|
35
37
|
- nbproject/private/rake-d.txt
|
36
|
-
- string_normalizr.gemspec
|
37
38
|
- test/string_normalizr_test.rb
|
38
39
|
- Manifest
|
40
|
+
- string_normalizr.gemspec
|
39
41
|
has_rdoc: true
|
40
42
|
homepage: http://github.com/carpodaster/string_normalizr
|
41
43
|
licenses: []
|