string_normalizr 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +24 -0
- data/Manifest +1 -1
- data/README.rdoc +7 -5
- data/Rakefile +1 -1
- data/lib/string_normalizr.rb +96 -6
- data/string_normalizr.gemspec +4 -4
- data/test/string_normalizr_test.rb +72 -5
- metadata +7 -5
data/LICENSE
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
Copyright (c) 2010, Carsten Zimmermann
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
5
|
+
modification, are permitted provided that the following conditions are met:
|
6
|
+
* Redistributions of source code must retain the above copyright
|
7
|
+
notice, this list of conditions and the following disclaimer.
|
8
|
+
* Redistributions in binary form must reproduce the above copyright
|
9
|
+
notice, this list of conditions and the following disclaimer in the
|
10
|
+
documentation and/or other materials provided with the distribution.
|
11
|
+
* Neither the name of the original author / copyright holder nor the
|
12
|
+
names of its contributors may be used to endorse or promote products
|
13
|
+
derived from this software without specific prior written permission.
|
14
|
+
|
15
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
16
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
17
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
18
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
|
19
|
+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
20
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
21
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
22
|
+
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
23
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
24
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/Manifest
CHANGED
data/README.rdoc
CHANGED
@@ -6,7 +6,7 @@ of your String instance.
|
|
6
6
|
|
7
7
|
|
8
8
|
=== Installation
|
9
|
-
$ gem install
|
9
|
+
$ gem install string_normalizr
|
10
10
|
|
11
11
|
Or use it as a plugin with your Rails app:
|
12
12
|
|
@@ -15,12 +15,14 @@ Or use it as a plugin with your Rails app:
|
|
15
15
|
=== Usage
|
16
16
|
"hellö world".normalize
|
17
17
|
|
18
|
-
=== Caveats
|
19
|
-
* Tests seem to have difficulties with multibyte characters right now.
|
20
|
-
|
21
18
|
=== .plan
|
22
|
-
*
|
19
|
+
* support custom collation hash
|
20
|
+
* support more special chars by default
|
21
|
+
* handle punctuation marks
|
23
22
|
|
23
|
+
=== Changelog
|
24
|
+
* 0.2: Normalization can now be customized via an options hash.
|
25
|
+
* 0.1: Initial version
|
24
26
|
|
25
27
|
---
|
26
28
|
|
data/Rakefile
CHANGED
@@ -9,7 +9,7 @@ Rake::TestTask.new("test") do |t|
|
|
9
9
|
t.verbose = false
|
10
10
|
end
|
11
11
|
|
12
|
-
Echoe.new('string_normalizr', '0.
|
12
|
+
Echoe.new('string_normalizr', '0.2') do |p|
|
13
13
|
p.description = "Let String instances be conviently normalized"
|
14
14
|
p.url = "http://github.com/carpodaster/string_normalizr"
|
15
15
|
p.author = "Carsten Zimmermann"
|
data/lib/string_normalizr.rb
CHANGED
@@ -1,20 +1,110 @@
|
|
1
|
-
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require "rubygems"
|
3
|
+
module AegisNet # :nodoc:
|
2
4
|
module StringNormalizr
|
3
5
|
|
6
|
+
COLLATION = {
|
7
|
+
'Ä' => 'Ae',
|
8
|
+
'Æ' => 'Ae',
|
9
|
+
'Å' => 'A',
|
10
|
+
'À' => 'A',
|
11
|
+
'Á' => 'A',
|
12
|
+
'Â' => 'A',
|
13
|
+
'Ç' => 'C',
|
14
|
+
'È' => "E",
|
15
|
+
'É' => "E",
|
16
|
+
'Ê' => "E",
|
17
|
+
'Ë' => 'E',
|
18
|
+
'Í' => 'I',
|
19
|
+
'Ì' => 'I',
|
20
|
+
'Î' => 'I',
|
21
|
+
'Ï' => 'I',
|
22
|
+
'Ñ' => 'N',
|
23
|
+
'Ö' => 'Oe',
|
24
|
+
'Œ' => 'Oe',
|
25
|
+
'Ø' => 'O',
|
26
|
+
'Ô' => 'O',
|
27
|
+
'Ó' => 'O',
|
28
|
+
'Ò' => 'O',
|
29
|
+
'Ü' => 'Ue',
|
30
|
+
'Ú' => 'U',
|
31
|
+
'Ù' => 'U',
|
32
|
+
'Ÿ' => 'Y',
|
33
|
+
'ä' => 'ae',
|
34
|
+
'æ' => 'ae',
|
35
|
+
'å' => 'a',
|
36
|
+
'à' => 'a',
|
37
|
+
'á' => 'a',
|
38
|
+
'â' => 'a',
|
39
|
+
'ç' => 'c',
|
40
|
+
'è' => 'e',
|
41
|
+
'é' => 'e',
|
42
|
+
'ê' => 'e',
|
43
|
+
'ë' => 'e',
|
44
|
+
'í' => 'i',
|
45
|
+
'ì' => 'i',
|
46
|
+
'î' => 'i',
|
47
|
+
'ï' => 'i',
|
48
|
+
'ñ' => 'n',
|
49
|
+
'ö' => 'oe',
|
50
|
+
'œ' => 'oe',
|
51
|
+
'ø' => 'o',
|
52
|
+
'ô' => 'o',
|
53
|
+
'ó' => 'o',
|
54
|
+
'ò' => 'o',
|
55
|
+
'ü' => 'ue',
|
56
|
+
'ú' => 'u',
|
57
|
+
'ù' => 'u',
|
58
|
+
'ÿ' => 'y',
|
59
|
+
'ß' => 'ss',
|
60
|
+
}
|
61
|
+
|
4
62
|
def self.included(base)
|
5
63
|
base.send(:include, InstanceMethods)
|
6
64
|
end
|
7
65
|
|
8
|
-
|
9
66
|
module InstanceMethods
|
10
|
-
|
11
|
-
|
67
|
+
|
68
|
+
# Returns a new String based on pre-defined normalization rules
|
69
|
+
#
|
70
|
+
# == Parameters
|
71
|
+
# * +options+: optional Hash for normalization customization
|
72
|
+
#
|
73
|
+
# == Available options
|
74
|
+
# * <tt>:strip</tt> - trim leading and trailing whitespaces (true|false, default: true)
|
75
|
+
# * <tt>:replace_whitespaces</tt> - replace whitespaces within the string with +str+
|
76
|
+
# or set to +false+ to leave whitespaces alone. Makes little
|
77
|
+
# sense w/o :strip => true (str|false, default: "-")
|
78
|
+
#
|
79
|
+
# == Examples
|
80
|
+
# "This is án exåmple".normalize
|
81
|
+
# => "This-is-an-example"
|
82
|
+
#
|
83
|
+
# "Tëst string with träiling whitespaces ".normalize(:replace_whitespaces => false)
|
84
|
+
# => "Test string with traeiling whitespaces"
|
85
|
+
#
|
86
|
+
def normalize(options = {})
|
87
|
+
# shamelessly taken from ActiveSupport::CoreExtensions::Hash::Keys#assert_valid_keys
|
88
|
+
valid_keys = [:replace_whitespaces, :strip]
|
89
|
+
unknown_keys = options.keys - [valid_keys].flatten
|
90
|
+
raise(ArgumentError, "Unknown key(s): #{unknown_keys.join(", ")}") unless unknown_keys.empty?
|
91
|
+
|
92
|
+
# Default options
|
93
|
+
options = {
|
94
|
+
:downcase => false,
|
95
|
+
:strip => true,
|
96
|
+
:replace_whitespaces => "-"
|
97
|
+
}.merge(options)
|
98
|
+
|
99
|
+
n_str = AegisNet::StringNormalizr::COLLATION.inject(dup) {|str, (collate_from, collate_to)| str.gsub(collate_from, collate_to)}
|
100
|
+
n_str.strip! if options[:strip]
|
101
|
+
n_str.gsub!(/\s+/, options[:replace_whitespaces]) if options[:replace_whitespaces]
|
102
|
+
n_str
|
12
103
|
end
|
13
104
|
end
|
14
|
-
|
15
105
|
end
|
16
106
|
end
|
17
107
|
|
18
|
-
class String
|
108
|
+
class String # :nodoc:
|
19
109
|
include AegisNet::StringNormalizr
|
20
110
|
end
|
data/string_normalizr.gemspec
CHANGED
@@ -2,15 +2,15 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{string_normalizr}
|
5
|
-
s.version = "0.
|
5
|
+
s.version = "0.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Carsten Zimmermann"]
|
9
|
-
s.date = %q{2010-09-
|
9
|
+
s.date = %q{2010-09-20}
|
10
10
|
s.description = %q{Let String instances be conviently normalized}
|
11
11
|
s.email = %q{carp@hacksocke.de}
|
12
|
-
s.extra_rdoc_files = ["README.rdoc", "lib/string_normalizr.rb"]
|
13
|
-
s.files = ["README.rdoc", "Rakefile", "init.rb", "lib/string_normalizr.rb", "nbproject/private/rake-d.txt", "
|
12
|
+
s.extra_rdoc_files = ["LICENSE", "README.rdoc", "lib/string_normalizr.rb"]
|
13
|
+
s.files = ["LICENSE", "README.rdoc", "Rakefile", "init.rb", "lib/string_normalizr.rb", "nbproject/private/rake-d.txt", "test/string_normalizr_test.rb", "Manifest", "string_normalizr.gemspec"]
|
14
14
|
s.homepage = %q{http://github.com/carpodaster/string_normalizr}
|
15
15
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "String_normalizr", "--main", "README.rdoc"]
|
16
16
|
s.require_paths = ["lib"]
|
@@ -1,22 +1,89 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'test/unit'
|
2
3
|
require File.dirname(__FILE__) + "/../lib/string_normalizr"
|
3
4
|
|
4
5
|
class StringNormalizrTest < Test::Unit::TestCase
|
5
|
-
|
6
|
+
|
6
7
|
def test_string_integration
|
7
8
|
assert "some string".respond_to?(:normalize)
|
8
9
|
end
|
9
10
|
|
10
11
|
def test_whitespaces
|
12
|
+
assert_equal "This-is-an-example", "This is an example".normalize
|
13
|
+
assert_equal "This is an example", "This is an example".normalize(:replace_whitespaces => false)
|
14
|
+
assert_equal "This=is=an=example", "This is an example".normalize(:replace_whitespaces => "=")
|
15
|
+
|
11
16
|
assert_equal "foo", " foo \n \t".normalize
|
17
|
+
assert_equal "foo \n \t", "foo \n \t".normalize(:strip => false, :replace_whitespaces => false)
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_accents
|
21
|
+
assert_equal "a", "á".normalize
|
22
|
+
assert_equal "a", "à".normalize
|
23
|
+
assert_equal "a", "â".normalize
|
24
|
+
assert_equal "A", "Á".normalize
|
25
|
+
assert_equal "A", "À".normalize
|
26
|
+
assert_equal "A", "Â".normalize
|
27
|
+
assert_equal "e", "é".normalize
|
28
|
+
assert_equal "e", "è".normalize
|
29
|
+
assert_equal "E", "É".normalize
|
30
|
+
assert_equal "E", "È".normalize
|
31
|
+
assert_equal "i", "í".normalize
|
32
|
+
assert_equal "i", "ì".normalize
|
33
|
+
assert_equal "i", "î".normalize
|
34
|
+
assert_equal "I", "Í".normalize
|
35
|
+
assert_equal "I", "Ì".normalize
|
36
|
+
assert_equal "I", "Î".normalize
|
37
|
+
assert_equal "o", "ó".normalize
|
38
|
+
assert_equal "o", "ò".normalize
|
39
|
+
assert_equal "o", "ô".normalize
|
40
|
+
assert_equal "O", "Ó".normalize
|
41
|
+
assert_equal "O", "Ò".normalize
|
42
|
+
assert_equal "O", "Ô".normalize
|
43
|
+
assert_equal "u", "ú".normalize
|
44
|
+
assert_equal "u", "ù".normalize
|
45
|
+
assert_equal "U", "Ú".normalize
|
46
|
+
assert_equal "U", "Ù".normalize
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_umlauts
|
50
|
+
assert_equal "ae", "ä".normalize
|
51
|
+
assert_equal "Ae", "Ä".normalize
|
52
|
+
assert_equal "oe", "ö".normalize
|
53
|
+
assert_equal "Oe", "Ö".normalize
|
54
|
+
assert_equal "ue", "ü".normalize
|
55
|
+
assert_equal "Ue", "Ü".normalize
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_spanish_chars
|
59
|
+
assert_equal "c", "ç".normalize
|
60
|
+
assert_equal "C", "Ç".normalize
|
61
|
+
assert_equal "n", "ñ".normalize
|
62
|
+
assert_equal "N", "Ñ".normalize
|
63
|
+
end
|
64
|
+
|
65
|
+
def test_scandinavian_chars
|
66
|
+
assert_equal "a", "å".normalize
|
67
|
+
assert_equal "A", "Å".normalize
|
68
|
+
assert_equal "o", "ø".normalize
|
69
|
+
assert_equal "O", "Ø".normalize
|
12
70
|
end
|
13
71
|
|
14
|
-
def
|
15
|
-
assert_equal "
|
72
|
+
def test_ligatures
|
73
|
+
assert_equal "ae", "æ".normalize
|
74
|
+
assert_equal "Ae", "Æ".normalize
|
75
|
+
assert_equal "oe", "œ".normalize
|
76
|
+
assert_equal 'Oe', 'Œ'.normalize
|
77
|
+
assert_equal "ss", "ß".normalize
|
16
78
|
end
|
17
79
|
|
18
|
-
def
|
19
|
-
assert_equal "
|
80
|
+
def test_diaresises
|
81
|
+
assert_equal "e", "ë".normalize
|
82
|
+
assert_equal "E", "Ë".normalize
|
83
|
+
assert_equal "i", "ï".normalize
|
84
|
+
assert_equal "I", "Ï".normalize
|
85
|
+
assert_equal "y", "ÿ".normalize
|
86
|
+
assert_equal "Y", "Ÿ".normalize
|
20
87
|
end
|
21
88
|
|
22
89
|
end
|
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string_normalizr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: "0.
|
8
|
+
- 2
|
9
|
+
version: "0.2"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Carsten Zimmermann
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-09-
|
17
|
+
date: 2010-09-20 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -25,17 +25,19 @@ executables: []
|
|
25
25
|
extensions: []
|
26
26
|
|
27
27
|
extra_rdoc_files:
|
28
|
+
- LICENSE
|
28
29
|
- README.rdoc
|
29
30
|
- lib/string_normalizr.rb
|
30
31
|
files:
|
32
|
+
- LICENSE
|
31
33
|
- README.rdoc
|
32
34
|
- Rakefile
|
33
35
|
- init.rb
|
34
36
|
- lib/string_normalizr.rb
|
35
37
|
- nbproject/private/rake-d.txt
|
36
|
-
- string_normalizr.gemspec
|
37
38
|
- test/string_normalizr_test.rb
|
38
39
|
- Manifest
|
40
|
+
- string_normalizr.gemspec
|
39
41
|
has_rdoc: true
|
40
42
|
homepage: http://github.com/carpodaster/string_normalizr
|
41
43
|
licenses: []
|