fuzzy-string-match 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +1 -0
- data/README.md +25 -20
- data/Rakefile +87 -0
- data/VERSION.yml +4 -0
- data/lib/fuzzystringmatch.rb +21 -174
- data/lib/fuzzystringmatch/inline.rb +18 -0
- data/lib/fuzzystringmatch/inline/jarowinkler.rb +113 -0
- data/lib/fuzzystringmatch/pure.rb +18 -0
- data/lib/fuzzystringmatch/pure/jarowinkler.rb +101 -0
- data/test/basic_native_spec.rb +62 -0
- data/test/basic_pure_spec.rb +62 -0
- data/test/mutibyte_spec.rb +58 -0
- metadata +64 -76
- data/benchmark/vs_amatch.rb +0 -54
- data/test/fuzzystringmatch_spec.rb +0 -139
data/.gemtest
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
|
data/README.md
CHANGED
@@ -1,51 +1,54 @@
|
|
1
1
|
# What is fuzzy-string-match
|
2
|
+
[](http://travis-ci.org/kiyoka/fuzzy-string-match)
|
2
3
|
|
3
4
|
* fuzzy-string-match is a fuzzy string matching library for ruby.
|
4
5
|
* It is fast. ( written in C with RubyInline )
|
5
6
|
* It supports only Jaro-Winkler distance algorithm.
|
6
7
|
* This program was ported by hand from lucene-3.0.2. (Lucene is a Java product)
|
7
|
-
* If you want to add another string distance algorithm, please
|
8
|
+
* If you want to add another string distance algorithm, please fork it on GitHub and port it yourself.
|
8
9
|
|
9
10
|
## The reason why i developed fuzzy-string-match
|
10
11
|
* I tried amatch-0.2.5, but it contains some issues.
|
11
|
-
1.
|
12
|
+
1. memory leaks.
|
12
13
|
2. I found it difficult to maintain.
|
13
14
|
* So, I decided to create another gem by porting lucene-3.0.x.
|
14
15
|
|
15
|
-
## Installing
|
16
|
-
|
16
|
+
## Installing
|
17
|
+
|
18
|
+
gem install fuzzy-string-match
|
19
|
+
|
20
|
+
## Installing (pure ruby version)
|
21
|
+
|
22
|
+
gem install fuzzy-string-match_pure
|
17
23
|
|
18
24
|
## Features
|
19
25
|
* Calculate Jaro-Winkler distance of two strings.
|
20
26
|
* Pure ruby version can handle both ASCII and UTF8 strings. (but it is slow)
|
21
|
-
* Native version can only ASCII strings. (
|
27
|
+
* Native version can only handle ASCII strings. (but it is fast)
|
22
28
|
|
23
29
|
## Sample code
|
24
|
-
* Native version
|
25
30
|
|
26
|
-
|
31
|
+
### Native version
|
32
|
+
|
27
33
|
require 'fuzzystringmatch'
|
28
|
-
jarow = FuzzyStringMatch::JaroWinkler.
|
34
|
+
jarow = FuzzyStringMatch::JaroWinkler.create( :native )
|
29
35
|
p jarow.getDistance( "jones", "johnson" )
|
30
|
-
</code>
|
31
36
|
|
32
|
-
|
37
|
+
### Pure ruby version
|
33
38
|
|
34
|
-
<code>
|
35
39
|
require 'fuzzystringmatch'
|
36
|
-
jarow = FuzzyStringMatch::JaroWinkler.
|
37
|
-
p jarow.getDistance(
|
38
|
-
|
40
|
+
jarow = FuzzyStringMatch::JaroWinkler.create( :pure )
|
41
|
+
p jarow.getDistance( "jones", "johnson" )
|
42
|
+
p jarow.getDistance( "ああ", "あい" )
|
39
43
|
|
40
44
|
## Sample on irb
|
41
45
|
|
42
|
-
<code>
|
43
46
|
irb(main):001:0> require 'fuzzystringmatch'
|
44
47
|
require 'fuzzystringmatch'
|
45
48
|
=> true
|
46
49
|
|
47
|
-
irb(main):002:0> jarow = FuzzyStringMatch::JaroWinkler.
|
48
|
-
jarow = FuzzyStringMatch::JaroWinkler.
|
50
|
+
irb(main):002:0> jarow = FuzzyStringMatch::JaroWinkler.create( :native )
|
51
|
+
jarow = FuzzyStringMatch::JaroWinkler.create( :native )
|
49
52
|
=> #<FuzzyStringMatch::JaroWinklerNative:0x000001011b0010>
|
50
53
|
|
51
54
|
irb(main):003:0> jarow.getDistance( "al", "al" )
|
@@ -55,11 +58,9 @@
|
|
55
58
|
irb(main):004:0> jarow.getDistance( "dixon", "dicksonx" )
|
56
59
|
jarow.getDistance( "dixon", "dicksonx" )
|
57
60
|
=> 0.8133333333333332
|
58
|
-
</code>
|
59
61
|
|
60
62
|
## Benchmarks
|
61
63
|
|
62
|
-
<console>
|
63
64
|
$ rake bench
|
64
65
|
ruby ./benchmark/vs_amatch.rb
|
65
66
|
---
|
@@ -74,12 +75,16 @@
|
|
74
75
|
[this Module (native)]
|
75
76
|
user system total real
|
76
77
|
0.480000 0.000000 0.480000 ( 0.484187)
|
77
|
-
</console>
|
78
78
|
|
79
79
|
## Requires
|
80
|
+
|
81
|
+
### for CRuby
|
80
82
|
- RubyInline
|
81
83
|
- Ruby 1.9.1 or higher
|
82
84
|
|
85
|
+
### for JRuby
|
86
|
+
- JRuby 1.6.6 or higher
|
87
|
+
|
83
88
|
## Author
|
84
89
|
- Copyright (C) Kiyoka Nishiyama <kiyoka@sumibi.org>
|
85
90
|
- I ported from java source code of lucene-3.0.2.
|
data/Rakefile
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
#-*- mode: ruby; -*-
|
2
|
+
#
|
3
|
+
# Release Engineering
|
4
|
+
# 1. edit the VERSION.yml file
|
5
|
+
# 2. rake test
|
6
|
+
# 3. rake gemspec && rake build
|
7
|
+
# to generate fuzzy-string-match-x.x.x.gem
|
8
|
+
# 4. install fuzzy-string-match-x.x.x.gem to clean environment and test
|
9
|
+
# 5. rake release
|
10
|
+
# 6. gem push pkg/fuzzy-string-match-x.x.x.gem ( need gem version 1.3.6 or higher. Please "gem update --system" to update )
|
11
|
+
#
|
12
|
+
# for Development
|
13
|
+
# rake test_dev
|
14
|
+
# rake benchmark
|
15
|
+
|
16
|
+
require 'rake'
|
17
|
+
begin
|
18
|
+
require 'jeweler'
|
19
|
+
|
20
|
+
Jeweler::Tasks.new do |gemspec|
|
21
|
+
gemspec.name = "fuzzy-string-match_pure"
|
22
|
+
gemspec.summary = "fuzzy string matching library (Pure ruby version)"
|
23
|
+
gemspec.description = "calculate Jaro Winkler distance."
|
24
|
+
gemspec.email = "kiyoka@sumibi.org"
|
25
|
+
gemspec.homepage = "http://github.com/kiyoka/fuzzy-string-match"
|
26
|
+
gemspec.authors = ["Kiyoka Nishiyama"]
|
27
|
+
gemspec.files = FileList['.gemtest',
|
28
|
+
'Rakefile',
|
29
|
+
'VERSION.yml',
|
30
|
+
'lib/fuzzystringmatch/pure/jarowinkler.rb',
|
31
|
+
'lib/fuzzystringmatch/pure.rb',
|
32
|
+
'lib/fuzzystringmatch.rb',
|
33
|
+
'lib/*.rb',
|
34
|
+
'test/basic_pure_spec.rb',
|
35
|
+
'test/mutibyte_spec.rb',
|
36
|
+
'LICENSE.txt',
|
37
|
+
'README.md'].to_a
|
38
|
+
gemspec.add_dependency( "rspec" )
|
39
|
+
gemspec.required_ruby_version = '>= 1.9.1'
|
40
|
+
end
|
41
|
+
|
42
|
+
Jeweler::Tasks.new do |gemspec|
|
43
|
+
gemspec.name = "fuzzy-string-match"
|
44
|
+
gemspec.summary = "fuzzy string matching library"
|
45
|
+
gemspec.description = "calculate Jaro Winkler distance."
|
46
|
+
gemspec.email = "kiyoka@sumibi.org"
|
47
|
+
gemspec.homepage = "http://github.com/kiyoka/fuzzy-string-match"
|
48
|
+
gemspec.authors = ["Kiyoka Nishiyama"]
|
49
|
+
gemspec.files = FileList['.gemtest',
|
50
|
+
'Rakefile',
|
51
|
+
'VERSION.yml',
|
52
|
+
'lib/fuzzystringmatch/inline/jarowinkler.rb',
|
53
|
+
'lib/fuzzystringmatch/inline.rb',
|
54
|
+
'lib/fuzzystringmatch/pure/jarowinkler.rb',
|
55
|
+
'lib/fuzzystringmatch/pure.rb',
|
56
|
+
'lib/fuzzystringmatch.rb',
|
57
|
+
'test/basic_native_spec.rb',
|
58
|
+
'test/basic_pure_spec.rb',
|
59
|
+
'test/mutibyte_spec.rb',
|
60
|
+
'LICENSE.txt',
|
61
|
+
'README.md'].to_a
|
62
|
+
gemspec.add_dependency( "rspec" )
|
63
|
+
gemspec.add_dependency( 'RubyInline', '>= 3.8.6')
|
64
|
+
gemspec.required_ruby_version = '>= 1.9.1'
|
65
|
+
end
|
66
|
+
|
67
|
+
rescue LoadError
|
68
|
+
puts "Jeweler not available. Install it with: sudo gem install jeweler"
|
69
|
+
end
|
70
|
+
|
71
|
+
task :default => [:test] do
|
72
|
+
end
|
73
|
+
|
74
|
+
task :test do
|
75
|
+
sh "ruby -I ./lib `which rspec` -b ./test/basic_native_spec.rb" if File.exist?( "./test/basic_native_spec.rb" )
|
76
|
+
sh "ruby -I ./lib `which rspec` -b ./test/basic_pure_spec.rb"
|
77
|
+
sh "ruby -I ./lib `which rspec` -b ./test/mutibyte_spec.rb"
|
78
|
+
end
|
79
|
+
|
80
|
+
task :test_dev do
|
81
|
+
sh "ruby -I ./lib `which rspec` -b ./test/verify_with_amatch_spec.rb"
|
82
|
+
end
|
83
|
+
|
84
|
+
task :bench do
|
85
|
+
sh "ruby ./benchmark/vs_amatch.rb"
|
86
|
+
end
|
87
|
+
|
data/VERSION.yml
ADDED
data/lib/fuzzystringmatch.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# Fuzzy String Match
|
3
3
|
#
|
4
|
-
# Copyright 2010 Kiyoka Nishiyama
|
4
|
+
# Copyright 2010-2011 Kiyoka Nishiyama
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
7
7
|
# you may not use this file except in compliance with the License.
|
@@ -15,187 +15,34 @@
|
|
15
15
|
# See the License for the specific language governing permissions and
|
16
16
|
# limitations under the License.
|
17
17
|
#
|
18
|
-
|
18
|
+
require 'fuzzystringmatch/pure'
|
19
|
+
begin
|
20
|
+
if RUBY_PLATFORM == "java"
|
21
|
+
STDERR.puts "fuzzy-string-match Warning: native version is disabled on java platform. falled back to pure ruby version..."
|
22
|
+
else
|
23
|
+
require 'fuzzystringmatch/inline'
|
24
|
+
end
|
25
|
+
rescue LoadError
|
26
|
+
end
|
19
27
|
|
28
|
+
module FuzzyStringMatch
|
20
29
|
class JaroWinkler
|
21
|
-
def create( type = :pure ) # factory method
|
30
|
+
def self.create( type = :pure ) # factory method
|
22
31
|
case type
|
23
32
|
when :pure
|
24
|
-
JaroWinklerPure.new
|
33
|
+
FuzzyStringMatch::JaroWinklerPure.new
|
25
34
|
when :native
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
class JaroWinklerPure
|
32
|
-
THRESHOLD = 0.7
|
33
|
-
|
34
|
-
def getDistance( s1, s2 )
|
35
|
-
a1 = s1.split( // )
|
36
|
-
a2 = s2.split( // )
|
37
|
-
|
38
|
-
if s1.size > s2.size
|
39
|
-
(max,min) = a1,a2
|
40
|
-
else
|
41
|
-
(max,min) = a2,a1
|
42
|
-
end
|
43
|
-
|
44
|
-
range = [ (max.size / 2 - 1), 0 ].max
|
45
|
-
indexes = Array.new( min.size, -1 )
|
46
|
-
flags = Array.new( max.size, false )
|
47
|
-
|
48
|
-
matches = 0;
|
49
|
-
(0 ... min.size).each { |mi|
|
50
|
-
c1 = min[mi]
|
51
|
-
xi = [mi - range, 0].max
|
52
|
-
xn = [mi + range + 1, max.size].min
|
53
|
-
|
54
|
-
(xi ... xn).each { |i|
|
55
|
-
if (not flags[i]) && ( c1 == max[i] )
|
56
|
-
indexes[mi] = i
|
57
|
-
flags[i] = true
|
58
|
-
matches += 1
|
59
|
-
break
|
60
|
-
end
|
61
|
-
}
|
62
|
-
}
|
63
|
-
|
64
|
-
ms1 = Array.new( matches, nil )
|
65
|
-
ms2 = Array.new( matches, nil )
|
66
|
-
|
67
|
-
si = 0
|
68
|
-
(0 ... min.size).each { |i|
|
69
|
-
if (indexes[i] != -1)
|
70
|
-
ms1[si] = min[i]
|
71
|
-
si += 1
|
35
|
+
begin
|
36
|
+
FuzzyStringMatch::JaroWinklerInline.new
|
37
|
+
rescue NameError
|
38
|
+
STDERR.puts "fuzzy-string-match Warning: native version is disabled. falled back to pure ruby version..."
|
39
|
+
FuzzyStringMatch::JaroWinklerPure.new
|
72
40
|
end
|
73
|
-
}
|
74
|
-
|
75
|
-
si = 0
|
76
|
-
(0 ... max.size).each { |i|
|
77
|
-
if flags[i]
|
78
|
-
ms2[si] = max[i]
|
79
|
-
si += 1
|
80
|
-
end
|
81
|
-
}
|
82
|
-
|
83
|
-
transpositions = 0
|
84
|
-
(0 ... ms1.size).each { |mi|
|
85
|
-
if ms1[mi] != ms2[mi]
|
86
|
-
transpositions += 1
|
87
|
-
end
|
88
|
-
}
|
89
|
-
|
90
|
-
prefix = 0
|
91
|
-
(0 ... min.size).each { |mi|
|
92
|
-
if s1[mi] == s2[mi]
|
93
|
-
prefix += 1
|
94
|
-
else
|
95
|
-
break
|
96
|
-
end
|
97
|
-
}
|
98
|
-
|
99
|
-
if 0 == matches
|
100
|
-
0.0
|
101
|
-
else
|
102
|
-
m = matches.to_f
|
103
|
-
t = (transpositions/ 2)
|
104
|
-
j = ((m / s1.size) + (m / s2.size) + ((m - t) / m)) / 3.0;
|
105
|
-
return j < THRESHOLD ? j : j + [0.1, 1.0 / max.size].min * prefix * (1 - j)
|
106
41
|
end
|
107
42
|
end
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
class JaroWinklerNative
|
112
|
-
inline do |builder|
|
113
|
-
builder.include '<iostream>'
|
114
|
-
builder.add_compile_flags '-x c++', '-lstdc++'
|
115
|
-
builder.c_raw 'int max( int a, int b ) { return ((a)>(b)?(a):(b)); }'
|
116
|
-
builder.c_raw 'int min( int a, int b ) { return ((a)<(b)?(a):(b)); }'
|
117
|
-
builder.c_raw 'double double_min( double a, double b ) { return ((a)<(b)?(a):(b)); }'
|
118
|
-
builder.c '
|
119
|
-
double getDistance( char *s1, char *s2 )
|
120
|
-
{
|
121
|
-
char *_max;
|
122
|
-
char *_min;
|
123
|
-
int _max_length = 0;
|
124
|
-
int _min_length = 0;
|
125
|
-
if ( strlen(s1) > strlen(s2) ) {
|
126
|
-
_max = s1; _max_length = strlen(s1);
|
127
|
-
_min = s2; _min_length = strlen(s2);
|
128
|
-
}
|
129
|
-
else {
|
130
|
-
_max = s2; _max_length = strlen(s2);
|
131
|
-
_min = s1; _min_length = strlen(s1);
|
132
|
-
}
|
133
|
-
int range = max( _max_length / 2 - 1, 0 );
|
134
|
-
|
135
|
-
int indexes[_min_length];
|
136
|
-
for( int i = 0 ; i < _min_length ; i++ ) {
|
137
|
-
indexes[i] = -1;
|
138
|
-
}
|
139
|
-
|
140
|
-
int flags[_max_length];
|
141
|
-
for( int i = 0 ; i < _max_length ; i++ ) {
|
142
|
-
flags[i] = 0;
|
143
|
-
}
|
144
|
-
int matches = 0;
|
145
|
-
for (int mi = 0; mi < _min_length; mi++) {
|
146
|
-
char c1 = _min[mi];
|
147
|
-
for (int xi = max(mi - range, 0), xn = min(mi + range + 1, _max_length); xi < xn; xi++ ) {
|
148
|
-
if (!flags[xi] && (c1 == _max[xi])) {
|
149
|
-
indexes[mi] = xi;
|
150
|
-
flags[xi] = 1;
|
151
|
-
matches++;
|
152
|
-
break;
|
153
|
-
}
|
154
|
-
}
|
155
|
-
}
|
156
|
-
|
157
|
-
char ms1[matches];
|
158
|
-
char ms2[matches];
|
159
|
-
int ms1_length = matches;
|
160
|
-
|
161
|
-
for (int i = 0, si = 0; i < _min_length; i++) {
|
162
|
-
if (indexes[i] != -1) {
|
163
|
-
ms1[si] = _min[i];
|
164
|
-
si++;
|
165
|
-
}
|
166
|
-
}
|
167
|
-
for (int i = 0, si = 0; i < _max_length; i++) {
|
168
|
-
if (flags[i]) {
|
169
|
-
ms2[si] = _max[i];
|
170
|
-
si++;
|
171
|
-
}
|
172
|
-
}
|
173
|
-
int transpositions = 0;
|
174
|
-
for (int mi = 0; mi < ms1_length; mi++) {
|
175
|
-
if (ms1[mi] != ms2[mi]) {
|
176
|
-
transpositions++;
|
177
|
-
}
|
178
|
-
}
|
179
|
-
int prefix = 0;
|
180
|
-
for (int mi = 0; mi < _min_length; mi++) {
|
181
|
-
if (s1[mi] == s2[mi]) {
|
182
|
-
prefix++;
|
183
|
-
} else {
|
184
|
-
break;
|
185
|
-
}
|
186
|
-
}
|
187
|
-
|
188
|
-
double m = (double) matches;
|
189
|
-
if (matches == 0) {
|
190
|
-
return 0.0;
|
191
|
-
}
|
192
|
-
int t = transpositions / 2;
|
193
|
-
double j = ((m / strlen(s1) + m / strlen(s2) + (m - t) / m)) / 3;
|
194
|
-
double jw = j < 0.7 ? j : j + double_min(0.1, 1.0 / _max_length) * prefix
|
195
|
-
* (1 - j);
|
196
|
-
return jw;
|
197
|
-
}'
|
43
|
+
def create( type = :pure ) # this is obsolete
|
44
|
+
STDERR.puts "fuzzy-string-match Warning: FuzzyStringMatch.new.create() is obsolute, please use FuzzyStringMatch.create() ..."
|
45
|
+
FuzzyStringMatch::JaroWinkler.create( type )
|
198
46
|
end
|
199
47
|
end
|
200
|
-
|
201
48
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#
|
2
|
+
# Fuzzy String Match
|
3
|
+
#
|
4
|
+
# Copyright 2010-2011 Kiyoka Nishiyama
|
5
|
+
#
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7
|
+
# you may not use this file except in compliance with the License.
|
8
|
+
# You may obtain a copy of the License at
|
9
|
+
#
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
#
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
# See the License for the specific language governing permissions and
|
16
|
+
# limitations under the License.
|
17
|
+
#
|
18
|
+
require 'fuzzystringmatch/inline/jarowinkler'
|
@@ -0,0 +1,113 @@
|
|
1
|
+
#
|
2
|
+
# Fuzzy String Match
|
3
|
+
#
|
4
|
+
# Copyright 2010-2011 Kiyoka Nishiyama
|
5
|
+
#
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7
|
+
# you may not use this file except in compliance with the License.
|
8
|
+
# You may obtain a copy of the License at
|
9
|
+
#
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
#
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
# See the License for the specific language governing permissions and
|
16
|
+
# limitations under the License.
|
17
|
+
#
|
18
|
+
module FuzzyStringMatch
|
19
|
+
require 'inline'
|
20
|
+
class JaroWinklerInline
|
21
|
+
def pure?
|
22
|
+
false
|
23
|
+
end
|
24
|
+
|
25
|
+
inline do |builder|
|
26
|
+
builder.include '<iostream>'
|
27
|
+
builder.add_compile_flags '-x c++', '-lstdc++'
|
28
|
+
builder.c_raw 'int max( int a, int b ) { return ((a)>(b)?(a):(b)); }'
|
29
|
+
builder.c_raw 'int min( int a, int b ) { return ((a)<(b)?(a):(b)); }'
|
30
|
+
builder.c_raw 'double double_min( double a, double b ) { return ((a)<(b)?(a):(b)); }'
|
31
|
+
builder.c '
|
32
|
+
double getDistance( char *s1, char *s2 )
|
33
|
+
{
|
34
|
+
char *_max;
|
35
|
+
char *_min;
|
36
|
+
int _max_length = 0;
|
37
|
+
int _min_length = 0;
|
38
|
+
if ( strlen(s1) > strlen(s2) ) {
|
39
|
+
_max = s1; _max_length = strlen(s1);
|
40
|
+
_min = s2; _min_length = strlen(s2);
|
41
|
+
}
|
42
|
+
else {
|
43
|
+
_max = s2; _max_length = strlen(s2);
|
44
|
+
_min = s1; _min_length = strlen(s1);
|
45
|
+
}
|
46
|
+
int range = max( _max_length / 2 - 1, 0 );
|
47
|
+
|
48
|
+
int indexes[_min_length];
|
49
|
+
for( int i = 0 ; i < _min_length ; i++ ) {
|
50
|
+
indexes[i] = -1;
|
51
|
+
}
|
52
|
+
|
53
|
+
int flags[_max_length];
|
54
|
+
for( int i = 0 ; i < _max_length ; i++ ) {
|
55
|
+
flags[i] = 0;
|
56
|
+
}
|
57
|
+
int matches = 0;
|
58
|
+
for (int mi = 0; mi < _min_length; mi++) {
|
59
|
+
char c1 = _min[mi];
|
60
|
+
for (int xi = max(mi - range, 0), xn = min(mi + range + 1, _max_length); xi < xn; xi++ ) {
|
61
|
+
if (!flags[xi] && (c1 == _max[xi])) {
|
62
|
+
indexes[mi] = xi;
|
63
|
+
flags[xi] = 1;
|
64
|
+
matches++;
|
65
|
+
break;
|
66
|
+
}
|
67
|
+
}
|
68
|
+
}
|
69
|
+
|
70
|
+
char ms1[matches];
|
71
|
+
char ms2[matches];
|
72
|
+
int ms1_length = matches;
|
73
|
+
|
74
|
+
for (int i = 0, si = 0; i < _min_length; i++) {
|
75
|
+
if (indexes[i] != -1) {
|
76
|
+
ms1[si] = _min[i];
|
77
|
+
si++;
|
78
|
+
}
|
79
|
+
}
|
80
|
+
for (int i = 0, si = 0; i < _max_length; i++) {
|
81
|
+
if (flags[i]) {
|
82
|
+
ms2[si] = _max[i];
|
83
|
+
si++;
|
84
|
+
}
|
85
|
+
}
|
86
|
+
int transpositions = 0;
|
87
|
+
for (int mi = 0; mi < ms1_length; mi++) {
|
88
|
+
if (ms1[mi] != ms2[mi]) {
|
89
|
+
transpositions++;
|
90
|
+
}
|
91
|
+
}
|
92
|
+
int prefix = 0;
|
93
|
+
for (int mi = 0; mi < _min_length; mi++) {
|
94
|
+
if (s1[mi] == s2[mi]) {
|
95
|
+
prefix++;
|
96
|
+
} else {
|
97
|
+
break;
|
98
|
+
}
|
99
|
+
}
|
100
|
+
|
101
|
+
double m = (double) matches;
|
102
|
+
if (matches == 0) {
|
103
|
+
return 0.0;
|
104
|
+
}
|
105
|
+
int t = transpositions / 2;
|
106
|
+
double j = ((m / strlen(s1) + m / strlen(s2) + (m - t) / m)) / 3;
|
107
|
+
double jw = j < 0.7 ? j : j + double_min(0.1, 1.0 / _max_length) * prefix
|
108
|
+
* (1 - j);
|
109
|
+
return jw;
|
110
|
+
}'
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#
|
2
|
+
# Fuzzy String Match
|
3
|
+
#
|
4
|
+
# Copyright 2010-2011 Kiyoka Nishiyama
|
5
|
+
#
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7
|
+
# you may not use this file except in compliance with the License.
|
8
|
+
# You may obtain a copy of the License at
|
9
|
+
#
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
#
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
# See the License for the specific language governing permissions and
|
16
|
+
# limitations under the License.
|
17
|
+
#
|
18
|
+
require 'fuzzystringmatch/pure/jarowinkler'
|
@@ -0,0 +1,101 @@
|
|
1
|
+
#
|
2
|
+
# Fuzzy String Match
|
3
|
+
#
|
4
|
+
# Copyright 2010-2011 Kiyoka Nishiyama
|
5
|
+
#
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7
|
+
# you may not use this file except in compliance with the License.
|
8
|
+
# You may obtain a copy of the License at
|
9
|
+
#
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
#
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
# See the License for the specific language governing permissions and
|
16
|
+
# limitations under the License.
|
17
|
+
#
|
18
|
+
module FuzzyStringMatch
|
19
|
+
class JaroWinklerPure
|
20
|
+
THRESHOLD = 0.7
|
21
|
+
|
22
|
+
def pure?
|
23
|
+
true
|
24
|
+
end
|
25
|
+
|
26
|
+
def getDistance( s1, s2 )
|
27
|
+
a1 = s1.split( // )
|
28
|
+
a2 = s2.split( // )
|
29
|
+
|
30
|
+
if s1.size > s2.size
|
31
|
+
(max,min) = a1,a2
|
32
|
+
else
|
33
|
+
(max,min) = a2,a1
|
34
|
+
end
|
35
|
+
|
36
|
+
range = [ (max.size / 2 - 1), 0 ].max
|
37
|
+
indexes = Array.new( min.size, -1 )
|
38
|
+
flags = Array.new( max.size, false )
|
39
|
+
|
40
|
+
matches = 0;
|
41
|
+
(0 ... min.size).each { |mi|
|
42
|
+
c1 = min[mi]
|
43
|
+
xi = [mi - range, 0].max
|
44
|
+
xn = [mi + range + 1, max.size].min
|
45
|
+
|
46
|
+
(xi ... xn).each { |i|
|
47
|
+
if (not flags[i]) && ( c1 == max[i] )
|
48
|
+
indexes[mi] = i
|
49
|
+
flags[i] = true
|
50
|
+
matches += 1
|
51
|
+
break
|
52
|
+
end
|
53
|
+
}
|
54
|
+
}
|
55
|
+
|
56
|
+
ms1 = Array.new( matches, nil )
|
57
|
+
ms2 = Array.new( matches, nil )
|
58
|
+
|
59
|
+
si = 0
|
60
|
+
(0 ... min.size).each { |i|
|
61
|
+
if (indexes[i] != -1)
|
62
|
+
ms1[si] = min[i]
|
63
|
+
si += 1
|
64
|
+
end
|
65
|
+
}
|
66
|
+
|
67
|
+
si = 0
|
68
|
+
(0 ... max.size).each { |i|
|
69
|
+
if flags[i]
|
70
|
+
ms2[si] = max[i]
|
71
|
+
si += 1
|
72
|
+
end
|
73
|
+
}
|
74
|
+
|
75
|
+
transpositions = 0
|
76
|
+
(0 ... ms1.size).each { |mi|
|
77
|
+
if ms1[mi] != ms2[mi]
|
78
|
+
transpositions += 1
|
79
|
+
end
|
80
|
+
}
|
81
|
+
|
82
|
+
prefix = 0
|
83
|
+
(0 ... min.size).each { |mi|
|
84
|
+
if s1[mi] == s2[mi]
|
85
|
+
prefix += 1
|
86
|
+
else
|
87
|
+
break
|
88
|
+
end
|
89
|
+
}
|
90
|
+
|
91
|
+
if 0 == matches
|
92
|
+
0.0
|
93
|
+
else
|
94
|
+
m = matches.to_f
|
95
|
+
t = (transpositions/ 2)
|
96
|
+
j = ((m / s1.size) + (m / s2.size) + ((m - t) / m)) / 3.0;
|
97
|
+
return j < THRESHOLD ? j : j + [0.1, 1.0 / max.size].min * prefix * (1 - j)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: utf-8 -*-
|
3
|
+
#
|
4
|
+
# basic_native_spec.rb - "Basic test-cases for FuzzyStringMatch module "
|
5
|
+
#
|
6
|
+
# Copyright (c) 2011 Kiyoka Nishiyama <kiyoka@sumibi.org>
|
7
|
+
#
|
8
|
+
# Licensed to the Apache Software Foundation (ASF) under one or more
|
9
|
+
# contributor license agreements. See the NOTICE file distributed with
|
10
|
+
# this work for additional information regarding copyright ownership.
|
11
|
+
# The ASF licenses this file to You under the Apache License, Version 2.0
|
12
|
+
# (the "License"); you may not use this file except in compliance with
|
13
|
+
# the License. You may obtain a copy of the License at
|
14
|
+
#
|
15
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
16
|
+
#
|
17
|
+
# Unless required by applicable law or agreed to in writing, software
|
18
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
19
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
20
|
+
# See the License for the specific language governing permissions and
|
21
|
+
# limitations under the License.
|
22
|
+
#
|
23
|
+
#
|
24
|
+
require 'fuzzystringmatch'
|
25
|
+
|
26
|
+
describe FuzzyStringMatch, "when some string distances (Native) are" do
|
27
|
+
before do
|
28
|
+
@jarow = FuzzyStringMatch::JaroWinkler.create( :native )
|
29
|
+
end
|
30
|
+
it "should" do
|
31
|
+
@jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
|
32
|
+
@jarow.getDistance( "al", "al" ).should == 1.0
|
33
|
+
@jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
34
|
+
@jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
35
|
+
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
36
|
+
@jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
37
|
+
@jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
38
|
+
@jarow.getDistance( "fvie", "ten" ).should == 0.0
|
39
|
+
lambda {
|
40
|
+
d1 = @jarow.getDistance("zac ephron", "zac efron")
|
41
|
+
d2 = @jarow.getDistance("zac ephron", "kai ephron")
|
42
|
+
d1 > d2
|
43
|
+
}.should be_true
|
44
|
+
lambda {
|
45
|
+
d1 = @jarow.getDistance("brittney spears", "britney spears")
|
46
|
+
d2 = @jarow.getDistance("brittney spears", "brittney startzman")
|
47
|
+
d1 > d2
|
48
|
+
}.should be_true
|
49
|
+
|
50
|
+
@jarow.pure?( ).should == (RUBY_PLATFORM == "java")
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
describe FuzzyStringMatch, "when older factory method was called, (Pure) are" do
|
55
|
+
before do
|
56
|
+
@jarow = FuzzyStringMatch::JaroWinkler.new.create
|
57
|
+
end
|
58
|
+
it "should" do
|
59
|
+
@jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
|
60
|
+
@jarow.pure?( ).should be_true
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: utf-8 -*-
|
3
|
+
#
|
4
|
+
# basic_pure_spec.rb - "Basic test-cases for FuzzyStringMatch module "
|
5
|
+
#
|
6
|
+
# Copyright (c) 2011 Kiyoka Nishiyama <kiyoka@sumibi.org>
|
7
|
+
#
|
8
|
+
# Licensed to the Apache Software Foundation (ASF) under one or more
|
9
|
+
# contributor license agreements. See the NOTICE file distributed with
|
10
|
+
# this work for additional information regarding copyright ownership.
|
11
|
+
# The ASF licenses this file to You under the Apache License, Version 2.0
|
12
|
+
# (the "License"); you may not use this file except in compliance with
|
13
|
+
# the License. You may obtain a copy of the License at
|
14
|
+
#
|
15
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
16
|
+
#
|
17
|
+
# Unless required by applicable law or agreed to in writing, software
|
18
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
19
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
20
|
+
# See the License for the specific language governing permissions and
|
21
|
+
# limitations under the License.
|
22
|
+
#
|
23
|
+
#
|
24
|
+
require 'fuzzystringmatch'
|
25
|
+
|
26
|
+
describe FuzzyStringMatch, "when some string distances (Pure) are" do
|
27
|
+
before do
|
28
|
+
@jarow = FuzzyStringMatch::JaroWinkler.create
|
29
|
+
end
|
30
|
+
it "should" do
|
31
|
+
@jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
|
32
|
+
@jarow.getDistance( "al", "al" ).should == 1.0
|
33
|
+
@jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
34
|
+
@jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
35
|
+
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
36
|
+
@jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
37
|
+
@jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
38
|
+
@jarow.getDistance( "fvie", "ten" ).should == 0.0
|
39
|
+
lambda {
|
40
|
+
d1 = @jarow.getDistance("zac ephron", "zac efron")
|
41
|
+
d2 = @jarow.getDistance("zac ephron", "kai ephron")
|
42
|
+
d1 > d2
|
43
|
+
}.should be_true
|
44
|
+
lambda {
|
45
|
+
d1 = @jarow.getDistance("brittney spears", "britney spears")
|
46
|
+
d2 = @jarow.getDistance("brittney spears", "brittney startzman")
|
47
|
+
d1 > d2
|
48
|
+
}.should be_true
|
49
|
+
|
50
|
+
@jarow.pure?( ).should be_true
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
describe FuzzyStringMatch, "when older factory method was called, (Pure) are" do
|
55
|
+
before do
|
56
|
+
@jarow = FuzzyStringMatch::JaroWinkler.new.create
|
57
|
+
end
|
58
|
+
it "should" do
|
59
|
+
@jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
|
60
|
+
@jarow.pure?( ).should be_true
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: utf-8 -*-
|
3
|
+
#
|
4
|
+
# mutibyte_spec.rb - "Multibyte test-cases for FuzzyStringMatch module "
|
5
|
+
#
|
6
|
+
# Copyright (c) 2011 Kiyoka Nishiyama <kiyoka@sumibi.org>
|
7
|
+
#
|
8
|
+
# Licensed to the Apache Software Foundation (ASF) under one or more
|
9
|
+
# contributor license agreements. See the NOTICE file distributed with
|
10
|
+
# this work for additional information regarding copyright ownership.
|
11
|
+
# The ASF licenses this file to You under the Apache License, Version 2.0
|
12
|
+
# (the "License"); you may not use this file except in compliance with
|
13
|
+
# the License. You may obtain a copy of the License at
|
14
|
+
#
|
15
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
16
|
+
#
|
17
|
+
# Unless required by applicable law or agreed to in writing, software
|
18
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
19
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
20
|
+
# See the License for the specific language governing permissions and
|
21
|
+
# limitations under the License.
|
22
|
+
#
|
23
|
+
#
|
24
|
+
require 'fuzzystringmatch'
|
25
|
+
|
26
|
+
describe FuzzyStringMatch, "when some UTF8 string distances (Pure) are" do
|
27
|
+
before do
|
28
|
+
@jarow = FuzzyStringMatch::JaroWinkler.create
|
29
|
+
end
|
30
|
+
it "should" do
|
31
|
+
@jarow.getDistance( "al", "al" ).should == 1.0
|
32
|
+
@jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
33
|
+
@jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
34
|
+
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
35
|
+
@jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
36
|
+
@jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
37
|
+
@jarow.getDistance( "fvie", "ten" ).should == 0.0
|
38
|
+
lambda {
|
39
|
+
d1 = @jarow.getDistance("zac ephron", "zac efron")
|
40
|
+
d2 = @jarow.getDistance("zac ephron", "kai ephron")
|
41
|
+
d1 > d2
|
42
|
+
}.should be_true
|
43
|
+
lambda {
|
44
|
+
d1 = @jarow.getDistance("brittney spears", "britney spears")
|
45
|
+
d2 = @jarow.getDistance("brittney spears", "brittney startzman")
|
46
|
+
d1 > d2
|
47
|
+
}.should be_true
|
48
|
+
@jarow.getDistance( "スパゲティー", "スパゲッティー" ).should be_within(0.0001).of(0.9666)
|
49
|
+
@jarow.getDistance( "スパゲティー", "スパゲティ" ).should be_within(0.0001).of(0.9722)
|
50
|
+
@jarow.getDistance( "スティービー・ワンダー", "スピーディー・ワンダー" ).should be_within(0.0001).of(0.8561)
|
51
|
+
@jarow.getDistance( "マイケル・ジャクソン", "ジャイケル・マクソン" ).should be_within(0.0001).of(0.8000)
|
52
|
+
@jarow.getDistance( "まつもとゆきひろ", "まつもとひろゆき" ).should be_within(0.0001).of(0.9500)
|
53
|
+
@jarow.getDistance( "クライエント", "クライアント" ).should be_within(0.0001).of(0.9222)
|
54
|
+
@jarow.getDistance( "サーバー", "サーバ" ).should be_within(0.0001).of(0.9416)
|
55
|
+
|
56
|
+
@jarow.pure?( ).should be_true
|
57
|
+
end
|
58
|
+
end
|
metadata
CHANGED
@@ -1,115 +1,103 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy-string-match
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.9.2
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 9
|
9
|
-
- 1
|
10
|
-
version: 0.9.1
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Kiyoka Nishiyama
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-02-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
21
15
|
name: rspec
|
16
|
+
requirement: &2156958320 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
22
23
|
prerelease: false
|
23
|
-
|
24
|
+
version_requirements: *2156958320
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: RubyInline
|
27
|
+
requirement: &2156957800 !ruby/object:Gem::Requirement
|
24
28
|
none: false
|
25
|
-
requirements:
|
26
|
-
- -
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
|
29
|
-
|
30
|
-
- 0
|
31
|
-
version: "0"
|
32
|
-
type: :development
|
33
|
-
version_requirements: *id001
|
34
|
-
- !ruby/object:Gem::Dependency
|
35
|
-
name: amatch
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 3.8.6
|
33
|
+
type: :runtime
|
36
34
|
prerelease: false
|
37
|
-
|
35
|
+
version_requirements: *2156957800
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rspec
|
38
|
+
requirement: &2156957320 !ruby/object:Gem::Requirement
|
38
39
|
none: false
|
39
|
-
requirements:
|
40
|
-
- -
|
41
|
-
- !ruby/object:Gem::Version
|
42
|
-
|
43
|
-
|
44
|
-
- 0
|
45
|
-
version: "0"
|
46
|
-
type: :development
|
47
|
-
version_requirements: *id002
|
48
|
-
- !ruby/object:Gem::Dependency
|
49
|
-
name: RubyInline
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
50
45
|
prerelease: false
|
51
|
-
|
46
|
+
version_requirements: *2156957320
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: RubyInline
|
49
|
+
requirement: &2156956840 !ruby/object:Gem::Requirement
|
52
50
|
none: false
|
53
|
-
requirements:
|
54
|
-
- -
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
hash: 43
|
57
|
-
segments:
|
58
|
-
- 3
|
59
|
-
- 8
|
60
|
-
- 6
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
61
54
|
version: 3.8.6
|
62
55
|
type: :runtime
|
63
|
-
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *2156956840
|
64
58
|
description: calculate Jaro Winkler distance.
|
65
59
|
email: kiyoka@sumibi.org
|
66
60
|
executables: []
|
67
|
-
|
68
61
|
extensions: []
|
69
|
-
|
70
|
-
extra_rdoc_files:
|
62
|
+
extra_rdoc_files:
|
71
63
|
- LICENSE.txt
|
72
64
|
- README.md
|
73
|
-
files:
|
65
|
+
files:
|
66
|
+
- .gemtest
|
74
67
|
- LICENSE.txt
|
75
68
|
- README.md
|
76
|
-
-
|
69
|
+
- Rakefile
|
70
|
+
- VERSION.yml
|
77
71
|
- lib/fuzzystringmatch.rb
|
78
|
-
-
|
72
|
+
- lib/fuzzystringmatch/inline.rb
|
73
|
+
- lib/fuzzystringmatch/inline/jarowinkler.rb
|
74
|
+
- lib/fuzzystringmatch/pure.rb
|
75
|
+
- lib/fuzzystringmatch/pure/jarowinkler.rb
|
76
|
+
- test/basic_native_spec.rb
|
77
|
+
- test/basic_pure_spec.rb
|
78
|
+
- test/mutibyte_spec.rb
|
79
79
|
homepage: http://github.com/kiyoka/fuzzy-string-match
|
80
80
|
licenses: []
|
81
|
-
|
82
81
|
post_install_message:
|
83
82
|
rdoc_options: []
|
84
|
-
|
85
|
-
require_paths:
|
83
|
+
require_paths:
|
86
84
|
- lib
|
87
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
88
86
|
none: false
|
89
|
-
requirements:
|
90
|
-
- -
|
91
|
-
- !ruby/object:Gem::Version
|
92
|
-
hash: 49
|
93
|
-
segments:
|
94
|
-
- 1
|
95
|
-
- 9
|
96
|
-
- 1
|
87
|
+
requirements:
|
88
|
+
- - ! '>='
|
89
|
+
- !ruby/object:Gem::Version
|
97
90
|
version: 1.9.1
|
98
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
92
|
none: false
|
100
|
-
requirements:
|
101
|
-
- -
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
|
104
|
-
segments:
|
105
|
-
- 0
|
106
|
-
version: "0"
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
107
97
|
requirements: []
|
108
|
-
|
109
98
|
rubyforge_project:
|
110
|
-
rubygems_version: 1.
|
99
|
+
rubygems_version: 1.8.15
|
111
100
|
signing_key:
|
112
101
|
specification_version: 3
|
113
102
|
summary: fuzzy string matching library
|
114
103
|
test_files: []
|
115
|
-
|
data/benchmark/vs_amatch.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
#!/usr/local/bin/ruby
|
2
|
-
#
|
3
|
-
# http://www.ruby-lang.org/ja/man/html/benchmark.html
|
4
|
-
#
|
5
|
-
require 'benchmark'
|
6
|
-
require 'amatch'
|
7
|
-
require './lib/fuzzystringmatch'
|
8
|
-
|
9
|
-
looptimes = 10000
|
10
|
-
|
11
|
-
names50 = [ "Aichi", "Akita", "Aomori", "Chiba", "Ehime", "Fukui", "Fukuoka", "Fukushima", "Gifu", "Gunma",
|
12
|
-
"Hiroshima", "Hokkaido", "Hyogo", "Ibaraki", "Ishikawa", "Iwate", "Kagawa", "Kagoshima", "Kanagawa", "Kochi",
|
13
|
-
"Kumamoto", "Kyoto", "Mie", "Miyagi", "Miyazaki", "Nagano", "Nagasaki", "Nara", "Niigata", "Oita",
|
14
|
-
"Okayama", "Okinawa", "Osaka", "Saga", "Saitama", "Shiga", "Shimane", "Shizuoka", "Tochigi", "Tokushima",
|
15
|
-
"Tokyo", "Tottori", "Toyama", "Wakayama", "Yamagata", "Yamaguchi", "Yamanashi", "Dummy1", "Dummy2", "Dummy3" ]
|
16
|
-
names = names50 + names50
|
17
|
-
|
18
|
-
keyword = "Tokyo"
|
19
|
-
|
20
|
-
printf( " --- \n" )
|
21
|
-
printf( " --- Each match functions will be called %dMega times. --- \n", (names.size * looptimes) / (1000.0 * 1000.0) )
|
22
|
-
printf( " --- \n" )
|
23
|
-
|
24
|
-
puts "[Amatch]"
|
25
|
-
puts Benchmark::CAPTION
|
26
|
-
puts Benchmark.measure {
|
27
|
-
jarow = Amatch::JaroWinkler.new keyword
|
28
|
-
looptimes.times { |n|
|
29
|
-
names.map { |x|
|
30
|
-
jarow.match( x )
|
31
|
-
}
|
32
|
-
}
|
33
|
-
}
|
34
|
-
|
35
|
-
puts "[this Module (pure)]"
|
36
|
-
puts Benchmark::CAPTION
|
37
|
-
puts Benchmark.measure {
|
38
|
-
jarow = FuzzyStringMatch::JaroWinkler.new.create
|
39
|
-
looptimes.times { |n|
|
40
|
-
names.map { |x|
|
41
|
-
jarow.getDistance( keyword, x )
|
42
|
-
}
|
43
|
-
}
|
44
|
-
}
|
45
|
-
puts "[this Module (native)]"
|
46
|
-
puts Benchmark::CAPTION
|
47
|
-
puts Benchmark.measure {
|
48
|
-
jarow = FuzzyStringMatch::JaroWinkler.new.create( :native )
|
49
|
-
looptimes.times { |n|
|
50
|
-
names.map { |x|
|
51
|
-
jarow.getDistance( keyword, x )
|
52
|
-
}
|
53
|
-
}
|
54
|
-
}
|
@@ -1,139 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# -*- encoding: utf-8 -*-
|
3
|
-
#
|
4
|
-
# fuzzystringmatch_spec.rb - "RSpec file for FuzzyStringMatch module "
|
5
|
-
#
|
6
|
-
# Copyright (c) 2010 Kiyoka Nishiyama <kiyoka@sumibi.org>
|
7
|
-
#
|
8
|
-
# Licensed to the Apache Software Foundation (ASF) under one or more
|
9
|
-
# contributor license agreements. See the NOTICE file distributed with
|
10
|
-
# this work for additional information regarding copyright ownership.
|
11
|
-
# The ASF licenses this file to You under the Apache License, Version 2.0
|
12
|
-
# (the "License"); you may not use this file except in compliance with
|
13
|
-
# the License. You may obtain a copy of the License at
|
14
|
-
#
|
15
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
16
|
-
#
|
17
|
-
# Unless required by applicable law or agreed to in writing, software
|
18
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
19
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
20
|
-
# See the License for the specific language governing permissions and
|
21
|
-
# limitations under the License.
|
22
|
-
#
|
23
|
-
#
|
24
|
-
require 'fuzzystringmatch'
|
25
|
-
require 'amatch'
|
26
|
-
|
27
|
-
describe FuzzyStringMatch, "when some string distances (Pure) are" do
|
28
|
-
before do
|
29
|
-
@jarow = FuzzyStringMatch::JaroWinkler.new.create
|
30
|
-
end
|
31
|
-
it "should" do
|
32
|
-
@jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
|
33
|
-
@jarow.getDistance( "al", "al" ).should == 1.0
|
34
|
-
@jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
35
|
-
@jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
36
|
-
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
37
|
-
@jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
38
|
-
@jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
39
|
-
@jarow.getDistance( "fvie", "ten" ).should == 0.0
|
40
|
-
lambda {
|
41
|
-
d1 = @jarow.getDistance("zac ephron", "zac efron")
|
42
|
-
d2 = @jarow.getDistance("zac ephron", "kai ephron")
|
43
|
-
d1 > d2
|
44
|
-
}.should be_true
|
45
|
-
lambda {
|
46
|
-
d1 = @jarow.getDistance("brittney spears", "britney spears")
|
47
|
-
d2 = @jarow.getDistance("brittney spears", "brittney startzman")
|
48
|
-
d1 > d2
|
49
|
-
}.should be_true
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
describe FuzzyStringMatch, "when some string distances (Native) are" do
|
54
|
-
before do
|
55
|
-
@jarow = FuzzyStringMatch::JaroWinkler.new.create( :native )
|
56
|
-
end
|
57
|
-
it "should" do
|
58
|
-
@jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
|
59
|
-
@jarow.getDistance( "al", "al" ).should == 1.0
|
60
|
-
@jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
61
|
-
@jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
62
|
-
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
63
|
-
@jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
64
|
-
@jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
65
|
-
@jarow.getDistance( "fvie", "ten" ).should == 0.0
|
66
|
-
lambda {
|
67
|
-
d1 = @jarow.getDistance("zac ephron", "zac efron")
|
68
|
-
d2 = @jarow.getDistance("zac ephron", "kai ephron")
|
69
|
-
d1 > d2
|
70
|
-
}.should be_true
|
71
|
-
lambda {
|
72
|
-
d1 = @jarow.getDistance("brittney spears", "britney spears")
|
73
|
-
d2 = @jarow.getDistance("brittney spears", "brittney startzman")
|
74
|
-
d1 > d2
|
75
|
-
}.should be_true
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def amatch_getDistance( s1, s2 )
|
80
|
-
@jarow = Amatch::JaroWinkler.new( s1 )
|
81
|
-
@jarow.match( s2 )
|
82
|
-
end
|
83
|
-
|
84
|
-
|
85
|
-
describe FuzzyStringMatch, "when some UTF8 string distances (Pure) are" do
|
86
|
-
before do
|
87
|
-
@jarow = FuzzyStringMatch::JaroWinkler.new.create
|
88
|
-
end
|
89
|
-
it "should" do
|
90
|
-
@jarow.getDistance( "al", "al" ).should == 1.0
|
91
|
-
@jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
92
|
-
@jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
93
|
-
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
94
|
-
@jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
95
|
-
@jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
96
|
-
@jarow.getDistance( "fvie", "ten" ).should == 0.0
|
97
|
-
lambda {
|
98
|
-
d1 = @jarow.getDistance("zac ephron", "zac efron")
|
99
|
-
d2 = @jarow.getDistance("zac ephron", "kai ephron")
|
100
|
-
d1 > d2
|
101
|
-
}.should be_true
|
102
|
-
lambda {
|
103
|
-
d1 = @jarow.getDistance("brittney spears", "britney spears")
|
104
|
-
d2 = @jarow.getDistance("brittney spears", "brittney startzman")
|
105
|
-
d1 > d2
|
106
|
-
}.should be_true
|
107
|
-
@jarow.getDistance( "スパゲティー", "スパゲッティー" ).should be_within(0.0001).of(0.9666)
|
108
|
-
@jarow.getDistance( "スパゲティー", "スパゲティ" ).should be_within(0.0001).of(0.9722)
|
109
|
-
@jarow.getDistance( "スティービー・ワンダー", "スピーディー・ワンダー" ).should be_within(0.0001).of(0.8561)
|
110
|
-
@jarow.getDistance( "マイケル・ジャクソン", "ジャイケル・マクソン" ).should be_within(0.0001).of(0.8000)
|
111
|
-
@jarow.getDistance( "まつもとゆきひろ", "まつもとひろゆき" ).should be_within(0.0001).of(0.9500)
|
112
|
-
@jarow.getDistance( "クライエント", "クライアント" ).should be_within(0.0001).of(0.9222)
|
113
|
-
@jarow.getDistance( "サーバー", "サーバ" ).should be_within(0.0001).of(0.9416)
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
|
118
|
-
describe Amatch, "when use Amatch gem, results are" do
|
119
|
-
it "should" do
|
120
|
-
amatch_getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9666) ## amatch's result value is different from lucene version.
|
121
|
-
amatch_getDistance( "al", "al" ).should == 1.0
|
122
|
-
amatch_getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
123
|
-
amatch_getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
124
|
-
amatch_getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
125
|
-
amatch_getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
126
|
-
amatch_getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
127
|
-
amatch_getDistance( "fvie", "ten" ).should == 0.0
|
128
|
-
lambda {
|
129
|
-
d1 = amatch_getDistance("zac ephron", "zac efron")
|
130
|
-
d2 = amatch_getDistance("zac ephron", "kai ephron")
|
131
|
-
d1 > d2
|
132
|
-
}.should be_true
|
133
|
-
lambda {
|
134
|
-
d1 = amatch_getDistance("brittney spears", "britney spears")
|
135
|
-
d2 = amatch_getDistance("brittney spears", "brittney startzman")
|
136
|
-
d1 > d2
|
137
|
-
}.should be_true
|
138
|
-
end
|
139
|
-
end
|