fuzzy-string-match 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +32 -7
- data/lib/fuzzystringmatch.rb +9 -8
- data/test/fuzzystringmatch_spec.rb +32 -32
- metadata +15 -11
data/README.md
CHANGED
@@ -2,17 +2,23 @@
|
|
2
2
|
|
3
3
|
* fuzzy-string-match is a fuzzy string matching library for ruby.
|
4
4
|
* It is fast. ( written in C with RubyInline )
|
5
|
-
* It
|
5
|
+
* It supports only the Jaro-Winkler distance algorithm.
|
6
6
|
* This program was ported by hand from lucene-3.0.2. (lucene is Java product)
|
7
7
|
* If you want to add another string distance algorithm, please port by yourself and contact me <kiyoka@sumibi.org>.
|
8
8
|
|
9
|
+
## The reason why I developed fuzzy-string-match
|
10
|
+
* I tried amatch-0.2.5, but it contains some issues.
|
11
|
+
1. Some memory leaks.
|
12
|
+
2. I found it difficult to maintain.
|
13
|
+
* So, I decided to create another gem by porting lucene-3.0.x.
|
14
|
+
|
9
15
|
## Installing
|
10
16
|
1. gem install fuzzy-string-match
|
11
17
|
|
12
18
|
## Features
|
13
|
-
*
|
14
|
-
* Pure ruby version can handle both
|
15
|
-
* Native version can only
|
19
|
+
* Calculate Jaro-Winkler distance of two strings.
|
20
|
+
* Pure ruby version can handle both ASCII and UTF8 strings. (and slow)
|
21
|
+
* Native version can only handle ASCII strings. (and fast)
|
16
22
|
|
17
23
|
## Sample code
|
18
24
|
* Native version
|
@@ -51,6 +57,25 @@
|
|
51
57
|
=> 0.8133333333333332
|
52
58
|
</code>
|
53
59
|
|
60
|
+
## Benchmarks
|
61
|
+
|
62
|
+
<console>
|
63
|
+
$ rake bench
|
64
|
+
ruby ./benchmark/vs_amatch.rb
|
65
|
+
---
|
66
|
+
--- Each match functions will be called 1Mega times. ---
|
67
|
+
---
|
68
|
+
[Amatch]
|
69
|
+
user system total real
|
70
|
+
1.160000 0.050000 1.210000 ( 1.218259)
|
71
|
+
[this Module (pure)]
|
72
|
+
user system total real
|
73
|
+
39.940000 0.160000 40.100000 ( 40.542448)
|
74
|
+
[this Module (native)]
|
75
|
+
user system total real
|
76
|
+
0.480000 0.000000 0.480000 ( 0.484187)
|
77
|
+
</console>
|
78
|
+
|
54
79
|
## Requires
|
55
80
|
- RubyInline
|
56
81
|
- Ruby 1.9.1 or higher
|
@@ -60,9 +85,9 @@
|
|
60
85
|
- I ported from java source code of lucene-3.0.2.
|
61
86
|
|
62
87
|
## See also
|
63
|
-
- http://en.wikipedia.org/wiki/Jaro–Winkler_distance
|
64
|
-
- http://lucene.apache.org
|
65
|
-
- http://github.com/naoya/perl-text-jarowinkler
|
88
|
+
- <http://en.wikipedia.org/wiki/Jaro–Winkler_distance>
|
89
|
+
- <http://lucene.apache.org/>
|
90
|
+
- <http://github.com/naoya/perl-text-jarowinkler>
|
66
91
|
|
67
92
|
## License
|
68
93
|
- Apache 2.0 LICENSE
|
data/lib/fuzzystringmatch.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#
|
2
|
-
# Fuzzy String Match
|
2
|
+
# Fuzzy String Match
|
3
3
|
#
|
4
4
|
# Copyright 2010 Kiyoka Nishiyama
|
5
5
|
#
|
@@ -16,7 +16,7 @@
|
|
16
16
|
# limitations under the License.
|
17
17
|
#
|
18
18
|
module FuzzyStringMatch
|
19
|
-
|
19
|
+
|
20
20
|
class JaroWinkler
|
21
21
|
def create( type = :pure ) # factory method
|
22
22
|
case type
|
@@ -34,7 +34,7 @@ module FuzzyStringMatch
|
|
34
34
|
def getDistance( s1, s2 )
|
35
35
|
a1 = s1.split( // )
|
36
36
|
a2 = s2.split( // )
|
37
|
-
|
37
|
+
|
38
38
|
if s1.size > s2.size
|
39
39
|
(max,min) = a1,a2
|
40
40
|
else
|
@@ -50,7 +50,7 @@ module FuzzyStringMatch
|
|
50
50
|
c1 = min[mi]
|
51
51
|
xi = [mi - range, 0].max
|
52
52
|
xn = [mi + range + 1, max.size].min
|
53
|
-
|
53
|
+
|
54
54
|
(xi ... xn).each { |i|
|
55
55
|
if (not flags[i]) && ( c1 == max[i] )
|
56
56
|
indexes[mi] = i
|
@@ -79,7 +79,7 @@ module FuzzyStringMatch
|
|
79
79
|
si += 1
|
80
80
|
end
|
81
81
|
}
|
82
|
-
|
82
|
+
|
83
83
|
transpositions = 0
|
84
84
|
(0 ... ms1.size).each { |mi|
|
85
85
|
if ms1[mi] != ms2[mi]
|
@@ -110,7 +110,8 @@ module FuzzyStringMatch
|
|
110
110
|
require 'inline'
|
111
111
|
class JaroWinklerNative
|
112
112
|
inline do |builder|
|
113
|
-
builder.
|
113
|
+
builder.include '<iostream>'
|
114
|
+
builder.add_compile_flags '-x c++', '-lstdc++'
|
114
115
|
builder.c_raw 'int max( int a, int b ) { return ((a)>(b)?(a):(b)); }'
|
115
116
|
builder.c_raw 'int min( int a, int b ) { return ((a)<(b)?(a):(b)); }'
|
116
117
|
builder.c_raw 'double double_min( double a, double b ) { return ((a)<(b)?(a):(b)); }'
|
@@ -130,7 +131,7 @@ double getDistance( char *s1, char *s2 )
|
|
130
131
|
_min = s1; _min_length = strlen(s1);
|
131
132
|
}
|
132
133
|
int range = max( _max_length / 2 - 1, 0 );
|
133
|
-
|
134
|
+
|
134
135
|
int indexes[_min_length];
|
135
136
|
for( int i = 0 ; i < _min_length ; i++ ) {
|
136
137
|
indexes[i] = -1;
|
@@ -156,7 +157,7 @@ double getDistance( char *s1, char *s2 )
|
|
156
157
|
char ms1[matches];
|
157
158
|
char ms2[matches];
|
158
159
|
int ms1_length = matches;
|
159
|
-
|
160
|
+
|
160
161
|
for (int i = 0, si = 0; i < _min_length; i++) {
|
161
162
|
if (indexes[i] != -1) {
|
162
163
|
ms1[si] = _min[i];
|
@@ -29,13 +29,13 @@ describe FuzzyStringMatch, "when some string distances (Pure) are" do
|
|
29
29
|
@jarow = FuzzyStringMatch::JaroWinkler.new.create
|
30
30
|
end
|
31
31
|
it "should" do
|
32
|
-
@jarow.getDistance( "henka", "henkan" ).should
|
32
|
+
@jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
|
33
33
|
@jarow.getDistance( "al", "al" ).should == 1.0
|
34
|
-
@jarow.getDistance( "martha", "marhta" ).should
|
35
|
-
@jarow.getDistance( "jones", "johnson" ).should
|
36
|
-
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should
|
37
|
-
@jarow.getDistance( "dwayne", "duane" ).should
|
38
|
-
@jarow.getDistance( "dixon", "dicksonx" ).should
|
34
|
+
@jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
35
|
+
@jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
36
|
+
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
37
|
+
@jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
38
|
+
@jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
39
39
|
@jarow.getDistance( "fvie", "ten" ).should == 0.0
|
40
40
|
lambda {
|
41
41
|
d1 = @jarow.getDistance("zac ephron", "zac efron")
|
@@ -55,13 +55,13 @@ describe FuzzyStringMatch, "when some string distances (Native) are" do
|
|
55
55
|
@jarow = FuzzyStringMatch::JaroWinkler.new.create( :native )
|
56
56
|
end
|
57
57
|
it "should" do
|
58
|
-
@jarow.getDistance( "henka", "henkan" ).should
|
58
|
+
@jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
|
59
59
|
@jarow.getDistance( "al", "al" ).should == 1.0
|
60
|
-
@jarow.getDistance( "martha", "marhta" ).should
|
61
|
-
@jarow.getDistance( "jones", "johnson" ).should
|
62
|
-
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should
|
63
|
-
@jarow.getDistance( "dwayne", "duane" ).should
|
64
|
-
@jarow.getDistance( "dixon", "dicksonx" ).should
|
60
|
+
@jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
61
|
+
@jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
62
|
+
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
63
|
+
@jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
64
|
+
@jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
65
65
|
@jarow.getDistance( "fvie", "ten" ).should == 0.0
|
66
66
|
lambda {
|
67
67
|
d1 = @jarow.getDistance("zac ephron", "zac efron")
|
@@ -87,13 +87,13 @@ describe FuzzyStringMatch, "when some UTF8 string distances (Pure) are" do
|
|
87
87
|
@jarow = FuzzyStringMatch::JaroWinkler.new.create
|
88
88
|
end
|
89
89
|
it "should" do
|
90
|
-
@jarow.getDistance( "al", "al"
|
91
|
-
@jarow.getDistance( "martha", "marhta"
|
92
|
-
@jarow.getDistance( "jones", "johnson" ).should
|
93
|
-
@jarow.getDistance( "abcvwxyz", "cabvwxyz"
|
94
|
-
@jarow.getDistance( "dwayne", "duane"
|
95
|
-
@jarow.getDistance( "dixon", "dicksonx"
|
96
|
-
@jarow.getDistance( "fvie", "ten"
|
90
|
+
@jarow.getDistance( "al", "al" ).should == 1.0
|
91
|
+
@jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
92
|
+
@jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
93
|
+
@jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
94
|
+
@jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
95
|
+
@jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
96
|
+
@jarow.getDistance( "fvie", "ten" ).should == 0.0
|
97
97
|
lambda {
|
98
98
|
d1 = @jarow.getDistance("zac ephron", "zac efron")
|
99
99
|
d2 = @jarow.getDistance("zac ephron", "kai ephron")
|
@@ -104,26 +104,26 @@ describe FuzzyStringMatch, "when some UTF8 string distances (Pure) are" do
|
|
104
104
|
d2 = @jarow.getDistance("brittney spears", "brittney startzman")
|
105
105
|
d1 > d2
|
106
106
|
}.should be_true
|
107
|
-
@jarow.getDistance( "スパゲティー", "スパゲッティー"
|
108
|
-
@jarow.getDistance( "スパゲティー", "スパゲティ"
|
109
|
-
@jarow.getDistance( "スティービー・ワンダー", "スピーディー・ワンダー" ).should
|
110
|
-
@jarow.getDistance( "マイケル・ジャクソン", "ジャイケル・マクソン" ).should
|
111
|
-
@jarow.getDistance( "まつもとゆきひろ", "まつもとひろゆき" ).should
|
112
|
-
@jarow.getDistance( "クライエント", "クライアント"
|
113
|
-
@jarow.getDistance( "サーバー", "サーバ"
|
107
|
+
@jarow.getDistance( "スパゲティー", "スパゲッティー" ).should be_within(0.0001).of(0.9666)
|
108
|
+
@jarow.getDistance( "スパゲティー", "スパゲティ" ).should be_within(0.0001).of(0.9722)
|
109
|
+
@jarow.getDistance( "スティービー・ワンダー", "スピーディー・ワンダー" ).should be_within(0.0001).of(0.8561)
|
110
|
+
@jarow.getDistance( "マイケル・ジャクソン", "ジャイケル・マクソン" ).should be_within(0.0001).of(0.8000)
|
111
|
+
@jarow.getDistance( "まつもとゆきひろ", "まつもとひろゆき" ).should be_within(0.0001).of(0.9500)
|
112
|
+
@jarow.getDistance( "クライエント", "クライアント" ).should be_within(0.0001).of(0.9222)
|
113
|
+
@jarow.getDistance( "サーバー", "サーバ" ).should be_within(0.0001).of(0.9416)
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
117
117
|
|
118
118
|
describe Amatch, "when use Amatch gem, results are" do
|
119
119
|
it "should" do
|
120
|
-
amatch_getDistance( "henka", "henkan" ).should
|
120
|
+
amatch_getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9666) ## amatch's result value is different from lucene version.
|
121
121
|
amatch_getDistance( "al", "al" ).should == 1.0
|
122
|
-
amatch_getDistance( "martha", "marhta" ).should
|
123
|
-
amatch_getDistance( "jones", "johnson" ).should
|
124
|
-
amatch_getDistance( "abcvwxyz", "cabvwxyz" ).should
|
125
|
-
amatch_getDistance( "dwayne", "duane" ).should
|
126
|
-
amatch_getDistance( "dixon", "dicksonx" ).should
|
122
|
+
amatch_getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
|
123
|
+
amatch_getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
|
124
|
+
amatch_getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
|
125
|
+
amatch_getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
|
126
|
+
amatch_getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
|
127
127
|
amatch_getDistance( "fvie", "ten" ).should == 0.0
|
128
128
|
lambda {
|
129
129
|
d1 = amatch_getDistance("zac ephron", "zac efron")
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy-string-match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 57
|
5
|
+
prerelease:
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 9
|
8
|
-
-
|
9
|
-
version: 0.9.
|
9
|
+
- 1
|
10
|
+
version: 0.9.1
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Kiyoka Nishiyama
|
@@ -14,8 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date:
|
18
|
-
default_executable:
|
18
|
+
date: 2011-07-30 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: rspec
|
@@ -25,6 +25,7 @@ dependencies:
|
|
25
25
|
requirements:
|
26
26
|
- - ">="
|
27
27
|
- !ruby/object:Gem::Version
|
28
|
+
hash: 3
|
28
29
|
segments:
|
29
30
|
- 0
|
30
31
|
version: "0"
|
@@ -38,6 +39,7 @@ dependencies:
|
|
38
39
|
requirements:
|
39
40
|
- - ">="
|
40
41
|
- !ruby/object:Gem::Version
|
42
|
+
hash: 3
|
41
43
|
segments:
|
42
44
|
- 0
|
43
45
|
version: "0"
|
@@ -51,6 +53,7 @@ dependencies:
|
|
51
53
|
requirements:
|
52
54
|
- - ">="
|
53
55
|
- !ruby/object:Gem::Version
|
56
|
+
hash: 43
|
54
57
|
segments:
|
55
58
|
- 3
|
56
59
|
- 8
|
@@ -73,13 +76,12 @@ files:
|
|
73
76
|
- benchmark/vs_amatch.rb
|
74
77
|
- lib/fuzzystringmatch.rb
|
75
78
|
- test/fuzzystringmatch_spec.rb
|
76
|
-
has_rdoc: true
|
77
79
|
homepage: http://github.com/kiyoka/fuzzy-string-match
|
78
80
|
licenses: []
|
79
81
|
|
80
82
|
post_install_message:
|
81
|
-
rdoc_options:
|
82
|
-
|
83
|
+
rdoc_options: []
|
84
|
+
|
83
85
|
require_paths:
|
84
86
|
- lib
|
85
87
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -87,6 +89,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
87
89
|
requirements:
|
88
90
|
- - ">="
|
89
91
|
- !ruby/object:Gem::Version
|
92
|
+
hash: 49
|
90
93
|
segments:
|
91
94
|
- 1
|
92
95
|
- 9
|
@@ -97,15 +100,16 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
100
|
requirements:
|
98
101
|
- - ">="
|
99
102
|
- !ruby/object:Gem::Version
|
103
|
+
hash: 3
|
100
104
|
segments:
|
101
105
|
- 0
|
102
106
|
version: "0"
|
103
107
|
requirements: []
|
104
108
|
|
105
109
|
rubyforge_project:
|
106
|
-
rubygems_version: 1.
|
110
|
+
rubygems_version: 1.7.2
|
107
111
|
signing_key:
|
108
112
|
specification_version: 3
|
109
113
|
summary: fuzzy string matching library
|
110
|
-
test_files:
|
111
|
-
|
114
|
+
test_files: []
|
115
|
+
|