fuzzy-string-match 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -2,17 +2,23 @@
2
2
 
3
3
  * fuzzy-string-match is a fuzzy string matching library for ruby.
4
4
  * It is fast. ( written in C with RubyInline )
5
- * It suports only Jaro-Winkler distance algorithm.
5
+ * It supports only Jaro-Winkler distance algorithm.
6
6
  * This program was ported by hand from lucene-3.0.2. (lucene is Java product)
7
7
  * If you want to add another string distance algorithm, please port by yourself and contact me <kiyoka@sumibi.org>.
8
8
 
9
+ ## The reason why I developed fuzzy-string-match
10
+ * I tried amatch-0.2.5, but it contains some issues.
11
+ 1. Some memory leaks.
12
+ 2. I found it difficult to maintain.
13
+ * So, I decided to create another gem by porting lucene-3.0.x.
14
+
9
15
  ## Installing
10
16
  1. gem install fuzzy-string-match
11
17
 
12
18
  ## Features
13
- * Caluclate Jaro-Winkler distance of two strings.
14
- * Pure ruby version can handle both ascii and UTF8 strings. (and slow)
15
- * Native version can only ascii strings. (and fast)
19
+ * Calculate Jaro-Winkler distance of two strings.
20
+ * Pure ruby version can handle both ASCII and UTF8 strings. (and slow)
21
+ * Native version can only handle ASCII strings. (and fast)
16
22
 
17
23
  ## Sample code
18
24
  * Native version
@@ -51,6 +57,25 @@
51
57
  => 0.8133333333333332
52
58
  </code>
53
59
 
60
+ ## Benchmarks
61
+
62
+ <console>
63
+ $ rake bench
64
+ ruby ./benchmark/vs_amatch.rb
65
+ ---
66
+ --- Each match functions will be called 1Mega times. ---
67
+ ---
68
+ [Amatch]
69
+ user system total real
70
+ 1.160000 0.050000 1.210000 ( 1.218259)
71
+ [this Module (pure)]
72
+ user system total real
73
+ 39.940000 0.160000 40.100000 ( 40.542448)
74
+ [this Module (native)]
75
+ user system total real
76
+ 0.480000 0.000000 0.480000 ( 0.484187)
77
+ </console>
78
+
54
79
  ## Requires
55
80
  - RubyInline
56
81
  - Ruby 1.9.1 or higher
@@ -60,9 +85,9 @@
60
85
  - I ported from java source code of lucene-3.0.2.
61
86
 
62
87
  ## See also
63
- - http://en.wikipedia.org/wiki/Jaro–Winkler_distance
64
- - http://lucene.apache.org/
65
- - http://github.com/naoya/perl-text-jarowinkler
88
+ - <http://en.wikipedia.org/wiki/Jaro–Winkler_distance>
89
+ - <http://lucene.apache.org/>
90
+ - <http://github.com/naoya/perl-text-jarowinkler>
66
91
 
67
92
  ## License
68
93
  - Apache 2.0 LICENSE
@@ -1,5 +1,5 @@
1
1
  #
2
- # Fuzzy String Match
2
+ # Fuzzy String Match
3
3
  #
4
4
  # Copyright 2010 Kiyoka Nishiyama
5
5
  #
@@ -16,7 +16,7 @@
16
16
  # limitations under the License.
17
17
  #
18
18
  module FuzzyStringMatch
19
-
19
+
20
20
  class JaroWinkler
21
21
  def create( type = :pure ) # factory method
22
22
  case type
@@ -34,7 +34,7 @@ module FuzzyStringMatch
34
34
  def getDistance( s1, s2 )
35
35
  a1 = s1.split( // )
36
36
  a2 = s2.split( // )
37
-
37
+
38
38
  if s1.size > s2.size
39
39
  (max,min) = a1,a2
40
40
  else
@@ -50,7 +50,7 @@ module FuzzyStringMatch
50
50
  c1 = min[mi]
51
51
  xi = [mi - range, 0].max
52
52
  xn = [mi + range + 1, max.size].min
53
-
53
+
54
54
  (xi ... xn).each { |i|
55
55
  if (not flags[i]) && ( c1 == max[i] )
56
56
  indexes[mi] = i
@@ -79,7 +79,7 @@ module FuzzyStringMatch
79
79
  si += 1
80
80
  end
81
81
  }
82
-
82
+
83
83
  transpositions = 0
84
84
  (0 ... ms1.size).each { |mi|
85
85
  if ms1[mi] != ms2[mi]
@@ -110,7 +110,8 @@ module FuzzyStringMatch
110
110
  require 'inline'
111
111
  class JaroWinklerNative
112
112
  inline do |builder|
113
- builder.add_compile_flags '-std=c99'
113
+ builder.include '<iostream>'
114
+ builder.add_compile_flags '-x c++', '-lstdc++'
114
115
  builder.c_raw 'int max( int a, int b ) { return ((a)>(b)?(a):(b)); }'
115
116
  builder.c_raw 'int min( int a, int b ) { return ((a)<(b)?(a):(b)); }'
116
117
  builder.c_raw 'double double_min( double a, double b ) { return ((a)<(b)?(a):(b)); }'
@@ -130,7 +131,7 @@ double getDistance( char *s1, char *s2 )
130
131
  _min = s1; _min_length = strlen(s1);
131
132
  }
132
133
  int range = max( _max_length / 2 - 1, 0 );
133
-
134
+
134
135
  int indexes[_min_length];
135
136
  for( int i = 0 ; i < _min_length ; i++ ) {
136
137
  indexes[i] = -1;
@@ -156,7 +157,7 @@ double getDistance( char *s1, char *s2 )
156
157
  char ms1[matches];
157
158
  char ms2[matches];
158
159
  int ms1_length = matches;
159
-
160
+
160
161
  for (int i = 0, si = 0; i < _min_length; i++) {
161
162
  if (indexes[i] != -1) {
162
163
  ms1[si] = _min[i];
@@ -29,13 +29,13 @@ describe FuzzyStringMatch, "when some string distances (Pure) are" do
29
29
  @jarow = FuzzyStringMatch::JaroWinkler.new.create
30
30
  end
31
31
  it "should" do
32
- @jarow.getDistance( "henka", "henkan" ).should be_close( 0.9722, 0.0001 )
32
+ @jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
33
33
  @jarow.getDistance( "al", "al" ).should == 1.0
34
- @jarow.getDistance( "martha", "marhta" ).should be_close( 0.9611, 0.0001 )
35
- @jarow.getDistance( "jones", "johnson" ).should be_close( 0.8323, 0.0001 )
36
- @jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_close( 0.9583, 0.0001 )
37
- @jarow.getDistance( "dwayne", "duane" ).should be_close( 0.8400, 0.0001 )
38
- @jarow.getDistance( "dixon", "dicksonx" ).should be_close( 0.8133, 0.0001 )
34
+ @jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
35
+ @jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
36
+ @jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
37
+ @jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
38
+ @jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
39
39
  @jarow.getDistance( "fvie", "ten" ).should == 0.0
40
40
  lambda {
41
41
  d1 = @jarow.getDistance("zac ephron", "zac efron")
@@ -55,13 +55,13 @@ describe FuzzyStringMatch, "when some string distances (Native) are" do
55
55
  @jarow = FuzzyStringMatch::JaroWinkler.new.create( :native )
56
56
  end
57
57
  it "should" do
58
- @jarow.getDistance( "henka", "henkan" ).should be_close( 0.9722, 0.0001 )
58
+ @jarow.getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9722)
59
59
  @jarow.getDistance( "al", "al" ).should == 1.0
60
- @jarow.getDistance( "martha", "marhta" ).should be_close( 0.9611, 0.0001 )
61
- @jarow.getDistance( "jones", "johnson" ).should be_close( 0.8323, 0.0001 )
62
- @jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_close( 0.9583, 0.0001 )
63
- @jarow.getDistance( "dwayne", "duane" ).should be_close( 0.8400, 0.0001 )
64
- @jarow.getDistance( "dixon", "dicksonx" ).should be_close( 0.8133, 0.0001 )
60
+ @jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
61
+ @jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
62
+ @jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
63
+ @jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
64
+ @jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
65
65
  @jarow.getDistance( "fvie", "ten" ).should == 0.0
66
66
  lambda {
67
67
  d1 = @jarow.getDistance("zac ephron", "zac efron")
@@ -87,13 +87,13 @@ describe FuzzyStringMatch, "when some UTF8 string distances (Pure) are" do
87
87
  @jarow = FuzzyStringMatch::JaroWinkler.new.create
88
88
  end
89
89
  it "should" do
90
- @jarow.getDistance( "al", "al" ).should == 1.0
91
- @jarow.getDistance( "martha", "marhta" ).should be_close( 0.9611, 0.0001 )
92
- @jarow.getDistance( "jones", "johnson" ).should be_close( 0.8323, 0.0001 )
93
- @jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_close( 0.9583, 0.0001 )
94
- @jarow.getDistance( "dwayne", "duane" ).should be_close( 0.8400, 0.0001 )
95
- @jarow.getDistance( "dixon", "dicksonx" ).should be_close( 0.8133, 0.0001 )
96
- @jarow.getDistance( "fvie", "ten" ).should == 0.0
90
+ @jarow.getDistance( "al", "al" ).should == 1.0
91
+ @jarow.getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
92
+ @jarow.getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
93
+ @jarow.getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
94
+ @jarow.getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
95
+ @jarow.getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
96
+ @jarow.getDistance( "fvie", "ten" ).should == 0.0
97
97
  lambda {
98
98
  d1 = @jarow.getDistance("zac ephron", "zac efron")
99
99
  d2 = @jarow.getDistance("zac ephron", "kai ephron")
@@ -104,26 +104,26 @@ describe FuzzyStringMatch, "when some UTF8 string distances (Pure) are" do
104
104
  d2 = @jarow.getDistance("brittney spears", "brittney startzman")
105
105
  d1 > d2
106
106
  }.should be_true
107
- @jarow.getDistance( "スパゲティー", "スパゲッティー" ).should be_close( 0.9666, 0.0001 )
108
- @jarow.getDistance( "スパゲティー", "スパゲティ" ).should be_close( 0.9722, 0.0001 )
109
- @jarow.getDistance( "スティービー・ワンダー", "スピーディー・ワンダー" ).should be_close( 0.8561, 0.0001 )
110
- @jarow.getDistance( "マイケル・ジャクソン", "ジャイケル・マクソン" ).should be_close( 0.8000, 0.0001 )
111
- @jarow.getDistance( "まつもとゆきひろ", "まつもとひろゆき" ).should be_close( 0.9500, 0.0001 )
112
- @jarow.getDistance( "クライエント", "クライアント" ).should be_close( 0.9222, 0.0001 )
113
- @jarow.getDistance( "サーバー", "サーバ" ).should be_close( 0.9416, 0.0001 )
107
+ @jarow.getDistance( "スパゲティー", "スパゲッティー" ).should be_within(0.0001).of(0.9666)
108
+ @jarow.getDistance( "スパゲティー", "スパゲティ" ).should be_within(0.0001).of(0.9722)
109
+ @jarow.getDistance( "スティービー・ワンダー", "スピーディー・ワンダー" ).should be_within(0.0001).of(0.8561)
110
+ @jarow.getDistance( "マイケル・ジャクソン", "ジャイケル・マクソン" ).should be_within(0.0001).of(0.8000)
111
+ @jarow.getDistance( "まつもとゆきひろ", "まつもとひろゆき" ).should be_within(0.0001).of(0.9500)
112
+ @jarow.getDistance( "クライエント", "クライアント" ).should be_within(0.0001).of(0.9222)
113
+ @jarow.getDistance( "サーバー", "サーバ" ).should be_within(0.0001).of(0.9416)
114
114
  end
115
115
  end
116
116
 
117
117
 
118
118
  describe Amatch, "when use Amatch gem, results are" do
119
119
  it "should" do
120
- amatch_getDistance( "henka", "henkan" ).should be_close( 0.9666, 0.0001 ) ## amatch's result value is different from lucene version.
120
+ amatch_getDistance( "henka", "henkan" ).should be_within(0.0001).of(0.9666) ## amatch's result value is different from lucene version.
121
121
  amatch_getDistance( "al", "al" ).should == 1.0
122
- amatch_getDistance( "martha", "marhta" ).should be_close( 0.9611, 0.0001 )
123
- amatch_getDistance( "jones", "johnson" ).should be_close( 0.8323, 0.0001 )
124
- amatch_getDistance( "abcvwxyz", "cabvwxyz" ).should be_close( 0.9583, 0.0001 )
125
- amatch_getDistance( "dwayne", "duane" ).should be_close( 0.8400, 0.0001 )
126
- amatch_getDistance( "dixon", "dicksonx" ).should be_close( 0.8133, 0.0001 )
122
+ amatch_getDistance( "martha", "marhta" ).should be_within(0.0001).of(0.9611)
123
+ amatch_getDistance( "jones", "johnson" ).should be_within(0.0001).of(0.8323)
124
+ amatch_getDistance( "abcvwxyz", "cabvwxyz" ).should be_within(0.0001).of(0.9583)
125
+ amatch_getDistance( "dwayne", "duane" ).should be_within(0.0001).of(0.8400)
126
+ amatch_getDistance( "dixon", "dicksonx" ).should be_within(0.0001).of(0.8133)
127
127
  amatch_getDistance( "fvie", "ten" ).should == 0.0
128
128
  lambda {
129
129
  d1 = amatch_getDistance("zac ephron", "zac efron")
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fuzzy-string-match
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
4
+ hash: 57
5
+ prerelease:
5
6
  segments:
6
7
  - 0
7
8
  - 9
8
- - 0
9
- version: 0.9.0
9
+ - 1
10
+ version: 0.9.1
10
11
  platform: ruby
11
12
  authors:
12
13
  - Kiyoka Nishiyama
@@ -14,8 +15,7 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2010-10-13 00:00:00 +09:00
18
- default_executable:
18
+ date: 2011-07-30 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: rspec
@@ -25,6 +25,7 @@ dependencies:
25
25
  requirements:
26
26
  - - ">="
27
27
  - !ruby/object:Gem::Version
28
+ hash: 3
28
29
  segments:
29
30
  - 0
30
31
  version: "0"
@@ -38,6 +39,7 @@ dependencies:
38
39
  requirements:
39
40
  - - ">="
40
41
  - !ruby/object:Gem::Version
42
+ hash: 3
41
43
  segments:
42
44
  - 0
43
45
  version: "0"
@@ -51,6 +53,7 @@ dependencies:
51
53
  requirements:
52
54
  - - ">="
53
55
  - !ruby/object:Gem::Version
56
+ hash: 43
54
57
  segments:
55
58
  - 3
56
59
  - 8
@@ -73,13 +76,12 @@ files:
73
76
  - benchmark/vs_amatch.rb
74
77
  - lib/fuzzystringmatch.rb
75
78
  - test/fuzzystringmatch_spec.rb
76
- has_rdoc: true
77
79
  homepage: http://github.com/kiyoka/fuzzy-string-match
78
80
  licenses: []
79
81
 
80
82
  post_install_message:
81
- rdoc_options:
82
- - --charset=UTF-8
83
+ rdoc_options: []
84
+
83
85
  require_paths:
84
86
  - lib
85
87
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -87,6 +89,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
87
89
  requirements:
88
90
  - - ">="
89
91
  - !ruby/object:Gem::Version
92
+ hash: 49
90
93
  segments:
91
94
  - 1
92
95
  - 9
@@ -97,15 +100,16 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
100
  requirements:
98
101
  - - ">="
99
102
  - !ruby/object:Gem::Version
103
+ hash: 3
100
104
  segments:
101
105
  - 0
102
106
  version: "0"
103
107
  requirements: []
104
108
 
105
109
  rubyforge_project:
106
- rubygems_version: 1.3.7
110
+ rubygems_version: 1.7.2
107
111
  signing_key:
108
112
  specification_version: 3
109
113
  summary: fuzzy string matching library
110
- test_files:
111
- - test/fuzzystringmatch_spec.rb
114
+ test_files: []
115
+