jaro_winkler 1.3.6 → 1.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/jaro_winkler/jaro.c +4 -4
- data/ext/jaro_winkler/jaro.h +0 -1
- data/lib/jaro_winkler/version.rb +1 -1
- metadata +5 -24
- data/.gitignore +0 -14
- data/.rspec +0 -2
- data/.travis.yml +0 -6
- data/Gemfile +0 -2
- data/LICENSE.txt +0 -22
- data/README.md +0 -129
- data/Rakefile +0 -51
- data/benchmark/native.rb +0 -26
- data/benchmark/native.txt +0 -12
- data/benchmark/pure.rb +0 -16
- data/benchmark/pure.txt +0 -8
- data/jaro_winkler.gemspec +0 -30
- data/spec/adjusting_table_spec.rb +0 -8
- data/spec/jaro_winkler_spec.rb +0 -69
- data/spec/spec_helper.rb +0 -89
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7687918cbfcaa8ffb589f1d1383ac2e6611e235f
|
4
|
+
data.tar.gz: e8129522a42193023a77593dddfe3917eafe3f68
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: be7befc2a7e5c5a2866ba7e4995bf99a0f9d75bfab3e01c806a1d39b10b0a27975f795f2330248e96e5673a8ce81d0ef01fa12a6d18f3f2cf73ff9de28764a60
|
7
|
+
data.tar.gz: 3183fb3c534e1c1820ee16cd6a81bfcd35899ffc586e5ac064f5e4deace965d5507b83ca1d4a1fe3d8d30cc894da72eb998f1119ec3223a61c50c072d7e2029d
|
data/ext/jaro_winkler/jaro.c
CHANGED
@@ -27,10 +27,10 @@ double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str,
|
|
27
27
|
int window_size = long_codes_len/2 - 1;
|
28
28
|
if(window_size < 0) window_size = 0;
|
29
29
|
|
30
|
-
char short_codes_flag[
|
31
|
-
char long_codes_flag[
|
32
|
-
memset(short_codes_flag, 0,
|
33
|
-
memset(long_codes_flag, 0,
|
30
|
+
char short_codes_flag[short_str_len];
|
31
|
+
char long_codes_flag[long_str_len];
|
32
|
+
memset(short_codes_flag, 0, short_str_len);
|
33
|
+
memset(long_codes_flag, 0, long_str_len);
|
34
34
|
|
35
35
|
// count number of matching characters
|
36
36
|
int match_count = 0;
|
data/ext/jaro_winkler/jaro.h
CHANGED
data/lib/jaro_winkler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.6
|
4
|
+
version: 1.3.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-09-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -111,24 +111,12 @@ dependencies:
|
|
111
111
|
description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension
|
112
112
|
and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8
|
113
113
|
string.
|
114
|
-
email:
|
115
|
-
- tonytonyjan@gmail.com
|
114
|
+
email: tonytonyjan@gmail.com
|
116
115
|
executables: []
|
117
116
|
extensions:
|
118
117
|
- ext/jaro_winkler/extconf.rb
|
119
118
|
extra_rdoc_files: []
|
120
119
|
files:
|
121
|
-
- ".gitignore"
|
122
|
-
- ".rspec"
|
123
|
-
- ".travis.yml"
|
124
|
-
- Gemfile
|
125
|
-
- LICENSE.txt
|
126
|
-
- README.md
|
127
|
-
- Rakefile
|
128
|
-
- benchmark/native.rb
|
129
|
-
- benchmark/native.txt
|
130
|
-
- benchmark/pure.rb
|
131
|
-
- benchmark/pure.txt
|
132
120
|
- ext/jaro_winkler/adj_matrix.c
|
133
121
|
- ext/jaro_winkler/adj_matrix.h
|
134
122
|
- ext/jaro_winkler/code.c
|
@@ -138,14 +126,10 @@ files:
|
|
138
126
|
- ext/jaro_winkler/jaro.h
|
139
127
|
- ext/jaro_winkler/jaro_winkler.c
|
140
128
|
- ext/jaro_winkler/murmur_hash2.c
|
141
|
-
- jaro_winkler.gemspec
|
142
129
|
- lib/jaro_winkler.rb
|
143
130
|
- lib/jaro_winkler/adjusting_table.rb
|
144
131
|
- lib/jaro_winkler/fallback.rb
|
145
132
|
- lib/jaro_winkler/version.rb
|
146
|
-
- spec/adjusting_table_spec.rb
|
147
|
-
- spec/jaro_winkler_spec.rb
|
148
|
-
- spec/spec_helper.rb
|
149
133
|
homepage: https://github.com/tonytonyjan/jaro_winkler
|
150
134
|
licenses:
|
151
135
|
- MIT
|
@@ -166,12 +150,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
150
|
version: '0'
|
167
151
|
requirements: []
|
168
152
|
rubyforge_project:
|
169
|
-
rubygems_version: 2.4.
|
153
|
+
rubygems_version: 2.4.5.1
|
170
154
|
signing_key:
|
171
155
|
specification_version: 4
|
172
156
|
summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
|
173
157
|
UTF-8 string.
|
174
|
-
test_files:
|
175
|
-
- spec/adjusting_table_spec.rb
|
176
|
-
- spec/jaro_winkler_spec.rb
|
177
|
-
- spec/spec_helper.rb
|
158
|
+
test_files: []
|
data/.gitignore
DELETED
data/.rspec
DELETED
data/.travis.yml
DELETED
data/Gemfile
DELETED
data/LICENSE.txt
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
Copyright (c) 2014 Jian Weihang
|
2
|
-
|
3
|
-
MIT License
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
-
a copy of this software and associated documentation files (the
|
7
|
-
"Software"), to deal in the Software without restriction, including
|
8
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
-
permit persons to whom the Software is furnished to do so, subject to
|
11
|
-
the following conditions:
|
12
|
-
|
13
|
-
The above copyright notice and this permission notice shall be
|
14
|
-
included in all copies or substantial portions of the Software.
|
15
|
-
|
16
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
DELETED
@@ -1,129 +0,0 @@
|
|
1
|
-
[Build Status](https://travis-ci.org/tonytonyjan/jaro_winkler)
|
2
|
-
|
3
|
-
It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby. Both implementations support UTF-8 strings.
|
4
|
-
|
5
|
-
# Installation
|
6
|
-
|
7
|
-
```
|
8
|
-
gem install jaro_winkler
|
9
|
-
```
|
10
|
-
|
11
|
-
# Usage
|
12
|
-
|
13
|
-
```ruby
|
14
|
-
require 'jaro_winkler'
|
15
|
-
JaroWinkler.distance "MARTHA", "MARHTA"
|
16
|
-
# => 0.9611
|
17
|
-
JaroWinkler.distance "MARTHA", "marhta", ignore_case: true
|
18
|
-
# => 0.9611
|
19
|
-
JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
|
20
|
-
# => 0.9778
|
21
|
-
|
22
|
-
# Force the strategy
|
23
|
-
JaroWinkler.c_distance "MARTHA", "MARHTA" # C extension
|
24
|
-
JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
|
25
|
-
```
|
26
|
-
|
27
|
-
## Options
|
28
|
-
|
29
|
-
Name | Type | Default | Note
|
30
|
-
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
|
31
|
-
ignore_case | boolean | false | All lower case characters are converted to upper case prior to the comparison.
|
32
|
-
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
33
|
-
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
34
|
-
adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
|
35
|
-
|
36
|
-
# Adjusting Table
|
37
|
-
|
38
|
-
## Default Table
|
39
|
-
|
40
|
-
```
|
41
|
-
['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'],
|
42
|
-
['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'], ['S', 'Z'], ['X', 'S'],
|
43
|
-
['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'], ['2', 'Z'], ['5', 'S'], ['8', 'B'],
|
44
|
-
['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'], ['E', ' '], ['Y', ' '], ['S', ' ']
|
45
|
-
```
|
46
|
-
|
47
|
-
## How it works?
|
48
|
-
|
49
|
-
Original Formula:
|
50
|
-
|
51
|
-
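$$d_j = \begin{cases} 0 & \text{if } m = 0 \\ \frac{1}{3}\left(\frac{m}{|s_1|} + \frac{m}{|s_2|} + \frac{m - t}{m}\right) & \text{others} \end{cases}$$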
|
52
|
-
|
53
|
-
where
|
54
|
-
|
55
|
-
- `m` is the number of matching characters.
|
56
|
-
- `t` is half the number of transpositions.
|
57
|
-
|
58
|
-
With Adjusting Table:
|
59
|
-
|
60
|
-
%26%5Ctext%7Bothers%7D%5Cend%7Bcases%7D)
|
61
|
-
|
62
|
-
where
|
63
|
-
|
64
|
-
- `s` is the number of nonmatching but similar characters.
|
65
|
-
|
66
|
-
# Why This?
|
67
|
-
|
68
|
-
There is another similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which also provides both C and Ruby versions.
|
69
|
-
|
70
|
-
I reinvented this wheel because the naming in `fuzzy-string-match`, such as `getDistance`, breaks convention, and it contains some odd code like `a1 = s1.split( // )` (`s1.chars` would be better). Furthermore, it is buggy (see the tables below).
|
71
|
-
|
72
|
-
# Compare with other gems
|
73
|
-
|
74
|
-
| jaro_winkler | fuzzystringmatch | hotwater | amatch
|
75
|
-
--------------- | ------------ | ---------------- | -------- | ------
|
76
|
-
UTF-8 Support | **Yes** | Pure Ruby only | No | No
|
77
|
-
Windows Support | **Yes** | | No | **Yes**
|
78
|
-
Adjusting Table | **Yes** | No | No | No
|
79
|
-
Native | **Yes** | **Yes** | **Yes** | **Yes**
|
80
|
-
Pure Ruby | **Yes** | **Yes** | No | No
|
81
|
-
Speed | Medium | **Fast** | Medium | Slow
|
82
|
-
Bug Found | **Not Yet** | Yes | **Not Yet** | Yes
|
83
|
-
|
84
|
-
For `Bug Found`, I made a rake task to build the table below, the source code is in `Rakefile`:
|
85
|
-
|
86
|
-
str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater | amatch
|
87
|
-
--- | --- | --- | --- | --- | --- | ---
|
88
|
-
"henka" | "henkan" | 0.9667 | 0.9667 | **0.9722** | 0.9667 | **0.9444**
|
89
|
-
"al" | "al" | 1.0 | 1.0 | 1.0 | 1.0 | 1.0
|
90
|
-
"martha" | "marhta" | 0.9611 | 0.9611 | 0.9611 | 0.9611 | **0.9444**
|
91
|
-
"jones" | "johnson" | 0.8324 | 0.8324 | 0.8324 | 0.8324 | **0.7905**
|
92
|
-
"abcvwxyz" | "cabvwxyz" | 0.9583 | 0.9583 | 0.9583 | 0.9583 | 0.9583
|
93
|
-
"dwayne" | "duane" | 0.84 | 0.84 | 0.84 | 0.84 | **0.8222**
|
94
|
-
"dixon" | "dicksonx" | 0.8133 | 0.8133 | 0.8133 | 0.8133 | **0.7667**
|
95
|
-
"fvie" | "ten" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
|
96
|
-
|
97
|
-
- The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
|
98
|
-
- Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
|
99
|
-
|
100
|
-
# Benchmark
|
101
|
-
|
102
|
-
## Pure Ruby
|
103
|
-
|
104
|
-
| user | system | total | real
|
105
|
-
---------------- | -------- | -------- | -------- | ------------
|
106
|
-
jaro_winkler | 1.300000 | 0.000000 | 1.300000 | ( 1.299802)
|
107
|
-
fuzzystringmatch | 1.510000 | 0.000000 | 1.510000 | ( 1.510136)
|
108
|
-
|
109
|
-
- jaro_winkler (1.3.1)
|
110
|
-
- fuzzy-string-match (0.9.6)
|
111
|
-
|
112
|
-
## Native
|
113
|
-
|
114
|
-
| user | system | total | real
|
115
|
-
---------------- | -------- | -------- | -------- | ------------
|
116
|
-
jaro_winkler | 0.350000 | 0.010000 | 0.360000 | ( 0.345293)
|
117
|
-
fuzzystringmatch | 0.140000 | 0.000000 | 0.140000 | ( 0.138711)
|
118
|
-
hotwater | 0.310000 | 0.000000 | 0.310000 | ( 0.306498)
|
119
|
-
amatch | 0.960000 | 0.000000 | 0.960000 | ( 0.961509)
|
120
|
-
|
121
|
-
- jaro_winkler (1.3.1)
|
122
|
-
- fuzzy-string-match (0.9.6)
|
123
|
-
- hotwater (0.1.2)
|
124
|
-
- amatch (0.3.0)
|
125
|
-
|
126
|
-
# Todo
|
127
|
-
|
128
|
-
- Custom adjusting word table.
|
129
|
-
- The algorithms used by the C and Ruby versions are different.
|
data/Rakefile
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
require "bundler/gem_tasks"
|
2
|
-
require "rake/extensiontask"
|
3
|
-
require 'rspec/core/rake_task'
|
4
|
-
|
5
|
-
RSpec::Core::RakeTask.new(:spec)
|
6
|
-
Rake::ExtensionTask.new("jaro_winkler") do |ext|
|
7
|
-
ext.lib_dir = "lib/jaro_winkler"
|
8
|
-
end
|
9
|
-
|
10
|
-
task default: [:compile, :spec]
|
11
|
-
|
12
|
-
desc 'type can be "native" or "pure"'
|
13
|
-
task :benchmark, :type do |t, args|
|
14
|
-
args.with_defaults(type: :all)
|
15
|
-
ROOT_PATH = File.expand_path('..', __FILE__)
|
16
|
-
LIB_PATH = File.join(ROOT_PATH, 'lib')
|
17
|
-
BENCHMARK_PATH = File.join(ROOT_PATH, 'benchmark')
|
18
|
-
|
19
|
-
files = File.join(BENCHMARK_PATH, args[:type] == :all ? '*.rb' : "#{args[:type]}.rb")
|
20
|
-
Dir[files].each do |path|
|
21
|
-
output_path = File.join(BENCHMARK_PATH, File.basename(path, '*.rb').sub('.rb', '.txt'))
|
22
|
-
cmd = "RUBYLIB=#{LIB_PATH} ruby #{path}"
|
23
|
-
puts cmd
|
24
|
-
output = `#{cmd}`
|
25
|
-
File.write(output_path, output)
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
task :compare do
|
30
|
-
require 'jaro_winkler'
|
31
|
-
require 'fuzzystringmatch'
|
32
|
-
require 'hotwater'
|
33
|
-
require 'amatch'
|
34
|
-
@ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten'], ['San Francisco', 'Santa Monica']]
|
35
|
-
table = []
|
36
|
-
table << %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch]
|
37
|
-
table << %w[--- --- --- --- --- ---]
|
38
|
-
jarow = FuzzyStringMatch::JaroWinkler.create(:native)
|
39
|
-
@ary.each do |str_1, str_2|
|
40
|
-
table << ["\"#{str_1}\"", "\"#{str_2}\"", JaroWinkler.distance(str_1, str_2).round(4), jarow.getDistance(str_1, str_2).round(4), Hotwater.jaro_winkler_distance(str_1, str_2).round(4), Amatch::Jaro.new(str_1).match(str_2).round(4)]
|
41
|
-
end
|
42
|
-
col_len = []
|
43
|
-
table.first.length.times{ |i| col_len << table.map{ |row| row[i].to_s.length }.max }
|
44
|
-
table.first.each_with_index{ |title, i| "%-#{col_len[i]}s" % title }
|
45
|
-
table.each_with_index do |row|
|
46
|
-
row.each_with_index do |col, i|
|
47
|
-
row[i] = "%-#{col_len[i]}s" % col.to_s
|
48
|
-
end
|
49
|
-
end
|
50
|
-
table.each{|row| puts row.join(' | ')}
|
51
|
-
end
|
data/benchmark/native.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
require 'benchmark'
|
2
|
-
require 'jaro_winkler'
|
3
|
-
require 'fuzzystringmatch'
|
4
|
-
require 'hotwater'
|
5
|
-
require 'amatch'
|
6
|
-
ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
7
|
-
|
8
|
-
n = 100000
|
9
|
-
Benchmark.bmbm do |x|
|
10
|
-
x.report 'jaro_winkler' do
|
11
|
-
n.times{ ary.each{ |str1, str2| JaroWinkler.c_distance(str1, str2) } }
|
12
|
-
end
|
13
|
-
|
14
|
-
x.report 'fuzzystringmatch' do
|
15
|
-
jarow = FuzzyStringMatch::JaroWinkler.create(:native)
|
16
|
-
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
17
|
-
end
|
18
|
-
|
19
|
-
x.report 'hotwater' do
|
20
|
-
n.times{ ary.each{ |str1, str2| Hotwater.jaro_winkler_distance(str1, str2) } }
|
21
|
-
end
|
22
|
-
|
23
|
-
x.report 'amatch' do
|
24
|
-
n.times{ ary.each{ |str1, str2| Amatch::Jaro.new(str1).match(str2) } }
|
25
|
-
end
|
26
|
-
end
|
data/benchmark/native.txt
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
Rehearsal ----------------------------------------------------
|
2
|
-
jaro_winkler 0.350000 0.000000 0.350000 ( 0.348383)
|
3
|
-
fuzzystringmatch 0.330000 0.020000 0.350000 ( 0.354850)
|
4
|
-
hotwater 0.280000 0.000000 0.280000 ( 0.278819)
|
5
|
-
amatch 0.980000 0.000000 0.980000 ( 0.983325)
|
6
|
-
------------------------------------------- total: 1.960000sec
|
7
|
-
|
8
|
-
user system total real
|
9
|
-
jaro_winkler 0.330000 0.000000 0.330000 ( 0.331923)
|
10
|
-
fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.135655)
|
11
|
-
hotwater 0.280000 0.000000 0.280000 ( 0.276728)
|
12
|
-
amatch 0.930000 0.010000 0.940000 ( 0.932943)
|
data/benchmark/pure.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
require 'benchmark'
|
2
|
-
require 'jaro_winkler'
|
3
|
-
require 'fuzzystringmatch'
|
4
|
-
ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
5
|
-
|
6
|
-
n = 10000
|
7
|
-
Benchmark.bmbm do |x|
|
8
|
-
x.report 'jaro_winkler' do
|
9
|
-
n.times{ ary.each{ |str1, str2| JaroWinkler.r_distance(str1, str2) } }
|
10
|
-
end
|
11
|
-
|
12
|
-
x.report 'fuzzystringmatch' do
|
13
|
-
jarow = FuzzyStringMatch::JaroWinkler.create(:pure)
|
14
|
-
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
15
|
-
end
|
16
|
-
end
|
data/benchmark/pure.txt
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
Rehearsal ----------------------------------------------------
|
2
|
-
jaro_winkler 1.300000 0.000000 1.300000 ( 1.300723)
|
3
|
-
fuzzystringmatch 1.500000 0.010000 1.510000 ( 1.497842)
|
4
|
-
------------------------------------------- total: 2.810000sec
|
5
|
-
|
6
|
-
user system total real
|
7
|
-
jaro_winkler 1.300000 0.000000 1.300000 ( 1.299802)
|
8
|
-
fuzzystringmatch 1.510000 0.000000 1.510000 ( 1.510136)
|
data/jaro_winkler.gemspec
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require 'jaro_winkler/fallback'
|
5
|
-
require 'jaro_winkler/version'
|
6
|
-
|
7
|
-
Gem::Specification.new do |spec|
|
8
|
-
spec.name = "jaro_winkler"
|
9
|
-
spec.version = JaroWinkler::VERSION
|
10
|
-
spec.authors = ["Jian Weihang"]
|
11
|
-
spec.email = ["tonytonyjan@gmail.com"]
|
12
|
-
spec.extensions = ["ext/jaro_winkler/extconf.rb"]
|
13
|
-
spec.summary = %q{Ruby & C implementation of Jaro-Winkler distance algorithm which both support UTF-8 string.}
|
14
|
-
spec.description = %q{It's a implementation of Jaro-Winkler distance algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.}
|
15
|
-
spec.homepage = "https://github.com/tonytonyjan/jaro_winkler"
|
16
|
-
spec.license = "MIT"
|
17
|
-
|
18
|
-
spec.files = `git ls-files -z`.split("\x0")
|
19
|
-
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
20
|
-
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
21
|
-
spec.require_paths = ["lib"]
|
22
|
-
|
23
|
-
spec.add_development_dependency "bundler", "~> 1.7"
|
24
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
25
|
-
spec.add_development_dependency "rake-compiler"
|
26
|
-
spec.add_development_dependency "rspec"
|
27
|
-
spec.add_development_dependency "fuzzy-string-match"
|
28
|
-
spec.add_development_dependency "hotwater"
|
29
|
-
spec.add_development_dependency "amatch"
|
30
|
-
end
|
data/spec/jaro_winkler_spec.rb
DELETED
@@ -1,69 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'jaro_winkler'
|
3
|
-
include JaroWinkler
|
4
|
-
|
5
|
-
shared_examples 'common' do |strategy|
|
6
|
-
it 'works' do
|
7
|
-
expect(send(strategy, 'henka','henkan')).to be_within(0.0001).of(0.9667)
|
8
|
-
expect(send(strategy, 'al','al')).to be_within(0.0001).of(1.0)
|
9
|
-
expect(send(strategy, 'martha','marhta')).to be_within(0.0001).of(0.9611)
|
10
|
-
expect(send(strategy, 'jones','johnson')).to be_within(0.0001).of(0.8323)
|
11
|
-
expect(send(strategy, 'abcvwxyz','cabvwxyz')).to be_within(0.0001).of(0.9583)
|
12
|
-
expect(send(strategy, 'dwayne','duane')).to be_within(0.0001).of(0.8400)
|
13
|
-
expect(send(strategy, 'dixon','dicksonx')).to be_within(0.0001).of(0.8133)
|
14
|
-
expect(send(strategy, 'fvie','ten')).to be_within(0.0001).of(0.0)
|
15
|
-
expect(send(strategy, 'tony','tony')).to be_within(0.0001).of(1.0)
|
16
|
-
expect(send(strategy, 'tonytonyjan','tonytonyjan')).to be_within(0.0001).of(1.0)
|
17
|
-
expect(send(strategy, 'x','x')).to be_within(0.0001).of(1.0)
|
18
|
-
expect(send(strategy, '','')).to be_within(0.0001).of(0.0)
|
19
|
-
expect(send(strategy, 'tony','')).to be_within(0.0001).of(0.0)
|
20
|
-
expect(send(strategy, '','tony')).to be_within(0.0001).of(0.0)
|
21
|
-
expect(send(strategy, 'tonytonyjan','tony')).to be_within(0.0001).of(0.8727)
|
22
|
-
expect(send(strategy, 'tony','tonytonyjan')).to be_within(0.0001).of(0.8727)
|
23
|
-
end
|
24
|
-
|
25
|
-
it 'works with UTF-8' do
|
26
|
-
expect(send(strategy, '變形金剛4:絕跡重生','變形金剛4: 絕跡重生')).to be_within(0.0001).of(0.9818)
|
27
|
-
expect(send(strategy, '連勝文','連勝丼')).to be_within(0.0001).of(0.8222)
|
28
|
-
expect(send(strategy, '馬英九','馬英丸')).to be_within(0.0001).of(0.8222)
|
29
|
-
expect(send(strategy, '良い','いい')).to be_within(0.0001).of(0.6666)
|
30
|
-
end
|
31
|
-
|
32
|
-
it 'sets ignore_case' do
|
33
|
-
expect(send(strategy, 'MARTHA', 'marhta', ignore_case: true)).to be_within(0.0001).of(0.9611)
|
34
|
-
end
|
35
|
-
|
36
|
-
it 'sets weight' do
|
37
|
-
expect(send(strategy, 'MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
|
38
|
-
end
|
39
|
-
|
40
|
-
it 'sets threshold' do
|
41
|
-
expect(send(strategy, 'MARTHA', 'MARHTA', threshold: 0.99)).to be_within(0.0001).of(0.9445)
|
42
|
-
end
|
43
|
-
|
44
|
-
|
45
|
-
it 'works with adjusting table' do
|
46
|
-
expect(send(strategy, 'HENKA', 'HENKAN', adj_table: true)).to be_within(0.0001).of(0.9667) # m=5, t=0, s=0
|
47
|
-
expect(send(strategy, 'AL', 'AL', adj_table: true)).to be_within(0.0001).of(1.0) # m=2, t=0, s=0
|
48
|
-
expect(send(strategy, 'MARTHA', 'MARHTA', adj_table: true)).to be_within(0.0001).of(0.9611) # m=6, t=1, s=0
|
49
|
-
expect(send(strategy, 'JONES', 'JOHNSON', adj_table: true)).to be_within(0.0001).of(0.8598) # m=4, t=0, s=3
|
50
|
-
expect(send(strategy, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true)).to be_within(0.0001).of(0.9583) # m=8, t=1, s=0
|
51
|
-
expect(send(strategy, 'DWAYNE', 'DUANE', adj_table: true)).to be_within(0.0001).of(0.8730) # m=4, t=0, s=3
|
52
|
-
expect(send(strategy, 'DIXON', 'DICKSONX', adj_table: true)).to be_within(0.0001).of(0.8393) # m=4, t=0, s=3
|
53
|
-
expect(send(strategy, 'FVIE', 'TEN', adj_table: true)).to be_within(0.0001).of(0.0)
|
54
|
-
end
|
55
|
-
|
56
|
-
context 'with weight exceeding 0.25' do
|
57
|
-
it 'throws exception' do
|
58
|
-
expect{ send(strategy, 'MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
describe 'Pure Ruby' do
|
64
|
-
include_examples 'common', :r_distance
|
65
|
-
end
|
66
|
-
|
67
|
-
describe 'C extention' do
|
68
|
-
include_examples 'common', :c_distance
|
69
|
-
end
|
data/spec/spec_helper.rb
DELETED
@@ -1,89 +0,0 @@
|
|
1
|
-
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
-
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
-
# The generated `.rspec` file contains `--require spec_helper` which will cause this
|
4
|
-
# file to always be loaded, without a need to explicitly require it in any files.
|
5
|
-
#
|
6
|
-
# Given that it is always loaded, you are encouraged to keep this file as
|
7
|
-
# light-weight as possible. Requiring heavyweight dependencies from this file
|
8
|
-
# will add to the boot time of your test suite on EVERY test run, even for an
|
9
|
-
# individual file that may not need all of that loaded. Instead, consider making
|
10
|
-
# a separate helper file that requires the additional dependencies and performs
|
11
|
-
# the additional setup, and require it from the spec files that actually need it.
|
12
|
-
#
|
13
|
-
# The `.rspec` file also contains a few flags that are not defaults but that
|
14
|
-
# users commonly want.
|
15
|
-
#
|
16
|
-
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
17
|
-
RSpec.configure do |config|
|
18
|
-
# rspec-expectations config goes here. You can use an alternate
|
19
|
-
# assertion/expectation library such as wrong or the stdlib/minitest
|
20
|
-
# assertions if you prefer.
|
21
|
-
config.expect_with :rspec do |expectations|
|
22
|
-
# This option will default to `true` in RSpec 4. It makes the `description`
|
23
|
-
# and `failure_message` of custom matchers include text for helper methods
|
24
|
-
# defined using `chain`, e.g.:
|
25
|
-
# be_bigger_than(2).and_smaller_than(4).description
|
26
|
-
# # => "be bigger than 2 and smaller than 4"
|
27
|
-
# ...rather than:
|
28
|
-
# # => "be bigger than 2"
|
29
|
-
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
30
|
-
end
|
31
|
-
|
32
|
-
# rspec-mocks config goes here. You can use an alternate test double
|
33
|
-
# library (such as bogus or mocha) by changing the `mock_with` option here.
|
34
|
-
config.mock_with :rspec do |mocks|
|
35
|
-
# Prevents you from mocking or stubbing a method that does not exist on
|
36
|
-
# a real object. This is generally recommended, and will default to
|
37
|
-
# `true` in RSpec 4.
|
38
|
-
mocks.verify_partial_doubles = true
|
39
|
-
end
|
40
|
-
|
41
|
-
# The settings below are suggested to provide a good initial experience
|
42
|
-
# with RSpec, but feel free to customize to your heart's content.
|
43
|
-
=begin
|
44
|
-
# These two settings work together to allow you to limit a spec run
|
45
|
-
# to individual examples or groups you care about by tagging them with
|
46
|
-
# `:focus` metadata. When nothing is tagged with `:focus`, all examples
|
47
|
-
# get run.
|
48
|
-
config.filter_run :focus
|
49
|
-
config.run_all_when_everything_filtered = true
|
50
|
-
|
51
|
-
# Limits the available syntax to the non-monkey patched syntax that is recommended.
|
52
|
-
# For more details, see:
|
53
|
-
# - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
|
54
|
-
# - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
|
55
|
-
# - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
|
56
|
-
config.disable_monkey_patching!
|
57
|
-
|
58
|
-
# This setting enables warnings. It's recommended, but in some cases may
|
59
|
-
# be too noisy due to issues in dependencies.
|
60
|
-
config.warnings = true
|
61
|
-
|
62
|
-
# Many RSpec users commonly either run the entire suite or an individual
|
63
|
-
# file, and it's useful to allow more verbose output when running an
|
64
|
-
# individual spec file.
|
65
|
-
if config.files_to_run.one?
|
66
|
-
# Use the documentation formatter for detailed output,
|
67
|
-
# unless a formatter has already been configured
|
68
|
-
# (e.g. via a command-line flag).
|
69
|
-
config.default_formatter = 'doc'
|
70
|
-
end
|
71
|
-
|
72
|
-
# Print the 10 slowest examples and example groups at the
|
73
|
-
# end of the spec run, to help surface which specs are running
|
74
|
-
# particularly slow.
|
75
|
-
config.profile_examples = 10
|
76
|
-
|
77
|
-
# Run specs in random order to surface order dependencies. If you find an
|
78
|
-
# order dependency and want to debug it, you can fix the order by providing
|
79
|
-
# the seed, which is printed after each run.
|
80
|
-
# --seed 1234
|
81
|
-
config.order = :random
|
82
|
-
|
83
|
-
# Seed global randomization in this process using the `--seed` CLI option.
|
84
|
-
# Setting this allows you to use `--seed` to deterministically reproduce
|
85
|
-
# test failures related to randomization by passing the same `--seed` value
|
86
|
-
# as the one that triggered the failure.
|
87
|
-
Kernel.srand config.seed
|
88
|
-
=end
|
89
|
-
end
|