jaro_winkler 1.2.1 → 1.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +7 -3
- data/benchmark/native.rb +14 -2
- data/ext/jaro_winkler/distance.c +2 -2
- data/jaro_winkler.gemspec +2 -1
- data/lib/jaro_winkler.rb +12 -11
- data/lib/jaro_winkler/fallback.rb +7 -0
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +6 -10
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a1229af20c91c12d0f4aeb8fb13dd57cc617344
|
4
|
+
data.tar.gz: 919b78c919c60d15bdeacdfc84bd6e13952e6181
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 56011763a68a13b9b04973a33b3575c768f81c6257d42284e99bcdf94c66896493cf68fad4e33338081b251fb5d1d93f0f6815e82eee7a8e33b0e97d5707826a
|
7
|
+
data.tar.gz: 79e38669e7d5d4a7582af83a76bee06b49e3b34304708ac5a0d5c8be661dbfc4196a81157b636d8ed4266645af4fcc671b6afcde506588fa3af56d8178961b92
|
data/README.md
CHANGED
@@ -2,6 +2,10 @@
|
|
2
2
|
|
3
3
|
It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and will fall back to a pure Ruby version in JRuby.
|
4
4
|
|
5
|
+
**Windows Issue**
|
6
|
+
|
7
|
+
It will fall back to the pure Ruby implementation on Windows, since the C extension currently can't be compiled there. (ref [#1](https://github.com/tonytonyjan/jaro_winkler/issues/1))
|
8
|
+
|
5
9
|
# Installation
|
6
10
|
|
7
11
|
```
|
@@ -20,8 +24,8 @@ JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
|
|
20
24
|
# => 0.9778
|
21
25
|
|
22
26
|
# Force the strategy
|
23
|
-
JaroWinkler.c_distance "MARTHA", "MARHTA"
|
24
|
-
JaroWinkler.r_distance "MARTHA", "MARHTA"
|
27
|
+
JaroWinkler.c_distance "MARTHA", "MARHTA" # C extension
|
28
|
+
JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
|
25
29
|
```
|
26
30
|
|
27
31
|
**Both implementations support UTF-8 strings.**
|
@@ -32,7 +36,7 @@ Name | Type | Default | Note
|
|
32
36
|
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
|
33
37
|
case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
|
34
38
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
35
|
-
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above
|
39
|
+
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
36
40
|
|
37
41
|
# Why This?
|
38
42
|
|
data/benchmark/native.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'benchmark'
|
2
2
|
require 'jaro_winkler'
|
3
3
|
require 'fuzzystringmatch'
|
4
|
+
require 'hotwater'
|
5
|
+
require 'amatch'
|
4
6
|
ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
5
7
|
|
6
8
|
n = 100000
|
@@ -13,8 +15,18 @@ Benchmark.bmbm do |x|
|
|
13
15
|
jarow = FuzzyStringMatch::JaroWinkler.create(:native)
|
14
16
|
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
15
17
|
end
|
18
|
+
|
19
|
+
x.report 'hotwater' do
|
20
|
+
n.times{ ary.each{ |str1, str2| Hotwater.jaro_winkler_distance(str1, str2) } }
|
21
|
+
end
|
22
|
+
|
23
|
+
x.report 'amatch' do
|
24
|
+
n.times{ ary.each{ |str1, str2| Amatch::Jaro.new(str1).match(str2) } }
|
25
|
+
end
|
16
26
|
end
|
17
27
|
|
18
28
|
# user system total real
|
19
|
-
# jaro_winkler 0.
|
20
|
-
# fuzzystringmatch 0.
|
29
|
+
# jaro_winkler 0.420000 0.000000 0.420000 ( 0.426742)
|
30
|
+
# fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.160146)
|
31
|
+
# hotwater 0.300000 0.000000 0.300000 ( 0.297350)
|
32
|
+
# amatch 0.980000 0.010000 0.990000 ( 0.982874)
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -55,10 +55,10 @@ double c_distance(char *s1, char *s2, Option *opt){
|
|
55
55
|
}
|
56
56
|
int window_size = ary_2_len / 2 - 1;
|
57
57
|
if(window_size < 0) window_size = 0;
|
58
|
-
double matches
|
58
|
+
double matches = 0.0;
|
59
59
|
int transpositions = 0;
|
60
60
|
int previous_index = -1;
|
61
|
-
int max_index
|
61
|
+
int max_index = ary_2_len - 1;
|
62
62
|
for(int i = 0; i < ary_1_len; i++){
|
63
63
|
int left = i - window_size;
|
64
64
|
int right = i + window_size;
|
data/jaro_winkler.gemspec
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
lib = File.expand_path('../lib', __FILE__)
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'jaro_winkler/fallback'
|
4
5
|
require 'jaro_winkler/version'
|
5
6
|
|
6
7
|
Gem::Specification.new do |spec|
|
@@ -8,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
8
9
|
spec.version = JaroWinkler::VERSION
|
9
10
|
spec.authors = ["Jian Weihang"]
|
10
11
|
spec.email = ["tonytonyjan@gmail.com"]
|
11
|
-
spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless
|
12
|
+
spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless JaroWinkler.fallback?
|
12
13
|
spec.summary = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
|
13
14
|
spec.description = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
|
14
15
|
spec.homepage = "https://github.com/tonytonyjan/jaro_winkler"
|
data/lib/jaro_winkler.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
require 'jaro_winkler/
|
1
|
+
require 'jaro_winkler/fallback'
|
2
|
+
require 'jaro_winkler/jaro_winkler.so' unless JaroWinkler.fallback?
|
2
3
|
module JaroWinkler
|
3
4
|
module_function
|
4
5
|
def jaro_distance s1, s2
|
@@ -8,12 +9,12 @@ module JaroWinkler
|
|
8
9
|
s1, s2 = s2, s1
|
9
10
|
length1, length2 = length2, length1
|
10
11
|
end
|
11
|
-
window_size
|
12
|
-
window_size
|
13
|
-
matches
|
14
|
-
transpositions
|
15
|
-
previous_index
|
16
|
-
max_index
|
12
|
+
window_size = (length2 / 2) - 1
|
13
|
+
window_size = 0 if window_size < 0
|
14
|
+
matches = 0.0
|
15
|
+
transpositions = 0
|
16
|
+
previous_index = -1
|
17
|
+
max_index = length2 - 1
|
17
18
|
s1.chars.each_with_index do |c1, i|
|
18
19
|
left = i - window_size
|
19
20
|
right = i + window_size
|
@@ -46,9 +47,9 @@ module JaroWinkler
|
|
46
47
|
options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
|
47
48
|
weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
|
48
49
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
49
|
-
s1, s2
|
50
|
-
distance
|
51
|
-
prefix
|
50
|
+
s1, s2 = s1.downcase, s2.downcase if case_match
|
51
|
+
distance = jaro_distance(s1, s2)
|
52
|
+
prefix = 0
|
52
53
|
max_length = [4, s1.length, s2.length].min
|
53
54
|
s1[0, max_length].chars.each_with_index do |c1, i|
|
54
55
|
c1 == s2[i] ? prefix += 1 : break
|
@@ -56,7 +57,7 @@ module JaroWinkler
|
|
56
57
|
distance < threshold ? distance : distance + ((prefix * weight) * (1 - distance))
|
57
58
|
end
|
58
59
|
|
59
|
-
if
|
60
|
+
if JaroWinkler.fallback?
|
60
61
|
alias :distance :r_distance
|
61
62
|
alias :c_distance :r_distance
|
62
63
|
module_function :distance, :c_distance
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -25,28 +25,24 @@ describe JaroWinkler do
|
|
25
25
|
|
26
26
|
it 'works' do
|
27
27
|
@ary.each do |s1, s2, ans|
|
28
|
-
expect(
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
it 'supports C extension' do
|
33
|
-
@ary.each do |s1, s2, ans|
|
28
|
+
expect(r_distance(s1, s2)).to be_within(0.0001).of(ans)
|
34
29
|
expect(c_distance(s1, s2)).to be_within(0.0001).of(ans)
|
35
30
|
end
|
36
31
|
end
|
37
32
|
|
38
33
|
it 'works with UTF-8' do
|
39
|
-
expect(
|
34
|
+
expect(c_distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to eq c_distance('0123456789', '01234x56789')
|
40
35
|
end
|
41
36
|
|
42
37
|
it 'can ignore case' do
|
43
|
-
expect(
|
38
|
+
expect(r_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
|
44
39
|
expect(c_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
|
45
40
|
end
|
46
41
|
|
47
42
|
it 'can set weight' do
|
48
|
-
expect(
|
43
|
+
expect(r_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
|
49
44
|
expect(c_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
|
50
|
-
expect{
|
45
|
+
expect{ r_distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
|
46
|
+
expect{ c_distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
|
51
47
|
end
|
52
48
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -76,6 +76,7 @@ files:
|
|
76
76
|
- ext/jaro_winkler/strcmp95.c
|
77
77
|
- jaro_winkler.gemspec
|
78
78
|
- lib/jaro_winkler.rb
|
79
|
+
- lib/jaro_winkler/fallback.rb
|
79
80
|
- lib/jaro_winkler/version.rb
|
80
81
|
- spec/jaro_winkler_spec.rb
|
81
82
|
- spec/spec_helper.rb
|