jaro_winkler 1.2.0 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -7
- data/benchmark/native.rb +4 -9
- data/benchmark/pure.rb +2 -2
- data/ext/jaro_winkler/jaro_winkler.c +1 -0
- data/lib/jaro_winkler.rb +11 -3
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +1 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a1b9c70518b0e53cf56495a67e2ffe90962b25f
|
4
|
+
data.tar.gz: 407eee20cb14e8e2b3fde69ad200547b34b27ebe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0ec1165eac3e38cbac6462d3920467b3bdf2e08957367f150b51553710dd147ffd3ec78774bd74408867a433bf47ca7e5a810750aaddc2bc92aa3cb7d8dc1d31
|
7
|
+
data.tar.gz: 918f8cc2603b09f6a23e8df6bbdd76d3ea295988f20a57000b9382d84f8e97f7d7205d3992d2b48fa1820e3b01bcb4de8473a0d9f1a39f4735efeac441c26d71
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# About
|
2
2
|
|
3
|
-
It's a
|
3
|
+
It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby.
|
4
4
|
|
5
5
|
# Installation
|
6
6
|
|
@@ -19,9 +19,9 @@ JaroWinkler.distance "MARTHA", "marhta", case_match: true
|
|
19
19
|
JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
|
20
20
|
# => 0.9778
|
21
21
|
|
22
|
-
#
|
23
|
-
JaroWinkler.c_distance "MARTHA", "MARHTA"
|
24
|
-
JaroWinkler.
|
22
|
+
# Force the strategy
|
23
|
+
JaroWinkler.c_distance "MARTHA", "MARHTA"
|
24
|
+
JaroWinkler.r_distance "MARTHA", "MARHTA"
|
25
25
|
```
|
26
26
|
|
27
27
|
**Both implementations support UTF-8 strings.**
|
@@ -33,7 +33,6 @@ Name | Type | Default | Note
|
|
33
33
|
case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
|
34
34
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
35
35
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above this value.
|
36
|
-
native | boolean | false | Use native version.
|
37
36
|
|
38
37
|
# Why This?
|
39
38
|
|
@@ -85,5 +84,4 @@ end
|
|
85
84
|
|
86
85
|
# Todo
|
87
86
|
|
88
|
-
- Adjusting word table (Reference to original C implementation.)
|
89
|
-
- Remove `#c_distance`, use C extension as default, and fallback to Ruby in Java platform.
|
87
|
+
- Adjusting word table (Reference to original C implementation.)
|
data/benchmark/native.rb
CHANGED
@@ -5,21 +5,16 @@ ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'c
|
|
5
5
|
|
6
6
|
n = 100000
|
7
7
|
Benchmark.bmbm do |x|
|
8
|
-
x.report '
|
8
|
+
x.report 'jaro_winkler' do
|
9
9
|
n.times{ ary.each{ |str1, str2| JaroWinkler.c_distance(str1, str2) } }
|
10
10
|
end
|
11
11
|
|
12
|
-
x.report '#distance(s1, s2, native: true)' do
|
13
|
-
n.times{ ary.each{ |str1, str2| JaroWinkler.distance(str1, str2, native: true) } }
|
14
|
-
end
|
15
|
-
|
16
12
|
x.report 'fuzzystringmatch' do
|
17
13
|
jarow = FuzzyStringMatch::JaroWinkler.create(:native)
|
18
14
|
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
19
15
|
end
|
20
16
|
end
|
21
17
|
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
# fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.155539)
|
18
|
+
# user system total real
|
19
|
+
# jaro_winkler 0.380000 0.000000 0.380000 ( 0.386071)
|
20
|
+
# fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.138053)
|
data/benchmark/pure.rb
CHANGED
@@ -5,8 +5,8 @@ ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'c
|
|
5
5
|
|
6
6
|
n = 100000
|
7
7
|
Benchmark.bmbm do |x|
|
8
|
-
x.report 'jaro_winkler
|
9
|
-
n.times{ ary.each{ |str1, str2| JaroWinkler.
|
8
|
+
x.report 'jaro_winkler' do
|
9
|
+
n.times{ ary.each{ |str1, str2| JaroWinkler.r_distance(str1, str2) } }
|
10
10
|
end
|
11
11
|
|
12
12
|
x.report 'fuzzystringmatch' do
|
data/ext/jaro_winkler/jaro_winkler.c
CHANGED
@@ -17,6 +17,7 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
|
|
17
17
|
VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
|
18
18
|
VALUE case_match = rb_hash_aref(opt, ID2SYM(rb_intern("case_match")));
|
19
19
|
if(!NIL_P(weight)) c_opt->weight = NUM2DBL(weight);
|
20
|
+
if(c_opt->weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
|
20
21
|
if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
|
21
22
|
if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
|
22
23
|
}
|
data/lib/jaro_winkler.rb
CHANGED
@@ -42,9 +42,8 @@ module JaroWinkler
|
|
42
42
|
matches == 0 ? 0 : (matches / length1 + matches / length2 + (matches - transpositions) / matches) / 3.0
|
43
43
|
end
|
44
44
|
|
45
|
-
def
|
46
|
-
options = {weight: 0.1, threshold: 0.7, case_match: false
|
47
|
-
return c_distance(s1, s2, options) if RUBY_PLATFORM != 'java' && options[:native]
|
45
|
+
def r_distance s1, s2, options = {}
|
46
|
+
options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
|
48
47
|
weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
|
49
48
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
50
49
|
s1, s2 = s1.downcase, s2.downcase if case_match
|
@@ -56,4 +55,13 @@ module JaroWinkler
|
|
56
55
|
end
|
57
56
|
distance < threshold ? distance : distance + ((prefix * weight) * (1 - distance))
|
58
57
|
end
|
58
|
+
|
59
|
+
if RUBY_PLATFORM == 'java'
|
60
|
+
alias :distance :r_distance
|
61
|
+
alias :c_distance :r_distance
|
62
|
+
module_function :distance, :c_distance
|
63
|
+
else
|
64
|
+
alias :distance :c_distance
|
65
|
+
module_function :distance
|
66
|
+
end
|
59
67
|
end
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED