jaro_winkler 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -7
- data/benchmark/native.rb +4 -9
- data/benchmark/pure.rb +2 -2
- data/ext/jaro_winkler/jaro_winkler.c +1 -0
- data/lib/jaro_winkler.rb +11 -3
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +1 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a1b9c70518b0e53cf56495a67e2ffe90962b25f
|
4
|
+
data.tar.gz: 407eee20cb14e8e2b3fde69ad200547b34b27ebe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0ec1165eac3e38cbac6462d3920467b3bdf2e08957367f150b51553710dd147ffd3ec78774bd74408867a433bf47ca7e5a810750aaddc2bc92aa3cb7d8dc1d31
|
7
|
+
data.tar.gz: 918f8cc2603b09f6a23e8df6bbdd76d3ea295988f20a57000b9382d84f8e97f7d7205d3992d2b48fa1820e3b01bcb4de8473a0d9f1a39f4735efeac441c26d71
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# About
|
2
2
|
|
3
|
-
It's a
|
3
|
+
It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and will fall back to a pure Ruby version in JRuby.
|
4
4
|
|
5
5
|
# Installation
|
6
6
|
|
@@ -19,9 +19,9 @@ JaroWinkler.distance "MARTHA", "marhta", case_match: true
|
|
19
19
|
JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
|
20
20
|
# => 0.9778
|
21
21
|
|
22
|
-
#
|
23
|
-
JaroWinkler.c_distance "MARTHA", "MARHTA"
|
24
|
-
JaroWinkler.
|
22
|
+
# Force the strategy
|
23
|
+
JaroWinkler.c_distance "MARTHA", "MARHTA"
|
24
|
+
JaroWinkler.r_distance "MARTHA", "MARHTA"
|
25
25
|
```
|
26
26
|
|
27
27
|
**Both implementations support UTF-8 strings.**
|
@@ -33,7 +33,6 @@ Name | Type | Default | Note
|
|
33
33
|
case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
|
34
34
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
35
35
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above this value.
|
36
|
-
native | boolean | false | Use native version.
|
37
36
|
|
38
37
|
# Why This?
|
39
38
|
|
@@ -85,5 +84,4 @@ end
|
|
85
84
|
|
86
85
|
# Todo
|
87
86
|
|
88
|
-
- Adjusting word table (Reference to original C implementation.)
|
89
|
-
- Remove `#c_distance`, use C extension as default, and fallback to Ruby in Java platform.
|
87
|
+
- Adjusting word table (Reference to original C implementation.)
|
data/benchmark/native.rb
CHANGED
@@ -5,21 +5,16 @@ ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'c
|
|
5
5
|
|
6
6
|
n = 100000
|
7
7
|
Benchmark.bmbm do |x|
|
8
|
-
x.report '
|
8
|
+
x.report 'jaro_winkler' do
|
9
9
|
n.times{ ary.each{ |str1, str2| JaroWinkler.c_distance(str1, str2) } }
|
10
10
|
end
|
11
11
|
|
12
|
-
x.report '#distance(s1, s2, native: true)' do
|
13
|
-
n.times{ ary.each{ |str1, str2| JaroWinkler.distance(str1, str2, native: true) } }
|
14
|
-
end
|
15
|
-
|
16
12
|
x.report 'fuzzystringmatch' do
|
17
13
|
jarow = FuzzyStringMatch::JaroWinkler.create(:native)
|
18
14
|
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
19
15
|
end
|
20
16
|
end
|
21
17
|
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
# fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.155539)
|
18
|
+
# user system total real
|
19
|
+
# jaro_winkler 0.380000 0.000000 0.380000 ( 0.386071)
|
20
|
+
# fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.138053)
|
data/benchmark/pure.rb
CHANGED
@@ -5,8 +5,8 @@ ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'c
|
|
5
5
|
|
6
6
|
n = 100000
|
7
7
|
Benchmark.bmbm do |x|
|
8
|
-
x.report 'jaro_winkler
|
9
|
-
n.times{ ary.each{ |str1, str2| JaroWinkler.
|
8
|
+
x.report 'jaro_winkler' do
|
9
|
+
n.times{ ary.each{ |str1, str2| JaroWinkler.r_distance(str1, str2) } }
|
10
10
|
end
|
11
11
|
|
12
12
|
x.report 'fuzzystringmatch' do
|
data/ext/jaro_winkler/jaro_winkler.c
CHANGED
@@ -17,6 +17,7 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
|
|
17
17
|
VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
|
18
18
|
VALUE case_match = rb_hash_aref(opt, ID2SYM(rb_intern("case_match")));
|
19
19
|
if(!NIL_P(weight)) c_opt->weight = NUM2DBL(weight);
|
20
|
+
if(c_opt->weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
|
20
21
|
if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
|
21
22
|
if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
|
22
23
|
}
|
data/lib/jaro_winkler.rb
CHANGED
@@ -42,9 +42,8 @@ module JaroWinkler
|
|
42
42
|
matches == 0 ? 0 : (matches / length1 + matches / length2 + (matches - transpositions) / matches) / 3.0
|
43
43
|
end
|
44
44
|
|
45
|
-
def
|
46
|
-
options = {weight: 0.1, threshold: 0.7, case_match: false
|
47
|
-
return c_distance(s1, s2, options) if RUBY_PLATFORM != 'java' && options[:native]
|
45
|
+
def r_distance s1, s2, options = {}
|
46
|
+
options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
|
48
47
|
weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
|
49
48
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
50
49
|
s1, s2 = s1.downcase, s2.downcase if case_match
|
@@ -56,4 +55,13 @@ module JaroWinkler
|
|
56
55
|
end
|
57
56
|
distance < threshold ? distance : distance + ((prefix * weight) * (1 - distance))
|
58
57
|
end
|
58
|
+
|
59
|
+
if RUBY_PLATFORM == 'java'
|
60
|
+
alias :distance :r_distance
|
61
|
+
alias :c_distance :r_distance
|
62
|
+
module_function :distance, :c_distance
|
63
|
+
else
|
64
|
+
alias :distance :c_distance
|
65
|
+
module_function :distance
|
66
|
+
end
|
59
67
|
end
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED