jaro_winkler 1.2.0 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a1a6510f807b126a00145ca02323746c1885409
4
- data.tar.gz: 3cc743dde90a2dc0f682af51b506da4d4dc8e6ed
3
+ metadata.gz: 4a1b9c70518b0e53cf56495a67e2ffe90962b25f
4
+ data.tar.gz: 407eee20cb14e8e2b3fde69ad200547b34b27ebe
5
5
  SHA512:
6
- metadata.gz: 02fec80fb44db8e6efed4b6cf083d55e4356cf706827c85850db8ef1c409a483e90c20e6de8437f0282d9621d46513a80061d1dd333d6b4b45b85d81f5c73bd5
7
- data.tar.gz: e6e43be13fb520ddfa88206488689f4329c68003b0c4c1c40b69192c6df3d7bcc323f18316f21392ebf0da0dc2fce6949d43b8d8833ad72fe59ca288ebce3132
6
+ metadata.gz: 0ec1165eac3e38cbac6462d3920467b3bdf2e08957367f150b51553710dd147ffd3ec78774bd74408867a433bf47ca7e5a810750aaddc2bc92aa3cb7d8dc1d31
7
+ data.tar.gz: 918f8cc2603b09f6a23e8df6bbdd76d3ea295988f20a57000b9382d84f8e97f7d7205d3992d2b48fa1820e3b01bcb4de8473a0d9f1a39f4735efeac441c26d71
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # About
2
2
 
3
- It's a pure Ruby implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm.
3
+ It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby.
4
4
 
5
5
  # Installation
6
6
 
@@ -19,9 +19,9 @@ JaroWinkler.distance "MARTHA", "marhta", case_match: true
19
19
  JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
20
20
  # => 0.9778
21
21
 
22
- # Native
23
- JaroWinkler.c_distance "MARTHA", "MARHTA" # Recommended one, it's 7 times faster than the latter.
24
- JaroWinkler.distance "MARTHA", "MARHTA", native: true
22
+ # Force the strategy
23
+ JaroWinkler.c_distance "MARTHA", "MARHTA"
24
+ JaroWinkler.r_distance "MARTHA", "MARHTA"
25
25
  ```
26
26
 
27
27
  **Both implementations support UTF-8 strings.**
@@ -33,7 +33,6 @@ Name | Type | Default | Note
33
33
  case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
34
34
  weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
35
35
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above this threshold.
36
- native | boolean | false | Use native version.
37
36
 
38
37
  # Why This?
39
38
 
@@ -85,5 +84,4 @@ end
85
84
 
86
85
  # Todo
87
86
 
88
- - Adjusting word table (Reference to original C implementation.)
89
- - Remove `#c_distance`, use C extension as default, and fallback to Ruby in Java platform.
87
+ - Adjusting word table (Reference to original C implementation.)
data/benchmark/native.rb CHANGED
@@ -5,21 +5,16 @@ ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'c
5
5
 
6
6
  n = 100000
7
7
  Benchmark.bmbm do |x|
8
- x.report '#c_distance(s1, s2)' do
8
+ x.report 'jaro_winkler' do
9
9
  n.times{ ary.each{ |str1, str2| JaroWinkler.c_distance(str1, str2) } }
10
10
  end
11
11
 
12
- x.report '#distance(s1, s2, native: true)' do
13
- n.times{ ary.each{ |str1, str2| JaroWinkler.distance(str1, str2, native: true) } }
14
- end
15
-
16
12
  x.report 'fuzzystringmatch' do
17
13
  jarow = FuzzyStringMatch::JaroWinkler.create(:native)
18
14
  n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
19
15
  end
20
16
  end
21
17
 
22
- # user system total real
23
- # #c_distance(s1, s2) 0.350000 0.000000 0.350000 ( 0.349109)
24
- # #distance(s1, s2, native: true) 2.480000 0.050000 2.530000 ( 2.526027)
25
- # fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.155539)
18
+ # user system total real
19
+ # jaro_winkler 0.380000 0.000000 0.380000 ( 0.386071)
20
+ # fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.138053)
data/benchmark/pure.rb CHANGED
@@ -5,8 +5,8 @@ ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'c
5
5
 
6
6
  n = 100000
7
7
  Benchmark.bmbm do |x|
8
- x.report 'jaro_winkler ' do
9
- n.times{ ary.each{ |str1, str2| JaroWinkler.distance(str1, str2) } }
8
+ x.report 'jaro_winkler' do
9
+ n.times{ ary.each{ |str1, str2| JaroWinkler.r_distance(str1, str2) } }
10
10
  end
11
11
 
12
12
  x.report 'fuzzystringmatch' do
@@ -17,6 +17,7 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
17
17
  VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
18
18
  VALUE case_match = rb_hash_aref(opt, ID2SYM(rb_intern("case_match")));
19
19
  if(!NIL_P(weight)) c_opt->weight = NUM2DBL(weight);
20
+ if(c_opt->weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
20
21
  if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
21
22
  if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
22
23
  }
data/lib/jaro_winkler.rb CHANGED
@@ -42,9 +42,8 @@ module JaroWinkler
42
42
  matches == 0 ? 0 : (matches / length1 + matches / length2 + (matches - transpositions) / matches) / 3.0
43
43
  end
44
44
 
45
- def distance s1, s2, options = {}
46
- options = {weight: 0.1, threshold: 0.7, case_match: false, native: false}.merge options
47
- return c_distance(s1, s2, options) if RUBY_PLATFORM != 'java' && options[:native]
45
+ def r_distance s1, s2, options = {}
46
+ options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
48
47
  weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
49
48
  raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
50
49
  s1, s2 = s1.downcase, s2.downcase if case_match
@@ -56,4 +55,13 @@ module JaroWinkler
56
55
  end
57
56
  distance < threshold ? distance : distance + ((prefix * weight) * (1 - distance))
58
57
  end
58
+
59
+ if RUBY_PLATFORM == 'java'
60
+ alias :distance :r_distance
61
+ alias :c_distance :r_distance
62
+ module_function :distance, :c_distance
63
+ else
64
+ alias :distance :c_distance
65
+ module_function :distance
66
+ end
59
67
  end
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.2.0"
2
+ VERSION = "1.2.1"
3
3
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'jaro_winkler'
2
3
  include JaroWinkler
3
4
  describe JaroWinkler do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang