jaro_winkler 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a1a6510f807b126a00145ca02323746c1885409
4
- data.tar.gz: 3cc743dde90a2dc0f682af51b506da4d4dc8e6ed
3
+ metadata.gz: 4a1b9c70518b0e53cf56495a67e2ffe90962b25f
4
+ data.tar.gz: 407eee20cb14e8e2b3fde69ad200547b34b27ebe
5
5
  SHA512:
6
- metadata.gz: 02fec80fb44db8e6efed4b6cf083d55e4356cf706827c85850db8ef1c409a483e90c20e6de8437f0282d9621d46513a80061d1dd333d6b4b45b85d81f5c73bd5
7
- data.tar.gz: e6e43be13fb520ddfa88206488689f4329c68003b0c4c1c40b69192c6df3d7bcc323f18316f21392ebf0da0dc2fce6949d43b8d8833ad72fe59ca288ebce3132
6
+ metadata.gz: 0ec1165eac3e38cbac6462d3920467b3bdf2e08957367f150b51553710dd147ffd3ec78774bd74408867a433bf47ca7e5a810750aaddc2bc92aa3cb7d8dc1d31
7
+ data.tar.gz: 918f8cc2603b09f6a23e8df6bbdd76d3ea295988f20a57000b9382d84f8e97f7d7205d3992d2b48fa1820e3b01bcb4de8473a0d9f1a39f4735efeac441c26d71
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # About
2
2
 
3
- It's a pure Ruby implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm.
3
+ It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby.
4
4
 
5
5
  # Installation
6
6
 
@@ -19,9 +19,9 @@ JaroWinkler.distance "MARTHA", "marhta", case_match: true
19
19
  JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
20
20
  # => 0.9778
21
21
 
22
- # Native
23
- JaroWinkler.c_distance "MARTHA", "MARHTA" # Recommended one, it's 7 times faster than the latter.
24
- JaroWinkler.distance "MARTHA", "MARHTA", native: true
22
+ # Force the strategy
23
+ JaroWinkler.c_distance "MARTHA", "MARHTA"
24
+ JaroWinkler.r_distance "MARTHA", "MARHTA"
25
25
  ```
26
26
 
27
27
  **Both implementations support UTF-8 strings.**
@@ -33,7 +33,6 @@ Name | Type | Default | Note
33
33
  case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
34
34
  weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
35
35
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above this threshold.
36
- native | boolean | false | Use native version.
37
36
 
38
37
  # Why This?
39
38
 
@@ -85,5 +84,4 @@ end
85
84
 
86
85
  # Todo
87
86
 
88
- - Adjusting word table (Reference to original C implementation.)
89
- - Remove `#c_distance`, use C extension as default, and fallback to Ruby in Java platform.
87
+ - Adjusting word table (Reference to original C implementation.)
data/benchmark/native.rb CHANGED
@@ -5,21 +5,16 @@ ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'c
5
5
 
6
6
  n = 100000
7
7
  Benchmark.bmbm do |x|
8
- x.report '#c_distance(s1, s2)' do
8
+ x.report 'jaro_winkler' do
9
9
  n.times{ ary.each{ |str1, str2| JaroWinkler.c_distance(str1, str2) } }
10
10
  end
11
11
 
12
- x.report '#distance(s1, s2, native: true)' do
13
- n.times{ ary.each{ |str1, str2| JaroWinkler.distance(str1, str2, native: true) } }
14
- end
15
-
16
12
  x.report 'fuzzystringmatch' do
17
13
  jarow = FuzzyStringMatch::JaroWinkler.create(:native)
18
14
  n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
19
15
  end
20
16
  end
21
17
 
22
- # user system total real
23
- # #c_distance(s1, s2) 0.350000 0.000000 0.350000 ( 0.349109)
24
- # #distance(s1, s2, native: true) 2.480000 0.050000 2.530000 ( 2.526027)
25
- # fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.155539)
18
+ # user system total real
19
+ # jaro_winkler 0.380000 0.000000 0.380000 ( 0.386071)
20
+ # fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.138053)
data/benchmark/pure.rb CHANGED
@@ -5,8 +5,8 @@ ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'c
5
5
 
6
6
  n = 100000
7
7
  Benchmark.bmbm do |x|
8
- x.report 'jaro_winkler ' do
9
- n.times{ ary.each{ |str1, str2| JaroWinkler.distance(str1, str2) } }
8
+ x.report 'jaro_winkler' do
9
+ n.times{ ary.each{ |str1, str2| JaroWinkler.r_distance(str1, str2) } }
10
10
  end
11
11
 
12
12
  x.report 'fuzzystringmatch' do
@@ -17,6 +17,7 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
17
17
  VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
18
18
  VALUE case_match = rb_hash_aref(opt, ID2SYM(rb_intern("case_match")));
19
19
  if(!NIL_P(weight)) c_opt->weight = NUM2DBL(weight);
20
+ if(c_opt->weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
20
21
  if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
21
22
  if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
22
23
  }
data/lib/jaro_winkler.rb CHANGED
@@ -42,9 +42,8 @@ module JaroWinkler
42
42
  matches == 0 ? 0 : (matches / length1 + matches / length2 + (matches - transpositions) / matches) / 3.0
43
43
  end
44
44
 
45
- def distance s1, s2, options = {}
46
- options = {weight: 0.1, threshold: 0.7, case_match: false, native: false}.merge options
47
- return c_distance(s1, s2, options) if RUBY_PLATFORM != 'java' && options[:native]
45
+ def r_distance s1, s2, options = {}
46
+ options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
48
47
  weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
49
48
  raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
50
49
  s1, s2 = s1.downcase, s2.downcase if case_match
@@ -56,4 +55,13 @@ module JaroWinkler
56
55
  end
57
56
  distance < threshold ? distance : distance + ((prefix * weight) * (1 - distance))
58
57
  end
58
+
59
+ if RUBY_PLATFORM == 'java'
60
+ alias :distance :r_distance
61
+ alias :c_distance :r_distance
62
+ module_function :distance, :c_distance
63
+ else
64
+ alias :distance :c_distance
65
+ module_function :distance
66
+ end
59
67
  end
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.2.0"
2
+ VERSION = "1.2.1"
3
3
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'jaro_winkler'
2
3
  include JaroWinkler
3
4
  describe JaroWinkler do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang