jaro_winkler 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4a1b9c70518b0e53cf56495a67e2ffe90962b25f
4
- data.tar.gz: 407eee20cb14e8e2b3fde69ad200547b34b27ebe
3
+ metadata.gz: 7a1229af20c91c12d0f4aeb8fb13dd57cc617344
4
+ data.tar.gz: 919b78c919c60d15bdeacdfc84bd6e13952e6181
5
5
  SHA512:
6
- metadata.gz: 0ec1165eac3e38cbac6462d3920467b3bdf2e08957367f150b51553710dd147ffd3ec78774bd74408867a433bf47ca7e5a810750aaddc2bc92aa3cb7d8dc1d31
7
- data.tar.gz: 918f8cc2603b09f6a23e8df6bbdd76d3ea295988f20a57000b9382d84f8e97f7d7205d3992d2b48fa1820e3b01bcb4de8473a0d9f1a39f4735efeac441c26d71
6
+ metadata.gz: 56011763a68a13b9b04973a33b3575c768f81c6257d42284e99bcdf94c66896493cf68fad4e33338081b251fb5d1d93f0f6815e82eee7a8e33b0e97d5707826a
7
+ data.tar.gz: 79e38669e7d5d4a7582af83a76bee06b49e3b34304708ac5a0d5c8be661dbfc4196a81157b636d8ed4266645af4fcc671b6afcde506588fa3af56d8178961b92
data/README.md CHANGED
@@ -2,6 +2,10 @@
2
2
 
3
3
  It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and will fall back to the pure Ruby version in JRuby.
4
4
 
5
+ **Windows Issue**
6
+
7
+ It will fall back to the pure Ruby implementation on Windows, since the C extension can't be compiled there currently. (ref [#1](https://github.com/tonytonyjan/jaro_winkler/issues/1))
8
+
5
9
  # Installation
6
10
 
7
11
  ```
@@ -20,8 +24,8 @@ JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
20
24
  # => 0.9778
21
25
 
22
26
  # Force the strategy
23
- JaroWinkler.c_distance "MARTHA", "MARHTA"
24
- JaroWinkler.r_distance "MARTHA", "MARHTA"
27
+ JaroWinkler.c_distance "MARTHA", "MARHTA" # C extension
28
+ JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
25
29
  ```
26
30
 
27
31
  **Both implementations support UTF-8 string.**
@@ -32,7 +36,7 @@ Name | Type | Default | Note
32
36
  ----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
33
37
  case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
34
38
  weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
35
- threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above a this.
39
+ threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
36
40
 
37
41
  # Why This?
38
42
 
data/benchmark/native.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require 'benchmark'
2
2
  require 'jaro_winkler'
3
3
  require 'fuzzystringmatch'
4
+ require 'hotwater'
5
+ require 'amatch'
4
6
  ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
5
7
 
6
8
  n = 100000
@@ -13,8 +15,18 @@ Benchmark.bmbm do |x|
13
15
  jarow = FuzzyStringMatch::JaroWinkler.create(:native)
14
16
  n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
15
17
  end
18
+
19
+ x.report 'hotwater' do
20
+ n.times{ ary.each{ |str1, str2| Hotwater.jaro_winkler_distance(str1, str2) } }
21
+ end
22
+
23
+ x.report 'amatch' do
24
+ n.times{ ary.each{ |str1, str2| Amatch::Jaro.new(str1).match(str2) } }
25
+ end
16
26
  end
17
27
 
18
28
  # user system total real
19
- # jaro_winkler 0.380000 0.000000 0.380000 ( 0.386071)
20
- # fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.138053)
29
+ # jaro_winkler 0.420000 0.000000 0.420000 ( 0.426742)
30
+ # fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.160146)
31
+ # hotwater 0.300000 0.000000 0.300000 ( 0.297350)
32
+ # amatch 0.980000 0.010000 0.990000 ( 0.982874)
@@ -55,10 +55,10 @@ double c_distance(char *s1, char *s2, Option *opt){
55
55
  }
56
56
  int window_size = ary_2_len / 2 - 1;
57
57
  if(window_size < 0) window_size = 0;
58
- double matches = 0.0;
58
+ double matches = 0.0;
59
59
  int transpositions = 0;
60
60
  int previous_index = -1;
61
- int max_index = ary_2_len - 1;
61
+ int max_index = ary_2_len - 1;
62
62
  for(int i = 0; i < ary_1_len; i++){
63
63
  int left = i - window_size;
64
64
  int right = i + window_size;
data/jaro_winkler.gemspec CHANGED
@@ -1,6 +1,7 @@
1
1
  # coding: utf-8
2
2
  lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'jaro_winkler/fallback'
4
5
  require 'jaro_winkler/version'
5
6
 
6
7
  Gem::Specification.new do |spec|
@@ -8,7 +9,7 @@ Gem::Specification.new do |spec|
8
9
  spec.version = JaroWinkler::VERSION
9
10
  spec.authors = ["Jian Weihang"]
10
11
  spec.email = ["tonytonyjan@gmail.com"]
11
- spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless RUBY_PLATFORM == 'java'
12
+ spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless JaroWinkler.fallback?
12
13
  spec.summary = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
13
14
  spec.description = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
14
15
  spec.homepage = "https://github.com/tonytonyjan/jaro_winkler"
data/lib/jaro_winkler.rb CHANGED
@@ -1,4 +1,5 @@
1
- require 'jaro_winkler/jaro_winkler.so' unless RUBY_PLATFORM == 'java'
1
+ require 'jaro_winkler/fallback'
2
+ require 'jaro_winkler/jaro_winkler.so' unless JaroWinkler.fallback?
2
3
  module JaroWinkler
3
4
  module_function
4
5
  def jaro_distance s1, s2
@@ -8,12 +9,12 @@ module JaroWinkler
8
9
  s1, s2 = s2, s1
9
10
  length1, length2 = length2, length1
10
11
  end
11
- window_size = (length2 / 2) - 1
12
- window_size = 0 if window_size < 0
13
- matches = 0.0
14
- transpositions = 0
15
- previous_index = -1
16
- max_index = length2 - 1
12
+ window_size = (length2 / 2) - 1
13
+ window_size = 0 if window_size < 0
14
+ matches = 0.0
15
+ transpositions = 0
16
+ previous_index = -1
17
+ max_index = length2 - 1
17
18
  s1.chars.each_with_index do |c1, i|
18
19
  left = i - window_size
19
20
  right = i + window_size
@@ -46,9 +47,9 @@ module JaroWinkler
46
47
  options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
47
48
  weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
48
49
  raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
49
- s1, s2 = s1.downcase, s2.downcase if case_match
50
- distance = jaro_distance(s1, s2)
51
- prefix = 0
50
+ s1, s2 = s1.downcase, s2.downcase if case_match
51
+ distance = jaro_distance(s1, s2)
52
+ prefix = 0
52
53
  max_length = [4, s1.length, s2.length].min
53
54
  s1[0, max_length].chars.each_with_index do |c1, i|
54
55
  c1 == s2[i] ? prefix += 1 : break
@@ -56,7 +57,7 @@ module JaroWinkler
56
57
  distance < threshold ? distance : distance + ((prefix * weight) * (1 - distance))
57
58
  end
58
59
 
59
- if RUBY_PLATFORM == 'java'
60
+ if JaroWinkler.fallback?
60
61
  alias :distance :r_distance
61
62
  alias :c_distance :r_distance
62
63
  module_function :distance, :c_distance
@@ -0,0 +1,7 @@
# Platform check used to decide between the C extension and the pure Ruby
# implementation.
module JaroWinkler
  module_function

  # Reports whether the pure Ruby implementation should be used instead of
  # the C extension on the current platform.
  #
  # The check is true on JRuby (RUBY_PLATFORM is exactly 'java') and on
  # Windows-family platforms, recognised by a substring of RUBY_PLATFORM.
  #
  # @return [Boolean] true when the C extension should be skipped.
  def fallback?
    # TODO: Make windows compilable.
    # `=~` performs the pattern match; comparing a String to a Regexp with
    # `==` is always false, which would disable the Windows branch entirely.
    # The double negation keeps the return value a strict true/false.
    RUBY_PLATFORM == 'java' || !!(RUBY_PLATFORM =~ /cygwin|mswin|mingw|bccwin|wince|emx/)
  end
end
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.2.1"
2
+ VERSION = "1.2.2"
3
3
  end
@@ -25,28 +25,24 @@ describe JaroWinkler do
25
25
 
26
26
  it 'works' do
27
27
  @ary.each do |s1, s2, ans|
28
- expect(distance(s1, s2)).to be_within(0.0001).of(ans)
29
- end
30
- end
31
-
32
- it 'supports C extension' do
33
- @ary.each do |s1, s2, ans|
28
+ expect(r_distance(s1, s2)).to be_within(0.0001).of(ans)
34
29
  expect(c_distance(s1, s2)).to be_within(0.0001).of(ans)
35
30
  end
36
31
  end
37
32
 
38
33
  it 'works with UTF-8' do
39
- expect(distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to eq c_distance('0123456789', '01234x56789')
34
+ expect(c_distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to eq c_distance('0123456789', '01234x56789')
40
35
  end
41
36
 
42
37
  it 'can ignore case' do
43
- expect(distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
38
+ expect(r_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
44
39
  expect(c_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
45
40
  end
46
41
 
47
42
  it 'can set weight' do
48
- expect(distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
43
+ expect(r_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
49
44
  expect(c_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
50
- expect{ distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
45
+ expect{ r_distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
46
+ expect{ c_distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
51
47
  end
52
48
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-07 00:00:00.000000000 Z
11
+ date: 2014-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -76,6 +76,7 @@ files:
76
76
  - ext/jaro_winkler/strcmp95.c
77
77
  - jaro_winkler.gemspec
78
78
  - lib/jaro_winkler.rb
79
+ - lib/jaro_winkler/fallback.rb
79
80
  - lib/jaro_winkler/version.rb
80
81
  - spec/jaro_winkler_spec.rb
81
82
  - spec/spec_helper.rb