jaro_winkler 1.2.1 → 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4a1b9c70518b0e53cf56495a67e2ffe90962b25f
4
- data.tar.gz: 407eee20cb14e8e2b3fde69ad200547b34b27ebe
3
+ metadata.gz: 7a1229af20c91c12d0f4aeb8fb13dd57cc617344
4
+ data.tar.gz: 919b78c919c60d15bdeacdfc84bd6e13952e6181
5
5
  SHA512:
6
- metadata.gz: 0ec1165eac3e38cbac6462d3920467b3bdf2e08957367f150b51553710dd147ffd3ec78774bd74408867a433bf47ca7e5a810750aaddc2bc92aa3cb7d8dc1d31
7
- data.tar.gz: 918f8cc2603b09f6a23e8df6bbdd76d3ea295988f20a57000b9382d84f8e97f7d7205d3992d2b48fa1820e3b01bcb4de8473a0d9f1a39f4735efeac441c26d71
6
+ metadata.gz: 56011763a68a13b9b04973a33b3575c768f81c6257d42284e99bcdf94c66896493cf68fad4e33338081b251fb5d1d93f0f6815e82eee7a8e33b0e97d5707826a
7
+ data.tar.gz: 79e38669e7d5d4a7582af83a76bee06b49e3b34304708ac5a0d5c8be661dbfc4196a81157b636d8ed4266645af4fcc671b6afcde506588fa3af56d8178961b92
data/README.md CHANGED
@@ -2,6 +2,10 @@
2
2
 
3
3
  It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and will fall back to a pure Ruby version in JRuby.
4
4
 
5
+ **Windows Issue**
6
+
7
+ It will fall back to the pure Ruby implementation on Windows, since the C extension can't currently be compiled there (ref [#1](https://github.com/tonytonyjan/jaro_winkler/issues/1)).
8
+
5
9
  # Installation
6
10
 
7
11
  ```
@@ -20,8 +24,8 @@ JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
20
24
  # => 0.9778
21
25
 
22
26
  # Force the strategy
23
- JaroWinkler.c_distance "MARTHA", "MARHTA"
24
- JaroWinkler.r_distance "MARTHA", "MARHTA"
27
+ JaroWinkler.c_distance "MARTHA", "MARHTA" # C extension
28
+ JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
25
29
  ```
26
30
 
27
31
  **Both implementations support UTF-8 strings.**
@@ -32,7 +36,7 @@ Name | Type | Default | Note
32
36
  ----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
33
37
  case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
34
38
  weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
35
- threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above a this.
39
+ threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
36
40
 
37
41
  # Why This?
38
42
 
data/benchmark/native.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require 'benchmark'
2
2
  require 'jaro_winkler'
3
3
  require 'fuzzystringmatch'
4
+ require 'hotwater'
5
+ require 'amatch'
4
6
  ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
5
7
 
6
8
  n = 100000
@@ -13,8 +15,18 @@ Benchmark.bmbm do |x|
13
15
  jarow = FuzzyStringMatch::JaroWinkler.create(:native)
14
16
  n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
15
17
  end
18
+
19
+ x.report 'hotwater' do
20
+ n.times{ ary.each{ |str1, str2| Hotwater.jaro_winkler_distance(str1, str2) } }
21
+ end
22
+
23
+ x.report 'amatch' do
24
+ n.times{ ary.each{ |str1, str2| Amatch::Jaro.new(str1).match(str2) } }
25
+ end
16
26
  end
17
27
 
18
28
  # user system total real
19
- # jaro_winkler 0.380000 0.000000 0.380000 ( 0.386071)
20
- # fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.138053)
29
+ # jaro_winkler 0.420000 0.000000 0.420000 ( 0.426742)
30
+ # fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.160146)
31
+ # hotwater 0.300000 0.000000 0.300000 ( 0.297350)
32
+ # amatch 0.980000 0.010000 0.990000 ( 0.982874)
@@ -55,10 +55,10 @@ double c_distance(char *s1, char *s2, Option *opt){
55
55
  }
56
56
  int window_size = ary_2_len / 2 - 1;
57
57
  if(window_size < 0) window_size = 0;
58
- double matches = 0.0;
58
+ double matches = 0.0;
59
59
  int transpositions = 0;
60
60
  int previous_index = -1;
61
- int max_index = ary_2_len - 1;
61
+ int max_index = ary_2_len - 1;
62
62
  for(int i = 0; i < ary_1_len; i++){
63
63
  int left = i - window_size;
64
64
  int right = i + window_size;
data/jaro_winkler.gemspec CHANGED
@@ -1,6 +1,7 @@
1
1
  # coding: utf-8
2
2
  lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'jaro_winkler/fallback'
4
5
  require 'jaro_winkler/version'
5
6
 
6
7
  Gem::Specification.new do |spec|
@@ -8,7 +9,7 @@ Gem::Specification.new do |spec|
8
9
  spec.version = JaroWinkler::VERSION
9
10
  spec.authors = ["Jian Weihang"]
10
11
  spec.email = ["tonytonyjan@gmail.com"]
11
- spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless RUBY_PLATFORM == 'java'
12
+ spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless JaroWinkler.fallback?
12
13
  spec.summary = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
13
14
  spec.description = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
14
15
  spec.homepage = "https://github.com/tonytonyjan/jaro_winkler"
data/lib/jaro_winkler.rb CHANGED
@@ -1,4 +1,5 @@
1
- require 'jaro_winkler/jaro_winkler.so' unless RUBY_PLATFORM == 'java'
1
+ require 'jaro_winkler/fallback'
2
+ require 'jaro_winkler/jaro_winkler.so' unless JaroWinkler.fallback?
2
3
  module JaroWinkler
3
4
  module_function
4
5
  def jaro_distance s1, s2
@@ -8,12 +9,12 @@ module JaroWinkler
8
9
  s1, s2 = s2, s1
9
10
  length1, length2 = length2, length1
10
11
  end
11
- window_size = (length2 / 2) - 1
12
- window_size = 0 if window_size < 0
13
- matches = 0.0
14
- transpositions = 0
15
- previous_index = -1
16
- max_index = length2 - 1
12
+ window_size = (length2 / 2) - 1
13
+ window_size = 0 if window_size < 0
14
+ matches = 0.0
15
+ transpositions = 0
16
+ previous_index = -1
17
+ max_index = length2 - 1
17
18
  s1.chars.each_with_index do |c1, i|
18
19
  left = i - window_size
19
20
  right = i + window_size
@@ -46,9 +47,9 @@ module JaroWinkler
46
47
  options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
47
48
  weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
48
49
  raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
49
- s1, s2 = s1.downcase, s2.downcase if case_match
50
- distance = jaro_distance(s1, s2)
51
- prefix = 0
50
+ s1, s2 = s1.downcase, s2.downcase if case_match
51
+ distance = jaro_distance(s1, s2)
52
+ prefix = 0
52
53
  max_length = [4, s1.length, s2.length].min
53
54
  s1[0, max_length].chars.each_with_index do |c1, i|
54
55
  c1 == s2[i] ? prefix += 1 : break
@@ -56,7 +57,7 @@ module JaroWinkler
56
57
  distance < threshold ? distance : distance + ((prefix * weight) * (1 - distance))
57
58
  end
58
59
 
59
- if RUBY_PLATFORM == 'java'
60
+ if JaroWinkler.fallback?
60
61
  alias :distance :r_distance
61
62
  alias :c_distance :r_distance
62
63
  module_function :distance, :c_distance
@@ -0,0 +1,7 @@
1
module JaroWinkler
  module_function

  # Reports whether the pure Ruby implementation must be used in place of
  # the C extension.
  #
  # @param platform [String] platform identifier to test; defaults to the
  #   running interpreter's RUBY_PLATFORM.
  # @return [Boolean] true when +platform+ is exactly 'java', or when it
  #   contains one of the Windows-family names matched below; false otherwise.
  def fallback?(platform = RUBY_PLATFORM)
    # TODO: Make windows compilable.
    # The pattern has to be applied with =~ : comparing a String to a Regexp
    # with == is always false, which would silently disable the Windows
    # fallback. `.nil?` is negated so the method returns a strict boolean.
    platform == 'java' || !(platform =~ /cygwin|mswin|mingw|bccwin|wince|emx/).nil?
  end
end
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.2.1"
2
+ VERSION = "1.2.2"
3
3
  end
@@ -25,28 +25,24 @@ describe JaroWinkler do
25
25
 
26
26
  it 'works' do
27
27
  @ary.each do |s1, s2, ans|
28
- expect(distance(s1, s2)).to be_within(0.0001).of(ans)
29
- end
30
- end
31
-
32
- it 'supports C extension' do
33
- @ary.each do |s1, s2, ans|
28
+ expect(r_distance(s1, s2)).to be_within(0.0001).of(ans)
34
29
  expect(c_distance(s1, s2)).to be_within(0.0001).of(ans)
35
30
  end
36
31
  end
37
32
 
38
33
  it 'works with UTF-8' do
39
- expect(distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to eq c_distance('0123456789', '01234x56789')
34
+ expect(c_distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to eq c_distance('0123456789', '01234x56789')
40
35
  end
41
36
 
42
37
  it 'can ignore case' do
43
- expect(distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
38
+ expect(r_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
44
39
  expect(c_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
45
40
  end
46
41
 
47
42
  it 'can set weight' do
48
- expect(distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
43
+ expect(r_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
49
44
  expect(c_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
50
- expect{ distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
45
+ expect{ r_distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
46
+ expect{ c_distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
51
47
  end
52
48
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-07 00:00:00.000000000 Z
11
+ date: 2014-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -76,6 +76,7 @@ files:
76
76
  - ext/jaro_winkler/strcmp95.c
77
77
  - jaro_winkler.gemspec
78
78
  - lib/jaro_winkler.rb
79
+ - lib/jaro_winkler/fallback.rb
79
80
  - lib/jaro_winkler/version.rb
80
81
  - spec/jaro_winkler_spec.rb
81
82
  - spec/spec_helper.rb