jaro_winkler 1.2.4 → 1.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0fbc9d71de8c91f8e5d6ef78ac1216d6fa065f78
4
- data.tar.gz: 2cc7ae02aca3a9dfde2f6ed4659cf5957a5f156a
3
+ metadata.gz: 120dffbbfa06b3ce184be2bcd8d151e77d372e7d
4
+ data.tar.gz: 0b0e0d419aa7065f4a4600ad617e6c68d5d31446
5
5
  SHA512:
6
- metadata.gz: f57f69297ca1bdc756989784eff33cff98b0f94a2450110b056dbbb841f0ac88b63e565ec1c2b9f6d20c193346a49bb3f49ac8e07829f3d5af5b36031851a151
7
- data.tar.gz: 7e341b67320fd93177d120510183245bc2e2ba56145e5f5bbdaa0affe7142e38e8c35a3b949fda191657b5465645874fe91ff4268369ddd39185bae26b47f59f
6
+ metadata.gz: da4d765314fe0588475a59081b3a105d15888a3015c40eb6d96f31081fc037d61a6fdda1aadf0d1c79a7619cbb2a462c66e6527533c833889d1d6346c368caa7
7
+ data.tar.gz: 8a6ac9fb9af738d6e3e8a1a53daa5f8b01f7db133e7447afcc9055665ca7d787828f2f21f1b57408f5b289a1b2d7b5086eb4f2b889036d30d16905e5bd10ae24
data/README.md CHANGED
@@ -42,51 +42,61 @@ threshold | number | 0.7 | The prefix bonus is only added when the compar
42
42
 
43
43
  There is also another gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match); it uses the same algorithm and provides both C and Ruby implementations.
44
44
 
45
- I reinvent this wheel because of the naming in `fuzzy-string-match` such as `getDistance` breaks convention, and some weird code like `a1 = s1.split( // )` (`s1.chars` could be better), furthermore, it's bugged:
46
-
47
- string 1 | string 2 | origin | fuzzy-string-match | jaro_winkler
48
- ---------- | ---------- | -------- | ------------------ | ------------------
49
- "henka" | "henkan" | 0.966667 | 0.9722 (wrong) | 0.9666666666666667
50
- "al" | "al" | 1.000000 | 1.0 | 1.0
51
- "martha" | "marhta" | 0.961111 | 0.9611 | 0.9611111111111111
52
- "jones" | "johnson" | 0.832381 | 0.8323 | 0.8323809523809523
53
- "abcvwxyz" | "cabvwxyz" | 0.958333 | 0.9583 | 0.9583333333333333
54
- "dwayne" | "duane" | 0.840000 | 0.8400 | 0.84
55
- "dixon" | "dicksonx" | 0.813333 | 0.8133 | 0.8133333333333332
56
- "fvie" | "ten" | 0.000000 | 0.0 | 0
45
+ I reinvented this wheel because the naming in `fuzzy-string-match` (such as `getDistance`) breaks Ruby convention, and it contains some weird code like `a1 = s1.split( // )` (`s1.chars` would be better). Furthermore, it's bugged (see the table below).
46
+
47
+ # Compare with other gems
48
+
49
+ | jaro_winkler | fuzzystringmatch | hotwater | amatch
50
+ ------------ | ------------ | ---------------- | -------- | ------
51
+ UTF-8 Support | Yes | Pure Ruby only | |
52
+ Native | Yes | Yes | Yes | Yes
53
+ Pure Ruby | Yes | Yes | |
54
+ Speed | Medium | Fast | Medium | Low
55
+ Bug Found | | Yes | | Yes
56
+
57
+ For `Bug Found`, I made a rake task to build the table below; the source code is in the `Rakefile`:
58
+
59
+ str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater | amatch
60
+ --- | --- | --- | --- | --- | --- | ---
61
+ "henka" | "henkan" | 0.9667 | 0.9667 | **0.9722** | 0.9667 | **0.9444**
62
+ "al" | "al" | 1.0 | 1.0 | 1.0 | 1.0 | 1.0
63
+ "martha" | "marhta" | 0.9611 | 0.9611 | 0.9611 | 0.9611 | **0.9444**
64
+ "jones" | "johnson" | 0.8324 | 0.8324 | 0.8324 | 0.8324 | **0.7905**
65
+ "abcvwxyz" | "cabvwxyz" | 0.9583 | 0.9583 | 0.9583 | 0.9583 | 0.9583
66
+ "dwayne" | "duane" | 0.84 | 0.84 | 0.84 | 0.84 | **0.8222**
67
+ "dixon" | "dicksonx" | 0.8133 | 0.8133 | 0.8133 | 0.8133 | **0.7667**
68
+ "fvie" | "ten" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
57
69
 
58
70
  - The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
59
71
  - Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
60
72
 
61
73
  ## Benchmark
62
74
 
63
- - jaro_winkler (1.2.3)
64
- - fuzzy-string-match (0.9.6)
65
-
66
- ```ruby
67
- require 'benchmark'
68
- require 'jaro_winkler'
69
- require 'fuzzystringmatch'
70
- ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
71
-
72
- n = 100000
73
- Benchmark.bmbm do |x|
74
- x.report 'jaro_winkler' do
75
- n.times{ ary.each{ |str1, str2| JaroWinkler.r_distance(str1, str2) } }
76
- end
77
-
78
- x.report 'fuzzystringmatch' do
79
- jarow = FuzzyStringMatch::JaroWinkler.create(:pure)
80
- n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
81
- end
82
- end
83
- ```
75
+ ### Pure Ruby
84
76
 
85
77
  | user | system | total | real
86
- --------------- | --------- | -------- | --------- | ------------
78
+ ---------------- | --------- | -------- | --------- | ------------
87
79
  jaro_winkler | 12.750000 | 0.030000 | 12.780000 | ( 12.782842)
88
80
  fuzzystringmatch | 16.240000 | 0.030000 | 16.270000 | ( 16.287380)
89
81
 
82
+ - jaro_winkler (1.2.3)
83
+ - fuzzy-string-match (0.9.6)
84
+
85
+ ### Native
86
+
87
+ | user | system | total | real
88
+ ---------------- | -------- | -------- | -------- | ------------
89
+ jaro_winkler | 0.390000 | 0.000000 | 0.390000 | ( 0.392408)
90
+ fuzzystringmatch | 0.150000 | 0.000000 | 0.150000 | ( 0.151552)
91
+ hotwater | 0.320000 | 0.000000 | 0.320000 | ( 0.317740)
92
+ amatch | 0.960000 | 0.010000 | 0.970000 | ( 0.964803)
93
+
94
+ - jaro_winkler (1.2.3)
95
+ - fuzzy-string-match (0.9.6)
96
+ - hotwater (0.1.2)
97
+ - amatch (0.3.0)
98
+
90
99
  # Todo
91
100
 
101
+ - Make it faster
92
102
  - Adjusting word table (Reference to original C implementation.)
data/Rakefile CHANGED
@@ -1,5 +1,4 @@
1
1
  require "bundler/gem_tasks"
2
-
3
2
  require "rake/extensiontask"
4
3
 
5
4
  Rake::ExtensionTask.new("jaro_winkler") do |ext|
@@ -16,4 +15,28 @@ task :benchmark do
16
15
  puts cmd
17
16
  system(cmd)
18
17
  end
18
+ end
19
+
20
+ task :compare do
21
+ require 'jaro_winkler'
22
+ require 'fuzzystringmatch'
23
+ require 'hotwater'
24
+ require 'amatch'
25
+ @ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
26
+ table = []
27
+ table << %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch]
28
+ table << %w[--- --- --- --- --- ---]
29
+ jarow = FuzzyStringMatch::JaroWinkler.create(:native)
30
+ @ary.each do |str_1, str_2|
31
+ table << ["\"#{str_1}\"", "\"#{str_2}\"", JaroWinkler.distance(str_1, str_2).round(4), jarow.getDistance(str_1, str_2).round(4), Hotwater.jaro_winkler_distance(str_1, str_2).round(4), Amatch::Jaro.new(str_1).match(str_2).round(4)]
32
+ end
33
+ col_len = []
34
+ table.first.length.times{ |i| col_len << table.map{ |row| row[i].to_s.length }.max }
35
+ table.first.each_with_index{ |title, i| "%-#{col_len[i]}s" % title }
36
+ table.each_with_index do |row|
37
+ row.each_with_index do |col, i|
38
+ row[i] = "%-#{col_len[i]}s" % col.to_s
39
+ end
40
+ end
41
+ table.each{|row| puts row.join(' | ')}
19
42
  end
data/benchmark/native.txt CHANGED
@@ -1,5 +1,12 @@
1
1
  Rehearsal ----------------------------------------------------
2
- jaro_winkler 0.380000 0.000000 0.380000 ( 0.376303)
3
- fuzzystringmatch 0.350000 0.020000 0.370000 ( 0.369344)
4
- hotwater 0.280000 0.000000 0.280000 ( 0.281579)
5
- amatch
2
+ jaro_winkler 0.370000 0.000000 0.370000 ( 0.367923)
3
+ fuzzystringmatch 0.340000 0.030000 0.370000 ( 0.372721)
4
+ hotwater 0.310000 0.000000 0.310000 ( 0.313405)
5
+ amatch 0.970000 0.000000 0.970000 ( 0.968318)
6
+ ------------------------------------------- total: 2.020000sec
7
+
8
+ user system total real
9
+ jaro_winkler 0.390000 0.000000 0.390000 ( 0.392408)
10
+ fuzzystringmatch 0.150000 0.000000 0.150000 ( 0.151552)
11
+ hotwater 0.320000 0.000000 0.320000 ( 0.317740)
12
+ amatch 0.960000 0.010000 0.970000 ( 0.964803)
data/benchmark/pure.txt CHANGED
@@ -1,8 +1,8 @@
1
1
  Rehearsal ----------------------------------------------------
2
- jaro_winkler 12.520000 0.020000 12.540000 ( 12.548948)
3
- fuzzystringmatch 15.370000 0.020000 15.390000 ( 15.408540)
4
- ------------------------------------------ total: 27.930000sec
2
+ jaro_winkler 12.690000 0.020000 12.710000 ( 12.726929)
3
+ fuzzystringmatch 15.230000 0.020000 15.250000 ( 15.255146)
4
+ ------------------------------------------ total: 27.960000sec
5
5
 
6
6
  user system total real
7
- jaro_winkler 12.750000 0.030000 12.780000 ( 12.782842)
8
- fuzzystringmatch 16.240000 0.030000 16.270000 ( 16.287380)
7
+ jaro_winkler 12.290000 0.020000 12.310000 ( 12.308876)
8
+ fuzzystringmatch 15.460000 0.020000 15.480000 ( 15.493061)
@@ -21,8 +21,8 @@ static int char_bytes_num(char first_char){
21
21
  else return 1;
22
22
  }
23
23
 
24
- static unsigned long* codepoints(const char *str, int byte_len, int *ret_len){
25
- unsigned long *ret = calloc(byte_len, sizeof(long));
24
+ static unsigned long long* codepoints(const char *str, int byte_len, int *ret_len){
25
+ unsigned long long *ret = calloc(byte_len, sizeof(long));
26
26
  int count = 0;
27
27
  for(int i = 0; i < byte_len;){
28
28
  int bytes_num = char_bytes_num(str[i]);
@@ -40,7 +40,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
40
40
  if(!opt){ free_opt_flag = 1; opt = option_new(); }
41
41
 
42
42
  int ary_1_len, ary_2_len;
43
- unsigned long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
43
+ unsigned long long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
44
44
 
45
45
  if(opt->case_match){
46
46
  for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
@@ -49,7 +49,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
49
49
 
50
50
  // Guarantee the order
51
51
  if(ary_1_len > ary_2_len){
52
- unsigned long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
52
+ unsigned long long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
53
53
  int tmp2 = ary_1_len; ary_1_len = ary_2_len; ary_2_len = tmp2;
54
54
  }
55
55
  int window_size = ary_2_len / 2 - 1;
@@ -63,17 +63,14 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
63
63
  int right = i + window_size;
64
64
  if(left < 0) left = 0;
65
65
  if(right > max_index) right = max_index;
66
- char matched = 0;
67
- char found = 0;
66
+ char matched = 0, found = 0;
68
67
  for(int j = left; j <= right; j++){
69
68
  if(ary_1[i] == ary_2[j]){
70
69
  matched = 1;
71
- if(!found){
72
- if(j > previous_index){
73
- previous_index = j;
74
- found = 1;
75
- }
76
- } // if(!found){
70
+ if(!found && j > previous_index){
71
+ previous_index = j;
72
+ found = 1;
73
+ }
77
74
  } // if(ary_1[i] == ary_2[j]){
78
75
  } // for(int j = left; j <= right; j++){
79
76
  if(matched){
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.2.4"
2
+ VERSION = "1.2.5"
3
3
  end
data/lib/jaro_winkler.rb CHANGED
@@ -26,11 +26,9 @@ module JaroWinkler
26
26
  if c1 == c2
27
27
  matched = true
28
28
  s2_index = left + j
29
- unless found
30
- if s2_index > previous_index
31
- previous_index = s2_index
32
- found = true
33
- end
29
+ if !found && s2_index > previous_index
30
+ previous_index = s2_index
31
+ found = true
34
32
  end
35
33
  end
36
34
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.4
4
+ version: 1.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang