jaro_winkler 1.2.4 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0fbc9d71de8c91f8e5d6ef78ac1216d6fa065f78
4
- data.tar.gz: 2cc7ae02aca3a9dfde2f6ed4659cf5957a5f156a
3
+ metadata.gz: 120dffbbfa06b3ce184be2bcd8d151e77d372e7d
4
+ data.tar.gz: 0b0e0d419aa7065f4a4600ad617e6c68d5d31446
5
5
  SHA512:
6
- metadata.gz: f57f69297ca1bdc756989784eff33cff98b0f94a2450110b056dbbb841f0ac88b63e565ec1c2b9f6d20c193346a49bb3f49ac8e07829f3d5af5b36031851a151
7
- data.tar.gz: 7e341b67320fd93177d120510183245bc2e2ba56145e5f5bbdaa0affe7142e38e8c35a3b949fda191657b5465645874fe91ff4268369ddd39185bae26b47f59f
6
+ metadata.gz: da4d765314fe0588475a59081b3a105d15888a3015c40eb6d96f31081fc037d61a6fdda1aadf0d1c79a7619cbb2a462c66e6527533c833889d1d6346c368caa7
7
+ data.tar.gz: 8a6ac9fb9af738d6e3e8a1a53daa5f8b01f7db133e7447afcc9055665ca7d787828f2f21f1b57408f5b289a1b2d7b5086eb4f2b889036d30d16905e5bd10ae24
data/README.md CHANGED
@@ -42,51 +42,61 @@ threshold | number | 0.7 | The prefix bonus is only added when the compar
42
42
 
43
43
  There is also another gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match). It uses the same algorithm and also provides both C and Ruby implementations.
44
44
 
45
- I reinvent this wheel because of the naming in `fuzzy-string-match` such as `getDistance` breaks convention, and some weird code like `a1 = s1.split( // )` (`s1.chars` could be better), furthermore, it's bugged:
46
-
47
- string 1 | string 2 | origin | fuzzy-string-match | jaro_winkler
48
- ---------- | ---------- | -------- | ------------------ | ------------------
49
- "henka" | "henkan" | 0.966667 | 0.9722 (wrong) | 0.9666666666666667
50
- "al" | "al" | 1.000000 | 1.0 | 1.0
51
- "martha" | "marhta" | 0.961111 | 0.9611 | 0.9611111111111111
52
- "jones" | "johnson" | 0.832381 | 0.8323 | 0.8323809523809523
53
- "abcvwxyz" | "cabvwxyz" | 0.958333 | 0.9583 | 0.9583333333333333
54
- "dwayne" | "duane" | 0.840000 | 0.8400 | 0.84
55
- "dixon" | "dicksonx" | 0.813333 | 0.8133 | 0.8133333333333332
56
- "fvie" | "ten" | 0.000000 | 0.0 | 0
45
+ I reinvented this wheel because the naming in `fuzzy-string-match`, such as `getDistance`, breaks convention, and it contains some odd code like `a1 = s1.split( // )` (`s1.chars` would be better). Furthermore, it's buggy (see the table below).
46
+
47
+ # Compare with other gems
48
+
49
+ | jaro_winkler | fuzzystringmatch | hotwater | amatch
50
+ ------------ | ------------ | ---------------- | -------- | ------
51
+ UTF-8 Support | Yes | Pure Ruby only | |
52
+ Native | Yes | Yes | Yes | Yes
53
+ Pure Ruby | Yes | Yes | |
54
+ Speed | Medium | Fast | Medium | Low
55
+ Bug Found | | Yes | | Yes
56
+
57
+ For `Bug Found`, I made a rake task to build the table below; the source code is in `Rakefile`:
58
+
59
+ str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater | amatch
60
+ --- | --- | --- | --- | --- | --- | ---
61
+ "henka" | "henkan" | 0.9667 | 0.9667 | **0.9722** | 0.9667 | **0.9444**
62
+ "al" | "al" | 1.0 | 1.0 | 1.0 | 1.0 | 1.0
63
+ "martha" | "marhta" | 0.9611 | 0.9611 | 0.9611 | 0.9611 | **0.9444**
64
+ "jones" | "johnson" | 0.8324 | 0.8324 | 0.8324 | 0.8324 | **0.7905**
65
+ "abcvwxyz" | "cabvwxyz" | 0.9583 | 0.9583 | 0.9583 | 0.9583 | 0.9583
66
+ "dwayne" | "duane" | 0.84 | 0.84 | 0.84 | 0.84 | **0.8222**
67
+ "dixon" | "dicksonx" | 0.8133 | 0.8133 | 0.8133 | 0.8133 | **0.7667**
68
+ "fvie" | "ten" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
57
69
 
58
70
  - The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
59
71
  - Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
60
72
 
61
73
  ## Benchmark
62
74
 
63
- - jaro_winkler (1.2.3)
64
- - fuzzy-string-match (0.9.6)
65
-
66
- ```ruby
67
- require 'benchmark'
68
- require 'jaro_winkler'
69
- require 'fuzzystringmatch'
70
- ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
71
-
72
- n = 100000
73
- Benchmark.bmbm do |x|
74
- x.report 'jaro_winkler' do
75
- n.times{ ary.each{ |str1, str2| JaroWinkler.r_distance(str1, str2) } }
76
- end
77
-
78
- x.report 'fuzzystringmatch' do
79
- jarow = FuzzyStringMatch::JaroWinkler.create(:pure)
80
- n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
81
- end
82
- end
83
- ```
75
+ ### Pure Ruby
84
76
 
85
77
  | user | system | total | real
86
- --------------- | --------- | -------- | --------- | ------------
78
+ ---------------- | --------- | -------- | --------- | ------------
87
79
  jaro_winkler | 12.750000 | 0.030000 | 12.780000 | ( 12.782842)
88
80
  fuzzystringmatch | 16.240000 | 0.030000 | 16.270000 | ( 16.287380)
89
81
 
82
+ - jaro_winkler (1.2.3)
83
+ - fuzzy-string-match (0.9.6)
84
+
85
+ ### Native
86
+
87
+ | user | system | total | real
88
+ ---------------- | -------- | -------- | -------- | ------------
89
+ jaro_winkler | 0.390000 | 0.000000 | 0.390000 | ( 0.392408)
90
+ fuzzystringmatch | 0.150000 | 0.000000 | 0.150000 | ( 0.151552)
91
+ hotwater | 0.320000 | 0.000000 | 0.320000 | ( 0.317740)
92
+ amatch | 0.960000 | 0.010000 | 0.970000 | ( 0.964803)
93
+
94
+ - jaro_winkler (1.2.3)
95
+ - fuzzy-string-match (0.9.6)
96
+ - hotwater (0.1.2)
97
+ - amatch (0.3.0)
98
+
90
99
  # Todo
91
100
 
101
+ - Make it faster
92
102
  - Adjusting word table (Reference to original C implementation.)
data/Rakefile CHANGED
@@ -1,5 +1,4 @@
1
1
  require "bundler/gem_tasks"
2
-
3
2
  require "rake/extensiontask"
4
3
 
5
4
  Rake::ExtensionTask.new("jaro_winkler") do |ext|
@@ -16,4 +15,28 @@ task :benchmark do
16
15
  puts cmd
17
16
  system(cmd)
18
17
  end
18
+ end
19
+
20
+ task :compare do
21
+ require 'jaro_winkler'
22
+ require 'fuzzystringmatch'
23
+ require 'hotwater'
24
+ require 'amatch'
25
+ @ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
26
+ table = []
27
+ table << %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch]
28
+ table << %w[--- --- --- --- --- ---]
29
+ jarow = FuzzyStringMatch::JaroWinkler.create(:native)
30
+ @ary.each do |str_1, str_2|
31
+ table << ["\"#{str_1}\"", "\"#{str_2}\"", JaroWinkler.distance(str_1, str_2).round(4), jarow.getDistance(str_1, str_2).round(4), Hotwater.jaro_winkler_distance(str_1, str_2).round(4), Amatch::Jaro.new(str_1).match(str_2).round(4)]
32
+ end
33
+ col_len = []
34
+ table.first.length.times{ |i| col_len << table.map{ |row| row[i].to_s.length }.max }
35
+ table.first.each_with_index{ |title, i| "%-#{col_len[i]}s" % title }
36
+ table.each_with_index do |row|
37
+ row.each_with_index do |col, i|
38
+ row[i] = "%-#{col_len[i]}s" % col.to_s
39
+ end
40
+ end
41
+ table.each{|row| puts row.join(' | ')}
19
42
  end
data/benchmark/native.txt CHANGED
@@ -1,5 +1,12 @@
1
1
  Rehearsal ----------------------------------------------------
2
- jaro_winkler 0.380000 0.000000 0.380000 ( 0.376303)
3
- fuzzystringmatch 0.350000 0.020000 0.370000 ( 0.369344)
4
- hotwater 0.280000 0.000000 0.280000 ( 0.281579)
5
- amatch
2
+ jaro_winkler 0.370000 0.000000 0.370000 ( 0.367923)
3
+ fuzzystringmatch 0.340000 0.030000 0.370000 ( 0.372721)
4
+ hotwater 0.310000 0.000000 0.310000 ( 0.313405)
5
+ amatch 0.970000 0.000000 0.970000 ( 0.968318)
6
+ ------------------------------------------- total: 2.020000sec
7
+
8
+ user system total real
9
+ jaro_winkler 0.390000 0.000000 0.390000 ( 0.392408)
10
+ fuzzystringmatch 0.150000 0.000000 0.150000 ( 0.151552)
11
+ hotwater 0.320000 0.000000 0.320000 ( 0.317740)
12
+ amatch 0.960000 0.010000 0.970000 ( 0.964803)
data/benchmark/pure.txt CHANGED
@@ -1,8 +1,8 @@
1
1
  Rehearsal ----------------------------------------------------
2
- jaro_winkler 12.520000 0.020000 12.540000 ( 12.548948)
3
- fuzzystringmatch 15.370000 0.020000 15.390000 ( 15.408540)
4
- ------------------------------------------ total: 27.930000sec
2
+ jaro_winkler 12.690000 0.020000 12.710000 ( 12.726929)
3
+ fuzzystringmatch 15.230000 0.020000 15.250000 ( 15.255146)
4
+ ------------------------------------------ total: 27.960000sec
5
5
 
6
6
  user system total real
7
- jaro_winkler 12.750000 0.030000 12.780000 ( 12.782842)
8
- fuzzystringmatch 16.240000 0.030000 16.270000 ( 16.287380)
7
+ jaro_winkler 12.290000 0.020000 12.310000 ( 12.308876)
8
+ fuzzystringmatch 15.460000 0.020000 15.480000 ( 15.493061)
@@ -21,8 +21,8 @@ static int char_bytes_num(char first_char){
21
21
  else return 1;
22
22
  }
23
23
 
24
- static unsigned long* codepoints(const char *str, int byte_len, int *ret_len){
25
- unsigned long *ret = calloc(byte_len, sizeof(long));
24
+ static unsigned long long* codepoints(const char *str, int byte_len, int *ret_len){
25
+ unsigned long long *ret = calloc(byte_len, sizeof(long));
26
26
  int count = 0;
27
27
  for(int i = 0; i < byte_len;){
28
28
  int bytes_num = char_bytes_num(str[i]);
@@ -40,7 +40,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
40
40
  if(!opt){ free_opt_flag = 1; opt = option_new(); }
41
41
 
42
42
  int ary_1_len, ary_2_len;
43
- unsigned long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
43
+ unsigned long long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
44
44
 
45
45
  if(opt->case_match){
46
46
  for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
@@ -49,7 +49,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
49
49
 
50
50
  // Guarantee the order
51
51
  if(ary_1_len > ary_2_len){
52
- unsigned long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
52
+ unsigned long long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
53
53
  int tmp2 = ary_1_len; ary_1_len = ary_2_len; ary_2_len = tmp2;
54
54
  }
55
55
  int window_size = ary_2_len / 2 - 1;
@@ -63,17 +63,14 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
63
63
  int right = i + window_size;
64
64
  if(left < 0) left = 0;
65
65
  if(right > max_index) right = max_index;
66
- char matched = 0;
67
- char found = 0;
66
+ char matched = 0, found = 0;
68
67
  for(int j = left; j <= right; j++){
69
68
  if(ary_1[i] == ary_2[j]){
70
69
  matched = 1;
71
- if(!found){
72
- if(j > previous_index){
73
- previous_index = j;
74
- found = 1;
75
- }
76
- } // if(!found){
70
+ if(!found && j > previous_index){
71
+ previous_index = j;
72
+ found = 1;
73
+ }
77
74
  } // if(ary_1[i] == ary_2[j]){
78
75
  } // for(int j = left; j <= right; j++){
79
76
  if(matched){
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.2.4"
2
+ VERSION = "1.2.5"
3
3
  end
data/lib/jaro_winkler.rb CHANGED
@@ -26,11 +26,9 @@ module JaroWinkler
26
26
  if c1 == c2
27
27
  matched = true
28
28
  s2_index = left + j
29
- unless found
30
- if s2_index > previous_index
31
- previous_index = s2_index
32
- found = true
33
- end
29
+ if !found && s2_index > previous_index
30
+ previous_index = s2_index
31
+ found = true
34
32
  end
35
33
  end
36
34
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.4
4
+ version: 1.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang