jaro_winkler 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +44 -34
- data/Rakefile +24 -1
- data/benchmark/native.txt +11 -4
- data/benchmark/pure.txt +5 -5
- data/ext/jaro_winkler/distance.c +9 -12
- data/lib/jaro_winkler/version.rb +1 -1
- data/lib/jaro_winkler.rb +3 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 120dffbbfa06b3ce184be2bcd8d151e77d372e7d
|
4
|
+
data.tar.gz: 0b0e0d419aa7065f4a4600ad617e6c68d5d31446
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: da4d765314fe0588475a59081b3a105d15888a3015c40eb6d96f31081fc037d61a6fdda1aadf0d1c79a7619cbb2a462c66e6527533c833889d1d6346c368caa7
|
7
|
+
data.tar.gz: 8a6ac9fb9af738d6e3e8a1a53daa5f8b01f7db133e7447afcc9055665ca7d787828f2f21f1b57408f5b289a1b2d7b5086eb4f2b889036d30d16905e5bd10ae24
|
data/README.md
CHANGED
@@ -42,51 +42,61 @@ threshold | number | 0.7 | The prefix bonus is only added when the compar
|
|
42
42
|
|
43
43
|
There is also another gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match); it uses the same algorithm and provides both C and Ruby implementations.
|
44
44
|
|
45
|
-
I reinvent this wheel because of the naming in `fuzzy-string-match` such as `getDistance` breaks convention, and some weird code like `a1 = s1.split( // )` (`s1.chars` could be better), furthermore, it's bugged
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
45
|
+
I reinvented this wheel because the naming in `fuzzy-string-match`, such as `getDistance`, breaks Ruby convention, and because it contains some weird code like `a1 = s1.split( // )` (`s1.chars` would be better). Furthermore, it's buggy (see the table below).
|
46
|
+
|
47
|
+
# Compare with other gems
|
48
|
+
|
49
|
+
| jaro_winkler | fuzzystringmatch | hotwater | amatch
|
50
|
+
------------ | ------------ | ---------------- | -------- | ------
|
51
|
+
UTF-8 Support | Yes | Pure Ruby only | |
|
52
|
+
Native | Yes | Yes | Yes | Yes
|
53
|
+
Pure Ruby | Yes | Yes | |
|
54
|
+
Speed | Medium | Fast | Medium | Low
|
55
|
+
Bug Found | | Yes | | Yes
|
56
|
+
|
57
|
+
For `Bug Found`, I made a rake task to build the table below; the source code is in the `Rakefile`:
|
58
|
+
|
59
|
+
str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater | amatch
|
60
|
+
--- | --- | --- | --- | --- | --- | ---
|
61
|
+
"henka" | "henkan" | 0.9667 | 0.9667 | **0.9722** | 0.9667 | **0.9444**
|
62
|
+
"al" | "al" | 1.0 | 1.0 | 1.0 | 1.0 | 1.0
|
63
|
+
"martha" | "marhta" | 0.9611 | 0.9611 | 0.9611 | 0.9611 | **0.9444**
|
64
|
+
"jones" | "johnson" | 0.8324 | 0.8324 | 0.8324 | 0.8324 | **0.7905**
|
65
|
+
"abcvwxyz" | "cabvwxyz" | 0.9583 | 0.9583 | 0.9583 | 0.9583 | 0.9583
|
66
|
+
"dwayne" | "duane" | 0.84 | 0.84 | 0.84 | 0.84 | **0.8222**
|
67
|
+
"dixon" | "dicksonx" | 0.8133 | 0.8133 | 0.8133 | 0.8133 | **0.7667**
|
68
|
+
"fvie" | "ten" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
|
57
69
|
|
58
70
|
- The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
|
59
71
|
- Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
|
60
72
|
|
61
73
|
## Benchmark
|
62
74
|
|
63
|
-
|
64
|
-
- fuzzy-string-match (0.9.6)
|
65
|
-
|
66
|
-
```ruby
|
67
|
-
require 'benchmark'
|
68
|
-
require 'jaro_winkler'
|
69
|
-
require 'fuzzystringmatch'
|
70
|
-
ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
71
|
-
|
72
|
-
n = 100000
|
73
|
-
Benchmark.bmbm do |x|
|
74
|
-
x.report 'jaro_winkler' do
|
75
|
-
n.times{ ary.each{ |str1, str2| JaroWinkler.r_distance(str1, str2) } }
|
76
|
-
end
|
77
|
-
|
78
|
-
x.report 'fuzzystringmatch' do
|
79
|
-
jarow = FuzzyStringMatch::JaroWinkler.create(:pure)
|
80
|
-
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
81
|
-
end
|
82
|
-
end
|
83
|
-
```
|
75
|
+
### Pure Ruby
|
84
76
|
|
85
77
|
| user | system | total | real
|
86
|
-
|
78
|
+
---------------- | --------- | -------- | --------- | ------------
|
87
79
|
jaro_winkler | 12.750000 | 0.030000 | 12.780000 | ( 12.782842)
|
88
80
|
fuzzystringmatch | 16.240000 | 0.030000 | 16.270000 | ( 16.287380)
|
89
81
|
|
82
|
+
- jaro_winkler (1.2.3)
|
83
|
+
- fuzzy-string-match (0.9.6)
|
84
|
+
|
85
|
+
### Native
|
86
|
+
|
87
|
+
| user | system | total | real
|
88
|
+
---------------- | -------- | -------- | -------- | ------------
|
89
|
+
jaro_winkler | 0.390000 | 0.000000 | 0.390000 | ( 0.392408)
|
90
|
+
fuzzystringmatch | 0.150000 | 0.000000 | 0.150000 | ( 0.151552)
|
91
|
+
hotwater | 0.320000 | 0.000000 | 0.320000 | ( 0.317740)
|
92
|
+
amatch | 0.960000 | 0.010000 | 0.970000 | ( 0.964803)
|
93
|
+
|
94
|
+
- jaro_winkler (1.2.3)
|
95
|
+
- fuzzy-string-match (0.9.6)
|
96
|
+
- hotwater (0.1.2)
|
97
|
+
- amatch (0.3.0)
|
98
|
+
|
90
99
|
# Todo
|
91
100
|
|
101
|
+
- Make it faster
|
92
102
|
- Adjusting word table (refer to the original C implementation).
|
data/Rakefile
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
-
|
3
2
|
require "rake/extensiontask"
|
4
3
|
|
5
4
|
Rake::ExtensionTask.new("jaro_winkler") do |ext|
|
@@ -16,4 +15,28 @@ task :benchmark do
|
|
16
15
|
puts cmd
|
17
16
|
system(cmd)
|
18
17
|
end
|
18
|
+
end
|
19
|
+
|
20
|
+
task :compare do
|
21
|
+
require 'jaro_winkler'
|
22
|
+
require 'fuzzystringmatch'
|
23
|
+
require 'hotwater'
|
24
|
+
require 'amatch'
|
25
|
+
@ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
26
|
+
table = []
|
27
|
+
table << %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch]
|
28
|
+
table << %w[--- --- --- --- --- ---]
|
29
|
+
jarow = FuzzyStringMatch::JaroWinkler.create(:native)
|
30
|
+
@ary.each do |str_1, str_2|
|
31
|
+
table << ["\"#{str_1}\"", "\"#{str_2}\"", JaroWinkler.distance(str_1, str_2).round(4), jarow.getDistance(str_1, str_2).round(4), Hotwater.jaro_winkler_distance(str_1, str_2).round(4), Amatch::Jaro.new(str_1).match(str_2).round(4)]
|
32
|
+
end
|
33
|
+
col_len = []
|
34
|
+
table.first.length.times{ |i| col_len << table.map{ |row| row[i].to_s.length }.max }
|
35
|
+
table.first.each_with_index{ |title, i| "%-#{col_len[i]}s" % title }
|
36
|
+
table.each_with_index do |row|
|
37
|
+
row.each_with_index do |col, i|
|
38
|
+
row[i] = "%-#{col_len[i]}s" % col.to_s
|
39
|
+
end
|
40
|
+
end
|
41
|
+
table.each{|row| puts row.join(' | ')}
|
19
42
|
end
|
data/benchmark/native.txt
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
Rehearsal ----------------------------------------------------
|
2
|
-
jaro_winkler 0.
|
3
|
-
fuzzystringmatch 0.
|
4
|
-
hotwater 0.
|
5
|
-
amatch
|
2
|
+
jaro_winkler 0.370000 0.000000 0.370000 ( 0.367923)
|
3
|
+
fuzzystringmatch 0.340000 0.030000 0.370000 ( 0.372721)
|
4
|
+
hotwater 0.310000 0.000000 0.310000 ( 0.313405)
|
5
|
+
amatch 0.970000 0.000000 0.970000 ( 0.968318)
|
6
|
+
------------------------------------------- total: 2.020000sec
|
7
|
+
|
8
|
+
user system total real
|
9
|
+
jaro_winkler 0.390000 0.000000 0.390000 ( 0.392408)
|
10
|
+
fuzzystringmatch 0.150000 0.000000 0.150000 ( 0.151552)
|
11
|
+
hotwater 0.320000 0.000000 0.320000 ( 0.317740)
|
12
|
+
amatch 0.960000 0.010000 0.970000 ( 0.964803)
|
data/benchmark/pure.txt
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
Rehearsal ----------------------------------------------------
|
2
|
-
jaro_winkler 12.
|
3
|
-
fuzzystringmatch 15.
|
4
|
-
------------------------------------------ total: 27.
|
2
|
+
jaro_winkler 12.690000 0.020000 12.710000 ( 12.726929)
|
3
|
+
fuzzystringmatch 15.230000 0.020000 15.250000 ( 15.255146)
|
4
|
+
------------------------------------------ total: 27.960000sec
|
5
5
|
|
6
6
|
user system total real
|
7
|
-
jaro_winkler 12.
|
8
|
-
fuzzystringmatch
|
7
|
+
jaro_winkler 12.290000 0.020000 12.310000 ( 12.308876)
|
8
|
+
fuzzystringmatch 15.460000 0.020000 15.480000 ( 15.493061)
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -21,8 +21,8 @@ static int char_bytes_num(char first_char){
|
|
21
21
|
else return 1;
|
22
22
|
}
|
23
23
|
|
24
|
-
static unsigned long* codepoints(const char *str, int byte_len, int *ret_len){
|
25
|
-
unsigned long *ret = calloc(byte_len, sizeof(long));
|
24
|
+
static unsigned long long* codepoints(const char *str, int byte_len, int *ret_len){
|
25
|
+
unsigned long long *ret = calloc(byte_len, sizeof(long));
|
26
26
|
int count = 0;
|
27
27
|
for(int i = 0; i < byte_len;){
|
28
28
|
int bytes_num = char_bytes_num(str[i]);
|
@@ -40,7 +40,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
|
|
40
40
|
if(!opt){ free_opt_flag = 1; opt = option_new(); }
|
41
41
|
|
42
42
|
int ary_1_len, ary_2_len;
|
43
|
-
unsigned long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
|
43
|
+
unsigned long long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
|
44
44
|
|
45
45
|
if(opt->case_match){
|
46
46
|
for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
|
@@ -49,7 +49,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
|
|
49
49
|
|
50
50
|
// Guarantee the order
|
51
51
|
if(ary_1_len > ary_2_len){
|
52
|
-
unsigned long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
|
52
|
+
unsigned long long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
|
53
53
|
int tmp2 = ary_1_len; ary_1_len = ary_2_len; ary_2_len = tmp2;
|
54
54
|
}
|
55
55
|
int window_size = ary_2_len / 2 - 1;
|
@@ -63,17 +63,14 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
|
|
63
63
|
int right = i + window_size;
|
64
64
|
if(left < 0) left = 0;
|
65
65
|
if(right > max_index) right = max_index;
|
66
|
-
char matched
|
67
|
-
char found = 0;
|
66
|
+
char matched = 0, found = 0;
|
68
67
|
for(int j = left; j <= right; j++){
|
69
68
|
if(ary_1[i] == ary_2[j]){
|
70
69
|
matched = 1;
|
71
|
-
if(!found){
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
}
|
76
|
-
} // if(!found){
|
70
|
+
if(!found && j > previous_index){
|
71
|
+
previous_index = j;
|
72
|
+
found = 1;
|
73
|
+
}
|
77
74
|
} // if(ary_1[i] == ary_2[j]){
|
78
75
|
} // for(int j = left; j <= right; j++){
|
79
76
|
if(matched){
|
data/lib/jaro_winkler/version.rb
CHANGED
data/lib/jaro_winkler.rb
CHANGED
@@ -26,11 +26,9 @@ module JaroWinkler
|
|
26
26
|
if c1 == c2
|
27
27
|
matched = true
|
28
28
|
s2_index = left + j
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
found = true
|
33
|
-
end
|
29
|
+
if !found && s2_index > previous_index
|
30
|
+
previous_index = s2_index
|
31
|
+
found = true
|
34
32
|
end
|
35
33
|
end
|
36
34
|
end
|