jaro_winkler 1.2.4 → 1.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +44 -34
- data/Rakefile +24 -1
- data/benchmark/native.txt +11 -4
- data/benchmark/pure.txt +5 -5
- data/ext/jaro_winkler/distance.c +9 -12
- data/lib/jaro_winkler/version.rb +1 -1
- data/lib/jaro_winkler.rb +3 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 120dffbbfa06b3ce184be2bcd8d151e77d372e7d
|
4
|
+
data.tar.gz: 0b0e0d419aa7065f4a4600ad617e6c68d5d31446
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: da4d765314fe0588475a59081b3a105d15888a3015c40eb6d96f31081fc037d61a6fdda1aadf0d1c79a7619cbb2a462c66e6527533c833889d1d6346c368caa7
|
7
|
+
data.tar.gz: 8a6ac9fb9af738d6e3e8a1a53daa5f8b01f7db133e7447afcc9055665ca7d787828f2f21f1b57408f5b289a1b2d7b5086eb4f2b889036d30d16905e5bd10ae24
|
data/README.md
CHANGED
@@ -42,51 +42,61 @@ threshold | number | 0.7 | The prefix bonus is only added when the compar
|
|
42
42
|
|
43
43
|
There is also another gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match); it uses the same algorithm and provides both C and Ruby implementations.
|
44
44
|
|
45
|
-
I reinvent this wheel because of the naming in `fuzzy-string-match` such as `getDistance` breaks convention, and some weird code like `a1 = s1.split( // )` (`s1.chars` could be better), furthermore, it's bugged
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
45
|
+
I reinvented this wheel because the naming in `fuzzy-string-match`, such as `getDistance`, breaks convention, and it has some weird code like `a1 = s1.split( // )` (`s1.chars` would be better). Furthermore, it's bugged (see table below).
|
46
|
+
|
47
|
+
# Compare with other gems
|
48
|
+
|
49
|
+
| jaro_winkler | fuzzystringmatch | hotwater | amatch
|
50
|
+
------------ | ------------ | ---------------- | -------- | ------
|
51
|
+
UTF-8 Support | Yes | Pure Ruby only | |
|
52
|
+
Native | Yes | Yes | Yes | Yes
|
53
|
+
Pure Ruby | Yes | Yes | |
|
54
|
+
Speed | Medium | Fast | Medium | Low
|
55
|
+
Bug Found | | Yes | | Yes
|
56
|
+
|
57
|
+
For `Bug Found`, I made a rake task to build the table below; the source code is in the `Rakefile`:
|
58
|
+
|
59
|
+
str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater | amatch
|
60
|
+
--- | --- | --- | --- | --- | --- | ---
|
61
|
+
"henka" | "henkan" | 0.9667 | 0.9667 | **0.9722** | 0.9667 | **0.9444**
|
62
|
+
"al" | "al" | 1.0 | 1.0 | 1.0 | 1.0 | 1.0
|
63
|
+
"martha" | "marhta" | 0.9611 | 0.9611 | 0.9611 | 0.9611 | **0.9444**
|
64
|
+
"jones" | "johnson" | 0.8324 | 0.8324 | 0.8324 | 0.8324 | **0.7905**
|
65
|
+
"abcvwxyz" | "cabvwxyz" | 0.9583 | 0.9583 | 0.9583 | 0.9583 | 0.9583
|
66
|
+
"dwayne" | "duane" | 0.84 | 0.84 | 0.84 | 0.84 | **0.8222**
|
67
|
+
"dixon" | "dicksonx" | 0.8133 | 0.8133 | 0.8133 | 0.8133 | **0.7667**
|
68
|
+
"fvie" | "ten" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
|
57
69
|
|
58
70
|
- The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
|
59
71
|
- Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
|
60
72
|
|
61
73
|
## Benchmark
|
62
74
|
|
63
|
-
|
64
|
-
- fuzzy-string-match (0.9.6)
|
65
|
-
|
66
|
-
```ruby
|
67
|
-
require 'benchmark'
|
68
|
-
require 'jaro_winkler'
|
69
|
-
require 'fuzzystringmatch'
|
70
|
-
ary = [['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
71
|
-
|
72
|
-
n = 100000
|
73
|
-
Benchmark.bmbm do |x|
|
74
|
-
x.report 'jaro_winkler' do
|
75
|
-
n.times{ ary.each{ |str1, str2| JaroWinkler.r_distance(str1, str2) } }
|
76
|
-
end
|
77
|
-
|
78
|
-
x.report 'fuzzystringmatch' do
|
79
|
-
jarow = FuzzyStringMatch::JaroWinkler.create(:pure)
|
80
|
-
n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
|
81
|
-
end
|
82
|
-
end
|
83
|
-
```
|
75
|
+
### Pure Ruby
|
84
76
|
|
85
77
|
| user | system | total | real
|
86
|
-
|
78
|
+
---------------- | --------- | -------- | --------- | ------------
|
87
79
|
jaro_winkler | 12.750000 | 0.030000 | 12.780000 | ( 12.782842)
|
88
80
|
fuzzystringmatch | 16.240000 | 0.030000 | 16.270000 | ( 16.287380)
|
89
81
|
|
82
|
+
- jaro_winkler (1.2.3)
|
83
|
+
- fuzzy-string-match (0.9.6)
|
84
|
+
|
85
|
+
### Native
|
86
|
+
|
87
|
+
| user | system | total | real
|
88
|
+
---------------- | -------- | -------- | -------- | ------------
|
89
|
+
jaro_winkler | 0.390000 | 0.000000 | 0.390000 | ( 0.392408)
|
90
|
+
fuzzystringmatch | 0.150000 | 0.000000 | 0.150000 | ( 0.151552)
|
91
|
+
hotwater | 0.320000 | 0.000000 | 0.320000 | ( 0.317740)
|
92
|
+
amatch | 0.960000 | 0.010000 | 0.970000 | ( 0.964803)
|
93
|
+
|
94
|
+
- jaro_winkler (1.2.3)
|
95
|
+
- fuzzy-string-match (0.9.6)
|
96
|
+
- hotwater (0.1.2)
|
97
|
+
- amatch (0.3.0)
|
98
|
+
|
90
99
|
# Todo
|
91
100
|
|
101
|
+
- Make it faster
|
92
102
|
- Adjusting word table (refer to the original C implementation).
|
data/Rakefile
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
-
|
3
2
|
require "rake/extensiontask"
|
4
3
|
|
5
4
|
Rake::ExtensionTask.new("jaro_winkler") do |ext|
|
@@ -16,4 +15,28 @@ task :benchmark do
|
|
16
15
|
puts cmd
|
17
16
|
system(cmd)
|
18
17
|
end
|
18
|
+
end
|
19
|
+
|
20
|
+
task :compare do
|
21
|
+
require 'jaro_winkler'
|
22
|
+
require 'fuzzystringmatch'
|
23
|
+
require 'hotwater'
|
24
|
+
require 'amatch'
|
25
|
+
@ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
26
|
+
table = []
|
27
|
+
table << %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch]
|
28
|
+
table << %w[--- --- --- --- --- ---]
|
29
|
+
jarow = FuzzyStringMatch::JaroWinkler.create(:native)
|
30
|
+
@ary.each do |str_1, str_2|
|
31
|
+
table << ["\"#{str_1}\"", "\"#{str_2}\"", JaroWinkler.distance(str_1, str_2).round(4), jarow.getDistance(str_1, str_2).round(4), Hotwater.jaro_winkler_distance(str_1, str_2).round(4), Amatch::Jaro.new(str_1).match(str_2).round(4)]
|
32
|
+
end
|
33
|
+
col_len = []
|
34
|
+
table.first.length.times{ |i| col_len << table.map{ |row| row[i].to_s.length }.max }
|
35
|
+
table.first.each_with_index{ |title, i| "%-#{col_len[i]}s" % title }
|
36
|
+
table.each_with_index do |row|
|
37
|
+
row.each_with_index do |col, i|
|
38
|
+
row[i] = "%-#{col_len[i]}s" % col.to_s
|
39
|
+
end
|
40
|
+
end
|
41
|
+
table.each{|row| puts row.join(' | ')}
|
19
42
|
end
|
data/benchmark/native.txt
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
Rehearsal ----------------------------------------------------
|
2
|
-
jaro_winkler 0.
|
3
|
-
fuzzystringmatch 0.
|
4
|
-
hotwater 0.
|
5
|
-
amatch
|
2
|
+
jaro_winkler 0.370000 0.000000 0.370000 ( 0.367923)
|
3
|
+
fuzzystringmatch 0.340000 0.030000 0.370000 ( 0.372721)
|
4
|
+
hotwater 0.310000 0.000000 0.310000 ( 0.313405)
|
5
|
+
amatch 0.970000 0.000000 0.970000 ( 0.968318)
|
6
|
+
------------------------------------------- total: 2.020000sec
|
7
|
+
|
8
|
+
user system total real
|
9
|
+
jaro_winkler 0.390000 0.000000 0.390000 ( 0.392408)
|
10
|
+
fuzzystringmatch 0.150000 0.000000 0.150000 ( 0.151552)
|
11
|
+
hotwater 0.320000 0.000000 0.320000 ( 0.317740)
|
12
|
+
amatch 0.960000 0.010000 0.970000 ( 0.964803)
|
data/benchmark/pure.txt
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
Rehearsal ----------------------------------------------------
|
2
|
-
jaro_winkler 12.
|
3
|
-
fuzzystringmatch 15.
|
4
|
-
------------------------------------------ total: 27.
|
2
|
+
jaro_winkler 12.690000 0.020000 12.710000 ( 12.726929)
|
3
|
+
fuzzystringmatch 15.230000 0.020000 15.250000 ( 15.255146)
|
4
|
+
------------------------------------------ total: 27.960000sec
|
5
5
|
|
6
6
|
user system total real
|
7
|
-
jaro_winkler 12.
|
8
|
-
fuzzystringmatch
|
7
|
+
jaro_winkler 12.290000 0.020000 12.310000 ( 12.308876)
|
8
|
+
fuzzystringmatch 15.460000 0.020000 15.480000 ( 15.493061)
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -21,8 +21,8 @@ static int char_bytes_num(char first_char){
|
|
21
21
|
else return 1;
|
22
22
|
}
|
23
23
|
|
24
|
-
static unsigned long* codepoints(const char *str, int byte_len, int *ret_len){
|
25
|
-
unsigned long *ret = calloc(byte_len, sizeof(long));
|
24
|
+
static unsigned long long* codepoints(const char *str, int byte_len, int *ret_len){
|
25
|
+
unsigned long long *ret = calloc(byte_len, sizeof(long));
|
26
26
|
int count = 0;
|
27
27
|
for(int i = 0; i < byte_len;){
|
28
28
|
int bytes_num = char_bytes_num(str[i]);
|
@@ -40,7 +40,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
|
|
40
40
|
if(!opt){ free_opt_flag = 1; opt = option_new(); }
|
41
41
|
|
42
42
|
int ary_1_len, ary_2_len;
|
43
|
-
unsigned long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
|
43
|
+
unsigned long long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
|
44
44
|
|
45
45
|
if(opt->case_match){
|
46
46
|
for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
|
@@ -49,7 +49,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
|
|
49
49
|
|
50
50
|
// Guarantee the order
|
51
51
|
if(ary_1_len > ary_2_len){
|
52
|
-
unsigned long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
|
52
|
+
unsigned long long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
|
53
53
|
int tmp2 = ary_1_len; ary_1_len = ary_2_len; ary_2_len = tmp2;
|
54
54
|
}
|
55
55
|
int window_size = ary_2_len / 2 - 1;
|
@@ -63,17 +63,14 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
|
|
63
63
|
int right = i + window_size;
|
64
64
|
if(left < 0) left = 0;
|
65
65
|
if(right > max_index) right = max_index;
|
66
|
-
char matched
|
67
|
-
char found = 0;
|
66
|
+
char matched = 0, found = 0;
|
68
67
|
for(int j = left; j <= right; j++){
|
69
68
|
if(ary_1[i] == ary_2[j]){
|
70
69
|
matched = 1;
|
71
|
-
if(!found){
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
}
|
76
|
-
} // if(!found){
|
70
|
+
if(!found && j > previous_index){
|
71
|
+
previous_index = j;
|
72
|
+
found = 1;
|
73
|
+
}
|
77
74
|
} // if(ary_1[i] == ary_2[j]){
|
78
75
|
} // for(int j = left; j <= right; j++){
|
79
76
|
if(matched){
|
data/lib/jaro_winkler/version.rb
CHANGED
data/lib/jaro_winkler.rb
CHANGED
@@ -26,11 +26,9 @@ module JaroWinkler
|
|
26
26
|
if c1 == c2
|
27
27
|
matched = true
|
28
28
|
s2_index = left + j
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
found = true
|
33
|
-
end
|
29
|
+
if !found && s2_index > previous_index
|
30
|
+
previous_index = s2_index
|
31
|
+
found = true
|
34
32
|
end
|
35
33
|
end
|
36
34
|
end
|