jaro_winkler 1.2.8 → 1.3.0.beta
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +34 -24
- data/Rakefile +9 -4
- data/ext/jaro_winkler/distance.c +119 -51
- data/ext/jaro_winkler/distance.h +4 -4
- data/ext/jaro_winkler/jaro_winkler.c +9 -8
- data/lib/jaro_winkler.rb +13 -3
- data/lib/jaro_winkler/adjusting_table.rb +10 -0
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +62 -40
- metadata +5 -5
- data/ext/jaro_winkler/strcmp95.c +0 -199
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eb7b3cd6c1b5f5a137879bd1e3bd8237a85fa081
|
4
|
+
data.tar.gz: 0d9d7fa4893a23b36cb04de1bf5a4c37f739e400
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb8b721fa6c615f2ac972be2469342a3e4f3d53b428be34248340021aedaafeee5f302fc393e132cd4fca4be58d1de40e9ae4b814b4635f37afea559c7ee762b
|
7
|
+
data.tar.gz: 4b5cf2339c8b07f309ae0dec2b7e678b96b17c4ae1f7a62e2a71a6f0e373008af2f45640cc1948961578e0c96f93ec63633fcff2bc9a1e341ec16003a376ef03
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# About
|
2
2
|
|
3
|
-
It's
|
3
|
+
It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby. Both implementations support UTF-8 strings.
|
4
4
|
|
5
5
|
# Installation
|
6
6
|
|
@@ -24,8 +24,6 @@ JaroWinkler.c_distance "MARTHA", "MARHTA" # C extension
|
|
24
24
|
JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
|
25
25
|
```
|
26
26
|
|
27
|
-
**Both implementations support UTF-8 string.**
|
28
|
-
|
29
27
|
## Options
|
30
28
|
|
31
29
|
Name | Type | Default | Note
|
@@ -33,35 +31,47 @@ Name | Type | Default | Note
|
|
33
31
|
ignore_case | boolean | false | All lower case characters are converted to upper case prior to the comparison.
|
34
32
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
35
33
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
34
|
+
adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
|
35
|
+
|
36
|
+
## Default Adjusting Table
|
37
|
+
|
38
|
+
```
|
39
|
+
['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'],
|
40
|
+
['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'], ['S', 'Z'], ['X', 'S'],
|
41
|
+
['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'], ['2', 'Z'], ['5', 'S'], ['8', 'B'],
|
42
|
+
['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'], ['E', ' '], ['Y', ' '], ['S', ' ']
|
43
|
+
```
|
36
44
|
|
37
45
|
# Why This?
|
38
46
|
|
39
|
-
There is also another gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match)
|
47
|
+
There is another similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which also provides both C and Ruby versions.
|
40
48
|
|
41
|
-
I reinvent this wheel because of the naming in `fuzzy-string-match` such as `getDistance` breaks convention, and some weird code like `a1 = s1.split( // )` (`s1.chars` could be better), furthermore, it's bugged (see
|
49
|
+
I reinvent this wheel because of the naming in `fuzzy-string-match` such as `getDistance` breaks convention, and some weird code like `a1 = s1.split( // )` (`s1.chars` could be better), furthermore, it's bugged (see tables below).
|
42
50
|
|
43
51
|
# Compare with other gems
|
44
52
|
|
45
|
-
|
46
|
-
|
47
|
-
UTF-8 Suport
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
53
|
+
| jaro_winkler | fuzzystringmatch | hotwater | amatch
|
54
|
+
--------------- | ------------ | ---------------- | -------- | ------
|
55
|
+
UTF-8 Support | **Yes** | Pure Ruby only | No | No
|
56
|
+
Windows Support | **Yes** | | No | **Yes**
|
57
|
+
Adjusting Table | **Yes** | No | No | No
|
58
|
+
Native | **Yes** | **Yes** | **Yes** | **Yes**
|
59
|
+
Pure Ruby | **Yes** | **Yes** | No | No
|
60
|
+
Speed | Medium | Fast | Medium | Slow
|
61
|
+
Bug Found | **Not Yet** | Yes | **Not Yet** | Yes
|
52
62
|
|
53
63
|
For `Bug Found`, I made a rake task to build the table below, the source code is in `Rakefile`:
|
54
64
|
|
55
|
-
str_1 | str_2 | origin
|
56
|
-
--- | --- | ---
|
57
|
-
"henka" | "henkan" | 0.9667
|
58
|
-
"al" | "al" | 1.0
|
59
|
-
"martha" | "marhta" | 0.9611
|
60
|
-
"jones" | "johnson" | 0.8324
|
61
|
-
"abcvwxyz" | "cabvwxyz" | 0.9583
|
62
|
-
"dwayne" | "duane" | 0.84
|
63
|
-
"dixon" | "dicksonx" | 0.8133
|
64
|
-
"fvie" | "ten" | 0.0
|
65
|
+
str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater | amatch
|
66
|
+
--- | --- | --- | --- | --- | --- | ---
|
67
|
+
"henka" | "henkan" | 0.9667 | 0.9667 | **0.9722** | 0.9667 | **0.9444**
|
68
|
+
"al" | "al" | 1.0 | 1.0 | 1.0 | 1.0 | 1.0
|
69
|
+
"martha" | "marhta" | 0.9611 | 0.9611 | 0.9611 | 0.9611 | **0.9444**
|
70
|
+
"jones" | "johnson" | 0.8324 | 0.8324 | 0.8324 | 0.8324 | **0.7905**
|
71
|
+
"abcvwxyz" | "cabvwxyz" | 0.9583 | 0.9583 | 0.9583 | 0.9583 | 0.9583
|
72
|
+
"dwayne" | "duane" | 0.84 | 0.84 | 0.84 | 0.84 | **0.8222**
|
73
|
+
"dixon" | "dicksonx" | 0.8133 | 0.8133 | 0.8133 | 0.8133 | **0.7667**
|
74
|
+
"fvie" | "ten" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
|
65
75
|
|
66
76
|
- The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
|
67
77
|
- Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
|
@@ -94,5 +104,5 @@ amatch | 0.960000 | 0.010000 | 0.970000 | ( 0.964803)
|
|
94
104
|
|
95
105
|
# Todo
|
96
106
|
|
97
|
-
-
|
98
|
-
-
|
107
|
+
- Custom adjusting word table.
|
108
|
+
- If the adjusting table is ASCII encoded, use a dense matrix instead of a sparse matrix to speed up lookups.
|
data/Rakefile
CHANGED
@@ -5,15 +5,20 @@ Rake::ExtensionTask.new("jaro_winkler") do |ext|
|
|
5
5
|
ext.lib_dir = "lib/jaro_winkler"
|
6
6
|
end
|
7
7
|
|
8
|
-
|
8
|
+
desc 'type can be "native" or "pure"'
|
9
|
+
task :benchmark, :type do |t, args|
|
10
|
+
args.with_defaults(type: :all)
|
9
11
|
ROOT_PATH = File.expand_path('..', __FILE__)
|
10
12
|
LIB_PATH = File.join(ROOT_PATH, 'lib')
|
11
13
|
BENCHMARK_PATH = File.join(ROOT_PATH, 'benchmark')
|
12
|
-
|
14
|
+
|
15
|
+
files = File.join(BENCHMARK_PATH, args[:type] == :all ? '*.rb' : "#{args[:type]}.rb")
|
16
|
+
Dir[files].each do |path|
|
13
17
|
output_path = File.join(BENCHMARK_PATH, File.basename(path, '*.rb').sub('.rb', '.txt'))
|
14
|
-
cmd = "RUBYLIB=#{LIB_PATH} ruby #{path}
|
18
|
+
cmd = "RUBYLIB=#{LIB_PATH} ruby #{path}"
|
15
19
|
puts cmd
|
16
|
-
|
20
|
+
output = `#{cmd}`
|
21
|
+
File.write(output_path, output)
|
17
22
|
end
|
18
23
|
end
|
19
24
|
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -3,93 +3,161 @@
|
|
3
3
|
#include <ctype.h>
|
4
4
|
#include "distance.h"
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
opt->threshold = 0.7;
|
11
|
-
return opt;
|
12
|
-
}
|
6
|
+
typedef struct{
|
7
|
+
unsigned long long code;
|
8
|
+
unsigned int byte_length;
|
9
|
+
} UnicodeHash;
|
13
10
|
|
14
|
-
|
15
|
-
unsigned
|
16
|
-
|
17
|
-
|
18
|
-
else if(c >= 240) return 4; // 11110xxx
|
19
|
-
else if(c >= 224) return 3; // 1110xxxx
|
20
|
-
else if(c >= 192) return 2; // 110xxxxx
|
21
|
-
else return 1;
|
22
|
-
}
|
11
|
+
typedef struct{
|
12
|
+
unsigned long long *ary;
|
13
|
+
int length;
|
14
|
+
} Codepoints;
|
23
15
|
|
24
|
-
|
25
|
-
unsigned long long
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
16
|
+
typedef struct{
|
17
|
+
unsigned long long x, y;
|
18
|
+
} Coord;
|
19
|
+
|
20
|
+
typedef struct{
|
21
|
+
Coord *coords;
|
22
|
+
int length;
|
23
|
+
} Matrix;
|
24
|
+
|
25
|
+
static const char *DEFAULT_ADJ_TABLE[] = {
|
26
|
+
"A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
|
27
|
+
"I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
|
28
|
+
"M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
|
29
|
+
"0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
|
30
|
+
};
|
31
|
+
static Matrix DEFAULT_MATRIX;
|
32
|
+
|
33
|
+
static UnicodeHash unicode_hash_new(const char *str);
|
34
|
+
static Codepoints codepoints_new(const char *str, int byte_len);
|
35
|
+
static Matrix matrix_new(const char **adj_table, int length);
|
36
|
+
static char matrix_find(Matrix matrix, unsigned long long code_1, unsigned long long code_2);
|
37
|
+
|
38
|
+
Option option_new(){
|
39
|
+
Option opt;
|
40
|
+
opt.ignore_case = 0;
|
41
|
+
opt.weight = 0.1;
|
42
|
+
opt.threshold = 0.7;
|
43
|
+
return opt;
|
35
44
|
}
|
36
45
|
|
37
|
-
double c_distance(char *s1, int
|
46
|
+
/*
 * Compute the Jaro-Winkler score of two byte strings.
 *
 * s1, s2           - input strings, decoded as UTF-8 by codepoints_new().
 * s1_byte_len,
 * s2_byte_len      - their lengths in bytes.
 * opt              - ignore_case, adj_table, weight and threshold settings.
 *
 * Returns the Jaro score when it is below opt.threshold; otherwise the Jaro
 * score boosted by opt.weight for each of up to four leading characters the
 * two strings share. Returns 0 when no character matches.
 */
double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
  Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len);
  Codepoints code_ary_2 = codepoints_new(s2, s2_byte_len);

  // Only single-byte codes are case-folded; multi-byte characters are left as is.
  if(opt.ignore_case){
    for(int i = 0; i < code_ary_1.length; ++i) if(code_ary_1.ary[i] < 256 && islower((int)code_ary_1.ary[i])) code_ary_1.ary[i] -= 32;
    for(int i = 0; i < code_ary_2.length; ++i) if(code_ary_2.ary[i] < 256 && islower((int)code_ary_2.ary[i])) code_ary_2.ary[i] -= 32;
  }

  // Guarantee the order: code_ary_1 is always the shorter sequence.
  if(code_ary_1.length > code_ary_2.length){
    unsigned long long *tmp = code_ary_1.ary; code_ary_1.ary = code_ary_2.ary; code_ary_2.ary = tmp;
    int tmp2 = code_ary_1.length; code_ary_1.length = code_ary_2.length; code_ary_2.length = tmp2;
  }

  // Adjusting table: built once, on the first call that asks for it.
  static char first_time = 1;
  if(opt.adj_table && first_time){
    // Count the elements with sizeof(element) instead of assuming 8-byte
    // pointers; a wrong (odd) count makes matrix_new() run past its buffer.
    DEFAULT_MATRIX = matrix_new(DEFAULT_ADJ_TABLE, (int)(sizeof(DEFAULT_ADJ_TABLE) / sizeof(DEFAULT_ADJ_TABLE[0])));
    first_time = 0;
  }

  // Compute jaro distance
  int window_size = code_ary_2.length / 2 - 1;
  if(window_size < 0) window_size = 0;
  double matches = 0.0;
  double sim_matches = 0.0;
  int transpositions = 0;
  int previous_index = -1;
  int max_index = code_ary_2.length - 1;
  for(int i = 0; i < code_ary_1.length; i++){
    int left = i - window_size;
    int right = i + window_size;
    if(left < 0) left = 0;
    if(right > max_index) right = max_index;
    char matched = 0;
    char found = 0;
    char sim_matched = 0;
    for(int j = left; j <= right; j++){
      if(code_ary_1.ary[i] == code_ary_2.ary[j]){
        matched = 1;
        // A match is "in order" only if it lies beyond the previous match.
        if(!found && j > previous_index){
          previous_index = j;
          found = 1;
        }
      }else if(opt.adj_table && matrix_find(DEFAULT_MATRIX, code_ary_1.ary[i], code_ary_2.ary[j])) sim_matched = 1;
    } // for(int j = left; j <= right; j++){
    if(matched){
      matches++;
      if(!found) transpositions++;
    }else if(sim_matched) sim_matches += 3; // partial credit, scaled by 1/10 below
  } // for(int i = 0; i < code_ary_1.length; i++){

  // Don't divide transpositions by 2 since it's been counted directly by above code.
  double similarity = matches;
  if(opt.adj_table) similarity += sim_matches / 10;
  double jaro_distance = matches == 0 ? 0 : (similarity / code_ary_1.length + similarity / code_ary_2.length + (matches - transpositions) / matches) / 3.0;

  // calculate jaro-winkler distance
  double threshold = opt.threshold, weight = opt.weight;
  int prefix = 0;
  int max_length = code_ary_1.length > 4 ? 4 : code_ary_1.length;
  for(int i = 0; i < max_length; ++i){
    if(code_ary_1.ary[i] == code_ary_2.ary[i]) prefix++;
    else break;
  }
  free(code_ary_1.ary); free(code_ary_2.ary);
  return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
}
|
118
|
+
|
119
|
+
static UnicodeHash unicode_hash_new(const char *str){
|
120
|
+
UnicodeHash ret = {};
|
121
|
+
unsigned char first_char = str[0];
|
122
|
+
if(first_char >= 252) ret.byte_length = 6; // 1111110x
|
123
|
+
else if(first_char >= 248) ret.byte_length = 5; // 111110xx
|
124
|
+
else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
|
125
|
+
else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
|
126
|
+
else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
|
127
|
+
else ret.byte_length = 1;
|
128
|
+
memcpy(&ret.code, str, ret.byte_length);
|
129
|
+
return ret;
|
130
|
+
}
|
131
|
+
|
132
|
+
static Codepoints codepoints_new(const char *str, int byte_len){
|
133
|
+
Codepoints ret = {0};
|
134
|
+
ret.ary = calloc(byte_len, sizeof(long long));
|
135
|
+
int count = 0;
|
136
|
+
for(int i = 0; i < byte_len;){
|
137
|
+
UnicodeHash hash = unicode_hash_new(str + i);
|
138
|
+
ret.ary[count] = hash.code;
|
139
|
+
count++;
|
140
|
+
i += hash.byte_length;
|
141
|
+
}
|
142
|
+
ret.length += count;
|
143
|
+
return ret;
|
144
|
+
}
|
145
|
+
|
146
|
+
static Matrix matrix_new(const char **adj_table, int length){
|
147
|
+
Matrix ret;
|
148
|
+
ret.coords = calloc(length, sizeof(Coord));
|
149
|
+
ret.length = length;
|
150
|
+
for(int i = 0; i < length; i += 2){
|
151
|
+
UnicodeHash h1 = unicode_hash_new(adj_table[i]);
|
152
|
+
UnicodeHash h2 = unicode_hash_new(adj_table[i+1]);
|
153
|
+
ret.coords[i].x = ret.coords[i+1].y = h1.code;
|
154
|
+
ret.coords[i].y = ret.coords[i+1].x = h2.code;
|
155
|
+
}
|
156
|
+
return ret;
|
157
|
+
}
|
158
|
+
|
159
|
+
/*
 * Report whether the ordered pair (code_1, code_2) is stored in matrix.
 *
 * Scans the coords linearly and returns 1 on the first entry whose x equals
 * code_1 and whose y equals code_2; returns 0 if no entry matches.
 */
static char matrix_find(Matrix matrix, unsigned long long code_1, unsigned long long code_2){
  int idx = 0;
  while(idx < matrix.length){
    if(matrix.coords[idx].x == code_1 && matrix.coords[idx].y == code_2){
      return 1;
    }
    idx++;
  }
  return 0;
}
|
data/ext/jaro_winkler/distance.h
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
#define DISTANCE_H 1
|
3
3
|
|
4
4
|
typedef struct{
|
5
|
-
char ignore_case;
|
6
5
|
double weight, threshold;
|
6
|
+
char ignore_case, adj_table;
|
7
7
|
} Option;
|
8
8
|
|
9
|
-
double
|
10
|
-
Option
|
9
|
+
double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
|
10
|
+
Option option_new();
|
11
11
|
|
12
|
-
#endif /* DISTANCE_H */
|
12
|
+
#endif /* DISTANCE_H */
|
@@ -11,17 +11,18 @@ void Init_jaro_winkler(void){
|
|
11
11
|
/*
 * Ruby binding: distance(s1, s2, options = {}) -> Float
 *
 * Accepts two strings plus optional keyword options :weight, :threshold,
 * :ignore_case and :adj_table, copies the given options over the defaults
 * from option_new(), and returns the result of c_distance() as a Ruby Float.
 *
 * Raises RuntimeError when the resulting weight exceeds 0.25.
 */
VALUE rb_distance(int argc, VALUE *argv, VALUE self){
  VALUE s1, s2, opt;
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
  Option c_opt = option_new();
  if(TYPE(opt) == T_HASH){
    VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")));
    VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
    VALUE ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case")));
    VALUE adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
    if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
    if(c_opt.weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
    if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
    // A nil value means "not given" and keeps the default; false turns the flag off.
    if(!NIL_P(ignore_case)) c_opt.ignore_case = RTEST(ignore_case) ? 1 : 0;
    if(!NIL_P(adj_table)) c_opt.adj_table = RTEST(adj_table) ? 1 : 0;
  }
  // Coerce both arguments to String before reading pointer or length. C does
  // not define the evaluation order of function arguments, so the length
  // could otherwise be read from an object that has not been coerced yet.
  StringValue(s1);
  StringValue(s2);
  VALUE ret = rb_float_new(c_distance(RSTRING_PTR(s1), (int)RSTRING_LEN(s1), RSTRING_PTR(s2), (int)RSTRING_LEN(s2), c_opt));
  return ret;
}
|
data/lib/jaro_winkler.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
require 'jaro_winkler/fallback'
|
2
|
+
require 'jaro_winkler/adjusting_table'
|
2
3
|
require 'jaro_winkler/jaro_winkler.so' unless JaroWinkler.fallback?
|
3
4
|
module JaroWinkler
|
4
5
|
module_function
|
5
|
-
def jaro_distance s1, s2
|
6
|
+
def jaro_distance s1, s2, options = {}
|
7
|
+
options = {adj_table: false}.merge options
|
6
8
|
length1, length2 = s1.length, s2.length
|
7
9
|
# Guarantee the length order
|
8
10
|
if s1.length > s2.length
|
@@ -12,6 +14,7 @@ module JaroWinkler
|
|
12
14
|
window_size = (length2 / 2) - 1
|
13
15
|
window_size = 0 if window_size < 0
|
14
16
|
matches = 0.0
|
17
|
+
sim_matches = 0.0
|
15
18
|
transpositions = 0
|
16
19
|
previous_index = -1
|
17
20
|
max_index = length2 - 1
|
@@ -21,6 +24,7 @@ module JaroWinkler
|
|
21
24
|
left = 0 if left < 0
|
22
25
|
right = max_index if right > max_index
|
23
26
|
matched = false
|
27
|
+
sim_matched = false
|
24
28
|
found = false
|
25
29
|
s2[left..right].chars.each_with_index do |c2, j|
|
26
30
|
if c1 == c2
|
@@ -30,15 +34,21 @@ module JaroWinkler
|
|
30
34
|
previous_index = s2_index
|
31
35
|
found = true
|
32
36
|
end
|
37
|
+
elsif options[:adj_table] && DEFAULT_ADJ_TABLE[c1][c2]
|
38
|
+
sim_matched = true
|
33
39
|
end
|
34
40
|
end
|
35
41
|
if matched
|
36
42
|
matches += 1
|
37
43
|
transpositions += 1 unless found
|
44
|
+
elsif sim_matched # not matched but similarly matched
|
45
|
+
sim_matches += 3
|
38
46
|
end
|
39
47
|
end
|
40
48
|
# Don't divide transpositions by 2 since it's been counted directly by above code.
|
41
|
-
|
49
|
+
similarity = matches
|
50
|
+
similarity += sim_matches / 10 if options[:adj_table]
|
51
|
+
matches == 0 ? 0 : (similarity / length1 + similarity / length2 + (matches - transpositions) / matches) / 3.0
|
42
52
|
end
|
43
53
|
|
44
54
|
def r_distance s1, s2, options = {}
|
@@ -46,7 +56,7 @@ module JaroWinkler
|
|
46
56
|
weight, threshold, ignore_case = options[:weight], options[:threshold], options[:ignore_case]
|
47
57
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
48
58
|
s1, s2 = s1.upcase, s2.upcase if ignore_case
|
49
|
-
distance = jaro_distance(s1, s2)
|
59
|
+
distance = jaro_distance(s1, s2, options)
|
50
60
|
prefix = 0
|
51
61
|
max_length = [4, s1.length, s2.length].min
|
52
62
|
s1[0, max_length].chars.each_with_index do |c1, i|
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module JaroWinkler
  # Symmetric lookup of character pairs that earn partial credit when the
  # :adj_table option is enabled: DEFAULT_ADJ_TABLE[a][b] is true exactly when
  # [a, b] or [b, a] is listed below.
  #
  # The default block gives every character its own inner hash while the
  # table is being built. A plain Hash.new({}) would share a single inner
  # hash between all characters, making unrelated characters look similar.
  DEFAULT_ADJ_TABLE = Hash.new { |table, char| table[char] = {} }
  [
    ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'],
    ['I', 'U'], ['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
    ['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'],
    ['2', 'Z'], ['5', 'S'], ['8', 'B'], ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
    ['E', ' '], ['Y', ' '], ['S', ' ']
  ].each{ |s1, s2| DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true }
  # Once built, switch to a frozen empty default so that looking up a
  # character outside the table returns nil for every partner and does not
  # insert new keys.
  DEFAULT_ADJ_TABLE.default = {}.freeze
end
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -1,62 +1,84 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
require 'jaro_winkler'
|
3
3
|
include JaroWinkler
|
4
|
-
describe JaroWinkler do
|
5
|
-
before(:all) do
|
6
|
-
@ary = [
|
7
|
-
['henka', 'henkan', 0.9667],
|
8
|
-
['al', 'al', 1.0],
|
9
|
-
['martha', 'marhta', 0.9611],
|
10
|
-
['jones', 'johnson', 0.8323],
|
11
|
-
['abcvwxyz', 'cabvwxyz', 0.9583],
|
12
|
-
['dwayne', 'duane', 0.8400],
|
13
|
-
['dixon', 'dicksonx', 0.8133],
|
14
|
-
['fvie', 'ten', 0.0],
|
15
|
-
['tony', 'tony', 1.0],
|
16
|
-
['tonytonyjan', 'tonytonyjan', 1.0],
|
17
|
-
['x', 'x', 1.0],
|
18
|
-
['', '', 0.0],
|
19
|
-
['tony', '', 0.0],
|
20
|
-
['', 'tony', 0.0],
|
21
|
-
['tonytonyjan', 'tony', 0.8727],
|
22
|
-
['tony', 'tonytonyjan', 0.8727]
|
23
|
-
]
|
24
|
-
end
|
25
4
|
|
5
|
+
shared_examples 'common' do |strategy|
|
26
6
|
it 'works' do
|
27
|
-
|
28
|
-
|
29
|
-
|
7
|
+
ary = [
|
8
|
+
['henka' , 'henkan' , 0.9667] ,
|
9
|
+
['al' , 'al' , 1.0] ,
|
10
|
+
['martha' , 'marhta' , 0.9611] ,
|
11
|
+
['jones' , 'johnson' , 0.8323] ,
|
12
|
+
['abcvwxyz' , 'cabvwxyz' , 0.9583] ,
|
13
|
+
['dwayne' , 'duane' , 0.8400] ,
|
14
|
+
['dixon' , 'dicksonx' , 0.8133] ,
|
15
|
+
['fvie' , 'ten' , 0.0] ,
|
16
|
+
['tony' , 'tony' , 1.0] ,
|
17
|
+
['tonytonyjan' , 'tonytonyjan' , 1.0] ,
|
18
|
+
['x' , 'x' , 1.0] ,
|
19
|
+
['' , '' , 0.0] ,
|
20
|
+
['tony' , '' , 0.0] ,
|
21
|
+
['' , 'tony' , 0.0] ,
|
22
|
+
['tonytonyjan' , 'tony' , 0.8727] ,
|
23
|
+
['tony' , 'tonytonyjan' , 0.8727]
|
24
|
+
]
|
25
|
+
ary.each do |s1, s2, ans|
|
26
|
+
expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
|
30
27
|
end
|
31
28
|
end
|
32
29
|
|
33
30
|
it 'works with UTF-8' do
|
34
|
-
|
35
|
-
|
36
|
-
|
31
|
+
ary = [
|
32
|
+
['變形金剛4:絕跡重生' , '變形金剛4: 絕跡重生' , 0.9818] ,
|
33
|
+
['連勝文' , '連勝丼' , 0.8222] ,
|
34
|
+
['馬英九' , '馬英丸' , 0.8222] ,
|
35
|
+
['良い' , 'いい' , 0.6666] ,
|
36
|
+
]
|
37
|
+
ary.each do |s1, s2, ans|
|
38
|
+
expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
|
39
|
+
end
|
37
40
|
end
|
38
41
|
|
39
42
|
it 'sets ignore_case' do
|
40
|
-
|
41
|
-
expect(r_distance(*params)).to be_within(0.0001).of(0.9611)
|
42
|
-
expect(c_distance(*params)).to be_within(0.0001).of(0.9611)
|
43
|
+
expect(send(strategy, 'MARTHA', 'marhta', ignore_case: true)).to be_within(0.0001).of(0.9611)
|
43
44
|
end
|
44
45
|
|
45
46
|
it 'sets weight' do
|
46
|
-
|
47
|
-
expect(r_distance(*params)).to be_within(0.0001).of(0.9778)
|
48
|
-
expect(c_distance(*params)).to be_within(0.0001).of(0.9778)
|
47
|
+
expect(send(strategy, 'MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
|
49
48
|
end
|
50
49
|
|
51
50
|
it 'sets threshold' do
|
52
|
-
|
53
|
-
expect(r_distance(*params)).to be_within(0.0001).of(0.9445)
|
54
|
-
expect(c_distance(*params)).to be_within(0.0001).of(0.9445)
|
51
|
+
expect(send(strategy, 'MARTHA', 'MARHTA', threshold: 0.99)).to be_within(0.0001).of(0.9445)
|
55
52
|
end
|
56
53
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
54
|
+
|
55
|
+
it 'works with adjusting table' do
|
56
|
+
ary = [
|
57
|
+
['HENKA' , 'HENKAN' , 0.9667] , # m=5, t=0, s=0
|
58
|
+
['AL' , 'AL' , 1.0 ], # m=2, t=0, s=0
|
59
|
+
['MARTHA' , 'MARHTA' , 0.9611] , # m=6, t=1, s=0
|
60
|
+
['JONES' , 'JOHNSON' , 0.8598] , # m=4, t=0, s=3
|
61
|
+
['ABCVWXYZ' , 'CABVWXYZ' , 0.9583] , # m=8, t=1, s=0
|
62
|
+
['DWAYNE' , 'DUANE' , 0.8730] , # m=4, t=0, s=3
|
63
|
+
['DIXON' , 'DICKSONX' , 0.8393] , # m=4, t=0, s=3
|
64
|
+
['FVIE' , 'TEN' , 0.0 ]
|
65
|
+
]
|
66
|
+
ary.each do |s1, s2, ans|
|
67
|
+
expect(send(strategy, s1, s2, adj_table: true)).to be_within(0.0001).of(ans)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
context 'with weight exceeding 0.25' do
|
72
|
+
it 'throws exception' do
|
73
|
+
expect{ send(strategy, 'MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
|
74
|
+
end
|
61
75
|
end
|
76
|
+
end
|
77
|
+
|
78
|
+
describe 'Pure Ruby' do
|
79
|
+
include_examples 'common', :r_distance
|
80
|
+
end
|
81
|
+
|
82
|
+
describe 'C extention' do
|
83
|
+
include_examples 'common', :c_distance
|
62
84
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.0.beta
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -77,9 +77,9 @@ files:
|
|
77
77
|
- ext/jaro_winkler/extconf.rb
|
78
78
|
- ext/jaro_winkler/jaro_winkler.c
|
79
79
|
- ext/jaro_winkler/jaro_winkler.h
|
80
|
-
- ext/jaro_winkler/strcmp95.c
|
81
80
|
- jaro_winkler.gemspec
|
82
81
|
- lib/jaro_winkler.rb
|
82
|
+
- lib/jaro_winkler/adjusting_table.rb
|
83
83
|
- lib/jaro_winkler/fallback.rb
|
84
84
|
- lib/jaro_winkler/version.rb
|
85
85
|
- spec/jaro_winkler_spec.rb
|
@@ -99,9 +99,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
99
99
|
version: '0'
|
100
100
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
101
|
requirements:
|
102
|
-
- - "
|
102
|
+
- - ">"
|
103
103
|
- !ruby/object:Gem::Version
|
104
|
-
version:
|
104
|
+
version: 1.3.1
|
105
105
|
requirements: []
|
106
106
|
rubyforge_project:
|
107
107
|
rubygems_version: 2.4.1
|
data/ext/jaro_winkler/strcmp95.c
DELETED
@@ -1,199 +0,0 @@
|
|
1
|
-
|
2
|
-
/* strcmp95.c Version 2 */
|
3
|
-
|
4
|
-
/* The strcmp95 function returns a double precision value from 0.0 (total
|
5
|
-
disagreement) to 1.0 (character-by-character agreement). The returned
|
6
|
-
value is a measure of the similarity of the two strings. */
|
7
|
-
|
8
|
-
/* Date of Release: Jan. 26, 1994 */
|
9
|
-
/* Modified: April 24, 1994 Corrected the processing of the single length
|
10
|
-
character strings.
|
11
|
-
Authors: This function was written using the logic from code written by
|
12
|
-
Bill Winkler, George McLaughlin and Matt Jaro with modifications
|
13
|
-
by Maureen Lynch.
|
14
|
-
Comment: This is the official string comparator to be used for matching
|
15
|
-
during the 1995 Test Census. */
|
16
|
-
|
17
|
-
#include <ctype.h>
|
18
|
-
#include <string.h>
|
19
|
-
|
20
|
-
#define NOTNUM(c) ((c>57) || (c<48))
|
21
|
-
#define INRANGE(c) ((c>0) && (c<91))
|
22
|
-
#define MAX_VAR_SIZE 61
|
23
|
-
#define NULL60 " "
|
24
|
-
|
25
|
-
double strcmp95(char *ying, char *yang, long y_length, int *ind_c)
|
26
|
-
|
27
|
-
{
|
28
|
-
/* Arguments:
|
29
|
-
|
30
|
-
ying and yang are pointers to the 2 strings to be compared. The strings
|
31
|
-
need not be NUL-terminated strings because the length is passed.
|
32
|
-
|
33
|
-
y_length is the length of the strings.
|
34
|
-
|
35
|
-
ind_c is an array that is used to define whether certain options should be
|
36
|
-
activated. A nonzero value indicates the option is deactivated.
|
37
|
-
The options are:
|
38
|
-
ind_c[0] Increase the probability of a match when the number of matched
|
39
|
-
characters is large. This option allows for a little more
|
40
|
-
tolerance when the strings are large. It is not an appropriate
|
41
|
-
test when comparing fixed length fields such as phone and
|
42
|
-
social security numbers.
|
43
|
-
ind_c[1] All lower case characters are converted to upper case prior
|
44
|
-
to the comparison. Disabling this feature means that the lower
|
45
|
-
case string "code" will not be recognized as the same as the
|
46
|
-
upper case string "CODE". Also, the adjustment for similar
|
47
|
-
characters section only applies to uppercase characters.
|
48
|
-
|
49
|
-
The suggested values are all zeros for character strings such as names. */
|
50
|
-
|
51
|
-
static int pass=1/* TODO: make it an option parameter */, adjwt[91][91];
|
52
|
-
static char sp[39][2] =
|
53
|
-
{'A','E', 'A','I', 'A','O', 'A','U', 'B','V', 'E','I', 'E','O', 'E','U',
|
54
|
-
'I','O', 'I','U', 'O','U', 'I','Y', 'E','Y', 'C','G', 'E','F',
|
55
|
-
'W','U', 'W','V', 'X','K', 'S','Z', 'X','S', 'Q','C', 'U','V',
|
56
|
-
'M','N', 'L','I', 'Q','O', 'P','R', 'I','J', '2','Z', '5','S',
|
57
|
-
'8','B', '1','I', '1','L', '0','O', '0','Q', 'C','K', 'G','J',
|
58
|
-
'E',' ', 'Y',' ', 'S',' '};
|
59
|
-
|
60
|
-
char ying_hold[MAX_VAR_SIZE],
|
61
|
-
yang_hold[MAX_VAR_SIZE],
|
62
|
-
ying_flag[MAX_VAR_SIZE],
|
63
|
-
yang_flag[MAX_VAR_SIZE];
|
64
|
-
|
65
|
-
double weight, Num_sim;
|
66
|
-
|
67
|
-
long minv, search_range, lowlim, ying_length,
|
68
|
-
hilim, N_trans, Num_com, yang_length;
|
69
|
-
|
70
|
-
int yl1, yi_st, N_simi;
|
71
|
-
|
72
|
-
register int i, j, k;
|
73
|
-
|
74
|
-
/* Initialize the adjwt array on the first call to the function only.
|
75
|
-
The adjwt array is used to give partial credit for characters that
|
76
|
-
may be errors due to known phonetic or character recognition errors.
|
77
|
-
A typical example is to match the letter "O" with the number "0" */
|
78
|
-
if (!pass) {
|
79
|
-
pass++;
|
80
|
-
for (i=0; i<91; i++) for (j=0; j<91; j++) adjwt[i][j] = 0;
|
81
|
-
for (i=0; i<36; i++) {
|
82
|
-
adjwt[sp[i][0]][sp[i][1]] = 3;
|
83
|
-
adjwt[sp[i][1]][sp[i][0]] = 3;
|
84
|
-
} }
|
85
|
-
|
86
|
-
/* If either string is blank - return - added in Version 2 */
|
87
|
-
if (!strncmp(ying,NULL60,y_length)) return(0.0);
|
88
|
-
if (!strncmp(yang,NULL60,y_length)) return(0.0);
|
89
|
-
|
90
|
-
/* Identify the strings to be compared by stripping off all leading and
|
91
|
-
trailing spaces. */
|
92
|
-
k = y_length - 1;
|
93
|
-
for(j = 0;((ying[j]==' ') && (j < k));j++);
|
94
|
-
for(i = k;((ying[i]==' ') && (i > 0));i--);
|
95
|
-
ying_length = i + 1 - j;
|
96
|
-
yi_st = j;
|
97
|
-
|
98
|
-
for(j = 0;((yang[j]==' ') && (j < k));j++);
|
99
|
-
for(i = k;((yang[i]==' ') && (i > 0));i--);
|
100
|
-
yang_length = i + 1 - j;
|
101
|
-
|
102
|
-
ying_hold[0]=yang_hold[0]=0;
|
103
|
-
strncat(ying_hold,&ying[yi_st],ying_length);
|
104
|
-
strncat(yang_hold,&yang[j],yang_length);
|
105
|
-
|
106
|
-
if (ying_length > yang_length) {
|
107
|
-
search_range = ying_length;
|
108
|
-
minv = yang_length;
|
109
|
-
}
|
110
|
-
else {
|
111
|
-
search_range = yang_length;
|
112
|
-
minv = ying_length;
|
113
|
-
}
|
114
|
-
|
115
|
-
/* If either string is blank - return */
|
116
|
-
/* if (!minv) return(0.0); removed in version 2 */
|
117
|
-
|
118
|
-
/* Blank out the flags */
|
119
|
-
ying_flag[0] = yang_flag[0] = 0;
|
120
|
-
strncat(ying_flag,NULL60,search_range);
|
121
|
-
strncat(yang_flag,NULL60,search_range);
|
122
|
-
search_range = (search_range/2) - 1;
|
123
|
-
if (search_range < 0) search_range = 0; /* added in version 2 */
|
124
|
-
|
125
|
-
/* Convert all lower case characters to upper case. */
|
126
|
-
if (!ind_c[1]) {
|
127
|
-
for (i = 0;i < ying_length;i++) if (islower(ying_hold[i])) ying_hold[i] -= 32;
|
128
|
-
for (j = 0;j < yang_length;j++) if (islower(yang_hold[j])) yang_hold[j] -= 32;
|
129
|
-
}
|
130
|
-
|
131
|
-
/* Looking only within the search range, count and flag the matched pairs. */
|
132
|
-
Num_com = 0;
|
133
|
-
yl1 = yang_length - 1;
|
134
|
-
for (i = 0;i < ying_length;i++) {
|
135
|
-
lowlim = (i >= search_range) ? i - search_range : 0;
|
136
|
-
hilim = ((i + search_range) <= yl1) ? (i + search_range) : yl1;
|
137
|
-
for (j = lowlim;j <= hilim;j++) {
|
138
|
-
if ((yang_flag[j] != '1') && (yang_hold[j] == ying_hold[i])) {
|
139
|
-
yang_flag[j] = '1';
|
140
|
-
ying_flag[i] = '1';
|
141
|
-
Num_com++;
|
142
|
-
break;
|
143
|
-
} } }
|
144
|
-
|
145
|
-
/* If no characters in common - return */
|
146
|
-
if (!Num_com) return(0.0);
|
147
|
-
|
148
|
-
/* Count the number of transpositions */
|
149
|
-
k = N_trans = 0;
|
150
|
-
for (i = 0;i < ying_length;i++) {
|
151
|
-
if (ying_flag[i] == '1') {
|
152
|
-
for (j = k;j < yang_length;j++) {
|
153
|
-
if (yang_flag[j] == '1') {
|
154
|
-
k = j + 1;
|
155
|
-
break;
|
156
|
-
} }
|
157
|
-
if (ying_hold[i] != yang_hold[j]) N_trans++;
|
158
|
-
} }
|
159
|
-
N_trans = N_trans / 2;
|
160
|
-
|
161
|
-
/* adjust for similarities in nonmatched characters */
|
162
|
-
N_simi = 0;
|
163
|
-
if (minv > Num_com) {
|
164
|
-
for (i = 0;i < ying_length;i++) {
|
165
|
-
if (ying_flag[i] == ' ' && INRANGE(ying_hold[i])) {
|
166
|
-
for (j = 0;j < yang_length;j++) {
|
167
|
-
if (yang_flag[j] == ' ' && INRANGE(yang_hold[j])) {
|
168
|
-
if (adjwt[ying_hold[i]][yang_hold[j]] > 0) {
|
169
|
-
N_simi += adjwt[ying_hold[i]][yang_hold[j]];
|
170
|
-
yang_flag[j] = '2';
|
171
|
-
break;
|
172
|
-
} } } } } }
|
173
|
-
Num_sim = ((double) N_simi)/10.0 + Num_com;
|
174
|
-
|
175
|
-
/* Main weight computation. */
|
176
|
-
weight= Num_sim / ((double) ying_length) + Num_sim / ((double) yang_length)
|
177
|
-
+ ((double) (Num_com - N_trans)) / ((double) Num_com);
|
178
|
-
weight = weight / 3.0;
|
179
|
-
|
180
|
-
/* Continue to boost the weight if the strings are similar */
|
181
|
-
if (weight > 0.7) {
|
182
|
-
|
183
|
-
/* Adjust for having up to the first 4 characters in common */
|
184
|
-
j = (minv >= 4) ? 4 : minv;
|
185
|
-
for (i=0;((i<j)&&(ying_hold[i]==yang_hold[i])&&(NOTNUM(ying_hold[i])));i++);
|
186
|
-
if (i) weight += i * 0.1 * (1.0 - weight);
|
187
|
-
|
188
|
-
/* Optionally adjust for long strings. */
|
189
|
-
/* After agreeing beginning chars, at least two more must agree and
|
190
|
-
the agreeing characters must be > .5 of remaining characters. */
|
191
|
-
if ((!ind_c[0]) && (minv>4) && (Num_com>i+1) && (2*Num_com>=minv+i))
|
192
|
-
if (NOTNUM(ying_hold[0]))
|
193
|
-
weight += (double) (1.0-weight) *
|
194
|
-
((double) (Num_com-i-1) / ((double) (ying_length+yang_length-i*2+2)));
|
195
|
-
}
|
196
|
-
|
197
|
-
return(weight);
|
198
|
-
|
199
|
-
} /* strcmp95 */
|