jaro_winkler 1.3.2.beta → 1.3.2.beta2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +15 -4
- data/Rakefile +1 -1
- data/benchmark/native.txt +9 -9
- data/ext/jaro_winkler/adj_matrix.c +25 -2
- data/ext/jaro_winkler/adj_matrix.h +5 -4
- data/ext/jaro_winkler/codepoints.c +29 -0
- data/ext/jaro_winkler/codepoints.h +17 -0
- data/ext/jaro_winkler/distance.c +12 -77
- data/ext/jaro_winkler/distance.h +1 -1
- data/ext/jaro_winkler/extconf.rb +1 -1
- data/ext/jaro_winkler/jaro_winkler.c +5 -5
- data/ext/jaro_winkler/murmur_hash2.c +64 -64
- data/jaro_winkler.gemspec +4 -0
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +17 -16
- metadata +61 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e3750b9af1a515ee78a01b77192fd9eb2697f56
|
4
|
+
data.tar.gz: c1473c5a327eda3fc5973935dca50f584399d5d0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 615204f1ab3906d01e44d92b685751004887c339bee5fc35633c26f928b2d7e3d07159085b7122683eba21b3c56cabfdbf28687a7d95aa3815445693a9a9979a
|
7
|
+
data.tar.gz: 63601a6358c4a600c3ef258f465b38944ce346e48720613342da7b258663fefc1d51df984e2bab97b245c6e0ef0ecfcaf76ff3ad5b3b0281a9bd748b4f7d3951
|
data/README.md
CHANGED
@@ -33,7 +33,9 @@ weight | number | 0.1 | A constant scaling factor for how much the sco
|
|
33
33
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
34
34
|
adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
|
35
35
|
|
36
|
-
|
36
|
+
# About Adjusting Table
|
37
|
+
|
38
|
+
## Default Table
|
37
39
|
|
38
40
|
```
|
39
41
|
['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'],
|
@@ -42,9 +44,9 @@ adj_table | boolean | false | The option is used to give partial credit for
|
|
42
44
|
['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'], ['E', ' '], ['Y', ' '], ['S', ' ']
|
43
45
|
```
|
44
46
|
|
45
|
-
## How
|
47
|
+
## How It Works
|
46
48
|
|
47
|
-
|
49
|
+
Original Formula:
|
48
50
|
|
49
51
|
![origin](https://chart.googleapis.com/chart?cht=tx&chs&chl=%5Cbegin%7Bcases%7D0%26%7B%5Ctext%7Bif%20%7Dm%3D0%7D%5C%5C%5Cfrac%7B1%7D%7B3%7D(%5Cfrac%7Bm%7D%7B%5Cleft%7Cs1%5Cright%7C%7D%2B%5Cfrac%7Bm%7D%7B%5Cleft%7Cs2%5Cright%7C%7D%2B%5Cfrac%7Bm-t%7D%7Bm%7D)%26%5Ctext%7Bothers%7D%5Cend%7Bcases%7D)
|
50
52
|
|
@@ -53,7 +55,7 @@ where
|
|
53
55
|
- `m` is the number of matching characters.
|
54
56
|
- `t` is half the number of transpositions.
|
55
57
|
|
56
|
-
|
58
|
+
With Adjusting Table:
|
57
59
|
|
58
60
|
![adj](https://chart.googleapis.com/chart?cht=tx&chs&chl=%5Cbegin%7Bcases%7D0%26%5Ctext%7Bif%20%7Dm%3D0%5C%5C%5Cfrac%7B1%7D%7B3%7D(%5Cfrac%7B%5Cfrac%7Bs%7D%7B10%7D%2Bm%7D%7B%5Cleft%7Cs1%5Cright%7C%7D%2B%5Cfrac%7B%5Cfrac%7Bs%7D%7B10%7D%2Bm%7D%7B%5Cleft%7Cs2%5Cright%7C%7D%2B%5Cfrac%7Bm-t%7D%7Bm%7D)%26%5Ctext%7Bothers%7D%5Cend%7Bcases%7D)
|
59
61
|
|
@@ -61,6 +63,15 @@ where
|
|
61
63
|
|
62
64
|
- `s` is the number of nonmatching but similar characters.
|
63
65
|
|
66
|
+
## Difference Between v1.3.1 And v1.3.2.beta
|
67
|
+
|
68
|
+
Version | Algorithm
|
69
|
+
----------- | -----------------------------------------------------------------------
|
70
|
+
v1.3.1 | One linked list to store sparse matrix and iterate to find similar character.
|
71
|
+
v1.3.2.beta | One hash table with multiple linked lists for collision handling.
|
72
|
+
|
73
|
+
In theory, the latter should be more efficient than the former (more test data is needed to confirm this).
|
74
|
+
|
64
75
|
# Why This?
|
65
76
|
|
66
77
|
There is also a similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which provides both C and Ruby versions as well.
|
data/Rakefile
CHANGED
@@ -27,7 +27,7 @@ task :compare do
|
|
27
27
|
require 'fuzzystringmatch'
|
28
28
|
require 'hotwater'
|
29
29
|
require 'amatch'
|
30
|
-
@ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
30
|
+
@ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten'], ['San Francisco', 'Santa Monica']]
|
31
31
|
table = []
|
32
32
|
table << %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch]
|
33
33
|
table << %w[--- --- --- --- --- ---]
|
data/benchmark/native.txt
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
Rehearsal ----------------------------------------------------
|
2
|
-
jaro_winkler 0.350000 0.000000 0.350000 ( 0.
|
3
|
-
fuzzystringmatch 0.
|
4
|
-
hotwater 0.
|
5
|
-
amatch
|
6
|
-
------------------------------------------- total:
|
2
|
+
jaro_winkler 0.350000 0.000000 0.350000 ( 0.348383)
|
3
|
+
fuzzystringmatch 0.330000 0.020000 0.350000 ( 0.354850)
|
4
|
+
hotwater 0.280000 0.000000 0.280000 ( 0.278819)
|
5
|
+
amatch 0.980000 0.000000 0.980000 ( 0.983325)
|
6
|
+
------------------------------------------- total: 1.960000sec
|
7
7
|
|
8
8
|
user system total real
|
9
|
-
jaro_winkler 0.
|
10
|
-
fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.
|
11
|
-
hotwater 0.
|
12
|
-
amatch 0.
|
9
|
+
jaro_winkler 0.330000 0.000000 0.330000 ( 0.331923)
|
10
|
+
fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.135655)
|
11
|
+
hotwater 0.280000 0.000000 0.280000 ( 0.276728)
|
12
|
+
amatch 0.930000 0.010000 0.940000 ( 0.932943)
|
@@ -1,8 +1,16 @@
|
|
1
1
|
#include <stdlib.h>
|
2
2
|
#include "adj_matrix.h"
|
3
|
+
#include "codepoints.h"
|
4
|
+
|
5
|
+
const char *DEFAULT_ADJ_TABLE[] = {
|
6
|
+
"A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
|
7
|
+
"I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
|
8
|
+
"M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
|
9
|
+
"0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
|
10
|
+
};
|
3
11
|
|
4
12
|
extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
|
5
|
-
|
13
|
+
inline void node_free(Node *head);
|
6
14
|
|
7
15
|
AdjMatrix* adj_matrix_new(unsigned int length){
|
8
16
|
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
|
@@ -42,7 +50,7 @@ char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long
|
|
42
50
|
}
|
43
51
|
}
|
44
52
|
|
45
|
-
|
53
|
+
inline void node_free(Node *head){
|
46
54
|
if(head == NULL) return;
|
47
55
|
node_free(head->next);
|
48
56
|
free(head);
|
@@ -59,4 +67,19 @@ void adj_matrix_free(AdjMatrix *matrix){
|
|
59
67
|
}
|
60
68
|
free(matrix->table);
|
61
69
|
free(matrix);
|
70
|
+
}
|
71
|
+
|
72
|
+
AdjMatrix* adj_matrix_default(){
|
73
|
+
static char first_time = 1;
|
74
|
+
static AdjMatrix *ret_matrix;
|
75
|
+
if(first_time){
|
76
|
+
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
77
|
+
int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
|
78
|
+
for(int i = 0; i < length; i += 2){
|
79
|
+
UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
|
80
|
+
adj_matrix_add(ret_matrix, h1.code, h2.code);
|
81
|
+
}
|
82
|
+
first_time = 0;
|
83
|
+
}
|
84
|
+
return ret_matrix;
|
62
85
|
}
|
@@ -13,9 +13,10 @@ typedef struct{
|
|
13
13
|
unsigned int length;
|
14
14
|
} AdjMatrix;
|
15
15
|
|
16
|
-
AdjMatrix* adj_matrix_new(unsigned int length);
|
17
|
-
void
|
18
|
-
char
|
19
|
-
void
|
16
|
+
AdjMatrix* adj_matrix_new (unsigned int length);
|
17
|
+
void adj_matrix_add (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
|
18
|
+
char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
|
19
|
+
void adj_matrix_free (AdjMatrix *matrix);
|
20
|
+
AdjMatrix* adj_matrix_default();
|
20
21
|
|
21
22
|
#endif /* ADJ_MATRIX_H */
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include "codepoints.h"
|
4
|
+
|
5
|
+
UnicodeHash unicode_hash_new(const char *str){
|
6
|
+
UnicodeHash ret = {};
|
7
|
+
unsigned char first_char = str[0];
|
8
|
+
if(first_char >= 252) ret.byte_length = 6; // 1111110x
|
9
|
+
else if(first_char >= 248) ret.byte_length = 5; // 111110xx
|
10
|
+
else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
|
11
|
+
else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
|
12
|
+
else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
|
13
|
+
else ret.byte_length = 1;
|
14
|
+
memcpy(&ret.code, str, ret.byte_length);
|
15
|
+
return ret;
|
16
|
+
}
|
17
|
+
|
18
|
+
Codepoints codepoints_new(const char *str, int byte_len){
|
19
|
+
Codepoints ret = {};
|
20
|
+
ret.ary = malloc(byte_len * sizeof(long long));
|
21
|
+
ret.length = 0;
|
22
|
+
for(int i = 0; i < byte_len;){
|
23
|
+
UnicodeHash hash = unicode_hash_new(str + i);
|
24
|
+
ret.ary[ret.length] = hash.code;
|
25
|
+
ret.length++;
|
26
|
+
i += hash.byte_length;
|
27
|
+
}
|
28
|
+
return ret;
|
29
|
+
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef CODEPOINTS_H
|
2
|
+
#define CODEPOINTS_H 1
|
3
|
+
|
4
|
+
typedef struct{
|
5
|
+
unsigned long long code;
|
6
|
+
unsigned int byte_length;
|
7
|
+
} UnicodeHash;
|
8
|
+
|
9
|
+
typedef struct{
|
10
|
+
unsigned long long *ary;
|
11
|
+
int length;
|
12
|
+
} Codepoints;
|
13
|
+
|
14
|
+
UnicodeHash unicode_hash_new(const char *str);
|
15
|
+
Codepoints codepoints_new (const char *str, int byte_len);
|
16
|
+
|
17
|
+
#endif /* CODEPOINTS_H */
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -1,30 +1,9 @@
|
|
1
|
-
#include <string.h>
|
2
1
|
#include <stdlib.h>
|
3
2
|
#include <ctype.h>
|
4
3
|
#include "distance.h"
|
4
|
+
#include "codepoints.h"
|
5
5
|
#include "adj_matrix.h"
|
6
6
|
|
7
|
-
typedef struct{
|
8
|
-
unsigned long long code;
|
9
|
-
unsigned int byte_length;
|
10
|
-
} UnicodeHash;
|
11
|
-
|
12
|
-
typedef struct{
|
13
|
-
unsigned long long *ary;
|
14
|
-
int length;
|
15
|
-
} Codepoints;
|
16
|
-
|
17
|
-
const char *DEFAULT_ADJ_TABLE[] = {
|
18
|
-
"A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
|
19
|
-
"I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
|
20
|
-
"M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
|
21
|
-
"0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
|
22
|
-
};
|
23
|
-
|
24
|
-
static UnicodeHash unicode_hash_new(const char *str);
|
25
|
-
static Codepoints codepoints_new(const char *str, int byte_len);
|
26
|
-
static AdjMatrix* adj_matrix_default();
|
27
|
-
|
28
7
|
Option option_new(){
|
29
8
|
Option opt;
|
30
9
|
opt.ignore_case = opt.adj_table = 0;
|
@@ -33,12 +12,9 @@ Option option_new(){
|
|
33
12
|
return opt;
|
34
13
|
}
|
35
14
|
|
36
|
-
double
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len);
|
41
|
-
Codepoints code_ary_2 = codepoints_new(s2, s2_byte_len);
|
15
|
+
double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
|
16
|
+
Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len),
|
17
|
+
code_ary_2 = codepoints_new(s2, s2_byte_len);
|
42
18
|
|
43
19
|
if(opt.ignore_case){
|
44
20
|
for(int i = 0; i < code_ary_1.length; ++i) if(code_ary_1.ary[i] < 256 && islower(code_ary_1.ary[i])) code_ary_1.ary[i] -= 32;
|
@@ -54,19 +30,19 @@ double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option o
|
|
54
30
|
// Compute jaro distance
|
55
31
|
int window_size = code_ary_2.length / 2 - 1;
|
56
32
|
if(window_size < 0) window_size = 0;
|
57
|
-
double matches = 0.0
|
58
|
-
|
59
|
-
int transpositions = 0
|
60
|
-
|
61
|
-
|
33
|
+
double matches = 0.0,
|
34
|
+
sim_matches = 0.0;
|
35
|
+
int transpositions = 0,
|
36
|
+
previous_index = -1,
|
37
|
+
max_index = code_ary_2.length - 1;
|
62
38
|
for(int i = 0; i < code_ary_1.length; i++){
|
63
39
|
int left = i - window_size;
|
64
40
|
int right = i + window_size;
|
65
41
|
if(left < 0) left = 0;
|
66
42
|
if(right > max_index) right = max_index;
|
67
|
-
char matched = 0
|
68
|
-
|
69
|
-
|
43
|
+
char matched = 0,
|
44
|
+
found = 0,
|
45
|
+
sim_matched = 0;
|
70
46
|
for(int j = left; j <= right; j++){
|
71
47
|
if(code_ary_1.ary[i] == code_ary_2.ary[j]){
|
72
48
|
matched = 1;
|
@@ -97,45 +73,4 @@ double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option o
|
|
97
73
|
}
|
98
74
|
free(code_ary_1.ary); free(code_ary_2.ary);
|
99
75
|
return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
|
100
|
-
}
|
101
|
-
|
102
|
-
static UnicodeHash unicode_hash_new(const char *str){
|
103
|
-
UnicodeHash ret = {};
|
104
|
-
unsigned char first_char = str[0];
|
105
|
-
if(first_char >= 252) ret.byte_length = 6; // 1111110x
|
106
|
-
else if(first_char >= 248) ret.byte_length = 5; // 111110xx
|
107
|
-
else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
|
108
|
-
else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
|
109
|
-
else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
|
110
|
-
else ret.byte_length = 1;
|
111
|
-
memcpy(&ret.code, str, ret.byte_length);
|
112
|
-
return ret;
|
113
|
-
}
|
114
|
-
|
115
|
-
static Codepoints codepoints_new(const char *str, int byte_len){
|
116
|
-
Codepoints ret = {};
|
117
|
-
ret.ary = calloc(byte_len, sizeof(long long));
|
118
|
-
int count = 0;
|
119
|
-
for(int i = 0; i < byte_len;){
|
120
|
-
UnicodeHash hash = unicode_hash_new(str + i);
|
121
|
-
ret.ary[count] = hash.code;
|
122
|
-
count++;
|
123
|
-
i += hash.byte_length;
|
124
|
-
}
|
125
|
-
ret.length += count;
|
126
|
-
return ret;
|
127
|
-
}
|
128
|
-
|
129
|
-
static AdjMatrix* adj_matrix_default(){
|
130
|
-
static char first_time = 1;
|
131
|
-
static AdjMatrix *ret_matrix;
|
132
|
-
if(first_time){
|
133
|
-
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
134
|
-
for(int i = 0; i < 78; i += 2){
|
135
|
-
UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
|
136
|
-
adj_matrix_add(ret_matrix, h1.code, h2.code);
|
137
|
-
}
|
138
|
-
first_time = 0;
|
139
|
-
}
|
140
|
-
return ret_matrix;
|
141
76
|
}
|
data/ext/jaro_winkler/distance.h
CHANGED
@@ -6,7 +6,7 @@ typedef struct{
|
|
6
6
|
char ignore_case, adj_table;
|
7
7
|
} Option;
|
8
8
|
|
9
|
-
double
|
9
|
+
double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
|
10
10
|
Option option_new();
|
11
11
|
|
12
12
|
#endif /* DISTANCE_H */
|
data/ext/jaro_winkler/extconf.rb
CHANGED
@@ -13,15 +13,15 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
|
|
13
13
|
rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
|
14
14
|
Option c_opt = option_new();
|
15
15
|
if(TYPE(opt) == T_HASH){
|
16
|
-
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")))
|
17
|
-
|
18
|
-
|
19
|
-
|
16
|
+
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
|
17
|
+
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
|
18
|
+
ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
|
19
|
+
adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
|
20
20
|
if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
|
21
21
|
if(c_opt.weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
|
22
22
|
if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
|
23
23
|
if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
24
24
|
if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
|
25
25
|
}
|
26
|
-
return rb_float_new(
|
26
|
+
return rb_float_new(distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
|
27
27
|
}
|
@@ -1,64 +1,64 @@
|
|
1
|
-
//-----------------------------------------------------------------------------
|
2
|
-
// MurmurHash2, by Austin Appleby
|
3
|
-
|
4
|
-
// Note - This code makes a few assumptions about how your machine behaves -
|
5
|
-
|
6
|
-
// 1. We can read a 4-byte value from any address without crashing
|
7
|
-
// 2. sizeof(int) == 4
|
8
|
-
|
9
|
-
// And it has a few limitations -
|
10
|
-
|
11
|
-
// 1. It will not work incrementally.
|
12
|
-
// 2. It will not produce the same results on little-endian and big-endian
|
13
|
-
// machines.
|
14
|
-
|
15
|
-
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
|
16
|
-
{
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
}
|
1
|
+
//-----------------------------------------------------------------------------
|
2
|
+
// MurmurHash2, by Austin Appleby
|
3
|
+
|
4
|
+
// Note - This code makes a few assumptions about how your machine behaves -
|
5
|
+
|
6
|
+
// 1. We can read a 4-byte value from any address without crashing
|
7
|
+
// 2. sizeof(int) == 4
|
8
|
+
|
9
|
+
// And it has a few limitations -
|
10
|
+
|
11
|
+
// 1. It will not work incrementally.
|
12
|
+
// 2. It will not produce the same results on little-endian and big-endian
|
13
|
+
// machines.
|
14
|
+
|
15
|
+
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
|
16
|
+
{
|
17
|
+
// 'm' and 'r' are mixing constants generated offline.
|
18
|
+
// They're not really 'magic', they just happen to work well.
|
19
|
+
|
20
|
+
const unsigned int m = 0x5bd1e995;
|
21
|
+
const int r = 24;
|
22
|
+
|
23
|
+
// Initialize the hash to a 'random' value
|
24
|
+
|
25
|
+
unsigned int h = seed ^ len;
|
26
|
+
|
27
|
+
// Mix 4 bytes at a time into the hash
|
28
|
+
|
29
|
+
const unsigned char * data = (const unsigned char *)key;
|
30
|
+
|
31
|
+
while(len >= 4)
|
32
|
+
{
|
33
|
+
unsigned int k = *(unsigned int *)data;
|
34
|
+
|
35
|
+
k *= m;
|
36
|
+
k ^= k >> r;
|
37
|
+
k *= m;
|
38
|
+
|
39
|
+
h *= m;
|
40
|
+
h ^= k;
|
41
|
+
|
42
|
+
data += 4;
|
43
|
+
len -= 4;
|
44
|
+
}
|
45
|
+
|
46
|
+
// Handle the last few bytes of the input array
|
47
|
+
|
48
|
+
switch(len)
|
49
|
+
{
|
50
|
+
case 3: h ^= data[2] << 16;
|
51
|
+
case 2: h ^= data[1] << 8;
|
52
|
+
case 1: h ^= data[0];
|
53
|
+
h *= m;
|
54
|
+
};
|
55
|
+
|
56
|
+
// Do a few final mixes of the hash to ensure the last few
|
57
|
+
// bytes are well-incorporated.
|
58
|
+
|
59
|
+
h ^= h >> 13;
|
60
|
+
h *= m;
|
61
|
+
h ^= h >> 15;
|
62
|
+
|
63
|
+
return h;
|
64
|
+
}
|
data/jaro_winkler.gemspec
CHANGED
@@ -23,4 +23,8 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_development_dependency "bundler", "~> 1.7"
|
24
24
|
spec.add_development_dependency "rake", "~> 10.0"
|
25
25
|
spec.add_development_dependency "rake-compiler"
|
26
|
+
spec.add_development_dependency "rspec"
|
27
|
+
spec.add_development_dependency "fuzzy-string-match"
|
28
|
+
spec.add_development_dependency "hotwater"
|
29
|
+
spec.add_development_dependency "amatch"
|
26
30
|
end
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -5,22 +5,23 @@ include JaroWinkler
|
|
5
5
|
shared_examples 'common' do |strategy|
|
6
6
|
it 'works' do
|
7
7
|
ary = [
|
8
|
-
['henka'
|
9
|
-
['al'
|
10
|
-
['martha'
|
11
|
-
['jones'
|
12
|
-
['abcvwxyz'
|
13
|
-
['dwayne'
|
14
|
-
['dixon'
|
15
|
-
['fvie'
|
16
|
-
['tony'
|
17
|
-
['tonytonyjan'
|
18
|
-
['x'
|
19
|
-
[''
|
20
|
-
['tony'
|
21
|
-
[''
|
22
|
-
['tonytonyjan'
|
23
|
-
['tony'
|
8
|
+
['henka' , 'henkan' , 0.9667] ,
|
9
|
+
['al' , 'al' , 1.0] ,
|
10
|
+
['martha' , 'marhta' , 0.9611] ,
|
11
|
+
['jones' , 'johnson' , 0.8323] ,
|
12
|
+
['abcvwxyz' , 'cabvwxyz' , 0.9583] ,
|
13
|
+
['dwayne' , 'duane' , 0.8400] ,
|
14
|
+
['dixon' , 'dicksonx' , 0.8133] ,
|
15
|
+
['fvie' , 'ten' , 0.0] ,
|
16
|
+
['tony' , 'tony' , 1.0] ,
|
17
|
+
['tonytonyjan' , 'tonytonyjan' , 1.0] ,
|
18
|
+
['x' , 'x' , 1.0] ,
|
19
|
+
['' , '' , 0.0] ,
|
20
|
+
['tony' , '' , 0.0] ,
|
21
|
+
['' , 'tony' , 0.0] ,
|
22
|
+
['tonytonyjan' , 'tony' , 0.8727] ,
|
23
|
+
['tony' , 'tonytonyjan' , 0.8727] ,
|
24
|
+
['San Francisco' , 'Santa Monica' , 0.8180]
|
24
25
|
]
|
25
26
|
ary.each do |s1, s2, ans|
|
26
27
|
expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.2.
|
4
|
+
version: 1.3.2.beta2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,62 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: fuzzy-string-match
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: hotwater
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: amatch
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
55
111
|
description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension
|
56
112
|
and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8
|
57
113
|
string.
|
@@ -74,6 +130,8 @@ files:
|
|
74
130
|
- benchmark/pure.txt
|
75
131
|
- ext/jaro_winkler/adj_matrix.c
|
76
132
|
- ext/jaro_winkler/adj_matrix.h
|
133
|
+
- ext/jaro_winkler/codepoints.c
|
134
|
+
- ext/jaro_winkler/codepoints.h
|
77
135
|
- ext/jaro_winkler/distance.c
|
78
136
|
- ext/jaro_winkler/distance.h
|
79
137
|
- ext/jaro_winkler/extconf.rb
|
@@ -107,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
165
|
version: 1.3.1
|
108
166
|
requirements: []
|
109
167
|
rubyforge_project:
|
110
|
-
rubygems_version: 2.4.
|
168
|
+
rubygems_version: 2.4.2
|
111
169
|
signing_key:
|
112
170
|
specification_version: 4
|
113
171
|
summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
|