jaro_winkler 1.3.2.beta → 1.3.2.beta2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -4
- data/Rakefile +1 -1
- data/benchmark/native.txt +9 -9
- data/ext/jaro_winkler/adj_matrix.c +25 -2
- data/ext/jaro_winkler/adj_matrix.h +5 -4
- data/ext/jaro_winkler/codepoints.c +29 -0
- data/ext/jaro_winkler/codepoints.h +17 -0
- data/ext/jaro_winkler/distance.c +12 -77
- data/ext/jaro_winkler/distance.h +1 -1
- data/ext/jaro_winkler/extconf.rb +1 -1
- data/ext/jaro_winkler/jaro_winkler.c +5 -5
- data/ext/jaro_winkler/murmur_hash2.c +64 -64
- data/jaro_winkler.gemspec +4 -0
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +17 -16
- metadata +61 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e3750b9af1a515ee78a01b77192fd9eb2697f56
|
4
|
+
data.tar.gz: c1473c5a327eda3fc5973935dca50f584399d5d0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 615204f1ab3906d01e44d92b685751004887c339bee5fc35633c26f928b2d7e3d07159085b7122683eba21b3c56cabfdbf28687a7d95aa3815445693a9a9979a
|
7
|
+
data.tar.gz: 63601a6358c4a600c3ef258f465b38944ce346e48720613342da7b258663fefc1d51df984e2bab97b245c6e0ef0ecfcaf76ff3ad5b3b0281a9bd748b4f7d3951
|
data/README.md
CHANGED
@@ -33,7 +33,9 @@ weight | number | 0.1 | A constant scaling factor for how much the sco
|
|
33
33
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
34
34
|
adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
|
35
35
|
|
36
|
-
|
36
|
+
# About Adjusting Table
|
37
|
+
|
38
|
+
## Default Table
|
37
39
|
|
38
40
|
```
|
39
41
|
['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'],
|
@@ -42,9 +44,9 @@ adj_table | boolean | false | The option is used to give partial credit for
|
|
42
44
|
['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'], ['E', ' '], ['Y', ' '], ['S', ' ']
|
43
45
|
```
|
44
46
|
|
45
|
-
## How
|
47
|
+
## How Does It Work?
|
46
48
|
|
47
|
-
|
49
|
+
Original Formula:
|
48
50
|
|
49
51
|
%26%5Ctext%7Bothers%7D%5Cend%7Bcases%7D)
|
50
52
|
|
@@ -53,7 +55,7 @@ where
|
|
53
55
|
- `m` is the number of matching characters.
|
54
56
|
- `t` is half the number of transpositions.
|
55
57
|
|
56
|
-
|
58
|
+
With Adjusting Table:
|
57
59
|
|
58
60
|
%26%5Ctext%7Bothers%7D%5Cend%7Bcases%7D)
|
59
61
|
|
@@ -61,6 +63,15 @@ where
|
|
61
63
|
|
62
64
|
- `s` is the number of nonmatching but similar characters.
|
63
65
|
|
66
|
+
## Difference Between v1.3.1 And v1.3.2.beta
|
67
|
+
|
68
|
+
Version | Algorithm
|
69
|
+
----------- | -----------------------------------------------------------------------
|
70
|
+
v1.3.1 | A single linked list stores the sparse matrix and is traversed to find similar characters.
|
71
|
+
v1.3.2.beta | One hash table with multiple linked lists for collision handling.
|
72
|
+
|
73
|
+
In theory, the latter should work more efficiently than the former (more test data is needed).
|
74
|
+
|
64
75
|
# Why This?
|
65
76
|
|
66
77
|
There is another similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which also provides both C and Ruby versions.
|
data/Rakefile
CHANGED
@@ -27,7 +27,7 @@ task :compare do
|
|
27
27
|
require 'fuzzystringmatch'
|
28
28
|
require 'hotwater'
|
29
29
|
require 'amatch'
|
30
|
-
@ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
|
30
|
+
@ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten'], ['San Francisco', 'Santa Monica']]
|
31
31
|
table = []
|
32
32
|
table << %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch]
|
33
33
|
table << %w[--- --- --- --- --- ---]
|
data/benchmark/native.txt
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
Rehearsal ----------------------------------------------------
|
2
|
-
jaro_winkler 0.350000 0.000000 0.350000 ( 0.
|
3
|
-
fuzzystringmatch 0.
|
4
|
-
hotwater 0.
|
5
|
-
amatch
|
6
|
-
------------------------------------------- total:
|
2
|
+
jaro_winkler 0.350000 0.000000 0.350000 ( 0.348383)
|
3
|
+
fuzzystringmatch 0.330000 0.020000 0.350000 ( 0.354850)
|
4
|
+
hotwater 0.280000 0.000000 0.280000 ( 0.278819)
|
5
|
+
amatch 0.980000 0.000000 0.980000 ( 0.983325)
|
6
|
+
------------------------------------------- total: 1.960000sec
|
7
7
|
|
8
8
|
user system total real
|
9
|
-
jaro_winkler 0.
|
10
|
-
fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.
|
11
|
-
hotwater 0.
|
12
|
-
amatch 0.
|
9
|
+
jaro_winkler 0.330000 0.000000 0.330000 ( 0.331923)
|
10
|
+
fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.135655)
|
11
|
+
hotwater 0.280000 0.000000 0.280000 ( 0.276728)
|
12
|
+
amatch 0.930000 0.010000 0.940000 ( 0.932943)
|
data/ext/jaro_winkler/adj_matrix.c
CHANGED
@@ -1,8 +1,16 @@
|
|
1
1
|
#include <stdlib.h>
|
2
2
|
#include "adj_matrix.h"
|
3
|
+
#include "codepoints.h"
|
4
|
+
|
5
|
+
const char *DEFAULT_ADJ_TABLE[] = {
|
6
|
+
"A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
|
7
|
+
"I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
|
8
|
+
"M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
|
9
|
+
"0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
|
10
|
+
};
|
3
11
|
|
4
12
|
extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
|
5
|
-
|
13
|
+
inline void node_free(Node *head);
|
6
14
|
|
7
15
|
AdjMatrix* adj_matrix_new(unsigned int length){
|
8
16
|
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
|
@@ -42,7 +50,7 @@ char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long
|
|
42
50
|
}
|
43
51
|
}
|
44
52
|
|
45
|
-
|
53
|
+
inline void node_free(Node *head){
|
46
54
|
if(head == NULL) return;
|
47
55
|
node_free(head->next);
|
48
56
|
free(head);
|
@@ -59,4 +67,19 @@ void adj_matrix_free(AdjMatrix *matrix){
|
|
59
67
|
}
|
60
68
|
free(matrix->table);
|
61
69
|
free(matrix);
|
70
|
+
}
|
71
|
+
|
72
|
+
AdjMatrix* adj_matrix_default(){
|
73
|
+
static char first_time = 1;
|
74
|
+
static AdjMatrix *ret_matrix;
|
75
|
+
if(first_time){
|
76
|
+
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
77
|
+
int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
|
78
|
+
for(int i = 0; i < length; i += 2){
|
79
|
+
UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
|
80
|
+
adj_matrix_add(ret_matrix, h1.code, h2.code);
|
81
|
+
}
|
82
|
+
first_time = 0;
|
83
|
+
}
|
84
|
+
return ret_matrix;
|
62
85
|
}
|
data/ext/jaro_winkler/adj_matrix.h
CHANGED
@@ -13,9 +13,10 @@ typedef struct{
|
|
13
13
|
unsigned int length;
|
14
14
|
} AdjMatrix;
|
15
15
|
|
16
|
-
AdjMatrix* adj_matrix_new(unsigned int length);
|
17
|
-
void
|
18
|
-
char
|
19
|
-
void
|
16
|
+
AdjMatrix* adj_matrix_new (unsigned int length);
|
17
|
+
void adj_matrix_add (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
|
18
|
+
char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
|
19
|
+
void adj_matrix_free (AdjMatrix *matrix);
|
20
|
+
AdjMatrix* adj_matrix_default();
|
20
21
|
|
21
22
|
#endif /* ADJ_MATRIX_H */
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include "codepoints.h"
|
4
|
+
|
5
|
+
UnicodeHash unicode_hash_new(const char *str){
|
6
|
+
UnicodeHash ret = {};
|
7
|
+
unsigned char first_char = str[0];
|
8
|
+
if(first_char >= 252) ret.byte_length = 6; // 1111110x
|
9
|
+
else if(first_char >= 248) ret.byte_length = 5; // 111110xx
|
10
|
+
else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
|
11
|
+
else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
|
12
|
+
else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
|
13
|
+
else ret.byte_length = 1;
|
14
|
+
memcpy(&ret.code, str, ret.byte_length);
|
15
|
+
return ret;
|
16
|
+
}
|
17
|
+
|
18
|
+
Codepoints codepoints_new(const char *str, int byte_len){
|
19
|
+
Codepoints ret = {};
|
20
|
+
ret.ary = malloc(byte_len * sizeof(long long));
|
21
|
+
ret.length = 0;
|
22
|
+
for(int i = 0; i < byte_len;){
|
23
|
+
UnicodeHash hash = unicode_hash_new(str + i);
|
24
|
+
ret.ary[ret.length] = hash.code;
|
25
|
+
ret.length++;
|
26
|
+
i += hash.byte_length;
|
27
|
+
}
|
28
|
+
return ret;
|
29
|
+
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef CODEPOINTS_H
|
2
|
+
#define CODEPOINTS_H 1
|
3
|
+
|
4
|
+
typedef struct{
|
5
|
+
unsigned long long code;
|
6
|
+
unsigned int byte_length;
|
7
|
+
} UnicodeHash;
|
8
|
+
|
9
|
+
typedef struct{
|
10
|
+
unsigned long long *ary;
|
11
|
+
int length;
|
12
|
+
} Codepoints;
|
13
|
+
|
14
|
+
UnicodeHash unicode_hash_new(const char *str);
|
15
|
+
Codepoints codepoints_new (const char *str, int byte_len);
|
16
|
+
|
17
|
+
#endif /* CODEPOINTS_H */
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -1,30 +1,9 @@
|
|
1
|
-
#include <string.h>
|
2
1
|
#include <stdlib.h>
|
3
2
|
#include <ctype.h>
|
4
3
|
#include "distance.h"
|
4
|
+
#include "codepoints.h"
|
5
5
|
#include "adj_matrix.h"
|
6
6
|
|
7
|
-
typedef struct{
|
8
|
-
unsigned long long code;
|
9
|
-
unsigned int byte_length;
|
10
|
-
} UnicodeHash;
|
11
|
-
|
12
|
-
typedef struct{
|
13
|
-
unsigned long long *ary;
|
14
|
-
int length;
|
15
|
-
} Codepoints;
|
16
|
-
|
17
|
-
const char *DEFAULT_ADJ_TABLE[] = {
|
18
|
-
"A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
|
19
|
-
"I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
|
20
|
-
"M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
|
21
|
-
"0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
|
22
|
-
};
|
23
|
-
|
24
|
-
static UnicodeHash unicode_hash_new(const char *str);
|
25
|
-
static Codepoints codepoints_new(const char *str, int byte_len);
|
26
|
-
static AdjMatrix* adj_matrix_default();
|
27
|
-
|
28
7
|
Option option_new(){
|
29
8
|
Option opt;
|
30
9
|
opt.ignore_case = opt.adj_table = 0;
|
@@ -33,12 +12,9 @@ Option option_new(){
|
|
33
12
|
return opt;
|
34
13
|
}
|
35
14
|
|
36
|
-
double
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len);
|
41
|
-
Codepoints code_ary_2 = codepoints_new(s2, s2_byte_len);
|
15
|
+
double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
|
16
|
+
Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len),
|
17
|
+
code_ary_2 = codepoints_new(s2, s2_byte_len);
|
42
18
|
|
43
19
|
if(opt.ignore_case){
|
44
20
|
for(int i = 0; i < code_ary_1.length; ++i) if(code_ary_1.ary[i] < 256 && islower(code_ary_1.ary[i])) code_ary_1.ary[i] -= 32;
|
@@ -54,19 +30,19 @@ double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option o
|
|
54
30
|
// Compute jaro distance
|
55
31
|
int window_size = code_ary_2.length / 2 - 1;
|
56
32
|
if(window_size < 0) window_size = 0;
|
57
|
-
double matches = 0.0
|
58
|
-
|
59
|
-
int transpositions = 0
|
60
|
-
|
61
|
-
|
33
|
+
double matches = 0.0,
|
34
|
+
sim_matches = 0.0;
|
35
|
+
int transpositions = 0,
|
36
|
+
previous_index = -1,
|
37
|
+
max_index = code_ary_2.length - 1;
|
62
38
|
for(int i = 0; i < code_ary_1.length; i++){
|
63
39
|
int left = i - window_size;
|
64
40
|
int right = i + window_size;
|
65
41
|
if(left < 0) left = 0;
|
66
42
|
if(right > max_index) right = max_index;
|
67
|
-
char matched = 0
|
68
|
-
|
69
|
-
|
43
|
+
char matched = 0,
|
44
|
+
found = 0,
|
45
|
+
sim_matched = 0;
|
70
46
|
for(int j = left; j <= right; j++){
|
71
47
|
if(code_ary_1.ary[i] == code_ary_2.ary[j]){
|
72
48
|
matched = 1;
|
@@ -97,45 +73,4 @@ double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option o
|
|
97
73
|
}
|
98
74
|
free(code_ary_1.ary); free(code_ary_2.ary);
|
99
75
|
return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
|
100
|
-
}
|
101
|
-
|
102
|
-
static UnicodeHash unicode_hash_new(const char *str){
|
103
|
-
UnicodeHash ret = {};
|
104
|
-
unsigned char first_char = str[0];
|
105
|
-
if(first_char >= 252) ret.byte_length = 6; // 1111110x
|
106
|
-
else if(first_char >= 248) ret.byte_length = 5; // 111110xx
|
107
|
-
else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
|
108
|
-
else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
|
109
|
-
else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
|
110
|
-
else ret.byte_length = 1;
|
111
|
-
memcpy(&ret.code, str, ret.byte_length);
|
112
|
-
return ret;
|
113
|
-
}
|
114
|
-
|
115
|
-
static Codepoints codepoints_new(const char *str, int byte_len){
|
116
|
-
Codepoints ret = {};
|
117
|
-
ret.ary = calloc(byte_len, sizeof(long long));
|
118
|
-
int count = 0;
|
119
|
-
for(int i = 0; i < byte_len;){
|
120
|
-
UnicodeHash hash = unicode_hash_new(str + i);
|
121
|
-
ret.ary[count] = hash.code;
|
122
|
-
count++;
|
123
|
-
i += hash.byte_length;
|
124
|
-
}
|
125
|
-
ret.length += count;
|
126
|
-
return ret;
|
127
|
-
}
|
128
|
-
|
129
|
-
static AdjMatrix* adj_matrix_default(){
|
130
|
-
static char first_time = 1;
|
131
|
-
static AdjMatrix *ret_matrix;
|
132
|
-
if(first_time){
|
133
|
-
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
134
|
-
for(int i = 0; i < 78; i += 2){
|
135
|
-
UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
|
136
|
-
adj_matrix_add(ret_matrix, h1.code, h2.code);
|
137
|
-
}
|
138
|
-
first_time = 0;
|
139
|
-
}
|
140
|
-
return ret_matrix;
|
141
76
|
}
|
data/ext/jaro_winkler/distance.h
CHANGED
@@ -6,7 +6,7 @@ typedef struct{
|
|
6
6
|
char ignore_case, adj_table;
|
7
7
|
} Option;
|
8
8
|
|
9
|
-
double
|
9
|
+
double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
|
10
10
|
Option option_new();
|
11
11
|
|
12
12
|
#endif /* DISTANCE_H */
|
data/ext/jaro_winkler/extconf.rb
CHANGED
data/ext/jaro_winkler/jaro_winkler.c
CHANGED
@@ -13,15 +13,15 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
|
|
13
13
|
rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
|
14
14
|
Option c_opt = option_new();
|
15
15
|
if(TYPE(opt) == T_HASH){
|
16
|
-
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")))
|
17
|
-
|
18
|
-
|
19
|
-
|
16
|
+
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
|
17
|
+
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
|
18
|
+
ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
|
19
|
+
adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
|
20
20
|
if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
|
21
21
|
if(c_opt.weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
|
22
22
|
if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
|
23
23
|
if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
24
24
|
if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
|
25
25
|
}
|
26
|
-
return rb_float_new(
|
26
|
+
return rb_float_new(distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
|
27
27
|
}
|
data/ext/jaro_winkler/murmur_hash2.c
CHANGED
@@ -1,64 +1,64 @@
|
|
1
|
-
//-----------------------------------------------------------------------------
|
2
|
-
// MurmurHash2, by Austin Appleby
|
3
|
-
|
4
|
-
// Note - This code makes a few assumptions about how your machine behaves -
|
5
|
-
|
6
|
-
// 1. We can read a 4-byte value from any address without crashing
|
7
|
-
// 2. sizeof(int) == 4
|
8
|
-
|
9
|
-
// And it has a few limitations -
|
10
|
-
|
11
|
-
// 1. It will not work incrementally.
|
12
|
-
// 2. It will not produce the same results on little-endian and big-endian
|
13
|
-
// machines.
|
14
|
-
|
15
|
-
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
|
16
|
-
{
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
}
|
1
|
+
//-----------------------------------------------------------------------------
|
2
|
+
// MurmurHash2, by Austin Appleby
|
3
|
+
|
4
|
+
// Note - This code makes a few assumptions about how your machine behaves -
|
5
|
+
|
6
|
+
// 1. We can read a 4-byte value from any address without crashing
|
7
|
+
// 2. sizeof(int) == 4
|
8
|
+
|
9
|
+
// And it has a few limitations -
|
10
|
+
|
11
|
+
// 1. It will not work incrementally.
|
12
|
+
// 2. It will not produce the same results on little-endian and big-endian
|
13
|
+
// machines.
|
14
|
+
|
15
|
+
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
|
16
|
+
{
|
17
|
+
// 'm' and 'r' are mixing constants generated offline.
|
18
|
+
// They're not really 'magic', they just happen to work well.
|
19
|
+
|
20
|
+
const unsigned int m = 0x5bd1e995;
|
21
|
+
const int r = 24;
|
22
|
+
|
23
|
+
// Initialize the hash to a 'random' value
|
24
|
+
|
25
|
+
unsigned int h = seed ^ len;
|
26
|
+
|
27
|
+
// Mix 4 bytes at a time into the hash
|
28
|
+
|
29
|
+
const unsigned char * data = (const unsigned char *)key;
|
30
|
+
|
31
|
+
while(len >= 4)
|
32
|
+
{
|
33
|
+
unsigned int k = *(unsigned int *)data;
|
34
|
+
|
35
|
+
k *= m;
|
36
|
+
k ^= k >> r;
|
37
|
+
k *= m;
|
38
|
+
|
39
|
+
h *= m;
|
40
|
+
h ^= k;
|
41
|
+
|
42
|
+
data += 4;
|
43
|
+
len -= 4;
|
44
|
+
}
|
45
|
+
|
46
|
+
// Handle the last few bytes of the input array
|
47
|
+
|
48
|
+
switch(len)
|
49
|
+
{
|
50
|
+
case 3: h ^= data[2] << 16;
|
51
|
+
case 2: h ^= data[1] << 8;
|
52
|
+
case 1: h ^= data[0];
|
53
|
+
h *= m;
|
54
|
+
};
|
55
|
+
|
56
|
+
// Do a few final mixes of the hash to ensure the last few
|
57
|
+
// bytes are well-incorporated.
|
58
|
+
|
59
|
+
h ^= h >> 13;
|
60
|
+
h *= m;
|
61
|
+
h ^= h >> 15;
|
62
|
+
|
63
|
+
return h;
|
64
|
+
}
|
data/jaro_winkler.gemspec
CHANGED
@@ -23,4 +23,8 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_development_dependency "bundler", "~> 1.7"
|
24
24
|
spec.add_development_dependency "rake", "~> 10.0"
|
25
25
|
spec.add_development_dependency "rake-compiler"
|
26
|
+
spec.add_development_dependency "rspec"
|
27
|
+
spec.add_development_dependency "fuzzy-string-match"
|
28
|
+
spec.add_development_dependency "hotwater"
|
29
|
+
spec.add_development_dependency "amatch"
|
26
30
|
end
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -5,22 +5,23 @@ include JaroWinkler
|
|
5
5
|
shared_examples 'common' do |strategy|
|
6
6
|
it 'works' do
|
7
7
|
ary = [
|
8
|
-
['henka'
|
9
|
-
['al'
|
10
|
-
['martha'
|
11
|
-
['jones'
|
12
|
-
['abcvwxyz'
|
13
|
-
['dwayne'
|
14
|
-
['dixon'
|
15
|
-
['fvie'
|
16
|
-
['tony'
|
17
|
-
['tonytonyjan'
|
18
|
-
['x'
|
19
|
-
[''
|
20
|
-
['tony'
|
21
|
-
[''
|
22
|
-
['tonytonyjan'
|
23
|
-
['tony'
|
8
|
+
['henka' , 'henkan' , 0.9667] ,
|
9
|
+
['al' , 'al' , 1.0] ,
|
10
|
+
['martha' , 'marhta' , 0.9611] ,
|
11
|
+
['jones' , 'johnson' , 0.8323] ,
|
12
|
+
['abcvwxyz' , 'cabvwxyz' , 0.9583] ,
|
13
|
+
['dwayne' , 'duane' , 0.8400] ,
|
14
|
+
['dixon' , 'dicksonx' , 0.8133] ,
|
15
|
+
['fvie' , 'ten' , 0.0] ,
|
16
|
+
['tony' , 'tony' , 1.0] ,
|
17
|
+
['tonytonyjan' , 'tonytonyjan' , 1.0] ,
|
18
|
+
['x' , 'x' , 1.0] ,
|
19
|
+
['' , '' , 0.0] ,
|
20
|
+
['tony' , '' , 0.0] ,
|
21
|
+
['' , 'tony' , 0.0] ,
|
22
|
+
['tonytonyjan' , 'tony' , 0.8727] ,
|
23
|
+
['tony' , 'tonytonyjan' , 0.8727] ,
|
24
|
+
['San Francisco' , 'Santa Monica' , 0.8180]
|
24
25
|
]
|
25
26
|
ary.each do |s1, s2, ans|
|
26
27
|
expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.2.
|
4
|
+
version: 1.3.2.beta2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,62 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: fuzzy-string-match
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: hotwater
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: amatch
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
55
111
|
description: It's an implementation of the Jaro-Winkler distance algorithm. It uses a C extension
|
56
112
|
and will fall back to the pure Ruby version in JRuby. Both implementations support UTF-8
|
57
113
|
strings.
|
@@ -74,6 +130,8 @@ files:
|
|
74
130
|
- benchmark/pure.txt
|
75
131
|
- ext/jaro_winkler/adj_matrix.c
|
76
132
|
- ext/jaro_winkler/adj_matrix.h
|
133
|
+
- ext/jaro_winkler/codepoints.c
|
134
|
+
- ext/jaro_winkler/codepoints.h
|
77
135
|
- ext/jaro_winkler/distance.c
|
78
136
|
- ext/jaro_winkler/distance.h
|
79
137
|
- ext/jaro_winkler/extconf.rb
|
@@ -107,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
165
|
version: 1.3.1
|
108
166
|
requirements: []
|
109
167
|
rubyforge_project:
|
110
|
-
rubygems_version: 2.4.
|
168
|
+
rubygems_version: 2.4.2
|
111
169
|
signing_key:
|
112
170
|
specification_version: 4
|
113
171
|
summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
|