jaro_winkler 1.4.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/jaro_winkler/adj_matrix.c +89 -0
- data/ext/jaro_winkler/adj_matrix.h +22 -0
- data/ext/jaro_winkler/code.c +29 -0
- data/ext/jaro_winkler/code.h +7 -0
- data/ext/jaro_winkler/jaro.c +122 -0
- data/ext/jaro_winkler/jaro.h +17 -0
- data/ext/jaro_winkler/jaro_winkler.c +45 -0
- data/ext/jaro_winkler/murmur_hash2.c +64 -0
- data/lib/jaro_winkler.rb +9 -0
- data/lib/jaro_winkler/adjusting_table.rb +19 -0
- data/lib/jaro_winkler/jaro_winkler_pure.rb +125 -0
- data/lib/jaro_winkler/version.rb +3 -0
- metadata +111 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 52f268c19787793ca7383fe1de1f0355e0a3e6b9
|
4
|
+
data.tar.gz: f8814b814294a7f9268a6df2ad1ad72c75146c3e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2ea65143ad847ef5cd565584c2dd1ce19908136506697eafe0579609227628a6e2bbb4baacd0d6c3ee883bcea07fff3043ae305d84c307a0e5f359dff64ab0c1
|
7
|
+
data.tar.gz: 254d25523a0654343ca5b9a552789021a30d3dc7d0c613333db7d67f3ccc41ce003c01bf00b63089b9a340a83594d40f1d7c49b2e59601885b471e68048fc23f
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#include "adj_matrix.h"
|
2
|
+
#include "code.h"
|
3
|
+
|
4
|
+
#include <stdlib.h>
|
5
|
+
|
6
|
+
/*
 * Built-in list of character pairs that are treated as "similar".
 * The array is flat: entries 2k and 2k+1 together form one pair, so it must
 * always contain an even number of strings.
 */
const char *DEFAULT_ADJ_TABLE[] = {
  "A","E",  "A","I",  "A","O",  "A","U",  "B","V",  "E","I",
  "E","O",  "E","U",  "I","O",  "I","U",  "O","U",  "I","Y",
  "E","Y",  "C","G",  "E","F",  "W","U",  "W","V",  "X","K",
  "S","Z",  "X","S",  "Q","C",  "U","V",  "M","N",  "L","I",
  "Q","O",  "P","R",  "I","J",  "2","Z",  "5","S",  "8","B",
  "1","I",  "1","L",  "0","O",  "0","Q",  "C","K",  "G","J",
  "E"," ",  "Y"," ",  "S"," ",
};
|
12
|
+
|
13
|
+
extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
|
14
|
+
void node_free(Node *head);
|
15
|
+
|
16
|
+
AdjMatrix* adj_matrix_new(unsigned int length){
|
17
|
+
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
|
18
|
+
matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
|
19
|
+
matrix->table = malloc(matrix->length * sizeof(Node**));
|
20
|
+
for(int i = 0; i < matrix->length; i++){
|
21
|
+
matrix->table[i] = malloc(matrix->length * sizeof(Node*));
|
22
|
+
for (int j = 0; j < matrix->length; j++)
|
23
|
+
matrix->table[i][j] = NULL;
|
24
|
+
}
|
25
|
+
return matrix;
|
26
|
+
}
|
27
|
+
|
28
|
+
void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
|
29
|
+
unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
|
30
|
+
h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
|
31
|
+
Node *new_node = malloc(sizeof(Node)); new_node->x = h1; new_node->y = h2; new_node->next = NULL;
|
32
|
+
if(matrix->table[h1][h2] == NULL){
|
33
|
+
matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
|
34
|
+
}
|
35
|
+
else{
|
36
|
+
Node *previous = NULL;
|
37
|
+
for(Node *i = matrix->table[h1][h2]; i != NULL; i = i->next) previous = i;
|
38
|
+
previous->next = new_node;
|
39
|
+
}
|
40
|
+
}
|
41
|
+
|
42
|
+
char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
|
43
|
+
unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
|
44
|
+
h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
|
45
|
+
Node *node = matrix->table[h1][h2];
|
46
|
+
if(node == NULL) return 0;
|
47
|
+
else{
|
48
|
+
for(Node *i = node; i != NULL; i = i->next)
|
49
|
+
if((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1)) return 1;
|
50
|
+
return 0;
|
51
|
+
}
|
52
|
+
}
|
53
|
+
|
54
|
+
/*
 * Release every node of the singly linked list starting at `head`.
 * A NULL `head` is accepted and does nothing.
 */
void node_free(Node *head){
  while(head != NULL){
    Node *following = head->next; /* read the link before the node is gone */
    free(head);
    head = following;
  }
}
|
59
|
+
|
60
|
+
/*
 * Release a matrix created by adj_matrix_new(), including every node chain.
 * Passing NULL is allowed and does nothing, mirroring free().
 */
void adj_matrix_free(AdjMatrix *matrix){
  if(matrix == NULL) return;

  for(unsigned int i = 0; i < matrix->length; i++){
    for(unsigned int j = 0; j < matrix->length; j++){
      /*
       * table[i][j] and table[j][i] share one chain, so both slots are
       * cleared when the chain is freed to avoid a double free later.
       * The NULL test must stay: for j < i row j has already been freed,
       * and the mirrored write below is only safe because such slots were
       * cleared while row j was being processed.
       */
      if(matrix->table[i][j] != NULL){
        node_free(matrix->table[i][j]);
        matrix->table[i][j] = matrix->table[j][i] = NULL;
      }
    }
    free(matrix->table[i]);
  }
  free(matrix->table);
  free(matrix);
}
|
72
|
+
|
73
|
+
AdjMatrix* adj_matrix_default(){
|
74
|
+
static char first_time = 1;
|
75
|
+
static AdjMatrix *ret_matrix;
|
76
|
+
if(first_time){
|
77
|
+
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
78
|
+
int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
|
79
|
+
for(int i = 0; i < length; i += 2){
|
80
|
+
unsigned long long code_1, code_2;
|
81
|
+
int dummy_length;
|
82
|
+
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
|
83
|
+
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
|
84
|
+
adj_matrix_add(ret_matrix, code_1, code_2);
|
85
|
+
}
|
86
|
+
first_time = 0;
|
87
|
+
}
|
88
|
+
return ret_matrix;
|
89
|
+
}
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#ifndef ADJ_MATRIX_H
#define ADJ_MATRIX_H

/* Side length of the table used when adj_matrix_new() is given 0. */
#define ADJ_MATRIX_DEFAULT_LENGTH 958
/* Seed handed to the hash function when computing bucket indices. */
#define ADJ_MATRIX_SEED 9527

/* One entry of a bucket chain (singly linked list). */
typedef struct _node{
  struct _node *next;      /* following entry in the chain, or NULL */
  unsigned long long x;    /* first value of the stored pair */
  unsigned long long y;    /* second value of the stored pair */
} Node;

/* Square table of bucket chains. */
typedef struct{
  Node ***table;           /* length rows, each holding length chain heads */
  unsigned int length;     /* side length of the table */
} AdjMatrix;

AdjMatrix* adj_matrix_new(unsigned int length);
void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y);
char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y);
void adj_matrix_free(AdjMatrix *matrix);
AdjMatrix* adj_matrix_default();

#endif
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
|
4
|
+
/*
 * Read one character starting at `str` and pack its raw bytes into a code.
 *
 * str:             first byte of the character; as many bytes as the lead
 *                  byte announces (up to 6) are read from here.
 * ret_code:        receives the bytes, copied verbatim into the low memory
 *                  of a zeroed unsigned long long (no decoding is done).
 * ret_byte_length: receives the byte count derived from the lead byte:
 *                  >= 252 -> 6, >= 248 -> 5, >= 240 -> 4, >= 224 -> 3,
 *                  >= 192 -> 2, anything else -> 1.
 */
void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
  const unsigned char lead = (unsigned char)str[0];
  int width = 1;

  /* Thresholds are ascending, so the last one that applies wins. */
  if(lead >= 192) width = 2; // 110xxxxx
  if(lead >= 224) width = 3; // 1110xxxx
  if(lead >= 240) width = 4; // 11110xxx
  if(lead >= 248) width = 5; // 111110xx
  if(lead >= 252) width = 6; // 1111110x

  *ret_byte_length = width;
  *ret_code = 0;
  memcpy(ret_code, str, width);
}
|
15
|
+
|
16
|
+
/*
 * Split a byte string into per-character codes using utf_char_to_code().
 *
 * str:        input bytes (need not be NUL-terminated).
 * length:     number of bytes available at `str`.
 * ret_codes:  receives a newly allocated array with room for `length` codes;
 *             the caller releases it with free(). Set to NULL when the
 *             allocation fails.
 * ret_length: receives the number of codes written (0 on allocation failure).
 */
void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
  enum { MAX_CHAR_BYTES = 6 }; /* most bytes utf_char_to_code() ever reads */

  *ret_length = 0;
  *ret_codes = calloc(length, sizeof **ret_codes);
  if(*ret_codes == NULL) return;

  for(int i = 0; i < length;){
    int byte_length;
    int remaining = length - i;

    if(remaining >= MAX_CHAR_BYTES){
      utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
    }
    else{
      /*
       * Near the end of the input a lead byte may announce more bytes than
       * are left (truncated or invalid sequence). Decode from a zero-padded
       * copy so nothing beyond the caller's buffer is ever read.
       */
      char tail[MAX_CHAR_BYTES] = {0};
      memcpy(tail, &str[i], remaining);
      utf_char_to_code(tail, &(*ret_codes)[*ret_length], &byte_length);
    }

    *ret_length += 1;
    i += byte_length;
  }
}
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#include "jaro.h"
|
2
|
+
#include "code.h"
|
3
|
+
#include "adj_matrix.h"
|
4
|
+
|
5
|
+
#include <string.h>
|
6
|
+
#include <stdlib.h>
|
7
|
+
#include <ctype.h>
|
8
|
+
|
9
|
+
#define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
|
10
|
+
|
11
|
+
double jaro_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
|
12
|
+
double jaro_winkler_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
|
13
|
+
|
14
|
+
/*
 * Jaro distance between two byte strings.
 *
 * Both strings are converted to code arrays with string_to_codes(), scored
 * by jaro_distance_from_codes() and the temporary arrays are released.
 * Returns 0.0 immediately when either byte length is 0.
 */
double jaro_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
  if(short_str_len == 0 || long_str_len == 0) return 0.0;

  unsigned long long *first_codes;
  unsigned long long *second_codes;
  int first_count;
  int second_count;

  string_to_codes(short_str, short_str_len, &first_codes, &first_count);
  string_to_codes(long_str, long_str_len, &second_codes, &second_count);

  double score = jaro_distance_from_codes(first_codes, first_count, second_codes, second_count, opt);

  free(first_codes);
  free(second_codes);
  return score;
}
|
27
|
+
|
28
|
+
/*
 * Jaro-Winkler distance between two byte strings.
 *
 * Both strings are converted to code arrays with string_to_codes(), scored
 * by jaro_winkler_distance_from_codes() and the temporary arrays are
 * released. Returns 0.0 immediately when either byte length is 0.
 */
double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
  if(short_str_len == 0 || long_str_len == 0) return 0.0;

  unsigned long long *first_codes;
  unsigned long long *second_codes;
  int first_count;
  int second_count;

  string_to_codes(short_str, short_str_len, &first_codes, &first_count);
  string_to_codes(long_str, long_str_len, &second_codes, &second_count);

  double score = jaro_winkler_distance_from_codes(first_codes, first_count, second_codes, second_count, opt);

  free(first_codes);
  free(second_codes);
  return score;
}
|
41
|
+
|
42
|
+
/*
 * Jaro distance between two code arrays.
 *
 * short_codes / long_codes: character codes of the two strings, in either
 *   order; they are swapped locally so the shorter one is scanned first.
 *   When opt->ignore_case is set the arrays are case-folded IN PLACE, and
 *   callers (the Winkler prefix step) rely on seeing the folded codes.
 * opt: options; ignore_case and adj_table are read here.
 *
 * Returns a similarity score, 0.0 when either array is empty, when nothing
 * matches, or when the scratch buffer cannot be allocated.
 */
double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
  if(!short_codes_len || !long_codes_len) return 0.0;

  if(short_codes_len > long_codes_len){
    SWAP(short_codes, long_codes);
    SWAP(short_codes_len, long_codes_len);
  }

  if(opt->ignore_case){
    /*
     * Fold ASCII a-z to upper case by hand. The codes are packed multi-byte
     * values, so they must not be handed to <ctype.h> functions (undefined
     * for values outside unsigned char). Upper case is used because the
     * adjacency table is keyed by upper-case letters.
     */
    for(int i = 0; i < short_codes_len; i++)
      if(short_codes[i] >= 'a' && short_codes[i] <= 'z') short_codes[i] -= 'a' - 'A';
    for(int i = 0; i < long_codes_len; i++)
      if(long_codes[i] >= 'a' && long_codes[i] <= 'z') long_codes[i] -= 'a' - 'A';
  }

  int window_size = long_codes_len/2 - 1;
  if(window_size < 0) window_size = 0;

  /*
   * One zeroed heap buffer holds both "already matched" flag arrays; the
   * size depends on the input, so the stack is not a safe place for it.
   */
  char *flags = calloc((size_t)short_codes_len + (size_t)long_codes_len, sizeof *flags);
  if(flags == NULL) return 0.0;
  char *short_codes_flag = flags;
  char *long_codes_flag = flags + short_codes_len;

  // count number of matching characters
  int match_count = 0;
  for(int i = 0; i < short_codes_len; i++){
    int left = (i >= window_size) ? i - window_size : 0;
    int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
    for(int j = left; j <= right; j++){
      if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
        short_codes_flag[i] = long_codes_flag[j] = 1;
        match_count++;
        break;
      }
    }
  }

  if(!match_count){
    free(flags);
    return 0.0;
  }

  // count number of transpositions
  int transposition_count = 0, j = 0, k = 0;
  for(int i = 0; i < short_codes_len; i++){
    if(short_codes_flag[i]){
      /* Advance to the next matched position of the long string; one always
         exists because both sides carry the same number of matched flags. */
      for(j = k; j < long_codes_len; j++){
        if(long_codes_flag[j]){
          k = j + 1;
          break;
        }
      }
      if(short_codes[i] != long_codes[j]) transposition_count++;
    }
  }

  // count similarities in nonmatched characters
  int similar_count = 0;
  if(opt->adj_table && short_codes_len > match_count){
    AdjMatrix *adj = adj_matrix_default(); /* looked up once, not per pair */
    for(int i = 0; i < short_codes_len; i++){
      if(short_codes_flag[i]) continue;
      for(int n = 0; n < long_codes_len; n++){
        if(!long_codes_flag[n] && adj_matrix_find(adj, short_codes[i], long_codes[n])){
          similar_count += 3;
          break;
        }
      }
    }
  }

  double m = (double)match_count;
  double t = (double)(transposition_count/2); /* integer halving is intentional */
  if(opt->adj_table) m = similar_count/10.0 + m;
  double score = (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;

  free(flags);
  return score;
}
|
111
|
+
|
112
|
+
/*
 * Jaro-Winkler distance between two code arrays.
 *
 * Computes the Jaro distance and, when it reaches opt->threshold, boosts it
 * by opt->weight for each of the first (at most 4) positions where both
 * arrays hold the same code.
 *
 * The arrays may be passed in either order. The swap performed inside
 * jaro_distance_from_codes() is local to that function, so the common-prefix
 * scan here is bounded by the SHORTER of the two lengths to stay inside both
 * arrays.
 */
double jaro_winkler_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
  double jaro = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);

  if(jaro < opt->threshold) return jaro;

  int max_prefix = short_codes_len < long_codes_len ? short_codes_len : long_codes_len;
  if(max_prefix > 4) max_prefix = 4;

  int prefix = 0;
  while(prefix < max_prefix && short_codes[prefix] == long_codes[prefix]) prefix++;

  return jaro + prefix*opt->weight*(1-jaro);
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef LIBJARO_JARO_H
#define LIBJARO_JARO_H

/* Values used by DEFAULT_OPT below. */
#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7

/* Options accepted by the distance functions. */
typedef struct LibJaroOption{
  double weight;      /* per-character bonus for a shared prefix */
  double threshold;   /* Jaro score below which no prefix bonus is applied */
  char ignore_case;   /* non-zero: case-fold the input before comparing */
  char adj_table;     /* non-zero: also credit pairs found in the adjacency table */
} LibJaroOption;

/* Option set with the default weight and threshold and both flags off. */
static const LibJaroOption DEFAULT_OPT = {
  .weight = DEFAULT_WEIGHT,
  .threshold = DEFAULT_THRESHOLD,
  .ignore_case = 0,
  .adj_table = 0
};

double jaro_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);

#endif
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "jaro.h"
|
3
|
+
|
4
|
+
VALUE rb_mJaroWinkler,
|
5
|
+
rb_eError,
|
6
|
+
rb_eInvalidWeightError;
|
7
|
+
|
8
|
+
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
|
9
|
+
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
|
10
|
+
VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt));
|
11
|
+
|
12
|
+
/*
 * Registers the extension's Ruby-visible API:
 *   - module JaroWinkler
 *   - JaroWinkler::Error (subclass of RuntimeError) and
 *     JaroWinkler::InvalidWeightError (subclass of JaroWinkler::Error)
 *   - module functions JaroWinkler.distance and JaroWinkler.jaro_distance,
 *     both registered with arity -1 (handlers receive argc/argv).
 * NOTE(review): presumably called by Ruby when the jaro_winkler_ext library
 * is loaded - confirm against the build configuration.
 */
void Init_jaro_winkler_ext(void){
|
13
|
+
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
14
|
+
rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
|
15
|
+
rb_eInvalidWeightError = rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
|
16
|
+
rb_define_module_function(rb_mJaroWinkler, "distance", rb_jaro_winkler_distance, -1);
|
17
|
+
rb_define_module_function(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance, -1);
|
18
|
+
}
|
19
|
+
|
20
|
+
|
21
|
+
/*
 * Shared implementation behind JaroWinkler.distance and
 * JaroWinkler.jaro_distance.
 *
 * Ruby-level arguments: two strings plus optional keyword options
 * :weight, :threshold, :ignore_case and :adj_table. Options that are absent
 * or nil keep their DEFAULT_OPT value.
 *
 * distance_fn: the C scoring function to apply to the two strings.
 *
 * Returns a Ruby Float. Raises JaroWinkler::InvalidWeightError when the
 * weight exceeds 0.25, and whatever the string/number conversions raise for
 * unsuitable arguments.
 */
VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt)){
  VALUE s1, s2, opt;
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);

  LibJaroOption c_opt = DEFAULT_OPT;
  if(TYPE(opt) == T_HASH){
    VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
          threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
          ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
          adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
    if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
    if(c_opt.weight > 0.25) rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
    if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
    if(!NIL_P(ignore_case)) c_opt.ignore_case = RTEST(ignore_case) ? 1 : 0;
    if(!NIL_P(adj_table)) c_opt.adj_table = RTEST(adj_table) ? 1 : 0;
  }

  /*
   * Convert both arguments to Strings BEFORE reading pointer or length.
   * Function arguments are evaluated in unspecified order, so reading the
   * length in the same call that performs the conversion could inspect an
   * object that is not a String yet.
   */
  StringValue(s1);
  StringValue(s2);

  /* RSTRING_LENINT raises instead of silently truncating a long to int. */
  return rb_float_new((*distance_fn)(RSTRING_PTR(s1), RSTRING_LENINT(s1), RSTRING_PTR(s2), RSTRING_LENINT(s2), &c_opt));
}
|
38
|
+
|
39
|
+
/* Handler for JaroWinkler.jaro_distance: forwards its arguments to
   distance() with jaro_distance as the scoring function. */
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self){
|
40
|
+
return distance(argc, argv, self, jaro_distance);
|
41
|
+
}
|
42
|
+
|
43
|
+
/* Handler for JaroWinkler.distance: forwards its arguments to distance()
   with jaro_winkler_distance as the scoring function. */
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self){
|
44
|
+
return distance(argc, argv, self, jaro_winkler_distance);
|
45
|
+
}
|
@@ -0,0 +1,64 @@
|
|
1
|
+
//-----------------------------------------------------------------------------
// MurmurHash2, by Austin Appleby

// Note - This code makes one assumption about how your machine behaves -

// 1. sizeof(int) == 4

// And it has a limitation -

// 1. It will not work incrementally.

// Input words are assembled byte by byte (least significant byte first), so
// the key may live at any address and no unaligned or type-punned read is
// performed. On little-endian machines the result equals that of a native
// 4-byte load.

/*
 * key:  bytes to hash.
 * len:  number of bytes at `key`.
 * seed: initial value mixed into the hash.
 * Returns the 32-bit hash.
 */
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.

  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value

  unsigned int h = seed ^ len;

  // Mix 4 bytes at a time into the hash

  const unsigned char * data = (const unsigned char *)key;

  while(len >= 4)
  {
    unsigned int k = (unsigned int)data[0]
                   | ((unsigned int)data[1] << 8)
                   | ((unsigned int)data[2] << 16)
                   | ((unsigned int)data[3] << 24);

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array

  switch(len)
  {
  case 3: h ^= (unsigned int)data[2] << 16; /* fallthrough */
  case 2: h ^= (unsigned int)data[1] << 8;  /* fallthrough */
  case 1: h ^= data[0];
          h *= m;
          break;
  default: break; /* no trailing bytes */
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.

  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}
|
data/lib/jaro_winkler.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module JaroWinkler
  # Symmetric lookup of "similar" character pairs:
  # DEFAULT_ADJ_TABLE[a][b] is true when (a, b) or (b, a) is listed below.
  DEFAULT_ADJ_TABLE = {}

  [
    ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'],
    ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'], ['O', 'U'], ['I', 'Y'],
    ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
    ['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'],
    ['Q', 'O'], ['P', 'R'], ['I', 'J'], ['2', 'Z'], ['5', 'S'], ['8', 'B'],
    ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
    ['E', ' '], ['Y', ' '], ['S', ' ']
  ].each do |s1, s2|
    # Create the inner hash for each side on first use, then link both ways.
    DEFAULT_ADJ_TABLE[s1] ||= {}
    DEFAULT_ADJ_TABLE[s2] ||= {}
    DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
  end

  # Unknown first characters resolve to an empty hash, so a chained
  # lookup such as DEFAULT_ADJ_TABLE[a][b] yields nil instead of raising.
  DEFAULT_ADJ_TABLE.default = {}
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'jaro_winkler/adjusting_table'
|
2
|
+
module JaroWinkler
  class Error < RuntimeError; end
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: {adj_table: false, ignore_case: false},
    jaro_winkler: {weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD}
  }

  module_function

  # Jaro-Winkler distance between two strings.
  # Options: :weight, :threshold, :ignore_case, :adj_table.
  def distance str1, str2, options={}
    _distance str1.codepoints.to_a, str2.codepoints.to_a, options
  end

  # Plain Jaro distance between two strings.
  # Options: :ignore_case, :adj_table.
  def jaro_distance str1, str2, options={}
    _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
  end

  # Jaro-Winkler distance on codepoint arrays. Raises InvalidWeightError
  # when the weight exceeds 0.25. Scores below the threshold are returned
  # without the common-prefix bonus.
  def _distance codes1, codes2, options={}
    options = DEFAULT_OPTIONS[:jaro_winkler].merge options
    raise InvalidWeightError if options[:weight] > 0.25

    jaro = _jaro_distance(codes1, codes2, options)
    return jaro if jaro < options[:threshold]

    # Shorter array first, so the prefix scan never runs past either array.
    codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
    prefix_limit = [codes1.length, 4].min
    prefix = 0
    prefix += 1 while prefix < prefix_limit && codes1[prefix] == codes2[prefix]

    jaro + prefix * options[:weight] * (1 - jaro)
  end

  # Jaro distance on codepoint arrays. With :ignore_case the arrays are
  # upcased in place (ASCII a-z only). Matched positions are tracked as bit
  # masks held in plain integers.
  def _jaro_distance codes1, codes2, options={}
    options = DEFAULT_OPTIONS[:jaro].merge options

    codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
    len1, len2 = codes1.length, codes2.length
    return 0.0 if len1 == 0 || len2 == 0

    if options[:ignore_case]
      codes1.map!{ |c| c.between?(97, 122) ? c - 32 : c }
      codes2.map!{ |c| c.between?(97, 122) ? c - 32 : c }
    end

    window = [len2 / 2 - 1, 0].max
    flags1 = 0
    flags2 = 0

    # count number of matching characters
    match_count = 0
    i = 0
    while i < len1
      left = [i - window, 0].max
      right = [i + window, len2 - 1].min
      j = left
      while j <= right
        if flags2[j] == 0 && codes1[i] == codes2[j]
          flags1 |= (1 << i)
          flags2 |= (1 << j)
          match_count += 1
          break
        end
        j += 1
      end
      i += 1
    end

    return 0.0 if match_count == 0

    # count number of transpositions
    transposition_count = 0
    j = 0
    k = 0
    i = 0
    while i < len1
      if flags1[i] == 1
        # advance j to the next matched position of the longer array
        j = k
        while j < len2
          if flags2[j] == 1
            k = j + 1
            break
          end
          j += 1
        end
        transposition_count += 1 if codes1[i] != codes2[j]
      end
      i += 1
    end

    # count similarities in nonmatched characters
    similar_count = 0
    if options[:adj_table] && len1 > match_count
      i = 0
      while i < len1
        if flags1[i] == 0
          j = 0
          while j < len2
            if flags2[j] == 0
              if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
                similar_count += 3
                break
              end
            end
            j += 1
          end
        end
        i += 1
      end
    end

    m = match_count.to_f
    t = transposition_count / 2
    m = similar_count / 10.0 + m if options[:adj_table]
    (m / len1 + m / len2 + (m - t) / m) / 3
  end

end
|
metadata
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jaro_winkler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.4.0
|
5
|
+
platform: java
|
6
|
+
authors:
|
7
|
+
- Jian Weihang
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-12-12 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
version_requirements: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.7'
|
20
|
+
requirement: !ruby/object:Gem::Requirement
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: '1.7'
|
25
|
+
prerelease: false
|
26
|
+
type: :development
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
requirement: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ~>
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '10.0'
|
39
|
+
prerelease: false
|
40
|
+
type: :development
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
43
|
+
version_requirements: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
prerelease: false
|
54
|
+
type: :development
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
version_requirements: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirement: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
prerelease: false
|
68
|
+
type: :development
|
69
|
+
description: It's an implementation of the Jaro-Winkler distance algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby. Both implementations support UTF-8 strings.
|
70
|
+
email: tonytonyjan@gmail.com
|
71
|
+
executables: []
|
72
|
+
extensions: []
|
73
|
+
extra_rdoc_files: []
|
74
|
+
files:
|
75
|
+
- ext/jaro_winkler/adj_matrix.c
|
76
|
+
- ext/jaro_winkler/adj_matrix.h
|
77
|
+
- ext/jaro_winkler/code.c
|
78
|
+
- ext/jaro_winkler/code.h
|
79
|
+
- ext/jaro_winkler/jaro.c
|
80
|
+
- ext/jaro_winkler/jaro.h
|
81
|
+
- ext/jaro_winkler/jaro_winkler.c
|
82
|
+
- ext/jaro_winkler/murmur_hash2.c
|
83
|
+
- lib/jaro_winkler.rb
|
84
|
+
- lib/jaro_winkler/adjusting_table.rb
|
85
|
+
- lib/jaro_winkler/jaro_winkler_pure.rb
|
86
|
+
- lib/jaro_winkler/version.rb
|
87
|
+
homepage: https://github.com/tonytonyjan/jaro_winkler
|
88
|
+
licenses:
|
89
|
+
- MIT
|
90
|
+
metadata: {}
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - '>='
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - '>='
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
requirements: []
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 2.4.5
|
108
|
+
signing_key:
|
109
|
+
specification_version: 4
|
110
|
+
summary: Ruby & C implementations of the Jaro-Winkler distance algorithm, both of which support UTF-8 strings.
|
111
|
+
test_files: []
|