jaro_winkler 1.4.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/jaro_winkler/adj_matrix.c +89 -0
- data/ext/jaro_winkler/adj_matrix.h +22 -0
- data/ext/jaro_winkler/code.c +29 -0
- data/ext/jaro_winkler/code.h +7 -0
- data/ext/jaro_winkler/jaro.c +122 -0
- data/ext/jaro_winkler/jaro.h +17 -0
- data/ext/jaro_winkler/jaro_winkler.c +45 -0
- data/ext/jaro_winkler/murmur_hash2.c +64 -0
- data/lib/jaro_winkler.rb +9 -0
- data/lib/jaro_winkler/adjusting_table.rb +19 -0
- data/lib/jaro_winkler/jaro_winkler_pure.rb +125 -0
- data/lib/jaro_winkler/version.rb +3 -0
- metadata +111 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 52f268c19787793ca7383fe1de1f0355e0a3e6b9
|
4
|
+
data.tar.gz: f8814b814294a7f9268a6df2ad1ad72c75146c3e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2ea65143ad847ef5cd565584c2dd1ce19908136506697eafe0579609227628a6e2bbb4baacd0d6c3ee883bcea07fff3043ae305d84c307a0e5f359dff64ab0c1
|
7
|
+
data.tar.gz: 254d25523a0654343ca5b9a552789021a30d3dc7d0c613333db7d67f3ccc41ce003c01bf00b63089b9a340a83594d40f1d7c49b2e59601885b471e68048fc23f
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#include "adj_matrix.h"
|
2
|
+
#include "code.h"
|
3
|
+
|
4
|
+
#include <stdlib.h>
|
5
|
+
|
6
|
+
// Flat list of single-character strings, read two at a time: the entry at an
// even index and the entry right after it form one pair that
// adj_matrix_default() registers in the default adjusting matrix.
const char *DEFAULT_ADJ_TABLE[] = {
  "A","E",  "A","I",  "A","O",  "A","U",
  "B","V",  "E","I",  "E","O",  "E","U",
  "I","O",  "I","U",  "O","U",  "I","Y",
  "E","Y",  "C","G",  "E","F",  "W","U",
  "W","V",  "X","K",  "S","Z",  "X","S",
  "Q","C",  "U","V",  "M","N",  "L","I",
  "Q","O",  "P","R",  "I","J",  "2","Z",
  "5","S",  "8","B",  "1","I",  "1","L",
  "0","O",  "0","Q",  "C","K",  "G","J",
  "E"," ",  "Y"," ",  "S"," "
};
|
12
|
+
|
13
|
+
extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
|
14
|
+
void node_free(Node *head);
|
15
|
+
|
16
|
+
AdjMatrix* adj_matrix_new(unsigned int length){
|
17
|
+
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
|
18
|
+
matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
|
19
|
+
matrix->table = malloc(matrix->length * sizeof(Node**));
|
20
|
+
for(int i = 0; i < matrix->length; i++){
|
21
|
+
matrix->table[i] = malloc(matrix->length * sizeof(Node*));
|
22
|
+
for (int j = 0; j < matrix->length; j++)
|
23
|
+
matrix->table[i][j] = NULL;
|
24
|
+
}
|
25
|
+
return matrix;
|
26
|
+
}
|
27
|
+
|
28
|
+
void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
|
29
|
+
unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
|
30
|
+
h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
|
31
|
+
Node *new_node = malloc(sizeof(Node)); new_node->x = h1; new_node->y = h2; new_node->next = NULL;
|
32
|
+
if(matrix->table[h1][h2] == NULL){
|
33
|
+
matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
|
34
|
+
}
|
35
|
+
else{
|
36
|
+
Node *previous = NULL;
|
37
|
+
for(Node *i = matrix->table[h1][h2]; i != NULL; i = i->next) previous = i;
|
38
|
+
previous->next = new_node;
|
39
|
+
}
|
40
|
+
}
|
41
|
+
|
42
|
+
char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
|
43
|
+
unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
|
44
|
+
h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
|
45
|
+
Node *node = matrix->table[h1][h2];
|
46
|
+
if(node == NULL) return 0;
|
47
|
+
else{
|
48
|
+
for(Node *i = node; i != NULL; i = i->next)
|
49
|
+
if((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1)) return 1;
|
50
|
+
return 0;
|
51
|
+
}
|
52
|
+
}
|
53
|
+
|
54
|
+
// Release every node of the singly linked chain starting at `head`.
// A NULL head is accepted and does nothing.
void node_free(Node *head){
  while(head != NULL){
    Node *rest = head->next;  // read the link before the node goes away
    free(head);
    head = rest;
  }
}
|
59
|
+
|
60
|
+
// Release a matrix: every chain, every row, the row array and the matrix
// itself. `matrix` must not be NULL.
void adj_matrix_free(AdjMatrix *matrix){
  Node ***rows = matrix->table;

  for(int row = 0; row < matrix->length; row++){
    for(int col = 0; col < matrix->length; col++){
      Node *chain = rows[row][col];
      if(chain == NULL) continue;

      node_free(chain);
      // Clear the mirrored cell too so the chain is not visited (and freed)
      // a second time when the loop reaches [col][row].
      rows[col][row] = NULL;
      rows[row][col] = NULL;
    }
    free(rows[row]);
  }

  free(rows);
  free(matrix);
}
|
72
|
+
|
73
|
+
AdjMatrix* adj_matrix_default(){
|
74
|
+
static char first_time = 1;
|
75
|
+
static AdjMatrix *ret_matrix;
|
76
|
+
if(first_time){
|
77
|
+
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
78
|
+
int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
|
79
|
+
for(int i = 0; i < length; i += 2){
|
80
|
+
unsigned long long code_1, code_2;
|
81
|
+
int dummy_length;
|
82
|
+
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
|
83
|
+
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
|
84
|
+
adj_matrix_add(ret_matrix, code_1, code_2);
|
85
|
+
}
|
86
|
+
first_time = 0;
|
87
|
+
}
|
88
|
+
return ret_matrix;
|
89
|
+
}
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#ifndef ADJ_MATRIX_H
#define ADJ_MATRIX_H

// Rows/columns used when adj_matrix_new() is called with length 0.
#define ADJ_MATRIX_DEFAULT_LENGTH 958
// Seed passed to MurmurHash2 when codes are mapped to matrix indices.
#define ADJ_MATRIX_SEED 9527

// One entry of a singly linked chain stored in a matrix cell.
typedef struct _node{
  struct _node *next;     // following entry, NULL at the end of the chain
  unsigned long long x;
  unsigned long long y;
} Node;

// Square table of chain heads.
typedef struct{
  Node ***table;          // table[i][j] is a chain head or NULL
  unsigned int length;    // number of rows and of columns
} AdjMatrix;

AdjMatrix* adj_matrix_new(unsigned int length);
void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y);
char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y);
void adj_matrix_free(AdjMatrix *matrix);
AdjMatrix* adj_matrix_default();

#endif
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
|
4
|
+
// Read one character starting at `str`.
//
// The byte length is chosen from the first byte alone:
//   >= 252 -> 6, >= 248 -> 5, >= 240 -> 4, >= 224 -> 3, >= 192 -> 2, else 1.
// *ret_code receives those raw bytes copied into a zeroed unsigned long long
// (a byte image of the character, not a decoded code point), and
// *ret_byte_length receives the length.
// The caller must make sure that many bytes are readable at `str`.
void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
  static const unsigned char lead_min[] = {192, 224, 240, 248, 252};
  const unsigned char lead = (unsigned char)str[0];

  int width = 1;
  for(int k = 0; k < 5 && lead >= lead_min[k]; k++)
    width = k + 2;

  *ret_byte_length = width;
  *ret_code = 0;
  memcpy(ret_code, str, width);
}
|
15
|
+
|
16
|
+
// Split the first `length` bytes of `str` into per-character codes.
//
// On return *ret_codes points to a heap array the caller must free() and
// *ret_length holds the number of codes stored in it. When `length` is not
// positive, or the array cannot be allocated, *ret_codes is NULL and
// *ret_length is 0 (free(NULL) is still safe for the caller).
//
// The array has `length` slots, which is always enough because every
// character consumes at least one byte.
void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
  enum { MAX_CHAR_BYTES = 6 };  // longest sequence utf_char_to_code may read

  *ret_codes = NULL;
  *ret_length = 0;
  if(length <= 0) return;

  unsigned long long *codes = calloc((size_t)length, sizeof *codes);
  if(codes == NULL) return;

  int count = 0;
  for(int i = 0; i < length;){
    int byte_length;
    int remaining = length - i;

    if(remaining >= MAX_CHAR_BYTES){
      utf_char_to_code(&str[i], &codes[count], &byte_length);
    }
    else{
      // Near the end of the input a lead byte may announce more bytes than
      // are left. Decode from a zero-padded copy so nothing is read past
      // str[length - 1]; for complete sequences the result is unchanged.
      char tail[MAX_CHAR_BYTES] = {0};
      memcpy(tail, &str[i], (size_t)remaining);
      utf_char_to_code(tail, &codes[count], &byte_length);
    }

    count += 1;
    i += byte_length;
  }

  *ret_codes = codes;
  *ret_length = count;
}
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#include "jaro.h"
|
2
|
+
#include "code.h"
|
3
|
+
#include "adj_matrix.h"
|
4
|
+
|
5
|
+
#include <string.h>
|
6
|
+
#include <stdlib.h>
|
7
|
+
#include <ctype.h>
|
8
|
+
|
9
|
+
// Exchange two lvalues of the same type (relies on the __typeof__ extension).
#define SWAP(x, y) do{ __typeof__(x) swap_tmp_ = (x); (x) = (y); (y) = swap_tmp_; }while(0)
|
10
|
+
|
11
|
+
double jaro_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
|
12
|
+
double jaro_winkler_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
|
13
|
+
|
14
|
+
// Jaro score of two byte strings.
//
// Each string is converted to a code array with string_to_codes(), scored by
// jaro_distance_from_codes(), and the temporary arrays are released.
// Returns 0.0 straight away when either byte length is 0.
double jaro_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
  if(short_str_len == 0 || long_str_len == 0) return 0.0;

  unsigned long long *first_codes, *second_codes;
  int first_count, second_count;
  string_to_codes(short_str, short_str_len, &first_codes, &first_count);
  string_to_codes(long_str, long_str_len, &second_codes, &second_count);

  const double score = jaro_distance_from_codes(first_codes, first_count, second_codes, second_count, opt);

  free(first_codes);
  free(second_codes);
  return score;
}
|
27
|
+
|
28
|
+
// Jaro-Winkler score of two byte strings.
//
// Each string is converted to a code array with string_to_codes(), scored by
// jaro_winkler_distance_from_codes(), and the temporary arrays are released.
// Returns 0.0 straight away when either byte length is 0.
double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
  if(short_str_len == 0 || long_str_len == 0) return 0.0;

  unsigned long long *first_codes, *second_codes;
  int first_count, second_count;
  string_to_codes(short_str, short_str_len, &first_codes, &first_count);
  string_to_codes(long_str, long_str_len, &second_codes, &second_count);

  const double score = jaro_winkler_distance_from_codes(first_codes, first_count, second_codes, second_count, opt);

  free(first_codes);
  free(second_codes);
  return score;
}
|
41
|
+
|
42
|
+
// Jaro score of two code arrays.
//
// The arguments may be given in either order; the shorter array is used as
// the "short" side internally. Returns 0.0 when either array is empty or
// when no character matches.
//
// opt->ignore_case: ASCII letters a-z are upper-cased IN PLACE in both input
//   arrays before comparing (other codes are left alone). Upper case is used
//   because the default adjusting table holds upper-case letters.
// opt->adj_table: unmatched characters found in the default adjusting table
//   each add 0.3 to the match total.
double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
  if(!short_codes_len || !long_codes_len) return 0.0;

  if(short_codes_len > long_codes_len){
    SWAP(short_codes, long_codes);
    SWAP(short_codes_len, long_codes_len);
  }

  if(opt->ignore_case){
    // Codes are not guaranteed to fit the argument range of <ctype.h>
    // functions, so fold the ASCII range by hand.
    for(int i = 0; i < short_codes_len; i++)
      if(short_codes[i] >= 'a' && short_codes[i] <= 'z') short_codes[i] -= 'a' - 'A';
    for(int i = 0; i < long_codes_len; i++)
      if(long_codes[i] >= 'a' && long_codes[i] <= 'z') long_codes[i] -= 'a' - 'A';
  }

  int window_size = long_codes_len/2 - 1;
  if(window_size < 0) window_size = 0;

  char short_codes_flag[short_codes_len];
  char long_codes_flag[long_codes_len];
  memset(short_codes_flag, 0, short_codes_len);
  memset(long_codes_flag, 0, long_codes_len);

  // count number of matching characters
  int match_count = 0;
  for(int i = 0; i < short_codes_len; i++){
    int left = (i >= window_size) ? i - window_size : 0;
    int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
    for(int j = left; j <= right; j++){
      if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
        short_codes_flag[i] = long_codes_flag[j] = 1;
        match_count++;
        break;
      }
    }
  }

  if(!match_count) return 0.0;

  // count number of transpositions: walk the matched characters of both
  // sides in order and count the positions where they differ
  int transposition_count = 0, j = 0, k = 0;
  for(int i = 0; i < short_codes_len; i++){
    if(short_codes_flag[i]){
      for(j = k; j < long_codes_len; j++){
        if(long_codes_flag[j]){
          k = j + 1;
          break;
        }
      }
      if(short_codes[i] != long_codes[j]) transposition_count++;
    }
  }

  // count similarities in nonmatched characters
  int similar_count = 0;
  if(opt->adj_table && short_codes_len > match_count){
    AdjMatrix *adj = adj_matrix_default();
    for(int i = 0; i < short_codes_len; i++){
      if(short_codes_flag[i]) continue;
      for(int u = 0; u < long_codes_len; u++){
        if(long_codes_flag[u]) continue;
        if(adj_matrix_find(adj, short_codes[i], long_codes[u])){
          similar_count += 3;
          break;
        }
      }
    }
  }

  double m = (double)match_count;
  double t = (double)(transposition_count/2);  // integer halving is intended
  if(opt->adj_table) m = similar_count/10.0 + m;
  return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
}
|
111
|
+
|
112
|
+
// Jaro-Winkler score of two code arrays.
//
// Computes the Jaro score first. When it is below opt->threshold it is
// returned unchanged; otherwise it is raised by
//   prefix * opt->weight * (1 - jaro)
// where `prefix` is the length of the common prefix, capped at 4.
//
// The arrays may be passed in either order, so the prefix scan is limited by
// the length of the SHORTER array; limiting it by the first argument only
// would read past the end of the second one when the first is longer.
double jaro_winkler_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
  double jaro = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);

  if(jaro < opt->threshold) return jaro;

  int prefix_limit = short_codes_len < long_codes_len ? short_codes_len : long_codes_len;
  if(prefix_limit > 4) prefix_limit = 4;

  int prefix = 0;
  while(prefix < prefix_limit && short_codes[prefix] == long_codes[prefix]) prefix++;

  return jaro + prefix*opt->weight*(1-jaro);
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef LIBJARO_JARO_H
#define LIBJARO_JARO_H

#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7

// Tuning knobs shared by jaro_distance() and jaro_winkler_distance().
typedef struct LibJaroOption{
  double weight;     // multiplier of the common-prefix bonus
  double threshold;  // Jaro scores below this get no prefix bonus
  char ignore_case;  // non-zero: fold letter case before comparing
  char adj_table;    // non-zero: credit pairs from the default adjusting table
} LibJaroOption;

// Defaults; copy and modify as needed.
static const LibJaroOption DEFAULT_OPT = {
  .weight = DEFAULT_WEIGHT,
  .threshold = DEFAULT_THRESHOLD,
  .ignore_case = 0,
  .adj_table = 0
};

double jaro_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);

#endif
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "jaro.h"
|
3
|
+
|
4
|
+
VALUE rb_mJaroWinkler,
|
5
|
+
rb_eError,
|
6
|
+
rb_eInvalidWeightError;
|
7
|
+
|
8
|
+
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
|
9
|
+
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
|
10
|
+
VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt));
|
11
|
+
|
12
|
+
void Init_jaro_winkler_ext(void){
|
13
|
+
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
14
|
+
rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
|
15
|
+
rb_eInvalidWeightError = rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
|
16
|
+
rb_define_module_function(rb_mJaroWinkler, "distance", rb_jaro_winkler_distance, -1);
|
17
|
+
rb_define_module_function(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance, -1);
|
18
|
+
}
|
19
|
+
|
20
|
+
|
21
|
+
// Shared implementation of the Ruby-visible distance functions.
//
// Ruby signature: (s1, s2, weight: nil, threshold: nil, ignore_case: nil,
// adj_table: nil). Options that are absent or nil keep the DEFAULT_OPT value.
// distance_fn is the C routine that computes the score.
//
// Raises JaroWinkler::InvalidWeightError when the weight exceeds 0.25,
// TypeError when s1/s2 cannot be converted to String, and ArgumentError when
// a string's byte length does not fit in an int.
// Returns the score as a Ruby Float.
VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt)){
  VALUE s1, s2, opt;
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);

  LibJaroOption c_opt = DEFAULT_OPT;
  if(TYPE(opt) == T_HASH){
    VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
          threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
          ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
          adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
    if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
    if(c_opt.weight > 0.25) rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
    if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
    if(!NIL_P(ignore_case)) c_opt.ignore_case = RTEST(ignore_case) ? 1 : 0;
    if(!NIL_P(adj_table)) c_opt.adj_table = RTEST(adj_table) ? 1 : 0;
  }

  // Convert/validate both arguments BEFORE touching their pointer or length.
  // Function arguments are evaluated in an unspecified order, so reading
  // RSTRING_LEN(s) in the same call as StringValuePtr(s) could read the
  // length of an object that has not been checked to be a String yet.
  StringValue(s1);
  StringValue(s2);

  long len1 = RSTRING_LEN(s1),
       len2 = RSTRING_LEN(s2);
  // The C routines take int lengths; refuse anything that would be narrowed.
  if(len1 != (long)(int)len1 || len2 != (long)(int)len2)
    rb_raise(rb_eArgError, "string is too long");

  return rb_float_new((*distance_fn)(RSTRING_PTR(s1), (int)len1, RSTRING_PTR(s2), (int)len2, &c_opt));
}
|
38
|
+
|
39
|
+
// JaroWinkler.jaro_distance: forwards to distance() with the plain Jaro
// scoring routine.
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self){
  return distance(argc, argv, self, &jaro_distance);
}
|
42
|
+
|
43
|
+
// JaroWinkler.distance: forwards to distance() with the Jaro-Winkler scoring
// routine.
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self){
  return distance(argc, argv, self, &jaro_winkler_distance);
}
|
@@ -0,0 +1,64 @@
|
|
1
|
+
//-----------------------------------------------------------------------------
// MurmurHash2, by Austin Appleby

// Note - This code makes an assumption about how your machine behaves -

// 1. sizeof(int) == 4

// (Input words are loaded with memcpy, so `key` may have any alignment.)

// And it has a few limitations -

// 1. It will not work incrementally.
// 2. It will not produce the same results on little-endian and big-endian
//    machines.

#include <string.h>

// Hash the `len` bytes at `key`, starting from `seed`.
// Returns the 32-bit hash value.
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.

  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value

  unsigned int h = seed ^ len;

  // Mix 4 bytes at a time into the hash

  const unsigned char * data = (const unsigned char *)key;

  while(len >= 4)
  {
    // Load the word in native byte order without casting the byte pointer:
    // a cast-and-dereference is undefined for misaligned addresses and
    // breaks the aliasing rules.
    unsigned int k;
    memcpy(&k, data, sizeof k);

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array

  switch(len)
  {
  case 3: h ^= (unsigned int)data[2] << 16;
          /* fallthrough */
  case 2: h ^= (unsigned int)data[1] << 8;
          /* fallthrough */
  case 1: h ^= data[0];
          h *= m;
          break;
  default:
          break;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.

  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}
|
data/lib/jaro_winkler.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module JaroWinkler
  # Two-level lookup of character pairs: DEFAULT_ADJ_TABLE[a][b] is true when
  # (a, b) is one of the pairs below, in either order. Characters that appear
  # in no pair map to an empty Hash, so a chained lookup yields nil.
  DEFAULT_ADJ_TABLE = Hash.new

  [
    ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'],
    ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'], ['O', 'U'], ['I', 'Y'],
    ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
    ['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'],
    ['Q', 'O'], ['P', 'R'], ['I', 'J'], ['2', 'Z'], ['5', 'S'], ['8', 'B'],
    ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
    ['E', ' '], ['Y', ' '], ['S', ' ']
  ].each do |first, second|
    DEFAULT_ADJ_TABLE[first]  ||= Hash.new
    DEFAULT_ADJ_TABLE[second] ||= Hash.new
    # Store the pair under both orders.
    DEFAULT_ADJ_TABLE[second][first] = true
    DEFAULT_ADJ_TABLE[first][second] = true
  end

  # Set only after the table is filled, so the fill above sees missing keys.
  DEFAULT_ADJ_TABLE.default = Hash.new
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'jaro_winkler/adjusting_table'
|
2
|
+
module JaroWinkler
  class Error < RuntimeError; end
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: {adj_table: false, ignore_case: false},
    jaro_winkler: {weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD}
  }

  module_function

  # Jaro-Winkler score of two strings (compared by code point).
  # Options: :weight, :threshold, :ignore_case, :adj_table.
  def distance(str1, str2, options = {})
    _distance(str1.codepoints.to_a, str2.codepoints.to_a, options)
  end

  # Plain Jaro score of two strings (compared by code point).
  # Options: :ignore_case, :adj_table.
  def jaro_distance(str1, str2, options = {})
    _jaro_distance(str1.codepoints.to_a, str2.codepoints.to_a, options)
  end

  # Jaro-Winkler score of two code point arrays.
  # Raises InvalidWeightError when :weight is above 0.25.
  # A Jaro score below :threshold is returned as is; otherwise it is raised
  # by prefix * weight * (1 - jaro), with the common prefix capped at 4.
  def _distance(codes1, codes2, options = {})
    options = DEFAULT_OPTIONS[:jaro_winkler].merge(options)
    raise InvalidWeightError if options[:weight] > 0.25

    jaro = _jaro_distance(codes1, codes2, options)
    return jaro if jaro < options[:threshold]

    shorter, longer = codes1.length > codes2.length ? [codes2, codes1] : [codes1, codes2]
    limit = [shorter.length, 4].min
    prefix = 0
    prefix += 1 while prefix < limit && shorter[prefix] == longer[prefix]

    jaro + prefix * options[:weight] * (1 - jaro)
  end

  # Jaro score of two code point arrays; 0.0 when either is empty or nothing
  # matches. With :ignore_case the arrays are modified in place (a-z become
  # A-Z). With :adj_table every unmatched character of the shorter array that
  # pairs with an unmatched character of the longer one in DEFAULT_ADJ_TABLE
  # adds 0.3 to the match total.
  def _jaro_distance(codes1, codes2, options = {})
    options = DEFAULT_OPTIONS[:jaro].merge(options)

    # codes1 is the shorter side from here on.
    codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
    len1 = codes1.length
    len2 = codes2.length
    return 0.0 if len1 == 0 || len2 == 0

    if options[:ignore_case]
      [codes1, codes2].each do |codes|
        codes.map! { |c| c >= 97 && c <= 122 ? c - 32 : c }
      end
    end

    window = [len2 / 2 - 1, 0].max
    matched1 = Array.new(len1, false)
    matched2 = Array.new(len2, false)

    # count number of matching characters
    match_count = 0
    len1.times do |i|
      left = [i - window, 0].max
      right = [i + window, len2 - 1].min
      (left..right).each do |j|
        next unless !matched2[j] && codes1[i] == codes2[j]
        matched1[i] = true
        matched2[j] = true
        match_count += 1
        break
      end
    end

    return 0.0 if match_count == 0

    # count number of transpositions: pair up matched characters of both
    # sides in order and count the pairs that differ
    transposition_count = 0
    cursor = 0
    len1.times do |i|
      next unless matched1[i]
      j = cursor
      j += 1 while j < len2 && !matched2[j]
      cursor = j + 1 if j < len2
      transposition_count += 1 if codes1[i] != codes2[j]
    end

    # count similarities in nonmatched characters
    similar_count = 0
    if options[:adj_table] && len1 > match_count
      len1.times do |i|
        next if matched1[i]
        len2.times do |j|
          next if matched2[j]
          next unless DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
          similar_count += 3
          break
        end
      end
    end

    m = match_count.to_f
    t = transposition_count / 2 # integer division, as in the C extension
    m = similar_count / 10.0 + m if options[:adj_table]
    (m / len1 + m / len2 + (m - t) / m) / 3
  end
end
|
metadata
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jaro_winkler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.4.0
|
5
|
+
platform: java
|
6
|
+
authors:
|
7
|
+
- Jian Weihang
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-12-12 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
version_requirements: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.7'
|
20
|
+
requirement: !ruby/object:Gem::Requirement
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: '1.7'
|
25
|
+
prerelease: false
|
26
|
+
type: :development
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
requirement: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ~>
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '10.0'
|
39
|
+
prerelease: false
|
40
|
+
type: :development
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
43
|
+
version_requirements: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
prerelease: false
|
54
|
+
type: :development
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
version_requirements: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirement: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
prerelease: false
|
68
|
+
type: :development
|
69
|
+
description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.
|
70
|
+
email: tonytonyjan@gmail.com
|
71
|
+
executables: []
|
72
|
+
extensions: []
|
73
|
+
extra_rdoc_files: []
|
74
|
+
files:
|
75
|
+
- ext/jaro_winkler/adj_matrix.c
|
76
|
+
- ext/jaro_winkler/adj_matrix.h
|
77
|
+
- ext/jaro_winkler/code.c
|
78
|
+
- ext/jaro_winkler/code.h
|
79
|
+
- ext/jaro_winkler/jaro.c
|
80
|
+
- ext/jaro_winkler/jaro.h
|
81
|
+
- ext/jaro_winkler/jaro_winkler.c
|
82
|
+
- ext/jaro_winkler/murmur_hash2.c
|
83
|
+
- lib/jaro_winkler.rb
|
84
|
+
- lib/jaro_winkler/adjusting_table.rb
|
85
|
+
- lib/jaro_winkler/jaro_winkler_pure.rb
|
86
|
+
- lib/jaro_winkler/version.rb
|
87
|
+
homepage: https://github.com/tonytonyjan/jaro_winkler
|
88
|
+
licenses:
|
89
|
+
- MIT
|
90
|
+
metadata: {}
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - '>='
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - '>='
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
requirements: []
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 2.4.5
|
108
|
+
signing_key:
|
109
|
+
specification_version: 4
|
110
|
+
summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support UTF-8 string.
|
111
|
+
test_files: []
|