jaro_winkler 1.4.0-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 52f268c19787793ca7383fe1de1f0355e0a3e6b9
4
+ data.tar.gz: f8814b814294a7f9268a6df2ad1ad72c75146c3e
5
+ SHA512:
6
+ metadata.gz: 2ea65143ad847ef5cd565584c2dd1ce19908136506697eafe0579609227628a6e2bbb4baacd0d6c3ee883bcea07fff3043ae305d84c307a0e5f359dff64ab0c1
7
+ data.tar.gz: 254d25523a0654343ca5b9a552789021a30d3dc7d0c613333db7d67f3ccc41ce003c01bf00b63089b9a340a83594d40f1d7c49b2e59601885b471e68048fc23f
@@ -0,0 +1,89 @@
1
+ #include "adj_matrix.h"
2
+ #include "code.h"
3
+
4
+ #include <stdlib.h>
5
+
6
+ const char *DEFAULT_ADJ_TABLE[] = { // flat list of character pairs; adj_matrix_default() reads entries two at a time (index i and i+1 form one pair)
7
+ "A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
8
+ "I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
9
+ "M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
10
+ "0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
11
+ };
12
+
13
+ extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
14
+ void node_free(Node *head);
15
+
16
+ AdjMatrix* adj_matrix_new(unsigned int length){
17
+ AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
18
+ matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
19
+ matrix->table = malloc(matrix->length * sizeof(Node**));
20
+ for(int i = 0; i < matrix->length; i++){
21
+ matrix->table[i] = malloc(matrix->length * sizeof(Node*));
22
+ for (int j = 0; j < matrix->length; j++)
23
+ matrix->table[i][j] = NULL;
24
+ }
25
+ return matrix;
26
+ }
27
+
28
+ void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
29
+ unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
30
+ h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
31
+ Node *new_node = malloc(sizeof(Node)); new_node->x = h1; new_node->y = h2; new_node->next = NULL;
32
+ if(matrix->table[h1][h2] == NULL){
33
+ matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
34
+ }
35
+ else{
36
+ Node *previous = NULL;
37
+ for(Node *i = matrix->table[h1][h2]; i != NULL; i = i->next) previous = i;
38
+ previous->next = new_node;
39
+ }
40
+ }
41
+
42
+ char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
43
+ unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
44
+ h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
45
+ Node *node = matrix->table[h1][h2];
46
+ if(node == NULL) return 0;
47
+ else{
48
+ for(Node *i = node; i != NULL; i = i->next)
49
+ if((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1)) return 1;
50
+ return 0;
51
+ }
52
+ }
53
+
54
+ void node_free(Node *head){
55
+ if(head == NULL) return;
56
+ node_free(head->next);
57
+ free(head);
58
+ }
59
+
60
+ void adj_matrix_free(AdjMatrix *matrix){
61
+ for(int i = 0; i < matrix->length; i++){
62
+ for(int j = 0; j < matrix->length; j++)
63
+ if(matrix->table[i][j] != NULL){
64
+ node_free(matrix->table[i][j]);
65
+ matrix->table[i][j] = matrix->table[j][i] = NULL;
66
+ }
67
+ free(matrix->table[i]);
68
+ }
69
+ free(matrix->table);
70
+ free(matrix);
71
+ }
72
+
73
+ AdjMatrix* adj_matrix_default(){
74
+ static char first_time = 1;
75
+ static AdjMatrix *ret_matrix;
76
+ if(first_time){
77
+ ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
78
+ int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
79
+ for(int i = 0; i < length; i += 2){
80
+ unsigned long long code_1, code_2;
81
+ int dummy_length;
82
+ utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
83
+ utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
84
+ adj_matrix_add(ret_matrix, code_1, code_2);
85
+ }
86
+ first_time = 0;
87
+ }
88
+ return ret_matrix;
89
+ }
@@ -0,0 +1,22 @@
1
+ #ifndef ADJ_MATRIX_H
2
+ #define ADJ_MATRIX_H
3
+ #define ADJ_MATRIX_DEFAULT_LENGTH 958
4
+ #define ADJ_MATRIX_SEED 9527
5
+
6
+ typedef struct _node{ // singly linked list node kept in one AdjMatrix cell
7
+ struct _node *next; // next node of the same chain, NULL at the tail
8
+ unsigned long long x, y; // the two members of the pair as recorded by adj_matrix_add()
9
+ } Node;
10
+
11
+ typedef struct{ // square table of Node chains, created by adj_matrix_new()
12
+ Node ***table; // length x length grid of chain heads; NULL marks an empty cell
13
+ unsigned int length; // number of rows and of columns in table
14
+ } AdjMatrix;
15
+
16
+ AdjMatrix* adj_matrix_new (unsigned int length);
17
+ void adj_matrix_add (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
18
+ char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
19
+ void adj_matrix_free (AdjMatrix *matrix);
20
+ AdjMatrix* adj_matrix_default();
21
+
22
+ #endif
@@ -0,0 +1,29 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+
4
/*
 * Pack the UTF-8 sequence starting at str into an integer code.
 *
 * str:             first byte of the sequence.
 * ret_code:        receives the raw bytes of the sequence copied into a zeroed
 *                  unsigned long long (so the numeric value depends on the
 *                  machine's byte order; it is only meant for equality tests).
 * ret_byte_length: receives the number of bytes consumed (at least 1).
 *
 * The lead byte announces how many bytes the sequence should have, but only
 * bytes that really are continuation bytes (10xxxxxx) are consumed. A
 * truncated or malformed sequence therefore stops early instead of reading
 * past the string terminator.
 */
void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
  const unsigned char *bytes = (const unsigned char *)str;
  unsigned char first_char = bytes[0];
  int announced_length;
  if(first_char >= 252) announced_length = 6;      // 1111110x
  else if(first_char >= 248) announced_length = 5; // 111110xx
  else if(first_char >= 240) announced_length = 4; // 11110xxx
  else if(first_char >= 224) announced_length = 3; // 1110xxxx
  else if(first_char >= 192) announced_length = 2; // 110xxxxx
  else announced_length = 1;

  int byte_length = 1;
  while(byte_length < announced_length && (bytes[byte_length] & 0xC0) == 0x80)
    byte_length++;

  *ret_byte_length = byte_length;
  *ret_code = 0;
  memcpy(ret_code, str, (size_t)byte_length);
}
15
+
16
/*
 * Convert a UTF-8 byte string into an array of per-character codes.
 *
 * str, length: input bytes and their count (no terminator is required).
 * ret_codes:   receives a calloc'ed array owned by the caller (free() it).
 *              It has room for `length` codes, of which the first
 *              *ret_length are filled. Set to NULL if allocation fails.
 * ret_length:  receives the number of characters decoded (0 on failure).
 *
 * The decoder is never allowed to look beyond str + length: the last few
 * bytes are decoded from a zero-padded scratch copy, so a sequence cut off
 * at the end of the input yields a code made of the bytes that are present.
 */
void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
  enum { MAX_SEQUENCE_BYTES = 6 }; // longest sequence utf_char_to_code may read

  *ret_length = 0;
  *ret_codes = calloc(length > 0 ? (size_t)length : 1, sizeof **ret_codes);
  if(*ret_codes == NULL) return;

  for(int i = 0; i < length;){
    int byte_length;
    int remaining = length - i;
    if(remaining >= MAX_SEQUENCE_BYTES){
      utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
    }
    else{
      char tail[MAX_SEQUENCE_BYTES + 2] = {0};
      memcpy(tail, &str[i], (size_t)remaining);
      utf_char_to_code(tail, &(*ret_codes)[*ret_length], &byte_length);
      if(byte_length > remaining) byte_length = remaining;
    }
    *ret_length += 1;
    i += byte_length;
  }
}
@@ -0,0 +1,7 @@
1
+ #ifndef CODE_H
2
+ #define CODE_H
3
+
4
+ void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length);
5
+ void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length);
6
+
7
+ #endif
@@ -0,0 +1,122 @@
1
+ #include "jaro.h"
2
+ #include "code.h"
3
+ #include "adj_matrix.h"
4
+
5
+ #include <string.h>
6
+ #include <stdlib.h>
7
+ #include <ctype.h>
8
+
9
+ #define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
10
+
11
+ double jaro_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
12
+ double jaro_winkler_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
13
+
14
+ double jaro_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
15
+ if(!short_str_len || !long_str_len) return 0.0;
16
+
17
+ unsigned long long *short_codes, *long_codes;
18
+ int short_codes_len, long_codes_len;
19
+ string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
20
+ string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
21
+
22
+ double ret = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
23
+
24
+ free(short_codes); free(long_codes);
25
+ return ret;
26
+ }
27
+
28
+ double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
29
+ if(!short_str_len || !long_str_len) return 0.0;
30
+
31
+ unsigned long long *short_codes, *long_codes;
32
+ int short_codes_len, long_codes_len;
33
+ string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
34
+ string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
35
+
36
+ double ret = jaro_winkler_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
37
+
38
+ free(short_codes); free(long_codes);
39
+ return ret;
40
+ }
41
+
42
+ double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
43
+ if(!short_codes_len || !long_codes_len) return 0.0;
44
+
45
+ if(short_codes_len > long_codes_len){
46
+ SWAP(short_codes, long_codes);
47
+ SWAP(short_codes_len, long_codes_len);
48
+ }
49
+
50
+ if(opt->ignore_case){
51
+ for(int i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
52
+ for(int i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
53
+ }
54
+
55
+ int window_size = long_codes_len/2 - 1;
56
+ if(window_size < 0) window_size = 0;
57
+
58
+ char short_codes_flag[short_codes_len];
59
+ char long_codes_flag[long_codes_len];
60
+ memset(short_codes_flag, 0, short_codes_len);
61
+ memset(long_codes_flag, 0, long_codes_len);
62
+
63
+ // count number of matching characters
64
+ int match_count = 0;
65
+ for(int i = 0; i < short_codes_len; i++){
66
+ int left = (i >= window_size) ? i - window_size : 0;
67
+ int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
68
+ if(right > long_codes_len - 1) right = long_codes_len - 1;
69
+ for(int j = left; j <= right; j++){
70
+ if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
71
+ short_codes_flag[i] = long_codes_flag[j] = 1;
72
+ match_count++;
73
+ break;
74
+ }
75
+ }
76
+ }
77
+
78
+ if(!match_count) return 0.0;
79
+
80
+ // count number of transpositions
81
+ int transposition_count = 0, j = 0, k = 0;
82
+ for(int i = 0; i < short_codes_len; i++){
83
+ if(short_codes_flag[i]){
84
+ for(j = k; j < long_codes_len; j++){
85
+ if(long_codes_flag[j]){
86
+ k = j + 1;
87
+ break;
88
+ }
89
+ }
90
+ if(short_codes[i] != long_codes[j]) transposition_count++;
91
+ }
92
+ }
93
+
94
+ // count similarities in nonmatched characters
95
+ int similar_count = 0;
96
+ if(opt->adj_table && short_codes_len > match_count)
97
+ for(int i = 0; i < short_codes_len; i++)
98
+ if(!short_codes_flag[i])
99
+ for(int j = 0; j < long_codes_len; j++)
100
+ if(!long_codes_flag[j])
101
+ if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
102
+ similar_count += 3;
103
+ break;
104
+ }
105
+
106
+ double m = (double)match_count;
107
+ double t = (double)(transposition_count/2);
108
+ if(opt->adj_table) m = similar_count/10.0 + m;
109
+ return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
110
+ }
111
+
112
+ double jaro_winkler_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
113
+ double jaro_distance = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
114
+
115
+ if(jaro_distance < opt->threshold) return jaro_distance;
116
+ else{
117
+ int prefix = 0;
118
+ int max_4 = short_codes_len > 4 ? 4 : short_codes_len;
119
+ for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
120
+ return jaro_distance + prefix*opt->weight*(1-jaro_distance);
121
+ }
122
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef LIBJARO_JARO_H
2
+ #define LIBJARO_JARO_H
3
+
4
+ #define DEFAULT_WEIGHT 0.1
5
+ #define DEFAULT_THRESHOLD 0.7
6
+
7
+ typedef struct LibJaroOption{ // options passed to the distance functions
8
+ double weight, threshold; // weight: per-character prefix bonus factor; threshold: Jaro score below which no prefix bonus is applied
9
+ char ignore_case, adj_table; // non-zero enables case-insensitive comparison / adjacency-matrix similarity credit
10
+ } LibJaroOption;
11
+
12
+
13
+ static const LibJaroOption DEFAULT_OPT = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
14
+ double jaro_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
15
+ double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
16
+
17
+ #endif
@@ -0,0 +1,45 @@
1
+ #include "ruby.h"
2
+ #include "jaro.h"
3
+
4
+ VALUE rb_mJaroWinkler,
5
+ rb_eError,
6
+ rb_eInvalidWeightError;
7
+
8
+ VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
9
+ VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
10
+ VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt));
11
+
12
+ void Init_jaro_winkler_ext(void){
13
+ rb_mJaroWinkler = rb_define_module("JaroWinkler");
14
+ rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
15
+ rb_eInvalidWeightError = rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
16
+ rb_define_module_function(rb_mJaroWinkler, "distance", rb_jaro_winkler_distance, -1);
17
+ rb_define_module_function(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance, -1);
18
+ }
19
+
20
+
21
+ VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt)){
22
+ VALUE s1, s2, opt;
23
+ rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
24
+ LibJaroOption c_opt = DEFAULT_OPT;
25
+ if(TYPE(opt) == T_HASH){
26
+ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
27
+ threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
28
+ ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
29
+ adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
30
+ if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
31
+ if(c_opt.weight > 0.25) rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
32
+ if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
33
+ if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
34
+ if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
35
+ }
36
+ return rb_float_new((*distance_fn)(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
37
+ }
38
+
39
+ VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self){
40
+ return distance(argc, argv, self, jaro_distance);
41
+ }
42
+
43
+ VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self){
44
+ return distance(argc, argv, self, jaro_winkler_distance);
45
+ }
@@ -0,0 +1,64 @@
1
//-----------------------------------------------------------------------------
// MurmurHash2, by Austin Appleby
//
// Assumption: unsigned int is 32 bits wide.
//
// Limitation: it will not work incrementally.
//
// The input is read one byte at a time and assembled into little-endian
// 32-bit words. This avoids unaligned and type-punned loads, so `key` may
// point at any address. On little-endian machines the result is identical to
// the classic word-load implementation; the result is now the same for the
// same bytes on every machine.
//
// key:  bytes to hash
// len:  number of bytes at key
// seed: initial hash state
// Returns the 32-bit hash value.

unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.

  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value

  unsigned int h = seed ^ len;

  // Mix 4 bytes at a time into the hash

  const unsigned char * data = (const unsigned char *)key;

  while(len >= 4)
  {
    unsigned int k = (unsigned int)data[0]
                   | ((unsigned int)data[1] << 8)
                   | ((unsigned int)data[2] << 16)
                   | ((unsigned int)data[3] << 24);

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array

  switch(len)
  {
  case 3: h ^= (unsigned int)data[2] << 16; /* fallthrough */
  case 2: h ^= (unsigned int)data[1] << 8;  /* fallthrough */
  case 1: h ^= data[0];
          h *= m;
          break;
  default: break;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.

  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}
@@ -0,0 +1,9 @@
1
require 'jaro_winkler/version'

# Load the pure Ruby implementation on the 'java' platform and the compiled
# extension everywhere else.
if RUBY_PLATFORM == 'java'
  require 'jaro_winkler/jaro_winkler_pure'
else
  require 'jaro_winkler/jaro_winkler_ext'
end
9
+
@@ -0,0 +1,19 @@
1
module JaroWinkler
  # Symmetric two-level lookup: DEFAULT_ADJ_TABLE[a][b] is true when the
  # characters a and b form one of the pairs listed below (in either order).
  # Looking up a character that is in no pair returns an empty hash, so the
  # second lookup simply returns nil.
  DEFAULT_ADJ_TABLE = Hash.new
  [
    ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'],
    ['I', 'U'], ['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
    ['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'],
    ['2', 'Z'], ['5', 'S'], ['8', 'B'], ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
    ['E', ' '], ['Y', ' '], ['S', ' ']
  ].each do |first, second|
    DEFAULT_ADJ_TABLE[first] = Hash.new unless DEFAULT_ADJ_TABLE.key?(first)
    DEFAULT_ADJ_TABLE[second] = Hash.new unless DEFAULT_ADJ_TABLE.key?(second)
    DEFAULT_ADJ_TABLE[first][second] = true
    DEFAULT_ADJ_TABLE[second][first] = true
  end
  DEFAULT_ADJ_TABLE.default = Hash.new
end
@@ -0,0 +1,125 @@
1
+ require 'jaro_winkler/adjusting_table'
2
module JaroWinkler
  class Error < RuntimeError; end
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: {adj_table: false, ignore_case: false},
    jaro_winkler: {weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD}
  }

  module_function

  # Jaro-Winkler similarity of two strings.
  # Options: :weight, :threshold, :ignore_case, :adj_table.
  def distance(str1, str2, options = {})
    _distance(str1.codepoints.to_a, str2.codepoints.to_a, options)
  end

  # Plain Jaro similarity of two strings.
  # Options: :ignore_case, :adj_table.
  def jaro_distance(str1, str2, options = {})
    _jaro_distance(str1.codepoints.to_a, str2.codepoints.to_a, options)
  end

  # Jaro-Winkler similarity of two codepoint arrays.
  # Raises InvalidWeightError when the weight is larger than 0.25.
  def _distance(codes1, codes2, options = {})
    options = DEFAULT_OPTIONS[:jaro_winkler].merge(options)
    raise InvalidWeightError if options[:weight] > 0.25

    score = _jaro_distance(codes1, codes2, options)
    return score if score < options[:threshold]

    # Count the common prefix (at most 4 characters, bounded by the shorter array).
    codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
    limit = codes1.length > 4 ? 4 : codes1.length
    prefix = 0
    prefix += 1 while prefix < limit && codes1[prefix] == codes2[prefix]

    score + prefix * options[:weight] * (1 - score)
  end

  # Jaro similarity of two codepoint arrays.
  # With :ignore_case the arrays are modified in place (a-z become A-Z).
  def _jaro_distance(codes1, codes2, options = {})
    options = DEFAULT_OPTIONS[:jaro].merge(options)

    # codes1 is always the shorter array from here on.
    codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
    len1 = codes1.length
    len2 = codes2.length
    return 0.0 if len1 == 0 || len2 == 0

    if options[:ignore_case]
      codes1.map! { |c| (c >= 97 && c <= 122) ? c - 32 : c }
      codes2.map! { |c| (c >= 97 && c <= 122) ? c - 32 : c }
    end

    window = len2 / 2 - 1
    window = 0 if window < 0

    # Bit i of flags1 / flags2 is set once character i has been matched.
    flags1 = 0
    flags2 = 0

    # count number of matching characters
    match_count = 0
    len1.times do |i|
      left = (i >= window) ? i - window : 0
      right = (i + window <= len2 - 1) ? (i + window) : (len2 - 1)
      (left..right).each do |j|
        next unless flags2[j] == 0 && codes1[i] == codes2[j]
        flags1 |= (1 << i)
        flags2 |= (1 << j)
        match_count += 1
        break
      end
    end

    return 0.0 if match_count == 0

    # count number of transpositions
    transposition_count = 0
    k = 0
    len1.times do |i|
      next unless flags1[i] == 1
      j = k
      j += 1 until j >= len2 || flags2[j] == 1
      k = j + 1 if j < len2
      transposition_count += 1 if codes1[i] != codes2[j]
    end

    # count similarities in nonmatched characters
    similar_count = 0
    if options[:adj_table] && len1 > match_count
      len1.times do |i|
        next unless flags1[i] == 0
        similar = (0...len2).any? do |j|
          flags2[j] == 0 &&
            DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
        end
        similar_count += 3 if similar
      end
    end

    m = match_count.to_f
    t = transposition_count / 2
    m = similar_count / 10.0 + m if options[:adj_table]
    (m / len1 + m / len2 + (m - t) / m) / 3
  end

end
@@ -0,0 +1,3 @@
1
module JaroWinkler
  # Gem version string.
  VERSION = '1.4.0'
end
metadata ADDED
@@ -0,0 +1,111 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jaro_winkler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.4.0
5
+ platform: java
6
+ authors:
7
+ - Jian Weihang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ requirement: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: '1.7'
25
+ prerelease: false
26
+ type: :development
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ~>
37
+ - !ruby/object:Gem::Version
38
+ version: '10.0'
39
+ prerelease: false
40
+ type: :development
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ version_requirements: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ prerelease: false
54
+ type: :development
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ version_requirements: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ prerelease: false
68
+ type: :development
69
+ description: It's an implementation of the Jaro-Winkler distance algorithm. It uses a C extension and will fall back to a pure Ruby version in JRuby. Both implementations support UTF-8 strings.
70
+ email: tonytonyjan@gmail.com
71
+ executables: []
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - ext/jaro_winkler/adj_matrix.c
76
+ - ext/jaro_winkler/adj_matrix.h
77
+ - ext/jaro_winkler/code.c
78
+ - ext/jaro_winkler/code.h
79
+ - ext/jaro_winkler/jaro.c
80
+ - ext/jaro_winkler/jaro.h
81
+ - ext/jaro_winkler/jaro_winkler.c
82
+ - ext/jaro_winkler/murmur_hash2.c
83
+ - lib/jaro_winkler.rb
84
+ - lib/jaro_winkler/adjusting_table.rb
85
+ - lib/jaro_winkler/jaro_winkler_pure.rb
86
+ - lib/jaro_winkler/version.rb
87
+ homepage: https://github.com/tonytonyjan/jaro_winkler
88
+ licenses:
89
+ - MIT
90
+ metadata: {}
91
+ post_install_message:
92
+ rdoc_options: []
93
+ require_paths:
94
+ - lib
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ required_rubygems_version: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - '>='
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ requirements: []
106
+ rubyforge_project:
107
+ rubygems_version: 2.4.5
108
+ signing_key:
109
+ specification_version: 4
110
+ summary: Ruby & C implementations of the Jaro-Winkler distance algorithm, both of which support UTF-8 strings.
111
+ test_files: []