jaro_winkler 1.4.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 52f268c19787793ca7383fe1de1f0355e0a3e6b9
4
+ data.tar.gz: f8814b814294a7f9268a6df2ad1ad72c75146c3e
5
+ SHA512:
6
+ metadata.gz: 2ea65143ad847ef5cd565584c2dd1ce19908136506697eafe0579609227628a6e2bbb4baacd0d6c3ee883bcea07fff3043ae305d84c307a0e5f359dff64ab0c1
7
+ data.tar.gz: 254d25523a0654343ca5b9a552789021a30d3dc7d0c613333db7d67f3ccc41ce003c01bf00b63089b9a340a83594d40f1d7c49b2e59601885b471e68048fc23f
@@ -0,0 +1,89 @@
1
+ #include "adj_matrix.h"
2
+ #include "code.h"
3
+
4
+ #include <stdlib.h>
5
+
6
/*
 * Default adjustment table, stored as a flat list: entries 2k and 2k+1 form
 * one pair of characters. adj_matrix_default() walks the list two entries at
 * a time and registers each pair with adj_matrix_add().
 */
const char *DEFAULT_ADJ_TABLE[] = {
  "A","E",  "A","I",  "A","O",  "A","U",  "B","V",  "E","I",
  "E","O",  "E","U",  "I","O",  "I","U",  "O","U",  "I","Y",
  "E","Y",  "C","G",  "E","F",  "W","U",  "W","V",  "X","K",
  "S","Z",  "X","S",  "Q","C",  "U","V",  "M","N",  "L","I",
  "Q","O",  "P","R",  "I","J",  "2","Z",  "5","S",  "8","B",
  "1","I",  "1","L",  "0","O",  "0","Q",  "C","K",  "G","J",
  "E"," ",  "Y"," ",  "S"," "
};
12
+
13
+ extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
14
+ void node_free(Node *head);
15
+
16
+ AdjMatrix* adj_matrix_new(unsigned int length){
17
+ AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
18
+ matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
19
+ matrix->table = malloc(matrix->length * sizeof(Node**));
20
+ for(int i = 0; i < matrix->length; i++){
21
+ matrix->table[i] = malloc(matrix->length * sizeof(Node*));
22
+ for (int j = 0; j < matrix->length; j++)
23
+ matrix->table[i][j] = NULL;
24
+ }
25
+ return matrix;
26
+ }
27
+
28
+ void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
29
+ unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
30
+ h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
31
+ Node *new_node = malloc(sizeof(Node)); new_node->x = h1; new_node->y = h2; new_node->next = NULL;
32
+ if(matrix->table[h1][h2] == NULL){
33
+ matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
34
+ }
35
+ else{
36
+ Node *previous = NULL;
37
+ for(Node *i = matrix->table[h1][h2]; i != NULL; i = i->next) previous = i;
38
+ previous->next = new_node;
39
+ }
40
+ }
41
+
42
+ char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y){
43
+ unsigned int h1 = MurmurHash2(&x, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
44
+ h2 = MurmurHash2(&y, sizeof(long long), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
45
+ Node *node = matrix->table[h1][h2];
46
+ if(node == NULL) return 0;
47
+ else{
48
+ for(Node *i = node; i != NULL; i = i->next)
49
+ if((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1)) return 1;
50
+ return 0;
51
+ }
52
+ }
53
+
54
+ void node_free(Node *head){
55
+ if(head == NULL) return;
56
+ node_free(head->next);
57
+ free(head);
58
+ }
59
+
60
+ void adj_matrix_free(AdjMatrix *matrix){
61
+ for(int i = 0; i < matrix->length; i++){
62
+ for(int j = 0; j < matrix->length; j++)
63
+ if(matrix->table[i][j] != NULL){
64
+ node_free(matrix->table[i][j]);
65
+ matrix->table[i][j] = matrix->table[j][i] = NULL;
66
+ }
67
+ free(matrix->table[i]);
68
+ }
69
+ free(matrix->table);
70
+ free(matrix);
71
+ }
72
+
73
+ AdjMatrix* adj_matrix_default(){
74
+ static char first_time = 1;
75
+ static AdjMatrix *ret_matrix;
76
+ if(first_time){
77
+ ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
78
+ int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
79
+ for(int i = 0; i < length; i += 2){
80
+ unsigned long long code_1, code_2;
81
+ int dummy_length;
82
+ utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
83
+ utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
84
+ adj_matrix_add(ret_matrix, code_1, code_2);
85
+ }
86
+ first_time = 0;
87
+ }
88
+ return ret_matrix;
89
+ }
@@ -0,0 +1,22 @@
1
#ifndef ADJ_MATRIX_H
#define ADJ_MATRIX_H

/* Side length used when adj_matrix_new() is called with length 0. */
#define ADJ_MATRIX_DEFAULT_LENGTH 958
/* Seed handed to the hash function when character codes are hashed. */
#define ADJ_MATRIX_SEED 9527

/* One entry in the chain attached to a matrix cell. */
typedef struct _node{
  struct _node *next;       /* next entry in the same cell, or NULL */
  unsigned long long x, y;  /* the two values recorded for the pair by adj_matrix_add() */
} Node;

/* Square table of chain heads, `length` rows by `length` columns. */
typedef struct{
  Node ***table;
  unsigned int length;
} AdjMatrix;

/* Allocate an empty matrix; length 0 selects ADJ_MATRIX_DEFAULT_LENGTH. */
AdjMatrix* adj_matrix_new    (unsigned int length);
/* Register the pair (x, y). */
void       adj_matrix_add    (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
/* Return non-zero when the pair (x, y) is registered. */
char       adj_matrix_find   (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
/* Release a matrix and everything attached to it. */
void       adj_matrix_free   (AdjMatrix *matrix);
/* Return the shared matrix built from the default adjustment table. */
AdjMatrix* adj_matrix_default(void);

#endif
@@ -0,0 +1,29 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+
4
/* Longest lead-byte-announced sequence utf_char_to_code() will copy. */
enum { UTF_CHAR_MAX_BYTES = 6 };

/**
 * Read one character starting at `str`.
 *
 * The byte length is derived from the first byte alone. That many bytes are
 * copied verbatim into the low-address bytes of `*ret_code` (the remaining
 * bytes are zero), so the code is the raw byte sequence, not a decoded
 * code point.
 *
 * The caller must guarantee that `*ret_byte_length` bytes are readable at
 * `str` (at most UTF_CHAR_MAX_BYTES).
 */
void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
  unsigned char first_char = str[0];
  if(first_char >= 252) *ret_byte_length = 6;      // 1111110x
  else if(first_char >= 248) *ret_byte_length = 5; // 111110xx
  else if(first_char >= 240) *ret_byte_length = 4; // 11110xxx
  else if(first_char >= 224) *ret_byte_length = 3; // 1110xxxx
  else if(first_char >= 192) *ret_byte_length = 2; // 110xxxxx
  else *ret_byte_length = 1;
  *ret_code = 0;
  memcpy(ret_code, str, *ret_byte_length);
}

/**
 * Split the `length` bytes at `str` into character codes.
 *
 * @param ret_codes   Receives a heap array owned by the caller (release it
 *                    with free()). It is NULL when `length` is not positive
 *                    or the allocation fails.
 * @param ret_length  Receives the number of codes written.
 *
 * A multi-byte sequence cut short by the end of the buffer is read as if the
 * missing bytes were zero; nothing past `str + length` is ever accessed.
 */
void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
  *ret_length = 0;
  // One slot per input byte is always enough: every character uses >= 1 byte.
  *ret_codes = length > 0 ? calloc((size_t)length, sizeof **ret_codes) : NULL;
  if(*ret_codes == NULL) return;

  for(int i = 0; i < length;){
    int byte_length;
    int remaining = length - i;
    unsigned long long *slot = &(*ret_codes)[*ret_length];

    if(remaining >= UTF_CHAR_MAX_BYTES){
      utf_char_to_code(&str[i], slot, &byte_length);
    }
    else{
      // Near the end of the buffer: decode from a zero-padded copy so a
      // truncated sequence cannot make memcpy run off the end of `str`.
      char tail[UTF_CHAR_MAX_BYTES] = {0};
      memcpy(tail, &str[i], (size_t)remaining);
      utf_char_to_code(tail, slot, &byte_length);
    }

    *ret_length += 1;
    i += byte_length;
  }
}
@@ -0,0 +1,7 @@
1
+ #ifndef CODE_H
2
+ #define CODE_H
3
+
4
+ void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length);
5
+ void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length);
6
+
7
+ #endif
@@ -0,0 +1,122 @@
1
+ #include "jaro.h"
2
+ #include "code.h"
3
+ #include "adj_matrix.h"
4
+
5
+ #include <string.h>
6
+ #include <stdlib.h>
7
+ #include <ctype.h>
8
+
9
+ #define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
10
+
11
+ double jaro_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
12
+ double jaro_winkler_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
13
+
14
+ double jaro_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
15
+ if(!short_str_len || !long_str_len) return 0.0;
16
+
17
+ unsigned long long *short_codes, *long_codes;
18
+ int short_codes_len, long_codes_len;
19
+ string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
20
+ string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
21
+
22
+ double ret = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
23
+
24
+ free(short_codes); free(long_codes);
25
+ return ret;
26
+ }
27
+
28
+ double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
29
+ if(!short_str_len || !long_str_len) return 0.0;
30
+
31
+ unsigned long long *short_codes, *long_codes;
32
+ int short_codes_len, long_codes_len;
33
+ string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
34
+ string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
35
+
36
+ double ret = jaro_winkler_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
37
+
38
+ free(short_codes); free(long_codes);
39
+ return ret;
40
+ }
41
+
42
+ double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
43
+ if(!short_codes_len || !long_codes_len) return 0.0;
44
+
45
+ if(short_codes_len > long_codes_len){
46
+ SWAP(short_codes, long_codes);
47
+ SWAP(short_codes_len, long_codes_len);
48
+ }
49
+
50
+ if(opt->ignore_case){
51
+ for(int i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
52
+ for(int i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
53
+ }
54
+
55
+ int window_size = long_codes_len/2 - 1;
56
+ if(window_size < 0) window_size = 0;
57
+
58
+ char short_codes_flag[short_codes_len];
59
+ char long_codes_flag[long_codes_len];
60
+ memset(short_codes_flag, 0, short_codes_len);
61
+ memset(long_codes_flag, 0, long_codes_len);
62
+
63
+ // count number of matching characters
64
+ int match_count = 0;
65
+ for(int i = 0; i < short_codes_len; i++){
66
+ int left = (i >= window_size) ? i - window_size : 0;
67
+ int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
68
+ if(right > long_codes_len - 1) right = long_codes_len - 1;
69
+ for(int j = left; j <= right; j++){
70
+ if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
71
+ short_codes_flag[i] = long_codes_flag[j] = 1;
72
+ match_count++;
73
+ break;
74
+ }
75
+ }
76
+ }
77
+
78
+ if(!match_count) return 0.0;
79
+
80
+ // count number of transpositions
81
+ int transposition_count = 0, j = 0, k = 0;
82
+ for(int i = 0; i < short_codes_len; i++){
83
+ if(short_codes_flag[i]){
84
+ for(j = k; j < long_codes_len; j++){
85
+ if(long_codes_flag[j]){
86
+ k = j + 1;
87
+ break;
88
+ }
89
+ }
90
+ if(short_codes[i] != long_codes[j]) transposition_count++;
91
+ }
92
+ }
93
+
94
+ // count similarities in nonmatched characters
95
+ int similar_count = 0;
96
+ if(opt->adj_table && short_codes_len > match_count)
97
+ for(int i = 0; i < short_codes_len; i++)
98
+ if(!short_codes_flag[i])
99
+ for(int j = 0; j < long_codes_len; j++)
100
+ if(!long_codes_flag[j])
101
+ if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
102
+ similar_count += 3;
103
+ break;
104
+ }
105
+
106
+ double m = (double)match_count;
107
+ double t = (double)(transposition_count/2);
108
+ if(opt->adj_table) m = similar_count/10.0 + m;
109
+ return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
110
+ }
111
+
112
+ double jaro_winkler_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
113
+ double jaro_distance = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
114
+
115
+ if(jaro_distance < opt->threshold) return jaro_distance;
116
+ else{
117
+ int prefix = 0;
118
+ int max_4 = short_codes_len > 4 ? 4 : short_codes_len;
119
+ for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
120
+ return jaro_distance + prefix*opt->weight*(1-jaro_distance);
121
+ }
122
+ }
@@ -0,0 +1,17 @@
1
#ifndef LIBJARO_JARO_H
#define LIBJARO_JARO_H

/* Per-character prefix bonus used when no weight is supplied. */
#define DEFAULT_WEIGHT 0.1
/* Jaro score below which no prefix bonus is applied, when none is supplied. */
#define DEFAULT_THRESHOLD 0.7

/* Tuning knobs accepted by the distance functions. */
typedef struct LibJaroOption{
  double weight, threshold;
  char ignore_case, adj_table;  /* flags: non-zero enables the feature */
} LibJaroOption;

/* Option set with the default weight and threshold and both flags off. */
static const LibJaroOption DEFAULT_OPT = {
  .weight = DEFAULT_WEIGHT,
  .threshold = DEFAULT_THRESHOLD,
  .ignore_case = 0,
  .adj_table = 0
};

double jaro_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);

#endif
@@ -0,0 +1,45 @@
1
+ #include "ruby.h"
2
+ #include "jaro.h"
3
+
4
+ VALUE rb_mJaroWinkler,
5
+ rb_eError,
6
+ rb_eInvalidWeightError;
7
+
8
+ VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
9
+ VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
10
+ VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt));
11
+
12
+ void Init_jaro_winkler_ext(void){
13
+ rb_mJaroWinkler = rb_define_module("JaroWinkler");
14
+ rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
15
+ rb_eInvalidWeightError = rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
16
+ rb_define_module_function(rb_mJaroWinkler, "distance", rb_jaro_winkler_distance, -1);
17
+ rb_define_module_function(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance, -1);
18
+ }
19
+
20
+
21
+ VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt)){
22
+ VALUE s1, s2, opt;
23
+ rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
24
+ LibJaroOption c_opt = DEFAULT_OPT;
25
+ if(TYPE(opt) == T_HASH){
26
+ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
27
+ threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
28
+ ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
29
+ adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
30
+ if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
31
+ if(c_opt.weight > 0.25) rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
32
+ if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
33
+ if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
34
+ if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
35
+ }
36
+ return rb_float_new((*distance_fn)(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
37
+ }
38
+
39
+ VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self){
40
+ return distance(argc, argv, self, jaro_distance);
41
+ }
42
+
43
+ VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self){
44
+ return distance(argc, argv, self, jaro_winkler_distance);
45
+ }
@@ -0,0 +1,64 @@
1
+ //-----------------------------------------------------------------------------
2
+ // MurmurHash2, by Austin Appleby
3
+
4
+ // Note - This code makes a few assumptions about how your machine behaves -
5
+
6
+ // 1. We can read a 4-byte value from any address without crashing
7
+ // 2. sizeof(int) == 4
8
+
9
+ // And it has a few limitations -
10
+
11
+ // 1. It will not work incrementally.
12
+ // 2. It will not produce the same results on little-endian and big-endian
13
+ // machines.
14
+
15
/*
 * Hash `len` bytes at `key` with the given seed.
 *
 * Input words are loaded byte by byte in memory order, so `key` may have any
 * alignment and no object is read through an incompatible pointer type. The
 * result is the same as a native 4-byte load would give on the same machine
 * (and therefore still differs between little- and big-endian hosts).
 */
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.

  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value

  unsigned int h = seed ^ len;

  // Mix 4 bytes at a time into the hash

  const unsigned char * data = (const unsigned char *)key;

  while(len >= 4)
  {
    unsigned int k = 0;
    unsigned char * k_bytes = (unsigned char *)&k;
    for(int b = 0; b < 4; b++) k_bytes[b] = data[b];

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array

  switch(len)
  {
  case 3: h ^= data[2] << 16; /* fallthrough */
  case 2: h ^= data[1] << 8;  /* fallthrough */
  case 1: h ^= data[0];
          h *= m;
          break;
  default: break;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.

  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}
@@ -0,0 +1,9 @@
1
+ require 'jaro_winkler/version'
2
+
3
+ case RUBY_PLATFORM
4
+ when 'java'
5
+ require 'jaro_winkler/jaro_winkler_pure'
6
+ else
7
+ require 'jaro_winkler/jaro_winkler_ext'
8
+ end
9
+
@@ -0,0 +1,19 @@
1
module JaroWinkler
  # Symmetric lookup of character pairs: DEFAULT_ADJ_TABLE[a][b] is true when
  # the pair (a, b) is listed below, in either order. Characters that are not
  # in the table map to an empty Hash, so a double lookup never hits nil.
  DEFAULT_ADJ_TABLE = Hash.new
  [
    ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'],
    ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'], ['O', 'U'], ['I', 'Y'],
    ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
    ['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'],
    ['Q', 'O'], ['P', 'R'], ['I', 'J'], ['2', 'Z'], ['5', 'S'], ['8', 'B'],
    ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
    ['E', ' '], ['Y', ' '], ['S', ' ']
  ].each do |left, right|
    DEFAULT_ADJ_TABLE[left] ||= Hash.new
    DEFAULT_ADJ_TABLE[right] ||= Hash.new
    DEFAULT_ADJ_TABLE[left][right] = true
    DEFAULT_ADJ_TABLE[right][left] = true
  end
  # Installed only after the table is filled, so the ||= above sees nil for
  # characters that have not been added yet.
  DEFAULT_ADJ_TABLE.default = Hash.new
end
@@ -0,0 +1,125 @@
1
+ require 'jaro_winkler/adjusting_table'
2
# Pure-Ruby implementation of the Jaro and Jaro-Winkler similarity measures.
module JaroWinkler
  class Error < RuntimeError; end
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: {adj_table: false, ignore_case: false},
    jaro_winkler: {weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD}
  }

  module_function

  # Jaro-Winkler similarity of two strings.
  # Options: :weight, :threshold, :ignore_case, :adj_table.
  # Raises InvalidWeightError when :weight exceeds 0.25.
  def distance str1, str2, options={}
    _distance str1.codepoints.to_a, str2.codepoints.to_a, options
  end

  # Jaro similarity of two strings. Options: :ignore_case, :adj_table.
  def jaro_distance str1, str2, options={}
    _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
  end

  # Jaro-Winkler similarity of two codepoint arrays. With :ignore_case the
  # arrays are case-folded in place by _jaro_distance, and the prefix
  # comparison below relies on that.
  def _distance codes1, codes2, options={}
    options = DEFAULT_OPTIONS[:jaro_winkler].merge options
    if options[:weight] > 0.25
      # Same message as the C extension raises.
      raise InvalidWeightError, 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.'
    end

    jaro = _jaro_distance(codes1, codes2, options)
    return jaro if jaro < options[:threshold]

    # Bonus for up to four matching leading characters.
    codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
    max_4 = [codes1.length, 4].min
    prefix = 0
    prefix += 1 while prefix < max_4 && codes1[prefix] == codes2[prefix]
    jaro + prefix * options[:weight] * (1 - jaro)
  end

  # Jaro similarity of two codepoint arrays. With :ignore_case, a-z are
  # upper-cased in place in both arrays.
  def _jaro_distance codes1, codes2, options={}
    options = DEFAULT_OPTIONS[:jaro].merge options

    codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
    len1, len2 = codes1.length, codes2.length
    return 0.0 if len1 == 0 || len2 == 0

    if options[:ignore_case]
      codes1.map!{ |c| c >= 97 && c <= 122 ? c - 32 : c }
      codes2.map!{ |c| c >= 97 && c <= 122 ? c - 32 : c }
    end

    window = len2/2 - 1
    window = 0 if window < 0
    # Match flags are kept as bit sets: bit i set means position i is matched.
    flags1, flags2 = 0, 0

    # count number of matching characters
    match_count = 0
    len1.times do |i|
      left = i >= window ? i - window : 0
      right = [i + window, len2 - 1].min
      (left..right).each do |j|
        next unless flags2[j] == 0 && codes1[i] == codes2[j]
        flags1 |= (1 << i)
        flags2 |= (1 << j)
        match_count += 1
        break
      end
    end

    return 0.0 if match_count == 0

    # count number of transpositions: walk the matched characters of both
    # sides in order and count the positions where they disagree
    transposition_count = 0
    k = 0
    len1.times do |i|
      next if flags1[i] == 0
      k += 1 while k < len2 && flags2[k] == 0
      transposition_count += 1 if codes1[i] != codes2[k]
      k += 1
    end

    # count similarities in nonmatched characters
    similar_count = 0
    if options[:adj_table] && len1 > match_count
      len1.times do |i|
        next unless flags1[i] == 0
        len2.times do |j|
          next unless flags2[j] == 0
          if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
            similar_count += 3
            break
          end
        end
      end
    end

    m = match_count.to_f
    t = transposition_count/2 # integer halving, as in the C extension
    m = similar_count/10.0 + m if options[:adj_table]
    (m/len1 + m/len2 + (m-t)/m) / 3
  end

end
@@ -0,0 +1,3 @@
1
module JaroWinkler
  # Version string of this gem release.
  VERSION = '1.4.0'
end
metadata ADDED
@@ -0,0 +1,111 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jaro_winkler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.4.0
5
+ platform: java
6
+ authors:
7
+ - Jian Weihang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ requirement: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: '1.7'
25
+ prerelease: false
26
+ type: :development
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ~>
37
+ - !ruby/object:Gem::Version
38
+ version: '10.0'
39
+ prerelease: false
40
+ type: :development
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ version_requirements: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ prerelease: false
54
+ type: :development
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ version_requirements: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ prerelease: false
68
+ type: :development
69
+ description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.
70
+ email: tonytonyjan@gmail.com
71
+ executables: []
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - ext/jaro_winkler/adj_matrix.c
76
+ - ext/jaro_winkler/adj_matrix.h
77
+ - ext/jaro_winkler/code.c
78
+ - ext/jaro_winkler/code.h
79
+ - ext/jaro_winkler/jaro.c
80
+ - ext/jaro_winkler/jaro.h
81
+ - ext/jaro_winkler/jaro_winkler.c
82
+ - ext/jaro_winkler/murmur_hash2.c
83
+ - lib/jaro_winkler.rb
84
+ - lib/jaro_winkler/adjusting_table.rb
85
+ - lib/jaro_winkler/jaro_winkler_pure.rb
86
+ - lib/jaro_winkler/version.rb
87
+ homepage: https://github.com/tonytonyjan/jaro_winkler
88
+ licenses:
89
+ - MIT
90
+ metadata: {}
91
+ post_install_message:
92
+ rdoc_options: []
93
+ require_paths:
94
+ - lib
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ required_rubygems_version: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - '>='
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ requirements: []
106
+ rubyforge_project:
107
+ rubygems_version: 2.4.5
108
+ signing_key:
109
+ specification_version: 4
110
+ summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support UTF-8 string.
111
+ test_files: []