jaro_winkler 1.3.2.beta → 1.3.2.beta2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a90ee9e013479d3c47c7cd632646921f87af4fc
4
- data.tar.gz: 30d065fc0728d3fda6db84af71584f7a60b038cb
3
+ metadata.gz: 6e3750b9af1a515ee78a01b77192fd9eb2697f56
4
+ data.tar.gz: c1473c5a327eda3fc5973935dca50f584399d5d0
5
5
  SHA512:
6
- metadata.gz: d29510e81e2ab5510a85360321e77444363531a2786c5f5dd6213514c0f5d97232ac5d19b25eb0fbd3bc9972dc255e326b7f600c2254cdb2fe6ed7be20cd76e9
7
- data.tar.gz: 5cbb9e3167a42f86ecd6b93dfd9fe21aca7e599e7faf492e9c8f59b109958b157faff0601f1b8078ee6b3a0e92165eda9cab36d6d8eb9d7c5935f3828da18d74
6
+ metadata.gz: 615204f1ab3906d01e44d92b685751004887c339bee5fc35633c26f928b2d7e3d07159085b7122683eba21b3c56cabfdbf28687a7d95aa3815445693a9a9979a
7
+ data.tar.gz: 63601a6358c4a600c3ef258f465b38944ce346e48720613342da7b258663fefc1d51df984e2bab97b245c6e0ef0ecfcaf76ff3ad5b3b0281a9bd748b4f7d3951
data/README.md CHANGED
@@ -33,7 +33,9 @@ weight | number | 0.1 | A constant scaling factor for how much the sco
33
33
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
34
34
  adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
35
35
 
36
- ## Default Adjusting Table
36
+ # About Adjusting Table
37
+
38
+ ## Default Table
37
39
 
38
40
  ```
39
41
  ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'],
@@ -42,9 +44,9 @@ adj_table | boolean | false | The option is used to give partial credit for
42
44
  ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'], ['E', ' '], ['Y', ' '], ['S', ' ']
43
45
  ```
44
46
 
45
- ## How Adjusting Table Work
47
+ ## How It Works
46
48
 
47
- origin formula:
49
+ Original Formula:
48
50
 
49
51
  ![origin](https://chart.googleapis.com/chart?cht=tx&chs&chl=%5Cbegin%7Bcases%7D0%26%7B%5Ctext%7Bif%20%7Dm%3D0%7D%5C%5C%5Cfrac%7B1%7D%7B3%7D(%5Cfrac%7Bm%7D%7B%5Cleft%7Cs1%5Cright%7C%7D%2B%5Cfrac%7Bm%7D%7B%5Cleft%7Cs2%5Cright%7C%7D%2B%5Cfrac%7Bm-t%7D%7Bm%7D)%26%5Ctext%7Bothers%7D%5Cend%7Bcases%7D)
50
52
 
@@ -53,7 +55,7 @@ where
53
55
  - `m` is the number of matching characters.
54
56
  - `t` is half the number of transpositions.
55
57
 
56
- with adjusting table:
58
+ With Adjusting Table:
57
59
 
58
60
  ![adj](https://chart.googleapis.com/chart?cht=tx&chs&chl=%5Cbegin%7Bcases%7D0%26%5Ctext%7Bif%20%7Dm%3D0%5C%5C%5Cfrac%7B1%7D%7B3%7D(%5Cfrac%7B%5Cfrac%7Bs%7D%7B10%7D%2Bm%7D%7B%5Cleft%7Cs1%5Cright%7C%7D%2B%5Cfrac%7B%5Cfrac%7Bs%7D%7B10%7D%2Bm%7D%7B%5Cleft%7Cs2%5Cright%7C%7D%2B%5Cfrac%7Bm-t%7D%7Bm%7D)%26%5Ctext%7Bothers%7D%5Cend%7Bcases%7D)
59
61
 
@@ -61,6 +63,15 @@ where
61
63
 
62
64
  - `s` is the number of nonmatching but similar characters.
63
65
 
66
+ ## Difference Between v1.3.1 And v1.3.2.beta
67
+
68
+ Version | Algorithm
69
+ ----------- | -----------------------------------------------------------------------
70
+ v1.3.1 | One linked list stores the sparse matrix and is iterated to find similar characters.
71
+ v1.3.2.beta | One hash table with multiple linked lists for collision handling.
72
+
73
+ In theory, the latter should be more efficient than the former (more test data is needed to confirm this).
74
+
64
75
  # Why This?
65
76
 
66
77
  There is also another similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match) which provides both C and Ruby versions as well.
data/Rakefile CHANGED
@@ -27,7 +27,7 @@ task :compare do
27
27
  require 'fuzzystringmatch'
28
28
  require 'hotwater'
29
29
  require 'amatch'
30
- @ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
30
+ @ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten'], ['San Francisco', 'Santa Monica']]
31
31
  table = []
32
32
  table << %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch]
33
33
  table << %w[--- --- --- --- --- ---]
data/benchmark/native.txt CHANGED
@@ -1,12 +1,12 @@
1
1
  Rehearsal ----------------------------------------------------
2
- jaro_winkler 0.350000 0.000000 0.350000 ( 0.358591)
3
- fuzzystringmatch 0.360000 0.020000 0.380000 ( 0.381666)
4
- hotwater 0.340000 0.000000 0.340000 ( 0.337789)
5
- amatch 1.010000 0.000000 1.010000 ( 1.010946)
6
- ------------------------------------------- total: 2.080000sec
2
+ jaro_winkler 0.350000 0.000000 0.350000 ( 0.348383)
3
+ fuzzystringmatch 0.330000 0.020000 0.350000 ( 0.354850)
4
+ hotwater 0.280000 0.000000 0.280000 ( 0.278819)
5
+ amatch 0.980000 0.000000 0.980000 ( 0.983325)
6
+ ------------------------------------------- total: 1.960000sec
7
7
 
8
8
  user system total real
9
- jaro_winkler 0.350000 0.010000 0.360000 ( 0.345293)
10
- fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.138711)
11
- hotwater 0.310000 0.000000 0.310000 ( 0.306498)
12
- amatch 0.960000 0.000000 0.960000 ( 0.961509)
9
+ jaro_winkler 0.330000 0.000000 0.330000 ( 0.331923)
10
+ fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.135655)
11
+ hotwater 0.280000 0.000000 0.280000 ( 0.276728)
12
+ amatch 0.930000 0.010000 0.940000 ( 0.932943)
@@ -1,8 +1,16 @@
1
1
  #include <stdlib.h>
2
2
  #include "adj_matrix.h"
3
+ #include "codepoints.h"
4
+
5
+ const char *DEFAULT_ADJ_TABLE[] = {
6
+ "A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
7
+ "I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
8
+ "M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
9
+ "0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
10
+ };
3
11
 
4
12
  extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
5
- static void node_free(Node *head);
13
+ inline void node_free(Node *head);
6
14
 
7
15
  AdjMatrix* adj_matrix_new(unsigned int length){
8
16
  AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
@@ -42,7 +50,7 @@ char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long
42
50
  }
43
51
  }
44
52
 
45
- static void node_free(Node *head){
53
+ inline void node_free(Node *head){
46
54
  if(head == NULL) return;
47
55
  node_free(head->next);
48
56
  free(head);
@@ -59,4 +67,19 @@ void adj_matrix_free(AdjMatrix *matrix){
59
67
  }
60
68
  free(matrix->table);
61
69
  free(matrix);
70
+ }
71
+
72
+ AdjMatrix* adj_matrix_default(){
73
+ static char first_time = 1;
74
+ static AdjMatrix *ret_matrix;
75
+ if(first_time){
76
+ ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
77
+ int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
78
+ for(int i = 0; i < length; i += 2){
79
+ UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
80
+ adj_matrix_add(ret_matrix, h1.code, h2.code);
81
+ }
82
+ first_time = 0;
83
+ }
84
+ return ret_matrix;
62
85
  }
@@ -13,9 +13,10 @@ typedef struct{
13
13
  unsigned int length;
14
14
  } AdjMatrix;
15
15
 
16
- AdjMatrix* adj_matrix_new(unsigned int length);
17
- void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y);
18
- char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y);
19
- void adj_matrix_free(AdjMatrix *matrix);
16
+ AdjMatrix* adj_matrix_new (unsigned int length);
17
+ void adj_matrix_add (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
18
+ char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
19
+ void adj_matrix_free (AdjMatrix *matrix);
20
+ AdjMatrix* adj_matrix_default();
20
21
 
21
22
  #endif /* ADJ_MATRIX_H */
@@ -0,0 +1,29 @@
1
+ #include <string.h>
2
+ #include <stdlib.h>
3
+ #include "codepoints.h"
4
+
5
+ UnicodeHash unicode_hash_new(const char *str){
6
+ UnicodeHash ret = {};
7
+ unsigned char first_char = str[0];
8
+ if(first_char >= 252) ret.byte_length = 6; // 1111110x
9
+ else if(first_char >= 248) ret.byte_length = 5; // 111110xx
10
+ else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
11
+ else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
12
+ else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
13
+ else ret.byte_length = 1;
14
+ memcpy(&ret.code, str, ret.byte_length);
15
+ return ret;
16
+ }
17
+
18
+ Codepoints codepoints_new(const char *str, int byte_len){
19
+ Codepoints ret = {};
20
+ ret.ary = malloc(byte_len * sizeof(long long));
21
+ ret.length = 0;
22
+ for(int i = 0; i < byte_len;){
23
+ UnicodeHash hash = unicode_hash_new(str + i);
24
+ ret.ary[ret.length] = hash.code;
25
+ ret.length++;
26
+ i += hash.byte_length;
27
+ }
28
+ return ret;
29
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef CODEPOINTS_H
2
+ #define CODEPOINTS_H 1
3
+
4
+ typedef struct{
5
+ unsigned long long code;
6
+ unsigned int byte_length;
7
+ } UnicodeHash;
8
+
9
+ typedef struct{
10
+ unsigned long long *ary;
11
+ int length;
12
+ } Codepoints;
13
+
14
+ UnicodeHash unicode_hash_new(const char *str);
15
+ Codepoints codepoints_new (const char *str, int byte_len);
16
+
17
+ #endif /* CODEPOINTS_H */
@@ -1,30 +1,9 @@
1
- #include <string.h>
2
1
  #include <stdlib.h>
3
2
  #include <ctype.h>
4
3
  #include "distance.h"
4
+ #include "codepoints.h"
5
5
  #include "adj_matrix.h"
6
6
 
7
- typedef struct{
8
- unsigned long long code;
9
- unsigned int byte_length;
10
- } UnicodeHash;
11
-
12
- typedef struct{
13
- unsigned long long *ary;
14
- int length;
15
- } Codepoints;
16
-
17
- const char *DEFAULT_ADJ_TABLE[] = {
18
- "A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
19
- "I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
20
- "M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
21
- "0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
22
- };
23
-
24
- static UnicodeHash unicode_hash_new(const char *str);
25
- static Codepoints codepoints_new(const char *str, int byte_len);
26
- static AdjMatrix* adj_matrix_default();
27
-
28
7
  Option option_new(){
29
8
  Option opt;
30
9
  opt.ignore_case = opt.adj_table = 0;
@@ -33,12 +12,9 @@ Option option_new(){
33
12
  return opt;
34
13
  }
35
14
 
36
- double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
37
- // set default option if NULL passed
38
- int free_opt_flag = 0;
39
-
40
- Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len);
41
- Codepoints code_ary_2 = codepoints_new(s2, s2_byte_len);
15
+ double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
16
+ Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len),
17
+ code_ary_2 = codepoints_new(s2, s2_byte_len);
42
18
 
43
19
  if(opt.ignore_case){
44
20
  for(int i = 0; i < code_ary_1.length; ++i) if(code_ary_1.ary[i] < 256 && islower(code_ary_1.ary[i])) code_ary_1.ary[i] -= 32;
@@ -54,19 +30,19 @@ double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option o
54
30
  // Compute jaro distance
55
31
  int window_size = code_ary_2.length / 2 - 1;
56
32
  if(window_size < 0) window_size = 0;
57
- double matches = 0.0;
58
- double sim_matches = 0.0;
59
- int transpositions = 0;
60
- int previous_index = -1;
61
- int max_index = code_ary_2.length - 1;
33
+ double matches = 0.0,
34
+ sim_matches = 0.0;
35
+ int transpositions = 0,
36
+ previous_index = -1,
37
+ max_index = code_ary_2.length - 1;
62
38
  for(int i = 0; i < code_ary_1.length; i++){
63
39
  int left = i - window_size;
64
40
  int right = i + window_size;
65
41
  if(left < 0) left = 0;
66
42
  if(right > max_index) right = max_index;
67
- char matched = 0;
68
- char found = 0;
69
- char sim_matched = 0;
43
+ char matched = 0,
44
+ found = 0,
45
+ sim_matched = 0;
70
46
  for(int j = left; j <= right; j++){
71
47
  if(code_ary_1.ary[i] == code_ary_2.ary[j]){
72
48
  matched = 1;
@@ -97,45 +73,4 @@ double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option o
97
73
  }
98
74
  free(code_ary_1.ary); free(code_ary_2.ary);
99
75
  return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
100
- }
101
-
102
- static UnicodeHash unicode_hash_new(const char *str){
103
- UnicodeHash ret = {};
104
- unsigned char first_char = str[0];
105
- if(first_char >= 252) ret.byte_length = 6; // 1111110x
106
- else if(first_char >= 248) ret.byte_length = 5; // 111110xx
107
- else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
108
- else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
109
- else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
110
- else ret.byte_length = 1;
111
- memcpy(&ret.code, str, ret.byte_length);
112
- return ret;
113
- }
114
-
115
- static Codepoints codepoints_new(const char *str, int byte_len){
116
- Codepoints ret = {};
117
- ret.ary = calloc(byte_len, sizeof(long long));
118
- int count = 0;
119
- for(int i = 0; i < byte_len;){
120
- UnicodeHash hash = unicode_hash_new(str + i);
121
- ret.ary[count] = hash.code;
122
- count++;
123
- i += hash.byte_length;
124
- }
125
- ret.length += count;
126
- return ret;
127
- }
128
-
129
- static AdjMatrix* adj_matrix_default(){
130
- static char first_time = 1;
131
- static AdjMatrix *ret_matrix;
132
- if(first_time){
133
- ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
134
- for(int i = 0; i < 78; i += 2){
135
- UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
136
- adj_matrix_add(ret_matrix, h1.code, h2.code);
137
- }
138
- first_time = 0;
139
- }
140
- return ret_matrix;
141
76
  }
@@ -6,7 +6,7 @@ typedef struct{
6
6
  char ignore_case, adj_table;
7
7
  } Option;
8
8
 
9
- double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
9
+ double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
10
10
  Option option_new();
11
11
 
12
12
  #endif /* DISTANCE_H */
@@ -1,3 +1,3 @@
1
1
  require "mkmf"
2
- $CFLAGS << ' -std=gnu99' if Gem.win_platform?
2
+ $CFLAGS << ' -std=c99 '
3
3
  create_makefile("jaro_winkler/jaro_winkler")
@@ -13,15 +13,15 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
13
13
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
14
14
  Option c_opt = option_new();
15
15
  if(TYPE(opt) == T_HASH){
16
- VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")));
17
- VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
18
- VALUE ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case")));
19
- VALUE adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
16
+ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
17
+ threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
18
+ ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
19
+ adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
20
20
  if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
21
21
  if(c_opt.weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
22
22
  if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
23
23
  if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
24
24
  if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
25
25
  }
26
- return rb_float_new(c_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
26
+ return rb_float_new(distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
27
27
  }
@@ -1,64 +1,64 @@
1
- //-----------------------------------------------------------------------------
2
- // MurmurHash2, by Austin Appleby
3
-
4
- // Note - This code makes a few assumptions about how your machine behaves -
5
-
6
- // 1. We can read a 4-byte value from any address without crashing
7
- // 2. sizeof(int) == 4
8
-
9
- // And it has a few limitations -
10
-
11
- // 1. It will not work incrementally.
12
- // 2. It will not produce the same results on little-endian and big-endian
13
- // machines.
14
-
15
- unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
16
- {
17
- // 'm' and 'r' are mixing constants generated offline.
18
- // They're not really 'magic', they just happen to work well.
19
-
20
- const unsigned int m = 0x5bd1e995;
21
- const int r = 24;
22
-
23
- // Initialize the hash to a 'random' value
24
-
25
- unsigned int h = seed ^ len;
26
-
27
- // Mix 4 bytes at a time into the hash
28
-
29
- const unsigned char * data = (const unsigned char *)key;
30
-
31
- while(len >= 4)
32
- {
33
- unsigned int k = *(unsigned int *)data;
34
-
35
- k *= m;
36
- k ^= k >> r;
37
- k *= m;
38
-
39
- h *= m;
40
- h ^= k;
41
-
42
- data += 4;
43
- len -= 4;
44
- }
45
-
46
- // Handle the last few bytes of the input array
47
-
48
- switch(len)
49
- {
50
- case 3: h ^= data[2] << 16;
51
- case 2: h ^= data[1] << 8;
52
- case 1: h ^= data[0];
53
- h *= m;
54
- };
55
-
56
- // Do a few final mixes of the hash to ensure the last few
57
- // bytes are well-incorporated.
58
-
59
- h ^= h >> 13;
60
- h *= m;
61
- h ^= h >> 15;
62
-
63
- return h;
64
- }
1
+ //-----------------------------------------------------------------------------
2
+ // MurmurHash2, by Austin Appleby
3
+
4
+ // Note - This code makes a few assumptions about how your machine behaves -
5
+
6
+ // 1. We can read a 4-byte value from any address without crashing
7
+ // 2. sizeof(int) == 4
8
+
9
+ // And it has a few limitations -
10
+
11
+ // 1. It will not work incrementally.
12
+ // 2. It will not produce the same results on little-endian and big-endian
13
+ // machines.
14
+
15
+ unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
16
+ {
17
+ // 'm' and 'r' are mixing constants generated offline.
18
+ // They're not really 'magic', they just happen to work well.
19
+
20
+ const unsigned int m = 0x5bd1e995;
21
+ const int r = 24;
22
+
23
+ // Initialize the hash to a 'random' value
24
+
25
+ unsigned int h = seed ^ len;
26
+
27
+ // Mix 4 bytes at a time into the hash
28
+
29
+ const unsigned char * data = (const unsigned char *)key;
30
+
31
+ while(len >= 4)
32
+ {
33
+ unsigned int k = *(unsigned int *)data;
34
+
35
+ k *= m;
36
+ k ^= k >> r;
37
+ k *= m;
38
+
39
+ h *= m;
40
+ h ^= k;
41
+
42
+ data += 4;
43
+ len -= 4;
44
+ }
45
+
46
+ // Handle the last few bytes of the input array
47
+
48
+ switch(len)
49
+ {
50
+ case 3: h ^= data[2] << 16;
51
+ case 2: h ^= data[1] << 8;
52
+ case 1: h ^= data[0];
53
+ h *= m;
54
+ };
55
+
56
+ // Do a few final mixes of the hash to ensure the last few
57
+ // bytes are well-incorporated.
58
+
59
+ h ^= h >> 13;
60
+ h *= m;
61
+ h ^= h >> 15;
62
+
63
+ return h;
64
+ }
data/jaro_winkler.gemspec CHANGED
@@ -23,4 +23,8 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "bundler", "~> 1.7"
24
24
  spec.add_development_dependency "rake", "~> 10.0"
25
25
  spec.add_development_dependency "rake-compiler"
26
+ spec.add_development_dependency "rspec"
27
+ spec.add_development_dependency "fuzzy-string-match"
28
+ spec.add_development_dependency "hotwater"
29
+ spec.add_development_dependency "amatch"
26
30
  end
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.3.2.beta"
2
+ VERSION = "1.3.2.beta2"
3
3
  end
@@ -5,22 +5,23 @@ include JaroWinkler
5
5
  shared_examples 'common' do |strategy|
6
6
  it 'works' do
7
7
  ary = [
8
- ['henka' , 'henkan' , 0.9667] ,
9
- ['al' , 'al' , 1.0] ,
10
- ['martha' , 'marhta' , 0.9611] ,
11
- ['jones' , 'johnson' , 0.8323] ,
12
- ['abcvwxyz' , 'cabvwxyz' , 0.9583] ,
13
- ['dwayne' , 'duane' , 0.8400] ,
14
- ['dixon' , 'dicksonx' , 0.8133] ,
15
- ['fvie' , 'ten' , 0.0] ,
16
- ['tony' , 'tony' , 1.0] ,
17
- ['tonytonyjan' , 'tonytonyjan' , 1.0] ,
18
- ['x' , 'x' , 1.0] ,
19
- ['' , '' , 0.0] ,
20
- ['tony' , '' , 0.0] ,
21
- ['' , 'tony' , 0.0] ,
22
- ['tonytonyjan' , 'tony' , 0.8727] ,
23
- ['tony' , 'tonytonyjan' , 0.8727]
8
+ ['henka' , 'henkan' , 0.9667] ,
9
+ ['al' , 'al' , 1.0] ,
10
+ ['martha' , 'marhta' , 0.9611] ,
11
+ ['jones' , 'johnson' , 0.8323] ,
12
+ ['abcvwxyz' , 'cabvwxyz' , 0.9583] ,
13
+ ['dwayne' , 'duane' , 0.8400] ,
14
+ ['dixon' , 'dicksonx' , 0.8133] ,
15
+ ['fvie' , 'ten' , 0.0] ,
16
+ ['tony' , 'tony' , 1.0] ,
17
+ ['tonytonyjan' , 'tonytonyjan' , 1.0] ,
18
+ ['x' , 'x' , 1.0] ,
19
+ ['' , '' , 0.0] ,
20
+ ['tony' , '' , 0.0] ,
21
+ ['' , 'tony' , 0.0] ,
22
+ ['tonytonyjan' , 'tony' , 0.8727] ,
23
+ ['tony' , 'tonytonyjan' , 0.8727] ,
24
+ ['San Francisco' , 'Santa Monica' , 0.8180]
24
25
  ]
25
26
  ary.each do |s1, s2, ans|
26
27
  expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.2.beta
4
+ version: 1.3.2.beta2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-11 00:00:00.000000000 Z
11
+ date: 2014-10-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,62 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: fuzzy-string-match
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: hotwater
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: amatch
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
55
111
  description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension
56
112
  and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8
57
113
  string.
@@ -74,6 +130,8 @@ files:
74
130
  - benchmark/pure.txt
75
131
  - ext/jaro_winkler/adj_matrix.c
76
132
  - ext/jaro_winkler/adj_matrix.h
133
+ - ext/jaro_winkler/codepoints.c
134
+ - ext/jaro_winkler/codepoints.h
77
135
  - ext/jaro_winkler/distance.c
78
136
  - ext/jaro_winkler/distance.h
79
137
  - ext/jaro_winkler/extconf.rb
@@ -107,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
107
165
  version: 1.3.1
108
166
  requirements: []
109
167
  rubyforge_project:
110
- rubygems_version: 2.4.1
168
+ rubygems_version: 2.4.2
111
169
  signing_key:
112
170
  specification_version: 4
113
171
  summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support