jaro_winkler 1.3.2.beta → 1.3.2.beta2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a90ee9e013479d3c47c7cd632646921f87af4fc
4
- data.tar.gz: 30d065fc0728d3fda6db84af71584f7a60b038cb
3
+ metadata.gz: 6e3750b9af1a515ee78a01b77192fd9eb2697f56
4
+ data.tar.gz: c1473c5a327eda3fc5973935dca50f584399d5d0
5
5
  SHA512:
6
- metadata.gz: d29510e81e2ab5510a85360321e77444363531a2786c5f5dd6213514c0f5d97232ac5d19b25eb0fbd3bc9972dc255e326b7f600c2254cdb2fe6ed7be20cd76e9
7
- data.tar.gz: 5cbb9e3167a42f86ecd6b93dfd9fe21aca7e599e7faf492e9c8f59b109958b157faff0601f1b8078ee6b3a0e92165eda9cab36d6d8eb9d7c5935f3828da18d74
6
+ metadata.gz: 615204f1ab3906d01e44d92b685751004887c339bee5fc35633c26f928b2d7e3d07159085b7122683eba21b3c56cabfdbf28687a7d95aa3815445693a9a9979a
7
+ data.tar.gz: 63601a6358c4a600c3ef258f465b38944ce346e48720613342da7b258663fefc1d51df984e2bab97b245c6e0ef0ecfcaf76ff3ad5b3b0281a9bd748b4f7d3951
data/README.md CHANGED
@@ -33,7 +33,9 @@ weight | number | 0.1 | A constant scaling factor for how much the sco
33
33
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
34
34
  adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
35
35
 
36
- ## Default Adjusting Table
36
+ # About Adjusting Table
37
+
38
+ ## Default Table
37
39
 
38
40
  ```
39
41
  ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'],
@@ -42,9 +44,9 @@ adj_table | boolean | false | The option is used to give partial credit for
42
44
  ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'], ['E', ' '], ['Y', ' '], ['S', ' ']
43
45
  ```
44
46
 
45
- ## How Adjusting Table Work
47
+ ## How Does It Work?
46
48
 
47
- origin formula:
49
+ Original Formula:
48
50
 
49
51
  ![origin](https://chart.googleapis.com/chart?cht=tx&chs&chl=%5Cbegin%7Bcases%7D0%26%7B%5Ctext%7Bif%20%7Dm%3D0%7D%5C%5C%5Cfrac%7B1%7D%7B3%7D(%5Cfrac%7Bm%7D%7B%5Cleft%7Cs1%5Cright%7C%7D%2B%5Cfrac%7Bm%7D%7B%5Cleft%7Cs2%5Cright%7C%7D%2B%5Cfrac%7Bm-t%7D%7Bm%7D)%26%5Ctext%7Bothers%7D%5Cend%7Bcases%7D)
50
52
 
@@ -53,7 +55,7 @@ where
53
55
  - `m` is the number of matching characters.
54
56
  - `t` is half the number of transpositions.
55
57
 
56
- with adjusting table:
58
+ With Adjusting Table:
57
59
 
58
60
  ![adj](https://chart.googleapis.com/chart?cht=tx&chs&chl=%5Cbegin%7Bcases%7D0%26%5Ctext%7Bif%20%7Dm%3D0%5C%5C%5Cfrac%7B1%7D%7B3%7D(%5Cfrac%7B%5Cfrac%7Bs%7D%7B10%7D%2Bm%7D%7B%5Cleft%7Cs1%5Cright%7C%7D%2B%5Cfrac%7B%5Cfrac%7Bs%7D%7B10%7D%2Bm%7D%7B%5Cleft%7Cs2%5Cright%7C%7D%2B%5Cfrac%7Bm-t%7D%7Bm%7D)%26%5Ctext%7Bothers%7D%5Cend%7Bcases%7D)
59
61
 
@@ -61,6 +63,15 @@ where
61
63
 
62
64
  - `s` is the number of nonmatching but similar characters.
63
65
 
66
+ ## Difference Between v1.3.1 And v1.3.2.beta
67
+
68
+ Version | Algorithm
69
+ ----------- | -----------------------------------------------------------------------
70
+ v1.3.1 | A single linked list stores the sparse matrix and is iterated over to find similar characters.
71
+ v1.3.2.beta | One hash table with multiple linked lists for collision handling.
72
+
73
+ In theory, the latter should be more efficient than the former (more test data is needed to confirm this).
74
+
64
75
  # Why This?
65
76
 
66
77
  There is also a similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which provides both C and Ruby versions as well.
data/Rakefile CHANGED
@@ -27,7 +27,7 @@ task :compare do
27
27
  require 'fuzzystringmatch'
28
28
  require 'hotwater'
29
29
  require 'amatch'
30
- @ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten']]
30
+ @ary = [['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'], ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten'], ['San Francisco', 'Santa Monica']]
31
31
  table = []
32
32
  table << %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch]
33
33
  table << %w[--- --- --- --- --- ---]
data/benchmark/native.txt CHANGED
@@ -1,12 +1,12 @@
1
1
  Rehearsal ----------------------------------------------------
2
- jaro_winkler 0.350000 0.000000 0.350000 ( 0.358591)
3
- fuzzystringmatch 0.360000 0.020000 0.380000 ( 0.381666)
4
- hotwater 0.340000 0.000000 0.340000 ( 0.337789)
5
- amatch 1.010000 0.000000 1.010000 ( 1.010946)
6
- ------------------------------------------- total: 2.080000sec
2
+ jaro_winkler 0.350000 0.000000 0.350000 ( 0.348383)
3
+ fuzzystringmatch 0.330000 0.020000 0.350000 ( 0.354850)
4
+ hotwater 0.280000 0.000000 0.280000 ( 0.278819)
5
+ amatch 0.980000 0.000000 0.980000 ( 0.983325)
6
+ ------------------------------------------- total: 1.960000sec
7
7
 
8
8
  user system total real
9
- jaro_winkler 0.350000 0.010000 0.360000 ( 0.345293)
10
- fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.138711)
11
- hotwater 0.310000 0.000000 0.310000 ( 0.306498)
12
- amatch 0.960000 0.000000 0.960000 ( 0.961509)
9
+ jaro_winkler 0.330000 0.000000 0.330000 ( 0.331923)
10
+ fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.135655)
11
+ hotwater 0.280000 0.000000 0.280000 ( 0.276728)
12
+ amatch 0.930000 0.010000 0.940000 ( 0.932943)
@@ -1,8 +1,16 @@
1
1
  #include <stdlib.h>
2
2
  #include "adj_matrix.h"
3
+ #include "codepoints.h"
4
+
5
+ const char *DEFAULT_ADJ_TABLE[] = {
6
+ "A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
7
+ "I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
8
+ "M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
9
+ "0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
10
+ };
3
11
 
4
12
  extern unsigned int MurmurHash2(const void * key, int len, unsigned int seed);
5
- static void node_free(Node *head);
13
+ inline void node_free(Node *head);
6
14
 
7
15
  AdjMatrix* adj_matrix_new(unsigned int length){
8
16
  AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
@@ -42,7 +50,7 @@ char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long
42
50
  }
43
51
  }
44
52
 
45
- static void node_free(Node *head){
53
+ inline void node_free(Node *head){
46
54
  if(head == NULL) return;
47
55
  node_free(head->next);
48
56
  free(head);
@@ -59,4 +67,19 @@ void adj_matrix_free(AdjMatrix *matrix){
59
67
  }
60
68
  free(matrix->table);
61
69
  free(matrix);
70
+ }
71
+
72
+ AdjMatrix* adj_matrix_default(){
73
+ static char first_time = 1;
74
+ static AdjMatrix *ret_matrix;
75
+ if(first_time){
76
+ ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
77
+ int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
78
+ for(int i = 0; i < length; i += 2){
79
+ UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
80
+ adj_matrix_add(ret_matrix, h1.code, h2.code);
81
+ }
82
+ first_time = 0;
83
+ }
84
+ return ret_matrix;
62
85
  }
@@ -13,9 +13,10 @@ typedef struct{
13
13
  unsigned int length;
14
14
  } AdjMatrix;
15
15
 
16
- AdjMatrix* adj_matrix_new(unsigned int length);
17
- void adj_matrix_add(AdjMatrix *matrix, unsigned long long x, unsigned long long y);
18
- char adj_matrix_find(AdjMatrix *matrix, unsigned long long x, unsigned long long y);
19
- void adj_matrix_free(AdjMatrix *matrix);
16
+ AdjMatrix* adj_matrix_new (unsigned int length);
17
+ void adj_matrix_add (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
18
+ char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned long long y);
19
+ void adj_matrix_free (AdjMatrix *matrix);
20
+ AdjMatrix* adj_matrix_default();
20
21
 
21
22
  #endif /* ADJ_MATRIX_H */
@@ -0,0 +1,29 @@
1
+ #include <string.h>
2
+ #include <stdlib.h>
3
+ #include "codepoints.h"
4
+
5
+ UnicodeHash unicode_hash_new(const char *str){
6
+ UnicodeHash ret = {};
7
+ unsigned char first_char = str[0];
8
+ if(first_char >= 252) ret.byte_length = 6; // 1111110x
9
+ else if(first_char >= 248) ret.byte_length = 5; // 111110xx
10
+ else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
11
+ else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
12
+ else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
13
+ else ret.byte_length = 1;
14
+ memcpy(&ret.code, str, ret.byte_length);
15
+ return ret;
16
+ }
17
+
18
+ Codepoints codepoints_new(const char *str, int byte_len){
19
+ Codepoints ret = {};
20
+ ret.ary = malloc(byte_len * sizeof(long long));
21
+ ret.length = 0;
22
+ for(int i = 0; i < byte_len;){
23
+ UnicodeHash hash = unicode_hash_new(str + i);
24
+ ret.ary[ret.length] = hash.code;
25
+ ret.length++;
26
+ i += hash.byte_length;
27
+ }
28
+ return ret;
29
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef CODEPOINTS_H
2
+ #define CODEPOINTS_H 1
3
+
4
+ typedef struct{
5
+ unsigned long long code;
6
+ unsigned int byte_length;
7
+ } UnicodeHash;
8
+
9
+ typedef struct{
10
+ unsigned long long *ary;
11
+ int length;
12
+ } Codepoints;
13
+
14
+ UnicodeHash unicode_hash_new(const char *str);
15
+ Codepoints codepoints_new (const char *str, int byte_len);
16
+
17
+ #endif /* CODEPOINTS_H */
@@ -1,30 +1,9 @@
1
- #include <string.h>
2
1
  #include <stdlib.h>
3
2
  #include <ctype.h>
4
3
  #include "distance.h"
4
+ #include "codepoints.h"
5
5
  #include "adj_matrix.h"
6
6
 
7
- typedef struct{
8
- unsigned long long code;
9
- unsigned int byte_length;
10
- } UnicodeHash;
11
-
12
- typedef struct{
13
- unsigned long long *ary;
14
- int length;
15
- } Codepoints;
16
-
17
- const char *DEFAULT_ADJ_TABLE[] = {
18
- "A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
19
- "I","Y", "E","Y", "C","G", "E","F", "W","U", "W","V", "X","K", "S","Z", "X","S", "Q","C", "U","V",
20
- "M","N", "L","I", "Q","O", "P","R", "I","J", "2","Z", "5","S", "8","B", "1","I", "1","L", "0","O",
21
- "0","Q", "C","K", "G","J", "E"," ", "Y"," ", "S"," "
22
- };
23
-
24
- static UnicodeHash unicode_hash_new(const char *str);
25
- static Codepoints codepoints_new(const char *str, int byte_len);
26
- static AdjMatrix* adj_matrix_default();
27
-
28
7
  Option option_new(){
29
8
  Option opt;
30
9
  opt.ignore_case = opt.adj_table = 0;
@@ -33,12 +12,9 @@ Option option_new(){
33
12
  return opt;
34
13
  }
35
14
 
36
- double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
37
- // set default option if NULL passed
38
- int free_opt_flag = 0;
39
-
40
- Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len);
41
- Codepoints code_ary_2 = codepoints_new(s2, s2_byte_len);
15
+ double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
16
+ Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len),
17
+ code_ary_2 = codepoints_new(s2, s2_byte_len);
42
18
 
43
19
  if(opt.ignore_case){
44
20
  for(int i = 0; i < code_ary_1.length; ++i) if(code_ary_1.ary[i] < 256 && islower(code_ary_1.ary[i])) code_ary_1.ary[i] -= 32;
@@ -54,19 +30,19 @@ double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option o
54
30
  // Compute jaro distance
55
31
  int window_size = code_ary_2.length / 2 - 1;
56
32
  if(window_size < 0) window_size = 0;
57
- double matches = 0.0;
58
- double sim_matches = 0.0;
59
- int transpositions = 0;
60
- int previous_index = -1;
61
- int max_index = code_ary_2.length - 1;
33
+ double matches = 0.0,
34
+ sim_matches = 0.0;
35
+ int transpositions = 0,
36
+ previous_index = -1,
37
+ max_index = code_ary_2.length - 1;
62
38
  for(int i = 0; i < code_ary_1.length; i++){
63
39
  int left = i - window_size;
64
40
  int right = i + window_size;
65
41
  if(left < 0) left = 0;
66
42
  if(right > max_index) right = max_index;
67
- char matched = 0;
68
- char found = 0;
69
- char sim_matched = 0;
43
+ char matched = 0,
44
+ found = 0,
45
+ sim_matched = 0;
70
46
  for(int j = left; j <= right; j++){
71
47
  if(code_ary_1.ary[i] == code_ary_2.ary[j]){
72
48
  matched = 1;
@@ -97,45 +73,4 @@ double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option o
97
73
  }
98
74
  free(code_ary_1.ary); free(code_ary_2.ary);
99
75
  return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
100
- }
101
-
102
- static UnicodeHash unicode_hash_new(const char *str){
103
- UnicodeHash ret = {};
104
- unsigned char first_char = str[0];
105
- if(first_char >= 252) ret.byte_length = 6; // 1111110x
106
- else if(first_char >= 248) ret.byte_length = 5; // 111110xx
107
- else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
108
- else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
109
- else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
110
- else ret.byte_length = 1;
111
- memcpy(&ret.code, str, ret.byte_length);
112
- return ret;
113
- }
114
-
115
- static Codepoints codepoints_new(const char *str, int byte_len){
116
- Codepoints ret = {};
117
- ret.ary = calloc(byte_len, sizeof(long long));
118
- int count = 0;
119
- for(int i = 0; i < byte_len;){
120
- UnicodeHash hash = unicode_hash_new(str + i);
121
- ret.ary[count] = hash.code;
122
- count++;
123
- i += hash.byte_length;
124
- }
125
- ret.length += count;
126
- return ret;
127
- }
128
-
129
- static AdjMatrix* adj_matrix_default(){
130
- static char first_time = 1;
131
- static AdjMatrix *ret_matrix;
132
- if(first_time){
133
- ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
134
- for(int i = 0; i < 78; i += 2){
135
- UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
136
- adj_matrix_add(ret_matrix, h1.code, h2.code);
137
- }
138
- first_time = 0;
139
- }
140
- return ret_matrix;
141
76
  }
@@ -6,7 +6,7 @@ typedef struct{
6
6
  char ignore_case, adj_table;
7
7
  } Option;
8
8
 
9
- double c_distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
9
+ double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
10
10
  Option option_new();
11
11
 
12
12
  #endif /* DISTANCE_H */
@@ -1,3 +1,3 @@
1
1
  require "mkmf"
2
- $CFLAGS << ' -std=gnu99' if Gem.win_platform?
2
+ $CFLAGS << ' -std=c99 '
3
3
  create_makefile("jaro_winkler/jaro_winkler")
@@ -13,15 +13,15 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
13
13
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
14
14
  Option c_opt = option_new();
15
15
  if(TYPE(opt) == T_HASH){
16
- VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")));
17
- VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
18
- VALUE ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case")));
19
- VALUE adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
16
+ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
17
+ threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
18
+ ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
19
+ adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
20
20
  if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
21
21
  if(c_opt.weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
22
22
  if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
23
23
  if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
24
24
  if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
25
25
  }
26
- return rb_float_new(c_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
26
+ return rb_float_new(distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
27
27
  }
@@ -1,64 +1,64 @@
1
- //-----------------------------------------------------------------------------
2
- // MurmurHash2, by Austin Appleby
3
-
4
- // Note - This code makes a few assumptions about how your machine behaves -
5
-
6
- // 1. We can read a 4-byte value from any address without crashing
7
- // 2. sizeof(int) == 4
8
-
9
- // And it has a few limitations -
10
-
11
- // 1. It will not work incrementally.
12
- // 2. It will not produce the same results on little-endian and big-endian
13
- // machines.
14
-
15
- unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
16
- {
17
- // 'm' and 'r' are mixing constants generated offline.
18
- // They're not really 'magic', they just happen to work well.
19
-
20
- const unsigned int m = 0x5bd1e995;
21
- const int r = 24;
22
-
23
- // Initialize the hash to a 'random' value
24
-
25
- unsigned int h = seed ^ len;
26
-
27
- // Mix 4 bytes at a time into the hash
28
-
29
- const unsigned char * data = (const unsigned char *)key;
30
-
31
- while(len >= 4)
32
- {
33
- unsigned int k = *(unsigned int *)data;
34
-
35
- k *= m;
36
- k ^= k >> r;
37
- k *= m;
38
-
39
- h *= m;
40
- h ^= k;
41
-
42
- data += 4;
43
- len -= 4;
44
- }
45
-
46
- // Handle the last few bytes of the input array
47
-
48
- switch(len)
49
- {
50
- case 3: h ^= data[2] << 16;
51
- case 2: h ^= data[1] << 8;
52
- case 1: h ^= data[0];
53
- h *= m;
54
- };
55
-
56
- // Do a few final mixes of the hash to ensure the last few
57
- // bytes are well-incorporated.
58
-
59
- h ^= h >> 13;
60
- h *= m;
61
- h ^= h >> 15;
62
-
63
- return h;
64
- }
1
+ //-----------------------------------------------------------------------------
2
+ // MurmurHash2, by Austin Appleby
3
+
4
+ // Note - This code makes a few assumptions about how your machine behaves -
5
+
6
+ // 1. We can read a 4-byte value from any address without crashing
7
+ // 2. sizeof(int) == 4
8
+
9
+ // And it has a few limitations -
10
+
11
+ // 1. It will not work incrementally.
12
+ // 2. It will not produce the same results on little-endian and big-endian
13
+ // machines.
14
+
15
+ unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
16
+ {
17
+ // 'm' and 'r' are mixing constants generated offline.
18
+ // They're not really 'magic', they just happen to work well.
19
+
20
+ const unsigned int m = 0x5bd1e995;
21
+ const int r = 24;
22
+
23
+ // Initialize the hash to a 'random' value
24
+
25
+ unsigned int h = seed ^ len;
26
+
27
+ // Mix 4 bytes at a time into the hash
28
+
29
+ const unsigned char * data = (const unsigned char *)key;
30
+
31
+ while(len >= 4)
32
+ {
33
+ unsigned int k = *(unsigned int *)data;
34
+
35
+ k *= m;
36
+ k ^= k >> r;
37
+ k *= m;
38
+
39
+ h *= m;
40
+ h ^= k;
41
+
42
+ data += 4;
43
+ len -= 4;
44
+ }
45
+
46
+ // Handle the last few bytes of the input array
47
+
48
+ switch(len)
49
+ {
50
+ case 3: h ^= data[2] << 16;
51
+ case 2: h ^= data[1] << 8;
52
+ case 1: h ^= data[0];
53
+ h *= m;
54
+ };
55
+
56
+ // Do a few final mixes of the hash to ensure the last few
57
+ // bytes are well-incorporated.
58
+
59
+ h ^= h >> 13;
60
+ h *= m;
61
+ h ^= h >> 15;
62
+
63
+ return h;
64
+ }
data/jaro_winkler.gemspec CHANGED
@@ -23,4 +23,8 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "bundler", "~> 1.7"
24
24
  spec.add_development_dependency "rake", "~> 10.0"
25
25
  spec.add_development_dependency "rake-compiler"
26
+ spec.add_development_dependency "rspec"
27
+ spec.add_development_dependency "fuzzy-string-match"
28
+ spec.add_development_dependency "hotwater"
29
+ spec.add_development_dependency "amatch"
26
30
  end
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.3.2.beta"
2
+ VERSION = "1.3.2.beta2"
3
3
  end
@@ -5,22 +5,23 @@ include JaroWinkler
5
5
  shared_examples 'common' do |strategy|
6
6
  it 'works' do
7
7
  ary = [
8
- ['henka' , 'henkan' , 0.9667] ,
9
- ['al' , 'al' , 1.0] ,
10
- ['martha' , 'marhta' , 0.9611] ,
11
- ['jones' , 'johnson' , 0.8323] ,
12
- ['abcvwxyz' , 'cabvwxyz' , 0.9583] ,
13
- ['dwayne' , 'duane' , 0.8400] ,
14
- ['dixon' , 'dicksonx' , 0.8133] ,
15
- ['fvie' , 'ten' , 0.0] ,
16
- ['tony' , 'tony' , 1.0] ,
17
- ['tonytonyjan' , 'tonytonyjan' , 1.0] ,
18
- ['x' , 'x' , 1.0] ,
19
- ['' , '' , 0.0] ,
20
- ['tony' , '' , 0.0] ,
21
- ['' , 'tony' , 0.0] ,
22
- ['tonytonyjan' , 'tony' , 0.8727] ,
23
- ['tony' , 'tonytonyjan' , 0.8727]
8
+ ['henka' , 'henkan' , 0.9667] ,
9
+ ['al' , 'al' , 1.0] ,
10
+ ['martha' , 'marhta' , 0.9611] ,
11
+ ['jones' , 'johnson' , 0.8323] ,
12
+ ['abcvwxyz' , 'cabvwxyz' , 0.9583] ,
13
+ ['dwayne' , 'duane' , 0.8400] ,
14
+ ['dixon' , 'dicksonx' , 0.8133] ,
15
+ ['fvie' , 'ten' , 0.0] ,
16
+ ['tony' , 'tony' , 1.0] ,
17
+ ['tonytonyjan' , 'tonytonyjan' , 1.0] ,
18
+ ['x' , 'x' , 1.0] ,
19
+ ['' , '' , 0.0] ,
20
+ ['tony' , '' , 0.0] ,
21
+ ['' , 'tony' , 0.0] ,
22
+ ['tonytonyjan' , 'tony' , 0.8727] ,
23
+ ['tony' , 'tonytonyjan' , 0.8727] ,
24
+ ['San Francisco' , 'Santa Monica' , 0.8180]
24
25
  ]
25
26
  ary.each do |s1, s2, ans|
26
27
  expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.2.beta
4
+ version: 1.3.2.beta2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-11 00:00:00.000000000 Z
11
+ date: 2014-10-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,62 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: fuzzy-string-match
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: hotwater
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: amatch
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
55
111
  description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension
56
112
  and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8
57
113
  string.
@@ -74,6 +130,8 @@ files:
74
130
  - benchmark/pure.txt
75
131
  - ext/jaro_winkler/adj_matrix.c
76
132
  - ext/jaro_winkler/adj_matrix.h
133
+ - ext/jaro_winkler/codepoints.c
134
+ - ext/jaro_winkler/codepoints.h
77
135
  - ext/jaro_winkler/distance.c
78
136
  - ext/jaro_winkler/distance.h
79
137
  - ext/jaro_winkler/extconf.rb
@@ -107,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
107
165
  version: 1.3.1
108
166
  requirements: []
109
167
  rubyforge_project:
110
- rubygems_version: 2.4.1
168
+ rubygems_version: 2.4.2
111
169
  signing_key:
112
170
  specification_version: 4
113
171
  summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support