jaro_winkler 1.3.5 → 1.3.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 32cfc9b7e44ed814f68170efdf3f799ed4641868
4
- data.tar.gz: 81e3e5311cc315cf302d8a1f34509e263e6cb8f0
3
+ metadata.gz: d48fca750c919028bdf13a2145bf431919a2c924
4
+ data.tar.gz: 134da623e4f99a708cf83f7e2e961a8882ecffb8
5
5
  SHA512:
6
- metadata.gz: 874bcc658f618d3b1843957945543c1a5f2c465333d75c7f805c3179aad2db897b25fd8471ac8f7e4cb02216d2a1d7a440664b8bc0f89444dcc88507d5d35cae
7
- data.tar.gz: c6842017d743987b8ce40e6ec752e1184f455a5db56c5119a064b37558fb5a22c25e19a65e79d7b43a06406227bb5826665f8c72063fe90afe87972209c8e363
6
+ metadata.gz: 7476d0dcd726f7d6b2405c861b139f99b22c71e21e48607beab76c1a29a11b6e9398c33943e8bfcff042b18fdea2d66f858283df17bbfd4e351a3ca93ee8f05a
7
+ data.tar.gz: 1c0a9c5b3521e761c752bc19a09a24052da3a46357195b9a86c86815418fc55bbe090b44db738bf086a44aabb1056f660a1584f1c8c7d01d124faf20623041b4
data/README.md CHANGED
@@ -1,7 +1,5 @@
1
1
  [![Build Status](https://travis-ci.org/tonytonyjan/jaro_winkler.svg?branch=master)](https://travis-ci.org/tonytonyjan/jaro_winkler)
2
2
 
3
- # About
4
-
5
3
  It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version in JRuby. Both versions support UTF-8 strings.
6
4
 
7
5
  # Installation
@@ -35,7 +33,7 @@ weight | number | 0.1 | A constant scaling factor for how much the sco
35
33
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
36
34
  adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
37
35
 
38
- # About Adjusting Table
36
+ # Adjusting Table
39
37
 
40
38
  ## Default Table
41
39
 
@@ -65,15 +63,6 @@ where
65
63
 
66
64
  - `s` is the number of nonmatching but similar characters.
67
65
 
68
- ## Difference Between v1.3.1 And v1.3.2.beta
69
-
70
- Version | Algorithm
71
- ----------- | -----------------------------------------------------------------------
72
- v1.3.1 | One linked list to store sparse matrix and iterate to find similar character.
73
- v1.3.2.beta | One hash table with multiple linked lists for collision handling.
74
-
75
- In theory, the latter should work more efficient than the former (more test data needed).
76
-
77
66
  # Why This?
78
67
 
79
68
  There is another similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which also provides both C and Ruby versions.
@@ -108,9 +97,9 @@ str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater |
108
97
  - The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
109
98
  - Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
110
99
 
111
- ## Benchmark
100
+ # Benchmark
112
101
 
113
- ### Pure Ruby
102
+ ## Pure Ruby
114
103
 
115
104
  | user | system | total | real
116
105
  ---------------- | -------- | -------- | -------- | ------------
@@ -120,7 +109,7 @@ fuzzystringmatch | 1.510000 | 0.000000 | 1.510000 | ( 1.510136)
120
109
  - jaro_winkler (1.3.1)
121
110
  - fuzzy-string-match (0.9.6)
122
111
 
123
- ### Native
112
+ ## Native
124
113
 
125
114
  | user | system | total | real
126
115
  ---------------- | -------- | -------- | -------- | ------------
@@ -136,4 +125,5 @@ amatch | 0.960000 | 0.000000 | 0.960000 | ( 0.961509)
136
125
 
137
126
  # Todo
138
127
 
139
- - Custom adjusting word table.
128
+ - Custom adjusting word table.
129
+ - The algorithms used by the C and Ruby implementations are different.
@@ -1,6 +1,7 @@
1
- #include <stdlib.h>
2
1
  #include "adj_matrix.h"
3
- #include "codepoints.h"
2
+ #include "code.h"
3
+
4
+ #include <stdlib.h>
4
5
 
5
6
  const char *DEFAULT_ADJ_TABLE[] = {
6
7
  "A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
@@ -76,8 +77,11 @@ AdjMatrix* adj_matrix_default(){
76
77
  ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
77
78
  int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
78
79
  for(int i = 0; i < length; i += 2){
79
- UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
80
- adj_matrix_add(ret_matrix, h1.code, h2.code);
80
+ unsigned long long code_1, code_2;
81
+ int dummy_length;
82
+ utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
83
+ utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
84
+ adj_matrix_add(ret_matrix, code_1, code_2);
81
85
  }
82
86
  first_time = 0;
83
87
  }
@@ -1,5 +1,5 @@
1
1
  #ifndef ADJ_MATRIX_H
2
- #define ADJ_MATRIX_H 1
2
+ #define ADJ_MATRIX_H
3
3
  #define ADJ_MATRIX_DEFAULT_LENGTH 958
4
4
  #define ADJ_MATRIX_SEED 9527
5
5
 
@@ -19,4 +19,4 @@ char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned
19
19
  void adj_matrix_free (AdjMatrix *matrix);
20
20
  AdjMatrix* adj_matrix_default();
21
21
 
22
- #endif /* ADJ_MATRIX_H */
22
+ #endif
@@ -0,0 +1,29 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+
4
/*
 * Pack the bytes of the single UTF-8 sequence starting at `str` into an
 * integer.
 *
 * The sequence length (1..6) is chosen from the lead byte alone; the trailing
 * bytes are not validated. `*ret_code` is zeroed and then the first
 * `*ret_byte_length` bytes of `str` are copied over its storage, so the value
 * is the raw byte pattern in host byte order rather than a decoded Unicode
 * scalar value.
 *
 * str             - first byte of the sequence; must be readable for as many
 *                   bytes as the lead byte announces.
 * ret_code        - receives the packed bytes.
 * ret_byte_length - receives the number of bytes consumed.
 */
void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
  const unsigned char lead = (unsigned char)str[0];

  /* Each threshold that the lead byte reaches bumps the width by one. */
  int width = 1;
  if(lead >= 0xC0) width = 2; /* 110xxxxx */
  if(lead >= 0xE0) width = 3; /* 1110xxxx */
  if(lead >= 0xF0) width = 4; /* 11110xxx */
  if(lead >= 0xF8) width = 5; /* 111110xx */
  if(lead >= 0xFC) width = 6; /* 1111110x */

  *ret_byte_length = width;
  *ret_code = 0;
  memcpy(ret_code, str, (size_t)width);
}
15
+
16
/*
 * Split a UTF-8 byte string into an array of per-character codes.
 *
 * str        - input bytes; only the first `length` bytes are read.
 * length     - number of bytes in `str`.
 * ret_codes  - receives a calloc'ed array with one slot per input byte (an
 *              upper bound on the number of characters). The caller owns it
 *              and must free() it. Set to NULL when `length` is not positive
 *              or the allocation fails.
 * ret_length - receives the number of codes stored; 0 whenever *ret_codes is
 *              NULL.
 */
void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
  *ret_codes = NULL;
  *ret_length = 0;
  if(length <= 0) return;

  unsigned long long *codes = calloc((size_t)length, sizeof *codes);
  if(!codes) return;

  int count = 0;
  for(int i = 0; i < length;){
    int remaining = length - i;
    int byte_length = 0;

    if(remaining >= 6){
      /* utf_char_to_code reads at most 6 bytes, all of them inside the input. */
      utf_char_to_code(&str[i], &codes[count], &byte_length);
    }else{
      /* Near the end a lead byte may announce more bytes than are left.
       * Decode from a zero-padded copy so nothing past str + length is read. */
      char tail[sizeof(unsigned long long)] = {0};
      memcpy(tail, &str[i], (size_t)remaining);
      utf_char_to_code(tail, &codes[count], &byte_length);
      if(byte_length > remaining) byte_length = remaining;
    }

    count++;
    i += byte_length;
  }

  *ret_codes = codes;
  *ret_length = count;
}
@@ -0,0 +1,7 @@
1
+ #ifndef CODE_H
2
+ #define CODE_H
3
+
4
+ void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length);
5
+ void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length);
6
+
7
+ #endif
@@ -0,0 +1,103 @@
1
+ #include "jaro.h"
2
+ #include "code.h"
3
+ #include "adj_matrix.h"
4
+
5
+ #include <string.h>
6
+ #include <stdlib.h>
7
+ #include <ctype.h>
8
+
9
+ double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
10
+ if(!short_str_len || !long_str_len) return 0.0;
11
+
12
+ if(short_str_len > long_str_len){
13
+ SWAP(short_str, long_str);
14
+ SWAP(short_str_len, long_str_len);
15
+ }
16
+
17
+ unsigned long long *short_codes, *long_codes;
18
+ int short_codes_len, long_codes_len;
19
+ string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
20
+ string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
21
+
22
+ if(opt->ignore_case){
23
+ for(int i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
24
+ for(int i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
25
+ }
26
+
27
+ int window_size = long_codes_len/2 - 1;
28
+ if(window_size < 0) window_size = 0;
29
+
30
+ char short_codes_flag[MAX_WORD_LENGTH];
31
+ char long_codes_flag[MAX_WORD_LENGTH];
32
+ memset(short_codes_flag, 0, MAX_WORD_LENGTH);
33
+ memset(long_codes_flag, 0, MAX_WORD_LENGTH);
34
+
35
+ // count number of matching characters
36
+ int match_count = 0;
37
+ for(int i = 0; i < short_codes_len; i++){
38
+ int left = (i >= window_size) ? i - window_size : 0;
39
+ int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
40
+ if(right > long_codes_len - 1) right = long_codes_len - 1;
41
+ for(int j = left; j <= right; j++){
42
+ if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
43
+ short_codes_flag[i] = long_codes_flag[j] = 1;
44
+ match_count++;
45
+ break;
46
+ }
47
+ }
48
+ }
49
+ if(!match_count){
50
+ free(short_codes); free(long_codes);
51
+ return 0.0;
52
+ }
53
+
54
+ // count number of transpositions
55
+ int transposition_count = 0, j = 0, k = 0;
56
+ for(int i = 0; i < short_codes_len; i++){
57
+ if(short_codes_flag[i]){
58
+ for(j = k; j < long_codes_len; j++){
59
+ if(long_codes_flag[j]){
60
+ k = j + 1;
61
+ break;
62
+ }
63
+ }
64
+ if(short_codes[i] != long_codes[j]) transposition_count++;
65
+ }
66
+ }
67
+
68
+ // count similarities in nonmatched characters
69
+ int similar_count = 0;
70
+ if(opt->adj_table && short_codes_len > match_count)
71
+ for(int i = 0; i < short_codes_len; i++)
72
+ if(!short_codes_flag[i])
73
+ for(int j = 0; j < long_codes_len; j++)
74
+ if(!long_codes_flag[j])
75
+ if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
76
+ similar_count += 3;
77
+ break;
78
+ }
79
+
80
+ // jaro distance
81
+ double jaro_distance;
82
+ double m = (double)match_count;
83
+ double t = (double)(transposition_count/2);
84
+ if(opt->adj_table) m = similar_count/10.0 + m;
85
+ jaro_distance = (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
86
+
87
+ // jaro winkler distance
88
+ if(!opt){
89
+ static LibJaroOption default_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD};
90
+ opt = &default_opt;
91
+ }
92
+ if(jaro_distance < opt->threshold){
93
+ free(short_codes); free(long_codes);
94
+ return jaro_distance;
95
+ }
96
+ else{
97
+ int prefix = 0;
98
+ int max_4 = short_codes_len > 4 ? 4 : short_codes_len;
99
+ for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
100
+ free(short_codes); free(long_codes);
101
+ return jaro_distance + prefix*opt->weight*(1-jaro_distance);
102
+ }
103
+ }
@@ -0,0 +1,16 @@
1
+ #ifndef LIBJARO_JARO_H
2
+ #define LIBJARO_JARO_H
3
+
4
+ #define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
5
+ #define MAX_WORD_LENGTH 64
6
+ #define DEFAULT_WEIGHT 0.1
7
+ #define DEFAULT_THRESHOLD 0.7
8
+
9
+ typedef struct LibJaroOption{
10
+ double weight, threshold;
11
+ char ignore_case, adj_table;
12
+ } LibJaroOption;
13
+
14
+ double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
15
+
16
+ #endif
@@ -1,27 +1,28 @@
1
- #include "jaro_winkler.h"
2
- #include "distance.h"
1
+ #include "ruby.h"
2
+ #include "jaro.h"
3
3
 
4
4
  VALUE rb_mJaroWinkler;
5
+ VALUE distance(int argc, VALUE *argv, VALUE self);
5
6
 
6
7
  void Init_jaro_winkler(void){
7
8
  rb_mJaroWinkler = rb_define_module("JaroWinkler");
8
- rb_define_module_function(rb_mJaroWinkler, "c_distance", rb_distance, -1);
9
+ rb_define_module_function(rb_mJaroWinkler, "c_distance", distance, -1);
9
10
  }
10
11
 
11
- VALUE rb_distance(int argc, VALUE *argv, VALUE self){
12
+ VALUE distance(int argc, VALUE *argv, VALUE self){
12
13
  VALUE s1, s2, opt;
13
14
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
14
- Option c_opt = option_new();
15
+ LibJaroOption c_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
15
16
  if(TYPE(opt) == T_HASH){
16
- VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
17
- threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
17
+ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
18
+ threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
18
19
  ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
19
- adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
20
+ adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
20
21
  if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
21
22
  if(c_opt.weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
22
- if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
23
- if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
24
- if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
23
+ if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
24
+ if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
25
+ if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
25
26
  }
26
- return rb_float_new(distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
27
+ return rb_float_new(jaro_winkler_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
27
28
  }
@@ -1,10 +1,19 @@
1
1
  module JaroWinkler
2
- DEFAULT_ADJ_TABLE = Hash.new({})
2
+ DEFAULT_ADJ_TABLE = Hash.new
3
3
  [
4
4
  ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'],
5
5
  ['I', 'U'], ['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
6
6
  ['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'],
7
7
  ['2', 'Z'], ['5', 'S'], ['8', 'B'], ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
8
8
  ['E', ' '], ['Y', ' '], ['S', ' ']
9
- ].each{ |s1, s2| DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true }
10
- end
9
+ ].each{ |s1, s2|
10
+ if not DEFAULT_ADJ_TABLE.has_key?(s1)
11
+ DEFAULT_ADJ_TABLE[s1] = Hash.new
12
+ end
13
+ if not DEFAULT_ADJ_TABLE.has_key?(s2)
14
+ DEFAULT_ADJ_TABLE[s2] = Hash.new
15
+ end
16
+ DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
17
+ }
18
+ DEFAULT_ADJ_TABLE.default = Hash.new
19
+ end
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.3.5"
2
+ VERSION = "1.3.6"
3
3
  end
@@ -0,0 +1,8 @@
1
+ # spec/adjusting_table_spec.rb
2
+ require 'jaro_winkler'
3
+
4
+ describe JaroWinkler::DEFAULT_ADJ_TABLE do
5
+ it 'should not be empty' do
6
+ expect(JaroWinkler::DEFAULT_ADJ_TABLE).not_to be_empty
7
+ end
8
+ end
@@ -4,40 +4,29 @@ include JaroWinkler
4
4
 
5
5
  shared_examples 'common' do |strategy|
6
6
  it 'works' do
7
- ary = [
8
- ['henka' , 'henkan' , 0.9667] ,
9
- ['al' , 'al' , 1.0] ,
10
- ['martha' , 'marhta' , 0.9611] ,
11
- ['jones' , 'johnson' , 0.8323] ,
12
- ['abcvwxyz' , 'cabvwxyz' , 0.9583] ,
13
- ['dwayne' , 'duane' , 0.8400] ,
14
- ['dixon' , 'dicksonx' , 0.8133] ,
15
- ['fvie' , 'ten' , 0.0] ,
16
- ['tony' , 'tony' , 1.0] ,
17
- ['tonytonyjan' , 'tonytonyjan' , 1.0] ,
18
- ['x' , 'x' , 1.0] ,
19
- ['' , '' , 0.0] ,
20
- ['tony' , '' , 0.0] ,
21
- ['' , 'tony' , 0.0] ,
22
- ['tonytonyjan' , 'tony' , 0.8727] ,
23
- ['tony' , 'tonytonyjan' , 0.8727] ,
24
- # ['San Francisco' , 'Santa Monica' , 0.8180]
25
- ]
26
- ary.each do |s1, s2, ans|
27
- expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
28
- end
7
+ expect(send(strategy, 'henka','henkan')).to be_within(0.0001).of(0.9667)
8
+ expect(send(strategy, 'al','al')).to be_within(0.0001).of(1.0)
9
+ expect(send(strategy, 'martha','marhta')).to be_within(0.0001).of(0.9611)
10
+ expect(send(strategy, 'jones','johnson')).to be_within(0.0001).of(0.8323)
11
+ expect(send(strategy, 'abcvwxyz','cabvwxyz')).to be_within(0.0001).of(0.9583)
12
+ expect(send(strategy, 'dwayne','duane')).to be_within(0.0001).of(0.8400)
13
+ expect(send(strategy, 'dixon','dicksonx')).to be_within(0.0001).of(0.8133)
14
+ expect(send(strategy, 'fvie','ten')).to be_within(0.0001).of(0.0)
15
+ expect(send(strategy, 'tony','tony')).to be_within(0.0001).of(1.0)
16
+ expect(send(strategy, 'tonytonyjan','tonytonyjan')).to be_within(0.0001).of(1.0)
17
+ expect(send(strategy, 'x','x')).to be_within(0.0001).of(1.0)
18
+ expect(send(strategy, '','')).to be_within(0.0001).of(0.0)
19
+ expect(send(strategy, 'tony','')).to be_within(0.0001).of(0.0)
20
+ expect(send(strategy, '','tony')).to be_within(0.0001).of(0.0)
21
+ expect(send(strategy, 'tonytonyjan','tony')).to be_within(0.0001).of(0.8727)
22
+ expect(send(strategy, 'tony','tonytonyjan')).to be_within(0.0001).of(0.8727)
29
23
  end
30
24
 
31
25
  it 'works with UTF-8' do
32
- ary = [
33
- ['變形金剛4:絕跡重生' , '變形金剛4: 絕跡重生' , 0.9818] ,
34
- ['連勝文' , '連勝丼' , 0.8222] ,
35
- ['馬英九' , '馬英丸' , 0.8222] ,
36
- ['良い' , 'いい' , 0.6666] ,
37
- ]
38
- ary.each do |s1, s2, ans|
39
- expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
40
- end
26
+ expect(send(strategy, '變形金剛4:絕跡重生','變形金剛4: 絕跡重生')).to be_within(0.0001).of(0.9818)
27
+ expect(send(strategy, '連勝文','連勝丼')).to be_within(0.0001).of(0.8222)
28
+ expect(send(strategy, '馬英九','馬英丸')).to be_within(0.0001).of(0.8222)
29
+ expect(send(strategy, '良い','いい')).to be_within(0.0001).of(0.6666)
41
30
  end
42
31
 
43
32
  it 'sets ignore_case' do
@@ -54,19 +43,14 @@ shared_examples 'common' do |strategy|
54
43
 
55
44
 
56
45
  it 'works with adjusting table' do
57
- ary = [
58
- ['HENKA' , 'HENKAN' , 0.9667] , # m=5, t=0, s=0
59
- ['AL' , 'AL' , 1.0 ] , # m=2, t=0, s=0
60
- ['MARTHA' , 'MARHTA' , 0.9611] , # m=6, t=1, s=0
61
- ['JONES' , 'JOHNSON' , 0.8598] , # m=4, t=0, s=3
62
- ['ABCVWXYZ' , 'CABVWXYZ' , 0.9583] , # m=8, t=1, s=0
63
- ['DWAYNE' , 'DUANE' , 0.8730] , # m=4, t=0, s=3
64
- ['DIXON' , 'DICKSONX' , 0.8393] , # m=4, t=0, s=3
65
- ['FVIE' , 'TEN' , 0.0 ]
66
- ]
67
- ary.each do |s1, s2, ans|
68
- expect(send(strategy, s1, s2, adj_table: true)).to be_within(0.0001).of(ans)
69
- end
46
+ expect(send(strategy, 'HENKA', 'HENKAN', adj_table: true)).to be_within(0.0001).of(0.9667) # m=5, t=0, s=0
47
+ expect(send(strategy, 'AL', 'AL', adj_table: true)).to be_within(0.0001).of(1.0) # m=2, t=0, s=0
48
+ expect(send(strategy, 'MARTHA', 'MARHTA', adj_table: true)).to be_within(0.0001).of(0.9611) # m=6, t=1, s=0
49
+ expect(send(strategy, 'JONES', 'JOHNSON', adj_table: true)).to be_within(0.0001).of(0.8598) # m=4, t=0, s=3
50
+ expect(send(strategy, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true)).to be_within(0.0001).of(0.9583) # m=8, t=1, s=0
51
+ expect(send(strategy, 'DWAYNE', 'DUANE', adj_table: true)).to be_within(0.0001).of(0.8730) # m=4, t=0, s=3
52
+ expect(send(strategy, 'DIXON', 'DICKSONX', adj_table: true)).to be_within(0.0001).of(0.8393) # m=4, t=0, s=3
53
+ expect(send(strategy, 'FVIE', 'TEN', adj_table: true)).to be_within(0.0001).of(0.0)
70
54
  end
71
55
 
72
56
  context 'with weight exceeding 0.25' do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.5
4
+ version: 1.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-02 00:00:00.000000000 Z
11
+ date: 2015-06-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -131,19 +131,19 @@ files:
131
131
  - benchmark/pure.txt
132
132
  - ext/jaro_winkler/adj_matrix.c
133
133
  - ext/jaro_winkler/adj_matrix.h
134
- - ext/jaro_winkler/codepoints.c
135
- - ext/jaro_winkler/codepoints.h
136
- - ext/jaro_winkler/distance.c
137
- - ext/jaro_winkler/distance.h
134
+ - ext/jaro_winkler/code.c
135
+ - ext/jaro_winkler/code.h
138
136
  - ext/jaro_winkler/extconf.rb
137
+ - ext/jaro_winkler/jaro.c
138
+ - ext/jaro_winkler/jaro.h
139
139
  - ext/jaro_winkler/jaro_winkler.c
140
- - ext/jaro_winkler/jaro_winkler.h
141
140
  - ext/jaro_winkler/murmur_hash2.c
142
141
  - jaro_winkler.gemspec
143
142
  - lib/jaro_winkler.rb
144
143
  - lib/jaro_winkler/adjusting_table.rb
145
144
  - lib/jaro_winkler/fallback.rb
146
145
  - lib/jaro_winkler/version.rb
146
+ - spec/adjusting_table_spec.rb
147
147
  - spec/jaro_winkler_spec.rb
148
148
  - spec/spec_helper.rb
149
149
  homepage: https://github.com/tonytonyjan/jaro_winkler
@@ -166,11 +166,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
166
  version: '0'
167
167
  requirements: []
168
168
  rubyforge_project:
169
- rubygems_version: 2.4.5
169
+ rubygems_version: 2.4.6
170
170
  signing_key:
171
171
  specification_version: 4
172
172
  summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
173
173
  UTF-8 string.
174
174
  test_files:
175
+ - spec/adjusting_table_spec.rb
175
176
  - spec/jaro_winkler_spec.rb
176
177
  - spec/spec_helper.rb
@@ -1,29 +0,0 @@
1
- #include <string.h>
2
- #include <stdlib.h>
3
- #include "codepoints.h"
4
-
5
- UnicodeHash unicode_hash_new(const char *str){
6
- UnicodeHash ret = {};
7
- unsigned char first_char = str[0];
8
- if(first_char >= 252) ret.byte_length = 6; // 1111110x
9
- else if(first_char >= 248) ret.byte_length = 5; // 111110xx
10
- else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
11
- else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
12
- else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
13
- else ret.byte_length = 1;
14
- memcpy(&ret.code, str, ret.byte_length);
15
- return ret;
16
- }
17
-
18
- Codepoints codepoints_new(const char *str, int byte_len){
19
- Codepoints ret = {};
20
- ret.ary = malloc(byte_len * sizeof(long long));
21
- ret.length = 0;
22
- for(int i = 0; i < byte_len;){
23
- UnicodeHash hash = unicode_hash_new(str + i);
24
- ret.ary[ret.length] = hash.code;
25
- ret.length++;
26
- i += hash.byte_length;
27
- }
28
- return ret;
29
- }
@@ -1,17 +0,0 @@
1
- #ifndef CODEPOINTS_H
2
- #define CODEPOINTS_H 1
3
-
4
- typedef struct{
5
- unsigned long long code;
6
- unsigned int byte_length;
7
- } UnicodeHash;
8
-
9
- typedef struct{
10
- unsigned long long *ary;
11
- int length;
12
- } Codepoints;
13
-
14
- UnicodeHash unicode_hash_new(const char *str);
15
- Codepoints codepoints_new (const char *str, int byte_len);
16
-
17
- #endif /* CODEPOINTS_H */
@@ -1,76 +0,0 @@
1
- #include <stdlib.h>
2
- #include <ctype.h>
3
- #include "distance.h"
4
- #include "codepoints.h"
5
- #include "adj_matrix.h"
6
-
7
- Option option_new(){
8
- Option opt;
9
- opt.ignore_case = opt.adj_table = 0;
10
- opt.weight = 0.1;
11
- opt.threshold = 0.7;
12
- return opt;
13
- }
14
-
15
- double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
16
- Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len),
17
- code_ary_2 = codepoints_new(s2, s2_byte_len);
18
-
19
- if(opt.ignore_case){
20
- for(int i = 0; i < code_ary_1.length; ++i) if(code_ary_1.ary[i] < 256 && islower(code_ary_1.ary[i])) code_ary_1.ary[i] -= 32;
21
- for(int i = 0; i < code_ary_2.length; ++i) if(code_ary_2.ary[i] < 256 && islower(code_ary_2.ary[i])) code_ary_2.ary[i] -= 32;
22
- }
23
-
24
- // Guarantee the order
25
- if(code_ary_1.length > code_ary_2.length){
26
- unsigned long long *tmp = code_ary_1.ary; code_ary_1.ary = code_ary_2.ary; code_ary_2.ary = tmp;
27
- int tmp2 = code_ary_1.length; code_ary_1.length = code_ary_2.length; code_ary_2.length = tmp2;
28
- }
29
-
30
- // Compute jaro distance
31
- int window_size = code_ary_2.length / 2 - 1;
32
- if(window_size < 0) window_size = 0;
33
- double matches = 0.0,
34
- sim_matches = 0.0;
35
- int transpositions = 0,
36
- previous_index = -1,
37
- max_index = code_ary_2.length - 1;
38
- for(int i = 0; i < code_ary_1.length; i++){
39
- int left = i - window_size;
40
- int right = i + window_size;
41
- if(left < 0) left = 0;
42
- if(right > max_index) right = max_index;
43
- char matched = 0,
44
- found = 0,
45
- sim_matched = 0;
46
- for(int j = left; j <= right; j++){
47
- if(code_ary_1.ary[i] == code_ary_2.ary[j]){
48
- matched = 1;
49
- if(!found && j > previous_index){
50
- previous_index = j;
51
- found = 1;
52
- }
53
- }else if(opt.adj_table && adj_matrix_find(adj_matrix_default(), code_ary_1.ary[i], code_ary_2.ary[j])) sim_matched = 1;
54
- } // for(int j = left; j <= right; j++){
55
- if(matched){
56
- matches++;
57
- if(!found) transpositions++;
58
- }else if(sim_matched) sim_matches += 3;
59
- } // for(int i = 0; i < code_ary_1.length; i++){
60
-
61
- // Don't divide transpositions by 2 since it's been counted directly by above code.
62
- double similarity = matches;
63
- if(opt.adj_table) similarity += sim_matches / 10;
64
- double jaro_distance = matches == 0 ? 0 : (similarity / code_ary_1.length + similarity / code_ary_2.length + (matches - transpositions) / matches) / 3.0;
65
-
66
- // calculate jaro-winkler distance
67
- double threshold = opt.threshold, weight = opt.weight;
68
- int prefix = 0;
69
- int max_length = code_ary_1.length > 4 ? 4 : code_ary_1.length;
70
- for(int i = 0; i < max_length; ++i){
71
- if(code_ary_1.ary[i] == code_ary_2.ary[i]) prefix++;
72
- else break;
73
- }
74
- free(code_ary_1.ary); free(code_ary_2.ary);
75
- return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
76
- }
@@ -1,12 +0,0 @@
1
- #ifndef DISTANCE_H
2
- #define DISTANCE_H 1
3
-
4
- typedef struct{
5
- double weight, threshold;
6
- char ignore_case, adj_table;
7
- } Option;
8
-
9
- double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
10
- Option option_new();
11
-
12
- #endif /* DISTANCE_H */
@@ -1,8 +0,0 @@
1
- #ifndef JARO_WINKLER_H
2
- #define JARO_WINKLER_H 1
3
-
4
- #include "ruby.h"
5
-
6
- VALUE rb_distance(int argc, VALUE *argv, VALUE obj);
7
-
8
- #endif /* JARO_WINKLER_H */