jaro_winkler 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 32cfc9b7e44ed814f68170efdf3f799ed4641868
4
- data.tar.gz: 81e3e5311cc315cf302d8a1f34509e263e6cb8f0
3
+ metadata.gz: d48fca750c919028bdf13a2145bf431919a2c924
4
+ data.tar.gz: 134da623e4f99a708cf83f7e2e961a8882ecffb8
5
5
  SHA512:
6
- metadata.gz: 874bcc658f618d3b1843957945543c1a5f2c465333d75c7f805c3179aad2db897b25fd8471ac8f7e4cb02216d2a1d7a440664b8bc0f89444dcc88507d5d35cae
7
- data.tar.gz: c6842017d743987b8ce40e6ec752e1184f455a5db56c5119a064b37558fb5a22c25e19a65e79d7b43a06406227bb5826665f8c72063fe90afe87972209c8e363
6
+ metadata.gz: 7476d0dcd726f7d6b2405c861b139f99b22c71e21e48607beab76c1a29a11b6e9398c33943e8bfcff042b18fdea2d66f858283df17bbfd4e351a3ca93ee8f05a
7
+ data.tar.gz: 1c0a9c5b3521e761c752bc19a09a24052da3a46357195b9a86c86815418fc55bbe090b44db738bf086a44aabb1056f660a1584f1c8c7d01d124faf20623041b4
data/README.md CHANGED
@@ -1,7 +1,5 @@
1
1
  [![Build Status](https://travis-ci.org/tonytonyjan/jaro_winkler.svg?branch=master)](https://travis-ci.org/tonytonyjan/jaro_winkler)
2
2
 
3
- # About
4
-
5
3
  It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby. Both of them support UTF-8 strings.
6
4
 
7
5
  # Installation
@@ -35,7 +33,7 @@ weight | number | 0.1 | A constant scaling factor for how much the sco
35
33
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
36
34
  adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
37
35
 
38
- # About Adjusting Table
36
+ # Adjusting Table
39
37
 
40
38
  ## Default Table
41
39
 
@@ -65,15 +63,6 @@ where
65
63
 
66
64
  - `s` is the number of nonmatching but similar characters.
67
65
 
68
- ## Difference Between v1.3.1 And v1.3.2.beta
69
-
70
- Version | Algorithm
71
- ----------- | -----------------------------------------------------------------------
72
- v1.3.1 | One linked list to store sparse matrix and iterate to find similar character.
73
- v1.3.2.beta | One hash table with multiple linked lists for collision handling.
74
-
75
- In theory, the latter should work more efficient than the former (more test data needed).
76
-
77
66
  # Why This?
78
67
 
79
68
  There is also another similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which provides both C and Ruby versions as well.
@@ -108,9 +97,9 @@ str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater |
108
97
  - The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
109
98
  - Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
110
99
 
111
- ## Benchmark
100
+ # Benchmark
112
101
 
113
- ### Pure Ruby
102
+ ## Pure Ruby
114
103
 
115
104
  | user | system | total | real
116
105
  ---------------- | -------- | -------- | -------- | ------------
@@ -120,7 +109,7 @@ fuzzystringmatch | 1.510000 | 0.000000 | 1.510000 | ( 1.510136)
120
109
  - jaro_winkler (1.3.1)
121
110
  - fuzzy-string-match (0.9.6)
122
111
 
123
- ### Native
112
+ ## Native
124
113
 
125
114
  | user | system | total | real
126
115
  ---------------- | -------- | -------- | -------- | ------------
@@ -136,4 +125,5 @@ amatch | 0.960000 | 0.000000 | 0.960000 | ( 0.961509)
136
125
 
137
126
  # Todo
138
127
 
139
- - Custom adjusting word table.
128
+ - Custom adjusting word table.
129
+ - The algorithms used by the C and Ruby versions are different.
@@ -1,6 +1,7 @@
1
- #include <stdlib.h>
2
1
  #include "adj_matrix.h"
3
- #include "codepoints.h"
2
+ #include "code.h"
3
+
4
+ #include <stdlib.h>
4
5
 
5
6
  const char *DEFAULT_ADJ_TABLE[] = {
6
7
  "A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
@@ -76,8 +77,11 @@ AdjMatrix* adj_matrix_default(){
76
77
  ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
77
78
  int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
78
79
  for(int i = 0; i < length; i += 2){
79
- UnicodeHash h1 = unicode_hash_new(DEFAULT_ADJ_TABLE[i]), h2 = unicode_hash_new(DEFAULT_ADJ_TABLE[i + 1]);
80
- adj_matrix_add(ret_matrix, h1.code, h2.code);
80
+ unsigned long long code_1, code_2;
81
+ int dummy_length;
82
+ utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
83
+ utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
84
+ adj_matrix_add(ret_matrix, code_1, code_2);
81
85
  }
82
86
  first_time = 0;
83
87
  }
@@ -1,5 +1,5 @@
1
1
  #ifndef ADJ_MATRIX_H
2
- #define ADJ_MATRIX_H 1
2
+ #define ADJ_MATRIX_H
3
3
  #define ADJ_MATRIX_DEFAULT_LENGTH 958
4
4
  #define ADJ_MATRIX_SEED 9527
5
5
 
@@ -19,4 +19,4 @@ char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned
19
19
  void adj_matrix_free (AdjMatrix *matrix);
20
20
  AdjMatrix* adj_matrix_default();
21
21
 
22
- #endif /* ADJ_MATRIX_H */
22
+ #endif
@@ -0,0 +1,29 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+
4
// Packs the bytes of the UTF-8 sequence that starts at str into *ret_code and
// reports how many bytes that sequence occupies.
//   str             - points at the lead byte of the sequence
//   ret_code        - out: zeroed, then the first *ret_byte_length bytes of str are
//                     memcpy'd over its storage. The value is therefore the raw byte
//                     sequence as laid out in memory, not a decoded code point.
//   ret_byte_length - out: 1..6, chosen from the value of the lead byte alone
// NOTE(review): there is no length parameter, so the memcpy trusts the lead byte;
// a truncated multi-byte sequence at the end of a buffer would be read past its end.
+ void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
5
+ unsigned char first_char = str[0];
6
+ if(first_char >= 252) *ret_byte_length = 6; // 1111110x
7
+ else if(first_char >= 248) *ret_byte_length = 5; // 111110xx
8
+ else if(first_char >= 240) *ret_byte_length = 4; // 11110xxx
9
+ else if(first_char >= 224) *ret_byte_length = 3; // 1110xxxx
10
+ else if(first_char >= 192) *ret_byte_length = 2; // 110xxxxx
11
+ else *ret_byte_length = 1; // every lead byte below 192, including 128..191
12
+ *ret_code = 0; // clear the whole integer; the memcpy below overwrites only its first bytes
13
+ memcpy(ret_code, str, *ret_byte_length);
14
+ }
15
+
16
// Splits the first `length` bytes of str into per-character codes by calling
// utf_char_to_code repeatedly.
//   str        - input bytes; indexed from 0 up to length
//   length     - number of bytes to consume; also the number of slots allocated
//   ret_codes  - out: receives a calloc'd array of `length` elements (one slot per
//                input byte, so it can never be too small for the codes produced).
//                This function does not free it.
//   ret_length - out: number of codes actually written
// NOTE(review): `code` and the outer `char byte_length` are never used; the
// `int byte_length` declared inside the loop shadows the outer one.
// NOTE(review): the calloc result is written through without a NULL check.
+ void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
17
+ unsigned int code;
18
+ char byte_length;
19
+
20
+ *ret_codes = calloc(length, sizeof(long long));
21
+ *ret_length = 0;
22
+
// The loop header has no increment: i advances by the byte length of each sequence.
23
+ for(int i = 0; i < length;){
24
+ int byte_length;
25
+ utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
26
+ *ret_length += 1;
27
+ i += byte_length;
28
+ }
29
+ }
@@ -0,0 +1,7 @@
1
+ #ifndef CODE_H
2
+ #define CODE_H
3
+
4
+ void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length);
5
+ void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length);
6
+
7
+ #endif
@@ -0,0 +1,103 @@
1
+ #include "jaro.h"
2
+ #include "code.h"
3
+ #include "adj_matrix.h"
4
+
5
+ #include <string.h>
6
+ #include <stdlib.h>
7
+ #include <ctype.h>
8
+
9
// Computes a Jaro-Winkler score for two byte strings.
//   short_str / short_str_len - first string and its length as passed to string_to_codes
//   long_str  / long_str_len  - second string and its length; the pair is swapped
//                               below if the first is the longer one
//   opt                       - weight, threshold, ignore_case and adj_table settings
// Returns 0.0 when either length is zero or when no characters match. Otherwise
// returns the Jaro score, plus a common-prefix bonus when the score is not below
// opt->threshold. Both code arrays from string_to_codes are freed on every return
// path after they are allocated.
+ double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
10
+ if(!short_str_len || !long_str_len) return 0.0;
11
+
// NOTE(review): the swap compares the lengths given by the caller, before decoding.
// The code counts (short_codes_len / long_codes_len) are not re-ordered afterwards,
// so with multi-byte input the "short" side may hold more codes - TODO confirm intent.
12
+ if(short_str_len > long_str_len){
13
+ SWAP(short_str, long_str);
14
+ SWAP(short_str_len, long_str_len);
15
+ }
16
+
17
+ unsigned long long *short_codes, *long_codes;
18
+ int short_codes_len, long_codes_len;
19
+ string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
20
+ string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
21
+
// NOTE(review): opt is dereferenced here (and at the opt->adj_table reads below)
// before the `if(!opt)` fallback near the end, so a NULL opt never reaches it.
// NOTE(review): tolower takes an int and is only defined for values representable
// as unsigned char (or EOF); multi-byte codes packed by utf_char_to_code fall
// outside that range.
22
+ if(opt->ignore_case){
23
+ for(int i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
24
+ for(int i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
25
+ }
26
+
// Matching window: half the longer code count (integer division) minus one, floored at 0.
27
+ int window_size = long_codes_len/2 - 1;
28
+ if(window_size < 0) window_size = 0;
29
+
// NOTE(review): both flag arrays have MAX_WORD_LENGTH entries but are indexed up to
// short_codes_len - 1 / long_codes_len - 1 below; no bound check on those lengths
// is visible in this function.
30
+ char short_codes_flag[MAX_WORD_LENGTH];
31
+ char long_codes_flag[MAX_WORD_LENGTH];
32
+ memset(short_codes_flag, 0, MAX_WORD_LENGTH);
33
+ memset(long_codes_flag, 0, MAX_WORD_LENGTH);
34
+
35
+ // count number of matching characters
36
+ int match_count = 0;
37
+ for(int i = 0; i < short_codes_len; i++){
38
+ int left = (i >= window_size) ? i - window_size : 0;
39
+ int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
40
+ if(right > long_codes_len - 1) right = long_codes_len - 1; // already guaranteed by the ternary above
41
+ for(int j = left; j <= right; j++){
// Pair each short code with the first not-yet-used equal long code in the window.
42
+ if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
43
+ short_codes_flag[i] = long_codes_flag[j] = 1;
44
+ match_count++;
45
+ break;
46
+ }
47
+ }
48
+ }
49
+ if(!match_count){
50
+ free(short_codes); free(long_codes);
51
+ return 0.0;
52
+ }
53
+
54
+ // count number of transpositions
// Walks the flagged codes of both arrays in order; k remembers where the scan of
// the long array resumes, and each out-of-order pair increments the counter.
// NOTE(review): if the inner loop ended without a break, j would equal
// long_codes_len and long_codes[j] would be read out of range. Presumably
// unreachable, because every flagged short code has a flagged long partner - TODO confirm.
55
+ int transposition_count = 0, j = 0, k = 0;
56
+ for(int i = 0; i < short_codes_len; i++){
57
+ if(short_codes_flag[i]){
58
+ for(j = k; j < long_codes_len; j++){
59
+ if(long_codes_flag[j]){
60
+ k = j + 1;
61
+ break;
62
+ }
63
+ }
64
+ if(short_codes[i] != long_codes[j]) transposition_count++;
65
+ }
66
+ }
67
+
68
+ // count similarities in nonmatched characters
// Each unmatched short code that adj_matrix_find pairs with some unmatched long
// code adds 3 here; the sum is divided by 10.0 below, i.e. 0.3 credit per code.
// The long code is not flagged as used, so it can be paired more than once.
69
+ int similar_count = 0;
70
+ if(opt->adj_table && short_codes_len > match_count)
71
+ for(int i = 0; i < short_codes_len; i++)
72
+ if(!short_codes_flag[i])
73
+ for(int j = 0; j < long_codes_len; j++)
74
+ if(!long_codes_flag[j])
75
+ if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
76
+ similar_count += 3;
77
+ break;
78
+ }
79
+
80
+ // jaro distance
81
+ double jaro_distance;
82
+ double m = (double)match_count;
// The halving is integer division done before the cast, so an odd count rounds down.
83
+ double t = (double)(transposition_count/2);
84
+ if(opt->adj_table) m = similar_count/10.0 + m;
85
+ jaro_distance = (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
86
+
87
+ // jaro winkler distance
// The designated initializer sets only weight and threshold; the remaining members
// of the static default_opt are zero-initialized.
88
+ if(!opt){
89
+ static LibJaroOption default_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD};
90
+ opt = &default_opt;
91
+ }
92
+ if(jaro_distance < opt->threshold){
93
+ free(short_codes); free(long_codes);
94
+ return jaro_distance;
95
+ }
96
+ else{
// prefix = number of leading codes the two arrays share, capped at 4 and at short_codes_len.
97
+ int prefix = 0;
98
+ int max_4 = short_codes_len > 4 ? 4 : short_codes_len;
99
+ for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
100
+ free(short_codes); free(long_codes);
101
+ return jaro_distance + prefix*opt->weight*(1-jaro_distance);
102
+ }
103
+ }
@@ -0,0 +1,16 @@
1
+ #ifndef LIBJARO_JARO_H
2
+ #define LIBJARO_JARO_H
3
+
4
// SWAP exchanges two lvalues of the same type through a temporary declared with
// __typeof__ (a compiler extension, not ISO C). The arguments are not
// parenthesized and are evaluated more than once, so pass plain variables only.
+ #define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
5
// Size of the per-character match-flag arrays declared in jaro_winkler_distance.
+ #define MAX_WORD_LENGTH 64
6
// Values used for weight / threshold when the caller supplies none.
+ #define DEFAULT_WEIGHT 0.1
7
+ #define DEFAULT_THRESHOLD 0.7
8
+
9
// Options read by jaro_winkler_distance:
//   weight      - multiplier of the common-prefix bonus
//   threshold   - the prefix bonus is skipped when the Jaro score is below this value
//   ignore_case - nonzero: tolower is applied to every code before comparing
//   adj_table   - nonzero: unmatched codes found in the default adjusting table
//                 earn partial credit
+ typedef struct LibJaroOption{
10
+ double weight, threshold;
11
+ char ignore_case, adj_table;
12
+ } LibJaroOption;
13
+
14
// Returns the Jaro-Winkler score of str1 and str2; len1 / len2 are their lengths.
+ double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
15
+
16
+ #endif
@@ -1,27 +1,28 @@
1
- #include "jaro_winkler.h"
2
- #include "distance.h"
1
+ #include "ruby.h"
2
+ #include "jaro.h"
3
3
 
4
4
  VALUE rb_mJaroWinkler;
5
+ VALUE distance(int argc, VALUE *argv, VALUE self);
5
6
 
6
7
  void Init_jaro_winkler(void){
7
8
  rb_mJaroWinkler = rb_define_module("JaroWinkler");
8
- rb_define_module_function(rb_mJaroWinkler, "c_distance", rb_distance, -1);
9
+ rb_define_module_function(rb_mJaroWinkler, "c_distance", distance, -1);
9
10
  }
10
11
 
11
- VALUE rb_distance(int argc, VALUE *argv, VALUE self){
12
+ VALUE distance(int argc, VALUE *argv, VALUE self){
12
13
  VALUE s1, s2, opt;
13
14
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
14
- Option c_opt = option_new();
15
+ LibJaroOption c_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
15
16
  if(TYPE(opt) == T_HASH){
16
- VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
17
- threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
17
+ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
18
+ threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
18
19
  ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
19
- adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
20
+ adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
20
21
  if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
21
22
  if(c_opt.weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
22
- if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
23
- if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
24
- if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
23
+ if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
24
+ if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
25
+ if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
25
26
  }
26
- return rb_float_new(distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
27
+ return rb_float_new(jaro_winkler_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
27
28
  }
@@ -1,10 +1,19 @@
1
1
  module JaroWinkler
2
- DEFAULT_ADJ_TABLE = Hash.new({})
2
+ DEFAULT_ADJ_TABLE = Hash.new
3
3
  [
4
4
  ['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'],
5
5
  ['I', 'U'], ['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
6
6
  ['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'],
7
7
  ['2', 'Z'], ['5', 'S'], ['8', 'B'], ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
8
8
  ['E', ' '], ['Y', ' '], ['S', ' ']
9
- ].each{ |s1, s2| DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true }
10
- end
9
+ ].each{ |s1, s2|
10
+ if not DEFAULT_ADJ_TABLE.has_key?(s1)
11
+ DEFAULT_ADJ_TABLE[s1] = Hash.new
12
+ end
13
+ if not DEFAULT_ADJ_TABLE.has_key?(s2)
14
+ DEFAULT_ADJ_TABLE[s2] = Hash.new
15
+ end
16
+ DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
17
+ }
18
+ DEFAULT_ADJ_TABLE.default = Hash.new
19
+ end
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.3.5"
2
+ VERSION = "1.3.6"
3
3
  end
@@ -0,0 +1,8 @@
1
+ # spec/adjusting_table_spec.rb
2
+ require 'jaro_winkler'
3
+
4
+ describe JaroWinkler::DEFAULT_ADJ_TABLE do
5
+ it 'should not be empty' do
6
+ expect(JaroWinkler::DEFAULT_ADJ_TABLE).not_to be_empty
7
+ end
8
+ end
@@ -4,40 +4,29 @@ include JaroWinkler
4
4
 
5
5
  shared_examples 'common' do |strategy|
6
6
  it 'works' do
7
- ary = [
8
- ['henka' , 'henkan' , 0.9667] ,
9
- ['al' , 'al' , 1.0] ,
10
- ['martha' , 'marhta' , 0.9611] ,
11
- ['jones' , 'johnson' , 0.8323] ,
12
- ['abcvwxyz' , 'cabvwxyz' , 0.9583] ,
13
- ['dwayne' , 'duane' , 0.8400] ,
14
- ['dixon' , 'dicksonx' , 0.8133] ,
15
- ['fvie' , 'ten' , 0.0] ,
16
- ['tony' , 'tony' , 1.0] ,
17
- ['tonytonyjan' , 'tonytonyjan' , 1.0] ,
18
- ['x' , 'x' , 1.0] ,
19
- ['' , '' , 0.0] ,
20
- ['tony' , '' , 0.0] ,
21
- ['' , 'tony' , 0.0] ,
22
- ['tonytonyjan' , 'tony' , 0.8727] ,
23
- ['tony' , 'tonytonyjan' , 0.8727] ,
24
- # ['San Francisco' , 'Santa Monica' , 0.8180]
25
- ]
26
- ary.each do |s1, s2, ans|
27
- expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
28
- end
7
+ expect(send(strategy, 'henka','henkan')).to be_within(0.0001).of(0.9667)
8
+ expect(send(strategy, 'al','al')).to be_within(0.0001).of(1.0)
9
+ expect(send(strategy, 'martha','marhta')).to be_within(0.0001).of(0.9611)
10
+ expect(send(strategy, 'jones','johnson')).to be_within(0.0001).of(0.8323)
11
+ expect(send(strategy, 'abcvwxyz','cabvwxyz')).to be_within(0.0001).of(0.9583)
12
+ expect(send(strategy, 'dwayne','duane')).to be_within(0.0001).of(0.8400)
13
+ expect(send(strategy, 'dixon','dicksonx')).to be_within(0.0001).of(0.8133)
14
+ expect(send(strategy, 'fvie','ten')).to be_within(0.0001).of(0.0)
15
+ expect(send(strategy, 'tony','tony')).to be_within(0.0001).of(1.0)
16
+ expect(send(strategy, 'tonytonyjan','tonytonyjan')).to be_within(0.0001).of(1.0)
17
+ expect(send(strategy, 'x','x')).to be_within(0.0001).of(1.0)
18
+ expect(send(strategy, '','')).to be_within(0.0001).of(0.0)
19
+ expect(send(strategy, 'tony','')).to be_within(0.0001).of(0.0)
20
+ expect(send(strategy, '','tony')).to be_within(0.0001).of(0.0)
21
+ expect(send(strategy, 'tonytonyjan','tony')).to be_within(0.0001).of(0.8727)
22
+ expect(send(strategy, 'tony','tonytonyjan')).to be_within(0.0001).of(0.8727)
29
23
  end
30
24
 
31
25
  it 'works with UTF-8' do
32
- ary = [
33
- ['變形金剛4:絕跡重生' , '變形金剛4: 絕跡重生' , 0.9818] ,
34
- ['連勝文' , '連勝丼' , 0.8222] ,
35
- ['馬英九' , '馬英丸' , 0.8222] ,
36
- ['良い' , 'いい' , 0.6666] ,
37
- ]
38
- ary.each do |s1, s2, ans|
39
- expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
40
- end
26
+ expect(send(strategy, '變形金剛4:絕跡重生','變形金剛4: 絕跡重生')).to be_within(0.0001).of(0.9818)
27
+ expect(send(strategy, '連勝文','連勝丼')).to be_within(0.0001).of(0.8222)
28
+ expect(send(strategy, '馬英九','馬英丸')).to be_within(0.0001).of(0.8222)
29
+ expect(send(strategy, '良い','いい')).to be_within(0.0001).of(0.6666)
41
30
  end
42
31
 
43
32
  it 'sets ignore_case' do
@@ -54,19 +43,14 @@ shared_examples 'common' do |strategy|
54
43
 
55
44
 
56
45
  it 'works with adjusting table' do
57
- ary = [
58
- ['HENKA' , 'HENKAN' , 0.9667] , # m=5, t=0, s=0
59
- ['AL' , 'AL' , 1.0 ] , # m=2, t=0, s=0
60
- ['MARTHA' , 'MARHTA' , 0.9611] , # m=6, t=1, s=0
61
- ['JONES' , 'JOHNSON' , 0.8598] , # m=4, t=0, s=3
62
- ['ABCVWXYZ' , 'CABVWXYZ' , 0.9583] , # m=8, t=1, s=0
63
- ['DWAYNE' , 'DUANE' , 0.8730] , # m=4, t=0, s=3
64
- ['DIXON' , 'DICKSONX' , 0.8393] , # m=4, t=0, s=3
65
- ['FVIE' , 'TEN' , 0.0 ]
66
- ]
67
- ary.each do |s1, s2, ans|
68
- expect(send(strategy, s1, s2, adj_table: true)).to be_within(0.0001).of(ans)
69
- end
46
+ expect(send(strategy, 'HENKA', 'HENKAN', adj_table: true)).to be_within(0.0001).of(0.9667) # m=5, t=0, s=0
47
+ expect(send(strategy, 'AL', 'AL', adj_table: true)).to be_within(0.0001).of(1.0) # m=2, t=0, s=0
48
+ expect(send(strategy, 'MARTHA', 'MARHTA', adj_table: true)).to be_within(0.0001).of(0.9611) # m=6, t=1, s=0
49
+ expect(send(strategy, 'JONES', 'JOHNSON', adj_table: true)).to be_within(0.0001).of(0.8598) # m=4, t=0, s=3
50
+ expect(send(strategy, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true)).to be_within(0.0001).of(0.9583) # m=8, t=1, s=0
51
+ expect(send(strategy, 'DWAYNE', 'DUANE', adj_table: true)).to be_within(0.0001).of(0.8730) # m=4, t=0, s=3
52
+ expect(send(strategy, 'DIXON', 'DICKSONX', adj_table: true)).to be_within(0.0001).of(0.8393) # m=4, t=0, s=3
53
+ expect(send(strategy, 'FVIE', 'TEN', adj_table: true)).to be_within(0.0001).of(0.0)
70
54
  end
71
55
 
72
56
  context 'with weight exceeding 0.25' do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.5
4
+ version: 1.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-02 00:00:00.000000000 Z
11
+ date: 2015-06-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -131,19 +131,19 @@ files:
131
131
  - benchmark/pure.txt
132
132
  - ext/jaro_winkler/adj_matrix.c
133
133
  - ext/jaro_winkler/adj_matrix.h
134
- - ext/jaro_winkler/codepoints.c
135
- - ext/jaro_winkler/codepoints.h
136
- - ext/jaro_winkler/distance.c
137
- - ext/jaro_winkler/distance.h
134
+ - ext/jaro_winkler/code.c
135
+ - ext/jaro_winkler/code.h
138
136
  - ext/jaro_winkler/extconf.rb
137
+ - ext/jaro_winkler/jaro.c
138
+ - ext/jaro_winkler/jaro.h
139
139
  - ext/jaro_winkler/jaro_winkler.c
140
- - ext/jaro_winkler/jaro_winkler.h
141
140
  - ext/jaro_winkler/murmur_hash2.c
142
141
  - jaro_winkler.gemspec
143
142
  - lib/jaro_winkler.rb
144
143
  - lib/jaro_winkler/adjusting_table.rb
145
144
  - lib/jaro_winkler/fallback.rb
146
145
  - lib/jaro_winkler/version.rb
146
+ - spec/adjusting_table_spec.rb
147
147
  - spec/jaro_winkler_spec.rb
148
148
  - spec/spec_helper.rb
149
149
  homepage: https://github.com/tonytonyjan/jaro_winkler
@@ -166,11 +166,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
166
  version: '0'
167
167
  requirements: []
168
168
  rubyforge_project:
169
- rubygems_version: 2.4.5
169
+ rubygems_version: 2.4.6
170
170
  signing_key:
171
171
  specification_version: 4
172
172
  summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
173
173
  UTF-8 string.
174
174
  test_files:
175
+ - spec/adjusting_table_spec.rb
175
176
  - spec/jaro_winkler_spec.rb
176
177
  - spec/spec_helper.rb
@@ -1,29 +0,0 @@
1
- #include <string.h>
2
- #include <stdlib.h>
3
- #include "codepoints.h"
4
-
5
- UnicodeHash unicode_hash_new(const char *str){
6
- UnicodeHash ret = {};
7
- unsigned char first_char = str[0];
8
- if(first_char >= 252) ret.byte_length = 6; // 1111110x
9
- else if(first_char >= 248) ret.byte_length = 5; // 111110xx
10
- else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
11
- else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
12
- else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
13
- else ret.byte_length = 1;
14
- memcpy(&ret.code, str, ret.byte_length);
15
- return ret;
16
- }
17
-
18
- Codepoints codepoints_new(const char *str, int byte_len){
19
- Codepoints ret = {};
20
- ret.ary = malloc(byte_len * sizeof(long long));
21
- ret.length = 0;
22
- for(int i = 0; i < byte_len;){
23
- UnicodeHash hash = unicode_hash_new(str + i);
24
- ret.ary[ret.length] = hash.code;
25
- ret.length++;
26
- i += hash.byte_length;
27
- }
28
- return ret;
29
- }
@@ -1,17 +0,0 @@
1
- #ifndef CODEPOINTS_H
2
- #define CODEPOINTS_H 1
3
-
4
- typedef struct{
5
- unsigned long long code;
6
- unsigned int byte_length;
7
- } UnicodeHash;
8
-
9
- typedef struct{
10
- unsigned long long *ary;
11
- int length;
12
- } Codepoints;
13
-
14
- UnicodeHash unicode_hash_new(const char *str);
15
- Codepoints codepoints_new (const char *str, int byte_len);
16
-
17
- #endif /* CODEPOINTS_H */
@@ -1,76 +0,0 @@
1
- #include <stdlib.h>
2
- #include <ctype.h>
3
- #include "distance.h"
4
- #include "codepoints.h"
5
- #include "adj_matrix.h"
6
-
7
- Option option_new(){
8
- Option opt;
9
- opt.ignore_case = opt.adj_table = 0;
10
- opt.weight = 0.1;
11
- opt.threshold = 0.7;
12
- return opt;
13
- }
14
-
15
- double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
16
- Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len),
17
- code_ary_2 = codepoints_new(s2, s2_byte_len);
18
-
19
- if(opt.ignore_case){
20
- for(int i = 0; i < code_ary_1.length; ++i) if(code_ary_1.ary[i] < 256 && islower(code_ary_1.ary[i])) code_ary_1.ary[i] -= 32;
21
- for(int i = 0; i < code_ary_2.length; ++i) if(code_ary_2.ary[i] < 256 && islower(code_ary_2.ary[i])) code_ary_2.ary[i] -= 32;
22
- }
23
-
24
- // Guarantee the order
25
- if(code_ary_1.length > code_ary_2.length){
26
- unsigned long long *tmp = code_ary_1.ary; code_ary_1.ary = code_ary_2.ary; code_ary_2.ary = tmp;
27
- int tmp2 = code_ary_1.length; code_ary_1.length = code_ary_2.length; code_ary_2.length = tmp2;
28
- }
29
-
30
- // Compute jaro distance
31
- int window_size = code_ary_2.length / 2 - 1;
32
- if(window_size < 0) window_size = 0;
33
- double matches = 0.0,
34
- sim_matches = 0.0;
35
- int transpositions = 0,
36
- previous_index = -1,
37
- max_index = code_ary_2.length - 1;
38
- for(int i = 0; i < code_ary_1.length; i++){
39
- int left = i - window_size;
40
- int right = i + window_size;
41
- if(left < 0) left = 0;
42
- if(right > max_index) right = max_index;
43
- char matched = 0,
44
- found = 0,
45
- sim_matched = 0;
46
- for(int j = left; j <= right; j++){
47
- if(code_ary_1.ary[i] == code_ary_2.ary[j]){
48
- matched = 1;
49
- if(!found && j > previous_index){
50
- previous_index = j;
51
- found = 1;
52
- }
53
- }else if(opt.adj_table && adj_matrix_find(adj_matrix_default(), code_ary_1.ary[i], code_ary_2.ary[j])) sim_matched = 1;
54
- } // for(int j = left; j <= right; j++){
55
- if(matched){
56
- matches++;
57
- if(!found) transpositions++;
58
- }else if(sim_matched) sim_matches += 3;
59
- } // for(int i = 0; i < code_ary_1.length; i++){
60
-
61
- // Don't divide transpositions by 2 since it's been counted directly by above code.
62
- double similarity = matches;
63
- if(opt.adj_table) similarity += sim_matches / 10;
64
- double jaro_distance = matches == 0 ? 0 : (similarity / code_ary_1.length + similarity / code_ary_2.length + (matches - transpositions) / matches) / 3.0;
65
-
66
- // calculate jaro-winkler distance
67
- double threshold = opt.threshold, weight = opt.weight;
68
- int prefix = 0;
69
- int max_length = code_ary_1.length > 4 ? 4 : code_ary_1.length;
70
- for(int i = 0; i < max_length; ++i){
71
- if(code_ary_1.ary[i] == code_ary_2.ary[i]) prefix++;
72
- else break;
73
- }
74
- free(code_ary_1.ary); free(code_ary_2.ary);
75
- return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
76
- }
@@ -1,12 +0,0 @@
1
- #ifndef DISTANCE_H
2
- #define DISTANCE_H 1
3
-
4
- typedef struct{
5
- double weight, threshold;
6
- char ignore_case, adj_table;
7
- } Option;
8
-
9
- double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
10
- Option option_new();
11
-
12
- #endif /* DISTANCE_H */
@@ -1,8 +0,0 @@
1
- #ifndef JARO_WINKLER_H
2
- #define JARO_WINKLER_H 1
3
-
4
- #include "ruby.h"
5
-
6
- VALUE rb_distance(int argc, VALUE *argv, VALUE obj);
7
-
8
- #endif /* JARO_WINKLER_H */