jaro_winkler 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -16
- data/ext/jaro_winkler/adj_matrix.c +8 -4
- data/ext/jaro_winkler/adj_matrix.h +2 -2
- data/ext/jaro_winkler/code.c +29 -0
- data/ext/jaro_winkler/code.h +7 -0
- data/ext/jaro_winkler/jaro.c +103 -0
- data/ext/jaro_winkler/jaro.h +16 -0
- data/ext/jaro_winkler/jaro_winkler.c +13 -12
- data/lib/jaro_winkler/adjusting_table.rb +12 -3
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/adjusting_table_spec.rb +8 -0
- data/spec/jaro_winkler_spec.rb +28 -44
- metadata +9 -8
- data/ext/jaro_winkler/codepoints.c +0 -29
- data/ext/jaro_winkler/codepoints.h +0 -17
- data/ext/jaro_winkler/distance.c +0 -76
- data/ext/jaro_winkler/distance.h +0 -12
- data/ext/jaro_winkler/jaro_winkler.h +0 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d48fca750c919028bdf13a2145bf431919a2c924
|
4
|
+
data.tar.gz: 134da623e4f99a708cf83f7e2e961a8882ecffb8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7476d0dcd726f7d6b2405c861b139f99b22c71e21e48607beab76c1a29a11b6e9398c33943e8bfcff042b18fdea2d66f858283df17bbfd4e351a3ca93ee8f05a
|
7
|
+
data.tar.gz: 1c0a9c5b3521e761c752bc19a09a24052da3a46357195b9a86c86815418fc55bbe090b44db738bf086a44aabb1056f660a1584f1c8c7d01d124faf20623041b4
|
data/README.md
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
[](https://travis-ci.org/tonytonyjan/jaro_winkler)
|
2
2
|
|
3
|
-
# About
|
4
|
-
|
5
3
|
It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby. Both versions support UTF-8 strings.
|
6
4
|
|
7
5
|
# Installation
|
@@ -35,7 +33,7 @@ weight | number | 0.1 | A constant scaling factor for how much the sco
|
|
35
33
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
36
34
|
adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
|
37
35
|
|
38
|
-
#
|
36
|
+
# Adjusting Table
|
39
37
|
|
40
38
|
## Default Table
|
41
39
|
|
@@ -65,15 +63,6 @@ where
|
|
65
63
|
|
66
64
|
- `s` is the number of nonmatching but similar characters.
|
67
65
|
|
68
|
-
## Difference Between v1.3.1 And v1.3.2.beta
|
69
|
-
|
70
|
-
Version | Algorithm
|
71
|
-
----------- | -----------------------------------------------------------------------
|
72
|
-
v1.3.1 | One linked list to store sparse matrix and iterate to find similar character.
|
73
|
-
v1.3.2.beta | One hash table with multiple linked lists for collision handling.
|
74
|
-
|
75
|
-
In theory, the latter should work more efficient than the former (more test data needed).
|
76
|
-
|
77
66
|
# Why This?
|
78
67
|
|
79
68
|
There is another similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which also provides both C and Ruby versions.
|
@@ -108,9 +97,9 @@ str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater |
|
|
108
97
|
- The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
|
109
98
|
- Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
|
110
99
|
|
111
|
-
|
100
|
+
# Benchmark
|
112
101
|
|
113
|
-
|
102
|
+
## Pure Ruby
|
114
103
|
|
115
104
|
| user | system | total | real
|
116
105
|
---------------- | -------- | -------- | -------- | ------------
|
@@ -120,7 +109,7 @@ fuzzystringmatch | 1.510000 | 0.000000 | 1.510000 | ( 1.510136)
|
|
120
109
|
- jaro_winkler (1.3.1)
|
121
110
|
- fuzzy-string-match (0.9.6)
|
122
111
|
|
123
|
-
|
112
|
+
## Native
|
124
113
|
|
125
114
|
| user | system | total | real
|
126
115
|
---------------- | -------- | -------- | -------- | ------------
|
@@ -136,4 +125,5 @@ amatch | 0.960000 | 0.000000 | 0.960000 | ( 0.961509)
|
|
136
125
|
|
137
126
|
# Todo
|
138
127
|
|
139
|
-
- Custom adjusting word table.
|
128
|
+
- Custom adjusting word table.
|
129
|
+
- The algorithms used by the C and Ruby implementations are different.
|
@@ -1,6 +1,7 @@
|
|
1
|
-
#include <stdlib.h>
|
2
1
|
#include "adj_matrix.h"
|
3
|
-
#include "
|
2
|
+
#include "code.h"
|
3
|
+
|
4
|
+
#include <stdlib.h>
|
4
5
|
|
5
6
|
const char *DEFAULT_ADJ_TABLE[] = {
|
6
7
|
"A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
|
@@ -76,8 +77,11 @@ AdjMatrix* adj_matrix_default(){
|
|
76
77
|
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
77
78
|
int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
|
78
79
|
for(int i = 0; i < length; i += 2){
|
79
|
-
|
80
|
-
|
80
|
+
unsigned long long code_1, code_2;
|
81
|
+
int dummy_length;
|
82
|
+
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
|
83
|
+
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
|
84
|
+
adj_matrix_add(ret_matrix, code_1, code_2);
|
81
85
|
}
|
82
86
|
first_time = 0;
|
83
87
|
}
|
@@ -1,5 +1,5 @@
|
|
1
1
|
#ifndef ADJ_MATRIX_H
|
2
|
-
#define ADJ_MATRIX_H
|
2
|
+
#define ADJ_MATRIX_H
|
3
3
|
#define ADJ_MATRIX_DEFAULT_LENGTH 958
|
4
4
|
#define ADJ_MATRIX_SEED 9527
|
5
5
|
|
@@ -19,4 +19,4 @@ char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned
|
|
19
19
|
void adj_matrix_free (AdjMatrix *matrix);
|
20
20
|
AdjMatrix* adj_matrix_default();
|
21
21
|
|
22
|
-
#endif
|
22
|
+
#endif
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
|
4
|
+
/*
 * Turn the UTF-8 sequence starting at str[0] into an opaque integer key.
 *
 * The key is the raw bytes of the sequence packed least-significant byte
 * first (byte k occupies bits 8k..8k+7). It is not the Unicode scalar value;
 * it only has to be equal for equal characters. Packing with shifts instead
 * of memcpy() keeps the key identical on little- and big-endian hosts, so a
 * single ASCII byte always yields its own value.
 *
 * str             - points at the lead byte. Must be readable up to and
 *                   including the first byte that is not a continuation
 *                   byte (a NUL terminator is sufficient).
 * ret_code        - receives the packed key.
 * ret_byte_length - receives the number of bytes consumed (1..6).
 */
void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
  const unsigned char *bytes = (const unsigned char *)str;
  unsigned char first_char = bytes[0];
  int expected;

  if(first_char >= 252)      expected = 6; // 1111110x
  else if(first_char >= 248) expected = 5; // 111110xx
  else if(first_char >= 240) expected = 4; // 11110xxx
  else if(first_char >= 224) expected = 3; // 1110xxxx
  else if(first_char >= 192) expected = 2; // 110xxxxx
  else                       expected = 1;

  unsigned long long code = first_char;
  int consumed = 1;
  // Only genuine continuation bytes (10xxxxxx) belong to this character.
  // Stopping at the first other byte means a truncated or malformed sequence
  // can never make us read past the string's NUL terminator.
  while(consumed < expected && (bytes[consumed] & 0xC0) == 0x80){
    code |= (unsigned long long)bytes[consumed] << (8 * consumed);
    consumed++;
  }

  *ret_code = code;
  *ret_byte_length = consumed;
}
|
15
|
+
|
16
|
+
/*
 * Split a UTF-8 byte string into an array of per-character keys.
 *
 * str        - input bytes; need not be NUL-terminated.
 * length     - number of bytes in str.
 * ret_codes  - receives a calloc()-ed array owned by the caller (release
 *              with free()). Set to NULL if the allocation fails.
 * ret_length - receives the number of keys written to *ret_codes.
 */
void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
  enum { MAX_SEQUENCE_BYTES = 6 }; // longest sequence utf_char_to_code reports

  *ret_length = 0;
  // One slot per input byte is an upper bound on the number of characters.
  // Ask for at least one slot so a NULL result always means "out of memory".
  *ret_codes = calloc(length > 0 ? (size_t)length : 1, sizeof **ret_codes);
  if(!*ret_codes) return;

  for(int i = 0; i < length;){
    int remaining = length - i;
    int byte_length;

    if(remaining >= MAX_SEQUENCE_BYTES){
      utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
    }else{
      // Near the end of the buffer a lead byte may announce more bytes than
      // are left. Decode from a zero-padded copy so the helper never reads
      // beyond `length`.
      char tail[MAX_SEQUENCE_BYTES + 1] = {0};
      memcpy(tail, &str[i], (size_t)remaining);
      utf_char_to_code(tail, &(*ret_codes)[*ret_length], &byte_length);
      if(byte_length > remaining) byte_length = remaining;
    }

    *ret_length += 1;
    i += byte_length;
  }
}
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#include "jaro.h"
|
2
|
+
#include "code.h"
|
3
|
+
#include "adj_matrix.h"
|
4
|
+
|
5
|
+
#include <string.h>
|
6
|
+
#include <stdlib.h>
|
7
|
+
#include <ctype.h>
|
8
|
+
|
9
|
+
double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
|
10
|
+
if(!short_str_len || !long_str_len) return 0.0;
|
11
|
+
|
12
|
+
if(short_str_len > long_str_len){
|
13
|
+
SWAP(short_str, long_str);
|
14
|
+
SWAP(short_str_len, long_str_len);
|
15
|
+
}
|
16
|
+
|
17
|
+
unsigned long long *short_codes, *long_codes;
|
18
|
+
int short_codes_len, long_codes_len;
|
19
|
+
string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
|
20
|
+
string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
|
21
|
+
|
22
|
+
if(opt->ignore_case){
|
23
|
+
for(int i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
|
24
|
+
for(int i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
|
25
|
+
}
|
26
|
+
|
27
|
+
int window_size = long_codes_len/2 - 1;
|
28
|
+
if(window_size < 0) window_size = 0;
|
29
|
+
|
30
|
+
char short_codes_flag[MAX_WORD_LENGTH];
|
31
|
+
char long_codes_flag[MAX_WORD_LENGTH];
|
32
|
+
memset(short_codes_flag, 0, MAX_WORD_LENGTH);
|
33
|
+
memset(long_codes_flag, 0, MAX_WORD_LENGTH);
|
34
|
+
|
35
|
+
// count number of matching characters
|
36
|
+
int match_count = 0;
|
37
|
+
for(int i = 0; i < short_codes_len; i++){
|
38
|
+
int left = (i >= window_size) ? i - window_size : 0;
|
39
|
+
int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
|
40
|
+
if(right > long_codes_len - 1) right = long_codes_len - 1;
|
41
|
+
for(int j = left; j <= right; j++){
|
42
|
+
if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
|
43
|
+
short_codes_flag[i] = long_codes_flag[j] = 1;
|
44
|
+
match_count++;
|
45
|
+
break;
|
46
|
+
}
|
47
|
+
}
|
48
|
+
}
|
49
|
+
if(!match_count){
|
50
|
+
free(short_codes); free(long_codes);
|
51
|
+
return 0.0;
|
52
|
+
}
|
53
|
+
|
54
|
+
// count number of transpositions
|
55
|
+
int transposition_count = 0, j = 0, k = 0;
|
56
|
+
for(int i = 0; i < short_codes_len; i++){
|
57
|
+
if(short_codes_flag[i]){
|
58
|
+
for(j = k; j < long_codes_len; j++){
|
59
|
+
if(long_codes_flag[j]){
|
60
|
+
k = j + 1;
|
61
|
+
break;
|
62
|
+
}
|
63
|
+
}
|
64
|
+
if(short_codes[i] != long_codes[j]) transposition_count++;
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
// count similarities in nonmatched characters
|
69
|
+
int similar_count = 0;
|
70
|
+
if(opt->adj_table && short_codes_len > match_count)
|
71
|
+
for(int i = 0; i < short_codes_len; i++)
|
72
|
+
if(!short_codes_flag[i])
|
73
|
+
for(int j = 0; j < long_codes_len; j++)
|
74
|
+
if(!long_codes_flag[j])
|
75
|
+
if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
|
76
|
+
similar_count += 3;
|
77
|
+
break;
|
78
|
+
}
|
79
|
+
|
80
|
+
// jaro distance
|
81
|
+
double jaro_distance;
|
82
|
+
double m = (double)match_count;
|
83
|
+
double t = (double)(transposition_count/2);
|
84
|
+
if(opt->adj_table) m = similar_count/10.0 + m;
|
85
|
+
jaro_distance = (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
|
86
|
+
|
87
|
+
// jaro winkler distance
|
88
|
+
if(!opt){
|
89
|
+
static LibJaroOption default_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD};
|
90
|
+
opt = &default_opt;
|
91
|
+
}
|
92
|
+
if(jaro_distance < opt->threshold){
|
93
|
+
free(short_codes); free(long_codes);
|
94
|
+
return jaro_distance;
|
95
|
+
}
|
96
|
+
else{
|
97
|
+
int prefix = 0;
|
98
|
+
int max_4 = short_codes_len > 4 ? 4 : short_codes_len;
|
99
|
+
for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
|
100
|
+
free(short_codes); free(long_codes);
|
101
|
+
return jaro_distance + prefix*opt->weight*(1-jaro_distance);
|
102
|
+
}
|
103
|
+
}
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#ifndef LIBJARO_JARO_H
#define LIBJARO_JARO_H

/* Exchange the values of two lvalues of the same type. Arguments are
 * parenthesized and the temporary has its own name so the expansion stays
 * correct for any lvalue expression. Each argument is evaluated more than
 * once, so do not pass expressions with side effects. */
#define SWAP(x, y) do{ __typeof__(x) swap_tmp_ = (x); (x) = (y); (y) = swap_tmp_; }while(0)
#define MAX_WORD_LENGTH 64
/* Default prefix scaling factor and default threshold for the prefix bonus. */
#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7

/* Tuning knobs for jaro_winkler_distance(). */
typedef struct LibJaroOption{
  double weight, threshold;    /* prefix scaling factor; minimum score for the prefix bonus */
  char ignore_case, adj_table; /* non-zero enables case folding / the adjusting table */
} LibJaroOption;

/* Score two byte strings given with their byte lengths. */
double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);

#endif
|
@@ -1,27 +1,28 @@
|
|
1
|
-
#include "
|
2
|
-
#include "
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "jaro.h"
|
3
3
|
|
4
4
|
VALUE rb_mJaroWinkler;
|
5
|
+
VALUE distance(int argc, VALUE *argv, VALUE self);
|
5
6
|
|
6
7
|
void Init_jaro_winkler(void){
|
7
8
|
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
8
|
-
rb_define_module_function(rb_mJaroWinkler, "c_distance",
|
9
|
+
rb_define_module_function(rb_mJaroWinkler, "c_distance", distance, -1);
|
9
10
|
}
|
10
11
|
|
11
|
-
VALUE
|
12
|
+
/*
 * JaroWinkler.c_distance(s1, s2, weight:, threshold:, ignore_case:, adj_table:)
 *
 * Ruby entry point: converts the arguments, fills a LibJaroOption from the
 * optional keyword hash and returns the score as a Float.
 * Raises RuntimeError when the weight exceeds 0.25.
 */
VALUE distance(int argc, VALUE *argv, VALUE self){
  VALUE s1, s2, opt;
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);

  // Convert both arguments up front. The original call read RSTRING_LEN(s1)
  // and StringValuePtr(s1) as arguments of the same call, whose evaluation
  // order is unspecified, so the length could be read from a non-String
  // object before the conversion (or the TypeError) happened.
  StringValue(s1);
  StringValue(s2);

  LibJaroOption c_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
  if(TYPE(opt) == T_HASH){
    VALUE weight      = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
          threshold   = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
          ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
          adj_table   = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
    if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
    if(c_opt.weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
    if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
    // nil leaves the default in place; false turns the flag off; anything
    // else turns it on.
    if(!NIL_P(ignore_case)) c_opt.ignore_case = RTEST(ignore_case) ? 1 : 0;
    if(!NIL_P(adj_table))   c_opt.adj_table   = RTEST(adj_table) ? 1 : 0;
  }

  // Take the pointers last, after every call that may run Ruby code.
  return rb_float_new(jaro_winkler_distance(RSTRING_PTR(s1), RSTRING_LEN(s1), RSTRING_PTR(s2), RSTRING_LEN(s2), &c_opt));
}
|
@@ -1,10 +1,19 @@
|
|
1
1
|
module JaroWinkler
|
2
|
-
DEFAULT_ADJ_TABLE = Hash.new
|
2
|
+
DEFAULT_ADJ_TABLE = Hash.new
|
3
3
|
[
|
4
4
|
['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'],
|
5
5
|
['I', 'U'], ['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
|
6
6
|
['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'],
|
7
7
|
['2', 'Z'], ['5', 'S'], ['8', 'B'], ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
|
8
8
|
['E', ' '], ['Y', ' '], ['S', ' ']
|
9
|
-
].each{ |s1, s2|
|
10
|
-
|
9
|
+
].each{ |s1, s2|
|
10
|
+
if not DEFAULT_ADJ_TABLE.has_key?(s1)
|
11
|
+
DEFAULT_ADJ_TABLE[s1] = Hash.new
|
12
|
+
end
|
13
|
+
if not DEFAULT_ADJ_TABLE.has_key?(s2)
|
14
|
+
DEFAULT_ADJ_TABLE[s2] = Hash.new
|
15
|
+
end
|
16
|
+
DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
|
17
|
+
}
|
18
|
+
DEFAULT_ADJ_TABLE.default = Hash.new
|
19
|
+
end
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -4,40 +4,29 @@ include JaroWinkler
|
|
4
4
|
|
5
5
|
shared_examples 'common' do |strategy|
|
6
6
|
it 'works' do
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
['tony' , 'tonytonyjan' , 0.8727] ,
|
24
|
-
# ['San Francisco' , 'Santa Monica' , 0.8180]
|
25
|
-
]
|
26
|
-
ary.each do |s1, s2, ans|
|
27
|
-
expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
|
28
|
-
end
|
7
|
+
expect(send(strategy, 'henka','henkan')).to be_within(0.0001).of(0.9667)
|
8
|
+
expect(send(strategy, 'al','al')).to be_within(0.0001).of(1.0)
|
9
|
+
expect(send(strategy, 'martha','marhta')).to be_within(0.0001).of(0.9611)
|
10
|
+
expect(send(strategy, 'jones','johnson')).to be_within(0.0001).of(0.8323)
|
11
|
+
expect(send(strategy, 'abcvwxyz','cabvwxyz')).to be_within(0.0001).of(0.9583)
|
12
|
+
expect(send(strategy, 'dwayne','duane')).to be_within(0.0001).of(0.8400)
|
13
|
+
expect(send(strategy, 'dixon','dicksonx')).to be_within(0.0001).of(0.8133)
|
14
|
+
expect(send(strategy, 'fvie','ten')).to be_within(0.0001).of(0.0)
|
15
|
+
expect(send(strategy, 'tony','tony')).to be_within(0.0001).of(1.0)
|
16
|
+
expect(send(strategy, 'tonytonyjan','tonytonyjan')).to be_within(0.0001).of(1.0)
|
17
|
+
expect(send(strategy, 'x','x')).to be_within(0.0001).of(1.0)
|
18
|
+
expect(send(strategy, '','')).to be_within(0.0001).of(0.0)
|
19
|
+
expect(send(strategy, 'tony','')).to be_within(0.0001).of(0.0)
|
20
|
+
expect(send(strategy, '','tony')).to be_within(0.0001).of(0.0)
|
21
|
+
expect(send(strategy, 'tonytonyjan','tony')).to be_within(0.0001).of(0.8727)
|
22
|
+
expect(send(strategy, 'tony','tonytonyjan')).to be_within(0.0001).of(0.8727)
|
29
23
|
end
|
30
24
|
|
31
25
|
it 'works with UTF-8' do
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
['良い' , 'いい' , 0.6666] ,
|
37
|
-
]
|
38
|
-
ary.each do |s1, s2, ans|
|
39
|
-
expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
|
40
|
-
end
|
26
|
+
expect(send(strategy, '變形金剛4:絕跡重生','變形金剛4: 絕跡重生')).to be_within(0.0001).of(0.9818)
|
27
|
+
expect(send(strategy, '連勝文','連勝丼')).to be_within(0.0001).of(0.8222)
|
28
|
+
expect(send(strategy, '馬英九','馬英丸')).to be_within(0.0001).of(0.8222)
|
29
|
+
expect(send(strategy, '良い','いい')).to be_within(0.0001).of(0.6666)
|
41
30
|
end
|
42
31
|
|
43
32
|
it 'sets ignore_case' do
|
@@ -54,19 +43,14 @@ shared_examples 'common' do |strategy|
|
|
54
43
|
|
55
44
|
|
56
45
|
it 'works with adjusting table' do
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
['FVIE' , 'TEN' , 0.0 ]
|
66
|
-
]
|
67
|
-
ary.each do |s1, s2, ans|
|
68
|
-
expect(send(strategy, s1, s2, adj_table: true)).to be_within(0.0001).of(ans)
|
69
|
-
end
|
46
|
+
expect(send(strategy, 'HENKA', 'HENKAN', adj_table: true)).to be_within(0.0001).of(0.9667) # m=5, t=0, s=0
|
47
|
+
expect(send(strategy, 'AL', 'AL', adj_table: true)).to be_within(0.0001).of(1.0) # m=2, t=0, s=0
|
48
|
+
expect(send(strategy, 'MARTHA', 'MARHTA', adj_table: true)).to be_within(0.0001).of(0.9611) # m=6, t=1, s=0
|
49
|
+
expect(send(strategy, 'JONES', 'JOHNSON', adj_table: true)).to be_within(0.0001).of(0.8598) # m=4, t=0, s=3
|
50
|
+
expect(send(strategy, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true)).to be_within(0.0001).of(0.9583) # m=8, t=1, s=0
|
51
|
+
expect(send(strategy, 'DWAYNE', 'DUANE', adj_table: true)).to be_within(0.0001).of(0.8730) # m=4, t=0, s=3
|
52
|
+
expect(send(strategy, 'DIXON', 'DICKSONX', adj_table: true)).to be_within(0.0001).of(0.8393) # m=4, t=0, s=3
|
53
|
+
expect(send(strategy, 'FVIE', 'TEN', adj_table: true)).to be_within(0.0001).of(0.0)
|
70
54
|
end
|
71
55
|
|
72
56
|
context 'with weight exceeding 0.25' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -131,19 +131,19 @@ files:
|
|
131
131
|
- benchmark/pure.txt
|
132
132
|
- ext/jaro_winkler/adj_matrix.c
|
133
133
|
- ext/jaro_winkler/adj_matrix.h
|
134
|
-
- ext/jaro_winkler/
|
135
|
-
- ext/jaro_winkler/
|
136
|
-
- ext/jaro_winkler/distance.c
|
137
|
-
- ext/jaro_winkler/distance.h
|
134
|
+
- ext/jaro_winkler/code.c
|
135
|
+
- ext/jaro_winkler/code.h
|
138
136
|
- ext/jaro_winkler/extconf.rb
|
137
|
+
- ext/jaro_winkler/jaro.c
|
138
|
+
- ext/jaro_winkler/jaro.h
|
139
139
|
- ext/jaro_winkler/jaro_winkler.c
|
140
|
-
- ext/jaro_winkler/jaro_winkler.h
|
141
140
|
- ext/jaro_winkler/murmur_hash2.c
|
142
141
|
- jaro_winkler.gemspec
|
143
142
|
- lib/jaro_winkler.rb
|
144
143
|
- lib/jaro_winkler/adjusting_table.rb
|
145
144
|
- lib/jaro_winkler/fallback.rb
|
146
145
|
- lib/jaro_winkler/version.rb
|
146
|
+
- spec/adjusting_table_spec.rb
|
147
147
|
- spec/jaro_winkler_spec.rb
|
148
148
|
- spec/spec_helper.rb
|
149
149
|
homepage: https://github.com/tonytonyjan/jaro_winkler
|
@@ -166,11 +166,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
166
|
version: '0'
|
167
167
|
requirements: []
|
168
168
|
rubyforge_project:
|
169
|
-
rubygems_version: 2.4.
|
169
|
+
rubygems_version: 2.4.6
|
170
170
|
signing_key:
|
171
171
|
specification_version: 4
|
172
172
|
summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
|
173
173
|
UTF-8 string.
|
174
174
|
test_files:
|
175
|
+
- spec/adjusting_table_spec.rb
|
175
176
|
- spec/jaro_winkler_spec.rb
|
176
177
|
- spec/spec_helper.rb
|
@@ -1,29 +0,0 @@
|
|
1
|
-
#include <string.h>
|
2
|
-
#include <stdlib.h>
|
3
|
-
#include "codepoints.h"
|
4
|
-
|
5
|
-
UnicodeHash unicode_hash_new(const char *str){
|
6
|
-
UnicodeHash ret = {};
|
7
|
-
unsigned char first_char = str[0];
|
8
|
-
if(first_char >= 252) ret.byte_length = 6; // 1111110x
|
9
|
-
else if(first_char >= 248) ret.byte_length = 5; // 111110xx
|
10
|
-
else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
|
11
|
-
else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
|
12
|
-
else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
|
13
|
-
else ret.byte_length = 1;
|
14
|
-
memcpy(&ret.code, str, ret.byte_length);
|
15
|
-
return ret;
|
16
|
-
}
|
17
|
-
|
18
|
-
Codepoints codepoints_new(const char *str, int byte_len){
|
19
|
-
Codepoints ret = {};
|
20
|
-
ret.ary = malloc(byte_len * sizeof(long long));
|
21
|
-
ret.length = 0;
|
22
|
-
for(int i = 0; i < byte_len;){
|
23
|
-
UnicodeHash hash = unicode_hash_new(str + i);
|
24
|
-
ret.ary[ret.length] = hash.code;
|
25
|
-
ret.length++;
|
26
|
-
i += hash.byte_length;
|
27
|
-
}
|
28
|
-
return ret;
|
29
|
-
}
|
@@ -1,17 +0,0 @@
|
|
1
|
-
#ifndef CODEPOINTS_H
|
2
|
-
#define CODEPOINTS_H 1
|
3
|
-
|
4
|
-
typedef struct{
|
5
|
-
unsigned long long code;
|
6
|
-
unsigned int byte_length;
|
7
|
-
} UnicodeHash;
|
8
|
-
|
9
|
-
typedef struct{
|
10
|
-
unsigned long long *ary;
|
11
|
-
int length;
|
12
|
-
} Codepoints;
|
13
|
-
|
14
|
-
UnicodeHash unicode_hash_new(const char *str);
|
15
|
-
Codepoints codepoints_new (const char *str, int byte_len);
|
16
|
-
|
17
|
-
#endif /* CODEPOINTS_H */
|
data/ext/jaro_winkler/distance.c
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
#include <stdlib.h>
|
2
|
-
#include <ctype.h>
|
3
|
-
#include "distance.h"
|
4
|
-
#include "codepoints.h"
|
5
|
-
#include "adj_matrix.h"
|
6
|
-
|
7
|
-
Option option_new(){
|
8
|
-
Option opt;
|
9
|
-
opt.ignore_case = opt.adj_table = 0;
|
10
|
-
opt.weight = 0.1;
|
11
|
-
opt.threshold = 0.7;
|
12
|
-
return opt;
|
13
|
-
}
|
14
|
-
|
15
|
-
double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
|
16
|
-
Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len),
|
17
|
-
code_ary_2 = codepoints_new(s2, s2_byte_len);
|
18
|
-
|
19
|
-
if(opt.ignore_case){
|
20
|
-
for(int i = 0; i < code_ary_1.length; ++i) if(code_ary_1.ary[i] < 256 && islower(code_ary_1.ary[i])) code_ary_1.ary[i] -= 32;
|
21
|
-
for(int i = 0; i < code_ary_2.length; ++i) if(code_ary_2.ary[i] < 256 && islower(code_ary_2.ary[i])) code_ary_2.ary[i] -= 32;
|
22
|
-
}
|
23
|
-
|
24
|
-
// Guarantee the order
|
25
|
-
if(code_ary_1.length > code_ary_2.length){
|
26
|
-
unsigned long long *tmp = code_ary_1.ary; code_ary_1.ary = code_ary_2.ary; code_ary_2.ary = tmp;
|
27
|
-
int tmp2 = code_ary_1.length; code_ary_1.length = code_ary_2.length; code_ary_2.length = tmp2;
|
28
|
-
}
|
29
|
-
|
30
|
-
// Compute jaro distance
|
31
|
-
int window_size = code_ary_2.length / 2 - 1;
|
32
|
-
if(window_size < 0) window_size = 0;
|
33
|
-
double matches = 0.0,
|
34
|
-
sim_matches = 0.0;
|
35
|
-
int transpositions = 0,
|
36
|
-
previous_index = -1,
|
37
|
-
max_index = code_ary_2.length - 1;
|
38
|
-
for(int i = 0; i < code_ary_1.length; i++){
|
39
|
-
int left = i - window_size;
|
40
|
-
int right = i + window_size;
|
41
|
-
if(left < 0) left = 0;
|
42
|
-
if(right > max_index) right = max_index;
|
43
|
-
char matched = 0,
|
44
|
-
found = 0,
|
45
|
-
sim_matched = 0;
|
46
|
-
for(int j = left; j <= right; j++){
|
47
|
-
if(code_ary_1.ary[i] == code_ary_2.ary[j]){
|
48
|
-
matched = 1;
|
49
|
-
if(!found && j > previous_index){
|
50
|
-
previous_index = j;
|
51
|
-
found = 1;
|
52
|
-
}
|
53
|
-
}else if(opt.adj_table && adj_matrix_find(adj_matrix_default(), code_ary_1.ary[i], code_ary_2.ary[j])) sim_matched = 1;
|
54
|
-
} // for(int j = left; j <= right; j++){
|
55
|
-
if(matched){
|
56
|
-
matches++;
|
57
|
-
if(!found) transpositions++;
|
58
|
-
}else if(sim_matched) sim_matches += 3;
|
59
|
-
} // for(int i = 0; i < code_ary_1.length; i++){
|
60
|
-
|
61
|
-
// Don't divide transpositions by 2 since it's been counted directly by above code.
|
62
|
-
double similarity = matches;
|
63
|
-
if(opt.adj_table) similarity += sim_matches / 10;
|
64
|
-
double jaro_distance = matches == 0 ? 0 : (similarity / code_ary_1.length + similarity / code_ary_2.length + (matches - transpositions) / matches) / 3.0;
|
65
|
-
|
66
|
-
// calculate jaro-winkler distance
|
67
|
-
double threshold = opt.threshold, weight = opt.weight;
|
68
|
-
int prefix = 0;
|
69
|
-
int max_length = code_ary_1.length > 4 ? 4 : code_ary_1.length;
|
70
|
-
for(int i = 0; i < max_length; ++i){
|
71
|
-
if(code_ary_1.ary[i] == code_ary_2.ary[i]) prefix++;
|
72
|
-
else break;
|
73
|
-
}
|
74
|
-
free(code_ary_1.ary); free(code_ary_2.ary);
|
75
|
-
return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
|
76
|
-
}
|
data/ext/jaro_winkler/distance.h
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
#ifndef DISTANCE_H
|
2
|
-
#define DISTANCE_H 1
|
3
|
-
|
4
|
-
typedef struct{
|
5
|
-
double weight, threshold;
|
6
|
-
char ignore_case, adj_table;
|
7
|
-
} Option;
|
8
|
-
|
9
|
-
double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
|
10
|
-
Option option_new();
|
11
|
-
|
12
|
-
#endif /* DISTANCE_H */
|