jaro_winkler 1.3.5 → 1.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -16
- data/ext/jaro_winkler/adj_matrix.c +8 -4
- data/ext/jaro_winkler/adj_matrix.h +2 -2
- data/ext/jaro_winkler/code.c +29 -0
- data/ext/jaro_winkler/code.h +7 -0
- data/ext/jaro_winkler/jaro.c +103 -0
- data/ext/jaro_winkler/jaro.h +16 -0
- data/ext/jaro_winkler/jaro_winkler.c +13 -12
- data/lib/jaro_winkler/adjusting_table.rb +12 -3
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/adjusting_table_spec.rb +8 -0
- data/spec/jaro_winkler_spec.rb +28 -44
- metadata +9 -8
- data/ext/jaro_winkler/codepoints.c +0 -29
- data/ext/jaro_winkler/codepoints.h +0 -17
- data/ext/jaro_winkler/distance.c +0 -76
- data/ext/jaro_winkler/distance.h +0 -12
- data/ext/jaro_winkler/jaro_winkler.h +0 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d48fca750c919028bdf13a2145bf431919a2c924
|
4
|
+
data.tar.gz: 134da623e4f99a708cf83f7e2e961a8882ecffb8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7476d0dcd726f7d6b2405c861b139f99b22c71e21e48607beab76c1a29a11b6e9398c33943e8bfcff042b18fdea2d66f858283df17bbfd4e351a3ca93ee8f05a
|
7
|
+
data.tar.gz: 1c0a9c5b3521e761c752bc19a09a24052da3a46357195b9a86c86815418fc55bbe090b44db738bf086a44aabb1056f660a1584f1c8c7d01d124faf20623041b4
|
data/README.md
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
[![Build Status](https://travis-ci.org/tonytonyjan/jaro_winkler.svg?branch=master)](https://travis-ci.org/tonytonyjan/jaro_winkler)
|
2
2
|
|
3
|
-
# About
|
4
|
-
|
5
3
|
It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby. Both implementations support UTF-8 strings.
|
6
4
|
|
7
5
|
# Installation
|
@@ -35,7 +33,7 @@ weight | number | 0.1 | A constant scaling factor for how much the sco
|
|
35
33
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
36
34
|
adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
|
37
35
|
|
38
|
-
#
|
36
|
+
# Adjusting Table
|
39
37
|
|
40
38
|
## Default Table
|
41
39
|
|
@@ -65,15 +63,6 @@ where
|
|
65
63
|
|
66
64
|
- `s` is the number of nonmatching but similar characters.
|
67
65
|
|
68
|
-
## Difference Between v1.3.1 And v1.3.2.beta
|
69
|
-
|
70
|
-
Version | Algorithm
|
71
|
-
----------- | -----------------------------------------------------------------------
|
72
|
-
v1.3.1 | One linked list to store sparse matrix and iterate to find similar character.
|
73
|
-
v1.3.2.beta | One hash table with multiple linked lists for collision handling.
|
74
|
-
|
75
|
-
In theory, the latter should work more efficiently than the former (more test data needed).
|
76
|
-
|
77
66
|
# Why This?
|
78
67
|
|
79
68
|
There is another similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which also provides both C and Ruby versions.
|
@@ -108,9 +97,9 @@ str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater |
|
|
108
97
|
- The origin result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
|
109
98
|
- Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
|
110
99
|
|
111
|
-
|
100
|
+
# Benchmark
|
112
101
|
|
113
|
-
|
102
|
+
## Pure Ruby
|
114
103
|
|
115
104
|
| user | system | total | real
|
116
105
|
---------------- | -------- | -------- | -------- | ------------
|
@@ -120,7 +109,7 @@ fuzzystringmatch | 1.510000 | 0.000000 | 1.510000 | ( 1.510136)
|
|
120
109
|
- jaro_winkler (1.3.1)
|
121
110
|
- fuzzy-string-match (0.9.6)
|
122
111
|
|
123
|
-
|
112
|
+
## Native
|
124
113
|
|
125
114
|
| user | system | total | real
|
126
115
|
---------------- | -------- | -------- | -------- | ------------
|
@@ -136,4 +125,5 @@ amatch | 0.960000 | 0.000000 | 0.960000 | ( 0.961509)
|
|
136
125
|
|
137
126
|
# Todo
|
138
127
|
|
139
|
-
- Custom adjusting word table.
|
128
|
+
- Custom adjusting word table.
|
129
|
+
- The algorithms used by the C and Ruby implementations are different.
|
@@ -1,6 +1,7 @@
|
|
1
|
-
#include <stdlib.h>
|
2
1
|
#include "adj_matrix.h"
|
3
|
-
#include "
|
2
|
+
#include "code.h"
|
3
|
+
|
4
|
+
#include <stdlib.h>
|
4
5
|
|
5
6
|
const char *DEFAULT_ADJ_TABLE[] = {
|
6
7
|
"A","E", "A","I", "A","O", "A","U", "B","V", "E","I", "E","O", "E","U", "I","O", "I","U", "O","U",
|
@@ -76,8 +77,11 @@ AdjMatrix* adj_matrix_default(){
|
|
76
77
|
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
|
77
78
|
int length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
|
78
79
|
for(int i = 0; i < length; i += 2){
|
79
|
-
|
80
|
-
|
80
|
+
unsigned long long code_1, code_2;
|
81
|
+
int dummy_length;
|
82
|
+
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i], &code_1, &dummy_length);
|
83
|
+
utf_char_to_code((char*)DEFAULT_ADJ_TABLE[i+1], &code_2, &dummy_length);
|
84
|
+
adj_matrix_add(ret_matrix, code_1, code_2);
|
81
85
|
}
|
82
86
|
first_time = 0;
|
83
87
|
}
|
@@ -1,5 +1,5 @@
|
|
1
1
|
#ifndef ADJ_MATRIX_H
|
2
|
-
#define ADJ_MATRIX_H
|
2
|
+
#define ADJ_MATRIX_H
|
3
3
|
#define ADJ_MATRIX_DEFAULT_LENGTH 958
|
4
4
|
#define ADJ_MATRIX_SEED 9527
|
5
5
|
|
@@ -19,4 +19,4 @@ char adj_matrix_find (AdjMatrix *matrix, unsigned long long x, unsigned
|
|
19
19
|
void adj_matrix_free (AdjMatrix *matrix);
|
20
20
|
AdjMatrix* adj_matrix_default();
|
21
21
|
|
22
|
-
#endif
|
22
|
+
#endif
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
|
4
|
+
void utf_char_to_code(char *str, unsigned long long *ret_code, int *ret_byte_length){
|
5
|
+
unsigned char first_char = str[0];
|
6
|
+
if(first_char >= 252) *ret_byte_length = 6; // 1111110x
|
7
|
+
else if(first_char >= 248) *ret_byte_length = 5; // 111110xx
|
8
|
+
else if(first_char >= 240) *ret_byte_length = 4; // 11110xxx
|
9
|
+
else if(first_char >= 224) *ret_byte_length = 3; // 1110xxxx
|
10
|
+
else if(first_char >= 192) *ret_byte_length = 2; // 110xxxxx
|
11
|
+
else *ret_byte_length = 1;
|
12
|
+
*ret_code = 0;
|
13
|
+
memcpy(ret_code, str, *ret_byte_length);
|
14
|
+
}
|
15
|
+
|
16
|
+
void string_to_codes(char *str, int length, unsigned long long **ret_codes, int *ret_length){
|
17
|
+
unsigned int code;
|
18
|
+
char byte_length;
|
19
|
+
|
20
|
+
*ret_codes = calloc(length, sizeof(long long));
|
21
|
+
*ret_length = 0;
|
22
|
+
|
23
|
+
for(int i = 0; i < length;){
|
24
|
+
int byte_length;
|
25
|
+
utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length);
|
26
|
+
*ret_length += 1;
|
27
|
+
i += byte_length;
|
28
|
+
}
|
29
|
+
}
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#include "jaro.h"
|
2
|
+
#include "code.h"
|
3
|
+
#include "adj_matrix.h"
|
4
|
+
|
5
|
+
#include <string.h>
|
6
|
+
#include <stdlib.h>
|
7
|
+
#include <ctype.h>
|
8
|
+
|
9
|
+
double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
|
10
|
+
if(!short_str_len || !long_str_len) return 0.0;
|
11
|
+
|
12
|
+
if(short_str_len > long_str_len){
|
13
|
+
SWAP(short_str, long_str);
|
14
|
+
SWAP(short_str_len, long_str_len);
|
15
|
+
}
|
16
|
+
|
17
|
+
unsigned long long *short_codes, *long_codes;
|
18
|
+
int short_codes_len, long_codes_len;
|
19
|
+
string_to_codes(short_str, short_str_len, &short_codes, &short_codes_len);
|
20
|
+
string_to_codes(long_str, long_str_len, &long_codes, &long_codes_len);
|
21
|
+
|
22
|
+
if(opt->ignore_case){
|
23
|
+
for(int i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
|
24
|
+
for(int i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
|
25
|
+
}
|
26
|
+
|
27
|
+
int window_size = long_codes_len/2 - 1;
|
28
|
+
if(window_size < 0) window_size = 0;
|
29
|
+
|
30
|
+
char short_codes_flag[MAX_WORD_LENGTH];
|
31
|
+
char long_codes_flag[MAX_WORD_LENGTH];
|
32
|
+
memset(short_codes_flag, 0, MAX_WORD_LENGTH);
|
33
|
+
memset(long_codes_flag, 0, MAX_WORD_LENGTH);
|
34
|
+
|
35
|
+
// count number of matching characters
|
36
|
+
int match_count = 0;
|
37
|
+
for(int i = 0; i < short_codes_len; i++){
|
38
|
+
int left = (i >= window_size) ? i - window_size : 0;
|
39
|
+
int right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
|
40
|
+
if(right > long_codes_len - 1) right = long_codes_len - 1;
|
41
|
+
for(int j = left; j <= right; j++){
|
42
|
+
if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
|
43
|
+
short_codes_flag[i] = long_codes_flag[j] = 1;
|
44
|
+
match_count++;
|
45
|
+
break;
|
46
|
+
}
|
47
|
+
}
|
48
|
+
}
|
49
|
+
if(!match_count){
|
50
|
+
free(short_codes); free(long_codes);
|
51
|
+
return 0.0;
|
52
|
+
}
|
53
|
+
|
54
|
+
// count number of transpositions
|
55
|
+
int transposition_count = 0, j = 0, k = 0;
|
56
|
+
for(int i = 0; i < short_codes_len; i++){
|
57
|
+
if(short_codes_flag[i]){
|
58
|
+
for(j = k; j < long_codes_len; j++){
|
59
|
+
if(long_codes_flag[j]){
|
60
|
+
k = j + 1;
|
61
|
+
break;
|
62
|
+
}
|
63
|
+
}
|
64
|
+
if(short_codes[i] != long_codes[j]) transposition_count++;
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
// count similarities in nonmatched characters
|
69
|
+
int similar_count = 0;
|
70
|
+
if(opt->adj_table && short_codes_len > match_count)
|
71
|
+
for(int i = 0; i < short_codes_len; i++)
|
72
|
+
if(!short_codes_flag[i])
|
73
|
+
for(int j = 0; j < long_codes_len; j++)
|
74
|
+
if(!long_codes_flag[j])
|
75
|
+
if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
|
76
|
+
similar_count += 3;
|
77
|
+
break;
|
78
|
+
}
|
79
|
+
|
80
|
+
// jaro distance
|
81
|
+
double jaro_distance;
|
82
|
+
double m = (double)match_count;
|
83
|
+
double t = (double)(transposition_count/2);
|
84
|
+
if(opt->adj_table) m = similar_count/10.0 + m;
|
85
|
+
jaro_distance = (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
|
86
|
+
|
87
|
+
// jaro winkler distance
|
88
|
+
if(!opt){
|
89
|
+
static LibJaroOption default_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD};
|
90
|
+
opt = &default_opt;
|
91
|
+
}
|
92
|
+
if(jaro_distance < opt->threshold){
|
93
|
+
free(short_codes); free(long_codes);
|
94
|
+
return jaro_distance;
|
95
|
+
}
|
96
|
+
else{
|
97
|
+
int prefix = 0;
|
98
|
+
int max_4 = short_codes_len > 4 ? 4 : short_codes_len;
|
99
|
+
for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
|
100
|
+
free(short_codes); free(long_codes);
|
101
|
+
return jaro_distance + prefix*opt->weight*(1-jaro_distance);
|
102
|
+
}
|
103
|
+
}
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#ifndef LIBJARO_JARO_H
|
2
|
+
#define LIBJARO_JARO_H
|
3
|
+
|
4
|
+
#define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
|
5
|
+
#define MAX_WORD_LENGTH 64
|
6
|
+
#define DEFAULT_WEIGHT 0.1
|
7
|
+
#define DEFAULT_THRESHOLD 0.7
|
8
|
+
|
9
|
+
typedef struct LibJaroOption{
|
10
|
+
double weight, threshold;
|
11
|
+
char ignore_case, adj_table;
|
12
|
+
} LibJaroOption;
|
13
|
+
|
14
|
+
double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
|
15
|
+
|
16
|
+
#endif
|
@@ -1,27 +1,28 @@
|
|
1
|
-
#include "
|
2
|
-
#include "
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "jaro.h"
|
3
3
|
|
4
4
|
VALUE rb_mJaroWinkler;
|
5
|
+
VALUE distance(int argc, VALUE *argv, VALUE self);
|
5
6
|
|
6
7
|
void Init_jaro_winkler(void){
|
7
8
|
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
8
|
-
rb_define_module_function(rb_mJaroWinkler, "c_distance",
|
9
|
+
rb_define_module_function(rb_mJaroWinkler, "c_distance", distance, -1);
|
9
10
|
}
|
10
11
|
|
11
|
-
VALUE
|
12
|
+
VALUE distance(int argc, VALUE *argv, VALUE self){
|
12
13
|
VALUE s1, s2, opt;
|
13
14
|
rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
|
14
|
-
|
15
|
+
LibJaroOption c_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
|
15
16
|
if(TYPE(opt) == T_HASH){
|
16
|
-
VALUE weight
|
17
|
-
threshold
|
17
|
+
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
|
18
|
+
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
|
18
19
|
ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
|
19
|
-
adj_table
|
20
|
+
adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
|
20
21
|
if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
|
21
22
|
if(c_opt.weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
|
22
|
-
if(!NIL_P(threshold)) c_opt.threshold
|
23
|
-
if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case)
|
24
|
-
if(!NIL_P(adj_table)) c_opt.adj_table
|
23
|
+
if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
|
24
|
+
if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
25
|
+
if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
|
25
26
|
}
|
26
|
-
return rb_float_new(
|
27
|
+
return rb_float_new(jaro_winkler_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
|
27
28
|
}
|
@@ -1,10 +1,19 @@
|
|
1
1
|
module JaroWinkler
|
2
|
-
DEFAULT_ADJ_TABLE = Hash.new
|
2
|
+
DEFAULT_ADJ_TABLE = Hash.new
|
3
3
|
[
|
4
4
|
['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'],
|
5
5
|
['I', 'U'], ['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'],
|
6
6
|
['S', 'Z'], ['X', 'S'], ['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'],
|
7
7
|
['2', 'Z'], ['5', 'S'], ['8', 'B'], ['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'],
|
8
8
|
['E', ' '], ['Y', ' '], ['S', ' ']
|
9
|
-
].each{ |s1, s2|
|
10
|
-
|
9
|
+
].each{ |s1, s2|
|
10
|
+
if not DEFAULT_ADJ_TABLE.has_key?(s1)
|
11
|
+
DEFAULT_ADJ_TABLE[s1] = Hash.new
|
12
|
+
end
|
13
|
+
if not DEFAULT_ADJ_TABLE.has_key?(s2)
|
14
|
+
DEFAULT_ADJ_TABLE[s2] = Hash.new
|
15
|
+
end
|
16
|
+
DEFAULT_ADJ_TABLE[s1][s2] = DEFAULT_ADJ_TABLE[s2][s1] = true
|
17
|
+
}
|
18
|
+
DEFAULT_ADJ_TABLE.default = Hash.new
|
19
|
+
end
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -4,40 +4,29 @@ include JaroWinkler
|
|
4
4
|
|
5
5
|
shared_examples 'common' do |strategy|
|
6
6
|
it 'works' do
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
['tony' , 'tonytonyjan' , 0.8727] ,
|
24
|
-
# ['San Francisco' , 'Santa Monica' , 0.8180]
|
25
|
-
]
|
26
|
-
ary.each do |s1, s2, ans|
|
27
|
-
expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
|
28
|
-
end
|
7
|
+
expect(send(strategy, 'henka','henkan')).to be_within(0.0001).of(0.9667)
|
8
|
+
expect(send(strategy, 'al','al')).to be_within(0.0001).of(1.0)
|
9
|
+
expect(send(strategy, 'martha','marhta')).to be_within(0.0001).of(0.9611)
|
10
|
+
expect(send(strategy, 'jones','johnson')).to be_within(0.0001).of(0.8323)
|
11
|
+
expect(send(strategy, 'abcvwxyz','cabvwxyz')).to be_within(0.0001).of(0.9583)
|
12
|
+
expect(send(strategy, 'dwayne','duane')).to be_within(0.0001).of(0.8400)
|
13
|
+
expect(send(strategy, 'dixon','dicksonx')).to be_within(0.0001).of(0.8133)
|
14
|
+
expect(send(strategy, 'fvie','ten')).to be_within(0.0001).of(0.0)
|
15
|
+
expect(send(strategy, 'tony','tony')).to be_within(0.0001).of(1.0)
|
16
|
+
expect(send(strategy, 'tonytonyjan','tonytonyjan')).to be_within(0.0001).of(1.0)
|
17
|
+
expect(send(strategy, 'x','x')).to be_within(0.0001).of(1.0)
|
18
|
+
expect(send(strategy, '','')).to be_within(0.0001).of(0.0)
|
19
|
+
expect(send(strategy, 'tony','')).to be_within(0.0001).of(0.0)
|
20
|
+
expect(send(strategy, '','tony')).to be_within(0.0001).of(0.0)
|
21
|
+
expect(send(strategy, 'tonytonyjan','tony')).to be_within(0.0001).of(0.8727)
|
22
|
+
expect(send(strategy, 'tony','tonytonyjan')).to be_within(0.0001).of(0.8727)
|
29
23
|
end
|
30
24
|
|
31
25
|
it 'works with UTF-8' do
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
['良い' , 'いい' , 0.6666] ,
|
37
|
-
]
|
38
|
-
ary.each do |s1, s2, ans|
|
39
|
-
expect(send(strategy, s1, s2)).to be_within(0.0001).of(ans)
|
40
|
-
end
|
26
|
+
expect(send(strategy, '變形金剛4:絕跡重生','變形金剛4: 絕跡重生')).to be_within(0.0001).of(0.9818)
|
27
|
+
expect(send(strategy, '連勝文','連勝丼')).to be_within(0.0001).of(0.8222)
|
28
|
+
expect(send(strategy, '馬英九','馬英丸')).to be_within(0.0001).of(0.8222)
|
29
|
+
expect(send(strategy, '良い','いい')).to be_within(0.0001).of(0.6666)
|
41
30
|
end
|
42
31
|
|
43
32
|
it 'sets ignore_case' do
|
@@ -54,19 +43,14 @@ shared_examples 'common' do |strategy|
|
|
54
43
|
|
55
44
|
|
56
45
|
it 'works with adjusting table' do
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
['FVIE' , 'TEN' , 0.0 ]
|
66
|
-
]
|
67
|
-
ary.each do |s1, s2, ans|
|
68
|
-
expect(send(strategy, s1, s2, adj_table: true)).to be_within(0.0001).of(ans)
|
69
|
-
end
|
46
|
+
expect(send(strategy, 'HENKA', 'HENKAN', adj_table: true)).to be_within(0.0001).of(0.9667) # m=5, t=0, s=0
|
47
|
+
expect(send(strategy, 'AL', 'AL', adj_table: true)).to be_within(0.0001).of(1.0) # m=2, t=0, s=0
|
48
|
+
expect(send(strategy, 'MARTHA', 'MARHTA', adj_table: true)).to be_within(0.0001).of(0.9611) # m=6, t=1, s=0
|
49
|
+
expect(send(strategy, 'JONES', 'JOHNSON', adj_table: true)).to be_within(0.0001).of(0.8598) # m=4, t=0, s=3
|
50
|
+
expect(send(strategy, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true)).to be_within(0.0001).of(0.9583) # m=8, t=1, s=0
|
51
|
+
expect(send(strategy, 'DWAYNE', 'DUANE', adj_table: true)).to be_within(0.0001).of(0.8730) # m=4, t=0, s=3
|
52
|
+
expect(send(strategy, 'DIXON', 'DICKSONX', adj_table: true)).to be_within(0.0001).of(0.8393) # m=4, t=0, s=3
|
53
|
+
expect(send(strategy, 'FVIE', 'TEN', adj_table: true)).to be_within(0.0001).of(0.0)
|
70
54
|
end
|
71
55
|
|
72
56
|
context 'with weight exceeding 0.25' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.5
|
4
|
+
version: 1.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -131,19 +131,19 @@ files:
|
|
131
131
|
- benchmark/pure.txt
|
132
132
|
- ext/jaro_winkler/adj_matrix.c
|
133
133
|
- ext/jaro_winkler/adj_matrix.h
|
134
|
-
- ext/jaro_winkler/codepoints.c
|
135
|
-
- ext/jaro_winkler/codepoints.h
|
136
|
-
- ext/jaro_winkler/distance.c
|
137
|
-
- ext/jaro_winkler/distance.h
|
134
|
+
- ext/jaro_winkler/code.c
|
135
|
+
- ext/jaro_winkler/code.h
|
138
136
|
- ext/jaro_winkler/extconf.rb
|
137
|
+
- ext/jaro_winkler/jaro.c
|
138
|
+
- ext/jaro_winkler/jaro.h
|
139
139
|
- ext/jaro_winkler/jaro_winkler.c
|
140
|
-
- ext/jaro_winkler/jaro_winkler.h
|
141
140
|
- ext/jaro_winkler/murmur_hash2.c
|
142
141
|
- jaro_winkler.gemspec
|
143
142
|
- lib/jaro_winkler.rb
|
144
143
|
- lib/jaro_winkler/adjusting_table.rb
|
145
144
|
- lib/jaro_winkler/fallback.rb
|
146
145
|
- lib/jaro_winkler/version.rb
|
146
|
+
- spec/adjusting_table_spec.rb
|
147
147
|
- spec/jaro_winkler_spec.rb
|
148
148
|
- spec/spec_helper.rb
|
149
149
|
homepage: https://github.com/tonytonyjan/jaro_winkler
|
@@ -166,11 +166,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
166
|
version: '0'
|
167
167
|
requirements: []
|
168
168
|
rubyforge_project:
|
169
|
-
rubygems_version: 2.4.
|
169
|
+
rubygems_version: 2.4.6
|
170
170
|
signing_key:
|
171
171
|
specification_version: 4
|
172
172
|
summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
|
173
173
|
UTF-8 string.
|
174
174
|
test_files:
|
175
|
+
- spec/adjusting_table_spec.rb
|
175
176
|
- spec/jaro_winkler_spec.rb
|
176
177
|
- spec/spec_helper.rb
|
@@ -1,29 +0,0 @@
|
|
1
|
-
#include <string.h>
|
2
|
-
#include <stdlib.h>
|
3
|
-
#include "codepoints.h"
|
4
|
-
|
5
|
-
UnicodeHash unicode_hash_new(const char *str){
|
6
|
-
UnicodeHash ret = {};
|
7
|
-
unsigned char first_char = str[0];
|
8
|
-
if(first_char >= 252) ret.byte_length = 6; // 1111110x
|
9
|
-
else if(first_char >= 248) ret.byte_length = 5; // 111110xx
|
10
|
-
else if(first_char >= 240) ret.byte_length = 4; // 11110xxx
|
11
|
-
else if(first_char >= 224) ret.byte_length = 3; // 1110xxxx
|
12
|
-
else if(first_char >= 192) ret.byte_length = 2; // 110xxxxx
|
13
|
-
else ret.byte_length = 1;
|
14
|
-
memcpy(&ret.code, str, ret.byte_length);
|
15
|
-
return ret;
|
16
|
-
}
|
17
|
-
|
18
|
-
Codepoints codepoints_new(const char *str, int byte_len){
|
19
|
-
Codepoints ret = {};
|
20
|
-
ret.ary = malloc(byte_len * sizeof(long long));
|
21
|
-
ret.length = 0;
|
22
|
-
for(int i = 0; i < byte_len;){
|
23
|
-
UnicodeHash hash = unicode_hash_new(str + i);
|
24
|
-
ret.ary[ret.length] = hash.code;
|
25
|
-
ret.length++;
|
26
|
-
i += hash.byte_length;
|
27
|
-
}
|
28
|
-
return ret;
|
29
|
-
}
|
@@ -1,17 +0,0 @@
|
|
1
|
-
#ifndef CODEPOINTS_H
|
2
|
-
#define CODEPOINTS_H 1
|
3
|
-
|
4
|
-
typedef struct{
|
5
|
-
unsigned long long code;
|
6
|
-
unsigned int byte_length;
|
7
|
-
} UnicodeHash;
|
8
|
-
|
9
|
-
typedef struct{
|
10
|
-
unsigned long long *ary;
|
11
|
-
int length;
|
12
|
-
} Codepoints;
|
13
|
-
|
14
|
-
UnicodeHash unicode_hash_new(const char *str);
|
15
|
-
Codepoints codepoints_new (const char *str, int byte_len);
|
16
|
-
|
17
|
-
#endif /* CODEPOINTS_H */
|
data/ext/jaro_winkler/distance.c
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
#include <stdlib.h>
|
2
|
-
#include <ctype.h>
|
3
|
-
#include "distance.h"
|
4
|
-
#include "codepoints.h"
|
5
|
-
#include "adj_matrix.h"
|
6
|
-
|
7
|
-
Option option_new(){
|
8
|
-
Option opt;
|
9
|
-
opt.ignore_case = opt.adj_table = 0;
|
10
|
-
opt.weight = 0.1;
|
11
|
-
opt.threshold = 0.7;
|
12
|
-
return opt;
|
13
|
-
}
|
14
|
-
|
15
|
-
double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt){
|
16
|
-
Codepoints code_ary_1 = codepoints_new(s1, s1_byte_len),
|
17
|
-
code_ary_2 = codepoints_new(s2, s2_byte_len);
|
18
|
-
|
19
|
-
if(opt.ignore_case){
|
20
|
-
for(int i = 0; i < code_ary_1.length; ++i) if(code_ary_1.ary[i] < 256 && islower(code_ary_1.ary[i])) code_ary_1.ary[i] -= 32;
|
21
|
-
for(int i = 0; i < code_ary_2.length; ++i) if(code_ary_2.ary[i] < 256 && islower(code_ary_2.ary[i])) code_ary_2.ary[i] -= 32;
|
22
|
-
}
|
23
|
-
|
24
|
-
// Guarantee the order
|
25
|
-
if(code_ary_1.length > code_ary_2.length){
|
26
|
-
unsigned long long *tmp = code_ary_1.ary; code_ary_1.ary = code_ary_2.ary; code_ary_2.ary = tmp;
|
27
|
-
int tmp2 = code_ary_1.length; code_ary_1.length = code_ary_2.length; code_ary_2.length = tmp2;
|
28
|
-
}
|
29
|
-
|
30
|
-
// Compute jaro distance
|
31
|
-
int window_size = code_ary_2.length / 2 - 1;
|
32
|
-
if(window_size < 0) window_size = 0;
|
33
|
-
double matches = 0.0,
|
34
|
-
sim_matches = 0.0;
|
35
|
-
int transpositions = 0,
|
36
|
-
previous_index = -1,
|
37
|
-
max_index = code_ary_2.length - 1;
|
38
|
-
for(int i = 0; i < code_ary_1.length; i++){
|
39
|
-
int left = i - window_size;
|
40
|
-
int right = i + window_size;
|
41
|
-
if(left < 0) left = 0;
|
42
|
-
if(right > max_index) right = max_index;
|
43
|
-
char matched = 0,
|
44
|
-
found = 0,
|
45
|
-
sim_matched = 0;
|
46
|
-
for(int j = left; j <= right; j++){
|
47
|
-
if(code_ary_1.ary[i] == code_ary_2.ary[j]){
|
48
|
-
matched = 1;
|
49
|
-
if(!found && j > previous_index){
|
50
|
-
previous_index = j;
|
51
|
-
found = 1;
|
52
|
-
}
|
53
|
-
}else if(opt.adj_table && adj_matrix_find(adj_matrix_default(), code_ary_1.ary[i], code_ary_2.ary[j])) sim_matched = 1;
|
54
|
-
} // for(int j = left; j <= right; j++){
|
55
|
-
if(matched){
|
56
|
-
matches++;
|
57
|
-
if(!found) transpositions++;
|
58
|
-
}else if(sim_matched) sim_matches += 3;
|
59
|
-
} // for(int i = 0; i < code_ary_1.length; i++){
|
60
|
-
|
61
|
-
// Don't divide transpositions by 2 since it's been counted directly by above code.
|
62
|
-
double similarity = matches;
|
63
|
-
if(opt.adj_table) similarity += sim_matches / 10;
|
64
|
-
double jaro_distance = matches == 0 ? 0 : (similarity / code_ary_1.length + similarity / code_ary_2.length + (matches - transpositions) / matches) / 3.0;
|
65
|
-
|
66
|
-
// calculate jaro-winkler distance
|
67
|
-
double threshold = opt.threshold, weight = opt.weight;
|
68
|
-
int prefix = 0;
|
69
|
-
int max_length = code_ary_1.length > 4 ? 4 : code_ary_1.length;
|
70
|
-
for(int i = 0; i < max_length; ++i){
|
71
|
-
if(code_ary_1.ary[i] == code_ary_2.ary[i]) prefix++;
|
72
|
-
else break;
|
73
|
-
}
|
74
|
-
free(code_ary_1.ary); free(code_ary_2.ary);
|
75
|
-
return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
|
76
|
-
}
|
data/ext/jaro_winkler/distance.h
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
#ifndef DISTANCE_H
|
2
|
-
#define DISTANCE_H 1
|
3
|
-
|
4
|
-
typedef struct{
|
5
|
-
double weight, threshold;
|
6
|
-
char ignore_case, adj_table;
|
7
|
-
} Option;
|
8
|
-
|
9
|
-
double distance(char *s1, int s1_byte_len, char *s2, int s2_byte_len, Option opt);
|
10
|
-
Option option_new();
|
11
|
-
|
12
|
-
#endif /* DISTANCE_H */
|