jaro_winkler 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -16
- data/benchmark/native.rb +3 -3
- data/ext/jaro_winkler/distance.c +99 -0
- data/ext/jaro_winkler/distance.h +12 -0
- data/ext/jaro_winkler/jaro_winkler.c +21 -26
- data/ext/jaro_winkler/jaro_winkler.h +1 -1
- data/lib/jaro_winkler.rb +6 -5
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +8 -1
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9a1a6510f807b126a00145ca02323746c1885409
|
4
|
+
data.tar.gz: 3cc743dde90a2dc0f682af51b506da4d4dc8e6ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02fec80fb44db8e6efed4b6cf083d55e4356cf706827c85850db8ef1c409a483e90c20e6de8437f0282d9621d46513a80061d1dd333d6b4b45b85d81f5c73bd5
|
7
|
+
data.tar.gz: e6e43be13fb520ddfa88206488689f4329c68003b0c4c1c40b69192c6df3d7bcc323f18316f21392ebf0da0dc2fce6949d43b8d8833ad72fe59ca288ebce3132
|
data/README.md
CHANGED
@@ -20,10 +20,12 @@ JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
|
|
20
20
|
# => 0.9778
|
21
21
|
|
22
22
|
# Native
|
23
|
-
JaroWinkler.c_distance "MARTHA", "MARHTA"
|
23
|
+
JaroWinkler.c_distance "MARTHA", "MARHTA" # Recommended one, it's 7 times faster than the latter.
|
24
24
|
JaroWinkler.distance "MARTHA", "MARHTA", native: true
|
25
25
|
```
|
26
26
|
|
27
|
+
**Both implementations support UTF-8 strings.**
|
28
|
+
|
27
29
|
## Options
|
28
30
|
|
29
31
|
Name | Type | Default | Note
|
@@ -31,14 +33,7 @@ Name | Type | Default | Note
|
|
31
33
|
case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
|
32
34
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
33
35
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above this value.
|
34
|
-
native | boolean | false | Use native version
|
35
|
-
|
36
|
-
## Pure Ruby v.s. Native
|
37
|
-
|
38
|
-
| Pure | Native
|
39
|
-
-------------- | ---- | ------
|
40
|
-
UTF-8 Support | Yes | No
|
41
|
-
Option Setting | Yes | No
|
36
|
+
native | boolean | false | Use native version.
|
42
37
|
|
43
38
|
# Why This?
|
44
39
|
|
@@ -90,10 +85,5 @@ end
|
|
90
85
|
|
91
86
|
# Todo
|
92
87
|
|
93
|
-
-
|
94
|
-
-
|
95
|
-
- Add more optoins to natvie version.
|
96
|
-
- case_match
|
97
|
-
- weight
|
98
|
-
- threshold
|
99
|
-
- adjusting word table (It's from the original C implementation.)
|
88
|
+
- Adjusting word table (Reference to original C implementation.)
|
89
|
+
- Remove `#c_distance`, use the C extension by default, and fall back to Ruby on the Java platform.
|
data/benchmark/native.rb
CHANGED
@@ -20,6 +20,6 @@ Benchmark.bmbm do |x|
|
|
20
20
|
end
|
21
21
|
|
22
22
|
# user system total real
|
23
|
-
# #c_distance(s1, s2) 0.
|
24
|
-
# #distance(s1, s2, native: true) 2.
|
25
|
-
# fuzzystringmatch 0.
|
23
|
+
# #c_distance(s1, s2) 0.350000 0.000000 0.350000 ( 0.349109)
|
24
|
+
# #distance(s1, s2, native: true) 2.480000 0.050000 2.530000 ( 2.526027)
|
25
|
+
# fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.155539)
|
@@ -0,0 +1,99 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <ctype.h>
|
4
|
+
#include "distance.h"
|
5
|
+
|
6
|
+
Option* option_new(){
|
7
|
+
Option *opt = calloc(1, sizeof(Option));
|
8
|
+
opt->case_match = 0;
|
9
|
+
opt->weight = 0.1;
|
10
|
+
opt->threshold = 0.7;
|
11
|
+
return opt;
|
12
|
+
}
|
13
|
+
|
14
|
+
/*
 * Length in bytes of the UTF-8 sequence announced by its lead byte.
 *
 * Only the lead byte is inspected. Anything below 0xC0 (ASCII or a stray
 * continuation byte) counts as a one-byte unit. The 5- and 6-byte forms are
 * still recognised even though current UTF-8 (RFC 3629) stops at 4 bytes.
 */
static int char_bytes_num(char first_char){
  unsigned char c = first_char;
  if(c >= 252) return 6;      // 1111110x
  else if(c >= 248) return 5; // 111110xx
  else if(c >= 240) return 4; // 11110xxx
  else if(c >= 224) return 3; // 1110xxxx
  else if(c >= 192) return 2; // 110xxxxx
  else return 1;
}

/*
 * Split a NUL-terminated UTF-8 string into one integer per character.
 *
 * The value stored for a character is its raw bytes packed with the first
 * byte in the least significant position. The values are only meant to be
 * compared with each other, and a single-byte character always equals its
 * byte value regardless of host byte order.
 *
 * str      NUL-terminated input; must not be NULL.
 * ret_len  receives the number of characters written.
 *
 * Returns a heap array the caller releases with free(). On allocation
 * failure returns NULL and sets *ret_len to 0.
 */
static unsigned long* codepoints(const char *str, int *ret_len){
  size_t str_len = strlen(str);
  // One slot per byte is an upper bound on the character count; the extra
  // slot keeps the request non-zero so NULL always means "out of memory".
  unsigned long *ret = calloc(str_len + 1, sizeof *ret);
  int count = 0;
  if(!ret){
    *ret_len = 0;
    return NULL;
  }
  for(size_t i = 0; i < str_len;){
    size_t width = (size_t)char_bytes_num(str[i]);
    // A lead byte near the end may announce more bytes than the string
    // holds; never read past the terminator.
    size_t remaining = str_len - i;
    if(width > remaining) width = remaining;

    // Pack byte-by-byte instead of memcpy so the result does not depend on
    // host endianness and cannot overflow a slot when long is 4 bytes wide.
    unsigned long packed = 0;
    for(size_t k = 0; k < width && k < sizeof packed; k++){
      packed |= (unsigned long)(unsigned char)str[i + k] << (8 * k);
    }
    ret[count++] = packed;
    i += width;
  }
  *ret_len = count;
  return ret;
}
|
37
|
+
|
38
|
+
/*
 * Jaro-Winkler score of two NUL-terminated strings.
 *
 * s1, s2  strings to compare; they are split with codepoints().
 * opt     settings (case_match, weight, threshold). When NULL, defaults
 *         from option_new() are used and released before returning.
 *
 * Returns the Jaro score, plus the common-prefix bonus when the Jaro score
 * is not below opt->threshold.
 */
double c_distance(char *s1, char *s2, Option *opt){
  // Remember whether the options were allocated here so they can be freed.
  int owns_opt = 0;
  if(!opt){
    owns_opt = 1;
    opt = option_new();
  }

  int short_len, long_len;
  unsigned long *short_cp = codepoints(s1, &short_len);
  unsigned long *long_cp = codepoints(s2, &long_len);

  if(opt->case_match){
    // Fold single-byte lower-case letters to upper case on both sides.
    for(int k = 0; k < short_len; ++k){
      if(short_cp[k] < 256 && islower(short_cp[k])) short_cp[k] -= 32;
    }
    for(int k = 0; k < long_len; ++k){
      if(long_cp[k] < 256 && islower(long_cp[k])) long_cp[k] -= 32;
    }
  }

  // From here on short_cp is never longer than long_cp.
  if(short_len > long_len){
    unsigned long *swap_cp = short_cp;
    short_cp = long_cp;
    long_cp = swap_cp;
    int swap_len = short_len;
    short_len = long_len;
    long_len = swap_len;
  }

  int window = long_len / 2 - 1;
  if(window < 0) window = 0;

  double matches = 0.0;
  int transpositions = 0;
  int last_hit = -1;          // highest index in long_cp matched in order so far
  int last_index = long_len - 1;
  for(int i = 0; i < short_len; i++){
    int lo = i > window ? i - window : 0;
    int hi = i + window;
    if(hi > last_index) hi = last_index;

    int any_hit = 0;   // short_cp[i] occurs somewhere inside the window
    int in_order = 0;  // ...and one occurrence lies beyond last_hit
    for(int j = lo; j <= hi; j++){
      if(short_cp[i] != long_cp[j]) continue;
      any_hit = 1;
      if(j > last_hit){
        last_hit = j;
        in_order = 1;
        break; // nothing further in the window can change the outcome
      }
    }

    if(!any_hit) continue;
    matches++;
    if(!in_order) transpositions++;
  }

  // Transpositions are counted directly above, so they are not halved here.
  double jaro_distance;
  if(matches == 0){
    jaro_distance = 0;
  }else{
    jaro_distance = (matches / short_len + matches / long_len + (matches - transpositions) / matches) / 3.0;
  }

  // Winkler adjustment: length of the common prefix, capped at 4 characters.
  double threshold = opt->threshold, weight = opt->weight;
  int prefix_cap = short_len > 4 ? 4 : short_len;
  int prefix = 0;
  while(prefix < prefix_cap && short_cp[prefix] == long_cp[prefix]) prefix++;

  free(short_cp);
  free(long_cp);
  if(owns_opt) free(opt);

  if(jaro_distance < threshold) return jaro_distance;
  return jaro_distance + ((prefix * weight) * (1 - jaro_distance));
}
|
@@ -1,32 +1,27 @@
|
|
1
1
|
#include "jaro_winkler.h"
|
2
|
-
#
|
3
|
-
VALUE rb_mJaroWinkler;
|
4
|
-
|
5
|
-
static VALUE distance(VALUE self, VALUE s1, VALUE s2){
|
6
|
-
Check_Type(s1, T_STRING); Check_Type(s2, T_STRING);
|
7
|
-
// Check encoding
|
8
|
-
VALUE s1_ascii_only = TYPE(rb_funcall(s1, rb_intern("ascii_only?"), 0));
|
9
|
-
VALUE s2_ascii_only = TYPE(rb_funcall(s2, rb_intern("ascii_only?"), 0));
|
10
|
-
if(s1_ascii_only == T_FALSE || s2_ascii_only == T_FALSE) printf("WARNING: Non-ASCII string detected.\n");
|
2
|
+
#include "distance.h"
|
11
3
|
|
12
|
-
|
13
|
-
// guarantee the length of s1_ptr is less than or equal to that of s2_ptr
|
14
|
-
if(RSTRING_LEN(s1) > RSTRING_LEN(s2)){ VALUE *tmp = s1_ptr; s1_ptr = s2_ptr; s2_ptr = tmp; }
|
15
|
-
int min_length = RSTRING_LEN(*s1_ptr), max_length = RSTRING_LEN(*s2_ptr);
|
16
|
-
char *c_s1_ptr = StringValuePtr(*s1_ptr), *c_s2_ptr = StringValuePtr(*s2_ptr);
|
17
|
-
int opt[] = {1, 0};
|
18
|
-
if(min_length != max_length){
|
19
|
-
// padding spaces
|
20
|
-
char buf[max_length];
|
21
|
-
for(int i = min_length; i < max_length; ++i) buf[i] = ' ';
|
22
|
-
memcpy(buf, c_s1_ptr, min_length);
|
23
|
-
c_s1_ptr = buf;
|
24
|
-
return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
|
25
|
-
}
|
26
|
-
return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
|
27
|
-
}
|
4
|
+
VALUE rb_mJaroWinkler;
|
28
5
|
|
29
6
|
void Init_jaro_winkler(void){
|
30
7
|
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
31
|
-
rb_define_module_function(rb_mJaroWinkler, "c_distance",
|
8
|
+
rb_define_module_function(rb_mJaroWinkler, "c_distance", rb_distance, -1);
|
32
9
|
}
|
10
|
+
|
11
|
+
/*
 * Ruby binding registered as JaroWinkler.c_distance(s1, s2, **options).
 *
 * Recognised option keys are :weight, :threshold and :case_match. Keys that
 * are absent or nil keep the defaults supplied by option_new().
 *
 * Returns a Float produced by c_distance().
 */
VALUE rb_distance(int argc, VALUE *argv, VALUE self){
  VALUE s1, s2, opt;
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);

  // Keep the options on the stack: NUM2DBL and StringValueCStr can raise,
  // and a Ruby exception would jump past a trailing free() and leak a
  // heap-allocated Option. Nothing between option_new() and free() raises.
  Option *defaults = option_new();
  if(!defaults) rb_memerror();
  Option c_opt = *defaults;
  free(defaults);

  if(TYPE(opt) == T_HASH){
    VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")));
    VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
    VALUE case_match = rb_hash_aref(opt, ID2SYM(rb_intern("case_match")));
    if(!NIL_P(weight)) c_opt.weight = NUM2DBL(weight);
    if(!NIL_P(threshold)) c_opt.threshold = NUM2DBL(threshold);
    // Any value other than nil/false switches case folding on.
    if(!NIL_P(case_match)) c_opt.case_match = RTEST(case_match) ? 1 : 0;
  }
  // TODO: replace StringValueCStr with StringValuePtr and RSTRING_LEN
  return rb_float_new(c_distance(StringValueCStr(s1), StringValueCStr(s2), &c_opt));
}
|
data/lib/jaro_winkler.rb
CHANGED
@@ -2,13 +2,14 @@ require 'jaro_winkler/jaro_winkler.so' unless RUBY_PLATFORM == 'java'
|
|
2
2
|
module JaroWinkler
|
3
3
|
module_function
|
4
4
|
def jaro_distance s1, s2
|
5
|
-
|
5
|
+
length1, length2 = s1.length, s2.length
|
6
6
|
# Guarantee the length order
|
7
7
|
if s1.length > s2.length
|
8
|
-
|
8
|
+
s1, s2 = s2, s1
|
9
|
+
length1, length2 = length2, length1
|
9
10
|
end
|
10
|
-
|
11
|
-
window_size
|
11
|
+
window_size = (length2 / 2) - 1
|
12
|
+
window_size = 0 if window_size < 0
|
12
13
|
matches = 0.0
|
13
14
|
transpositions = 0
|
14
15
|
previous_index = -1
|
@@ -43,7 +44,7 @@ module JaroWinkler
|
|
43
44
|
|
44
45
|
def distance s1, s2, options = {}
|
45
46
|
options = {weight: 0.1, threshold: 0.7, case_match: false, native: false}.merge options
|
46
|
-
return c_distance(s1, s2) if RUBY_PLATFORM != 'java' && options[:native]
|
47
|
+
return c_distance(s1, s2, options) if RUBY_PLATFORM != 'java' && options[:native]
|
47
48
|
weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
|
48
49
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
49
50
|
s1, s2 = s1.downcase, s2.downcase if case_match
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -13,6 +13,7 @@ describe JaroWinkler do
|
|
13
13
|
['fvie', 'ten', 0.0],
|
14
14
|
['tony', 'tony', 1.0],
|
15
15
|
['tonytonyjan', 'tonytonyjan', 1.0],
|
16
|
+
['x', 'x', 1.0],
|
16
17
|
['', '', 0.0],
|
17
18
|
['tony', '', 0.0],
|
18
19
|
['', 'tony', 0.0],
|
@@ -29,16 +30,22 @@ describe JaroWinkler do
|
|
29
30
|
|
30
31
|
it 'supports C extension' do
|
31
32
|
@ary.each do |s1, s2, ans|
|
32
|
-
expect(
|
33
|
+
expect(c_distance(s1, s2)).to be_within(0.0001).of(ans)
|
33
34
|
end
|
34
35
|
end
|
35
36
|
|
37
|
+
it 'works with UTF-8' do
|
38
|
+
expect(distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to eq c_distance('0123456789', '01234x56789')
|
39
|
+
end
|
40
|
+
|
36
41
|
it 'can ignore case' do
|
37
42
|
expect(distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
|
43
|
+
expect(c_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
|
38
44
|
end
|
39
45
|
|
40
46
|
it 'can set weight' do
|
41
47
|
expect(distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
|
48
|
+
expect(c_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
|
42
49
|
expect{ distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
|
43
50
|
end
|
44
51
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
@@ -68,6 +68,8 @@ files:
|
|
68
68
|
- Rakefile
|
69
69
|
- benchmark/native.rb
|
70
70
|
- benchmark/pure.rb
|
71
|
+
- ext/jaro_winkler/distance.c
|
72
|
+
- ext/jaro_winkler/distance.h
|
71
73
|
- ext/jaro_winkler/extconf.rb
|
72
74
|
- ext/jaro_winkler/jaro_winkler.c
|
73
75
|
- ext/jaro_winkler/jaro_winkler.h
|