jaro_winkler 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -16
- data/benchmark/native.rb +3 -3
- data/ext/jaro_winkler/distance.c +99 -0
- data/ext/jaro_winkler/distance.h +12 -0
- data/ext/jaro_winkler/jaro_winkler.c +21 -26
- data/ext/jaro_winkler/jaro_winkler.h +1 -1
- data/lib/jaro_winkler.rb +6 -5
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +8 -1
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9a1a6510f807b126a00145ca02323746c1885409
|
4
|
+
data.tar.gz: 3cc743dde90a2dc0f682af51b506da4d4dc8e6ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02fec80fb44db8e6efed4b6cf083d55e4356cf706827c85850db8ef1c409a483e90c20e6de8437f0282d9621d46513a80061d1dd333d6b4b45b85d81f5c73bd5
|
7
|
+
data.tar.gz: e6e43be13fb520ddfa88206488689f4329c68003b0c4c1c40b69192c6df3d7bcc323f18316f21392ebf0da0dc2fce6949d43b8d8833ad72fe59ca288ebce3132
|
data/README.md
CHANGED
@@ -20,10 +20,12 @@ JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
|
|
20
20
|
# => 0.9778
|
21
21
|
|
22
22
|
# Native
|
23
|
-
JaroWinkler.c_distance "MARTHA", "MARHTA"
|
23
|
+
JaroWinkler.c_distance "MARTHA", "MARHTA" # Recommended: about 7 times faster than the call below.
|
24
24
|
JaroWinkler.distance "MARTHA", "MARHTA", native: true
|
25
25
|
```
|
26
26
|
|
27
|
+
**Both implementations support UTF-8 strings.**
|
28
|
+
|
27
29
|
## Options
|
28
30
|
|
29
31
|
Name | Type | Default | Note
|
@@ -31,14 +33,7 @@ Name | Type | Default | Note
|
|
31
33
|
case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
|
32
34
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
33
35
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above this value.
|
34
|
-
native | boolean | false | Use native version
|
35
|
-
|
36
|
-
## Pure Ruby v.s. Native
|
37
|
-
|
38
|
-
| Pure | Native
|
39
|
-
-------------- | ---- | ------
|
40
|
-
UTF-8 Support | Yes | No
|
41
|
-
Option Setting | Yes | No
|
36
|
+
native | boolean | false | Use native version.
|
42
37
|
|
43
38
|
# Why This?
|
44
39
|
|
@@ -90,10 +85,5 @@ end
|
|
90
85
|
|
91
86
|
# Todo
|
92
87
|
|
93
|
-
-
|
94
|
-
-
|
95
|
-
- Add more optoins to natvie version.
|
96
|
-
- case_match
|
97
|
-
- weight
|
98
|
-
- threshold
|
99
|
-
- adjusting word table (It's from the original C implementation.)
|
88
|
+
- Adjusting word table (Reference to original C implementation.)
|
89
|
+
- Remove `#c_distance`, use the C extension by default, and fall back to Ruby on the Java platform.
|
data/benchmark/native.rb
CHANGED
@@ -20,6 +20,6 @@ Benchmark.bmbm do |x|
|
|
20
20
|
end
|
21
21
|
|
22
22
|
# user system total real
|
23
|
-
# #c_distance(s1, s2) 0.
|
24
|
-
# #distance(s1, s2, native: true) 2.
|
25
|
-
# fuzzystringmatch 0.
|
23
|
+
# #c_distance(s1, s2) 0.350000 0.000000 0.350000 ( 0.349109)
|
24
|
+
# #distance(s1, s2, native: true) 2.480000 0.050000 2.530000 ( 2.526027)
|
25
|
+
# fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.155539)
|
@@ -0,0 +1,99 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <ctype.h>
|
4
|
+
#include "distance.h"
|
5
|
+
|
6
|
+
Option* option_new(){
|
7
|
+
Option *opt = calloc(1, sizeof(Option));
|
8
|
+
opt->case_match = 0;
|
9
|
+
opt->weight = 0.1;
|
10
|
+
opt->threshold = 0.7;
|
11
|
+
return opt;
|
12
|
+
}
|
13
|
+
|
14
|
+
/*
 * Number of bytes in the UTF-8 sequence announced by its first byte.
 * Anything below 0xC0 (ASCII and stray continuation bytes) counts as a
 * single-byte character.
 */
static int char_bytes_num(char first_char){
  unsigned char c = first_char;
  if(c >= 252) return 6;      // 1111110x
  else if(c >= 248) return 5; // 111110xx
  else if(c >= 240) return 4; // 11110xxx
  else if(c >= 224) return 3; // 1110xxxx
  else if(c >= 192) return 2; // 110xxxxx
  else return 1;
}

/*
 * Split a NUL-terminated UTF-8 string into one integer per character.
 *
 * Each element packs the raw bytes of one character, most significant byte
 * first, so the values are comparison keys rather than decoded Unicode code
 * points. A single-byte character is stored as its own byte value on every
 * platform, whatever the host byte order.
 *
 * str     - NUL-terminated input.
 * ret_len - receives the number of characters written.
 *
 * Returns a calloc'ed array the caller must free(). When the allocation
 * yields NULL, *ret_len is set to 0 and NULL is returned.
 */
static unsigned long* codepoints(const char *str, int *ret_len){
  int str_len = strlen(str);
  // One slot per byte is an upper bound on the number of characters.
  unsigned long *ret = calloc(str_len, sizeof *ret);
  int count = 0;
  if(!ret){
    *ret_len = 0;
    return NULL;
  }
  for(int i = 0; i < str_len;){
    int bytes_num = char_bytes_num(str[i]);
    // A lead byte near the end of a malformed string may announce more bytes
    // than are left; never read past the terminator.
    int remaining = str_len - i;
    if(bytes_num > remaining) bytes_num = remaining;
    // Pack by shifting rather than memcpy: the result cannot overrun the slot
    // and does not depend on the host byte order.
    unsigned long packed = 0;
    for(int k = 0; k < bytes_num; ++k) packed = (packed << 8) | (unsigned char)str[i + k];
    ret[count++] = packed;
    i += bytes_num;
  }
  *ret_len = count;
  return ret;
}
|
37
|
+
|
38
|
+
/*
 * Jaro-Winkler similarity of two NUL-terminated strings.
 *
 * s1, s2 - strings to compare; they are split into characters with
 *          codepoints(), so multi-byte characters are compared as units.
 * opt    - options (case_match, weight, threshold); pass NULL to use the
 *          values produced by option_new().
 *
 * Returns the Jaro score when it is below opt->threshold, otherwise the score
 * raised by the common-prefix bonus (at most 4 characters, scaled by
 * opt->weight). Returns 0 when nothing matches, or when the default options
 * are needed but cannot be allocated.
 */
double c_distance(char *s1, char *s2, Option *opt){
  // set default option if NULL passed
  int free_opt_flag = 0;
  if(!opt){
    opt = option_new();
    if(!opt) return 0.0; // no options available, so nothing can be scored
    free_opt_flag = 1;
  }

  int ary_1_len, ary_2_len;
  unsigned long *ary_1 = codepoints(s1, &ary_1_len), *ary_2 = codepoints(s2, &ary_2_len);

  if(opt->case_match){
    // Fold ASCII letters only. islower() follows the process locale and may
    // accept bytes above 127, for which subtracting 32 is not a case mapping.
    for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] >= 'a' && ary_1[i] <= 'z') ary_1[i] -= 'a' - 'A';
    for(int i = 0; i < ary_2_len; ++i) if(ary_2[i] >= 'a' && ary_2[i] <= 'z') ary_2[i] -= 'a' - 'A';
  }

  // Guarantee the order: ary_1 is the shorter (or equal-length) sequence.
  if(ary_1_len > ary_2_len){
    unsigned long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
    int tmp2 = ary_1_len; ary_1_len = ary_2_len; ary_2_len = tmp2;
  }
  int window_size = ary_2_len / 2 - 1;
  if(window_size < 0) window_size = 0;
  double matches = 0.0;
  int transpositions = 0;
  int previous_index = -1;
  int max_index = ary_2_len - 1;
  for(int i = 0; i < ary_1_len; i++){
    // Search window in ary_2 around position i, clamped to the array bounds.
    int left = i - window_size;
    int right = i + window_size;
    if(left < 0) left = 0;
    if(right > max_index) right = max_index;
    char matched = 0; // ary_1[i] occurs somewhere in the window
    char found = 0;   // ...and at an index beyond the previous in-order match
    for(int j = left; j <= right; j++){
      if(ary_1[i] == ary_2[j]){
        matched = 1;
        if(!found){
          if(j > previous_index){
            previous_index = j;
            found = 1;
          }
        } // if(!found){
      } // if(ary_1[i] == ary_2[j]){
    } // for(int j = left; j <= right; j++){
    if(matched){
      matches++;
      if(!found) transpositions++; // matched, but only out of order
    }
  } // for(int i = 0; i < ary_1_len; i++){
  // Don't divide transpositions by 2 since it's been counted directly by above code.
  double jaro_distance = matches == 0 ? 0 : (matches / ary_1_len + matches / ary_2_len + (matches - transpositions) / matches) / 3.0;

  // calculate jaro-winkler distance
  double threshold = opt->threshold, weight = opt->weight;
  int prefix = 0;
  int max_length = ary_1_len > 4 ? 4 : ary_1_len;
  for(int i = 0; i < max_length; ++i){
    if(ary_1[i] == ary_2[i]) prefix++;
    else break;
  }
  free(ary_1); free(ary_2);
  if(free_opt_flag) free(opt);
  return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
}
|
@@ -1,32 +1,27 @@
|
|
1
1
|
#include "jaro_winkler.h"
|
2
|
-
#
|
3
|
-
VALUE rb_mJaroWinkler;
|
4
|
-
|
5
|
-
static VALUE distance(VALUE self, VALUE s1, VALUE s2){
|
6
|
-
Check_Type(s1, T_STRING); Check_Type(s2, T_STRING);
|
7
|
-
// Check encoding
|
8
|
-
VALUE s1_ascii_only = TYPE(rb_funcall(s1, rb_intern("ascii_only?"), 0));
|
9
|
-
VALUE s2_ascii_only = TYPE(rb_funcall(s2, rb_intern("ascii_only?"), 0));
|
10
|
-
if(s1_ascii_only == T_FALSE || s2_ascii_only == T_FALSE) printf("WARNING: Non-ASCII string detected.\n");
|
2
|
+
#include "distance.h"
|
11
3
|
|
12
|
-
|
13
|
-
// guarantee the length of s1_ptr is less than or equal to that of s2_ptr
|
14
|
-
if(RSTRING_LEN(s1) > RSTRING_LEN(s2)){ VALUE *tmp = s1_ptr; s1_ptr = s2_ptr; s2_ptr = tmp; }
|
15
|
-
int min_length = RSTRING_LEN(*s1_ptr), max_length = RSTRING_LEN(*s2_ptr);
|
16
|
-
char *c_s1_ptr = StringValuePtr(*s1_ptr), *c_s2_ptr = StringValuePtr(*s2_ptr);
|
17
|
-
int opt[] = {1, 0};
|
18
|
-
if(min_length != max_length){
|
19
|
-
// padding spaces
|
20
|
-
char buf[max_length];
|
21
|
-
for(int i = min_length; i < max_length; ++i) buf[i] = ' ';
|
22
|
-
memcpy(buf, c_s1_ptr, min_length);
|
23
|
-
c_s1_ptr = buf;
|
24
|
-
return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
|
25
|
-
}
|
26
|
-
return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
|
27
|
-
}
|
4
|
+
VALUE rb_mJaroWinkler;
|
28
5
|
|
29
6
|
void Init_jaro_winkler(void){
|
30
7
|
rb_mJaroWinkler = rb_define_module("JaroWinkler");
|
31
|
-
rb_define_module_function(rb_mJaroWinkler, "c_distance",
|
8
|
+
rb_define_module_function(rb_mJaroWinkler, "c_distance", rb_distance, -1);
|
32
9
|
}
|
10
|
+
|
11
|
+
/*
 * Ruby binding behind JaroWinkler.c_distance(s1, s2, **options).
 *
 * Accepts two strings plus an optional keyword hash; the keys :weight,
 * :threshold and :case_match override the defaults supplied by option_new().
 * Returns the score computed by c_distance() as a Ruby Float.
 *
 * Every conversion that can raise a Ruby exception runs before the Option
 * struct is allocated, and the struct is freed before the result is boxed,
 * so an exception cannot leak it.
 */
VALUE rb_distance(int argc, VALUE *argv, VALUE self){
  VALUE s1, s2, opt;
  rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);

  VALUE weight = Qnil, threshold = Qnil, case_match = Qnil;
  if(TYPE(opt) == T_HASH){
    weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")));
    threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
    case_match = rb_hash_aref(opt, ID2SYM(rb_intern("case_match")));
  }

  // Numeric conversions may raise TypeError; do them while nothing is allocated.
  double c_weight = 0.0, c_threshold = 0.0;
  if(!NIL_P(weight)) c_weight = NUM2DBL(weight);
  if(!NIL_P(threshold)) c_threshold = NUM2DBL(threshold);

  // Coerce both arguments to String first, then take the C pointers last so
  // that no Ruby code runs while the raw pointers are held.
  // TODO: replace StringValueCStr with StringValuePtr and RSTRING_LEN
  StringValue(s1);
  StringValue(s2);
  char *c_s1 = StringValueCStr(s1);
  char *c_s2 = StringValueCStr(s2);

  Option *c_opt = option_new();
  if(!c_opt) rb_memerror();
  if(!NIL_P(weight)) c_opt->weight = c_weight;
  if(!NIL_P(threshold)) c_opt->threshold = c_threshold;
  if(!NIL_P(case_match)) c_opt->case_match = RTEST(case_match) ? 1 : 0;

  double score = c_distance(c_s1, c_s2, c_opt);
  free(c_opt); // release before rb_float_new, which may itself raise
  return rb_float_new(score);
}
|
data/lib/jaro_winkler.rb
CHANGED
@@ -2,13 +2,14 @@ require 'jaro_winkler/jaro_winkler.so' unless RUBY_PLATFORM == 'java'
|
|
2
2
|
module JaroWinkler
|
3
3
|
module_function
|
4
4
|
def jaro_distance s1, s2
|
5
|
-
|
5
|
+
length1, length2 = s1.length, s2.length
|
6
6
|
# Guarantee the length order
|
7
7
|
if s1.length > s2.length
|
8
|
-
|
8
|
+
s1, s2 = s2, s1
|
9
|
+
length1, length2 = length2, length1
|
9
10
|
end
|
10
|
-
|
11
|
-
window_size
|
11
|
+
window_size = (length2 / 2) - 1
|
12
|
+
window_size = 0 if window_size < 0
|
12
13
|
matches = 0.0
|
13
14
|
transpositions = 0
|
14
15
|
previous_index = -1
|
@@ -43,7 +44,7 @@ module JaroWinkler
|
|
43
44
|
|
44
45
|
def distance s1, s2, options = {}
|
45
46
|
options = {weight: 0.1, threshold: 0.7, case_match: false, native: false}.merge options
|
46
|
-
return c_distance(s1, s2) if RUBY_PLATFORM != 'java' && options[:native]
|
47
|
+
return c_distance(s1, s2, options) if RUBY_PLATFORM != 'java' && options[:native]
|
47
48
|
weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
|
48
49
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
49
50
|
s1, s2 = s1.downcase, s2.downcase if case_match
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -13,6 +13,7 @@ describe JaroWinkler do
|
|
13
13
|
['fvie', 'ten', 0.0],
|
14
14
|
['tony', 'tony', 1.0],
|
15
15
|
['tonytonyjan', 'tonytonyjan', 1.0],
|
16
|
+
['x', 'x', 1.0],
|
16
17
|
['', '', 0.0],
|
17
18
|
['tony', '', 0.0],
|
18
19
|
['', 'tony', 0.0],
|
@@ -29,16 +30,22 @@ describe JaroWinkler do
|
|
29
30
|
|
30
31
|
it 'supports C extension' do
|
31
32
|
@ary.each do |s1, s2, ans|
|
32
|
-
expect(
|
33
|
+
expect(c_distance(s1, s2)).to be_within(0.0001).of(ans)
|
33
34
|
end
|
34
35
|
end
|
35
36
|
|
37
|
+
it 'works with UTF-8' do
|
38
|
+
expect(distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to eq c_distance('0123456789', '01234x56789')
|
39
|
+
end
|
40
|
+
|
36
41
|
it 'can ignore case' do
|
37
42
|
expect(distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
|
43
|
+
expect(c_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
|
38
44
|
end
|
39
45
|
|
40
46
|
it 'can set weight' do
|
41
47
|
expect(distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
|
48
|
+
expect(c_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
|
42
49
|
expect{ distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
|
43
50
|
end
|
44
51
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
@@ -68,6 +68,8 @@ files:
|
|
68
68
|
- Rakefile
|
69
69
|
- benchmark/native.rb
|
70
70
|
- benchmark/pure.rb
|
71
|
+
- ext/jaro_winkler/distance.c
|
72
|
+
- ext/jaro_winkler/distance.h
|
71
73
|
- ext/jaro_winkler/extconf.rb
|
72
74
|
- ext/jaro_winkler/jaro_winkler.c
|
73
75
|
- ext/jaro_winkler/jaro_winkler.h
|