jaro_winkler 1.2.7 → 1.2.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/ext/jaro_winkler/distance.c +2 -2
- data/ext/jaro_winkler/distance.h +1 -1
- data/ext/jaro_winkler/jaro_winkler.c +2 -2
- data/lib/jaro_winkler.rb +3 -3
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +23 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13eaf9c47df66ba5c9883611ee7a1c8468cc9e7a
|
4
|
+
data.tar.gz: 7c43db047cfb1aaade4c0d6f2c907ed19f8a79ab
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57421be340741b44879c3104689363a7ca2897014a6e9cd0c3fcaa524b29abf1a5db370cab72e4de0299f26ca2e14357ce24f7b8b7cafce97ca1527c27c46798
|
7
|
+
data.tar.gz: 8db803506546b4a99dd7e267bc561c90664282b4c8dc253f22695a0a8cc55df64e559be3c7e68cb26e05fd4ca08626481a82c82128f8616e0a2b378b29012413
|
data/README.md
CHANGED
@@ -14,7 +14,7 @@ gem install jaro_winkler
|
|
14
14
|
require 'jaro_winkler'
|
15
15
|
JaroWinkler.distance "MARTHA", "MARHTA"
|
16
16
|
# => 0.9611
|
17
|
-
JaroWinkler.distance "MARTHA", "marhta",
|
17
|
+
JaroWinkler.distance "MARTHA", "marhta", ignore_case: true
|
18
18
|
# => 0.9611
|
19
19
|
JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
|
20
20
|
# => 0.9778
|
@@ -30,7 +30,7 @@ JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
|
|
30
30
|
|
31
31
|
Name | Type | Default | Note
|
32
32
|
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
|
33
|
-
|
33
|
+
ignore_case | boolean | false | All lower case characters are converted to upper case prior to the comparison.
|
34
34
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
35
35
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
36
36
|
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Option* option_new(){
|
7
7
|
Option *opt = calloc(1, sizeof(Option));
|
8
|
-
opt->
|
8
|
+
opt->ignore_case = 0;
|
9
9
|
opt->weight = 0.1;
|
10
10
|
opt->threshold = 0.7;
|
11
11
|
return opt;
|
@@ -42,7 +42,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
|
|
42
42
|
int ary_1_len, ary_2_len;
|
43
43
|
unsigned long long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
|
44
44
|
|
45
|
-
if(opt->
|
45
|
+
if(opt->ignore_case){
|
46
46
|
for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
|
47
47
|
for(int i = 0; i < ary_2_len; ++i) if(ary_2[i] < 256 && islower(ary_2[i])) ary_2[i] -= 32;
|
48
48
|
}
|
data/ext/jaro_winkler/distance.h
CHANGED
data/ext/jaro_winkler/jaro_winkler.c
CHANGED
@@ -15,11 +15,11 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
|
|
15
15
|
if(TYPE(opt) == T_HASH){
|
16
16
|
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")));
|
17
17
|
VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
|
18
|
-
VALUE
|
18
|
+
VALUE ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case")));
|
19
19
|
if(!NIL_P(weight)) c_opt->weight = NUM2DBL(weight);
|
20
20
|
if(c_opt->weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
|
21
21
|
if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
|
22
|
-
if(!NIL_P(
|
22
|
+
if(!NIL_P(ignore_case)) c_opt->ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
23
23
|
}
|
24
24
|
VALUE ret = rb_float_new(c_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
|
25
25
|
free(c_opt);
|
data/lib/jaro_winkler.rb
CHANGED
@@ -42,10 +42,10 @@ module JaroWinkler
|
|
42
42
|
end
|
43
43
|
|
44
44
|
def r_distance s1, s2, options = {}
|
45
|
-
options = {weight: 0.1, threshold: 0.7,
|
46
|
-
weight, threshold,
|
45
|
+
options = {weight: 0.1, threshold: 0.7, ignore_case: false}.merge options
|
46
|
+
weight, threshold, ignore_case = options[:weight], options[:threshold], options[:ignore_case]
|
47
47
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
48
|
-
s1, s2 = s1.upcase, s2.upcase if
|
48
|
+
s1, s2 = s1.upcase, s2.upcase if ignore_case
|
49
49
|
distance = jaro_distance(s1, s2)
|
50
50
|
prefix = 0
|
51
51
|
max_length = [4, s1.length, s2.length].min
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -31,18 +31,32 @@ describe JaroWinkler do
|
|
31
31
|
end
|
32
32
|
|
33
33
|
it 'works with UTF-8' do
|
34
|
-
expect(c_distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to
|
34
|
+
expect(c_distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to be_within(0.0001).of(0.9818)
|
35
|
+
expect(c_distance('連勝文', '連勝丼')).to be_within(0.0001).of(0.8222)
|
36
|
+
expect(c_distance('馬英九', '馬英丸')).to be_within(0.0001).of(0.8222)
|
35
37
|
end
|
36
38
|
|
37
|
-
it '
|
38
|
-
|
39
|
-
expect(
|
39
|
+
it 'sets ignore_case' do
|
40
|
+
params = 'MARTHA', 'marhta', {ignore_case: true}
|
41
|
+
expect(r_distance(*params)).to be_within(0.0001).of(0.9611)
|
42
|
+
expect(c_distance(*params)).to be_within(0.0001).of(0.9611)
|
40
43
|
end
|
41
44
|
|
42
|
-
it '
|
43
|
-
|
44
|
-
expect(
|
45
|
-
expect
|
46
|
-
|
45
|
+
it 'sets weight' do
|
46
|
+
params = 'MARTHA', 'MARHTA', {weight: 0.2}
|
47
|
+
expect(r_distance(*params)).to be_within(0.0001).of(0.9778)
|
48
|
+
expect(c_distance(*params)).to be_within(0.0001).of(0.9778)
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'sets threshold' do
|
52
|
+
params = 'MARTHA', 'MARHTA', {threshold: 0.99}
|
53
|
+
expect(r_distance(*params)).to be_within(0.0001).of(0.9445)
|
54
|
+
expect(c_distance(*params)).to be_within(0.0001).of(0.9445)
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'throws exception when weight exceeding 0.25' do
|
58
|
+
params = 'MARTHA', 'MARHTA', {weight: 0.26}
|
59
|
+
expect{ r_distance(*params) }.to raise_error
|
60
|
+
expect{ c_distance(*params) }.to raise_error
|
47
61
|
end
|
48
62
|
end
|