jaro_winkler 1.2.7 → 1.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/ext/jaro_winkler/distance.c +2 -2
- data/ext/jaro_winkler/distance.h +1 -1
- data/ext/jaro_winkler/jaro_winkler.c +2 -2
- data/lib/jaro_winkler.rb +3 -3
- data/lib/jaro_winkler/version.rb +1 -1
- data/spec/jaro_winkler_spec.rb +23 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13eaf9c47df66ba5c9883611ee7a1c8468cc9e7a
|
4
|
+
data.tar.gz: 7c43db047cfb1aaade4c0d6f2c907ed19f8a79ab
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57421be340741b44879c3104689363a7ca2897014a6e9cd0c3fcaa524b29abf1a5db370cab72e4de0299f26ca2e14357ce24f7b8b7cafce97ca1527c27c46798
|
7
|
+
data.tar.gz: 8db803506546b4a99dd7e267bc561c90664282b4c8dc253f22695a0a8cc55df64e559be3c7e68cb26e05fd4ca08626481a82c82128f8616e0a2b378b29012413
|
data/README.md
CHANGED
@@ -14,7 +14,7 @@ gem install jaro_winkler
|
|
14
14
|
require 'jaro_winkler'
|
15
15
|
JaroWinkler.distance "MARTHA", "MARHTA"
|
16
16
|
# => 0.9611
|
17
|
-
JaroWinkler.distance "MARTHA", "marhta",
|
17
|
+
JaroWinkler.distance "MARTHA", "marhta", ignore_case: true
|
18
18
|
# => 0.9611
|
19
19
|
JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
|
20
20
|
# => 0.9778
|
@@ -30,7 +30,7 @@ JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
|
|
30
30
|
|
31
31
|
Name | Type | Default | Note
|
32
32
|
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
|
33
|
-
|
33
|
+
ignore_case | boolean | false | All lower case characters are converted to upper case prior to the comparison.
|
34
34
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
35
35
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
36
36
|
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Option* option_new(){
|
7
7
|
Option *opt = calloc(1, sizeof(Option));
|
8
|
-
opt->
|
8
|
+
opt->ignore_case = 0;
|
9
9
|
opt->weight = 0.1;
|
10
10
|
opt->threshold = 0.7;
|
11
11
|
return opt;
|
@@ -42,7 +42,7 @@ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt)
|
|
42
42
|
int ary_1_len, ary_2_len;
|
43
43
|
unsigned long long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
|
44
44
|
|
45
|
-
if(opt->
|
45
|
+
if(opt->ignore_case){
|
46
46
|
for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
|
47
47
|
for(int i = 0; i < ary_2_len; ++i) if(ary_2[i] < 256 && islower(ary_2[i])) ary_2[i] -= 32;
|
48
48
|
}
|
data/ext/jaro_winkler/distance.h
CHANGED
data/ext/jaro_winkler/jaro_winkler.c
CHANGED
@@ -15,11 +15,11 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
|
|
15
15
|
if(TYPE(opt) == T_HASH){
|
16
16
|
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")));
|
17
17
|
VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
|
18
|
-
VALUE
|
18
|
+
VALUE ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case")));
|
19
19
|
if(!NIL_P(weight)) c_opt->weight = NUM2DBL(weight);
|
20
20
|
if(c_opt->weight > 0.25) rb_raise(rb_eRuntimeError, "Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1.");
|
21
21
|
if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
|
22
|
-
if(!NIL_P(
|
22
|
+
if(!NIL_P(ignore_case)) c_opt->ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
|
23
23
|
}
|
24
24
|
VALUE ret = rb_float_new(c_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
|
25
25
|
free(c_opt);
|
data/lib/jaro_winkler.rb
CHANGED
@@ -42,10 +42,10 @@ module JaroWinkler
|
|
42
42
|
end
|
43
43
|
|
44
44
|
def r_distance s1, s2, options = {}
|
45
|
-
options = {weight: 0.1, threshold: 0.7,
|
46
|
-
weight, threshold,
|
45
|
+
options = {weight: 0.1, threshold: 0.7, ignore_case: false}.merge options
|
46
|
+
weight, threshold, ignore_case = options[:weight], options[:threshold], options[:ignore_case]
|
47
47
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
48
|
-
s1, s2 = s1.upcase, s2.upcase if
|
48
|
+
s1, s2 = s1.upcase, s2.upcase if ignore_case
|
49
49
|
distance = jaro_distance(s1, s2)
|
50
50
|
prefix = 0
|
51
51
|
max_length = [4, s1.length, s2.length].min
|
data/lib/jaro_winkler/version.rb
CHANGED
data/spec/jaro_winkler_spec.rb
CHANGED
@@ -31,18 +31,32 @@ describe JaroWinkler do
|
|
31
31
|
end
|
32
32
|
|
33
33
|
it 'works with UTF-8' do
|
34
|
-
expect(c_distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to
|
34
|
+
expect(c_distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to be_within(0.0001).of(0.9818)
|
35
|
+
expect(c_distance('連勝文', '連勝丼')).to be_within(0.0001).of(0.8222)
|
36
|
+
expect(c_distance('馬英九', '馬英丸')).to be_within(0.0001).of(0.8222)
|
35
37
|
end
|
36
38
|
|
37
|
-
it '
|
38
|
-
|
39
|
-
expect(
|
39
|
+
it 'sets ignore_case' do
|
40
|
+
params = 'MARTHA', 'marhta', {ignore_case: true}
|
41
|
+
expect(r_distance(*params)).to be_within(0.0001).of(0.9611)
|
42
|
+
expect(c_distance(*params)).to be_within(0.0001).of(0.9611)
|
40
43
|
end
|
41
44
|
|
42
|
-
it '
|
43
|
-
|
44
|
-
expect(
|
45
|
-
expect
|
46
|
-
|
45
|
+
it 'sets weight' do
|
46
|
+
params = 'MARTHA', 'MARHTA', {weight: 0.2}
|
47
|
+
expect(r_distance(*params)).to be_within(0.0001).of(0.9778)
|
48
|
+
expect(c_distance(*params)).to be_within(0.0001).of(0.9778)
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'sets threshold' do
|
52
|
+
params = 'MARTHA', 'MARHTA', {threshold: 0.99}
|
53
|
+
expect(r_distance(*params)).to be_within(0.0001).of(0.9445)
|
54
|
+
expect(c_distance(*params)).to be_within(0.0001).of(0.9445)
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'throws exception when weight exceeding 0.25' do
|
58
|
+
params = 'MARTHA', 'MARHTA', {weight: 0.26}
|
59
|
+
expect{ r_distance(*params) }.to raise_error
|
60
|
+
expect{ c_distance(*params) }.to raise_error
|
47
61
|
end
|
48
62
|
end
|