jaro_winkler 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5ca8266cce395bbcf2b425545e7a426110ef2601
4
- data.tar.gz: 23689730155a05387f5908b4661271f20aa118af
3
+ metadata.gz: 9a1a6510f807b126a00145ca02323746c1885409
4
+ data.tar.gz: 3cc743dde90a2dc0f682af51b506da4d4dc8e6ed
5
5
  SHA512:
6
- metadata.gz: 67661a5f12b0204e76cf50fc44bad3a912b3a570602d18402e29065a6d75455cf7666b439f1aecb8b4d8f77b380cc5af8ff0b7c1f143b557b6fb4488198c19cf
7
- data.tar.gz: cb172f54e6ac951d068ad805e157d31c233335e50ecf81c89c15d7cdf50b3b092cbfda044b962639b8e0f48d760bcddfb2277893d70a83b018e63cad97084e0f
6
+ metadata.gz: 02fec80fb44db8e6efed4b6cf083d55e4356cf706827c85850db8ef1c409a483e90c20e6de8437f0282d9621d46513a80061d1dd333d6b4b45b85d81f5c73bd5
7
+ data.tar.gz: e6e43be13fb520ddfa88206488689f4329c68003b0c4c1c40b69192c6df3d7bcc323f18316f21392ebf0da0dc2fce6949d43b8d8833ad72fe59ca288ebce3132
data/README.md CHANGED
@@ -20,10 +20,12 @@ JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
20
20
  # => 0.9778
21
21
 
22
22
  # Native
23
- JaroWinkler.c_distance "MARTHA", "MARHTA"
23
+ JaroWinkler.c_distance "MARTHA", "MARHTA" # Recommended: about 7 times faster than the `native: true` call below.
24
24
  JaroWinkler.distance "MARTHA", "MARHTA", native: true
25
25
  ```
26
26
 
27
+ **Both implementations support UTF-8 strings.**
28
+
27
29
  ## Options
28
30
 
29
31
  Name | Type | Default | Note
@@ -31,14 +33,7 @@ Name | Type | Default | Note
31
33
  case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
32
34
  weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
33
35
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above this value.
34
- native | boolean | false | Use native version, note that it omits all the other options.
35
-
36
- ## Pure Ruby v.s. Native
37
-
38
- | Pure | Native
39
- -------------- | ---- | ------
40
- UTF-8 Support | Yes | No
41
- Option Setting | Yes | No
36
+ native | boolean | false | Use native version.
42
37
 
43
38
  # Why This?
44
39
 
@@ -90,10 +85,5 @@ end
90
85
 
91
86
  # Todo
92
87
 
93
- - Speed up `#distance(s1, s2, native: true)`
94
- - Support UTF-8 in native version.
95
- - Add more optoins to natvie version.
96
- - case_match
97
- - weight
98
- - threshold
99
- - adjusting word table (It's from the original C implementation.)
88
+ - Adjusting word table (Reference to original C implementation.)
89
+ - Remove `#c_distance`, use the C extension by default, and fall back to Ruby on the Java platform.
@@ -20,6 +20,6 @@ Benchmark.bmbm do |x|
20
20
  end
21
21
 
22
22
  # user system total real
23
- # #c_distance(s1, s2) 0.270000 0.000000 0.270000 ( 0.270250)
24
- # #distance(s1, s2, native: true) 2.030000 0.050000 2.080000 ( 2.075878)
25
- # fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.141239)
23
+ # #c_distance(s1, s2) 0.350000 0.000000 0.350000 ( 0.349109)
24
+ # #distance(s1, s2, native: true) 2.480000 0.050000 2.530000 ( 2.526027)
25
+ # fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.155539)
@@ -0,0 +1,99 @@
1
+ #include <string.h>
2
+ #include <stdlib.h>
3
+ #include <ctype.h>
4
+ #include "distance.h"
5
+
6
+ Option* option_new(){
7
+ Option *opt = calloc(1, sizeof(Option));
8
+ opt->case_match = 0;
9
+ opt->weight = 0.1;
10
+ opt->threshold = 0.7;
11
+ return opt;
12
+ }
13
+
14
/*
 * Length in bytes of the UTF-8 sequence introduced by the lead byte
 * first_char. The legacy 5- and 6-byte lead forms are recognised too; any
 * byte below 0xC0 (ASCII, or a stray continuation byte) counts as a 1-byte
 * character.
 */
static int char_bytes_num(char first_char){
  unsigned char c = (unsigned char)first_char;
  if(c >= 252) return 6;      // 1111110x
  else if(c >= 248) return 5; // 111110xx
  else if(c >= 240) return 4; // 11110xxx
  else if(c >= 224) return 3; // 1110xxxx
  else if(c >= 192) return 2; // 110xxxxx
  else return 1;
}

/*
 * Split the NUL-terminated string str into UTF-8 characters and return one
 * unsigned long per character.
 *
 * Each value packs the character's raw bytes, first byte in the least
 * significant position. It is an opaque key meant for equality comparison,
 * not a decoded Unicode code point; a single-byte character's value is simply
 * that byte, on every platform.
 *
 * A sequence cut short by the end of the string is packed from the bytes that
 * are actually present. Bytes that do not fit into an unsigned long are
 * dropped from the key (only possible for the 5-/6-byte forms when long is
 * narrower than 6 bytes).
 *
 * str      input string, must not be NULL
 * ret_len  receives the number of characters stored (0 on failure)
 *
 * Returns a buffer the caller must free(), or NULL if allocation fails.
 */
static unsigned long* codepoints(const char *str, int *ret_len){
  size_t str_len = strlen(str);
  *ret_len = 0;
  // Request at least one slot so that NULL unambiguously means failure.
  unsigned long *ret = calloc(str_len ? str_len : 1, sizeof *ret);
  if(!ret) return NULL;
  int count = 0;
  for(size_t i = 0; i < str_len;){
    size_t bytes_num = (size_t)char_bytes_num(str[i]);
    // Never read past the terminator when the last sequence is truncated.
    if(bytes_num > str_len - i) bytes_num = str_len - i;
    // Never write more bytes than one slot can hold.
    size_t packed = bytes_num < sizeof *ret ? bytes_num : sizeof *ret;
    unsigned long value = 0;
    for(size_t k = 0; k < packed; ++k)
      value |= (unsigned long)(unsigned char)str[i + k] << (8 * k);
    ret[count++] = value;
    i += bytes_num;
  }
  *ret_len = count;
  return ret;
}
37
+
38
+ double c_distance(char *s1, char *s2, Option *opt){
39
+ // set default option if NULL passed
40
+ int free_opt_flag = 0;
41
+ if(!opt){ free_opt_flag = 1; opt = option_new(); }
42
+
43
+ int ary_1_len, ary_2_len;
44
+ unsigned long *ary_1 = codepoints(s1, &ary_1_len), *ary_2 = codepoints(s2, &ary_2_len);
45
+
46
+ if(opt->case_match){
47
+ for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
48
+ for(int i = 0; i < ary_2_len; ++i) if(ary_2[i] < 256 && islower(ary_2[i])) ary_2[i] -= 32;
49
+ }
50
+
51
+ // Guarantee the order
52
+ if(ary_1_len > ary_2_len){
53
+ unsigned long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
54
+ int tmp2 = ary_1_len; ary_1_len = ary_2_len; ary_2_len = tmp2;
55
+ }
56
+ int window_size = ary_2_len / 2 - 1;
57
+ if(window_size < 0) window_size = 0;
58
+ double matches = 0.0;
59
+ int transpositions = 0;
60
+ int previous_index = -1;
61
+ int max_index = ary_2_len - 1;
62
+ for(int i = 0; i < ary_1_len; i++){
63
+ int left = i - window_size;
64
+ int right = i + window_size;
65
+ if(left < 0) left = 0;
66
+ if(right > max_index) right = max_index;
67
+ char matched = 0;
68
+ char found = 0;
69
+ for(int j = left; j <= right; j++){
70
+ if(ary_1[i] == ary_2[j]){
71
+ matched = 1;
72
+ if(!found){
73
+ if(j > previous_index){
74
+ previous_index = j;
75
+ found = 1;
76
+ }
77
+ } // if(!found){
78
+ } // if(ary_1[i] == ary_2[j]){
79
+ } // for(int j = left; j <= right; j++){
80
+ if(matched){
81
+ matches++;
82
+ if(!found) transpositions++;
83
+ }
84
+ } // for(int i = 0; i < ary_1_len; i++){
85
+ // Don't divide transpositions by 2 since it's been counted directly by above code.
86
+ double jaro_distance = matches == 0 ? 0 : (matches / ary_1_len + matches / ary_2_len + (matches - transpositions) / matches) / 3.0;
87
+
88
+ // calculate jaro-winkler distance
89
+ double threshold = opt->threshold, weight = opt->weight;
90
+ int prefix = 0;
91
+ int max_length = ary_1_len > 4 ? 4 : ary_1_len;
92
+ for(int i = 0; i < max_length; ++i){
93
+ if(ary_1[i] == ary_2[i]) prefix++;
94
+ else break;
95
+ }
96
+ free(ary_1); free(ary_2);
97
+ if(free_opt_flag) free(opt);
98
+ return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
99
+ }
@@ -0,0 +1,12 @@
1
+ #ifndef DISTANCE_H
2
+ #define DISTANCE_H 1
3
+
4
+ typedef struct{
5
+ char case_match;
6
+ double weight, threshold;
7
+ } Option;
8
+
9
+ double c_distance(char *s1, char *s2, Option *opt);
10
+ Option* option_new();
11
+
12
+ #endif /* DISTANCE_H */
@@ -1,32 +1,27 @@
1
1
  #include "jaro_winkler.h"
2
- #define MAX(X,Y) ((X) < (Y) ? (Y) : (X))
3
- VALUE rb_mJaroWinkler;
4
-
5
- static VALUE distance(VALUE self, VALUE s1, VALUE s2){
6
- Check_Type(s1, T_STRING); Check_Type(s2, T_STRING);
7
- // Check encoding
8
- VALUE s1_ascii_only = TYPE(rb_funcall(s1, rb_intern("ascii_only?"), 0));
9
- VALUE s2_ascii_only = TYPE(rb_funcall(s2, rb_intern("ascii_only?"), 0));
10
- if(s1_ascii_only == T_FALSE || s2_ascii_only == T_FALSE) printf("WARNING: Non-ASCII string detected.\n");
2
+ #include "distance.h"
11
3
 
12
- VALUE *s1_ptr = &s1, *s2_ptr = &s2;
13
- // guarantee the length of s1_ptr is less than or equal to that of s2_ptr
14
- if(RSTRING_LEN(s1) > RSTRING_LEN(s2)){ VALUE *tmp = s1_ptr; s1_ptr = s2_ptr; s2_ptr = tmp; }
15
- int min_length = RSTRING_LEN(*s1_ptr), max_length = RSTRING_LEN(*s2_ptr);
16
- char *c_s1_ptr = StringValuePtr(*s1_ptr), *c_s2_ptr = StringValuePtr(*s2_ptr);
17
- int opt[] = {1, 0};
18
- if(min_length != max_length){
19
- // padding spaces
20
- char buf[max_length];
21
- for(int i = min_length; i < max_length; ++i) buf[i] = ' ';
22
- memcpy(buf, c_s1_ptr, min_length);
23
- c_s1_ptr = buf;
24
- return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
25
- }
26
- return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
27
- }
4
+ VALUE rb_mJaroWinkler;
28
5
 
29
6
  void Init_jaro_winkler(void){
30
7
  rb_mJaroWinkler = rb_define_module("JaroWinkler");
31
- rb_define_module_function(rb_mJaroWinkler, "c_distance", distance, 2);
8
+ rb_define_module_function(rb_mJaroWinkler, "c_distance", rb_distance, -1);
32
9
  }
10
+
11
+ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
12
+ VALUE s1, s2, opt;
13
+ rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
14
+ Option *c_opt = option_new();
15
+ if(TYPE(opt) == T_HASH){
16
+ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")));
17
+ VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
18
+ VALUE case_match = rb_hash_aref(opt, ID2SYM(rb_intern("case_match")));
19
+ if(!NIL_P(weight)) c_opt->weight = NUM2DBL(weight);
20
+ if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
21
+ if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
22
+ }
23
+ // TODO: replace StringValueCStr with StringValuePtr and RSTRING_LEN
24
+ VALUE ret = rb_float_new(c_distance(StringValueCStr(s1), StringValueCStr(s2), c_opt));
25
+ free(c_opt);
26
+ return ret;
27
+ }
@@ -3,6 +3,6 @@
3
3
 
4
4
  #include "ruby.h"
5
5
 
6
- double strcmp95(char *ying, char *yang, long y_length, int *ind_c);
6
+ VALUE rb_distance(int argc, VALUE *argv, VALUE obj);
7
7
 
8
8
  #endif /* JARO_WINKLER_H */
@@ -2,13 +2,14 @@ require 'jaro_winkler/jaro_winkler.so' unless RUBY_PLATFORM == 'java'
2
2
  module JaroWinkler
3
3
  module_function
4
4
  def jaro_distance s1, s2
5
- return 0.0 if s1.empty? || s2.empty?
5
+ length1, length2 = s1.length, s2.length
6
6
  # Guarantee the length order
7
7
  if s1.length > s2.length
8
- tmp = s1; s1 = s2; s2 = tmp
8
+ s1, s2 = s2, s1
9
+ length1, length2 = length2, length1
9
10
  end
10
- length1, length2 = s1.length, s2.length
11
- window_size = (s2.length / 2) - 1
11
+ window_size = (length2 / 2) - 1
12
+ window_size = 0 if window_size < 0
12
13
  matches = 0.0
13
14
  transpositions = 0
14
15
  previous_index = -1
@@ -43,7 +44,7 @@ module JaroWinkler
43
44
 
44
45
  def distance s1, s2, options = {}
45
46
  options = {weight: 0.1, threshold: 0.7, case_match: false, native: false}.merge options
46
- return c_distance(s1, s2) if RUBY_PLATFORM != 'java' && options[:native]
47
+ return c_distance(s1, s2, options) if RUBY_PLATFORM != 'java' && options[:native]
47
48
  weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
48
49
  raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
49
50
  s1, s2 = s1.downcase, s2.downcase if case_match
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.1.1"
2
+ VERSION = "1.2.0"
3
3
  end
@@ -13,6 +13,7 @@ describe JaroWinkler do
13
13
  ['fvie', 'ten', 0.0],
14
14
  ['tony', 'tony', 1.0],
15
15
  ['tonytonyjan', 'tonytonyjan', 1.0],
16
+ ['x', 'x', 1.0],
16
17
  ['', '', 0.0],
17
18
  ['tony', '', 0.0],
18
19
  ['', 'tony', 0.0],
@@ -29,16 +30,22 @@ describe JaroWinkler do
29
30
 
30
31
  it 'supports C extension' do
31
32
  @ary.each do |s1, s2, ans|
32
- expect(distance(s1, s2, native: true)).to be_within(0.0001).of(ans)
33
+ expect(c_distance(s1, s2)).to be_within(0.0001).of(ans)
33
34
  end
34
35
  end
35
36
 
37
+ it 'works with UTF-8' do
38
+ expect(distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to eq c_distance('0123456789', '01234x56789')
39
+ end
40
+
36
41
  it 'can ignore case' do
37
42
  expect(distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
43
+ expect(c_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
38
44
  end
39
45
 
40
46
  it 'can set weight' do
41
47
  expect(distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
48
+ expect(c_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
42
49
  expect{ distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
43
50
  end
44
51
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
@@ -68,6 +68,8 @@ files:
68
68
  - Rakefile
69
69
  - benchmark/native.rb
70
70
  - benchmark/pure.rb
71
+ - ext/jaro_winkler/distance.c
72
+ - ext/jaro_winkler/distance.h
71
73
  - ext/jaro_winkler/extconf.rb
72
74
  - ext/jaro_winkler/jaro_winkler.c
73
75
  - ext/jaro_winkler/jaro_winkler.h