jaro_winkler 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5ca8266cce395bbcf2b425545e7a426110ef2601
4
- data.tar.gz: 23689730155a05387f5908b4661271f20aa118af
3
+ metadata.gz: 9a1a6510f807b126a00145ca02323746c1885409
4
+ data.tar.gz: 3cc743dde90a2dc0f682af51b506da4d4dc8e6ed
5
5
  SHA512:
6
- metadata.gz: 67661a5f12b0204e76cf50fc44bad3a912b3a570602d18402e29065a6d75455cf7666b439f1aecb8b4d8f77b380cc5af8ff0b7c1f143b557b6fb4488198c19cf
7
- data.tar.gz: cb172f54e6ac951d068ad805e157d31c233335e50ecf81c89c15d7cdf50b3b092cbfda044b962639b8e0f48d760bcddfb2277893d70a83b018e63cad97084e0f
6
+ metadata.gz: 02fec80fb44db8e6efed4b6cf083d55e4356cf706827c85850db8ef1c409a483e90c20e6de8437f0282d9621d46513a80061d1dd333d6b4b45b85d81f5c73bd5
7
+ data.tar.gz: e6e43be13fb520ddfa88206488689f4329c68003b0c4c1c40b69192c6df3d7bcc323f18316f21392ebf0da0dc2fce6949d43b8d8833ad72fe59ca288ebce3132
data/README.md CHANGED
@@ -20,10 +20,12 @@ JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
20
20
  # => 0.9778
21
21
 
22
22
  # Native
23
- JaroWinkler.c_distance "MARTHA", "MARHTA"
23
+ JaroWinkler.c_distance "MARTHA", "MARHTA" # Recommended: about 7 times faster than the call below.
24
24
  JaroWinkler.distance "MARTHA", "MARHTA", native: true
25
25
  ```
26
26
 
27
+ **Both implementations support UTF-8 strings.**
28
+
27
29
  ## Options
28
30
 
29
31
  Name | Type | Default | Note
@@ -31,14 +33,7 @@ Name | Type | Default | Note
31
33
  case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
32
34
  weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
33
35
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above this value.
34
- native | boolean | false | Use native version, note that it omits all the other options.
35
-
36
- ## Pure Ruby v.s. Native
37
-
38
- | Pure | Native
39
- -------------- | ---- | ------
40
- UTF-8 Support | Yes | No
41
- Option Setting | Yes | No
36
+ native | boolean | false | Use native version.
42
37
 
43
38
  # Why This?
44
39
 
@@ -90,10 +85,5 @@ end
90
85
 
91
86
  # Todo
92
87
 
93
- - Speed up `#distance(s1, s2, native: true)`
94
- - Support UTF-8 in native version.
95
- - Add more optoins to natvie version.
96
- - case_match
97
- - weight
98
- - threshold
99
- - adjusting word table (It's from the original C implementation.)
88
+ - Adjusting word table (refer to the original C implementation).
89
+ - Remove `#c_distance`, use the C extension by default, and fall back to pure Ruby on the Java platform.
@@ -20,6 +20,6 @@ Benchmark.bmbm do |x|
20
20
  end
21
21
 
22
22
  # user system total real
23
- # #c_distance(s1, s2) 0.270000 0.000000 0.270000 ( 0.270250)
24
- # #distance(s1, s2, native: true) 2.030000 0.050000 2.080000 ( 2.075878)
25
- # fuzzystringmatch 0.140000 0.000000 0.140000 ( 0.141239)
23
+ # #c_distance(s1, s2) 0.350000 0.000000 0.350000 ( 0.349109)
24
+ # #distance(s1, s2, native: true) 2.480000 0.050000 2.530000 ( 2.526027)
25
+ # fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.155539)
@@ -0,0 +1,99 @@
1
+ #include <string.h>
2
+ #include <stdlib.h>
3
+ #include <ctype.h>
4
+ #include "distance.h"
5
+
6
+ Option* option_new(){
7
+ Option *opt = calloc(1, sizeof(Option));
8
+ opt->case_match = 0;
9
+ opt->weight = 0.1;
10
+ opt->threshold = 0.7;
11
+ return opt;
12
+ }
13
+
14
/*
 * Number of bytes in the UTF-8 sequence introduced by first_char, derived
 * from the count of leading 1-bits in the lead byte.
 *
 * Any byte below 0xC0 (ASCII or a continuation byte) counts as a
 * single-byte character.
 */
static int char_bytes_num(char first_char){
  const unsigned char lead = (unsigned char)first_char;
  if(lead < 192) return 1; // 0xxxxxxx or 10xxxxxx
  if(lead < 224) return 2; // 110xxxxx
  if(lead < 240) return 3; // 1110xxxx
  if(lead < 248) return 4; // 11110xxx
  if(lead < 252) return 5; // 111110xx
  return 6;                // 1111110x and above
}
23
+
24
/*
 * Split a NUL-terminated UTF-8 string into one unsigned long per character.
 *
 * Each element packs the raw bytes of one character, lead byte first, so
 * equal characters always yield equal values and a single-byte character
 * keeps its plain byte value. The values are comparison keys, not decoded
 * Unicode scalar values.
 *
 * str     - NUL-terminated input string.
 * ret_len - out parameter; receives the number of characters stored.
 *
 * Returns a heap array the caller must free(). Returns NULL with
 * *ret_len == 0 when the allocation fails (calloc may also legitimately
 * return NULL for an empty string).
 */
static unsigned long* codepoints(const char *str, int *ret_len){
  size_t str_len = strlen(str);
  // One slot per byte is an upper bound on the number of characters.
  unsigned long *ret = calloc(str_len, sizeof(unsigned long));
  int count = 0;
  if(ret){
    for(size_t i = 0; i < str_len;){
      size_t bytes_num = (size_t)char_bytes_num(str[i]);
      // A truncated sequence at the end of the string must not make us read
      // past the terminator.
      if(bytes_num > str_len - i) bytes_num = str_len - i;
      // Shift the bytes in one by one instead of memcpy-ing them: this never
      // writes beyond a single element (long may be only 4 bytes wide) and
      // gives the same value on little- and big-endian hosts. Where long is
      // 4 bytes, the obsolete 5/6-byte forms lose their leading byte(s).
      unsigned long packed = 0;
      for(size_t k = 0; k < bytes_num; ++k) packed = (packed << 8) | (unsigned char)str[i + k];
      ret[count++] = packed;
      i += bytes_num;
    }
  }
  *ret_len = count;
  return ret;
}
37
+
38
+ double c_distance(char *s1, char *s2, Option *opt){
39
+ // set default option if NULL passed
40
+ int free_opt_flag = 0;
41
+ if(!opt){ free_opt_flag = 1; opt = option_new(); }
42
+
43
+ int ary_1_len, ary_2_len;
44
+ unsigned long *ary_1 = codepoints(s1, &ary_1_len), *ary_2 = codepoints(s2, &ary_2_len);
45
+
46
+ if(opt->case_match){
47
+ for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
48
+ for(int i = 0; i < ary_2_len; ++i) if(ary_2[i] < 256 && islower(ary_2[i])) ary_2[i] -= 32;
49
+ }
50
+
51
+ // Guarantee the order
52
+ if(ary_1_len > ary_2_len){
53
+ unsigned long *tmp = ary_1; ary_1 = ary_2; ary_2 = tmp;
54
+ int tmp2 = ary_1_len; ary_1_len = ary_2_len; ary_2_len = tmp2;
55
+ }
56
+ int window_size = ary_2_len / 2 - 1;
57
+ if(window_size < 0) window_size = 0;
58
+ double matches = 0.0;
59
+ int transpositions = 0;
60
+ int previous_index = -1;
61
+ int max_index = ary_2_len - 1;
62
+ for(int i = 0; i < ary_1_len; i++){
63
+ int left = i - window_size;
64
+ int right = i + window_size;
65
+ if(left < 0) left = 0;
66
+ if(right > max_index) right = max_index;
67
+ char matched = 0;
68
+ char found = 0;
69
+ for(int j = left; j <= right; j++){
70
+ if(ary_1[i] == ary_2[j]){
71
+ matched = 1;
72
+ if(!found){
73
+ if(j > previous_index){
74
+ previous_index = j;
75
+ found = 1;
76
+ }
77
+ } // if(!found){
78
+ } // if(ary_1[i] == ary_2[j]){
79
+ } // for(int j = left; j <= right; j++){
80
+ if(matched){
81
+ matches++;
82
+ if(!found) transpositions++;
83
+ }
84
+ } // for(int i = 0; i < ary_1_len; i++){
85
+ // Don't divide transpositions by 2 since it's been counted directly by above code.
86
+ double jaro_distance = matches == 0 ? 0 : (matches / ary_1_len + matches / ary_2_len + (matches - transpositions) / matches) / 3.0;
87
+
88
+ // calculate jaro-winkler distance
89
+ double threshold = opt->threshold, weight = opt->weight;
90
+ int prefix = 0;
91
+ int max_length = ary_1_len > 4 ? 4 : ary_1_len;
92
+ for(int i = 0; i < max_length; ++i){
93
+ if(ary_1[i] == ary_2[i]) prefix++;
94
+ else break;
95
+ }
96
+ free(ary_1); free(ary_2);
97
+ if(free_opt_flag) free(opt);
98
+ return jaro_distance < threshold ? jaro_distance : jaro_distance + ((prefix * weight) * (1 - jaro_distance));
99
+ }
@@ -0,0 +1,12 @@
1
#ifndef DISTANCE_H
#define DISTANCE_H 1

/* Settings accepted by c_distance(). */
typedef struct{
  char case_match;          /* non-zero: ignore letter case when comparing */
  double weight, threshold; /* prefix scaling factor; minimum Jaro score that earns the prefix bonus */
} Option;

/* Jaro-Winkler similarity of s1 and s2; a NULL opt selects the defaults. */
double c_distance(char *s1, char *s2, Option *opt);

/* Heap-allocates an Option holding the defaults; release it with free(). */
Option* option_new(void); /* (void) makes this a real prototype, so stray arguments are rejected */

#endif /* DISTANCE_H */
@@ -1,32 +1,27 @@
1
1
  #include "jaro_winkler.h"
2
- #define MAX(X,Y) ((X) < (Y) ? (Y) : (X))
3
- VALUE rb_mJaroWinkler;
4
-
5
- static VALUE distance(VALUE self, VALUE s1, VALUE s2){
6
- Check_Type(s1, T_STRING); Check_Type(s2, T_STRING);
7
- // Check encoding
8
- VALUE s1_ascii_only = TYPE(rb_funcall(s1, rb_intern("ascii_only?"), 0));
9
- VALUE s2_ascii_only = TYPE(rb_funcall(s2, rb_intern("ascii_only?"), 0));
10
- if(s1_ascii_only == T_FALSE || s2_ascii_only == T_FALSE) printf("WARNING: Non-ASCII string detected.\n");
2
+ #include "distance.h"
11
3
 
12
- VALUE *s1_ptr = &s1, *s2_ptr = &s2;
13
- // guarantee the length of s1_ptr is less than or equal to that of s2_ptr
14
- if(RSTRING_LEN(s1) > RSTRING_LEN(s2)){ VALUE *tmp = s1_ptr; s1_ptr = s2_ptr; s2_ptr = tmp; }
15
- int min_length = RSTRING_LEN(*s1_ptr), max_length = RSTRING_LEN(*s2_ptr);
16
- char *c_s1_ptr = StringValuePtr(*s1_ptr), *c_s2_ptr = StringValuePtr(*s2_ptr);
17
- int opt[] = {1, 0};
18
- if(min_length != max_length){
19
- // padding spaces
20
- char buf[max_length];
21
- for(int i = min_length; i < max_length; ++i) buf[i] = ' ';
22
- memcpy(buf, c_s1_ptr, min_length);
23
- c_s1_ptr = buf;
24
- return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
25
- }
26
- return rb_float_new(strcmp95(c_s1_ptr, c_s2_ptr, max_length, opt));
27
- }
4
+ VALUE rb_mJaroWinkler;
28
5
 
29
6
  void Init_jaro_winkler(void){
30
7
  rb_mJaroWinkler = rb_define_module("JaroWinkler");
31
- rb_define_module_function(rb_mJaroWinkler, "c_distance", distance, 2);
8
+ rb_define_module_function(rb_mJaroWinkler, "c_distance", rb_distance, -1);
32
9
  }
10
+
11
+ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
12
+ VALUE s1, s2, opt;
13
+ rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
14
+ Option *c_opt = option_new();
15
+ if(TYPE(opt) == T_HASH){
16
+ VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight")));
17
+ VALUE threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold")));
18
+ VALUE case_match = rb_hash_aref(opt, ID2SYM(rb_intern("case_match")));
19
+ if(!NIL_P(weight)) c_opt->weight = NUM2DBL(weight);
20
+ if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
21
+ if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
22
+ }
23
+ // TODO: replace StringValueCStr with StringValuePtr and RSTRING_LEN
24
+ VALUE ret = rb_float_new(c_distance(StringValueCStr(s1), StringValueCStr(s2), c_opt));
25
+ free(c_opt);
26
+ return ret;
27
+ }
@@ -3,6 +3,6 @@
3
3
 
4
4
  #include "ruby.h"
5
5
 
6
- double strcmp95(char *ying, char *yang, long y_length, int *ind_c);
6
+ VALUE rb_distance(int argc, VALUE *argv, VALUE obj);
7
7
 
8
8
  #endif /* JARO_WINKLER_H */
@@ -2,13 +2,14 @@ require 'jaro_winkler/jaro_winkler.so' unless RUBY_PLATFORM == 'java'
2
2
  module JaroWinkler
3
3
  module_function
4
4
  def jaro_distance s1, s2
5
- return 0.0 if s1.empty? || s2.empty?
5
+ length1, length2 = s1.length, s2.length
6
6
  # Guarantee the length order
7
7
  if s1.length > s2.length
8
- tmp = s1; s1 = s2; s2 = tmp
8
+ s1, s2 = s2, s1
9
+ length1, length2 = length2, length1
9
10
  end
10
- length1, length2 = s1.length, s2.length
11
- window_size = (s2.length / 2) - 1
11
+ window_size = (length2 / 2) - 1
12
+ window_size = 0 if window_size < 0
12
13
  matches = 0.0
13
14
  transpositions = 0
14
15
  previous_index = -1
@@ -43,7 +44,7 @@ module JaroWinkler
43
44
 
44
45
  def distance s1, s2, options = {}
45
46
  options = {weight: 0.1, threshold: 0.7, case_match: false, native: false}.merge options
46
- return c_distance(s1, s2) if RUBY_PLATFORM != 'java' && options[:native]
47
+ return c_distance(s1, s2, options) if RUBY_PLATFORM != 'java' && options[:native]
47
48
  weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
48
49
  raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
49
50
  s1, s2 = s1.downcase, s2.downcase if case_match
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.1.1"
2
+ VERSION = "1.2.0"
3
3
  end
@@ -13,6 +13,7 @@ describe JaroWinkler do
13
13
  ['fvie', 'ten', 0.0],
14
14
  ['tony', 'tony', 1.0],
15
15
  ['tonytonyjan', 'tonytonyjan', 1.0],
16
+ ['x', 'x', 1.0],
16
17
  ['', '', 0.0],
17
18
  ['tony', '', 0.0],
18
19
  ['', 'tony', 0.0],
@@ -29,16 +30,22 @@ describe JaroWinkler do
29
30
 
30
31
  it 'supports C extension' do
31
32
  @ary.each do |s1, s2, ans|
32
- expect(distance(s1, s2, native: true)).to be_within(0.0001).of(ans)
33
+ expect(c_distance(s1, s2)).to be_within(0.0001).of(ans)
33
34
  end
34
35
  end
35
36
 
37
+ it 'works with UTF-8' do
38
+ expect(distance('變形金剛4:絕跡重生', '變形金剛4: 絕跡重生')).to eq c_distance('0123456789', '01234x56789')
39
+ end
40
+
36
41
  it 'can ignore case' do
37
42
  expect(distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
43
+ expect(c_distance('MARTHA', 'marhta', case_match: true)).to be_within(0.0001).of(0.9611)
38
44
  end
39
45
 
40
46
  it 'can set weight' do
41
47
  expect(distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
48
+ expect(c_distance('MARTHA', 'MARHTA', weight: 0.2)).to be_within(0.0001).of(0.9778)
42
49
  expect{ distance('MARTHA', 'MARHTA', weight: 0.26) }.to raise_error
43
50
  end
44
51
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
@@ -68,6 +68,8 @@ files:
68
68
  - Rakefile
69
69
  - benchmark/native.rb
70
70
  - benchmark/pure.rb
71
+ - ext/jaro_winkler/distance.c
72
+ - ext/jaro_winkler/distance.h
71
73
  - ext/jaro_winkler/extconf.rb
72
74
  - ext/jaro_winkler/jaro_winkler.c
73
75
  - ext/jaro_winkler/jaro_winkler.h