jaro_winkler 1.2.2 → 1.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7a1229af20c91c12d0f4aeb8fb13dd57cc617344
4
- data.tar.gz: 919b78c919c60d15bdeacdfc84bd6e13952e6181
3
+ metadata.gz: 036b901c1de03b193a10e71951a6c241c4fa31ef
4
+ data.tar.gz: 3cbf7ec98a3d56e7f522345061f4395e18f1a7ca
5
5
  SHA512:
6
- metadata.gz: 56011763a68a13b9b04973a33b3575c768f81c6257d42284e99bcdf94c66896493cf68fad4e33338081b251fb5d1d93f0f6815e82eee7a8e33b0e97d5707826a
7
- data.tar.gz: 79e38669e7d5d4a7582af83a76bee06b49e3b34304708ac5a0d5c8be661dbfc4196a81157b636d8ed4266645af4fcc671b6afcde506588fa3af56d8178961b92
6
+ metadata.gz: 1fa3be807f2cac992dc419df83267ab73d8924d32b1f341cab9d6adfcf31211fd5a89b5b2badb6fe36c78617ccf1524d4f6d9fa785ba50c3ab46481d0033a14f
7
+ data.tar.gz: 54682556479a596049b1f927dd546cfccefb61c0db6ce0f9ae4fec1790780839910ffd7985ec8dccb5f8ed5ce8c1a36498202841bf5d1d7485b1bdc985dd8cdb
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # About
2
2
 
3
- It's a implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm, it uses C extension and will fallback to pure Ruby version in JRuby.
3
+ It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and will fall back to a pure Ruby version in JRuby. Both implementations support UTF-8 strings.
4
4
 
5
5
  **Windows Issue**
6
6
 
@@ -34,7 +34,7 @@ JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
34
34
 
35
35
  Name | Type | Default | Note
36
36
  ----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
37
- case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
37
+ case_match | boolean | false | All lower case characters are converted to upper case prior to the comparison.
38
38
  weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
39
39
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
40
40
 
data/Rakefile CHANGED
@@ -5,3 +5,15 @@ require "rake/extensiontask"
5
5
  Rake::ExtensionTask.new("jaro_winkler") do |ext|
6
6
  ext.lib_dir = "lib/jaro_winkler"
7
7
  end
8
+
9
+ task :benchmark do
10
+ ROOT_PATH = File.expand_path('..', __FILE__)
11
+ LIB_PATH = File.join(ROOT_PATH, 'lib')
12
+ BENCHMARK_PATH = File.join(ROOT_PATH, 'benchmark')
13
+ Dir[File.join(BENCHMARK_PATH, '*.rb')].each do |path|
14
+ output_path = File.join(BENCHMARK_PATH, File.basename(path, '*.rb').sub('.rb', '.txt'))
15
+ cmd = "RUBYLIB=#{LIB_PATH} ruby #{path} > #{output_path}"
16
+ puts cmd
17
+ system(cmd)
18
+ end
19
+ end
@@ -24,9 +24,3 @@ Benchmark.bmbm do |x|
24
24
  n.times{ ary.each{ |str1, str2| Amatch::Jaro.new(str1).match(str2) } }
25
25
  end
26
26
  end
27
-
28
- # user system total real
29
- # jaro_winkler 0.420000 0.000000 0.420000 ( 0.426742)
30
- # fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.160146)
31
- # hotwater 0.300000 0.000000 0.300000 ( 0.297350)
32
- # amatch 0.980000 0.010000 0.990000 ( 0.982874)
@@ -0,0 +1,5 @@
1
+ Rehearsal ----------------------------------------------------
2
+ jaro_winkler 0.380000 0.000000 0.380000 ( 0.376303)
3
+ fuzzystringmatch 0.350000 0.020000 0.370000 ( 0.369344)
4
+ hotwater 0.280000 0.000000 0.280000 ( 0.281579)
5
+ amatch
@@ -14,7 +14,3 @@ Benchmark.bmbm do |x|
14
14
  n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
15
15
  end
16
16
  end
17
-
18
- # user system total real
19
- # jaro_winkler 12.480000 0.010000 12.490000 ( 12.497828)
20
- # fuzzystringmatch 14.990000 0.010000 15.000000 ( 15.014898)
@@ -0,0 +1,8 @@
1
+ Rehearsal ----------------------------------------------------
2
+ jaro_winkler 12.520000 0.020000 12.540000 ( 12.548948)
3
+ fuzzystringmatch 15.370000 0.020000 15.390000 ( 15.408540)
4
+ ------------------------------------------ total: 27.930000sec
5
+
6
+ user system total real
7
+ jaro_winkler 12.750000 0.030000 12.780000 ( 12.782842)
8
+ fuzzystringmatch 16.240000 0.030000 16.270000 ( 16.287380)
@@ -21,11 +21,10 @@ static int char_bytes_num(char first_char){
21
21
  else return 1;
22
22
  }
23
23
 
24
- static unsigned long* codepoints(const char *str, int *ret_len){
25
- int str_len = strlen(str);
26
- unsigned long *ret = calloc(str_len, sizeof(long));
24
+ static unsigned long* codepoints(const char *str, int byte_len, int *ret_len){
25
+ unsigned long *ret = calloc(byte_len, sizeof(long));
27
26
  int count = 0;
28
- for(int i = 0; i < str_len;){
27
+ for(int i = 0; i < byte_len;){
29
28
  int bytes_num = char_bytes_num(str[i]);
30
29
  memcpy(&ret[count], &str[i], bytes_num);
31
30
  count++;
@@ -35,13 +34,13 @@ static unsigned long* codepoints(const char *str, int *ret_len){
35
34
  return ret;
36
35
  }
37
36
 
38
- double c_distance(char *s1, char *s2, Option *opt){
37
+ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt){
39
38
  // set default option if NULL passed
40
39
  int free_opt_flag = 0;
41
40
  if(!opt){ free_opt_flag = 1; opt = option_new(); }
42
41
 
43
42
  int ary_1_len, ary_2_len;
44
- unsigned long *ary_1 = codepoints(s1, &ary_1_len), *ary_2 = codepoints(s2, &ary_2_len);
43
+ unsigned long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
45
44
 
46
45
  if(opt->case_match){
47
46
  for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
@@ -6,7 +6,7 @@ typedef struct{
6
6
  double weight, threshold;
7
7
  } Option;
8
8
 
9
- double c_distance(char *s1, char *s2, Option *opt);
9
+ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt);
10
10
  Option* option_new();
11
11
 
12
12
  #endif /* DISTANCE_H */
@@ -21,8 +21,7 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
21
21
  if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
22
22
  if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
23
23
  }
24
- // TODO: replace StringValueCStr with StringValuePtr and RSTRING_LEN
25
- VALUE ret = rb_float_new(c_distance(StringValueCStr(s1), StringValueCStr(s2), c_opt));
24
+ VALUE ret = rb_float_new(c_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
26
25
  free(c_opt);
27
26
  return ret;
28
27
  }
@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
10
10
  spec.authors = ["Jian Weihang"]
11
11
  spec.email = ["tonytonyjan@gmail.com"]
12
12
  spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless JaroWinkler.fallback?
13
- spec.summary = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
14
- spec.description = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
13
+ spec.summary = %q{Ruby & C implementation of Jaro-Winkler distance algorithm which both support UTF-8 string.}
14
+ spec.description = %q{It's a implementation of Jaro-Winkler distance algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.}
15
15
  spec.homepage = "https://github.com/tonytonyjan/jaro_winkler"
16
16
  spec.license = "MIT"
17
17
 
@@ -47,7 +47,7 @@ module JaroWinkler
47
47
  options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
48
48
  weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
49
49
  raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
50
- s1, s2 = s1.downcase, s2.downcase if case_match
50
+ s1, s2 = s1.upcase, s2.upcase if case_match
51
51
  distance = jaro_distance(s1, s2)
52
52
  prefix = 0
53
53
  max_length = [4, s1.length, s2.length].min
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.2.2"
2
+ VERSION = "1.2.3"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.2
4
+ version: 1.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
@@ -52,7 +52,9 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
- description: Pure Ruby implementation of Jaro-Winkler distance algorithm.
55
+ description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension
56
+ and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8
57
+ string.
56
58
  email:
57
59
  - tonytonyjan@gmail.com
58
60
  executables: []
@@ -67,7 +69,9 @@ files:
67
69
  - README.md
68
70
  - Rakefile
69
71
  - benchmark/native.rb
72
+ - benchmark/native.txt
70
73
  - benchmark/pure.rb
74
+ - benchmark/pure.txt
71
75
  - ext/jaro_winkler/distance.c
72
76
  - ext/jaro_winkler/distance.h
73
77
  - ext/jaro_winkler/extconf.rb
@@ -103,7 +107,8 @@ rubyforge_project:
103
107
  rubygems_version: 2.4.1
104
108
  signing_key:
105
109
  specification_version: 4
106
- summary: Pure Ruby implementation of Jaro-Winkler distance algorithm.
110
+ summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
111
+ UTF-8 string.
107
112
  test_files:
108
113
  - spec/jaro_winkler_spec.rb
109
114
  - spec/spec_helper.rb