jaro_winkler 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7a1229af20c91c12d0f4aeb8fb13dd57cc617344
4
- data.tar.gz: 919b78c919c60d15bdeacdfc84bd6e13952e6181
3
+ metadata.gz: 036b901c1de03b193a10e71951a6c241c4fa31ef
4
+ data.tar.gz: 3cbf7ec98a3d56e7f522345061f4395e18f1a7ca
5
5
  SHA512:
6
- metadata.gz: 56011763a68a13b9b04973a33b3575c768f81c6257d42284e99bcdf94c66896493cf68fad4e33338081b251fb5d1d93f0f6815e82eee7a8e33b0e97d5707826a
7
- data.tar.gz: 79e38669e7d5d4a7582af83a76bee06b49e3b34304708ac5a0d5c8be661dbfc4196a81157b636d8ed4266645af4fcc671b6afcde506588fa3af56d8178961b92
6
+ metadata.gz: 1fa3be807f2cac992dc419df83267ab73d8924d32b1f341cab9d6adfcf31211fd5a89b5b2badb6fe36c78617ccf1524d4f6d9fa785ba50c3ab46481d0033a14f
7
+ data.tar.gz: 54682556479a596049b1f927dd546cfccefb61c0db6ce0f9ae4fec1790780839910ffd7985ec8dccb5f8ed5ce8c1a36498202841bf5d1d7485b1bdc985dd8cdb
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # About
2
2
 
3
- It's a implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm, it uses C extension and will fallback to pure Ruby version in JRuby.
3
+ It's an implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It uses a C extension and falls back to a pure Ruby version on JRuby. Both implementations support UTF-8 strings.
4
4
 
5
5
  **Windows Issue**
6
6
 
@@ -34,7 +34,7 @@ JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
34
34
 
35
35
  Name | Type | Default | Note
36
36
  ----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
37
- case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
37
+ case_match | boolean | false | When true, all lower case characters are converted to upper case prior to the comparison, so the comparison ignores case.
38
38
  weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
39
39
  threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
40
40
 
data/Rakefile CHANGED
@@ -5,3 +5,15 @@ require "rake/extensiontask"
5
5
  Rake::ExtensionTask.new("jaro_winkler") do |ext|
6
6
  ext.lib_dir = "lib/jaro_winkler"
7
7
  end
8
+
9
+ task :benchmark do
10
+ ROOT_PATH = File.expand_path('..', __FILE__)
11
+ LIB_PATH = File.join(ROOT_PATH, 'lib')
12
+ BENCHMARK_PATH = File.join(ROOT_PATH, 'benchmark')
13
+ Dir[File.join(BENCHMARK_PATH, '*.rb')].each do |path|
14
+ output_path = File.join(BENCHMARK_PATH, File.basename(path, '*.rb').sub('.rb', '.txt'))
15
+ cmd = "RUBYLIB=#{LIB_PATH} ruby #{path} > #{output_path}"
16
+ puts cmd
17
+ system(cmd)
18
+ end
19
+ end
@@ -24,9 +24,3 @@ Benchmark.bmbm do |x|
24
24
  n.times{ ary.each{ |str1, str2| Amatch::Jaro.new(str1).match(str2) } }
25
25
  end
26
26
  end
27
-
28
- # user system total real
29
- # jaro_winkler 0.420000 0.000000 0.420000 ( 0.426742)
30
- # fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.160146)
31
- # hotwater 0.300000 0.000000 0.300000 ( 0.297350)
32
- # amatch 0.980000 0.010000 0.990000 ( 0.982874)
@@ -0,0 +1,5 @@
1
+ Rehearsal ----------------------------------------------------
2
+ jaro_winkler 0.380000 0.000000 0.380000 ( 0.376303)
3
+ fuzzystringmatch 0.350000 0.020000 0.370000 ( 0.369344)
4
+ hotwater 0.280000 0.000000 0.280000 ( 0.281579)
5
+ amatch
@@ -14,7 +14,3 @@ Benchmark.bmbm do |x|
14
14
  n.times{ ary.each{ |str1, str2| jarow.getDistance(str1, str2) } }
15
15
  end
16
16
  end
17
-
18
- # user system total real
19
- # jaro_winkler 12.480000 0.010000 12.490000 ( 12.497828)
20
- # fuzzystringmatch 14.990000 0.010000 15.000000 ( 15.014898)
@@ -0,0 +1,8 @@
1
+ Rehearsal ----------------------------------------------------
2
+ jaro_winkler 12.520000 0.020000 12.540000 ( 12.548948)
3
+ fuzzystringmatch 15.370000 0.020000 15.390000 ( 15.408540)
4
+ ------------------------------------------ total: 27.930000sec
5
+
6
+ user system total real
7
+ jaro_winkler 12.750000 0.030000 12.780000 ( 12.782842)
8
+ fuzzystringmatch 16.240000 0.030000 16.270000 ( 16.287380)
@@ -21,11 +21,10 @@ static int char_bytes_num(char first_char){
21
21
  else return 1;
22
22
  }
23
23
 
24
- static unsigned long* codepoints(const char *str, int *ret_len){
25
- int str_len = strlen(str);
26
- unsigned long *ret = calloc(str_len, sizeof(long));
24
+ static unsigned long* codepoints(const char *str, int byte_len, int *ret_len){
25
+ unsigned long *ret = calloc(byte_len, sizeof(long));
27
26
  int count = 0;
28
- for(int i = 0; i < str_len;){
27
+ for(int i = 0; i < byte_len;){
29
28
  int bytes_num = char_bytes_num(str[i]);
30
29
  memcpy(&ret[count], &str[i], bytes_num);
31
30
  count++;
@@ -35,13 +34,13 @@ static unsigned long* codepoints(const char *str, int *ret_len){
35
34
  return ret;
36
35
  }
37
36
 
38
- double c_distance(char *s1, char *s2, Option *opt){
37
+ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt){
39
38
  // set default option if NULL passed
40
39
  int free_opt_flag = 0;
41
40
  if(!opt){ free_opt_flag = 1; opt = option_new(); }
42
41
 
43
42
  int ary_1_len, ary_2_len;
44
- unsigned long *ary_1 = codepoints(s1, &ary_1_len), *ary_2 = codepoints(s2, &ary_2_len);
43
+ unsigned long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
45
44
 
46
45
  if(opt->case_match){
47
46
  for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
@@ -6,7 +6,7 @@ typedef struct{
6
6
  double weight, threshold;
7
7
  } Option;
8
8
 
9
- double c_distance(char *s1, char *s2, Option *opt);
9
+ double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt);
10
10
  Option* option_new();
11
11
 
12
12
  #endif /* DISTANCE_H */
@@ -21,8 +21,7 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
21
21
  if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
22
22
  if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
23
23
  }
24
- // TODO: replace StringValueCStr with StringValuePtr and RSTRING_LEN
25
- VALUE ret = rb_float_new(c_distance(StringValueCStr(s1), StringValueCStr(s2), c_opt));
24
+ VALUE ret = rb_float_new(c_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
26
25
  free(c_opt);
27
26
  return ret;
28
27
  }
@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
10
10
  spec.authors = ["Jian Weihang"]
11
11
  spec.email = ["tonytonyjan@gmail.com"]
12
12
  spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless JaroWinkler.fallback?
13
- spec.summary = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
14
- spec.description = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
13
+ spec.summary = %q{Ruby & C implementation of Jaro-Winkler distance algorithm which both support UTF-8 string.}
14
+ spec.description = %q{It's a implementation of Jaro-Winkler distance algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.}
15
15
  spec.homepage = "https://github.com/tonytonyjan/jaro_winkler"
16
16
  spec.license = "MIT"
17
17
 
@@ -47,7 +47,7 @@ module JaroWinkler
47
47
  options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
48
48
  weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
49
49
  raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
50
- s1, s2 = s1.downcase, s2.downcase if case_match
50
+ s1, s2 = s1.upcase, s2.upcase if case_match
51
51
  distance = jaro_distance(s1, s2)
52
52
  prefix = 0
53
53
  max_length = [4, s1.length, s2.length].min
@@ -1,3 +1,3 @@
1
1
  module JaroWinkler
2
- VERSION = "1.2.2"
2
+ VERSION = "1.2.3"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jaro_winkler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.2
4
+ version: 1.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang
@@ -52,7 +52,9 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
- description: Pure Ruby implementation of Jaro-Winkler distance algorithm.
55
+ description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension
56
+ and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8
57
+ string.
56
58
  email:
57
59
  - tonytonyjan@gmail.com
58
60
  executables: []
@@ -67,7 +69,9 @@ files:
67
69
  - README.md
68
70
  - Rakefile
69
71
  - benchmark/native.rb
72
+ - benchmark/native.txt
70
73
  - benchmark/pure.rb
74
+ - benchmark/pure.txt
71
75
  - ext/jaro_winkler/distance.c
72
76
  - ext/jaro_winkler/distance.h
73
77
  - ext/jaro_winkler/extconf.rb
@@ -103,7 +107,8 @@ rubyforge_project:
103
107
  rubygems_version: 2.4.1
104
108
  signing_key:
105
109
  specification_version: 4
106
- summary: Pure Ruby implementation of Jaro-Winkler distance algorithm.
110
+ summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
111
+ UTF-8 string.
107
112
  test_files:
108
113
  - spec/jaro_winkler_spec.rb
109
114
  - spec/spec_helper.rb