jaro_winkler 1.2.2 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/Rakefile +12 -0
- data/benchmark/native.rb +0 -6
- data/benchmark/native.txt +5 -0
- data/benchmark/pure.rb +0 -4
- data/benchmark/pure.txt +8 -0
- data/ext/jaro_winkler/distance.c +5 -6
- data/ext/jaro_winkler/distance.h +1 -1
- data/ext/jaro_winkler/jaro_winkler.c +1 -2
- data/jaro_winkler.gemspec +2 -2
- data/lib/jaro_winkler.rb +1 -1
- data/lib/jaro_winkler/version.rb +1 -1
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 036b901c1de03b193a10e71951a6c241c4fa31ef
|
4
|
+
data.tar.gz: 3cbf7ec98a3d56e7f522345061f4395e18f1a7ca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1fa3be807f2cac992dc419df83267ab73d8924d32b1f341cab9d6adfcf31211fd5a89b5b2badb6fe36c78617ccf1524d4f6d9fa785ba50c3ab46481d0033a14f
|
7
|
+
data.tar.gz: 54682556479a596049b1f927dd546cfccefb61c0db6ce0f9ae4fec1790780839910ffd7985ec8dccb5f8ed5ce8c1a36498202841bf5d1d7485b1bdc985dd8cdb
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# About
|
2
2
|
|
3
|
-
It's a implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm, it uses C extension and will fallback to pure Ruby version in JRuby.
|
3
|
+
It's a implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.
|
4
4
|
|
5
5
|
**Windows Issue**
|
6
6
|
|
@@ -34,7 +34,7 @@ JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
|
|
34
34
|
|
35
35
|
Name | Type | Default | Note
|
36
36
|
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
|
37
|
-
case_match | boolean | false | All
|
37
|
+
case_match | boolean | false | All lower case characters are converted to upper case prior to the comparison.
|
38
38
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
39
39
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
40
40
|
|
data/Rakefile
CHANGED
@@ -5,3 +5,15 @@ require "rake/extensiontask"
|
|
5
5
|
Rake::ExtensionTask.new("jaro_winkler") do |ext|
|
6
6
|
ext.lib_dir = "lib/jaro_winkler"
|
7
7
|
end
|
8
|
+
|
9
|
+
task :benchmark do
|
10
|
+
ROOT_PATH = File.expand_path('..', __FILE__)
|
11
|
+
LIB_PATH = File.join(ROOT_PATH, 'lib')
|
12
|
+
BENCHMARK_PATH = File.join(ROOT_PATH, 'benchmark')
|
13
|
+
Dir[File.join(BENCHMARK_PATH, '*.rb')].each do |path|
|
14
|
+
output_path = File.join(BENCHMARK_PATH, File.basename(path, '*.rb').sub('.rb', '.txt'))
|
15
|
+
cmd = "RUBYLIB=#{LIB_PATH} ruby #{path} > #{output_path}"
|
16
|
+
puts cmd
|
17
|
+
system(cmd)
|
18
|
+
end
|
19
|
+
end
|
data/benchmark/native.rb
CHANGED
@@ -24,9 +24,3 @@ Benchmark.bmbm do |x|
|
|
24
24
|
n.times{ ary.each{ |str1, str2| Amatch::Jaro.new(str1).match(str2) } }
|
25
25
|
end
|
26
26
|
end
|
27
|
-
|
28
|
-
# user system total real
|
29
|
-
# jaro_winkler 0.420000 0.000000 0.420000 ( 0.426742)
|
30
|
-
# fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.160146)
|
31
|
-
# hotwater 0.300000 0.000000 0.300000 ( 0.297350)
|
32
|
-
# amatch 0.980000 0.010000 0.990000 ( 0.982874)
|
data/benchmark/pure.rb
CHANGED
data/benchmark/pure.txt
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
Rehearsal ----------------------------------------------------
|
2
|
+
jaro_winkler 12.520000 0.020000 12.540000 ( 12.548948)
|
3
|
+
fuzzystringmatch 15.370000 0.020000 15.390000 ( 15.408540)
|
4
|
+
------------------------------------------ total: 27.930000sec
|
5
|
+
|
6
|
+
user system total real
|
7
|
+
jaro_winkler 12.750000 0.030000 12.780000 ( 12.782842)
|
8
|
+
fuzzystringmatch 16.240000 0.030000 16.270000 ( 16.287380)
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -21,11 +21,10 @@ static int char_bytes_num(char first_char){
|
|
21
21
|
else return 1;
|
22
22
|
}
|
23
23
|
|
24
|
-
static unsigned long* codepoints(const char *str, int *ret_len){
|
25
|
-
|
26
|
-
unsigned long *ret = calloc(str_len, sizeof(long));
|
24
|
+
static unsigned long* codepoints(const char *str, int byte_len, int *ret_len){
|
25
|
+
unsigned long *ret = calloc(byte_len, sizeof(long));
|
27
26
|
int count = 0;
|
28
|
-
for(int i = 0; i <
|
27
|
+
for(int i = 0; i < byte_len;){
|
29
28
|
int bytes_num = char_bytes_num(str[i]);
|
30
29
|
memcpy(&ret[count], &str[i], bytes_num);
|
31
30
|
count++;
|
@@ -35,13 +34,13 @@ static unsigned long* codepoints(const char *str, int *ret_len){
|
|
35
34
|
return ret;
|
36
35
|
}
|
37
36
|
|
38
|
-
double c_distance(char *s1, char *s2, Option *opt){
|
37
|
+
double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt){
|
39
38
|
// set default option if NULL passed
|
40
39
|
int free_opt_flag = 0;
|
41
40
|
if(!opt){ free_opt_flag = 1; opt = option_new(); }
|
42
41
|
|
43
42
|
int ary_1_len, ary_2_len;
|
44
|
-
unsigned long *ary_1 = codepoints(s1, &ary_1_len), *ary_2 = codepoints(s2, &ary_2_len);
|
43
|
+
unsigned long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
|
45
44
|
|
46
45
|
if(opt->case_match){
|
47
46
|
for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
|
data/ext/jaro_winkler/distance.h
CHANGED
data/ext/jaro_winkler/jaro_winkler.c
CHANGED
@@ -21,8 +21,7 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
|
|
21
21
|
if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
|
22
22
|
if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
|
23
23
|
}
|
24
|
-
|
25
|
-
VALUE ret = rb_float_new(c_distance(StringValueCStr(s1), StringValueCStr(s2), c_opt));
|
24
|
+
VALUE ret = rb_float_new(c_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
|
26
25
|
free(c_opt);
|
27
26
|
return ret;
|
28
27
|
}
|
data/jaro_winkler.gemspec
CHANGED
@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.authors = ["Jian Weihang"]
|
11
11
|
spec.email = ["tonytonyjan@gmail.com"]
|
12
12
|
spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless JaroWinkler.fallback?
|
13
|
-
spec.summary = %q{
|
14
|
-
spec.description = %q{
|
13
|
+
spec.summary = %q{Ruby & C implementation of Jaro-Winkler distance algorithm which both support UTF-8 string.}
|
14
|
+
spec.description = %q{It's a implementation of Jaro-Winkler distance algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.}
|
15
15
|
spec.homepage = "https://github.com/tonytonyjan/jaro_winkler"
|
16
16
|
spec.license = "MIT"
|
17
17
|
|
data/lib/jaro_winkler.rb
CHANGED
@@ -47,7 +47,7 @@ module JaroWinkler
|
|
47
47
|
options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
|
48
48
|
weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
|
49
49
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
50
|
-
s1, s2 = s1.
|
50
|
+
s1, s2 = s1.upcase, s2.upcase if case_match
|
51
51
|
distance = jaro_distance(s1, s2)
|
52
52
|
prefix = 0
|
53
53
|
max_length = [4, s1.length, s2.length].min
|
data/lib/jaro_winkler/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.2
|
4
|
+
version: 1.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
@@ -52,7 +52,9 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
-
description:
|
55
|
+
description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension
|
56
|
+
and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8
|
57
|
+
string.
|
56
58
|
email:
|
57
59
|
- tonytonyjan@gmail.com
|
58
60
|
executables: []
|
@@ -67,7 +69,9 @@ files:
|
|
67
69
|
- README.md
|
68
70
|
- Rakefile
|
69
71
|
- benchmark/native.rb
|
72
|
+
- benchmark/native.txt
|
70
73
|
- benchmark/pure.rb
|
74
|
+
- benchmark/pure.txt
|
71
75
|
- ext/jaro_winkler/distance.c
|
72
76
|
- ext/jaro_winkler/distance.h
|
73
77
|
- ext/jaro_winkler/extconf.rb
|
@@ -103,7 +107,8 @@ rubyforge_project:
|
|
103
107
|
rubygems_version: 2.4.1
|
104
108
|
signing_key:
|
105
109
|
specification_version: 4
|
106
|
-
summary:
|
110
|
+
summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
|
111
|
+
UTF-8 string.
|
107
112
|
test_files:
|
108
113
|
- spec/jaro_winkler_spec.rb
|
109
114
|
- spec/spec_helper.rb
|