jaro_winkler 1.2.2 → 1.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/Rakefile +12 -0
- data/benchmark/native.rb +0 -6
- data/benchmark/native.txt +5 -0
- data/benchmark/pure.rb +0 -4
- data/benchmark/pure.txt +8 -0
- data/ext/jaro_winkler/distance.c +5 -6
- data/ext/jaro_winkler/distance.h +1 -1
- data/ext/jaro_winkler/jaro_winkler.c +1 -2
- data/jaro_winkler.gemspec +2 -2
- data/lib/jaro_winkler.rb +1 -1
- data/lib/jaro_winkler/version.rb +1 -1
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 036b901c1de03b193a10e71951a6c241c4fa31ef
|
4
|
+
data.tar.gz: 3cbf7ec98a3d56e7f522345061f4395e18f1a7ca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1fa3be807f2cac992dc419df83267ab73d8924d32b1f341cab9d6adfcf31211fd5a89b5b2badb6fe36c78617ccf1524d4f6d9fa785ba50c3ab46481d0033a14f
|
7
|
+
data.tar.gz: 54682556479a596049b1f927dd546cfccefb61c0db6ce0f9ae4fec1790780839910ffd7985ec8dccb5f8ed5ce8c1a36498202841bf5d1d7485b1bdc985dd8cdb
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# About
|
2
2
|
|
3
|
-
It's a implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm, it uses C extension and will fallback to pure Ruby version in JRuby.
|
3
|
+
It's a implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.
|
4
4
|
|
5
5
|
**Windows Issue**
|
6
6
|
|
@@ -34,7 +34,7 @@ JaroWinkler.r_distance "MARTHA", "MARHTA" # Pure Ruby
|
|
34
34
|
|
35
35
|
Name | Type | Default | Note
|
36
36
|
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
|
37
|
-
case_match | boolean | false | All
|
37
|
+
case_match | boolean | false | All lower case characters are converted to upper case prior to the comparison.
|
38
38
|
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
39
39
|
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
|
40
40
|
|
data/Rakefile
CHANGED
@@ -5,3 +5,15 @@ require "rake/extensiontask"
|
|
5
5
|
Rake::ExtensionTask.new("jaro_winkler") do |ext|
|
6
6
|
ext.lib_dir = "lib/jaro_winkler"
|
7
7
|
end
|
8
|
+
|
9
|
+
task :benchmark do
|
10
|
+
ROOT_PATH = File.expand_path('..', __FILE__)
|
11
|
+
LIB_PATH = File.join(ROOT_PATH, 'lib')
|
12
|
+
BENCHMARK_PATH = File.join(ROOT_PATH, 'benchmark')
|
13
|
+
Dir[File.join(BENCHMARK_PATH, '*.rb')].each do |path|
|
14
|
+
output_path = File.join(BENCHMARK_PATH, File.basename(path, '*.rb').sub('.rb', '.txt'))
|
15
|
+
cmd = "RUBYLIB=#{LIB_PATH} ruby #{path} > #{output_path}"
|
16
|
+
puts cmd
|
17
|
+
system(cmd)
|
18
|
+
end
|
19
|
+
end
|
data/benchmark/native.rb
CHANGED
@@ -24,9 +24,3 @@ Benchmark.bmbm do |x|
|
|
24
24
|
n.times{ ary.each{ |str1, str2| Amatch::Jaro.new(str1).match(str2) } }
|
25
25
|
end
|
26
26
|
end
|
27
|
-
|
28
|
-
# user system total real
|
29
|
-
# jaro_winkler 0.420000 0.000000 0.420000 ( 0.426742)
|
30
|
-
# fuzzystringmatch 0.160000 0.000000 0.160000 ( 0.160146)
|
31
|
-
# hotwater 0.300000 0.000000 0.300000 ( 0.297350)
|
32
|
-
# amatch 0.980000 0.010000 0.990000 ( 0.982874)
|
data/benchmark/pure.rb
CHANGED
data/benchmark/pure.txt
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
Rehearsal ----------------------------------------------------
|
2
|
+
jaro_winkler 12.520000 0.020000 12.540000 ( 12.548948)
|
3
|
+
fuzzystringmatch 15.370000 0.020000 15.390000 ( 15.408540)
|
4
|
+
------------------------------------------ total: 27.930000sec
|
5
|
+
|
6
|
+
user system total real
|
7
|
+
jaro_winkler 12.750000 0.030000 12.780000 ( 12.782842)
|
8
|
+
fuzzystringmatch 16.240000 0.030000 16.270000 ( 16.287380)
|
data/ext/jaro_winkler/distance.c
CHANGED
@@ -21,11 +21,10 @@ static int char_bytes_num(char first_char){
|
|
21
21
|
else return 1;
|
22
22
|
}
|
23
23
|
|
24
|
-
static unsigned long* codepoints(const char *str, int *ret_len){
|
25
|
-
|
26
|
-
unsigned long *ret = calloc(str_len, sizeof(long));
|
24
|
+
static unsigned long* codepoints(const char *str, int byte_len, int *ret_len){
|
25
|
+
unsigned long *ret = calloc(byte_len, sizeof(long));
|
27
26
|
int count = 0;
|
28
|
-
for(int i = 0; i <
|
27
|
+
for(int i = 0; i < byte_len;){
|
29
28
|
int bytes_num = char_bytes_num(str[i]);
|
30
29
|
memcpy(&ret[count], &str[i], bytes_num);
|
31
30
|
count++;
|
@@ -35,13 +34,13 @@ static unsigned long* codepoints(const char *str, int *ret_len){
|
|
35
34
|
return ret;
|
36
35
|
}
|
37
36
|
|
38
|
-
double c_distance(char *s1, char *s2, Option *opt){
|
37
|
+
double c_distance(char *s1, int byte_len1, char *s2, int byte_len2, Option *opt){
|
39
38
|
// set default option if NULL passed
|
40
39
|
int free_opt_flag = 0;
|
41
40
|
if(!opt){ free_opt_flag = 1; opt = option_new(); }
|
42
41
|
|
43
42
|
int ary_1_len, ary_2_len;
|
44
|
-
unsigned long *ary_1 = codepoints(s1, &ary_1_len), *ary_2 = codepoints(s2, &ary_2_len);
|
43
|
+
unsigned long *ary_1 = codepoints(s1, byte_len1, &ary_1_len), *ary_2 = codepoints(s2, byte_len2, &ary_2_len);
|
45
44
|
|
46
45
|
if(opt->case_match){
|
47
46
|
for(int i = 0; i < ary_1_len; ++i) if(ary_1[i] < 256 && islower(ary_1[i])) ary_1[i] -= 32;
|
data/ext/jaro_winkler/distance.h
CHANGED
data/ext/jaro_winkler/jaro_winkler.c
CHANGED
@@ -21,8 +21,7 @@ VALUE rb_distance(int argc, VALUE *argv, VALUE self){
|
|
21
21
|
if(!NIL_P(threshold)) c_opt->threshold = NUM2DBL(threshold);
|
22
22
|
if(!NIL_P(case_match)) c_opt->case_match = (TYPE(case_match) == T_FALSE || NIL_P(case_match)) ? 0 : 1;
|
23
23
|
}
|
24
|
-
|
25
|
-
VALUE ret = rb_float_new(c_distance(StringValueCStr(s1), StringValueCStr(s2), c_opt));
|
24
|
+
VALUE ret = rb_float_new(c_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), c_opt));
|
26
25
|
free(c_opt);
|
27
26
|
return ret;
|
28
27
|
}
|
data/jaro_winkler.gemspec
CHANGED
@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.authors = ["Jian Weihang"]
|
11
11
|
spec.email = ["tonytonyjan@gmail.com"]
|
12
12
|
spec.extensions = ["ext/jaro_winkler/extconf.rb"] unless JaroWinkler.fallback?
|
13
|
-
spec.summary = %q{
|
14
|
-
spec.description = %q{
|
13
|
+
spec.summary = %q{Ruby & C implementation of Jaro-Winkler distance algorithm which both support UTF-8 string.}
|
14
|
+
spec.description = %q{It's a implementation of Jaro-Winkler distance algorithm, it uses C extension and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8 string.}
|
15
15
|
spec.homepage = "https://github.com/tonytonyjan/jaro_winkler"
|
16
16
|
spec.license = "MIT"
|
17
17
|
|
data/lib/jaro_winkler.rb
CHANGED
@@ -47,7 +47,7 @@ module JaroWinkler
|
|
47
47
|
options = {weight: 0.1, threshold: 0.7, case_match: false}.merge options
|
48
48
|
weight, threshold, case_match = options[:weight], options[:threshold], options[:case_match]
|
49
49
|
raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
|
50
|
-
s1, s2 = s1.
|
50
|
+
s1, s2 = s1.upcase, s2.upcase if case_match
|
51
51
|
distance = jaro_distance(s1, s2)
|
52
52
|
prefix = 0
|
53
53
|
max_length = [4, s1.length, s2.length].min
|
data/lib/jaro_winkler/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jaro_winkler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.2
|
4
|
+
version: 1.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jian Weihang
|
@@ -52,7 +52,9 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
-
description:
|
55
|
+
description: It's a implementation of Jaro-Winkler distance algorithm, it uses C extension
|
56
|
+
and will fallback to pure Ruby version in JRuby. Both implementation supports UTF-8
|
57
|
+
string.
|
56
58
|
email:
|
57
59
|
- tonytonyjan@gmail.com
|
58
60
|
executables: []
|
@@ -67,7 +69,9 @@ files:
|
|
67
69
|
- README.md
|
68
70
|
- Rakefile
|
69
71
|
- benchmark/native.rb
|
72
|
+
- benchmark/native.txt
|
70
73
|
- benchmark/pure.rb
|
74
|
+
- benchmark/pure.txt
|
71
75
|
- ext/jaro_winkler/distance.c
|
72
76
|
- ext/jaro_winkler/distance.h
|
73
77
|
- ext/jaro_winkler/extconf.rb
|
@@ -103,7 +107,8 @@ rubyforge_project:
|
|
103
107
|
rubygems_version: 2.4.1
|
104
108
|
signing_key:
|
105
109
|
specification_version: 4
|
106
|
-
summary:
|
110
|
+
summary: Ruby & C implementation of Jaro-Winkler distance algorithm which both support
|
111
|
+
UTF-8 string.
|
107
112
|
test_files:
|
108
113
|
- spec/jaro_winkler_spec.rb
|
109
114
|
- spec/spec_helper.rb
|