taxamatch_rb 0.8.3 → 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/Gemfile.lock +2 -0
- data/Rakefile +0 -7
- data/VERSION +1 -1
- data/lib/taxamatch_rb.rb +2 -2
- data/lib/taxamatch_rb/authmatch.rb +2 -2
- data/spec/taxamatch_rb_spec.rb +1 -15
- data/taxamatch_rb.gemspec +5 -6
- metadata +37 -31
- data/ext/damerau_levenshtein/damerau_levenshtein.c +0 -112
- data/ext/damerau_levenshtein/extconf.rb +0 -11
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +0 -42
- data/spec/damerau_levenshtein_mod_test.txt +0 -63
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -15,6 +15,7 @@ GEM
|
|
15
15
|
gherkin (~> 2.4.1)
|
16
16
|
json (>= 1.4.6)
|
17
17
|
term-ansicolor (>= 1.0.5)
|
18
|
+
damerau-levenshtein (0.5.4)
|
18
19
|
diff-lcs (1.1.2)
|
19
20
|
gherkin (2.4.1)
|
20
21
|
json (>= 1.4.6)
|
@@ -65,6 +66,7 @@ DEPENDENCIES
|
|
65
66
|
biodiversity19 (>= 1.0.10)
|
66
67
|
bundler (~> 1.0.0)
|
67
68
|
cucumber
|
69
|
+
damerau-levenshtein (>= 0.5.4)
|
68
70
|
jeweler (~> 1.6.0)
|
69
71
|
mocha
|
70
72
|
rake-compiler
|
data/Rakefile
CHANGED
@@ -10,7 +10,6 @@ rescue Bundler::BundlerError => e
|
|
10
10
|
end
|
11
11
|
|
12
12
|
require 'rake'
|
13
|
-
require 'rake/extensiontask'
|
14
13
|
|
15
14
|
begin
|
16
15
|
require 'jeweler'
|
@@ -43,10 +42,4 @@ RSpec::Core::RakeTask.new(:rcov) do |spec|
|
|
43
42
|
spec.rcov = true
|
44
43
|
end
|
45
44
|
|
46
|
-
Rake::ExtensionTask.new("damerau_levenshtein") do |extension|
|
47
|
-
extension.lib_dir = "lib"
|
48
|
-
end
|
49
|
-
|
50
|
-
Rake::Task[:spec].prerequisites << :compile
|
51
|
-
|
52
45
|
task :default => :spec
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.3
|
1
|
+
0.8.4
|
data/lib/taxamatch_rb.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
$:.unshift(File.dirname(__FILE__)) unless
|
3
3
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
4
4
|
# $:.unshift('taxamatch_rb')
|
5
|
-
require 'taxamatch_rb/damerau_levenshtein_mod'
|
5
|
+
require 'damerau-levenshtein'
|
6
6
|
require 'taxamatch_rb/atomizer'
|
7
7
|
require 'taxamatch_rb/normalizer'
|
8
8
|
require 'taxamatch_rb/phonetizer'
|
@@ -17,7 +17,7 @@ module Taxamatch
|
|
17
17
|
|
18
18
|
def initialize
|
19
19
|
@parser = Taxamatch::Atomizer.new
|
20
|
-
@dlm = Taxamatch::DamerauLevenshteinMod.new
|
20
|
+
@dlm = DamerauLevenshtein
|
21
21
|
end
|
22
22
|
|
23
23
|
|
@@ -75,7 +75,7 @@ module Taxamatch
|
|
75
75
|
def self.fuzzy_match_authors(author1, author2)
|
76
76
|
au1_length = author1.size
|
77
77
|
au2_length = author2.size
|
78
|
-
dlm = Taxamatch::DamerauLevenshteinMod.new
|
78
|
+
dlm = DamerauLevenshtein
|
79
79
|
ed = dlm.distance(author1, author2,2,3) #get around a bug in C code, but it really has to be fixed
|
80
80
|
(ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
|
81
81
|
end
|
@@ -86,4 +86,4 @@ module Taxamatch
|
|
86
86
|
nil
|
87
87
|
end
|
88
88
|
end
|
89
|
-
end
|
89
|
+
end
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -1,19 +1,6 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
|
-
describe 'DamerauLevenshteinMod' do
|
5
|
-
it 'should get tests' do
|
6
|
-
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt', 5) do |y|
|
7
|
-
dl = Taxamatch::DamerauLevenshteinMod.new
|
8
|
-
if y
|
9
|
-
res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
|
10
|
-
puts y if res != y[4].to_i
|
11
|
-
res.should == y[4].to_i
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
4
|
describe 'Atomizer' do
|
18
5
|
before(:all) do
|
19
6
|
@parser = Taxamatch::Atomizer.new
|
@@ -54,12 +41,11 @@ describe 'Taxamatch::Base' do
|
|
54
41
|
end
|
55
42
|
|
56
43
|
it 'should get txt tests' do
|
57
|
-
dl = Taxamatch::DamerauLevenshteinMod.new
|
58
44
|
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
|
59
45
|
if y
|
60
46
|
y[2] = y[2] == 'true' ? true : false
|
61
47
|
res = @tm.taxamatch(y[0], y[1], false)
|
62
|
-
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
48
|
+
# puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
63
49
|
res['match'].should == y[2]
|
64
50
|
res['edit_distance'].should == y[3].to_i
|
65
51
|
end
|
data/taxamatch_rb.gemspec
CHANGED
@@ -5,14 +5,13 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "taxamatch_rb"
|
8
|
-
s.version = "0.8.3"
|
8
|
+
s.version = "0.8.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Dmitry Mozzherin"]
|
12
|
-
s.date = "2012-02-
|
12
|
+
s.date = "2012-02-13"
|
13
13
|
s.description = "This gem implements algorithm for fuzzy matching scientific names developed by Tony Rees"
|
14
14
|
s.email = "dmozzherin@eol.org"
|
15
|
-
s.extensions = ["ext/damerau_levenshtein/extconf.rb"]
|
16
15
|
s.extra_rdoc_files = [
|
17
16
|
"LICENSE",
|
18
17
|
"README.rdoc"
|
@@ -25,14 +24,11 @@ Gem::Specification.new do |s|
|
|
25
24
|
"README.rdoc",
|
26
25
|
"Rakefile",
|
27
26
|
"VERSION",
|
28
|
-
"ext/damerau_levenshtein/damerau_levenshtein.c",
|
29
27
|
"lib/taxamatch_rb.rb",
|
30
28
|
"lib/taxamatch_rb/atomizer.rb",
|
31
29
|
"lib/taxamatch_rb/authmatch.rb",
|
32
|
-
"lib/taxamatch_rb/damerau_levenshtein_mod.rb",
|
33
30
|
"lib/taxamatch_rb/normalizer.rb",
|
34
31
|
"lib/taxamatch_rb/phonetizer.rb",
|
35
|
-
"spec/damerau_levenshtein_mod_test.txt",
|
36
32
|
"spec/spec.opts",
|
37
33
|
"spec/spec_helper.rb",
|
38
34
|
"spec/taxamatch_rb_spec.rb",
|
@@ -49,6 +45,7 @@ Gem::Specification.new do |s|
|
|
49
45
|
|
50
46
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
51
47
|
s.add_runtime_dependency(%q<biodiversity19>, [">= 1.0.10"])
|
48
|
+
s.add_runtime_dependency(%q<damerau-levenshtein>, [">= 0.5.4"])
|
52
49
|
s.add_development_dependency(%q<rake-compiler>, [">= 0"])
|
53
50
|
s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
|
54
51
|
s.add_development_dependency(%q<cucumber>, [">= 0"])
|
@@ -61,6 +58,7 @@ Gem::Specification.new do |s|
|
|
61
58
|
s.add_development_dependency(%q<mocha>, [">= 0"])
|
62
59
|
else
|
63
60
|
s.add_dependency(%q<biodiversity19>, [">= 1.0.10"])
|
61
|
+
s.add_dependency(%q<damerau-levenshtein>, [">= 0.5.4"])
|
64
62
|
s.add_dependency(%q<rake-compiler>, [">= 0"])
|
65
63
|
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
66
64
|
s.add_dependency(%q<cucumber>, [">= 0"])
|
@@ -74,6 +72,7 @@ Gem::Specification.new do |s|
|
|
74
72
|
end
|
75
73
|
else
|
76
74
|
s.add_dependency(%q<biodiversity19>, [">= 1.0.10"])
|
75
|
+
s.add_dependency(%q<damerau-levenshtein>, [">= 0.5.4"])
|
77
76
|
s.add_dependency(%q<rake-compiler>, [">= 0"])
|
78
77
|
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
79
78
|
s.add_dependency(%q<cucumber>, [">= 0"])
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: taxamatch_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.3
|
4
|
+
version: 0.8.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-13 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: biodiversity19
|
16
|
-
requirement: &
|
16
|
+
requirement: &70328583642780 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,21 @@ dependencies:
|
|
21
21
|
version: 1.0.10
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70328583642780
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: damerau-levenshtein
|
27
|
+
requirement: &70328583642180 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 0.5.4
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70328583642180
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: rake-compiler
|
27
|
-
requirement: &
|
38
|
+
requirement: &70328583629080 !ruby/object:Gem::Requirement
|
28
39
|
none: false
|
29
40
|
requirements:
|
30
41
|
- - ! '>='
|
@@ -32,10 +43,10 @@ dependencies:
|
|
32
43
|
version: '0'
|
33
44
|
type: :development
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *70328583629080
|
36
47
|
- !ruby/object:Gem::Dependency
|
37
48
|
name: rspec
|
38
|
-
requirement: &
|
49
|
+
requirement: &70328583628480 !ruby/object:Gem::Requirement
|
39
50
|
none: false
|
40
51
|
requirements:
|
41
52
|
- - ~>
|
@@ -43,10 +54,10 @@ dependencies:
|
|
43
54
|
version: 2.3.0
|
44
55
|
type: :development
|
45
56
|
prerelease: false
|
46
|
-
version_requirements: *
|
57
|
+
version_requirements: *70328583628480
|
47
58
|
- !ruby/object:Gem::Dependency
|
48
59
|
name: cucumber
|
49
|
-
requirement: &
|
60
|
+
requirement: &70328583627880 !ruby/object:Gem::Requirement
|
50
61
|
none: false
|
51
62
|
requirements:
|
52
63
|
- - ! '>='
|
@@ -54,10 +65,10 @@ dependencies:
|
|
54
65
|
version: '0'
|
55
66
|
type: :development
|
56
67
|
prerelease: false
|
57
|
-
version_requirements: *
|
68
|
+
version_requirements: *70328583627880
|
58
69
|
- !ruby/object:Gem::Dependency
|
59
70
|
name: bundler
|
60
|
-
requirement: &
|
71
|
+
requirement: &70328583627280 !ruby/object:Gem::Requirement
|
61
72
|
none: false
|
62
73
|
requirements:
|
63
74
|
- - ~>
|
@@ -65,10 +76,10 @@ dependencies:
|
|
65
76
|
version: 1.0.0
|
66
77
|
type: :development
|
67
78
|
prerelease: false
|
68
|
-
version_requirements: *
|
79
|
+
version_requirements: *70328583627280
|
69
80
|
- !ruby/object:Gem::Dependency
|
70
81
|
name: jeweler
|
71
|
-
requirement: &
|
82
|
+
requirement: &70328583626700 !ruby/object:Gem::Requirement
|
72
83
|
none: false
|
73
84
|
requirements:
|
74
85
|
- - ~>
|
@@ -76,10 +87,10 @@ dependencies:
|
|
76
87
|
version: 1.6.0
|
77
88
|
type: :development
|
78
89
|
prerelease: false
|
79
|
-
version_requirements: *
|
90
|
+
version_requirements: *70328583626700
|
80
91
|
- !ruby/object:Gem::Dependency
|
81
92
|
name: rcov
|
82
|
-
requirement: &
|
93
|
+
requirement: &70328583626100 !ruby/object:Gem::Requirement
|
83
94
|
none: false
|
84
95
|
requirements:
|
85
96
|
- - ! '>='
|
@@ -87,10 +98,10 @@ dependencies:
|
|
87
98
|
version: '0'
|
88
99
|
type: :development
|
89
100
|
prerelease: false
|
90
|
-
version_requirements: *
|
101
|
+
version_requirements: *70328583626100
|
91
102
|
- !ruby/object:Gem::Dependency
|
92
103
|
name: ruby-debug19
|
93
|
-
requirement: &
|
104
|
+
requirement: &70328583625500 !ruby/object:Gem::Requirement
|
94
105
|
none: false
|
95
106
|
requirements:
|
96
107
|
- - ! '>='
|
@@ -98,10 +109,10 @@ dependencies:
|
|
98
109
|
version: '0'
|
99
110
|
type: :development
|
100
111
|
prerelease: false
|
101
|
-
version_requirements: *
|
112
|
+
version_requirements: *70328583625500
|
102
113
|
- !ruby/object:Gem::Dependency
|
103
114
|
name: ruby-prof
|
104
|
-
requirement: &
|
115
|
+
requirement: &70328583624960 !ruby/object:Gem::Requirement
|
105
116
|
none: false
|
106
117
|
requirements:
|
107
118
|
- - ! '>='
|
@@ -109,10 +120,10 @@ dependencies:
|
|
109
120
|
version: '0'
|
110
121
|
type: :development
|
111
122
|
prerelease: false
|
112
|
-
version_requirements: *
|
123
|
+
version_requirements: *70328583624960
|
113
124
|
- !ruby/object:Gem::Dependency
|
114
125
|
name: shoulda
|
115
|
-
requirement: &
|
126
|
+
requirement: &70328583624380 !ruby/object:Gem::Requirement
|
116
127
|
none: false
|
117
128
|
requirements:
|
118
129
|
- - ! '>='
|
@@ -120,10 +131,10 @@ dependencies:
|
|
120
131
|
version: '0'
|
121
132
|
type: :development
|
122
133
|
prerelease: false
|
123
|
-
version_requirements: *
|
134
|
+
version_requirements: *70328583624380
|
124
135
|
- !ruby/object:Gem::Dependency
|
125
136
|
name: mocha
|
126
|
-
requirement: &
|
137
|
+
requirement: &70328583623720 !ruby/object:Gem::Requirement
|
127
138
|
none: false
|
128
139
|
requirements:
|
129
140
|
- - ! '>='
|
@@ -131,13 +142,12 @@ dependencies:
|
|
131
142
|
version: '0'
|
132
143
|
type: :development
|
133
144
|
prerelease: false
|
134
|
-
version_requirements: *
|
145
|
+
version_requirements: *70328583623720
|
135
146
|
description: This gem implements algorithm for fuzzy matching scientific names developed
|
136
147
|
by Tony Rees
|
137
148
|
email: dmozzherin@eol.org
|
138
149
|
executables: []
|
139
|
-
extensions:
|
140
|
-
- ext/damerau_levenshtein/extconf.rb
|
150
|
+
extensions: []
|
141
151
|
extra_rdoc_files:
|
142
152
|
- LICENSE
|
143
153
|
- README.rdoc
|
@@ -149,20 +159,16 @@ files:
|
|
149
159
|
- README.rdoc
|
150
160
|
- Rakefile
|
151
161
|
- VERSION
|
152
|
-
- ext/damerau_levenshtein/damerau_levenshtein.c
|
153
162
|
- lib/taxamatch_rb.rb
|
154
163
|
- lib/taxamatch_rb/atomizer.rb
|
155
164
|
- lib/taxamatch_rb/authmatch.rb
|
156
|
-
- lib/taxamatch_rb/damerau_levenshtein_mod.rb
|
157
165
|
- lib/taxamatch_rb/normalizer.rb
|
158
166
|
- lib/taxamatch_rb/phonetizer.rb
|
159
|
-
- spec/damerau_levenshtein_mod_test.txt
|
160
167
|
- spec/spec.opts
|
161
168
|
- spec/spec_helper.rb
|
162
169
|
- spec/taxamatch_rb_spec.rb
|
163
170
|
- spec/taxamatch_test.txt
|
164
171
|
- taxamatch_rb.gemspec
|
165
|
-
- ext/damerau_levenshtein/extconf.rb
|
166
172
|
homepage: http://github.com/GlobalNamesArchitecture/taxamatch_rb
|
167
173
|
licenses: []
|
168
174
|
post_install_message:
|
@@ -177,7 +183,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
177
183
|
version: '0'
|
178
184
|
segments:
|
179
185
|
- 0
|
180
|
-
hash:
|
186
|
+
hash: 1595435064862339145
|
181
187
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
182
188
|
none: false
|
183
189
|
requirements:
|
@@ -1,112 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
|
3
|
-
VALUE DamerauLevenshtein = Qnil;
|
4
|
-
|
5
|
-
void Init_damerau_levenshtein();
|
6
|
-
|
7
|
-
VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance);
|
8
|
-
|
9
|
-
void Init_damerau_levenshtein() {
|
10
|
-
DamerauLevenshtein = rb_define_module("DamerauLevenshtein");
|
11
|
-
rb_define_method(DamerauLevenshtein, "distance_utf", method_distance_utf, 4);
|
12
|
-
}
|
13
|
-
|
14
|
-
VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance){
|
15
|
-
int i, i1, j, j1, k, half_tl, cost, *d, distance, del, ins, subs, transp, block;
|
16
|
-
int sl, tl, half_sl;
|
17
|
-
int stop_execution = 0;
|
18
|
-
int min = 0;
|
19
|
-
int current_distance = 0;
|
20
|
-
|
21
|
-
int block_size = NUM2INT(_block_size);
|
22
|
-
int max_distance = NUM2INT(_max_distance);
|
23
|
-
|
24
|
-
VALUE *sv = RARRAY_PTR(_s);
|
25
|
-
VALUE *tv = RARRAY_PTR(_t);
|
26
|
-
|
27
|
-
sl = (int) RARRAY_LEN(_s);
|
28
|
-
tl = (int) RARRAY_LEN(_t);
|
29
|
-
|
30
|
-
if (sl == 0) return INT2NUM(tl);
|
31
|
-
if (tl == 0) return INT2NUM(sl);
|
32
|
-
//case of lengths 1 must present or it will break further in the code
|
33
|
-
if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);
|
34
|
-
|
35
|
-
int s[sl];
|
36
|
-
int t[tl];
|
37
|
-
|
38
|
-
for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
|
39
|
-
for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);
|
40
|
-
|
41
|
-
sl++;
|
42
|
-
tl++;
|
43
|
-
|
44
|
-
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
|
45
|
-
d = malloc((sizeof(int))*(sl)*(tl));
|
46
|
-
//populate 'vertical' row starting from the 2nd position (first one is filled already)
|
47
|
-
for(i = 0; i < tl; i++){
|
48
|
-
d[i*sl] = i;
|
49
|
-
}
|
50
|
-
|
51
|
-
//fill up array with scores
|
52
|
-
for(i = 1; i<sl; i++){
|
53
|
-
d[i] = i;
|
54
|
-
if (stop_execution == 1) break;
|
55
|
-
current_distance = 10000;
|
56
|
-
for(j = 1; j<tl; j++){
|
57
|
-
|
58
|
-
cost = 1;
|
59
|
-
if(s[i-1] == t[j-1]) cost = 0;
|
60
|
-
|
61
|
-
half_sl = (sl - 1)/2;
|
62
|
-
half_tl = (tl - 1)/2;
|
63
|
-
|
64
|
-
block = block_size < half_sl ? block_size : half_sl;
|
65
|
-
block = block < half_tl ? block : half_tl;
|
66
|
-
|
67
|
-
while (block >= 1){
|
68
|
-
int swap1 = 1;
|
69
|
-
int swap2 = 1;
|
70
|
-
i1 = i - (block * 2);
|
71
|
-
j1 = j - (block * 2);
|
72
|
-
for (k = i1; k < i1 + block; k++) {
|
73
|
-
if (s[k] != t[k + block]){
|
74
|
-
swap1 = 0;
|
75
|
-
break;
|
76
|
-
}
|
77
|
-
}
|
78
|
-
for (k = j1; k < j1 + block; k++) {
|
79
|
-
if (t[k] != s[k + block]){
|
80
|
-
swap2 = 0;
|
81
|
-
break;
|
82
|
-
}
|
83
|
-
}
|
84
|
-
|
85
|
-
del = d[j*sl + i - 1] + 1;
|
86
|
-
ins = d[(j-1)*sl + i] + 1;
|
87
|
-
min = del;
|
88
|
-
if (ins < min) min = ins;
|
89
|
-
//if (i == 2 && j==2) return INT2NUM(swap2+5);
|
90
|
-
if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
|
91
|
-
transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
|
92
|
-
if (transp < min) min = transp;
|
93
|
-
block = 0;
|
94
|
-
} else if (block == 1) {
|
95
|
-
subs = d[(j-1)*sl + i - 1] + cost;
|
96
|
-
if (subs < min) min = subs;
|
97
|
-
}
|
98
|
-
block--;
|
99
|
-
}
|
100
|
-
d[j*sl+i]=min;
|
101
|
-
if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
|
102
|
-
}
|
103
|
-
if (current_distance > max_distance) {
|
104
|
-
stop_execution = 1;
|
105
|
-
}
|
106
|
-
}
|
107
|
-
distance=d[sl * tl - 1];
|
108
|
-
if (stop_execution == 1) distance = current_distance;
|
109
|
-
|
110
|
-
free(d);
|
111
|
-
return INT2NUM(distance);
|
112
|
-
}
|
@@ -1,42 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
require 'damerau_levenshtein'
|
4
|
-
|
5
|
-
module Taxamatch
|
6
|
-
|
7
|
-
class DamerauLevenshteinMod
|
8
|
-
include DamerauLevenshtein
|
9
|
-
|
10
|
-
def distance(str1, str2, block_size=2, max_distance=10)
|
11
|
-
distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
end
|
16
|
-
|
17
|
-
if __FILE__ == $0
|
18
|
-
|
19
|
-
a = Taxamatch::DamerauLevenshteinMod.new
|
20
|
-
s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
|
21
|
-
t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')
|
22
|
-
|
23
|
-
#puts s.join(",")
|
24
|
-
#puts t.join(",")
|
25
|
-
|
26
|
-
start = Time.now
|
27
|
-
(1..100000).each do
|
28
|
-
a.distance('Cedarinia scabra Sjöstedt 1921', 'Cedarinia scabra Söjstedt 1921',1,10)
|
29
|
-
end
|
30
|
-
puts "with unpack time: " + (Time.now - start).to_s + ' sec'
|
31
|
-
|
32
|
-
start = Time.now
|
33
|
-
(1..100000).each do
|
34
|
-
a.distance_utf(s, t, 1, 10)
|
35
|
-
end
|
36
|
-
puts 'utf time: ' + (Time.now - start).to_s + ' sec'
|
37
|
-
|
38
|
-
#puts a.distance('Cedarinia scabra Sjöstedt 1921','Cedarinia scabra Söjstedt 1921')
|
39
|
-
#puts a.distance_utf(s, t, 2, 10)
|
40
|
-
#puts a.distance('tar','atp',1,10);
|
41
|
-
puts a.distance('sub', 'usb', 1, 10);
|
42
|
-
end
|
@@ -1,63 +0,0 @@
|
|
1
|
-
######################
|
2
|
-
# Tests for modified Damerau Levenshtein Distance algorithm (UTF-8 compatible)
|
3
|
-
#
|
4
|
-
# * B. Boehmer, T. Rees, Modified Damerau-Levenshtein Distance, Boehmer & Rees 2008
|
5
|
-
# * F.J. Damerau. A technique for computer detection and correction of spelling errors, Communications of the ACM, 1964
|
6
|
-
#
|
7
|
-
# Fields:
|
8
|
-
# String1|String2|maximum distance|transposition block size|expected distance
|
9
|
-
# - String1, String2
|
10
|
-
# compared strings
|
11
|
-
# - maximum distance
|
12
|
-
# stops execution of the algorithm when calculated distance exceeds the maximum distance number
|
13
|
-
# - transosition block size
|
14
|
-
# determines how many characters can be transposed. Block size 1 returns score according to Damerau-Levenshtein algorithm
|
15
|
-
# - expected distance
|
16
|
-
# resulting distance that has to be achieved by the algorithm
|
17
|
-
# Note: algorithm does not try to normalize or interpret strings in any way.
|
18
|
-
######################
|
19
|
-
|
20
|
-
#it whould recognize the exact match
|
21
|
-
Pomatomus|Pomatomus|10|1|0
|
22
|
-
|
23
|
-
#it should not try to normalize incoming strings
|
24
|
-
Pomatomus|Pomatomus|10|1|1
|
25
|
-
Pomatomus|pomatomus|10|1|1
|
26
|
-
|
27
|
-
#it should calculate special cases
|
28
|
-
Pomatomus||10|1|9
|
29
|
-
|Pomatomus|10|1|9
|
30
|
-
P|p|10|1|1
|
31
|
-
#TODO: one letter vs longer string generates a big negative number
|
32
|
-
#L|Linneaus|10|1|7
|
33
|
-
|
34
|
-
|
35
|
-
#it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
|
36
|
-
Pomatomus|Pomatomux|10|1|1
|
37
|
-
Pmatomus|Pomatomus|10|1|1
|
38
|
-
Pomatomus|Pmatomus|10|1|1
|
39
|
-
Rpmatomus|Pomatomus|10|1|2
|
40
|
-
Pommtomus|Pomatomus|10|1|1
|
41
|
-
Potamomus|Pomatomus|10|1|2
|
42
|
-
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
|
43
|
-
Pomatomus|oPmatomus|10|1|1
|
44
|
-
Pomatomus|Pomatomsu|10|1|1
|
45
|
-
Pomtaomus|Pomatomus|10|1|1
|
46
|
-
Pomatoums|Pomatomus|10|1|1
|
47
|
-
Potamomus|Pomatomus|10|1|2
|
48
|
-
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|2|1
|
49
|
-
|
50
|
-
#it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size > 2)
|
51
|
-
serrulatus|serratulus|10|2|2
|
52
|
-
Pomatomus|Poomumats|10|3|3
|
53
|
-
vesiculosus|vecusilosus|10|1|4
|
54
|
-
vesiculosus|vecusilosus|10|2|2
|
55
|
-
trimerophyton|mertriophyton|10|1|6
|
56
|
-
trimerophyton|mertriophyton|10|3|3
|
57
|
-
|
58
|
-
#it should stop trying if distance exceeds maximum allowed distance
|
59
|
-
Pxxxxomus|Pomatomus|10|1|4
|
60
|
-
Pxxxxomus|Pomatomus|2|1|3
|
61
|
-
|
62
|
-
#
|
63
|
-
PUNCTATA|PUNCTATA|10|1|0
|