taxamatch_rb 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -0
- data/Gemfile.lock +2 -0
- data/Rakefile +0 -7
- data/VERSION +1 -1
- data/lib/taxamatch_rb.rb +2 -2
- data/lib/taxamatch_rb/authmatch.rb +2 -2
- data/spec/taxamatch_rb_spec.rb +1 -15
- data/taxamatch_rb.gemspec +5 -6
- metadata +37 -31
- data/ext/damerau_levenshtein/damerau_levenshtein.c +0 -112
- data/ext/damerau_levenshtein/extconf.rb +0 -11
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +0 -42
- data/spec/damerau_levenshtein_mod_test.txt +0 -63
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -15,6 +15,7 @@ GEM
|
|
|
15
15
|
gherkin (~> 2.4.1)
|
|
16
16
|
json (>= 1.4.6)
|
|
17
17
|
term-ansicolor (>= 1.0.5)
|
|
18
|
+
damerau-levenshtein (0.5.4)
|
|
18
19
|
diff-lcs (1.1.2)
|
|
19
20
|
gherkin (2.4.1)
|
|
20
21
|
json (>= 1.4.6)
|
|
@@ -65,6 +66,7 @@ DEPENDENCIES
|
|
|
65
66
|
biodiversity19 (>= 1.0.10)
|
|
66
67
|
bundler (~> 1.0.0)
|
|
67
68
|
cucumber
|
|
69
|
+
damerau-levenshtein (>= 0.5.4)
|
|
68
70
|
jeweler (~> 1.6.0)
|
|
69
71
|
mocha
|
|
70
72
|
rake-compiler
|
data/Rakefile
CHANGED
|
@@ -10,7 +10,6 @@ rescue Bundler::BundlerError => e
|
|
|
10
10
|
end
|
|
11
11
|
|
|
12
12
|
require 'rake'
|
|
13
|
-
require 'rake/extensiontask'
|
|
14
13
|
|
|
15
14
|
begin
|
|
16
15
|
require 'jeweler'
|
|
@@ -43,10 +42,4 @@ RSpec::Core::RakeTask.new(:rcov) do |spec|
|
|
|
43
42
|
spec.rcov = true
|
|
44
43
|
end
|
|
45
44
|
|
|
46
|
-
Rake::ExtensionTask.new("damerau_levenshtein") do |extension|
|
|
47
|
-
extension.lib_dir = "lib"
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
Rake::Task[:spec].prerequisites << :compile
|
|
51
|
-
|
|
52
45
|
task :default => :spec
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.8.3
|
|
1
|
+
0.8.4
|
data/lib/taxamatch_rb.rb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
$:.unshift(File.dirname(__FILE__)) unless
|
|
3
3
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
|
4
4
|
# $:.unshift('taxamatch_rb')
|
|
5
|
-
require 'taxamatch_rb/damerau_levenshtein_mod'
|
|
5
|
+
require 'damerau-levenshtein'
|
|
6
6
|
require 'taxamatch_rb/atomizer'
|
|
7
7
|
require 'taxamatch_rb/normalizer'
|
|
8
8
|
require 'taxamatch_rb/phonetizer'
|
|
@@ -17,7 +17,7 @@ module Taxamatch
|
|
|
17
17
|
|
|
18
18
|
def initialize
|
|
19
19
|
@parser = Taxamatch::Atomizer.new
|
|
20
|
-
@dlm = Taxamatch::DamerauLevenshteinMod.new
|
|
20
|
+
@dlm = DamerauLevenshtein
|
|
21
21
|
end
|
|
22
22
|
|
|
23
23
|
|
|
@@ -75,7 +75,7 @@ module Taxamatch
|
|
|
75
75
|
def self.fuzzy_match_authors(author1, author2)
|
|
76
76
|
au1_length = author1.size
|
|
77
77
|
au2_length = author2.size
|
|
78
|
-
dlm = Taxamatch::DamerauLevenshteinMod.new
|
|
78
|
+
dlm = DamerauLevenshtein
|
|
79
79
|
ed = dlm.distance(author1, author2,2,3) #get around a bug in C code, but it really has to be fixed
|
|
80
80
|
(ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
|
|
81
81
|
end
|
|
@@ -86,4 +86,4 @@ module Taxamatch
|
|
|
86
86
|
nil
|
|
87
87
|
end
|
|
88
88
|
end
|
|
89
|
-
end
|
|
89
|
+
end
|
data/spec/taxamatch_rb_spec.rb
CHANGED
|
@@ -1,19 +1,6 @@
|
|
|
1
1
|
# encoding: UTF-8
|
|
2
2
|
require 'spec_helper'
|
|
3
3
|
|
|
4
|
-
describe 'DamerauLevenshteinMod' do
|
|
5
|
-
it 'should get tests' do
|
|
6
|
-
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt', 5) do |y|
|
|
7
|
-
dl = Taxamatch::DamerauLevenshteinMod.new
|
|
8
|
-
if y
|
|
9
|
-
res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
|
|
10
|
-
puts y if res != y[4].to_i
|
|
11
|
-
res.should == y[4].to_i
|
|
12
|
-
end
|
|
13
|
-
end
|
|
14
|
-
end
|
|
15
|
-
end
|
|
16
|
-
|
|
17
4
|
describe 'Atomizer' do
|
|
18
5
|
before(:all) do
|
|
19
6
|
@parser = Taxamatch::Atomizer.new
|
|
@@ -54,12 +41,11 @@ describe 'Taxamatch::Base' do
|
|
|
54
41
|
end
|
|
55
42
|
|
|
56
43
|
it 'should get txt tests' do
|
|
57
|
-
dl = Taxamatch::DamerauLevenshteinMod.new
|
|
58
44
|
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
|
|
59
45
|
if y
|
|
60
46
|
y[2] = y[2] == 'true' ? true : false
|
|
61
47
|
res = @tm.taxamatch(y[0], y[1], false)
|
|
62
|
-
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
|
48
|
+
# puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
|
63
49
|
res['match'].should == y[2]
|
|
64
50
|
res['edit_distance'].should == y[3].to_i
|
|
65
51
|
end
|
data/taxamatch_rb.gemspec
CHANGED
|
@@ -5,14 +5,13 @@
|
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |s|
|
|
7
7
|
s.name = "taxamatch_rb"
|
|
8
|
-
s.version = "0.8.3"
|
|
8
|
+
s.version = "0.8.4"
|
|
9
9
|
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
11
|
s.authors = ["Dmitry Mozzherin"]
|
|
12
|
-
s.date = "2012-02-
|
|
12
|
+
s.date = "2012-02-13"
|
|
13
13
|
s.description = "This gem implements algorithm for fuzzy matching scientific names developed by Tony Rees"
|
|
14
14
|
s.email = "dmozzherin@eol.org"
|
|
15
|
-
s.extensions = ["ext/damerau_levenshtein/extconf.rb"]
|
|
16
15
|
s.extra_rdoc_files = [
|
|
17
16
|
"LICENSE",
|
|
18
17
|
"README.rdoc"
|
|
@@ -25,14 +24,11 @@ Gem::Specification.new do |s|
|
|
|
25
24
|
"README.rdoc",
|
|
26
25
|
"Rakefile",
|
|
27
26
|
"VERSION",
|
|
28
|
-
"ext/damerau_levenshtein/damerau_levenshtein.c",
|
|
29
27
|
"lib/taxamatch_rb.rb",
|
|
30
28
|
"lib/taxamatch_rb/atomizer.rb",
|
|
31
29
|
"lib/taxamatch_rb/authmatch.rb",
|
|
32
|
-
"lib/taxamatch_rb/damerau_levenshtein_mod.rb",
|
|
33
30
|
"lib/taxamatch_rb/normalizer.rb",
|
|
34
31
|
"lib/taxamatch_rb/phonetizer.rb",
|
|
35
|
-
"spec/damerau_levenshtein_mod_test.txt",
|
|
36
32
|
"spec/spec.opts",
|
|
37
33
|
"spec/spec_helper.rb",
|
|
38
34
|
"spec/taxamatch_rb_spec.rb",
|
|
@@ -49,6 +45,7 @@ Gem::Specification.new do |s|
|
|
|
49
45
|
|
|
50
46
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
|
51
47
|
s.add_runtime_dependency(%q<biodiversity19>, [">= 1.0.10"])
|
|
48
|
+
s.add_runtime_dependency(%q<damerau-levenshtein>, [">= 0.5.4"])
|
|
52
49
|
s.add_development_dependency(%q<rake-compiler>, [">= 0"])
|
|
53
50
|
s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
|
|
54
51
|
s.add_development_dependency(%q<cucumber>, [">= 0"])
|
|
@@ -61,6 +58,7 @@ Gem::Specification.new do |s|
|
|
|
61
58
|
s.add_development_dependency(%q<mocha>, [">= 0"])
|
|
62
59
|
else
|
|
63
60
|
s.add_dependency(%q<biodiversity19>, [">= 1.0.10"])
|
|
61
|
+
s.add_dependency(%q<damerau-levenshtein>, [">= 0.5.4"])
|
|
64
62
|
s.add_dependency(%q<rake-compiler>, [">= 0"])
|
|
65
63
|
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
|
66
64
|
s.add_dependency(%q<cucumber>, [">= 0"])
|
|
@@ -74,6 +72,7 @@ Gem::Specification.new do |s|
|
|
|
74
72
|
end
|
|
75
73
|
else
|
|
76
74
|
s.add_dependency(%q<biodiversity19>, [">= 1.0.10"])
|
|
75
|
+
s.add_dependency(%q<damerau-levenshtein>, [">= 0.5.4"])
|
|
77
76
|
s.add_dependency(%q<rake-compiler>, [">= 0"])
|
|
78
77
|
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
|
79
78
|
s.add_dependency(%q<cucumber>, [">= 0"])
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: taxamatch_rb
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.8.3
|
|
4
|
+
version: 0.8.4
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,11 +9,11 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2012-02-
|
|
12
|
+
date: 2012-02-13 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: biodiversity19
|
|
16
|
-
requirement: &
|
|
16
|
+
requirement: &70328583642780 !ruby/object:Gem::Requirement
|
|
17
17
|
none: false
|
|
18
18
|
requirements:
|
|
19
19
|
- - ! '>='
|
|
@@ -21,10 +21,21 @@ dependencies:
|
|
|
21
21
|
version: 1.0.10
|
|
22
22
|
type: :runtime
|
|
23
23
|
prerelease: false
|
|
24
|
-
version_requirements: *
|
|
24
|
+
version_requirements: *70328583642780
|
|
25
|
+
- !ruby/object:Gem::Dependency
|
|
26
|
+
name: damerau-levenshtein
|
|
27
|
+
requirement: &70328583642180 !ruby/object:Gem::Requirement
|
|
28
|
+
none: false
|
|
29
|
+
requirements:
|
|
30
|
+
- - ! '>='
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: 0.5.4
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: *70328583642180
|
|
25
36
|
- !ruby/object:Gem::Dependency
|
|
26
37
|
name: rake-compiler
|
|
27
|
-
requirement: &
|
|
38
|
+
requirement: &70328583629080 !ruby/object:Gem::Requirement
|
|
28
39
|
none: false
|
|
29
40
|
requirements:
|
|
30
41
|
- - ! '>='
|
|
@@ -32,10 +43,10 @@ dependencies:
|
|
|
32
43
|
version: '0'
|
|
33
44
|
type: :development
|
|
34
45
|
prerelease: false
|
|
35
|
-
version_requirements: *
|
|
46
|
+
version_requirements: *70328583629080
|
|
36
47
|
- !ruby/object:Gem::Dependency
|
|
37
48
|
name: rspec
|
|
38
|
-
requirement: &
|
|
49
|
+
requirement: &70328583628480 !ruby/object:Gem::Requirement
|
|
39
50
|
none: false
|
|
40
51
|
requirements:
|
|
41
52
|
- - ~>
|
|
@@ -43,10 +54,10 @@ dependencies:
|
|
|
43
54
|
version: 2.3.0
|
|
44
55
|
type: :development
|
|
45
56
|
prerelease: false
|
|
46
|
-
version_requirements: *
|
|
57
|
+
version_requirements: *70328583628480
|
|
47
58
|
- !ruby/object:Gem::Dependency
|
|
48
59
|
name: cucumber
|
|
49
|
-
requirement: &
|
|
60
|
+
requirement: &70328583627880 !ruby/object:Gem::Requirement
|
|
50
61
|
none: false
|
|
51
62
|
requirements:
|
|
52
63
|
- - ! '>='
|
|
@@ -54,10 +65,10 @@ dependencies:
|
|
|
54
65
|
version: '0'
|
|
55
66
|
type: :development
|
|
56
67
|
prerelease: false
|
|
57
|
-
version_requirements: *
|
|
68
|
+
version_requirements: *70328583627880
|
|
58
69
|
- !ruby/object:Gem::Dependency
|
|
59
70
|
name: bundler
|
|
60
|
-
requirement: &
|
|
71
|
+
requirement: &70328583627280 !ruby/object:Gem::Requirement
|
|
61
72
|
none: false
|
|
62
73
|
requirements:
|
|
63
74
|
- - ~>
|
|
@@ -65,10 +76,10 @@ dependencies:
|
|
|
65
76
|
version: 1.0.0
|
|
66
77
|
type: :development
|
|
67
78
|
prerelease: false
|
|
68
|
-
version_requirements: *
|
|
79
|
+
version_requirements: *70328583627280
|
|
69
80
|
- !ruby/object:Gem::Dependency
|
|
70
81
|
name: jeweler
|
|
71
|
-
requirement: &
|
|
82
|
+
requirement: &70328583626700 !ruby/object:Gem::Requirement
|
|
72
83
|
none: false
|
|
73
84
|
requirements:
|
|
74
85
|
- - ~>
|
|
@@ -76,10 +87,10 @@ dependencies:
|
|
|
76
87
|
version: 1.6.0
|
|
77
88
|
type: :development
|
|
78
89
|
prerelease: false
|
|
79
|
-
version_requirements: *
|
|
90
|
+
version_requirements: *70328583626700
|
|
80
91
|
- !ruby/object:Gem::Dependency
|
|
81
92
|
name: rcov
|
|
82
|
-
requirement: &
|
|
93
|
+
requirement: &70328583626100 !ruby/object:Gem::Requirement
|
|
83
94
|
none: false
|
|
84
95
|
requirements:
|
|
85
96
|
- - ! '>='
|
|
@@ -87,10 +98,10 @@ dependencies:
|
|
|
87
98
|
version: '0'
|
|
88
99
|
type: :development
|
|
89
100
|
prerelease: false
|
|
90
|
-
version_requirements: *
|
|
101
|
+
version_requirements: *70328583626100
|
|
91
102
|
- !ruby/object:Gem::Dependency
|
|
92
103
|
name: ruby-debug19
|
|
93
|
-
requirement: &
|
|
104
|
+
requirement: &70328583625500 !ruby/object:Gem::Requirement
|
|
94
105
|
none: false
|
|
95
106
|
requirements:
|
|
96
107
|
- - ! '>='
|
|
@@ -98,10 +109,10 @@ dependencies:
|
|
|
98
109
|
version: '0'
|
|
99
110
|
type: :development
|
|
100
111
|
prerelease: false
|
|
101
|
-
version_requirements: *
|
|
112
|
+
version_requirements: *70328583625500
|
|
102
113
|
- !ruby/object:Gem::Dependency
|
|
103
114
|
name: ruby-prof
|
|
104
|
-
requirement: &
|
|
115
|
+
requirement: &70328583624960 !ruby/object:Gem::Requirement
|
|
105
116
|
none: false
|
|
106
117
|
requirements:
|
|
107
118
|
- - ! '>='
|
|
@@ -109,10 +120,10 @@ dependencies:
|
|
|
109
120
|
version: '0'
|
|
110
121
|
type: :development
|
|
111
122
|
prerelease: false
|
|
112
|
-
version_requirements: *
|
|
123
|
+
version_requirements: *70328583624960
|
|
113
124
|
- !ruby/object:Gem::Dependency
|
|
114
125
|
name: shoulda
|
|
115
|
-
requirement: &
|
|
126
|
+
requirement: &70328583624380 !ruby/object:Gem::Requirement
|
|
116
127
|
none: false
|
|
117
128
|
requirements:
|
|
118
129
|
- - ! '>='
|
|
@@ -120,10 +131,10 @@ dependencies:
|
|
|
120
131
|
version: '0'
|
|
121
132
|
type: :development
|
|
122
133
|
prerelease: false
|
|
123
|
-
version_requirements: *
|
|
134
|
+
version_requirements: *70328583624380
|
|
124
135
|
- !ruby/object:Gem::Dependency
|
|
125
136
|
name: mocha
|
|
126
|
-
requirement: &
|
|
137
|
+
requirement: &70328583623720 !ruby/object:Gem::Requirement
|
|
127
138
|
none: false
|
|
128
139
|
requirements:
|
|
129
140
|
- - ! '>='
|
|
@@ -131,13 +142,12 @@ dependencies:
|
|
|
131
142
|
version: '0'
|
|
132
143
|
type: :development
|
|
133
144
|
prerelease: false
|
|
134
|
-
version_requirements: *
|
|
145
|
+
version_requirements: *70328583623720
|
|
135
146
|
description: This gem implements algorithm for fuzzy matching scientific names developed
|
|
136
147
|
by Tony Rees
|
|
137
148
|
email: dmozzherin@eol.org
|
|
138
149
|
executables: []
|
|
139
|
-
extensions:
|
|
140
|
-
- ext/damerau_levenshtein/extconf.rb
|
|
150
|
+
extensions: []
|
|
141
151
|
extra_rdoc_files:
|
|
142
152
|
- LICENSE
|
|
143
153
|
- README.rdoc
|
|
@@ -149,20 +159,16 @@ files:
|
|
|
149
159
|
- README.rdoc
|
|
150
160
|
- Rakefile
|
|
151
161
|
- VERSION
|
|
152
|
-
- ext/damerau_levenshtein/damerau_levenshtein.c
|
|
153
162
|
- lib/taxamatch_rb.rb
|
|
154
163
|
- lib/taxamatch_rb/atomizer.rb
|
|
155
164
|
- lib/taxamatch_rb/authmatch.rb
|
|
156
|
-
- lib/taxamatch_rb/damerau_levenshtein_mod.rb
|
|
157
165
|
- lib/taxamatch_rb/normalizer.rb
|
|
158
166
|
- lib/taxamatch_rb/phonetizer.rb
|
|
159
|
-
- spec/damerau_levenshtein_mod_test.txt
|
|
160
167
|
- spec/spec.opts
|
|
161
168
|
- spec/spec_helper.rb
|
|
162
169
|
- spec/taxamatch_rb_spec.rb
|
|
163
170
|
- spec/taxamatch_test.txt
|
|
164
171
|
- taxamatch_rb.gemspec
|
|
165
|
-
- ext/damerau_levenshtein/extconf.rb
|
|
166
172
|
homepage: http://github.com/GlobalNamesArchitecture/taxamatch_rb
|
|
167
173
|
licenses: []
|
|
168
174
|
post_install_message:
|
|
@@ -177,7 +183,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
177
183
|
version: '0'
|
|
178
184
|
segments:
|
|
179
185
|
- 0
|
|
180
|
-
hash:
|
|
186
|
+
hash: 1595435064862339145
|
|
181
187
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
182
188
|
none: false
|
|
183
189
|
requirements:
|
|
@@ -1,112 +0,0 @@
|
|
|
1
|
-
#include "ruby.h"
|
|
2
|
-
|
|
3
|
-
VALUE DamerauLevenshtein = Qnil;
|
|
4
|
-
|
|
5
|
-
void Init_damerau_levenshtein();
|
|
6
|
-
|
|
7
|
-
VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance);
|
|
8
|
-
|
|
9
|
-
void Init_damerau_levenshtein() {
|
|
10
|
-
DamerauLevenshtein = rb_define_module("DamerauLevenshtein");
|
|
11
|
-
rb_define_method(DamerauLevenshtein, "distance_utf", method_distance_utf, 4);
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VALUE _max_distance){
|
|
15
|
-
int i, i1, j, j1, k, half_tl, cost, *d, distance, del, ins, subs, transp, block;
|
|
16
|
-
int sl, tl, half_sl;
|
|
17
|
-
int stop_execution = 0;
|
|
18
|
-
int min = 0;
|
|
19
|
-
int current_distance = 0;
|
|
20
|
-
|
|
21
|
-
int block_size = NUM2INT(_block_size);
|
|
22
|
-
int max_distance = NUM2INT(_max_distance);
|
|
23
|
-
|
|
24
|
-
VALUE *sv = RARRAY_PTR(_s);
|
|
25
|
-
VALUE *tv = RARRAY_PTR(_t);
|
|
26
|
-
|
|
27
|
-
sl = (int) RARRAY_LEN(_s);
|
|
28
|
-
tl = (int) RARRAY_LEN(_t);
|
|
29
|
-
|
|
30
|
-
if (sl == 0) return INT2NUM(tl);
|
|
31
|
-
if (tl == 0) return INT2NUM(sl);
|
|
32
|
-
//case of lengths 1 must present or it will break further in the code
|
|
33
|
-
if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);
|
|
34
|
-
|
|
35
|
-
int s[sl];
|
|
36
|
-
int t[tl];
|
|
37
|
-
|
|
38
|
-
for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
|
|
39
|
-
for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);
|
|
40
|
-
|
|
41
|
-
sl++;
|
|
42
|
-
tl++;
|
|
43
|
-
|
|
44
|
-
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
|
|
45
|
-
d = malloc((sizeof(int))*(sl)*(tl));
|
|
46
|
-
//populate 'vertical' row starting from the 2nd position (first one is filled already)
|
|
47
|
-
for(i = 0; i < tl; i++){
|
|
48
|
-
d[i*sl] = i;
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
//fill up array with scores
|
|
52
|
-
for(i = 1; i<sl; i++){
|
|
53
|
-
d[i] = i;
|
|
54
|
-
if (stop_execution == 1) break;
|
|
55
|
-
current_distance = 10000;
|
|
56
|
-
for(j = 1; j<tl; j++){
|
|
57
|
-
|
|
58
|
-
cost = 1;
|
|
59
|
-
if(s[i-1] == t[j-1]) cost = 0;
|
|
60
|
-
|
|
61
|
-
half_sl = (sl - 1)/2;
|
|
62
|
-
half_tl = (tl - 1)/2;
|
|
63
|
-
|
|
64
|
-
block = block_size < half_sl ? block_size : half_sl;
|
|
65
|
-
block = block < half_tl ? block : half_tl;
|
|
66
|
-
|
|
67
|
-
while (block >= 1){
|
|
68
|
-
int swap1 = 1;
|
|
69
|
-
int swap2 = 1;
|
|
70
|
-
i1 = i - (block * 2);
|
|
71
|
-
j1 = j - (block * 2);
|
|
72
|
-
for (k = i1; k < i1 + block; k++) {
|
|
73
|
-
if (s[k] != t[k + block]){
|
|
74
|
-
swap1 = 0;
|
|
75
|
-
break;
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
for (k = j1; k < j1 + block; k++) {
|
|
79
|
-
if (t[k] != s[k + block]){
|
|
80
|
-
swap2 = 0;
|
|
81
|
-
break;
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
del = d[j*sl + i - 1] + 1;
|
|
86
|
-
ins = d[(j-1)*sl + i] + 1;
|
|
87
|
-
min = del;
|
|
88
|
-
if (ins < min) min = ins;
|
|
89
|
-
//if (i == 2 && j==2) return INT2NUM(swap2+5);
|
|
90
|
-
if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
|
|
91
|
-
transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
|
|
92
|
-
if (transp < min) min = transp;
|
|
93
|
-
block = 0;
|
|
94
|
-
} else if (block == 1) {
|
|
95
|
-
subs = d[(j-1)*sl + i - 1] + cost;
|
|
96
|
-
if (subs < min) min = subs;
|
|
97
|
-
}
|
|
98
|
-
block--;
|
|
99
|
-
}
|
|
100
|
-
d[j*sl+i]=min;
|
|
101
|
-
if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
|
|
102
|
-
}
|
|
103
|
-
if (current_distance > max_distance) {
|
|
104
|
-
stop_execution = 1;
|
|
105
|
-
}
|
|
106
|
-
}
|
|
107
|
-
distance=d[sl * tl - 1];
|
|
108
|
-
if (stop_execution == 1) distance = current_distance;
|
|
109
|
-
|
|
110
|
-
free(d);
|
|
111
|
-
return INT2NUM(distance);
|
|
112
|
-
}
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
# encoding: UTF-8
|
|
2
|
-
|
|
3
|
-
require 'damerau_levenshtein'
|
|
4
|
-
|
|
5
|
-
module Taxamatch
|
|
6
|
-
|
|
7
|
-
class DamerauLevenshteinMod
|
|
8
|
-
include DamerauLevenshtein
|
|
9
|
-
|
|
10
|
-
def distance(str1, str2, block_size=2, max_distance=10)
|
|
11
|
-
distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
|
|
12
|
-
end
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
if __FILE__ == $0
|
|
18
|
-
|
|
19
|
-
a = Taxamatch::DamerauLevenshteinMod.new
|
|
20
|
-
s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
|
|
21
|
-
t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')
|
|
22
|
-
|
|
23
|
-
#puts s.join(",")
|
|
24
|
-
#puts t.join(",")
|
|
25
|
-
|
|
26
|
-
start = Time.now
|
|
27
|
-
(1..100000).each do
|
|
28
|
-
a.distance('Cedarinia scabra Sjöstedt 1921', 'Cedarinia scabra Söjstedt 1921',1,10)
|
|
29
|
-
end
|
|
30
|
-
puts "with unpack time: " + (Time.now - start).to_s + ' sec'
|
|
31
|
-
|
|
32
|
-
start = Time.now
|
|
33
|
-
(1..100000).each do
|
|
34
|
-
a.distance_utf(s, t, 1, 10)
|
|
35
|
-
end
|
|
36
|
-
puts 'utf time: ' + (Time.now - start).to_s + ' sec'
|
|
37
|
-
|
|
38
|
-
#puts a.distance('Cedarinia scabra Sjöstedt 1921','Cedarinia scabra Söjstedt 1921')
|
|
39
|
-
#puts a.distance_utf(s, t, 2, 10)
|
|
40
|
-
#puts a.distance('tar','atp',1,10);
|
|
41
|
-
puts a.distance('sub', 'usb', 1, 10);
|
|
42
|
-
end
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
######################
|
|
2
|
-
# Tests for modified Damerau Levenshtein Distance algorithm (UTF-8 compatible)
|
|
3
|
-
#
|
|
4
|
-
# * B. Boehmer, T. Rees, Modified Damerau-Levenshtein Distance, Boehmer & Rees 2008
|
|
5
|
-
# * F.J. Damerau. A technique for computer detection and correction of spelling errors, Communications of the ACM, 1964
|
|
6
|
-
#
|
|
7
|
-
# Fields:
|
|
8
|
-
# String1|String2|maximum distance|transposition block size|expected distance
|
|
9
|
-
# - String1, String2
|
|
10
|
-
# compared strings
|
|
11
|
-
# - maximum distance
|
|
12
|
-
# stops execution of the algorithm when calculated distance exceeds the maximum distance number
|
|
13
|
-
# - transosition block size
|
|
14
|
-
# determines how many characters can be transposed. Block size 1 returns score according to Damerau-Levenshtein algorithm
|
|
15
|
-
# - expected distance
|
|
16
|
-
# resulting distance that has to be achieved by the algorithm
|
|
17
|
-
# Note: algorithm does not try to normalize or interpret strings in any way.
|
|
18
|
-
######################
|
|
19
|
-
|
|
20
|
-
#it whould recognize the exact match
|
|
21
|
-
Pomatomus|Pomatomus|10|1|0
|
|
22
|
-
|
|
23
|
-
#it should not try to normalize incoming strings
|
|
24
|
-
Pomatomus|Pomatomus|10|1|1
|
|
25
|
-
Pomatomus|pomatomus|10|1|1
|
|
26
|
-
|
|
27
|
-
#it should calculate special cases
|
|
28
|
-
Pomatomus||10|1|9
|
|
29
|
-
|Pomatomus|10|1|9
|
|
30
|
-
P|p|10|1|1
|
|
31
|
-
#TODO: one letter vs longer string generates a big negative number
|
|
32
|
-
#L|Linneaus|10|1|7
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
#it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
|
|
36
|
-
Pomatomus|Pomatomux|10|1|1
|
|
37
|
-
Pmatomus|Pomatomus|10|1|1
|
|
38
|
-
Pomatomus|Pmatomus|10|1|1
|
|
39
|
-
Rpmatomus|Pomatomus|10|1|2
|
|
40
|
-
Pommtomus|Pomatomus|10|1|1
|
|
41
|
-
Potamomus|Pomatomus|10|1|2
|
|
42
|
-
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
|
|
43
|
-
Pomatomus|oPmatomus|10|1|1
|
|
44
|
-
Pomatomus|Pomatomsu|10|1|1
|
|
45
|
-
Pomtaomus|Pomatomus|10|1|1
|
|
46
|
-
Pomatoums|Pomatomus|10|1|1
|
|
47
|
-
Potamomus|Pomatomus|10|1|2
|
|
48
|
-
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|2|1
|
|
49
|
-
|
|
50
|
-
#it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size > 2)
|
|
51
|
-
serrulatus|serratulus|10|2|2
|
|
52
|
-
Pomatomus|Poomumats|10|3|3
|
|
53
|
-
vesiculosus|vecusilosus|10|1|4
|
|
54
|
-
vesiculosus|vecusilosus|10|2|2
|
|
55
|
-
trimerophyton|mertriophyton|10|1|6
|
|
56
|
-
trimerophyton|mertriophyton|10|3|3
|
|
57
|
-
|
|
58
|
-
#it should stop trying if distance exceeds maximum allowed distance
|
|
59
|
-
Pxxxxomus|Pomatomus|10|1|4
|
|
60
|
-
Pxxxxomus|Pomatomus|2|1|3
|
|
61
|
-
|
|
62
|
-
#
|
|
63
|
-
PUNCTATA|PUNCTATA|10|1|0
|