taxamatch_rb 0.6.5 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +75 -0
- data/README.rdoc +1 -5
- data/ext/damerau_levenshtein/extconf.rb +11 -0
- data/lib/taxamatch_rb.rb +19 -19
- data/lib/taxamatch_rb/damerau_levenshtein.bundle +0 -0
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +10 -107
- data/spec/damerau_levenshtein_mod_test.txt +1 -1
- data/spec/spec_helper.rb +1 -7
- data/spec/taxamatch_rb_spec.rb +21 -21
- metadata +186 -17
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
GEM
|
|
2
|
+
remote: http://rubygems.org/
|
|
3
|
+
specs:
|
|
4
|
+
archive-tar-minitar (0.5.2)
|
|
5
|
+
biodiversity (0.5.16)
|
|
6
|
+
json
|
|
7
|
+
treetop
|
|
8
|
+
biodiversity19 (0.5.16)
|
|
9
|
+
json
|
|
10
|
+
treetop
|
|
11
|
+
builder (3.0.0)
|
|
12
|
+
columnize (0.3.3)
|
|
13
|
+
cucumber (1.0.0)
|
|
14
|
+
builder (>= 2.1.2)
|
|
15
|
+
diff-lcs (>= 1.1.2)
|
|
16
|
+
gherkin (~> 2.4.1)
|
|
17
|
+
json (>= 1.4.6)
|
|
18
|
+
term-ansicolor (>= 1.0.5)
|
|
19
|
+
diff-lcs (1.1.2)
|
|
20
|
+
gherkin (2.4.1)
|
|
21
|
+
json (>= 1.4.6)
|
|
22
|
+
git (1.2.5)
|
|
23
|
+
jeweler (1.6.2)
|
|
24
|
+
bundler (~> 1.0)
|
|
25
|
+
git (>= 1.2.5)
|
|
26
|
+
rake
|
|
27
|
+
json (1.5.3)
|
|
28
|
+
linecache19 (0.5.12)
|
|
29
|
+
ruby_core_source (>= 0.1.4)
|
|
30
|
+
mocha (0.9.12)
|
|
31
|
+
polyglot (0.3.1)
|
|
32
|
+
rake (0.9.2)
|
|
33
|
+
rake-compiler (0.7.9)
|
|
34
|
+
rake
|
|
35
|
+
rcov (0.9.9)
|
|
36
|
+
rspec (2.3.0)
|
|
37
|
+
rspec-core (~> 2.3.0)
|
|
38
|
+
rspec-expectations (~> 2.3.0)
|
|
39
|
+
rspec-mocks (~> 2.3.0)
|
|
40
|
+
rspec-core (2.3.1)
|
|
41
|
+
rspec-expectations (2.3.0)
|
|
42
|
+
diff-lcs (~> 1.1.2)
|
|
43
|
+
rspec-mocks (2.3.0)
|
|
44
|
+
ruby-debug-base19 (0.11.25)
|
|
45
|
+
columnize (>= 0.3.1)
|
|
46
|
+
linecache19 (>= 0.5.11)
|
|
47
|
+
ruby_core_source (>= 0.1.4)
|
|
48
|
+
ruby-debug19 (0.11.6)
|
|
49
|
+
columnize (>= 0.3.1)
|
|
50
|
+
linecache19 (>= 0.5.11)
|
|
51
|
+
ruby-debug-base19 (>= 0.11.19)
|
|
52
|
+
ruby-prof (0.10.7)
|
|
53
|
+
ruby_core_source (0.1.5)
|
|
54
|
+
archive-tar-minitar (>= 0.5.2)
|
|
55
|
+
shoulda (2.11.3)
|
|
56
|
+
term-ansicolor (1.0.5)
|
|
57
|
+
treetop (1.4.9)
|
|
58
|
+
polyglot (>= 0.3.1)
|
|
59
|
+
|
|
60
|
+
PLATFORMS
|
|
61
|
+
ruby
|
|
62
|
+
|
|
63
|
+
DEPENDENCIES
|
|
64
|
+
biodiversity (~> 0.5.13)
|
|
65
|
+
biodiversity19 (~> 0.5.13)
|
|
66
|
+
bundler (~> 1.0.0)
|
|
67
|
+
cucumber
|
|
68
|
+
jeweler (~> 1.6.0)
|
|
69
|
+
mocha
|
|
70
|
+
rake-compiler
|
|
71
|
+
rcov
|
|
72
|
+
rspec (~> 2.3.0)
|
|
73
|
+
ruby-debug19
|
|
74
|
+
ruby-prof
|
|
75
|
+
shoulda
|
data/README.rdoc
CHANGED
|
@@ -14,11 +14,7 @@ Taxamatch_Rb is compatible with ruby versions 1.8.7 and 1.9.1 and higher
|
|
|
14
14
|
|
|
15
15
|
== Installation
|
|
16
16
|
|
|
17
|
-
sudo gem install
|
|
18
|
-
|
|
19
|
-
or
|
|
20
|
-
sudo gem sources -a http://gems.github.com #(you only have to do this once)
|
|
21
|
-
sudo gem install dimus-taxamatch_rb
|
|
17
|
+
sudo gem install taxamatch_rb
|
|
22
18
|
|
|
23
19
|
== Usage
|
|
24
20
|
|
data/lib/taxamatch_rb.rb
CHANGED
|
@@ -8,38 +8,38 @@ require 'taxamatch_rb/normalizer'
|
|
|
8
8
|
require 'taxamatch_rb/phonetizer'
|
|
9
9
|
require 'taxamatch_rb/authmatch'
|
|
10
10
|
|
|
11
|
-
$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
|
|
11
|
+
$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
|
|
12
12
|
|
|
13
13
|
module Taxamatch
|
|
14
14
|
|
|
15
15
|
class Base
|
|
16
|
-
|
|
16
|
+
|
|
17
17
|
def initialize
|
|
18
18
|
@parser = Taxamatch::Atomizer.new
|
|
19
19
|
@dlm = Taxamatch::DamerauLevenshteinMod.new
|
|
20
20
|
end
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
|
|
22
|
+
|
|
23
23
|
#takes two scientific names and returns true if names match and false if they don't
|
|
24
|
-
def taxamatch(str1, str2, return_boolean = true)
|
|
24
|
+
def taxamatch(str1, str2, return_boolean = true)
|
|
25
25
|
preparsed_1 = @parser.parse(str1)
|
|
26
26
|
preparsed_2 = @parser.parse(str2)
|
|
27
27
|
match = taxamatch_preparsed(preparsed_1, preparsed_2) rescue nil
|
|
28
28
|
return_boolean ? (!!match && match['match']) : match
|
|
29
29
|
end
|
|
30
|
-
|
|
31
|
-
#takes two hashes of parsed scientific names, analyses them and returns back
|
|
30
|
+
|
|
31
|
+
#takes two hashes of parsed scientific names, analyses them and returns back
|
|
32
32
|
#this function is useful when species strings are preparsed.
|
|
33
33
|
def taxamatch_preparsed(preparsed_1, preparsed_2)
|
|
34
34
|
result = nil
|
|
35
|
-
result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
|
35
|
+
result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
|
36
36
|
result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
|
|
37
37
|
if result && result['match']
|
|
38
|
-
result['match'] = match_authors(preparsed_1, preparsed_2) == 0 ? false : true
|
|
38
|
+
result['match'] = match_authors(preparsed_1, preparsed_2) == 0 ? false : true
|
|
39
39
|
end
|
|
40
40
|
return result
|
|
41
41
|
end
|
|
42
|
-
|
|
42
|
+
|
|
43
43
|
def match_uninomial(preparsed_1, preparsed_2)
|
|
44
44
|
match_genera(preparsed_1[:uninomial], preparsed_2[:uninomial])
|
|
45
45
|
end
|
|
@@ -54,14 +54,14 @@ module Taxamatch
|
|
|
54
54
|
match_hash = match_matches(gen_match, sp_match, infrasp_match)
|
|
55
55
|
elsif (preparsed_1[:infraspecies] && !preparsed_2[:infraspecies]) || (!preparsed_1[:infraspecies] && preparsed_2[:infraspecies])
|
|
56
56
|
match_hash = { 'match' => false, 'edit_distance' => 5, 'phonetic_match' => false }
|
|
57
|
-
total_length += preparsed_1[:infraspecies] ? preparsed_1[:infraspecies][0][:string].size : preparsed_2[:infraspecies][0][:string].size
|
|
57
|
+
total_length += preparsed_1[:infraspecies] ? preparsed_1[:infraspecies][0][:string].size : preparsed_2[:infraspecies][0][:string].size
|
|
58
58
|
else
|
|
59
59
|
match_hash = match_matches(gen_match, sp_match)
|
|
60
60
|
end
|
|
61
61
|
match_hash.merge({'score' => (1 - match_hash['edit_distance']/(total_length/2))})
|
|
62
62
|
match_hash
|
|
63
63
|
end
|
|
64
|
-
|
|
64
|
+
|
|
65
65
|
def match_genera(genus1, genus2)
|
|
66
66
|
genus1_length = genus1[:normalized].size
|
|
67
67
|
genus2_length = genus2[:normalized].size
|
|
@@ -69,10 +69,10 @@ module Taxamatch
|
|
|
69
69
|
match = false
|
|
70
70
|
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],1,3) #TODO put block = 2
|
|
71
71
|
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.2
|
|
72
|
-
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
|
73
|
-
|
|
72
|
+
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
|
73
|
+
|
|
74
74
|
match = true if ed <= 3 && (min_length > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
|
|
75
|
-
{'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
|
75
|
+
{'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
|
76
76
|
end
|
|
77
77
|
|
|
78
78
|
def match_species(sp1, sp2)
|
|
@@ -86,11 +86,11 @@ module Taxamatch
|
|
|
86
86
|
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.3334
|
|
87
87
|
#puts 's: %s, %s, %s' % [sp1[:normalized], sp2[:normalized], ed]
|
|
88
88
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
|
|
89
|
-
|
|
89
|
+
|
|
90
90
|
match = true if ed <= 4 && (min_length >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
|
|
91
91
|
{ 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
|
92
92
|
end
|
|
93
|
-
|
|
93
|
+
|
|
94
94
|
def match_authors(preparsed_1, preparsed_2)
|
|
95
95
|
au1 = preparsed_1[:all_authors]
|
|
96
96
|
au2 = preparsed_2[:all_authors]
|
|
@@ -98,8 +98,8 @@ module Taxamatch
|
|
|
98
98
|
yr2 = preparsed_2[:all_years]
|
|
99
99
|
Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
|
|
100
100
|
end
|
|
101
|
-
|
|
102
|
-
def match_matches(genus_match, species_match, infraspecies_match = nil)
|
|
101
|
+
|
|
102
|
+
def match_matches(genus_match, species_match, infraspecies_match = nil)
|
|
103
103
|
match = species_match
|
|
104
104
|
if infraspecies_match
|
|
105
105
|
match['edit_distance'] += infraspecies_match['edit_distance']
|
|
Binary file
|
|
@@ -1,122 +1,25 @@
|
|
|
1
1
|
# encoding: UTF-8
|
|
2
|
-
|
|
3
|
-
require '
|
|
4
|
-
|
|
2
|
+
|
|
3
|
+
require File.join(File.dirname(__FILE__), 'damerau_levenshtein')
|
|
4
|
+
|
|
5
5
|
module Taxamatch
|
|
6
6
|
|
|
7
7
|
class DamerauLevenshteinMod
|
|
8
|
+
include DamerauLevenshtein
|
|
9
|
+
|
|
8
10
|
def distance(str1, str2, block_size=2, max_distance=10)
|
|
9
|
-
# puts str1.unpack("U*");
|
|
10
11
|
distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
|
|
11
12
|
end
|
|
12
|
-
|
|
13
|
-
inline do |builder|
|
|
14
|
-
builder.c "
|
|
15
|
-
static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
|
|
16
|
-
int i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block;
|
|
17
|
-
int stop_execution = 0;
|
|
18
|
-
int min = 0;
|
|
19
|
-
int current_distance = 0;
|
|
20
|
-
|
|
21
|
-
VALUE *sv = RARRAY_PTR(_s);
|
|
22
|
-
VALUE *tv = RARRAY_PTR(_t);
|
|
23
|
-
|
|
24
|
-
sl = RARRAY_LEN(_s);
|
|
25
|
-
tl = RARRAY_LEN(_t);
|
|
26
|
-
|
|
27
|
-
if (sl == 0) return INT2NUM(tl);
|
|
28
|
-
if (tl == 0) return INT2NUM(sl);
|
|
29
|
-
//case of lengths 1 must present or it will break further in the code
|
|
30
|
-
if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);
|
|
31
|
-
|
|
32
|
-
int s[sl];
|
|
33
|
-
int t[tl];
|
|
34
|
-
|
|
35
|
-
for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
|
|
36
|
-
for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);
|
|
37
|
-
|
|
38
|
-
sl++;
|
|
39
|
-
tl++;
|
|
40
|
-
|
|
41
|
-
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
|
|
42
|
-
d = malloc((sizeof(int))*(sl)*(tl));
|
|
43
|
-
//populate 'vertical' row starting from the 2nd position (first one is filled already)
|
|
44
|
-
for(i = 0; i < tl; i++){
|
|
45
|
-
d[i*sl] = i;
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
//fill up array with scores
|
|
49
|
-
for(i = 1; i<sl; i++){
|
|
50
|
-
d[i] = i;
|
|
51
|
-
if (stop_execution == 1) break;
|
|
52
|
-
current_distance = 10000;
|
|
53
|
-
for(j = 1; j<tl; j++){
|
|
54
|
-
|
|
55
|
-
cost = 1;
|
|
56
|
-
if(s[i-1] == t[j-1]) cost = 0;
|
|
57
|
-
|
|
58
|
-
half_sl = (sl - 1)/2;
|
|
59
|
-
half_tl = (tl - 1)/2;
|
|
60
|
-
|
|
61
|
-
block = block_size < half_sl ? block_size : half_sl;
|
|
62
|
-
block = block < half_tl ? block : half_tl;
|
|
63
|
-
|
|
64
|
-
while (block >= 1){
|
|
65
|
-
int swap1 = 1;
|
|
66
|
-
int swap2 = 1;
|
|
67
|
-
i1 = i - (block * 2);
|
|
68
|
-
j1 = j - (block * 2);
|
|
69
|
-
for (k = i1; k < i1 + block; k++) {
|
|
70
|
-
if (s[k] != t[k + block]){
|
|
71
|
-
swap1 = 0;
|
|
72
|
-
break;
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
for (k = j1; k < j1 + block; k++) {
|
|
76
|
-
if (t[k] != s[k + block]){
|
|
77
|
-
swap2 = 0;
|
|
78
|
-
break;
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
del = d[j*sl + i - 1] + 1;
|
|
83
|
-
ins = d[(j-1)*sl + i] + 1;
|
|
84
|
-
min = del;
|
|
85
|
-
if (ins < min) min = ins;
|
|
86
|
-
//if (i == 2 && j==2) return INT2NUM(swap2+5);
|
|
87
|
-
if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
|
|
88
|
-
transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
|
|
89
|
-
if (transp < min) min = transp;
|
|
90
|
-
block = 0;
|
|
91
|
-
} else if (block == 1) {
|
|
92
|
-
subs = d[(j-1)*sl + i - 1] + cost;
|
|
93
|
-
if (subs < min) min = subs;
|
|
94
|
-
}
|
|
95
|
-
block--;
|
|
96
|
-
}
|
|
97
|
-
d[j*sl+i]=min;
|
|
98
|
-
if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
|
|
99
|
-
}
|
|
100
|
-
if (current_distance > max_distance) {
|
|
101
|
-
stop_execution = 1;
|
|
102
|
-
}
|
|
103
|
-
}
|
|
104
|
-
distance=d[sl * tl - 1];
|
|
105
|
-
if (stop_execution == 1) distance = current_distance;
|
|
106
|
-
|
|
107
|
-
free(d);
|
|
108
|
-
return INT2NUM(distance);
|
|
109
|
-
}
|
|
110
|
-
"
|
|
111
|
-
end
|
|
112
13
|
end
|
|
14
|
+
|
|
113
15
|
end
|
|
114
16
|
|
|
115
17
|
if __FILE__ == $0
|
|
116
|
-
|
|
18
|
+
|
|
19
|
+
a = Taxamatch::DamerauLevenshteinMod.new
|
|
117
20
|
s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
|
|
118
21
|
t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')
|
|
119
|
-
|
|
22
|
+
|
|
120
23
|
#puts s.join(",")
|
|
121
24
|
#puts t.join(",")
|
|
122
25
|
|
|
@@ -133,7 +36,7 @@ if __FILE__ == $0
|
|
|
133
36
|
puts 'utf time: ' + (Time.now - start).to_s + ' sec'
|
|
134
37
|
|
|
135
38
|
#puts a.distance('Cedarinia scabra Sjöstedt 1921','Cedarinia scabra Söjstedt 1921')
|
|
136
|
-
#puts a.distance_utf(s, t, 2, 10)
|
|
39
|
+
#puts a.distance_utf(s, t, 2, 10)
|
|
137
40
|
#puts a.distance('tar','atp',1,10);
|
|
138
41
|
puts a.distance('sub', 'usb', 1, 10);
|
|
139
42
|
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/taxamatch_rb_spec.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# encoding: UTF-8
|
|
2
|
-
require
|
|
2
|
+
require 'spec_helper'
|
|
3
3
|
|
|
4
4
|
describe 'DamerauLevenshteinMod' do
|
|
5
5
|
it 'should get tests' do
|
|
@@ -7,7 +7,7 @@ describe 'DamerauLevenshteinMod' do
|
|
|
7
7
|
dl = Taxamatch::DamerauLevenshteinMod.new
|
|
8
8
|
if y
|
|
9
9
|
res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
|
|
10
|
-
|
|
10
|
+
puts y if res != y[4].to_i
|
|
11
11
|
res.should == y[4].to_i
|
|
12
12
|
end
|
|
13
13
|
end
|
|
@@ -18,17 +18,17 @@ describe 'Atomizer' do
|
|
|
18
18
|
before(:all) do
|
|
19
19
|
@parser = Taxamatch::Atomizer.new
|
|
20
20
|
end
|
|
21
|
-
|
|
21
|
+
|
|
22
22
|
it 'should parse uninomials' do
|
|
23
23
|
@parser.parse('Betula').should == {:all_authors=>[], :all_years=>[], :uninomial=>{:string=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[], :normalized_authors=>[]}}
|
|
24
24
|
@parser.parse('Ærenea Lacordaire, 1872').should == {:all_authors=>["LACORDAIRE"], :all_years=>["1872"], :uninomial=>{:string=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA", :authors=>["Lacordaire"], :years=>["1872"], :normalized_authors=>["LACORDAIRE"]}}
|
|
25
25
|
end
|
|
26
|
-
|
|
26
|
+
|
|
27
27
|
it 'should parse binomials' do
|
|
28
28
|
@parser.parse('Leœptura laetifica Dow, 1913').should == {:all_authors=>["DOW"], :all_years=>["1913"], :genus=>{:string=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :authors=>["Dow"], :years=>["1913"], :normalized_authors=>["DOW"]}}
|
|
29
29
|
end
|
|
30
|
-
|
|
31
|
-
it 'should parse trinomials' do
|
|
30
|
+
|
|
31
|
+
it 'should parse trinomials' do
|
|
32
32
|
@parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == {:all_authors=>["BANKER", "D HALL", "D E STUNTZ"], :all_years=>["1972"], :genus=>{:string=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>[], :years=>[], :normalized_authors=>[]}, :infraspecies=>[{:string=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>["1972"], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]}
|
|
33
33
|
end
|
|
34
34
|
end
|
|
@@ -42,7 +42,7 @@ describe 'Taxamatch::Normalizer' do
|
|
|
42
42
|
Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
|
|
43
43
|
Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
|
|
44
44
|
end
|
|
45
|
-
|
|
45
|
+
|
|
46
46
|
it 'should normalize words' do
|
|
47
47
|
Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
|
|
48
48
|
end
|
|
@@ -52,25 +52,25 @@ describe 'Taxamatch::Base' do
|
|
|
52
52
|
before(:all) do
|
|
53
53
|
@tm = Taxamatch::Base.new
|
|
54
54
|
end
|
|
55
|
-
|
|
55
|
+
|
|
56
56
|
it 'should get txt tests' do
|
|
57
57
|
dl = Taxamatch::DamerauLevenshteinMod.new
|
|
58
58
|
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
|
|
59
59
|
if y
|
|
60
60
|
y[2] = y[2] == 'true' ? true : false
|
|
61
61
|
res = @tm.taxamatch(y[0], y[1], false)
|
|
62
|
-
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
|
62
|
+
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
|
63
63
|
res['match'].should == y[2]
|
|
64
64
|
res['edit_distance'].should == y[3].to_i
|
|
65
65
|
end
|
|
66
66
|
end
|
|
67
67
|
end
|
|
68
|
-
|
|
68
|
+
|
|
69
69
|
it 'should work with names that cannot be parsed' do
|
|
70
70
|
res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921','Quadraspidiotus ostreaeformis Curtis)')
|
|
71
71
|
res = false
|
|
72
72
|
end
|
|
73
|
-
|
|
73
|
+
|
|
74
74
|
it 'should compare genera' do
|
|
75
75
|
#edit distance 1 always match
|
|
76
76
|
g1 = make_taxamatch_hash 'Plantago'
|
|
@@ -138,17 +138,17 @@ describe 'Taxamatch::Base' do
|
|
|
138
138
|
#Should not match if Distance 2 or 3 and first 1 char is not the same
|
|
139
139
|
s1 = make_taxamatch_hash 'morrrr'
|
|
140
140
|
s2 = make_taxamatch_hash 'torraa'
|
|
141
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
|
141
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
|
142
142
|
#Distance 1 will match anywhere
|
|
143
143
|
s1 = make_taxamatch_hash 'major'
|
|
144
144
|
s2 = make_taxamatch_hash 'rajor'
|
|
145
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
|
|
145
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
|
|
146
146
|
#Will not match if distance 3 and length is less then twice of the edit distance
|
|
147
147
|
s1 = make_taxamatch_hash 'marrr'
|
|
148
148
|
s2 = make_taxamatch_hash 'maaaa'
|
|
149
149
|
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
|
150
150
|
end
|
|
151
|
-
|
|
151
|
+
|
|
152
152
|
it 'should match matches' do
|
|
153
153
|
#No trobule case
|
|
154
154
|
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
|
@@ -159,7 +159,7 @@ describe 'Taxamatch::Base' do
|
|
|
159
159
|
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
|
160
160
|
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
|
161
161
|
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
|
162
|
-
smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
|
162
|
+
smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
|
163
163
|
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
|
164
164
|
#Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
|
|
165
165
|
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
|
|
@@ -188,7 +188,7 @@ describe 'Taxamatch::Base' do
|
|
|
188
188
|
before(:all) do
|
|
189
189
|
@am = Taxamatch::Authmatch
|
|
190
190
|
end
|
|
191
|
-
|
|
191
|
+
|
|
192
192
|
it 'should calculate score' do
|
|
193
193
|
res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
|
|
194
194
|
res.should == 90
|
|
@@ -219,22 +219,22 @@ describe 'Taxamatch::Base' do
|
|
|
219
219
|
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
|
|
220
220
|
res.should == 0
|
|
221
221
|
end
|
|
222
|
-
|
|
222
|
+
|
|
223
223
|
it 'should compare years' do
|
|
224
224
|
@am.compare_years([1882],[1880]).should == 2
|
|
225
225
|
@am.compare_years([1882],[]).should == nil
|
|
226
226
|
@am.compare_years([],[]).should == 0
|
|
227
227
|
@am.compare_years([1788,1798], [1788,1798]).should be_nil
|
|
228
228
|
end
|
|
229
|
-
|
|
230
|
-
it 'should remove duplicate authors' do
|
|
229
|
+
|
|
230
|
+
it 'should remove duplicate authors' do
|
|
231
231
|
#Li submatches Linnaeus and it its size 3 is big enought to remove Linnaeus
|
|
232
232
|
#Muller is identical
|
|
233
233
|
res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
|
|
234
234
|
res.should == [[], []]
|
|
235
235
|
#same in different order
|
|
236
236
|
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
|
|
237
|
-
res.should == [[], []]
|
|
237
|
+
res.should == [[], []]
|
|
238
238
|
#auth Li submatches Linnaeus, but Li size less then 3 required to remove Linnaeus
|
|
239
239
|
res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
|
|
240
240
|
res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
|
|
@@ -252,7 +252,7 @@ describe 'Taxamatch::Base' do
|
|
|
252
252
|
# res = @am.fuzzy_match_authors('L', 'Muller')
|
|
253
253
|
# res.should be_false
|
|
254
254
|
end
|
|
255
|
-
|
|
255
|
+
|
|
256
256
|
end
|
|
257
257
|
|
|
258
258
|
end
|
metadata
CHANGED
|
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
|
4
4
|
prerelease: false
|
|
5
5
|
segments:
|
|
6
6
|
- 0
|
|
7
|
-
-
|
|
8
|
-
-
|
|
9
|
-
version: 0.
|
|
7
|
+
- 7
|
|
8
|
+
- 4
|
|
9
|
+
version: 0.7.4
|
|
10
10
|
platform: ruby
|
|
11
11
|
authors:
|
|
12
12
|
- Dmitry Mozzherin
|
|
@@ -14,13 +14,42 @@ autorequire:
|
|
|
14
14
|
bindir: bin
|
|
15
15
|
cert_chain: []
|
|
16
16
|
|
|
17
|
-
date:
|
|
17
|
+
date: 2011-06-23 00:00:00 -04:00
|
|
18
18
|
default_executable:
|
|
19
19
|
dependencies:
|
|
20
20
|
- !ruby/object:Gem::Dependency
|
|
21
|
-
name:
|
|
22
|
-
prerelease: false
|
|
21
|
+
name: biodiversity
|
|
23
22
|
requirement: &id001 !ruby/object:Gem::Requirement
|
|
23
|
+
none: false
|
|
24
|
+
requirements:
|
|
25
|
+
- - ~>
|
|
26
|
+
- !ruby/object:Gem::Version
|
|
27
|
+
segments:
|
|
28
|
+
- 0
|
|
29
|
+
- 5
|
|
30
|
+
- 13
|
|
31
|
+
version: 0.5.13
|
|
32
|
+
type: :runtime
|
|
33
|
+
prerelease: false
|
|
34
|
+
version_requirements: *id001
|
|
35
|
+
- !ruby/object:Gem::Dependency
|
|
36
|
+
name: biodiversity19
|
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
|
38
|
+
none: false
|
|
39
|
+
requirements:
|
|
40
|
+
- - ~>
|
|
41
|
+
- !ruby/object:Gem::Version
|
|
42
|
+
segments:
|
|
43
|
+
- 0
|
|
44
|
+
- 5
|
|
45
|
+
- 13
|
|
46
|
+
version: 0.5.13
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: *id002
|
|
50
|
+
- !ruby/object:Gem::Dependency
|
|
51
|
+
name: rake-compiler
|
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
|
24
53
|
none: false
|
|
25
54
|
requirements:
|
|
26
55
|
- - ">="
|
|
@@ -29,11 +58,134 @@ dependencies:
|
|
|
29
58
|
- 0
|
|
30
59
|
version: "0"
|
|
31
60
|
type: :runtime
|
|
32
|
-
|
|
61
|
+
prerelease: false
|
|
62
|
+
version_requirements: *id003
|
|
33
63
|
- !ruby/object:Gem::Dependency
|
|
34
|
-
name:
|
|
64
|
+
name: rspec
|
|
65
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
|
66
|
+
none: false
|
|
67
|
+
requirements:
|
|
68
|
+
- - ~>
|
|
69
|
+
- !ruby/object:Gem::Version
|
|
70
|
+
segments:
|
|
71
|
+
- 2
|
|
72
|
+
- 3
|
|
73
|
+
- 0
|
|
74
|
+
version: 2.3.0
|
|
75
|
+
type: :development
|
|
35
76
|
prerelease: false
|
|
36
|
-
|
|
77
|
+
version_requirements: *id004
|
|
78
|
+
- !ruby/object:Gem::Dependency
|
|
79
|
+
name: cucumber
|
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
|
81
|
+
none: false
|
|
82
|
+
requirements:
|
|
83
|
+
- - ">="
|
|
84
|
+
- !ruby/object:Gem::Version
|
|
85
|
+
segments:
|
|
86
|
+
- 0
|
|
87
|
+
version: "0"
|
|
88
|
+
type: :development
|
|
89
|
+
prerelease: false
|
|
90
|
+
version_requirements: *id005
|
|
91
|
+
- !ruby/object:Gem::Dependency
|
|
92
|
+
name: bundler
|
|
93
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
|
94
|
+
none: false
|
|
95
|
+
requirements:
|
|
96
|
+
- - ~>
|
|
97
|
+
- !ruby/object:Gem::Version
|
|
98
|
+
segments:
|
|
99
|
+
- 1
|
|
100
|
+
- 0
|
|
101
|
+
- 0
|
|
102
|
+
version: 1.0.0
|
|
103
|
+
type: :development
|
|
104
|
+
prerelease: false
|
|
105
|
+
version_requirements: *id006
|
|
106
|
+
- !ruby/object:Gem::Dependency
|
|
107
|
+
name: jeweler
|
|
108
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
|
109
|
+
none: false
|
|
110
|
+
requirements:
|
|
111
|
+
- - ~>
|
|
112
|
+
- !ruby/object:Gem::Version
|
|
113
|
+
segments:
|
|
114
|
+
- 1
|
|
115
|
+
- 6
|
|
116
|
+
- 0
|
|
117
|
+
version: 1.6.0
|
|
118
|
+
type: :development
|
|
119
|
+
prerelease: false
|
|
120
|
+
version_requirements: *id007
|
|
121
|
+
- !ruby/object:Gem::Dependency
|
|
122
|
+
name: rcov
|
|
123
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
|
124
|
+
none: false
|
|
125
|
+
requirements:
|
|
126
|
+
- - ">="
|
|
127
|
+
- !ruby/object:Gem::Version
|
|
128
|
+
segments:
|
|
129
|
+
- 0
|
|
130
|
+
version: "0"
|
|
131
|
+
type: :development
|
|
132
|
+
prerelease: false
|
|
133
|
+
version_requirements: *id008
|
|
134
|
+
- !ruby/object:Gem::Dependency
|
|
135
|
+
name: ruby-debug19
|
|
136
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
|
137
|
+
none: false
|
|
138
|
+
requirements:
|
|
139
|
+
- - ">="
|
|
140
|
+
- !ruby/object:Gem::Version
|
|
141
|
+
segments:
|
|
142
|
+
- 0
|
|
143
|
+
version: "0"
|
|
144
|
+
type: :development
|
|
145
|
+
prerelease: false
|
|
146
|
+
version_requirements: *id009
|
|
147
|
+
- !ruby/object:Gem::Dependency
|
|
148
|
+
name: ruby-prof
|
|
149
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
|
150
|
+
none: false
|
|
151
|
+
requirements:
|
|
152
|
+
- - ">="
|
|
153
|
+
- !ruby/object:Gem::Version
|
|
154
|
+
segments:
|
|
155
|
+
- 0
|
|
156
|
+
version: "0"
|
|
157
|
+
type: :development
|
|
158
|
+
prerelease: false
|
|
159
|
+
version_requirements: *id010
|
|
160
|
+
- !ruby/object:Gem::Dependency
|
|
161
|
+
name: shoulda
|
|
162
|
+
requirement: &id011 !ruby/object:Gem::Requirement
|
|
163
|
+
none: false
|
|
164
|
+
requirements:
|
|
165
|
+
- - ">="
|
|
166
|
+
- !ruby/object:Gem::Version
|
|
167
|
+
segments:
|
|
168
|
+
- 0
|
|
169
|
+
version: "0"
|
|
170
|
+
type: :development
|
|
171
|
+
prerelease: false
|
|
172
|
+
version_requirements: *id011
|
|
173
|
+
- !ruby/object:Gem::Dependency
|
|
174
|
+
name: mocha
|
|
175
|
+
requirement: &id012 !ruby/object:Gem::Requirement
|
|
176
|
+
none: false
|
|
177
|
+
requirements:
|
|
178
|
+
- - ">="
|
|
179
|
+
- !ruby/object:Gem::Version
|
|
180
|
+
segments:
|
|
181
|
+
- 0
|
|
182
|
+
version: "0"
|
|
183
|
+
type: :development
|
|
184
|
+
prerelease: false
|
|
185
|
+
version_requirements: *id012
|
|
186
|
+
- !ruby/object:Gem::Dependency
|
|
187
|
+
name: biodiversity
|
|
188
|
+
requirement: &id013 !ruby/object:Gem::Requirement
|
|
37
189
|
none: false
|
|
38
190
|
requirements:
|
|
39
191
|
- - ">="
|
|
@@ -44,21 +196,37 @@ dependencies:
|
|
|
44
196
|
- 13
|
|
45
197
|
version: 0.5.13
|
|
46
198
|
type: :runtime
|
|
47
|
-
|
|
199
|
+
prerelease: false
|
|
200
|
+
version_requirements: *id013
|
|
201
|
+
- !ruby/object:Gem::Dependency
|
|
202
|
+
name: rake-compiler
|
|
203
|
+
requirement: &id014 !ruby/object:Gem::Requirement
|
|
204
|
+
none: false
|
|
205
|
+
requirements:
|
|
206
|
+
- - ">="
|
|
207
|
+
- !ruby/object:Gem::Version
|
|
208
|
+
segments:
|
|
209
|
+
- 0
|
|
210
|
+
version: "0"
|
|
211
|
+
type: :runtime
|
|
212
|
+
prerelease: false
|
|
213
|
+
version_requirements: *id014
|
|
48
214
|
description: This gem implements algorithm for fuzzy matching scientific names developed by Tony Rees
|
|
49
215
|
email: dmozzherin@eol.org
|
|
50
216
|
executables: []
|
|
51
217
|
|
|
52
|
-
extensions:
|
|
53
|
-
|
|
218
|
+
extensions:
|
|
219
|
+
- ext/damerau_levenshtein/extconf.rb
|
|
54
220
|
extra_rdoc_files:
|
|
55
221
|
- LICENSE
|
|
56
222
|
- README.rdoc
|
|
57
223
|
files:
|
|
224
|
+
- Gemfile.lock
|
|
58
225
|
- README.rdoc
|
|
59
226
|
- lib/taxamatch_rb.rb
|
|
60
227
|
- lib/taxamatch_rb/atomizer.rb
|
|
61
228
|
- lib/taxamatch_rb/authmatch.rb
|
|
229
|
+
- lib/taxamatch_rb/damerau_levenshtein.bundle
|
|
62
230
|
- lib/taxamatch_rb/damerau_levenshtein_mod.rb
|
|
63
231
|
- lib/taxamatch_rb/normalizer.rb
|
|
64
232
|
- lib/taxamatch_rb/phonetizer.rb
|
|
@@ -68,13 +236,14 @@ files:
|
|
|
68
236
|
- spec/taxamatch_rb_spec.rb
|
|
69
237
|
- spec/taxamatch_test.txt
|
|
70
238
|
- LICENSE
|
|
239
|
+
- ext/damerau_levenshtein/extconf.rb
|
|
71
240
|
has_rdoc: true
|
|
72
241
|
homepage: http://github.com/GlobalNamesArchitecture/taxamatch_rb
|
|
73
242
|
licenses: []
|
|
74
243
|
|
|
75
244
|
post_install_message:
|
|
76
|
-
rdoc_options:
|
|
77
|
-
|
|
245
|
+
rdoc_options: []
|
|
246
|
+
|
|
78
247
|
require_paths:
|
|
79
248
|
- lib
|
|
80
249
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
@@ -82,6 +251,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
82
251
|
requirements:
|
|
83
252
|
- - ">="
|
|
84
253
|
- !ruby/object:Gem::Version
|
|
254
|
+
hash: -2865757795593253659
|
|
85
255
|
segments:
|
|
86
256
|
- 0
|
|
87
257
|
version: "0"
|
|
@@ -100,6 +270,5 @@ rubygems_version: 1.3.7
|
|
|
100
270
|
signing_key:
|
|
101
271
|
specification_version: 3
|
|
102
272
|
summary: Implementation of Tony Rees Taxamatch algorithms
|
|
103
|
-
test_files:
|
|
104
|
-
|
|
105
|
-
- spec/taxamatch_rb_spec.rb
|
|
273
|
+
test_files: []
|
|
274
|
+
|