taxamatch_rb 0.6.5 → 0.7.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +75 -0
- data/README.rdoc +1 -5
- data/ext/damerau_levenshtein/extconf.rb +11 -0
- data/lib/taxamatch_rb.rb +19 -19
- data/lib/taxamatch_rb/damerau_levenshtein.bundle +0 -0
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +10 -107
- data/spec/damerau_levenshtein_mod_test.txt +1 -1
- data/spec/spec_helper.rb +1 -7
- data/spec/taxamatch_rb_spec.rb +21 -21
- metadata +186 -17
data/Gemfile.lock
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
archive-tar-minitar (0.5.2)
|
5
|
+
biodiversity (0.5.16)
|
6
|
+
json
|
7
|
+
treetop
|
8
|
+
biodiversity19 (0.5.16)
|
9
|
+
json
|
10
|
+
treetop
|
11
|
+
builder (3.0.0)
|
12
|
+
columnize (0.3.3)
|
13
|
+
cucumber (1.0.0)
|
14
|
+
builder (>= 2.1.2)
|
15
|
+
diff-lcs (>= 1.1.2)
|
16
|
+
gherkin (~> 2.4.1)
|
17
|
+
json (>= 1.4.6)
|
18
|
+
term-ansicolor (>= 1.0.5)
|
19
|
+
diff-lcs (1.1.2)
|
20
|
+
gherkin (2.4.1)
|
21
|
+
json (>= 1.4.6)
|
22
|
+
git (1.2.5)
|
23
|
+
jeweler (1.6.2)
|
24
|
+
bundler (~> 1.0)
|
25
|
+
git (>= 1.2.5)
|
26
|
+
rake
|
27
|
+
json (1.5.3)
|
28
|
+
linecache19 (0.5.12)
|
29
|
+
ruby_core_source (>= 0.1.4)
|
30
|
+
mocha (0.9.12)
|
31
|
+
polyglot (0.3.1)
|
32
|
+
rake (0.9.2)
|
33
|
+
rake-compiler (0.7.9)
|
34
|
+
rake
|
35
|
+
rcov (0.9.9)
|
36
|
+
rspec (2.3.0)
|
37
|
+
rspec-core (~> 2.3.0)
|
38
|
+
rspec-expectations (~> 2.3.0)
|
39
|
+
rspec-mocks (~> 2.3.0)
|
40
|
+
rspec-core (2.3.1)
|
41
|
+
rspec-expectations (2.3.0)
|
42
|
+
diff-lcs (~> 1.1.2)
|
43
|
+
rspec-mocks (2.3.0)
|
44
|
+
ruby-debug-base19 (0.11.25)
|
45
|
+
columnize (>= 0.3.1)
|
46
|
+
linecache19 (>= 0.5.11)
|
47
|
+
ruby_core_source (>= 0.1.4)
|
48
|
+
ruby-debug19 (0.11.6)
|
49
|
+
columnize (>= 0.3.1)
|
50
|
+
linecache19 (>= 0.5.11)
|
51
|
+
ruby-debug-base19 (>= 0.11.19)
|
52
|
+
ruby-prof (0.10.7)
|
53
|
+
ruby_core_source (0.1.5)
|
54
|
+
archive-tar-minitar (>= 0.5.2)
|
55
|
+
shoulda (2.11.3)
|
56
|
+
term-ansicolor (1.0.5)
|
57
|
+
treetop (1.4.9)
|
58
|
+
polyglot (>= 0.3.1)
|
59
|
+
|
60
|
+
PLATFORMS
|
61
|
+
ruby
|
62
|
+
|
63
|
+
DEPENDENCIES
|
64
|
+
biodiversity (~> 0.5.13)
|
65
|
+
biodiversity19 (~> 0.5.13)
|
66
|
+
bundler (~> 1.0.0)
|
67
|
+
cucumber
|
68
|
+
jeweler (~> 1.6.0)
|
69
|
+
mocha
|
70
|
+
rake-compiler
|
71
|
+
rcov
|
72
|
+
rspec (~> 2.3.0)
|
73
|
+
ruby-debug19
|
74
|
+
ruby-prof
|
75
|
+
shoulda
|
data/README.rdoc
CHANGED
@@ -14,11 +14,7 @@ Taxamatch_Rb is compatible with ruby versions 1.8.7 and 1.9.1 and higher
|
|
14
14
|
|
15
15
|
== Installation
|
16
16
|
|
17
|
-
sudo gem install
|
18
|
-
|
19
|
-
or
|
20
|
-
sudo gem sources -a http://gems.github.com #(you only have to do this once)
|
21
|
-
sudo gem install dimus-taxamatch_rb
|
17
|
+
sudo gem install taxamatch_rb
|
22
18
|
|
23
19
|
== Usage
|
24
20
|
|
data/lib/taxamatch_rb.rb
CHANGED
@@ -8,38 +8,38 @@ require 'taxamatch_rb/normalizer'
|
|
8
8
|
require 'taxamatch_rb/phonetizer'
|
9
9
|
require 'taxamatch_rb/authmatch'
|
10
10
|
|
11
|
-
$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
|
11
|
+
$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
|
12
12
|
|
13
13
|
module Taxamatch
|
14
14
|
|
15
15
|
class Base
|
16
|
-
|
16
|
+
|
17
17
|
def initialize
|
18
18
|
@parser = Taxamatch::Atomizer.new
|
19
19
|
@dlm = Taxamatch::DamerauLevenshteinMod.new
|
20
20
|
end
|
21
|
-
|
22
|
-
|
21
|
+
|
22
|
+
|
23
23
|
#takes two scientific names and returns true if names match and false if they don't
|
24
|
-
def taxamatch(str1, str2, return_boolean = true)
|
24
|
+
def taxamatch(str1, str2, return_boolean = true)
|
25
25
|
preparsed_1 = @parser.parse(str1)
|
26
26
|
preparsed_2 = @parser.parse(str2)
|
27
27
|
match = taxamatch_preparsed(preparsed_1, preparsed_2) rescue nil
|
28
28
|
return_boolean ? (!!match && match['match']) : match
|
29
29
|
end
|
30
|
-
|
31
|
-
#takes two hashes of parsed scientific names, analyses them and returns back
|
30
|
+
|
31
|
+
#takes two hashes of parsed scientific names, analyses them and returns back
|
32
32
|
#this function is useful when species strings are preparsed.
|
33
33
|
def taxamatch_preparsed(preparsed_1, preparsed_2)
|
34
34
|
result = nil
|
35
|
-
result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
35
|
+
result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
36
36
|
result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
|
37
37
|
if result && result['match']
|
38
|
-
result['match'] = match_authors(preparsed_1, preparsed_2) == 0 ? false : true
|
38
|
+
result['match'] = match_authors(preparsed_1, preparsed_2) == 0 ? false : true
|
39
39
|
end
|
40
40
|
return result
|
41
41
|
end
|
42
|
-
|
42
|
+
|
43
43
|
def match_uninomial(preparsed_1, preparsed_2)
|
44
44
|
match_genera(preparsed_1[:uninomial], preparsed_2[:uninomial])
|
45
45
|
end
|
@@ -54,14 +54,14 @@ module Taxamatch
|
|
54
54
|
match_hash = match_matches(gen_match, sp_match, infrasp_match)
|
55
55
|
elsif (preparsed_1[:infraspecies] && !preparsed_2[:infraspecies]) || (!preparsed_1[:infraspecies] && preparsed_2[:infraspecies])
|
56
56
|
match_hash = { 'match' => false, 'edit_distance' => 5, 'phonetic_match' => false }
|
57
|
-
total_length += preparsed_1[:infraspecies] ? preparsed_1[:infraspecies][0][:string].size : preparsed_2[:infraspecies][0][:string].size
|
57
|
+
total_length += preparsed_1[:infraspecies] ? preparsed_1[:infraspecies][0][:string].size : preparsed_2[:infraspecies][0][:string].size
|
58
58
|
else
|
59
59
|
match_hash = match_matches(gen_match, sp_match)
|
60
60
|
end
|
61
61
|
match_hash.merge({'score' => (1 - match_hash['edit_distance']/(total_length/2))})
|
62
62
|
match_hash
|
63
63
|
end
|
64
|
-
|
64
|
+
|
65
65
|
def match_genera(genus1, genus2)
|
66
66
|
genus1_length = genus1[:normalized].size
|
67
67
|
genus2_length = genus2[:normalized].size
|
@@ -69,10 +69,10 @@ module Taxamatch
|
|
69
69
|
match = false
|
70
70
|
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],1,3) #TODO put block = 2
|
71
71
|
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.2
|
72
|
-
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
73
|
-
|
72
|
+
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
73
|
+
|
74
74
|
match = true if ed <= 3 && (min_length > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
|
75
|
-
{'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
75
|
+
{'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
76
76
|
end
|
77
77
|
|
78
78
|
def match_species(sp1, sp2)
|
@@ -86,11 +86,11 @@ module Taxamatch
|
|
86
86
|
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.3334
|
87
87
|
#puts 's: %s, %s, %s' % [sp1[:normalized], sp2[:normalized], ed]
|
88
88
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
|
89
|
-
|
89
|
+
|
90
90
|
match = true if ed <= 4 && (min_length >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
|
91
91
|
{ 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
def match_authors(preparsed_1, preparsed_2)
|
95
95
|
au1 = preparsed_1[:all_authors]
|
96
96
|
au2 = preparsed_2[:all_authors]
|
@@ -98,8 +98,8 @@ module Taxamatch
|
|
98
98
|
yr2 = preparsed_2[:all_years]
|
99
99
|
Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
|
100
100
|
end
|
101
|
-
|
102
|
-
def match_matches(genus_match, species_match, infraspecies_match = nil)
|
101
|
+
|
102
|
+
def match_matches(genus_match, species_match, infraspecies_match = nil)
|
103
103
|
match = species_match
|
104
104
|
if infraspecies_match
|
105
105
|
match['edit_distance'] += infraspecies_match['edit_distance']
|
Binary file
|
@@ -1,122 +1,25 @@
|
|
1
1
|
# encoding: UTF-8
|
2
|
-
|
3
|
-
require '
|
4
|
-
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'damerau_levenshtein')
|
4
|
+
|
5
5
|
module Taxamatch
|
6
6
|
|
7
7
|
class DamerauLevenshteinMod
|
8
|
+
include DamerauLevenshtein
|
9
|
+
|
8
10
|
def distance(str1, str2, block_size=2, max_distance=10)
|
9
|
-
# puts str1.unpack("U*");
|
10
11
|
distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
|
11
12
|
end
|
12
|
-
|
13
|
-
inline do |builder|
|
14
|
-
builder.c "
|
15
|
-
static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
|
16
|
-
int i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block;
|
17
|
-
int stop_execution = 0;
|
18
|
-
int min = 0;
|
19
|
-
int current_distance = 0;
|
20
|
-
|
21
|
-
VALUE *sv = RARRAY_PTR(_s);
|
22
|
-
VALUE *tv = RARRAY_PTR(_t);
|
23
|
-
|
24
|
-
sl = RARRAY_LEN(_s);
|
25
|
-
tl = RARRAY_LEN(_t);
|
26
|
-
|
27
|
-
if (sl == 0) return INT2NUM(tl);
|
28
|
-
if (tl == 0) return INT2NUM(sl);
|
29
|
-
//case of lengths 1 must present or it will break further in the code
|
30
|
-
if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);
|
31
|
-
|
32
|
-
int s[sl];
|
33
|
-
int t[tl];
|
34
|
-
|
35
|
-
for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
|
36
|
-
for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);
|
37
|
-
|
38
|
-
sl++;
|
39
|
-
tl++;
|
40
|
-
|
41
|
-
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
|
42
|
-
d = malloc((sizeof(int))*(sl)*(tl));
|
43
|
-
//populate 'vertical' row starting from the 2nd position (first one is filled already)
|
44
|
-
for(i = 0; i < tl; i++){
|
45
|
-
d[i*sl] = i;
|
46
|
-
}
|
47
|
-
|
48
|
-
//fill up array with scores
|
49
|
-
for(i = 1; i<sl; i++){
|
50
|
-
d[i] = i;
|
51
|
-
if (stop_execution == 1) break;
|
52
|
-
current_distance = 10000;
|
53
|
-
for(j = 1; j<tl; j++){
|
54
|
-
|
55
|
-
cost = 1;
|
56
|
-
if(s[i-1] == t[j-1]) cost = 0;
|
57
|
-
|
58
|
-
half_sl = (sl - 1)/2;
|
59
|
-
half_tl = (tl - 1)/2;
|
60
|
-
|
61
|
-
block = block_size < half_sl ? block_size : half_sl;
|
62
|
-
block = block < half_tl ? block : half_tl;
|
63
|
-
|
64
|
-
while (block >= 1){
|
65
|
-
int swap1 = 1;
|
66
|
-
int swap2 = 1;
|
67
|
-
i1 = i - (block * 2);
|
68
|
-
j1 = j - (block * 2);
|
69
|
-
for (k = i1; k < i1 + block; k++) {
|
70
|
-
if (s[k] != t[k + block]){
|
71
|
-
swap1 = 0;
|
72
|
-
break;
|
73
|
-
}
|
74
|
-
}
|
75
|
-
for (k = j1; k < j1 + block; k++) {
|
76
|
-
if (t[k] != s[k + block]){
|
77
|
-
swap2 = 0;
|
78
|
-
break;
|
79
|
-
}
|
80
|
-
}
|
81
|
-
|
82
|
-
del = d[j*sl + i - 1] + 1;
|
83
|
-
ins = d[(j-1)*sl + i] + 1;
|
84
|
-
min = del;
|
85
|
-
if (ins < min) min = ins;
|
86
|
-
//if (i == 2 && j==2) return INT2NUM(swap2+5);
|
87
|
-
if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
|
88
|
-
transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
|
89
|
-
if (transp < min) min = transp;
|
90
|
-
block = 0;
|
91
|
-
} else if (block == 1) {
|
92
|
-
subs = d[(j-1)*sl + i - 1] + cost;
|
93
|
-
if (subs < min) min = subs;
|
94
|
-
}
|
95
|
-
block--;
|
96
|
-
}
|
97
|
-
d[j*sl+i]=min;
|
98
|
-
if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
|
99
|
-
}
|
100
|
-
if (current_distance > max_distance) {
|
101
|
-
stop_execution = 1;
|
102
|
-
}
|
103
|
-
}
|
104
|
-
distance=d[sl * tl - 1];
|
105
|
-
if (stop_execution == 1) distance = current_distance;
|
106
|
-
|
107
|
-
free(d);
|
108
|
-
return INT2NUM(distance);
|
109
|
-
}
|
110
|
-
"
|
111
|
-
end
|
112
13
|
end
|
14
|
+
|
113
15
|
end
|
114
16
|
|
115
17
|
if __FILE__ == $0
|
116
|
-
|
18
|
+
|
19
|
+
a = Taxamatch::DamerauLevenshteinMod.new
|
117
20
|
s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
|
118
21
|
t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')
|
119
|
-
|
22
|
+
|
120
23
|
#puts s.join(",")
|
121
24
|
#puts t.join(",")
|
122
25
|
|
@@ -133,7 +36,7 @@ if __FILE__ == $0
|
|
133
36
|
puts 'utf time: ' + (Time.now - start).to_s + ' sec'
|
134
37
|
|
135
38
|
#puts a.distance('Cedarinia scabra Sjöstedt 1921','Cedarinia scabra Söjstedt 1921')
|
136
|
-
#puts a.distance_utf(s, t, 2, 10)
|
39
|
+
#puts a.distance_utf(s, t, 2, 10)
|
137
40
|
#puts a.distance('tar','atp',1,10);
|
138
41
|
puts a.distance('sub', 'usb', 1, 10);
|
139
42
|
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# encoding: UTF-8
|
2
|
-
require
|
2
|
+
require 'spec_helper'
|
3
3
|
|
4
4
|
describe 'DamerauLevenshteinMod' do
|
5
5
|
it 'should get tests' do
|
@@ -7,7 +7,7 @@ describe 'DamerauLevenshteinMod' do
|
|
7
7
|
dl = Taxamatch::DamerauLevenshteinMod.new
|
8
8
|
if y
|
9
9
|
res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
|
10
|
-
|
10
|
+
puts y if res != y[4].to_i
|
11
11
|
res.should == y[4].to_i
|
12
12
|
end
|
13
13
|
end
|
@@ -18,17 +18,17 @@ describe 'Atomizer' do
|
|
18
18
|
before(:all) do
|
19
19
|
@parser = Taxamatch::Atomizer.new
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
it 'should parse uninomials' do
|
23
23
|
@parser.parse('Betula').should == {:all_authors=>[], :all_years=>[], :uninomial=>{:string=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[], :normalized_authors=>[]}}
|
24
24
|
@parser.parse('Ærenea Lacordaire, 1872').should == {:all_authors=>["LACORDAIRE"], :all_years=>["1872"], :uninomial=>{:string=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA", :authors=>["Lacordaire"], :years=>["1872"], :normalized_authors=>["LACORDAIRE"]}}
|
25
25
|
end
|
26
|
-
|
26
|
+
|
27
27
|
it 'should parse binomials' do
|
28
28
|
@parser.parse('Leœptura laetifica Dow, 1913').should == {:all_authors=>["DOW"], :all_years=>["1913"], :genus=>{:string=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :authors=>["Dow"], :years=>["1913"], :normalized_authors=>["DOW"]}}
|
29
29
|
end
|
30
|
-
|
31
|
-
it 'should parse trinomials' do
|
30
|
+
|
31
|
+
it 'should parse trinomials' do
|
32
32
|
@parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == {:all_authors=>["BANKER", "D HALL", "D E STUNTZ"], :all_years=>["1972"], :genus=>{:string=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>[], :years=>[], :normalized_authors=>[]}, :infraspecies=>[{:string=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>["1972"], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]}
|
33
33
|
end
|
34
34
|
end
|
@@ -42,7 +42,7 @@ describe 'Taxamatch::Normalizer' do
|
|
42
42
|
Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
|
43
43
|
Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
46
|
it 'should normalize words' do
|
47
47
|
Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
|
48
48
|
end
|
@@ -52,25 +52,25 @@ describe 'Taxamatch::Base' do
|
|
52
52
|
before(:all) do
|
53
53
|
@tm = Taxamatch::Base.new
|
54
54
|
end
|
55
|
-
|
55
|
+
|
56
56
|
it 'should get txt tests' do
|
57
57
|
dl = Taxamatch::DamerauLevenshteinMod.new
|
58
58
|
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
|
59
59
|
if y
|
60
60
|
y[2] = y[2] == 'true' ? true : false
|
61
61
|
res = @tm.taxamatch(y[0], y[1], false)
|
62
|
-
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
62
|
+
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
63
63
|
res['match'].should == y[2]
|
64
64
|
res['edit_distance'].should == y[3].to_i
|
65
65
|
end
|
66
66
|
end
|
67
67
|
end
|
68
|
-
|
68
|
+
|
69
69
|
it 'should work with names that cannot be parsed' do
|
70
70
|
res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921','Quadraspidiotus ostreaeformis Curtis)')
|
71
71
|
res = false
|
72
72
|
end
|
73
|
-
|
73
|
+
|
74
74
|
it 'should compare genera' do
|
75
75
|
#edit distance 1 always match
|
76
76
|
g1 = make_taxamatch_hash 'Plantago'
|
@@ -138,17 +138,17 @@ describe 'Taxamatch::Base' do
|
|
138
138
|
#Should not match if Distance 2 or 3 and first 1 char is not the same
|
139
139
|
s1 = make_taxamatch_hash 'morrrr'
|
140
140
|
s2 = make_taxamatch_hash 'torraa'
|
141
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
141
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
142
142
|
#Distance 1 will match anywhere
|
143
143
|
s1 = make_taxamatch_hash 'major'
|
144
144
|
s2 = make_taxamatch_hash 'rajor'
|
145
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
|
145
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
|
146
146
|
#Will not match if distance 3 and length is less then twice of the edit distance
|
147
147
|
s1 = make_taxamatch_hash 'marrr'
|
148
148
|
s2 = make_taxamatch_hash 'maaaa'
|
149
149
|
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
150
150
|
end
|
151
|
-
|
151
|
+
|
152
152
|
it 'should match matches' do
|
153
153
|
#No trobule case
|
154
154
|
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
@@ -159,7 +159,7 @@ describe 'Taxamatch::Base' do
|
|
159
159
|
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
160
160
|
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
161
161
|
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
162
|
-
smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
162
|
+
smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
163
163
|
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
164
164
|
#Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
|
165
165
|
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
|
@@ -188,7 +188,7 @@ describe 'Taxamatch::Base' do
|
|
188
188
|
before(:all) do
|
189
189
|
@am = Taxamatch::Authmatch
|
190
190
|
end
|
191
|
-
|
191
|
+
|
192
192
|
it 'should calculate score' do
|
193
193
|
res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
|
194
194
|
res.should == 90
|
@@ -219,22 +219,22 @@ describe 'Taxamatch::Base' do
|
|
219
219
|
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
|
220
220
|
res.should == 0
|
221
221
|
end
|
222
|
-
|
222
|
+
|
223
223
|
it 'should compare years' do
|
224
224
|
@am.compare_years([1882],[1880]).should == 2
|
225
225
|
@am.compare_years([1882],[]).should == nil
|
226
226
|
@am.compare_years([],[]).should == 0
|
227
227
|
@am.compare_years([1788,1798], [1788,1798]).should be_nil
|
228
228
|
end
|
229
|
-
|
230
|
-
it 'should remove duplicate authors' do
|
229
|
+
|
230
|
+
it 'should remove duplicate authors' do
|
231
231
|
#Li submatches Linnaeus and it its size 3 is big enought to remove Linnaeus
|
232
232
|
#Muller is identical
|
233
233
|
res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
|
234
234
|
res.should == [[], []]
|
235
235
|
#same in different order
|
236
236
|
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
|
237
|
-
res.should == [[], []]
|
237
|
+
res.should == [[], []]
|
238
238
|
#auth Li submatches Linnaeus, but Li size less then 3 required to remove Linnaeus
|
239
239
|
res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
|
240
240
|
res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
|
@@ -252,7 +252,7 @@ describe 'Taxamatch::Base' do
|
|
252
252
|
# res = @am.fuzzy_match_authors('L', 'Muller')
|
253
253
|
# res.should be_false
|
254
254
|
end
|
255
|
-
|
255
|
+
|
256
256
|
end
|
257
257
|
|
258
258
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 7
|
8
|
+
- 4
|
9
|
+
version: 0.7.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Dmitry Mozzherin
|
@@ -14,13 +14,42 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date:
|
17
|
+
date: 2011-06-23 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
|
-
name:
|
22
|
-
prerelease: false
|
21
|
+
name: biodiversity
|
23
22
|
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ~>
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
- 5
|
30
|
+
- 13
|
31
|
+
version: 0.5.13
|
32
|
+
type: :runtime
|
33
|
+
prerelease: false
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: biodiversity19
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ~>
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
segments:
|
43
|
+
- 0
|
44
|
+
- 5
|
45
|
+
- 13
|
46
|
+
version: 0.5.13
|
47
|
+
type: :runtime
|
48
|
+
prerelease: false
|
49
|
+
version_requirements: *id002
|
50
|
+
- !ruby/object:Gem::Dependency
|
51
|
+
name: rake-compiler
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
24
53
|
none: false
|
25
54
|
requirements:
|
26
55
|
- - ">="
|
@@ -29,11 +58,134 @@ dependencies:
|
|
29
58
|
- 0
|
30
59
|
version: "0"
|
31
60
|
type: :runtime
|
32
|
-
|
61
|
+
prerelease: false
|
62
|
+
version_requirements: *id003
|
33
63
|
- !ruby/object:Gem::Dependency
|
34
|
-
name:
|
64
|
+
name: rspec
|
65
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ~>
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
segments:
|
71
|
+
- 2
|
72
|
+
- 3
|
73
|
+
- 0
|
74
|
+
version: 2.3.0
|
75
|
+
type: :development
|
35
76
|
prerelease: false
|
36
|
-
|
77
|
+
version_requirements: *id004
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: cucumber
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
segments:
|
86
|
+
- 0
|
87
|
+
version: "0"
|
88
|
+
type: :development
|
89
|
+
prerelease: false
|
90
|
+
version_requirements: *id005
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: bundler
|
93
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ~>
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
segments:
|
99
|
+
- 1
|
100
|
+
- 0
|
101
|
+
- 0
|
102
|
+
version: 1.0.0
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: *id006
|
106
|
+
- !ruby/object:Gem::Dependency
|
107
|
+
name: jeweler
|
108
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
110
|
+
requirements:
|
111
|
+
- - ~>
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
segments:
|
114
|
+
- 1
|
115
|
+
- 6
|
116
|
+
- 0
|
117
|
+
version: 1.6.0
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: *id007
|
121
|
+
- !ruby/object:Gem::Dependency
|
122
|
+
name: rcov
|
123
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
124
|
+
none: false
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
segments:
|
129
|
+
- 0
|
130
|
+
version: "0"
|
131
|
+
type: :development
|
132
|
+
prerelease: false
|
133
|
+
version_requirements: *id008
|
134
|
+
- !ruby/object:Gem::Dependency
|
135
|
+
name: ruby-debug19
|
136
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ">="
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
segments:
|
142
|
+
- 0
|
143
|
+
version: "0"
|
144
|
+
type: :development
|
145
|
+
prerelease: false
|
146
|
+
version_requirements: *id009
|
147
|
+
- !ruby/object:Gem::Dependency
|
148
|
+
name: ruby-prof
|
149
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
150
|
+
none: false
|
151
|
+
requirements:
|
152
|
+
- - ">="
|
153
|
+
- !ruby/object:Gem::Version
|
154
|
+
segments:
|
155
|
+
- 0
|
156
|
+
version: "0"
|
157
|
+
type: :development
|
158
|
+
prerelease: false
|
159
|
+
version_requirements: *id010
|
160
|
+
- !ruby/object:Gem::Dependency
|
161
|
+
name: shoulda
|
162
|
+
requirement: &id011 !ruby/object:Gem::Requirement
|
163
|
+
none: false
|
164
|
+
requirements:
|
165
|
+
- - ">="
|
166
|
+
- !ruby/object:Gem::Version
|
167
|
+
segments:
|
168
|
+
- 0
|
169
|
+
version: "0"
|
170
|
+
type: :development
|
171
|
+
prerelease: false
|
172
|
+
version_requirements: *id011
|
173
|
+
- !ruby/object:Gem::Dependency
|
174
|
+
name: mocha
|
175
|
+
requirement: &id012 !ruby/object:Gem::Requirement
|
176
|
+
none: false
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
segments:
|
181
|
+
- 0
|
182
|
+
version: "0"
|
183
|
+
type: :development
|
184
|
+
prerelease: false
|
185
|
+
version_requirements: *id012
|
186
|
+
- !ruby/object:Gem::Dependency
|
187
|
+
name: biodiversity
|
188
|
+
requirement: &id013 !ruby/object:Gem::Requirement
|
37
189
|
none: false
|
38
190
|
requirements:
|
39
191
|
- - ">="
|
@@ -44,21 +196,37 @@ dependencies:
|
|
44
196
|
- 13
|
45
197
|
version: 0.5.13
|
46
198
|
type: :runtime
|
47
|
-
|
199
|
+
prerelease: false
|
200
|
+
version_requirements: *id013
|
201
|
+
- !ruby/object:Gem::Dependency
|
202
|
+
name: rake-compiler
|
203
|
+
requirement: &id014 !ruby/object:Gem::Requirement
|
204
|
+
none: false
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
segments:
|
209
|
+
- 0
|
210
|
+
version: "0"
|
211
|
+
type: :runtime
|
212
|
+
prerelease: false
|
213
|
+
version_requirements: *id014
|
48
214
|
description: This gem implements algorithm for fuzzy matching scientific names developed by Tony Rees
|
49
215
|
email: dmozzherin@eol.org
|
50
216
|
executables: []
|
51
217
|
|
52
|
-
extensions:
|
53
|
-
|
218
|
+
extensions:
|
219
|
+
- ext/damerau_levenshtein/extconf.rb
|
54
220
|
extra_rdoc_files:
|
55
221
|
- LICENSE
|
56
222
|
- README.rdoc
|
57
223
|
files:
|
224
|
+
- Gemfile.lock
|
58
225
|
- README.rdoc
|
59
226
|
- lib/taxamatch_rb.rb
|
60
227
|
- lib/taxamatch_rb/atomizer.rb
|
61
228
|
- lib/taxamatch_rb/authmatch.rb
|
229
|
+
- lib/taxamatch_rb/damerau_levenshtein.bundle
|
62
230
|
- lib/taxamatch_rb/damerau_levenshtein_mod.rb
|
63
231
|
- lib/taxamatch_rb/normalizer.rb
|
64
232
|
- lib/taxamatch_rb/phonetizer.rb
|
@@ -68,13 +236,14 @@ files:
|
|
68
236
|
- spec/taxamatch_rb_spec.rb
|
69
237
|
- spec/taxamatch_test.txt
|
70
238
|
- LICENSE
|
239
|
+
- ext/damerau_levenshtein/extconf.rb
|
71
240
|
has_rdoc: true
|
72
241
|
homepage: http://github.com/GlobalNamesArchitecture/taxamatch_rb
|
73
242
|
licenses: []
|
74
243
|
|
75
244
|
post_install_message:
|
76
|
-
rdoc_options:
|
77
|
-
|
245
|
+
rdoc_options: []
|
246
|
+
|
78
247
|
require_paths:
|
79
248
|
- lib
|
80
249
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -82,6 +251,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
82
251
|
requirements:
|
83
252
|
- - ">="
|
84
253
|
- !ruby/object:Gem::Version
|
254
|
+
hash: -2865757795593253659
|
85
255
|
segments:
|
86
256
|
- 0
|
87
257
|
version: "0"
|
@@ -100,6 +270,5 @@ rubygems_version: 1.3.7
|
|
100
270
|
signing_key:
|
101
271
|
specification_version: 3
|
102
272
|
summary: Implementation of Tony Rees Taxamatch algorithms
|
103
|
-
test_files:
|
104
|
-
|
105
|
-
- spec/taxamatch_rb_spec.rb
|
273
|
+
test_files: []
|
274
|
+
|