dimus-taxamatch_rb 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +54 -1
- data/lib/taxamatch_rb.rb +18 -18
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +17 -15
- data/lib/taxamatch_rb/normalizer.rb +5 -1
- data/lib/taxamatch_rb/parser.rb +2 -1
- data/spec/taxamatch_rb_spec.rb +4 -5
- data/spec/taxamatch_test.txt +10 -1
- metadata +2 -2
data/README.rdoc
CHANGED
@@ -1,6 +1,59 @@
|
|
1
1
|
= taxamatch_rb
|
2
2
|
|
3
|
-
|
3
|
+
Taxamatch_Rb is a Ruby implementation of the Taxamatch algorithms developed by Tony Rees: http://www.cmar.csiro.au/datacentre/taxamatch.htm
|
4
|
+
|
5
|
+
The purpose of the Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings to find out whether they actually refer to the same scientific name.
|
6
|
+
|
7
|
+
tm = Taxamatch::Base.new
|
8
|
+
tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
|
9
|
+
tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
|
10
|
+
tm.taxamatch('Homo sapiens Mozzherin', 'Homo sapiens Linnaeus') #returns false
|
11
|
+
|
12
|
+
Taxamatch_Rb is compatible with Ruby 1.8.7 and with Ruby 1.9.1 and higher
|
13
|
+
|
14
|
+
== Installation
|
15
|
+
|
16
|
+
sudo gem install dimus-taxamatch_rb --source http://gems.github.com
|
17
|
+
|
18
|
+
or
|
19
|
+
sudo gem sources -a http://gems.github.com #(you only have to do this once)
|
20
|
+
sudo gem install dimus-taxamatch_rb
|
21
|
+
|
22
|
+
== Usage
|
23
|
+
|
24
|
+
require 'rubygems' #not needed for ruby > 1.9.1
|
25
|
+
require 'taxamatch_rb'
|
26
|
+
|
27
|
+
tm = Taxamatch::Base.new
|
28
|
+
|
29
|
+
* compare full scientific names
|
30
|
+
|
31
|
+
tm.taxamatch('Hommo sapiens L.', 'Homo sapiens Linnaeus')
|
32
|
+
|
33
|
+
* preparse names for matching (necessary for large databases of scientific names)
|
34
|
+
|
35
|
+
p = Taxamatch::Parser.new
|
36
|
+
parsed_name1 = p.parse('Monacanthus fronticinctus Günther 1867 sec. Eschmeyer 2004')
|
37
|
+
parsed_name2 = p.parse('Monacanthus fronticinctus (Gunther, 1867)')
|
38
|
+
|
39
|
+
* compare preparsed names
|
40
|
+
|
41
|
+
tm.taxamatch_preparsed(parsed_name1, parsed_name2)
|
42
|
+
|
43
|
+
* compare genera
|
44
|
+
|
45
|
+
tm.match_genera('Monacanthus', 'MONOCANTUS')
|
46
|
+
|
47
|
+
* compare species
|
48
|
+
|
49
|
+
tm.match_species('fronticinctus', 'frontecinctus')
|
50
|
+
|
51
|
+
* compare authors and years
|
52
|
+
|
53
|
+
Taxamatch::Authmatch.authmatch(['Linnaeus'], ['L','Muller'], [1786], [1787])
|
54
|
+
|
55
|
+
|
56
|
+
You can find more examples in the spec section of the code
|
4
57
|
|
5
58
|
== Copyright
|
6
59
|
|
data/lib/taxamatch_rb.rb
CHANGED
@@ -22,32 +22,32 @@ module Taxamatch
|
|
22
22
|
|
23
23
|
#takes two scientific names and returns true if names match and false if they don't
|
24
24
|
def taxamatch(str1, str2)
|
25
|
-
|
26
|
-
|
27
|
-
|
25
|
+
preparsed_1 = @parser.parse(str1)
|
26
|
+
preparsed_2 = @parser.parse(str2)
|
27
|
+
taxamatch_preparsed(preparsed_1, preparsed_2)[:match]
|
28
28
|
end
|
29
29
|
|
30
30
|
#takes two hashes of parsed scientific names, analyses them and returns the match result.
|
31
31
|
#this function is useful when species strings are preparsed.
|
32
|
-
def
|
32
|
+
def taxamatch_preparsed(preparsed_1, preparsed_2)
|
33
33
|
result = nil
|
34
|
-
result = match_uninomial(
|
35
|
-
result = match_multinomial(
|
34
|
+
result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
35
|
+
result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
|
36
36
|
if result && result[:match]
|
37
|
-
result[:match] = false if match_authors(
|
37
|
+
result[:match] = false if match_authors(preparsed_1, preparsed_2) == 0
|
38
38
|
end
|
39
39
|
return result
|
40
40
|
end
|
41
41
|
|
42
|
-
def match_uninomial(
|
42
|
+
def match_uninomial(preparsed_1, preparsed_2)
|
43
43
|
return false
|
44
44
|
end
|
45
45
|
|
46
|
-
def match_multinomial(
|
47
|
-
gen_match = match_genera(
|
48
|
-
sp_match = match_species(
|
49
|
-
au_match = match_authors(
|
50
|
-
total_length =
|
46
|
+
def match_multinomial(preparsed_1, preparsed_2)
|
47
|
+
gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
|
48
|
+
sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
|
49
|
+
au_match = match_authors(preparsed_1, preparsed_2)
|
50
|
+
total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size + preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
|
51
51
|
match = match_matches(gen_match, sp_match)
|
52
52
|
match.merge({:score => (1- match[:edit_distance]/(total_length/2))})
|
53
53
|
end
|
@@ -76,11 +76,11 @@ module Taxamatch
|
|
76
76
|
{:edit_distance => ed, :match => match, :phonetic_match => false}
|
77
77
|
end
|
78
78
|
|
79
|
-
def match_authors(
|
80
|
-
au1 =
|
81
|
-
au2 =
|
82
|
-
yr1 =
|
83
|
-
yr2 =
|
79
|
+
def match_authors(preparsed_1, preparsed_2)
|
80
|
+
au1 = preparsed_1[:all_authors]
|
81
|
+
au2 = preparsed_2[:all_authors]
|
82
|
+
yr1 = preparsed_1[:all_years]
|
83
|
+
yr2 = preparsed_2[:all_years]
|
84
84
|
Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
|
85
85
|
end
|
86
86
|
|
@@ -12,9 +12,11 @@ module Taxamatch
|
|
12
12
|
|
13
13
|
inline do |builder|
|
14
14
|
builder.c "
|
15
|
-
static VALUE distance_utf(VALUE _s, VALUE _t,
|
16
|
-
|
17
|
-
|
15
|
+
static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
|
16
|
+
int i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block;
|
17
|
+
int stop_execution = 0;
|
18
|
+
int min = 0;
|
19
|
+
int current_distance = 0;
|
18
20
|
|
19
21
|
VALUE *sv = RARRAY_PTR(_s);
|
20
22
|
VALUE *tv = RARRAY_PTR(_t);
|
@@ -22,22 +24,22 @@ module Taxamatch
|
|
22
24
|
sl = RARRAY_LEN(_s);
|
23
25
|
tl = RARRAY_LEN(_t);
|
24
26
|
|
25
|
-
if (sl == 0) return
|
26
|
-
if (tl == 0) return
|
27
|
+
if (sl == 0) return INT2NUM(tl);
|
28
|
+
if (tl == 0) return INT2NUM(sl);
|
27
29
|
//the case of both lengths being 1 must be handled here or it will break further in the code
|
28
|
-
if (sl == 1 && tl == 1 && sv[0] != tv[0]) return
|
30
|
+
if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);
|
29
31
|
|
30
|
-
|
31
|
-
|
32
|
+
int s[sl];
|
33
|
+
int t[tl];
|
32
34
|
|
33
|
-
for (i=0; i < sl; i++) s[i] =
|
34
|
-
for (i=0; i < tl; i++) t[i] =
|
35
|
+
for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
|
36
|
+
for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);
|
35
37
|
|
36
38
|
sl++;
|
37
39
|
tl++;
|
38
40
|
|
39
41
|
//one-dimensional representation of a two-dimensional array of size (len(s)+1) * (len(t)+1)
|
40
|
-
d = malloc((sizeof(
|
42
|
+
d = malloc((sizeof(int))*(sl)*(tl));
|
41
43
|
//populate 'vertical' row starting from the 2nd position (first one is filled already)
|
42
44
|
for(i = 0; i < tl; i++){
|
43
45
|
d[i*sl] = i;
|
@@ -60,8 +62,8 @@ module Taxamatch
|
|
60
62
|
block = block < half_tl ? block : half_tl;
|
61
63
|
|
62
64
|
while (block >= 1){
|
63
|
-
|
64
|
-
|
65
|
+
int swap1 = 1;
|
66
|
+
int swap2 = 1;
|
65
67
|
i1 = i - (block * 2);
|
66
68
|
j1 = j - (block * 2);
|
67
69
|
for (k = i1; k < i1 + block; k++) {
|
@@ -81,7 +83,7 @@ module Taxamatch
|
|
81
83
|
ins = d[(j-1)*sl + i] + 1;
|
82
84
|
min = del;
|
83
85
|
if (ins < min) min = ins;
|
84
|
-
//if (i == 2 && j==2) return
|
86
|
+
//if (i == 2 && j==2) return INT2NUM(swap2+5);
|
85
87
|
if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
|
86
88
|
transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
|
87
89
|
if (transp < min) min = transp;
|
@@ -103,7 +105,7 @@ module Taxamatch
|
|
103
105
|
if (stop_execution == 1) distance = current_distance;
|
104
106
|
|
105
107
|
free(d);
|
106
|
-
return
|
108
|
+
return INT2NUM(distance);
|
107
109
|
}
|
108
110
|
"
|
109
111
|
end
|
@@ -8,7 +8,11 @@ module Taxamatch
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def self.normalize_word(word)
|
11
|
-
self.normalize(word).gsub(/[^A-Z0-9\-]/, '')
|
11
|
+
self.normalize(word).gsub(/[^A-Z0-9\-]/, '').strip
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.normalize_author(string)
|
15
|
+
self.normalize(string).gsub(/[^A-Z]/, ' ').gsub(/[\s]{2,}/, ' ').strip
|
12
16
|
end
|
13
17
|
|
14
18
|
protected
|
data/lib/taxamatch_rb/parser.rb
CHANGED
@@ -71,8 +71,9 @@ module Taxamatch
|
|
71
71
|
end
|
72
72
|
end
|
73
73
|
res[:authors].uniq!
|
74
|
+
res[:normalized_authors] = res[:authors].map {|a| Taxamatch::Normalizer.normalize_author(a)}
|
74
75
|
res[:years].uniq!
|
75
|
-
@res[:all_authors] += res[:
|
76
|
+
@res[:all_authors] += res[:normalized_authors] if res[:normalized_authors].size > 0
|
76
77
|
@res[:all_years] += res[:years] if res[:years].size > 0
|
77
78
|
end
|
78
79
|
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -20,17 +20,16 @@ describe 'Parser' do
|
|
20
20
|
end
|
21
21
|
|
22
22
|
it 'should parse uninomials' do
|
23
|
-
@parser.parse('Betula').should == {:all_authors=>[], :all_years=>[], :uninomial=>{:epitheton=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[]}}
|
24
|
-
@parser.parse('Ærenea Lacordaire, 1872').should == {:all_authors=>["LACORDAIRE"], :all_years=>["1872"], :uninomial=>{:epitheton=>"Aerenea", :
|
25
|
-
@parser.parse('Ærenea (Lacordaire, 1872) Muller 2007').should == {:all_authors=>["LACORDAIRE", "MULLER"], :all_years=>["1872", "2007"], :uninomial=>{:epitheton=>"Aerenea", :authors=>["Lacordaire", "Muller"], :normalized=>"AERENEA", :phonetized=>"ERINIA", :years=>["1872", "2007"]}}
|
23
|
+
@parser.parse('Betula').should == {:all_authors=>[], :all_years=>[], :uninomial=>{:epitheton=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[], :normalized_authors=>[]}}
|
24
|
+
@parser.parse('Ærenea Lacordaire, 1872').should == {:all_authors=>["LACORDAIRE"], :all_years=>["1872"], :uninomial=>{:epitheton=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA", :authors=>["Lacordaire"], :years=>["1872"], :normalized_authors=>["LACORDAIRE"]}}
|
26
25
|
end
|
27
26
|
|
28
27
|
it 'should parse binomials' do
|
29
|
-
@parser.parse('Leœptura laetifica Dow, 1913').should == {:
|
28
|
+
@parser.parse('Leœptura laetifica Dow, 1913').should == {:all_authors=>["DOW"], :all_years=>["1913"], :genus=>{:epitheton=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:epitheton=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :authors=>["Dow"], :years=>["1913"], :normalized_authors=>["DOW"]}}
|
30
29
|
end
|
31
30
|
|
32
31
|
it 'should parse trinomials' do
|
33
|
-
@parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == {:genus=>{:epitheton=>"Hydnellum", :
|
32
|
+
@parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == {:all_authors=>["BANKER", "D HALL", "D E STUNTZ"], :all_years=>["1972"], :genus=>{:epitheton=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:epitheton=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>[], :years=>[], :normalized_authors=>[]}, :infraspecies=>[{:epitheton=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>["1972"], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]}
|
34
33
|
end
|
35
34
|
end
|
36
35
|
|
data/spec/taxamatch_test.txt
CHANGED
@@ -16,4 +16,13 @@ Pomatomus saltator|Pomatomus saltatrix|true
|
|
16
16
|
Loligo pealeii|Loligo plei|false
|
17
17
|
|
18
18
|
# different authors should not match
|
19
|
-
Puma concolor Linnaeus|Puma concolor Kurtz|false
|
19
|
+
Puma concolor Linnaeus|Puma concolor Kurtz|false
|
20
|
+
|
21
|
+
#real life examples
|
22
|
+
Biatora borealis|Bactra borealis Diakonoff 1964|false
|
23
|
+
|
24
|
+
Homo sapien|Homo sapiens|true
|
25
|
+
Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true
|
26
|
+
Homo sapiens Mozzherin|Homo sapiens Linneaus|false
|
27
|
+
|
28
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dimus-taxamatch_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-08-
|
12
|
+
date: 2009-08-08 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|