dimus-taxamatch_rb 0.5.0 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -1,6 +1,59 @@
1
1
  = taxamatch_rb
2
2
 
3
- Description goes here.
3
+ Taxamatch_Rb is a Ruby implementation of the Taxamatch algorithms developed by Tony Rees: http://www.cmar.csiro.au/datacentre/taxamatch.htm
4
+
5
+ The purpose of the Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings to find out whether they actually refer to the same scientific name.
6
+
7
+ tm = Taxamatch::Base.new
8
+ tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
9
+ tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
10
+ tm.taxamatch('Homo sapiens Mozzherin', 'Homo sapiens Linnaeus') #returns false
11
+
12
+ Taxamatch_Rb is compatible with Ruby 1.8.7 and with Ruby 1.9.1 and higher.
13
+
14
+ == Installation
15
+
16
+ sudo gem install dimus-taxamatch_rb --source http://gems.github.com
17
+
18
+ or
19
+ sudo gem sources -a http://gems.github.com #(you only have to do this once)
20
+ sudo gem install dimus-taxamatch_rb
21
+
22
+ == Usage
23
+
24
+ require 'rubygems' #not needed for ruby >= 1.9.1
25
+ require 'taxamatch_rb'
26
+
27
+ tm = Taxamatch::Base.new
28
+
29
+ * compare full scientific names
30
+
31
+ tm.taxamatch('Hommo sapiens L.', 'Homo sapiens Linnaeus')
32
+
33
+ * preparse names for matching (necessary for large databases of scientific names)
34
+
35
+ p = Taxamatch::Parser.new
36
+ parsed_name1 = p.parse('Monacanthus fronticinctus Günther 1867 sec. Eschmeyer 2004')
37
+ parsed_name2 = p.parse('Monacanthus fronticinctus (Gunther, 1867)')
38
+
39
+ * compare preparsed names
40
+
41
+ tm.taxamatch_preparsed(parsed_name1, parsed_name2)
42
+
43
+ * compare genera
44
+
45
+ tm.match_genera('Monacanthus', 'MONOCANTUS')
46
+
47
+ * compare species
48
+
49
+ tm.match_species('fronticinctus', 'frontecinctus')
50
+
51
+ * compare authors and years
52
+
53
+ Taxamatch::Authmatch.authmatch(['Linnaeus'], ['L','Muller'], [1786], [1787])
54
+
55
+
56
+ You can find more examples in the spec section of the code.
4
57
 
5
58
  == Copyright
6
59
 
data/lib/taxamatch_rb.rb CHANGED
@@ -22,32 +22,32 @@ module Taxamatch
22
22
 
23
23
  #takes two scientific names and returns true if names match and false if they don't
24
24
  def taxamatch(str1, str2)
25
- parsed_data_1 = @parser.parse(str1)
26
- parsed_data_2 = @parser.parse(str2)
27
- taxamatch_parsed_data(parsed_data_1, parsed_data_2)[:match]
25
+ preparsed_1 = @parser.parse(str1)
26
+ preparsed_2 = @parser.parse(str2)
27
+ taxamatch_preparsed(preparsed_1, preparsed_2)[:match]
28
28
  end
29
29
 
30
30
  #takes two hashes of parsed scientific names, analyses them and returns the match result.
31
31
  #this function is useful when species strings are preparsed.
32
- def taxamatch_parsed_data(parsed_data_1, parsed_data_2)
32
+ def taxamatch_preparsed(preparsed_1, preparsed_2)
33
33
  result = nil
34
- result = match_uninomial(parsed_data_1, parsed_data_2) if parsed_data_1[:uninomial] && parsed_data_2[:uninomial]
35
- result = match_multinomial(parsed_data_1, parsed_data_2) if parsed_data_1[:genus] && parsed_data_2[:genus]
34
+ result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
35
+ result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
36
36
  if result && result[:match]
37
- result[:match] = false if match_authors(parsed_data_1, parsed_data_2) == 0
37
+ result[:match] = false if match_authors(preparsed_1, preparsed_2) == 0
38
38
  end
39
39
  return result
40
40
  end
41
41
 
42
- def match_uninomial(parsed_data_1, parsed_data_2)
42
+ def match_uninomial(preparsed_1, preparsed_2)
43
43
  return false
44
44
  end
45
45
 
46
- def match_multinomial(parsed_data_1, parsed_data_2)
47
- gen_match = match_genera(parsed_data_1[:genus], parsed_data_2[:genus])
48
- sp_match = match_species(parsed_data_1[:species], parsed_data_2[:species])
49
- au_match = match_authors(parsed_data_1, parsed_data_2)
50
- total_length = parsed_data_1[:genus][:epitheton].size + parsed_data_2[:genus][:epitheton].size + parsed_data_1[:species][:epitheton].size + parsed_data_2[:species][:epitheton].size
46
+ def match_multinomial(preparsed_1, preparsed_2)
47
+ gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
48
+ sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
49
+ au_match = match_authors(preparsed_1, preparsed_2)
50
+ total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size + preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
51
51
  match = match_matches(gen_match, sp_match)
52
52
  match.merge({:score => (1- match[:edit_distance]/(total_length/2))})
53
53
  end
@@ -76,11 +76,11 @@ module Taxamatch
76
76
  {:edit_distance => ed, :match => match, :phonetic_match => false}
77
77
  end
78
78
 
79
- def match_authors(parsed_data_1, parsed_data_2)
80
- au1 = parsed_data_1[:all_authors]
81
- au2 = parsed_data_2[:all_authors]
82
- yr1 = parsed_data_1[:all_years]
83
- yr2 = parsed_data_2[:all_years]
79
+ def match_authors(preparsed_1, preparsed_2)
80
+ au1 = preparsed_1[:all_authors]
81
+ au2 = preparsed_2[:all_authors]
82
+ yr1 = preparsed_1[:all_years]
83
+ yr2 = preparsed_2[:all_years]
84
84
  Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
85
85
  end
86
86
 
@@ -12,9 +12,11 @@ module Taxamatch
12
12
 
13
13
  inline do |builder|
14
14
  builder.c "
15
- static VALUE distance_utf(VALUE _s, VALUE _t, long block_size, long max_distance){
16
- long min, i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block, current_distance;
17
- long stop_execution = 0;
15
+ static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
16
+ int i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block;
17
+ int stop_execution = 0;
18
+ int min = 0;
19
+ int current_distance = 0;
18
20
 
19
21
  VALUE *sv = RARRAY_PTR(_s);
20
22
  VALUE *tv = RARRAY_PTR(_t);
@@ -22,22 +24,22 @@ module Taxamatch
22
24
  sl = RARRAY_LEN(_s);
23
25
  tl = RARRAY_LEN(_t);
24
26
 
25
- if (sl == 0) return LONG2NUM(tl);
26
- if (tl == 0) return LONG2NUM(sl);
27
+ if (sl == 0) return INT2NUM(tl);
28
+ if (tl == 0) return INT2NUM(sl);
27
29
  //the case where both lengths are 1 must be handled here, or the code further down will break
28
- if (sl == 1 && tl == 1 && sv[0] != tv[0]) return LONG2NUM(1);
30
+ if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);
29
31
 
30
- long s[sl];
31
- long t[tl];
32
+ int s[sl];
33
+ int t[tl];
32
34
 
33
- for (i=0; i < sl; i++) s[i] = NUM2LONG(sv[i]);
34
- for (i=0; i < tl; i++) t[i] = NUM2LONG(tv[i]);
35
+ for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
36
+ for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);
35
37
 
36
38
  sl++;
37
39
  tl++;
38
40
 
39
41
  //one-dimensional representation of a 2-dimensional array of size (len(s)+1) * (len(t)+1)
40
- d = malloc((sizeof(long))*(sl)*(tl));
42
+ d = malloc((sizeof(int))*(sl)*(tl));
41
43
  //populate 'vertical' row starting from the 2nd position (first one is filled already)
42
44
  for(i = 0; i < tl; i++){
43
45
  d[i*sl] = i;
@@ -60,8 +62,8 @@ module Taxamatch
60
62
  block = block < half_tl ? block : half_tl;
61
63
 
62
64
  while (block >= 1){
63
- long swap1 = 1;
64
- long swap2 = 1;
65
+ int swap1 = 1;
66
+ int swap2 = 1;
65
67
  i1 = i - (block * 2);
66
68
  j1 = j - (block * 2);
67
69
  for (k = i1; k < i1 + block; k++) {
@@ -81,7 +83,7 @@ module Taxamatch
81
83
  ins = d[(j-1)*sl + i] + 1;
82
84
  min = del;
83
85
  if (ins < min) min = ins;
84
- //if (i == 2 && j==2) return LONG2NUM(swap2+5);
86
+ //if (i == 2 && j==2) return INT2NUM(swap2+5);
85
87
  if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
86
88
  transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
87
89
  if (transp < min) min = transp;
@@ -103,7 +105,7 @@ module Taxamatch
103
105
  if (stop_execution == 1) distance = current_distance;
104
106
 
105
107
  free(d);
106
- return LONG2NUM(distance);
108
+ return INT2NUM(distance);
107
109
  }
108
110
  "
109
111
  end
@@ -8,7 +8,11 @@ module Taxamatch
8
8
  end
9
9
 
10
10
  def self.normalize_word(word)
11
- self.normalize(word).gsub(/[^A-Z0-9\-]/, '')
11
+ self.normalize(word).gsub(/[^A-Z0-9\-]/, '').strip
12
+ end
13
+
14
+ def self.normalize_author(string)
15
+ self.normalize(string).gsub(/[^A-Z]/, ' ').gsub(/[\s]{2,}/, ' ').strip
12
16
  end
13
17
 
14
18
  protected
@@ -71,8 +71,9 @@ module Taxamatch
71
71
  end
72
72
  end
73
73
  res[:authors].uniq!
74
+ res[:normalized_authors] = res[:authors].map {|a| Taxamatch::Normalizer.normalize_author(a)}
74
75
  res[:years].uniq!
75
- @res[:all_authors] += res[:authors] if res[:authors].size > 0
76
+ @res[:all_authors] += res[:normalized_authors] if res[:normalized_authors].size > 0
76
77
  @res[:all_years] += res[:years] if res[:years].size > 0
77
78
  end
78
79
 
@@ -20,17 +20,16 @@ describe 'Parser' do
20
20
  end
21
21
 
22
22
  it 'should parse uninomials' do
23
- @parser.parse('Betula').should == {:all_authors=>[], :all_years=>[], :uninomial=>{:epitheton=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[]}}
24
- @parser.parse('Ærenea Lacordaire, 1872').should == {:all_authors=>["LACORDAIRE"], :all_years=>["1872"], :uninomial=>{:epitheton=>"Aerenea", :authors=>["Lacordaire"], :normalized=>"AERENEA", :phonetized=>"ERINIA", :years=>["1872"]}}
25
- @parser.parse('Ærenea (Lacordaire, 1872) Muller 2007').should == {:all_authors=>["LACORDAIRE", "MULLER"], :all_years=>["1872", "2007"], :uninomial=>{:epitheton=>"Aerenea", :authors=>["Lacordaire", "Muller"], :normalized=>"AERENEA", :phonetized=>"ERINIA", :years=>["1872", "2007"]}}
23
+ @parser.parse('Betula').should == {:all_authors=>[], :all_years=>[], :uninomial=>{:epitheton=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[], :normalized_authors=>[]}}
24
+ @parser.parse('Ærenea Lacordaire, 1872').should == {:all_authors=>["LACORDAIRE"], :all_years=>["1872"], :uninomial=>{:epitheton=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA", :authors=>["Lacordaire"], :years=>["1872"], :normalized_authors=>["LACORDAIRE"]}}
26
25
  end
27
26
 
28
27
  it 'should parse binomials' do
29
- @parser.parse('Leœptura laetifica Dow, 1913').should == {:species=>{:epitheton=>"laetifica", :authors=>["Dow"], :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :years=>["1913"]}, :all_authors=>["DOW"], :all_years=>["1913"], :genus=>{:epitheton=>"Leoeptura", :authors=>[], :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :years=>[]}}
28
+ @parser.parse('Leœptura laetifica Dow, 1913').should == {:all_authors=>["DOW"], :all_years=>["1913"], :genus=>{:epitheton=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:epitheton=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :authors=>["Dow"], :years=>["1913"], :normalized_authors=>["DOW"]}}
30
29
  end
31
30
 
32
31
  it 'should parse trinomials' do
33
- @parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == {:genus=>{:epitheton=>"Hydnellum", :authors=>[], :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :years=>[]}, :infraspecies=>[{:epitheton=>"zonatum", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :normalized=>"ZONATUM", :phonetized=>"ZANATA", :years=>["1972"]}], :all_authors=>["BANKER", "D. HALL", "D.E. STUNTZ"], :all_years=>["1972"], :species=>{:epitheton=>"scrobiculatum", :authors=>[], :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :years=>[]}}
32
+ @parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == {:all_authors=>["BANKER", "D HALL", "D E STUNTZ"], :all_years=>["1972"], :genus=>{:epitheton=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:epitheton=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>[], :years=>[], :normalized_authors=>[]}, :infraspecies=>[{:epitheton=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>["1972"], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]}
34
33
  end
35
34
  end
36
35
 
@@ -16,4 +16,13 @@ Pomatomus saltator|Pomatomus saltatrix|true
16
16
  Loligo pealeii|Loligo plei|false
17
17
 
18
18
  # different authors should not match
19
- Puma concolor Linnaeus|Puma concolor Kurtz|false
19
+ Puma concolor Linnaeus|Puma concolor Kurtz|false
20
+
21
+ #real life examples
22
+ Biatora borealis|Bactra borealis Diakonoff 1964|false
23
+
24
+ Homo sapien|Homo sapiens|true
25
+ Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true
26
+ Homo sapiens Mozzherin|Homo sapiens Linneaus|false
27
+
28
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dimus-taxamatch_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-07 00:00:00 -07:00
12
+ date: 2009-08-08 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency