taxamatch_rb 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Dmitry Mozzherin
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,61 @@
1
+ = taxamatch_rb
2
+
3
+ Taxamatch_Rb is a ruby implementation of Taxamatch algorithms developed by Tony Rees: http://www.cmar.csiro.au/datacentre/taxamatch.htm
4
+
5
+ The purpose of the Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings to find out if they actually point to the same scientific name.
6
+
7
+ require 'taxamatch_rb'
8
+ tm = Taxamatch::Base.new
9
+ tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
10
+ tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
11
+ tm.taxamatch('Homo sapiens Mozzherin', 'Homo sapiens Linnaeus') #returns false
12
+
13
+ Taxamatch_Rb is compatible with ruby versions 1.8.7 and 1.9.1 and higher
14
+
15
+ == Installation
16
+
17
+ sudo gem install dimus-taxamatch_rb --source http://gems.github.com
18
+
19
+ or
20
+ sudo gem sources -a http://gems.github.com #(you only have to do this once)
21
+ sudo gem install dimus-taxamatch_rb
22
+
23
+ == Usage
24
+
25
+ require 'rubygems' #not needed for ruby > 1.9.1
26
+ require 'taxamatch_rb'
27
+
28
+ tm = Taxamatch::Base.new
29
+
30
+ * compare full scientific names
31
+
32
+ tm.taxamatch('Hommo sapiens L.', 'Homo sapiens Linnaeus')
33
+
34
+ * preparse names for the matching (necessary for large databases of scientific names)
35
+
36
+ p = Taxamatch::Atomizer.new
37
+ parsed_name1 = p.parse('Monacanthus fronticinctus Günther 1867 sec. Eschmeyer 2004')
38
+ parsed_name2 = p.parse('Monacanthus fronticinctus (Gunther, 1867)')
39
+
40
+ * compare preparsed names
41
+
42
+ tm.taxamatch_preparsed(parsed_name1, parsed_name2)
43
+
44
+ * compare genera
45
+
46
+ tm.match_genera('Monacanthus', 'MONOCANTUS')
47
+
48
+ * compare species
49
+
50
+ tm.match_species('fronticinctus', 'frontecinctus')
51
+
52
+ * compare authors and years
53
+
54
+ Taxamatch::Authmatch.authmatch(['Linnaeus'], ['L','Muller'], [1786], [1787])
55
+
56
+
57
+ You can find more examples in the spec section of the code.
58
+
59
+ == Copyright
60
+
61
+ Copyright (c) 2009 Dmitry Mozzherin. See LICENSE for details.
@@ -0,0 +1,117 @@
1
+ # encoding: UTF-8
2
+ $:.unshift(File.dirname(__FILE__)) unless
3
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
4
+ # $:.unshift('taxamatch_rb')
5
+ require 'taxamatch_rb/damerau_levenshtein_mod'
6
+ require 'taxamatch_rb/atomizer'
7
+ require 'taxamatch_rb/normalizer'
8
+ require 'taxamatch_rb/phonetizer'
9
+ require 'taxamatch_rb/authmatch'
10
+ require 'ruby-debug'
11
+
12
# $KCODE is only meaningful on Ruby 1.8. Compare the whole version string:
# looking at the minor number alone would also treat 2.x and 3.x as "pre-1.9".
$KCODE = 'u' if RUBY_VERSION < '1.9'
13
+
14
module Taxamatch

  # Entry point for fuzzy comparison of scientific names.
  #
  # Match results are Hashes with String keys:
  #   'match'          - true/false verdict
  #   'edit_distance'  - accumulated edit distance of the compared parts
  #   'phonetic_match' - true when the phonetized forms are identical
  # Multinomial results additionally carry 'score'.
  class Base

    def initialize
      @parser = Taxamatch::Atomizer.new
      @dlm = Taxamatch::DamerauLevenshteinMod.new
    end

    # Takes two scientific name strings, parses them and compares them.
    #
    # With return_boolean (the default) the 'match' flag of the result is
    # returned. Otherwise the whole result Hash is returned. When the names
    # cannot be compared (for example one of them could not be parsed) the
    # return value is nil in both modes.
    def taxamatch(str1, str2, return_boolean = true)
      preparsed_1 = @parser.parse(str1)
      preparsed_2 = @parser.parse(str2)
      match = begin
        taxamatch_preparsed(preparsed_1, preparsed_2)
      rescue StandardError
        # Best effort: names that cannot be compared yield nil, not an error.
        nil
      end
      return_boolean && match ? match['match'] : match
    end

    # Takes two Hashes of preparsed scientific names and compares them.
    # Useful when names are parsed once and matched many times.
    #
    # Returns a match Hash, or nil when the two names share neither a
    # :uninomial nor a :genus entry. A positive name match is additionally
    # confirmed against the authorship data.
    def taxamatch_preparsed(preparsed_1, preparsed_2)
      result = nil
      result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
      result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
      if result && result['match']
        # An authorship score of 0 vetoes the match.
        result['match'] = match_authors(preparsed_1, preparsed_2) != 0
      end
      result
    end

    # Uninomials are compared with the same rules as genera.
    def match_uninomial(preparsed_1, preparsed_2)
      match_genera(preparsed_1[:uninomial], preparsed_2[:uninomial])
    end

    # Compares genus, species and (when present) the first infraspecies of
    # two preparsed names.
    #
    # Returns the combined match Hash extended with 'score':
    # 1 - edit_distance / (half of the summed epithet lengths).
    def match_multinomial(preparsed_1, preparsed_2)
      gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
      sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
      total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size +
                     preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
      infrasp_1 = preparsed_1[:infraspecies]
      infrasp_2 = preparsed_2[:infraspecies]
      if infrasp_1 && infrasp_2
        infrasp_match = match_species(infrasp_1[0], infrasp_2[0])
        total_length += infrasp_1[0][:epitheton].size + infrasp_2[0][:epitheton].size
        match_hash = match_matches(gen_match, sp_match, infrasp_match)
      elsif infrasp_1 || infrasp_2
        # Only one of the names has an infraspecies: never a match.
        match_hash = { 'match' => false, 'edit_distance' => 5, 'phonetic_match' => false }
        total_length += (infrasp_1 || infrasp_2)[0][:epitheton].size
      else
        match_hash = match_matches(gen_match, sp_match)
      end
      # merge! (not merge) so the score is actually stored; float division so
      # the ratio is not truncated to an integer.
      match_hash.merge!('score' => (1 - match_hash['edit_distance'] / (total_length / 2.0)))
      match_hash
    end

    # Compares two genus Hashes (keys :normalized and :phonetized are read).
    def match_genera(genus1, genus2)
      genus1_length = genus1[:normalized].size
      genus2_length = genus2[:normalized].size
      min_length = [genus1_length, genus2_length].min
      match = false
      ed = @dlm.distance(genus1[:normalized], genus2[:normalized], 1, 3) #TODO put block = 2
      # NOTE(review): when ed is an Integer this is integer division, so the
      # guard only fires once ed reaches the shorter length. Left as is because
      # short genera with a phonetic match (e.g. HOMO/HOMMO) depend on it.
      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed / min_length > 0.2
      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]

      # For two or more edits the first letters of the normalized names must
      # agree. (Indexing the Hash itself with 0 always compared nil to nil.)
      same_first_letter = genus1[:normalized][0] == genus2[:normalized][0]
      match = true if ed <= 3 && (min_length > ed * 2) && (ed < 2 || same_first_letter)
      {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
    end

    # Compares two species (or infraspecies) Hashes.
    #
    # Side effect: the :phonetized entries of both arguments are replaced by
    # their ending-normalized forms.
    def match_species(sp1, sp2)
      sp1_length = sp1[:normalized].size
      sp2_length = sp2[:normalized].size
      min_length = [sp1_length, sp2_length].min
      sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
      sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
      match = false
      ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 1, 4) #TODO put block 4
      # NOTE(review): integer division when ed is an Integer, same as match_genera.
      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed / min_length > 0.3334
      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]

      # Two or more edits require the same first letter, four edits the same
      # first three letters.
      match = true if ed <= 4 && (min_length >= ed * 2) &&
                      (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) &&
                      (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
      { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
    end

    # Scores the authorship of two preparsed names via Taxamatch::Authmatch.
    def match_authors(preparsed_1, preparsed_2)
      au1 = preparsed_1[:all_authors]
      au2 = preparsed_2[:all_authors]
      yr1 = preparsed_1[:all_years]
      yr2 = preparsed_2[:all_years]
      Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
    end

    # Folds genus, species and optional infraspecies results into one Hash.
    #
    # The species_match Hash is updated in place and returned. The combined
    # edit distance may not exceed 4 (6 with an infraspecies).
    def match_matches(genus_match, species_match, infraspecies_match = nil)
      match = species_match
      if infraspecies_match
        match['edit_distance'] += infraspecies_match['edit_distance']
        match['match'] &&= infraspecies_match['match']
        match['phonetic_match'] &&= infraspecies_match['phonetic_match']
      end
      match['edit_distance'] += genus_match['edit_distance']
      match['match'] = false if match['edit_distance'] > (infraspecies_match ? 6 : 4)
      match['match'] &&= genus_match['match']
      match['phonetic_match'] &&= genus_match['phonetic_match']
      match
    end

  end

end
@@ -0,0 +1,82 @@
1
+ # encoding: UTF-8
2
+ require 'biodiversity'
3
+
4
module Taxamatch

  # Parses a scientific name and reorganizes the parser output into the
  # flat Hash structure the matcher works with.
  class Atomizer
    # Raw :scientificName Hash returned by the parser for the last name.
    attr_reader :parsed_raw

    def initialize
      @parser = ScientificNameParser.new
      @parsed_raw = nil
      @res = {}
    end

    # Parses +name+. Returns the organized Hash, or nil when the name was not
    # parsed or yielded no uninomial/genus/species/infraspecies entry.
    def parse(name)
      @res = {:all_authors => [], :all_years => []}
      @parsed_raw = @parser.parse(name)[:scientificName]
      organize_results
    end

    protected

    # Builds @res from the first :details entry of the raw parse.
    def organize_results
      return nil unless @parsed_raw[:parsed]
      details = @parsed_raw[:details][0]
      [[:uninomial, false], [:genus, false], [:species, true]].each do |rank, is_species|
        process_node(rank, details[rank], is_species)
      end
      process_infraspecies(details[:infraspecies])
      @res[:all_authors] = @res[:all_authors].uniq.map { |author| Taxamatch::Normalizer.normalize(author) }
      @res[:all_years].uniq!
      # More than the two bookkeeping keys means at least one name part was found.
      @res.keys.size > 2 ? @res : nil
    end

    # Stores the atomized form of +node+ under @res[name]; no-op without a node.
    def process_node(name, node, is_species = false)
      return unless node
      @res[name] = atomize(node, is_species)
    end

    # Stores the atomized form of every infraspecies entry, in order.
    def process_infraspecies(node)
      return unless node
      @res[:infraspecies] = []
      node.each { |infr| @res[:infraspecies] << atomize(infr, true) }
    end

    # Epithet, its normalized and phonetized forms, plus authorship data.
    def atomize(node, is_species)
      epithet = node[:epitheton]
      atom = {
        :epitheton => epithet,
        :normalized => Taxamatch::Normalizer.normalize(epithet),
        :phonetized => Taxamatch::Phonetizer.near_match(epithet, is_species)
      }
      get_authors_years(node, atom)
      atom
    end

    # Collects authors and years of the basionym and combination author teams
    # (including their ex-author teams) into +res+ and appends them to the
    # name-wide :all_authors / :all_years lists.
    def get_authors_years(node, res)
      res[:authors] = []
      res[:years] = []
      [:basionymAuthorTeam, :combinationAuthorTeam].each do |team_key|
        team = node[team_key]
        next unless team
        [team, team[:exAuthorTeam]].each do |group|
          next unless group
          res[:authors] += group[:author]
          res[:years] << group[:year] if group[:year]
        end
      end
      res[:authors].uniq!
      res[:normalized_authors] = res[:authors].map { |author| Taxamatch::Normalizer.normalize_author(author) }
      res[:years].uniq!
      @res[:all_authors] += res[:normalized_authors] unless res[:normalized_authors].empty?
      @res[:all_years] += res[:years] unless res[:years].empty?
    end

  end
end
82
+
@@ -0,0 +1,89 @@
1
+ # Algorithms for Taxamatch::Authmatch are developed by Patrick Leary of uBio and EOL fame
2
+
3
module Taxamatch
  # Scores how well the authorship (author names and years) of two names agree.
  class Authmatch

    # Returns an Integer score: 0 means "authorship does not match", otherwise
    # a value above 50, up to 100.
    def self.authmatch(authors1, authors2, years1, years2)
      leftover1, leftover2 = remove_duplicate_authors(authors1, authors2)
      year_difference = compare_years(years1, years2)
      get_score(authors1, leftover1, authors2, leftover2, year_difference)
    end

    # Turns the number of unmatched authors and the year difference into a
    # score. +year_diff+ is nil when the years could not be compared.
    def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
      total_before = authors1.size + authors2.size
      total_after = unique_authors1.size + unique_authors2.size
      score =
        if total_after == 0
          # every author found a counterpart
          case year_diff
          when nil then 94
          when 0 then 100
          when 1 then 54
          else 0
          end
        elsif unique_authors1.size == 0 || unique_authors2.size == 0
          # one side is fully covered by the other
          case year_diff
          when nil then 90
          when 0 then 91
          when 1 then 51
          else 0
          end
        else
          shared = ((1 - total_after.to_f / total_before.to_f) * 100).round
          (year_diff.nil? || year_diff == 0) ? shared : 0
        end
      score > 50 ? score : 0
    end

    # Returns copies of both author lists with every author removed that has a
    # counterpart (equal, abbreviation/prefix, or fuzzy match) in the other list.
    def self.remove_duplicate_authors(authors1, authors2)
      remaining1 = authors1.dup
      remaining2 = authors2.dup
      authors1.each do |first|
        authors2.each do |second|
          # first abbreviates (or equals) second / second abbreviates (or equals) first
          first_matches = (first == second[0...first.size])
          second_matches = (first[0...second.size] == second)
          solid = (first_matches && second_matches) ||
                  (first_matches && first.size >= 3) ||
                  (second_matches && second.size >= 3)
          if solid
            remaining1.delete first
            remaining2.delete second
          elsif first_matches
            remaining1.delete first
          elsif second_matches
            remaining2.delete second
          elsif first.size > 1 && second.size > 1 && self.fuzzy_match_authors(first, second)
            #TODO: the size checks mask a bug in damerau levenshtein mod which appears comparing 1 letter to a longer string
            remaining1.delete first
            remaining2.delete second
          end
        end
      end
      [remaining1, remaining2]
    end

    # True when two author strings are within a small edit distance.
    def self.fuzzy_match_authors(author1, author2)
      shortest = [author1.size, author2.size].min
      #get around a bug in C code, but it really has to be fixed
      ed = Taxamatch::DamerauLevenshteinMod.new.distance(author1, author2, 2, 3)
      return false unless ed <= 3
      return false unless shortest > ed * 2
      ed < 2 || author1[0] == author2[0]
    end

    # 0 when neither side has years, the absolute difference when both have
    # exactly one year, nil otherwise.
    def self.compare_years(years1, years2)
      if years1.empty? && years2.empty?
        0
      elsif years1.size == 1 && years2.size == 1
        (years1[0].to_i - years2[0].to_i).abs
      end
    end
  end
end
@@ -0,0 +1,139 @@
1
+ # encoding: UTF-8
2
+ require 'rubygems'
3
+ require 'inline'
4
+ require 'time'
5
module Taxamatch

  # Modified Damerau-Levenshtein distance with block transpositions. The
  # distance itself is computed by a C routine compiled through RubyInline.
  class DamerauLevenshteinMod
    # Distance between two UTF-8 strings.
    #
    # str1, str2   - strings to compare (unpacked into code points)
    # block_size   - upper bound for the transposition block size handed to
    #                the C routine
    # max_distance - the C routine stops filling the matrix once a whole row
    #                exceeds this value
    #
    # A single character compared with a longer string is answered here in
    # Ruby: for such input the C routine's block size is 0, its inner loop
    # never runs and the value it returns is not a distance. The distance is
    # the longer length, minus one when the character occurs in the longer
    # string (all other characters are insertions).
    def distance(str1, str2, block_size=2, max_distance=10)
      codes1 = str1.unpack("U*")
      codes2 = str2.unpack("U*")
      shorter, longer = codes1.size <= codes2.size ? [codes1, codes2] : [codes2, codes1]
      if shorter.size == 1 && longer.size > 1
        return longer.include?(shorter[0]) ? longer.size - 1 : longer.size
      end
      distance_utf(codes1, codes2, block_size, max_distance)
    end

    # Defines #distance_utf(_s, _t, block_size, max_distance); _s and _t are
    # arrays of integer code points.
    inline do |builder|
      builder.c "
        static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
          int i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block;
          int stop_execution = 0;
          int min = 0;
          int current_distance = 0;

          VALUE *sv = RARRAY_PTR(_s);
          VALUE *tv = RARRAY_PTR(_t);

          sl = RARRAY_LEN(_s);
          tl = RARRAY_LEN(_t);

          if (sl == 0) return INT2NUM(tl);
          if (tl == 0) return INT2NUM(sl);
          //case of lengths 1 must present or it will break further in the code
          if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);

          int s[sl];
          int t[tl];

          for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
          for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);

          sl++;
          tl++;

          //one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
          d = malloc((sizeof(int))*(sl)*(tl));
          //populate 'vertical' row starting from the 2nd position (first one is filled already)
          for(i = 0; i < tl; i++){
            d[i*sl] = i;
          }

          //fill up array with scores
          for(i = 1; i<sl; i++){
            d[i] = i;
            if (stop_execution == 1) break;
            current_distance = 10000;
            for(j = 1; j<tl; j++){

              cost = 1;
              if(s[i-1] == t[j-1]) cost = 0;

              half_sl = (sl - 1)/2;
              half_tl = (tl - 1)/2;

              block = block_size < half_sl ? block_size : half_sl;
              block = block < half_tl ? block : half_tl;

              while (block >= 1){
                int swap1 = 1;
                int swap2 = 1;
                i1 = i - (block * 2);
                j1 = j - (block * 2);
                for (k = i1; k < i1 + block; k++) {
                  if (s[k] != t[k + block]){
                    swap1 = 0;
                    break;
                  }
                }
                for (k = j1; k < j1 + block; k++) {
                  if (t[k] != s[k + block]){
                    swap2 = 0;
                    break;
                  }
                }

                del = d[j*sl + i - 1] + 1;
                ins = d[(j-1)*sl + i] + 1;
                min = del;
                if (ins < min) min = ins;
                //if (i == 2 && j==2) return INT2NUM(swap2+5);
                if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
                  transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
                  if (transp < min) min = transp;
                  block = 0;
                } else if (block == 1) {
                  subs = d[(j-1)*sl + i - 1] + cost;
                  if (subs < min) min = subs;
                }
                block--;
              }
              d[j*sl+i]=min;
              if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
            }
            if (current_distance > max_distance) {
              stop_execution = 1;
            }
          }
          distance=d[sl * tl - 1];
          if (stop_execution == 1) distance = current_distance;

          free(d);
          return INT2NUM(distance);
        }
      "
    end
  end
end
114
+
115
# Ad-hoc benchmark, executed only when this file is run directly: compares
# calling #distance (which unpacks the strings on every call) with calling
# #distance_utf on code point arrays unpacked once.
if __FILE__ == $0
  dlm = Taxamatch::DamerauLevenshteinMod.new
  codes1 = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
  codes2 = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')

  started = Time.now
  100000.times do
    dlm.distance('Cedarinia scabra Sjöstedt 1921', 'Cedarinia scabra Söjstedt 1921', 1, 10)
  end
  elapsed = Time.now - started
  puts "with unpack time: " + elapsed.to_s + ' sec'

  started = Time.now
  100000.times do
    dlm.distance_utf(codes1, codes2, 1, 10)
  end
  elapsed = Time.now - started
  puts 'utf time: ' + elapsed.to_s + ' sec'

  # single sanity check of a one-character transposition
  puts dlm.distance('sub', 'usb', 1, 10)
end
@@ -0,0 +1,55 @@
1
+ # encoding: UTF-8
2
+
3
module Taxamatch

  # String normalization used before names and authors are compared.
  module Normalizer
    # Ordered [pattern, replacement] pairs applied one after another by
    # utf8_to_ascii. Characters that are not listed are left untouched.
    TRANSLITERATIONS = [
      [/[ÀÂÅÃÄÁẤẠ]/, "A"],
      [/[ÉÈÊË]/, "E"],
      [/[ÍÌÎÏ]/, "I"],
      [/[ÓÒÔØÕÖỚỔ]/, "O"],
      [/[ÚÙÛÜ]/, "U"],
      [/[Ý]/, "Y"],
      [/Æ/, "AE"],
      [/[ČÇ]/, "C"],
      [/[ŠŞ]/, "S"],
      [/[Đ]/, "D"],
      [/Ž/, "Z"],
      [/Ñ/, "N"],
      [/Œ/, "OE"],
      [/ß/, "B"],
      [/Ķ/, "K"],
      [/[áàâåãäăãắảạậầằ]/, "a"],
      [/[éèêëĕěếệểễềẻ]/, "e"],
      [/[íìîïǐĭīĩỉï]/, "i"],
      [/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o"],
      [/[úùûüůưừựủứụ]/, "u"],
      [/[žź]/, "z"],
      [/[ýÿỹ]/, "y"],
      [/[đ]/, "d"],
      [/æ/, "ae"],
      [/[čćç]/, "c"],
      [/[ñńň]/, "n"],
      [/œ/, "oe"],
      [/[śšş]/, "s"],
      [/ř/, "r"],
      [/ğ/, "g"],
      [/Ř/, "R"]
    ]

    # Transliterates the listed accented characters and upcases the result.
    def self.normalize(string)
      ascii = utf8_to_ascii(string)
      ascii.upcase
    end

    # Normalized word reduced to the characters A-Z, 0-9 and '-'.
    def self.normalize_word(word)
      normalized = self.normalize(word)
      normalized.gsub(/[^A-Z0-9\-]/, '').strip
    end

    # Normalized author string: everything except A-Z becomes a blank, runs of
    # blanks collapse into one, outer blanks are removed.
    def self.normalize_author(string)
      letters_only = self.normalize(string).gsub(/[^A-Z]/, ' ')
      # only blanks can repeat at this point, so squeezing them is sufficient
      letters_only.squeeze(' ').strip
    end

    # Returns a copy of +string+ with the TRANSLITERATIONS applied in order.
    # Callable from outside the module, like the other module methods.
    def self.utf8_to_ascii(string)
      TRANSLITERATIONS.inject(string) do |text, (pattern, replacement)|
        text.gsub(pattern, replacement)
      end
    end

  end

end
@@ -0,0 +1,79 @@
1
+ # encoding: UTF-8
2
module Taxamatch

  # Produces a simplified phonetic ("near match") form of a name part, so
  # that similar sounding spellings collapse to the same string.
  module Phonetizer

    # Leading letter groups and the text that replaces them; at most one rule
    # (the first one that applies) is used per word.
    PREFIX_RULES = [
      ['AE', 'E'], ['CN', 'N'], ['CT', 'T'], ['CZ', 'C'], ['DJ', 'J'],
      ['EA', 'E'], ['EU', 'U'], ['GN', 'N'], ['KN', 'N'], ['MC', 'MAC'],
      ['MN', 'N'], ['OE', 'E'], ['QU', 'Q'], ['PS', 'S'], ['PT', 'T'],
      ['TS', 'S'], ['WR', 'R'], ['X', 'Z']
    ]

    # Alias of near_match.
    def self.phonetize(a_word, normalize_ending = false)
      self.near_match(a_word, normalize_ending)
    end

    # Returns the phonetic form of +a_word+.
    #
    # a_word           - the word; anything that cannot be stripped (e.g. nil)
    #                    and blank strings give ''
    # normalize_ending - when true, words whose phonetic form is longer than
    #                    4 characters also get their ending normalized
    def self.near_match(a_word, normalize_ending = false)
      a_word = a_word.respond_to?(:strip) ? a_word.strip : ''
      return '' if a_word == ''
      a_word = Taxamatch::Normalizer.normalize a_word

      rule = PREFIX_RULES.find { |prefix, _| a_word.start_with?(prefix) }
      a_word = rule[1] + a_word[rule[0].size..-1] if rule

      # The first character is kept as is; the rules below apply to the rest.
      chars = a_word.split('')
      first_char = chars[0]
      rest_chars = chars[1..-1].join('')
      rest_chars.gsub!('AE', 'I')
      rest_chars.gsub!('IA', 'A')
      rest_chars.gsub!('OE', 'I')
      rest_chars.gsub!('OI', 'A')
      rest_chars.gsub!('SC', 'S')
      rest_chars.gsub!('H', '')
      rest_chars.tr!('EOUYKZ', 'IAIICS')
      # squeeze collapses runs of the same character
      a_word = (first_char + rest_chars).squeeze

      if normalize_ending && a_word.size > 4
        a_word = self.normalize_ending(a_word)
      end
      a_word
    end

    # Returns a copy of +a_word+ with the variant endings -IS (covers -us, -ys,
    # -es), -IM (was -um) and -AS (-os) translated to -A.
    #
    # The argument itself is not modified, so frozen strings are accepted and
    # callers keep their original value.
    def self.normalize_ending(a_word)
      a_word.gsub(/IS$/, 'A').gsub(/IM$/, 'A').gsub(/AS$/, 'A')
    end

  end

end
@@ -0,0 +1,63 @@
1
+ ######################
2
+ # Tests for modified Damerau Levenshtein Distance algorithm (UTF-8 compatible)
3
+ #
4
+ # * B. Boehmer, T. Rees, Modified Damerau-Levenshtein Distance, Boehmer & Rees 2008
5
+ # * F.J. Damerau. A technique for computer detection and correction of spelling errors, Communications of the ACM, 1964
6
+ #
7
+ # Fields:
8
+ # String1|String2|maximum distance|transposition block size|expected distance
9
+ # - String1, String2
10
+ # compared strings
11
+ # - maximum distance
12
+ # stops execution of the algorithm when calculated distance exceeds the maximum distance number
13
+ # - transposition block size
14
+ # determines how many characters can be transposed. Block size 1 returns score according to Damerau-Levenshtein algorithm
15
+ # - expected distance
16
+ # resulting distance that has to be achieved by the algorithm
17
+ # Note: algorithm does not try to normalize or interpret strings in any way.
18
+ ######################
19
+
20
+ #it should recognize the exact match
21
+ Pomatomus|Pomatomus|10|1|0
22
+
23
+ #it should not try to normalize incoming strings
24
+ Pomatomus|Pomatomus|10|1|1
25
+ Pomatomus|pomatomus|10|1|1
26
+
27
+ #it should calculate special cases
28
+ Pomatomus||10|1|9
29
+ |Pomatomus|10|1|9
30
+ P|p|10|1|1
31
+ #TODO: one letter vs longer string generates a big negative number
32
+ #L|Linneaus|10|1|7
33
+
34
+
35
+ #it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
36
+ Pomatomus|Pomatomux|10|1|1
37
+ Pmatomus|Pomatomus|10|1|1
38
+ Pomatomus|Pmatomus|10|1|1
39
+ Rpmatomus|Pomatomus|10|1|2
40
+ Pommtomus|Pomatomus|10|1|1
41
+ Potamomus|Pomatomus|10|1|2
42
+ Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
43
+ Pomatomus|oPmatomus|10|1|1
44
+ Pomatomus|Pomatomsu|10|1|1
45
+ Pomtaomus|Pomatomus|10|1|1
46
+ Pomatoums|Pomatomus|10|1|1
47
+ Potamomus|Pomatomus|10|1|2
48
+ Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|2|1
49
+
50
+ #it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size >= 2)
51
+ serrulatus|serratulus|10|2|2
52
+ Pomatomus|Poomumats|10|3|3
53
+ vesiculosus|vecusilosus|10|1|4
54
+ vesiculosus|vecusilosus|10|2|2
55
+ trimerophyton|mertriophyton|10|1|6
56
+ trimerophyton|mertriophyton|10|3|3
57
+
58
+ #it should stop trying if distance exceeds maximum allowed distance
59
+ Pxxxxomus|Pomatomus|10|1|4
60
+ Pxxxxomus|Pomatomus|2|1|3
61
+
62
+ #
63
+ PUNCTATA|PUNCTATA|10|1|0
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --colour
@@ -0,0 +1,28 @@
1
+ begin
2
+ require 'spec'
3
+ rescue LoadError
4
+ require 'rubygems' unless ENV['NO_RUBYGEMS']
5
+ gem 'rspec'
6
+ require 'spec'
7
+ end
8
+
9
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
10
+ require 'taxamatch_rb'
11
+
12
# Reads a pipe-delimited test data file and yields once per line.
#
# file       - path of the data file
# fields_num - number of '|'-separated fields a data line must have
#
# Yields the Array of fields for data lines; a trailing '#' comment and
# surrounding whitespace are removed from the last field. Yields nil for
# comment lines and for lines with a different number of fields.
# The file is closed when reading finishes, also when the block raises.
def read_test_file(file, fields_num)
  File.open(file) do |f|
    f.each do |line|
      fields = line.split("|")
      if line !~ /^\s*#/ && fields.size == fields_num
        fields[-1] = fields[-1].split('#')[0].strip
        yield(fields)
      else
        yield(nil)
      end
    end
  end
end
24
+
25
# Builds the Hash form of a name part for the specs: the original string, its
# normalized form and the phonetic form derived from the normalized string.
def make_taxamatch_hash(string)
  upcased = Taxamatch::Normalizer.normalize(string)
  phonetic = Taxamatch::Phonetizer.near_match(upcased)
  { :epitheton => string, :normalized => upcased, :phonetized => phonetic }
end
@@ -0,0 +1,254 @@
1
+ # encoding: UTF-8
2
+ require File.dirname(__FILE__) + '/spec_helper.rb'
3
+
4
+ describe 'DamerauLevenshteinMod' do
5
+ it 'should get tests' do
6
+ read_test_file(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt', 5) do |y|
7
+ dl = Taxamatch::DamerauLevenshteinMod.new
8
+ if y
9
+ res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
10
+ puts y if res != y[4].to_i
11
+ res.should == y[4].to_i
12
+ end
13
+ end
14
+ end
15
+ end
16
+
17
+ describe 'Atomizer' do
18
+ before(:all) do
19
+ @parser = Taxamatch::Atomizer.new
20
+ end
21
+
22
+ it 'should parse uninomials' do
23
+ @parser.parse('Betula').should == {:all_authors=>[], :all_years=>[], :uninomial=>{:epitheton=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[], :normalized_authors=>[]}}
24
+ @parser.parse('Ærenea Lacordaire, 1872').should == {:all_authors=>["LACORDAIRE"], :all_years=>["1872"], :uninomial=>{:epitheton=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA", :authors=>["Lacordaire"], :years=>["1872"], :normalized_authors=>["LACORDAIRE"]}}
25
+ end
26
+
27
+ it 'should parse binomials' do
28
+ @parser.parse('Leœptura laetifica Dow, 1913').should == {:all_authors=>["DOW"], :all_years=>["1913"], :genus=>{:epitheton=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:epitheton=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :authors=>["Dow"], :years=>["1913"], :normalized_authors=>["DOW"]}}
29
+ end
30
+
31
+ it 'should parse trinomials' do
32
+ @parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == {:all_authors=>["BANKER", "D HALL", "D E STUNTZ"], :all_years=>["1972"], :genus=>{:epitheton=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:epitheton=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>[], :years=>[], :normalized_authors=>[]}, :infraspecies=>[{:epitheton=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>["1972"], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]}
33
+ end
34
+ end
35
+
36
+
37
+ describe 'Taxamatch::Normalizer' do
38
+ it 'should normalize strings' do
39
+ Taxamatch::Normalizer.normalize('abcd').should == 'ABCD'
40
+ Taxamatch::Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
41
+ Taxamatch::Normalizer.normalize('Ærenea').should == 'AERENEA'
42
+ Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
43
+ Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
44
+ end
45
+
46
+ it 'should normalize words' do
47
+ Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
48
+ end
49
+ end
50
+
51
+ describe 'Taxamatch::Base' do
52
+ before(:all) do
53
+ @tm = Taxamatch::Base.new
54
+ end
55
+
56
+ it 'should get txt tests' do
57
+ dl = Taxamatch::DamerauLevenshteinMod.new
58
+ read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
59
+ if y
60
+ y[2] = y[2] == 'true' ? true : false
61
+ res = @tm.taxamatch(y[0], y[1], false)
62
+ puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
63
+ res['match'].should == y[2]
64
+ res['edit_distance'].should == y[3].to_i
65
+ end
66
+ end
67
+ end
68
+
69
+ it 'should work with names that cannot be parsed' do
70
+ res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921','Quadraspidiotus ostreaeformis Curtis)')
71
+ res = false
72
+ end
73
+
74
+ it 'should compare genera' do
75
+ #edit distance 1 always match
76
+ g1 = make_taxamatch_hash 'Plantago'
77
+ g2 = make_taxamatch_hash 'Plantagon'
78
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
79
+ #edit_distance above threshold does not match
80
+ g1 = make_taxamatch_hash 'Plantago'
81
+ g2 = make_taxamatch_hash 'This shouldnt match'
82
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
83
+ #phonetic_match matches
84
+ g1 = make_taxamatch_hash 'Plantagi'
85
+ g2 = make_taxamatch_hash 'Plantagy'
86
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
87
+ #distance 1 in first letter also matches
88
+ g1 = make_taxamatch_hash 'Xantheri'
89
+ g2 = make_taxamatch_hash 'Pantheri'
90
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
91
+ #phonetic match trumps everything
92
+ g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
93
+ g2 = make_taxamatch_hash 'Zanthery'
94
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
95
+ #same first letter and distance 2 should match
96
+ g1 = make_taxamatch_hash 'Xantherii'
97
+ g2 = make_taxamatch_hash 'Xantherrr'
98
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
99
+ #First letter is the same and distance is 3 should match, no phonetic match
100
+ g1 = make_taxamatch_hash 'Xantheriii'
101
+ g2 = make_taxamatch_hash 'Xantherrrr'
102
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
103
+ #Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
104
+ g1 = make_taxamatch_hash 'Xant'
105
+ g2 = make_taxamatch_hash 'Xanthe'
106
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
107
+ #Should not match if edit distance > 3 and no phonetic match
108
+ g1 = make_taxamatch_hash 'Xantheriiii'
109
+ g2 = make_taxamatch_hash 'Xantherrrrr'
110
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
111
+ end
112
+
113
+ it 'should compare species' do
114
+ #Exact match
115
+ s1 = make_taxamatch_hash 'major'
116
+ s2 = make_taxamatch_hash 'major'
117
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
118
+ #Phonetic match always works
119
+ s1 = make_taxamatch_hash 'xanteriiiiiiii'
120
+ s2 = make_taxamatch_hash 'zantereeeeeeee'
121
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 5}
122
+ #Phonetic match works with different endings
123
+ s1 = make_taxamatch_hash 'majorum'
124
+ s2 = make_taxamatch_hash 'majoris'
125
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
126
+ #Distance 4 matches if first 3 chars are the same
127
+ s1 = make_taxamatch_hash 'majorrrrr'
128
+ s2 = make_taxamatch_hash 'majoraaaa'
129
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
130
+ #Should not match if distance is 4 and first 3 chars are not the same
131
+ s1 = make_taxamatch_hash 'majorrrrr'
132
+ s2 = make_taxamatch_hash 'marorraaa'
133
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
134
+ #Distance 2 or 3 matches if first 1 char is the same
135
+ s1 = make_taxamatch_hash 'morrrr'
136
+ s2 = make_taxamatch_hash 'moraaa'
137
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
138
+ #Should not match if Distance 2 or 3 and first 1 char is not the same
139
+ s1 = make_taxamatch_hash 'morrrr'
140
+ s2 = make_taxamatch_hash 'torraa'
141
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
142
+ #Distance 1 will match anywhere
143
+ s1 = make_taxamatch_hash 'major'
144
+ s2 = make_taxamatch_hash 'rajor'
145
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
146
+ #Will not match if distance is 3 and length is less than twice the edit distance
147
+ s1 = make_taxamatch_hash 'marrr'
148
+ s2 = make_taxamatch_hash 'maaaa'
149
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
150
+ end
151
+
152
+ it 'should match mathes' do
153
+ #No trouble case
154
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
155
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
156
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
157
+ #Will not match if either genus or sp. epithet don't match
158
+ gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
159
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
160
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
161
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
162
+ smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
163
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
164
+ #Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
165
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
166
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
167
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
168
+ #Should not have phonetic match if one of the components does not match phonetically
169
+ gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
170
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
171
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
172
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
173
+ smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
174
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
175
+ #edit distance should be equal to the sum of edit distances
176
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
177
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
178
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
179
+ end
180
+
181
+ describe 'Taxamatch::Authmatch' do
182
+ before(:all) do
183
+ @am = Taxamatch::Authmatch
184
+ end
185
+
186
+ it 'should calculate score' do
187
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
188
+ res.should == 90
189
+ res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
190
+ res.should == 0
191
+ #found all authors, same year
192
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
193
+ res.should == 100
194
+ #all authors, 1 year diff
195
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
196
+ res.should == 54
197
+ #year is not counted in
198
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
199
+ res.should == 94
200
+ #found all authors on one side, same year
201
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
202
+ res.should == 91
203
+ #found all authors on one side, 1 year diff
204
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
205
+ res.should == 51
206
+ #found all authors on one side, year does not count
207
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
208
+ res.should == 90
209
+ #found some authors
210
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
211
+ res.should == 67
212
+ #if year does not match or not present no match for previous case
213
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
214
+ res.should == 0
215
+ end
216
+
217
+ it 'should compare years' do
218
+ @am.compare_years([1882],[1880]).should == 2
219
+ @am.compare_years([1882],[]).should == nil
220
+ @am.compare_years([],[]).should == 0
221
+ @am.compare_years([1788,1798], [1788,1798]).should be_nil
222
+ end
223
+
224
+ it 'should remove duplicate authors' do
225
+ #Lin submatches Linnaeus and its size 3 is big enough to remove Linnaeus
226
+ #Muller is identical
227
+ res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
228
+ res.should == [[], []]
229
+ #same in different order
230
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
231
+ res.should == [[], []]
232
+ #auth Li submatches Linnaeus, but Li size is less than the 3 required to remove Linnaeus
233
+ res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
234
+ res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
235
+ #fuzzy match
236
+ res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
237
+ res.should == [["Dem"], ["Stepanov"]]
238
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
239
+ res.should == [['Linnaeus', 'Muller'], ['Kenn']]
240
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
241
+ res.should == [[],['Kurtz']]
242
+ end
243
+
244
+ it 'should fuzzy match authors' do
245
+ #TODO: fix the bug revealed by this test
246
+ # res = @am.fuzzy_match_authors('L', 'Muller')
247
+ # res.should be_false
248
+ end
249
+
250
+ end
251
+
252
+ end
253
+
254
+
@@ -0,0 +1,45 @@
1
+ ###
2
+ #
3
+ # Tests for string comparison by taxamatch algorithm
4
+ # name1|name2|match|edit_distance
5
+ #
6
+ ##
7
+ # Comparing uninomials
8
+ Pomatomus|Pomatomas|true|1
9
+ Pomatomus L.|Pomatomas Linn.|true|1
10
+ Pomatomus Ber|Pomatomas Linn|false|1
11
+ Pomatomus L. 1753|Pomatomus Linn. 1800|false|0
12
+
13
+ ## additional authorship should match
14
+ Puma concolor|Puma concolor L.|true|0
15
+ #
16
+ ## one-letter misspelling in species epithet should match
17
+ Puma concolor|Puma cancolor|true|1
18
+ #
19
+ Pomatomus saltatrix|Pomatomus saltratix|true|2
20
+ Pomatomus saltator|Pomatomus saltatrix|true|3
21
+ #
22
+ Loligo pealeii|Loligo plei|false|3
23
+ #
24
+ ## different authors should not match
25
+ Puma concolor Linnaeus|Puma concolor Kurtz|false|0
26
+ #
27
+ ##real life examples
28
+ Biatora borealis|Bactra borealis Diakonoff 1964|false|3
29
+ #
30
+ Homo sapien|Homo sapiens|true|1
31
+ Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true|1
32
+ Homo sapiens Mozzherin|Homo sapiens Linneaus|false|0
33
+ #
34
+ Quinqueloculina punctata|Quinqueloculina punctata d'Orbigny 1905|true|0
35
+ Pomatomus saltator (Linnaeus, 1766)|Pomatomus saltatrix (Linnaeus, 1766)|true|3
36
+ #
37
+ #Trinomial names
38
+ Homo sapiens stupidus|Homo spiens stupidus|true|1
39
+ Pomatomus saltator saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|true|1
40
+ Pomatomus saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|false|5
41
+ Pomatomus saltator saltator saltatorische|Pomatomus saltator soltator|true|1
42
+
43
+
44
+
45
+
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: taxamatch_rb
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 6
8
+ - 0
9
+ version: 0.6.0
10
+ platform: ruby
11
+ authors:
12
+ - Dmitry Mozzherin
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-19 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: RubyInline
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :runtime
31
+ version_requirements: *id001
32
+ - !ruby/object:Gem::Dependency
33
+ name: biodiversity
34
+ prerelease: false
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ segments:
40
+ - 0
41
+ - 5
42
+ - 13
43
+ version: 0.5.13
44
+ type: :runtime
45
+ version_requirements: *id002
46
+ description: This gem implements algorithms for fuzzy matching scientific names developed by Tony Rees
47
+ email: dmozzherin@eol.org
48
+ executables: []
49
+
50
+ extensions: []
51
+
52
+ extra_rdoc_files:
53
+ - LICENSE
54
+ - README.rdoc
55
+ files:
56
+ - README.rdoc
57
+ - lib/taxamatch_rb.rb
58
+ - lib/taxamatch_rb/atomizer.rb
59
+ - lib/taxamatch_rb/authmatch.rb
60
+ - lib/taxamatch_rb/damerau_levenshtein_mod.rb
61
+ - lib/taxamatch_rb/normalizer.rb
62
+ - lib/taxamatch_rb/phonetizer.rb
63
+ - spec/damerau_levenshtein_mod_test.txt
64
+ - spec/spec.opts
65
+ - spec/spec_helper.rb
66
+ - spec/taxamatch_rb_spec.rb
67
+ - spec/taxamatch_test.txt
68
+ - LICENSE
69
+ has_rdoc: true
70
+ homepage: http://github.com/dimus/taxamatch_rb
71
+ licenses: []
72
+
73
+ post_install_message:
74
+ rdoc_options:
75
+ - --charset=UTF-8
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ segments:
83
+ - 0
84
+ version: "0"
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ segments:
90
+ - 0
91
+ version: "0"
92
+ requirements: []
93
+
94
+ rubyforge_project:
95
+ rubygems_version: 1.3.6
96
+ signing_key:
97
+ specification_version: 3
98
+ summary: Implementation of Tony Rees Taxamatch algorithms
99
+ test_files:
100
+ - spec/spec_helper.rb
101
+ - spec/taxamatch_rb_spec.rb