taxamatch_rb 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Dmitry Mozzherin
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,61 @@
1
+ = taxamatch_rb
2
+
3
+ Taxamatch_Rb is a ruby implementation of Taxamatch algorithms developed by Tony Rees: http://www.cmar.csiro.au/datacentre/taxamatch.htm
4
+
5
+ The purpose of the Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings, to find out whether they actually refer to the same scientific name.
6
+
7
+ require 'taxamatch_rb'
8
+ tm = Taxamatch::Base.new
9
+ tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
10
+ tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
11
+ tm.taxamatch('Homo sapiens Mozzherin', 'Homo sapiens Linnaeus') #returns false
12
+
13
+ Taxamatch_Rb is compatible with Ruby versions 1.8.7, 1.9.1 and higher.
14
+
15
+ == Installation
16
+
17
+ sudo gem install dimus-taxamatch_rb --source http://gems.github.com
18
+
19
+ or
20
+ sudo gem sources -a http://gems.github.com #(you only have to do this once)
21
+ sudo gem install dimus-taxamatch_rb
22
+
23
+ == Usage
24
+
25
+ require 'rubygems' #not needed for ruby > 1.9.1
26
+ require 'taxamatch_rb'
27
+
28
+ tm = Taxamatch::Base.new
29
+
30
+ * compare full scientific names
31
+
32
+ tm.taxamatch('Hommo sapiens L.', 'Homo sapiens Linnaeus')
33
+
34
+ * preparse names for the matching (necessary for large databases of scientific names)
35
+
36
+ p = Taxamatch::Atomizer.new
37
+ parsed_name1 = p.parse('Monacanthus fronticinctus Günther 1867 sec. Eschmeyer 2004')
38
+ parsed_name2 = p.parse('Monacanthus fronticinctus (Gunther, 1867)')
39
+
40
+ * compare preparsed names
41
+
42
+ tm.taxamatch_preparsed(parsed_name1, parsed_name2)
43
+
44
+ * compare genera
45
+
46
+ tm.match_genera('Monacanthus', 'MONOCANTUS')
47
+
48
+ * compare species
49
+
50
+ tm.match_species('fronticinctus', 'frontecinctus')
51
+
52
+ * compare authors and years
53
+
54
+ Taxamatch::Authmatch.authmatch(['Linnaeus'], ['L','Muller'], [1786], [1787])
55
+
56
+
57
+ You can find more examples in the spec section of the code.
58
+
59
+ == Copyright
60
+
61
+ Copyright (c) 2009 Dmitry Mozzherin. See LICENSE for details.
@@ -0,0 +1,117 @@
1
+ # encoding: UTF-8
2
+ $:.unshift(File.dirname(__FILE__)) unless
3
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
4
+ # $:.unshift('taxamatch_rb')
5
+ require 'taxamatch_rb/damerau_levenshtein_mod'
6
+ require 'taxamatch_rb/atomizer'
7
+ require 'taxamatch_rb/normalizer'
8
+ require 'taxamatch_rb/phonetizer'
9
+ require 'taxamatch_rb/authmatch'
10
+ require 'ruby-debug'
11
+
12
+ $KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
13
+
module Taxamatch

  # Fuzzy comparison of scientific names. Names are split into genus, species,
  # infraspecies and authorship parts; every part is compared by edit distance
  # and by phonetic form, and the partial results are combined into one hash:
  #
  #   { 'match' => true/false, 'edit_distance' => Integer, 'phonetic_match' => true/false }
  class Base

    def initialize
      @parser = Taxamatch::Atomizer.new
      @dlm = Taxamatch::DamerauLevenshteinMod.new
    end

    # Takes two scientific names and tells whether they match.
    #
    # With return_boolean (the default) the value of the 'match' key is
    # returned; otherwise the whole result hash is returned. In both modes the
    # result is nil when the names could not be compared at all.
    def taxamatch(str1, str2, return_boolean = true)
      preparsed_1 = @parser.parse(str1)
      preparsed_2 = @parser.parse(str2)
      match = begin
        taxamatch_preparsed(preparsed_1, preparsed_2)
      rescue StandardError
        # Best effort by design: a name whose parts cannot be compared counts
        # as "no result" instead of aborting the caller.
        nil
      end
      return_boolean && match ? match['match'] : match
    end

    # Takes two hashes of parsed scientific names (as produced by
    # Taxamatch::Atomizer#parse), analyses them and returns the result hash.
    # Useful when the name strings are preparsed. Returns nil when either
    # name is nil (unparsable) or when the two names are of different kinds.
    def taxamatch_preparsed(preparsed_1, preparsed_2)
      return nil unless preparsed_1 && preparsed_2
      result = nil
      result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
      result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
      if result && result['match']
        # An authorship score of 0 vetoes an otherwise matching pair.
        result['match'] = match_authors(preparsed_1, preparsed_2) != 0
      end
      result
    end

    # Uninomials are compared with the rules used for genera.
    def match_uninomial(preparsed_1, preparsed_2)
      match_genera(preparsed_1[:uninomial], preparsed_2[:uninomial])
    end

    # Compares names that have a genus and a species and, optionally, an
    # infraspecies. The returned hash additionally carries a 'score' key:
    # 1 minus the edit distance relative to the average length of the two
    # names, computed with float arithmetic.
    def match_multinomial(preparsed_1, preparsed_2)
      gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
      sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
      total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size +
                     preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
      infrasp_1 = preparsed_1[:infraspecies]
      infrasp_2 = preparsed_2[:infraspecies]
      if infrasp_1 && infrasp_2
        infrasp_match = match_species(infrasp_1[0], infrasp_2[0])
        total_length += infrasp_1[0][:epitheton].size + infrasp_2[0][:epitheton].size
        match_hash = match_matches(gen_match, sp_match, infrasp_match)
      elsif infrasp_1 || infrasp_2
        # Only one of the names has an infraspecies: never a match.
        match_hash = { 'match' => false, 'edit_distance' => 5, 'phonetic_match' => false }
        total_length += (infrasp_1 || infrasp_2)[0][:epitheton].size
      else
        match_hash = match_matches(gen_match, sp_match)
      end
      # The score used to be computed with integer division and then thrown
      # away (Hash#merge returns a copy); it is now stored in the result.
      match_hash['score'] = total_length > 0 ? 1 - match_hash['edit_distance'] / (total_length / 2.0) : 0.0
      match_hash
    end

    # Compares two genus hashes (keys :normalized and :phonetized).
    def match_genera(genus1, genus2)
      min_length = [genus1[:normalized].size, genus2[:normalized].size].min
      ed = @dlm.distance(genus1[:normalized], genus2[:normalized], 1, 3) #TODO put block = 2
      # Float division: with integer division this cut-off almost never fired.
      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed.to_f / min_length > 0.2
      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]

      # For distances of 2 or more the first letters must agree. The letters
      # are read from the normalized strings; indexing the hashes themselves
      # with 0 compared nil with nil and was always true.
      match = ed <= 3 && (min_length > ed * 2) &&
              (ed < 2 || genus1[:normalized][0] == genus2[:normalized][0])
      {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
    end

    # Compares two species (or infraspecies) hashes. The :phonetized values of
    # both arguments are replaced by their ending-normalized forms.
    def match_species(sp1, sp2)
      sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending(sp1[:phonetized])
      sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending(sp2[:phonetized])
      min_length = [sp1[:normalized].size, sp2[:normalized].size].min
      ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 1, 4) #TODO put block 4
      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed.to_f / min_length > 0.3334
      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]

      # ed >= 2 requires the same first letter, ed == 4 the same first three.
      match = ed <= 4 && (min_length >= ed * 2) &&
              (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) &&
              (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
      {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
    end

    # Returns the authorship score computed by Taxamatch::Authmatch.authmatch
    # from the collected authors and years of both names.
    def match_authors(preparsed_1, preparsed_2)
      au1 = preparsed_1[:all_authors]
      au2 = preparsed_2[:all_authors]
      yr1 = preparsed_1[:all_years]
      yr2 = preparsed_2[:all_years]
      Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
    end

    # Folds genus, species and optional infraspecies results into one hash.
    # The species hash is updated in place and returned.
    def match_matches(genus_match, species_match, infraspecies_match = nil)
      match = species_match
      if infraspecies_match
        match['edit_distance'] += infraspecies_match['edit_distance']
        match['match'] &&= infraspecies_match['match']
        match['phonetic_match'] &&= infraspecies_match['phonetic_match']
      end
      match['edit_distance'] += genus_match['edit_distance']
      # Total distance allowed: 4 for binomials, 6 for trinomials.
      match['match'] = false if match['edit_distance'] > (infraspecies_match ? 6 : 4)
      match['match'] &&= genus_match['match']
      match['phonetic_match'] &&= genus_match['phonetic_match']
      match
    end

  end

end
@@ -0,0 +1,82 @@
1
+ # encoding: UTF-8
2
+ require 'biodiversity'
3
+
module Taxamatch

  # Splits a scientific name string into the parts used for matching. The
  # string is parsed with ScientificNameParser and every name part (uninomial,
  # genus, species, infraspecies) is turned into a hash with the keys
  # :epitheton, :normalized, :phonetized, :authors, :years and
  # :normalized_authors.
  class Atomizer
    # Raw :scientificName hash returned by the parser for the last parsed name.
    attr_reader :parsed_raw

    def initialize
      @parser = ScientificNameParser.new
      @parsed_raw = nil
      @res = {}
    end

    # Parses +name+ and returns a hash holding :all_authors, :all_years and one
    # entry per name part that was found. Returns nil when the parser could not
    # parse the name or no name part was found.
    def parse(name)
      @res = {:all_authors => [], :all_years => []}
      @parsed_raw = @parser.parse(name)[:scientificName]
      organize_results
    end

    protected

    # Fills @res from the first entry of the parser's :details list.
    def organize_results
      return nil unless @parsed_raw[:parsed]
      details = @parsed_raw[:details][0]
      process_node(:uninomial, details[:uninomial])
      process_node(:genus, details[:genus])
      process_node(:species, details[:species], true)
      process_infraspecies(details[:infraspecies])
      @res[:all_authors] = @res[:all_authors].uniq.map { |a| Taxamatch::Normalizer.normalize(a) }
      @res[:all_years].uniq!
      # :all_authors and :all_years are always present; anything beyond those
      # two keys means at least one name part was found.
      @res.keys.size > 2 ? @res : nil
    end

    # Stores the atomized form of a single name part under +name+.
    def process_node(name, node, is_species = false)
      return unless node
      @res[name] = atomize(node, is_species)
    end

    # Stores the atomized forms of all infraspecies parts, in parser order.
    def process_infraspecies(node)
      return unless node
      @res[:infraspecies] = node.map { |infr| atomize(infr, true) }
    end

    # Builds the hash describing one name part; +is_species+ is handed to the
    # phonetizer as its ending-normalization flag.
    def atomize(node, is_species)
      atom = {
        :epitheton => node[:epitheton],
        :normalized => Taxamatch::Normalizer.normalize(node[:epitheton]),
        :phonetized => Taxamatch::Phonetizer.near_match(node[:epitheton], is_species)
      }
      get_authors_years(node, atom)
      atom
    end

    # Collects authors and years of the basionym and combination author teams
    # (including their ex-author teams) into +res+, and appends them to the
    # name-wide :all_authors / :all_years lists.
    def get_authors_years(node, res)
      res[:authors] = []
      res[:years] = []
      [:basionymAuthorTeam, :combinationAuthorTeam].each do |team_key|
        team = node[team_key]
        next unless team
        add_team(team, res)
        add_team(team[:exAuthorTeam], res) if team[:exAuthorTeam]
      end
      res[:authors].uniq!
      res[:normalized_authors] = res[:authors].map { |a| Taxamatch::Normalizer.normalize_author(a) }
      res[:years].uniq!
      @res[:all_authors] += res[:normalized_authors]
      @res[:all_years] += res[:years]
    end

    # Appends the authors and, when present, the year of one author team.
    def add_team(team, res)
      res[:authors] += team[:author]
      res[:years] << team[:year] if team[:year]
    end

  end
end
82
+
@@ -0,0 +1,89 @@
1
+ # Algorithms for Taxamatch::Authmatch are developed by Patrick Leary of uBio and EOL fame
2
+
module Taxamatch
  # Compares the authorship (authors and years) of two scientific names.
  class Authmatch

    # Returns an Integer score: 0 means the authorships do not match, any
    # other value (51..100) is the confidence of the match.
    #
    # authors1, authors2:: arrays of author strings
    # years1, years2::     arrays of years (anything responding to to_i)
    def self.authmatch(authors1, authors2, years1, years2)
      unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
      year_difference = compare_years(years1, years2)
      get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
    end

    # Turns the authors left unmatched on each side and the year difference
    # into a score. Scores of 50 or less are reported as 0.
    def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
      count_before = authors1.size + authors2.size
      count_after = unique_authors1.size + unique_authors2.size
      score =
        if count_after == 0
          # every author found a partner
          year_score(year_diff, 100, 54, 94)
        elsif unique_authors1.empty? || unique_authors2.empty?
          # one author list is fully covered by the other
          year_score(year_diff, 91, 51, 90)
        elsif year_diff.nil? || year_diff == 0
          # unmatched authors on both sides: percentage of matched authors
          ((1 - count_after.to_f / count_before) * 100).round
        else
          0
        end
      score > 50 ? score : 0
    end

    # Picks a score by year difference: +unknown+ when the years could not be
    # compared, +same+ for identical years, +adjacent+ for a difference of one
    # year and 0 for anything larger.
    def self.year_score(year_diff, same, adjacent, unknown)
      return unknown if year_diff.nil?
      case year_diff
      when 0 then same
      when 1 then adjacent
      else 0
      end
    end

    # Returns copies of both author lists with the authors that have a partner
    # in the other list removed. Authors pair up when they are equal, when one
    # is an abbreviation (prefix) of the other, or when they are a fuzzy match.
    def self.remove_duplicate_authors(authors1, authors2)
      unique_authors1 = authors1.dup
      unique_authors2 = authors2.dup
      authors1.each do |au1|
        authors2.each do |au2|
          au1_match = au1 == au2[0...au1.size] # au1 equals or abbreviates au2
          au2_match = au1[0...au2.size] == au2 # au2 equals or abbreviates au1
          if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
            unique_authors1.delete au1
            unique_authors2.delete au2
          elsif au1_match
            # an abbreviation shorter than 3 letters only removes itself
            unique_authors1.delete au1
          elsif au2_match
            unique_authors2.delete au2
          elsif au1.size > 1 && au2.size > 1 && fuzzy_match_authors(au1, au2)
            #TODO: masking a bug in damerau levenshtein mod which appears comparing 1 letter to a longer string
            unique_authors1.delete au1
            unique_authors2.delete au2
          end
        end
      end
      [unique_authors1, unique_authors2]
    end

    # True when the two author strings are within edit distance 3, the shorter
    # one is longer than twice the distance and, for distances of 2 or more,
    # both start with the same letter.
    def self.fuzzy_match_authors(author1, author2)
      shortest = [author1.size, author2.size].min
      ed = dlm.distance(author1, author2, 2, 3) #get around a bug in C code, but it really has to be fixed
      ed <= 3 && shortest > ed * 2 && (ed < 2 || author1[0] == author2[0])
    end

    # Shared distance calculator, created on first use instead of once per
    # compared author pair.
    def self.dlm
      @dlm ||= Taxamatch::DamerauLevenshteinMod.new
    end

    # Returns 0 when neither name has a year, the absolute difference when
    # both names have exactly one year, and nil otherwise.
    def self.compare_years(years1, years2)
      return 0 if years1.empty? && years2.empty?
      return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
      nil
    end
  end
end
@@ -0,0 +1,139 @@
1
+ # encoding: UTF-8
2
+ require 'rubygems'
3
+ require 'inline'
4
+ require 'time'
5
+ module Taxamatch
6
+
7
+ class DamerauLevenshteinMod
# Returns the edit distance between str1 and str2 as computed by distance_utf,
# the C function declared in the inline block below.
# Both strings are unpacked with "U*" into arrays of integer code points, so
# the comparison is made per UTF-8 character rather than per byte.
# block_size and max_distance are passed on unchanged: in the C code
# block_size caps the length of a transposed block, and the calculation stops
# early once the smallest value of a row exceeds max_distance.
8
+ def distance(str1, str2, block_size=2, max_distance=10)
9
+ # puts str1.unpack("U*");
10
+ distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
11
+ end
12
+
13
+ inline do |builder|
14
+ builder.c "
15
+ static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
16
+ int i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block;
17
+ int stop_execution = 0;
18
+ int min = 0;
19
+ int current_distance = 0;
20
+
21
+ VALUE *sv = RARRAY_PTR(_s);
22
+ VALUE *tv = RARRAY_PTR(_t);
23
+
24
+ sl = RARRAY_LEN(_s);
25
+ tl = RARRAY_LEN(_t);
26
+
27
+ if (sl == 0) return INT2NUM(tl);
28
+ if (tl == 0) return INT2NUM(sl);
29
+ //case of lengths 1 must present or it will break further in the code
30
+ if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);
31
+
32
+ int s[sl];
33
+ int t[tl];
34
+
35
+ for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
36
+ for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);
37
+
38
+ sl++;
39
+ tl++;
40
+
41
+ //one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
42
+ d = malloc((sizeof(int))*(sl)*(tl));
43
+ //populate 'vertical' row starting from the 2nd position (first one is filled already)
44
+ for(i = 0; i < tl; i++){
45
+ d[i*sl] = i;
46
+ }
47
+
48
+ //fill up array with scores
49
+ for(i = 1; i<sl; i++){
50
+ d[i] = i;
51
+ if (stop_execution == 1) break;
52
+ current_distance = 10000;
53
+ for(j = 1; j<tl; j++){
54
+
55
+ cost = 1;
56
+ if(s[i-1] == t[j-1]) cost = 0;
57
+
58
+ half_sl = (sl - 1)/2;
59
+ half_tl = (tl - 1)/2;
60
+
61
+ block = block_size < half_sl ? block_size : half_sl;
62
+ block = block < half_tl ? block : half_tl;
63
+
64
+ while (block >= 1){
65
+ int swap1 = 1;
66
+ int swap2 = 1;
67
+ i1 = i - (block * 2);
68
+ j1 = j - (block * 2);
69
+ for (k = i1; k < i1 + block; k++) {
70
+ if (s[k] != t[k + block]){
71
+ swap1 = 0;
72
+ break;
73
+ }
74
+ }
75
+ for (k = j1; k < j1 + block; k++) {
76
+ if (t[k] != s[k + block]){
77
+ swap2 = 0;
78
+ break;
79
+ }
80
+ }
81
+
82
+ del = d[j*sl + i - 1] + 1;
83
+ ins = d[(j-1)*sl + i] + 1;
84
+ min = del;
85
+ if (ins < min) min = ins;
86
+ //if (i == 2 && j==2) return INT2NUM(swap2+5);
87
+ if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
88
+ transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
89
+ if (transp < min) min = transp;
90
+ block = 0;
91
+ } else if (block == 1) {
92
+ subs = d[(j-1)*sl + i - 1] + cost;
93
+ if (subs < min) min = subs;
94
+ }
95
+ block--;
96
+ }
97
+ d[j*sl+i]=min;
98
+ if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
99
+ }
100
+ if (current_distance > max_distance) {
101
+ stop_execution = 1;
102
+ }
103
+ }
104
+ distance=d[sl * tl - 1];
105
+ if (stop_execution == 1) distance = current_distance;
106
+
107
+ free(d);
108
+ return INT2NUM(distance);
109
+ }
110
+ "
111
+ end
112
+ end
113
+ end
114
+
# Small benchmark, executed only when this file is run directly: times
# 100000 distance calculations through the String interface and through the
# code point array interface, then prints one sample distance.
if __FILE__ == $0
  a = Taxamatch::DamerauLevenshteinMod.new
  s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
  t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')

  start = Time.now
  100000.times do
    a.distance('Cedarinia scabra Sjöstedt 1921', 'Cedarinia scabra Söjstedt 1921', 1, 10)
  end
  puts "with unpack time: #{Time.now - start} sec"

  start = Time.now
  100000.times do
    a.distance_utf(s, t, 1, 10)
  end
  puts "utf time: #{Time.now - start} sec"

  puts a.distance('sub', 'usb', 1, 10)
end
@@ -0,0 +1,55 @@
1
+ # encoding: UTF-8
2
+
module Taxamatch

  # Folds accented letters and ligatures to plain ASCII and upper-cases
  # name strings so that they can be compared.
  module Normalizer
    # [characters, ASCII replacement] pairs used by utf8_to_ascii.
    TRANSLITERATIONS = [
      ['ÀÂÅÃÄÁẤẠ', 'A'],
      ['ÉÈÊË', 'E'],
      ['ÍÌÎÏ', 'I'],
      ['ÓÒÔØÕÖỚỔ', 'O'],
      ['ÚÙÛÜ', 'U'],
      ['Ý', 'Y'],
      ['Æ', 'AE'],
      ['ČÇ', 'C'],
      ['ŠŞ', 'S'],
      ['Đ', 'D'],
      ['Ž', 'Z'],
      ['Ñ', 'N'],
      ['Œ', 'OE'],
      ['ß', 'B'],
      ['Ķ', 'K'],
      ['áàâåãäăãắảạậầằ', 'a'],
      ['éèêëĕěếệểễềẻ', 'e'],
      ['íìîïǐĭīĩỉï', 'i'],
      ['óòôøõöŏỏỗộơọỡốơồờớổ', 'o'],
      ['úùûüůưừựủứụ', 'u'],
      ['žź', 'z'],
      ['ýÿỹ', 'y'],
      ['đ', 'd'],
      ['æ', 'ae'],
      ['čćç', 'c'],
      ['ñńň', 'n'],
      ['œ', 'oe'],
      ['śšş', 's'],
      ['ř', 'r'],
      ['ğ', 'g'],
      ['Ř', 'R']
    ].freeze

    # character => ASCII replacement; the first listed replacement wins, which
    # also makes characters repeated within a row harmless.
    ASCII_REPLACEMENTS = {}
    TRANSLITERATIONS.each do |chars, ascii|
      chars.scan(/./).each { |char| ASCII_REPLACEMENTS[char] ||= ascii }
    end
    ASCII_REPLACEMENTS.freeze

    # Matches any single character that has a replacement.
    NON_ASCII_PATTERN = Regexp.new('[' + ASCII_REPLACEMENTS.keys.join + ']')

    # ASCII-folded, upper-cased copy of +string+.
    def self.normalize(string)
      utf8_to_ascii(string).upcase
    end

    # Like normalize, with everything except A-Z, 0-9 and '-' removed.
    def self.normalize_word(word)
      self.normalize(word).gsub(/[^A-Z0-9\-]/, '').strip
    end

    # Like normalize, with every character outside A-Z turned into a space,
    # runs of whitespace collapsed and the ends trimmed.
    def self.normalize_author(string)
      self.normalize(string).gsub(/[^A-Z]/, ' ').gsub(/[\s]{2,}/, ' ').strip
    end

    # Returns a copy of +string+ with the listed accented letters and
    # ligatures replaced by their ASCII counterparts, in a single pass over
    # the string. Characters without a replacement are kept as they are.
    # (The method stays publicly callable: the `protected` keyword that used
    # to precede it has no effect on singleton methods.)
    def self.utf8_to_ascii(string)
      string.gsub(NON_ASCII_PATTERN) { |char| ASCII_REPLACEMENTS[char] }
    end

  end

end
@@ -0,0 +1,79 @@
1
+ # encoding: UTF-8
module Taxamatch

  # Produces a phonetic "near match" key for a name, so that names which
  # sound alike get the same key.
  module Phonetizer

    # [leading letters, replacement] pairs applied to the start of a word.
    # At most one pair applies: the pairs are mutually exclusive.
    LEADING_REPLACEMENTS = [
      ['AE', 'E'], ['CN', 'N'], ['CT', 'T'], ['CZ', 'C'], ['DJ', 'J'],
      ['EA', 'E'], ['EU', 'U'], ['GN', 'N'], ['KN', 'N'], ['MC', 'MAC'],
      ['MN', 'N'], ['OE', 'E'], ['QU', 'Q'], ['PS', 'S'], ['PT', 'T'],
      ['TS', 'S'], ['WR', 'R'], ['X', 'Z']
    ].freeze

    # Alias of near_match.
    def self.phonetize(a_word, normalize_ending = false)
      self.near_match(a_word, normalize_ending)
    end

    # Returns the phonetic key of +a_word+, or '' for a blank word or an
    # argument that is not a string. With +normalize_ending+ set, keys longer
    # than 4 characters also get their ending normalized (see
    # normalize_ending).
    def self.near_match(a_word, normalize_ending = false)
      a_word = a_word.respond_to?(:strip) ? a_word.strip : ''
      return '' if a_word == ''
      a_word = Taxamatch::Normalizer.normalize a_word

      prefix, replacement = LEADING_REPLACEMENTS.find { |leading, _| a_word.start_with?(leading) }
      a_word = replacement + a_word[prefix.size..-1] if prefix

      # The first character is kept; the replacements below apply to the rest
      # of the word only, and their order matters.
      chars = a_word.split('')
      first_char = chars.first
      rest_chars = chars[1..-1].join.
        gsub('AE', 'I').
        gsub('IA', 'A').
        gsub('OE', 'I').
        gsub('OI', 'A').
        gsub('SC', 'S').
        delete('H').
        tr('EOUYKZ', 'IAIICS')
      a_word = (first_char + rest_chars).squeeze

      if normalize_ending && a_word.size > 4
        a_word = self.normalize_ending(a_word)
      end
      a_word
    end

    # Deals with variant endings -is (includes -us, -ys, -es), -im (was -um)
    # and -as (-os): at the end of the string all of them are translated
    # to -a. Returns a new string; +a_word+ itself is not modified, so frozen
    # strings are accepted.
    def self.normalize_ending(a_word)
      a_word.gsub(/IS$/, 'A').gsub(/IM$/, 'A').gsub(/AS$/, 'A')
    end

  end

end
@@ -0,0 +1,63 @@
1
+ ######################
2
+ # Tests for modified Damerau Levenshtein Distance algorithm (UTF-8 compatible)
3
+ #
4
+ # * B. Boehmer, T. Rees, Modified Damerau-Levenshtein Distance, Boehmer & Rees 2008
5
+ # * F.J. Damerau. A technique for computer detection and correction of spelling errors, Communications of the ACM, 1964
6
+ #
7
+ # Fields:
8
+ # String1|String2|maximum distance|transposition block size|expected distance
9
+ # - String1, String2
10
+ # compared strings
11
+ # - maximum distance
12
+ # stops execution of the algorithm when calculated distance exceeds the maximum distance number
13
+ # - transposition block size
14
+ # determines how many characters can be transposed. Block size 1 returns score according to Damerau-Levenshtein algorithm
15
+ # - expected distance
16
+ # resulting distance that has to be achieved by the algorithm
17
+ # Note: algorithm does not try to normalize or interpret strings in any way.
18
+ ######################
19
+
20
+ #it should recognize the exact match
21
+ Pomatomus|Pomatomus|10|1|0
22
+
23
+ #it should not try to normalize incoming strings
24
+ Pomatomus|Pomatomus|10|1|1
25
+ Pomatomus|pomatomus|10|1|1
26
+
27
+ #it should calculate special cases
28
+ Pomatomus||10|1|9
29
+ |Pomatomus|10|1|9
30
+ P|p|10|1|1
31
+ #TODO: one letter vs longer string generates a big negative number
32
+ #L|Linneaus|10|1|7
33
+
34
+
35
+ #it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
36
+ Pomatomus|Pomatomux|10|1|1
37
+ Pmatomus|Pomatomus|10|1|1
38
+ Pomatomus|Pmatomus|10|1|1
39
+ Rpmatomus|Pomatomus|10|1|2
40
+ Pommtomus|Pomatomus|10|1|1
41
+ Potamomus|Pomatomus|10|1|2
42
+ Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
43
+ Pomatomus|oPmatomus|10|1|1
44
+ Pomatomus|Pomatomsu|10|1|1
45
+ Pomtaomus|Pomatomus|10|1|1
46
+ Pomatoums|Pomatomus|10|1|1
47
+ Potamomus|Pomatomus|10|1|2
48
+ Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|2|1
49
+
50
+ #it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size >= 2)
51
+ serrulatus|serratulus|10|2|2
52
+ Pomatomus|Poomumats|10|3|3
53
+ vesiculosus|vecusilosus|10|1|4
54
+ vesiculosus|vecusilosus|10|2|2
55
+ trimerophyton|mertriophyton|10|1|6
56
+ trimerophyton|mertriophyton|10|3|3
57
+
58
+ #it should stop trying if distance exceeds maximum allowed distance
59
+ Pxxxxomus|Pomatomus|10|1|4
60
+ Pxxxxomus|Pomatomus|2|1|3
61
+
62
+ #
63
+ PUNCTATA|PUNCTATA|10|1|0
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --colour
@@ -0,0 +1,28 @@
1
+ begin
2
+ require 'spec'
3
+ rescue LoadError
4
+ require 'rubygems' unless ENV['NO_RUBYGEMS']
5
+ gem 'rspec'
6
+ require 'spec'
7
+ end
8
+
9
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
10
+ require 'taxamatch_rb'
11
+
# Reads a '|'-delimited fixture file line by line and yields once per line:
# the array of fields when the line is not a comment and has exactly
# +fields_num+ fields, nil otherwise. A trailing "# ..." remark and the
# surrounding whitespace are stripped from the last field.
# The file is closed when the method returns, even if the block raises.
def read_test_file(file, fields_num)
  File.open(file) do |f|
    f.each do |line|
      fields = line.split("|")
      if line !~ /^\s*#/ && fields.size == fields_num
        fields[-1] = fields[-1].split('#')[0].strip
        yield(fields)
      else
        yield(nil)
      end
    end
  end
end
24
+
# Builds the hash shape expected by the matching methods from a plain string:
# the string itself, its normalized form and the phonetic key of that
# normalized form.
def make_taxamatch_hash(string)
  upcased = Taxamatch::Normalizer.normalize(string)
  phonetic = Taxamatch::Phonetizer.near_match(upcased)
  {:epitheton => string, :normalized => upcased, :phonetized => phonetic}
end
@@ -0,0 +1,254 @@
1
+ # encoding: UTF-8
2
+ require File.dirname(__FILE__) + '/spec_helper.rb'
3
+
4
# Data-driven spec: every usable row of damerau_levenshtein_mod_test.txt is
# fed to DamerauLevenshteinMod#distance and compared with the row's last field.
describe 'DamerauLevenshteinMod' do
  it 'should get tests' do
    fixture = File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt'
    read_test_file(fixture, 5) do |row|
      calculator = Taxamatch::DamerauLevenshteinMod.new
      # read_test_file yields nil for comment and malformed lines.
      next unless row
      # NOTE(review): field names are inferred from the fixture's comments
      # (third field looks like the maximum distance, fourth the block size);
      # confirm against DamerauLevenshteinMod#distance.
      first, second, max_distance, block_size, expected = row
      expected = expected.to_i
      actual = calculator.distance(first, second, block_size.to_i, max_distance.to_i)
      # Print the offending row so a failure can be traced back to the fixture.
      puts row unless actual == expected
      actual.should == expected
    end
  end
end
16
+
17
# Checks the hash structure Taxamatch::Atomizer#parse produces for uninomial,
# binomial and trinomial name strings.
describe 'Atomizer' do
  before(:all) do
    @parser = Taxamatch::Atomizer.new
  end

  it 'should parse uninomials' do
    bare_name = {
      :all_authors => [],
      :all_years   => [],
      :uninomial   => {
        :epitheton => "Betula", :normalized => "BETULA", :phonetized => "BITILA",
        :authors => [], :years => [], :normalized_authors => []
      }
    }
    @parser.parse('Betula').should == bare_name

    with_authorship = {
      :all_authors => ["LACORDAIRE"],
      :all_years   => ["1872"],
      :uninomial   => {
        :epitheton => "Aerenea", :normalized => "AERENEA", :phonetized => "ERINIA",
        :authors => ["Lacordaire"], :years => ["1872"], :normalized_authors => ["LACORDAIRE"]
      }
    }
    @parser.parse('Ærenea Lacordaire, 1872').should == with_authorship
  end

  it 'should parse binomials' do
    expected = {
      :all_authors => ["DOW"],
      :all_years   => ["1913"],
      :genus       => {
        :epitheton => "Leoeptura", :normalized => "LEOEPTURA", :phonetized => "LIPTIRA",
        :authors => [], :years => [], :normalized_authors => []
      },
      :species     => {
        :epitheton => "laetifica", :normalized => "LAETIFICA", :phonetized => "LITIFICA",
        :authors => ["Dow"], :years => ["1913"], :normalized_authors => ["DOW"]
      }
    }
    @parser.parse('Leœptura laetifica Dow, 1913').should == expected
  end

  it 'should parse trinomials' do
    expected = {
      :all_authors  => ["BANKER", "D HALL", "D E STUNTZ"],
      :all_years    => ["1972"],
      :genus        => {
        :epitheton => "Hydnellum", :normalized => "HYDNELLUM", :phonetized => "HIDNILIM",
        :authors => [], :years => [], :normalized_authors => []
      },
      :species      => {
        :epitheton => "scrobiculatum", :normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
        :authors => [], :years => [], :normalized_authors => []
      },
      :infraspecies => [
        {
          :epitheton => "zonatum", :normalized => "ZONATUM", :phonetized => "ZANATA",
          :authors => ["Banker", "D. Hall", "D.E. Stuntz"], :years => ["1972"],
          :normalized_authors => ["BANKER", "D HALL", "D E STUNTZ"]
        }
      ]
    }
    @parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == expected
  end
end
35
+
36
+
37
# Checks Taxamatch::Normalizer against a table of inputs and the exact
# strings normalize / normalize_word are expected to return for them.
describe 'Taxamatch::Normalizer' do
  it 'should normalize strings' do
    # [input, expected] pairs, checked in the listed order.
    cases = [
      ['abcd', 'ABCD'],
      ['Leœptura', 'LEOEPTURA'],
      ['Ærenea', 'AERENEA'],
      ['Fallén', 'FALLEN'],
      ['Choriozopella trägårdhi', 'CHORIOZOPELLA TRAGARDHI']
    ]
    cases.each do |raw, expected|
      Taxamatch::Normalizer.normalize(raw).should == expected
    end
  end

  it 'should normalize words' do
    normalized = Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$')
    normalized.should == 'L-3EOEPTURA'
  end
end
50
+
51
+ describe 'Taxamatch::Base' do
52
+ before(:all) do
53
+ @tm = Taxamatch::Base.new
54
+ end
55
+
56
it 'should get txt tests' do
  # Each usable row of taxamatch_test.txt is name1|name2|match|edit_distance.
  fixture = File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt'
  read_test_file(fixture, 4) do |y|
    # read_test_file yields nil for comment and malformed lines.
    next unless y
    expected_match = (y[2] == 'true')
    # Passing false as the third argument makes taxamatch return the result
    # hash that is indexed below.
    res = @tm.taxamatch(y[0], y[1], false)
    puts "%s, %s, %s, %s" % [y[0], y[1], expected_match, y[3]]
    res['match'].should == expected_match
    res['edit_distance'].should == y[3].to_i
  end
end
68
+
69
it 'should work with names that cannot be parsed' do
  res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921','Quadraspidiotus ostreaeformis Curtis)')
  # Assert on the result; a plain assignment to `res` would let this example
  # pass no matter what taxamatch returned.
  # NOTE(review): assumes taxamatch returns a falsy value (false or nil) when
  # a name cannot be parsed -- confirm against Taxamatch::Base#taxamatch.
  res.should be_false
end
73
+
74
+ it 'should compare genera' do
75
+ #edit distance 1 always match
76
+ g1 = make_taxamatch_hash 'Plantago'
77
+ g2 = make_taxamatch_hash 'Plantagon'
78
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
79
+ #edit_distance above threshold does not match
80
+ g1 = make_taxamatch_hash 'Plantago'
81
+ g2 = make_taxamatch_hash 'This shouldnt match'
82
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
83
+ #phonetic_match matches
84
+ g1 = make_taxamatch_hash 'Plantagi'
85
+ g2 = make_taxamatch_hash 'Plantagy'
86
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
87
+ #distance 1 in first letter also matches
88
+ g1 = make_taxamatch_hash 'Xantheri'
89
+ g2 = make_taxamatch_hash 'Pantheri'
90
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
91
+ #phonetic match trumps everything
92
+ g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
93
+ g2 = make_taxamatch_hash 'Zanthery'
94
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
95
+ #same first letter and distance 2 should match
96
+ g1 = make_taxamatch_hash 'Xantherii'
97
+ g2 = make_taxamatch_hash 'Xantherrr'
98
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
99
+ #First letter is the same and distance is 3 should match, no phonetic match
100
+ g1 = make_taxamatch_hash 'Xantheriii'
101
+ g2 = make_taxamatch_hash 'Xantherrrr'
102
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
103
+ #Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
104
+ g1 = make_taxamatch_hash 'Xant'
105
+ g2 = make_taxamatch_hash 'Xanthe'
106
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
107
+ #Should not match if edit distance > 3 and no phonetic match
108
+ g1 = make_taxamatch_hash 'Xantheriiii'
109
+ g2 = make_taxamatch_hash 'Xantherrrrr'
110
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
111
+ end
112
+
113
+ it 'should compare species' do
114
+ #Exact match
115
+ s1 = make_taxamatch_hash 'major'
116
+ s2 = make_taxamatch_hash 'major'
117
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
118
+ #Phonetic match always works
119
+ s1 = make_taxamatch_hash 'xanteriiiiiiii'
120
+ s2 = make_taxamatch_hash 'zantereeeeeeee'
121
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 5}
122
+ #Phonetic match works with different endings
123
+ s1 = make_taxamatch_hash 'majorum'
124
+ s2 = make_taxamatch_hash 'majoris'
125
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
126
+ #Distance 4 matches if first 3 chars are the same
127
+ s1 = make_taxamatch_hash 'majorrrrr'
128
+ s2 = make_taxamatch_hash 'majoraaaa'
129
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
130
+ #Should not match if distance is 4 and first 3 chars are not the same
131
+ s1 = make_taxamatch_hash 'majorrrrr'
132
+ s2 = make_taxamatch_hash 'marorraaa'
133
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
134
+ #Distance 2 or 3 matches if first 1 char is the same
135
+ s1 = make_taxamatch_hash 'morrrr'
136
+ s2 = make_taxamatch_hash 'moraaa'
137
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
138
+ #Should not match if Distance 2 or 3 and first 1 char is not the same
139
+ s1 = make_taxamatch_hash 'morrrr'
140
+ s2 = make_taxamatch_hash 'torraa'
141
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
142
+ #Distance 1 will match anywhere
143
+ s1 = make_taxamatch_hash 'major'
144
+ s2 = make_taxamatch_hash 'rajor'
145
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
146
+ #Will not match if distance is 3 and length is less than twice the edit distance
147
+ s1 = make_taxamatch_hash 'marrr'
148
+ s2 = make_taxamatch_hash 'maaaa'
149
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
150
+ end
151
+
152
+ it 'should match mathes' do
153
+ #No trouble case
154
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
155
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
156
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
157
+ #Will not match if either genus or sp. epithet doesn't match
158
+ gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
159
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
160
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
161
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
162
+ smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
163
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
164
+ #Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
165
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
166
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
167
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
168
+ #Should not have phonetic match if one of the components does not match phonetically
169
+ gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
170
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
171
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
172
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
173
+ smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
174
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
175
+ #edit distance should be equal to the sum of the edit distances
176
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
177
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
178
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
179
+ end
180
+
181
+ describe 'Taxamatch::Authmatch' do
182
+ before(:all) do
183
+ @am = Taxamatch::Authmatch
184
+ end
185
+
186
+ it 'should calculate score' do
187
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
188
+ res.should == 90
189
+ res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
190
+ res.should == 0
191
+ #found all authors, same year
192
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
193
+ res.should == 100
194
+ #all authors, 1 year diff
195
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
196
+ res.should == 54
197
+ #year is not counted in
198
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
199
+ res.should == 94
200
+ #found all authors on one side, same year
201
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
202
+ res.should == 91
203
+ #found all authors on one side, 1 year diff
204
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
205
+ res.should == 51
206
+ #found all authors on one side, year does not count
207
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
208
+ res.should == 90
209
+ #found some authors
210
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
211
+ res.should == 67
212
+ #if year does not match or not present no match for previous case
213
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
214
+ res.should == 0
215
+ end
216
+
217
+ it 'should compare years' do
218
+ @am.compare_years([1882],[1880]).should == 2
219
+ @am.compare_years([1882],[]).should == nil
220
+ @am.compare_years([],[]).should == 0
221
+ @am.compare_years([1788,1798], [1788,1798]).should be_nil
222
+ end
223
+
224
+ it 'should remove duplicate authors' do
225
+ #Lin submatches Linnaeus and its size 3 is big enough to remove Linnaeus
226
+ #Muller is identical
227
+ res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
228
+ res.should == [[], []]
229
+ #same in different order
230
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
231
+ res.should == [[], []]
232
+ #auth Li submatches Linnaeus, but Li size is less than the 3 required to remove Linnaeus
233
+ res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
234
+ res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
235
+ #fuzzy match
236
+ res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
237
+ res.should == [["Dem"], ["Stepanov"]]
238
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
239
+ res.should == [['Linnaeus', 'Muller'], ['Kenn']]
240
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
241
+ res.should == [[],['Kurtz']]
242
+ end
243
+
244
+ it 'should fuzzy match authors' do
245
+ #TODO: fix the bug revealed by this test
246
+ # res = @am.fuzzy_match_authors('L', 'Muller')
247
+ # res.should be_false
248
+ end
249
+
250
+ end
251
+
252
+ end
253
+
254
+
@@ -0,0 +1,45 @@
1
+ ###
2
+ #
3
+ # Tests for string comparison by taxamatch algorithm
4
+ # name1|name2|match|edit_distance
5
+ #
6
+ ##
7
+ # Comparing uninomials
8
+ Pomatomus|Pomatomas|true|1
9
+ Pomatomus L.|Pomatomas Linn.|true|1
10
+ Pomatomus Ber|Pomatomas Linn|false|1
11
+ Pomatomus L. 1753|Pomatomus Linn. 1800|false|0
12
+
13
+ ## additional authorship should match
14
+ Puma concolor|Puma concolor L.|true|0
15
+ #
16
+ ## one-letter misspelling in species epithet should match
17
+ Puma concolor|Puma cancolor|true|1
18
+ #
19
+ Pomatomus saltatrix|Pomatomus saltratix|true|2
20
+ Pomatomus saltator|Pomatomus saltatrix|true|3
21
+ #
22
+ Loligo pealeii|Loligo plei|false|3
23
+ #
24
+ ## different authors should not match
25
+ Puma concolor Linnaeus|Puma concolor Kurtz|false|0
26
+ #
27
+ ##real life examples
28
+ Biatora borealis|Bactra borealis Diakonoff 1964|false|3
29
+ #
30
+ Homo sapien|Homo sapiens|true|1
31
+ Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true|1
32
+ Homo sapiens Mozzherin|Homo sapiens Linneaus|false|0
33
+ #
34
+ Quinqueloculina punctata|Quinqueloculina punctata d'Orbigny 1905|true|0
35
+ Pomatomus saltator (Linnaeus, 1766)|Pomatomus saltatrix (Linnaeus, 1766)|true|0|3
36
+ #
37
+ #Trinomial names
38
+ Homo sapiens stupidus|Homo spiens stupidus|true|1
39
+ Pomatomus saltator saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|true|1
40
+ Pomatomus saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|false|5
41
+ Pomatomus saltator saltator saltatorische|Pomatomus saltator soltator|true|1
42
+
43
+
44
+
45
+
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: taxamatch_rb
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 6
8
+ - 0
9
+ version: 0.6.0
10
+ platform: ruby
11
+ authors:
12
+ - Dmitry Mozzherin
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-19 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: RubyInline
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :runtime
31
+ version_requirements: *id001
32
+ - !ruby/object:Gem::Dependency
33
+ name: biodiversity
34
+ prerelease: false
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ segments:
40
+ - 0
41
+ - 5
42
+ - 13
43
+ version: 0.5.13
44
+ type: :runtime
45
+ version_requirements: *id002
46
+ description: This gem implements algorithms for fuzzy matching scientific names developed by Tony Rees
47
+ email: dmozzherin@eol.org
48
+ executables: []
49
+
50
+ extensions: []
51
+
52
+ extra_rdoc_files:
53
+ - LICENSE
54
+ - README.rdoc
55
+ files:
56
+ - README.rdoc
57
+ - lib/taxamatch_rb.rb
58
+ - lib/taxamatch_rb/atomizer.rb
59
+ - lib/taxamatch_rb/authmatch.rb
60
+ - lib/taxamatch_rb/damerau_levenshtein_mod.rb
61
+ - lib/taxamatch_rb/normalizer.rb
62
+ - lib/taxamatch_rb/phonetizer.rb
63
+ - spec/damerau_levenshtein_mod_test.txt
64
+ - spec/spec.opts
65
+ - spec/spec_helper.rb
66
+ - spec/taxamatch_rb_spec.rb
67
+ - spec/taxamatch_test.txt
68
+ - LICENSE
69
+ has_rdoc: true
70
+ homepage: http://github.com/dimus/taxamatch_rb
71
+ licenses: []
72
+
73
+ post_install_message:
74
+ rdoc_options:
75
+ - --charset=UTF-8
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ segments:
83
+ - 0
84
+ version: "0"
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ segments:
90
+ - 0
91
+ version: "0"
92
+ requirements: []
93
+
94
+ rubyforge_project:
95
+ rubygems_version: 1.3.6
96
+ signing_key:
97
+ specification_version: 3
98
+ summary: Implementation of Tony Rees Taxamatch algorithms
99
+ test_files:
100
+ - spec/spec_helper.rb
101
+ - spec/taxamatch_rb_spec.rb