taxamatch_rb 0.9.10 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,19 @@
1
- # Algorithms for Taxamatch::Authmatch are developed by Patrick Leary of uBio and EOL fame
1
+ # Algorithms for Taxamatch::Authmatch
2
+ # are developed by Patrick Leary of uBio and EOL fame
2
3
 
3
4
  module Taxamatch
4
5
  class Authmatch
5
6
 
6
7
  def self.authmatch(authors1, authors2, years1, years2)
7
- unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
8
+ unique_authors1, unique_authors2 =
9
+ remove_duplicate_authors(authors1, authors2)
8
10
  year_difference = compare_years(years1, years2)
9
- get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
11
+ get_score(authors1, unique_authors1,
12
+ authors2, unique_authors2, year_difference)
10
13
  end
11
-
12
- def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
14
+
15
+ def self.get_score(authors1, unique_authors1,
16
+ authors2, unique_authors2, year_diff)
13
17
  count_before = authors1.size + authors2.size
14
18
  count_after = unique_authors1.size + unique_authors2.size
15
19
  score = 0
@@ -18,7 +22,7 @@ module Taxamatch
18
22
  if year_diff == 0
19
23
  score = 100
20
24
  elsif year_diff == 1
21
- score = 54
25
+ score = 54
22
26
  end
23
27
  else
24
28
  score = 94
@@ -35,11 +39,11 @@ module Taxamatch
35
39
  end
36
40
  else
37
41
  score = ((1 - count_after.to_f/count_before.to_f) * 100).round
38
- score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
42
+ score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
39
43
  end
40
44
  score > 50 ? score : 0
41
45
  end
42
-
46
+
43
47
  def self.remove_duplicate_authors(authors1, authors2)
44
48
  unique_authors1 = authors1.dup
45
49
  unique_authors2 = authors2.dup
@@ -48,12 +52,14 @@ module Taxamatch
48
52
  au1_match = au2_match = false
49
53
  if au1 == au2
50
54
  au1_match = au2_match = true
51
- elsif au1 == au2[0...au1.size]
55
+ elsif au1 == au2[0...au1.size]
52
56
  au1_match = true
53
57
  elsif au1[0...au2.size] == au2
54
58
  au2_match = true
55
59
  end
56
- if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
60
+ if (au1.size >= 3 && au1_match) ||
61
+ (au2.size >= 3 && au2_match) ||
62
+ (au1_match && au2_match)
57
63
  unique_authors1.delete au1
58
64
  unique_authors2.delete au2
59
65
  elsif au1_match
@@ -61,8 +67,11 @@ module Taxamatch
61
67
  elsif au2_match
62
68
  unique_authors2.delete au2
63
69
  else
64
- #TODO: masking a bug in damerau levenshtsin mod which appears comparing 1letter to a longer string
65
- if au1.size > 1 && au2.size > 1 && self.fuzzy_match_authors(au1, au2)
70
+ #TODO: masking a bug in damerau levenshtein
71
+ # mod which appears when comparing a 1-letter string to a longer string
72
+ if au1.size > 1 &&
73
+ au2.size > 1 &&
74
+ self.fuzzy_match_authors(au1, au2)
66
75
  unique_authors1.delete au1
67
76
  unique_authors2.delete au2
68
77
  end
@@ -71,18 +80,22 @@ module Taxamatch
71
80
  end
72
81
  [unique_authors1, unique_authors2]
73
82
  end
74
-
83
+
75
84
  def self.fuzzy_match_authors(author1, author2)
76
85
  au1_length = author1.size
77
86
  au2_length = author2.size
78
87
  dlm = DamerauLevenshtein
79
- ed = dlm.distance(author1, author2,1,3) #get around a bug in C code, but it really has to be fixed
80
- (ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
88
+ #get around a bug in C code, but it really has to be fixed
89
+ ed = dlm.distance(author1, author2,1,3)
90
+ (ed <= 3 && ([au1_length, au2_length].min > ed * 2) &&
91
+ (ed < 2 || author1[0] == author2[0]))
81
92
  end
82
93
 
83
94
  def self.compare_years(years1, years2)
84
95
  return 0 if years1 == [] && years2 == []
85
- return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
96
+ if years1.size == 1 && years2.size == 1
97
+ return (years1[0].to_i - years2[0].to_i).abs
98
+ end
86
99
  nil
87
100
  end
88
101
  end
@@ -1,16 +1,16 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  module Taxamatch
4
-
4
+
5
5
  module Normalizer
6
6
  def self.normalize(string)
7
7
  utf8_to_ascii(string.strip.upcase).gsub(/[^\x00-\x7F]/,'?')
8
8
  end
9
-
9
+
10
10
  def self.normalize_word(word)
11
11
  self.normalize(word).gsub(/[^A-Z0-9\-]/, '').strip
12
12
  end
13
-
13
+
14
14
  def self.normalize_author(string)
15
15
  self.normalize(string).gsub(/[^A-Z]/, ' ').gsub(/[\s]{2,}/, ' ').strip
16
16
  end
@@ -20,7 +20,7 @@ module Taxamatch
20
20
  year_int = nil unless year_int.between?(1757, Time.now.year + 1)
21
21
  year_int
22
22
  end
23
-
23
+
24
24
 
25
25
  private
26
26
  def self.utf8_to_ascii(string)
@@ -2,11 +2,11 @@
2
2
  module Taxamatch
3
3
 
4
4
  module Phonetizer
5
-
5
+
6
6
  def self.phonetize(a_word, normalize_ending = false)
7
7
  self.near_match(a_word, normalize_ending)
8
8
  end
9
-
9
+
10
10
  def self.near_match(a_word, normalize_ending = false)
11
11
  a_word = a_word.strip rescue ''
12
12
  return '' if a_word == ''
@@ -50,7 +50,7 @@ module Taxamatch
50
50
  a_word = 'Z' + a_word[1..-1]
51
51
  end
52
52
  first_char = a_word.split('')[0]
53
- rest_chars = a_word.split('')[1..-1].join('')
53
+ rest_chars = a_word.split('')[1..-1].join('')
54
54
  rest_chars.gsub!('AE', 'I')
55
55
  rest_chars.gsub!('IA', 'A')
56
56
  rest_chars.gsub!('OE', 'I')
@@ -59,21 +59,22 @@ module Taxamatch
59
59
  rest_chars.gsub!('H', '')
60
60
  rest_chars.tr!('EOUYKZ', 'IAIICS')
61
61
  a_word = (first_char + rest_chars).squeeze
62
-
62
+
63
63
  if normalize_ending && a_word.size > 4
64
64
  a_word = self.normalize_ending(a_word)
65
65
  end
66
66
  a_word
67
67
  end
68
-
68
+
69
69
  def self.normalize_ending(a_word)
70
- # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
70
+ # -- deal with variant endings
71
+ # -is (includes -us, -ys, -es), -im (was -um), -as (-os)
71
72
  # -- at the end of a string translate all to -a
72
73
  a_word.gsub!(/IS$/, 'A')
73
74
  a_word.gsub!(/IM$/, 'A')
74
75
  a_word.gsub(/AS$/, 'A')
75
76
  end
76
-
77
+
77
78
  end
78
79
 
79
- end
80
+ end
@@ -7,25 +7,81 @@ describe 'Atomizer' do
7
7
  end
8
8
 
9
9
  it 'should parse uninomials' do
10
- @parser.parse('Betula').should == {:all_authors=>[], :all_years=>[], :canonical_form=>"Betula", :uninomial=>{:string=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[], :normalized_authors=>[]}}
11
- @parser.parse('Ærenea Lacordaire, 1872').should == {:all_authors=>["LACORDAIRE"], :all_years=>[1872], :canonical_form=>"Aerenea", :uninomial=>{:string=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA", :authors=>["Lacordaire"], :years=>[1872], :normalized_authors=>["LACORDAIRE"]}}
10
+ @parser.parse('Betula').should == { :all_authors => [], :all_years => [],
11
+ :canonical_form => "Betula", :uninomial => { :string => "Betula",
12
+ :normalized => 'BETULA', :phonetized => "BITILA", :authors => [],
13
+ :years => [], :normalized_authors => [] } }
14
+ @parser.parse('Ærenea Lacordaire, 1872').should == {
15
+ :all_authors => ["LACORDAIRE"], :all_years => [1872],
16
+ :canonical_form => "Aerenea", :uninomial => { :string => "Aerenea",
17
+ :normalized => "AERENEA", :phonetized => "ERINIA",
18
+ :authors => ["Lacordaire"], :years => [1872],
19
+ :normalized_authors => ["LACORDAIRE"] } }
12
20
  end
13
21
 
14
22
  it 'should parse binomials' do
15
- @parser.parse('Leœptura laetifica Dow, 1913').should == {:all_authors=>["DOW"], :all_years=>[1913], :canonical_form=>"Leoeptura laetifica", :genus=>{:string=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :authors=>["Dow"], :years=>[1913], :normalized_authors=>["DOW"]}}
23
+ @parser.parse('Leœptura laetifica Dow, 1913').should == {
24
+ :all_authors => ["DOW"], :all_years => [1913],
25
+ :canonical_form => "Leoeptura laetifica", :genus => {
26
+ :string => "Leoeptura", :normalized => "LEOEPTURA",
27
+ :phonetized => "LIPTIRA", :authors => [], :years => [],
28
+ :normalized_authors => []}, :species => {
29
+ :string => "laetifica", :normalized => "LAETIFICA",
30
+ :phonetized => "LITIFICA", :authors => ["Dow"],
31
+ :years => [1913], :normalized_authors => ["DOW"] } }
16
32
  end
17
33
 
18
34
  it 'should parse trinomials' do
19
- @parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == {:all_authors=>["BANKER", "D HALL", "D E STUNTZ"], :all_years=>[1972], :canonical_form=>"Hydnellum scrobiculatum zonatum", :genus=>{:string=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>[], :years=>[], :normalized_authors=>[]}, :infraspecies=>[{:string=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>[1972], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]}
35
+ @parser.parse('Hydnellum scrobiculatum zonatum ' +
36
+ '(Banker) D. Hall et D.E. Stuntz 1972').should == {
37
+ :all_authors => ["BANKER", "D HALL", "D E STUNTZ"], :all_years => [1972],
38
+ :canonical_form => "Hydnellum scrobiculatum zonatum", :genus=>{
39
+ :string => "Hydnellum", :normalized => "HYDNELLUM",
40
+ :phonetized => "HIDNILIM", :authors => [], :years => [],
41
+ :normalized_authors => [] }, :species => { :string => "scrobiculatum",
42
+ :normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
43
+ :authors => [], :years => [], :normalized_authors => [] },
44
+ :infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
45
+ :phonetized => "ZANATA", :authors => ["Banker", "D. Hall", "D.E. Stuntz"],
46
+ :years => [1972], :normalized_authors => ["BANKER", "D HALL",
47
+ "D E STUNTZ"] }] }
20
48
  end
21
49
 
22
50
  it 'should normalize years to integers' do
23
51
  future_year = Time.now.year + 10
24
- @parser.parse("Hydnellum scrobiculatum Kern #{future_year} zonatum (Banker) D. Hall et D.E. Stuntz 1972?").should == {:all_authors=>["KERN", "BANKER", "D HALL", "D E STUNTZ"], :all_years=>[1972], :canonical_form=>"Hydnellum scrobiculatum zonatum", :genus=>{:string=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>["Kern"], :years=>[], :normalized_authors=>["KERN"]}, :infraspecies=>[{:string=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>[1972], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]}
52
+ @parser.parse("Hydnellum scrobiculatum Kern #{future_year} " +
53
+ "zonatum (Banker) D. Hall et D.E. Stuntz 1972?").should == {
54
+ :all_authors => ["KERN", "BANKER", "D HALL", "D E STUNTZ"],
55
+ :all_years => [1972],
56
+ :canonical_form => "Hydnellum scrobiculatum zonatum", :genus => {
57
+ :string => "Hydnellum", :normalized => "HYDNELLUM",
58
+ :phonetized => "HIDNILIM", :authors => [], :years => [],
59
+ :normalized_authors => [] }, :species => { :string => "scrobiculatum",
60
+ :normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
61
+ :authors => ["Kern"], :years => [], :normalized_authors => ["KERN"] },
62
+ :infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
63
+ :phonetized => "ZANATA", :authors =>
64
+ ["Banker", "D. Hall", "D.E. Stuntz"], :years => [1972],
65
+ :normalized_authors => ["BANKER", "D HALL", "D E STUNTZ"] }] }
25
66
  end
26
67
 
27
68
  it 'should normalize names with abbreviated genus after cf.' do
28
- @parser.parse('Unio cf. U. alba').should == {:all_authors=>[], :all_years=>[], :canonical_form=>"Unio", :genus=>{:string=>"Unio", :normalized=>"UNIO", :phonetized=>"UNIA", :authors=>[], :years=>[], :normalized_authors=>[]}}
69
+ @parser.parse('Unio cf. U. alba').should == { :all_authors => [],
70
+ :all_years => [], :canonical_form => "Unio",
71
+ :genus => { :string => "Unio", :normalized => "UNIO",
72
+ :phonetized => "UNIA", :authors => [], :years => [],
73
+ :normalized_authors => [] } }
74
+ end
75
+
76
+ it 'should parse names which broke it before' do
77
+ ['Parus caeruleus species complex',
78
+ 'Euxoa nr. idahoensis sp. 1clay',
79
+ 'Cetraria islandica ? islandica',
80
+ 'Buteo borealis ? ventralis'].each do |n|
81
+ res = @parser.parse(n)
82
+ res.class.should == Hash
83
+ res.empty?.should be_false
84
+ end
29
85
  end
30
86
  end
31
87
 
@@ -38,12 +94,14 @@ describe 'Taxamatch::Normalizer' do
38
94
  Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
39
95
  Taxamatch::Normalizer.normalize('Fallé€n').should == 'FALLE?N'
40
96
  Taxamatch::Normalizer.normalize('Fallén привет').should == 'FALLEN ??????'
41
- Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
97
+ Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should ==
98
+ 'CHORIOZOPELLA TRAGARDHI'
42
99
  Taxamatch::Normalizer.normalize('×Zygomena').should == 'xZYGOMENA'
43
100
  end
44
101
 
45
102
  it 'should normalize words' do
46
- Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
103
+ Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should ==
104
+ 'L-3EOEPTURA'
47
105
  end
48
106
  end
49
107
 
@@ -53,7 +111,8 @@ describe 'Taxamatch::Base' do
53
111
  end
54
112
 
55
113
  it 'should get txt tests' do
56
- read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
114
+ test_file = File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt'
115
+ read_test_file(test_file, 4) do |y|
57
116
  if y
58
117
  y[2] = y[2] == 'true' ? true : false
59
118
  res = @tm.taxamatch(y[0], y[1], false)
@@ -65,127 +124,169 @@ describe 'Taxamatch::Base' do
65
124
  end
66
125
 
67
126
  it 'should work with names that cannot be parsed' do
68
- res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921','Quadraspidiotus ostreaeformis Curtis)')
127
+ res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921',
128
+ 'Quadraspidiotus ostreaeformis Curtis)')
69
129
  res = false
70
130
  end
71
131
 
72
132
  it 'should compare genera' do
73
- #edit distance 1 always match
133
+ # edit distance 1 always match
74
134
  g1 = make_taxamatch_hash 'Plantago'
75
135
  g2 = make_taxamatch_hash 'Plantagon'
76
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
77
- #edit_distance above threshold does not math
136
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
137
+ 'edit_distance' => 1, 'match' => true }
138
+ # edit_distance above threshold does not match
78
139
  g1 = make_taxamatch_hash 'Plantago'
79
140
  g2 = make_taxamatch_hash 'This shouldnt match'
80
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
81
- #phonetic_match matches
141
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
142
+ 'match' => false, 'edit_distance' => 4 }
143
+ # phonetic_match matches
82
144
  g1 = make_taxamatch_hash 'Plantagi'
83
145
  g2 = make_taxamatch_hash 'Plantagy'
84
- @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
85
- @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
86
- #distance 1 in first letter also matches
146
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
147
+ 'edit_distance' => 1, 'match' => true }
148
+ @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
149
+ 'phonetic_match' => false, 'edit_distance' => 1, 'match' => true }
150
+ # distance 1 in first letter also matches
87
151
  g1 = make_taxamatch_hash 'Xantheri'
88
152
  g2 = make_taxamatch_hash 'Pantheri'
89
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
90
- #phonetic match tramps everything
153
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
154
+ 'edit_distance' => 1, 'match' => true }
155
+ # phonetic match trumps everything
91
156
  g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
92
157
  g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
93
- @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
94
- @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'edit_distance' => 4, 'match' => false}
95
- #same first letter and distance 2 should match
158
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
159
+ 'edit_distance' => 4, 'match' => true }
160
+ @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
161
+ 'phonetic_match' => false, 'edit_distance' => 4, 'match' => false }
162
+ # same first letter and distance 2 should match
96
163
  g1 = make_taxamatch_hash 'Xaaaantherii'
97
164
  g2 = make_taxamatch_hash 'Xaaaantherrr'
98
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
99
- #First letter is the same and distance is 3 should match, no phonetic match
165
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
166
+ 'match' => true, 'edit_distance' => 2 }
167
+ # First letter is the same and distance is 3 should match, no phonetic match
100
168
  g1 = make_taxamatch_hash 'Xaaaaaaaaaaantheriii'
101
169
  g2 = make_taxamatch_hash 'Xaaaaaaaaaaantherrrr'
102
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
103
- #Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
170
+ @tm.match_genera(g1, g2).should ==
171
+ { 'phonetic_match' => false, 'match' => true, 'edit_distance' => 3 }
172
+ # Should not match if one of words is shorter than 2x edit
173
+ # distance and distance is 2 or 3
104
174
  g1 = make_taxamatch_hash 'Xant'
105
175
  g2 = make_taxamatch_hash 'Xanthe'
106
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
107
- #Should not match if edit distance > 3 and no phonetic match
176
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
177
+ 'match' => false, 'edit_distance' => 2 }
178
+ # Should not match if edit distance > 3 and no phonetic match
108
179
  g1 = make_taxamatch_hash 'Xantheriiii'
109
180
  g2 = make_taxamatch_hash 'Xantherrrrr'
110
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
181
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
182
+ 'match' => false, 'edit_distance' => 4 }
111
183
  end
112
184
 
113
185
  it 'should compare species' do
114
- #Exact match
186
+ # Exact match
115
187
  s1 = make_taxamatch_hash 'major'
116
188
  s2 = make_taxamatch_hash 'major'
117
- @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
118
- @tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 0}
119
- #Phonetic match always works
189
+ @tm.match_species(s1, s2).should == { 'phonetic_match' => true,
190
+ 'match' => true, 'edit_distance' => 0 }
191
+ @tm.match_species(s1, s2, :with_phonetic_match => false).should == {
192
+ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 0 }
193
+ # Phonetic match always works
120
194
  s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
121
195
  s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
122
- @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 4}
123
- @tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
124
- #Phonetic match works with different endings
196
+ @tm.match_species(s1, s2).should == { 'phonetic_match' => true,
197
+ 'match' => true, 'edit_distance' => 4 }
198
+ @tm.match_species(s1, s2, :with_phonetic_match => false).should ==
199
+ { 'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
200
+ # Phonetic match works with different endings
125
201
  s1 = make_taxamatch_hash 'majorum'
126
202
  s2 = make_taxamatch_hash 'majoris'
127
- @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
128
- @tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
129
- #Distance 4 matches if first 3 chars are the same
203
+ @tm.match_species(s1, s2).should == {
204
+ 'phonetic_match' => true, 'match' => true, 'edit_distance' => 2 }
205
+ @tm.match_species(s1, s2, :with_phonetic_match => false).should ==
206
+ { 'phonetic_match' => false, 'match' => true, 'edit_distance' => 2 }
207
+ # Distance 4 matches if first 3 chars are the same
130
208
  s1 = make_taxamatch_hash 'majjjjorrrrr'
131
209
  s2 = make_taxamatch_hash 'majjjjoraaaa'
132
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
133
- #Should not match if Distance 4 matches and first 3 chars are not the same
210
+ @tm.match_species(s1, s2).should ==
211
+ { 'phonetic_match' => false, 'match' => true, 'edit_distance' => 4 }
212
+ # Should not match if Distance 4 matches and first 3 chars are not the same
134
213
  s1 = make_taxamatch_hash 'majorrrrr'
135
214
  s2 = make_taxamatch_hash 'marorraaa'
136
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
137
- #Distance 2 or 3 matches if first 1 char is the same
215
+ @tm.match_species(s1, s2).should == {
216
+ 'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
217
+ # Distance 2 or 3 matches if first 1 char is the same
138
218
  s1 = make_taxamatch_hash 'moooorrrr'
139
219
  s2 = make_taxamatch_hash 'mooooraaa'
140
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
141
- #Should not match if Distance 2 or 3 and first 1 char is not the same
220
+ @tm.match_species(s1, s2).should == { 'phonetic_match' => false,
221
+ 'match' => true, 'edit_distance' => 3 }
222
+ # Should not match if Distance 2 or 3 and first 1 char is not the same
142
223
  s1 = make_taxamatch_hash 'morrrr'
143
224
  s2 = make_taxamatch_hash 'torraa'
144
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
145
- #Distance 1 will match anywhere
225
+ @tm.match_species(s1, s2).should == {
226
+ 'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
227
+ # Distance 1 will match anywhere
146
228
  s1 = make_taxamatch_hash 'major'
147
229
  s2 = make_taxamatch_hash 'rajor'
148
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
149
- #Will not match if distance 3 and length is less then twice of the edit distance
230
+ @tm.match_species(s1, s2).should == {
231
+ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 1 }
232
+ # Will not match if distance 3 and length is less than twice
233
+ # of the edit distance
150
234
  s1 = make_taxamatch_hash 'marrr'
151
235
  s2 = make_taxamatch_hash 'maaaa'
152
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
236
+ @tm.match_species(s1, s2).should == {
237
+ 'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
153
238
  end
154
239
 
155
240
  it 'should match matches' do
156
- #No trobule case
157
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
158
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
159
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
160
- #Will not match if either genus or sp. epithet dont match
161
- gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
162
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
163
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
164
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
165
- smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
166
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
167
- #Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
168
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
169
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
170
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 5, 'match' => false}
171
- #Should not have phonetic match if one of the components does not match phonetically
172
- gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
173
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
174
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
175
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
176
- smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
177
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
178
- #edit distance should be equal the sum of of edit distances
179
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
180
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
181
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
241
+ # No trouble case
242
+ gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
243
+ smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
244
+ @tm.match_matches(gmatch, smatch).should ==
245
+ { 'phonetic_match' => true, 'edit_distance' => 2, 'match' => true }
246
+ # Will not match if either genus or sp. epithet don't match
247
+ gmatch = { 'match' => false,
248
+ 'phonetic_match' => false, 'edit_distance' => 1 }
249
+ smatch = { 'match' => true,
250
+ 'phonetic_match' => true, 'edit_distance' => 1 }
251
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
252
+ 'edit_distance' => 2, 'match' => false }
253
+ gmatch = { 'match' => true, 'phonetic_match' => true,
254
+ 'edit_distance' => 1 }
255
+ smatch = { 'match' => false, 'phonetic_match' => false,
256
+ 'edit_distance' => 1 }
257
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
258
+ 'edit_distance' => 2, 'match' => false }
259
+ # Should not match if binomial edit distance > 4
260
+ # NOTE: EVEN with full phonetic match
261
+ gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 3 }
262
+ smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
263
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => true,
264
+ 'edit_distance' => 5, 'match' => false }
265
+ # Should not have phonetic match if one of the components
266
+ # does not match phonetically
267
+ gmatch = { 'match' => true,
268
+ 'phonetic_match' => false, 'edit_distance' => 1 }
269
+ smatch = { 'match' => true,
270
+ 'phonetic_match' => true, 'edit_distance' => 1 }
271
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
272
+ 'edit_distance' => 2, 'match' => true }
273
+ gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
274
+ smatch = { 'match' => true,
275
+ 'phonetic_match' => false, 'edit_distance' => 1 }
276
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
277
+ 'edit_distance' => 2, 'match' => true }
278
+ # edit distance should be equal to the sum of the edit distances
279
+ gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
280
+ smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
281
+ @tm.match_matches(gmatch, smatch).should == {
282
+ 'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true }
182
283
  end
183
284
 
184
285
  it 'should return only boolean values' do
185
286
  @tm.taxamatch("AJLJljljlj", "sls").should_not be_nil
186
287
  @tm.taxamatch('Olsl','a')
187
288
  end
188
-
289
+
189
290
  it "should not match authors from different parts of name" do
190
291
  parser = Taxamatch::Atomizer.new
191
292
  t = Taxamatch::Base.new
@@ -199,11 +300,11 @@ describe 'Taxamatch::Base' do
199
300
  n8 = parser.parse "Betula alba Linnaeus alba Smith"
200
301
  n9 = parser.parse "Betula alba Smith alba L."
201
302
  n10 = parser.parse "Betula Linn."
202
- #if one authorship is empty, return 0
303
+ # if one authorship is empty, return 0
203
304
  t.match_authors(n1, n5).should == 0
204
305
  t.match_authors(n5, n1).should == 0
205
306
  t.match_authors(n5, n6).should == 0
206
- #if authorship matches on different levels ignore
307
+ # if authorship matches on different levels ignore
207
308
  t.match_authors(n7, n3).should == 0
208
309
  t.match_authors(n8, n3).should == -1
209
310
  t.match_authors(n2, n8).should == 0
@@ -227,29 +328,37 @@ describe 'Taxamatch::Base' do
227
328
  res.should == 90
228
329
  res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
229
330
  res.should == 0
230
- #found all authors, same year
231
- res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
331
+ # found all authors, same year
332
+ res = @am.authmatch(['Linnaeus', 'Muller'],
333
+ ['Muller', 'Linnaeus'], [1766], [1766])
232
334
  res.should == 100
233
- #all authors, 1 year diff
234
- res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
335
+ # all authors, 1 year diff
336
+ res = @am.authmatch(['Linnaeus', 'Muller'],
337
+ ['Muller', 'Linnaeus'], [1767], [1766])
235
338
  res.should == 54
236
- #year is not counted in
237
- res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
339
+ # year is not counted in
340
+ res = @am.authmatch(['Linnaeus', 'Muller'],
341
+ ['Muller', 'Linnaeus'], [1767], [])
238
342
  res.should == 94
239
- #found all authors on one side, same year
240
- res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
343
+ # found all authors on one side, same year
344
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
345
+ ['Muller', 'Linnaeus'], [1767], [1767])
241
346
  res.should == 91
242
- #found all authors on one side, 1 year diff
243
- res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
347
+ # found all authors on one side, 1 year diff
348
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
349
+ ['Muller', 'Linnaeus'], [1766], [1767])
244
350
  res.should == 51
245
- #found all authors on one side, year does not count
246
- res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
351
+ # found all authors on one side, year does not count
352
+ res = @am.authmatch(['Linnaeus', 'Muller'],
353
+ ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
247
354
  res.should == 90
248
- #found some authors
249
- res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
355
+ # found some authors
356
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
357
+ ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
250
358
  res.should == 67
251
- #if year does not match or not present no match for previous case
252
- res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
359
+ # if year does not match or not present no match for previous case
360
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
361
+ ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
253
362
  res.should == 0
254
363
  end
255
364
 
@@ -261,22 +370,29 @@ describe 'Taxamatch::Base' do
261
370
  end
262
371
 
263
372
  it 'should remove duplicate authors' do
264
- #Li submatches Linnaeus and it its size 3 is big enought to remove Linnaeus
265
- #Muller is identical
266
- res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
373
+ # Lin submatches Linnaeus and its size 3 is big enough to remove
374
+ # Linnaeus; Muller is identical
375
+ res = @am.remove_duplicate_authors(['Lin', 'Muller'],
376
+ ['Linnaeus', 'Muller'])
267
377
  res.should == [[], []]
268
- #same in different order
269
- res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
378
+ # same in different order
379
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
380
+ ['Linn', 'Muller'])
270
381
  res.should == [[], []]
271
- #auth Li submatches Linnaeus, but Li size less then 3 required to remove Linnaeus
272
- res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
382
+ # auth Li submatches Linnaeus, but Li size is less than the 3
383
+ # required to remove Linnaeus
384
+ res = @am.remove_duplicate_authors(['Dem', 'Li'],
385
+ ['Linnaeus', 'Stepanov'])
273
386
  res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
274
- #fuzzy match
275
- res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
387
+ # fuzzy match
388
+ res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'],
389
+ ['Linnaeus', 'Stepanov'])
276
390
  res.should == [["Dem"], ["Stepanov"]]
277
- res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
391
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
392
+ ['L', 'Kenn'])
278
393
  res.should == [['Linnaeus', 'Muller'], ['Kenn']]
279
- res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
394
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
395
+ ['Muller', 'Linnaeus', 'Kurtz'])
280
396
  res.should == [[],['Kurtz']]
281
397
  end
282
398
 
@@ -288,5 +404,3 @@ describe 'Taxamatch::Base' do
288
404
  end
289
405
 
290
406
  end
291
-
292
-