taxamatch_rb 0.9.10 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,15 +1,19 @@
1
- # Algorithms for Taxamatch::Authmatch are developed by Patrick Leary of uBio and EOL fame
1
+ # Algorithms for Taxamatch::Authmatch
2
+ # are developed by Patrick Leary of uBio and EOL fame
2
3
 
3
4
  module Taxamatch
4
5
  class Authmatch
5
6
 
6
7
  def self.authmatch(authors1, authors2, years1, years2)
7
- unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
8
+ unique_authors1, unique_authors2 =
9
+ remove_duplicate_authors(authors1, authors2)
8
10
  year_difference = compare_years(years1, years2)
9
- get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
11
+ get_score(authors1, unique_authors1,
12
+ authors2, unique_authors2, year_difference)
10
13
  end
11
-
12
- def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
14
+
15
+ def self.get_score(authors1, unique_authors1,
16
+ authors2, unique_authors2, year_diff)
13
17
  count_before = authors1.size + authors2.size
14
18
  count_after = unique_authors1.size + unique_authors2.size
15
19
  score = 0
@@ -18,7 +22,7 @@ module Taxamatch
18
22
  if year_diff == 0
19
23
  score = 100
20
24
  elsif year_diff == 1
21
- score = 54
25
+ score = 54
22
26
  end
23
27
  else
24
28
  score = 94
@@ -35,11 +39,11 @@ module Taxamatch
35
39
  end
36
40
  else
37
41
  score = ((1 - count_after.to_f/count_before.to_f) * 100).round
38
- score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
42
+ score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
39
43
  end
40
44
  score > 50 ? score : 0
41
45
  end
42
-
46
+
43
47
  def self.remove_duplicate_authors(authors1, authors2)
44
48
  unique_authors1 = authors1.dup
45
49
  unique_authors2 = authors2.dup
@@ -48,12 +52,14 @@ module Taxamatch
48
52
  au1_match = au2_match = false
49
53
  if au1 == au2
50
54
  au1_match = au2_match = true
51
- elsif au1 == au2[0...au1.size]
55
+ elsif au1 == au2[0...au1.size]
52
56
  au1_match = true
53
57
  elsif au1[0...au2.size] == au2
54
58
  au2_match = true
55
59
  end
56
- if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
60
+ if (au1.size >= 3 && au1_match) ||
61
+ (au2.size >= 3 && au2_match) ||
62
+ (au1_match && au2_match)
57
63
  unique_authors1.delete au1
58
64
  unique_authors2.delete au2
59
65
  elsif au1_match
@@ -61,8 +67,11 @@ module Taxamatch
61
67
  elsif au2_match
62
68
  unique_authors2.delete au2
63
69
  else
64
- #TODO: masking a bug in damerau levenshtsin mod which appears comparing 1letter to a longer string
65
- if au1.size > 1 && au2.size > 1 && self.fuzzy_match_authors(au1, au2)
70
+ #TODO: masking a bug in damerau levenshtein
71
+ # mod which appears when comparing 1 letter to a longer string
72
+ if au1.size > 1 &&
73
+ au2.size > 1 &&
74
+ self.fuzzy_match_authors(au1, au2)
66
75
  unique_authors1.delete au1
67
76
  unique_authors2.delete au2
68
77
  end
@@ -71,18 +80,22 @@ module Taxamatch
71
80
  end
72
81
  [unique_authors1, unique_authors2]
73
82
  end
74
-
83
+
75
84
  def self.fuzzy_match_authors(author1, author2)
76
85
  au1_length = author1.size
77
86
  au2_length = author2.size
78
87
  dlm = DamerauLevenshtein
79
- ed = dlm.distance(author1, author2,1,3) #get around a bug in C code, but it really has to be fixed
80
- (ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
88
+ #get around a bug in C code, but it really has to be fixed
89
+ ed = dlm.distance(author1, author2,1,3)
90
+ (ed <= 3 && ([au1_length, au2_length].min > ed * 2) &&
91
+ (ed < 2 || author1[0] == author2[0]))
81
92
  end
82
93
 
83
94
  def self.compare_years(years1, years2)
84
95
  return 0 if years1 == [] && years2 == []
85
- return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
96
+ if years1.size == 1 && years2.size == 1
97
+ return (years1[0].to_i - years2[0].to_i).abs
98
+ end
86
99
  nil
87
100
  end
88
101
  end
@@ -1,16 +1,16 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  module Taxamatch
4
-
4
+
5
5
  module Normalizer
6
6
  def self.normalize(string)
7
7
  utf8_to_ascii(string.strip.upcase).gsub(/[^\x00-\x7F]/,'?')
8
8
  end
9
-
9
+
10
10
  def self.normalize_word(word)
11
11
  self.normalize(word).gsub(/[^A-Z0-9\-]/, '').strip
12
12
  end
13
-
13
+
14
14
  def self.normalize_author(string)
15
15
  self.normalize(string).gsub(/[^A-Z]/, ' ').gsub(/[\s]{2,}/, ' ').strip
16
16
  end
@@ -20,7 +20,7 @@ module Taxamatch
20
20
  year_int = nil unless year_int.between?(1757, Time.now.year + 1)
21
21
  year_int
22
22
  end
23
-
23
+
24
24
 
25
25
  private
26
26
  def self.utf8_to_ascii(string)
@@ -2,11 +2,11 @@
2
2
  module Taxamatch
3
3
 
4
4
  module Phonetizer
5
-
5
+
6
6
  def self.phonetize(a_word, normalize_ending = false)
7
7
  self.near_match(a_word, normalize_ending)
8
8
  end
9
-
9
+
10
10
  def self.near_match(a_word, normalize_ending = false)
11
11
  a_word = a_word.strip rescue ''
12
12
  return '' if a_word == ''
@@ -50,7 +50,7 @@ module Taxamatch
50
50
  a_word = 'Z' + a_word[1..-1]
51
51
  end
52
52
  first_char = a_word.split('')[0]
53
- rest_chars = a_word.split('')[1..-1].join('')
53
+ rest_chars = a_word.split('')[1..-1].join('')
54
54
  rest_chars.gsub!('AE', 'I')
55
55
  rest_chars.gsub!('IA', 'A')
56
56
  rest_chars.gsub!('OE', 'I')
@@ -59,21 +59,22 @@ module Taxamatch
59
59
  rest_chars.gsub!('H', '')
60
60
  rest_chars.tr!('EOUYKZ', 'IAIICS')
61
61
  a_word = (first_char + rest_chars).squeeze
62
-
62
+
63
63
  if normalize_ending && a_word.size > 4
64
64
  a_word = self.normalize_ending(a_word)
65
65
  end
66
66
  a_word
67
67
  end
68
-
68
+
69
69
  def self.normalize_ending(a_word)
70
- # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
70
+ # -- deal with variant endings
71
+ # -is (includes -us, -ys, -es), -im (was -um), -as (-os)
71
72
  # -- at the end of a string translate all to -a
72
73
  a_word.gsub!(/IS$/, 'A')
73
74
  a_word.gsub!(/IM$/, 'A')
74
75
  a_word.gsub(/AS$/, 'A')
75
76
  end
76
-
77
+
77
78
  end
78
79
 
79
- end
80
+ end
@@ -7,25 +7,81 @@ describe 'Atomizer' do
7
7
  end
8
8
 
9
9
  it 'should parse uninomials' do
10
- @parser.parse('Betula').should == {:all_authors=>[], :all_years=>[], :canonical_form=>"Betula", :uninomial=>{:string=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[], :normalized_authors=>[]}}
11
- @parser.parse('Ærenea Lacordaire, 1872').should == {:all_authors=>["LACORDAIRE"], :all_years=>[1872], :canonical_form=>"Aerenea", :uninomial=>{:string=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA", :authors=>["Lacordaire"], :years=>[1872], :normalized_authors=>["LACORDAIRE"]}}
10
+ @parser.parse('Betula').should == { :all_authors => [], :all_years => [],
11
+ :canonical_form => "Betula", :uninomial => { :string => "Betula",
12
+ :normalized => 'BETULA', :phonetized => "BITILA", :authors => [],
13
+ :years => [], :normalized_authors => [] } }
14
+ @parser.parse('Ærenea Lacordaire, 1872').should == {
15
+ :all_authors => ["LACORDAIRE"], :all_years => [1872],
16
+ :canonical_form => "Aerenea", :uninomial => { :string => "Aerenea",
17
+ :normalized => "AERENEA", :phonetized => "ERINIA",
18
+ :authors => ["Lacordaire"], :years => [1872],
19
+ :normalized_authors => ["LACORDAIRE"] } }
12
20
  end
13
21
 
14
22
  it 'should parse binomials' do
15
- @parser.parse('Leœptura laetifica Dow, 1913').should == {:all_authors=>["DOW"], :all_years=>[1913], :canonical_form=>"Leoeptura laetifica", :genus=>{:string=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :authors=>["Dow"], :years=>[1913], :normalized_authors=>["DOW"]}}
23
+ @parser.parse('Leœptura laetifica Dow, 1913').should == {
24
+ :all_authors => ["DOW"], :all_years => [1913],
25
+ :canonical_form => "Leoeptura laetifica", :genus => {
26
+ :string => "Leoeptura", :normalized => "LEOEPTURA",
27
+ :phonetized => "LIPTIRA", :authors => [], :years => [],
28
+ :normalized_authors => []}, :species => {
29
+ :string => "laetifica", :normalized => "LAETIFICA",
30
+ :phonetized => "LITIFICA", :authors => ["Dow"],
31
+ :years => [1913], :normalized_authors => ["DOW"] } }
16
32
  end
17
33
 
18
34
  it 'should parse trinomials' do
19
- @parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == {:all_authors=>["BANKER", "D HALL", "D E STUNTZ"], :all_years=>[1972], :canonical_form=>"Hydnellum scrobiculatum zonatum", :genus=>{:string=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>[], :years=>[], :normalized_authors=>[]}, :infraspecies=>[{:string=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>[1972], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]}
35
+ @parser.parse('Hydnellum scrobiculatum zonatum ' +
36
+ '(Banker) D. Hall et D.E. Stuntz 1972').should == {
37
+ :all_authors => ["BANKER", "D HALL", "D E STUNTZ"], :all_years => [1972],
38
+ :canonical_form => "Hydnellum scrobiculatum zonatum", :genus=>{
39
+ :string => "Hydnellum", :normalized => "HYDNELLUM",
40
+ :phonetized => "HIDNILIM", :authors => [], :years => [],
41
+ :normalized_authors => [] }, :species => { :string => "scrobiculatum",
42
+ :normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
43
+ :authors => [], :years => [], :normalized_authors => [] },
44
+ :infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
45
+ :phonetized => "ZANATA", :authors => ["Banker", "D. Hall", "D.E. Stuntz"],
46
+ :years => [1972], :normalized_authors => ["BANKER", "D HALL",
47
+ "D E STUNTZ"] }] }
20
48
  end
21
49
 
22
50
  it 'should normalize years to integers' do
23
51
  future_year = Time.now.year + 10
24
- @parser.parse("Hydnellum scrobiculatum Kern #{future_year} zonatum (Banker) D. Hall et D.E. Stuntz 1972?").should == {:all_authors=>["KERN", "BANKER", "D HALL", "D E STUNTZ"], :all_years=>[1972], :canonical_form=>"Hydnellum scrobiculatum zonatum", :genus=>{:string=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]}, :species=>{:string=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>["Kern"], :years=>[], :normalized_authors=>["KERN"]}, :infraspecies=>[{:string=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>[1972], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]}
52
+ @parser.parse("Hydnellum scrobiculatum Kern #{future_year} " +
53
+ "zonatum (Banker) D. Hall et D.E. Stuntz 1972?").should == {
54
+ :all_authors => ["KERN", "BANKER", "D HALL", "D E STUNTZ"],
55
+ :all_years => [1972],
56
+ :canonical_form => "Hydnellum scrobiculatum zonatum", :genus => {
57
+ :string => "Hydnellum", :normalized => "HYDNELLUM",
58
+ :phonetized => "HIDNILIM", :authors => [], :years => [],
59
+ :normalized_authors => [] }, :species => { :string => "scrobiculatum",
60
+ :normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
61
+ :authors => ["Kern"], :years => [], :normalized_authors => ["KERN"] },
62
+ :infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
63
+ :phonetized => "ZANATA", :authors =>
64
+ ["Banker", "D. Hall", "D.E. Stuntz"], :years => [1972],
65
+ :normalized_authors => ["BANKER", "D HALL", "D E STUNTZ"] }] }
25
66
  end
26
67
 
27
68
  it 'should normalize names with abbreviated genus after cf.' do
28
- @parser.parse('Unio cf. U. alba').should == {:all_authors=>[], :all_years=>[], :canonical_form=>"Unio", :genus=>{:string=>"Unio", :normalized=>"UNIO", :phonetized=>"UNIA", :authors=>[], :years=>[], :normalized_authors=>[]}}
69
+ @parser.parse('Unio cf. U. alba').should == { :all_authors => [],
70
+ :all_years => [], :canonical_form => "Unio",
71
+ :genus => { :string => "Unio", :normalized => "UNIO",
72
+ :phonetized => "UNIA", :authors => [], :years => [],
73
+ :normalized_authors => [] } }
74
+ end
75
+
76
+ it 'should parse names which broke it before' do
77
+ ['Parus caeruleus species complex',
78
+ 'Euxoa nr. idahoensis sp. 1clay',
79
+ 'Cetraria islandica ? islandica',
80
+ 'Buteo borealis ? ventralis'].each do |n|
81
+ res = @parser.parse(n)
82
+ res.class.should == Hash
83
+ res.empty?.should be_false
84
+ end
29
85
  end
30
86
  end
31
87
 
@@ -38,12 +94,14 @@ describe 'Taxamatch::Normalizer' do
38
94
  Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
39
95
  Taxamatch::Normalizer.normalize('Fallé€n').should == 'FALLE?N'
40
96
  Taxamatch::Normalizer.normalize('Fallén привет').should == 'FALLEN ??????'
41
- Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
97
+ Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should ==
98
+ 'CHORIOZOPELLA TRAGARDHI'
42
99
  Taxamatch::Normalizer.normalize('×Zygomena').should == 'xZYGOMENA'
43
100
  end
44
101
 
45
102
  it 'should normalize words' do
46
- Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
103
+ Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should ==
104
+ 'L-3EOEPTURA'
47
105
  end
48
106
  end
49
107
 
@@ -53,7 +111,8 @@ describe 'Taxamatch::Base' do
53
111
  end
54
112
 
55
113
  it 'should get txt tests' do
56
- read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
114
+ test_file = File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt'
115
+ read_test_file(test_file, 4) do |y|
57
116
  if y
58
117
  y[2] = y[2] == 'true' ? true : false
59
118
  res = @tm.taxamatch(y[0], y[1], false)
@@ -65,127 +124,169 @@ describe 'Taxamatch::Base' do
65
124
  end
66
125
 
67
126
  it 'should work with names that cannot be parsed' do
68
- res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921','Quadraspidiotus ostreaeformis Curtis)')
127
+ res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921',
128
+ 'Quadraspidiotus ostreaeformis Curtis)')
69
129
  res = false
70
130
  end
71
131
 
72
132
  it 'should compare genera' do
73
- #edit distance 1 always match
133
+ # edit distance 1 always match
74
134
  g1 = make_taxamatch_hash 'Plantago'
75
135
  g2 = make_taxamatch_hash 'Plantagon'
76
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
77
- #edit_distance above threshold does not math
136
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
137
+ 'edit_distance' => 1, 'match' => true }
138
+ # edit_distance above threshold does not match
78
139
  g1 = make_taxamatch_hash 'Plantago'
79
140
  g2 = make_taxamatch_hash 'This shouldnt match'
80
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
81
- #phonetic_match matches
141
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
142
+ 'match' => false, 'edit_distance' => 4 }
143
+ # phonetic_match matches
82
144
  g1 = make_taxamatch_hash 'Plantagi'
83
145
  g2 = make_taxamatch_hash 'Plantagy'
84
- @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
85
- @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
86
- #distance 1 in first letter also matches
146
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
147
+ 'edit_distance' => 1, 'match' => true }
148
+ @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
149
+ 'phonetic_match' => false, 'edit_distance' => 1, 'match' => true }
150
+ # distance 1 in first letter also matches
87
151
  g1 = make_taxamatch_hash 'Xantheri'
88
152
  g2 = make_taxamatch_hash 'Pantheri'
89
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
90
- #phonetic match tramps everything
153
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
154
+ 'edit_distance' => 1, 'match' => true }
155
+ # phonetic match trumps everything
91
156
  g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
92
157
  g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
93
- @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
94
- @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'edit_distance' => 4, 'match' => false}
95
- #same first letter and distance 2 should match
158
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
159
+ 'edit_distance' => 4, 'match' => true }
160
+ @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
161
+ 'phonetic_match' => false, 'edit_distance' => 4, 'match' => false }
162
+ # same first letter and distance 2 should match
96
163
  g1 = make_taxamatch_hash 'Xaaaantherii'
97
164
  g2 = make_taxamatch_hash 'Xaaaantherrr'
98
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
99
- #First letter is the same and distance is 3 should match, no phonetic match
165
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
166
+ 'match' => true, 'edit_distance' => 2 }
167
+ # First letter is the same and distance is 3 should match, no phonetic match
100
168
  g1 = make_taxamatch_hash 'Xaaaaaaaaaaantheriii'
101
169
  g2 = make_taxamatch_hash 'Xaaaaaaaaaaantherrrr'
102
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
103
- #Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
170
+ @tm.match_genera(g1, g2).should ==
171
+ { 'phonetic_match' => false, 'match' => true, 'edit_distance' => 3 }
172
+ # Should not match if one of words is shorter than 2x edit
173
+ # distance and distance is 2 or 3
104
174
  g1 = make_taxamatch_hash 'Xant'
105
175
  g2 = make_taxamatch_hash 'Xanthe'
106
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
107
- #Should not match if edit distance > 3 and no phonetic match
176
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
177
+ 'match' => false, 'edit_distance' => 2 }
178
+ # Should not match if edit distance > 3 and no phonetic match
108
179
  g1 = make_taxamatch_hash 'Xantheriiii'
109
180
  g2 = make_taxamatch_hash 'Xantherrrrr'
110
- @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
181
+ @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
182
+ 'match' => false, 'edit_distance' => 4 }
111
183
  end
112
184
 
113
185
  it 'should compare species' do
114
- #Exact match
186
+ # Exact match
115
187
  s1 = make_taxamatch_hash 'major'
116
188
  s2 = make_taxamatch_hash 'major'
117
- @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
118
- @tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 0}
119
- #Phonetic match always works
189
+ @tm.match_species(s1, s2).should == { 'phonetic_match' => true,
190
+ 'match' => true, 'edit_distance' => 0 }
191
+ @tm.match_species(s1, s2, :with_phonetic_match => false).should == {
192
+ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 0 }
193
+ # Phonetic match always works
120
194
  s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
121
195
  s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
122
- @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 4}
123
- @tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
124
- #Phonetic match works with different endings
196
+ @tm.match_species(s1, s2).should == { 'phonetic_match' => true,
197
+ 'match' => true, 'edit_distance' => 4 }
198
+ @tm.match_species(s1, s2, :with_phonetic_match => false).should ==
199
+ { 'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
200
+ # Phonetic match works with different endings
125
201
  s1 = make_taxamatch_hash 'majorum'
126
202
  s2 = make_taxamatch_hash 'majoris'
127
- @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
128
- @tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
129
- #Distance 4 matches if first 3 chars are the same
203
+ @tm.match_species(s1, s2).should == {
204
+ 'phonetic_match' => true, 'match' => true, 'edit_distance' => 2 }
205
+ @tm.match_species(s1, s2, :with_phonetic_match => false).should ==
206
+ { 'phonetic_match' => false, 'match' => true, 'edit_distance' => 2 }
207
+ # Distance 4 matches if first 3 chars are the same
130
208
  s1 = make_taxamatch_hash 'majjjjorrrrr'
131
209
  s2 = make_taxamatch_hash 'majjjjoraaaa'
132
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
133
- #Should not match if Distance 4 matches and first 3 chars are not the same
210
+ @tm.match_species(s1, s2).should ==
211
+ { 'phonetic_match' => false, 'match' => true, 'edit_distance' => 4 }
212
+ # Should not match if Distance 4 matches and first 3 chars are not the same
134
213
  s1 = make_taxamatch_hash 'majorrrrr'
135
214
  s2 = make_taxamatch_hash 'marorraaa'
136
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
137
- #Distance 2 or 3 matches if first 1 char is the same
215
+ @tm.match_species(s1, s2).should == {
216
+ 'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
217
+ # Distance 2 or 3 matches if first 1 char is the same
138
218
  s1 = make_taxamatch_hash 'moooorrrr'
139
219
  s2 = make_taxamatch_hash 'mooooraaa'
140
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
141
- #Should not match if Distance 2 or 3 and first 1 char is not the same
220
+ @tm.match_species(s1, s2).should == { 'phonetic_match' => false,
221
+ 'match' => true, 'edit_distance' => 3 }
222
+ # Should not match if Distance 2 or 3 and first 1 char is not the same
142
223
  s1 = make_taxamatch_hash 'morrrr'
143
224
  s2 = make_taxamatch_hash 'torraa'
144
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
145
- #Distance 1 will match anywhere
225
+ @tm.match_species(s1, s2).should == {
226
+ 'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
227
+ # Distance 1 will match anywhere
146
228
  s1 = make_taxamatch_hash 'major'
147
229
  s2 = make_taxamatch_hash 'rajor'
148
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
149
- #Will not match if distance 3 and length is less then twice of the edit distance
230
+ @tm.match_species(s1, s2).should == {
231
+ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 1 }
232
+ # Will not match if distance 3 and length is less than twice
233
+ # of the edit distance
150
234
  s1 = make_taxamatch_hash 'marrr'
151
235
  s2 = make_taxamatch_hash 'maaaa'
152
- @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
236
+ @tm.match_species(s1, s2).should == {
237
+ 'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
153
238
  end
154
239
 
155
240
  it 'should match matches' do
156
- #No trobule case
157
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
158
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
159
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
160
- #Will not match if either genus or sp. epithet dont match
161
- gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
162
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
163
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
164
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
165
- smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
166
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
167
- #Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
168
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
169
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
170
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 5, 'match' => false}
171
- #Should not have phonetic match if one of the components does not match phonetically
172
- gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
173
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
174
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
175
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
176
- smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
177
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
178
- #edit distance should be equal the sum of of edit distances
179
- gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
180
- smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
181
- @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
241
+ # No trouble case
242
+ gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
243
+ smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
244
+ @tm.match_matches(gmatch, smatch).should ==
245
+ { 'phonetic_match' => true, 'edit_distance' => 2, 'match' => true }
246
+ # Will not match if either genus or sp. epithet don't match
247
+ gmatch = { 'match' => false,
248
+ 'phonetic_match' => false, 'edit_distance' => 1 }
249
+ smatch = { 'match' => true,
250
+ 'phonetic_match' => true, 'edit_distance' => 1 }
251
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
252
+ 'edit_distance' => 2, 'match' => false }
253
+ gmatch = { 'match' => true, 'phonetic_match' => true,
254
+ 'edit_distance' => 1 }
255
+ smatch = { 'match' => false, 'phonetic_match' => false,
256
+ 'edit_distance' => 1 }
257
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
258
+ 'edit_distance' => 2, 'match' => false }
259
+ # Should not match if binomial edit distance > 4
260
+ # NOTE: EVEN with full phonetic match
261
+ gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 3 }
262
+ smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
263
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => true,
264
+ 'edit_distance' => 5, 'match' => false }
265
+ # Should not have phonetic match if one of the components
266
+ # does not match phonetically
267
+ gmatch = { 'match' => true,
268
+ 'phonetic_match' => false, 'edit_distance' => 1 }
269
+ smatch = { 'match' => true,
270
+ 'phonetic_match' => true, 'edit_distance' => 1 }
271
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
272
+ 'edit_distance' => 2, 'match' => true }
273
+ gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
274
+ smatch = { 'match' => true,
275
+ 'phonetic_match' => false, 'edit_distance' => 1 }
276
+ @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
277
+ 'edit_distance' => 2, 'match' => true }
278
+ # edit distance should equal the sum of the edit distances
279
+ gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
280
+ smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
281
+ @tm.match_matches(gmatch, smatch).should == {
282
+ 'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true }
182
283
  end
183
284
 
184
285
  it 'should return only boolean values' do
185
286
  @tm.taxamatch("AJLJljljlj", "sls").should_not be_nil
186
287
  @tm.taxamatch('Olsl','a')
187
288
  end
188
-
289
+
189
290
  it "should not match authors from different parts of name" do
190
291
  parser = Taxamatch::Atomizer.new
191
292
  t = Taxamatch::Base.new
@@ -199,11 +300,11 @@ describe 'Taxamatch::Base' do
199
300
  n8 = parser.parse "Betula alba Linnaeus alba Smith"
200
301
  n9 = parser.parse "Betula alba Smith alba L."
201
302
  n10 = parser.parse "Betula Linn."
202
- #if one authorship is empty, return 0
303
+ # if one authorship is empty, return 0
203
304
  t.match_authors(n1, n5).should == 0
204
305
  t.match_authors(n5, n1).should == 0
205
306
  t.match_authors(n5, n6).should == 0
206
- #if authorship matches on different levels ignore
307
+ # if authorship matches on different levels ignore
207
308
  t.match_authors(n7, n3).should == 0
208
309
  t.match_authors(n8, n3).should == -1
209
310
  t.match_authors(n2, n8).should == 0
@@ -227,29 +328,37 @@ describe 'Taxamatch::Base' do
227
328
  res.should == 90
228
329
  res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
229
330
  res.should == 0
230
- #found all authors, same year
231
- res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
331
+ # found all authors, same year
332
+ res = @am.authmatch(['Linnaeus', 'Muller'],
333
+ ['Muller', 'Linnaeus'], [1766], [1766])
232
334
  res.should == 100
233
- #all authors, 1 year diff
234
- res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
335
+ # all authors, 1 year diff
336
+ res = @am.authmatch(['Linnaeus', 'Muller'],
337
+ ['Muller', 'Linnaeus'], [1767], [1766])
235
338
  res.should == 54
236
- #year is not counted in
237
- res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
339
+ # year is not counted in
340
+ res = @am.authmatch(['Linnaeus', 'Muller'],
341
+ ['Muller', 'Linnaeus'], [1767], [])
238
342
  res.should == 94
239
- #found all authors on one side, same year
240
- res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
343
+ # found all authors on one side, same year
344
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
345
+ ['Muller', 'Linnaeus'], [1767], [1767])
241
346
  res.should == 91
242
- #found all authors on one side, 1 year diff
243
- res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
347
+ # found all authors on one side, 1 year diff
348
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
349
+ ['Muller', 'Linnaeus'], [1766], [1767])
244
350
  res.should == 51
245
- #found all authors on one side, year does not count
246
- res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
351
+ # found all authors on one side, year does not count
352
+ res = @am.authmatch(['Linnaeus', 'Muller'],
353
+ ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
247
354
  res.should == 90
248
- #found some authors
249
- res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
355
+ # found some authors
356
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
357
+ ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
250
358
  res.should == 67
251
- #if year does not match or not present no match for previous case
252
- res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
359
+ # if year does not match or not present no match for previous case
360
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
361
+ ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
253
362
  res.should == 0
254
363
  end
255
364
 
@@ -261,22 +370,29 @@ describe 'Taxamatch::Base' do
261
370
  end
262
371
 
263
372
  it 'should remove duplicate authors' do
264
- #Li submatches Linnaeus and it its size 3 is big enought to remove Linnaeus
265
- #Muller is identical
266
- res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
373
+ # Lin submatches Linnaeus and its size 3 is big enough to remove
374
+ # Linnaeus; Muller is identical
375
+ res = @am.remove_duplicate_authors(['Lin', 'Muller'],
376
+ ['Linnaeus', 'Muller'])
267
377
  res.should == [[], []]
268
- #same in different order
269
- res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
378
+ # same in different order
379
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
380
+ ['Linn', 'Muller'])
270
381
  res.should == [[], []]
271
- #auth Li submatches Linnaeus, but Li size less then 3 required to remove Linnaeus
272
- res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
382
+ # auth Li submatches Linnaeus, but Li size is less than the 3
383
+ # required to remove Linnaeus
384
+ res = @am.remove_duplicate_authors(['Dem', 'Li'],
385
+ ['Linnaeus', 'Stepanov'])
273
386
  res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
274
- #fuzzy match
275
- res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
387
+ # fuzzy match
388
+ res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'],
389
+ ['Linnaeus', 'Stepanov'])
276
390
  res.should == [["Dem"], ["Stepanov"]]
277
- res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
391
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
392
+ ['L', 'Kenn'])
278
393
  res.should == [['Linnaeus', 'Muller'], ['Kenn']]
279
- res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
394
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
395
+ ['Muller', 'Linnaeus', 'Kurtz'])
280
396
  res.should == [[],['Kurtz']]
281
397
  end
282
398
 
@@ -288,5 +404,3 @@ describe 'Taxamatch::Base' do
288
404
  end
289
405
 
290
406
  end
291
-
292
-