dimus-taxamatch_rb 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
data/lib/taxamatch_rb.rb CHANGED
@@ -25,9 +25,13 @@ class Taxamatch
25
25
  #takes two hashes of parsed scientific names, analyses them and returns back the match result;
26
26
  #this function is useful when species strings are preparsed.
27
27
  def taxamatch_parsed_data(parsed_data_1, parsed_data_2)
28
- return match_uninomial(parsed_data_1, parsed_data_2) if parsed_data_1[:uninomial] && parsed_data_2[:uninomial]
29
- return match_multinomial(parsed_data_1, parsed_data_2) if parsed_data_1[:genus] && parsed_data_2[:genus]
30
- return false
28
+ result = nil
29
+ result = match_uninomial(parsed_data_1, parsed_data_2) if parsed_data_1[:uninomial] && parsed_data_2[:uninomial]
30
+ result = match_multinomial(parsed_data_1, parsed_data_2) if parsed_data_1[:genus] && parsed_data_2[:genus]
31
+ if result && result[:match]
32
+ result[:match] = match_authors(parsed_data_1, parsed_data_2) > 0 ? true : false
33
+ end
34
+ return result
31
35
  end
32
36
 
33
37
  def match_uninomial(parsed_data_1, parsed_data_2)
@@ -72,7 +76,7 @@ class Taxamatch
72
76
  au2 = parsed_data_2[:all_authors]
73
77
  yr1 = parsed_data_1[:all_years]
74
78
  yr2 = parsed_data_2[:all_years]
75
- #Authormatch.compare_authorities(au1, au2, yr1, yr2)
79
+ Authmatch.authmatch(au1, au2, yr1, yr2)
76
80
  end
77
81
 
78
82
  def match_matches(genus_match, species_match, infraspecies_matches = [])
@@ -84,4 +88,4 @@ class Taxamatch
84
88
  match
85
89
  end
86
90
 
87
- end
91
+ end
@@ -1,86 +1,85 @@
1
1
  class Authmatch
2
2
 
3
3
  def self.authmatch(authors1, authors2, years1, years2)
4
- return true
5
4
  unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
6
5
  year_difference = compare_years(years1, years2)
6
+ get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
7
+ end
7
8
 
8
-
9
- #return get_score_author_comparison(authors1, unique_authors1, authors2, unique_authors2, year_difference, 50, true);
9
+ def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
10
+ count_before = authors1.size + authors2.size
11
+ count_after = unique_authors1.size + unique_authors2.size
12
+ score = 0
13
+ if count_after == 0
14
+ if year_diff != nil
15
+ if year_diff == 0
16
+ score = 100
17
+ elsif year_diff == 1
18
+ score = 54
19
+ end
20
+ else
21
+ score = 94
22
+ end
23
+ elsif unique_authors1.size > 0 || unique_authors2.size > 0
24
+ if year_diff != nil
25
+ if year_diff == 0
26
+ score = 91
27
+ elsif year_diff == 1
28
+ score = 51
29
+ end
30
+ else
31
+ score = 90
32
+ end
33
+ else
34
+ score = ((1 - count_after.to_f/count_before.to_f) * 100).round
35
+ score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
36
+ end
37
+ score > 50 ? score : 0
10
38
  end
11
39
 
12
- def self.remove_duplicate_authors(author1, authors2)
13
- au1_match = au2_match = false
14
- au1_match.each do |au1|
15
- match1 = false
16
- au1_match.each do |au2|
17
- match2 = false
40
+ def self.remove_duplicate_authors(authors1, authors2)
41
+ unique_authors1 = authors1.dup
42
+ unique_authors2 = authors2.dup
43
+ authors1.each do |au1|
44
+ au1_match = false
45
+ authors2.each do |au2|
46
+ au2_match = false
18
47
  if au1 == au2
19
- match1 = match2 = true
20
- elsif au1.size < au2.size
21
- match1 = true if au1 == au2[0..au1.size]
22
- elseif
48
+ au1_match = au2_match = true if au1 == au2
49
+ elsif au1 == au2[0...au1.size]
50
+ au1_match = true
51
+ elsif au1[0...au2.size] == au2
52
+ au2_match = true
53
+ end
54
+ if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
55
+ unique_authors1.delete au1
56
+ unique_authors2.delete au2
57
+ elsif au1_match
58
+ unique_authors1.delete au1
59
+ elsif au2_match
60
+ unique_authors2.delete au2
61
+ else
62
+ if self.fuzzy_match_authors(au1, au2)
63
+ unique_authors1.delete au1
64
+ unique_authors2.delete au2
65
+ end
23
66
  end
24
67
  end
25
68
  end
69
+ [unique_authors1, unique_authors2]
70
+ end
71
+
72
+ def self.fuzzy_match_authors(author1, author2)
73
+ au1_length = author1.size
74
+ au2_length = author2.size
75
+ dlm = DamerauLevenshteinMod.new
76
+ ed = dlm.distance(author1, author2,2,3)
77
+ (ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
26
78
  end
27
79
 
28
80
  def self.compare_years(years1, years2)
29
81
  return 0 if years1 == [] && years2 == []
30
- return (years1[0] - years2[0]).abs if years1.size == 1 && years2.size == 1
82
+ return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
31
83
  nil
32
84
  end
33
85
  end
34
-
35
- =begin
36
- foreach($author_words1 as $key1 => $author1)
37
- {
38
- $author1_matches = false;
39
- $author1 = Normalize::normalize_author_string($author1);
40
- foreach($author_words2 as $key2 => $author2)
41
- {
42
- $author2_matches = false;
43
- $author2 = Normalize::normalize_author_string($author2);
44
-
45
- if($author1 == $author2)
46
- {
47
- $author1_matches = true;
48
- $author2_matches = true;
49
- }elseif(preg_match("/^".preg_quote($author1, "/")."/i", $author2))
50
- {
51
- $author1_matches = true;
52
- }elseif(preg_match("/^".preg_quote($author2, "/")."/i", $author1))
53
- {
54
- $author2_matches = true;
55
- }
56
-
57
- // equal or one is contained in the other, so consider it a match for both terms
58
- if((strlen($author1)>=3 && $author1_matches) || (strlen($author2)>=3 && $author2_matches) || $author1 == $author2)
59
- {
60
- unset($unique_authors1[$key1]);
61
- unset($unique_authors2[$key2]);
62
- }elseif($author1_matches)
63
- {
64
- // author1 was abbreviation of author2
65
- unset($unique_authors1[$key1]);
66
- }elseif($author2_matches)
67
- {
68
- // author2 was abbreviation of author1
69
- unset($unique_authors2[$key2]);
70
- }else
71
- {
72
- // no match or abbreviation so try a fuzzy match
73
- $max_length = max(strlen($author1), strlen($author2));
74
- $lev = levenshtein($author1, $author2);
75
- if(($lev/$max_length) <= .167)
76
- {
77
- unset($unique_authors1[$key1]);
78
- unset($unique_authors2[$key2]);
79
- }
80
- }
81
- }
82
- reset($author_words2);
83
- }
84
-
85
-
86
- =end
@@ -41,8 +41,7 @@ describe 'Normalizer' do
41
41
  Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
42
42
  Normalizer.normalize('Ærenea').should == 'AERENEA'
43
43
  Normalizer.normalize('Fallén').should == 'FALLEN'
44
- Normalizer.normalize('abcd').should == 'ABCD'
45
- Normalizer.normalize('abcd').should == 'ABCD'
44
+ Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
46
45
  end
47
46
 
48
47
  it 'should normalize words' do
@@ -180,12 +179,33 @@ describe 'Taxamatch' do
180
179
  @am = Authmatch
181
180
  end
182
181
 
182
+ it 'should calculate score' do
183
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['L', 'Kenn'], [], [1788])
184
+ res.should == 90
185
+ end
186
+
183
187
  it 'should compare years' do
184
188
  @am.compare_years([1882],[1880]).should == 2
185
189
  @am.compare_years([1882],[]).should == nil
186
190
  @am.compare_years([],[]).should == 0
187
191
  @am.compare_years([1788,1798], [1788,1798]).should be_nil
188
192
  end
193
+
194
+ it 'should remove duplicate authors' do
195
+ #Lin submatches Linnaeus and its size of 3 is big enough to remove Linnaeus
196
+ #Muller is identical
197
+ res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
198
+ res.should == [[], []]
199
+ #same in different order
200
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
201
+ res.should == [[], []]
202
+ #auth Li submatches Linnaeus, but Li's size is less than the 3 required to remove Linnaeus
203
+ res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
204
+ res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
205
+ #fuzzy match
206
+ res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
207
+ res.should == [["Dem"], ["Stepanov"]]
208
+ end
189
209
  end
190
210
 
191
211
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dimus-taxamatch_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-02 00:00:00 -07:00
12
+ date: 2009-08-06 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency