dimus-taxamatch_rb 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/taxamatch_rb.rb +9 -5
- data/lib/taxamatch_rb/authmatch.rb +66 -67
- data/spec/taxamatch_rb_spec.rb +22 -2
- metadata +2 -2
data/lib/taxamatch_rb.rb
CHANGED
@@ -25,9 +25,13 @@ class Taxamatch
|
|
25
25
|
#takes two hashes of parsed scientific names, analyses them and returns the match result
|
26
26
|
#this function is useful when species strings are preparsed.
|
27
27
|
def taxamatch_parsed_data(parsed_data_1, parsed_data_2)
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
result = nil
|
29
|
+
result = match_uninomial(parsed_data_1, parsed_data_2) if parsed_data_1[:uninomial] && parsed_data_2[:uninomial]
|
30
|
+
result = match_multinomial(parsed_data_1, parsed_data_2) if parsed_data_1[:genus] && parsed_data_2[:genus]
|
31
|
+
if result && result[:match]
|
32
|
+
result[:match] = match_authors(parsed_data_1, parsed_data_2) > 0 ? true : false
|
33
|
+
end
|
34
|
+
return result
|
31
35
|
end
|
32
36
|
|
33
37
|
def match_uninomial(parsed_data_1, parsed_data_2)
|
@@ -72,7 +76,7 @@ class Taxamatch
|
|
72
76
|
au2 = parsed_data_2[:all_authors]
|
73
77
|
yr1 = parsed_data_1[:all_years]
|
74
78
|
yr2 = parsed_data_2[:all_years]
|
75
|
-
|
79
|
+
Authmatch.authmatch(au1, au2, yr1, yr2)
|
76
80
|
end
|
77
81
|
|
78
82
|
def match_matches(genus_match, species_match, infraspecies_matches = [])
|
@@ -84,4 +88,4 @@ class Taxamatch
|
|
84
88
|
match
|
85
89
|
end
|
86
90
|
|
87
|
-
end
|
91
|
+
end
|
@@ -1,86 +1,85 @@
|
|
1
1
|
class Authmatch
|
2
2
|
|
3
3
|
def self.authmatch(authors1, authors2, years1, years2)
|
4
|
-
return true
|
5
4
|
unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
|
6
5
|
year_difference = compare_years(years1, years2)
|
6
|
+
get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
|
7
|
+
end
|
7
8
|
|
8
|
-
|
9
|
-
|
9
|
+
def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
|
10
|
+
count_before = authors1.size + authors2.size
|
11
|
+
count_after = unique_authors1.size + unique_authors2.size
|
12
|
+
score = 0
|
13
|
+
if count_after == 0
|
14
|
+
if year_diff != nil
|
15
|
+
if year_diff == 0
|
16
|
+
score = 100
|
17
|
+
elsif year_diff == 1
|
18
|
+
score = 54
|
19
|
+
end
|
20
|
+
else
|
21
|
+
score = 94
|
22
|
+
end
|
23
|
+
elsif unique_authors1.size > 0 || unique_authors2.size > 0
|
24
|
+
if year_diff != nil
|
25
|
+
if year_diff == 0
|
26
|
+
score = 91
|
27
|
+
elsif year_diff == 1
|
28
|
+
score = 51
|
29
|
+
end
|
30
|
+
else
|
31
|
+
score = 90
|
32
|
+
end
|
33
|
+
else
|
34
|
+
score = ((1 - count_after.to_f/count_before.to_f) * 100).round
|
35
|
+
score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
|
36
|
+
end
|
37
|
+
score > 50 ? score : 0
|
10
38
|
end
|
11
39
|
|
12
|
-
def self.remove_duplicate_authors(
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
au1_match
|
17
|
-
|
40
|
+
def self.remove_duplicate_authors(authors1, authors2)
|
41
|
+
unique_authors1 = authors1.dup
|
42
|
+
unique_authors2 = authors2.dup
|
43
|
+
authors1.each do |au1|
|
44
|
+
au1_match = false
|
45
|
+
authors2.each do |au2|
|
46
|
+
au2_match = false
|
18
47
|
if au1 == au2
|
19
|
-
|
20
|
-
elsif au1
|
21
|
-
|
22
|
-
|
48
|
+
au1_match = au2_match = true if au1 == au2
|
49
|
+
elsif au1 == au2[0...au1.size]
|
50
|
+
au1_match = true
|
51
|
+
elsif au1[0...au2.size] == au2
|
52
|
+
au2_match = true
|
53
|
+
end
|
54
|
+
if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
|
55
|
+
unique_authors1.delete au1
|
56
|
+
unique_authors2.delete au2
|
57
|
+
elsif au1_match
|
58
|
+
unique_authors1.delete au1
|
59
|
+
elsif au2_match
|
60
|
+
unique_authors2.delete au2
|
61
|
+
else
|
62
|
+
if self.fuzzy_match_authors(au1, au2)
|
63
|
+
unique_authors1.delete au1
|
64
|
+
unique_authors2.delete au2
|
65
|
+
end
|
23
66
|
end
|
24
67
|
end
|
25
68
|
end
|
69
|
+
[unique_authors1, unique_authors2]
|
70
|
+
end
|
71
|
+
|
72
|
+
def self.fuzzy_match_authors(author1, author2)
|
73
|
+
au1_length = author1.size
|
74
|
+
au2_length = author2.size
|
75
|
+
dlm = DamerauLevenshteinMod.new
|
76
|
+
ed = dlm.distance(author1, author2,2,3)
|
77
|
+
(ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
|
26
78
|
end
|
27
79
|
|
28
80
|
def self.compare_years(years1, years2)
|
29
81
|
return 0 if years1 == [] && years2 == []
|
30
|
-
return (years1[0] - years2[0]).abs if years1.size == 1 && years2.size == 1
|
82
|
+
return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
|
31
83
|
nil
|
32
84
|
end
|
33
85
|
end
|
34
|
-
|
35
|
-
=begin
|
36
|
-
foreach($author_words1 as $key1 => $author1)
|
37
|
-
{
|
38
|
-
$author1_matches = false;
|
39
|
-
$author1 = Normalize::normalize_author_string($author1);
|
40
|
-
foreach($author_words2 as $key2 => $author2)
|
41
|
-
{
|
42
|
-
$author2_matches = false;
|
43
|
-
$author2 = Normalize::normalize_author_string($author2);
|
44
|
-
|
45
|
-
if($author1 == $author2)
|
46
|
-
{
|
47
|
-
$author1_matches = true;
|
48
|
-
$author2_matches = true;
|
49
|
-
}elseif(preg_match("/^".preg_quote($author1, "/")."/i", $author2))
|
50
|
-
{
|
51
|
-
$author1_matches = true;
|
52
|
-
}elseif(preg_match("/^".preg_quote($author2, "/")."/i", $author1))
|
53
|
-
{
|
54
|
-
$author2_matches = true;
|
55
|
-
}
|
56
|
-
|
57
|
-
// equal or one is contained in the other, so consider it a match for both terms
|
58
|
-
if((strlen($author1)>=3 && $author1_matches) || (strlen($author2)>=3 && $author2_matches) || $author1 == $author2)
|
59
|
-
{
|
60
|
-
unset($unique_authors1[$key1]);
|
61
|
-
unset($unique_authors2[$key2]);
|
62
|
-
}elseif($author1_matches)
|
63
|
-
{
|
64
|
-
// author1 was abbreviation of author2
|
65
|
-
unset($unique_authors1[$key1]);
|
66
|
-
}elseif($author2_matches)
|
67
|
-
{
|
68
|
-
// author1 was abbreviation of author2
|
69
|
-
unset($unique_authors2[$key2]);
|
70
|
-
}else
|
71
|
-
{
|
72
|
-
// no match or abbreviation so try a fuzzy match
|
73
|
-
$max_length = max(strlen($author1), strlen($author2));
|
74
|
-
$lev = levenshtein($author1, $author2);
|
75
|
-
if(($lev/$max_length) <= .167)
|
76
|
-
{
|
77
|
-
unset($unique_authors1[$key1]);
|
78
|
-
unset($unique_authors2[$key2]);
|
79
|
-
}
|
80
|
-
}
|
81
|
-
}
|
82
|
-
reset($author_words2);
|
83
|
-
}
|
84
|
-
|
85
|
-
|
86
|
-
=end
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -41,8 +41,7 @@ describe 'Normalizer' do
|
|
41
41
|
Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
|
42
42
|
Normalizer.normalize('Ærenea').should == 'AERENEA'
|
43
43
|
Normalizer.normalize('Fallén').should == 'FALLEN'
|
44
|
-
Normalizer.normalize('
|
45
|
-
Normalizer.normalize('abcd').should == 'ABCD'
|
44
|
+
Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
|
46
45
|
end
|
47
46
|
|
48
47
|
it 'should normalize words' do
|
@@ -180,12 +179,33 @@ describe 'Taxamatch' do
|
|
180
179
|
@am = Authmatch
|
181
180
|
end
|
182
181
|
|
182
|
+
it 'should calculate score' do
|
183
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['L', 'Kenn'], [], [1788])
|
184
|
+
res.should == 90
|
185
|
+
end
|
186
|
+
|
183
187
|
it 'should compare years' do
|
184
188
|
@am.compare_years([1882],[1880]).should == 2
|
185
189
|
@am.compare_years([1882],[]).should == nil
|
186
190
|
@am.compare_years([],[]).should == 0
|
187
191
|
@am.compare_years([1788,1798], [1788,1798]).should be_nil
|
188
192
|
end
|
193
|
+
|
194
|
+
it 'should remove duplicate authors' do
|
195
|
+
#Lin submatches Linnaeus and its size 3 is big enough to remove Linnaeus
|
196
|
+
#Muller is identical
|
197
|
+
res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
|
198
|
+
res.should == [[], []]
|
199
|
+
#same in different order
|
200
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
|
201
|
+
res.should == [[], []]
|
202
|
+
#author Li submatches Linnaeus, but Li's size is less than the 3 required to remove Linnaeus
|
203
|
+
res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
|
204
|
+
res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
|
205
|
+
#fuzzy match
|
206
|
+
res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
|
207
|
+
res.should == [["Dem"], ["Stepanov"]]
|
208
|
+
end
|
189
209
|
end
|
190
210
|
|
191
211
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dimus-taxamatch_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-08-
|
12
|
+
date: 2009-08-06 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|