dimus-taxamatch_rb 0.1.7 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/taxamatch_rb/authmatch.rb +70 -68
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +87 -85
- data/lib/taxamatch_rb/normalizer.rb +44 -40
- data/lib/taxamatch_rb/parser.rb +66 -62
- data/lib/taxamatch_rb/phonetizer.rb +67 -64
- data/lib/taxamatch_rb.rb +74 -67
- data/spec/damerau_levenshtein_mod_test.txt +2 -0
- data/spec/spec_helper.rb +2 -2
- data/spec/taxamatch_rb_spec.rb +53 -16
- data/spec/taxamatch_test.txt +1 -1
- metadata +1 -1
@@ -1,85 +1,87 @@
|
|
1
|
-
|
1
|
+
module Taxamatch
|
2
|
+
class Authmatch
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
def self.authmatch(authors1, authors2, years1, years2)
|
5
|
+
unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
|
6
|
+
year_difference = compare_years(years1, years2)
|
7
|
+
get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
|
8
|
+
end
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
10
|
+
def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
|
11
|
+
count_before = authors1.size + authors2.size
|
12
|
+
count_after = unique_authors1.size + unique_authors2.size
|
13
|
+
score = 0
|
14
|
+
if count_after == 0
|
15
|
+
if year_diff != nil
|
16
|
+
if year_diff == 0
|
17
|
+
score = 100
|
18
|
+
elsif year_diff == 1
|
19
|
+
score = 54
|
20
|
+
end
|
21
|
+
else
|
22
|
+
score = 94
|
19
23
|
end
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
score =
|
24
|
+
elsif unique_authors1.size == 0 || unique_authors2.size == 0
|
25
|
+
if year_diff != nil
|
26
|
+
if year_diff == 0
|
27
|
+
score = 91
|
28
|
+
elsif year_diff == 1
|
29
|
+
score = 51
|
30
|
+
end
|
31
|
+
else
|
32
|
+
score = 90
|
29
33
|
end
|
30
34
|
else
|
31
|
-
score =
|
35
|
+
score = ((1 - count_after.to_f/count_before.to_f) * 100).round
|
36
|
+
score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
|
32
37
|
end
|
33
|
-
|
34
|
-
score = ((1 - count_after.to_f/count_before.to_f) * 100).round
|
35
|
-
score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
|
38
|
+
score > 50 ? score : 0
|
36
39
|
end
|
37
|
-
score > 50 ? score : 0
|
38
|
-
end
|
39
40
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
elsif au1_match
|
58
|
-
unique_authors1.delete au1
|
59
|
-
elsif au2_match
|
60
|
-
unique_authors2.delete au2
|
61
|
-
else
|
62
|
-
if self.fuzzy_match_authors(au1, au2)
|
41
|
+
def self.remove_duplicate_authors(authors1, authors2)
|
42
|
+
unique_authors1 = authors1.dup
|
43
|
+
unique_authors2 = authors2.dup
|
44
|
+
authors1.each do |au1|
|
45
|
+
authors2.each do |au2|
|
46
|
+
au1_match = au2_match = false
|
47
|
+
if au1 == au2
|
48
|
+
au1_match = au2_match = true
|
49
|
+
elsif au1 == au2[0...au1.size]
|
50
|
+
au1_match = true
|
51
|
+
elsif au1[0...au2.size] == au2
|
52
|
+
au2_match = true
|
53
|
+
end
|
54
|
+
if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
|
55
|
+
unique_authors1.delete au1
|
56
|
+
unique_authors2.delete au2
|
57
|
+
elsif au1_match
|
63
58
|
unique_authors1.delete au1
|
59
|
+
elsif au2_match
|
64
60
|
unique_authors2.delete au2
|
61
|
+
else
|
62
|
+
#TODO: masking a bug in damerau levenshtsin mod which appears comparing 1letter to a longer string
|
63
|
+
if au1.size > 1 && au2.size > 1 && self.fuzzy_match_authors(au1, au2)
|
64
|
+
unique_authors1.delete au1
|
65
|
+
unique_authors2.delete au2
|
66
|
+
end
|
65
67
|
end
|
66
68
|
end
|
67
69
|
end
|
70
|
+
[unique_authors1, unique_authors2]
|
68
71
|
end
|
69
|
-
[unique_authors1, unique_authors2]
|
70
|
-
end
|
71
72
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
def self.fuzzy_match_authors(author1, author2)
|
74
|
+
au1_length = author1.size
|
75
|
+
au2_length = author2.size
|
76
|
+
dlm = Taxamatch::DamerauLevenshteinMod.new
|
77
|
+
ed = dlm.distance(author1, author2,2,3) #get around a bug in C code, but it really has to be fixed
|
78
|
+
(ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
|
79
|
+
end
|
79
80
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
81
|
+
def self.compare_years(years1, years2)
|
82
|
+
return 0 if years1 == [] && years2 == []
|
83
|
+
return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
|
84
|
+
nil
|
85
|
+
end
|
84
86
|
end
|
85
|
-
end
|
87
|
+
end
|
@@ -2,114 +2,116 @@
|
|
2
2
|
require 'rubygems'
|
3
3
|
require 'inline'
|
4
4
|
require 'time'
|
5
|
+
module Taxamatch
|
5
6
|
|
6
|
-
class DamerauLevenshteinMod
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
class DamerauLevenshteinMod
|
8
|
+
def distance(str1, str2, block_size=2, max_distance=10)
|
9
|
+
# puts str1.unpack("U*");
|
10
|
+
distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
|
11
|
+
end
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
inline do |builder|
|
14
|
+
builder.c "
|
15
|
+
static VALUE distance_utf(VALUE _s, VALUE _t, long block_size, long max_distance){
|
16
|
+
long min, i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block, current_distance;
|
17
|
+
long stop_execution = 0;
|
17
18
|
|
18
|
-
|
19
|
-
|
19
|
+
VALUE *sv = RARRAY_PTR(_s);
|
20
|
+
VALUE *tv = RARRAY_PTR(_t);
|
20
21
|
|
21
|
-
|
22
|
-
|
22
|
+
sl = RARRAY_LEN(_s);
|
23
|
+
tl = RARRAY_LEN(_t);
|
23
24
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
25
|
+
if (sl == 0) return LONG2NUM(tl);
|
26
|
+
if (tl == 0) return LONG2NUM(sl);
|
27
|
+
//case of lengths 1 must present or it will break further in the code
|
28
|
+
if (sl == 1 && tl == 1 && sv[0] != tv[0]) return LONG2NUM(1);
|
28
29
|
|
29
|
-
|
30
|
-
|
30
|
+
long s[sl];
|
31
|
+
long t[tl];
|
31
32
|
|
32
|
-
|
33
|
-
|
33
|
+
for (i=0; i < sl; i++) s[i] = NUM2LONG(sv[i]);
|
34
|
+
for (i=0; i < tl; i++) t[i] = NUM2LONG(tv[i]);
|
34
35
|
|
35
|
-
|
36
|
-
|
36
|
+
sl++;
|
37
|
+
tl++;
|
37
38
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
39
|
+
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
|
40
|
+
d = malloc((sizeof(long))*(sl)*(tl));
|
41
|
+
//populate 'vertical' row starting from the 2nd position (first one is filled already)
|
42
|
+
for(i = 0; i < tl; i++){
|
43
|
+
d[i*sl] = i;
|
44
|
+
}
|
44
45
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
46
|
+
//fill up array with scores
|
47
|
+
for(i = 1; i<sl; i++){
|
48
|
+
d[i] = i;
|
49
|
+
if (stop_execution == 1) break;
|
50
|
+
current_distance = 10000;
|
51
|
+
for(j = 1; j<tl; j++){
|
51
52
|
|
52
|
-
|
53
|
-
|
53
|
+
cost = 1;
|
54
|
+
if(s[i-1] == t[j-1]) cost = 0;
|
54
55
|
|
55
|
-
|
56
|
-
|
56
|
+
half_sl = (sl - 1)/2;
|
57
|
+
half_tl = (tl - 1)/2;
|
57
58
|
|
58
|
-
|
59
|
-
|
59
|
+
block = block_size < half_sl ? block_size : half_sl;
|
60
|
+
block = block < half_tl ? block : half_tl;
|
60
61
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
62
|
+
while (block >= 1){
|
63
|
+
long swap1 = 1;
|
64
|
+
long swap2 = 1;
|
65
|
+
i1 = i - (block * 2);
|
66
|
+
j1 = j - (block * 2);
|
67
|
+
for (k = i1; k < i1 + block; k++) {
|
68
|
+
if (s[k] != t[k + block]){
|
69
|
+
swap1 = 0;
|
70
|
+
break;
|
71
|
+
}
|
70
72
|
}
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
73
|
+
for (k = j1; k < j1 + block; k++) {
|
74
|
+
if (t[k] != s[k + block]){
|
75
|
+
swap2 = 0;
|
76
|
+
break;
|
77
|
+
}
|
76
78
|
}
|
77
|
-
}
|
78
79
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
80
|
+
del = d[j*sl + i - 1] + 1;
|
81
|
+
ins = d[(j-1)*sl + i] + 1;
|
82
|
+
min = del;
|
83
|
+
if (ins < min) min = ins;
|
84
|
+
//if (i == 2 && j==2) return LONG2NUM(swap2+5);
|
85
|
+
if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
|
86
|
+
transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
|
87
|
+
if (transp < min) min = transp;
|
88
|
+
block = 0;
|
89
|
+
} else if (block == 1) {
|
90
|
+
subs = d[(j-1)*sl + i - 1] + cost;
|
91
|
+
if (subs < min) min = subs;
|
92
|
+
}
|
93
|
+
block--;
|
94
|
+
}
|
95
|
+
d[j*sl+i]=min;
|
96
|
+
if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
|
97
|
+
}
|
98
|
+
if (current_distance > max_distance) {
|
99
|
+
stop_execution = 1;
|
100
|
+
}
|
99
101
|
}
|
100
|
-
|
101
|
-
|
102
|
-
if (stop_execution == 1) distance = current_distance;
|
102
|
+
distance=d[sl * tl - 1];
|
103
|
+
if (stop_execution == 1) distance = current_distance;
|
103
104
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
105
|
+
free(d);
|
106
|
+
return LONG2NUM(distance);
|
107
|
+
}
|
108
|
+
"
|
109
|
+
end
|
108
110
|
end
|
109
111
|
end
|
110
112
|
|
111
113
|
if __FILE__ == $0
|
112
|
-
a=DamerauLevenshteinMod.new
|
114
|
+
a=Taxamatch::DamerauLevenshteinMod.new
|
113
115
|
s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
|
114
116
|
t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')
|
115
117
|
|
@@ -1,47 +1,51 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
|
-
module
|
4
|
-
def self.normalize(string)
|
5
|
-
utf8_to_ascii(string).upcase
|
6
|
-
end
|
3
|
+
module Taxamatch
|
7
4
|
|
8
|
-
|
9
|
-
self.normalize(
|
10
|
-
|
5
|
+
module Normalizer
|
6
|
+
def self.normalize(string)
|
7
|
+
utf8_to_ascii(string).upcase
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.normalize_word(word)
|
11
|
+
self.normalize(word).gsub(/[^A-Z0-9\-]/, '')
|
12
|
+
end
|
13
|
+
|
14
|
+
protected
|
15
|
+
def self.utf8_to_ascii(string)
|
16
|
+
string = string.gsub(/[ÀÂÅÃÄÁẤẠ]/, "A")
|
17
|
+
string = string.gsub(/[ÉÈÊË]/, "E")
|
18
|
+
string = string.gsub(/[ÍÌÎÏ]/, "I")
|
19
|
+
string = string.gsub(/[ÓÒÔØÕÖỚỔ]/, "O")
|
20
|
+
string = string.gsub(/[ÚÙÛÜ]/, "U")
|
21
|
+
string = string.gsub(/[Ý]/, "Y")
|
22
|
+
string = string.gsub(/Æ/, "AE")
|
23
|
+
string = string.gsub(/[ČÇ]/, "C")
|
24
|
+
string = string.gsub(/[ŠŞ]/, "S")
|
25
|
+
string = string.gsub(/[Đ]/, "D")
|
26
|
+
string = string.gsub(/Ž/, "Z")
|
27
|
+
string = string.gsub(/Ñ/, "N")
|
28
|
+
string = string.gsub(/Œ/, "OE")
|
29
|
+
string = string.gsub(/ß/, "B")
|
30
|
+
string = string.gsub(/Ķ/, "K")
|
31
|
+
string = string.gsub(/[áàâåãäăãắảạậầằ]/, "a")
|
32
|
+
string = string.gsub(/[éèêëĕěếệểễềẻ]/, "e")
|
33
|
+
string = string.gsub(/[íìîïǐĭīĩỉï]/, "i")
|
34
|
+
string = string.gsub(/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o")
|
35
|
+
string = string.gsub(/[úùûüůưừựủứụ]/, "u")
|
36
|
+
string = string.gsub(/[žź]/, "z")
|
37
|
+
string = string.gsub(/[ýÿỹ]/, "y")
|
38
|
+
string = string.gsub(/[đ]/, "d")
|
39
|
+
string = string.gsub(/æ/, "ae")
|
40
|
+
string = string.gsub(/[čćç]/, "c")
|
41
|
+
string = string.gsub(/[ñńň]/, "n")
|
42
|
+
string = string.gsub(/œ/, "oe")
|
43
|
+
string = string.gsub(/[śšş]/, "s")
|
44
|
+
string = string.gsub(/ř/, "r")
|
45
|
+
string = string.gsub(/ğ/, "g")
|
46
|
+
string = string.gsub(/Ř/, "R")
|
47
|
+
end
|
11
48
|
|
12
|
-
protected
|
13
|
-
def self.utf8_to_ascii(string)
|
14
|
-
string = string.gsub(/[ÀÂÅÃÄÁẤẠ]/, "A")
|
15
|
-
string = string.gsub(/[ÉÈÊË]/, "E")
|
16
|
-
string = string.gsub(/[ÍÌÎÏ]/, "I")
|
17
|
-
string = string.gsub(/[ÓÒÔØÕÖỚỔ]/, "O")
|
18
|
-
string = string.gsub(/[ÚÙÛÜ]/, "U")
|
19
|
-
string = string.gsub(/[Ý]/, "Y")
|
20
|
-
string = string.gsub(/Æ/, "AE")
|
21
|
-
string = string.gsub(/[ČÇ]/, "C")
|
22
|
-
string = string.gsub(/[ŠŞ]/, "S")
|
23
|
-
string = string.gsub(/[Đ]/, "D")
|
24
|
-
string = string.gsub(/Ž/, "Z")
|
25
|
-
string = string.gsub(/Ñ/, "N")
|
26
|
-
string = string.gsub(/Œ/, "OE")
|
27
|
-
string = string.gsub(/ß/, "B")
|
28
|
-
string = string.gsub(/Ķ/, "K")
|
29
|
-
string = string.gsub(/[áàâåãäăãắảạậầằ]/, "a")
|
30
|
-
string = string.gsub(/[éèêëĕěếệểễềẻ]/, "e")
|
31
|
-
string = string.gsub(/[íìîïǐĭīĩỉï]/, "i")
|
32
|
-
string = string.gsub(/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o")
|
33
|
-
string = string.gsub(/[úùûüůưừựủứụ]/, "u")
|
34
|
-
string = string.gsub(/[žź]/, "z")
|
35
|
-
string = string.gsub(/[ýÿỹ]/, "y")
|
36
|
-
string = string.gsub(/[đ]/, "d")
|
37
|
-
string = string.gsub(/æ/, "ae")
|
38
|
-
string = string.gsub(/[čćç]/, "c")
|
39
|
-
string = string.gsub(/[ñńň]/, "n")
|
40
|
-
string = string.gsub(/œ/, "oe")
|
41
|
-
string = string.gsub(/[śšş]/, "s")
|
42
|
-
string = string.gsub(/ř/, "r")
|
43
|
-
string = string.gsub(/ğ/, "g")
|
44
|
-
string = string.gsub(/Ř/, "R")
|
45
49
|
end
|
46
50
|
|
47
51
|
end
|
data/lib/taxamatch_rb/parser.rb
CHANGED
@@ -1,83 +1,87 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
require 'biodiversity'
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
4
|
+
module Taxamatch
|
5
|
+
|
6
|
+
class Parser
|
7
|
+
def initialize
|
8
|
+
@parser = ScientificNameParser.new
|
9
|
+
@parsed_raw = nil
|
10
|
+
@res = {}
|
11
|
+
end
|
10
12
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
def parse(name)
|
14
|
+
@res = {:all_authors => [], :all_years => []}
|
15
|
+
@parsed_raw = JSON.load(@parser.parse(name).to_json)['scientificName']
|
16
|
+
organize_results
|
17
|
+
end
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
|
19
|
+
def parsed_raw
|
20
|
+
return @parsed_raw
|
21
|
+
end
|
20
22
|
|
21
|
-
protected
|
23
|
+
protected
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
25
|
+
def organize_results
|
26
|
+
pr = @parsed_raw
|
27
|
+
return nil unless pr['parsed']
|
28
|
+
d = pr['details'][0]
|
29
|
+
process_node(:uninomial, d['uninomial'])
|
30
|
+
process_node(:genus, d['genus'])
|
31
|
+
process_node(:species, d['species'], true)
|
32
|
+
process_infraspecies(d['infraspecies'])
|
33
|
+
@res[:all_authors] = @res[:all_authors].uniq.map {|a| Taxamatch::Normalizer.normalize(a)}
|
34
|
+
@res[:all_years].uniq!
|
35
|
+
@res.keys.size > 2 ? @res : nil
|
36
|
+
end
|
35
37
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
38
|
+
def process_node(name, node, is_species = false)
|
39
|
+
return unless node
|
40
|
+
@res[name] = {}
|
41
|
+
@res[name][:epitheton] = node['epitheton']
|
42
|
+
@res[name][:normalized] = Taxamatch::Normalizer.normalize(node['epitheton'])
|
43
|
+
@res[name][:phonetized] = Taxamatch::Phonetizer.near_match(node['epitheton'], is_species)
|
44
|
+
get_authors_years(node, @res[name])
|
45
|
+
end
|
44
46
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
47
|
+
def process_infraspecies(node)
|
48
|
+
return unless node
|
49
|
+
@res[:infraspecies] = []
|
50
|
+
node.each do |infr|
|
51
|
+
hsh = {}
|
52
|
+
hsh[:epitheton] = infr['epitheton']
|
53
|
+
hsh[:normalized] = Taxamatch::Normalizer.normalize(infr['epitheton'])
|
54
|
+
hsh[:phonetized] = Taxamatch::Phonetizer.near_match(infr['epitheton'], true)
|
55
|
+
get_authors_years(infr,hsh)
|
56
|
+
@res[:infraspecies] << hsh
|
57
|
+
end
|
55
58
|
end
|
56
|
-
end
|
57
59
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
def get_authors_years(node, res)
|
61
|
+
res[:authors] = []
|
62
|
+
res[:years] = []
|
63
|
+
['basionymAuthorTeam','combinationAuthorTeam'].each do |au|
|
64
|
+
if node[au]
|
65
|
+
res[:authors] += node[au]['author']
|
66
|
+
res[:years] << node[au]['year'] if node[au]['year']
|
67
|
+
if node[au]['exAuthorTeam']
|
68
|
+
res[:authors] += node[au]['exAuthorTeam']['author']
|
69
|
+
res[:years] << node[au]['exAuthorTeam']['year'] if node[au]['exAuthorTeam']['year']
|
70
|
+
end
|
68
71
|
end
|
69
72
|
end
|
73
|
+
res[:authors].uniq!
|
74
|
+
res[:years].uniq!
|
75
|
+
@res[:all_authors] += res[:authors] if res[:authors].size > 0
|
76
|
+
@res[:all_years] += res[:years] if res[:years].size > 0
|
70
77
|
end
|
71
|
-
res[:authors].uniq!
|
72
|
-
res[:years].uniq!
|
73
|
-
@res[:all_authors] += res[:authors] if res[:authors].size > 0
|
74
|
-
@res[:all_years] += res[:years] if res[:years].size > 0
|
75
|
-
end
|
76
78
|
|
79
|
+
end
|
77
80
|
end
|
78
81
|
|
79
82
|
if __FILE__ == $0
|
80
83
|
require 'pp'
|
81
84
|
p = Parser.new
|
82
85
|
puts p.parse('Salmonella werahensis (Castellani) Hauduroy and Ehringer in Hauduroy 1937')
|
83
|
-
end
|
86
|
+
end
|
87
|
+
|
@@ -1,72 +1,75 @@
|
|
1
1
|
# encoding: UTF-8
|
2
|
+
module Taxamatch
|
2
3
|
|
3
|
-
class Phonetizer
|
4
|
+
class Phonetizer
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
6
|
+
def self.near_match(a_word, normalize_ending = false)
|
7
|
+
a_word = a_word.strip rescue ''
|
8
|
+
return '' if a_word == ''
|
9
|
+
a_word = Taxamatch::Normalizer.normalize a_word
|
10
|
+
case a_word
|
11
|
+
when /^AE/
|
12
|
+
a_word = 'E' + a_word[2..-1]
|
13
|
+
when /^CN/
|
14
|
+
a_word = 'N' + a_word[2..-1]
|
15
|
+
when /^CT/
|
16
|
+
a_word = 'T' + a_word[2..-1]
|
17
|
+
when /^CZ/
|
18
|
+
a_word = 'C' + a_word[2..-1]
|
19
|
+
when /^DJ/
|
20
|
+
a_word = 'J' + a_word[2..-1]
|
21
|
+
when /^EA/
|
22
|
+
a_word = 'E' + a_word[2..-1]
|
23
|
+
when /^EU/
|
24
|
+
a_word = 'U' + a_word[2..-1]
|
25
|
+
when /^GN/
|
26
|
+
a_word = 'N' + a_word[2..-1]
|
27
|
+
when /^KN/
|
28
|
+
a_word = 'N' + a_word[2..-1]
|
29
|
+
when /^MC/
|
30
|
+
a_word = 'MAC' + a_word[2..-1]
|
31
|
+
when /^MN/
|
32
|
+
a_word = 'N' + a_word[2..-1]
|
33
|
+
when /^OE/
|
34
|
+
a_word = 'E' + a_word[2..-1]
|
35
|
+
when /^QU/
|
36
|
+
a_word = 'Q' + a_word[2..-1]
|
37
|
+
when /^PS/
|
38
|
+
a_word = 'S' + a_word[2..-1]
|
39
|
+
when /^PT/
|
40
|
+
a_word = 'T' + a_word[2..-1]
|
41
|
+
when /^TS/
|
42
|
+
a_word = 'S' + a_word[2..-1]
|
43
|
+
when /^WR/
|
44
|
+
a_word = 'R' + a_word[2..-1]
|
45
|
+
when /^X/
|
46
|
+
a_word = 'Z' + a_word[1..-1]
|
47
|
+
end
|
48
|
+
first_char = a_word.split('')[0]
|
49
|
+
rest_chars = a_word.split('')[1..-1].join('')
|
50
|
+
rest_chars.gsub!('AE', 'I')
|
51
|
+
rest_chars.gsub!('IA', 'A')
|
52
|
+
rest_chars.gsub!('OE', 'I')
|
53
|
+
rest_chars.gsub!('OI', 'A')
|
54
|
+
rest_chars.gsub!('SC', 'S')
|
55
|
+
rest_chars.gsub!('H', '')
|
56
|
+
rest_chars.tr!('EOUYKZ', 'IAIICS')
|
57
|
+
a_word = (first_char + rest_chars).squeeze
|
57
58
|
|
58
|
-
|
59
|
-
|
59
|
+
if normalize_ending && a_word.size > 4
|
60
|
+
a_word = self.normalize_ending(a_word)
|
61
|
+
end
|
62
|
+
a_word
|
60
63
|
end
|
61
|
-
a_word
|
62
|
-
end
|
63
64
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
65
|
+
def self.normalize_ending(a_word)
|
66
|
+
# -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
|
67
|
+
# -- at the end of a string translate all to -a
|
68
|
+
a_word.gsub!(/IS$/, 'A')
|
69
|
+
a_word.gsub!(/IM$/, 'A')
|
70
|
+
a_word.gsub(/AS$/, 'A')
|
71
|
+
end
|
71
72
|
|
73
|
+
end
|
74
|
+
|
72
75
|
end
|
data/lib/taxamatch_rb.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: UTF-8
|
1
2
|
$:.unshift(File.dirname(__FILE__)) unless
|
2
3
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
4
|
# $:.unshift('taxamatch_rb')
|
@@ -7,85 +8,91 @@ require 'taxamatch_rb/normalizer'
|
|
7
8
|
require 'taxamatch_rb/phonetizer'
|
8
9
|
require 'taxamatch_rb/authmatch'
|
9
10
|
|
10
|
-
|
11
|
+
$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
|
12
|
+
|
13
|
+
module Taxamatch
|
14
|
+
|
15
|
+
class Base
|
11
16
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
17
|
+
def initialize
|
18
|
+
@parser = Taxamatch::Parser.new
|
19
|
+
@dlm = Taxamatch::DamerauLevenshteinMod.new
|
20
|
+
end
|
16
21
|
|
17
22
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
23
|
+
#takes two scientific names and returns true if names match and false if they don't
|
24
|
+
def taxamatch(str1, str2)
|
25
|
+
parsed_data_1 = @parser.parse(str1)
|
26
|
+
parsed_data_2 = @parser.parse(str2)
|
27
|
+
taxamatch_parsed_data(parsed_data_1, parsed_data_2)[:match]
|
28
|
+
end
|
24
29
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
30
|
+
#takes two hashes of parsed scientific names, analyses them and returns back
|
31
|
+
#this function is useful when species strings are preparsed.
|
32
|
+
def taxamatch_parsed_data(parsed_data_1, parsed_data_2)
|
33
|
+
result = nil
|
34
|
+
result = match_uninomial(parsed_data_1, parsed_data_2) if parsed_data_1[:uninomial] && parsed_data_2[:uninomial]
|
35
|
+
result = match_multinomial(parsed_data_1, parsed_data_2) if parsed_data_1[:genus] && parsed_data_2[:genus]
|
36
|
+
if result && result[:match]
|
37
|
+
result[:match] = false if match_authors(parsed_data_1, parsed_data_2) == 0
|
38
|
+
end
|
39
|
+
return result
|
33
40
|
end
|
34
|
-
return result
|
35
|
-
end
|
36
41
|
|
37
|
-
|
38
|
-
|
39
|
-
|
42
|
+
def match_uninomial(parsed_data_1, parsed_data_2)
|
43
|
+
return false
|
44
|
+
end
|
40
45
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
46
|
+
def match_multinomial(parsed_data_1, parsed_data_2)
|
47
|
+
gen_match = match_genera(parsed_data_1[:genus], parsed_data_2[:genus])
|
48
|
+
sp_match = match_species(parsed_data_1[:species], parsed_data_2[:species])
|
49
|
+
au_match = match_authors(parsed_data_1, parsed_data_2)
|
50
|
+
total_length = parsed_data_1[:genus][:epitheton].size + parsed_data_2[:genus][:epitheton].size + parsed_data_1[:species][:epitheton].size + parsed_data_2[:species][:epitheton].size
|
51
|
+
match = match_matches(gen_match, sp_match)
|
52
|
+
match.merge({:score => (1- match[:edit_distance]/(total_length/2))})
|
53
|
+
end
|
49
54
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
55
|
+
def match_genera(genus1, genus2)
|
56
|
+
genus1_length = genus1[:normalized].size
|
57
|
+
genus2_length = genus2[:normalized].size
|
58
|
+
match = false
|
59
|
+
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],2,3)
|
60
|
+
return {:edit_distance => ed, :phonetic_match => true, :match => true} if genus1[:phonetized] == genus2[:phonetized]
|
56
61
|
|
57
|
-
|
58
|
-
|
59
|
-
|
62
|
+
match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
|
63
|
+
{:edit_distance => ed, :match => match, :phonetic_match => false}
|
64
|
+
end
|
60
65
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
66
|
+
def match_species(sp1, sp2)
|
67
|
+
sp1_length = sp1[:normalized].size
|
68
|
+
sp2_length = sp2[:normalized].size
|
69
|
+
sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
|
70
|
+
sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
|
71
|
+
match = false
|
72
|
+
ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 4, 4)
|
73
|
+
return {:edit_distance => ed, :phonetic_match => true, :match => true} if sp1[:phonetized] == sp2[:phonetized]
|
69
74
|
|
70
|
-
|
71
|
-
|
72
|
-
|
75
|
+
match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
|
76
|
+
{:edit_distance => ed, :match => match, :phonetic_match => false}
|
77
|
+
end
|
73
78
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
79
|
+
def match_authors(parsed_data_1, parsed_data_2)
|
80
|
+
au1 = parsed_data_1[:all_authors]
|
81
|
+
au2 = parsed_data_2[:all_authors]
|
82
|
+
yr1 = parsed_data_1[:all_years]
|
83
|
+
yr2 = parsed_data_2[:all_years]
|
84
|
+
Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
|
85
|
+
end
|
81
86
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
87
|
+
def match_matches(genus_match, species_match, infraspecies_matches = [])
|
88
|
+
match = species_match
|
89
|
+
match[:edit_distance] += genus_match[:edit_distance]
|
90
|
+
match[:match] = false if match[:edit_distance] > 4
|
91
|
+
match[:match] &&= genus_match[:match]
|
92
|
+
match[:phonetic_match] &&= genus_match[:phonetic_match]
|
93
|
+
match
|
94
|
+
end
|
95
|
+
|
89
96
|
end
|
90
97
|
|
91
|
-
end
|
98
|
+
end
|
@@ -28,6 +28,8 @@ Pomatomus|pomatomus|10|1|1
|
|
28
28
|
Pomatomus||10|1|9
|
29
29
|
|Pomatomus|10|1|9
|
30
30
|
P|p|10|1|1
|
31
|
+
#TODO: one letter vs longer string generates a big negative number
|
32
|
+
#L|Linneaus|10|1|7
|
31
33
|
|
32
34
|
|
33
35
|
#it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
|
data/spec/spec_helper.rb
CHANGED
@@ -23,6 +23,6 @@ def read_test_file(file, fields_num)
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def make_taxamatch_hash(string)
|
26
|
-
normalized = Normalizer.normalize(string)
|
27
|
-
{:epitheton => string, :normalized => normalized, :phonetized => Phonetizer.near_match(normalized)}
|
26
|
+
normalized = Taxamatch::Normalizer.normalize(string)
|
27
|
+
{:epitheton => string, :normalized => normalized, :phonetized => Taxamatch::Phonetizer.near_match(normalized)}
|
28
28
|
end
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
require File.dirname(__FILE__) + '/spec_helper.rb'
|
3
3
|
|
4
|
-
describe '
|
4
|
+
describe 'DamerauLevenshteinMod' do
|
5
5
|
it 'should get tests' do
|
6
6
|
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt', 5) do |y|
|
7
|
-
dl = DamerauLevenshteinMod.new
|
7
|
+
dl = Taxamatch::DamerauLevenshteinMod.new
|
8
8
|
if y
|
9
9
|
res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
|
10
10
|
puts y if res != y[4].to_i
|
@@ -16,7 +16,7 @@ end
|
|
16
16
|
|
17
17
|
describe 'Parser' do
|
18
18
|
before(:all) do
|
19
|
-
@parser =
|
19
|
+
@parser = Taxamatch::Parser.new
|
20
20
|
end
|
21
21
|
|
22
22
|
it 'should parse uninomials' do
|
@@ -35,27 +35,27 @@ describe 'Parser' do
|
|
35
35
|
end
|
36
36
|
|
37
37
|
|
38
|
-
describe 'Normalizer' do
|
38
|
+
describe 'Taxamatch::Normalizer' do
|
39
39
|
it 'should normalize strings' do
|
40
|
-
Normalizer.normalize('abcd').should == 'ABCD'
|
41
|
-
Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
|
42
|
-
Normalizer.normalize('Ærenea').should == 'AERENEA'
|
43
|
-
Normalizer.normalize('Fallén').should == 'FALLEN'
|
44
|
-
Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
|
40
|
+
Taxamatch::Normalizer.normalize('abcd').should == 'ABCD'
|
41
|
+
Taxamatch::Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
|
42
|
+
Taxamatch::Normalizer.normalize('Ærenea').should == 'AERENEA'
|
43
|
+
Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
|
44
|
+
Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
|
45
45
|
end
|
46
46
|
|
47
47
|
it 'should normalize words' do
|
48
|
-
Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
|
48
|
+
Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
52
|
-
describe 'Taxamatch' do
|
52
|
+
describe 'Taxamatch::Base' do
|
53
53
|
before(:all) do
|
54
|
-
@tm = Taxamatch.new
|
54
|
+
@tm = Taxamatch::Base.new
|
55
55
|
end
|
56
56
|
|
57
57
|
it 'should get txt tests' do
|
58
|
-
dl = DamerauLevenshteinMod.new
|
58
|
+
dl = Taxamatch::DamerauLevenshteinMod.new
|
59
59
|
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 3) do |y|
|
60
60
|
if y
|
61
61
|
y[2] = y[2] == 'true' ? true : false
|
@@ -174,14 +174,40 @@ describe 'Taxamatch' do
|
|
174
174
|
@tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>4, :match=>true}
|
175
175
|
end
|
176
176
|
|
177
|
-
describe 'Authmatch' do
|
177
|
+
describe 'Taxamatch::Authmatch' do
|
178
178
|
before(:all) do
|
179
|
-
@am = Authmatch
|
179
|
+
@am = Taxamatch::Authmatch
|
180
180
|
end
|
181
181
|
|
182
182
|
it 'should calculate score' do
|
183
|
-
res = @am.authmatch(['Linnaeus', 'Muller'], ['L'
|
183
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
|
184
184
|
res.should == 90
|
185
|
+
res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
|
186
|
+
res.should == 0
|
187
|
+
#found all authors, same year
|
188
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
|
189
|
+
res.should == 100
|
190
|
+
#all authors, 1 year diff
|
191
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
|
192
|
+
res.should == 54
|
193
|
+
#year is not counted in
|
194
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
|
195
|
+
res.should == 94
|
196
|
+
#found all authors on one side, same year
|
197
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
|
198
|
+
res.should == 91
|
199
|
+
#found all authors on one side, 1 year diff
|
200
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
|
201
|
+
res.should == 51
|
202
|
+
#found all authors on one side, year does not count
|
203
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
|
204
|
+
res.should == 90
|
205
|
+
#found some authors
|
206
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
|
207
|
+
res.should == 67
|
208
|
+
#if year does not match or not present no match for previous case
|
209
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
|
210
|
+
res.should == 0
|
185
211
|
end
|
186
212
|
|
187
213
|
it 'should compare years' do
|
@@ -205,7 +231,18 @@ describe 'Taxamatch' do
|
|
205
231
|
#fuzzy match
|
206
232
|
res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
|
207
233
|
res.should == [["Dem"], ["Stepanov"]]
|
234
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
|
235
|
+
res.should == [['Linnaeus', 'Muller'], ['Kenn']]
|
236
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
|
237
|
+
res.should == [[],['Kurtz']]
|
238
|
+
end
|
239
|
+
|
240
|
+
it 'should fuzzy match authors' do
|
241
|
+
#TODO: fix the bug revealed by this test
|
242
|
+
# res = @am.fuzzy_match_authors('L', 'Muller')
|
243
|
+
# res.should be_false
|
208
244
|
end
|
245
|
+
|
209
246
|
end
|
210
247
|
|
211
248
|
end
|
data/spec/taxamatch_test.txt
CHANGED