dimus-taxamatch_rb 0.1.7 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,85 +1,87 @@
1
- class Authmatch
1
+ module Taxamatch
2
+ class Authmatch
2
3
 
3
- def self.authmatch(authors1, authors2, years1, years2)
4
- unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
5
- year_difference = compare_years(years1, years2)
6
- get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
7
- end
4
+ def self.authmatch(authors1, authors2, years1, years2)
5
+ unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
6
+ year_difference = compare_years(years1, years2)
7
+ get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
8
+ end
8
9
 
9
- def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
10
- count_before = authors1.size + authors2.size
11
- count_after = unique_authors1.size + unique_authors2.size
12
- score = 0
13
- if count_after == 0
14
- if year_diff != nil
15
- if year_diff == 0
16
- score = 100
17
- elsif year_diff == 1
18
- score = 54
10
+ def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
11
+ count_before = authors1.size + authors2.size
12
+ count_after = unique_authors1.size + unique_authors2.size
13
+ score = 0
14
+ if count_after == 0
15
+ if year_diff != nil
16
+ if year_diff == 0
17
+ score = 100
18
+ elsif year_diff == 1
19
+ score = 54
20
+ end
21
+ else
22
+ score = 94
19
23
  end
20
- else
21
- score = 94
22
- end
23
- elsif unique_authors1.size > 0 || unique_authors2.size > 0
24
- if year_diff != nil
25
- if year_diff == 0
26
- score = 91
27
- elsif year_diff == 1
28
- score = 51
24
+ elsif unique_authors1.size == 0 || unique_authors2.size == 0
25
+ if year_diff != nil
26
+ if year_diff == 0
27
+ score = 91
28
+ elsif year_diff == 1
29
+ score = 51
30
+ end
31
+ else
32
+ score = 90
29
33
  end
30
34
  else
31
- score = 90
35
+ score = ((1 - count_after.to_f/count_before.to_f) * 100).round
36
+ score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
32
37
  end
33
- else
34
- score = ((1 - count_after.to_f/count_before.to_f) * 100).round
35
- score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
38
+ score > 50 ? score : 0
36
39
  end
37
- score > 50 ? score : 0
38
- end
39
40
 
40
- def self.remove_duplicate_authors(authors1, authors2)
41
- unique_authors1 = authors1.dup
42
- unique_authors2 = authors2.dup
43
- authors1.each do |au1|
44
- au1_match = false
45
- authors2.each do |au2|
46
- au2_match = false
47
- if au1 == au2
48
- au1_match = au2_match = true if au1 == au2
49
- elsif au1 == au2[0...au1.size]
50
- au1_match = true
51
- elsif au1[0...au2.size] == au2
52
- au2_match = true
53
- end
54
- if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
55
- unique_authors1.delete au1
56
- unique_authors2.delete au2
57
- elsif au1_match
58
- unique_authors1.delete au1
59
- elsif au2_match
60
- unique_authors2.delete au2
61
- else
62
- if self.fuzzy_match_authors(au1, au2)
41
+ def self.remove_duplicate_authors(authors1, authors2)
42
+ unique_authors1 = authors1.dup
43
+ unique_authors2 = authors2.dup
44
+ authors1.each do |au1|
45
+ authors2.each do |au2|
46
+ au1_match = au2_match = false
47
+ if au1 == au2
48
+ au1_match = au2_match = true
49
+ elsif au1 == au2[0...au1.size]
50
+ au1_match = true
51
+ elsif au1[0...au2.size] == au2
52
+ au2_match = true
53
+ end
54
+ if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
55
+ unique_authors1.delete au1
56
+ unique_authors2.delete au2
57
+ elsif au1_match
63
58
  unique_authors1.delete au1
59
+ elsif au2_match
64
60
  unique_authors2.delete au2
61
+ else
62
+ #TODO: masking a bug in the Damerau-Levenshtein mod which appears when comparing a 1-letter string to a longer string
63
+ if au1.size > 1 && au2.size > 1 && self.fuzzy_match_authors(au1, au2)
64
+ unique_authors1.delete au1
65
+ unique_authors2.delete au2
66
+ end
65
67
  end
66
68
  end
67
69
  end
70
+ [unique_authors1, unique_authors2]
68
71
  end
69
- [unique_authors1, unique_authors2]
70
- end
71
72
 
72
- def self.fuzzy_match_authors(author1, author2)
73
- au1_length = author1.size
74
- au2_length = author2.size
75
- dlm = DamerauLevenshteinMod.new
76
- ed = dlm.distance(author1, author2,2,3)
77
- (ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
78
- end
73
+ def self.fuzzy_match_authors(author1, author2)
74
+ au1_length = author1.size
75
+ au2_length = author2.size
76
+ dlm = Taxamatch::DamerauLevenshteinMod.new
77
+ ed = dlm.distance(author1, author2,2,3) #get around a bug in C code, but it really has to be fixed
78
+ (ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
79
+ end
79
80
 
80
- def self.compare_years(years1, years2)
81
- return 0 if years1 == [] && years2 == []
82
- return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
83
- nil
81
+ def self.compare_years(years1, years2)
82
+ return 0 if years1 == [] && years2 == []
83
+ return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
84
+ nil
85
+ end
84
86
  end
85
- end
87
+ end
@@ -2,114 +2,116 @@
2
2
  require 'rubygems'
3
3
  require 'inline'
4
4
  require 'time'
5
+ module Taxamatch
5
6
 
6
- class DamerauLevenshteinMod
7
- def distance(str1, str2, block_size=2, max_distance=10)
8
- # puts str1.unpack("U*");
9
- distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
10
- end
7
+ class DamerauLevenshteinMod
8
+ def distance(str1, str2, block_size=2, max_distance=10)
9
+ # puts str1.unpack("U*");
10
+ distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
11
+ end
11
12
 
12
- inline do |builder|
13
- builder.c "
14
- static VALUE distance_utf(VALUE _s, VALUE _t, long block_size, long max_distance){
15
- long min, i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block, current_distance;
16
- long stop_execution = 0;
13
+ inline do |builder|
14
+ builder.c "
15
+ static VALUE distance_utf(VALUE _s, VALUE _t, long block_size, long max_distance){
16
+ long min, i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block, current_distance;
17
+ long stop_execution = 0;
17
18
 
18
- VALUE *sv = RARRAY_PTR(_s);
19
- VALUE *tv = RARRAY_PTR(_t);
19
+ VALUE *sv = RARRAY_PTR(_s);
20
+ VALUE *tv = RARRAY_PTR(_t);
20
21
 
21
- sl = RARRAY_LEN(_s);
22
- tl = RARRAY_LEN(_t);
22
+ sl = RARRAY_LEN(_s);
23
+ tl = RARRAY_LEN(_t);
23
24
 
24
- if (sl == 0) return LONG2NUM(tl);
25
- if (tl == 0) return LONG2NUM(sl);
26
- //case of lengths 1 must present or it will break further in the code
27
- if (sl == 1 && tl == 1 && sv[0] != tv[0]) return LONG2NUM(1);
25
+ if (sl == 0) return LONG2NUM(tl);
26
+ if (tl == 0) return LONG2NUM(sl);
27
+ //case of lengths 1 must present or it will break further in the code
28
+ if (sl == 1 && tl == 1 && sv[0] != tv[0]) return LONG2NUM(1);
28
29
 
29
- long s[sl];
30
- long t[tl];
30
+ long s[sl];
31
+ long t[tl];
31
32
 
32
- for (i=0; i < sl; i++) s[i] = NUM2LONG(sv[i]);
33
- for (i=0; i < tl; i++) t[i] = NUM2LONG(tv[i]);
33
+ for (i=0; i < sl; i++) s[i] = NUM2LONG(sv[i]);
34
+ for (i=0; i < tl; i++) t[i] = NUM2LONG(tv[i]);
34
35
 
35
- sl++;
36
- tl++;
36
+ sl++;
37
+ tl++;
37
38
 
38
- //one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
39
- d = malloc((sizeof(long))*(sl)*(tl));
40
- //populate 'vertical' row starting from the 2nd position (first one is filled already)
41
- for(i = 0; i < tl; i++){
42
- d[i*sl] = i;
43
- }
39
+ //one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
40
+ d = malloc((sizeof(long))*(sl)*(tl));
41
+ //populate 'vertical' row starting from the 2nd position (first one is filled already)
42
+ for(i = 0; i < tl; i++){
43
+ d[i*sl] = i;
44
+ }
44
45
 
45
- //fill up array with scores
46
- for(i = 1; i<sl; i++){
47
- d[i] = i;
48
- if (stop_execution == 1) break;
49
- current_distance = 10000;
50
- for(j = 1; j<tl; j++){
46
+ //fill up array with scores
47
+ for(i = 1; i<sl; i++){
48
+ d[i] = i;
49
+ if (stop_execution == 1) break;
50
+ current_distance = 10000;
51
+ for(j = 1; j<tl; j++){
51
52
 
52
- cost = 1;
53
- if(s[i-1] == t[j-1]) cost = 0;
53
+ cost = 1;
54
+ if(s[i-1] == t[j-1]) cost = 0;
54
55
 
55
- half_sl = (sl - 1)/2;
56
- half_tl = (tl - 1)/2;
56
+ half_sl = (sl - 1)/2;
57
+ half_tl = (tl - 1)/2;
57
58
 
58
- block = block_size < half_sl ? block_size : half_sl;
59
- block = block < half_tl ? block : half_tl;
59
+ block = block_size < half_sl ? block_size : half_sl;
60
+ block = block < half_tl ? block : half_tl;
60
61
 
61
- while (block >= 1){
62
- long swap1 = 1;
63
- long swap2 = 1;
64
- i1 = i - (block * 2);
65
- j1 = j - (block * 2);
66
- for (k = i1; k < i1 + block; k++) {
67
- if (s[k] != t[k + block]){
68
- swap1 = 0;
69
- break;
62
+ while (block >= 1){
63
+ long swap1 = 1;
64
+ long swap2 = 1;
65
+ i1 = i - (block * 2);
66
+ j1 = j - (block * 2);
67
+ for (k = i1; k < i1 + block; k++) {
68
+ if (s[k] != t[k + block]){
69
+ swap1 = 0;
70
+ break;
71
+ }
70
72
  }
71
- }
72
- for (k = j1; k < j1 + block; k++) {
73
- if (t[k] != s[k + block]){
74
- swap2 = 0;
75
- break;
73
+ for (k = j1; k < j1 + block; k++) {
74
+ if (t[k] != s[k + block]){
75
+ swap2 = 0;
76
+ break;
77
+ }
76
78
  }
77
- }
78
79
 
79
- del = d[j*sl + i - 1] + 1;
80
- ins = d[(j-1)*sl + i] + 1;
81
- min = del;
82
- if (ins < min) min = ins;
83
- //if (i == 2 && j==2) return LONG2NUM(swap2+5);
84
- if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
85
- transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
86
- if (transp < min) min = transp;
87
- block = 0;
88
- } else if (block == 1) {
89
- subs = d[(j-1)*sl + i - 1] + cost;
90
- if (subs < min) min = subs;
91
- }
92
- block--;
93
- }
94
- d[j*sl+i]=min;
95
- if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
96
- }
97
- if (current_distance > max_distance) {
98
- stop_execution = 1;
80
+ del = d[j*sl + i - 1] + 1;
81
+ ins = d[(j-1)*sl + i] + 1;
82
+ min = del;
83
+ if (ins < min) min = ins;
84
+ //if (i == 2 && j==2) return LONG2NUM(swap2+5);
85
+ if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
86
+ transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
87
+ if (transp < min) min = transp;
88
+ block = 0;
89
+ } else if (block == 1) {
90
+ subs = d[(j-1)*sl + i - 1] + cost;
91
+ if (subs < min) min = subs;
92
+ }
93
+ block--;
94
+ }
95
+ d[j*sl+i]=min;
96
+ if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
97
+ }
98
+ if (current_distance > max_distance) {
99
+ stop_execution = 1;
100
+ }
99
101
  }
100
- }
101
- distance=d[sl * tl - 1];
102
- if (stop_execution == 1) distance = current_distance;
102
+ distance=d[sl * tl - 1];
103
+ if (stop_execution == 1) distance = current_distance;
103
104
 
104
- free(d);
105
- return LONG2NUM(distance);
106
- }
107
- "
105
+ free(d);
106
+ return LONG2NUM(distance);
107
+ }
108
+ "
109
+ end
108
110
  end
109
111
  end
110
112
 
111
113
  if __FILE__ == $0
112
- a=DamerauLevenshteinMod.new
114
+ a=Taxamatch::DamerauLevenshteinMod.new
113
115
  s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
114
116
  t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')
115
117
 
@@ -1,47 +1,51 @@
1
1
  # encoding: UTF-8
2
2
 
3
- module Normalizer
4
- def self.normalize(string)
5
- utf8_to_ascii(string).upcase
6
- end
3
+ module Taxamatch
7
4
 
8
- def self.normalize_word(word)
9
- self.normalize(word).gsub(/[^A-Z0-9\-]/, '')
10
- end
5
+ module Normalizer
6
+ def self.normalize(string)
7
+ utf8_to_ascii(string).upcase
8
+ end
9
+
10
+ def self.normalize_word(word)
11
+ self.normalize(word).gsub(/[^A-Z0-9\-]/, '')
12
+ end
13
+
14
+ protected
15
+ def self.utf8_to_ascii(string)
16
+ string = string.gsub(/[ÀÂÅÃÄÁẤẠ]/, "A")
17
+ string = string.gsub(/[ÉÈÊË]/, "E")
18
+ string = string.gsub(/[ÍÌÎÏ]/, "I")
19
+ string = string.gsub(/[ÓÒÔØÕÖỚỔ]/, "O")
20
+ string = string.gsub(/[ÚÙÛÜ]/, "U")
21
+ string = string.gsub(/[Ý]/, "Y")
22
+ string = string.gsub(/Æ/, "AE")
23
+ string = string.gsub(/[ČÇ]/, "C")
24
+ string = string.gsub(/[ŠŞ]/, "S")
25
+ string = string.gsub(/[Đ]/, "D")
26
+ string = string.gsub(/Ž/, "Z")
27
+ string = string.gsub(/Ñ/, "N")
28
+ string = string.gsub(/Œ/, "OE")
29
+ string = string.gsub(/ß/, "B")
30
+ string = string.gsub(/Ķ/, "K")
31
+ string = string.gsub(/[áàâåãäăãắảạậầằ]/, "a")
32
+ string = string.gsub(/[éèêëĕěếệểễềẻ]/, "e")
33
+ string = string.gsub(/[íìîïǐĭīĩỉï]/, "i")
34
+ string = string.gsub(/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o")
35
+ string = string.gsub(/[úùûüůưừựủứụ]/, "u")
36
+ string = string.gsub(/[žź]/, "z")
37
+ string = string.gsub(/[ýÿỹ]/, "y")
38
+ string = string.gsub(/[đ]/, "d")
39
+ string = string.gsub(/æ/, "ae")
40
+ string = string.gsub(/[čćç]/, "c")
41
+ string = string.gsub(/[ñńň]/, "n")
42
+ string = string.gsub(/œ/, "oe")
43
+ string = string.gsub(/[śšş]/, "s")
44
+ string = string.gsub(/ř/, "r")
45
+ string = string.gsub(/ğ/, "g")
46
+ string = string.gsub(/Ř/, "R")
47
+ end
11
48
 
12
- protected
13
- def self.utf8_to_ascii(string)
14
- string = string.gsub(/[ÀÂÅÃÄÁẤẠ]/, "A")
15
- string = string.gsub(/[ÉÈÊË]/, "E")
16
- string = string.gsub(/[ÍÌÎÏ]/, "I")
17
- string = string.gsub(/[ÓÒÔØÕÖỚỔ]/, "O")
18
- string = string.gsub(/[ÚÙÛÜ]/, "U")
19
- string = string.gsub(/[Ý]/, "Y")
20
- string = string.gsub(/Æ/, "AE")
21
- string = string.gsub(/[ČÇ]/, "C")
22
- string = string.gsub(/[ŠŞ]/, "S")
23
- string = string.gsub(/[Đ]/, "D")
24
- string = string.gsub(/Ž/, "Z")
25
- string = string.gsub(/Ñ/, "N")
26
- string = string.gsub(/Œ/, "OE")
27
- string = string.gsub(/ß/, "B")
28
- string = string.gsub(/Ķ/, "K")
29
- string = string.gsub(/[áàâåãäăãắảạậầằ]/, "a")
30
- string = string.gsub(/[éèêëĕěếệểễềẻ]/, "e")
31
- string = string.gsub(/[íìîïǐĭīĩỉï]/, "i")
32
- string = string.gsub(/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o")
33
- string = string.gsub(/[úùûüůưừựủứụ]/, "u")
34
- string = string.gsub(/[žź]/, "z")
35
- string = string.gsub(/[ýÿỹ]/, "y")
36
- string = string.gsub(/[đ]/, "d")
37
- string = string.gsub(/æ/, "ae")
38
- string = string.gsub(/[čćç]/, "c")
39
- string = string.gsub(/[ñńň]/, "n")
40
- string = string.gsub(/œ/, "oe")
41
- string = string.gsub(/[śšş]/, "s")
42
- string = string.gsub(/ř/, "r")
43
- string = string.gsub(/ğ/, "g")
44
- string = string.gsub(/Ř/, "R")
45
49
  end
46
50
 
47
51
  end
@@ -1,83 +1,87 @@
1
1
  # encoding: UTF-8
2
2
  require 'biodiversity'
3
3
 
4
- class TaxamatchParser
5
- def initialize
6
- @parser = ScientificNameParser.new
7
- @parsed_raw = nil
8
- @res = {}
9
- end
4
+ module Taxamatch
5
+
6
+ class Parser
7
+ def initialize
8
+ @parser = ScientificNameParser.new
9
+ @parsed_raw = nil
10
+ @res = {}
11
+ end
10
12
 
11
- def parse(name)
12
- @res = {:all_authors => [], :all_years => []}
13
- @parsed_raw = JSON.load(@parser.parse(name).to_json)['scientificName']
14
- organize_results
15
- end
13
+ def parse(name)
14
+ @res = {:all_authors => [], :all_years => []}
15
+ @parsed_raw = JSON.load(@parser.parse(name).to_json)['scientificName']
16
+ organize_results
17
+ end
16
18
 
17
- def parsed_raw
18
- return @parsed_raw
19
- end
19
+ def parsed_raw
20
+ return @parsed_raw
21
+ end
20
22
 
21
- protected
23
+ protected
22
24
 
23
- def organize_results
24
- pr = @parsed_raw
25
- return nil unless pr['parsed']
26
- d = pr['details'][0]
27
- process_node(:uninomial, d['uninomial'])
28
- process_node(:genus, d['genus'])
29
- process_node(:species, d['species'], true)
30
- process_infraspecies(d['infraspecies'])
31
- @res[:all_authors] = @res[:all_authors].uniq.map {|a| Normalizer.normalize(a)}
32
- @res[:all_years].uniq!
33
- @res.keys.size > 2 ? @res : nil
34
- end
25
+ def organize_results
26
+ pr = @parsed_raw
27
+ return nil unless pr['parsed']
28
+ d = pr['details'][0]
29
+ process_node(:uninomial, d['uninomial'])
30
+ process_node(:genus, d['genus'])
31
+ process_node(:species, d['species'], true)
32
+ process_infraspecies(d['infraspecies'])
33
+ @res[:all_authors] = @res[:all_authors].uniq.map {|a| Taxamatch::Normalizer.normalize(a)}
34
+ @res[:all_years].uniq!
35
+ @res.keys.size > 2 ? @res : nil
36
+ end
35
37
 
36
- def process_node(name, node, is_species = false)
37
- return unless node
38
- @res[name] = {}
39
- @res[name][:epitheton] = node['epitheton']
40
- @res[name][:normalized] = Normalizer.normalize(node['epitheton'])
41
- @res[name][:phonetized] = Phonetizer.near_match(node['epitheton'], is_species)
42
- get_authors_years(node, @res[name])
43
- end
38
+ def process_node(name, node, is_species = false)
39
+ return unless node
40
+ @res[name] = {}
41
+ @res[name][:epitheton] = node['epitheton']
42
+ @res[name][:normalized] = Taxamatch::Normalizer.normalize(node['epitheton'])
43
+ @res[name][:phonetized] = Taxamatch::Phonetizer.near_match(node['epitheton'], is_species)
44
+ get_authors_years(node, @res[name])
45
+ end
44
46
 
45
- def process_infraspecies(node)
46
- return unless node
47
- @res[:infraspecies] = []
48
- node.each do |infr|
49
- hsh = {}
50
- hsh[:epitheton] = infr['epitheton']
51
- hsh[:normalized] = Normalizer.normalize(infr['epitheton'])
52
- hsh[:phonetized] = Phonetizer.near_match(infr['epitheton'], true)
53
- get_authors_years(infr,hsh)
54
- @res[:infraspecies] << hsh
47
+ def process_infraspecies(node)
48
+ return unless node
49
+ @res[:infraspecies] = []
50
+ node.each do |infr|
51
+ hsh = {}
52
+ hsh[:epitheton] = infr['epitheton']
53
+ hsh[:normalized] = Taxamatch::Normalizer.normalize(infr['epitheton'])
54
+ hsh[:phonetized] = Taxamatch::Phonetizer.near_match(infr['epitheton'], true)
55
+ get_authors_years(infr,hsh)
56
+ @res[:infraspecies] << hsh
57
+ end
55
58
  end
56
- end
57
59
 
58
- def get_authors_years(node, res)
59
- res[:authors] = []
60
- res[:years] = []
61
- ['basionymAuthorTeam','combinationAuthorTeam'].each do |au|
62
- if node[au]
63
- res[:authors] += node[au]['author']
64
- res[:years] << node[au]['year'] if node[au]['year']
65
- if node[au]['exAuthorTeam']
66
- res[:authors] += node[au]['exAuthorTeam']['author']
67
- res[:years] << node[au]['exAuthorTeam']['year'] if node[au]['exAuthorTeam']['year']
60
+ def get_authors_years(node, res)
61
+ res[:authors] = []
62
+ res[:years] = []
63
+ ['basionymAuthorTeam','combinationAuthorTeam'].each do |au|
64
+ if node[au]
65
+ res[:authors] += node[au]['author']
66
+ res[:years] << node[au]['year'] if node[au]['year']
67
+ if node[au]['exAuthorTeam']
68
+ res[:authors] += node[au]['exAuthorTeam']['author']
69
+ res[:years] << node[au]['exAuthorTeam']['year'] if node[au]['exAuthorTeam']['year']
70
+ end
68
71
  end
69
72
  end
73
+ res[:authors].uniq!
74
+ res[:years].uniq!
75
+ @res[:all_authors] += res[:authors] if res[:authors].size > 0
76
+ @res[:all_years] += res[:years] if res[:years].size > 0
70
77
  end
71
- res[:authors].uniq!
72
- res[:years].uniq!
73
- @res[:all_authors] += res[:authors] if res[:authors].size > 0
74
- @res[:all_years] += res[:years] if res[:years].size > 0
75
- end
76
78
 
79
+ end
77
80
  end
78
81
 
79
82
  if __FILE__ == $0
80
83
  require 'pp'
81
84
  p = Parser.new
82
85
  puts p.parse('Salmonella werahensis (Castellani) Hauduroy and Ehringer in Hauduroy 1937')
83
- end
86
+ end
87
+
@@ -1,72 +1,75 @@
1
1
  # encoding: UTF-8
2
+ module Taxamatch
2
3
 
3
- class Phonetizer
4
+ class Phonetizer
4
5
 
5
- def self.near_match(a_word, normalize_ending = false)
6
- a_word = a_word.strip rescue ''
7
- return '' if a_word == ''
8
- a_word = Normalizer.normalize a_word
9
- case a_word
10
- when /^AE/
11
- a_word = 'E' + a_word[2..-1]
12
- when /^CN/
13
- a_word = 'N' + a_word[2..-1]
14
- when /^CT/
15
- a_word = 'T' + a_word[2..-1]
16
- when /^CZ/
17
- a_word = 'C' + a_word[2..-1]
18
- when /^DJ/
19
- a_word = 'J' + a_word[2..-1]
20
- when /^EA/
21
- a_word = 'E' + a_word[2..-1]
22
- when /^EU/
23
- a_word = 'U' + a_word[2..-1]
24
- when /^GN/
25
- a_word = 'N' + a_word[2..-1]
26
- when /^KN/
27
- a_word = 'N' + a_word[2..-1]
28
- when /^MC/
29
- a_word = 'MAC' + a_word[2..-1]
30
- when /^MN/
31
- a_word = 'N' + a_word[2..-1]
32
- when /^OE/
33
- a_word = 'E' + a_word[2..-1]
34
- when /^QU/
35
- a_word = 'Q' + a_word[2..-1]
36
- when /^PS/
37
- a_word = 'S' + a_word[2..-1]
38
- when /^PT/
39
- a_word = 'T' + a_word[2..-1]
40
- when /^TS/
41
- a_word = 'S' + a_word[2..-1]
42
- when /^WR/
43
- a_word = 'R' + a_word[2..-1]
44
- when /^X/
45
- a_word = 'Z' + a_word[1..-1]
46
- end
47
- first_char = a_word.split('')[0]
48
- rest_chars = a_word.split('')[1..-1].join('')
49
- rest_chars.gsub!('AE', 'I')
50
- rest_chars.gsub!('IA', 'A')
51
- rest_chars.gsub!('OE', 'I')
52
- rest_chars.gsub!('OI', 'A')
53
- rest_chars.gsub!('SC', 'S')
54
- rest_chars.gsub!('H', '')
55
- rest_chars.tr!('EOUYKZ', 'IAIICS')
56
- a_word = (first_char + rest_chars).squeeze
6
+ def self.near_match(a_word, normalize_ending = false)
7
+ a_word = a_word.strip rescue ''
8
+ return '' if a_word == ''
9
+ a_word = Taxamatch::Normalizer.normalize a_word
10
+ case a_word
11
+ when /^AE/
12
+ a_word = 'E' + a_word[2..-1]
13
+ when /^CN/
14
+ a_word = 'N' + a_word[2..-1]
15
+ when /^CT/
16
+ a_word = 'T' + a_word[2..-1]
17
+ when /^CZ/
18
+ a_word = 'C' + a_word[2..-1]
19
+ when /^DJ/
20
+ a_word = 'J' + a_word[2..-1]
21
+ when /^EA/
22
+ a_word = 'E' + a_word[2..-1]
23
+ when /^EU/
24
+ a_word = 'U' + a_word[2..-1]
25
+ when /^GN/
26
+ a_word = 'N' + a_word[2..-1]
27
+ when /^KN/
28
+ a_word = 'N' + a_word[2..-1]
29
+ when /^MC/
30
+ a_word = 'MAC' + a_word[2..-1]
31
+ when /^MN/
32
+ a_word = 'N' + a_word[2..-1]
33
+ when /^OE/
34
+ a_word = 'E' + a_word[2..-1]
35
+ when /^QU/
36
+ a_word = 'Q' + a_word[2..-1]
37
+ when /^PS/
38
+ a_word = 'S' + a_word[2..-1]
39
+ when /^PT/
40
+ a_word = 'T' + a_word[2..-1]
41
+ when /^TS/
42
+ a_word = 'S' + a_word[2..-1]
43
+ when /^WR/
44
+ a_word = 'R' + a_word[2..-1]
45
+ when /^X/
46
+ a_word = 'Z' + a_word[1..-1]
47
+ end
48
+ first_char = a_word.split('')[0]
49
+ rest_chars = a_word.split('')[1..-1].join('')
50
+ rest_chars.gsub!('AE', 'I')
51
+ rest_chars.gsub!('IA', 'A')
52
+ rest_chars.gsub!('OE', 'I')
53
+ rest_chars.gsub!('OI', 'A')
54
+ rest_chars.gsub!('SC', 'S')
55
+ rest_chars.gsub!('H', '')
56
+ rest_chars.tr!('EOUYKZ', 'IAIICS')
57
+ a_word = (first_char + rest_chars).squeeze
57
58
 
58
- if normalize_ending && a_word.size > 4
59
- a_word = self.normalize_ending(a_word)
59
+ if normalize_ending && a_word.size > 4
60
+ a_word = self.normalize_ending(a_word)
61
+ end
62
+ a_word
60
63
  end
61
- a_word
62
- end
63
64
 
64
- def self.normalize_ending(a_word)
65
- # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
66
- # -- at the end of a string translate all to -a
67
- a_word.gsub!(/IS$/, 'A')
68
- a_word.gsub!(/IM$/, 'A')
69
- a_word.gsub(/AS$/, 'A')
70
- end
65
+ def self.normalize_ending(a_word)
66
+ # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
67
+ # -- at the end of a string translate all to -a
68
+ a_word.gsub!(/IS$/, 'A')
69
+ a_word.gsub!(/IM$/, 'A')
70
+ a_word.gsub(/AS$/, 'A')
71
+ end
71
72
 
73
+ end
74
+
72
75
  end
data/lib/taxamatch_rb.rb CHANGED
@@ -1,3 +1,4 @@
1
+ # encoding: UTF-8
1
2
  $:.unshift(File.dirname(__FILE__)) unless
2
3
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
4
  # $:.unshift('taxamatch_rb')
@@ -7,85 +8,91 @@ require 'taxamatch_rb/normalizer'
7
8
  require 'taxamatch_rb/phonetizer'
8
9
  require 'taxamatch_rb/authmatch'
9
10
 
10
- class Taxamatch
11
+ $KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
12
+
13
+ module Taxamatch
14
+
15
+ class Base
11
16
 
12
- def initialize
13
- @parser = TaxamatchParser.new
14
- @dlm = DamerauLevenshteinMod.new
15
- end
17
+ def initialize
18
+ @parser = Taxamatch::Parser.new
19
+ @dlm = Taxamatch::DamerauLevenshteinMod.new
20
+ end
16
21
 
17
22
 
18
- #takes two scientific names and returns true if names match and false if they don't
19
- def taxamatch(str1, str2)
20
- parsed_data_1 = @parser.parse(str1)
21
- parsed_data_2 = @parser.parse(str2)
22
- taxamatch_parsed_data(parsed_data_1, parsed_data_2)[:match]
23
- end
23
+ #takes two scientific names and returns true if names match and false if they don't
24
+ def taxamatch(str1, str2)
25
+ parsed_data_1 = @parser.parse(str1)
26
+ parsed_data_2 = @parser.parse(str2)
27
+ taxamatch_parsed_data(parsed_data_1, parsed_data_2)[:match]
28
+ end
24
29
 
25
- #takes two hashes of parsed scientific names, analyses them and returns back
26
- #this function is useful when species strings are preparsed.
27
- def taxamatch_parsed_data(parsed_data_1, parsed_data_2)
28
- result = nil
29
- result = match_uninomial(parsed_data_1, parsed_data_2) if parsed_data_1[:uninomial] && parsed_data_2[:uninomial]
30
- result = match_multinomial(parsed_data_1, parsed_data_2) if parsed_data_1[:genus] && parsed_data_2[:genus]
31
- if result && result[:match]
32
- result[:match] = match_authors(parsed_data_1, parsed_data_2) > 0 ? true : false
30
+ #takes two hashes of parsed scientific names, analyses them and returns back
31
+ #this function is useful when species strings are preparsed.
32
+ def taxamatch_parsed_data(parsed_data_1, parsed_data_2)
33
+ result = nil
34
+ result = match_uninomial(parsed_data_1, parsed_data_2) if parsed_data_1[:uninomial] && parsed_data_2[:uninomial]
35
+ result = match_multinomial(parsed_data_1, parsed_data_2) if parsed_data_1[:genus] && parsed_data_2[:genus]
36
+ if result && result[:match]
37
+ result[:match] = false if match_authors(parsed_data_1, parsed_data_2) == 0
38
+ end
39
+ return result
33
40
  end
34
- return result
35
- end
36
41
 
37
- def match_uninomial(parsed_data_1, parsed_data_2)
38
- return false
39
- end
42
+ def match_uninomial(parsed_data_1, parsed_data_2)
43
+ return false
44
+ end
40
45
 
41
- def match_multinomial(parsed_data_1, parsed_data_2)
42
- gen_match = match_genera(parsed_data_1[:genus], parsed_data_2[:genus])
43
- sp_match = match_species(parsed_data_1[:species], parsed_data_2[:species])
44
- au_match = match_authors(parsed_data_1, parsed_data_2)
45
- total_length = parsed_data_1[:genus][:epitheton].size + parsed_data_2[:genus][:epitheton].size + parsed_data_1[:species][:epitheton].size + parsed_data_2[:species][:epitheton].size
46
- match = match_matches(gen_match, sp_match)
47
- match.merge({:score => (1- match[:edit_distance]/(total_length/2))})
48
- end
46
+ def match_multinomial(parsed_data_1, parsed_data_2)
47
+ gen_match = match_genera(parsed_data_1[:genus], parsed_data_2[:genus])
48
+ sp_match = match_species(parsed_data_1[:species], parsed_data_2[:species])
49
+ au_match = match_authors(parsed_data_1, parsed_data_2)
50
+ total_length = parsed_data_1[:genus][:epitheton].size + parsed_data_2[:genus][:epitheton].size + parsed_data_1[:species][:epitheton].size + parsed_data_2[:species][:epitheton].size
51
+ match = match_matches(gen_match, sp_match)
52
+ match.merge({:score => (1- match[:edit_distance]/(total_length/2))})
53
+ end
49
54
 
50
- def match_genera(genus1, genus2)
51
- genus1_length = genus1[:normalized].size
52
- genus2_length = genus2[:normalized].size
53
- match = false
54
- ed = @dlm.distance(genus1[:normalized], genus2[:normalized],2,3)
55
- return {:edit_distance => ed, :phonetic_match => true, :match => true} if genus1[:phonetized] == genus2[:phonetized]
55
+ def match_genera(genus1, genus2)
56
+ genus1_length = genus1[:normalized].size
57
+ genus2_length = genus2[:normalized].size
58
+ match = false
59
+ ed = @dlm.distance(genus1[:normalized], genus2[:normalized],2,3)
60
+ return {:edit_distance => ed, :phonetic_match => true, :match => true} if genus1[:phonetized] == genus2[:phonetized]
56
61
 
57
- match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
58
- {:edit_distance => ed, :match => match, :phonetic_match => false}
59
- end
62
+ match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
63
+ {:edit_distance => ed, :match => match, :phonetic_match => false}
64
+ end
60
65
 
61
- def match_species(sp1, sp2)
62
- sp1_length = sp1[:normalized].size
63
- sp2_length = sp2[:normalized].size
64
- sp1[:phonetized] = Phonetizer.normalize_ending sp1[:phonetized]
65
- sp2[:phonetized] = Phonetizer.normalize_ending sp2[:phonetized]
66
- match = false
67
- ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 4, 4)
68
- return {:edit_distance => ed, :phonetic_match => true, :match => true} if sp1[:phonetized] == sp2[:phonetized]
66
+ def match_species(sp1, sp2)
67
+ sp1_length = sp1[:normalized].size
68
+ sp2_length = sp2[:normalized].size
69
+ sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
70
+ sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
71
+ match = false
72
+ ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 4, 4)
73
+ return {:edit_distance => ed, :phonetic_match => true, :match => true} if sp1[:phonetized] == sp2[:phonetized]
69
74
 
70
- match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
71
- {:edit_distance => ed, :match => match, :phonetic_match => false}
72
- end
75
+ match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
76
+ {:edit_distance => ed, :match => match, :phonetic_match => false}
77
+ end
73
78
 
74
- def match_authors(parsed_data_1, parsed_data_2)
75
- au1 = parsed_data_1[:all_authors]
76
- au2 = parsed_data_2[:all_authors]
77
- yr1 = parsed_data_1[:all_years]
78
- yr2 = parsed_data_2[:all_years]
79
- Authmatch.authmatch(au1, au2, yr1, yr2)
80
- end
79
+ def match_authors(parsed_data_1, parsed_data_2)
80
+ au1 = parsed_data_1[:all_authors]
81
+ au2 = parsed_data_2[:all_authors]
82
+ yr1 = parsed_data_1[:all_years]
83
+ yr2 = parsed_data_2[:all_years]
84
+ Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
85
+ end
81
86
 
82
- def match_matches(genus_match, species_match, infraspecies_matches = [])
83
- match = species_match
84
- match[:edit_distance] += genus_match[:edit_distance]
85
- match[:match] = false if match[:edit_distance] > 4
86
- match[:match] &&= genus_match[:match]
87
- match[:phonetic_match] &&= genus_match[:phonetic_match]
88
- match
87
+ def match_matches(genus_match, species_match, infraspecies_matches = [])
88
+ match = species_match
89
+ match[:edit_distance] += genus_match[:edit_distance]
90
+ match[:match] = false if match[:edit_distance] > 4
91
+ match[:match] &&= genus_match[:match]
92
+ match[:phonetic_match] &&= genus_match[:phonetic_match]
93
+ match
94
+ end
95
+
89
96
  end
90
97
 
91
- end
98
+ end
@@ -28,6 +28,8 @@ Pomatomus|pomatomus|10|1|1
28
28
  Pomatomus||10|1|9
29
29
  |Pomatomus|10|1|9
30
30
  P|p|10|1|1
31
+ #TODO: one letter vs longer string generates a big negative number
32
+ #L|Linneaus|10|1|7
31
33
 
32
34
 
33
35
  #it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
data/spec/spec_helper.rb CHANGED
@@ -23,6 +23,6 @@ def read_test_file(file, fields_num)
23
23
  end
24
24
 
25
25
  def make_taxamatch_hash(string)
26
- normalized = Normalizer.normalize(string)
27
- {:epitheton => string, :normalized => normalized, :phonetized => Phonetizer.near_match(normalized)}
26
+ normalized = Taxamatch::Normalizer.normalize(string)
27
+ {:epitheton => string, :normalized => normalized, :phonetized => Taxamatch::Phonetizer.near_match(normalized)}
28
28
  end
@@ -1,10 +1,10 @@
1
1
  # encoding: UTF-8
2
2
  require File.dirname(__FILE__) + '/spec_helper.rb'
3
3
 
4
- describe 'DamerauLevensteinMod' do
4
+ describe 'DamerauLevenshteinMod' do
5
5
  it 'should get tests' do
6
6
  read_test_file(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt', 5) do |y|
7
- dl = DamerauLevenshteinMod.new
7
+ dl = Taxamatch::DamerauLevenshteinMod.new
8
8
  if y
9
9
  res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
10
10
  puts y if res != y[4].to_i
@@ -16,7 +16,7 @@ end
16
16
 
17
17
  describe 'Parser' do
18
18
  before(:all) do
19
- @parser =TaxamatchParser.new
19
+ @parser = Taxamatch::Parser.new
20
20
  end
21
21
 
22
22
  it 'should parse uninomials' do
@@ -35,27 +35,27 @@ describe 'Parser' do
35
35
  end
36
36
 
37
37
 
38
- describe 'Normalizer' do
38
+ describe 'Taxamatch::Normalizer' do
39
39
  it 'should normalize strings' do
40
- Normalizer.normalize('abcd').should == 'ABCD'
41
- Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
42
- Normalizer.normalize('Ærenea').should == 'AERENEA'
43
- Normalizer.normalize('Fallén').should == 'FALLEN'
44
- Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
40
+ Taxamatch::Normalizer.normalize('abcd').should == 'ABCD'
41
+ Taxamatch::Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
42
+ Taxamatch::Normalizer.normalize('Ærenea').should == 'AERENEA'
43
+ Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
44
+ Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
45
45
  end
46
46
 
47
47
  it 'should normalize words' do
48
- Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
48
+ Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
49
49
  end
50
50
  end
51
51
 
52
- describe 'Taxamatch' do
52
+ describe 'Taxamatch::Base' do
53
53
  before(:all) do
54
- @tm = Taxamatch.new
54
+ @tm = Taxamatch::Base.new
55
55
  end
56
56
 
57
57
  it 'should get txt tests' do
58
- dl = DamerauLevenshteinMod.new
58
+ dl = Taxamatch::DamerauLevenshteinMod.new
59
59
  read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 3) do |y|
60
60
  if y
61
61
  y[2] = y[2] == 'true' ? true : false
@@ -174,14 +174,40 @@ describe 'Taxamatch' do
174
174
  @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>4, :match=>true}
175
175
  end
176
176
 
177
- describe 'Authmatch' do
177
+ describe 'Taxamatch::Authmatch' do
178
178
  before(:all) do
179
- @am = Authmatch
179
+ @am = Taxamatch::Authmatch
180
180
  end
181
181
 
182
182
  it 'should calculate score' do
183
- res = @am.authmatch(['Linnaeus', 'Muller'], ['L', 'Kenn'], [], [1788])
183
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
184
184
  res.should == 90
185
+ res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
186
+ res.should == 0
187
+ #found all authors, same year
188
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
189
+ res.should == 100
190
+ #all authors, 1 year diff
191
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
192
+ res.should == 54
193
+ #all authors found, year is missing on one side so it is not counted
194
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
195
+ res.should == 94
196
+ #found all authors on one side, same year
197
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
198
+ res.should == 91
199
+ #found all authors on one side, 1 year diff
200
+ res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
201
+ res.should == 51
202
+ #found all authors on one side, year does not count
203
+ res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
204
+ res.should == 90
205
+ #found some authors
206
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
207
+ res.should == 67
208
+ #same partial author match as the previous case, but both years are present and differ: no match
209
+ res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
210
+ res.should == 0
185
211
  end
186
212
 
187
213
  it 'should compare years' do
@@ -205,7 +231,18 @@ describe 'Taxamatch' do
205
231
  #fuzzy match
206
232
  res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
207
233
  res.should == [["Dem"], ["Stepanov"]]
234
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
235
+ res.should == [['Linnaeus', 'Muller'], ['Kenn']]
236
+ res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
237
+ res.should == [[],['Kurtz']]
238
+ end
239
+
240
+ it 'should fuzzy match authors' do
241
+ #TODO: fix the bug revealed by this test
242
+ # res = @am.fuzzy_match_authors('L', 'Muller')
243
+ # res.should be_false
208
244
  end
245
+
209
246
  end
210
247
 
211
248
  end
@@ -16,4 +16,4 @@ Pomatomus saltator|Pomatomus saltatrix|true
16
16
  Loligo pealeii|Loligo plei|false
17
17
 
18
18
  # different authors should not match
19
- #Puma concolor Linnaeus|Puma concolor Kurtz|false
19
+ Puma concolor Linnaeus|Puma concolor Kurtz|false
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dimus-taxamatch_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin