dimus-taxamatch_rb 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +6 -0
- data/LICENSE +20 -0
- data/README.rdoc +7 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/features/step_definitions/common_steps.rb +163 -0
- data/features/step_definitions/taxamatch_rb.rb +92 -0
- data/features/support/common.rb +29 -0
- data/features/support/env.rb +14 -0
- data/features/support/matchers.rb +11 -0
- data/features/taxamatch_rb.feature +33 -0
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +136 -0
- data/lib/taxamatch_rb/normalizer.rb +47 -0
- data/lib/taxamatch_rb/parser.rb +83 -0
- data/lib/taxamatch_rb/phonetizer.rb +74 -0
- data/lib/taxamatch_rb.rb +444 -0
- data/spec/damerau_levenshtein_mod_test.txt +58 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +24 -0
- data/spec/taxamatch_rb_spec.rb +50 -0
- data/taxamatch_rb.gemspec +65 -0
- metadata +96 -0
@@ -0,0 +1,83 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'biodiversity'
|
3
|
+
|
4
|
+
# Wraps ScientificNameParser and reshapes its output into a flat hash keyed by
# name part (:uninomial, :genus, :species, :infraspecies). Every name part
# carries the raw epitheton, its normalized and phonetized forms, and the
# authors/years attached to it. :all_authors and :all_years accumulate the
# authors and years of every part.
class Parser
  # The 'scientificName' hash produced by the most recent #parse call
  # (nil until #parse has been called).
  attr_reader :parsed_raw

  def initialize
    @parser = ScientificNameParser.new
    @parsed_raw = nil
    @res = {}
  end

  # Parses +name+ and returns the organized result hash, or nil when the
  # underlying parser reports the name as not parsed or when no name part
  # was found.
  def parse(name)
    @res = {:all_authors => [], :all_years => []}
    # Round-trip through JSON so the parser output becomes plain
    # string-keyed hashes and arrays.
    @parsed_raw = JSON.load(@parser.parse(name).to_json)['scientificName']
    organize_results
  end

  protected

  # Fills @res from @parsed_raw. Returns @res, or nil if the name was not
  # parsed or nothing besides the two accumulator keys ended up in @res.
  def organize_results
    return nil unless @parsed_raw['parsed']
    details = @parsed_raw['details'][0]
    process_node(:uninomial, details['uninomial'])
    process_node(:genus, details['genus'])
    process_node(:species, details['species'], true)
    process_infraspecies(details['infraspecies'])
    @res[:all_authors].uniq!
    @res[:all_years].uniq!
    # :all_authors and :all_years are always present, so more than two keys
    # means at least one name part was recorded.
    @res.keys.size > 2 ? @res : nil
  end

  # Stores the entry for a single name part under @res[name]; a missing
  # node is skipped.
  def process_node(name, node, is_species = false)
    return unless node
    @res[name] = build_entry(node, is_species)
  end

  # Stores one entry per infraspecific epithet, in order, under
  # @res[:infraspecies]; a missing node is skipped.
  def process_infraspecies(node)
    return unless node
    @res[:infraspecies] = node.map { |infr| build_entry(infr, true) }
  end

  # Builds the result hash shared by all name parts. +normalize_ending+ is
  # forwarded to Phonetizer.near_match as its second argument.
  def build_entry(node, normalize_ending)
    epitheton = node['epitheton']
    entry = {
      :epitheton => epitheton,
      :normalized => Normalizer.normalize(epitheton),
      :phonetized => Phonetizer.near_match(epitheton, normalize_ending)
    }
    get_authors_years(node, entry)
    entry
  end

  # Collects the unique authors and years of +node+ into res[:authors] and
  # res[:years], and appends them to the parser-wide accumulators.
  def get_authors_years(node, res)
    res[:authors] = []
    res[:years] = []
    ['basionymAuthorTeam', 'combinationAuthorTeam'].each do |key|
      team = node[key]
      next unless team
      collect_team(team, res)
      collect_team(team['exAuthorTeam'], res) if team['exAuthorTeam']
    end
    res[:authors].uniq!
    res[:years].uniq!
    @res[:all_authors].concat(res[:authors])
    @res[:all_years].concat(res[:years])
  end

  # Adds the authors and the optional year of one author team to +res+.
  # Array() tolerates a team without an 'author' list.
  def collect_team(team, res)
    res[:authors].concat(Array(team['author']))
    res[:years] << team['year'] if team['year']
  end

end
|
78
|
+
|
79
|
+
# Manual smoke test: when this file is run directly, parse a sample name and
# pretty-print the organized result.
if __FILE__ == $0
  require 'pp'
  parser = Parser.new
  pp parser.parse('Salmonella werahensis (Castellani) Hauduroy and Ehringer in Hauduroy 1937')
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Reduces a word to a rough phonetic key so that spelling variants of the
# same name produce the same string.
class Phonetizer

  # Leading letter groups that are rewritten before the rest of the word is
  # processed, as [prefix, replacement] pairs. No prefix is the start of
  # another, so at most one entry applies to a word.
  LEADING_REPLACEMENTS = [
    ['AE', 'E'], ['CN', 'N'], ['CT', 'T'], ['CZ', 'C'], ['DJ', 'J'],
    ['EA', 'E'], ['EU', 'U'], ['GN', 'N'], ['KN', 'N'], ['MC', 'MAC'],
    ['MN', 'N'], ['OE', 'E'], ['QU', 'Q'], ['PS', 'S'], ['PT', 'T'],
    ['TS', 'S'], ['WR', 'R'], ['X', 'Z']
  ].freeze

  # Returns the phonetic key of +a_word+.
  #
  # a_word::           the word to phonetize; nil, blank strings and objects
  #                    that cannot be stripped yield ''.
  # normalize_ending:: when true and the key is longer than 4 characters,
  #                    the ending is unified with Phonetizer.normalize_ending.
  def self.near_match(a_word, normalize_ending = false)
    return '' unless a_word.respond_to?(:strip)
    a_word = a_word.strip
    return '' if a_word == ''
    a_word = Normalizer.normalize(a_word)
    # Nothing left to phonetize if normalization produced an empty word.
    return '' if a_word.to_s.empty?

    # Only the very start of the word is examined; a prefix at the start of a
    # later line must not trigger a rewrite of the first characters.
    prefix, replacement = LEADING_REPLACEMENTS.find { |pre, _| a_word.start_with?(pre) }
    a_word = replacement + a_word[prefix.size..-1] if prefix

    # The first character is kept as is; only the remainder is reduced.
    first_char = a_word[0, 1]
    rest_chars = a_word[1..-1]
    # Order matters: e.g. 'AE' -> 'I' may create an 'IA' that is then
    # collapsed to 'A' by the next step.
    rest_chars = rest_chars.gsub('AE', 'I').
                            gsub('IA', 'A').
                            gsub('OE', 'I').
                            gsub('OI', 'A').
                            gsub('SC', 'S').
                            gsub('H', '').
                            tr('EOUYKZ', 'IAIICS')
    # Runs of the same character collapse to a single character.
    a_word = (first_char + rest_chars).squeeze

    if normalize_ending && a_word.size > 4
      a_word = self.normalize_ending(a_word)
    end
    a_word
  end

  # Unifies variant endings of an already phonetized word and returns the
  # result as a new string; +a_word+ itself is left untouched.
  #
  # Deals with variant endings -is (includes -us, -ys, -es), -im (was -um),
  # -as (-os): at the end of a string all of them are translated to -a.
  #
  # Note: this is a public class method. A bare `protected` does not apply
  # to methods defined with `def self.`.
  def self.normalize_ending(a_word)
    a_word.gsub(/IS$/, 'A').gsub(/IM$/, 'A').gsub(/AS$/, 'A')
  end

end
|
data/lib/taxamatch_rb.rb
ADDED
@@ -0,0 +1,444 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__)) unless
|
2
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
|
+
# $:.unshift('taxamatch_rb')
|
4
|
+
require 'taxamatch_rb/damerau_levenshtein_mod'
|
5
|
+
require 'taxamatch_rb/parser'
|
6
|
+
require 'taxamatch_rb/normalizer'
|
7
|
+
require 'taxamatch_rb/phonetizer'
|
8
|
+
|
9
|
+
# Entry point of the library. At present it only creates the Parser it will
# work with; the matching API itself exists only as the commented-out sketch
# below.
class Taxamatch

  def initialize
    @parser = Parser.new
  end

  # Sketch of the planned API (commented out, not implemented):
  #
  #   # takes two scientific names and returns true if names matched and false if they did not
  #   def taxamatch(str1, str2)
  #     taxa1 = @parser.parse_prepare(str1)
  #     taxa2 = @parser.parse_prepare(str2)
  #
  #     full_taxamatch(taxa1, taxa2)[:matched]
  #   end
  #
  #   # takes two hashes of parsed scientific names, analyses them and returns back
  #   # this function is useful when species strings are preparsed.
  #   def full_taxamatch
  #   end

end
|
30
|
+
|
31
|
+
|
32
|
+
#
|
33
|
+
# public function name_strings_match($name_string1, $name_string2) {
|
34
|
+
# $info_1 = new Splitter(null,$name_string1);
|
35
|
+
# $info_2 = new Splitter(null,$name_string2);
|
36
|
+
# return $this->name_objects_match($info_1, $info_2);
|
37
|
+
# }
|
38
|
+
#
|
39
|
+
# public function name_objects_match($name_object_1, $name_object_2) {
|
40
|
+
# $genus_match = $this->match_genera($name_object_1->genus, $name_object_2->genus);
|
41
|
+
# $epithets_match = $this->match_species_epithets($name_object_1->species, $name_object_2->species);
|
42
|
+
# $total_length = strlen($name_object_1->genus) + strlen($name_object_1->species) + strlen($name_object_2->genus) + strlen($name_object_2->species);
|
43
|
+
# $match = $this->match_binomials($genus_match, $epithets_match);
|
44
|
+
# return $this->match_response_to_float($match, $total_length);
|
45
|
+
# }
|
46
|
+
#
|
47
|
+
# public function match_response_to_float($match_response, $total_length_of_strings) {
|
48
|
+
# if(!$match_response['match']) return 0.0;
|
49
|
+
#
|
50
|
+
# return (1 - ($match_response['edit_distance'] / ($total_length_of_strings/2)));
|
51
|
+
# }
|
52
|
+
#
|
53
|
+
# public function match_genera($genus1, $genus2) {
|
54
|
+
# $match = $phonetic_match = false;
|
55
|
+
# $nm = new NearMatch();
|
56
|
+
# $genus1_phonetic = $nm->near_match($genus1);
|
57
|
+
# $genus2_phonetic = $nm->near_match($genus2);
|
58
|
+
# $genus1_length = strlen($genus1);
|
59
|
+
#
|
60
|
+
# $temp_genus_ED = $this->mdld($genus2, $genus1, 2);
|
61
|
+
# // add the genus post-filter
|
62
|
+
# // min. 51% "good" chars
|
63
|
+
# // first char must match for ED 2+
|
64
|
+
# if( ($temp_genus_ED <= 3 && ( min( strlen( $genus2 ), $genus1_length ) > ( $temp_genus_ED * 2 ))
|
65
|
+
# && ( $temp_genus_ED < 2 || ( substr($genus2,0,1) == substr($genus1,0,1) ) ) )
|
66
|
+
# || ($genus1_phonetic == $genus2_phonetic) ) {
|
67
|
+
# $match = true;
|
68
|
+
# // accept as exact or near match; append to genus results table
|
69
|
+
# $this->debug['process'][] = "6a (near_match_genus:$genus2_phonetic) (this_near_match_genus:$genus1_phonetic)";
|
70
|
+
#
|
71
|
+
# if($genus1_phonetic == $genus2_phonetic) $phonetic_match = true;
|
72
|
+
# }
|
73
|
+
# return array(
|
74
|
+
# 'match' => $match,
|
75
|
+
# 'phonetic_match' => $phonetic_match,
|
76
|
+
# 'edit_distance' => $temp_genus_ED);
|
77
|
+
# }
|
78
|
+
#
|
79
|
+
# public function match_species_epithets($species_epithet1, $species_epithet2) {
|
80
|
+
# $match = false;
|
81
|
+
# $phonetic_match = false;
|
82
|
+
# $epithet1_length = strlen($species_epithet1);
|
83
|
+
# $epithet2_length = strlen($species_epithet2);
|
84
|
+
#
|
85
|
+
# $nm = new NearMatch();
|
86
|
+
# $epithet1_phonetic = $nm->near_match($species_epithet1);
|
87
|
+
# $epithet2_phonetic = $nm->near_match($species_epithet2);
|
88
|
+
# $temp_species_ED = $this->mdld($species_epithet2, $species_epithet1, 4);
|
89
|
+
# // add the species post-filter
|
90
|
+
# // min. 50% "good" chars
|
91
|
+
# // first char must match for ED2+
|
92
|
+
# // first 3 chars must match for ED4
|
93
|
+
# if ($epithet2_phonetic == $epithet1_phonetic) $match = true;
|
94
|
+
# elseif( ($temp_species_ED <= 4 && min($epithet2_length, $epithet1_length) >= ($temp_species_ED*2)
|
95
|
+
# && ($temp_species_ED < 2 || strpos($species_epithet2 , substr($species_epithet1,0,1)) !== false)
|
96
|
+
# && ($temp_species_ED < 4 || strpos($species_epithet2 , substr($species_epithet1,0,3)) !== false))) $match = true;
|
97
|
+
#
|
98
|
+
# // if phonetic match, set relevant flag
|
99
|
+
# if ($epithet2_phonetic == $epithet1_phonetic) $phonetic_match = true;
|
100
|
+
#
|
101
|
+
# return array('match' => $match, 'phonetic_match' => $phonetic_match, 'edit_distance' => $temp_species_ED);
|
102
|
+
# }
|
103
|
+
#
|
104
|
+
#
|
105
|
+
# public function match_binomials($genus_match, $species_epithets_match) {
|
106
|
+
# $binomial_match = $species_epithets_match;
|
107
|
+
# $binomial_match['edit_distance'] = $genus_match["edit_distance"] + $species_epithets_match["edit_distance"];
|
108
|
+
#
|
109
|
+
# if(!$genus_match['match']) $binomial_match['match'] = false;
|
110
|
+
# if($binomial_match["edit_distance"] > 4) $binomial_match['match'] = false;
|
111
|
+
# if(!$genus_match['phonetic_match']) $binomial_match['phonetic_match'] = false;
|
112
|
+
#
|
113
|
+
#
|
114
|
+
# return $binomial_match;
|
115
|
+
# }
|
116
|
+
#
|
117
|
+
# // public function match_species($genus1, $species_epithet1, $genus2, $species_epithet2, $genus_edit_distance) {
|
118
|
+
# // $match = false;
|
119
|
+
# // $phonetic_match = false;
|
120
|
+
# // $epithet1_length = strlen($species_epithet1);
|
121
|
+
# // $epithet2_length = strlen($species_epithet2);
|
122
|
+
# //
|
123
|
+
# // $nm = new NearMatch();
|
124
|
+
# // $genus1_phonetic = $nm->near_match(genus1);
|
125
|
+
# // $genus2_phonetic = $nm->near_match(genus2);
|
126
|
+
# // $epithet1_phonetic = $nm->near_match($species_epithet1);
|
127
|
+
# // $epithet2_phonetic = $nm->near_match($species_epithet2);
|
128
|
+
# // $temp_species_ED = $this->mdld($species2, $species1, 4);
|
129
|
+
# // // add the species post-filter
|
130
|
+
# // // min. 50% "good" chars
|
131
|
+
# // // first char must match for ED2+
|
132
|
+
# // // first 3 chars must match for ED4
|
133
|
+
# // if ( ($epithet2_phonetic == $epithet1_phonetic)
|
134
|
+
# // || ( ($genus_edit_distance + $temp_species_ED <= 4)
|
135
|
+
# // && ($temp_species_ED <= 4 && min(strlen($epithet2_length),$epithet1_length) >= ($temp_species_ED*2)
|
136
|
+
# // && ($temp_species_ED < 2 || strpos($species_epithet2 , substr($species_epithet1,1,1)) !== false)
|
137
|
+
# // && ($temp_species_ED < 4 || strpos($species_epithet2 , substr($species_epithet1,1,3)) !== false)
|
138
|
+
# // && ($genus_edit_distance + $temp_species_ED <= 4) ))) {
|
139
|
+
# // $match = true;
|
140
|
+
# // // accept as exact or near match, append to species results table
|
141
|
+
# // // if phonetic match, set relevant flag
|
142
|
+
# // if ( ($genus2_phonetic == $genus1_phonetic) && ($epithet2_phonetic == $epithet1_phonetic) ) $phonetic_match = true;
|
143
|
+
# // }
|
144
|
+
# // return array('match' => $match, 'phonetic_match' => $phonetic_match, 'edit_distance' => $temp_species_ED);
|
145
|
+
# // }
|
146
|
+
#
|
147
|
+
# /**
|
148
|
+
# * Function : process
|
149
|
+
# * Purpose: Perform exact and fuzzy matching on a species name, or single genus name
|
150
|
+
# * Input: - genus, genus+species, or genus+species+authority (in this version), as "searchtxt"
|
151
|
+
# * - "search_mode" to control search mode: currently normal (default) / rapid / no_shaping
|
152
|
+
# * - "debug" - print internal parameters used if not null
|
153
|
+
# * Outputs: list of genera and species that match (or near match) input terms, with associated
|
154
|
+
# * ancillary info as desired
|
155
|
+
# * Remarks:
|
156
|
+
# * (1) This demo version is configured to access base data in three tables:
|
157
|
+
# * - genlist_test1 (genus info); primary key (PK) is genus_id
|
158
|
+
# * - splist_test1 (species info); PK is species_id, has genus_id as foreign key (FK)
|
159
|
+
# * (= link to relevant row in genus table)
|
160
|
+
# * - auth_abbrev_test1 (authority abbreviations - required by subsidiary function
|
161
|
+
# * "normalize_auth". Refer README file for relevant minimum table definitions.
|
162
|
+
# * If authority comparisons are not required, calls to "normalize_auth" can be disabled and
|
163
|
+
# * relevant function commented out, removing need for third table.
|
164
|
+
# * (In a production system, table and column names can be varied as desired so long as
|
165
|
+
# * code is altered at relevant points, also could be re-configured to hold all genus+species info together in a single table with minor re-write).
|
166
|
+
# * (2) Writes to and reads back from pre-defined global temporary tables
|
167
|
+
# * "genus_id_matches" and "species_id_matches", new instances of these are automatically
|
168
|
+
# * created for each session (i.e., do not need clearing at procedure end). Refer
|
169
|
+
# * README file for relevant table definitions.
|
170
|
+
# * (3) When result shaping is on in this version, a relevant message displayed as required
|
171
|
+
# * for developer feedback, if more distant results are being masked (in production version,
|
172
|
+
# * possibly would not do this)
|
173
|
+
# * (4) Requires the following subsidiary functions (supplied elsewhere in this package):
|
174
|
+
# * - normalize
|
175
|
+
# * - normalize_auth
|
176
|
+
# * - reduce_spaces
|
177
|
+
# * - ngram
|
178
|
+
# * - compare_auth
|
179
|
+
# * - near_match
|
180
|
+
# * - mdld
|
181
|
+
# * (5) Accepts "+" as input separator in place of space (e.g. "Homo+sapiens"), e.g. for calling
|
182
|
+
# * via a HTTP GET request as needed.
|
183
|
+
# * @param string $searchtxt : genus, genus+species, or genus+species+authority
|
184
|
+
# * @param string $search_mode : normal (default) / rapid / no_shaping
|
185
|
+
# * @param boolean $cache
|
186
|
+
# * @return boolean
|
187
|
+
# */
|
188
|
+
# public function process($searchtxt, $search_mode='normal', $cache = false) {
|
189
|
+
#
|
190
|
+
# $this->input = $searchtxt;
|
191
|
+
#
|
192
|
+
# $this->debug['process'][] = "1 (searchtxt:$searchtxt) (search_mode:$search_mode)";
|
193
|
+
# $this->searchtxt = $searchtxt;
|
194
|
+
# $this->search_mode=$search_mode;
|
195
|
+
#
|
196
|
+
# // accept "+" as separator if supplied, transform to space
|
197
|
+
# if ( strpos($this->searchtxt,'+') !== false ) {
|
198
|
+
# $text_str = str_replace('+',' ',$this->searchtxt);
|
199
|
+
# } else {
|
200
|
+
# $text_str = $this->searchtxt;
|
201
|
+
# }
|
202
|
+
#
|
203
|
+
# $this->debug['process'][] = "1a (text_str:$text_str)";
|
204
|
+
#
|
205
|
+
# if ( is_null($text_str) || $text_str == '' ) {
|
206
|
+
# $this->debug['process'][] = "2 Return(false)";
|
207
|
+
# return false;
|
208
|
+
# }
|
209
|
+
#
|
210
|
+
# // Clearing the temporary tables
|
211
|
+
# $this->db->clearTempTables();
|
212
|
+
#
|
213
|
+
# // includes stripping of presumed non-relevant content including subgenera, comments, cf's, aff's, etc... to
|
214
|
+
#
|
215
|
+
# // Normalizing the search text
|
216
|
+
# $n = new Normalize($this->db);
|
217
|
+
#
|
218
|
+
# $this->debug['process'][] = "3 (text_str:$text_str)";
|
219
|
+
#
|
220
|
+
# if(!$this->chop_overload) {
|
221
|
+
# // leave presumed genus + species + authority (in this instance), with genus and species in uppercase
|
222
|
+
# $splitter = new Splitter($n,$text_str);
|
223
|
+
#
|
224
|
+
# $this->this_search_genus = $this_search_genus = $splitter->get('genus');
|
225
|
+
# $this->this_search_species = $this_search_species = $splitter->get('species');
|
226
|
+
# $this->this_authority = $this_authority = $splitter->get('author');
|
227
|
+
# }
|
228
|
+
#
|
229
|
+
# // cache_flag switch determines if caching is allowed for the source
|
230
|
+
# if($this->cache_flag == true) {
|
231
|
+
#
|
232
|
+
# if ( $this_search_genus != '' && $this_search_species != '' && $this_authority != '' ) {
|
233
|
+
# $cache_key = $this_search_genus . '-' . $this_search_species . '-' . $this_authority . '_' . $search_mode;
|
234
|
+
# $cache_path = $this->cache_path . $this->db->source . "/authority/";
|
235
|
+
# } else if ( $this_search_genus != '' && $this_search_species != '' ) {
|
236
|
+
# $cache_key = $this_search_genus . '-' . $this_search_species . '_' . $search_mode;
|
237
|
+
# $cache_path = $this->cache_path . $this->db->source . "/species/";
|
238
|
+
# } else if ( $this_search_genus != '' ) {
|
239
|
+
# $cache_key = $this_search_genus . '_' . $search_mode;
|
240
|
+
# $cache_path = $this->cache_path . $this->db->source . "/genus/";
|
241
|
+
# }
|
242
|
+
#
|
243
|
+
# $this->mkdir_recursive($cache_path);
|
244
|
+
# $this->_cache = new Cache( $cache_path );
|
245
|
+
# $this->_cache->setKey($cache_key);
|
246
|
+
#
|
247
|
+
# }
|
248
|
+
#
|
249
|
+
# $cache_loop_flag = false;
|
250
|
+
# if($cache == true && $this->cache_flag == true) {
|
251
|
+
# if($this->_cache->cache_exists()) $cache_loop_flag = true;
|
252
|
+
# }
|
253
|
+
#
|
254
|
+
# if(!$cache_loop_flag) {
|
255
|
+
#
|
256
|
+
# $this->debug['process'][] = "3a (this_search_genus:$this_search_genus) (this_search_species:$this_search_species) (this_authority:$this_authority)";
|
257
|
+
#
|
258
|
+
# $nm = new NearMatch();
|
259
|
+
# $this_near_match_genus = $nm->near_match($this_search_genus);
|
260
|
+
#
|
261
|
+
# $this->debug['process'][] = "3b (this_near_match_genus:$this_near_match_genus)";
|
262
|
+
# //TODO refactor inside of a method
|
263
|
+
# $this_genus_start = substr($this_search_genus,0,3);
|
264
|
+
# $this_genus_end = substr($this_search_genus,-3);
|
265
|
+
# $this_genus_length = strlen($this_search_genus);
|
266
|
+
# //TODO_END
|
267
|
+
# $this->debug['process'][] = "3c (this_search_genus,$this_search_genus) (this_genus_start:$this_genus_start) (this_genus_end:$this_genus_end) (this_genus_length:$this_genus_length)";
|
268
|
+
#
|
269
|
+
# if ($this_search_species != '') {
|
270
|
+
# $this_near_match_species = $nm->near_match($this_search_species, 'epithet_only');
|
271
|
+
# $this_species_length = strlen($this_search_species);
|
272
|
+
# $this->debug['process'][] = "4 (this_search_species:$this_search_species) (this_near_match_species:$this_near_match_species) (this_species_length:$this_species_length)";
|
273
|
+
# }
|
274
|
+
#
|
275
|
+
#
|
276
|
+
# // now look for exact or near matches on genus first select candidate genera for edit distance (MDLD) test
|
277
|
+
#
|
278
|
+
# // for drec in genus_cur loop -- includes the genus pre-filter (main portion)
|
279
|
+
# $genus_res = $this->db->genus_cur($this->search_mode, $this_near_match_genus, $this_near_match_species, $this_genus_length,$this_genus_start,$this_genus_end);
|
280
|
+
#
|
281
|
+
# $this->debug['process'][] = "5 (genus_res:$genus_res)";
|
282
|
+
#
|
283
|
+
# if(count($genus_res)) {
|
284
|
+
# foreach($genus_res as $drec) {
|
285
|
+
# // test candidate genera for edit distance, keep if satisfies post-test criteria
|
286
|
+
# $this->genera_tested++;
|
287
|
+
# // do the genus edit distance test
|
288
|
+
#
|
289
|
+
# $genus_match = $this->match_genera($this_search_genus, $drec->search_genus_name);
|
290
|
+
# if ($genus_match['match']) {
|
291
|
+
# $phonetic_flag = $genus_match['phonetic_match'] ? 'Y' : null;
|
292
|
+
# $this->db->saveGenusMatches($drec->genus_id, $drec->genus, $genus_match['edit_distance'], $phonetic_flag);
|
293
|
+
#
|
294
|
+
# if ( ($this_search_species != null) && ($this_search_species != '') ) {
|
295
|
+
# $species_res = $this->db->species_cur($drec->genus_id, $this_species_length );
|
296
|
+
#
|
297
|
+
# if(count($species_res)) {
|
298
|
+
# foreach($species_res as $drec1) {
|
299
|
+
# $this->species_tested++;
|
300
|
+
#
|
301
|
+
# // do the species edit distance test
|
302
|
+
# $species_epithets_match = $this->match_species_epithets($this_search_species, $drec1->search_species_name);
|
303
|
+
# $binomials_match = $this->match_binomials($genus_match, $species_epithets_match);
|
304
|
+
# if ($binomials_match['match']) {
|
305
|
+
#
|
306
|
+
# $bionial_phonetic_flag = $binomials_match['phonetic_match'] ? 'Y' : null;
|
307
|
+
# $this->db->saveSpeciesMatches($drec1->species_id, $drec1->genus_species, $genus_match['edit_distance'], $temp_species_ED, $binomials_match['edit_distance'], $bionial_phonetic_flag);
|
308
|
+
# } //
|
309
|
+
# } // End foreach species_res
|
310
|
+
# } // End If elements exist for species_res
|
311
|
+
# } // End Search Species Exist
|
312
|
+
# }
|
313
|
+
# }
|
314
|
+
# }
|
315
|
+
# } // End Cache Loop Flag
|
316
|
+
# return true;
|
317
|
+
# }
|
318
|
+
#
|
319
|
+
# /**
|
320
|
+
# * generateResponse
|
321
|
+
# * Result generation section (including ranking, result shaping,
|
322
|
+
# * and authority comparison) - for demo purposes only
|
323
|
+
# * NB, in a production system this would be replaced by something
|
324
|
+
# * more appropriate, e.g. write to a file or database table,
|
325
|
+
# * generate a HTML page for web display,
|
326
|
+
# * generate XML response, etc. etc.
|
327
|
+
# * @param boolean $cache
|
328
|
+
# * @return boolean
|
329
|
+
# */
|
330
|
+
# public function generateResponse($cache) {
|
331
|
+
#
|
332
|
+
# $cache_loop_flag = false;
|
333
|
+
# if($cache == true && $this->cache_flag == true) {
|
334
|
+
# if($this->_cache->cache_exists()) $cache_loop_flag = true;
|
335
|
+
# }
|
336
|
+
#
|
337
|
+
# // if($cache == true && $this->_cache->cache_exists() && $this->cache_flag == true) {
|
338
|
+
# if($cache_loop_flag) {
|
339
|
+
#
|
340
|
+
# $this->data = $this->_cache->fetch();
|
341
|
+
# $data_array = json_decode($this->data,true);
|
342
|
+
# $data_array['cache'] = $cache;
|
343
|
+
# $this->data = json_encode($data_array);
|
344
|
+
#
|
345
|
+
# } else {
|
346
|
+
#
|
347
|
+
# // genus exact, phonetic, and other near matches
|
348
|
+
# $this->output['input'] = $this->searchtxt;
|
349
|
+
# $this->debug['generateResponse'][] = "1 (input:" . $this->searchtxt . ")";
|
350
|
+
#
|
351
|
+
# // Genus Exact
|
352
|
+
# $this->debug['generateResponse'][] = "1a (getGenusAuthority:exact)";
|
353
|
+
# $this->getGenusAuthority(0,'exact');
|
354
|
+
# // Genus Phonetic
|
355
|
+
# $this->debug['generateResponse'][] = "1b (getGenusAuthority:phonetic)";
|
356
|
+
# $this->getGenusAuthority('P','phonetic');
|
357
|
+
# // Genus near matches
|
358
|
+
# $this->debug['generateResponse'][] = "1c (getGenusAuthority:near_1)";
|
359
|
+
# $this->getGenusAuthority(1,'near_1');
|
360
|
+
# $this->debug['generateResponse'][] = "1d (getGenusAuthority:near_2)";
|
361
|
+
# $this->getGenusAuthority(2,'near_2');
|
362
|
+
#
|
363
|
+
# if(!is_array($this->output['genus']) && $this->this_search_genus != '') {$this->output['genus'] = array();}
|
364
|
+
#
|
365
|
+
# if ( !is_null($this->this_search_species) ) {
|
366
|
+
# // species exact, phonetic, and other near matches
|
367
|
+
#
|
368
|
+
# $this->debug['generateResponse'][] = "2a (getSpeciesAuthority:exact) ($this->this_authority)";
|
369
|
+
# $this->getSpeciesAuthority( 0, 'exact', $this->this_authority );
|
370
|
+
# $this->debug['generateResponse'][] = "2b (getSpeciesAuthority:phonetic) ($this->this_authority)";
|
371
|
+
# $this->getSpeciesAuthority( 'P', 'phonetic', $this->this_authority );
|
372
|
+
# $this->debug['generateResponse'][] = "2c (getSpeciesAuthority:near_1) ($this->this_authority)";
|
373
|
+
# $this->getSpeciesAuthority( 1, 'near_1', $this->this_authority );
|
374
|
+
# $this->debug['generateResponse'][] = "2d (getSpeciesAuthority:near_2) ($this->this_authority)";
|
375
|
+
# $this->getSpeciesAuthority( 2, 'near_2', $this->this_authority );
|
376
|
+
#
|
377
|
+
# // -- Here is the result shaping section (only show ED 3 if no ED 1,2 or phonetic matches, only
|
378
|
+
# // -- show ED 4 if no ED 1,2,3 or phonetic matches). By default shaping is on, unless disabled
|
379
|
+
# // -- via the input parameter "search_mode" set to 'no_shaping'.
|
380
|
+
# // -- In this demo we supplement any actual shaping with a message to show that it has been invoked,
|
381
|
+
# // -- to show the system operates correctly.
|
382
|
+
# if ($this->species_found == 'Y') {
|
383
|
+
# $temp_species_count = $this->db->countSpeciesMatches(3);
|
384
|
+
# $this->debug['generateResponse'][] = "3 (temp_species_count:$temp_species_count)";
|
385
|
+
# }
|
386
|
+
#
|
387
|
+
# if( $temp_species_count > 0 && $this->search_mode == 'no_shaping' ) {
|
388
|
+
# $this->debug['generateResponse'][] = "4 (getSpeciesAuthority:near_3) ($this->this_authority)";
|
389
|
+
# $this->getSpeciesAuthority( 3, 'near_3', $this->this_authority );
|
390
|
+
#
|
391
|
+
# if( $this->species_found == 'Y' ) {
|
392
|
+
# $temp_species_count = $this->db->countSpeciesMatches(4);
|
393
|
+
# }
|
394
|
+
#
|
395
|
+
# if( $temp_species_count > 0 && $this->search_mode == 'no_shaping') {
|
396
|
+
# $this->debug['generateResponse'][] = "4 (getSpeciesAuthority:near_4) ($this->this_authority)";
|
397
|
+
# $this->getSpeciesAuthority( 4, 'near_4', $this->this_authority );
|
398
|
+
# }
|
399
|
+
# } // END temp_species_count > 0 and "no_shaping"
|
400
|
+
#
|
401
|
+
# } // END If this_search_species
|
402
|
+
#
|
403
|
+
# if(!is_array($this->output['species']) && $this->this_search_species != '') {$this->output['species'] = array();}
|
404
|
+
# if($this->output_type == 'rest') {
|
405
|
+
# if($this->debug_flag) {
|
406
|
+
# $this->data = json_encode( array( 'success' => true, 'cache' => $cache, 'data' => $this->output, 'debug' => $this->debug ) );
|
407
|
+
# } else {
|
408
|
+
# $this->data = json_encode( array( 'success' => true, 'cache' => $cache, 'data' => $this->output));
|
409
|
+
# }
|
410
|
+
# } else {
|
411
|
+
# $this->data = $this->output;
|
412
|
+
# }
|
413
|
+
#
|
414
|
+
# if($this->cache_flag == true) {
|
415
|
+
# if( ! $this->_cache->cache_exists()) {
|
416
|
+
# if($this->debug_flag) {
|
417
|
+
# $op_array = array (
|
418
|
+
# 'success' => true
|
419
|
+
# , 'cache_date' => date('Y-m-d')
|
420
|
+
# , 'data' => $this->output
|
421
|
+
# , 'debug' => $this->debug
|
422
|
+
# );
|
423
|
+
# } else {
|
424
|
+
# $op_array = array (
|
425
|
+
# 'success' => true
|
426
|
+
# , 'cache_date' => date('Y-m-d')
|
427
|
+
# , 'data' => $this->output
|
428
|
+
# );
|
429
|
+
#
|
430
|
+
# }
|
431
|
+
# $op = json_encode($op_array);
|
432
|
+
# $this->_cache->update($op);
|
433
|
+
# $tmp_cache_key = $this->_cache->getKey();
|
434
|
+
# $this->_cache->setKey($tmp_cache_key . '_debug');
|
435
|
+
# $dbg = @json_encode($this->debug);
|
436
|
+
# $this->_cache->update($dbg);
|
437
|
+
# $this->_cache->setKey($tmp_cache_key);
|
438
|
+
# }
|
439
|
+
# }
|
440
|
+
# }
|
441
|
+
#
|
442
|
+
# return true;
|
443
|
+
#
|
444
|
+
# }
|
@@ -0,0 +1,58 @@
|
|
1
|
+
######################
|
2
|
+
# Tests for modified Damerau Levenshtein Distance algorithm (UTF-8 compatible)
|
3
|
+
#
|
4
|
+
# * B. Boehmer, T. Rees, Modified Damerau-Levenshtein Distance, Boehmer & Rees 2008
|
5
|
+
# * F.J. Damerau. A technique for computer detection and correction of spelling errors, Communications of the ACM, 1964
|
6
|
+
#
|
7
|
+
# Fields:
|
8
|
+
# String1|String2|maximum distance|transposition block size|expected distance
|
9
|
+
# - String1, String2
|
10
|
+
# compared strings
|
11
|
+
# - maximum distance
|
12
|
+
# stops execution of the algorithm when calculated distance exceeds the maximum distance number
|
13
|
+
# - transposition block size
|
14
|
+
# determines how many characters can be transposed. Block size 1 returns score according to Damerau-Levenshtein algorithm
|
15
|
+
# - expected distance
|
16
|
+
# resulting distance that has to be achieved by the algorithm
|
17
|
+
# Note: algorithm does not try to normalize or interpret strings in any way.
|
18
|
+
######################
|
19
|
+
|
20
|
+
#it should recognize the exact match
|
21
|
+
Pomatomus|Pomatomus|10|1|0
|
22
|
+
|
23
|
+
#it should not try to normalize incoming strings
|
24
|
+
Pomatomus|Pomatomus|10|1|1
|
25
|
+
Pomatomus|pomatomus|10|1|1
|
26
|
+
|
27
|
+
#it should calculate special cases
|
28
|
+
Pomatomus||10|1|9
|
29
|
+
|Pomatomus|10|1|9
|
30
|
+
P|p|10|1|1
|
31
|
+
|
32
|
+
|
33
|
+
#it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
|
34
|
+
Pomatomus|Pomatomux|10|1|1
|
35
|
+
Pmatomus|Pomatomus|10|1|1
|
36
|
+
Pomatomus|Pmatomus|10|1|1
|
37
|
+
Rpmatomus|Pomatomus|10|1|2
|
38
|
+
Pommtomus|Pomatomus|10|1|1
|
39
|
+
Potamomus|Pomatomus|10|1|2
|
40
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
|
41
|
+
Pomatomus|oPmatomus|10|1|1
|
42
|
+
Pomatomus|Pomatomsu|10|1|1
|
43
|
+
Pomtaomus|Pomatomus|10|1|1
|
44
|
+
Pomatoums|Pomatomus|10|1|1
|
45
|
+
Potamomus|Pomatomus|10|1|2
|
46
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|2|1
|
47
|
+
|
48
|
+
#it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size >= 2)
|
49
|
+
serrulatus|serratulus|10|2|2
|
50
|
+
Pomatomus|Poomumats|10|3|3
|
51
|
+
vesiculosus|vecusilosus|10|1|4
|
52
|
+
vesiculosus|vecusilosus|10|2|2
|
53
|
+
trimerophyton|mertriophyton|10|1|6
|
54
|
+
trimerophyton|mertriophyton|10|3|3
|
55
|
+
|
56
|
+
#it should stop trying if distance exceeds maximum allowed distance
|
57
|
+
Pxxxxomus|Pomatomus|10|1|4
|
58
|
+
Pxxxxomus|Pomatomus|2|1|null
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|