dimus-taxamatch_rb 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,83 @@
1
+ # encoding: UTF-8
2
+ require 'biodiversity'
3
+
4
# Wraps ScientificNameParser (from the biodiversity gem) and reshapes its
# output into a flat hash keyed by name part (:uninomial, :genus, :species,
# :infraspecies), each carrying the original, normalized and phonetized
# epitheton together with its authors and years.
class Parser
  # Keys of a name node that may hold authorship information.
  AUTHOR_TEAMS = %w[basionymAuthorTeam combinationAuthorTeam].freeze

  # The 'scientificName' part of the last parse result (string-keyed hash),
  # or nil before the first call to #parse.
  attr_reader :parsed_raw

  def initialize
    @parser = ScientificNameParser.new
    @parsed_raw = nil
    @res = {}
  end

  # Parses +name+ and returns the organized result hash.
  #
  # Returns nil when the name could not be parsed or when none of the
  # uninomial/genus/species/infraspecies parts were found. The returned hash
  # always contains :all_authors and :all_years (de-duplicated across parts).
  def parse(name)
    @res = {:all_authors => [], :all_years => []}
    # Round-trip through JSON so that every key is a plain string.
    @parsed_raw = JSON.load(@parser.parse(name).to_json)['scientificName']
    organize_results
  end

  protected

  # Fills @res from @parsed_raw. Returns @res, or nil if nothing useful
  # was found.
  def organize_results
    pr = @parsed_raw
    return nil unless pr && pr['parsed']
    # Guard against a missing or empty 'details' list instead of raising.
    d = (pr['details'] || []).first
    return nil unless d
    process_node(:uninomial, d['uninomial'])
    process_node(:genus, d['genus'])
    process_node(:species, d['species'], true)
    process_infraspecies(d['infraspecies'])
    @res[:all_authors].uniq!
    @res[:all_years].uniq!
    # The two bookkeeping keys are always present; require at least one part.
    @res.keys.size > 2 ? @res : nil
  end

  # Stores the description of a single name part under @res[name].
  # +is_species+ is forwarded to Phonetizer.near_match as its second argument.
  def process_node(name, node, is_species = false)
    return unless node
    @res[name] = describe_epitheton(node, is_species)
  end

  # Stores one description per infraspecific epithet under @res[:infraspecies].
  def process_infraspecies(node)
    return unless node
    @res[:infraspecies] = node.map { |infr| describe_epitheton(infr, true) }
  end

  # Builds the hash shared by all name parts: epitheton, its normalized and
  # phonetized forms, and the authors/years attached to the node.
  def describe_epitheton(node, is_species)
    epitheton = node['epitheton']
    hsh = {}
    hsh[:epitheton] = epitheton
    hsh[:normalized] = Normalizer.normalize(epitheton)
    hsh[:phonetized] = Phonetizer.near_match(epitheton, is_species)
    get_authors_years(node, hsh)
    hsh
  end

  # Collects authors and years of +node+ (including "ex" author teams) into
  # res[:authors] / res[:years] and appends them to the overall
  # @res[:all_authors] / @res[:all_years] lists.
  def get_authors_years(node, res)
    res[:authors] = []
    res[:years] = []
    AUTHOR_TEAMS.each do |au|
      team = node[au]
      next unless team
      collect_team(team, res)
      collect_team(team['exAuthorTeam'], res) if team['exAuthorTeam']
    end
    res[:authors].uniq!
    res[:years].uniq!
    @res[:all_authors].concat(res[:authors])
    @res[:all_years].concat(res[:years])
  end

  # Appends the authors and (if present) the year of one author team.
  # Array() tolerates a team without an 'author' entry.
  def collect_team(team, res)
    res[:authors].concat(Array(team['author']))
    res[:years] << team['year'] if team['year']
  end

end
78
+
79
# Manual smoke test: when this file is run directly, parse a sample name and
# pretty-print the organized result.
if __FILE__ == $0
  require 'pp'
  parser = Parser.new
  pp parser.parse('Salmonella werahensis (Castellani) Hauduroy and Ehringer in Hauduroy 1937')
end
@@ -0,0 +1,74 @@
1
+ # encoding: UTF-8
2
+
3
# Produces a rough phonetic key ("near match") for a word, so that spelling
# variants that sound alike compare equal.
class Phonetizer

  # Replacements applied to the very beginning of the normalized word.
  LEADING_REPLACEMENTS = {
    'AE' => 'E', 'CN' => 'N', 'CT' => 'T', 'CZ' => 'C', 'DJ' => 'J',
    'EA' => 'E', 'EU' => 'U', 'GN' => 'N', 'KN' => 'N', 'MC' => 'MAC',
    'MN' => 'N', 'OE' => 'E', 'QU' => 'Q', 'PS' => 'S', 'PT' => 'T',
    'TS' => 'S', 'WR' => 'R', 'X' => 'Z'
  }.freeze

  # Anchored with \A so only the start of the whole string can match
  # (the prefixes are mutually exclusive, so alternation order is irrelevant).
  LEADING_PATTERN = /\A(?:#{LEADING_REPLACEMENTS.keys.join('|')})/

  # Replacements applied, in this order, to everything after the first character.
  INNER_REPLACEMENTS = [
    ['AE', 'I'], ['IA', 'A'], ['OE', 'I'], ['OI', 'A'], ['SC', 'S'], ['H', '']
  ].freeze

  # Returns the phonetic key of +a_word+.
  #
  # a_word           - the word to phonetize; nil or anything that does not
  #                    respond to #strip yields ''.
  # normalize_ending - when true and the key is longer than 4 characters,
  #                    variant endings are unified (see .normalize_ending).
  #
  # Returns '' for blank input or when normalization leaves nothing.
  def self.near_match(a_word, normalize_ending = false)
    a_word = a_word.respond_to?(:strip) ? a_word.strip : ''
    return '' if a_word == ''
    a_word = Normalizer.normalize(a_word).to_s
    # Nothing left after normalization: avoid slicing an empty string.
    return '' if a_word.empty?

    a_word = a_word.sub(LEADING_PATTERN) { |prefix| LEADING_REPLACEMENTS[prefix] }

    # The first character is kept as is; only the remainder is transformed.
    first_char = a_word[0, 1]
    rest_chars = a_word[1..-1]
    INNER_REPLACEMENTS.each { |from, to| rest_chars = rest_chars.gsub(from, to) }
    rest_chars = rest_chars.tr('EOUYKZ', 'IAIICS')
    # Collapse runs of identical characters (doubled letters).
    a_word = (first_char + rest_chars).squeeze

    if normalize_ending && a_word.size > 4
      a_word = self.normalize_ending(a_word)
    end
    a_word
  end

  # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
  # -- at the end of a string translate all to -a
  #
  # Returns a new string; +a_word+ itself is left untouched.
  # Note: visibility keywords do not apply to `def self.` methods, so this
  # method has always been publicly callable and stays so.
  def self.normalize_ending(a_word)
    a_word.sub(/(?:IS|IM|AS)\z/, 'A')
  end

end
@@ -0,0 +1,444 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+ # $:.unshift('taxamatch_rb')
4
+ require 'taxamatch_rb/damerau_levenshtein_mod'
5
+ require 'taxamatch_rb/parser'
6
+ require 'taxamatch_rb/normalizer'
7
+ require 'taxamatch_rb/phonetizer'
8
+
9
# Entry point for matching scientific names. At this stage the class only
# creates a Parser; the matching API below is a commented-out draft.
class Taxamatch

  def initialize
    @parser = Parser.new
  end

  # Draft of the planned API (not implemented yet):
  #
  # # takes two scientific names and returns true if names matched and false if they did not
  # def taxamatch(str1, str2)
  #   # NOTE(review): Parser#parse_prepare is not defined in the parser shown
  #   # alongside this class (it exposes #parse) - confirm before enabling.
  #   taxa1 = @parser.parse_prepare(str1)
  #   taxa2 = @parser.parse_prepare(str2)
  #
  #   full_taxamatch(taxa1, taxa2)[:matched]
  # end
  #
  # # takes two hashes of parsed scientific names, analyses them and returns back
  # # this function is useful when species strings are preparsed.
  # def full_taxamatch
  # end

end
30
+
31
+
32
+ #
33
+ # public function name_strings_match($name_string1, $name_string2) {
34
+ # $info_1 = new Splitter(null,$name_string1);
35
+ # $info_2 = new Splitter(null,$name_string2);
36
+ # return $this->name_objects_match($info_1, $info_2);
37
+ # }
38
+ #
39
+ # public function name_objects_match($name_object_1, $name_object_2) {
40
+ # $genus_match = $this->match_genera($name_object_1->genus, $name_object_2->genus);
41
+ # $epithets_match = $this->match_species_epithets($name_object_1->species, $name_object_2->species);
42
+ # $total_length = strlen($name_object_1->genus) + strlen($name_object_1->species) + strlen($name_object_2->genus) + strlen($name_object_2->species);
43
+ # $match = $this->match_binomials($genus_match, $epithets_match);
44
+ # return $this->match_response_to_float($match, $total_length);
45
+ # }
46
+ #
47
+ # public function match_response_to_float($match_response, $total_length_of_strings) {
48
+ # if(!$match_response['match']) return 0.0;
49
+ #
50
+ # return (1 - ($match_response['edit_distance'] / ($total_length_of_strings/2)));
51
+ # }
52
+ #
53
+ # public function match_genera($genus1, $genus2) {
54
+ # $match = $phonetic_match = false;
55
+ # $nm = new NearMatch();
56
+ # $genus1_phonetic = $nm->near_match($genus1);
57
+ # $genus2_phonetic = $nm->near_match($genus2);
58
+ # $genus1_length = strlen($genus1);
59
+ #
60
+ # $temp_genus_ED = $this->mdld($genus2, $genus1, 2);
61
+ # // add the genus post-filter
62
+ # // min. 51% "good" chars
63
+ # // first char must match for ED 2+
64
+ # if( ($temp_genus_ED <= 3 && ( min( strlen( $genus2 ), $genus1_length ) > ( $temp_genus_ED * 2 ))
65
+ # && ( $temp_genus_ED < 2 || ( substr($genus2,0,1) == substr($genus1,0,1) ) ) )
66
+ # || ($genus1_phonetic == $genus2_phonetic) ) {
67
+ # $match = true;
68
+ # // accept as exact or near match; append to genus results table
69
+ # $this->debug['process'][] = "6a (near_match_genus:$genus2_phonetic) (this_near_match_genus:$genus1_phonetic)";
70
+ #
71
+ # if($genus1_phonetic == $genus2_phonetic) $phonetic_match = true;
72
+ # }
73
+ # return array(
74
+ # 'match' => $match,
75
+ # 'phonetic_match' => $phonetic_match,
76
+ # 'edit_distance' => $temp_genus_ED);
77
+ # }
78
+ #
79
+ # public function match_species_epithets($species_epithet1, $species_epithet2) {
80
+ # $match = false;
81
+ # $phonetic_match = false;
82
+ # $epithet1_length = strlen($species_epithet1);
83
+ # $epithet2_length = strlen($species_epithet2);
84
+ #
85
+ # $nm = new NearMatch();
86
+ # $epithet1_phonetic = $nm->near_match($species_epithet1);
87
+ # $epithet2_phonetic = $nm->near_match($species_epithet2);
88
+ # $temp_species_ED = $this->mdld($species_epithet2, $species_epithet1, 4);
89
+ # // add the species post-filter
90
+ # // min. 50% "good" chars
91
+ # // first char must match for ED2+
92
+ # // first 3 chars must match for ED4
93
+ # if ($epithet2_phonetic == $epithet1_phonetic) $match = true;
94
+ # elseif( ($temp_species_ED <= 4 && min($epithet2_length, $epithet1_length) >= ($temp_species_ED*2)
95
+ # && ($temp_species_ED < 2 || strpos($species_epithet2 , substr($species_epithet1,0,1)) !== false)
96
+ # && ($temp_species_ED < 4 || strpos($species_epithet2 , substr($species_epithet1,0,3)) !== false))) $match = true;
97
+ #
98
+ # // if phonetic match, set relevant flag
99
+ # if ($epithet2_phonetic == $epithet1_phonetic) $phonetic_match = true;
100
+ #
101
+ # return array('match' => $match, 'phonetic_match' => $phonetic_match, 'edit_distance' => $temp_species_ED);
102
+ # }
103
+ #
104
+ #
105
+ # public function match_binomials($genus_match, $species_epithets_match) {
106
+ # $binomial_match = $species_epithets_match;
107
+ # $binomial_match['edit_distance'] = $genus_match["edit_distance"] + $species_epithets_match["edit_distance"];
108
+ #
109
+ # if(!$genus_match['match']) $binomial_match['match'] = false;
110
+ # if($binomial_match["edit_distance"] > 4) $binomial_match['match'] = false;
111
+ # if(!$genus_match['phonetic_match']) $binomial_match['phonetic_match'] = false;
112
+ #
113
+ #
114
+ # return $binomial_match;
115
+ # }
116
+ #
117
+ # // public function match_species($genus1, $species_epithet1, $genus2, $species_epithet2, $genus_edit_distance) {
118
+ # // $match = false;
119
+ # // $phonetic_match = false;
120
+ # // $epithet1_length = strlen($species_epithet1);
121
+ # // $epithet2_length = strlen($species_epithet2);
122
+ # //
123
+ # // $nm = new NearMatch();
124
+ # // $genus1_phonetic = $nm->near_match(genus1);
125
+ # // $genus2_phonetic = $nm->near_match(genus2);
126
+ # // $epithet1_phonetic = $nm->near_match($species_epithet1);
127
+ # // $epithet2_phonetic = $nm->near_match($species_epithet2);
128
+ # // $temp_species_ED = $this->mdld($species2, $species1, 4);
129
+ # // // add the species post-filter
130
+ # // // min. 50% "good" chars
131
+ # // // first char must match for ED2+
132
+ # // // first 3 chars must match for ED4
133
+ # // if ( ($epithet2_phonetic == $epithet1_phonetic)
134
+ # // || ( ($genus_edit_distance + $temp_species_ED <= 4)
135
+ # // && ($temp_species_ED <= 4 && min(strlen($epithet2_length),$epithet1_length) >= ($temp_species_ED*2)
136
+ # // && ($temp_species_ED < 2 || strpos($species_epithet2 , substr($species_epithet1,1,1)) !== false)
137
+ # // && ($temp_species_ED < 4 || strpos($species_epithet2 , substr($species_epithet1,1,3)) !== false)
138
+ # // && ($genus_edit_distance + $temp_species_ED <= 4) ))) {
139
+ # // $match = true;
140
+ # // // accept as exact or near match, append to species results table
141
+ # // // if phonetic match, set relevant flag
142
+ # // if ( ($genus2_phonetic == $genus1_phonetic) && ($epithet2_phonetic == $epithet1_phonetic) ) $phonetic_match = true;
143
+ # // }
144
+ # // return array('match' => $match, 'phonetic_match' => $phonetic_match, 'edit_distance' => $temp_species_ED);
145
+ # // }
146
+ #
147
+ # /**
148
+ # * Function : process
149
+ # * Purpose: Perform exact and fuzzy matching on a species name, or single genus name
150
+ # * Input: - genus, genus+species, or genus+species+authority (in this version), as "searchtxt"
151
+ # * - "search_mode" to control search mode: currently normal (default) / rapid / no_shaping
152
+ # * - "debug" - print internal parameters used if not null
153
+ # * Outputs: list of genera and species that match (or near match) input terms, with associated
154
+ # * ancillary info as desired
155
+ # * Remarks:
156
+ # * (1) This demo version is configured to access base data in three tables:
157
+ # * - genlist_test1 (genus info); primary key (PK) is genus_id
158
+ # * - splist_test1 (species info); PK is species_id, has genus_id as foreign key (FK)
159
+ # * (= link to relevant row in genus table)
160
+ # * - auth_abbrev_test1 (authority abbreviations - required by subsidiary function
161
+ # * "normalize_auth". Refer README file for relevant minimum table definitions.
162
+ # * If authority comparisons are not required, calls to "normalize_auth" can be disabled and
163
+ # * relevant function commented out, removing need for third table.
164
+ # * (In a production system, table and column names can be varied as desired so long as
165
+ # * code is altered at relevant points, also could be re-configured to hold all genus+species info together in a single table with minor re-write).
166
+ # * (2) Writes to and reads back from pre-defined global temporary tables
167
+ # * "genus_id_matches" and "species_id_matches", new instances of these are automatically
168
+ # * created for each session (i.e., do not need clearing at procedure end). Refer
169
+ # * README file for relevant table definitions.
170
+ # * (3) When result shaping is on in this version, a relevant message displayed as required
171
+ # * for developer feedback, if more distant results are being masked (in producton version,
172
+ # * possibly would not do this)
173
+ # * (4) Requires the following subsidiary functions (supplied elsewhere in this package):
174
+ # * - normalize
175
+ # * - normalize_auth
176
+ # * - reduce_spaces
177
+ # * - ngram
178
+ # * - compare_auth
179
+ # * - near_match
180
+ # * - mdld
181
+ # * (5) Accepts "+" as input separator in place of space (e.g. "Homo+sapiens"), e.g. for calling
182
+ # * via a HTTP GET request as needed.
183
+ # * @param string $searchtxt : genus, genus+species, or genus+species+authority
184
+ # * @param string $search_mode : normal (default) / rapid / no_shaping
185
+ # * @param boolean $cache
186
+ # * @return boolean
187
+ # */
188
+ # public function process($searchtxt, $search_mode='normal', $cache = false) {
189
+ #
190
+ # $this->input = $searchtxt;
191
+ #
192
+ # $this->debug['process'][] = "1 (searchtxt:$searchtxt) (search_mode:$search_mode)";
193
+ # $this->searchtxt = $searchtxt;
194
+ # $this->search_mode=$search_mode;
195
+ #
196
+ # // accept "+" as separator if supplied, tranform to space
197
+ # if ( strpos($this->searchtxt,'+') !== false ) {
198
+ # $text_str = str_replace('+',' ',$this->searchtxt);
199
+ # } else {
200
+ # $text_str = $this->searchtxt;
201
+ # }
202
+ #
203
+ # $this->debug['process'][] = "1a (text_str:$text_str)";
204
+ #
205
+ # if ( is_null($text_str) || $text_str == '' ) {
206
+ # $this->debug['process'][] = "2 Return(false)";
207
+ # return false;
208
+ # }
209
+ #
210
+ # // Clearing the temporary tables
211
+ # $this->db->clearTempTables();
212
+ #
213
+ # // includes stripping of presumed non-relevant content including subgenera, comments, cf's, aff's, etc... to
214
+ #
215
+ # // Normalizing the search text
216
+ # $n = new Normalize($this->db);
217
+ #
218
+ # $this->debug['process'][] = "3 (text_str:$text_str)";
219
+ #
220
+ # if(!$this->chop_overload) {
221
+ # // leave presumed genus + species + authority (in this instance), with genus and species in uppercase
222
+ # $splitter = new Splitter($n,$text_str);
223
+ #
224
+ # $this->this_search_genus = $this_search_genus = $splitter->get('genus');
225
+ # $this->this_search_species = $this_search_species = $splitter->get('species');
226
+ # $this->this_authority = $this_authority = $splitter->get('author');
227
+ # }
228
+ #
229
+ # // cache_flag switch detemines if caching is allowed for the source
230
+ # if($this->cache_flag == true) {
231
+ #
232
+ # if ( $this_search_genus != '' && $this_search_species != '' && $this_authority != '' ) {
233
+ # $cache_key = $this_search_genus . '-' . $this_search_species . '-' . $this_authority . '_' . $search_mode;
234
+ # $cache_path = $this->cache_path . $this->db->source . "/authority/";
235
+ # } else if ( $this_search_genus != '' && $this_search_species != '' ) {
236
+ # $cache_key = $this_search_genus . '-' . $this_search_species . '_' . $search_mode;
237
+ # $cache_path = $this->cache_path . $this->db->source . "/species/";
238
+ # } else if ( $this_search_genus != '' ) {
239
+ # $cache_key = $this_search_genus . '_' . $search_mode;
240
+ # $cache_path = $this->cache_path . $this->db->source . "/genus/";
241
+ # }
242
+ #
243
+ # $this->mkdir_recursive($cache_path);
244
+ # $this->_cache = new Cache( $cache_path );
245
+ # $this->_cache->setKey($cache_key);
246
+ #
247
+ # }
248
+ #
249
+ # $cache_loop_flag = false;
250
+ # if($cache == true && $this->cache_flag == true) {
251
+ # if($this->_cache->cache_exists()) $cache_loop_flag = true;
252
+ # }
253
+ #
254
+ # if(!$cache_loop_flag) {
255
+ #
256
+ # $this->debug['process'][] = "3a (this_search_genus:$this_search_genus) (this_search_species:$this_search_species) (this_authority:$this_authority)";
257
+ #
258
+ # $nm = new NearMatch();
259
+ # $this_near_match_genus = $nm->near_match($this_search_genus);
260
+ #
261
+ # $this->debug['process'][] = "3b (this_near_match_genus:$this_near_match_genus)";
262
+ # //TODO refactor inside of a method
263
+ # $this_genus_start = substr($this_search_genus,0,3);
264
+ # $this_genus_end = substr($this_search_genus,-3);
265
+ # $this_genus_length = strlen($this_search_genus);
266
+ # //TODO_END
267
+ # $this->debug['process'][] = "3c (this_search_genus,$this_search_genus) (this_genus_start:$this_genus_start) (this_genus_end:$this_genus_end) (this_genus_length:$this_genus_length)";
268
+ #
269
+ # if ($this_search_species != '') {
270
+ # $this_near_match_species = $nm->near_match($this_search_species, 'epithet_only');
271
+ # $this_species_length = strlen($this_search_species);
272
+ # $this->debug['process'][] = "4 (this_search_species:$this_search_species) (this_near_match_species:$this_near_match_species) (this_species_length:$this_species_length)";
273
+ # }
274
+ #
275
+ #
276
+ # // now look for exact or near matches on genus first select candidate genera for edit distance (MDLD) test
277
+ #
278
+ # // for drec in genus_cur loop -- includes the genus pre-filter (main portion)
279
+ # $genus_res = $this->db->genus_cur($this->search_mode, $this_near_match_genus, $this_near_match_species, $this_genus_length,$this_genus_start,$this_genus_end);
280
+ #
281
+ # $this->debug['process'][] = "5 (genus_res:$genus_res)";
282
+ #
283
+ # if(count($genus_res)) {
284
+ # foreach($genus_res as $drec) {
285
+ # // test candidate genera for edit distance, keep if satisfies post-test criteria
286
+ # $this->genera_tested++;
287
+ # // do the genus edit distance test
288
+ #
289
+ # $genus_match = $this->match_genera($this_search_genus, $drec->search_genus_name);
290
+ # if ($genus_match['match']) {
291
+ # $phonetic_flag = $genus_match['phonetic_match'] ? 'Y' : null;
292
+ # $this->db->saveGenusMatches($drec->genus_id, $drec->genus, $genus_match['edit_distance'], $phonetic_flag);
293
+ #
294
+ # if ( ($this_search_species != null) && ($this_search_species != '') ) {
295
+ # $species_res = $this->db->species_cur($drec->genus_id, $this_species_length );
296
+ #
297
+ # if(count($species_res)) {
298
+ # foreach($species_res as $drec1) {
299
+ # $this->species_tested++;
300
+ #
301
+ # // do the species edit distance test
302
+ # $species_epithets_match = $this->match_species_epithets($this_search_species, $drec1->search_species_name);
303
+ # $binomials_match = $this->match_binomials($genus_match, $species_epithets_match);
304
+ # if ($binomials_match['match']) {
305
+ #
306
+ # $bionial_phonetic_flag = $binomials_match['phonetic_match'] ? 'Y' : null;
307
+ # $this->db->saveSpeciesMatches($drec1->species_id, $drec1->genus_species, $genus_match['edit_distance'], $temp_species_ED, $binomials_match['edit_distance'], $bionial_phonetic_flag);
308
+ # } //
309
+ # } // End foreach species_res
310
+ # } // End If elements exist for species_res
311
+ # } // End Search Species Exist
312
+ # }
313
+ # }
314
+ # }
315
+ # } // End Cache Loop Flag
316
+ # return true;
317
+ # }
318
+ #
319
+ # /**
320
+ # * generateResponse
321
+ # * Result generation section (including ranking, result shaping,
322
+ # * and authority comparison) - for demo purposes only
323
+ # * NB, in a production system this would be replaced by something
324
+ # * more appropriate, e.g. write to a file or database table,
325
+ # * generate a HTML page for web display,
326
+ # * generate XML response, etc. etc.
327
+ # * @param boolean $cache
328
+ # * @return boolean
329
+ # */
330
+ # public function generateResponse($cache) {
331
+ #
332
+ # $cache_loop_flag = false;
333
+ # if($cache == true && $this->cache_flag == true) {
334
+ # if($this->_cache->cache_exists()) $cache_loop_flag = true;
335
+ # }
336
+ #
337
+ # // if($cache == true && $this->_cache->cache_exists() && $this->cache_flag == true) {
338
+ # if($cache_loop_flag) {
339
+ #
340
+ # $this->data = $this->_cache->fetch();
341
+ # $data_array = json_decode($this->data,true);
342
+ # $data_array['cache'] = $cache;
343
+ # $this->data = json_encode($data_array);
344
+ #
345
+ # } else {
346
+ #
347
+ # // genus exact, phonetic, and other near matches
348
+ # $this->output['input'] = $this->searchtxt;
349
+ # $this->debug['generateResponse'][] = "1 (input:" . $this->searchtxt . ")";
350
+ #
351
+ # // Genus Exact
352
+ # $this->debug['generateResponse'][] = "1a (getGenusAuthority:exact)";
353
+ # $this->getGenusAuthority(0,'exact');
354
+ # // Genus Phonetic
355
+ # $this->debug['generateResponse'][] = "1b (getGenusAuthority:phonetic)";
356
+ # $this->getGenusAuthority('P','phonetic');
357
+ # // Genus near matches
358
+ # $this->debug['generateResponse'][] = "1c (getGenusAuthority:near_1)";
359
+ # $this->getGenusAuthority(1,'near_1');
360
+ # $this->debug['generateResponse'][] = "1d (getGenusAuthority:near_2)";
361
+ # $this->getGenusAuthority(2,'near_2');
362
+ #
363
+ # if(!is_array($this->output['genus']) && $this->this_search_genus != '') {$this->output['genus'] = array();}
364
+ #
365
+ # if ( !is_null($this->this_search_species) ) {
366
+ # // species exact, phonetic, and other near matches
367
+ #
368
+ # $this->debug['generateResponse'][] = "2a (getSpeciesAuthority:exact) ($this->this_authority)";
369
+ # $this->getSpeciesAuthority( 0, 'exact', $this->this_authority );
370
+ # $this->debug['generateResponse'][] = "2b (getSpeciesAuthority:phonetic) ($this->this_authority)";
371
+ # $this->getSpeciesAuthority( 'P', 'phonetic', $this->this_authority );
372
+ # $this->debug['generateResponse'][] = "2c (getSpeciesAuthority:near_1) ($this->this_authority)";
373
+ # $this->getSpeciesAuthority( 1, 'near_1', $this->this_authority );
374
+ # $this->debug['generateResponse'][] = "2d (getSpeciesAuthority:near_2) ($this->this_authority)";
375
+ # $this->getSpeciesAuthority( 2, 'near_2', $this->this_authority );
376
+ #
377
+ # // -- Here is the result shaping section (only show ED 3 if no ED 1,2 or phonetic matches, only
378
+ # // -- show ED 4 if no ED 1,2,3 or phonetic matches). By default shaping is on, unless disabled
379
+ # // -- via the input parameter "search_mode" set to 'no_shaping'.
380
+ # // -- In this demo we supplement any actual shaping with a message to show that it has been invoked,
381
+ # // -- to show the system operates correctly.
382
+ # if ($this->species_found == 'Y') {
383
+ # $temp_species_count = $this->db->countSpeciesMatches(3);
384
+ # $this->debug['generateResponse'][] = "3 (temp_species_count:$temp_species_count)";
385
+ # }
386
+ #
387
+ # if( $temp_species_count > 0 && $this->search_mode == 'no_shaping' ) {
388
+ # $this->debug['generateResponse'][] = "4 (getSpeciesAuthority:near_3) ($this->this_authority)";
389
+ # $this->getSpeciesAuthority( 3, 'near_3', $this->this_authority );
390
+ #
391
+ # if( $this->species_found == 'Y' ) {
392
+ # $temp_species_count = $this->db->countSpeciesMatches(4);
393
+ # }
394
+ #
395
+ # if( $temp_species_count > 0 && $this->search_mode == 'no_shaping') {
396
+ # $this->debug['generateResponse'][] = "4 (getSpeciesAuthority:near_4) ($this->this_authority)";
397
+ # $this->getSpeciesAuthority( 4, 'near_4', $this->this_authority );
398
+ # }
399
+ # } // END temp_species_count > 0 and "no_shaping"
400
+ #
401
+ # } // END If this_search_species
402
+ #
403
+ # if(!is_array($this->output['species']) && $this->this_search_species != '') {$this->output['species'] = array();}
404
+ # if($this->output_type == 'rest') {
405
+ # if($this->debug_flag) {
406
+ # $this->data = json_encode( array( 'success' => true, 'cache' => $cache, 'data' => $this->output, 'debug' => $this->debug ) );
407
+ # } else {
408
+ # $this->data = json_encode( array( 'success' => true, 'cache' => $cache, 'data' => $this->output));
409
+ # }
410
+ # } else {
411
+ # $this->data = $this->output;
412
+ # }
413
+ #
414
+ # if($this->cache_flag == true) {
415
+ # if( ! $this->_cache->cache_exists()) {
416
+ # if($this->debug_flag) {
417
+ # $op_array = array (
418
+ # 'success' => true
419
+ # , 'cache_date' => date('Y-m-d')
420
+ # , 'data' => $this->output
421
+ # , 'debug' => $this->debug
422
+ # );
423
+ # } else {
424
+ # $op_array = array (
425
+ # 'success' => true
426
+ # , 'cache_date' => date('Y-m-d')
427
+ # , 'data' => $this->output
428
+ # );
429
+ #
430
+ # }
431
+ # $op = json_encode($op_array);
432
+ # $this->_cache->update($op);
433
+ # $tmp_cache_key = $this->_cache->getKey();
434
+ # $this->_cache->setKey($tmp_cache_key . '_debug');
435
+ # $dbg = @json_encode($this->debug);
436
+ # $this->_cache->update($dbg);
437
+ # $this->_cache->setKey($tmp_cache_key);
438
+ # }
439
+ # }
440
+ # }
441
+ #
442
+ # return true;
443
+ #
444
+ # }
@@ -0,0 +1,58 @@
1
+ ######################
2
+ # Tests for modified Damerau Levenshtein Distance algorithm (UTF-8 compatible)
3
+ #
4
+ # * B. Boehmer, T. Rees, Modified Damerau-Levenshtein Distance, Boehmer & Rees 2008
5
+ # * F.J. Damerau. A technique for computer detection and correction of spelling errors, Communications of the ACM, 1964
6
+ #
7
+ # Fields:
8
+ # String1|String2|maximum distance|transposition block size|expected distance
9
+ # - String1, String2
10
+ # compared strings
11
+ # - maximum distance
12
+ # stops execution of the algorithm when calculated distance exceeds the maximum distance number
13
+ # - transposition block size
14
+ # determines how many characters can be transposed. Block size 1 returns score according to Damerau-Levenshtein algorithm
15
+ # - expected distance
16
+ # resulting distance that has to be achieved by the algorithm
17
+ # Note: algorithm does not try to normalize or interpret strings in any way.
18
+ ######################
19
+
20
+ #it should recognize the exact match
21
+ Pomatomus|Pomatomus|10|1|0
22
+
23
+ #it should not try to normalize incoming strings
24
+ Pomatomus|Pomatomus|10|1|1
25
+ Pomatomus|pomatomus|10|1|1
26
+
27
+ #it should calculate special cases
28
+ Pomatomus||10|1|9
29
+ |Pomatomus|10|1|9
30
+ P|p|10|1|1
31
+
32
+
33
+ #it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
34
+ Pomatomus|Pomatomux|10|1|1
35
+ Pmatomus|Pomatomus|10|1|1
36
+ Pomatomus|Pmatomus|10|1|1
37
+ Rpmatomus|Pomatomus|10|1|2
38
+ Pommtomus|Pomatomus|10|1|1
39
+ Potamomus|Pomatomus|10|1|2
40
+ Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
41
+ Pomatomus|oPmatomus|10|1|1
42
+ Pomatomus|Pomatomsu|10|1|1
43
+ Pomtaomus|Pomatomus|10|1|1
44
+ Pomatoums|Pomatomus|10|1|1
45
+ Potamomus|Pomatomus|10|1|2
46
+ Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|2|1
47
+
48
+ #it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size > 2)
49
+ serrulatus|serratulus|10|2|2
50
+ Pomatomus|Poomumats|10|3|3
51
+ vesiculosus|vecusilosus|10|1|4
52
+ vesiculosus|vecusilosus|10|2|2
53
+ trimerophyton|mertriophyton|10|1|6
54
+ trimerophyton|mertriophyton|10|3|3
55
+
56
+ #it should stop trying if distance exceeds maximum allowed distance
57
+ Pxxxxomus|Pomatomus|10|1|4
58
+ Pxxxxomus|Pomatomus|2|1|null
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --colour