dimus-taxamatch_rb 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.1.2
data/lib/taxamatch_rb.rb CHANGED
@@ -10,435 +10,68 @@ class Taxamatch
10
10
 
11
11
  def initialize
12
12
  @parser = Parser.new
13
+ @dlm = DamerauLevenshteinMod.new
14
+ end
15
+
16
+
17
# Compares two scientific name strings.
#
# Both strings are run through @parser.parse and the parsed results are
# handed to taxamatch_parsed_data.
#
# str1, str2 - scientific names as strings.
#
# Returns the :match value of the comparison result, or false when
# taxamatch_parsed_data reports nothing comparable.
def taxamatch(str1, str2)
  parsed_data_1 = @parser.parse(str1)
  parsed_data_2 = @parser.parse(str2)
  result = taxamatch_parsed_data(parsed_data_1, parsed_data_2)
  # taxamatch_parsed_data answers with a bare false (not a hash) when the two
  # names cannot be compared; indexing that with [:match] would raise.
  result ? result[:match] : false
end
23
+
24
# Compares two already-parsed scientific names; useful when the name strings
# have been parsed beforehand.
#
# parsed_data_1, parsed_data_2 - hashes of parsed name data (or nil).
#
# Returns the result of match_uninomial when both hashes carry :unicode, the
# result of match_multinomial when both carry :genus, and false otherwise.
#
# NOTE(review): the :unicode key looks like it may be a slip for a
# uninomial key -- confirm against what the parser actually produces.
def taxamatch_parsed_data(parsed_data_1, parsed_data_2)
  # A missing parse result (nil) cannot match anything; without this guard the
  # key lookups below would raise NoMethodError.
  return false unless parsed_data_1 && parsed_data_2

  if parsed_data_1[:unicode] && parsed_data_2[:unicode]
    match_uninomial(parsed_data_1, parsed_data_2)
  elsif parsed_data_1[:genus] && parsed_data_2[:genus]
    match_multinomial(parsed_data_1, parsed_data_2)
  else
    false
  end
end
31
+
32
# Placeholder for comparing two parsed uninomial names: no comparison is
# performed and the answer is always false.
def match_uninomial(parsed_data_1, parsed_data_2)
  false
end
14
- #
15
- #
16
- # #takes two scientific names and returns true if names matched and false if they did not
17
- # def taxamatch(str1, str2) {
18
- # taxa1 = @parser.parse_prepare(str1)
19
- # taxa2 = @parser.parse_prepare(str2)
20
- #
21
- # full_taxamatch(taxa1, taxa2)[:mached]
22
- # }
23
- #
24
- # #takes two hashes of parsed scientific names, analyses them and returns back
25
- # #this function is useful when species strings are preparsed.
26
- # def full_taxamatch
27
- # end
28
35
 
29
- end
36
# Compares two parsed names that both have a genus and a species part.
#
# parsed_data_1, parsed_data_2 - hashes with :genus and :species entries. Each
#   entry is itself a hash; its :epitheton string is measured here and the
#   whole entry is passed on to match_genera / match_species.
#
# Returns the hash produced by match_matches with an added :score, computed as
# 1 - edit_distance / (half of the summed lengths of the four epithets).
def match_multinomial(parsed_data_1, parsed_data_2)
  genus_1 = parsed_data_1[:genus]
  genus_2 = parsed_data_2[:genus]
  species_1 = parsed_data_1[:species]
  species_2 = parsed_data_2[:species]

  gen_match = match_genera(genus_1, genus_2)
  sp_match = match_species(species_1, species_2)
  total_length = [genus_1, genus_2, species_1, species_2].inject(0) do |sum, part|
    sum + part[:epitheton].size
  end
  match = match_matches(gen_match, sp_match)

  # Use float arithmetic: with integers both divisions truncate, so the score
  # collapses to whole numbers (e.g. 1 - 1/12 gives 1 instead of 0.9166...).
  # With no characters at all there is nothing to compare; report 0.0 rather
  # than dividing by zero.
  score = total_length.zero? ? 0.0 : 1 - match[:edit_distance] / (total_length / 2.0)
  match.merge(:score => score)
end
43
+
44
# Compares two parsed genus entries.
#
# genus1, genus2 - hashes with :normalized and :phonetized strings.
#
# Returns a hash with :edit_distance (from @dlm.distance with block size 2 and
# maximum distance 3), :phonetic_match and :match. Equal :phonetized values
# always count as a match. Otherwise the names match only when the edit
# distance is at most 3, the shorter name is longer than twice the distance,
# and -- for distances of 2 or more -- both names start with the same
# character.
def match_genera(genus1, genus2)
  name1 = genus1[:normalized]
  name2 = genus2[:normalized]
  ed = @dlm.distance(name1, name2, 2, 3)
  if genus1[:phonetized] == genus2[:phonetized]
    return {:edit_distance => ed, :phonetic_match => true, :match => true}
  end

  shortest = [name1.size, name2.size].min
  # The first characters must be taken from the normalized strings; indexing
  # the genus hashes themselves with 0 yields nil on both sides and would make
  # the first-letter rule always pass.
  same_start = name1[0, 1] == name2[0, 1]
  match = ed <= 3 && shortest > ed * 2 && (ed < 2 || same_start)
  {:edit_distance => ed, :match => match, :phonetic_match => false}
end
30
54
 
55
# Compares two parsed species-epithet entries.
#
# sp1, sp2 - hashes with :normalized and :phonetized strings. The hashes are
#   only read; the ending-normalized phonetic forms are kept in locals so the
#   caller's data is not overwritten.
#
# Returns a hash with :edit_distance (from @dlm.distance with block size 4 and
# maximum distance 4), :phonetic_match and :match. Equal phonetic forms (after
# Phonetizer.normalize_ending) always count as a match. Otherwise the epithets
# match only when the edit distance is at most 4, the shorter epithet is at
# least twice the distance long, the first character agrees for distances of
# 2 or more, and the first three characters agree for distance 4.
def match_species(sp1, sp2)
  name1 = sp1[:normalized]
  name2 = sp2[:normalized]
  phonetic1 = Phonetizer.normalize_ending(sp1[:phonetized])
  phonetic2 = Phonetizer.normalize_ending(sp2[:phonetized])
  ed = @dlm.distance(name1, name2, 4, 4)
  if phonetic1 == phonetic2
    return {:edit_distance => ed, :phonetic_match => true, :match => true}
  end

  shortest = [name1.size, name2.size].min
  match = ed <= 4 && shortest >= ed * 2 &&
          (ed < 2 || name1[0, 1] == name2[0, 1]) &&
          (ed < 4 || name1[0...3] == name2[0...3])
  {:edit_distance => ed, :match => match, :phonetic_match => false}
end
67
+
68
# Combines a genus comparison and a species comparison into one result.
#
# genus_match, species_match - hashes with :edit_distance, :match and
#   :phonetic_match, as returned by match_genera and match_species.
# infraspecies_matches - accepted for interface compatibility; not used.
#
# Returns a new hash based on species_match in which :edit_distance is the sum
# of both distances, :match holds only if both parts match and the summed
# distance is at most 4, and :phonetic_match holds only if both parts match
# phonetically. Neither argument is modified.
def match_matches(genus_match, species_match, infraspecies_matches = [])
  edit_distance = species_match[:edit_distance] + genus_match[:edit_distance]
  matched = species_match[:match] && edit_distance <= 4 && genus_match[:match]
  phonetic = species_match[:phonetic_match] && genus_match[:phonetic_match]
  # merge builds a fresh hash, so the caller's species_match keeps its values.
  species_match.merge(:edit_distance => edit_distance,
                      :match => matched,
                      :phonetic_match => phonetic)
end
31
76
 
32
- #
33
- # public function name_strings_match($name_string1, $name_string2) {
34
- # $info_1 = new Splitter(null,$name_string1);
35
- # $info_2 = new Splitter(null,$name_string2);
36
- # return $this->name_objects_match($info_1, $info_2);
37
- # }
38
- #
39
- # public function name_objects_match($name_object_1, $name_object_2) {
40
- # $genus_match = $this->match_genera($name_object_1->genus, $name_object_2->genus);
41
- # $epithets_match = $this->match_species_epithets($name_object_1->species, $name_object_2->species);
42
- # $total_length = strlen($name_object_1->genus) + strlen($name_object_1->species) + strlen($name_object_2->genus) + strlen($name_object_2->species);
43
- # $match = $this->match_binomials($genus_match, $epithets_match);
44
- # return $this->match_response_to_float($match, $total_length);
45
- # }
46
- #
47
- # public function match_response_to_float($match_response, $total_length_of_strings) {
48
- # if(!$match_response['match']) return 0.0;
49
- #
50
- # return (1 - ($match_response['edit_distance'] / ($total_length_of_strings/2)));
51
- # }
52
- #
53
- # public function match_genera($genus1, $genus2) {
54
- # $match = $phonetic_match = false;
55
- # $nm = new NearMatch();
56
- # $genus1_phonetic = $nm->near_match($genus1);
57
- # $genus2_phonetic = $nm->near_match($genus2);
58
- # $genus1_length = strlen($genus1);
59
- #
60
- # $temp_genus_ED = $this->mdld($genus2, $genus1, 2);
61
- # // add the genus post-filter
62
- # // min. 51% "good" chars
63
- # // first char must match for ED 2+
64
- # if( ($temp_genus_ED <= 3 && ( min( strlen( $genus2 ), $genus1_length ) > ( $temp_genus_ED * 2 ))
65
- # && ( $temp_genus_ED < 2 || ( substr($genus2,0,1) == substr($genus1,0,1) ) ) )
66
- # || ($genus1_phonetic == $genus2_phonetic) ) {
67
- # $match = true;
68
- # // accept as exact or near match; append to genus results table
69
- # $this->debug['process'][] = "6a (near_match_genus:$genus2_phonetic) (this_near_match_genus:$genus1_phonetic)";
70
- #
71
- # if($genus1_phonetic == $genus2_phonetic) $phonetic_match = true;
72
- # }
73
- # return array(
74
- # 'match' => $match,
75
- # 'phonetic_match' => $phonetic_match,
76
- # 'edit_distance' => $temp_genus_ED);
77
- # }
78
- #
79
- # public function match_species_epithets($species_epithet1, $species_epithet2) {
80
- # $match = false;
81
- # $phonetic_match = false;
82
- # $epithet1_length = strlen($species_epithet1);
83
- # $epithet2_length = strlen($species_epithet2);
84
- #
85
- # $nm = new NearMatch();
86
- # $epithet1_phonetic = $nm->near_match($species_epithet1);
87
- # $epithet2_phonetic = $nm->near_match($species_epithet2);
88
- # $temp_species_ED = $this->mdld($species_epithet2, $species_epithet1, 4);
89
- # // add the species post-filter
90
- # // min. 50% "good" chars
91
- # // first char must match for ED2+
92
- # // first 3 chars must match for ED4
93
- # if ($epithet2_phonetic == $epithet1_phonetic) $match = true;
94
- # elseif( ($temp_species_ED <= 4 && min($epithet2_length, $epithet1_length) >= ($temp_species_ED*2)
95
- # && ($temp_species_ED < 2 || strpos($species_epithet2 , substr($species_epithet1,0,1)) !== false)
96
- # && ($temp_species_ED < 4 || strpos($species_epithet2 , substr($species_epithet1,0,3)) !== false))) $match = true;
97
- #
98
- # // if phonetic match, set relevant flag
99
- # if ($epithet2_phonetic == $epithet1_phonetic) $phonetic_match = true;
100
- #
101
- # return array('match' => $match, 'phonetic_match' => $phonetic_match, 'edit_distance' => $temp_species_ED);
102
- # }
103
- #
104
- #
105
- # public function match_binomials($genus_match, $species_epithets_match) {
106
- # $binomial_match = $species_epithets_match;
107
- # $binomial_match['edit_distance'] = $genus_match["edit_distance"] + $species_epithets_match["edit_distance"];
108
- #
109
- # if(!$genus_match['match']) $binomial_match['match'] = false;
110
- # if($binomial_match["edit_distance"] > 4) $binomial_match['match'] = false;
111
- # if(!$genus_match['phonetic_match']) $binomial_match['phonetic_match'] = false;
112
- #
113
- #
114
- # return $binomial_match;
115
- # }
116
- #
117
- # // public function match_species($genus1, $species_epithet1, $genus2, $species_epithet2, $genus_edit_distance) {
118
- # // $match = false;
119
- # // $phonetic_match = false;
120
- # // $epithet1_length = strlen($species_epithet1);
121
- # // $epithet2_length = strlen($species_epithet2);
122
- # //
123
- # // $nm = new NearMatch();
124
- # // $genus1_phonetic = $nm->near_match(genus1);
125
- # // $genus2_phonetic = $nm->near_match(genus2);
126
- # // $epithet1_phonetic = $nm->near_match($species_epithet1);
127
- # // $epithet2_phonetic = $nm->near_match($species_epithet2);
128
- # // $temp_species_ED = $this->mdld($species2, $species1, 4);
129
- # // // add the species post-filter
130
- # // // min. 50% "good" chars
131
- # // // first char must match for ED2+
132
- # // // first 3 chars must match for ED4
133
- # // if ( ($epithet2_phonetic == $epithet1_phonetic)
134
- # // || ( ($genus_edit_distance + $temp_species_ED <= 4)
135
- # // && ($temp_species_ED <= 4 && min(strlen($epithet2_length),$epithet1_length) >= ($temp_species_ED*2)
136
- # // && ($temp_species_ED < 2 || strpos($species_epithet2 , substr($species_epithet1,1,1)) !== false)
137
- # // && ($temp_species_ED < 4 || strpos($species_epithet2 , substr($species_epithet1,1,3)) !== false)
138
- # // && ($genus_edit_distance + $temp_species_ED <= 4) ))) {
139
- # // $match = true;
140
- # // // accept as exact or near match, append to species results table
141
- # // // if phonetic match, set relevant flag
142
- # // if ( ($genus2_phonetic == $genus1_phonetic) && ($epithet2_phonetic == $epithet1_phonetic) ) $phonetic_match = true;
143
- # // }
144
- # // return array('match' => $match, 'phonetic_match' => $phonetic_match, 'edit_distance' => $temp_species_ED);
145
- # // }
146
- #
147
- # /**
148
- # * Function : process
149
- # * Purpose: Perform exact and fuzzy matching on a species name, or single genus name
150
- # * Input: - genus, genus+species, or genus+species+authority (in this version), as "searchtxt"
151
- # * - "search_mode" to control search mode: currently normal (default) / rapid / no_shaping
152
- # * - "debug" - print internal parameters used if not null
153
- # * Outputs: list of genera and species that match (or near match) input terms, with associated
154
- # * ancillary info as desired
155
- # * Remarks:
156
- # * (1) This demo version is configured to access base data in three tables:
157
- # * - genlist_test1 (genus info); primary key (PK) is genus_id
158
- # * - splist_test1 (species info); PK is species_id, has genus_id as foreign key (FK)
159
- # * (= link to relevant row in genus table)
160
- # * - auth_abbrev_test1 (authority abbreviations - required by subsidiary function
161
- # * "normalize_auth". Refer README file for relevant minimum table definitions.
162
- # * If authority comparisons are not required, calls to "normalize_auth" can be disabled and
163
- # * relevant function commented out, removing need for third table.
164
- # * (In a production system, table and column names can be varied as desired so long as
165
- # * code is altered at relevant points, also could be re-configured to hold all genus+species info together in a single table with minor re-write).
166
- # * (2) Writes to and reads back from pre-defined global temporary tables
167
- # * "genus_id_matches" and "species_id_matches", new instances of these are automatically
168
- # * created for each session (i.e., do not need clearing at procedure end). Refer
169
- # * README file for relevant table definitions.
170
- # * (3) When result shaping is on in this version, a relevant message displayed as required
171
- # * for developer feedback, if more distant results are being masked (in producton version,
172
- # * possibly would not do this)
173
- # * (4) Requires the following subsidiary functions (supplied elsewhere in this package):
174
- # * - normalize
175
- # * - normalize_auth
176
- # * - reduce_spaces
177
- # * - ngram
178
- # * - compare_auth
179
- # * - near_match
180
- # * - mdld
181
- # * (5) Accepts "+" as input separator in place of space (e.g. "Homo+sapiens"), e.g. for calling
182
- # * via a HTTP GET request as needed.
183
- # * @param string $searchtxt : genus, genus+species, or genus+species+authority
184
- # * @param string $search_mode : normal (default) / rapid / no_shaping
185
- # * @param boolean $cache
186
- # * @return boolean
187
- # */
188
- # public function process($searchtxt, $search_mode='normal', $cache = false) {
189
- #
190
- # $this->input = $searchtxt;
191
- #
192
- # $this->debug['process'][] = "1 (searchtxt:$searchtxt) (search_mode:$search_mode)";
193
- # $this->searchtxt = $searchtxt;
194
- # $this->search_mode=$search_mode;
195
- #
196
- # // accept "+" as separator if supplied, tranform to space
197
- # if ( strpos($this->searchtxt,'+') !== false ) {
198
- # $text_str = str_replace('+',' ',$this->searchtxt);
199
- # } else {
200
- # $text_str = $this->searchtxt;
201
- # }
202
- #
203
- # $this->debug['process'][] = "1a (text_str:$text_str)";
204
- #
205
- # if ( is_null($text_str) || $text_str == '' ) {
206
- # $this->debug['process'][] = "2 Return(false)";
207
- # return false;
208
- # }
209
- #
210
- # // Clearing the temporary tables
211
- # $this->db->clearTempTables();
212
- #
213
- # // includes stripping of presumed non-relevant content including subgenera, comments, cf's, aff's, etc... to
214
- #
215
- # // Normalizing the search text
216
- # $n = new Normalize($this->db);
217
- #
218
- # $this->debug['process'][] = "3 (text_str:$text_str)";
219
- #
220
- # if(!$this->chop_overload) {
221
- # // leave presumed genus + species + authority (in this instance), with genus and species in uppercase
222
- # $splitter = new Splitter($n,$text_str);
223
- #
224
- # $this->this_search_genus = $this_search_genus = $splitter->get('genus');
225
- # $this->this_search_species = $this_search_species = $splitter->get('species');
226
- # $this->this_authority = $this_authority = $splitter->get('author');
227
- # }
228
- #
229
- # // cache_flag switch detemines if caching is allowed for the source
230
- # if($this->cache_flag == true) {
231
- #
232
- # if ( $this_search_genus != '' && $this_search_species != '' && $this_authority != '' ) {
233
- # $cache_key = $this_search_genus . '-' . $this_search_species . '-' . $this_authority . '_' . $search_mode;
234
- # $cache_path = $this->cache_path . $this->db->source . "/authority/";
235
- # } else if ( $this_search_genus != '' && $this_search_species != '' ) {
236
- # $cache_key = $this_search_genus . '-' . $this_search_species . '_' . $search_mode;
237
- # $cache_path = $this->cache_path . $this->db->source . "/species/";
238
- # } else if ( $this_search_genus != '' ) {
239
- # $cache_key = $this_search_genus . '_' . $search_mode;
240
- # $cache_path = $this->cache_path . $this->db->source . "/genus/";
241
- # }
242
- #
243
- # $this->mkdir_recursive($cache_path);
244
- # $this->_cache = new Cache( $cache_path );
245
- # $this->_cache->setKey($cache_key);
246
- #
247
- # }
248
- #
249
- # $cache_loop_flag = false;
250
- # if($cache == true && $this->cache_flag == true) {
251
- # if($this->_cache->cache_exists()) $cache_loop_flag = true;
252
- # }
253
- #
254
- # if(!$cache_loop_flag) {
255
- #
256
- # $this->debug['process'][] = "3a (this_search_genus:$this_search_genus) (this_search_species:$this_search_species) (this_authority:$this_authority)";
257
- #
258
- # $nm = new NearMatch();
259
- # $this_near_match_genus = $nm->near_match($this_search_genus);
260
- #
261
- # $this->debug['process'][] = "3b (this_near_match_genus:$this_near_match_genus)";
262
- # //TODO refactor inside of a method
263
- # $this_genus_start = substr($this_search_genus,0,3);
264
- # $this_genus_end = substr($this_search_genus,-3);
265
- # $this_genus_length = strlen($this_search_genus);
266
- # //TODO_END
267
- # $this->debug['process'][] = "3c (this_search_genus,$this_search_genus) (this_genus_start:$this_genus_start) (this_genus_end:$this_genus_end) (this_genus_length:$this_genus_length)";
268
- #
269
- # if ($this_search_species != '') {
270
- # $this_near_match_species = $nm->near_match($this_search_species, 'epithet_only');
271
- # $this_species_length = strlen($this_search_species);
272
- # $this->debug['process'][] = "4 (this_search_species:$this_search_species) (this_near_match_species:$this_near_match_species) (this_species_length:$this_species_length)";
273
- # }
274
- #
275
- #
276
- # // now look for exact or near matches on genus first select candidate genera for edit distance (MDLD) test
277
- #
278
- # // for drec in genus_cur loop -- includes the genus pre-filter (main portion)
279
- # $genus_res = $this->db->genus_cur($this->search_mode, $this_near_match_genus, $this_near_match_species, $this_genus_length,$this_genus_start,$this_genus_end);
280
- #
281
- # $this->debug['process'][] = "5 (genus_res:$genus_res)";
282
- #
283
- # if(count($genus_res)) {
284
- # foreach($genus_res as $drec) {
285
- # // test candidate genera for edit distance, keep if satisfies post-test criteria
286
- # $this->genera_tested++;
287
- # // do the genus edit distance test
288
- #
289
- # $genus_match = $this->match_genera($this_search_genus, $drec->search_genus_name);
290
- # if ($genus_match['match']) {
291
- # $phonetic_flag = $genus_match['phonetic_match'] ? 'Y' : null;
292
- # $this->db->saveGenusMatches($drec->genus_id, $drec->genus, $genus_match['edit_distance'], $phonetic_flag);
293
- #
294
- # if ( ($this_search_species != null) && ($this_search_species != '') ) {
295
- # $species_res = $this->db->species_cur($drec->genus_id, $this_species_length );
296
- #
297
- # if(count($species_res)) {
298
- # foreach($species_res as $drec1) {
299
- # $this->species_tested++;
300
- #
301
- # // do the species edit distance test
302
- # $species_epithets_match = $this->match_species_epithets($this_search_species, $drec1->search_species_name);
303
- # $binomials_match = $this->match_binomials($genus_match, $species_epithets_match);
304
- # if ($binomials_match['match']) {
305
- #
306
- # $bionial_phonetic_flag = $binomials_match['phonetic_match'] ? 'Y' : null;
307
- # $this->db->saveSpeciesMatches($drec1->species_id, $drec1->genus_species, $genus_match['edit_distance'], $temp_species_ED, $binomials_match['edit_distance'], $bionial_phonetic_flag);
308
- # } //
309
- # } // End foreach species_res
310
- # } // End If elements exist for species_res
311
- # } // End Search Species Exist
312
- # }
313
- # }
314
- # }
315
- # } // End Cache Loop Flag
316
- # return true;
317
- # }
318
- #
319
- # /**
320
- # * generateResponse
321
- # * Result generation section (including ranking, result shaping,
322
- # * and authority comparison) - for demo purposes only
323
- # * NB, in a production system this would be replaced by something
324
- # * more appropriate, e.g. write to a file or database table,
325
- # * generate a HTML page for web display,
326
- # * generate XML response, etc. etc.
327
- # * @param boolean $cache
328
- # * @return boolean
329
- # */
330
- # public function generateResponse($cache) {
331
- #
332
- # $cache_loop_flag = false;
333
- # if($cache == true && $this->cache_flag == true) {
334
- # if($this->_cache->cache_exists()) $cache_loop_flag = true;
335
- # }
336
- #
337
- # // if($cache == true && $this->_cache->cache_exists() && $this->cache_flag == true) {
338
- # if($cache_loop_flag) {
339
- #
340
- # $this->data = $this->_cache->fetch();
341
- # $data_array = json_decode($this->data,true);
342
- # $data_array['cache'] = $cache;
343
- # $this->data = json_encode($data_array);
344
- #
345
- # } else {
346
- #
347
- # // genus exact, phonetic, and other near matches
348
- # $this->output['input'] = $this->searchtxt;
349
- # $this->debug['generateResponse'][] = "1 (input:" . $this->searchtxt . ")";
350
- #
351
- # // Genus Exact
352
- # $this->debug['generateResponse'][] = "1a (getGenusAuthority:exact)";
353
- # $this->getGenusAuthority(0,'exact');
354
- # // Genus Phonetic
355
- # $this->debug['generateResponse'][] = "1b (getGenusAuthority:phonetic)";
356
- # $this->getGenusAuthority('P','phonetic');
357
- # // Genus near matches
358
- # $this->debug['generateResponse'][] = "1c (getGenusAuthority:near_1)";
359
- # $this->getGenusAuthority(1,'near_1');
360
- # $this->debug['generateResponse'][] = "1d (getGenusAuthority:near_2)";
361
- # $this->getGenusAuthority(2,'near_2');
362
- #
363
- # if(!is_array($this->output['genus']) && $this->this_search_genus != '') {$this->output['genus'] = array();}
364
- #
365
- # if ( !is_null($this->this_search_species) ) {
366
- # // species exact, phonetic, and other near matches
367
- #
368
- # $this->debug['generateResponse'][] = "2a (getSpeciesAuthority:exact) ($this->this_authority)";
369
- # $this->getSpeciesAuthority( 0, 'exact', $this->this_authority );
370
- # $this->debug['generateResponse'][] = "2b (getSpeciesAuthority:phonetic) ($this->this_authority)";
371
- # $this->getSpeciesAuthority( 'P', 'phonetic', $this->this_authority );
372
- # $this->debug['generateResponse'][] = "2c (getSpeciesAuthority:near_1) ($this->this_authority)";
373
- # $this->getSpeciesAuthority( 1, 'near_1', $this->this_authority );
374
- # $this->debug['generateResponse'][] = "2d (getSpeciesAuthority:near_2) ($this->this_authority)";
375
- # $this->getSpeciesAuthority( 2, 'near_2', $this->this_authority );
376
- #
377
- # // -- Here is the result shaping section (only show ED 3 if no ED 1,2 or phonetic matches, only
378
- # // -- show ED 4 if no ED 1,2,3 or phonetic matches). By default shaping is on, unless disabled
379
- # // -- via the input parameter "search_mode" set to 'no_shaping'.
380
- # // -- In this demo we supplement any actual shaping with a message to show that it has been invoked,
381
- # // -- to show the system operates correctly.
382
- # if ($this->species_found == 'Y') {
383
- # $temp_species_count = $this->db->countSpeciesMatches(3);
384
- # $this->debug['generateResponse'][] = "3 (temp_species_count:$temp_species_count)";
385
- # }
386
- #
387
- # if( $temp_species_count > 0 && $this->search_mode == 'no_shaping' ) {
388
- # $this->debug['generateResponse'][] = "4 (getSpeciesAuthority:near_3) ($this->this_authority)";
389
- # $this->getSpeciesAuthority( 3, 'near_3', $this->this_authority );
390
- #
391
- # if( $this->species_found == 'Y' ) {
392
- # $temp_species_count = $this->db->countSpeciesMatches(4);
393
- # }
394
- #
395
- # if( $temp_species_count > 0 && $this->search_mode == 'no_shaping') {
396
- # $this->debug['generateResponse'][] = "4 (getSpeciesAuthority:near_4) ($this->this_authority)";
397
- # $this->getSpeciesAuthority( 4, 'near_4', $this->this_authority );
398
- # }
399
- # } // END temp_species_count > 0 and "no_shaping"
400
- #
401
- # } // END If this_search_species
402
- #
403
- # if(!is_array($this->output['species']) && $this->this_search_species != '') {$this->output['species'] = array();}
404
- # if($this->output_type == 'rest') {
405
- # if($this->debug_flag) {
406
- # $this->data = json_encode( array( 'success' => true, 'cache' => $cache, 'data' => $this->output, 'debug' => $this->debug ) );
407
- # } else {
408
- # $this->data = json_encode( array( 'success' => true, 'cache' => $cache, 'data' => $this->output));
409
- # }
410
- # } else {
411
- # $this->data = $this->output;
412
- # }
413
- #
414
- # if($this->cache_flag == true) {
415
- # if( ! $this->_cache->cache_exists()) {
416
- # if($this->debug_flag) {
417
- # $op_array = array (
418
- # 'success' => true
419
- # , 'cache_date' => date('Y-m-d')
420
- # , 'data' => $this->output
421
- # , 'debug' => $this->debug
422
- # );
423
- # } else {
424
- # $op_array = array (
425
- # 'success' => true
426
- # , 'cache_date' => date('Y-m-d')
427
- # , 'data' => $this->output
428
- # );
429
- #
430
- # }
431
- # $op = json_encode($op_array);
432
- # $this->_cache->update($op);
433
- # $tmp_cache_key = $this->_cache->getKey();
434
- # $this->_cache->setKey($tmp_cache_key . '_debug');
435
- # $dbg = @json_encode($this->debug);
436
- # $this->_cache->update($dbg);
437
- # $this->_cache->setKey($tmp_cache_key);
438
- # }
439
- # }
440
- # }
441
- #
442
- # return true;
443
- #
444
- # }
77
+ end
File without changes
@@ -6,8 +6,7 @@ require 'time'
6
6
  class DamerauLevenshteinMod
7
7
  def distance(str1, str2, block_size=2, max_distance=10)
8
8
  # puts str1.unpack("U*");
9
- res = distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
10
- (res > max_distance) ? nil : res
9
+ distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
11
10
  end
12
11
 
13
12
  inline do |builder|
@@ -6,7 +6,7 @@ module Normalizer
6
6
  end
7
7
 
8
8
  def self.normalize_word(word)
9
- self.normalize(word).gsub(/[^A-Z\.\-]/, '')
9
+ self.normalize(word).gsub(/[^A-Z0-9\.\-]/, '')
10
10
  end
11
11
 
12
12
  protected
@@ -61,8 +61,6 @@ class Phonetizer
61
61
  a_word
62
62
  end
63
63
 
64
- protected
65
-
66
64
  def self.normalize_ending(a_word)
67
65
  # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
68
66
  # -- at the end of a string translate all to -a
@@ -55,4 +55,4 @@ trimerophyton|mertriophyton|10|3|3
55
55
 
56
56
  #it should stop trying if distance exceeds maximum allowed distance
57
57
  Pxxxxomus|Pomatomus|10|1|4
58
- Pxxxxomus|Pomatomus|2|1|null
58
+ Pxxxxomus|Pomatomus|2|1|3
data/spec/spec_helper.rb CHANGED
@@ -9,16 +9,20 @@ end
9
9
  $:.unshift(File.dirname(__FILE__) + '/../lib')
10
10
  require 'taxamatch_rb'
11
11
 
12
- def read_test_file
13
- f = open(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt')
12
+ def read_test_file(file, fields_num)
13
+ f = open(file)
14
14
  f.each do |line|
15
- str1, str2, max_dist, block_size, distance = line.split("|")
16
- if line.match(/^\s*#/) == nil && str1 && str2 && max_dist && block_size && distance
17
- distance = distance.split('#')[0].strip
18
- distance = (distance == 'null') ? nil : distance.to_i
19
- yield({:str1 => str1, :str2 => str2, :max_dist => max_dist.to_i, :block_size => block_size.to_i, :distance => distance})
15
+ fields = line.split("|")
16
+ if line.match(/^\s*#/) == nil && fields.size == fields_num
17
+ fields[-1] = fields[-1].split('#')[0].strip
18
+ yield(fields)
20
19
  else
21
- yield({:comment => line})
20
+ yield(nil)
22
21
  end
23
22
  end
23
+ end
24
+
25
# Spec helper: builds the hash shape consumed by the Taxamatch matchers from a
# plain string -- the string itself, its Normalizer.normalize form, and the
# Phonetizer.near_match form of that normalized string.
def make_taxamatch_hash(string)
  norm = Normalizer.normalize(string)
  phonetic = Phonetizer.near_match(norm)
  {:epitheton => string, :normalized => norm, :phonetized => phonetic}
end
@@ -3,11 +3,12 @@ require File.dirname(__FILE__) + '/spec_helper.rb'
3
3
 
4
4
  describe 'DamerauLevensteinMod' do
5
5
  it 'should get tests' do
6
- read_test_file do |y|
6
+ read_test_file(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt', 5) do |y|
7
7
  dl = DamerauLevenshteinMod.new
8
- unless y[:comment]
9
- # puts "%s, %s, %s" % [y[:str1], y[:str2], y[:distance]]
10
- dl.distance(y[:str1], y[:str2], y[:block_size], y[:max_dist]).should == y[:distance]
8
+ if y
9
+ res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
10
+ puts y if res != y[4].to_i
11
+ res.should == y[4].to_i
11
12
  end
12
13
  end
13
14
  end
@@ -45,6 +46,134 @@ describe 'Normalizer' do
45
46
  end
46
47
 
47
48
  it 'should normalize words' do
48
- Normalizer.normalize_word('Leœ|pt[ura$').should == 'LEOEPTURA'
49
+ Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
49
50
  end
50
- end
51
+ end
52
+
53
+ describe 'Taxamatch' do
54
+ before(:all) do
55
+ @tm = Taxamatch.new
56
+ end
57
+
58
+ it 'should get txt tests' do
59
+ dl = DamerauLevenshteinMod.new
60
+ read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 3) do |y|
61
+ if y
62
+ y[2] = y[2] == 'true' ? true : false
63
+ res = @tm.taxamatch(y[0], y[1])
64
+ puts "%s, %s, %s" % [y[0], y[1], y[2]] if res != y[2]
65
+ res.should == y[2]
66
+ end
67
+ end
68
+ end
69
+
70
+
71
+ it 'should compare genera' do
72
+ #edit distance 1 always match
73
+ g1 = make_taxamatch_hash 'Plantago'
74
+ g2 = make_taxamatch_hash 'Plantagon'
75
+ @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :edit_distance=>1, :match=>true}
76
+ #edit_distance above threshold does not match
77
+ g1 = make_taxamatch_hash 'Plantago'
78
+ g2 = make_taxamatch_hash 'This shouldnt match'
79
+ @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>4}
80
+ #phonetic_match matches
81
+ g1 = make_taxamatch_hash 'Plantagi'
82
+ g2 = make_taxamatch_hash 'Plantagy'
83
+ @tm.match_genera(g1, g2).should == {:phonetic_match=>true, :edit_distance=>1, :match=>true}
84
+ #distance 1 in first letter also matches
85
+ g1 = make_taxamatch_hash 'Xantheri'
86
+ g2 = make_taxamatch_hash 'Pantheri'
87
+ @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :edit_distance=>1, :match=>true}
88
+ #phonetic match trumps everything
89
+ g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
90
+ g2 = make_taxamatch_hash 'Zanthery'
91
+ @tm.match_genera(g1, g2).should == {:phonetic_match=>true, :edit_distance=>4, :match=>true}
92
+ #same first letter and distance 2 should match
93
+ g1 = make_taxamatch_hash 'Xantherii'
94
+ g2 = make_taxamatch_hash 'Xantherrr'
95
+ @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>2}
96
+ #First letter is the same and distance is 3 should match, no phonetic match
97
+ g1 = make_taxamatch_hash 'Xantheriii'
98
+ g2 = make_taxamatch_hash 'Xantherrrr'
99
+ @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>3}
100
+ #Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
101
+ g1 = make_taxamatch_hash 'Xant'
102
+ g2 = make_taxamatch_hash 'Xanthe'
103
+ @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>2}
104
+ #Should not match if edit distance > 3 and no phonetic match
105
+ g1 = make_taxamatch_hash 'Xantheriiii'
106
+ g2 = make_taxamatch_hash 'Xantherrrrr'
107
+ @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>4}
108
+ end
109
+
110
+ it 'should compare species' do
111
+ #Exact match
112
+ s1 = make_taxamatch_hash 'major'
113
+ s2 = make_taxamatch_hash 'major'
114
+ @tm.match_species(s1, s2).should == {:phonetic_match=>true, :match=>true, :edit_distance=>0}
115
+ #Phonetic match always works
116
+ s1 = make_taxamatch_hash 'xanteriiiiiiii'
117
+ s2 = make_taxamatch_hash 'zantereeeeeeee'
118
+ @tm.match_species(s1, s2).should == {:phonetic_match=>true, :match=>true, :edit_distance=>5}
119
+ #Phonetic match works with different endings
120
+ s1 = make_taxamatch_hash 'majorum'
121
+ s2 = make_taxamatch_hash 'majoris'
122
+ @tm.match_species(s1, s2).should == {:phonetic_match=>true, :match=>true, :edit_distance=>2}
123
+ #Distance 4 matches if first 3 chars are the same
124
+ s1 = make_taxamatch_hash 'majorrrrr'
125
+ s2 = make_taxamatch_hash 'majoraaaa'
126
+ @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>4}
127
+ #Should not match if Distance 4 matches and first 3 chars are not the same
128
+ s1 = make_taxamatch_hash 'majorrrrr'
129
+ s2 = make_taxamatch_hash 'marorraaa'
130
+ @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>4}
131
+ #Distance 2 or 3 matches if first 1 char is the same
132
+ s1 = make_taxamatch_hash 'morrrr'
133
+ s2 = make_taxamatch_hash 'moraaa'
134
+ @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>3}
135
+ #Should not match if Distance 2 or 3 and first 1 char is not the same
136
+ s1 = make_taxamatch_hash 'morrrr'
137
+ s2 = make_taxamatch_hash 'torraa'
138
+ @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>3}
139
+ #Distance 1 will match anywhere
140
+ s1 = make_taxamatch_hash 'major'
141
+ s2 = make_taxamatch_hash 'rajor'
142
+ @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>1}
143
+ #Will not match if distance is 3 and length is less than twice the edit distance
144
+ s1 = make_taxamatch_hash 'marrr'
145
+ s2 = make_taxamatch_hash 'maaaa'
146
+ @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>3}
147
+ end
148
+
149
+ it 'should match mathes' do
150
+ #No trouble case
151
+ gmatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
152
+ smatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
153
+ @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>2, :match=>true}
154
+ #Will not match if either genus or sp. epithet dont match
155
+ gmatch = {:match => false, :phonetic_match => false, :edit_distance => 1}
156
+ smatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
157
+ @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>false}
158
+ gmatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
159
+ smatch = {:match => false, :phonetic_match => false, :edit_distance => 1}
160
+ @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>false}
161
+ #Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
162
+ gmatch = {:match => true, :phonetic_match => true, :edit_distance => 3}
163
+ smatch = {:match => true, :phonetic_match => true, :edit_distance => 2}
164
+ @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>5, :match=>false}
165
+ #Should not have phonetic match if one of the components does not match phonetically
166
+ gmatch = {:match => true, :phonetic_match => false, :edit_distance => 1}
167
+ smatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
168
+ @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>true}
169
+ gmatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
170
+ smatch = {:match => true, :phonetic_match => false, :edit_distance => 1}
171
+ @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>true}
172
+ #edit distance should equal the sum of the edit distances
173
+ gmatch = {:match => true, :phonetic_match => true, :edit_distance => 2}
174
+ smatch = {:match => true, :phonetic_match => true, :edit_distance => 2}
175
+ @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>4, :match=>true}
176
+ end
177
+ end
178
+
179
+
@@ -0,0 +1,19 @@
1
+ ###
2
+ #
3
+ # Tests for string comparison by taxamatch algorithm
4
+ #
5
+ ##
6
+
7
+ # additional authorship should match
8
+ Puma concolor|Puma concolor L.|true
9
+
10
+ # one-letter misspelling in species epithet should match
11
+ Puma concolor|Puma cancolor|true
12
+
13
+ Pomatomus saltatrix|Pomatomus saltratix|true
14
+ Pomatomus saltator|Pomatomus saltatrix|true
15
+
16
+ Loligo pealeii|Loligo plei|false
17
+
18
+ # different authors should not match
19
+ #Puma concolor Linnaeus|Puma concolor Kurtz|false
data/taxamatch_rb.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{taxamatch_rb}
5
- s.version = "0.1.1"
5
+ s.version = "0.1.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Dmitry Mozzherin"]
@@ -26,6 +26,7 @@ Gem::Specification.new do |s|
26
26
  "features/support/matchers.rb",
27
27
  "features/taxamatch_rb.feature",
28
28
  "lib/taxamatch_rb.rb",
29
+ "lib/taxamatch_rb/authormatch.rb",
29
30
  "lib/taxamatch_rb/damerau_levenshtein_mod.rb",
30
31
  "lib/taxamatch_rb/normalizer.rb",
31
32
  "lib/taxamatch_rb/parser.rb",
@@ -34,6 +35,7 @@ Gem::Specification.new do |s|
34
35
  "spec/spec.opts",
35
36
  "spec/spec_helper.rb",
36
37
  "spec/taxamatch_rb_spec.rb",
38
+ "spec/taxamatch_test.txt",
37
39
  "taxamatch_rb.gemspec"
38
40
  ]
39
41
  s.has_rdoc = true
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dimus-taxamatch_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
@@ -55,6 +55,7 @@ files:
55
55
  - features/support/matchers.rb
56
56
  - features/taxamatch_rb.feature
57
57
  - lib/taxamatch_rb.rb
58
+ - lib/taxamatch_rb/authormatch.rb
58
59
  - lib/taxamatch_rb/damerau_levenshtein_mod.rb
59
60
  - lib/taxamatch_rb/normalizer.rb
60
61
  - lib/taxamatch_rb/parser.rb
@@ -63,6 +64,7 @@ files:
63
64
  - spec/spec.opts
64
65
  - spec/spec_helper.rb
65
66
  - spec/taxamatch_rb_spec.rb
67
+ - spec/taxamatch_test.txt
66
68
  - taxamatch_rb.gemspec
67
69
  has_rdoc: true
68
70
  homepage: http://github.com/dimus/taxamatch_rb