dimus-taxamatch_rb 0.5.3 → 0.5.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/taxamatch_rb.rb +8 -7
- data/spec/damerau_levenshtein_mod_test.txt +3 -0
- data/spec/taxamatch_rb_spec.rb +5 -4
- data/spec/taxamatch_test.txt +24 -18
- metadata +6 -6
data/lib/taxamatch_rb.rb
CHANGED
@@ -21,10 +21,11 @@ module Taxamatch
|
|
21
21
|
|
22
22
|
|
23
23
|
#takes two scientific names and returns true if names match and false if they don't
|
24
|
-
def taxamatch(str1, str2)
|
24
|
+
def taxamatch(str1, str2, return_boolean = true)
|
25
25
|
preparsed_1 = @parser.parse(str1)
|
26
26
|
preparsed_2 = @parser.parse(str2)
|
27
|
-
taxamatch_preparsed(preparsed_1, preparsed_2)
|
27
|
+
match = taxamatch_preparsed(preparsed_1, preparsed_2) rescue nil
|
28
|
+
return_boolean && match ? match['match'] : match
|
28
29
|
end
|
29
30
|
|
30
31
|
#takes two hashes of parsed scientific names, analyses them and returns back
|
@@ -34,7 +35,7 @@ module Taxamatch
|
|
34
35
|
result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
35
36
|
result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
|
36
37
|
if result && result['match']
|
37
|
-
result['match'] =
|
38
|
+
result['match'] = match_authors(preparsed_1, preparsed_2) == 0 ? false : true
|
38
39
|
end
|
39
40
|
return result
|
40
41
|
end
|
@@ -46,17 +47,16 @@ module Taxamatch
|
|
46
47
|
def match_multinomial(preparsed_1, preparsed_2)
|
47
48
|
gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
|
48
49
|
sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
|
49
|
-
au_match = match_authors(preparsed_1, preparsed_2)
|
50
50
|
total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size + preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
|
51
51
|
match = match_matches(gen_match, sp_match)
|
52
|
-
match.merge({'score' => (1- match['edit_distance']/(total_length/2))})
|
52
|
+
match.merge({'score' => (1 - match['edit_distance']/(total_length/2))})
|
53
53
|
end
|
54
54
|
|
55
55
|
def match_genera(genus1, genus2)
|
56
56
|
genus1_length = genus1[:normalized].size
|
57
57
|
genus2_length = genus2[:normalized].size
|
58
58
|
match = false
|
59
|
-
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],
|
59
|
+
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],1,3) #TODO put block = 2
|
60
60
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
61
61
|
|
62
62
|
match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
|
@@ -69,7 +69,8 @@ module Taxamatch
|
|
69
69
|
sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
|
70
70
|
sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
|
71
71
|
match = false
|
72
|
-
ed = @dlm.distance(sp1[:normalized], sp2[:normalized],
|
72
|
+
ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 1, 4) #TODO put block 4
|
73
|
+
#puts 's: %s, %s, %s' % [sp1[:normalized], sp2[:normalized], ed]
|
73
74
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
|
74
75
|
|
75
76
|
match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -55,12 +55,13 @@ describe 'Taxamatch::Base' do
|
|
55
55
|
|
56
56
|
it 'should get txt tests' do
|
57
57
|
dl = Taxamatch::DamerauLevenshteinMod.new
|
58
|
-
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt',
|
58
|
+
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
|
59
59
|
if y
|
60
60
|
y[2] = y[2] == 'true' ? true : false
|
61
|
-
res = @tm.taxamatch(y[0], y[1])
|
62
|
-
puts "%s, %s, %s" % [y[0], y[1], y[2]] if res != y[2]
|
63
|
-
res.should == y[2]
|
61
|
+
res = @tm.taxamatch(y[0], y[1], false)
|
62
|
+
#puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]] if res != y[2]
|
63
|
+
res['match'].should == y[2]
|
64
|
+
res['edit_distance'].should == y[3].to_i
|
64
65
|
end
|
65
66
|
end
|
66
67
|
end
|
data/spec/taxamatch_test.txt
CHANGED
@@ -1,28 +1,34 @@
|
|
1
1
|
###
|
2
2
|
#
|
3
3
|
# Tests for string comparison by taxamatch algorithm
|
4
|
+
# name1|name2|match|edit_distance
|
4
5
|
#
|
5
6
|
##
|
6
7
|
|
7
|
-
|
8
|
-
Puma concolor|Puma concolor L.|true
|
9
|
-
|
10
|
-
|
11
|
-
Puma concolor|Puma cancolor|true
|
12
|
-
|
13
|
-
Pomatomus saltatrix|Pomatomus saltratix|true
|
14
|
-
Pomatomus saltator|Pomatomus saltatrix|true
|
15
|
-
|
16
|
-
Loligo pealeii|Loligo plei|false
|
17
|
-
|
18
|
-
|
19
|
-
Puma concolor Linnaeus|Puma concolor Kurtz|false
|
8
|
+
## additional authorship should match
|
9
|
+
Puma concolor|Puma concolor L.|true|0
|
10
|
+
#
|
11
|
+
## one-letter misspelling in species epithet should match
|
12
|
+
Puma concolor|Puma cancolor|true|1
|
13
|
+
#
|
14
|
+
Pomatomus saltatrix|Pomatomus saltratix|true|2
|
15
|
+
Pomatomus saltator|Pomatomus saltatrix|true|3
|
16
|
+
#
|
17
|
+
Loligo pealeii|Loligo plei|false|3
|
18
|
+
#
|
19
|
+
## different authors should not match
|
20
|
+
Puma concolor Linnaeus|Puma concolor Kurtz|false|0
|
21
|
+
#
|
22
|
+
##real life examples
|
23
|
+
Biatora borealis|Bactra borealis Diakonoff 1964|false|3
|
24
|
+
#
|
25
|
+
Homo sapien|Homo sapiens|true|1
|
26
|
+
Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true|1
|
27
|
+
Homo sapiens Mozzherin|Homo sapiens Linneaus|false|0
|
28
|
+
#
|
29
|
+
Quinqueloculina punctata|Quinqueloculina punctata d'Orbigny 1905|true|0
|
30
|
+
Pomatomus saltator (Linnaeus, 1766)|Pomatomus saltatrix (Linnaeus, 1766)|true|0|3
|
20
31
|
|
21
|
-
#real life examples
|
22
|
-
Biatora borealis|Bactra borealis Diakonoff 1964|false
|
23
32
|
|
24
|
-
Homo sapien|Homo sapiens|true
|
25
|
-
Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true
|
26
|
-
Homo sapiens Mozzherin|Homo sapiens Linneaus|false
|
27
33
|
|
28
34
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dimus-taxamatch_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.3
|
4
|
+
version: 0.5.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-08-
|
12
|
+
date: 2009-08-16 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: "0"
|
34
34
|
version:
|
35
|
-
description:
|
35
|
+
description: This gem implements algorithms for fuzzy matching scientific names developed by Tony Rees
|
36
36
|
email: dmozzherin@eol.org
|
37
37
|
executables: []
|
38
38
|
|
@@ -55,7 +55,7 @@ files:
|
|
55
55
|
- spec/taxamatch_rb_spec.rb
|
56
56
|
- spec/taxamatch_test.txt
|
57
57
|
- LICENSE
|
58
|
-
has_rdoc:
|
58
|
+
has_rdoc: false
|
59
59
|
homepage: http://github.com/dimus/taxamatch_rb
|
60
60
|
licenses:
|
61
61
|
post_install_message:
|
@@ -80,8 +80,8 @@ requirements: []
|
|
80
80
|
rubyforge_project:
|
81
81
|
rubygems_version: 1.3.5
|
82
82
|
signing_key:
|
83
|
-
specification_version:
|
84
|
-
summary:
|
83
|
+
specification_version: 3
|
84
|
+
summary: Implementation of Tony Rees Taxamatch algorithms
|
85
85
|
test_files:
|
86
86
|
- spec/spec_helper.rb
|
87
87
|
- spec/taxamatch_rb_spec.rb
|