biodiversity19 1.1.3 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +5 -0
- data/VERSION +1 -1
- data/lib/biodiversity/parser.rb +37 -9
- data/lib/biodiversity/parser/scientific_name_clean.treetop +22 -0
- data/spec/parser/scientific_name.spec.rb +31 -0
- data/spec/parser/test_data.txt +2 -0
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -100,5 +100,10 @@ You can use it as a library
|
|
100
100
|
# to resolve lsid and get back RDF file
|
101
101
|
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
|
102
102
|
|
103
|
+
# to get canonicals with ranks for infraspecific epithets:
|
104
|
+
parser = ScientificNameParser.new(canonical_with_rank: true)
|
105
|
+
parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
|
106
|
+
# should get 'Cola cordifolia var. puberula'
|
107
|
+
|
103
108
|
Copyright (c) 2009-2011 Marine Biological Laboratory. See LICENSE.txt for
|
104
109
|
further details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.1.3
|
1
|
+
1.2.0
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -12,7 +12,7 @@ module PreProcessor
|
|
12
12
|
TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
|
13
13
|
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
|
14
14
|
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
|
15
|
-
LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
|
15
|
+
LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|nudum|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
|
16
16
|
|
17
17
|
def self.clean(a_string)
|
18
18
|
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
@@ -36,7 +36,9 @@ class ParallelParser
|
|
36
36
|
end
|
37
37
|
|
38
38
|
def parse(names_list)
|
39
|
-
parsed = Parallel.map(names_list.uniq, :
|
39
|
+
parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
|
40
|
+
[n, parse_process(n)]
|
41
|
+
end
|
40
42
|
parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
|
41
43
|
end
|
42
44
|
|
@@ -47,7 +49,10 @@ class ParallelParser
|
|
47
49
|
private
|
48
50
|
def parse_process(name)
|
49
51
|
p = ScientificNameParser.new
|
50
|
-
|
52
|
+
failed_res = { scientificName: { parsed: false,
|
53
|
+
verbatim: name,
|
54
|
+
error: 'Parser error' } }
|
55
|
+
p.parse(name) rescue failed_res
|
51
56
|
end
|
52
57
|
end
|
53
58
|
|
@@ -64,7 +69,10 @@ end
|
|
64
69
|
# end
|
65
70
|
|
66
71
|
class ScientificNameParser
|
67
|
-
VERSION = open(File.join(File.dirname(__FILE__),
|
72
|
+
VERSION = open(File.join(File.dirname(__FILE__),
|
73
|
+
'..',
|
74
|
+
'..',
|
75
|
+
'VERSION')).readline.strip
|
68
76
|
|
69
77
|
def self.fix_case(name_string)
|
70
78
|
name_ary = name_string.split(/\s+/)
|
@@ -96,7 +104,8 @@ class ScientificNameParser
|
|
96
104
|
end
|
97
105
|
|
98
106
|
|
99
|
-
def initialize
|
107
|
+
def initialize(opts = {})
|
108
|
+
@canonical_with_rank = !!opts[:canonical_with_rank]
|
100
109
|
@verbatim = ''
|
101
110
|
@clean = ScientificNameCleanParser.new
|
102
111
|
@dirty = ScientificNameDirtyParser.new
|
@@ -134,7 +143,9 @@ class ScientificNameParser
|
|
134
143
|
@parsed = @dirty.parse(salvage_string) || @canonical.parse(a_string) || { :verbatim => a_string }
|
135
144
|
end
|
136
145
|
rescue
|
137
|
-
@parsed = {:
|
146
|
+
@parsed = { scientificName: { parsed: false,
|
147
|
+
verbatim: name,
|
148
|
+
error: 'Parser error' } }
|
138
149
|
end
|
139
150
|
end
|
140
151
|
|
@@ -142,7 +153,8 @@ class ScientificNameParser
|
|
142
153
|
@verbatim = a_string
|
143
154
|
end
|
144
155
|
|
145
|
-
def @parsed.all(
|
156
|
+
def @parsed.all(opts = {})
|
157
|
+
canonical_with_rank = !!opts[:canonical_with_rank]
|
146
158
|
parsed = self.class != Hash
|
147
159
|
res = { :parsed => parsed, :parser_version => ScientificNameParser::VERSION}
|
148
160
|
if parsed
|
@@ -159,8 +171,10 @@ class ScientificNameParser
|
|
159
171
|
else
|
160
172
|
res.merge!(self)
|
161
173
|
end
|
174
|
+
if canonical_with_rank && canonical.count(" ") > 1 && res[:details][0][:infraspecies]
|
175
|
+
ScientificNameParser.add_rank_to_canonical(res)
|
176
|
+
end
|
162
177
|
res = {:scientificName => res}
|
163
|
-
res
|
164
178
|
end
|
165
179
|
|
166
180
|
def @parsed.pos_json
|
@@ -172,7 +186,21 @@ class ScientificNameParser
|
|
172
186
|
end
|
173
187
|
|
174
188
|
@parsed.verbatim = @verbatim
|
175
|
-
@parsed.all
|
189
|
+
@parsed.all(canonical_with_rank: @canonical_with_rank)
|
190
|
+
end
|
191
|
+
|
192
|
+
private
|
193
|
+
|
194
|
+
def self.add_rank_to_canonical(parsed)
|
195
|
+
parts = parsed[:canonical].split(" ")
|
196
|
+
name_ary = parts[0..1]
|
197
|
+
parsed[:details][0][:infraspecies].each do |data|
|
198
|
+
infrasp = data[:string]
|
199
|
+
rank = data[:rank]
|
200
|
+
name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
|
201
|
+
end
|
202
|
+
parsed[:canonical] = name_ary.join(" ")
|
176
203
|
end
|
204
|
+
|
177
205
|
end
|
178
206
|
|
@@ -262,6 +262,28 @@ grammar ScientificNameClean
|
|
262
262
|
end
|
263
263
|
}
|
264
264
|
/
|
265
|
+
a:genus space b:infragenus space aid:annotation_identification? space c:species space aid:annotation_identification space d:infraspecies_mult {
|
266
|
+
def value
|
267
|
+
a.value + " " + b.value + " " + c.value + " " + d.value
|
268
|
+
end
|
269
|
+
|
270
|
+
def canonical
|
271
|
+
a.canonical + " " + c.canonical + " " + d.canonical
|
272
|
+
end
|
273
|
+
|
274
|
+
def pos
|
275
|
+
a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
|
276
|
+
end
|
277
|
+
|
278
|
+
def hybrid
|
279
|
+
c.hybrid rescue false
|
280
|
+
end
|
281
|
+
|
282
|
+
def details
|
283
|
+
a.details.merge(b.details).merge(c.details).merge(d.details)
|
284
|
+
end
|
285
|
+
}
|
286
|
+
/
|
265
287
|
a:genus space b:infragenus space aid:annotation_identification? space c:species {
|
266
288
|
def value
|
267
289
|
if defined? aid.apply
|
@@ -26,6 +26,7 @@ describe ScientificNameParser do
|
|
26
26
|
JSON.load(json(y[:name])).should == JSON.load(y[:jsn]) unless y[:comment]
|
27
27
|
end
|
28
28
|
end
|
29
|
+
|
29
30
|
|
30
31
|
# it 'should generate new test_file' do
|
31
32
|
# new_test = open(File.expand_path(dir + "../../spec/parser/test_data_new.txt"),'w')
|
@@ -57,6 +58,36 @@ describe ScientificNameParser do
|
|
57
58
|
end
|
58
59
|
end
|
59
60
|
|
61
|
+
describe "ScientificNameParser with ranked canonicals" do
|
62
|
+
before(:all) do
|
63
|
+
@parser = ScientificNameParser.new(canonical_with_rank: true)
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'should not influence output for uninomials and binomials' do
|
67
|
+
data = [
|
68
|
+
['Ekbainacanthus Yakowlew 1902','Ekbainacanthus'],
|
69
|
+
['Ekboarmia sagnesi herrerai Exposito 2007', 'Ekboarmia sagnesi herrerai'],
|
70
|
+
['Ekboarmia holli Oberthür', 'Ekboarmia holli']]
|
71
|
+
|
72
|
+
data.each do |d|
|
73
|
+
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
74
|
+
parsed.should == d[1]
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
it 'should preserve rank for ranked multinomials' do
|
79
|
+
data = [
|
80
|
+
['Cola cordifolia var. puberula A. Chev.', 'Cola cordifolia var. puberula'],
|
81
|
+
['Abies homolepis forma umbilicata (Mayr) Schelle', 'Abies homolepis forma umbilicata'],
|
82
|
+
['Quercus ilex ssp. ballota (Desf.) Samp', 'Quercus ilex ssp. ballota']
|
83
|
+
]
|
84
|
+
data.each do |d|
|
85
|
+
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
86
|
+
parsed.should == d[1]
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
60
91
|
|
61
92
|
describe ParallelParser do
|
62
93
|
it "should find number of cpus" do
|
data/spec/parser/test_data.txt
CHANGED
@@ -190,6 +190,8 @@ Endoxyla sp. GM-, 2003|{"scientificName":{"parsed":true, "parser_version":"test_
|
|
190
190
|
Liopropoma sp.2 Not applicable|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Liopropoma sp.2 Not applicable", "normalized":"Liopropoma", "canonical":"Liopropoma", "hybrid":false, "details":[{"genus":{"string":"Liopropoma"}, "annotation_identification":"sp.", "ignored":{"unparsed":"2 Not applicable"}}], "parser_run":1, "positions":{"0":["genus", 10], "11":["annotation_identification", 14]}}}
|
191
191
|
Lacanobia nr. subjuncta Bold:Aab, 0925|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Lacanobia nr. subjuncta Bold:Aab, 0925", "normalized":"Lacanobia", "canonical":"Lacanobia", "hybrid":false, "details":[{"genus":{"string":"Lacanobia"}, "annotation_identification":"nr.", "ignored":{"species":{"string":"subjuncta", "authorship":"Bold:Aab", "basionymAuthorTeam":{"authorTeam":"Bold:Aab", "author":["Bold:Aab"]}}}}], "parser_run":2, "positions":{"0":["genus", 9], "10":["annotation_identification", 13]}}}
|
192
192
|
Lacanobia sp. nr. subjuncta Bold:Aab, 0925|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Lacanobia sp. nr. subjuncta Bold:Aab, 0925", "normalized":"Lacanobia", "canonical":"Lacanobia", "hybrid":false, "details":[{"genus":{"string":"Lacanobia"}, "annotation_identification":"sp. nr.", "ignored":{"species":{"string":"subjuncta", "authorship":"Bold:Aab", "basionymAuthorTeam":{"authorTeam":"Bold:Aab", "author":["Bold:Aab"]}}}}], "parser_run":2, "positions":{"0":["genus", 9], "10":["annotation_identification", 17]}}}
|
193
|
+
#Larus occidentalis cf. wymani|{}
|
194
|
+
Calidris cf. cooperi|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Calidris cf. cooperi", "normalized":"Calidris cf. cooperi", "canonical":"Calidris cooperi", "hybrid":false, "details":[{"genus":{"string":"Calidris"}, "annotation_identification":"cf.", "species":{"species":{"string":"cooperi"}}}], "parser_run":1, "positions":{"0":["genus", 8], "9":["annotation_identification", 12], "13":["species", 20]}}}
|
193
195
|
#TODO:Gemmula cf. cosmoi NP-2008 -- generates wrong authorship
|
194
196
|
|
195
197
|
#unknown authorship
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: biodiversity19
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.3
|
4
|
+
version: 1.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-03-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: treetop
|
@@ -178,7 +178,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
178
178
|
version: '0'
|
179
179
|
segments:
|
180
180
|
- 0
|
181
|
-
hash:
|
181
|
+
hash: 3489764594482913391
|
182
182
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
183
183
|
none: false
|
184
184
|
requirements:
|