biodiversity19 1.1.3 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +5 -0
- data/VERSION +1 -1
- data/lib/biodiversity/parser.rb +37 -9
- data/lib/biodiversity/parser/scientific_name_clean.treetop +22 -0
- data/spec/parser/scientific_name.spec.rb +31 -0
- data/spec/parser/test_data.txt +2 -0
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -100,5 +100,10 @@ You can use it as a library
|
|
100
100
|
# to resolve lsid and get back RDF file
|
101
101
|
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
|
102
102
|
|
103
|
+
# to get canonicals with ranks for infraspecific epithets:
|
104
|
+
parser = ScientificNameParser.new(canonical_with_rank: true)
|
105
|
+
parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
|
106
|
+
# should get 'Cola cordifolia var. puberula'
|
107
|
+
|
103
108
|
Copyright (c) 2009-2011 Marine Biological Laboratory. See LICENSE.txt for
|
104
109
|
further details.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.
|
1
|
+
1.2.0
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -12,7 +12,7 @@ module PreProcessor
|
|
12
12
|
TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
|
13
13
|
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
|
14
14
|
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
|
15
|
-
LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
|
15
|
+
LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|nudum|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
|
16
16
|
|
17
17
|
def self.clean(a_string)
|
18
18
|
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
@@ -36,7 +36,9 @@ class ParallelParser
|
|
36
36
|
end
|
37
37
|
|
38
38
|
def parse(names_list)
|
39
|
-
parsed = Parallel.map(names_list.uniq, :
|
39
|
+
parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
|
40
|
+
[n, parse_process(n)]
|
41
|
+
end
|
40
42
|
parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
|
41
43
|
end
|
42
44
|
|
@@ -47,7 +49,10 @@ class ParallelParser
|
|
47
49
|
private
|
48
50
|
def parse_process(name)
|
49
51
|
p = ScientificNameParser.new
|
50
|
-
|
52
|
+
failed_res = { scientificName: { parsed: false,
|
53
|
+
verbatim: name,
|
54
|
+
error: 'Parser error' } }
|
55
|
+
p.parse(name) rescue failed_res
|
51
56
|
end
|
52
57
|
end
|
53
58
|
|
@@ -64,7 +69,10 @@ end
|
|
64
69
|
# end
|
65
70
|
|
66
71
|
class ScientificNameParser
|
67
|
-
VERSION = open(File.join(File.dirname(__FILE__),
|
72
|
+
VERSION = open(File.join(File.dirname(__FILE__),
|
73
|
+
'..',
|
74
|
+
'..',
|
75
|
+
'VERSION')).readline.strip
|
68
76
|
|
69
77
|
def self.fix_case(name_string)
|
70
78
|
name_ary = name_string.split(/\s+/)
|
@@ -96,7 +104,8 @@ class ScientificNameParser
|
|
96
104
|
end
|
97
105
|
|
98
106
|
|
99
|
-
def initialize
|
107
|
+
def initialize(opts = {})
|
108
|
+
@canonical_with_rank = !!opts[:canonical_with_rank]
|
100
109
|
@verbatim = ''
|
101
110
|
@clean = ScientificNameCleanParser.new
|
102
111
|
@dirty = ScientificNameDirtyParser.new
|
@@ -134,7 +143,9 @@ class ScientificNameParser
|
|
134
143
|
@parsed = @dirty.parse(salvage_string) || @canonical.parse(a_string) || { :verbatim => a_string }
|
135
144
|
end
|
136
145
|
rescue
|
137
|
-
@parsed = {:
|
146
|
+
@parsed = { scientificName: { parsed: false,
|
147
|
+
verbatim: name,
|
148
|
+
error: 'Parser error' } }
|
138
149
|
end
|
139
150
|
end
|
140
151
|
|
@@ -142,7 +153,8 @@ class ScientificNameParser
|
|
142
153
|
@verbatim = a_string
|
143
154
|
end
|
144
155
|
|
145
|
-
def @parsed.all(
|
156
|
+
def @parsed.all(opts = {})
|
157
|
+
canonical_with_rank = !!opts[:canonical_with_rank]
|
146
158
|
parsed = self.class != Hash
|
147
159
|
res = { :parsed => parsed, :parser_version => ScientificNameParser::VERSION}
|
148
160
|
if parsed
|
@@ -159,8 +171,10 @@ class ScientificNameParser
|
|
159
171
|
else
|
160
172
|
res.merge!(self)
|
161
173
|
end
|
174
|
+
if canonical_with_rank && canonical.count(" ") > 1 && res[:details][0][:infraspecies]
|
175
|
+
ScientificNameParser.add_rank_to_canonical(res)
|
176
|
+
end
|
162
177
|
res = {:scientificName => res}
|
163
|
-
res
|
164
178
|
end
|
165
179
|
|
166
180
|
def @parsed.pos_json
|
@@ -172,7 +186,21 @@ class ScientificNameParser
|
|
172
186
|
end
|
173
187
|
|
174
188
|
@parsed.verbatim = @verbatim
|
175
|
-
@parsed.all
|
189
|
+
@parsed.all(canonical_with_rank: @canonical_with_rank)
|
190
|
+
end
|
191
|
+
|
192
|
+
private
|
193
|
+
|
194
|
+
def self.add_rank_to_canonical(parsed)
|
195
|
+
parts = parsed[:canonical].split(" ")
|
196
|
+
name_ary = parts[0..1]
|
197
|
+
parsed[:details][0][:infraspecies].each do |data|
|
198
|
+
infrasp = data[:string]
|
199
|
+
rank = data[:rank]
|
200
|
+
name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
|
201
|
+
end
|
202
|
+
parsed[:canonical] = name_ary.join(" ")
|
176
203
|
end
|
204
|
+
|
177
205
|
end
|
178
206
|
|
@@ -262,6 +262,28 @@ grammar ScientificNameClean
|
|
262
262
|
end
|
263
263
|
}
|
264
264
|
/
|
265
|
+
a:genus space b:infragenus space aid:annotation_identification? space c:species space aid:annotation_identification space d:infraspecies_mult {
|
266
|
+
def value
|
267
|
+
a.value + " " + b.value + " " + c.value + " " + d.value
|
268
|
+
end
|
269
|
+
|
270
|
+
def canonical
|
271
|
+
a.canonical + " " + c.canonical + " " + d.canonical
|
272
|
+
end
|
273
|
+
|
274
|
+
def pos
|
275
|
+
a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
|
276
|
+
end
|
277
|
+
|
278
|
+
def hybrid
|
279
|
+
c.hybrid rescue false
|
280
|
+
end
|
281
|
+
|
282
|
+
def details
|
283
|
+
a.details.merge(b.details).merge(c.details).merge(d.details)
|
284
|
+
end
|
285
|
+
}
|
286
|
+
/
|
265
287
|
a:genus space b:infragenus space aid:annotation_identification? space c:species {
|
266
288
|
def value
|
267
289
|
if defined? aid.apply
|
@@ -26,6 +26,7 @@ describe ScientificNameParser do
|
|
26
26
|
JSON.load(json(y[:name])).should == JSON.load(y[:jsn]) unless y[:comment]
|
27
27
|
end
|
28
28
|
end
|
29
|
+
|
29
30
|
|
30
31
|
# it 'should generate new test_file' do
|
31
32
|
# new_test = open(File.expand_path(dir + "../../spec/parser/test_data_new.txt"),'w')
|
@@ -57,6 +58,36 @@ describe ScientificNameParser do
|
|
57
58
|
end
|
58
59
|
end
|
59
60
|
|
61
|
+
describe "ScientificNameParser with ranked canonicals" do
|
62
|
+
before(:all) do
|
63
|
+
@parser = ScientificNameParser.new(canonical_with_rank: true)
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'should not influence output for uninomials and binomials' do
|
67
|
+
data = [
|
68
|
+
['Ekbainacanthus Yakowlew 1902','Ekbainacanthus'],
|
69
|
+
['Ekboarmia sagnesi herrerai Exposito 2007', 'Ekboarmia sagnesi herrerai'],
|
70
|
+
['Ekboarmia holli Oberthür', 'Ekboarmia holli']]
|
71
|
+
|
72
|
+
data.each do |d|
|
73
|
+
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
74
|
+
parsed.should == d[1]
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
it 'should preserve rank for ranked multinomials' do
|
79
|
+
data = [
|
80
|
+
['Cola cordifolia var. puberula A. Chev.', 'Cola cordifolia var. puberula'],
|
81
|
+
['Abies homolepis forma umbilicata (Mayr) Schelle', 'Abies homolepis forma umbilicata'],
|
82
|
+
['Quercus ilex ssp. ballota (Desf.) Samp', 'Quercus ilex ssp. ballota']
|
83
|
+
]
|
84
|
+
data.each do |d|
|
85
|
+
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
86
|
+
parsed.should == d[1]
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
60
91
|
|
61
92
|
describe ParallelParser do
|
62
93
|
it "should find number of cpus" do
|
data/spec/parser/test_data.txt
CHANGED
@@ -190,6 +190,8 @@ Endoxyla sp. GM-, 2003|{"scientificName":{"parsed":true, "parser_version":"test_
|
|
190
190
|
Liopropoma sp.2 Not applicable|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Liopropoma sp.2 Not applicable", "normalized":"Liopropoma", "canonical":"Liopropoma", "hybrid":false, "details":[{"genus":{"string":"Liopropoma"}, "annotation_identification":"sp.", "ignored":{"unparsed":"2 Not applicable"}}], "parser_run":1, "positions":{"0":["genus", 10], "11":["annotation_identification", 14]}}}
|
191
191
|
Lacanobia nr. subjuncta Bold:Aab, 0925|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Lacanobia nr. subjuncta Bold:Aab, 0925", "normalized":"Lacanobia", "canonical":"Lacanobia", "hybrid":false, "details":[{"genus":{"string":"Lacanobia"}, "annotation_identification":"nr.", "ignored":{"species":{"string":"subjuncta", "authorship":"Bold:Aab", "basionymAuthorTeam":{"authorTeam":"Bold:Aab", "author":["Bold:Aab"]}}}}], "parser_run":2, "positions":{"0":["genus", 9], "10":["annotation_identification", 13]}}}
|
192
192
|
Lacanobia sp. nr. subjuncta Bold:Aab, 0925|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Lacanobia sp. nr. subjuncta Bold:Aab, 0925", "normalized":"Lacanobia", "canonical":"Lacanobia", "hybrid":false, "details":[{"genus":{"string":"Lacanobia"}, "annotation_identification":"sp. nr.", "ignored":{"species":{"string":"subjuncta", "authorship":"Bold:Aab", "basionymAuthorTeam":{"authorTeam":"Bold:Aab", "author":["Bold:Aab"]}}}}], "parser_run":2, "positions":{"0":["genus", 9], "10":["annotation_identification", 17]}}}
|
193
|
+
#Larus occidentalis cf. wymani|{}
|
194
|
+
Calidris cf. cooperi|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Calidris cf. cooperi", "normalized":"Calidris cf. cooperi", "canonical":"Calidris cooperi", "hybrid":false, "details":[{"genus":{"string":"Calidris"}, "annotation_identification":"cf.", "species":{"species":{"string":"cooperi"}}}], "parser_run":1, "positions":{"0":["genus", 8], "9":["annotation_identification", 12], "13":["species", 20]}}}
|
193
195
|
#TODO:Gemmula cf. cosmoi NP-2008 -- generates wrong authorship
|
194
196
|
|
195
197
|
#unknown authorship
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: biodiversity19
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-03-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: treetop
|
@@ -178,7 +178,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
178
178
|
version: '0'
|
179
179
|
segments:
|
180
180
|
- 0
|
181
|
-
hash:
|
181
|
+
hash: 3489764594482913391
|
182
182
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
183
183
|
none: false
|
184
184
|
requirements:
|