biodiversity19 1.1.3 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -100,5 +100,10 @@ You can use it as a library
100
100
  # to resolve lsid and get back RDF file
101
101
  LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
102
102
 
103
+ # to get canonicals with ranks for infraspecific epithets:
104
+ parser = ScientificNameParser.new(canonical_with_rank: true)
105
+ parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
106
+ # should get 'Cola cordifolia var. puberula'
107
+
103
108
  Copyright (c) 2009-2011 Marine Biological Laboratory. See LICENSE.txt for
104
109
  further details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.1.3
1
+ 1.2.0
@@ -12,7 +12,7 @@ module PreProcessor
12
12
  TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
13
13
  TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
14
14
  NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
15
- LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
15
+ LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|nudum|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
16
16
 
17
17
  def self.clean(a_string)
18
18
  [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
@@ -36,7 +36,9 @@ class ParallelParser
36
36
  end
37
37
 
38
38
  def parse(names_list)
39
- parsed = Parallel.map(names_list.uniq, :in_processes => @processes_num) { |n| [n, parse_process(n)] }
39
+ parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
40
+ [n, parse_process(n)]
41
+ end
40
42
  parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
41
43
  end
42
44
 
@@ -47,7 +49,10 @@ class ParallelParser
47
49
  private
48
50
  def parse_process(name)
49
51
  p = ScientificNameParser.new
50
- p.parse(name) rescue {:scientificName => {:parsed => false, :verbatim => name, :error => 'Parser error'}}
52
+ failed_res = { scientificName: { parsed: false,
53
+ verbatim: name,
54
+ error: 'Parser error' } }
55
+ p.parse(name) rescue failed_res
51
56
  end
52
57
  end
53
58
 
@@ -64,7 +69,10 @@ end
64
69
  # end
65
70
 
66
71
  class ScientificNameParser
67
- VERSION = open(File.join(File.dirname(__FILE__), '..', '..', 'VERSION')).readline.strip
72
+ VERSION = open(File.join(File.dirname(__FILE__),
73
+ '..',
74
+ '..',
75
+ 'VERSION')).readline.strip
68
76
 
69
77
  def self.fix_case(name_string)
70
78
  name_ary = name_string.split(/\s+/)
@@ -96,7 +104,8 @@ class ScientificNameParser
96
104
  end
97
105
 
98
106
 
99
- def initialize
107
+ def initialize(opts = {})
108
+ @canonical_with_rank = !!opts[:canonical_with_rank]
100
109
  @verbatim = ''
101
110
  @clean = ScientificNameCleanParser.new
102
111
  @dirty = ScientificNameDirtyParser.new
@@ -134,7 +143,9 @@ class ScientificNameParser
134
143
  @parsed = @dirty.parse(salvage_string) || @canonical.parse(a_string) || { :verbatim => a_string }
135
144
  end
136
145
  rescue
137
- @parsed = {:scientificName => {:parsed => false, :verbatim => name, :error => 'Parser error'}}
146
+ @parsed = { scientificName: { parsed: false,
147
+ verbatim: name,
148
+ error: 'Parser error' } }
138
149
  end
139
150
  end
140
151
 
@@ -142,7 +153,8 @@ class ScientificNameParser
142
153
  @verbatim = a_string
143
154
  end
144
155
 
145
- def @parsed.all(verbatim = @verbatim)
156
+ def @parsed.all(opts = {})
157
+ canonical_with_rank = !!opts[:canonical_with_rank]
146
158
  parsed = self.class != Hash
147
159
  res = { :parsed => parsed, :parser_version => ScientificNameParser::VERSION}
148
160
  if parsed
@@ -159,8 +171,10 @@ class ScientificNameParser
159
171
  else
160
172
  res.merge!(self)
161
173
  end
174
+ if canonical_with_rank && canonical.count(" ") > 1 && res[:details][0][:infraspecies]
175
+ ScientificNameParser.add_rank_to_canonical(res)
176
+ end
162
177
  res = {:scientificName => res}
163
- res
164
178
  end
165
179
 
166
180
  def @parsed.pos_json
@@ -172,7 +186,21 @@ class ScientificNameParser
172
186
  end
173
187
 
174
188
  @parsed.verbatim = @verbatim
175
- @parsed.all
189
+ @parsed.all(canonical_with_rank: @canonical_with_rank)
190
+ end
191
+
192
+ private
193
+
194
+ def self.add_rank_to_canonical(parsed)
195
+ parts = parsed[:canonical].split(" ")
196
+ name_ary = parts[0..1]
197
+ parsed[:details][0][:infraspecies].each do |data|
198
+ infrasp = data[:string]
199
+ rank = data[:rank]
200
+ name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
201
+ end
202
+ parsed[:canonical] = name_ary.join(" ")
176
203
  end
204
+
177
205
  end
178
206
 
@@ -262,6 +262,28 @@ grammar ScientificNameClean
262
262
  end
263
263
  }
264
264
  /
265
+ a:genus space b:infragenus space aid:annotation_identification? space c:species space aid:annotation_identification space d:infraspecies_mult {
266
+ def value
267
+ a.value + " " + b.value + " " + c.value + " " + d.value
268
+ end
269
+
270
+ def canonical
271
+ a.canonical + " " + c.canonical + " " + d.canonical
272
+ end
273
+
274
+ def pos
275
+ a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
276
+ end
277
+
278
+ def hybrid
279
+ c.hybrid rescue false
280
+ end
281
+
282
+ def details
283
+ a.details.merge(b.details).merge(c.details).merge(d.details)
284
+ end
285
+ }
286
+ /
265
287
  a:genus space b:infragenus space aid:annotation_identification? space c:species {
266
288
  def value
267
289
  if defined? aid.apply
@@ -26,6 +26,7 @@ describe ScientificNameParser do
26
26
  JSON.load(json(y[:name])).should == JSON.load(y[:jsn]) unless y[:comment]
27
27
  end
28
28
  end
29
+
29
30
 
30
31
  # it 'should generate new test_file' do
31
32
  # new_test = open(File.expand_path(dir + "../../spec/parser/test_data_new.txt"),'w')
@@ -57,6 +58,36 @@ describe ScientificNameParser do
57
58
  end
58
59
  end
59
60
 
61
+ describe "ScientificNameParser with ranked canonicals" do
62
+ before(:all) do
63
+ @parser = ScientificNameParser.new(canonical_with_rank: true)
64
+ end
65
+
66
+ it 'should not influence output for uninomials and binomials' do
67
+ data = [
68
+ ['Ekbainacanthus Yakowlew 1902','Ekbainacanthus'],
69
+ ['Ekboarmia sagnesi herrerai Exposito 2007', 'Ekboarmia sagnesi herrerai'],
70
+ ['Ekboarmia holli Oberthür', 'Ekboarmia holli']]
71
+
72
+ data.each do |d|
73
+ parsed = @parser.parse(d[0])[:scientificName][:canonical]
74
+ parsed.should == d[1]
75
+ end
76
+ end
77
+
78
+ it 'should preserve rank for ranked multinomials' do
79
+ data = [
80
+ ['Cola cordifolia var. puberula A. Chev.', 'Cola cordifolia var. puberula'],
81
+ ['Abies homolepis forma umbilicata (Mayr) Schelle', 'Abies homolepis forma umbilicata'],
82
+ ['Quercus ilex ssp. ballota (Desf.) Samp', 'Quercus ilex ssp. ballota']
83
+ ]
84
+ data.each do |d|
85
+ parsed = @parser.parse(d[0])[:scientificName][:canonical]
86
+ parsed.should == d[1]
87
+ end
88
+ end
89
+
90
+ end
60
91
 
61
92
  describe ParallelParser do
62
93
  it "should find number of cpus" do
@@ -190,6 +190,8 @@ Endoxyla sp. GM-, 2003|{"scientificName":{"parsed":true, "parser_version":"test_
190
190
  Liopropoma sp.2 Not applicable|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Liopropoma sp.2 Not applicable", "normalized":"Liopropoma", "canonical":"Liopropoma", "hybrid":false, "details":[{"genus":{"string":"Liopropoma"}, "annotation_identification":"sp.", "ignored":{"unparsed":"2 Not applicable"}}], "parser_run":1, "positions":{"0":["genus", 10], "11":["annotation_identification", 14]}}}
191
191
  Lacanobia nr. subjuncta Bold:Aab, 0925|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Lacanobia nr. subjuncta Bold:Aab, 0925", "normalized":"Lacanobia", "canonical":"Lacanobia", "hybrid":false, "details":[{"genus":{"string":"Lacanobia"}, "annotation_identification":"nr.", "ignored":{"species":{"string":"subjuncta", "authorship":"Bold:Aab", "basionymAuthorTeam":{"authorTeam":"Bold:Aab", "author":["Bold:Aab"]}}}}], "parser_run":2, "positions":{"0":["genus", 9], "10":["annotation_identification", 13]}}}
192
192
  Lacanobia sp. nr. subjuncta Bold:Aab, 0925|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Lacanobia sp. nr. subjuncta Bold:Aab, 0925", "normalized":"Lacanobia", "canonical":"Lacanobia", "hybrid":false, "details":[{"genus":{"string":"Lacanobia"}, "annotation_identification":"sp. nr.", "ignored":{"species":{"string":"subjuncta", "authorship":"Bold:Aab", "basionymAuthorTeam":{"authorTeam":"Bold:Aab", "author":["Bold:Aab"]}}}}], "parser_run":2, "positions":{"0":["genus", 9], "10":["annotation_identification", 17]}}}
193
+ #Larus occidentalis cf. wymani|{}
194
+ Calidris cf. cooperi|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Calidris cf. cooperi", "normalized":"Calidris cf. cooperi", "canonical":"Calidris cooperi", "hybrid":false, "details":[{"genus":{"string":"Calidris"}, "annotation_identification":"cf.", "species":{"species":{"string":"cooperi"}}}], "parser_run":1, "positions":{"0":["genus", 8], "9":["annotation_identification", 12], "13":["species", 20]}}}
193
195
  #TODO:Gemmula cf. cosmoi NP-2008 -- generates wrong authorship
194
196
 
195
197
  #unknown authorship
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biodiversity19
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.3
4
+ version: 1.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-08 00:00:00.000000000 Z
12
+ date: 2013-03-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: treetop
@@ -178,7 +178,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
178
178
  version: '0'
179
179
  segments:
180
180
  - 0
181
- hash: -2308575381079309035
181
+ hash: 3489764594482913391
182
182
  required_rubygems_version: !ruby/object:Gem::Requirement
183
183
  none: false
184
184
  requirements: