biodiversity19 1.1.3 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -100,5 +100,10 @@ You can use it as a library
100
100
  # to resolve lsid and get back RDF file
101
101
  LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
102
102
 
103
+ # to get canonicals with ranks for infraspecific epithets:
104
+ parser = ScientificNameParser.new(canonical_with_rank: true)
105
+ parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
106
+ # returns 'Cola cordifolia var. puberula'
107
+
103
108
  Copyright (c) 2009-2011 Marine Biological Laboratory. See LICENSE.txt for
104
109
  further details.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.1.3
1
+ 1.2.0
@@ -12,7 +12,7 @@ module PreProcessor
12
12
  TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
13
13
  TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
14
14
  NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
15
- LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
15
+ LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|nudum|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
16
16
 
17
17
  def self.clean(a_string)
18
18
  [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
@@ -36,7 +36,9 @@ class ParallelParser
36
36
  end
37
37
 
38
38
  def parse(names_list)
39
- parsed = Parallel.map(names_list.uniq, :in_processes => @processes_num) { |n| [n, parse_process(n)] }
39
+ parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
40
+ [n, parse_process(n)]
41
+ end
40
42
  parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
41
43
  end
42
44
 
@@ -47,7 +49,10 @@ class ParallelParser
47
49
  private
48
50
  def parse_process(name)
49
51
  p = ScientificNameParser.new
50
- p.parse(name) rescue {:scientificName => {:parsed => false, :verbatim => name, :error => 'Parser error'}}
52
+ failed_res = { scientificName: { parsed: false,
53
+ verbatim: name,
54
+ error: 'Parser error' } }
55
+ p.parse(name) rescue failed_res
51
56
  end
52
57
  end
53
58
 
@@ -64,7 +69,10 @@ end
64
69
  # end
65
70
 
66
71
  class ScientificNameParser
67
- VERSION = open(File.join(File.dirname(__FILE__), '..', '..', 'VERSION')).readline.strip
72
+ VERSION = open(File.join(File.dirname(__FILE__),
73
+ '..',
74
+ '..',
75
+ 'VERSION')).readline.strip
68
76
 
69
77
  def self.fix_case(name_string)
70
78
  name_ary = name_string.split(/\s+/)
@@ -96,7 +104,8 @@ class ScientificNameParser
96
104
  end
97
105
 
98
106
 
99
- def initialize
107
+ def initialize(opts = {})
108
+ @canonical_with_rank = !!opts[:canonical_with_rank]
100
109
  @verbatim = ''
101
110
  @clean = ScientificNameCleanParser.new
102
111
  @dirty = ScientificNameDirtyParser.new
@@ -134,7 +143,9 @@ class ScientificNameParser
134
143
  @parsed = @dirty.parse(salvage_string) || @canonical.parse(a_string) || { :verbatim => a_string }
135
144
  end
136
145
  rescue
137
- @parsed = {:scientificName => {:parsed => false, :verbatim => name, :error => 'Parser error'}}
146
+ @parsed = { scientificName: { parsed: false,
147
+ verbatim: name,
148
+ error: 'Parser error' } }
138
149
  end
139
150
  end
140
151
 
@@ -142,7 +153,8 @@ class ScientificNameParser
142
153
  @verbatim = a_string
143
154
  end
144
155
 
145
- def @parsed.all(verbatim = @verbatim)
156
+ def @parsed.all(opts = {})
157
+ canonical_with_rank = !!opts[:canonical_with_rank]
146
158
  parsed = self.class != Hash
147
159
  res = { :parsed => parsed, :parser_version => ScientificNameParser::VERSION}
148
160
  if parsed
@@ -159,8 +171,10 @@ class ScientificNameParser
159
171
  else
160
172
  res.merge!(self)
161
173
  end
174
+ if canonical_with_rank && canonical.count(" ") > 1 && res[:details][0][:infraspecies]
175
+ ScientificNameParser.add_rank_to_canonical(res)
176
+ end
162
177
  res = {:scientificName => res}
163
- res
164
178
  end
165
179
 
166
180
  def @parsed.pos_json
@@ -172,7 +186,21 @@ class ScientificNameParser
172
186
  end
173
187
 
174
188
  @parsed.verbatim = @verbatim
175
- @parsed.all
189
+ @parsed.all(canonical_with_rank: @canonical_with_rank)
190
+ end
191
+
192
+ private
193
+
194
+ def self.add_rank_to_canonical(parsed)
195
+ parts = parsed[:canonical].split(" ")
196
+ name_ary = parts[0..1]
197
+ parsed[:details][0][:infraspecies].each do |data|
198
+ infrasp = data[:string]
199
+ rank = data[:rank]
200
+ name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
201
+ end
202
+ parsed[:canonical] = name_ary.join(" ")
176
203
  end
204
+
177
205
  end
178
206
 
@@ -262,6 +262,28 @@ grammar ScientificNameClean
262
262
  end
263
263
  }
264
264
  /
265
+ a:genus space b:infragenus space aid:annotation_identification? space c:species space aid:annotation_identification space d:infraspecies_mult {
266
+ def value
267
+ a.value + " " + b.value + " " + c.value + " " + d.value
268
+ end
269
+
270
+ def canonical
271
+ a.canonical + " " + c.canonical + " " + d.canonical
272
+ end
273
+
274
+ def pos
275
+ a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
276
+ end
277
+
278
+ def hybrid
279
+ c.hybrid rescue false
280
+ end
281
+
282
+ def details
283
+ a.details.merge(b.details).merge(c.details).merge(d.details)
284
+ end
285
+ }
286
+ /
265
287
  a:genus space b:infragenus space aid:annotation_identification? space c:species {
266
288
  def value
267
289
  if defined? aid.apply
@@ -26,6 +26,7 @@ describe ScientificNameParser do
26
26
  JSON.load(json(y[:name])).should == JSON.load(y[:jsn]) unless y[:comment]
27
27
  end
28
28
  end
29
+
29
30
 
30
31
  # it 'should generate new test_file' do
31
32
  # new_test = open(File.expand_path(dir + "../../spec/parser/test_data_new.txt"),'w')
@@ -57,6 +58,36 @@ describe ScientificNameParser do
57
58
  end
58
59
  end
59
60
 
61
+ describe "ScientificNameParser with ranked canonicals" do
62
+ before(:all) do
63
+ @parser = ScientificNameParser.new(canonical_with_rank: true)
64
+ end
65
+
66
+ it 'should not influence output for uninomials and binomials' do
67
+ data = [
68
+ ['Ekbainacanthus Yakowlew 1902','Ekbainacanthus'],
69
+ ['Ekboarmia sagnesi herrerai Exposito 2007', 'Ekboarmia sagnesi herrerai'],
70
+ ['Ekboarmia holli Oberthür', 'Ekboarmia holli']]
71
+
72
+ data.each do |d|
73
+ parsed = @parser.parse(d[0])[:scientificName][:canonical]
74
+ parsed.should == d[1]
75
+ end
76
+ end
77
+
78
+ it 'should preserve rank for ranked multinomials' do
79
+ data = [
80
+ ['Cola cordifolia var. puberula A. Chev.', 'Cola cordifolia var. puberula'],
81
+ ['Abies homolepis forma umbilicata (Mayr) Schelle', 'Abies homolepis forma umbilicata'],
82
+ ['Quercus ilex ssp. ballota (Desf.) Samp', 'Quercus ilex ssp. ballota']
83
+ ]
84
+ data.each do |d|
85
+ parsed = @parser.parse(d[0])[:scientificName][:canonical]
86
+ parsed.should == d[1]
87
+ end
88
+ end
89
+
90
+ end
60
91
 
61
92
  describe ParallelParser do
62
93
  it "should find number of cpus" do
@@ -190,6 +190,8 @@ Endoxyla sp. GM-, 2003|{"scientificName":{"parsed":true, "parser_version":"test_
190
190
  Liopropoma sp.2 Not applicable|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Liopropoma sp.2 Not applicable", "normalized":"Liopropoma", "canonical":"Liopropoma", "hybrid":false, "details":[{"genus":{"string":"Liopropoma"}, "annotation_identification":"sp.", "ignored":{"unparsed":"2 Not applicable"}}], "parser_run":1, "positions":{"0":["genus", 10], "11":["annotation_identification", 14]}}}
191
191
  Lacanobia nr. subjuncta Bold:Aab, 0925|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Lacanobia nr. subjuncta Bold:Aab, 0925", "normalized":"Lacanobia", "canonical":"Lacanobia", "hybrid":false, "details":[{"genus":{"string":"Lacanobia"}, "annotation_identification":"nr.", "ignored":{"species":{"string":"subjuncta", "authorship":"Bold:Aab", "basionymAuthorTeam":{"authorTeam":"Bold:Aab", "author":["Bold:Aab"]}}}}], "parser_run":2, "positions":{"0":["genus", 9], "10":["annotation_identification", 13]}}}
192
192
  Lacanobia sp. nr. subjuncta Bold:Aab, 0925|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Lacanobia sp. nr. subjuncta Bold:Aab, 0925", "normalized":"Lacanobia", "canonical":"Lacanobia", "hybrid":false, "details":[{"genus":{"string":"Lacanobia"}, "annotation_identification":"sp. nr.", "ignored":{"species":{"string":"subjuncta", "authorship":"Bold:Aab", "basionymAuthorTeam":{"authorTeam":"Bold:Aab", "author":["Bold:Aab"]}}}}], "parser_run":2, "positions":{"0":["genus", 9], "10":["annotation_identification", 17]}}}
193
+ #Larus occidentalis cf. wymani|{}
194
+ Calidris cf. cooperi|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Calidris cf. cooperi", "normalized":"Calidris cf. cooperi", "canonical":"Calidris cooperi", "hybrid":false, "details":[{"genus":{"string":"Calidris"}, "annotation_identification":"cf.", "species":{"species":{"string":"cooperi"}}}], "parser_run":1, "positions":{"0":["genus", 8], "9":["annotation_identification", 12], "13":["species", 20]}}}
193
195
  #TODO:Gemmula cf. cosmoi NP-2008 -- generates wrong authorship
194
196
 
195
197
  #unknown authorship
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biodiversity19
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.3
4
+ version: 1.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-08 00:00:00.000000000 Z
12
+ date: 2013-03-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: treetop
@@ -178,7 +178,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
178
178
  version: '0'
179
179
  segments:
180
180
  - 0
181
- hash: -2308575381079309035
181
+ hash: 3489764594482913391
182
182
  required_rubygems_version: !ruby/object:Gem::Requirement
183
183
  none: false
184
184
  requirements: