biodiversity 1.0.10 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,22 +9,44 @@ require 'json'
9
9
  module PreProcessor
10
10
  NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
11
11
  TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
12
- TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
13
- TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
12
+ TAXON_CONCEPTS2 = /\s+
13
+ (\(?s\.\s?s\.|
14
+ \(?s\.\s?l\.|
15
+ \(?s\.\s?str\.|
16
+ \(?s\.\s?lat\.|
17
+ sec\.|sec|near)\b.*$/x
18
+ TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
14
19
  NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
15
- LAST_WORD_JUNK = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
16
-
20
+ LAST_WORD_JUNK = /(,\s*|\s+)
21
+ (spp\.|spp|var\.|
22
+ var|von|van|ined\.|
23
+ ined|sensu|new|non|nec|
24
+ nudum|cf\.|cf|sp\.|sp|
25
+ ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
26
+
17
27
  def self.clean(a_string)
18
- [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
28
+ [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
29
+ TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
19
30
  a_string = a_string.gsub(i, '')
20
31
  end
21
32
  a_string = a_string.tr('ſ','s') #old 's'
22
33
  a_string
23
- end
34
+ end
24
35
  end
25
36
 
37
+ # Public: Parser which runs in parallel.
38
+ #
39
+ # Examples
40
+ #
41
+ # parser = ParallelParser.new(4)
42
+ # parser.parse(['Betula L.', 'Pardosa moesta'])
26
43
  class ParallelParser
27
44
 
45
+ # Public: Initialize ParallelParser.
46
+ #
47
+ # processes_num - an Integer to setup the number of processes (default: nil).
48
+ # If processes number is not set it will be determined
49
+ # automatically.
28
50
  def initialize(processes_num = nil)
29
51
  require 'parallel'
30
52
  cpu_num
@@ -35,11 +57,32 @@ class ParallelParser
35
57
  end
36
58
  end
37
59
 
60
+ # Public: Parses an array of scientific names using several processes
61
+ # in parallel.
62
+ #
63
+ # Scientific names are deduplicated in the process, so every string is
64
+ # parsed only once.
65
+ #
66
+ # names_list - takes an Array of scientific names,
67
+ # each element should be a String.
68
+ #
69
+ # Examples
70
+ #
71
+ # parser = ParallelParser.new(4)
72
+ # parser.parse(['Homo sapiens L.', 'Quercus quercus'])
73
+ #
74
+ # Returns a Hash with scientific names as a key, and parsing results as
75
+ # a value.
38
76
  def parse(names_list)
39
- parsed = Parallel.map(names_list.uniq, :in_processes => @processes_num) { |n| [n, parse_process(n)] }
77
+ parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
78
+ [n, parse_process(n)]
79
+ end
40
80
  parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
41
81
  end
42
82
 
83
+ # Public: Returns the number of cores/CPUs.
84
+ #
85
+ # Returns Integer of cores/CPUs.
43
86
  def cpu_num
44
87
  @cpu_num ||= Parallel.processor_count
45
88
  end
@@ -47,7 +90,7 @@ class ParallelParser
47
90
  private
48
91
  def parse_process(name)
49
92
  p = ScientificNameParser.new
50
- p.parse(name) rescue {:scientificName => {:parsed => false, :verbatim => name, :error => 'Parser error'}}
93
+ p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
51
94
  end
52
95
  end
53
96
 
@@ -58,15 +101,64 @@ end
58
101
  # @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
59
102
  # @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
60
103
  # @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
61
- # @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
104
+ # @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
105
+ # viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
62
106
  # @parsed = nil
63
107
  # end
64
108
  # end
65
109
 
66
110
  class ScientificNameParser
67
- VERSION = open(File.join(File.dirname(__FILE__), '..', '..', 'VERSION')).readline.strip
68
-
69
- def initialize
111
+ VERSION = open(File.join(File.dirname(__FILE__),
112
+ '..',
113
+ '..',
114
+ 'VERSION')).readline.strip
115
+
116
+ FAILED_RESULT = ->(name) do
117
+ { scientificName:
118
+ { parsed: false, verbatim: name.to_s.strip, error: 'Parser error' }
119
+ }
120
+ end
121
+
122
+ def self.version
123
+ VERSION
124
+ end
125
+
126
+ def self.fix_case(name_string)
127
+ name_ary = name_string.split(/\s+/)
128
+ words_num = name_ary.size
129
+ res = nil
130
+ if words_num == 1
131
+ res = name_ary[0].gsub(/[\(\)\{\}]/, '')
132
+ if res.size > 1
133
+ res = UnicodeUtils.upcase(res[0]) + UnicodeUtils.downcase(res[1..-1])
134
+ else
135
+ res = nil
136
+ end
137
+ else
138
+ if name_ary[0].size > 1
139
+ word1 = UnicodeUtils.upcase(name_ary[0][0]) +
140
+ UnicodeUtils.downcase(name_ary[0][1..-1])
141
+ else
142
+ word1 = name_ary[0]
143
+ end
144
+ if name_ary[1].match(/^\(/)
145
+ word2 = name_ary[1].gsub(/\)$/, '') + ')'
146
+ word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
147
+ UnicodeUtils.downcase(word2[2..-1])
148
+ else
149
+ word2 = UnicodeUtils.downcase(name_ary[1])
150
+ end
151
+ res = word1 + ' ' +
152
+ word2 + ' ' +
153
+ name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(' ')
154
+ res.strip!
155
+ end
156
+ res
157
+ end
158
+
159
+
160
+ def initialize(opts = {})
161
+ @canonical_with_rank = !!opts[:canonical_with_rank]
70
162
  @verbatim = ''
71
163
  @clean = ScientificNameCleanParser.new
72
164
  @dirty = ScientificNameDirtyParser.new
@@ -75,8 +167,12 @@ class ScientificNameParser
75
167
  end
76
168
 
77
169
  def virus?(a_string)
78
- !!(a_string.match(/\sICTV\s*$/) || a_string.match(/\b(virus|viruses|phage|phages|viroid|viroids|satellite|satellites|prion|prions)\b/i))
79
- end
170
+ !!(a_string.match(/\sICTV\s*$/) ||
171
+ a_string.match(/\b(virus|viruses|
172
+ phage|phages|viroid|viroids|
173
+ satellite|satellites|prion|prions)\b/ix) ||
174
+ a_string.match(/[A-Z]?[a-z]+virus\b/))
175
+ end
80
176
 
81
177
  def unknown_placement?(a_string)
82
178
  !!(a_string.match(/incertae\s+sedis/i) || a_string.match(/inc\.\s*sed\./i))
@@ -85,54 +181,85 @@ class ScientificNameParser
85
181
  def parsed
86
182
  @parsed
87
183
  end
88
-
184
+
89
185
  def parse(a_string)
90
- @verbatim = a_string
186
+ @verbatim = a_string.strip
91
187
  a_string = PreProcessor::clean(a_string)
92
-
188
+
93
189
  if virus?(a_string)
94
- @parsed = { :verbatim => a_string, :virus => true }
190
+ @parsed = { verbatim: a_string, virus: true }
95
191
  elsif unknown_placement?(a_string)
96
- @parsed = { :verbatim => a_string }
192
+ @parsed = { verbatim: a_string }
97
193
  else
98
- @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
194
+ begin
195
+ @parsed = @clean.parse(a_string) || @dirty.parse(a_string)
196
+ unless @parsed
197
+ index = @dirty.index || @clean.index
198
+ salvage_match = a_string[0..index].split(/\s+/)[0..-2]
199
+ salvage_string = salvage_match ? salvage_match.join(' ') : a_string
200
+ @parsed = @dirty.parse(salvage_string) ||
201
+ @canonical.parse(a_string) ||
202
+ { verbatim: a_string }
203
+ end
204
+ rescue
205
+ @parsed = FAILED_RESULT.(@verbatim)
206
+ end
99
207
  end
100
208
 
101
209
  def @parsed.verbatim=(a_string)
102
210
  @verbatim = a_string
103
211
  end
104
212
 
105
- def @parsed.all(verbatim = @verbatim)
213
+ def @parsed.all(opts = {})
214
+ canonical_with_rank = !!opts[:canonical_with_rank]
106
215
  parsed = self.class != Hash
107
- res = { :parsed => parsed, :parser_version => ScientificNameParser::VERSION}
216
+ res = { parsed: parsed, parser_version: ScientificNameParser::VERSION}
108
217
  if parsed
109
218
  hybrid = self.hybrid rescue false
110
219
  res.merge!({
111
- :verbatim => @verbatim,
112
- :normalized => self.value,
113
- :canonical => self.canonical,
114
- :hybrid => hybrid,
115
- :details => self.details,
116
- :parser_run => self.parser_run,
117
- :positions => self.pos
220
+ verbatim: @verbatim,
221
+ normalized: self.value,
222
+ canonical: self.canonical,
223
+ hybrid: hybrid,
224
+ details: self.details,
225
+ parser_run: self.parser_run,
226
+ positions: self.pos
118
227
  })
119
228
  else
120
229
  res.merge!(self)
121
230
  end
231
+ if (canonical_with_rank &&
232
+ canonical.count(' ') > 1 &&
233
+ res[:details][0][:infraspecies])
234
+ ScientificNameParser.add_rank_to_canonical(res)
235
+ end
122
236
  res = {:scientificName => res}
123
- res
124
237
  end
125
-
238
+
126
239
  def @parsed.pos_json
127
240
  self.pos.to_json rescue ''
128
241
  end
129
-
242
+
130
243
  def @parsed.all_json
131
244
  self.all.to_json rescue ''
132
245
  end
133
246
 
134
247
  @parsed.verbatim = @verbatim
135
- @parsed.all
248
+ @parsed.all(canonical_with_rank: @canonical_with_rank)
249
+ end
250
+
251
+ private
252
+
253
+ def self.add_rank_to_canonical(parsed)
254
+ parts = parsed[:canonical].split(' ')
255
+ name_ary = parts[0..1]
256
+ parsed[:details][0][:infraspecies].each do |data|
257
+ infrasp = data[:string]
258
+ rank = data[:rank]
259
+ name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
260
+ end
261
+ parsed[:canonical] = name_ary.join(' ')
136
262
  end
263
+
137
264
  end
138
265
 
@@ -40,7 +40,7 @@ grammar ScientificNameCanonical
40
40
  end
41
41
 
42
42
  def canonical
43
- a.canonical + " " + b.canonical + " " + c.canonical
43
+ a.canonical + " " + c.canonical
44
44
  end
45
45
 
46
46
  def pos
@@ -58,7 +58,7 @@ grammar ScientificNameCanonical
58
58
  end
59
59
 
60
60
  def canonical
61
- a.canonical + " " + b.canonical
61
+ a.canonical
62
62
  end
63
63
 
64
64
  def pos
@@ -110,6 +110,8 @@ grammar ScientificNameCanonical
110
110
  end
111
111
 
112
112
  rule garbage
113
+ space "$$g@rbg3$$"
114
+ /
113
115
  space (["',.]) space [^щ]*
114
116
  /
115
117
  space_hard [^ш]+
@@ -1,4 +1,6 @@
1
1
  # encoding: UTF-8
2
+ require 'unicode_utils'
3
+
2
4
  grammar ScientificNameClean
3
5
 
4
6
  rule root
@@ -6,19 +8,19 @@ grammar ScientificNameClean
6
8
  def value
7
9
  a.value.gsub(/\s{2,}/, ' ').strip
8
10
  end
9
-
11
+
10
12
  def canonical
11
13
  a.canonical.gsub(/\s{2,}/, ' ').strip
12
14
  end
13
-
15
+
14
16
  def pos
15
17
  a.pos
16
18
  end
17
-
19
+
18
20
  def hybrid
19
21
  a.hybrid
20
22
  end
21
-
23
+
22
24
  def details
23
25
  a.details.class == Array ? a.details : [a.details]
24
26
  end
@@ -28,25 +30,25 @@ grammar ScientificNameClean
28
30
  end
29
31
  }
30
32
  end
31
-
33
+
32
34
  rule scientific_name_5
33
35
  a:multinomial_name space_hard hybrid_character space_hard b:species {
34
36
  def value
35
37
  a.value + " × " + b.value
36
38
  end
37
-
39
+
38
40
  def canonical
39
41
  a.canonical + " × " + b.canonical
40
42
  end
41
-
43
+
42
44
  def pos
43
45
  a.pos.merge(b.pos)
44
46
  end
45
-
47
+
46
48
  def hybrid
47
49
  true
48
50
  end
49
-
51
+
50
52
  def details
51
53
  [a.details, b.details.merge({:genus => a.details[:genus]})]
52
54
  end
@@ -56,19 +58,19 @@ grammar ScientificNameClean
56
58
  def value
57
59
  a.value + " " + b.apply(c)
58
60
  end
59
-
61
+
60
62
  def canonical
61
63
  a.canonical
62
64
  end
63
-
65
+
64
66
  def pos
65
67
  a.pos.merge(c.pos)
66
68
  end
67
-
69
+
68
70
  def hybrid
69
71
  a.hybrid
70
72
  end
71
-
73
+
72
74
  def details
73
75
  a.details.merge(b.details(c))
74
76
  end
@@ -76,25 +78,25 @@ grammar ScientificNameClean
76
78
  /
77
79
  scientific_name_4
78
80
  end
79
-
81
+
80
82
  rule scientific_name_4
81
83
  a:scientific_name_1 space hybrid_character space b:scientific_name_1 {
82
84
  def value
83
85
  a.value + " × " + b.value
84
86
  end
85
-
87
+
86
88
  def canonical
87
89
  a.canonical + " × " + b.canonical
88
90
  end
89
-
91
+
90
92
  def pos
91
93
  a.pos.merge(b.pos)
92
94
  end
93
-
95
+
94
96
  def hybrid
95
97
  true
96
98
  end
97
-
99
+
98
100
  def details
99
101
  [a.details, b.details]
100
102
  end
@@ -104,19 +106,19 @@ grammar ScientificNameClean
104
106
  def value
105
107
  a.value + " × ?"
106
108
  end
107
-
109
+
108
110
  def canonical
109
111
  a.canonical
110
112
  end
111
-
113
+
112
114
  def pos
113
115
  a.pos
114
116
  end
115
-
117
+
116
118
  def hybrid
117
119
  true
118
120
  end
119
-
121
+
120
122
  def details
121
123
  [a.details, "?"]
122
124
  end
@@ -124,25 +126,25 @@ grammar ScientificNameClean
124
126
  /
125
127
  scientific_name_3
126
128
  end
127
-
129
+
128
130
  rule scientific_name_3
129
131
  a:hybrid_character space b:scientific_name_2 {
130
132
  def value
131
133
  a.value + " " + b.value
132
134
  end
133
-
135
+
134
136
  def canonical
135
137
  b.canonical
136
138
  end
137
-
139
+
138
140
  def pos
139
141
  b.pos
140
142
  end
141
-
143
+
142
144
  def hybrid
143
145
  true
144
146
  end
145
-
147
+
146
148
  def details
147
149
  b.details
148
150
  end
@@ -150,25 +152,25 @@ grammar ScientificNameClean
150
152
  /
151
153
  scientific_name_2
152
154
  end
153
-
155
+
154
156
  rule scientific_name_2
155
157
  a:scientific_name_1 space b:status_part {
156
158
  def value
157
159
  a.value + " " + b.value
158
160
  end
159
-
161
+
160
162
  def canonical
161
163
  a.canonical
162
164
  end
163
-
165
+
164
166
  def pos
165
167
  a.pos
166
168
  end
167
-
169
+
168
170
  def hybrid
169
171
  a.hybrid rescue false
170
172
  end
171
-
173
+
172
174
  def details
173
175
  a.details.merge(b.details)
174
176
  end
@@ -178,12 +180,14 @@ grammar ScientificNameClean
178
180
  end
179
181
 
180
182
  rule scientific_name_1
183
+ multiuninomial_name
184
+ /
181
185
  multinomial_name
182
186
  /
183
- uninomial_name
187
+ uninomial_name
184
188
  end
185
-
186
-
189
+
190
+
187
191
  rule status_part
188
192
  a:status_word space b:status_part {
189
193
  def value
@@ -196,7 +200,7 @@ grammar ScientificNameClean
196
200
  /
197
201
  status_word
198
202
  end
199
-
203
+
200
204
  rule status_word
201
205
  latin_word [\.] {
202
206
  def value
@@ -209,114 +213,239 @@ grammar ScientificNameClean
209
213
  #/
210
214
  #latin_word
211
215
  end
212
-
213
-
216
+
217
+ rule unparsed
218
+ .+ space {
219
+
220
+ def value
221
+ ''
222
+ end
223
+
224
+ def hybrid
225
+ false
226
+ end
227
+
228
+ def canonical
229
+ ''
230
+ end
231
+
232
+ def pos
233
+ {interval.begin => ['unparsed', interval.end]}
234
+ end
235
+
236
+ def details
237
+ {:unparsed => text_value}
238
+ end
239
+ }
240
+ end
241
+
214
242
  rule multinomial_name
215
- a:genus space b:infragenus space species_prefix? space c:species space_hard d:infraspecies_mult {
243
+ a:genus space b:infragenus space aid:annotation_identification? space c:species space_hard d:infraspecies_mult {
216
244
  def value
217
245
  a.value + " " + b.value + " " + c.value + " " + d.value
218
246
  end
219
-
247
+
220
248
  def canonical
221
249
  a.canonical + " " + c.canonical + " " + d.canonical
222
250
  end
223
-
251
+
224
252
  def pos
225
253
  a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
226
254
  end
227
-
255
+
228
256
  def hybrid
229
257
  c.hybrid rescue false
230
258
  end
231
-
259
+
232
260
  def details
233
261
  a.details.merge(b.details).merge(c.details).merge(d.details)
234
262
  end
235
263
  }
236
- /
237
- a:genus space b:infragenus space species_prefix? space c:species {
264
+ /
265
+ a:genus space b:infragenus space aid:annotation_identification? space c:species space aid:annotation_identification space d:infraspecies_mult {
238
266
  def value
239
- a.value + " " + b.value + " " + c.value
267
+ a.value + " " + b.value + " " + c.value + " " + d.value
240
268
  end
241
-
269
+
242
270
  def canonical
243
- a.canonical + " " + c.canonical
271
+ a.canonical + " " + c.canonical + " " + d.canonical
244
272
  end
245
-
273
+
246
274
  def pos
247
- a.pos.merge(b.pos).merge(c.pos)
275
+ a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
276
+ end
277
+
278
+ def hybrid
279
+ c.hybrid rescue false
280
+ end
281
+
282
+ def details
283
+ a.details.merge(b.details).merge(c.details).merge(d.details)
284
+ end
285
+ }
286
+ /
287
+ a:genus space b:infragenus space aid:annotation_identification? space c:species {
288
+ def value
289
+ if defined? aid.apply
290
+ a.value + " " + b.value + aid.apply(c)
291
+ else
292
+ a.value + " " + b.value + " " + c.value
293
+ end
248
294
  end
249
-
295
+
296
+ def canonical
297
+ if defined? aid.apply
298
+ a.canonical + aid.canonical(c)
299
+ else
300
+ a.canonical + " " + c.canonical
301
+ end
302
+ end
303
+
304
+ def pos
305
+ if defined? aid.apply
306
+ a.pos.merge(b.pos).merge(aid.pos(c))
307
+ else
308
+ a.pos.merge(b.pos).merge(c.pos)
309
+ end
310
+ end
311
+
250
312
  def hybrid
251
313
  c.hybrid rescue false
252
314
  end
253
-
315
+
254
316
  def details
255
- a.details.merge(b.details).merge(c.details)
317
+ if defined? aid.apply
318
+ a.details.merge(b.details).merge(aid.apply(c))
319
+ else
320
+ a.details.merge(b.details).merge(c.details)
321
+ end
256
322
  end
257
323
  }
258
324
  /
259
- a:genus space species_prefix? space b:species space_hard c:infraspecies_mult {
325
+ a:genus space aid:annotation_identification? space b:species space_hard c:infraspecies_mult {
260
326
  def value
261
- a.value + " " + b.value + " " + c.value
327
+ a.value + " " + b.value + " " + c.value
262
328
  end
263
329
 
264
330
  def canonical
265
331
  a.canonical + " " + b.canonical + " " + c.canonical
266
332
  end
267
-
333
+
268
334
  def pos
269
335
  a.pos.merge(b.pos).merge(c.pos)
270
336
  end
271
-
337
+
272
338
  def hybrid
273
339
  b.hybrid rescue false
274
340
  end
275
-
341
+
276
342
  def details
277
343
  a.details.merge(b.details).merge(c.details)
278
344
  end
279
345
  }
280
346
  /
281
- a:genus space species_prefix? space b:species {
347
+ a:genus space aid:annotation_identification? space b:species {
282
348
  def value
283
- a.value + " " + b.value
349
+ if defined? aid.apply
350
+ a.value + aid.apply(b)
351
+ else
352
+ a.value + " " + b.value
353
+ end
284
354
  end
285
355
 
286
356
  def canonical
287
- a.canonical + " " + b.canonical
357
+ if defined? aid.apply
358
+ a.canonical + aid.canonical(b)
359
+ else
360
+ a.canonical + " " + b.canonical
361
+ end
288
362
  end
289
-
363
+
290
364
  def pos
291
- a.pos.merge(b.pos)
365
+ if defined? aid.apply
366
+ a.pos.merge(aid.pos(b))
367
+ else
368
+ a.pos.merge(b.pos)
369
+ end
292
370
  end
293
-
371
+
294
372
  def hybrid
295
373
  b.hybrid rescue false
296
374
  end
297
-
375
+
298
376
  def details
299
- a.details.merge(b.details)
377
+ if defined? aid.apply
378
+ a.details.merge(aid.details(b))
379
+ else
380
+ a.details.merge(b.details)
381
+ end
382
+ end
383
+ }
384
+ /
385
+ a:genus space aid:annotation_identification space b:unparsed {
386
+ def value
387
+ a.value + aid.apply(b)
388
+ end
389
+
390
+ def canonical
391
+ a.canonical + aid.canonical(b)
392
+ end
393
+
394
+ def pos
395
+ a.pos.merge(aid.pos(b))
396
+ end
397
+
398
+ def hybrid
399
+ false
400
+ end
401
+
402
+ def details
403
+ a.details.merge(aid.details(b))
300
404
  end
301
405
  }
302
406
  end
303
-
407
+
408
+ rule multiuninomial_name
409
+ a:uninomial_name space b:rank_uninomial space c:uninomial_name {
410
+
411
+ def value
412
+ a.value + " " + b.value + " " + c.value
413
+ end
414
+
415
+ def canonical
416
+ a.canonical
417
+ end
418
+
419
+ def hybrid
420
+ false
421
+ end
422
+
423
+ def pos
424
+ a.pos.merge(b.pos(c))
425
+ end
426
+
427
+ def details
428
+ a.details.merge(b.details(c))
429
+ end
430
+ }
431
+ end
432
+
304
433
  rule infraspecies_mult
305
434
  a:infraspecies space b:infraspecies_mult {
306
435
  def value
307
436
  a.value + " " + b.value
308
437
  end
309
-
438
+
310
439
  def canonical
311
440
  a.canonical + " " + b.canonical
312
441
  end
313
-
442
+
314
443
  def pos
315
444
  a.pos.merge(b.pos)
316
445
  end
317
-
446
+
318
447
  def details
319
- a_array = a.details[:infraspecies].class == Array ? a.details[:infraspecies] : [a.details[:infraspecies]]
448
+ a_array = a.details[:infraspecies].class == Array ? a.details[:infraspecies] : [a.details[:infraspecies]]
320
449
  b_array = b.details[:infraspecies].class == Array ? b.details[:infraspecies] : [b.details[:infraspecies]]
321
450
  a.details.merge({:infraspecies => a_array + b_array})
322
451
  end
@@ -324,70 +453,98 @@ grammar ScientificNameClean
324
453
  /
325
454
  infraspecies {
326
455
  def details
327
- {:infraspecies => [super[:infraspecies]]}
456
+ if super[:annotation_identification]
457
+ {:infraspecies => [{:annotation_identification => super[:annotation_identification], :ignored => super[:ignored]}]}
458
+ else
459
+ {:infraspecies => [super[:infraspecies]]}
460
+ end
328
461
  end
329
462
  }
330
463
  end
331
-
464
+
332
465
  rule infraspecies
333
466
  a:infraspecies_string space b:authorship {
334
467
  def value
335
468
  a.value + " " + b.value
336
469
  end
337
-
470
+
338
471
  def canonical
339
472
  a.canonical
340
473
  end
341
-
474
+
342
475
  def pos
343
476
  a.pos.merge(b.pos)
344
477
  end
345
-
478
+
346
479
  def details
347
480
  {:infraspecies => a.details[:infraspecies].merge(b.details)}
348
481
  end
349
482
  }
350
483
  /
351
- infraspecies_string
484
+ infraspecies_string
352
485
  end
353
-
486
+
354
487
  rule infraspecies_string
355
- sel:rank space_hard a:species_word {
356
- def value
488
+ sel:rank space a:species_word {
489
+ def value
357
490
  sel.apply(a)
358
491
  end
359
492
  def canonical
360
493
  sel.canonical(a)
361
494
  end
362
-
495
+
363
496
  def pos
364
497
  sel.pos(a)
365
498
  end
366
-
499
+
367
500
  def details
368
501
  sel.details(a)
369
502
  end
370
503
  }
371
504
  /
372
- species_word ![\.] {
505
+ aid:annotation_identification space a:species_word ![\.] {
373
506
  def value
374
- text_value
507
+ aid.apply(a)
508
+ end
509
+
510
+ def canonical
511
+ aid.canonical(a)
512
+ end
513
+
514
+ def pos
515
+ def a.pos
516
+ {interval.begin => ['infraspecies', a.interval.end]}
517
+ end
518
+ aid.pos(a)
519
+ end
520
+
521
+ def details
522
+ def a.details
523
+ {:infraspecies => {:string => value, :rank => 'n/a'}}
524
+ end
525
+ aid.details(a)
375
526
  end
376
-
527
+ }
528
+ /
529
+ a:species_word ![\.] {
530
+ def value
531
+ a.value
532
+ end
533
+
377
534
  def canonical
378
535
  value
379
536
  end
380
-
537
+
381
538
  def pos
382
539
  {interval.begin => ['infraspecies', interval.end]}
383
540
  end
384
-
541
+
385
542
  def details
386
543
  {:infraspecies => {:string => value, :rank => 'n/a'}}
387
544
  end
388
545
  }
389
546
  end
390
-
547
+
391
548
  rule taxon_concept_rank
392
549
  ("sec."/"sensu.") {
393
550
  def value
@@ -398,77 +555,70 @@ grammar ScientificNameClean
398
555
  end
399
556
  def details(a = nil)
400
557
  {:taxon_concept => a.details}
401
- end
558
+ end
402
559
  }
403
560
  end
404
-
561
+
405
562
  rule rank
406
- ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
407
- /"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
563
+ ("morph."/"f.sp."/"B "/"ssp."/"ssp "/"mut."/"nat "/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var "/"subsp."/"subsp "/"subf."/"race "/"forma "/"fma."/"fma "/"form."/"form "/"fo."/"fo"/"f."/"α"/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
408
564
  {
409
565
  def value
410
566
  text_value.strip
411
567
  end
412
568
 
413
569
  def apply(a)
414
- " " + text_value + " " + a.value
570
+ " " + text_value.strip + " " + a.value
415
571
  end
416
572
 
417
573
  def canonical(a)
418
574
  " " + a.value
419
575
  end
420
-
576
+
421
577
  def pos(a)
422
- {interval.begin => ['infraspecific_type', interval.end], a.interval.begin => ['infraspecies', a.interval.end]}
578
+ interval_end = text_value[-1] == ' ' ? interval.end - 1 : interval.end
579
+ {interval.begin => ['infraspecific_type', interval_end], a.interval.begin => ['infraspecies', a.interval.end]}
423
580
  end
424
-
581
+
425
582
  def details(a = nil)
426
- {:infraspecies => {:string => (a.value rescue nil), :rank => text_value}}
583
+ {:infraspecies => {:string => (a.value rescue nil), :rank => text_value.strip}}
427
584
  end
428
585
  }
429
- /
430
- rank_forma
431
586
  end
432
-
433
- rule rank_forma
434
- ("forma"/"form."/"form"/"fo."/"f.")
435
- {
587
+
588
+ rule rank_uninomial
589
+ ("sect."/"sect "/"subsect."/"subsect "/"trib."/"trib "/"subtrib."/"subtrib "/"ser."/"ser "/"subgen."/"subgen "/"fam."/"fam "/"subfam."/"subfam "/"supertrib."/"supertrib ") {
436
590
  def value
437
- "f."
438
- end
439
- def apply(a)
440
- " " + value + " " + a.value
441
- end
442
- def canonical(a)
443
- " " + a.value
591
+ text_value.strip
444
592
  end
445
- def pos(a)
446
- {interval.begin => ['infraspecific_type', interval.end], a.interval.begin => ['infraspecies', a.interval.end]}
593
+
594
+ def pos(uni)
595
+ {interval.begin => ['rank_uninomial', interval.end], uni.interval.begin => ['uninomial', uni.interval.end]}
447
596
  end
448
- def details(a = nil)
449
- {:infraspecies => {:string => (a.value rescue nil), :rank => value}}
597
+
598
+ def details(uni)
599
+ {:rank_uninomials => value, :uninomial2 => uni.details[:uninomial]}
450
600
  end
451
601
  }
452
602
  end
453
-
603
+
454
604
  rule species
455
605
  a:species_string space b:authorship {
456
606
  def value
457
607
  a.value + " " + b.value
458
608
  end
459
-
609
+
460
610
  def canonical
461
611
  a.canonical
462
612
  end
463
-
613
+
464
614
  def hybrid
465
615
  a.hybrid rescue false
466
616
  end
467
-
617
+
468
618
  def pos
469
619
  a.pos.merge(b.pos)
470
620
  end
471
-
621
+
472
622
  def details
473
623
  {:species => a.details[:species].merge(b.details)}
474
624
  end
@@ -476,43 +626,21 @@ grammar ScientificNameClean
476
626
  /
477
627
  species_string
478
628
  end
479
-
629
+
480
630
  rule species_string
481
- # a:species_word &(space_hard author_prefix_word space_hard) {
482
- # def value
483
- # a.value
484
- # end
485
- #
486
- # def canonical
487
- # a.value
488
- # end
489
- #
490
- # def hybrid
491
- # a.hybrid rescue false
492
- # end
493
- #
494
- # def pos
495
- # {a.interval.begin => ['species', a.interval.end]}
496
- # end
497
- #
498
- # def details
499
- # {:species => {:string => a.value}}
500
- # end
501
- # }
502
- # /
503
631
  species_word {
504
632
  def canonical
505
633
  value
506
634
  end
507
-
635
+
508
636
  def pos
509
637
  {interval.begin => ['species', interval.end]}
510
638
  end
511
-
639
+
512
640
  def hybrid
513
641
  false
514
642
  end
515
-
643
+
516
644
  def details
517
645
  {:species => {:string => value}}
518
646
  end
@@ -520,65 +648,85 @@ grammar ScientificNameClean
520
648
  /
521
649
  species_word_hybrid
522
650
  end
523
-
651
+
524
652
  rule infragenus
525
653
  left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
526
654
  def value
527
655
  "(" + a.value + ")"
528
656
  end
529
-
657
+
530
658
  def canonical
531
659
  a.value
532
660
  end
533
-
661
+
534
662
  def pos
535
663
  {a.interval.begin => ['infragenus', a.interval.end]}
536
664
  end
537
-
665
+
538
666
  def details
539
667
  {:infragenus => {:string => a.value}}
540
668
  end
541
669
  }
542
670
  end
543
-
671
+
544
672
  rule genus
545
- a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
673
+ a:(abbreviated_genus/uninomial_string) !(space_hard author_prefix_word space_hard author_word) {
546
674
  def value
547
675
  a.value
548
676
  end
549
-
677
+
550
678
  def pos
551
679
  {a.interval.begin => ['genus', a.interval.end]}
552
680
  end
553
-
681
+
554
682
  def canonical
555
683
  a.value
556
684
  end
557
-
685
+
558
686
  def details
559
687
  {:genus => {:string => a.value}}
560
688
  end
561
689
  }
562
690
  end
563
-
691
+
692
+ rule abbreviated_genus
693
+ [A-Z] [a-z]? [a-z]? [\\.] space {
694
+ def value
695
+ text_value.strip
696
+ end
697
+
698
+ def canonical
699
+ value
700
+ end
701
+
702
+ def pos
703
+ {interval.begin => ["abbreviated_genus", interval.end]}
704
+ end
705
+
706
+ def details
707
+ {:abbreviated_genus => {:string => value}}
708
+ end
709
+ }
710
+ end
711
+
564
712
  rule uninomial_name
565
713
  a:uninomial_string space b:infragenus space c:simple_authorship {
566
714
  def value
567
715
  a.value + " " + b.value + " " + c.value
568
716
  end
569
-
717
+
570
718
  def canonical
571
719
  a.canonical
572
720
  end
573
-
721
+
574
722
  def pos
575
723
  a.pos.merge(b.pos).merge(c.pos)
576
724
  end
577
-
725
+
578
726
  def hybrid
579
727
  false
580
728
  end
581
-
729
+
582
730
  def details
583
731
  {:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
584
732
  end
@@ -588,19 +736,19 @@ grammar ScientificNameClean
588
736
  def value
589
737
  a.value + " " + b.value
590
738
  end
591
-
739
+
592
740
  def canonical
593
741
  a.canonical
594
742
  end
595
-
743
+
596
744
  def pos
597
745
  a.pos.merge(b.pos)
598
746
  end
599
-
747
+
600
748
  def hybrid
601
749
  false
602
750
  end
603
-
751
+
604
752
  def details
605
753
  {:uninomial => a.details[:uninomial].merge(b.details)}
606
754
  end
@@ -610,19 +758,19 @@ grammar ScientificNameClean
610
758
  def value
611
759
  a.value + " " + b.value
612
760
  end
613
-
761
+
614
762
  def canonical
615
763
  a.canonical
616
764
  end
617
-
765
+
618
766
  def pos
619
767
  a.pos.merge(b.pos)
620
768
  end
621
-
769
+
622
770
  def hybrid
623
771
  false
624
772
  end
625
-
773
+
626
774
  def details
627
775
  {:uninomial => a.details[:uninomial].merge(b.details)}
628
776
  end
@@ -636,31 +784,31 @@ grammar ScientificNameClean
636
784
  def canonical
637
785
  value
638
786
  end
639
-
787
+
640
788
  def pos
641
789
  {interval.begin => ['uninomial', interval.end]}
642
790
  end
643
-
791
+
644
792
  def hybrid
645
793
  false
646
794
  end
647
-
648
- def details
795
+
796
+ def details
649
797
  {:uninomial => {:string => value}}
650
798
  end
651
799
  }
652
800
  end
653
-
801
+
654
802
  rule authorship
655
803
  a:basionym_authorship_with_parenthesis space b:simple_authorship ","? space c:ex_authorship {
656
804
  def value
657
805
  a.value + " " + b.value + " " + c.value
658
806
  end
659
-
807
+
660
808
  def pos
661
809
  a.pos.merge(b.pos).merge(c.pos)
662
810
  end
663
-
811
+
664
812
  def details
665
813
  val = {:authorship => text_value.strip, :combinationAuthorTeam => b.details[:basionymAuthorTeam], :basionymAuthorTeam => a.details[:basionymAuthorTeam]}
666
814
  val[:combinationAuthorTeam].merge!(c.details)
@@ -672,11 +820,11 @@ grammar ScientificNameClean
672
820
  def value
673
821
  a.value + " " + b.value
674
822
  end
675
-
823
+
676
824
  def pos
677
825
  a.pos.merge(b.pos)
678
826
  end
679
-
827
+
680
828
  def details
681
829
  {:authorship => text_value.strip, :combinationAuthorTeam => b.details[:basionymAuthorTeam], :basionymAuthorTeam => a.details[:basionymAuthorTeam]}
682
830
  end
@@ -688,11 +836,11 @@ grammar ScientificNameClean
688
836
  def value
689
837
  a.value + " " + b.value
690
838
  end
691
-
839
+
692
840
  def pos
693
841
  a.pos.merge(b.pos)
694
842
  end
695
-
843
+
696
844
  def details
697
845
  val = a.details
698
846
  val[:authorship] = text_value.strip
@@ -703,21 +851,21 @@ grammar ScientificNameClean
703
851
  /
704
852
  simple_authorship
705
853
  end
706
-
707
-
854
+
855
+
708
856
  rule basionym_authorship_with_parenthesis
709
857
  left_paren space a:authors_names space right_paren space [,]? space b:year {
710
858
  def value
711
859
  "(" + a.value + " " + b.value + ")"
712
860
  end
713
-
861
+
714
862
  def pos
715
863
  a.pos.merge(b.pos)
716
- end
717
-
864
+ end
865
+
718
866
  def details
719
- { :authorship => text_value,
720
- :basionymAuthorTeam => {:author_team => text_value}.merge(a.details).merge(b.details)
867
+ { :authorship => text_value,
868
+ :basionymAuthorTeam => {:author_team => text_value}.merge(a.details).merge(b.details)
721
869
  }
722
870
  end
723
871
  }
@@ -726,11 +874,11 @@ grammar ScientificNameClean
726
874
  def value
727
875
  "(" + a.value + " " + b.value + ")"
728
876
  end
729
-
877
+
730
878
  def pos
731
879
  a.pos.merge(b.pos)
732
880
  end
733
-
881
+
734
882
  def details
735
883
  val = a.details
736
884
  val[:basionymAuthorTeam].merge!(b.details)
@@ -743,15 +891,15 @@ grammar ScientificNameClean
743
891
  def value
744
892
  "(" + a.value + ")"
745
893
  end
746
-
894
+
747
895
  def pos
748
896
  a.pos
749
897
  end
750
-
898
+
751
899
  def details
752
900
  val = a.details
753
901
  val[:authorship] = text_value
754
- val
902
+ val
755
903
  end
756
904
  }
757
905
  /
@@ -759,32 +907,32 @@ grammar ScientificNameClean
759
907
  def value
760
908
  "(?)"
761
909
  end
762
-
910
+
763
911
  def pos
764
912
  {a.interval.begin => ['unknown_author', a.interval.end]}
765
913
  end
766
-
914
+
767
915
  def details
768
916
  {:authorship => text_value, :basionymAuthorTeam => {:authorTeam => text_value, :author => ['?']}}
769
917
  end
770
918
  }
771
919
  end
772
-
920
+
773
921
  rule ex_authorship
774
922
  ex_sep space b:simple_authorship {
775
923
  def value
776
924
  " ex " + b.value
777
925
  end
778
-
926
+
779
927
  def pos
780
928
  b.pos
781
929
  end
782
-
930
+
783
931
  def details
784
932
  val = {:exAuthorTeam => {:authorTeam => b.text_value.strip}.merge(b.details[:basionymAuthorTeam])}
785
933
  val
786
934
  end
787
- }
935
+ }
788
936
  end
789
937
 
790
938
  rule simple_authorship
@@ -792,17 +940,17 @@ grammar ScientificNameClean
792
940
  def value
793
941
  a.value + " " + b.value
794
942
  end
795
-
943
+
796
944
  def pos
797
945
  a.pos.merge(b.pos)
798
946
  end
799
-
947
+
800
948
  def details
801
949
  details_with_arg(:basionymAuthorTeam)
802
950
  end
803
-
951
+
804
952
  def details_with_arg(authorTeamType = 'basionymAuthorTeam')
805
- { :authorship => text_value,
953
+ { :authorship => text_value,
806
954
  authorTeamType.to_sym => {
807
955
  :authorTeam => a.text_value.strip
808
956
  }.merge(a.details).merge(b.details)
@@ -814,17 +962,17 @@ grammar ScientificNameClean
814
962
  def value
815
963
  a.value + " " + b.value
816
964
  end
817
-
965
+
818
966
  def pos
819
967
  a.pos.merge(b.pos)
820
968
  end
821
-
969
+
822
970
  def details
823
971
  details_with_arg(:basionymAuthorTeam)
824
972
  end
825
-
973
+
826
974
  def details_with_arg(authorTeamType = 'basionymAuthorTeam')
827
- { :authorship => text_value,
975
+ { :authorship => text_value,
828
976
  authorTeamType.to_sym => {
829
977
  :authorTeam => a.text_value.strip
830
978
  }.merge(a.details).merge(b.details)
@@ -838,27 +986,27 @@ grammar ScientificNameClean
838
986
  details[:basionymAuthorTeam].merge!(super)
839
987
  details
840
988
  end
841
-
989
+
842
990
  def details_with_arg(authorTeamType = 'basionymAuthorTeam')
843
- { :authorship => text_value,
991
+ { :authorship => text_value,
844
992
  authorTeamType.to_sym => {
845
993
  :authorTeam => text_value,
846
994
  }
847
- }
995
+ }
848
996
  end
849
997
  }
850
998
  end
851
-
999
+
852
1000
  rule authors_names
853
1001
  a:author_name space sep:author_separator space b:authors_names {
854
1002
  def value
855
1003
  sep.apply(a,b)
856
1004
  end
857
-
1005
+
858
1006
  def pos
859
1007
  sep.pos(a,b)
860
1008
  end
861
-
1009
+
862
1010
  def details
863
1011
  sep.details(a,b)
864
1012
  end
@@ -868,40 +1016,40 @@ grammar ScientificNameClean
868
1016
  /
869
1017
  unknown_auth
870
1018
  end
871
-
872
-
1019
+
1020
+
873
1021
  rule unknown_auth
874
- ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
1022
+ ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") !latin_word {
875
1023
  def value
876
1024
  text_value
877
1025
  end
878
-
1026
+
879
1027
  def pos
880
1028
  {interval.begin => ['unknown_author', interval.end]}
881
1029
  end
882
-
1030
+
883
1031
  def details
884
1032
  {:author => ["unknown"]}
885
1033
  end
886
1034
  }
887
1035
  end
888
-
1036
+
889
1037
  rule ex_sep
890
1038
  ("ex"/"in") &[\s]
891
1039
  end
892
-
1040
+
893
1041
  rule author_separator
894
- ("&"/","/"and"/"et") {
1042
+ ("&amp;"/"&"/","/"and"/"et") {
895
1043
  def apply(a,b)
896
1044
  sep = text_value.strip
897
- sep = " et" if ["&","and","et"].include? sep
1045
+ sep = " &" if ["&amp;", "&","and","et"].include? sep
898
1046
  a.value + sep + " " + b.value
899
1047
  end
900
-
1048
+
901
1049
  def pos(a,b)
902
1050
  a.pos.merge(b.pos)
903
1051
  end
904
-
1052
+
905
1053
  def details(a,b)
906
1054
  {:author => a.details[:author] + b.details[:author]}
907
1055
  end
@@ -913,8 +1061,8 @@ grammar ScientificNameClean
913
1061
  def value
914
1062
  a.value + ' ' + b.value
915
1063
  end
916
-
917
- def pos
1064
+
1065
+ def pos
918
1066
  a.pos.merge(b.pos)
919
1067
  end
920
1068
 
@@ -925,17 +1073,17 @@ grammar ScientificNameClean
925
1073
  /
926
1074
  author_name_without_postfix
927
1075
  end
928
-
1076
+
929
1077
  rule author_name_without_postfix
930
1078
  space a:author_prefix_word space b:author_name {
931
1079
  def value
932
1080
  a.value + " " + b.value
933
1081
  end
934
-
1082
+
935
1083
  def pos
936
1084
  a.pos.merge(b.pos)
937
1085
  end
938
-
1086
+
939
1087
  def details
940
1088
  {:author => [value]}
941
1089
  end
@@ -945,11 +1093,11 @@ grammar ScientificNameClean
945
1093
  def value
946
1094
  a.value + " " + b.value
947
1095
  end
948
-
1096
+
949
1097
  def pos
950
1098
  a.pos.merge(b.pos)
951
1099
  end
952
-
1100
+
953
1101
  def details
954
1102
  {:author => [value]}
955
1103
  end
@@ -957,17 +1105,17 @@ grammar ScientificNameClean
957
1105
  /
958
1106
  author_word
959
1107
  end
960
-
1108
+
961
1109
  rule author_word
962
1110
  "A S. Xu" {
963
1111
  def value
964
1112
  text_value.strip
965
1113
  end
966
-
1114
+
967
1115
  def pos
968
1116
  {interval.begin => ['author_word', 1], (interval.begin + 2) => ['author_word', 2], (interval.begin + 5) => ['author_word', 2]}
969
1117
  end
970
-
1118
+
971
1119
  def details
972
1120
  {:author => [value]}
973
1121
  end
@@ -977,26 +1125,28 @@ grammar ScientificNameClean
977
1125
  def value
978
1126
  text_value.strip
979
1127
  end
980
-
1128
+
981
1129
  def pos
982
1130
  #cheating because there are several words in some of them
983
1131
  {interval.begin => ['author_word', interval.end]}
984
1132
  end
985
-
1133
+
986
1134
  def details
987
1135
  {:author => [value]}
988
1136
  end
989
1137
  }
990
- /
1138
+ /
991
1139
  ("Å"/"Ö"/"Á"/"Ø"/"Ô"/"Š"/"Ś"/"Č"/"Ķ"/"Ł"/"É"/"Ž"/[A-W]/[Y-Z]) [^0-9\[\]\(\)\s&,]* {
992
1140
  def value
993
- text_value
1141
+ text_value.gsub(/([\p{Lu}]{3,})/) do |match|
1142
+ UnicodeUtils.titlecase(match)
1143
+ end
994
1144
  end
995
-
1145
+
996
1146
  def pos
997
1147
  {interval.begin => ['author_word', interval.end]}
998
1148
  end
999
-
1149
+
1000
1150
  def details
1001
1151
  {:author => [value]}
1002
1152
  end
@@ -1006,11 +1156,11 @@ grammar ScientificNameClean
1006
1156
  def value
1007
1157
  text_value
1008
1158
  end
1009
-
1159
+
1010
1160
  def pos
1011
1161
  {interval.begin => ['author_word', interval.end]}
1012
1162
  end
1013
-
1163
+
1014
1164
  def details
1015
1165
  {:author => [value]}
1016
1166
  end
@@ -1018,13 +1168,13 @@ grammar ScientificNameClean
1018
1168
  /
1019
1169
  author_prefix_word
1020
1170
  end
1021
-
1171
+
1022
1172
  rule author_prefix_word
1023
- space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
1173
+ space ("ab"/"af"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
1024
1174
  def value
1025
1175
  text_value
1026
1176
  end
1027
-
1177
+
1028
1178
  def pos
1029
1179
  #cheating because there are several words in some of them
1030
1180
  {interval.begin => ['author_word', interval.end]}
@@ -1034,7 +1184,7 @@ grammar ScientificNameClean
1034
1184
 
1035
1185
  rule author_postfix_word
1036
1186
  ("f."/"filius") {
1037
- def value
1187
+ def value
1038
1188
  text_value.strip
1039
1189
  end
1040
1190
 
@@ -1043,7 +1193,7 @@ grammar ScientificNameClean
1043
1193
  end
1044
1194
  }
1045
1195
  end
1046
-
1196
+
1047
1197
  rule cap_latin_word_pair
1048
1198
  a:cap_latin_word "-" b:cap_latin_word {
1049
1199
  def value
@@ -1051,7 +1201,7 @@ grammar ScientificNameClean
1051
1201
  end
1052
1202
  }
1053
1203
  end
1054
-
1204
+
1055
1205
  rule cap_latin_word
1056
1206
  a:([A-Z]/cap_digraph) b:latin_word "?" {
1057
1207
  def value
@@ -1091,19 +1241,19 @@ grammar ScientificNameClean
1091
1241
  def value
1092
1242
  a.value + " " + b.value
1093
1243
  end
1094
-
1244
+
1095
1245
  def canonical
1096
1246
  b.value
1097
1247
  end
1098
-
1248
+
1099
1249
  def hybrid
1100
1250
  true
1101
1251
  end
1102
-
1252
+
1103
1253
  def pos
1104
1254
  {b.interval.begin => ['species', b.interval.end]}
1105
1255
  end
1106
-
1256
+
1107
1257
  def details
1108
1258
  {:species => {:string => b.value}}
1109
1259
  end
@@ -1113,19 +1263,19 @@ grammar ScientificNameClean
1113
1263
  def value
1114
1264
  "× " + b.value
1115
1265
  end
1116
-
1266
+
1117
1267
  def canonical
1118
1268
  b.value
1119
1269
  end
1120
-
1270
+
1121
1271
  def hybrid
1122
1272
  true
1123
1273
  end
1124
-
1274
+
1125
1275
  def pos
1126
1276
  {b.interval.begin => ['species', b.interval.end]}
1127
1277
  end
1128
-
1278
+
1129
1279
  def details
1130
1280
  {:species => {:string => b.value}}
1131
1281
  end
@@ -1135,29 +1285,74 @@ grammar ScientificNameClean
1135
1285
  def value
1136
1286
  "× " + b.value
1137
1287
  end
1138
-
1288
+
1139
1289
  def canonical
1140
1290
  b.value
1141
1291
  end
1142
-
1292
+
1143
1293
  def hybrid
1144
1294
  true
1145
1295
  end
1146
-
1296
+
1147
1297
  def pos
1148
1298
  {b.interval.begin => ['species', b.interval.end]}
1149
1299
  end
1150
-
1300
+
1151
1301
  def details
1152
1302
  {:species => {:string => b.value}}
1153
1303
  end
1154
1304
  }
1155
1305
  end
1156
1306
 
1157
- rule species_prefix
1158
- ("aff."/"corrig."/"?") &space_hard
1307
+ rule annotation_identification
1308
+ ("sp.nr."/"sp. nr."/"nr."/"nr "/"sp.aff."/"sp. aff."/"sp."/"sp "/"species"/"spp."/"spp "/"aff."/"aff "/"monst."/"? ") {
1309
+
1310
+ def value
1311
+ text_value.strip
1312
+ end
1313
+
1314
+ def apply(sp)
1315
+ ''
1316
+ end
1317
+
1318
+ def canonical(sp)
1319
+ ''
1320
+ end
1321
+
1322
+ def pos(sp)
1323
+ interval_end = text_value[-1] == ' ' ? interval.end - 1 : interval.end
1324
+ {interval.begin => ['annotation_identification', interval.end]}
1325
+ end
1326
+
1327
+ def details(sp)
1328
+ {:annotation_identification => value, :ignored => sp.details}
1329
+ end
1330
+ }
1331
+ /
1332
+ ("cf."/"cf ") {
1333
+ def value
1334
+ text_value.strip
1335
+ end
1336
+
1337
+ def apply(sp)
1338
+ ' ' + value + ' ' + sp.value
1339
+ end
1340
+
1341
+ def canonical(sp)
1342
+ ' ' + sp.canonical
1343
+ end
1344
+
1345
+ def pos(sp)
1346
+ interval_end = text_value[-1] == ' ' ? interval.end - 1 : interval.end
1347
+ {interval.begin => ['annotation_identification', interval.end]}.merge(sp.pos)
1348
+ end
1349
+
1350
+ def details(sp)
1351
+ {:annotation_identification => value, :species => sp.details}
1352
+ end
1353
+ }
1159
1354
  end
1160
-
1355
+
1161
1356
  rule species_word
1162
1357
  a:[0-9]+ "-"? b:latin_word {
1163
1358
  def value
@@ -1177,6 +1372,12 @@ grammar ScientificNameClean
1177
1372
  end
1178
1373
  }
1179
1374
  /
1375
+ "o\'donelli" {
1376
+ def value
1377
+ "odonelli"
1378
+ end
1379
+ }
1380
+ /
1180
1381
  a:valid_name_letter b:valid_name_letters {
1181
1382
  def value
1182
1383
  a.value + b.value
@@ -1191,9 +1392,9 @@ grammar ScientificNameClean
1191
1392
  text_value.split('').each do |l|
1192
1393
  l = 'ae' if l == 'æ'
1193
1394
  l = 'oe' if l == 'œ'
1194
- # not sure if we should normalize ë as well. It is legal in botanical code, but it
1195
- # might be beneficial to normalize it for the reconsiliation purposes
1196
- # l = 'e' if l == 'ë'
1395
+ # We normalize ë as well. It is legal in botanical code, but it
1396
+ # is beneficial to normalize it for the reconsiliation purposes
1397
+ l = 'e' if l == 'ë'
1197
1398
  res << l
1198
1399
  end
1199
1400
  res
@@ -1207,6 +1408,7 @@ grammar ScientificNameClean
1207
1408
  res = text_value
1208
1409
  res = 'ae' if res == 'æ'
1209
1410
  res = 'oe' if res == 'œ'
1411
+ res = 'e' if res == 'ë'
1210
1412
  res
1211
1413
  end
1212
1414
  }
@@ -1224,7 +1426,7 @@ grammar ScientificNameClean
1224
1426
  def value
1225
1427
  'Oe'
1226
1428
  end
1227
- }
1429
+ }
1228
1430
  end
1229
1431
 
1230
1432
  rule year
@@ -1232,14 +1434,14 @@ grammar ScientificNameClean
1232
1434
  def value
1233
1435
  a.value
1234
1436
  end
1235
-
1437
+
1236
1438
  def pos
1237
1439
  a.pos
1238
1440
  end
1239
-
1441
+
1240
1442
  def details
1241
1443
  a.details
1242
- end
1444
+ end
1243
1445
  }
1244
1446
  /
1245
1447
  year_number_with_character
@@ -1262,31 +1464,31 @@ grammar ScientificNameClean
1262
1464
  end
1263
1465
  }
1264
1466
  end
1265
-
1467
+
1266
1468
  rule year_number
1267
- [12] [7890] [0-9] [0-9]? [\?]? {
1469
+ [12] [7890] [0-9] ([0-9] [\?]?/"?") {
1268
1470
  def value
1269
1471
  text_value
1270
1472
  end
1271
-
1473
+
1272
1474
  def pos
1273
1475
  {interval.begin => ['year', interval.end]}
1274
1476
  end
1275
-
1477
+
1276
1478
  def details
1277
1479
  {:year => value}
1278
1480
  end
1279
1481
  }
1280
1482
  end
1281
-
1483
+
1282
1484
  rule left_paren
1283
1485
  "("
1284
1486
  end
1285
-
1487
+
1286
1488
  rule right_paren
1287
1489
  ")"
1288
1490
  end
1289
-
1491
+
1290
1492
  rule hybrid_character
1291
1493
  ("x"/"X") {
1292
1494
  def value
@@ -1296,7 +1498,7 @@ grammar ScientificNameClean
1296
1498
  /
1297
1499
  multiplication_sign
1298
1500
  end
1299
-
1501
+
1300
1502
  rule multiplication_sign
1301
1503
  ("×"/"*") {
1302
1504
  def value
@@ -1304,7 +1506,7 @@ grammar ScientificNameClean
1304
1506
  end
1305
1507
  }
1306
1508
  end
1307
-
1509
+
1308
1510
  rule space
1309
1511
  [\s]*
1310
1512
  end
@@ -1312,5 +1514,5 @@ grammar ScientificNameClean
1312
1514
  rule space_hard
1313
1515
  [\s]+
1314
1516
  end
1315
-
1517
+
1316
1518
  end