biodiversity 1.0.10 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,22 +9,44 @@ require 'json'
9
9
  module PreProcessor
10
10
  NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
11
11
  TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
12
- TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
13
- TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
12
+ TAXON_CONCEPTS2 = /\s+
13
+ (\(?s\.\s?s\.|
14
+ \(?s\.\s?l\.|
15
+ \(?s\.\s?str\.|
16
+ \(?s\.\s?lat\.|
17
+ sec\.|sec|near)\b.*$/x
18
+ TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
14
19
  NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
15
- LAST_WORD_JUNK = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
16
-
20
+ LAST_WORD_JUNK = /(,\s*|\s+)
21
+ (spp\.|spp|var\.|
22
+ var|von|van|ined\.|
23
+ ined|sensu|new|non|nec|
24
+ nudum|cf\.|cf|sp\.|sp|
25
+ ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
26
+
17
27
  def self.clean(a_string)
18
- [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
28
+ [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
29
+ TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
19
30
  a_string = a_string.gsub(i, '')
20
31
  end
21
32
  a_string = a_string.tr('ſ','s') #old 's'
22
33
  a_string
23
- end
34
+ end
24
35
  end
25
36
 
37
+ # Public: Parser which runs in parallel.
38
+ #
39
+ # Examples
40
+ #
41
+ # parser = ParallelParser.new(4)
42
+ # parser.parse(['Betula L.', 'Pardosa moesta'])
26
43
  class ParallelParser
27
44
 
45
+ # Public: Initialize ParallelParser.
46
+ #
47
+ # processes_num - an Integer to setup the number of processes (default: nil).
48
+ # If processes number is not set it will be determined
49
+ # automatically.
28
50
  def initialize(processes_num = nil)
29
51
  require 'parallel'
30
52
  cpu_num
@@ -35,11 +57,32 @@ class ParallelParser
35
57
  end
36
58
  end
37
59
 
60
+ # Public: Parses an array of scientific names using several processes
61
+ # in parallel.
62
+ #
63
+ # Scientific names are deduplicated in the process, so every string is
64
+ # parsed only once.
65
+ #
66
+ # names_list - takes an Array of scientific names,
67
+ # each element should be a String.
68
+ #
69
+ # Examples
70
+ #
71
+ # parser = ParallelParser.new(4)
72
+ # parser.parse(['Homo sapiens L.', 'Quercus quercus'])
73
+ #
74
+ # Returns a Hash with scientific names as a key, and parsing results as
75
+ # a value.
38
76
  def parse(names_list)
39
- parsed = Parallel.map(names_list.uniq, :in_processes => @processes_num) { |n| [n, parse_process(n)] }
77
+ parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
78
+ [n, parse_process(n)]
79
+ end
40
80
  parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
41
81
  end
42
82
 
83
+ # Public: Returns the number of cores/CPUs.
84
+ #
85
+ # Returns Integer of cores/CPUs.
43
86
  def cpu_num
44
87
  @cpu_num ||= Parallel.processor_count
45
88
  end
@@ -47,7 +90,7 @@ class ParallelParser
47
90
  private
48
91
  def parse_process(name)
49
92
  p = ScientificNameParser.new
50
- p.parse(name) rescue {:scientificName => {:parsed => false, :verbatim => name, :error => 'Parser error'}}
93
+ p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
51
94
  end
52
95
  end
53
96
 
@@ -58,15 +101,64 @@ end
58
101
  # @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
59
102
  # @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
60
103
  # @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
61
- # @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
104
+ # @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
105
+ # viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
62
106
  # @parsed = nil
63
107
  # end
64
108
  # end
65
109
 
66
110
  class ScientificNameParser
67
- VERSION = open(File.join(File.dirname(__FILE__), '..', '..', 'VERSION')).readline.strip
68
-
69
- def initialize
111
+ VERSION = open(File.join(File.dirname(__FILE__),
112
+ '..',
113
+ '..',
114
+ 'VERSION')).readline.strip
115
+
116
+ FAILED_RESULT = ->(name) do
117
+ { scientificName:
118
+ { parsed: false, verbatim: name.to_s.strip, error: 'Parser error' }
119
+ }
120
+ end
121
+
122
+ def self.version
123
+ VERSION
124
+ end
125
+
126
+ def self.fix_case(name_string)
127
+ name_ary = name_string.split(/\s+/)
128
+ words_num = name_ary.size
129
+ res = nil
130
+ if words_num == 1
131
+ res = name_ary[0].gsub(/[\(\)\{\}]/, '')
132
+ if res.size > 1
133
+ res = UnicodeUtils.upcase(res[0]) + UnicodeUtils.downcase(res[1..-1])
134
+ else
135
+ res = nil
136
+ end
137
+ else
138
+ if name_ary[0].size > 1
139
+ word1 = UnicodeUtils.upcase(name_ary[0][0]) +
140
+ UnicodeUtils.downcase(name_ary[0][1..-1])
141
+ else
142
+ word1 = name_ary[0]
143
+ end
144
+ if name_ary[1].match(/^\(/)
145
+ word2 = name_ary[1].gsub(/\)$/, '') + ')'
146
+ word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
147
+ UnicodeUtils.downcase(word2[2..-1])
148
+ else
149
+ word2 = UnicodeUtils.downcase(name_ary[1])
150
+ end
151
+ res = word1 + ' ' +
152
+ word2 + ' ' +
153
+ name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(' ')
154
+ res.strip!
155
+ end
156
+ res
157
+ end
158
+
159
+
160
+ def initialize(opts = {})
161
+ @canonical_with_rank = !!opts[:canonical_with_rank]
70
162
  @verbatim = ''
71
163
  @clean = ScientificNameCleanParser.new
72
164
  @dirty = ScientificNameDirtyParser.new
@@ -75,8 +167,12 @@ class ScientificNameParser
75
167
  end
76
168
 
77
169
  def virus?(a_string)
78
- !!(a_string.match(/\sICTV\s*$/) || a_string.match(/\b(virus|viruses|phage|phages|viroid|viroids|satellite|satellites|prion|prions)\b/i))
79
- end
170
+ !!(a_string.match(/\sICTV\s*$/) ||
171
+ a_string.match(/\b(virus|viruses|
172
+ phage|phages|viroid|viroids|
173
+ satellite|satellites|prion|prions)\b/ix) ||
174
+ a_string.match(/[A-Z]?[a-z]+virus\b/))
175
+ end
80
176
 
81
177
  def unknown_placement?(a_string)
82
178
  !!(a_string.match(/incertae\s+sedis/i) || a_string.match(/inc\.\s*sed\./i))
@@ -85,54 +181,85 @@ class ScientificNameParser
85
181
  def parsed
86
182
  @parsed
87
183
  end
88
-
184
+
89
185
  def parse(a_string)
90
- @verbatim = a_string
186
+ @verbatim = a_string.strip
91
187
  a_string = PreProcessor::clean(a_string)
92
-
188
+
93
189
  if virus?(a_string)
94
- @parsed = { :verbatim => a_string, :virus => true }
190
+ @parsed = { verbatim: a_string, virus: true }
95
191
  elsif unknown_placement?(a_string)
96
- @parsed = { :verbatim => a_string }
192
+ @parsed = { verbatim: a_string }
97
193
  else
98
- @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
194
+ begin
195
+ @parsed = @clean.parse(a_string) || @dirty.parse(a_string)
196
+ unless @parsed
197
+ index = @dirty.index || @clean.index
198
+ salvage_match = a_string[0..index].split(/\s+/)[0..-2]
199
+ salvage_string = salvage_match ? salvage_match.join(' ') : a_string
200
+ @parsed = @dirty.parse(salvage_string) ||
201
+ @canonical.parse(a_string) ||
202
+ { verbatim: a_string }
203
+ end
204
+ rescue
205
+ @parsed = FAILED_RESULT.(@verbatim)
206
+ end
99
207
  end
100
208
 
101
209
  def @parsed.verbatim=(a_string)
102
210
  @verbatim = a_string
103
211
  end
104
212
 
105
- def @parsed.all(verbatim = @verbatim)
213
+ def @parsed.all(opts = {})
214
+ canonical_with_rank = !!opts[:canonical_with_rank]
106
215
  parsed = self.class != Hash
107
- res = { :parsed => parsed, :parser_version => ScientificNameParser::VERSION}
216
+ res = { parsed: parsed, parser_version: ScientificNameParser::VERSION}
108
217
  if parsed
109
218
  hybrid = self.hybrid rescue false
110
219
  res.merge!({
111
- :verbatim => @verbatim,
112
- :normalized => self.value,
113
- :canonical => self.canonical,
114
- :hybrid => hybrid,
115
- :details => self.details,
116
- :parser_run => self.parser_run,
117
- :positions => self.pos
220
+ verbatim: @verbatim,
221
+ normalized: self.value,
222
+ canonical: self.canonical,
223
+ hybrid: hybrid,
224
+ details: self.details,
225
+ parser_run: self.parser_run,
226
+ positions: self.pos
118
227
  })
119
228
  else
120
229
  res.merge!(self)
121
230
  end
231
+ if (canonical_with_rank &&
232
+ canonical.count(' ') > 1 &&
233
+ res[:details][0][:infraspecies])
234
+ ScientificNameParser.add_rank_to_canonical(res)
235
+ end
122
236
  res = {:scientificName => res}
123
- res
124
237
  end
125
-
238
+
126
239
  def @parsed.pos_json
127
240
  self.pos.to_json rescue ''
128
241
  end
129
-
242
+
130
243
  def @parsed.all_json
131
244
  self.all.to_json rescue ''
132
245
  end
133
246
 
134
247
  @parsed.verbatim = @verbatim
135
- @parsed.all
248
+ @parsed.all(canonical_with_rank: @canonical_with_rank)
249
+ end
250
+
251
+ private
252
+
253
+ def self.add_rank_to_canonical(parsed)
254
+ parts = parsed[:canonical].split(' ')
255
+ name_ary = parts[0..1]
256
+ parsed[:details][0][:infraspecies].each do |data|
257
+ infrasp = data[:string]
258
+ rank = data[:rank]
259
+ name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
260
+ end
261
+ parsed[:canonical] = name_ary.join(' ')
136
262
  end
263
+
137
264
  end
138
265
 
@@ -40,7 +40,7 @@ grammar ScientificNameCanonical
40
40
  end
41
41
 
42
42
  def canonical
43
- a.canonical + " " + b.canonical + " " + c.canonical
43
+ a.canonical + " " + c.canonical
44
44
  end
45
45
 
46
46
  def pos
@@ -58,7 +58,7 @@ grammar ScientificNameCanonical
58
58
  end
59
59
 
60
60
  def canonical
61
- a.canonical + " " + b.canonical
61
+ a.canonical
62
62
  end
63
63
 
64
64
  def pos
@@ -110,6 +110,8 @@ grammar ScientificNameCanonical
110
110
  end
111
111
 
112
112
  rule garbage
113
+ space "$$g@rbg3$$"
114
+ /
113
115
  space (["',.]) space [^щ]*
114
116
  /
115
117
  space_hard [^ш]+
@@ -1,4 +1,6 @@
1
1
  # encoding: UTF-8
2
+ require 'unicode_utils'
3
+
2
4
  grammar ScientificNameClean
3
5
 
4
6
  rule root
@@ -6,19 +8,19 @@ grammar ScientificNameClean
6
8
  def value
7
9
  a.value.gsub(/\s{2,}/, ' ').strip
8
10
  end
9
-
11
+
10
12
  def canonical
11
13
  a.canonical.gsub(/\s{2,}/, ' ').strip
12
14
  end
13
-
15
+
14
16
  def pos
15
17
  a.pos
16
18
  end
17
-
19
+
18
20
  def hybrid
19
21
  a.hybrid
20
22
  end
21
-
23
+
22
24
  def details
23
25
  a.details.class == Array ? a.details : [a.details]
24
26
  end
@@ -28,25 +30,25 @@ grammar ScientificNameClean
28
30
  end
29
31
  }
30
32
  end
31
-
33
+
32
34
  rule scientific_name_5
33
35
  a:multinomial_name space_hard hybrid_character space_hard b:species {
34
36
  def value
35
37
  a.value + " × " + b.value
36
38
  end
37
-
39
+
38
40
  def canonical
39
41
  a.canonical + " × " + b.canonical
40
42
  end
41
-
43
+
42
44
  def pos
43
45
  a.pos.merge(b.pos)
44
46
  end
45
-
47
+
46
48
  def hybrid
47
49
  true
48
50
  end
49
-
51
+
50
52
  def details
51
53
  [a.details, b.details.merge({:genus => a.details[:genus]})]
52
54
  end
@@ -56,19 +58,19 @@ grammar ScientificNameClean
56
58
  def value
57
59
  a.value + " " + b.apply(c)
58
60
  end
59
-
61
+
60
62
  def canonical
61
63
  a.canonical
62
64
  end
63
-
65
+
64
66
  def pos
65
67
  a.pos.merge(c.pos)
66
68
  end
67
-
69
+
68
70
  def hybrid
69
71
  a.hybrid
70
72
  end
71
-
73
+
72
74
  def details
73
75
  a.details.merge(b.details(c))
74
76
  end
@@ -76,25 +78,25 @@ grammar ScientificNameClean
76
78
  /
77
79
  scientific_name_4
78
80
  end
79
-
81
+
80
82
  rule scientific_name_4
81
83
  a:scientific_name_1 space hybrid_character space b:scientific_name_1 {
82
84
  def value
83
85
  a.value + " × " + b.value
84
86
  end
85
-
87
+
86
88
  def canonical
87
89
  a.canonical + " × " + b.canonical
88
90
  end
89
-
91
+
90
92
  def pos
91
93
  a.pos.merge(b.pos)
92
94
  end
93
-
95
+
94
96
  def hybrid
95
97
  true
96
98
  end
97
-
99
+
98
100
  def details
99
101
  [a.details, b.details]
100
102
  end
@@ -104,19 +106,19 @@ grammar ScientificNameClean
104
106
  def value
105
107
  a.value + " × ?"
106
108
  end
107
-
109
+
108
110
  def canonical
109
111
  a.canonical
110
112
  end
111
-
113
+
112
114
  def pos
113
115
  a.pos
114
116
  end
115
-
117
+
116
118
  def hybrid
117
119
  true
118
120
  end
119
-
121
+
120
122
  def details
121
123
  [a.details, "?"]
122
124
  end
@@ -124,25 +126,25 @@ grammar ScientificNameClean
124
126
  /
125
127
  scientific_name_3
126
128
  end
127
-
129
+
128
130
  rule scientific_name_3
129
131
  a:hybrid_character space b:scientific_name_2 {
130
132
  def value
131
133
  a.value + " " + b.value
132
134
  end
133
-
135
+
134
136
  def canonical
135
137
  b.canonical
136
138
  end
137
-
139
+
138
140
  def pos
139
141
  b.pos
140
142
  end
141
-
143
+
142
144
  def hybrid
143
145
  true
144
146
  end
145
-
147
+
146
148
  def details
147
149
  b.details
148
150
  end
@@ -150,25 +152,25 @@ grammar ScientificNameClean
150
152
  /
151
153
  scientific_name_2
152
154
  end
153
-
155
+
154
156
  rule scientific_name_2
155
157
  a:scientific_name_1 space b:status_part {
156
158
  def value
157
159
  a.value + " " + b.value
158
160
  end
159
-
161
+
160
162
  def canonical
161
163
  a.canonical
162
164
  end
163
-
165
+
164
166
  def pos
165
167
  a.pos
166
168
  end
167
-
169
+
168
170
  def hybrid
169
171
  a.hybrid rescue false
170
172
  end
171
-
173
+
172
174
  def details
173
175
  a.details.merge(b.details)
174
176
  end
@@ -178,12 +180,14 @@ grammar ScientificNameClean
178
180
  end
179
181
 
180
182
  rule scientific_name_1
183
+ multiuninomial_name
184
+ /
181
185
  multinomial_name
182
186
  /
183
- uninomial_name
187
+ uninomial_name
184
188
  end
185
-
186
-
189
+
190
+
187
191
  rule status_part
188
192
  a:status_word space b:status_part {
189
193
  def value
@@ -196,7 +200,7 @@ grammar ScientificNameClean
196
200
  /
197
201
  status_word
198
202
  end
199
-
203
+
200
204
  rule status_word
201
205
  latin_word [\.] {
202
206
  def value
@@ -209,114 +213,239 @@ grammar ScientificNameClean
209
213
  #/
210
214
  #latin_word
211
215
  end
212
-
213
-
216
+
217
+ rule unparsed
218
+ .+ space {
219
+
220
+ def value
221
+ ''
222
+ end
223
+
224
+ def hybrid
225
+ false
226
+ end
227
+
228
+ def canonical
229
+ ''
230
+ end
231
+
232
+ def pos
233
+ {interval.begin => ['unparsed', interval.end]}
234
+ end
235
+
236
+ def details
237
+ {:unparsed => text_value}
238
+ end
239
+ }
240
+ end
241
+
214
242
  rule multinomial_name
215
- a:genus space b:infragenus space species_prefix? space c:species space_hard d:infraspecies_mult {
243
+ a:genus space b:infragenus space aid:annotation_identification? space c:species space_hard d:infraspecies_mult {
216
244
  def value
217
245
  a.value + " " + b.value + " " + c.value + " " + d.value
218
246
  end
219
-
247
+
220
248
  def canonical
221
249
  a.canonical + " " + c.canonical + " " + d.canonical
222
250
  end
223
-
251
+
224
252
  def pos
225
253
  a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
226
254
  end
227
-
255
+
228
256
  def hybrid
229
257
  c.hybrid rescue false
230
258
  end
231
-
259
+
232
260
  def details
233
261
  a.details.merge(b.details).merge(c.details).merge(d.details)
234
262
  end
235
263
  }
236
- /
237
- a:genus space b:infragenus space species_prefix? space c:species {
264
+ /
265
+ a:genus space b:infragenus space aid:annotation_identification? space c:species space aid:annotation_identification space d:infraspecies_mult {
238
266
  def value
239
- a.value + " " + b.value + " " + c.value
267
+ a.value + " " + b.value + " " + c.value + " " + d.value
240
268
  end
241
-
269
+
242
270
  def canonical
243
- a.canonical + " " + c.canonical
271
+ a.canonical + " " + c.canonical + " " + d.canonical
244
272
  end
245
-
273
+
246
274
  def pos
247
- a.pos.merge(b.pos).merge(c.pos)
275
+ a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
276
+ end
277
+
278
+ def hybrid
279
+ c.hybrid rescue false
280
+ end
281
+
282
+ def details
283
+ a.details.merge(b.details).merge(c.details).merge(d.details)
284
+ end
285
+ }
286
+ /
287
+ a:genus space b:infragenus space aid:annotation_identification? space c:species {
288
+ def value
289
+ if defined? aid.apply
290
+ a.value + " " + b.value + aid.apply(c)
291
+ else
292
+ a.value + " " + b.value + " " + c.value
293
+ end
248
294
  end
249
-
295
+
296
+ def canonical
297
+ if defined? aid.apply
298
+ a.canonical + aid.canonical(c)
299
+ else
300
+ a.canonical + " " + c.canonical
301
+ end
302
+ end
303
+
304
+ def pos
305
+ if defined? aid.apply
306
+ a.pos.merge(b.pos).merge(aid.pos(c))
307
+ else
308
+ a.pos.merge(b.pos).merge(c.pos)
309
+ end
310
+ end
311
+
250
312
  def hybrid
251
313
  c.hybrid rescue false
252
314
  end
253
-
315
+
254
316
  def details
255
- a.details.merge(b.details).merge(c.details)
317
+ if defined? aid.apply
318
+ a.details.merge(b.details).merge(aid.apply(c))
319
+ else
320
+ a.details.merge(b.details).merge(c.details)
321
+ end
256
322
  end
257
323
  }
258
324
  /
259
- a:genus space species_prefix? space b:species space_hard c:infraspecies_mult {
325
+ a:genus space aid:annotation_identification? space b:species space_hard c:infraspecies_mult {
260
326
  def value
261
- a.value + " " + b.value + " " + c.value
327
+ a.value + " " + b.value + " " + c.value
262
328
  end
263
329
 
264
330
  def canonical
265
331
  a.canonical + " " + b.canonical + " " + c.canonical
266
332
  end
267
-
333
+
268
334
  def pos
269
335
  a.pos.merge(b.pos).merge(c.pos)
270
336
  end
271
-
337
+
272
338
  def hybrid
273
339
  b.hybrid rescue false
274
340
  end
275
-
341
+
276
342
  def details
277
343
  a.details.merge(b.details).merge(c.details)
278
344
  end
279
345
  }
280
346
  /
281
- a:genus space species_prefix? space b:species {
347
+ a:genus space aid:annotation_identification? space b:species {
282
348
  def value
283
- a.value + " " + b.value
349
+ if defined? aid.apply
350
+ a.value + aid.apply(b)
351
+ else
352
+ a.value + " " + b.value
353
+ end
284
354
  end
285
355
 
286
356
  def canonical
287
- a.canonical + " " + b.canonical
357
+ if defined? aid.apply
358
+ a.canonical + aid.canonical(b)
359
+ else
360
+ a.canonical + " " + b.canonical
361
+ end
288
362
  end
289
-
363
+
290
364
  def pos
291
- a.pos.merge(b.pos)
365
+ if defined? aid.apply
366
+ a.pos.merge(aid.pos(b))
367
+ else
368
+ a.pos.merge(b.pos)
369
+ end
292
370
  end
293
-
371
+
294
372
  def hybrid
295
373
  b.hybrid rescue false
296
374
  end
297
-
375
+
298
376
  def details
299
- a.details.merge(b.details)
377
+ if defined? aid.apply
378
+ a.details.merge(aid.details(b))
379
+ else
380
+ a.details.merge(b.details)
381
+ end
382
+ end
383
+ }
384
+ /
385
+ a:genus space aid:annotation_identification space b:unparsed {
386
+ def value
387
+ a.value + aid.apply(b)
388
+ end
389
+
390
+ def canonical
391
+ a.canonical + aid.canonical(b)
392
+ end
393
+
394
+ def pos
395
+ a.pos.merge(aid.pos(b))
396
+ end
397
+
398
+ def hybrid
399
+ false
400
+ end
401
+
402
+ def details
403
+ a.details.merge(aid.details(b))
300
404
  end
301
405
  }
302
406
  end
303
-
407
+
408
+ rule multiuninomial_name
409
+ a:uninomial_name space b:rank_uninomial space c:uninomial_name {
410
+
411
+ def value
412
+ a.value + " " + b.value + " " + c.value
413
+ end
414
+
415
+ def canonical
416
+ a.canonical
417
+ end
418
+
419
+ def hybrid
420
+ false
421
+ end
422
+
423
+ def pos
424
+ a.pos.merge(b.pos(c))
425
+ end
426
+
427
+ def details
428
+ a.details.merge(b.details(c))
429
+ end
430
+ }
431
+ end
432
+
304
433
  rule infraspecies_mult
305
434
  a:infraspecies space b:infraspecies_mult {
306
435
  def value
307
436
  a.value + " " + b.value
308
437
  end
309
-
438
+
310
439
  def canonical
311
440
  a.canonical + " " + b.canonical
312
441
  end
313
-
442
+
314
443
  def pos
315
444
  a.pos.merge(b.pos)
316
445
  end
317
-
446
+
318
447
  def details
319
- a_array = a.details[:infraspecies].class == Array ? a.details[:infraspecies] : [a.details[:infraspecies]]
448
+ a_array = a.details[:infraspecies].class == Array ? a.details[:infraspecies] : [a.details[:infraspecies]]
320
449
  b_array = b.details[:infraspecies].class == Array ? b.details[:infraspecies] : [b.details[:infraspecies]]
321
450
  a.details.merge({:infraspecies => a_array + b_array})
322
451
  end
@@ -324,70 +453,98 @@ grammar ScientificNameClean
324
453
  /
325
454
  infraspecies {
326
455
  def details
327
- {:infraspecies => [super[:infraspecies]]}
456
+ if super[:annotation_identification]
457
+ {:infraspecies => [{:annotation_identification => super[:annotation_identification], :ignored => super[:ignored]}]}
458
+ else
459
+ {:infraspecies => [super[:infraspecies]]}
460
+ end
328
461
  end
329
462
  }
330
463
  end
331
-
464
+
332
465
  rule infraspecies
333
466
  a:infraspecies_string space b:authorship {
334
467
  def value
335
468
  a.value + " " + b.value
336
469
  end
337
-
470
+
338
471
  def canonical
339
472
  a.canonical
340
473
  end
341
-
474
+
342
475
  def pos
343
476
  a.pos.merge(b.pos)
344
477
  end
345
-
478
+
346
479
  def details
347
480
  {:infraspecies => a.details[:infraspecies].merge(b.details)}
348
481
  end
349
482
  }
350
483
  /
351
- infraspecies_string
484
+ infraspecies_string
352
485
  end
353
-
486
+
354
487
  rule infraspecies_string
355
- sel:rank space_hard a:species_word {
356
- def value
488
+ sel:rank space a:species_word {
489
+ def value
357
490
  sel.apply(a)
358
491
  end
359
492
  def canonical
360
493
  sel.canonical(a)
361
494
  end
362
-
495
+
363
496
  def pos
364
497
  sel.pos(a)
365
498
  end
366
-
499
+
367
500
  def details
368
501
  sel.details(a)
369
502
  end
370
503
  }
371
504
  /
372
- species_word ![\.] {
505
+ aid:annotation_identification space a:species_word ![\.] {
373
506
  def value
374
- text_value
507
+ aid.apply(a)
508
+ end
509
+
510
+ def canonical
511
+ aid.canonical(a)
512
+ end
513
+
514
+ def pos
515
+ def a.pos
516
+ {interval.begin => ['infraspecies', a.interval.end]}
517
+ end
518
+ aid.pos(a)
519
+ end
520
+
521
+ def details
522
+ def a.details
523
+ {:infraspecies => {:string => value, :rank => 'n/a'}}
524
+ end
525
+ aid.details(a)
375
526
  end
376
-
527
+ }
528
+ /
529
+ a:species_word ![\.] {
530
+ def value
531
+ a.value
532
+ end
533
+
377
534
  def canonical
378
535
  value
379
536
  end
380
-
537
+
381
538
  def pos
382
539
  {interval.begin => ['infraspecies', interval.end]}
383
540
  end
384
-
541
+
385
542
  def details
386
543
  {:infraspecies => {:string => value, :rank => 'n/a'}}
387
544
  end
388
545
  }
389
546
  end
390
-
547
+
391
548
  rule taxon_concept_rank
392
549
  ("sec."/"sensu.") {
393
550
  def value
@@ -398,77 +555,70 @@ grammar ScientificNameClean
398
555
  end
399
556
  def details(a = nil)
400
557
  {:taxon_concept => a.details}
401
- end
558
+ end
402
559
  }
403
560
  end
404
-
561
+
405
562
  rule rank
406
- ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
407
- /"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
563
+ ("morph."/"f.sp."/"B "/"ssp."/"ssp "/"mut."/"nat "/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var "/"subsp."/"subsp "/"subf."/"race "/"forma "/"fma."/"fma "/"form."/"form "/"fo."/"fo"/"f."/"α"/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
408
564
  {
409
565
  def value
410
566
  text_value.strip
411
567
  end
412
568
 
413
569
  def apply(a)
414
- " " + text_value + " " + a.value
570
+ " " + text_value.strip + " " + a.value
415
571
  end
416
572
 
417
573
  def canonical(a)
418
574
  " " + a.value
419
575
  end
420
-
576
+
421
577
  def pos(a)
422
- {interval.begin => ['infraspecific_type', interval.end], a.interval.begin => ['infraspecies', a.interval.end]}
578
+ interval_end = text_value[-1] == ' ' ? interval.end - 1 : interval.end
579
+ {interval.begin => ['infraspecific_type', interval_end], a.interval.begin => ['infraspecies', a.interval.end]}
423
580
  end
424
-
581
+
425
582
  def details(a = nil)
426
- {:infraspecies => {:string => (a.value rescue nil), :rank => text_value}}
583
+ {:infraspecies => {:string => (a.value rescue nil), :rank => text_value.strip}}
427
584
  end
428
585
  }
429
- /
430
- rank_forma
431
586
  end
432
-
433
- rule rank_forma
434
- ("forma"/"form."/"form"/"fo."/"f.")
435
- {
587
+
588
+ rule rank_uninomial
589
+ ("sect."/"sect "/"subsect."/"subsect "/"trib."/"trib "/"subtrib."/"subtrib "/"ser."/"ser "/"subgen."/"subgen "/"fam."/"fam "/"subfam."/"subfam "/"supertrib."/"supertrib ") {
436
590
  def value
437
- "f."
438
- end
439
- def apply(a)
440
- " " + value + " " + a.value
441
- end
442
- def canonical(a)
443
- " " + a.value
591
+ text_value.strip
444
592
  end
445
- def pos(a)
446
- {interval.begin => ['infraspecific_type', interval.end], a.interval.begin => ['infraspecies', a.interval.end]}
593
+
594
+ def pos(uni)
595
+ {interval.begin => ['rank_uninomial', interval.end], uni.interval.begin => ['uninomial', uni.interval.end]}
447
596
  end
448
- def details(a = nil)
449
- {:infraspecies => {:string => (a.value rescue nil), :rank => value}}
597
+
598
+ def details(uni)
599
+ {:rank_uninomials => value, :uninomial2 => uni.details[:uninomial]}
450
600
  end
451
601
  }
452
602
  end
453
-
603
+
454
604
  rule species
455
605
  a:species_string space b:authorship {
456
606
  def value
457
607
  a.value + " " + b.value
458
608
  end
459
-
609
+
460
610
  def canonical
461
611
  a.canonical
462
612
  end
463
-
613
+
464
614
  def hybrid
465
615
  a.hybrid rescue false
466
616
  end
467
-
617
+
468
618
  def pos
469
619
  a.pos.merge(b.pos)
470
620
  end
471
-
621
+
472
622
  def details
473
623
  {:species => a.details[:species].merge(b.details)}
474
624
  end
@@ -476,43 +626,21 @@ grammar ScientificNameClean
476
626
  /
477
627
  species_string
478
628
  end
479
-
629
+
480
630
  rule species_string
481
- # a:species_word &(space_hard author_prefix_word space_hard) {
482
- # def value
483
- # a.value
484
- # end
485
- #
486
- # def canonical
487
- # a.value
488
- # end
489
- #
490
- # def hybrid
491
- # a.hybrid rescue false
492
- # end
493
- #
494
- # def pos
495
- # {a.interval.begin => ['species', a.interval.end]}
496
- # end
497
- #
498
- # def details
499
- # {:species => {:string => a.value}}
500
- # end
501
- # }
502
- # /
503
631
  species_word {
504
632
  def canonical
505
633
  value
506
634
  end
507
-
635
+
508
636
  def pos
509
637
  {interval.begin => ['species', interval.end]}
510
638
  end
511
-
639
+
512
640
  def hybrid
513
641
  false
514
642
  end
515
-
643
+
516
644
  def details
517
645
  {:species => {:string => value}}
518
646
  end
@@ -520,65 +648,85 @@ grammar ScientificNameClean
520
648
  /
521
649
  species_word_hybrid
522
650
  end
523
-
651
+
524
652
  rule infragenus
525
653
  left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
526
654
  def value
527
655
  "(" + a.value + ")"
528
656
  end
529
-
657
+
530
658
  def canonical
531
659
  a.value
532
660
  end
533
-
661
+
534
662
  def pos
535
663
  {a.interval.begin => ['infragenus', a.interval.end]}
536
664
  end
537
-
665
+
538
666
  def details
539
667
  {:infragenus => {:string => a.value}}
540
668
  end
541
669
  }
542
670
  end
543
-
671
+
544
672
  rule genus
545
- a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
673
+ a:(abbreviated_genus/uninomial_string) !(space_hard author_prefix_word space_hard author_word) {
546
674
  def value
547
675
  a.value
548
676
  end
549
-
677
+
550
678
  def pos
551
679
  {a.interval.begin => ['genus', a.interval.end]}
552
680
  end
553
-
681
+
554
682
  def canonical
555
683
  a.value
556
684
  end
557
-
685
+
558
686
  def details
559
687
  {:genus => {:string => a.value}}
560
688
  end
561
689
  }
562
690
  end
563
-
691
+
692
+ rule abbreviated_genus
693
+ [A-Z] [a-z]? [a-z]? [\\.] space {
694
+ def value
695
+ text_value.strip
696
+ end
697
+
698
+ def canonical
699
+ value
700
+ end
701
+
702
+ def pos
703
+ {interval.begin => ["abbreviated_genus", interval.end]}
704
+ end
705
+
706
+ def details
707
+ {:abbreviated_genus => {:string => value}}
708
+ end
709
+ }
710
+ end
711
+
564
712
  rule uninomial_name
565
713
  a:uninomial_string space b:infragenus space c:simple_authorship {
566
714
  def value
567
715
  a.value + " " + b.value + " " + c.value
568
716
  end
569
-
717
+
570
718
  def canonical
571
719
  a.canonical
572
720
  end
573
-
721
+
574
722
  def pos
575
723
  a.pos.merge(b.pos).merge(c.pos)
576
724
  end
577
-
725
+
578
726
  def hybrid
579
727
  false
580
728
  end
581
-
729
+
582
730
  def details
583
731
  {:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
584
732
  end
@@ -588,19 +736,19 @@ grammar ScientificNameClean
588
736
  def value
589
737
  a.value + " " + b.value
590
738
  end
591
-
739
+
592
740
  def canonical
593
741
  a.canonical
594
742
  end
595
-
743
+
596
744
  def pos
597
745
  a.pos.merge(b.pos)
598
746
  end
599
-
747
+
600
748
  def hybrid
601
749
  false
602
750
  end
603
-
751
+
604
752
  def details
605
753
  {:uninomial => a.details[:uninomial].merge(b.details)}
606
754
  end
@@ -610,19 +758,19 @@ grammar ScientificNameClean
610
758
  def value
611
759
  a.value + " " + b.value
612
760
  end
613
-
761
+
614
762
  def canonical
615
763
  a.canonical
616
764
  end
617
-
765
+
618
766
  def pos
619
767
  a.pos.merge(b.pos)
620
768
  end
621
-
769
+
622
770
  def hybrid
623
771
  false
624
772
  end
625
-
773
+
626
774
  def details
627
775
  {:uninomial => a.details[:uninomial].merge(b.details)}
628
776
  end
@@ -636,31 +784,31 @@ grammar ScientificNameClean
636
784
  def canonical
637
785
  value
638
786
  end
639
-
787
+
640
788
  def pos
641
789
  {interval.begin => ['uninomial', interval.end]}
642
790
  end
643
-
791
+
644
792
  def hybrid
645
793
  false
646
794
  end
647
-
648
- def details
795
+
796
+ def details
649
797
  {:uninomial => {:string => value}}
650
798
  end
651
799
  }
652
800
  end
653
-
801
+
654
802
  rule authorship
655
803
  a:basionym_authorship_with_parenthesis space b:simple_authorship ","? space c:ex_authorship {
656
804
  def value
657
805
  a.value + " " + b.value + " " + c.value
658
806
  end
659
-
807
+
660
808
  def pos
661
809
  a.pos.merge(b.pos).merge(c.pos)
662
810
  end
663
-
811
+
664
812
  def details
665
813
  val = {:authorship => text_value.strip, :combinationAuthorTeam => b.details[:basionymAuthorTeam], :basionymAuthorTeam => a.details[:basionymAuthorTeam]}
666
814
  val[:combinationAuthorTeam].merge!(c.details)
@@ -672,11 +820,11 @@ grammar ScientificNameClean
672
820
  def value
673
821
  a.value + " " + b.value
674
822
  end
675
-
823
+
676
824
  def pos
677
825
  a.pos.merge(b.pos)
678
826
  end
679
-
827
+
680
828
  def details
681
829
  {:authorship => text_value.strip, :combinationAuthorTeam => b.details[:basionymAuthorTeam], :basionymAuthorTeam => a.details[:basionymAuthorTeam]}
682
830
  end
@@ -688,11 +836,11 @@ grammar ScientificNameClean
688
836
  def value
689
837
  a.value + " " + b.value
690
838
  end
691
-
839
+
692
840
  def pos
693
841
  a.pos.merge(b.pos)
694
842
  end
695
-
843
+
696
844
  def details
697
845
  val = a.details
698
846
  val[:authorship] = text_value.strip
@@ -703,21 +851,21 @@ grammar ScientificNameClean
703
851
  /
704
852
  simple_authorship
705
853
  end
706
-
707
-
854
+
855
+
708
856
  rule basionym_authorship_with_parenthesis
709
857
  left_paren space a:authors_names space right_paren space [,]? space b:year {
710
858
  def value
711
859
  "(" + a.value + " " + b.value + ")"
712
860
  end
713
-
861
+
714
862
  def pos
715
863
  a.pos.merge(b.pos)
716
- end
717
-
864
+ end
865
+
718
866
  def details
719
- { :authorship => text_value,
720
- :basionymAuthorTeam => {:author_team => text_value}.merge(a.details).merge(b.details)
867
+ { :authorship => text_value,
868
+ :basionymAuthorTeam => {:author_team => text_value}.merge(a.details).merge(b.details)
721
869
  }
722
870
  end
723
871
  }
@@ -726,11 +874,11 @@ grammar ScientificNameClean
726
874
  def value
727
875
  "(" + a.value + " " + b.value + ")"
728
876
  end
729
-
877
+
730
878
  def pos
731
879
  a.pos.merge(b.pos)
732
880
  end
733
-
881
+
734
882
  def details
735
883
  val = a.details
736
884
  val[:basionymAuthorTeam].merge!(b.details)
@@ -743,15 +891,15 @@ grammar ScientificNameClean
743
891
  def value
744
892
  "(" + a.value + ")"
745
893
  end
746
-
894
+
747
895
  def pos
748
896
  a.pos
749
897
  end
750
-
898
+
751
899
  def details
752
900
  val = a.details
753
901
  val[:authorship] = text_value
754
- val
902
+ val
755
903
  end
756
904
  }
757
905
  /
@@ -759,32 +907,32 @@ grammar ScientificNameClean
759
907
  def value
760
908
  "(?)"
761
909
  end
762
-
910
+
763
911
  def pos
764
912
  {a.interval.begin => ['unknown_author', a.interval.end]}
765
913
  end
766
-
914
+
767
915
  def details
768
916
  {:authorship => text_value, :basionymAuthorTeam => {:authorTeam => text_value, :author => ['?']}}
769
917
  end
770
918
  }
771
919
  end
772
-
920
+
773
921
  rule ex_authorship
774
922
  ex_sep space b:simple_authorship {
775
923
  def value
776
924
  " ex " + b.value
777
925
  end
778
-
926
+
779
927
  def pos
780
928
  b.pos
781
929
  end
782
-
930
+
783
931
  def details
784
932
  val = {:exAuthorTeam => {:authorTeam => b.text_value.strip}.merge(b.details[:basionymAuthorTeam])}
785
933
  val
786
934
  end
787
- }
935
+ }
788
936
  end
789
937
 
790
938
  rule simple_authorship
@@ -792,17 +940,17 @@ grammar ScientificNameClean
792
940
  def value
793
941
  a.value + " " + b.value
794
942
  end
795
-
943
+
796
944
  def pos
797
945
  a.pos.merge(b.pos)
798
946
  end
799
-
947
+
800
948
  def details
801
949
  details_with_arg(:basionymAuthorTeam)
802
950
  end
803
-
951
+
804
952
  def details_with_arg(authorTeamType = 'basionymAuthorTeam')
805
- { :authorship => text_value,
953
+ { :authorship => text_value,
806
954
  authorTeamType.to_sym => {
807
955
  :authorTeam => a.text_value.strip
808
956
  }.merge(a.details).merge(b.details)
@@ -814,17 +962,17 @@ grammar ScientificNameClean
814
962
  def value
815
963
  a.value + " " + b.value
816
964
  end
817
-
965
+
818
966
  def pos
819
967
  a.pos.merge(b.pos)
820
968
  end
821
-
969
+
822
970
  def details
823
971
  details_with_arg(:basionymAuthorTeam)
824
972
  end
825
-
973
+
826
974
  def details_with_arg(authorTeamType = 'basionymAuthorTeam')
827
- { :authorship => text_value,
975
+ { :authorship => text_value,
828
976
  authorTeamType.to_sym => {
829
977
  :authorTeam => a.text_value.strip
830
978
  }.merge(a.details).merge(b.details)
@@ -838,27 +986,27 @@ grammar ScientificNameClean
838
986
  details[:basionymAuthorTeam].merge!(super)
839
987
  details
840
988
  end
841
-
989
+
842
990
  def details_with_arg(authorTeamType = 'basionymAuthorTeam')
843
- { :authorship => text_value,
991
+ { :authorship => text_value,
844
992
  authorTeamType.to_sym => {
845
993
  :authorTeam => text_value,
846
994
  }
847
- }
995
+ }
848
996
  end
849
997
  }
850
998
  end
851
-
999
+
852
1000
  rule authors_names
853
1001
  a:author_name space sep:author_separator space b:authors_names {
854
1002
  def value
855
1003
  sep.apply(a,b)
856
1004
  end
857
-
1005
+
858
1006
  def pos
859
1007
  sep.pos(a,b)
860
1008
  end
861
-
1009
+
862
1010
  def details
863
1011
  sep.details(a,b)
864
1012
  end
@@ -868,40 +1016,40 @@ grammar ScientificNameClean
868
1016
  /
869
1017
  unknown_auth
870
1018
  end
871
-
872
-
1019
+
1020
+
873
1021
  rule unknown_auth
874
- ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
1022
+ ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") !latin_word {
875
1023
  def value
876
1024
  text_value
877
1025
  end
878
-
1026
+
879
1027
  def pos
880
1028
  {interval.begin => ['unknown_author', interval.end]}
881
1029
  end
882
-
1030
+
883
1031
  def details
884
1032
  {:author => ["unknown"]}
885
1033
  end
886
1034
  }
887
1035
  end
888
-
1036
+
889
1037
  rule ex_sep
890
1038
  ("ex"/"in") &[\s]
891
1039
  end
892
-
1040
+
893
1041
  rule author_separator
894
- ("&"/","/"and"/"et") {
1042
+ ("&amp;"/"&"/","/"and"/"et") {
895
1043
  def apply(a,b)
896
1044
  sep = text_value.strip
897
- sep = " et" if ["&","and","et"].include? sep
1045
+ sep = " &" if ["&amp;", "&","and","et"].include? sep
898
1046
  a.value + sep + " " + b.value
899
1047
  end
900
-
1048
+
901
1049
  def pos(a,b)
902
1050
  a.pos.merge(b.pos)
903
1051
  end
904
-
1052
+
905
1053
  def details(a,b)
906
1054
  {:author => a.details[:author] + b.details[:author]}
907
1055
  end
@@ -913,8 +1061,8 @@ grammar ScientificNameClean
913
1061
  def value
914
1062
  a.value + ' ' + b.value
915
1063
  end
916
-
917
- def pos
1064
+
1065
+ def pos
918
1066
  a.pos.merge(b.pos)
919
1067
  end
920
1068
 
@@ -925,17 +1073,17 @@ grammar ScientificNameClean
925
1073
  /
926
1074
  author_name_without_postfix
927
1075
  end
928
-
1076
+
929
1077
  rule author_name_without_postfix
930
1078
  space a:author_prefix_word space b:author_name {
931
1079
  def value
932
1080
  a.value + " " + b.value
933
1081
  end
934
-
1082
+
935
1083
  def pos
936
1084
  a.pos.merge(b.pos)
937
1085
  end
938
-
1086
+
939
1087
  def details
940
1088
  {:author => [value]}
941
1089
  end
@@ -945,11 +1093,11 @@ grammar ScientificNameClean
945
1093
  def value
946
1094
  a.value + " " + b.value
947
1095
  end
948
-
1096
+
949
1097
  def pos
950
1098
  a.pos.merge(b.pos)
951
1099
  end
952
-
1100
+
953
1101
  def details
954
1102
  {:author => [value]}
955
1103
  end
@@ -957,17 +1105,17 @@ grammar ScientificNameClean
957
1105
  /
958
1106
  author_word
959
1107
  end
960
-
1108
+
961
1109
  rule author_word
962
1110
  "A S. Xu" {
963
1111
  def value
964
1112
  text_value.strip
965
1113
  end
966
-
1114
+
967
1115
  def pos
968
1116
  {interval.begin => ['author_word', 1], (interval.begin + 2) => ['author_word', 2], (interval.begin + 5) => ['author_word', 2]}
969
1117
  end
970
-
1118
+
971
1119
  def details
972
1120
  {:author => [value]}
973
1121
  end
@@ -977,26 +1125,28 @@ grammar ScientificNameClean
977
1125
  def value
978
1126
  text_value.strip
979
1127
  end
980
-
1128
+
981
1129
  def pos
982
1130
  #cheating because there are several words in some of them
983
1131
  {interval.begin => ['author_word', interval.end]}
984
1132
  end
985
-
1133
+
986
1134
  def details
987
1135
  {:author => [value]}
988
1136
  end
989
1137
  }
990
- /
1138
+ /
991
1139
  ("Å"/"Ö"/"Á"/"Ø"/"Ô"/"Š"/"Ś"/"Č"/"Ķ"/"Ł"/"É"/"Ž"/[A-W]/[Y-Z]) [^0-9\[\]\(\)\s&,]* {
992
1140
  def value
993
- text_value
1141
+ text_value.gsub(/([\p{Lu}]{3,})/) do |match|
1142
+ UnicodeUtils.titlecase(match)
1143
+ end
994
1144
  end
995
-
1145
+
996
1146
  def pos
997
1147
  {interval.begin => ['author_word', interval.end]}
998
1148
  end
999
-
1149
+
1000
1150
  def details
1001
1151
  {:author => [value]}
1002
1152
  end
@@ -1006,11 +1156,11 @@ grammar ScientificNameClean
1006
1156
  def value
1007
1157
  text_value
1008
1158
  end
1009
-
1159
+
1010
1160
  def pos
1011
1161
  {interval.begin => ['author_word', interval.end]}
1012
1162
  end
1013
-
1163
+
1014
1164
  def details
1015
1165
  {:author => [value]}
1016
1166
  end
@@ -1018,13 +1168,13 @@ grammar ScientificNameClean
1018
1168
  /
1019
1169
  author_prefix_word
1020
1170
  end
1021
-
1171
+
1022
1172
  rule author_prefix_word
1023
- space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
1173
+ space ("ab"/"af"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
1024
1174
  def value
1025
1175
  text_value
1026
1176
  end
1027
-
1177
+
1028
1178
  def pos
1029
1179
  #cheating because there are several words in some of them
1030
1180
  {interval.begin => ['author_word', interval.end]}
@@ -1034,7 +1184,7 @@ grammar ScientificNameClean
1034
1184
 
1035
1185
  rule author_postfix_word
1036
1186
  ("f."/"filius") {
1037
- def value
1187
+ def value
1038
1188
  text_value.strip
1039
1189
  end
1040
1190
 
@@ -1043,7 +1193,7 @@ grammar ScientificNameClean
1043
1193
  end
1044
1194
  }
1045
1195
  end
1046
-
1196
+
1047
1197
  rule cap_latin_word_pair
1048
1198
  a:cap_latin_word "-" b:cap_latin_word {
1049
1199
  def value
@@ -1051,7 +1201,7 @@ grammar ScientificNameClean
1051
1201
  end
1052
1202
  }
1053
1203
  end
1054
-
1204
+
1055
1205
  rule cap_latin_word
1056
1206
  a:([A-Z]/cap_digraph) b:latin_word "?" {
1057
1207
  def value
@@ -1091,19 +1241,19 @@ grammar ScientificNameClean
1091
1241
  def value
1092
1242
  a.value + " " + b.value
1093
1243
  end
1094
-
1244
+
1095
1245
  def canonical
1096
1246
  b.value
1097
1247
  end
1098
-
1248
+
1099
1249
  def hybrid
1100
1250
  true
1101
1251
  end
1102
-
1252
+
1103
1253
  def pos
1104
1254
  {b.interval.begin => ['species', b.interval.end]}
1105
1255
  end
1106
-
1256
+
1107
1257
  def details
1108
1258
  {:species => {:string => b.value}}
1109
1259
  end
@@ -1113,19 +1263,19 @@ grammar ScientificNameClean
1113
1263
  def value
1114
1264
  "× " + b.value
1115
1265
  end
1116
-
1266
+
1117
1267
  def canonical
1118
1268
  b.value
1119
1269
  end
1120
-
1270
+
1121
1271
  def hybrid
1122
1272
  true
1123
1273
  end
1124
-
1274
+
1125
1275
  def pos
1126
1276
  {b.interval.begin => ['species', b.interval.end]}
1127
1277
  end
1128
-
1278
+
1129
1279
  def details
1130
1280
  {:species => {:string => b.value}}
1131
1281
  end
@@ -1135,29 +1285,74 @@ grammar ScientificNameClean
1135
1285
  def value
1136
1286
  "× " + b.value
1137
1287
  end
1138
-
1288
+
1139
1289
  def canonical
1140
1290
  b.value
1141
1291
  end
1142
-
1292
+
1143
1293
  def hybrid
1144
1294
  true
1145
1295
  end
1146
-
1296
+
1147
1297
  def pos
1148
1298
  {b.interval.begin => ['species', b.interval.end]}
1149
1299
  end
1150
-
1300
+
1151
1301
  def details
1152
1302
  {:species => {:string => b.value}}
1153
1303
  end
1154
1304
  }
1155
1305
  end
1156
1306
 
1157
- rule species_prefix
1158
- ("aff."/"corrig."/"?") &space_hard
1307
+ rule annotation_identification
1308
+ ("sp.nr."/"sp. nr."/"nr."/"nr "/"sp.aff."/"sp. aff."/"sp."/"sp "/"species"/"spp."/"spp "/"aff."/"aff "/"monst."/"? ") {
1309
+
1310
+ def value
1311
+ text_value.strip
1312
+ end
1313
+
1314
+ def apply(sp)
1315
+ ''
1316
+ end
1317
+
1318
+ def canonical(sp)
1319
+ ''
1320
+ end
1321
+
1322
# Public: Position map for an annotation token whose following species
# is ignored.
#
# sp - the species node that follows the annotation (not used here; this
#      alternative reports only the annotation itself).
#
# Returns a Hash of { start offset => ['annotation_identification', end offset] }.
# Several alternatives of this rule match a trailing space ("sp ", "nr ",
# "aff ", "? "); in that case the end offset is pulled back by one so the
# reported span covers only the annotation text.
def pos(sp)
  # The original computed this value but then used interval.end, leaving
  # the trailing space inside the reported span.
  interval_end = text_value.end_with?(' ') ? interval.end - 1 : interval.end
  { interval.begin => ['annotation_identification', interval_end] }
end
1326
+
1327
+ def details(sp)
1328
+ {:annotation_identification => value, :ignored => sp.details}
1329
+ end
1330
+ }
1331
+ /
1332
+ ("cf."/"cf ") {
1333
+ def value
1334
+ text_value.strip
1335
+ end
1336
+
1337
+ def apply(sp)
1338
+ ' ' + value + ' ' + sp.value
1339
+ end
1340
+
1341
+ def canonical(sp)
1342
+ ' ' + sp.canonical
1343
+ end
1344
+
1345
# Public: Position map for a "cf." / "cf " annotation together with the
# species it qualifies.
#
# sp - the species node that follows the annotation; its own position
#      map is merged into the result.
#
# Returns a Hash containing
# { start offset => ['annotation_identification', end offset] } plus every
# entry of sp.pos. When the matched text is "cf " (with the trailing
# space), the end offset is pulled back by one so the reported span covers
# only the annotation text.
def pos(sp)
  # The original computed this value but then used interval.end, leaving
  # the trailing space inside the reported span.
  interval_end = text_value.end_with?(' ') ? interval.end - 1 : interval.end
  { interval.begin => ['annotation_identification', interval_end] }.merge(sp.pos)
end
1349
+
1350
+ def details(sp)
1351
+ {:annotation_identification => value, :species => sp.details}
1352
+ end
1353
+ }
1159
1354
  end
1160
-
1355
+
1161
1356
  rule species_word
1162
1357
  a:[0-9]+ "-"? b:latin_word {
1163
1358
  def value
@@ -1177,6 +1372,12 @@ grammar ScientificNameClean
1177
1372
  end
1178
1373
  }
1179
1374
  /
1375
+ "o\'donelli" {
1376
+ def value
1377
+ "odonelli"
1378
+ end
1379
+ }
1380
+ /
1180
1381
  a:valid_name_letter b:valid_name_letters {
1181
1382
  def value
1182
1383
  a.value + b.value
@@ -1191,9 +1392,9 @@ grammar ScientificNameClean
1191
1392
  text_value.split('').each do |l|
1192
1393
  l = 'ae' if l == 'æ'
1193
1394
  l = 'oe' if l == 'œ'
1194
- # not sure if we should normalize ë as well. It is legal in botanical code, but it
1195
- # might be beneficial to normalize it for the reconsiliation purposes
1196
- # l = 'e' if l == 'ë'
1395
+ # We normalize ë as well. It is legal in botanical code, but it
1396
+ # is beneficial to normalize it for reconciliation purposes
1397
+ l = 'e' if l == 'ë'
1197
1398
  res << l
1198
1399
  end
1199
1400
  res
@@ -1207,6 +1408,7 @@ grammar ScientificNameClean
1207
1408
  res = text_value
1208
1409
  res = 'ae' if res == 'æ'
1209
1410
  res = 'oe' if res == 'œ'
1411
+ res = 'e' if res == 'ë'
1210
1412
  res
1211
1413
  end
1212
1414
  }
@@ -1224,7 +1426,7 @@ grammar ScientificNameClean
1224
1426
  def value
1225
1427
  'Oe'
1226
1428
  end
1227
- }
1429
+ }
1228
1430
  end
1229
1431
 
1230
1432
  rule year
@@ -1232,14 +1434,14 @@ grammar ScientificNameClean
1232
1434
  def value
1233
1435
  a.value
1234
1436
  end
1235
-
1437
+
1236
1438
  def pos
1237
1439
  a.pos
1238
1440
  end
1239
-
1441
+
1240
1442
  def details
1241
1443
  a.details
1242
- end
1444
+ end
1243
1445
  }
1244
1446
  /
1245
1447
  year_number_with_character
@@ -1262,31 +1464,31 @@ grammar ScientificNameClean
1262
1464
  end
1263
1465
  }
1264
1466
  end
1265
-
1467
+
1266
1468
  rule year_number
1267
- [12] [7890] [0-9] [0-9]? [\?]? {
1469
+ [12] [7890] [0-9] ([0-9] [\?]?/"?") {
1268
1470
  def value
1269
1471
  text_value
1270
1472
  end
1271
-
1473
+
1272
1474
  def pos
1273
1475
  {interval.begin => ['year', interval.end]}
1274
1476
  end
1275
-
1477
+
1276
1478
  def details
1277
1479
  {:year => value}
1278
1480
  end
1279
1481
  }
1280
1482
  end
1281
-
1483
+
1282
1484
  rule left_paren
1283
1485
  "("
1284
1486
  end
1285
-
1487
+
1286
1488
  rule right_paren
1287
1489
  ")"
1288
1490
  end
1289
-
1491
+
1290
1492
  rule hybrid_character
1291
1493
  ("x"/"X") {
1292
1494
  def value
@@ -1296,7 +1498,7 @@ grammar ScientificNameClean
1296
1498
  /
1297
1499
  multiplication_sign
1298
1500
  end
1299
-
1501
+
1300
1502
  rule multiplication_sign
1301
1503
  ("×"/"*") {
1302
1504
  def value
@@ -1304,7 +1506,7 @@ grammar ScientificNameClean
1304
1506
  end
1305
1507
  }
1306
1508
  end
1307
-
1509
+
1308
1510
  rule space
1309
1511
  [\s]*
1310
1512
  end
@@ -1312,5 +1514,5 @@ grammar ScientificNameClean
1312
1514
  rule space_hard
1313
1515
  [\s]+
1314
1516
  end
1315
-
1517
+
1316
1518
  end