dimus-biodiversity 0.0.18 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/biodiversity/parser.rb +18 -35
- data/lib/biodiversity/parser/scientific_name_canonical.rb +248 -83
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +46 -20
- data/lib/biodiversity/parser/scientific_name_clean.rb +3304 -3409
- data/lib/biodiversity/parser/scientific_name_clean.treetop +539 -500
- data/lib/biodiversity/parser/scientific_name_dirty.rb +362 -213
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +123 -98
- data/spec/parser/scientific_name.spec.rb +7 -28
- data/spec/parser/scientific_name_canonical.spec.rb +7 -6
- data/spec/parser/scientific_name_clean.spec.rb +256 -260
- data/spec/parser/scientific_name_dirty.spec.rb +62 -52
- metadata +2 -2
data/lib/biodiversity/parser.rb
CHANGED
|
@@ -13,54 +13,37 @@ class ScientificNameParser
|
|
|
13
13
|
@clean = ScientificNameCleanParser.new
|
|
14
14
|
@dirty = ScientificNameDirtyParser.new
|
|
15
15
|
@canonical = ScientificNameCanonicalParser.new
|
|
16
|
-
@
|
|
16
|
+
@parser = nil
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
def parse(a_string)
|
|
20
20
|
@verbatim = a_string
|
|
21
|
-
@
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def value
|
|
26
|
-
@node.value if @node
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
def pos
|
|
30
|
-
@node.pos if @node
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
def details
|
|
34
|
-
@node.details if @node
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
def canonical
|
|
38
|
-
@node.canonical if @node
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
def to_json
|
|
42
|
-
parsed = !!@node
|
|
43
|
-
if parsed
|
|
21
|
+
@parser = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string)
|
|
22
|
+
def @parser.to_json
|
|
23
|
+
parsed = !!self
|
|
44
24
|
res = {
|
|
45
25
|
:parsed => parsed,
|
|
46
|
-
:verbatim =>
|
|
26
|
+
:verbatim => self.text_value }
|
|
47
27
|
if parsed
|
|
48
28
|
res.merge!({
|
|
49
|
-
:normalized =>
|
|
50
|
-
:canonical =>
|
|
29
|
+
:normalized => self.value,
|
|
30
|
+
:canonical => self.canonical
|
|
51
31
|
})
|
|
52
|
-
|
|
32
|
+
data = self.details
|
|
33
|
+
if data[:species] && data[:species][:namedHybrid]
|
|
34
|
+
data[:species].delete(:namedHybrid)
|
|
35
|
+
data = {:namedHybrid => data}
|
|
36
|
+
end
|
|
37
|
+
res.merge!(data)
|
|
53
38
|
end
|
|
54
39
|
res = {:scientificName => res}
|
|
55
40
|
JSON.generate res
|
|
56
|
-
else
|
|
57
|
-
JSON.generate({:parsed => parsed, :verbatim => @verbatim})
|
|
58
41
|
end
|
|
42
|
+
|
|
43
|
+
def @parser.pos_json
|
|
44
|
+
JSON.generate self.pos rescue ''
|
|
45
|
+
end
|
|
46
|
+
@parser
|
|
59
47
|
end
|
|
60
|
-
|
|
61
|
-
def pos_to_json
|
|
62
|
-
JSON.generate @node.pos rescue ''
|
|
63
|
-
end
|
|
64
|
-
|
|
65
48
|
end
|
|
66
49
|
|
|
@@ -3,29 +3,41 @@ module ScientificNameCanonical
|
|
|
3
3
|
include Treetop::Runtime
|
|
4
4
|
|
|
5
5
|
def root
|
|
6
|
-
@root || :
|
|
6
|
+
@root || :root
|
|
7
7
|
end
|
|
8
8
|
|
|
9
9
|
include ScientificNameClean
|
|
10
10
|
|
|
11
11
|
include ScientificNameDirty
|
|
12
12
|
|
|
13
|
-
def
|
|
13
|
+
def _nt_root
|
|
14
14
|
start_index = index
|
|
15
|
-
if node_cache[:
|
|
16
|
-
cached = node_cache[:
|
|
15
|
+
if node_cache[:root].has_key?(index)
|
|
16
|
+
cached = node_cache[:root][index]
|
|
17
17
|
@index = cached.interval.end if cached
|
|
18
18
|
return cached
|
|
19
19
|
end
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
i0 = index
|
|
22
|
+
r1 = _nt_multinomial_with_garbage
|
|
23
|
+
if r1
|
|
24
|
+
r0 = r1
|
|
25
|
+
else
|
|
26
|
+
r2 = _nt_uninomial_with_garbage
|
|
27
|
+
if r2
|
|
28
|
+
r0 = r2
|
|
29
|
+
else
|
|
30
|
+
self.index = i0
|
|
31
|
+
r0 = nil
|
|
32
|
+
end
|
|
33
|
+
end
|
|
22
34
|
|
|
23
|
-
node_cache[:
|
|
35
|
+
node_cache[:root][start_index] = r0
|
|
24
36
|
|
|
25
37
|
return r0
|
|
26
38
|
end
|
|
27
39
|
|
|
28
|
-
module
|
|
40
|
+
module MultinomialWithGarbage0
|
|
29
41
|
def a
|
|
30
42
|
elements[0]
|
|
31
43
|
end
|
|
@@ -37,26 +49,43 @@ module ScientificNameCanonical
|
|
|
37
49
|
def b
|
|
38
50
|
elements[2]
|
|
39
51
|
end
|
|
52
|
+
|
|
53
|
+
def space
|
|
54
|
+
elements[3]
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def c
|
|
58
|
+
elements[4]
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def space_hard
|
|
62
|
+
elements[5]
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def garbage
|
|
66
|
+
elements[6]
|
|
67
|
+
end
|
|
40
68
|
end
|
|
41
69
|
|
|
42
|
-
module
|
|
43
|
-
def value
|
|
44
|
-
a.value
|
|
70
|
+
module MultinomialWithGarbage1
|
|
71
|
+
def value
|
|
72
|
+
a.value + " " + b.value + " " + c.value
|
|
45
73
|
end
|
|
74
|
+
|
|
46
75
|
def canonical
|
|
47
|
-
a.canonical
|
|
76
|
+
a.canonical + " " + b.canonical + " " + c.canonical
|
|
48
77
|
end
|
|
49
78
|
|
|
50
79
|
def pos
|
|
51
|
-
a.pos
|
|
80
|
+
a.pos.merge(b.pos).merge(c.pos)
|
|
52
81
|
end
|
|
53
82
|
|
|
54
83
|
def details
|
|
55
|
-
a.details.merge(
|
|
84
|
+
a.details.merge(b.details).merge(c.details)
|
|
56
85
|
end
|
|
57
86
|
end
|
|
58
87
|
|
|
59
|
-
module
|
|
88
|
+
module MultinomialWithGarbage2
|
|
60
89
|
def a
|
|
61
90
|
elements[0]
|
|
62
91
|
end
|
|
@@ -68,68 +97,114 @@ module ScientificNameCanonical
|
|
|
68
97
|
def b
|
|
69
98
|
elements[2]
|
|
70
99
|
end
|
|
100
|
+
|
|
101
|
+
def space_hard
|
|
102
|
+
elements[3]
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def garbage
|
|
106
|
+
elements[4]
|
|
107
|
+
end
|
|
71
108
|
end
|
|
72
109
|
|
|
73
|
-
module
|
|
74
|
-
def value
|
|
75
|
-
a.value
|
|
110
|
+
module MultinomialWithGarbage3
|
|
111
|
+
def value
|
|
112
|
+
a.value + " " + b.value
|
|
76
113
|
end
|
|
114
|
+
|
|
77
115
|
def canonical
|
|
78
|
-
a.canonical
|
|
116
|
+
a.canonical + " " + b.canonical
|
|
79
117
|
end
|
|
80
118
|
|
|
81
119
|
def pos
|
|
82
|
-
a.pos
|
|
120
|
+
a.pos.merge(b.pos)
|
|
83
121
|
end
|
|
84
122
|
|
|
85
123
|
def details
|
|
86
|
-
a.details.merge(
|
|
124
|
+
a.details.merge(b.details)
|
|
87
125
|
end
|
|
88
126
|
end
|
|
89
127
|
|
|
90
|
-
|
|
128
|
+
module MultinomialWithGarbage4
|
|
129
|
+
def a
|
|
130
|
+
elements[0]
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def space
|
|
134
|
+
elements[1]
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def b
|
|
138
|
+
elements[2]
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def space_hard
|
|
142
|
+
elements[3]
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
def garbage
|
|
146
|
+
elements[4]
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
module MultinomialWithGarbage5
|
|
151
|
+
def value
|
|
152
|
+
a.value + " " + b.value
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
def canonical
|
|
156
|
+
a.canonical + " " + b.canonical
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
def pos
|
|
160
|
+
a.pos.merge(b.pos)
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
def details
|
|
164
|
+
a.details.merge(b.details)
|
|
165
|
+
end
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def _nt_multinomial_with_garbage
|
|
91
169
|
start_index = index
|
|
92
|
-
if node_cache[:
|
|
93
|
-
cached = node_cache[:
|
|
170
|
+
if node_cache[:multinomial_with_garbage].has_key?(index)
|
|
171
|
+
cached = node_cache[:multinomial_with_garbage][index]
|
|
94
172
|
@index = cached.interval.end if cached
|
|
95
173
|
return cached
|
|
96
174
|
end
|
|
97
175
|
|
|
98
176
|
i0 = index
|
|
99
177
|
i1, s1 = index, []
|
|
100
|
-
r2 =
|
|
178
|
+
r2 = _nt_genus
|
|
101
179
|
s1 << r2
|
|
102
180
|
if r2
|
|
103
181
|
r3 = _nt_space
|
|
104
182
|
s1 << r3
|
|
105
183
|
if r3
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
else
|
|
112
|
-
r5 = nil
|
|
113
|
-
end
|
|
184
|
+
r4 = _nt_subgenus
|
|
185
|
+
s1 << r4
|
|
186
|
+
if r4
|
|
187
|
+
r5 = _nt_space
|
|
188
|
+
s1 << r5
|
|
114
189
|
if r5
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
190
|
+
r6 = _nt_species
|
|
191
|
+
s1 << r6
|
|
192
|
+
if r6
|
|
193
|
+
r7 = _nt_space_hard
|
|
194
|
+
s1 << r7
|
|
195
|
+
if r7
|
|
196
|
+
r8 = _nt_garbage
|
|
197
|
+
s1 << r8
|
|
198
|
+
end
|
|
199
|
+
end
|
|
118
200
|
end
|
|
119
201
|
end
|
|
120
|
-
if s4.empty?
|
|
121
|
-
self.index = i4
|
|
122
|
-
r4 = nil
|
|
123
|
-
else
|
|
124
|
-
r4 = instantiate_node(SyntaxNode,input, i4...index, s4)
|
|
125
|
-
end
|
|
126
|
-
s1 << r4
|
|
127
202
|
end
|
|
128
203
|
end
|
|
129
204
|
if s1.last
|
|
130
205
|
r1 = instantiate_node(SyntaxNode,input, i1...index, s1)
|
|
131
|
-
r1.extend(
|
|
132
|
-
r1.extend(
|
|
206
|
+
r1.extend(MultinomialWithGarbage0)
|
|
207
|
+
r1.extend(MultinomialWithGarbage1)
|
|
133
208
|
else
|
|
134
209
|
self.index = i1
|
|
135
210
|
r1 = nil
|
|
@@ -137,53 +212,138 @@ module ScientificNameCanonical
|
|
|
137
212
|
if r1
|
|
138
213
|
r0 = r1
|
|
139
214
|
else
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
if
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
if
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
if r10
|
|
156
|
-
s9 << r10
|
|
157
|
-
else
|
|
158
|
-
break
|
|
215
|
+
i9, s9 = index, []
|
|
216
|
+
r10 = _nt_genus
|
|
217
|
+
s9 << r10
|
|
218
|
+
if r10
|
|
219
|
+
r11 = _nt_space
|
|
220
|
+
s9 << r11
|
|
221
|
+
if r11
|
|
222
|
+
r12 = _nt_subgenus
|
|
223
|
+
s9 << r12
|
|
224
|
+
if r12
|
|
225
|
+
r13 = _nt_space_hard
|
|
226
|
+
s9 << r13
|
|
227
|
+
if r13
|
|
228
|
+
r14 = _nt_garbage
|
|
229
|
+
s9 << r14
|
|
159
230
|
end
|
|
160
231
|
end
|
|
161
|
-
if s9.empty?
|
|
162
|
-
self.index = i9
|
|
163
|
-
r9 = nil
|
|
164
|
-
else
|
|
165
|
-
r9 = instantiate_node(SyntaxNode,input, i9...index, s9)
|
|
166
|
-
end
|
|
167
|
-
s6 << r9
|
|
168
232
|
end
|
|
169
233
|
end
|
|
170
|
-
if
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
234
|
+
if s9.last
|
|
235
|
+
r9 = instantiate_node(SyntaxNode,input, i9...index, s9)
|
|
236
|
+
r9.extend(MultinomialWithGarbage2)
|
|
237
|
+
r9.extend(MultinomialWithGarbage3)
|
|
174
238
|
else
|
|
175
|
-
self.index =
|
|
176
|
-
|
|
239
|
+
self.index = i9
|
|
240
|
+
r9 = nil
|
|
177
241
|
end
|
|
178
|
-
if
|
|
179
|
-
r0 =
|
|
242
|
+
if r9
|
|
243
|
+
r0 = r9
|
|
180
244
|
else
|
|
181
|
-
|
|
182
|
-
|
|
245
|
+
i15, s15 = index, []
|
|
246
|
+
r16 = _nt_genus
|
|
247
|
+
s15 << r16
|
|
248
|
+
if r16
|
|
249
|
+
r17 = _nt_space
|
|
250
|
+
s15 << r17
|
|
251
|
+
if r17
|
|
252
|
+
r18 = _nt_species
|
|
253
|
+
s15 << r18
|
|
254
|
+
if r18
|
|
255
|
+
r19 = _nt_space_hard
|
|
256
|
+
s15 << r19
|
|
257
|
+
if r19
|
|
258
|
+
r20 = _nt_garbage
|
|
259
|
+
s15 << r20
|
|
260
|
+
end
|
|
261
|
+
end
|
|
262
|
+
end
|
|
263
|
+
end
|
|
264
|
+
if s15.last
|
|
265
|
+
r15 = instantiate_node(SyntaxNode,input, i15...index, s15)
|
|
266
|
+
r15.extend(MultinomialWithGarbage4)
|
|
267
|
+
r15.extend(MultinomialWithGarbage5)
|
|
268
|
+
else
|
|
269
|
+
self.index = i15
|
|
270
|
+
r15 = nil
|
|
271
|
+
end
|
|
272
|
+
if r15
|
|
273
|
+
r0 = r15
|
|
274
|
+
else
|
|
275
|
+
self.index = i0
|
|
276
|
+
r0 = nil
|
|
277
|
+
end
|
|
278
|
+
end
|
|
279
|
+
end
|
|
280
|
+
|
|
281
|
+
node_cache[:multinomial_with_garbage][start_index] = r0
|
|
282
|
+
|
|
283
|
+
return r0
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
module UninomialWithGarbage0
|
|
287
|
+
def a
|
|
288
|
+
elements[0]
|
|
289
|
+
end
|
|
290
|
+
|
|
291
|
+
def space_hard
|
|
292
|
+
elements[1]
|
|
293
|
+
end
|
|
294
|
+
|
|
295
|
+
def b
|
|
296
|
+
elements[2]
|
|
297
|
+
end
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
module UninomialWithGarbage1
|
|
301
|
+
def value
|
|
302
|
+
a.value
|
|
303
|
+
end
|
|
304
|
+
|
|
305
|
+
def canonical
|
|
306
|
+
a.canonical
|
|
307
|
+
end
|
|
308
|
+
|
|
309
|
+
def pos
|
|
310
|
+
a.pos
|
|
311
|
+
end
|
|
312
|
+
|
|
313
|
+
def details
|
|
314
|
+
{:uninomial => a.details[:uninomial]}
|
|
315
|
+
end
|
|
316
|
+
end
|
|
317
|
+
|
|
318
|
+
def _nt_uninomial_with_garbage
|
|
319
|
+
start_index = index
|
|
320
|
+
if node_cache[:uninomial_with_garbage].has_key?(index)
|
|
321
|
+
cached = node_cache[:uninomial_with_garbage][index]
|
|
322
|
+
@index = cached.interval.end if cached
|
|
323
|
+
return cached
|
|
324
|
+
end
|
|
325
|
+
|
|
326
|
+
i0, s0 = index, []
|
|
327
|
+
r1 = _nt_uninomial_epitheton
|
|
328
|
+
s0 << r1
|
|
329
|
+
if r1
|
|
330
|
+
r2 = _nt_space_hard
|
|
331
|
+
s0 << r2
|
|
332
|
+
if r2
|
|
333
|
+
r3 = _nt_garbage
|
|
334
|
+
s0 << r3
|
|
183
335
|
end
|
|
184
336
|
end
|
|
337
|
+
if s0.last
|
|
338
|
+
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
|
|
339
|
+
r0.extend(UninomialWithGarbage0)
|
|
340
|
+
r0.extend(UninomialWithGarbage1)
|
|
341
|
+
else
|
|
342
|
+
self.index = i0
|
|
343
|
+
r0 = nil
|
|
344
|
+
end
|
|
185
345
|
|
|
186
|
-
node_cache[:
|
|
346
|
+
node_cache[:uninomial_with_garbage][start_index] = r0
|
|
187
347
|
|
|
188
348
|
return r0
|
|
189
349
|
end
|
|
@@ -198,7 +358,7 @@ module ScientificNameCanonical
|
|
|
198
358
|
|
|
199
359
|
s0, i0 = [], index
|
|
200
360
|
loop do
|
|
201
|
-
if input.index(Regexp.new('[
|
|
361
|
+
if input.index(Regexp.new('[^ш]'), index) == index
|
|
202
362
|
r1 = instantiate_node(SyntaxNode,input, index...(index + 1))
|
|
203
363
|
@index += 1
|
|
204
364
|
else
|
|
@@ -210,7 +370,12 @@ module ScientificNameCanonical
|
|
|
210
370
|
break
|
|
211
371
|
end
|
|
212
372
|
end
|
|
213
|
-
|
|
373
|
+
if s0.empty?
|
|
374
|
+
self.index = i0
|
|
375
|
+
r0 = nil
|
|
376
|
+
else
|
|
377
|
+
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
|
|
378
|
+
end
|
|
214
379
|
|
|
215
380
|
node_cache[:garbage][start_index] = r0
|
|
216
381
|
|