dimus-biodiversity 0.0.18 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/biodiversity/parser.rb +18 -35
- data/lib/biodiversity/parser/scientific_name_canonical.rb +248 -83
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +46 -20
- data/lib/biodiversity/parser/scientific_name_clean.rb +3304 -3409
- data/lib/biodiversity/parser/scientific_name_clean.treetop +539 -500
- data/lib/biodiversity/parser/scientific_name_dirty.rb +362 -213
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +123 -98
- data/spec/parser/scientific_name.spec.rb +7 -28
- data/spec/parser/scientific_name_canonical.spec.rb +7 -6
- data/spec/parser/scientific_name_clean.spec.rb +256 -260
- data/spec/parser/scientific_name_dirty.spec.rb +62 -52
- metadata +2 -2
data/lib/biodiversity/parser.rb
CHANGED
@@ -13,54 +13,37 @@ class ScientificNameParser
|
|
13
13
|
@clean = ScientificNameCleanParser.new
|
14
14
|
@dirty = ScientificNameDirtyParser.new
|
15
15
|
@canonical = ScientificNameCanonicalParser.new
|
16
|
-
@
|
16
|
+
@parser = nil
|
17
17
|
end
|
18
18
|
|
19
19
|
def parse(a_string)
|
20
20
|
@verbatim = a_string
|
21
|
-
@
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
def value
|
26
|
-
@node.value if @node
|
27
|
-
end
|
28
|
-
|
29
|
-
def pos
|
30
|
-
@node.pos if @node
|
31
|
-
end
|
32
|
-
|
33
|
-
def details
|
34
|
-
@node.details if @node
|
35
|
-
end
|
36
|
-
|
37
|
-
def canonical
|
38
|
-
@node.canonical if @node
|
39
|
-
end
|
40
|
-
|
41
|
-
def to_json
|
42
|
-
parsed = !!@node
|
43
|
-
if parsed
|
21
|
+
@parser = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string)
|
22
|
+
def @parser.to_json
|
23
|
+
parsed = !!self
|
44
24
|
res = {
|
45
25
|
:parsed => parsed,
|
46
|
-
:verbatim =>
|
26
|
+
:verbatim => self.text_value }
|
47
27
|
if parsed
|
48
28
|
res.merge!({
|
49
|
-
:normalized =>
|
50
|
-
:canonical =>
|
29
|
+
:normalized => self.value,
|
30
|
+
:canonical => self.canonical
|
51
31
|
})
|
52
|
-
|
32
|
+
data = self.details
|
33
|
+
if data[:species] && data[:species][:namedHybrid]
|
34
|
+
data[:species].delete(:namedHybrid)
|
35
|
+
data = {:namedHybrid => data}
|
36
|
+
end
|
37
|
+
res.merge!(data)
|
53
38
|
end
|
54
39
|
res = {:scientificName => res}
|
55
40
|
JSON.generate res
|
56
|
-
else
|
57
|
-
JSON.generate({:parsed => parsed, :verbatim => @verbatim})
|
58
41
|
end
|
42
|
+
|
43
|
+
def @parser.pos_json
|
44
|
+
JSON.generate self.pos rescue ''
|
45
|
+
end
|
46
|
+
@parser
|
59
47
|
end
|
60
|
-
|
61
|
-
def pos_to_json
|
62
|
-
JSON.generate @node.pos rescue ''
|
63
|
-
end
|
64
|
-
|
65
48
|
end
|
66
49
|
|
@@ -3,29 +3,41 @@ module ScientificNameCanonical
|
|
3
3
|
include Treetop::Runtime
|
4
4
|
|
5
5
|
def root
|
6
|
-
@root || :
|
6
|
+
@root || :root
|
7
7
|
end
|
8
8
|
|
9
9
|
include ScientificNameClean
|
10
10
|
|
11
11
|
include ScientificNameDirty
|
12
12
|
|
13
|
-
def
|
13
|
+
def _nt_root
|
14
14
|
start_index = index
|
15
|
-
if node_cache[:
|
16
|
-
cached = node_cache[:
|
15
|
+
if node_cache[:root].has_key?(index)
|
16
|
+
cached = node_cache[:root][index]
|
17
17
|
@index = cached.interval.end if cached
|
18
18
|
return cached
|
19
19
|
end
|
20
20
|
|
21
|
-
|
21
|
+
i0 = index
|
22
|
+
r1 = _nt_multinomial_with_garbage
|
23
|
+
if r1
|
24
|
+
r0 = r1
|
25
|
+
else
|
26
|
+
r2 = _nt_uninomial_with_garbage
|
27
|
+
if r2
|
28
|
+
r0 = r2
|
29
|
+
else
|
30
|
+
self.index = i0
|
31
|
+
r0 = nil
|
32
|
+
end
|
33
|
+
end
|
22
34
|
|
23
|
-
node_cache[:
|
35
|
+
node_cache[:root][start_index] = r0
|
24
36
|
|
25
37
|
return r0
|
26
38
|
end
|
27
39
|
|
28
|
-
module
|
40
|
+
module MultinomialWithGarbage0
|
29
41
|
def a
|
30
42
|
elements[0]
|
31
43
|
end
|
@@ -37,26 +49,43 @@ module ScientificNameCanonical
|
|
37
49
|
def b
|
38
50
|
elements[2]
|
39
51
|
end
|
52
|
+
|
53
|
+
def space
|
54
|
+
elements[3]
|
55
|
+
end
|
56
|
+
|
57
|
+
def c
|
58
|
+
elements[4]
|
59
|
+
end
|
60
|
+
|
61
|
+
def space_hard
|
62
|
+
elements[5]
|
63
|
+
end
|
64
|
+
|
65
|
+
def garbage
|
66
|
+
elements[6]
|
67
|
+
end
|
40
68
|
end
|
41
69
|
|
42
|
-
module
|
43
|
-
def value
|
44
|
-
a.value
|
70
|
+
module MultinomialWithGarbage1
|
71
|
+
def value
|
72
|
+
a.value + " " + b.value + " " + c.value
|
45
73
|
end
|
74
|
+
|
46
75
|
def canonical
|
47
|
-
a.canonical
|
76
|
+
a.canonical + " " + b.canonical + " " + c.canonical
|
48
77
|
end
|
49
78
|
|
50
79
|
def pos
|
51
|
-
a.pos
|
80
|
+
a.pos.merge(b.pos).merge(c.pos)
|
52
81
|
end
|
53
82
|
|
54
83
|
def details
|
55
|
-
a.details.merge(
|
84
|
+
a.details.merge(b.details).merge(c.details)
|
56
85
|
end
|
57
86
|
end
|
58
87
|
|
59
|
-
module
|
88
|
+
module MultinomialWithGarbage2
|
60
89
|
def a
|
61
90
|
elements[0]
|
62
91
|
end
|
@@ -68,68 +97,114 @@ module ScientificNameCanonical
|
|
68
97
|
def b
|
69
98
|
elements[2]
|
70
99
|
end
|
100
|
+
|
101
|
+
def space_hard
|
102
|
+
elements[3]
|
103
|
+
end
|
104
|
+
|
105
|
+
def garbage
|
106
|
+
elements[4]
|
107
|
+
end
|
71
108
|
end
|
72
109
|
|
73
|
-
module
|
74
|
-
def value
|
75
|
-
a.value
|
110
|
+
module MultinomialWithGarbage3
|
111
|
+
def value
|
112
|
+
a.value + " " + b.value
|
76
113
|
end
|
114
|
+
|
77
115
|
def canonical
|
78
|
-
a.canonical
|
116
|
+
a.canonical + " " + b.canonical
|
79
117
|
end
|
80
118
|
|
81
119
|
def pos
|
82
|
-
a.pos
|
120
|
+
a.pos.merge(b.pos)
|
83
121
|
end
|
84
122
|
|
85
123
|
def details
|
86
|
-
a.details.merge(
|
124
|
+
a.details.merge(b.details)
|
87
125
|
end
|
88
126
|
end
|
89
127
|
|
90
|
-
|
128
|
+
module MultinomialWithGarbage4
|
129
|
+
def a
|
130
|
+
elements[0]
|
131
|
+
end
|
132
|
+
|
133
|
+
def space
|
134
|
+
elements[1]
|
135
|
+
end
|
136
|
+
|
137
|
+
def b
|
138
|
+
elements[2]
|
139
|
+
end
|
140
|
+
|
141
|
+
def space_hard
|
142
|
+
elements[3]
|
143
|
+
end
|
144
|
+
|
145
|
+
def garbage
|
146
|
+
elements[4]
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
module MultinomialWithGarbage5
|
151
|
+
def value
|
152
|
+
a.value + " " + b.value
|
153
|
+
end
|
154
|
+
|
155
|
+
def canonical
|
156
|
+
a.canonical + " " + b.canonical
|
157
|
+
end
|
158
|
+
|
159
|
+
def pos
|
160
|
+
a.pos.merge(b.pos)
|
161
|
+
end
|
162
|
+
|
163
|
+
def details
|
164
|
+
a.details.merge(b.details)
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
def _nt_multinomial_with_garbage
|
91
169
|
start_index = index
|
92
|
-
if node_cache[:
|
93
|
-
cached = node_cache[:
|
170
|
+
if node_cache[:multinomial_with_garbage].has_key?(index)
|
171
|
+
cached = node_cache[:multinomial_with_garbage][index]
|
94
172
|
@index = cached.interval.end if cached
|
95
173
|
return cached
|
96
174
|
end
|
97
175
|
|
98
176
|
i0 = index
|
99
177
|
i1, s1 = index, []
|
100
|
-
r2 =
|
178
|
+
r2 = _nt_genus
|
101
179
|
s1 << r2
|
102
180
|
if r2
|
103
181
|
r3 = _nt_space
|
104
182
|
s1 << r3
|
105
183
|
if r3
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
else
|
112
|
-
r5 = nil
|
113
|
-
end
|
184
|
+
r4 = _nt_subgenus
|
185
|
+
s1 << r4
|
186
|
+
if r4
|
187
|
+
r5 = _nt_space
|
188
|
+
s1 << r5
|
114
189
|
if r5
|
115
|
-
|
116
|
-
|
117
|
-
|
190
|
+
r6 = _nt_species
|
191
|
+
s1 << r6
|
192
|
+
if r6
|
193
|
+
r7 = _nt_space_hard
|
194
|
+
s1 << r7
|
195
|
+
if r7
|
196
|
+
r8 = _nt_garbage
|
197
|
+
s1 << r8
|
198
|
+
end
|
199
|
+
end
|
118
200
|
end
|
119
201
|
end
|
120
|
-
if s4.empty?
|
121
|
-
self.index = i4
|
122
|
-
r4 = nil
|
123
|
-
else
|
124
|
-
r4 = instantiate_node(SyntaxNode,input, i4...index, s4)
|
125
|
-
end
|
126
|
-
s1 << r4
|
127
202
|
end
|
128
203
|
end
|
129
204
|
if s1.last
|
130
205
|
r1 = instantiate_node(SyntaxNode,input, i1...index, s1)
|
131
|
-
r1.extend(
|
132
|
-
r1.extend(
|
206
|
+
r1.extend(MultinomialWithGarbage0)
|
207
|
+
r1.extend(MultinomialWithGarbage1)
|
133
208
|
else
|
134
209
|
self.index = i1
|
135
210
|
r1 = nil
|
@@ -137,53 +212,138 @@ module ScientificNameCanonical
|
|
137
212
|
if r1
|
138
213
|
r0 = r1
|
139
214
|
else
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
if
|
144
|
-
|
145
|
-
|
146
|
-
if
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
if r10
|
156
|
-
s9 << r10
|
157
|
-
else
|
158
|
-
break
|
215
|
+
i9, s9 = index, []
|
216
|
+
r10 = _nt_genus
|
217
|
+
s9 << r10
|
218
|
+
if r10
|
219
|
+
r11 = _nt_space
|
220
|
+
s9 << r11
|
221
|
+
if r11
|
222
|
+
r12 = _nt_subgenus
|
223
|
+
s9 << r12
|
224
|
+
if r12
|
225
|
+
r13 = _nt_space_hard
|
226
|
+
s9 << r13
|
227
|
+
if r13
|
228
|
+
r14 = _nt_garbage
|
229
|
+
s9 << r14
|
159
230
|
end
|
160
231
|
end
|
161
|
-
if s9.empty?
|
162
|
-
self.index = i9
|
163
|
-
r9 = nil
|
164
|
-
else
|
165
|
-
r9 = instantiate_node(SyntaxNode,input, i9...index, s9)
|
166
|
-
end
|
167
|
-
s6 << r9
|
168
232
|
end
|
169
233
|
end
|
170
|
-
if
|
171
|
-
|
172
|
-
|
173
|
-
|
234
|
+
if s9.last
|
235
|
+
r9 = instantiate_node(SyntaxNode,input, i9...index, s9)
|
236
|
+
r9.extend(MultinomialWithGarbage2)
|
237
|
+
r9.extend(MultinomialWithGarbage3)
|
174
238
|
else
|
175
|
-
self.index =
|
176
|
-
|
239
|
+
self.index = i9
|
240
|
+
r9 = nil
|
177
241
|
end
|
178
|
-
if
|
179
|
-
r0 =
|
242
|
+
if r9
|
243
|
+
r0 = r9
|
180
244
|
else
|
181
|
-
|
182
|
-
|
245
|
+
i15, s15 = index, []
|
246
|
+
r16 = _nt_genus
|
247
|
+
s15 << r16
|
248
|
+
if r16
|
249
|
+
r17 = _nt_space
|
250
|
+
s15 << r17
|
251
|
+
if r17
|
252
|
+
r18 = _nt_species
|
253
|
+
s15 << r18
|
254
|
+
if r18
|
255
|
+
r19 = _nt_space_hard
|
256
|
+
s15 << r19
|
257
|
+
if r19
|
258
|
+
r20 = _nt_garbage
|
259
|
+
s15 << r20
|
260
|
+
end
|
261
|
+
end
|
262
|
+
end
|
263
|
+
end
|
264
|
+
if s15.last
|
265
|
+
r15 = instantiate_node(SyntaxNode,input, i15...index, s15)
|
266
|
+
r15.extend(MultinomialWithGarbage4)
|
267
|
+
r15.extend(MultinomialWithGarbage5)
|
268
|
+
else
|
269
|
+
self.index = i15
|
270
|
+
r15 = nil
|
271
|
+
end
|
272
|
+
if r15
|
273
|
+
r0 = r15
|
274
|
+
else
|
275
|
+
self.index = i0
|
276
|
+
r0 = nil
|
277
|
+
end
|
278
|
+
end
|
279
|
+
end
|
280
|
+
|
281
|
+
node_cache[:multinomial_with_garbage][start_index] = r0
|
282
|
+
|
283
|
+
return r0
|
284
|
+
end
|
285
|
+
|
286
|
+
module UninomialWithGarbage0
|
287
|
+
def a
|
288
|
+
elements[0]
|
289
|
+
end
|
290
|
+
|
291
|
+
def space_hard
|
292
|
+
elements[1]
|
293
|
+
end
|
294
|
+
|
295
|
+
def b
|
296
|
+
elements[2]
|
297
|
+
end
|
298
|
+
end
|
299
|
+
|
300
|
+
module UninomialWithGarbage1
|
301
|
+
def value
|
302
|
+
a.value
|
303
|
+
end
|
304
|
+
|
305
|
+
def canonical
|
306
|
+
a.canonical
|
307
|
+
end
|
308
|
+
|
309
|
+
def pos
|
310
|
+
a.pos
|
311
|
+
end
|
312
|
+
|
313
|
+
def details
|
314
|
+
{:uninomial => a.details[:uninomial]}
|
315
|
+
end
|
316
|
+
end
|
317
|
+
|
318
|
+
def _nt_uninomial_with_garbage
|
319
|
+
start_index = index
|
320
|
+
if node_cache[:uninomial_with_garbage].has_key?(index)
|
321
|
+
cached = node_cache[:uninomial_with_garbage][index]
|
322
|
+
@index = cached.interval.end if cached
|
323
|
+
return cached
|
324
|
+
end
|
325
|
+
|
326
|
+
i0, s0 = index, []
|
327
|
+
r1 = _nt_uninomial_epitheton
|
328
|
+
s0 << r1
|
329
|
+
if r1
|
330
|
+
r2 = _nt_space_hard
|
331
|
+
s0 << r2
|
332
|
+
if r2
|
333
|
+
r3 = _nt_garbage
|
334
|
+
s0 << r3
|
183
335
|
end
|
184
336
|
end
|
337
|
+
if s0.last
|
338
|
+
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
|
339
|
+
r0.extend(UninomialWithGarbage0)
|
340
|
+
r0.extend(UninomialWithGarbage1)
|
341
|
+
else
|
342
|
+
self.index = i0
|
343
|
+
r0 = nil
|
344
|
+
end
|
185
345
|
|
186
|
-
node_cache[:
|
346
|
+
node_cache[:uninomial_with_garbage][start_index] = r0
|
187
347
|
|
188
348
|
return r0
|
189
349
|
end
|
@@ -198,7 +358,7 @@ module ScientificNameCanonical
|
|
198
358
|
|
199
359
|
s0, i0 = [], index
|
200
360
|
loop do
|
201
|
-
if input.index(Regexp.new('[
|
361
|
+
if input.index(Regexp.new('[^ш]'), index) == index
|
202
362
|
r1 = instantiate_node(SyntaxNode,input, index...(index + 1))
|
203
363
|
@index += 1
|
204
364
|
else
|
@@ -210,7 +370,12 @@ module ScientificNameCanonical
|
|
210
370
|
break
|
211
371
|
end
|
212
372
|
end
|
213
|
-
|
373
|
+
if s0.empty?
|
374
|
+
self.index = i0
|
375
|
+
r0 = nil
|
376
|
+
else
|
377
|
+
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
|
378
|
+
end
|
214
379
|
|
215
380
|
node_cache[:garbage][start_index] = r0
|
216
381
|
|