biodiversity19 0.5.15 → 0.5.16
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Rakefile +20 -4
- data/VERSION +1 -1
- data/bin/nnparse +2 -2
- data/{biodiversity.gemspec → biodiversity19.gemspec} +11 -8
- data/lib/biodiversity/parser/scientific_name_canonical.rb +9 -3
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +8 -1
- data/lib/biodiversity/parser/scientific_name_clean.rb +362 -386
- data/lib/biodiversity/parser/scientific_name_clean.treetop +39 -45
- data/lib/biodiversity/parser/scientific_name_dirty.rb +215 -2
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +62 -1
- data/lib/biodiversity/parser.rb +1 -0
- data/spec/parser/scientific_name_canonical.spec.rb +1 -2
- data/spec/parser/scientific_name_clean.spec.rb +45 -23
- data/spec/parser/scientific_name_dirty.spec.rb +17 -1
- data/spec/parser/test_data.txt +148 -148
- metadata +23 -11
@@ -22,6 +22,10 @@ grammar ScientificNameClean
|
|
22
22
|
def details
|
23
23
|
a.details.class == Array ? a.details : [a.details]
|
24
24
|
end
|
25
|
+
|
26
|
+
def parser_run
|
27
|
+
1
|
28
|
+
end
|
25
29
|
}
|
26
30
|
end
|
27
31
|
|
@@ -509,7 +513,7 @@ grammar ScientificNameClean
|
|
509
513
|
end
|
510
514
|
|
511
515
|
rule genus
|
512
|
-
a:cap_latin_word !(space_hard author_prefix_word space_hard author_word) {
|
516
|
+
a:(cap_latin_word_pair/cap_latin_word) !(space_hard author_prefix_word space_hard author_word) {
|
513
517
|
def value
|
514
518
|
a.value
|
515
519
|
end
|
@@ -555,7 +559,7 @@ grammar ScientificNameClean
|
|
555
559
|
end
|
556
560
|
|
557
561
|
rule uninomial_string
|
558
|
-
cap_latin_word {
|
562
|
+
(cap_latin_word_pair/cap_latin_word) {
|
559
563
|
def canonical
|
560
564
|
value
|
561
565
|
end
|
@@ -938,6 +942,14 @@ grammar ScientificNameClean
|
|
938
942
|
}
|
939
943
|
end
|
940
944
|
|
945
|
+
rule cap_latin_word_pair
|
946
|
+
a:cap_latin_word "-" b:cap_latin_word {
|
947
|
+
def value
|
948
|
+
a.value + b.value.downcase
|
949
|
+
end
|
950
|
+
}
|
951
|
+
end
|
952
|
+
|
941
953
|
rule cap_latin_word
|
942
954
|
a:([A-Z]/cap_digraph) b:latin_word "?" {
|
943
955
|
def value
|
@@ -951,6 +963,12 @@ grammar ScientificNameClean
|
|
951
963
|
end
|
952
964
|
}
|
953
965
|
/
|
966
|
+
a:("AE"/"OE") b:latin_word {
|
967
|
+
def value
|
968
|
+
a.text_value[0..0] + 'e' + b.value
|
969
|
+
end
|
970
|
+
}
|
971
|
+
/
|
954
972
|
("Ca"/"Ea"/"Ge"/"Ia"/"Io"/"Io"/"Ix"/"Lo"/"Oa"/"Ra"/"Ty"/"Ua"/"Aa"/"Ja"/"Zu"/"La"/"Qu"/"As"/"Ba") {
|
955
973
|
def value
|
956
974
|
text_value
|
@@ -1041,42 +1059,32 @@ grammar ScientificNameClean
|
|
1041
1059
|
end
|
1042
1060
|
|
1043
1061
|
rule latin_word
|
1044
|
-
a:[a-
|
1045
|
-
def value
|
1046
|
-
a.text_value + b.value
|
1047
|
-
end
|
1048
|
-
}
|
1049
|
-
/
|
1050
|
-
a:digraph b:full_name_letters {
|
1062
|
+
a:[a-zëæœ] b:valid_name_letters {
|
1051
1063
|
def value
|
1052
|
-
|
1064
|
+
l = a.text_value
|
1065
|
+
l = 'ae' if l == 'æ'
|
1066
|
+
l = 'oe' if l == 'œ'
|
1067
|
+
l + b.value
|
1053
1068
|
end
|
1054
1069
|
}
|
1055
1070
|
end
|
1056
1071
|
|
1057
|
-
rule full_name_letters
|
1058
|
-
a:digraph b:full_name_letters {
|
1059
|
-
def value
|
1060
|
-
a.value + b.value
|
1061
|
-
end
|
1062
|
-
}
|
1063
|
-
/
|
1064
|
-
a:valid_name_letters b:digraph c:full_name_letters {
|
1065
|
-
def value
|
1066
|
-
a.value + b.value + c.value
|
1067
|
-
end
|
1068
|
-
}
|
1069
|
-
/
|
1070
|
-
valid_name_letters
|
1071
|
-
end
|
1072
|
-
|
1073
1072
|
rule valid_name_letters
|
1074
|
-
[a-z
|
1073
|
+
[a-z\-ëæœ]+ {
|
1075
1074
|
def value
|
1076
|
-
|
1075
|
+
res = ''
|
1076
|
+
text_value.split('').each do |l|
|
1077
|
+
l = 'ae' if l == 'æ'
|
1078
|
+
l = 'oe' if l == 'œ'
|
1079
|
+
# not sure if we should normalize ë as well. It is legal in botanical code, but it
|
1080
|
+
# might be beneficial to normalize it for the reconsiliation purposes
|
1081
|
+
# l = 'e' if l == 'ë'
|
1082
|
+
res << l
|
1083
|
+
end
|
1084
|
+
res
|
1077
1085
|
end
|
1078
1086
|
}
|
1079
|
-
end
|
1087
|
+
end
|
1080
1088
|
|
1081
1089
|
rule cap_digraph
|
1082
1090
|
"Æ" {
|
@@ -1092,20 +1100,6 @@ grammar ScientificNameClean
|
|
1092
1100
|
}
|
1093
1101
|
end
|
1094
1102
|
|
1095
|
-
rule digraph
|
1096
|
-
"æ" {
|
1097
|
-
def value
|
1098
|
-
'ae'
|
1099
|
-
end
|
1100
|
-
}
|
1101
|
-
/
|
1102
|
-
"œ" {
|
1103
|
-
def value
|
1104
|
-
'oe'
|
1105
|
-
end
|
1106
|
-
}
|
1107
|
-
end
|
1108
|
-
|
1109
1103
|
rule year
|
1110
1104
|
b:left_paren space a:(year_number_with_character/year_number) space c:right_paren {
|
1111
1105
|
def value
|
@@ -1177,9 +1171,9 @@ grammar ScientificNameClean
|
|
1177
1171
|
end
|
1178
1172
|
|
1179
1173
|
rule multiplication_sign
|
1180
|
-
"×" {
|
1174
|
+
("×"/"*") {
|
1181
1175
|
def value
|
1182
|
-
|
1176
|
+
"×"
|
1183
1177
|
end
|
1184
1178
|
}
|
1185
1179
|
end
|
@@ -3,11 +3,51 @@ module ScientificNameDirty
|
|
3
3
|
include Treetop::Runtime
|
4
4
|
|
5
5
|
def root
|
6
|
-
@root
|
6
|
+
@root ||= :root
|
7
7
|
end
|
8
8
|
|
9
9
|
include ScientificNameClean
|
10
10
|
|
11
|
+
module Root0
|
12
|
+
def space1
|
13
|
+
elements[0]
|
14
|
+
end
|
15
|
+
|
16
|
+
def a
|
17
|
+
elements[1]
|
18
|
+
end
|
19
|
+
|
20
|
+
def space2
|
21
|
+
elements[2]
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
module Root1
|
26
|
+
def value
|
27
|
+
a.value.gsub(/\s{2,}/, ' ').strip
|
28
|
+
end
|
29
|
+
|
30
|
+
def canonical
|
31
|
+
a.canonical.gsub(/\s{2,}/, ' ').strip
|
32
|
+
end
|
33
|
+
|
34
|
+
def pos
|
35
|
+
a.pos
|
36
|
+
end
|
37
|
+
|
38
|
+
def hybrid
|
39
|
+
a.hybrid
|
40
|
+
end
|
41
|
+
|
42
|
+
def details
|
43
|
+
a.details.class == Array ? a.details : [a.details]
|
44
|
+
end
|
45
|
+
|
46
|
+
def parser_run
|
47
|
+
2
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
11
51
|
def _nt_root
|
12
52
|
start_index = index
|
13
53
|
if node_cache[:root].has_key?(index)
|
@@ -19,7 +59,25 @@ module ScientificNameDirty
|
|
19
59
|
return cached
|
20
60
|
end
|
21
61
|
|
22
|
-
|
62
|
+
i0, s0 = index, []
|
63
|
+
r1 = _nt_space
|
64
|
+
s0 << r1
|
65
|
+
if r1
|
66
|
+
r2 = _nt_scientific_name_5
|
67
|
+
s0 << r2
|
68
|
+
if r2
|
69
|
+
r3 = _nt_space
|
70
|
+
s0 << r3
|
71
|
+
end
|
72
|
+
end
|
73
|
+
if s0.last
|
74
|
+
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
|
75
|
+
r0.extend(Root0)
|
76
|
+
r0.extend(Root1)
|
77
|
+
else
|
78
|
+
@index = i0
|
79
|
+
r0 = nil
|
80
|
+
end
|
23
81
|
|
24
82
|
node_cache[:root][start_index] = r0
|
25
83
|
|
@@ -329,6 +387,161 @@ module ScientificNameDirty
|
|
329
387
|
r0
|
330
388
|
end
|
331
389
|
|
390
|
+
module LatinWord0
|
391
|
+
def a
|
392
|
+
elements[0]
|
393
|
+
end
|
394
|
+
|
395
|
+
def b
|
396
|
+
elements[1]
|
397
|
+
end
|
398
|
+
end
|
399
|
+
|
400
|
+
module LatinWord1
|
401
|
+
def value
|
402
|
+
res = ''
|
403
|
+
text_value.split('').each do |l|
|
404
|
+
l = 'ae' if l == 'æ'
|
405
|
+
l = 'oe' if l == 'œ'
|
406
|
+
res << l
|
407
|
+
end
|
408
|
+
res.tr('àâåãäáçčéèíìïňññóòôøõöúùürŕřŗššşž',
|
409
|
+
'aaaaaacceeiiinnnoooooouuurrrrsssz')
|
410
|
+
end
|
411
|
+
end
|
412
|
+
|
413
|
+
def _nt_latin_word
|
414
|
+
start_index = index
|
415
|
+
if node_cache[:latin_word].has_key?(index)
|
416
|
+
cached = node_cache[:latin_word][index]
|
417
|
+
if cached
|
418
|
+
cached = SyntaxNode.new(input, index...(index + 1)) if cached == true
|
419
|
+
@index = cached.interval.end
|
420
|
+
end
|
421
|
+
return cached
|
422
|
+
end
|
423
|
+
|
424
|
+
i0, s0 = index, []
|
425
|
+
if has_terminal?('\G[a-z\\-ëæœàâåãäáçčéèíìïňññóòôøõöúùürŕřŗššşž]', true, index)
|
426
|
+
r1 = true
|
427
|
+
@index += 1
|
428
|
+
else
|
429
|
+
r1 = nil
|
430
|
+
end
|
431
|
+
s0 << r1
|
432
|
+
if r1
|
433
|
+
r2 = _nt_valid_name_letters
|
434
|
+
s0 << r2
|
435
|
+
end
|
436
|
+
if s0.last
|
437
|
+
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
|
438
|
+
r0.extend(LatinWord0)
|
439
|
+
r0.extend(LatinWord1)
|
440
|
+
else
|
441
|
+
@index = i0
|
442
|
+
r0 = nil
|
443
|
+
end
|
444
|
+
|
445
|
+
node_cache[:latin_word][start_index] = r0
|
446
|
+
|
447
|
+
r0
|
448
|
+
end
|
449
|
+
|
450
|
+
module ValidNameLetters0
|
451
|
+
def value
|
452
|
+
res = ''
|
453
|
+
text_value.split('').each do |l|
|
454
|
+
l = 'ae' if l == 'æ'
|
455
|
+
l = 'oe' if l == 'œ'
|
456
|
+
res << l
|
457
|
+
end
|
458
|
+
res.tr('àâåãäáçčéèíìïňññóòôøõöúùürŕřŗššşž',
|
459
|
+
'aaaaaacceeiiinnnoooooouuurrrrsssz')
|
460
|
+
end
|
461
|
+
end
|
462
|
+
|
463
|
+
def _nt_valid_name_letters
|
464
|
+
start_index = index
|
465
|
+
if node_cache[:valid_name_letters].has_key?(index)
|
466
|
+
cached = node_cache[:valid_name_letters][index]
|
467
|
+
if cached
|
468
|
+
cached = SyntaxNode.new(input, index...(index + 1)) if cached == true
|
469
|
+
@index = cached.interval.end
|
470
|
+
end
|
471
|
+
return cached
|
472
|
+
end
|
473
|
+
|
474
|
+
s0, i0 = [], index
|
475
|
+
loop do
|
476
|
+
if has_terminal?('\G[a-z\\-ëæœàâåãäáçčéèíìïňññóòôøõöúùürŕřŗššşž]', true, index)
|
477
|
+
r1 = true
|
478
|
+
@index += 1
|
479
|
+
else
|
480
|
+
r1 = nil
|
481
|
+
end
|
482
|
+
if r1
|
483
|
+
s0 << r1
|
484
|
+
else
|
485
|
+
break
|
486
|
+
end
|
487
|
+
end
|
488
|
+
if s0.empty?
|
489
|
+
@index = i0
|
490
|
+
r0 = nil
|
491
|
+
else
|
492
|
+
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
|
493
|
+
r0.extend(ValidNameLetters0)
|
494
|
+
end
|
495
|
+
|
496
|
+
node_cache[:valid_name_letters][start_index] = r0
|
497
|
+
|
498
|
+
r0
|
499
|
+
end
|
500
|
+
|
501
|
+
module ValidNameLetters0
|
502
|
+
def value
|
503
|
+
text_value
|
504
|
+
end
|
505
|
+
end
|
506
|
+
|
507
|
+
def _nt_valid_name_letters
|
508
|
+
start_index = index
|
509
|
+
if node_cache[:valid_name_letters].has_key?(index)
|
510
|
+
cached = node_cache[:valid_name_letters][index]
|
511
|
+
if cached
|
512
|
+
cached = SyntaxNode.new(input, index...(index + 1)) if cached == true
|
513
|
+
@index = cached.interval.end
|
514
|
+
end
|
515
|
+
return cached
|
516
|
+
end
|
517
|
+
|
518
|
+
s0, i0 = [], index
|
519
|
+
loop do
|
520
|
+
if has_terminal?('\G[a-z\\-ëüäöïéåóç]', true, index)
|
521
|
+
r1 = true
|
522
|
+
@index += 1
|
523
|
+
else
|
524
|
+
r1 = nil
|
525
|
+
end
|
526
|
+
if r1
|
527
|
+
s0 << r1
|
528
|
+
else
|
529
|
+
break
|
530
|
+
end
|
531
|
+
end
|
532
|
+
if s0.empty?
|
533
|
+
@index = i0
|
534
|
+
r0 = nil
|
535
|
+
else
|
536
|
+
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
|
537
|
+
r0.extend(ValidNameLetters0)
|
538
|
+
end
|
539
|
+
|
540
|
+
node_cache[:valid_name_letters][start_index] = r0
|
541
|
+
|
542
|
+
r0
|
543
|
+
end
|
544
|
+
|
332
545
|
module RightParen0
|
333
546
|
def space
|
334
547
|
elements[1]
|
@@ -3,7 +3,31 @@ grammar ScientificNameDirty
|
|
3
3
|
include ScientificNameClean
|
4
4
|
|
5
5
|
rule root
|
6
|
-
|
6
|
+
space a:scientific_name_5 space {
|
7
|
+
def value
|
8
|
+
a.value.gsub(/\s{2,}/, ' ').strip
|
9
|
+
end
|
10
|
+
|
11
|
+
def canonical
|
12
|
+
a.canonical.gsub(/\s{2,}/, ' ').strip
|
13
|
+
end
|
14
|
+
|
15
|
+
def pos
|
16
|
+
a.pos
|
17
|
+
end
|
18
|
+
|
19
|
+
def hybrid
|
20
|
+
a.hybrid
|
21
|
+
end
|
22
|
+
|
23
|
+
def details
|
24
|
+
a.details.class == Array ? a.details : [a.details]
|
25
|
+
end
|
26
|
+
|
27
|
+
def parser_run
|
28
|
+
2
|
29
|
+
end
|
30
|
+
}
|
7
31
|
end
|
8
32
|
|
9
33
|
rule scientific_name_5
|
@@ -90,6 +114,43 @@ grammar ScientificNameDirty
|
|
90
114
|
super
|
91
115
|
end
|
92
116
|
|
117
|
+
rule latin_word
|
118
|
+
a:[a-z\-ëæœàâåãäáçčéèíìïňññóòôøõöúùürŕřŗššşž] b:valid_name_letters {
|
119
|
+
def value
|
120
|
+
res = ''
|
121
|
+
text_value.split('').each do |l|
|
122
|
+
l = 'ae' if l == 'æ'
|
123
|
+
l = 'oe' if l == 'œ'
|
124
|
+
res << l
|
125
|
+
end
|
126
|
+
res.tr('àâåãäáçčéèíìïňññóòôøõöúùürŕřŗššşž',
|
127
|
+
'aaaaaacceeiiinnnoooooouuurrrrsssz')
|
128
|
+
end
|
129
|
+
}
|
130
|
+
end
|
131
|
+
|
132
|
+
rule valid_name_letters
|
133
|
+
[a-z\-ëæœàâåãäáçčéèíìïňññóòôøõöúùürŕřŗššşž]+ {
|
134
|
+
def value
|
135
|
+
res = ''
|
136
|
+
text_value.split('').each do |l|
|
137
|
+
l = 'ae' if l == 'æ'
|
138
|
+
l = 'oe' if l == 'œ'
|
139
|
+
res << l
|
140
|
+
end
|
141
|
+
res.tr('àâåãäáçčéèíìïňññóòôøõöúùürŕřŗššşž',
|
142
|
+
'aaaaaacceeiiinnnoooooouuurrrrsssz')
|
143
|
+
end
|
144
|
+
}
|
145
|
+
end
|
146
|
+
rule valid_name_letters
|
147
|
+
[a-z\-ëüäöïéåóç]+ {
|
148
|
+
def value
|
149
|
+
text_value
|
150
|
+
end
|
151
|
+
}
|
152
|
+
end
|
153
|
+
|
93
154
|
rule right_paren
|
94
155
|
")" space ")"
|
95
156
|
/
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -6,8 +6,7 @@ describe ScientificNameCanonical do
|
|
6
6
|
before(:all) do
|
7
7
|
set_parser(ScientificNameCanonicalParser.new)
|
8
8
|
end
|
9
|
-
|
10
|
-
|
9
|
+
|
11
10
|
it 'should parse names with valid name part and unparseable rest' do
|
12
11
|
[
|
13
12
|
['Morea ssjjlajajaj324$33 234243242','Morea', [{:uninomial=>{:string=>"Morea"}}], {0=>["uninomial", 5]}],
|
@@ -97,6 +97,7 @@ describe ScientificNameClean do
|
|
97
97
|
["Leœptura laetifica Dow, 1913", "Leoeptura laetifica Dow 1913"],
|
98
98
|
['Ærenea cognata Lacordaire, 1872', 'Aerenea cognata Lacordaire 1872'],
|
99
99
|
['Œdicnemus capensis', 'Oedicnemus capensis'],
|
100
|
+
['Œnanthæ œnanthe','Oenanthae oenanthe'],
|
100
101
|
['Œnanthe œnanthe','Oenanthe oenanthe']
|
101
102
|
]
|
102
103
|
names.each do |name_pair|
|
@@ -105,19 +106,13 @@ describe ScientificNameClean do
|
|
105
106
|
end
|
106
107
|
end
|
107
108
|
|
108
|
-
it 'should parse names with
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
parse(sn).should_not be_nil
|
114
|
-
value(sn).should == "Trematosphaeria phaeospora (E. Müll.) L. Holm 1957"
|
115
|
-
canonical(sn).should == "Trematosphaeria phaeospora"
|
116
|
-
details(sn).should == [{:genus=>{:string=>"Trematosphaeria"}, :species=>{:string=>"phaeospora", :authorship=>"(E. Müll.) L. Holm 1957", :combinationAuthorTeam=>{:authorTeam=>"L. Holm", :author=>["L. Holm"], :year=>"1957"}, :basionymAuthorTeam=>{:authorTeam=>"E. Müll.", :author=>["E. Müll."]}}}]
|
117
|
-
pos(sn).should == {0=>["genus", 15], 16=>["species", 26], 28=>["author_word", 30], 31=>["author_word", 36], 46=>["author_word", 48], 61=>["author_word", 65], 66=>["year", 70]}
|
118
|
-
|
109
|
+
it 'should parse names with e-umlaut' do
|
110
|
+
sn = 'Kalanchoë tuberosa'
|
111
|
+
canonical(sn).should == 'Kalanchoë tuberosa'
|
112
|
+
sn = 'Isoëtes asplundii H. P. Fuchs'
|
113
|
+
canonical(sn).should == 'Isoëtes asplundii'
|
119
114
|
end
|
120
|
-
|
115
|
+
|
121
116
|
it 'should parse infragenus (ICZN code)' do
|
122
117
|
sn = "Hegeter (Hegeter) intercedens Lindberg H 1950"
|
123
118
|
parse(sn).should_not be_nil
|
@@ -298,6 +293,8 @@ describe ScientificNameClean do
|
|
298
293
|
sn = 'Latrodectus 3guttatus Thorell, 1875'
|
299
294
|
canonical(sn).should == 'Latrodectus 3-guttatus'
|
300
295
|
value(sn).should == 'Latrodectus 3-guttatus Thorell 1875'
|
296
|
+
sn = 'Balaninus c-album Schönherr, CJ., 1836'
|
297
|
+
canonical(sn).should == 'Balaninus c-album'
|
301
298
|
end
|
302
299
|
|
303
300
|
it "should parse name with morph." do
|
@@ -448,13 +445,13 @@ describe ScientificNameClean do
|
|
448
445
|
end
|
449
446
|
|
450
447
|
it 'should parse names with taxon concept' do
|
451
|
-
sn = "
|
448
|
+
sn = "Stenometope laevissimus sec. Eschmeyer 2004"
|
452
449
|
val = @parser.failure_reason.to_s.match(/column [0-9]*/).to_s().gsub(/column /,'')
|
453
|
-
details(sn).should == [{:genus=>{:string=>"
|
450
|
+
details(sn).should == [{:genus=>{:string=>"Stenometope"}, :species=>{:string=>"laevissimus"}, :taxon_concept=>{:authorship=>"Eschmeyer 2004", :basionymAuthorTeam=>{:authorTeam=>"Eschmeyer", :author=>["Eschmeyer"], :year=>"2004"}}}]
|
454
451
|
pos(sn).should == {0=>["genus", 11], 12=>["species", 23], 29=>["author_word", 38], 39=>["year", 43]}
|
455
|
-
sn = "
|
452
|
+
sn = "Stenometope laevissimus Bibron 1855 sec. Eschmeyer 2004"
|
456
453
|
parse(sn).should_not be_nil
|
457
|
-
details(sn).should == [{:genus=>{:string=>"
|
454
|
+
details(sn).should == [{:genus=>{:string=>"Stenometope"}, :species=>{:string=>"laevissimus", :authorship=>"Bibron 1855", :basionymAuthorTeam=>{:authorTeam=>"Bibron", :author=>["Bibron"], :year=>"1855"}}, :taxon_concept=>{:authorship=>"Eschmeyer 2004", :basionymAuthorTeam=>{:authorTeam=>"Eschmeyer", :author=>["Eschmeyer"], :year=>"2004"}}}]
|
458
455
|
pos(sn).should == {0=>["genus", 11], 12=>["species", 23], 24=>["author_word", 30], 31=>["year", 35], 41=>["author_word", 50], 51=>["year", 55]}
|
459
456
|
end
|
460
457
|
|
@@ -462,6 +459,15 @@ describe ScientificNameClean do
|
|
462
459
|
parse(" Asplenium X inexpectatum (E.L. Braun 1940) Morton (1956) ").should_not be_nil
|
463
460
|
end
|
464
461
|
|
462
|
+
it 'should parse names with any number of spaces' do
|
463
|
+
sn = "Trematosphaeria phaeospora (E. Müll.) L. Holm 1957"
|
464
|
+
parse(sn).should_not be_nil
|
465
|
+
value(sn).should == "Trematosphaeria phaeospora (E. Müll.) L. Holm 1957"
|
466
|
+
canonical(sn).should == "Trematosphaeria phaeospora"
|
467
|
+
details(sn).should == [{:genus=>{:string=>"Trematosphaeria"}, :species=>{:string=>"phaeospora", :authorship=>"(E. Müll.) L. Holm 1957", :combinationAuthorTeam=>{:authorTeam=>"L. Holm", :author=>["L. Holm"], :year=>"1957"}, :basionymAuthorTeam=>{:authorTeam=>"E. Müll.", :author=>["E. Müll."]}}}]
|
468
|
+
pos(sn).should == {0=>["genus", 15], 16=>["species", 26], 28=>["author_word", 30], 31=>["author_word", 36], 46=>["author_word", 48], 61=>["author_word", 65], 66=>["year", 70]}
|
469
|
+
end
|
470
|
+
|
465
471
|
it 'should not parse serveral authors groups with several years NOT CORRECT' do
|
466
472
|
parse("Pseudocercospora dendrobii (H.C. Burnett 1883) (Leight.) (Movss. 1967) U. Braun & Crous 2003").should be_nil
|
467
473
|
end
|
@@ -469,23 +475,17 @@ describe ScientificNameClean do
|
|
469
475
|
it "should not parse unallowed utf-8 chars in name part" do
|
470
476
|
parse("Érematosphaeria phaespora").should be_nil
|
471
477
|
parse("Trematosphaeria phaeáapora").should be_nil
|
472
|
-
parse("Trematоsphaeria
|
478
|
+
parse("Trematоsphaeria phaeaapora").should be_nil #cyrillic o
|
473
479
|
end
|
474
480
|
|
475
481
|
it "should parse new stuff" do
|
476
482
|
sn = 'Nesticus quelpartensis Paik & Namkung, in Paik, Yaginuma & Namkung, 1969'
|
477
483
|
details(sn).should == [{:genus=>{:string=>"Nesticus"}, :species=>{:string=>"quelpartensis", :authorship=>"Paik & Namkung, in Paik, Yaginuma & Namkung, 1969", :basionymAuthorTeam=>{:authorTeam=>"Paik & Namkung", :author=>["Paik", "Namkung"], :exAuthorTeam=>{:authorTeam=>"Paik, Yaginuma & Namkung", :author=>["Paik", "Yaginuma", "Namkung"], :year=>"1969"}}}}]
|
478
484
|
parse('Dipoena yoshidai Ono, in Ono et al., 1991').should_not be_nil
|
479
|
-
sn = 'Choriozopella trägårdhi Lawrence, 1947'
|
480
|
-
details(sn).should == [{:genus=>{:string=>"Choriozopella"}, :species=>{:string=>"trägårdhi", :authorship=>"Lawrence, 1947", :basionymAuthorTeam=>{:authorTeam=>"Lawrence", :author=>["Lawrence"], :year=>"1947"}}}]
|
481
485
|
sn = 'Latrodectus mactans bishopi Kaston, 1938'
|
482
486
|
details(sn).should == [{:genus=>{:string=>"Latrodectus"}, :species=>{:string=>"mactans"}, :infraspecies=>[{:string=>"bishopi", :rank=>"n/a", :authorship=>"Kaston, 1938", :basionymAuthorTeam=>{:authorTeam=>"Kaston", :author=>["Kaston"], :year=>"1938"}}]}]
|
483
487
|
sn = 'Diplocephalus aff. procerus Thaler, 1972'
|
484
488
|
details(sn).should == [{:genus=>{:string=>"Diplocephalus"}, :species=>{:string=>"procerus", :authorship=>"Thaler, 1972", :basionymAuthorTeam=>{:authorTeam=>"Thaler", :author=>["Thaler"], :year=>"1972"}}}]
|
485
|
-
sn = 'Dyarcyops birói Kulczynski, 1908'
|
486
|
-
details(sn).should == [{:genus=>{:string=>"Dyarcyops"}, :species=>{:string=>"birói", :authorship=>"Kulczynski, 1908", :basionymAuthorTeam=>{:authorTeam=>"Kulczynski", :author=>["Kulczynski"], :year=>"1908"}}}]
|
487
|
-
sn = 'Sparassus françoisi Simon, 1898'
|
488
|
-
details(sn).should == [{:genus=>{:string=>"Sparassus"}, :species=>{:string=>"françoisi", :authorship=>"Simon, 1898", :basionymAuthorTeam=>{:authorTeam=>"Simon", :author=>["Simon"], :year=>"1898"}}}]
|
489
489
|
sn = 'Thiobacillus x Parker and Prisk 1953' #have to figure out black lists for this one
|
490
490
|
sn = 'Bacille de Plaut, Kritchevsky and Séguin 1921'
|
491
491
|
details(sn).should == [{:uninomial=>{:string=>"Bacille", :authorship=>"de Plaut, Kritchevsky and Séguin 1921", :basionymAuthorTeam=>{:authorTeam=>"de Plaut, Kritchevsky and Séguin", :author=>["de Plaut", "Kritchevsky", "Séguin"], :year=>"1921"}}}]
|
@@ -501,4 +501,26 @@ describe ScientificNameClean do
|
|
501
501
|
details(sn).should == [{:genus=>{:string=>"Flexibacter"}, :species=>{:string=>"elegans", :authorship=>"Soriano 1945, non Lewin 1969", :basionymAuthorTeam=>{:authorTeam=>"Soriano", :author=>["Soriano"], :year=>"1945"}}}]
|
502
502
|
end
|
503
503
|
|
504
|
+
# it 'should parse hybrid names with capitalized second name in genus (botanical code error)' do
|
505
|
+
# sn = 'Anacampti-Platanthera P. Fourn.'
|
506
|
+
# @parser.parse(sn)
|
507
|
+
# puts @parser.failure_reason
|
508
|
+
# parse(sn).should_not be_nil
|
509
|
+
# canonical(sn).should == 'Anacamptiplatanthera'
|
510
|
+
# sn = 'Anacampti-Platanthera vulgaris P. Fourn.'
|
511
|
+
# parse(sn).should_not be_nil
|
512
|
+
# canonical(sn).should == 'Anacamptiplatanthera'
|
513
|
+
# end
|
514
|
+
|
515
|
+
# it 'shoud parse hybrid names with * character' do
|
516
|
+
# sn = "Carduus acanthoides * crispus"
|
517
|
+
# details(sn).should == ''
|
518
|
+
# end
|
519
|
+
|
520
|
+
it 'should parse genus names starting with uppercase letters AE OE' do
|
521
|
+
sn = 'AEmona separata Broun 1921'
|
522
|
+
canonical(sn).should == 'Aemona separata'
|
523
|
+
sn = 'OEmona simplex White, 1855'
|
524
|
+
canonical(sn).should == 'Oemona simplex'
|
525
|
+
end
|
504
526
|
end
|
@@ -7,7 +7,6 @@ describe ScientificNameDirty do
|
|
7
7
|
set_parser(ScientificNameDirtyParser.new)
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
10
|
it 'should parse clean names' do
|
12
11
|
parse("Betula verucosa (L.) Bar. 1899").should_not be_nil
|
13
12
|
end
|
@@ -85,6 +84,23 @@ describe ScientificNameDirty do
|
|
85
84
|
details('Oscillaria caviae Simons 1920, according to Simons 1922').should == [{:genus=>{:string=>"Oscillaria"}, :species=>{:string=>"caviae", :authorship=>"Simons 1920", :basionymAuthorTeam=>{:authorTeam=>"Simons", :author=>["Simons"], :year=>"1920"}}}]
|
86
85
|
sn = 'Bacterium monocytogenes hominis"" Nyfeldt 1932'
|
87
86
|
details(sn).should == [{:genus=>{:string=>"Bacterium"}, :species=>{:string=>"monocytogenes"}, :infraspecies=>[{:string=>"hominis", :rank=>"n/a"}]}]
|
87
|
+
sn = 'Choriozopella trägårdhi Lawrence, 1947'
|
88
|
+
details(sn).should == [{:genus=>{:string=>"Choriozopella"}, :species=>{:string=>"tragardhi", :authorship=>"Lawrence, 1947", :basionymAuthorTeam=>{:authorTeam=>"Lawrence", :author=>["Lawrence"], :year=>"1947"}}}]
|
89
|
+
sn = 'Sparassus françoisi Simon, 1898'
|
90
|
+
details(sn).should == [{:genus=>{:string=>"Sparassus"}, :species=>{:string=>"francoisi", :authorship=>"Simon, 1898", :basionymAuthorTeam=>{:authorTeam=>"Simon", :author=>["Simon"], :year=>"1898"}}}]
|
91
|
+
sn = 'Dyarcyops birói Kulczynski, 1908'
|
92
|
+
details(sn).should == [{:genus=>{:string=>"Dyarcyops"}, :species=>{:string=>"biroi", :authorship=>"Kulczynski, 1908", :basionymAuthorTeam=>{:authorTeam=>"Kulczynski", :author=>["Kulczynski"], :year=>"1908"}}}]
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'should parse names with "common" utf-8 charactes' do
|
96
|
+
names = ["Rühlella","Sténométope laevissimus Bibron 1855"].each do |name|
|
97
|
+
parse(name).should_not be_nil
|
98
|
+
end
|
88
99
|
end
|
89
100
|
|
101
|
+
# AsterophUa japonica
|
102
|
+
# AsyTuktus ridiculw Parent 1931
|
103
|
+
# AtremOEa Staud 1870
|
104
|
+
|
105
|
+
|
90
106
|
end
|