biodiversity19 0.5.16 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/README.rdoc +5 -5
- data/Rakefile +8 -3
- data/VERSION +1 -1
- data/bin/nnparse +7 -3
- data/bin/parserver +1 -0
- data/lib/biodiversity/parser/scientific_name_clean.treetop +131 -40
- data/lib/biodiversity/parser.rb +50 -5
- data/spec/parser/scientific_name_clean.spec.rb +53 -27
- data/spec/parser/test_data.txt +73 -21
- metadata +33 -42
- data/biodiversity19.gemspec +0 -88
- data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -481
- data/lib/biodiversity/parser/scientific_name_clean.rb +0 -6118
- data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1309
data/.gitignore
CHANGED
data/README.rdoc
CHANGED
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
= Biodiversity
|
|
2
2
|
|
|
3
|
-
Parses
|
|
3
|
+
Parses taxonomic scientific name and breaks it into semantic elements.
|
|
4
4
|
|
|
5
5
|
== Installation
|
|
6
6
|
|
|
7
|
-
To install gem you need RubyGems >= 1.
|
|
7
|
+
To install gem you need RubyGems >= 1.3.6
|
|
8
8
|
|
|
9
|
-
$ gem
|
|
10
|
-
$ sudo gem install
|
|
9
|
+
$ sudo gem install biodiversity #for ruby 1.8.x
|
|
10
|
+
$ sudo gem install biodiversity19 #for ruby 1.9.x
|
|
11
11
|
|
|
12
12
|
== Example usage
|
|
13
13
|
|
|
14
|
-
You can parse file with
|
|
14
|
+
You can parse file with taxonomic names from command line. File should contain one scientific name per line
|
|
15
15
|
|
|
16
16
|
nnparser file_with_names
|
|
17
17
|
|
data/Rakefile
CHANGED
|
@@ -13,11 +13,13 @@ Spec::Rake::SpecTask.new do |t|
|
|
|
13
13
|
t.pattern = 'spec/**/*spec.rb'
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
+
ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
|
|
17
|
+
|
|
16
18
|
|
|
17
19
|
begin
|
|
18
20
|
require 'jeweler'
|
|
19
21
|
Jeweler::Tasks.new do |gem|
|
|
20
|
-
gem.name = "biodiversity19"
|
|
22
|
+
gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
|
|
21
23
|
gem.summary = 'Parser of scientific names'
|
|
22
24
|
gem.description = 'Tools for biodiversity informatics'
|
|
23
25
|
gem.email = "dmozzherin@gmail.com"
|
|
@@ -37,11 +39,14 @@ end
|
|
|
37
39
|
|
|
38
40
|
task :tt do
|
|
39
41
|
['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
+
file = "#{dir}/lib/biodiversity/parser/#{f}"
|
|
43
|
+
FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
|
|
44
|
+
system("tt #{file}.treetop")
|
|
45
|
+
rf = "#{file}.rb"
|
|
42
46
|
rfn = open(rf + ".tmp", 'w')
|
|
43
47
|
skip_head = false
|
|
44
48
|
f = open(rf)
|
|
49
|
+
#getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
|
|
45
50
|
f.each_with_index do |l, i|
|
|
46
51
|
skip_head = l.match(/^# Autogenerated/) if i == 0
|
|
47
52
|
if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.
|
|
1
|
+
0.6.0
|
data/bin/nnparse
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
2
|
require 'rubygems'
|
|
3
|
-
|
|
3
|
+
gem_name = RUBY_VERSION.split('.')[0..1].join('').to_i > 18 ? 'biodiversity19' : 'biodiversity'
|
|
4
|
+
gem gem_name rescue nil
|
|
4
5
|
|
|
5
6
|
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
|
|
6
7
|
require 'biodiversity'
|
|
7
8
|
require 'json'
|
|
8
9
|
|
|
10
|
+
def parser_error(name)
|
|
11
|
+
{'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
|
|
12
|
+
end
|
|
9
13
|
|
|
10
14
|
if ARGV.empty?
|
|
11
15
|
puts "Usage:\n\nnnparse file_with_scientific_names [output_file]\n\ndefault output_file is parsed.json\n\n"
|
|
@@ -31,12 +35,12 @@ IO.foreach(input) do |line|
|
|
|
31
35
|
$KCODE = 'NONE'
|
|
32
36
|
end
|
|
33
37
|
p.parse(name)
|
|
34
|
-
parsed_data = p.parsed.all_json rescue
|
|
38
|
+
parsed_data = p.parsed.all_json rescue parser_error(name)
|
|
35
39
|
if ruby_min_version < 19
|
|
36
40
|
$KCODE = old_kcode
|
|
37
41
|
end
|
|
38
42
|
rescue
|
|
39
|
-
parsed_data =
|
|
43
|
+
parsed_data = parser_error(name)
|
|
40
44
|
end
|
|
41
45
|
o.write parsed_data + "\n"
|
|
42
46
|
end
|
data/bin/parserver
CHANGED
data/lib/biodiversity/parser/scientific_name_clean.treetop
CHANGED
|
@@ -30,6 +30,28 @@ grammar ScientificNameClean
|
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
rule scientific_name_5
|
|
33
|
+
a:multinomial_name space_hard hybrid_character space_hard b:species {
|
|
34
|
+
def value
|
|
35
|
+
a.value + " × " + b.value
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def canonical
|
|
39
|
+
a.canonical + " × " + b.canonical
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def pos
|
|
43
|
+
a.pos.merge(b.pos)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def hybrid
|
|
47
|
+
true
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def details
|
|
51
|
+
[a.details, b.details.merge({:genus => a.details[:genus]})]
|
|
52
|
+
end
|
|
53
|
+
}
|
|
54
|
+
/
|
|
33
55
|
a:scientific_name_1 space b:taxon_concept_rank space c:authorship {
|
|
34
56
|
def value
|
|
35
57
|
a.value + " " + b.apply(c)
|
|
@@ -62,7 +84,7 @@ grammar ScientificNameClean
|
|
|
62
84
|
end
|
|
63
85
|
|
|
64
86
|
def canonical
|
|
65
|
-
a.canonical + " " + b.canonical
|
|
87
|
+
a.canonical + " × " + b.canonical
|
|
66
88
|
end
|
|
67
89
|
|
|
68
90
|
def pos
|
|
@@ -196,7 +218,7 @@ grammar ScientificNameClean
|
|
|
196
218
|
end
|
|
197
219
|
|
|
198
220
|
def canonical
|
|
199
|
-
a.canonical + " " +
|
|
221
|
+
a.canonical + " " + c.canonical + " " + d.canonical
|
|
200
222
|
end
|
|
201
223
|
|
|
202
224
|
def pos
|
|
@@ -381,7 +403,7 @@ grammar ScientificNameClean
|
|
|
381
403
|
end
|
|
382
404
|
|
|
383
405
|
rule rank
|
|
384
|
-
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"subsp."/"subf."/"race"/"α"
|
|
406
|
+
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
|
|
385
407
|
/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
|
|
386
408
|
{
|
|
387
409
|
def value
|
|
@@ -405,7 +427,7 @@ grammar ScientificNameClean
|
|
|
405
427
|
end
|
|
406
428
|
|
|
407
429
|
rule rank_forma
|
|
408
|
-
("forma"/"form."/"fo."/"f.")
|
|
430
|
+
("forma"/"form."/"form"/"fo."/"f.")
|
|
409
431
|
{
|
|
410
432
|
def value
|
|
411
433
|
"f."
|
|
@@ -449,28 +471,28 @@ grammar ScientificNameClean
|
|
|
449
471
|
end
|
|
450
472
|
|
|
451
473
|
rule species_string
|
|
452
|
-
a:species_word &(space_hard author_prefix_word space_hard) {
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
}
|
|
473
|
-
/
|
|
474
|
+
# a:species_word &(space_hard author_prefix_word space_hard) {
|
|
475
|
+
# def value
|
|
476
|
+
# a.value
|
|
477
|
+
# end
|
|
478
|
+
#
|
|
479
|
+
# def canonical
|
|
480
|
+
# a.value
|
|
481
|
+
# end
|
|
482
|
+
#
|
|
483
|
+
# def hybrid
|
|
484
|
+
# a.hybrid rescue false
|
|
485
|
+
# end
|
|
486
|
+
#
|
|
487
|
+
# def pos
|
|
488
|
+
# {a.interval.begin => ['species', a.interval.end]}
|
|
489
|
+
# end
|
|
490
|
+
#
|
|
491
|
+
# def details
|
|
492
|
+
# {:species => {:string => a.value}}
|
|
493
|
+
# end
|
|
494
|
+
# }
|
|
495
|
+
# /
|
|
474
496
|
species_word {
|
|
475
497
|
def canonical
|
|
476
498
|
value
|
|
@@ -493,7 +515,7 @@ grammar ScientificNameClean
|
|
|
493
515
|
end
|
|
494
516
|
|
|
495
517
|
rule infragenus
|
|
496
|
-
left_paren space a:cap_latin_word space right_paren {
|
|
518
|
+
left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
|
|
497
519
|
def value
|
|
498
520
|
"(" + a.value + ")"
|
|
499
521
|
end
|
|
@@ -513,7 +535,7 @@ grammar ScientificNameClean
|
|
|
513
535
|
end
|
|
514
536
|
|
|
515
537
|
rule genus
|
|
516
|
-
a:
|
|
538
|
+
a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
|
|
517
539
|
def value
|
|
518
540
|
a.value
|
|
519
541
|
end
|
|
@@ -533,6 +555,50 @@ grammar ScientificNameClean
|
|
|
533
555
|
end
|
|
534
556
|
|
|
535
557
|
rule uninomial_name
|
|
558
|
+
a:uninomial_string space b:infragenus space c:simple_authorship {
|
|
559
|
+
def value
|
|
560
|
+
a.value + " " + b.value + " " + c.value
|
|
561
|
+
end
|
|
562
|
+
|
|
563
|
+
def canonical
|
|
564
|
+
a.canonical
|
|
565
|
+
end
|
|
566
|
+
|
|
567
|
+
def pos
|
|
568
|
+
a.pos.merge(b.pos).merge(c.pos)
|
|
569
|
+
end
|
|
570
|
+
|
|
571
|
+
def hybrid
|
|
572
|
+
false
|
|
573
|
+
end
|
|
574
|
+
|
|
575
|
+
def details
|
|
576
|
+
{:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
|
|
577
|
+
end
|
|
578
|
+
}
|
|
579
|
+
/
|
|
580
|
+
a:uninomial_string space b:infragenus {
|
|
581
|
+
def value
|
|
582
|
+
a.value + " " + b.value
|
|
583
|
+
end
|
|
584
|
+
|
|
585
|
+
def canonical
|
|
586
|
+
a.canonical
|
|
587
|
+
end
|
|
588
|
+
|
|
589
|
+
def pos
|
|
590
|
+
a.pos.merge(b.pos)
|
|
591
|
+
end
|
|
592
|
+
|
|
593
|
+
def hybrid
|
|
594
|
+
false
|
|
595
|
+
end
|
|
596
|
+
|
|
597
|
+
def details
|
|
598
|
+
{:uninomial => a.details[:uninomial].merge(b.details)}
|
|
599
|
+
end
|
|
600
|
+
}
|
|
601
|
+
/
|
|
536
602
|
a:uninomial_string space_hard b:authorship {
|
|
537
603
|
def value
|
|
538
604
|
a.value + " " + b.value
|
|
@@ -799,7 +865,7 @@ grammar ScientificNameClean
|
|
|
799
865
|
|
|
800
866
|
|
|
801
867
|
rule unknown_auth
|
|
802
|
-
("auct."/"hort."/"anon."/"ht.") {
|
|
868
|
+
("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
|
|
803
869
|
def value
|
|
804
870
|
text_value
|
|
805
871
|
end
|
|
@@ -837,7 +903,7 @@ grammar ScientificNameClean
|
|
|
837
903
|
end
|
|
838
904
|
|
|
839
905
|
rule author_name
|
|
840
|
-
space a:author_prefix_word space b:author_name
|
|
906
|
+
space a:author_prefix_word space b:author_name {
|
|
841
907
|
def value
|
|
842
908
|
a.value + " " + b.value
|
|
843
909
|
end
|
|
@@ -851,7 +917,7 @@ grammar ScientificNameClean
|
|
|
851
917
|
end
|
|
852
918
|
}
|
|
853
919
|
/
|
|
854
|
-
|
|
920
|
+
a:author_word space b:author_name {
|
|
855
921
|
def value
|
|
856
922
|
a.value + " " + b.value
|
|
857
923
|
end
|
|
@@ -883,7 +949,7 @@ grammar ScientificNameClean
|
|
|
883
949
|
end
|
|
884
950
|
}
|
|
885
951
|
/
|
|
886
|
-
("arg."/"et al.\{\?\}"/"et al.") {
|
|
952
|
+
("arg."/"et al.\{\?\}"/"et al."/"et al") {
|
|
887
953
|
def value
|
|
888
954
|
text_value.strip
|
|
889
955
|
end
|
|
@@ -930,7 +996,7 @@ grammar ScientificNameClean
|
|
|
930
996
|
end
|
|
931
997
|
|
|
932
998
|
rule author_prefix_word
|
|
933
|
-
space ("ab"/"bis"/"da"/"der"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
|
|
999
|
+
space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
|
|
934
1000
|
def value
|
|
935
1001
|
text_value
|
|
936
1002
|
end
|
|
@@ -976,6 +1042,14 @@ grammar ScientificNameClean
|
|
|
976
1042
|
}
|
|
977
1043
|
end
|
|
978
1044
|
|
|
1045
|
+
rule capped_dotted_char
|
|
1046
|
+
[A-Z] "." {
|
|
1047
|
+
def value
|
|
1048
|
+
text_value
|
|
1049
|
+
end
|
|
1050
|
+
}
|
|
1051
|
+
end
|
|
1052
|
+
|
|
979
1053
|
rule species_word_hybrid
|
|
980
1054
|
a:multiplication_sign space b:species_word {
|
|
981
1055
|
def value
|
|
@@ -1051,7 +1125,9 @@ grammar ScientificNameClean
|
|
|
1051
1125
|
rule species_word
|
|
1052
1126
|
a:[0-9]+ "-"? b:latin_word {
|
|
1053
1127
|
def value
|
|
1054
|
-
|
|
1128
|
+
num = {"1" => "uni", "2" => "du", "3" => "tri", "4" => "quadri", "5" => "quinque", "6" => "hexa", "7" => "septem", "8" => "octo", "9" => "novem", "10" => "decem", "11" => "undecim", "12" => "duodec", "13" => "tredec", "14" => "quattuordec", "15" => "quinquadec", "16" => "hexadec", "17" => "septendec", "18" => "octodec", "19" => "novemdec", "20" => "viginti", "21" => "unviginti", "22" => "duodeviginti", "23" => "triviginti", "24" => "quattuorviginti", "25" => "quinquatviginti", "26" => "hexaviginti", "27" => "septenviginti", "28" => "octoviginti", "29" => "novemviginti", "30" => "triginta", "38" => "trigintaocto", "100" => "centi"}
|
|
1129
|
+
a_value = num[a.text_value] ? num[a.text_value] : a.text_value + "-"
|
|
1130
|
+
a_value + b.value
|
|
1055
1131
|
end
|
|
1056
1132
|
}
|
|
1057
1133
|
/
|
|
@@ -1059,18 +1135,21 @@ grammar ScientificNameClean
|
|
|
1059
1135
|
end
|
|
1060
1136
|
|
|
1061
1137
|
rule latin_word
|
|
1062
|
-
a:
|
|
1138
|
+
a:valid_name_letters "-" b:latin_word {
|
|
1139
|
+
def value
|
|
1140
|
+
a.value + "-" + b.value
|
|
1141
|
+
end
|
|
1142
|
+
}
|
|
1143
|
+
/
|
|
1144
|
+
a:valid_name_letter b:valid_name_letters {
|
|
1063
1145
|
def value
|
|
1064
|
-
|
|
1065
|
-
l = 'ae' if l == 'æ'
|
|
1066
|
-
l = 'oe' if l == 'œ'
|
|
1067
|
-
l + b.value
|
|
1146
|
+
a.value + b.value
|
|
1068
1147
|
end
|
|
1069
1148
|
}
|
|
1070
1149
|
end
|
|
1071
1150
|
|
|
1072
1151
|
rule valid_name_letters
|
|
1073
|
-
[a-
|
|
1152
|
+
[a-zëæœ]+ {
|
|
1074
1153
|
def value
|
|
1075
1154
|
res = ''
|
|
1076
1155
|
text_value.split('').each do |l|
|
|
@@ -1086,6 +1165,18 @@ grammar ScientificNameClean
|
|
|
1086
1165
|
}
|
|
1087
1166
|
end
|
|
1088
1167
|
|
|
1168
|
+
rule valid_name_letter
|
|
1169
|
+
[a-zëæœ] {
|
|
1170
|
+
def value
|
|
1171
|
+
res = text_value
|
|
1172
|
+
res = 'ae' if res == 'æ'
|
|
1173
|
+
res = 'oe' if res == 'œ'
|
|
1174
|
+
res
|
|
1175
|
+
end
|
|
1176
|
+
}
|
|
1177
|
+
end
|
|
1178
|
+
|
|
1179
|
+
|
|
1089
1180
|
rule cap_digraph
|
|
1090
1181
|
"Æ" {
|
|
1091
1182
|
def value
|
data/lib/biodiversity/parser.rb
CHANGED
|
@@ -6,6 +6,35 @@ require File.join(dir, *%w[parser scientific_name_canonical])
|
|
|
6
6
|
require 'rubygems'
|
|
7
7
|
require 'json'
|
|
8
8
|
|
|
9
|
+
module PreProcessor
|
|
10
|
+
NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
|
|
11
|
+
TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
|
|
12
|
+
TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
|
|
13
|
+
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
|
|
14
|
+
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
|
|
15
|
+
LAST_WORD_JUNK = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
|
|
16
|
+
|
|
17
|
+
def self.clean(a_string)
|
|
18
|
+
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
|
19
|
+
a_string = a_string.gsub(i, '')
|
|
20
|
+
end
|
|
21
|
+
a_string = a_string.tr('ſ','s') #old 's'
|
|
22
|
+
a_string
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# we can use these expressions when we are ready to parse virus names
|
|
27
|
+
# class VirusParser
|
|
28
|
+
# def initialize
|
|
29
|
+
# @order = /^\s*[A-Z][a-z]\+virales/i
|
|
30
|
+
# @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
|
|
31
|
+
# @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
|
|
32
|
+
# @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
|
|
33
|
+
# @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
|
|
34
|
+
# @parsed = nil
|
|
35
|
+
# end
|
|
36
|
+
# end
|
|
37
|
+
|
|
9
38
|
class ScientificNameParser
|
|
10
39
|
|
|
11
40
|
def initialize
|
|
@@ -15,21 +44,36 @@ class ScientificNameParser
|
|
|
15
44
|
@canonical = ScientificNameCanonicalParser.new
|
|
16
45
|
@parsed = nil
|
|
17
46
|
end
|
|
18
|
-
|
|
47
|
+
|
|
48
|
+
def virus?(a_string)
|
|
49
|
+
!!(a_string.match(/\sICTV\s*$/) || a_string.match(/\s(virus|phage|viroid|satellite|prion)\b/i))
|
|
50
|
+
end
|
|
51
|
+
|
|
19
52
|
def parsed
|
|
20
53
|
@parsed
|
|
21
54
|
end
|
|
22
55
|
|
|
23
56
|
def parse(a_string)
|
|
24
57
|
@verbatim = a_string
|
|
25
|
-
|
|
26
|
-
|
|
58
|
+
a_string = PreProcessor::clean(a_string)
|
|
59
|
+
|
|
60
|
+
if virus?(a_string)
|
|
61
|
+
@parsed = { :verbatim => a_string, :virus => true }
|
|
62
|
+
else
|
|
63
|
+
@parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def @parsed.verbatim=(a_string)
|
|
67
|
+
@verbatim = a_string
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def @parsed.all(verbatim = @verbatim)
|
|
27
71
|
parsed = self.class != Hash
|
|
28
72
|
res = {:parsed => parsed}
|
|
29
73
|
if parsed
|
|
30
74
|
hybrid = self.hybrid rescue false
|
|
31
75
|
res.merge!({
|
|
32
|
-
:verbatim =>
|
|
76
|
+
:verbatim => @verbatim,
|
|
33
77
|
:normalized => self.value,
|
|
34
78
|
:canonical => self.canonical,
|
|
35
79
|
:hybrid => hybrid,
|
|
@@ -51,7 +95,8 @@ class ScientificNameParser
|
|
|
51
95
|
def @parsed.all_json
|
|
52
96
|
self.all.to_json rescue ''
|
|
53
97
|
end
|
|
54
|
-
|
|
98
|
+
|
|
99
|
+
@parsed.verbatim = @verbatim
|
|
55
100
|
@parsed.all
|
|
56
101
|
end
|
|
57
102
|
end
|
data/spec/parser/scientific_name_clean.spec.rb
CHANGED
|
@@ -98,7 +98,8 @@ describe ScientificNameClean do
|
|
|
98
98
|
['Ærenea cognata Lacordaire, 1872', 'Aerenea cognata Lacordaire 1872'],
|
|
99
99
|
['Œdicnemus capensis', 'Oedicnemus capensis'],
|
|
100
100
|
['Œnanthæ œnanthe','Oenanthae oenanthe'],
|
|
101
|
-
['Œnanthe œnanthe','Oenanthe oenanthe']
|
|
101
|
+
['Œnanthe œnanthe','Oenanthe oenanthe'],
|
|
102
|
+
['Cerambyx thomæ Gmelin J. F., 1790', 'Cerambyx thomae Gmelin J. F. 1790']
|
|
102
103
|
]
|
|
103
104
|
names.each do |name_pair|
|
|
104
105
|
parse(name_pair[0]).should_not be_nil
|
|
@@ -120,6 +121,18 @@ describe ScientificNameClean do
|
|
|
120
121
|
canonical(sn).should == "Hegeter intercedens"
|
|
121
122
|
details(sn).should == [{:genus=>{:string=>"Hegeter"}, :infragenus=>{:string=>"Hegeter"}, :species=>{:string=>"intercedens", :authorship=>"Lindberg H 1950", :basionymAuthorTeam=>{:authorTeam=>"Lindberg H", :author=>["Lindberg H"], :year=>"1950"}}}]
|
|
122
123
|
pos(sn).should == {0=>["genus", 7], 9=>["infragenus", 16], 18=>["species", 29], 30=>["author_word", 38], 39=>["author_word", 40], 41=>["year", 45]}
|
|
124
|
+
sn = "Ixodes (Ixodes) hexagonus hexagonus Neumann, 1911"
|
|
125
|
+
canonical(sn).should == "Ixodes hexagonus hexagonus"
|
|
126
|
+
sn = "Brachytrypus (B.) grandidieri"
|
|
127
|
+
canonical(sn).should == "Brachytrypus grandidieri"
|
|
128
|
+
details(sn).should == [{:genus=>{:string=>"Brachytrypus"}, :infragenus=>{:string=>"B."}, :species=>{:string=>"grandidieri"}}]
|
|
129
|
+
sn = "Empis (Argyrandrus) Bezzi 1909"
|
|
130
|
+
details(sn).should == [{:uninomial=>{:string=>"Empis", :infragenus=>{:string=>"Argyrandrus"}, :authorship=>"Bezzi 1909", :basionymAuthorTeam=>{:authorTeam=>"Bezzi", :author=>["Bezzi"], :year=>"1909"}}}]
|
|
131
|
+
sn = "Platydoris (Bergh )"
|
|
132
|
+
details(sn).should == [{:uninomial=>{:string=>"Platydoris", :infragenus=>{:string=>"Bergh"}}}]
|
|
133
|
+
value(sn).should == "Platydoris (Bergh)"
|
|
134
|
+
sn = "Platydoris (B.)"
|
|
135
|
+
details(sn).should == [{:uninomial=>{:string=>"Platydoris", :infragenus=>{:string=>"B."}}}]
|
|
123
136
|
end
|
|
124
137
|
|
|
125
138
|
it 'should parse several authors without a year' do
|
|
@@ -219,6 +232,8 @@ describe ScientificNameClean do
|
|
|
219
232
|
value(sn).should == "Phaeographis inusta var. macularis (Leight.) A.L. Sm. 1861"
|
|
220
233
|
canonical(sn).should == "Phaeographis inusta macularis"
|
|
221
234
|
pos(sn).should == {0=>["genus", 12], 13=>["species", 19], 25=>["infraspecies", 34], 35=>["author_word", 42], 44=>["author_word", 48], 49=>["author_word", 52], 53=>["year", 57]}
|
|
235
|
+
sn = "Cassytha peninsularis J. Z. Weber var. flindersii"
|
|
236
|
+
canonical(sn).should == "Cassytha peninsularis flindersii"
|
|
222
237
|
end
|
|
223
238
|
|
|
224
239
|
it 'should parse unknown original authors (auct.)/(hort.)/(?)' do
|
|
@@ -239,7 +254,7 @@ describe ScientificNameClean do
|
|
|
239
254
|
pos(sn).should == {0=>["genus", 4], 5=>["species", 10], 11=>["unknown_author", 14]}
|
|
240
255
|
end
|
|
241
256
|
|
|
242
|
-
it '
|
|
257
|
+
it 'should parse real world examples' do
|
|
243
258
|
sn = "Stagonospora polyspora M.T. Lucas & Sousa da Câmara 1934"
|
|
244
259
|
parse(sn).should_not be_nil
|
|
245
260
|
value(sn).should == "Stagonospora polyspora M.T. Lucas et Sousa da Câmara 1934"
|
|
@@ -283,16 +298,16 @@ describe ScientificNameClean do
|
|
|
283
298
|
sn = "Gastrosericus eremorum von Beaumont 1955"
|
|
284
299
|
canonical(sn).should == 'Gastrosericus eremorum'
|
|
285
300
|
sn = "Cypraeovula (Luponia) amphithales perdentata"
|
|
286
|
-
canonical(sn).should == 'Cypraeovula
|
|
301
|
+
canonical(sn).should == 'Cypraeovula amphithales perdentata'
|
|
287
302
|
details(sn).should == [{:genus=>{:string=>"Cypraeovula"}, :infragenus=>{:string=>"Luponia"}, :species=>{:string=>"amphithales"}, :infraspecies=>[{:string=>"perdentata", :rank=>"n/a"}]}]
|
|
288
303
|
sn = "Polyrhachis orsyllus nat musculus Forel 1901"
|
|
289
304
|
canonical(sn).should == "Polyrhachis orsyllus musculus"
|
|
290
305
|
sn = 'Latrodectus 13-guttatus Thorell, 1875'
|
|
291
|
-
canonical(sn).should == 'Latrodectus
|
|
292
|
-
value(sn).should == 'Latrodectus
|
|
293
|
-
sn = 'Latrodectus
|
|
294
|
-
canonical(sn).should == 'Latrodectus
|
|
295
|
-
value(sn).should == 'Latrodectus
|
|
306
|
+
canonical(sn).should == 'Latrodectus tredecguttatus'
|
|
307
|
+
value(sn).should == 'Latrodectus tredecguttatus Thorell 1875'
|
|
308
|
+
sn = 'Latrodectus 3-guttatus Thorell, 1875'
|
|
309
|
+
canonical(sn).should == 'Latrodectus triguttatus'
|
|
310
|
+
value(sn).should == 'Latrodectus triguttatus Thorell 1875'
|
|
296
311
|
sn = 'Balaninus c-album Schönherr, CJ., 1836'
|
|
297
312
|
canonical(sn).should == 'Balaninus c-album'
|
|
298
313
|
end
|
|
@@ -353,7 +368,7 @@ describe ScientificNameClean do
|
|
|
353
368
|
parse(sn).should_not be_nil
|
|
354
369
|
value(sn).should == "Arthopyrenia hyalospora (Nyl.) R.C. Harris comb. nov."
|
|
355
370
|
canonical(sn).should == "Arthopyrenia hyalospora"
|
|
356
|
-
details(sn).should == [{:genus=>{:string=>"Arthopyrenia"}, :species=>{:string=>"hyalospora", :authorship=>"(Nyl.) R.C. Harris", :combinationAuthorTeam=>{:authorTeam=>"R.C. Harris
|
|
371
|
+
details(sn).should == [{:genus=>{:string=>"Arthopyrenia"}, :species=>{:string=>"hyalospora", :authorship=>"(Nyl.) R.C. Harris", :combinationAuthorTeam=>{:authorTeam=>"R.C. Harris", :author=>["R.C. Harris"]}, :basionymAuthorTeam=>{:authorTeam=>"Nyl.", :author=>["Nyl."]}}, :status=>"comb. nov."}]
|
|
357
372
|
pos(sn).should == {0=>["genus", 12], 13=>["species", 23], 25=>["author_word", 29], 31=>["author_word", 35], 36=>["author_word", 42]}
|
|
358
373
|
end
|
|
359
374
|
|
|
@@ -414,6 +429,9 @@ describe ScientificNameClean do
|
|
|
414
429
|
parse(res[0]).hybrid.should be_true
|
|
415
430
|
details(res[0]).should == res[1]
|
|
416
431
|
end
|
|
432
|
+
sn = "Rosa alpina x pomifera"
|
|
433
|
+
canonical(sn).should == "Rosa alpina × pomifera"
|
|
434
|
+
parse(sn).details.should == [{:genus=>{:string=>"Rosa"}, :species=>{:string=>"alpina"}}, {:species=>{:string=>"pomifera"}, :genus=>{:string=>"Rosa"}}]
|
|
417
435
|
end
|
|
418
436
|
|
|
419
437
|
it "should parse hybrid combination" do
|
|
@@ -421,14 +439,14 @@ describe ScientificNameClean do
|
|
|
421
439
|
parse(sn).should_not be_nil
|
|
422
440
|
parse(sn).hybrid.should be_true
|
|
423
441
|
value(sn).should == "Arthopyrenia hyalospora \303\227 Hydnellum scrobiculatum"
|
|
424
|
-
canonical(sn).should == "Arthopyrenia hyalospora Hydnellum scrobiculatum"
|
|
442
|
+
canonical(sn).should == "Arthopyrenia hyalospora × Hydnellum scrobiculatum"
|
|
425
443
|
details(sn).should == [{:genus=>{:string=>"Arthopyrenia"}, :species=>{:string=>"hyalospora"}}, {:genus=>{:string=>"Hydnellum"}, :species=>{:string=>"scrobiculatum"}}]
|
|
426
444
|
pos(sn).should == {0=>["genus", 12], 13=>["species", 23], 26=>["genus", 35], 36=>["species", 49]}
|
|
427
445
|
sn = "Arthopyrenia hyalospora (Banker) D. Hall X Hydnellum scrobiculatum D.E. Stuntz"
|
|
428
446
|
parse(sn).should_not be_nil
|
|
429
447
|
parse(sn).hybrid.should be_true
|
|
430
448
|
value(sn).should == "Arthopyrenia hyalospora (Banker) D. Hall \303\227 Hydnellum scrobiculatum D.E. Stuntz"
|
|
431
|
-
canonical(sn).should == "Arthopyrenia hyalospora Hydnellum scrobiculatum"
|
|
449
|
+
canonical(sn).should == "Arthopyrenia hyalospora × Hydnellum scrobiculatum"
|
|
432
450
|
pos(sn).should == {0=>["genus", 12], 13=>["species", 23], 25=>["author_word", 31], 33=>["author_word", 35], 36=>["author_word", 40], 43=>["genus", 52], 53=>["species", 66], 67=>["author_word", 71], 72=>["author_word", 78]}
|
|
433
451
|
value("Arthopyrenia hyalospora X").should == "Arthopyrenia hyalospora \303\227 ?"
|
|
434
452
|
sn = "Arthopyrenia hyalospora x"
|
|
@@ -446,7 +464,6 @@ describe ScientificNameClean do
|
|
|
446
464
|
|
|
447
465
|
it 'should parse names with taxon concept' do
|
|
448
466
|
sn = "Stenometope laevissimus sec. Eschmeyer 2004"
|
|
449
|
-
val = @parser.failure_reason.to_s.match(/column [0-9]*/).to_s().gsub(/column /,'')
|
|
450
467
|
details(sn).should == [{:genus=>{:string=>"Stenometope"}, :species=>{:string=>"laevissimus"}, :taxon_concept=>{:authorship=>"Eschmeyer 2004", :basionymAuthorTeam=>{:authorTeam=>"Eschmeyer", :author=>["Eschmeyer"], :year=>"2004"}}}]
|
|
451
468
|
pos(sn).should == {0=>["genus", 11], 12=>["species", 23], 29=>["author_word", 38], 39=>["year", 43]}
|
|
452
469
|
sn = "Stenometope laevissimus Bibron 1855 sec. Eschmeyer 2004"
|
|
@@ -501,26 +518,35 @@ describe ScientificNameClean do
|
|
|
501
518
|
details(sn).should == [{:genus=>{:string=>"Flexibacter"}, :species=>{:string=>"elegans", :authorship=>"Soriano 1945, non Lewin 1969", :basionymAuthorTeam=>{:authorTeam=>"Soriano", :author=>["Soriano"], :year=>"1945"}}}]
|
|
502
519
|
end
|
|
503
520
|
|
|
504
|
-
#
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
# end
|
|
521
|
+
# Combination genus names should be merged without dash or capital letter
|
|
522
|
+
it 'should parse hybrid names with capitalized second name in genus (botanical code error)' do
|
|
523
|
+
sn = 'Anacampti-Platanthera P. Fourn.'
|
|
524
|
+
parse(sn).should_not be_nil
|
|
525
|
+
canonical(sn).should == 'Anacamptiplatanthera'
|
|
526
|
+
sn = 'Anacampti-Platanthera vulgaris P. Fourn.'
|
|
527
|
+
parse(sn).should_not be_nil
|
|
528
|
+
canonical(sn).should == 'Anacamptiplatanthera vulgaris'
|
|
529
|
+
end
|
|
514
530
|
|
|
515
|
-
# it 'shoud parse hybrid names with * character' do
|
|
516
|
-
# sn = "Carduus acanthoides * crispus"
|
|
517
|
-
# details(sn).should == ''
|
|
518
|
-
# end
|
|
519
|
-
|
|
520
531
|
it 'should parse genus names starting with uppercase letters AE OE' do
|
|
521
532
|
sn = 'AEmona separata Broun 1921'
|
|
522
533
|
canonical(sn).should == 'Aemona separata'
|
|
523
534
|
sn = 'OEmona simplex White, 1855'
|
|
524
535
|
canonical(sn).should == 'Oemona simplex'
|
|
525
536
|
end
|
|
537
|
+
#"Arthrosamanea eriorhachis (Harms & sine ref. ) Aubrév." -- ignore & sine ref. (means without reference)
|
|
538
|
+
|
|
539
|
+
=begin
|
|
540
|
+
new stuff
|
|
541
|
+
|
|
542
|
+
sn = "Orchidaceae × Asconopsis hort."
|
|
543
|
+
canonical(sn).should == "Orchidaceae x Asconopsis"
|
|
544
|
+
sn
|
|
545
|
+
Tamiops swinhoei near hainanus|Tamiops swinhoei near hainanus
|
|
546
|
+
Conus textile form archiepiscopus|Conus textile form archiepiscopus|
|
|
547
|
+
Crypticus pseudosericeus ssp. olivieri Desbrochers des Loges,1881|Crypticus pseudosericeus olivieri des
|
|
548
|
+
Solanum nigrum subsp nigrum|Solanum nigrum subsp nigrum
|
|
549
|
+
Protoglossus taeniatum author unknown|Protoglossus taeniatum author unknown
|
|
550
|
+
Dupontiella (S. ?) bicolor|Dupontiella|
|
|
551
|
+
=end
|
|
526
552
|
end
|