biodiversity19 0.5.16 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/README.rdoc +5 -5
- data/Rakefile +8 -3
- data/VERSION +1 -1
- data/bin/nnparse +7 -3
- data/bin/parserver +1 -0
- data/lib/biodiversity/parser/scientific_name_clean.treetop +131 -40
- data/lib/biodiversity/parser.rb +50 -5
- data/spec/parser/scientific_name_clean.spec.rb +53 -27
- data/spec/parser/test_data.txt +73 -21
- metadata +33 -42
- data/biodiversity19.gemspec +0 -88
- data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -481
- data/lib/biodiversity/parser/scientific_name_clean.rb +0 -6118
- data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1309
data/.gitignore
CHANGED
data/README.rdoc
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
= Biodiversity
|
2
2
|
|
3
|
-
Parses
|
3
|
+
Parses taxonomic scientific name and breaks it into semantic elements.
|
4
4
|
|
5
5
|
== Installation
|
6
6
|
|
7
|
-
To install gem you need RubyGems >= 1.
|
7
|
+
To install gem you need RubyGems >= 1.3.6
|
8
8
|
|
9
|
-
$ gem
|
10
|
-
$ sudo gem install
|
9
|
+
$ sudo gem install biodiversity #for ruby 1.8.x
|
10
|
+
$ sudo gem install biodiversity19 #for ruby 1.9.x
|
11
11
|
|
12
12
|
== Example usage
|
13
13
|
|
14
|
-
You can parse file with
|
14
|
+
You can parse file with taxonomic names from command line. File should contain one scientific name per line
|
15
15
|
|
16
16
|
nnparser file_with_names
|
17
17
|
|
data/Rakefile
CHANGED
@@ -13,11 +13,13 @@ Spec::Rake::SpecTask.new do |t|
|
|
13
13
|
t.pattern = 'spec/**/*spec.rb'
|
14
14
|
end
|
15
15
|
|
16
|
+
ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
|
17
|
+
|
16
18
|
|
17
19
|
begin
|
18
20
|
require 'jeweler'
|
19
21
|
Jeweler::Tasks.new do |gem|
|
20
|
-
gem.name = "biodiversity19"
|
22
|
+
gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
|
21
23
|
gem.summary = 'Parser of scientific names'
|
22
24
|
gem.description = 'Tools for biodiversity informatics'
|
23
25
|
gem.email = "dmozzherin@gmail.com"
|
@@ -37,11 +39,14 @@ end
|
|
37
39
|
|
38
40
|
task :tt do
|
39
41
|
['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
|
40
|
-
|
41
|
-
|
42
|
+
file = "#{dir}/lib/biodiversity/parser/#{f}"
|
43
|
+
FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
|
44
|
+
system("tt #{file}.treetop")
|
45
|
+
rf = "#{file}.rb"
|
42
46
|
rfn = open(rf + ".tmp", 'w')
|
43
47
|
skip_head = false
|
44
48
|
f = open(rf)
|
49
|
+
#getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
|
45
50
|
f.each_with_index do |l, i|
|
46
51
|
skip_head = l.match(/^# Autogenerated/) if i == 0
|
47
52
|
if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.6.0
|
data/bin/nnparse
CHANGED
@@ -1,11 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'rubygems'
|
3
|
-
|
3
|
+
gem_name = RUBY_VERSION.split('.')[0..1].join('').to_i > 18 ? 'biodiversity19' : 'biodiversity'
|
4
|
+
gem gem_name rescue nil
|
4
5
|
|
5
6
|
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
|
6
7
|
require 'biodiversity'
|
7
8
|
require 'json'
|
8
9
|
|
10
|
+
def parser_error(name)
|
11
|
+
{'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
|
12
|
+
end
|
9
13
|
|
10
14
|
if ARGV.empty?
|
11
15
|
puts "Usage:\n\nnnparse file_with_scientific_names [output_file]\n\ndefault output_file is parsed.json\n\n"
|
@@ -31,12 +35,12 @@ IO.foreach(input) do |line|
|
|
31
35
|
$KCODE = 'NONE'
|
32
36
|
end
|
33
37
|
p.parse(name)
|
34
|
-
parsed_data = p.parsed.all_json rescue
|
38
|
+
parsed_data = p.parsed.all_json rescue parser_error(name)
|
35
39
|
if ruby_min_version < 19
|
36
40
|
$KCODE = old_kcode
|
37
41
|
end
|
38
42
|
rescue
|
39
|
-
parsed_data =
|
43
|
+
parsed_data = parser_error(name)
|
40
44
|
end
|
41
45
|
o.write parsed_data + "\n"
|
42
46
|
end
|
data/bin/parserver
CHANGED
@@ -30,6 +30,28 @@ grammar ScientificNameClean
|
|
30
30
|
end
|
31
31
|
|
32
32
|
rule scientific_name_5
|
33
|
+
a:multinomial_name space_hard hybrid_character space_hard b:species {
|
34
|
+
def value
|
35
|
+
a.value + " × " + b.value
|
36
|
+
end
|
37
|
+
|
38
|
+
def canonical
|
39
|
+
a.canonical + " × " + b.canonical
|
40
|
+
end
|
41
|
+
|
42
|
+
def pos
|
43
|
+
a.pos.merge(b.pos)
|
44
|
+
end
|
45
|
+
|
46
|
+
def hybrid
|
47
|
+
true
|
48
|
+
end
|
49
|
+
|
50
|
+
def details
|
51
|
+
[a.details, b.details.merge({:genus => a.details[:genus]})]
|
52
|
+
end
|
53
|
+
}
|
54
|
+
/
|
33
55
|
a:scientific_name_1 space b:taxon_concept_rank space c:authorship {
|
34
56
|
def value
|
35
57
|
a.value + " " + b.apply(c)
|
@@ -62,7 +84,7 @@ grammar ScientificNameClean
|
|
62
84
|
end
|
63
85
|
|
64
86
|
def canonical
|
65
|
-
a.canonical + " " + b.canonical
|
87
|
+
a.canonical + " × " + b.canonical
|
66
88
|
end
|
67
89
|
|
68
90
|
def pos
|
@@ -196,7 +218,7 @@ grammar ScientificNameClean
|
|
196
218
|
end
|
197
219
|
|
198
220
|
def canonical
|
199
|
-
a.canonical + " " +
|
221
|
+
a.canonical + " " + c.canonical + " " + d.canonical
|
200
222
|
end
|
201
223
|
|
202
224
|
def pos
|
@@ -381,7 +403,7 @@ grammar ScientificNameClean
|
|
381
403
|
end
|
382
404
|
|
383
405
|
rule rank
|
384
|
-
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"subsp."/"subf."/"race"/"α"
|
406
|
+
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
|
385
407
|
/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
|
386
408
|
{
|
387
409
|
def value
|
@@ -405,7 +427,7 @@ grammar ScientificNameClean
|
|
405
427
|
end
|
406
428
|
|
407
429
|
rule rank_forma
|
408
|
-
("forma"/"form."/"fo."/"f.")
|
430
|
+
("forma"/"form."/"form"/"fo."/"f.")
|
409
431
|
{
|
410
432
|
def value
|
411
433
|
"f."
|
@@ -449,28 +471,28 @@ grammar ScientificNameClean
|
|
449
471
|
end
|
450
472
|
|
451
473
|
rule species_string
|
452
|
-
a:species_word &(space_hard author_prefix_word space_hard) {
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
}
|
473
|
-
/
|
474
|
+
# a:species_word &(space_hard author_prefix_word space_hard) {
|
475
|
+
# def value
|
476
|
+
# a.value
|
477
|
+
# end
|
478
|
+
#
|
479
|
+
# def canonical
|
480
|
+
# a.value
|
481
|
+
# end
|
482
|
+
#
|
483
|
+
# def hybrid
|
484
|
+
# a.hybrid rescue false
|
485
|
+
# end
|
486
|
+
#
|
487
|
+
# def pos
|
488
|
+
# {a.interval.begin => ['species', a.interval.end]}
|
489
|
+
# end
|
490
|
+
#
|
491
|
+
# def details
|
492
|
+
# {:species => {:string => a.value}}
|
493
|
+
# end
|
494
|
+
# }
|
495
|
+
# /
|
474
496
|
species_word {
|
475
497
|
def canonical
|
476
498
|
value
|
@@ -493,7 +515,7 @@ grammar ScientificNameClean
|
|
493
515
|
end
|
494
516
|
|
495
517
|
rule infragenus
|
496
|
-
left_paren space a:cap_latin_word space right_paren {
|
518
|
+
left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
|
497
519
|
def value
|
498
520
|
"(" + a.value + ")"
|
499
521
|
end
|
@@ -513,7 +535,7 @@ grammar ScientificNameClean
|
|
513
535
|
end
|
514
536
|
|
515
537
|
rule genus
|
516
|
-
a:
|
538
|
+
a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
|
517
539
|
def value
|
518
540
|
a.value
|
519
541
|
end
|
@@ -533,6 +555,50 @@ grammar ScientificNameClean
|
|
533
555
|
end
|
534
556
|
|
535
557
|
rule uninomial_name
|
558
|
+
a:uninomial_string space b:infragenus space c:simple_authorship {
|
559
|
+
def value
|
560
|
+
a.value + " " + b.value + " " + c.value
|
561
|
+
end
|
562
|
+
|
563
|
+
def canonical
|
564
|
+
a.canonical
|
565
|
+
end
|
566
|
+
|
567
|
+
def pos
|
568
|
+
a.pos.merge(b.pos).merge(c.pos)
|
569
|
+
end
|
570
|
+
|
571
|
+
def hybrid
|
572
|
+
false
|
573
|
+
end
|
574
|
+
|
575
|
+
def details
|
576
|
+
{:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
|
577
|
+
end
|
578
|
+
}
|
579
|
+
/
|
580
|
+
a:uninomial_string space b:infragenus {
|
581
|
+
def value
|
582
|
+
a.value + " " + b.value
|
583
|
+
end
|
584
|
+
|
585
|
+
def canonical
|
586
|
+
a.canonical
|
587
|
+
end
|
588
|
+
|
589
|
+
def pos
|
590
|
+
a.pos.merge(b.pos)
|
591
|
+
end
|
592
|
+
|
593
|
+
def hybrid
|
594
|
+
false
|
595
|
+
end
|
596
|
+
|
597
|
+
def details
|
598
|
+
{:uninomial => a.details[:uninomial].merge(b.details)}
|
599
|
+
end
|
600
|
+
}
|
601
|
+
/
|
536
602
|
a:uninomial_string space_hard b:authorship {
|
537
603
|
def value
|
538
604
|
a.value + " " + b.value
|
@@ -799,7 +865,7 @@ grammar ScientificNameClean
|
|
799
865
|
|
800
866
|
|
801
867
|
rule unknown_auth
|
802
|
-
("auct."/"hort."/"anon."/"ht.") {
|
868
|
+
("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
|
803
869
|
def value
|
804
870
|
text_value
|
805
871
|
end
|
@@ -837,7 +903,7 @@ grammar ScientificNameClean
|
|
837
903
|
end
|
838
904
|
|
839
905
|
rule author_name
|
840
|
-
space a:author_prefix_word space b:author_name
|
906
|
+
space a:author_prefix_word space b:author_name {
|
841
907
|
def value
|
842
908
|
a.value + " " + b.value
|
843
909
|
end
|
@@ -851,7 +917,7 @@ grammar ScientificNameClean
|
|
851
917
|
end
|
852
918
|
}
|
853
919
|
/
|
854
|
-
|
920
|
+
a:author_word space b:author_name {
|
855
921
|
def value
|
856
922
|
a.value + " " + b.value
|
857
923
|
end
|
@@ -883,7 +949,7 @@ grammar ScientificNameClean
|
|
883
949
|
end
|
884
950
|
}
|
885
951
|
/
|
886
|
-
("arg."/"et al.\{\?\}"/"et al.") {
|
952
|
+
("arg."/"et al.\{\?\}"/"et al."/"et al") {
|
887
953
|
def value
|
888
954
|
text_value.strip
|
889
955
|
end
|
@@ -930,7 +996,7 @@ grammar ScientificNameClean
|
|
930
996
|
end
|
931
997
|
|
932
998
|
rule author_prefix_word
|
933
|
-
space ("ab"/"bis"/"da"/"der"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
|
999
|
+
space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
|
934
1000
|
def value
|
935
1001
|
text_value
|
936
1002
|
end
|
@@ -976,6 +1042,14 @@ grammar ScientificNameClean
|
|
976
1042
|
}
|
977
1043
|
end
|
978
1044
|
|
1045
|
+
rule capped_dotted_char
|
1046
|
+
[A-Z] "." {
|
1047
|
+
def value
|
1048
|
+
text_value
|
1049
|
+
end
|
1050
|
+
}
|
1051
|
+
end
|
1052
|
+
|
979
1053
|
rule species_word_hybrid
|
980
1054
|
a:multiplication_sign space b:species_word {
|
981
1055
|
def value
|
@@ -1051,7 +1125,9 @@ grammar ScientificNameClean
|
|
1051
1125
|
rule species_word
|
1052
1126
|
a:[0-9]+ "-"? b:latin_word {
|
1053
1127
|
def value
|
1054
|
-
|
1128
|
+
num = {"1" => "uni", "2" => "du", "3" => "tri", "4" => "quadri", "5" => "quinque", "6" => "hexa", "7" => "septem", "8" => "octo", "9" => "novem", "10" => "decem", "11" => "undecim", "12" => "duodec", "13" => "tredec", "14" => "quattuordec", "15" => "quinquadec", "16" => "hexadec", "17" => "septendec", "18" => "octodec", "19" => "novemdec", "20" => "viginti", "21" => "unviginti", "22" => "duodeviginti", "23" => "triviginti", "24" => "quattuorviginti", "25" => "quinquatviginti", "26" => "hexaviginti", "27" => "septenviginti", "28" => "octoviginti", "29" => "novemviginti", "30" => "triginta", "38" => "trigintaocto", "100" => "centi"}
|
1129
|
+
a_value = num[a.text_value] ? num[a.text_value] : a.text_value + "-"
|
1130
|
+
a_value + b.value
|
1055
1131
|
end
|
1056
1132
|
}
|
1057
1133
|
/
|
@@ -1059,18 +1135,21 @@ grammar ScientificNameClean
|
|
1059
1135
|
end
|
1060
1136
|
|
1061
1137
|
rule latin_word
|
1062
|
-
a:
|
1138
|
+
a:valid_name_letters "-" b:latin_word {
|
1139
|
+
def value
|
1140
|
+
a.value + "-" + b.value
|
1141
|
+
end
|
1142
|
+
}
|
1143
|
+
/
|
1144
|
+
a:valid_name_letter b:valid_name_letters {
|
1063
1145
|
def value
|
1064
|
-
|
1065
|
-
l = 'ae' if l == 'æ'
|
1066
|
-
l = 'oe' if l == 'œ'
|
1067
|
-
l + b.value
|
1146
|
+
a.value + b.value
|
1068
1147
|
end
|
1069
1148
|
}
|
1070
1149
|
end
|
1071
1150
|
|
1072
1151
|
rule valid_name_letters
|
1073
|
-
[a-
|
1152
|
+
[a-zëæœ]+ {
|
1074
1153
|
def value
|
1075
1154
|
res = ''
|
1076
1155
|
text_value.split('').each do |l|
|
@@ -1086,6 +1165,18 @@ grammar ScientificNameClean
|
|
1086
1165
|
}
|
1087
1166
|
end
|
1088
1167
|
|
1168
|
+
rule valid_name_letter
|
1169
|
+
[a-zëæœ] {
|
1170
|
+
def value
|
1171
|
+
res = text_value
|
1172
|
+
res = 'ae' if res == 'æ'
|
1173
|
+
res = 'oe' if res == 'œ'
|
1174
|
+
res
|
1175
|
+
end
|
1176
|
+
}
|
1177
|
+
end
|
1178
|
+
|
1179
|
+
|
1089
1180
|
rule cap_digraph
|
1090
1181
|
"Æ" {
|
1091
1182
|
def value
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -6,6 +6,35 @@ require File.join(dir, *%w[parser scientific_name_canonical])
|
|
6
6
|
require 'rubygems'
|
7
7
|
require 'json'
|
8
8
|
|
9
|
+
module PreProcessor
|
10
|
+
NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
|
11
|
+
TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
|
12
|
+
TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
|
13
|
+
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
|
14
|
+
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
|
15
|
+
LAST_WORD_JUNK = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
|
16
|
+
|
17
|
+
def self.clean(a_string)
|
18
|
+
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
19
|
+
a_string = a_string.gsub(i, '')
|
20
|
+
end
|
21
|
+
a_string = a_string.tr('ſ','s') #old 's'
|
22
|
+
a_string
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
# we can use these expressions when we are ready to parse virus names
|
27
|
+
# class VirusParser
|
28
|
+
# def initialize
|
29
|
+
# @order = /^\s*[A-Z][a-z]\+virales/i
|
30
|
+
# @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
|
31
|
+
# @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
|
32
|
+
# @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
|
33
|
+
# @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
|
34
|
+
# @parsed = nil
|
35
|
+
# end
|
36
|
+
# end
|
37
|
+
|
9
38
|
class ScientificNameParser
|
10
39
|
|
11
40
|
def initialize
|
@@ -15,21 +44,36 @@ class ScientificNameParser
|
|
15
44
|
@canonical = ScientificNameCanonicalParser.new
|
16
45
|
@parsed = nil
|
17
46
|
end
|
18
|
-
|
47
|
+
|
48
|
+
def virus?(a_string)
|
49
|
+
!!(a_string.match(/\sICTV\s*$/) || a_string.match(/\s(virus|phage|viroid|satellite|prion)\b/i))
|
50
|
+
end
|
51
|
+
|
19
52
|
def parsed
|
20
53
|
@parsed
|
21
54
|
end
|
22
55
|
|
23
56
|
def parse(a_string)
|
24
57
|
@verbatim = a_string
|
25
|
-
|
26
|
-
|
58
|
+
a_string = PreProcessor::clean(a_string)
|
59
|
+
|
60
|
+
if virus?(a_string)
|
61
|
+
@parsed = { :verbatim => a_string, :virus => true }
|
62
|
+
else
|
63
|
+
@parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
|
64
|
+
end
|
65
|
+
|
66
|
+
def @parsed.verbatim=(a_string)
|
67
|
+
@verbatim = a_string
|
68
|
+
end
|
69
|
+
|
70
|
+
def @parsed.all(verbatim = @verbatim)
|
27
71
|
parsed = self.class != Hash
|
28
72
|
res = {:parsed => parsed}
|
29
73
|
if parsed
|
30
74
|
hybrid = self.hybrid rescue false
|
31
75
|
res.merge!({
|
32
|
-
:verbatim =>
|
76
|
+
:verbatim => @verbatim,
|
33
77
|
:normalized => self.value,
|
34
78
|
:canonical => self.canonical,
|
35
79
|
:hybrid => hybrid,
|
@@ -51,7 +95,8 @@ class ScientificNameParser
|
|
51
95
|
def @parsed.all_json
|
52
96
|
self.all.to_json rescue ''
|
53
97
|
end
|
54
|
-
|
98
|
+
|
99
|
+
@parsed.verbatim = @verbatim
|
55
100
|
@parsed.all
|
56
101
|
end
|
57
102
|
end
|
@@ -98,7 +98,8 @@ describe ScientificNameClean do
|
|
98
98
|
['Ærenea cognata Lacordaire, 1872', 'Aerenea cognata Lacordaire 1872'],
|
99
99
|
['Œdicnemus capensis', 'Oedicnemus capensis'],
|
100
100
|
['Œnanthæ œnanthe','Oenanthae oenanthe'],
|
101
|
-
['Œnanthe œnanthe','Oenanthe oenanthe']
|
101
|
+
['Œnanthe œnanthe','Oenanthe oenanthe'],
|
102
|
+
['Cerambyx thomæ Gmelin J. F., 1790', 'Cerambyx thomae Gmelin J. F. 1790']
|
102
103
|
]
|
103
104
|
names.each do |name_pair|
|
104
105
|
parse(name_pair[0]).should_not be_nil
|
@@ -120,6 +121,18 @@ describe ScientificNameClean do
|
|
120
121
|
canonical(sn).should == "Hegeter intercedens"
|
121
122
|
details(sn).should == [{:genus=>{:string=>"Hegeter"}, :infragenus=>{:string=>"Hegeter"}, :species=>{:string=>"intercedens", :authorship=>"Lindberg H 1950", :basionymAuthorTeam=>{:authorTeam=>"Lindberg H", :author=>["Lindberg H"], :year=>"1950"}}}]
|
122
123
|
pos(sn).should == {0=>["genus", 7], 9=>["infragenus", 16], 18=>["species", 29], 30=>["author_word", 38], 39=>["author_word", 40], 41=>["year", 45]}
|
124
|
+
sn = "Ixodes (Ixodes) hexagonus hexagonus Neumann, 1911"
|
125
|
+
canonical(sn).should == "Ixodes hexagonus hexagonus"
|
126
|
+
sn = "Brachytrypus (B.) grandidieri"
|
127
|
+
canonical(sn).should == "Brachytrypus grandidieri"
|
128
|
+
details(sn).should == [{:genus=>{:string=>"Brachytrypus"}, :infragenus=>{:string=>"B."}, :species=>{:string=>"grandidieri"}}]
|
129
|
+
sn = "Empis (Argyrandrus) Bezzi 1909"
|
130
|
+
details(sn).should == [{:uninomial=>{:string=>"Empis", :infragenus=>{:string=>"Argyrandrus"}, :authorship=>"Bezzi 1909", :basionymAuthorTeam=>{:authorTeam=>"Bezzi", :author=>["Bezzi"], :year=>"1909"}}}]
|
131
|
+
sn = "Platydoris (Bergh )"
|
132
|
+
details(sn).should == [{:uninomial=>{:string=>"Platydoris", :infragenus=>{:string=>"Bergh"}}}]
|
133
|
+
value(sn).should == "Platydoris (Bergh)"
|
134
|
+
sn = "Platydoris (B.)"
|
135
|
+
details(sn).should == [{:uninomial=>{:string=>"Platydoris", :infragenus=>{:string=>"B."}}}]
|
123
136
|
end
|
124
137
|
|
125
138
|
it 'should parse several authors without a year' do
|
@@ -219,6 +232,8 @@ describe ScientificNameClean do
|
|
219
232
|
value(sn).should == "Phaeographis inusta var. macularis (Leight.) A.L. Sm. 1861"
|
220
233
|
canonical(sn).should == "Phaeographis inusta macularis"
|
221
234
|
pos(sn).should == {0=>["genus", 12], 13=>["species", 19], 25=>["infraspecies", 34], 35=>["author_word", 42], 44=>["author_word", 48], 49=>["author_word", 52], 53=>["year", 57]}
|
235
|
+
sn = "Cassytha peninsularis J. Z. Weber var. flindersii"
|
236
|
+
canonical(sn).should == "Cassytha peninsularis flindersii"
|
222
237
|
end
|
223
238
|
|
224
239
|
it 'should parse unknown original authors (auct.)/(hort.)/(?)' do
|
@@ -239,7 +254,7 @@ describe ScientificNameClean do
|
|
239
254
|
pos(sn).should == {0=>["genus", 4], 5=>["species", 10], 11=>["unknown_author", 14]}
|
240
255
|
end
|
241
256
|
|
242
|
-
it '
|
257
|
+
it 'should parse real world examples' do
|
243
258
|
sn = "Stagonospora polyspora M.T. Lucas & Sousa da Câmara 1934"
|
244
259
|
parse(sn).should_not be_nil
|
245
260
|
value(sn).should == "Stagonospora polyspora M.T. Lucas et Sousa da Câmara 1934"
|
@@ -283,16 +298,16 @@ describe ScientificNameClean do
|
|
283
298
|
sn = "Gastrosericus eremorum von Beaumont 1955"
|
284
299
|
canonical(sn).should == 'Gastrosericus eremorum'
|
285
300
|
sn = "Cypraeovula (Luponia) amphithales perdentata"
|
286
|
-
canonical(sn).should == 'Cypraeovula
|
301
|
+
canonical(sn).should == 'Cypraeovula amphithales perdentata'
|
287
302
|
details(sn).should == [{:genus=>{:string=>"Cypraeovula"}, :infragenus=>{:string=>"Luponia"}, :species=>{:string=>"amphithales"}, :infraspecies=>[{:string=>"perdentata", :rank=>"n/a"}]}]
|
288
303
|
sn = "Polyrhachis orsyllus nat musculus Forel 1901"
|
289
304
|
canonical(sn).should == "Polyrhachis orsyllus musculus"
|
290
305
|
sn = 'Latrodectus 13-guttatus Thorell, 1875'
|
291
|
-
canonical(sn).should == 'Latrodectus
|
292
|
-
value(sn).should == 'Latrodectus
|
293
|
-
sn = 'Latrodectus
|
294
|
-
canonical(sn).should == 'Latrodectus
|
295
|
-
value(sn).should == 'Latrodectus
|
306
|
+
canonical(sn).should == 'Latrodectus tredecguttatus'
|
307
|
+
value(sn).should == 'Latrodectus tredecguttatus Thorell 1875'
|
308
|
+
sn = 'Latrodectus 3-guttatus Thorell, 1875'
|
309
|
+
canonical(sn).should == 'Latrodectus triguttatus'
|
310
|
+
value(sn).should == 'Latrodectus triguttatus Thorell 1875'
|
296
311
|
sn = 'Balaninus c-album Schönherr, CJ., 1836'
|
297
312
|
canonical(sn).should == 'Balaninus c-album'
|
298
313
|
end
|
@@ -353,7 +368,7 @@ describe ScientificNameClean do
|
|
353
368
|
parse(sn).should_not be_nil
|
354
369
|
value(sn).should == "Arthopyrenia hyalospora (Nyl.) R.C. Harris comb. nov."
|
355
370
|
canonical(sn).should == "Arthopyrenia hyalospora"
|
356
|
-
details(sn).should == [{:genus=>{:string=>"Arthopyrenia"}, :species=>{:string=>"hyalospora", :authorship=>"(Nyl.) R.C. Harris", :combinationAuthorTeam=>{:authorTeam=>"R.C. Harris
|
371
|
+
details(sn).should == [{:genus=>{:string=>"Arthopyrenia"}, :species=>{:string=>"hyalospora", :authorship=>"(Nyl.) R.C. Harris", :combinationAuthorTeam=>{:authorTeam=>"R.C. Harris", :author=>["R.C. Harris"]}, :basionymAuthorTeam=>{:authorTeam=>"Nyl.", :author=>["Nyl."]}}, :status=>"comb. nov."}]
|
357
372
|
pos(sn).should == {0=>["genus", 12], 13=>["species", 23], 25=>["author_word", 29], 31=>["author_word", 35], 36=>["author_word", 42]}
|
358
373
|
end
|
359
374
|
|
@@ -414,6 +429,9 @@ describe ScientificNameClean do
|
|
414
429
|
parse(res[0]).hybrid.should be_true
|
415
430
|
details(res[0]).should == res[1]
|
416
431
|
end
|
432
|
+
sn = "Rosa alpina x pomifera"
|
433
|
+
canonical(sn).should == "Rosa alpina × pomifera"
|
434
|
+
parse(sn).details.should == [{:genus=>{:string=>"Rosa"}, :species=>{:string=>"alpina"}}, {:species=>{:string=>"pomifera"}, :genus=>{:string=>"Rosa"}}]
|
417
435
|
end
|
418
436
|
|
419
437
|
it "should parse hybrid combination" do
|
@@ -421,14 +439,14 @@ describe ScientificNameClean do
|
|
421
439
|
parse(sn).should_not be_nil
|
422
440
|
parse(sn).hybrid.should be_true
|
423
441
|
value(sn).should == "Arthopyrenia hyalospora \303\227 Hydnellum scrobiculatum"
|
424
|
-
canonical(sn).should == "Arthopyrenia hyalospora Hydnellum scrobiculatum"
|
442
|
+
canonical(sn).should == "Arthopyrenia hyalospora × Hydnellum scrobiculatum"
|
425
443
|
details(sn).should == [{:genus=>{:string=>"Arthopyrenia"}, :species=>{:string=>"hyalospora"}}, {:genus=>{:string=>"Hydnellum"}, :species=>{:string=>"scrobiculatum"}}]
|
426
444
|
pos(sn).should == {0=>["genus", 12], 13=>["species", 23], 26=>["genus", 35], 36=>["species", 49]}
|
427
445
|
sn = "Arthopyrenia hyalospora (Banker) D. Hall X Hydnellum scrobiculatum D.E. Stuntz"
|
428
446
|
parse(sn).should_not be_nil
|
429
447
|
parse(sn).hybrid.should be_true
|
430
448
|
value(sn).should == "Arthopyrenia hyalospora (Banker) D. Hall \303\227 Hydnellum scrobiculatum D.E. Stuntz"
|
431
|
-
canonical(sn).should == "Arthopyrenia hyalospora Hydnellum scrobiculatum"
|
449
|
+
canonical(sn).should == "Arthopyrenia hyalospora × Hydnellum scrobiculatum"
|
432
450
|
pos(sn).should == {0=>["genus", 12], 13=>["species", 23], 25=>["author_word", 31], 33=>["author_word", 35], 36=>["author_word", 40], 43=>["genus", 52], 53=>["species", 66], 67=>["author_word", 71], 72=>["author_word", 78]}
|
433
451
|
value("Arthopyrenia hyalospora X").should == "Arthopyrenia hyalospora \303\227 ?"
|
434
452
|
sn = "Arthopyrenia hyalospora x"
|
@@ -446,7 +464,6 @@ describe ScientificNameClean do
|
|
446
464
|
|
447
465
|
it 'should parse names with taxon concept' do
|
448
466
|
sn = "Stenometope laevissimus sec. Eschmeyer 2004"
|
449
|
-
val = @parser.failure_reason.to_s.match(/column [0-9]*/).to_s().gsub(/column /,'')
|
450
467
|
details(sn).should == [{:genus=>{:string=>"Stenometope"}, :species=>{:string=>"laevissimus"}, :taxon_concept=>{:authorship=>"Eschmeyer 2004", :basionymAuthorTeam=>{:authorTeam=>"Eschmeyer", :author=>["Eschmeyer"], :year=>"2004"}}}]
|
451
468
|
pos(sn).should == {0=>["genus", 11], 12=>["species", 23], 29=>["author_word", 38], 39=>["year", 43]}
|
452
469
|
sn = "Stenometope laevissimus Bibron 1855 sec. Eschmeyer 2004"
|
@@ -501,26 +518,35 @@ describe ScientificNameClean do
|
|
501
518
|
details(sn).should == [{:genus=>{:string=>"Flexibacter"}, :species=>{:string=>"elegans", :authorship=>"Soriano 1945, non Lewin 1969", :basionymAuthorTeam=>{:authorTeam=>"Soriano", :author=>["Soriano"], :year=>"1945"}}}]
|
502
519
|
end
|
503
520
|
|
504
|
-
#
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
# end
|
521
|
+
# Combination genus names should be merged without dash or capital letter
|
522
|
+
it 'should parse hybrid names with capitalized second name in genus (botanical code error)' do
|
523
|
+
sn = 'Anacampti-Platanthera P. Fourn.'
|
524
|
+
parse(sn).should_not be_nil
|
525
|
+
canonical(sn).should == 'Anacamptiplatanthera'
|
526
|
+
sn = 'Anacampti-Platanthera vulgaris P. Fourn.'
|
527
|
+
parse(sn).should_not be_nil
|
528
|
+
canonical(sn).should == 'Anacamptiplatanthera vulgaris'
|
529
|
+
end
|
514
530
|
|
515
|
-
# it 'shoud parse hybrid names with * character' do
|
516
|
-
# sn = "Carduus acanthoides * crispus"
|
517
|
-
# details(sn).should == ''
|
518
|
-
# end
|
519
|
-
|
520
531
|
it 'should parse genus names starting with uppercase letters AE OE' do
|
521
532
|
sn = 'AEmona separata Broun 1921'
|
522
533
|
canonical(sn).should == 'Aemona separata'
|
523
534
|
sn = 'OEmona simplex White, 1855'
|
524
535
|
canonical(sn).should == 'Oemona simplex'
|
525
536
|
end
|
537
|
+
#"Arthrosamanea eriorhachis (Harms & sine ref. ) Aubrév." -- ignore & sine ref. (means without reference)
|
538
|
+
|
539
|
+
=begin
|
540
|
+
new stuff
|
541
|
+
|
542
|
+
sn = "Orchidaceae × Asconopsis hort."
|
543
|
+
canonical(sn).should == "Orchidaceae x Asconopsis"
|
544
|
+
sn
|
545
|
+
Tamiops swinhoei near hainanus|Tamiops swinhoei near hainanus
|
546
|
+
Conus textile form archiepiscopus|Conus textile form archiepiscopus|
|
547
|
+
Crypticus pseudosericeus ssp. olivieri Desbrochers des Loges,1881|Crypticus pseudosericeus olivieri des
|
548
|
+
Solanum nigrum subsp nigrum|Solanum nigrum subsp nigrum
|
549
|
+
Protoglossus taeniatum author unknown|Protoglossus taeniatum author unknown
|
550
|
+
Dupontiella (S. ?) bicolor|Dupontiella|
|
551
|
+
=end
|
526
552
|
end
|