biodiversity 0.5.16 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/README.rdoc +5 -5
- data/Rakefile +8 -3
- data/VERSION +1 -1
- data/bin/nnparse +7 -3
- data/bin/parserver +1 -0
- data/lib/biodiversity/parser.rb +50 -5
- data/lib/biodiversity/parser/scientific_name_clean.treetop +131 -40
- data/spec/parser/scientific_name_clean.spec.rb +53 -27
- data/spec/parser/test_data.txt +73 -21
- metadata +4 -8
- data/biodiversity.gemspec +0 -88
- data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -481
- data/lib/biodiversity/parser/scientific_name_clean.rb +0 -6118
- data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1309
data/.gitignore
CHANGED
data/README.rdoc
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
= Biodiversity
|
2
2
|
|
3
|
-
Parses
|
3
|
+
Parses a taxonomic scientific name and breaks it into semantic elements.
|
4
4
|
|
5
5
|
== Installation
|
6
6
|
|
7
|
-
To install gem you need RubyGems >= 1.
|
7
|
+
To install the gem you need RubyGems >= 1.3.6
|
8
8
|
|
9
|
-
$ gem
|
10
|
-
$ sudo gem install
|
9
|
+
$ sudo gem install biodiversity #for ruby 1.8.x
|
10
|
+
$ sudo gem install biodiversity19 #for ruby 1.9.x
|
11
11
|
|
12
12
|
== Example usage
|
13
13
|
|
14
|
-
You can parse file with
|
14
|
+
You can parse a file of taxonomic names from the command line. The file should contain one scientific name per line.
|
15
15
|
|
16
16
|
nnparse file_with_names
|
17
17
|
|
data/Rakefile
CHANGED
@@ -13,11 +13,13 @@ Spec::Rake::SpecTask.new do |t|
|
|
13
13
|
t.pattern = 'spec/**/*spec.rb'
|
14
14
|
end
|
15
15
|
|
16
|
+
ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
|
17
|
+
|
16
18
|
|
17
19
|
begin
|
18
20
|
require 'jeweler'
|
19
21
|
Jeweler::Tasks.new do |gem|
|
20
|
-
gem.name = "biodiversity"
|
22
|
+
gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
|
21
23
|
gem.summary = 'Parser of scientific names'
|
22
24
|
gem.description = 'Tools for biodiversity informatics'
|
23
25
|
gem.email = "dmozzherin@gmail.com"
|
@@ -37,11 +39,14 @@ end
|
|
37
39
|
|
38
40
|
task :tt do
|
39
41
|
['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
|
40
|
-
|
41
|
-
|
42
|
+
file = "#{dir}/lib/biodiversity/parser/#{f}"
|
43
|
+
FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
|
44
|
+
system("tt #{file}.treetop")
|
45
|
+
rf = "#{file}.rb"
|
42
46
|
rfn = open(rf + ".tmp", 'w')
|
43
47
|
skip_head = false
|
44
48
|
f = open(rf)
|
49
|
+
#getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
|
45
50
|
f.each_with_index do |l, i|
|
46
51
|
skip_head = l.match(/^# Autogenerated/) if i == 0
|
47
52
|
if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.6.0
|
data/bin/nnparse
CHANGED
@@ -1,11 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'rubygems'
|
3
|
-
|
3
|
+
gem_name = RUBY_VERSION.split('.')[0..1].join('').to_i > 18 ? 'biodiversity19' : 'biodiversity'
|
4
|
+
gem gem_name rescue nil
|
4
5
|
|
5
6
|
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
|
6
7
|
require 'biodiversity'
|
7
8
|
require 'json'
|
8
9
|
|
10
|
+
def parser_error(name)
|
11
|
+
{'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
|
12
|
+
end
|
9
13
|
|
10
14
|
if ARGV.empty?
|
11
15
|
puts "Usage:\n\nnnparse file_with_scientific_names [output_file]\n\ndefault output_file is parsed.json\n\n"
|
@@ -31,12 +35,12 @@ IO.foreach(input) do |line|
|
|
31
35
|
$KCODE = 'NONE'
|
32
36
|
end
|
33
37
|
p.parse(name)
|
34
|
-
parsed_data = p.parsed.all_json rescue
|
38
|
+
parsed_data = p.parsed.all_json rescue parser_error(name)
|
35
39
|
if ruby_min_version < 19
|
36
40
|
$KCODE = old_kcode
|
37
41
|
end
|
38
42
|
rescue
|
39
|
-
parsed_data =
|
43
|
+
parsed_data = parser_error(name)
|
40
44
|
end
|
41
45
|
o.write parsed_data + "\n"
|
42
46
|
end
|
data/bin/parserver
CHANGED
data/lib/biodiversity/parser.rb
CHANGED
@@ -6,6 +6,35 @@ require File.join(dir, *%w[parser scientific_name_canonical])
|
|
6
6
|
require 'rubygems'
|
7
7
|
require 'json'
|
8
8
|
|
9
|
+
module PreProcessor
|
10
|
+
NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
|
11
|
+
TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
|
12
|
+
TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
|
13
|
+
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
|
14
|
+
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
|
15
|
+
LAST_WORD_JUNK = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
|
16
|
+
|
17
|
+
def self.clean(a_string)
|
18
|
+
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
19
|
+
a_string = a_string.gsub(i, '')
|
20
|
+
end
|
21
|
+
a_string = a_string.tr('ſ','s') #old 's'
|
22
|
+
a_string
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
# we can use these expressions when we are ready to parse virus names
|
27
|
+
# class VirusParser
|
28
|
+
# def initialize
|
29
|
+
# @order = /^\s*[A-Z][a-z]\+virales/i
|
30
|
+
# @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
|
31
|
+
# @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
|
32
|
+
# @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
|
33
|
+
# @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
|
34
|
+
# @parsed = nil
|
35
|
+
# end
|
36
|
+
# end
|
37
|
+
|
9
38
|
class ScientificNameParser
|
10
39
|
|
11
40
|
def initialize
|
@@ -15,21 +44,36 @@ class ScientificNameParser
|
|
15
44
|
@canonical = ScientificNameCanonicalParser.new
|
16
45
|
@parsed = nil
|
17
46
|
end
|
18
|
-
|
47
|
+
|
48
|
+
def virus?(a_string)
|
49
|
+
!!(a_string.match(/\sICTV\s*$/) || a_string.match(/\s(virus|phage|viroid|satellite|prion)\b/i))
|
50
|
+
end
|
51
|
+
|
19
52
|
def parsed
|
20
53
|
@parsed
|
21
54
|
end
|
22
55
|
|
23
56
|
def parse(a_string)
|
24
57
|
@verbatim = a_string
|
25
|
-
|
26
|
-
|
58
|
+
a_string = PreProcessor::clean(a_string)
|
59
|
+
|
60
|
+
if virus?(a_string)
|
61
|
+
@parsed = { :verbatim => a_string, :virus => true }
|
62
|
+
else
|
63
|
+
@parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
|
64
|
+
end
|
65
|
+
|
66
|
+
def @parsed.verbatim=(a_string)
|
67
|
+
@verbatim = a_string
|
68
|
+
end
|
69
|
+
|
70
|
+
def @parsed.all(verbatim = @verbatim)
|
27
71
|
parsed = self.class != Hash
|
28
72
|
res = {:parsed => parsed}
|
29
73
|
if parsed
|
30
74
|
hybrid = self.hybrid rescue false
|
31
75
|
res.merge!({
|
32
|
-
:verbatim =>
|
76
|
+
:verbatim => @verbatim,
|
33
77
|
:normalized => self.value,
|
34
78
|
:canonical => self.canonical,
|
35
79
|
:hybrid => hybrid,
|
@@ -51,7 +95,8 @@ class ScientificNameParser
|
|
51
95
|
def @parsed.all_json
|
52
96
|
self.all.to_json rescue ''
|
53
97
|
end
|
54
|
-
|
98
|
+
|
99
|
+
@parsed.verbatim = @verbatim
|
55
100
|
@parsed.all
|
56
101
|
end
|
57
102
|
end
|
@@ -30,6 +30,28 @@ grammar ScientificNameClean
|
|
30
30
|
end
|
31
31
|
|
32
32
|
rule scientific_name_5
|
33
|
+
a:multinomial_name space_hard hybrid_character space_hard b:species {
|
34
|
+
def value
|
35
|
+
a.value + " × " + b.value
|
36
|
+
end
|
37
|
+
|
38
|
+
def canonical
|
39
|
+
a.canonical + " × " + b.canonical
|
40
|
+
end
|
41
|
+
|
42
|
+
def pos
|
43
|
+
a.pos.merge(b.pos)
|
44
|
+
end
|
45
|
+
|
46
|
+
def hybrid
|
47
|
+
true
|
48
|
+
end
|
49
|
+
|
50
|
+
def details
|
51
|
+
[a.details, b.details.merge({:genus => a.details[:genus]})]
|
52
|
+
end
|
53
|
+
}
|
54
|
+
/
|
33
55
|
a:scientific_name_1 space b:taxon_concept_rank space c:authorship {
|
34
56
|
def value
|
35
57
|
a.value + " " + b.apply(c)
|
@@ -62,7 +84,7 @@ grammar ScientificNameClean
|
|
62
84
|
end
|
63
85
|
|
64
86
|
def canonical
|
65
|
-
a.canonical + " " + b.canonical
|
87
|
+
a.canonical + " × " + b.canonical
|
66
88
|
end
|
67
89
|
|
68
90
|
def pos
|
@@ -196,7 +218,7 @@ grammar ScientificNameClean
|
|
196
218
|
end
|
197
219
|
|
198
220
|
def canonical
|
199
|
-
a.canonical + " " +
|
221
|
+
a.canonical + " " + c.canonical + " " + d.canonical
|
200
222
|
end
|
201
223
|
|
202
224
|
def pos
|
@@ -381,7 +403,7 @@ grammar ScientificNameClean
|
|
381
403
|
end
|
382
404
|
|
383
405
|
rule rank
|
384
|
-
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"subsp."/"subf."/"race"/"α"
|
406
|
+
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
|
385
407
|
/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
|
386
408
|
{
|
387
409
|
def value
|
@@ -405,7 +427,7 @@ grammar ScientificNameClean
|
|
405
427
|
end
|
406
428
|
|
407
429
|
rule rank_forma
|
408
|
-
("forma"/"form."/"fo."/"f.")
|
430
|
+
("forma"/"form."/"form"/"fo."/"f.")
|
409
431
|
{
|
410
432
|
def value
|
411
433
|
"f."
|
@@ -449,28 +471,28 @@ grammar ScientificNameClean
|
|
449
471
|
end
|
450
472
|
|
451
473
|
rule species_string
|
452
|
-
a:species_word &(space_hard author_prefix_word space_hard) {
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
}
|
473
|
-
/
|
474
|
+
# a:species_word &(space_hard author_prefix_word space_hard) {
|
475
|
+
# def value
|
476
|
+
# a.value
|
477
|
+
# end
|
478
|
+
#
|
479
|
+
# def canonical
|
480
|
+
# a.value
|
481
|
+
# end
|
482
|
+
#
|
483
|
+
# def hybrid
|
484
|
+
# a.hybrid rescue false
|
485
|
+
# end
|
486
|
+
#
|
487
|
+
# def pos
|
488
|
+
# {a.interval.begin => ['species', a.interval.end]}
|
489
|
+
# end
|
490
|
+
#
|
491
|
+
# def details
|
492
|
+
# {:species => {:string => a.value}}
|
493
|
+
# end
|
494
|
+
# }
|
495
|
+
# /
|
474
496
|
species_word {
|
475
497
|
def canonical
|
476
498
|
value
|
@@ -493,7 +515,7 @@ grammar ScientificNameClean
|
|
493
515
|
end
|
494
516
|
|
495
517
|
rule infragenus
|
496
|
-
left_paren space a:cap_latin_word space right_paren {
|
518
|
+
left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
|
497
519
|
def value
|
498
520
|
"(" + a.value + ")"
|
499
521
|
end
|
@@ -513,7 +535,7 @@ grammar ScientificNameClean
|
|
513
535
|
end
|
514
536
|
|
515
537
|
rule genus
|
516
|
-
a:
|
538
|
+
a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
|
517
539
|
def value
|
518
540
|
a.value
|
519
541
|
end
|
@@ -533,6 +555,50 @@ grammar ScientificNameClean
|
|
533
555
|
end
|
534
556
|
|
535
557
|
rule uninomial_name
|
558
|
+
a:uninomial_string space b:infragenus space c:simple_authorship {
|
559
|
+
def value
|
560
|
+
a.value + " " + b.value + " " + c.value
|
561
|
+
end
|
562
|
+
|
563
|
+
def canonical
|
564
|
+
a.canonical
|
565
|
+
end
|
566
|
+
|
567
|
+
def pos
|
568
|
+
a.pos.merge(b.pos).merge(c.pos)
|
569
|
+
end
|
570
|
+
|
571
|
+
def hybrid
|
572
|
+
false
|
573
|
+
end
|
574
|
+
|
575
|
+
def details
|
576
|
+
{:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
|
577
|
+
end
|
578
|
+
}
|
579
|
+
/
|
580
|
+
a:uninomial_string space b:infragenus {
|
581
|
+
def value
|
582
|
+
a.value + " " + b.value
|
583
|
+
end
|
584
|
+
|
585
|
+
def canonical
|
586
|
+
a.canonical
|
587
|
+
end
|
588
|
+
|
589
|
+
def pos
|
590
|
+
a.pos.merge(b.pos)
|
591
|
+
end
|
592
|
+
|
593
|
+
def hybrid
|
594
|
+
false
|
595
|
+
end
|
596
|
+
|
597
|
+
def details
|
598
|
+
{:uninomial => a.details[:uninomial].merge(b.details)}
|
599
|
+
end
|
600
|
+
}
|
601
|
+
/
|
536
602
|
a:uninomial_string space_hard b:authorship {
|
537
603
|
def value
|
538
604
|
a.value + " " + b.value
|
@@ -799,7 +865,7 @@ grammar ScientificNameClean
|
|
799
865
|
|
800
866
|
|
801
867
|
rule unknown_auth
|
802
|
-
("auct."/"hort."/"anon."/"ht.") {
|
868
|
+
("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
|
803
869
|
def value
|
804
870
|
text_value
|
805
871
|
end
|
@@ -837,7 +903,7 @@ grammar ScientificNameClean
|
|
837
903
|
end
|
838
904
|
|
839
905
|
rule author_name
|
840
|
-
space a:author_prefix_word space b:author_name
|
906
|
+
space a:author_prefix_word space b:author_name {
|
841
907
|
def value
|
842
908
|
a.value + " " + b.value
|
843
909
|
end
|
@@ -851,7 +917,7 @@ grammar ScientificNameClean
|
|
851
917
|
end
|
852
918
|
}
|
853
919
|
/
|
854
|
-
|
920
|
+
a:author_word space b:author_name {
|
855
921
|
def value
|
856
922
|
a.value + " " + b.value
|
857
923
|
end
|
@@ -883,7 +949,7 @@ grammar ScientificNameClean
|
|
883
949
|
end
|
884
950
|
}
|
885
951
|
/
|
886
|
-
("arg."/"et al.\{\?\}"/"et al.") {
|
952
|
+
("arg."/"et al.\{\?\}"/"et al."/"et al") {
|
887
953
|
def value
|
888
954
|
text_value.strip
|
889
955
|
end
|
@@ -930,7 +996,7 @@ grammar ScientificNameClean
|
|
930
996
|
end
|
931
997
|
|
932
998
|
rule author_prefix_word
|
933
|
-
space ("ab"/"bis"/"da"/"der"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
|
999
|
+
space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
|
934
1000
|
def value
|
935
1001
|
text_value
|
936
1002
|
end
|
@@ -976,6 +1042,14 @@ grammar ScientificNameClean
|
|
976
1042
|
}
|
977
1043
|
end
|
978
1044
|
|
1045
|
+
rule capped_dotted_char
|
1046
|
+
[A-Z] "." {
|
1047
|
+
def value
|
1048
|
+
text_value
|
1049
|
+
end
|
1050
|
+
}
|
1051
|
+
end
|
1052
|
+
|
979
1053
|
rule species_word_hybrid
|
980
1054
|
a:multiplication_sign space b:species_word {
|
981
1055
|
def value
|
@@ -1051,7 +1125,9 @@ grammar ScientificNameClean
|
|
1051
1125
|
rule species_word
|
1052
1126
|
a:[0-9]+ "-"? b:latin_word {
|
1053
1127
|
def value
|
1054
|
-
|
1128
|
+
num = {"1" => "uni", "2" => "du", "3" => "tri", "4" => "quadri", "5" => "quinque", "6" => "hexa", "7" => "septem", "8" => "octo", "9" => "novem", "10" => "decem", "11" => "undecim", "12" => "duodec", "13" => "tredec", "14" => "quattuordec", "15" => "quinquadec", "16" => "hexadec", "17" => "septendec", "18" => "octodec", "19" => "novemdec", "20" => "viginti", "21" => "unviginti", "22" => "duodeviginti", "23" => "triviginti", "24" => "quattuorviginti", "25" => "quinquatviginti", "26" => "hexaviginti", "27" => "septenviginti", "28" => "octoviginti", "29" => "novemviginti", "30" => "triginta", "38" => "trigintaocto", "100" => "centi"}
|
1129
|
+
a_value = num[a.text_value] ? num[a.text_value] : a.text_value + "-"
|
1130
|
+
a_value + b.value
|
1055
1131
|
end
|
1056
1132
|
}
|
1057
1133
|
/
|
@@ -1059,18 +1135,21 @@ grammar ScientificNameClean
|
|
1059
1135
|
end
|
1060
1136
|
|
1061
1137
|
rule latin_word
|
1062
|
-
a:
|
1138
|
+
a:valid_name_letters "-" b:latin_word {
|
1139
|
+
def value
|
1140
|
+
a.value + "-" + b.value
|
1141
|
+
end
|
1142
|
+
}
|
1143
|
+
/
|
1144
|
+
a:valid_name_letter b:valid_name_letters {
|
1063
1145
|
def value
|
1064
|
-
|
1065
|
-
l = 'ae' if l == 'æ'
|
1066
|
-
l = 'oe' if l == 'œ'
|
1067
|
-
l + b.value
|
1146
|
+
a.value + b.value
|
1068
1147
|
end
|
1069
1148
|
}
|
1070
1149
|
end
|
1071
1150
|
|
1072
1151
|
rule valid_name_letters
|
1073
|
-
[a-
|
1152
|
+
[a-zëæœ]+ {
|
1074
1153
|
def value
|
1075
1154
|
res = ''
|
1076
1155
|
text_value.split('').each do |l|
|
@@ -1086,6 +1165,18 @@ grammar ScientificNameClean
|
|
1086
1165
|
}
|
1087
1166
|
end
|
1088
1167
|
|
1168
|
+
rule valid_name_letter
|
1169
|
+
[a-zëæœ] {
|
1170
|
+
def value
|
1171
|
+
res = text_value
|
1172
|
+
res = 'ae' if res == 'æ'
|
1173
|
+
res = 'oe' if res == 'œ'
|
1174
|
+
res
|
1175
|
+
end
|
1176
|
+
}
|
1177
|
+
end
|
1178
|
+
|
1179
|
+
|
1089
1180
|
rule cap_digraph
|
1090
1181
|
"Æ" {
|
1091
1182
|
def value
|