biodiversity 0.5.16 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/README.rdoc +5 -5
- data/Rakefile +8 -3
- data/VERSION +1 -1
- data/bin/nnparse +7 -3
- data/bin/parserver +1 -0
- data/lib/biodiversity/parser.rb +50 -5
- data/lib/biodiversity/parser/scientific_name_clean.treetop +131 -40
- data/spec/parser/scientific_name_clean.spec.rb +53 -27
- data/spec/parser/test_data.txt +73 -21
- metadata +4 -8
- data/biodiversity.gemspec +0 -88
- data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -481
- data/lib/biodiversity/parser/scientific_name_clean.rb +0 -6118
- data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1309
data/.gitignore
CHANGED
data/README.rdoc
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
= Biodiversity
|
2
2
|
|
3
|
-
Parses
|
3
|
+
Parses taxonomic scientific name and breaks it into semantic elements.
|
4
4
|
|
5
5
|
== Installation
|
6
6
|
|
7
|
-
To install gem you need RubyGems >= 1.
|
7
|
+
To install gem you need RubyGems >= 1.3.6
|
8
8
|
|
9
|
-
$ gem
|
10
|
-
$ sudo gem install
|
9
|
+
$ sudo gem install biodiversity #for ruby 1.8.x
|
10
|
+
$ sudo gem install biodiversity19 #for ruby 1.9.x
|
11
11
|
|
12
12
|
== Example usage
|
13
13
|
|
14
|
-
You can parse file with
|
14
|
+
You can parse file with taxonomic names from command line. File should contain one scientific name per line
|
15
15
|
|
16
16
|
nnparser file_with_names
|
17
17
|
|
data/Rakefile
CHANGED
@@ -13,11 +13,13 @@ Spec::Rake::SpecTask.new do |t|
|
|
13
13
|
t.pattern = 'spec/**/*spec.rb'
|
14
14
|
end
|
15
15
|
|
16
|
+
ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
|
17
|
+
|
16
18
|
|
17
19
|
begin
|
18
20
|
require 'jeweler'
|
19
21
|
Jeweler::Tasks.new do |gem|
|
20
|
-
gem.name = "biodiversity"
|
22
|
+
gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
|
21
23
|
gem.summary = 'Parser of scientific names'
|
22
24
|
gem.description = 'Tools for biodiversity informatics'
|
23
25
|
gem.email = "dmozzherin@gmail.com"
|
@@ -37,11 +39,14 @@ end
|
|
37
39
|
|
38
40
|
task :tt do
|
39
41
|
['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
|
40
|
-
|
41
|
-
|
42
|
+
file = "#{dir}/lib/biodiversity/parser/#{f}"
|
43
|
+
FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
|
44
|
+
system("tt #{file}.treetop")
|
45
|
+
rf = "#{file}.rb"
|
42
46
|
rfn = open(rf + ".tmp", 'w')
|
43
47
|
skip_head = false
|
44
48
|
f = open(rf)
|
49
|
+
#getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
|
45
50
|
f.each_with_index do |l, i|
|
46
51
|
skip_head = l.match(/^# Autogenerated/) if i == 0
|
47
52
|
if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.6.0
|
data/bin/nnparse
CHANGED
@@ -1,11 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'rubygems'
|
3
|
-
|
3
|
+
gem_name = RUBY_VERSION.split('.')[0..1].join('').to_i > 18 ? 'biodiversity19' : 'biodiversity'
|
4
|
+
gem gem_name rescue nil
|
4
5
|
|
5
6
|
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
|
6
7
|
require 'biodiversity'
|
7
8
|
require 'json'
|
8
9
|
|
10
|
+
def parser_error(name)
|
11
|
+
{'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
|
12
|
+
end
|
9
13
|
|
10
14
|
if ARGV.empty?
|
11
15
|
puts "Usage:\n\nnnparse file_with_scientific_names [output_file]\n\ndefault output_file is parsed.json\n\n"
|
@@ -31,12 +35,12 @@ IO.foreach(input) do |line|
|
|
31
35
|
$KCODE = 'NONE'
|
32
36
|
end
|
33
37
|
p.parse(name)
|
34
|
-
parsed_data = p.parsed.all_json rescue
|
38
|
+
parsed_data = p.parsed.all_json rescue parser_error(name)
|
35
39
|
if ruby_min_version < 19
|
36
40
|
$KCODE = old_kcode
|
37
41
|
end
|
38
42
|
rescue
|
39
|
-
parsed_data =
|
43
|
+
parsed_data = parser_error(name)
|
40
44
|
end
|
41
45
|
o.write parsed_data + "\n"
|
42
46
|
end
|
data/bin/parserver
CHANGED
data/lib/biodiversity/parser.rb
CHANGED
@@ -6,6 +6,35 @@ require File.join(dir, *%w[parser scientific_name_canonical])
|
|
6
6
|
require 'rubygems'
|
7
7
|
require 'json'
|
8
8
|
|
9
|
+
module PreProcessor
|
10
|
+
NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
|
11
|
+
TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
|
12
|
+
TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
|
13
|
+
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
|
14
|
+
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
|
15
|
+
LAST_WORD_JUNK = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
|
16
|
+
|
17
|
+
def self.clean(a_string)
|
18
|
+
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
19
|
+
a_string = a_string.gsub(i, '')
|
20
|
+
end
|
21
|
+
a_string = a_string.tr('ſ','s') #old 's'
|
22
|
+
a_string
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
# we can use these expressions when we are ready to parse virus names
|
27
|
+
# class VirusParser
|
28
|
+
# def initialize
|
29
|
+
# @order = /^\s*[A-Z][a-z]\+virales/i
|
30
|
+
# @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
|
31
|
+
# @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
|
32
|
+
# @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
|
33
|
+
# @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
|
34
|
+
# @parsed = nil
|
35
|
+
# end
|
36
|
+
# end
|
37
|
+
|
9
38
|
class ScientificNameParser
|
10
39
|
|
11
40
|
def initialize
|
@@ -15,21 +44,36 @@ class ScientificNameParser
|
|
15
44
|
@canonical = ScientificNameCanonicalParser.new
|
16
45
|
@parsed = nil
|
17
46
|
end
|
18
|
-
|
47
|
+
|
48
|
+
def virus?(a_string)
|
49
|
+
!!(a_string.match(/\sICTV\s*$/) || a_string.match(/\s(virus|phage|viroid|satellite|prion)\b/i))
|
50
|
+
end
|
51
|
+
|
19
52
|
def parsed
|
20
53
|
@parsed
|
21
54
|
end
|
22
55
|
|
23
56
|
def parse(a_string)
|
24
57
|
@verbatim = a_string
|
25
|
-
|
26
|
-
|
58
|
+
a_string = PreProcessor::clean(a_string)
|
59
|
+
|
60
|
+
if virus?(a_string)
|
61
|
+
@parsed = { :verbatim => a_string, :virus => true }
|
62
|
+
else
|
63
|
+
@parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
|
64
|
+
end
|
65
|
+
|
66
|
+
def @parsed.verbatim=(a_string)
|
67
|
+
@verbatim = a_string
|
68
|
+
end
|
69
|
+
|
70
|
+
def @parsed.all(verbatim = @verbatim)
|
27
71
|
parsed = self.class != Hash
|
28
72
|
res = {:parsed => parsed}
|
29
73
|
if parsed
|
30
74
|
hybrid = self.hybrid rescue false
|
31
75
|
res.merge!({
|
32
|
-
:verbatim =>
|
76
|
+
:verbatim => @verbatim,
|
33
77
|
:normalized => self.value,
|
34
78
|
:canonical => self.canonical,
|
35
79
|
:hybrid => hybrid,
|
@@ -51,7 +95,8 @@ class ScientificNameParser
|
|
51
95
|
def @parsed.all_json
|
52
96
|
self.all.to_json rescue ''
|
53
97
|
end
|
54
|
-
|
98
|
+
|
99
|
+
@parsed.verbatim = @verbatim
|
55
100
|
@parsed.all
|
56
101
|
end
|
57
102
|
end
|
@@ -30,6 +30,28 @@ grammar ScientificNameClean
|
|
30
30
|
end
|
31
31
|
|
32
32
|
rule scientific_name_5
|
33
|
+
a:multinomial_name space_hard hybrid_character space_hard b:species {
|
34
|
+
def value
|
35
|
+
a.value + " × " + b.value
|
36
|
+
end
|
37
|
+
|
38
|
+
def canonical
|
39
|
+
a.canonical + " × " + b.canonical
|
40
|
+
end
|
41
|
+
|
42
|
+
def pos
|
43
|
+
a.pos.merge(b.pos)
|
44
|
+
end
|
45
|
+
|
46
|
+
def hybrid
|
47
|
+
true
|
48
|
+
end
|
49
|
+
|
50
|
+
def details
|
51
|
+
[a.details, b.details.merge({:genus => a.details[:genus]})]
|
52
|
+
end
|
53
|
+
}
|
54
|
+
/
|
33
55
|
a:scientific_name_1 space b:taxon_concept_rank space c:authorship {
|
34
56
|
def value
|
35
57
|
a.value + " " + b.apply(c)
|
@@ -62,7 +84,7 @@ grammar ScientificNameClean
|
|
62
84
|
end
|
63
85
|
|
64
86
|
def canonical
|
65
|
-
a.canonical + " " + b.canonical
|
87
|
+
a.canonical + " × " + b.canonical
|
66
88
|
end
|
67
89
|
|
68
90
|
def pos
|
@@ -196,7 +218,7 @@ grammar ScientificNameClean
|
|
196
218
|
end
|
197
219
|
|
198
220
|
def canonical
|
199
|
-
a.canonical + " " +
|
221
|
+
a.canonical + " " + c.canonical + " " + d.canonical
|
200
222
|
end
|
201
223
|
|
202
224
|
def pos
|
@@ -381,7 +403,7 @@ grammar ScientificNameClean
|
|
381
403
|
end
|
382
404
|
|
383
405
|
rule rank
|
384
|
-
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"subsp."/"subf."/"race"/"α"
|
406
|
+
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
|
385
407
|
/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
|
386
408
|
{
|
387
409
|
def value
|
@@ -405,7 +427,7 @@ grammar ScientificNameClean
|
|
405
427
|
end
|
406
428
|
|
407
429
|
rule rank_forma
|
408
|
-
("forma"/"form."/"fo."/"f.")
|
430
|
+
("forma"/"form."/"form"/"fo."/"f.")
|
409
431
|
{
|
410
432
|
def value
|
411
433
|
"f."
|
@@ -449,28 +471,28 @@ grammar ScientificNameClean
|
|
449
471
|
end
|
450
472
|
|
451
473
|
rule species_string
|
452
|
-
a:species_word &(space_hard author_prefix_word space_hard) {
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
}
|
473
|
-
/
|
474
|
+
# a:species_word &(space_hard author_prefix_word space_hard) {
|
475
|
+
# def value
|
476
|
+
# a.value
|
477
|
+
# end
|
478
|
+
#
|
479
|
+
# def canonical
|
480
|
+
# a.value
|
481
|
+
# end
|
482
|
+
#
|
483
|
+
# def hybrid
|
484
|
+
# a.hybrid rescue false
|
485
|
+
# end
|
486
|
+
#
|
487
|
+
# def pos
|
488
|
+
# {a.interval.begin => ['species', a.interval.end]}
|
489
|
+
# end
|
490
|
+
#
|
491
|
+
# def details
|
492
|
+
# {:species => {:string => a.value}}
|
493
|
+
# end
|
494
|
+
# }
|
495
|
+
# /
|
474
496
|
species_word {
|
475
497
|
def canonical
|
476
498
|
value
|
@@ -493,7 +515,7 @@ grammar ScientificNameClean
|
|
493
515
|
end
|
494
516
|
|
495
517
|
rule infragenus
|
496
|
-
left_paren space a:cap_latin_word space right_paren {
|
518
|
+
left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
|
497
519
|
def value
|
498
520
|
"(" + a.value + ")"
|
499
521
|
end
|
@@ -513,7 +535,7 @@ grammar ScientificNameClean
|
|
513
535
|
end
|
514
536
|
|
515
537
|
rule genus
|
516
|
-
a:
|
538
|
+
a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
|
517
539
|
def value
|
518
540
|
a.value
|
519
541
|
end
|
@@ -533,6 +555,50 @@ grammar ScientificNameClean
|
|
533
555
|
end
|
534
556
|
|
535
557
|
rule uninomial_name
|
558
|
+
a:uninomial_string space b:infragenus space c:simple_authorship {
|
559
|
+
def value
|
560
|
+
a.value + " " + b.value + " " + c.value
|
561
|
+
end
|
562
|
+
|
563
|
+
def canonical
|
564
|
+
a.canonical
|
565
|
+
end
|
566
|
+
|
567
|
+
def pos
|
568
|
+
a.pos.merge(b.pos).merge(c.pos)
|
569
|
+
end
|
570
|
+
|
571
|
+
def hybrid
|
572
|
+
false
|
573
|
+
end
|
574
|
+
|
575
|
+
def details
|
576
|
+
{:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
|
577
|
+
end
|
578
|
+
}
|
579
|
+
/
|
580
|
+
a:uninomial_string space b:infragenus {
|
581
|
+
def value
|
582
|
+
a.value + " " + b.value
|
583
|
+
end
|
584
|
+
|
585
|
+
def canonical
|
586
|
+
a.canonical
|
587
|
+
end
|
588
|
+
|
589
|
+
def pos
|
590
|
+
a.pos.merge(b.pos)
|
591
|
+
end
|
592
|
+
|
593
|
+
def hybrid
|
594
|
+
false
|
595
|
+
end
|
596
|
+
|
597
|
+
def details
|
598
|
+
{:uninomial => a.details[:uninomial].merge(b.details)}
|
599
|
+
end
|
600
|
+
}
|
601
|
+
/
|
536
602
|
a:uninomial_string space_hard b:authorship {
|
537
603
|
def value
|
538
604
|
a.value + " " + b.value
|
@@ -799,7 +865,7 @@ grammar ScientificNameClean
|
|
799
865
|
|
800
866
|
|
801
867
|
rule unknown_auth
|
802
|
-
("auct."/"hort."/"anon."/"ht.") {
|
868
|
+
("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
|
803
869
|
def value
|
804
870
|
text_value
|
805
871
|
end
|
@@ -837,7 +903,7 @@ grammar ScientificNameClean
|
|
837
903
|
end
|
838
904
|
|
839
905
|
rule author_name
|
840
|
-
space a:author_prefix_word space b:author_name
|
906
|
+
space a:author_prefix_word space b:author_name {
|
841
907
|
def value
|
842
908
|
a.value + " " + b.value
|
843
909
|
end
|
@@ -851,7 +917,7 @@ grammar ScientificNameClean
|
|
851
917
|
end
|
852
918
|
}
|
853
919
|
/
|
854
|
-
|
920
|
+
a:author_word space b:author_name {
|
855
921
|
def value
|
856
922
|
a.value + " " + b.value
|
857
923
|
end
|
@@ -883,7 +949,7 @@ grammar ScientificNameClean
|
|
883
949
|
end
|
884
950
|
}
|
885
951
|
/
|
886
|
-
("arg."/"et al.\{\?\}"/"et al.") {
|
952
|
+
("arg."/"et al.\{\?\}"/"et al."/"et al") {
|
887
953
|
def value
|
888
954
|
text_value.strip
|
889
955
|
end
|
@@ -930,7 +996,7 @@ grammar ScientificNameClean
|
|
930
996
|
end
|
931
997
|
|
932
998
|
rule author_prefix_word
|
933
|
-
space ("ab"/"bis"/"da"/"der"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
|
999
|
+
space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
|
934
1000
|
def value
|
935
1001
|
text_value
|
936
1002
|
end
|
@@ -976,6 +1042,14 @@ grammar ScientificNameClean
|
|
976
1042
|
}
|
977
1043
|
end
|
978
1044
|
|
1045
|
+
rule capped_dotted_char
|
1046
|
+
[A-Z] "." {
|
1047
|
+
def value
|
1048
|
+
text_value
|
1049
|
+
end
|
1050
|
+
}
|
1051
|
+
end
|
1052
|
+
|
979
1053
|
rule species_word_hybrid
|
980
1054
|
a:multiplication_sign space b:species_word {
|
981
1055
|
def value
|
@@ -1051,7 +1125,9 @@ grammar ScientificNameClean
|
|
1051
1125
|
rule species_word
|
1052
1126
|
a:[0-9]+ "-"? b:latin_word {
|
1053
1127
|
def value
|
1054
|
-
|
1128
|
+
num = {"1" => "uni", "2" => "du", "3" => "tri", "4" => "quadri", "5" => "quinque", "6" => "hexa", "7" => "septem", "8" => "octo", "9" => "novem", "10" => "decem", "11" => "undecim", "12" => "duodec", "13" => "tredec", "14" => "quattuordec", "15" => "quinquadec", "16" => "hexadec", "17" => "septendec", "18" => "octodec", "19" => "novemdec", "20" => "viginti", "21" => "unviginti", "22" => "duodeviginti", "23" => "triviginti", "24" => "quattuorviginti", "25" => "quinquatviginti", "26" => "hexaviginti", "27" => "septenviginti", "28" => "octoviginti", "29" => "novemviginti", "30" => "triginta", "38" => "trigintaocto", "100" => "centi"}
|
1129
|
+
a_value = num[a.text_value] ? num[a.text_value] : a.text_value + "-"
|
1130
|
+
a_value + b.value
|
1055
1131
|
end
|
1056
1132
|
}
|
1057
1133
|
/
|
@@ -1059,18 +1135,21 @@ grammar ScientificNameClean
|
|
1059
1135
|
end
|
1060
1136
|
|
1061
1137
|
rule latin_word
|
1062
|
-
a:
|
1138
|
+
a:valid_name_letters "-" b:latin_word {
|
1139
|
+
def value
|
1140
|
+
a.value + "-" + b.value
|
1141
|
+
end
|
1142
|
+
}
|
1143
|
+
/
|
1144
|
+
a:valid_name_letter b:valid_name_letters {
|
1063
1145
|
def value
|
1064
|
-
|
1065
|
-
l = 'ae' if l == 'æ'
|
1066
|
-
l = 'oe' if l == 'œ'
|
1067
|
-
l + b.value
|
1146
|
+
a.value + b.value
|
1068
1147
|
end
|
1069
1148
|
}
|
1070
1149
|
end
|
1071
1150
|
|
1072
1151
|
rule valid_name_letters
|
1073
|
-
[a-
|
1152
|
+
[a-zëæœ]+ {
|
1074
1153
|
def value
|
1075
1154
|
res = ''
|
1076
1155
|
text_value.split('').each do |l|
|
@@ -1086,6 +1165,18 @@ grammar ScientificNameClean
|
|
1086
1165
|
}
|
1087
1166
|
end
|
1088
1167
|
|
1168
|
+
rule valid_name_letter
|
1169
|
+
[a-zëæœ] {
|
1170
|
+
def value
|
1171
|
+
res = text_value
|
1172
|
+
res = 'ae' if res == 'æ'
|
1173
|
+
res = 'oe' if res == 'œ'
|
1174
|
+
res
|
1175
|
+
end
|
1176
|
+
}
|
1177
|
+
end
|
1178
|
+
|
1179
|
+
|
1089
1180
|
rule cap_digraph
|
1090
1181
|
"Æ" {
|
1091
1182
|
def value
|