biodiversity 0.5.16 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -1,3 +1,5 @@
1
+ lib/biodiversity/parser/*rb
2
+ *.gemspec
1
3
  *.sw?
2
4
  .DS_Store
3
5
  coverage
data/README.rdoc CHANGED
@@ -1,17 +1,17 @@
1
1
  = Biodiversity
2
2
 
3
- Parses species scientific name and breaks it into elements.
3
+ Parses a taxonomic scientific name and breaks it into its semantic elements.
4
4
 
5
5
  == Installation
6
6
 
7
- To install gem you need RubyGems >= 1.2.0
7
+ To install the gem you need RubyGems >= 1.3.6
8
8
 
9
- $ gem sources -a http://gems.github.com (you only have to do this once)
10
- $ sudo gem install dimus-biodiversity
9
+ $ sudo gem install biodiversity #for ruby 1.8.x
10
+ $ sudo gem install biodiversity19 #for ruby 1.9.x
11
11
 
12
12
  == Example usage
13
13
 
14
- You can parse file with species names from command line. File should contain one scientific name per line
14
+ You can parse a file of taxonomic names from the command line. The file should contain one scientific name per line.
15
15
 
16
16
  nnparse file_with_names
17
17
 
data/Rakefile CHANGED
@@ -13,11 +13,13 @@ Spec::Rake::SpecTask.new do |t|
13
13
  t.pattern = 'spec/**/*spec.rb'
14
14
  end
15
15
 
16
+ ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
17
+
16
18
 
17
19
  begin
18
20
  require 'jeweler'
19
21
  Jeweler::Tasks.new do |gem|
20
- gem.name = "biodiversity"
22
+ gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
21
23
  gem.summary = 'Parser of scientific names'
22
24
  gem.description = 'Tools for biodiversity informatics'
23
25
  gem.email = "dmozzherin@gmail.com"
@@ -37,11 +39,14 @@ end
37
39
 
38
40
  task :tt do
39
41
  ['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
40
- system("tt #{dir}/lib/biodiversity/parser/#{f}.treetop")
41
- rf = "#{dir}/lib/biodiversity/parser/#{f}.rb"
42
+ file = "#{dir}/lib/biodiversity/parser/#{f}"
43
+ FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
44
+ system("tt #{file}.treetop")
45
+ rf = "#{file}.rb"
42
46
  rfn = open(rf + ".tmp", 'w')
43
47
  skip_head = false
44
48
  f = open(rf)
49
+ #getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
45
50
  f.each_with_index do |l, i|
46
51
  skip_head = l.match(/^# Autogenerated/) if i == 0
47
52
  if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.16
1
+ 0.6.0
data/bin/nnparse CHANGED
@@ -1,11 +1,15 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'rubygems'
3
- gem 'biodiversity' rescue nil
3
+ gem_name = RUBY_VERSION.split('.')[0..1].join('').to_i > 18 ? 'biodiversity19' : 'biodiversity'
4
+ gem gem_name rescue nil
4
5
 
5
6
  $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
6
7
  require 'biodiversity'
7
8
  require 'json'
8
9
 
10
+ def parser_error(name)
11
+ {'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
12
+ end
9
13
 
10
14
  if ARGV.empty?
11
15
  puts "Usage:\n\nnnparse file_with_scientific_names [output_file]\n\ndefault output_file is parsed.json\n\n"
@@ -31,12 +35,12 @@ IO.foreach(input) do |line|
31
35
  $KCODE = 'NONE'
32
36
  end
33
37
  p.parse(name)
34
- parsed_data = p.parsed.all_json rescue {'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
38
+ parsed_data = p.parsed.all_json rescue parser_error(name)
35
39
  if ruby_min_version < 19
36
40
  $KCODE = old_kcode
37
41
  end
38
42
  rescue
39
- parsed_data = {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}.to_json
43
+ parsed_data = parser_error(name)
40
44
  end
41
45
  o.write parsed_data + "\n"
42
46
  end
data/bin/parserver CHANGED
@@ -2,6 +2,7 @@
2
2
  require 'rubygems'
3
3
  require 'socket'
4
4
  require 'biodiversity' # Get sockets from stdlib
5
+ puts "Running parser service on port 4334"
5
6
  parser = ScientificNameParser.new
6
7
  server = TCPServer.open(4334) # Socket to listen on port 4334
7
8
  loop do # Servers run forever
@@ -6,6 +6,35 @@ require File.join(dir, *%w[parser scientific_name_canonical])
6
6
  require 'rubygems'
7
7
  require 'json'
8
8
 
9
+ module PreProcessor
10
+ NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
11
+ TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
12
+ TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
13
+ TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
14
+ NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
15
+ LAST_WORD_JUNK = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
16
+
17
+ def self.clean(a_string)
18
+ [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
19
+ a_string = a_string.gsub(i, '')
20
+ end
21
+ a_string = a_string.tr('ſ','s') #old 's'
22
+ a_string
23
+ end
24
+ end
25
+
26
+ # we can use these expressions when we are ready to parse virus names
27
+ # class VirusParser
28
+ # def initialize
29
+ # @order = /^\s*[A-Z][a-z]\+virales/i
30
+ # @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
31
+ # @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
32
+ # @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
33
+ # @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
34
+ # @parsed = nil
35
+ # end
36
+ # end
37
+
9
38
  class ScientificNameParser
10
39
 
11
40
  def initialize
@@ -15,21 +44,36 @@ class ScientificNameParser
15
44
  @canonical = ScientificNameCanonicalParser.new
16
45
  @parsed = nil
17
46
  end
18
-
47
+
48
+ def virus?(a_string)
49
+ !!(a_string.match(/\sICTV\s*$/) || a_string.match(/\s(virus|phage|viroid|satellite|prion)\b/i))
50
+ end
51
+
19
52
  def parsed
20
53
  @parsed
21
54
  end
22
55
 
23
56
  def parse(a_string)
24
57
  @verbatim = a_string
25
- @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || {:verbatim => a_string}
26
- def @parsed.all
58
+ a_string = PreProcessor::clean(a_string)
59
+
60
+ if virus?(a_string)
61
+ @parsed = { :verbatim => a_string, :virus => true }
62
+ else
63
+ @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
64
+ end
65
+
66
+ def @parsed.verbatim=(a_string)
67
+ @verbatim = a_string
68
+ end
69
+
70
+ def @parsed.all(verbatim = @verbatim)
27
71
  parsed = self.class != Hash
28
72
  res = {:parsed => parsed}
29
73
  if parsed
30
74
  hybrid = self.hybrid rescue false
31
75
  res.merge!({
32
- :verbatim => self.text_value,
76
+ :verbatim => @verbatim,
33
77
  :normalized => self.value,
34
78
  :canonical => self.canonical,
35
79
  :hybrid => hybrid,
@@ -51,7 +95,8 @@ class ScientificNameParser
51
95
  def @parsed.all_json
52
96
  self.all.to_json rescue ''
53
97
  end
54
-
98
+
99
+ @parsed.verbatim = @verbatim
55
100
  @parsed.all
56
101
  end
57
102
  end
@@ -30,6 +30,28 @@ grammar ScientificNameClean
30
30
  end
31
31
 
32
32
  rule scientific_name_5
33
+ a:multinomial_name space_hard hybrid_character space_hard b:species {
34
+ def value
35
+ a.value + " × " + b.value
36
+ end
37
+
38
+ def canonical
39
+ a.canonical + " × " + b.canonical
40
+ end
41
+
42
+ def pos
43
+ a.pos.merge(b.pos)
44
+ end
45
+
46
+ def hybrid
47
+ true
48
+ end
49
+
50
+ def details
51
+ [a.details, b.details.merge({:genus => a.details[:genus]})]
52
+ end
53
+ }
54
+ /
33
55
  a:scientific_name_1 space b:taxon_concept_rank space c:authorship {
34
56
  def value
35
57
  a.value + " " + b.apply(c)
@@ -62,7 +84,7 @@ grammar ScientificNameClean
62
84
  end
63
85
 
64
86
  def canonical
65
- a.canonical + " " + b.canonical
87
+ a.canonical + " × " + b.canonical
66
88
  end
67
89
 
68
90
  def pos
@@ -196,7 +218,7 @@ grammar ScientificNameClean
196
218
  end
197
219
 
198
220
  def canonical
199
- a.canonical + " " + b.canonical + " " + c.canonical + " " + d.canonical
221
+ a.canonical + " " + c.canonical + " " + d.canonical
200
222
  end
201
223
 
202
224
  def pos
@@ -381,7 +403,7 @@ grammar ScientificNameClean
381
403
  end
382
404
 
383
405
  rule rank
384
- ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"subsp."/"subf."/"race"/"α"
406
+ ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
385
407
  /"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
386
408
  {
387
409
  def value
@@ -405,7 +427,7 @@ grammar ScientificNameClean
405
427
  end
406
428
 
407
429
  rule rank_forma
408
- ("forma"/"form."/"fo."/"f.")
430
+ ("forma"/"form."/"form"/"fo."/"f.")
409
431
  {
410
432
  def value
411
433
  "f."
@@ -449,28 +471,28 @@ grammar ScientificNameClean
449
471
  end
450
472
 
451
473
  rule species_string
452
- a:species_word &(space_hard author_prefix_word space_hard) {
453
- def value
454
- a.value
455
- end
456
-
457
- def canonical
458
- a.value
459
- end
460
-
461
- def hybrid
462
- a.hybrid rescue false
463
- end
464
-
465
- def pos
466
- {a.interval.begin => ['species', a.interval.end]}
467
- end
468
-
469
- def details
470
- {:species => {:string => a.value}}
471
- end
472
- }
473
- /
474
+ # a:species_word &(space_hard author_prefix_word space_hard) {
475
+ # def value
476
+ # a.value
477
+ # end
478
+ #
479
+ # def canonical
480
+ # a.value
481
+ # end
482
+ #
483
+ # def hybrid
484
+ # a.hybrid rescue false
485
+ # end
486
+ #
487
+ # def pos
488
+ # {a.interval.begin => ['species', a.interval.end]}
489
+ # end
490
+ #
491
+ # def details
492
+ # {:species => {:string => a.value}}
493
+ # end
494
+ # }
495
+ # /
474
496
  species_word {
475
497
  def canonical
476
498
  value
@@ -493,7 +515,7 @@ grammar ScientificNameClean
493
515
  end
494
516
 
495
517
  rule infragenus
496
- left_paren space a:cap_latin_word space right_paren {
518
+ left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
497
519
  def value
498
520
  "(" + a.value + ")"
499
521
  end
@@ -513,7 +535,7 @@ grammar ScientificNameClean
513
535
  end
514
536
 
515
537
  rule genus
516
- a:(cap_latin_word_pair/cap_latin_word) !(space_hard author_prefix_word space_hard author_word) {
538
+ a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
517
539
  def value
518
540
  a.value
519
541
  end
@@ -533,6 +555,50 @@ grammar ScientificNameClean
533
555
  end
534
556
 
535
557
  rule uninomial_name
558
+ a:uninomial_string space b:infragenus space c:simple_authorship {
559
+ def value
560
+ a.value + " " + b.value + " " + c.value
561
+ end
562
+
563
+ def canonical
564
+ a.canonical
565
+ end
566
+
567
+ def pos
568
+ a.pos.merge(b.pos).merge(c.pos)
569
+ end
570
+
571
+ def hybrid
572
+ false
573
+ end
574
+
575
+ def details
576
+ {:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
577
+ end
578
+ }
579
+ /
580
+ a:uninomial_string space b:infragenus {
581
+ def value
582
+ a.value + " " + b.value
583
+ end
584
+
585
+ def canonical
586
+ a.canonical
587
+ end
588
+
589
+ def pos
590
+ a.pos.merge(b.pos)
591
+ end
592
+
593
+ def hybrid
594
+ false
595
+ end
596
+
597
+ def details
598
+ {:uninomial => a.details[:uninomial].merge(b.details)}
599
+ end
600
+ }
601
+ /
536
602
  a:uninomial_string space_hard b:authorship {
537
603
  def value
538
604
  a.value + " " + b.value
@@ -799,7 +865,7 @@ grammar ScientificNameClean
799
865
 
800
866
 
801
867
  rule unknown_auth
802
- ("auct."/"hort."/"anon."/"ht.") {
868
+ ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
803
869
  def value
804
870
  text_value
805
871
  end
@@ -837,7 +903,7 @@ grammar ScientificNameClean
837
903
  end
838
904
 
839
905
  rule author_name
840
- space a:author_prefix_word space b:author_name space {
906
+ space a:author_prefix_word space b:author_name {
841
907
  def value
842
908
  a.value + " " + b.value
843
909
  end
@@ -851,7 +917,7 @@ grammar ScientificNameClean
851
917
  end
852
918
  }
853
919
  /
854
- space a:author_word space b:author_name space {
920
+ a:author_word space b:author_name {
855
921
  def value
856
922
  a.value + " " + b.value
857
923
  end
@@ -883,7 +949,7 @@ grammar ScientificNameClean
883
949
  end
884
950
  }
885
951
  /
886
- ("arg."/"et al.\{\?\}"/"et al.") {
952
+ ("arg."/"et al.\{\?\}"/"et al."/"et al") {
887
953
  def value
888
954
  text_value.strip
889
955
  end
@@ -930,7 +996,7 @@ grammar ScientificNameClean
930
996
  end
931
997
 
932
998
  rule author_prefix_word
933
- space ("ab"/"bis"/"da"/"der"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
999
+ space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
934
1000
  def value
935
1001
  text_value
936
1002
  end
@@ -976,6 +1042,14 @@ grammar ScientificNameClean
976
1042
  }
977
1043
  end
978
1044
 
1045
+ rule capped_dotted_char
1046
+ [A-Z] "." {
1047
+ def value
1048
+ text_value
1049
+ end
1050
+ }
1051
+ end
1052
+
979
1053
  rule species_word_hybrid
980
1054
  a:multiplication_sign space b:species_word {
981
1055
  def value
@@ -1051,7 +1125,9 @@ grammar ScientificNameClean
1051
1125
  rule species_word
1052
1126
  a:[0-9]+ "-"? b:latin_word {
1053
1127
  def value
1054
- a.text_value + "-" + b.value
1128
+ num = {"1" => "uni", "2" => "du", "3" => "tri", "4" => "quadri", "5" => "quinque", "6" => "hexa", "7" => "septem", "8" => "octo", "9" => "novem", "10" => "decem", "11" => "undecim", "12" => "duodec", "13" => "tredec", "14" => "quattuordec", "15" => "quinquadec", "16" => "hexadec", "17" => "septendec", "18" => "octodec", "19" => "novemdec", "20" => "viginti", "21" => "unviginti", "22" => "duodeviginti", "23" => "triviginti", "24" => "quattuorviginti", "25" => "quinquatviginti", "26" => "hexaviginti", "27" => "septenviginti", "28" => "octoviginti", "29" => "novemviginti", "30" => "triginta", "38" => "trigintaocto", "100" => "centi"}
1129
+ a_value = num[a.text_value] ? num[a.text_value] : a.text_value + "-"
1130
+ a_value + b.value
1055
1131
  end
1056
1132
  }
1057
1133
  /
@@ -1059,18 +1135,21 @@ grammar ScientificNameClean
1059
1135
  end
1060
1136
 
1061
1137
  rule latin_word
1062
- a:[a-zëæœ] b:valid_name_letters {
1138
+ a:valid_name_letters "-" b:latin_word {
1139
+ def value
1140
+ a.value + "-" + b.value
1141
+ end
1142
+ }
1143
+ /
1144
+ a:valid_name_letter b:valid_name_letters {
1063
1145
  def value
1064
- l = a.text_value
1065
- l = 'ae' if l == 'æ'
1066
- l = 'oe' if l == 'œ'
1067
- l + b.value
1146
+ a.value + b.value
1068
1147
  end
1069
1148
  }
1070
1149
  end
1071
1150
 
1072
1151
  rule valid_name_letters
1073
- [a-z\-ëæœ]+ {
1152
+ [a-zëæœ]+ {
1074
1153
  def value
1075
1154
  res = ''
1076
1155
  text_value.split('').each do |l|
@@ -1086,6 +1165,18 @@ grammar ScientificNameClean
1086
1165
  }
1087
1166
  end
1088
1167
 
1168
+ rule valid_name_letter
1169
+ [a-zëæœ] {
1170
+ def value
1171
+ res = text_value
1172
+ res = 'ae' if res == 'æ'
1173
+ res = 'oe' if res == 'œ'
1174
+ res
1175
+ end
1176
+ }
1177
+ end
1178
+
1179
+
1089
1180
  rule cap_digraph
1090
1181
  "Æ" {
1091
1182
  def value