biodiversity 0.5.16 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -1,3 +1,5 @@
1
+ lib/biodiversity/parser/*rb
2
+ *.gemspec
1
3
  *.sw?
2
4
  .DS_Store
3
5
  coverage
data/README.rdoc CHANGED
@@ -1,17 +1,17 @@
1
1
  = Biodiversity
2
2
 
3
- Parses species scientific name and breaks it into elements.
3
+ Parses taxonomic scientific name and breaks it into semantic elements.
4
4
 
5
5
  == Installation
6
6
 
7
- To install gem you need RubyGems >= 1.2.0
7
+ To install gem you need RubyGems >= 1.3.6
8
8
 
9
- $ gem sources -a http://gems.github.com (you only have to do this once)
10
- $ sudo gem install dimus-biodiversity
9
+ $ sudo gem install biodiversity #for ruby 1.8.x
10
+ $ sudo gem install biodiversity19 #for ruby 1.9.x
11
11
 
12
12
  == Example usage
13
13
 
14
- You can parse file with species names from command line. File should contain one scientific name per line
14
+ You can parse file with taxonomic names from command line. File should contain one scientific name per line
15
15
 
16
16
  nnparser file_with_names
17
17
 
data/Rakefile CHANGED
@@ -13,11 +13,13 @@ Spec::Rake::SpecTask.new do |t|
13
13
  t.pattern = 'spec/**/*spec.rb'
14
14
  end
15
15
 
16
+ ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
17
+
16
18
 
17
19
  begin
18
20
  require 'jeweler'
19
21
  Jeweler::Tasks.new do |gem|
20
- gem.name = "biodiversity"
22
+ gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
21
23
  gem.summary = 'Parser of scientific names'
22
24
  gem.description = 'Tools for biodiversity informatics'
23
25
  gem.email = "dmozzherin@gmail.com"
@@ -37,11 +39,14 @@ end
37
39
 
38
40
  task :tt do
39
41
  ['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
40
- system("tt #{dir}/lib/biodiversity/parser/#{f}.treetop")
41
- rf = "#{dir}/lib/biodiversity/parser/#{f}.rb"
42
+ file = "#{dir}/lib/biodiversity/parser/#{f}"
43
+ FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
44
+ system("tt #{file}.treetop")
45
+ rf = "#{file}.rb"
42
46
  rfn = open(rf + ".tmp", 'w')
43
47
  skip_head = false
44
48
  f = open(rf)
49
+ #getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
45
50
  f.each_with_index do |l, i|
46
51
  skip_head = l.match(/^# Autogenerated/) if i == 0
47
52
  if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.16
1
+ 0.6.0
data/bin/nnparse CHANGED
@@ -1,11 +1,15 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'rubygems'
3
- gem 'biodiversity' rescue nil
3
+ gem_name = RUBY_VERSION.split('.')[0..1].join('').to_i > 18 ? 'biodiversity19' : 'biodiversity'
4
+ gem gem_name rescue nil
4
5
 
5
6
  $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
6
7
  require 'biodiversity'
7
8
  require 'json'
8
9
 
10
+ def parser_error(name)
11
+ {'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
12
+ end
9
13
 
10
14
  if ARGV.empty?
11
15
  puts "Usage:\n\nnnparse file_with_scientific_names [output_file]\n\ndefault output_file is parsed.json\n\n"
@@ -31,12 +35,12 @@ IO.foreach(input) do |line|
31
35
  $KCODE = 'NONE'
32
36
  end
33
37
  p.parse(name)
34
- parsed_data = p.parsed.all_json rescue {'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
38
+ parsed_data = p.parsed.all_json rescue parser_error(name)
35
39
  if ruby_min_version < 19
36
40
  $KCODE = old_kcode
37
41
  end
38
42
  rescue
39
- parsed_data = {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}.to_json
43
+ parsed_data = parser_error(name)
40
44
  end
41
45
  o.write parsed_data + "\n"
42
46
  end
data/bin/parserver CHANGED
@@ -2,6 +2,7 @@
2
2
  require 'rubygems'
3
3
  require 'socket'
4
4
  require 'biodiversity' # Get sockets from stdlib
5
+ puts "Running parser service on port 4334"
5
6
  parser = ScientificNameParser.new
6
7
  server = TCPServer.open(4334) # Socket to listen on port 4334
7
8
  loop do # Servers run forever
@@ -6,6 +6,35 @@ require File.join(dir, *%w[parser scientific_name_canonical])
6
6
  require 'rubygems'
7
7
  require 'json'
8
8
 
9
+ module PreProcessor
10
+ NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
11
+ TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
12
+ TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
13
+ TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
14
+ NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
15
+ LAST_WORD_JUNK = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
16
+
17
+ def self.clean(a_string)
18
+ [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
19
+ a_string = a_string.gsub(i, '')
20
+ end
21
+ a_string = a_string.tr('ſ','s') #old 's'
22
+ a_string
23
+ end
24
+ end
25
+
26
+ # we can use these expressions when we are ready to parse virus names
27
+ # class VirusParser
28
+ # def initialize
29
+ # @order = /^\s*[A-Z][a-z]\+virales/i
30
+ # @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
31
+ # @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
32
+ # @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
33
+ # @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
34
+ # @parsed = nil
35
+ # end
36
+ # end
37
+
9
38
  class ScientificNameParser
10
39
 
11
40
  def initialize
@@ -15,21 +44,36 @@ class ScientificNameParser
15
44
  @canonical = ScientificNameCanonicalParser.new
16
45
  @parsed = nil
17
46
  end
18
-
47
+
48
+ def virus?(a_string)
49
+ !!(a_string.match(/\sICTV\s*$/) || a_string.match(/\s(virus|phage|viroid|satellite|prion)\b/i))
50
+ end
51
+
19
52
  def parsed
20
53
  @parsed
21
54
  end
22
55
 
23
56
  def parse(a_string)
24
57
  @verbatim = a_string
25
- @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || {:verbatim => a_string}
26
- def @parsed.all
58
+ a_string = PreProcessor::clean(a_string)
59
+
60
+ if virus?(a_string)
61
+ @parsed = { :verbatim => a_string, :virus => true }
62
+ else
63
+ @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
64
+ end
65
+
66
+ def @parsed.verbatim=(a_string)
67
+ @verbatim = a_string
68
+ end
69
+
70
+ def @parsed.all(verbatim = @verbatim)
27
71
  parsed = self.class != Hash
28
72
  res = {:parsed => parsed}
29
73
  if parsed
30
74
  hybrid = self.hybrid rescue false
31
75
  res.merge!({
32
- :verbatim => self.text_value,
76
+ :verbatim => @verbatim,
33
77
  :normalized => self.value,
34
78
  :canonical => self.canonical,
35
79
  :hybrid => hybrid,
@@ -51,7 +95,8 @@ class ScientificNameParser
51
95
  def @parsed.all_json
52
96
  self.all.to_json rescue ''
53
97
  end
54
-
98
+
99
+ @parsed.verbatim = @verbatim
55
100
  @parsed.all
56
101
  end
57
102
  end
@@ -30,6 +30,28 @@ grammar ScientificNameClean
30
30
  end
31
31
 
32
32
  rule scientific_name_5
33
+ a:multinomial_name space_hard hybrid_character space_hard b:species {
34
+ def value
35
+ a.value + " × " + b.value
36
+ end
37
+
38
+ def canonical
39
+ a.canonical + " × " + b.canonical
40
+ end
41
+
42
+ def pos
43
+ a.pos.merge(b.pos)
44
+ end
45
+
46
+ def hybrid
47
+ true
48
+ end
49
+
50
+ def details
51
+ [a.details, b.details.merge({:genus => a.details[:genus]})]
52
+ end
53
+ }
54
+ /
33
55
  a:scientific_name_1 space b:taxon_concept_rank space c:authorship {
34
56
  def value
35
57
  a.value + " " + b.apply(c)
@@ -62,7 +84,7 @@ grammar ScientificNameClean
62
84
  end
63
85
 
64
86
  def canonical
65
- a.canonical + " " + b.canonical
87
+ a.canonical + " × " + b.canonical
66
88
  end
67
89
 
68
90
  def pos
@@ -196,7 +218,7 @@ grammar ScientificNameClean
196
218
  end
197
219
 
198
220
  def canonical
199
- a.canonical + " " + b.canonical + " " + c.canonical + " " + d.canonical
221
+ a.canonical + " " + c.canonical + " " + d.canonical
200
222
  end
201
223
 
202
224
  def pos
@@ -381,7 +403,7 @@ grammar ScientificNameClean
381
403
  end
382
404
 
383
405
  rule rank
384
- ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"subsp."/"subf."/"race"/"α"
406
+ ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
385
407
  /"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
386
408
  {
387
409
  def value
@@ -405,7 +427,7 @@ grammar ScientificNameClean
405
427
  end
406
428
 
407
429
  rule rank_forma
408
- ("forma"/"form."/"fo."/"f.")
430
+ ("forma"/"form."/"form"/"fo."/"f.")
409
431
  {
410
432
  def value
411
433
  "f."
@@ -449,28 +471,28 @@ grammar ScientificNameClean
449
471
  end
450
472
 
451
473
  rule species_string
452
- a:species_word &(space_hard author_prefix_word space_hard) {
453
- def value
454
- a.value
455
- end
456
-
457
- def canonical
458
- a.value
459
- end
460
-
461
- def hybrid
462
- a.hybrid rescue false
463
- end
464
-
465
- def pos
466
- {a.interval.begin => ['species', a.interval.end]}
467
- end
468
-
469
- def details
470
- {:species => {:string => a.value}}
471
- end
472
- }
473
- /
474
+ # a:species_word &(space_hard author_prefix_word space_hard) {
475
+ # def value
476
+ # a.value
477
+ # end
478
+ #
479
+ # def canonical
480
+ # a.value
481
+ # end
482
+ #
483
+ # def hybrid
484
+ # a.hybrid rescue false
485
+ # end
486
+ #
487
+ # def pos
488
+ # {a.interval.begin => ['species', a.interval.end]}
489
+ # end
490
+ #
491
+ # def details
492
+ # {:species => {:string => a.value}}
493
+ # end
494
+ # }
495
+ # /
474
496
  species_word {
475
497
  def canonical
476
498
  value
@@ -493,7 +515,7 @@ grammar ScientificNameClean
493
515
  end
494
516
 
495
517
  rule infragenus
496
- left_paren space a:cap_latin_word space right_paren {
518
+ left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
497
519
  def value
498
520
  "(" + a.value + ")"
499
521
  end
@@ -513,7 +535,7 @@ grammar ScientificNameClean
513
535
  end
514
536
 
515
537
  rule genus
516
- a:(cap_latin_word_pair/cap_latin_word) !(space_hard author_prefix_word space_hard author_word) {
538
+ a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
517
539
  def value
518
540
  a.value
519
541
  end
@@ -533,6 +555,50 @@ grammar ScientificNameClean
533
555
  end
534
556
 
535
557
  rule uninomial_name
558
+ a:uninomial_string space b:infragenus space c:simple_authorship {
559
+ def value
560
+ a.value + " " + b.value + " " + c.value
561
+ end
562
+
563
+ def canonical
564
+ a.canonical
565
+ end
566
+
567
+ def pos
568
+ a.pos.merge(b.pos).merge(c.pos)
569
+ end
570
+
571
+ def hybrid
572
+ false
573
+ end
574
+
575
+ def details
576
+ {:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
577
+ end
578
+ }
579
+ /
580
+ a:uninomial_string space b:infragenus {
581
+ def value
582
+ a.value + " " + b.value
583
+ end
584
+
585
+ def canonical
586
+ a.canonical
587
+ end
588
+
589
+ def pos
590
+ a.pos.merge(b.pos)
591
+ end
592
+
593
+ def hybrid
594
+ false
595
+ end
596
+
597
+ def details
598
+ {:uninomial => a.details[:uninomial].merge(b.details)}
599
+ end
600
+ }
601
+ /
536
602
  a:uninomial_string space_hard b:authorship {
537
603
  def value
538
604
  a.value + " " + b.value
@@ -799,7 +865,7 @@ grammar ScientificNameClean
799
865
 
800
866
 
801
867
  rule unknown_auth
802
- ("auct."/"hort."/"anon."/"ht.") {
868
+ ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
803
869
  def value
804
870
  text_value
805
871
  end
@@ -837,7 +903,7 @@ grammar ScientificNameClean
837
903
  end
838
904
 
839
905
  rule author_name
840
- space a:author_prefix_word space b:author_name space {
906
+ space a:author_prefix_word space b:author_name {
841
907
  def value
842
908
  a.value + " " + b.value
843
909
  end
@@ -851,7 +917,7 @@ grammar ScientificNameClean
851
917
  end
852
918
  }
853
919
  /
854
- space a:author_word space b:author_name space {
920
+ a:author_word space b:author_name {
855
921
  def value
856
922
  a.value + " " + b.value
857
923
  end
@@ -883,7 +949,7 @@ grammar ScientificNameClean
883
949
  end
884
950
  }
885
951
  /
886
- ("arg."/"et al.\{\?\}"/"et al.") {
952
+ ("arg."/"et al.\{\?\}"/"et al."/"et al") {
887
953
  def value
888
954
  text_value.strip
889
955
  end
@@ -930,7 +996,7 @@ grammar ScientificNameClean
930
996
  end
931
997
 
932
998
  rule author_prefix_word
933
- space ("ab"/"bis"/"da"/"der"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
999
+ space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
934
1000
  def value
935
1001
  text_value
936
1002
  end
@@ -976,6 +1042,14 @@ grammar ScientificNameClean
976
1042
  }
977
1043
  end
978
1044
 
1045
+ rule capped_dotted_char
1046
+ [A-Z] "." {
1047
+ def value
1048
+ text_value
1049
+ end
1050
+ }
1051
+ end
1052
+
979
1053
  rule species_word_hybrid
980
1054
  a:multiplication_sign space b:species_word {
981
1055
  def value
@@ -1051,7 +1125,9 @@ grammar ScientificNameClean
1051
1125
  rule species_word
1052
1126
  a:[0-9]+ "-"? b:latin_word {
1053
1127
  def value
1054
- a.text_value + "-" + b.value
1128
+ num = {"1" => "uni", "2" => "du", "3" => "tri", "4" => "quadri", "5" => "quinque", "6" => "hexa", "7" => "septem", "8" => "octo", "9" => "novem", "10" => "decem", "11" => "undecim", "12" => "duodec", "13" => "tredec", "14" => "quattuordec", "15" => "quinquadec", "16" => "hexadec", "17" => "septendec", "18" => "octodec", "19" => "novemdec", "20" => "viginti", "21" => "unviginti", "22" => "duodeviginti", "23" => "triviginti", "24" => "quattuorviginti", "25" => "quinquatviginti", "26" => "hexaviginti", "27" => "septenviginti", "28" => "octoviginti", "29" => "novemviginti", "30" => "triginta", "38" => "trigintaocto", "100" => "centi"}
1129
+ a_value = num[a.text_value] ? num[a.text_value] : a.text_value + "-"
1130
+ a_value + b.value
1055
1131
  end
1056
1132
  }
1057
1133
  /
@@ -1059,18 +1135,21 @@ grammar ScientificNameClean
1059
1135
  end
1060
1136
 
1061
1137
  rule latin_word
1062
- a:[a-zëæœ] b:valid_name_letters {
1138
+ a:valid_name_letters "-" b:latin_word {
1139
+ def value
1140
+ a.value + "-" + b.value
1141
+ end
1142
+ }
1143
+ /
1144
+ a:valid_name_letter b:valid_name_letters {
1063
1145
  def value
1064
- l = a.text_value
1065
- l = 'ae' if l == 'æ'
1066
- l = 'oe' if l == 'œ'
1067
- l + b.value
1146
+ a.value + b.value
1068
1147
  end
1069
1148
  }
1070
1149
  end
1071
1150
 
1072
1151
  rule valid_name_letters
1073
- [a-z\-ëæœ]+ {
1152
+ [a-zëæœ]+ {
1074
1153
  def value
1075
1154
  res = ''
1076
1155
  text_value.split('').each do |l|
@@ -1086,6 +1165,18 @@ grammar ScientificNameClean
1086
1165
  }
1087
1166
  end
1088
1167
 
1168
+ rule valid_name_letter
1169
+ [a-zëæœ] {
1170
+ def value
1171
+ res = text_value
1172
+ res = 'ae' if res == 'æ'
1173
+ res = 'oe' if res == 'œ'
1174
+ res
1175
+ end
1176
+ }
1177
+ end
1178
+
1179
+
1089
1180
  rule cap_digraph
1090
1181
  "Æ" {
1091
1182
  def value