RubyGems - biodiversity - Versions diffs - 0.5.16 → 0.6.0 - Mend

biodiversity 0.5.16 → 0.6.0

Files changed (15) hide show

data/.gitignore +2 -0
data/README.rdoc +5 -5
data/Rakefile +8 -3
data/VERSION +1 -1
data/bin/nnparse +7 -3
data/bin/parserver +1 -0
data/lib/biodiversity/parser.rb +50 -5
data/lib/biodiversity/parser/scientific_name_clean.treetop +131 -40
data/spec/parser/scientific_name_clean.spec.rb +53 -27
data/spec/parser/test_data.txt +73 -21
metadata +4 -8
data/biodiversity.gemspec +0 -88
data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -481
data/lib/biodiversity/parser/scientific_name_clean.rb +0 -6118
data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1309

data/.gitignore CHANGED Viewed

@@ -1,3 +1,5 @@
+lib/biodiversity/parser/*rb
+*.gemspec
 *.sw?
 .DS_Store
 coverage

data/README.rdoc CHANGED Viewed

@@ -1,17 +1,17 @@
 = Biodiversity
-Parses species scientific name and breaks it into elements.
+Parses a taxonomic scientific name and breaks it into semantic elements.
 == Installation
-To install gem you need RubyGems >= 1.2.0
+To install the gem you need RubyGems >= 1.3.6
-  $ gem sources -a http://gems.github.com (you only have to do this once)
-  $ sudo gem install dimus-biodiversity
+  $ sudo gem install biodiversity #for ruby 1.8.x
+  $ sudo gem install biodiversity19 #for ruby 1.9.x
 == Example usage
-You can parse file with species names from command line. File should contain one scientific name per line
+You can parse a file with taxonomic names from the command line. The file should contain one scientific name per line.
   nnparse file_with_names

data/Rakefile CHANGED Viewed

@@ -13,11 +13,13 @@ Spec::Rake::SpecTask.new do |t|
   t.pattern = 'spec/**/*spec.rb'
 end
+ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
 begin
   require 'jeweler'
   Jeweler::Tasks.new do |gem|
-    gem.name = "biodiversity"
+    gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
     gem.summary = 'Parser of scientific names'
     gem.description = 'Tools for biodiversity informatics'
     gem.email = "dmozzherin@gmail.com"
@@ -37,11 +39,14 @@ end
 task :tt do
   ['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
-    system("tt #{dir}/lib/biodiversity/parser/#{f}.treetop")
-    rf = "#{dir}/lib/biodiversity/parser/#{f}.rb"
+    file = "#{dir}/lib/biodiversity/parser/#{f}"
+    FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
+    system("tt #{file}.treetop")
+    rf = "#{file}.rb"
     rfn = open(rf + ".tmp", 'w')
     skip_head = false
     f = open(rf)
+    #getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
     f.each_with_index do |l, i|
       skip_head = l.match(/^# Autogenerated/) if i == 0
       if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.5.16
1	+ 0.6.0

data/bin/nnparse CHANGED Viewed

@@ -1,11 +1,15 @@
 #!/usr/bin/env ruby
 require 'rubygems'
-gem 'biodiversity' rescue nil
+gem_name = RUBY_VERSION.split('.')[0..1].join('').to_i > 18 ? 'biodiversity19' : 'biodiversity'
+gem gem_name rescue nil
 $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
 require 'biodiversity'
 require 'json'
+# Builds the JSON string written to the output when a name cannot be parsed.
+# +name+ is echoed back under the 'verbatim' key.
+def parser_error(name)
+  failure = {'parsed' => false, 'verbatim' => name,  'error' => 'Parser error'}
+  {'scientificName' => failure}.to_json
+end
 if ARGV.empty?
   puts "Usage:\n\nnnparse file_with_scientific_names [output_file]\n\ndefault output_file is parsed.json\n\n"
@@ -31,12 +35,12 @@ IO.foreach(input) do |line|
       $KCODE = 'NONE'
     end
     p.parse(name)
-    parsed_data = p.parsed.all_json rescue {'scientificName' => {'parsed' => false, 'verbatim' => name,  'error' => 'Parser error'}}.to_json
+    parsed_data = p.parsed.all_json rescue parser_error(name)
     if ruby_min_version < 19
       $KCODE = old_kcode
     end
   rescue
-    parsed_data = {'parsed' => false, 'verbatim' => name,  'error' => 'Parser error'}.to_json
+    parsed_data = parser_error(name)
   end
   o.write parsed_data + "\n"
 end

data/bin/parserver CHANGED Viewed

@@ -2,6 +2,7 @@
 require 'rubygems'
 require 'socket'
 require 'biodiversity'          # Get sockets from stdlib
+puts "Running parser service on port 4334"
 parser = ScientificNameParser.new
 server = TCPServer.open(4334)  # Socket to listen on port 4334
 loop do                         # Servers run forever

data/lib/biodiversity/parser.rb CHANGED Viewed

@@ -6,6 +6,35 @@ require File.join(dir, *%w[parser scientific_name_canonical])
 require 'rubygems'
 require 'json'
+# Strips trailing annotations (notes, taxon-concept references, nomenclatural
+# remarks, dangling last words) from a name string before it is parsed.
+module PreProcessor
+  # " species group", " species complex", " group" or " author" and everything after it.
+  NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
+  # " sensu" / " auct" (with or without a dot) and everything after it.
+  TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
+  # No /i flag here, so this pattern is case-sensitive.
+  # NOTE(review): presumably deliberate, so capitalised author initials are not
+  # taken for "s.s."/"s.l." -- confirm. Also, the \b that follows the dotted
+  # alternatives only matches when a word character comes right after the dot.
+  TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
+  # Trailing "pro parte", "p.p." or "p. p."; the dots are escaped so that
+  # ordinary epithets such as "pipa" are no longer removed.
+  TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
+  # "nomen", "nom." or "comb." (optionally after an opening paren) and everything after it.
+  NOMEN_CONCEPTS  = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
+  # A dangling final word; "hort." has its dot escaped so that epithets such
+  # as "horta" are left alone.
+  LAST_WORD_JUNK  = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort\.|hort)\s*$/i
+  # Returns a copy of +a_string+ with every pattern above removed, applied in
+  # the listed order, and the long s replaced by a plain "s". The argument
+  # itself is not modified.
+  def self.clean(a_string)
+    patterns = [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK]
+    stripped = patterns.inject(a_string) { |res, pattern| res.gsub(pattern, '') }
+    stripped.tr('ſ','s') #old 's'
+  end
+end
+# we can use these expressions when we are ready to parse virus names
+# class VirusParser
+#   def initialize
+#     @order     = /^\s*[A-Z][a-z]\+virales/i
+#     @family    = /^\s*[A-Z][a-z]\+viridae|viroidae/i
+#     @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
+#     @genus     = /^\s*[A-Z][a-z]\+virus|viroid/i
+#     @species   = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
+#     @parsed    = nil
+#   end
+# end
 class ScientificNameParser
   def initialize
@@ -15,21 +44,36 @@ class ScientificNameParser
     @canonical = ScientificNameCanonicalParser.new
     @parsed = nil
   end
+  # True when +a_string+ has the token "ICTV" at the end of a line (preceded
+  # by whitespace), or contains one of the words virus, phage, viroid,
+  # satellite or prion (any case) directly after whitespace; false otherwise.
+  def virus?(a_string)
+    ictv_suffix = /\sICTV\s*$/
+    virus_word  = /\s(virus|phage|viroid|satellite|prion)\b/i
+    [ictv_suffix, virus_word].any? { |pattern| a_string.match(pattern) }
+  end
   def parsed
     @parsed
   end
   def parse(a_string)
     @verbatim = a_string
-    @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || {:verbatim => a_string}
-    def @parsed.all
+    a_string = PreProcessor::clean(a_string)
+    if virus?(a_string)
+      @parsed = { :verbatim => a_string, :virus => true }
+    else
+      @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
+    end
+    def @parsed.verbatim=(a_string)
+      @verbatim = a_string
+    end
+    def @parsed.all(verbatim = @verbatim)
       parsed = self.class != Hash
       res = {:parsed => parsed}
       if parsed
         hybrid = self.hybrid rescue false
         res.merge!({
-          :verbatim => self.text_value,
+          :verbatim => @verbatim,
           :normalized => self.value,
           :canonical => self.canonical,
           :hybrid => hybrid,
@@ -51,7 +95,8 @@ class ScientificNameParser
     def @parsed.all_json
       self.all.to_json rescue ''
     end
+    @parsed.verbatim = @verbatim
     @parsed.all
   end
 end

data/lib/biodiversity/parser/scientific_name_clean.treetop CHANGED Viewed

@@ -30,6 +30,28 @@ grammar ScientificNameClean
   end
   rule scientific_name_5
+    a:multinomial_name space_hard hybrid_character space_hard b:species {
+      def value
+        a.value + " × " + b.value
+      end
+      def canonical
+        a.canonical + " × " + b.canonical
+      end
+      def pos
+        a.pos.merge(b.pos)
+      end
+      def hybrid
+        true
+      end
+      def details
+        [a.details, b.details.merge({:genus => a.details[:genus]})]
+      end
+    }
+    /
     a:scientific_name_1 space b:taxon_concept_rank space c:authorship {
       def value
         a.value + " " + b.apply(c)
@@ -62,7 +84,7 @@ grammar ScientificNameClean
       end
       def canonical
-        a.canonical + " " + b.canonical
+        a.canonical + " × " + b.canonical
       end
       def pos
@@ -196,7 +218,7 @@ grammar ScientificNameClean
       end
       def canonical
-        a.canonical + " " + b.canonical + " " + c.canonical + " " + d.canonical
+        a.canonical + " " + c.canonical + " " + d.canonical
       end
       def pos
@@ -381,7 +403,7 @@ grammar ScientificNameClean
   end
   rule rank
-    ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"subsp."/"subf."/"race"/"α"
+    ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
     /"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
     {
       def value
@@ -405,7 +427,7 @@ grammar ScientificNameClean
   end
   rule rank_forma
-    ("forma"/"form."/"fo."/"f.")
+    ("forma"/"form."/"form"/"fo."/"f.")
     {
       def value
         "f."
@@ -449,28 +471,28 @@ grammar ScientificNameClean
   end
   rule species_string
-    a:species_word &(space_hard author_prefix_word space_hard) {
-      def value
-        a.value
-      end
-      def canonical
-        a.value
-      end
-      def hybrid
-        a.hybrid rescue false
-      end
-      def pos
-        {a.interval.begin => ['species', a.interval.end]}
-      end
-      def details
-        {:species => {:string => a.value}}
-      end
-    }
-    /
+    # a:species_word &(space_hard author_prefix_word space_hard) {
+    #   def value
+    #     a.value
+    #   end
+    #
+    #   def canonical
+    #     a.value
+    #   end
+    #
+    #   def hybrid
+    #     a.hybrid rescue false
+    #   end
+    #
+    #   def pos
+    #     {a.interval.begin => ['species', a.interval.end]}
+    #   end
+    #
+    #   def details
+    #     {:species => {:string => a.value}}
+    #   end
+    # }
+    # /
     species_word {
       def canonical
         value
@@ -493,7 +515,7 @@ grammar ScientificNameClean
   end
   rule infragenus
-    left_paren space a:cap_latin_word space right_paren {
+    left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
       def value
         "(" + a.value + ")"
       end
@@ -513,7 +535,7 @@ grammar ScientificNameClean
   end
   rule genus
-    a:(cap_latin_word_pair/cap_latin_word) !(space_hard author_prefix_word space_hard author_word) {
+    a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
       def value
         a.value
       end
@@ -533,6 +555,50 @@ grammar ScientificNameClean
   end
   rule uninomial_name
+    a:uninomial_string space b:infragenus space c:simple_authorship {
+      def value
+        a.value + " " + b.value + " " + c.value
+      end
+      def canonical
+        a.canonical
+      end
+      def pos
+        a.pos.merge(b.pos).merge(c.pos)
+      end
+      def hybrid
+        false
+      end
+      def details
+        {:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
+      end
+    }
+    /
+    a:uninomial_string space b:infragenus {
+      def value
+        a.value + " " + b.value
+      end
+      def canonical
+        a.canonical
+      end
+      def pos
+        a.pos.merge(b.pos)
+      end
+      def hybrid
+        false
+      end
+      def details
+        {:uninomial => a.details[:uninomial].merge(b.details)}
+      end
+    }
+    /
     a:uninomial_string space_hard b:authorship {
       def value
         a.value + " " + b.value
@@ -799,7 +865,7 @@ grammar ScientificNameClean
   rule unknown_auth
-    ("auct."/"hort."/"anon."/"ht.") {
+    ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
       def value
         text_value
       end
@@ -837,7 +903,7 @@ grammar ScientificNameClean
   end
   rule author_name
-    space a:author_prefix_word space b:author_name space {
+    space a:author_prefix_word space b:author_name {
       def value
         a.value + " " + b.value
       end
@@ -851,7 +917,7 @@ grammar ScientificNameClean
       end
     }
     /
-    space a:author_word space b:author_name space {
+    a:author_word space b:author_name {
       def value
         a.value + " " + b.value
       end
@@ -883,7 +949,7 @@ grammar ScientificNameClean
       end
     }
     /
-    ("arg."/"et al.\{\?\}"/"et al.") {
+    ("arg."/"et al.\{\?\}"/"et al."/"et al") {
       def value
         text_value.strip
       end
@@ -930,7 +996,7 @@ grammar ScientificNameClean
   end
   rule author_prefix_word
-    space ("ab"/"bis"/"da"/"der"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
+    space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
       def value
         text_value
       end
@@ -976,6 +1042,14 @@ grammar ScientificNameClean
     }
   end
+  rule capped_dotted_char
+    # one ASCII capital letter immediately followed by a period, e.g. "A."
+    [A-Z] "." {
+      def value
+        # the matched text is returned unchanged
+        text_value
+      end
+    }
+  end
   rule species_word_hybrid
     a:multiplication_sign space b:species_word {
       def value
@@ -1051,7 +1125,9 @@ grammar ScientificNameClean
   rule species_word
     a:[0-9]+ "-"? b:latin_word {
       def value
-        a.text_value + "-" + b.value
+        num = {"1" => "uni", "2" => "du", "3" => "tri", "4" => "quadri", "5" => "quinque", "6" => "hexa", "7" => "septem", "8" => "octo", "9" => "novem", "10" => "decem", "11" => "undecim", "12" => "duodec", "13" => "tredec", "14" => "quattuordec", "15" => "quinquadec", "16" => "hexadec", "17" => "septendec", "18" => "octodec", "19" => "novemdec", "20" => "viginti", "21" => "unviginti", "22" => "duodeviginti", "23" => "triviginti", "24" => "quattuorviginti", "25" => "quinquatviginti", "26" => "hexaviginti", "27" => "septenviginti", "28" => "octoviginti", "29" => "novemviginti", "30" => "triginta", "38" => "trigintaocto", "100" => "centi"}
+        a_value = num[a.text_value] ? num[a.text_value] : a.text_value + "-"
+        a_value + b.value
       end
     }
     /
@@ -1059,18 +1135,21 @@ grammar ScientificNameClean
   end
   rule latin_word
-    a:[a-zëæœ] b:valid_name_letters {
+    a:valid_name_letters "-" b:latin_word {
+      def value
+        a.value + "-" + b.value
+      end
+    }
+    /
+    a:valid_name_letter b:valid_name_letters {
       def value
-        l = a.text_value
-        l = 'ae' if l == 'æ'
-        l = 'oe' if l == 'œ'
-        l + b.value
+        a.value + b.value
       end
      }
   end
   rule valid_name_letters
-    [a-z\-ëæœ]+ {
+    [a-zëæœ]+ {
       def value
         res = ''
         text_value.split('').each do |l|
@@ -1086,6 +1165,18 @@ grammar ScientificNameClean
     }
   end
+  rule valid_name_letter
+    # a single lower-case letter; the ligatures æ and œ are spelled out as
+    # "ae" and "oe", any other letter is returned as matched
+    [a-zëæœ] {
+      def value
+        case text_value
+        when 'æ' then 'ae'
+        when 'œ' then 'oe'
+        else text_value
+        end
+      end
+    }
+  end
   rule cap_digraph
     "Æ" {
       def value