RubyGems - biodiversity - Versions diffs - 0.5.16 → 0.6.0 - Mend

biodiversity 0.5.16 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

data/.gitignore +2 -0
data/README.rdoc +5 -5
data/Rakefile +8 -3
data/VERSION +1 -1
data/bin/nnparse +7 -3
data/bin/parserver +1 -0
data/lib/biodiversity/parser.rb +50 -5
data/lib/biodiversity/parser/scientific_name_clean.treetop +131 -40
data/spec/parser/scientific_name_clean.spec.rb +53 -27
data/spec/parser/test_data.txt +73 -21
metadata +4 -8
data/biodiversity.gemspec +0 -88
data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -481
data/lib/biodiversity/parser/scientific_name_clean.rb +0 -6118
data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1309

data/.gitignore CHANGED Viewed

@@ -1,3 +1,5 @@
+lib/biodiversity/parser/*rb
+*.gemspec
 *.sw?
 .DS_Store
 coverage

data/README.rdoc CHANGED Viewed

@@ -1,17 +1,17 @@
 = Biodiversity
-Parses species scientific name and breaks it into elements.
+Parses taxonomic scientific name and breaks it into semantic elements.
 == Installation
-To install gem you need RubyGems >= 1.2.0
+To install gem you need RubyGems >= 1.3.6
-  $ gem sources -a http://gems.github.com (you only have to do this once)
-  $ sudo gem install dimus-biodiversity
+  $ sudo gem install biodiversity #for ruby 1.8.x
+  $ sudo gem install biodiversity19 #for ruby 1.9.x
 == Example usage
-You can parse file with species names from command line. File should contain one scientific name per line
+You can parse file with taxonomic names from command line. File should contain one scientific name per line
   nnparser file_with_names

data/Rakefile CHANGED Viewed

@@ -13,11 +13,13 @@ Spec::Rake::SpecTask.new do |t|
   t.pattern = 'spec/**/*spec.rb'
 end
+ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
 begin
   require 'jeweler'
   Jeweler::Tasks.new do |gem|
-    gem.name = "biodiversity"
+    gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
     gem.summary = 'Parser of scientific names'
     gem.description = 'Tools for biodiversity informatics'
     gem.email = "dmozzherin@gmail.com"
@@ -37,11 +39,14 @@ end
 task :tt do
   ['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
-    system("tt #{dir}/lib/biodiversity/parser/#{f}.treetop")
-    rf = "#{dir}/lib/biodiversity/parser/#{f}.rb"
+    file = "#{dir}/lib/biodiversity/parser/#{f}"
+    FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
+    system("tt #{file}.treetop")
+    rf = "#{file}.rb"
     rfn = open(rf + ".tmp", 'w')
     skip_head = false
     f = open(rf)
+    #getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
     f.each_with_index do |l, i|
       skip_head = l.match(/^# Autogenerated/) if i == 0
       if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.5.16
+0.6.0

data/bin/nnparse CHANGED Viewed

@@ -1,11 +1,15 @@
 #!/usr/bin/env ruby
 require 'rubygems'
-gem 'biodiversity' rescue nil
+gem_name = RUBY_VERSION.split('.')[0..1].join('').to_i > 18 ? 'biodiversity19' : 'biodiversity'
+gem gem_name rescue nil
 $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
 require 'biodiversity'
 require 'json'
+def parser_error(name)
+  {'scientificName' => {'parsed' => false, 'verbatim' => name,  'error' => 'Parser error'}}.to_json
+end
 if ARGV.empty?
   puts "Usage:\n\nnnparse file_with_scientific_names [output_file]\n\ndefault output_file is parsed.json\n\n"
@@ -31,12 +35,12 @@ IO.foreach(input) do |line|
       $KCODE = 'NONE'
     end
     p.parse(name)
-    parsed_data = p.parsed.all_json rescue {'scientificName' => {'parsed' => false, 'verbatim' => name,  'error' => 'Parser error'}}.to_json
+    parsed_data = p.parsed.all_json rescue parser_error(name)
     if ruby_min_version < 19
       $KCODE = old_kcode
     end
   rescue
-    parsed_data = {'parsed' => false, 'verbatim' => name,  'error' => 'Parser error'}.to_json
+    parsed_data = parser_error(name)
   end
   o.write parsed_data + "\n"
 end

data/bin/parserver CHANGED Viewed

@@ -2,6 +2,7 @@
 require 'rubygems'
 require 'socket'
 require 'biodiversity'          # Get sockets from stdlib
+puts "Running parser service on port 4334"
 parser = ScientificNameParser.new
 server = TCPServer.open(4334)  # Socket to listen on port 4334
 loop do                         # Servers run forever

data/lib/biodiversity/parser.rb CHANGED Viewed

@@ -6,6 +6,35 @@ require File.join(dir, *%w[parser scientific_name_canonical])
 require 'rubygems'
 require 'json'
+module PreProcessor
+  NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
+  TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
+  TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
+  TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
+  NOMEN_CONCEPTS  = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
+  LAST_WORD_JUNK  = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
+  def self.clean(a_string)
+    [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
+      a_string = a_string.gsub(i, '')
+    end
+    a_string = a_string.tr('ſ','s') #old 's'
+    a_string
+  end
+end
+# we can use these expressions when we are ready to parse virus names
+# class VirusParser
+#   def initialize
+#     @order     = /^\s*[A-Z][a-z]\+virales/i
+#     @family    = /^\s*[A-Z][a-z]\+viridae|viroidae/i
+#     @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
+#     @genus     = /^\s*[A-Z][a-z]\+virus|viroid/i
+#     @species   = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
+#     @parsed    = nil
+#   end
+# end
 class ScientificNameParser
   def initialize
@@ -15,21 +44,36 @@ class ScientificNameParser
     @canonical = ScientificNameCanonicalParser.new
     @parsed = nil
   end
+  def virus?(a_string)
+    !!(a_string.match(/\sICTV\s*$/) || a_string.match(/\s(virus|phage|viroid|satellite|prion)\b/i))
+  end
   def parsed
     @parsed
   end
   def parse(a_string)
     @verbatim = a_string
-    @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || {:verbatim => a_string}
-    def @parsed.all
+    a_string = PreProcessor::clean(a_string)
+    if virus?(a_string)
+      @parsed = { :verbatim => a_string, :virus => true }
+    else
+      @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
+    end
+    def @parsed.verbatim=(a_string)
+      @verbatim = a_string
+    end
+    def @parsed.all(verbatim = @verbatim)
       parsed = self.class != Hash
       res = {:parsed => parsed}
       if parsed
         hybrid = self.hybrid rescue false
         res.merge!({
-          :verbatim => self.text_value,
+          :verbatim => @verbatim,
           :normalized => self.value,
           :canonical => self.canonical,
           :hybrid => hybrid,
@@ -51,7 +95,8 @@ class ScientificNameParser
     def @parsed.all_json
       self.all.to_json rescue ''
     end
+    @parsed.verbatim = @verbatim
     @parsed.all
   end
 end

data/lib/biodiversity/parser/scientific_name_clean.treetop CHANGED Viewed

@@ -30,6 +30,28 @@ grammar ScientificNameClean
   end
   rule scientific_name_5
+    a:multinomial_name space_hard hybrid_character space_hard b:species {
+      def value
+        a.value + " × " + b.value
+      end
+      def canonical
+        a.canonical + " × " + b.canonical
+      end
+      def pos
+        a.pos.merge(b.pos)
+      end
+      def hybrid
+        true
+      end
+      def details
+        [a.details, b.details.merge({:genus => a.details[:genus]})]
+      end
+    }
+    /
     a:scientific_name_1 space b:taxon_concept_rank space c:authorship {
       def value
         a.value + " " + b.apply(c)
@@ -62,7 +84,7 @@ grammar ScientificNameClean
       end
       def canonical
-        a.canonical + " " + b.canonical
+        a.canonical + " × " + b.canonical
       end
       def pos
@@ -196,7 +218,7 @@ grammar ScientificNameClean
       end
       def canonical
-        a.canonical + " " + b.canonical + " " + c.canonical + " " + d.canonical
+        a.canonical + " " + c.canonical + " " + d.canonical
       end
       def pos
@@ -381,7 +403,7 @@ grammar ScientificNameClean
   end
   rule rank
-    ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"subsp."/"subf."/"race"/"α"
+    ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
     /"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
     {
       def value
@@ -405,7 +427,7 @@ grammar ScientificNameClean
   end
   rule rank_forma
-    ("forma"/"form."/"fo."/"f.")
+    ("forma"/"form."/"form"/"fo."/"f.")
     {
       def value
         "f."
@@ -449,28 +471,28 @@ grammar ScientificNameClean
   end
   rule species_string
-    a:species_word &(space_hard author_prefix_word space_hard) {
-      def value
-        a.value
-      end
-      def canonical
-        a.value
-      end
-      def hybrid
-        a.hybrid rescue false
-      end
-      def pos
-        {a.interval.begin => ['species', a.interval.end]}
-      end
-      def details
-        {:species => {:string => a.value}}
-      end
-    }
-    /
+    # a:species_word &(space_hard author_prefix_word space_hard) {
+    #   def value
+    #     a.value
+    #   end
+    #
+    #   def canonical
+    #     a.value
+    #   end
+    #
+    #   def hybrid
+    #     a.hybrid rescue false
+    #   end
+    #
+    #   def pos
+    #     {a.interval.begin => ['species', a.interval.end]}
+    #   end
+    #
+    #   def details
+    #     {:species => {:string => a.value}}
+    #   end
+    # }
+    # /
     species_word {
       def canonical
         value
@@ -493,7 +515,7 @@ grammar ScientificNameClean
   end
   rule infragenus
-    left_paren space a:cap_latin_word space right_paren {
+    left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
       def value
         "(" + a.value + ")"
       end
@@ -513,7 +535,7 @@ grammar ScientificNameClean
   end
   rule genus
-    a:(cap_latin_word_pair/cap_latin_word) !(space_hard author_prefix_word space_hard author_word) {
+    a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
       def value
         a.value
       end
@@ -533,6 +555,50 @@ grammar ScientificNameClean
   end
   rule uninomial_name
+    a:uninomial_string space b:infragenus space c:simple_authorship {
+      def value
+        a.value + " " + b.value + " " + c.value
+      end
+      def canonical
+        a.canonical
+      end
+      def pos
+        a.pos.merge(b.pos).merge(c.pos)
+      end
+      def hybrid
+        false
+      end
+      def details
+        {:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
+      end
+    }
+    /
+    a:uninomial_string space b:infragenus {
+      def value
+        a.value + " " + b.value
+      end
+      def canonical
+        a.canonical
+      end
+      def pos
+        a.pos.merge(b.pos)
+      end
+      def hybrid
+        false
+      end
+      def details
+        {:uninomial => a.details[:uninomial].merge(b.details)}
+      end
+    }
+    /
     a:uninomial_string space_hard b:authorship {
       def value
         a.value + " " + b.value
@@ -799,7 +865,7 @@ grammar ScientificNameClean
   rule unknown_auth
-    ("auct."/"hort."/"anon."/"ht.") {
+    ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
       def value
         text_value
       end
@@ -837,7 +903,7 @@ grammar ScientificNameClean
   end
   rule author_name
-    space a:author_prefix_word space b:author_name space {
+    space a:author_prefix_word space b:author_name {
       def value
         a.value + " " + b.value
       end
@@ -851,7 +917,7 @@ grammar ScientificNameClean
       end
     }
     /
-    space a:author_word space b:author_name space {
+    a:author_word space b:author_name {
       def value
         a.value + " " + b.value
       end
@@ -883,7 +949,7 @@ grammar ScientificNameClean
       end
     }
     /
-    ("arg."/"et al.\{\?\}"/"et al.") {
+    ("arg."/"et al.\{\?\}"/"et al."/"et al") {
       def value
         text_value.strip
       end
@@ -930,7 +996,7 @@ grammar ScientificNameClean
   end
   rule author_prefix_word
-    space ("ab"/"bis"/"da"/"der"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
+    space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
       def value
         text_value
       end
@@ -976,6 +1042,14 @@ grammar ScientificNameClean
     }
   end
+  rule capped_dotted_char
+    [A-Z] "." {
+      def value
+        text_value
+      end
+    }
+  end
   rule species_word_hybrid
     a:multiplication_sign space b:species_word {
       def value
@@ -1051,7 +1125,9 @@ grammar ScientificNameClean
   rule species_word
     a:[0-9]+ "-"? b:latin_word {
       def value
-        a.text_value + "-" + b.value
+        num = {"1" => "uni", "2" => "du", "3" => "tri", "4" => "quadri", "5" => "quinque", "6" => "hexa", "7" => "septem", "8" => "octo", "9" => "novem", "10" => "decem", "11" => "undecim", "12" => "duodec", "13" => "tredec", "14" => "quattuordec", "15" => "quinquadec", "16" => "hexadec", "17" => "septendec", "18" => "octodec", "19" => "novemdec", "20" => "viginti", "21" => "unviginti", "22" => "duodeviginti", "23" => "triviginti", "24" => "quattuorviginti", "25" => "quinquatviginti", "26" => "hexaviginti", "27" => "septenviginti", "28" => "octoviginti", "29" => "novemviginti", "30" => "triginta", "38" => "trigintaocto", "100" => "centi"}
+        a_value = num[a.text_value] ? num[a.text_value] : a.text_value + "-"
+        a_value + b.value
       end
     }
     /
@@ -1059,18 +1135,21 @@ grammar ScientificNameClean
   end
   rule latin_word
-    a:[a-zëæœ] b:valid_name_letters {
+    a:valid_name_letters "-" b:latin_word {
+      def value
+        a.value + "-" + b.value
+      end
+    }
+    /
+    a:valid_name_letter b:valid_name_letters {
       def value
-        l = a.text_value
-        l = 'ae' if l == 'æ'
-        l = 'oe' if l == 'œ'
-        l + b.value
+        a.value + b.value
       end
      }
   end
   rule valid_name_letters
-    [a-z\-ëæœ]+ {
+    [a-zëæœ]+ {
       def value
         res = ''
         text_value.split('').each do |l|
@@ -1086,6 +1165,18 @@ grammar ScientificNameClean
     }
   end
+  rule valid_name_letter
+    [a-zëæœ] {
+      def value
+        res = text_value
+        res = 'ae' if res == 'æ'
+        res = 'oe' if res == 'œ'
+        res
+      end
+    }
+  end
   rule cap_digraph
     "Æ" {
       def value