taxonifi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +155 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/assessor/assessor.rb +31 -0
- data/lib/assessor/base.rb +17 -0
- data/lib/assessor/row_assessor.rb +131 -0
- data/lib/export/export.rb +9 -0
- data/lib/export/format/base.rb +43 -0
- data/lib/export/format/species_file.rb +341 -0
- data/lib/lumper/lumper.rb +334 -0
- data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
- data/lib/models/author_year.rb +39 -0
- data/lib/models/base.rb +73 -0
- data/lib/models/collection.rb +92 -0
- data/lib/models/generic_object.rb +15 -0
- data/lib/models/geog.rb +59 -0
- data/lib/models/geog_collection.rb +28 -0
- data/lib/models/name.rb +206 -0
- data/lib/models/name_collection.rb +149 -0
- data/lib/models/person.rb +49 -0
- data/lib/models/ref.rb +85 -0
- data/lib/models/ref_collection.rb +106 -0
- data/lib/models/species_name.rb +85 -0
- data/lib/splitter/builder.rb +26 -0
- data/lib/splitter/lexer.rb +70 -0
- data/lib/splitter/parser.rb +54 -0
- data/lib/splitter/splitter.rb +45 -0
- data/lib/splitter/tokens.rb +322 -0
- data/lib/taxonifi.rb +36 -0
- data/test/file_fixtures/Lygaeoidea.csv +801 -0
- data/test/helper.rb +38 -0
- data/test/test_exporter.rb +32 -0
- data/test/test_lumper_geogs.rb +59 -0
- data/test/test_lumper_hierarchical_collection.rb +88 -0
- data/test/test_lumper_names.rb +119 -0
- data/test/test_lumper_parent_child_name_collection.rb +41 -0
- data/test/test_lumper_refs.rb +91 -0
- data/test/test_parser.rb +34 -0
- data/test/test_splitter.rb +27 -0
- data/test/test_splitter_tokens.rb +403 -0
- data/test/test_taxonifi.rb +11 -0
- data/test/test_taxonifi_accessor.rb +61 -0
- data/test/test_taxonifi_geog.rb +51 -0
- data/test/test_taxonifi_name.rb +186 -0
- data/test/test_taxonifi_name_collection.rb +158 -0
- data/test/test_taxonifi_ref.rb +90 -0
- data/test/test_taxonifi_ref_collection.rb +69 -0
- data/test/test_taxonifi_species_name.rb +95 -0
- metadata +167 -0
data/lib/models/name_collection.rb
ADDED
@@ -0,0 +1,149 @@
module Taxonifi
  class NameCollectionError < StandardError; end
  module Model

    # A collection of taxonomic names.
    class NameCollection < Taxonifi::Model::Collection

      attr_accessor :by_name_index
      attr_accessor :ref_collection

      def initialize(options = {})
        super
        @collection = []
        @by_name_index = {} # "foo => [1,2,3]"
        Taxonifi::RANKS.inject(@by_name_index){|hsh, v| hsh.merge!(v => {})}
        @by_name_index['unknown'] = {} # unranked names get dumped in here
        @ref_collection = nil
        true
      end

      def object_class
        Taxonifi::Model::Name
      end

      # Return the highest RANK for which there is no
      # name in this collection.
      def encompassing_rank
        highest = RANKS.size
        @collection.each do |n|
          h = RANKS.index(n.rank)
          highest = h if h < highest
        end
        RANKS[highest - 1]
      end

      # The name objects in the collection at a given rank.
      # TODO: Should index this on add_object
      def names_at_rank(rank)
        raise if !RANKS.include?(rank)
        names = []
        @collection.each do |n|
          names << n if n.rank == rank
        end
        names
      end

      # Returns the id of a matching existing name,
      # or false if there is no match.
      # Matches against name (string) and parents ("identity").
      def name_exists?(name = Taxonifi::Model::Name)
        # Does the name (string) exist?
        rank = name.rank.downcase
        rank ||= 'unknown'
        if by_name_index[rank][name.name]
          # Yes, check to see if parents match
          by_name_index[rank][name.name].each do |id|
            vector = parent_id_vector(id)
            vector.pop
            if vector == parent_id_vector(name.parent.id)
              exists = true
              return id
            end
          end
        end
        false
      end

      # Add an individual name object, indexing it.
      def add_object(obj)
        super
        index_by_name(obj)
        obj
      end

      # Add an individual name object, without indexing it.
      def add_object_pre_indexed(obj)
        super
        index_by_name(obj)
        obj
      end

      # Add the names in a Taxonifi::Model::SpeciesName object
      # as individual Name objects.
      def add_species_name(sn)
        raise "Failed trying to load [#{sn.display_name}]. SpeciesName#genus#parent must be set before using add_species_name." if sn.genus.parent.nil?
        current_parent_id = sn.genus.parent.id
        sn.names.each do |o|
          o.parent = object_by_id(current_parent_id)
          if id = name_exists?(o)
            cp_id = id
          else
            add_object(o)
            cp_id = o.id
          end
          current_parent_id = cp_id
        end
        current_parent_id # return the id of the last name created
      end

      # As #add_species_name, but do
      # not assign ids to the incoming names.
      # TODO: deprecate?
      def add_species_name_unindexed(sn)
        sn.names.each do |o|
          if !name_exists?(o)
            add_object(o)
          end
        end
      end

      # Take the author/years of these names and generate a reference collection.
      # Start the ids assigned to the references with initial_id.
      def generate_ref_collection(initial_id = 0)
        rc = Taxonifi::Model::RefCollection.new(:initial_id => initial_id)
        if collection.size > 0
          uniques = collection.inject({}){|hsh, n| hsh.merge!(n.author_year_string => nil)}.keys.compact
          if uniques.size > 0
            uniques.sort.each_with_index do |r, i|
              next if r.size == 0
              ref = Taxonifi::Model::Ref.new(:author_year => r)
              rc.add_object(ref)
            end
          end
        end
        @ref_collection = rc
      end

      # Assign a reference collection to this name collection.
      # !! Overwrites an existing reference collection, including ones built
      # using generate_ref_collection.
      def ref_collection=(ref_collection)
        @ref_collection = ref_collection if ref_collection.class == Taxonifi::Model::RefCollection
      end

      protected

      # Index the object by name into the
      # @by_name_index variable (this looks like:
      # {"Foo bar" => [1,2,93]})
      def index_by_name(obj)
        rank = obj.rank
        rank ||= 'unknown'
        by_name_index[rank][obj.name] ||= []
        by_name_index[rank][obj.name].push obj.id
      end

    end
  end
end
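
For orientation, a minimal usage sketch of the NameCollection API above (not taken from the gem itself). It assumes that `require 'taxonifi'` loads the models, that Taxonifi::Model::Name accepts :name and :rank options (as the splitter parser later in this diff does), and that id assignment happens in the base Collection class, which is not reproduced in this section.

require 'taxonifi'

nc = Taxonifi::Model::NameCollection.new

genus = Taxonifi::Model::Name.new(name: 'Lygaeus', rank: 'genus')
nc.add_object(genus)                   # adds, then indexes by rank/name in @by_name_index

species = Taxonifi::Model::Name.new(name: 'alboornatus', rank: 'species')
species.parent = genus
nc.add_object(species)

nc.names_at_rank('genus').map(&:name)  # => ["Lygaeus"], per names_at_rank above
nc.generate_ref_collection(1)          # RefCollection built from the names' author/year strings
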
data/lib/models/person.rb
ADDED
@@ -0,0 +1,49 @@
require File.expand_path(File.join(File.dirname(__FILE__), "../models/base.rb"))

module Taxonifi
  module Model

    # Simple Person class.
    # You can store multiple initials and suffixes.
    class Person < Taxonifi::Model::Base
      ATTRIBUTES = [
        :first_name,
        :last_name,
        :initials, # an Array, no periods.
        :suffix    # an Array
      ]

      ATTRIBUTES.each do |a|
        attr_accessor a
      end

      def initialize(options = {})
        opts = {
        }.merge!(options)
        # Check for valid opts prior to building
        build(ATTRIBUTES, opts)
        true
      end

      # Returns a string with data delimited by pipes.
      # Used in identity comparisons.
      def compact_string
        s = [ATTRIBUTES.sort.collect{|a| send(a)}].join("|").downcase.gsub(/\s/, '')
      end

      # Nothing fancy, just the data.
      def display_name
        [@last_name, @first_name, @initials, @suffix].compact.flatten.join(" ")
      end

      # Return a string representing the initials, periods added.
      def initials_string
        if @initials.nil?
          nil
        else
          @initials.join(".") + "."
        end
      end
    end
  end
end
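
A small illustrative example (not from the gem) of the Person model, assuming Base#build simply assigns the listed ATTRIBUTES from the options hash, as the other models in this diff do.

p = Taxonifi::Model::Person.new(last_name: 'Slater', initials: ['J', 'A'])
p.display_name     # => "Slater J A"
p.initials_string  # => "J.A."
p.compact_string   # lower-cased, pipe-delimited identity string (see above)
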
data/lib/models/ref.rb
ADDED
@@ -0,0 +1,85 @@
module Taxonifi
  class RefError < StandardError; end
  module Model

    # A basic reference object.
    class Ref < Taxonifi::Model::Base

      # These attributes are set automatically on #new()
      ATTRIBUTES = [
        :authors,
        :title,
        :year,
        :publication,
        :volume,
        :number,
        :pages,
        :pg_start,
        :pg_end,
        :cited_page,
        :full_citation
      ]

      # Array of Taxonifi::Model::Person
      attr_accessor :authors
      # String
      attr_accessor :title
      # String
      attr_accessor :year
      # String
      attr_accessor :publication
      # String
      attr_accessor :volume
      # String
      attr_accessor :number
      # String. Anything that doesn't fit in a page range.
      attr_accessor :pages
      # String
      attr_accessor :pg_start
      # String
      attr_accessor :pg_end
      # String. Some specific page(s) of note.
      attr_accessor :cited_page
      # String. The full text of the citation, as read from input or assigned, not computed from individual components.
      attr_accessor :full_citation

      # String. Computed index based on existing Ref#authors and Ref#year
      attr_accessor :author_year_index

      # If :author_year is passed it is broken down into People + year.
      def initialize(options = {})
        opts = {
        }.merge!(options)
        @parent = nil
        build(ATTRIBUTES, opts)
        @authors = [] if @authors.nil?
        raise Taxonifi::RefError, 'If :author_year is provided then authors and year must not be.' if opts[:author_year] && (!opts[:year].nil? || !opts[:authors].nil?)
        add_author_year(opts[:author_year]) if !opts[:author_year].nil? && opts[:author_year].size > 0
        true
      end

      def add_author_year(string)
        auth_yr = Taxonifi::Splitter::Builder.build_author_year(string)
        @year = auth_yr.year
        @authors = auth_yr.people
      end

      # Returns a pipe delimited representation of the reference.
      def compact_string
        s = [authors.collect{|a| a.compact_string}.join, year, self.title, publication, volume, number, pages, pg_start, pg_end, cited_page].join("|").downcase.gsub(/\s/, '')
        s
      end

      # Return a by author_year index.
      def author_year_index
        @author_year_index ||= generate_author_year_index
      end

      # (re-) generate the author year index.
      def generate_author_year_index
        @author_year_index = Taxonifi::Model::AuthorYear.new(people: @authors, year: @year).compact_index
      end

    end
  end
end
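
An illustrative sketch (not from the gem) of Ref#new with the :author_year shortcut. The actual parse depends on the token definitions in data/lib/splitter/tokens.rb, which this section does not show, so the input format used here is an assumption.

ref = Taxonifi::Model::Ref.new(
  author_year: 'Smith, 1920',
  title:       'On some Lygaeoidea',
  publication: 'Journal of Natural History',
  volume:      '12'
)
ref.year            # 1920, parsed from :author_year (assuming the AuthorYear token accepts this form)
ref.authors         # Array of Taxonifi::Model::Person built by the same call
ref.compact_string  # pipe-delimited identity string used for de-duplication
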
data/lib/models/ref_collection.rb
ADDED
@@ -0,0 +1,106 @@
module Taxonifi
  class RefCollectionError < StandardError; end

  module Model

    # A collection of references.
    class RefCollection < Taxonifi::Model::Collection

      # An optional index, used when there is one reference per row.
      attr_accessor :row_index

      # Points a Ref#id to an array of Person#ids.
      # Built on request.
      attr_accessor :author_index

      def initialize(options = {})
        super
        @row_index = []
        @author_index = {}
        true
      end

      # The instance collection class.
      def object_class
        Taxonifi::Model::Ref
      end

      # The object at a given row.
      # TODO: inherit from Collection?
      def object_from_row(row_number)
        @row_index[row_number]
      end

      # Incrementally (re-)assigns the id of every associated author (Person).
      # This is only really useful if you assume every author is unique.
      def enumerate_authors(initial_id = 0)
        i = initial_id
        collection.each do |r|
          r.authors.each do |a|
            a.id = i
            i += 1
          end
        end
      end

      # Finds unique authors, and combines them, then
      # rebuilds author lists using references to the new unique set.
      def uniquify_authors(initial_id = 0)
        auth_index = {}
        unique_authors.each_with_index do |a, i|
          a.id = i + initial_id
          auth_index.merge!(a.compact_string => a)
        end

        collection.each do |r|
          new_authors = []
          r.authors.inject(new_authors){|ary, a| ary.push(auth_index[a.compact_string])}
          r.authors = new_authors
        end
        true
      end

      # Build the author index.
      # {Ref#id => [a1#id, ... an#id]}
      def build_author_index
        collection.each do |r|
          @author_index.merge!(r.id => r.authors.collect{|a| a.id ? a.id : -1})
        end
      end

      # Return an array of the unique author strings in this collection.
      def unique_author_strings
        auths = {}
        collection.each do |r|
          r.authors.each do |a|
            auths.merge!(a.display_name => nil)
          end
        end
        auths.keys.sort
      end

      # Returns an Array of Taxonifi::Model::Person.
      # Will need better indexing on big lists?
      def unique_authors
        auths = []
        collection.each do |r|
          r.authors.each do |a|
            found = false
            auths.each do |x|
              if a.identical?(x)
                found = true
                next
              end
            end
            if not found
              auths.push a.clone
            end
          end
        end
        auths
      end

    end
  end

end
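
A hedged sketch of author de-duplication with the collection above. It assumes Collection#add_object (defined in the base class, not shown in this section) appends and assigns ids, and that Person#identical? compares people by their compact_string.

rc = Taxonifi::Model::RefCollection.new
rc.add_object(Taxonifi::Model::Ref.new(author_year: 'Smith, 1920'))
rc.add_object(Taxonifi::Model::Ref.new(author_year: 'Smith, 1922'))

rc.unique_author_strings  # one display string shared by both refs
rc.uniquify_authors(1)    # de-duplicates Person objects, ids starting at 1
rc.build_author_index     # fills @author_index with Ref#id => [Person#id, ...]
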
data/lib/models/species_name.rb
ADDED
@@ -0,0 +1,85 @@
module Taxonifi
  class SpeciesNameError < StandardError; end
  module Model

    # The species name model is just a pointer to 5 Taxonifi::Model::Names.
    # The various metadata (author, year, original combination) is stored with the individual
    # instances of those names.
    # Taxonifi::Model::Names have no ids!

    class SpeciesName < Taxonifi::Model::Base
      ATTRIBUTES = [:genus, :subgenus, :species, :subspecies, :parent]
      ATTRIBUTES.each do |a|
        attr_accessor a
      end

      def initialize(options = {})
        opts = {
        }.merge!(options)
        build(ATTRIBUTES, opts)
        true
      end

      # Set the genus name.
      def genus=(genus)
        @genus = genus
      end

      # Set the subgenus name.
      def subgenus=(subgenus)
        raise Taxonifi::SpeciesNameError, "Species name must have a Genus name before subgenus can be assigned" if @genus.nil?
        @subgenus = subgenus
        @subgenus.parent = @genus
      end

      # Set the species name.
      def species=(species)
        raise Taxonifi::SpeciesNameError, "Species name must have a Genus name before species can be assigned" if @genus.nil?
        @species = species
        @species.parent = (@subgenus ? @subgenus : @genus)
      end

      # Set the subspecies name.
      def subspecies=(subspecies)
        raise Taxonifi::SpeciesNameError, "Subspecies name must have a species name before species can be assigned" if @species.nil?
        @subspecies = subspecies
        @subspecies.parent = @species
      end

      # Set the parent name.
      def parent=(parent)
        if parent.class != Taxonifi::Model::Name
          raise SpeciesNameError, "Parent is not a Taxonifi::Model::Name."
        end

        if parent.rank.nil? || (Taxonifi::RANKS.index('genus') <= Taxonifi::RANKS.index(parent.rank))
          raise Taxonifi::SpeciesNameError, "Parents of SpeciesNames must have rank higher than Genus."
        end

        @parent = parent
      end

      # Return an array of Name objects.
      def names
        ATTRIBUTES.collect{|a| self.send(a)}.compact
      end

      # Return a string representation of the species name.
      def display_name
        strs = []
        self.names.each do |n|
          case n.rank
          when 'subgenus'
            strs.push "(#{n.name})"
          else
            strs.push n.name
          end
        end
        strs.push self.names.last.author_year
        txt = strs.compact.join(" ")
        txt
      end
    end
  end
end
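
A hedged assembly example (not from the gem). Order matters: the setters above require a genus before subgenus/species and a species before subspecies, because each setter wires the parent of the incoming Name. It assumes Name accepts :name and :rank options, as in the splitter parser below.

sn = Taxonifi::Model::SpeciesName.new
sn.genus   = Taxonifi::Model::Name.new(name: 'Lygaeus', rank: 'genus')
sn.species = Taxonifi::Model::Name.new(name: 'alboornatus', rank: 'species')

sn.names.map(&:rank)  # => ["genus", "species"]
sn.display_name       # "Lygaeus alboornatus", plus the species author/year when set
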
data/lib/splitter/builder.rb
ADDED
@@ -0,0 +1,26 @@
# Builder functionality for parsing/lexing framework.
module Taxonifi::Splitter::Builder

  # Load all builders (= models)
  # TODO: perhaps use a different scope that doesn't require loading all at once
  Dir.glob( File.expand_path(File.join(File.dirname(__FILE__), "../models/*.rb") )) do |file|
    require file
  end

  # Build and return Taxonifi::Model::AuthorYear from a string.
  def self.build_author_year(text)
    lexer = Taxonifi::Splitter::Lexer.new(text)
    builder = Taxonifi::Model::AuthorYear.new
    Taxonifi::Splitter::Parser.new(lexer, builder).parse_author_year
    builder
  end

  # Build and return Taxonifi::Model::SpeciesName from a string.
  def self.build_species_name(text)
    lexer = Taxonifi::Splitter::Lexer.new(text, :species_name)
    builder = Taxonifi::Model::SpeciesName.new
    Taxonifi::Splitter::Parser.new(lexer, builder).parse_species_name
    builder
  end

end
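
A brief usage sketch of the two Builder entry points. What the lexer accepts is governed by data/lib/splitter/tokens.rb (not reproduced in this section), so the example strings are assumptions about the supported formats.

ay = Taxonifi::Splitter::Builder.build_author_year('Smith, 1920')
ay.year    # Integer year set by Parser#parse_author_year
ay.people  # Array of Taxonifi::Model::Person

sn = Taxonifi::Splitter::Builder.build_species_name('Lygaeus alboornatus Smith, 1920')
sn.display_name
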
data/lib/splitter/lexer.rb
ADDED
@@ -0,0 +1,70 @@
#
# Lexer taken verbatim from OboParser and other mjy gems.
#
class Taxonifi::Splitter::Lexer
  attr_reader :input, :token_list
  def initialize(input, token_list = nil)

    raise Taxonifi::Splitter::SplitterError, "Invalid token list passed to Lexer." if (!token_list.nil? && !Taxonifi::Splitter::TOKEN_LISTS.include?(token_list) )
    token_list = :global_token_list if token_list.nil?

    @input = input
    @token_list = token_list
    @next_token = nil
  end

  # Checks whether the next token is of the specified class.
  def peek(token_class, token_list = nil)
    token = read_next_token(token_class)
    return token.class == token_class
  end

  # Return (and delete) the next token from the input stream, or raise an exception
  # if the next token is not of the given class.
  def pop(token_class)
    token = read_next_token(token_class)
    @next_token = nil
    if token.class != token_class
      raise(Taxonifi::Splitter::SplitterError, "expected #{token_class.to_s} but received #{token.class.to_s} at #{@input[0..10]}...", caller)
    else
      return token
    end
  end

  private

  # Read (and store) the next token from the input, if it has not already been read.
  def read_next_token(token_class)
    if @next_token
      return @next_token
    else
      # check for a match on the specified class first
      if match(token_class)
        return @next_token
      else
        # now check all the tokens for a match
        Taxonifi::Splitter::Tokens.send(@token_list).each {|t|
          return @next_token if match(t)
        }
      end
      # no match, either end of string or lex-error
      if @input != ''
        raise(Taxonifi::Splitter::SplitterError, "Lexer Error, unknown token at |#{@input[0..20]}...", caller)
      else
        return nil
      end
    end
  end

  # Match a token to the input.
  def match(token_class)
    if (m = token_class.regexp.match(@input))
      @next_token = token_class.new(m[1])
      @input = @input[m.end(0)..-1]
      return true
    else
      return false
    end
  end

end
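
A minimal peek/pop example. The token classes come from data/lib/splitter/tokens.rb, which is not included in this section; Tokens::AuthorYear is used here because the parser below pops it the same way, and the input string is only an assumption about what that token matches.

lexer = Taxonifi::Splitter::Lexer.new('Smith, 1920')
if lexer.peek(Taxonifi::Splitter::Tokens::AuthorYear)
  t = lexer.pop(Taxonifi::Splitter::Tokens::AuthorYear)  # consumes the matched input
  t.year
end
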
data/lib/splitter/parser.rb
ADDED
@@ -0,0 +1,54 @@
#
# Parser pattern taken from OboParser and other mjy gems.
#
# The parser takes a builder and a lexer and does the actual breakdown.
#
class Taxonifi::Splitter::Parser
  def initialize(lexer, builder )
    @lexer = lexer
    @builder = builder
  end

  # parse out an author year combination.
  # TODO: This is only indirectly tested in lumper code
  def parse_author_year
    t = @lexer.pop(Taxonifi::Splitter::Tokens::AuthorYear)

    lexer = Taxonifi::Splitter::Lexer.new(t.authors)
    authors = lexer.pop(Taxonifi::Splitter::Tokens::Authors)

    # TODO: A people collection?
    authors.names.each do |a|
      n = Taxonifi::Model::Person.new()
      n.last_name = a[:last_name]
      n.initials = a[:initials]
      @builder.people.push n
    end

    @builder.year = t.year.to_i
    @builder.parens = t.parens
  end

  # Parse a species name
  def parse_species_name
    t = @lexer.pop(Taxonifi::Splitter::Tokens::Quadrinomial)
    ranks = %w{genus subgenus species subspecies}
    names = {}
    last_parent = nil
    ranks.each do |r|
      names.merge!(r: nil)
      @builder.send("#{r}=", Taxonifi::Model::Name.new(:name => t.send(r), rank: r) ) if t.send(r)
    end

    if @lexer.peek(Taxonifi::Splitter::Tokens::AuthorYear)
      t = @lexer.pop(Taxonifi::Splitter::Tokens::AuthorYear)
      @builder.names.last.author = t.authors
      @builder.names.last.year = t.year
      @builder.names.last.parens = !t.parens
      @builder.names.last.derive_authors_year
    end

    @builder
  end

end
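
For completeness, a sketch of driving the Parser directly instead of going through Splitter::Builder, mirroring Builder.build_species_name above. The :species_name token list and the Quadrinomial token are defined in data/lib/splitter/tokens.rb, which is not shown here.

lexer   = Taxonifi::Splitter::Lexer.new('Lygaeus alboornatus', :species_name)
builder = Taxonifi::Model::SpeciesName.new
Taxonifi::Splitter::Parser.new(lexer, builder).parse_species_name
builder.species.name  # => "alboornatus"
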
data/lib/splitter/splitter.rb
ADDED
@@ -0,0 +1,45 @@
module Taxonifi

  # An implementation of the parser/lexer/token pattern by Krishna Dole which in turn was based on
  # Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library, which has evolved
  # into mjy's obo_parser/nexus_parser libraries.
  module Splitter

    TOKEN_LISTS = [
      :global_token_list,
      :volume_number,
      :pages,
      :species_name
    ]

    class SplitterError < StandardError; end

    require File.expand_path(File.join(File.dirname(__FILE__), 'tokens'))
    require File.expand_path(File.join(File.dirname(__FILE__), 'parser'))
    require File.expand_path(File.join(File.dirname(__FILE__), 'lexer'))
    require File.expand_path(File.join(File.dirname(__FILE__), 'builder'))


    # stub, we might not need
    class Splitter
      def initialize
        true
      end
    end

  end # end Splitter module
end # Taxonifi module


#= Implementation

def do_bar(input)
  @input = input
  raise(Taxonifi::Splitter::SplitterError, "Nothing passed to parse!") if !@input || @input.size == 0

  builder = Taxonifi::Splitter::SplitterBuilder.new
  lexer = Taxonifi::Splitter::Lexer.new(@input)
  Taxonifi::Splitter::Parser.new(lexer, builder).foo
  return builder.bar
end