RubyGems - nexus_parser - Versions diffs - 1.0.0 - Mend

nexus_parser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/.document +5 -0
data/.gitignore +21 -0
data/LICENSE +20 -0
data/MIT-LICENSE +20 -0
data/README +13 -0
data/README.rdoc +17 -0
data/Rakefile +53 -0
data/VERSION +1 -0
data/init.rb +1 -0
data/install.rb +1 -0
data/lib/lexer.rb +66 -0
data/lib/nexus_file.rb +282 -0
data/lib/parser.rb +334 -0
data/lib/tokens.rb +269 -0
data/tasks/nexus_parser_tasks.rake +4 -0
data/test/MX_test_03.nex +234 -0
data/test/test.nex +382 -0
data/test/test_nexus_parser.rb +937 -0
data/uninstall.rb +1 -0
metadata +82 -0

data/lib/parser.rb ADDED

@@ -0,0 +1,334 @@
+class NexusFile::Parser
+  def initialize(lexer, builder)
+    @lexer = lexer
+    @builder = builder
+  end
+  def parse_file
+    # nf = @builder.new_nexus_file # create new local NexusFile instance, nf
+    blks = []
+    @lexer.pop(NexusFile::Tokens::NexusStart)
+    while @lexer.peek(NexusFile::Tokens::BeginBlk)
+      @lexer.pop(NexusFile::Tokens::BeginBlk) # pop it
+      if @lexer.peek(NexusFile::Tokens::AuthorsBlk)
+        parse_authors_blk
+      # we parse these below
+      elsif @lexer.peek(NexusFile::Tokens::TaxaBlk)
+        @lexer.pop(NexusFile::Tokens::TaxaBlk )
+        parse_taxa_blk
+      elsif @lexer.peek(NexusFile::Tokens::ChrsBlk)
+        @lexer.pop(NexusFile::Tokens::ChrsBlk)
+        parse_characters_blk
+      elsif @lexer.peek(NexusFile::Tokens::NotesBlk)
+        @lexer.pop(NexusFile::Tokens::NotesBlk)
+        parse_notes_blk
+      # we should parse this
+      elsif @lexer.peek(NexusFile::Tokens::SetsBlk)
+        @lexer.pop(NexusFile::Tokens::SetsBlk)
+      # we don't parse these
+      elsif @lexer.peek(NexusFile::Tokens::TreesBlk)
+        @foo =  @lexer.pop(NexusFile::Tokens::TreesBlk).value
+      elsif @lexer.peek(NexusFile::Tokens::LabelsBlk)
+        @lexer.pop(NexusFile::Tokens::LabelsBlk)
+      elsif @lexer.peek(NexusFile::Tokens::MqCharModelsBlk)
+        @lexer.pop(NexusFile::Tokens::MqCharModelsBlk)
+      elsif @lexer.peek(NexusFile::Tokens::AssumptionsBlk)
+        @lexer.pop(NexusFile::Tokens::AssumptionsBlk)
+      elsif @lexer.peek(NexusFile::Tokens::CodonsBlk)
+        @lexer.pop(NexusFile::Tokens::CodonsBlk)
+      end
+    end
+  end
+  # just removes it for the time being
+  def parse_authors_blk
+    # thing has non single word key/value pairs, like "AUTHOR NAME", SIGH
+    # for now just slurp it all up.
+    @lexer.pop(NexusFile::Tokens::AuthorsBlk )
+    #while true
+    #  if @lexer.peek(NexusFile::Tokens::EndBlk)
+    #    @lexer.pop(NexusFile::Tokens::EndBlk)
+    #    break
+    #  else
+     #   while @lexer.peek(NexusFile::Tokens::ValuePair)
+     #     # IMPORTANT, these are going to a general hash, there may ultimately be overlap of keys used in different blocks, this is ignored at present
+     #     @builder.add_var(@lexer.pop(NexusFile::Tokens::ValuePair).value)
+     #   end
+        #@lexer.pop(NexusFile::Tokens::ID) if @lexer.peek(NexusFile::Tokens::ID)
+     # end
+    #end
+  end
+  def parse_taxa_blk
+    @lexer.pop(NexusFile::Tokens::Title) if @lexer.peek(NexusFile::Tokens::Title)
+    # need to not ignore to test against
+    parse_dimensions if @lexer.peek(NexusFile::Tokens::Dimensions)
+    while true
+      if @lexer.peek(NexusFile::Tokens::EndBlk)
+        @lexer.pop(NexusFile::Tokens::EndBlk)
+        break
+      else
+        if @lexer.peek(NexusFile::Tokens::Taxlabels)
+          @lexer.pop(NexusFile::Tokens::Taxlabels) if @lexer.peek(NexusFile::Tokens::Taxlabels)
+          i = 0
+          while @lexer.peek(NexusFile::Tokens::Label)
+            @builder.update_taxon(:index => i, :name => @lexer.pop(NexusFile::Tokens::Label).value)
+            i += 1
+          end
+          @lexer.pop(NexusFile::Tokens::SemiColon) if @lexer.peek(NexusFile::Tokens::SemiColon) # close of tax labels, placement of this seems dubious... but tests are working
+        elsif  @lexer.peek(NexusFile::Tokens::MesquiteIDs)
+          @lexer.pop(NexusFile::Tokens::MesquiteIDs) # trashing these for now
+        elsif  @lexer.peek(NexusFile::Tokens::MesquiteBlockID)
+          @lexer.pop(NexusFile::Tokens::MesquiteBlockID)
+        end
+      end
+    end
+  end
+  def parse_characters_blk
+    while true
+      if @lexer.peek(NexusFile::Tokens::EndBlk) # we're at the end of the block, exit after geting rid of the semi-colon
+        break
+      else
+        @lexer.pop(NexusFile::Tokens::Title) if @lexer.peek(NexusFile::Tokens::Title) # not used at present
+        parse_dimensions if @lexer.peek(NexusFile::Tokens::Dimensions)
+        parse_format if @lexer.peek(NexusFile::Tokens::Format)
+        parse_chr_state_labels if @lexer.peek(NexusFile::Tokens::CharStateLabels)
+        parse_matrix if @lexer.peek(NexusFile::Tokens::Matrix)
+        # handle "\s*OPTIONS MSTAXA = UNCERTAIN;\s\n" within a characters block (sticks in an infinite loop right now)
+        @lexer.pop(NexusFile::Tokens::MesquiteIDs) if @lexer.peek(NexusFile::Tokens::MesquiteIDs) # trashing these for now
+        @lexer.pop(NexusFile::Tokens::MesquiteBlockID) if @lexer.peek(NexusFile::Tokens::MesquiteBlockID) # trashing these for now
+        false
+      end
+    end
+    @lexer.pop(NexusFile::Tokens::EndBlk)
+  end
+  # prolly pop header then fuse with parse_dimensions
+  def parse_format
+    @lexer.pop(NexusFile::Tokens::Format)
+    while @lexer.peek(NexusFile::Tokens::ValuePair)
+      @builder.add_var(@lexer.pop(NexusFile::Tokens::ValuePair).value)
+    end
+    check_initialization_of_ntax_nchar
+  end
+  def parse_dimensions
+    @lexer.pop(NexusFile::Tokens::Dimensions)
+    while @lexer.peek(NexusFile::Tokens::ValuePair)
+      @builder.add_var(@lexer.pop(NexusFile::Tokens::ValuePair).value)
+    end
+    # the last value pair with a ; is automagically handled, don't try popping it again
+    check_initialization_of_ntax_nchar
+  end
+  def check_initialization_of_ntax_nchar
+    # check for character dimensions, if otherwise not set generate them
+    if @builder.nexus_file.vars[:nchar] && @builder.nexus_file.characters == []
+      (0..(@builder.nexus_file.vars[:nchar].to_i - 1)).each {|i| @builder.stub_chr }
+    end
+    # check for taxa dimensions, if otherwise not set generate them
+    if @builder.nexus_file.vars[:ntax] && @builder.nexus_file.taxa == []
+      (0..(@builder.nexus_file.vars[:ntax].to_i - 1)).each {|i| @builder.stub_taxon }
+    end
+  end
+  def parse_chr_state_labels
+    @lexer.pop(NexusFile::Tokens::CharStateLabels)
+    while true
+      if @lexer.peek(NexusFile::Tokens::SemiColon)
+        break
+      else
+        opts = {}
+        name = ""
+        index = @lexer.pop(NexusFile::Tokens::Number).value.to_i
+        (name = @lexer.pop(NexusFile::Tokens::Label).value) if @lexer.peek(NexusFile::Tokens::Label) # not always given a letter
+        @lexer.pop(NexusFile::Tokens::BckSlash) if @lexer.peek(NexusFile::Tokens::BckSlash)
+        if !@lexer.peek(NexusFile::Tokens::Comma) || !@lexer.peek(NexusFile::Tokens::SemiColon)
+          i = 0
+          # three kludge lines, need to figure out the label/number priority, could be issue in list order w/in tokens
+          while @lexer.peek(NexusFile::Tokens::Label) || @lexer.peek(NexusFile::Tokens::Number)
+            opts.update({i.to_s => @lexer.pop(NexusFile::Tokens::Label).value}) if @lexer.peek(NexusFile::Tokens::Label)
+            opts.update({i.to_s => @lexer.pop(NexusFile::Tokens::Number).value.to_s}) if @lexer.peek(NexusFile::Tokens::Number)
+            i += 1
+          end
+        end
+        @lexer.pop(NexusFile::Tokens::Comma) if @lexer.peek(NexusFile::Tokens::Comma) # we may also have hit semicolon
+        opts.update({:index => (index - 1), :name => name})
+        raise(ParserError, "Error parsing character state labels for (or around) character #{index -1}.") if !opts[:name]
+        @builder.update_chr(opts)
+      end
+    end
+    @lexer.pop(NexusFile::Tokens::SemiColon)
+  end
+  def parse_matrix
+    @lexer.pop(NexusFile::Tokens::Matrix)
+    i = 0
+      while true
+        if @lexer.peek(NexusFile::Tokens::SemiColon)
+         break
+        else
+          t = @lexer.pop(NexusFile::Tokens::Label).value
+          @builder.update_taxon(:index => i, :name => t) # if it exists its not re-added
+          @builder.code_row(i, @lexer.pop(NexusFile::Tokens::RowVec).value)
+          i += 1
+        end
+      end
+    @lexer.pop(NexusFile::Tokens::SemiColon) # pop the semicolon
+  end
+  # this suck(s/ed), it needs work when a better API for Mesquite comes out
+  def parse_notes_blk
+    # IMPORTANT - we don't parse the (CM <note>), we just strip the "(CM" ... ")" bit for now in NexusFile::Note
+    @vars = {}
+    inf = 0
+    while true
+      inf += 1
+      raise "Either you have a gazillion notes or more likely parser is caught in an infinite loop inside parse_notes_block" if inf > 100000
+      if @lexer.peek(NexusFile::Tokens::EndBlk)
+        @lexer.pop(NexusFile::Tokens::EndBlk)
+        @builder.add_note(@vars) # one still left to add
+        break
+      else
+        if @lexer.peek(NexusFile::Tokens::ValuePair)
+          @vars.update(@lexer.pop(NexusFile::Tokens::ValuePair).value)
+        elsif @lexer.peek(NexusFile::Tokens::Label)
+          if @vars[:type] # we have the data for this row write it, and start a new one
+            @builder.add_note(@vars)
+            @vars = {}
+          else
+            @vars.update(:type => @lexer.pop(NexusFile::Tokens::Label).value)
+          end
+        elsif @lexer.peek(NexusFile::Tokens::FileLbl)
+          @lexer.pop(NexusFile::Tokens::FileLbl)
+          @vars.update(:file => 'file') # we check for whether :file key is present and handle conditionally
+        end
+      end
+    end
+  end
+    #@vars = {}
+    #while true
+    #  break if  @lexer.peek(NexusFile::Tokens::EndBlk)
+    #  @vars.update(:type => @lexer.pop(NexusFile::Tokens::Label).value)
+      # kludge to get around the funny construct that references file
+     # if @lexer.peek(NexusFile::Tokens::FileLbl)
+    #    @lexer.pop(NexusFile::Tokens::FileLbl)
+    #      vars.update(:file => 'file') # we check for whether :file key is present and handle conditionally
+     #   end
+     #   while true
+     #     meh = @lexer.pop(NexusFile::Tokens::ValuePair)
+     #     @vars.update(meh.value)
+     #     break if !@lexer.peek(NexusFile::Tokens::ValuePair)
+     #   end
+     #
+     #   @builder.add_note(@vars)
+     #   @vars = {}
+    #end
+   # @lexer.pop(NexusFile::Tokens::EndBlk)
+  def parse_trees_blk
+    true
+  end
+  def parse_labels_blk
+  end
+  def parse_sets_blk
+  end
+  def parse_assumptions_blk
+  end
+  def parse_codens_blk
+    # not likely
+  end
+  def parse_mesquitecharmodels_blk
+    # nor this
+  end
+  def parse_mesquite_blk
+  end
+  # def parse_children(parent)
+  # parse a comma-separated list of nodes
+  #  while true
+  #    parse_node(parent)
+  #    if @lexer.peek(NexusFile::Tokens::Comma)
+  #      @lexer.pop(NexusFile::Tokens::Comma)
+  #    else
+  #      break
+  #    end
+  #  end
+  # end
+end

data/lib/tokens.rb ADDED

@@ -0,0 +1,269 @@
+module NexusFile::Tokens
+  class Token
+    # this allows access the the class attribute regexp, without using a class variable
+    class << self; attr_reader :regexp; end
+    attr_reader :value
+    def initialize(str)
+      @value = str
+    end
+  end
+  # in ruby, \A is needed if you want to only match at the beginning of the string, we need this everywhere, as we're
+  # moving along popping off
+  class NexusStart < Token
+    @regexp = Regexp.new(/\A.*(\#nexus)\s*/i)
+  end
+  # at present we strip comments pre-parser initialization, because they can be placed anywhere it gets tricky to parse otherwise, and besides, they are non-standard
+  # class NexusComment < Token
+  #   @regexp = Regexp.new(/\A\s*(\[[^\]]*\])\s*/i)
+  #   def initialize(str)
+  #     str = str[1..-2] # strip the []
+  #     str.strip!
+  #    @value = str
+  #  end
+  # end
+  class BeginBlk < Token
+    @regexp = Regexp.new(/\A\s*(\s*Begin\s*)/i)
+  end
+  class EndBlk < Token
+    @regexp = Regexp.new(/\A\s*([\s\n]*End[\s\n]*;[\s\n]*)/i)
+  end
+  # label
+  class AuthorsBlk < Token
+    @regexp = Regexp.new(/\A\s*(Authors;.*?END;)\s*/im)
+  end
+  # label
+  class TaxaBlk < Token
+    @regexp = Regexp.new(/\A\s*(\s*Taxa\s*;)\s*/i)
+  end
+  # label
+  class NotesBlk < Token
+    @regexp = Regexp.new(/\A\s*(\s*Notes\s*;)\s*/i)
+  end
+  class FileLbl < Token
+    @regexp = Regexp.new(/\A\s*(\s*File\s*)\s*/i)
+  end
+  # label and content
+  class Title < Token
+    @regexp = Regexp.new(/\A\s*(title[^\;]*;)\s*/i)
+  end
+  class Dimensions < Token
+    @regexp = Regexp.new(/\A\s*(DIMENSIONS)\s*/i)
+  end
+  class Format < Token
+    @regexp = Regexp.new(/\A\s*(format)\s*/i)
+  end
+  # label
+  class Taxlabels < Token
+     @regexp = Regexp.new(/\A\s*(\s*taxlabels\s*)\s*/i)
+  end
+  # same as ID
+  class Label < Token
+    @regexp = Regexp.new('\A\s*((\'+[^\']+\'+)|(\"+[^\"]+\"+)|(\w[^,:(); \t\n]*|_)+)\s*') #  matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" " # choking on 'Foo_stuff_things'
+    def initialize(str)
+      str.strip!
+      str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
+      str = str[1..-2] if str[0..0] == '"'
+      str.strip!
+      @value = str
+    end
+  end
+  class ChrsBlk < Token
+    @regexp = Regexp.new(/\A\s*(characters\s*;)\s*/i)
+  end
+  # note we grab EOL and ; here
+  class ValuePair < Token
+    @regexp = Regexp.new(/\A\s*([\w\d\_\&]+\s*=\s*((\'[^\']+\')|(\(.*\))|(\"[^\"]+\")|([^\s\n\t;]+)))[\s\n\t;]+/i) #  returns key => value hash for tokens like 'foo=bar' or foo = 'b a ar'
+    def initialize(str)
+      str.strip!
+      str = str.split(/=/)
+      str[1].strip!
+      str[1] = str[1][1..-2] if str[1][0..0] == "'"
+      str[1] = str[1][1..-2] if str[1][0..0] ==  "\""
+      @value = {str[0].strip.downcase.to_sym => str[1].strip}
+    end
+  end
+  class Matrix < Token
+    @regexp = Regexp.new(/\A\s*(matrix)\s*/i)
+  end
+  class RowVec < Token
+    @regexp = Regexp.new(/\A\s*(.+)\s*\n/i)
+     def initialize(str)
+       # meh! Ruby is simpler to read than Perl?
+       # handles both () and {} style multistates
+       s = str.split(/\(|\)|\}|\{/).collect{|s| s=~ /[\,|\s]/ ? s.split(/[\,|\s]/) : s}.inject([]){|sum, x| x.class == Array ? sum << x.delete_if {|y| y == "" } : sum + x.strip.split(//)}
+      @value = s
+    end
+  end
+  class CharStateLabels < Token
+    @regexp = Regexp.new(/\A\s*(CHARSTATELABELS)\s*/i)
+  end
+  class MesquiteIDs < Token
+    @regexp = Regexp.new(/\A\s*(IDS[^;]*;)\s*/i)
+  end
+  class MesquiteBlockID < Token
+    @regexp = Regexp.new(/\A\s*(BLOCKID[^;]*;)\s*/i)
+  end
+  # unparsed blocks
+  class TreesBlk < Token
+    @regexp = Regexp.new(/\A\s*(trees;.*?END;)\s*/im) # note the multi-line /m
+  end
+  class SetsBlk < Token
+    @regexp = Regexp.new(/\A\s*(sets;.*?END;)\s*/im)
+  end
+  class MqCharModelsBlk < Token
+    @regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS;.*?END;)\s*/im)
+  end
+  class LabelsBlk < Token
+    @regexp = Regexp.new(/\A\s*(LABELS;.*?END;)\s*/im)
+  end
+  class AssumptionsBlk < Token
+    @regexp = Regexp.new(/\A\s*(ASSUMPTIONS;.*?END;)\s*/im)
+  end
+  class CodonsBlk < Token
+    @regexp = Regexp.new(/\A\s*(CODONS;.*?END;)\s*/im)
+  end
+  class MesquiteBlk < Token
+    @regexp = Regexp.new(/\A\s*(Mesquite;.*?END;)\s*/im)
+  end
+  class BlkEnd < Token
+    @regexp = Regexp.new(/\A[\s\n]*(END;)\s*/i)
+  end
+  class LBracket < Token
+    @regexp = Regexp.new('\A\s*(\[)\s*')
+  end
+  class RBracket < Token
+    @regexp = Regexp.new('\A\s*(\])\s*')
+  end
+  class LParen < Token
+      @regexp = Regexp.new('\A\s*(\()\s*')
+  end
+  class RParen < Token
+    @regexp = Regexp.new('\A\s*(\))\s*')
+  end
+  class Equals < Token
+    @regexp = Regexp.new('\A\s*(=)\s*')
+  end
+  class BckSlash < Token
+    @regexp = Regexp.new('\A\s*(\/)\s*')
+  end
+  # labels
+  class ID < Token
+    @regexp = Regexp.new('\A\s*((\'[^\']+\')|(\w[^,:(); \t\n]*|_)+)\s*')
+    def initialize(str)
+      str.strip!
+      str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
+      @value = str
+    end
+  end
+  class Colon < Token
+    @regexp = Regexp.new('\A\s*(:)\s*')
+  end
+  class SemiColon < Token
+    @regexp = Regexp.new('\A\s*(;)\s*')
+  end
+  class Comma < Token
+    @regexp = Regexp.new('\A\s*(\,)\s*')
+  end
+  class Number < Token
+    @regexp = Regexp.new('\A\s*(-?\d+(\.\d+)?([eE][+-]?\d+)?)\s*')
+    def initialize(str)
+      # a little oddness here, in some case we don't want to include the .0
+      # see issues with numbers as labels
+      if str =~ /\./
+        @value = str.to_f
+      else
+        @value = str.to_i
+      end
+    end
+  end
+  # NexusFile::Tokens::NexusComment
+  # this list also defines priority, i.e. if tokens have overlap (which they shouldn't!!) then the earlier indexed token will match first
+  def self.nexus_file_token_list
+    [ NexusFile::Tokens::NexusStart,
+      NexusFile::Tokens::BeginBlk,
+      NexusFile::Tokens::EndBlk,
+      NexusFile::Tokens::AuthorsBlk,
+      NexusFile::Tokens::SetsBlk,
+      NexusFile::Tokens::MqCharModelsBlk,
+      NexusFile::Tokens::AssumptionsBlk,
+      NexusFile::Tokens::CodonsBlk,
+      NexusFile::Tokens::MesquiteBlk,
+      NexusFile::Tokens::TreesBlk,
+      NexusFile::Tokens::LabelsBlk,
+      NexusFile::Tokens::TaxaBlk,
+      NexusFile::Tokens::NotesBlk,
+      NexusFile::Tokens::Title,
+      NexusFile::Tokens::Taxlabels,
+      NexusFile::Tokens::Dimensions,
+      NexusFile::Tokens::FileLbl,
+      NexusFile::Tokens::Format,
+      NexusFile::Tokens::Equals,
+      NexusFile::Tokens::ValuePair,  # this has bad overlap with Label and likely IDs (need to kill the latter, its a lesser Label)
+      NexusFile::Tokens::CharStateLabels,
+      NexusFile::Tokens::ChrsBlk,
+      NexusFile::Tokens::Number,
+      NexusFile::Tokens::Matrix,
+      NexusFile::Tokens::SemiColon,
+      NexusFile::Tokens::MesquiteIDs,
+      NexusFile::Tokens::MesquiteBlockID,
+      NexusFile::Tokens::BlkEnd,
+      NexusFile::Tokens::Colon,
+      NexusFile::Tokens::BckSlash,
+      NexusFile::Tokens::Comma,
+      NexusFile::Tokens::LParen,
+      NexusFile::Tokens::RParen,
+      NexusFile::Tokens::LBracket,
+      NexusFile::Tokens::RBracket,
+      NexusFile::Tokens::Label, # must be before RowVec
+      NexusFile::Tokens::RowVec,
+      NexusFile::Tokens::ID # need to trash this
+    ]
+  end
+end