RubyGems - nexus_parser - Versions diffs - 1.2.0 → 1.2.2 - Mend

nexus_parser 1.2.0 → 1.2.2

Files changed (9) hide show

checksums.yaml +4 -4
data/.gitignore +3 -0
data/lib/nexus_parser/lexer.rb +0 -10
data/lib/nexus_parser/parser.rb +146 -77
data/lib/nexus_parser/tokens.rb +87 -84
data/lib/nexus_parser/version.rb +1 -1
data/lib/nexus_parser.rb +41 -14
data/test/test_nexus_parser.rb +371 -26
metadata +3 -3

data/lib/nexus_parser/tokens.rb CHANGED Viewed

@@ -1,5 +1,8 @@
 module NexusParser::Tokens
+  ENDBLKSTR = '(end|endblock)'.freeze
+  QUOTEDLABEL = '(\'+[^\']+\'+)|(\"+[^\"]+\"+)'
   class Token
     # this allows access to the class attribute regexp, without using a class variable
     class << self; attr_reader :regexp; end
@@ -31,12 +34,12 @@ module NexusParser::Tokens
   end
   class EndBlk < Token
-    @regexp = Regexp.new(/\A\s*([\s]*End[\s]*;[\s]*)/i)
+    @regexp = Regexp.new(/\A\s*([\s]*#{ENDBLKSTR}[\s]*;[\s]*)/i)
   end
   # label
   class AuthorsBlk < Token
-    @regexp = Regexp.new(/\A\s*(Authors;.*?END;)\s*/im)
+    @regexp = Regexp.new(/\A\s*(Authors;.*?#{ENDBLKSTR};)\s*/im)
   end
   # label
@@ -66,14 +69,17 @@ module NexusParser::Tokens
     @regexp = Regexp.new(/\A\s*(format)\s*/i)
   end
+  # TODO: Handled, but ignored
+  class RespectCase < Token
+    @regexp = Regexp.new(/\A\s*(respectcase)\s*/i)
+  end
   # label
   class Taxlabels < Token
     @regexp = Regexp.new(/\A\s*(\s*taxlabels\s*)\s*/i)
   end
-  # same as ID
-  class Label < Token
-    @regexp = Regexp.new('\A\s*((\'+[^\']+\'+)|(\"+[^\"]+\"+)|(\w[^,:(); \t\n]*|_)+)\s*') #  matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" " # choking on 'Foo_stuff_things'
+  class LabelBase < Token
     def initialize(str)
       str.strip!
       str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
@@ -83,6 +89,20 @@ module NexusParser::Tokens
     end
   end
+  class Label < LabelBase
+    @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|(\w[^,:(); \t\n]*)+)\s*/) #  matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" "
+    def initialize(str)
+      super(str)
+    end
+  end
+  class CharacterLabel < LabelBase
+    @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|[^ \t\n\/\'\",;]+)\s*/)
+    def initialize(str)
+      super(str)
+    end
+  end
   class ChrsBlk < Token
     @regexp = Regexp.new(/\A\s*(characters\s*;)\s*/i)
   end
@@ -111,10 +131,50 @@ module NexusParser::Tokens
   class RowVec < Token
     @regexp = Regexp.new(/\A\s*(.+)\s*\n/i)
     def initialize(str)
-      # meh! Ruby is simpler to read than Perl?
-      # handles both () and {} style multistates
-      s = str.split(/\(|\)|\}|\{/).collect{|s| s=~ /[\,|\s]/ ? s.split(/[\,|\s]/) : s}.inject([]){|sum, x| x.class == Array ? sum << x.delete_if {|y| y == "" } : sum + x.strip.split(//)}
-      @value = s
+      # We ignore commas outside (and inside) of groupings, it's fine.
+      str.gsub!(/[\, \t]/, '')
+      groupers = ['(', ')', '{', '}']
+      openers = ['(', '{']
+      closers = [')', '}']
+      closer_for = { '(' => ')', '{' => '}' }
+      a = []
+      group = nil
+      group_closer = nil
+      str.each_char { |c|
+        if groupers.include? c
+          if ((openers.include?(c) && !group.nil?) ||
+            (closers.include?(c) && (group.nil? || c != group_closer)))
+            raise(NexusParser::ParseError,
+              "Mismatched grouping in matrix row '#{str}'")
+          end
+          if openers.include? c
+            group = []
+            group_closer = closer_for[c]
+          else # c is a closer
+            if group.count == 1
+              a << group.first
+            elsif group.count > 1
+              a << group
+            end
+            group = nil
+            group_closer = nil
+          end
+        else
+          if group.nil?
+            a << c
+          else
+            group << c
+          end
+        end
+      }
+      raise(NexusParser::ParseError,
+        "Unclosed grouping in matrix row '#{str}'") if !group.nil?
+      @value = a
     end
   end
@@ -122,6 +182,14 @@ module NexusParser::Tokens
     @regexp = Regexp.new(/\A\s*(CHARSTATELABELS)\s*/i)
   end
+  class CharLabels < Token
+    @regexp = Regexp.new(/\A\s*(CHARLABELS)\s*/i)
+  end
+  class StateLabels < Token
+    @regexp = Regexp.new(/\A\s*(STATELABELS)\s*/i)
+  end
   class MesquiteIDs < Token
     @regexp = Regexp.new(/\A\s*(IDS[^;]*;)\s*/i)
   end
@@ -133,35 +201,35 @@ module NexusParser::Tokens
   # unparsed blocks
   class TreesBlk < Token
-    @regexp = Regexp.new(/\A\s*(trees;.*?END;)\s*/im) # note the multi-line /m
+    @regexp = Regexp.new(/\A\s*(trees;.*?#{ENDBLKSTR};)\s*/im) # note the multi-line /m
   end
   class SetsBlk < Token
-    @regexp = Regexp.new(/\A\s*(sets;.*?END;)\s*/im)
+    @regexp = Regexp.new(/\A\s*(sets;.*?#{ENDBLKSTR};)\s*/im)
   end
   class MqCharModelsBlk < Token
-    @regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS;.*?END;)\s*/im)
+    @regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS;.*?#{ENDBLKSTR};)\s*/im)
   end
   class LabelsBlk < Token
-    @regexp = Regexp.new(/\A\s*(LABELS;.*?END;)\s*/im)
+    @regexp = Regexp.new(/\A\s*(LABELS;.*?#{ENDBLKSTR};)\s*/im)
   end
   class AssumptionsBlk < Token
-    @regexp = Regexp.new(/\A\s*(ASSUMPTIONS;.*?END;)\s*/im)
+    @regexp = Regexp.new(/\A\s*(ASSUMPTIONS;.*?#{ENDBLKSTR};)\s*/im)
   end
   class CodonsBlk < Token
-    @regexp = Regexp.new(/\A\s*(CODONS;.*?END;)\s*/im)
+    @regexp = Regexp.new(/\A\s*(CODONS;.*?#{ENDBLKSTR};)\s*/im)
   end
   class MesquiteBlk < Token
-    @regexp = Regexp.new(/\A\s*(Mesquite;.*?END;)\s*/im)
+    @regexp = Regexp.new(/\A\s*(Mesquite;.*?#{ENDBLKSTR};)\s*/im)
   end
   class BlkEnd < Token
-    @regexp = Regexp.new(/\A[\s]*(END;)\s*/i)
+    @regexp = Regexp.new(/\A[\s]*(#{ENDBLKSTR};)\s*/i)
   end
   class LBracket < Token
@@ -188,16 +256,6 @@ module NexusParser::Tokens
     @regexp = Regexp.new('\A\s*(\/)\s*')
   end
-  # labels
-  class ID < Token
-    @regexp = Regexp.new('\A\s*((\'[^\']+\')|(\w[^,:(); \t\n]*|_)+)\s*')
-    def initialize(str)
-      str.strip!
-      str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
-      @value = str
-    end
-  end
   class Colon < Token
     @regexp = Regexp.new('\A\s*(:)\s*')
   end
@@ -210,65 +268,10 @@ module NexusParser::Tokens
     @regexp = Regexp.new('\A\s*(\,)\s*')
   end
-  class Number < Token
-    @regexp = Regexp.new('\A\s*(-?\d+(\.\d+)?([eE][+-]?\d+)?)\s*')
-    def initialize(str)
-      # a little oddness here, in some case we don't want to include the .0
-      # see issues with numbers as labels
-      if str =~ /\./
-        @value = str.to_f
-      else
-        @value = str.to_i
-      end
-    end
+  class PositiveInteger < Token
+    @regexp = Regexp.new('\A\s*(\d+)\s*')
   end
   # NexusParser::Tokens::NexusComment
-  # this list also defines priority, i.e. if tokens have overlap (which they shouldn't!!) then the earlier indexed token will match first
-  def self.nexus_file_token_list
-    [ NexusParser::Tokens::NexusStart,
-      NexusParser::Tokens::BeginBlk,
-      NexusParser::Tokens::EndBlk,
-      NexusParser::Tokens::AuthorsBlk,
-      NexusParser::Tokens::SetsBlk,
-      NexusParser::Tokens::MqCharModelsBlk,
-      NexusParser::Tokens::AssumptionsBlk,
-      NexusParser::Tokens::CodonsBlk,
-      NexusParser::Tokens::MesquiteBlk,
-      NexusParser::Tokens::TreesBlk,
-      NexusParser::Tokens::LabelsBlk,
-      NexusParser::Tokens::TaxaBlk,
-      NexusParser::Tokens::NotesBlk,
-      NexusParser::Tokens::Title,
-      NexusParser::Tokens::Taxlabels,
-      NexusParser::Tokens::Dimensions,
-      NexusParser::Tokens::FileLbl,
-      NexusParser::Tokens::Format,
-      NexusParser::Tokens::Equals,
-      NexusParser::Tokens::ValuePair,  # this has bad overlap with Label and likely IDs (need to kill the latter, its a lesser Label)
-      NexusParser::Tokens::CharStateLabels,
-      NexusParser::Tokens::ChrsBlk,
-      NexusParser::Tokens::Number,
-      NexusParser::Tokens::Matrix,
-      NexusParser::Tokens::SemiColon,
-      NexusParser::Tokens::MesquiteIDs,
-      NexusParser::Tokens::MesquiteBlockID,
-      NexusParser::Tokens::BlkEnd,
-      NexusParser::Tokens::Colon,
-      NexusParser::Tokens::BckSlash,
-      NexusParser::Tokens::Comma,
-      NexusParser::Tokens::LParen,
-      NexusParser::Tokens::RParen,
-      NexusParser::Tokens::LBracket,
-      NexusParser::Tokens::RBracket,
-      NexusParser::Tokens::Label, # must be before RowVec
-      NexusParser::Tokens::RowVec,
-      NexusParser::Tokens::LinkLine,
-      NexusParser::Tokens::ID # need to trash this
-    ]
-  end
 end

data/lib/nexus_parser/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module NexusParser
-  VERSION = "1.2.0"
+  VERSION = "1.2.2"
 end

data/lib/nexus_parser.rb CHANGED Viewed

@@ -3,9 +3,6 @@
 # uses the PhyloTree parser/lexer engine by Krishna Dole which in turn was based on
 # Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library
-# outstanding issues:
-## need to resolve Tokens Labels, ValuePair, IDs
 module NexusParser
   require File.expand_path(File.join(File.dirname(__FILE__), 'nexus_parser', 'tokens'))
@@ -75,7 +72,7 @@ class NexusParser
   class Coding
     # unfortunately we need this for notes
     attr_accessor :notes
-    attr_writer :state
+    attr_writer :state
     def initialize(options = {})
       @states = options[:states]
@@ -85,7 +82,7 @@ class NexusParser
     def states
       @states.class == Array ? @states : [@states]
     end
   end
   class Note
@@ -118,7 +115,7 @@ class NexusParser
     end
   end
-end
+end # end NexusParser
 # constructs the NexusParser
@@ -141,6 +138,9 @@ class Builder
   def code_row(taxon_index, rowvector)
     @nf.characters.each_with_index do |c, i|
+      raise(ParseError,
+        "Row #{taxon_index} of the matrix is too short") if rowvector[i].nil?
       @nf.codings[taxon_index.to_i] = [] if !@nf.codings[taxon_index.to_i]
       @nf.codings[taxon_index.to_i][i] = NexusParser::Coding.new(:states => rowvector[i])
@@ -185,7 +185,7 @@ class Builder
     # need to create the characters
-    raise(NexusParser::ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
+    raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
     (@nf.characters[@index].name = @opt[:name]) if @opt[:name]
@@ -193,18 +193,45 @@ class Builder
     @opt.delete(:name)
     # the rest have states
-    @opt.keys.each do |k|
+    create_or_update_states_for_character(@index, @opt)
+  end
+  def update_chr_name(i, name)
+    raise(ParseError, "There are #{@nf.characters.count} characters but we're trying to update from row #{i + 1} of the CHARLABELS list - check your NCHAR and/or the length of your list.") if !@nf.characters[i]
+    # The CHARLABELS list is unindexed, so users are allowed to use '_' to
+    # indicate that a character name is unspecified.
+    @nf.characters[i].name = (name == '_' ? '' : name)
+  end
+  # legal hash keys are :index and integers that point to state labels
+  def update_chr_states(options = {})
+    return false if !options[:index]
+    @opt = options
+    @index = @opt[:index].to_i
+    raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the STATELABELS. Check the indices.") if !@nf.characters[@index]
+    @opt.delete(:index)
+    # the rest have states
+    create_or_update_states_for_character(@index, @opt)
+  end
-      if (@nf.characters[@index].states != {}) && @nf.characters[@index].states[k] # state exists
+  def create_or_update_states_for_character(i, options)
+    options.keys.each do |k|
+      if (@nf.characters[i].states != {}) && @nf.characters[i].states[k] # state exists
         ## !! ONLY HANDLES NAME, UPDATE TO HANDLE notes etc. when we get them ##
-        update_state(@index, :index => k, :name => @opt[k])
+        update_state(i, :index => k, :name => options[k])
       else # doesn't, create it
-        @nf.characters[@index].add_state(:label => k.to_s, :name => @opt[k])
+        @nf.characters[i].add_state(:label => k.to_s, :name => options[k])
       end
     end
   end
   def update_state(chr_index, options = {})
@@ -256,7 +283,7 @@ class Builder
     @nf
   end
-end # end file
+end # end Builder
   # NexusParser::ParseError
   class ParseError < StandardError
@@ -270,7 +297,7 @@ def parse_nexus_file(input)
   @input = input
   @input.gsub!(/\[[^\]]*\]/,'')  # strip out all comments BEFORE we parse the file
   # quickly peek at the input, does this look like a Nexus file?
-  if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /end\;/i)
+  if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /(end|endblock)\;/i)
     raise(NexusParser::ParseError, "File is missing at least some required headers, check formatting.", caller)
   end