RubyGems - nexus_parser - Versions diffs - 1.2.1 → 1.2.2 - Mend

nexus_parser 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/lib/nexus_parser/lexer.rb +0 -10
data/lib/nexus_parser/parser.rb +142 -75
data/lib/nexus_parser/tokens.rb +70 -75
data/lib/nexus_parser/version.rb +1 -1
data/lib/nexus_parser.rb +38 -11
data/test/test_nexus_parser.rb +347 -20
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: be7f8d6dc4a222f456df1bb18dc3d63182cfb83b88ee036c227a93883c5ff70a
-  data.tar.gz: 1ab8785c3ca791476efe19d290ef25f20dc790792cf82fdee4ab1a0cd7468347
+  metadata.gz: 4229e2b23de12e3ef92bc88a83aa04805d3884ca09019aaab843846f58fef964
+  data.tar.gz: 7973b5f04b84eea945ce632e5b20844a82c02a9a90c3c18a5ae4bbdaa97376c8
 SHA512:
-  metadata.gz: e2c206656a9c8a1760e158641923b47f789ef42156fd9486dd75f70f92db670f31308adf739355eca6192ae3c702f6868b04fcab8ab87e7e44590371b3838bf7
-  data.tar.gz: fb4a0f18b0430dc04aa4feebd9d9ea46fb91b8557bc61efe7d75d4ad4427da1a0fa9f0632a3afe074fe5e47c17b18f9cd6a58786833b6c07a7181f819cece0d8
+  metadata.gz: cd2739e8dcf4b84287f325a6443227b0b669a45f38c23a20f32bf30cfe88ac7eb34b5a6af6b0929c9af7b55c21b9096e683543528858527920dccfadff10d425
+  data.tar.gz: 40780dadb8ddc80554ca199e6ea9f0ffb5672db51c66c1b41390a25cf4f4c39e2b27799f644a293135b848b9feb3af0fc4cab338e5fe7f40ba670dcaad384965

data/lib/nexus_parser/lexer.rb CHANGED Viewed

@@ -33,18 +33,8 @@ class NexusParser::Lexer
     if @next_token
       return @next_token
     else
-      # check for a match on the specified class first
       if match(token_class)
         return @next_token
-      else
-        # now check all the tokens for a match
-        NexusParser::Tokens.nexus_file_token_list.each {|t|
-          return @next_token if match(t)
-        }
-      end
-      # no match, either end of string or lex-error
-      if @input != ''
-        raise( NexusParser::ParseError, "Lex Error, unknown token at #{@input[0..10]}...", caller)
       else
         return nil
       end

data/lib/nexus_parser/parser.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 class NexusParser::Parser
   def initialize(lexer, builder)
     @lexer = lexer
     @builder = builder
@@ -10,41 +10,41 @@ class NexusParser::Parser
     # nf = @builder.new_nexus_file # create new local NexusParser instance, nf
     # blks = []
     @lexer.pop(NexusParser::Tokens::NexusStart)
     while @lexer.peek(NexusParser::Tokens::BeginBlk)
       @lexer.pop(NexusParser::Tokens::BeginBlk) # pop it
       if @lexer.peek(NexusParser::Tokens::AuthorsBlk)
         parse_authors_blk
-      # we parse these below
+      # we parse these below
       elsif @lexer.peek(NexusParser::Tokens::TaxaBlk)
         @lexer.pop(NexusParser::Tokens::TaxaBlk )
         parse_taxa_blk
       elsif @lexer.peek(NexusParser::Tokens::ChrsBlk)
         @lexer.pop(NexusParser::Tokens::ChrsBlk)
         parse_characters_blk
       elsif @lexer.peek(NexusParser::Tokens::NotesBlk)
-        @lexer.pop(NexusParser::Tokens::NotesBlk)
+        @lexer.pop(NexusParser::Tokens::NotesBlk)
         parse_notes_blk
       # we should parse this
       elsif @lexer.peek(NexusParser::Tokens::SetsBlk)
         @lexer.pop(NexusParser::Tokens::SetsBlk)
-      # we don't parse these
+      # we don't parse these
       elsif @lexer.peek(NexusParser::Tokens::TreesBlk)
         @foo =  @lexer.pop(NexusParser::Tokens::TreesBlk).value
       elsif @lexer.peek(NexusParser::Tokens::LabelsBlk)
         @lexer.pop(NexusParser::Tokens::LabelsBlk)
       elsif @lexer.peek(NexusParser::Tokens::MqCharModelsBlk)
-        @lexer.pop(NexusParser::Tokens::MqCharModelsBlk)
+        @lexer.pop(NexusParser::Tokens::MqCharModelsBlk)
       elsif @lexer.peek(NexusParser::Tokens::AssumptionsBlk)
         @lexer.pop(NexusParser::Tokens::AssumptionsBlk)
@@ -52,7 +52,7 @@ class NexusParser::Parser
       elsif @lexer.peek(NexusParser::Tokens::CodonsBlk)
         @lexer.pop(NexusParser::Tokens::CodonsBlk)
       end
     end
   end
@@ -70,15 +70,15 @@ class NexusParser::Parser
      #   while @lexer.peek(NexusParser::Tokens::ValuePair)
      #     # IMPORTANT, these are going to a general hash, there may ultimately be overlap of keys used in different blocks, this is ignored at present
-     #     @builder.add_var(@lexer.pop(NexusParser::Tokens::ValuePair).value)
+     #     @builder.add_var(@lexer.pop(NexusParser::Tokens::ValuePair).value)
      #   end
         #@lexer.pop(NexusParser::Tokens::ID) if @lexer.peek(NexusParser::Tokens::ID)
      # end
     #end
   end
-  def parse_taxa_blk
+  def parse_taxa_blk
     @lexer.pop(NexusParser::Tokens::Title) if @lexer.peek(NexusParser::Tokens::Title)
     # need to not ignore to test against
@@ -88,7 +88,7 @@ class NexusParser::Parser
     while true
       inf += 1
       raise(NexusParser::ParseError,"Either you have a gazillion taxa or more likely the parser is caught in an infinite loop trying to parser taxon labels. Check for double single quotes in this block.") if inf > 100000
       if @lexer.peek(NexusParser::Tokens::EndBlk)
         @lexer.pop(NexusParser::Tokens::EndBlk)
         break
@@ -98,51 +98,53 @@ class NexusParser::Parser
           @lexer.pop(NexusParser::Tokens::Taxlabels) if @lexer.peek(NexusParser::Tokens::Taxlabels)
           i = 0
           while @lexer.peek(NexusParser::Tokens::Label)
-            @builder.update_taxon(:index => i, :name => @lexer.pop(NexusParser::Tokens::Label).value)
+            @builder.update_taxon(:index => i, :name => @lexer.pop(NexusParser::Tokens::Label).value)
             i += 1
-          end
+          end
           @lexer.pop(NexusParser::Tokens::SemiColon) if @lexer.peek(NexusParser::Tokens::SemiColon) # close of tax labels, placement of this seems dubious... but tests are working
         elsif  @lexer.peek(NexusParser::Tokens::MesquiteIDs)
           @lexer.pop(NexusParser::Tokens::MesquiteIDs) # trashing these for now
         elsif  @lexer.peek(NexusParser::Tokens::MesquiteBlockID)
-          @lexer.pop(NexusParser::Tokens::MesquiteBlockID)
+          @lexer.pop(NexusParser::Tokens::MesquiteBlockID)
         end
       end
     end
   end
-  def parse_characters_blk
-    inf = 0
+  def parse_characters_blk
+    inf = 0
     while true
       inf += 1
       raise(NexusParser::ParseError,"Either you have a gazillion characters or more likely the parser is caught in an infinite loop trying to parser character data. Check for double single quotes in this block.") if inf > 100000
       if @lexer.peek(NexusParser::Tokens::EndBlk) # we're at the end of the block, exit after geting rid of the semi-colon
-        break
+        break
       else
         @lexer.pop(NexusParser::Tokens::Title) if @lexer.peek(NexusParser::Tokens::Title) # not used at present
         @lexer.pop(NexusParser::Tokens::LinkLine) if @lexer.peek(NexusParser::Tokens::LinkLine) # trashing these for now
         parse_dimensions if @lexer.peek(NexusParser::Tokens::Dimensions)
-        parse_format if @lexer.peek(NexusParser::Tokens::Format)
+        parse_format if @lexer.peek(NexusParser::Tokens::Format)
         parse_chr_state_labels if @lexer.peek(NexusParser::Tokens::CharStateLabels)
-        parse_matrix if @lexer.peek(NexusParser::Tokens::Matrix)
+        parse_chr_labels if @lexer.peek(NexusParser::Tokens::CharLabels)
+        parse_state_labels if @lexer.peek(NexusParser::Tokens::StateLabels)
+        parse_matrix if @lexer.peek(NexusParser::Tokens::Matrix)
         # handle "\s*OPTIONS MSTAXA = UNCERTAIN;\s\n" within a characters block (sticks in an infinite loop right now)
         @lexer.pop(NexusParser::Tokens::MesquiteIDs) if @lexer.peek(NexusParser::Tokens::MesquiteIDs) # trashing these for now
         @lexer.pop(NexusParser::Tokens::MesquiteBlockID) if @lexer.peek(NexusParser::Tokens::MesquiteBlockID) # trashing these for now
-        false
       end
     end
     @lexer.pop(NexusParser::Tokens::EndBlk)
@@ -150,7 +152,7 @@ class NexusParser::Parser
   # prolly pop header then fuse with parse_dimensions
   def parse_format
-    @lexer.pop(NexusParser::Tokens::Format)
+    @lexer.pop(NexusParser::Tokens::Format)
     while @lexer.peek(NexusParser::Tokens::ValuePair) || @lexer.peek(NexusParser::Tokens::RespectCase)
       @lexer.pop(NexusParser::Tokens::RespectCase) if @lexer.peek(NexusParser::Tokens::RespectCase) # !! TODO: nothing is set, respect case is ignored
@@ -160,13 +162,13 @@ class NexusParser::Parser
     check_initialization_of_ntax_nchar
   end
-  def parse_dimensions
+  def parse_dimensions
     @lexer.pop(NexusParser::Tokens::Dimensions)
     while @lexer.peek(NexusParser::Tokens::ValuePair)
       @builder.add_var(@lexer.pop(NexusParser::Tokens::ValuePair).value)
     end
     # the last value pair with a ; is automagically handled, don't try popping it again
     check_initialization_of_ntax_nchar
   end
@@ -175,7 +177,7 @@ class NexusParser::Parser
     if @builder.nexus_file.vars[:nchar] && @builder.nexus_file.characters == []
       (0..(@builder.nexus_file.vars[:nchar].to_i - 1)).each {|i| @builder.stub_chr }
     end
     # check for taxa dimensions, if otherwise not set generate them
     if @builder.nexus_file.vars[:ntax] && @builder.nexus_file.taxa == []
       (0..(@builder.nexus_file.vars[:ntax].to_i - 1)).each {|i| @builder.stub_taxon }
@@ -184,45 +186,108 @@ class NexusParser::Parser
   def parse_chr_state_labels
     @lexer.pop(NexusParser::Tokens::CharStateLabels)
-    inf = 0
+    inf = 0
     while true
       inf += 1
       raise(NexusParser::ParseError,"Either you have a gazillion character state labels or more likely the parser is caught in an infinite loop while trying to parser character state labels. Check for double single quotes in this block.") if inf > 100000
-      if @lexer.peek(NexusParser::Tokens::SemiColon)
-        break
+      if @lexer.peek(NexusParser::Tokens::SemiColon)
+        break
       else
         opts = {}
         name = ""
-        index = @lexer.pop(NexusParser::Tokens::Number).value.to_i
-        (name = @lexer.pop(NexusParser::Tokens::Label).value) if @lexer.peek(NexusParser::Tokens::Label) # not always given a letter
+        index = @lexer.pop(NexusParser::Tokens::PositiveInteger).value.to_i
+        (name = @lexer.pop(NexusParser::Tokens::CharacterLabel).value) if @lexer.peek(NexusParser::Tokens::CharacterLabel) # not always given a letter
         @lexer.pop(NexusParser::Tokens::BckSlash) if @lexer.peek(NexusParser::Tokens::BckSlash)
         if !@lexer.peek(NexusParser::Tokens::Comma) || !@lexer.peek(NexusParser::Tokens::SemiColon)
           i = 0
-          # three kludge lines, need to figure out the label/number priority, could be issue in list order w/in tokens
-          while @lexer.peek(NexusParser::Tokens::Label) || @lexer.peek(NexusParser::Tokens::Number)
-            opts.update({i.to_s => @lexer.pop(NexusParser::Tokens::Label).value}) if @lexer.peek(NexusParser::Tokens::Label)
-            opts.update({i.to_s => @lexer.pop(NexusParser::Tokens::Number).value.to_s}) if @lexer.peek(NexusParser::Tokens::Number)
+          while @lexer.peek(NexusParser::Tokens::CharacterLabel)
+            opts.update({
+              i.to_s => @lexer.pop(NexusParser::Tokens::CharacterLabel).value
+            })
             i += 1
-          end
+          end
         end
         @lexer.pop(NexusParser::Tokens::Comma) if @lexer.peek(NexusParser::Tokens::Comma) # we may also have hit semicolon
         opts.update({:index => (index - 1), :name => name})
         raise(NexusParser::ParseError, "Error parsing character state labels for (or around) character #{index - 1}.") if !opts[:name]
         @builder.update_chr(opts)
-      end
+      end
     end
-    @lexer.pop(NexusParser::Tokens::SemiColon)
+    @lexer.pop(NexusParser::Tokens::SemiColon)
+  end
+  def parse_chr_labels
+    @lexer.pop(NexusParser::Tokens::CharLabels)
+    inf = 0
+    while true
+      inf += 1
+      raise(NexusParser::ParseError,"Either you have a gazillion character labels or more likely the parser is caught in an infinite loop while trying to parse character labels. Check for double single quotes in this block.") if inf > 100000
+      if @lexer.peek(NexusParser::Tokens::SemiColon)
+        break
+      else
+        i = 0
+        while @lexer.peek(NexusParser::Tokens::CharacterLabel)
+          @builder.update_chr_name(
+            i, @lexer.pop(NexusParser::Tokens::CharacterLabel).value
+          )
+          i += 1
+        end
+      end
+    end
+    @lexer.pop(NexusParser::Tokens::SemiColon)
+  end
+  def parse_state_labels
+    @lexer.pop(NexusParser::Tokens::StateLabels)
+    inf = 0
+    while true
+      inf += 1
+      raise(NexusParser::ParseError,"Either you have a gazillion state labels or more likely the parser is caught in an infinite loop while trying to parse state labels. Check for double single quotes in this block.") if inf > 100000
+      if @lexer.peek(NexusParser::Tokens::SemiColon)
+        break
+      else
+        opts = {}
+        index = @lexer.pop(NexusParser::Tokens::PositiveInteger).value.to_i
+        if !@lexer.peek(NexusParser::Tokens::Comma) && !@lexer.peek(NexusParser::Tokens::SemiColon)
+          i = 0
+          while @lexer.peek(NexusParser::Tokens::CharacterLabel)
+            opts.update({
+              i.to_s => @lexer.pop(NexusParser::Tokens::CharacterLabel).value
+            })
+            i += 1
+          end
+        end
+        @lexer.pop(NexusParser::Tokens::Comma) if @lexer.peek(NexusParser::Tokens::Comma) # we may also have hit semicolon
+        opts.update({:index => (index - 1)})
+        @builder.update_chr_states(opts)
+      end
+    end
+    @lexer.pop(NexusParser::Tokens::SemiColon)
   end
   def parse_matrix
@@ -230,25 +295,25 @@ class NexusParser::Parser
     i = 0
       while true
         if @lexer.peek(NexusParser::Tokens::SemiColon)
-         break
+         break
         else
           t = @lexer.pop(NexusParser::Tokens::Label).value
           @builder.update_taxon(:index => i, :name => t) # if it exists its not re-added
           @builder.code_row(i, @lexer.pop(NexusParser::Tokens::RowVec).value)
           i += 1
         end
       end
-    @lexer.pop(NexusParser::Tokens::SemiColon) # pop the semicolon
+    @lexer.pop(NexusParser::Tokens::SemiColon) # pop the semicolon
   end
   # this suck(s/ed), it needs work when a better API for Mesquite comes out
   def parse_notes_blk
     # IMPORTANT - we don't parse the (CM <note>), we just strip the "(CM" ... ")" bit for now in NexusParser::Note
-    @vars = {}
+    @vars = {}
     inf = 0 # a crude iteration checker
     while true
       inf += 1
@@ -261,18 +326,20 @@ class NexusParser::Parser
         if @lexer.peek(NexusParser::Tokens::ValuePair)
           @vars.update(@lexer.pop(NexusParser::Tokens::ValuePair).value)
-        elsif @lexer.peek(NexusParser::Tokens::Label)
-          if @vars[:type] # we have the data for this row write it, and start a new one
+        elsif @lexer.peek(NexusParser::Tokens::FileLbl)
+          @lexer.pop(NexusParser::Tokens::FileLbl)
+          @vars.update(:file => 'file') # we check for whether :file key is present and handle conditionally
+        else @lexer.peek(NexusParser::Tokens::Label)
+          # If we already have a :type set then the Label we just peeked starts a
+          # new row, so write the current one and then start a new one.
+          if @vars[:type]
             @builder.add_note(@vars)
             @vars = {}
-          else
-            @vars.update(:type => @lexer.pop(NexusParser::Tokens::Label).value)
           end
-        elsif @lexer.peek(NexusParser::Tokens::FileLbl)
-          @lexer.pop(NexusParser::Tokens::FileLbl)
-          @vars.update(:file => 'file') # we check for whether :file key is present and handle conditionally
+          @vars.update(:type => @lexer.pop(NexusParser::Tokens::Label).value)
         end
       end
     end
@@ -280,9 +347,9 @@ class NexusParser::Parser
     #@vars = {}
     #while true
-    #  break if  @lexer.peek(NexusParser::Tokens::EndBlk)
+    #  break if  @lexer.peek(NexusParser::Tokens::EndBlk)
     #  @vars.update(:type => @lexer.pop(NexusParser::Tokens::Label).value)
       # kludge to get around the funny construct that references file
@@ -293,11 +360,11 @@ class NexusParser::Parser
      #   while true
-     #     meh = @lexer.pop(NexusParser::Tokens::ValuePair)
+     #     meh = @lexer.pop(NexusParser::Tokens::ValuePair)
      #     @vars.update(meh.value)
      #     break if !@lexer.peek(NexusParser::Tokens::ValuePair)
      #   end
-     #
+     #
      #   @builder.add_note(@vars)
      #   @vars = {}
     #end
@@ -326,7 +393,7 @@ class NexusParser::Parser
     # nor this
   end
   def parse_mesquite_blk
   end
@@ -335,7 +402,7 @@ class NexusParser::Parser
   # def parse_children(parent)
   # parse a comma-separated list of nodes
-  #  while true
+  #  while true
   #    parse_node(parent)
   #    if @lexer.peek(NexusParser::Tokens::Comma)
   #      @lexer.pop(NexusParser::Tokens::Comma)
@@ -344,7 +411,7 @@ class NexusParser::Parser
   #    end
   #  end
   # end
 end

data/lib/nexus_parser/tokens.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 module NexusParser::Tokens
   ENDBLKSTR = '(end|endblock)'.freeze
+  QUOTEDLABEL = '(\'+[^\']+\'+)|(\"+[^\"]+\"+)'
   class Token
     # this allows access the the class attribute regexp, without using a class variable
@@ -78,9 +79,7 @@ module NexusParser::Tokens
     @regexp = Regexp.new(/\A\s*(\s*taxlabels\s*)\s*/i)
   end
-  # same as ID
-  class Label < Token
-    @regexp = Regexp.new('\A\s*((\'+[^\']+\'+)|(\"+[^\"]+\"+)|(\w[^,:(); \t\n]*|_)+)\s*') #  matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" " # choking on 'Foo_stuff_things'
+  class LabelBase < Token
     def initialize(str)
       str.strip!
       str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
@@ -90,6 +89,20 @@ module NexusParser::Tokens
     end
   end
+  class Label < LabelBase
+    @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|(\w[^,:(); \t\n]*)+)\s*/) #  matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" "
+    def initialize(str)
+      super(str)
+    end
+  end
+  class CharacterLabel < LabelBase
+    @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|[^ \t\n\/\'\",;]+)\s*/)
+    def initialize(str)
+      super(str)
+    end
+  end
   class ChrsBlk < Token
     @regexp = Regexp.new(/\A\s*(characters\s*;)\s*/i)
   end
@@ -118,10 +131,50 @@ module NexusParser::Tokens
   class RowVec < Token
     @regexp = Regexp.new(/\A\s*(.+)\s*\n/i)
     def initialize(str)
-      # meh! Ruby is simpler to read than Perl?
-      # handles both () and {} style multistates
-      s = str.split(/\(|\)|\}|\{/).collect{|s| s=~ /[\,|\s]/ ? s.split(/[\,|\s]/) : s}.inject([]){|sum, x| x.class == Array ? sum << x.delete_if {|y| y == "" } : sum + x.strip.split(//)}
-      @value = s
+      # We ignore commas outside (and inside) of groupings, it's fine.
+      str.gsub!(/[\, \t]/, '')
+      groupers = ['(', ')', '{', '}']
+      openers = ['(', '{']
+      closers = [')', '}']
+      closer_for = { '(' => ')', '{' => '}' }
+      a = []
+      group = nil
+      group_closer = nil
+      str.each_char { |c|
+        if groupers.include? c
+          if ((openers.include?(c) && !group.nil?) ||
+            (closers.include?(c) && (group.nil? || c != group_closer)))
+            raise(NexusParser::ParseError,
+              "Mismatched grouping in matrix row '#{str}'")
+          end
+          if openers.include? c
+            group = []
+            group_closer = closer_for[c]
+          else # c is a closer
+            if group.count == 1
+              a << group.first
+            elsif group.count > 1
+              a << group
+            end
+            group = nil
+            group_closer = nil
+          end
+        else
+          if group.nil?
+            a << c
+          else
+            group << c
+          end
+        end
+      }
+      raise(NexusParser::ParseError,
+        "Unclosed grouping in matrix row '#{str}'") if !group.nil?
+      @value = a
     end
   end
@@ -129,6 +182,14 @@ module NexusParser::Tokens
     @regexp = Regexp.new(/\A\s*(CHARSTATELABELS)\s*/i)
   end
+  class CharLabels < Token
+    @regexp = Regexp.new(/\A\s*(CHARLABELS)\s*/i)
+  end
+  class StateLabels < Token
+    @regexp = Regexp.new(/\A\s*(STATELABELS)\s*/i)
+  end
   class MesquiteIDs < Token
     @regexp = Regexp.new(/\A\s*(IDS[^;]*;)\s*/i)
   end
@@ -195,16 +256,6 @@ module NexusParser::Tokens
     @regexp = Regexp.new('\A\s*(\/)\s*')
   end
-  # labels
-  class ID < Token
-    @regexp = Regexp.new('\A\s*((\'[^\']+\')|(\w[^,:(); \t\n]*|_)+)\s*')
-    def initialize(str)
-      str.strip!
-      str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
-      @value = str
-    end
-  end
   class Colon < Token
     @regexp = Regexp.new('\A\s*(:)\s*')
   end
@@ -217,66 +268,10 @@ module NexusParser::Tokens
     @regexp = Regexp.new('\A\s*(\,)\s*')
   end
-  class Number < Token
-    @regexp = Regexp.new('\A\s*(-?\d+(\.\d+)?([eE][+-]?\d+)?)\s*')
-    def initialize(str)
-      # a little oddness here, in some case we don't want to include the .0
-      # see issues with numbers as labels
-      if str =~ /\./
-        @value = str.to_f
-      else
-        @value = str.to_i
-      end
-    end
+  class PositiveInteger < Token
+    @regexp = Regexp.new('\A\s*(\d+)\s*')
   end
   # NexusParser::Tokens::NexusComment
-  # this list also defines priority, i.e. if tokens have overlap (which they shouldn't!!) then the earlier indexed token will match first
-  def self.nexus_file_token_list
-    [ NexusParser::Tokens::NexusStart,
-      NexusParser::Tokens::BeginBlk,
-      NexusParser::Tokens::EndBlk,
-      NexusParser::Tokens::AuthorsBlk,
-      NexusParser::Tokens::SetsBlk,
-      NexusParser::Tokens::MqCharModelsBlk,
-      NexusParser::Tokens::AssumptionsBlk,
-      NexusParser::Tokens::CodonsBlk,
-      NexusParser::Tokens::MesquiteBlk,
-      NexusParser::Tokens::TreesBlk,
-      NexusParser::Tokens::LabelsBlk,
-      NexusParser::Tokens::TaxaBlk,
-      NexusParser::Tokens::NotesBlk,
-      NexusParser::Tokens::Title,
-      NexusParser::Tokens::Taxlabels,
-      NexusParser::Tokens::Dimensions,
-      NexusParser::Tokens::FileLbl,
-      NexusParser::Tokens::Format,
-      NexusParser::Tokens::RespectCase,
-      NexusParser::Tokens::Equals,
-      NexusParser::Tokens::ValuePair,  # this has bad overlap with Label and likely IDs (need to kill the latter, its a lesser Label)
-      NexusParser::Tokens::CharStateLabels,
-      NexusParser::Tokens::ChrsBlk,
-      NexusParser::Tokens::Number,
-      NexusParser::Tokens::Matrix,
-      NexusParser::Tokens::SemiColon,
-      NexusParser::Tokens::MesquiteIDs,
-      NexusParser::Tokens::MesquiteBlockID,
-      NexusParser::Tokens::BlkEnd,
-      NexusParser::Tokens::Colon,
-      NexusParser::Tokens::BckSlash,
-      NexusParser::Tokens::Comma,
-      NexusParser::Tokens::LParen,
-      NexusParser::Tokens::RParen,
-      NexusParser::Tokens::LBracket,
-      NexusParser::Tokens::RBracket,
-      NexusParser::Tokens::Label, # must be before RowVec
-      NexusParser::Tokens::RowVec,
-      NexusParser::Tokens::LinkLine,
-      NexusParser::Tokens::ID # need to trash this
-    ]
-  end
 end

data/lib/nexus_parser/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module NexusParser
-  VERSION = "1.2.1"
+  VERSION = "1.2.2"
 end

data/lib/nexus_parser.rb CHANGED Viewed

@@ -3,9 +3,6 @@
 # uses the PhyloTree parser/lexer engine by Krishna Dole which in turn was based on
 # Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library
-# outstanding issues:
-## need to resolve Tokens Labels, ValuePair, IDs
 module NexusParser
   require File.expand_path(File.join(File.dirname(__FILE__), 'nexus_parser', 'tokens'))
@@ -118,7 +115,7 @@ class NexusParser
     end
   end
-end
+end # end NexusParser
 # constructs the NexusParser
@@ -141,6 +138,9 @@ class Builder
   def code_row(taxon_index, rowvector)
     @nf.characters.each_with_index do |c, i|
+      raise(ParseError,
+        "Row #{taxon_index} of the matrix is too short") if rowvector[i].nil?
       @nf.codings[taxon_index.to_i] = [] if !@nf.codings[taxon_index.to_i]
       @nf.codings[taxon_index.to_i][i] = NexusParser::Coding.new(:states => rowvector[i])
@@ -185,7 +185,7 @@ class Builder
     # need to create the characters
-    raise(NexusParser::ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
+    raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
     (@nf.characters[@index].name = @opt[:name]) if @opt[:name]
@@ -193,18 +193,45 @@ class Builder
     @opt.delete(:name)
     # the rest have states
-    @opt.keys.each do |k|
+    create_or_update_states_for_character(@index, @opt)
+  end
+  def update_chr_name(i, name)
+    raise(ParseError, "There are #{@nf.characters.count} characters but we're trying to update from row #{i + 1} of the CHARLABELS list - check your NCHAR and/or the length of your list.") if !@nf.characters[i]
-      if (@nf.characters[@index].states != {}) && @nf.characters[@index].states[k] # state exists
+    # The CHARLABELS list is unindexed, so users are allowed to use '_' to
+    # indicate that a character name is unspecified.
+    @nf.characters[i].name = (name == '_' ? '' : name)
+  end
+  # legal hash keys are :index and integers that point to state labels
+  def update_chr_states(options = {})
+    return false if !options[:index]
+    @opt = options
+    @index = @opt[:index].to_i
+    raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the STATELABELS. Check the indices.") if !@nf.characters[@index]
+    @opt.delete(:index)
+    # the rest have states
+    create_or_update_states_for_character(@index, @opt)
+  end
+  def create_or_update_states_for_character(i, options)
+    options.keys.each do |k|
+      if (@nf.characters[i].states != {}) && @nf.characters[i].states[k] # state exists
         ## !! ONLY HANDLES NAME, UPDATE TO HANDLE notes etc. when we get them ##
-        update_state(@index, :index => k, :name => @opt[k])
+        update_state(i, :index => k, :name => options[k])
       else # doesn't, create it
-        @nf.characters[@index].add_state(:label => k.to_s, :name => @opt[k])
+        @nf.characters[i].add_state(:label => k.to_s, :name => options[k])
       end
     end
   end
   def update_state(chr_index, options = {})
@@ -256,7 +283,7 @@ class Builder
     @nf
   end
-end # end file
+end # end Builder
   # NexusParser::ParseError
   class ParseError < StandardError

data/test/test_nexus_parser.rb CHANGED Viewed

@@ -35,18 +35,18 @@ class Test_Lexer < Test::Unit::TestCase
   def test_lexer
     lexer = NexusParser::Lexer.new("[ foo ] BEGIN taxa; BLORF end;")
     assert lexer.pop(NexusParser::Tokens::LBracket)
-    assert id = lexer.pop(NexusParser::Tokens::ID)
+    assert id = lexer.pop(NexusParser::Tokens::Label)
     assert_equal(id.value, "foo")
     assert lexer.pop(NexusParser::Tokens::RBracket)
     assert lexer.pop(NexusParser::Tokens::BeginBlk)
     assert lexer.pop(NexusParser::Tokens::TaxaBlk)
-    assert foo = lexer.pop(NexusParser::Tokens::ID)
+    assert foo = lexer.pop(NexusParser::Tokens::Label)
     assert_equal("BLORF", foo.value) # truncating whitespace
     assert lexer.pop(NexusParser::Tokens::BlkEnd)
     lexer2 = NexusParser::Lexer.new("[ foo ] begin authors; BLORF end; [] ()  some crud here")
     assert lexer2.pop(NexusParser::Tokens::LBracket)
-    assert id = lexer2.pop(NexusParser::Tokens::ID)
+    assert id = lexer2.pop(NexusParser::Tokens::Label)
     assert_equal(id.value, "foo")
     assert lexer2.pop(NexusParser::Tokens::RBracket)
     assert lexer2.pop(NexusParser::Tokens::BeginBlk)
@@ -64,44 +64,44 @@ class Test_Lexer < Test::Unit::TestCase
     lexer3 = NexusParser::Lexer.new("[ foo ] Begin Characters; BLORF end; [] ()  some crud here")
     assert lexer3.pop(NexusParser::Tokens::LBracket)
-    assert id = lexer3.pop(NexusParser::Tokens::ID)
+    assert id = lexer3.pop(NexusParser::Tokens::Label)
     assert_equal(id.value, "foo")
     assert lexer3.pop(NexusParser::Tokens::RBracket)
     assert lexer3.pop(NexusParser::Tokens::BeginBlk)
     assert lexer3.pop(NexusParser::Tokens::ChrsBlk)
-    assert foo = lexer3.pop(NexusParser::Tokens::ID)
+    assert foo = lexer3.pop(NexusParser::Tokens::Label)
     assert_equal("BLORF", foo.value)
     assert lexer3.pop(NexusParser::Tokens::BlkEnd)
     lexer4 = NexusParser::Lexer.new("Begin Characters; 123123123 end; [] ()  some crud here")
     assert lexer4.pop(NexusParser::Tokens::BeginBlk)
     assert lexer4.pop(NexusParser::Tokens::ChrsBlk)
-    assert foo = lexer4.pop(NexusParser::Tokens::Number)
-    assert_equal(123123123, foo.value)
+    assert foo = lexer4.pop(NexusParser::Tokens::PositiveInteger)
+    assert_equal('123123123', foo.value)
     assert lexer4.pop(NexusParser::Tokens::BlkEnd)
     lexer5 = NexusParser::Lexer.new("(0,1)")
     assert lexer5.pop(NexusParser::Tokens::LParen)
-    assert foo = lexer5.pop(NexusParser::Tokens::Number)
-    assert_equal(0, foo.value)
+    assert foo = lexer5.pop(NexusParser::Tokens::PositiveInteger)
+    assert_equal('0', foo.value)
     assert lexer5.pop(NexusParser::Tokens::Comma)
-    assert foo = lexer5.pop(NexusParser::Tokens::Number)
-    assert_equal(1, foo.value)
+    assert foo = lexer5.pop(NexusParser::Tokens::PositiveInteger)
+    assert_equal('1', foo.value)
     assert lexer5.pop(NexusParser::Tokens::RParen)
     lexer6 =  NexusParser::Lexer.new(" 210(0,1)10A1\n")
     assert foo = lexer6.pop(NexusParser::Tokens::RowVec)
     assert_equal(["2","1","0",["0","1"],"1","0","A","1"], foo.value)
-    lexer6a =  NexusParser::Lexer.new("  21a(0 1)0b{3 4 5}(0)(1 a)\n")
+    lexer6a =  NexusParser::Lexer.new("  21a(0 1)0b{345}(0)(1 a)\n")
     assert foo = lexer6a.pop(NexusParser::Tokens::RowVec)
     assert_equal(["2", "1", "a", ["0", "1"], "0", "b", ["3", "4", "5"], "0", ["1", "a"]], foo.value)
-    lexer6b =  NexusParser::Lexer.new(" 201{0 1}{0 1}0100)\x0A") # *nix line ending
+    lexer6b =  NexusParser::Lexer.new(" 201(01){0 1}0100\x0A") # *nix line ending
     assert foo = lexer6b.pop(NexusParser::Tokens::RowVec)
     assert_equal(["2", "0", "1", ["0", "1"], ["0", "1"], "0", "1", "0", "0"], foo.value)
-    lexer6c =  NexusParser::Lexer.new(" 201{0 1}{0 1}0100)\x0D\x0A") # * dos line ending
+    lexer6c =  NexusParser::Lexer.new(" 201{0 1}{01}0100\x0D\x0A") # * dos line ending
     assert foo = lexer6c.pop(NexusParser::Tokens::RowVec)
     assert_equal(["2", "0", "1", ["0", "1"], ["0", "1"], "0", "1", "0", "0"], foo.value)
@@ -126,7 +126,41 @@ class Test_Lexer < Test::Unit::TestCase
   def test_row_vec
     lexer = NexusParser::Lexer.new("0?(0 1)10(A BD , C)1(0,1,2)1-\n")
     assert foo = lexer.pop(NexusParser::Tokens::RowVec)
-    assert_equal(["0", "?", ["0", "1"], "1", "0", ["A", "BD", "C"], "1", ["0", "1", "2"], "1", "-"], foo.value)
+    assert_equal(["0", "?", ["0", "1"], "1", "0", ["A", "B", "D", "C"], "1", ["0", "1", "2"], "1", "-"], foo.value)
+  end
+  def test_ungrouped_spaces_in_row_vec
+    lexer = NexusParser::Lexer.new("- A 12(BC) ? \n")
+    assert foo = lexer.pop(NexusParser::Tokens::RowVec)
+    assert_equal(['-', 'A', '1', '2', ['B', 'C'], '?'], foo.value)
+  end
+  def test_mismatched_parens_row_vec
+    lexer = NexusParser::Lexer.new("01(12(13\n")
+    assert_raise_with_message(NexusParser::ParseError, /Mismatch/) {
+      lexer.pop(NexusParser::Tokens::RowVec)
+    }
+  end
+  def test_mismatched_groupers_row_vec
+    lexer = NexusParser::Lexer.new("01(12}13\n")
+    assert_raise_with_message(NexusParser::ParseError, /Mismatch/) {
+      lexer.pop(NexusParser::Tokens::RowVec)
+    }
+  end
+  def test_nested_parens_row_vec
+    lexer = NexusParser::Lexer.new("01(12(34))13\n")
+    assert_raise_with_message(NexusParser::ParseError, /Mismatch/) {
+      lexer.pop(NexusParser::Tokens::RowVec)
+    }
+  end
+  def test_unclosed_parens_row_vec
+    lexer = NexusParser::Lexer.new("01(123413\n")
+    assert_raise_with_message(NexusParser::ParseError, /Unclosed/) {
+      lexer.pop(NexusParser::Tokens::RowVec)
+    }
   end
   def test_punctuation
@@ -428,11 +462,6 @@ class Test_Lexer < Test::Unit::TestCase
     assert_equal 'SETS', foo.value.slice(0,4)
     assert_equal 'END;', foo.value.slice(-4,4)
   end
-  def test_lexer_errors
-    lexer = NexusParser::Lexer.new("*&")
-    assert_raise(NexusParser::ParseError) {lexer.peek(NexusParser::Tokens::ID)}
-  end
 end
@@ -574,6 +603,30 @@ class Test_Parser < Test::Unit::TestCase
     assert_equal ["-", "0", "1", "2", "A"], foo.characters[4].state_labels
   end
+  def test_matrix_with_short_row
+    input=  "
+      DIMENSIONS  NCHAR=2;
+      FORMAT DATATYPE = STANDARD GAP = - MISSING = ? SYMBOLS = \"  0 1 2 3 4 5 6 7 8 9 A\";
+      CHARSTATELABELS
+        1 Tibia_II /  norm modified, 2 TII_macrosetae /  '= TI' stronger;
+      MATRIX
+      Dictyna                0?
+      Uloborus               ??
+      Deinopis               0
+    ;
+    END;"
+    builder = NexusParser::Builder.new
+    @lexer = NexusParser::Lexer.new(input)
+    # stub the taxa, they would otherwise get added in dimensions or taxa block
+    (0..2).each{|i| builder.stub_taxon}
+    assert_raise_with_message(NexusParser::ParseError, /too short/) {
+      NexusParser::Parser.new(@lexer, builder).parse_characters_blk
+    }
+  end
   def test_characters_block_without_IDs_or_title
     input=  "
       DIMENSIONS  NCHAR=10;
@@ -623,6 +676,55 @@ class Test_Parser < Test::Unit::TestCase
     assert_equal 10, foo.characters.size
   end
+  def test_characters_charlabels_statelabels_block
+    input=  "
+      DIMENSIONS  NCHAR=4;
+      FORMAT DATATYPE = STANDARD GAP = - MISSING = ? SYMBOLS = \"  0 1 2 3 4 5 6 7 8 9 A\";
+      CHARLABELS
+        Tibia_II
+        TII_macrosetae
+        'Femoral tuber'
+        _
+      ;
+      STATELABELS
+      1 norm modified,
+      3 3 3.5 4,
+      4 pres
+      ;
+      MATRIX
+      Dictyna                -?1(01)
+      Uloborus               0321
+    ;
+    ENDBLOCK;"
+    builder = NexusParser::Builder.new
+    lexer = NexusParser::Lexer.new(input)
+    (0..3).each{|i| builder.stub_taxon}
+    NexusParser::Parser.new(lexer,builder).parse_characters_blk
+    foo = builder.nexus_file
+    assert_equal 4, foo.characters.size
+    assert_equal "Femoral tuber", foo.characters[2].name
+    assert_equal "Undefined", foo.characters[3].name
+    assert_equal "norm", foo.characters[0].states["0"].name
+    assert_equal "modified", foo.characters[0].states["1"].name
+    assert_equal "", foo.characters[1].states["3"].name
+    assert_equal ["3", "3.5", "4"], foo.characters[2].states.keys.collect{|s| foo.characters[2].states[s].name}.sort
+    assert_equal "", foo.characters[1].states["3"].name
+    assert_equal ["-"], foo.codings[0][0].states
+    assert_equal ["?"], foo.codings[0][1].states
+    assert_equal ["0", "1"], foo.codings[0][3].states
+    assert_equal ["3"], foo.codings[1][1].states
+  end
   def test_codings
     foo = parse_nexus_file(@nf)
     assert_equal 100, foo.codings.flatten.size  # two multistates count in single cells
@@ -673,6 +775,68 @@ class Test_Parser < Test::Unit::TestCase
     assert_equal '0 1 2 3 4 5 6 7 8 9 A', foo.vars[:symbols]
   end
+  # https://github.com/mjy/nexus_parser/issues/9
+  def test_three_both_numeric_and_label_state_names_in_a_row
+    input =" CHARSTATELABELS
+    1 'Metatarsal trichobothria (CodAra.29)' / 3 9 27 asdf;
+    Matrix
+    fooo 01 more stuff here that should not be hit"
+    builder = NexusParser::Builder.new
+    lexer = NexusParser::Lexer.new(input)
+    builder.stub_chr()
+    NexusParser::Parser.new(lexer, builder).parse_chr_state_labels
+    foo = builder.nexus_file
+    assert_equal "3", foo.characters[0].states['0'].name
+    assert_equal "9", foo.characters[0].states['1'].name
+    assert_equal "27", foo.characters[0].states['2'].name
+    assert_equal "asdf", foo.characters[0].states['3'].name
+  end
+  def test_non_label_character_name_character_labels
+    input = 'CHARSTATELABELS
+     1 (intentionally_blank) /,
+     2 /,
+     3 %_coverage /,
+     4 #_of_widgets /,
+     5 !endangered! /,
+     6 @the_front /,
+     7 =antennae,
+     8 `a_=_2` /,
+     9 -35_or-36 ,
+     10 27_or_less /,
+     11 fine_not_fine /,
+     12 3,
+      ;'
+    builder = NexusParser::Builder.new
+    lexer = NexusParser::Lexer.new(input)
+    (0..11).each{builder.stub_chr()}
+    NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
+    foo = builder.nexus_file
+    assert_equal 12, foo.characters.size
+    assert_equal "(intentionally_blank)", foo.characters[0].name
+    assert_equal "Undefined", foo.characters[1].name
+    assert_equal "%_coverage", foo.characters[2].name
+    assert_equal "#_of_widgets", foo.characters[3].name
+    assert_equal "!endangered!", foo.characters[4].name
+    assert_equal "@the_front", foo.characters[5].name
+    assert_equal "=antennae", foo.characters[6].name # =3
+    assert_equal "`a_=_2`", foo.characters[7].name
+    assert_equal "-35_or-36", foo.characters[8].name
+    assert_equal "27_or_less", foo.characters[9].name
+    assert_equal "fine_not_fine", foo.characters[10].name
+    assert_equal "3", foo.characters[11].name
+  end
   def test_parse_chr_state_labels
     input =" CHARSTATELABELS
     1 Tibia_II /  norm modified, 2 TII_macrosetae /  '= TI' stronger, 3 Femoral_tuber /  abs pres 'm-setae', 5 Cymbium /  dorsal mesal lateral, 6 Paracymbium /  abs pres, 7 Globular_tegulum /  abs pres, 8  /  entire w_lobe, 9 Conductor_wraps_embolus, 10 Median_apophysis /  pres abs ;
@@ -754,6 +918,169 @@ class Test_Parser < Test::Unit::TestCase
   end
+  def test_parse_chr_labels
+    input =" CHARLABELS
+    _
+		'Maxillary teeth'
+    as_df
+		'Highest number of maxillary teeth (or alveoli):';
+    STATELABELS
+    1 more more more,"
+    builder = NexusParser::Builder.new
+    lexer = NexusParser::Lexer.new(input)
+    (0..3).each{builder.stub_chr()}
+    NexusParser::Parser.new(lexer,builder).parse_chr_labels
+    foo = builder.nexus_file
+    assert_equal 4, foo.characters.size
+    assert_equal 'Undefined', foo.characters[0].name
+    assert_equal 'Maxillary teeth', foo.characters[1].name
+    assert_equal 'as_df', foo.characters[2].name
+    assert_equal 'Highest number of maxillary teeth (or alveoli):', foo.characters[3].name
+  end
+  def test_parse_state_labels
+    input =" STATELABELS
+      1 norm modified,
+      3,
+      4 pres
+    ;
+    CHARLABELS;
+		"
+    builder = NexusParser::Builder.new
+    lexer = NexusParser::Lexer.new(input)
+    (0..3).each{builder.stub_chr()}
+    NexusParser::Parser.new(lexer,builder).parse_state_labels
+    foo = builder.nexus_file
+    assert_equal 4, foo.characters.size
+    assert_equal "norm", foo.characters[0].states["0"].name
+    assert_equal "modified", foo.characters[0].states["1"].name
+    assert_empty foo.characters[1].states
+    assert_empty foo.characters[2].states
+    assert_equal "pres", foo.characters[3].states["0"].name
+  end
+  def test_non_label_character_state_character_labels
+    input = 'CHARSTATELABELS 1 Tibia_II /
+      .5
+      .1.2_form
+      idsimple
+      %_of_length_less_than_10
+      !poisonous!
+      #_is_3_or_4
+      (leave_as_is)
+      @12_o_clock
+      >2
+      ~equal
+      =9
+      ;'
+    builder = NexusParser::Builder.new
+    lexer = NexusParser::Lexer.new(input)
+    builder.stub_chr()
+    NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
+    foo = builder.nexus_file
+    assert_equal ".5", foo.characters[0].states["0"].name
+    assert_equal ".1.2_form", foo.characters[0].states["1"].name
+    assert_equal "idsimple", foo.characters[0].states["2"].name
+    assert_equal "%_of_length_less_than_10", foo.characters[0].states["3"].name
+    assert_equal "!poisonous!", foo.characters[0].states["4"].name
+    assert_equal "#_is_3_or_4", foo.characters[0].states["5"].name
+    assert_equal "(leave_as_is)", foo.characters[0].states["6"].name
+    assert_equal "@12_o_clock", foo.characters[0].states["7"].name
+    assert_equal ">2", foo.characters[0].states["8"].name
+    assert_equal "~equal", foo.characters[0].states["9"].name
+    assert_equal "=9", foo.characters[0].states["10"].name
+  end
+  def test_arbitrary_quote_and_quotelike_character_state_labels
+    # We could tighten up our handling of accidentally unclosed quotes, but
+    # there's pretty much no way to recover in general, so we're not testing
+    # them here.
+    # Things like ""asdf" " failing is a known issue (maybe not solvable with
+    # regular expressions?).
+    input = 'CHARSTATELABELS 1 Tibia_II /
+      "asd, \'f\'"
+      ""a\'sdf  "
+      \'  /as"df/\'
+      \'asdf;\'
+      ""as, df""
+      ;'
+    builder = NexusParser::Builder.new
+    lexer = NexusParser::Lexer.new(input)
+    builder.stub_chr()
+    NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
+    foo = builder.nexus_file
+    assert_equal 'asd, \'f\'', foo.characters[0].states["0"].name
+    assert_equal '"a\'sdf', foo.characters[0].states["1"].name
+    assert_equal '/as"df/', foo.characters[0].states["2"].name
+    assert_equal 'asdf;', foo.characters[0].states["3"].name
+    assert_equal '"as, df"', foo.characters[0].states["4"].name
+  end
+  def test_number_label_chr_state_labels
+    # Character state names that start with numbers
+    input = 'CHARSTATELABELS 1 Tibia_II /
+      123abc
+      -1.23abc
+      -3e-3abc
+      25%_or_less_than
+      ;'
+    builder = NexusParser::Builder.new
+    lexer = NexusParser::Lexer.new(input)
+    (0..3).each{builder.stub_chr()}
+    NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
+    foo = builder.nexus_file
+    assert_equal "123abc", foo.characters[0].states["0"].name
+    assert_equal "-1.23abc", foo.characters[0].states["1"].name
+    assert_equal "-3e-3abc", foo.characters[0].states["2"].name
+    assert_equal "25%_or_less_than", foo.characters[0].states["3"].name
+  end
+  def test_value_pair_label_chr_state_labels
+    # Character state names that are ValuePairs
+    input = 'CHARSTATELABELS 1 Tibia_II /
+      234=(a_b_c)
+      ;'
+    builder = NexusParser::Builder.new
+    lexer = NexusParser::Lexer.new(input)
+    builder.stub_chr()
+    NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
+    foo = builder.nexus_file
+    assert_equal '234=(a_b_c)', foo.characters[0].states["0"].name
+  end
   def DONT_test_parse_really_long_string_of_chr_state_labels
     input =" CHARSTATELABELS
     1 Epigynal_ventral_margin /  'entire (Fig. 15G)' 'with scape (Fig. 27D)', 2 Epigynal_external_structure /  openings_on_a_broad_depression 'copulatory openings on plate, flush with abdomen, sometimes slit like', 3 Epigynal_depression /  'round or square, at most slightly wider than high ' 'elongate, at least twice as wide as high ', 4 Epigynal_plate_surface /  'smooth (Fig. 12E)' 'ridged (Fig. 21G)', 5 epignynal_septum /  absent_ present_, 6 Copulatory_bursa_anterior_margin /  'entire, broadly transverse (Fig. 19B)' 'medially acute (Figs. 22G, 40B)', 7 'Copulatory duct: spermathecal junction' /  posterior lateral_or_anterior, 8 Copulatory_duct_loops_relative_to_spermathecae /  apart 'encircling (Fig. 93J)', 9 Copulatory_duct_terminal_sclerotization /  as_rest_of_duct_ 'distinctly sclerotized, clearly more than rest of duct ', 10 Hard_sclerotized_CD_region /  mostly_or_entirely_ectal_to_the_ectal_rim_of_the_spermathecae 'caudal to the spermathecae, mesal to ectal margin of spermathecae', 11 Male_palpal_tibial_rim /  uniform_or_only_slightly_asymmetric 'strongly and asymmetrically protruding, scoop-shaped (Fig 36D)', 12 Male_palpal_tibia_prolateral_trichobothria /  one none, 13 Cymbial_ridge_ectal_setae /  unmodified 'strongly curved towards the palpal bulb (Kochiura, Figs. 51B-C, 52C)', 14 Cymbial_distal_promargin /  entire 'with an apophysis (Argyrodes, Figs.) ', 15 Cymbial_mesal_margin /  entire 'incised (Anelosimus, Figs. 17D, 20A) ' deeply_notched, 16 Cymbial_tip_sclerotization /  like_rest_of_cymbium 'lightly sclerotized, appears white', 17 Cymbial_tip_setae /  like_other_setae 'thick and strongly curved (Kochiura, Figs. 51B, 52C)', 18 Cymbial_sheath /  absent present, 19 Lock_placement /  'distal (Figs. 67B, 92F-G, I, M)' 'central (Fig. 92H)', 20 Lock_mechanism /  'hook (Figs 31F, 60D, 91A, 92D-E, J-L)' 'hood (Figs 18A, 75B, 92F-I, M)' 'Theridula (Fig 81D)', 21 Cymbial_hook_orientation /  'facing downwards (Figs. 91A, 92D-E, J-K)' 'facing upwards (Fig. 
60C-D, 92L)', 22 Cymbial_hook_location /  'inside cymbium (Fig. 92D-E, J-K)' 'ectal cymbial margin (Figs. 67B, 92L).', 23 Cymbial_hook_distal_portion /  'blunt (Figs. 31F, 92D-E)' 'tapering to a narrow tongue (Figs. 66B, 67D, 92L)', 24 Cymbial_hood_size /  'narrow (Fig. 92F-H)' 'broad (Fig. 92I)' 'Spintharus (Fig. 92M)', 25 Cymbial_hood_region /  'translucent, hood visible through cymbium (Anelosimus, Figs. 90A, 91C)' 'opaque, hood not visible', 26 Alveolus_shape /  'circular or oval (Fig. 92A-H)' 'with a mesal extension (Fig. 92A)', 27 Tegulum_ectal_margin /  entire 'protruded (Fig. 20D)', 28 Tegular_groove /  absent 'present (Fig. 28B)', 29 SDT_SB_I /  separate touching, 30 'SDT post-SB II turn' /  gradual '90 degrees (Anelosimus, Fig. 93B)', 31 SDT_SB_I_&_II_reservoir_segment_alignment /  divergent parallel, 32 SDT_SB_I_&_II_orientation /  in_plane_of_first_loop_from_fundus 'out of plane of first loop, against tegular wall', 33 SDT_RSB_I_&_II /  absent present, 34 SDT_SB_III /  absent present, 35 SDT_SB_IV /  absent 'present (Fig. 93E)', 36 Conductor_shape /  'simple, round or oval, short' 'fan shaped, narrow base and broad tip (Selkirkiella, Kochiura)' Enoplognatha Argyrodes Achaearanea Theridion '''rupununi''' '''tanzania''' '''cup-shaped''', 37 Conductor /  'with a groove for embolus (Figs. 10A, 28D, 69B)' 'entire (Figs. 13D, 17F, 52C-D)', 38 Conductor_surface /  'smooth (Figs. 75B, 77B-C)' ' heavily ridged (Figs. 10B-C, 44D. 
67C, 69D)', 39 Conductor_tip_sclerotization /  like_base more_than_base, 40 Subconductor /  absent present, 41 Subconductor_pit_upper_wall /  'entire, or slightly protruding' forms_a_regular_oval_lip, 42 Subconductor_at_C_base /  narrows_abruptly_before_C_base narrows_gradually_along_its_entire_length broad_at_base, 43 'Embolus tail-SC relation' /  'hooked in, or oriented towards SC' surpasses_SC behind_E_base, 44 Tegulum_ectally_ /  occupying_less_than_half_of_the_cymbial_cavity_ occupying_more_than_half_of_the_cymbial_cavity, 45 MA_and_sperm_duct /  sperm_duct_loop_not_inside_MA 'sperm duct loop inside MA (Figs. 90F, 91B)', 46 'MA-tegular membrane connection' /  broad narrow, 47 MA_form /  unbranched 'two nearly equally sized branches (Fig. 22A-B) ', 48 MA_distal_tip /  entire hooded, 49 MA_hood_form /  'narrow, pit-like (Figs. 31F, 34D)' 'scoop-shaped (Figs. 60D, 66B, 67D)', 50 TTA_form /  entire 'grooved (Fig. 44C)', 51 TTA /  bulky 'prong shaped (vittatus group)', 52 TTA_distal_tip /  entire_or_gently_curved Argyrodes 'hooked (branched)', 53 TTA_hook_distal_branch /  barely_exceeding_lower_branch_ 'extending beyond lower branch (jucundus group) ', 54 TTA_hook_distal_branch /  thick_ 'thin, finger like (domingo, dubiosus)', 55 TTA_hook_proximal_branch /  'blunt, broad' 'flattened, bladelike' 'cylindrical, elongated', 56 TTA_surface_subterminally /  smooth ridged, 57 TTA_tip_surface /  smooth 'ridged (Figs. 7A-B, 17F, 31D, 34D, 54A, 56B, 86A)', 58 Embolus_and_TTA /  loosely_associated_to_or_resting_in_TTA_shallow_groove 'parts of E entirely enclosed in TTA (Figs. 
37A-B, 44C, 89C)', 59 Embolus_tip_surface /  smooth denticulate, 60 Embolus_spiral_curviture /  gentle whip_like corkscrew, 61 Embolus_tip /  entire bifid, 62 Embolus_origin /  retroventral_on_tegulum 'retrolateral (ectal), partially or completely hidden by cymbium (Figs 44C, 60A-C, 67B)', 63 Embolus_ridges /  absent present, 64 Embolus_shape /  short_to_moderately_elongate 'extremely long, >2 spirals (Figs. 54D, 73A-E)', 65 Embolus_spiral_width /  'thin, much of E spiral subequal to E tip ' 'thick, entire E spiral much broader than tip ', 66 Embolus_distal_rim /  'entire (normal)' deeply_grooved, 67 Embolic_terminus /  abrupt 'with a distal apophysis (EA, Fig. 34E) ', 68 Embolus_tail /  'entire, smooth' 'distinct, lobed', 69 'Embolus-dh connection grooves' /  absent present, 70 'Embolus-dh grooves' /  'deep, extend into the E base more than twice longer than the distance between them' 'short, extend into the E base about as long, or slightly longer than the distance between them', 71 E_spiral_distally /  'relatively thin or filiform, cylindrical' 'thick, not cylindrical' 'rupununi/lorenzo like', 72 Embolus_spiral /  entire 'biparted (Eb)' pars_pendula, 73 Eb_orientation /  towards_embolus_tip towards_tibia, 74 Embolic_division_b /  separates_early_from_E E_and_Eb_tightly_associated_the_entire_spiral, 75 Embolic_division_b /  broad 'narrow, relative to Eb spiral, snout-like', 76 'Eb distal portion, ectal marginl' /  'level, not raised ' with_a_distinct_ridge_, 77 Eb_form /  flat 'globose, inflated', 78 Eb_form /  'distinct, clearly separate apophysis' 'short, confined to first section of spiral, barely separate', 79 Eb_tip_and_E_tip_association /  separate Eb_and_E_tips_juxtaposed 'E tip rests on Eb ''cup''', 80 Eb_snout /  'short, snug with E spiral ' 'long, separate from E spiral ', 81 Distal_portion_of_Eb /  entire with_a_cup_shaped_apophysis with_a_raised_ridge, 82 E_tail /  lobe_not_reaching_ectal_margin_of_Eb_ lobe_touching_ectal_margin_of_Eb_, 83 
Extra_tegular_sclerite /  absent_ present_, 84 'Median eyes (male)' /  flush_with_carapace 'on tubercle (Argyrodes)', 85 'AME size (male)' /  subequal_or_slightly_larger_than_ALE clearly_smaller_than_ALE, 86 Cheliceral_posterior_margin /  toothed smooth, 87 Cheliceral_posterior_tooth_number /  three_or_more two one, 88 Cheliceral_furrow /  smooth denticulate, 89 Carapace_hairiness /  'sparsely or patchily hirsute (Fig. 48D)' 'uniformly hirsute (Fig. 71D)', 90 Carapace_pars_stridens /  irregular regular_parallel_ridges, 91 Interocular_area /  more_or_less_flush_with_clypeus projecting_beyond_clypeus, 92 Clypeus /  concave_or_flat with_a_prominent_projection, 93 'ocular and clypeal region setae distribution (male)' /  sparse 'in a dense field, or fields', 94 'Labium-sternum connection' /  'visible seam  (Fig. 27C)' fused, 95 Sternocoxal_tubercles /  present absent, 96 Pedicel_location /  'anterior (Fig. 94A-D)' 'medial (Fig. 94J-K)', 97 Abdominal_folium_pattern /  bilateral_spots_or_blotches distinct_central_band_, 98 Abdomen_pattern /  Anelosimus_, 99 Dorsal_band /  'dark edged by white (Kochiura, Anelosimus, Fig. 94G, J)' 'light edged by dark (Fig. 94H)' 'Ameridion, light edged by white (Fig. 94I)', 100 Abdominal_dot_pigment /  silver 'non-reflective, dull', 101 SPR_form /  'weakly keeled (Figs. 67F, 74F)' 'strongly keeled and elongate (Figs. 
16B-C, 24D-E, 42F)', 102 SPR_pick_number /  '1-4' '6-28' '>30', 103 SPR_insertion /  flush_with_abdominal_surface 'on a ridge (Figs 32D, 72A-B)', 104 'SPR mesally-oriented picks' /  absent present, 105 'SPR mesally-oriented picks relative to sagittal plane' /  angled_dorsally perpendicular_or_angled_ventrally, 106 SPR /  straight_or_slightly_irregular distinctly_curved 'argyrodine, dorsal picks aside others', 107 SPR_dorsal_pick_spacing /  subequal_to_ventral_pick_spacing distinctly_compressed, 108 SPR_relative_to_pedicel /  lateral dorsal, 109 SPR_setae /  separate tight, 110 'Supra pedicillate ventrolateral  (4 o''clock) proprioreceptor' /  absent present, 111 Epiandrous_fusule_arrangement /  in_one_pair_of_sockets in_a_row, 112 Epiandrous_fusule_pair_number /  '=>9' '6-8' '4-5' 1, 113 Colulus /  'present (Figs. 45E, 61F)' 'absent (Figs. 16E, 78A)' 'invaginated (Figs. 9D,  63G)', 114 Colulus_size /  'large and fleshy (Figs. 55H, 61F)' 'small, less than half the length of its setae (Fig. 38B)', 115 Colular_setae /  present absent, 116 'Colular setae number (female)' /  three_or_more two_, 117 'Palpal claw dentition (female)' /  'dense, > half of surface covered by denticles (Figs. 2D, 9E, 11D, 12G, 45G, 47E, 58G, 80D)' 'sparse < half of surface with denticles', 118 'Palpal tibial trichobothria (female)' /  four three two five, 119 Femur_I_relative_to_II /  subequal 'robust, clearly larger than femur II', 120 'Leg IV relative length (male)' /  '3rd longest (typical leg formula 1243)' '2nd longest (typical leg formula 1423)' 'longest (typical leg formula 4123)', 121 'Leg IV relative length (female)' /  3rd_longest 2nd_longest longest_, 122 'Femur vs. metatarsus length (female)' /  metatarsus_longer metatarsus_shorter, 123 'Femur vs. metatarsus length (male)' /  metatarsus_longer metatarsus_shorter, 124 'Metatarsus vs. tibia length (female)' /  metatarsus_longer metatarsus_shorter, 125 'Metatarsus vs. 
tibia length (male)' /  metatarsus_longer metatarsus_shorter, 126 Metatarsal_ventral_macrosetae /  like_other_macrosetae thickened_ventrally, 127 Tarsus_IV_comb_serrations /  'simple, straight' curved_hooks, 128 Tarsal_organ_size /  'smaller than setal sockets (normal)' enlarged, 129 'Tarsus IV central claw vs. laterals (male)' /  'short, at most subequal' 'elongate, longer (Figs. 19E, 21C, 23D, 32H, 57F, 58F)', 130 'Tarsus IV central claw vs. laterals (female)' /  equal_or_shorter stout_and_distinctly_longer minute, 131 Spinneret_insertion /  abdominal_apex 'subapical, abdomen extending beyond spinnerets', 132 PLS_flagelliform_spigot_length /  subequal_to__PLS_CY 'longer than PLS CY (Figs. 68E, 78B, 82D)', 133 'PLS, PMS CY spigot bases' /  'not modified, subequal or smaller than ampullates' 'huge and elongated, much larger than ampullates ', 134 CY_shaft_surface /  smooth grooved, 135 PLS_AC_spigot_number /  five_or_more four_or_less, 136 PLS_flagelliform_spigot /  present absent, 137 PLS_posterior_AG_spigot_shape /  'normal, round' flattened, 138 PLS_theridiid_type_AG_position /  more_or_less_parallel end_to_end, 139 'PMS minor ampullate (mAP) spigot shaft length' /  'short, subequal to CY shaft' clearly_longer_than_any_CY_shaft, 140 Web_form /  'linyphioid-like sheet web (Fig. 99C)' 'cobweb (Figs. 97G, 99A-B, 100A-F, 101A-E)' 'network mesh web - with foraging field below (rupununi/lorenzo)' 'dry line-web', 141 'Knock-down lines' /  absent present, 142 Sticky_silk_in_web /  present absent, 143 Egg_sac_surface /  spherical_to_lenticular 'stalked (Fig. 88E, 98D).', 144 Egg_case_structure /  suboval_or_roundish basal_knob rhomboid elongated Spiky, 145 Web_construction /  solitary communal, 146 Mating_thread /  present absent, 147 Adult_females_per_nest /  one multiple, 148 cooperative_behavior /  solitary subsocial permanent_sociality ;

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: nexus_parser
 version: !ruby/object:Gem::Version
-  version: 1.2.1
+  version: 1.2.2
 platform: ruby
 authors:
 - mjy
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-05-03 00:00:00.000000000 Z
+date: 2024-05-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler