tokn 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,156 +1,161 @@
  require_relative 'tools'
  req('tokn_const code_set dfa_builder state reg_parse')
 
- # Parses a token definition script, and generates an NFA that
- # is capable of recognizing and distinguishing between the various
- # tokens.
- #
- # Each line in the script is one of
- #
- # # ...comment... (the # must appear as the first character in the line)
- #
- # <tokenname> ':' <regex>
- #
- #
- # A <tokenname> must be an 'identifier' (alphanumeric, with first character a letter (or '_')).
- # If the first character is '_', the token is treated as an 'anonymous' token; these can
- # appear in the curly brace portions of previous reg. expr. entries, but do not appear as tokens in the
- # generated NFA.
- #
- class TokenDefParser
- include Tokn
-
- attr_reader :dfa
-
- # Compile a token definition script into a DFA
- #
- def initialize(script, createPDF = false)
- @script = script
- parseScript
- if createPDF
- dfa.startState.generatePDF("tokenizer_dfa")
- end
- end
-
- private
-
- def parseScript
- db = false
-
- nextTokenId = 0
-
- # List of tokens entries, including anonymous ones
- @tokenListBig = []
+ module ToknInternal
+
+ # Parses a token definition script, and generates an NFA that
+ # is capable of recognizing and distinguishing between the various
+ # tokens.
+ #
+ # Each line in the script is one of
+ #
+ # # ...comment... (the # must appear as the first character in the line)
+ #
+ # <tokenname> ':' <regex>
+ #
+ #
+ # A <tokenname> must be an 'identifier' (alphanumeric, with first character a letter (or '_')).
+ # If the first character is '_', the token is treated as an 'anonymous' token; these can
+ # appear in the curly brace portions of previous reg. expr. entries, but do not appear as tokens in the
+ # generated NFA.
+ #
+ class TokenDefParser
 
- # List of tokens names, excluding anonymous ones
- tokenListSmall = []
+ attr_reader :dfa
 
- # Maps token name to token entry
- @tokenNameMap = {}
+ # Compile a token definition script into a DFA
+ #
+ def initialize(script, createPDF = false)
+ @script = script
+ parseScript
+ if createPDF
+ dfa.startState.generatePDF("tokenizer_dfa")
+ end
+ end
 
- @lines = @script.split("\n")
+ private
 
- @lines.each_with_index do |line, lineNumber|
-
- line.strip!
-
- # If line is empty, or starts with '#', it's a comment
- if line.length == 0 || line[0] == '#'
- next
- end
-
- if !(line =~ TOKENNAME_EXPR)
- raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
- end
-
- pos = line.index(":")
-
- tokenName = line[0,pos].strip()
-
- expr = line[pos+1..-1].strip()
-
- rex = RegParse.new(expr, @tokenNameMap)
-
- # Give it the next available token id, if it's not an anonymous token
- tkId = nil
- if tokenName[0] != '_'
- tkId = nextTokenId
- nextTokenId += 1
- end
-
- tkEntry = [tokenName, rex, @tokenListBig.size, tkId]
-
- !db || pr("token entry: %s\n",d(tkEntry))
-
- if @tokenNameMap.has_key?(tokenName)
- raise ParseException, "Duplicate token name: "+line
+ def parseScript
+ db = false
+
+ nextTokenId = 0
+
+ # List of tokens entries, including anonymous ones
+ @tokenListBig = []
+
+ # List of tokens names, excluding anonymous ones
+ tokenListSmall = []
+
+ # Maps token name to token entry
+ @tokenNameMap = {}
+
+ @lines = @script.split("\n")
+
+ @lines.each_with_index do |line, lineNumber|
+
+ line.strip!
+
+ # If line is empty, or starts with '#', it's a comment
+ if line.length == 0 || line[0] == '#'
+ next
+ end
+
+ if !(line =~ TOKENNAME_EXPR)
+ raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
+ end
+
+ pos = line.index(":")
+
+ tokenName = line[0,pos].strip()
+
+ expr = line[pos+1..-1].strip()
+
+ rex = RegParse.new(expr, @tokenNameMap)
+
+ # Give it the next available token id, if it's not an anonymous token
+ tkId = nil
+ if tokenName[0] != '_'
+ tkId = nextTokenId
+ nextTokenId += 1
+ end
+
+ tkEntry = [tokenName, rex, @tokenListBig.size, tkId]
+
+ !db || pr("token entry: %s\n",d(tkEntry))
+
+ if @tokenNameMap.has_key?(tokenName)
+ raise ParseException, "Duplicate token name: "+line
+ end
+
+
+ @tokenListBig.push(tkEntry)
+ @tokenNameMap[tkEntry[0]] = tkEntry
+
+ if tkId
+ tokenListSmall.push(tokenName)
+ end
+
+ !db || pr(" added token name [%s] to map\n",d(tkEntry[0]))
+
  end
 
+ combined = combineTokenNFAs()
+ !db || combined.generatePDF("combined")
 
- @tokenListBig.push(tkEntry)
- @tokenNameMap[tkEntry[0]] = tkEntry
+ dfa = DFABuilder.nfa_to_dfa(combined)
+ !db || dfa.generatePDF("combined_minimized")
 
- if tkId
- tokenListSmall.push(tokenName)
- end
-
- !db || pr(" added token name [%s] to map\n",d(tkEntry[0]))
+ @dfa = Tokn::DFA.new(tokenListSmall, dfa)
+ end
+
+ # Combine the individual NFAs constructed for the token definitions into
+ # one large NFA, each augmented with an edge labelled with the appropriate
+ # token identifier to let the tokenizer see which token led to the final state.
+ #
+ def combineTokenNFAs
+
 
+ baseId = 0
+ startState = nil
+
+ @tokenListBig.each do |tokenName, regParse, index, tokenId|
+
+ # Skip anonymous token definitions
+ if !tokenId
+ next
+ end
+
+ oldToNewMap, baseId = regParse.startState.duplicateNFA( baseId)
+
+ dupStart = oldToNewMap[regParse.startState]
+
+ # Transition from the expression's end state (not a final state)
+ # to a new final state, with the transitioning edge
+ # labelled with the token id (actually, a transformed token id to distinguish
+ # it from character codes)
+ dupEnd = oldToNewMap[regParse.endState]
+
+ dupfinalState = State.new(baseId)
+ baseId += 1
+ dupfinalState.finalState = true
+
+ # Why do I need to add 'ToknInternal.' here? Very confusing.
+ dupEnd.addEdge(CodeSet.new(ToknInternal.tokenIdToEdgeLabel(tokenId)), dupfinalState)
+
+ if !startState
+ startState = dupStart
+ else
+ # Add an e-transition from the start state to this expression's start
+ startState.addEdge(CodeSet.new(EPSILON),dupStart)
+ end
+ end
+ startState
  end
-
- combined = combineTokenNFAs()
- !db || combined.generatePDF("combined")
-
- dfa = DFABuilder.nfa_to_dfa(combined)
- !db || dfa.generatePDF("combined_minimized")
-
- @dfa = DFA.new(tokenListSmall, dfa)
- end
-
- # Combine the individual NFAs constructed for the token definitions into
- # one large NFA, each augmented with an edge labelled with the appropriate
- # token identifier to let the tokenizer see which token led to the final state.
- #
- def combineTokenNFAs
 
- baseId = 0
- startState = nil
+ # Regex for token names preceding regular expressions
+ #
+ TOKENNAME_EXPR = Regexp.new("[_A-Za-z][_A-Za-z0-9]*\s*:\s*")
 
- @tokenListBig.each do |tokenName, regParse, index, tokenId|
-
- # Skip anonymous token definitions
- if !tokenId
- next
- end
-
- oldToNewMap, baseId = regParse.startState.duplicateNFA( baseId)
-
- dupStart = oldToNewMap[regParse.startState]
-
- # Transition from the expression's end state (not a final state)
- # to a new final state, with the transitioning edge
- # labelled with the token id (actually, a transformed token id to distinguish
- # it from character codes)
- dupEnd = oldToNewMap[regParse.endState]
-
- dupfinalState = State.new(baseId)
- baseId += 1
- dupfinalState.finalState = true
-
- dupEnd.addEdge(CodeSet.new(tokenIdToEdgeLabel(tokenId)), dupfinalState)
-
- if !startState
- startState = dupStart
- else
- # Add an e-transition from the start state to this expression's start
- startState.addEdge(CodeSet.new(EPSILON),dupStart)
- end
- end
- startState
  end
 
- # Regex for token names preceding regular expressions
- #
- TOKENNAME_EXPR = Regexp.new("[_A-Za-z][_A-Za-z0-9]*\s*:\s*")
-
- end
+ end # module ToknInternal
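The main effect of this hunk is that TokenDefParser now lives inside the ToknInternal module and wraps the minimized automaton in a Tokn::DFA. A minimal sketch of driving the parser after this change, based only on the class comment and code above; the token names (ws, sep, kw) and their regular expressions are invented for illustration, and whether these particular expressions are accepted depends on RegParse's regex dialect, which this diff does not show:

    # Hypothetical token definition script; each line is '<tokenname> : <regex>'.
    script = "ws: [ ]+\nsep: ,\nkw: if\n"
    parser = ToknInternal::TokenDefParser.new(script)  # was a top-level class in 0.0.5
    dfa    = parser.dfa                                # now built as a Tokn::DFA (see @dfa = Tokn::DFA.new(...) above)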
@@ -1,211 +1,279 @@
  require_relative 'tools'
- req('tokn_const ')
+ req('tokn_const dfa')
 
- # Extracts tokens from a script, given a previously constructed DFA.
- #
- class Tokenizer
- include Tokn
-
- # Construct a tokenizer, given a DFA and some text to process
- #
- def initialize(dfa, text)
- @dfa = dfa
- @text = text
- @lineNumber = 0
- @column = 0
- @cursor = 0
- @tokenHistory = []
- @historyPointer = 0
- end
+ module Tokn
 
- # Determine next token (without reading it)
+ # Extracts tokens from a script, given a previously constructed DFA.
  #
- # Returns Token, or nil if end of input
- #
- def peek
- if !@text
- raise IllegalStateException, "No input text specified"
- end
-
- db = false
- !db || warn("debug printing is on")
- !db || pr("peek, cursor=%d\n",@cursor)
+ class Tokenizer
 
- if @historyPointer == @tokenHistory.size
- if @cursor < @text.length
-
- bestLength = 0
- bestId = UNKNOWN_TOKEN
-
- charOffset = 0
- state = @dfa.startState
- while @cursor + charOffset <= @text.length
- ch = nil
- if @cursor + charOffset < @text.length
- ch = @text[@cursor + charOffset].ord()
- !db || pr(" offset=%d, ch=%d (%s)\n",charOffset,ch,ch.chr)
- end
-
- nextState = nil
-
- # Examine edges leaving this state.
- # If one is labelled with a token id, we don't need to match the character with it;
- # store as best token found if length is longer than previous, or equal to previous
- # with higher id.
+ # Construct a tokenizer
+ #
+ # @param dfa the DFA to use
+ # @param text the text to extract tokens from
+ # @param skipName if not nil, tokens with this name will be skipped
+ #
+ def initialize(dfa, text, skipName = nil)
+ @dfa = dfa
+ @text = text
+ if !text
+ raise ArgumentError, "No text defined"
+ end
+ @skipTokenId = nil
+ if skipName
+ @skipTokenId = dfa.tokenId(skipName)
+ if !@skipTokenId
+ raise ArgumentError, "No token with name "+skipName+" found"
+ end
+ end
+ @lineNumber = 0
+ @column = 0
+ @cursor = 0
+ @tokenHistory = []
+ @historyPointer = 0
+ end
+
+ # Determine next token (without reading it)
+ #
+ # Returns Token, or nil if end of input
+ #
+ def peek
+ # if !@text
+ # raise IllegalStateException, "No input text specified"
+ # end
+
+ db = false
+ !db || warn("debug printing is on")
+ !db || pr("peek, cursor=%d\n",@cursor)
+
+ if @historyPointer == @tokenHistory.size
+ while true # repeat until we find a non-skipped token, or run out of text
+ break if @cursor >= @text.length
 
- # If an edge is labelled with the current character, advance to that state.
+ bestLength = 0
+ bestId = ToknInternal::UNKNOWN_TOKEN
 
- edges = state.edges
- edges.each do |lbl,dest|
- a = lbl.array
- !db || pr(" edge lbl=%s, dest=%s\n",d(lbl),d(dest))
- if a[0] < EPSILON
- newTokenId = edgeLabelToTokenId(a[0])
- !db || pr(" new token id=%d\n",newTokenId)
+ charOffset = 0
+ state = @dfa.startState
+ while @cursor + charOffset <= @text.length
+ ch = nil
+ if @cursor + charOffset < @text.length
+ ch = @text[@cursor + charOffset].ord()
+ !db || pr(" offset=%d, ch=%d (%s)\n",charOffset,ch,ch.chr)
+ end
+
+ nextState = nil
+
+ # Examine edges leaving this state.
+ # If one is labelled with a token id, we don't need to match the character with it;
+ # store as best token found if length is longer than previous, or equal to previous
+ # with higher id.
 
- if (bestLength < charOffset || newTokenId > bestId)
- bestLength, bestId = charOffset, newTokenId
- !db || pr(" making longest found so far\n")
+ # If an edge is labelled with the current character, advance to that state.
+
+ edges = state.edges
+ edges.each do |lbl,dest|
+ a = lbl.array
+ !db || pr(" edge lbl=%s, dest=%s\n",d(lbl),d(dest))
+ if a[0] < ToknInternal::EPSILON
+ newTokenId = ToknInternal::edgeLabelToTokenId(a[0])
+ !db || pr(" new token id=%d\n",newTokenId)
+
+ if (bestLength < charOffset || newTokenId > bestId)
+ bestLength, bestId = charOffset, newTokenId
+ !db || pr(" making longest found so far\n")
+ end
  end
- end
+
+ if ch && lbl.contains?(ch)
+ !db || pr(" setting next state to %s\n",d(dest))
+ nextState = dest
+ break
+ end
+ end
 
- if ch && lbl.contains?(ch)
- !db || pr(" setting next state to %s\n",d(dest))
- nextState = dest
+ if !nextState
  break
  end
- end
-
- if !nextState
- break
+ state = nextState
+ charOffset += 1
+ !db || pr(" advanced to next state\n")
  end
- state = nextState
- charOffset += 1
- !db || pr(" advanced to next state\n")
- end
+
+ if bestId == @skipTokenId
+ @cursor += bestLength
+ next
+ end
+
+ peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
+
+ @tokenHistory.push(peekToken)
+ break # We found a token, so stop
+ end
+ end
+
+ ret = nil
+ if @historyPointer < @tokenHistory.size
+ ret = @tokenHistory[@historyPointer]
+ end
 
- peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
- @tokenHistory.push(peekToken)
- end
+ ret
+ end
+
+
+ # Read next token
+ #
+ # @param tokenName if not nil, the (string) name of the token expected
+ #
+ # @raise TokenizerException if no more tokens,if unrecognized token, or
+ # if token has different than expected name
+ #
+ def read(tokenName = nil)
+ token = peek()
+ if !token
+ raise TokenizerException,"No more tokens"
+ end
+
+ if token.id == ToknInternal::UNKNOWN_TOKEN
+ raise TokenizerException, "Unknown token "+token.inspect
+ end
+
+ if tokenName && tokenName != nameOf(token)
+ raise TokenizerException, "Unexpected token "+token.inspect
+ end
+
+ @historyPointer += 1
+
+ # Advance cursor, line number, column
+
+ tl = token.text.length
+ @cursor += tl
+ tl.times do |i|
+ c = token.text[i]
+ @column += 1
+ if c == "\n"
+ @lineNumber += 1
+ @column = 0
+ end
+ end
+ token
  end
 
- ret = nil
- if @historyPointer < @tokenHistory.size
- ret = @tokenHistory[@historyPointer]
+ # Read next token if it has a particular name
+ #
+ # > tokenName : name to look for
+ # < token read, or nil
+ #
+ def readIf(tokenName)
+ ret = nil
+ token = peek()
+ if token && nameOf(token) == tokenName
+ ret = read()
+ end
+ ret
  end
 
- ret
- end
+ # Read a sequence of tokens
+ # @param seq string of space-delimited token names; if name is '_',
+ # allows any token name in that position
+ # @return array of tokens read
+ #
+ def readSequence(seq)
+ seqNames = seq.split(' ')
+ ret = []
+ seqNames.each do |name|
+ tk = name != '_' ? read(name) : read
+ ret.push(tk)
+ end
+ ret
+ end
 
-
- # Read next token
- #
- # > tokenName : if not nil, the (string) name of the token expected
- #
- # Raises TokenizerException if no more tokens,if unrecognized token, or
- # if token has different than expected name
- #
- def read(tokenName = nil)
- token = peek()
- if !token
- raise TokenizerException,"No more tokens"
+ # Read a sequence of tokens, if they have particular names
+ # @param seq string of space-delimited token names; if name is '_',
+ # allows any token name in that position
+ # @return array of tokens read, or nil if the tokens had different
+ # names (or an end of input was encountered)
+ #
+ def readSequenceIf(seq)
+ ret = []
+ seqNames = seq.split(' ')
+ seqNames.each do |name|
+ tk = peek
+ break if !tk
+ if name != '_' && nameOf(tk) != name
+ break
+ end
+ ret.push(read)
+ end
+
+ if ret.size != seqNames.size
+ unread(ret.size)
+ ret = nil
+ end
+ ret
  end
 
- if token.id == UNKNOWN_TOKEN
- raise TokenizerException, "Unknown token "+token.inspect
- end
 
- if tokenName && tokenName != nameOf(token)
- raise TokenizerException, "Unexpected token "+token.inspect
+ # Determine if another token exists
+ #
+ def hasNext
+ !peek().nil?
  end
 
- @historyPointer += 1
-
- # Advance cursor, line number, column
+ # Get the name of a token
+ # (i.e., the name of the token definition, not its text)
+ #
+ # > token read from this tokenizer
+ #
+ def nameOf(token)
+ @dfa.tokenName(token.id)
+ end
 
- tl = token.text.length
- @cursor += tl
- tl.times do |i|
- c = token.text[i]
- @column += 1
- if c == "\n"
- @lineNumber += 1
- @column = 0
+ # Unread one (or more) previously read tokens
+ #
+ # @raise TokenizerException if attempt to unread token that was never read
+ #
+ def unread(count = 1)
+ if @historyPointer < count
+ raise TokenizerException, "Cannot unread before start"
  end
+ @historyPointer -= count
  end
- token
- end
-
- # Read next token if it has a particular name
- #
- # > tokenName : name to look for
- # < token read, or nil
- #
- def readIf(tokenName)
- ret = nil
- token = peek()
- if token && nameOf(token) == tokenName
- ret = read()
- end
- ret
+
  end
 
- # Determine if another token exists
- #
- def hasNext
- !peek().nil?
- end
 
- # Get the name of a token
- # (i.e., the name of the token definition, not its text)
+ # Tokens read by Tokenizer
  #
- # > token read from this tokenizer
- #
- def nameOf(token)
- @dfa.tokenName(token.id)
- end
-
- # Unread one (or more) previously read tokens
- #
- def unread(count = 1)
- if @historyPointer < count
- raise TokenizerException, "Cannot unread before start"
+ class Token
+
+ attr_reader :text, :lineNumber, :column, :id
+
+ def initialize(id, text, lineNumber, column)
+ @id = id
+ @text = text
+ @lineNumber = lineNumber
+ @column = column
+ end
+
+ def unknown?
+ id == ToknInternal::UNKNOWN_TOKEN
+ end
+
+ # Construct description of token location within text
+ #
+ def inspect
+ s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
+ if !unknown?
+ s = s.ljust(17) + " : " + text
+ end
+ s
  end
- @historyPointer -= count
- end
-
- end
-
-
-
-
- # Tokens read by Tokenizer
- #
- class Token
- include Tokn
-
- attr_reader :text, :lineNumber, :column, :id
-
- def initialize(id, text, lineNumber, column)
- @id = id
- @text = text
- @lineNumber = lineNumber
- @column = column
- end
-
- def unknown?
- id == UNKNOWN_TOKEN
  end
 
- def inspect
- s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
- if !unknown?
- s = s.ljust(17) + " : " + text
- end
- s
+ # Exception class for Tokenizer methods
+ #
+ class TokenizerException < Exception
  end
- end
-
 
- class TokenizerException < Exception
- end
+ end # module Tokn
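Taken together, this hunk moves Tokenizer, Token, and TokenizerException into the Tokn module, qualifies internal constants with ToknInternal::, adds an optional skipName argument so a chosen token can be silently discarded by peek/read, and adds readSequence, readSequenceIf, and unread-by-count. A hedged usage sketch, reusing the hypothetical dfa and the invented ws, sep, and kw token names from the sketch after the first hunk; only methods visible in this diff are used:

    tok = Tokn::Tokenizer.new(dfa, "if , if", "ws")   # tokens named "ws" are skipped by peek/read
    while tok.hasNext
      t = tok.read                                    # raises Tokn::TokenizerException on an unknown token
      puts tok.nameOf(t) + " : " + t.text             # e.g. "kw : if"
    end

On a fresh tokenizer over the same text, readSequence("kw sep kw") would read those three tokens (a '_' entry accepts any name), while readSequenceIf returns nil and unreads whatever it consumed if the upcoming token names do not match.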