tokn 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
+ require_relative 'tools'
+ req('tokn_const code_set dfa_builder state reg_parse')
+
+ # Parses a token definition script, and generates an NFA that
+ # is capable of recognizing and distinguishing between the various
+ # tokens.
+ #
+ # Each line in the script is one of
+ #
+ #   # ...comment... (the # must appear as the first character in the line)
+ #
+ #   <tokenname> ':' <regex>
+ #
+ #
+ # A <tokenname> must be an 'identifier' (alphanumeric, with the first character a letter or '_').
+ # If the first character is '_', the token is treated as an 'anonymous' token; anonymous tokens can
+ # appear in the curly brace portions of subsequent regular expression entries, but do not appear as
+ # tokens in the generated NFA.
+ #
+ class TokenDefParser
+   include Tokn
+
+   attr_reader :dfa
+
+   # Compile a token definition script into a DFA
+   #
+   def initialize(script, createPDF = false)
+     @script = script
+     parseScript
+     if createPDF
+       dfa.startState.generatePDF("tokenizer_dfa")
+     end
+   end
+
+   private
+
+   def parseScript
+     db = false
+
+     nextTokenId = 0
+
+     # List of token entries, including anonymous ones
+     @tokenListBig = []
+
+     # List of token names, excluding anonymous ones
+     tokenListSmall = []
+
+     # Maps token name to token entry
+     @tokenNameMap = {}
+
+     @lines = @script.split("\n")
+
+     @lines.each_with_index do |line, lineNumber|
+
+       line.strip!
+
+       # If the line is empty, or starts with '#', it's a comment
+       if line.length == 0 || line[0] == '#'
+         next
+       end
+
+       if !(line =~ TOKENNAME_EXPR)
+         raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
+       end
+
+       pos = line.index(":")
+
+       tokenName = line[0,pos].strip()
+
+       expr = line[pos+1..-1].strip()
+
+       rex = RegParse.new(expr, @tokenNameMap)
+
+       # Give it the next available token id, if it's not an anonymous token
+       tkId = nil
+       if tokenName[0] != '_'
+         tkId = nextTokenId
+         nextTokenId += 1
+       end
+
+       tkEntry = [tokenName, rex, @tokenListBig.size, tkId]
+
+       !db || pr("token entry: %s\n", d(tkEntry))
+
+       if @tokenNameMap.has_key?(tokenName)
+         raise ParseException, "Duplicate token name: "+line
+       end
+
+
+       @tokenListBig.push(tkEntry)
+       @tokenNameMap[tkEntry[0]] = tkEntry
+
+       if tkId
+         tokenListSmall.push(tokenName)
+       end
+
+       !db || pr(" added token name [%s] to map\n", d(tkEntry[0]))
+
+     end
+
+     combined = combineTokenNFAs()
+     !db || combined.generatePDF("combined")
+
+     dfa = DFABuilder.nfa_to_dfa(combined)
+     !db || dfa.generatePDF("combined_minimized")
+
+     @dfa = DFA.new(tokenListSmall, dfa)
+   end
+
+   # Combine the individual NFAs constructed for the token definitions into
+   # one large NFA, each augmented with an edge labelled with the appropriate
+   # token identifier so the tokenizer can see which token led to the final state.
+   #
+   def combineTokenNFAs
+
+     baseId = 0
+     startState = nil
+
+     @tokenListBig.each do |tokenName, regParse, index, tokenId|
+
+       # Skip anonymous token definitions
+       if !tokenId
+         next
+       end
+
+       oldToNewMap, baseId = regParse.startState.duplicateNFA(baseId)
+
+       dupStart = oldToNewMap[regParse.startState]
+
+       # Transition from the expression's end state (not a final state)
+       # to a new final state, with the transitioning edge
+       # labelled with the token id (actually, a transformed token id, to distinguish
+       # it from character codes)
+       dupEnd = oldToNewMap[regParse.endState]
+
+       dupFinalState = State.new(baseId)
+       baseId += 1
+       dupFinalState.finalState = true
+
+       dupEnd.addEdge(CodeSet.new(tokenIdToEdgeLabel(tokenId)), dupFinalState)
+
+       if !startState
+         startState = dupStart
+       else
+         # Add an epsilon transition from the start state to this expression's start
+         startState.addEdge(CodeSet.new(EPSILON), dupStart)
+       end
+     end
+     startState
+   end
+
+   # Regex for token names preceding regular expressions
+   #
+   TOKENNAME_EXPR = Regexp.new("[_A-Za-z][_A-Za-z0-9]*\\s*:\\s*")
+
+ end
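
A minimal usage sketch for the parser above (hypothetical driver code, not part of this package; it relies only on the constructor and the dfa accessor shown in this hunk):

    # Build a DFA from an inline token definition script; the script syntax
    # is the one documented in the TokenDefParser class comment.
    script = <<-'EOS'
      WS: [\s\t\n]+
      ID: [_a-zA-Z][_a-zA-Z0-9]*
    EOS

    parser = TokenDefParser.new(script)   # pass true as the second argument to also emit a PDF
    dfa = parser.dfa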
@@ -0,0 +1,211 @@
+ require_relative 'tools'
+ req('tokn_const')
+
+ # Extracts tokens from a script, given a previously constructed DFA.
+ #
+ class Tokenizer
+   include Tokn
+
+   # Construct a tokenizer, given a DFA and some text to process
+   #
+   def initialize(dfa, text)
+     @dfa = dfa
+     @text = text
+     @lineNumber = 0
+     @column = 0
+     @cursor = 0
+     @tokenHistory = []
+     @historyPointer = 0
+   end
+
+   # Determine the next token (without reading it)
+   #
+   # Returns a Token, or nil if end of input
+   #
+   def peek
+     if !@text
+       raise IllegalStateException, "No input text specified"
+     end
+
+     db = false
+     !db || warn("debug printing is on")
+     !db || pr("peek, cursor=%d\n", @cursor)
+
+     if @historyPointer == @tokenHistory.size
+       if @cursor < @text.length
+
+         bestLength = 0
+         bestId = UNKNOWN_TOKEN
+
+         charOffset = 0
+         state = @dfa.startState
+         while @cursor + charOffset <= @text.length
+           ch = nil
+           if @cursor + charOffset < @text.length
+             ch = @text[@cursor + charOffset].ord()
+             !db || pr(" offset=%d, ch=%d (%s)\n", charOffset, ch, ch.chr)
+           end
+
+           nextState = nil
+
+           # Examine the edges leaving this state.
+           # If one is labelled with a token id, we don't need to match the character against it;
+           # store it as the best token found so far if its length exceeds the previous best,
+           # or equals it with a higher token id.
+
+           # If an edge is labelled with the current character, advance to that state.
+
+           edges = state.edges
+           edges.each do |lbl, dest|
+             a = lbl.array
+             !db || pr(" edge lbl=%s, dest=%s\n", d(lbl), d(dest))
+             if a[0] < EPSILON
+               newTokenId = edgeLabelToTokenId(a[0])
+               !db || pr(" new token id=%d\n", newTokenId)
+
+               if (bestLength < charOffset || newTokenId > bestId)
+                 bestLength, bestId = charOffset, newTokenId
+                 !db || pr(" making longest found so far\n")
+               end
+             end
+
+             if ch && lbl.contains?(ch)
+               !db || pr(" setting next state to %s\n", d(dest))
+               nextState = dest
+               break
+             end
+           end
+
+           if !nextState
+             break
+           end
+           state = nextState
+           charOffset += 1
+           !db || pr(" advanced to next state\n")
+         end
+
+         peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
+         @tokenHistory.push(peekToken)
+       end
+     end
+
+     ret = nil
+     if @historyPointer < @tokenHistory.size
+       ret = @tokenHistory[@historyPointer]
+     end
+
+     ret
+   end
+
+
+   # Read the next token
+   #
+   # > tokenName : if not nil, the (string) name of the expected token
+   #
+   # Raises TokenizerException if there are no more tokens, if the token is
+   # unrecognized, or if its name differs from the expected one
+   #
+   def read(tokenName = nil)
+     token = peek()
+     if !token
+       raise TokenizerException, "No more tokens"
+     end
+
+     if token.id == UNKNOWN_TOKEN
+       raise TokenizerException, "Unknown token "+token.inspect
+     end
+
+     if tokenName && tokenName != nameOf(token)
+       raise TokenizerException, "Unexpected token "+token.inspect
+     end
+
+     @historyPointer += 1
+
+     # Advance cursor, line number, column
+
+     tl = token.text.length
+     @cursor += tl
+     tl.times do |i|
+       c = token.text[i]
+       @column += 1
+       if c == "\n"
+         @lineNumber += 1
+         @column = 0
+       end
+     end
+     token
+   end
+
+   # Read the next token if it has a particular name
+   #
+   # > tokenName : name to look for
+   # < token read, or nil
+   #
+   def readIf(tokenName)
+     ret = nil
+     token = peek()
+     if token && nameOf(token) == tokenName
+       ret = read()
+     end
+     ret
+   end
+
+   # Determine if another token exists
+   #
+   def hasNext
+     !peek().nil?
+   end
+
+   # Get the name of a token
+   # (i.e., the name of the token definition, not its text)
+   #
+   # > token read from this tokenizer
+   #
+   def nameOf(token)
+     @dfa.tokenName(token.id)
+   end
+
+   # Unread one (or more) previously read tokens
+   #
+   def unread(count = 1)
+     if @historyPointer < count
+       raise TokenizerException, "Cannot unread before start"
+     end
+     @historyPointer -= count
+   end
+
+ end
+
+
+
+
+ # Tokens read by Tokenizer
+ #
+ class Token
+   include Tokn
+
+   attr_reader :text, :lineNumber, :column, :id
+
+   def initialize(id, text, lineNumber, column)
+     @id = id
+     @text = text
+     @lineNumber = lineNumber
+     @column = column
+   end
+
+   def unknown?
+     id == UNKNOWN_TOKEN
+   end
+
+   def inspect
+     s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
+     if !unknown?
+       s = s.ljust(17) + " : " + text
+     end
+     s
+   end
+ end
+
+
+ class TokenizerException < Exception
+ end
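
A sketch of the read loop these classes support (assuming a dfa built as in the previous hunk; hasNext, read, and nameOf are the methods defined above):

    tok = Tokenizer.new(dfa, "speed = 42\n")
    while tok.hasNext
      t = tok.read   # raises TokenizerException on an unrecognized token
      puts tok.nameOf(t) + " -> " + t.text.inspect
    end

Note that peek never advances @historyPointer, and unread simply moves it back over tokens already stored in @tokenHistory, so lookahead and backtracking cost nothing beyond the retained token list.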
@@ -0,0 +1,29 @@
+ # Module containing tokn-related constants and functions
+ #
+ module Tokn
+
+   # Token id if text didn't match any tokens in the DFA
+   UNKNOWN_TOKEN = -1
+
+   # Code for epsilon transitions
+   EPSILON = -1
+
+   # One plus the maximum code represented
+   CODEMAX = 0x110000
+
+   # Minimum code possible (e.g., indicating a token id)
+   CODEMIN = -10000
+
+   # Convert a token id (>= 0) to an edge label value (< 0)
+   #
+   def tokenIdToEdgeLabel(tokenId)
+     EPSILON - 1 - tokenId
+   end
+
+   # Convert an edge label value (< 0) to a token id (>= 0)
+   #
+   def edgeLabelToTokenId(edgeLabel)
+     EPSILON - 1 - edgeLabel
+   end
+
+ end
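
Since EPSILON is -1, both conversions reduce to -2 - x, so the mapping is its own inverse; a quick arithmetic check (in a context that includes Tokn):

    tokenIdToEdgeLabel(0)    # => -2
    tokenIdToEdgeLabel(5)    # => -7
    edgeLabelToTokenId(-7)   # => 5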
data/lib/tokn/tools.rb ADDED
@@ -0,0 +1,186 @@
+ require 'set'
+
+ # Various utility and debug convenience functions.
+ #
+
+ # Perform 'require_relative' on a set of files
+ #
+ # > fileListStr : space-delimited file/path items, without .rb extensions
+ # > subdir      : optional path to files relative to tools.rb
+ #
+ def req(fileListStr, subdir = nil)
+   fileListStr.split(' ').each do |x|
+     if subdir
+       x = File.join(subdir, x)
+     end
+     x += '.rb'
+     require_relative(x)
+   end
+ end
+
+ # Shorthand for printf(...)
+ #
+ def pr(*args)
+   printf(*args)
+ end
+
+
+ # Convert an object to a human-readable string;
+ # should be considered a debug-only feature
+ #
+ def d(arg)
+   if arg.nil?
+     "<nil>"
+   else
+     arg.inspect
+   end
+ end
+
+ # Assert that a value is true. Should be considered a
+ # very temporary, debug-only feature; it is slow and
+ # generates a warning that it is being called.
+ #
+ def myAssert(cond, *msg)
+   oneTimeAlert("warning", 0, "Checking assertion")
+   if not cond
+     if msg.size == 0
+       str = "assertion error"
+     else
+       str = sprintf(*msg)
+     end
+     raise Exception, str
+   end
+ end
+
+
+ # Set the test directory. If d is nil, it is set to home directory + "__test__"
+ #
+ def setTestDir(d = nil)
+   if !d
+     d = File.join(Dir.home, "__test__")
+   end
+   $testDir = d
+ end
+
+ # Get a path within the test directory;
+ # create the test directory if it doesn't exist.
+ #
+ # > relPath : if nil, returns the test directory; else
+ #             returns the test directory joined to this one
+ #
+ def withinTestDir(relPath = nil)
+   if !$testDir
+     raise IllegalStateException, "No test directory has been defined"
+   end
+   if !File.directory?($testDir)
+     Dir.mkdir($testDir)
+   end
+   if relPath
+     File.join($testDir, relPath)
+   else
+     $testDir
+   end
+ end
+
+ # Convert a .dot file (string) to a PDF file "__mygraph__nnn.pdf"
+ # in the test directory.
+ #
+ # It does this by making a system call to the 'dot' utility.
+ #
+ def dotToPDF(dotFile, name = "")
+   gr = dotFile
+   dotPath = withinTestDir(".__mygraph__.dot")
+   writeTextFile(dotPath, gr)
+   destName = withinTestDir("__mygraph__"+name+".pdf")
+   system("dot -Tpdf "+dotPath+" -o "+destName)
+ end
+
+
+ # Get a nice, concise description of the file and line
+ # of some caller within the stack.
+ #
+ # > nSkip : the number of items deep in the call stack to look
+ #
+ def getCallerLocation(nSkip = 2)
+
+   filename = nil
+   linenumber = nil
+
+   if nSkip >= 0 && nSkip < caller.size
+     fi = caller[nSkip]
+
+     # 'path : line number : other'
+     i = fi.index(':')
+     j = nil
+     if i
+       j = fi.index(':', i+1)
+     end
+     if j
+       pth = fi[0,i].split('/')
+       if pth.size > 0
+         filename = pth[-1]
+       end
+       linenumber = fi[i+1, j-i-1].to_i
+     end
+   end
+   if filename && linenumber
+     loc = filename + " ("+linenumber.to_s+")"
+   else
+     loc = "(UNKNOWN LOCATION)"
+   end
+   loc
+ end
+
+ # Set of alert strings that have already been reported
+ # (to avoid printing anything on subsequent invocations)
+ #
+ $AlertStrings = Set.new
+
+ # Print a message, if it hasn't yet been printed;
+ # the message includes the caller's location
+ #
+ # > typeString : e.g., "warning", "unimplemented"
+ # > nSkip      : the number of levels deep that the caller is in the stack
+ # > args       : if present, calls sprintf(...) with these to append to the message
+ #
+ def oneTimeAlert(typeString, nSkip, *args)
+   loc = getCallerLocation(nSkip + 2)
+   s = "*** "+typeString+" " + loc
+   if args.size > 0
+     s2 = sprintf(args[0], *args[1..-1])
+     msg = s + ": " + s2
+   else
+     msg = s
+   end
+
+   if $AlertStrings.add?(msg)
+     puts msg
+   end
+ end
+
+ # Print a 'warning' alert, one time only
+ #
+ def warn(*args)
+   oneTimeAlert("warning", 0, *args)
+ end
+
+ # Print an 'unimplemented' alert, one time only
+ #
+ def unimp(*args)
+   oneTimeAlert("unimplemented", 0, *args)
+ end
+
+ # Write a string to a text file
+ #
+ def writeTextFile(path, contents)
+   File.open(path, "wb") {|f| f.write(contents) }
+ end
+
+ # Read a file's contents, return as a string
+ #
+ def readTextFile(path)
+   contents = nil
+   File.open(path, "rb") {|f| contents = f.read }
+   contents
+ end
+
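
The alert helpers above key each message on its text plus the caller's location, so repeated calls from the same call site print only once. A small sketch (the message text is illustrative):

    3.times { warn("dot utility may be missing") }   # prints a single '*** warning ...' line
    unimp("graph pruning")                           # prints once, tagged 'unimplemented'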
data/lib/tokn.rb ADDED
@@ -0,0 +1 @@
+ require 'tokn/tokenizer'
@@ -0,0 +1,11 @@
+ // Example source file that can be tokenized
+
+ speed = 42 // speed of object
+
+ gravity = -9.80
+
+ title = 'This is a string with \' an escaped delimiter'
+
+ if gravity == 12 {
+ do something
+ }
@@ -0,0 +1,32 @@
+ # Sample token definitions
+
+ # Whitespace includes a comment, which starts with '//' and
+ # extends to the end of the line:
+ #
+ WS: ( [\f\r\s\t\n]+ ) | ( // [^\n]* \n? )
+
+ # An anonymous token, for convenience; a non-empty sequence of digits
+ #
+ _DIG: [0-9]+
+
+ # Double has lower priority than int; we want ints to
+ # be interpreted as ints, not as doubles
+ DBL: \-?(({_DIG}(.{_DIG})?)|.{_DIG})
+
+ INT: \-?{_DIG}
+
+ LBL: '([^'\n]|\\')*'
+
+ ID: [_a-zA-Z][_a-zA-Z0-9]*
+
+ ASSIGN: =
+
+ EQUIV: ==
+
+ IF: if
+ DO: do
+
+ BROP: \{
+
+ BRCL: \}
+
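
A worked example of the priority rules above: the input 42 matches both DBL (via the {_DIG} branch with the fractional part absent) and INT at the same length, and since the tokenizer breaks equal-length ties in favor of the higher token id, the later INT definition wins. For 42.5, only DBL matches all four characters, so the longest-match rule selects DBL.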
data/test/simple.rb ADDED
@@ -0,0 +1,33 @@
+ require 'test/unit'
+ require_relative '../lib/tokn/tools.rb'
+ req('tokenizer dfa')
+
+
+ class Simple
+
+   def dataPath(f)
+     File.dirname(__FILE__)+"/data/"+f
+   end
+
+   setTestDir()
+
+   # Various unit tests for state machines, character range sets, etc.
+
+   def initialize
+     @sampleText = readTextFile(self.dataPath("sampletext.txt"))
+     # @sampleTokens = readTextFile(self.dataPath("sampletokens.txt"))
+   end
+
+   def makeTok
+     dfa = DFA.dfa_from_script_file(self.dataPath("sampletokens.txt"))
+     Tokenizer.new(dfa, @sampleText)
+   end
+
+   def go
+     makeTok
+   end
+ end
+
+
+ s = Simple.new
+ s.go