RubyGems - tokn - Versions diffs - 0.0.4 - Mend

tokn 0.0.4

Files changed (22) hide show

checksums.yaml +7 -0
data/README.txt +194 -0
data/bin/tokncompile +16 -0
data/bin/toknprocess +26 -0
data/figures/sample_dfa.pdf +0 -0
data/lib/tokn/code_set.rb +392 -0
data/lib/tokn/dfa.rb +196 -0
data/lib/tokn/dfa_builder.rb +261 -0
data/lib/tokn/range_partition.rb +233 -0
data/lib/tokn/reg_parse.rb +379 -0
data/lib/tokn/state.rb +320 -0
data/lib/tokn/token_defn_parser.rb +156 -0
data/lib/tokn/tokenizer.rb +211 -0
data/lib/tokn/tokn_const.rb +29 -0
data/lib/tokn/tools.rb +186 -0
data/lib/tokn.rb +1 -0
data/test/data/sampletext.txt +11 -0
data/test/data/sampletokens.txt +32 -0
data/test/simple.rb +33 -0
data/test/test.rb +519 -0
data/test/testcmds +4 -0
metadata +69 -0

data/lib/tokn/reg_parse.rb ADDED Viewed

@@ -0,0 +1,379 @@
+require_relative 'tools'
+req('code_set state')
+# Raised by RegParse when a regular expression script cannot be parsed.
+# NOTE(review): this derives from Exception rather than StandardError, so a
+# bare `rescue` (which only handles StandardError) will not catch it; callers
+# must rescue ParseException (or Exception) explicitly.
+class ParseException < Exception
+end
+# Parses a single regular expression from a string.
+# Produces an NFA with distinguished start and end states
+# (none of these states are marked as final states)
+#
+# Here is the grammar for regular expressions.  Spaces are ignored,
+# and can be liberally sprinkled within the regular expressions to
+# aid readability.  To represent a space, the \s escape sequence must be used.
+# See the file 'sampletokens.txt' for some examples.
+#
+#   Expressions have one of these types:
+#
+#   E : base class
+#   J : a Join expression, formed by concatenating one or more together
+#   Q : a Quantified expression; followed optionally by '*', '+', or '?'
+#   P : a Parenthesized expression, which is optionally surrounded with (), {}, []
+#
+#   E -> J '|' E
+#      | J
+#
+#   J -> Q J
+#      | Q
+#
+#   Q -> P '*'
+#      | P '+'
+#      | P '?'
+#      | P
+#
+#   P -> '(' E ')'
+#      | '{' TOKENNAME '}'
+#      | '[^' SETSEQ ']'     A code not appearing in the set
+#      | '[' SETSEQ ']'
+#      | CHARCODE
+#
+#   SETSEQ -> SET SETSEQ
+#           | SET
+#
+#   SET -> CHARCODE
+#           | CHARCODE '-' CHARCODE
+#
+#   CHARCODE ->
+#            a |  b |  c  ...   any printable except {,},[, etc.
+#        |  \xhh                  hex value from 00...ff
+#        |  \uhhhh                hex value from 0000...ffff (e.g., unicode)
+#        |  \f | \n | \r | \t     formfeed, linefeed, return, tab
+#        |  \s                    a space (' ')
+#        |  \*                    where * is some other non-alphanumeric
+#                                  character that needs to be escaped
+#
+# The parser performs recursive descent parsing;
+# each method returns an NFA represented by
+# a pair of states: the start and end states.
+#
+class RegParse
+  attr_reader :startState, :endState
+  # Construct a parser and perform the parsing
+  # @param script script to parse
+  # @param tokenDefMap if not nil, a map of previously parsed regular expressions
+  #     (mapping names to ids) to be consulted if a curly brace expression appears
+  #     in the script
+  #
+  def initialize(script, tokenDefMap = nil)
+    @script = script.strip
+    @nextStateId = 0
+    @tokenDefMap = tokenDefMap
+    parseScript
+  end
+  def inspect
+    s = "RegParse: #{@script}"
+    s += " start:"+d(@startState)+" end:"+d(@endState)
+    return s
+  end
+  private
+  # Raise a ParseException, with a helpful message indicating
+  # the parser's current location within the string
+  #
+  def abort(msg)
+    # Assume we've already read the problem character
+    i = @cursor - 1
+    s = ''
+    if i > 4
+      s += '...'
+    end
+    s +=  @script[i-3...i] || ""
+    s += ' !!! '
+    s += @script[i...i+3] || ""
+    if i +3 < @script.size
+      s += '...'
+    end
+    raise ParseException, msg + ": "+s
+  end
+  # Read next character as a hex digit
+  #
+  def readHex
+    v = read.upcase.ord
+    if v >= 48 and v < 58
+      return v - 48
+    elsif v >= 65 and v < 71
+      return v - 65 + 10
+    else
+      abort "Missing hex digit"
+    end
+  end
+  NO_ESCAPE_CHARS = Regexp.new("[A-Za-z0-9]")
+  # Parse character definition (CHARCODE) from input
+  #
+  def parseChar
+    c = read
+    val = c.ord
+    if "{}[]*?+|-^()".include?(c) or val <= 0x20
+      abort "Unexpected or unescaped character"
+    end
+    if c == '\\'
+      c = read
+      if "xX".include? c
+        val = (readHex() << 4) | readHex()
+      elsif "uU".include? c
+        val = (readHex() << 12) | (readHex() << 8) | (readHex() << 4) | readHex()
+      else
+        if c == 'f'
+          val = "\f".ord
+        elsif c == 'r'
+          val == "\r".ord
+        elsif c == 'n'
+          val = "\n".ord
+        elsif c == 't'
+          val = "\t".ord
+        elsif c == 's'
+          val = " ".ord
+        else
+          if c =~ NO_ESCAPE_CHARS
+            abort "Unsupported escape sequence ("+c+")"
+          end
+          val = c.ord
+        end
+      end
+    end
+    return val
+  end
+  def parseCharNFA
+    val = parseChar
+    # Construct a pair of states with an edge between them
+    # labelled with this character code
+    sA = newState
+    sB = newState
+    cset = CodeSet.new
+    cset.add(val)
+    sA.addEdge(cset, sB)
+    return [sA,sB]
+  end
+  def dbInfo
+    j = @cursor
+    k = j + 5
+    if k >= @script.size
+      return @script[j..k]+"<<<== end"
+    else
+      return @script[j..k]+"..."
+    end
+  end
+  def parseScript
+    # Set up the input scanner
+    @cursor = 0
+    exp = parseE
+    @startState = exp[0]
+    @endState = exp[1]
+  end
+  def newState
+    s = State.new(@nextStateId)
+    @nextStateId += 1
+    return s
+  end
+  def parseSET
+    u = parseChar
+    v = u+1
+    if readIf('-')
+      v = parseChar() + 1
+      if v <= u
+        abort "Illegal range"
+      end
+    end
+    return u,v
+  end
+  def parseSETSEQ
+    db = false
+    !db || pr("parseSETSEQ\n")
+    read('[')
+    negated = readIf('^')
+    !db || pr(" negated=%s\n",negated)
+    rs = CodeSet.new
+    u,v = parseSET
+    rs.add(u,v)
+    !db || pr(" initial set=%s\n",d(rs))
+    while not readIf(']')
+      u,v = parseSET
+      rs.add(u,v)
+      !db || pr("  added another; %s\n",d(rs))
+    end
+    if negated
+      rs.negate
+      !db || pr(" negated=%s\n",d(rs))
+    end
+    if rs.empty?
+      abort "Empty character range"
+    end
+    sA = newState
+    sB = newState
+    sA.addEdge(rs, sB)
+    return [sA,sB]
+  end
+  TOKENREF_EXPR = Regexp.new('^[_A-Za-z][_A-Za-z0-9]*$')
+  def parseTokenDef
+    read('{')
+    name = ''
+    while !readIf('}')
+      name += read
+    end
+    # pr("name=[%s], TR=[%s], match=[%s]\n",d(name),d(TOKENREF_EXPR),d(name =~ TOKENREF_EXPR))
+    if name  !~ TOKENREF_EXPR
+      abort "Problem with token name"
+    end
+    tokInfo = nil
+    if @tokenDefMap
+      tokInfo = @tokenDefMap[name]
+    end
+    if !tokInfo
+      abort "Undefined token"
+    end
+    rg = tokInfo[1]
+    oldToNewMap, @nextStateId = rg.startState.duplicateNFA(@nextStateId)
+    newStart = oldToNewMap[rg.startState]
+    newEnd = oldToNewMap[rg.endState]
+    [newStart, newEnd]
+  end
+  def parseP
+    ch = peek
+    if ch == '('
+      read
+      e1 = parseE
+      read ')'
+    elsif ch == '{'
+      e1 = parseTokenDef
+    elsif ch == '['
+      e1 = parseSETSEQ
+    else
+      e1 = parseCharNFA
+    end
+    return e1
+   end
+  def parseE
+    e1 = parseJ
+    if readIf('|')
+      e2 = parseE
+      u = newState
+      v = newState
+      u.addEps(e1[0])
+      u.addEps(e2[0])
+      e1[1].addEps(v)
+      e2[1].addEps(v)
+      e1 = [u,v]
+    end
+    return e1
+  end
+  def parseJ
+    e1 = parseQ
+    p = peek
+    if p and not "|)".include? p
+      e2 = parseJ
+      e1[1].addEps(e2[0])
+      e1 = [e1[0],e2[1]]
+    end
+    return e1
+  end
+  def parseQ
+    e1 = parseP
+    p = peek
+    if p == '*'
+      read
+      e1[0].addEps(e1[1])
+      e1[1].addEps(e1[0])
+    elsif p == '+'
+      read
+      e1[1].addEps(e1[0])
+    elsif p == '?'
+      read
+      e1[0].addEps(e1[1])
+      # e1[0].generatePDF("optional")
+    end
+    return e1
+  end
+  def peek(mustExist = false)
+    # skip over any non-linefeed whitespace
+    while @cursor < @script.size && " \t".index(@script[@cursor])
+      @cursor += 1
+    end
+    if mustExist or @cursor < @script.size
+      @script[@cursor]
+    else
+      nil
+    end
+  end
+  def readIf(expChar)
+    r = (peek == expChar)
+    if r
+      read
+    end
+    return r
+  end
+  def read(expChar = nil)
+    ch = peek
+    if ch and ((not expChar) or ch == expChar)
+      @cursor += 1
+      ch
+    else
+      abort 'Unexpected end of input'
+    end
+  end
+end

data/lib/tokn/state.rb ADDED Viewed

@@ -0,0 +1,320 @@
+require 'set'
+require_relative 'tools'
+req 'tokn_const'
+# A state within a state machine (NFA or DFA); also, various utility functions
+# for manipulating state machines.  Observe that a state machine can be
+# referred to by its start state.
+#
+# Each state has a set of directed edges to other states, where each edge is
+# labelled with a CodeSet.
+#
+# It also has a unique id (unique within a particular state machine),
+# and a (boolean) final state flag.
+#
+# For debug purposes, both the state and its edges can be labelled.
+#
+class State
+  include Tokn
+  attr_accessor :id
+  attr_accessor :finalState
+  alias_method :finalState?, :finalState
+  attr_accessor :label
+  # Edges are a list of [label:CharSetRange, dest:State] pairs
+  attr_reader :edges
+  # Produce a readable description of an NFA, for debug purposes
+  #
+  # > st  start state
+  #
+  def self.dumpNFA(st)
+    str = "NFA:\n"
+    map,_,_ = st.reachableStates
+    map.each do |s|
+      str += " "+d(s)+"\n"
+      str += "  edges= "+d(s.edges)+"\n"
+      s.edges.each{ |lbl,dest| str += "   "+d(lbl)+"  ==> "+d(dest)+"\n"}
+    end
+    str
+  end
+  def hash
+    return @id
+  end
+  def eql?(other)
+    return id == other.id
+  end
+  def initialize(id)
+    @edges = []
+    @id = id
+  end
+  def clearEdges
+    @edges.clear
+  end
+  # Add an edge
+  #  codeSet : the character codes to label it with
+  #  destState : destination state
+  #
+  def addEdge(codeSet,destState)
+    @edges.push([codeSet, destState])
+  end
+  # Add a e-transition edge
+  #  destState : destination state
+  #
+  def addEps(destState)
+    addEdge(CodeSet.new(EPSILON), destState)
+  end
+  def inspect
+    name
+  end
+  def name
+    nm = 'S' + d(id)
+    if label
+      nm += ": "+label
+    end
+    nm
+  end
+  # Normalize a state machine.
+  #
+  # For each state:
+  #  [] merge edges that go to a common state
+  #  [] delete edges that have empty labels
+  #  [] sort edges by destination state ids
+  #
+  # > start state
+  #
+  def self.normalizeStates(startState)
+    stateSet, _,_ = startState.reachableStates
+    stateSet.map{|s| s.normalize}
+  end
+  # Generate a PDF of the state machine;
+  # Makes a system call to the dot utility to convert a .dot file to a .pdf
+  #
+  def generatePDF(title = "nfa")
+    stateList = {}
+    startState = self
+    genAux(stateList, startState)
+    g = ""
+    g += "digraph "+title+" {\n"
+    g += " '' [shape=none]\n"
+    stateList.each_value do |s|
+      g += " '" + s.name + "' [shape="
+      if s.finalState?
+        g += "doubleoctagon"
+      else
+        g += "octagon"
+      end
+      g += "]\n"
+    end
+    g += "\n"
+    g += " '' -> '" + startState.name + "'\n"
+    stateList.each_value do |s|
+      s.edges.each do |crs, s2|
+        g += " '"+s.name+"' -> '" + s2.name + "' [label='"
+        g += d(crs)
+        g += "'][fontname=Courier][fontsize=12]\n"
+      end
+    end
+    g += "\n}\n"
+    g.gsub!( /'/, '"' )
+    dotToPDF(g,title)
+  end
+  # Normalize a state
+  #
+  #  [] merge edges that go to a common state
+  #  [] delete edges that have empty labels
+  #  [] sort edges by destination state ids
+  #
+  def normalize()
+    db = false
+    !db || pr("\n\nnormalize state:\n  %s\nedges=\n%s\n",d(self),d(@edges))
+    @edges.sort!{|x,y|
+      label1,dest1 = x
+      label2,dest2 = y
+      dest1.id <=> dest2.id
+    }
+    !db || pr(" sorted edges: %s\n",d(@edges))
+    newEdges = []
+    prevLabel, prevDest = nil,nil
+    edges.each do |label,dest|
+      !db || pr("  processing edge  %s,  %s\n",d(label),d(dest))
+      if prevDest and prevDest.id == dest.id
+        # changed = true
+        !db || pr("    adding set %s to prevLabel %s...\n",d(label),d(prevLabel))
+        prevLabel.addSet(label)
+        !db || pr("    ...now %s\n",d(prevLabel))
+      else
+        if prevDest
+          newEdges.push([prevLabel,prevDest])
+        end
+        # Must start a fresh copy!  Don't want to modify the original label.
+        prevLabel = label.makeCopy()
+        prevDest = dest
+        !db || pr("    pushed onto new edges\n")
+        end
+      end
+      if prevDest
+         newEdges.push([prevLabel,prevDest])
+      end
+    @edges = newEdges
+    !db || pr("edges now: %s\n",d(@edges))
+  end
+  # Duplicate the NFA reachable from this state, possibly with new ids
+  #
+  # > dupBaseId : lowest id to use for duplicate; if nil, uses
+  #     next available id
+  # < [ map of original states => duplicate states;
+  #     1 + highest id in new NFA ]
+  #
+  def duplicateNFA(dupBaseId = nil)
+    oldStates, oldMinId, oldMaxId = reachableStates()
+    dupBaseId ||= oldMaxId
+    oldToNewStateMap = {}
+    oldStates.each do |s|
+      s2 = State.new((s.id - oldMinId) + dupBaseId)
+      s2.finalState = s.finalState?
+      s2.label = s.label
+      oldToNewStateMap[s] = s2
+    end
+    oldStates.each do |s|
+      s2 = oldToNewStateMap[s]
+      s.edges.each{ |lbl,dest|  s2.addEdge(lbl, oldToNewStateMap[dest])}
+    end
+    [oldToNewStateMap, (oldMaxId - oldMinId) + dupBaseId]
+  end
+  # Construct the reverse of the NFA starting at this state
+  # < start state of reversed NFA
+  #
+  def reverseNFA()
+    stateSet, minId, maxId = reachableStates()
+    edgeList = []
+    newStartStateList = []
+    newFinalStateList = []
+    newStateMap = {}
+    stateSet.each do |s|
+      u = State.new(s.id)
+      newStateMap[u.id] = u
+      if s.id == self.id
+        newFinalStateList.push(u)
+        u.finalState = true
+      end
+      if s.finalState?
+        newStartStateList.push(u)
+      end
+      s.edges.each {|lbl, dest| edgeList.push([dest.id, s.id, lbl])}
+    end
+    edgeList.each do |srcId, destId, lbl|
+      srcState = newStateMap[srcId]
+      destState = newStateMap[destId]
+      srcState.addEdge(lbl, destState)
+    end
+    # Create a distinguished start node that points to each of the start nodes
+    w = State.new(maxId)
+    newStartStateList.each {|s| w.addEps(s)}
+    w
+  end
+  # Build set of states reachable from this state
+  #
+  # > list of starting states
+  # < [ set,   set of states reachable from those states
+  #     minId, lowest id in set
+  #     maxId    1 + highest id in set
+  #   ]
+  #
+  def reachableStates()
+    set = Set.new
+    stack = []
+    stack.push(self)
+    maxId = nil
+    minId = nil
+    while !stack.empty?
+      st = stack.pop
+      set.add(st)
+      if !minId || minId > st.id
+        minId = st.id
+      end
+      if !maxId || maxId <= st.id
+        maxId = 1 + st.id
+      end
+      st.edges.each do |lbl, dest|
+        if set.add?(dest)
+          stack.push(dest)
+        end
+      end
+    end
+    [set, minId,  maxId]
+  end
+end
+private
+def genAux(stateList, st)
+  if not stateList.member?(st.name)
+    stateList[st.name] = st
+    st.edges.each {|label, dest| genAux(stateList, dest)}
+  end
+end