RubyGems - tokn - Versions diffs - 0.0.4 - Mend

tokn 0.0.4

Files changed (22) hide show

checksums.yaml +7 -0
data/README.txt +194 -0
data/bin/tokncompile +16 -0
data/bin/toknprocess +26 -0
data/figures/sample_dfa.pdf +0 -0
data/lib/tokn/code_set.rb +392 -0
data/lib/tokn/dfa.rb +196 -0
data/lib/tokn/dfa_builder.rb +261 -0
data/lib/tokn/range_partition.rb +233 -0
data/lib/tokn/reg_parse.rb +379 -0
data/lib/tokn/state.rb +320 -0
data/lib/tokn/token_defn_parser.rb +156 -0
data/lib/tokn/tokenizer.rb +211 -0
data/lib/tokn/tokn_const.rb +29 -0
data/lib/tokn/tools.rb +186 -0
data/lib/tokn.rb +1 -0
data/test/data/sampletext.txt +11 -0
data/test/data/sampletokens.txt +32 -0
data/test/simple.rb +33 -0
data/test/test.rb +519 -0
data/test/testcmds +4 -0
metadata +69 -0

data/lib/tokn/dfa.rb ADDED Viewed

@@ -0,0 +1,196 @@
+require 'json'
+require_relative 'tools'
+req('code_set state')
+# A DFA for tokenizing; includes pointer to a start state, and
+# a list of token names
+#
+class DFA
+  include Tokn
+  # Compile a Tokenizer DFA from a token definition script.
+  # If persistPath is not null, it first checks if the file exists and
+  # if so, assumes it contains (in JSON form) a previously compiled
+  # DFA matching this script, and reads the DFA from it.
+  # Second, if no such file exists, it writes the DFA to it after compilation.
+  #
+  def self.dfa_from_script(script, persistPath = nil)
+    if persistPath and File.exist?(persistPath)
+      return extractDFA(readTextFile(persistPath))
+    end
+    req('token_defn_parser')
+    td = TokenDefParser.new(script)
+    dfa = td.dfa
+    if persistPath
+      writeTextFile(persistPath, dfa.serialize())
+    end
+    dfa
+  end
+  # Similar to dfa_from_script, but reads the script into memory from
+  # the file at scriptPath.
+  #
+  def self.dfa_from_script_file(scriptPath, persistPath = nil)
+    self.dfa_from_script(readTextFile(scriptPath), persistPath)
+  end
+  # Compile a Tokenizer DFA from a text file (that contains a
+  # JSON string)
+  #
+  def self.dfa_from_file(path)
+    dfa_from_json(readTextFile(path))
+  end
+  # Compile a Tokenizer DFA from a JSON string
+  #
+  def self.dfa_from_json(jsonStr)
+    db = false
+    !db|| pr("\n\nextractDFA %s...\n",jsonStr)
+    h = JSON.parse(jsonStr)
+    tNames = h["tokens"]
+    stateInfo = h["states"]
+    !db|| pr("tokens=%s\n",d(tNames))
+    !db|| pr("stateInfo=\n%s\n",d(stateInfo))
+    st = []
+    stateInfo.each_with_index do |(key,val),i|
+      !db|| pr(" creating new state, id=%d\n",i)
+      st.push(State.new(i))
+    end
+    st.each do |s|
+      !db|| pr("proc state %s\n",d(s))
+      finalState, edgeList = stateInfo[s.id]
+      s.finalState = finalState
+      edgeList.each do |edge|
+        label,destState = edge
+        cr = CodeSet.new()
+        cr.setArray(label)
+        s.addEdge(cr, st[destState])
+      end
+    end
+    DFA.new(tNames, st[0])
+  end
+  attr_reader :startState, :tokenNames
+  # Construct a DFA, given a list of token names and a starting state.
+  #
+  def initialize(tokenNameList, startState)
+    @tokenNames = tokenNameList
+    @startState = startState
+  end
+  # Determine the name of a token, given its id.
+  # Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
+  # the tokenId is nil.  Otherwise, assumes tokenId is 0..n, where
+  # n is the number of token names in the DFA.
+  #
+  def tokenName(tokenId)
+    if !tokenId
+      nm = "<EOF>"
+    elsif tokenId == UNKNOWN_TOKEN
+      nm = "<UNKNOWN>"
+    else
+      if tokenId < 0 || tokenId >= tokenNames.size
+        raise IndexError, "No such token id: "+tokenId.to_s
+      end
+      nm = tokenNames[tokenId]
+    end
+    nm
+  end
+  # Serialize this DFA to a JSON string.
+  # The DFA in JSON form has this structure:
+  #
+  #  {
+  #    "tokens" => array of token names (strings)
+  #    "states" => array of states, ordered by id (0,1,..)
+  #  }
+  #
+  # Each state has this format:
+  #  [ finalState (boolean),
+  #   [edge0, edge1, ...]
+  #  ]
+  #
+  # Edge:
+  #  [label, destination id (integer)]
+  #
+  # Labels are arrays of integers, exactly the structure of
+  # a CodeSet array.
+  #
+  def serialize
+    h = {}
+    h["tokens"] =  tokenNames
+    stateSet,_,_ = startState.reachableStates
+    idToStateMap = {}
+    stateSet.each do |st|
+      idToStateMap[st.id] = st
+    end
+    stateList = []
+    nextId = 0
+    idToStateMap.each_pair do |id, st|
+      if nextId != id
+        raise ArgumentError, "unexpected state ids"
+      end
+      nextId += 1
+      stateList.push(st)
+    end
+    if stateList.size == 0
+      raise ArgumentError, "bad states"
+    end
+    if stateList[0] != startState
+      raise ArgumentError, "bad start state"
+    end
+    stateInfo = []
+    stateList.each do |st|
+        stateInfo.push(stateToList(st))
+    end
+    h["states"] = stateInfo
+    JSON.generate(h)
+  end
+  private
+  def stateToList(state)
+    list = []
+    list.push(state.finalState?)
+    ed = []
+    state.edges.each do |lbl, dest|
+      edInfo = [lbl.array, dest.id]
+      ed.push(edInfo)
+    end
+    list.push(ed)
+    list
+  end
+end

data/lib/tokn/dfa_builder.rb ADDED Viewed

@@ -0,0 +1,261 @@
+require_relative 'tools'
+req('tokn_const code_set state range_partition reg_parse')
+# Converts NFAs (nondeterministic, finite state automata) to
+# minimal DFAs.
+#
+# Performs the subset construction algorithm described in
+# (among other placess) http://en.wikipedia.org/wiki/Powerset_construction
+#
+# Also implements an innovative algorithm to partition a set of
+# edge labels into a set that has the property that no two elements
+# have overlapping regions.  This allows us to perform the subset construction
+# (and closure operations) efficiently while supporting large possible character
+# sets (e.g., unicode, which ranges from 0..0x10ffff.  See RangePartition.rb
+# for more details.
+#
+class DFABuilder
+  include Tokn
+  # Convert an NFA to a DFA.
+  #
+  # @param startState the start state of the NFA
+  # @param db if true, generates PDF files for debug purposes, showing various
+  #    steps of the procedure
+  #
+  def self.nfa_to_dfa(startState, db = false)
+    !db || startState.generatePDF("original_nfa")
+    # Reverse this NFA, convert to DFA, then
+    # reverse it, and convert it again.  Apparently this
+    # produces a minimal DFA.
+    rev = startState.reverseNFA()
+    !db || rev.generatePDF("reversed_nfa")
+    bld = DFABuilder.new(rev)
+    dfa = bld.build(true, false)  # partition, but don't normalize
+    !db || dfa.generatePDF("reversed_dfa")
+    rev2 = dfa.reverseNFA()
+    bld = DFABuilder.new(rev2)
+    # Don't regenerate the partition; it is still valid
+    # for this second build process
+    #
+    dfa = bld.build(false, true) # don't partition, but do normalize
+    # If there are edges that contain more than one token identifier,
+    # remove all but the first (i.e. the one with the highest token id)
+    stSet, _, _ = dfa.reachableStates
+    stSet.each do |s|
+      s.edges.each do |lbl, dest|
+        a = lbl.array
+        if !a.size
+          next
+        end
+        primeId = a[0]
+        if primeId >= EPSILON-1
+          next
+        end
+        lbl.difference!(CodeSet.new(primeId+1, EPSILON))
+      end
+    end
+    !db || dfa.generatePDF("minimal_dfa")
+    dfa
+  end
+  # Constructs a builder object
+  #
+  def initialize(nfaStartState)
+    @nextId = 0
+    @nfaStart = nfaStartState
+    # Build a map of nfa state ids => nfa states
+    @nfaStateMap = {}
+    nfas, _, _ = @nfaStart.reachableStates
+    nfas.each {|s| @nfaStateMap[s.id] = s}
+    # Initialize an array of nfa state lists, indexed by dfa state id
+    @nfaStateLists = []
+    # Map of existing DFA states; key is array of NFA state ids
+    @dfaStateMap = {}
+  end
+  # Perform the build algorithm
+  #
+  # @param partition if true, partitions the edge labels into disjoint code sets
+  # @param normalize if true, normalizes the states afterward
+  #
+  def build(partition = true, normalize = true)
+    db = false
+    !partition || partitionEdges(@nfaStart)
+    iset = Set.new
+    iset.add(@nfaStart)
+    epsClosure(iset)
+    @dfaStart,_ = createDFAState(stateSetToIdArray(iset))
+    markedStates = Set.new
+    unmarked = [@dfaStart]
+    until unmarked.empty?
+      dfaState  = unmarked.pop
+      nfaIds = @nfaStateLists[dfaState.id]
+      # map of CodeSet => set of NFA states
+      moveMap = {}
+      nfaIds.each do |nfaId|
+        nfaState = @nfaStateMap[nfaId]
+        nfaState.edges.each do |lbl,dest|
+          if lbl.array[0] == EPSILON
+            next
+          end
+          nfaStates = moveMap[lbl]
+          if !nfaStates
+            nfaStates = Set.new
+            moveMap[lbl] = nfaStates
+          end
+          nfaStates.add(dest)
+        end
+      end
+      moveMap.each_pair do |charRange,nfaStates|
+        # May be better to test if already in set before calc closure; or simply has closure
+        epsClosure(nfaStates)
+        dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
+        if isNew
+          unmarked.push(dfaDestState)
+        end
+        dfaState.addEdge(charRange, dfaDestState)
+      end
+    end
+    if normalize
+      !db || @dfaStart.generatePDF("prior_normalize")
+      !db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
+      State.normalizeStates(@dfaStart)
+      !db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
+      !db || @dfaStart.generatePDF("post_normalize")
+    end
+    @dfaStart
+  end
+  private
+  # Adds a DFA state for a set of NFA states, if one doesn't already exist
+  # for the set
+  # @param nfaStateList a sorted array of NFA state ids
+  # @return a pair [DFA State,
+  #                 created flag (boolean): true if this did not already exist]
+  #
+  def createDFAState(nfaStateList)
+    lst = nfaStateList
+    newState = @nfaStateMap[lst]
+    isNewState = !newState
+    if isNewState
+      newState = State.new(@nextId)
+      # Determine if any of the NFA states were final states
+      newState.finalState = nfaStateList.any?{|id| @nfaStateMap[id].finalState?}
+      if false
+        # Set label of DFA state to show which NFA states produced it
+        # (useful for debugging)
+        newState.label = lst.map {|x| x.to_s}.join(' ')
+      end
+      @nextId += 1
+      @nfaStateMap[lst] = newState
+      @nfaStateLists.push(lst)
+    end
+    return [newState,isNewState]
+  end
+  def stateSetToIdArray(s)
+    s.to_a.map {|x| x.id}.sort
+  end
+  # Calculate the epsilon closure of a set of NFA states
+  # @return a set of states
+  #
+  def epsClosure(stateSet)
+    stk = stateSet.to_a
+    while !stk.empty?
+      s = stk.pop
+      s.edges.each do |lbl,dest|
+        if lbl.contains? EPSILON
+          if stateSet.add?(dest)
+            stk.push(dest)
+          end
+        end
+      end
+    end
+    stateSet
+  end
+  # Modify edges so each is labelled with a disjoint subset
+  # of characters.  See the notes at the start of this class,
+  # as well as RangePartition.rb.
+  #
+  def partitionEdges(startState)
+    db = false
+    par = RangePartition.new
+    stateSet, _, _ = startState.reachableStates
+    stateSet.each do |s|
+      s.edges.each {|lbl,dest| par.addSet(lbl) }
+    end
+    par.prepare
+    stateSet.each do |s|
+      newEdges = []
+      s.edges.each do |lbl, dest|
+        !db||pr(" old edge: %s   => %s\n",d(lbl),d(dest.name))
+        newLbls = par.apply(lbl)
+        newLbls.each {|x| newEdges.push([x, dest]) }
+      end
+      s.clearEdges()
+      newEdges.each do |lbl,dest|
+        !db||pr(" new edge: %s   => %s\n",d(lbl),d(dest.name))
+        s.addEdge(lbl,dest)
+      end
+      !db||pr("\n")
+    end
+  end
+end

data/lib/tokn/range_partition.rb ADDED Viewed

@@ -0,0 +1,233 @@
+require_relative 'tools'
+req('tokn_const code_set')
+# A data structure that transforms a set of CodeSets to a
+# disjoint set of them, such that no two range sets overlap.
+#
+# This is improve the efficiency of the NFA => DFA algorithm,
+# which involves gathering information about what states are
+# reachable on certain characters.  We can't afford to treat each
+# character as a singleton, since the ranges can be quite large.
+# Hence, we want to treat ranges of characters as single entities;
+# this will only work if no two such ranges overlap.
+#
+# It works by starting with a tree whose node is labelled with
+# the maximal superset of character values.  Then, for each edge
+# in the NFA, performs a DFS on this tree, splitting any node that
+# only partially intersects any one set that appears in the edge label.
+# The running time is O(n log k), where n is the size of the NFA, and
+# k is the height of the resulting tree.
+#
+# We encourage k to be small by sorting the NFA edges by their
+# label complexity.
+#
+class RangePartition
+  include Tokn
+  def initialize()
+    # We will build a tree, where each node has a CodeSet
+    # associated with it, and the child nodes (if present)
+    # partition this CodeSet into smaller, nonempty sets.
+    # A tree is represented by a node, where each node is a pair [x,y],
+    # with x the node's CodeSet, and y a list of the node's children.
+    @nextNodeId = 0
+    # Make the root node hold the largest possible CodeSet.
+    # We want to be able to include all the token ids as well.
+    @rootNode = buildNode(CodeSet.new(CODEMIN,CODEMAX))
+    @setsToAdd = Set.new
+    # Add epsilon immediately, so it's always in its own subset
+    addSet(CodeSet.new(EPSILON))
+    @prepared = false
+  end
+  def addSet(s)
+    if @prepared
+      raise IllegalStateException
+    end
+    @setsToAdd.add(s)
+  end
+  def prepare()
+    if @prepared
+      raise IllegalStateException
+    end
+    # Construct partition from previously added sets
+    list = @setsToAdd.to_a
+    # Sort set by cardinality: probably get a more balanced tree
+    # if larger sets are processed first
+    list.sort!{ |x,y| y.cardinality <=> x.cardinality }
+    list.each do |s|
+      addSetAux(s)
+    end
+    @prepared = true
+  end
+  # Generate a .dot file, and from that, a PDF, for debug purposes
+  #
+  def generatePDF(name = "partition")
+    if !@prepared
+      raise IllegalStateException
+    end
+    g = ""
+    g += "digraph "+name+" {\n\n"
+    nodes = []
+    buildNodeList(nodes)
+    nodes.each do |node|
+      g += " '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n"
+    end
+    g += "\n"
+    nodes.each do |node|
+      node.children.each do |ch|
+        g += " '" + d(node) + "' -> '" + d(ch) + "'\n"
+      end
+    end
+    g += "\n}\n"
+    g.gsub!( /'/, '"' )
+    dotToPDF(g,name)
+  end
+  # Apply the partition to a CodeSet
+  #
+  # > s CodeSet
+  # < array of subsets from the partition whose union equals s
+  #   (this array will be the single element s if no partitioning was necessary)
+  #
+  def apply(s)
+    if !@prepared
+      raise IllegalStateException
+    end
+    list = []
+    s2 = s.makeCopy
+    applyAux(@rootNode, s2, list)
+    # Sort the list of subsets by their first elements
+    list.sort! { |x,y| x.array[0] <=> y.array[0] }
+    list
+  end
+  private
+  def applyAux(n, s, list)
+    db = false
+    !db||pr("applyAux to set[%s], node=[%s]\n",d(s),d(n.set))
+    if n.children.empty?
+      # # Verify that this set equals the input set
+      # myAssert(s.eql? n.set)
+      list.push(s)
+    else
+      n.children.each do |m|
+        s1 = s.intersect(m.set)
+        !db||pr(" child set=[%s], intersection=[%s]\n",d(m.set),d(s1))
+        if s1.empty?
+          next
+        end
+        applyAux(m, s1, list)
+        !db||pr("  subtracting child set [%s] from s=[%s]\n",d(m.set),d(s))
+        s = s.difference(m.set)
+        !db||pr("  subtracted child set, now [%s]\n",d(s))
+        if s.empty?
+          break
+        end
+      end
+    end
+  end
+  def buildNode(rangeSet)
+    id = @nextNodeId
+    @nextNodeId += 1
+    n = RPNode.new(id, rangeSet, [])
+    n
+  end
+  def buildNodeList(list, root = nil)
+    if not root
+      root = @rootNode
+    end
+    list.push(root)
+    root.children.each do |x|
+      buildNodeList(list, x)
+    end
+  end
+  # Add a set to the tree, extending the tree as necessary to
+  # maintain a (disjoint) partition
+  #
+  def addSetAux(s, n = @rootNode)
+    #
+    # The algorithm is this:
+    #
+    # add (s, n)    # add set s to node n; s must be subset of n.set
+    #   if n.set = s, return
+    #   if n is leaf:
+    #     x = n.set - s
+    #     add x,y as child sets of n
+    #   else
+    #     for each child m of n:
+    #       t = intersect of m.set and s
+    #       if t is nonempty, add(t, m)
+    #
+    if n.set.eql? s
+      return
+    end
+    if n.children.empty?
+      x = n.set.difference(s)
+      n.children.push buildNode(x)
+      n.children.push buildNode(s)
+    else
+      n.children.each do |m|
+        t = m.set.intersect(s)
+        addSetAux(t,m) unless t.empty?
+      end
+    end
+  end
+end
+# A node within a RangePartition tree
+#
+class RPNode
+  attr_accessor :id, :set, :children
+  def initialize(id, set, children)
+    @id = id
+    @set = set
+    @children = children
+  end
+  def inspect
+    return 'N' + id.to_s
+  end
+end