tokn 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/tokn/dfa.rb ADDED
@@ -0,0 +1,196 @@
1
+ require 'json'
2
+ require_relative 'tools'
3
+ req('code_set state')
4
+
5
+
6
# A DFA for tokenizing; includes a pointer to a start state, and
# a list of token names.
#
class DFA

  include Tokn

  attr_reader :startState, :tokenNames

  # Compile a Tokenizer DFA from a token definition script.
  #
  # If persistPath is not nil, it first checks if the file exists and
  # if so, assumes it contains (in JSON form) a previously compiled
  # DFA matching this script, and reads the DFA from it.
  # Otherwise the script is compiled and, if persistPath was given, the
  # resulting DFA is written to that file.
  #
  # @param script       token definition script (text)
  # @param persistPath  optional path of the JSON cache file
  # @return the DFA
  #
  def self.dfa_from_script(script, persistPath = nil)
    # A previously persisted DFA takes precedence over recompiling.
    # (This used to call an undefined 'extractDFA'; the JSON loader
    # is dfa_from_json.)
    if persistPath && File.exist?(persistPath)
      return dfa_from_json(readTextFile(persistPath))
    end

    # Loaded lazily: the parser is only needed when actually compiling
    req('token_defn_parser')

    dfa = TokenDefParser.new(script).dfa

    writeTextFile(persistPath, dfa.serialize) if persistPath

    dfa
  end

  # Similar to dfa_from_script, but reads the script into memory from
  # the file at scriptPath.
  #
  # @param scriptPath   path of the token definition script
  # @param persistPath  optional path of the JSON cache file
  # @return the DFA
  #
  def self.dfa_from_script_file(scriptPath, persistPath = nil)
    dfa_from_script(readTextFile(scriptPath), persistPath)
  end

  # Build a Tokenizer DFA from a text file (that contains a
  # JSON string, in the format produced by serialize)
  #
  # @param path  path of the JSON file
  # @return the DFA
  #
  def self.dfa_from_file(path)
    dfa_from_json(readTextFile(path))
  end

  # Build a Tokenizer DFA from a JSON string (in the format produced
  # by serialize)
  #
  # @param jsonStr  JSON text
  # @return the DFA
  #
  def self.dfa_from_json(jsonStr)
    db = false

    pr("\n\nextractDFA %s...\n", jsonStr) if db

    h = JSON.parse(jsonStr)

    tNames = h["tokens"]
    stateInfo = h["states"]

    pr("tokens=%s\n", d(tNames)) if db
    pr("stateInfo=\n%s\n", d(stateInfo)) if db

    # First pass: create every state, so that edges can refer to
    # their destination states by id
    st = Array.new(stateInfo.size) do |i|
      pr(" creating new state, id=%d\n", i) if db
      State.new(i)
    end

    # Second pass: fill in the final flags and the edges
    st.each do |s|
      pr("proc state %s\n", d(s)) if db

      finalState, edgeList = stateInfo[s.id]
      s.finalState = finalState
      edgeList.each do |label, destState|
        cr = CodeSet.new
        cr.setArray(label)
        s.addEdge(cr, st[destState])
      end
    end

    # The start state is the one with id 0
    DFA.new(tNames, st[0])
  end

  # Construct a DFA, given a list of token names and a starting state.
  #
  def initialize(tokenNameList, startState)
    @tokenNames = tokenNameList
    @startState = startState
  end

  # Determine the name of a token, given its id.
  # Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
  # the tokenId is nil.  Otherwise, tokenId must be in 0...n, where
  # n is the number of token names in the DFA.
  #
  # @param tokenId  token id, UNKNOWN_TOKEN, or nil
  # @return name of the token
  # @raise IndexError if tokenId is out of range
  #
  def tokenName(tokenId)
    return "<EOF>" unless tokenId
    return "<UNKNOWN>" if tokenId == UNKNOWN_TOKEN

    if tokenId < 0 || tokenId >= tokenNames.size
      raise IndexError, "No such token id: #{tokenId}"
    end
    tokenNames[tokenId]
  end

  # Serialize this DFA to a JSON string.
  # The DFA in JSON form has this structure:
  #
  #  {
  #    "tokens" => array of token names (strings)
  #    "states" => array of states, ordered by id (0,1,..)
  #  }
  #
  # Each state has this format:
  #  [ finalState (boolean),
  #    [edge0, edge1, ...]
  #  ]
  #
  # Edge:
  #  [label, destination id (integer)]
  #
  # Labels are arrays of integers, exactly the structure of
  # a CodeSet array.
  #
  # @return JSON string
  # @raise ArgumentError if there are no reachable states, if the state
  #   ids are not exactly 0,1,..,n-1, or if state 0 is not the start state
  #
  def serialize
    stateSet, _, _ = startState.reachableStates

    idToStateMap = {}
    stateSet.each { |st| idToStateMap[st.id] = st }

    # Order the states by id explicitly, rather than relying on the
    # order in which reachableStates happened to return them
    ids = idToStateMap.keys.sort
    unless ids == (0...ids.size).to_a
      raise ArgumentError, "unexpected state ids"
    end

    stateList = ids.map { |id| idToStateMap[id] }

    raise ArgumentError, "bad states" if stateList.empty?
    raise ArgumentError, "bad start state" if stateList[0] != startState

    h = {}
    h["tokens"] = tokenNames
    h["states"] = stateList.map { |st| stateToList(st) }

    JSON.generate(h)
  end

  private

  # Convert a state to its serializable form:
  #  [finalState (boolean), [[label array, destination id], ...]]
  #
  def stateToList(state)
    edgeInfo = state.edges.map { |lbl, dest| [lbl.array, dest.id] }
    [state.finalState?, edgeInfo]
  end

end
195
+
196
+
@@ -0,0 +1,261 @@
1
+ require_relative 'tools'
2
+ req('tokn_const code_set state range_partition reg_parse')
3
+
4
# Converts NFAs (nondeterministic, finite state automata) to
# minimal DFAs.
#
# Performs the subset construction algorithm described in
# (among other places) http://en.wikipedia.org/wiki/Powerset_construction
#
# Also implements an innovative algorithm to partition a set of
# edge labels into a set that has the property that no two elements
# have overlapping regions.  This allows us to perform the subset construction
# (and closure operations) efficiently while supporting large possible character
# sets (e.g., unicode, which ranges from 0..0x10ffff).  See RangePartition.rb
# for more details.
#
class DFABuilder

  include Tokn

  # Convert an NFA to a DFA.
  #
  # @param startState the start state of the NFA
  # @param db if true, generates PDF files for debug purposes, showing various
  #   steps of the procedure
  # @return the start state of the resulting DFA
  #
  def self.nfa_to_dfa(startState, db = false)

    startState.generatePDF("original_nfa") if db

    # Reverse this NFA, convert to DFA, then
    # reverse it, and convert it again.  Apparently this
    # produces a minimal DFA.

    rev = startState.reverseNFA
    rev.generatePDF("reversed_nfa") if db

    # First build: partition, but don't normalize
    dfa = DFABuilder.new(rev).build(true, false)
    dfa.generatePDF("reversed_dfa") if db

    rev2 = dfa.reverseNFA

    # Don't regenerate the partition; it is still valid
    # for this second build process.
    # Second build: don't partition, but do normalize
    dfa = DFABuilder.new(rev2).build(false, true)

    # If there are edges that contain more than one token identifier,
    # remove all but the first (i.e. the one with the highest token id)

    stSet, _, _ = dfa.reachableStates
    stSet.each do |s|
      s.edges.each do |lbl, _dest|
        a = lbl.array

        # Skip empty labels.  (The previous test, '!a.size', could never
        # be true, since every Integer -- including 0 -- is truthy.)
        next if a.empty?

        primeId = a[0]
        next if primeId >= EPSILON - 1

        lbl.difference!(CodeSet.new(primeId + 1, EPSILON))
      end
    end

    dfa.generatePDF("minimal_dfa") if db

    dfa
  end

  # Constructs a builder object
  #
  # @param nfaStartState the start state of the NFA to convert
  #
  def initialize(nfaStartState)
    @nextId = 0
    @nfaStart = nfaStartState

    # Build a map of nfa state ids => nfa states
    @nfaStateMap = {}
    nfas, _, _ = @nfaStart.reachableStates
    nfas.each { |s| @nfaStateMap[s.id] = s }

    # Initialize an array of nfa state lists, indexed by dfa state id
    @nfaStateLists = []

    # Map of existing DFA states; key is array of NFA state ids
    @dfaStateMap = {}
  end

  # Perform the build algorithm
  #
  # @param partition if true, partitions the edge labels into disjoint code sets
  # @param normalize if true, normalizes the states afterward
  # @return the start state of the DFA
  #
  def build(partition = true, normalize = true)
    db = false

    partitionEdges(@nfaStart) if partition

    # The DFA start state corresponds to the epsilon closure of
    # the NFA start state
    iset = Set.new
    iset.add(@nfaStart)
    epsClosure(iset)

    @dfaStart, _ = createDFAState(stateSetToIdArray(iset))

    # DFA states whose outgoing edges have not yet been constructed
    unmarked = [@dfaStart]

    until unmarked.empty?
      dfaState = unmarked.pop

      nfaIds = @nfaStateLists[dfaState.id]

      # map of CodeSet => set of NFA states
      moveMap = {}

      nfaIds.each do |nfaId|
        @nfaStateMap[nfaId].edges.each do |lbl, dest|
          # Epsilon edges are accounted for by the closure step
          next if lbl.array[0] == EPSILON

          (moveMap[lbl] ||= Set.new).add(dest)
        end
      end

      moveMap.each_pair do |charRange, nfaStates|
        # May be better to test if already in set before calc closure; or simply has closure
        epsClosure(nfaStates)
        dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
        unmarked.push(dfaDestState) if isNew
        dfaState.addEdge(charRange, dfaDestState)
      end
    end

    if normalize
      @dfaStart.generatePDF("prior_normalize") if db

      pr("Normalizing states for:\n\n%s\n", State.dumpNFA(@dfaStart)) if db
      State.normalizeStates(@dfaStart)
      pr("After normalizing:\n\n%s\n", State.dumpNFA(@dfaStart)) if db
      @dfaStart.generatePDF("post_normalize") if db
    end

    @dfaStart
  end

  private

  # Adds a DFA state for a set of NFA states, if one doesn't already exist
  # for the set
  #
  # @param nfaStateList a sorted array of NFA state ids
  # @return a pair [DFA State,
  #   created flag (boolean): true if this did not already exist]
  #
  def createDFAState(nfaStateList)
    # DFA states are kept in their own map (@dfaStateMap), rather than
    # being mixed in with the NFA states of @nfaStateMap
    existing = @dfaStateMap[nfaStateList]
    return [existing, false] if existing

    newState = State.new(@nextId)
    @nextId += 1

    # Determine if any of the NFA states were final states
    newState.finalState = nfaStateList.any? { |id| @nfaStateMap[id].finalState? }

    # For debugging, it can be useful to label the DFA state to show
    # which NFA states produced it:
    #   newState.label = nfaStateList.map { |x| x.to_s }.join(' ')

    @dfaStateMap[nfaStateList] = newState
    # Index of this push equals the new state's id
    @nfaStateLists.push(nfaStateList)

    [newState, true]
  end

  # Convert a set of states to a sorted array of their ids
  #
  def stateSetToIdArray(s)
    s.to_a.map { |x| x.id }.sort
  end

  # Calculate the epsilon closure of a set of NFA states.
  # The set is extended in place.
  #
  # @param stateSet set of NFA states
  # @return the (modified) set of states
  #
  def epsClosure(stateSet)
    stk = stateSet.to_a
    until stk.empty?
      s = stk.pop
      s.edges.each do |lbl, dest|
        next unless lbl.contains? EPSILON
        # add? returns nil if dest was already in the set
        stk.push(dest) if stateSet.add?(dest)
      end
    end
    stateSet
  end

  # Modify edges so each is labelled with a disjoint subset
  # of characters.  See the notes at the start of this class,
  # as well as RangePartition.rb.
  #
  # @param startState start state of the automaton whose edges are to
  #   be partitioned
  #
  def partitionEdges(startState)

    db = false

    par = RangePartition.new

    stateSet, _, _ = startState.reachableStates

    # Collect every edge label, then build the partition from them
    stateSet.each do |s|
      s.edges.each { |lbl, _dest| par.addSet(lbl) }
    end

    par.prepare

    stateSet.each do |s|
      # Gather the replacement edges before touching the state's edge list
      newEdges = []
      s.edges.each do |lbl, dest|
        pr(" old edge: %s => %s\n", d(lbl), d(dest.name)) if db
        par.apply(lbl).each { |x| newEdges.push([x, dest]) }
      end
      s.clearEdges

      newEdges.each do |lbl, dest|
        pr(" new edge: %s => %s\n", d(lbl), d(dest.name)) if db
        s.addEdge(lbl, dest)
      end
      pr("\n") if db
    end

  end

end
260
+
261
+
@@ -0,0 +1,233 @@
1
+ require_relative 'tools'
2
+ req('tokn_const code_set')
3
+
4
+
5
# A data structure that transforms a set of CodeSets to a
# disjoint set of them, such that no two range sets overlap.
#
# This is to improve the efficiency of the NFA => DFA algorithm,
# which involves gathering information about what states are
# reachable on certain characters.  We can't afford to treat each
# character as a singleton, since the ranges can be quite large.
# Hence, we want to treat ranges of characters as single entities;
# this will only work if no two such ranges overlap.
#
# It works by starting with a tree whose node is labelled with
# the maximal superset of character values.  Then, for each edge
# in the NFA, performs a DFS on this tree, splitting any node that
# only partially intersects any one set that appears in the edge label.
# The running time is O(n log k), where n is the size of the NFA, and
# k is the height of the resulting tree.
#
# We encourage k to be small by sorting the NFA edges by their
# label complexity.
#
class RangePartition
  include Tokn

  def initialize
    # We will build a tree, where each node has a CodeSet
    # associated with it, and the child nodes (if present)
    # partition this CodeSet into smaller, nonempty sets.
    #
    # Each node of the tree is an RPNode, holding the node's CodeSet
    # and a list of the node's children.

    @nextNodeId = 0

    # Make the root node hold the largest possible CodeSet.
    # We want to be able to include all the token ids as well.
    @rootNode = buildNode(CodeSet.new(CODEMIN, CODEMAX))

    @setsToAdd = Set.new

    # Add epsilon immediately, so it's always in its own subset
    addSet(CodeSet.new(EPSILON))

    @prepared = false
  end

  # Register a CodeSet to be taken into account when the partition
  # is constructed.  Must be called before prepare.
  #
  def addSet(s)
    raise IllegalStateException if @prepared
    @setsToAdd.add(s)
  end

  # Construct the partition from the previously added sets.
  # May only be called once.
  #
  def prepare
    raise IllegalStateException if @prepared

    pending = @setsToAdd.to_a

    # Sort sets by cardinality: probably get a more balanced tree
    # if larger sets are processed first
    pending.sort! { |a, b| b.cardinality <=> a.cardinality }

    pending.each { |set| addSetAux(set) }

    @prepared = true
  end

  # Generate a .dot file, and from that, a PDF, for debug purposes.
  # The partition must have been prepared.
  #
  def generatePDF(name = "partition")
    raise IllegalStateException unless @prepared

    nodes = []
    buildNodeList(nodes)

    # Single quotes are used while assembling the text, and are
    # converted to double quotes at the end
    g = "digraph " + name + " {\n\n"

    nodes.each do |node|
      g << (" '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n")
    end

    g << "\n"
    nodes.each do |node|
      node.children.each do |ch|
        g << (" '" + d(node) + "' -> '" + d(ch) + "'\n")
      end
    end

    g << "\n}\n"
    g.gsub!( /'/, '"' )

    dotToPDF(g, name)
  end

  # Apply the partition to a CodeSet
  #
  # > s CodeSet
  # < array of subsets from the partition whose union equals s
  #   (this array will be the single element s if no partitioning was necessary)
  #
  def apply(s)
    raise IllegalStateException unless @prepared

    pieces = []
    # Work on a copy of the caller's set
    applyAux(@rootNode, s.makeCopy, pieces)

    # Sort the list of subsets by their first elements
    pieces.sort! { |a, b| a.array[0] <=> b.array[0] }

    pieces
  end

  private

  # Recursive helper for apply: appends to list the pieces of s
  # found at the leaves beneath node n
  #
  def applyAux(n, s, list)
    db = false

    pr("applyAux to set[%s], node=[%s]\n", d(s), d(n.set)) if db

    if n.children.empty?
      # At a leaf, s is taken as one of the pieces
      list.push(s)
    else
      remaining = s
      n.children.each do |child|
        overlap = remaining.intersect(child.set)
        pr(" child set=[%s], intersection=[%s]\n", d(child.set), d(overlap)) if db

        next if overlap.empty?

        applyAux(child, overlap, list)

        pr(" subtracting child set [%s] from s=[%s]\n", d(child.set), d(remaining)) if db
        remaining = remaining.difference(child.set)
        pr(" subtracted child set, now [%s]\n", d(remaining)) if db

        # Nothing left to distribute among the other children
        break if remaining.empty?
      end
    end
  end

  # Create a new childless node for a CodeSet, giving it the next
  # available node id
  #
  def buildNode(rangeSet)
    node = RPNode.new(@nextNodeId, rangeSet, [])
    @nextNodeId += 1
    node
  end

  # Append the nodes of the subtree at root (default: the whole tree)
  # to list, each parent preceding its children
  #
  def buildNodeList(list, root = nil)
    root ||= @rootNode
    list.push(root)
    root.children.each { |child| buildNodeList(list, child) }
  end

  # Add a set to the tree, extending the tree as necessary to
  # maintain a (disjoint) partition
  #
  # The algorithm is this:
  #
  # add (s, n)    # add set s to node n; s must be subset of n.set
  #   if n.set = s, return
  #   if n is leaf:
  #     x = n.set - s
  #     add x and s as child sets of n
  #   else
  #     for each child m of n:
  #       t = intersect of m.set and s
  #       if t is nonempty, add(t, m)
  #
  def addSetAux(s, n = @rootNode)
    return if n.set.eql? s

    if n.children.empty?
      # Split the leaf into the part outside s, and s itself
      n.children.push buildNode(n.set.difference(s))
      n.children.push buildNode(s)
    else
      n.children.each do |m|
        t = m.set.intersect(s)
        addSetAux(t, m) unless t.empty?
      end
    end
  end

end
214
+
215
# A node within a RangePartition tree.
#
# Holds the node's id, the set associated with the node, and the
# list of the node's child nodes.
#
class RPNode

  attr_accessor :id, :set, :children

  # Construct a node from its id, its set, and its list of children
  #
  def initialize(id, set, children)
    @id, @set, @children = id, set, children
  end

  # Short display form: the letter 'N' followed by the node's id
  #
  def inspect
    "N#{id}"
  end

end
232
+
233
+