tokn 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/lib/tokn/dfa.rb ADDED
@@ -0,0 +1,196 @@
1
+ require 'json'
2
+ require_relative 'tools'
3
+ req('code_set state')
4
+
5
+
6
+ # A DFA for tokenizing; includes pointer to a start state, and
7
+ # a list of token names
8
+ #
9
+ class DFA
10
+
11
+ include Tokn
12
+
13
# Compile a Tokenizer DFA from a token definition script.
#
# If persistPath is given and a file already exists at that path, the file
# is assumed to contain (in JSON form) a previously compiled DFA matching
# this script, and the DFA is read from it instead of compiling the script.
# If persistPath is given but no such file exists, the script is compiled
# and the resulting DFA is written (serialized) to persistPath.
#
# @param script       text of the token definition script
# @param persistPath  optional path of the JSON cache file (nil = no caching)
# @return the DFA
#
def self.dfa_from_script(script, persistPath = nil)
  # A previously persisted DFA takes precedence over recompiling.
  # (The JSON loader is dfa_from_json; the name extractDFA used here
  # previously is not defined in this class.)
  if persistPath && File.exist?(persistPath)
    return dfa_from_json(readTextFile(persistPath))
  end

  # The parser is only requested when we actually have to compile.
  req('token_defn_parser')

  dfa = TokenDefParser.new(script).dfa

  writeTextFile(persistPath, dfa.serialize) if persistPath

  dfa
end
36
+
37
# Same as dfa_from_script, except that the script text is first read
# into memory from the file at scriptPath.
#
# @param scriptPath   path of the token definition script file
# @param persistPath  optional path handed on to dfa_from_script
#
def self.dfa_from_script_file(scriptPath, persistPath = nil)
  scriptText = readTextFile(scriptPath)
  dfa_from_script(scriptText, persistPath)
end
43
+
44
# Build a Tokenizer DFA from a text file whose content is the JSON
# string form of a DFA.
#
# @param path  path of the file holding the JSON string
#
def self.dfa_from_file(path)
  jsonText = readTextFile(path)
  dfa_from_json(jsonText)
end
50
+
51
# Build a Tokenizer DFA from a JSON string.
#
# The parsed JSON must provide a "tokens" entry (the token names) and a
# "states" entry in which element i describes the state with id i as
#   [finalState, [[label, destination id], ...]]
# State 0 becomes the start state of the returned DFA.
#
def self.dfa_from_json(jsonStr)
  db = false

  pr("\n\nextractDFA %s...\n",jsonStr) if db

  parsed = JSON.parse(jsonStr)
  tNames = parsed["tokens"]
  stateInfo = parsed["states"]

  if db
    pr("tokens=%s\n",d(tNames))
    pr("stateInfo=\n%s\n",d(stateInfo))
  end

  # Pass 1: create every state up front, so that edges built in pass 2
  # can refer to destination states with higher ids.
  states = []
  stateInfo.size.times do |i|
    pr(" creating new state, id=%d\n",i) if db
    states << State.new(i)
  end

  # Pass 2: fill in the final flag and the outgoing edges of each state.
  states.each do |state|
    pr("proc state %s\n",d(state)) if db

    isFinal, edgeList = stateInfo[state.id]
    state.finalState = isFinal

    edgeList.each do |label, destId|
      codeSet = CodeSet.new
      codeSet.setArray(label)
      state.addEdge(codeSet, states[destId])
    end
  end

  DFA.new(tNames, states.first)
end
88
+
89
+ attr_reader :startState, :tokenNames
90
+
91
# Construct a DFA from a list of token names and a starting state.
# Both values are stored as given and exposed through the
# tokenNames / startState readers.
#
def initialize(tokenNameList, startState)
  @tokenNames, @startState = tokenNameList, startState
end
97
+
98
# Look up the name of a token from its id.
#
# Returns "<EOF>" when tokenId is nil (or false) and "<UNKNOWN>" when it
# equals UNKNOWN_TOKEN. Any other id must be a valid index into
# tokenNames.
#
# @raise [IndexError] if tokenId is negative or not less than the number
#   of token names
#
def tokenName(tokenId)
  return "<EOF>" unless tokenId
  return "<UNKNOWN>" if tokenId == UNKNOWN_TOKEN

  outOfRange = tokenId < 0 || tokenId >= tokenNames.size
  raise IndexError, "No such token id: "+tokenId.to_s if outOfRange

  tokenNames[tokenId]
end
116
+
117
# Serialize this DFA to a JSON string.
# The DFA in JSON form has this structure:
#
#   {
#     "tokens" => array of token names (strings)
#     "states" => array of states, ordered by id (0,1,..)
#   }
#
# Each state has this format:
#   [ finalState (boolean),
#     [edge0, edge1, ...]
#   ]
#
# Edge:
#   [label, destination id (integer)]
#
# Labels are arrays of integers, exactly the structure of
# a CodeSet array.
#
# @raise [ArgumentError] if no states are reachable, if the reachable
#   state ids are not exactly 0..n-1, or if state 0 is not the start state
#
def serialize
  stateSet, _, _ = startState.reachableStates

  idToStateMap = {}
  stateSet.each { |st| idToStateMap[st.id] = st }

  raise ArgumentError, "bad states" if idToStateMap.empty?

  # Sort the ids explicitly: the order in which reachableStates visits the
  # states is not necessarily id order, and the output must be indexed by id.
  ids = idToStateMap.keys.sort
  unless ids == (0...ids.size).to_a
    raise ArgumentError, "unexpected state ids"
  end

  stateList = ids.map { |id| idToStateMap[id] }

  # dfa_from_json treats state 0 as the start state.
  raise ArgumentError, "bad start state" if stateList[0] != startState

  h = {}
  h["tokens"] = tokenNames
  h["states"] = stateList.map { |st| stateToList(st) }

  JSON.generate(h)
end
177
+
178
+ private
179
+
180
# Convert a state to the list form used by serialize:
#   [finalState flag, [[label array, destination id], ...]]
#
def stateToList(state)
  isFinal = state.finalState?
  edgeInfo = state.edges.map { |lbl, dest| [lbl.array, dest.id] }
  [isFinal, edgeInfo]
end
193
+
194
+ end
195
+
196
+
@@ -0,0 +1,261 @@
1
+ require_relative 'tools'
2
+ req('tokn_const code_set state range_partition reg_parse')
3
+
4
+ # Converts NFAs (nondeterministic, finite state automata) to
5
+ # minimal DFAs.
6
+ #
7
+ # Performs the subset construction algorithm described in
8
+ # (among other places) http://en.wikipedia.org/wiki/Powerset_construction
9
+ #
10
+ # Also implements an innovative algorithm to partition a set of
11
+ # edge labels into a set that has the property that no two elements
12
+ # have overlapping regions. This allows us to perform the subset construction
13
+ # (and closure operations) efficiently while supporting large possible character
14
+ # sets (e.g., unicode, which ranges from 0..0x10ffff). See RangePartition.rb
15
+ # for more details.
16
+ #
17
+ class DFABuilder
18
+
19
+ include Tokn
20
+
21
+
22
# Convert an NFA to a DFA.
#
# The NFA is reversed and converted to a DFA, and that result is reversed
# and converted again; this reverse/determinize-twice scheme (Brzozowski's
# algorithm) is what yields a minimal DFA.
#
# @param startState the start state of the NFA
# @param db if true, generates PDF files for debug purposes, showing various
#     steps of the procedure
# @return the start state of the resulting DFA
#
def self.nfa_to_dfa(startState, db = false)
  startState.generatePDF("original_nfa") if db

  rev = startState.reverseNFA
  rev.generatePDF("reversed_nfa") if db

  # First pass: partition the edge labels, but don't normalize
  dfa = DFABuilder.new(rev).build(true, false)
  dfa.generatePDF("reversed_dfa") if db

  # Second pass: the partition from the first pass is still valid, so
  # don't regenerate it; do normalize the states this time
  rev2 = dfa.reverseNFA
  dfa = DFABuilder.new(rev2).build(false, true)

  # If an edge label contains more than one token identifier, keep only the
  # first array element (per the original author's note, the one with the
  # highest token id; the id encoding is defined outside this method).
  stSet, _, _ = dfa.reachableStates
  stSet.each do |s|
    s.edges.each do |lbl, _dest|
      a = lbl.array

      # Nothing to trim on an empty label. (`!a.size` is never true in
      # Ruby because 0 is truthy, so test for emptiness explicitly.)
      next if a.empty?

      primeId = a[0]

      # Labels starting at or above EPSILON-1 are skipped
      next if primeId >= EPSILON-1

      lbl.difference!(CodeSet.new(primeId+1, EPSILON))
    end
  end

  dfa.generatePDF("minimal_dfa") if db

  dfa
end
77
+
78
+
79
+
80
# Construct a builder for the NFA that starts at nfaStartState.
#
def initialize(nfaStartState)
  @nfaStart = nfaStartState
  @nextId = 0

  # Map of nfa state ids => nfa states, for every state reachable
  # from the start state
  reachable, _, _ = @nfaStart.reachableStates
  @nfaStateMap = {}
  reachable.each { |st| @nfaStateMap[st.id] = st }

  # Array of nfa state lists, indexed by dfa state id
  @nfaStateLists = []

  # Map of existing DFA states; key is array of NFA state ids
  @dfaStateMap = {}
end
97
+
98
# Perform the build algorithm (subset construction).
#
# @param partition if true, partitions the edge labels into disjoint code sets
# @param normalize if true, normalizes the states afterward
# @return the start state of the constructed DFA
#
def build(partition = true, normalize = true)
  db = false

  partitionEdges(@nfaStart) if partition

  # The DFA start state corresponds to the epsilon closure of the NFA start
  iset = Set.new
  iset.add(@nfaStart)
  epsClosure(iset)

  @dfaStart, _ = createDFAState(stateSetToIdArray(iset))

  # DFA states whose outgoing edges have not been generated yet
  unmarked = [@dfaStart]

  until unmarked.empty?
    dfaState = unmarked.pop

    # map of CodeSet => set of NFA states reachable on that label
    moveMap = {}

    @nfaStateLists[dfaState.id].each do |nfaId|
      @nfaStateMap[nfaId].edges.each do |lbl, dest|
        # Epsilon moves are accounted for by epsClosure, not here
        next if lbl.array[0] == EPSILON

        (moveMap[lbl] ||= Set.new).add(dest)
      end
    end

    moveMap.each_pair do |charRange, nfaStates|
      # May be better to test if already in set before calc closure; or simply has closure
      epsClosure(nfaStates)
      dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
      unmarked.push(dfaDestState) if isNew
      dfaState.addEdge(charRange, dfaDestState)
    end
  end

  if normalize
    if db
      @dfaStart.generatePDF("prior_normalize")
      pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
    end
    State.normalizeStates(@dfaStart)
    if db
      pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
      @dfaStart.generatePDF("post_normalize")
    end
  end

  @dfaStart
end
165
+
166
+ private
167
+
168
# Adds a DFA state for a set of NFA states, if one doesn't already exist
# for the set.
#
# Existing DFA states are looked up in @dfaStateMap (keyed by the array of
# NFA state ids), so @nfaStateMap keeps holding only NFA id => NFA state
# entries.
#
# @param nfaStateList a sorted array of NFA state ids
# @return a pair [DFA State,
#                 created flag (boolean): true if this did not already exist]
#
def createDFAState(nfaStateList)
  existing = @dfaStateMap[nfaStateList]
  return [existing, false] if existing

  newState = State.new(@nextId)
  @nextId += 1

  # The DFA state is final if any of its NFA states is final
  newState.finalState = nfaStateList.any? { |id| @nfaStateMap[id].finalState? }

  # Debugging aid: label the DFA state with the NFA states that produced it
  #   newState.label = nfaStateList.map { |x| x.to_s }.join(' ')

  @dfaStateMap[nfaStateList] = newState
  # Record the NFA ids at index == the new DFA state's id
  @nfaStateLists.push(nfaStateList)

  [newState, true]
end
199
+
200
# Convert a collection of states to a sorted array of their ids.
#
def stateSetToIdArray(s)
  ids = s.to_a.map(&:id)
  ids.sort
end
203
+
204
# Calculate the epsilon closure of a set of NFA states.
# The given set is extended in place with every state that can be reached
# through edges whose label contains EPSILON.
#
# @param stateSet set of NFA states (modified)
# @return the same set, now closed under epsilon moves
#
def epsClosure(stateSet)
  pending = stateSet.to_a
  until pending.empty?
    state = pending.pop
    state.edges.each do |lbl, dest|
      next unless lbl.contains?(EPSILON)

      # add? returns nil for states already in the set, so each state
      # is expanded at most once
      pending.push(dest) if stateSet.add?(dest)
    end
  end
  stateSet
end
221
+
222
# Modify edges so each is labelled with a disjoint subset
# of characters: every edge label reachable from startState is fed to a
# RangePartition, and each edge is then replaced by one edge per piece
# that the partition yields for its label (same destination).
#
def partitionEdges(startState)
  db = false

  par = RangePartition.new
  stateSet, _, _ = startState.reachableStates

  # Phase 1: register every label, then build the partition
  stateSet.each do |s|
    s.edges.each { |lbl, _dest| par.addSet(lbl) }
  end
  par.prepare

  # Phase 2: rebuild each state's edges from the partitioned labels
  stateSet.each do |s|
    newEdges = s.edges.flat_map do |lbl, dest|
      pr(" old edge: %s => %s\n",d(lbl),d(dest.name)) if db
      par.apply(lbl).map { |piece| [piece, dest] }
    end
    s.clearEdges

    newEdges.each do |lbl, dest|
      pr(" new edge: %s => %s\n",d(lbl),d(dest.name)) if db
      s.addEdge(lbl, dest)
    end
    pr("\n") if db
  end
end
257
+
258
+
259
+ end
260
+
261
+
@@ -0,0 +1,233 @@
1
+ require_relative 'tools'
2
+ req('tokn_const code_set')
3
+
4
+
5
+ # A data structure that transforms a set of CodeSets to a
6
+ # disjoint set of them, such that no two range sets overlap.
7
+ #
8
+ # This is to improve the efficiency of the NFA => DFA algorithm,
9
+ # which involves gathering information about what states are
10
+ # reachable on certain characters. We can't afford to treat each
11
+ # character as a singleton, since the ranges can be quite large.
12
+ # Hence, we want to treat ranges of characters as single entities;
13
+ # this will only work if no two such ranges overlap.
14
+ #
15
+ # It works by starting with a tree whose node is labelled with
16
+ # the maximal superset of character values. Then, for each edge
17
+ # in the NFA, performs a DFS on this tree, splitting any node that
18
+ # only partially intersects any one set that appears in the edge label.
19
+ # The running time is O(n log k), where n is the size of the NFA, and
20
+ # k is the height of the resulting tree.
21
+ #
22
+ # We encourage k to be small by sorting the NFA edges by their
23
+ # label complexity.
24
+ #
25
+ class RangePartition
26
+ include Tokn
27
+
28
# Set up an empty, unprepared partition.
#
# The partition is a tree: each node (see buildNode) has a CodeSet
# associated with it, and its child nodes (if present) partition this
# CodeSet into smaller, nonempty sets.
#
def initialize
  @nextNodeId = 0
  @prepared = false
  @setsToAdd = Set.new

  # Make the root node hold the largest possible CodeSet.
  # We want to be able to include all the token ids as well.
  @rootNode = buildNode(CodeSet.new(CODEMIN,CODEMAX))

  # Add epsilon immediately, so it's always in its own subset
  addSet(CodeSet.new(EPSILON))
end
50
+
51
# Register a CodeSet to be included when the partition is prepared.
# Raises IllegalStateException if the partition has already been prepared.
#
def addSet(s)
  raise IllegalStateException if @prepared

  @setsToAdd.add(s)
end
57
+
58
# Construct the partition from the previously added sets.
# May only be called once; raises IllegalStateException if the partition
# has already been prepared.
#
def prepare
  raise IllegalStateException if @prepared

  # Process sets in order of decreasing cardinality: probably get a more
  # balanced tree if larger sets are processed first
  ordered = @setsToAdd.to_a.sort { |x,y| y.cardinality <=> x.cardinality }
  ordered.each { |s| addSetAux(s) }

  @prepared = true
end
77
+
78
+
79
# Generate a .dot description of the partition tree and hand it to
# dotToPDF, for debug purposes.
# Raises IllegalStateException if the partition has not been prepared.
#
# @param name name of the graph; also passed to dotToPDF
#
def generatePDF(name = "partition")
  raise IllegalStateException unless @prepared

  nodes = []
  buildNodeList(nodes)

  g = "digraph "+name+" {\n\n"

  # One box per node, labelled with its set
  nodes.each do |node|
    g << (" '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n")
  end

  g << "\n"

  # One arrow from each node to each of its children
  nodes.each do |node|
    node.children.each do |ch|
      g << (" '" + d(node) + "' -> '" + d(ch) + "'\n")
    end
  end

  g << "\n}\n"

  # The text above is written with single quotes; dot gets double quotes
  g.gsub!( /'/, '"' )

  dotToPDF(g,name)
end
108
+
109
+
110
# Apply the partition to a CodeSet.
# Raises IllegalStateException if the partition has not been prepared.
#
# > s CodeSet
# < array of subsets from the partition whose union equals s, sorted by
#   their first elements
#   (this array will be the single element s if no partitioning was necessary)
#
def apply(s)
  raise IllegalStateException unless @prepared

  pieces = []
  # Work on a copy of s; the pieces are collected by applyAux
  applyAux(@rootNode, s.makeCopy, pieces)

  pieces.sort! { |x,y| x.array[0] <=> y.array[0] }
  pieces
end
130
+
131
+
132
+ private
133
+
134
# Recursive helper for apply: collects into list the pieces of set s
# found at the leaves below node n.
#
# At a leaf, s itself is the piece. At an inner node, s is intersected
# with each child's set in turn; nonempty intersections are handled
# recursively and then removed from what is left of s, stopping early
# once nothing is left.
#
def applyAux(n, s, list)
  db = false

  pr("applyAux to set[%s], node=[%s]\n",d(s),d(n.set)) if db

  if n.children.empty?
    list.push(s)
  else
    remaining = s
    n.children.each do |child|
      overlap = remaining.intersect(child.set)
      pr(" child set=[%s], intersection=[%s]\n",d(child.set),d(overlap)) if db

      next if overlap.empty?

      applyAux(child, overlap, list)

      pr(" subtracting child set [%s] from s=[%s]\n",d(child.set),d(remaining)) if db
      remaining = remaining.difference(child.set)
      pr(" subtracted child set, now [%s]\n",d(remaining)) if db

      break if remaining.empty?
    end
  end
end
163
+
164
# Create a new childless tree node for rangeSet, giving it the next
# unused node id.
#
def buildNode(rangeSet)
  nodeId = @nextNodeId
  @nextNodeId = nodeId + 1
  RPNode.new(nodeId, rangeSet, [])
end
170
+
171
# Append to list the node root and all nodes below it, each parent
# before its children. Starts from the tree's root node when root is
# not given.
#
def buildNodeList(list, root = nil)
  root ||= @rootNode
  list.push(root)
  root.children.each { |child| buildNodeList(list, child) }
end
180
+
181
# Add a set to the tree, extending the tree as necessary to
# maintain a (disjoint) partition.
#
# The algorithm is this:
#
#   add (s, n)    # add set s to node n; s must be subset of n.set
#     if n.set = s, return
#     if n is leaf:
#       x = n.set - s
#       add x and s as child sets of n
#     else
#       for each child m of n:
#         t = intersect of m.set and s
#         if t is nonempty, add(t, m)
#
def addSetAux(s, n = @rootNode)
  # Node already represents exactly this set; nothing to split
  return if n.set.eql? s

  if n.children.empty?
    # Split the leaf into "the rest" and s (in that order)
    remainder = n.set.difference(s)
    n.children.push(buildNode(remainder), buildNode(s))
  else
    # Push the relevant part of s down into every child it overlaps
    n.children.each do |m|
      t = m.set.intersect(s)
      addSetAux(t,m) unless t.empty?
    end
  end
end
212
+
213
+ end
214
+
215
# A node within a RangePartition tree: an id, the set associated with
# the node, and the list of its child nodes.
#
class RPNode
  attr_accessor :id, :set, :children

  def initialize(id, set, children)
    @id, @set, @children = id, set, children
  end

  # Short form for debug output: 'N' followed by the node id
  def inspect
    'N' + id.to_s
  end
end
232
+
233
+