tokn 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,379 @@
1
+ require_relative 'tools'
2
+ req('code_set state')
3
+
4
# Raised when a regular expression script cannot be parsed.
#
# Derives from StandardError (not Exception) so that an ordinary
# `rescue` / `rescue => e` clause, which only handles StandardError
# and its subclasses, is able to catch it.
class ParseException < StandardError
end
6
+
7
+ # Parses a single regular expression from a string.
8
+ # Produces an NFA with distinguished start and end states
9
+ # (none of these states are marked as final states)
10
+ #
11
+ # Here is the grammar for regular expressions. Spaces are ignored,
12
+ # and can be liberally sprinkled within the regular expressions to
13
+ # aid readability. To represent a space, the \s escape sequence must be used.
14
+ # See the file 'sampletokens.txt' for some examples.
15
+ #
16
+ # Expressions have one of these types:
17
+ #
18
+ # E : base class
19
+ # J : a Join expression, formed by concatenating one or more together
20
+ # Q : a Quantified expression; followed optionally by '*', '+', or '?'
21
+ # P : a Parenthesized expression, which is optionally surrounded with (), {}, []
22
+ #
23
+ # E -> J '|' E
24
+ # | J
25
+ #
26
+ # J -> Q J
27
+ # | Q
28
+ #
29
+ # Q -> P '*'
30
+ # | P '+'
31
+ # | P '?'
32
+ # | P
33
+ #
34
+ # P -> '(' E ')'
35
+ # | '{' TOKENNAME '}'
36
+ # | '[^' SETSEQ ']' A code not appearing in the set
37
+ # | '[' SETSEQ ']'
38
+ # | CHARCODE
39
+ #
40
+ # SETSEQ -> SET SETSEQ
41
+ # | SET
42
+ #
43
+ # SET -> CHARCODE
44
+ # | CHARCODE '-' CHARCODE
45
+ #
46
+ # CHARCODE ->
47
+ # a | b | c ... any printable except {,},[, etc.
48
+ # | \xhh hex value from 00...ff
49
+ # | \uhhhh hex value from 0000...ffff (e.g., unicode)
50
+ # | \f | \n | \r | \t formfeed, linefeed, return, tab
51
+ # | \s a space (' ')
52
+ # | \* where * is some other non-alphabetic
53
+ # character that needs to be escaped
54
+ #
55
+ # The parser performs recursive descent parsing;
56
+ # each method returns an NFA represented by
57
+ # a pair of states: the start and end states.
58
+ #
59
+ class RegParse
60
+
61
+ attr_reader :startState, :endState
62
+
63
# Build a parser for a script and parse it immediately.
#
# @param script the regular expression text; surrounding whitespace is stripped
# @param tokenDefMap optional map consulted when a curly brace expression
#   appears in the script (it is looked up by token name); may be nil
#
def initialize(script, tokenDefMap = nil)
  @tokenDefMap = tokenDefMap
  @nextStateId = 0
  @script = script.strip
  parseScript
end
75
+
76
+
77
# Describe the parser: the script text followed by the start and end states.
def inspect
  "RegParse: #{@script}" + " start:" + d(@startState) + " end:" + d(@endState)
end
82
+
83
+ private
84
+
85
# Raise a ParseException, with a helpful message indicating
# the parser's current location within the string.
#
# The message is msg followed by up to three characters of context on
# either side of a ' !!! ' marker; '...' is added on a side when the script
# continues beyond the displayed context.
#
# @param msg description of the problem
# @raise [ParseException] always
#
def abort(msg)
  # Assume we've already read the problem character, so it sits one
  # position behind the cursor. Clamp at zero: a negative index would make
  # String#[] count from the end of the script and show unrelated text.
  i = [@cursor - 1, 0].max
  from = [i - 3, 0].max

  s = ''
  s += '...' if from > 0
  s += @script[from...i] || ""
  s += ' !!! '
  s += @script[i...i + 3] || ""
  s += '...' if i + 3 < @script.size
  raise ParseException, msg + ": " + s
end
103
+
104
# Read the next character and interpret it as a hex digit (case-insensitive).
#
# @return the digit's value, 0..15
# Calls abort with "Missing hex digit" if the character is not 0-9 or A-F.
#
def readHex
  code = read.upcase.ord
  case code
  when 48..57          # '0'..'9'
    code - 48
  when 65..70          # 'A'..'F'
    code - 65 + 10
  else
    abort "Missing hex digit"
  end
end
116
+
117
+
118
# Characters that may not follow a backslash unless they form one of the
# recognized escape sequences.
NO_ESCAPE_CHARS = Regexp.new("[A-Za-z0-9]")

# Parse character definition (CHARCODE) from input.
#
# Accepts a plain character, or a backslash escape:
#   \xhh, \uhhhh   hex character code
#   \f \r \n \t    formfeed, return, linefeed, tab
#   \s             space
#   \<other>       the character itself, provided it is not alphanumeric
#
# @return the character code (an integer)
# Calls abort for an unescaped special character, a character with
# code <= 0x20, or an unsupported alphanumeric escape.
#
def parseChar
  c = read
  val = c.ord

  if "{}[]*?+|-^()".include?(c) or val <= 0x20
    abort "Unexpected or unescaped character"
  end
  return val unless c == '\\'

  c = read
  case c
  when 'x', 'X'
    (readHex() << 4) | readHex()
  when 'u', 'U'
    (readHex() << 12) | (readHex() << 8) | (readHex() << 4) | readHex()
  when 'f'
    "\f".ord
  when 'r'
    # This used to be a comparison (val == ...), which left val holding
    # the backslash's code instead of the carriage return.
    "\r".ord
  when 'n'
    "\n".ord
  when 't'
    "\t".ord
  when 's'
    " ".ord
  else
    abort "Unsupported escape sequence (" + c + ")" if c =~ NO_ESCAPE_CHARS
    c.ord
  end
end
162
+
163
+
164
# Parse a single character code and build a two-state NFA for it:
# a pair of new states joined by one edge labelled with that code.
#
# @return [start state, end state]
#
def parseCharNFA
  code = parseChar

  label = CodeSet.new
  label.add(code)

  from = newState
  to = newState
  from.addEdge(label, to)
  [from, to]
end
177
+
178
+
179
+
180
# Debug helper: the (up to) six characters of the script starting at the
# cursor, followed by "..." or, when that window reaches the end of the
# script, by "<<<== end".
def dbInfo
  last = @cursor + 5
  suffix = last >= @script.size ? "<<<== end" : "..."
  @script[@cursor..last] + suffix
end
189
+
190
# Parse the whole script as an expression (E), starting from the first
# character, and remember the resulting NFA's start and end states.
def parseScript
  # Set up the input scanner
  @cursor = 0

  first, last = parseE
  @startState = first
  @endState = last
end
198
+
199
# Create a State with the next unused id, then advance the id counter.
def newState
  State.new(@nextStateId).tap { @nextStateId += 1 }
end
204
+
205
# Parse a SET: a single character code, or a range 'a-b' of codes.
#
# @return [low, high] where high is one past the last code in the set
# Calls abort with "Illegal range" if the range's end precedes its start.
#
def parseSET
  low = parseChar
  return [low, low + 1] unless readIf('-')

  high = parseChar() + 1
  abort "Illegal range" if high <= low
  [low, high]
end
216
+
217
# Parse a bracketed set expression, '[' SETSEQ ']' or '[^' SETSEQ ']',
# and build a two-state NFA whose single edge is labelled with the
# resulting (possibly negated) code set.
#
# @return [start state, end state]
# Calls abort with "Empty character range" if the final set is empty.
#
def parseSETSEQ
  read('[')
  negated = readIf('^')

  codes = CodeSet.new
  # At least one SET is required; keep reading SETs until the closing bracket
  loop do
    low, high = parseSET
    codes.add(low, high)
    break if readIf(']')
  end

  codes.negate if negated
  abort "Empty character range" if codes.empty?

  from = newState
  to = newState
  from.addEdge(codes, to)
  [from, to]
end
251
+
252
# A legal token name: a letter or underscore followed by letters, digits
# or underscores. Anchored with \A and \z so the WHOLE name must match;
# ^ and $ would only anchor a single line of a multi-line name.
TOKENREF_EXPR = /\A[_A-Za-z][_A-Za-z0-9]*\z/

# Parse a token reference, '{' TOKENNAME '}', and return a fresh copy of
# the NFA previously parsed for that token.
#
# The name is looked up in @tokenDefMap; element [1] of the entry found
# must respond to startState and endState. The referenced NFA is
# duplicated using ids starting at @nextStateId, and @nextStateId is
# advanced to the value returned by duplicateNFA.
#
# @return [start state, end state] of the duplicate
# Calls abort with "Problem with token name" for a malformed name and
# with "Undefined token" when no map was supplied or the name is unknown.
#
def parseTokenDef
  read('{')
  name = ''
  name += read until readIf('}')

  abort "Problem with token name" if name !~ TOKENREF_EXPR

  tokInfo = @tokenDefMap && @tokenDefMap[name]
  abort "Undefined token" unless tokInfo
  rg = tokInfo[1]

  oldToNewMap, @nextStateId = rg.startState.duplicateNFA(@nextStateId)

  [oldToNewMap[rg.startState], oldToNewMap[rg.endState]]
end
282
+
283
+
284
# Parse a P expression by dispatching on the next character:
#   '('  parenthesized expression
#   '{'  token reference
#   '['  character set
#   else a single character code
#
# @return [start state, end state]
#
def parseP
  case peek
  when '('
    read
    inner = parseE
    read ')'
    inner
  when '{'
    parseTokenDef
  when '['
    parseSETSEQ
  else
    parseCharNFA
  end
end
299
+
300
+
301
# Parse an E expression: J, or J '|' E.
#
# For an alternation, two new states are created: one with e-transitions
# to the start of each alternative, and one reached by e-transitions
# from the end of each alternative.
#
# @return [start state, end state]
#
def parseE
  left = parseJ
  return left unless readIf('|')

  right = parseE

  fork = newState
  join = newState
  fork.addEps(left[0])
  fork.addEps(right[0])
  left[1].addEps(join)
  right[1].addEps(join)
  [fork, join]
end
316
+
317
# Parse a J expression: Q, or Q followed by another J (concatenation).
#
# Concatenation continues until the input ends or the next character is
# '|' or ')'. The pieces are joined with an e-transition from the end of
# the first to the start of the rest.
#
# @return [start state, end state]
#
def parseJ
  head = parseQ
  upcoming = peek
  return head if !upcoming || "|)".include?(upcoming)

  tail = parseJ
  head[1].addEps(tail[0])
  [head[0], tail[1]]
end
328
+
329
# Parse a Q expression: P optionally followed by '*', '+' or '?'.
#
# The quantifier is implemented by adding e-transitions between the
# start and end states of P:
#   '*'  start -> end and end -> start
#   '+'  end -> start
#   '?'  start -> end
#
# @return [start state, end state]
#
def parseQ
  pair = parseP
  first, last = pair[0], pair[1]

  case peek
  when '*'
    read
    first.addEps(last)
    last.addEps(first)
  when '+'
    read
    last.addEps(first)
  when '?'
    read
    first.addEps(last)
  end
  pair
end
347
+
348
+
349
# Look at the next significant character without consuming it.
#
# Spaces and tabs at the cursor are skipped first (the cursor is advanced
# past them). Returns the character at the cursor, or nil when the cursor
# is at the end of the script. When mustExist is true the character at the
# cursor is returned without the end-of-script check.
#
def peek(mustExist = false)
  # skip over any non-linefeed whitespace
  @cursor += 1 while @cursor < @script.size && " \t".index(@script[@cursor])

  return nil unless mustExist || @cursor < @script.size
  @script[@cursor]
end
360
+
361
# Consume the next character only if it equals expChar.
#
# @return true if the character matched (and was consumed), false otherwise
#
def readIf(expChar)
  return false unless peek == expChar

  read
  true
end
368
+
369
# Consume and return the next significant character.
#
# @param expChar if not nil, the character that must appear next
# @return the character read
# Calls abort with 'Unexpected end of input' when the script is exhausted,
# and with a message naming the expected and actual characters when
# expChar is given but a different character is found (previously this
# case was also reported as end of input). The cursor is not advanced
# when the read fails.
#
def read(expChar = nil)
  ch = peek
  abort 'Unexpected end of input' unless ch
  if expChar && ch != expChar
    abort "Expected '" + expChar + "' but found '" + ch + "'"
  end
  @cursor += 1
  ch
end
378
+ end
379
+
data/lib/tokn/state.rb ADDED
@@ -0,0 +1,320 @@
1
+ require 'set'
2
+ require_relative 'tools'
3
+ req 'tokn_const'
4
+
5
+
6
+ # A state within a state machine (NFA or DFA); also, various utility functions
7
+ # for manipulating state machines. Observe that a state machine can be
8
+ # referred to by its start state.
9
+ #
10
+ # Each state has a set of directed edges to other states, where each edge is
11
+ # labelled with a CodeSet.
12
+ #
13
+ # It also has a unique id (unique within a particular state machine),
14
+ # and a (boolean) final state flag.
15
+ #
16
+ # For debug purposes, both the state and its edges can be labelled.
17
+ #
18
+ class State
19
+ include Tokn
20
+
21
+ attr_accessor :id
22
+ attr_accessor :finalState
23
+ alias_method :finalState?, :finalState
24
+ attr_accessor :label
25
+
26
+ # Edges are a list of [label:CodeSet, dest:State] pairs
27
+ attr_reader :edges
28
+
29
# Produce a readable description of an NFA, for debug purposes.
#
# Lists every state reachable from st, each followed by its edge list
# and one line per edge.
#
# > st start state
# < the description, as a string
#
def self.dumpNFA(st)
  states, = st.reachableStates
  states.inject("NFA:\n") do |out, s|
    edgeLines = s.edges.map { |lbl, dest| " " + d(lbl) + " ==> " + d(dest) + "\n" }
    out + " " + d(s) + "\n" + " edges= " + d(s.edges) + "\n" + edgeLines.join
  end
end
43
+
44
# Hash code of a state: its id.
def hash
  @id
end
47
+
48
# Two states are eql? when their ids are equal.
def eql?(other)
  id == other.id
end
51
+
52
# Create a state with the given id and no edges.
def initialize(id)
  @id = id
  @edges = []
end
56
+
57
# Remove every edge leaving this state; the edge list itself is emptied
# in place.
def clearEdges
  @edges.clear
end
60
+
61
# Append an edge to this state.
# codeSet : the character codes to label it with
# destState : destination state
#
def addEdge(codeSet, destState)
  @edges << [codeSet, destState]
end
68
+
69
# Add an e-transition edge: an edge whose label is a CodeSet built
# from EPSILON.
# destState : destination state
#
def addEps(destState)
  epsilonLabel = CodeSet.new(EPSILON)
  addEdge(epsilonLabel, destState)
end
75
+
76
# A state is inspected as its name.
def inspect
  name
end
79
+
80
# Display name of the state: 'S' followed by its id and, if the state
# has a label, ": " and the label.
def name
  base = 'S' + d(id)
  label ? base + ": " + label : base
end
87
+
88
# Normalize a state machine by calling normalize on every state
# reachable from the start state.
#
# > start state
# < list of the values returned by each state's normalize
#
def self.normalizeStates(startState)
  reachable = startState.reachableStates.first
  reachable.map(&:normalize)
end
101
+
102
+
103
# Generate a PDF of the state machine starting at this state.
#
# Builds a dot-language description of the states collected by genAux
# (one node per state, final states drawn as double octagons, one
# labelled arrow per edge) and hands it to dotToPDF along with the title.
#
def generatePDF(title = "nfa")
  stateList = {}
  genAux(stateList, self)

  g = "digraph " + title + " {\n"
  g << " '' [shape=none]\n"

  stateList.each_value do |s|
    shape = s.finalState? ? "doubleoctagon" : "octagon"
    g << (" '" + s.name + "' [shape=" + shape + "]\n")
  end

  g << "\n"
  g << (" '' -> '" + name + "'\n")
  stateList.each_value do |s|
    s.edges.each do |crs, s2|
      g << (" '" + s.name + "' -> '" + s2.name + "' [label='" + d(crs) +
            "'][fontname=Courier][fontsize=12]\n")
    end
  end

  g << "\n}\n"
  # The text above is written with single quotes; dot wants double quotes
  g.gsub!( /'/, '"' )

  dotToPDF(g, title)
end
141
+
142
+
143
# Normalize a state
#
# [] sort edges by destination state ids
# [] merge edges that go to a common state (their labels are combined
#    with addSet into a copy, so the original labels are not modified)
#
# NOTE(review): an earlier description also listed "delete edges that have
# empty labels"; nothing in this method removes such edges.
#
def normalize()
  @edges.sort! { |(_, destA), (_, destB)| destA.id <=> destB.id }

  merged = []
  @edges.each do |label, dest|
    tail = merged.last
    if tail && tail[1].id == dest.id
      # Same destination as the previous edge: fold this label into it
      tail[0].addSet(label)
    else
      # Must start a fresh copy! Don't want to modify the original label.
      merged.push([label.makeCopy(), dest])
    end
  end

  @edges = merged
  # The method has always evaluated to true; keep that for callers that
  # collect the results
  true
end
190
+
191
+
192
# Duplicate the NFA reachable from this state, possibly with new ids.
#
# Each duplicate state keeps the original's final flag and label; edges
# are recreated between the duplicates, reusing the original edge labels.
#
# > dupBaseId : lowest id to use for duplicate; if nil, uses
#               next available id
# < [ map of original states => duplicate states;
#     1 + highest id in new NFA ]
#
def duplicateNFA(dupBaseId = nil)
  originals, lowId, limitId = reachableStates()
  dupBaseId ||= limitId

  # First pass: create a twin for every state, shifting ids so that the
  # lowest original id maps to dupBaseId
  clones = {}
  originals.each do |orig|
    twin = State.new((orig.id - lowId) + dupBaseId)
    twin.finalState = orig.finalState?
    twin.label = orig.label
    clones[orig] = twin
  end

  # Second pass: wire up the edges between the twins
  originals.each do |orig|
    orig.edges.each { |lbl, dest| clones[orig].addEdge(lbl, clones[dest]) }
  end

  [clones, (limitId - lowId) + dupBaseId]
end
221
+
222
+
223
+
224
# Construct the reverse of the NFA starting at this state.
#
# Every reachable state gets a counterpart with the same id, and every
# edge is recreated in the opposite direction. The counterpart of this
# state becomes final; a new start state (whose id is 1 + the highest
# existing id) gets an e-transition to the counterpart of each final state.
#
# < start state of reversed NFA
#
def reverseNFA()
  states, _lowId, limitId = reachableStates()

  mirror = {}          # id => counterpart state
  reversedEdges = []   # [source id, destination id, label] in the reversed NFA
  entryStates = []     # counterparts of the original final states

  states.each do |s|
    twin = State.new(s.id)
    mirror[twin.id] = twin

    twin.finalState = true if s.id == id
    entryStates.push(twin) if s.finalState?

    s.edges.each { |lbl, dest| reversedEdges.push([dest.id, s.id, lbl]) }
  end

  reversedEdges.each do |srcId, destId, lbl|
    mirror[srcId].addEdge(lbl, mirror[destId])
  end

  # Create a distinguished start node that points to each of the start nodes
  hub = State.new(limitId)
  entryStates.each { |s| hub.addEps(s) }
  hub
end
267
+
268
+
269
# Build set of states reachable from this state (this state included),
# by following edges.
#
# < [ set,    set of states reachable from this state
#     minId,  lowest id in set
#     maxId   1 + highest id in set
#   ]
#
def reachableStates()
  seen = Set.new
  pending = [self]

  lowest = nil
  limit = nil

  until pending.empty?
    st = pending.pop
    seen.add(st)

    lowest = st.id if lowest.nil? || st.id < lowest
    limit = st.id + 1 if limit.nil? || st.id >= limit

    # A destination is queued only the first time it is seen
    st.edges.each { |_lbl, dest| pending.push(dest) if seen.add?(dest) }
  end
  [seen, lowest, limit]
end
304
+
305
+
306
+
307
+
308
+ end
309
+
310
+
311
+
312
+ private
313
+
314
# Collect, into stateList (a map of state name => state), the state st and
# every state reachable from it. States whose name is already present are
# not visited again.
def genAux(stateList, st)
  return if stateList.member?(st.name)

  stateList[st.name] = st
  st.edges.each { |_label, dest| genAux(stateList, dest) }
end
320
+