tokn 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,379 @@
1
+ require_relative 'tools'
2
+ req('code_set state')
3
+
4
# Raised when a regular expression script cannot be parsed.
#
# Derives from StandardError rather than Exception so that an ordinary
# `rescue` (or `rescue StandardError`) catches it; code that rescues
# ParseException or Exception explicitly is unaffected.
class ParseException < StandardError
end
6
+
7
+ # Parses a single regular expression from a string.
8
+ # Produces an NFA with distinguished start and end states
9
+ # (none of these states are marked as final states)
10
+ #
11
+ # Here is the grammar for regular expressions. Spaces are ignored,
12
+ # and can be liberally sprinkled within the regular expressions to
13
+ # aid readability. To represent a space, the \s escape sequence must be used.
14
+ # See the file 'sampletokens.txt' for some examples.
15
+ #
16
+ # Expressions have one of these types:
17
+ #
18
+ # E : base class
19
+ # J : a Join expression, formed by concatenating one or more together
20
+ # Q : a Quantified expression; followed optionally by '*', '+', or '?'
21
+ # P : a Parenthesized expression, which is optionally surrounded with (), {}, []
22
+ #
23
+ # E -> J '|' E
24
+ # | J
25
+ #
26
+ # J -> Q J
27
+ # | Q
28
+ #
29
+ # Q -> P '*'
30
+ # | P '+'
31
+ # | P '?'
32
+ # | P
33
+ #
34
+ # P -> '(' E ')'
35
+ # | '{' TOKENNAME '}'
36
+ # | '[^' SETSEQ ']' A code not appearing in the set
37
+ # | '[' SETSEQ ']'
38
+ # | CHARCODE
39
+ #
40
+ # SETSEQ -> SET SETSEQ
41
+ # | SET
42
+ #
43
+ # SET -> CHARCODE
44
+ # | CHARCODE '-' CHARCODE
45
+ #
46
+ # CHARCODE ->
47
+ # a | b | c ... any printable except {,},[, etc.
48
+ # | \xhh hex value from 00...ff
49
+ # | \uhhhh hex value from 0000...ffff (e.g., unicode)
50
+ # | \f | \n | \r | \t formfeed, linefeed, return, tab
51
+ # | \s a space (' ')
52
+ # | \* where * is some other non-alphabetic
53
+ # character that needs to be escaped
54
+ #
55
+ # The parser performs recursive descent parsing;
56
+ # each method returns an NFA represented by
57
+ # a pair of states: the start and end states.
58
+ #
59
+ class RegParse
60
+
61
+ attr_reader :startState, :endState
62
+
63
# Construct a parser and immediately parse the script.
#
# @param script the regular expression text; leading and trailing
#   whitespace is stripped before parsing
# @param tokenDefMap if not nil, a map of previously parsed regular
#   expressions, consulted when a curly brace expression ({NAME})
#   appears in the script
#
def initialize(script, tokenDefMap = nil)
  @tokenDefMap = tokenDefMap
  @nextStateId = 0
  @script = script.strip
  parseScript
end
75
+
76
+
77
# Debug description: the script text followed by the start and end states.
def inspect
  "RegParse: #{@script}" + " start:" + d(@startState) + " end:" + d(@endState)
end
82
+
83
+ private
84
+
85
# Raise a ParseException whose message shows where in the script the
# parser was when the problem was detected.
#
# The message has the form "<msg>: <before> !!! <after>", where <before>
# is up to three characters preceding the problem character and <after>
# is up to three characters starting at it. A '...' is added on either
# side when more of the script exists beyond the excerpt.
#
# @param msg description of the problem
# @raise ParseException always
#
def abort(msg)
  # Callers have normally consumed the problem character already, so it
  # sits just before the cursor. Clamp to zero so that a cursor still at
  # the start of the script does not turn into a negative index.
  pos = [@cursor - 1, 0].max

  # Clamp the excerpt start as well: a negative start would be taken as
  # an offset from the END of the string and yield the wrong text.
  from = [pos - 3, 0].max

  context = ''
  context += '...' if from > 0
  context += @script[from...pos] || ''
  context += ' !!! '
  context += @script[pos...pos + 3] || ''
  context += '...' if pos + 3 < @script.size

  raise ParseException, msg + ": " + context
end
103
+
104
# Read the next character and interpret it as a single hex digit
# (case-insensitive).
#
# Returns the digit's value, 0..15; aborts with "Missing hex digit" if
# the character is not one of 0-9, A-F, a-f.
#
def readHex
  code = read.upcase.ord
  case code
  when 48..57 then code - 48        # '0'..'9'
  when 65..70 then code - 65 + 10   # 'A'..'F'
  else abort "Missing hex digit"
  end
end
116
+
117
+
118
# Letters and digits: these may only follow a backslash when they form
# one of the recognised escape sequences handled in parseChar.
NO_ESCAPE_CHARS = Regexp.new("[A-Za-z0-9]")

# Parse a character definition (CHARCODE) from the input.
#
# Accepts either a plain character or a backslash escape:
#   \xhh    two hex digits
#   \uhhhh  four hex digits
#   \f \r \n \t   formfeed, return, linefeed, tab
#   \s      a space
#   \c      any other non-alphanumeric character c, taken literally
#
# Returns the character code as an integer. Aborts if the character is
# one of the reserved characters {}[]*?+|-^(), has a code <= 0x20, or is
# a backslash followed by an unsupported letter or digit.
#
def parseChar
  c = read
  val = c.ord

  if "{}[]*?+|-^()".include?(c) or val <= 0x20
    abort "Unexpected or unescaped character"
  end

  # A plain (unescaped) character stands for itself
  return val unless c == '\\'

  c = read
  case c
  when 'x', 'X'
    (readHex() << 4) | readHex()
  when 'u', 'U'
    (readHex() << 12) | (readHex() << 8) | (readHex() << 4) | readHex()
  when 'f' then "\f".ord
  when 'r' then "\r".ord   # previously a comparison (==), so \r was never assigned
  when 'n' then "\n".ord
  when 't' then "\t".ord
  when 's' then " ".ord
  else
    # Unknown alphanumeric escapes are rejected; anything else is the
    # escaped character itself
    abort "Unsupported escape sequence (" + c + ")" if c =~ NO_ESCAPE_CHARS
    c.ord
  end
end
162
+
163
+
164
# Parse a single character and build a two-state NFA for it: a start
# state joined to an end state by one edge labelled with the character's
# code. Returns [start, end].
def parseCharNFA
  code = parseChar

  from = newState
  to = newState
  label = CodeSet.new
  label.add(code)
  from.addEdge(label, to)
  [from, to]
end
177
+
178
+
179
+
180
# Debug helper: the script text from the cursor through the character
# five positions later, followed by "<<<== end" when that reaches the
# end of the script and by "..." otherwise.
def dbInfo
  last = @cursor + 5
  suffix = last >= @script.size ? "<<<== end" : "..."
  @script[@cursor..last] + suffix
end
189
+
190
# Parse the whole script, storing the resulting NFA's start and end
# states in @startState and @endState.
#
# Aborts if anything is left over once the top-level expression has been
# parsed. Expression parsing stops early at a ')', so without this check
# an unbalanced ')' would silently truncate the expression (e.g. "a)b"
# would be accepted as just "a").
#
def parseScript
  # Set up the input scanner
  @cursor = 0

  first, last = parseE

  if peek
    # Consume the offending character: abort reports the character just
    # before the cursor as the problem location
    read
    abort "Unexpected character after end of expression"
  end

  @startState = first
  @endState = last
end
198
+
199
# Create a State carrying the next unused id, then advance the id counter.
def newState
  State.new(@nextStateId).tap { @nextStateId += 1 }
end
204
+
205
# Parse a SET: either a single character code or a range 'a-b'.
#
# Returns two values: the first code, and one past the last code.
# Aborts with "Illegal range" if the range's last code precedes its first.
#
def parseSET
  first = parseChar
  return first, first + 1 unless readIf('-')

  limit = parseChar() + 1
  abort "Illegal range" if limit <= first
  return first, limit
end
216
+
217
# Parse a bracketed set expression, '[' SETSEQ ']' or '[^' SETSEQ ']',
# and build a two-state NFA whose single edge is labelled with the
# resulting code set. Returns [start, end].
#
# At least one SET must appear between the brackets. With a leading '^'
# the accumulated set is negated. Aborts with "Empty character range" if
# the final set is empty.
#
def parseSETSEQ
  read('[')
  negated = readIf('^')

  codes = CodeSet.new

  # One SET is mandatory; keep reading SETs until the closing bracket
  loop do
    lo, hi = parseSET
    codes.add(lo, hi)
    break if readIf(']')
  end

  codes.negate if negated
  abort "Empty character range" if codes.empty?

  from = newState
  to = newState
  from.addEdge(codes, to)
  [from, to]
end
251
+
252
# A legal token name: a letter or underscore followed by letters, digits
# or underscores. Anchored with \A and \z so the WHOLE name must match;
# ^ and $ only anchor to line boundaries, which would let a name with an
# embedded newline pass as long as one of its lines was well-formed.
TOKENREF_EXPR = Regexp.new('\A[_A-Za-z][_A-Za-z0-9]*\z')

# Parse a token reference, '{' TOKENNAME '}', and return a fresh copy of
# the referenced token's NFA as [start, end].
#
# The name is looked up in @tokenDefMap; the second element of the entry
# found there supplies the NFA (via its startState and endState). The
# NFA is duplicated starting at @nextStateId, and @nextStateId is updated
# to the value duplicateNFA returns.
#
# Aborts if the name is malformed, or if it is not present in the map
# (or no map was supplied).
#
def parseTokenDef
  read('{')
  name = ''
  until readIf('}')
    name += read
  end

  abort "Problem with token name" if name !~ TOKENREF_EXPR

  tokInfo = @tokenDefMap && @tokenDefMap[name]
  abort "Undefined token" unless tokInfo

  rg = tokInfo[1]

  oldToNewMap, @nextStateId = rg.startState.duplicateNFA(@nextStateId)

  [oldToNewMap[rg.startState], oldToNewMap[rg.endState]]
end
282
+
283
+
284
# Parse a P expression, choosing the form from the next character:
#   '('  a parenthesized expression, which must be closed by ')'
#   '{'  a token reference
#   '['  a character set
#   anything else: a single character
# Returns the NFA for it as [start, end].
def parseP
  case peek
  when '('
    read
    inner = parseE
    read ')'
    inner
  when '{'
    parseTokenDef
  when '['
    parseSETSEQ
  else
    parseCharNFA
  end
end
299
+
300
+
301
# Parse an E expression: a J, optionally followed by '|' and another E.
#
# For an alternation, two new states are created: an entry state with
# e-transitions to the start of each alternative, and an exit state that
# the end of each alternative has an e-transition to.
# Returns [start, end].
#
def parseE
  left = parseJ
  return left unless readIf('|')

  right = parseE

  entry = newState
  finish = newState
  entry.addEps(left[0])
  entry.addEps(right[0])
  left[1].addEps(finish)
  right[1].addEps(finish)
  [entry, finish]
end
316
+
317
# Parse a J expression: one Q followed by further Qs until the input
# ends or a '|' or ')' is next.
#
# Consecutive parts are joined by an e-transition from the end of the
# first to the start of the rest. Returns [start, end].
#
def parseJ
  head = parseQ
  upcoming = peek
  return head if !upcoming || "|)".include?(upcoming)

  tail = parseJ
  head[1].addEps(tail[0])
  [head[0], tail[1]]
end
328
+
329
# Parse a Q expression: a P optionally followed by '*', '+' or '?'.
# Returns the NFA as [start, end].
#
# '+' only needs a back edge from the operand's end to its start.
#
# '*' and '?' must allow the operand to be skipped. The skip edge is
# placed between two NEW states wrapped around the operand, never
# directly between the operand's own start and end. The operand's start
# may be re-entered from inside it, and its end may lead back into it,
# whenever it contains a '+'-style back edge; a direct skip edge would
# then be usable part-way through a match. For example, with a direct
# edge '(a+b)?' would accept "a" and '(ab*)?' would accept "b".
#
def parseQ
  e1 = parseP

  case peek
  when '*'
    read
    entry = newState
    finish = newState
    entry.addEps(e1[0])
    entry.addEps(finish)     # zero repetitions
    e1[1].addEps(e1[0])      # another repetition
    e1[1].addEps(finish)
    e1 = [entry, finish]
  when '+'
    read
    e1[1].addEps(e1[0])      # another repetition
  when '?'
    read
    entry = newState
    finish = newState
    entry.addEps(e1[0])
    entry.addEps(finish)     # operand omitted
    e1[1].addEps(finish)
    e1 = [entry, finish]
  end

  e1
end
347
+
348
+
349
# Look at the next significant character without consuming it.
#
# Spaces and tabs at the cursor are skipped first (the cursor is moved
# past them). Returns the character at the cursor, or nil when the
# cursor is at the end of the script.
#
def peek(mustExist = false)
  # skip over any non-linefeed whitespace
  @cursor += 1 while @cursor < @script.size && " \t".index(@script[@cursor])

  return nil unless mustExist || @cursor < @script.size
  @script[@cursor]
end
360
+
361
# Consume the next character only if it equals expChar.
# Returns true if it was consumed, false otherwise.
def readIf(expChar)
  return false unless peek == expChar
  read
  true
end
368
+
369
# Consume and return the next significant character.
#
# @param expChar if given, the character that must appear next
#
# Aborts with 'Unexpected end of input' when no character remains, and
# with an "Expected ..." message when expChar is given but a different
# character is found (previously this case was also reported as end of
# input).
#
def read(expChar = nil)
  ch = peek

  if ch.nil?
    # abort locates the problem just before the cursor, so step past the
    # end-of-input position before reporting
    @cursor += 1
    abort 'Unexpected end of input'
  end

  if expChar && ch != expChar
    # Consume the offending character so abort points at it
    @cursor += 1
    abort "Expected '" + expChar + "' but found '" + ch + "'"
  end

  @cursor += 1
  ch
end
378
+ end
379
+
data/lib/tokn/state.rb ADDED
@@ -0,0 +1,320 @@
1
+ require 'set'
2
+ require_relative 'tools'
3
+ req 'tokn_const'
4
+
5
+
6
+ # A state within a state machine (NFA or DFA); also, various utility functions
7
+ # for manipulating state machines. Observe that a state machine can be
8
+ # referred to by its start state.
9
+ #
10
+ # Each state has a set of directed edges to other states, where each edge is
11
+ # labelled with a CodeSet.
12
+ #
13
+ # It also has a unique id (unique within a particular state machine),
14
+ # and a (boolean) final state flag.
15
+ #
16
+ # For debug purposes, both the state and its edges can be labelled.
17
+ #
18
+ class State
19
+ include Tokn
20
+
21
+ attr_accessor :id
22
+ attr_accessor :finalState
23
+ alias_method :finalState?, :finalState
24
+ attr_accessor :label
25
+
26
+ # Edges are a list of [label:CharSetRange, dest:State] pairs
27
+ attr_reader :edges
28
+
29
# Produce a readable description of an NFA, for debug purposes.
#
# > st  start state
# < a string listing every state reachable from st, each followed by
#   its edge list and one line per edge
#
def self.dumpNFA(st)
  states, = st.reachableStates

  lines = ["NFA:\n"]
  states.each do |s|
    lines << " " + d(s) + "\n"
    lines << " edges= " + d(s.edges) + "\n"
    s.edges.each do |lbl, dest|
      lines << " " + d(lbl) + " ==> " + d(dest) + "\n"
    end
  end
  lines.join
end
43
+
44
# Hash code for hash-based containers: the state's id.
def hash
  @id
end
47
+
48
# Two states are eql? when their ids are equal.
def eql?(other)
  id == other.id
end
51
+
52
# Create a state with the given id and no edges.
def initialize(id)
  @id = id
  @edges = []
end
56
+
57
# Remove every edge leaving this state (the edge list is emptied in place).
def clearEdges
  @edges.clear
end
60
+
61
# Add an edge leaving this state.
#   codeSet   : the character codes to label it with
#   destState : destination state
#
def addEdge(codeSet,destState)
  @edges << [codeSet, destState]
end
68
+
69
# Add an e-transition edge: an edge whose label is a CodeSet built from
# EPSILON.
#   destState : destination state
#
def addEps(destState)
  epsilonLabel = CodeSet.new(EPSILON)
  addEdge(epsilonLabel, destState)
end
75
+
76
# Debug representation: identical to the state's name.
def inspect
  self.name
end
79
+
80
# Display name of the state: 'S' followed by the id, plus ": <label>"
# when a label has been set.
def name
  base = 'S' + d(id)
  return base unless label
  base + ": " + label
end
87
+
88
# Normalize a state machine by calling normalize on every state
# reachable from its start state.
#
# > startState : start state of the machine
# < a list holding the result of each normalize call
#
def self.normalizeStates(startState)
  reachable = startState.reachableStates.first
  reachable.map(&:normalize)
end
101
+
102
+
103
# Generate a PDF of the state machine starting at this state.
#
# Builds a dot-format graph description (final states drawn as
# 'doubleoctagon', others as 'octagon', one labelled arrow per edge, and
# an unnamed node pointing at the start state) and hands it to dotToPDF
# together with the title.
#
# The description is assembled with single quotes, which are all
# converted to double quotes at the end; this includes any single quote
# occurring inside a state name or edge label.
#
def generatePDF(title = "nfa")
  states = {}
  genAux(states, self)

  dot = []
  dot << "digraph " + title + " {\n"
  dot << " '' [shape=none]\n"

  states.each_value do |s|
    shape = s.finalState? ? "doubleoctagon" : "octagon"
    dot << " '" + s.name + "' [shape=" + shape + "]\n"
  end

  dot << "\n"
  dot << " '' -> '" + self.name + "'\n"
  states.each_value do |s|
    s.edges.each do |crs, s2|
      dot << " '" + s.name + "' -> '" + s2.name + "' [label='" + d(crs) + "'][fontname=Courier][fontsize=12]\n"
    end
  end

  dot << "\n}\n"

  dotToPDF(dot.join.tr("'", '"'), title)
end
141
+
142
+
143
# Normalize a state
#
# [] sort edges by destination state ids
# [] merge edges that go to a common state (their labels are combined
#    with addSet)
#
# Merged labels are built on copies (makeCopy), so the label objects of
# the original edges are never modified. Always returns true.
#
def normalize()
  @edges.sort! { |a, b| a[1].id <=> b[1].id }

  merged = []
  @edges.each do |label, dest|
    last = merged.last
    if last && last[1].id == dest.id
      # Same destination as the previous edge: fold this label into it
      last[0].addSet(label)
    else
      # Must start a fresh copy! Don't want to modify the original label.
      merged << [label.makeCopy(), dest]
    end
  end

  @edges = merged
  true
end
190
+
191
+
192
# Duplicate the NFA reachable from this state, possibly with new ids
#
# > dupBaseId : lowest id to use for duplicate; if nil, uses
#               next available id
# < [ map of original states => duplicate states;
#     1 + highest id in new NFA ]
#
# Each duplicate keeps its original's final-state flag and label; edges
# in the duplicate reuse the original edges' label objects.
#
def duplicateNFA(dupBaseId = nil)
  originals, lowId, highId = reachableStates()
  dupBaseId ||= highId

  # Every id is shifted by the same amount so that lowId maps to dupBaseId
  offset = dupBaseId - lowId

  copies = {}
  originals.each do |orig|
    twin = State.new(orig.id + offset)
    twin.finalState = orig.finalState?
    twin.label = orig.label
    copies[orig] = twin
  end

  # Second pass: all duplicates exist now, so edges can be wired up
  originals.each do |orig|
    twin = copies[orig]
    orig.edges.each { |lbl, dest| twin.addEdge(lbl, copies[dest]) }
  end

  [copies, highId + offset]
end
221
+
222
+
223
+
224
# Construct the reverse of the NFA starting at this state
# < start state of reversed NFA
#
# Every reachable state gets a counterpart with the same id, and every
# edge is recreated in the opposite direction with the same label. The
# counterpart of this state is marked final. A new start state (whose id
# is one more than the highest existing id) gets an e-transition to the
# counterpart of each state that was final in the original.
#
def reverseNFA()
  states, _lowId, nextId = reachableStates()

  reversedEdges = []
  newStarts = []
  byId = {}

  states.each do |s|
    mirror = State.new(s.id)
    byId[mirror.id] = mirror

    mirror.finalState = true if s.id == self.id
    newStarts << mirror if s.finalState?

    s.edges.each { |lbl, dest| reversedEdges << [dest.id, s.id, lbl] }
  end

  reversedEdges.each do |srcId, destId, lbl|
    byId[srcId].addEdge(lbl, byId[destId])
  end

  # Create a distinguished start node that points to each of the start nodes
  entry = State.new(nextId)
  newStarts.each { |s| entry.addEps(s) }
  entry
end
267
+
268
+
269
# Build set of states reachable from this state (this state included)
#
# < [ set,    set of states reachable from this state
#     minId,  lowest id in set
#     maxId   1 + highest id in set
#   ]
#
def reachableStates()
  seen = Set.new
  pending = [self]

  lowest = nil
  limit = nil

  until pending.empty?
    cur = pending.pop
    seen.add(cur)

    lowest = cur.id if lowest.nil? || lowest > cur.id
    limit = cur.id + 1 if limit.nil? || limit <= cur.id

    cur.edges.each do |_lbl, dest|
      # add? returns nil for a state already seen, so each state is
      # queued at most once
      pending.push(dest) if seen.add?(dest)
    end
  end

  [seen, lowest, limit]
end
304
+
305
+
306
+
307
+
308
+ end
309
+
310
+
311
+
312
+ private
313
+
314
# Recursively collect st and every state reachable from it into
# stateList, keyed by state name. A state whose name is already present
# is skipped, which also stops the recursion on cycles.
def genAux(stateList, st)
  return if stateList.key?(st.name)

  stateList[st.name] = st
  st.edges.each { |_label, dest| genAux(stateList, dest) }
end
320
+