kongalib-2.0.5-cp314-cp314-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kongalib/lex.py ADDED
@@ -0,0 +1,1058 @@
+ # -----------------------------------------------------------------------------
+ # ply: lex.py
+ #
+ # Copyright (C) 2001-2011,
+ # David M. Beazley (Dabeaz LLC)
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are
+ # met:
+ #
+ # * Redistributions of source code must retain the above copyright notice,
+ #   this list of conditions and the following disclaimer.
+ # * Redistributions in binary form must reproduce the above copyright notice,
+ #   this list of conditions and the following disclaimer in the documentation
+ #   and/or other materials provided with the distribution.
+ # * Neither the name of the David Beazley or Dabeaz LLC may be used to
+ #   endorse or promote products derived from this software without
+ #   specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ # -----------------------------------------------------------------------------
+
+ __version__ = "3.4"
+ __tabversion__ = "3.2"    # Version of table file used
+
+ import re, sys, types, copy, os
+
+ # This tuple contains known string types
+ try:
+     # Python 2.6
+     StringTypes = (types.StringType, types.UnicodeType)
+ except AttributeError:
+     # Python 3.0
+     StringTypes = (str, bytes)
+
+ # Extract the code attribute of a function. Different implementations
+ # are for Python 2/3 compatibility.
+
+ if sys.version_info[0] < 3:
+     def func_code(f):
+         return f.func_code
+ else:
+     def func_code(f):
+         return f.__code__
+
+ # This regular expression is used to match valid token names
+ _is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')
+
+ # Exception thrown when invalid token encountered and no default error
+ # handler is defined.
+
+ class LexError(Exception):
+     def __init__(self, message, s):
+         self.args = (message,)
+         self.text = s
+
+ # Token class. This class is used to represent the tokens produced.
+ class LexToken(object):
+     def __str__(self):
+         return "LexToken(%s,%r,%d,%d)" % (self.type, self.value, self.lineno, self.lexpos)
+     def __repr__(self):
+         return str(self)
+
+ # This object is a stand-in for a logging object created by the
+ # logging module.
+
+ class PlyLogger(object):
+     def __init__(self, f):
+         self.f = f
+     def critical(self, msg, *args, **kwargs):
+         self.f.write((msg % args) + "\n")
+
+     def warning(self, msg, *args, **kwargs):
+         self.f.write("WARNING: " + (msg % args) + "\n")
+
+     def error(self, msg, *args, **kwargs):
+         self.f.write("ERROR: " + (msg % args) + "\n")
+
+     info = critical
+     debug = critical
+
+ # Null logger is used when no output is generated. Does nothing.
+ class NullLogger(object):
+     def __getattribute__(self, name):
+         return self
+     def __call__(self, *args, **kwargs):
+         return self
+
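+ # Usage sketch (illustrative only, not part of the module's API surface):
+ # either logger can be handed to lex() below through its errorlog/debuglog
+ # arguments. PlyLogger wraps a file-like object; NullLogger discards everything:
+ #
+ #     import sys
+ #     from kongalib import lex
+ #
+ #     log = lex.PlyLogger(sys.stderr)
+ #     log.warning("token '%s' unused", 'SPAM')    # writes "WARNING: token 'SPAM' unused"
+ #     lexer = lex.lex(errorlog=lex.NullLogger())  # assumes t_ rules exist in this module
+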
+ # -----------------------------------------------------------------------------
+ # === Lexing Engine ===
+ #
+ # The following Lexer class implements the lexer runtime. There are only
+ # a few public methods and attributes:
+ #
+ #     input()  - Store a new string in the lexer
+ #     token()  - Get the next token
+ #     clone()  - Clone the lexer
+ #
+ #     lineno   - Current line number
+ #     lexpos   - Current position in the input string
+ # -----------------------------------------------------------------------------
+
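+ # A minimal sketch of driving the runtime interface directly (illustrative
+ # only; it assumes a module that defines token rules and builds a lexer with
+ # lex(), as in the full example near the end of this file):
+ #
+ #     lexer = lex()              # Build a Lexer from the calling module's rules
+ #     lexer.input("3 + 4")       # Store a new string in the lexer
+ #     while True:
+ #         tok = lexer.token()    # Get the next token; None signals end of input
+ #         if tok is None:
+ #             break
+ #         print(tok.type, tok.value, tok.lineno, tok.lexpos)
+ #
+ # clone() returns an independent copy that can scan other input with the same
+ # rules without disturbing this lexer's position.
+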
+ class Lexer:
+     def __init__(self):
+         self.lexre = None              # Master regular expression. This is a list of
+                                        # tuples (re, findex) where re is a compiled
+                                        # regular expression and findex is a list
+                                        # mapping regex group numbers to rules
+         self.lexretext = None          # Current regular expression strings
+         self.lexstatere = {}           # Dictionary mapping lexer states to master regexs
+         self.lexstateretext = {}       # Dictionary mapping lexer states to regex strings
+         self.lexstaterenames = {}      # Dictionary mapping lexer states to symbol names
+         self.lexstate = "INITIAL"      # Current lexer state
+         self.lexstatestack = []        # Stack of lexer states
+         self.lexstateinfo = None       # State information
+         self.lexstateignore = {}       # Dictionary of ignored characters for each state
+         self.lexstateerrorf = {}       # Dictionary of error functions for each state
+         self.lexreflags = 0            # Optional re compile flags
+         self.lexdata = None            # Actual input data (as a string)
+         self.lexpos = 0                # Current position in input text
+         self.lexlen = 0                # Length of the input text
+         self.lexerrorf = None          # Error rule (if any)
+         self.lextokens = None          # List of valid tokens
+         self.lexignore = ""            # Ignored characters
+         self.lexliterals = ""          # Literal characters that can be passed through
+         self.lexmodule = None          # Module
+         self.lineno = 1                # Current line number
+         self.lexoptimize = 0           # Optimized mode
+
+     def clone(self, object=None):
+         c = copy.copy(self)
+
+         # If the object parameter has been supplied, it means we are attaching the
+         # lexer to a new object. In this case, we have to rebind all methods in
+         # the lexstatere and lexstateerrorf tables.
+
+         if object:
+             newtab = {}
+             for key, ritem in self.lexstatere.items():
+                 newre = []
+                 for cre, findex in ritem:
+                     newfindex = []
+                     for f in findex:
+                         if not f or not f[0]:
+                             newfindex.append(f)
+                             continue
+                         newfindex.append((getattr(object, f[0].__name__), f[1]))
+                     newre.append((cre, newfindex))
+                 newtab[key] = newre
+             c.lexstatere = newtab
+             c.lexstateerrorf = {}
+             for key, ef in self.lexstateerrorf.items():
+                 c.lexstateerrorf[key] = getattr(object, ef.__name__)
+             c.lexmodule = object
+         return c
+
+     # ------------------------------------------------------------
+     # writetab() - Write lexer information to a table file
+     # ------------------------------------------------------------
+     def writetab(self, tabfile, outputdir=""):
+         if isinstance(tabfile, types.ModuleType):
+             return
+         basetabfilename = tabfile.split(".")[-1]
+         filename = os.path.join(outputdir, basetabfilename) + ".py"
+         tf = open(filename, "w")
+         tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile, __version__))
+         tf.write("_tabversion = %s\n" % repr(__version__))
+         tf.write("_lextokens = %s\n" % repr(self.lextokens))
+         tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
+         tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
+         tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))
+
+         tabre = {}
+         # Collect all functions in the initial state
+         initial = self.lexstatere["INITIAL"]
+         initialfuncs = []
+         for part in initial:
+             for f in part[1]:
+                 if f and f[0]:
+                     initialfuncs.append(f)
+
+         for key, lre in self.lexstatere.items():
+             titem = []
+             for i in range(len(lre)):
+                 titem.append((self.lexstateretext[key][i], _funcs_to_names(lre[i][1], self.lexstaterenames[key][i])))
+             tabre[key] = titem
+
+         tf.write("_lexstatere = %s\n" % repr(tabre))
+         tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))
+
+         taberr = {}
+         for key, ef in self.lexstateerrorf.items():
+             if ef:
+                 taberr[key] = ef.__name__
+             else:
+                 taberr[key] = None
+         tf.write("_lexstateerrorf = %s\n" % repr(taberr))
+         tf.close()
+
+     # ------------------------------------------------------------
+     # readtab() - Read lexer information from a tab file
+     # ------------------------------------------------------------
+     def readtab(self, tabfile, fdict):
+         if isinstance(tabfile, types.ModuleType):
+             lextab = tabfile
+         else:
+             if sys.version_info[0] < 3:
+                 exec("import %s as lextab" % tabfile)
+             else:
+                 env = {}
+                 exec("import %s as lextab" % tabfile, env, env)
+                 lextab = env['lextab']
+
+         if getattr(lextab, "_tabversion", "0.0") != __version__:
+             raise ImportError("Inconsistent PLY version")
+
+         self.lextokens = lextab._lextokens
+         self.lexreflags = lextab._lexreflags
+         self.lexliterals = lextab._lexliterals
+         self.lexstateinfo = lextab._lexstateinfo
+         self.lexstateignore = lextab._lexstateignore
+         self.lexstatere = {}
+         self.lexstateretext = {}
+         for key, lre in lextab._lexstatere.items():
+             titem = []
+             txtitem = []
+             for i in range(len(lre)):
+                 titem.append((re.compile(lre[i][0], lextab._lexreflags | re.VERBOSE), _names_to_funcs(lre[i][1], fdict)))
+                 txtitem.append(lre[i][0])
+             self.lexstatere[key] = titem
+             self.lexstateretext[key] = txtitem
+         self.lexstateerrorf = {}
+         for key, ef in lextab._lexstateerrorf.items():
+             self.lexstateerrorf[key] = fdict[ef]
+         self.begin('INITIAL')
+
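+     # Usage sketch (illustrative only): writetab()/readtab() implement the cache
+     # used by optimize mode. A first lex(optimize=1, lextab="mylextab") call
+     # writes mylextab.py via writetab(); later calls import it back via readtab()
+     # and skip rule validation. The table module name here is hypothetical:
+     #
+     #     lexer = lex(optimize=1, lextab="mylextab", outputdir=".")
+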
+     # ------------------------------------------------------------
+     # input() - Push a new string into the lexer
+     # ------------------------------------------------------------
+     def input(self, s):
+         # Pull off the first character to see if s looks like a string
+         c = s[:1]
+         if not isinstance(c, StringTypes):
+             raise ValueError("Expected a string")
+         self.lexdata = s
+         self.lexpos = 0
+         self.lexlen = len(s)
+
+     # ------------------------------------------------------------
+     # begin() - Changes the lexing state
+     # ------------------------------------------------------------
+     def begin(self, state):
+         if not state in self.lexstatere:
+             raise ValueError("Undefined state")
+         self.lexre = self.lexstatere[state]
+         self.lexretext = self.lexstateretext[state]
+         self.lexignore = self.lexstateignore.get(state, "")
+         self.lexerrorf = self.lexstateerrorf.get(state, None)
+         self.lexstate = state
+
+     # ------------------------------------------------------------
+     # push_state() - Changes the lexing state and saves old on stack
+     # ------------------------------------------------------------
+     def push_state(self, state):
+         self.lexstatestack.append(self.lexstate)
+         self.begin(state)
+
+     # ------------------------------------------------------------
+     # pop_state() - Restores the previous state
+     # ------------------------------------------------------------
+     def pop_state(self):
+         self.begin(self.lexstatestack.pop())
+
+     # ------------------------------------------------------------
+     # current_state() - Returns the current lexing state
+     # ------------------------------------------------------------
+     def current_state(self):
+         return self.lexstate
+
+     # ------------------------------------------------------------
+     # skip() - Skip ahead n characters
+     # ------------------------------------------------------------
+     def skip(self, n):
+         self.lexpos += n
+
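+     # Usage sketch for the state methods above (illustrative; the 'comment'
+     # state and its rules are hypothetical and would be declared through a
+     # module-level `states` tuple, validated in LexerReflect.get_states below):
+     #
+     #     states = (('comment', 'exclusive'),)
+     #
+     #     def t_begin_comment(t):
+     #         r'/\*'
+     #         t.lexer.push_state('comment')   # enter the 'comment' state
+     #
+     #     def t_comment_end(t):
+     #         r'\*/'
+     #         t.lexer.pop_state()             # return to the previous state
+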
+     # ------------------------------------------------------------
+     # token() - Return the next token from the Lexer
+     #
+     # Note: This function has been carefully implemented to be as fast
+     # as possible. Don't make changes unless you really know what
+     # you are doing
+     # ------------------------------------------------------------
+     def token(self):
+         # Make local copies of frequently referenced attributes
+         lexpos = self.lexpos
+         lexlen = self.lexlen
+         lexignore = self.lexignore
+         lexdata = self.lexdata
+
+         while lexpos < lexlen:
+             # Short-circuit handling for whitespace, tabs, and other ignored characters
+             if lexdata[lexpos] in lexignore:
+                 lexpos += 1
+                 continue
+
+             # Look for a regular expression match
+             for lexre, lexindexfunc in self.lexre:
+                 m = lexre.match(lexdata, lexpos)
+                 if not m: continue
+
+                 # Create a token for return
+                 tok = LexToken()
+                 tok.value = m.group()
+                 tok.lineno = self.lineno
+                 tok.lexpos = lexpos
+
+                 i = m.lastindex
+                 func, tok.type = lexindexfunc[i]
+
+                 if not func:
+                     # If no token type was set, it's an ignored token
+                     if tok.type:
+                         self.lexpos = m.end()
+                         return tok
+                     else:
+                         lexpos = m.end()
+                         break
+
+                 lexpos = m.end()
+
+                 # If token is processed by a function, call it
+
+                 tok.lexer = self      # Set additional attributes useful in token rules
+                 self.lexmatch = m
+                 self.lexpos = lexpos
+
+                 newtok = func(tok)
+
+                 # Every function must return a token; if it returns nothing, we just move on to the next token
+                 if not newtok:
+                     lexpos = self.lexpos          # This is here in case user has updated lexpos.
+                     lexignore = self.lexignore    # This is here in case there was a state change
+                     break
+
+                 # Verify type of the token. If not in the token map, raise an error
+                 if not self.lexoptimize:
+                     if not newtok.type in self.lextokens:
+                         raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
+                             func_code(func).co_filename, func_code(func).co_firstlineno,
+                             func.__name__, newtok.type), lexdata[lexpos:])
+
+                 return newtok
+             else:
+                 # No match, see if in literals
+                 if lexdata[lexpos] in self.lexliterals:
+                     tok = LexToken()
+                     tok.value = lexdata[lexpos]
+                     tok.lineno = self.lineno
+                     tok.type = tok.value
+                     tok.lexpos = lexpos
+                     self.lexpos = lexpos + 1
+                     return tok
+
+                 # No match. Call t_error() if defined.
+                 if self.lexerrorf:
+                     tok = LexToken()
+                     tok.value = self.lexdata[lexpos:]
+                     tok.lineno = self.lineno
+                     tok.type = "error"
+                     tok.lexer = self
+                     tok.lexpos = lexpos
+                     self.lexpos = lexpos
+                     newtok = self.lexerrorf(tok)
+                     if lexpos == self.lexpos:
+                         # Error method didn't change text position at all. This is an error.
+                         raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
+                     lexpos = self.lexpos
+                     if not newtok: continue
+                     return newtok
+
+                 self.lexpos = lexpos
+                 raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:])
+
+         self.lexpos = lexpos + 1
+         if self.lexdata is None:
+             raise RuntimeError("No input string given with input()")
+         return None
+
+     # Iterator interface
+     def __iter__(self):
+         return self
+
+     def next(self):
+         t = self.token()
+         if t is None:
+             raise StopIteration
+         return t
+
+     __next__ = next
+
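+ # Because of the iterator interface above, a lexer with stored input can be
+ # scanned with a plain for-loop (illustrative sketch):
+ #
+ #     lexer.input("some data")
+ #     for tok in lexer:
+ #         print(tok)
+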
+ # -----------------------------------------------------------------------------
+ # === Lex Builder ===
+ #
+ # The functions and classes below are used to collect lexing information
+ # and build a Lexer object from it.
+ # -----------------------------------------------------------------------------
+
+ # -----------------------------------------------------------------------------
+ # get_caller_module_dict()
+ #
+ # This function returns a dictionary containing all of the symbols defined within
+ # a caller further down the call stack. This is used to get the environment
+ # associated with the lex() call if none was provided.
+ # -----------------------------------------------------------------------------
+
+ def get_caller_module_dict(levels):
+     try:
+         raise RuntimeError
+     except RuntimeError:
+         e, b, t = sys.exc_info()
+         f = t.tb_frame
+         while levels > 0:
+             f = f.f_back
+             levels -= 1
+         ldict = f.f_globals.copy()
+         if f.f_globals != f.f_locals:
+             ldict.update(f.f_locals)
+
+         return ldict
+
+ # -----------------------------------------------------------------------------
+ # _funcs_to_names()
+ #
+ # Given a list of regular expression functions, this converts it to a list
+ # suitable for output to a table file
+ # -----------------------------------------------------------------------------
+
+ def _funcs_to_names(funclist, namelist):
+     result = []
+     for f, name in zip(funclist, namelist):
+         if f and f[0]:
+             result.append((name, f[1]))
+         else:
+             result.append(f)
+     return result
+
+ # -----------------------------------------------------------------------------
+ # _names_to_funcs()
+ #
+ # Given a list of regular expression function names, this converts it back to
+ # functions.
+ # -----------------------------------------------------------------------------
+
+ def _names_to_funcs(namelist, fdict):
+     result = []
+     for n in namelist:
+         if n and n[0]:
+             result.append((fdict[n[0]], n[1]))
+         else:
+             result.append(n)
+     return result
+
+ # -----------------------------------------------------------------------------
+ # _form_master_re()
+ #
+ # This function takes a list of all of the regex components and attempts to
+ # form the master regular expression. Given limitations in the Python re
+ # module, it may be necessary to break the master regex into separate expressions.
+ # -----------------------------------------------------------------------------
+
+ def _form_master_re(relist, reflags, ldict, toknames):
+     if not relist: return []
+     regex = "|".join(relist)
+     try:
+         lexre = re.compile(regex, re.VERBOSE | reflags)
+
+         # Build the index to function map for the matching engine
+         lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
+         lexindexnames = lexindexfunc[:]
+
+         for f, i in lexre.groupindex.items():
+             handle = ldict.get(f, None)
+             if type(handle) in (types.FunctionType, types.MethodType):
+                 lexindexfunc[i] = (handle, toknames[f])
+                 lexindexnames[i] = f
+             elif handle is not None:
+                 lexindexnames[i] = f
+                 if f.find("ignore_") > 0:
+                     lexindexfunc[i] = (None, None)
+                 else:
+                     lexindexfunc[i] = (None, toknames[f])
+
+         return [(lexre, lexindexfunc)], [regex], [lexindexnames]
+     except Exception:
+         m = int(len(relist)/2)
+         if m == 0: m = 1
+         llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
+         rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
+         return llist+rlist, lre+rre, lnames+rnames
+
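+ # Illustration of the dispatch trick used above (a sketch, not module code):
+ # every rule becomes a named group in one big alternation, and m.lastindex
+ # identifies which group fired, so a single match() call selects the rule:
+ #
+ #     import re
+ #     master = re.compile(r"(?P<t_NUMBER>\d+)|(?P<t_PLUS>\+)", re.VERBOSE)
+ #     m = master.match("42")
+ #     assert m.lastindex == master.groupindex['t_NUMBER']
+ #
+ # When the combined pattern grows past re's per-pattern group limit (100 named
+ # groups in the Python versions this module targets), re.compile() raises, and
+ # the rule list is split in half recursively; hence the list-valued returns.
+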
+ # -----------------------------------------------------------------------------
+ # def _statetoken(s,names)
+ #
+ # Given a declaration name s of the form "t_" and a dictionary whose keys are
+ # state names, this function returns a tuple (states,tokenname) where states
+ # is a tuple of state names and tokenname is the name of the token. For example,
+ # calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
+ # -----------------------------------------------------------------------------
+
+ def _statetoken(s, names):
+     nonstate = 1
+     parts = s.split("_")
+     for i in range(1, len(parts)):
+         if not parts[i] in names and parts[i] != 'ANY': break
+     if i > 1:
+         states = tuple(parts[1:i])
+     else:
+         states = ('INITIAL',)
+
+     if 'ANY' in states:
+         states = tuple(names)
+
+     tokenname = "_".join(parts[i:])
+     return (states, tokenname)
+
+
+ # -----------------------------------------------------------------------------
+ # LexerReflect()
+ #
+ # This class represents information needed to build a lexer as extracted from a
+ # user's input file.
+ # -----------------------------------------------------------------------------
+ class LexerReflect(object):
+     def __init__(self, ldict, log=None, reflags=0):
+         self.ldict = ldict
+         self.error_func = None
+         self.tokens = []
+         self.reflags = reflags
+         self.stateinfo = {'INITIAL': 'inclusive'}
+         self.files = {}
+         self.error = 0
+
+         if log is None:
+             self.log = PlyLogger(sys.stderr)
+         else:
+             self.log = log
+
+     # Get all of the basic information
+     def get_all(self):
+         self.get_tokens()
+         self.get_literals()
+         self.get_states()
+         self.get_rules()
+
+     # Validate all of the information
+     def validate_all(self):
+         self.validate_tokens()
+         self.validate_literals()
+         self.validate_rules()
+         return self.error
+
+     # Get the tokens map
+     def get_tokens(self):
+         tokens = self.ldict.get("tokens", None)
+         if not tokens:
+             self.log.error("No token list is defined")
+             self.error = 1
+             return
+
+         if not isinstance(tokens, (list, tuple)):
+             self.log.error("tokens must be a list or tuple")
+             self.error = 1
+             return
+
+         if not tokens:
+             self.log.error("tokens is empty")
+             self.error = 1
+             return
+
+         self.tokens = tokens
+
+     # Validate the tokens
+     def validate_tokens(self):
+         terminals = {}
+         for n in self.tokens:
+             if not _is_identifier.match(n):
+                 self.log.error("Bad token name '%s'", n)
+                 self.error = 1
+             if n in terminals:
+                 self.log.warning("Token '%s' multiply defined", n)
+             terminals[n] = 1
+
+     # Get the literals specifier
+     def get_literals(self):
+         self.literals = self.ldict.get("literals", "")
+
+     # Validate literals
+     def validate_literals(self):
+         try:
+             for c in self.literals:
+                 if not isinstance(c, StringTypes) or len(c) > 1:
+                     self.log.error("Invalid literal %s. Must be a single character", repr(c))
+                     self.error = 1
+                     continue
+
+         except TypeError:
+             self.log.error("Invalid literals specification. literals must be a sequence of characters")
+             self.error = 1
+
+     def get_states(self):
+         self.states = self.ldict.get("states", None)
+         # Build statemap
+         if self.states:
+             if not isinstance(self.states, (tuple, list)):
+                 self.log.error("states must be defined as a tuple or list")
+                 self.error = 1
+             else:
+                 for s in self.states:
+                     if not isinstance(s, tuple) or len(s) != 2:
+                         self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s))
+                         self.error = 1
+                         continue
+                     name, statetype = s
+                     if not isinstance(name, StringTypes):
+                         self.log.error("State name %s must be a string", repr(name))
+                         self.error = 1
+                         continue
+                     if not (statetype == 'inclusive' or statetype == 'exclusive'):
+                         self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
+                         self.error = 1
+                         continue
+                     if name in self.stateinfo:
+                         self.log.error("State '%s' already defined", name)
+                         self.error = 1
+                         continue
+                     self.stateinfo[name] = statetype
+
+     # Get all of the symbols with a t_ prefix and sort them into various
+     # categories (functions, strings, error functions, and ignore characters)
+
+     def get_rules(self):
+         tsymbols = [f for f in self.ldict if f[:2] == 't_']
+
+         # Now build up a list of functions and a list of strings
+
+         self.toknames = {}    # Mapping of symbols to token names
+         self.funcsym = {}     # Symbols defined as functions
+         self.strsym = {}      # Symbols defined as strings
+         self.ignore = {}      # Ignore strings by state
+         self.errorf = {}      # Error functions by state
+
+         for s in self.stateinfo:
+             self.funcsym[s] = []
+             self.strsym[s] = []
+
+         if len(tsymbols) == 0:
+             self.log.error("No rules of the form t_rulename are defined")
+             self.error = 1
+             return
+
+         for f in tsymbols:
+             t = self.ldict[f]
+             states, tokname = _statetoken(f, self.stateinfo)
+             self.toknames[f] = tokname
+
+             if hasattr(t, "__call__"):
+                 if tokname == 'error':
+                     for s in states:
+                         self.errorf[s] = t
+                 elif tokname == 'ignore':
+                     line = func_code(t).co_firstlineno
+                     file = func_code(t).co_filename
+                     self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__)
+                     self.error = 1
+                 else:
+                     for s in states:
+                         self.funcsym[s].append((f, t))
+             elif isinstance(t, StringTypes):
+                 if tokname == 'ignore':
+                     for s in states:
+                         self.ignore[s] = t
+                     if "\\" in t:
+                         self.log.warning("%s contains a literal backslash '\\'", f)
+
+                 elif tokname == 'error':
+                     self.log.error("Rule '%s' must be defined as a function", f)
+                     self.error = 1
+                 else:
+                     for s in states:
+                         self.strsym[s].append((f, t))
+             else:
+                 self.log.error("%s not defined as a function or string", f)
+                 self.error = 1
+
+         # Sort the functions by line number
+         for f in self.funcsym.values():
+             if sys.version_info[0] < 3:
+                 f.sort(lambda x, y: cmp(func_code(x[1]).co_firstlineno, func_code(y[1]).co_firstlineno))
+             else:
+                 # Python 3.0
+                 f.sort(key=lambda x: func_code(x[1]).co_firstlineno)
+
+         # Sort the strings by regular expression length
+         for s in self.strsym.values():
+             if sys.version_info[0] < 3:
+                 s.sort(lambda x, y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
+             else:
+                 # Python 3.0
+                 s.sort(key=lambda x: len(x[1]), reverse=True)
+
+     # Validate all of the t_rules collected
+     def validate_rules(self):
+         for state in self.stateinfo:
+             # Validate all rules defined by functions
+
+             for fname, f in self.funcsym[state]:
+                 line = func_code(f).co_firstlineno
+                 file = func_code(f).co_filename
+                 self.files[file] = 1
+
+                 tokname = self.toknames[fname]
+                 if isinstance(f, types.MethodType):
+                     reqargs = 2
+                 else:
+                     reqargs = 1
+                 nargs = func_code(f).co_argcount
+                 if nargs > reqargs:
+                     self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
+                     self.error = 1
+                     continue
+
+                 if nargs < reqargs:
+                     self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
+                     self.error = 1
+                     continue
+
+                 if not f.__doc__:
+                     self.log.error("%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__)
+                     self.error = 1
+                     continue
+
+                 try:
+                     c = re.compile("(?P<%s>%s)" % (fname, f.__doc__), re.VERBOSE | self.reflags)
+                     if c.match(""):
+                         self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__)
+                         self.error = 1
+                 except re.error:
+                     _etype, e, _etrace = sys.exc_info()
+                     self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e)
+                     if '#' in f.__doc__:
+                         self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__)
+                     self.error = 1
+
+             # Validate all rules defined by strings
+             for name, r in self.strsym[state]:
+                 tokname = self.toknames[name]
+                 if tokname == 'error':
+                     self.log.error("Rule '%s' must be defined as a function", name)
+                     self.error = 1
+                     continue
+
+                 if not tokname in self.tokens and tokname.find("ignore_") < 0:
+                     self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
+                     self.error = 1
+                     continue
+
+                 try:
+                     c = re.compile("(?P<%s>%s)" % (name, r), re.VERBOSE | self.reflags)
+                     if (c.match("")):
+                         self.log.error("Regular expression for rule '%s' matches empty string", name)
+                         self.error = 1
+                 except re.error:
+                     _etype, e, _etrace = sys.exc_info()
+                     self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
+                     if '#' in r:
+                         self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
+                     self.error = 1
+
+             if not self.funcsym[state] and not self.strsym[state]:
+                 self.log.error("No rules defined for state '%s'", state)
+                 self.error = 1
+
+             # Validate the error function
+             efunc = self.errorf.get(state, None)
+             if efunc:
+                 f = efunc
+                 line = func_code(f).co_firstlineno
+                 file = func_code(f).co_filename
+                 self.files[file] = 1
+
+                 if isinstance(f, types.MethodType):
+                     reqargs = 2
+                 else:
+                     reqargs = 1
+                 nargs = func_code(f).co_argcount
+                 if nargs > reqargs:
+                     self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
+                     self.error = 1
+
+                 if nargs < reqargs:
+                     self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
+                     self.error = 1
+
+         for f in self.files:
+             self.validate_file(f)
+
+     # -----------------------------------------------------------------------------
+     # validate_file()
+     #
+     # This checks to see if there are duplicated t_rulename() functions or strings
+     # in the lexer input file. This is done using a simple regular expression
+     # match on each line in the given file.
+     # -----------------------------------------------------------------------------
+
+     def validate_file(self, filename):
+         import os.path
+         base, ext = os.path.splitext(filename)
+         if ext != '.py': return    # No idea what the file is. Return OK
+
+         try:
+             f = open(filename)
+             lines = f.readlines()
+             f.close()
+         except IOError:
+             return    # Couldn't find the file. Don't worry about it
+
+         fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
+         sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
+
+         counthash = {}
+         linen = 1
+         for l in lines:
+             m = fre.match(l)
+             if not m:
+                 m = sre.match(l)
+             if m:
+                 name = m.group(1)
+                 prev = counthash.get(name)
+                 if not prev:
+                     counthash[name] = linen
+                 else:
+                     self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d", filename, linen, name, prev)
+                     self.error = 1
+             linen += 1
+
+ # -----------------------------------------------------------------------------
+ # lex(module)
+ #
+ # Build all of the regular expression rules from definitions in the supplied module
+ # -----------------------------------------------------------------------------
+ def lex(module=None, object=None, debug=0, optimize=0, lextab="lextab", reflags=0, nowarn=0, outputdir="", debuglog=None, errorlog=None):
+     global lexer
+     ldict = None
+     stateinfo = {'INITIAL': 'inclusive'}
+     lexobj = Lexer()
+     lexobj.lexoptimize = optimize
+     global token, input
+
+     if errorlog is None:
+         errorlog = PlyLogger(sys.stderr)
+
+     if debug:
+         if debuglog is None:
+             debuglog = PlyLogger(sys.stderr)
+
+     # Get the module dictionary used for the lexer
+     if object: module = object
+
+     if module:
+         _items = [(k, getattr(module, k)) for k in dir(module)]
+         ldict = dict(_items)
+     else:
+         ldict = get_caller_module_dict(2)
+
+     # Collect parser information from the dictionary
+     linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
+     linfo.get_all()
+     if not optimize:
+         if linfo.validate_all():
+             raise SyntaxError("Can't build lexer")
+
+     if optimize and lextab:
+         try:
+             lexobj.readtab(lextab, ldict)
+             token = lexobj.token
+             input = lexobj.input
+             lexer = lexobj
+             return lexobj
+
+         except ImportError:
+             pass
+
+     # Dump some basic debugging information
+     if debug:
+         debuglog.info("lex: tokens = %r", linfo.tokens)
+         debuglog.info("lex: literals = %r", linfo.literals)
+         debuglog.info("lex: states = %r", linfo.stateinfo)
+
+     # Build a dictionary of valid token names
+     lexobj.lextokens = {}
+     for n in linfo.tokens:
+         lexobj.lextokens[n] = 1
+
+     # Get literals specification
+     if isinstance(linfo.literals, (list, tuple)):
+         lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
+     else:
+         lexobj.lexliterals = linfo.literals
+
+     # Get the stateinfo dictionary
+     stateinfo = linfo.stateinfo
+
+     regexs = {}
+     # Build the master regular expressions
+     for state in stateinfo:
+         regex_list = []
+
+         # Add rules defined by functions first
+         for fname, f in linfo.funcsym[state]:
+             line = func_code(f).co_firstlineno
+             file = func_code(f).co_filename
+             regex_list.append("(?P<%s>%s)" % (fname, f.__doc__))
+             if debug:
+                 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, f.__doc__, state)
+
+         # Now add all of the simple rules
+         for name, r in linfo.strsym[state]:
+             regex_list.append("(?P<%s>%s)" % (name, r))
+             if debug:
+                 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)
+
+         regexs[state] = regex_list
+
+     # Build the master regular expressions
+
+     if debug:
+         debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")
+
+     for state in regexs:
+         lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
+         lexobj.lexstatere[state] = lexre
+         lexobj.lexstateretext[state] = re_text
+         lexobj.lexstaterenames[state] = re_names
+         if debug:
+             for i in range(len(re_text)):
+                 debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, re_text[i])
+
+     # For inclusive states, we need to add the regular expressions from the INITIAL state
+     for state, stype in stateinfo.items():
+         if state != "INITIAL" and stype == 'inclusive':
+             lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
+             lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
+             lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])
+
+     lexobj.lexstateinfo = stateinfo
+     lexobj.lexre = lexobj.lexstatere["INITIAL"]
+     lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
+     lexobj.lexreflags = reflags
+
+     # Set up ignore variables
+     lexobj.lexstateignore = linfo.ignore
+     lexobj.lexignore = lexobj.lexstateignore.get("INITIAL", "")
+
+     # Set up error functions
+     lexobj.lexstateerrorf = linfo.errorf
+     lexobj.lexerrorf = linfo.errorf.get("INITIAL", None)
+     if not lexobj.lexerrorf:
+         errorlog.warning("No t_error rule is defined")
+
+     # Check state information for ignore and error rules
+     for s, stype in stateinfo.items():
+         if stype == 'exclusive':
+             if not s in linfo.errorf:
+                 errorlog.warning("No error rule is defined for exclusive state '%s'", s)
+             if not s in linfo.ignore and lexobj.lexignore:
+                 errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
+         elif stype == 'inclusive':
+             if not s in linfo.errorf:
+                 linfo.errorf[s] = linfo.errorf.get("INITIAL", None)
+             if not s in linfo.ignore:
+                 linfo.ignore[s] = linfo.ignore.get("INITIAL", "")
+
+     # Create global versions of the token() and input() functions
+     token = lexobj.token
+     input = lexobj.input
+     lexer = lexobj
+
+     # If in optimize mode, we write the lextab
+     if lextab and optimize:
+         lexobj.writetab(lextab, outputdir)
+
+     return lexobj
+
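+ # A complete usage sketch of lex() (illustrative only -- this calculator-style
+ # token set is an example, not part of kongalib itself):
+ #
+ #     from kongalib import lex
+ #
+ #     tokens = ('NUMBER', 'PLUS', 'MINUS')
+ #
+ #     t_PLUS   = r'\+'           # simple tokens as module-level strings
+ #     t_MINUS  = r'-'
+ #     t_ignore = ' \t'           # characters skipped between tokens
+ #
+ #     def t_NUMBER(t):           # tokens needing an action as functions
+ #         r'\d+'
+ #         t.value = int(t.value)
+ #         return t
+ #
+ #     def t_newline(t):
+ #         r'\n+'
+ #         t.lexer.lineno += len(t.value)
+ #
+ #     def t_error(t):
+ #         print("Illegal character %r" % t.value[0])
+ #         t.lexer.skip(1)
+ #
+ #     lexer = lex.lex()          # builds a Lexer from this module's t_ names
+ #     lexer.input("1 + 2 - 3")
+ #     for tok in lexer:
+ #         print(tok)
+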
+ # -----------------------------------------------------------------------------
+ # runmain()
+ #
+ # This runs the lexer as a main program
+ # -----------------------------------------------------------------------------
+
+ def runmain(lexer=None, data=None):
+     if not data:
+         try:
+             filename = sys.argv[1]
+             f = open(filename)
+             data = f.read()
+             f.close()
+         except IndexError:
+             sys.stdout.write("Reading from standard input (type EOF to end):\n")
+             data = sys.stdin.read()
+
+     if lexer:
+         _input = lexer.input
+     else:
+         _input = input
+     _input(data)
+     if lexer:
+         _token = lexer.token
+     else:
+         _token = token
+
+     while 1:
+         tok = _token()
+         if not tok: break
+         sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno, tok.lexpos))
+
+ # -----------------------------------------------------------------------------
+ # @TOKEN(regex)
+ #
+ # This decorator function can be used to set the regex expression on a function
+ # when its docstring might need to be set in an alternative way
+ # -----------------------------------------------------------------------------
+
+ def TOKEN(r):
+     def set_doc(f):
+         if hasattr(r, "__call__"):
+             f.__doc__ = r.__doc__
+         else:
+             f.__doc__ = r
+         return f
+     return set_doc
+
+ # Alternative spelling of the TOKEN decorator
+ Token = TOKEN
+
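+ # Usage sketch for @TOKEN (illustrative): it attaches a programmatically built
+ # regular expression to a rule function instead of relying on a hand-written
+ # docstring. Passing a function copies that function's docstring instead:
+ #
+ #     identifier = r'[A-Za-z_][A-Za-z0-9_]*'
+ #
+ #     @TOKEN(identifier)
+ #     def t_ID(t):
+ #         return t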