python-cc 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pcc/ply/lex.py ADDED
@@ -0,0 +1,1097 @@
1
+ # -----------------------------------------------------------------------------
2
+ # ply: lex.py
3
+ #
4
+ # Copyright (C) 2001-2015,
5
+ # David M. Beazley (Dabeaz LLC)
6
+ # All rights reserved.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without
9
+ # modification, are permitted provided that the following conditions are
10
+ # met:
11
+ #
12
+ # * Redistributions of source code must retain the above copyright notice,
13
+ # this list of conditions and the following disclaimer.
14
+ # * Redistributions in binary form must reproduce the above copyright notice,
15
+ # this list of conditions and the following disclaimer in the documentation
16
+ # and/or other materials provided with the distribution.
17
+ # * Neither the name of the David Beazley or Dabeaz LLC may be used to
18
+ # endorse or promote products derived from this software without
19
+ # specific prior written permission.
20
+ #
21
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24
+ # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25
+ # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
+ # -----------------------------------------------------------------------------
33
+
34
+ __version__ = '3.8'
35
+ __tabversion__ = '3.8'
36
+
37
+ import re
38
+ import sys
39
+ import types
40
+ import copy
41
+ import os
42
+ import inspect
43
+
44
# Tuple of the types accepted as "strings" by the lexer.  Python 2 exposes
# StringType/UnicodeType on the types module; Python 3 does not, in which
# case str and bytes are used instead.
if hasattr(types, 'StringType') and hasattr(types, 'UnicodeType'):
    StringTypes = (types.StringType, types.UnicodeType)
else:
    StringTypes = (str, bytes)

# Pattern that a valid token name must match in full
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')
54
+
55
+ # Exception thrown when invalid token encountered and no default error
56
+ # handler is defined.
57
class LexError(Exception):
    """Raised for lexing failures; carries the unprocessed input text.

    message -- description of the failure (becomes the sole entry of args)
    s       -- text associated with the error, stored on the 'text' attribute
    """

    def __init__(self, message, s):
        # Only the message goes into args so str(exc) shows just the message.
        super(LexError, self).__init__(message)
        self.text = s
61
+
62
+
63
+ # Token class. This class is used to represent the tokens produced.
64
class LexToken(object):
    """Plain record for a token.

    Formatting reads the attributes type, value, lineno and lexpos, which
    are assigned by whoever creates the token.
    """

    def __str__(self):
        fields = (self.type, self.value, self.lineno, self.lexpos)
        return 'LexToken(%s,%r,%d,%d)' % fields

    def __repr__(self):
        # Delegate so that repr() and str() always agree.
        return str(self)
70
+
71
+
72
+ # This object is a stand-in for a logging object created by the
73
+ # logging module.
74
+
75
class PlyLogger(object):
    """Minimal logger that writes %-formatted messages to a file-like object.

    f -- object with a write() method that receives each formatted line
    """

    def __init__(self, f):
        self.f = f

    def _emit(self, prefix, msg, args):
        # Shared formatter: optional severity prefix, message, newline.
        self.f.write(prefix + (msg % args) + '\n')

    def critical(self, msg, *args, **kwargs):
        self._emit('', msg, args)

    def warning(self, msg, *args, **kwargs):
        self._emit('WARNING: ', msg, args)

    def error(self, msg, *args, **kwargs):
        self._emit('ERROR: ', msg, args)

    # info and debug are written without a prefix, exactly like critical
    info = debug = critical
90
+
91
+
92
+ # Null logger is used when no output is generated. Does nothing.
93
class NullLogger(object):
    """Logger stand-in that silently accepts everything.

    Any attribute lookup yields the logger itself and calling it also yields
    itself, so expressions such as log.info('...') do nothing.
    """

    def __getattribute__(self, attr):
        return self

    def __call__(self, *unused_args, **unused_kwargs):
        return self
99
+
100
+
101
+ # -----------------------------------------------------------------------------
102
+ # === Lexing Engine ===
103
+ #
104
+ # The following Lexer class implements the lexer runtime. There are only
105
+ # a few public methods and attributes:
106
+ #
107
+ # input() - Store a new string in the lexer
108
+ # token() - Get the next token
109
+ # clone() - Clone the lexer
110
+ #
111
+ # lineno - Current line number
112
+ # lexpos - Current position in the input string
113
+ # -----------------------------------------------------------------------------
114
+
115
class Lexer:
    """Lexer runtime.

    Public methods: input(), token(), clone(), begin(), push_state(),
    pop_state(), current_state(), skip(), plus the iterator protocol.
    Public attributes: lineno (current line number) and lexpos (current
    position in the input string).
    """

    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re, findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = 'INITIAL'     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexstateeoff = {}        # Dictionary of eof functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lexeoff = None           # EOF rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ''           # Ignored characters
        self.lexliterals = ''         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = False      # Optimized mode

    def clone(self, object=None):
        """Return a shallow copy of this lexer.

        If 'object' is supplied, the copy is attached to it: every rule
        function in lexstatere and every error function in lexstateerrorf
        is replaced by the attribute of the same name looked up on 'object'.
        """
        c = copy.copy(self)

        # copy.copy() is shallow; give the clone its own state stack so that
        # push_state()/pop_state() on one lexer cannot disturb the other.
        c.lexstatestack = list(self.lexstatestack)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.
        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            # Unused group slot or string-defined rule: nothing to rebind
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    # One entry per compiled regex of this state
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                # A state may have no error function (writetab() allows for
                # this too); keep None instead of failing on None.__name__.
                c.lexstateerrorf[key] = getattr(object, ef.__name__) if ef else None
            c.lexmodule = object
            # NOTE(review): c.lexre / c.lexerrorf still refer to the tables that
            # were active before rebinding until begin() is called -- confirm
            # whether callers rely on that.
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, lextab, outputdir=''):
        """Write the lexer tables to '<outputdir>/<last component of lextab>.py'.

        Raises IOError if 'lextab' is a module object rather than a name.
        """
        if isinstance(lextab, types.ModuleType):
            raise IOError("Won't overwrite existing lextab module")
        basetabmodule = lextab.split('.')[-1]
        filename = os.path.join(outputdir, basetabmodule) + '.py'
        with open(filename, 'w') as tf:
            tf.write('# %s.py. This file automatically created by PLY (version %s). Don\'t edit!\n' % (basetabmodule, __version__))
            tf.write('_tabversion = %s\n' % repr(__tabversion__))
            tf.write('_lextokens = %s\n' % repr(self.lextokens))
            tf.write('_lexreflags = %s\n' % repr(self.lexreflags))
            tf.write('_lexliterals = %s\n' % repr(self.lexliterals))
            tf.write('_lexstateinfo = %s\n' % repr(self.lexstateinfo))

            # Rewrite the lexstatere table, replacing function objects with function names
            tabre = {}
            for statename, lre in self.lexstatere.items():
                titem = []
                for (pat, func), retext, renames in zip(lre, self.lexstateretext[statename], self.lexstaterenames[statename]):
                    titem.append((retext, _funcs_to_names(func, renames)))
                tabre[statename] = titem

            tf.write('_lexstatere = %s\n' % repr(tabre))
            tf.write('_lexstateignore = %s\n' % repr(self.lexstateignore))

            # Functions cannot be written out; store their names (or None)
            taberr = {}
            for statename, ef in self.lexstateerrorf.items():
                taberr[statename] = ef.__name__ if ef else None
            tf.write('_lexstateerrorf = %s\n' % repr(taberr))

            tabeof = {}
            for statename, ef in self.lexstateeoff.items():
                tabeof[statename] = ef.__name__ if ef else None
            tf.write('_lexstateeoff = %s\n' % repr(tabeof))

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        """Load lexer tables from a table module and enter the INITIAL state.

        tabfile -- a module object, or the importable name of the table module
        fdict   -- dictionary mapping rule names to the rule functions

        Raises ImportError if the module cannot be imported or its
        _tabversion differs from this module's __tabversion__.
        """
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            # Import by name without exec(); returns the (sub)module itself.
            import importlib
            lextab = importlib.import_module(tabfile)

        if getattr(lextab, '_tabversion', '0.0') != __tabversion__:
            raise ImportError('Inconsistent PLY version')

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lextokens_all = self.lextokens | set(self.lexliterals)
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = {}
        self.lexstateretext = {}
        for statename, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for pat, func_name in lre:
                titem.append((re.compile(pat, lextab._lexreflags | re.VERBOSE), _names_to_funcs(func_name, fdict)))
                # Keep the regex source text in step with the compiled list
                txtitem.append(pat)

            self.lexstatere[statename] = titem
            self.lexstateretext[statename] = txtitem

        # writetab() stores None for a state without an error/eof function,
        # so None must map back to None rather than be looked up in fdict.
        self.lexstateerrorf = {}
        for statename, ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[statename] = fdict[ef] if ef else None

        self.lexstateeoff = {}
        for statename, ef in lextab._lexstateeoff.items():
            self.lexstateeoff[statename] = fdict[ef] if ef else None

        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        """Store 's' as the text to tokenize and reset the position to 0.

        Raises ValueError if s[:1] is not one of StringTypes.
        """
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c, StringTypes):
            raise ValueError('Expected a string')
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        """Switch to 'state'; raises ValueError if the state is not defined."""
        if state not in self.lexstatere:
            raise ValueError('Undefined state')
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, '')
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexeoff = self.lexstateeoff.get(state, None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        """Return the next token, or None once the input is exhausted.

        At end of input the state's eof function (if any) is called and its
        result returned.  Raises LexError for text that no rule, literal or
        error function handles, and RuntimeError if no input was supplied.
        """
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m:
                    continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos = self.lexpos         # This is here in case user has updated lexpos.
                    lexignore = self.lexignore   # This is here in case there was a state change
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens_all:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.__code__.co_filename, func.__code__.co_firstlineno,
                            func.__name__, newtok.type), lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = 'error'
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok:
                        continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:])

        if self.lexeoff:
            tok = LexToken()
            tok.type = 'eof'
            tok.value = ''
            tok.lineno = self.lineno
            tok.lexpos = lexpos
            tok.lexer = self
            self.lexpos = lexpos
            newtok = self.lexeoff(tok)
            return newtok

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError('No input string given with input()')
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        """Return the next token; raise StopIteration when token() gives None."""
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next
425
+
426
+ # -----------------------------------------------------------------------------
427
+ # ==== Lex Builder ===
428
+ #
429
+ # The functions and classes below are used to collect lexing information
430
+ # and build a Lexer object from it.
431
+ # -----------------------------------------------------------------------------
432
+
433
+ # -----------------------------------------------------------------------------
434
+ # _get_regex(func)
435
+ #
436
+ # Returns the regular expression assigned to a function either as a doc string
437
+ # or as a .regex attribute attached by the @TOKEN decorator.
438
+ # -----------------------------------------------------------------------------
439
def _get_regex(func):
    """Return func.regex if that attribute exists, otherwise func.__doc__."""
    docstring = func.__doc__
    return getattr(func, 'regex', docstring)
441
+
442
+ # -----------------------------------------------------------------------------
443
+ # get_caller_module_dict()
444
+ #
445
+ # This function returns a dictionary containing all of the symbols defined within
446
+ # a caller further down the call stack. This is used to get the environment
447
+ # associated with the lex() call if none was provided.
448
+ # -----------------------------------------------------------------------------
449
def get_caller_module_dict(levels):
    """Return a new dict of the symbols visible in a frame up the call stack.

    levels -- frame depth handed to sys._getframe() from inside this function

    The result starts as a copy of that frame's globals; the frame's locals
    are layered on top when they differ from the globals.
    """
    frame = sys._getframe(levels)
    symbols = frame.f_globals.copy()
    if frame.f_globals != frame.f_locals:
        symbols.update(frame.f_locals)
    return symbols
455
+
456
+ # -----------------------------------------------------------------------------
457
+ # _funcs_to_names()
458
+ #
459
+ # Given a list of regular expression functions, this converts it to a list
460
+ # suitable for output to a table file
461
+ # -----------------------------------------------------------------------------
462
def _funcs_to_names(funclist, namelist):
    """Return funclist with each function replaced by its paired name.

    An entry whose first element is a function becomes (name, entry[1]),
    with the name taken from the same position in namelist; every other
    entry is passed through unchanged.
    """
    return [
        (symbol, entry[1]) if entry and entry[0] else entry
        for entry, symbol in zip(funclist, namelist)
    ]
470
+
471
+ # -----------------------------------------------------------------------------
472
+ # _names_to_funcs()
473
+ #
474
+ # Given a list of regular expression function names, this converts it back to
475
+ # functions.
476
+ # -----------------------------------------------------------------------------
477
def _names_to_funcs(namelist, fdict):
    """Return namelist with each rule name replaced by fdict[name].

    An entry whose first element is a name becomes (fdict[name], entry[1]);
    every other entry is passed through unchanged.
    """
    return [
        (fdict[entry[0]], entry[1]) if entry and entry[0] else entry
        for entry in namelist
    ]
485
+
486
+ # -----------------------------------------------------------------------------
487
+ # _form_master_re()
488
+ #
489
+ # This function takes a list of all of the regex components and attempts to
490
+ # form the master regular expression. Given limitations in the Python re
491
+ # module, it may be necessary to break the master regex into separate expressions.
492
+ # -----------------------------------------------------------------------------
493
def _form_master_re(relist, reflags, ldict, toknames):
    """Combine rule regexes into as few master regular expressions as possible.

    relist   -- list of regex strings, each wrapping a rule in a named group
    reflags  -- extra re flags (re.VERBOSE is always added)
    ldict    -- dictionary mapping group names to rule functions or strings
    toknames -- dictionary mapping group names to token names

    Returns [] when relist is empty.  Otherwise returns three parallel lists:
    [(compiled_re, index_to_func)], [regex_text], [index_to_name].  If the
    joined expression cannot be built, the list is halved and each half is
    handled recursively.  A single expression that cannot be built re-raises
    the underlying error, since splitting it further is impossible.
    """
    if not relist:
        return []
    regex = '|'.join(relist)
    try:
        lexre = re.compile(regex, re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
        lexindexnames = lexindexfunc[:]

        for f, i in lexre.groupindex.items():
            handle = ldict.get(f, None)
            if isinstance(handle, (types.FunctionType, types.MethodType)):
                lexindexfunc[i] = (handle, toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                # f still carries its leading "t_", so an 'ignore_' rule name
                # is found at a position greater than zero.
                if f.find('ignore_') > 0:
                    lexindexfunc[i] = (None, None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre, lexindexfunc)], [regex], [lexindexnames]
    except Exception:
        if len(relist) == 1:
            # Halving a one-element list would retry the same input forever;
            # surface the real failure instead.
            raise
        m = len(relist) // 2
        llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
        rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
        return (llist+rlist), (lre+rre), (lnames+rnames)
524
+
525
+ # -----------------------------------------------------------------------------
526
+ # def _statetoken(s,names)
527
+ #
528
+ # Given a declaration name s of the form "t_" and a dictionary whose keys are
529
+ # state names, this function returns a tuple (states,tokenname) where states
530
+ # is a tuple of state names and tokenname is the name of the token. For example,
531
+ # calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
532
+ # -----------------------------------------------------------------------------
533
def _statetoken(s, names):
    """Split a rule name into (states, tokenname).

    s     -- declaration name such as "t_NUMBER" or "t_foo_bar_SPAM"
    names -- container of known state names

    The leading parts after "t" that are state names (or 'ANY') become the
    states tuple and the remaining parts, re-joined with '_', the token
    name.  With no state prefix the states are ('INITIAL',); 'ANY' expands
    to every name in 'names'.  The final part is always kept for the token
    name, even if it is itself a state name.
    """
    parts = s.split('_')
    # After the loop, i indexes the first part belonging to the token name
    # (or the last part if every part looked like a state).
    for i, part in enumerate(parts[1:], 1):
        if part not in names and part != 'ANY':
            break

    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = '_'.join(parts[i:])
    return (states, tokenname)
550
+
551
+
552
+ # -----------------------------------------------------------------------------
553
+ # LexerReflect()
554
+ #
555
+ # This class represents information needed to build a lexer as extracted from a
556
+ # user's input file.
557
+ # -----------------------------------------------------------------------------
558
class LexerReflect(object):
    """Collects and validates the lexer specification found in a dictionary.

    ldict   -- dictionary of symbols ('tokens', 'literals', 'states', t_ rules)
    log     -- logger used for diagnostics (PlyLogger on stderr by default)
    reflags -- extra re flags used when test-compiling rule regexes

    Problems are reported through the logger and recorded by setting
    self.error to True.
    """

    def __init__(self, ldict, log=None, reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = {'INITIAL': 'inclusive'}
        self.modules = set()
        self.error = False
        self.log = PlyLogger(sys.stderr) if log is None else log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        """Run every validation step and return True if any error was found."""
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get('tokens', None)
        # Only a missing definition counts as "not defined"; an empty or
        # wrongly typed value is reported by the more specific checks below.
        if tokens is None:
            self.log.error('No token list is defined')
            self.error = True
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error('tokens must be a list or tuple')
            self.error = True
            return

        if not tokens:
            self.log.error('tokens is empty')
            self.error = True
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = True
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get('literals', '')
        if not self.literals:
            self.literals = ''

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error('Invalid literal %s. Must be a single character', repr(c))
                    self.error = True

        except TypeError:
            # self.literals could not be iterated
            self.log.error('Invalid literals specification. literals must be a sequence of characters')
            self.error = True

    def get_states(self):
        """Read 'states' from ldict and record each valid one in stateinfo."""
        self.states = self.ldict.get('states', None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error('states must be defined as a tuple or list')
                self.error = True
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s))
                        self.error = True
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error('State name %s must be a string', repr(name))
                        self.error = True
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = True
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = True
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_']

        # Now build up a list of functions and a list of strings
        self.toknames = {}        # Mapping of symbols to token names
        self.funcsym = {}         # Symbols defined as functions
        self.strsym = {}          # Symbols defined as strings
        self.ignore = {}          # Ignore strings by state
        self.errorf = {}          # Error functions by state
        self.eoff = {}            # EOF functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error('No rules of the form t_rulename are defined')
            self.error = True
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, '__call__'):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'eof':
                    for s in states:
                        self.eoff[s] = t
                elif tokname == 'ignore':
                    line = t.__code__.co_firstlineno
                    file = t.__code__.co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__)
                    self.error = True
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if '\\' in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = True
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error('%s not defined as a function or string', f)
                self.error = True

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: x[1].__code__.co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            s.sort(key=lambda x: len(x[1]), reverse=True)

    # Check that a rule function takes exactly one argument (two, counting
    # self, for bound methods).  Logs and returns False on a mismatch.
    def _check_rule_args(self, f):
        line = f.__code__.co_firstlineno
        filename = f.__code__.co_filename
        reqargs = 2 if isinstance(f, types.MethodType) else 1
        nargs = f.__code__.co_argcount
        if nargs > reqargs:
            self.log.error("%s:%d: Rule '%s' has too many arguments", filename, line, f.__name__)
            self.error = True
            return False
        if nargs < reqargs:
            self.log.error("%s:%d: Rule '%s' requires an argument", filename, line, f.__name__)
            self.error = True
            return False
        return True

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                if not self._check_rule_args(f):
                    continue

                if not _get_regex(f):
                    self.log.error("%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (fname, _get_regex(f)), re.VERBOSE | self.reflags)
                    if c.match(''):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__)
                        self.error = True
                except re.error as e:
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e)
                    if '#' in _get_regex(f):
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__)
                    self.error = True

            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = True
                    continue

                if tokname not in self.tokens and tokname.find('ignore_') < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (name, r), re.VERBOSE | self.reflags)
                    if (c.match('')):
                        self.log.error("Regular expression for rule '%s' matches empty string", name)
                        self.error = True
                except re.error as e:
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = True

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = True

            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                module = inspect.getmodule(efunc)
                self.modules.add(module)
                self._check_rule_args(efunc)

        for module in self.modules:
            self.validate_module(module)

    # -----------------------------------------------------------------------------
    # validate_module()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file.  This is done using a simple regular expression
    # match on each line in the source code of the given module.
    # -----------------------------------------------------------------------------

    def validate_module(self, module):
        try:
            lines, linen = inspect.getsourcelines(module)
        except (IOError, OSError, TypeError):
            # No source to scan (source file unavailable, or 'module' is not
            # an object inspect can read, e.g. None): skip the duplicate check.
            return

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = {}
        linen += 1
        for line in lines:
            m = fre.match(line)
            if not m:
                m = sre.match(line)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    filename = inspect.getsourcefile(module)
                    self.log.error('%s:%d: Rule %s redefined. Previously defined on line %d', filename, linen, name, prev)
                    self.error = True
            linen += 1
854
+
855
+ # -----------------------------------------------------------------------------
856
+ # lex(module)
857
+ #
858
+ # Build all of the regular expression rules from definitions in the supplied module
859
+ # -----------------------------------------------------------------------------
860
def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab',
        reflags=0, nowarn=False, outputdir=None, debuglog=None, errorlog=None):
    """Build a Lexer from the rules found in a module, object or the caller.

    Parameters:
        module    -- object whose attributes (collected with dir()/getattr())
                     supply the lexer specification.  When neither *module*
                     nor *object* is given, the dictionary returned by
                     get_caller_module_dict(2) is used instead.
        object    -- if truthy, used in place of *module*.
        debug     -- when true, rule and regex details are written to *debuglog*.
        optimize  -- when true, rule validation is skipped, previously
                     generated tables are loaded via Lexer.readtab() if they
                     can be imported, and otherwise the freshly built tables
                     are written out via Lexer.writetab().
        lextab    -- table module name (or module object); None means 'lextab'.
        reflags   -- passed to LexerReflect and _form_master_re and stored
                     on the lexer as ``lexreflags``.
        nowarn    -- accepted for interface compatibility; not referenced in
                     this function.
        outputdir -- directory handed to writetab(); derived from *lextab* or
                     the specification's ``__file__`` when None.
        debuglog  -- logger for debug output (default PlyLogger(sys.stderr)
                     when *debug* is set).
        errorlog  -- logger for warnings/errors (default PlyLogger(sys.stderr)).

    Returns the constructed Lexer.  As a side effect the module-level names
    ``lexer``, ``token`` and ``input`` are rebound to the new lexer and its
    bound methods.

    Raises SyntaxError("Can't build lexer") when not optimizing and
    LexerReflect.validate_all() reports a problem.
    """
    global lexer, token, input

    if lextab is None:
        lextab = 'lextab'

    lexobj = Lexer()
    lexobj.lexoptimize = optimize

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug and debuglog is None:
        debuglog = PlyLogger(sys.stderr)

    # An explicit object takes precedence over a module
    if object:
        module = object

    # Get the dictionary of names that holds the lexer specification
    if module:
        ldict = dict((k, getattr(module, k)) for k in dir(module))
        # If no __file__ attribute is available, try to obtain it from the __module__ instead
        if '__file__' not in ldict:
            ldict['__file__'] = sys.modules[ldict['__module__']].__file__
    else:
        # NOTE: the argument is a stack depth, so this call has to stay
        # directly inside lex() and must not be moved into a helper.
        ldict = get_caller_module_dict(2)

    # Determine if the specification is part of a package or not.
    # If so, fix the lextab setting so that tables load correctly
    pkg = ldict.get('__package__')
    if pkg and isinstance(lextab, str) and '.' not in lextab:
        lextab = pkg + '.' + lextab

    # Collect lexer information from the dictionary
    linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
    linfo.get_all()
    if not optimize and linfo.validate_all():
        raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
        except ImportError:
            # No usable table module: fall through and build from scratch
            pass
        else:
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

    # Dump some basic debugging information
    if debug:
        debuglog.info('lex: tokens   = %r', linfo.tokens)
        debuglog.info('lex: literals = %r', linfo.literals)
        debuglog.info('lex: states   = %r', linfo.stateinfo)

    # Build the set of valid token names
    lexobj.lextokens = set(linfo.tokens)

    # Get literals specification.  A list/tuple of literals is joined into a
    # single string of the same type as its elements; an empty sequence has
    # no first element to take the type from, so it becomes ''.
    if isinstance(linfo.literals, (list, tuple)):
        if linfo.literals:
            lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
        else:
            lexobj.lexliterals = ''
    else:
        lexobj.lexliterals = linfo.literals

    lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals)

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    # Collect the named-group regex fragments for every state
    regexs = {}
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            pattern = _get_regex(f)
            regex_list.append('(?P<%s>%s)' % (fname, pattern))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, pattern, state)

        # Now add all of the simple rules
        for name, r in linfo.strsym[state]:
            regex_list.append('(?P<%s>%s)' % (name, r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)

        regexs[state] = regex_list

    # Build the master regular expressions
    if debug:
        debuglog.info('lex: ==== MASTER REGEXS FOLLOW ====')

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i, text in enumerate(re_text):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text)

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state, stype in stateinfo.items():
        if state != 'INITIAL' and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere['INITIAL']
    lexobj.lexretext = lexobj.lexstateretext['INITIAL']
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get('INITIAL', '')

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get('INITIAL', None)
    if not lexobj.lexerrorf:
        errorlog.warning('No t_error rule is defined')

    # Set up eof functions
    lexobj.lexstateeoff = linfo.eoff
    lexobj.lexeoff = linfo.eoff.get('INITIAL', None)

    # Check state information for ignore and error rules.  The lexer shares
    # the linfo.errorf / linfo.ignore dictionaries assigned above, so the
    # defaults filled in here for inclusive states are seen by the lexer too.
    for s, stype in stateinfo.items():
        if stype == 'exclusive':
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get('INITIAL', None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get('INITIAL', '')

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        if outputdir is None:
            # If no output directory is set, the location of the output files
            # is determined according to the following rules:
            #     - If lextab specifies a package, files go into that package directory
            #     - Otherwise, files go in the same directory as the specifying module
            if isinstance(lextab, types.ModuleType):
                srcfile = lextab.__file__
            elif '.' not in lextab:
                srcfile = ldict['__file__']
            else:
                pkgname = '.'.join(lextab.split('.')[:-1])
                # Import the package so that it is registered in sys.modules
                __import__(pkgname)
                srcfile = getattr(sys.modules[pkgname], '__file__', '')
            outputdir = os.path.dirname(srcfile)
        try:
            lexobj.writetab(lextab, outputdir)
        except IOError as e:
            # Pass the values as logger arguments: pre-formatting the message
            # would make a '%' in the path or error text break the logger's
            # own formatting step.
            errorlog.warning("Couldn't write lextab module %r. %s", lextab, e)

    return lexobj
1045
+
1046
+ # -----------------------------------------------------------------------------
1047
+ # runmain()
1048
+ #
1049
+ # This runs the lexer as a main program
1050
+ # -----------------------------------------------------------------------------
1051
+
1052
def runmain(lexer=None, data=None):
    """Tokenize some input and print one line per token to standard output.

    Parameters:
        lexer -- object providing input() and token(); when omitted (or
                 falsy) the module-level input() and token() names are used.
        data  -- text to tokenize.  When empty or None, the text is read from
                 the file named by sys.argv[1], or from standard input if no
                 command-line argument was given.

    Each token is written as ``(type,value,lineno,lexpos)``.
    """
    if not data:
        try:
            filename = sys.argv[1]
        except IndexError:
            sys.stdout.write('Reading from standard input (type EOF to end):\n')
            data = sys.stdin.read()
        else:
            # The with-block closes the file even if read() fails
            with open(filename) as f:
                data = f.read()

    _input = lexer.input if lexer else input
    _input(data)
    _token = lexer.token if lexer else token

    while True:
        tok = _token()
        if not tok:
            break
        sys.stdout.write('(%s,%r,%d,%d)\n' % (tok.type, tok.value, tok.lineno, tok.lexpos))
1078
+
1079
+ # -----------------------------------------------------------------------------
1080
+ # @TOKEN(regex)
1081
+ #
1082
+ # This decorator function can be used to set the regex expression on a function
1083
+ # when its docstring might need to be set in an alternative way
1084
+ # -----------------------------------------------------------------------------
1085
+
1086
def TOKEN(r):
    """Return a decorator that stores a regular expression on a rule function.

    *r* is either the regular expression itself or a callable; for a
    callable, the expression is looked up with _get_regex(r).  The decorated
    function is returned unchanged apart from its new ``regex`` attribute.
    """
    def set_regex(f):
        f.regex = _get_regex(r) if hasattr(r, '__call__') else r
        return f

    return set_regex


# Alternative spelling of the TOKEN decorator
Token = TOKEN
1097
+