python-cc 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pcc/lex/c_lexer.py ADDED
@@ -0,0 +1,495 @@
1
+ #------------------------------------------------------------------------------
2
+ # pycparser: c_lexer.py
3
+ #
4
+ # CLexer class: lexer for the C language
5
+ #
6
+ # Copyright (C) 2008-2015, Eli Bendersky
7
+ # License: BSD
8
+ #------------------------------------------------------------------------------
9
+ import re
10
+ import sys
11
+
12
+ from ..ply import lex
13
+ from ..ply.lex import TOKEN
14
+
15
+
16
class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
                 type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            on_lbrace_func, on_rbrace_func:
                Called when an LBRACE or RBRACE is encountered
                (likely to push/pop type_lookup_func's scope)

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Keeps track of the last token returned from self.token()
        self.last_token = None

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output.
        #
        # NOTE: these must be raw strings. '\W' and '\d' are invalid
        # escape sequences in ordinary string literals (a warning on
        # modern Python, slated to become a syntax error).
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
        self.pragma_pattern = re.compile(r'[ \t]*pragma\W')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        """ Set the input text to tokenize. """
        self.lexer.input(text)

    def token(self):
        """ Return the next token (or None at end of input),
            remembering it in self.last_token.
        """
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """ Find the column of the token in its line.
        """
        # rfind returns -1 when there is no preceding newline, which
        # conveniently makes the first line's columns start at 1.
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        # Report through the user-supplied callback, then skip the
        # offending character so lexing can continue.
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        '_BOOL', '_COMPLEX', 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
        'REGISTER', 'OFFSETOF',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE',
    )

    # Map the source-text spelling of each keyword to its token name.
    # '_Bool' and '_Complex' keep their mixed-case C99 spelling; all
    # other keywords are simply the lowercase of the token name.
    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        elif keyword == '_COMPLEX':
            keyword_map['_Complex'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
    )

    ##
    ## Regexes for use in tokens
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'
    bin_prefix = '0[bB]'
    bin_digits = '[01]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt
    bin_constant = bin_prefix+bin_digits+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means to sometimes parse incorrect
    # code.
    #
    simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    decimal_escape = r"""(\d+)"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'

    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        # Dispatch on what follows the '#': a #line directive, a
        # #pragma, or a plain preprocessor hash token.
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        # End of the #line directive: commit the collected line
        # number/filename and return to normal lexing.
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)

    ##
    ## Rules for the pppragma state
    ##
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        pass

    t_pppragma_ignore = ' \t<>.-{}();=+-*/$%@&^~!?:,0123456789'

    @TOKEN(string_literal)
    def t_pppragma_STR(self, t):
        # Pragma contents are discarded.
        pass

    @TOKEN(identifier)
    def t_pppragma_ID(self, t):
        # Pragma contents are discarded.
        pass

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # C-style block comments /* ... */
    def t_BLOCK_COMMENT(self, t):
        r'/\*[\s\S]*?\*/'
        # Comment may span lines; keep lineno in sync.
        t.lexer.lineno += t.value.count('\n')

    # C++ style line comments // ...
    def t_LINE_COMMENT(self, t):
        r'//[^\n]*'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter.  The trouble begins with yacc's
    # lookahead token.  If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID.  So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.
    #
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t

    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bin_constant)
    def t_INT_CONST_BIN(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        # An identifier is a keyword, a previously typedef'd type
        # name, or a plain ID -- checked in that order.
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)
pcc/lex/lexer.py ADDED
@@ -0,0 +1,68 @@
1
+
2
+ from pcc.lex.token import TokenKind,Token
3
+
4
class Lexer(object):
    """Lexer for Kaleidoscope.

    Initialize the lexer with a string buffer. tokens() returns a generator
    that can be queried for tokens. The generator will emit an EOF token
    before stopping.
    """
    def __init__(self, buf):
        """Create a lexer over *buf* (any string, including empty).

        Previously an empty buffer was rejected with ``assert``, which is
        stripped under ``python -O`` and would then crash on ``buf[0]``.
        An empty buffer now simply produces a single EOF token.
        """
        self.buf = buf
        self.pos = 0
        # lastchar == '' is the end-of-input sentinel used throughout.
        self.lastchar = buf[0] if buf else ''

        self._keyword_map = {
            'def': TokenKind.DEF,
            'extern': TokenKind.EXTERN,
            'if': TokenKind.IF,
            'then': TokenKind.THEN,
            'else': TokenKind.ELSE,
            'for': TokenKind.FOR,
            'in': TokenKind.IN,
            'binary': TokenKind.BINARY,
            'unary': TokenKind.UNARY,
            'var': TokenKind.VAR,
        }

    def tokens(self):
        """Yield Token objects for the buffer, ending with an EOF token."""
        while self.lastchar:
            # Skip whitespace ('' is not whitespace, so this stops at EOF).
            while self.lastchar.isspace():
                self._advance()
            # Identifier or keyword
            if self.lastchar.isalpha():
                id_str = ''
                while self.lastchar.isalnum():
                    id_str += self.lastchar
                    self._advance()
                if id_str in self._keyword_map:
                    yield Token(kind=self._keyword_map[id_str], value=id_str)
                else:
                    yield Token(kind=TokenKind.IDENTIFIER, value=id_str)
            # Number
            elif self.lastchar.isdigit() or self.lastchar == '.':
                num_str = ''
                while self.lastchar.isdigit() or self.lastchar == '.':
                    num_str += self.lastchar
                    self._advance()
                yield Token(kind=TokenKind.NUMBER, value=num_str)
            # Comment: '#' to end of line, emits nothing.
            elif self.lastchar == '#':
                self._advance()
                while self.lastchar and self.lastchar not in '\r\n':
                    self._advance()
            elif self.lastchar:
                # Some other char -- a single-character operator token.
                yield Token(kind=TokenKind.OPERATOR, value=self.lastchar)
                self._advance()
        yield Token(kind=TokenKind.EOF, value='')

    def _advance(self):
        """Move to the next character, setting lastchar='' at end of input."""
        try:
            self.pos += 1
            self.lastchar = self.buf[self.pos]
        except IndexError:
            self.lastchar = ''
pcc/lex/token.py ADDED
@@ -0,0 +1,24 @@
1
+ from collections import namedtuple
2
+ from enum import Enum
3
+
4
+
5
+ # Each token is a tuple of kind and value. kind is one of the enumeration values
6
+ # in TokenKind. value is the textual value of the token in the input.
7
# Each token is a (kind, value) pair. `kind` is one of the TokenKind
# members below; `value` is the textual value of the token in the input.
# The enum is built with the functional API; member names and values are
# identical to an equivalent class-style definition.
_TOKEN_KINDS = [
    ('EOF', -1),
    ('DEF', -2),
    ('EXTERN', -3),
    ('IDENTIFIER', -4),
    ('NUMBER', -5),
    ('OPERATOR', -6),
    ('IF', -7),
    ('THEN', -8),
    ('ELSE', -9),
    ('FOR', -10),
    ('IN', -11),
    ('BINARY', -12),
    ('UNARY', -13),
    ('VAR', -14),
]

TokenKind = Enum('TokenKind', _TOKEN_KINDS)


Token = namedtuple('Token', ['kind', 'value'])
pcc/parse/__init__.py ADDED
File without changes