snail-lang 0.2.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,526 @@
1
+ """Top down operator precedence parser.
2
+
3
+ This is an implementation of Vaughan R. Pratt's
4
+ "Top Down Operator Precedence" parser.
5
+ (http://dl.acm.org/citation.cfm?doid=512927.512931).
6
+
7
+ These are some additional resources that help explain the
8
+ general idea behind a Pratt parser:
9
+
10
+ * http://effbot.org/zone/simple-top-down-parsing.htm
11
+ * http://javascript.crockford.com/tdop/tdop.html
12
+
13
+ A few notes on the implementation.
14
+
15
+ * All the nud/led tokens are on the Parser class itself, and are dispatched
16
+ using getattr(). This keeps all the parsing logic contained to a single
17
+ class.
18
+ * We use two passes through the data. One to create a list of tokens,
19
+ then one pass through the tokens to create the AST. While the lexer actually
20
+ yields tokens, we convert it to a list so we can easily implement two tokens
21
+ of lookahead. A previous implementation used a fixed circular buffer, but it
22
+ was significantly slower. Also, the average jmespath expression typically
23
+ does not have a large number of tokens so this is not an issue. And
24
+ interestingly enough, creating a token list first is actually faster than
25
+ consuming from the token iterator one token at a time.
26
+
27
+ """
28
+
29
+ import random
30
+
31
+ from . import lexer
32
+ from .compat import with_repr_method
33
+ from . import ast
34
+ from . import exceptions
35
+ from . import visitor
36
+
37
+
38
class Parser(object):
    """Pratt (top-down operator precedence) parser for JMESPath expressions.

    Each token type is handled by a ``_token_nud_<type>`` (prefix) or
    ``_token_led_<type>`` (infix) method, looked up with getattr() so all
    parsing logic stays on this one class.  Parsed results are memoized
    in a class-level cache shared by every Parser instance.
    """

    # Left binding power for each token type.  Tokens with power 0 can
    # only terminate an expression; anything with a power at or below
    # _PROJECTION_STOP also stops a projection.
    BINDING_POWER = {
        "eof": 0,
        "unquoted_identifier": 0,
        "quoted_identifier": 0,
        "literal": 0,
        "rbracket": 0,
        "rparen": 0,
        "comma": 0,
        "rbrace": 0,
        "number": 0,
        "current": 0,
        "expref": 0,
        "colon": 0,
        "pipe": 1,
        "or": 2,
        "and": 3,
        "eq": 5,
        "gt": 5,
        "lt": 5,
        "gte": 5,
        "lte": 5,
        "ne": 5,
        "flatten": 9,
        # Everything above stops a projection.
        "star": 20,
        "filter": 21,
        "dot": 40,
        "not": 45,
        "lbrace": 50,
        "lbracket": 55,
        "lparen": 60,
    }
    # The maximum binding power for a token that can stop
    # a projection.
    _PROJECTION_STOP = 10
    # The _MAX_SIZE most recent expressions are cached in
    # _CACHE dict.  NOTE: this cache is a class attribute, so it is
    # shared across all Parser instances (and cleared by purge()).
    _CACHE = {}
    _MAX_SIZE = 128

    def __init__(self, lookahead=2):
        # ``lookahead`` sizes the initial placeholder buffer only;
        # _parse() replaces _tokens with the full token list, so
        # _buffer_size is effectively vestigial after that point.
        self.tokenizer = None
        self._tokens = [None] * lookahead
        self._buffer_size = lookahead
        self._index = 0

    def parse(self, expression):
        """Parse ``expression`` and return a ParsedResult.

        Results are cached by expression string; when the cache grows
        past _MAX_SIZE, roughly half of it is evicted at random.
        """
        cached = self._CACHE.get(expression)
        if cached is not None:
            return cached
        parsed_result = self._do_parse(expression)
        self._CACHE[expression] = parsed_result
        if len(self._CACHE) > self._MAX_SIZE:
            self._free_cache_entries()
        return parsed_result

    def _do_parse(self, expression):
        # Attach the full expression text to any error raised during
        # parsing so the error message can show its position in context.
        try:
            return self._parse(expression)
        except exceptions.LexerError as e:
            e.expression = expression
            raise
        except exceptions.IncompleteExpressionError as e:
            e.set_expression(expression)
            raise
        except exceptions.ParseError as e:
            e.expression = expression
            raise

    def _parse(self, expression):
        """Tokenize ``expression`` and build its AST.

        Raises ParseError if any tokens remain after a complete
        expression has been parsed (everything must end at "eof").
        """
        # Materialize the lexer's token stream into a list so we get
        # cheap arbitrary lookahead (see module docstring).
        self.tokenizer = lexer.Lexer().tokenize(expression)
        self._tokens = list(self.tokenizer)
        self._index = 0
        parsed = self._expression(binding_power=0)
        if not self._current_token() == "eof":
            t = self._lookahead_token(0)
            raise exceptions.ParseError(
                t["start"], t["value"], t["type"], "Unexpected token: %s" % t["value"]
            )
        return ParsedResult(expression, parsed)

    def _expression(self, binding_power=0):
        """Core Pratt loop.

        Consume one token and apply its nud (prefix) handler, then keep
        applying led (infix) handlers while the next token binds more
        tightly than ``binding_power``.
        """
        left_token = self._lookahead_token(0)
        self._advance()
        # Unknown token types fall back to _error_nud_token, which raises.
        nud_function = getattr(
            self, "_token_nud_%s" % left_token["type"], self._error_nud_token
        )
        left = nud_function(left_token)
        current_token = self._current_token()
        while binding_power < self.BINDING_POWER[current_token]:
            led = getattr(self, "_token_led_%s" % current_token, None)
            if led is None:
                # Token has a binding power but no infix handler: error.
                error_token = self._lookahead_token(0)
                self._error_led_token(error_token)
            else:
                self._advance()
                left = led(left)
                current_token = self._current_token()
        return left

    def _token_nud_literal(self, token):
        # A literal value, e.g. `"foo"` or `42` (JSON literal syntax).
        return ast.literal(token["value"])

    def _token_nud_unquoted_identifier(self, token):
        # A bare field name, e.g. foo.
        return ast.field(token["value"])

    def _token_nud_quoted_identifier(self, token):
        # A quoted field name, e.g. "foo bar".
        field = ast.field(token["value"])
        # You can't have a quoted identifier as a function
        # name.
        if self._current_token() == "lparen":
            # NOTE(review): position is reported as 0 here rather than
            # t["start"] — confirm this is intentional.
            t = self._lookahead_token(0)
            raise exceptions.ParseError(
                0,
                t["value"],
                t["type"],
                "Quoted identifier not allowed for function names.",
            )
        return field

    def _token_nud_star(self, token):
        # A leading `*` is a value projection over the current node.
        left = ast.identity()
        if self._current_token() == "rbracket":
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER["star"])
        return ast.value_projection(left, right)

    def _token_nud_filter(self, token):
        # A leading filter `[?...]` filters the current node (identity).
        return self._token_led_filter(ast.identity())

    def _token_nud_lbrace(self, token):
        # `{` begins a multi-select hash, e.g. {a: foo, b: bar}.
        return self._parse_multi_select_hash()

    def _token_nud_lparen(self, token):
        # Parenthesized sub-expression; parsed at binding power 0.
        expression = self._expression()
        self._match("rparen")
        return expression

    def _token_nud_flatten(self, token):
        # A leading `[]` flattens the current node, then projects.
        left = ast.flatten(ast.identity())
        right = self._parse_projection_rhs(self.BINDING_POWER["flatten"])
        return ast.projection(left, right)

    def _token_nud_not(self, token):
        # Unary `!` negation.
        expr = self._expression(self.BINDING_POWER["not"])
        return ast.not_expression(expr)

    def _token_nud_lbracket(self, token):
        # A leading `[` is either an index/slice, a `[*]` projection,
        # or a multi-select list, depending on what follows.
        if self._current_token() in ["number", "colon"]:
            right = self._parse_index_expression()
            # We could optimize this and remove the identity() node.
            # We don't really need an index_expression node, we can
            # just use emit an index node here if we're not dealing
            # with a slice.
            return self._project_if_slice(ast.identity(), right)
        elif self._current_token() == "star" and self._lookahead(1) == "rbracket":
            self._advance()
            self._advance()
            right = self._parse_projection_rhs(self.BINDING_POWER["star"])
            return ast.projection(ast.identity(), right)
        else:
            return self._parse_multi_select_list()

    def _parse_index_expression(self):
        # We're here:
        # [<current>
        #  ^
        #  | current token
        # A colon at position 0 or 1 means this is a slice, not an index.
        if self._lookahead(0) == "colon" or self._lookahead(1) == "colon":
            return self._parse_slice_expression()
        else:
            # Parse the syntax [number]
            node = ast.index(self._lookahead_token(0)["value"])
            self._advance()
            self._match("rbracket")
            return node

    def _parse_slice_expression(self):
        # [start:end:step]
        # Where start, end, and step are optional.
        # The last colon is optional as well.
        # parts holds [start, end, step]; missing pieces stay None.
        parts = [None, None, None]
        index = 0
        current_token = self._current_token()
        while not current_token == "rbracket" and index < 3:
            if current_token == "colon":
                index += 1
                if index == 3:
                    # Too many colons, e.g. [::0:]
                    self._raise_parse_error_for_token(
                        self._lookahead_token(0), "syntax error"
                    )
                self._advance()
            elif current_token == "number":
                parts[index] = self._lookahead_token(0)["value"]
                self._advance()
            else:
                self._raise_parse_error_for_token(
                    self._lookahead_token(0), "syntax error"
                )
            current_token = self._current_token()
        self._match("rbracket")
        return ast.slice(*parts)

    def _token_nud_current(self, token):
        # `@` refers to the current node being evaluated.
        return ast.current_node()

    def _token_nud_expref(self, token):
        # `&expr` creates an expression reference (unevaluated expr).
        expression = self._expression(self.BINDING_POWER["expref"])
        return ast.expref(expression)

    def _token_led_dot(self, left):
        """Handle `left.rhs` — either a sub-expression or a `.*` projection."""
        if not self._current_token() == "star":
            right = self._parse_dot_rhs(self.BINDING_POWER["dot"])
            if left["type"] == "subexpression":
                # Flatten chained dots into one subexpression node.
                left["children"].append(right)
                return left
            else:
                return ast.subexpression([left, right])
        else:
            # We're creating a projection.
            self._advance()
            right = self._parse_projection_rhs(self.BINDING_POWER["dot"])
            return ast.value_projection(left, right)

    def _token_led_pipe(self, left):
        # `left | right` — pipe stops projections on the left side.
        right = self._expression(self.BINDING_POWER["pipe"])
        return ast.pipe(left, right)

    def _token_led_or(self, left):
        right = self._expression(self.BINDING_POWER["or"])
        return ast.or_expression(left, right)

    def _token_led_and(self, left):
        right = self._expression(self.BINDING_POWER["and"])
        return ast.and_expression(left, right)

    def _token_led_lparen(self, left):
        """Handle a function call: `name(arg1, arg2, ...)`.

        ``left`` must be a plain field node (the function name); anything
        else is an invalid function name.
        """
        if left["type"] != "field":
            #  0 - first func arg or closing paren.
            # -1 - '(' token
            # -2 - invalid function "name".
            prev_t = self._lookahead_token(-2)
            raise exceptions.ParseError(
                prev_t["start"],
                prev_t["value"],
                prev_t["type"],
                "Invalid function name '%s'" % prev_t["value"],
            )
        name = left["value"]
        args = []
        while not self._current_token() == "rparen":
            expression = self._expression()
            if self._current_token() == "comma":
                self._match("comma")
            args.append(expression)
        self._match("rparen")
        function_node = ast.function_expression(name, args)
        return function_node

    def _token_led_filter(self, left):
        # Filters are projections.
        condition = self._expression(0)
        self._match("rbracket")
        if self._current_token() == "flatten":
            # Let the upcoming flatten token build its own projection.
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER["filter"])
        return ast.filter_projection(left, right, condition)

    def _token_led_eq(self, left):
        return self._parse_comparator(left, "eq")

    def _token_led_ne(self, left):
        return self._parse_comparator(left, "ne")

    def _token_led_gt(self, left):
        return self._parse_comparator(left, "gt")

    def _token_led_gte(self, left):
        return self._parse_comparator(left, "gte")

    def _token_led_lt(self, left):
        return self._parse_comparator(left, "lt")

    def _token_led_lte(self, left):
        return self._parse_comparator(left, "lte")

    def _token_led_flatten(self, left):
        # `left[]` — flatten then project over the flattened result.
        left = ast.flatten(left)
        right = self._parse_projection_rhs(self.BINDING_POWER["flatten"])
        return ast.projection(left, right)

    def _token_led_lbracket(self, left):
        """Handle `left[...]` — index, slice, or `[*]` projection."""
        token = self._lookahead_token(0)
        if token["type"] in ["number", "colon"]:
            right = self._parse_index_expression()
            if left["type"] == "index_expression":
                # Optimization: if the left node is an index expr,
                # we can avoid creating another node and instead just add
                # the right node as a child of the left.
                left["children"].append(right)
                return left
            else:
                return self._project_if_slice(left, right)
        else:
            # We have a projection
            self._match("star")
            self._match("rbracket")
            right = self._parse_projection_rhs(self.BINDING_POWER["star"])
            return ast.projection(left, right)

    def _project_if_slice(self, left, right):
        # Slices are projections (they yield multiple elements);
        # plain indexes are not.
        index_expr = ast.index_expression([left, right])
        if right["type"] == "slice":
            return ast.projection(
                index_expr, self._parse_projection_rhs(self.BINDING_POWER["star"])
            )
        else:
            return index_expr

    def _parse_comparator(self, left, comparator):
        # Shared helper for eq/ne/gt/gte/lt/lte infix handlers.
        right = self._expression(self.BINDING_POWER[comparator])
        return ast.comparator(comparator, left, right)

    def _parse_multi_select_list(self):
        """Parse `[expr, expr, ...]` after the opening bracket is consumed."""
        expressions = []
        while True:
            expression = self._expression()
            expressions.append(expression)
            if self._current_token() == "rbracket":
                break
            else:
                self._match("comma")
        self._match("rbracket")
        return ast.multi_select_list(expressions)

    def _parse_multi_select_hash(self):
        """Parse `{key: expr, ...}` after the opening brace is consumed."""
        pairs = []
        while True:
            key_token = self._lookahead_token(0)
            # Before getting the token value, verify it's
            # an identifier.
            self._match_multiple_tokens(
                token_types=["quoted_identifier", "unquoted_identifier"]
            )
            key_name = key_token["value"]
            self._match("colon")
            value = self._expression(0)
            node = ast.key_val_pair(key_name=key_name, node=value)
            pairs.append(node)
            if self._current_token() == "comma":
                self._match("comma")
            elif self._current_token() == "rbrace":
                self._match("rbrace")
                break
        return ast.multi_select_dict(nodes=pairs)

    def _parse_projection_rhs(self, binding_power):
        # Parse the right hand side of the projection.
        if self.BINDING_POWER[self._current_token()] < self._PROJECTION_STOP:
            # BP of 10 are all the tokens that stop a projection.
            right = ast.identity()
        elif self._current_token() == "lbracket":
            right = self._expression(binding_power)
        elif self._current_token() == "filter":
            right = self._expression(binding_power)
        elif self._current_token() == "dot":
            self._match("dot")
            right = self._parse_dot_rhs(binding_power)
        else:
            self._raise_parse_error_for_token(self._lookahead_token(0), "syntax error")
        return right

    def _parse_dot_rhs(self, binding_power):
        # From the grammar:
        # expression '.' ( identifier /
        #                  multi-select-list /
        #                  multi-select-hash /
        #                  function-expression /
        #                  *
        # In terms of tokens that means that after a '.',
        # you can have:
        lookahead = self._current_token()
        # Common case "foo.bar", so first check for an identifier.
        if lookahead in ["quoted_identifier", "unquoted_identifier", "star"]:
            return self._expression(binding_power)
        elif lookahead == "lbracket":
            self._match("lbracket")
            return self._parse_multi_select_list()
        elif lookahead == "lbrace":
            self._match("lbrace")
            return self._parse_multi_select_hash()
        else:
            t = self._lookahead_token(0)
            allowed = ["quoted_identifier", "unquoted_identifier", "lbracket", "lbrace"]
            msg = "Expecting: %s, got: %s" % (allowed, t["type"])
            self._raise_parse_error_for_token(t, msg)

    def _error_nud_token(self, token):
        # Fallback prefix handler: distinguish a truncated expression
        # (eof) from a genuinely invalid token.
        if token["type"] == "eof":
            raise exceptions.IncompleteExpressionError(
                token["start"], token["value"], token["type"]
            )
        self._raise_parse_error_for_token(token, "invalid token")

    def _error_led_token(self, token):
        # Fallback infix handler: the token can't continue an expression.
        self._raise_parse_error_for_token(token, "invalid token")

    def _match(self, token_type=None):
        """Consume the current token, which must be of ``token_type``."""
        if self._current_token() == token_type:
            self._advance()
        else:
            self._raise_parse_error_maybe_eof(token_type, self._lookahead_token(0))

    def _match_multiple_tokens(self, token_types):
        """Consume the current token, which must be one of ``token_types``."""
        if self._current_token() not in token_types:
            self._raise_parse_error_maybe_eof(token_types, self._lookahead_token(0))
        self._advance()

    def _advance(self):
        # Move to the next token in the pre-built token list.
        self._index += 1

    def _current_token(self):
        # Type of the token at the current position.
        return self._tokens[self._index]["type"]

    def _lookahead(self, number):
        # Type of the token ``number`` positions ahead (may be negative).
        return self._tokens[self._index + number]["type"]

    def _lookahead_token(self, number):
        # Full token dict ``number`` positions ahead (may be negative).
        return self._tokens[self._index + number]

    def _raise_parse_error_for_token(self, token, reason):
        lex_position = token["start"]
        actual_value = token["value"]
        actual_type = token["type"]
        raise exceptions.ParseError(lex_position, actual_value, actual_type, reason)

    def _raise_parse_error_maybe_eof(self, expected_type, token):
        # An unexpected "eof" means the expression was truncated, which
        # gets its own exception type; anything else is a plain ParseError.
        lex_position = token["start"]
        actual_value = token["value"]
        actual_type = token["type"]
        if actual_type == "eof":
            raise exceptions.IncompleteExpressionError(
                lex_position, actual_value, actual_type
            )
        message = "Expecting: %s, got: %s" % (expected_type, actual_type)
        raise exceptions.ParseError(lex_position, actual_value, actual_type, message)

    def _free_cache_entries(self):
        # Evict half the cache at random; cheap and good enough since
        # we only need to bound the cache size, not track recency.
        for key in random.sample(list(self._CACHE.keys()), int(self._MAX_SIZE / 2)):
            self._CACHE.pop(key, None)

    @classmethod
    def purge(cls):
        """Clear the expression compilation cache."""
        cls._CACHE.clear()
498
+
499
+
500
@with_repr_method
class ParsedResult(object):
    """Pairs a raw JMESPath expression string with its compiled AST."""

    def __init__(self, expression, parsed):
        # Keep both the original source text and the parsed AST.
        self.expression = expression
        self.parsed = parsed

    def search(self, value, options=None):
        """Evaluate this compiled expression against ``value``.

        ``options`` is handed through to the tree interpreter unchanged.
        """
        interpreter = visitor.TreeInterpreter(options)
        return interpreter.visit(self.parsed, value)

    def _render_dot_file(self):
        """Render the parsed AST as a dot file.

        Note that this is marked as an internal method because
        the AST is an implementation detail and is subject
        to change. This method can be used to help troubleshoot
        or for development purposes, but is not considered part
        of the public supported API. Use at your own risk.

        """
        renderer = visitor.GraphvizVisitor()
        return renderer.visit(self.parsed)

    def __repr__(self):
        return repr(self.parsed)