wxpath 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/core/parser.py ADDED
@@ -0,0 +1,598 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from itertools import pairwise
4
+ from typing import Iterable, Iterator, TypeAlias
5
+
6
+ try:
7
+ from enum import StrEnum
8
+ except ImportError:
9
+ from enum import Enum
10
+
11
+ class StrEnum(str, Enum):
12
+ pass
13
+
14
+
15
# Ordered lexer table of (token type, regex) pairs.
#
# The patterns are joined into one alternation (see TOKEN_RE below), and Python's
# ``re`` tries alternatives left to right, so ORDER MATTERS: an earlier entry
# wins whenever two entries could match at the same position.
TOKEN_SPEC = [
    ("NUMBER", r"\d+(\.\d+)?"),
    ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""),  # TODO: Rename to URL Literal
    # Up to three leading slashes, optional whitespace, then the text 'url'.
    # Listed before OP so the slashes are not split off as '/' operators.
    # NOTE(review): the original comment said "must come before NAME", but the
    # NAME entry below is commented out, so no NAME token is ever produced here.
    ("WXPATH", r"/{0,3}\s*url"),
    # ("///URL", r"/{3}\s*url"),
    # ("//URL", r"/{2}\s*url"),
    # ("/URL", r"/{1}\s*url"),
    # NOTE(review): this entry looks unreachable -- WXPATH with zero slashes
    # matches the same text and is tried first.
    ("URL", r"\s*url"),
    # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
    # Optional comma, optional whitespace, then 'follow='. The empty ``{,}``
    # bound behaves like ``*`` in Python's re module.
    ("FOLLOW", r",?\s{,}follow="),
    ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"),  # Added || for string concat
    ("LPAREN", r"\("),
    ("RPAREN", r"\)"),
    ("LBRACE", r"\{"),
    ("RBRACE", r"\}"),
    ("COLON", r":"),
    ("COMMA", r","),
    ("WS", r"\s+"),
    ("DOT", r"\."),
    ("OTHER", r"."),  # Catch-all for xpath operators: /, @, [, ], etc.
]

# One compiled alternation of named groups, one group per TOKEN_SPEC entry.
# tokenize() reads ``match.lastgroup`` to recover which entry matched.
TOKEN_RE = re.compile("|".join(
    f"(?P<{name}>{pattern})"
    for name, pattern in TOKEN_SPEC
))
41
+
42
+
43
@dataclass
class Token:
    """A single lexical token produced by ``tokenize``."""

    # Token category: a TOKEN_SPEC group name, or "EOF" for the end marker.
    type: str
    # Exact source text matched for this token ("" for EOF).
    value: str
    start: int = 0  # position in source string
    # Offset just past the last matched character (``match.end()``).
    end: int = 0
49
+
50
+
51
def tokenize(src: str):
    """Lazily split *src* into ``Token`` objects, ending with an EOF token.

    Whitespace is emitted as ``WS`` tokens rather than dropped: native XPath
    such as ``and not(...)`` depends on it, so later stages must be able to
    reproduce the original spacing.

    Args:
        src: The wxpath expression text.

    Yields:
        One ``Token`` per regex match, in source order, followed by a final
        ``Token("EOF", "")`` positioned at ``len(src)``.
    """
    for match in TOKEN_RE.finditer(src):
        yield Token(match.lastgroup, match.group(), match.start(), match.end())

    eof_pos = len(src)
    yield Token("EOF", "", eof_pos, eof_pos)
60
+
61
+
62
@dataclass
class Number:
    """AST node for a numeric literal (built from a NUMBER token via ``float``)."""

    value: float
65
+
66
@dataclass
class String:
    """AST node for a quoted string literal; the parser stores it without quotes."""

    value: str
69
+
70
+
71
@dataclass
class Name:
    """AST node for a bare identifier (a NAME token not followed by ``(``)."""

    value: str
74
+
75
@dataclass
class Xpath:
    """AST node holding a span of plain XPath text, kept verbatim as a string."""

    value: str
78
+
79
@dataclass
class Wxpath:
    """AST node for a WXPATH token that is not followed by ``(``."""

    # The token text as normalised by ``Parser.nud`` (e.g. "url", "//url").
    value: str
82
+
83
@dataclass
class Call:
    """AST node for a function call: a function name plus its parsed arguments."""

    func: str
    # Parsed argument nodes, in source order.
    args: list
87
+
88
@dataclass
class Url(Call):
    """Common base class for every ``url(...)`` call variant."""

    pass
91
+
92
@dataclass
class UrlLiteral(Url):
    """``url('...')`` whose single argument is a ``String`` literal."""

    pass
95
+
96
@dataclass
class UrlQuery(Url):
    """``url(<xpath>)``, ``/url(<xpath>)`` or ``//url(<xpath>)`` with one xpath argument."""

    pass

# Alternate name bound to the very same class.
UrlSelect = UrlQuery
101
+
102
@dataclass
class UrlCrawl(Url):
    """``url('...' follow=<xpath>)`` (String + Xpath args) or ``///url(<xpath>)``."""

    pass

# Alternate name bound to the very same class.
UrlFollow = UrlCrawl
107
+
108
@dataclass
class Binary:
    """AST node for a binary operation ``left <op> right``."""

    left: object
    # Operator text exactly as tokenised (e.g. "||", "!", "+").
    op: str
    right: object

# Type alias for the element types this module documents for a ``Segments`` list.
Segment: TypeAlias = Url | Xpath
115
+
116
class Segments(list):
    """A ``list`` of parsed segments that labels itself in its text form.

    Behaves exactly like a list; only ``repr()`` / ``str()`` differ, rendering
    as ``Segments([...])`` so a segment sequence is distinguishable from a
    plain list in logs and error messages.
    """

    def __repr__(self):
        """Return ``Segments(<list repr>)``."""
        return f"Segments({super().__repr__()})"

    def __str__(self):
        """Return the same text as ``repr()``.

        ``list`` defines no ``__str__`` of its own, so ``super().__str__()``
        resolves to ``object.__str__``, which calls back into the overridden
        ``__repr__``. Wrapping that result again produced
        ``Segments(Segments([...]))``; delegating to ``repr`` avoids the
        double wrap.
        """
        return repr(self)
122
+
123
@dataclass
class Other:
    """AST node wrapping a piece of otherwise unclassified source text.

    NOTE(review): nothing in the visible part of this module constructs it.
    """

    value: str
126
+
127
+
128
@dataclass
class ContextItem(Xpath):
    """Represents the XPath context item expression: ``.``

    An ``Xpath`` subclass whose ``value`` defaults to ``"."``, so it can be
    created as ``ContextItem()`` and still passes ``isinstance(x, Xpath)``.
    """
    value: str = "."
132
+
133
+
134
# Binding power of each binary operator; a larger number binds tighter.
# Parser.parse_binary looks operators up here and treats anything missing as
# precedence -1, which ends the operator loop.
PRECEDENCE = {
    "||": 5,   # String concatenation (lowest precedence)
    "=": 10,
    "!=": 10,
    "<": 10,
    "<=": 10,
    ">": 10,
    ">=": 10,
    "+": 20,
    "-": 20,
    "*": 30,
    "/": 30,
    "!": 40,   # Simple map operator (highest precedence)
}
148
+
149
+
150
class Parser:
    """Pratt-style parser that produces wxpath AST nodes.

    The parser holds exactly one current token (``self.token``) and pulls the
    next one from the token iterator on ``advance()``; there is no further
    lookahead. Plain XPath is not parsed structurally: it is accumulated as
    text and wrapped in ``Xpath`` nodes, while ``url(...)`` calls become
    ``Url`` subclasses.
    """

    def __init__(self, tokens: Iterable[Token]):
        """Prime the parser with the first token of *tokens*.

        An empty token stream is treated as an input that has already ended
        (a synthetic EOF token) instead of leaking ``StopIteration``.
        """
        self.tokens: Iterator[Token] = iter(tokens)
        first = next(self.tokens, None)
        self.token: Token = first if first is not None else Token("EOF", "")

    def advance(self) -> None:
        """Move to the next token.

        If the underlying stream is exhausted (e.g. a hand-built stream with no
        EOF marker), a synthetic EOF token is installed so every
        ``while self.token.type != "EOF"`` loop still terminates.
        """
        nxt = next(self.tokens, None)
        if nxt is None:
            end = self.token.end
            nxt = Token("EOF", "", end, end)
        self.token = nxt

    def parse(self) -> object:
        """Parse the input tokens into an AST or raise on unexpected trailing tokens.

        Raises:
            SyntaxError: If tokens remain after a complete expression.
        """
        output = self.expression(0)
        if self.token.type != "EOF":
            raise SyntaxError(f"unexpected token: {self.token}")

        return output

    def expression(self, min_prec: int) -> object:
        """Parse an expression whose operators bind at least as tightly as *min_prec*."""
        return self.parse_binary(min_prec)

    def parse_binary(self, min_prec: int) -> object:
        """Parse a binary expression chain honoring operator precedence.

        An operand that starts with a WXPATH token is parsed as a whole segment
        sequence; any other operand goes through ``nud``. Note that ``nud``
        returns ``None`` for tokens it does not recognise, so an operand may be
        ``None``.
        """
        if self.token.type == "WXPATH":
            left = self.parse_segments()
        else:
            left = self.nud()

        while self.token.type == "OP" and PRECEDENCE.get(self.token.value, -1) >= min_prec:
            op = self.token.value
            prec = PRECEDENCE[op]
            self.advance()
            if self.token.type == "WXPATH":
                right = self.parse_segments()
            else:
                # prec + 1 makes equal-precedence operators left-associative.
                right = self.parse_binary(prec + 1)
            left = Binary(left, op, right)

        return left

    # NOTE: this staticmethod object is applied as a decorator inside the class
    # body below, which relies on staticmethod objects being directly callable
    # (Python 3.10+; the module already needs 3.10 for itertools.pairwise).
    @staticmethod
    def _validate_segments(func):
        """Decorator that validates segment invariants after parsing.

        Raises ValueError if the xpath in ``url(<xpath>)`` begins with ``/``
        or ``//`` when it follows an Xpath segment.

        Args:
            func: A method taking only ``self`` that returns a list of segments.

        Returns:
            The wrapped function that performs validation.
        """
        from functools import wraps

        @wraps(func)  # keep parse_segments' name and docstring
        def _func(self) -> Segments:
            segments = func(self)
            for seg1, seg2 in pairwise(segments):
                if (
                    isinstance(seg1, Xpath)
                    and isinstance(seg2, Url)
                    # "/" as a prefix also covers "//".
                    and seg2.args[0].value.startswith("/")
                ):
                    raise ValueError(
                        f"Invalid segments: {segments}. the <xpath> in url(<xpath>)"
                        " may not begin with / or // if following an Xpath segment."
                    )
            return segments
        return _func

    @_validate_segments
    def parse_segments(self) -> Segments:
        """Parse a sequence of wxpath segments: url() calls interspersed with xpath.

        Handles patterns like::

            url('...')
            url('...')//a/@href
            url('...')//a/url(@href)//b
            //a/@href
            //a/map { 'key': value }

        Parsing stops at EOF or at a ``)``, ``,`` or ``}`` that belongs to an
        enclosing construct (the token is left unconsumed).

        Returns:
            A Segments list containing the parsed Url and Xpath nodes.
        """
        segments = []

        while self.token.type != "EOF":
            kind = self.token.type
            if kind == "WXPATH":
                # Parse url() call; it may itself expand to several segments.
                call = self.nud()
                if call is not None:
                    if isinstance(call, (Segments, list)):
                        segments.extend(call)
                    else:
                        segments.append(call)
            elif kind in ("RPAREN", "COMMA", "RBRACE"):
                # End of nested call / argument separator / end of map context.
                break
            else:
                # Capture xpath content until next url() or end
                xpath_content = self.capture_xpath_until_wxpath_or_end().strip()
                if xpath_content:
                    segments.append(Xpath(xpath_content))

        return Segments(segments)

    def nud(self) -> object | None:
        """Parse a null-denoting expression (nud).

        Null-denoting expressions include numbers, strings, the context item
        ``.``, ``url`` tokens (optionally as calls), names, and expressions
        enclosed in parentheses.

        Returns:
            The parsed AST node, or None if the token is unrecognized. In that
            case the token is NOT consumed and the caller must handle it.

        Raises:
            SyntaxError: If a parenthesised expression is not closed by ``)``.
        """
        tok = self.token

        if tok.type == "NUMBER":
            self.advance()
            return Number(float(tok.value))

        if tok.type == "STRING":
            self.advance()
            return String(tok.value[1:-1])  # strip quotes

        if tok.type == "DOT":
            self.advance()
            return ContextItem()

        if tok.type == "WXPATH":
            # The WXPATH pattern allows any whitespace between the slashes and
            # 'url' (tabs, CR, ...), so remove all of it -- not only spaces and
            # newlines -- or names such as "/\turl" would never match the
            # "/url"-style comparisons done when the call is specialised.
            value = "".join(tok.value.split())
            self.advance()

            if self.token.type == "LPAREN":
                return self.parse_call(value)

            return Wxpath(value)

        # The token table visible in this module defines no NAME pattern, so
        # this branch only fires for token streams built elsewhere.
        if tok.type == "NAME":
            self.advance()

            # function call
            if self.token.type == "LPAREN":
                return self.parse_call(tok.value)

            return Name(tok.value)

        if tok.type == "LPAREN":
            self.advance()
            expr = self.expression(0)
            if self.token.type != "RPAREN":
                raise SyntaxError("expected ')'")
            self.advance()
            return expr

        # For other tokens (xpath content), return None to signal caller to handle
        return None

    def capture_xpath_until_wxpath_or_end(self) -> str:
        """Capture xpath tokens until a WXPATH token, EOF, RPAREN, or COMMA.

        Balances parentheses and braces so that xpath functions like
        ``contains()`` and map constructors like ``map { ... }`` are captured
        correctly. The stop tokens only end the capture when seen outside any
        parentheses/braces opened during this capture; an unmatched ``)`` or
        ``}`` also ends it. The stopping token is left unconsumed.

        Returns:
            The accumulated xpath content as a string.
        """
        parts: list[str] = []
        paren_depth = 0
        brace_depth = 0

        while self.token.type != "EOF":
            kind = self.token.type

            # Stop conditions (only at depth 0 for both parens and braces)
            if (
                paren_depth == 0
                and brace_depth == 0
                and kind in ("WXPATH", "RPAREN", "COMMA")
            ):
                break

            # Track paren depth for xpath functions
            if kind == "LPAREN":
                paren_depth += 1
            elif kind == "RPAREN":
                paren_depth -= 1
                if paren_depth < 0:
                    # This RPAREN closes an outer context
                    break

            # Track brace depth for map constructors
            if kind == "LBRACE":
                brace_depth += 1
            elif kind == "RBRACE":
                brace_depth -= 1
                if brace_depth < 0:
                    # This RBRACE closes an outer context
                    break

            parts.append(self.token.value)
            self.advance()

        return "".join(parts)

    def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
        """Capture content inside a url() call, handling nested wxpath expressions.

        Must be called with the opening ``(`` of the url() call already
        consumed; on return the current token is that call's closing ``)``.

        Supports patterns like::

            url(//a/@href)                  -> [Xpath]
            url(.)                          -> [ContextItem]
            url(//a follow=//b)             -> [Xpath, Xpath]
            url( url('..')//a/@href )       -> [Call, Xpath]
            url( url( url('..')//a )//b )   -> [Call, Xpath]

        Text before a ``follow=`` token and text after it are collected
        separately; every token, including parentheses and braces, goes to
        whichever part is currently active.

        Returns:
            A list of parsed elements: Xpath nodes for xpath content and Call
            nodes for nested url() calls.

        Raises:
            SyntaxError: If the input ends before the closing ``)``.
        """
        elements = []
        main_parts: list[str] = []
        follow_parts: list[str] = []
        paren_balance = 1  # We're already inside the opening paren of url()
        reached_follow_token = False

        while paren_balance > 0 and self.token.type != "EOF":
            tok = self.token

            if tok.type == "WXPATH":
                # Found nested wxpath: save any accumulated xpath content first
                pending = "".join(main_parts).strip()
                if pending:
                    elements.append(Xpath(pending))
                    main_parts.clear()

                # Parse the nested url() call using nud()
                # This recursively handles deeply nested wxpath
                nested_call = self.nud()
                if nested_call is not None:
                    elements.append(nested_call)
                continue

            if tok.type == "FOLLOW":
                reached_follow_token = True
                self.advance()
                continue

            if tok.type == "LPAREN":
                # Opening paren that's NOT part of a url() call
                # (it's part of an xpath function like contains(), starts-with(), etc.)
                paren_balance += 1
            elif tok.type == "RPAREN":
                paren_balance -= 1
                if paren_balance == 0:
                    # This is the closing paren of the outer url()
                    break

            # Everything else -- and any paren/brace that did not end the call --
            # is xpath text for the currently active part.
            target = follow_parts if reached_follow_token else main_parts
            target.append(tok.value)
            self.advance()

        if paren_balance != 0:
            raise SyntaxError("unbalanced parentheses in url()")

        # Save any remaining xpath content
        current_xpath = "".join(main_parts).strip()
        if current_xpath:
            if current_xpath == ".":
                elements.append(ContextItem())
            else:
                elements.append(Xpath(current_xpath))

        follow_xpath = "".join(follow_parts).strip()
        if follow_xpath:
            elements.append(Xpath(follow_xpath))

        return elements

    def parse_call(self, func_name: str) -> Call | Segments:
        """Parse a function call (including url variants) and specialize node types.

        Must be called with the current token on the opening ``(``.

        Args:
            func_name: Name of the function being called (e.g. "url", "//url").

        Returns:
            The node produced by ``_specify_call_types`` for the parsed args.

        Raises:
            SyntaxError: If the argument list is not closed by ``)``.
        """
        self.advance()  # consume '('
        args = []

        if func_name.endswith("url"):
            if self.token.type == "STRING":
                # Simple case: url('literal string')
                # NOTE(review): this only triggers when the literal directly
                # follows '('. With whitespace in between, the first token is WS
                # and the literal is captured as xpath text by the branch below.
                args = [String(self.token.value[1:-1])]  # strip quotes
                self.advance()
                # Handle follow=...
                if self.token.type == "FOLLOW":
                    self.advance()
                    args.extend(self.capture_url_arg_content())
            else:
                # Xpath argument and/or nested wxpath:
                # url(//a/@href), url( url('...')//a/@href ), url( /url(...) )
                args = self.capture_url_arg_content()

        # Handle additional comma-separated arguments
        if self.token.type != "RPAREN":
            while True:
                args.append(self.expression(0))
                if self.token.type != "COMMA":
                    break
                self.advance()

        if self.token.type != "RPAREN":
            raise SyntaxError("expected ')'")
        self.advance()

        return _specify_call_types(func_name, args)
491
+
492
+
493
def _specify_call_types(func_name: str, args: list) -> Call | Segments:
    """Turn a parsed call into the most specific AST node for it.

    Mapping (anything else under a url-style name raises ``ValueError``):

    * ``url(String)``                 -> ``UrlLiteral``
    * ``url(Xpath | ContextItem)``    -> ``UrlQuery``
    * ``url(String, Xpath)``          -> ``UrlCrawl``
    * ``url(UrlLiteral, Xpath)``      -> ``Segments([literal, xpath, url(.)])``
    * ``url(Segments | list, Xpath)`` -> ``Segments([*inner, xpath])``
    * ``/url(x)`` and ``//url(x)``    -> ``UrlQuery`` (x is Xpath/ContextItem)
    * ``///url(x)``                   -> ``UrlCrawl`` (x is Xpath/ContextItem)
    * any other function name         -> generic ``Call``

    Neither *args* nor a nested list inside it is modified; flattened results
    are built as new ``Segments`` objects.

    Args:
        func_name: Name of the called function, e.g. ``"url"`` or ``"//url"``.
        args: Parsed argument nodes in source order.

    Returns:
        A ``Call`` (or ``Url`` subclass), or a ``Segments`` list when a nested
        url() expression is flattened.

    Raises:
        ValueError: If a url-style call has an unsupported number or type of
            arguments.
    """
    if func_name == "url":
        if len(args) == 1:
            only = args[0]
            if isinstance(only, String):
                return UrlLiteral(func_name, args)
            if isinstance(only, (Xpath, ContextItem)):
                return UrlQuery(func_name, args)
            raise ValueError(f"Unknown argument type: {type(only)}")

        if len(args) == 2:
            head, tail = args
            if isinstance(tail, Xpath):
                if isinstance(head, String):
                    return UrlCrawl(func_name, args)
                if isinstance(head, UrlLiteral):
                    # url( url('..')//a ) -> fetch literal, select, then follow
                    # each selected item via url(.)
                    return Segments([head, tail, UrlQuery('url', [ContextItem()])])
                if isinstance(head, (Segments, list)):
                    # Already-flattened inner segments: extend with a copy
                    # instead of appending to the caller's list.
                    return Segments([*head, tail])

        raise ValueError(f"Unknown arguments: {args}")

    if func_name in ("/url", "//url", "///url"):
        if len(args) != 1:
            raise ValueError(f"Unknown arguments: {args}")
        if not isinstance(args[0], (Xpath, ContextItem)):
            raise ValueError(f"Unknown argument type: {type(args[0])}")
        # Only the triple-slash form produces a crawl node.
        node_type = UrlCrawl if func_name == "///url" else UrlQuery
        return node_type(func_name, args)

    return Call(func_name, args)
534
+
535
+
536
def find_wxpath_boundary(tokens: list[Token]) -> tuple[int, int] | None:
    """Locate the operator that joins a pure-xpath prefix to the wxpath part.

    Starting at the first WXPATH token, the token list is scanned right to
    left; the first ``OP`` token met while the parenthesis depth is zero is the
    boundary. Only ``(`` / ``)`` tokens affect the depth.

    Args:
        tokens: List of Token objects from the tokenizer.

    Returns:
        ``(op_position, wxpath_position)`` as indexes into *tokens*, or None
        when there is no WXPATH token or no such operator before it.
    """
    wxpath_pos = next(
        (idx for idx, tok in enumerate(tokens) if tok.type == "WXPATH"),
        None,
    )
    if wxpath_pos is None:
        return None

    # Scanning backwards, ")" opens a nested group and "(" closes it.
    depth = 0
    for idx in reversed(range(wxpath_pos)):
        kind = tokens[idx].type
        if kind == "RPAREN":
            depth += 1
        elif kind == "LPAREN":
            depth -= 1
        elif kind == "OP" and depth == 0:
            return (idx, wxpath_pos)

    return None
570
+
571
+
572
def parse(src):
    """Parse a wxpath expression string into an AST.

    Three outcomes, depending on what ``find_wxpath_boundary`` reports:

    * A boundary operator exists: the source text before the operator becomes
      an ``Xpath`` node (sliced from *src* so its whitespace is preserved), the
      tokens after the operator are parsed by ``Parser``, and the two halves
      are joined in a ``Binary`` node.
    * No boundary but a WXPATH token is present: the whole token list is
      handed to ``Parser``.
    * No WXPATH token at all: the stripped source is returned as one ``Xpath``.

    Args:
        src: The expression text.

    Returns:
        The root AST node.
    """
    tokens = list(tokenize(src))
    boundary = find_wxpath_boundary(tokens)

    if boundary is not None:
        op_index = boundary[0]
        op_token = tokens[op_index]
        # Slice the source rather than re-joining tokens.
        prefix = Xpath(src[:op_token.start].strip())
        # Everything after the operator, including the trailing EOF token.
        remainder = Parser(iter(tokens[op_index + 1:])).parse()
        return Binary(prefix, op_token.value, remainder)

    # No connecting operator: either a pure wxpath expression or plain xpath.
    if any(tok.type == "WXPATH" for tok in tokens):
        return Parser(iter(tokens)).parse()

    return Xpath(src.strip())
@@ -0,0 +1,5 @@
1
+ from wxpath.core.runtime.engine import WXPathEngine
2
+
3
+ __all__ = [
4
+ 'WXPathEngine',
5
+ ]