wxpath 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
+ from . import settings
1
2
  from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
2
3
  from .util.logging import configure_logging
3
4
 
@@ -6,4 +7,5 @@ __all__ = [
6
7
  'wxpath_async_blocking',
7
8
  'wxpath_async_blocking_iter',
8
9
  'configure_logging',
10
+ 'settings',
9
11
  ]
wxpath/cli.py CHANGED
@@ -47,6 +47,11 @@ def main():
47
47
  help="Respect robots.txt",
48
48
  default=True
49
49
  )
50
+ arg_parser.add_argument(
51
+ "--insecure",
52
+ action="store_true",
53
+ help="Disable SSL certificate verification (use for sites with broken chains)",
54
+ )
50
55
  arg_parser.add_argument(
51
56
  "--cache",
52
57
  action="store_true",
@@ -112,6 +117,7 @@ def main():
112
117
  concurrency=args.concurrency,
113
118
  per_host=args.concurrency_per_host,
114
119
  respect_robots=args.respect_robots,
120
+ verify_ssl=not args.insecure,
115
121
  headers=custom_headers
116
122
  )
117
123
  engine = WXPathEngine(crawler=crawler)
@@ -0,0 +1,53 @@
1
+ class XPathEvaluationError(Exception):
2
+ """Errors during XPath evaluation with elementpath."""
3
+
4
+ def __init__(
5
+ self,
6
+ message: str,
7
+ xpath: str,
8
+ base_url: str | None = None,
9
+ element_tag: str | None = None,
10
+ error_code: str | None = None, # XPath error codes like XPST0003
11
+ position: tuple[int, int] | None = None, # (line, column)
12
+ original_error: Exception | None = None
13
+ ):
14
+ context = {
15
+ "xpath": xpath,
16
+ "base_url": base_url,
17
+ "element_tag": element_tag,
18
+ "error_code": error_code,
19
+ "position": position,
20
+ }
21
+ if original_error:
22
+ context["original_error"] = str(original_error)
23
+ # Extract XPath error code if present (e.g., [err:XPST0003])
24
+ if hasattr(original_error, 'code'):
25
+ context["error_code"] = original_error.code
26
+
27
+ super().__init__(message, context)
28
+
29
+ def to_dict(self) -> dict:
30
+ return {
31
+ "message": self.message,
32
+ "xpath": self.xpath,
33
+ "base_url": self.base_url,
34
+ "element_tag": self.element_tag,
35
+ "error_code": self.error_code,
36
+ "position": self.position,
37
+ "original_error": self.original_error,
38
+ }
39
+
40
+
class XPathSyntaxError(XPathEvaluationError):
    """Invalid XPath syntax.

    Adds no behaviour of its own; it only gives syntax failures a distinct type.
    """
44
+
45
+
class XPathTypeError(XPathEvaluationError):
    """Type error in XPath expression.

    Adds no behaviour of its own; it only gives type failures a distinct type.
    """
49
+
50
+
class XPathRuntimeError(XPathEvaluationError):
    """Runtime error during XPath evaluation.

    Adds no behaviour of its own; it only gives runtime failures a distinct type.
    """
wxpath/core/models.py CHANGED
@@ -61,6 +61,7 @@ class InfiniteCrawlIntent(ProcessIntent):
61
61
 
62
62
  @dataclass(slots=True)
63
63
  class ExtractIntent(ProcessIntent):
64
+ """TODO: May be redundant with ProcessIntent?"""
64
65
  pass
65
66
 
66
67
 
wxpath/core/ops.py CHANGED
@@ -2,11 +2,25 @@ from typing import Callable, Iterable
2
2
  from urllib.parse import urljoin
3
3
 
4
4
  import elementpath
5
+ from elementpath import (
6
+ ElementPathError,
7
+ ElementPathSyntaxError as EPSyntaxError,
8
+ ElementPathTypeError as EPTypeError,
9
+ ElementPathZeroDivisionError,
10
+ ElementPathRuntimeError as EPRuntimeError,
11
+ MissingContextError,
12
+ )
5
13
  from elementpath.datatypes import AnyAtomicType
6
14
  from elementpath.xpath3 import XPath3Parser
7
15
  from lxml import html
8
16
 
9
17
  from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
18
+ from wxpath.core.exceptions import (
19
+ XPathEvaluationError,
20
+ XPathSyntaxError,
21
+ XPathTypeError,
22
+ XPathRuntimeError,
23
+ )
10
24
  from wxpath.core.models import (
11
25
  CrawlIntent,
12
26
  DataIntent,
@@ -19,6 +33,7 @@ from wxpath.core.parser import (
19
33
  Binary,
20
34
  Call,
21
35
  ContextItem,
36
+ Depth,
22
37
  Segment,
23
38
  Segments,
24
39
  String,
@@ -78,7 +93,10 @@ def get_operator(
78
93
 
79
94
 
80
95
  @register('url', (String,))
96
+ @register('url', (String, Depth))
81
97
  @register('url', (String, Xpath))
98
+ @register('url', (String, Depth, Xpath))
99
+ @register('url', (String, Xpath, Depth))
82
100
  def _handle_url_str_lit(curr_elem: html.HtmlElement,
83
101
  curr_segments: list[Url | Xpath],
84
102
  curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +105,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,
87
105
 
88
106
  next_segments = curr_segments[1:]
89
107
 
90
- if len(url_call.args) == 2:
108
+ # NOTE: Expects parser to produce UrlCrawl node in expressions
109
+ # that look like `url('...', follow=//a/@href)`
110
+ if isinstance(url_call, UrlCrawl):
111
+ xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
91
112
  _segments = [
92
- UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
113
+ UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
93
114
  ] + next_segments
94
115
 
95
116
  yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,17 +133,52 @@ def _handle_xpath(curr_elem: html.HtmlElement,
112
133
  raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
113
134
  base_url = getattr(curr_elem, 'base_url', None)
114
135
  log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
115
-
116
- _backlink_str = f"string('{curr_elem.get('backlink')}')"
117
- # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
118
- # increment after each url*() hop
119
- _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
120
- expr = expr.replace('wx:backlink()', _backlink_str)
121
- expr = expr.replace('wx:backlink(.)', _backlink_str)
122
- expr = expr.replace('wx:depth()', _depth_str)
123
- expr = expr.replace('wx:depth(.)', _depth_str)
124
-
125
- elems = curr_elem.xpath3(expr)
136
+
137
+ try:
138
+ elems = curr_elem.xpath3(expr)
139
+ except EPSyntaxError as e:
140
+ # Parse the error message to extract line/column if available
141
+ # elementpath format: "... at line 1, column 7: [err:XPST0003] ..."
142
+ raise XPathSyntaxError(
143
+ f"Invalid XPath syntax: {str(e).split(': ', 1)[-1]}",
144
+ xpath=expr,
145
+ base_url=base_url,
146
+ element_tag=curr_elem.tag,
147
+ original_error=e
148
+ ) from e
149
+ except EPTypeError as e:
150
+ raise XPathTypeError(
151
+ f"XPath type error: {str(e).split(': ', 1)[-1]}",
152
+ xpath=expr,
153
+ base_url=base_url,
154
+ element_tag=curr_elem.tag,
155
+ original_error=e
156
+ ) from e
157
+ except ElementPathZeroDivisionError as e:
158
+ raise XPathRuntimeError(
159
+ f"Division by zero in XPath: {expr}",
160
+ xpath=expr,
161
+ base_url=base_url,
162
+ element_tag=curr_elem.tag,
163
+ original_error=e
164
+ ) from e
165
+ except MissingContextError as e:
166
+ raise XPathRuntimeError(
167
+ f"XPath requires context but none provided: {expr}",
168
+ xpath=expr,
169
+ base_url=base_url,
170
+ element_tag=curr_elem.tag,
171
+ original_error=e
172
+ ) from e
173
+ except ElementPathError as e:
174
+ # Catch-all for other elementpath errors
175
+ raise XPathEvaluationError(
176
+ f"XPath evaluation failed: {e}",
177
+ xpath=expr,
178
+ base_url=base_url,
179
+ element_tag=curr_elem.tag,
180
+ original_error=e
181
+ ) from e
126
182
 
127
183
  next_segments = curr_segments[1:]
128
184
  for elem in elems:
@@ -259,12 +315,37 @@ def _handle_binary(curr_elem: html.HtmlElement | str,
259
315
  base_url = getattr(curr_elem, 'base_url', None)
260
316
  next_segments = right
261
317
 
262
- results = elementpath.select(
263
- curr_elem,
264
- left.value,
265
- parser=XPath3Parser,
266
- item='' if curr_elem is None else None
267
- )
318
+ try:
319
+ results = elementpath.select(
320
+ curr_elem,
321
+ left.value,
322
+ parser=XPath3Parser,
323
+ item='' if curr_elem is None else None
324
+ )
325
+ except EPSyntaxError as e:
326
+ raise XPathSyntaxError(
327
+ f"Invalid XPath in binary operation: {str(e).split(': ', 1)[-1]}",
328
+ xpath=left.value,
329
+ base_url=base_url,
330
+ element_tag=getattr(curr_elem, 'tag', None),
331
+ original_error=e
332
+ ) from e
333
+ except EPTypeError as e:
334
+ raise XPathTypeError(
335
+ f"XPath type error in binary operation: {str(e).split(': ', 1)[-1]}",
336
+ xpath=left.value,
337
+ base_url=base_url,
338
+ element_tag=getattr(curr_elem, 'tag', None),
339
+ original_error=e
340
+ ) from e
341
+ except ElementPathError as e:
342
+ raise XPathEvaluationError(
343
+ f"XPath evaluation failed in binary operation: {e}",
344
+ xpath=left.value,
345
+ base_url=base_url,
346
+ element_tag=getattr(curr_elem, 'tag', None),
347
+ original_error=e
348
+ ) from e
268
349
 
269
350
  if isinstance(results, AnyAtomicType):
270
351
  results = [results]
wxpath/core/parser.py CHANGED
@@ -13,7 +13,9 @@ except ImportError:
13
13
 
14
14
 
15
15
  TOKEN_SPEC = [
16
- ("NUMBER", r"\d+(\.\d+)?"),
16
+ ("WXLOOP", r"wx:loop"),
17
+ ("NUMBER", r"\d+\.\d+"),
18
+ ("INTEGER", r"\d+"),
17
19
  ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
18
20
  ("WXPATH", r"/{0,3}\s*url"), # Must come before NAME to match 'url' as WXPATH
19
21
  # ("///URL", r"/{3}\s*url"),
@@ -22,6 +24,7 @@ TOKEN_SPEC = [
22
24
  ("URL", r"\s*url"), # Must come before NAME to match 'url' as WXPATH
23
25
  # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
24
26
  ("FOLLOW", r",?\s{,}follow="),
27
+ ("DEPTH", r",?\s{,}depth="),
25
28
  ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"), # Added || for string concat
26
29
  ("LPAREN", r"\("),
27
30
  ("RPAREN", r"\)"),
@@ -63,6 +66,14 @@ def tokenize(src: str):
63
66
  class Number:
64
67
  value: float
65
68
 
@dataclass
class Integer:
    """Parsed node wrapping a whole-number literal."""

    # The literal's value as a Python int.
    value: int
72
+
@dataclass
class Depth(Integer):
    """Integer node subtype used for depth values; adds no fields of its own."""
76
+
66
77
  @dataclass
67
78
  class String:
68
79
  value: str
@@ -170,7 +181,7 @@ class Parser:
170
181
 
171
182
  def parse_binary(self, min_prec: int) -> object:
172
183
  """Parse a binary expression chain honoring operator precedence."""
173
- if self.token.type == "WXPATH":
184
+ if self.token.type == "WXPATH" or self.token.type == "WXLOOP":
174
185
  left = self.parse_segments()
175
186
  else:
176
187
  left = self.nud()
@@ -273,6 +284,10 @@ class Parser:
273
284
  if tok.type == "NUMBER":
274
285
  self.advance()
275
286
  return Number(float(tok.value))
287
+
288
+ if tok.type == "INTEGER":
289
+ self.advance()
290
+ return Integer(int(tok.value))
276
291
 
277
292
  if tok.type == "STRING":
278
293
  self.advance()
@@ -358,18 +373,18 @@ class Parser:
358
373
  self.advance()
359
374
 
360
375
  return result
361
-
362
376
 
363
377
  def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
364
378
  """Capture content inside a url() call, handling nested wxpath expressions.
365
379
 
366
380
  Supports patterns like::
367
381
 
368
- url('...') -> [String]
369
- url('...' follow=//a/@href) -> [String, Xpath]
370
- url(//a/@href) -> [Xpath]
371
- url( url('..')//a/@href ) -> [Call, Xpath]
372
- url( url( url('..')//a )//b ) -> [Call, Xpath]
382
+ url('...') -> [String]
383
+ url('...' follow=//a/@href) -> [String, Xpath]
384
+ url('...' follow=//a/@href depth=2) -> [String, Xpath, Integer]
385
+ url(//a/@href depth=2) -> [Xpath, Integer]
386
+ url( url('..')//a/@href ) -> [Call, Xpath]
387
+ url( url( url('..')//a )//b ) -> [Call, Xpath]
373
388
 
374
389
  Returns:
375
390
  A list of parsed elements: Xpath nodes for xpath content and Call
@@ -380,7 +395,10 @@ class Parser:
380
395
  paren_balance = 1 # We're already inside the opening paren of url()
381
396
  brace_balance = 0 # Track braces for map constructors
382
397
  reached_follow_token = False
398
+ reached_depth_token = False
383
399
  follow_xpath = ""
400
+ depth_number = ""
401
+
384
402
  while paren_balance > 0 and self.token.type != "EOF":
385
403
  if self.token.type == "WXPATH":
386
404
  # Found nested wxpath: save any accumulated xpath content first
@@ -396,13 +414,22 @@ class Parser:
396
414
 
397
415
  elif self.token.type == "FOLLOW":
398
416
  reached_follow_token = True
417
+ reached_depth_token = False
418
+ self.advance()
419
+
420
+ elif self.token.type == "DEPTH":
421
+ reached_depth_token = True
422
+ reached_follow_token = False
399
423
  self.advance()
400
424
 
401
425
  elif self.token.type == "LPAREN":
402
426
  # Opening paren that's NOT part of a url() call
403
427
  # (it's part of an xpath function like contains(), starts-with(), etc.)
404
428
  paren_balance += 1
405
- current_xpath += self.token.value
429
+ if not reached_follow_token:
430
+ current_xpath += self.token.value
431
+ else:
432
+ follow_xpath += self.token.value
406
433
  self.advance()
407
434
 
408
435
  elif self.token.type == "RPAREN":
@@ -410,26 +437,37 @@ class Parser:
410
437
  if paren_balance == 0:
411
438
  # This is the closing paren of the outer url()
412
439
  break
413
- current_xpath += self.token.value
440
+ if not reached_follow_token:
441
+ current_xpath += self.token.value
442
+ else:
443
+ follow_xpath += self.token.value
414
444
  self.advance()
415
445
 
416
446
  elif self.token.type == "LBRACE":
417
447
  # Opening brace for map constructors
418
448
  brace_balance += 1
419
- current_xpath += self.token.value
449
+ if not reached_follow_token:
450
+ current_xpath += self.token.value
451
+ else:
452
+ follow_xpath += self.token.value
420
453
  self.advance()
421
454
 
422
455
  elif self.token.type == "RBRACE":
423
456
  brace_balance -= 1
424
- current_xpath += self.token.value
457
+ if not reached_follow_token:
458
+ current_xpath += self.token.value
459
+ else:
460
+ follow_xpath += self.token.value
425
461
  self.advance()
426
462
 
427
463
  else:
428
464
  # Accumulate all other tokens as xpath content
429
- if not reached_follow_token:
430
- current_xpath += self.token.value
431
- else:
465
+ if reached_follow_token:
432
466
  follow_xpath += self.token.value
467
+ elif reached_depth_token:
468
+ depth_number += self.token.value
469
+ else:
470
+ current_xpath += self.token.value
433
471
 
434
472
  self.advance()
435
473
 
@@ -447,6 +485,9 @@ class Parser:
447
485
  if follow_xpath.strip():
448
486
  elements.append(Xpath(follow_xpath.strip()))
449
487
 
488
+ if depth_number.strip():
489
+ elements.append(Depth(int(depth_number.strip())))
490
+
450
491
  return elements
451
492
 
452
493
  def parse_call(self, func_name: str) -> Call | Segments:
@@ -462,13 +503,16 @@ class Parser:
462
503
  self.advance()
463
504
  # Handle follow=...
464
505
  if self.token.type == "FOLLOW":
465
- self.advance()
466
506
  follow_arg = self.capture_url_arg_content()
467
507
  args.extend(follow_arg)
508
+ if self.token.type == "DEPTH":
509
+ depth_arg = self.capture_url_arg_content()
510
+ args.extend(depth_arg)
468
511
  elif self.token.type == "WXPATH":
469
512
  # Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
470
- # Use capture_url_arg_content to handle nested wxpath and xpath
471
- args = self.capture_url_arg_content()
513
+ # NOTE: We used to use capture_url_arg_content to handle nested wxpath and xpath
514
+ # args = self.capture_url_arg_content()
515
+ args = self.nud()
472
516
  else:
473
517
  # Simple xpath argument: url(//a/@href)
474
518
  # Could still contain nested wxpath, so use capture_url_arg_content
@@ -489,8 +533,18 @@ class Parser:
489
533
 
490
534
  return _specify_call_types(func_name, args)
491
535
 
492
-
493
536
  def _specify_call_types(func_name: str, args: list) -> Call | Segments:
537
+ """
538
+ Specify the type of a call based on the function name and arguments.
539
+ TODO: Provide example wxpath expressions for each call type.
540
+
541
+ Args:
542
+ func_name: The name of the function.
543
+ args: The arguments of the function.
544
+
545
+ Returns:
546
+ Call | Segments: The type of the call.
547
+ """
494
548
  if func_name == "url":
495
549
  if len(args) == 1:
496
550
  if isinstance(args[0], String):
@@ -500,17 +554,33 @@ def _specify_call_types(func_name: str, args: list) -> Call | Segments:
500
554
  else:
501
555
  raise ValueError(f"Unknown argument type: {type(args[0])}")
502
556
  elif len(args) == 2:
503
- if isinstance(args[0], String) and isinstance(args[1], Xpath):
557
+ arg0, arg1 = args
558
+ if isinstance(arg0, String) and isinstance(arg1, Xpath):
559
+ # Example: url('...', follow=//a/@href)
504
560
  return UrlCrawl(func_name, args)
505
- elif isinstance(args[0], UrlLiteral) and isinstance(args[1], Xpath):
561
+ elif isinstance(arg0, String) and isinstance(arg1, Integer):
562
+ # Example: url('...', depth=2)
563
+ return UrlLiteral(func_name, args)
564
+ elif isinstance(arg0, UrlLiteral) and isinstance(arg1, Xpath):
506
565
  args.append(UrlQuery('url', [ContextItem()]))
507
566
  return Segments(args)
508
- elif isinstance(args[0], (Segments, list)) and isinstance(args[1], Xpath):
509
- segs = args[0]
510
- segs.append(args[1])
567
+ elif isinstance(arg0, (Segments, list)) and isinstance(arg1, Xpath):
568
+ segs = arg0
569
+ segs.append(arg1)
511
570
  return Segments(segs)
512
571
  else:
513
572
  raise ValueError(f"Unknown arguments: {args}")
573
+ elif len(args) == 3:
574
+ arg0, arg1, arg2 = args
575
+ if (isinstance(arg0, String) and (
576
+ (isinstance(arg1, Xpath) and isinstance(arg2, Integer)) or
577
+ (isinstance(arg1, Integer) and isinstance(arg2, Xpath))
578
+ )):
579
+ # Example: url('...', follow=//a/@href, depth=2)
580
+ # Example: url('...', depth=2, follow=//a/@href)
581
+ return UrlCrawl(func_name, args)
582
+ else:
583
+ raise ValueError(f"Unknown arguments: {args}")
514
584
  else:
515
585
  raise ValueError(f"Unknown arguments: {args}")
516
586
  elif func_name == "/url" or func_name == "//url":