wxpath 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
wxpath/core/ops.py CHANGED
@@ -2,25 +2,11 @@ from typing import Callable, Iterable
2
2
  from urllib.parse import urljoin
3
3
 
4
4
  import elementpath
5
- from elementpath import (
6
- ElementPathError,
7
- ElementPathSyntaxError as EPSyntaxError,
8
- ElementPathTypeError as EPTypeError,
9
- ElementPathZeroDivisionError,
10
- ElementPathRuntimeError as EPRuntimeError,
11
- MissingContextError,
12
- )
13
5
  from elementpath.datatypes import AnyAtomicType
14
6
  from elementpath.xpath3 import XPath3Parser
15
7
  from lxml import html
16
8
 
17
9
  from wxpath.core.dom import get_absolute_links_from_elem_and_xpath
18
- from wxpath.core.exceptions import (
19
- XPathEvaluationError,
20
- XPathSyntaxError,
21
- XPathTypeError,
22
- XPathRuntimeError,
23
- )
24
10
  from wxpath.core.models import (
25
11
  CrawlIntent,
26
12
  DataIntent,
@@ -133,52 +119,7 @@ def _handle_xpath(curr_elem: html.HtmlElement,
133
119
  raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
134
120
  base_url = getattr(curr_elem, 'base_url', None)
135
121
  log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
136
-
137
- try:
138
- elems = curr_elem.xpath3(expr)
139
- except EPSyntaxError as e:
140
- # Parse the error message to extract line/column if available
141
- # elementpath format: "... at line 1, column 7: [err:XPST0003] ..."
142
- raise XPathSyntaxError(
143
- f"Invalid XPath syntax: {str(e).split(': ', 1)[-1]}",
144
- xpath=expr,
145
- base_url=base_url,
146
- element_tag=curr_elem.tag,
147
- original_error=e
148
- ) from e
149
- except EPTypeError as e:
150
- raise XPathTypeError(
151
- f"XPath type error: {str(e).split(': ', 1)[-1]}",
152
- xpath=expr,
153
- base_url=base_url,
154
- element_tag=curr_elem.tag,
155
- original_error=e
156
- ) from e
157
- except ElementPathZeroDivisionError as e:
158
- raise XPathRuntimeError(
159
- f"Division by zero in XPath: {expr}",
160
- xpath=expr,
161
- base_url=base_url,
162
- element_tag=curr_elem.tag,
163
- original_error=e
164
- ) from e
165
- except MissingContextError as e:
166
- raise XPathRuntimeError(
167
- f"XPath requires context but none provided: {expr}",
168
- xpath=expr,
169
- base_url=base_url,
170
- element_tag=curr_elem.tag,
171
- original_error=e
172
- ) from e
173
- except ElementPathError as e:
174
- # Catch-all for other elementpath errors
175
- raise XPathEvaluationError(
176
- f"XPath evaluation failed: {e}",
177
- xpath=expr,
178
- base_url=base_url,
179
- element_tag=curr_elem.tag,
180
- original_error=e
181
- ) from e
122
+ elems = curr_elem.xpath3(expr)
182
123
 
183
124
  next_segments = curr_segments[1:]
184
125
  for elem in elems:
@@ -315,37 +256,12 @@ def _handle_binary(curr_elem: html.HtmlElement | str,
315
256
  base_url = getattr(curr_elem, 'base_url', None)
316
257
  next_segments = right
317
258
 
318
- try:
319
- results = elementpath.select(
320
- curr_elem,
321
- left.value,
322
- parser=XPath3Parser,
323
- item='' if curr_elem is None else None
324
- )
325
- except EPSyntaxError as e:
326
- raise XPathSyntaxError(
327
- f"Invalid XPath in binary operation: {str(e).split(': ', 1)[-1]}",
328
- xpath=left.value,
329
- base_url=base_url,
330
- element_tag=getattr(curr_elem, 'tag', None),
331
- original_error=e
332
- ) from e
333
- except EPTypeError as e:
334
- raise XPathTypeError(
335
- f"XPath type error in binary operation: {str(e).split(': ', 1)[-1]}",
336
- xpath=left.value,
337
- base_url=base_url,
338
- element_tag=getattr(curr_elem, 'tag', None),
339
- original_error=e
340
- ) from e
341
- except ElementPathError as e:
342
- raise XPathEvaluationError(
343
- f"XPath evaluation failed in binary operation: {e}",
344
- xpath=left.value,
345
- base_url=base_url,
346
- element_tag=getattr(curr_elem, 'tag', None),
347
- original_error=e
348
- ) from e
259
+ results = elementpath.select(
260
+ curr_elem,
261
+ left.value,
262
+ parser=XPath3Parser,
263
+ item='' if curr_elem is None else None
264
+ )
349
265
 
350
266
  if isinstance(results, AnyAtomicType):
351
267
  results = [results]
wxpath/core/parser.py CHANGED
@@ -13,7 +13,6 @@ except ImportError:
13
13
 
14
14
 
15
15
  TOKEN_SPEC = [
16
- ("WXLOOP", r"wx:loop"),
17
16
  ("NUMBER", r"\d+\.\d+"),
18
17
  ("INTEGER", r"\d+"),
19
18
  ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
@@ -181,7 +180,7 @@ class Parser:
181
180
 
182
181
  def parse_binary(self, min_prec: int) -> object:
183
182
  """Parse a binary expression chain honoring operator precedence."""
184
- if self.token.type == "WXPATH" or self.token.type == "WXLOOP":
183
+ if self.token.type == "WXPATH":
185
184
  left = self.parse_segments()
186
185
  else:
187
186
  left = self.nud()
@@ -2,14 +2,13 @@ import asyncio
2
2
  import contextlib
3
3
  import inspect
4
4
  from collections import deque
5
- from typing import Any, AsyncGenerator, Iterator, Iterable
5
+ from typing import Any, AsyncGenerator, Iterator
6
6
 
7
7
  from lxml.html import HtmlElement
8
8
  from tqdm import tqdm
9
9
 
10
10
  from wxpath import patches # noqa: F401
11
11
  from wxpath.core import parser
12
- from wxpath.core.exceptions import XPathEvaluationError
13
12
  from wxpath.core.models import (
14
13
  CrawlIntent,
15
14
  CrawlTask,
@@ -146,7 +145,6 @@ class WXPathEngine(HookedEngineBase):
146
145
  respect_robots: bool = True,
147
146
  allowed_response_codes: set[int] = None,
148
147
  allow_redirects: bool = True,
149
- yield_errors: bool = False,
150
148
  ):
151
149
  # NOTE: Will grow unbounded in large crawls. Consider a LRU cache, or bloom filter.
152
150
  self.seen_urls: set[str] = set()
@@ -159,7 +157,6 @@ class WXPathEngine(HookedEngineBase):
159
157
  self.allow_redirects = allow_redirects
160
158
  if allow_redirects:
161
159
  self.allowed_response_codes |= {301, 302, 303, 307, 308}
162
- self.yield_errors = yield_errors
163
160
 
164
161
  def _get_max_depth(self, bin_or_segs: Binary | Segments, max_depth: int) -> int:
165
162
  """Get the maximum crawl depth for a given expression. Will find a Depth
@@ -185,6 +182,7 @@ class WXPathEngine(HookedEngineBase):
185
182
  expression: str,
186
183
  max_depth: int,
187
184
  progress: bool = False,
185
+ yield_errors: bool = False,
188
186
  ) -> AsyncGenerator[Any, None]:
189
187
  """Execute a wxpath expression concurrently and yield results.
190
188
 
@@ -269,10 +267,7 @@ class WXPathEngine(HookedEngineBase):
269
267
  queue=queue,
270
268
  pbar=pbar,
271
269
  ):
272
- if isinstance(output, dict) and output.get("__type__") == "error":
273
- yield output
274
- else:
275
- yield await self.post_extract_hooks(output)
270
+ yield await self.post_extract_hooks(output)
276
271
 
277
272
  # While looping asynchronous generators, you MUST make sure
278
273
  # to check terminal conditions before re-iteration.
@@ -287,7 +282,7 @@ class WXPathEngine(HookedEngineBase):
287
282
  if task is None:
288
283
  log.warning(f"Got unexpected response from {resp.request.url}")
289
284
 
290
- if self.yield_errors:
285
+ if yield_errors:
291
286
  yield {
292
287
  "__type__": "error",
293
288
  "url": resp.request.url,
@@ -303,7 +298,7 @@ class WXPathEngine(HookedEngineBase):
303
298
  if resp.error:
304
299
  log.warning(f"Got error from {resp.request.url}: {resp.error}")
305
300
 
306
- if self.yield_errors:
301
+ if yield_errors:
307
302
  yield {
308
303
  "__type__": "error",
309
304
  "url": resp.request.url,
@@ -320,7 +315,7 @@ class WXPathEngine(HookedEngineBase):
320
315
  if resp.status not in self.allowed_response_codes or not resp.body:
321
316
  log.warning(f"Got non-200 response from {resp.request.url}")
322
317
 
323
- if self.yield_errors:
318
+ if yield_errors:
324
319
  yield {
325
320
  "__type__": "error",
326
321
  "url": resp.request.url,
@@ -418,11 +413,7 @@ class WXPathEngine(HookedEngineBase):
418
413
 
419
414
  binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
420
415
  operator = get_operator(binary_or_segment)
421
-
422
- if self.yield_errors:
423
- intents = _safe_iterator(operator(elem, bin_or_segs, depth))
424
- else:
425
- intents = operator(elem, bin_or_segs, depth)
416
+ intents = operator(elem, bin_or_segs, depth)
426
417
 
427
418
  if not intents:
428
419
  return
@@ -458,28 +449,6 @@ class WXPathEngine(HookedEngineBase):
458
449
  mini_queue.append((elem, next_segments))
459
450
 
460
451
 
461
- def _safe_iterator(iterable: Iterable[Any]) -> Iterator[Any]:
462
- """Wrap an iterable in a try/except block and return an iterator that yields the result or the error."""
463
- it = iter(iterable)
464
- while True:
465
- try:
466
- yield next(it)
467
- except StopIteration:
468
- break
469
- except XPathEvaluationError as e:
470
- yield {
471
- "__type__": "error",
472
- "reason": "xpath_evaluation_error",
473
- "exception": str(e),
474
- }
475
- except Exception as e:
476
- yield {
477
- "__type__": "error",
478
- "reason": "iterator_error",
479
- "exception": str(e),
480
- }
481
-
482
-
483
452
  def wxpath_async(path_expr: str,
484
453
  max_depth: int,
485
454
  progress: bool = False,
wxpath/tui.py CHANGED
@@ -20,7 +20,6 @@ Example:
20
20
  import asyncio
21
21
  import csv
22
22
  import json
23
- import traceback
24
23
  from datetime import datetime
25
24
  from pathlib import Path
26
25
  from typing import Any, Iterable
@@ -28,7 +27,7 @@ from typing import Any, Iterable
28
27
  from elementpath.xpath_tokens import XPathMap
29
28
  from lxml.html import HtmlElement, tostring
30
29
  from rich.console import RenderableType
31
- from textual import events, work
30
+ from textual import work
32
31
  from textual.app import App, ComposeResult
33
32
  from textual.containers import Container, Horizontal, Vertical, VerticalScroll
34
33
  from textual.reactive import reactive
@@ -559,7 +558,6 @@ class WXPathTUI(App):
559
558
  ("ctrl+r", "execute", "Execute"),
560
559
  ("escape", "cancel_crawl", "Cancel Crawl"),
561
560
  ("ctrl+c", "clear", "Clear"),
562
- ("ctrl+shift+backspace", "clear_editor", "Clear Editor"),
563
561
  ("ctrl+d", "clear_debug", "Clear Debug"),
564
562
  ("ctrl+shift+d", "toggle_debug", "Toggle Debug"),
565
563
  ("ctrl+e", "export", "Export"),
@@ -631,7 +629,6 @@ class WXPathTUI(App):
631
629
  " • Press [bold]Escape[/bold] to cancel a running crawl\n"
632
630
  " • Press [bold]Ctrl+E[/bold] to export table (CSV/JSON)\n"
633
631
  " • Press [bold]Ctrl+C[/bold] to clear output\n"
634
- " • Press [bold]Ctrl+Shift+Backspace[/bold] to clear expression editor\n"
635
632
  " • Press [bold]Ctrl+Shift+D[/bold] to toggle debug panel\n"
636
633
  " • Press [bold]Ctrl+H[/bold] to configure HTTP headers\n"
637
634
  " • Press [bold]Ctrl+Shift+S[/bold] to edit persistent settings (concurrency, robots)\n" # noqa: E501
@@ -648,7 +645,7 @@ class WXPathTUI(App):
648
645
  """Update global settings and subtitle when cache setting changes."""
649
646
  # Update the global settings - this is what the HTTP crawler will read
650
647
  SETTINGS.http.client.cache.enabled = bool(new_value)
651
- self._debug(f"Cache enabled: {SETTINGS.http.client.cache.enabled}")
648
+ print(f"Cache enabled: {SETTINGS.http.client.cache.enabled}")
652
649
  self._update_subtitle()
653
650
 
654
651
  def watch_custom_headers(self, new_value: dict) -> None:
@@ -661,8 +658,7 @@ class WXPathTUI(App):
661
658
 
662
659
  def _update_subtitle(self) -> None:
663
660
  """Update subtitle with current cache, headers, and persistent settings."""
664
- # cache_state = "ON" if self.cache_enabled else "OFF"
665
- cache_state = SETTINGS.http.client.cache.enabled
661
+ cache_state = "ON" if self.cache_enabled else "OFF"
666
662
  headers_count = len(self.custom_headers)
667
663
  headers_info = f"{headers_count} custom" if headers_count > 0 else "default"
668
664
  conc = self.tui_settings.get("concurrency", 16)
@@ -944,10 +940,7 @@ class WXPathTUI(App):
944
940
  columns_initialized = False
945
941
  column_keys: list[str] = []
946
942
 
947
- async for result in engine.run(expression, max_depth=1, progress=False, yield_errors=True):
948
- if isinstance(result, dict) and result.get("__type__") == "error":
949
- self._debug(f"Error: {result.get('reason')}: {result}")
950
- continue
943
+ async for result in engine.run(expression, max_depth=1, progress=False):
951
944
  count += 1
952
945
  if count % 100 == 0:
953
946
  self._debug(f"Received result {count} of type {type(result).__name__}")
@@ -997,16 +990,8 @@ class WXPathTUI(App):
997
990
  self._executing = False
998
991
  return
999
992
  except Exception as e:
1000
- # Log full stack trace to debug panel
1001
- self._debug(traceback.format_exc())
1002
- # Append error as next row of table (do not clear output panel)
1003
- err_msg = f"Execution Error: {type(e).__name__}: {e}"
1004
- if columns_initialized and column_keys:
1005
- row = [err_msg] + [""] * (len(column_keys) - 1)
1006
- data_table.add_row(*row, key=f"error-{count}")
1007
- else:
1008
- data_table.add_column("error", key="error")
1009
- data_table.add_row(err_msg, key="error-0")
993
+ # Handle execution errors separately
994
+ self._update_output(f"[red]Execution Error:[/red] {type(e).__name__}: {e}")
1010
995
  self._executing = False
1011
996
  return
1012
997
  finally:
@@ -1101,13 +1086,7 @@ class WXPathTUI(App):
1101
1086
  """Clear the output panel."""
1102
1087
  self._update_output("Waiting for expression...")
1103
1088
  self._debug("Cleared output panel.")
1104
-
1105
- def action_clear_editor(self) -> None:
1106
- """Clear the expression editor (all text)."""
1107
- editor = self.query_one("#expression-editor", )
1108
- editor.clear()
1109
- self._debug("Expression editor cleared.")
1110
-
1089
+
1111
1090
  def _update_output(self, content: str | RenderableType) -> None:
1112
1091
  """Update the output panel with new content."""
1113
1092
  # output_panel = self.query_one("#output-panel", OutputPanel)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wxpath
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: wxpath - a declarative web crawler and data extractor
5
5
  Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
6
6
  License-Expression: MIT
@@ -65,8 +65,10 @@ Requires Python 3.10+.
65
65
 
66
66
  ```
67
67
  pip install wxpath
68
- # For TUI support
68
+ # For TUI support:
69
69
  pip install "wxpath[tui]"
70
+ # Immediately launch the TUI via uv:
71
+ uvx --from "wxpath[tui]" wxpath-tui
70
72
  ```
71
73
  ---
72
74
 
@@ -2,16 +2,16 @@ wxpath/__init__.py,sha256=nKo2ggHdlNUhTPzZlPOW-XssyYjVar6XcqaFsZLxsKU,304
2
2
  wxpath/cli.py,sha256=P2MU6UzWHiN-5roo-GdEb4OpTlCp0XA3AlFmjL7fI1o,4544
3
3
  wxpath/patches.py,sha256=nNFpNuapF30aVMVz6K1iJDX2RWmiyiECx2_UprcwiT4,8417
4
4
  wxpath/settings.py,sha256=dBSIxVPlcsP3IYuhuHG0uxZaNXLzcNUdBxEaZsZHcAc,3862
5
- wxpath/tui.py,sha256=0fq8fZhqHMf7hX8tM75HmAjLIfD5VLVBN5mo_E3Q1GQ,43880
5
+ wxpath/tui.py,sha256=CG8xvGnYNbruD4lw50Agu8RKKUJEpEl0WG0SyLBW4c8,42786
6
6
  wxpath/tui_settings.py,sha256=rM2IBeOzQUIzjk2Ds1Jlnvb7IUtdJdKMN2j3GHk7Z9M,5051
7
7
  wxpath/core/__init__.py,sha256=U9_In2iRaZrpiIVavIli1M59gCB6Kn1en-1Fza-qIiI,257
8
8
  wxpath/core/dom.py,sha256=X0L3n8jRfO5evEypDaJTD-NQ3cLXWvnEUVERAHo3vV0,701
9
9
  wxpath/core/exceptions.py,sha256=BwVoBWhulv24m13hZaQ3hlVYF8foiufiXZMAh7gESE0,1615
10
10
  wxpath/core/models.py,sha256=xsNY9ZmUILB5_O1GHRkn3cLBtPs3-krguU5NlqFe0bM,1664
11
- wxpath/core/ops.py,sha256=0hyfAyBWTy4uNIW4oec6j1y3H-HC6msm5MreL2cIAvY,12428
12
- wxpath/core/parser.py,sha256=S6Wg1lxgqY2btEJxbmxbBrOVVL0GOqvS3qivdPY2eOQ,21452
11
+ wxpath/core/ops.py,sha256=4vzLOqRM_LbXc1cAnWCuKGt2m_pbvyHO0p5ee2Upjog,9569
12
+ wxpath/core/parser.py,sha256=ufUSEfyR6aO10pV_E39-uSiLQfYvngNQnHcs1GJlpbA,21392
13
13
  wxpath/core/runtime/__init__.py,sha256=_iCgkIWxXvxzQcenHOsjYGsk74HboTIYWOtgM8GtCyc,86
14
- wxpath/core/runtime/engine.py,sha256=vXRahRyz8iII0Y7dESk5zTU-NJNalMHJfN77nw2wi_o,20313
14
+ wxpath/core/runtime/engine.py,sha256=ocGBTIHdFgOh3LzkgEUKZ59Ozn3nKqvBBAloj4Ln5D4,19229
15
15
  wxpath/core/runtime/helpers.py,sha256=RFLonAjRsL_CHFV0biUsgk0lOL8MKvHXdFg7p65xEP8,1554
16
16
  wxpath/hooks/__init__.py,sha256=9JG63e4z_8CZLWugFcY786hebaEEPZ5FmZhyDHat-98,294
17
17
  wxpath/hooks/builtin.py,sha256=GJ4w1C9djWNzAmAA3U0qI9OoCOeC5R8tEGtWXJVHSYs,4125
@@ -37,9 +37,9 @@ wxpath/util/cleaners.py,sha256=JtUwCKjSJV-qw2CBrcB1oYswBDeXiqndGiz3-MlxeG0,946
37
37
  wxpath/util/common_paths.py,sha256=Y-0yq6IMjlSl1t4GbmK9TeJFTQ-MVvJOINhglvD4djA,980
38
38
  wxpath/util/logging.py,sha256=hgN4OC1y2oZWewtL-O-Ei_1lOaadH9eSyo0Iz2t_s1c,2858
39
39
  wxpath/util/serialize.py,sha256=uUs4C9VErpFd97smBM2bRWo2nW25kCgKdsMrVtVxhg8,575
40
- wxpath-0.5.1.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
41
- wxpath-0.5.1.dist-info/METADATA,sha256=ao3G5gyrVZ17uJCnm0ld5taVl4dHDCPsFStEbw3gn4U,22031
42
- wxpath-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
43
- wxpath-0.5.1.dist-info/entry_points.txt,sha256=CSr67nPxU_tZ_XdAdDmvW9b9VRUhFAhGhEC41YNJEfE,72
44
- wxpath-0.5.1.dist-info/top_level.txt,sha256=uFCcveG78mnefxRGvYsR2OexDlKR_Z1UD4vZijUcex8,7
45
- wxpath-0.5.1.dist-info/RECORD,,
40
+ wxpath-0.5.2.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
41
+ wxpath-0.5.2.dist-info/METADATA,sha256=-zpBLuhWEJbrZhCp3CGbuRpS7-J1_bQIpiW-c78lz1E,22105
42
+ wxpath-0.5.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
43
+ wxpath-0.5.2.dist-info/entry_points.txt,sha256=CSr67nPxU_tZ_XdAdDmvW9b9VRUhFAhGhEC41YNJEfE,72
44
+ wxpath-0.5.2.dist-info/top_level.txt,sha256=uFCcveG78mnefxRGvYsR2OexDlKR_Z1UD4vZijUcex8,7
45
+ wxpath-0.5.2.dist-info/RECORD,,
File without changes