wxpath 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,444 @@
+ import asyncio
+ import contextlib
+ import inspect
+ from collections import deque
+ from typing import Any, AsyncGenerator, Iterator
+
+ from lxml.html import HtmlElement
+ from tqdm import tqdm
+
+ from wxpath import patches  # noqa: F401
+ from wxpath.core import parser
+ from wxpath.core.models import (
+     CrawlIntent,
+     CrawlTask,
+     DataIntent,
+     ExtractIntent,
+     InfiniteCrawlIntent,
+     ProcessIntent,
+ )
+ from wxpath.core.ops import get_operator
+ from wxpath.core.parser import Binary, Segment, Segments
+ from wxpath.core.runtime.helpers import parse_html
+ from wxpath.hooks.registry import FetchContext, get_hooks
+ from wxpath.http.client.crawler import Crawler
+ from wxpath.http.client.request import Request
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class HookedEngineBase:
+     """Common hook invocation helpers shared by engine variants."""
+
+     async def post_fetch_hooks(self, body: bytes | str, task: CrawlTask) -> bytes | str | None:
+         """Run registered `post_fetch` hooks over a fetched response body.
+
+         Hooks may be synchronous or asynchronous and can transform or drop the
+         response payload entirely.
+
+         Args:
+             body: Raw response body bytes from the crawler.
+             task: The `CrawlTask` that produced the response.
+
+         Returns:
+             The transformed body, or `None` if any hook chooses to drop it.
+         """
+         for hook in get_hooks():
+             hook_method = getattr(hook, "post_fetch", lambda _, b: b)
+             if inspect.iscoroutinefunction(hook_method):
+                 body = await hook_method(
+                     FetchContext(task.url, task.backlink, task.depth, task.segments),
+                     body
+                 )
+             else:
+                 body = hook_method(
+                     FetchContext(task.url, task.backlink, task.depth, task.segments),
+                     body
+                 )
+             if not body:
+                 log.debug(f"hook {type(hook).__name__} dropped {task.url}")
+                 break
+         return body
+
+     async def post_parse_hooks(
+         self, elem: HtmlElement | None, task: CrawlTask
+     ) -> HtmlElement | None:
+         """Run registered `post_parse` hooks on a parsed DOM element.
+
+         Args:
+             elem: Parsed `lxml` element to process.
+             task: The originating `CrawlTask`.
+
+         Returns:
+             The transformed element, or `None` if a hook drops the branch.
+         """
+         for hook in get_hooks():
+             hook_method = getattr(hook, "post_parse", lambda _, e: e)
+             if inspect.iscoroutinefunction(hook_method):
+                 elem = await hook_method(
+                     FetchContext(
+                         url=task.url,
+                         backlink=task.backlink,
+                         depth=task.depth,
+                         segments=task.segments
+                     ),
+                     elem,
+                 )
+             else:
+                 elem = hook_method(
+                     FetchContext(
+                         url=task.url,
+                         backlink=task.backlink,
+                         depth=task.depth,
+                         segments=task.segments
+                     ),
+                     elem,
+                 )
+             if elem is None:
+                 log.debug(f"hook {type(hook).__name__} dropped {task.url}")
+                 break
+         return elem
+
+     async def post_extract_hooks(self, value: Any) -> Any | None:
+         """Run registered `post_extract` hooks on extracted values.
+
+         Args:
+             value: The extracted datum to post-process.
+
+         Returns:
+             The transformed value, or `None` if a hook drops it.
+         """
+         for hook in get_hooks():
+             hook_method = getattr(hook, "post_extract", lambda v: v)
+             if inspect.iscoroutinefunction(hook_method):
+                 value = await hook_method(value)
+             else:
+                 value = hook_method(value)
+             if value is None:
+                 log.debug(f"hook {type(hook).__name__} dropped value")
+                 break
+         return value
+
+
+ class WXPathEngine(HookedEngineBase):
+     """Main class for executing wxpath expressions.
+
+     The core pattern is to build a queue of CrawlTasks that are crawled and
+     processed FIFO. Traversal of the queue (and therefore the web graph) is
+     done concurrently in BFS-ish order.
+
+     Args:
+         crawler: Crawler instance to use for HTTP requests.
+         concurrency: Number of concurrent fetches at the Crawler level.
+         per_host: Number of concurrent fetches per host.
+         respect_robots: Whether to respect robots.txt directives.
+         allowed_response_codes: Set of allowed HTTP response codes. Defaults
+             to ``{200}``. Responses may still be filtered and dropped.
+         allow_redirects: Whether to follow HTTP redirects. Defaults to ``True``.
+     """
+     def __init__(
+         self,
+         crawler: Crawler | None = None,
+         concurrency: int = 16,
+         per_host: int = 8,
+         respect_robots: bool = True,
+         allowed_response_codes: set[int] | None = None,
+         allow_redirects: bool = True,
+     ):
+         # NOTE: Will grow unbounded in large crawls. Consider an LRU cache or a bloom filter.
+         self.seen_urls: set[str] = set()
+         self.crawler = crawler or Crawler(
+             concurrency=concurrency,
+             per_host=per_host,
+             respect_robots=respect_robots
+         )
+         self.allowed_response_codes = allowed_response_codes or {200}
+         self.allow_redirects = allow_redirects
+         if allow_redirects:
+             self.allowed_response_codes |= {301, 302, 303, 307, 308}
+
+     async def run(
+         self,
+         expression: str,
+         max_depth: int,
+         progress: bool = False
+     ) -> AsyncGenerator[Any, None]:
+         """Execute a wxpath expression concurrently and yield results.
+
+         Builds and drives a BFS-like crawl pipeline that honors robots rules,
+         throttling, and hook callbacks while walking the web graph.
+
+         Args:
+             expression: WXPath expression string to evaluate.
+             max_depth: Maximum crawl depth to follow for URL hops.
+             progress: Whether to display a progress bar.
+
+         Yields:
+             Extracted values produced by the expression (HTML elements or
+             wxpath-specific value types).
+         """
+         segments = parser.parse(expression)
+
+         queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
+         inflight: dict[str, CrawlTask] = {}
+         pending_tasks = 0
+
+         def is_terminal():
+             # NOTE: consider adopting a state machine pattern for determining
+             # the current state of the engine.
+             return queue.empty() and pending_tasks <= 0
+
+         total_yielded = 0
+         if progress:
+             pbar = tqdm(total=0)
+         else:
+             pbar = None
+
+         async with self.crawler as crawler:
+             async def submitter():
+                 nonlocal pending_tasks
+                 while True:
+                     task = await queue.get()
+
+                     if task is None:
+                         break
+
+                     if task.url in self.seen_urls or task.url in inflight:
+                         queue.task_done()
+                         continue
+
+                     # Mark URL as seen immediately
+                     self.seen_urls.add(task.url)
+                     inflight[task.url] = task
+
+                     pending_tasks += 1
+                     crawler.submit(Request(task.url, max_retries=0))
+                     queue.task_done()
+
+             submit_task = asyncio.create_task(submitter())
+
+             # Seed the pipeline with a dummy task
+             seed_task = CrawlTask(
+                 elem=None,
+                 url=None,
+                 segments=segments,
+                 depth=-1,
+                 backlink=None,
+             )
+             async for output in self._process_pipeline(
+                 task=seed_task,
+                 elem=None,
+                 depth=seed_task.depth,
+                 max_depth=max_depth,
+                 queue=queue,
+                 pbar=pbar,
+             ):
+                 yield await self.post_extract_hooks(output)
+
+             # When looping over asynchronous generators, terminal conditions
+             # MUST be checked before each re-iteration.
+             async for resp in crawler:
+                 if pbar is not None:
+                     pbar.update(1)
+                     pbar.refresh()
+
+                 task = inflight.pop(resp.request.url, None)
+                 pending_tasks -= 1
+
+                 if task is None:
+                     log.warning(f"Got unexpected response from {resp.request.url}")
+                     if is_terminal():
+                         break
+                     continue
+
+                 if resp.error:
+                     log.warning(f"Got error from {resp.request.url}: {resp.error}")
+                     if is_terminal():
+                         break
+                     continue
+
+                 # NOTE: Consider allowing redirects
+                 if resp.status not in self.allowed_response_codes or not resp.body:
+                     log.warning(f"Got disallowed status or empty body from {resp.request.url}")
+                     if is_terminal():
+                         break
+                     continue
+
+                 body = await self.post_fetch_hooks(resp.body, task)
+                 if not body:
+                     if is_terminal():
+                         break
+                     continue
+
+                 elem = parse_html(
+                     body,
+                     base_url=task.url,
+                     backlink=task.backlink,
+                     depth=task.depth,
+                 )
+
+                 elem = await self.post_parse_hooks(elem, task)
+                 if elem is None:
+                     if is_terminal():
+                         break
+                     continue
+
+                 if task.segments:
+                     async for output in self._process_pipeline(
+                         task=task,
+                         elem=elem,
+                         depth=task.depth,
+                         max_depth=max_depth,
+                         queue=queue,
+                         pbar=pbar
+                     ):
+                         total_yielded += 1
+                         if pbar is not None:
+                             pbar.set_postfix(yielded=total_yielded, depth=task.depth)
+
+                         yield await self.post_extract_hooks(output)
+                 else:
+                     total_yielded += 1
+                     if pbar is not None:
+                         pbar.set_postfix(yielded=total_yielded, depth=task.depth)
+
+                     yield await self.post_extract_hooks(elem)
+
+                 # Termination condition
+                 if is_terminal():
+                     break
+
+             submit_task.cancel()
+             with contextlib.suppress(asyncio.CancelledError):
+                 await submit_task
+
+         if pbar is not None:
+             pbar.close()
+
+     async def _process_pipeline(
+         self,
+         task: CrawlTask,
+         elem: Any,
+         depth: int,
+         max_depth: int,
+         queue: asyncio.Queue[CrawlTask],
+         pbar: tqdm | None = None
+     ) -> AsyncGenerator[Any, None]:
+         """Process a queue of intents for a single crawl branch.
+
+         Traverses wxpath segments depth-first within a page while coordinating
+         newly discovered crawl intents back to the shared queue.
+
+         Args:
+             task: The originating crawl task for this branch.
+             elem: Current DOM element (or extracted value) being processed.
+             depth: Current traversal depth.
+             max_depth: Maximum permitted crawl depth.
+             queue: Shared crawl queue for enqueuing downstream URLs.
+             pbar: Optional progress bar updated as new URLs are enqueued.
+
+         Yields:
+             object: Extracted values or processed elements as produced by operators.
+         """
+         mini_queue: deque[tuple[HtmlElement | Any, list[Binary | Segment] | Segments]] = deque(
+             [(elem, task.segments)]
+         )
+
+         while mini_queue:
+             elem, bin_or_segs = mini_queue.popleft()
+
+             binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
+             operator = get_operator(binary_or_segment)
+             intents = operator(elem, bin_or_segs, depth)
+
+             if not intents:
+                 return
+
+             for intent in intents:
+                 if isinstance(intent, DataIntent):
+                     yield intent.value
+
+                 elif isinstance(intent, CrawlIntent):
+                     next_depth = task.depth + 1
+                     # if intent.url not in self.seen_urls and next_depth <= max_depth:
+                     if next_depth <= max_depth and intent.url not in self.seen_urls:
+                         # self.seen_urls.add(intent.url)
+                         log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
+
+                         queue.put_nowait(
+                             CrawlTask(
+                                 elem=None,
+                                 url=intent.url,
+                                 segments=intent.next_segments,
+                                 depth=next_depth,
+                                 backlink=task.url,
+                             )
+                         )
+                         if pbar is not None:
+                             pbar.total += 1
+                             pbar.refresh()
+
+                 elif isinstance(intent, (ExtractIntent, ProcessIntent, InfiniteCrawlIntent)):
+                     # immediately traverse the extraction
+                     elem = intent.elem
+                     next_segments = intent.next_segments
+                     mini_queue.append((elem, next_segments))
+
+
+ def wxpath_async(path_expr: str,
+                  max_depth: int,
+                  progress: bool = False,
+                  engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
+     """Return an async generator that evaluates `path_expr` up to `max_depth`."""
+     if engine is None:
+         engine = WXPathEngine()
+     return engine.run(path_expr, max_depth, progress=progress)
+
+
+ ##### ASYNC IN SYNC #####
+ def wxpath_async_blocking_iter(
+     path_expr: str,
+     max_depth: int = 1,
+     progress: bool = False,
+     engine: WXPathEngine | None = None,
+ ) -> Iterator[Any]:
+     """Evaluate a wxpath expression using concurrent breadth-first traversal.
+
+     Warning:
+         This function spins up its own event loop, so it must **not** be
+         invoked from within an active asyncio event loop.
+
+     Args:
+         path_expr: A wxpath expression.
+         max_depth: Maximum crawl depth. Must be at least the number of
+             ``url*`` segments minus one.
+         progress: Whether to display a progress bar.
+         engine: Optional pre-configured WXPathEngine instance.
+
+     Yields:
+         object: Extracted objects (HtmlElement, WxStr, dict, or other values)
+             produced by the expression evaluator.
+     """
+     loop = asyncio.new_event_loop()
+     asyncio.set_event_loop(loop)
+     agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress, engine=engine)
+
+     try:
+         while True:
+             try:
+                 yield loop.run_until_complete(agen.__anext__())
+             except StopAsyncIteration:
+                 break
+     finally:
+         loop.run_until_complete(loop.shutdown_asyncgens())
+         loop.close()
+
+
+ def wxpath_async_blocking(
+     path_expr: str,
+     max_depth: int = 1,
+     progress: bool = False,
+     engine: WXPathEngine | None = None,
+ ) -> list[Any]:
+     """Like `wxpath_async_blocking_iter`, but collect all results into a list."""
+     return list(wxpath_async_blocking_iter(path_expr,
+                                            max_depth=max_depth,
+                                            progress=progress,
+                                            engine=engine))
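
For orientation, a minimal usage sketch of the blocking helper above. The import location and the wxpath expression syntax are assumptions for illustration; neither is confirmed by this diff.

    # Hedged usage sketch -- NOT part of the package.
    from wxpath import wxpath_async_blocking_iter  # assumed re-export

    for item in wxpath_async_blocking_iter(
        "url('https://example.com')//a/@href",  # hypothetical expression
        max_depth=1,
        progress=True,
    ):
        print(item)
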
@@ -0,0 +1,41 @@
+ from lxml import etree, html
+
+ from wxpath import patches
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
+     elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
+     if base_url:
+         elem.getroottree().docinfo.URL = base_url  # make base-uri() work
+         # Also set xml:base on the root element for XPath base-uri()
+         elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
+         elem.base_url = base_url  # sets both attribute and doc-level URL
+
+     # NOTE: some pages may have multiple root elements, i.e.
+     # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
+     # This breaks elementpath. If elem has siblings, re-parse so that
+     # only the root element is kept.
+     if len(list(elem.itersiblings())) > 0:
+         elem = detach_html_root(elem, base_url)
+
+     for k, v in elem_kv_pairs.items():
+         elem.set(k, str(v))
+     return elem
+
+
+ def detach_html_root(elem, base_url=None):
+     new_root = etree.HTML(
+         etree.tostring(elem, encoding="utf-8"),
+         parser=patches.html_parser_with_xpath3,
+         base_url=base_url
+     )
+
+     if base_url:
+         new_root.getroottree().docinfo.URL = base_url
+         new_root.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
+         new_root.base_url = base_url
+
+     return new_root
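
A small sketch of exercising parse_html above. The module path is taken from the engine's own import (wxpath.core.runtime.helpers); the HTML string is made up.

    # Illustrative only.
    from wxpath.core.runtime.helpers import parse_html

    elem = parse_html(
        "<html><body><a href='/about'>About</a></body></html>",
        base_url="https://example.com",
        depth=0,  # extra keyword pairs are stored as element attributes
    )
    print(elem.get("depth"))               # "0"
    print(elem.getroottree().docinfo.URL)  # "https://example.com"
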
@@ -0,0 +1,9 @@
+ from wxpath.hooks.builtin import JSONLWriter as JSONLWriter
+ from wxpath.hooks.builtin import SerializeXPathMapAndNodeHook as SerializeXPathMapAndNodeHook
+ from wxpath.hooks.registry import register as register
+
+ __all__ = [
+     "JSONLWriter",
+     "SerializeXPathMapAndNodeHook",
+     "register",
+ ]
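
Given these re-exports, registering a custom hook might look like the sketch below. The UpperCaser class is a made-up example, and it is assumed (not confirmed by this diff) that register accepts a hook instance.

    # Hypothetical example hook.
    from wxpath.hooks import register

    class UpperCaser:
        def post_extract(self, value):
            # Returning None would cause the engine to drop the value.
            return value.upper() if isinstance(value, str) else value

    register(UpperCaser())  # assumed to take an instance
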
@@ -0,0 +1,113 @@
+ import atexit
+ import json
+ import os
+ import queue
+ import threading
+ import time
+
+ from elementpath.serialization import XPathMap, XPathNode
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class SerializeXPathMapAndNodeHook:
+     """
+     Serialize XPathMap and XPathNode objects to plain Python types.
+     This is enabled by default (once this module is imported).
+     """
+     def post_extract(self, value):
+         if isinstance(value, (list, tuple, set)):
+             return type(value)(self.post_extract(v) for v in value)
+         if isinstance(value, XPathMap):
+             return {k: self.post_extract(v) for k, v in value.items()}
+         if isinstance(value, XPathNode):
+             return self.post_extract(value.obj)
+         return value
+
+
+ class JSONLWriter:
+     """
+     Efficient writer that mirrors items to an NDJSON file.
+     - Non-blocking: post_extract enqueues and returns immediately.
+     - Background thread flushes to disk.
+     - Skips non-JSONable values (e.g., raw HtmlElement) by default.
+       Customize _jsonable() to change behavior.
+     """
+     def __init__(self, path=None):
+         self.path = path or os.getenv("WXPATH_OUT", "extractions.ndjson")
+         self._q: "queue.Queue[str]" = queue.Queue(maxsize=10000)
+         self._written = 0
+         self._dropped = 0
+         self._stop = False
+         self._t = threading.Thread(target=self._writer, name="wxpath-ndjson-writer", daemon=True)
+         self._t.start()
+         atexit.register(self._shutdown)
+
+     # ---- hook API ----
+     def post_extract(self, value):
+         js = self._jsonable(value)
+         if js is not None:
+             line = json.dumps(js, ensure_ascii=False, separators=(",", ":"))
+             try:
+                 self._q.put_nowait(line)
+             except queue.Full:
+                 self._dropped += 1
+                 if self._dropped in (1, 100, 1000) or self._dropped % 10000 == 0:
+                     log.warning("NDJSON queue full; dropping items",
+                                 extra={"dropped": self._dropped, "written": self._written})
+         return value  # always pass-through
+
+     # ---- internals ----
+     def _writer(self):
+         # Open lazily to avoid creating files when nothing is produced.
+         f = None
+         try:
+             last_flush = time.time()
+             while not self._stop or not self._q.empty():
+                 try:
+                     line = self._q.get(timeout=0.5)
+                 except queue.Empty:
+                     line = None
+                 if line is not None:
+                     if f is None:
+                         f = open(self.path, "a", buffering=1, encoding="utf-8")  # line-buffered
+                     f.write(line)
+                     f.write("\n")
+                     self._written += 1
+                 # periodic flush guard for OS buffers even with line buffering
+                 if f and (time.time() - last_flush) > 1.0:
+                     f.flush()
+                     last_flush = time.time()
+         finally:
+             if f:
+                 f.flush()
+                 f.close()
+             if self._dropped:
+                 log.warning("NDJSON writer finished with drops",
+                             extra={"dropped": self._dropped, "written": self._written})
+
+     def _shutdown(self):
+         self._stop = True
+         if self._t.is_alive():
+             self._t.join(timeout=2)
+
+     def _jsonable(self, v):
+         # Keep it conservative: only write JSON-friendly shapes by default.
+         # You can relax this if you want to serialize HtmlElement metadata, etc.
+         if v is None or isinstance(v, (bool, int, float, str, list, dict)):
+             return v
+         # Handle common wxpath types gently:
+         # - WxStr: stringify
+         if v.__class__.__name__ == "WxStr":
+             return str(v)
+         # - lxml HtmlElement: record minimal metadata instead of the whole DOM
+         base_url = getattr(v, "base_url", None)
+         tag = getattr(v, "tag", None)
+         if base_url or tag:
+             return {"_element": tag, "url": base_url}
+         return None  # skip unknowns
+
+
+ NDJSONWriter = JSONLWriter
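
To show where the writer above fits, a hedged sketch of mirroring extractions to NDJSON. It assumes, as in the previous sketch, that wxpath.hooks.register accepts a hook instance; the output path is arbitrary.

    # Sketch only: mirror every JSON-friendly extraction to an NDJSON file.
    from wxpath.hooks import JSONLWriter, register

    register(JSONLWriter(path="extractions.ndjson"))
    # post_extract() passes values through unchanged, so downstream consumers
    # still receive every item; non-JSONable values are skipped by _jsonable().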