wxpath 0.1.1-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,406 @@
1
+ import asyncio
2
+ import contextlib
3
+ import inspect
4
+ from collections import deque
5
+ from typing import Any, AsyncGenerator, Iterator
6
+
7
+ from lxml.html import HtmlElement
8
+
9
+ from wxpath import patches # noqa: F401
10
+ from wxpath.core import parser
11
+ from wxpath.core.models import (
12
+ CrawlIntent,
13
+ CrawlTask,
14
+ DataIntent,
15
+ ExtractIntent,
16
+ InfiniteCrawlIntent,
17
+ ProcessIntent,
18
+ )
19
+ from wxpath.core.ops import get_operator
20
+ from wxpath.core.parser import Binary, Segment, Segments
21
+ from wxpath.core.runtime.helpers import parse_html
22
+ from wxpath.hooks.registry import FetchContext, get_hooks
23
+ from wxpath.http.client.crawler import Crawler
24
+ from wxpath.http.client.request import Request
25
+ from wxpath.util.logging import get_logger
26
+
27
+ log = get_logger(__name__)
28
+
29
+
30
+ class HookedEngineBase:
31
+ """Common hook invocation helpers shared by engine variants."""
32
+
33
+ async def post_fetch_hooks(self, body: bytes | str, task: CrawlTask) -> bytes | str | None:
34
+ """Run registered `post_fetch` hooks over a fetched response body.
35
+
36
+ Hooks may be synchronous or asynchronous and can transform or drop the
37
+ response payload entirely.
38
+
39
+ Args:
40
+ body: Raw response body (bytes, or str if already decoded) from the crawler.
41
+ task: The `CrawlTask` that produced the response.
42
+
43
+ Returns:
44
+ The transformed body, or `None` if any hook chooses to drop it.
45
+ """
46
+ for hook in get_hooks():
47
+ hook_method = getattr(hook, "post_fetch", lambda _, b: b)
48
+ if inspect.iscoroutinefunction(hook_method):
49
+ body = await hook_method(
50
+ FetchContext(task.url, task.backlink, task.depth, task.segments),
51
+ body
52
+ )
53
+ else:
54
+ body = hook_method(
55
+ FetchContext(task.url, task.backlink, task.depth, task.segments),
56
+ body
57
+ )
58
+ if not body:
59
+ log.debug(f"hook {type(hook).__name__} dropped {task.url}")
60
+ break
61
+ return body
62
+
63
+ async def post_parse_hooks(
64
+ self, elem: HtmlElement | None, task: CrawlTask
65
+ ) -> HtmlElement | None:
66
+ """Run registered `post_parse` hooks on a parsed DOM element.
67
+
68
+ Args:
69
+ elem: Parsed `lxml` element to process.
70
+ task: The originating `CrawlTask`.
71
+
72
+ Returns:
73
+ The transformed element, or `None` if a hook drops the branch.
74
+ """
75
+ for hook in get_hooks():
76
+ hook_method = getattr(hook, "post_parse", lambda _, e: e)
77
+ if inspect.iscoroutinefunction(hook_method):
78
+ elem = await hook_method(
79
+ FetchContext(
80
+ url=task.url,
81
+ backlink=task.backlink,
82
+ depth=task.depth,
83
+ segments=task.segments
84
+ ),
85
+ elem,
86
+ )
87
+ else:
88
+ elem = hook_method(
89
+ FetchContext(
90
+ url=task.url,
91
+ backlink=task.backlink,
92
+ depth=task.depth,
93
+ segments=task.segments
94
+ ),
95
+ elem,
96
+ )
97
+ if elem is None:
98
+ log.debug(f"hook {type(hook).__name__} dropped {task.url}")
99
+ break
100
+ return elem
101
+
102
+ async def post_extract_hooks(self, value: Any) -> Any | None:
103
+ """Run registered `post_extract` hooks on extracted values.
104
+
105
+ Args:
106
+ value: The extracted datum to post-process.
107
+
108
+ Returns:
109
+ The transformed value, or `None` if a hook drops it.
110
+ """
111
+ for hook in get_hooks():
112
+ hook_method = getattr(hook, "post_extract", lambda v: v)
113
+ if inspect.iscoroutinefunction(hook_method):
114
+ value = await hook_method(value)
115
+ else:
116
+ value = hook_method(value)
117
+ if value is None:
118
+ log.debug(f"hook {type(hook).__name__} dropped value")
119
+ break
120
+ return value
121
+
122
+
123
+ class WXPathEngine(HookedEngineBase):
124
+ """Main class for executing wxpath expressions.
125
+
126
+ The core pattern is to build a queue of CrawlTasks that are crawled and
127
+ processed FIFO. Traversal of the queue (and therefore the web graph) is
128
+ done concurrently in BFS-ish order.
129
+
130
+ Args:
131
+ crawler: Crawler instance to use for HTTP requests.
132
+ concurrency: Number of concurrent fetches at the Crawler level.
133
+ per_host: Number of concurrent fetches per host.
134
+ respect_robots: Whether to respect robots.txt directives.
135
+ allowed_response_codes: Set of allowed HTTP response codes. Defaults
136
+ to ``{200}``. Responses may still be filtered and dropped.
137
+ allow_redirects: Whether to follow HTTP redirects. Defaults to ``True``.
138
+ """
139
+ def __init__(
140
+ self,
141
+ crawler: Crawler | None = None,
142
+ concurrency: int = 16,
143
+ per_host: int = 8,
144
+ respect_robots: bool = True,
145
+ allowed_response_codes: set[int] | None = None,
146
+ allow_redirects: bool = True,
147
+ ):
148
+ # NOTE: Will grow unbounded in large crawls. Consider an LRU cache or a Bloom filter.
149
+ self.seen_urls: set[str] = set()
150
+ self.crawler = crawler or Crawler(
151
+ concurrency=concurrency,
152
+ per_host=per_host,
153
+ respect_robots=respect_robots
154
+ )
155
+ self.allowed_response_codes = allowed_response_codes or {200}
156
+ self.allow_redirects = allow_redirects
157
+ if allow_redirects:
158
+ self.allowed_response_codes |= {301, 302, 303, 307, 308}
159
+
160
+ async def run(self, expression: str, max_depth: int) -> AsyncGenerator[Any, None]:
161
+ """Execute a wxpath expression concurrently and yield results.
162
+
163
+ Builds and drives a BFS-like crawl pipeline that honors robots rules,
164
+ throttling, and hook callbacks while walking the web graph.
165
+
166
+ Args:
167
+ expression: WXPath expression string to evaluate.
168
+ max_depth: Maximum crawl depth to follow for url hops.
169
+
170
+ Yields:
171
+ Extracted values produced by the expression (HTML elements or
172
+ wxpath-specific value types).
173
+ """
174
+ segments = parser.parse(expression)
175
+
176
+ queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
177
+ inflight: dict[str, CrawlTask] = {}
178
+ pending_tasks = 0
179
+
180
+ def is_terminal():
181
+ # NOTE: consider adopting state machine pattern for determining
182
+ # the current state of the engine.
183
+ return queue.empty() and pending_tasks <= 0
184
+
185
+ async with self.crawler as crawler:
186
+ async def submitter():
187
+ nonlocal pending_tasks
188
+ while True:
189
+ task = await queue.get()
190
+
191
+ if task is None:
192
+ break
193
+
194
+ if task.url in self.seen_urls or task.url in inflight:
195
+ queue.task_done()
196
+ continue
197
+
198
+ # Mark URL as seen immediately
199
+ self.seen_urls.add(task.url)
200
+ inflight[task.url] = task
201
+
202
+ pending_tasks += 1
203
+ crawler.submit(Request(task.url, max_retries=0))
204
+ queue.task_done()
205
+
206
+ submit_task = asyncio.create_task(submitter())
207
+
208
+ # Seed the pipeline with a dummy task
209
+ seed_task = CrawlTask(
210
+ elem=None,
211
+ url=None,
212
+ segments=segments,
213
+ depth=-1,
214
+ backlink=None,
215
+ )
216
+ async for output in self._process_pipeline(
217
+ task=seed_task,
218
+ elem=None,
219
+ depth=seed_task.depth,
220
+ max_depth=max_depth,
221
+ queue=queue,
222
+ ):
223
+ yield await self.post_extract_hooks(output)
224
+
225
+ # When iterating the crawler's async generator, check the terminal
226
+ # condition before re-iterating, or the loop can block forever.
227
+ async for resp in crawler:
228
+ task = inflight.pop(resp.request.url, None)
229
+ pending_tasks -= 1
230
+
231
+ if task is None:
232
+ log.warning(f"Got unexpected response from {resp.request.url}")
233
+ if is_terminal():
234
+ break
235
+ continue
236
+
237
+ if resp.error:
238
+ log.warning(f"Got error from {resp.request.url}: {resp.error}")
239
+ if is_terminal():
240
+ break
241
+ continue
242
+
243
+ # Redirect statuses were already added to allowed_response_codes when allow_redirects=True.
244
+ if resp.status not in self.allowed_response_codes or not resp.body:
245
+ log.warning(f"Got disallowed status {resp.status} or empty body from {resp.request.url}")
246
+ if is_terminal():
247
+ break
248
+ continue
249
+
250
+ body = await self.post_fetch_hooks(resp.body, task)
251
+ if not body:
252
+ if is_terminal():
253
+ break
254
+ continue
255
+
256
+ elem = parse_html(
257
+ body,
258
+ base_url=task.url,
259
+ backlink=task.backlink,
260
+ depth=task.depth,
261
+ )
262
+
263
+ elem = await self.post_parse_hooks(elem, task)
264
+ if elem is None:
265
+ if is_terminal():
266
+ break
267
+ continue
268
+
269
+ if task.segments:
270
+ async for output in self._process_pipeline(
271
+ task=task,
272
+ elem=elem,
273
+ depth=task.depth,
274
+ max_depth=max_depth,
275
+ queue=queue,
276
+ ):
277
+
278
+ yield await self.post_extract_hooks(output)
279
+ else:
280
+ yield await self.post_extract_hooks(elem)
281
+
282
+ # Termination condition
283
+ if is_terminal():
284
+ break
285
+
286
+ submit_task.cancel()
287
+ with contextlib.suppress(asyncio.CancelledError):
288
+ await submit_task
289
+
290
+ async def _process_pipeline(
291
+ self,
292
+ task: CrawlTask,
293
+ elem: Any,
294
+ depth: int,
295
+ max_depth: int,
296
+ queue: asyncio.Queue[CrawlTask],
297
+ ) -> AsyncGenerator[Any, None]:
298
+ """Process a queue of intents for a single crawl branch.
299
+
300
+ Traverses wxpath segments depth-first within a page while coordinating
301
+ newly discovered crawl intents back to the shared queue.
302
+
303
+ Args:
304
+ task: The originating crawl task for this branch.
305
+ elem: Current DOM element (or extracted value) being processed.
306
+ depth: Current traversal depth.
307
+ max_depth: Maximum permitted crawl depth.
308
+ queue: Shared crawl queue for enqueuing downstream URLs.
309
+
310
+ Yields:
311
+ object: Extracted values or processed elements as produced by operators.
312
+ """
313
+ mini_queue: deque[tuple[HtmlElement | Any, list[Binary | Segment] | Segments]] = deque(
314
+ [(elem, task.segments)]
315
+ )
316
+
317
+ while mini_queue:
318
+ elem, bin_or_segs = mini_queue.popleft()
319
+
320
+ binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
321
+ operator = get_operator(binary_or_segment)
322
+ intents = operator(elem, bin_or_segs, depth)
323
+
324
+ if not intents:
325
+ return
326
+
327
+ for intent in intents:
328
+ if isinstance(intent, DataIntent):
329
+ yield intent.value
330
+
331
+ elif isinstance(intent, CrawlIntent):
332
+ next_depth = task.depth + 1
333
+ # URL deduplication happens in submitter() when tasks are dequeued;
334
+ # only the depth limit is enforced here before enqueuing the next hop.
335
+ if next_depth <= max_depth:
336
+ log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
337
+ queue.put_nowait(
338
+ CrawlTask(
339
+ elem=None,
340
+ url=intent.url,
341
+ segments=intent.next_segments,
342
+ depth=next_depth,
343
+ backlink=task.url,
344
+ )
345
+ )
346
+
347
+ elif isinstance(intent, (ExtractIntent, ProcessIntent, InfiniteCrawlIntent)):
348
+ # immediately traverse the extraction
349
+ next_elem = intent.elem
350
+ next_segments = intent.next_segments
351
+ mini_queue.append((next_elem, next_segments))
352
+
353
+
354
+ def wxpath_async(path_expr: str, max_depth: int,
355
+ engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
356
+ """Return the async generator from evaluating ``path_expr`` with ``engine`` (a fresh WXPathEngine if None)."""
357
+ if engine is None:
358
+ engine = WXPathEngine()
359
+ return engine.run(path_expr, max_depth)
360
+
361
+
362
+ ##### ASYNC IN SYNC #####
363
+ def wxpath_async_blocking_iter(
364
+ path_expr: str,
365
+ max_depth: int = 1,
366
+ engine: WXPathEngine | None = None,
367
+ ) -> Iterator[Any]:
368
+ """Evaluate a wxpath expression using concurrent breadth-first traversal.
369
+
370
+ Warning:
371
+ Spins up its own event loop, so this function must **not** be
372
+ invoked from within an already-running asyncio event loop.
373
+
374
+ Args:
375
+ path_expr: A wxpath expression.
376
+ max_depth: Maximum crawl depth. Must be at least the number of
377
+ ``url*`` segments minus one.
378
+ engine: Optional pre-configured WXPathEngine instance.
379
+
380
+ Yields:
381
+ object: Extracted objects (HtmlElement, WxStr, dict, or other values)
382
+ produced by the expression evaluator.
383
+ """
384
+ loop = asyncio.new_event_loop()
385
+ asyncio.set_event_loop(loop)
386
+ agen = wxpath_async(path_expr, max_depth=max_depth, engine=engine)
387
+
388
+ try:
389
+ while True:
390
+ try:
391
+ yield loop.run_until_complete(agen.__anext__())
392
+ except StopAsyncIteration:
393
+ break
394
+ finally:
395
+ loop.run_until_complete(loop.shutdown_asyncgens())
396
+ loop.close()
397
+
398
+
399
+ def wxpath_async_blocking(
400
+ path_expr: str,
401
+ max_depth: int = 1,
402
+ engine: WXPathEngine | None = None,
403
+ ) -> list[Any]:
404
+ """Blocking convenience wrapper: collect all results from ``wxpath_async_blocking_iter`` into a list."""
405
+ return list(
406
+ wxpath_async_blocking_iter(path_expr, max_depth=max_depth, engine=engine))
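
The module exposes the same semantics through different drivers: `WXPathEngine.run` / `wxpath_async` for callers already inside an event loop, and `wxpath_async_blocking_iter` / `wxpath_async_blocking` for synchronous code. A minimal usage sketch follows; the expression string is a placeholder and the import path for these names is not shown in this diff, so both are assumptions.

```python
import asyncio

# Assumed import path; the diff does not name the module these objects live in.
# from wxpath.core.engine import WXPathEngine, wxpath_async, wxpath_async_blocking_iter

async def main() -> None:
    engine = WXPathEngine(concurrency=8, per_host=4, respect_robots=True)
    # "<wxpath expression>" is a placeholder; see the wxpath docs for the syntax.
    async for item in wxpath_async("<wxpath expression>", max_depth=2, engine=engine):
        print(item)

# Async usage: drive the generator from an event loop.
asyncio.run(main())

# Blocking usage: only valid when no event loop is already running.
for item in wxpath_async_blocking_iter("<wxpath expression>", max_depth=2):
    print(item)
```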
@@ -0,0 +1,41 @@
1
+ from lxml import etree, html
2
+
3
+ from wxpath import patches
4
+ from wxpath.util.logging import get_logger
5
+
6
+ log = get_logger(__name__)
7
+
8
+
9
+ def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
10
+ elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
11
+ if base_url:
12
+ elem.getroottree().docinfo.URL = base_url # make base-uri() work
13
+ # Also set xml:base on the root element for XPath base-uri()
14
+ elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
15
+ elem.base_url = base_url # sets both attribute and doc-level URL
16
+
17
+ # NOTE: some pages may have multiple root elements, i.e.
18
+ # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
19
+ # This breaks elementpath. If elem has siblings, recreate the
20
+ # root element and only the root element.
21
+ if len(list(elem.itersiblings())) > 0:
22
+ elem = detach_html_root(elem, base_url)
23
+
24
+ for k, v in elem_kv_pairs.items():
25
+ elem.set(k, str(v))
26
+ return elem
27
+
28
+
29
+ def detach_html_root(elem, base_url=None):
30
+ new_root = etree.HTML(
31
+ etree.tostring(elem, encoding="utf-8"),
32
+ parser=patches.html_parser_with_xpath3,
33
+ base_url=base_url
34
+ )
35
+
36
+ if base_url:
37
+ new_root.getroottree().docinfo.URL = base_url
38
+ new_root.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
39
+ new_root.base_url = base_url
40
+
41
+ return new_root
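
As the NOTE above explains, `parse_html` pins the base URL in several places so that XPath `base-uri()` resolves, and any extra keyword pairs are stored as string attributes on the root element. A small illustration (sketch only: the HTML and URLs are made up, and it assumes `parse_html` is importable from this helpers module):

```python
doc = parse_html(
    b"<html><body><a href='/next'>next</a></body></html>",
    base_url="https://example.com/start",
    depth=0,
    backlink="https://example.com/",
)

print(doc.get("depth"))     # "0" -- extra kwargs become string attributes
print(doc.get("backlink"))  # "https://example.com/"
print(doc.base_url)         # "https://example.com/start"
```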
@@ -0,0 +1,9 @@
1
+ from wxpath.hooks.builtin import JSONLWriter as JSONLWriter
2
+ from wxpath.hooks.builtin import SerializeXPathMapAndNodeHook as SerializeXPathMapAndNodeHook
3
+ from wxpath.hooks.registry import register as register
4
+
5
+ __all__ = [
6
+ "JSONLWriter",
7
+ "SerializeXPathMapAndNodeHook",
8
+ "register",
9
+ ]
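
The engine duck-types hooks: it looks up `post_fetch`, `post_parse`, and `post_extract` on each object returned by `get_hooks()`, so a hook only needs to implement the methods it cares about. Below is a sketch of a custom hook; how `register` is invoked is an assumption based on these re-exports, since the registry implementation is not part of this diff.

```python
from wxpath import hooks  # assuming this file is the package's hooks/__init__.py

class DropShortPages:
    """Illustrative hook: drop responses whose body looks too small to be useful."""

    def post_fetch(self, ctx, body):
        # ctx is the FetchContext(url, backlink, depth, segments) built by the engine.
        if body is not None and len(body) < 512:
            return None  # a falsy return drops this branch of the crawl
        return body

hooks.register(DropShortPages())
```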
@@ -0,0 +1,113 @@
1
+ import atexit
2
+ import json
3
+ import os
4
+ import queue
5
+ import threading
6
+ import time
7
+
8
+ from elementpath.serialization import XPathMap, XPathNode
9
+
10
+ from wxpath.util.logging import get_logger
11
+
12
+ log = get_logger(__name__)
13
+
14
+
15
+ class SerializeXPathMapAndNodeHook:
16
+ """
17
+ Serialize XPathMap and XPathNode objects to plain Python types.
18
+ This is enabled by default (once this module is imported).
19
+ """
20
+ def post_extract(self, value):
21
+ if isinstance(value, (list, tuple, set)):
22
+ return type(value)(self.post_extract(v) for v in value)
23
+ if isinstance(value, XPathMap):
24
+ return {k: self.post_extract(v) for k, v in value.items()}
25
+ if isinstance(value, XPathNode):
26
+ return self.post_extract(value.obj)
27
+ return value
28
+
29
+
30
+ class JSONLWriter:
31
+ """
32
+ Efficient writer that mirrors items to an NDJSON file.
33
+ - Non-blocking: post_extract enqueues and returns immediately.
34
+ - Background thread flushes to disk.
35
+ - Skips non-JSONable values (e.g., raw HtmlElement) by default.
36
+ Customize _jsonable() to change behavior.
37
+ """
38
+ def __init__(self, path=None):
39
+ self.path = path or os.getenv("WXPATH_OUT", "extractions.ndjson")
40
+ self._q: "queue.Queue[str]" = queue.Queue(maxsize=10000)
41
+ self._written = 0
42
+ self._dropped = 0
43
+ self._stop = False
44
+ self._t = threading.Thread(target=self._writer, name="wxpath-ndjson-writer", daemon=True)
45
+ self._t.start()
46
+ atexit.register(self._shutdown)
47
+
48
+ # ---- hook API ----
49
+ def post_extract(self, value):
50
+ js = self._jsonable(value)
51
+ if js is not None:
52
+ line = json.dumps(js, ensure_ascii=False, separators=(",", ":"), default=str)  # default=str guards nested non-JSONable items
53
+ try:
54
+ self._q.put_nowait(line)
55
+ except queue.Full:
56
+ self._dropped += 1
57
+ if self._dropped in (1, 100, 1000) or self._dropped % 10000 == 0:
58
+ log.warning("NDJSON queue full; dropping items",
59
+ extra={"dropped": self._dropped, "written": self._written})
60
+ return value # always pass-through
61
+
62
+ # ---- internals ----
63
+ def _writer(self):
64
+ # Open lazily to avoid creating files when nothing is produced.
65
+ f = None
66
+ try:
67
+ last_flush = time.time()
68
+ while not self._stop or not self._q.empty():
69
+ try:
70
+ line = self._q.get(timeout=0.5)
71
+ except queue.Empty:
72
+ line = None
73
+ if line is not None:
74
+ if f is None:
75
+ f = open(self.path, "a", buffering=1, encoding="utf-8") # line-buffered
76
+ f.write(line)
77
+ f.write("\n")
78
+ self._written += 1
79
+ # periodic flush guard for OS buffers even with line buffering
80
+ if f and (time.time() - last_flush) > 1.0:
81
+ f.flush()
82
+ last_flush = time.time()
83
+ finally:
84
+ if f:
85
+ f.flush()
86
+ f.close()
87
+ if self._dropped:
88
+ log.warning("NDJSON writer finished with drops",
89
+ extra={"dropped": self._dropped, "written": self._written})
90
+
91
+ def _shutdown(self):
92
+ self._stop = True
93
+ if self._t.is_alive():
94
+ self._t.join(timeout=2)
95
+
96
+ def _jsonable(self, v):
97
+ # Keep it conservative: only write JSON-friendly shapes by default.
98
+ # You can relax this if you want to serialize HtmlElement metadata, etc.
99
+ if v is None or isinstance(v, (bool, int, float, str, list, dict)):
100
+ return v
101
+ # Handle common wxpath types gently:
102
+ # - WxStr: stringify
103
+ if v.__class__.__name__ == "WxStr":
104
+ return str(v)
105
+ # - lxml HtmlElement: record minimal metadata instead of the whole DOM
106
+ base_url = getattr(v, "base_url", None)
107
+ tag = getattr(v, "tag", None)
108
+ if base_url or tag:
109
+ return {"_element": tag, "url": base_url}
110
+ return None # skip unknowns
111
+
112
+
113
+ NDJSONWriter = JSONLWriter
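
Typical wiring for the two built-in hooks (a sketch: it assumes `register` accepts hook instances and that hooks run in registration order, neither of which is shown in this diff):

```python
from wxpath.hooks import JSONLWriter, SerializeXPathMapAndNodeHook, register

register(SerializeXPathMapAndNodeHook())      # unwrap XPathMap / XPathNode values first
register(JSONLWriter(path="results.ndjson"))  # then mirror extracted values to NDJSON
# Setting the WXPATH_OUT environment variable is an alternative to passing `path`.
# NDJSONWriter is an alias for JSONLWriter.
```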