wxpath 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +9 -0
- wxpath/cli.py +92 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +278 -0
- wxpath/core/parser.py +598 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +406 -0
- wxpath/core/runtime/helpers.py +41 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +145 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/crawler.py +231 -0
- wxpath/http/client/request.py +38 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/robots.py +82 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +96 -0
- wxpath/patches.py +63 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/METADATA +107 -129
- wxpath-0.3.0.dist-info/RECORD +33 -0
- wxpath-0.3.0.dist-info/top_level.txt +1 -0
- wxpath-0.1.1.dist-info/RECORD +0 -6
- wxpath-0.1.1.dist-info/top_level.txt +0 -1
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/WHEEL +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE +0 -0
wxpath/core/runtime/engine.py
ADDED
@@ -0,0 +1,406 @@
import asyncio
import contextlib
import inspect
from collections import deque
from typing import Any, AsyncGenerator, Iterator

from lxml.html import HtmlElement

from wxpath import patches  # noqa: F401
from wxpath.core import parser
from wxpath.core.models import (
    CrawlIntent,
    CrawlTask,
    DataIntent,
    ExtractIntent,
    InfiniteCrawlIntent,
    ProcessIntent,
)
from wxpath.core.ops import get_operator
from wxpath.core.parser import Binary, Segment, Segments
from wxpath.core.runtime.helpers import parse_html
from wxpath.hooks.registry import FetchContext, get_hooks
from wxpath.http.client.crawler import Crawler
from wxpath.http.client.request import Request
from wxpath.util.logging import get_logger

log = get_logger(__name__)


class HookedEngineBase:
    """Common hook invocation helpers shared by engine variants."""

    async def post_fetch_hooks(self, body: bytes | str, task: CrawlTask) -> bytes | str | None:
        """Run registered `post_fetch` hooks over a fetched response body.

        Hooks may be synchronous or asynchronous and can transform or drop the
        response payload entirely.

        Args:
            body: Raw response body bytes from the crawler.
            task: The `CrawlTask` that produced the response.

        Returns:
            The transformed body, or `None` if any hook chooses to drop it.
        """
        for hook in get_hooks():
            hook_method = getattr(hook, "post_fetch", lambda _, b: b)
            if inspect.iscoroutinefunction(hook_method):
                body = await hook_method(
                    FetchContext(task.url, task.backlink, task.depth, task.segments),
                    body
                )
            else:
                body = hook_method(
                    FetchContext(task.url, task.backlink, task.depth, task.segments),
                    body
                )
            if not body:
                log.debug(f"hook {type(hook).__name__} dropped {task.url}")
                break
        return body

    async def post_parse_hooks(
        self, elem: HtmlElement | None, task: CrawlTask
    ) -> HtmlElement | None:
        """Run registered `post_parse` hooks on a parsed DOM element.

        Args:
            elem: Parsed `lxml` element to process.
            task: The originating `CrawlTask`.

        Returns:
            The transformed element, or `None` if a hook drops the branch.
        """
        for hook in get_hooks():
            hook_method = getattr(hook, "post_parse", lambda _, e: e)
            if inspect.iscoroutinefunction(hook_method):
                elem = await hook_method(
                    FetchContext(
                        url=task.url,
                        backlink=task.backlink,
                        depth=task.depth,
                        segments=task.segments
                    ),
                    elem,
                )
            else:
                elem = hook_method(
                    FetchContext(
                        url=task.url,
                        backlink=task.backlink,
                        depth=task.depth,
                        segments=task.segments
                    ),
                    elem,
                )
            if elem is None:
                log.debug(f"hook {type(hook).__name__} dropped {task.url}")
                break
        return elem

    async def post_extract_hooks(self, value: Any) -> Any | None:
        """Run registered `post_extract` hooks on extracted values.

        Args:
            value: The extracted datum to post-process.

        Returns:
            The transformed value, or `None` if a hook drops it.
        """
        for hook in get_hooks():
            hook_method = getattr(hook, "post_extract", lambda v: v)
            if inspect.iscoroutinefunction(hook_method):
                value = await hook_method(value)
            else:
                value = hook_method(value)
            if value is None:
                log.debug(f"hook {type(hook).__name__} dropped value")
                break
        return value


class WXPathEngine(HookedEngineBase):
    """Main class for executing wxpath expressions.

    The core pattern is to build a queue of CrawlTasks that are crawled and
    processed FIFO. Traversal of the queue (and therefore the web graph) is
    done concurrently in BFS-ish order.

    Args:
        crawler: Crawler instance to use for HTTP requests.
        concurrency: Number of concurrent fetches at the Crawler level.
        per_host: Number of concurrent fetches per host.
        respect_robots: Whether to respect robots.txt directives.
        allowed_response_codes: Set of allowed HTTP response codes. Defaults
            to ``{200}``. Responses may still be filtered and dropped.
        allow_redirects: Whether to follow HTTP redirects. Defaults to ``True``.
    """
    def __init__(
        self,
        crawler: Crawler | None = None,
        concurrency: int = 16,
        per_host: int = 8,
        respect_robots: bool = True,
        allowed_response_codes: set[int] = None,
        allow_redirects: bool = True,
    ):
        # NOTE: Will grow unbounded in large crawls. Consider a LRU cache, or bloom filter.
        self.seen_urls: set[str] = set()
        self.crawler = crawler or Crawler(
            concurrency=concurrency,
            per_host=per_host,
            respect_robots=respect_robots
        )
        self.allowed_response_codes = allowed_response_codes or {200}
        self.allow_redirects = allow_redirects
        if allow_redirects:
            self.allowed_response_codes |= {301, 302, 303, 307, 308}

    async def run(self, expression: str, max_depth: int) -> AsyncGenerator[Any, None]:
        """Execute a wxpath expression concurrently and yield results.

        Builds and drives a BFS-like crawl pipeline that honors robots rules,
        throttling, and hook callbacks while walking the web graph.

        Args:
            expression: WXPath expression string to evaluate.
            max_depth: Maximum crawl depth to follow for url hops.

        Yields:
            Extracted values produced by the expression (HTML elements or
            wxpath-specific value types).
        """
        segments = parser.parse(expression)

        queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
        inflight: dict[str, CrawlTask] = {}
        pending_tasks = 0

        def is_terminal():
            # NOTE: consider adopting state machine pattern for determining
            # the current state of the engine.
            return queue.empty() and pending_tasks <= 0

        async with self.crawler as crawler:
            async def submitter():
                nonlocal pending_tasks
                while True:
                    task = await queue.get()

                    if task is None:
                        break

                    if task.url in self.seen_urls or task.url in inflight:
                        queue.task_done()
                        continue

                    # Mark URL as seen immediately
                    self.seen_urls.add(task.url)
                    inflight[task.url] = task

                    pending_tasks += 1
                    crawler.submit(Request(task.url, max_retries=0))
                    queue.task_done()

            submit_task = asyncio.create_task(submitter())

            # Seed the pipeline with a dummy task
            seed_task = CrawlTask(
                elem=None,
                url=None,
                segments=segments,
                depth=-1,
                backlink=None,
            )
            async for output in self._process_pipeline(
                task=seed_task,
                elem=None,
                depth=seed_task.depth,
                max_depth=max_depth,
                queue=queue,
            ):
                yield await self.post_extract_hooks(output)

            # While looping asynchronous generators, you MUST make sure
            # to check terminal conditions before re-iteration.
            async for resp in crawler:
                task = inflight.pop(resp.request.url, None)
                pending_tasks -= 1

                if task is None:
                    log.warning(f"Got unexpected response from {resp.request.url}")
                    if is_terminal():
                        break
                    continue

                if resp.error:
                    log.warning(f"Got error from {resp.request.url}: {resp.error}")
                    if is_terminal():
                        break
                    continue

                # NOTE: Consider allowing redirects
                if resp.status not in self.allowed_response_codes or not resp.body:
                    log.warning(f"Got non-200 response from {resp.request.url}")
                    if is_terminal():
                        break
                    continue

                body = await self.post_fetch_hooks(resp.body, task)
                if not body:
                    if is_terminal():
                        break
                    continue

                elem = parse_html(
                    body,
                    base_url=task.url,
                    backlink=task.backlink,
                    depth=task.depth,
                )

                elem = await self.post_parse_hooks(elem, task)
                if elem is None:
                    if is_terminal():
                        break
                    continue

                if task.segments:
                    async for output in self._process_pipeline(
                        task=task,
                        elem=elem,
                        depth=task.depth,
                        max_depth=max_depth,
                        queue=queue,
                    ):
                        yield await self.post_extract_hooks(output)
                else:
                    yield await self.post_extract_hooks(elem)

                # Termination condition
                if is_terminal():
                    break

            submit_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await submit_task

    async def _process_pipeline(
        self,
        task: CrawlTask,
        elem: Any,
        depth: int,
        max_depth: int,
        queue: asyncio.Queue[CrawlTask],
    ) -> AsyncGenerator[Any, None]:
        """Process a queue of intents for a single crawl branch.

        Traverses wxpath segments depth-first within a page while coordinating
        newly discovered crawl intents back to the shared queue.

        Args:
            task: The originating crawl task for this branch.
            elem: Current DOM element (or extracted value) being processed.
            depth: Current traversal depth.
            max_depth: Maximum permitted crawl depth.
            queue: Shared crawl queue for enqueuing downstream URLs.

        Yields:
            object: Extracted values or processed elements as produced by operators.
        """
        mini_queue: deque[tuple[HtmlElement | Any, list[Binary | Segment] | Segments]] = deque(
            [(elem, task.segments)]
        )

        while mini_queue:
            elem, bin_or_segs = mini_queue.popleft()

            binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
            operator = get_operator(binary_or_segment)
            intents = operator(elem, bin_or_segs, depth)

            if not intents:
                return

            for intent in intents:
                if isinstance(intent, DataIntent):
                    yield intent.value

                elif isinstance(intent, CrawlIntent):
                    next_depth = task.depth + 1
                    # if intent.url not in self.seen_urls and next_depth <= max_depth:
                    if next_depth <= max_depth:
                        # self.seen_urls.add(intent.url)
                        log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
                        queue.put_nowait(
                            CrawlTask(
                                elem=None,
                                url=intent.url,
                                segments=intent.next_segments,
                                depth=next_depth,
                                backlink=task.url,
                            )
                        )

                elif isinstance(intent, (ExtractIntent, ProcessIntent, InfiniteCrawlIntent)):
                    # immediately traverse the extraction
                    elem = intent.elem
                    next_segments = intent.next_segments
                    mini_queue.append((elem, next_segments))


def wxpath_async(path_expr: str,
                 max_depth: int,
                 engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
    if engine is None:
        engine = WXPathEngine()
    return engine.run(path_expr, max_depth)


##### ASYNC IN SYNC #####
def wxpath_async_blocking_iter(
    path_expr: str,
    max_depth: int = 1,
    engine: WXPathEngine | None = None,
) -> Iterator[Any]:
    """Evaluate a wxpath expression using concurrent breadth-first traversal.

    Warning:
        Spins up its own event loop therefore this function must **not** be
        invoked from within an active asyncio event loop.

    Args:
        path_expr: A wxpath expression.
        max_depth: Maximum crawl depth. Must be at least the number of
            ``url*`` segments minus one.
        engine: Optional pre-configured WXPathEngine instance.

    Yields:
        object: Extracted objects (HtmlElement, WxStr, dict, or other values)
        produced by the expression evaluator.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    agen = wxpath_async(path_expr, max_depth=max_depth, engine=engine)

    try:
        while True:
            try:
                yield loop.run_until_complete(agen.__anext__())
            except StopAsyncIteration:
                break
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()


def wxpath_async_blocking(
    path_expr: str,
    max_depth: int = 1,
    engine: WXPathEngine | None = None,
) -> list[Any]:
    return list(
        wxpath_async_blocking_iter(path_expr, max_depth=max_depth, engine=engine)
    )
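A minimal usage sketch of the entry points above (illustrative only; the expression string is a placeholder, since the actual wxpath grammar is defined in wxpath/core/parser.py):

from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter

# Placeholder expression -- consult the wxpath parser/README for the real url*/segment syntax.
EXPR = "url('https://example.com')//a/@href"

engine = WXPathEngine(concurrency=8, per_host=4, respect_robots=True)
for item in wxpath_async_blocking_iter(EXPR, max_depth=1, engine=engine):
    print(item)  # HtmlElement, WxStr, dict, or other extracted values

Because wxpath_async_blocking_iter creates its own event loop, this form is only for synchronous callers; async code would iterate wxpath_async(...) directly.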
wxpath/core/runtime/helpers.py
ADDED
@@ -0,0 +1,41 @@
from lxml import etree, html

from wxpath import patches
from wxpath.util.logging import get_logger

log = get_logger(__name__)


def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
    elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
    if base_url:
        elem.getroottree().docinfo.URL = base_url  # make base-uri() work
        # Also set xml:base on the root element for XPath base-uri()
        elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
        elem.base_url = base_url  # sets both attribute and doc-level URL

    # NOTE: some pages may have multiple root elements, i.e.
    # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
    # This breaks elementpath. If elem has siblings, recreate the
    # root element and only the root element.
    if len(list(elem.itersiblings())) > 0:
        elem = detach_html_root(elem, base_url)

    for k, v in elem_kv_pairs.items():
        elem.set(k, str(v))
    return elem


def detach_html_root(elem, base_url=None):
    new_root = etree.HTML(
        etree.tostring(elem, encoding="utf-8"),
        parser=patches.html_parser_with_xpath3,
        base_url=base_url
    )

    if base_url:
        new_root.getroottree().docinfo.URL = base_url
        new_root.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
        new_root.base_url = base_url

    return new_root
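A small illustration of parse_html (a sketch, assuming wxpath and its lxml patches are importable): extra keyword pairs become stringified attributes on the root element, mirroring how the engine passes backlink and depth:

from wxpath.core.runtime.helpers import parse_html

root = parse_html(
    b"<html><body><a href='/next'>next</a></body></html>",
    base_url="https://example.com/",
    backlink=None,
    depth=0,
)
print(root.base_url)      # "https://example.com/" -- docinfo URL set by parse_html
print(root.get("depth"))  # "0" -- keyword values are stringified via str(v)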
wxpath/hooks/__init__.py
ADDED
@@ -0,0 +1,9 @@
from wxpath.hooks.builtin import JSONLWriter as JSONLWriter
from wxpath.hooks.builtin import SerializeXPathMapAndNodeHook as SerializeXPathMapAndNodeHook
from wxpath.hooks.registry import register as register

__all__ = [
    "JSONLWriter",
    "SerializeXPathMapAndNodeHook",
    "register",
]
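For orientation, a hook is any object exposing optional post_fetch / post_parse / post_extract methods, the duck-typed protocol used by HookedEngineBase in engine.py. A sketch of a custom hook follows; this diff does not show register()'s signature, so passing an instance is an assumption:

from wxpath.hooks import register

class DropTrackingPixels:
    """Example hook: prune tracking images before extraction runs."""

    def post_parse(self, ctx, elem):
        # ctx is the FetchContext(url, backlink, depth, segments) built by the engine.
        for img in elem.xpath("//img[contains(@src, 'pixel')]"):
            img.drop_tree()
        return elem  # returning None would drop the whole page

register(DropTrackingPixels())  # assumption: register() accepts a hook instance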
wxpath/hooks/builtin.py
ADDED
@@ -0,0 +1,113 @@
import atexit
import json
import os
import queue
import threading
import time

from elementpath.serialization import XPathMap, XPathNode

from wxpath.util.logging import get_logger

log = get_logger(__name__)


class SerializeXPathMapAndNodeHook:
    """
    Serialize XPathMap and XPathNode objects to plain Python types.
    This is enabled by default (once this module is imported).
    """
    def post_extract(self, value):
        if isinstance(value, (list, tuple, set)):
            return type(value)(self.post_extract(v) for v in value)
        if isinstance(value, XPathMap):
            return {k: self.post_extract(v) for k, v in value.items()}
        if isinstance(value, XPathNode):
            return self.post_extract(value.obj)
        return value


class JSONLWriter:
    """
    Efficient writer that mirrors items to an NDJSON file.
    - Non-blocking: post_extract enqueues and returns immediately.
    - Background thread flushes to disk.
    - Skips non-JSONable values (e.g., raw HtmlElement) by default.
    Customize _jsonable() to change behavior.
    """
    def __init__(self, path=None):
        self.path = path or os.getenv("WXPATH_OUT", "extractions.ndjson")
        self._q: "queue.Queue[str]" = queue.Queue(maxsize=10000)
        self._written = 0
        self._dropped = 0
        self._stop = False
        self._t = threading.Thread(target=self._writer, name="wxpath-ndjson-writer", daemon=True)
        self._t.start()
        atexit.register(self._shutdown)

    # ---- hook API ----
    def post_extract(self, value):
        js = self._jsonable(value)
        if js is not None:
            line = json.dumps(js, ensure_ascii=False, separators=(",", ":"))
            try:
                self._q.put_nowait(line)
            except queue.Full:
                self._dropped += 1
                if self._dropped in (1, 100, 1000) or self._dropped % 10000 == 0:
                    log.warning("NDJSON queue full; dropping items",
                                extra={"dropped": self._dropped, "written": self._written})
        return value  # always pass-through

    # ---- internals ----
    def _writer(self):
        # Open lazily to avoid creating files when nothing is produced.
        f = None
        try:
            last_flush = time.time()
            while not self._stop or not self._q.empty():
                try:
                    line = self._q.get(timeout=0.5)
                except queue.Empty:
                    line = None
                if line is not None:
                    if f is None:
                        f = open(self.path, "a", buffering=1, encoding="utf-8")  # line-buffered
                    f.write(line)
                    f.write("\n")
                    self._written += 1
                # periodic flush guard for OS buffers even with line buffering
                if f and (time.time() - last_flush) > 1.0:
                    f.flush()
                    last_flush = time.time()
        finally:
            if f:
                f.flush()
                f.close()
            if self._dropped:
                log.warning("NDJSON writer finished with drops",
                            extra={"dropped": self._dropped, "written": self._written})

    def _shutdown(self):
        self._stop = True
        if self._t.is_alive():
            self._t.join(timeout=2)

    def _jsonable(self, v):
        # Keep it conservative: only write JSON-friendly shapes by default.
        # You can relax this if you want to serialize HtmlElement metadata, etc.
        if v is None or isinstance(v, (bool, int, float, str, list, dict)):
            return v
        # Handle common wxpath types gently:
        # - WxStr: stringify
        if v.__class__.__name__ == "WxStr":
            return str(v)
        # - lxml HtmlElement: record minimal metadata instead of the whole DOM
        base_url = getattr(v, "base_url", None)
        tag = getattr(v, "tag", None)
        if base_url or tag:
            return {"_element": tag, "url": base_url}
        return None  # skip unknowns


NDJSONWriter = JSONLWriter
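A short wiring sketch for the built-in writer (same assumption as above that register() accepts an instance); the path argument takes precedence over the WXPATH_OUT environment variable default:

from wxpath.hooks import JSONLWriter, register

register(JSONLWriter(path="results.ndjson"))  # extracted items are mirrored as NDJSON lines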