wxpath 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/cli.py +57 -12
- wxpath/core/runtime/engine.py +87 -11
- wxpath/http/client/cache.py +43 -0
- wxpath/http/client/crawler.py +106 -22
- wxpath/http/client/request.py +1 -1
- wxpath/http/stats.py +6 -0
- wxpath/settings.py +108 -0
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/METADATA +140 -23
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/RECORD +13 -11
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/WHEEL +1 -1
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/entry_points.txt +0 -0
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/top_level.txt +0 -0
wxpath/cli.py
CHANGED
@@ -6,6 +6,7 @@ from wxpath.core import parser as wxpath_parser
 from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
 from wxpath.hooks import builtin, registry
 from wxpath.http.client.crawler import Crawler
+from wxpath.settings import SETTINGS
 from wxpath.util.serialize import simplify
 
 
@@ -15,9 +16,11 @@ def main():
     arg_parser.add_argument("expression", help="The wxpath expression")
     arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
     # debug
-    arg_parser.add_argument("--debug", action="store_true",
+    arg_parser.add_argument("--debug", action="store_true",
+                            help="Debug mode. Provides verbose runtime output and information")
     # verbose
-    arg_parser.add_argument("--verbose", action="store_true",
+    arg_parser.add_argument("--verbose", action="store_true",
+                            help="Verbose mode. Prints CLI level information")
 
     arg_parser.add_argument(
         "--concurrency",
@@ -44,17 +47,27 @@ def main():
         help="Respect robots.txt",
         default=True
     )
+    arg_parser.add_argument(
+        "--cache",
+        action="store_true",
+        help="Use cache",
+        default=False
+    )
+    arg_parser.add_argument(
+        "--cache-backend",
+        type=str,
+        help="Cache backend. Possible values: redis, sqlite",
+        default="sqlite"
+    )
+    arg_parser.add_argument(
+        "--cache-db-path-or-url",
+        type=str,
+        help="Path to cache database",
+        default="cache.db"
+    )
 
     args = arg_parser.parse_args()
 
-    if args.verbose:
-        segments = wxpath_parser.parse(args.expression)
-        print("parsed expression:\n\nSegments([")
-        for s in segments:
-            print(f"\t{s},")
-        print("])")
-        print()
-
     if args.debug:
         from wxpath import configure_logging
         configure_logging('DEBUG')
@@ -72,6 +85,29 @@ def main():
         print(f"Using custom headers: {custom_headers}")
         print()
 
+    if args.cache:
+        SETTINGS.http.client.cache.enabled = True
+        if args.cache_backend == "redis":
+            SETTINGS.http.client.cache.backend = "redis"
+            SETTINGS.http.client.cache.redis.address = args.cache_db_path_or_url
+        elif args.cache_backend == "sqlite":
+            SETTINGS.http.client.cache.backend = "sqlite"
+            SETTINGS.http.client.cache.sqlite.cache_name = args.cache_db_path_or_url
+
+    if args.verbose:
+        print(f"Using concurrency: {args.concurrency}")
+        print(f"Using concurrency per host: {args.concurrency_per_host}")
+        print(f"Using respect robots: {args.respect_robots}")
+        print(f"Using cache: {args.cache}")
+
+        segments = wxpath_parser.parse(args.expression)
+        print("parsed expression:\n\nSegments([")
+        for s in segments:
+            print(f"\t{s},")
+        print("])")
+        print()
+        print()
+
     crawler = Crawler(
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
@@ -81,11 +117,20 @@ def main():
     engine = WXPathEngine(crawler=crawler)
 
     try:
-        for r in wxpath_async_blocking_iter(
+        for r in wxpath_async_blocking_iter(
+                path_expr=args.expression,
+                max_depth=args.depth,
+                engine=engine):
             clean = simplify(r)
             print(json.dumps(clean, ensure_ascii=False), flush=True)
     except BrokenPipeError:
-
+        if args.verbose:
+            print("Pipe broken.")
+
+    if args.verbose:
+        print("Done. Printing crawl stats")
+        print(crawler._stats)
+    sys.exit(0)
 
 
 if __name__ == "__main__":
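For reference, the new `--cache`, `--cache-backend`, and `--cache-db-path-or-url` flags simply set fields on the global `SETTINGS` object before the `Crawler` and engine are constructed. A minimal sketch of the equivalent programmatic wiring (assumes the optional `cache-sqlite` extra is installed; the expression and cache path are placeholders):

```python
from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
from wxpath.http.client.crawler import Crawler
from wxpath.settings import SETTINGS

# Equivalent of: --cache --cache-backend sqlite --cache-db-path-or-url my_cache.db
SETTINGS.http.client.cache.enabled = True
SETTINGS.http.client.cache.backend = "sqlite"
SETTINGS.http.client.cache.sqlite.cache_name = "my_cache.db"  # placeholder path

crawler = Crawler(concurrency=4, per_host=2)
engine = WXPathEngine(crawler=crawler)

for r in wxpath_async_blocking_iter(
        path_expr="url('https://example.com')//a/@href",  # placeholder expression
        max_depth=1,
        engine=engine):
    print(r)
```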
wxpath/core/runtime/engine.py
CHANGED
@@ -5,6 +5,7 @@ from collections import deque
 from typing import Any, AsyncGenerator, Iterator
 
 from lxml.html import HtmlElement
+from tqdm import tqdm
 
 from wxpath import patches  # noqa: F401
 from wxpath.core import parser
@@ -157,7 +158,13 @@ class WXPathEngine(HookedEngineBase):
         if allow_redirects:
             self.allowed_response_codes |= {301, 302, 303, 307, 308}
 
-    async def run(
+    async def run(
+        self,
+        expression: str,
+        max_depth: int,
+        progress: bool = False,
+        yield_errors: bool = False,
+    ) -> AsyncGenerator[Any, None]:
         """Execute a wxpath expression concurrently and yield results.
 
         Builds and drives a BFS-like crawl pipeline that honors robots rules,
@@ -166,6 +173,7 @@ class WXPathEngine(HookedEngineBase):
         Args:
             expression: WXPath expression string to evaluate.
             max_depth: Maximum crawl depth to follow for url hops.
+            progress: Whether to display a progress bar.
 
         Yields:
             Extracted values produced by the expression (HTML elements or
@@ -182,6 +190,12 @@ class WXPathEngine(HookedEngineBase):
             # the current state of the engine.
             return queue.empty() and pending_tasks <= 0
 
+        total_yielded = 0
+        if progress:
+            pbar = tqdm(total=0)
+        else:
+            pbar = None
+
         async with self.crawler as crawler:
             async def submitter():
                 nonlocal pending_tasks
@@ -219,23 +233,48 @@ class WXPathEngine(HookedEngineBase):
                     depth=seed_task.depth,
                     max_depth=max_depth,
                     queue=queue,
+                    pbar=pbar,
                 ):
                     yield await self.post_extract_hooks(output)
 
            # While looping asynchronous generators, you MUST make sure
            # to check terminal conditions before re-iteration.
            async for resp in crawler:
+               if pbar is not None:
+                   pbar.update(1)
+                   pbar.refresh()
+
                task = inflight.pop(resp.request.url, None)
                pending_tasks -= 1
 
                if task is None:
                    log.warning(f"Got unexpected response from {resp.request.url}")
+
+                   if yield_errors:
+                       yield {
+                           "__type__": "error",
+                           "url": resp.request.url,
+                           "reason": "unexpected_response",
+                           "status": resp.body,
+                           "body": resp.body
+                       }
+
                    if is_terminal():
                        break
                    continue
 
                if resp.error:
                    log.warning(f"Got error from {resp.request.url}: {resp.error}")
+
+                   if yield_errors:
+                       yield {
+                           "__type__": "error",
+                           "url": resp.request.url,
+                           "reason": "network_error",
+                           "exception": str(resp.error),
+                           "status": resp.status,
+                           "body": resp.body
+                       }
                    if is_terminal():
                        break
                    continue
@@ -243,6 +282,16 @@ class WXPathEngine(HookedEngineBase):
                # NOTE: Consider allowing redirects
                if resp.status not in self.allowed_response_codes or not resp.body:
                    log.warning(f"Got non-200 response from {resp.request.url}")
+
+                   if yield_errors:
+                       yield {
+                           "__type__": "error",
+                           "url": resp.request.url,
+                           "reason": "bad_status",
+                           "status": resp.status,
+                           "body": resp.body
+                       }
+
                    if is_terminal():
                        break
                    continue
@@ -273,10 +322,18 @@ class WXPathEngine(HookedEngineBase):
                            depth=task.depth,
                            max_depth=max_depth,
                            queue=queue,
-
+                            pbar=pbar
+                        ):
+                            total_yielded += 1
+                            if pbar is not None:
+                                pbar.set_postfix(yielded=total_yielded, depth=task.depth,)
 
                            yield await self.post_extract_hooks(output)
                    else:
+                        total_yielded += 1
+                        if pbar is not None:
+                            pbar.set_postfix(yielded=total_yielded, depth=task.depth,)
+
                        yield await self.post_extract_hooks(elem)
 
                # Termination condition
@@ -287,6 +344,9 @@ class WXPathEngine(HookedEngineBase):
            with contextlib.suppress(asyncio.CancelledError):
                await submit_task
 
+        if pbar is not None:
+            pbar.close()
+
    async def _process_pipeline(
        self,
        task: CrawlTask,
@@ -294,6 +354,7 @@ class WXPathEngine(HookedEngineBase):
        depth: int,
        max_depth: int,
        queue: asyncio.Queue[CrawlTask],
+        pbar: tqdm = None
    ) -> AsyncGenerator[Any, None]:
        """Process a queue of intents for a single crawl branch.
 
@@ -331,9 +392,10 @@ class WXPathEngine(HookedEngineBase):
            elif isinstance(intent, CrawlIntent):
                next_depth = task.depth + 1
                # if intent.url not in self.seen_urls and next_depth <= max_depth:
-                if next_depth <= max_depth:
+                if next_depth <= max_depth and intent.url not in self.seen_urls:
                    # self.seen_urls.add(intent.url)
                    log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
+
                    queue.put_nowait(
                        CrawlTask(
                            elem=None,
@@ -343,6 +405,9 @@ class WXPathEngine(HookedEngineBase):
                            backlink=task.url,
                        )
                    )
+                    if pbar is not None:
+                        pbar.total += 1
+                        pbar.refresh()
 
            elif isinstance(intent, (ExtractIntent, ProcessIntent, InfiniteCrawlIntent)):
                # immediately traverse the extraction
@@ -351,19 +416,24 @@
                mini_queue.append((elem, next_segments))
 
 
-def wxpath_async(path_expr: str,
-                 max_depth: int,
-
+def wxpath_async(path_expr: str,
+                 max_depth: int,
+                 progress: bool = False,
+                 engine: WXPathEngine | None = None,
+                 yield_errors: bool = False
+                 ) -> AsyncGenerator[Any, None]:
    if engine is None:
        engine = WXPathEngine()
-    return engine.run(path_expr, max_depth)
+    return engine.run(path_expr, max_depth, progress=progress, yield_errors=yield_errors)
 
 
 ##### ASYNC IN SYNC #####
 def wxpath_async_blocking_iter(
    path_expr: str,
    max_depth: int = 1,
+    progress: bool = False,
    engine: WXPathEngine | None = None,
+    yield_errors: bool = False
 ) -> Iterator[Any]:
    """Evaluate a wxpath expression using concurrent breadth-first traversal.
 
@@ -383,7 +453,8 @@ def wxpath_async_blocking_iter(
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
-    agen = wxpath_async(path_expr, max_depth=max_depth,
+    agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress,
+                        engine=engine, yield_errors=yield_errors)
 
    try:
        while True:
@@ -399,8 +470,13 @@
 def wxpath_async_blocking(
    path_expr: str,
    max_depth: int = 1,
+    progress: bool = False,
    engine: WXPathEngine | None = None,
+    yield_errors: bool = False
 ) -> list[Any]:
-    return list(
-
-
+    return list(wxpath_async_blocking_iter(path_expr,
+                                           max_depth=max_depth,
+                                           progress=progress,
+                                           engine=engine,
+                                           yield_errors=yield_errors,
+                                           ))
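The `progress` and `yield_errors` keywords added here thread through `wxpath_async`, `wxpath_async_blocking_iter`, and `wxpath_async_blocking`. With `yield_errors=True`, failed fetches are yielded as plain dicts tagged `"__type__": "error"` alongside normal results. A hedged usage sketch (the expression is a placeholder):

```python
from wxpath.core.runtime.engine import wxpath_async_blocking_iter

results, errors = [], []
# progress=True draws a tqdm bar; yield_errors=True surfaces bad statuses and network errors
for item in wxpath_async_blocking_iter(
        "url('https://example.com')//a/@href",  # placeholder expression
        max_depth=1,
        progress=True,
        yield_errors=True):
    if isinstance(item, dict) and item.get("__type__") == "error":
        errors.append(item)   # carries "url", "reason", "status", "body"
    else:
        results.append(item)

print(f"{len(results)} results, {len(errors)} errors")
```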
wxpath/http/client/cache.py
ADDED
@@ -0,0 +1,43 @@
+try:
+    from aiohttp_client_cache import SQLiteBackend
+except ImportError:
+    CachedSession = None
+
+from wxpath.settings import SETTINGS
+from wxpath.util.logging import get_logger
+
+log = get_logger(__name__)
+
+CACHE_SETTINGS = SETTINGS.http.client.cache
+
+def get_cache_backend():
+    log.info("cache backend", extra={"backend": CACHE_SETTINGS.backend})
+    if CACHE_SETTINGS.backend == "redis":
+        from aiohttp_client_cache.backends.redis import RedisBackend
+        return RedisBackend(
+            expire_after=CACHE_SETTINGS.expire_after,
+            urls_expire_after=CACHE_SETTINGS.urls_expire_after or None,
+            allowed_methods=CACHE_SETTINGS.allowed_methods,
+            allowed_codes=CACHE_SETTINGS.allowed_codes,
+            include_headers=CACHE_SETTINGS.include_headers,
+            ignored_parameters=CACHE_SETTINGS.ignored_parameters,
+            **CACHE_SETTINGS.redis
+            # cache_name=CACHE_SETTINGS.redis.cache_name,
+            # host=CACHE_SETTINGS.redis.host,
+            # port=CACHE_SETTINGS.redis.port,
+            # db=CACHE_SETTINGS.redis.db,
+            # cache_control=CACHE_SETTINGS.cache_control,
+        )
+    elif CACHE_SETTINGS.backend == "sqlite":
+        return SQLiteBackend(
+            cache_name=CACHE_SETTINGS.sqlite.cache_name,
+            expire_after=CACHE_SETTINGS.expire_after,
+            urls_expire_after=CACHE_SETTINGS.urls_expire_after or None,
+            allowed_methods=CACHE_SETTINGS.allowed_methods,
+            allowed_codes=CACHE_SETTINGS.allowed_codes,
+            include_headers=CACHE_SETTINGS.include_headers,
+            ignored_parameters=CACHE_SETTINGS.ignored_parameters,
+            # cache_control=CACHE_SETTINGS.cache_control,
+        )
+    else:
+        raise ValueError(f"Unknown cache backend: {CACHE_SETTINGS.backend}")
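`get_cache_backend()` is driven entirely by `SETTINGS.http.client.cache`, so switching backends is a settings change rather than a code change. A sketch of selecting the Redis backend (assumes `aiohttp-client-cache[redis]` is installed and a Redis server is reachable at the configured address):

```python
from wxpath.settings import SETTINGS
from wxpath.http.client.cache import get_cache_backend

cache_cfg = SETTINGS.http.client.cache
cache_cfg.enabled = True
cache_cfg.backend = "redis"
cache_cfg.redis.address = "redis://localhost:6379/0"  # example address

backend = get_cache_backend()  # an aiohttp_client_cache RedisBackend built from the settings
print(type(backend).__name__)
```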
wxpath/http/client/crawler.py
CHANGED
@@ -1,3 +1,10 @@
+import aiohttp
+
+try:
+    from aiohttp_client_cache import CachedSession
+except ImportError:
+    CachedSession = None
+
 import asyncio
 import time
 import urllib.parse
@@ -5,21 +12,52 @@ from collections import defaultdict
 from socket import gaierror
 from typing import AsyncIterator
 
-import
-
+from wxpath.http.client.cache import get_cache_backend
 from wxpath.http.client.request import Request
 from wxpath.http.client.response import Response
 from wxpath.http.policy.retry import RetryPolicy
 from wxpath.http.policy.robots import RobotsTxtPolicy
 from wxpath.http.policy.throttler import AbstractThrottler, AutoThrottler
 from wxpath.http.stats import CrawlerStats, build_trace_config
+from wxpath.settings import SETTINGS
 from wxpath.util.logging import get_logger
 
 log = get_logger(__name__)
 
-
-
-
+CACHE_SETTINGS = SETTINGS.http.client.cache
+CRAWLER_SETTINGS = SETTINGS.http.client.crawler
+
+def get_async_session(
+    headers: dict | None = None,
+    timeout: aiohttp.ClientTimeout | None = None,
+    connector: aiohttp.TCPConnector | None = None,
+    trace_config: aiohttp.TraceConfig | None = None
+) -> aiohttp.ClientSession:
+    """
+    Create and return a new aiohttp session. If aiohttp-client-cache is available
+    and enabled, return a new CachedSession bound to the configured SQLite backend.
+    The caller is responsible for closing the session.
+    """
+
+    if timeout is None:
+        timeout = aiohttp.ClientTimeout(total=CRAWLER_SETTINGS.timeout)
+
+    if CACHE_SETTINGS.enabled and CachedSession:
+        log.info("using aiohttp-client-cache")
+        return CachedSession(
+            cache=get_cache_backend(),
+            headers=headers,
+            timeout=timeout,
+            connector=connector,
+            trace_configs=[trace_config] if trace_config is not None else None
+        )
+
+    return aiohttp.ClientSession(
+        headers=headers,
+        timeout=timeout,
+        connector=connector,
+        trace_configs=[trace_config] if trace_config is not None else None
+    )
 
 
 class Crawler:
@@ -27,33 +65,55 @@ class Crawler:
 
     def __init__(
         self,
-        concurrency: int =
-        per_host: int =
-        timeout: int =
+        concurrency: int = None,
+        per_host: int = None,
+        timeout: int = None,
         *,
         headers: dict | None = None,
         proxies: dict | None = None,
         retry_policy: RetryPolicy | None = None,
         throttler: AbstractThrottler | None = None,
         auto_throttle_target_concurrency: float = None,
-        auto_throttle_start_delay: float =
-        auto_throttle_max_delay: float =
+        auto_throttle_start_delay: float = None,
+        auto_throttle_max_delay: float = None,
         respect_robots: bool = True,
     ):
-
+        cfg = CRAWLER_SETTINGS
+
+        self.concurrency = concurrency if concurrency is not None else cfg.concurrency
+        self.per_host = per_host if per_host is not None else cfg.per_host
+
+        timeout = timeout if timeout is not None else cfg.timeout
         self._timeout = aiohttp.ClientTimeout(total=timeout)
-        self._headers = HEADERS | (headers or {})  # merge headers
-        self._proxies = proxies if (isinstance(proxies, defaultdict) or proxies) else {}
-        self.respect_robots = respect_robots
 
+        self._headers = cfg.headers | (headers or {})  # merge headers
+
+        _proxies = proxies if proxies is not None else cfg.proxies
+        self._proxies = _proxies if (isinstance(_proxies, defaultdict) or _proxies) else {}
+
         self.retry_policy = retry_policy or RetryPolicy()
+
+        # auto-throttle defaults
+        auto_throttle_target_concurrency = auto_throttle_target_concurrency \
+            if auto_throttle_target_concurrency is not None \
+            else cfg.auto_throttle_target_concurrency
+
+        auto_throttle_start_delay = auto_throttle_start_delay \
+            if auto_throttle_start_delay is not None \
+            else cfg.auto_throttle_start_delay
+
+        auto_throttle_max_delay = auto_throttle_max_delay \
+            if auto_throttle_max_delay is not None \
+            else cfg.auto_throttle_max_delay
+
         self.throttler = throttler or AutoThrottler(
-            target_concurrency=auto_throttle_target_concurrency or concurrency/4.0,
+            target_concurrency=auto_throttle_target_concurrency or self.concurrency/4.0,
            start_delay=auto_throttle_start_delay,
            max_delay=auto_throttle_max_delay,
        )
-
-        self.
+
+        self._sem_global = asyncio.Semaphore(self.concurrency)
+        self._sem_host = defaultdict(lambda: asyncio.Semaphore(self.per_host))
 
        self._pending: asyncio.Queue[Request] = asyncio.Queue()
        self._results: asyncio.Queue[Response] = asyncio.Queue()
@@ -62,18 +122,31 @@ class Crawler:
        self._workers: list[asyncio.Task] = []
        self._closed = False
        self._stats = CrawlerStats()
+
+        self.respect_robots = respect_robots if respect_robots is not None else cfg.respect_robots
        self._robots_policy: RobotsTxtPolicy | None = None
 
+        # WARN: If SQLiteBackend caching is enabled and min(concurrency, per_host) > 1,
+        # write-contention is likely to occur.
+        if (CACHE_SETTINGS.enabled
+                and CACHE_SETTINGS.backend == "sqlite"
+                and min(self.concurrency, self.per_host) > 1
+        ):
+            log.warning(
+                "SQLiteBackend caching is enabled and min(concurrency, per_host) > 1. "
+                "Write-contention is likely to occur. Consider using RedisBackend."
+            )
+
    def build_session(self) -> aiohttp.ClientSession:
        """Construct an `aiohttp.ClientSession` with tracing and pooling."""
        trace_config = build_trace_config(self._stats)
        # Need to build the connector as late as possible as it requires the loop
        connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
-        return
-            headers=self._headers,
-            timeout=self._timeout,
-            connector=connector,
-
+        return get_async_session(
+            headers=self._headers,
+            timeout=self._timeout,
+            connector=connector,
+            trace_config=trace_config
        )
 
    async def __aenter__(self) -> "Crawler":
@@ -82,6 +155,7 @@ class Crawler:
        # self._session = aiohttp.ClientSession(timeout=self._timeout)
        self._session = self.build_session()
 
+        # Note: Set robots policy after session is created
        if self.respect_robots:
            self._robots_policy = RobotsTxtPolicy(self._session)
 
@@ -184,12 +258,22 @@ class Crawler:
 
        start = time.monotonic()
        try:
+            log.info("fetching", extra={"url": req.url})
            async with self._session.get(
                req.url,
                headers=self._headers | req.headers,
                proxy=self._proxy_for(req.url),
                timeout=req.timeout or self._timeout,
            ) as resp:
+                from_cache = getattr(resp, "from_cache", False)
+                if from_cache:
+                    # NOTE: This is a bit of a hack, but it works. aiohttp-client-cache does not
+                    # interface with TraceConfigs on cache hit, so we have to do it here.
+                    self._stats.requests_cache_hit += 1
+                    log.info("[CACHE HIT]", extra={"req.url": req.url, "resp.url": resp.url})
+                else:
+                    log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})
+
                body = await resp.read()
 
                latency = time.monotonic() - start
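Because the constructor defaults now come from `CRAWLER_SETTINGS`, a `Crawler()` built with no arguments picks up concurrency, per-host limits, timeout, headers, and robots behavior from `wxpath/settings.py`. A sketch of overriding those defaults globally before constructing the crawler (the User-Agent value is a placeholder):

```python
from wxpath.http.client.crawler import Crawler
from wxpath.settings import CRAWLER_SETTINGS

# Override the settings-backed defaults before any Crawler is created
CRAWLER_SETTINGS.concurrency = 4
CRAWLER_SETTINGS.per_host = 2
CRAWLER_SETTINGS.headers = {"User-Agent": "my-app/0.1 (contact: you@example.com)"}

crawler = Crawler()  # no arguments: falls back to CRAWLER_SETTINGS
print(crawler.concurrency, crawler.per_host)
```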
wxpath/http/client/request.py
CHANGED
wxpath/http/stats.py
CHANGED
@@ -16,6 +16,7 @@ class CrawlerStats:
     requests_enqueued: int = 0
     requests_started: int = 0
     requests_completed: int = 0
+    requests_cache_hit: int = 0
 
     # ---- Concurrency ----
     in_flight_global: int = 0
@@ -57,6 +58,9 @@ def build_trace_config(stats: CrawlerStats) -> TraceConfig:
         context._start_time = time.monotonic()
 
     async def on_request_end(session, context, params):
+        """
+        Update stats on request completion.
+        """
         host = params.url.host
         stats.in_flight_global -= 1
         stats.in_flight_per_host[host] -= 1
@@ -82,6 +86,8 @@ def build_trace_config(stats: CrawlerStats) -> TraceConfig:
         if not hasattr(stats, "bytes_received"):
             stats.bytes_received = 0
         stats.bytes_received += content_length
+
+        stats.requests_completed += 1
 
     async def on_request_exception(session, context, params):
         host = params.url.host
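The new `requests_cache_hit` counter (together with the `requests_completed` increment in `on_request_end`) makes cache effectiveness visible on the crawler's `CrawlerStats` object. A small sketch of inspecting it after a run, assuming you hold on to the `Crawler` instance passed to the engine (the expression is a placeholder):

```python
from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
from wxpath.http.client.crawler import Crawler

crawler = Crawler()
engine = WXPathEngine(crawler=crawler)
list(wxpath_async_blocking_iter("url('https://example.com')//a/@href",
                                max_depth=1, engine=engine))

stats = crawler._stats  # CrawlerStats dataclass populated by the trace config
print(stats.requests_completed, stats.requests_cache_hit)
```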
wxpath/settings.py
ADDED
@@ -0,0 +1,108 @@
+"""
+Settings for wxpath.
+
+These settings are global and can be accessed from any module in the wxpath package.
+
+They are typically used by various modules to configure Class initializers.
+
+The SETTINGS dict structure follows the structure of wxpath submodules.
+
+Expected usage behavior:
+
+```python
+from wxpath.settings import SETTINGS
+
+CACHE_SETTINGS = SETTINGS.http.client.cache
+```
+
+Once initialized, the settings are expected to be immutable (not enforced).
+"""
+
+from datetime import timedelta
+
+# Settings match
+SETTINGS = {
+    'http': {
+        'client': {
+            'cache': {
+                'enabled': False,
+                # 'db_path': 'cache.db',
+                'expire_after': timedelta(days=7),
+                'urls_expire_after': None,
+                'allowed_methods': ("GET", "HEAD"),
+                'allowed_codes': (200, 203, 301, 302, 307, 308),
+                'ignored_parameters': ["utm_*", "fbclid"],
+                'include_headers': False,  # don't vary cache keys on headers by default
+                'cache_control': False,  # honor Cache-Control/Expires if present
+                # # TODO: size hedges (soft, enforced by wxpath)
+                # 'max_entries': None,  # e.g. 1_000_000
+                # 'max_response_size': None,  # bytes, e.g. 2_000_000
+                # 'max_db_size': None,  # bytes, e.g. 5 * 1024**3
+                'backend': "sqlite",
+                'sqlite': {
+                    'cache_name': "cache.db",
+                },
+                'redis': {
+                    # 'host': "localhost",
+                    # 'port': 6379,
+                    # 'db': 0,
+                    'address': 'redis://localhost:6379/0',
+                    'cache_name': "wxpath:",
+                }
+            },
+            'crawler': {
+                'concurrency': 16,
+                'per_host': 8,
+                'timeout': 15,
+                'headers': {
+                    "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
+                                   "AppleWebKit/537.36 (KHTML, like Gecko) "
+                                   "Chrome/142.0.0.0 Safari/537.36")},
+                'proxies': None,
+                'auto_throttle_target_concurrency': None,
+                'auto_throttle_start_delay': 0.25,
+                'auto_throttle_max_delay': 10.0,
+                'respect_robots': True,
+            },
+        },
+    },
+}
+
+
+class AttrDict(dict):
+    """
+    A dictionary subclass that allows dot-notation access while
+    recursively converting nested dictionaries.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Point the instance __dict__ to itself to allow attribute access
+        self.__dict__ = self
+        # Recursively convert any dicts passed during initialization
+        for key, value in self.items():
+            self[key] = self._convert(value)
+
+    @classmethod
+    def _convert(cls, value):
+        """Recursively converts dicts to AttrDicts, leaving other types alone."""
+        if isinstance(value, dict):
+            return cls(value)
+        elif isinstance(value, list):
+            # Optional: converts dicts inside lists while keeping the list container
+            return [cls._convert(item) for item in value]
+        return value
+
+    def __setitem__(self, key, value):
+        # Ensure that new items added via dict-syntax are also converted
+        super().__setitem__(key, self._convert(value))
+
+    def __getattr__(self, key):
+        try:
+            return self[key]
+        except KeyError as exc:
+            raise AttributeError(f"AttrDict object has no attribute '{key}'") from exc
+
+
+SETTINGS = AttrDict(SETTINGS)
+CACHE_SETTINGS = SETTINGS.http.client.cache
+CRAWLER_SETTINGS = SETTINGS.http.client.crawler
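`AttrDict` is what makes the `SETTINGS.http.client.cache` dot-path style work: nested plain dicts are converted recursively at construction time, and values assigned later through `__setitem__` are converted as well. A small sketch of that behavior in isolation:

```python
from wxpath.settings import AttrDict

cfg = AttrDict({"http": {"client": {"cache": {"enabled": False}}}})

print(cfg.http.client.cache.enabled)        # False; nested dicts became AttrDicts
cfg["http"]["client"]["cache"]["enabled"] = True
print(cfg.http.client.cache.enabled)        # True; dict-style and attribute access share storage

cfg["new_section"] = {"answer": 42}         # dicts assigned via __setitem__ are converted too
print(cfg.new_section.answer)               # 42
```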
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.3.0
+Version: 0.4.1
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
@@ -10,6 +10,13 @@ License-File: LICENSE
 Requires-Dist: lxml>=4.0
 Requires-Dist: elementpath<=5.0.3,>=5.0.0
 Requires-Dist: aiohttp<=3.12.15,>=3.8.0
+Requires-Dist: tqdm>=4.0.0
+Provides-Extra: cache
+Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
+Provides-Extra: cache-sqlite
+Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
+Provides-Extra: cache-redis
+Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
@@ -23,9 +30,42 @@ Dynamic: license-file
 
 **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
 
-
+This expression fetches a page, extracts links, and streams them concurrently - no crawl loop required:
 
-
+```python
+import wxpath
+
+expr = "url('https://example.com')//a/@href"
+
+for link in wxpath.wxpath_async_blocking_iter(expr):
+    print(link)
+```
+
+
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction:
+
+```python
+import wxpath
+
+path_expr = """
+url('https://quotes.toscrape.com')
+    ///url(//a/@href)
+        //a/@href
+"""
+
+for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
+    print(item)
+```
+
+
+## Why wxpath?
+
+Most web scrapers force you to write crawl control flow first, and extraction second.
+
+**wxpath** inverts that:
+- **You describe traversal declaratively**
+- **Extraction is expressed inline**
+- **The engine handles scheduling, concurrency, and deduplication**
 
 
 ## Contents
@@ -38,7 +78,10 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Polite Crawling](#polite-crawling)
 - [Output types](#output-types)
 - [XPath 3.1](#xpath-31-by-default)
+- [Progress Bar](#progress-bar)
 - [CLI](#cli)
+- [Persistence and Caching](#persistence-and-caching)
+- [Settings](#settings)
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](EXAMPLES.md)
@@ -46,7 +89,8 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
-- [Commercial support
+- [Commercial support/consulting](#commercial-supportconsulting)
+- [Versioning](#versioning)
 - [License](#license)
 
 
@@ -54,32 +98,31 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 
 ```python
 import wxpath
+from wxpath.settings import CRAWLER_SETTINGS
+
+# Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
+CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}
 
 # Crawl, extract fields, build a knowledge graph
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-
-
-
-
-
-
-
+    ///url(
+        //main//a/@href[
+            starts-with(., '/wiki/') and not(contains(., ':'))
+        ]
+    )
+    /map{
+        'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+        'url': string(base-uri(.)),
+        'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+        'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
+    }
 """
 
 for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
     print(item)
 ```
 
-Output:
-
-```python
-map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
-map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
-map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
-...
-```
-
 **Note:** Some sites (including Wikipedia) may block requests without proper headers.
 See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
 
@@ -195,6 +238,17 @@ path_expr = """
 # ...]
 ```
 
+## Progress Bar
+
+**wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.
+
+Enable by setting `engine.run(..., progress=True)`, or pass `progress=True` to any of the `wxpath_async*(...)` functions.
+
+```python
+items = wxpath.wxpath_async_blocking("...", progress=True)
+> 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
+```
+
 
 ## CLI
 
@@ -237,9 +291,46 @@ Command line options:
 --concurrency-per-host <concurrency>   Number of concurrent fetches per host
 --header "Key:Value"                   Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
 --respect-robots [true|false]          (Default: True) Respects robots.txt
+--cache [true|false]                   (Default: False) Persist crawl results to a local database
+```
+
+
+## Persistence and Caching
+
+**wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs, and you decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.
+
+**wxpath** supports two backends: sqlite and redis. SQLite is great for small-scale crawls, with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is great for large-scale crawls, with multiple workers. You will encounter a warning if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1` when using the sqlite backend.
+
+To use, you must install the appropriate optional dependency:
+
+```bash
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
+Once the dependency is installed, you must enable the cache:
+
+```python
+from wxpath.settings import SETTINGS
+
+# To enable caching; sqlite is the default
+SETTINGS.http.client.cache.enabled = True
+
+# For redis backend
+SETTINGS.http.client.cache.enabled = True
+SETTINGS.http.client.cache.backend = "redis"
+SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"
+
+# Run wxpath as usual
+items = list(wxpath_async_blocking_iter('...', max_depth=1, engine=engine))
 ```
 
 
+## Settings
+
+See [settings.py](src/wxpath/settings.py) for details of the settings.
+
+
 ## Hooks (Experimental)
 
 **wxpath** supports a pluggable hook system that allows you to modify the crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks will be executed in the order they are registered. Hooks may impact performance.
@@ -290,6 +381,13 @@ Requires Python 3.10+.
 pip install wxpath
 ```
 
+For persistence/caching, wxpath supports the following backends:
+
+```
+pip install wxpath[cache-sqlite]
+pip install wxpath[cache-redis]
+```
+
 
 ## More Examples
 
@@ -336,6 +434,17 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//mai
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
 
+### Runtime API (`wxpath_async*`) options
+
+- `max_depth`: int = 1
+- `progress`: bool = False
+- `engine`: WXPathEngine | None = None
+- `yield_errors`: bool = False
+
+
+### Settings
+You can also use [settings.py](src/wxpath/settings.py) to enable caching, throttling, concurrency and more.
+
 
 ## Project Philosophy
 
@@ -345,7 +454,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Stay lightweight and composable
 - Asynchronous support for high-performance crawls
 
-###
+### Goals
 
 - URLs are deduplicated on a best-effort, per-crawl basis.
 - Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
@@ -356,7 +465,6 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 
 The following features are not yet supported:
 
-- Persistent scheduling or crawl resumption
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 - Strict result ordering
@@ -364,13 +472,15 @@ The following features are not yet supported:
 
 ## WARNINGS!!!
 
+This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
+
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
 - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
 
-## Commercial support
+## Commercial support/consulting
 
 If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
 
@@ -379,6 +489,13 @@ If you want help building or operating crawlers/data feeds with wxpath (extracti
 
 If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).
 
+
+## Versioning
+
+**wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.
+
+However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`.
+
 ## License
 
 MIT
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/RECORD
CHANGED
@@ -1,22 +1,24 @@
 wxpath/__init__.py,sha256=w1hFE_VSIYq_TSFLoPfp6MJbG1sA6BeChX6PYsXIK4o,265
-wxpath/cli.py,sha256=
+wxpath/cli.py,sha256=e0-mHkpuC1B_WyJw7wH43UBmtuF8oL8phQ4GEzUX0Ns,4332
 wxpath/patches.py,sha256=u0dOL-K-gvdO9SJvzGrqR9Zou6XduWjl6R7mzIcZtJg,2130
+wxpath/settings.py,sha256=a4TlCAOvmO03oOXiiYQzIDBMZU0XpTqntwnjVsumnas,3809
 wxpath/core/__init__.py,sha256=U9_In2iRaZrpiIVavIli1M59gCB6Kn1en-1Fza-qIiI,257
 wxpath/core/dom.py,sha256=X0L3n8jRfO5evEypDaJTD-NQ3cLXWvnEUVERAHo3vV0,701
 wxpath/core/models.py,sha256=3KYt-UwfLY2FlSRUHeA_getnYaNUMPW9wRrl2CRbPso,1611
 wxpath/core/ops.py,sha256=PTjX6c4QvCqGaByYYqaK4dte5iWO3lZzgqGrMXp6f6g,9727
 wxpath/core/parser.py,sha256=WfjQNixBz7nWtX2O0t19MOhUJmzGMg8Qol40P6oC8zc,18827
 wxpath/core/runtime/__init__.py,sha256=_iCgkIWxXvxzQcenHOsjYGsk74HboTIYWOtgM8GtCyc,86
-wxpath/core/runtime/engine.py,sha256=
+wxpath/core/runtime/engine.py,sha256=UQ8wSr49TJibRRtXzIgXVSBvuB1VttYicKEwV4xcG6Q,17345
 wxpath/core/runtime/helpers.py,sha256=M1i4BryCktAxeboa4LOXMTNiKVCJLDBD-KpWCQXadpw,1434
 wxpath/hooks/__init__.py,sha256=9JG63e4z_8CZLWugFcY786hebaEEPZ5FmZhyDHat-98,294
 wxpath/hooks/builtin.py,sha256=GJ4w1C9djWNzAmAA3U0qI9OoCOeC5R8tEGtWXJVHSYs,4125
 wxpath/hooks/registry.py,sha256=-D11f_mMboeVAH8qsTkbKTQ0aGNaQ7F6zbXDsOIYxN0,4513
 wxpath/http/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxpath/http/stats.py,sha256=
+wxpath/http/stats.py,sha256=aqZWuybc5RCv-AmKdNbEX4uw1YvZtFoE6591UfukZns,3319
 wxpath/http/client/__init__.py,sha256=QpdmqzcznUeuFvT3IIo-LmBUUHEa2BDq9sHGAHJnDLI,202
-wxpath/http/client/
-wxpath/http/client/
+wxpath/http/client/cache.py,sha256=cHS4XlfOStoHTG83ypNITk3Oc0lqGoTRqV0_UWBWQFY,1811
+wxpath/http/client/crawler.py,sha256=UiKtc5K2KBc0bBw2fTdRHLNTa2OFoE1tZsDjR7J4Xeo,12126
+wxpath/http/client/request.py,sha256=cpqo_ASG_wKz0q6m33lsE0kIIthfANt8fx7ptxlyehY,1057
 wxpath/http/client/response.py,sha256=z9LQPnDN-NZRnQpIKozaWCqgpRejc6nixCr_XaPyqUQ,334
 wxpath/http/policy/backoff.py,sha256=NwdUR6bRe1RtUGSJOktj-p8IyC1l9xu_-Aa_Gj_u5sw,321
 wxpath/http/policy/retry.py,sha256=WSrQfCy1F7IcXFpVGDi4HTphNhFq12p4DaMO0_4dgrw,982
@@ -25,9 +27,9 @@ wxpath/http/policy/throttler.py,sha256=wydMFV-0mxpHSI5iYkLfE78oY4z_fF8jW9MqCeb8G
 wxpath/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxpath/util/logging.py,sha256=oQi8sp7yKWgXkkcJ4U4WHp7TyBCQiK4VhSXOSb8pGw0,2965
 wxpath/util/serialize.py,sha256=uUs4C9VErpFd97smBM2bRWo2nW25kCgKdsMrVtVxhg8,575
-wxpath-0.
-wxpath-0.
-wxpath-0.
-wxpath-0.
-wxpath-0.
-wxpath-0.
+wxpath-0.4.1.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
+wxpath-0.4.1.dist-info/METADATA,sha256=LxmOTsWpspYFedvP02fDL1Wy5t1ygZKuIg2cHVQU_aY,19445
+wxpath-0.4.1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+wxpath-0.4.1.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
+wxpath-0.4.1.dist-info/top_level.txt,sha256=uFCcveG78mnefxRGvYsR2OexDlKR_Z1UD4vZijUcex8,7
+wxpath-0.4.1.dist-info/RECORD,,
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/entry_points.txt: File without changes
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/licenses/LICENSE: File without changes
{wxpath-0.3.0.dist-info → wxpath-0.4.1.dist-info}/top_level.txt: File without changes