wxpath-0.4.0-py3-none-any.whl → wxpath-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +2 -0
- wxpath/cli.py +6 -0
- wxpath/core/models.py +1 -0
- wxpath/core/ops.py +9 -12
- wxpath/core/parser.py +92 -23
- wxpath/core/runtime/engine.py +79 -8
- wxpath/core/runtime/helpers.py +6 -3
- wxpath/http/client/__init__.py +1 -1
- wxpath/http/client/crawler.py +19 -7
- wxpath/http/client/request.py +1 -1
- wxpath/http/client/response.py +7 -1
- wxpath/http/policy/retry.py +2 -2
- wxpath/integrations/__init__.py +0 -0
- wxpath/integrations/langchain/__init__.py +0 -0
- wxpath/integrations/langchain/examples/basic_rag.py +85 -0
- wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
- wxpath/integrations/langchain/loader.py +60 -0
- wxpath/patches.py +215 -5
- wxpath/settings.py +3 -1
- wxpath/tui.py +1204 -0
- wxpath/tui_settings.py +151 -0
- wxpath/util/cleaners.py +31 -0
- wxpath/util/common_paths.py +22 -0
- wxpath/util/logging.py +3 -7
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/METADATA +123 -19
- wxpath-0.5.0.dist-info/RECORD +44 -0
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/WHEEL +1 -1
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/entry_points.txt +1 -0
- wxpath-0.4.0.dist-info/RECORD +0 -35
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/top_level.txt +0 -0
wxpath/__init__.py
CHANGED
```diff
@@ -1,3 +1,4 @@
+from . import settings
 from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
 from .util.logging import configure_logging
 
@@ -6,4 +7,5 @@ __all__ = [
     'wxpath_async_blocking',
     'wxpath_async_blocking_iter',
     'configure_logging',
+    'settings',
 ]
```
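The package root now re-exports `settings` alongside the existing entry points. A minimal sketch of what that enables (the no-argument `configure_logging()` call is an assumption, not a documented signature):

```python
import wxpath

wxpath.configure_logging()  # assumed no-arg call; exported since 0.4.0
print(wxpath.settings)      # the settings module is newly re-exported in 0.5.0
```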
wxpath/cli.py
CHANGED
```diff
@@ -47,6 +47,11 @@ def main():
         help="Respect robots.txt",
         default=True
     )
+    arg_parser.add_argument(
+        "--insecure",
+        action="store_true",
+        help="Disable SSL certificate verification (use for sites with broken chains)",
+    )
     arg_parser.add_argument(
         "--cache",
         action="store_true",
@@ -112,6 +117,7 @@ def main():
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
         respect_robots=args.respect_robots,
+        verify_ssl=not args.insecure,
         headers=custom_headers
     )
     engine = WXPathEngine(crawler=crawler)
```
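A condensed, self-contained sketch of the new flag's plumbing, using only the argument definition shown above (the surrounding options from `main()` are omitted):

```python
import argparse

arg_parser = argparse.ArgumentParser(prog="wxpath")
arg_parser.add_argument(
    "--insecure",
    action="store_true",
    help="Disable SSL certificate verification (use for sites with broken chains)",
)

args = arg_parser.parse_args(["--insecure"])
verify_ssl = not args.insecure  # main() passes this as Crawler(verify_ssl=...)
assert verify_ssl is False
```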
wxpath/core/models.py
CHANGED
wxpath/core/ops.py
CHANGED
```diff
@@ -19,6 +19,7 @@ from wxpath.core.parser import (
     Binary,
     Call,
     ContextItem,
+    Depth,
     Segment,
     Segments,
     String,
@@ -78,7 +79,10 @@ def get_operator(
 
 
 @register('url', (String,))
+@register('url', (String, Depth))
 @register('url', (String, Xpath))
+@register('url', (String, Depth, Xpath))
+@register('url', (String, Xpath, Depth))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
                         curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +91,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,
 
     next_segments = curr_segments[1:]
 
-
+    # NOTE: Expects parser to produce UrlCrawl node in expressions
+    # that look like `url('...', follow=//a/@href)`
+    if isinstance(url_call, UrlCrawl):
+        xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
         _segments = [
-            UrlCrawl('///url', [
+            UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
         ] + next_segments
 
     yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,16 +119,6 @@ def _handle_xpath(curr_elem: html.HtmlElement,
         raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
     base_url = getattr(curr_elem, 'base_url', None)
     log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
-    _backlink_str = f"string('{curr_elem.get('backlink')}')"
-    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
-    # increment after each url*() hop
-    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
-    expr = expr.replace('wx:backlink()', _backlink_str)
-    expr = expr.replace('wx:backlink(.)', _backlink_str)
-    expr = expr.replace('wx:depth()', _depth_str)
-    expr = expr.replace('wx:depth(.)', _depth_str)
-
     elems = curr_elem.xpath3(expr)
 
     next_segments = curr_segments[1:]
```
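The fact that `(String, Depth)` and the two three-argument orderings had to be registered explicitly suggests the registry dispatches on the exact tuple of parsed argument types. A generic illustration of that pattern — the names `register`/`get_operator` match the diff, but the body is an assumption, not wxpath's actual registry:

```python
from typing import Callable

# Illustrative only: a registry keyed on (name, argument-type tuple). The real
# wxpath register/get_operator internals are not shown in this diff.
OPERATORS: dict[tuple, Callable] = {}

def register(name: str, arg_types: tuple):
    def decorator(fn: Callable) -> Callable:
        OPERATORS[(name, arg_types)] = fn  # one entry per decorated signature
        return fn
    return decorator

def get_operator(name: str, args: list) -> Callable:
    # Dispatch on the exact types of the parsed arguments, e.g.
    # (String, Xpath, Depth) for url('...', follow=//a/@href, depth=2).
    return OPERATORS[(name, tuple(type(a) for a in args))]
```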
wxpath/core/parser.py
CHANGED
```diff
@@ -13,7 +13,8 @@ except ImportError:
 
 
 TOKEN_SPEC = [
-    ("NUMBER", r"\d
+    ("NUMBER", r"\d+\.\d+"),
+    ("INTEGER", r"\d+"),
     ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""),  # TODO: Rename to URL Literal
     ("WXPATH", r"/{0,3}\s*url"),  # Must come before NAME to match 'url' as WXPATH
     # ("///URL", r"/{3}\s*url"),
@@ -22,6 +23,7 @@ TOKEN_SPEC = [
     ("URL", r"\s*url"),  # Must come before NAME to match 'url' as WXPATH
     # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
     ("FOLLOW", r",?\s{,}follow="),
+    ("DEPTH", r",?\s{,}depth="),
     ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"),  # Added || for string concat
     ("LPAREN", r"\("),
     ("RPAREN", r"\)"),
@@ -63,6 +65,14 @@ def tokenize(src: str):
 class Number:
     value: float
 
+@dataclass
+class Integer:
+    value: int
+
+@dataclass
+class Depth(Integer):
+    pass
+
 @dataclass
 class String:
     value: str
@@ -273,6 +283,10 @@ class Parser:
         if tok.type == "NUMBER":
            self.advance()
            return Number(float(tok.value))
+
+        if tok.type == "INTEGER":
+            self.advance()
+            return Integer(int(tok.value))
 
         if tok.type == "STRING":
             self.advance()
@@ -358,18 +372,18 @@ class Parser:
         self.advance()
 
         return result
-
 
     def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
         """Capture content inside a url() call, handling nested wxpath expressions.
 
         Supports patterns like::
 
-            url('...')
-            url('...' follow=//a/@href)
-            url(
-            url(
-            url( url(
+            url('...')                           -> [String]
+            url('...' follow=//a/@href)          -> [String, Xpath]
+            url('...' follow=//a/@href depth=2)  -> [String, Xpath, Integer]
+            url(//a/@href depth=2)               -> [Xpath, Integer]
+            url( url('..')//a/@href )            -> [Call, Xpath]
+            url( url( url('..')//a )//b )        -> [Call, Xpath]
 
         Returns:
             A list of parsed elements: Xpath nodes for xpath content and Call
@@ -380,7 +394,10 @@ class Parser:
         paren_balance = 1  # We're already inside the opening paren of url()
         brace_balance = 0  # Track braces for map constructors
         reached_follow_token = False
+        reached_depth_token = False
         follow_xpath = ""
+        depth_number = ""
+
         while paren_balance > 0 and self.token.type != "EOF":
             if self.token.type == "WXPATH":
                 # Found nested wxpath: save any accumulated xpath content first
@@ -396,13 +413,22 @@ class Parser:
 
             elif self.token.type == "FOLLOW":
                 reached_follow_token = True
+                reached_depth_token = False
+                self.advance()
+
+            elif self.token.type == "DEPTH":
+                reached_depth_token = True
+                reached_follow_token = False
                 self.advance()
 
             elif self.token.type == "LPAREN":
                 # Opening paren that's NOT part of a url() call
                 # (it's part of an xpath function like contains(), starts-with(), etc.)
                 paren_balance += 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "RPAREN":
@@ -410,26 +436,37 @@ class Parser:
                 if paren_balance == 0:
                     # This is the closing paren of the outer url()
                     break
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "LBRACE":
                 # Opening brace for map constructors
                 brace_balance += 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             elif self.token.type == "RBRACE":
                 brace_balance -= 1
-
+                if not reached_follow_token:
+                    current_xpath += self.token.value
+                else:
+                    follow_xpath += self.token.value
                 self.advance()
 
             else:
                 # Accumulate all other tokens as xpath content
-                if
-                    current_xpath += self.token.value
-                else:
+                if reached_follow_token:
                     follow_xpath += self.token.value
+                elif reached_depth_token:
+                    depth_number += self.token.value
+                else:
+                    current_xpath += self.token.value
 
                 self.advance()
@@ -447,6 +484,9 @@ class Parser:
         if follow_xpath.strip():
             elements.append(Xpath(follow_xpath.strip()))
 
+        if depth_number.strip():
+            elements.append(Depth(int(depth_number.strip())))
+
         return elements
 
     def parse_call(self, func_name: str) -> Call | Segments:
@@ -462,13 +502,16 @@ class Parser:
             self.advance()
             # Handle follow=...
             if self.token.type == "FOLLOW":
-                self.advance()
                 follow_arg = self.capture_url_arg_content()
                 args.extend(follow_arg)
+            if self.token.type == "DEPTH":
+                depth_arg = self.capture_url_arg_content()
+                args.extend(depth_arg)
             elif self.token.type == "WXPATH":
                 # Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
-                #
-                args = self.capture_url_arg_content()
+                # NOTE: We used to use capture_url_arg_content to handle nested wxpath and xpath
+                # args = self.capture_url_arg_content()
+                args = self.nud()
             else:
                 # Simple xpath argument: url(//a/@href)
                 # Could still contain nested wxpath, so use capture_url_arg_content
@@ -489,8 +532,18 @@ class Parser:
 
     return _specify_call_types(func_name, args)
 
-
 def _specify_call_types(func_name: str, args: list) -> Call | Segments:
+    """
+    Specify the type of a call based on the function name and arguments.
+    TODO: Provide example wxpath expressions for each call type.
+
+    Args:
+        func_name: The name of the function.
+        args: The arguments of the function.
+
+    Returns:
+        Call | Segments: The type of the call.
+    """
     if func_name == "url":
         if len(args) == 1:
             if isinstance(args[0], String):
@@ -500,17 +553,33 @@ def _specify_call_types(func_name: str, args: list) -> Call | Segments:
             else:
                 raise ValueError(f"Unknown argument type: {type(args[0])}")
         elif len(args) == 2:
-
+            arg0, arg1 = args
+            if isinstance(arg0, String) and isinstance(arg1, Xpath):
+                # Example: url('...', follow=//a/@href)
                 return UrlCrawl(func_name, args)
-            elif isinstance(
+            elif isinstance(arg0, String) and isinstance(arg1, Integer):
+                # Example: url('...', depth=2)
+                return UrlLiteral(func_name, args)
+            elif isinstance(arg0, UrlLiteral) and isinstance(arg1, Xpath):
                 args.append(UrlQuery('url', [ContextItem()]))
                 return Segments(args)
-            elif isinstance(
-                segs =
-                segs.append(
+            elif isinstance(arg0, (Segments, list)) and isinstance(arg1, Xpath):
+                segs = arg0
+                segs.append(arg1)
                 return Segments(segs)
             else:
                 raise ValueError(f"Unknown arguments: {args}")
+        elif len(args) == 3:
+            arg0, arg1, arg2 = args
+            if (isinstance(arg0, String) and (
+                (isinstance(arg1, Xpath) and isinstance(arg2, Integer)) or
+                (isinstance(arg1, Integer) and isinstance(arg2, Xpath))
+            )):
+                # Example: url('...', follow=//a/@href, depth=2)
+                # Example: url('...', depth=2, follow=//a/@href)
+                return UrlCrawl(func_name, args)
+            else:
+                raise ValueError(f"Unknown arguments: {args}")
         else:
             raise ValueError(f"Unknown arguments: {args}")
     elif func_name == "/url" or func_name == "//url":
```
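Taken together, the tokenizer and `_specify_call_types` changes make inline depth limits parse. A sketch, assuming `wxpath.core.parser` exposes the module-level `parse()` that the engine diff below calls (the URL is a placeholder):

```python
from wxpath.core import parser

ast = parser.parse("url('https://example.com', follow=//a/@href, depth=2)")
# Expected per the len(args) == 3 branch above: a UrlCrawl whose args carry
# Xpath('//a/@href') and Depth(2), in either keyword order.
print(ast)
```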
wxpath/core/runtime/engine.py
CHANGED
```diff
@@ -18,7 +18,7 @@ from wxpath.core.models import (
     ProcessIntent,
 )
 from wxpath.core.ops import get_operator
-from wxpath.core.parser import Binary, Segment, Segments
+from wxpath.core.parser import Binary, Depth, Segment, Segments
 from wxpath.core.runtime.helpers import parse_html
 from wxpath.hooks.registry import FetchContext, get_hooks
 from wxpath.http.client.crawler import Crawler
@@ -158,17 +158,48 @@ class WXPathEngine(HookedEngineBase):
         if allow_redirects:
             self.allowed_response_codes |= {301, 302, 303, 307, 308}
 
+    def _get_max_depth(self, bin_or_segs: Binary | Segments, max_depth: int) -> int:
+        """Get the maximum crawl depth for a given expression. Will find a Depth
+        argument at the beginning of the expression and return its value. Otherwise, returns the
+        max_depth value provided.
+        TODO: There has to be a better way to do this.
+        """
+        if isinstance(bin_or_segs, Binary):
+            if hasattr(bin_or_segs.left, 'func') == 'url':
+                depth_arg = [arg for arg in bin_or_segs.left.args if isinstance(arg, Depth)][0]
+                return int(depth_arg.value)
+            elif hasattr(bin_or_segs.right, 'func') == 'url':
+                depth_arg = [arg for arg in bin_or_segs.right.args if isinstance(arg, Depth)][0]
+                return int(depth_arg.value)
+        elif isinstance(bin_or_segs, Segments):
+            depth_arg = [arg for arg in bin_or_segs[0].args if isinstance(arg, Depth)]
+            if depth_arg:
+                return int(depth_arg[0].value)
+        return max_depth
+
     async def run(
         self,
         expression: str,
         max_depth: int,
-        progress: bool = False
+        progress: bool = False,
+        yield_errors: bool = False,
     ) -> AsyncGenerator[Any, None]:
         """Execute a wxpath expression concurrently and yield results.
 
         Builds and drives a BFS-like crawl pipeline that honors robots rules,
         throttling, and hook callbacks while walking the web graph.
 
+        NOTES ON max_depth:
+            If depth is provided in the expression, it will be used to limit the depth of the
+            crawl. If depth is provided in the expression and max_depth is provided as an argument
+            to `run`, the inline depth in the expression will take precedence.
+
+            Currently, max_depth control flow logic is detected and executed in the
+            engine. In the future, the operation handlers (ops.py) could be responsible for
+            detecting max_depth, and sending a terminal intent to the engine. It's also possible
+            that the depth terminals are relative to the current depth (i.e. `url(//xpath, depth=2)`
+            implies crawling only the next 2 levels). This is not yet supported.
+
         Args:
             expression: WXPath expression string to evaluate.
             max_depth: Maximum crawl depth to follow for url hops.
@@ -178,7 +209,9 @@
             Extracted values produced by the expression (HTML elements or
             wxpath-specific value types).
         """
-
+        bin_or_segs = parser.parse(expression)
+
+        max_depth = self._get_max_depth(bin_or_segs, max_depth)
 
         queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
         inflight: dict[str, CrawlTask] = {}
@@ -222,7 +255,7 @@
         seed_task = CrawlTask(
             elem=None,
             url=None,
-            segments=
+            segments=bin_or_segs,
             depth=-1,
             backlink=None,
         )
@@ -248,12 +281,32 @@
 
             if task is None:
                 log.warning(f"Got unexpected response from {resp.request.url}")
+
+                if yield_errors:
+                    yield {
+                        "__type__": "error",
+                        "url": resp.request.url,
+                        "reason": "unexpected_response",
+                        "status": resp.body,
+                        "body": resp.body
+                    }
+
                 if is_terminal():
                     break
                 continue
 
             if resp.error:
                 log.warning(f"Got error from {resp.request.url}: {resp.error}")
+
+                if yield_errors:
+                    yield {
+                        "__type__": "error",
+                        "url": resp.request.url,
+                        "reason": "network_error",
+                        "exception": str(resp.error),
+                        "status": resp.status,
+                        "body": resp.body
+                    }
                 if is_terminal():
                     break
                 continue
@@ -261,6 +314,16 @@
             # NOTE: Consider allowing redirects
             if resp.status not in self.allowed_response_codes or not resp.body:
                 log.warning(f"Got non-200 response from {resp.request.url}")
+
+                if yield_errors:
+                    yield {
+                        "__type__": "error",
+                        "url": resp.request.url,
+                        "reason": "bad_status",
+                        "status": resp.status,
+                        "body": resp.body
+                    }
+
                 if is_terminal():
                     break
                 continue
@@ -276,6 +339,7 @@
                 base_url=task.url,
                 backlink=task.backlink,
                 depth=task.depth,
+                response=resp
             )
 
             elem = await self.post_parse_hooks(elem, task)
@@ -388,10 +452,12 @@
 def wxpath_async(path_expr: str,
                  max_depth: int,
                  progress: bool = False,
-                 engine: WXPathEngine | None = None
+                 engine: WXPathEngine | None = None,
+                 yield_errors: bool = False
+                 ) -> AsyncGenerator[Any, None]:
     if engine is None:
         engine = WXPathEngine()
-    return engine.run(path_expr, max_depth, progress=progress)
+    return engine.run(path_expr, max_depth, progress=progress, yield_errors=yield_errors)
 
 
 ##### ASYNC IN SYNC #####
@@ -400,6 +466,7 @@ def wxpath_async_blocking_iter(
     max_depth: int = 1,
     progress: bool = False,
    engine: WXPathEngine | None = None,
+    yield_errors: bool = False
 ) -> Iterator[Any]:
     """Evaluate a wxpath expression using concurrent breadth-first traversal.
 
@@ -419,7 +486,8 @@
     """
     loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
-    agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress,
+    agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress,
+                        engine=engine, yield_errors=yield_errors)
 
     try:
        while True:
@@ -437,8 +505,11 @@ def wxpath_async_blocking(
     max_depth: int = 1,
     progress: bool = False,
     engine: WXPathEngine | None = None,
+    yield_errors: bool = False
 ) -> list[Any]:
     return list(wxpath_async_blocking_iter(path_expr,
                                            max_depth=max_depth,
                                            progress=progress,
-                                           engine=engine
+                                           engine=engine,
+                                           yield_errors=yield_errors,
+                                           ))
```
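With `yield_errors=True`, failures surface in the result stream as plain dicts tagged `"__type__": "error"` instead of only being logged. A minimal consumer sketch (the URL is a placeholder; the dict keys come from the engine diff above):

```python
from wxpath import wxpath_async_blocking_iter

expr = "url('https://example.com')//title/text()"
for item in wxpath_async_blocking_iter(expr, max_depth=1, yield_errors=True):
    if isinstance(item, dict) and item.get("__type__") == "error":
        # reason is one of "unexpected_response", "network_error", "bad_status"
        print(f"[{item['reason']}] {item['url']} status={item.get('status')}")
    else:
        print(item)
```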
wxpath/core/runtime/helpers.py
CHANGED
```diff
@@ -6,7 +6,7 @@ from wxpath.util.logging import get_logger
 log = get_logger(__name__)
 
 
-def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
+def parse_html(content, base_url=None, response=None, **elem_kv_pairs) -> html.HtmlElement:
     elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
     if base_url:
         elem.getroottree().docinfo.URL = base_url  # make base-uri() work
@@ -14,12 +14,15 @@ def parse_html(content, base_url=None, response=None, **elem_kv_pairs) -> html.HtmlElement:
         elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
         elem.base_url = base_url  # sets both attribute and doc-level URL
 
-
+    if response:
+        elem.response = response
+        elem.getroottree().getroot().response = response
+    # NOTE: some pages may have multiple root elements, i.e.
     # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
     # This breaks elementpath. If elem has siblings, recreate the
     # root element and only the root element.
     if len(list(elem.itersiblings())) > 0:
-        elem = detach_html_root(elem, base_url)
+        elem = detach_html_root(elem, base_url)
 
     for k, v in elem_kv_pairs.items():
         elem.set(k, str(v))
```
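The new `response` parameter threads the raw `Response` onto the parsed tree, so downstream expression handlers can reach transport metadata. A small sketch (the `None` request field is a placeholder for a real wxpath `Request`, and it assumes wxpath's patched parser permits attaching Python attributes, as the code above does):

```python
from wxpath.core.runtime.helpers import parse_html
from wxpath.http.client.response import Response

resp = Response(None, 200, b"<html><body>hi</body></html>", {})
elem = parse_html(resp.body, base_url="https://example.com", response=resp)
assert elem.response is resp  # attached by the new `if response:` branch
```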
wxpath/http/client/__init__.py
CHANGED
wxpath/http/client/crawler.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 import aiohttp
 
 try:
-    from aiohttp_client_cache import CachedSession
+    from aiohttp_client_cache import CachedSession
 except ImportError:
     CachedSession = None
 
@@ -42,7 +42,7 @@ def get_async_session(
     if timeout is None:
         timeout = aiohttp.ClientTimeout(total=CRAWLER_SETTINGS.timeout)
 
-    if CACHE_SETTINGS.enabled and CachedSession
+    if CACHE_SETTINGS.enabled and CachedSession:
         log.info("using aiohttp-client-cache")
         return CachedSession(
             cache=get_cache_backend(),
@@ -71,6 +71,7 @@ class Crawler:
         *,
         headers: dict | None = None,
         proxies: dict | None = None,
+        verify_ssl: bool | None = None,
         retry_policy: RetryPolicy | None = None,
         throttler: AbstractThrottler | None = None,
         auto_throttle_target_concurrency: float = None,
@@ -82,6 +83,9 @@
 
         self.concurrency = concurrency if concurrency is not None else cfg.concurrency
         self.per_host = per_host if per_host is not None else cfg.per_host
+        self._verify_ssl = verify_ssl if verify_ssl is not None else getattr(
+            cfg, "verify_ssl", True
+        )
 
         timeout = timeout if timeout is not None else cfg.timeout
         self._timeout = aiohttp.ClientTimeout(total=timeout)
@@ -141,7 +145,11 @@
         """Construct an `aiohttp.ClientSession` with tracing and pooling."""
         trace_config = build_trace_config(self._stats)
         # Need to build the connector as late as possible as it requires the loop
-        connector = aiohttp.TCPConnector(
+        connector = aiohttp.TCPConnector(
+            limit=self.concurrency * 2,
+            ttl_dns_cache=300,
+            ssl=self._verify_ssl,
+        )
         return get_async_session(
             headers=self._headers,
             timeout=self._timeout,
@@ -274,22 +282,26 @@
             else:
                 log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})
 
+            _start = time.monotonic()
             body = await resp.read()
 
-
+            end = time.monotonic()
+            latency = end - _start
             self.throttler.record_latency(host, latency)
 
             if self.retry_policy.should_retry(req, response=resp):
                 await self._retry(req)
                 return None
 
-            return Response(req, resp.status, body, dict(resp.headers)
+            return Response(req, resp.status, body, dict(resp.headers),
+                            request_start=_start, response_end=end)
         except asyncio.CancelledError:
             # Normal during shutdown / timeout propagation
             log.debug("cancelled error", extra={"url": req.url})
             raise
         except Exception as exc:
-
+            end = time.monotonic()
+            latency = end - start
             self.throttler.record_latency(host, latency)
 
             if self.retry_policy.should_retry(req, exception=exc):
@@ -297,7 +309,7 @@
                 return None
 
             log.error("request failed", extra={"url": req.url}, exc_info=exc)
-            return Response(req, 0, b"", error=exc)
+            return Response(req, 0, b"", error=exc, request_start=start, response_end=end)
 
     async def _retry(self, req: Request) -> None:
         """Reschedule a request according to the retry policy."""
```
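A minimal sketch of disabling verification programmatically, mirroring the CLI's `--insecure` wiring; every other `Crawler` setting falls back to its configured default:

```python
from wxpath.http.client.crawler import Crawler

# verify_ssl=False flows into aiohttp.TCPConnector(ssl=False), which disables
# certificate checks; leaving it unset inherits cfg.verify_ssl (default True).
crawler = Crawler(verify_ssl=False)
```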
wxpath/http/client/request.py
CHANGED
wxpath/http/client/response.py
CHANGED
```diff
@@ -1,4 +1,3 @@
-# wxpath/http/response.py
 from dataclasses import dataclass, field
 from typing import Optional
 
@@ -12,3 +11,10 @@ class Response:
     body: bytes
     headers: dict[str, str] | None = None
     error: Optional[Exception] = field(default=None, kw_only=True)
+
+    request_start: float | None = None
+    response_end: float | None = None
+
+    @property
+    def latency(self) -> float:
+        return self.response_end - self.request_start
```
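The two new timestamps make per-request latency a derived property rather than a stored value. A toy check (the `None` request field is a placeholder; `latency` assumes both timestamps were recorded, as the crawler now does on both the success and exception paths):

```python
from wxpath.http.client.response import Response

resp = Response(None, 200, b"", {}, request_start=100.0, response_end=100.25)
assert abs(resp.latency - 0.25) < 1e-9
```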
wxpath/http/policy/retry.py
CHANGED
```diff
@@ -19,13 +19,13 @@ class RetryPolicy:
 
         if request.max_retries is not None and request.retries >= request.max_retries:
             return False
-
+
         if request.retries >= self.max_retries:
             return False
 
         if response is not None and response.status in self.retry_statuses:
             return True
-
+
         if exception is not None:
             return True
 
```