wxpath 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +9 -0
- wxpath/cli.py +92 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +278 -0
- wxpath/core/parser.py +598 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +406 -0
- wxpath/core/runtime/helpers.py +41 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +145 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/crawler.py +231 -0
- wxpath/http/client/request.py +38 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/robots.py +82 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +96 -0
- wxpath/patches.py +63 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/METADATA +107 -129
- wxpath-0.3.0.dist-info/RECORD +33 -0
- wxpath-0.3.0.dist-info/top_level.txt +1 -0
- wxpath-0.1.1.dist-info/RECORD +0 -6
- wxpath-0.1.1.dist-info/top_level.txt +0 -1
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/WHEEL +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE +0 -0
wxpath/http/stats.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
aiohttp request statistics and tracing hooks.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from aiohttp import TraceConfig
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class CrawlerStats:
    """
    Mutable bag of counters and gauges describing crawler HTTP activity.

    The aiohttp hooks created by ``build_trace_config`` update the
    request-start, in-flight, latency, error and response-accounting fields.
    The remaining fields (enqueue, queue, throttle, retry) are not written in
    this module and are presumably maintained by other crawler components.
    """

    # ---- Lifecycle counts ----
    requests_enqueued: int = 0
    requests_started: int = 0
    requests_completed: int = 0

    # ---- Concurrency ----
    in_flight_global: int = 0
    in_flight_per_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))

    # ---- Queueing ----
    queue_size: int = 0
    queue_wait_time_total: float = 0.0

    # ---- Throttling ----
    throttle_waits: int = 0
    throttle_wait_time: float = 0.0
    throttle_waits_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))

    # ---- Latency feedback ----
    latency_samples: int = 0
    latency_ewma: float = 0.0
    min_latency: Optional[float] = None
    max_latency: Optional[float] = None

    # ---- Errors / retries ----
    retries_scheduled: int = 0
    retries_executed: int = 0
    errors_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))

    # ---- Response accounting ----
    # Declared up front (rather than attached lazily by the trace hooks) so
    # that a fresh instance already exposes them and they appear in repr/eq.
    status_counts: defaultdict[int, int] = field(default_factory=lambda: defaultdict(int))
    bytes_received: int = 0
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def build_trace_config(stats: CrawlerStats) -> TraceConfig:
    """
    Return an aiohttp TraceConfig wired to the given stats instance.

    Three hooks are registered:

    * ``on_request_start`` bumps the started / in-flight counters and stamps
      the trace context with a monotonic start time.
    * ``on_request_end`` releases the in-flight counters, records latency
      (EWMA, min, max) and tallies response status codes and content length.
    * ``on_request_exception`` releases the in-flight counters and counts the
      error against the request's host. No latency sample is recorded.

    Parameters
    ----------
    stats
        Stats object mutated in place by the hooks.
    """
    trace = TraceConfig()
    # Smoothing factor for the latency EWMA.
    alpha = 0.3

    async def on_request_start(session, context, params):
        host = params.url.host
        stats.requests_started += 1
        stats.in_flight_global += 1
        stats.in_flight_per_host[host] += 1
        # Monotonic clock: immune to wall-clock adjustments mid-request.
        context._start_time = time.monotonic()

    async def on_request_end(session, context, params):
        host = params.url.host
        stats.in_flight_global -= 1
        stats.in_flight_per_host[host] -= 1

        latency = time.monotonic() - context._start_time
        stats.latency_samples += 1
        if stats.latency_samples == 1:
            # Seed the average with the first observation; blending it with
            # the 0.0 initial value would bias the estimate towards zero.
            stats.latency_ewma = latency
        else:
            stats.latency_ewma = (alpha * latency) + ((1 - alpha) * stats.latency_ewma)

        if stats.min_latency is None or latency < stats.min_latency:
            stats.min_latency = latency
        if stats.max_latency is None or latency > stats.max_latency:
            stats.max_latency = latency

        response = params.response

        status = getattr(response, "status", None)
        if status is not None:
            # Create the tally on first use in case the stats object does not
            # declare it.
            counts = getattr(stats, "status_counts", None)
            if counts is None:
                counts = stats.status_counts = defaultdict(int)
            counts[status] += 1

        # Uses the response's reported content_length; responses without one
        # (None or 0) contribute nothing.
        content_length = getattr(response, "content_length", None)
        if content_length:
            stats.bytes_received = getattr(stats, "bytes_received", 0) + content_length

    async def on_request_exception(session, context, params):
        host = params.url.host
        stats.in_flight_global -= 1
        stats.in_flight_per_host[host] -= 1
        stats.errors_by_host[host] += 1

    trace.on_request_start.append(on_request_start)
    trace.on_request_end.append(on_request_end)
    trace.on_request_exception.append(on_request_exception)

    return trace
|
wxpath/patches.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import elementpath
|
|
2
|
+
from elementpath.xpath3 import XPath3Parser
|
|
3
|
+
from lxml import etree, html
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def html_element_repr(self):
    """Build a debug representation showing the element's tag, depth and base URL.

    ``depth`` is read from the element's attributes (``-1`` when absent) and
    ``base_url`` from the object itself (``None`` when it has no such attribute).
    """
    depth = self.get('depth', -1)
    base_url = getattr(self, 'base_url', None)
    return "HtmlElement(tag={}, depth={}, base_url={!r})".format(self.tag, depth, base_url)
|
|
10
|
+
|
|
11
|
+
# Patch lxml.html.HtmlElement.__repr__ to improve debugging: the replacement
# repr shows the element's tag, depth and base_url. This rebinds the method on
# the class itself, so it applies wherever lxml.html.HtmlElement is used.
html.HtmlElement.__repr__ = html_element_repr
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class XPath3Element(etree.ElementBase):
    """lxml element class adding XPath 3 evaluation and crawl metadata accessors."""

    def xpath3(self, expr, **kwargs):
        """
        Run an XPath 3 expression against this element via elementpath.

        Unless the caller supplies them, ``parser`` defaults to XPath3Parser
        and ``uri`` to the document URL, falling back to the element's
        ``base_url`` attribute. Returns whatever ``elementpath.select`` yields.
        """
        if "parser" not in kwargs:
            kwargs["parser"] = XPath3Parser
        if "uri" not in kwargs:
            document_url = getattr(self.getroottree().docinfo, "URL", None)
            kwargs["uri"] = document_url or self.get("base_url")
        return elementpath.select(self, expr, **kwargs)

    # --- Convenience property for backward‑compatibility -----------------
    @property
    def base_url(self):
        # A per-element "base_url" attribute wins (this is what allows several
        # base URLs inside one tree); otherwise use the document-level URL.
        override = self.get("base_url")
        if override is None:
            return self.getroottree().docinfo.URL
        return override

    @base_url.setter
    def base_url(self, value):
        # Three places are kept in sync:
        #  - the plain attribute read back by the getter,
        #  - xml:base, so XPath base-uri() picks the value up,
        #  - the document info, so elements without an override inherit it.
        self.set("base_url", value)
        self.set("{http://www.w3.org/XML/1998/namespace}base", value)
        self.getroottree().docinfo.URL = value

    @property
    def depth(self):
        # Stored as a string attribute; -1 means "no depth recorded".
        stored = self.get("depth", -1)
        return int(stored)

    @depth.setter
    def depth(self, value):
        # Element attributes must be strings, so the value is stringified here.
        self.set("depth", str(value))
|
|
54
|
+
|
|
55
|
+
# Create and register custom parser that returns XPath3Element instances:
# the class lookup tells lxml which Python class to use for element nodes
# produced by this specific HTMLParser instance.
lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
parser = etree.HTMLParser()
parser.set_element_class_lookup(lookup)


# Expose parser for use in parse_html
html_parser_with_xpath3 = parser
# Also give stock lxml.html.HtmlElement objects the same xpath3() method, so
# elements not created through the custom parser can be queried the same way.
html.HtmlElement.xpath3 = XPath3Element.xpath3
|
wxpath/util/__init__.py
ADDED
|
File without changes
|
wxpath/util/logging.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from logging.config import dictConfig
|
|
3
|
+
from typing import Any, Mapping
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class KeyValueFormatter(logging.Formatter):
    """
    Formatter that appends every non-standard record attribute (i.e. anything
    supplied through ``extra``) to the formatted line as ``key=value`` pairs.
    """
    # Attribute names that belong to LogRecord itself (or are added while
    # formatting); these are never echoed as context.
    _RESERVED = {
        'args', 'asctime', 'created', 'exc_info', 'exc_text', 'filename',
        'funcName', 'levelname', 'levelno', 'lineno', 'message', 'module',
        'msecs', 'msg', 'name', 'pathname', 'process', 'processName',
        'relativeCreated', 'stack_info', 'thread', 'threadName', 'taskName'
    }

    def format(self, record: logging.LogRecord) -> str:
        # Render the regular line first; the base formatter also populates
        # attributes such as ``message`` on the record.
        rendered = super().format(record)

        # Collect the context pairs, sorted so the output is deterministic.
        pairs = sorted(
            (key, value)
            for key, value in vars(record).items()
            if key not in self._RESERVED
        )
        if not pairs:
            return rendered

        suffix = " ".join(f"{key}={value}" for key, value in pairs)
        return f"{rendered} | {suffix}"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Default logging.config.dictConfig schema: a single "kv" formatter, a single
# stream handler named "stderr" (no explicit stream configured), both attached
# to the "wxpath" logger at INFO level.
_DEFAULT_LOGGING_CONF = {
    "version": 1,
    # Leave loggers configured by the host application untouched.
    "disable_existing_loggers": False,
    "formatters": {
        "kv": {
            # Note: "()" names the factory used to build the formatter; the
            # KeyValueFormatter class object itself is passed here (not a
            # dotted-path string).
            "()": KeyValueFormatter,
            "format": "%(asctime)s [%(levelname).1s] %(name)s | %(funcName)s | %(message)s"
        }
    },
    "handlers": {
        "stderr": {
            "class": "logging.StreamHandler",
            "formatter": "kv",
        }
    },
    "loggers": {
        "wxpath": {"level": "INFO", "handlers": ["stderr"]},
    },
}
|
|
55
|
+
|
|
56
|
+
def configure_logging(level: str | int = "INFO", **overrides) -> None:
    """
    Configure wxpath's logger.

    Call this once in an application entry-point **or** rely on defaults.

    Parameters
    ----------
    level
        "DEBUG"|"INFO"|... or `logging.DEBUG`, overrides the root wxpath logger.
    overrides
        Dict that is merged (shallow) into the default dictConfig.
        Lets advanced users swap formatters/handlers. If an override replaces
        ``loggers`` without a ``wxpath`` entry, one holding only ``level`` is
        added.
    """
    conf = {**_DEFAULT_LOGGING_CONF, **overrides}

    # The merge above is shallow, so conf["loggers"] is still the very same
    # dict object owned by the module default (or by the caller's override).
    # Copy the two levels we are about to modify instead of mutating them.
    loggers = dict(conf.get("loggers") or {})
    wxpath_logger = dict(loggers.get("wxpath") or {})
    wxpath_logger["level"] = level
    loggers["wxpath"] = wxpath_logger
    conf["loggers"] = loggers

    dictConfig(conf)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class CrawlAdapter(logging.LoggerAdapter):
    """
    Inject crawl context (depth, op, url) so the handler/formatter
    never needs to know scraping internals.
    """
    def process(self, msg: str, kwargs: dict[str, Any]):
        """
        Merge the adapter's context into the per-call ``extra`` mapping.

        Per-call values win over the adapter's own context. The adapter's
        ``extra`` is copied, never modified. A missing or ``None`` ``extra``
        (on the adapter or on the call) is treated as empty.

        Returns the unchanged ``msg`` and the ``kwargs`` dict with ``extra``
        replaced by the merged mapping.
        """
        merged = dict(self.extra or {})
        merged.update(kwargs.pop("extra", None) or {})
        kwargs["extra"] = merged
        return msg, kwargs
|
|
85
|
+
|
|
86
|
+
def get_logger(name: str, **ctx) -> CrawlAdapter:
    """
    Return a CrawlAdapter around ``logging.getLogger(name)``.

    The adapter's context always contains ``depth``, ``op`` and ``url``
    (placeholder "-" unless given in ``ctx``), so the formatter never blows
    up on a missing key; any other keyword arguments are added as well.
    """
    context = {"depth": "-", "op": "-", "url": "-", **ctx}
    return CrawlAdapter(logging.getLogger(name), context)
|
wxpath/util/serialize.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from wxpath.core.ops import WxStr
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def simplify(obj):
    """
    Recursively convert WxStr wrappers into plain ``str`` so that printing or
    JSON serialising shows clean values.

    Dict values and the items of lists, tuples (including namedtuples), sets
    and frozensets are simplified recursively. Dict keys and any other object
    are returned unchanged. Dicts always come back as plain ``dict``; the
    sequence/set containers are rebuilt with their original type.
    """
    # Scalars
    if isinstance(obj, WxStr):
        return str(obj)

    # Mapping
    if isinstance(obj, dict):
        return {k: simplify(v) for k, v in obj.items()}

    # namedtuple: its constructor takes one positional argument per field, so
    # passing a single generator (as for plain tuples) would raise TypeError.
    if isinstance(obj, tuple) and hasattr(obj, "_fields"):
        return type(obj)(*(simplify(v) for v in obj))

    # Sequence / set (but not str/bytes)
    if isinstance(obj, (list, tuple, set, frozenset)):
        return type(obj)(simplify(v) for v in obj)

    return obj
|