wxpath 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/http/stats.py ADDED
@@ -0,0 +1,96 @@
+ """
+ aiohttp request statistics and tracing hooks.
+ """
+
+ import time
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from aiohttp import TraceConfig
+
+
+ @dataclass
+ class CrawlerStats:
+     # ---- Lifecycle counts ----
+     requests_enqueued: int = 0
+     requests_started: int = 0
+     requests_completed: int = 0
+
+     # ---- Concurrency ----
+     in_flight_global: int = 0
+     in_flight_per_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # ---- Queueing ----
+     queue_size: int = 0
+     queue_wait_time_total: float = 0.0
+
+     # ---- Throttling ----
+     throttle_waits: int = 0
+     throttle_wait_time: float = 0.0
+     throttle_waits_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # ---- Latency feedback ----
+     latency_samples: int = 0
+     latency_ewma: float = 0.0
+     min_latency: Optional[float] = None
+     max_latency: Optional[float] = None
+
+     # ---- Errors / retries ----
+     retries_scheduled: int = 0
+     retries_executed: int = 0
+     errors_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+
+ def build_trace_config(stats: CrawlerStats) -> TraceConfig:
+     """
+     Returns an aiohttp TraceConfig wired to the given stats instance.
+     Tracks detailed per-request, per-host, and queue/throttle metrics.
+     """
+     trace = TraceConfig()
+
+     async def on_request_start(session, context, params):
+         stats.requests_started += 1
+         stats.in_flight_global += 1
+         host = params.url.host
+         stats.in_flight_per_host[host] += 1
+         context._start_time = time.monotonic()
+
+     async def on_request_end(session, context, params):
+         host = params.url.host
+         stats.in_flight_global -= 1
+         stats.in_flight_per_host[host] -= 1
+
+         latency = time.monotonic() - context._start_time
+         stats.latency_samples += 1
+         # EWMA update: alpha = 0.3
+         alpha = 0.3
+         stats.latency_ewma = (alpha * latency) + ((1 - alpha) * stats.latency_ewma)
+         stats.min_latency = (latency if stats.min_latency is None
+                              else min(stats.min_latency, latency))
+         stats.max_latency = (latency if stats.max_latency is None
+                              else max(stats.max_latency, latency))
+
+         status = getattr(params.response, "status", None)
+         if status is not None:
+             if not hasattr(stats, "status_counts"):
+                 stats.status_counts = defaultdict(int)
+             stats.status_counts[status] += 1
+
+         content_length = getattr(params.response, "content_length", None)
+         if content_length:
+             if not hasattr(stats, "bytes_received"):
+                 stats.bytes_received = 0
+             stats.bytes_received += content_length
+
+     async def on_request_exception(session, context, params):
+         host = params.url.host
+         stats.in_flight_global -= 1
+         stats.in_flight_per_host[host] -= 1
+         stats.errors_by_host[host] += 1
+
+     trace.on_request_start.append(on_request_start)
+     trace.on_request_end.append(on_request_end)
+     trace.on_request_exception.append(on_request_exception)
+
+     return trace
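
A minimal sketch of how these hooks attach to a session (the target URL and the printed summary are illustrative; `trace_configs` is aiohttp's standard mechanism for registering a TraceConfig):

import asyncio

import aiohttp

from wxpath.http.stats import CrawlerStats, build_trace_config


async def main():
    stats = CrawlerStats()
    # Every request made through this session updates `stats` via the trace hooks.
    async with aiohttp.ClientSession(trace_configs=[build_trace_config(stats)]) as session:
        async with session.get("https://example.com") as resp:
            await resp.read()
    print(f"started={stats.requests_started} ewma_latency={stats.latency_ewma:.3f}s")


asyncio.run(main())

Note that `latency_ewma` is seeded at 0.0 with alpha = 0.3, so the first few samples read lower than the true latency and converge as samples accumulate.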
wxpath/patches.py ADDED
@@ -0,0 +1,63 @@
+ import elementpath
+ from elementpath.xpath3 import XPath3Parser
+ from lxml import etree, html
+
+
+ def html_element_repr(self):
+     return (f"HtmlElement(tag={self.tag}, "
+             f"depth={self.get('depth', -1)}, "
+             f"base_url={getattr(self, 'base_url', None)!r})")
+
+ # Patch lxml.html.HtmlElement.__repr__ to improve debugging with base_url.
+ html.HtmlElement.__repr__ = html_element_repr
+
+
+ class XPath3Element(etree.ElementBase):
+     def xpath3(self, expr, **kwargs):
+         """
+         Evaluate an XPath 3 expression using the elementpath library,
+         returning the results as a list.
+         """
+         kwargs.setdefault("parser", XPath3Parser)
+         kwargs.setdefault(
+             "uri",
+             getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
+         )
+         return elementpath.select(self, expr, **kwargs)
+
+     # --- Convenience property for backward-compatibility -----------------
+     @property
+     def base_url(self):
+         # 1) Per-element override (keeps our "multiple base URLs" feature)
+         url = self.get("base_url")
+         if url is not None:
+             return url
+         # 2) Fall back to document URL (O(1))
+         return self.getroottree().docinfo.URL
+
+     @base_url.setter
+     def base_url(self, value):
+         # Keep the per-element attribute (used by our crawler)
+         self.set("base_url", value)
+         # Set xml:base attribute so XPath base-uri() picks it up
+         self.set("{http://www.w3.org/XML/1998/namespace}base", value)
+         # Also store on the document so descendants can fetch it quickly
+         self.getroottree().docinfo.URL = value
+
+     @property
+     def depth(self):
+         return int(self.get("depth", -1))
+
+     @depth.setter
+     def depth(self, value):
+         self.set("depth", str(value))
+
+ # Create and register a custom parser that returns XPath3Element instances.
+ lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
+ parser = etree.HTMLParser()
+ parser.set_element_class_lookup(lookup)
+
+
+ # Expose the parser for use in parse_html.
+ html_parser_with_xpath3 = parser
+ html.HtmlElement.xpath3 = XPath3Element.xpath3
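
A sketch of the intended usage, assuming importing the module is enough to apply the patches (the markup is illustrative):

from lxml import etree

from wxpath.patches import html_parser_with_xpath3

root = etree.fromstring(
    b"<html><body><p>hello</p><p>world</p></body></html>",
    parser=html_parser_with_xpath3,
)
root.base_url = "https://example.com/"  # stored per element and as xml:base
# string-join() is XPath 2.0+, so it needs elementpath rather than lxml's XPath 1.0.
print(root.xpath3("string-join(//p/text(), ', ')"))  # hello, world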
wxpath/util/logging.py ADDED
@@ -0,0 +1,93 @@
+ import copy
+ import logging
+ from logging.config import dictConfig
+ from typing import Any, MutableMapping
+
+
+ class KeyValueFormatter(logging.Formatter):
+     """
+     Formatter that automatically renders any 'extra' context added to the record
+     as key=value pairs at the end of the log line.
+     """
+     # Reserved keys that already exist in LogRecord and shouldn't be printed again
+     _RESERVED = {
+         'args', 'asctime', 'created', 'exc_info', 'exc_text', 'filename',
+         'funcName', 'levelname', 'levelno', 'lineno', 'message', 'module',
+         'msecs', 'msg', 'name', 'pathname', 'process', 'processName',
+         'relativeCreated', 'stack_info', 'thread', 'threadName', 'taskName'
+     }
+
+     def format(self, record: logging.LogRecord) -> str:
+         # 1. Format the standard message first
+         s = super().format(record)
+
+         # 2. Find all 'extra' keys
+         extras = {k: v for k, v in record.__dict__.items() if k not in self._RESERVED}
+
+         # 3. Append them as key=value
+         if extras:
+             # Sort for deterministic logs
+             context_str = " ".join(f"{k}={v}" for k, v in sorted(extras.items()))
+             s = f"{s} | {context_str}"
+
+         return s
+
+
+ _DEFAULT_LOGGING_CONF = {
+     "version": 1,
+     "disable_existing_loggers": False,
+     "formatters": {
+         "kv": {
+             # The "()" key tells dictConfig to instantiate our custom formatter class
+             "()": KeyValueFormatter,
+             "format": "%(asctime)s [%(levelname).1s] %(name)s | %(funcName)s | %(message)s"
+         }
+     },
+     "handlers": {
+         "stderr": {
+             "class": "logging.StreamHandler",
+             "formatter": "kv",
+         }
+     },
+     "loggers": {
+         "wxpath": {"level": "INFO", "handlers": ["stderr"]},
+     },
+ }
+
+ def configure_logging(level: str | int = "INFO", **overrides) -> None:
+     """
+     Configure wxpath's logger.
+
+     Call this once in an application entry-point **or** rely on defaults.
+
+     Parameters
+     ----------
+     level
+         "DEBUG" | "INFO" | ... or `logging.DEBUG`; overrides the wxpath logger's level.
+     overrides
+         Dict that is merged (shallow) into the default dictConfig.
+         Lets advanced users swap formatters/handlers.
+     """
+     conf = copy.deepcopy(_DEFAULT_LOGGING_CONF)  # never mutate the module-level default
+     conf.update(overrides)
+     conf["loggers"]["wxpath"]["level"] = level
+     dictConfig(conf)
+
+
+ class CrawlAdapter(logging.LoggerAdapter):
+     """
+     Inject crawl context (depth, op, url) so the handler/formatter
+     never needs to know scraping internals.
+     """
+     def process(self, msg: str, kwargs: MutableMapping[str, Any]):
+         extra = self.extra.copy()
+         extra.update(kwargs.pop("extra", {}))
+         kwargs["extra"] = extra
+         return msg, kwargs
+
+ def get_logger(name: str, **ctx) -> CrawlAdapter:
+     base = logging.getLogger(name)
+     # Default placeholders so the formatter never fails on missing context keys
+     defaults = {"depth": "-", "op": "-", "url": "-"}
+     defaults.update(ctx)
+     return CrawlAdapter(base, defaults)
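
Putting the pieces together (the log line shown is approximate, reconstructed from the `kv` format string):

from wxpath.util.logging import configure_logging, get_logger

configure_logging("DEBUG")
log = get_logger("wxpath.crawler", op="fetch")
log.info("page fetched", extra={"url": "https://example.com", "depth": 2})
# 2025-01-01 12:00:00,000 [I] wxpath.crawler | <module> | page fetched | depth=2 op=fetch url=https://example.com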
@@ -0,0 +1,22 @@
+ from wxpath.core.ops import WxStr
+
+
+ def simplify(obj):
+     """
+     Recursively convert custom wrapper types (e.g., WxStr / ExtractedStr)
+     and nested containers into plain built-in Python types so that printing
+     or JSON serialising shows clean values.
+     """
+     # Scalars
+     if isinstance(obj, WxStr):
+         return str(obj)
+
+     # Mapping
+     if isinstance(obj, dict):
+         return {k: simplify(v) for k, v in obj.items()}
+
+     # Sequence (but not str/bytes)
+     if isinstance(obj, (list, tuple, set)):
+         return type(obj)(simplify(v) for v in obj)
+
+     return obj
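
For illustration (the module's import path is not shown in this diff, so the example assumes `simplify` is in scope, and that `WxStr` is a `str` subclass constructible from a plain string):

from wxpath.core.ops import WxStr

data = {"title": WxStr("Hello"), "tags": [WxStr("a"), WxStr("b")]}
print(simplify(data))
# {'title': 'Hello', 'tags': ['a', 'b']}  (plain str, dict, and list throughout)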