wxpath 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/http/stats.py ADDED
@@ -0,0 +1,96 @@
+ """
+ aiohttp request statistics and tracing hooks.
+ """
+
+ import time
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from aiohttp import TraceConfig
+
+
+ @dataclass
+ class CrawlerStats:
+     # ---- Lifecycle counts ----
+     requests_enqueued: int = 0
+     requests_started: int = 0
+     requests_completed: int = 0
+
+     # ---- Concurrency ----
+     in_flight_global: int = 0
+     in_flight_per_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # ---- Queueing ----
+     queue_size: int = 0
+     queue_wait_time_total: float = 0.0
+
+     # ---- Throttling ----
+     throttle_waits: int = 0
+     throttle_wait_time: float = 0.0
+     throttle_waits_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+     # ---- Latency feedback ----
+     latency_samples: int = 0
+     latency_ewma: float = 0.0
+     min_latency: Optional[float] = None
+     max_latency: Optional[float] = None
+
+     # ---- Errors / retries ----
+     retries_scheduled: int = 0
+     retries_executed: int = 0
+     errors_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+
+ def build_trace_config(stats: CrawlerStats) -> TraceConfig:
+     """
+     Returns an aiohttp TraceConfig wired to the given stats instance.
+     Tracks detailed per-request, per-host, and queue/throttle metrics.
+     """
+     trace = TraceConfig()
+
+     async def on_request_start(session, context, params):
+         stats.requests_started += 1
+         stats.in_flight_global += 1
+         host = params.url.host
+         stats.in_flight_per_host[host] += 1
+         context._start_time = time.monotonic()
+
+     async def on_request_end(session, context, params):
+         host = params.url.host
+         stats.in_flight_global -= 1
+         stats.in_flight_per_host[host] -= 1
+
+         latency = time.monotonic() - context._start_time
+         stats.latency_samples += 1
+         # EWMA update: alpha = 0.3
+         alpha = 0.3
+         stats.latency_ewma = (alpha * latency) + ((1 - alpha) * stats.latency_ewma)
+         stats.min_latency = latency if stats.min_latency is None \
+             else min(stats.min_latency, latency)
+         stats.max_latency = latency if stats.max_latency is None \
+             else max(stats.max_latency, latency)
+
+         status = getattr(params.response, "status", None)
+         if status is not None:
+             if not hasattr(stats, "status_counts"):
+                 stats.status_counts = defaultdict(int)
+             stats.status_counts[status] += 1
+
+         content_length = getattr(params.response, "content_length", None)
+         if content_length:
+             if not hasattr(stats, "bytes_received"):
+                 stats.bytes_received = 0
+             stats.bytes_received += content_length
+
+     async def on_request_exception(session, context, params):
+         host = params.url.host
+         stats.in_flight_global -= 1
+         stats.in_flight_per_host[host] -= 1
+         stats.errors_by_host[host] += 1
+
+     trace.on_request_start.append(on_request_start)
+     trace.on_request_end.append(on_request_end)
+     trace.on_request_exception.append(on_request_exception)
+
+     return trace
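
For orientation, a minimal usage sketch (illustrative only, not part of the released code): `build_trace_config` returns a standard `aiohttp.TraceConfig`, so it can be attached to any `ClientSession` via `trace_configs`.

```python
import asyncio

import aiohttp

from wxpath.http.stats import CrawlerStats, build_trace_config


async def main() -> None:
    stats = CrawlerStats()
    trace = build_trace_config(stats)
    # aiohttp fires on_request_start/end/exception for every request
    # made through this session, mutating `stats` in place.
    async with aiohttp.ClientSession(trace_configs=[trace]) as session:
        async with session.get("https://example.org") as resp:
            await resp.read()
    print(stats.requests_started, stats.latency_ewma, stats.min_latency)


asyncio.run(main())
```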
wxpath/patches.py ADDED
@@ -0,0 +1,63 @@
+ import elementpath
+ from elementpath.xpath3 import XPath3Parser
+ from lxml import etree, html
+
+
+ def html_element_repr(self):
+     return (f"HtmlElement(tag={self.tag}, "
+             f"depth={self.get('depth', -1)}, "
+             f"base_url={getattr(self, 'base_url', None)!r})")
+
+ # Patch lxml.html.HtmlElement.__repr__ to improve debugging with base_url.
+ html.HtmlElement.__repr__ = html_element_repr
+
+
+ class XPath3Element(etree.ElementBase):
+     def xpath3(self, expr, **kwargs):
+         """
+         Evaluate an XPath 3 expression using elementpath library,
+         returning the results as a list.
+         """
+         kwargs.setdefault("parser", XPath3Parser)
+         kwargs.setdefault(
+             "uri",
+             getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
+         )
+         return elementpath.select(self, expr, **kwargs)
+
+     # --- Convenience property for backward-compatibility -----------------
+     @property
+     def base_url(self):
+         # 1) Per-element override (keeps our "multiple base URLs" feature)
+         url = self.get("base_url")
+         if url is not None:
+             return url
+         # 2) Fall back to document URL (O(1))
+         return self.getroottree().docinfo.URL
+
+     @base_url.setter
+     def base_url(self, value):
+         # Keep the per-element attribute (used by our crawler)
+         self.set("base_url", value)
+         # Set xml:base attribute so XPath base-uri() picks it up
+         self.set("{http://www.w3.org/XML/1998/namespace}base", value)
+         # Also store on the document so descendants can fetch it quickly
+         self.getroottree().docinfo.URL = value
+
+     @property
+     def depth(self):
+         return int(self.get("depth", -1))
+
+     @depth.setter
+     def depth(self, value):
+         self.set("depth", str(value))
+
+ # Create and register custom parser that returns XPath3Element instances
+ lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
+ parser = etree.HTMLParser()
+ parser.set_element_class_lookup(lookup)
+
+
+ # Expose parser for use in parse_html
+ html_parser_with_xpath3 = parser
+ html.HtmlElement.xpath3 = XPath3Element.xpath3
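
A short sketch of assumed usage (the HTML snippet is made up): parse with the exposed `html_parser_with_xpath3`, then query via `xpath3`.

```python
from lxml import etree

from wxpath.patches import html_parser_with_xpath3

root = etree.fromstring(
    "<html><body><p class='lead'>hello</p></body></html>",
    parser=html_parser_with_xpath3,
)
# The class lookup above makes every element an XPath3Element, so
# xpath3() is available and delegates to elementpath's XPath3Parser.
print(root.xpath3("//p[@class='lead']/text()"))  # -> ['hello']
```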
wxpath/util/logging.py ADDED
@@ -0,0 +1,91 @@
+ import logging
+ from logging.config import dictConfig
+ from typing import Any, Mapping
+
+
+ class KeyValueFormatter(logging.Formatter):
+     """
+     Formatter that automatically renders any 'extra' context added to the record
+     as key=value pairs at the end of the log line.
+     """
+     # Reserved keys that already exist in LogRecord and shouldn't be printed again
+     _RESERVED = {
+         'args', 'asctime', 'created', 'exc_info', 'exc_text', 'filename',
+         'funcName', 'levelname', 'levelno', 'lineno', 'message', 'module',
+         'msecs', 'msg', 'name', 'pathname', 'process', 'processName',
+         'relativeCreated', 'stack_info', 'thread', 'threadName', 'taskName'
+     }
+
+     def format(self, record: logging.LogRecord) -> str:
+         # 1. Format the standard message first
+         s = super().format(record)
+
+         # 2. Find all 'extra' keys
+         extras = {k: v for k, v in record.__dict__.items() if k not in self._RESERVED}
+
+         # 3. Append them as key=value
+         if extras:
+             # Sort for deterministic logs
+             context_str = " ".join(f"{k}={v}" for k, v in sorted(extras.items()))
+             s = f"{s} | {context_str}"
+
+         return s
+
+
+ _DEFAULT_LOGGING_CONF = {
+     "version": 1,
+     "disable_existing_loggers": False,
+     "formatters": {
+         "kv": {
+             # Note: We use the class path to our custom class
+             "()": KeyValueFormatter,
+             "format": "%(asctime)s [%(levelname).1s] %(name)s | %(funcName)s | %(message)s"
+         }
+     },
+     "handlers": {
+         "stderr": {
+             "class": "logging.StreamHandler",
+             "formatter": "kv",
+         }
+     },
+     "loggers": {
+         "wxpath": {"level": "INFO", "handlers": ["stderr"]},
+     },
+ }
+
+ def configure_logging(level: str | int = "INFO", **overrides) -> None:
+     """
+     Configure wxpath's logger.
+
+     Call this once in an application entry-point **or** rely on defaults.
+
+     Parameters
+     ----------
+     level
+         "DEBUG"|"INFO"|... or `logging.DEBUG`, overrides the root wxpath logger.
+     overrides
+         Dict that is merged (shallow) into the default dictConfig.
+         Lets advanced users swap formatters/handlers.
+     """
+     conf = {**_DEFAULT_LOGGING_CONF, **overrides}
+     conf["loggers"]["wxpath"]["level"] = level
+     dictConfig(conf)
+
+
+ class CrawlAdapter(logging.LoggerAdapter):
+     """
+     Inject crawl context (depth, op, url) so the handler/formatter
+     never needs to know scraping internals.
+     """
+     def process(self, msg: str, kwargs: Mapping[str, Any]):
+         extra = self.extra.copy()
+         extra.update(kwargs.pop("extra", {}))
+         kwargs["extra"] = extra
+         return msg, kwargs
+
+ def get_logger(name: str, **ctx) -> CrawlAdapter:
+     base = logging.getLogger(name)
+     # default placeholders so formatter never blows up
+     defaults = {"depth": "-", "op": "-", "url": "-"}
+     defaults.update(ctx)
+     return CrawlAdapter(base, defaults)
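
A minimal sketch of how these pieces are presumably meant to combine (logger name and context values are invented for illustration):

```python
from wxpath.util.logging import configure_logging, get_logger

configure_logging(level="DEBUG")

# Context given here is injected as `extra` on every record, and
# KeyValueFormatter renders it as trailing key=value pairs, roughly:
# "... wxpath.demo | <module> | fetching | depth=0 op=url url=https://example.org"
log = get_logger("wxpath.demo", depth=0, op="url", url="https://example.org")
log.info("fetching")
```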
wxpath/util/serialize.py ADDED
@@ -0,0 +1,22 @@
+ from wxpath.core.ops import WxStr
+
+
+ def simplify(obj):
+     """
+     Recursively convert custom wrapper types (e.g., WxStr / ExtractedStr,
+     lxml elements) into plain built-in Python types so that printing or
+     JSON serialising shows clean values.
+     """
+     # Scalars
+     if isinstance(obj, WxStr):
+         return str(obj)
+
+     # Mapping
+     if isinstance(obj, dict):
+         return {k: simplify(v) for k, v in obj.items()}
+
+     # Sequence (but not str/bytes)
+     if isinstance(obj, (list, tuple, set)):
+         return type(obj)(simplify(v) for v in obj)
+
+     return obj
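
An illustrative round-trip, assuming `WxStr` can be constructed directly from a plain `str` (unverified here):

```python
import json

from wxpath.core.ops import WxStr
from wxpath.util.serialize import simplify

item = {"title": WxStr("Expression language"), "tags": [WxStr("xpath")]}

# simplify() unwraps WxStr values recursively, leaving only built-in
# types that json.dumps can serialise directly.
print(json.dumps(simplify(item)))
```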
wxpath-0.1.0.dist-info/METADATA → wxpath-0.2.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: wxpath
- Version: 0.1.0
+ Version: 0.2.0
  Summary: wxpath - a declarative web crawler and data extractor
  Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
  License-Expression: MIT
@@ -9,11 +9,13 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests>=2.0
  Requires-Dist: lxml>=4.0
- Requires-Dist: elementpath>=5.0.0
- Requires-Dist: aiohttp>=3.8.0
+ Requires-Dist: elementpath<=5.0.3,>=5.0.0
+ Requires-Dist: aiohttp<=3.12.15,>=3.8.0
  Provides-Extra: test
  Requires-Dist: pytest>=7.0; extra == "test"
  Requires-Dist: pytest-asyncio>=0.23; extra == "test"
+ Provides-Extra: dev
+ Requires-Dist: ruff; extra == "dev"
  Dynamic: license-file


@@ -25,10 +27,11 @@ By introducing the `url(...)` operator and the `///` syntax, **wxpath**'s engine

  NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).

+
  ## Contents

  - [Example](#example)
- - [`url(...)` and `///` Explained](#url-and---explained)
+ - [`url(...)` and `///url(...)` Explained](#url-and---explained)
  - [General flow](#general-flow)
  - [Asynchronous Crawling](#asynchronous-crawling)
  - [Output types](#output-types)
@@ -37,11 +40,13 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Hooks (Experimental)](#hooks-experimental)
  - [Install](#install)
  - [More Examples](#more-examples)
+ - [Comparisons](#comparisons)
  - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
  - [Project Philosophy](#project-philosophy)
  - [Warnings](#warnings)
  - [License](#license)

+
  ## Example

  ```python
@@ -49,7 +54,7 @@ import wxpath

  path = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))]/url(.)
+ ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
  /map{
      'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
      'url':string(base-uri(.)),
@@ -84,10 +89,11 @@ The above expression does the following:
  4. Streams the extracted data as it is discovered.


- ## `url(...)` and `///` Explained
+ ## `url(...)` and `///url(...)` Explained

  - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
- - `///` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+ - `///url(...)` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+

  ## General flow

@@ -97,14 +103,13 @@ The above expression does the following:

  XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).

- `///` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.
+ `///url(...)` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.

  Results are yielded as soon as they are ready.


  ## Asynchronous Crawling

-
  **wxpath** is `asyncio/aiohttp`-first, providing an asynchronous API for crawling and extracting data.

  ```python
@@ -114,7 +119,7 @@ from wxpath import wxpath_async
  items = []

  async def main():
-     path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(@href[starts-with(., '/wiki/')])//a/@href"
+     path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
      async for item in wxpath_async(path_expr, max_depth=1):
          items.append(item)

@@ -123,16 +128,16 @@ asyncio.run(main())

  ### Blocking, Concurrent Requests

-
  **wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.

  ```python
  from wxpath import wxpath_async_blocking_iter

- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(@href[starts-with(., '/wiki/')])//a/@href"
+ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
  ```

+
  ## Output types

  The wxpath Python API yields structured objects, not just strings.
@@ -156,7 +161,7 @@ The Python API preserves structure by default.
  ```python
  path_expr = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///div[@id='mw-content-text']//a/url(@href)
+ ///url(//div[@id='mw-content-text']//a/@href)
  /map{
      'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
      'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
@@ -176,15 +181,18 @@ path_expr = """
  # ...]
  ```

+
  ## CLI

  **wxpath** provides a command-line interface (CLI) to quickly experiment and execute wxpath expressions directly from the terminal.

+ The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.
+
+ WARNING: Due to the ever-changing nature of web content, the output may vary over time.
  ```bash
  > wxpath --depth 1 "\
  url('https://en.wikipedia.org/wiki/Expression_language')\
- ///div[@id='mw-content-text'] \
- //a/url(@href[starts-with(., '/wiki/') \
+ ///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
  and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
  /map{ \
      'title':(//span[contains(@class, 'mw-page-title-main')]/text())[1], \
@@ -256,90 +264,13 @@ pip install wxpath

  ## More Examples

- ```python
- import wxpath
+ See [EXAMPLES.md](EXAMPLES.md) for more usage examples.

- #### EXAMPLE 1 - Simple, single page crawl and link extraction #######
- #
- # Starting from Expression language's wiki, extract all links (hrefs)
- # from the main section. The `url(...)` operator is used to execute a
- # web request to the specified URL and return the HTML content.
- #
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"
-
- items = wxpath.wxpath_async_blocking(path_expr)
-
-
- #### EXAMPLE 2 - Two-deep crawl and link extraction ##################
- #
- # Starting from Expression language's wiki, crawl all child links
- # starting with '/wiki/', and extract each child's links (hrefs). The
- # `url(...)` operator is pipe'd arguments from the evaluated XPath.
- #
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(@href[starts-with(., '/wiki/')])//a/@href"
-
- #### EXAMPLE 3 - Infinite crawl with BFS tree depth limit ############
- #
- # Starting from Expression language's wiki, infinitely crawl all child
- # links (and child's child's links recursively). The `///` syntax is
- # used to indicate an infinite crawl.
- # Returns lxml.html.HtmlElement objects.
- #
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
-
- # The same expression written differently:
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"
-
- # Modify (inclusive) max_depth to limit the BFS tree (crawl depth).
- items = wxpath.wxpath_async_blocking(path_expr, max_depth=1)
-
- #### EXAMPLE 4 - Infinite crawl with field extraction ################
- #
- # Infinitely crawls Expression language's wiki's child links and
- # childs' child links (recursively) and then, for each child link
- # crawled, extracts objects with the named fields as a dict.
- #
- path_expr = """
- url('https://en.wikipedia.org/wiki/Expression_language')
- ///main//a/url(@href)
- /map {
-     'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-     'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
-     'url'://link[@rel='canonical']/@href[1],
-     'backlink':wx:backlink(.),
-     'depth':wx:depth(.)
- }
- """

- # Under the hood of wxpath.core.wxpath, we generate `segments` list,
- # revealing the operations executed to accomplish the crawl.
- # >> segments = wxpath.core.parser.parse_wxpath_expr(path_expr);
- # >> segments
- # [Segment(op='url', value='https://en.wikipedia.org/wiki/Expression_language'),
- # Segment(op='url_inf', value='///url(//main//a/@href)'),
- # Segment(op='xpath', value='/map { \'title\':(//span[contains(@class, "mw-page-title-main")]/text())[1], \'short_description\':(//div[contains(@class, "shortdescription")]/text())[1], \'url\'://link[@rel=\'canonical\']/@href[1] }')]
-
- #### EXAMPLE 5 = Seeding from XPath function expression + mapping operator (`!`)
- #
- # Functionally create 10 Amazon book search result page URLs, map each URL to
- # the url(.) operator, and for each page, extract the title, price, and link of
- # each book listed.
- #
- base_url = "https://www.amazon.com/s?k=books&i=stripbooks&page="
-
- path_expr = f"""
- (1 to 10) ! ('{base_url}' || .) !
- url(.)
- //span[@data-component-type='s-search-results']//*[@role='listitem']
- /map {{
-     'title': (.//h2/span/text())[1],
-     'price': (.//span[@class='a-price']/span[@class='a-offscreen']/text())[1],
-     'link': (.//a[@aria-describedby='price-link']/@href)[1]
- }}
- """
+
+ ## Comparisons
+
+ See [COMPARISONS.md](COMPARISONS.md) for comparisons with other web-scraping tools.

- items = list(wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1))
- ```

  ## Advanced: Engine & Crawler Configuration

@@ -364,7 +295,7 @@ engine = WXPathEngine(
      crawler=crawler,
  )

- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
+ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//main//a/@href)"

  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  ```
@@ -392,6 +323,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Automatic proxy rotation
  - Browser-based rendering (JavaScript execution)

+
  ## WARNINGS!!!

  - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
@@ -399,6 +331,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
  - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.

+
  ## License

  MIT
wxpath-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,33 @@
+ wxpath/__init__.py,sha256=w1hFE_VSIYq_TSFLoPfp6MJbG1sA6BeChX6PYsXIK4o,265
+ wxpath/cli.py,sha256=CHOFWH_WHsJ30aItIQw9c5jzjl2Y64DmW2K942OGwpo,1668
+ wxpath/patches.py,sha256=u0dOL-K-gvdO9SJvzGrqR9Zou6XduWjl6R7mzIcZtJg,2130
+ wxpath/core/__init__.py,sha256=U9_In2iRaZrpiIVavIli1M59gCB6Kn1en-1Fza-qIiI,257
+ wxpath/core/dom.py,sha256=X0L3n8jRfO5evEypDaJTD-NQ3cLXWvnEUVERAHo3vV0,701
+ wxpath/core/errors.py,sha256=q56Gs5JJSC4HKImUtdZhOHcqe8XsoIrVhsaaoJ2qhCQ,4198
+ wxpath/core/models.py,sha256=3KYt-UwfLY2FlSRUHeA_getnYaNUMPW9wRrl2CRbPso,1611
+ wxpath/core/ops.py,sha256=8hc8VTqsxGFpizOyPTgzxjc8Y5srHd2aaOugQ9fJ3sE,8918
+ wxpath/core/parser.py,sha256=0VQCkuznd4dYYzEeTAMFs1L2SmvTgSp1JWz-Um0uEjM,9911
+ wxpath/core/runtime/__init__.py,sha256=_iCgkIWxXvxzQcenHOsjYGsk74HboTIYWOtgM8GtCyc,86
+ wxpath/core/runtime/engine.py,sha256=Pn5wzPkBwp8bq48Ie0O0DVQzUFEAAzWIj1PHgChm2bo,10825
+ wxpath/core/runtime/helpers.py,sha256=NCL4Wl8Hpc1VTfERSthCen9wlVd5J0eS8th4gqEPmRg,1578
+ wxpath/hooks/__init__.py,sha256=9JG63e4z_8CZLWugFcY786hebaEEPZ5FmZhyDHat-98,294
+ wxpath/hooks/builtin.py,sha256=GJ4w1C9djWNzAmAA3U0qI9OoCOeC5R8tEGtWXJVHSYs,4125
+ wxpath/hooks/registry.py,sha256=q4MxYwDUv7LH4-WJGO_unXbBRFXXxsBCU4vU1co0gC4,4136
+ wxpath/http/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ wxpath/http/stats.py,sha256=FrXbFrnms113Gapf-Z5WiD5qaNiJ0XuOqjSQhwXfuEo,3172
+ wxpath/http/client/__init__.py,sha256=QpdmqzcznUeuFvT3IIo-LmBUUHEa2BDq9sHGAHJnDLI,202
+ wxpath/http/client/crawler.py,sha256=hN7EJXP102nsMA9ipaNPc9fWwDVpm_LJdGo6LSlAQp0,6996
+ wxpath/http/client/request.py,sha256=3nwwPQ2e_WycJQnSA6QieWJ2q3qg40jkGrp2NUDPsLI,888
+ wxpath/http/client/response.py,sha256=mDo3FswiVnulV1l5qjio5OQpGlT0-tfkR7daPSgSUuE,324
+ wxpath/http/policy/backoff.py,sha256=NwdUR6bRe1RtUGSJOktj-p8IyC1l9xu_-Aa_Gj_u5sw,321
+ wxpath/http/policy/retry.py,sha256=WSrQfCy1F7IcXFpVGDi4HTphNhFq12p4DaMO0_4dgrw,982
+ wxpath/http/policy/throttler.py,sha256=wydMFV-0mxpHSI5iYkLfE78oY4z_fF8jW9MqCeb8G54,3014
+ wxpath/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ wxpath/util/logging.py,sha256=oQi8sp7yKWgXkkcJ4U4WHp7TyBCQiK4VhSXOSb8pGw0,2965
+ wxpath/util/serialize.py,sha256=uUs4C9VErpFd97smBM2bRWo2nW25kCgKdsMrVtVxhg8,575
+ wxpath-0.2.0.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
+ wxpath-0.2.0.dist-info/METADATA,sha256=6CdIcq82gNqvXVIpBzhGCk_Q0eqDvok1JmEKWQkFals,14662
+ wxpath-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ wxpath-0.2.0.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
+ wxpath-0.2.0.dist-info/top_level.txt,sha256=uFCcveG78mnefxRGvYsR2OexDlKR_Z1UD4vZijUcex8,7
+ wxpath-0.2.0.dist-info/RECORD,,
wxpath-0.2.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ wxpath
wxpath-0.1.0.dist-info/RECORD REMOVED
@@ -1,6 +0,0 @@
- wxpath-0.1.0.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
- wxpath-0.1.0.dist-info/METADATA,sha256=Nf5dRmDU09BNwxFOxDM_nEdezRp5CA34lLD2oEA2aI4,17663
- wxpath-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- wxpath-0.1.0.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
- wxpath-0.1.0.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
- wxpath-0.1.0.dist-info/RECORD,,
wxpath-0.1.0.dist-info/top_level.txt REMOVED
@@ -1 +0,0 @@
-