wxpath 0.1.1-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +9 -0
- wxpath/cli.py +52 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/errors.py +134 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +244 -0
- wxpath/core/parser.py +319 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +315 -0
- wxpath/core/runtime/helpers.py +48 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +133 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/crawler.py +196 -0
- wxpath/http/client/request.py +35 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +96 -0
- wxpath/patches.py +63 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- {wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/METADATA +28 -97
- wxpath-0.2.0.dist-info/RECORD +33 -0
- wxpath-0.2.0.dist-info/top_level.txt +1 -0
- wxpath-0.1.1.dist-info/RECORD +0 -6
- wxpath-0.1.1.dist-info/top_level.txt +0 -1
- {wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/WHEEL +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/licenses/LICENSE +0 -0
wxpath/http/stats.py
ADDED
@@ -0,0 +1,96 @@
+"""
+aiohttp request statistics and tracing hooks.
+"""
+
+import time
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Optional
+
+from aiohttp import TraceConfig
+
+
+@dataclass
+class CrawlerStats:
+    # ---- Lifecycle counts ----
+    requests_enqueued: int = 0
+    requests_started: int = 0
+    requests_completed: int = 0
+
+    # ---- Concurrency ----
+    in_flight_global: int = 0
+    in_flight_per_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+    # ---- Queueing ----
+    queue_size: int = 0
+    queue_wait_time_total: float = 0.0
+
+    # ---- Throttling ----
+    throttle_waits: int = 0
+    throttle_wait_time: float = 0.0
+    throttle_waits_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+    # ---- Latency feedback ----
+    latency_samples: int = 0
+    latency_ewma: float = 0.0
+    min_latency: Optional[float] = None
+    max_latency: Optional[float] = None
+
+    # ---- Errors / retries ----
+    retries_scheduled: int = 0
+    retries_executed: int = 0
+    errors_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+
+
+def build_trace_config(stats: CrawlerStats) -> TraceConfig:
+    """
+    Returns an aiohttp TraceConfig wired to the given stats instance.
+    Tracks detailed per-request, per-host, and queue/throttle metrics.
+    """
+    trace = TraceConfig()
+
+    async def on_request_start(session, context, params):
+        stats.requests_started += 1
+        stats.in_flight_global += 1
+        host = params.url.host
+        stats.in_flight_per_host[host] += 1
+        context._start_time = time.monotonic()
+
+    async def on_request_end(session, context, params):
+        host = params.url.host
+        stats.in_flight_global -= 1
+        stats.in_flight_per_host[host] -= 1
+
+        latency = time.monotonic() - context._start_time
+        stats.latency_samples += 1
+        # EWMA update: alpha = 0.3
+        alpha = 0.3
+        stats.latency_ewma = (alpha * latency) + ((1 - alpha) * stats.latency_ewma)
+        stats.min_latency = latency if stats.min_latency is None \
+            else min(stats.min_latency, latency)
+        stats.max_latency = latency if stats.max_latency is None \
+            else max(stats.max_latency, latency)
+
+        status = getattr(params.response, "status", None)
+        if status is not None:
+            if not hasattr(stats, "status_counts"):
+                stats.status_counts = defaultdict(int)
+            stats.status_counts[status] += 1
+
+        content_length = getattr(params.response, "content_length", None)
+        if content_length:
+            if not hasattr(stats, "bytes_received"):
+                stats.bytes_received = 0
+            stats.bytes_received += content_length
+
+    async def on_request_exception(session, context, params):
+        host = params.url.host
+        stats.in_flight_global -= 1
+        stats.in_flight_per_host[host] -= 1
+        stats.errors_by_host[host] += 1
+
+    trace.on_request_start.append(on_request_start)
+    trace.on_request_end.append(on_request_end)
+    trace.on_request_exception.append(on_request_exception)
+
+    return trace
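For orientation, a minimal sketch of how the new stats module is wired into a session. Only `CrawlerStats`, `build_trace_config`, and aiohttp's standard `trace_configs` parameter come from the code above; the URL and the printed fields are illustrative.

```python
import asyncio

import aiohttp

from wxpath.http.stats import CrawlerStats, build_trace_config


async def fetch_with_stats() -> None:
    stats = CrawlerStats()
    trace = build_trace_config(stats)
    # trace_configs is standard aiohttp; every request made through this session
    # runs the on_request_start/end/exception hooks registered above.
    async with aiohttp.ClientSession(trace_configs=[trace]) as session:
        async with session.get("https://example.com") as resp:  # illustrative URL
            await resp.read()
    print(stats.requests_started, stats.in_flight_global, round(stats.latency_ewma, 3))


asyncio.run(fetch_with_stats())
```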
wxpath/patches.py
ADDED
@@ -0,0 +1,63 @@
+import elementpath
+from elementpath.xpath3 import XPath3Parser
+from lxml import etree, html
+
+
+def html_element_repr(self):
+    return (f"HtmlElement(tag={self.tag}, "
+            f"depth={self.get('depth', -1)}, "
+            f"base_url={getattr(self, 'base_url', None)!r})")
+
+# Patch lxml.html.HtmlElement.__repr__ to improve debugging with base_url.
+html.HtmlElement.__repr__ = html_element_repr
+
+
+class XPath3Element(etree.ElementBase):
+    def xpath3(self, expr, **kwargs):
+        """
+        Evaluate an XPath 3 expression using elementpath library,
+        returning the results as a list.
+        """
+        kwargs.setdefault("parser", XPath3Parser)
+        kwargs.setdefault(
+            "uri",
+            getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
+        )
+        return elementpath.select(self, expr, **kwargs)
+
+    # --- Convenience property for backward-compatibility -----------------
+    @property
+    def base_url(self):
+        # 1) Per-element override (keeps our “multiple base URLs” feature)
+        url = self.get("base_url")
+        if url is not None:
+            return url
+        # 2) Fall back to document URL (O(1))
+        return self.getroottree().docinfo.URL
+
+    @base_url.setter
+    def base_url(self, value):
+        # Keep the per-element attribute (used by our crawler)
+        self.set("base_url", value)
+        # Set xml:base attribute so XPath base-uri() picks it up
+        self.set("{http://www.w3.org/XML/1998/namespace}base", value)
+        # Also store on the document so descendants can fetch it quickly
+        self.getroottree().docinfo.URL = value
+
+    @property
+    def depth(self):
+        return int(self.get("depth", -1))
+
+    @depth.setter
+    def depth(self, value):
+        self.set("depth", str(value))
+
+# Create and register custom parser that returns XPath3Element instances
+lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
+parser = etree.HTMLParser()
+parser.set_element_class_lookup(lookup)
+
+
+# Expose parser for use in parse_html
+html_parser_with_xpath3 = parser
+html.HtmlElement.xpath3 = XPath3Element.xpath3
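A minimal sketch of what this patch enables, using only names defined above (`html_parser_with_xpath3`, `xpath3()`); the sample markup and expression are illustrative.

```python
from lxml import etree

from wxpath.patches import html_parser_with_xpath3  # importing applies the patches

# Parsing with the patched parser yields XPath3Element nodes, so XPath 3
# expressions (via elementpath's XPath3Parser) are available as .xpath3().
doc = etree.fromstring(
    "<html><body><a href='/wiki/XPath'>XPath</a></body></html>",
    html_parser_with_xpath3,
)
print(doc.xpath3("//a/@href"))  # attribute values selected via XPath 3
```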
wxpath/util/__init__.py
ADDED
File without changes
wxpath/util/logging.py
ADDED
@@ -0,0 +1,91 @@
+import logging
+from logging.config import dictConfig
+from typing import Any, Mapping
+
+
+class KeyValueFormatter(logging.Formatter):
+    """
+    Formatter that automatically renders any 'extra' context added to the record
+    as key=value pairs at the end of the log line.
+    """
+    # Reserved keys that already exist in LogRecord and shouldn't be printed again
+    _RESERVED = {
+        'args', 'asctime', 'created', 'exc_info', 'exc_text', 'filename',
+        'funcName', 'levelname', 'levelno', 'lineno', 'message', 'module',
+        'msecs', 'msg', 'name', 'pathname', 'process', 'processName',
+        'relativeCreated', 'stack_info', 'thread', 'threadName', 'taskName'
+    }
+
+    def format(self, record: logging.LogRecord) -> str:
+        # 1. Format the standard message first
+        s = super().format(record)
+
+        # 2. Find all 'extra' keys
+        extras = {k: v for k, v in record.__dict__.items() if k not in self._RESERVED}
+
+        # 3. Append them as key=value
+        if extras:
+            # Sort for deterministic logs
+            context_str = " ".join(f"{k}={v}" for k, v in sorted(extras.items()))
+            s = f"{s} | {context_str}"
+
+        return s
+
+
+_DEFAULT_LOGGING_CONF = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "kv": {
+            # Note: We use the class path to our custom class
+            "()": KeyValueFormatter,
+            "format": "%(asctime)s [%(levelname).1s] %(name)s | %(funcName)s | %(message)s"
+        }
+    },
+    "handlers": {
+        "stderr": {
+            "class": "logging.StreamHandler",
+            "formatter": "kv",
+        }
+    },
+    "loggers": {
+        "wxpath": {"level": "INFO", "handlers": ["stderr"]},
+    },
+}
+
+def configure_logging(level: str | int = "INFO", **overrides) -> None:
+    """
+    Configure wxpath's logger.
+
+    Call this once in an application entry-point **or** rely on defaults.
+
+    Parameters
+    ----------
+    level
+        "DEBUG"|"INFO"|... or `logging.DEBUG`, overrides the root wxpath logger.
+    overrides
+        Dict that is merged (shallow) into the default dictConfig.
+        Lets advanced users swap formatters/handlers.
+    """
+    conf = {**_DEFAULT_LOGGING_CONF, **overrides}
+    conf["loggers"]["wxpath"]["level"] = level
+    dictConfig(conf)
+
+
+class CrawlAdapter(logging.LoggerAdapter):
+    """
+    Inject crawl context (depth, op, url) so the handler/formatter
+    never needs to know scraping internals.
+    """
+    def process(self, msg: str, kwargs: Mapping[str, Any]):
+        extra = self.extra.copy()
+        extra.update(kwargs.pop("extra", {}))
+        kwargs["extra"] = extra
+        return msg, kwargs
+
+def get_logger(name: str, **ctx) -> CrawlAdapter:
+    base = logging.getLogger(name)
+    # default placeholders so formatter never blows up
+    defaults = {"depth": "-", "op": "-", "url": "-"}
+    defaults.update(ctx)
+    return CrawlAdapter(base, defaults)
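A short usage sketch based only on the functions above; the logger name and context values are illustrative.

```python
from wxpath.util.logging import configure_logging, get_logger

configure_logging("DEBUG")

# get_logger returns a CrawlAdapter; its keyword context plus any per-call
# `extra` dict is rendered by KeyValueFormatter as trailing key=value pairs.
log = get_logger("wxpath.crawler", op="fetch")
log.info("request finished", extra={"url": "https://example.com", "depth": 2})
# Emits something roughly like:
#   2025-01-01 12:00:00,000 [I] wxpath.crawler | <module> | request finished | depth=2 op=fetch url=https://example.com
```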
wxpath/util/serialize.py
ADDED
@@ -0,0 +1,22 @@
+from wxpath.core.ops import WxStr
+
+
+def simplify(obj):
+    """
+    Recursively convert custom wrapper types (e.g., WxStr / ExtractedStr,
+    lxml elements) into plain built-in Python types so that printing or
+    JSON serialising shows clean values.
+    """
+    # Scalars
+    if isinstance(obj, WxStr):
+        return str(obj)
+
+    # Mapping
+    if isinstance(obj, dict):
+        return {k: simplify(v) for k, v in obj.items()}
+
+    # Sequence (but not str/bytes)
+    if isinstance(obj, (list, tuple, set)):
+        return type(obj)(simplify(v) for v in obj)
+
+    return obj
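A usage sketch for this helper; it assumes `WxStr` is a `str` subclass constructible from a plain string, which the diff does not show.

```python
import json

from wxpath.core.ops import WxStr
from wxpath.util.serialize import simplify

# Assumption: WxStr("...") behaves like str("..."); only the unwrapping via
# simplify() -> str(obj) is shown in the diff above.
record = {"title": WxStr("Expression language"), "links": [WxStr("/wiki/XPath")]}

clean = simplify(record)  # nested WxStr values become plain str
print(json.dumps(clean))
```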
{wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.1.1
+Version: 0.2.0
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
@@ -9,8 +9,8 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests>=2.0
 Requires-Dist: lxml>=4.0
-Requires-Dist: elementpath
-Requires-Dist: aiohttp
+Requires-Dist: elementpath<=5.0.3,>=5.0.0
+Requires-Dist: aiohttp<=3.12.15,>=3.8.0
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
@@ -27,10 +27,11 @@ By introducing the `url(...)` operator and the `///` syntax, **wxpath**'s engine
 
 NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
 
+
 ## Contents
 
 - [Example](#example)
-- [`url(...)` and
+- [`url(...)` and `///url(...)` Explained](#url-and---explained)
 - [General flow](#general-flow)
 - [Asynchronous Crawling](#asynchronous-crawling)
 - [Output types](#output-types)
@@ -39,11 +40,13 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](#more-examples)
+- [Comparisons](#comparisons)
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
 - [License](#license)
 
+
 ## Example
 
 ```python
@@ -51,7 +54,7 @@ import wxpath
 
 path = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))]
+///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
 /map{
     'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
     'url':string(base-uri(.)),
@@ -86,10 +89,11 @@ The above expression does the following:
 4. Streams the extracted data as it is discovered.
 
 
-## `url(...)` and
+## `url(...)` and `///url(...)` Explained
 
 - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
--
+- `///url(...)` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+
 
 ## General flow
 
@@ -99,14 +103,13 @@ The above expression does the following:
 
 XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).
 
-
+`///url(...)` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.
 
 Results are yielded as soon as they are ready.
 
 
 ## Asynchronous Crawling
 
-
 **wxpath** is `asyncio/aiohttp`-first, providing an asynchronous API for crawling and extracting data.
 
 ```python
@@ -116,7 +119,7 @@ from wxpath import wxpath_async
 items = []
 
 async def main():
-    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
     async for item in wxpath_async(path_expr, max_depth=1):
         items.append(item)
 
@@ -125,16 +128,16 @@ asyncio.run(main())
 
 ### Blocking, Concurrent Requests
 
-
 **wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.
 
 ```python
 from wxpath import wxpath_async_blocking_iter
 
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
 ```
 
+
 ## Output types
 
 The wxpath Python API yields structured objects, not just strings.
@@ -158,7 +161,7 @@ The Python API preserves structure by default.
 ```python
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///div[@id='mw-content-text']//a
+///url(//div[@id='mw-content-text']//a/@href)
 /map{
     'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
     'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
@@ -178,15 +181,18 @@ path_expr = """
 #  ...]
 ```
 
+
 ## CLI
 
 **wxpath** provides a command-line interface (CLI) to quickly experiment and execute wxpath expressions directly from the terminal.
 
+The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.
+
+WARNING: Due to the everchanging nature of web content, the output may vary over time.
 ```bash
 > wxpath --depth 1 "\
 url('https://en.wikipedia.org/wiki/Expression_language')\
-///div[@id='mw-content-text'] \
-//a/url(@href[starts-with(., '/wiki/') \
+///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
 and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
 /map{ \
 'title':(//span[contains(@class, 'mw-page-title-main')]/text())[1], \
@@ -258,90 +264,13 @@ pip install wxpath
 
 ## More Examples
 
-
-import wxpath
+See [EXAMPLES.md](EXAMPLES.md) for more usage examples.
 
-#### EXAMPLE 1 - Simple, single page crawl and link extraction #######
-#
-# Starting from Expression language's wiki, extract all links (hrefs)
-# from the main section. The `url(...)` operator is used to execute a
-# web request to the specified URL and return the HTML content.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"
-
-items = wxpath.wxpath_async_blocking(path_expr)
-
-
-#### EXAMPLE 2 - Two-deep crawl and link extraction ##################
-#
-# Starting from Expression language's wiki, crawl all child links
-# starting with '/wiki/', and extract each child's links (hrefs). The
-# `url(...)` operator is pipe'd arguments from the evaluated XPath.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(@href[starts-with(., '/wiki/')])//a/@href"
-
-#### EXAMPLE 3 - Infinite crawl with BFS tree depth limit ############
-#
-# Starting from Expression language's wiki, infinitely crawl all child
-# links (and child's child's links recursively). The `///` syntax is
-# used to indicate an infinite crawl.
-# Returns lxml.html.HtmlElement objects.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
-
-# The same expression written differently:
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"
-
-# Modify (inclusive) max_depth to limit the BFS tree (crawl depth).
-items = wxpath.wxpath_async_blocking(path_expr, max_depth=1)
-
-#### EXAMPLE 4 - Infinite crawl with field extraction ################
-#
-# Infinitely crawls Expression language's wiki's child links and
-# childs' child links (recursively) and then, for each child link
-# crawled, extracts objects with the named fields as a dict.
-#
-path_expr = """
-url('https://en.wikipedia.org/wiki/Expression_language')
-///main//a/url(@href)
-/map {
-    'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-    'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
-    'url'://link[@rel='canonical']/@href[1],
-    'backlink':wx:backlink(.),
-    'depth':wx:depth(.)
-}
-"""
 
-
-
-
-# >> segments
-# [Segment(op='url', value='https://en.wikipedia.org/wiki/Expression_language'),
-#  Segment(op='url_inf', value='///url(//main//a/@href)'),
-#  Segment(op='xpath', value='/map { \'title\':(//span[contains(@class, "mw-page-title-main")]/text())[1], \'short_description\':(//div[contains(@class, "shortdescription")]/text())[1], \'url\'://link[@rel=\'canonical\']/@href[1] }')]
-
-#### EXAMPLE 5 = Seeding from XPath function expression + mapping operator (`!`)
-#
-# Functionally create 10 Amazon book search result page URLs, map each URL to
-# the url(.) operator, and for each page, extract the title, price, and link of
-# each book listed.
-#
-base_url = "https://www.amazon.com/s?k=books&i=stripbooks&page="
-
-path_expr = f"""
-(1 to 10) ! ('{base_url}' || .) !
-url(.)
-//span[@data-component-type='s-search-results']//*[@role='listitem']
-/map {{
-    'title': (.//h2/span/text())[1],
-    'price': (.//span[@class='a-price']/span[@class='a-offscreen']/text())[1],
-    'link': (.//a[@aria-describedby='price-link']/@href)[1]
-}}
-"""
+## Comparisons
+
+See [COMPARISONS.md](COMPARISONS.md) for comparisons with other web-scraping tools.
 
-items = list(wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1))
-```
 
 ## Advanced: Engine & Crawler Configuration
 
@@ -366,7 +295,7 @@ engine = WXPathEngine(
     crawler=crawler,
 )
 
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//main//a/@href)"
 
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
@@ -394,6 +323,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 
+
 ## WARNINGS!!!
 
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
@@ -401,6 +331,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
+
 ## License
 
 MIT
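The README changes above consistently migrate examples from the old bare `///xpath` recursion to the new `///url(...)` form. A minimal sketch of the 0.2.0 form, assembled from snippets in the diff (the exact expression and depth are illustrative):

```python
import wxpath

# 0.2.0 syntax: recursive traversal is written as ///url(<xpath yielding hrefs>),
# bounded here by max_depth=1 as in the README examples.
path_expr = (
    "url('https://en.wikipedia.org/wiki/Expression_language')"
    "///url(//main//a/@href[starts-with(., '/wiki/')])"
    "//title/text()"
)

for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
    print(item)
```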
wxpath-0.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,33 @@
+wxpath/__init__.py,sha256=w1hFE_VSIYq_TSFLoPfp6MJbG1sA6BeChX6PYsXIK4o,265
+wxpath/cli.py,sha256=CHOFWH_WHsJ30aItIQw9c5jzjl2Y64DmW2K942OGwpo,1668
+wxpath/patches.py,sha256=u0dOL-K-gvdO9SJvzGrqR9Zou6XduWjl6R7mzIcZtJg,2130
+wxpath/core/__init__.py,sha256=U9_In2iRaZrpiIVavIli1M59gCB6Kn1en-1Fza-qIiI,257
+wxpath/core/dom.py,sha256=X0L3n8jRfO5evEypDaJTD-NQ3cLXWvnEUVERAHo3vV0,701
+wxpath/core/errors.py,sha256=q56Gs5JJSC4HKImUtdZhOHcqe8XsoIrVhsaaoJ2qhCQ,4198
+wxpath/core/models.py,sha256=3KYt-UwfLY2FlSRUHeA_getnYaNUMPW9wRrl2CRbPso,1611
+wxpath/core/ops.py,sha256=8hc8VTqsxGFpizOyPTgzxjc8Y5srHd2aaOugQ9fJ3sE,8918
+wxpath/core/parser.py,sha256=0VQCkuznd4dYYzEeTAMFs1L2SmvTgSp1JWz-Um0uEjM,9911
+wxpath/core/runtime/__init__.py,sha256=_iCgkIWxXvxzQcenHOsjYGsk74HboTIYWOtgM8GtCyc,86
+wxpath/core/runtime/engine.py,sha256=Pn5wzPkBwp8bq48Ie0O0DVQzUFEAAzWIj1PHgChm2bo,10825
+wxpath/core/runtime/helpers.py,sha256=NCL4Wl8Hpc1VTfERSthCen9wlVd5J0eS8th4gqEPmRg,1578
+wxpath/hooks/__init__.py,sha256=9JG63e4z_8CZLWugFcY786hebaEEPZ5FmZhyDHat-98,294
+wxpath/hooks/builtin.py,sha256=GJ4w1C9djWNzAmAA3U0qI9OoCOeC5R8tEGtWXJVHSYs,4125
+wxpath/hooks/registry.py,sha256=q4MxYwDUv7LH4-WJGO_unXbBRFXXxsBCU4vU1co0gC4,4136
+wxpath/http/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxpath/http/stats.py,sha256=FrXbFrnms113Gapf-Z5WiD5qaNiJ0XuOqjSQhwXfuEo,3172
+wxpath/http/client/__init__.py,sha256=QpdmqzcznUeuFvT3IIo-LmBUUHEa2BDq9sHGAHJnDLI,202
+wxpath/http/client/crawler.py,sha256=hN7EJXP102nsMA9ipaNPc9fWwDVpm_LJdGo6LSlAQp0,6996
+wxpath/http/client/request.py,sha256=3nwwPQ2e_WycJQnSA6QieWJ2q3qg40jkGrp2NUDPsLI,888
+wxpath/http/client/response.py,sha256=mDo3FswiVnulV1l5qjio5OQpGlT0-tfkR7daPSgSUuE,324
+wxpath/http/policy/backoff.py,sha256=NwdUR6bRe1RtUGSJOktj-p8IyC1l9xu_-Aa_Gj_u5sw,321
+wxpath/http/policy/retry.py,sha256=WSrQfCy1F7IcXFpVGDi4HTphNhFq12p4DaMO0_4dgrw,982
+wxpath/http/policy/throttler.py,sha256=wydMFV-0mxpHSI5iYkLfE78oY4z_fF8jW9MqCeb8G54,3014
+wxpath/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxpath/util/logging.py,sha256=oQi8sp7yKWgXkkcJ4U4WHp7TyBCQiK4VhSXOSb8pGw0,2965
+wxpath/util/serialize.py,sha256=uUs4C9VErpFd97smBM2bRWo2nW25kCgKdsMrVtVxhg8,575
+wxpath-0.2.0.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
+wxpath-0.2.0.dist-info/METADATA,sha256=6CdIcq82gNqvXVIpBzhGCk_Q0eqDvok1JmEKWQkFals,14662
+wxpath-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+wxpath-0.2.0.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
+wxpath-0.2.0.dist-info/top_level.txt,sha256=uFCcveG78mnefxRGvYsR2OexDlKR_Z1UD4vZijUcex8,7
+wxpath-0.2.0.dist-info/RECORD,,
wxpath-0.2.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+wxpath
wxpath-0.1.1.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
-wxpath-0.1.1.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
-wxpath-0.1.1.dist-info/METADATA,sha256=-CZQ3N2wjoO2ArbQ5JSdtMtMUrnLwiOGnQMtnBdzleE,17719
-wxpath-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-wxpath-0.1.1.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
-wxpath-0.1.1.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-wxpath-0.1.1.dist-info/RECORD,,
wxpath-0.1.1.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
-

{wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/WHEEL
File without changes

{wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/entry_points.txt
File without changes

{wxpath-0.1.1.dist-info → wxpath-0.2.0.dist-info}/licenses/LICENSE
File without changes