wxpath-0.1.0-py3-none-any.whl → wxpath-0.2.0-py3-none-any.whl
- wxpath/__init__.py +9 -0
- wxpath/cli.py +52 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/errors.py +134 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +244 -0
- wxpath/core/parser.py +319 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +315 -0
- wxpath/core/runtime/helpers.py +48 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +133 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/crawler.py +196 -0
- wxpath/http/client/request.py +35 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +96 -0
- wxpath/patches.py +63 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- {wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/METADATA +30 -97
- wxpath-0.2.0.dist-info/RECORD +33 -0
- wxpath-0.2.0.dist-info/top_level.txt +1 -0
- wxpath-0.1.0.dist-info/RECORD +0 -6
- wxpath-0.1.0.dist-info/top_level.txt +0 -1
- {wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/WHEEL +0 -0
- {wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/licenses/LICENSE +0 -0
wxpath/http/stats.py
ADDED
@@ -0,0 +1,96 @@
"""
aiohttp request statistics and tracing hooks.
"""

import time
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Optional

from aiohttp import TraceConfig


@dataclass
class CrawlerStats:
    # ---- Lifecycle counts ----
    requests_enqueued: int = 0
    requests_started: int = 0
    requests_completed: int = 0

    # ---- Concurrency ----
    in_flight_global: int = 0
    in_flight_per_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))

    # ---- Queueing ----
    queue_size: int = 0
    queue_wait_time_total: float = 0.0

    # ---- Throttling ----
    throttle_waits: int = 0
    throttle_wait_time: float = 0.0
    throttle_waits_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))

    # ---- Latency feedback ----
    latency_samples: int = 0
    latency_ewma: float = 0.0
    min_latency: Optional[float] = None
    max_latency: Optional[float] = None

    # ---- Errors / retries ----
    retries_scheduled: int = 0
    retries_executed: int = 0
    errors_by_host: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))


def build_trace_config(stats: CrawlerStats) -> TraceConfig:
    """
    Returns an aiohttp TraceConfig wired to the given stats instance.
    Tracks detailed per-request, per-host, and queue/throttle metrics.
    """
    trace = TraceConfig()

    async def on_request_start(session, context, params):
        stats.requests_started += 1
        stats.in_flight_global += 1
        host = params.url.host
        stats.in_flight_per_host[host] += 1
        context._start_time = time.monotonic()

    async def on_request_end(session, context, params):
        host = params.url.host
        stats.in_flight_global -= 1
        stats.in_flight_per_host[host] -= 1

        latency = time.monotonic() - context._start_time
        stats.latency_samples += 1
        # EWMA update: alpha = 0.3
        alpha = 0.3
        stats.latency_ewma = (alpha * latency) + ((1 - alpha) * stats.latency_ewma)
        stats.min_latency = latency if stats.min_latency is None \
            else min(stats.min_latency, latency)
        stats.max_latency = latency if stats.max_latency is None \
            else max(stats.max_latency, latency)

        status = getattr(params.response, "status", None)
        if status is not None:
            if not hasattr(stats, "status_counts"):
                stats.status_counts = defaultdict(int)
            stats.status_counts[status] += 1

        content_length = getattr(params.response, "content_length", None)
        if content_length:
            if not hasattr(stats, "bytes_received"):
                stats.bytes_received = 0
            stats.bytes_received += content_length

    async def on_request_exception(session, context, params):
        host = params.url.host
        stats.in_flight_global -= 1
        stats.in_flight_per_host[host] -= 1
        stats.errors_by_host[host] += 1

    trace.on_request_start.append(on_request_start)
    trace.on_request_end.append(on_request_end)
    trace.on_request_exception.append(on_request_exception)

    return trace
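
Reviewer note: the module above only defines counters and hooks; they take effect once the TraceConfig is attached to a session. A minimal sketch (not part of this diff) of how build_trace_config plugs into aiohttp's standard trace_configs mechanism; the URL is illustrative.

import asyncio

import aiohttp

from wxpath.http.stats import CrawlerStats, build_trace_config


async def main():
    stats = CrawlerStats()
    # aiohttp invokes the registered on_request_* hooks for every request
    # made through this session.
    async with aiohttp.ClientSession(trace_configs=[build_trace_config(stats)]) as session:
        async with session.get("https://example.org/") as resp:
            await resp.read()
    print(stats.requests_started, stats.latency_ewma, dict(stats.in_flight_per_host))


asyncio.run(main())
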
wxpath/patches.py
ADDED
@@ -0,0 +1,63 @@
import elementpath
from elementpath.xpath3 import XPath3Parser
from lxml import etree, html


def html_element_repr(self):
    return (f"HtmlElement(tag={self.tag}, "
            f"depth={self.get('depth', -1)}, "
            f"base_url={getattr(self, 'base_url', None)!r})")

# Patch lxml.html.HtmlElement.__repr__ to improve debugging with base_url.
html.HtmlElement.__repr__ = html_element_repr


class XPath3Element(etree.ElementBase):
    def xpath3(self, expr, **kwargs):
        """
        Evaluate an XPath 3 expression using elementpath library,
        returning the results as a list.
        """
        kwargs.setdefault("parser", XPath3Parser)
        kwargs.setdefault(
            "uri",
            getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
        )
        return elementpath.select(self, expr, **kwargs)

    # --- Convenience property for backward-compatibility -----------------
    @property
    def base_url(self):
        # 1) Per-element override (keeps our "multiple base URLs" feature)
        url = self.get("base_url")
        if url is not None:
            return url
        # 2) Fall back to document URL (O(1))
        return self.getroottree().docinfo.URL

    @base_url.setter
    def base_url(self, value):
        # Keep the per-element attribute (used by our crawler)
        self.set("base_url", value)
        # Set xml:base attribute so XPath base-uri() picks it up
        self.set("{http://www.w3.org/XML/1998/namespace}base", value)
        # Also store on the document so descendants can fetch it quickly
        self.getroottree().docinfo.URL = value

    @property
    def depth(self):
        return int(self.get("depth", -1))

    @depth.setter
    def depth(self, value):
        self.set("depth", str(value))

# Create and register custom parser that returns XPath3Element instances
lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
parser = etree.HTMLParser()
parser.set_element_class_lookup(lookup)


# Expose parser for use in parse_html
html_parser_with_xpath3 = parser
html.HtmlElement.xpath3 = XPath3Element.xpath3
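
Reviewer note: a short sketch (not part of this diff) of how the exported parser and the xpath3 method might be used together; the markup and expression are illustrative.

from lxml import etree

from wxpath.patches import html_parser_with_xpath3

# Elements produced by this parser are XPath3Element instances, so XPath 3
# expressions are evaluated through elementpath rather than lxml's XPath 1.0.
doc = etree.fromstring(
    "<html><body><p class='lead'>hello</p></body></html>",
    parser=html_parser_with_xpath3,
)
print(doc.xpath3("//p[@class='lead']/text()"))  # ['hello']
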
wxpath/util/__init__.py
ADDED
File without changes
wxpath/util/logging.py
ADDED
@@ -0,0 +1,91 @@
import logging
from logging.config import dictConfig
from typing import Any, Mapping


class KeyValueFormatter(logging.Formatter):
    """
    Formatter that automatically renders any 'extra' context added to the record
    as key=value pairs at the end of the log line.
    """
    # Reserved keys that already exist in LogRecord and shouldn't be printed again
    _RESERVED = {
        'args', 'asctime', 'created', 'exc_info', 'exc_text', 'filename',
        'funcName', 'levelname', 'levelno', 'lineno', 'message', 'module',
        'msecs', 'msg', 'name', 'pathname', 'process', 'processName',
        'relativeCreated', 'stack_info', 'thread', 'threadName', 'taskName'
    }

    def format(self, record: logging.LogRecord) -> str:
        # 1. Format the standard message first
        s = super().format(record)

        # 2. Find all 'extra' keys
        extras = {k: v for k, v in record.__dict__.items() if k not in self._RESERVED}

        # 3. Append them as key=value
        if extras:
            # Sort for deterministic logs
            context_str = " ".join(f"{k}={v}" for k, v in sorted(extras.items()))
            s = f"{s} | {context_str}"

        return s


_DEFAULT_LOGGING_CONF = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "kv": {
            # Note: We use the class path to our custom class
            "()": KeyValueFormatter,
            "format": "%(asctime)s [%(levelname).1s] %(name)s | %(funcName)s | %(message)s"
        }
    },
    "handlers": {
        "stderr": {
            "class": "logging.StreamHandler",
            "formatter": "kv",
        }
    },
    "loggers": {
        "wxpath": {"level": "INFO", "handlers": ["stderr"]},
    },
}

def configure_logging(level: str | int = "INFO", **overrides) -> None:
    """
    Configure wxpath's logger.

    Call this once in an application entry-point **or** rely on defaults.

    Parameters
    ----------
    level
        "DEBUG"|"INFO"|... or `logging.DEBUG`, overrides the root wxpath logger.
    overrides
        Dict that is merged (shallow) into the default dictConfig.
        Lets advanced users swap formatters/handlers.
    """
    conf = {**_DEFAULT_LOGGING_CONF, **overrides}
    conf["loggers"]["wxpath"]["level"] = level
    dictConfig(conf)


class CrawlAdapter(logging.LoggerAdapter):
    """
    Inject crawl context (depth, op, url) so the handler/formatter
    never needs to know scraping internals.
    """
    def process(self, msg: str, kwargs: Mapping[str, Any]):
        extra = self.extra.copy()
        extra.update(kwargs.pop("extra", {}))
        kwargs["extra"] = extra
        return msg, kwargs

def get_logger(name: str, **ctx) -> CrawlAdapter:
    base = logging.getLogger(name)
    # default placeholders so formatter never blows up
    defaults = {"depth": "-", "op": "-", "url": "-"}
    defaults.update(ctx)
    return CrawlAdapter(base, defaults)
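
Reviewer note: a short sketch (not part of this diff) of the intended flow — configure once, then log with per-call context; names and values are illustrative.

from wxpath.util.logging import configure_logging, get_logger

configure_logging("DEBUG")
log = get_logger("wxpath.demo", op="fetch")

# KeyValueFormatter appends the merged 'extra' context as sorted
# key=value pairs, e.g.:
#   2025-01-01 00:00:00,000 [D] wxpath.demo | <module> | fetching | depth=2 op=fetch url=https://example.org
log.debug("fetching", extra={"url": "https://example.org", "depth": 2})
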
wxpath/util/serialize.py
ADDED
@@ -0,0 +1,22 @@
from wxpath.core.ops import WxStr


def simplify(obj):
    """
    Recursively convert custom wrapper types (e.g., WxStr / ExtractedStr,
    lxml elements) into plain built-in Python types so that printing or
    JSON serialising shows clean values.
    """
    # Scalars
    if isinstance(obj, WxStr):
        return str(obj)

    # Mapping
    if isinstance(obj, dict):
        return {k: simplify(v) for k, v in obj.items()}

    # Sequence (but not str/bytes)
    if isinstance(obj, (list, tuple, set)):
        return type(obj)(simplify(v) for v in obj)

    return obj
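
Reviewer note: an illustrative round-trip (not part of this diff), assuming WxStr is a str subclass as its use above suggests.

from wxpath.core.ops import WxStr
from wxpath.util.serialize import simplify

# Hypothetical data: simplify() unwraps WxStr values nested in containers.
item = {"title": WxStr("Expression language"), "links": [WxStr("/wiki/XPath")]}
print(simplify(item))
# {'title': 'Expression language', 'links': ['/wiki/XPath']}
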
{wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.1.0
+Version: 0.2.0
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
@@ -9,11 +9,13 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests>=2.0
 Requires-Dist: lxml>=4.0
-Requires-Dist: elementpath
-Requires-Dist: aiohttp
+Requires-Dist: elementpath<=5.0.3,>=5.0.0
+Requires-Dist: aiohttp<=3.12.15,>=3.8.0
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
+Provides-Extra: dev
+Requires-Dist: ruff; extra == "dev"
 Dynamic: license-file
 
 
@@ -25,10 +27,11 @@ By introducing the `url(...)` operator and the `///` syntax, **wxpath**'s engine
 
 NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
 
+
 ## Contents
 
 - [Example](#example)
-- [`url(...)` and
+- [`url(...)` and `///url(...)` Explained](#url-and---explained)
 - [General flow](#general-flow)
 - [Asynchronous Crawling](#asynchronous-crawling)
 - [Output types](#output-types)
@@ -37,11 +40,13 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](#more-examples)
+- [Comparisons](#comparisons)
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
 - [License](#license)
 
+
 ## Example
 
 ```python
@@ -49,7 +54,7 @@ import wxpath
 
 path = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))]
+///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
 /map{
 'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
 'url':string(base-uri(.)),
@@ -84,10 +89,11 @@ The above expression does the following:
 4. Streams the extracted data as it is discovered.
 
 
-## `url(...)` and
+## `url(...)` and `///url(...)` Explained
 
 - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
--
+- `///url(...)` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+
 
 ## General flow
 
@@ -97,14 +103,13 @@ The above expression does the following:
 
 XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).
 
-
+`///url(...)` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.
 
 Results are yielded as soon as they are ready.
 
 
 ## Asynchronous Crawling
 
-
 **wxpath** is `asyncio/aiohttp`-first, providing an asynchronous API for crawling and extracting data.
 
 ```python
@@ -114,7 +119,7 @@ from wxpath import wxpath_async
 items = []
 
 async def main():
-    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
     async for item in wxpath_async(path_expr, max_depth=1):
         items.append(item)
 
@@ -123,16 +128,16 @@ asyncio.run(main())
 
 ### Blocking, Concurrent Requests
 
-
 **wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.
 
 ```python
 from wxpath import wxpath_async_blocking_iter
 
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
 ```
 
+
 ## Output types
 
 The wxpath Python API yields structured objects, not just strings.
@@ -156,7 +161,7 @@ The Python API preserves structure by default.
 ```python
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///div[@id='mw-content-text']//a
+///url(//div[@id='mw-content-text']//a/@href)
 /map{
 'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
 'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
@@ -176,15 +181,18 @@ path_expr = """
 # ...]
 ```
 
+
 ## CLI
 
 **wxpath** provides a command-line interface (CLI) to quickly experiment and execute wxpath expressions directly from the terminal.
 
+The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.
+
+WARNING: Due to the ever-changing nature of web content, the output may vary over time.
 ```bash
 > wxpath --depth 1 "\
 url('https://en.wikipedia.org/wiki/Expression_language')\
-///div[@id='mw-content-text'] \
-//a/url(@href[starts-with(., '/wiki/') \
+///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
 and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
 /map{ \
 'title':(//span[contains(@class, 'mw-page-title-main')]/text())[1], \
@@ -256,90 +264,13 @@ pip install wxpath
 
 ## More Examples
 
-
-import wxpath
+See [EXAMPLES.md](EXAMPLES.md) for more usage examples.
 
-#### EXAMPLE 1 - Simple, single page crawl and link extraction #######
-#
-# Starting from Expression language's wiki, extract all links (hrefs)
-# from the main section. The `url(...)` operator is used to execute a
-# web request to the specified URL and return the HTML content.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"
-
-items = wxpath.wxpath_async_blocking(path_expr)
-
-
-#### EXAMPLE 2 - Two-deep crawl and link extraction ##################
-#
-# Starting from Expression language's wiki, crawl all child links
-# starting with '/wiki/', and extract each child's links (hrefs). The
-# `url(...)` operator is pipe'd arguments from the evaluated XPath.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(@href[starts-with(., '/wiki/')])//a/@href"
-
-#### EXAMPLE 3 - Infinite crawl with BFS tree depth limit ############
-#
-# Starting from Expression language's wiki, infinitely crawl all child
-# links (and child's child's links recursively). The `///` syntax is
-# used to indicate an infinite crawl.
-# Returns lxml.html.HtmlElement objects.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
-
-# The same expression written differently:
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"
-
-# Modify (inclusive) max_depth to limit the BFS tree (crawl depth).
-items = wxpath.wxpath_async_blocking(path_expr, max_depth=1)
-
-#### EXAMPLE 4 - Infinite crawl with field extraction ################
-#
-# Infinitely crawls Expression language's wiki's child links and
-# childs' child links (recursively) and then, for each child link
-# crawled, extracts objects with the named fields as a dict.
-#
-path_expr = """
-url('https://en.wikipedia.org/wiki/Expression_language')
-///main//a/url(@href)
-/map {
-'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
-'url'://link[@rel='canonical']/@href[1],
-'backlink':wx:backlink(.),
-'depth':wx:depth(.)
-}
-"""
 
-
-
-
-# >> segments
-# [Segment(op='url', value='https://en.wikipedia.org/wiki/Expression_language'),
-# Segment(op='url_inf', value='///url(//main//a/@href)'),
-# Segment(op='xpath', value='/map { \'title\':(//span[contains(@class, "mw-page-title-main")]/text())[1], \'short_description\':(//div[contains(@class, "shortdescription")]/text())[1], \'url\'://link[@rel=\'canonical\']/@href[1] }')]
-
-#### EXAMPLE 5 = Seeding from XPath function expression + mapping operator (`!`)
-#
-# Functionally create 10 Amazon book search result page URLs, map each URL to
-# the url(.) operator, and for each page, extract the title, price, and link of
-# each book listed.
-#
-base_url = "https://www.amazon.com/s?k=books&i=stripbooks&page="
-
-path_expr = f"""
-(1 to 10) ! ('{base_url}' || .) !
-url(.)
-//span[@data-component-type='s-search-results']//*[@role='listitem']
-/map {{
-'title': (.//h2/span/text())[1],
-'price': (.//span[@class='a-price']/span[@class='a-offscreen']/text())[1],
-'link': (.//a[@aria-describedby='price-link']/@href)[1]
-}}
-"""
+## Comparisons
+
+See [COMPARISONS.md](COMPARISONS.md) for comparisons with other web-scraping tools.
 
-items = list(wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1))
-```
 
 ## Advanced: Engine & Crawler Configuration
 
@@ -364,7 +295,7 @@ engine = WXPathEngine(
     crawler=crawler,
 )
 
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//main//a/@href)"
 
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
@@ -392,6 +323,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 
+
 ## WARNINGS!!!
 
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
@@ -399,6 +331,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
+
 ## License
 
 MIT
wxpath-0.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,33 @@
wxpath/__init__.py,sha256=w1hFE_VSIYq_TSFLoPfp6MJbG1sA6BeChX6PYsXIK4o,265
wxpath/cli.py,sha256=CHOFWH_WHsJ30aItIQw9c5jzjl2Y64DmW2K942OGwpo,1668
wxpath/patches.py,sha256=u0dOL-K-gvdO9SJvzGrqR9Zou6XduWjl6R7mzIcZtJg,2130
wxpath/core/__init__.py,sha256=U9_In2iRaZrpiIVavIli1M59gCB6Kn1en-1Fza-qIiI,257
wxpath/core/dom.py,sha256=X0L3n8jRfO5evEypDaJTD-NQ3cLXWvnEUVERAHo3vV0,701
wxpath/core/errors.py,sha256=q56Gs5JJSC4HKImUtdZhOHcqe8XsoIrVhsaaoJ2qhCQ,4198
wxpath/core/models.py,sha256=3KYt-UwfLY2FlSRUHeA_getnYaNUMPW9wRrl2CRbPso,1611
wxpath/core/ops.py,sha256=8hc8VTqsxGFpizOyPTgzxjc8Y5srHd2aaOugQ9fJ3sE,8918
wxpath/core/parser.py,sha256=0VQCkuznd4dYYzEeTAMFs1L2SmvTgSp1JWz-Um0uEjM,9911
wxpath/core/runtime/__init__.py,sha256=_iCgkIWxXvxzQcenHOsjYGsk74HboTIYWOtgM8GtCyc,86
wxpath/core/runtime/engine.py,sha256=Pn5wzPkBwp8bq48Ie0O0DVQzUFEAAzWIj1PHgChm2bo,10825
wxpath/core/runtime/helpers.py,sha256=NCL4Wl8Hpc1VTfERSthCen9wlVd5J0eS8th4gqEPmRg,1578
wxpath/hooks/__init__.py,sha256=9JG63e4z_8CZLWugFcY786hebaEEPZ5FmZhyDHat-98,294
wxpath/hooks/builtin.py,sha256=GJ4w1C9djWNzAmAA3U0qI9OoCOeC5R8tEGtWXJVHSYs,4125
wxpath/hooks/registry.py,sha256=q4MxYwDUv7LH4-WJGO_unXbBRFXXxsBCU4vU1co0gC4,4136
wxpath/http/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wxpath/http/stats.py,sha256=FrXbFrnms113Gapf-Z5WiD5qaNiJ0XuOqjSQhwXfuEo,3172
wxpath/http/client/__init__.py,sha256=QpdmqzcznUeuFvT3IIo-LmBUUHEa2BDq9sHGAHJnDLI,202
wxpath/http/client/crawler.py,sha256=hN7EJXP102nsMA9ipaNPc9fWwDVpm_LJdGo6LSlAQp0,6996
wxpath/http/client/request.py,sha256=3nwwPQ2e_WycJQnSA6QieWJ2q3qg40jkGrp2NUDPsLI,888
wxpath/http/client/response.py,sha256=mDo3FswiVnulV1l5qjio5OQpGlT0-tfkR7daPSgSUuE,324
wxpath/http/policy/backoff.py,sha256=NwdUR6bRe1RtUGSJOktj-p8IyC1l9xu_-Aa_Gj_u5sw,321
wxpath/http/policy/retry.py,sha256=WSrQfCy1F7IcXFpVGDi4HTphNhFq12p4DaMO0_4dgrw,982
wxpath/http/policy/throttler.py,sha256=wydMFV-0mxpHSI5iYkLfE78oY4z_fF8jW9MqCeb8G54,3014
wxpath/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wxpath/util/logging.py,sha256=oQi8sp7yKWgXkkcJ4U4WHp7TyBCQiK4VhSXOSb8pGw0,2965
wxpath/util/serialize.py,sha256=uUs4C9VErpFd97smBM2bRWo2nW25kCgKdsMrVtVxhg8,575
wxpath-0.2.0.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
wxpath-0.2.0.dist-info/METADATA,sha256=6CdIcq82gNqvXVIpBzhGCk_Q0eqDvok1JmEKWQkFals,14662
wxpath-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
wxpath-0.2.0.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
wxpath-0.2.0.dist-info/top_level.txt,sha256=uFCcveG78mnefxRGvYsR2OexDlKR_Z1UD4vZijUcex8,7
wxpath-0.2.0.dist-info/RECORD,,
wxpath-0.2.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
wxpath
wxpath-0.1.0.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
wxpath-0.1.0.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
wxpath-0.1.0.dist-info/METADATA,sha256=Nf5dRmDU09BNwxFOxDM_nEdezRp5CA34lLD2oEA2aI4,17663
wxpath-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
wxpath-0.1.0.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
wxpath-0.1.0.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
wxpath-0.1.0.dist-info/RECORD,,
wxpath-0.1.0.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
{wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/WHEEL
File without changes

{wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/entry_points.txt
File without changes

{wxpath-0.1.0.dist-info → wxpath-0.2.0.dist-info}/licenses/LICENSE
File without changes