wxpath 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +9 -0
- wxpath/cli.py +137 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +278 -0
- wxpath/core/parser.py +598 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +444 -0
- wxpath/core/runtime/helpers.py +41 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +145 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/cache.py +43 -0
- wxpath/http/client/crawler.py +315 -0
- wxpath/http/client/request.py +38 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/robots.py +82 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +102 -0
- wxpath/patches.py +63 -0
- wxpath/settings.py +108 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- wxpath-0.4.0.dist-info/METADATA +460 -0
- wxpath-0.4.0.dist-info/RECORD +35 -0
- wxpath-0.4.0.dist-info/WHEEL +5 -0
- wxpath-0.4.0.dist-info/entry_points.txt +2 -0
- wxpath-0.4.0.dist-info/licenses/LICENSE +21 -0
- wxpath-0.4.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,460 @@
Metadata-Version: 2.4
Name: wxpath
Version: 0.4.0
Summary: wxpath - a declarative web crawler and data extractor
Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
License-Expression: MIT
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: lxml>=4.0
Requires-Dist: elementpath<=5.0.3,>=5.0.0
Requires-Dist: aiohttp<=3.12.15,>=3.8.0
Requires-Dist: tqdm>=4.0.0
Provides-Extra: cache
Requires-Dist: aiohttp-client-cache>=0.14.0; extra == "cache"
Provides-Extra: cache-sqlite
Requires-Dist: aiohttp-client-cache[sqlite]; extra == "cache-sqlite"
Provides-Extra: cache-redis
Requires-Dist: aiohttp-client-cache[redis]; extra == "cache-redis"
Provides-Extra: test
Requires-Dist: pytest>=7.0; extra == "test"
Requires-Dist: pytest-asyncio>=0.23; extra == "test"
Provides-Extra: dev
Requires-Dist: ruff; extra == "dev"
Dynamic: license-file

# **wxpath** - declarative web crawling with XPath

[Python 3.10+](https://www.python.org/downloads/release/python-3100/)

**wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.

By introducing the `url(...)` operator and the `///` syntax, wxpath's engine can perform deep (or paginated) web crawling and extraction.

NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).

## Contents

- [Example](#example)
- [Language Design](DESIGN.md)
- [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
- [General flow](#general-flow)
- [Asynchronous Crawling](#asynchronous-crawling)
- [Polite Crawling](#polite-crawling)
- [Output types](#output-types)
- [XPath 3.1](#xpath-31-by-default)
- [Progress Bar](#progress-bar)
- [CLI](#cli)
- [Persistence and Caching](#persistence-and-caching)
- [Settings](#settings)
- [Hooks (Experimental)](#hooks-experimental)
- [Install](#install)
- [More Examples](EXAMPLES.md)
- [Comparisons](#comparisons)
- [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
- [Project Philosophy](#project-philosophy)
- [Warnings](#warnings)
- [Commercial support / consulting](#commercial-support--consulting)
- [Versioning](#versioning)
- [License](#license)

## Example

```python
import wxpath
from wxpath.settings import CRAWLER_SETTINGS

# Custom headers for politeness; necessary for some sites (e.g., Wikipedia)
CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

# Crawl, extract fields, build a knowledge graph
path_expr = """
url('https://en.wikipedia.org/wiki/Expression_language')
///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
/map{
    'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
    'url': string(base-uri(.)),
    'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
    'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
}
"""

for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
    print(item)
```

Output:

```python
map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
...
```

**Note:** Some sites (including Wikipedia) may block requests without proper headers.
See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.

The above expression does the following:

1. Starts at the specified URL, `https://en.wikipedia.org/wiki/Expression_language`.
2. Filters for links in the `<main>` section that start with `/wiki/` and do not contain a colon (`:`).
3. For each link found,
    * it follows the link and extracts the title, URL, and short description of the page.
    * it repeats step 2 until the maximum depth is reached.
4. Streams the extracted data as it is discovered.

## `url(...)` and `///url(...)` Explained

- `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
- `///url(...)` indicates a deep crawl. It tells the runtime engine to continue following links up to the specified `max_depth`. Unlike repeated `url(...)` hops, it lets a single expression describe deeper graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion. See the sketch after this list.

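A minimal sketch of a bounded deep crawl, reusing the Wikipedia expression from the example above (the trailing `//title/text()` extraction step is illustrative, not part of the original example):

```python
from wxpath import wxpath_async_blocking_iter

# url('...') seeds the crawl; ///url(...) keeps following matching links on
# every fetched page, bounded by max_depth and the XPath predicate.
expr = (
    "url('https://en.wikipedia.org/wiki/Expression_language')"
    "///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])"
    "//title/text()"
)

titles = list(wxpath_async_blocking_iter(expr, max_depth=1))
```
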
## Language Design

See [DESIGN.md](DESIGN.md) for details of the language design. It introduces the core concepts and develops the language from the ground up.

## General flow

**wxpath** evaluates an expression as a list of traversal and extraction steps (internally referred to as `Segment`s).

`url(...)` creates crawl tasks either statically (via a fixed URL) or dynamically (via a URL derived from the XPath expression). **URLs are deduplicated globally, on a best-effort basis - not per-depth**.

XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).

`///url(...)` indicates deep crawling - it proceeds breadth-first-*ish* up to `max_depth`.

Results are yielded as soon as they are ready.

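A rough annotation of how the example expression decomposes into these steps (the labels below are descriptive, not the library's internal `Segment` names):

```python
# url('https://en.wikipedia.org/wiki/Expression_language')
#   -> static crawl task: fetch the seed URL
# ///url(//main//a/@href[starts-with(., '/wiki/')])
#   -> deep-crawl step: derives new crawl tasks from every fetched document
# /map{'title': ..., 'url': ...}
#   -> extraction step: evaluated against each document; results stream out
```
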
## Asynchronous Crawling

**wxpath** is `asyncio`/`aiohttp`-first, providing an asynchronous API for crawling and extracting data.

```python
import asyncio
from wxpath import wxpath_async

items = []

async def main():
    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
    async for item in wxpath_async(path_expr, max_depth=1):
        items.append(item)

asyncio.run(main())
```

### Blocking, Concurrent Requests

**wxpath** also provides an asyncio-in-sync API, letting you crawl multiple pages concurrently while keeping the simplicity of synchronous code. This is particularly useful for performance-sensitive crawls in strictly synchronous execution environments (i.e., code not running inside an `asyncio` event loop).

```python
from wxpath import wxpath_async_blocking_iter

path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
```

## Polite Crawling

**wxpath** respects [robots.txt](https://en.wikipedia.org/wiki/Robots_exclusion_standard) by default; this is controlled by the engine's `respect_robots` flag (see the sketch below and [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)).

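A minimal sketch of opting out for hosts you control, using the `respect_robots` flag documented in the Advanced section below:

```python
from wxpath import wxpath_async_blocking_iter
from wxpath.core.runtime import WXPathEngine

# robots.txt is honored by default; disable only for hosts you own/control.
engine = WXPathEngine(respect_robots=False)

items = list(wxpath_async_blocking_iter(
    "url('https://example.com')//a/@href", max_depth=1, engine=engine,
))
```
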
## Output types

The wxpath Python API yields structured objects.

Depending on the expression, results may include:

- `lxml.*` and `lxml.html.*` objects
- `elementpath.datatypes.*` objects (for XPath 3.1 features)
- `WxStr` (string values with provenance)
- dictionaries / maps
- lists or other XPath-native values

The CLI flattens these objects into plain JSON for display.
The Python API preserves structure by default.

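A minimal sketch of separating node results from plain values (the expression is illustrative, and it assumes only the types listed above; `WxStr` values fall through to the string branch since its import path is not shown in this README):

```python
from lxml import etree
from wxpath import wxpath_async_blocking_iter

expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a"

for item in wxpath_async_blocking_iter(expr, max_depth=1):
    if isinstance(item, etree._Element):   # lxml element nodes
        print("node:", item.tag, item.get("href"))
    else:                                  # strings, maps, lists, ...
        print("value:", item)
```
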
## XPath 3.1 By Default

**wxpath** uses the `elementpath` library to provide XPath 3.1 support, enabling advanced XPath features like **maps**, **arrays**, and more. This allows you to write more powerful XPath queries.

```python
path_expr = """
url('https://en.wikipedia.org/wiki/Expression_language')
///url(//div[@id='mw-content-text']//a/@href)
/map{
    'title': (//span[contains(@class, "mw-page-title-main")]/text())[1],
    'short_description': (//div[contains(@class, "shortdescription")]/text())[1],
    'url': //link[@rel='canonical']/@href[1]
}
"""
# [...
#  {'title': 'Computer language',
#   'short_description': 'Formal language for communicating with a computer',
#   'url': 'https://en.wikipedia.org/wiki/Computer_language'},
#  {'title': 'Machine-readable medium and data',
#   'short_description': 'Medium capable of storing data in a format readable by a machine',
#   'url': 'https://en.wikipedia.org/wiki/Machine-readable_medium_and_data'},
#  {'title': 'Domain knowledge',
#   'short_description': 'Specialist knowledge within a specific field',
#   'url': 'https://en.wikipedia.org/wiki/Domain_knowledge'},
#  ...]
```

## Progress Bar

**wxpath** provides a progress bar (via `tqdm`) to track crawl progress. This is especially useful for long-running crawls.

Enable it by passing `progress=True` to `engine.run(...)` or to any of the `wxpath_async*(...)` functions.

```python
items = wxpath.wxpath_async_blocking("...", progress=True)
> 100%|██████████████████████████████████████████████████████████▎| 469/471 [00:05<00:00, 72.00it/s, depth=2, yielded=457]
```

## CLI

**wxpath** provides a command-line interface (CLI) to quickly experiment with and execute wxpath expressions directly from the terminal.

The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.

NOTE: Due to the ever-changing nature of web content, the output may vary over time.
```bash
> wxpath --depth 1 \
    --header "User-Agent: my-app/0.1 (contact: you@example.com)" \
    "url('https://en.wikipedia.org/wiki/Expression_language') \
    ///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
        and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
    /map{ \
        'title':(//span[contains(@class, 'mw-page-title-main')]/text())[1], \
        'short_description':(//div[contains(@class, 'shortdescription')]/text())[1], \
        'url':string(base-uri(.)), \
        'backlink':wx:backlink(.), \
        'depth':wx:depth(.) \
    }"

{"title": "Computer language", "short_description": "Formal language for communicating with a computer", "url": "https://en.wikipedia.org/wiki/Computer_language", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
{"title": "Machine-readable medium and data", "short_description": "Medium capable of storing data in a format readable by a machine", "url": "https://en.wikipedia.org/wiki/Machine_readable", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
{"title": "Domain knowledge", "short_description": "Specialist knowledge within a specific field", "url": "https://en.wikipedia.org/wiki/Domain_knowledge", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
{"title": "Advanced Boolean Expression Language", "short_description": "Hardware description language and software", "url": "https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
{"title": "Data Analysis Expressions", "short_description": "Formula and data query language", "url": "https://en.wikipedia.org/wiki/Data_Analysis_Expressions", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
{"title": "Jakarta Expression Language", "short_description": "Computer programming language", "url": "https://en.wikipedia.org/wiki/Jakarta_Expression_Language", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
{"title": "Rights Expression Language", "short_description": [], "url": "https://en.wikipedia.org/wiki/Rights_Expression_Language", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
{"title": "Computer science", "short_description": "Study of computation", "url": "https://en.wikipedia.org/wiki/Computer_science", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
```

Command line options:

```bash
--depth <depth>                       Max crawl depth
--verbose [true|false]                Provides superficial CLI information
--debug [true|false]                  Provides verbose runtime output and information
--concurrency <concurrency>           Number of concurrent fetches
--concurrency-per-host <concurrency>  Number of concurrent fetches per host
--header "Key:Value"                  Add a custom header. Can be used multiple times.
--respect-robots [true|false]         (Default: true) Respect robots.txt
--cache [true|false]                  (Default: false) Persist crawl results to a local database
```

## Persistence and Caching

**wxpath** optionally persists crawl results to a local database. This is especially useful when you're crawling a large number of URLs and decide to pause the crawl, change extraction expressions, or otherwise need to restart the crawl.

**wxpath** supports two backends: SQLite and Redis. SQLite is a good fit for small-scale crawls with a single worker (i.e., `engine.crawler.concurrency == 1`). Redis is a good fit for large-scale crawls with multiple workers. You will encounter a warning when using the SQLite backend if `min(engine.crawler.concurrency, engine.crawler.per_host) > 1`.

To use persistence, install the appropriate optional dependency:

```bash
pip install wxpath[cache-sqlite]
pip install wxpath[cache-redis]
```

Once the dependency is installed, enable the cache:

```python
from wxpath.settings import SETTINGS

# To enable caching; sqlite is the default backend
SETTINGS.http.client.cache.enabled = True

# For the redis backend
SETTINGS.http.client.cache.enabled = True
SETTINGS.http.client.cache.backend = "redis"
SETTINGS.http.client.cache.redis.address = "redis://localhost:6379/0"

# Run wxpath as usual
items = list(wxpath_async_blocking_iter('...', max_depth=1))
```

## Settings

See [settings.py](src/wxpath/settings.py) for details of the available settings.

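A short sketch of the settings this README itself touches elsewhere (crawler headers and the HTTP cache); see `settings.py` for everything else:

```python
from wxpath.settings import CRAWLER_SETTINGS, SETTINGS

# Identify your crawler politely (see the Example section above)
CRAWLER_SETTINGS.headers = {'User-Agent': 'my-app/0.4.0 (contact: you@example.com)'}

# Persist fetched responses locally (see Persistence and Caching above)
SETTINGS.http.client.cache.enabled = True
```
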
## Hooks (Experimental)

**wxpath** supports a pluggable hook system that allows you to modify the crawling and extraction behavior. You can register hooks to preprocess URLs, post-process HTML, filter extracted values, and more. Hooks are executed in the order they are registered. Note that hooks may impact performance.

```python
from wxpath import hooks

@hooks.register
class OnlyEnglish:
    def post_parse(self, ctx, elem):
        lang = elem.xpath('string(/html/@lang)').lower()[:2]
        return elem if lang in ("en", "") else None
```

### Async usage

NOTE: Hooks may be synchronous or asynchronous, but all hooks in a project should follow the same style.
Mixing sync and async hooks is not supported and may lead to unexpected behavior.

```python
from wxpath import hooks

@hooks.register
class OnlyEnglish:
    async def post_parse(self, ctx, elem):
        lang = elem.xpath('string(/html/@lang)').lower()[:2]
        return elem if lang in ("en", "") else None
```

### Predefined Hooks

`JSONLWriter` (aliased `NDJSONWriter`) is a built-in hook that writes extracted data to a newline-delimited JSON file. This is useful for storing results in a structured format that can be easily processed later.

```python
from wxpath import hooks
hooks.register(hooks.JSONLWriter)
```

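A sketch of wiring the writer into a crawl (this assumes registering the class is enough to activate it; the output location and any configuration options are not documented in this README):

```python
from wxpath import hooks, wxpath_async_blocking_iter

hooks.register(hooks.JSONLWriter)  # extracted items are also written as JSONL

expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"
for item in wxpath_async_blocking_iter(expr, max_depth=1):
    ...  # items still stream here as usual
```
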
## Install

Requires Python 3.10+.

```
pip install wxpath
```

For persistence/caching, wxpath supports the following backends:

```
pip install wxpath[cache-sqlite]
pip install wxpath[cache-redis]
```

## More Examples

See [EXAMPLES.md](EXAMPLES.md) for more usage examples.

## Comparisons

See [COMPARISONS.md](COMPARISONS.md) for comparisons with other web-scraping tools.

## Advanced: Engine & Crawler Configuration

You can alter the engine's and crawler's behavior like so:

```python
from wxpath import wxpath_async_blocking_iter
from wxpath.core.runtime import WXPathEngine
from wxpath.http.client.crawler import Crawler

crawler = Crawler(
    concurrency=8,
    per_host=2,
    timeout=10,
    respect_robots=False,
    headers={
        "User-Agent": "my-app/0.1.0 (contact: you@example.com)",  # Sites like Wikipedia will appreciate this
    },
)

# If `crawler` is not specified, a default Crawler will be created with
# the provided concurrency, per_host, and respect_robots values, or with defaults.
engine = WXPathEngine(
    # concurrency: int = 16,
    # per_host: int = 8,
    # respect_robots: bool = True,
    # allowed_response_codes: set[int] = {200},
    # allow_redirects: bool = True,
    crawler=crawler,
)

path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"

items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
```

## Project Philosophy

### Principles

- Enable declarative crawling and scraping without boilerplate
- Stay lightweight and composable
- Asynchronous support for high-performance crawls

### Goals

- URLs are deduplicated on a best-effort, per-crawl basis.
- Crawls are intended to terminate once the frontier is exhausted or `max_depth` is reached.
- Requests are performed concurrently.
- Results are streamed as soon as they are available.

### Limitations (for now)

The following features are not yet supported:

- Automatic proxy rotation
- Browser-based rendering (JavaScript execution)
- Strict result ordering

## WARNINGS!!!

- Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
- Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
- Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
- Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.

## Commercial support / consulting

If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes), or have other web-scraping needs, please contact me at: rodrigopala91@gmail.com.

### Donate

If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).

## Versioning

**wxpath** follows [semver](https://semver.org): `<MAJOR>.<MINOR>.<PATCH>`.

However, pre-1.0.0 follows `0.<MAJOR>.<MINOR|PATCH>`.

## License

MIT

@@ -0,0 +1,35 @@
wxpath/__init__.py,sha256=w1hFE_VSIYq_TSFLoPfp6MJbG1sA6BeChX6PYsXIK4o,265
wxpath/cli.py,sha256=e0-mHkpuC1B_WyJw7wH43UBmtuF8oL8phQ4GEzUX0Ns,4332
wxpath/patches.py,sha256=u0dOL-K-gvdO9SJvzGrqR9Zou6XduWjl6R7mzIcZtJg,2130
wxpath/settings.py,sha256=a4TlCAOvmO03oOXiiYQzIDBMZU0XpTqntwnjVsumnas,3809
wxpath/core/__init__.py,sha256=U9_In2iRaZrpiIVavIli1M59gCB6Kn1en-1Fza-qIiI,257
wxpath/core/dom.py,sha256=X0L3n8jRfO5evEypDaJTD-NQ3cLXWvnEUVERAHo3vV0,701
wxpath/core/models.py,sha256=3KYt-UwfLY2FlSRUHeA_getnYaNUMPW9wRrl2CRbPso,1611
wxpath/core/ops.py,sha256=PTjX6c4QvCqGaByYYqaK4dte5iWO3lZzgqGrMXp6f6g,9727
wxpath/core/parser.py,sha256=WfjQNixBz7nWtX2O0t19MOhUJmzGMg8Qol40P6oC8zc,18827
wxpath/core/runtime/__init__.py,sha256=_iCgkIWxXvxzQcenHOsjYGsk74HboTIYWOtgM8GtCyc,86
wxpath/core/runtime/engine.py,sha256=otCr2bGtw3MczuQxL-UKti_mnjvYVgHplVFcS_nDopo,15850
wxpath/core/runtime/helpers.py,sha256=M1i4BryCktAxeboa4LOXMTNiKVCJLDBD-KpWCQXadpw,1434
wxpath/hooks/__init__.py,sha256=9JG63e4z_8CZLWugFcY786hebaEEPZ5FmZhyDHat-98,294
wxpath/hooks/builtin.py,sha256=GJ4w1C9djWNzAmAA3U0qI9OoCOeC5R8tEGtWXJVHSYs,4125
wxpath/hooks/registry.py,sha256=-D11f_mMboeVAH8qsTkbKTQ0aGNaQ7F6zbXDsOIYxN0,4513
wxpath/http/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wxpath/http/stats.py,sha256=aqZWuybc5RCv-AmKdNbEX4uw1YvZtFoE6591UfukZns,3319
wxpath/http/client/__init__.py,sha256=QpdmqzcznUeuFvT3IIo-LmBUUHEa2BDq9sHGAHJnDLI,202
wxpath/http/client/cache.py,sha256=cHS4XlfOStoHTG83ypNITk3Oc0lqGoTRqV0_UWBWQFY,1811
wxpath/http/client/crawler.py,sha256=ZbMXgs9CemYafKkAwbLfcOoT6LLdDsbmPwMH8jWzAAg,12159
wxpath/http/client/request.py,sha256=LF_OIXetfouyE5GwEqp0cya0oMAZouKRPNFRFGscQS8,1050
wxpath/http/client/response.py,sha256=z9LQPnDN-NZRnQpIKozaWCqgpRejc6nixCr_XaPyqUQ,334
wxpath/http/policy/backoff.py,sha256=NwdUR6bRe1RtUGSJOktj-p8IyC1l9xu_-Aa_Gj_u5sw,321
wxpath/http/policy/retry.py,sha256=WSrQfCy1F7IcXFpVGDi4HTphNhFq12p4DaMO0_4dgrw,982
wxpath/http/policy/robots.py,sha256=vllXX9me78YB6yrDdpH_bwyuR5QoC9uveGEl8PmHM9Q,3134
wxpath/http/policy/throttler.py,sha256=wydMFV-0mxpHSI5iYkLfE78oY4z_fF8jW9MqCeb8G54,3014
wxpath/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wxpath/util/logging.py,sha256=oQi8sp7yKWgXkkcJ4U4WHp7TyBCQiK4VhSXOSb8pGw0,2965
wxpath/util/serialize.py,sha256=uUs4C9VErpFd97smBM2bRWo2nW25kCgKdsMrVtVxhg8,575
wxpath-0.4.0.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
wxpath-0.4.0.dist-info/METADATA,sha256=1iEqBwgUjNcZaupy5WinzJkm5DKzHUMnSG27w4gbWvg,19233
wxpath-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
wxpath-0.4.0.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
wxpath-0.4.0.dist-info/top_level.txt,sha256=uFCcveG78mnefxRGvYsR2OexDlKR_Z1UD4vZijUcex8,7
wxpath-0.4.0.dist-info/RECORD,,

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Rod Palacios

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -0,0 +1 @@
wxpath