wxpath-0.1.1-py3-none-any.whl → wxpath-0.3.0-py3-none-any.whl
- wxpath/__init__.py +9 -0
- wxpath/cli.py +92 -0
- wxpath/core/__init__.py +13 -0
- wxpath/core/dom.py +22 -0
- wxpath/core/models.py +74 -0
- wxpath/core/ops.py +278 -0
- wxpath/core/parser.py +598 -0
- wxpath/core/runtime/__init__.py +5 -0
- wxpath/core/runtime/engine.py +406 -0
- wxpath/core/runtime/helpers.py +41 -0
- wxpath/hooks/__init__.py +9 -0
- wxpath/hooks/builtin.py +113 -0
- wxpath/hooks/registry.py +145 -0
- wxpath/http/__init__.py +0 -0
- wxpath/http/client/__init__.py +9 -0
- wxpath/http/client/crawler.py +231 -0
- wxpath/http/client/request.py +38 -0
- wxpath/http/client/response.py +14 -0
- wxpath/http/policy/backoff.py +16 -0
- wxpath/http/policy/retry.py +35 -0
- wxpath/http/policy/robots.py +82 -0
- wxpath/http/policy/throttler.py +114 -0
- wxpath/http/stats.py +96 -0
- wxpath/patches.py +63 -0
- wxpath/util/__init__.py +0 -0
- wxpath/util/logging.py +91 -0
- wxpath/util/serialize.py +22 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/METADATA +107 -129
- wxpath-0.3.0.dist-info/RECORD +33 -0
- wxpath-0.3.0.dist-info/top_level.txt +1 -0
- wxpath-0.1.1.dist-info/RECORD +0 -6
- wxpath-0.1.1.dist-info/top_level.txt +0 -1
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/WHEEL +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt +0 -0
- {wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE +0 -0
{wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/METADATA

@@ -1,16 +1,15 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.1.1
+Version: 0.3.0
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: requests>=2.0
 Requires-Dist: lxml>=4.0
-Requires-Dist: elementpath
-Requires-Dist: aiohttp
+Requires-Dist: elementpath<=5.0.3,>=5.0.0
+Requires-Dist: aiohttp<=3.12.15,>=3.8.0
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
@@ -18,64 +17,73 @@ Provides-Extra: dev
 Requires-Dist: ruff; extra == "dev"
 Dynamic: license-file

+# **wxpath** - declarative web crawling with XPath

-
+[![Python 3.10+](https://www.python.org/downloads/release/python-3100/)

-**wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, you describe what to follow and what to extract in a single expression. **wxpath**
+**wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.

-By introducing the `url(...)` operator and the `///` syntax,
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction.

 NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).

+
 ## Contents

 - [Example](#example)
-- [
+- [Language Design](DESIGN.md)
+- [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
 - [General flow](#general-flow)
 - [Asynchronous Crawling](#asynchronous-crawling)
+- [Polite Crawling](#polite-crawling)
 - [Output types](#output-types)
-- [XPath 3.1
+- [XPath 3.1](#xpath-31-by-default)
 - [CLI](#cli)
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
-- [More Examples](
+- [More Examples](EXAMPLES.md)
+- [Comparisons](#comparisons)
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
+- [Commercial support / consulting](#commercial-support--consulting)
 - [License](#license)

+
 ## Example

 ```python
 import wxpath

-
+# Crawl, extract fields, build a knowledge graph
+path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-    ///main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))]
+    ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
     /map{
-        'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-        'url':string(base-uri(.)),
-        'short_description':
+        'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+        'url': string(base-uri(.)),
+        'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+        'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
     }
 """

-for item in wxpath.wxpath_async_blocking_iter(
+for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
     print(item)
 ```

 Output:

 ```python
-map{'title':
-map{'title':
-map{'title':
-
-map{'title': TextNode('Data Analysis Expressions'), 'url': 'https://en.wikipedia.org/wiki/Data_Analysis_Expressions', 'short_description': TextNode('Formula and data query language')}
-map{'title': TextNode('Domain knowledge'), 'url': 'https://en.wikipedia.org/wiki/Domain_knowledge', 'short_description': TextNode('Specialist knowledge within a specific field')}
-map{'title': TextNode('Rights Expression Language'), 'url': 'https://en.wikipedia.org/wiki/Rights_Expression_Language', 'short_description': TextNode('Machine-processable language used to express intellectual property rights (such as copyright)')}
-map{'title': TextNode('Computer science'), 'url': 'https://en.wikipedia.org/wiki/Computer_science', 'short_description': TextNode('Study of computation')}
+map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
+map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
+map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
+...
 ```

+**Note:** Some sites (including Wikipedia) may block requests without proper headers.
+See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
+
+
 The above expression does the following:

 1. Starts at the specified URL, `https://en.wikipedia.org/wiki/Expression_language`.
@@ -86,27 +94,32 @@ The above expression does the following:
 4. Streams the extracted data as it is discovered.


-## `url(...)` and
+## `url(...)` and `///url(...)` Explained

 - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
--
+- `///url(...)` indicates a deep crawl. It tells the runtime engine to continue following links up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe deeper graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+
+
+## Language Design
+
+See [DESIGN.md](DESIGN.md) for details of the language design: it introduces the core concepts and designs the language from the ground up.
+

 ## General flow

 **wxpath** evaluates an expression as a list of traversal and extraction steps (internally referred to as `Segment`s).

-`url(...)` creates crawl tasks either statically (via a fixed URL) or dynamically (via a URL derived from the XPath expression). **URLs are deduplicated globally,
+`url(...)` creates crawl tasks either statically (via a fixed URL) or dynamically (via a URL derived from the XPath expression). **URLs are deduplicated globally, on a best-effort basis - not per-depth**.

 XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).

-
+`///url(...)` indicates deep crawling - it proceeds breadth-first-*ish* up to `max_depth`.

 Results are yielded as soon as they are ready.

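To make the `Segment` decomposition above concrete, the sketch below pairs one expression with the three steps the engine derives from it. The `Segment` values are copied from the `# >> segments` debug dump in the 0.1.1 README (removed later in this diff); the surrounding code is illustrative only, since the diff does not show a public parsing entry point.

```python
# Illustrative only: how a wxpath expression decomposes into Segments.
# The Segment reprs below mirror the "# >> segments" debug dump from the
# 0.1.1 README; wxpath's actual parser lives in wxpath/core/parser.py.
path_expr = (
    "url('https://en.wikipedia.org/wiki/Expression_language')"
    "///url(//main//a/@href)"
    "/map{ 'title':(//span[contains(@class, \"mw-page-title-main\")]/text())[1] }"
)

# Conceptually, the engine evaluates three steps:
# [Segment(op='url',     value='https://en.wikipedia.org/wiki/Expression_language'),
#  Segment(op='url_inf', value='///url(//main//a/@href)'),
#  Segment(op='xpath',   value="/map{ 'title':(...)[1] }")]
```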
 ## Asynchronous Crawling

-
 **wxpath** is `asyncio/aiohttp`-first, providing an asynchronous API for crawling and extracting data.

 ```python

@@ -116,7 +129,7 @@ from wxpath import wxpath_async
 items = []

 async def main():
-    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
     async for item in wxpath_async(path_expr, max_depth=1):
         items.append(item)

@@ -125,19 +138,23 @@ asyncio.run(main())

 ### Blocking, Concurrent Requests

-
-**wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.
+**wxpath** also provides an asyncio-in-sync API, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.

 ```python
 from wxpath import wxpath_async_blocking_iter

-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
 ```

+## Polite Crawling
+
+**wxpath** respects [robots.txt](https://en.wikipedia.org/wiki/Robots_exclusion_standard) by default via the `WXPathEngine(..., respect_robots=True)` constructor.
+
+
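A sketch of making that explicit follows. The `Crawler`/`WXPathEngine` parameter names are the ones shown in the "Advanced: Engine & Crawler Configuration" section below; the import paths are assumptions based on the wheel's module layout, not documented API.

```python
# Sketch: enabling robots.txt handling explicitly. respect_robots=True is
# the documented default; the import paths below are assumed from the
# package layout (wxpath/http/client/crawler.py, wxpath/core/runtime/engine.py).
from wxpath import wxpath_async_blocking_iter
from wxpath.core.runtime.engine import WXPathEngine  # assumed import path
from wxpath.http.client.crawler import Crawler       # assumed import path

crawler = Crawler(
    concurrency=8,
    per_host=2,
    respect_robots=True,  # default; disable only for hosts you control
)
engine = WXPathEngine(crawler=crawler)

path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//title/text()"
items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
```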
 ## Output types

-The wxpath Python API yields structured objects
+The wxpath Python API yields structured objects.

 Depending on the expression, results may include:

@@ -158,7 +175,7 @@ The Python API preserves structure by default.
 ```python
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-    ///div[@id='mw-content-text']//a
+    ///url(//div[@id='mw-content-text']//a/@href)
     /map{
         'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
         'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
@@ -178,15 +195,19 @@ path_expr = """
 # ...]
 ```

+
 ## CLI

 **wxpath** provides a command-line interface (CLI) to quickly experiment with and execute wxpath expressions directly from the terminal.

+The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.
+
+NOTE: Due to the ever-changing nature of web content, the output may vary over time.
 ```bash
-> wxpath --depth 1
-
-
-
+> wxpath --depth 1 \
+    --header "User-Agent: my-app/0.1 (contact: you@example.com)" \
+    "url('https://en.wikipedia.org/wiki/Expression_language') \
+    ///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
     and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
     /map{ \
         'title':(//span[contains(@class, 'mw-page-title-main')]/text())[1], \
@@ -206,6 +227,18 @@ path_expr = """
 {"title": "Computer science", "short_description": "Study of computation", "url": "https://en.wikipedia.org/wiki/Computer_science", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
 ```

+Command line options:
+
+```bash
+--depth <depth>                         Max crawl depth
+--verbose [true|false]                  Provides superficial CLI information
+--debug [true|false]                    Provides verbose runtime output and information
+--concurrency <concurrency>             Number of concurrent fetches
+--concurrency-per-host <concurrency>    Number of concurrent fetches per host
+--header "Key:Value"                    Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
+--respect-robots [true|false]           Respects robots.txt (Default: True)
+```
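For instance, a minimal polite invocation might combine these flags as follows (flags taken from the list above; the expression and header values are illustrative):

```bash
# Shallow crawl with explicit concurrency limits, robots.txt respected,
# and an identifying User-Agent header.
wxpath --depth 1 \
    --concurrency 4 \
    --concurrency-per-host 2 \
    --respect-robots true \
    --header "User-Agent: my-app/0.1 (contact: you@example.com)" \
    "url('https://en.wikipedia.org/wiki/Expression_language')//title/text()"
```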
+

 ## Hooks (Experimental)

@@ -251,6 +284,8 @@ hooks.register(hooks.JSONLWriter)

 ## Install

+Requires Python 3.10+.
+
 ```
 pip install wxpath
 ```
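Given the extras declared in the METADATA above (`test`, `dev`), the optional installs would presumably be:

```bash
pip install wxpath            # runtime dependencies only (Python 3.10+)
pip install "wxpath[test]"    # adds pytest>=7.0 and pytest-asyncio>=0.23
pip install "wxpath[dev]"     # adds ruff
```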
@@ -258,90 +293,13 @@ pip install wxpath

 ## More Examples

-
-import wxpath
+See [EXAMPLES.md](EXAMPLES.md) for more usage examples.

-#### EXAMPLE 1 - Simple, single page crawl and link extraction #######
-#
-# Starting from Expression language's wiki, extract all links (hrefs)
-# from the main section. The `url(...)` operator is used to execute a
-# web request to the specified URL and return the HTML content.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"
-
-items = wxpath.wxpath_async_blocking(path_expr)
-
-
-#### EXAMPLE 2 - Two-deep crawl and link extraction ##################
-#
-# Starting from Expression language's wiki, crawl all child links
-# starting with '/wiki/', and extract each child's links (hrefs). The
-# `url(...)` operator is pipe'd arguments from the evaluated XPath.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(@href[starts-with(., '/wiki/')])//a/@href"
-
-#### EXAMPLE 3 - Infinite crawl with BFS tree depth limit ############
-#
-# Starting from Expression language's wiki, infinitely crawl all child
-# links (and child's child's links recursively). The `///` syntax is
-# used to indicate an infinite crawl.
-# Returns lxml.html.HtmlElement objects.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
-
-# The same expression written differently:
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"
-
-# Modify (inclusive) max_depth to limit the BFS tree (crawl depth).
-items = wxpath.wxpath_async_blocking(path_expr, max_depth=1)
-
-#### EXAMPLE 4 - Infinite crawl with field extraction ################
-#
-# Infinitely crawls Expression language's wiki's child links and
-# childs' child links (recursively) and then, for each child link
-# crawled, extracts objects with the named fields as a dict.
-#
-path_expr = """
-url('https://en.wikipedia.org/wiki/Expression_language')
-    ///main//a/url(@href)
-    /map {
-        'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-        'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
-        'url'://link[@rel='canonical']/@href[1],
-        'backlink':wx:backlink(.),
-        'depth':wx:depth(.)
-    }
-"""

-
-
-
-# >> segments
-# [Segment(op='url', value='https://en.wikipedia.org/wiki/Expression_language'),
-#  Segment(op='url_inf', value='///url(//main//a/@href)'),
-#  Segment(op='xpath', value='/map { \'title\':(//span[contains(@class, "mw-page-title-main")]/text())[1], \'short_description\':(//div[contains(@class, "shortdescription")]/text())[1], \'url\'://link[@rel=\'canonical\']/@href[1] }')]
-
-#### EXAMPLE 5 = Seeding from XPath function expression + mapping operator (`!`)
-#
-# Functionally create 10 Amazon book search result page URLs, map each URL to
-# the url(.) operator, and for each page, extract the title, price, and link of
-# each book listed.
-#
-base_url = "https://www.amazon.com/s?k=books&i=stripbooks&page="
-
-path_expr = f"""
-(1 to 10) ! ('{base_url}' || .) !
-    url(.)
-    //span[@data-component-type='s-search-results']//*[@role='listitem']
-    /map {{
-        'title': (.//h2/span/text())[1],
-        'price': (.//span[@class='a-price']/span[@class='a-offscreen']/text())[1],
-        'link': (.//a[@aria-describedby='price-link']/@href)[1]
-    }}
-"""
+## Comparisons
+
+See [COMPARISONS.md](COMPARISONS.md) for comparisons with other web-scraping tools.

-items = list(wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1))
-```

 ## Advanced: Engine & Crawler Configuration

@@ -356,17 +314,24 @@ crawler = Crawler(
     concurrency=8,
     per_host=2,
     timeout=10,
+    respect_robots=False,
+    headers={
+        "User-Agent": "my-app/0.1.0 (contact: you@example.com)",  # Sites like Wikipedia will appreciate this
+    },
 )

 # If `crawler` is not specified, a default Crawler will be created with
-# the provided concurrency and
+# the provided concurrency, per_host, and respect_robots values, or with defaults.
 engine = WXPathEngine(
-    # concurrency=16,
-    # per_host=8,
+    # concurrency: int = 16,
+    # per_host: int = 8,
+    # respect_robots: bool = True,
+    # allowed_response_codes: set[int] = {200},
+    # allow_redirects: bool = True,
     crawler=crawler,
 )

-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//main//a/@href)"

 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
@@ -376,7 +341,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

 ### Principles

-- Enable declarative,
+- Enable declarative crawling and scraping without boilerplate
 - Stay lightweight and composable
 - Asynchronous support for high-performance crawls
@@ -387,20 +352,33 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Requests are performed concurrently.
 - Results are streamed as soon as they are available.

-###
+### Limitations (for now)
+
+The following features are not yet supported:

-- Strict result ordering
 - Persistent scheduling or crawl resumption
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
+- Strict result ordering
+

 ## WARNINGS!!!

 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
--
+- Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.

+
+## Commercial support / consulting
+
+If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
+
+
+### Donate
+
+If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).
+
 ## License

 MIT
wxpath-0.3.0.dist-info/RECORD

@@ -0,0 +1,33 @@
+wxpath/__init__.py,sha256=w1hFE_VSIYq_TSFLoPfp6MJbG1sA6BeChX6PYsXIK4o,265
+wxpath/cli.py,sha256=GJ4vAax5DlpxczZ_eLetlfRwa177VFKo2LHv09X-0eo,2799
+wxpath/patches.py,sha256=u0dOL-K-gvdO9SJvzGrqR9Zou6XduWjl6R7mzIcZtJg,2130
+wxpath/core/__init__.py,sha256=U9_In2iRaZrpiIVavIli1M59gCB6Kn1en-1Fza-qIiI,257
+wxpath/core/dom.py,sha256=X0L3n8jRfO5evEypDaJTD-NQ3cLXWvnEUVERAHo3vV0,701
+wxpath/core/models.py,sha256=3KYt-UwfLY2FlSRUHeA_getnYaNUMPW9wRrl2CRbPso,1611
+wxpath/core/ops.py,sha256=PTjX6c4QvCqGaByYYqaK4dte5iWO3lZzgqGrMXp6f6g,9727
+wxpath/core/parser.py,sha256=WfjQNixBz7nWtX2O0t19MOhUJmzGMg8Qol40P6oC8zc,18827
+wxpath/core/runtime/__init__.py,sha256=_iCgkIWxXvxzQcenHOsjYGsk74HboTIYWOtgM8GtCyc,86
+wxpath/core/runtime/engine.py,sha256=069ITKDXcHss__AwaYf0VSfliCNB49yZbnW2v3xEZO0,14512
+wxpath/core/runtime/helpers.py,sha256=M1i4BryCktAxeboa4LOXMTNiKVCJLDBD-KpWCQXadpw,1434
+wxpath/hooks/__init__.py,sha256=9JG63e4z_8CZLWugFcY786hebaEEPZ5FmZhyDHat-98,294
+wxpath/hooks/builtin.py,sha256=GJ4w1C9djWNzAmAA3U0qI9OoCOeC5R8tEGtWXJVHSYs,4125
+wxpath/hooks/registry.py,sha256=-D11f_mMboeVAH8qsTkbKTQ0aGNaQ7F6zbXDsOIYxN0,4513
+wxpath/http/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxpath/http/stats.py,sha256=FrXbFrnms113Gapf-Z5WiD5qaNiJ0XuOqjSQhwXfuEo,3172
+wxpath/http/client/__init__.py,sha256=QpdmqzcznUeuFvT3IIo-LmBUUHEa2BDq9sHGAHJnDLI,202
+wxpath/http/client/crawler.py,sha256=YlE469UqMck0wqRd6J9kNxm5G9BCbE_x5O6MROwmcaE,8742
+wxpath/http/client/request.py,sha256=LF_OIXetfouyE5GwEqp0cya0oMAZouKRPNFRFGscQS8,1050
+wxpath/http/client/response.py,sha256=z9LQPnDN-NZRnQpIKozaWCqgpRejc6nixCr_XaPyqUQ,334
+wxpath/http/policy/backoff.py,sha256=NwdUR6bRe1RtUGSJOktj-p8IyC1l9xu_-Aa_Gj_u5sw,321
+wxpath/http/policy/retry.py,sha256=WSrQfCy1F7IcXFpVGDi4HTphNhFq12p4DaMO0_4dgrw,982
+wxpath/http/policy/robots.py,sha256=vllXX9me78YB6yrDdpH_bwyuR5QoC9uveGEl8PmHM9Q,3134
+wxpath/http/policy/throttler.py,sha256=wydMFV-0mxpHSI5iYkLfE78oY4z_fF8jW9MqCeb8G54,3014
+wxpath/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxpath/util/logging.py,sha256=oQi8sp7yKWgXkkcJ4U4WHp7TyBCQiK4VhSXOSb8pGw0,2965
+wxpath/util/serialize.py,sha256=uUs4C9VErpFd97smBM2bRWo2nW25kCgKdsMrVtVxhg8,575
+wxpath-0.3.0.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
+wxpath-0.3.0.dist-info/METADATA,sha256=9Y0V7Up2efXCRtKZ7Cceawz9LHvNcfH0olmEGK2mVk0,16326
+wxpath-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+wxpath-0.3.0.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
+wxpath-0.3.0.dist-info/top_level.txt,sha256=uFCcveG78mnefxRGvYsR2OexDlKR_Z1UD4vZijUcex8,7
+wxpath-0.3.0.dist-info/RECORD,,
wxpath-0.3.0.dist-info/top_level.txt

@@ -0,0 +1 @@
+wxpath
wxpath-0.1.1.dist-info/RECORD
DELETED

@@ -1,6 +0,0 @@
-wxpath-0.1.1.dist-info/licenses/LICENSE,sha256=AVBZLhdWmqxm-f-dy5prVB1E-solHWoP2EXEIV_o-00,1076
-wxpath-0.1.1.dist-info/METADATA,sha256=-CZQ3N2wjoO2ArbQ5JSdtMtMUrnLwiOGnQMtnBdzleE,17719
-wxpath-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-wxpath-0.1.1.dist-info/entry_points.txt,sha256=FwoIOnUTl-DjPqVw-eb9EHHiiXCyRZy_mEQKFu2eb5Y,43
-wxpath-0.1.1.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-wxpath-0.1.1.dist-info/RECORD,,
wxpath-0.1.1.dist-info/top_level.txt
DELETED

@@ -1 +0,0 @@
-

{wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/WHEEL: file without changes
{wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/entry_points.txt: file without changes
{wxpath-0.1.1.dist-info → wxpath-0.3.0.dist-info}/licenses/LICENSE: file without changes