wxpath 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wxpath-0.1.0/src/wxpath.egg-info → wxpath-0.2.0}/PKG-INFO +30 -97
- {wxpath-0.1.0 → wxpath-0.2.0}/README.md +25 -94
- {wxpath-0.1.0 → wxpath-0.2.0}/pyproject.toml +23 -5
- {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath/cli.py +13 -28
- wxpath-0.2.0/src/wxpath/core/__init__.py +13 -0
- wxpath-0.2.0/src/wxpath/core/dom.py +22 -0
- wxpath-0.2.0/src/wxpath/core/errors.py +134 -0
- wxpath-0.2.0/src/wxpath/core/models.py +74 -0
- wxpath-0.2.0/src/wxpath/core/ops.py +244 -0
- wxpath-0.2.0/src/wxpath/core/parser.py +319 -0
- wxpath-0.2.0/src/wxpath/core/runtime/__init__.py +5 -0
- wxpath-0.2.0/src/wxpath/core/runtime/engine.py +315 -0
- wxpath-0.2.0/src/wxpath/core/runtime/helpers.py +48 -0
- wxpath-0.2.0/src/wxpath/hooks/__init__.py +9 -0
- wxpath-0.2.0/src/wxpath/hooks/builtin.py +113 -0
- wxpath-0.2.0/src/wxpath/hooks/registry.py +133 -0
- wxpath-0.2.0/src/wxpath/http/__init__.py +0 -0
- wxpath-0.2.0/src/wxpath/http/client/__init__.py +9 -0
- wxpath-0.2.0/src/wxpath/http/client/crawler.py +196 -0
- wxpath-0.2.0/src/wxpath/http/client/request.py +35 -0
- wxpath-0.2.0/src/wxpath/http/client/response.py +14 -0
- wxpath-0.2.0/src/wxpath/http/policy/backoff.py +16 -0
- wxpath-0.2.0/src/wxpath/http/policy/retry.py +35 -0
- wxpath-0.2.0/src/wxpath/http/policy/throttler.py +114 -0
- wxpath-0.2.0/src/wxpath/http/stats.py +96 -0
- {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath/patches.py +7 -2
- wxpath-0.2.0/src/wxpath/util/__init__.py +0 -0
- wxpath-0.2.0/src/wxpath/util/logging.py +91 -0
- wxpath-0.2.0/src/wxpath/util/serialize.py +22 -0
- {wxpath-0.1.0 → wxpath-0.2.0/src/wxpath.egg-info}/PKG-INFO +30 -97
- wxpath-0.2.0/src/wxpath.egg-info/SOURCES.txt +36 -0
- {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath.egg-info/requires.txt +5 -2
- wxpath-0.2.0/src/wxpath.egg-info/top_level.txt +1 -0
- wxpath-0.1.0/src/wxpath.egg-info/SOURCES.txt +0 -12
- wxpath-0.1.0/src/wxpath.egg-info/top_level.txt +0 -1
- {wxpath-0.1.0 → wxpath-0.2.0}/LICENSE +0 -0
- {wxpath-0.1.0 → wxpath-0.2.0}/setup.cfg +0 -0
- {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath/__init__.py +0 -0
- {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
- {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath.egg-info/entry_points.txt +0 -0
--- wxpath-0.1.0/src/wxpath.egg-info/PKG-INFO
+++ wxpath-0.2.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.1.0
+Version: 0.2.0
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
@@ -9,11 +9,13 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests>=2.0
 Requires-Dist: lxml>=4.0
-Requires-Dist: elementpath
-Requires-Dist: aiohttp
+Requires-Dist: elementpath<=5.0.3,>=5.0.0
+Requires-Dist: aiohttp<=3.12.15,>=3.8.0
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.23; extra == "test"
+Provides-Extra: dev
+Requires-Dist: ruff; extra == "dev"
 Dynamic: license-file
 
 
@@ -25,10 +27,11 @@ By introducing the `url(...)` operator and the `///` syntax, **wxpath**'s engine
 
 NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
 
+
 ## Contents
 
 - [Example](#example)
-- [`url(...)` and
+- [`url(...)` and `///url(...)` Explained](#url-and---explained)
 - [General flow](#general-flow)
 - [Asynchronous Crawling](#asynchronous-crawling)
 - [Output types](#output-types)
@@ -37,11 +40,13 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](#more-examples)
+- [Comparisons](#comparisons)
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
 - [License](#license)
 
+
 ## Example
 
 ```python
@@ -49,7 +54,7 @@ import wxpath
 
 path = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))]
+///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
 /map{
     'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
     'url':string(base-uri(.)),
@@ -84,10 +89,11 @@ The above expression does the following:
 4. Streams the extracted data as it is discovered.
 
 
-## `url(...)` and
+## `url(...)` and `///url(...)` Explained
 
 - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
--
+- `///url(...)` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+
 
 ## General flow
 
@@ -97,14 +103,13 @@ The above expression does the following:
 
 XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).
 
-
+`///url(...)` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.
 
 Results are yielded as soon as they are ready.
 
 
 ## Asynchronous Crawling
 
-
 **wxpath** is `asyncio/aiohttp`-first, providing an asynchronous API for crawling and extracting data.
 
 ```python
@@ -114,7 +119,7 @@ from wxpath import wxpath_async
 items = []
 
 async def main():
-    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
     async for item in wxpath_async(path_expr, max_depth=1):
         items.append(item)
 
@@ -123,16 +128,16 @@ asyncio.run(main())
 
 ### Blocking, Concurrent Requests
 
-
 **wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.
 
 ```python
 from wxpath import wxpath_async_blocking_iter
 
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
 ```
 
+
 ## Output types
 
 The wxpath Python API yields structured objects, not just strings.
@@ -156,7 +161,7 @@ The Python API preserves structure by default.
 ```python
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///div[@id='mw-content-text']//a
+///url(//div[@id='mw-content-text']//a/@href)
 /map{
     'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
     'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
@@ -176,15 +181,18 @@ path_expr = """
 # ...]
 ```
 
+
 ## CLI
 
 **wxpath** provides a command-line interface (CLI) to quickly experiment and execute wxpath expressions directly from the terminal.
 
+The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.
+
+WARNING: Due to the everchanging nature of web content, the output may vary over time.
 ```bash
 > wxpath --depth 1 "\
 url('https://en.wikipedia.org/wiki/Expression_language')\
-///div[@id='mw-content-text'] \
-//a/url(@href[starts-with(., '/wiki/') \
+///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
 and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
 /map{ \
 'title':(//span[contains(@class, 'mw-page-title-main')]/text())[1], \
@@ -256,90 +264,13 @@ pip install wxpath
 
 ## More Examples
 
-
-import wxpath
+See [EXAMPLES.md](EXAMPLES.md) for more usage examples.
 
-#### EXAMPLE 1 - Simple, single page crawl and link extraction #######
-#
-# Starting from Expression language's wiki, extract all links (hrefs)
-# from the main section. The `url(...)` operator is used to execute a
-# web request to the specified URL and return the HTML content.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"
-
-items = wxpath.wxpath_async_blocking(path_expr)
-
-
-#### EXAMPLE 2 - Two-deep crawl and link extraction ##################
-#
-# Starting from Expression language's wiki, crawl all child links
-# starting with '/wiki/', and extract each child's links (hrefs). The
-# `url(...)` operator is pipe'd arguments from the evaluated XPath.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(@href[starts-with(., '/wiki/')])//a/@href"
-
-#### EXAMPLE 3 - Infinite crawl with BFS tree depth limit ############
-#
-# Starting from Expression language's wiki, infinitely crawl all child
-# links (and child's child's links recursively). The `///` syntax is
-# used to indicate an infinite crawl.
-# Returns lxml.html.HtmlElement objects.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
-
-# The same expression written differently:
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"
-
-# Modify (inclusive) max_depth to limit the BFS tree (crawl depth).
-items = wxpath.wxpath_async_blocking(path_expr, max_depth=1)
-
-#### EXAMPLE 4 - Infinite crawl with field extraction ################
-#
-# Infinitely crawls Expression language's wiki's child links and
-# childs' child links (recursively) and then, for each child link
-# crawled, extracts objects with the named fields as a dict.
-#
-path_expr = """
-url('https://en.wikipedia.org/wiki/Expression_language')
-///main//a/url(@href)
-/map {
-    'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-    'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
-    'url'://link[@rel='canonical']/@href[1],
-    'backlink':wx:backlink(.),
-    'depth':wx:depth(.)
-}
-"""
 
-
-
-
-# >> segments
-# [Segment(op='url', value='https://en.wikipedia.org/wiki/Expression_language'),
-# Segment(op='url_inf', value='///url(//main//a/@href)'),
-# Segment(op='xpath', value='/map { \'title\':(//span[contains(@class, "mw-page-title-main")]/text())[1], \'short_description\':(//div[contains(@class, "shortdescription")]/text())[1], \'url\'://link[@rel=\'canonical\']/@href[1] }')]
-
-#### EXAMPLE 5 = Seeding from XPath function expression + mapping operator (`!`)
-#
-# Functionally create 10 Amazon book search result page URLs, map each URL to
-# the url(.) operator, and for each page, extract the title, price, and link of
-# each book listed.
-#
-base_url = "https://www.amazon.com/s?k=books&i=stripbooks&page="
-
-path_expr = f"""
-(1 to 10) ! ('{base_url}' || .) !
-url(.)
-//span[@data-component-type='s-search-results']//*[@role='listitem']
-/map {{
-    'title': (.//h2/span/text())[1],
-    'price': (.//span[@class='a-price']/span[@class='a-offscreen']/text())[1],
-    'link': (.//a[@aria-describedby='price-link']/@href)[1]
-}}
-"""
+## Comparisons
+
+See [COMPARISONS.md](COMPARISONS.md) for comparisons with other web-scraping tools.
 
-items = list(wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1))
-```
 
 ## Advanced: Engine & Crawler Configuration
 
@@ -364,7 +295,7 @@ engine = WXPathEngine(
     crawler=crawler,
 )
 
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//main//a/@href)"
 
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
@@ -392,6 +323,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 
+
 ## WARNINGS!!!
 
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
@@ -399,6 +331,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
+
 ## License
 
 MIT
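For orientation, the headline change in this release is syntactic: the bare `///` traversal step becomes the explicit `///url(...)` form. A minimal sketch assembled from the README example shown in the diff above (URL, expression, and the `wxpath_async_blocking_iter` entry point are all taken verbatim from the diff; only the `print` loop is added):

```python
import wxpath

# 0.2.0 syntax: `///url(...)` makes the recursive-fetch step explicit,
# where 0.1.0 used a bare `///` XPath step.
path = """
url('https://en.wikipedia.org/wiki/Expression_language')
///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
/map{
    'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
    'url':string(base-uri(.))
}
"""

# Stream results as pages are crawled; max_depth bounds the traversal.
for item in wxpath.wxpath_async_blocking_iter(path, max_depth=1):
    print(item)
```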
--- wxpath-0.1.0/README.md
+++ wxpath-0.2.0/README.md
@@ -7,10 +7,11 @@ By introducing the `url(...)` operator and the `///` syntax, **wxpath**'s engine
 
 NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).
 
+
 ## Contents
 
 - [Example](#example)
-- [`url(...)` and
+- [`url(...)` and `///url(...)` Explained](#url-and---explained)
 - [General flow](#general-flow)
 - [Asynchronous Crawling](#asynchronous-crawling)
 - [Output types](#output-types)
@@ -19,11 +20,13 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
 - [More Examples](#more-examples)
+- [Comparisons](#comparisons)
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
 - [License](#license)
 
+
 ## Example
 
 ```python
@@ -31,7 +34,7 @@ import wxpath
 
 path = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))]
+///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
 /map{
     'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
     'url':string(base-uri(.)),
@@ -66,10 +69,11 @@ The above expression does the following:
 4. Streams the extracted data as it is discovered.
 
 
-## `url(...)` and
+## `url(...)` and `///url(...)` Explained
 
 - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
--
+- `///url(...)` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+
 
 ## General flow
 
@@ -79,14 +83,13 @@ The above expression does the following:
 
 XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).
 
-
+`///url(...)` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.
 
 Results are yielded as soon as they are ready.
 
 
 ## Asynchronous Crawling
 
-
 **wxpath** is `asyncio/aiohttp`-first, providing an asynchronous API for crawling and extracting data.
 
 ```python
@@ -96,7 +99,7 @@ from wxpath import wxpath_async
 items = []
 
 async def main():
-    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+    path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
     async for item in wxpath_async(path_expr, max_depth=1):
         items.append(item)
 
@@ -105,16 +108,16 @@ asyncio.run(main())
 
 ### Blocking, Concurrent Requests
 
-
 **wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.
 
 ```python
 from wxpath import wxpath_async_blocking_iter
 
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
 ```
 
+
 ## Output types
 
 The wxpath Python API yields structured objects, not just strings.
@@ -138,7 +141,7 @@ The Python API preserves structure by default.
 ```python
 path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
-///div[@id='mw-content-text']//a
+///url(//div[@id='mw-content-text']//a/@href)
 /map{
     'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
     'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
@@ -158,15 +161,18 @@ path_expr = """
 # ...]
 ```
 
+
 ## CLI
 
 **wxpath** provides a command-line interface (CLI) to quickly experiment and execute wxpath expressions directly from the terminal.
 
+The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.
+
+WARNING: Due to the everchanging nature of web content, the output may vary over time.
 ```bash
 > wxpath --depth 1 "\
 url('https://en.wikipedia.org/wiki/Expression_language')\
-///div[@id='mw-content-text'] \
-//a/url(@href[starts-with(., '/wiki/') \
+///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
 and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
 /map{ \
 'title':(//span[contains(@class, 'mw-page-title-main')]/text())[1], \
@@ -238,90 +244,13 @@ pip install wxpath
 
 ## More Examples
 
-
-import wxpath
+See [EXAMPLES.md](EXAMPLES.md) for more usage examples.
 
-#### EXAMPLE 1 - Simple, single page crawl and link extraction #######
-#
-# Starting from Expression language's wiki, extract all links (hrefs)
-# from the main section. The `url(...)` operator is used to execute a
-# web request to the specified URL and return the HTML content.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"
-
-items = wxpath.wxpath_async_blocking(path_expr)
-
-
-#### EXAMPLE 2 - Two-deep crawl and link extraction ##################
-#
-# Starting from Expression language's wiki, crawl all child links
-# starting with '/wiki/', and extract each child's links (hrefs). The
-# `url(...)` operator is pipe'd arguments from the evaluated XPath.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(@href[starts-with(., '/wiki/')])//a/@href"
-
-#### EXAMPLE 3 - Infinite crawl with BFS tree depth limit ############
-#
-# Starting from Expression language's wiki, infinitely crawl all child
-# links (and child's child's links recursively). The `///` syntax is
-# used to indicate an infinite crawl.
-# Returns lxml.html.HtmlElement objects.
-#
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
-
-# The same expression written differently:
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"
-
-# Modify (inclusive) max_depth to limit the BFS tree (crawl depth).
-items = wxpath.wxpath_async_blocking(path_expr, max_depth=1)
-
-#### EXAMPLE 4 - Infinite crawl with field extraction ################
-#
-# Infinitely crawls Expression language's wiki's child links and
-# childs' child links (recursively) and then, for each child link
-# crawled, extracts objects with the named fields as a dict.
-#
-path_expr = """
-url('https://en.wikipedia.org/wiki/Expression_language')
-///main//a/url(@href)
-/map {
-    'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-    'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
-    'url'://link[@rel='canonical']/@href[1],
-    'backlink':wx:backlink(.),
-    'depth':wx:depth(.)
-}
-"""
 
-
-
-
-# >> segments
-# [Segment(op='url', value='https://en.wikipedia.org/wiki/Expression_language'),
-# Segment(op='url_inf', value='///url(//main//a/@href)'),
-# Segment(op='xpath', value='/map { \'title\':(//span[contains(@class, "mw-page-title-main")]/text())[1], \'short_description\':(//div[contains(@class, "shortdescription")]/text())[1], \'url\'://link[@rel=\'canonical\']/@href[1] }')]
-
-#### EXAMPLE 5 = Seeding from XPath function expression + mapping operator (`!`)
-#
-# Functionally create 10 Amazon book search result page URLs, map each URL to
-# the url(.) operator, and for each page, extract the title, price, and link of
-# each book listed.
-#
-base_url = "https://www.amazon.com/s?k=books&i=stripbooks&page="
-
-path_expr = f"""
-(1 to 10) ! ('{base_url}' || .) !
-url(.)
-//span[@data-component-type='s-search-results']//*[@role='listitem']
-/map {{
-    'title': (.//h2/span/text())[1],
-    'price': (.//span[@class='a-price']/span[@class='a-offscreen']/text())[1],
-    'link': (.//a[@aria-describedby='price-link']/@href)[1]
-}}
-"""
+## Comparisons
+
+See [COMPARISONS.md](COMPARISONS.md) for comparisons with other web-scraping tools.
 
-items = list(wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1))
-```
 
 ## Advanced: Engine & Crawler Configuration
 
@@ -346,7 +275,7 @@ engine = WXPathEngine(
     crawler=crawler,
 )
 
-path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')
+path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//main//a/@href)"
 
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 ```
@@ -374,6 +303,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
 
+
 ## WARNINGS!!!
 
 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
@@ -381,6 +311,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.
 
+
 ## License
 
 MIT
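The async-first entry point shown in the README reads, in full, as follows. This is a sketch restating the README's own async example from the diff above; nothing here goes beyond what the diff shows except collecting results with `print`:

```python
import asyncio
from wxpath import wxpath_async

async def main():
    # New-form expression, verbatim from the README's async example.
    path_expr = ("url('https://en.wikipedia.org/wiki/Expression_language')"
                 "///url(//@href[starts-with(., '/wiki/')])//a/@href")
    # Items are yielded as soon as each page is fetched and evaluated.
    async for item in wxpath_async(path_expr, max_depth=1):
        print(item)

asyncio.run(main())
```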
--- wxpath-0.1.0/pyproject.toml
+++ wxpath-0.2.0/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "wxpath"
-version = "0.1.0"
+version = "0.2.0"
 description = "wxpath - a declarative web crawler and data extractor"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -16,12 +16,13 @@ license-files = ["LICENSE"]
 dependencies = [
     "requests>=2.0",
     "lxml>=4.0",
-    "elementpath>=5.0.0",
-    "aiohttp>=3.8.0"
+    "elementpath>=5.0.0,<=5.0.3",
+    "aiohttp>=3.8.0,<=3.12.15"
 ]
 
 [project.optional-dependencies]
 test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
+dev = ["ruff"]
 
 [project.scripts]
 wxpath = "wxpath.cli:main"
@@ -32,7 +33,24 @@ addopts = "-ra -q"
 testpaths = ["tests"]
 
 [tool.setuptools]
-package-dir = {"" = "src"}
 
 [tool.setuptools.packages.find]
-
+where = ["src"]
+include = ["wxpath", "wxpath.*"]
+
+[tool.ruff]
+target-version = "py311"
+line-length = 100
+
+lint.select = [
+    "F",      # pyflakes (unused vars, undefined names, etc.)
+    "E",      # pycodestyle errors
+    "B",      # flake8-bugbear (real footguns)
+    "ASYNC",  # async/await correctness
+    "I",      # isort rules
+    "TID",    # Tidy imports
+    "ICN",    # Import conventions
+]
+
+[tool.ruff.format]
+quote-style = "single"
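The new upper bounds on `elementpath` and `aiohttp` can be sanity-checked at runtime. A small sketch using only the standard library; the package names and bounds are copied from the diff, and `version()` raises `PackageNotFoundError` if a package is absent:

```python
# Report installed versions against the ranges pinned in 0.2.0.
from importlib.metadata import version

PINS = {
    "elementpath": (">=5.0.0", "<=5.0.3"),
    "aiohttp": (">=3.8.0", "<=3.12.15"),
}

for pkg, (lo, hi) in PINS.items():
    print(f"{pkg}=={version(pkg)} (declared {lo},{hi})")
```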
--- wxpath-0.1.0/src/wxpath/cli.py
+++ wxpath-0.2.0/src/wxpath/cli.py
@@ -2,34 +2,14 @@ import argparse
 import json
 import sys
 
-from wxpath.hooks import builtin  # load default hooks
-from wxpath.core.ops import WxStr
 from wxpath.core.parser import parse_wxpath_expr
-from wxpath.core.runtime.engine import
-
-
-def _simplify(obj):
-    """
-    Recursively convert custom wrapper types (e.g., WxStr / ExtractedStr,
-    lxml elements) into plain built-in Python types so that printing or
-    JSON serialising shows clean values.
-    """
-    # Scalars
-    if isinstance(obj, WxStr):
-        return str(obj)
-
-    # Mapping
-    if isinstance(obj, dict):
-        return {k: _simplify(v) for k, v in obj.items()}
-
-    # Sequence (but not str/bytes)
-    if isinstance(obj, (list, tuple, set)):
-        return type(obj)(_simplify(v) for v in obj)
-
-    return obj
+from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
+from wxpath.hooks import builtin, registry
+from wxpath.util.serialize import simplify
 
 
 def main():
+    registry.register(builtin.SerializeXPathMapAndNodeHook)
     parser = argparse.ArgumentParser(description="Run wxpath expression.")
     parser.add_argument("expression", help="The wxpath expression")
     parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
@@ -39,7 +19,12 @@ def main():
     parser.add_argument("--verbose", action="store_true", help="Verbose mode")
 
     parser.add_argument("--concurrency", type=int, default=16, help="Number of concurrent fetches")
-    parser.add_argument(
+    parser.add_argument(
+        "--concurrency-per-host",
+        type=int,
+        default=8,
+        help="Number of concurrent fetches per host"
+    )
 
     args = parser.parse_args()
 
@@ -48,8 +33,8 @@ def main():
         print("parsed expression:", parse_wxpath_expr(args.expression))
 
     if args.debug:
-        from wxpath import configure_logging
-        configure_logging(
+        from wxpath import configure_logging
+        configure_logging('DEBUG')
 
     engine = WXPathEngine(
         concurrency=args.concurrency,
@@ -57,7 +42,7 @@ def main():
     )
     try:
         for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
-            clean = _simplify(r)
+            clean = simplify(r)
             print(json.dumps(clean, ensure_ascii=False), flush=True)
     except BrokenPipeError:
         sys.exit(0)
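The inlined `_simplify` helper has been moved out of the CLI; judging from the new import, it now lives in `wxpath.util.serialize` as `simplify`. A sketch of what that function presumably looks like, mirroring the code removed above. This is an assumption: the actual implementation in `src/wxpath/util/serialize.py` (+22 lines, not shown in this diff) may differ:

```python
# Presumed shape of wxpath.util.serialize.simplify, reconstructed from
# the _simplify helper deleted from cli.py above (an assumption, not
# the verified module contents).
from wxpath.core.ops import WxStr

def simplify(obj):
    """Recursively convert wrapper types into plain built-in Python types."""
    if isinstance(obj, WxStr):                # scalar wrappers -> str
        return str(obj)
    if isinstance(obj, dict):                 # mappings
        return {k: simplify(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set)):   # sequences (not str/bytes)
        return type(obj)(simplify(v) for v in obj)
    return obj
```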
--- /dev/null
+++ wxpath-0.2.0/src/wxpath/core/dom.py
@@ -0,0 +1,22 @@
+from urllib.parse import urljoin
+
+
+def _make_links_absolute(links: list[str], base_url: str) -> list[str]:
+    """
+    Convert relative links to absolute links based on the base URL.
+
+    Args:
+        links (list): List of link strings.
+        base_url (str): The base URL to resolve relative links against.
+
+    Returns:
+        List of absolute URLs.
+    """
+    if base_url is None:
+        raise ValueError("base_url must not be None when making links absolute.")
+    return [urljoin(base_url, link) for link in links if link]
+
+
+def get_absolute_links_from_elem_and_xpath(elem, xpath):
+    base_url = getattr(elem, 'base_url', None)
+    return _make_links_absolute(elem.xpath3(xpath), base_url)