wxpath 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wxpath-0.2.0/src/wxpath.egg-info → wxpath-0.3.0}/PKG-INFO +84 -37
- {wxpath-0.2.0 → wxpath-0.3.0}/README.md +82 -34
- {wxpath-0.2.0 → wxpath-0.3.0}/pyproject.toml +3 -4
- wxpath-0.3.0/src/wxpath/cli.py +92 -0
- wxpath-0.3.0/src/wxpath/core/ops.py +278 -0
- wxpath-0.3.0/src/wxpath/core/parser.py +598 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/runtime/engine.py +133 -42
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/runtime/helpers.py +0 -7
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/hooks/registry.py +29 -17
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/client/crawler.py +46 -11
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/client/request.py +6 -3
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/client/response.py +1 -1
- wxpath-0.3.0/src/wxpath/http/policy/robots.py +82 -0
- {wxpath-0.2.0 → wxpath-0.3.0/src/wxpath.egg-info}/PKG-INFO +84 -37
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/SOURCES.txt +1 -1
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/requires.txt +0 -1
- wxpath-0.2.0/src/wxpath/cli.py +0 -52
- wxpath-0.2.0/src/wxpath/core/errors.py +0 -134
- wxpath-0.2.0/src/wxpath/core/ops.py +0 -244
- wxpath-0.2.0/src/wxpath/core/parser.py +0 -319
- {wxpath-0.2.0 → wxpath-0.3.0}/LICENSE +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/setup.cfg +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/__init__.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/__init__.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/dom.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/models.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/runtime/__init__.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/hooks/__init__.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/hooks/builtin.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/__init__.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/client/__init__.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/policy/backoff.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/policy/retry.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/policy/throttler.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/stats.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/patches.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/util/__init__.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/util/logging.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/util/serialize.py +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/entry_points.txt +0 -0
- {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/top_level.txt +0 -0
@@ -1,13 +1,12 @@
 Metadata-Version: 2.4
 Name: wxpath
-Version: 0.2.0
+Version: 0.3.0
 Summary: wxpath - a declarative web crawler and data extractor
 Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
 License-Expression: MIT
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: requests>=2.0
 Requires-Dist: lxml>=4.0
 Requires-Dist: elementpath<=5.0.3,>=5.0.0
 Requires-Dist: aiohttp<=3.12.15,>=3.8.0
@@ -18,12 +17,13 @@ Provides-Extra: dev
 Requires-Dist: ruff; extra == "dev"
 Dynamic: license-file

+# **wxpath** - declarative web crawling with XPath

-
+[Python 3.10+](https://www.python.org/downloads/release/python-3100/)

-**wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, you describe what to follow and what to extract in a single expression. **wxpath**
+**wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.

-By introducing the `url(...)` operator and the `///` syntax,
+By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction.

 NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).

@@ -31,19 +31,22 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 ## Contents

 - [Example](#example)
-- [
+- [Language Design](DESIGN.md)
+- [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
 - [General flow](#general-flow)
 - [Asynchronous Crawling](#asynchronous-crawling)
+- [Polite Crawling](#polite-crawling)
 - [Output types](#output-types)
-- [XPath 3.1
+- [XPath 3.1](#xpath-31-by-default)
 - [CLI](#cli)
 - [Hooks (Experimental)](#hooks-experimental)
 - [Install](#install)
-- [More Examples](
+- [More Examples](EXAMPLES.md)
 - [Comparisons](#comparisons)
 - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
 - [Project Philosophy](#project-philosophy)
 - [Warnings](#warnings)
+- [Commercial support / consulting](#commercial-support--consulting)
 - [License](#license)

@@ -52,33 +55,35 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
 ```python
 import wxpath

-
+# Crawl, extract fields, build a knowledge graph
+path_expr = """
 url('https://en.wikipedia.org/wiki/Expression_language')
 ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
 /map{
-    'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-    'url':string(base-uri(.)),
-    'short_description':
+    'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+    'url': string(base-uri(.)),
+    'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+    'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
 }
 """

-for item in wxpath.wxpath_async_blocking_iter(
+for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
     print(item)
 ```

 Output:

 ```python
-map{'title':
-map{'title':
-map{'title':
-
-map{'title': TextNode('Data Analysis Expressions'), 'url': 'https://en.wikipedia.org/wiki/Data_Analysis_Expressions', 'short_description': TextNode('Formula and data query language')}
-map{'title': TextNode('Domain knowledge'), 'url': 'https://en.wikipedia.org/wiki/Domain_knowledge', 'short_description': TextNode('Specialist knowledge within a specific field')}
-map{'title': TextNode('Rights Expression Language'), 'url': 'https://en.wikipedia.org/wiki/Rights_Expression_Language', 'short_description': TextNode('Machine-processable language used to express intellectual property rights (such as copyright)')}
-map{'title': TextNode('Computer science'), 'url': 'https://en.wikipedia.org/wiki/Computer_science', 'short_description': TextNode('Study of computation')}
+map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
+map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
+map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
+...
 ```

+**Note:** Some sites (including Wikipedia) may block requests without proper headers.
+See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
+
+
 The above expression does the following:

 1. Starts at the specified URL, `https://en.wikipedia.org/wiki/Expression_language`.
@@ -92,18 +97,23 @@ The above expression does the following:
 ## `url(...)` and `///url(...)` Explained

 - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
-- `///url(...)` indicates
+- `///url(...)` indicates a deep crawl. It tells the runtime engine to continue following links up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe deeper graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+
+
+## Language Design
+
+See [DESIGN.md](DESIGN.md) for details of the language design. It walks through the core concepts and builds the language up from the ground up.


 ## General flow

 **wxpath** evaluates an expression as a list of traversal and extraction steps (internally referred to as `Segment`s).

-`url(...)` creates crawl tasks either statically (via a fixed URL) or dynamically (via a URL derived from the XPath expression). **URLs are deduplicated globally,
+`url(...)` creates crawl tasks either statically (via a fixed URL) or dynamically (via a URL derived from the XPath expression). **URLs are deduplicated globally, on a best-effort basis - not per-depth**.

 XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).

-`///url(...)` indicates
+`///url(...)` indicates deep crawling - it proceeds breadth-first-*ish* up to `max_depth`.

 Results are yielded as soon as they are ready.

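The deep-crawl warning above is easier to respect with an example in hand. A minimal sketch (assembled from this README's own examples; the start URL and the predicate are illustrative), bounding `///url(...)` with both an XPath predicate and `max_depth`:

```python
import wxpath

# Two guards against traversal explosion: the predicate restricts which
# hrefs ///url(...) may follow, and max_depth caps the breadth-first-ish
# expansion of the crawl frontier.
path_expr = """
url('https://en.wikipedia.org/wiki/Expression_language')
///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
/map{ 'url': string(base-uri(.)) }
"""

for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=2):
    print(item)
```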
@@ -128,7 +138,7 @@ asyncio.run(main())

 ### Blocking, Concurrent Requests

-**wxpath** also
+**wxpath** also provides an asyncio-in-sync API, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.

 ```python
 from wxpath import wxpath_async_blocking_iter
@@ -137,10 +147,14 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@h
 items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
 ```

+## Polite Crawling
+
+**wxpath** respects [robots.txt](https://en.wikipedia.org/wiki/Robots_exclusion_standard) by default via the `WXPathEngine(..., respect_robots=True)` constructor.
+

 ## Output types

-The wxpath Python API yields structured objects
+The wxpath Python API yields structured objects.

 Depending on the expression, results may include:

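Returning to the Polite Crawling section above: robots.txt handling can also be turned off for sites you control. A minimal sketch, assuming the `respect_robots` parameter documented under Advanced: Engine & Crawler Configuration (the intranet URL is illustrative):

```python
from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
from wxpath.http.client.crawler import Crawler

# Disable robots.txt checks (not recommended for third-party sites) by
# configuring the Crawler and handing it to the engine.
crawler = Crawler(respect_robots=False)
engine = WXPathEngine(crawler=crawler)

path_expr = "url('https://intranet.example.com')//title/text()"
items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
```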
@@ -188,10 +202,11 @@ path_expr = """

 The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.

-
+NOTE: Due to the ever-changing nature of web content, the output may vary over time.
 ```bash
-> wxpath --depth 1
-
+> wxpath --depth 1 \
+--header "User-Agent: my-app/0.1 (contact: you@example.com)" \
+"url('https://en.wikipedia.org/wiki/Expression_language') \
 ///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
 and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
 /map{ \
@@ -212,6 +227,18 @@ WARNING: Due to the ever-changing nature of web content, the output may vary over
 {"title": "Computer science", "short_description": "Study of computation", "url": "https://en.wikipedia.org/wiki/Computer_science", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
 ```

+Command line options:
+
+```bash
+--depth <depth>                        Max crawl depth
+--verbose                              Print superficial CLI information (e.g., the parsed expression)
+--debug                                Print verbose runtime output and information
+--concurrency <concurrency>            Number of concurrent fetches
+--concurrency-per-host <concurrency>   Number of concurrent fetches per host
+--header "Key:Value"                   Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
+--respect-robots                       Respect robots.txt (default: true)
+```
+

 ## Hooks (Experimental)

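The body of the Hooks section falls outside this hunk, but the next hunk's context line (`hooks.register(hooks.JSONLWriter)`) and the new `cli.py` at the bottom of this diff show the registration pattern. A minimal sketch built only from calls visible elsewhere in this diff:

```python
from wxpath.hooks import builtin, registry

# Register a built-in hook before crawling, exactly as the new cli.py
# does; this hook serializes XPath maps and nodes in engine results.
registry.register(builtin.SerializeXPathMapAndNodeHook)
```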
@@ -257,6 +284,8 @@ hooks.register(hooks.JSONLWriter)

 ## Install

+Requires Python 3.10+.
+
 ```
 pip install wxpath
 ```
@@ -285,13 +314,20 @@ crawler = Crawler(
     concurrency=8,
     per_host=2,
     timeout=10,
+    respect_robots=False,
+    headers={
+        "User-Agent": "my-app/0.1.0 (contact: you@example.com)",  # Sites like Wikipedia will appreciate this
+    },
 )

 # If `crawler` is not specified, a default Crawler will be created with
-# the provided concurrency and
+# the provided concurrency, per_host, and respect_robots values, or with defaults.
 engine = WXPathEngine(
-    # concurrency=16,
-    # per_host=8,
+    # concurrency: int = 16,
+    # per_host: int = 8,
+    # respect_robots: bool = True,
+    # allowed_response_codes: set[int] = {200},
+    # allow_redirects: bool = True,
     crawler=crawler,
 )

@@ -305,7 +341,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

 ### Principles

-- Enable declarative,
+- Enable declarative crawling and scraping without boilerplate
 - Stay lightweight and composable
 - Asynchronous support for high-performance crawls

@@ -316,22 +352,33 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
 - Requests are performed concurrently.
 - Results are streamed as soon as they are available.

-###
+### Limitations (for now)
+
+The following features are not yet supported:

-- Strict result ordering
 - Persistent scheduling or crawl resumption
 - Automatic proxy rotation
 - Browser-based rendering (JavaScript execution)
+- Strict result ordering


 ## WARNINGS!!!

 - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
--
+- Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
 - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
 - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.


+## Commercial support / consulting
+
+If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
+
+
+### Donate
+
+If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).
+
 ## License

 MIT
(README.md: the changes are identical to the README body of the PKG-INFO diff above; only the hunk offsets differ, since PKG-INFO prepends the package metadata header.)
@@ -4,17 +4,16 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "wxpath"
-version = "0.2.0"
+version = "0.3.0"
 description = "wxpath - a declarative web crawler and data extractor"
 readme = "README.md"
-requires-python = ">=3.
+requires-python = ">=3.10"
 authors = [
     { name = "Rodrigo Palacios", email = "rodrigopala91@gmail.com" }
 ]
 license = "MIT"
 license-files = ["LICENSE"]
 dependencies = [
-    "requests>=2.0",
     "lxml>=4.0",
     "elementpath>=5.0.0,<=5.0.3",
     "aiohttp>=3.8.0,<=3.12.15"
@@ -39,7 +38,7 @@ where = ["src"]
 include = ["wxpath", "wxpath.*"]

 [tool.ruff]
-target-version = "
+target-version = "py310"
 line-length = 100

 lint.select = [
@@ -0,0 +1,92 @@
+import argparse
+import json
+import sys
+
+from wxpath.core import parser as wxpath_parser
+from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
+from wxpath.hooks import builtin, registry
+from wxpath.http.client.crawler import Crawler
+from wxpath.util.serialize import simplify
+
+
+def main():
+    registry.register(builtin.SerializeXPathMapAndNodeHook)
+    arg_parser = argparse.ArgumentParser(description="Run wxpath expression.")
+    arg_parser.add_argument("expression", help="The wxpath expression")
+    arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
+    # debug
+    arg_parser.add_argument("--debug", action="store_true", help="Debug mode")
+    # verbose
+    arg_parser.add_argument("--verbose", action="store_true", help="Verbose mode")
+
+    arg_parser.add_argument(
+        "--concurrency",
+        type=int,
+        default=16,
+        help="Number of concurrent fetches"
+    )
+    arg_parser.add_argument(
+        "--concurrency-per-host",
+        type=int,
+        default=8,
+        help="Number of concurrent fetches per host"
+    )
+    arg_parser.add_argument(
+        "--header",
+        action="append",
+        dest="header_list",
+        default=[],
+        help="Add a custom header (e.g., 'Key:Value'). Can be used multiple times.",
+    )
+    arg_parser.add_argument(
+        "--respect-robots",
+        action="store_true",
+        help="Respect robots.txt",
+        default=True
+    )
+
+    args = arg_parser.parse_args()
+
+    if args.verbose:
+        segments = wxpath_parser.parse(args.expression)
+        print("parsed expression:\n\nSegments([")
+        for s in segments:
+            print(f"\t{s},")
+        print("])")
+        print()
+
+    if args.debug:
+        from wxpath import configure_logging
+        configure_logging('DEBUG')
+
+    custom_headers = {}
+    if args.header_list:
+        for header_item in args.header_list:
+            try:
+                key, value = header_item.split(':', 1)
+                custom_headers[key.strip()] = value.strip()
+            except ValueError:
+                print(f"Warning: Invalid header format '{header_item}'. Use 'Key:Value'.")
+
+    if custom_headers and args.verbose:
+        print(f"Using custom headers: {custom_headers}")
+        print()
+
+    crawler = Crawler(
+        concurrency=args.concurrency,
+        per_host=args.concurrency_per_host,
+        respect_robots=args.respect_robots,
+        headers=custom_headers
+    )
+    engine = WXPathEngine(crawler=crawler)
+
+    try:
+        for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
+            clean = simplify(r)
+            print(json.dumps(clean, ensure_ascii=False), flush=True)
+    except BrokenPipeError:
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
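One quirk in the new `cli.py`: `--respect-robots` is declared with `action="store_true"` and `default=True`, so passing the flag is a no-op and robots.txt checking can never actually be turned off from the command line. A sketch of one conventional fix, using the standard library's `argparse.BooleanOptionalAction` (a suggestion, not part of wxpath):

```python
import argparse

# BooleanOptionalAction (Python 3.9+) generates paired
# --respect-robots / --no-respect-robots flags, so the True default
# can actually be overridden by the user.
arg_parser = argparse.ArgumentParser(description="Run wxpath expression.")
arg_parser.add_argument(
    "--respect-robots",
    action=argparse.BooleanOptionalAction,
    default=True,
    help="Respect robots.txt (disable with --no-respect-robots)",
)

args = arg_parser.parse_args(["--no-respect-robots"])
assert args.respect_robots is False
```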