wxpath 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {wxpath-0.2.0/src/wxpath.egg-info → wxpath-0.3.0}/PKG-INFO +84 -37
  2. {wxpath-0.2.0 → wxpath-0.3.0}/README.md +82 -34
  3. {wxpath-0.2.0 → wxpath-0.3.0}/pyproject.toml +3 -4
  4. wxpath-0.3.0/src/wxpath/cli.py +92 -0
  5. wxpath-0.3.0/src/wxpath/core/ops.py +278 -0
  6. wxpath-0.3.0/src/wxpath/core/parser.py +598 -0
  7. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/runtime/engine.py +133 -42
  8. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/runtime/helpers.py +0 -7
  9. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/hooks/registry.py +29 -17
  10. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/client/crawler.py +46 -11
  11. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/client/request.py +6 -3
  12. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/client/response.py +1 -1
  13. wxpath-0.3.0/src/wxpath/http/policy/robots.py +82 -0
  14. {wxpath-0.2.0 → wxpath-0.3.0/src/wxpath.egg-info}/PKG-INFO +84 -37
  15. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/SOURCES.txt +1 -1
  16. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/requires.txt +0 -1
  17. wxpath-0.2.0/src/wxpath/cli.py +0 -52
  18. wxpath-0.2.0/src/wxpath/core/errors.py +0 -134
  19. wxpath-0.2.0/src/wxpath/core/ops.py +0 -244
  20. wxpath-0.2.0/src/wxpath/core/parser.py +0 -319
  21. {wxpath-0.2.0 → wxpath-0.3.0}/LICENSE +0 -0
  22. {wxpath-0.2.0 → wxpath-0.3.0}/setup.cfg +0 -0
  23. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/__init__.py +0 -0
  24. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/__init__.py +0 -0
  25. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/dom.py +0 -0
  26. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/models.py +0 -0
  27. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/core/runtime/__init__.py +0 -0
  28. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/hooks/__init__.py +0 -0
  29. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/hooks/builtin.py +0 -0
  30. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/__init__.py +0 -0
  31. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/client/__init__.py +0 -0
  32. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/policy/backoff.py +0 -0
  33. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/policy/retry.py +0 -0
  34. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/policy/throttler.py +0 -0
  35. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/http/stats.py +0 -0
  36. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/patches.py +0 -0
  37. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/util/__init__.py +0 -0
  38. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/util/logging.py +0 -0
  39. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath/util/serialize.py +0 -0
  40. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
  41. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/entry_points.txt +0 -0
  42. {wxpath-0.2.0 → wxpath-0.3.0}/src/wxpath.egg-info/top_level.txt +0 -0
@@ -1,13 +1,12 @@
  Metadata-Version: 2.4
  Name: wxpath
- Version: 0.2.0
+ Version: 0.3.0
  Summary: wxpath - a declarative web crawler and data extractor
  Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
  License-Expression: MIT
- Requires-Python: >=3.9
+ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: requests>=2.0
  Requires-Dist: lxml>=4.0
  Requires-Dist: elementpath<=5.0.3,>=5.0.0
  Requires-Dist: aiohttp<=3.12.15,>=3.8.0
@@ -18,12 +17,13 @@ Provides-Extra: dev
  Requires-Dist: ruff; extra == "dev"
  Dynamic: license-file

+ # **wxpath** - declarative web crawling with XPath

- # wxpath - declarative web crawling with XPath
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)

- **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, you describe what to follow and what to extract in a single expression. **wxpath** evaluates that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
+ **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.

- By introducing the `url(...)` operator and the `///` syntax, **wxpath**'s engine is able to perform deep, recursive web crawling and extraction.
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction.

  NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).

@@ -31,19 +31,22 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  ## Contents

  - [Example](#example)
- - [`url(...)` and `///url(...)` Explained](#url-and---explained)
+ - [Language Design](DESIGN.md)
+ - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
  - [General flow](#general-flow)
  - [Asynchronous Crawling](#asynchronous-crawling)
+ - [Polite Crawling](#polite-crawling)
  - [Output types](#output-types)
- - [XPath 3.1 support](#xpath-31-support)
+ - [XPath 3.1](#xpath-31-by-default)
  - [CLI](#cli)
  - [Hooks (Experimental)](#hooks-experimental)
  - [Install](#install)
- - [More Examples](#more-examples)
+ - [More Examples](EXAMPLES.md)
  - [Comparisons](#comparisons)
  - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
  - [Project Philosophy](#project-philosophy)
  - [Warnings](#warnings)
+ - [Commercial support / consulting](#commercial-support--consulting)
  - [License](#license)


@@ -52,33 +55,35 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  ```python
  import wxpath

- path = """
+ # Crawl, extract fields, build a knowledge graph
+ path_expr = """
  url('https://en.wikipedia.org/wiki/Expression_language')
  ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
  /map{
-     'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-     'url':string(base-uri(.)),
-     'short_description':(//div[contains(@class, 'shortdescription')]/text())[1]
+     'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+     'url': string(base-uri(.)),
+     'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+     'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
  }
  """

- for item in wxpath.wxpath_async_blocking_iter(path, max_depth=1):
+ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
      print(item)
  ```

  Output:

  ```python
- map{'title': TextNode('Computer language'), 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': TextNode('Formal language for communicating with a computer')}
- map{'title': TextNode('Machine-readable medium and data'), 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': TextNode('Medium capable of storing data in a format readable by a machine')}
- map{'title': TextNode('Advanced Boolean Expression Language'), 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': TextNode('Hardware description language and software')}
- map{'title': TextNode('Jakarta Expression Language'), 'url': 'https://en.wikipedia.org/wiki/Jakarta_Expression_Language', 'short_description': TextNode('Computer programming language')}
- map{'title': TextNode('Data Analysis Expressions'), 'url': 'https://en.wikipedia.org/wiki/Data_Analysis_Expressions', 'short_description': TextNode('Formula and data query language')}
- map{'title': TextNode('Domain knowledge'), 'url': 'https://en.wikipedia.org/wiki/Domain_knowledge', 'short_description': TextNode('Specialist knowledge within a specific field')}
- map{'title': TextNode('Rights Expression Language'), 'url': 'https://en.wikipedia.org/wiki/Rights_Expression_Language', 'short_description': TextNode('Machine-processable language used to express intellectual property rights (such as copyright)')}
- map{'title': TextNode('Computer science'), 'url': 'https://en.wikipedia.org/wiki/Computer_science', 'short_description': TextNode('Study of computation')}
+ map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
+ map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
+ map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
+ ...
  ```

+ **Note:** Some sites (including Wikipedia) may block requests without proper headers.
+ See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
+
+
  The above expression does the following:

  1. Starts at the specified URL, `https://en.wikipedia.org/wiki/Expression_language`.
@@ -92,18 +97,23 @@ The above expression does the following:
  ## `url(...)` and `///url(...)` Explained

  - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
- - `///url(...)` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+ - `///url(...)` indicates a deep crawl. It tells the runtime engine to continue following links up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe deeper graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+
+
+ ## Language Design
+
+ See [DESIGN.md](DESIGN.md) for details of the language design. You will see the core concepts and design the language from the ground up.


  ## General flow

  **wxpath** evaluates an expression as a list of traversal and extraction steps (internally referred to as `Segment`s).

- `url(...)` creates crawl tasks either statically (via a fixed URL) or dynamically (via a URL derived from the XPath expression). **URLs are deduplicated globally, not per-depth and on a best-effort basis**.
+ `url(...)` creates crawl tasks either statically (via a fixed URL) or dynamically (via a URL derived from the XPath expression). **URLs are deduplicated globally, on a best-effort basis - not per-depth**.

  XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).

- `///url(...)` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.
+ `///url(...)` indicates deep crawling - it proceeds breadth-first-*ish* up to `max_depth`.

  Results are yielded as soon as they are ready.

@@ -128,7 +138,7 @@ asyncio.run(main())

  ### Blocking, Concurrent Requests

- **wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.
+ **wxpath** also provides an asyncio-in-sync API, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.

  ```python
  from wxpath import wxpath_async_blocking_iter
@@ -137,10 +147,14 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@h
  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
  ```

+ ## Polite Crawling
+
+ **wxpath** respects [robots.txt](https://en.wikipedia.org/wiki/Robots_exclusion_standard) by default via the `WXPathEngine(..., robotstxt=True)` constructor.
+

  ## Output types

- The wxpath Python API yields structured objects, not just strings.
+ The wxpath Python API yields structured objects.

  Depending on the expression, results may include:

@@ -188,10 +202,11 @@ path_expr = """

  The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.

- WARNING: Due to the everchanging nature of web content, the output may vary over time.
+ NOTE: Due to the everchanging nature of web content, the output may vary over time.
  ```bash
- > wxpath --depth 1 "\
-     url('https://en.wikipedia.org/wiki/Expression_language')\
+ > wxpath --depth 1 \
+     --header "User-Agent: my-app/0.1 (contact: you@example.com)" \
+     "url('https://en.wikipedia.org/wiki/Expression_language') \
      ///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
      and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
      /map{ \
@@ -212,6 +227,18 @@ WARNING: Due to the everchanging nature of web content, the output may vary over
  {"title": "Computer science", "short_description": "Study of computation", "url": "https://en.wikipedia.org/wiki/Computer_science", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
  ```

+ Command line options:
+
+ ```bash
+ --depth <depth>                       Max crawl depth
+ --verbose [true|false]                Provides superficial CLI information
+ --debug [true|false]                  Provides verbose runtime output and information
+ --concurrency <concurrency>           Number of concurrent fetches
+ --concurrency-per-host <concurrency>  Number of concurrent fetches per host
+ --header "Key:Value"                  Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
+ --respect-robots [true|false]         (Default: True) Respects robots.txt
+ ```
+

  ## Hooks (Experimental)

@@ -257,6 +284,8 @@ hooks.register(hooks.JSONLWriter)

  ## Install

+ Requires Python 3.10+.
+
  ```
  pip install wxpath
  ```
@@ -285,13 +314,20 @@ crawler = Crawler(
      concurrency=8,
      per_host=2,
      timeout=10,
+     respect_robots=False,
+     headers={
+         "User-Agent": "my-app/0.1.0 (contact: you@example.com)", # Sites like Wikipedia will appreciate this
+     },
  )

  # If `crawler` is not specified, a default Crawler will be created with
- # the provided concurrency and per_host values, or with defaults.
+ # the provided concurrency, per_host, and respect_robots values, or with defaults.
  engine = WXPathEngine(
-     # concurrency=16,
-     # per_host=8,
+     # concurrency: int = 16,
+     # per_host: int = 8,
+     # respect_robots: bool = True,
+     # allowed_response_codes: set[int] = {200},
+     # allow_redirects: bool = True,
      crawler=crawler,
  )

@@ -305,7 +341,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

  ### Principles

- - Enable declarative, recursive scraping without boilerplate
+ - Enable declarative, crawling and scraping without boilerplate
  - Stay lightweight and composable
  - Asynchronous support for high-performance crawls

@@ -316,22 +352,33 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Requests are performed concurrently.
  - Results are streamed as soon as they are available.

- ### Non-Goals/Limitations (for now)
+ ### Limitations (for now)
+
+ The following features are not yet supported:

- - Strict result ordering
  - Persistent scheduling or crawl resumption
  - Automatic proxy rotation
  - Browser-based rendering (JavaScript execution)
+ - Strict result ordering


  ## WARNINGS!!!

  - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
- - Recursive (`///`) crawls require user discipline to avoid unbounded expansion (traversal explosion).
+ - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
  - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
  - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.


+ ## Commercial support / consulting
+
+ If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
+
+
+ ### Donate
+
+ If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).
+
  ## License

  MIT
@@ -1,9 +1,10 @@
+ # **wxpath** - declarative web crawling with XPath

- # wxpath - declarative web crawling with XPath
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-3100/)

- **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, you describe what to follow and what to extract in a single expression. **wxpath** evaluates that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.
+ **wxpath** is a declarative web crawler where traversal is expressed directly in XPath. Instead of writing imperative crawl loops, wxpath lets you describe what to follow and what to extract in a single expression. **wxpath** executes that expression concurrently, breadth-first-*ish*, and streams results as they are discovered.

- By introducing the `url(...)` operator and the `///` syntax, **wxpath**'s engine is able to perform deep, recursive web crawling and extraction.
+ By introducing the `url(...)` operator and the `///` syntax, wxpath's engine is able to perform deep (or paginated) web crawling and extraction.

  NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).

@@ -11,19 +12,22 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  ## Contents

  - [Example](#example)
- - [`url(...)` and `///url(...)` Explained](#url-and---explained)
+ - [Language Design](DESIGN.md)
+ - [`url(...)` and `///url(...)` Explained](#url-and-url-explained)
  - [General flow](#general-flow)
  - [Asynchronous Crawling](#asynchronous-crawling)
+ - [Polite Crawling](#polite-crawling)
  - [Output types](#output-types)
- - [XPath 3.1 support](#xpath-31-support)
+ - [XPath 3.1](#xpath-31-by-default)
  - [CLI](#cli)
  - [Hooks (Experimental)](#hooks-experimental)
  - [Install](#install)
- - [More Examples](#more-examples)
+ - [More Examples](EXAMPLES.md)
  - [Comparisons](#comparisons)
  - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
  - [Project Philosophy](#project-philosophy)
  - [Warnings](#warnings)
+ - [Commercial support / consulting](#commercial-support--consulting)
  - [License](#license)


@@ -32,33 +36,35 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  ```python
  import wxpath

- path = """
+ # Crawl, extract fields, build a knowledge graph
+ path_expr = """
  url('https://en.wikipedia.org/wiki/Expression_language')
  ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
  /map{
-     'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
-     'url':string(base-uri(.)),
-     'short_description':(//div[contains(@class, 'shortdescription')]/text())[1]
+     'title': (//span[contains(@class, "mw-page-title-main")]/text())[1] ! string(.),
+     'url': string(base-uri(.)),
+     'short_description': //div[contains(@class, 'shortdescription')]/text() ! string(.),
+     'forward_links': //div[@id="mw-content-text"]//a/@href ! string(.)
  }
  """

- for item in wxpath.wxpath_async_blocking_iter(path, max_depth=1):
+ for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
      print(item)
  ```

  Output:

  ```python
- map{'title': TextNode('Computer language'), 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': TextNode('Formal language for communicating with a computer')}
- map{'title': TextNode('Machine-readable medium and data'), 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': TextNode('Medium capable of storing data in a format readable by a machine')}
- map{'title': TextNode('Advanced Boolean Expression Language'), 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': TextNode('Hardware description language and software')}
- map{'title': TextNode('Jakarta Expression Language'), 'url': 'https://en.wikipedia.org/wiki/Jakarta_Expression_Language', 'short_description': TextNode('Computer programming language')}
- map{'title': TextNode('Data Analysis Expressions'), 'url': 'https://en.wikipedia.org/wiki/Data_Analysis_Expressions', 'short_description': TextNode('Formula and data query language')}
- map{'title': TextNode('Domain knowledge'), 'url': 'https://en.wikipedia.org/wiki/Domain_knowledge', 'short_description': TextNode('Specialist knowledge within a specific field')}
- map{'title': TextNode('Rights Expression Language'), 'url': 'https://en.wikipedia.org/wiki/Rights_Expression_Language', 'short_description': TextNode('Machine-processable language used to express intellectual property rights (such as copyright)')}
- map{'title': TextNode('Computer science'), 'url': 'https://en.wikipedia.org/wiki/Computer_science', 'short_description': TextNode('Study of computation')}
+ map{'title': 'Computer language', 'url': 'https://en.wikipedia.org/wiki/Computer_language', 'short_description': 'Formal language for communicating with a computer', 'forward_links': ['/wiki/Formal_language', '/wiki/Communication', ...]}
+ map{'title': 'Advanced Boolean Expression Language', 'url': 'https://en.wikipedia.org/wiki/Advanced_Boolean_Expression_Language', 'short_description': 'Hardware description language and software', 'forward_links': ['/wiki/File:ABEL_HDL_example_SN74162.png', '/wiki/Hardware_description_language', ...]}
+ map{'title': 'Machine-readable medium and data', 'url': 'https://en.wikipedia.org/wiki/Machine_readable', 'short_description': 'Medium capable of storing data in a format readable by a machine', 'forward_links': ['/wiki/File:EAN-13-ISBN-13.svg', '/wiki/ISBN', ...]}
+ ...
  ```

+ **Note:** Some sites (including Wikipedia) may block requests without proper headers.
+ See [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration) to set a custom `User-Agent`.
+
+
  The above expression does the following:

  1. Starts at the specified URL, `https://en.wikipedia.org/wiki/Expression_language`.
@@ -72,18 +78,23 @@ The above expression does the following:
  ## `url(...)` and `///url(...)` Explained

  - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
- - `///url(...)` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+ - `///url(...)` indicates a deep crawl. It tells the runtime engine to continue following links up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe deeper graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+
+
+ ## Language Design
+
+ See [DESIGN.md](DESIGN.md) for details of the language design. You will see the core concepts and design the language from the ground up.


  ## General flow

  **wxpath** evaluates an expression as a list of traversal and extraction steps (internally referred to as `Segment`s).

- `url(...)` creates crawl tasks either statically (via a fixed URL) or dynamically (via a URL derived from the XPath expression). **URLs are deduplicated globally, not per-depth and on a best-effort basis**.
+ `url(...)` creates crawl tasks either statically (via a fixed URL) or dynamically (via a URL derived from the XPath expression). **URLs are deduplicated globally, on a best-effort basis - not per-depth**.

  XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).

- `///url(...)` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.
+ `///url(...)` indicates deep crawling - it proceeds breadth-first-*ish* up to `max_depth`.

  Results are yielded as soon as they are ready.

@@ -108,7 +119,7 @@ asyncio.run(main())

  ### Blocking, Concurrent Requests

- **wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.
+ **wxpath** also provides an asyncio-in-sync API, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.

  ```python
  from wxpath import wxpath_async_blocking_iter
@@ -117,10 +128,14 @@ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@h
  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
  ```

+ ## Polite Crawling
+
+ **wxpath** respects [robots.txt](https://en.wikipedia.org/wiki/Robots_exclusion_standard) by default via the `WXPathEngine(..., robotstxt=True)` constructor.
+

  ## Output types

- The wxpath Python API yields structured objects, not just strings.
+ The wxpath Python API yields structured objects.

  Depending on the expression, results may include:

@@ -168,10 +183,11 @@ path_expr = """

  The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.

- WARNING: Due to the everchanging nature of web content, the output may vary over time.
+ NOTE: Due to the everchanging nature of web content, the output may vary over time.
  ```bash
- > wxpath --depth 1 "\
-     url('https://en.wikipedia.org/wiki/Expression_language')\
+ > wxpath --depth 1 \
+     --header "User-Agent: my-app/0.1 (contact: you@example.com)" \
+     "url('https://en.wikipedia.org/wiki/Expression_language') \
      ///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
      and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
      /map{ \
@@ -192,6 +208,18 @@ WARNING: Due to the everchanging nature of web content, the output may vary over
  {"title": "Computer science", "short_description": "Study of computation", "url": "https://en.wikipedia.org/wiki/Computer_science", "backlink": "https://en.wikipedia.org/wiki/Expression_language", "depth": 1.0}
  ```

+ Command line options:
+
+ ```bash
+ --depth <depth>                       Max crawl depth
+ --verbose [true|false]                Provides superficial CLI information
+ --debug [true|false]                  Provides verbose runtime output and information
+ --concurrency <concurrency>           Number of concurrent fetches
+ --concurrency-per-host <concurrency>  Number of concurrent fetches per host
+ --header "Key:Value"                  Add a custom header (e.g., 'Key:Value'). Can be used multiple times.
+ --respect-robots [true|false]         (Default: True) Respects robots.txt
+ ```
+

  ## Hooks (Experimental)

@@ -237,6 +265,8 @@ hooks.register(hooks.JSONLWriter)

  ## Install

+ Requires Python 3.10+.
+
  ```
  pip install wxpath
  ```
@@ -265,13 +295,20 @@ crawler = Crawler(
      concurrency=8,
      per_host=2,
      timeout=10,
+     respect_robots=False,
+     headers={
+         "User-Agent": "my-app/0.1.0 (contact: you@example.com)", # Sites like Wikipedia will appreciate this
+     },
  )

  # If `crawler` is not specified, a default Crawler will be created with
- # the provided concurrency and per_host values, or with defaults.
+ # the provided concurrency, per_host, and respect_robots values, or with defaults.
  engine = WXPathEngine(
-     # concurrency=16,
-     # per_host=8,
+     # concurrency: int = 16,
+     # per_host: int = 8,
+     # respect_robots: bool = True,
+     # allowed_response_codes: set[int] = {200},
+     # allow_redirects: bool = True,
      crawler=crawler,
  )

@@ -285,7 +322,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))

  ### Principles

- - Enable declarative, recursive scraping without boilerplate
+ - Enable declarative, crawling and scraping without boilerplate
  - Stay lightweight and composable
  - Asynchronous support for high-performance crawls

@@ -296,22 +333,33 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Requests are performed concurrently.
  - Results are streamed as soon as they are available.

- ### Non-Goals/Limitations (for now)
+ ### Limitations (for now)
+
+ The following features are not yet supported:

- - Strict result ordering
  - Persistent scheduling or crawl resumption
  - Automatic proxy rotation
  - Browser-based rendering (JavaScript execution)
+ - Strict result ordering


  ## WARNINGS!!!

  - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
- - Recursive (`///`) crawls require user discipline to avoid unbounded expansion (traversal explosion).
+ - Deep crawls (`///`) require user discipline to avoid unbounded expansion (traversal explosion).
  - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
  - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.


+ ## Commercial support / consulting
+
+ If you want help building or operating crawlers/data feeds with wxpath (extraction, scheduling, monitoring, breakage fixes) or other web-scraping needs, please contact me at: rodrigopala91@gmail.com.
+
+
+ ### Donate
+
+ If you like wxpath and want to support its development, please consider [donating](https://www.paypal.com/donate/?business=WDNDK6J6PJEXY&no_recurring=0&item_name=Thanks+for+using+wxpath%21+Donations+fund+development%2C+docs%2C+and+bug+fixes.+If+wxpath+saved+you+time%2C+a+small+contribution+helps%21&currency_code=USD).
+
  ## License

  MIT
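
Editor's note: the README changes above introduce robots.txt handling and per-request headers. A minimal, untested sketch of the polite-crawling configuration they describe, using only names that appear in the diff (`Crawler`, `WXPathEngine`, `wxpath_async_blocking_iter`, `respect_robots`, `headers`); the expression and `User-Agent` value are placeholders and exact defaults may differ:

```python
from wxpath import wxpath_async_blocking_iter
from wxpath.core.runtime.engine import WXPathEngine
from wxpath.http.client.crawler import Crawler

# Polite defaults, per the README diff above: robots.txt respected and an
# identifiable User-Agent sent with every request (placeholder values).
crawler = Crawler(
    concurrency=8,
    per_host=2,
    respect_robots=True,
    headers={"User-Agent": "my-app/0.1 (contact: you@example.com)"},
)
engine = WXPathEngine(crawler=crawler)

# Placeholder expression: fetch one page and extract its title.
path_expr = "url('https://example.org')//title/text() ! string(.)"
for item in wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine):
    print(item)
```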
@@ -4,17 +4,16 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "wxpath"
- version = "0.2.0"
+ version = "0.3.0"
  description = "wxpath - a declarative web crawler and data extractor"
  readme = "README.md"
- requires-python = ">=3.9"
+ requires-python = ">=3.10"
  authors = [
      { name = "Rodrigo Palacios", email = "rodrigopala91@gmail.com" }
  ]
  license = "MIT"
  license-files = ["LICENSE"]
  dependencies = [
-     "requests>=2.0",
      "lxml>=4.0",
      "elementpath>=5.0.0,<=5.0.3",
      "aiohttp>=3.8.0,<=3.12.15"
@@ -39,7 +38,7 @@ where = ["src"]
  include = ["wxpath", "wxpath.*"]

  [tool.ruff]
- target-version = "py311"
+ target-version = "py310"
  line-length = 100

  lint.select = [
@@ -0,0 +1,92 @@
+ import argparse
+ import json
+ import sys
+
+ from wxpath.core import parser as wxpath_parser
+ from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
+ from wxpath.hooks import builtin, registry
+ from wxpath.http.client.crawler import Crawler
+ from wxpath.util.serialize import simplify
+
+
+ def main():
+     registry.register(builtin.SerializeXPathMapAndNodeHook)
+     arg_parser = argparse.ArgumentParser(description="Run wxpath expression.")
+     arg_parser.add_argument("expression", help="The wxpath expression")
+     arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
+     # debug
+     arg_parser.add_argument("--debug", action="store_true", help="Debug mode")
+     # verbose
+     arg_parser.add_argument("--verbose", action="store_true", help="Verbose mode")
+
+     arg_parser.add_argument(
+         "--concurrency",
+         type=int,
+         default=16,
+         help="Number of concurrent fetches"
+     )
+     arg_parser.add_argument(
+         "--concurrency-per-host",
+         type=int,
+         default=8,
+         help="Number of concurrent fetches per host"
+     )
+     arg_parser.add_argument(
+         "--header",
+         action="append",
+         dest="header_list",
+         default=[],
+         help="Add a custom header (e.g., 'Key:Value'). Can be used multiple times.",
+     )
+     arg_parser.add_argument(
+         "--respect-robots",
+         action="store_true",
+         help="Respect robots.txt",
+         default=True
+     )
+
+     args = arg_parser.parse_args()
+
+     if args.verbose:
+         segments = wxpath_parser.parse(args.expression)
+         print("parsed expression:\n\nSegments([")
+         for s in segments:
+             print(f"\t{s},")
+         print("])")
+         print()
+
+     if args.debug:
+         from wxpath import configure_logging
+         configure_logging('DEBUG')
+
+     custom_headers = {}
+     if args.header_list:
+         for header_item in args.header_list:
+             try:
+                 key, value = header_item.split(':', 1)
+                 custom_headers[key.strip()] = value.strip()
+             except ValueError:
+                 print(f"Warning: Invalid header format '{header_item}'. Use 'Key:Value'.")
+
+     if custom_headers and args.verbose:
+         print(f"Using custom headers: {custom_headers}")
+         print()
+
+     crawler = Crawler(
+         concurrency=args.concurrency,
+         per_host=args.concurrency_per_host,
+         respect_robots=args.respect_robots,
+         headers=custom_headers
+     )
+     engine = WXPathEngine(crawler=crawler)
+
+     try:
+         for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
+             clean = simplify(r)
+             print(json.dumps(clean, ensure_ascii=False), flush=True)
+     except BrokenPipeError:
+         sys.exit(0)
+
+
+ if __name__ == "__main__":
+     main()
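
Editor's note: the new `--header` flag in cli.py above splits each occurrence on the first `:` into a key/value pair, so repeated flags accumulate into a single headers dict. A small standalone illustration of that parsing (input values are hypothetical):

```python
# Mirrors the header-parsing loop in cli.py above; raw_headers is a hypothetical
# stand-in for repeated --header arguments on the command line.
raw_headers = [
    "User-Agent: my-app/0.1 (contact: you@example.com)",
    "Accept-Language: en",
]

custom_headers = {}
for header_item in raw_headers:
    try:
        key, value = header_item.split(":", 1)  # split on the first ':' only
        custom_headers[key.strip()] = value.strip()
    except ValueError:
        print(f"Warning: Invalid header format '{header_item}'. Use 'Key:Value'.")

print(custom_headers)
# {'User-Agent': 'my-app/0.1 (contact: you@example.com)', 'Accept-Language': 'en'}
```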