wxpath 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {wxpath-0.1.0/src/wxpath.egg-info → wxpath-0.2.0}/PKG-INFO +30 -97
  2. {wxpath-0.1.0 → wxpath-0.2.0}/README.md +25 -94
  3. {wxpath-0.1.0 → wxpath-0.2.0}/pyproject.toml +23 -5
  4. {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath/cli.py +13 -28
  5. wxpath-0.2.0/src/wxpath/core/__init__.py +13 -0
  6. wxpath-0.2.0/src/wxpath/core/dom.py +22 -0
  7. wxpath-0.2.0/src/wxpath/core/errors.py +134 -0
  8. wxpath-0.2.0/src/wxpath/core/models.py +74 -0
  9. wxpath-0.2.0/src/wxpath/core/ops.py +244 -0
  10. wxpath-0.2.0/src/wxpath/core/parser.py +319 -0
  11. wxpath-0.2.0/src/wxpath/core/runtime/__init__.py +5 -0
  12. wxpath-0.2.0/src/wxpath/core/runtime/engine.py +315 -0
  13. wxpath-0.2.0/src/wxpath/core/runtime/helpers.py +48 -0
  14. wxpath-0.2.0/src/wxpath/hooks/__init__.py +9 -0
  15. wxpath-0.2.0/src/wxpath/hooks/builtin.py +113 -0
  16. wxpath-0.2.0/src/wxpath/hooks/registry.py +133 -0
  17. wxpath-0.2.0/src/wxpath/http/__init__.py +0 -0
  18. wxpath-0.2.0/src/wxpath/http/client/__init__.py +9 -0
  19. wxpath-0.2.0/src/wxpath/http/client/crawler.py +196 -0
  20. wxpath-0.2.0/src/wxpath/http/client/request.py +35 -0
  21. wxpath-0.2.0/src/wxpath/http/client/response.py +14 -0
  22. wxpath-0.2.0/src/wxpath/http/policy/backoff.py +16 -0
  23. wxpath-0.2.0/src/wxpath/http/policy/retry.py +35 -0
  24. wxpath-0.2.0/src/wxpath/http/policy/throttler.py +114 -0
  25. wxpath-0.2.0/src/wxpath/http/stats.py +96 -0
  26. {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath/patches.py +7 -2
  27. wxpath-0.2.0/src/wxpath/util/__init__.py +0 -0
  28. wxpath-0.2.0/src/wxpath/util/logging.py +91 -0
  29. wxpath-0.2.0/src/wxpath/util/serialize.py +22 -0
  30. {wxpath-0.1.0 → wxpath-0.2.0/src/wxpath.egg-info}/PKG-INFO +30 -97
  31. wxpath-0.2.0/src/wxpath.egg-info/SOURCES.txt +36 -0
  32. {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath.egg-info/requires.txt +5 -2
  33. wxpath-0.2.0/src/wxpath.egg-info/top_level.txt +1 -0
  34. wxpath-0.1.0/src/wxpath.egg-info/SOURCES.txt +0 -12
  35. wxpath-0.1.0/src/wxpath.egg-info/top_level.txt +0 -1
  36. {wxpath-0.1.0 → wxpath-0.2.0}/LICENSE +0 -0
  37. {wxpath-0.1.0 → wxpath-0.2.0}/setup.cfg +0 -0
  38. {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath/__init__.py +0 -0
  39. {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath.egg-info/dependency_links.txt +0 -0
  40. {wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath.egg-info/entry_points.txt +0 -0
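
Taken together, the file list shows 0.2.0 splitting the implementation into `core/`, `hooks/`, `http/`, and `util/` subpackages, while the top-level entry points documented in the README diff below stay importable from the package root. A minimal sketch of driving the new release, assuming only the `wxpath_async_blocking_iter` API and the expression shown in the updated README:

```python
import wxpath

# Seed one page, follow /wiki/ links one level deep with the ///url(...)
# syntax introduced in 0.2.0, and build a dict per crawled page.
path_expr = """
url('https://en.wikipedia.org/wiki/Expression_language')
///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
/map{
    'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
    'url':string(base-uri(.))
}
"""

for item in wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1):
    print(item)
```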
{wxpath-0.1.0/src/wxpath.egg-info → wxpath-0.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: wxpath
- Version: 0.1.0
+ Version: 0.2.0
  Summary: wxpath - a declarative web crawler and data extractor
  Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
  License-Expression: MIT
@@ -9,11 +9,13 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests>=2.0
  Requires-Dist: lxml>=4.0
- Requires-Dist: elementpath>=5.0.0
- Requires-Dist: aiohttp>=3.8.0
+ Requires-Dist: elementpath<=5.0.3,>=5.0.0
+ Requires-Dist: aiohttp<=3.12.15,>=3.8.0
  Provides-Extra: test
  Requires-Dist: pytest>=7.0; extra == "test"
  Requires-Dist: pytest-asyncio>=0.23; extra == "test"
+ Provides-Extra: dev
+ Requires-Dist: ruff; extra == "dev"
  Dynamic: license-file


@@ -25,10 +27,11 @@ By introducing the `url(...)` operator and the `///` syntax, **wxpath**'s engine

  NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).

+
  ## Contents

  - [Example](#example)
- - [`url(...)` and `///` Explained](#url-and---explained)
+ - [`url(...)` and `///url(...)` Explained](#url-and---explained)
  - [General flow](#general-flow)
  - [Asynchronous Crawling](#asynchronous-crawling)
  - [Output types](#output-types)
@@ -37,11 +40,13 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Hooks (Experimental)](#hooks-experimental)
  - [Install](#install)
  - [More Examples](#more-examples)
+ - [Comparisons](#comparisons)
  - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
  - [Project Philosophy](#project-philosophy)
  - [Warnings](#warnings)
  - [License](#license)

+
  ## Example

  ```python
@@ -49,7 +54,7 @@ import wxpath

  path = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))]/url(.)
+ ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
  /map{
  'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
  'url':string(base-uri(.)),
@@ -84,10 +89,11 @@ The above expression does the following:
  4. Streams the extracted data as it is discovered.


- ## `url(...)` and `///` Explained
+ ## `url(...)` and `///url(...)` Explained

  - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
- - `///` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+ - `///url(...)` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+

  ## General flow

@@ -97,14 +103,13 @@ The above expression does the following:

  XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).

- `///` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.
+ `///url(...)` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.

  Results are yielded as soon as they are ready.


  ## Asynchronous Crawling

-
  **wxpath** is `asyncio/aiohttp`-first, providing an asynchronous API for crawling and extracting data.

  ```python
@@ -114,7 +119,7 @@ from wxpath import wxpath_async
  items = []

  async def main():
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(@href[starts-with(., '/wiki/')])//a/@href"
+ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
  async for item in wxpath_async(path_expr, max_depth=1):
  items.append(item)

@@ -123,16 +128,16 @@ asyncio.run(main())

  ### Blocking, Concurrent Requests

-
  **wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.

  ```python
  from wxpath import wxpath_async_blocking_iter

- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(@href[starts-with(., '/wiki/')])//a/@href"
+ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
  ```

+
  ## Output types

  The wxpath Python API yields structured objects, not just strings.
@@ -156,7 +161,7 @@ The Python API preserves structure by default.
  ```python
  path_expr = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///div[@id='mw-content-text']//a/url(@href)
+ ///url(//div[@id='mw-content-text']//a/@href)
  /map{
  'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
  'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
@@ -176,15 +181,18 @@ path_expr = """
  # ...]
  ```

+
  ## CLI

  **wxpath** provides a command-line interface (CLI) to quickly experiment and execute wxpath expressions directly from the terminal.

+ The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.
+
+ WARNING: Due to the everchanging nature of web content, the output may vary over time.
  ```bash
  > wxpath --depth 1 "\
  url('https://en.wikipedia.org/wiki/Expression_language')\
- ///div[@id='mw-content-text'] \
- //a/url(@href[starts-with(., '/wiki/') \
+ ///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
  and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
  /map{ \
  'title':(//span[contains(@class, 'mw-page-title-main')]/text())[1], \
@@ -256,90 +264,13 @@ pip install wxpath

  ## More Examples

- ```python
- import wxpath
+ See [EXAMPLES.md](EXAMPLES.md) for more usage examples.

- #### EXAMPLE 1 - Simple, single page crawl and link extraction #######
- #
- # Starting from Expression language's wiki, extract all links (hrefs)
- # from the main section. The `url(...)` operator is used to execute a
- # web request to the specified URL and return the HTML content.
- #
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"
-
- items = wxpath.wxpath_async_blocking(path_expr)
-
-
- #### EXAMPLE 2 - Two-deep crawl and link extraction ##################
- #
- # Starting from Expression language's wiki, crawl all child links
- # starting with '/wiki/', and extract each child's links (hrefs). The
- # `url(...)` operator is pipe'd arguments from the evaluated XPath.
- #
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(@href[starts-with(., '/wiki/')])//a/@href"
-
- #### EXAMPLE 3 - Infinite crawl with BFS tree depth limit ############
- #
- # Starting from Expression language's wiki, infinitely crawl all child
- # links (and child's child's links recursively). The `///` syntax is
- # used to indicate an infinite crawl.
- # Returns lxml.html.HtmlElement objects.
- #
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
-
- # The same expression written differently:
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"
-
- # Modify (inclusive) max_depth to limit the BFS tree (crawl depth).
- items = wxpath.wxpath_async_blocking(path_expr, max_depth=1)
-
- #### EXAMPLE 4 - Infinite crawl with field extraction ################
- #
- # Infinitely crawls Expression language's wiki's child links and
- # childs' child links (recursively) and then, for each child link
- # crawled, extracts objects with the named fields as a dict.
- #
- path_expr = """
- url('https://en.wikipedia.org/wiki/Expression_language')
- ///main//a/url(@href)
- /map {
- 'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
- 'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
- 'url'://link[@rel='canonical']/@href[1],
- 'backlink':wx:backlink(.),
- 'depth':wx:depth(.)
- }
- """

- # Under the hood of wxpath.core.wxpath, we generate `segments` list,
- # revealing the operations executed to accomplish the crawl.
- # >> segments = wxpath.core.parser.parse_wxpath_expr(path_expr);
- # >> segments
- # [Segment(op='url', value='https://en.wikipedia.org/wiki/Expression_language'),
- # Segment(op='url_inf', value='///url(//main//a/@href)'),
- # Segment(op='xpath', value='/map { \'title\':(//span[contains(@class, "mw-page-title-main")]/text())[1], \'short_description\':(//div[contains(@class, "shortdescription")]/text())[1], \'url\'://link[@rel=\'canonical\']/@href[1] }')]
-
- #### EXAMPLE 5 = Seeding from XPath function expression + mapping operator (`!`)
- #
- # Functionally create 10 Amazon book search result page URLs, map each URL to
- # the url(.) operator, and for each page, extract the title, price, and link of
- # each book listed.
- #
- base_url = "https://www.amazon.com/s?k=books&i=stripbooks&page="
-
- path_expr = f"""
- (1 to 10) ! ('{base_url}' || .) !
- url(.)
- //span[@data-component-type='s-search-results']//*[@role='listitem']
- /map {{
- 'title': (.//h2/span/text())[1],
- 'price': (.//span[@class='a-price']/span[@class='a-offscreen']/text())[1],
- 'link': (.//a[@aria-describedby='price-link']/@href)[1]
- }}
- """
+ ## Comparisons
+
+ See [COMPARISONS.md](COMPARISONS.md) for comparisons with other web-scraping tools.

- items = list(wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1))
- ```

  ## Advanced: Engine & Crawler Configuration

@@ -364,7 +295,7 @@ engine = WXPathEngine(
  crawler=crawler,
  )

- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
+ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//main//a/@href)"

  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  ```
@@ -392,6 +323,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Automatic proxy rotation
  - Browser-based rendering (JavaScript execution)

+
  ## WARNINGS!!!

  - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
@@ -399,6 +331,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
  - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.

+
  ## License

  MIT
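
The README changes above are largely a syntax migration: crawl steps that interleaved XPath with a trailing `url(...)` hop are rewritten so the whole link-selecting XPath sits inside `///url(...)`. A before/after sketch using the two equivalent forms shown in the diff:

```python
# 0.1.0-style spelling (removed in this diff): the url(...) hop sits at the
# end of the XPath step.
old_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"

# 0.2.0-style spelling (added in this diff): the link-selecting XPath moves
# inside the ///url(...) operator.
new_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"
```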
{wxpath-0.1.0 → wxpath-0.2.0}/README.md

@@ -7,10 +7,11 @@ By introducing the `url(...)` operator and the `///` syntax, **wxpath**'s engine

  NOTE: This project is in early development. Core concepts are stable, but the API and features may change. Please report issues - in particular, deadlocked crawls or unexpected behavior - and any features you'd like to see (no guarantee they'll be implemented).

+
  ## Contents

  - [Example](#example)
- - [`url(...)` and `///` Explained](#url-and---explained)
+ - [`url(...)` and `///url(...)` Explained](#url-and---explained)
  - [General flow](#general-flow)
  - [Asynchronous Crawling](#asynchronous-crawling)
  - [Output types](#output-types)
@@ -19,11 +20,13 @@ NOTE: This project is in early development. Core concepts are stable, but the AP
  - [Hooks (Experimental)](#hooks-experimental)
  - [Install](#install)
  - [More Examples](#more-examples)
+ - [Comparisons](#comparisons)
  - [Advanced: Engine & Crawler Configuration](#advanced-engine--crawler-configuration)
  - [Project Philosophy](#project-philosophy)
  - [Warnings](#warnings)
  - [License](#license)

+
  ## Example

  ```python
@@ -31,7 +34,7 @@ import wxpath

  path = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))]/url(.)
+ ///url(//main//a/@href[starts-with(., '/wiki/') and not(contains(., ':'))])
  /map{
  'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
  'url':string(base-uri(.)),
@@ -66,10 +69,11 @@ The above expression does the following:
  4. Streams the extracted data as it is discovered.


- ## `url(...)` and `///` Explained
+ ## `url(...)` and `///url(...)` Explained

  - `url(...)` is a custom operator that fetches the content of the user-specified or internally generated URL and returns it as an `lxml.html.HtmlElement` for further XPath processing.
- - `///` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+ - `///url(...)` indicates infinite/recursive traversal. It tells **wxpath** to continue following links indefinitely, up to the specified `max_depth`. Unlike repeated `url()` hops, it allows a single expression to describe unbounded graph exploration. WARNING: Use with caution and constraints (via `max_depth` or XPath predicates) to avoid traversal explosion.
+

  ## General flow

@@ -79,14 +83,13 @@ The above expression does the following:

  XPath segments operate on fetched documents (fetched via the immediately preceding `url(...)` operations).

- `///` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.
+ `///url(...)` indicates infinite/recursive traversal - it proceeds breadth-first-*ish* up to `max_depth`.

  Results are yielded as soon as they are ready.


  ## Asynchronous Crawling

-
  **wxpath** is `asyncio/aiohttp`-first, providing an asynchronous API for crawling and extracting data.

  ```python
@@ -96,7 +99,7 @@ from wxpath import wxpath_async
  items = []

  async def main():
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(@href[starts-with(., '/wiki/')])//a/@href"
+ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
  async for item in wxpath_async(path_expr, max_depth=1):
  items.append(item)

@@ -105,16 +108,16 @@ asyncio.run(main())

  ### Blocking, Concurrent Requests

-
  **wxpath** also supports concurrent requests using an asyncio-in-sync pattern, allowing you to crawl multiple pages concurrently while maintaining the simplicity of synchronous code. This is particularly useful for crawls in strictly synchronous execution environments (i.e., not inside an `asyncio` event loop) where performance is a concern.

  ```python
  from wxpath import wxpath_async_blocking_iter

- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(@href[starts-with(., '/wiki/')])//a/@href"
+ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//@href[starts-with(., '/wiki/')])//a/@href"
  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1))
  ```

+
  ## Output types

  The wxpath Python API yields structured objects, not just strings.
@@ -138,7 +141,7 @@ The Python API preserves structure by default.
  ```python
  path_expr = """
  url('https://en.wikipedia.org/wiki/Expression_language')
- ///div[@id='mw-content-text']//a/url(@href)
+ ///url(//div[@id='mw-content-text']//a/@href)
  /map{
  'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
  'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
@@ -158,15 +161,18 @@ path_expr = """
  # ...]
  ```

+
  ## CLI

  **wxpath** provides a command-line interface (CLI) to quickly experiment and execute wxpath expressions directly from the terminal.

+ The following example demonstrates how to crawl Wikipedia starting from the "Expression language" page, extract links to other wiki pages, and retrieve specific fields from each linked page.
+
+ WARNING: Due to the everchanging nature of web content, the output may vary over time.
  ```bash
  > wxpath --depth 1 "\
  url('https://en.wikipedia.org/wiki/Expression_language')\
- ///div[@id='mw-content-text'] \
- //a/url(@href[starts-with(., '/wiki/') \
+ ///url(//div[@id='mw-content-text']//a/@href[starts-with(., '/wiki/') \
  and not(matches(@href, '^(?:/wiki/)?(?:Wikipedia|File|Template|Special|Template_talk|Help):'))]) \
  /map{ \
  'title':(//span[contains(@class, 'mw-page-title-main')]/text())[1], \
@@ -238,90 +244,13 @@ pip install wxpath

  ## More Examples

- ```python
- import wxpath
+ See [EXAMPLES.md](EXAMPLES.md) for more usage examples.

- #### EXAMPLE 1 - Simple, single page crawl and link extraction #######
- #
- # Starting from Expression language's wiki, extract all links (hrefs)
- # from the main section. The `url(...)` operator is used to execute a
- # web request to the specified URL and return the HTML content.
- #
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"
-
- items = wxpath.wxpath_async_blocking(path_expr)
-
-
- #### EXAMPLE 2 - Two-deep crawl and link extraction ##################
- #
- # Starting from Expression language's wiki, crawl all child links
- # starting with '/wiki/', and extract each child's links (hrefs). The
- # `url(...)` operator is pipe'd arguments from the evaluated XPath.
- #
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(@href[starts-with(., '/wiki/')])//a/@href"
-
- #### EXAMPLE 3 - Infinite crawl with BFS tree depth limit ############
- #
- # Starting from Expression language's wiki, infinitely crawl all child
- # links (and child's child's links recursively). The `///` syntax is
- # used to indicate an infinite crawl.
- # Returns lxml.html.HtmlElement objects.
- #
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
-
- # The same expression written differently:
- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///url(//main//a/@href)"
-
- # Modify (inclusive) max_depth to limit the BFS tree (crawl depth).
- items = wxpath.wxpath_async_blocking(path_expr, max_depth=1)
-
- #### EXAMPLE 4 - Infinite crawl with field extraction ################
- #
- # Infinitely crawls Expression language's wiki's child links and
- # childs' child links (recursively) and then, for each child link
- # crawled, extracts objects with the named fields as a dict.
- #
- path_expr = """
- url('https://en.wikipedia.org/wiki/Expression_language')
- ///main//a/url(@href)
- /map {
- 'title':(//span[contains(@class, "mw-page-title-main")]/text())[1],
- 'short_description':(//div[contains(@class, "shortdescription")]/text())[1],
- 'url'://link[@rel='canonical']/@href[1],
- 'backlink':wx:backlink(.),
- 'depth':wx:depth(.)
- }
- """

- # Under the hood of wxpath.core.wxpath, we generate `segments` list,
- # revealing the operations executed to accomplish the crawl.
- # >> segments = wxpath.core.parser.parse_wxpath_expr(path_expr);
- # >> segments
- # [Segment(op='url', value='https://en.wikipedia.org/wiki/Expression_language'),
- # Segment(op='url_inf', value='///url(//main//a/@href)'),
- # Segment(op='xpath', value='/map { \'title\':(//span[contains(@class, "mw-page-title-main")]/text())[1], \'short_description\':(//div[contains(@class, "shortdescription")]/text())[1], \'url\'://link[@rel=\'canonical\']/@href[1] }')]
-
- #### EXAMPLE 5 = Seeding from XPath function expression + mapping operator (`!`)
- #
- # Functionally create 10 Amazon book search result page URLs, map each URL to
- # the url(.) operator, and for each page, extract the title, price, and link of
- # each book listed.
- #
- base_url = "https://www.amazon.com/s?k=books&i=stripbooks&page="
-
- path_expr = f"""
- (1 to 10) ! ('{base_url}' || .) !
- url(.)
- //span[@data-component-type='s-search-results']//*[@role='listitem']
- /map {{
- 'title': (.//h2/span/text())[1],
- 'price': (.//span[@class='a-price']/span[@class='a-offscreen']/text())[1],
- 'link': (.//a[@aria-describedby='price-link']/@href)[1]
- }}
- """
+ ## Comparisons
+
+ See [COMPARISONS.md](COMPARISONS.md) for comparisons with other web-scraping tools.

- items = list(wxpath.wxpath_async_blocking_iter(path_expr, max_depth=1))
- ```

  ## Advanced: Engine & Crawler Configuration

@@ -346,7 +275,7 @@ engine = WXPathEngine(
  crawler=crawler,
  )

- path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')///main//a/url(@href)"
+ path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//url(//main//a/@href)"

  items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  ```
@@ -374,6 +303,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Automatic proxy rotation
  - Browser-based rendering (JavaScript execution)

+
  ## WARNINGS!!!

  - Be respectful when crawling websites. A scrapy-inspired throttler is enabled by default.
@@ -381,6 +311,7 @@ items = list(wxpath_async_blocking_iter(path_expr, max_depth=1, engine=engine))
  - Deadlocks and hangs are possible in certain situations (e.g., all tasks waiting on blocked requests). Please report issues if you encounter such behavior.
  - Consider using timeouts, `max_depth`, and XPath predicates and filters to limit crawl scope.

+
  ## License

  MIT
{wxpath-0.1.0 → wxpath-0.2.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "wxpath"
- version = "0.1.0"
+ version = "0.2.0"
  description = "wxpath - a declarative web crawler and data extractor"
  readme = "README.md"
  requires-python = ">=3.9"
@@ -16,12 +16,13 @@ license-files = ["LICENSE"]
  dependencies = [
      "requests>=2.0",
      "lxml>=4.0",
-     "elementpath>=5.0.0",
-     "aiohttp>=3.8.0"
+     "elementpath>=5.0.0,<=5.0.3",
+     "aiohttp>=3.8.0,<=3.12.15"
  ]

  [project.optional-dependencies]
  test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
+ dev = ["ruff"]

  [project.scripts]
  wxpath = "wxpath.cli:main"
@@ -32,7 +33,24 @@ addopts = "-ra -q"
  testpaths = ["tests"]

  [tool.setuptools]
- package-dir = {"" = "src"}

  [tool.setuptools.packages.find]
- include = ["wxpath"]
+ where = ["src"]
+ include = ["wxpath", "wxpath.*"]
+
+ [tool.ruff]
+ target-version = "py311"
+ line-length = 100
+
+ lint.select = [
+     "F",      # pyflakes (unused vars, undefined names, etc.)
+     "E",      # pycodestyle errors
+     "B",      # flake8-bugbear (real footguns)
+     "ASYNC",  # async/await correctness
+     "I",      # isort rules
+     "TID",    # Tidy imports
+     "ICN",    # Import conventions
+ ]
+
+ [tool.ruff.format]
+ quote-style = "single"
{wxpath-0.1.0 → wxpath-0.2.0}/src/wxpath/cli.py

@@ -2,34 +2,14 @@ import argparse
  import json
  import sys

- from wxpath.hooks import builtin # load default hooks
- from wxpath.core.ops import WxStr
  from wxpath.core.parser import parse_wxpath_expr
- from wxpath.core.runtime.engine import wxpath_async_blocking_iter, WXPathEngine
-
-
- def _simplify(obj):
-     """
-     Recursively convert custom wrapper types (e.g., WxStr / ExtractedStr,
-     lxml elements) into plain built-in Python types so that printing or
-     JSON serialising shows clean values.
-     """
-     # Scalars
-     if isinstance(obj, WxStr):
-         return str(obj)
-
-     # Mapping
-     if isinstance(obj, dict):
-         return {k: _simplify(v) for k, v in obj.items()}
-
-     # Sequence (but not str/bytes)
-     if isinstance(obj, (list, tuple, set)):
-         return type(obj)(_simplify(v) for v in obj)
-
-     return obj
+ from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
+ from wxpath.hooks import builtin, registry
+ from wxpath.util.serialize import simplify


  def main():
+     registry.register(builtin.SerializeXPathMapAndNodeHook)
      parser = argparse.ArgumentParser(description="Run wxpath expression.")
      parser.add_argument("expression", help="The wxpath expression")
      parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
@@ -39,7 +19,12 @@ def main():
      parser.add_argument("--verbose", action="store_true", help="Verbose mode")

      parser.add_argument("--concurrency", type=int, default=16, help="Number of concurrent fetches")
-     parser.add_argument("--concurrency-per-host", type=int, default=8, help="Number of concurrent fetches per host")
+     parser.add_argument(
+         "--concurrency-per-host",
+         type=int,
+         default=8,
+         help="Number of concurrent fetches per host"
+     )

      args = parser.parse_args()

@@ -48,8 +33,8 @@ def main():
          print("parsed expression:", parse_wxpath_expr(args.expression))

      if args.debug:
-         from wxpath import configure_logging, logging
-         configure_logging(logging.DEBUG)
+         from wxpath import configure_logging
+         configure_logging('DEBUG')

      engine = WXPathEngine(
          concurrency=args.concurrency,
@@ -57,7 +42,7 @@
      )
      try:
          for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
-             clean = _simplify(r)
+             clean = simplify(r)
              print(json.dumps(clean, ensure_ascii=False), flush=True)
      except BrokenPipeError:
          sys.exit(0)
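
The CLI's private `_simplify` helper moves behind `wxpath.util.serialize.simplify` (the new `util/serialize.py` is listed above at +22 lines, but its body is not shown in this diff). A hypothetical reconstruction modeled on the removed `_simplify`, not the actual module contents:

```python
# Hypothetical sketch of a simplify() helper, modeled on the removed
# cli._simplify; the shipped wxpath.util.serialize.simplify may differ.
def simplify(obj):
    # Unwrap string-like wrapper types (WxStr in the old CLI) to plain str.
    if isinstance(obj, str):
        return str(obj)
    # Recurse into mappings.
    if isinstance(obj, dict):
        return {k: simplify(v) for k, v in obj.items()}
    # Recurse into sequences (str/bytes are handled above).
    if isinstance(obj, (list, tuple, set)):
        return type(obj)(simplify(v) for v in obj)
    return obj
```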
wxpath-0.2.0/src/wxpath/core/__init__.py

@@ -0,0 +1,13 @@
+ from wxpath.core.runtime.engine import (
+     WXPathEngine,
+     wxpath_async,
+     wxpath_async_blocking,
+     wxpath_async_blocking_iter,
+ )
+
+ __all__ = [
+     'wxpath_async',
+     'wxpath_async_blocking',
+     'wxpath_async_blocking_iter',
+     'WXPathEngine',
+ ]
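
With these re-exports, the engine entry points become importable from `wxpath.core` as well as from the top-level package. A minimal usage sketch, reusing the call shape the CLI uses above (positional expression, depth, engine) and a link-extraction expression from the README:

```python
from wxpath.core import WXPathEngine, wxpath_async_blocking_iter

# Engine accepts a concurrency limit, as seen in cli.py above.
engine = WXPathEngine(concurrency=16)

path_expr = "url('https://en.wikipedia.org/wiki/Expression_language')//main//a/@href"

# Same positional call shape the CLI uses: (expression, depth, engine).
for item in wxpath_async_blocking_iter(path_expr, 1, engine):
    print(item)
```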
wxpath-0.2.0/src/wxpath/core/dom.py

@@ -0,0 +1,22 @@
+ from urllib.parse import urljoin
+
+
+ def _make_links_absolute(links: list[str], base_url: str) -> list[str]:
+     """
+     Convert relative links to absolute links based on the base URL.
+
+     Args:
+         links (list): List of link strings.
+         base_url (str): The base URL to resolve relative links against.
+
+     Returns:
+         List of absolute URLs.
+     """
+     if base_url is None:
+         raise ValueError("base_url must not be None when making links absolute.")
+     return [urljoin(base_url, link) for link in links if link]
+
+
+ def get_absolute_links_from_elem_and_xpath(elem, xpath):
+     base_url = getattr(elem, 'base_url', None)
+     return _make_links_absolute(elem.xpath3(xpath), base_url)
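
`get_absolute_links_from_elem_and_xpath` relies on an element with a populated `base_url` and an `xpath3(...)` method that appears to be patched onto elements elsewhere in the package (`patches.py` changes by +7 -2 above). A rough sketch of the same link-resolution logic using plain `lxml` calls, so it runs without the patched method:

```python
# Mirrors what dom._make_links_absolute does; the real helper calls
# elem.xpath3(), assumed here to be a patched XPath 3 evaluator.
from urllib.parse import urljoin

import lxml.html

html = "<html><body><main><a href='/wiki/XPath'>XPath</a><a href='/wiki/XQuery'>XQuery</a></main></body></html>"
doc = lxml.html.fromstring(html, base_url="https://en.wikipedia.org/wiki/Expression_language")

hrefs = doc.xpath("//main//a/@href")                     # relative links from the page
links = [urljoin(doc.base_url, h) for h in hrefs if h]   # same resolution as _make_links_absolute
print(links)  # ['https://en.wikipedia.org/wiki/XPath', 'https://en.wikipedia.org/wiki/XQuery']
```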