curl-reap 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- curl_reap-0.1.0/.gitignore +12 -0
- curl_reap-0.1.0/LICENSE +21 -0
- curl_reap-0.1.0/PKG-INFO +143 -0
- curl_reap-0.1.0/README.md +112 -0
- curl_reap-0.1.0/assets/logo.svg +28 -0
- curl_reap-0.1.0/curl_reap/__init__.py +47 -0
- curl_reap-0.1.0/curl_reap/adaptive.py +105 -0
- curl_reap-0.1.0/curl_reap/engine.py +99 -0
- curl_reap-0.1.0/curl_reap/http.py +128 -0
- curl_reap-0.1.0/curl_reap/parser.py +172 -0
- curl_reap-0.1.0/curl_reap/pipelines.py +79 -0
- curl_reap-0.1.0/curl_reap/spider.py +33 -0
- curl_reap-0.1.0/curl_reap/throttle.py +31 -0
- curl_reap-0.1.0/pyproject.toml +44 -0
curl_reap-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anish
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
curl_reap-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: curl_reap
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reap the web: browser-grade TLS impersonation, self-healing selectors, and a concurrent crawl engine in one library.
|
|
5
|
+
Project-URL: Homepage, https://github.com/anishfyi/curl_reap
|
|
6
|
+
Project-URL: Repository, https://github.com/anishfyi/curl_reap
|
|
7
|
+
Project-URL: Issues, https://github.com/anishfyi/curl_reap/issues
|
|
8
|
+
Author-email: Anish <anishfyi@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: crawler,curl_cffi,impersonate,lxml,scraping,selectors,spider,tls-fingerprint
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Requires-Dist: cssselect>=1.2
|
|
24
|
+
Requires-Dist: curl-cffi>=0.7.0
|
|
25
|
+
Requires-Dist: lxml>=5.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
<p align="center">
|
|
33
|
+
<img src="assets/logo.svg" alt="curl_reap" width="420" />
|
|
34
|
+
</p>
|
|
35
|
+
|
|
36
|
+
<p align="center"><b>Reap the web.</b> Browser-grade TLS impersonation, self-healing selectors, and a concurrent crawl engine, in one small library.</p>
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<code>pip install curl_reap</code>
|
|
40
|
+
</p>
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Why
|
|
45
|
+
|
|
46
|
+
Modern scraping needs three things, and today you reach for three different tools:
|
|
47
|
+
|
|
48
|
+
1. **Get past the door.** Sites fingerprint your TLS handshake and block stock Python clients. `curl_cffi` solves this with real Chrome/Safari fingerprints.
|
|
49
|
+
2. **Survive markup changes.** Plain CSS and XPath break the moment a site renames a class. Scrapling pioneered self-healing selectors that re-find the element anyway.
|
|
50
|
+
3. **Crawl at scale.** Concurrency, throttling, retries, dedup, and pipelines. That is Scrapy.
|
|
51
|
+
|
|
52
|
+
`curl_reap` takes the best idea from each and puts them behind one friendly API.
|
|
53
|
+
|
|
54
|
+
| | curl_cffi | Scrapy | Scrapling | **curl_reap** |
|
|
55
|
+
|---|:---:|:---:|:---:|:---:|
|
|
56
|
+
| Real browser TLS / JA3 | yes | no | partial | **yes** |
|
|
57
|
+
| Parser built in | no | yes | yes | **yes** |
|
|
58
|
+
| Self-healing selectors | no | no | yes | **yes** |
|
|
59
|
+
| Concurrent crawl engine | no | yes | no | **yes** |
|
|
60
|
+
| AutoThrottle, retries, pipelines | no | yes | no | **yes** |
|
|
61
|
+
| One small dependency set | yes | no | no | **yes** |
|
|
62
|
+
|
|
63
|
+
## Install
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install curl_reap
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Requires Python 3.9+. Pulls in `curl_cffi`, `lxml`, and `cssselect`.
|
|
70
|
+
|
|
71
|
+
## Quick start
|
|
72
|
+
|
|
73
|
+
A one-shot fetch parses like parsel, but the request carries a genuine browser fingerprint:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import curl_reap as reap
|
|
77
|
+
|
|
78
|
+
page = reap.get("https://quotes.toscrape.com", impersonate="chrome124")
|
|
79
|
+
print(page.css("span.text::text").getall())
|
|
80
|
+
print(page.css_first("small.author::text"))
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Self-healing selectors
|
|
84
|
+
|
|
85
|
+
Save an element once. Later, even if the site renames the class or moves the node, `auto_match` relocates it by structural signature:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
page = reap.get("https://shop.example.com/item/42")
|
|
89
|
+
page.css_first("a.buy-btn").save("buy_button") # remember its shape
|
|
90
|
+
|
|
91
|
+
# weeks later, the class is now "purchase-cta" and the old selector misses:
|
|
92
|
+
later = reap.get("https://shop.example.com/item/99")
|
|
93
|
+
btn = later.css_first("a.buy-btn", auto_match=True, identifier="buy_button")
|
|
94
|
+
print(btn.attr("href")) # found anyway
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Other finders: `page.find_by_text("Sign in")` and `page.find_similar(some_element)`.
|
|
98
|
+
|
|
99
|
+
## Crawl at scale
|
|
100
|
+
|
|
101
|
+
A `Spider` yields items (dicts) and more `Request` objects. The engine handles concurrency, AutoThrottle, retries, dedup, and pipelines:
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
import curl_reap as reap
|
|
105
|
+
from curl_reap import JsonLinesPipeline
|
|
106
|
+
|
|
107
|
+
class Quotes(reap.Spider):
|
|
108
|
+
start_urls = ["https://quotes.toscrape.com"]
|
|
109
|
+
|
|
110
|
+
def parse(self, page):
|
|
111
|
+
for q in page.css("div.quote"):
|
|
112
|
+
yield {
|
|
113
|
+
"text": q.css_first("span.text::text"),
|
|
114
|
+
"author": q.css_first("small.author::text"),
|
|
115
|
+
}
|
|
116
|
+
nxt = page.css_first("li.next a::attr(href)")
|
|
117
|
+
if nxt:
|
|
118
|
+
yield reap.Request("https://quotes.toscrape.com" + nxt, self.parse)
|
|
119
|
+
|
|
120
|
+
items = reap.run(
|
|
121
|
+
Quotes,
|
|
122
|
+
concurrency=8,
|
|
123
|
+
throttle=True, # AutoThrottle adapts to server latency
|
|
124
|
+
pipelines=[JsonLinesPipeline("quotes.jsonl")],
|
|
125
|
+
)
|
|
126
|
+
print(len(items), "items reaped")
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## API at a glance
|
|
130
|
+
|
|
131
|
+
- `reap.get(url, impersonate="chrome124", **kw)` and `reap.post(...)` return a `Response` you can `.css()` / `.xpath()` directly.
|
|
132
|
+
- `reap.Session(impersonate=..., headers=..., retries=...)` for a reusable client.
|
|
133
|
+
- `Selector` / `SelectorList`: `.css`, `.css_first`, `.xpath`, `.find_by_text`, `.find_similar`, `.save`, `.re`, `.text`, `.attr`.
|
|
134
|
+
- `reap.Spider`, `reap.Request`, `reap.run(spider, ...)`, `reap.Reaper(...)`.
|
|
135
|
+
- Pipelines: `DedupPipeline`, `JsonLinesPipeline`, `CsvPipeline`, or subclass `Pipeline`.
|
|
136
|
+
|
|
137
|
+
## Responsible use
|
|
138
|
+
|
|
139
|
+
`curl_reap` impersonates a real browser at the TLS level, which is exactly what a normal browser does. It does **not** ship a challenge solver and it will not break CAPTCHAs or anti-bot walls (Cloudflare challenges, DataDome, PerimeterX, and similar). If a site has deliberately put up an access-control wall, that is a signal to stop. Respect robots.txt and each site's terms, throttle your crawls, and only collect data you are allowed to collect.
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="assets/logo.svg" alt="curl_reap" width="420" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center"><b>Reap the web.</b> Browser-grade TLS impersonation, self-healing selectors, and a concurrent crawl engine, in one small library.</p>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<code>pip install curl_reap</code>
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Why
|
|
14
|
+
|
|
15
|
+
Modern scraping needs three things, and today you reach for three different tools:
|
|
16
|
+
|
|
17
|
+
1. **Get past the door.** Sites fingerprint your TLS handshake and block stock Python clients. `curl_cffi` solves this with real Chrome/Safari fingerprints.
|
|
18
|
+
2. **Survive markup changes.** Plain CSS and XPath break the moment a site renames a class. Scrapling pioneered self-healing selectors that re-find the element anyway.
|
|
19
|
+
3. **Crawl at scale.** Concurrency, throttling, retries, dedup, and pipelines. That is Scrapy.
|
|
20
|
+
|
|
21
|
+
`curl_reap` takes the best idea from each and puts them behind one friendly API.
|
|
22
|
+
|
|
23
|
+
| | curl_cffi | Scrapy | Scrapling | **curl_reap** |
|
|
24
|
+
|---|:---:|:---:|:---:|:---:|
|
|
25
|
+
| Real browser TLS / JA3 | yes | no | partial | **yes** |
|
|
26
|
+
| Parser built in | no | yes | yes | **yes** |
|
|
27
|
+
| Self-healing selectors | no | no | yes | **yes** |
|
|
28
|
+
| Concurrent crawl engine | no | yes | no | **yes** |
|
|
29
|
+
| AutoThrottle, retries, pipelines | no | yes | no | **yes** |
|
|
30
|
+
| One small dependency set | yes | no | no | **yes** |
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install curl_reap
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Requires Python 3.9+. Pulls in `curl_cffi`, `lxml`, and `cssselect`.
|
|
39
|
+
|
|
40
|
+
## Quick start
|
|
41
|
+
|
|
42
|
+
A one-shot fetch parses like parsel, but the request carries a genuine browser fingerprint:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
import curl_reap as reap
|
|
46
|
+
|
|
47
|
+
page = reap.get("https://quotes.toscrape.com", impersonate="chrome124")
|
|
48
|
+
print(page.css("span.text::text").getall())
|
|
49
|
+
print(page.css_first("small.author::text"))
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Self-healing selectors
|
|
53
|
+
|
|
54
|
+
Save an element once. Later, even if the site renames the class or moves the node, `auto_match` relocates it by structural signature:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
page = reap.get("https://shop.example.com/item/42")
|
|
58
|
+
page.css_first("a.buy-btn").save("buy_button") # remember its shape
|
|
59
|
+
|
|
60
|
+
# weeks later, the class is now "purchase-cta" and the old selector misses:
|
|
61
|
+
later = reap.get("https://shop.example.com/item/99")
|
|
62
|
+
btn = later.css_first("a.buy-btn", auto_match=True, identifier="buy_button")
|
|
63
|
+
print(btn.attr("href")) # found anyway
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Other finders: `page.find_by_text("Sign in")` and `page.find_similar(some_element)`.
|
|
67
|
+
|
|
68
|
+
## Crawl at scale
|
|
69
|
+
|
|
70
|
+
A `Spider` yields items (dicts) and more `Request` objects. The engine handles concurrency, AutoThrottle, retries, dedup, and pipelines:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import curl_reap as reap
|
|
74
|
+
from curl_reap import JsonLinesPipeline
|
|
75
|
+
|
|
76
|
+
class Quotes(reap.Spider):
|
|
77
|
+
start_urls = ["https://quotes.toscrape.com"]
|
|
78
|
+
|
|
79
|
+
def parse(self, page):
|
|
80
|
+
for q in page.css("div.quote"):
|
|
81
|
+
yield {
|
|
82
|
+
"text": q.css_first("span.text::text"),
|
|
83
|
+
"author": q.css_first("small.author::text"),
|
|
84
|
+
}
|
|
85
|
+
nxt = page.css_first("li.next a::attr(href)")
|
|
86
|
+
if nxt:
|
|
87
|
+
yield reap.Request("https://quotes.toscrape.com" + nxt, self.parse)
|
|
88
|
+
|
|
89
|
+
items = reap.run(
|
|
90
|
+
Quotes,
|
|
91
|
+
concurrency=8,
|
|
92
|
+
throttle=True, # AutoThrottle adapts to server latency
|
|
93
|
+
pipelines=[JsonLinesPipeline("quotes.jsonl")],
|
|
94
|
+
)
|
|
95
|
+
print(len(items), "items reaped")
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## API at a glance
|
|
99
|
+
|
|
100
|
+
- `reap.get(url, impersonate="chrome124", **kw)` and `reap.post(...)` return a `Response` you can `.css()` / `.xpath()` directly.
|
|
101
|
+
- `reap.Session(impersonate=..., headers=..., retries=...)` for a reusable client.
|
|
102
|
+
- `Selector` / `SelectorList`: `.css`, `.css_first`, `.xpath`, `.find_by_text`, `.find_similar`, `.save`, `.re`, `.text`, `.attr`.
|
|
103
|
+
- `reap.Spider`, `reap.Request`, `reap.run(spider, ...)`, `reap.Reaper(...)`.
|
|
104
|
+
- Pipelines: `DedupPipeline`, `JsonLinesPipeline`, `CsvPipeline`, or subclass `Pipeline`.
|
|
105
|
+
|
|
106
|
+
## Responsible use
|
|
107
|
+
|
|
108
|
+
`curl_reap` impersonates a real browser at the TLS level, which is exactly what a normal browser does. It does **not** ship a challenge solver and it will not break CAPTCHAs or anti-bot walls (Cloudflare challenges, DataDome, PerimeterX, and similar). If a site has deliberately put up an access-control wall, that is a signal to stop. Respect robots.txt and each site's terms, throttle your crawls, and only collect data you are allowed to collect.
|
|
109
|
+
|
|
110
|
+
## License
|
|
111
|
+
|
|
112
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
<svg width="520" height="170" viewBox="0 0 520 170" fill="none" xmlns="http://www.w3.org/2000/svg" font-family="ui-monospace, SFMono-Regular, Menlo, monospace">
|
|
2
|
+
<defs>
|
|
3
|
+
<linearGradient id="blade" x1="20" y1="20" x2="120" y2="130" gradientUnits="userSpaceOnUse">
|
|
4
|
+
<stop offset="0" stop-color="#F0C24B"/>
|
|
5
|
+
<stop offset="1" stop-color="#C9871A"/>
|
|
6
|
+
</linearGradient>
|
|
7
|
+
</defs>
|
|
8
|
+
|
|
9
|
+
<!-- the reap mark: a sickle blade that curves like a curl -->
|
|
10
|
+
<g transform="translate(26,20)">
|
|
11
|
+
<!-- blade -->
|
|
12
|
+
<path d="M104 40 C 74 14, 26 30, 22 72 C 19 104, 44 124, 80 118"
|
|
13
|
+
stroke="url(#blade)" stroke-width="16" stroke-linecap="round" fill="none"/>
|
|
14
|
+
<!-- handle -->
|
|
15
|
+
<path d="M80 118 C 92 116, 104 122, 112 134"
|
|
16
|
+
stroke="#7A4E27" stroke-width="13" stroke-linecap="round" fill="none"/>
|
|
17
|
+
<!-- grain being reaped, inside the curl -->
|
|
18
|
+
<circle cx="112" cy="50" r="5.5" fill="#E7B23E"/>
|
|
19
|
+
<circle cx="120" cy="68" r="5" fill="#E7B23E"/>
|
|
20
|
+
<circle cx="105" cy="74" r="4.5" fill="#E7B23E"/>
|
|
21
|
+
</g>
|
|
22
|
+
|
|
23
|
+
<!-- wordmark -->
|
|
24
|
+
<text x="186" y="92" font-size="50" font-weight="700" letter-spacing="-1">
|
|
25
|
+
<tspan fill="#8A8178">curl_</tspan><tspan fill="#C9871A">reap</tspan>
|
|
26
|
+
</text>
|
|
27
|
+
<text x="188" y="124" font-size="18" font-weight="500" letter-spacing="3" fill="#9a9189">REAP THE WEB</text>
|
|
28
|
+
</svg>
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""curl_reap: reap the web.
|
|
2
|
+
|
|
3
|
+
Three pillars in one library:
|
|
4
|
+
1. Transport: real browser TLS/JA3 impersonation (powered by curl_cffi) so your
|
|
5
|
+
requests are not fingerprinted as a bot.
|
|
6
|
+
2. Parsing: a fast lxml selector with parsel-style css/xpath plus self-healing
|
|
7
|
+
selectors that survive markup changes.
|
|
8
|
+
3. Orchestration: a small concurrent crawl engine with dedup, retries,
|
|
9
|
+
AutoThrottle, and item pipelines.
|
|
10
|
+
|
|
11
|
+
Quick start:
|
|
12
|
+
|
|
13
|
+
import curl_reap as reap
|
|
14
|
+
|
|
15
|
+
page = reap.get("https://quotes.toscrape.com")
|
|
16
|
+
print(page.css("span.text::text").getall())
|
|
17
|
+
|
|
18
|
+
class Quotes(reap.Spider):
|
|
19
|
+
start_urls = ["https://quotes.toscrape.com"]
|
|
20
|
+
def parse(self, page):
|
|
21
|
+
for q in page.css("div.quote"):
|
|
22
|
+
yield {"text": q.css_first("span.text::text"),
|
|
23
|
+
"author": q.css_first("small.author::text")}
|
|
24
|
+
nxt = page.css_first("li.next a::attr(href)")
|
|
25
|
+
if nxt:
|
|
26
|
+
yield reap.Request("https://quotes.toscrape.com" + nxt, self.parse)
|
|
27
|
+
|
|
28
|
+
items = reap.run(Quotes, concurrency=8)
|
|
29
|
+
"""
|
|
30
|
+
from .adaptive import relocate, save, signature, similarity
|
|
31
|
+
from .engine import Reaper, run
|
|
32
|
+
from .http import Response, Session, fetch, get, post
|
|
33
|
+
from .parser import Selector, SelectorList
|
|
34
|
+
from .pipelines import CsvPipeline, DedupPipeline, JsonLinesPipeline, Pipeline
|
|
35
|
+
from .spider import Request, Spider
|
|
36
|
+
from .throttle import AutoThrottle
|
|
37
|
+
|
|
38
|
+
__version__ = "0.1.0"
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
"get", "post", "fetch", "Session", "Response",
|
|
42
|
+
"Selector", "SelectorList",
|
|
43
|
+
"Spider", "Request", "Reaper", "run",
|
|
44
|
+
"Pipeline", "DedupPipeline", "JsonLinesPipeline", "CsvPipeline", "AutoThrottle",
|
|
45
|
+
"signature", "similarity", "save", "relocate",
|
|
46
|
+
"__version__",
|
|
47
|
+
]
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Self-healing selectors (the Scrapling-inspired pillar).
|
|
2
|
+
|
|
3
|
+
Save a structural signature of an element once. Later, even if the site renames
|
|
4
|
+
classes or reshuffles its DOM, relocate() finds the element again by scoring every
|
|
5
|
+
node against that signature. This keeps scrapers alive across markup changes, which
|
|
6
|
+
is the single biggest maintenance cost of plain CSS/XPath scrapers.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
DEFAULT_STORE = ".reap_selectors.json"
|
|
14
|
+
|
|
15
|
+
_WEIGHTS = {"tag": 2.0, "classes": 3.0, "id": 2.0, "attrs": 1.0, "text": 1.5, "path": 1.5}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def signature(el):
    """Build a JSON-friendly fingerprint of ``el`` and its position in the tree.

    The returned dict holds the tag, sorted class names, id, the sorted names
    of the remaining attributes, the first 48 characters of the element's
    stripped leading text, its depth, its index among its parent's children
    and its slash-joined tag path.
    """
    container = el.getparent()
    # A parentless element is treated as its own only sibling.
    siblings = [el] if container is None else list(container)
    try:
        position = siblings.index(el)
    except ValueError:
        position = 0

    class_names = (el.get("class") or "").split()
    class_names.sort()
    other_attrs = [name for name in el.keys() if name not in ("class", "id")]
    other_attrs.sort()
    leading_text = (el.text or "").strip()

    return {
        "tag": str(el.tag),
        "classes": class_names,
        "id": el.get("id") or "",
        "attrs": other_attrs,
        "text": leading_text[:48],
        "depth": _depth(el),
        "index": position,
        "path": _path(el),
    }
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _depth(el):
|
|
36
|
+
d = 0
|
|
37
|
+
p = el.getparent()
|
|
38
|
+
while p is not None:
|
|
39
|
+
d += 1
|
|
40
|
+
p = p.getparent()
|
|
41
|
+
return d
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _path(el):
|
|
45
|
+
parts = []
|
|
46
|
+
cur = el
|
|
47
|
+
while cur is not None and isinstance(cur.tag, str):
|
|
48
|
+
parts.append(cur.tag)
|
|
49
|
+
cur = cur.getparent()
|
|
50
|
+
return "/".join(reversed(parts))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _jaccard(a, b):
|
|
54
|
+
a, b = set(a), set(b)
|
|
55
|
+
if not a and not b:
|
|
56
|
+
return 1.0
|
|
57
|
+
return len(a & b) / max(1, len(a | b))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def similarity(a, b):
    """Weighted similarity in the range 0..1 between two signature dicts.

    Tag, id and text score all-or-nothing (an empty id or text on ``a`` never
    matches). Classes and attribute names score by Jaccard overlap. The path
    scores 1.0 when identical, 0.5 when only the last three segments agree,
    otherwise 0.0. Each component is scaled by its entry in ``_WEIGHTS``.
    """
    tag_sc = 1.0 if a["tag"] == b["tag"] else 0.0
    class_sc = _jaccard(a["classes"], b["classes"])
    id_sc = 1.0 if a["id"] and a["id"] == b["id"] else 0.0
    attr_sc = _jaccard(a["attrs"], b["attrs"])
    text_sc = 1.0 if a["text"] and a["text"] == b["text"] else 0.0

    suffix_a = a["path"].split("/")[-3:]
    suffix_b = b["path"].split("/")[-3:]
    if a["path"] == b["path"]:
        path_sc = 1.0
    elif suffix_a == suffix_b:
        path_sc = 0.5
    else:
        path_sc = 0.0

    components = (
        ("tag", tag_sc),
        ("classes", class_sc),
        ("id", id_sc),
        ("attrs", attr_sc),
        ("text", text_sc),
        ("path", path_sc),
    )
    weighted = 0.0
    for key, value in components:
        weighted += _WEIGHTS[key] * value
    return weighted / sum(_WEIGHTS.values())
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def save(identifier, el, storage=None):
    """Persist the signature of ``el`` under ``identifier`` in the JSON store.

    Args:
        identifier: Key the signature is stored under; an existing entry with
            the same key is overwritten.
        el: Element to fingerprint (passed to ``signature``).
        storage: Path of the JSON store; falls back to ``DEFAULT_STORE``.

    The store is rewritten atomically: the new content goes to a temporary
    file in the same directory and is then moved over the target with
    ``os.replace``. Writing in place would truncate the file first, so a
    crash, a serialization error or a concurrent reader in the middle of the
    write would see an empty/partial file and every saved signature would be
    lost.
    """
    import tempfile

    path = storage or DEFAULT_STORE
    data = _load(path)
    data[identifier] = signature(el)

    # The temp file must live next to the target so the final rename stays on
    # one filesystem.
    directory = os.path.dirname(os.path.abspath(path))
    fd, tmp_path = tempfile.mkstemp(dir=directory, prefix=".reap_", suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            json.dump(data, fh, indent=1)
        os.replace(tmp_path, path)
    except BaseException:
        # Leave the previous store untouched and do not litter temp files.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def relocate(identifier, tree, storage=None, threshold=0.6):
    """Find the element in ``tree`` that best matches a saved signature.

    Returns ``None`` when nothing is stored under ``identifier`` or when no
    element scores strictly above ``threshold``. On equal scores the element
    encountered first during iteration wins.
    """
    store = _load(storage or DEFAULT_STORE)
    saved = store.get(identifier)
    if not saved:
        return None

    winner = None
    high_water = threshold
    for node in tree.iter():
        # Nodes whose tag is not a string are skipped.
        if isinstance(node.tag, str):
            rating = similarity(saved, signature(node))
            if rating > high_water:
                winner = node
                high_water = rating
    return winner
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _load(path):
|
|
99
|
+
if not os.path.exists(path):
|
|
100
|
+
return {}
|
|
101
|
+
try:
|
|
102
|
+
with open(path, encoding="utf-8") as fh:
|
|
103
|
+
return json.load(fh)
|
|
104
|
+
except Exception: # noqa: BLE001
|
|
105
|
+
return {}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""The crawl engine (the Scrapy idea, kept small): concurrent fetching with
|
|
2
|
+
dedup, retries, AutoThrottle, and item pipelines, on top of the impersonating
|
|
3
|
+
transport. Spider callbacks yield items (dicts) and further Requests.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import threading
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
9
|
+
|
|
10
|
+
from .http import Session
|
|
11
|
+
from .pipelines import DedupPipeline
|
|
12
|
+
from .spider import Request
|
|
13
|
+
from .throttle import AutoThrottle
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Reaper:
    """Concurrent crawl engine that drives one spider to completion.

    Requests are fetched level by level on a thread pool. Each request is
    fingerprinted so it is fetched at most once, the callback's output is
    split into follow-up ``Request`` objects and items, and items are passed
    through the pipelines in order.

    Args:
        spider: Spider instance providing ``start()`` and ``parse``.
        concurrency: Number of worker threads.
        retries: Retry count handed to the underlying ``Session``.
        throttle: Whether the ``AutoThrottle`` is enabled.
        delay: Base delay handed to the ``AutoThrottle``.
        impersonate: Browser profile handed to the ``Session``.
        pipelines: Item pipelines; a ``DedupPipeline`` is put first when
            ``dedup`` is true and none is present.
        dedup: See ``pipelines``.
        on_item: Optional callable invoked with every item that survives the
            pipelines.
        max_pages: Upper bound on successfully fetched pages; falsy means
            unlimited.

    Attributes:
        items: Items that survived the pipelines, in emission order.
        stats: Counters for ``requests``, ``items``, ``errors``, ``dropped``.
    """

    def __init__(self, spider, concurrency=8, retries=2, throttle=True, delay=0.0,
                 impersonate="chrome124", pipelines=None, dedup=True, on_item=None,
                 max_pages=None):
        self.spider = spider
        self.concurrency = concurrency
        self.max_pages = max_pages
        self.session = Session(impersonate=impersonate, retries=retries)
        self.throttle = AutoThrottle(base_delay=delay, target_concurrency=concurrency, enabled=throttle)
        self.pipelines = list(pipelines or [])
        if dedup and not any(isinstance(p, DedupPipeline) for p in self.pipelines):
            self.pipelines.insert(0, DedupPipeline())
        self.on_item = on_item
        self._seen = set()
        # Fetches that passed the max_pages gate but have not finished yet.
        self._in_flight = 0
        self.items = []
        self._lock = threading.Lock()
        self.stats = {"requests": 0, "items": 0, "errors": 0, "dropped": 0}

    def _fetch_and_parse(self, req):
        """Fetch one request and return everything its callback produced.

        Returns an empty list when the request was already seen, when the
        page budget is used up, or when the fetch fails. If the callback
        raises part-way, the outputs collected so far are still returned and
        the error is counted.
        """
        import time

        with self._lock:
            fp = req.fingerprint()
            if fp in self._seen:
                return []
            self._seen.add(fp)
            # Count in-flight fetches too: completed requests alone would let
            # a whole concurrent batch slip past the limit before any of them
            # has been counted.
            if self.max_pages and self.stats["requests"] + self._in_flight >= self.max_pages:
                return []
            self._in_flight += 1

        fetched = False
        try:
            self.throttle.wait()
            # Monotonic clock: latency must not be skewed by wall-clock jumps.
            t0 = time.monotonic()
            try:
                resp = self.session.request(req.method, req.url, meta=req.meta, **req.kw)
            except Exception:  # noqa: BLE001
                with self._lock:
                    self.stats["errors"] += 1
                return []
            self.throttle.observe(time.monotonic() - t0)
            fetched = True
        finally:
            # Release the reservation; it becomes a counted request only if
            # the fetch actually succeeded.
            with self._lock:
                self._in_flight -= 1
                if fetched:
                    self.stats["requests"] += 1

        callback = req.callback or self.spider.parse
        produced = []
        try:
            for out in (callback(resp) or []):
                produced.append(out)
        except Exception:  # noqa: BLE001
            with self._lock:
                self.stats["errors"] += 1
        return produced

    def run(self):
        """Crawl until the frontier is empty and return the collected items.

        Every pipeline that was opened is closed again even when the crawl
        aborts with an exception, so pipeline resources are not leaked.
        """
        opened = []
        try:
            for p in self.pipelines:
                p.open()
                opened.append(p)
            frontier = list(self.spider.start())
            with ThreadPoolExecutor(max_workers=self.concurrency) as pool:
                while frontier:
                    futures = [pool.submit(self._fetch_and_parse, r) for r in frontier]
                    frontier = []
                    for fut in futures:
                        for out in fut.result():
                            if isinstance(out, Request):
                                frontier.append(out)
                            elif out is not None:
                                self._emit(out)
        finally:
            self._close_pipelines(opened)
        return self.items

    @staticmethod
    def _close_pipelines(pipelines):
        """Close every given pipeline, then re-raise the first close error, if any."""
        first_error = None
        for p in pipelines:
            try:
                p.close()
            except Exception as exc:  # noqa: BLE001
                if first_error is None:
                    first_error = exc
        if first_error is not None:
            raise first_error

    def _emit(self, item):
        """Run ``item`` through the pipelines; keep it unless one returns None."""
        for p in self.pipelines:
            item = p.process(item)
            if item is None:
                with self._lock:
                    self.stats["dropped"] += 1
                return
        with self._lock:
            self.items.append(item)
            self.stats["items"] += 1
        if self.on_item:
            self.on_item(item)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def run(spider, **kw):
    """Crawl with ``spider`` until done and return the scraped items.

    ``spider`` may be a Spider class, which is instantiated without arguments,
    or a ready instance. Remaining keyword arguments go to ``Reaper``.
    """
    instance = spider
    if isinstance(spider, type):
        instance = spider()
    engine = Reaper(instance, **kw)
    return engine.run()
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Transport layer: curl_cffi sessions with real browser TLS/JA3 impersonation.
|
|
2
|
+
|
|
3
|
+
This is the "get past the front door" pillar (the curl_cffi strength). Every
|
|
4
|
+
request carries a genuine Chrome/Safari TLS + HTTP2 fingerprint, which is what
|
|
5
|
+
defeats fingerprint-based bot detection that blocks stock Python clients.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from curl_cffi import requests as _cffi
|
|
10
|
+
|
|
11
|
+
from .parser import Selector
|
|
12
|
+
|
|
13
|
+
DEFAULT_IMPERSONATE = "chrome124"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Response:
    """One fetched page, usable directly as a parser.

    Status, URL, headers, text and bytes are copied from the raw response at
    construction time. The parsing helpers forward to a ``Selector`` that is
    built lazily on first use and then reused.
    """

    def __init__(self, raw, meta=None):
        self.raw = raw
        self.status = raw.status_code
        self.url = str(raw.url)
        self.headers = dict(raw.headers)
        self.text = raw.text
        self.content = raw.content
        self.meta = meta or {}
        # Selector cache, filled by selector().
        self._sel = None

    @property
    def ok(self):
        """True for a 2xx status."""
        return 200 <= self.status < 300

    def selector(self):
        """Return the cached ``Selector`` for this page, creating it if needed."""
        cached = self._sel
        if cached is None:
            cached = Selector(content=self.text, url=self.url, status=self.status, headers=self.headers)
            self._sel = cached
        return cached

    # Everything below simply forwards to the page's Selector.
    def css(self, *args, **kwargs):
        page = self.selector()
        return page.css(*args, **kwargs)

    def css_first(self, *args, **kwargs):
        page = self.selector()
        return page.css_first(*args, **kwargs)

    def xpath(self, *args, **kwargs):
        page = self.selector()
        return page.xpath(*args, **kwargs)

    def find_by_text(self, *args, **kwargs):
        page = self.selector()
        return page.find_by_text(*args, **kwargs)

    def find_similar(self, *args, **kwargs):
        page = self.selector()
        return page.find_similar(*args, **kwargs)

    def re(self, *args, **kwargs):
        page = self.selector()
        return page.re(*args, **kwargs)

    def save(self, *args, **kwargs):
        page = self.selector()
        return page.save(*args, **kwargs)

    def json(self):
        """Decode the body as JSON via the raw response."""
        return self.raw.json()

    def __repr__(self):
        return "<Response {} {}>".format(self.status, self.url)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class Session:
    """A reusable curl_cffi session with impersonation, default headers, retries.

    Args:
        impersonate: Browser profile applied to every request unless a call
            overrides it.
        headers: Default headers merged into every request; per-call headers
            win on conflict.
        timeout: Default timeout applied unless a call overrides it.
        retries: How many extra attempts are made after a failed request.
        proxies: Passed through to the underlying curl_cffi session.
        **kw: Further keyword arguments for the underlying curl_cffi session.

    The session can also be used as a context manager, which closes it on
    exit.
    """

    def __init__(self, impersonate=DEFAULT_IMPERSONATE, headers=None, timeout=30,
                 retries=2, proxies=None, **kw):
        self.impersonate = impersonate
        self.timeout = timeout
        self.retries = retries
        self._headers = dict(headers or {})
        self._s = _cffi.Session(impersonate=impersonate, proxies=proxies, **kw)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def request(self, method, url, **kw):
        """Send a request and wrap the result in a ``Response``.

        ``meta`` and ``retries`` are consumed here and not forwarded to
        curl_cffi. Any exception triggers another attempt until the retry
        budget is used up, after which the last exception is re-raised.
        """
        kw.setdefault("impersonate", self.impersonate)
        kw.setdefault("timeout", self.timeout)
        merged = dict(self._headers)
        merged.update(kw.pop("headers", {}) or {})
        if merged:
            kw["headers"] = merged
        meta = kw.pop("meta", None)
        retries = kw.pop("retries", self.retries)
        # Always make at least one attempt. With a negative retry count the
        # loop would otherwise never run and the final `raise` would be handed
        # None, surfacing as an unrelated TypeError.
        attempts = max(1, (retries or 0) + 1)
        last = None
        for _ in range(attempts):
            try:
                return Response(self._s.request(method, url, **kw), meta=meta)
            except Exception as exc:  # noqa: BLE001
                last = exc
        raise last

    def get(self, url, **kw):
        """Shortcut for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def post(self, url, **kw):
        """Shortcut for ``request("POST", url, **kw)``."""
        return self.request("POST", url, **kw)

    def close(self):
        """Close the underlying session; errors while closing are ignored."""
        try:
            self._s.close()
        except Exception:  # noqa: BLE001
            # Deliberately best-effort: closing must never raise.
            pass
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
_default = None
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _session():
    """Return the module-wide default ``Session``, creating it on first call."""
    global _default
    if _default is not None:
        return _default
    _default = Session()
    return _default
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def get(url, **kw):
    """Issue a single GET through the shared impersonating session.

    Keyword arguments are forwarded to ``Session.get``; the result is a
    ``Response``.
    """
    shared = _session()
    return shared.get(url, **kw)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def post(url, **kw):
    """Issue a single POST through the shared impersonating session.

    Keyword arguments are forwarded to ``Session.post``.
    """
    shared = _session()
    return shared.post(url, **kw)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def fetch(url, **kw):
    """Alias of ``get``: fetch ``url`` with a GET request."""
    page = get(url, **kw)
    return page
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Parsing layer: a fast lxml selector with parsel-style ergonomics plus the
|
|
2
|
+
Scrapling-style extras (find by text, find similar, and self-healing selectors).
|
|
3
|
+
|
|
4
|
+
Supports CSS with ::text and ::attr(name) pseudo elements, XPath, regex, and an
|
|
5
|
+
auto_match mode that re-locates an element from a saved signature when the site
|
|
6
|
+
changes its markup (see adaptive.py).
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
import lxml.html
|
|
13
|
+
|
|
14
|
+
_ATTR_RE = re.compile(r"::attr\(\s*([\w:-]+)\s*\)\s*$")
|
|
15
|
+
_TEXT_RE = re.compile(r"::text\s*$")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _parse_pseudo(query):
|
|
19
|
+
m = _ATTR_RE.search(query)
|
|
20
|
+
if m:
|
|
21
|
+
return ("attr", m.group(1), _ATTR_RE.sub("", query).strip())
|
|
22
|
+
if _TEXT_RE.search(query):
|
|
23
|
+
return ("text", None, _TEXT_RE.sub("", query).strip())
|
|
24
|
+
return (None, None, query)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _text_of(el):
|
|
28
|
+
try:
|
|
29
|
+
return el.text_content().strip()
|
|
30
|
+
except Exception: # noqa: BLE001
|
|
31
|
+
return (getattr(el, "text", "") or "").strip()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class SelectorList(list):
    """List of Selector objects and/or plain strings with parsel-style helpers."""

    def get(self, default=None):
        """Return the first entry, or ``default`` when the list is empty."""
        if not self:
            return default
        return self[0]

    def getall(self):
        """Return all entries as a plain ``list``."""
        return [*self]

    def text(self):
        """Replace each Selector by its ``text``; other entries pass through unchanged."""
        return SelectorList(
            entry.text if isinstance(entry, Selector) else entry for entry in self
        )

    def attr(self, name, default=None):
        """Collect attribute ``name`` from every Selector entry; other entries are skipped."""
        return SelectorList(
            entry.attr(name, default) for entry in self if isinstance(entry, Selector)
        )

    def css(self, query, **kw):
        """Run ``css(query, **kw)`` on every Selector entry and flatten the results."""
        return SelectorList(
            hit
            for entry in self
            if isinstance(entry, Selector)
            for hit in entry.css(query, **kw)
        )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Selector:
    """Wraps one lxml element (or a parsed document).

    A Selector is built either around an existing lxml ``element`` or by
    parsing HTML ``content``. ``url``, ``status`` and ``headers`` are stored
    as plain attributes; ``headers`` is copied into a new dict.
    """

    def __init__(self, content=None, element=None, url=None, status=None, headers=None):
        # lxml refuses to parse an empty document, so empty or whitespace-only
        # markup is treated like missing markup and yields a bare <html> tree.
        blank = content is None or (
            isinstance(content, (str, bytes)) and not content.strip()
        )
        if element is not None:
            self._el = element
        elif not blank:
            self._el = lxml.html.fromstring(content)
        else:
            self._el = lxml.html.fromstring("<html></html>")
        self.url = url
        self.status = status
        self.headers = dict(headers or {})

    # --- selection ---------------------------------------------------------
    def css(self, query, auto_match=False, identifier=None, storage=None):
        """Select by CSS, with optional ``::text`` / ``::attr(name)`` suffix.

        Returns a SelectorList of attribute values for ``::attr(name)``, of
        stripped text for ``::text``, and of Selectors otherwise. A selector
        that fails to evaluate yields no matches instead of raising. When
        nothing matches and ``auto_match`` is true, the element is looked up
        through ``adaptive.relocate`` using ``identifier`` (or the query).
        """
        kind, attr, q = _parse_pseudo(query)
        try:
            # A bare pseudo element (empty selector) applies to this element.
            els = self._el.cssselect(q) if q else [self._el]
        except Exception:  # noqa: BLE001
            els = []
        if not els and auto_match:
            from .adaptive import relocate
            found = relocate(identifier or query, self._root(), storage=storage)
            els = [found] if found is not None else []
        if kind == "attr":
            return SelectorList(e.get(attr) for e in els)
        if kind == "text":
            return SelectorList(_text_of(e) for e in els)
        return SelectorList(Selector(element=e, url=self.url) for e in els)

    def css_first(self, query, default=None, **kw):
        """Return the first ``css`` result, or ``default`` when there is none."""
        res = self.css(query, **kw)
        return res[0] if res else default

    def xpath(self, query):
        """Evaluate an XPath expression relative to this element.

        Strings and scalar values (numbers, booleans) are returned as-is
        inside the SelectorList; any other result item is wrapped in a
        Selector. A failing expression yields an empty SelectorList.
        """
        try:
            res = self._el.xpath(query)
        except Exception:  # noqa: BLE001
            return SelectorList()
        # Expressions such as count(...) or string(...) return a single
        # scalar rather than a list; normalise so it is neither iterated
        # character by character nor rejected as non-iterable.
        if not isinstance(res, (list, tuple)):
            res = [res]
        out = SelectorList()
        for r in res:
            if isinstance(r, (str, bool, int, float)):
                out.append(r)
            else:
                out.append(Selector(element=r, url=self.url))
        return out

    # --- Scrapling-style finders ------------------------------------------
    def find_by_text(self, text, partial=True, first=False):
        """Find elements whose own leading text contains (or equals) ``text``.

        Only each element's ``.text`` is compared, after stripping. With
        ``partial`` the match is a substring test, otherwise exact equality.
        ``first`` stops after the first hit.
        """
        out = SelectorList()
        for e in self._el.iter():
            # Skip nodes whose tag is not a string.
            if not isinstance(e.tag, str):
                continue
            t = (e.text or "").strip()
            hit = (text in t) if partial else (text == t)
            if hit:
                out.append(Selector(element=e, url=self.url))
                if first:
                    break
        return out

    def find_similar(self, sample, threshold=0.6, limit=None):
        """Return elements structurally similar to a sample Selector.

        Every element under this one (the sample itself excluded) is scored
        with ``adaptive.similarity`` against the sample's signature; those
        scoring at least ``threshold`` are returned best-first, truncated to
        ``limit`` when it is truthy.
        """
        from .adaptive import signature, similarity
        target = signature(sample._el if isinstance(sample, Selector) else sample)
        scored = []
        for e in self._el.iter():
            if not isinstance(e.tag, str) or e is getattr(sample, "_el", None):
                continue
            sc = similarity(target, signature(e))
            if sc >= threshold:
                scored.append((sc, e))
        scored.sort(key=lambda x: -x[0])
        if limit:
            scored = scored[:limit]
        return SelectorList(Selector(element=e, url=self.url) for _, e in scored)

    def save(self, identifier, storage=None):
        """Persist this element's signature so css(auto_match=True) can re-find it."""
        from .adaptive import save as _save
        _save(identifier, self._el, storage=storage)
        return self

    # --- value access ------------------------------------------------------
    @property
    def text(self):
        """Stripped text of the wrapped element (see ``_text_of``)."""
        return _text_of(self._el)

    @property
    def attrib(self):
        """A copy of the element's attributes as a plain dict."""
        return dict(self._el.attrib)

    def attr(self, name, default=None):
        """Return attribute ``name``, or ``default`` when it is absent."""
        return self._el.get(name, default)

    @property
    def html(self):
        """The element serialised to a unicode HTML string."""
        return lxml.html.tostring(self._el, encoding="unicode")

    def re(self, pattern, flags=0):
        """Run ``re.findall(pattern, self.html, flags)`` and wrap the result."""
        return SelectorList(re.findall(pattern, self.html, flags))

    def _root(self):
        """Walk up the parent chain and return the topmost element."""
        root = self._el
        while root.getparent() is not None:
            root = root.getparent()
        return root

    def __repr__(self):
        t = getattr(self._el, "tag", "?")
        return f"<Selector {t}>"
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Item pipelines (the Scrapy idea): each scraped item flows through a chain that
|
|
2
|
+
can validate, transform, dedup, or export it. A pipeline returning None drops the
|
|
3
|
+
item.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import csv
|
|
8
|
+
import json
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Pipeline:
    """Base item pipeline: ``open``/``close`` do nothing and ``process`` is the identity."""

    def open(self):
        """Hook with no behavior in the base class."""
        return None

    def process(self, item):
        """Return ``item`` unchanged."""
        return item

    def close(self):
        """Hook with no behavior in the base class."""
        return None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DedupPipeline(Pipeline):
    """Drop items already seen. key=None dedups on the whole item.

    With a ``key``, items are compared by ``item.get(key)``; without one, by
    their JSON serialisation with sorted keys. ``process`` returns ``None``
    for a duplicate and the item itself otherwise.
    """

    def __init__(self, key=None):
        self.key = key
        # Dedup keys of every item that has been let through so far.
        self.seen = set()

    def process(self, item):
        try:
            k = item.get(self.key) if self.key else json.dumps(item, sort_keys=True, default=str)
        except Exception:  # noqa: BLE001
            # Items without .get() or that cannot be serialised fall back to str().
            k = str(item)
        k = self._hashable(k)
        if k in self.seen:
            return None
        self.seen.add(k)
        return item

    @staticmethod
    def _hashable(value):
        """Return ``value`` if it can live in a set, else a stable stand-in.

        A list or dict stored under the dedup key would otherwise make the
        membership test raise TypeError. The stand-in is tagged with the type
        name so it cannot collide with a genuine string value.
        """
        try:
            hash(value)
            return value
        except TypeError:
            pass
        try:
            text = json.dumps(value, sort_keys=True, default=str)
        except Exception:  # noqa: BLE001
            text = repr(value)
        return (type(value).__name__, text)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class JsonLinesPipeline(Pipeline):
    """Stream items to a .jsonl file as they are scraped.

    ``open`` truncates/creates the file at ``path``; ``process`` writes one
    JSON document per line (non-ASCII kept as-is, unserialisable values
    passed through ``str``); ``close`` closes the file.
    """

    def __init__(self, path):
        self.path = path
        # File handle; None until the file has been opened.
        self._fh = None

    def open(self):
        # Close a handle left open by an earlier open() so it is not leaked.
        if self._fh is not None and not self._fh.closed:
            self._fh.close()
        self._fh = open(self.path, "w", encoding="utf-8")

    def process(self, item):
        if self._fh is None:
            # open() was never called: open on first use instead of failing
            # with an AttributeError on None.
            self.open()
        self._fh.write(json.dumps(item, ensure_ascii=False, default=str) + "\n")
        return item

    def close(self):
        if self._fh:
            self._fh.close()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class CsvPipeline(Pipeline):
    """Buffer dict items in memory and write them to a CSV file on close.

    The header is the union of the keys of all buffered items, in first-seen
    order. Non-dict items are passed through without being recorded, and no
    file is written when nothing was buffered.
    """

    def __init__(self, path):
        self.path = path
        self._rows = []

    def process(self, item):
        """Remember ``item`` if it is a dict; always return it unchanged."""
        if not isinstance(item, dict):
            return item
        self._rows.append(item)
        return item

    def close(self):
        """Write the buffered rows to ``self.path``; a no-op when there are none."""
        buffered = self._rows
        if not buffered:
            return
        # dict.fromkeys keeps insertion order, giving first-seen column order.
        header = list(dict.fromkeys(name for record in buffered for name in record))
        with open(self.path, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=header)
            writer.writeheader()
            writer.writerows(buffered)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Spider and Request: the unit of work for the crawl engine."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Request:
    """A URL to fetch together with the callback that will parse its Response.

    ``meta`` defaults to a fresh dict; any extra keyword arguments are kept
    verbatim in ``kw``.
    """

    def __init__(self, url, callback=None, method="GET", meta=None, **kw):
        self.url, self.callback, self.method = url, callback, method
        # A falsy meta (None, {}) is replaced by a new empty dict.
        self.meta = meta if meta else {}
        self.kw = kw

    def fingerprint(self):
        """Return the ``METHOD:url`` string identifying this request."""
        method, url = self.method, self.url
        return f"{method}:{url}"

    def __repr__(self):
        method, url = self.method, self.url
        return f"<Request {method} {url}>"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Spider:
    """Base spider: subclasses set ``start_urls`` and implement ``parse(self, page)``."""

    name = "reap"
    start_urls = []

    def start(self):
        """Yield one Request per entry in ``start_urls`` with ``parse`` as its callback."""
        yield from (Request(target, self.parse) for target in self.start_urls)

    def parse(self, page):
        """Must be overridden; the base implementation always raises."""
        message = "Spider.parse must be implemented"
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""AutoThrottle: adapt the delay to the server's observed latency so the crawl
|
|
2
|
+
stays polite and avoids IP bans, the way Scrapy's AutoThrottle does.
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AutoThrottle:
    """Adapts a per-request delay to recently observed response latencies.

    The delay starts at ``base_delay``. Each ``observe`` call records a
    latency, keeps the 20 most recent samples, and sets the delay to
    ``average_latency / target_concurrency``, never below ``base_delay`` and
    never above ``max_delay``. When ``enabled`` is false, ``observe`` and
    ``wait`` do nothing.
    """

    def __init__(self, base_delay=0.0, target_concurrency=8, max_delay=10.0, enabled=True):
        # Kept as the floor for the adaptive delay, so a fast server cannot
        # shrink the delay below the politeness minimum the caller asked for.
        self.base_delay = max(0.0, base_delay)
        self.delay = base_delay
        # Guard against zero or negative concurrency (division below).
        self.target = max(1, target_concurrency)
        self.max_delay = max_delay
        self.enabled = enabled
        self._latencies = []
        self._lock = threading.Lock()

    def observe(self, latency):
        """Record one response latency and recompute ``delay``."""
        if not self.enabled:
            return
        with self._lock:
            self._latencies.append(latency)
            # Only the 20 most recent samples contribute to the average.
            self._latencies = self._latencies[-20:]
            avg = sum(self._latencies) / len(self._latencies)
            # aim for ~target concurrent requests: per-request delay = latency / target
            wanted = max(self.base_delay, avg / self.target)
            self.delay = min(self.max_delay, wanted)

    def wait(self):
        """Sleep for the current delay when throttling is enabled and the delay is positive."""
        if self.enabled and self.delay > 0:
            time.sleep(self.delay)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "curl_reap"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Reap the web: browser-grade TLS impersonation, self-healing selectors, and a concurrent crawl engine in one library."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Anish", email = "anishfyi@gmail.com" }]
|
|
13
|
+
keywords = ["scraping", "crawler", "curl_cffi", "tls-fingerprint", "impersonate", "selectors", "lxml", "spider"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"curl_cffi>=0.7.0",
|
|
28
|
+
"lxml>=5.0",
|
|
29
|
+
"cssselect>=1.2",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = ["pytest>=7.0", "build>=1.0", "twine>=5.0"]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/anishfyi/curl_reap"
|
|
37
|
+
Repository = "https://github.com/anishfyi/curl_reap"
|
|
38
|
+
Issues = "https://github.com/anishfyi/curl_reap/issues"
|
|
39
|
+
|
|
40
|
+
[tool.hatch.build.targets.wheel]
|
|
41
|
+
packages = ["curl_reap"]
|
|
42
|
+
|
|
43
|
+
[tool.hatch.build.targets.sdist]
|
|
44
|
+
include = ["curl_reap", "README.md", "LICENSE", "assets/logo.svg"]
|