curl-reap 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .eggs/
5
+ build/
6
+ dist/
7
+ .pytest_cache/
8
+ .reap_selectors.json
9
+ *.jsonl
10
+ .venv/
11
+ venv/
12
+ .DS_Store
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anish
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: curl_reap
3
+ Version: 0.1.0
4
+ Summary: Reap the web: browser-grade TLS impersonation, self-healing selectors, and a concurrent crawl engine in one library.
5
+ Project-URL: Homepage, https://github.com/anishfyi/curl_reap
6
+ Project-URL: Repository, https://github.com/anishfyi/curl_reap
7
+ Project-URL: Issues, https://github.com/anishfyi/curl_reap/issues
8
+ Author-email: Anish <anishfyi@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: crawler,curl_cffi,impersonate,lxml,scraping,selectors,spider,tls-fingerprint
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Internet :: WWW/HTTP
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.9
23
+ Requires-Dist: cssselect>=1.2
24
+ Requires-Dist: curl-cffi>=0.7.0
25
+ Requires-Dist: lxml>=5.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: build>=1.0; extra == 'dev'
28
+ Requires-Dist: pytest>=7.0; extra == 'dev'
29
+ Requires-Dist: twine>=5.0; extra == 'dev'
30
+ Description-Content-Type: text/markdown
31
+
32
+ <p align="center">
33
+ <img src="assets/logo.svg" alt="curl_reap" width="420" />
34
+ </p>
35
+
36
+ <p align="center"><b>Reap the web.</b> Browser-grade TLS impersonation, self-healing selectors, and a concurrent crawl engine, in one small library.</p>
37
+
38
+ <p align="center">
39
+ <code>pip install curl_reap</code>
40
+ </p>
41
+
42
+ ---
43
+
44
+ ## Why
45
+
46
+ Modern scraping needs three things, and today you reach for three different tools:
47
+
48
+ 1. **Get past the door.** Sites fingerprint your TLS handshake and block stock Python clients. `curl_cffi` solves this with real Chrome/Safari fingerprints.
49
+ 2. **Survive markup changes.** Plain CSS and XPath break the moment a site renames a class. Scrapling pioneered self-healing selectors that re-find the element anyway.
50
+ 3. **Crawl at scale.** Concurrency, throttling, retries, dedup, and pipelines. That is Scrapy.
51
+
52
+ `curl_reap` takes the best idea from each and puts them behind one friendly API.
53
+
54
+ | | curl_cffi | Scrapy | Scrapling | **curl_reap** |
55
+ |---|:---:|:---:|:---:|:---:|
56
+ | Real browser TLS / JA3 | yes | no | partial | **yes** |
57
+ | Parser built in | no | yes | yes | **yes** |
58
+ | Self-healing selectors | no | no | yes | **yes** |
59
+ | Concurrent crawl engine | no | yes | no | **yes** |
60
+ | AutoThrottle, retries, pipelines | no | yes | no | **yes** |
61
+ | One small dependency set | yes | no | no | **yes** |
62
+
63
+ ## Install
64
+
65
+ ```bash
66
+ pip install curl_reap
67
+ ```
68
+
69
+ Requires Python 3.9+. Pulls in `curl_cffi`, `lxml`, and `cssselect`.
70
+
71
+ ## Quick start
72
+
73
+ A one-shot fetch parses like parsel, but the request carries a genuine browser fingerprint:
74
+
75
+ ```python
76
+ import curl_reap as reap
77
+
78
+ page = reap.get("https://quotes.toscrape.com", impersonate="chrome124")
79
+ print(page.css("span.text::text").getall())
80
+ print(page.css_first("small.author::text"))
81
+ ```
82
+
83
+ ## Self-healing selectors
84
+
85
+ Save an element once. Later, even if the site renames the class or moves the node, `auto_match` relocates it by structural signature:
86
+
87
+ ```python
88
+ page = reap.get("https://shop.example.com/item/42")
89
+ page.css_first("a.buy-btn").save("buy_button") # remember its shape
90
+
91
+ # weeks later, the class is now "purchase-cta" and the old selector misses:
92
+ later = reap.get("https://shop.example.com/item/99")
93
+ btn = later.css_first("a.buy-btn", auto_match=True, identifier="buy_button")
94
+ print(btn.attr("href")) # found anyway
95
+ ```
96
+
97
+ Other finders: `page.find_by_text("Sign in")` and `page.find_similar(some_element)`.
98
+
99
+ ## Crawl at scale
100
+
101
+ A `Spider` yields items (dicts) and more `Request` objects. The engine handles concurrency, AutoThrottle, retries, dedup, and pipelines:
102
+
103
+ ```python
104
+ import curl_reap as reap
105
+ from curl_reap import JsonLinesPipeline
106
+
107
+ class Quotes(reap.Spider):
108
+ start_urls = ["https://quotes.toscrape.com"]
109
+
110
+ def parse(self, page):
111
+ for q in page.css("div.quote"):
112
+ yield {
113
+ "text": q.css_first("span.text::text"),
114
+ "author": q.css_first("small.author::text"),
115
+ }
116
+ nxt = page.css_first("li.next a::attr(href)")
117
+ if nxt:
118
+ yield reap.Request("https://quotes.toscrape.com" + nxt, self.parse)
119
+
120
+ items = reap.run(
121
+ Quotes,
122
+ concurrency=8,
123
+ throttle=True, # AutoThrottle adapts to server latency
124
+ pipelines=[JsonLinesPipeline("quotes.jsonl")],
125
+ )
126
+ print(len(items), "items reaped")
127
+ ```
128
+
129
+ ## API at a glance
130
+
131
+ - `reap.get(url, impersonate="chrome124", **kw)` and `reap.post(...)` return a `Response` you can `.css()` / `.xpath()` directly.
132
+ - `reap.Session(impersonate=..., headers=..., retries=...)` for a reusable client.
133
+ - `Selector` / `SelectorList`: `.css`, `.css_first`, `.xpath`, `.find_by_text`, `.find_similar`, `.save`, `.re`, `.text`, `.attr`.
134
+ - `reap.Spider`, `reap.Request`, `reap.run(spider, ...)`, `reap.Reaper(...)`.
135
+ - Pipelines: `DedupPipeline`, `JsonLinesPipeline`, `CsvPipeline`, or subclass `Pipeline`.
136
+
137
+ ## Responsible use
138
+
139
+ `curl_reap` presents the same TLS fingerprint a real browser does, so at the TLS level its requests look like ordinary browser traffic. It does **not** ship a challenge solver and it will not break CAPTCHAs or anti-bot walls (Cloudflare challenges, DataDome, PerimeterX, and similar). If a site has deliberately put up an access-control wall, that is a signal to stop. Respect robots.txt and each site's terms, throttle your crawls, and only collect data you are allowed to collect.
140
+
141
+ ## License
142
+
143
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,112 @@
1
+ <p align="center">
2
+ <img src="assets/logo.svg" alt="curl_reap" width="420" />
3
+ </p>
4
+
5
+ <p align="center"><b>Reap the web.</b> Browser-grade TLS impersonation, self-healing selectors, and a concurrent crawl engine, in one small library.</p>
6
+
7
+ <p align="center">
8
+ <code>pip install curl_reap</code>
9
+ </p>
10
+
11
+ ---
12
+
13
+ ## Why
14
+
15
+ Modern scraping needs three things, and today you reach for three different tools:
16
+
17
+ 1. **Get past the door.** Sites fingerprint your TLS handshake and block stock Python clients. `curl_cffi` solves this with real Chrome/Safari fingerprints.
18
+ 2. **Survive markup changes.** Plain CSS and XPath break the moment a site renames a class. Scrapling pioneered self-healing selectors that re-find the element anyway.
19
+ 3. **Crawl at scale.** Concurrency, throttling, retries, dedup, and pipelines. That is Scrapy.
20
+
21
+ `curl_reap` takes the best idea from each and puts them behind one friendly API.
22
+
23
+ | | curl_cffi | Scrapy | Scrapling | **curl_reap** |
24
+ |---|:---:|:---:|:---:|:---:|
25
+ | Real browser TLS / JA3 | yes | no | partial | **yes** |
26
+ | Parser built in | no | yes | yes | **yes** |
27
+ | Self-healing selectors | no | no | yes | **yes** |
28
+ | Concurrent crawl engine | no | yes | no | **yes** |
29
+ | AutoThrottle, retries, pipelines | no | yes | no | **yes** |
30
+ | One small dependency set | yes | no | no | **yes** |
31
+
32
+ ## Install
33
+
34
+ ```bash
35
+ pip install curl_reap
36
+ ```
37
+
38
+ Requires Python 3.9+. Pulls in `curl_cffi`, `lxml`, and `cssselect`.
39
+
40
+ ## Quick start
41
+
42
+ A one-shot fetch parses like parsel, but the request carries a genuine browser fingerprint:
43
+
44
+ ```python
45
+ import curl_reap as reap
46
+
47
+ page = reap.get("https://quotes.toscrape.com", impersonate="chrome124")
48
+ print(page.css("span.text::text").getall())
49
+ print(page.css_first("small.author::text"))
50
+ ```
51
+
52
+ ## Self-healing selectors
53
+
54
+ Save an element once. Later, even if the site renames the class or moves the node, `auto_match` relocates it by structural signature:
55
+
56
+ ```python
57
+ page = reap.get("https://shop.example.com/item/42")
58
+ page.css_first("a.buy-btn").save("buy_button") # remember its shape
59
+
60
+ # weeks later, the class is now "purchase-cta" and the old selector misses:
61
+ later = reap.get("https://shop.example.com/item/99")
62
+ btn = later.css_first("a.buy-btn", auto_match=True, identifier="buy_button")
63
+ print(btn.attr("href")) # found anyway
64
+ ```
65
+
66
+ Other finders: `page.find_by_text("Sign in")` and `page.find_similar(some_element)`.
67
+
68
+ ## Crawl at scale
69
+
70
+ A `Spider` yields items (dicts) and more `Request` objects. The engine handles concurrency, AutoThrottle, retries, dedup, and pipelines:
71
+
72
+ ```python
73
+ import curl_reap as reap
74
+ from curl_reap import JsonLinesPipeline
75
+
76
+ class Quotes(reap.Spider):
77
+ start_urls = ["https://quotes.toscrape.com"]
78
+
79
+ def parse(self, page):
80
+ for q in page.css("div.quote"):
81
+ yield {
82
+ "text": q.css_first("span.text::text"),
83
+ "author": q.css_first("small.author::text"),
84
+ }
85
+ nxt = page.css_first("li.next a::attr(href)")
86
+ if nxt:
87
+ yield reap.Request("https://quotes.toscrape.com" + nxt, self.parse)
88
+
89
+ items = reap.run(
90
+ Quotes,
91
+ concurrency=8,
92
+ throttle=True, # AutoThrottle adapts to server latency
93
+ pipelines=[JsonLinesPipeline("quotes.jsonl")],
94
+ )
95
+ print(len(items), "items reaped")
96
+ ```
97
+
98
+ ## API at a glance
99
+
100
+ - `reap.get(url, impersonate="chrome124", **kw)` and `reap.post(...)` return a `Response` you can `.css()` / `.xpath()` directly.
101
+ - `reap.Session(impersonate=..., headers=..., retries=...)` for a reusable client.
102
+ - `Selector` / `SelectorList`: `.css`, `.css_first`, `.xpath`, `.find_by_text`, `.find_similar`, `.save`, `.re`, `.text`, `.attr`.
103
+ - `reap.Spider`, `reap.Request`, `reap.run(spider, ...)`, `reap.Reaper(...)`.
104
+ - Pipelines: `DedupPipeline`, `JsonLinesPipeline`, `CsvPipeline`, or subclass `Pipeline`.
105
+
106
+ ## Responsible use
107
+
108
+ `curl_reap` presents the same TLS fingerprint a real browser does, so at the TLS level its requests look like ordinary browser traffic. It does **not** ship a challenge solver and it will not break CAPTCHAs or anti-bot walls (Cloudflare challenges, DataDome, PerimeterX, and similar). If a site has deliberately put up an access-control wall, that is a signal to stop. Respect robots.txt and each site's terms, throttle your crawls, and only collect data you are allowed to collect.
109
+
110
+ ## License
111
+
112
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,28 @@
1
+ <svg width="520" height="170" viewBox="0 0 520 170" fill="none" xmlns="http://www.w3.org/2000/svg" font-family="ui-monospace, SFMono-Regular, Menlo, monospace">
2
+ <defs>
3
+ <linearGradient id="blade" x1="20" y1="20" x2="120" y2="130" gradientUnits="userSpaceOnUse">
4
+ <stop offset="0" stop-color="#F0C24B"/>
5
+ <stop offset="1" stop-color="#C9871A"/>
6
+ </linearGradient>
7
+ </defs>
8
+
9
+ <!-- the reap mark: a sickle blade that curves like a curl -->
10
+ <g transform="translate(26,20)">
11
+ <!-- blade -->
12
+ <path d="M104 40 C 74 14, 26 30, 22 72 C 19 104, 44 124, 80 118"
13
+ stroke="url(#blade)" stroke-width="16" stroke-linecap="round" fill="none"/>
14
+ <!-- handle -->
15
+ <path d="M80 118 C 92 116, 104 122, 112 134"
16
+ stroke="#7A4E27" stroke-width="13" stroke-linecap="round" fill="none"/>
17
+ <!-- grain being reaped, inside the curl -->
18
+ <circle cx="112" cy="50" r="5.5" fill="#E7B23E"/>
19
+ <circle cx="120" cy="68" r="5" fill="#E7B23E"/>
20
+ <circle cx="105" cy="74" r="4.5" fill="#E7B23E"/>
21
+ </g>
22
+
23
+ <!-- wordmark -->
24
+ <text x="186" y="92" font-size="50" font-weight="700" letter-spacing="-1">
25
+ <tspan fill="#8A8178">curl_</tspan><tspan fill="#C9871A">reap</tspan>
26
+ </text>
27
+ <text x="188" y="124" font-size="18" font-weight="500" letter-spacing="3" fill="#9a9189">REAP THE WEB</text>
28
+ </svg>
@@ -0,0 +1,47 @@
1
+ """curl_reap: reap the web.
2
+
3
+ Three pillars in one library:
4
+ 1. Transport: real browser TLS/JA3 impersonation (powered by curl_cffi) so your
5
+ requests are not fingerprinted as a bot.
6
+ 2. Parsing: a fast lxml selector with parsel-style css/xpath plus self-healing
7
+ selectors that survive markup changes.
8
+ 3. Orchestration: a small concurrent crawl engine with dedup, retries,
9
+ AutoThrottle, and item pipelines.
10
+
11
+ Quick start:
12
+
13
+ import curl_reap as reap
14
+
15
+ page = reap.get("https://quotes.toscrape.com")
16
+ print(page.css("span.text::text").getall())
17
+
18
+ class Quotes(reap.Spider):
19
+ start_urls = ["https://quotes.toscrape.com"]
20
+ def parse(self, page):
21
+ for q in page.css("div.quote"):
22
+ yield {"text": q.css_first("span.text::text"),
23
+ "author": q.css_first("small.author::text")}
24
+ nxt = page.css_first("li.next a::attr(href)")
25
+ if nxt:
26
+ yield reap.Request("https://quotes.toscrape.com" + nxt, self.parse)
27
+
28
+ items = reap.run(Quotes, concurrency=8)
29
+ """
30
+ from .adaptive import relocate, save, signature, similarity
31
+ from .engine import Reaper, run
32
+ from .http import Response, Session, fetch, get, post
33
+ from .parser import Selector, SelectorList
34
+ from .pipelines import CsvPipeline, DedupPipeline, JsonLinesPipeline, Pipeline
35
+ from .spider import Request, Spider
36
+ from .throttle import AutoThrottle
37
+
38
+ __version__ = "0.1.0"
39
+
40
+ __all__ = [
41
+ "get", "post", "fetch", "Session", "Response",
42
+ "Selector", "SelectorList",
43
+ "Spider", "Request", "Reaper", "run",
44
+ "Pipeline", "DedupPipeline", "JsonLinesPipeline", "CsvPipeline", "AutoThrottle",
45
+ "signature", "similarity", "save", "relocate",
46
+ "__version__",
47
+ ]
@@ -0,0 +1,105 @@
1
+ """Self-healing selectors (the Scrapling-inspired pillar).
2
+
3
+ Save a structural signature of an element once. Later, even if the site renames
4
+ classes or reshuffles its DOM, relocate() finds the element again by scoring every
5
+ node against that signature. This keeps scrapers alive across markup changes, which
6
+ is the single biggest maintenance cost of plain CSS/XPath scrapers.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+
13
+ DEFAULT_STORE = ".reap_selectors.json"
14
+
15
+ _WEIGHTS = {"tag": 2.0, "classes": 3.0, "id": 2.0, "attrs": 1.0, "text": 1.5, "path": 1.5}
16
+
17
+
18
def signature(el):
    """Build a JSON-friendly structural fingerprint of one element.

    The fingerprint records the tag, the sorted class names, the id, the sorted
    names of all other attributes, the first 48 characters of the element's
    stripped leading text, its nesting depth, its position among its siblings,
    and the slash-joined tag path leading to it.
    """
    container = el.getparent()
    siblings = [el] if container is None else list(container)
    try:
        position = siblings.index(el)
    except ValueError:
        # Element not found among its parent's children: fall back to 0.
        position = 0

    class_names = (el.get("class") or "").split()
    other_attrs = [name for name in el.keys() if name not in ("class", "id")]
    leading_text = (el.text or "").strip()

    return {
        "tag": str(el.tag),
        "classes": sorted(class_names),
        "id": el.get("id") or "",
        "attrs": sorted(other_attrs),
        "text": leading_text[:48],
        "depth": _depth(el),
        "index": position,
        "path": _path(el),
    }
33
+
34
+
35
+ def _depth(el):
36
+ d = 0
37
+ p = el.getparent()
38
+ while p is not None:
39
+ d += 1
40
+ p = p.getparent()
41
+ return d
42
+
43
+
44
+ def _path(el):
45
+ parts = []
46
+ cur = el
47
+ while cur is not None and isinstance(cur.tag, str):
48
+ parts.append(cur.tag)
49
+ cur = cur.getparent()
50
+ return "/".join(reversed(parts))
51
+
52
+
53
+ def _jaccard(a, b):
54
+ a, b = set(a), set(b)
55
+ if not a and not b:
56
+ return 1.0
57
+ return len(a & b) / max(1, len(a | b))
58
+
59
+
60
def similarity(a, b):
    """Score how alike two signatures are, as a weighted fraction in 0..1.

    Tag, id and text contribute only on an exact match (id and text must also
    be non-empty in ``a``); classes and attrs contribute their Jaccard overlap;
    the path contributes 1.0 when identical and 0.5 when only the last three
    segments agree.
    """
    tail_a = a["path"].split("/")[-3:]
    tail_b = b["path"].split("/")[-3:]
    if a["path"] == b["path"]:
        path_sc = 1.0
    elif tail_a == tail_b:
        path_sc = 0.5
    else:
        path_sc = 0.0

    components = (
        ("tag", 1.0 if a["tag"] == b["tag"] else 0.0),
        ("classes", _jaccard(a["classes"], b["classes"])),
        ("id", 1.0 if a["id"] and a["id"] == b["id"] else 0.0),
        ("attrs", _jaccard(a["attrs"], b["attrs"])),
        ("text", 1.0 if a["text"] and a["text"] == b["text"] else 0.0),
        ("path", path_sc),
    )

    # Accumulate in a fixed order so the floating-point result is stable.
    score = 0.0
    for name, value in components:
        score += _WEIGHTS[name] * value
    return score / sum(_WEIGHTS.values())
73
+
74
+
75
def save(identifier, el, storage=None):
    """Store the signature of ``el`` under ``identifier`` in the JSON store.

    Args:
        identifier: key the signature is saved under (and later looked up by).
        el: the element to fingerprint with ``signature()``.
        storage: path of the JSON store; defaults to ``DEFAULT_STORE``.

    The updated store is written to a sibling temporary file and then moved
    over the real one with ``os.replace``. An interrupted write therefore
    cannot leave a truncated store behind, which ``_load`` would otherwise
    read back as empty and so discard every previously saved signature.
    """
    path = storage or DEFAULT_STORE
    data = _load(path)
    data[identifier] = signature(el)
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump(data, fh, indent=1)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file around; the old store is intact.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
81
+
82
+
83
def relocate(identifier, tree, storage=None, threshold=0.6):
    """Find the element in ``tree`` that best matches a saved signature.

    Returns None when nothing is stored under ``identifier`` or when no element
    scores strictly above ``threshold``.
    """
    store = _load(storage or DEFAULT_STORE)
    wanted = store.get(identifier)
    if not wanted:
        return None

    winner = None
    bar = threshold
    # Nodes whose tag is not a string are skipped.
    candidates = (node for node in tree.iter() if isinstance(node.tag, str))
    for node in candidates:
        score = similarity(wanted, signature(node))
        if score > bar:
            winner, bar = node, score
    return winner
96
+
97
+
98
+ def _load(path):
99
+ if not os.path.exists(path):
100
+ return {}
101
+ try:
102
+ with open(path, encoding="utf-8") as fh:
103
+ return json.load(fh)
104
+ except Exception: # noqa: BLE001
105
+ return {}
@@ -0,0 +1,99 @@
1
+ """The crawl engine (the Scrapy idea, kept small): concurrent fetching with
2
+ dedup, retries, AutoThrottle, and item pipelines, on top of the impersonating
3
+ transport. Spider callbacks yield items (dicts) and further Requests.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import threading
8
+ from concurrent.futures import ThreadPoolExecutor
9
+
10
+ from .http import Session
11
+ from .pipelines import DedupPipeline
12
+ from .spider import Request
13
+ from .throttle import AutoThrottle
14
+
15
+
16
class Reaper:
    """Runs one spider: fetches its requests concurrently and routes the output.

    Requests are de-duplicated by ``req.fingerprint()``, fetched through a
    shared ``Session`` by a thread pool, and handed to the request's callback
    (or ``spider.parse``). Yielded ``Request`` objects form the next batch;
    every other non-None value is an item and is passed through the pipelines.

    Args:
        spider: object providing ``start()`` and ``parse(response)``.
        concurrency: thread-pool size, also passed to the throttle.
        retries: forwarded to ``Session``.
        throttle: forwarded to ``AutoThrottle`` as ``enabled``.
        delay: forwarded to ``AutoThrottle`` as ``base_delay``.
        impersonate: forwarded to ``Session``.
        pipelines: iterable of pipeline objects (open / process / close).
        dedup: when true and no ``DedupPipeline`` was supplied, one is
            inserted at the front of the chain.
        on_item: optional callable invoked with each item that survives the
            pipelines.
        max_pages: optional cap on the number of successfully fetched pages.
    """

    def __init__(self, spider, concurrency=8, retries=2, throttle=True, delay=0.0,
                 impersonate="chrome124", pipelines=None, dedup=True, on_item=None,
                 max_pages=None):
        self.spider = spider
        self.concurrency = concurrency
        self.max_pages = max_pages
        self.session = Session(impersonate=impersonate, retries=retries)
        self.throttle = AutoThrottle(base_delay=delay, target_concurrency=concurrency, enabled=throttle)
        self.pipelines = list(pipelines or [])
        if dedup and not any(isinstance(p, DedupPipeline) for p in self.pipelines):
            self.pipelines.insert(0, DedupPipeline())
        self.on_item = on_item
        self._seen = set()
        self.items = []
        self._lock = threading.Lock()
        # Fetches that have been admitted but have not finished yet. Counted
        # against max_pages so parallel workers cannot overshoot the cap.
        self._in_flight = 0
        self.stats = {"requests": 0, "items": 0, "errors": 0, "dropped": 0}

    def _fetch_and_parse(self, req):
        """Fetch one request and return everything its callback produced.

        Returns an empty list when the request was already seen, when the
        ``max_pages`` budget is used up (counting fetches still in flight), or
        when the fetch itself failed. A callback that raises part-way keeps
        whatever it had already yielded.
        """
        import time

        with self._lock:
            fp = req.fingerprint()
            if fp in self._seen:
                return []
            self._seen.add(fp)
            if self.max_pages and self.stats["requests"] + self._in_flight >= self.max_pages:
                return []
            self._in_flight += 1

        fetched = False
        try:
            self.throttle.wait()
            # monotonic() cannot jump with wall-clock adjustments.
            started = time.monotonic()
            try:
                resp = self.session.request(req.method, req.url, meta=req.meta, **req.kw)
            except Exception:  # noqa: BLE001
                with self._lock:
                    self.stats["errors"] += 1
                return []
            self.throttle.observe(time.monotonic() - started)
            fetched = True
        finally:
            # Release the reservation and, on success, convert it into a
            # completed request in the same critical section.
            with self._lock:
                self._in_flight -= 1
                if fetched:
                    self.stats["requests"] += 1

        callback = req.callback or self.spider.parse
        produced = []
        try:
            for out in (callback(resp) or []):
                produced.append(out)
        except Exception:  # noqa: BLE001
            with self._lock:
                self.stats["errors"] += 1
        return produced

    def run(self):
        """Crawl batch by batch until no requests remain; return the items.

        Every pipeline that was opened is closed again even when the crawl
        raises, so file-backed pipelines do not leak handles.
        """
        opened = []
        try:
            for p in self.pipelines:
                p.open()
                opened.append(p)
            frontier = list(self.spider.start())
            with ThreadPoolExecutor(max_workers=self.concurrency) as pool:
                while frontier:
                    futures = [pool.submit(self._fetch_and_parse, r) for r in frontier]
                    frontier = []
                    for fut in futures:
                        for out in fut.result():
                            if isinstance(out, Request):
                                frontier.append(out)
                            elif out is not None:
                                self._emit(out)
        finally:
            for p in opened:
                p.close()
        return self.items

    def _emit(self, item):
        """Pass one item through the pipeline chain and record the outcome."""
        for p in self.pipelines:
            item = p.process(item)
            if item is None:
                # A pipeline returning None drops the item.
                with self._lock:
                    self.stats["dropped"] += 1
                return
        with self._lock:
            self.items.append(item)
            self.stats["items"] += 1
        if self.on_item:
            self.on_item(item)
94
+
95
+
96
def run(spider, **kw):
    """Run a Spider (class or instance) to completion. Returns the scraped items.

    A class is instantiated with no arguments; remaining keyword arguments are
    forwarded to ``Reaper``.
    """
    if isinstance(spider, type):
        instance = spider()
    else:
        instance = spider
    reaper = Reaper(instance, **kw)
    return reaper.run()
@@ -0,0 +1,128 @@
1
+ """Transport layer: curl_cffi sessions with real browser TLS/JA3 impersonation.
2
+
3
+ This is the "get past the front door" pillar (the curl_cffi strength). Every
4
+ request carries a genuine Chrome/Safari TLS + HTTP2 fingerprint, which is what
5
+ defeats fingerprint-based bot detection that blocks stock Python clients.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from curl_cffi import requests as _cffi
10
+
11
+ from .parser import Selector
12
+
13
+ DEFAULT_IMPERSONATE = "chrome124"
14
+
15
+
16
class Response:
    """A fetched page that can be queried directly like a parser.

    Copies status, URL, headers, text and raw bytes off the underlying response
    object, and forwards the selection helpers to a lazily built ``Selector``.
    """

    def __init__(self, raw, meta=None):
        self.raw = raw
        self.status = raw.status_code
        self.url = str(raw.url)
        self.headers = dict(raw.headers)
        self.text = raw.text
        self.content = raw.content
        self.meta = meta or {}
        # Built on first use by selector().
        self._sel = None

    @property
    def ok(self):
        """True for a 2xx status."""
        return not (self.status < 200 or self.status >= 300)

    def selector(self):
        """Return the Selector for this page, parsing the text only once."""
        if self._sel is not None:
            return self._sel
        self._sel = Selector(content=self.text, url=self.url, status=self.status, headers=self.headers)
        return self._sel

    # Pass-throughs: each one forwards its arguments to the page's Selector.
    def css(self, *args, **kwargs):
        return self.selector().css(*args, **kwargs)

    def css_first(self, *args, **kwargs):
        return self.selector().css_first(*args, **kwargs)

    def xpath(self, *args, **kwargs):
        return self.selector().xpath(*args, **kwargs)

    def find_by_text(self, *args, **kwargs):
        return self.selector().find_by_text(*args, **kwargs)

    def find_similar(self, *args, **kwargs):
        return self.selector().find_similar(*args, **kwargs)

    def re(self, *args, **kwargs):
        return self.selector().re(*args, **kwargs)

    def save(self, *args, **kwargs):
        return self.selector().save(*args, **kwargs)

    def json(self):
        """Decode the body as JSON via the underlying response object."""
        return self.raw.json()

    def __repr__(self):
        return f"<Response {self.status} {self.url}>"
65
+
66
+
67
class Session:
    """A reusable curl_cffi session with impersonation, default headers, retries.

    Args:
        impersonate: browser profile passed to curl_cffi for every request.
        headers: default headers merged into each request (per-request
            headers win on conflict).
        timeout: default per-request timeout.
        retries: how many extra attempts a failing request gets.
        proxies: forwarded to the underlying curl_cffi session.
        **kw: any further keyword arguments for the curl_cffi session.

    Can be used as a context manager; the underlying session is closed on exit.
    """

    def __init__(self, impersonate=DEFAULT_IMPERSONATE, headers=None, timeout=30,
                 retries=2, proxies=None, **kw):
        self.impersonate = impersonate
        self.timeout = timeout
        self.retries = retries
        self._headers = dict(headers or {})
        self._s = _cffi.Session(impersonate=impersonate, proxies=proxies, **kw)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def request(self, method, url, **kw):
        """Send one request, retrying on any exception, and wrap the result.

        Recognised extra keywords: ``headers`` (merged over the session
        defaults), ``meta`` (attached to the returned ``Response``) and
        ``retries`` (overrides the session default for this call). Everything
        else goes to curl_cffi unchanged.

        Raises:
            The exception from the final attempt when every attempt fails.
        """
        kw.setdefault("impersonate", self.impersonate)
        kw.setdefault("timeout", self.timeout)
        merged = dict(self._headers)
        merged.update(kw.pop("headers", {}) or {})
        if merged:
            kw["headers"] = merged
        meta = kw.pop("meta", None)
        retries = kw.pop("retries", self.retries)
        # Always make at least one attempt: a negative retry count used to skip
        # the loop entirely and fail with "raise None" (a TypeError).
        attempts = max(0, retries) + 1
        last = None
        for _ in range(attempts):
            try:
                return Response(self._s.request(method, url, **kw), meta=meta)
            except Exception as exc:  # noqa: BLE001
                last = exc
        raise last

    def get(self, url, **kw):
        """Shortcut for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def post(self, url, **kw):
        """Shortcut for ``request("POST", url, **kw)``."""
        return self.request("POST", url, **kw)

    def close(self):
        """Close the underlying session; errors while closing are ignored."""
        try:
            self._s.close()
        except Exception:  # noqa: BLE001
            pass
106
+
107
+
108
# Shared Session used by the module-level helpers; created on first use.
_default = None


def _session():
    """Return the shared Session, building it the first time it is needed."""
    global _default
    if _default is not None:
        return _default
    _default = Session()
    return _default


def get(url, **kw):
    """One-shot GET with a shared impersonating session. Returns a Response."""
    shared = _session()
    return shared.get(url, **kw)


def post(url, **kw):
    """One-shot POST with the shared session. Returns a Response."""
    shared = _session()
    return shared.post(url, **kw)


def fetch(url, **kw):
    """Alias for ``get``."""
    return get(url, **kw)
@@ -0,0 +1,172 @@
1
+ """Parsing layer: a fast lxml selector with parsel-style ergonomics plus the
2
+ Scrapling-style extras (find by text, find similar, and self-healing selectors).
3
+
4
+ Supports CSS with ::text and ::attr(name) pseudo elements, XPath, regex, and an
5
+ auto_match mode that re-locates an element from a saved signature when the site
6
+ changes its markup (see adaptive.py).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+
12
+ import lxml.html
13
+
14
+ _ATTR_RE = re.compile(r"::attr\(\s*([\w:-]+)\s*\)\s*$")
15
+ _TEXT_RE = re.compile(r"::text\s*$")
16
+
17
+
18
+ def _parse_pseudo(query):
19
+ m = _ATTR_RE.search(query)
20
+ if m:
21
+ return ("attr", m.group(1), _ATTR_RE.sub("", query).strip())
22
+ if _TEXT_RE.search(query):
23
+ return ("text", None, _TEXT_RE.sub("", query).strip())
24
+ return (None, None, query)
25
+
26
+
27
+ def _text_of(el):
28
+ try:
29
+ return el.text_content().strip()
30
+ except Exception: # noqa: BLE001
31
+ return (getattr(el, "text", "") or "").strip()
32
+
33
+
34
class SelectorList(list):
    """A list of Selectors or strings with parsel-style get / getall helpers."""

    def get(self, default=None):
        """Return the first entry, or ``default`` when the list is empty."""
        if len(self) == 0:
            return default
        return self[0]

    def getall(self):
        """Return the entries as a plain list."""
        return list(self)

    def text(self):
        """Map each Selector to its text; other entries pass through as-is."""
        return SelectorList(
            entry.text if isinstance(entry, Selector) else entry
            for entry in self
        )

    def attr(self, name, default=None):
        """Collect attribute ``name`` from each Selector; other entries are skipped."""
        return SelectorList(
            entry.attr(name, default)
            for entry in self
            if isinstance(entry, Selector)
        )

    def css(self, query, **kw):
        """Run ``query`` on each Selector and concatenate the results."""
        return SelectorList(
            hit
            for entry in self
            if isinstance(entry, Selector)
            for hit in entry.css(query, **kw)
        )
62
+
63
+
64
class Selector:
    """Wraps one lxml element (or a parsed document).

    Args:
        content: markup to parse when no ``element`` is given.
        element: an already-parsed element to wrap (takes precedence).
        url, status, headers: carried along as plain attributes.
    """

    def __init__(self, content=None, element=None, url=None, status=None, headers=None):
        if element is not None:
            self._el = element
        elif content is not None and content.strip():
            self._el = lxml.html.fromstring(content)
        else:
            # No content, or only whitespace (e.g. an empty response body):
            # lxml refuses to parse an empty document, so wrap an empty page.
            self._el = lxml.html.fromstring("<html></html>")
        self.url = url
        self.status = status
        self.headers = dict(headers or {})

    # --- selection ---------------------------------------------------------
    def css(self, query, auto_match=False, identifier=None, storage=None):
        """Select by CSS, with optional ``::text`` / ``::attr(name)`` suffix.

        Returns a SelectorList of strings for the pseudo elements, otherwise of
        Selectors. A selector that fails to evaluate yields no matches. When
        nothing matches and ``auto_match`` is true, the element saved under
        ``identifier`` (or under the query text) is looked up by signature.
        """
        kind, attr, q = _parse_pseudo(query)
        try:
            els = self._el.cssselect(q) if q else [self._el]
        except Exception:  # noqa: BLE001
            els = []
        if not els and auto_match:
            from .adaptive import relocate
            found = relocate(identifier or query, self._root(), storage=storage)
            els = [found] if found is not None else []
        if kind == "attr":
            return SelectorList(e.get(attr) for e in els)
        if kind == "text":
            return SelectorList(_text_of(e) for e in els)
        return SelectorList(Selector(element=e, url=self.url) for e in els)

    def css_first(self, query, default=None, **kw):
        """Return the first ``css()`` result, or ``default`` when there is none."""
        res = self.css(query, **kw)
        return res[0] if res else default

    def xpath(self, query):
        """Evaluate an XPath expression.

        Node-set results become Selectors (strings stay strings). Expressions
        that evaluate to a single scalar, such as ``count(...)`` or
        ``string(...)``, come back as a one-element SelectorList. An
        expression that fails to evaluate yields an empty SelectorList.
        """
        try:
            res = self._el.xpath(query)
        except Exception:  # noqa: BLE001
            return SelectorList()
        if not isinstance(res, list):
            # Scalar result: iterating it would fail (number/bool) or split a
            # string into single characters.
            return SelectorList([res])
        out = SelectorList()
        for r in res:
            out.append(r if isinstance(r, str) else Selector(element=r, url=self.url))
        return out

    # --- Scrapling-style finders ------------------------------------------
    def find_by_text(self, text, partial=True, first=False):
        """Find elements by their own leading text.

        ``partial`` switches between substring and exact comparison against the
        stripped text; ``first`` stops after the first hit. Always returns a
        SelectorList.
        """
        out = SelectorList()
        for e in self._el.iter():
            if not isinstance(e.tag, str):
                continue
            t = (e.text or "").strip()
            hit = (text in t) if partial else (text == t)
            if hit:
                out.append(Selector(element=e, url=self.url))
                if first:
                    break
        return out

    def find_similar(self, sample, threshold=0.6, limit=None):
        """Return elements structurally similar to a sample Selector.

        ``sample`` may be a Selector or a raw element; the sample itself is
        never part of the result. Matches scoring at least ``threshold`` are
        returned best first, truncated to ``limit`` when given.
        """
        from .adaptive import signature, similarity
        sample_el = sample._el if isinstance(sample, Selector) else sample
        target = signature(sample_el)
        scored = []
        for e in self._el.iter():
            if not isinstance(e.tag, str) or e is sample_el:
                continue
            sc = similarity(target, signature(e))
            if sc >= threshold:
                scored.append((sc, e))
        scored.sort(key=lambda x: -x[0])
        if limit:
            scored = scored[:limit]
        return SelectorList(Selector(element=e, url=self.url) for _, e in scored)

    def save(self, identifier, storage=None):
        """Persist this element's signature so css(auto_match=True) can re-find it."""
        from .adaptive import save as _save
        _save(identifier, self._el, storage=storage)
        return self

    # --- value access ------------------------------------------------------
    @property
    def text(self):
        """Stripped text of the wrapped element."""
        return _text_of(self._el)

    @property
    def attrib(self):
        """A copy of the element's attributes as a dict."""
        return dict(self._el.attrib)

    def attr(self, name, default=None):
        """Return attribute ``name``, or ``default`` when it is absent."""
        return self._el.get(name, default)

    @property
    def html(self):
        """The element serialised back to markup."""
        return lxml.html.tostring(self._el, encoding="unicode")

    def re(self, pattern, flags=0):
        """Run ``re.findall`` over the serialised markup of this element."""
        return SelectorList(re.findall(pattern, self.html, flags))

    def _root(self):
        """Walk up to the topmost ancestor of the wrapped element."""
        root = self._el
        while root.getparent() is not None:
            root = root.getparent()
        return root

    def __repr__(self):
        t = getattr(self._el, "tag", "?")
        return f"<Selector {t}>"
@@ -0,0 +1,79 @@
1
+ """Item pipelines (the Scrapy idea): each scraped item flows through a chain that
2
+ can validate, transform, dedup, or export it. A pipeline returning None drops the
3
+ item.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import csv
8
+ import json
9
+
10
+
11
class Pipeline:
    """No-op base pipeline; subclasses override the hooks they need."""

    def open(self):
        """Setup hook; does nothing in the base class."""
        return None

    def process(self, item):
        """Handle one item; the base implementation returns it unchanged."""
        return item

    def close(self):
        """Teardown hook; does nothing in the base class."""
        return None
20
+
21
+
22
class DedupPipeline(Pipeline):
    """Drop items already seen. key=None dedups on the whole item.

    With a truthy ``key``, the dedup value is ``item.get(key)``; otherwise it
    is the item's JSON serialization with sorted keys. Values that cannot be
    hashed (lists, dicts, ...) are converted to a stable hashable stand-in
    rather than crashing the pipeline.
    """

    def __init__(self, key=None):
        self.key = key
        # Dedup values of every item that has been let through so far.
        self.seen = set()

    @staticmethod
    def _hashable(value):
        """Return ``value`` if hashable, else a (type name, text) stand-in."""
        try:
            hash(value)
            return value
        except TypeError:
            pass
        try:
            text = json.dumps(value, sort_keys=True, default=str)
        except Exception:  # noqa: BLE001 - best effort, like process()
            text = repr(value)
        # Tagging with the type name keeps e.g. ["a"] distinct from the
        # string '["a"]'.
        return (type(value).__name__, text)

    def process(self, item):
        """Return ``item`` the first time its dedup value is seen, else None."""
        try:
            k = item.get(self.key) if self.key else json.dumps(item, sort_keys=True, default=str)
        except Exception:  # noqa: BLE001
            # Items without .get() or that are not JSON-serializable.
            k = str(item)
        # A list/dict under ``key`` would make the set lookup raise TypeError.
        k = self._hashable(k)
        if k in self.seen:
            return None
        self.seen.add(k)
        return item
38
+
39
+
40
class JsonLinesPipeline(Pipeline):
    """Stream items to a .jsonl file as they are scraped.

    ``open()`` opens ``path`` in write mode (truncating any existing file).
    If ``process()`` is called before ``open()``, the file is opened lazily
    instead of failing on a missing handle.
    """

    def __init__(self, path):
        self.path = path
        # File handle; None until open() (or the first process()) runs.
        self._fh = None

    def open(self):
        """Open the output file, closing any handle from an earlier open()."""
        if self._fh is not None and not self._fh.closed:
            self._fh.close()
        self._fh = open(self.path, "w", encoding="utf-8")

    def process(self, item):
        """Write ``item`` as one JSON line and return it unchanged."""
        if self._fh is None:
            # Never opened: open on demand rather than crash on None.write.
            self.open()
        # default=str stringifies values json cannot encode natively.
        self._fh.write(json.dumps(item, ensure_ascii=False, default=str) + "\n")
        return item

    def close(self):
        """Close the output file if it was opened."""
        if self._fh:
            self._fh.close()
57
+
58
+
59
class CsvPipeline(Pipeline):
    """Buffer dict items in memory and write them to a CSV file on close().

    The header is the union of keys across all buffered rows, in first-seen
    order. Non-dict items pass through without being recorded, and nothing
    is written when no rows were buffered.
    """

    def __init__(self, path):
        self.path = path
        self._rows = []

    def process(self, item):
        """Remember ``item`` if it is a dict; always return it unchanged."""
        if not isinstance(item, dict):
            return item
        self._rows.append(item)
        return item

    def close(self):
        """Write the header and every buffered row to ``path``."""
        if self._rows:
            # dict.fromkeys de-duplicates while keeping first-seen order.
            header = list(dict.fromkeys(name for record in self._rows for name in record))
            with open(self.path, "w", newline="", encoding="utf-8") as out:
                writer = csv.DictWriter(out, fieldnames=header)
                writer.writeheader()
                writer.writerows(self._rows)
@@ -0,0 +1,33 @@
1
+ """Spider and Request: the unit of work for the crawl engine."""
2
+ from __future__ import annotations
3
+
4
+
5
class Request:
    """One queued fetch together with the callback that will parse its Response.

    Extra keyword arguments are kept verbatim on ``kw``.
    """

    def __init__(self, url, callback=None, method="GET", meta=None, **kw):
        self.url, self.callback, self.method = url, callback, method
        # A falsy meta (None or an empty mapping) is replaced by a new dict.
        self.meta = meta if meta else {}
        self.kw = kw

    def fingerprint(self):
        """Return the ``METHOD:url`` string for this request."""
        method, url = self.method, self.url
        return f"{method}:{url}"

    def __repr__(self):
        method, url = self.method, self.url
        return f"<Request {method} {url}>"
20
+
21
+
22
class Spider:
    """Base spider: subclasses set ``start_urls`` and implement ``parse(self, page)``."""

    name = "reap"
    start_urls = []

    def start(self):
        """Yield one Request per start URL, each routed to ``self.parse``."""
        yield from (Request(seed, self.parse) for seed in self.start_urls)

    def parse(self, page):
        """Parse hook; subclasses must override it."""
        raise NotImplementedError("Spider.parse must be implemented")
@@ -0,0 +1,31 @@
1
+ """AutoThrottle: adapt the delay to the server's observed latency so the crawl
2
+ stays polite and avoids IP bans, the way Scrapy's AutoThrottle does.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import threading
7
+ import time
8
+
9
+
10
class AutoThrottle:
    """Adapt the per-request delay to a rolling average of observed latency.

    Args:
        base_delay: Initial delay in seconds, used until a latency is observed.
        target_concurrency: Desired number of concurrent requests; values
            below 1 are raised to 1.
        max_delay: Upper bound applied to the computed delay.
        enabled: When False, observe() and wait() do nothing.
        window: Number of most recent latency samples averaged (default 20,
            the previously hard-coded value); values below 1 are raised to 1.
    """

    def __init__(self, base_delay=0.0, target_concurrency=8, max_delay=10.0, enabled=True, window=20):
        self.delay = base_delay
        self.target = max(1, target_concurrency)
        self.max_delay = max_delay
        self.enabled = enabled
        self.window = max(1, int(window))
        self._latencies = []
        self._lock = threading.Lock()

    def observe(self, latency):
        """Record one latency sample and recompute ``self.delay``."""
        if not self.enabled:
            return
        with self._lock:
            self._latencies.append(latency)
            # Keep only the most recent ``window`` samples.
            self._latencies = self._latencies[-self.window:]
            avg = sum(self._latencies) / len(self._latencies)
            # aim for ~target concurrent requests: per-request delay = latency / target
            self.delay = min(self.max_delay, max(0.0, avg / self.target))

    def wait(self):
        """Sleep for the current delay when throttling is enabled and delay > 0."""
        if self.enabled and self.delay > 0:
            time.sleep(self.delay)
@@ -0,0 +1,44 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "curl_reap"
7
+ version = "0.1.0"
8
+ description = "Reap the web: browser-grade TLS impersonation, self-healing selectors, and a concurrent crawl engine in one library."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Anish", email = "anishfyi@gmail.com" }]
13
+ keywords = ["scraping", "crawler", "curl_cffi", "tls-fingerprint", "impersonate", "selectors", "lxml", "spider"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.9",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Internet :: WWW/HTTP",
24
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
+ ]
26
+ dependencies = [
27
+ "curl_cffi>=0.7.0",
28
+ "lxml>=5.0",
29
+ "cssselect>=1.2",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ dev = ["pytest>=7.0", "build>=1.0", "twine>=5.0"]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/anishfyi/curl_reap"
37
+ Repository = "https://github.com/anishfyi/curl_reap"
38
+ Issues = "https://github.com/anishfyi/curl_reap/issues"
39
+
40
+ [tool.hatch.build.targets.wheel]
41
+ packages = ["curl_reap"]
42
+
43
+ [tool.hatch.build.targets.sdist]
44
+ include = ["curl_reap", "README.md", "LICENSE", "assets/logo.svg"]