unblock-requests 0.0.1a2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unblock_requests-0.0.1a2/PKG-INFO +10 -0
- unblock_requests-0.0.1a2/README.md +71 -0
- unblock_requests-0.0.1a2/pyproject.toml +21 -0
- unblock_requests-0.0.1a2/setup.cfg +4 -0
- unblock_requests-0.0.1a2/tests/test_session.py +81 -0
- unblock_requests-0.0.1a2/unblock_requests/__init__.py +47 -0
- unblock_requests-0.0.1a2/unblock_requests/session.py +259 -0
- unblock_requests-0.0.1a2/unblock_requests/version.py +8 -0
- unblock_requests-0.0.1a2/unblock_requests.egg-info/PKG-INFO +10 -0
- unblock_requests-0.0.1a2/unblock_requests.egg-info/SOURCES.txt +11 -0
- unblock_requests-0.0.1a2/unblock_requests.egg-info/dependency_links.txt +1 -0
- unblock_requests-0.0.1a2/unblock_requests.egg-info/requires.txt +7 -0
- unblock_requests-0.0.1a2/unblock_requests.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: unblock_requests
|
|
3
|
+
Version: 0.0.1a2
|
|
4
|
+
Summary: A requests.Session subclass that bypasses Cloudflare via curl_cffi, FlareSolverr, or the Wayback Machine
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Requires-Dist: requests>=2.28
|
|
7
|
+
Provides-Extra: stealth
|
|
8
|
+
Requires-Dist: curl-cffi; extra == "stealth"
|
|
9
|
+
Provides-Extra: test
|
|
10
|
+
Requires-Dist: pytest; extra == "test"
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# unblock_requests
|
|
2
|
+
|
|
3
|
+
A **drop-in `requests.Session` subclass** that gets your request through
|
|
4
|
+
Cloudflare. It's the anti-bot counterpart to
|
|
5
|
+
[`anon_requests`](https://github.com/TigreGotico/anon_requests) (which handles
|
|
6
|
+
IP anonymity via proxy/Tor rotation): `unblock_requests` handles *bot detection*
|
|
7
|
+
— TLS fingerprinting and JS challenges — and degrades gracefully to the archive.
|
|
8
|
+
|
|
9
|
+
Because it subclasses `requests.Session` and only overrides `request()`, every
|
|
10
|
+
`.get()/.post()/...` keeps working and anything typed against
|
|
11
|
+
`requests.Session` accepts it unchanged.
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from unblock_requests import CloudflareSession # alias: Session
|
|
15
|
+
|
|
16
|
+
s = CloudflareSession(flaresolverr_url="http://192.168.1.116:8191")
|
|
17
|
+
html = s.get("https://www.progarchives.com/artist.asp?id=1").text # solved live
|
|
18
|
+
import requests; assert isinstance(s, requests.Session) # True
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Transports
|
|
22
|
+
|
|
23
|
+
Pick with the `mode=` kwarg, or the `<PREFIX>_TRANSPORT` env var (default prefix
|
|
24
|
+
`UNBLOCK_REQUESTS`). Explicit kwargs always win over the environment.
|
|
25
|
+
|
|
26
|
+
| Mode | What it does |
|
|
27
|
+
|---|---|
|
|
28
|
+
| `curl_cffi` *(default)* | Chrome TLS impersonation (install the `stealth` extra). Clears the bot check on most networks. |
|
|
29
|
+
| `requests` | Plain `requests`, no impersonation. |
|
|
30
|
+
| `flaresolverr` | Proxy through a [FlareSolverr](https://github.com/FlareSolverr/FlareSolverr) headless browser that solves the JS challenge — **live** data. Selected automatically when `flaresolverr_url` is set. |
|
|
31
|
+
| `wayback` | Read the latest Internet Archive snapshot — stale, but needs no infrastructure. |
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
CloudflareSession(flaresolverr_url="http://host:8191") # solve live
|
|
35
|
+
CloudflareSession(mode="wayback") # force the archive
|
|
36
|
+
CloudflareSession(flaresolverr_url="http://host:8191", wayback_fallback=True) # live, archive on failure
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Composing with anon_requests
|
|
40
|
+
|
|
41
|
+
`unblock_requests` (anti-bot) and `anon_requests` (IP rotation) are orthogonal
|
|
42
|
+
and stack: a modernized `anon_requests` wraps an inner `requests.Session` built
|
|
43
|
+
by a `session_factory`, so you can inject a `CloudflareSession` and get rotation
|
|
44
|
+
**and** challenge-solving together. A rotated proxy flows through every mode —
|
|
45
|
+
including into FlareSolverr (via its `proxy` field):
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from anon_requests import RotatingProxySession # once modernized
|
|
49
|
+
from unblock_requests import CloudflareSession
|
|
50
|
+
|
|
51
|
+
session = RotatingProxySession(
|
|
52
|
+
session_factory=lambda: CloudflareSession(flaresolverr_url="http://host:8191"),
|
|
53
|
+
)
|
|
54
|
+
session.get(url) # rotates IP + solves Cloudflare
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install unblock_requests
|
|
61
|
+
pip install unblock_requests[stealth] # adds curl_cffi (recommended)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Notes / limits
|
|
65
|
+
|
|
66
|
+
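- In the `wayback`/`flaresolverr` modes the response is **synthesized** from the
  fetched HTML (real `requests.Response`, but `stream=`/adapters/connection
  pooling don't apply). In `curl_cffi` mode the response is rebuilt from the
  curl_cffi result (real status, headers and body), so `stream=` and adapters
  don't apply there either. Only the `requests` mode is fully native.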
|
|
69
|
+
- Challenge detection is heuristic (`is_challenge()`), used to trigger the
|
|
70
|
+
optional Wayback fallback on blocked GETs.
|
|
71
|
+
- `wayback_html(url)` and `is_challenge(text)` are exposed for direct use.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[build-system]
# The PEP 621 ``[project]`` table below is only understood by setuptools >= 61;
# older releases would silently ignore it and build a package with no metadata.
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "unblock_requests"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "A requests.Session subclass that bypasses Cloudflare via curl_cffi, FlareSolverr, or the Wayback Machine"
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
dependencies = ["requests>=2.28"]
|
|
11
|
+
|
|
12
|
+
[project.optional-dependencies]
|
|
13
|
+
stealth = ["curl-cffi"]
|
|
14
|
+
test = ["pytest"]
|
|
15
|
+
|
|
16
|
+
[tool.setuptools.packages.find]
|
|
17
|
+
where = ["."]
|
|
18
|
+
include = ["unblock_requests*"]
|
|
19
|
+
|
|
20
|
+
[tool.setuptools.dynamic]
|
|
21
|
+
version = {attr = "unblock_requests.version.__version__"}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Offline tests for CloudflareSession (no network)."""
|
|
2
|
+
import requests
|
|
3
|
+
|
|
4
|
+
from unblock_requests import CloudflareSession, is_challenge, wayback_raw_url
|
|
5
|
+
from unblock_requests.session import _full_url, _make_response, _url_variants, _flaresolverr_extract
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_is_a_requests_session():
    """A CloudflareSession must pass wherever a ``requests.Session`` is expected."""
    session = CloudflareSession()
    assert isinstance(session, requests.Session)
    # The header mapping comes from the inherited Session machinery.
    assert "User-Agent" in session.headers
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_mode_resolution(monkeypatch):
    """With a clean environment the mode is: default, explicit, or FlareSolverr auto-select."""
    env_names = (
        "UNBLOCK_REQUESTS_TRANSPORT",
        "UNBLOCK_REQUESTS_FLARESOLVERR_URL",
        "UNBLOCK_REQUESTS_WAYBACK_FALLBACK",
    )
    for name in env_names:
        monkeypatch.delenv(name, raising=False)

    default_session = CloudflareSession()
    assert default_session._resolved_mode() == "curl_cffi"

    archive_session = CloudflareSession(mode="wayback")
    assert archive_session._resolved_mode() == "wayback"

    solver_session = CloudflareSession(flaresolverr_url="http://x:8191")
    assert solver_session._resolved_mode() == "flaresolverr"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_kwarg_beats_env(monkeypatch):
    """An explicit ``mode=`` wins over ``UNBLOCK_REQUESTS_TRANSPORT``; without it the env applies."""
    monkeypatch.setenv("UNBLOCK_REQUESTS_TRANSPORT", "requests")

    explicit = CloudflareSession(mode="wayback")
    assert explicit._resolved_mode() == "wayback"

    from_env = CloudflareSession()
    assert from_env._resolved_mode() == "requests"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_env_prefix(monkeypatch):
    """A custom ``env_prefix`` changes which environment variables are consulted."""
    monkeypatch.setenv("PYPROG_FLARESOLVERR_URL", "http://box:8191")
    session = CloudflareSession(env_prefix="PYPROG")
    # The URL is read from the prefixed variable ...
    assert session._fs_url() == "http://box:8191"
    # ... and its presence selects the FlareSolverr transport.
    assert session._resolved_mode() == "flaresolverr"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_bad_mode():
    """An unknown ``mode=`` value is rejected at construction time."""
    import pytest

    unknown_mode = "nonsense"
    with pytest.raises(ValueError):
        CloudflareSession(mode=unknown_mode)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_make_response_is_real_response():
    """``_make_response`` yields a working ``requests.Response`` for a JSON body."""
    body = b'{"a": 1}'
    response = _make_response(
        "https://x/y",
        content=body,
        headers={"Content-Type": "application/json"},
    )
    assert isinstance(response, requests.Response)
    assert response.status_code == 200
    assert response.text == '{"a": 1}'
    assert response.json() == {"a": 1}
    assert response.content == body
    # A 200 response must not raise.
    response.raise_for_status()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_make_response_raise_for_status():
    """A synthesized 503 response raises ``requests.HTTPError`` from ``raise_for_status``."""
    import pytest

    failing = _make_response("https://x", content=b"nope", status=503, reason="boom")
    with pytest.raises(requests.HTTPError):
        failing.raise_for_status()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_helpers():
    """Smoke-test the stateless helpers: challenge detection, URL rewriting and variants."""
    # Challenge heuristic.
    assert is_challenge("<title>Just a moment...</title>")
    assert not is_challenge("<html><body>real page</body></html>")

    # Snapshot URL -> raw ("id_") snapshot URL.
    snapshot = "http://web.archive.org/web/20250101/https://x/y"
    expected_raw = "http://web.archive.org/web/20250101id_/https://x/y"
    assert wayback_raw_url(snapshot) == expected_raw

    # Query parameters are baked into the URL.
    assert _full_url("https://x/p", {"a": "b"}) == "https://x/p?a=b"

    # Both the other scheme and the scheme-less form are offered.
    variants = list(_url_variants("https://www.x.com/p"))
    assert "http://www.x.com/p" in variants and "www.x.com/p" in variants
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_flaresolverr_extract():
    """An ``ok`` payload yields its HTML; any other status raises ``RuntimeError``."""
    import pytest

    ok_payload = {"status": "ok", "solution": {"response": "<h1>hi</h1>"}}
    assert _flaresolverr_extract(ok_payload) == "<h1>hi</h1>"

    error_payload = {"status": "error", "message": "x"}
    with pytest.raises(RuntimeError):
        _flaresolverr_extract(error_payload)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_proxy_url_selection():
    """``_proxy_url`` prefers per-request proxies, then session proxies, else ``None``."""
    session = CloudflareSession()

    # Per-request ``proxies=`` is used when given.
    per_request = {"proxies": {"https": "socks5://1.2.3.4:1080"}}
    assert session._proxy_url(per_request) == "socks5://1.2.3.4:1080"

    # Otherwise the session-level mapping is consulted.
    session.proxies = {"http": "http://5.6.7.8:3128"}
    assert session._proxy_url({}) == "http://5.6.7.8:3128"

    # A fresh session with no proxies configured has nothing to offer.
    assert CloudflareSession()._proxy_url({}) is None
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""unblock_requests — a ``requests.Session`` that shrugs off Cloudflare.
|
|
2
|
+
|
|
3
|
+
``CloudflareSession`` is a **drop-in subclass of** :class:`requests.Session`.
|
|
4
|
+
Because every ``.get()/.post()/.head()/...`` call funnels through
|
|
5
|
+
``Session.request()``, overriding that one method lets us transparently route
|
|
6
|
+
through:
|
|
7
|
+
|
|
8
|
+
- ``curl_cffi`` Chrome TLS impersonation (the default when installed),
|
|
9
|
+
- a **FlareSolverr** proxy that solves the JS challenge in a real browser
|
|
10
|
+
(live data),
|
|
11
|
+
- the **Wayback Machine** (stale but dependency-free),
|
|
12
|
+
|
|
13
|
+
with an optional live→archive fallback — while always returning genuine
|
|
14
|
+
:class:`requests.Response` objects. Any code typed against ``requests.Session``
|
|
15
|
+
keeps working unchanged::
|
|
16
|
+
|
|
17
|
+
from unblock_requests import CloudflareSession
|
|
18
|
+
|
|
19
|
+
s = CloudflareSession(flaresolverr_url="http://192.168.1.116:8191")
|
|
20
|
+
s.get("https://www.progarchives.com/artist.asp?id=1").text # solved live
|
|
21
|
+
s.headers["Referer"] = "..." # inherited
|
|
22
|
+
assert isinstance(s, requests.Session) # True
|
|
23
|
+
|
|
24
|
+
Modes (``mode=`` kwarg, or the ``<PREFIX>_TRANSPORT`` env var):
|
|
25
|
+
``"requests"`` / ``"curl_cffi"`` / ``"wayback"`` / ``"flaresolverr"``.
|
|
26
|
+
Explicit kwargs always win over the environment. Setting ``flaresolverr_url``
|
|
27
|
+
(kwarg or ``<PREFIX>_FLARESOLVERR_URL``) selects FlareSolverr automatically.
|
|
28
|
+
"""
|
|
29
|
+
from unblock_requests.session import (
|
|
30
|
+
CloudflareSession,
|
|
31
|
+
is_challenge,
|
|
32
|
+
wayback_html,
|
|
33
|
+
wayback_raw_url,
|
|
34
|
+
)
|
|
35
|
+
from unblock_requests.version import __version__
|
|
36
|
+
|
|
37
|
+
# ``Session`` is a shorthand alias: "a Session, but cloudflare-aware".
Session = CloudflareSession

# Names exported by ``from unblock_requests import *``.
__all__ = [
    "CloudflareSession", "Session",
    "is_challenge", "wayback_html", "wayback_raw_url",
    "__version__",
]
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""The ``CloudflareSession`` drop-in and its transport helpers."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
from requests.models import PreparedRequest, Response
|
|
10
|
+
from requests.structures import CaseInsensitiveDict
|
|
11
|
+
|
|
12
|
+
_WAYBACK_AVAILABLE_API = "http://archive.org/wayback/available"
|
|
13
|
+
_VALID_MODES = {"requests", "curl_cffi", "wayback", "flaresolverr"}
|
|
14
|
+
|
|
15
|
+
_DEFAULT_HEADERS = {
|
|
16
|
+
"User-Agent": (
|
|
17
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
|
18
|
+
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
|
19
|
+
),
|
|
20
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
21
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# stateless helpers
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
def is_challenge(text: str) -> bool:
    """Report whether *text* looks like a Cloudflare challenge page.

    Only the first 1500 characters are inspected, case-insensitively, for a
    small set of known interstitial markers. Falsy input (``None`` or ``""``)
    is treated as "not a challenge".
    """
    markers = ("just a moment", "cf-mitigated", "challenge-platform", "cf_chl_opt")
    prefix = (text or "")[:1500].lower()
    return any(marker in prefix for marker in markers)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _truthy(value: Optional[str]) -> bool:
|
|
37
|
+
return (value or "").strip().lower() in {"1", "true", "yes", "on"}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _full_url(url: str, params: Any) -> str:
|
|
41
|
+
"""Bake query *params* into *url* using requests' own encoder."""
|
|
42
|
+
if not params:
|
|
43
|
+
return url
|
|
44
|
+
pr = PreparedRequest()
|
|
45
|
+
pr.prepare_url(url, params)
|
|
46
|
+
return pr.url
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def wayback_raw_url(snapshot_url: str) -> str:
    """Turn a Wayback snapshot URL into its raw-content form.

    ``.../web/<ts>/<original>`` becomes ``.../web/<ts>id_/<original>``, which
    serves the archived bytes without the Wayback toolbar or link rewriting.
    Only the first ``/web/<digits>/`` segment is rewritten; a URL without one
    is returned unchanged.
    """
    timestamp_segment = re.compile(r"(/web/\d+)/")
    return timestamp_segment.sub(
        lambda match: match.group(1) + "id_/", snapshot_url, count=1)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _url_variants(url: str):
|
|
56
|
+
seen = set()
|
|
57
|
+
bare = re.sub(r"^https?://", "", url)
|
|
58
|
+
for candidate in (url, "http://" + bare, "https://" + bare, bare):
|
|
59
|
+
if candidate not in seen:
|
|
60
|
+
seen.add(candidate)
|
|
61
|
+
yield candidate
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _make_response(url: str, *, content: bytes, status: int = 200,
                   headers: Optional[dict] = None, reason: str = "OK",
                   request: Optional[PreparedRequest] = None) -> Response:
    """Build a genuine :class:`requests.Response` from raw bytes.

    Args:
        url: value stored as the response URL.
        content: raw body bytes.
        status: HTTP status code.
        headers: response headers; when empty or ``None`` a
            ``text/html; charset=utf-8`` Content-Type is used.
        reason: HTTP reason phrase.
        request: optional prepared request to attach as ``response.request``.

    Returns:
        A populated ``Response``. Its text encoding is the charset declared in
        the Content-Type header, falling back to UTF-8 when none is declared.
    """
    r = Response()
    r.status_code = status
    r._content = content
    r.url = url
    r.reason = reason
    r.headers = CaseInsensitiveDict(headers or {"Content-Type": "text/html; charset=utf-8"})
    # Honour the charset the server declared instead of forcing UTF-8, so that
    # e.g. a windows-1252 page fetched through curl_cffi decodes correctly.
    declared = re.search(r"charset\s*=\s*[\"']?([\w.:-]+)",
                         r.headers.get("Content-Type") or "", re.IGNORECASE)
    r.encoding = declared.group(1) if declared else "utf-8"
    if request is not None:
        r.request = request
    return r
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def wayback_html(url: str, *, timeout: float = 30.0,
                 headers: Optional[dict] = None) -> Optional[str]:
    """Fetch the Wayback Machine's closest snapshot of *url* as raw HTML.

    Each variant from ``_url_variants`` is looked up through the availability
    API until one reports an available snapshot with a URL; that snapshot is
    then downloaded in raw form via plain ``requests``.

    Args:
        url: the page to look up.
        timeout: timeout passed to every HTTP call.
        headers: request headers; defaults to ``_DEFAULT_HEADERS``.

    Returns:
        The snapshot body as text, or ``None`` when no usable capture is found
        or the download fails.
    """
    headers = headers or _DEFAULT_HEADERS

    snapshot_url = None
    for variant in _url_variants(url):
        try:
            lookup = requests.get(_WAYBACK_AVAILABLE_API, params={"url": variant},
                                  headers=headers, timeout=timeout)
            meta = lookup.json()
        except Exception:
            # Best effort: a failed lookup just means "try the next variant".
            continue
        closest = (meta.get("archived_snapshots", {}) or {}).get("closest")
        if closest and closest.get("available") and closest.get("url"):
            snapshot_url = closest["url"]
            break

    if snapshot_url is None:
        return None

    try:
        r = requests.get(wayback_raw_url(snapshot_url), headers=headers, timeout=timeout)
        r.raise_for_status()
    except Exception:
        return None
    return r.text
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _flaresolverr_extract(data: dict) -> str:
|
|
108
|
+
if data.get("status") != "ok":
|
|
109
|
+
raise RuntimeError(f"FlareSolverr error: {data.get('message') or data.get('status')}")
|
|
110
|
+
return (data.get("solution") or {}).get("response", "")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
# the Session subclass
|
|
115
|
+
# ---------------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
class CloudflareSession(requests.Session):
    """A :class:`requests.Session` that transparently bypasses Cloudflare.

    Only :meth:`request` is overridden; it dispatches to the transport chosen
    by :meth:`_resolved_mode` and always returns a ``requests.Response``.

    Args:
        mode:                    ``"requests"`` / ``"curl_cffi"`` / ``"wayback"``
                                 / ``"flaresolverr"``. ``None`` → resolve from
                                 the environment, then auto (FlareSolverr if a
                                 URL is configured, else curl_cffi).
        flaresolverr_url:        FlareSolverr base URL (e.g.
                                 ``"http://192.168.1.116:8191"``).
        flaresolverr_timeout_ms: per-request solve budget (default 60000).
        wayback_fallback:        fall back to the Wayback Machine when a live
                                 GET is blocked. ``None`` → read the env flag.
        impersonate:             curl_cffi browser target (default ``"chrome"``).
        env_prefix:              namespace for the env fallbacks (default
                                 ``"UNBLOCK_REQUESTS"``): ``<PREFIX>_TRANSPORT``,
                                 ``<PREFIX>_FLARESOLVERR_URL``,
                                 ``<PREFIX>_FLARESOLVERR_TIMEOUT``,
                                 ``<PREFIX>_WAYBACK_FALLBACK``.

    Raises:
        ValueError: if *mode* is given but is not one of the valid modes.
    """

    def __init__(self, *, mode: Optional[str] = None,
                 flaresolverr_url: Optional[str] = None,
                 flaresolverr_timeout_ms: Optional[int] = None,
                 wayback_fallback: Optional[bool] = None,
                 impersonate: str = "chrome",
                 env_prefix: str = "UNBLOCK_REQUESTS") -> None:
        super().__init__()
        if mode is not None and mode.lower() not in _VALID_MODES:
            raise ValueError(f"mode must be one of {sorted(_VALID_MODES)} or None, got {mode!r}")
        self.cf_mode = mode.lower() if mode else None
        self.flaresolverr_url = flaresolverr_url
        self.flaresolverr_timeout_ms = flaresolverr_timeout_ms
        self.wayback_fallback = wayback_fallback
        self.impersonate = impersonate
        self.env_prefix = env_prefix
        self.headers.update(_DEFAULT_HEADERS)
        # Lazily created curl_cffi session (see _curl_session).
        self._curl: Any = None

    # -- config resolution (explicit kwarg > env > default) ----------------

    def _env(self, suffix: str) -> str:
        """Return the stripped value of ``<env_prefix>_<suffix>``, or ``""`` if unset."""
        return os.environ.get(f"{self.env_prefix}_{suffix}", "").strip()

    def _fs_url(self) -> str:
        """Return the FlareSolverr base URL: the kwarg if set, else the env value."""
        return self.flaresolverr_url or self._env("FLARESOLVERR_URL")

    def _fs_timeout(self) -> int:
        """Return the solve budget in ms: kwarg, else env value, else 60000."""
        if self.flaresolverr_timeout_ms is not None:
            return self.flaresolverr_timeout_ms
        return int(self._env("FLARESOLVERR_TIMEOUT") or "60000")

    def _resolved_mode(self) -> str:
        """Pick the transport: explicit ``mode`` > env > FlareSolverr if a URL is set > curl_cffi.

        Raises:
            ValueError: if ``<PREFIX>_TRANSPORT`` holds an unknown mode. The
                ``mode=`` kwarg is validated the same way in ``__init__``; an
                unchecked env value would silently behave like plain requests.
        """
        if self.cf_mode:
            return self.cf_mode
        env_mode = self._env("TRANSPORT").lower()
        if env_mode:
            if env_mode not in _VALID_MODES:
                raise ValueError(
                    f"{self.env_prefix}_TRANSPORT must be one of "
                    f"{sorted(_VALID_MODES)}, got {env_mode!r}")
            return env_mode
        if self._fs_url():
            return "flaresolverr"
        return "curl_cffi"

    def _do_wayback_fallback(self) -> bool:
        """Return whether the archive fallback is enabled (kwarg, else env flag)."""
        if self.wayback_fallback is not None:
            return self.wayback_fallback
        return _truthy(self._env("WAYBACK_FALLBACK"))

    # -- per-mode fetchers (all return requests.Response) ------------------

    def _curl_session(self):
        """Return the cached curl_cffi session, creating it on first use.

        Raises:
            ImportError: if ``curl_cffi`` is not installed.
        """
        if self._curl is None:
            from curl_cffi import requests as cffi  # type: ignore[import]
            self._curl = cffi.Session(impersonate=self.impersonate)
        # Re-sync on every call so headers set on this session after the curl
        # session was created (e.g. ``s.headers["Referer"] = ...``) are sent.
        self._curl.headers.update(self.headers)
        return self._curl

    def _via_curl(self, method: str, url: str, **kwargs) -> Response:
        """Send the request through curl_cffi and rebuild a ``requests.Response``.

        Only the keyword arguments listed in ``allowed`` are forwarded; the
        rest are dropped.
        """
        allowed = {"params", "data", "json", "headers", "cookies", "timeout",
                   "allow_redirects", "proxies"}
        forwarded = {k: v for k, v in kwargs.items() if k in allowed}
        # Session-level proxies must apply here too, as they do for the native
        # transport; per-request ``proxies=`` still takes precedence.
        if not forwarded.get("proxies") and self.proxies:
            forwarded["proxies"] = dict(self.proxies)
        cr = self._curl_session().request(method, url, **forwarded)
        return _make_response(
            getattr(cr, "url", url), content=cr.content, status=cr.status_code,
            headers=dict(cr.headers), reason=getattr(cr, "reason", "") or "OK")

    def _via_flaresolverr(self, url: str, proxy: Optional[str] = None) -> Response:
        """Fetch *url* with a ``request.get`` command posted to ``<endpoint>/v1``.

        The returned response is synthesized from the solved HTML with the
        default status and headers of ``_make_response``.

        Raises:
            requests.HTTPError: if the FlareSolverr endpoint answers with an error status.
            RuntimeError: if the reply's status is not ``"ok"``.
        """
        endpoint = (self._fs_url() or "http://localhost:8191").rstrip("/")
        timeout_ms = self._fs_timeout()
        payload = {"cmd": "request.get", "url": url, "maxTimeout": timeout_ms}
        if proxy:
            # FlareSolverr drives the headless browser through this proxy — so a
            # rotated proxy (e.g. from anon_requests) reaches the solved request.
            payload["proxy"] = {"url": proxy}
        # Give the HTTP call 30 s more than the solve budget.
        resp = requests.post(f"{endpoint}/v1", json=payload, timeout=timeout_ms / 1000 + 30)
        resp.raise_for_status()
        html = _flaresolverr_extract(resp.json())
        return _make_response(url, content=html.encode("utf-8"))

    def _proxy_url(self, kwargs: dict) -> Optional[str]:
        """A single proxy URL from per-request ``proxies=`` or ``self.proxies``.

        Prefers the ``https`` entry, then ``http``, then any remaining value;
        returns ``None`` when no proxy is configured.
        """
        proxies = kwargs.get("proxies") or self.proxies or {}
        return proxies.get("https") or proxies.get("http") or next(iter(proxies.values()), None)

    def _via_wayback(self, url: str) -> Optional[Response]:
        """Return a response built from the archived copy of *url*, or ``None`` if there is none."""
        html = wayback_html(url, headers=dict(self.headers))
        if html is None:
            return None
        return _make_response(url, content=html.encode("utf-8"))

    # -- the one override that makes the whole Session cloudflare-aware ----

    def request(self, method: str, url: str, **kwargs) -> Response:  # type: ignore[override]
        """Send a request through the resolved transport.

        In ``wayback`` mode the archived snapshot is returned regardless of
        *method*. FlareSolverr is only used for GET; other methods in that mode
        go through the native ``requests`` transport. In ``curl_cffi`` mode a
        missing ``curl_cffi`` package falls back to native ``requests``.

        Raises:
            RuntimeError: in ``wayback`` mode when no snapshot exists, or when a
                GET is answered with a Cloudflare challenge and no archive
                fallback succeeds.
            Exception: any transport error, re-raised unchanged when the
                archive fallback is disabled or finds nothing.
        """
        mode = self._resolved_mode()
        full = _full_url(url, kwargs.get("params"))
        is_get = method.upper() == "GET"

        if mode == "wayback":
            resp = self._via_wayback(full)
            if resp is None:
                raise RuntimeError(f"no Wayback Machine snapshot available for {full}")
            return resp

        try:
            if mode == "flaresolverr" and is_get:
                resp = self._via_flaresolverr(full, proxy=self._proxy_url(kwargs))
            elif mode == "curl_cffi":
                try:
                    resp = self._via_curl(method, url, **kwargs)
                except ImportError:
                    resp = super().request(method, url, **kwargs)  # curl_cffi absent
            else:
                resp = super().request(method, url, **kwargs)
            # Treat a served challenge as a failure so the fallback can fire.
            # The body check alone decides; status codes are not consulted.
            if is_get and is_challenge(resp.text):
                raise RuntimeError("Cloudflare challenge served")
            return resp
        except Exception:
            if is_get and self._do_wayback_fallback():
                archived = self._via_wayback(full)
                if archived is not None:
                    return archived
            raise
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: unblock_requests
|
|
3
|
+
Version: 0.0.1a2
|
|
4
|
+
Summary: A requests.Session subclass that bypasses Cloudflare via curl_cffi, FlareSolverr, or the Wayback Machine
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Requires-Dist: requests>=2.28
|
|
7
|
+
Provides-Extra: stealth
|
|
8
|
+
Requires-Dist: curl-cffi; extra == "stealth"
|
|
9
|
+
Provides-Extra: test
|
|
10
|
+
Requires-Dist: pytest; extra == "test"
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
tests/test_session.py
|
|
4
|
+
unblock_requests/__init__.py
|
|
5
|
+
unblock_requests/session.py
|
|
6
|
+
unblock_requests/version.py
|
|
7
|
+
unblock_requests.egg-info/PKG-INFO
|
|
8
|
+
unblock_requests.egg-info/SOURCES.txt
|
|
9
|
+
unblock_requests.egg-info/dependency_links.txt
|
|
10
|
+
unblock_requests.egg-info/requires.txt
|
|
11
|
+
unblock_requests.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
unblock_requests
|