scrapy-rotating-proxy-middleware 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapy_rotating_proxy/__init__.py +4 -0
- scrapy_rotating_proxy/middleware.py +123 -0
- scrapy_rotating_proxy_middleware-0.1.0.dist-info/METADATA +104 -0
- scrapy_rotating_proxy_middleware-0.1.0.dist-info/RECORD +7 -0
- scrapy_rotating_proxy_middleware-0.1.0.dist-info/WHEEL +5 -0
- scrapy_rotating_proxy_middleware-0.1.0.dist-info/licenses/LICENSE +21 -0
- scrapy_rotating_proxy_middleware-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Drop-in Scrapy downloader middleware for rotating proxies with ban detection.
|
|
2
|
+
|
|
3
|
+
Works with any HTTP/HTTPS/SOCKS proxy — a static list, or a single rotating
|
|
4
|
+
gateway (residential providers rotate the exit IP for you on every connection).
|
|
5
|
+
On an anti-bot block (403/429/Cloudflare/DataDome) it transparently rotates the
|
|
6
|
+
proxy and retries, so your spider keeps flowing instead of dying on bans.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
import random
|
|
10
|
+
from urllib.parse import urlparse, urlunparse
|
|
11
|
+
|
|
12
|
+
from scrapy.exceptions import NotConfigured
|
|
13
|
+
from w3lib.http import basic_auth_header
|
|
14
|
+
|
|
15
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Byte signatures looked for near the top of a response body; finding any of
# them means the page is an anti-bot wall rather than real content.
DEFAULT_BAN_MARKERS = (
    # Cloudflare: challenge script, interstitial page, block page
    b"cf-chl",
    b"Just a moment",
    b"Attention Required",
    # Generic refusal text
    b"Access denied",
    # DataDome
    b"captcha-delivery",
    b"datadome",
    # PerimeterX
    b"px-captcha",
)

# HTTP status codes that, by default, mean "this exit IP got blocked, try
# another one".
DEFAULT_BAN_CODES = {403, 407, 429, 503}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class RotatingProxyMiddleware:
    """Assign a proxy per request and retry on bans by rotating to a fresh one.

    Settings:
        ROTATING_PROXY_LIST         list of proxy URLs (http://user:pass@host:port)
        ROTATING_PROXY_GATEWAY      single rotating-gateway URL (used when LIST is empty)
        ROTATING_PROXY_BAN_CODES    status codes treated as bans (default 403/407/429/503)
        ROTATING_PROXY_MAX_RETRIES  retries per request before giving up (default 5)

    Bookkeeping keys written to ``request.meta``:
        _proxy_retries  rotations already spent on this request
        _rotating       set on retry copies so process_request() swaps the proxy
        _proxy_managed  the current ``proxy`` value was chosen by this middleware
        _proxy_auth     the Proxy-Authorization header was written by this middleware
    """

    def __init__(self, proxies, gateway, ban_codes, max_retries):
        """Store the configuration.

        Args:
            proxies: list of proxy URLs; may be empty or None.
            gateway: single gateway URL, used only when ``proxies`` is empty.
            ban_codes: collection of integer status codes treated as bans.
            max_retries: maximum number of rotations spent on one request.
        """
        self.proxies = proxies or []
        self.gateway = gateway
        self.ban_codes = ban_codes
        self.max_retries = max_retries

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler settings.

        Raises:
            NotConfigured: when neither ROTATING_PROXY_LIST nor
                ROTATING_PROXY_GATEWAY is set.
        """
        s = crawler.settings
        proxies = s.getlist("ROTATING_PROXY_LIST")
        gateway = s.get("ROTATING_PROXY_GATEWAY")
        if not proxies and not gateway:
            raise NotConfigured(
                "Set ROTATING_PROXY_LIST or ROTATING_PROXY_GATEWAY to use "
                "RotatingProxyMiddleware."
            )
        # Setting values may be strings (e.g. "403,429"), so coerce to int;
        # fall back to a private copy of the defaults when nothing is set.
        ban_codes = {int(c) for c in s.getlist("ROTATING_PROXY_BAN_CODES")}
        if not ban_codes:
            ban_codes = set(DEFAULT_BAN_CODES)
        max_retries = s.getint("ROTATING_PROXY_MAX_RETRIES", 5)
        return cls(proxies, gateway, ban_codes, max_retries)

    def _pick(self):
        """Return the next proxy URL: random from the list, or the gateway."""
        if self.proxies:
            return random.choice(self.proxies)
        return self.gateway

    @staticmethod
    def _apply(request, proxy_url):
        """Set ``proxy_url`` on the request and mark it as middleware-managed.

        Inline ``user:pass`` credentials are percent-decoded, moved into the
        Proxy-Authorization header and stripped from the URL stored in
        ``request.meta["proxy"]``. When the proxy has no inline credentials, a
        Proxy-Authorization header this middleware wrote for a previous proxy
        is removed so it is not sent to a different proxy.

        Raises:
            ValueError: if the URL carries credentials but no host.
        """
        from urllib.parse import unquote

        parsed = urlparse(proxy_url)
        if parsed.username:
            # Credentials in a URL are percent-encoded; decode before encoding
            # them as Basic auth.
            request.headers[b"Proxy-Authorization"] = basic_auth_header(
                unquote(parsed.username), unquote(parsed.password or "")
            )
            request.meta["_proxy_auth"] = True
            host = parsed.hostname
            if not host:
                # Do not echo the URL: it contains credentials.
                raise ValueError("Proxy URL with credentials is missing a host")
            if ":" in host:
                # .hostname strips the brackets of an IPv6 literal; restore them.
                host = f"[{host}]"
            netloc = host + (f":{parsed.port}" if parsed.port else "")
            proxy_url = urlunparse((parsed.scheme, netloc, "", "", "", ""))
        elif request.meta.pop("_proxy_auth", False):
            # The header belongs to the previously assigned proxy; only remove
            # it when we wrote it, never one the spider set itself.
            if b"Proxy-Authorization" in request.headers:
                del request.headers[b"Proxy-Authorization"]
        request.meta["proxy"] = proxy_url
        request.meta["_proxy_managed"] = True

    @staticmethod
    def _is_pinned(request):
        """Return True when the request carries a proxy this middleware did not set."""
        return "proxy" in request.meta and not request.meta.get("_proxy_managed")

    def _retry(self, request, retries):
        """Return a copy of ``request`` with a fresh proxy and one more retry counted."""
        new = request.copy()
        new.meta["_proxy_retries"] = retries + 1
        new.meta["_rotating"] = True
        new.meta.pop("proxy", None)
        self._apply(new, self._pick())
        # The URL was already seen by the dupefilter; let the retry through.
        new.dont_filter = True
        return new

    def process_request(self, request, spider):
        """Assign a proxy unless the request already carries one.

        A request that has ``proxy`` in its meta and is not flagged
        ``_rotating`` is left untouched.
        """
        if "proxy" in request.meta and not request.meta.get("_rotating"):
            return  # respect a proxy the spider set deliberately
        self._apply(request, self._pick())

    def _is_ban(self, response):
        """Return True for a ban status code or an anti-bot marker in the body.

        Only the first 4096 bytes of the body are searched, case-insensitively.
        """
        if response.status in self.ban_codes:
            return True
        body = response.body[:4096].lower()
        return any(m.lower() in body for m in DEFAULT_BAN_MARKERS)

    def process_response(self, request, response, spider):
        """Return the response, or a retry request with a new proxy on a ban.

        Responses for spider-pinned proxies are passed through unchanged, as
        are banned responses once the retry budget is exhausted.
        """
        if self._is_pinned(request) or not self._is_ban(response):
            return response
        retries = request.meta.get("_proxy_retries", 0)
        if retries >= self.max_retries:
            logger.warning(
                "Gave up on %s after %d proxy rotations (last status %d)",
                request.url, retries, response.status,
            )
            return response
        logger.debug(
            "Ban detected (%d) on %s — rotating proxy, retry %d/%d",
            response.status, request.url, retries + 1, self.max_retries,
        )
        return self._retry(request, retries)

    def process_exception(self, request, exception, spider):
        """Rotate and retry after a download error, within the same budget.

        Returns None (leaving the exception to other handlers) for
        spider-pinned proxies, when the retry budget is exhausted, and for
        ``IgnoreRequest``, which signals a deliberate drop rather than a dead
        proxy.
        """
        if self._is_pinned(request):
            return None
        retries = request.meta.get("_proxy_retries", 0)
        if retries >= self.max_retries:
            return None
        from scrapy.exceptions import IgnoreRequest

        if isinstance(exception, IgnoreRequest):
            return None
        return self._retry(request, retries)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scrapy-rotating-proxy-middleware
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scrapy downloader middleware that rotates proxies and retries on Cloudflare/DataDome/PerimeterX bans.
|
|
5
|
+
Author-email: JiBao Proxy <support@jibaoproxy.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://jibaoproxy.com
|
|
8
|
+
Project-URL: Source, https://github.com/jibaoproxyofficial-pixel/scrapy-rotating-proxy-middleware
|
|
9
|
+
Keywords: scrapy,proxy,rotating-proxy,web-scraping,cloudflare,datadome,anti-bot,residential-proxy
|
|
10
|
+
Classifier: Framework :: Scrapy
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: Scrapy>=2.0
|
|
19
|
+
Requires-Dist: w3lib
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# scrapy-rotating-proxy-middleware
|
|
23
|
+
|
|
24
|
+
[![PyPI version](https://img.shields.io/pypi/v/scrapy-rotating-proxy-middleware.svg)](https://pypi.org/project/scrapy-rotating-proxy-middleware/)
|
|
25
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/scrapy-rotating-proxy-middleware.svg)](https://pypi.org/project/scrapy-rotating-proxy-middleware/)
|
|
26
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
27
|
+
|
|
28
|
+
A drop-in [Scrapy](https://scrapy.org) downloader middleware that **rotates proxies and retries on bans** — `403`, `429`, Cloudflare "Just a moment", DataDome, and PerimeterX challenges. Point it at a static proxy list or a single rotating gateway and your spider stops dying on blocks.
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install scrapy-rotating-proxy-middleware
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Why
|
|
35
|
+
|
|
36
|
+
Scrapy's built-in `HttpProxyMiddleware` assigns **one** proxy and never reacts when that exit IP gets blocked. In practice most anti-bot blocks aren't about your spider logic — they're about the IP and its [TLS fingerprint](https://jibaoproxy.com/blog/ja3-tls-fingerprint-detection-explained.html) being scored before your request reaches the page. This middleware:
|
|
37
|
+
|
|
38
|
+
- assigns a proxy per request (random from a list, or a rotating gateway),
|
|
39
|
+
- **detects bans** by status code *and* response-body signature (Cloudflare / DataDome / PerimeterX),
|
|
40
|
+
- transparently **rotates to a fresh proxy and retries**, with a per-request retry budget,
|
|
41
|
+
- moves inline `user:pass` credentials into the `Proxy-Authorization` header automatically.
|
|
42
|
+
|
|
43
|
+
## Setup
|
|
44
|
+
|
|
45
|
+
Enable it in `settings.py` and disable Scrapy's default proxy middleware:
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
DOWNLOADER_MIDDLEWARES = {
|
|
49
|
+
"scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": None,
|
|
50
|
+
"scrapy_rotating_proxy.middleware.RotatingProxyMiddleware": 610,
|
|
51
|
+
}
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Option A — a rotating residential gateway (recommended)
|
|
55
|
+
|
|
56
|
+
A residential gateway gives you a **new exit IP on every connection** from a single URL, so you don't manage a list at all:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
# settings.py
|
|
60
|
+
ROTATING_PROXY_GATEWAY = "http://USERNAME:PASSWORD@us.jibaoproxy.com:913"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Option B — a static proxy list
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
ROTATING_PROXY_LIST = [
|
|
67
|
+
"http://USERNAME:PASSWORD@proxy-a.example.com:8000",
|
|
68
|
+
"http://USERNAME:PASSWORD@proxy-b.example.com:8000",
|
|
69
|
+
"socks5://USERNAME:PASSWORD@proxy-c.example.com:1080",
|
|
70
|
+
]
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
That's it — run your spider as usual.
|
|
74
|
+
|
|
75
|
+
## Configuration
|
|
76
|
+
|
|
77
|
+
| Setting | Default | Description |
|
|
78
|
+
| --- | --- | --- |
|
|
79
|
+
| `ROTATING_PROXY_GATEWAY` | – | Single rotating-gateway URL. Used only when `ROTATING_PROXY_LIST` is empty. |
|
|
80
|
+
| `ROTATING_PROXY_LIST` | – | List of proxy URLs; one is picked at random per request. Takes precedence over the gateway when both are set. |
|
|
81
|
+
| `ROTATING_PROXY_BAN_CODES` | `403, 407, 429, 503` | Status codes treated as bans. |
|
|
82
|
+
| `ROTATING_PROXY_MAX_RETRIES` | `5` | Proxy rotations per request before giving up. |
|
|
83
|
+
|
|
84
|
+
Set a proxy on a single request explicitly and the middleware leaves it alone:
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
yield scrapy.Request(url, meta={"proxy": "http://USERNAME:PASSWORD@host:port"})
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Ban detection
|
|
91
|
+
|
|
92
|
+
A response counts as a ban when its status is in `ROTATING_PROXY_BAN_CODES`, **or** the first 4 KB of the body contains (case-insensitively) a known anti-bot signature: `cf-chl`, `Just a moment`, `Attention Required`, `Access denied`, `captcha-delivery` or `datadome` (DataDome), or `px-captcha` (PerimeterX). On a ban the request is re-scheduled with a fresh proxy and `dont_filter=True`, up to the retry budget.
|
|
93
|
+
|
|
94
|
+
If you keep hitting bans after rotation, the exit IPs themselves are the problem — datacenter ranges get scored as bot traffic at the ASN level. Residential exits with clean ASN reputation are what actually pass. We build [JiBao Proxy](https://jibaoproxy.com) for exactly this: 72M+ residential IPs across 200+ countries, sticky sessions, and SOCKS5/HTTP gateways. The middleware works with any provider, though.
|
|
95
|
+
|
|
96
|
+
## Related
|
|
97
|
+
|
|
98
|
+
- [Scrapy proxy middleware: the complete guide](https://jibaoproxy.com/blog/scrapy-proxy-middleware-guide.html)
|
|
99
|
+
- [Why your JA3/TLS fingerprint gets you blocked](https://jibaoproxy.com/blog/ja3-tls-fingerprint-detection-explained.html)
|
|
100
|
+
- [Bypassing DataDome & PerimeterX in 2026](https://jibaoproxy.com/blog/datadome-perimeterx-bypass-2026.html)
|
|
101
|
+
|
|
102
|
+
## License
|
|
103
|
+
|
|
104
|
+
MIT
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
scrapy_rotating_proxy/__init__.py,sha256=o3vPM0cwNozHr_F9LlQCy8O-iDrD5FSO6c0NgqZ45NQ,109
|
|
2
|
+
scrapy_rotating_proxy/middleware.py,sha256=cZKGY2FyptswODmEcyNHyz4TPaFMYWGU78lhAfVus9w,4971
|
|
3
|
+
scrapy_rotating_proxy_middleware-0.1.0.dist-info/licenses/LICENSE,sha256=EqlXESyqjpkAYUf6imqo1JrW9eYaOurSRNBh4LO7bQE,1068
|
|
4
|
+
scrapy_rotating_proxy_middleware-0.1.0.dist-info/METADATA,sha256=TjyOjlwibr_Q_EVn05Z10XT3gBAEE_RaG0tCxmYOHP0,4984
|
|
5
|
+
scrapy_rotating_proxy_middleware-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
scrapy_rotating_proxy_middleware-0.1.0.dist-info/top_level.txt,sha256=E4N3CweZpjiY77HqEG76gQN1LX5JeMB3nO42Evhc_-k,22
|
|
7
|
+
scrapy_rotating_proxy_middleware-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 JiBao Proxy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
scrapy_rotating_proxy
|