proxyspin 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proxyspin-0.1.0/.gitignore +4 -0
- proxyspin-0.1.0/LICENSE +21 -0
- proxyspin-0.1.0/PKG-INFO +133 -0
- proxyspin-0.1.0/README.md +108 -0
- proxyspin-0.1.0/pyproject.toml +39 -0
- proxyspin-0.1.0/src/proxyspin/__init__.py +7 -0
- proxyspin-0.1.0/src/proxyspin/cli.py +94 -0
- proxyspin-0.1.0/src/proxyspin/playwright_helper.py +38 -0
- proxyspin-0.1.0/src/proxyspin/pool.py +167 -0
- proxyspin-0.1.0/src/proxyspin/proxy.py +97 -0
- proxyspin-0.1.0/src/proxyspin/requests_adapter.py +63 -0
- proxyspin-0.1.0/src/proxyspin/scrapy_middleware.py +127 -0
- proxyspin-0.1.0/tests/test_proxyspin.py +110 -0
proxyspin-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 GProxy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
proxyspin-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: proxyspin
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Rotating proxy pool for Scrapy, Playwright and requests — health tracking, ban detection, sticky sessions. Zero dependencies.
|
|
5
|
+
Project-URL: Homepage, https://github.com/gproxynet/proxyspin
|
|
6
|
+
Project-URL: Issues, https://github.com/gproxynet/proxyspin/issues
|
|
7
|
+
Author-email: GProxy <support@gproxy.net>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: anti-ban,playwright,proxy,proxy-pool,proxy-rotation,requests,rotating-proxies,scrapy,web-scraping
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Framework :: Scrapy
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Provides-Extra: playwright
|
|
19
|
+
Requires-Dist: playwright>=1.30; extra == 'playwright'
|
|
20
|
+
Provides-Extra: requests
|
|
21
|
+
Requires-Dist: requests>=2.25; extra == 'requests'
|
|
22
|
+
Provides-Extra: scrapy
|
|
23
|
+
Requires-Dist: scrapy>=2.5; extra == 'scrapy'
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# proxyspin
|
|
27
|
+
|
|
28
|
+
Rotating proxy pool for **Scrapy**, **Playwright** and **requests** — with health tracking, ban detection, sticky sessions and a built-in list checker. Zero required dependencies, pure stdlib.
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
pip install proxyspin
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Why another one? The classic `scrapy-rotating-proxies` hasn't been updated in years and is Scrapy-only. `proxyspin` is one small pool you can share across Scrapy spiders, Playwright contexts and plain `requests` code, with the same health model everywhere.
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
- **One pool, three integrations** — Scrapy middleware, Playwright helper, `requests` session.
|
|
39
|
+
- **Rotation strategies** — `round_robin`, `random`, `sticky` (same proxy per domain/account/worker until it dies).
|
|
40
|
+
- **Health model** — a proxy failing N times in a row is benched with exponential backoff and returns automatically; a success resets its streak.
|
|
41
|
+
- **Ban detection** — configurable HTTP codes (403/407/429 by default) count as proxy failures and trigger a retry through the next proxy.
|
|
42
|
+
- **Any list format** — `scheme://user:pass@host:port`, `host:port`, `host:port:user:pass`, `user:pass@host:port`; load from a list, a file or a provider URL.
|
|
43
|
+
- **CLI checker** — `proxyspin check proxies.txt` tests the whole list concurrently and can write the alive ones out.
|
|
44
|
+
|
|
45
|
+
## Quickstart
|
|
46
|
+
|
|
47
|
+
### The pool
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from proxyspin import ProxyPool
|
|
51
|
+
|
|
52
|
+
pool = ProxyPool.from_file("proxies.txt", strategy="round_robin")
|
|
53
|
+
# or inline:
|
|
54
|
+
pool = ProxyPool(["http://user:pass@gate1.example.com:8000", "10.0.0.2:8000"])
|
|
55
|
+
# or straight from your provider's export endpoint:
|
|
56
|
+
pool = ProxyPool.from_url("https://provider.example.com/api/my-list.txt")
|
|
57
|
+
|
|
58
|
+
proxy = pool.get() # -> Proxy; proxy.url is ready to use
|
|
59
|
+
pool.mark_failed(proxy) # bench it after repeated failures
|
|
60
|
+
pool.mark_ok(proxy) # reset its failure streak
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Scrapy
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
# settings.py
|
|
67
|
+
DOWNLOADER_MIDDLEWARES = {
|
|
68
|
+
"proxyspin.scrapy_middleware.ProxySpinMiddleware": 610,
|
|
69
|
+
}
|
|
70
|
+
PROXYSPIN_FILE = "proxies.txt"
|
|
71
|
+
PROXYSPIN_STRATEGY = "sticky" # one proxy per target host
|
|
72
|
+
PROXYSPIN_BAN_CODES = [403, 429] # these responses rotate the proxy
|
|
73
|
+
PROXYSPIN_MAX_RETRIES = 3
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Per-request overrides: set `request.meta["proxy"]` to pin a proxy, `meta["proxyspin_disabled"] = True` to skip proxying, `meta["proxyspin_key"]` to control stickiness.
|
|
77
|
+
|
|
78
|
+
### Playwright
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from playwright.sync_api import sync_playwright
|
|
82
|
+
from proxyspin import ProxyPool
|
|
83
|
+
from proxyspin.playwright_helper import proxy_settings
|
|
84
|
+
|
|
85
|
+
pool = ProxyPool.from_file("proxies.txt", strategy="sticky")
|
|
86
|
+
|
|
87
|
+
with sync_playwright() as p:
|
|
88
|
+
browser = p.chromium.launch()
|
|
89
|
+
for account in accounts:
|
|
90
|
+
context = browser.new_context(proxy=proxy_settings(pool, key=account.id))
|
|
91
|
+
# each account keeps its own IP for the whole session
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### requests
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from proxyspin import ProxyPool
|
|
98
|
+
from proxyspin.requests_adapter import RotatingSession
|
|
99
|
+
|
|
100
|
+
session = RotatingSession(ProxyPool.from_file("proxies.txt"))
|
|
101
|
+
print(session.get("https://httpbin.org/ip").json()) # new IP per call
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Check a list
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
$ proxyspin check proxies.txt --workers 100 --alive-out alive.txt
|
|
108
|
+
OK 45.155.10.4:8000 612 ms HTTP 200
|
|
109
|
+
DEAD 91.10.77.2:3128 TimeoutError
|
|
110
|
+
...
|
|
111
|
+
118/200 alive
|
|
112
|
+
wrote 118 proxies to alive.txt
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Providers
|
|
116
|
+
|
|
117
|
+
`proxyspin` works with any proxy source: your own servers, free lists, or commercial providers. Example with [GProxy](https://gproxy.net/?utm_source=github&utm_medium=readme&utm_campaign=proxyspin) residential/mobile gateways (rotation happens server-side, so a pool of one entry per gateway is enough — use `sticky` if you need session pinning):
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
pool = ProxyPool([
|
|
121
|
+
"http://USER:PASS@gate.gproxy.net:8000", # residential, rotating
|
|
122
|
+
])
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Any other provider works the same way — put its gateway or IP list into the pool.
|
|
126
|
+
|
|
127
|
+
## Health model in one paragraph
|
|
128
|
+
|
|
129
|
+
Every proxy starts healthy. `mark_failed` increments its failure streak; when the streak reaches `max_failures` (default 2) the proxy is benched for `cooldown * 2**overshoot` seconds (default base 60 s, capped at 1 h), where `overshoot` is the number of consecutive failures beyond `max_failures` (0 on the first bench), then automatically rejoins rotation. `mark_ok` resets the streak. The Scrapy middleware and `RotatingSession` call these for you based on exceptions and ban codes; with Playwright you call them yourself since only your code knows what a "ban" looks like for your flow.
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# proxyspin
|
|
2
|
+
|
|
3
|
+
Rotating proxy pool for **Scrapy**, **Playwright** and **requests** — with health tracking, ban detection, sticky sessions and a built-in list checker. Zero required dependencies, pure stdlib.
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
pip install proxyspin
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
Why another one? The classic `scrapy-rotating-proxies` hasn't been updated in years and is Scrapy-only. `proxyspin` is one small pool you can share across Scrapy spiders, Playwright contexts and plain `requests` code, with the same health model everywhere.
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
- **One pool, three integrations** — Scrapy middleware, Playwright helper, `requests` session.
|
|
14
|
+
- **Rotation strategies** — `round_robin`, `random`, `sticky` (same proxy per domain/account/worker until it dies).
|
|
15
|
+
- **Health model** — a proxy failing N times in a row is benched with exponential backoff and returns automatically; a success resets its streak.
|
|
16
|
+
- **Ban detection** — configurable HTTP codes (403/407/429 by default) count as proxy failures and trigger a retry through the next proxy.
|
|
17
|
+
- **Any list format** — `scheme://user:pass@host:port`, `host:port`, `host:port:user:pass`, `user:pass@host:port`; load from a list, a file or a provider URL.
|
|
18
|
+
- **CLI checker** — `proxyspin check proxies.txt` tests the whole list concurrently and can write the alive ones out.
|
|
19
|
+
|
|
20
|
+
## Quickstart
|
|
21
|
+
|
|
22
|
+
### The pool
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from proxyspin import ProxyPool
|
|
26
|
+
|
|
27
|
+
pool = ProxyPool.from_file("proxies.txt", strategy="round_robin")
|
|
28
|
+
# or inline:
|
|
29
|
+
pool = ProxyPool(["http://user:pass@gate1.example.com:8000", "10.0.0.2:8000"])
|
|
30
|
+
# or straight from your provider's export endpoint:
|
|
31
|
+
pool = ProxyPool.from_url("https://provider.example.com/api/my-list.txt")
|
|
32
|
+
|
|
33
|
+
proxy = pool.get() # -> Proxy; proxy.url is ready to use
|
|
34
|
+
pool.mark_failed(proxy) # bench it after repeated failures
|
|
35
|
+
pool.mark_ok(proxy) # reset its failure streak
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Scrapy
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
# settings.py
|
|
42
|
+
DOWNLOADER_MIDDLEWARES = {
|
|
43
|
+
"proxyspin.scrapy_middleware.ProxySpinMiddleware": 610,
|
|
44
|
+
}
|
|
45
|
+
PROXYSPIN_FILE = "proxies.txt"
|
|
46
|
+
PROXYSPIN_STRATEGY = "sticky" # one proxy per target host
|
|
47
|
+
PROXYSPIN_BAN_CODES = [403, 429] # these responses rotate the proxy
|
|
48
|
+
PROXYSPIN_MAX_RETRIES = 3
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Per-request overrides: set `request.meta["proxy"]` to pin a proxy, `meta["proxyspin_disabled"] = True` to skip proxying, `meta["proxyspin_key"]` to control stickiness.
|
|
52
|
+
|
|
53
|
+
### Playwright
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from playwright.sync_api import sync_playwright
|
|
57
|
+
from proxyspin import ProxyPool
|
|
58
|
+
from proxyspin.playwright_helper import proxy_settings
|
|
59
|
+
|
|
60
|
+
pool = ProxyPool.from_file("proxies.txt", strategy="sticky")
|
|
61
|
+
|
|
62
|
+
with sync_playwright() as p:
|
|
63
|
+
browser = p.chromium.launch()
|
|
64
|
+
for account in accounts:
|
|
65
|
+
context = browser.new_context(proxy=proxy_settings(pool, key=account.id))
|
|
66
|
+
# each account keeps its own IP for the whole session
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### requests
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from proxyspin import ProxyPool
|
|
73
|
+
from proxyspin.requests_adapter import RotatingSession
|
|
74
|
+
|
|
75
|
+
session = RotatingSession(ProxyPool.from_file("proxies.txt"))
|
|
76
|
+
print(session.get("https://httpbin.org/ip").json()) # new IP per call
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Check a list
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
$ proxyspin check proxies.txt --workers 100 --alive-out alive.txt
|
|
83
|
+
OK 45.155.10.4:8000 612 ms HTTP 200
|
|
84
|
+
DEAD 91.10.77.2:3128 TimeoutError
|
|
85
|
+
...
|
|
86
|
+
118/200 alive
|
|
87
|
+
wrote 118 proxies to alive.txt
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Providers
|
|
91
|
+
|
|
92
|
+
`proxyspin` works with any proxy source: your own servers, free lists, or commercial providers. Example with [GProxy](https://gproxy.net/?utm_source=github&utm_medium=readme&utm_campaign=proxyspin) residential/mobile gateways (rotation happens server-side, so a pool of one entry per gateway is enough — use `sticky` if you need session pinning):
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
pool = ProxyPool([
|
|
96
|
+
"http://USER:PASS@gate.gproxy.net:8000", # residential, rotating
|
|
97
|
+
])
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Any other provider works the same way — put its gateway or IP list into the pool.
|
|
101
|
+
|
|
102
|
+
## Health model in one paragraph
|
|
103
|
+
|
|
104
|
+
Every proxy starts healthy. `mark_failed` increments its failure streak; when the streak reaches `max_failures` (default 2) the proxy is benched for `cooldown * 2**overshoot` seconds (default base 60 s, capped at 1 h), where `overshoot` is the number of consecutive failures beyond `max_failures` (0 on the first bench), then automatically rejoins rotation. `mark_ok` resets the streak. The Scrapy middleware and `RotatingSession` call these for you based on exceptions and ban codes; with Playwright you call them yourself since only your code knows what a "ban" looks like for your flow.
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
MIT
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "proxyspin"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Rotating proxy pool for Scrapy, Playwright and requests — health tracking, ban detection, sticky sessions. Zero dependencies."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [{ name = "GProxy", email = "support@gproxy.net" }]
|
|
13
|
+
keywords = [
|
|
14
|
+
"proxy", "rotating-proxies", "scrapy", "playwright", "requests",
|
|
15
|
+
"web-scraping", "proxy-rotation", "proxy-pool", "anti-ban",
|
|
16
|
+
]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 4 - Beta",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
22
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
23
|
+
"Framework :: Scrapy",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
requests = ["requests>=2.25"]
|
|
28
|
+
scrapy = ["scrapy>=2.5"]
|
|
29
|
+
playwright = ["playwright>=1.30"]
|
|
30
|
+
|
|
31
|
+
[project.scripts]
|
|
32
|
+
proxyspin = "proxyspin.cli:main"
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/gproxynet/proxyspin"
|
|
36
|
+
Issues = "https://github.com/gproxynet/proxyspin/issues"
|
|
37
|
+
|
|
38
|
+
[tool.hatch.build.targets.wheel]
|
|
39
|
+
packages = ["src/proxyspin"]
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""proxyspin — rotating proxy pool for Scrapy, Playwright and requests."""
|
|
2
|
+
|
|
3
|
+
from .pool import NoHealthyProxies, ProxyPool
|
|
4
|
+
from .proxy import Proxy, parse_proxy
|
|
5
|
+
|
|
6
|
+
__version__ = "0.1.0"
|
|
7
|
+
__all__ = ["Proxy", "ProxyPool", "NoHealthyProxies", "parse_proxy", "__version__"]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""``proxyspin check`` — fast concurrent liveness check for a proxy list.
|
|
2
|
+
|
|
3
|
+
Zero dependencies: stdlib only (http/https proxies; socks lines are skipped).
|
|
4
|
+
|
|
5
|
+
::
|
|
6
|
+
|
|
7
|
+
proxyspin check proxies.txt
|
|
8
|
+
proxyspin check proxies.txt --url https://example.com --timeout 5 --workers 100
|
|
9
|
+
proxyspin check proxies.txt --alive-out alive.txt
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import sys
|
|
15
|
+
import time
|
|
16
|
+
import urllib.error
|
|
17
|
+
import urllib.request
|
|
18
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from .proxy import Proxy, parse_proxy
|
|
22
|
+
|
|
23
|
+
DEFAULT_TEST_URL = "http://httpbin.org/ip"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def check_one(proxy: Proxy, url: str, timeout: float) -> tuple[Proxy, float | None, str]:
    """Fetch ``url`` through ``proxy`` once and report the outcome.

    Returns ``(proxy, latency, detail)``. ``latency`` is the elapsed time in
    seconds when the request succeeded and ``None`` otherwise. ``detail`` is
    ``"HTTP <status>"`` for any HTTP answer (successful or not) and the
    exception class name for every other failure. The function never raises
    for request errors, so it is safe to run in a worker thread.
    """
    # The same proxy URL is used for both plain and TLS targets.
    handler = urllib.request.ProxyHandler({"http": proxy.url, "https": proxy.url})
    opener = urllib.request.build_opener(handler)
    opener.addheaders = [("User-Agent", "proxyspin-check")]
    start = time.monotonic()
    try:
        with opener.open(url, timeout=timeout) as resp:
            # A small read is enough to prove the proxy relays a body.
            resp.read(256)
            return proxy, time.monotonic() - start, f"HTTP {resp.status}"
    except urllib.error.HTTPError as exc:
        # HTTPError wraps the open response; close it so the socket is
        # released now instead of whenever the exception is collected.
        exc.close()
        return proxy, None, f"HTTP {exc.code}"
    except Exception as exc:
        # Deliberately broad: any failure just means "this proxy is dead".
        return proxy, None, exc.__class__.__name__
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``proxyspin`` command line and return its exit status.

    ``argv`` is the argument list without the program name; ``None`` lets
    argparse read ``sys.argv``.

    Exit status:
        0  at least one proxy answered
        1  every checked proxy failed
        2  the list file could not be read, or it held no checkable proxies
           (argparse also exits with 2 on usage errors)
    """
    parser = argparse.ArgumentParser(prog="proxyspin", description=__doc__)
    sub = parser.add_subparsers(dest="command", required=True)
    check = sub.add_parser("check", help="check which proxies in a list are alive")
    check.add_argument("file", help="proxy list, one per line (any common format)")
    check.add_argument("--url", default=DEFAULT_TEST_URL, help="URL fetched through each proxy")
    check.add_argument("--timeout", type=float, default=10.0)
    check.add_argument("--workers", type=int, default=50)
    check.add_argument("--alive-out", help="write working proxies to this file")
    args = parser.parse_args(argv)

    # ThreadPoolExecutor rejects max_workers < 1 with a bare ValueError;
    # report it as a normal usage error instead.
    if args.workers < 1:
        parser.error("--workers must be at least 1")

    try:
        raw_lines = Path(args.file).read_text().splitlines()
    except OSError as exc:
        print(f"cannot read {args.file}: {exc}", file=sys.stderr)
        return 2

    proxies: list[Proxy] = []
    for line in raw_lines:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        try:
            proxy = parse_proxy(line)
        except ValueError as exc:
            print(f" skip: {exc}", file=sys.stderr)
            continue
        # The checker is built on urllib, which cannot speak SOCKS.
        if proxy.scheme.startswith("socks"):
            print(f" skip (socks not supported by checker): {proxy.address}", file=sys.stderr)
            continue
        proxies.append(proxy)

    if not proxies:
        print("no proxies to check", file=sys.stderr)
        return 2

    alive: list[tuple[Proxy, float]] = []
    with ThreadPoolExecutor(max_workers=args.workers) as pool:
        futures = [pool.submit(check_one, p, args.url, args.timeout) for p in proxies]
        # Results are printed in completion order, not list order.
        for future in as_completed(futures):
            proxy, latency, detail = future.result()
            if latency is not None:
                alive.append((proxy, latency))
                print(f"OK {proxy.address:<21} {latency * 1000:6.0f} ms {detail}")
            else:
                print(f"DEAD {proxy.address:<21} {detail}")

    print(f"\n{len(alive)}/{len(proxies)} alive")
    if args.alive_out:
        # Fastest proxies first in the output file.
        alive.sort(key=lambda item: item[1])
        Path(args.alive_out).write_text(
            "\n".join(proxy.url for proxy, _ in alive) + ("\n" if alive else "")
        )
        print(f"wrote {len(alive)} proxies to {args.alive_out}")
    return 0 if alive else 1
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
if __name__ == "__main__": # pragma: no cover
|
|
94
|
+
sys.exit(main())
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Playwright integration: per-context rotating proxies.
|
|
2
|
+
|
|
3
|
+
Playwright accepts a proxy per browser launch or per context. The natural
|
|
4
|
+
rotation unit is the context::
|
|
5
|
+
|
|
6
|
+
from playwright.sync_api import sync_playwright
|
|
7
|
+
from proxyspin import ProxyPool
|
|
8
|
+
from proxyspin.playwright_helper import proxy_settings
|
|
9
|
+
|
|
10
|
+
pool = ProxyPool.from_file("proxies.txt", strategy="sticky")
|
|
11
|
+
|
|
12
|
+
with sync_playwright() as p:
|
|
13
|
+
browser = p.chromium.launch()
|
|
14
|
+
context = browser.new_context(proxy=proxy_settings(pool, key="job-42"))
|
|
15
|
+
page = context.new_page()
|
|
16
|
+
page.goto("https://example.com")
|
|
17
|
+
|
|
18
|
+
Async API is identical — ``proxy_settings`` is plain data, no I/O.
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from .pool import ProxyPool
|
|
23
|
+
from .proxy import Proxy
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def proxy_settings(pool_or_proxy: ProxyPool | Proxy, key: str | None = None) -> dict:
    """Build the ``proxy`` dict Playwright expects, from a pool or a proxy.

    When given a pool, one proxy is drawn with ``pool.get(key=key)``; when
    given a proxy it is used as is and ``key`` is ignored.

    The result always has a ``server`` entry and gains ``username`` /
    ``password`` entries only for the credentials that are set. Credentials
    are passed via the dedicated fields (Playwright ignores userinfo
    embedded in the server URL).
    """
    proxy = pool_or_proxy.get(key=key) if isinstance(pool_or_proxy, ProxyPool) else pool_or_proxy
    host = proxy.host
    # An IPv6 literal must be bracketed inside a URL, otherwise the port
    # separator is ambiguous.
    if ":" in host and not host.startswith("["):
        host = f"[{host}]"
    settings: dict = {"server": f"{proxy.scheme}://{host}:{proxy.port}"}
    if proxy.username is not None:
        settings["username"] = proxy.username
    if proxy.password is not None:
        settings["password"] = proxy.password
    return settings
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Proxy pool: sources, rotation strategies and health tracking."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import random
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
import urllib.request
|
|
8
|
+
from collections.abc import Iterable
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .proxy import Proxy, ProxyState, parse_proxy
|
|
12
|
+
|
|
13
|
+
STRATEGIES = ("round_robin", "random", "sticky")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class NoHealthyProxies(RuntimeError):
    """Signals that the pool has no proxy it can hand out right now."""

    pass
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ProxyPool:
    """Thread-safe rotating proxy pool with failure-based cooldowns.

    Rotation strategies:
        round_robin   cycle through healthy proxies in order (default)
        random        pick a healthy proxy at random
        sticky        keep returning the same proxy for a given ``key``
                      (e.g. a target domain) until it goes unhealthy

    Health model: once a proxy has failed ``max_failures`` times in a row
    it is benched for ``cooldown * 2**overshoot`` seconds (capped at
    ``max_cooldown``), where ``overshoot`` is the number of consecutive
    failures beyond ``max_failures``. After that time it is handed out
    again. A success resets its failure counter.
    """

    # 2**1023 is the largest power of two that still converts to a float.
    # Capping the exponent keeps ``cooldown * 2**overshoot`` from raising
    # OverflowError after a very long failure streak.
    _MAX_BACKOFF_EXPONENT = 1023

    def __init__(
        self,
        proxies: Iterable[Proxy | str] = (),
        *,
        strategy: str = "round_robin",
        max_failures: int = 2,
        cooldown: float = 60.0,
        max_cooldown: float = 3600.0,
        default_scheme: str = "http",
    ) -> None:
        """Create a pool from ``Proxy`` objects and/or proxy strings.

        ``default_scheme`` is handed to ``parse_proxy`` for string entries.
        Raises ``ValueError`` for an unknown ``strategy``.
        """
        if strategy not in STRATEGIES:
            raise ValueError(f"unknown strategy {strategy!r}, expected one of {STRATEGIES}")
        self.strategy = strategy
        self.max_failures = max_failures
        self.cooldown = cooldown
        self.max_cooldown = max_cooldown
        self.default_scheme = default_scheme
        self._lock = threading.Lock()
        self._states: dict[Proxy, ProxyState] = {}
        self._order: list[Proxy] = []  # insertion order, drives round_robin
        self._rr_index = 0
        self._sticky: dict[str, Proxy] = {}  # sticky key -> pinned proxy
        self.extend(proxies)

    # ------------------------------------------------------------- sources
    @staticmethod
    def _proxy_lines(text: str) -> list[str]:
        """Return the lines of ``text`` that are neither blank nor ``#`` comments."""
        return [
            line
            for line in text.splitlines()
            if line.strip() and not line.lstrip().startswith("#")
        ]

    @classmethod
    def from_file(cls, path: str | Path, **kwargs) -> "ProxyPool":
        """One proxy per line; blank lines and ``#`` comments are skipped.

        Extra keyword arguments are forwarded to the constructor.
        """
        return cls(cls._proxy_lines(Path(path).read_text()), **kwargs)

    @classmethod
    def from_url(cls, url: str, *, timeout: float = 15.0, **kwargs) -> "ProxyPool":
        """Fetch a plain-text proxy list (one per line) from a URL.

        Handy for provider endpoints that export your current proxy list.
        The body is decoded as UTF-8 with undecodable bytes replaced; blank
        lines and ``#`` comments are skipped. Extra keyword arguments are
        forwarded to the constructor.
        """
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            body = resp.read().decode("utf-8", "replace")
        return cls(cls._proxy_lines(body), **kwargs)

    def extend(self, proxies: Iterable[Proxy | str]) -> None:
        """Add proxies to the pool; entries already present are ignored.

        Strings are parsed with ``parse_proxy`` and may raise ``ValueError``.
        """
        with self._lock:
            for item in proxies:
                proxy = item if isinstance(item, Proxy) else parse_proxy(item, self.default_scheme)
                if proxy not in self._states:
                    self._states[proxy] = ProxyState(proxy)
                    self._order.append(proxy)

    # ------------------------------------------------------------ rotation
    def get(self, key: str | None = None) -> Proxy:
        """Return the next healthy proxy according to the pool strategy.

        ``key`` is only used by the ``sticky`` strategy (any hashable label:
        target domain, account id, worker name...). A ``sticky`` pool called
        without a key falls back to round-robin.

        Raises ``NoHealthyProxies`` when no proxy is currently available.
        """
        with self._lock:
            healthy = self._healthy_locked()
            if not healthy:
                raise NoHealthyProxies(
                    f"all {len(self._order)} proxies are cooling down; "
                    "retry later or add more proxies"
                )
            if self.strategy == "sticky" and key is not None:
                current = self._sticky.get(key)
                if current is not None and current in healthy:
                    return current
                # No usable pin for this key: pick a fresh one and remember it.
                choice = random.choice(healthy)
                self._sticky[key] = choice
                return choice
            if self.strategy == "random":
                return random.choice(healthy)
            # The index is advanced before use and wraps on the number of
            # currently healthy proxies.
            self._rr_index = (self._rr_index + 1) % len(healthy)
            return healthy[self._rr_index]

    # -------------------------------------------------------------- health
    def mark_ok(self, proxy: Proxy) -> None:
        """Record a success: clear the failure streak and any active bench.

        Proxies that are not in the pool are ignored.
        """
        with self._lock:
            state = self._states.get(proxy)
            if state is not None:
                state.successes += 1
                state.failures = 0
                state.banned_until = 0.0

    def mark_failed(self, proxy: Proxy) -> None:
        """Record a failure and bench the proxy once its streak is long enough.

        Proxies that are not in the pool are ignored. When the proxy is
        benched, every sticky key pinned to it is released.
        """
        with self._lock:
            state = self._states.get(proxy)
            if state is None:
                return
            state.failures += 1
            if state.failures < self.max_failures:
                return
            # Exponential backoff on the failures beyond the threshold.
            overshoot = min(state.failures - self.max_failures, self._MAX_BACKOFF_EXPONENT)
            delay = min(self.cooldown * (2**overshoot), self.max_cooldown)
            state.banned_until = time.monotonic() + delay
            self._forget_sticky_locked(proxy)

    def remove(self, proxy: Proxy) -> None:
        """Drop a proxy from the pool together with its state and sticky pins."""
        with self._lock:
            self._states.pop(proxy, None)
            if proxy in self._order:
                self._order.remove(proxy)
            self._forget_sticky_locked(proxy)

    def _forget_sticky_locked(self, proxy: Proxy) -> None:
        """Release every sticky key pinned to ``proxy``; caller holds the lock."""
        stale = [key for key, pinned in self._sticky.items() if pinned == proxy]
        for key in stale:
            del self._sticky[key]

    # --------------------------------------------------------------- stats
    def _healthy_locked(self) -> list[Proxy]:
        """Proxies whose bench has expired, in pool order; caller holds the lock."""
        now = time.monotonic()
        return [p for p in self._order if self._states[p].banned_until <= now]

    @property
    def healthy_count(self) -> int:
        """Number of proxies that could be handed out right now."""
        with self._lock:
            return len(self._healthy_locked())

    def __len__(self) -> int:
        """Total number of proxies in the pool, benched ones included."""
        return len(self._order)

    def stats(self) -> dict[str, dict]:
        """Snapshot of per-proxy health, keyed by ``host:port``."""
        now = time.monotonic()
        with self._lock:
            return {
                p.address: {
                    "successes": s.successes,
                    "failures": s.failures,
                    "banned_for": max(0.0, round(s.banned_until - now, 1)),
                }
                for p, s in self._states.items()
            }
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Proxy model and parsing of common list formats."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from urllib.parse import quote, unquote, urlsplit
|
|
6
|
+
|
|
7
|
+
SUPPORTED_SCHEMES = ("http", "https", "socks4", "socks5", "socks5h")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class Proxy:
    """Immutable, hashable description of a single proxy endpoint.

    ``host`` is stored without brackets even for IPv6 literals; the ``url``
    and ``address`` properties add them when needed.
    """

    host: str
    port: int
    scheme: str = "http"
    username: str | None = None
    password: str | None = None

    @property
    def _host_literal(self) -> str:
        """Host as it must be written next to a port.

        An IPv6 literal contains colons, so it is wrapped in brackets to keep
        the ``host:port`` separator unambiguous.
        """
        if ":" in self.host and not self.host.startswith("["):
            return f"[{self.host}]"
        return self.host

    @property
    def url(self) -> str:
        """Full proxy URL, credentials included if present.

        Username and password are percent-encoded. A password without a
        username is not emitted.
        """
        auth = ""
        if self.username is not None:
            auth = quote(self.username, safe="")
            if self.password is not None:
                auth += ":" + quote(self.password, safe="")
            auth += "@"
        return f"{self.scheme}://{auth}{self._host_literal}:{self.port}"

    @property
    def address(self) -> str:
        """``host:port`` without scheme or credentials."""
        return f"{self._host_literal}:{self.port}"

    def __str__(self) -> str:  # pragma: no cover - convenience
        return self.url
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def parse_proxy(line: str, default_scheme: str = "http") -> Proxy:
    """Turn one line of a proxy list into a ``Proxy``.

    Recognised layouts:
        scheme://user:pass@host:port
        scheme://host:port
        user:pass@host:port
        host:port
        host:port:user:pass

    ``default_scheme`` is applied to every layout that carries no scheme.
    Raises ``ValueError`` for empty, malformed or unsupported input.
    """
    text = line.strip()
    if not text:
        raise ValueError("empty proxy line")

    # URL layout: urlsplit handles userinfo, host and port.
    if "://" in text:
        split = urlsplit(text)
        if split.scheme not in SUPPORTED_SCHEMES:
            raise ValueError(f"unsupported proxy scheme: {split.scheme!r}")
        if not (split.hostname and split.port):
            raise ValueError(f"proxy needs host and port: {text!r}")
        # Credentials arrive percent-encoded; empty ones become None.
        username = unquote(split.username) if split.username else None
        password = unquote(split.password) if split.password else None
        return Proxy(
            host=split.hostname,
            port=split.port,
            scheme=split.scheme,
            username=username,
            password=password,
        )

    # user:pass@host:port — split on the last "@" so passwords may hold one.
    if "@" in text:
        credentials, _, endpoint = text.rpartition("@")
        user, _, secret = credentials.partition(":")
        host, _, port = endpoint.rpartition(":")
        _validate(host, port, text)
        return Proxy(host, int(port), default_scheme, user, secret or None)

    # Colon-separated layouts: exactly two or exactly four fields.
    fields = text.split(":")
    if len(fields) not in (2, 4):
        raise ValueError(f"cannot parse proxy: {text!r}")
    host, port, *login = fields
    _validate(host, port, text)
    return Proxy(host, int(port), default_scheme, *login)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _validate(host: str, port: str, line: str) -> None:
|
|
85
|
+
if not host or not port.isdigit() or not 0 < int(port) < 65536:
|
|
86
|
+
raise ValueError(f"cannot parse proxy: {line!r}")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class ProxyState:
    """Mutable health bookkeeping attached to a proxy inside a pool.

    Only the fields and their defaults are declared here; the code that
    updates them lives elsewhere (presumably the pool — confirm in pool.py).
    """

    # The proxy this record describes (the only required field).
    proxy: Proxy
    # Failure counter, starts at 0.  NOTE(review): whether this is a total or
    # a consecutive-failure streak is decided by the updating code — confirm.
    failures: int = 0
    # Success counter, starts at 0.
    successes: int = 0
    # Presumably a timestamp before which the proxy is considered banned;
    # 0.0 by default.  Clock and units are not visible here — TODO confirm.
    banned_until: float = 0.0
    # Free-form per-proxy data; default_factory gives each instance its own dict.
    extra: dict = field(default_factory=dict)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""requests integration: a Session that rotates proxies per request.
|
|
2
|
+
|
|
3
|
+
::
|
|
4
|
+
|
|
5
|
+
from proxyspin import ProxyPool
|
|
6
|
+
from proxyspin.requests_adapter import RotatingSession
|
|
7
|
+
|
|
8
|
+
pool = ProxyPool.from_file("proxies.txt")
|
|
9
|
+
session = RotatingSession(pool)
|
|
10
|
+
r = session.get("https://httpbin.org/ip") # each call uses the next proxy
|
|
11
|
+
|
|
12
|
+
Failures (connect errors, or responses with a status in ``ban_codes``)
|
|
13
|
+
mark the proxy in the pool and the call is retried through another proxy
|
|
14
|
+
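up to ``max_retries`` times; if every attempt fails, the last connection error is raised or, when the failures were banned responses, the last such response is returned.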
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
import requests
|
|
20
|
+
except ImportError as exc: # pragma: no cover
|
|
21
|
+
raise ImportError("proxyspin.requests_adapter needs requests: pip install proxyspin[requests]") from exc
|
|
22
|
+
|
|
23
|
+
from .pool import ProxyPool
|
|
24
|
+
|
|
25
|
+
# Default HTTP status codes that RotatingSession treats as a failure of the
# proxy used for the request (the proxy is marked failed and another is tried).
DEFAULT_BAN_CODES = frozenset({403, 407, 429})
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class RotatingSession(requests.Session):
    """A ``requests.Session`` that sends every request through a pooled proxy.

    For each call a proxy is taken from ``pool``.  A connection error, a
    timeout, or a response whose status is in ``ban_codes`` marks that proxy
    as failed and the call is repeated through another proxy, up to
    ``max_retries`` extra attempts.  A response with any other status marks
    the proxy as OK and is returned immediately.

    Per-call controls (keyword arguments of ``request`` / ``get`` / ...):

    ``proxies``
        When given, the pool is bypassed and the call is forwarded unchanged.
    ``proxyspin_key``
        Passed to ``pool.get(key=...)``; never forwarded to ``requests``.
    """

    def __init__(
        self,
        pool: ProxyPool,
        *,
        max_retries: int = 3,
        ban_codes: frozenset[int] = DEFAULT_BAN_CODES,
    ) -> None:
        """Create the session.

        Args:
            pool: Source of proxies and sink for success/failure reports.
            max_retries: Number of additional attempts after the first one.
                Values below zero are treated as zero.
            ban_codes: Response status codes that count as a proxy failure.
        """
        super().__init__()
        self.pool = pool
        self.max_retries = max_retries
        self.ban_codes = ban_codes

    def request(self, method, url, **kwargs):  # type: ignore[override]
        """Send one request, rotating to another proxy on failure.

        Returns:
            The first response whose status is not in ``ban_codes``; if all
            attempts fail and the final attempt produced a banned response,
            that response.

        Raises:
            requests.ConnectionError, requests.Timeout: all attempts failed
                and the final attempt ended with this error.
            Whatever ``pool.get`` raises when it cannot supply a proxy.
        """
        # Remove our private keyword first: requests.Session.request() does
        # not know it and would raise TypeError if it were forwarded, which
        # used to happen when the caller also pinned ``proxies``.
        sticky_key = kwargs.pop("proxyspin_key", None)
        if "proxies" in kwargs:  # caller pinned a proxy — don't interfere
            return super().request(method, url, **kwargs)

        last_exc: Exception | None = None
        last_response = None
        # At least one attempt is always made, so one of the two "last_*"
        # variables is set when the loop ends without returning.
        attempts = max(self.max_retries, 0) + 1
        for _ in range(attempts):
            proxy = self.pool.get(key=sticky_key)
            kwargs["proxies"] = {"http": proxy.url, "https": proxy.url}
            if last_response is not None:
                # The previous banned response is being superseded and is
                # never handed to the caller; release its connection.
                last_response.close()
                last_response = None
            try:
                response = super().request(method, url, **kwargs)
            except (requests.ConnectionError, requests.Timeout) as exc:
                self.pool.mark_failed(proxy)
                last_exc = exc
                continue
            if response.status_code in self.ban_codes:
                self.pool.mark_failed(proxy)
                # Remember only the most recent outcome so that an older
                # connection error does not mask this response.
                last_exc = None
                last_response = response
                continue
            self.pool.mark_ok(proxy)
            return response

        if last_exc is not None:
            raise last_exc
        return last_response
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Scrapy downloader middleware with rotation, ban detection and retries.
|
|
2
|
+
|
|
3
|
+
Enable in ``settings.py``::
|
|
4
|
+
|
|
5
|
+
DOWNLOADER_MIDDLEWARES = {
|
|
6
|
+
"proxyspin.scrapy_middleware.ProxySpinMiddleware": 610,
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
PROXYSPIN_LIST = ["http://user:pass@gate.example.com:8000"]
|
|
10
|
+
# or PROXYSPIN_FILE = "proxies.txt"
|
|
11
|
+
# or PROXYSPIN_URL = "https://provider.example.com/api/list"
|
|
12
|
+
|
|
13
|
+
Optional settings (defaults shown)::
|
|
14
|
+
|
|
15
|
+
PROXYSPIN_STRATEGY = "round_robin" # round_robin | random | sticky
|
|
16
|
+
PROXYSPIN_BAN_CODES = [403, 407, 429] # responses treated as proxy bans
|
|
17
|
+
PROXYSPIN_MAX_RETRIES = 3 # per-request proxy switches
|
|
18
|
+
PROXYSPIN_MAX_FAILURES = 2 # pool: failures before cooldown
|
|
19
|
+
PROXYSPIN_COOLDOWN = 60.0 # pool: base cooldown, seconds
|
|
20
|
+
|
|
21
|
+
Per-request control via ``request.meta``:
|
|
22
|
+
|
|
23
|
+
``proxy`` set explicitly to bypass the pool
|
|
24
|
+
``proxyspin_disabled`` truthy to skip proxying this request
|
|
25
|
+
``proxyspin_key`` sticky key (defaults to the request host)
|
|
26
|
+
"""
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import logging
|
|
30
|
+
|
|
31
|
+
from .pool import NoHealthyProxies, ProxyPool
|
|
32
|
+
from .proxy import parse_proxy
|
|
33
|
+
|
|
34
|
+
logger = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ProxySpinMiddleware:
    """Scrapy downloader middleware that assigns pooled proxies to requests.

    ``request.meta`` keys read by this middleware:

    ``proxyspin_disabled``  truthy to leave the request untouched
    ``proxy``               if present and not set by this middleware, it is
                            treated as pinned by the user and left alone
    ``proxyspin_key``       key passed to ``pool.get``; defaults to the host
                            of the request URL

    ``request.meta`` keys written by this middleware:

    ``proxy``                   URL of the assigned proxy
    ``_proxyspin_managed``      marks the proxy as assigned by this middleware
    ``_proxyspin_proxy``        private copy of the assigned proxy URL, used
                                for health bookkeeping
    ``_proxyspin_preassigned``  one-shot flag on retry requests whose proxy
                                was already chosen by ``_retry``
    ``_proxyspin_retries``      number of proxy switches made so far
    """

    def __init__(self, pool: ProxyPool, ban_codes: frozenset[int], max_retries: int) -> None:
        """Store the pool, the ban status codes and the per-request retry limit."""
        self.pool = pool
        self.ban_codes = ban_codes
        self.max_retries = max_retries

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``PROXYSPIN_*`` settings.

        The proxy source is the first one configured among ``PROXYSPIN_URL``,
        ``PROXYSPIN_FILE`` and ``PROXYSPIN_LIST``.

        Raises:
            scrapy.exceptions.NotConfigured: none of the three is set.
        """
        settings = crawler.settings
        kwargs = {
            "strategy": settings.get("PROXYSPIN_STRATEGY", "round_robin"),
            "max_failures": settings.getint("PROXYSPIN_MAX_FAILURES", 2),
            "cooldown": settings.getfloat("PROXYSPIN_COOLDOWN", 60.0),
        }
        source_url = settings.get("PROXYSPIN_URL")
        source_file = settings.get("PROXYSPIN_FILE")
        if source_url:
            pool = ProxyPool.from_url(source_url, **kwargs)
        elif source_file:
            pool = ProxyPool.from_file(source_file, **kwargs)
        else:
            proxies = settings.getlist("PROXYSPIN_LIST")
            if not proxies:
                # Imported lazily so the module can be imported without scrapy.
                from scrapy.exceptions import NotConfigured

                raise NotConfigured(
                    "set PROXYSPIN_LIST, PROXYSPIN_FILE or PROXYSPIN_URL"
                )
            pool = ProxyPool(proxies, **kwargs)
        logger.info("proxyspin: loaded %d proxies", len(pool))
        ban_codes = frozenset(
            int(c) for c in settings.getlist("PROXYSPIN_BAN_CODES", [403, 407, 429])
        )
        return cls(pool, ban_codes, settings.getint("PROXYSPIN_MAX_RETRIES", 3))

    # ----------------------------------------------------------- lifecycle
    def process_request(self, request, spider):
        """Assign a proxy from the pool unless the request opts out.

        Always returns ``None`` so that Scrapy continues processing.
        """
        meta = request.meta
        if meta.get("proxyspin_disabled"):
            return None
        if "proxy" in meta and not meta.get("_proxyspin_managed"):
            return None  # user pinned a proxy explicitly — leave it alone
        if meta.pop("_proxyspin_preassigned", False):
            # This is a retry built by _retry(), which already took a proxy
            # from the pool.  Picking again here would discard that choice
            # and consume a second proxy from the pool for the same attempt.
            return None
        proxy = self.pool.get(key=self._sticky_key(request))
        meta["proxy"] = proxy.url
        meta["_proxyspin_proxy"] = proxy.url
        meta["_proxyspin_managed"] = True
        return None

    def process_response(self, request, response, spider):
        """Report the outcome to the pool; retry banned responses.

        Returns a new request (through another proxy) when the status is in
        ``ban_codes`` and a retry is still possible, else the response.
        """
        proxy_url = self._assigned_url(request)
        if proxy_url is None:
            return response
        if response.status in self.ban_codes:
            self.pool.mark_failed(parse_proxy(proxy_url))
            retried = self._retry(request, spider, reason=f"HTTP {response.status}")
            if retried is not None:
                return retried
        else:
            self.pool.mark_ok(parse_proxy(proxy_url))
        return response

    def process_exception(self, request, exception, spider):
        """Mark the assigned proxy as failed and retry through another one.

        Returns ``None`` for requests this middleware does not manage or
        when no retry is possible.
        """
        proxy_url = self._assigned_url(request)
        if proxy_url is None:
            return None
        self.pool.mark_failed(parse_proxy(proxy_url))
        return self._retry(request, spider, reason=repr(exception))

    # ------------------------------------------------------------- helpers
    @staticmethod
    def _sticky_key(request):
        """Return the pool key for a request: ``proxyspin_key`` or the URL host."""
        return request.meta.get("proxyspin_key") or _host(request.url)

    @staticmethod
    def _assigned_url(request):
        """Return the proxy URL this middleware assigned, or ``None``.

        The private ``_proxyspin_proxy`` copy is preferred over
        ``meta["proxy"]`` because middlewares that run after this one may
        rewrite ``meta["proxy"]`` (Scrapy's HttpProxyMiddleware, for example,
        moves credentials into a header), and the pool must be told about
        exactly the proxy it handed out.
        """
        meta = request.meta
        if not meta.get("_proxyspin_managed"):
            return None
        return meta.get("_proxyspin_proxy") or meta.get("proxy") or None

    def _retry(self, request, spider, *, reason: str):
        """Return a copy of ``request`` bound to another proxy, or ``None``.

        ``None`` is returned when the request already used ``max_retries``
        proxy switches or the pool has no healthy proxy left.
        """
        retries = request.meta.get("_proxyspin_retries", 0)
        if retries >= self.max_retries:
            logger.warning(
                "proxyspin: giving up on %s after %d proxy switches (%s)",
                request.url, retries, reason,
            )
            return None
        try:
            proxy = self.pool.get(key=self._sticky_key(request))
        except NoHealthyProxies:
            logger.warning("proxyspin: no healthy proxies left for %s", request.url)
            return None
        retry = request.copy()
        retry.meta["proxy"] = proxy.url
        retry.meta["_proxyspin_proxy"] = proxy.url
        retry.meta["_proxyspin_managed"] = True
        # Tell process_request() to keep this proxy when the retry is scheduled.
        retry.meta["_proxyspin_preassigned"] = True
        retry.meta["_proxyspin_retries"] = retries + 1
        # The URL was already seen by the dupefilter; do not drop the retry.
        retry.dont_filter = True
        logger.debug("proxyspin: retrying %s via %s (%s)", request.url, proxy.address, reason)
        return retry
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _host(url: str) -> str:
|
|
125
|
+
from urllib.parse import urlsplit
|
|
126
|
+
|
|
127
|
+
return urlsplit(url).hostname or url
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import unittest
|
|
3
|
+
|
|
4
|
+
from proxyspin import NoHealthyProxies, Proxy, ProxyPool, parse_proxy
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TestParse(unittest.TestCase):
    """parse_proxy accepts each supported layout and rejects malformed lines."""

    def test_url_with_auth(self):
        # Percent-encoded credentials are decoded on parse and re-encoded in .url.
        parsed = parse_proxy("http://user:p%40ss@10.0.0.1:8000")
        fields = (parsed.host, parsed.port, parsed.username, parsed.password)
        self.assertEqual(fields, ("10.0.0.1", 8000, "user", "p@ss"))
        self.assertEqual(parsed.url, "http://user:p%40ss@10.0.0.1:8000")

    def test_socks(self):
        parsed = parse_proxy("socks5://10.0.0.1:1080")
        self.assertEqual(parsed.scheme, "socks5")

    def test_host_port(self):
        expected = Proxy("10.0.0.1", 8000)
        self.assertEqual(parse_proxy("10.0.0.1:8000"), expected)

    def test_host_port_user_pass(self):
        parsed = parse_proxy("10.0.0.1:8000:alice:secret")
        self.assertEqual(parsed.username, "alice")
        self.assertEqual(parsed.password, "secret")

    def test_user_pass_at_host_port(self):
        parsed = parse_proxy("alice:secret@10.0.0.1:8000")
        fields = (parsed.username, parsed.password, parsed.port)
        self.assertEqual(fields, ("alice", "secret", 8000))

    def test_invalid(self):
        # Empty, no port, non-numeric port, unsupported scheme, wrong field count.
        bad_lines = ("", "hostonly", "10.0.0.1:notaport", "ftp://10.0.0.1:21", "a:1:b:c:d")
        for bad in bad_lines:
            with self.assertRaises(ValueError, msg=bad):
                parse_proxy(bad)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class TestPool(unittest.TestCase):
    """Rotation, de-duplication, cooldown, sticky keys and stats of ProxyPool."""

    def proxies(self, n=3):
        """Return ``n`` distinct ``host:port`` lines (10.0.0.1 .. 10.0.0.n)."""
        lines = []
        for index in range(n):
            lines.append(f"10.0.0.{index + 1}:8000")
        return lines

    def test_round_robin_cycles(self):
        pool = ProxyPool(self.proxies())
        addresses = set()
        for _ in range(3):
            addresses.add(pool.get().address)
        self.assertEqual(len(addresses), 3)

    def test_dedup(self):
        # The same endpoint written with and without a scheme counts once.
        duplicated = ["10.0.0.1:8000", "http://10.0.0.1:8000"]
        pool = ProxyPool(duplicated)
        self.assertEqual(len(pool), 1)

    def test_cooldown_and_recovery(self):
        pool = ProxyPool(self.proxies(2), max_failures=1, cooldown=0.05)
        bad = pool.get()
        pool.mark_failed(bad)
        self.assertEqual(pool.healthy_count, 1)
        # While cooling down, the failed proxy must never be handed out.
        for _ in range(5):
            handed_out = pool.get()
            self.assertNotEqual(handed_out, bad)
        # Wait slightly longer than the cooldown before checking recovery.
        time.sleep(0.06)
        self.assertEqual(pool.healthy_count, 2)

    def test_success_resets_failures(self):
        pool = ProxyPool(self.proxies(1), max_failures=2)
        only = pool.get()
        pool.mark_failed(only)
        pool.mark_ok(only)
        pool.mark_failed(only)
        self.assertEqual(pool.healthy_count, 1)  # streak broken, not banned

    def test_all_banned_raises(self):
        pool = ProxyPool(self.proxies(2), max_failures=1, cooldown=60)
        for _ in range(2):
            victim = pool.get()
            pool.mark_failed(victim)
        with self.assertRaises(NoHealthyProxies):
            pool.get()

    def test_sticky_keeps_proxy_per_key(self):
        pool = ProxyPool(self.proxies(3), strategy="sticky")
        first = pool.get(key="acct-1")
        for _ in range(5):
            again = pool.get(key="acct-1")
            self.assertEqual(again, first)

    def test_sticky_reassigns_after_ban(self):
        pool = ProxyPool(self.proxies(2), strategy="sticky", max_failures=1, cooldown=60)
        first = pool.get(key="acct-1")
        pool.mark_failed(first)
        replacement = pool.get(key="acct-1")
        self.assertNotEqual(replacement, first)

    def test_stats(self):
        pool = ProxyPool(self.proxies(1))
        only = pool.get()
        pool.mark_ok(only)
        per_proxy = pool.stats()
        self.assertEqual(per_proxy[only.address]["successes"], 1)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class TestPlaywrightHelper(unittest.TestCase):
    """proxy_settings builds the expected settings mapping."""

    def test_settings_with_auth(self):
        # Imported inside the test, as in the other helper test.
        from proxyspin.playwright_helper import proxy_settings

        pool = ProxyPool(["http://u:p@10.0.0.1:8000"])
        expected = {"server": "http://10.0.0.1:8000", "username": "u", "password": "p"}
        self.assertEqual(proxy_settings(pool), expected)

    def test_settings_bare_proxy(self):
        from proxyspin.playwright_helper import proxy_settings

        bare = Proxy("10.0.0.1", 8000)
        expected = {"server": "http://10.0.0.1:8000"}
        self.assertEqual(proxy_settings(bare), expected)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|