proxyspin 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ __pycache__/
2
+ *.egg-info/
3
+ dist/
4
+ .venv/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 GProxy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: proxyspin
3
+ Version: 0.1.0
4
+ Summary: Rotating proxy pool for Scrapy, Playwright and requests — health tracking, ban detection, sticky sessions. Zero dependencies.
5
+ Project-URL: Homepage, https://github.com/gproxynet/proxyspin
6
+ Project-URL: Issues, https://github.com/gproxynet/proxyspin/issues
7
+ Author-email: GProxy <support@gproxy.net>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: anti-ban,playwright,proxy,proxy-pool,proxy-rotation,requests,rotating-proxies,scrapy,web-scraping
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Framework :: Scrapy
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Topic :: Internet :: WWW/HTTP
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Requires-Python: >=3.9
18
+ Provides-Extra: playwright
19
+ Requires-Dist: playwright>=1.30; extra == 'playwright'
20
+ Provides-Extra: requests
21
+ Requires-Dist: requests>=2.25; extra == 'requests'
22
+ Provides-Extra: scrapy
23
+ Requires-Dist: scrapy>=2.5; extra == 'scrapy'
24
+ Description-Content-Type: text/markdown
25
+
26
+ # proxyspin
27
+
28
+ Rotating proxy pool for **Scrapy**, **Playwright** and **requests** — with health tracking, ban detection, sticky sessions and a built-in list checker. Zero required dependencies, pure stdlib.
29
+
30
+ ```
31
+ pip install proxyspin
32
+ ```
33
+
34
+ Why another one? The classic `scrapy-rotating-proxies` hasn't been updated in years and is Scrapy-only. `proxyspin` is one small pool you can share across Scrapy spiders, Playwright contexts and plain `requests` code, with the same health model everywhere.
35
+
36
+ ## Features
37
+
38
+ - **One pool, three integrations** — Scrapy middleware, Playwright helper, `requests` session.
39
+ - **Rotation strategies** — `round_robin`, `random`, `sticky` (same proxy per domain/account/worker until it dies).
40
+ - **Health model** — a proxy failing N times in a row is benched with exponential backoff and returns automatically; a success resets its streak.
41
+ - **Ban detection** — configurable HTTP codes (403/407/429 by default) count as proxy failures and trigger a retry through the next proxy.
42
+ - **Any list format** — `scheme://user:pass@host:port`, `host:port`, `host:port:user:pass`, `user:pass@host:port`; load from a list, a file or a provider URL.
43
+ - **CLI checker** — `proxyspin check proxies.txt` tests the whole list concurrently and can write the alive ones out.
44
+
45
+ ## Quickstart
46
+
47
+ ### The pool
48
+
49
+ ```python
50
+ from proxyspin import ProxyPool
51
+
52
+ pool = ProxyPool.from_file("proxies.txt", strategy="round_robin")
53
+ # or inline:
54
+ pool = ProxyPool(["http://user:pass@gate1.example.com:8000", "10.0.0.2:8000"])
55
+ # or straight from your provider's export endpoint:
56
+ pool = ProxyPool.from_url("https://provider.example.com/api/my-list.txt")
57
+
58
+ proxy = pool.get() # -> Proxy; proxy.url is ready to use
59
+ pool.mark_failed(proxy) # bench it after repeated failures
60
+ pool.mark_ok(proxy) # reset its failure streak
61
+ ```
62
+
63
+ ### Scrapy
64
+
65
+ ```python
66
+ # settings.py
67
+ DOWNLOADER_MIDDLEWARES = {
68
+ "proxyspin.scrapy_middleware.ProxySpinMiddleware": 610,
69
+ }
70
+ PROXYSPIN_FILE = "proxies.txt"
71
+ PROXYSPIN_STRATEGY = "sticky" # one proxy per target host
72
+ PROXYSPIN_BAN_CODES = [403, 429] # these responses rotate the proxy
73
+ PROXYSPIN_MAX_RETRIES = 3
74
+ ```
75
+
76
+ Per-request overrides: set `request.meta["proxy"]` to pin a proxy, `meta["proxyspin_disabled"] = True` to skip proxying, `meta["proxyspin_key"]` to control stickiness.
77
+
78
+ ### Playwright
79
+
80
+ ```python
81
+ from playwright.sync_api import sync_playwright
82
+ from proxyspin import ProxyPool
83
+ from proxyspin.playwright_helper import proxy_settings
84
+
85
+ pool = ProxyPool.from_file("proxies.txt", strategy="sticky")
86
+
87
+ with sync_playwright() as p:
88
+ browser = p.chromium.launch()
89
+ for account in accounts:
90
+ context = browser.new_context(proxy=proxy_settings(pool, key=account.id))
91
+ # each account keeps its own IP for the whole session
92
+ ```
93
+
94
+ ### requests
95
+
96
+ ```python
97
+ from proxyspin import ProxyPool
98
+ from proxyspin.requests_adapter import RotatingSession
99
+
100
+ session = RotatingSession(ProxyPool.from_file("proxies.txt"))
101
+ print(session.get("https://httpbin.org/ip").json()) # new IP per call
102
+ ```
103
+
104
+ ### Check a list
105
+
106
+ ```
107
+ $ proxyspin check proxies.txt --workers 100 --alive-out alive.txt
108
+ OK 45.155.10.4:8000 612 ms HTTP 200
109
+ DEAD 91.10.77.2:3128 TimeoutError
110
+ ...
111
+ 118/200 alive
112
+ wrote 118 proxies to alive.txt
113
+ ```
114
+
115
+ ## Providers
116
+
117
+ `proxyspin` works with any proxy source: your own servers, free lists, or commercial providers. Example with [GProxy](https://gproxy.net/?utm_source=github&utm_medium=readme&utm_campaign=proxyspin) residential/mobile gateways (rotation happens server-side, so a pool of one entry per gateway is enough — use `sticky` if you need session pinning):
118
+
119
+ ```python
120
+ pool = ProxyPool([
121
+ "http://USER:PASS@gate.gproxy.net:8000", # residential, rotating
122
+ ])
123
+ ```
124
+
125
+ Any other provider works the same way — put its gateway or IP list into the pool.
126
+
127
+ ## Health model in one paragraph
128
+
129
+ Every proxy starts healthy. `mark_failed` increments its failure streak; when the streak reaches `max_failures` (default 2) the proxy is benched for `cooldown * 2**overshoot` seconds, where `overshoot` is the number of consecutive failures beyond `max_failures` (default base 60 s, capped at 1 h), then automatically rejoins rotation. `mark_ok` resets the streak. The Scrapy middleware and `RotatingSession` call these for you based on exceptions and ban codes; with Playwright you call them yourself since only your code knows what a "ban" looks like for your flow.
130
+
131
+ ## License
132
+
133
+ MIT
@@ -0,0 +1,108 @@
1
+ # proxyspin
2
+
3
+ Rotating proxy pool for **Scrapy**, **Playwright** and **requests** — with health tracking, ban detection, sticky sessions and a built-in list checker. Zero required dependencies, pure stdlib.
4
+
5
+ ```
6
+ pip install proxyspin
7
+ ```
8
+
9
+ Why another one? The classic `scrapy-rotating-proxies` hasn't been updated in years and is Scrapy-only. `proxyspin` is one small pool you can share across Scrapy spiders, Playwright contexts and plain `requests` code, with the same health model everywhere.
10
+
11
+ ## Features
12
+
13
+ - **One pool, three integrations** — Scrapy middleware, Playwright helper, `requests` session.
14
+ - **Rotation strategies** — `round_robin`, `random`, `sticky` (same proxy per domain/account/worker until it dies).
15
+ - **Health model** — a proxy failing N times in a row is benched with exponential backoff and returns automatically; a success resets its streak.
16
+ - **Ban detection** — configurable HTTP codes (403/407/429 by default) count as proxy failures and trigger a retry through the next proxy.
17
+ - **Any list format** — `scheme://user:pass@host:port`, `host:port`, `host:port:user:pass`, `user:pass@host:port`; load from a list, a file or a provider URL.
18
+ - **CLI checker** — `proxyspin check proxies.txt` tests the whole list concurrently and can write the alive ones out.
19
+
20
+ ## Quickstart
21
+
22
+ ### The pool
23
+
24
+ ```python
25
+ from proxyspin import ProxyPool
26
+
27
+ pool = ProxyPool.from_file("proxies.txt", strategy="round_robin")
28
+ # or inline:
29
+ pool = ProxyPool(["http://user:pass@gate1.example.com:8000", "10.0.0.2:8000"])
30
+ # or straight from your provider's export endpoint:
31
+ pool = ProxyPool.from_url("https://provider.example.com/api/my-list.txt")
32
+
33
+ proxy = pool.get() # -> Proxy; proxy.url is ready to use
34
+ pool.mark_failed(proxy) # bench it after repeated failures
35
+ pool.mark_ok(proxy) # reset its failure streak
36
+ ```
37
+
38
+ ### Scrapy
39
+
40
+ ```python
41
+ # settings.py
42
+ DOWNLOADER_MIDDLEWARES = {
43
+ "proxyspin.scrapy_middleware.ProxySpinMiddleware": 610,
44
+ }
45
+ PROXYSPIN_FILE = "proxies.txt"
46
+ PROXYSPIN_STRATEGY = "sticky" # one proxy per target host
47
+ PROXYSPIN_BAN_CODES = [403, 429] # these responses rotate the proxy
48
+ PROXYSPIN_MAX_RETRIES = 3
49
+ ```
50
+
51
+ Per-request overrides: set `request.meta["proxy"]` to pin a proxy, `meta["proxyspin_disabled"] = True` to skip proxying, `meta["proxyspin_key"]` to control stickiness.
52
+
53
+ ### Playwright
54
+
55
+ ```python
56
+ from playwright.sync_api import sync_playwright
57
+ from proxyspin import ProxyPool
58
+ from proxyspin.playwright_helper import proxy_settings
59
+
60
+ pool = ProxyPool.from_file("proxies.txt", strategy="sticky")
61
+
62
+ with sync_playwright() as p:
63
+ browser = p.chromium.launch()
64
+ for account in accounts:
65
+ context = browser.new_context(proxy=proxy_settings(pool, key=account.id))
66
+ # each account keeps its own IP for the whole session
67
+ ```
68
+
69
+ ### requests
70
+
71
+ ```python
72
+ from proxyspin import ProxyPool
73
+ from proxyspin.requests_adapter import RotatingSession
74
+
75
+ session = RotatingSession(ProxyPool.from_file("proxies.txt"))
76
+ print(session.get("https://httpbin.org/ip").json()) # new IP per call
77
+ ```
78
+
79
+ ### Check a list
80
+
81
+ ```
82
+ $ proxyspin check proxies.txt --workers 100 --alive-out alive.txt
83
+ OK 45.155.10.4:8000 612 ms HTTP 200
84
+ DEAD 91.10.77.2:3128 TimeoutError
85
+ ...
86
+ 118/200 alive
87
+ wrote 118 proxies to alive.txt
88
+ ```
89
+
90
+ ## Providers
91
+
92
+ `proxyspin` works with any proxy source: your own servers, free lists, or commercial providers. Example with [GProxy](https://gproxy.net/?utm_source=github&utm_medium=readme&utm_campaign=proxyspin) residential/mobile gateways (rotation happens server-side, so a pool of one entry per gateway is enough — use `sticky` if you need session pinning):
93
+
94
+ ```python
95
+ pool = ProxyPool([
96
+ "http://USER:PASS@gate.gproxy.net:8000", # residential, rotating
97
+ ])
98
+ ```
99
+
100
+ Any other provider works the same way — put its gateway or IP list into the pool.
101
+
102
+ ## Health model in one paragraph
103
+
104
+ Every proxy starts healthy. `mark_failed` increments its failure streak; when the streak reaches `max_failures` (default 2) the proxy is benched for `cooldown * 2**overshoot` seconds, where `overshoot` is the number of consecutive failures beyond `max_failures` (default base 60 s, capped at 1 h), then automatically rejoins rotation. `mark_ok` resets the streak. The Scrapy middleware and `RotatingSession` call these for you based on exceptions and ban codes; with Playwright you call them yourself since only your code knows what a "ban" looks like for your flow.
105
+
106
+ ## License
107
+
108
+ MIT
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "proxyspin"
7
+ version = "0.1.0"
8
+ description = "Rotating proxy pool for Scrapy, Playwright and requests — health tracking, ban detection, sticky sessions. Zero dependencies."
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.9"
12
+ authors = [{ name = "GProxy", email = "support@gproxy.net" }]
13
+ keywords = [
14
+ "proxy", "rotating-proxies", "scrapy", "playwright", "requests",
15
+ "web-scraping", "proxy-rotation", "proxy-pool", "anti-ban",
16
+ ]
17
+ classifiers = [
18
+ "Development Status :: 4 - Beta",
19
+ "Intended Audience :: Developers",
20
+ "Programming Language :: Python :: 3",
21
+ "Topic :: Internet :: WWW/HTTP",
22
+ "Topic :: Software Development :: Libraries :: Python Modules",
23
+ "Framework :: Scrapy",
24
+ ]
25
+
26
+ [project.optional-dependencies]
27
+ requests = ["requests>=2.25"]
28
+ scrapy = ["scrapy>=2.5"]
29
+ playwright = ["playwright>=1.30"]
30
+
31
+ [project.scripts]
32
+ proxyspin = "proxyspin.cli:main"
33
+
34
+ [project.urls]
35
+ Homepage = "https://github.com/gproxynet/proxyspin"
36
+ Issues = "https://github.com/gproxynet/proxyspin/issues"
37
+
38
+ [tool.hatch.build.targets.wheel]
39
+ packages = ["src/proxyspin"]
@@ -0,0 +1,7 @@
1
+ """proxyspin — rotating proxy pool for Scrapy, Playwright and requests."""
2
+
3
+ from .pool import NoHealthyProxies, ProxyPool
4
+ from .proxy import Proxy, parse_proxy
5
+
6
+ __version__ = "0.1.0"
7
+ __all__ = ["Proxy", "ProxyPool", "NoHealthyProxies", "parse_proxy", "__version__"]
@@ -0,0 +1,94 @@
1
+ """``proxyspin check`` — fast concurrent liveness check for a proxy list.
2
+
3
+ Zero dependencies: stdlib only (http/https proxies; socks lines are skipped).
4
+
5
+ ::
6
+
7
+ proxyspin check proxies.txt
8
+ proxyspin check proxies.txt --url https://example.com --timeout 5 --workers 100
9
+ proxyspin check proxies.txt --alive-out alive.txt
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import sys
15
+ import time
16
+ import urllib.error
17
+ import urllib.request
18
+ from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ from pathlib import Path
20
+
21
+ from .proxy import Proxy, parse_proxy
22
+
23
+ DEFAULT_TEST_URL = "http://httpbin.org/ip"
24
+
25
+
26
def check_one(proxy: Proxy, url: str, timeout: float) -> tuple[Proxy, float | None, str]:
    """Fetch ``url`` once through ``proxy`` and report what happened.

    Returns a ``(proxy, latency, detail)`` triple. ``latency`` is the elapsed
    time in seconds when the request succeeded and ``None`` otherwise;
    ``detail`` is ``"HTTP <status>"`` for any HTTP outcome, or the exception
    class name for every other failure.
    """
    # The same proxy URL serves both plain and TLS targets.
    routes = dict.fromkeys(("http", "https"), proxy.url)
    client = urllib.request.build_opener(urllib.request.ProxyHandler(routes))
    client.addheaders = [("User-Agent", "proxyspin-check")]
    began = time.monotonic()
    try:
        response = client.open(url, timeout=timeout)
        with response:
            # Read a small prefix so the body actually travels through the proxy.
            response.read(256)
            elapsed = time.monotonic() - began
            status = response.status
    except urllib.error.HTTPError as http_err:
        # An HTTP error status is reported as a failed check.
        return proxy, None, f"HTTP {http_err.code}"
    except Exception as err:
        return proxy, None, type(err).__name__
    return proxy, elapsed, f"HTTP {status}"
40
+
41
+
42
def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``proxyspin`` command line.

    Parses ``argv`` (argparse falls back to ``sys.argv`` when it is ``None``),
    loads the proxy list, checks every entry concurrently and prints one
    result line per proxy.

    Returns the process exit code: ``2`` when the list yields no checkable
    proxies, ``0`` when at least one proxy is alive, ``1`` otherwise.
    """
    cli = argparse.ArgumentParser(prog="proxyspin", description=__doc__)
    commands = cli.add_subparsers(dest="command", required=True)
    check_cmd = commands.add_parser("check", help="check which proxies in a list are alive")
    check_cmd.add_argument("file", help="proxy list, one per line (any common format)")
    check_cmd.add_argument("--url", default=DEFAULT_TEST_URL, help="URL fetched through each proxy")
    check_cmd.add_argument("--timeout", type=float, default=10.0)
    check_cmd.add_argument("--workers", type=int, default=50)
    check_cmd.add_argument("--alive-out", help="write working proxies to this file")
    opts = cli.parse_args(argv)

    # Collect every line that parses and that the stdlib checker can handle.
    candidates: list[Proxy] = []
    for raw in Path(opts.file).read_text().splitlines():
        entry = raw.strip()
        if entry == "" or entry.startswith("#"):
            continue
        try:
            parsed = parse_proxy(entry)
        except ValueError as exc:
            print(f" skip: {exc}", file=sys.stderr)
        else:
            if parsed.scheme.startswith("socks"):
                print(f" skip (socks not supported by checker): {parsed.address}", file=sys.stderr)
            else:
                candidates.append(parsed)

    if len(candidates) == 0:
        print("no proxies to check", file=sys.stderr)
        return 2

    # Fan the checks out over a thread pool and report each as it finishes.
    working: list[tuple[Proxy, float]] = []
    with ThreadPoolExecutor(max_workers=opts.workers) as executor:
        pending = [executor.submit(check_one, item, opts.url, opts.timeout) for item in candidates]
        for done in as_completed(pending):
            checked, seconds, note = done.result()
            if seconds is None:
                print(f"DEAD {checked.address:<21} {note}")
                continue
            working.append((checked, seconds))
            print(f"OK {checked.address:<21} {seconds * 1000:6.0f} ms {note}")

    print(f"\n{len(working)}/{len(candidates)} alive")
    if opts.alive_out:
        # Fastest proxies first, one URL per line.
        ranked = sorted(working, key=lambda pair: pair[1])
        body = "".join(f"{item.url}\n" for item, _ in ranked)
        Path(opts.alive_out).write_text(body)
        print(f"wrote {len(ranked)} proxies to {opts.alive_out}")
    return 0 if working else 1
91
+
92
+
93
+ if __name__ == "__main__": # pragma: no cover
94
+ sys.exit(main())
@@ -0,0 +1,38 @@
1
+ """Playwright integration: per-context rotating proxies.
2
+
3
+ Playwright accepts a proxy per browser launch or per context. The natural
4
+ rotation unit is the context::
5
+
6
+ from playwright.sync_api import sync_playwright
7
+ from proxyspin import ProxyPool
8
+ from proxyspin.playwright_helper import proxy_settings
9
+
10
+ pool = ProxyPool.from_file("proxies.txt", strategy="sticky")
11
+
12
+ with sync_playwright() as p:
13
+ browser = p.chromium.launch()
14
+ context = browser.new_context(proxy=proxy_settings(pool, key="job-42"))
15
+ page = context.new_page()
16
+ page.goto("https://example.com")
17
+
18
+ Async API is identical — ``proxy_settings`` is plain data, no I/O.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ from .pool import ProxyPool
23
+ from .proxy import Proxy
24
+
25
+
26
def proxy_settings(pool_or_proxy: ProxyPool | Proxy, key: str | None = None) -> dict:
    """Return the ``proxy`` mapping for a Playwright launch or context.

    ``pool_or_proxy`` may be a ``ProxyPool`` (a proxy is drawn from it with
    ``get(key=key)``) or an already chosen ``Proxy``, in which case ``key``
    is unused.

    The server entry never carries userinfo; username and password are added
    as their own keys, and only when they are set on the proxy.
    """
    if isinstance(pool_or_proxy, ProxyPool):
        chosen = pool_or_proxy.get(key=key)
    else:
        chosen = pool_or_proxy

    result: dict = {"server": f"{chosen.scheme}://{chosen.host}:{chosen.port}"}
    for name in ("username", "password"):
        value = getattr(chosen, name)
        if value is not None:
            result[name] = value
    return result
@@ -0,0 +1,167 @@
1
+ """Proxy pool: sources, rotation strategies and health tracking."""
2
+ from __future__ import annotations
3
+
4
+ import random
5
+ import threading
6
+ import time
7
+ import urllib.request
8
+ from collections.abc import Iterable
9
+ from pathlib import Path
10
+
11
+ from .proxy import Proxy, ProxyState, parse_proxy
12
+
13
+ STRATEGIES = ("round_robin", "random", "sticky")
14
+
15
+
16
class NoHealthyProxies(RuntimeError):
    """Raised by ``ProxyPool.get`` when no proxy is currently eligible.

    That is the case when every proxy in the pool is still inside its
    cooldown window.
    """
18
+
19
+
20
class ProxyPool:
    """Thread-safe rotating proxy pool with failure-based cooldowns.

    Rotation strategies:
        round_robin   cycle through healthy proxies in order (default)
        random        pick a healthy proxy at random
        sticky        keep returning the same proxy for a given ``key``
                      (e.g. a target domain) until it goes unhealthy

    Health model: once a proxy has failed ``max_failures`` times in a row it
    is benched for ``cooldown * 2**overshoot`` seconds (capped at
    ``max_cooldown``), where ``overshoot`` is the number of consecutive
    failures beyond ``max_failures``. It returns to rotation automatically
    when the cooldown expires. A success resets its failure counter.
    """

    def __init__(
        self,
        proxies: Iterable[Proxy | str] = (),
        *,
        strategy: str = "round_robin",
        max_failures: int = 2,
        cooldown: float = 60.0,
        max_cooldown: float = 3600.0,
        default_scheme: str = "http",
    ) -> None:
        """Create a pool and load ``proxies`` into it.

        Args:
            proxies: ``Proxy`` objects or raw proxy strings.
            strategy: one of ``STRATEGIES``.
            max_failures: consecutive failures before a proxy is benched.
            cooldown: base bench time in seconds.
            max_cooldown: upper bound for the bench time in seconds.
            default_scheme: scheme applied to strings that carry none.

        Raises:
            ValueError: if ``strategy`` is unknown or a string cannot be parsed.
        """
        if strategy not in STRATEGIES:
            raise ValueError(f"unknown strategy {strategy!r}, expected one of {STRATEGIES}")
        self.strategy = strategy
        self.max_failures = max_failures
        self.cooldown = cooldown
        self.max_cooldown = max_cooldown
        self.default_scheme = default_scheme
        self._lock = threading.Lock()
        self._states: dict[Proxy, ProxyState] = {}
        self._order: list[Proxy] = []
        self._rr_index = 0
        self._sticky: dict[str, Proxy] = {}
        self.extend(proxies)

    # ------------------------------------------------------------- sources
    @staticmethod
    def _proxy_lines(text: str) -> list[str]:
        """Return the lines of ``text`` that are neither blank nor ``#`` comments."""
        return [
            line
            for line in text.splitlines()
            if line.strip() and not line.lstrip().startswith("#")
        ]

    @classmethod
    def from_file(cls, path: str | Path, **kwargs) -> "ProxyPool":
        """One proxy per line; blank lines and ``#`` comments are skipped.

        The file is decoded as UTF-8 regardless of the platform locale, the
        same way ``from_url`` decodes its response. ``utf-8-sig`` also drops
        a leading byte-order mark so it cannot end up inside the first entry.
        """
        text = Path(path).read_text(encoding="utf-8-sig", errors="replace")
        return cls(cls._proxy_lines(text), **kwargs)

    @classmethod
    def from_url(cls, url: str, *, timeout: float = 15.0, **kwargs) -> "ProxyPool":
        """Fetch a plain-text proxy list (one per line) from a URL.

        Handy for provider endpoints that export your current proxy list.
        Blank lines and ``#`` comments are skipped.
        """
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            body = resp.read().decode("utf-8", "replace")
        return cls(cls._proxy_lines(body), **kwargs)

    def extend(self, proxies: Iterable[Proxy | str]) -> None:
        """Add proxies to the pool; entries already present are ignored."""
        with self._lock:
            for item in proxies:
                proxy = item if isinstance(item, Proxy) else parse_proxy(item, self.default_scheme)
                if proxy not in self._states:
                    self._states[proxy] = ProxyState(proxy)
                    self._order.append(proxy)

    # ------------------------------------------------------------ rotation
    def get(self, key: str | None = None) -> Proxy:
        """Return the next healthy proxy according to the pool strategy.

        ``key`` is only used by the ``sticky`` strategy (any hashable label:
        target domain, account id, worker name...). A sticky pool called
        without a key falls back to round-robin.

        Raises:
            NoHealthyProxies: if the pool is empty or every proxy is benched.
        """
        with self._lock:
            healthy = self._healthy_locked()
            if not healthy:
                if not self._order:
                    raise NoHealthyProxies("proxy pool is empty; add proxies before calling get()")
                raise NoHealthyProxies(
                    f"all {len(self._order)} proxies are cooling down; "
                    "retry later or add more proxies"
                )
            if self.strategy == "sticky" and key is not None:
                current = self._sticky.get(key)
                if current is not None and current in healthy:
                    return current
                choice = random.choice(healthy)
                self._sticky[key] = choice
                return choice
            if self.strategy == "random":
                return random.choice(healthy)
            # Use the cursor first and advance it afterwards, so the very
            # first call hands out the first proxy instead of skipping it.
            # The modulo keeps the cursor valid when the healthy set shrinks.
            index = self._rr_index % len(healthy)
            self._rr_index = index + 1
            return healthy[index]

    # -------------------------------------------------------------- health
    def mark_ok(self, proxy: Proxy) -> None:
        """Record a success: clear the failure streak and any active bench."""
        with self._lock:
            state = self._states.get(proxy)
            if state is not None:
                state.successes += 1
                state.failures = 0
                state.banned_until = 0.0

    def mark_failed(self, proxy: Proxy) -> None:
        """Record a failure and bench the proxy once its streak is long enough.

        Unknown proxies are ignored. When the proxy is benched, every sticky
        key pointing at it is released so those keys get a new proxy.
        """
        with self._lock:
            state = self._states.get(proxy)
            if state is None:
                return
            state.failures += 1
            if state.failures >= self.max_failures:
                overshoot = state.failures - self.max_failures
                # Clamp the exponent so a very long streak cannot raise
                # OverflowError; 2.0**1023 is the largest finite power of two
                # and the result is capped at max_cooldown anyway.
                factor = 2.0 ** min(overshoot, 1023)
                delay = min(self.cooldown * factor, self.max_cooldown)
                state.banned_until = time.monotonic() + delay
                self._forget_sticky_locked(proxy)

    def remove(self, proxy: Proxy) -> None:
        """Drop ``proxy`` from the pool along with its health and sticky state."""
        with self._lock:
            self._states.pop(proxy, None)
            if proxy in self._order:
                self._order.remove(proxy)
            self._forget_sticky_locked(proxy)

    def _forget_sticky_locked(self, proxy: Proxy) -> None:
        """Release every sticky key bound to ``proxy`` (caller holds the lock)."""
        stale = [key for key, bound in self._sticky.items() if bound == proxy]
        for key in stale:
            del self._sticky[key]

    # --------------------------------------------------------------- stats
    def _healthy_locked(self) -> list[Proxy]:
        """Proxies whose bench has expired, in insertion order (caller holds the lock)."""
        now = time.monotonic()
        return [p for p in self._order if self._states[p].banned_until <= now]

    @property
    def healthy_count(self) -> int:
        """Number of proxies that are not benched right now."""
        with self._lock:
            return len(self._healthy_locked())

    def __len__(self) -> int:
        """Total number of proxies in the pool, benched ones included."""
        return len(self._order)

    def stats(self) -> dict[str, dict]:
        """Snapshot of per-proxy health, keyed by ``host:port``.

        ``banned_for`` is the remaining bench time in seconds, rounded to one
        decimal and never negative.
        """
        now = time.monotonic()
        with self._lock:
            return {
                p.address: {
                    "successes": s.successes,
                    "failures": s.failures,
                    "banned_for": max(0.0, round(s.banned_until - now, 1)),
                }
                for p, s in self._states.items()
            }
@@ -0,0 +1,97 @@
1
+ """Proxy model and parsing of common list formats."""
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass, field
5
+ from urllib.parse import quote, unquote, urlsplit
6
+
7
+ SUPPORTED_SCHEMES = ("http", "https", "socks4", "socks5", "socks5h")
8
+
9
+
10
@dataclass(frozen=True)
class Proxy:
    """Immutable, hashable description of one proxy endpoint.

    ``username`` and ``password`` hold the raw (not percent-encoded)
    credentials; ``url`` encodes them when it builds the proxy URL.
    """

    host: str
    port: int
    scheme: str = "http"
    username: str | None = None
    password: str | None = None

    def _host_for_netloc(self) -> str:
        """Return ``host`` in the form required inside ``host:port``.

        A bare IPv6 literal contains colons and must be wrapped in square
        brackets, otherwise the port separator is ambiguous. Hosts that are
        already bracketed are returned unchanged.
        """
        if ":" in self.host and not self.host.startswith("["):
            return f"[{self.host}]"
        return self.host

    @property
    def url(self) -> str:
        """Full proxy URL, credentials included if present.

        The password is emitted only together with a username.
        """
        auth = ""
        if self.username is not None:
            # safe="" so that "@", ":" and "/" inside credentials are escaped.
            auth = quote(self.username, safe="")
            if self.password is not None:
                auth += ":" + quote(self.password, safe="")
            auth += "@"
        return f"{self.scheme}://{auth}{self._host_for_netloc()}:{self.port}"

    @property
    def address(self) -> str:
        """``host:port`` without scheme or credentials."""
        return f"{self._host_for_netloc()}:{self.port}"

    def __str__(self) -> str:  # pragma: no cover - convenience
        return self.url
35
+
36
+
37
def parse_proxy(line: str, default_scheme: str = "http") -> Proxy:
    """Turn one line of a proxy list into a ``Proxy``.

    Recognised shapes:
        scheme://user:pass@host:port
        scheme://host:port
        user:pass@host:port
        host:port
        host:port:user:pass

    ``default_scheme`` is used for every shape that carries no scheme.

    Raises:
        ValueError: if the line is empty, uses an unsupported scheme, or does
            not match any of the shapes above.
    """
    text = line.strip()
    if text == "":
        raise ValueError("empty proxy line")

    # Full URL form: let urlsplit do the work.
    if "://" in text:
        url = urlsplit(text)
        if url.scheme not in SUPPORTED_SCHEMES:
            raise ValueError(f"unsupported proxy scheme: {url.scheme!r}")
        if not (url.hostname and url.port):
            raise ValueError(f"proxy needs host and port: {text!r}")
        user = unquote(url.username) if url.username else None
        secret = unquote(url.password) if url.password else None
        return Proxy(
            host=url.hostname,
            port=url.port,
            scheme=url.scheme,
            username=user,
            password=secret,
        )

    # credentials@endpoint form: split on the last "@" so a password may contain one.
    if "@" in text:
        credentials, _, endpoint = text.rpartition("@")
        user, _, secret = credentials.partition(":")
        host, _, port = endpoint.rpartition(":")
        _validate(host, port, text)
        return Proxy(host, int(port), default_scheme, user, secret or None)

    # Colon-separated form: exactly host:port, optionally followed by user:pass.
    fields = text.split(":")
    if len(fields) not in (2, 4):
        raise ValueError(f"cannot parse proxy: {text!r}")
    host, port, *auth = fields
    _validate(host, port, text)
    return Proxy(host, int(port), default_scheme, *auth)
82
+
83
+
84
def _validate(host: str, port: str, line: str) -> None:
    """Raise ``ValueError`` unless ``host`` is non-empty and ``port`` is 1-65535.

    ``line`` is the original input and is only used in the error message.
    """
    well_formed = bool(host) and port.isdigit() and 0 < int(port) < 65536
    if not well_formed:
        raise ValueError(f"cannot parse proxy: {line!r}")
87
+
88
+
89
@dataclass
class ProxyState:
    """Mutable per-proxy health record that a pool keeps next to each ``Proxy``."""

    # The proxy this record belongs to.
    proxy: Proxy
    # Running failure count; compared against the pool's max_failures.
    failures: int = 0
    # Total number of recorded successes.
    successes: int = 0
    # time.monotonic() value until which the proxy is benched (0.0 = not benched).
    banned_until: float = 0.0
    # Free-form per-proxy data; each instance gets its own dict.
    extra: dict = field(default_factory=dict)
@@ -0,0 +1,63 @@
1
+ """requests integration: a Session that rotates proxies per request.
2
+
3
+ ::
4
+
5
+ from proxyspin import ProxyPool
6
+ from proxyspin.requests_adapter import RotatingSession
7
+
8
+ pool = ProxyPool.from_file("proxies.txt")
9
+ session = RotatingSession(pool)
10
+ r = session.get("https://httpbin.org/ip") # each call uses the next proxy
11
+
12
+ Failures (connect errors, or responses with a status in ``ban_codes``)
13
+ mark the proxy in the pool and the call is retried through another proxy
14
+ up to ``max_retries`` times before the last error is raised.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ try:
19
+ import requests
20
+ except ImportError as exc: # pragma: no cover
21
+ raise ImportError("proxyspin.requests_adapter needs requests: pip install proxyspin[requests]") from exc
22
+
23
+ from .pool import ProxyPool
24
+
25
+ DEFAULT_BAN_CODES = frozenset({403, 407, 429})
26
+
27
+
28
class RotatingSession(requests.Session):
    """``requests.Session`` that draws a proxy from a pool for every request.

    A connection error, a timeout, or a response whose status is in
    ``ban_codes`` marks the proxy as failed in the pool and the request is
    retried through another proxy, up to ``max_retries`` extra attempts.
    """

    def __init__(
        self,
        pool: ProxyPool,
        *,
        max_retries: int = 3,
        ban_codes: frozenset[int] = DEFAULT_BAN_CODES,
    ) -> None:
        """Bind the session to ``pool``.

        Args:
            pool: source of proxies and sink for health reports.
            max_retries: additional attempts after the first one.
            ban_codes: HTTP status codes treated as a proxy ban.
        """
        super().__init__()
        self.pool = pool
        self.max_retries = max_retries
        self.ban_codes = ban_codes

    def request(self, method, url, **kwargs):  # type: ignore[override]
        """Send a request through a rotating proxy, retrying on proxy failure.

        Extra keyword:
            proxyspin_key: sticky key forwarded to ``pool.get``.

        Passing ``proxies`` explicitly bypasses the pool entirely.

        Returns the first response whose status is not in ``ban_codes``. When
        every attempt fails, the most recent outcome is reported: the last
        exception is raised, or the last banned response is returned.
        Exceptions raised by ``pool.get`` propagate unchanged.
        """
        # Always strip our private keyword: requests.Session.request does not
        # accept unknown keyword arguments, so it must never be forwarded,
        # including on the pinned-proxy path below.
        sticky_key = kwargs.pop("proxyspin_key", None)
        if "proxies" in kwargs:  # caller pinned a proxy — don't interfere
            return super().request(method, url, **kwargs)

        last_exc: Exception | None = None
        last_response = None
        # Always make at least one attempt, even if max_retries is negative.
        attempts = max(self.max_retries, 0) + 1
        for _ in range(attempts):
            proxy = self.pool.get(key=sticky_key)
            kwargs["proxies"] = {"http": proxy.url, "https": proxy.url}
            try:
                response = super().request(method, url, **kwargs)
            except (requests.ConnectionError, requests.Timeout) as exc:
                self.pool.mark_failed(proxy)
                # Keep only the most recent outcome.
                last_exc, last_response = exc, None
                continue
            if response.status_code in self.ban_codes:
                self.pool.mark_failed(proxy)
                last_exc, last_response = None, response
                continue
            self.pool.mark_ok(proxy)
            return response

        if last_exc is not None:
            raise last_exc
        return last_response
@@ -0,0 +1,127 @@
1
+ """Scrapy downloader middleware with rotation, ban detection and retries.
2
+
3
+ Enable in ``settings.py``::
4
+
5
+ DOWNLOADER_MIDDLEWARES = {
6
+ "proxyspin.scrapy_middleware.ProxySpinMiddleware": 610,
7
+ }
8
+
9
+ PROXYSPIN_LIST = ["http://user:pass@gate.example.com:8000"]
10
+ # or PROXYSPIN_FILE = "proxies.txt"
11
+ # or PROXYSPIN_URL = "https://provider.example.com/api/list"
12
+
13
+ Optional settings (defaults shown)::
14
+
15
+ PROXYSPIN_STRATEGY = "round_robin" # round_robin | random | sticky
16
+ PROXYSPIN_BAN_CODES = [403, 407, 429] # responses treated as proxy bans
17
+ PROXYSPIN_MAX_RETRIES = 3 # per-request proxy switches
18
+ PROXYSPIN_MAX_FAILURES = 2 # pool: failures before cooldown
19
+ PROXYSPIN_COOLDOWN = 60.0 # pool: base cooldown, seconds
20
+
21
+ Per-request control via ``request.meta``:
22
+
23
+ ``proxy`` set explicitly to bypass the pool
24
+ ``proxyspin_disabled`` truthy to skip proxying this request
25
+ ``proxyspin_key`` sticky key (defaults to the request host)
26
+ """
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+
31
+ from .pool import NoHealthyProxies, ProxyPool
32
+ from .proxy import parse_proxy
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
class ProxySpinMiddleware:
    """Scrapy downloader middleware: proxy rotation, ban detection, retries.

    Requests handled by this middleware are tagged in ``request.meta`` with
    ``_proxyspin_managed``. Responses and exceptions for tagged requests are
    reported to the pool, and failures are retried through another proxy up
    to ``max_retries`` times per request.
    """

    def __init__(self, pool: "ProxyPool", ban_codes: frozenset[int], max_retries: int) -> None:
        """Store the pool, the ban status codes and the retry limit."""
        self.pool = pool
        self.ban_codes = ban_codes
        self.max_retries = max_retries

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``PROXYSPIN_*`` settings.

        The proxy source is the first one set of ``PROXYSPIN_URL``,
        ``PROXYSPIN_FILE`` and ``PROXYSPIN_LIST``.

        Raises:
            scrapy.exceptions.NotConfigured: when none of the three is set.
        """
        settings = crawler.settings
        kwargs = {
            "strategy": settings.get("PROXYSPIN_STRATEGY", "round_robin"),
            "max_failures": settings.getint("PROXYSPIN_MAX_FAILURES", 2),
            "cooldown": settings.getfloat("PROXYSPIN_COOLDOWN", 60.0),
        }
        if settings.get("PROXYSPIN_URL"):
            pool = ProxyPool.from_url(settings.get("PROXYSPIN_URL"), **kwargs)
        elif settings.get("PROXYSPIN_FILE"):
            pool = ProxyPool.from_file(settings.get("PROXYSPIN_FILE"), **kwargs)
        else:
            proxies = settings.getlist("PROXYSPIN_LIST")
            if not proxies:
                from scrapy.exceptions import NotConfigured

                raise NotConfigured(
                    "set PROXYSPIN_LIST, PROXYSPIN_FILE or PROXYSPIN_URL"
                )
            pool = ProxyPool(proxies, **kwargs)
        logger.info("proxyspin: loaded %d proxies", len(pool))
        ban_codes = frozenset(
            int(c) for c in settings.getlist("PROXYSPIN_BAN_CODES", [403, 407, 429])
        )
        return cls(pool, ban_codes, settings.getint("PROXYSPIN_MAX_RETRIES", 3))

    # ----------------------------------------------------------- lifecycle
    def process_request(self, request, spider):
        """Attach a pool proxy to ``request`` unless it opted out.

        Always returns ``None`` so Scrapy continues processing the request.
        """
        meta = request.meta
        if meta.get("proxyspin_disabled"):
            return None
        if "proxy" in meta and not meta.get("_proxyspin_managed"):
            return None  # user pinned a proxy explicitly — leave it alone
        # A retry built by _retry() already carries its proxy. Consume the
        # one-shot marker and keep that proxy instead of drawing a second
        # one from the pool; later passes (e.g. redirects) rotate as usual.
        if meta.pop("_proxyspin_preassigned", False) and meta.get("proxy"):
            return None
        proxy = self.pool.get(key=self._sticky_key(request))
        meta["proxy"] = proxy.url
        meta["_proxyspin_managed"] = True
        return None

    def process_response(self, request, response, spider):
        """Report the response to the pool; retry when it is a ban status.

        Returns the retry request when one could be built, otherwise the
        original response.
        """
        proxy_url = request.meta.get("proxy")
        if not request.meta.get("_proxyspin_managed") or not proxy_url:
            return response
        if response.status in self.ban_codes:
            self.pool.mark_failed(parse_proxy(proxy_url))
            retried = self._retry(request, spider, reason=f"HTTP {response.status}")
            if retried is not None:
                return retried
        else:
            self.pool.mark_ok(parse_proxy(proxy_url))
        return response

    def process_exception(self, request, exception, spider):
        """Count the exception against the proxy and try another one.

        Returns the retry request, or ``None`` when the request is not
        managed here or no retry is possible.
        """
        proxy_url = request.meta.get("proxy")
        if not request.meta.get("_proxyspin_managed") or not proxy_url:
            return None
        self.pool.mark_failed(parse_proxy(proxy_url))
        return self._retry(request, spider, reason=repr(exception))

    # ------------------------------------------------------------- helpers
    @staticmethod
    def _sticky_key(request):
        """Return the request's sticky key, defaulting to its host."""
        return request.meta.get("proxyspin_key") or _host(request.url)

    def _retry(self, request, spider, *, reason: str):
        """Return a copy of ``request`` bound to another proxy, or ``None``.

        ``None`` is returned when the request has used up ``max_retries``
        proxy switches or the pool has no healthy proxy left.
        """
        retries = request.meta.get("_proxyspin_retries", 0)
        if retries >= self.max_retries:
            logger.warning(
                "proxyspin: giving up on %s after %d proxy switches (%s)",
                request.url, retries, reason,
            )
            return None
        try:
            proxy = self.pool.get(key=self._sticky_key(request))
        except NoHealthyProxies:
            logger.warning("proxyspin: no healthy proxies left for %s", request.url)
            return None
        retry = request.copy()
        retry.meta["proxy"] = proxy.url
        # Tell process_request to keep this proxy for the coming pass.
        retry.meta["_proxyspin_preassigned"] = True
        retry.meta["_proxyspin_retries"] = retries + 1
        retry.dont_filter = True  # the URL was already seen by the dupefilter
        logger.debug("proxyspin: retrying %s via %s (%s)", request.url, proxy.address, reason)
        return retry
122
+
123
+
124
+ def _host(url: str) -> str:
125
+ from urllib.parse import urlsplit
126
+
127
+ return urlsplit(url).hostname or url
@@ -0,0 +1,110 @@
1
+ import time
2
+ import unittest
3
+
4
+ from proxyspin import NoHealthyProxies, Proxy, ProxyPool, parse_proxy
5
+
6
+
7
class TestParse(unittest.TestCase):
    # Covers each proxy string format accepted by parse_proxy, plus rejects.

    def test_url_with_auth(self):
        # Percent-encoded credentials are decoded on the fields but kept
        # encoded in the rebuilt URL.
        proxy = parse_proxy("http://user:p%40ss@10.0.0.1:8000")
        fields = (proxy.host, proxy.port, proxy.username, proxy.password)
        self.assertEqual(fields, ("10.0.0.1", 8000, "user", "p@ss"))
        self.assertEqual(proxy.url, "http://user:p%40ss@10.0.0.1:8000")

    def test_socks(self):
        proxy = parse_proxy("socks5://10.0.0.1:1080")
        self.assertEqual(proxy.scheme, "socks5")

    def test_host_port(self):
        expected = Proxy("10.0.0.1", 8000)
        self.assertEqual(parse_proxy("10.0.0.1:8000"), expected)

    def test_host_port_user_pass(self):
        proxy = parse_proxy("10.0.0.1:8000:alice:secret")
        self.assertEqual(proxy.username, "alice")
        self.assertEqual(proxy.password, "secret")

    def test_user_pass_at_host_port(self):
        proxy = parse_proxy("alice:secret@10.0.0.1:8000")
        fields = (proxy.username, proxy.password, proxy.port)
        self.assertEqual(fields, ("alice", "secret", 8000))

    def test_invalid(self):
        rejected = ("", "hostonly", "10.0.0.1:notaport", "ftp://10.0.0.1:21", "a:1:b:c:d")
        for candidate in rejected:
            with self.assertRaises(ValueError, msg=candidate):
                parse_proxy(candidate)
34
+
35
+
36
class TestPool(unittest.TestCase):
    # Exercises rotation, de-duplication, cooldown, sticky keys and stats.

    def proxies(self, n=3):
        # Addresses 10.0.0.1 .. 10.0.0.n, all on port 8000.
        return [f"10.0.0.{octet}:8000" for octet in range(1, n + 1)]

    def test_round_robin_cycles(self):
        pool = ProxyPool(self.proxies())
        addresses = set()
        for _ in range(3):
            addresses.add(pool.get().address)
        self.assertEqual(len(addresses), 3)

    def test_dedup(self):
        duplicates = ["10.0.0.1:8000", "http://10.0.0.1:8000"]
        self.assertEqual(len(ProxyPool(duplicates)), 1)

    def test_cooldown_and_recovery(self):
        pool = ProxyPool(self.proxies(2), max_failures=1, cooldown=0.05)
        banned = pool.get()
        pool.mark_failed(banned)
        self.assertEqual(pool.healthy_count, 1)
        for _ in range(5):
            self.assertNotEqual(pool.get(), banned)
        # Wait just past the cooldown so the banned proxy counts again.
        time.sleep(0.06)
        self.assertEqual(pool.healthy_count, 2)

    def test_success_resets_failures(self):
        pool = ProxyPool(self.proxies(1), max_failures=2)
        proxy = pool.get()
        pool.mark_failed(proxy)
        pool.mark_ok(proxy)
        pool.mark_failed(proxy)
        # streak broken, not banned
        self.assertEqual(pool.healthy_count, 1)

    def test_all_banned_raises(self):
        pool = ProxyPool(self.proxies(2), max_failures=1, cooldown=60)
        for _ in range(2):
            victim = pool.get()
            pool.mark_failed(victim)
        with self.assertRaises(NoHealthyProxies):
            pool.get()

    def test_sticky_keeps_proxy_per_key(self):
        pool = ProxyPool(self.proxies(3), strategy="sticky")
        assigned = pool.get(key="acct-1")
        for _ in range(5):
            self.assertEqual(pool.get(key="acct-1"), assigned)

    def test_sticky_reassigns_after_ban(self):
        pool = ProxyPool(self.proxies(2), strategy="sticky", max_failures=1, cooldown=60)
        assigned = pool.get(key="acct-1")
        pool.mark_failed(assigned)
        self.assertNotEqual(pool.get(key="acct-1"), assigned)

    def test_stats(self):
        pool = ProxyPool(self.proxies(1))
        proxy = pool.get()
        pool.mark_ok(proxy)
        per_proxy = pool.stats()[proxy.address]
        self.assertEqual(per_proxy["successes"], 1)
91
+
92
+
93
class TestPlaywrightHelper(unittest.TestCase):
    # Checks the dict that proxy_settings builds for a pool and a bare Proxy.

    def test_settings_with_auth(self):
        from proxyspin.playwright_helper import proxy_settings

        pool = ProxyPool(["http://u:p@10.0.0.1:8000"])
        expected = {"server": "http://10.0.0.1:8000", "username": "u", "password": "p"}
        self.assertEqual(proxy_settings(pool), expected)

    def test_settings_bare_proxy(self):
        from proxyspin.playwright_helper import proxy_settings

        bare = Proxy("10.0.0.1", 8000)
        expected = {"server": "http://10.0.0.1:8000"}
        self.assertEqual(proxy_settings(bare), expected)
107
+
108
+
109
+ if __name__ == "__main__":
110
+ unittest.main()