thordata-sdk 0.2.4__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +151 -0
- thordata/_example_utils.py +77 -0
- thordata/_utils.py +190 -0
- thordata/async_client.py +1675 -0
- thordata/client.py +1644 -0
- thordata/demo.py +138 -0
- thordata/enums.py +384 -0
- thordata/exceptions.py +355 -0
- thordata/models.py +1197 -0
- thordata/retry.py +382 -0
- thordata/serp_engines.py +166 -0
- thordata_sdk-1.2.0.dist-info/METADATA +208 -0
- thordata_sdk-1.2.0.dist-info/RECORD +16 -0
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-1.2.0.dist-info}/WHEEL +1 -1
- thordata_sdk-1.2.0.dist-info/licenses/LICENSE +21 -0
- thordata_sdk-1.2.0.dist-info/top_level.txt +1 -0
- thordata_sdk/__init__.py +0 -9
- thordata_sdk/async_client.py +0 -247
- thordata_sdk/client.py +0 -303
- thordata_sdk/enums.py +0 -20
- thordata_sdk/parameters.py +0 -41
- thordata_sdk-0.2.4.dist-info/LICENSE +0 -201
- thordata_sdk-0.2.4.dist-info/METADATA +0 -113
- thordata_sdk-0.2.4.dist-info/RECORD +0 -10
- thordata_sdk-0.2.4.dist-info/top_level.txt +0 -1
thordata/client.py
ADDED
@@ -0,0 +1,1644 @@
"""
Synchronous client for the Thordata API.

This module provides the main ThordataClient class for interacting with
Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.

Example:
    >>> from thordata import ThordataClient
    >>>
    >>> client = ThordataClient(
    ...     scraper_token="your_token",
    ...     public_token="your_public_token",
    ...     public_key="your_public_key"
    ... )
    >>>
    >>> # Use the proxy network
    >>> response = client.get("https://httpbin.org/ip")
    >>> print(response.json())
    >>>
    >>> # Search with SERP API
    >>> results = client.serp_search("python tutorial", engine="google")
"""

from __future__ import annotations

import base64
import contextlib
import hashlib
import logging
import os
import socket
import ssl
from datetime import date
from typing import Any, cast
from urllib.parse import urlencode, urlparse

import requests
import urllib3
from requests.structures import CaseInsensitiveDict

from .serp_engines import SerpNamespace

try:
    import socks

    HAS_PYSOCKS = True
except ImportError:
    HAS_PYSOCKS = False

from . import __version__ as _sdk_version
from ._utils import (
    build_auth_headers,
    build_builder_headers,
    build_public_api_headers,
    build_user_agent,
    decode_base64_image,
    extract_error_message,
    parse_json_response,
)
from .enums import Engine, ProxyType
from .exceptions import (
    ThordataConfigError,
    ThordataNetworkError,
    ThordataTimeoutError,
    raise_for_code,
)
from .models import (
    CommonSettings,
    ProxyConfig,
    ProxyProduct,
    ProxyServer,
    ProxyUserList,
    ScraperTaskConfig,
    SerpRequest,
    UniversalScrapeRequest,
    UsageStatistics,
    VideoTaskConfig,
)
from .retry import RetryConfig, with_retry

logger = logging.getLogger(__name__)


# =========================================================================
# Upstream Proxy Support (for users behind firewall)
# =========================================================================


def _parse_upstream_proxy() -> dict[str, Any] | None:
    """
    Parse THORDATA_UPSTREAM_PROXY environment variable.

    Supported formats:
    - http://127.0.0.1:7897
    - socks5://127.0.0.1:7897
    - socks5://user:pass@127.0.0.1:7897

    Returns:
        Dict with proxy config or None if not set.
    """
    upstream_url = os.environ.get("THORDATA_UPSTREAM_PROXY", "").strip()
    if not upstream_url:
        return None

    parsed = urlparse(upstream_url)
    scheme = (parsed.scheme or "").lower()

    if scheme not in ("http", "https", "socks5", "socks5h", "socks4"):
        logger.warning(f"Unsupported upstream proxy scheme: {scheme}")
        return None

    return {
        "scheme": scheme,
        "host": parsed.hostname or "127.0.0.1",
        "port": parsed.port or (1080 if scheme.startswith("socks") else 7897),
        "username": parsed.username,
        "password": parsed.password,
    }
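A quick illustration of what the helper above returns; it imports the private `_parse_upstream_proxy` function, so this is a sketch for review purposes only, with placeholder credentials:

```python
import os
from thordata.client import _parse_upstream_proxy  # private helper, illustrative

os.environ["THORDATA_UPSTREAM_PROXY"] = "socks5://user:pass@127.0.0.1:7897"
print(_parse_upstream_proxy())
# {'scheme': 'socks5', 'host': '127.0.0.1', 'port': 7897,
#  'username': 'user', 'password': 'pass'}
```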
class _UpstreamProxySocketFactory:
    """
    Socket factory that creates connections through an upstream proxy.
    Used for proxy chaining when accessing Thordata from behind a firewall.
    """

    def __init__(self, upstream_config: dict[str, Any]):
        self.config = upstream_config

    def create_connection(
        self,
        address: tuple[str, int],
        timeout: float | None = None,
        source_address: tuple[str, int] | None = None,
    ) -> socket.socket:
        """Create a socket connection through the upstream proxy."""
        scheme = self.config["scheme"]

        if scheme.startswith("socks"):
            return self._create_socks_connection(address, timeout)
        else:
            return self._create_http_tunnel(address, timeout)

    def _create_socks_connection(
        self,
        address: tuple[str, int],
        timeout: float | None = None,
    ) -> socket.socket:
        """Create connection through SOCKS proxy."""
        if not HAS_PYSOCKS:
            raise RuntimeError(
                "PySocks is required for SOCKS upstream proxy. "
                "Install with: pip install PySocks"
            )

        scheme = self.config["scheme"]
        proxy_type = socks.SOCKS5 if "socks5" in scheme else socks.SOCKS4

        sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
        sock.set_proxy(
            proxy_type,
            self.config["host"],
            self.config["port"],
            rdns=True,
            username=self.config.get("username"),
            password=self.config.get("password"),
        )

        if timeout is not None:
            sock.settimeout(timeout)

        sock.connect(address)
        return sock

    def _create_http_tunnel(
        self,
        address: tuple[str, int],
        timeout: float | None = None,
    ) -> socket.socket:
        """Create connection through HTTP CONNECT tunnel."""
        # Connect to upstream proxy
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        if timeout is not None:
            sock.settimeout(timeout)

        sock.connect((self.config["host"], self.config["port"]))

        # Build CONNECT request
        target_host, target_port = address
        connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
        connect_req += f"Host: {target_host}:{target_port}\r\n"

        # Add proxy auth if provided
        if self.config.get("username"):
            credentials = f"{self.config['username']}:{self.config.get('password', '')}"
            encoded = base64.b64encode(credentials.encode()).decode()
            connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"

        connect_req += "\r\n"

        sock.sendall(connect_req.encode())

        # Read response
        response = b""
        while b"\r\n\r\n" not in response:
            chunk = sock.recv(1024)
            if not chunk:
                raise ConnectionError("Upstream proxy closed connection")
            response += chunk

        # Check status
        status_line = response.split(b"\r\n")[0].decode()
        if "200" not in status_line:
            sock.close()
            raise ConnectionError(f"Upstream proxy CONNECT failed: {status_line}")

        return sock
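For reference, this is roughly the preamble `_create_http_tunnel` writes to the upstream proxy before it starts relaying bytes; the host, port, and credentials below are placeholders:

```python
# Sketch of the HTTP CONNECT preamble built above (placeholder values).
import base64

target_host, target_port = "pr.thordata.net", 9999
username, password = "user", "pass"  # hypothetical upstream credentials

token = base64.b64encode(f"{username}:{password}".encode()).decode()
connect_req = (
    f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
    f"Host: {target_host}:{target_port}\r\n"
    f"Proxy-Authorization: Basic {token}\r\n"
    "\r\n"
)
print(connect_req)
```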
class _TLSInTLSSocket:
    """
    A socket-like wrapper for TLS-in-TLS connections.

    Uses SSLObject + MemoryBIO to implement TLS over an existing TLS connection.
    """

    def __init__(
        self,
        outer_sock: ssl.SSLSocket,
        ssl_obj: ssl.SSLObject,
        incoming: ssl.MemoryBIO,
        outgoing: ssl.MemoryBIO,
    ):
        self._outer = outer_sock
        self._ssl = ssl_obj
        self._incoming = incoming
        self._outgoing = outgoing
        self._timeout: float | None = None

    def settimeout(self, timeout: float | None) -> None:
        self._timeout = timeout
        self._outer.settimeout(timeout)

    def sendall(self, data: bytes) -> None:
        """Send data through the inner TLS connection."""
        self._ssl.write(data)
        encrypted = self._outgoing.read()
        if encrypted:
            self._outer.sendall(encrypted)

    def recv(self, bufsize: int) -> bytes:
        """Receive data from the inner TLS connection."""
        while True:
            try:
                return self._ssl.read(bufsize)
            except ssl.SSLWantReadError:
                self._outer.settimeout(self._timeout)
                try:
                    received = self._outer.recv(8192)
                    if not received:
                        return b""
                    self._incoming.write(received)
                except socket.timeout:
                    return b""

    def close(self) -> None:
        with contextlib.suppress(Exception):
            self._outer.close()
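The class above builds on the standard-library `SSLObject`/`MemoryBIO` pattern. A self-contained sketch of that pattern, with no network I/O, so the handshake deliberately stops at the first `SSLWantReadError`:

```python
# Standalone sketch of the SSLObject + MemoryBIO pattern used by _TLSInTLSSocket.
import ssl

ctx = ssl.create_default_context()
incoming = ssl.MemoryBIO()   # ciphertext read from the outer TLS socket goes here
outgoing = ssl.MemoryBIO()   # ciphertext to write back out accumulates here
ssl_obj = ctx.wrap_bio(incoming, outgoing, server_hostname="example.com")

try:
    ssl_obj.do_handshake()
except ssl.SSLWantReadError:
    # The ClientHello is now buffered in `outgoing`; a real driver (like
    # _create_tls_in_tls_socket below) forwards it and pumps replies back in.
    print(len(outgoing.read()), "handshake bytes ready to forward")
```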
# =========================================================================
# Main Client Class
# =========================================================================


class ThordataClient:
    # API Endpoints
    BASE_URL = "https://scraperapi.thordata.com"
    UNIVERSAL_URL = "https://universalapi.thordata.com"
    API_URL = "https://openapi.thordata.com/api/web-scraper-api"
    LOCATIONS_URL = "https://openapi.thordata.com/api/locations"

    def __init__(
        self,
        scraper_token: str | None = None,  # Change: Optional
        public_token: str | None = None,
        public_key: str | None = None,
        proxy_host: str = "pr.thordata.net",
        proxy_port: int = 9999,
        timeout: int = 30,
        api_timeout: int = 60,
        retry_config: RetryConfig | None = None,
        auth_mode: str = "bearer",
        scraperapi_base_url: str | None = None,
        universalapi_base_url: str | None = None,
        web_scraper_api_base_url: str | None = None,
        locations_base_url: str | None = None,
    ) -> None:
        """Initialize the Thordata Client."""

        self.serp = SerpNamespace(self)

        self.scraper_token = scraper_token
        self.public_token = public_token
        self.public_key = public_key

        self._proxy_host = proxy_host
        self._proxy_port = proxy_port
        self._default_timeout = timeout
        self._api_timeout = api_timeout
        self._retry_config = retry_config or RetryConfig()

        self._auth_mode = auth_mode.lower()
        if self._auth_mode not in ("bearer", "header_token"):
            raise ThordataConfigError(
                f"Invalid auth_mode: {auth_mode}. Must be 'bearer' or 'header_token'."
            )

        self._proxy_session = requests.Session()
        self._proxy_session.trust_env = False
        self._proxy_managers: dict[str, urllib3.PoolManager] = {}

        self._api_session = requests.Session()
        self._api_session.trust_env = True
        self._api_session.headers.update(
            {"User-Agent": build_user_agent(_sdk_version, "requests")}
        )

        # Base URLs
        scraperapi_base = (
            scraperapi_base_url
            or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
            or self.BASE_URL
        ).rstrip("/")

        universalapi_base = (
            universalapi_base_url
            or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
            or self.UNIVERSAL_URL
        ).rstrip("/")

        web_scraper_api_base = (
            web_scraper_api_base_url
            or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
            or self.API_URL
        ).rstrip("/")

        locations_base = (
            locations_base_url
            or os.getenv("THORDATA_LOCATIONS_BASE_URL")
            or self.LOCATIONS_URL
        ).rstrip("/")

        gateway_base = os.getenv(
            "THORDATA_GATEWAY_BASE_URL", "https://api.thordata.com/api/gateway"
        )
        self._gateway_base_url = gateway_base
        self._child_base_url = os.getenv(
            "THORDATA_CHILD_BASE_URL", "https://api.thordata.com/api/child"
        )

        self._serp_url = f"{scraperapi_base}/request"
        self._builder_url = f"{scraperapi_base}/builder"
        self._video_builder_url = f"{scraperapi_base}/video_builder"
        self._universal_url = f"{universalapi_base}/request"

        self._status_url = f"{web_scraper_api_base}/tasks-status"
        self._download_url = f"{web_scraper_api_base}/tasks-download"
        self._list_url = f"{web_scraper_api_base}/tasks-list"

        self._locations_base_url = locations_base

        self._usage_stats_url = (
            f"{locations_base.replace('/locations', '')}/account/usage-statistics"
        )
        self._proxy_users_url = (
            f"{locations_base.replace('/locations', '')}/proxy-users"
        )

        whitelist_base = os.getenv(
            "THORDATA_WHITELIST_BASE_URL", "https://api.thordata.com/api"
        )
        self._whitelist_url = f"{whitelist_base}/whitelisted-ips"

        proxy_api_base = os.getenv(
            "THORDATA_PROXY_API_BASE_URL", "https://openapi.thordata.com/api"
        )
        self._proxy_list_url = f"{proxy_api_base}/proxy/proxy-list"
        self._proxy_expiration_url = f"{proxy_api_base}/proxy/expiration-time"
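A minimal construction sketch for the initializer above; the tokens are placeholders, and each base URL can alternatively come from the `THORDATA_*_BASE_URL` environment variables read in `__init__`:

```python
from thordata import ThordataClient
from thordata.retry import RetryConfig

client = ThordataClient(
    scraper_token="your_token",          # placeholder credentials
    public_token="your_public_token",
    public_key="your_public_key",
    timeout=30,                          # proxy-network request timeout (seconds)
    api_timeout=60,                      # API request timeout (seconds)
    retry_config=RetryConfig(),
    auth_mode="bearer",                  # or "header_token"
)
```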
    # =========================================================================
    # Proxy Network Methods
    # =========================================================================

    def get(
        self,
        url: str,
        *,
        proxy_config: ProxyConfig | None = None,
        timeout: int | None = None,
        **kwargs: Any,
    ) -> requests.Response:
        logger.debug(f"Proxy GET request: {url}")
        return self._proxy_verb("GET", url, proxy_config, timeout, **kwargs)

    def post(
        self,
        url: str,
        *,
        proxy_config: ProxyConfig | None = None,
        timeout: int | None = None,
        **kwargs: Any,
    ) -> requests.Response:
        logger.debug(f"Proxy POST request: {url}")
        return self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)

    def _proxy_verb(
        self,
        method: str,
        url: str,
        proxy_config: ProxyConfig | None,
        timeout: int | None,
        **kwargs: Any,
    ) -> requests.Response:
        timeout = timeout or self._default_timeout

        if proxy_config is None:
            proxy_config = self._get_default_proxy_config_from_env()

        if proxy_config is None:
            raise ThordataConfigError(
                "Proxy credentials are missing. "
                "Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
            )

        kwargs.pop("proxies", None)

        @with_retry(self._retry_config)
        def _do() -> requests.Response:
            return self._proxy_request_with_proxy_manager(
                method,
                url,
                proxy_config=proxy_config,  # type: ignore
                timeout=timeout,  # type: ignore
                headers=kwargs.pop("headers", None),
                params=kwargs.pop("params", None),
                data=kwargs.pop("data", None),
            )

        try:
            return _do()
        except requests.Timeout as e:
            raise ThordataTimeoutError(
                f"Request timed out: {e}", original_error=e
            ) from e
        except Exception as e:
            raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e

    def build_proxy_url(
        self,
        username: str,
        password: str,
        *,
        country: str | None = None,
        state: str | None = None,
        city: str | None = None,
        session_id: str | None = None,
        session_duration: int | None = None,
        product: ProxyProduct | str = ProxyProduct.RESIDENTIAL,
    ) -> str:
        config = ProxyConfig(
            username=username,
            password=password,
            host=self._proxy_host,
            port=self._proxy_port,
            product=product,
            country=country,
            state=state,
            city=city,
            session_id=session_id,
            session_duration=session_duration,
        )
        return config.build_proxy_url()
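A usage sketch for the proxy verbs above. The `ProxyConfig` fields mirror the keywords `build_proxy_url()` forwards to the model; this assumes the unset fields (host, port, and so on) have defaults, and the credentials and targeting values are placeholders:

```python
from thordata import ThordataClient
from thordata.models import ProxyConfig, ProxyProduct

client = ThordataClient()
cfg = ProxyConfig(
    username="proxy_user",               # placeholder credentials
    password="proxy_pass",
    product=ProxyProduct.RESIDENTIAL,
    country="us",
    session_id="sticky-001",             # sticky-session example
)
resp = client.get("https://httpbin.org/ip", proxy_config=cfg, timeout=30)
print(resp.status_code, resp.text)
```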
    # =========================================================================
    # Internal Request Helpers
    # =========================================================================

    def _api_request_with_retry(
        self,
        method: str,
        url: str,
        *,
        data: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
        params: dict[str, Any] | None = None,
    ) -> requests.Response:
        @with_retry(self._retry_config)
        def _do_request() -> requests.Response:
            return self._api_session.request(
                method,
                url,
                data=data,
                headers=headers,
                params=params,
                timeout=self._api_timeout,
            )

        try:
            return _do_request()
        except requests.Timeout as e:
            raise ThordataTimeoutError(
                f"API request timed out: {e}", original_error=e
            ) from e
        except requests.RequestException as e:
            raise ThordataNetworkError(
                f"API request failed: {e}", original_error=e
            ) from e

    def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
        """Build a stable cache key for ProxyManager instances."""
        if not userpass:
            return proxy_endpoint
        h = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
        return f"{proxy_endpoint}|auth={h}"

    def _get_proxy_manager(
        self,
        proxy_url: str,
        *,
        cache_key: str,
        proxy_headers: dict[str, str] | None = None,
    ) -> urllib3.PoolManager:
        """Get or create a ProxyManager for the given proxy URL (Pooled)."""
        cached = self._proxy_managers.get(cache_key)
        if cached is not None:
            return cached

        if proxy_url.startswith(("socks5://", "socks5h://", "socks4://", "socks4a://")):
            try:
                from urllib3.contrib.socks import SOCKSProxyManager
            except Exception as e:
                raise ThordataConfigError(
                    "SOCKS proxy requested but SOCKS dependencies are missing. "
                    "Install: pip install 'urllib3[socks]' or pip install PySocks"
                ) from e

            pm_socks = SOCKSProxyManager(
                proxy_url,
                num_pools=10,
                maxsize=10,
            )
            pm = cast(urllib3.PoolManager, pm_socks)
            self._proxy_managers[cache_key] = pm
            return pm

        # HTTP/HTTPS proxies
        proxy_ssl_context = None
        if proxy_url.startswith("https://"):
            proxy_ssl_context = ssl.create_default_context()

        pm_http = urllib3.ProxyManager(
            proxy_url,
            proxy_headers=proxy_headers,
            proxy_ssl_context=proxy_ssl_context,
            num_pools=10,
            maxsize=10,
        )

        pm = cast(urllib3.PoolManager, pm_http)
        self._proxy_managers[cache_key] = pm
        return pm
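The caching scheme above, reduced to a standalone sketch: one pooled `urllib3.ProxyManager` per endpoint-plus-credential-hash key, so different credentials never share a pool (endpoint and credentials below are placeholders):

```python
import hashlib
import urllib3

endpoint = "http://pr.thordata.net:9999"   # illustrative proxy endpoint
userpass = "proxy_user:proxy_pass"         # placeholder credentials
cache_key = f"{endpoint}|auth={hashlib.sha256(userpass.encode()).hexdigest()[:12]}"

managers: dict[str, urllib3.ProxyManager] = {}
if cache_key not in managers:
    managers[cache_key] = urllib3.ProxyManager(
        endpoint,
        proxy_headers=dict(urllib3.make_headers(proxy_basic_auth=userpass)),
        num_pools=10,
        maxsize=10,
    )
print(cache_key)
```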
    def _proxy_request_with_proxy_manager(
        self,
        method: str,
        url: str,
        *,
        proxy_config: ProxyConfig,
        timeout: int,
        headers: dict[str, str] | None = None,
        params: dict[str, Any] | None = None,
        data: Any = None,
    ) -> requests.Response:
        """Execute request through proxy, with optional upstream proxy support."""

        # Check for upstream proxy
        upstream_config = _parse_upstream_proxy()

        if upstream_config:
            return self._proxy_request_with_upstream(
                method,
                url,
                proxy_config=proxy_config,
                timeout=timeout,
                headers=headers,
                params=params,
                data=data,
                upstream_config=upstream_config,
            )

        # Original implementation (no upstream proxy)
        req = requests.Request(method=method.upper(), url=url, params=params)
        prepped = self._proxy_session.prepare_request(req)
        final_url = prepped.url or url

        proxy_endpoint = proxy_config.build_proxy_endpoint()
        is_socks = proxy_endpoint.startswith(
            ("socks5://", "socks5h://", "socks4://", "socks4a://")
        )

        if is_socks:
            proxy_url_for_manager = proxy_config.build_proxy_url()
            userpass = proxy_config.build_proxy_basic_auth()
            cache_key = self._proxy_manager_key(proxy_endpoint, userpass)

            pm = self._get_proxy_manager(
                proxy_url_for_manager,
                cache_key=cache_key,
                proxy_headers=None,
            )
        else:
            userpass = proxy_config.build_proxy_basic_auth()
            proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
            cache_key = self._proxy_manager_key(proxy_endpoint, userpass)

            pm = self._get_proxy_manager(
                proxy_endpoint,
                cache_key=cache_key,
                proxy_headers=dict(proxy_headers),
            )

        req_headers = dict(headers or {})
        body = None
        if data is not None:
            if isinstance(data, dict):
                body = urlencode({k: str(v) for k, v in data.items()})
                req_headers.setdefault(
                    "Content-Type", "application/x-www-form-urlencoded"
                )
            else:
                body = data

        http_resp = pm.request(
            method.upper(),
            final_url,
            body=body,
            headers=req_headers or None,
            timeout=urllib3.Timeout(connect=timeout, read=timeout),
            retries=False,
            preload_content=True,
        )

        r = requests.Response()
        r.status_code = int(getattr(http_resp, "status", 0) or 0)
        r._content = http_resp.data or b""
        r.url = final_url
        r.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
        return r

    # =========================================================================
    # Upstream Proxy Support (Proxy Chaining)
    # =========================================================================

    def _proxy_request_with_upstream(
        self,
        method: str,
        url: str,
        *,
        proxy_config: ProxyConfig,
        timeout: int,
        headers: dict[str, str] | None = None,
        params: dict[str, Any] | None = None,
        data: Any = None,
        upstream_config: dict[str, Any],
    ) -> requests.Response:
        """Execute request through proxy chain: Upstream -> Thordata -> Target."""
        if not HAS_PYSOCKS:
            raise ThordataConfigError(
                "PySocks is required for upstream proxy support. "
                "Install with: pip install PySocks"
            )

        req = requests.Request(method=method.upper(), url=url, params=params)
        prepped = self._proxy_session.prepare_request(req)
        final_url = prepped.url or url

        parsed_target = urlparse(final_url)
        target_host = parsed_target.hostname or ""
        target_port = parsed_target.port or (
            443 if parsed_target.scheme == "https" else 80
        )
        target_is_https = parsed_target.scheme == "https"

        protocol = proxy_config.protocol.lower()
        if protocol == "socks5":
            protocol = "socks5h"

        thordata_host = proxy_config.host or ""
        thordata_port = proxy_config.port or 9999
        thordata_username = proxy_config.build_username()
        thordata_password = proxy_config.password

        socket_factory = _UpstreamProxySocketFactory(upstream_config)

        logger.debug(
            f"Proxy chain: upstream({upstream_config['host']}:{upstream_config['port']}) "
            f"-> thordata({protocol}://{thordata_host}:{thordata_port}) "
            f"-> target({target_host}:{target_port})"
        )

        raw_sock = socket_factory.create_connection(
            (thordata_host, thordata_port),
            timeout=float(timeout),
        )

        try:
            if protocol.startswith("socks"):
                sock = self._socks5_handshake(
                    raw_sock,
                    target_host,
                    target_port,
                    thordata_username,
                    thordata_password,
                )
                if target_is_https:
                    context = ssl.create_default_context()
                    sock = context.wrap_socket(sock, server_hostname=target_host)

            elif protocol == "https":
                proxy_context = ssl.create_default_context()
                proxy_ssl_sock = proxy_context.wrap_socket(
                    raw_sock, server_hostname=thordata_host
                )

                self._send_connect_request(
                    proxy_ssl_sock,
                    target_host,
                    target_port,
                    thordata_username,
                    thordata_password,
                )

                if target_is_https:
                    sock = self._create_tls_in_tls_socket(
                        proxy_ssl_sock, target_host, timeout
                    )  # type: ignore[assignment]
                else:
                    sock = proxy_ssl_sock

            else:  # HTTP proxy
                self._send_connect_request(
                    raw_sock,
                    target_host,
                    target_port,
                    thordata_username,
                    thordata_password,
                )

                if target_is_https:
                    context = ssl.create_default_context()
                    sock = context.wrap_socket(raw_sock, server_hostname=target_host)
                else:
                    sock = raw_sock

            return self._send_http_request(
                sock, method, parsed_target, headers, data, final_url, timeout
            )

        finally:
            with contextlib.suppress(Exception):
                raw_sock.close()

    def _send_connect_request(
        self,
        sock: socket.socket,
        target_host: str,
        target_port: int,
        proxy_username: str,
        proxy_password: str,
    ) -> None:
        """Send HTTP CONNECT request to proxy and verify response."""
        connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
        connect_req += f"Host: {target_host}:{target_port}\r\n"

        credentials = f"{proxy_username}:{proxy_password}"
        encoded = base64.b64encode(credentials.encode()).decode()
        connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
        connect_req += "\r\n"

        sock.sendall(connect_req.encode())

        response = b""
        while b"\r\n\r\n" not in response:
            chunk = sock.recv(4096)
            if not chunk:
                raise ConnectionError("Proxy closed connection during CONNECT")
            response += chunk

        status_line = response.split(b"\r\n")[0].decode()
        if "200" not in status_line:
            raise ConnectionError(f"Proxy CONNECT failed: {status_line}")

    def _create_tls_in_tls_socket(
        self,
        outer_ssl_sock: ssl.SSLSocket,
        hostname: str,
        timeout: int,
    ) -> _TLSInTLSSocket:
        """Create a TLS connection over an existing TLS connection."""
        context = ssl.create_default_context()

        incoming = ssl.MemoryBIO()
        outgoing = ssl.MemoryBIO()

        ssl_obj = context.wrap_bio(incoming, outgoing, server_hostname=hostname)

        while True:
            try:
                ssl_obj.do_handshake()
                break
            except ssl.SSLWantReadError:
                data_to_send = outgoing.read()
                if data_to_send:
                    outer_ssl_sock.sendall(data_to_send)

                outer_ssl_sock.settimeout(float(timeout))
                try:
                    received = outer_ssl_sock.recv(8192)
                    if not received:
                        raise ConnectionError("Connection closed during TLS handshake")
                    incoming.write(received)
                except socket.timeout as e:
                    raise ConnectionError("Timeout during TLS handshake") from e
            except ssl.SSLWantWriteError:
                data_to_send = outgoing.read()
                if data_to_send:
                    outer_ssl_sock.sendall(data_to_send)

        data_to_send = outgoing.read()
        if data_to_send:
            outer_ssl_sock.sendall(data_to_send)

        return _TLSInTLSSocket(outer_ssl_sock, ssl_obj, incoming, outgoing)

    def _send_http_request(
        self,
        sock: socket.socket | ssl.SSLSocket | Any,
        method: str,
        parsed_url: Any,
        headers: dict[str, str] | None,
        data: Any,
        final_url: str,
        timeout: int,
    ) -> requests.Response:
        """Send HTTP request over established connection and parse response."""
        target_host = parsed_url.hostname

        req_headers = dict(headers or {})
        req_headers.setdefault("Host", target_host)
        req_headers.setdefault("User-Agent", build_user_agent(_sdk_version, "requests"))
        req_headers.setdefault("Connection", "close")

        path = parsed_url.path or "/"
        if parsed_url.query:
            path += f"?{parsed_url.query}"

        http_req = f"{method.upper()} {path} HTTP/1.1\r\n"
        for k, v in req_headers.items():
            http_req += f"{k}: {v}\r\n"

        body = None
        if data is not None:
            if isinstance(data, dict):
                body = urlencode({k: str(v) for k, v in data.items()}).encode()
                http_req += "Content-Type: application/x-www-form-urlencoded\r\n"
                http_req += f"Content-Length: {len(body)}\r\n"
            elif isinstance(data, bytes):
                body = data
                http_req += f"Content-Length: {len(body)}\r\n"
            else:
                body = str(data).encode()
                http_req += f"Content-Length: {len(body)}\r\n"

        http_req += "\r\n"
        sock.sendall(http_req.encode())

        if body:
            sock.sendall(body)

        if hasattr(sock, "settimeout"):
            sock.settimeout(float(timeout))

        response_data = b""
        try:
            while True:
                chunk = sock.recv(8192)
                if not chunk:
                    break
                response_data += chunk
                if b"\r\n\r\n" in response_data:
                    header_end = response_data.index(b"\r\n\r\n") + 4
                    headers_part = (
                        response_data[:header_end]
                        .decode("utf-8", errors="replace")
                        .lower()
                    )
                    if "content-length:" in headers_part:
                        for line in headers_part.split("\r\n"):
                            if line.startswith("content-length:"):
                                content_length = int(line.split(":")[1].strip())
                                if len(response_data) >= header_end + content_length:
                                    break
                    elif "transfer-encoding: chunked" not in headers_part:
                        break
        except socket.timeout:
            pass

        return self._parse_http_response(response_data, final_url)

    def _socks5_handshake(
        self,
        sock: socket.socket,
        target_host: str,
        target_port: int,
        username: str | None,
        password: str | None,
    ) -> socket.socket:
        """Perform SOCKS5 handshake over existing socket."""
        if username and password:
            sock.sendall(b"\x05\x02\x00\x02")
        else:
            sock.sendall(b"\x05\x01\x00")

        response = sock.recv(2)
        if len(response) < 2:
            raise ConnectionError("SOCKS5 handshake failed: incomplete response")

        if response[0] != 0x05:
            raise ConnectionError(f"SOCKS5 version mismatch: {response[0]}")

        auth_method = response[1]

        if auth_method == 0x02:
            if not username or not password:
                raise ConnectionError(
                    "SOCKS5 server requires auth but no credentials provided"
                )

            auth_req = bytes([0x01, len(username)]) + username.encode()
            auth_req += bytes([len(password)]) + password.encode()
            sock.sendall(auth_req)

            auth_resp = sock.recv(2)
            if len(auth_resp) < 2 or auth_resp[1] != 0x00:
                raise ConnectionError("SOCKS5 authentication failed")

        elif auth_method == 0xFF:
            raise ConnectionError("SOCKS5 no acceptable auth method")

        connect_req = b"\x05\x01\x00\x03"
        connect_req += bytes([len(target_host)]) + target_host.encode()
        connect_req += target_port.to_bytes(2, "big")
        sock.sendall(connect_req)

        resp = sock.recv(4)
        if len(resp) < 4:
            raise ConnectionError("SOCKS5 connect failed: incomplete response")

        if resp[1] != 0x00:
            error_codes = {
                0x01: "General failure",
                0x02: "Connection not allowed",
                0x03: "Network unreachable",
                0x04: "Host unreachable",
                0x05: "Connection refused",
                0x06: "TTL expired",
                0x07: "Command not supported",
                0x08: "Address type not supported",
            }
            error_msg = error_codes.get(resp[1], f"Unknown error {resp[1]}")
            raise ConnectionError(f"SOCKS5 connect failed: {error_msg}")

        addr_type = resp[3]
        if addr_type == 0x01:
            sock.recv(4 + 2)
        elif addr_type == 0x03:
            domain_len = sock.recv(1)[0]
            sock.recv(domain_len + 2)
        elif addr_type == 0x04:
            sock.recv(16 + 2)

        return sock
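The handshake above follows the RFC 1928/1929 wire format. Decoded by hand, the three messages it sends look like this (credentials and target are placeholders):

```python
# Byte-level sketch of the SOCKS5 greeting, auth sub-negotiation, and CONNECT.
username, password = "user", "pass"  # placeholders

greeting = b"\x05\x02\x00\x02"  # ver=5, 2 methods: no-auth (0x00), user/pass (0x02)
auth_req = (
    bytes([0x01, len(username)]) + username.encode()
    + bytes([len(password)]) + password.encode()
)
connect_req = (
    b"\x05\x01\x00\x03"                            # ver=5, CONNECT, rsv, ATYP=domain
    + bytes([len("example.com")]) + b"example.com"
    + (443).to_bytes(2, "big")
)
print(greeting, auth_req, connect_req, sep="\n")
```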
    def _parse_http_response(
        self,
        response_data: bytes,
        url: str,
    ) -> requests.Response:
        """Parse raw HTTP response into requests.Response."""
        if b"\r\n\r\n" in response_data:
            header_data, body = response_data.split(b"\r\n\r\n", 1)
        else:
            header_data = response_data
            body = b""

        header_lines = header_data.decode("utf-8", errors="replace").split("\r\n")

        status_line = header_lines[0] if header_lines else ""
        parts = status_line.split(" ", 2)
        status_code = int(parts[1]) if len(parts) > 1 else 0

        headers_dict = {}
        for line in header_lines[1:]:
            if ": " in line:
                k, v = line.split(": ", 1)
                headers_dict[k] = v

        if headers_dict.get("Transfer-Encoding", "").lower() == "chunked":
            body = self._decode_chunked(body)

        r = requests.Response()
        r.status_code = status_code
        r._content = body
        r.url = url
        r.headers = CaseInsensitiveDict(headers_dict)
        return r

    def _decode_chunked(self, data: bytes) -> bytes:
        """Decode chunked transfer encoding."""
        result = b""
        while data:
            if b"\r\n" not in data:
                break
            size_line, data = data.split(b"\r\n", 1)
            try:
                chunk_size = int(size_line.decode().strip(), 16)
            except ValueError:
                break

            if chunk_size == 0:
                break

            result += data[:chunk_size]
            data = data[chunk_size:]

            if data.startswith(b"\r\n"):
                data = data[2:]

        return result
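A quick standalone check of the chunked-decoding algorithm above (same logic, outside the class):

```python
def decode_chunked(data: bytes) -> bytes:
    # Mirrors _decode_chunked: hex size line, chunk payload, trailing CRLF.
    result = b""
    while b"\r\n" in data:
        size_line, data = data.split(b"\r\n", 1)
        chunk_size = int(size_line.strip(), 16)
        if chunk_size == 0:
            break
        result += data[:chunk_size]
        data = data[chunk_size:]
        if data.startswith(b"\r\n"):
            data = data[2:]
    return result

assert decode_chunked(b"4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n") == b"Wikipedia"
```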
    # =========================================================================
    # SERP API Methods
    # =========================================================================

    def serp_search(
        self,
        query: str,
        *,
        engine: Engine | str = Engine.GOOGLE,
        num: int = 10,
        country: str | None = None,
        language: str | None = None,
        search_type: str | None = None,
        device: str | None = None,
        render_js: bool | None = None,
        no_cache: bool | None = None,
        output_format: str = "json",
        **kwargs: Any,
    ) -> dict[str, Any]:
        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()

        request = SerpRequest(
            query=query,
            engine=engine_str,
            num=num,
            country=country,
            language=language,
            search_type=search_type,
            device=device,
            render_js=render_js,
            no_cache=no_cache,
            output_format=output_format,
            extra_params=kwargs,
        )

        return self.serp_search_advanced(request)

    def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
        if not self.scraper_token:
            raise ThordataConfigError("scraper_token is required for SERP API")

        payload = request.to_payload()
        headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)

        logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")

        try:
            response = self._api_request_with_retry(
                "POST",
                self._serp_url,
                data=payload,
                headers=headers,
            )
            response.raise_for_status()

            if request.output_format.lower() == "json":
                data = response.json()
                if isinstance(data, dict):
                    code = data.get("code")
                    if code is not None and code != 200:
                        msg = extract_error_message(data)
                        raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
                return parse_json_response(data)

            return {"html": response.text}

        except requests.Timeout as e:
            raise ThordataTimeoutError(f"SERP timeout: {e}", original_error=e) from e
        except requests.RequestException as e:
            raise ThordataNetworkError(f"SERP failed: {e}", original_error=e) from e
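A usage sketch for the SERP methods above; `serp_search` is the convenience form, and `serp_search_advanced` takes the same fields via a `SerpRequest` (the token is a placeholder):

```python
from thordata import ThordataClient
from thordata.models import SerpRequest

client = ThordataClient(scraper_token="your_token")  # placeholder token

# Convenience form
results = client.serp_search("python tutorial", engine="google", num=10, country="us")

# Equivalent advanced form
results = client.serp_search_advanced(
    SerpRequest(query="python tutorial", engine="google", num=10, country="us")
)
```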
    # =========================================================================
    # Universal Scraping API
    # =========================================================================

    def universal_scrape(
        self,
        url: str,
        *,
        js_render: bool = False,
        output_format: str = "html",
        country: str | None = None,
        block_resources: str | None = None,
        wait: int | None = None,
        wait_for: str | None = None,
        **kwargs: Any,
    ) -> str | bytes:
        request = UniversalScrapeRequest(
            url=url,
            js_render=js_render,
            output_format=output_format,
            country=country,
            block_resources=block_resources,
            wait=wait,
            wait_for=wait_for,
            extra_params=kwargs,
        )
        return self.universal_scrape_advanced(request)

    def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
        if not self.scraper_token:
            raise ThordataConfigError("scraper_token is required for Universal API")

        payload = request.to_payload()
        headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)

        logger.info(f"Universal Scrape: {request.url}")

        try:
            response = self._api_request_with_retry(
                "POST",
                self._universal_url,
                data=payload,
                headers=headers,
            )
            response.raise_for_status()
            return self._process_universal_response(response, request.output_format)

        except requests.Timeout as e:
            raise ThordataTimeoutError(
                f"Universal timeout: {e}", original_error=e
            ) from e
        except requests.RequestException as e:
            raise ThordataNetworkError(
                f"Universal failed: {e}", original_error=e
            ) from e

    def _process_universal_response(
        self, response: requests.Response, output_format: str
    ) -> str | bytes:
        try:
            resp_json = response.json()
        except ValueError:
            return response.content if output_format.lower() == "png" else response.text

        if isinstance(resp_json, dict):
            code = resp_json.get("code")
            if code is not None and code != 200:
                msg = extract_error_message(resp_json)
                raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)

            if "html" in resp_json:
                return resp_json["html"]
            if "png" in resp_json:
                return decode_base64_image(resp_json["png"])

        return str(resp_json)
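A usage sketch for the Universal Scraping wrapper above; a `png` output format comes back as bytes because `_process_universal_response` base64-decodes it (the token is a placeholder):

```python
from thordata import ThordataClient

client = ThordataClient(scraper_token="your_token")  # placeholder token

html = client.universal_scrape("https://example.com", js_render=True)

png = client.universal_scrape(
    "https://example.com", js_render=True, output_format="png"
)
if isinstance(png, bytes):
    with open("screenshot.png", "wb") as f:
        f.write(png)
```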
    # =========================================================================
    # Web Scraper API (Tasks)
    # =========================================================================

    def create_scraper_task(
        self,
        file_name: str,
        spider_id: str,
        spider_name: str,
        parameters: dict[str, Any],
        universal_params: dict[str, Any] | None = None,
    ) -> str:
        config = ScraperTaskConfig(
            file_name=file_name,
            spider_id=spider_id,
            spider_name=spider_name,
            parameters=parameters,
            universal_params=universal_params,
        )
        return self.create_scraper_task_advanced(config)

    def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
        self._require_public_credentials()
        if not self.scraper_token:
            raise ThordataConfigError("scraper_token is required for Task Builder")
        payload = config.to_payload()
        headers = build_builder_headers(
            self.scraper_token, self.public_token or "", self.public_key or ""
        )

        try:
            response = self._api_request_with_retry(
                "POST", self._builder_url, data=payload, headers=headers
            )
            response.raise_for_status()
            data = response.json()
            if data.get("code") != 200:
                raise_for_code(
                    "Task creation failed", code=data.get("code"), payload=data
                )
            return data["data"]["task_id"]
        except requests.RequestException as e:
            raise ThordataNetworkError(
                f"Task creation failed: {e}", original_error=e
            ) from e

    def create_video_task(
        self,
        file_name: str,
        spider_id: str,
        spider_name: str,
        parameters: dict[str, Any],
        common_settings: CommonSettings,
    ) -> str:
        config = VideoTaskConfig(
            file_name=file_name,
            spider_id=spider_id,
            spider_name=spider_name,
            parameters=parameters,
            common_settings=common_settings,
        )
        return self.create_video_task_advanced(config)

    def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
        self._require_public_credentials()
        if not self.scraper_token:
            raise ThordataConfigError(
                "scraper_token is required for Video Task Builder"
            )

        payload = config.to_payload()
        headers = build_builder_headers(
            self.scraper_token, self.public_token or "", self.public_key or ""
        )

        response = self._api_request_with_retry(
            "POST", self._video_builder_url, data=payload, headers=headers
        )
        response.raise_for_status()
        data = response.json()
        if data.get("code") != 200:
            raise_for_code(
                "Video task creation failed", code=data.get("code"), payload=data
            )
        return data["data"]["task_id"]

    def get_task_status(self, task_id: str) -> str:
        self._require_public_credentials()
        headers = build_public_api_headers(
            self.public_token or "", self.public_key or ""
        )
        try:
            response = self._api_request_with_retry(
                "POST",
                self._status_url,
                data={"tasks_ids": task_id},
                headers=headers,
            )
            response.raise_for_status()
            data = response.json()
            if data.get("code") != 200:
                raise_for_code("Task status error", code=data.get("code"), payload=data)

            items = data.get("data") or []
            for item in items:
                if str(item.get("task_id")) == str(task_id):
                    return item.get("status", "unknown")
            return "unknown"
        except requests.RequestException as e:
            raise ThordataNetworkError(
                f"Status check failed: {e}", original_error=e
            ) from e

    def safe_get_task_status(self, task_id: str) -> str:
        try:
            return self.get_task_status(task_id)
        except Exception:
            return "error"

    def get_task_result(self, task_id: str, file_type: str = "json") -> str:
        self._require_public_credentials()
        headers = build_public_api_headers(
            self.public_token or "", self.public_key or ""
        )
        try:
            response = self._api_request_with_retry(
                "POST",
                self._download_url,
                data={"tasks_id": task_id, "type": file_type},
                headers=headers,
            )
            response.raise_for_status()
            data = response.json()
            if data.get("code") == 200 and data.get("data"):
                return data["data"]["download"]
            raise_for_code("Get result failed", code=data.get("code"), payload=data)
            return ""
        except requests.RequestException as e:
            raise ThordataNetworkError(
                f"Get result failed: {e}", original_error=e
            ) from e

    def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
        self._require_public_credentials()
        headers = build_public_api_headers(
            self.public_token or "", self.public_key or ""
        )
        response = self._api_request_with_retry(
            "POST",
            self._list_url,
            data={"page": str(page), "size": str(size)},
            headers=headers,
        )
        response.raise_for_status()
        data = response.json()
        if data.get("code") != 200:
            raise_for_code("List tasks failed", code=data.get("code"), payload=data)
        return data.get("data", {"count": 0, "list": []})

    def wait_for_task(
        self,
        task_id: str,
        *,
        poll_interval: float = 5.0,
        max_wait: float = 600.0,
    ) -> str:
        import time

        start = time.monotonic()
        while (time.monotonic() - start) < max_wait:
            status = self.get_task_status(task_id)
            if status.lower() in {
                "ready",
                "success",
                "finished",
                "failed",
                "error",
                "cancelled",
            }:
                return status
            time.sleep(poll_interval)
        raise TimeoutError(f"Task {task_id} timeout")
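An end-to-end sketch of the task flow above: create, poll, then fetch the download URL. The spider identifiers and parameters are hypothetical placeholders; the call sequence matches the methods shown:

```python
from thordata import ThordataClient

client = ThordataClient(
    scraper_token="your_token",              # placeholder credentials
    public_token="your_public_token",
    public_key="your_public_key",
)
task_id = client.create_scraper_task(
    file_name="demo-run",
    spider_id="spider-id-placeholder",       # hypothetical spider
    spider_name="spider-name-placeholder",
    parameters={"url": "https://example.com"},
)
status = client.wait_for_task(task_id, poll_interval=5.0, max_wait=600.0)
if status.lower() in {"ready", "success", "finished"}:
    print(client.get_task_result(task_id, file_type="json"))
```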
    # =========================================================================
    # Account / Locations / Utils
    # =========================================================================

    def get_usage_statistics(
        self,
        from_date: str | date,
        to_date: str | date,
    ) -> UsageStatistics:
        self._require_public_credentials()
        if isinstance(from_date, date):
            from_date = from_date.strftime("%Y-%m-%d")
        if isinstance(to_date, date):
            to_date = to_date.strftime("%Y-%m-%d")

        params = {
            "token": self.public_token,
            "key": self.public_key,
            "from_date": from_date,
            "to_date": to_date,
        }
        response = self._api_request_with_retry(
            "GET", self._usage_stats_url, params=params
        )
        response.raise_for_status()
        data = response.json()
        if data.get("code") != 200:
            raise_for_code("Usage stats error", code=data.get("code"), payload=data)
        return UsageStatistics.from_dict(data.get("data", data))

    def list_proxy_users(
        self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
    ) -> ProxyUserList:
        self._require_public_credentials()
        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
        params = {
            "token": self.public_token,
            "key": self.public_key,
            "proxy_type": str(pt),
        }
        response = self._api_request_with_retry(
            "GET", f"{self._proxy_users_url}/user-list", params=params
        )
        response.raise_for_status()
        data = response.json()
        if data.get("code") != 200:
            raise_for_code("List users error", code=data.get("code"), payload=data)
        return ProxyUserList.from_dict(data.get("data", data))

    def create_proxy_user(
        self,
        username: str,
        password: str,
        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
        traffic_limit: int = 0,
        status: bool = True,
    ) -> dict[str, Any]:
        self._require_public_credentials()
        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
        headers = build_public_api_headers(
            self.public_token or "", self.public_key or ""
        )
        payload = {
            "proxy_type": str(pt),
            "username": username,
            "password": password,
            "traffic_limit": str(traffic_limit),
            "status": "true" if status else "false",
        }
        response = self._api_request_with_retry(
            "POST",
            f"{self._proxy_users_url}/create-user",
            data=payload,
            headers=headers,
        )
        response.raise_for_status()
        data = response.json()
        if data.get("code") != 200:
            raise_for_code("Create user failed", code=data.get("code"), payload=data)
        return data.get("data", {})

    def add_whitelist_ip(
        self,
        ip: str,
        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
        status: bool = True,
    ) -> dict[str, Any]:
        self._require_public_credentials()
        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
        headers = build_public_api_headers(
            self.public_token or "", self.public_key or ""
        )
        payload = {
            "proxy_type": str(pt),
            "ip": ip,
            "status": "true" if status else "false",
        }
        response = self._api_request_with_retry(
            "POST", f"{self._whitelist_url}/add-ip", data=payload, headers=headers
        )
        response.raise_for_status()
        data = response.json()
        if data.get("code") != 200:
            raise_for_code(
                "Add whitelist IP failed", code=data.get("code"), payload=data
            )
        return data.get("data", {})

    def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
        self._require_public_credentials()
        params = {
            "token": self.public_token,
            "key": self.public_key,
            "proxy_type": str(proxy_type),
        }
        response = self._api_request_with_retry(
            "GET", self._proxy_list_url, params=params
        )
        response.raise_for_status()
        data = response.json()
        if data.get("code") != 200:
            raise_for_code(
                "List proxy servers error", code=data.get("code"), payload=data
            )

        server_list = []
        if isinstance(data, dict):
            server_list = data.get("data", data.get("list", []))
        elif isinstance(data, list):
            server_list = data

        return [ProxyServer.from_dict(s) for s in server_list]

    def get_proxy_expiration(
        self, ips: str | list[str], proxy_type: int
    ) -> dict[str, Any]:
        self._require_public_credentials()
        if isinstance(ips, list):
            ips = ",".join(ips)
        params = {
            "token": self.public_token,
            "key": self.public_key,
            "proxy_type": str(proxy_type),
            "ips": ips,
        }
        response = self._api_request_with_retry(
            "GET", self._proxy_expiration_url, params=params
        )
        response.raise_for_status()
        data = response.json()
        if data.get("code") != 200:
            raise_for_code("Get expiration error", code=data.get("code"), payload=data)
        return data.get("data", data)

    def list_countries(
        self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
    ) -> list[dict[str, Any]]:
        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
        return self._get_locations("countries", proxy_type=pt)

    def list_states(
        self,
        country_code: str,
        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
    ) -> list[dict[str, Any]]:
        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
        return self._get_locations("states", proxy_type=pt, country_code=country_code)

    def list_cities(
        self,
        country_code: str,
        state_code: str | None = None,
        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
    ) -> list[dict[str, Any]]:
        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
        kwargs = {"proxy_type": pt, "country_code": country_code}
        if state_code:
            kwargs["state_code"] = state_code
        return self._get_locations("cities", **kwargs)

    def list_asn(
        self,
        country_code: str,
        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
    ) -> list[dict[str, Any]]:
        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
        return self._get_locations("asn", proxy_type=pt, country_code=country_code)

    def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
        self._require_public_credentials()
        params = {"token": self.public_token, "key": self.public_key}
        for k, v in kwargs.items():
            params[k] = str(v)

        response = self._api_request_with_retry(
            "GET", f"{self._locations_base_url}/{endpoint}", params=params
        )
        response.raise_for_status()
        data = response.json()
        if isinstance(data, dict):
            if data.get("code") != 200:
                raise RuntimeError(f"Locations error: {data.get('msg')}")
            return data.get("data") or []
        return data if isinstance(data, list) else []

    def _require_public_credentials(self) -> None:
        if not self.public_token or not self.public_key:
            raise ThordataConfigError(
                "public_token and public_key are required for this operation."
            )

    def _get_proxy_endpoint_overrides(
        self, product: ProxyProduct
    ) -> tuple[str | None, int | None, str]:
        prefix = product.value.upper()
        host = os.getenv(f"THORDATA_{prefix}_PROXY_HOST") or os.getenv(
            "THORDATA_PROXY_HOST"
        )
        port_raw = os.getenv(f"THORDATA_{prefix}_PROXY_PORT") or os.getenv(
            "THORDATA_PROXY_PORT"
        )
        protocol = (
            os.getenv(f"THORDATA_{prefix}_PROXY_PROTOCOL")
            or os.getenv("THORDATA_PROXY_PROTOCOL")
            or "https"
        )
        port = int(port_raw) if port_raw and port_raw.isdigit() else None
        return host or None, port, protocol

    def _get_default_proxy_config_from_env(self) -> ProxyConfig | None:
        for prod in [
            ProxyProduct.RESIDENTIAL,
            ProxyProduct.DATACENTER,
            ProxyProduct.MOBILE,
        ]:
            prefix = prod.value.upper()
            u = os.getenv(f"THORDATA_{prefix}_USERNAME")
            p = os.getenv(f"THORDATA_{prefix}_PASSWORD")
            if u and p:
                h, port, proto = self._get_proxy_endpoint_overrides(prod)
                return ProxyConfig(
                    username=u,
                    password=p,
                    product=prod,
                    host=h,
                    port=port,
                    protocol=proto,
                )
        return None

    def close(self) -> None:
        self._proxy_session.close()
        self._api_session.close()
        for pm in self._proxy_managers.values():
            pm.clear()
        self._proxy_managers.clear()

    def __enter__(self) -> ThordataClient:
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.close()
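Finally, `close()` plus `__enter__`/`__exit__` make the client a context manager, so a sketch like this releases both sessions and the pooled proxy managers automatically (the token is a placeholder):

```python
from thordata import ThordataClient

with ThordataClient(scraper_token="your_token") as client:
    results = client.serp_search("thordata", engine="google")
# On exit, close() shuts both requests sessions and clears cached proxy managers.
```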