thordata-sdk 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +4 -40
- thordata/async_client.py +503 -1796
- thordata/client.py +444 -1322
- thordata/core/__init__.py +23 -0
- thordata/core/async_http_client.py +91 -0
- thordata/core/http_client.py +79 -0
- thordata/core/tunnel.py +287 -0
- thordata/enums.py +41 -380
- thordata/exceptions.py +70 -19
- thordata/models.py +37 -1193
- thordata/retry.py +1 -1
- thordata/tools/__init__.py +38 -0
- thordata/tools/base.py +42 -0
- thordata/tools/code.py +39 -0
- thordata/tools/ecommerce.py +251 -0
- thordata/tools/professional.py +155 -0
- thordata/tools/search.py +115 -0
- thordata/tools/social.py +374 -0
- thordata/tools/travel.py +100 -0
- thordata/tools/video.py +154 -0
- thordata/types/__init__.py +77 -0
- thordata/types/common.py +141 -0
- thordata/types/proxy.py +340 -0
- thordata/types/serp.py +224 -0
- thordata/types/task.py +156 -0
- thordata/types/universal.py +66 -0
- thordata/unlimited.py +67 -0
- thordata_sdk-1.6.0.dist-info/METADATA +287 -0
- thordata_sdk-1.6.0.dist-info/RECORD +35 -0
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.6.0.dist-info}/WHEEL +1 -1
- thordata/_example_utils.py +0 -77
- thordata/demo.py +0 -138
- thordata_sdk-1.4.0.dist-info/METADATA +0 -208
- thordata_sdk-1.4.0.dist-info/RECORD +0 -18
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.6.0.dist-info}/top_level.txt +0 -0
thordata/client.py
CHANGED
|
@@ -3,74 +3,60 @@ Synchronous client for the Thordata API.
|
|
|
3
3
|
|
|
4
4
|
This module provides the main ThordataClient class for interacting with
|
|
5
5
|
Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
|
|
6
|
-
|
|
7
|
-
Example:
|
|
8
|
-
>>> from thordata import ThordataClient
|
|
9
|
-
>>>
|
|
10
|
-
>>> client = ThordataClient(
|
|
11
|
-
... scraper_token="your_token",
|
|
12
|
-
... public_token="your_public_token",
|
|
13
|
-
... public_key="your_public_key"
|
|
14
|
-
... )
|
|
15
|
-
>>>
|
|
16
|
-
>>> # Use the proxy network
|
|
17
|
-
>>> response = client.get("https://httpbin.org/ip")
|
|
18
|
-
>>> print(response.json())
|
|
19
|
-
>>>
|
|
20
|
-
>>> # Search with SERP API
|
|
21
|
-
>>> results = client.serp_search("python tutorial", engine="google")
|
|
22
6
|
"""
|
|
23
7
|
|
|
24
8
|
from __future__ import annotations
|
|
25
9
|
|
|
26
10
|
import base64
|
|
27
|
-
import contextlib
|
|
28
11
|
import hashlib
|
|
29
|
-
import json
|
|
30
12
|
import logging
|
|
31
13
|
import os
|
|
32
14
|
import socket
|
|
33
15
|
import ssl
|
|
34
16
|
from datetime import date
|
|
35
17
|
from typing import Any, cast
|
|
36
|
-
from urllib.parse import
|
|
18
|
+
from urllib.parse import urlencode, urlparse
|
|
37
19
|
|
|
38
20
|
import requests
|
|
39
21
|
import urllib3
|
|
40
22
|
from requests.structures import CaseInsensitiveDict
|
|
41
23
|
|
|
42
|
-
|
|
43
|
-
from .unlimited import UnlimitedNamespace
|
|
44
|
-
|
|
45
|
-
try:
|
|
46
|
-
import socks
|
|
47
|
-
|
|
48
|
-
HAS_PYSOCKS = True
|
|
49
|
-
except ImportError:
|
|
50
|
-
HAS_PYSOCKS = False
|
|
51
|
-
|
|
52
|
-
from . import __version__ as _sdk_version
|
|
24
|
+
# Import Legacy/Compat
|
|
53
25
|
from ._utils import (
|
|
54
26
|
build_auth_headers,
|
|
55
27
|
build_builder_headers,
|
|
56
28
|
build_public_api_headers,
|
|
57
|
-
build_user_agent,
|
|
58
29
|
decode_base64_image,
|
|
59
30
|
extract_error_message,
|
|
60
31
|
parse_json_response,
|
|
61
32
|
)
|
|
62
|
-
|
|
33
|
+
|
|
34
|
+
# Import Core Components
|
|
35
|
+
from .core.http_client import ThordataHttpSession
|
|
36
|
+
from .core.tunnel import (
|
|
37
|
+
HAS_PYSOCKS,
|
|
38
|
+
UpstreamProxySocketFactory,
|
|
39
|
+
create_tls_in_tls,
|
|
40
|
+
parse_upstream_proxy,
|
|
41
|
+
socks5_handshake,
|
|
42
|
+
)
|
|
43
|
+
from .enums import Engine
|
|
63
44
|
from .exceptions import (
|
|
64
45
|
ThordataConfigError,
|
|
65
46
|
ThordataNetworkError,
|
|
66
47
|
ThordataTimeoutError,
|
|
67
48
|
raise_for_code,
|
|
68
49
|
)
|
|
69
|
-
from .
|
|
50
|
+
from .retry import RetryConfig, with_retry
|
|
51
|
+
from .serp_engines import SerpNamespace
|
|
52
|
+
|
|
53
|
+
# Import Types (Modernized)
|
|
54
|
+
from .types import (
|
|
70
55
|
CommonSettings,
|
|
71
56
|
ProxyConfig,
|
|
72
57
|
ProxyProduct,
|
|
73
58
|
ProxyServer,
|
|
59
|
+
ProxyType,
|
|
74
60
|
ProxyUserList,
|
|
75
61
|
ScraperTaskConfig,
|
|
76
62
|
SerpRequest,
|
|
@@ -78,196 +64,17 @@ from .models import (
|
|
|
78
64
|
UsageStatistics,
|
|
79
65
|
VideoTaskConfig,
|
|
80
66
|
)
|
|
81
|
-
from .
|
|
67
|
+
from .unlimited import UnlimitedNamespace
|
|
82
68
|
|
|
83
69
|
logger = logging.getLogger(__name__)
|
|
84
70
|
|
|
85
|
-
|
|
86
71
|
# =========================================================================
|
|
87
|
-
#
|
|
72
|
+
# Internal Logic for Upstream Proxies
|
|
88
73
|
# =========================================================================
|
|
89
74
|
|
|
90
75
|
|
|
91
76
|
def _parse_upstream_proxy() -> dict[str, Any] | None:
|
|
92
|
-
|
|
93
|
-
Parse THORDATA_UPSTREAM_PROXY environment variable.
|
|
94
|
-
|
|
95
|
-
Supported formats:
|
|
96
|
-
- http://127.0.0.1:7897
|
|
97
|
-
- socks5://127.0.0.1:7897
|
|
98
|
-
- socks5://user:pass@127.0.0.1:7897
|
|
99
|
-
|
|
100
|
-
Returns:
|
|
101
|
-
Dict with proxy config or None if not set.
|
|
102
|
-
"""
|
|
103
|
-
upstream_url = os.environ.get("THORDATA_UPSTREAM_PROXY", "").strip()
|
|
104
|
-
if not upstream_url:
|
|
105
|
-
return None
|
|
106
|
-
|
|
107
|
-
parsed = urlparse(upstream_url)
|
|
108
|
-
scheme = (parsed.scheme or "").lower()
|
|
109
|
-
|
|
110
|
-
if scheme not in ("http", "https", "socks5", "socks5h", "socks4"):
|
|
111
|
-
logger.warning(f"Unsupported upstream proxy scheme: {scheme}")
|
|
112
|
-
return None
|
|
113
|
-
|
|
114
|
-
return {
|
|
115
|
-
"scheme": scheme,
|
|
116
|
-
"host": parsed.hostname or "127.0.0.1",
|
|
117
|
-
"port": parsed.port or (1080 if scheme.startswith("socks") else 7897),
|
|
118
|
-
"username": parsed.username,
|
|
119
|
-
"password": parsed.password,
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
class _UpstreamProxySocketFactory:
|
|
124
|
-
"""
|
|
125
|
-
Socket factory that creates connections through an upstream proxy.
|
|
126
|
-
Used for proxy chaining when accessing Thordata from behind a firewall.
|
|
127
|
-
"""
|
|
128
|
-
|
|
129
|
-
def __init__(self, upstream_config: dict[str, Any]):
|
|
130
|
-
self.config = upstream_config
|
|
131
|
-
|
|
132
|
-
def create_connection(
|
|
133
|
-
self,
|
|
134
|
-
address: tuple[str, int],
|
|
135
|
-
timeout: float | None = None,
|
|
136
|
-
source_address: tuple[str, int] | None = None,
|
|
137
|
-
) -> socket.socket:
|
|
138
|
-
"""Create a socket connection through the upstream proxy."""
|
|
139
|
-
scheme = self.config["scheme"]
|
|
140
|
-
|
|
141
|
-
if scheme.startswith("socks"):
|
|
142
|
-
return self._create_socks_connection(address, timeout)
|
|
143
|
-
else:
|
|
144
|
-
return self._create_http_tunnel(address, timeout)
|
|
145
|
-
|
|
146
|
-
def _create_socks_connection(
|
|
147
|
-
self,
|
|
148
|
-
address: tuple[str, int],
|
|
149
|
-
timeout: float | None = None,
|
|
150
|
-
) -> socket.socket:
|
|
151
|
-
"""Create connection through SOCKS proxy."""
|
|
152
|
-
if not HAS_PYSOCKS:
|
|
153
|
-
raise RuntimeError(
|
|
154
|
-
"PySocks is required for SOCKS upstream proxy. "
|
|
155
|
-
"Install with: pip install PySocks"
|
|
156
|
-
)
|
|
157
|
-
|
|
158
|
-
scheme = self.config["scheme"]
|
|
159
|
-
proxy_type = socks.SOCKS5 if "socks5" in scheme else socks.SOCKS4
|
|
160
|
-
|
|
161
|
-
sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
|
|
162
|
-
sock.set_proxy(
|
|
163
|
-
proxy_type,
|
|
164
|
-
self.config["host"],
|
|
165
|
-
self.config["port"],
|
|
166
|
-
rdns=True,
|
|
167
|
-
username=self.config.get("username"),
|
|
168
|
-
password=self.config.get("password"),
|
|
169
|
-
)
|
|
170
|
-
|
|
171
|
-
if timeout is not None:
|
|
172
|
-
sock.settimeout(timeout)
|
|
173
|
-
|
|
174
|
-
sock.connect(address)
|
|
175
|
-
return sock
|
|
176
|
-
|
|
177
|
-
def _create_http_tunnel(
|
|
178
|
-
self,
|
|
179
|
-
address: tuple[str, int],
|
|
180
|
-
timeout: float | None = None,
|
|
181
|
-
) -> socket.socket:
|
|
182
|
-
"""Create connection through HTTP CONNECT tunnel."""
|
|
183
|
-
# Connect to upstream proxy
|
|
184
|
-
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
185
|
-
if timeout is not None:
|
|
186
|
-
sock.settimeout(timeout)
|
|
187
|
-
|
|
188
|
-
sock.connect((self.config["host"], self.config["port"]))
|
|
189
|
-
|
|
190
|
-
# Build CONNECT request
|
|
191
|
-
target_host, target_port = address
|
|
192
|
-
connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
|
|
193
|
-
connect_req += f"Host: {target_host}:{target_port}\r\n"
|
|
194
|
-
|
|
195
|
-
# Add proxy auth if provided
|
|
196
|
-
if self.config.get("username"):
|
|
197
|
-
credentials = f"{self.config['username']}:{self.config.get('password', '')}"
|
|
198
|
-
encoded = base64.b64encode(credentials.encode()).decode()
|
|
199
|
-
connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
|
|
200
|
-
|
|
201
|
-
connect_req += "\r\n"
|
|
202
|
-
|
|
203
|
-
sock.sendall(connect_req.encode())
|
|
204
|
-
|
|
205
|
-
# Read response
|
|
206
|
-
response = b""
|
|
207
|
-
while b"\r\n\r\n" not in response:
|
|
208
|
-
chunk = sock.recv(1024)
|
|
209
|
-
if not chunk:
|
|
210
|
-
raise ConnectionError("Upstream proxy closed connection")
|
|
211
|
-
response += chunk
|
|
212
|
-
|
|
213
|
-
# Check status
|
|
214
|
-
status_line = response.split(b"\r\n")[0].decode()
|
|
215
|
-
if "200" not in status_line:
|
|
216
|
-
sock.close()
|
|
217
|
-
raise ConnectionError(f"Upstream proxy CONNECT failed: {status_line}")
|
|
218
|
-
|
|
219
|
-
return sock
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
class _TLSInTLSSocket:
|
|
223
|
-
"""
|
|
224
|
-
A socket-like wrapper for TLS-in-TLS connections.
|
|
225
|
-
|
|
226
|
-
Uses SSLObject + MemoryBIO to implement TLS over an existing TLS connection.
|
|
227
|
-
"""
|
|
228
|
-
|
|
229
|
-
def __init__(
|
|
230
|
-
self,
|
|
231
|
-
outer_sock: ssl.SSLSocket,
|
|
232
|
-
ssl_obj: ssl.SSLObject,
|
|
233
|
-
incoming: ssl.MemoryBIO,
|
|
234
|
-
outgoing: ssl.MemoryBIO,
|
|
235
|
-
):
|
|
236
|
-
self._outer = outer_sock
|
|
237
|
-
self._ssl = ssl_obj
|
|
238
|
-
self._incoming = incoming
|
|
239
|
-
self._outgoing = outgoing
|
|
240
|
-
self._timeout: float | None = None
|
|
241
|
-
|
|
242
|
-
def settimeout(self, timeout: float | None) -> None:
|
|
243
|
-
self._timeout = timeout
|
|
244
|
-
self._outer.settimeout(timeout)
|
|
245
|
-
|
|
246
|
-
def sendall(self, data: bytes) -> None:
|
|
247
|
-
"""Send data through the inner TLS connection."""
|
|
248
|
-
self._ssl.write(data)
|
|
249
|
-
encrypted = self._outgoing.read()
|
|
250
|
-
if encrypted:
|
|
251
|
-
self._outer.sendall(encrypted)
|
|
252
|
-
|
|
253
|
-
def recv(self, bufsize: int) -> bytes:
|
|
254
|
-
"""Receive data from the inner TLS connection."""
|
|
255
|
-
while True:
|
|
256
|
-
try:
|
|
257
|
-
return self._ssl.read(bufsize)
|
|
258
|
-
except ssl.SSLWantReadError:
|
|
259
|
-
self._outer.settimeout(self._timeout)
|
|
260
|
-
try:
|
|
261
|
-
received = self._outer.recv(8192)
|
|
262
|
-
if not received:
|
|
263
|
-
return b""
|
|
264
|
-
self._incoming.write(received)
|
|
265
|
-
except socket.timeout:
|
|
266
|
-
return b""
|
|
267
|
-
|
|
268
|
-
def close(self) -> None:
|
|
269
|
-
with contextlib.suppress(Exception):
|
|
270
|
-
self._outer.close()
|
|
77
|
+
return parse_upstream_proxy()
|
|
271
78
|
|
|
272
79
|
|
|
273
80
|
# =========================================================================
|
|
@@ -300,24 +107,6 @@ class ThordataClient:
|
|
|
300
107
|
web_scraper_api_base_url: str | None = None,
|
|
301
108
|
locations_base_url: str | None = None,
|
|
302
109
|
) -> None:
|
|
303
|
-
"""Initialize the Thordata Client.
|
|
304
|
-
|
|
305
|
-
Args:
|
|
306
|
-
scraper_token: Token for SERP/Universal scraping APIs.
|
|
307
|
-
public_token: Public API token for account/management operations.
|
|
308
|
-
public_key: Public API key for account/management operations.
|
|
309
|
-
proxy_host: Default proxy host for residential proxies.
|
|
310
|
-
proxy_port: Default proxy port for residential proxies.
|
|
311
|
-
timeout: Default timeout for proxy requests.
|
|
312
|
-
api_timeout: Default timeout for API requests.
|
|
313
|
-
retry_config: Configuration for retry behavior.
|
|
314
|
-
auth_mode: Authentication mode for scraper_token ("bearer" or "header_token").
|
|
315
|
-
scraperapi_base_url: Override base URL for SERP API.
|
|
316
|
-
universalapi_base_url: Override base URL for Universal Scraping API.
|
|
317
|
-
web_scraper_api_base_url: Override base URL for Web Scraper API.
|
|
318
|
-
locations_base_url: Override base URL for Locations API.
|
|
319
|
-
"""
|
|
320
|
-
|
|
321
110
|
self.scraper_token = scraper_token
|
|
322
111
|
self.public_token = public_token
|
|
323
112
|
self.public_key = public_key
|
|
@@ -334,17 +123,17 @@ class ThordataClient:
|
|
|
334
123
|
f"Invalid auth_mode: {auth_mode}. Must be 'bearer' or 'header_token'."
|
|
335
124
|
)
|
|
336
125
|
|
|
126
|
+
# Initialize Core HTTP Client for API calls
|
|
127
|
+
self._http = ThordataHttpSession(
|
|
128
|
+
timeout=api_timeout, retry_config=self._retry_config
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# Legacy logic for Proxy Network connections (requests.Session)
|
|
337
132
|
self._proxy_session = requests.Session()
|
|
338
133
|
self._proxy_session.trust_env = False
|
|
339
134
|
self._proxy_managers: dict[str, urllib3.PoolManager] = {}
|
|
340
135
|
|
|
341
|
-
|
|
342
|
-
self._api_session.trust_env = True
|
|
343
|
-
self._api_session.headers.update(
|
|
344
|
-
{"User-Agent": build_user_agent(_sdk_version, "requests")}
|
|
345
|
-
)
|
|
346
|
-
|
|
347
|
-
# Base URLs
|
|
136
|
+
# Base URLs Configuration
|
|
348
137
|
scraperapi_base = (
|
|
349
138
|
scraperapi_base_url
|
|
350
139
|
or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
|
|
@@ -369,14 +158,14 @@ class ThordataClient:
|
|
|
369
158
|
or self.LOCATIONS_URL
|
|
370
159
|
).rstrip("/")
|
|
371
160
|
|
|
372
|
-
|
|
373
|
-
"THORDATA_GATEWAY_BASE_URL", "https://
|
|
161
|
+
self._gateway_base_url = os.getenv(
|
|
162
|
+
"THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
|
|
374
163
|
)
|
|
375
|
-
self._gateway_base_url = gateway_base
|
|
376
164
|
self._child_base_url = os.getenv(
|
|
377
|
-
"THORDATA_CHILD_BASE_URL", "https://
|
|
165
|
+
"THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
|
|
378
166
|
)
|
|
379
167
|
|
|
168
|
+
# URL Construction
|
|
380
169
|
self._serp_url = f"{scraperapi_base}/request"
|
|
381
170
|
self._builder_url = f"{scraperapi_base}/builder"
|
|
382
171
|
self._video_builder_url = f"{scraperapi_base}/video_builder"
|
|
@@ -388,15 +177,13 @@ class ThordataClient:
|
|
|
388
177
|
|
|
389
178
|
self._locations_base_url = locations_base
|
|
390
179
|
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
self._proxy_users_url =
|
|
395
|
-
f"{locations_base.replace('/locations', '')}/proxy-users"
|
|
396
|
-
)
|
|
180
|
+
# Determine shared API base from locations URL
|
|
181
|
+
shared_api_base = locations_base.replace("/locations", "")
|
|
182
|
+
self._usage_stats_url = f"{shared_api_base}/account/usage-statistics"
|
|
183
|
+
self._proxy_users_url = f"{shared_api_base}/proxy-users"
|
|
397
184
|
|
|
398
185
|
whitelist_base = os.getenv(
|
|
399
|
-
"THORDATA_WHITELIST_BASE_URL", "https://
|
|
186
|
+
"THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
|
|
400
187
|
)
|
|
401
188
|
self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
|
|
402
189
|
|
|
@@ -406,7 +193,7 @@ class ThordataClient:
|
|
|
406
193
|
self._proxy_list_url = f"{proxy_api_base}/proxy/proxy-list"
|
|
407
194
|
self._proxy_expiration_url = f"{proxy_api_base}/proxy/expiration-time"
|
|
408
195
|
|
|
409
|
-
# Initialize Namespaces
|
|
196
|
+
# Initialize Namespaces
|
|
410
197
|
self.serp = SerpNamespace(self)
|
|
411
198
|
self.unlimited = UnlimitedNamespace(self)
|
|
412
199
|
|
|
@@ -416,8 +203,8 @@ class ThordataClient:
|
|
|
416
203
|
|
|
417
204
|
def close(self) -> None:
|
|
418
205
|
"""Close the client and release resources."""
|
|
206
|
+
self._http.close()
|
|
419
207
|
self._proxy_session.close()
|
|
420
|
-
self._api_session.close()
|
|
421
208
|
for pm in self._proxy_managers.values():
|
|
422
209
|
pm.clear()
|
|
423
210
|
self._proxy_managers.clear()
|
|
@@ -428,6 +215,30 @@ class ThordataClient:
|
|
|
428
215
|
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
|
429
216
|
self.close()
|
|
430
217
|
|
|
218
|
+
# =========================================================================
|
|
219
|
+
# Internal Helper: API Request Delegation
|
|
220
|
+
# =========================================================================
|
|
221
|
+
|
|
222
|
+
def _api_request_with_retry(
|
|
223
|
+
self,
|
|
224
|
+
method: str,
|
|
225
|
+
url: str,
|
|
226
|
+
*,
|
|
227
|
+
data: dict[str, Any] | None = None,
|
|
228
|
+
headers: dict[str, str] | None = None,
|
|
229
|
+
params: dict[str, Any] | None = None,
|
|
230
|
+
) -> requests.Response:
|
|
231
|
+
"""Delegate to Core HTTP Client."""
|
|
232
|
+
return self._http.request(
|
|
233
|
+
method=method, url=url, data=data, headers=headers, params=params
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
def _require_public_credentials(self) -> None:
|
|
237
|
+
if not self.public_token or not self.public_key:
|
|
238
|
+
raise ThordataConfigError(
|
|
239
|
+
"public_token and public_key are required for this operation."
|
|
240
|
+
)
|
|
241
|
+
|
|
431
242
|
# =========================================================================
|
|
432
243
|
# Proxy Network Methods
|
|
433
244
|
# =========================================================================
|
|
@@ -440,17 +251,6 @@ class ThordataClient:
|
|
|
440
251
|
timeout: int | None = None,
|
|
441
252
|
**kwargs: Any,
|
|
442
253
|
) -> requests.Response:
|
|
443
|
-
"""Make a GET request through the proxy network.
|
|
444
|
-
|
|
445
|
-
Args:
|
|
446
|
-
url: Target URL to request.
|
|
447
|
-
proxy_config: Proxy configuration. If not provided, uses environment variables.
|
|
448
|
-
timeout: Request timeout in seconds.
|
|
449
|
-
**kwargs: Additional arguments passed to requests.
|
|
450
|
-
|
|
451
|
-
Returns:
|
|
452
|
-
Response object.
|
|
453
|
-
"""
|
|
454
254
|
logger.debug(f"Proxy GET request: {url}")
|
|
455
255
|
return self._proxy_verb("GET", url, proxy_config, timeout, **kwargs)
|
|
456
256
|
|
|
@@ -462,17 +262,6 @@ class ThordataClient:
|
|
|
462
262
|
timeout: int | None = None,
|
|
463
263
|
**kwargs: Any,
|
|
464
264
|
) -> requests.Response:
|
|
465
|
-
"""Make a POST request through the proxy network.
|
|
466
|
-
|
|
467
|
-
Args:
|
|
468
|
-
url: Target URL to request.
|
|
469
|
-
proxy_config: Proxy configuration. If not provided, uses environment variables.
|
|
470
|
-
timeout: Request timeout in seconds.
|
|
471
|
-
**kwargs: Additional arguments passed to requests.
|
|
472
|
-
|
|
473
|
-
Returns:
|
|
474
|
-
Response object.
|
|
475
|
-
"""
|
|
476
265
|
logger.debug(f"Proxy POST request: {url}")
|
|
477
266
|
return self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)
|
|
478
267
|
|
|
@@ -488,21 +277,6 @@ class ThordataClient:
|
|
|
488
277
|
session_duration: int | None = None,
|
|
489
278
|
product: ProxyProduct | str = ProxyProduct.RESIDENTIAL,
|
|
490
279
|
) -> str:
|
|
491
|
-
"""Build a proxy URL with location and session parameters.
|
|
492
|
-
|
|
493
|
-
Args:
|
|
494
|
-
username: Proxy username.
|
|
495
|
-
password: Proxy password.
|
|
496
|
-
country: Country code (e.g., "us", "uk").
|
|
497
|
-
state: State/region code (e.g., "ca", "ny").
|
|
498
|
-
city: City name (e.g., "new-york", "london").
|
|
499
|
-
session_id: Session identifier for sticky sessions.
|
|
500
|
-
session_duration: Session duration in minutes (1-90).
|
|
501
|
-
product: Proxy product type (RESIDENTIAL, DATACENTER, MOBILE).
|
|
502
|
-
|
|
503
|
-
Returns:
|
|
504
|
-
Formatted proxy URL.
|
|
505
|
-
"""
|
|
506
280
|
config = ProxyConfig(
|
|
507
281
|
username=username,
|
|
508
282
|
password=password,
|
|
@@ -536,24 +310,6 @@ class ThordataClient:
|
|
|
536
310
|
output_format: str = "json",
|
|
537
311
|
**kwargs: Any,
|
|
538
312
|
) -> dict[str, Any]:
|
|
539
|
-
"""Perform a search engine query using SERP API.
|
|
540
|
-
|
|
541
|
-
Args:
|
|
542
|
-
query: Search query string.
|
|
543
|
-
engine: Search engine (GOOGLE, BING, YAHOO, etc.).
|
|
544
|
-
num: Number of results to return.
|
|
545
|
-
country: Country code for localized results.
|
|
546
|
-
language: Language code for interface.
|
|
547
|
-
search_type: Type of search (images, news, video, etc.).
|
|
548
|
-
device: Device type (desktop, mobile).
|
|
549
|
-
render_js: Whether to render JavaScript.
|
|
550
|
-
no_cache: Bypass cache.
|
|
551
|
-
output_format: Output format ("json" or "html").
|
|
552
|
-
**kwargs: Additional engine-specific parameters.
|
|
553
|
-
|
|
554
|
-
Returns:
|
|
555
|
-
Search results as dictionary.
|
|
556
|
-
"""
|
|
557
313
|
engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
|
|
558
314
|
|
|
559
315
|
request = SerpRequest(
|
|
@@ -569,18 +325,9 @@ class ThordataClient:
|
|
|
569
325
|
output_format=output_format,
|
|
570
326
|
extra_params=kwargs,
|
|
571
327
|
)
|
|
572
|
-
|
|
573
328
|
return self.serp_search_advanced(request)
|
|
574
329
|
|
|
575
330
|
def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
|
|
576
|
-
"""Perform advanced search with a SerpRequest object.
|
|
577
|
-
|
|
578
|
-
Args:
|
|
579
|
-
request: SerpRequest object with search parameters.
|
|
580
|
-
|
|
581
|
-
Returns:
|
|
582
|
-
Search results as dictionary.
|
|
583
|
-
"""
|
|
584
331
|
if not self.scraper_token:
|
|
585
332
|
raise ThordataConfigError("scraper_token is required for SERP API")
|
|
586
333
|
|
|
@@ -589,30 +336,24 @@ class ThordataClient:
|
|
|
589
336
|
|
|
590
337
|
logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")
|
|
591
338
|
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
response.raise_for_status()
|
|
600
|
-
|
|
601
|
-
if request.output_format.lower() == "json":
|
|
602
|
-
data = response.json()
|
|
603
|
-
if isinstance(data, dict):
|
|
604
|
-
code = data.get("code")
|
|
605
|
-
if code is not None and code != 200:
|
|
606
|
-
msg = extract_error_message(data)
|
|
607
|
-
raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
|
|
608
|
-
return parse_json_response(data)
|
|
339
|
+
response = self._api_request_with_retry(
|
|
340
|
+
"POST",
|
|
341
|
+
self._serp_url,
|
|
342
|
+
data=payload,
|
|
343
|
+
headers=headers,
|
|
344
|
+
)
|
|
345
|
+
response.raise_for_status()
|
|
609
346
|
|
|
610
|
-
|
|
347
|
+
if request.output_format.lower() == "json":
|
|
348
|
+
data = response.json()
|
|
349
|
+
if isinstance(data, dict):
|
|
350
|
+
code = data.get("code")
|
|
351
|
+
if code is not None and code != 200:
|
|
352
|
+
msg = extract_error_message(data)
|
|
353
|
+
raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
|
|
354
|
+
return parse_json_response(data)
|
|
611
355
|
|
|
612
|
-
|
|
613
|
-
raise ThordataTimeoutError(f"SERP timeout: {e}", original_error=e) from e
|
|
614
|
-
except requests.RequestException as e:
|
|
615
|
-
raise ThordataNetworkError(f"SERP failed: {e}", original_error=e) from e
|
|
356
|
+
return {"html": response.text}
|
|
616
357
|
|
|
617
358
|
# =========================================================================
|
|
618
359
|
# Universal Scraping API (WEB UNLOCKER) Methods
|
|
@@ -630,21 +371,6 @@ class ThordataClient:
|
|
|
630
371
|
wait_for: str | None = None,
|
|
631
372
|
**kwargs: Any,
|
|
632
373
|
) -> str | bytes:
|
|
633
|
-
"""Scrape a URL using Universal Scraping API.
|
|
634
|
-
|
|
635
|
-
Args:
|
|
636
|
-
url: Target URL to scrape.
|
|
637
|
-
js_render: Whether to render JavaScript.
|
|
638
|
-
output_format: Output format ("html" or "png").
|
|
639
|
-
country: Country for IP geolocation.
|
|
640
|
-
block_resources: Block specific resources (e.g., "script,css").
|
|
641
|
-
wait: Wait time in milliseconds before fetching.
|
|
642
|
-
wait_for: CSS selector to wait for before fetching.
|
|
643
|
-
**kwargs: Additional parameters.
|
|
644
|
-
|
|
645
|
-
Returns:
|
|
646
|
-
Scraped content as string (HTML) or bytes (PNG).
|
|
647
|
-
"""
|
|
648
374
|
request = UniversalScrapeRequest(
|
|
649
375
|
url=url,
|
|
650
376
|
js_render=js_render,
|
|
@@ -658,40 +384,17 @@ class ThordataClient:
|
|
|
658
384
|
return self.universal_scrape_advanced(request)
|
|
659
385
|
|
|
660
386
|
def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
|
|
661
|
-
"""Scrape with advanced options using UniversalScrapeRequest.
|
|
662
|
-
|
|
663
|
-
Args:
|
|
664
|
-
request: UniversalScrapeRequest object with scrape parameters.
|
|
665
|
-
|
|
666
|
-
Returns:
|
|
667
|
-
Scraped content as string (HTML) or bytes (PNG).
|
|
668
|
-
"""
|
|
669
387
|
if not self.scraper_token:
|
|
670
|
-
raise ThordataConfigError("scraper_token
|
|
388
|
+
raise ThordataConfigError("scraper_token required")
|
|
671
389
|
|
|
672
390
|
payload = request.to_payload()
|
|
673
391
|
headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
|
|
674
392
|
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
self._universal_url,
|
|
681
|
-
data=payload,
|
|
682
|
-
headers=headers,
|
|
683
|
-
)
|
|
684
|
-
response.raise_for_status()
|
|
685
|
-
return self._process_universal_response(response, request.output_format)
|
|
686
|
-
|
|
687
|
-
except requests.Timeout as e:
|
|
688
|
-
raise ThordataTimeoutError(
|
|
689
|
-
f"Universal timeout: {e}", original_error=e
|
|
690
|
-
) from e
|
|
691
|
-
except requests.RequestException as e:
|
|
692
|
-
raise ThordataNetworkError(
|
|
693
|
-
f"Universal failed: {e}", original_error=e
|
|
694
|
-
) from e
|
|
393
|
+
response = self._api_request_with_retry(
|
|
394
|
+
"POST", self._universal_url, data=payload, headers=headers
|
|
395
|
+
)
|
|
396
|
+
response.raise_for_status()
|
|
397
|
+
return self._process_universal_response(response, request.output_format)
|
|
695
398
|
|
|
696
399
|
# =========================================================================
|
|
697
400
|
# Web Scraper API - Task Management
|
|
@@ -702,21 +405,9 @@ class ThordataClient:
|
|
|
702
405
|
file_name: str,
|
|
703
406
|
spider_id: str,
|
|
704
407
|
spider_name: str,
|
|
705
|
-
parameters: dict[str, Any],
|
|
408
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
706
409
|
universal_params: dict[str, Any] | None = None,
|
|
707
410
|
) -> str:
|
|
708
|
-
"""Create a web scraping task.
|
|
709
|
-
|
|
710
|
-
Args:
|
|
711
|
-
file_name: Name for the output file (supports {{TasksID}} template).
|
|
712
|
-
spider_id: Spider identifier from Dashboard.
|
|
713
|
-
spider_name: Spider name (target domain, e.g., "amazon.com").
|
|
714
|
-
parameters: Spider-specific parameters.
|
|
715
|
-
universal_params: Global spider settings.
|
|
716
|
-
|
|
717
|
-
Returns:
|
|
718
|
-
Task ID.
|
|
719
|
-
"""
|
|
720
411
|
config = ScraperTaskConfig(
|
|
721
412
|
file_name=file_name,
|
|
722
413
|
spider_id=spider_id,
|
|
@@ -726,59 +417,82 @@ class ThordataClient:
|
|
|
726
417
|
)
|
|
727
418
|
return self.create_scraper_task_advanced(config)
|
|
728
419
|
|
|
729
|
-
def
|
|
730
|
-
|
|
420
|
+
def run_tool(
|
|
421
|
+
self,
|
|
422
|
+
tool_request: Any,
|
|
423
|
+
file_name: str | None = None,
|
|
424
|
+
universal_params: dict[str, Any] | None = None,
|
|
425
|
+
) -> str:
|
|
426
|
+
"""
|
|
427
|
+
Run a specific pre-defined tool.
|
|
428
|
+
Supports both standard Scrapers and Video downloaders.
|
|
429
|
+
"""
|
|
430
|
+
if not hasattr(tool_request, "to_task_parameters") or not hasattr(
|
|
431
|
+
tool_request, "get_spider_id"
|
|
432
|
+
):
|
|
433
|
+
raise ValueError(
|
|
434
|
+
"tool_request must be an instance of a thordata.tools class"
|
|
435
|
+
)
|
|
731
436
|
|
|
732
|
-
|
|
733
|
-
|
|
437
|
+
spider_id = tool_request.get_spider_id()
|
|
438
|
+
spider_name = tool_request.get_spider_name()
|
|
439
|
+
params = tool_request.to_task_parameters()
|
|
734
440
|
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
441
|
+
if not file_name:
|
|
442
|
+
import uuid
|
|
443
|
+
|
|
444
|
+
short_id = uuid.uuid4().hex[:8]
|
|
445
|
+
file_name = f"{spider_id}_{short_id}"
|
|
446
|
+
|
|
447
|
+
# Check if it's a Video Tool (Duck typing check for common_settings)
|
|
448
|
+
if hasattr(tool_request, "common_settings"):
|
|
449
|
+
# It is a Video Task
|
|
450
|
+
config_video = VideoTaskConfig(
|
|
451
|
+
file_name=file_name,
|
|
452
|
+
spider_id=spider_id,
|
|
453
|
+
spider_name=spider_name,
|
|
454
|
+
parameters=params,
|
|
455
|
+
common_settings=tool_request.common_settings,
|
|
456
|
+
)
|
|
457
|
+
return self.create_video_task_advanced(config_video)
|
|
458
|
+
else:
|
|
459
|
+
# It is a Standard Scraper Task
|
|
460
|
+
config = ScraperTaskConfig(
|
|
461
|
+
file_name=file_name,
|
|
462
|
+
spider_id=spider_id,
|
|
463
|
+
spider_name=spider_name,
|
|
464
|
+
parameters=params,
|
|
465
|
+
universal_params=universal_params,
|
|
466
|
+
)
|
|
467
|
+
return self.create_scraper_task_advanced(config)
|
|
468
|
+
|
|
469
|
+
def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
|
|
738
470
|
self._require_public_credentials()
|
|
739
471
|
if not self.scraper_token:
|
|
740
472
|
raise ThordataConfigError("scraper_token is required for Task Builder")
|
|
473
|
+
|
|
741
474
|
payload = config.to_payload()
|
|
742
475
|
headers = build_builder_headers(
|
|
743
|
-
self.scraper_token, self.public_token
|
|
476
|
+
self.scraper_token, str(self.public_token), str(self.public_key)
|
|
744
477
|
)
|
|
745
478
|
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
"Task creation failed", code=data.get("code"), payload=data
|
|
755
|
-
)
|
|
756
|
-
return data["data"]["task_id"]
|
|
757
|
-
except requests.RequestException as e:
|
|
758
|
-
raise ThordataNetworkError(
|
|
759
|
-
f"Task creation failed: {e}", original_error=e
|
|
760
|
-
) from e
|
|
479
|
+
response = self._api_request_with_retry(
|
|
480
|
+
"POST", self._builder_url, data=payload, headers=headers
|
|
481
|
+
)
|
|
482
|
+
response.raise_for_status()
|
|
483
|
+
data = response.json()
|
|
484
|
+
if data.get("code") != 200:
|
|
485
|
+
raise_for_code("Task creation failed", code=data.get("code"), payload=data)
|
|
486
|
+
return data["data"]["task_id"]
|
|
761
487
|
|
|
762
488
|
def create_video_task(
|
|
763
489
|
self,
|
|
764
490
|
file_name: str,
|
|
765
491
|
spider_id: str,
|
|
766
492
|
spider_name: str,
|
|
767
|
-
parameters: dict[str, Any],
|
|
493
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
768
494
|
common_settings: CommonSettings,
|
|
769
495
|
) -> str:
|
|
770
|
-
"""Create a video/audio download task (YouTube, etc.).
|
|
771
|
-
|
|
772
|
-
Args:
|
|
773
|
-
file_name: Name for the output file.
|
|
774
|
-
spider_id: Spider identifier (e.g., "youtube_video_by-url").
|
|
775
|
-
spider_name: Target site (e.g., "youtube.com").
|
|
776
|
-
parameters: Spider-specific parameters (URLs, etc.).
|
|
777
|
-
common_settings: Video/audio settings (resolution, subtitles, etc.).
|
|
778
|
-
|
|
779
|
-
Returns:
|
|
780
|
-
Task ID.
|
|
781
|
-
"""
|
|
782
496
|
config = VideoTaskConfig(
|
|
783
497
|
file_name=file_name,
|
|
784
498
|
spider_id=spider_id,
|
|
@@ -789,14 +503,6 @@ class ThordataClient:
|
|
|
789
503
|
return self.create_video_task_advanced(config)
|
|
790
504
|
|
|
791
505
|
def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
|
|
792
|
-
"""Create a video task with advanced configuration.
|
|
793
|
-
|
|
794
|
-
Args:
|
|
795
|
-
config: VideoTaskConfig object with task configuration.
|
|
796
|
-
|
|
797
|
-
Returns:
|
|
798
|
-
Task ID.
|
|
799
|
-
"""
|
|
800
506
|
self._require_public_credentials()
|
|
801
507
|
if not self.scraper_token:
|
|
802
508
|
raise ThordataConfigError(
|
|
@@ -805,7 +511,7 @@ class ThordataClient:
|
|
|
805
511
|
|
|
806
512
|
payload = config.to_payload()
|
|
807
513
|
headers = build_builder_headers(
|
|
808
|
-
self.scraper_token, self.public_token
|
|
514
|
+
self.scraper_token, str(self.public_token), str(self.public_key)
|
|
809
515
|
)
|
|
810
516
|
|
|
811
517
|
response = self._api_request_with_retry(
|
|
@@ -820,100 +526,78 @@ class ThordataClient:
|
|
|
820
526
|
return data["data"]["task_id"]
|
|
821
527
|
|
|
822
528
|
def get_task_status(self, task_id: str) -> str:
|
|
823
|
-
|
|
529
|
+
self._require_public_credentials()
|
|
530
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
824
531
|
|
|
825
|
-
|
|
826
|
-
|
|
532
|
+
response = self._api_request_with_retry(
|
|
533
|
+
"POST",
|
|
534
|
+
self._status_url,
|
|
535
|
+
data={"tasks_ids": task_id},
|
|
536
|
+
headers=headers,
|
|
537
|
+
)
|
|
538
|
+
response.raise_for_status()
|
|
539
|
+
data = response.json()
|
|
540
|
+
if data.get("code") != 200:
|
|
541
|
+
raise_for_code("Task status error", code=data.get("code"), payload=data)
|
|
542
|
+
|
|
543
|
+
items = data.get("data") or []
|
|
544
|
+
for item in items:
|
|
545
|
+
if str(item.get("task_id")) == str(task_id):
|
|
546
|
+
return item.get("status", "unknown")
|
|
547
|
+
return "unknown"
|
|
827
548
|
|
|
828
|
-
|
|
829
|
-
|
|
549
|
+
def get_latest_task_status(self) -> dict[str, Any]:
|
|
550
|
+
"""
|
|
551
|
+
Get the status of the last task of the specified account.
|
|
830
552
|
"""
|
|
831
553
|
self._require_public_credentials()
|
|
832
|
-
headers = build_public_api_headers(
|
|
833
|
-
|
|
554
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
555
|
+
parsed = urlparse(self._status_url)
|
|
556
|
+
base = f"{parsed.scheme}://{parsed.netloc}"
|
|
557
|
+
endpoint = "/api/web_scraper_api/get_latest_task_status"
|
|
558
|
+
|
|
559
|
+
response = self._api_request_with_retry(
|
|
560
|
+
"POST",
|
|
561
|
+
f"{base}{endpoint}",
|
|
562
|
+
headers=headers,
|
|
834
563
|
)
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
"POST",
|
|
838
|
-
self._status_url,
|
|
839
|
-
data={"tasks_ids": task_id},
|
|
840
|
-
headers=headers,
|
|
841
|
-
)
|
|
842
|
-
response.raise_for_status()
|
|
843
|
-
data = response.json()
|
|
844
|
-
if data.get("code") != 200:
|
|
845
|
-
raise_for_code("Task status error", code=data.get("code"), payload=data)
|
|
846
|
-
|
|
847
|
-
items = data.get("data") or []
|
|
848
|
-
for item in items:
|
|
849
|
-
if str(item.get("task_id")) == str(task_id):
|
|
850
|
-
return item.get("status", "unknown")
|
|
851
|
-
return "unknown"
|
|
852
|
-
except requests.RequestException as e:
|
|
853
|
-
raise ThordataNetworkError(
|
|
854
|
-
f"Status check failed: {e}", original_error=e
|
|
855
|
-
) from e
|
|
564
|
+
response.raise_for_status()
|
|
565
|
+
data = response.json()
|
|
856
566
|
|
|
857
|
-
|
|
858
|
-
|
|
567
|
+
if data.get("code") != 200:
|
|
568
|
+
raise_for_code(
|
|
569
|
+
"Get latest task status failed", code=data.get("code"), payload=data
|
|
570
|
+
)
|
|
859
571
|
|
|
860
|
-
|
|
861
|
-
task_id: Task identifier.
|
|
572
|
+
return data.get("data", {})
|
|
862
573
|
|
|
863
|
-
|
|
864
|
-
Status string or "error" on failure.
|
|
865
|
-
"""
|
|
574
|
+
def safe_get_task_status(self, task_id: str) -> str:
|
|
866
575
|
try:
|
|
867
576
|
return self.get_task_status(task_id)
|
|
868
577
|
except Exception:
|
|
869
578
|
return "error"
|
|
870
579
|
|
|
871
580
|
def get_task_result(self, task_id: str, file_type: str = "json") -> str:
|
|
872
|
-
"""Get the download URL for a completed task.
|
|
873
|
-
|
|
874
|
-
Args:
|
|
875
|
-
task_id: Task identifier.
|
|
876
|
-
file_type: File type to download (json, csv, video, audio, subtitle).
|
|
877
|
-
|
|
878
|
-
Returns:
|
|
879
|
-
Download URL.
|
|
880
|
-
"""
|
|
881
581
|
self._require_public_credentials()
|
|
882
|
-
headers = build_public_api_headers(
|
|
883
|
-
|
|
582
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
583
|
+
|
|
584
|
+
response = self._api_request_with_retry(
|
|
585
|
+
"POST",
|
|
586
|
+
self._download_url,
|
|
587
|
+
data={"tasks_id": task_id, "type": file_type},
|
|
588
|
+
headers=headers,
|
|
884
589
|
)
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
)
|
|
892
|
-
response.raise_for_status()
|
|
893
|
-
data = response.json()
|
|
894
|
-
if data.get("code") == 200 and data.get("data"):
|
|
895
|
-
return data["data"]["download"]
|
|
896
|
-
raise_for_code("Get result failed", code=data.get("code"), payload=data)
|
|
897
|
-
return ""
|
|
898
|
-
except requests.RequestException as e:
|
|
899
|
-
raise ThordataNetworkError(
|
|
900
|
-
f"Get result failed: {e}", original_error=e
|
|
901
|
-
) from e
|
|
590
|
+
response.raise_for_status()
|
|
591
|
+
data = response.json()
|
|
592
|
+
if data.get("code") == 200 and data.get("data"):
|
|
593
|
+
return data["data"]["download"]
|
|
594
|
+
raise_for_code("Get result failed", code=data.get("code"), payload=data)
|
|
595
|
+
return ""
|
|
902
596
|
|
|
903
597
|
def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
|
|
904
|
-
"""List all scraping tasks.
|
|
905
|
-
|
|
906
|
-
Args:
|
|
907
|
-
page: Page number for pagination.
|
|
908
|
-
size: Number of items per page.
|
|
909
|
-
|
|
910
|
-
Returns:
|
|
911
|
-
Dictionary with count and list of tasks.
|
|
912
|
-
"""
|
|
913
598
|
self._require_public_credentials()
|
|
914
|
-
headers = build_public_api_headers(
|
|
915
|
-
|
|
916
|
-
)
|
|
599
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
600
|
+
|
|
917
601
|
response = self._api_request_with_retry(
|
|
918
602
|
"POST",
|
|
919
603
|
self._list_url,
|
|
@@ -933,16 +617,6 @@ class ThordataClient:
|
|
|
933
617
|
poll_interval: float = 5.0,
|
|
934
618
|
max_wait: float = 600.0,
|
|
935
619
|
) -> str:
|
|
936
|
-
"""Wait for a task to complete.
|
|
937
|
-
|
|
938
|
-
Args:
|
|
939
|
-
task_id: Task identifier.
|
|
940
|
-
poll_interval: Polling interval in seconds.
|
|
941
|
-
max_wait: Maximum time to wait in seconds.
|
|
942
|
-
|
|
943
|
-
Returns:
|
|
944
|
-
Final status of the task.
|
|
945
|
-
"""
|
|
946
620
|
import time
|
|
947
621
|
|
|
948
622
|
start = time.monotonic()
|
|
@@ -965,49 +639,21 @@ class ThordataClient:
|
|
|
965
639
|
file_name: str,
|
|
966
640
|
spider_id: str,
|
|
967
641
|
spider_name: str,
|
|
968
|
-
parameters: dict[str, Any],
|
|
642
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
969
643
|
universal_params: dict[str, Any] | None = None,
|
|
970
644
|
*,
|
|
971
645
|
max_wait: float = 600.0,
|
|
972
646
|
initial_poll_interval: float = 2.0,
|
|
973
647
|
max_poll_interval: float = 10.0,
|
|
974
648
|
include_errors: bool = True,
|
|
975
|
-
|
|
976
|
-
task_type: str = "web", # "web" or "video"
|
|
649
|
+
task_type: str = "web",
|
|
977
650
|
common_settings: CommonSettings | None = None,
|
|
978
651
|
) -> str:
|
|
979
|
-
"""High-level wrapper to run a task and wait for result.
|
|
980
|
-
|
|
981
|
-
This method handles the entire lifecycle:
|
|
982
|
-
1. Create Task
|
|
983
|
-
2. Poll status (with exponential backoff)
|
|
984
|
-
3. Get download URL when ready
|
|
985
|
-
|
|
986
|
-
Args:
|
|
987
|
-
file_name: Name for the output file.
|
|
988
|
-
spider_id: Spider identifier from Dashboard.
|
|
989
|
-
spider_name: Spider name (target domain).
|
|
990
|
-
parameters: Spider-specific parameters.
|
|
991
|
-
universal_params: Global spider settings.
|
|
992
|
-
max_wait: Maximum seconds to wait for completion.
|
|
993
|
-
initial_poll_interval: Starting poll interval in seconds.
|
|
994
|
-
max_poll_interval: Maximum poll interval cap.
|
|
995
|
-
include_errors: Whether to include error logs.
|
|
996
|
-
|
|
997
|
-
Returns:
|
|
998
|
-
The download URL for the task result.
|
|
999
|
-
|
|
1000
|
-
Raises:
|
|
1001
|
-
ThordataTimeoutError: If task takes longer than max_wait.
|
|
1002
|
-
ThordataAPIError: If task fails or is cancelled.
|
|
1003
|
-
"""
|
|
1004
652
|
import time
|
|
1005
653
|
|
|
1006
|
-
# 1. Create Task
|
|
1007
654
|
if task_type == "video":
|
|
1008
655
|
if common_settings is None:
|
|
1009
656
|
raise ValueError("common_settings is required for video tasks")
|
|
1010
|
-
|
|
1011
657
|
config_video = VideoTaskConfig(
|
|
1012
658
|
file_name=file_name,
|
|
1013
659
|
spider_id=spider_id,
|
|
@@ -1028,9 +674,8 @@ class ThordataClient:
|
|
|
1028
674
|
)
|
|
1029
675
|
task_id = self.create_scraper_task_advanced(config)
|
|
1030
676
|
|
|
1031
|
-
logger.info(f"Task created
|
|
677
|
+
logger.info(f"Task created: {task_id}. Polling...")
|
|
1032
678
|
|
|
1033
|
-
# 2. Poll Status (Smart Backoff)
|
|
1034
679
|
start_time = time.monotonic()
|
|
1035
680
|
current_poll = initial_poll_interval
|
|
1036
681
|
|
|
@@ -1039,20 +684,17 @@ class ThordataClient:
|
|
|
1039
684
|
status_lower = status.lower()
|
|
1040
685
|
|
|
1041
686
|
if status_lower in {"ready", "success", "finished"}:
|
|
1042
|
-
logger.info(f"Task {task_id} finished. Status: {status}")
|
|
1043
|
-
# 3. Get Result
|
|
1044
687
|
return self.get_task_result(task_id)
|
|
1045
688
|
|
|
1046
689
|
if status_lower in {"failed", "error", "cancelled"}:
|
|
1047
690
|
raise ThordataNetworkError(
|
|
1048
|
-
f"Task {task_id}
|
|
691
|
+
f"Task {task_id} failed with status: {status}"
|
|
1049
692
|
)
|
|
1050
693
|
|
|
1051
|
-
# Wait and increase interval (capped)
|
|
1052
694
|
time.sleep(current_poll)
|
|
1053
695
|
current_poll = min(current_poll * 1.5, max_poll_interval)
|
|
1054
696
|
|
|
1055
|
-
raise ThordataTimeoutError(f"Task {task_id} timed out
|
|
697
|
+
raise ThordataTimeoutError(f"Task {task_id} timed out")
|
|
1056
698
|
|
|
1057
699
|
# =========================================================================
|
|
1058
700
|
# Account & Usage Methods
|
|
@@ -1063,15 +705,6 @@ class ThordataClient:
|
|
|
1063
705
|
from_date: str | date,
|
|
1064
706
|
to_date: str | date,
|
|
1065
707
|
) -> UsageStatistics:
|
|
1066
|
-
"""Get usage statistics for a date range.
|
|
1067
|
-
|
|
1068
|
-
Args:
|
|
1069
|
-
from_date: Start date (YYYY-MM-DD format or date object).
|
|
1070
|
-
to_date: End date (YYYY-MM-DD format or date object).
|
|
1071
|
-
|
|
1072
|
-
Returns:
|
|
1073
|
-
UsageStatistics object with traffic data.
|
|
1074
|
-
"""
|
|
1075
708
|
self._require_public_credentials()
|
|
1076
709
|
if isinstance(from_date, date):
|
|
1077
710
|
from_date = from_date.strftime("%Y-%m-%d")
|
|
@@ -1094,17 +727,9 @@ class ThordataClient:
|
|
|
1094
727
|
return UsageStatistics.from_dict(data.get("data", data))
|
|
1095
728
|
|
|
1096
729
|
def get_traffic_balance(self) -> float:
|
|
1097
|
-
"""
|
|
1098
|
-
Get the current traffic balance in KB via Public API.
|
|
1099
|
-
"""
|
|
1100
730
|
self._require_public_credentials()
|
|
1101
|
-
|
|
1102
|
-
params = {
|
|
1103
|
-
"token": self.public_token,
|
|
1104
|
-
"key": self.public_key,
|
|
1105
|
-
}
|
|
731
|
+
params = {"token": self.public_token, "key": self.public_key}
|
|
1106
732
|
api_base = self._locations_base_url.replace("/locations", "")
|
|
1107
|
-
|
|
1108
733
|
response = self._api_request_with_retry(
|
|
1109
734
|
"GET", f"{api_base}/account/traffic-balance", params=params
|
|
1110
735
|
)
|
|
@@ -1114,21 +739,12 @@ class ThordataClient:
|
|
|
1114
739
|
raise_for_code(
|
|
1115
740
|
"Get traffic balance failed", code=data.get("code"), payload=data
|
|
1116
741
|
)
|
|
1117
|
-
|
|
1118
742
|
return float(data.get("data", {}).get("traffic_balance", 0))
|
|
1119
743
|
|
|
1120
744
|
def get_wallet_balance(self) -> float:
|
|
1121
|
-
"""
|
|
1122
|
-
Get the current wallet balance via Public API.
|
|
1123
|
-
"""
|
|
1124
745
|
self._require_public_credentials()
|
|
1125
|
-
|
|
1126
|
-
params = {
|
|
1127
|
-
"token": self.public_token,
|
|
1128
|
-
"key": self.public_key,
|
|
1129
|
-
}
|
|
746
|
+
params = {"token": self.public_token, "key": self.public_key}
|
|
1130
747
|
api_base = self._locations_base_url.replace("/locations", "")
|
|
1131
|
-
|
|
1132
748
|
response = self._api_request_with_retry(
|
|
1133
749
|
"GET", f"{api_base}/account/wallet-balance", params=params
|
|
1134
750
|
)
|
|
@@ -1138,7 +754,6 @@ class ThordataClient:
|
|
|
1138
754
|
raise_for_code(
|
|
1139
755
|
"Get wallet balance failed", code=data.get("code"), payload=data
|
|
1140
756
|
)
|
|
1141
|
-
|
|
1142
757
|
return float(data.get("data", {}).get("balance", 0))
|
|
1143
758
|
|
|
1144
759
|
def get_proxy_user_usage(
|
|
@@ -1148,21 +763,8 @@ class ThordataClient:
|
|
|
1148
763
|
end_date: str | date,
|
|
1149
764
|
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1150
765
|
) -> list[dict[str, Any]]:
|
|
1151
|
-
"""
|
|
1152
|
-
Get traffic usage statistics for a specific proxy user.
|
|
1153
|
-
|
|
1154
|
-
Args:
|
|
1155
|
-
username: Sub-account username.
|
|
1156
|
-
start_date: Start date (YYYY-MM-DD).
|
|
1157
|
-
end_date: End date (YYYY-MM-DD).
|
|
1158
|
-
proxy_type: Proxy product type.
|
|
1159
|
-
|
|
1160
|
-
Returns:
|
|
1161
|
-
List of daily usage records.
|
|
1162
|
-
"""
|
|
1163
766
|
self._require_public_credentials()
|
|
1164
767
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1165
|
-
|
|
1166
768
|
if isinstance(start_date, date):
|
|
1167
769
|
start_date = start_date.strftime("%Y-%m-%d")
|
|
1168
770
|
if isinstance(end_date, date):
|
|
@@ -1176,7 +778,6 @@ class ThordataClient:
|
|
|
1176
778
|
"from_date": start_date,
|
|
1177
779
|
"to_date": end_date,
|
|
1178
780
|
}
|
|
1179
|
-
|
|
1180
781
|
response = self._api_request_with_retry(
|
|
1181
782
|
"GET", f"{self._proxy_users_url}/usage-statistics", params=params
|
|
1182
783
|
)
|
|
@@ -1184,10 +785,51 @@ class ThordataClient:
|
|
|
1184
785
|
data = response.json()
|
|
1185
786
|
if data.get("code") != 200:
|
|
1186
787
|
raise_for_code("Get user usage failed", code=data.get("code"), payload=data)
|
|
1187
|
-
|
|
1188
|
-
# Structure: { "data": [ { "date": "...", "usage_traffic": ... } ] }
|
|
1189
788
|
return data.get("data", [])
|
|
1190
789
|
|
|
790
|
+
def get_proxy_user_usage_hour(
|
|
791
|
+
self,
|
|
792
|
+
username: str,
|
|
793
|
+
from_date: str, # Format: yyyy-mm-dd HH
|
|
794
|
+
to_date: str, # Format: yyyy-mm-dd HH
|
|
795
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
796
|
+
) -> list[dict[str, Any]]:
|
|
797
|
+
"""
|
|
798
|
+
Get proxy user traffic usage logs by hour.
|
|
799
|
+
|
|
800
|
+
Args:
|
|
801
|
+
username: The proxy username.
|
|
802
|
+
from_date: Start date string (yyyy-mm-dd HH).
|
|
803
|
+
to_date: End date string (yyyy-mm-dd HH).
|
|
804
|
+
proxy_type: Proxy type (default: Residential).
|
|
805
|
+
"""
|
|
806
|
+
self._require_public_credentials()
|
|
807
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
808
|
+
|
|
809
|
+
params = {
|
|
810
|
+
"token": self.public_token,
|
|
811
|
+
"key": self.public_key,
|
|
812
|
+
"proxy_type": str(pt),
|
|
813
|
+
"username": username,
|
|
814
|
+
"from_date": from_date,
|
|
815
|
+
"to_date": to_date,
|
|
816
|
+
}
|
|
817
|
+
response = self._api_request_with_retry(
|
|
818
|
+
"GET", f"{self._proxy_users_url}/usage-statistics-hour", params=params
|
|
819
|
+
)
|
|
820
|
+
response.raise_for_status()
|
|
821
|
+
data = response.json()
|
|
822
|
+
if data.get("code") != 200:
|
|
823
|
+
raise_for_code(
|
|
824
|
+
"Get hourly usage failed", code=data.get("code"), payload=data
|
|
825
|
+
)
|
|
826
|
+
|
|
827
|
+
# API returns { "data": { "data": [...] } } structure
|
|
828
|
+
inner_data = data.get("data", {})
|
|
829
|
+
if isinstance(inner_data, dict):
|
|
830
|
+
return inner_data.get("data", [])
|
|
831
|
+
return []
|
|
832
|
+
|
|
1191
833
|
def extract_ip_list(
|
|
1192
834
|
self,
|
|
1193
835
|
num: int = 1,
|
|
@@ -1199,40 +841,16 @@ class ThordataClient:
|
|
|
1199
841
|
return_type: str = "txt",
|
|
1200
842
|
protocol: str = "http",
|
|
1201
843
|
sep: str = "\r\n",
|
|
1202
|
-
product: str = "residential",
|
|
844
|
+
product: str = "residential",
|
|
1203
845
|
) -> list[str]:
|
|
1204
|
-
"""
|
|
1205
|
-
Extract proxy IP list via API (get-ip.thordata.net).
|
|
1206
|
-
Requires IP whitelist configuration.
|
|
1207
|
-
|
|
1208
|
-
Args:
|
|
1209
|
-
num: Number of IPs to extract.
|
|
1210
|
-
country: Country code.
|
|
1211
|
-
state: State code.
|
|
1212
|
-
city: City name.
|
|
1213
|
-
time_limit: Session duration (1-90 mins).
|
|
1214
|
-
port: Specific port.
|
|
1215
|
-
return_type: "txt" or "json".
|
|
1216
|
-
protocol: "http" or "socks5".
|
|
1217
|
-
sep: Separator for txt output.
|
|
1218
|
-
product: "residential" or "unlimited".
|
|
1219
|
-
|
|
1220
|
-
Returns:
|
|
1221
|
-
List of "IP:Port" strings.
|
|
1222
|
-
"""
|
|
1223
|
-
# Determine endpoint based on product
|
|
1224
846
|
base_url = "https://get-ip.thordata.net"
|
|
1225
847
|
endpoint = "/unlimited_api" if product == "unlimited" else "/api"
|
|
1226
|
-
|
|
1227
|
-
# Build params
|
|
1228
848
|
params: dict[str, Any] = {
|
|
1229
849
|
"num": str(num),
|
|
1230
850
|
"return_type": return_type,
|
|
1231
851
|
"protocol": protocol,
|
|
1232
852
|
"sep": sep,
|
|
1233
853
|
}
|
|
1234
|
-
|
|
1235
|
-
# Add optional params
|
|
1236
854
|
if country:
|
|
1237
855
|
params["country"] = country
|
|
1238
856
|
if state:
|
|
@@ -1244,21 +862,24 @@ class ThordataClient:
|
|
|
1244
862
|
if port:
|
|
1245
863
|
params["port"] = str(port)
|
|
1246
864
|
|
|
1247
|
-
|
|
865
|
+
if product == "unlimited":
|
|
866
|
+
username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
|
|
867
|
+
"THORDATA_RESIDENTIAL_USERNAME"
|
|
868
|
+
)
|
|
869
|
+
else:
|
|
870
|
+
username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
|
|
1248
871
|
if username:
|
|
1249
872
|
params["td-customer"] = username
|
|
1250
873
|
|
|
1251
|
-
response = self.
|
|
1252
|
-
f"{base_url}{endpoint}", params=params
|
|
874
|
+
response = self._api_request_with_retry(
|
|
875
|
+
"GET", f"{base_url}{endpoint}", params=params
|
|
1253
876
|
)
|
|
1254
877
|
response.raise_for_status()
|
|
1255
878
|
|
|
1256
|
-
# Parse result
|
|
1257
879
|
if return_type == "json":
|
|
1258
880
|
data = response.json()
|
|
1259
|
-
# JSON format: { "code": 0, "data": [ { "ip": "...", "port": ... } ] }
|
|
1260
881
|
if isinstance(data, dict):
|
|
1261
|
-
if data.get("code")
|
|
882
|
+
if data.get("code") in (0, 200):
|
|
1262
883
|
raw_list = data.get("data") or []
|
|
1263
884
|
return [f"{item['ip']}:{item['port']}" for item in raw_list]
|
|
1264
885
|
else:
|
|
@@ -1266,40 +887,28 @@ class ThordataClient:
|
|
|
1266
887
|
"Extract IPs failed", code=data.get("code"), payload=data
|
|
1267
888
|
)
|
|
1268
889
|
return []
|
|
1269
|
-
|
|
1270
|
-
else: # txt
|
|
890
|
+
else:
|
|
1271
891
|
text = response.text.strip()
|
|
1272
|
-
# Check for error message in text (often starts with { or contains "error")
|
|
1273
892
|
if text.startswith("{") and "code" in text:
|
|
1274
|
-
# Try parsing as JSON error
|
|
1275
893
|
try:
|
|
1276
|
-
err_data = json
|
|
894
|
+
err_data = response.json()
|
|
1277
895
|
raise_for_code(
|
|
1278
896
|
"Extract IPs failed",
|
|
1279
897
|
code=err_data.get("code"),
|
|
1280
898
|
payload=err_data,
|
|
1281
899
|
)
|
|
1282
|
-
except
|
|
900
|
+
except ValueError:
|
|
1283
901
|
pass
|
|
1284
|
-
|
|
1285
902
|
actual_sep = sep.replace("\\r", "\r").replace("\\n", "\n")
|
|
1286
903
|
return [line.strip() for line in text.split(actual_sep) if line.strip()]
|
|
1287
904
|
|
|
1288
905
|
# =========================================================================
|
|
1289
|
-
# Proxy Users Management
|
|
906
|
+
# Proxy Users Management
|
|
1290
907
|
# =========================================================================
|
|
1291
908
|
|
|
1292
909
|
def list_proxy_users(
|
|
1293
910
|
self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
|
|
1294
911
|
) -> ProxyUserList:
|
|
1295
|
-
"""List all proxy sub-accounts.
|
|
1296
|
-
|
|
1297
|
-
Args:
|
|
1298
|
-
proxy_type: Proxy product type.
|
|
1299
|
-
|
|
1300
|
-
Returns:
|
|
1301
|
-
ProxyUserList with user information.
|
|
1302
|
-
"""
|
|
1303
912
|
self._require_public_credentials()
|
|
1304
913
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1305
914
|
params = {
|
|
@@ -1324,23 +933,9 @@ class ThordataClient:
|
|
|
1324
933
|
traffic_limit: int = 0,
|
|
1325
934
|
status: bool = True,
|
|
1326
935
|
) -> dict[str, Any]:
|
|
1327
|
-
"""Create a new proxy sub-account.
|
|
1328
|
-
|
|
1329
|
-
Args:
|
|
1330
|
-
username: Sub-account username.
|
|
1331
|
-
password: Sub-account password.
|
|
1332
|
-
proxy_type: Proxy product type.
|
|
1333
|
-
traffic_limit: Traffic limit in MB (0 = unlimited).
|
|
1334
|
-
status: Enable or disable the account.
|
|
1335
|
-
|
|
1336
|
-
Returns:
|
|
1337
|
-
API response data.
|
|
1338
|
-
"""
|
|
1339
936
|
self._require_public_credentials()
|
|
1340
937
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1341
|
-
headers = build_public_api_headers(
|
|
1342
|
-
self.public_token or "", self.public_key or ""
|
|
1343
|
-
)
|
|
938
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
1344
939
|
payload = {
|
|
1345
940
|
"proxy_type": str(pt),
|
|
1346
941
|
"username": username,
|
|
@@ -1363,41 +958,36 @@ class ThordataClient:
|
|
|
1363
958
|
def update_proxy_user(
|
|
1364
959
|
self,
|
|
1365
960
|
username: str,
|
|
1366
|
-
password: str,
|
|
961
|
+
password: str,
|
|
1367
962
|
traffic_limit: int | None = None,
|
|
1368
963
|
status: bool | None = None,
|
|
1369
964
|
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
965
|
+
new_username: str | None = None, # Added optional new_username
|
|
1370
966
|
) -> dict[str, Any]:
|
|
1371
967
|
"""
|
|
1372
|
-
Update
|
|
1373
|
-
|
|
1374
|
-
Note: Password is required by the API even if not changing it.
|
|
1375
|
-
|
|
1376
|
-
Args:
|
|
1377
|
-
username: The sub-account username.
|
|
1378
|
-
password: The sub-account password (required for update).
|
|
1379
|
-
traffic_limit: New traffic limit in MB (0 for unlimited). None to keep unchanged.
|
|
1380
|
-
status: New status (True=enabled, False=disabled). None to keep unchanged.
|
|
1381
|
-
proxy_type: Proxy product type.
|
|
1382
|
-
|
|
1383
|
-
Returns:
|
|
1384
|
-
API response data.
|
|
968
|
+
Update a proxy user.
|
|
969
|
+
Note: API requires 'new_' prefixed fields and ALL are required.
|
|
1385
970
|
"""
|
|
1386
971
|
self._require_public_credentials()
|
|
1387
972
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1388
|
-
headers = build_public_api_headers(
|
|
1389
|
-
self.public_token or "", self.public_key or ""
|
|
1390
|
-
)
|
|
973
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
1391
974
|
|
|
975
|
+
# Defaults
|
|
976
|
+
limit_val = str(traffic_limit) if traffic_limit is not None else "0"
|
|
977
|
+
status_val = "true" if (status is None or status) else "false"
|
|
978
|
+
|
|
979
|
+
# If new_username is not provided, keep the old one (API requires new_username field)
|
|
980
|
+
target_username = new_username or username
|
|
981
|
+
|
|
982
|
+
# Mapping to API specific field names (new_...)
|
|
1392
983
|
payload = {
|
|
1393
984
|
"proxy_type": str(pt),
|
|
1394
|
-
"username": username,
|
|
1395
|
-
"
|
|
985
|
+
"username": username, # Who to update
|
|
986
|
+
"new_username": target_username, # Required field
|
|
987
|
+
"new_password": password, # Required field
|
|
988
|
+
"new_traffic_limit": limit_val, # Required field
|
|
989
|
+
"new_status": status_val, # Required field
|
|
1396
990
|
}
|
|
1397
|
-
if traffic_limit is not None:
|
|
1398
|
-
payload["traffic_limit"] = str(traffic_limit)
|
|
1399
|
-
if status is not None:
|
|
1400
|
-
payload["status"] = "true" if status else "false"
|
|
1401
991
|
|
|
1402
992
|
response = self._api_request_with_retry(
|
|
1403
993
|
"POST",
|
|
@@ -1405,7 +995,6 @@ class ThordataClient:
|
|
|
1405
995
|
data=payload,
|
|
1406
996
|
headers=headers,
|
|
1407
997
|
)
|
|
1408
|
-
response.raise_for_status()
|
|
1409
998
|
data = response.json()
|
|
1410
999
|
if data.get("code") != 200:
|
|
1411
1000
|
raise_for_code("Update user failed", code=data.get("code"), payload=data)
|
|
@@ -1416,26 +1005,10 @@ class ThordataClient:
|
|
|
1416
1005
|
username: str,
|
|
1417
1006
|
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1418
1007
|
) -> dict[str, Any]:
|
|
1419
|
-
"""Delete a proxy user.
|
|
1420
|
-
|
|
1421
|
-
Args:
|
|
1422
|
-
username: The sub-account username.
|
|
1423
|
-
proxy_type: Proxy product type.
|
|
1424
|
-
|
|
1425
|
-
Returns:
|
|
1426
|
-
API response data.
|
|
1427
|
-
"""
|
|
1428
1008
|
self._require_public_credentials()
|
|
1429
1009
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1430
|
-
headers = build_public_api_headers(
|
|
1431
|
-
|
|
1432
|
-
)
|
|
1433
|
-
|
|
1434
|
-
payload = {
|
|
1435
|
-
"proxy_type": str(pt),
|
|
1436
|
-
"username": username,
|
|
1437
|
-
}
|
|
1438
|
-
|
|
1010
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
1011
|
+
payload = {"proxy_type": str(pt), "username": username}
|
|
1439
1012
|
response = self._api_request_with_retry(
|
|
1440
1013
|
"POST",
|
|
1441
1014
|
f"{self._proxy_users_url}/delete-user",
|
|
@@ -1458,21 +1031,9 @@ class ThordataClient:
|
|
|
1458
1031
|
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1459
1032
|
status: bool = True,
|
|
1460
1033
|
) -> dict[str, Any]:
|
|
1461
|
-
"""Add an IP to the whitelist.
|
|
1462
|
-
|
|
1463
|
-
Args:
|
|
1464
|
-
ip: IP address to whitelist.
|
|
1465
|
-
proxy_type: Proxy product type.
|
|
1466
|
-
status: Enable or disable the whitelist entry.
|
|
1467
|
-
|
|
1468
|
-
Returns:
|
|
1469
|
-
API response data.
|
|
1470
|
-
"""
|
|
1471
1034
|
self._require_public_credentials()
|
|
1472
1035
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1473
|
-
headers = build_public_api_headers(
|
|
1474
|
-
self.public_token or "", self.public_key or ""
|
|
1475
|
-
)
|
|
1036
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
1476
1037
|
payload = {
|
|
1477
1038
|
"proxy_type": str(pt),
|
|
1478
1039
|
"ip": ip,
|
|
@@ -1494,24 +1055,10 @@ class ThordataClient:
|
|
|
1494
1055
|
ip: str,
|
|
1495
1056
|
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1496
1057
|
) -> dict[str, Any]:
|
|
1497
|
-
"""Delete an IP from the whitelist.
|
|
1498
|
-
|
|
1499
|
-
Args:
|
|
1500
|
-
ip: The IP address to remove.
|
|
1501
|
-
proxy_type: Proxy product type.
|
|
1502
|
-
|
|
1503
|
-
Returns:
|
|
1504
|
-
API response data.
|
|
1505
|
-
"""
|
|
1506
1058
|
self._require_public_credentials()
|
|
1507
1059
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1508
|
-
headers = build_public_api_headers(
|
|
1509
|
-
|
|
1510
|
-
)
|
|
1511
|
-
payload = {
|
|
1512
|
-
"proxy_type": str(pt),
|
|
1513
|
-
"ip": ip,
|
|
1514
|
-
}
|
|
1060
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
1061
|
+
payload = {"proxy_type": str(pt), "ip": ip}
|
|
1515
1062
|
response = self._api_request_with_retry(
|
|
1516
1063
|
"POST", f"{self._whitelist_url}/delete-ip", data=payload, headers=headers
|
|
1517
1064
|
)
|
|
@@ -1527,14 +1074,6 @@ class ThordataClient:
|
|
|
1527
1074
|
self,
|
|
1528
1075
|
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1529
1076
|
) -> list[str]:
|
|
1530
|
-
"""List all whitelisted IPs.
|
|
1531
|
-
|
|
1532
|
-
Args:
|
|
1533
|
-
proxy_type: Proxy product type.
|
|
1534
|
-
|
|
1535
|
-
Returns:
|
|
1536
|
-
List of IP address strings.
|
|
1537
|
-
"""
|
|
1538
1077
|
self._require_public_credentials()
|
|
1539
1078
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1540
1079
|
params = {
|
|
@@ -1552,7 +1091,6 @@ class ThordataClient:
|
|
|
1552
1091
|
"List whitelist IPs failed", code=data.get("code"), payload=data
|
|
1553
1092
|
)
|
|
1554
1093
|
|
|
1555
|
-
# API usually returns {"data": ["1.1.1.1", ...]} OR {"data": [{"ip": "..."}]}
|
|
1556
1094
|
items = data.get("data", []) or []
|
|
1557
1095
|
result = []
|
|
1558
1096
|
for item in items:
|
|
@@ -1568,17 +1106,27 @@ class ThordataClient:
|
|
|
1568
1106
|
# Locations & ASN Methods
|
|
1569
1107
|
# =========================================================================
|
|
1570
1108
|
|
|
1109
|
+
def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
|
|
1110
|
+
self._require_public_credentials()
|
|
1111
|
+
params = {"token": self.public_token, "key": self.public_key}
|
|
1112
|
+
for k, v in kwargs.items():
|
|
1113
|
+
params[k] = str(v)
|
|
1114
|
+
|
|
1115
|
+
response = self._api_request_with_retry(
|
|
1116
|
+
"GET", f"{self._locations_base_url}/{endpoint}", params=params
|
|
1117
|
+
)
|
|
1118
|
+
response.raise_for_status()
|
|
1119
|
+
data = response.json()
|
|
1120
|
+
|
|
1121
|
+
if isinstance(data, dict):
|
|
1122
|
+
if data.get("code") != 200:
|
|
1123
|
+
raise RuntimeError(f"Locations error: {data.get('msg')}")
|
|
1124
|
+
return data.get("data") or []
|
|
1125
|
+
return data if isinstance(data, list) else []
|
|
1126
|
+
|
|
1571
1127
|
def list_countries(
|
|
1572
1128
|
self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
|
|
1573
1129
|
) -> list[dict[str, Any]]:
|
|
1574
|
-
"""List available countries for proxy locations.
|
|
1575
|
-
|
|
1576
|
-
Args:
|
|
1577
|
-
proxy_type: Proxy product type.
|
|
1578
|
-
|
|
1579
|
-
Returns:
|
|
1580
|
-
List of country dictionaries.
|
|
1581
|
-
"""
|
|
1582
1130
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1583
1131
|
return self._get_locations("countries", proxy_type=pt)
|
|
1584
1132
|
|
|
@@ -1587,15 +1135,6 @@ class ThordataClient:
|
|
|
1587
1135
|
country_code: str,
|
|
1588
1136
|
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1589
1137
|
) -> list[dict[str, Any]]:
|
|
1590
|
-
"""List available states/provinces for a country.
|
|
1591
|
-
|
|
1592
|
-
Args:
|
|
1593
|
-
country_code: Country code (e.g., "US", "GB").
|
|
1594
|
-
proxy_type: Proxy product type.
|
|
1595
|
-
|
|
1596
|
-
Returns:
|
|
1597
|
-
List of state dictionaries.
|
|
1598
|
-
"""
|
|
1599
1138
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1600
1139
|
return self._get_locations("states", proxy_type=pt, country_code=country_code)
|
|
1601
1140
|
|
|
@@ -1605,16 +1144,6 @@ class ThordataClient:
|
|
|
1605
1144
|
state_code: str | None = None,
|
|
1606
1145
|
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1607
1146
|
) -> list[dict[str, Any]]:
|
|
1608
|
-
"""List available cities for a country/state.
|
|
1609
|
-
|
|
1610
|
-
Args:
|
|
1611
|
-
country_code: Country code.
|
|
1612
|
-
state_code: State code (optional).
|
|
1613
|
-
proxy_type: Proxy product type.
|
|
1614
|
-
|
|
1615
|
-
Returns:
|
|
1616
|
-
List of city dictionaries.
|
|
1617
|
-
"""
|
|
1618
1147
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1619
1148
|
kwargs = {"proxy_type": pt, "country_code": country_code}
|
|
1620
1149
|
if state_code:
|
|
@@ -1626,15 +1155,6 @@ class ThordataClient:
|
|
|
1626
1155
|
country_code: str,
|
|
1627
1156
|
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1628
1157
|
) -> list[dict[str, Any]]:
|
|
1629
|
-
"""List available ASN numbers for a country.
|
|
1630
|
-
|
|
1631
|
-
Args:
|
|
1632
|
-
country_code: Country code.
|
|
1633
|
-
proxy_type: Proxy product type.
|
|
1634
|
-
|
|
1635
|
-
Returns:
|
|
1636
|
-
List of ASN dictionaries.
|
|
1637
|
-
"""
|
|
1638
1158
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1639
1159
|
return self._get_locations("asn", proxy_type=pt, country_code=country_code)
|
|
1640
1160
|
|
|
@@ -1643,14 +1163,6 @@ class ThordataClient:
|
|
|
1643
1163
|
# =========================================================================
|
|
1644
1164
|
|
|
1645
1165
|
def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
|
|
1646
|
-
"""List purchased proxy servers (ISP/Datacenter).
|
|
1647
|
-
|
|
1648
|
-
Args:
|
|
1649
|
-
proxy_type: Proxy type (1=ISP, 2=Datacenter).
|
|
1650
|
-
|
|
1651
|
-
Returns:
|
|
1652
|
-
List of ProxyServer objects.
|
|
1653
|
-
"""
|
|
1654
1166
|
self._require_public_credentials()
|
|
1655
1167
|
params = {
|
|
1656
1168
|
"token": self.public_token,
|
|
@@ -1672,21 +1184,11 @@ class ThordataClient:
|
|
|
1672
1184
|
server_list = data.get("data", data.get("list", []))
|
|
1673
1185
|
elif isinstance(data, list):
|
|
1674
1186
|
server_list = data
|
|
1675
|
-
|
|
1676
1187
|
return [ProxyServer.from_dict(s) for s in server_list]
|
|
1677
1188
|
|
|
1678
1189
|
def get_proxy_expiration(
|
|
1679
1190
|
self, ips: str | list[str], proxy_type: int
|
|
1680
1191
|
) -> dict[str, Any]:
|
|
1681
|
-
"""Get expiration time for proxy IPs.
|
|
1682
|
-
|
|
1683
|
-
Args:
|
|
1684
|
-
ips: Single IP or comma-separated list of IPs.
|
|
1685
|
-
proxy_type: Proxy type (1=ISP, 2=Datacenter).
|
|
1686
|
-
|
|
1687
|
-
Returns:
|
|
1688
|
-
Dictionary with IP expiration times.
|
|
1689
|
-
"""
|
|
1690
1192
|
self._require_public_credentials()
|
|
1691
1193
|
if isinstance(ips, list):
|
|
1692
1194
|
ips = ",".join(ips)
|
|
@@ -1706,98 +1208,12 @@ class ThordataClient:
|
|
|
1706
1208
|
return data.get("data", data)
|
|
1707
1209
|
|
|
1708
1210
|
# =========================================================================
|
|
1709
|
-
#
|
|
1211
|
+
# Helpers needed for compatibility
|
|
1710
1212
|
# =========================================================================
|
|
1711
1213
|
|
|
1712
|
-
def _api_request_with_retry(
|
|
1713
|
-
self,
|
|
1714
|
-
method: str,
|
|
1715
|
-
url: str,
|
|
1716
|
-
*,
|
|
1717
|
-
data: dict[str, Any] | None = None,
|
|
1718
|
-
headers: dict[str, str] | None = None,
|
|
1719
|
-
params: dict[str, Any] | None = None,
|
|
1720
|
-
) -> requests.Response:
|
|
1721
|
-
"""Make an API request with retry logic.
|
|
1722
|
-
|
|
1723
|
-
Args:
|
|
1724
|
-
method: HTTP method.
|
|
1725
|
-
url: Request URL.
|
|
1726
|
-
data: Request body data.
|
|
1727
|
-
headers: Request headers.
|
|
1728
|
-
query_params: Query string parameters.
|
|
1729
|
-
|
|
1730
|
-
Returns:
|
|
1731
|
-
Response object.
|
|
1732
|
-
"""
|
|
1733
|
-
|
|
1734
|
-
@with_retry(self._retry_config)
|
|
1735
|
-
def _do_request() -> requests.Response:
|
|
1736
|
-
return self._api_session.request(
|
|
1737
|
-
method,
|
|
1738
|
-
url,
|
|
1739
|
-
data=data,
|
|
1740
|
-
headers=headers,
|
|
1741
|
-
params=params,
|
|
1742
|
-
timeout=self._api_timeout,
|
|
1743
|
-
)
|
|
1744
|
-
|
|
1745
|
-
try:
|
|
1746
|
-
return _do_request()
|
|
1747
|
-
except requests.Timeout as e:
|
|
1748
|
-
raise ThordataTimeoutError(
|
|
1749
|
-
f"API request timed out: {e}", original_error=e
|
|
1750
|
-
) from e
|
|
1751
|
-
except requests.RequestException as e:
|
|
1752
|
-
raise ThordataNetworkError(
|
|
1753
|
-
f"API request failed: {e}", original_error=e
|
|
1754
|
-
) from e
|
|
1755
|
-
|
|
1756
|
-
def _require_public_credentials(self) -> None:
|
|
1757
|
-
"""Check that public credentials are set."""
|
|
1758
|
-
if not self.public_token or not self.public_key:
|
|
1759
|
-
raise ThordataConfigError(
|
|
1760
|
-
"public_token and public_key are required for this operation."
|
|
1761
|
-
)
|
|
1762
|
-
|
|
1763
|
-
def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
|
|
1764
|
-
"""Internal method to fetch location data.
|
|
1765
|
-
|
|
1766
|
-
Args:
|
|
1767
|
-
endpoint: Location endpoint (countries, states, cities, asn).
|
|
1768
|
-
**kwargs: Query parameters.
|
|
1769
|
-
|
|
1770
|
-
Returns:
|
|
1771
|
-
List of location dictionaries.
|
|
1772
|
-
"""
|
|
1773
|
-
self._require_public_credentials()
|
|
1774
|
-
params = {"token": self.public_token, "key": self.public_key}
|
|
1775
|
-
for k, v in kwargs.items():
|
|
1776
|
-
params[k] = str(v)
|
|
1777
|
-
|
|
1778
|
-
response = self._api_request_with_retry(
|
|
1779
|
-
"GET", f"{self._locations_base_url}/{endpoint}", params=params
|
|
1780
|
-
)
|
|
1781
|
-
response.raise_for_status()
|
|
1782
|
-
data = response.json()
|
|
1783
|
-
if isinstance(data, dict):
|
|
1784
|
-
if data.get("code") != 200:
|
|
1785
|
-
raise RuntimeError(f"Locations error: {data.get('msg')}")
|
|
1786
|
-
return data.get("data") or []
|
|
1787
|
-
return data if isinstance(data, list) else []
|
|
1788
|
-
|
|
1789
1214
|
def _process_universal_response(
|
|
1790
1215
|
self, response: requests.Response, output_format: str
|
|
1791
1216
|
) -> str | bytes:
|
|
1792
|
-
"""Process Universal API response.
|
|
1793
|
-
|
|
1794
|
-
Args:
|
|
1795
|
-
response: Response object.
|
|
1796
|
-
output_format: Expected output format.
|
|
1797
|
-
|
|
1798
|
-
Returns:
|
|
1799
|
-
Processed content.
|
|
1800
|
-
"""
|
|
1801
1217
|
try:
|
|
1802
1218
|
resp_json = response.json()
|
|
1803
1219
|
except ValueError:
|
|
@@ -1813,11 +1229,31 @@ class ThordataClient:
|
|
|
1813
1229
|
return resp_json["html"]
|
|
1814
1230
|
if "png" in resp_json:
|
|
1815
1231
|
return decode_base64_image(resp_json["png"])
|
|
1816
|
-
|
|
1817
1232
|
return str(resp_json)
|
|
1818
1233
|
|
|
1234
|
+
def get_browser_connection_url(
|
|
1235
|
+
self, username: str | None = None, password: str | None = None
|
|
1236
|
+
) -> str:
|
|
1237
|
+
# User requested modification: ONLY use browser credentials, do not fall back to residential.
|
|
1238
|
+
user = username or os.getenv("THORDATA_BROWSER_USERNAME")
|
|
1239
|
+
pwd = password or os.getenv("THORDATA_BROWSER_PASSWORD")
|
|
1240
|
+
|
|
1241
|
+
if not user or not pwd:
|
|
1242
|
+
raise ThordataConfigError(
|
|
1243
|
+
"Browser credentials missing. Set THORDATA_BROWSER_USERNAME/PASSWORD or pass arguments."
|
|
1244
|
+
)
|
|
1245
|
+
prefix = "td-customer-"
|
|
1246
|
+
final_user = f"{prefix}{user}" if not user.startswith(prefix) else user
|
|
1247
|
+
|
|
1248
|
+
from urllib.parse import quote
|
|
1249
|
+
|
|
1250
|
+
safe_user = quote(final_user, safe="")
|
|
1251
|
+
safe_pass = quote(pwd, safe="")
|
|
1252
|
+
|
|
1253
|
+
return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"
|
|
1254
|
+
|
|
1819
1255
|
# =========================================================================
|
|
1820
|
-
# Proxy
|
|
1256
|
+
# Proxy Internal Logic
|
|
1821
1257
|
# =========================================================================
|
|
1822
1258
|
|
|
1823
1259
|
def _proxy_verb(
|
|
@@ -1828,17 +1264,11 @@ class ThordataClient:
|
|
|
1828
1264
|
timeout: int | None,
|
|
1829
1265
|
**kwargs: Any,
|
|
1830
1266
|
) -> requests.Response:
|
|
1831
|
-
"""Internal method for proxy requests."""
|
|
1832
1267
|
timeout = timeout or self._default_timeout
|
|
1833
|
-
|
|
1834
1268
|
if proxy_config is None:
|
|
1835
1269
|
proxy_config = self._get_default_proxy_config_from_env()
|
|
1836
|
-
|
|
1837
1270
|
if proxy_config is None:
|
|
1838
|
-
raise ThordataConfigError(
|
|
1839
|
-
"Proxy credentials are missing. "
|
|
1840
|
-
"Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
|
|
1841
|
-
)
|
|
1271
|
+
raise ThordataConfigError("Proxy credentials are missing.")
|
|
1842
1272
|
|
|
1843
1273
|
kwargs.pop("proxies", None)
|
|
1844
1274
|
|
|
@@ -1847,8 +1277,8 @@ class ThordataClient:
|
|
|
1847
1277
|
return self._proxy_request_with_proxy_manager(
|
|
1848
1278
|
method,
|
|
1849
1279
|
url,
|
|
1850
|
-
proxy_config=proxy_config,
|
|
1851
|
-
timeout=timeout,
|
|
1280
|
+
proxy_config=cast(ProxyConfig, proxy_config),
|
|
1281
|
+
timeout=cast(int, timeout),
|
|
1852
1282
|
headers=kwargs.pop("headers", None),
|
|
1853
1283
|
params=kwargs.pop("params", None),
|
|
1854
1284
|
data=kwargs.pop("data", None),
|
|
@@ -1856,15 +1286,10 @@ class ThordataClient:
|
|
|
1856
1286
|
|
|
1857
1287
|
try:
|
|
1858
1288
|
return _do()
|
|
1859
|
-
except requests.Timeout as e:
|
|
1860
|
-
raise ThordataTimeoutError(
|
|
1861
|
-
f"Request timed out: {e}", original_error=e
|
|
1862
|
-
) from e
|
|
1863
1289
|
except Exception as e:
|
|
1864
1290
|
raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
|
|
1865
1291
|
|
|
1866
1292
|
def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
|
|
1867
|
-
"""Build a stable cache key for ProxyManager instances."""
|
|
1868
1293
|
if not userpass:
|
|
1869
1294
|
return proxy_endpoint
|
|
1870
1295
|
h = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
|
|
@@ -1877,43 +1302,34 @@ class ThordataClient:
|
|
|
1877
1302
|
cache_key: str,
|
|
1878
1303
|
proxy_headers: dict[str, str] | None = None,
|
|
1879
1304
|
) -> urllib3.PoolManager:
|
|
1880
|
-
"""Get or create a ProxyManager for the given proxy URL (Pooled)."""
|
|
1881
1305
|
cached = self._proxy_managers.get(cache_key)
|
|
1882
1306
|
if cached is not None:
|
|
1883
1307
|
return cached
|
|
1884
1308
|
|
|
1885
1309
|
if proxy_url.startswith(("socks5://", "socks5h://", "socks4://", "socks4a://")):
|
|
1886
|
-
|
|
1887
|
-
from urllib3.contrib.socks import SOCKSProxyManager
|
|
1888
|
-
except Exception as e:
|
|
1310
|
+
if not HAS_PYSOCKS:
|
|
1889
1311
|
raise ThordataConfigError(
|
|
1890
|
-
"SOCKS
|
|
1891
|
-
|
|
1892
|
-
|
|
1893
|
-
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
num_pools=10,
|
|
1897
|
-
maxsize=10,
|
|
1312
|
+
"SOCKS support requires PySocks/urllib3[socks]"
|
|
1313
|
+
)
|
|
1314
|
+
from urllib3.contrib.socks import SOCKSProxyManager
|
|
1315
|
+
|
|
1316
|
+
pm = cast(
|
|
1317
|
+
urllib3.PoolManager,
|
|
1318
|
+
SOCKSProxyManager(proxy_url, num_pools=10, maxsize=10),
|
|
1898
1319
|
)
|
|
1899
|
-
pm = cast(urllib3.PoolManager, pm_socks)
|
|
1900
1320
|
self._proxy_managers[cache_key] = pm
|
|
1901
1321
|
return pm
|
|
1902
1322
|
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
pm_http = urllib3.ProxyManager(
|
|
1323
|
+
proxy_ssl_context = (
|
|
1324
|
+
ssl.create_default_context() if proxy_url.startswith("https://") else None
|
|
1325
|
+
)
|
|
1326
|
+
pm = urllib3.ProxyManager(
|
|
1909
1327
|
proxy_url,
|
|
1910
1328
|
proxy_headers=proxy_headers,
|
|
1911
1329
|
proxy_ssl_context=proxy_ssl_context,
|
|
1912
1330
|
num_pools=10,
|
|
1913
1331
|
maxsize=10,
|
|
1914
1332
|
)
|
|
1915
|
-
|
|
1916
|
-
pm = cast(urllib3.PoolManager, pm_http)
|
|
1917
1333
|
self._proxy_managers[cache_key] = pm
|
|
1918
1334
|
return pm
|
|
1919
1335
|
|
|
@@ -1928,12 +1344,8 @@ class ThordataClient:
|
|
|
1928
1344
|
params: dict[str, Any] | None = None,
|
|
1929
1345
|
data: Any = None,
|
|
1930
1346
|
) -> requests.Response:
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
# Check for upstream proxy
|
|
1934
|
-
upstream_config = _parse_upstream_proxy()
|
|
1935
|
-
|
|
1936
|
-
if upstream_config:
|
|
1347
|
+
upstream = _parse_upstream_proxy()
|
|
1348
|
+
if upstream:
|
|
1937
1349
|
return self._proxy_request_with_upstream(
|
|
1938
1350
|
method,
|
|
1939
1351
|
url,
|
|
@@ -1942,41 +1354,30 @@ class ThordataClient:
|
|
|
1942
1354
|
headers=headers,
|
|
1943
1355
|
params=params,
|
|
1944
1356
|
data=data,
|
|
1945
|
-
upstream_config=
|
|
1357
|
+
upstream_config=upstream,
|
|
1946
1358
|
)
|
|
1947
1359
|
|
|
1948
|
-
# Original implementation (no upstream proxy)
|
|
1949
1360
|
req = requests.Request(method=method.upper(), url=url, params=params)
|
|
1950
1361
|
prepped = self._proxy_session.prepare_request(req)
|
|
1951
1362
|
final_url = prepped.url or url
|
|
1952
1363
|
|
|
1953
1364
|
proxy_endpoint = proxy_config.build_proxy_endpoint()
|
|
1954
|
-
is_socks = proxy_endpoint.startswith(
|
|
1955
|
-
("socks5://", "socks5h://", "socks4://", "socks4a://")
|
|
1956
|
-
)
|
|
1365
|
+
is_socks = proxy_endpoint.startswith(("socks",))
|
|
1957
1366
|
|
|
1958
1367
|
if is_socks:
|
|
1959
1368
|
proxy_url_for_manager = proxy_config.build_proxy_url()
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
pm = self._get_proxy_manager(
|
|
1964
|
-
proxy_url_for_manager,
|
|
1965
|
-
cache_key=cache_key,
|
|
1966
|
-
proxy_headers=None,
|
|
1967
|
-
)
|
|
1369
|
+
cache_key = proxy_url_for_manager
|
|
1370
|
+
pm = self._get_proxy_manager(proxy_url_for_manager, cache_key=cache_key)
|
|
1371
|
+
req_headers = dict(headers or {})
|
|
1968
1372
|
else:
|
|
1969
1373
|
userpass = proxy_config.build_proxy_basic_auth()
|
|
1970
1374
|
proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
|
|
1971
1375
|
cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
|
|
1972
|
-
|
|
1973
1376
|
pm = self._get_proxy_manager(
|
|
1974
|
-
proxy_endpoint,
|
|
1975
|
-
cache_key=cache_key,
|
|
1976
|
-
proxy_headers=dict(proxy_headers),
|
|
1377
|
+
proxy_endpoint, cache_key=cache_key, proxy_headers=dict(proxy_headers)
|
|
1977
1378
|
)
|
|
1379
|
+
req_headers = dict(headers or {})
|
|
1978
1380
|
|
|
1979
|
-
req_headers = dict(headers or {})
|
|
1980
1381
|
body = None
|
|
1981
1382
|
if data is not None:
|
|
1982
1383
|
if isinstance(data, dict):
|
|
@@ -1998,16 +1399,12 @@ class ThordataClient:
|
|
|
1998
1399
|
)
|
|
1999
1400
|
|
|
2000
1401
|
r = requests.Response()
|
|
2001
|
-
r.status_code = int(getattr(http_resp, "status", 0)
|
|
1402
|
+
r.status_code = int(getattr(http_resp, "status", 0))
|
|
2002
1403
|
r._content = http_resp.data or b""
|
|
2003
1404
|
r.url = final_url
|
|
2004
1405
|
r.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
|
|
2005
1406
|
return r
|
|
2006
1407
|
|
|
2007
|
-
# =========================================================================
|
|
2008
|
-
# Upstream Proxy Support (Proxy Chaining)
|
|
2009
|
-
# =========================================================================
|
|
2010
|
-
|
|
2011
1408
|
def _proxy_request_with_upstream(
|
|
2012
1409
|
self,
|
|
2013
1410
|
method: str,
|
|
@@ -2020,12 +1417,8 @@ class ThordataClient:
|
|
|
2020
1417
|
data: Any = None,
|
|
2021
1418
|
upstream_config: dict[str, Any],
|
|
2022
1419
|
) -> requests.Response:
|
|
2023
|
-
"""Execute request through proxy chain: Upstream -> Thordata -> Target."""
|
|
2024
1420
|
if not HAS_PYSOCKS:
|
|
2025
|
-
raise ThordataConfigError(
|
|
2026
|
-
"PySocks is required for upstream proxy support. "
|
|
2027
|
-
"Install with: pip install PySocks"
|
|
2028
|
-
)
|
|
1421
|
+
raise ThordataConfigError("PySocks required for upstream proxy support.")
|
|
2029
1422
|
|
|
2030
1423
|
req = requests.Request(method=method.upper(), url=url, params=params)
|
|
2031
1424
|
prepped = self._proxy_session.prepare_request(req)
|
|
@@ -2036,370 +1429,141 @@ class ThordataClient:
|
|
|
2036
1429
|
target_port = parsed_target.port or (
|
|
2037
1430
|
443 if parsed_target.scheme == "https" else 80
|
|
2038
1431
|
)
|
|
2039
|
-
target_is_https = parsed_target.scheme == "https"
|
|
2040
|
-
|
|
2041
|
-
protocol = proxy_config.protocol.lower()
|
|
2042
|
-
if protocol == "socks5":
|
|
2043
|
-
protocol = "socks5h"
|
|
2044
1432
|
|
|
2045
|
-
thordata_host = proxy_config.host or ""
|
|
1433
|
+
thordata_host = proxy_config.host or "pr.thordata.net"
|
|
2046
1434
|
thordata_port = proxy_config.port or 9999
|
|
2047
|
-
|
|
2048
|
-
|
|
2049
|
-
|
|
2050
|
-
socket_factory = _UpstreamProxySocketFactory(upstream_config)
|
|
2051
|
-
|
|
2052
|
-
logger.debug(
|
|
2053
|
-
f"Proxy chain: upstream({upstream_config['host']}:{upstream_config['port']}) "
|
|
2054
|
-
f"-> thordata({protocol}://{thordata_host}:{thordata_port}) "
|
|
2055
|
-
f"-> target({target_host}:{target_port})"
|
|
2056
|
-
)
|
|
1435
|
+
thordata_user = proxy_config.build_username()
|
|
1436
|
+
thordata_pass = proxy_config.password
|
|
2057
1437
|
|
|
2058
|
-
|
|
1438
|
+
# 1. Connect to Upstream -> Thordata Node
|
|
1439
|
+
factory = UpstreamProxySocketFactory(upstream_config)
|
|
1440
|
+
raw_sock = factory.create_connection(
|
|
2059
1441
|
(thordata_host, thordata_port),
|
|
2060
1442
|
timeout=float(timeout),
|
|
2061
1443
|
)
|
|
2062
1444
|
|
|
2063
1445
|
try:
|
|
2064
|
-
|
|
2065
|
-
sock = self._socks5_handshake(
|
|
2066
|
-
raw_sock,
|
|
2067
|
-
target_host,
|
|
2068
|
-
target_port,
|
|
2069
|
-
thordata_username,
|
|
2070
|
-
thordata_password,
|
|
2071
|
-
)
|
|
2072
|
-
if target_is_https:
|
|
2073
|
-
context = ssl.create_default_context()
|
|
2074
|
-
sock = context.wrap_socket(sock, server_hostname=target_host)
|
|
2075
|
-
|
|
2076
|
-
elif protocol == "https":
|
|
2077
|
-
proxy_context = ssl.create_default_context()
|
|
2078
|
-
proxy_ssl_sock = proxy_context.wrap_socket(
|
|
2079
|
-
raw_sock, server_hostname=thordata_host
|
|
2080
|
-
)
|
|
2081
|
-
|
|
2082
|
-
self._send_connect_request(
|
|
2083
|
-
proxy_ssl_sock,
|
|
2084
|
-
target_host,
|
|
2085
|
-
target_port,
|
|
2086
|
-
thordata_username,
|
|
2087
|
-
thordata_password,
|
|
2088
|
-
)
|
|
1446
|
+
protocol = proxy_config.protocol.lower().replace("socks5", "socks5h")
|
|
2089
1447
|
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
) # type: ignore[assignment]
|
|
2095
|
-
else:
|
|
2096
|
-
sock = proxy_ssl_sock
|
|
2097
|
-
|
|
2098
|
-
else: # HTTP proxy
|
|
2099
|
-
self._send_connect_request(
|
|
2100
|
-
raw_sock,
|
|
2101
|
-
target_host,
|
|
2102
|
-
target_port,
|
|
2103
|
-
thordata_username,
|
|
2104
|
-
thordata_password,
|
|
1448
|
+
# 2. Handshake with Thordata
|
|
1449
|
+
if protocol.startswith("socks"):
|
|
1450
|
+
sock = socks5_handshake(
|
|
1451
|
+
raw_sock, target_host, target_port, thordata_user, thordata_pass
|
|
2105
1452
|
)
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
1453
|
+
if parsed_target.scheme == "https":
|
|
1454
|
+
ctx = ssl.create_default_context()
|
|
1455
|
+
sock = ctx.wrap_socket(sock, server_hostname=target_host)
|
|
1456
|
+
else:
|
|
1457
|
+
# HTTP/HTTPS Tunnel
|
|
1458
|
+
if protocol == "https":
|
|
1459
|
+
ctx = ssl.create_default_context()
|
|
1460
|
+
sock = ctx.wrap_socket(raw_sock, server_hostname=thordata_host)
|
|
2110
1461
|
else:
|
|
2111
1462
|
sock = raw_sock
|
|
2112
1463
|
|
|
2113
|
-
|
|
1464
|
+
# CONNECT to Thordata
|
|
1465
|
+
connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
|
|
1466
|
+
connect_req += f"Host: {target_host}:{target_port}\r\n"
|
|
1467
|
+
auth = base64.b64encode(
|
|
1468
|
+
f"{thordata_user}:{thordata_pass}".encode()
|
|
1469
|
+
).decode()
|
|
1470
|
+
connect_req += f"Proxy-Authorization: Basic {auth}\r\n\r\n"
|
|
1471
|
+
sock.sendall(connect_req.encode())
|
|
1472
|
+
|
|
1473
|
+
resp = b""
|
|
1474
|
+
while b"\r\n\r\n" not in resp:
|
|
1475
|
+
resp += sock.recv(1024)
|
|
1476
|
+
if b"200" not in resp.split(b"\r\n")[0]:
|
|
1477
|
+
raise ConnectionError("Thordata CONNECT failed")
|
|
1478
|
+
|
|
1479
|
+
# 3. If Target is HTTPS, wrap TLS inside the tunnel
|
|
1480
|
+
if parsed_target.scheme == "https":
|
|
1481
|
+
if isinstance(sock, ssl.SSLSocket):
|
|
1482
|
+
sock = cast(
|
|
1483
|
+
socket.socket,
|
|
1484
|
+
create_tls_in_tls(sock, target_host, float(timeout)),
|
|
1485
|
+
)
|
|
1486
|
+
else:
|
|
1487
|
+
ctx = ssl.create_default_context()
|
|
1488
|
+
sock = ctx.wrap_socket(sock, server_hostname=target_host)
|
|
1489
|
+
|
|
1490
|
+
# 4. Send actual Request
|
|
1491
|
+
return self._send_http_via_socket(
|
|
2114
1492
|
sock, method, parsed_target, headers, data, final_url, timeout
|
|
2115
1493
|
)
|
|
2116
1494
|
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
def _send_connect_request(
|
|
2122
|
-
self,
|
|
2123
|
-
sock: socket.socket,
|
|
2124
|
-
target_host: str,
|
|
2125
|
-
target_port: int,
|
|
2126
|
-
proxy_username: str,
|
|
2127
|
-
proxy_password: str,
|
|
2128
|
-
) -> None:
|
|
2129
|
-
"""Send HTTP CONNECT request to proxy and verify response."""
|
|
2130
|
-
connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
|
|
2131
|
-
connect_req += f"Host: {target_host}:{target_port}\r\n"
|
|
2132
|
-
|
|
2133
|
-
credentials = f"{proxy_username}:{proxy_password}"
|
|
2134
|
-
encoded = base64.b64encode(credentials.encode()).decode()
|
|
2135
|
-
connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
|
|
2136
|
-
connect_req += "\r\n"
|
|
2137
|
-
|
|
2138
|
-
sock.sendall(connect_req.encode())
|
|
2139
|
-
|
|
2140
|
-
response = b""
|
|
2141
|
-
while b"\r\n\r\n" not in response:
|
|
2142
|
-
chunk = sock.recv(4096)
|
|
2143
|
-
if not chunk:
|
|
2144
|
-
raise ConnectionError("Proxy closed connection during CONNECT")
|
|
2145
|
-
response += chunk
|
|
2146
|
-
|
|
2147
|
-
status_line = response.split(b"\r\n")[0].decode()
|
|
2148
|
-
if "200" not in status_line:
|
|
2149
|
-
raise ConnectionError(f"Proxy CONNECT failed: {status_line}")
|
|
1495
|
+
except Exception:
|
|
1496
|
+
raw_sock.close()
|
|
1497
|
+
raise
|
|
2150
1498
|
|
|
2151
|
-
def
|
|
1499
|
+
def _send_http_via_socket(
    self,
    sock: socket.socket | Any,  # Fix for TLSInTLSSocket typing issue
    method: str,
    parsed: Any,
    headers: Any,
    data: Any,
    final_url: str,
    timeout: int,
) -> requests.Response:
    """Send a raw HTTP/1.1 request over an already-established socket.

    Used on the proxy-chaining path where the tunnel (and any TLS layers)
    has already been negotiated, so ``requests`` cannot drive the socket
    itself.

    Args:
        sock: Connected socket (plain, TLS, or TLS-in-TLS wrapper).
        method: HTTP method name, e.g. ``"GET"``.
        parsed: Parsed target URL (``urllib.parse`` result) providing
            ``hostname``, ``path`` and ``query``.
        headers: Optional mapping of request headers.
        data: Optional body — dict (form-encoded), bytes, or str-able.
        final_url: URL recorded on the returned Response object.
        timeout: Socket timeout in seconds (read loop stops on timeout).

    Returns:
        A minimally-populated ``requests.Response`` (status, body, url).
        NOTE(review): response headers are not parsed onto the Response —
        confirm callers do not rely on ``r.headers`` from this path.

    Raises:
        ConnectionError: If no complete response header block is received.
    """
    req_headers = dict(headers or {})
    req_headers.setdefault("Host", parsed.hostname)
    req_headers.setdefault("User-Agent", "python-thordata-sdk")
    # Connection: close lets us read until EOF instead of parsing framing.
    req_headers.setdefault("Connection", "close")

    path = parsed.path or "/"
    if parsed.query:
        path += f"?{parsed.query}"

    msg = f"{method} {path} HTTP/1.1\r\n"
    for k, v in req_headers.items():
        msg += f"{k}: {v}\r\n"

    body = b""
    if data:
        if isinstance(data, dict):
            body = urlencode(data).encode()
            msg += "Content-Type: application/x-www-form-urlencoded\r\n"
        elif isinstance(data, bytes):
            body = data
        else:
            body = str(data).encode()
        msg += f"Content-Length: {len(body)}\r\n"

    msg += "\r\n"
    sock.sendall(msg.encode())
    if body:
        sock.sendall(body)

    # Read Response: drain until the peer closes or the socket times out.
    resp_data = b""
    while True:
        try:
            chunk = sock.recv(4096)
            if not chunk:
                break
            resp_data += chunk
        except socket.timeout:
            break

    if b"\r\n\r\n" in resp_data:
        head, content = resp_data.split(b"\r\n\r\n", 1)
        status_line = head.split(b"\r\n")[0].decode()
        try:
            status_code = int(status_line.split(" ")[1])
        except (ValueError, IndexError):
            # Malformed status line — surface the body with code 0 rather
            # than raising, so the caller can inspect what came back.
            status_code = 0

        r = requests.Response()
        r.status_code = status_code
        r._content = content
        r.url = final_url
        return r
    raise ConnectionError("Empty response from socket")
|
|
2398
1563
|
|
|
2399
1564
|
def _get_proxy_endpoint_overrides(
|
|
2400
1565
|
self, product: ProxyProduct
|
|
2401
1566
|
) -> tuple[str | None, int | None, str]:
|
|
2402
|
-
"""Get proxy endpoint overrides from environment variables."""
|
|
2403
1567
|
prefix = product.value.upper()
|
|
2404
1568
|
host = os.getenv(f"THORDATA_{prefix}_PROXY_HOST") or os.getenv(
|
|
2405
1569
|
"THORDATA_PROXY_HOST"
|
|
@@ -2410,13 +1574,12 @@ class ThordataClient:
|
|
|
2410
1574
|
protocol = (
|
|
2411
1575
|
os.getenv(f"THORDATA_{prefix}_PROXY_PROTOCOL")
|
|
2412
1576
|
or os.getenv("THORDATA_PROXY_PROTOCOL")
|
|
2413
|
-
or "
|
|
1577
|
+
or "http"
|
|
2414
1578
|
)
|
|
2415
1579
|
port = int(port_raw) if port_raw and port_raw.isdigit() else None
|
|
2416
1580
|
return host or None, port, protocol
|
|
2417
1581
|
|
|
2418
1582
|
def _get_default_proxy_config_from_env(self) -> ProxyConfig | None:
|
|
2419
|
-
"""Get proxy configuration from environment variables."""
|
|
2420
1583
|
for prod in [
|
|
2421
1584
|
ProxyProduct.RESIDENTIAL,
|
|
2422
1585
|
ProxyProduct.DATACENTER,
|
|
@@ -2436,44 +1599,3 @@ class ThordataClient:
|
|
|
2436
1599
|
protocol=proto,
|
|
2437
1600
|
)
|
|
2438
1601
|
return None
|
|
2439
|
-
|
|
2440
|
-
def get_browser_connection_url(
|
|
2441
|
-
self, username: str | None = None, password: str | None = None
|
|
2442
|
-
) -> str:
|
|
2443
|
-
"""
|
|
2444
|
-
Generate the WebSocket URL for connecting to Scraping Browser.
|
|
2445
|
-
|
|
2446
|
-
Args:
|
|
2447
|
-
username: Proxy username (without 'td-customer-' prefix).
|
|
2448
|
-
Defaults to THORDATA_BROWSER_USERNAME or THORDATA_RESIDENTIAL_USERNAME.
|
|
2449
|
-
password: Proxy password.
|
|
2450
|
-
|
|
2451
|
-
Returns:
|
|
2452
|
-
WSS URL string suitable for playwright.connect_over_cdp().
|
|
2453
|
-
|
|
2454
|
-
Raises:
|
|
2455
|
-
ThordataConfigError: If credentials are missing.
|
|
2456
|
-
"""
|
|
2457
|
-
user = (
|
|
2458
|
-
username
|
|
2459
|
-
or os.getenv("THORDATA_BROWSER_USERNAME")
|
|
2460
|
-
or os.getenv("THORDATA_RESIDENTIAL_USERNAME")
|
|
2461
|
-
)
|
|
2462
|
-
pwd = (
|
|
2463
|
-
password
|
|
2464
|
-
or os.getenv("THORDATA_BROWSER_PASSWORD")
|
|
2465
|
-
or os.getenv("THORDATA_RESIDENTIAL_PASSWORD")
|
|
2466
|
-
)
|
|
2467
|
-
|
|
2468
|
-
if not user or not pwd:
|
|
2469
|
-
raise ThordataConfigError(
|
|
2470
|
-
"Browser credentials missing. Set THORDATA_BROWSER_USERNAME/PASSWORD or pass arguments."
|
|
2471
|
-
)
|
|
2472
|
-
prefix = "td-customer-"
|
|
2473
|
-
final_user = f"{prefix}{user}" if not user.startswith(prefix) else user
|
|
2474
|
-
|
|
2475
|
-
# URL encode
|
|
2476
|
-
safe_user = quote(final_user, safe="")
|
|
2477
|
-
safe_pass = quote(pwd, safe="")
|
|
2478
|
-
|
|
2479
|
-
return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"
|