thordata-sdk 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +4 -40
- thordata/async_client.py +572 -1241
- thordata/async_unlimited.py +130 -0
- thordata/client.py +1184 -1309
- thordata/core/__init__.py +23 -0
- thordata/core/async_http_client.py +91 -0
- thordata/core/http_client.py +79 -0
- thordata/core/tunnel.py +287 -0
- thordata/demo.py +2 -2
- thordata/enums.py +41 -380
- thordata/models.py +37 -1193
- thordata/tools/__init__.py +28 -0
- thordata/tools/base.py +42 -0
- thordata/tools/code.py +26 -0
- thordata/tools/ecommerce.py +67 -0
- thordata/tools/search.py +73 -0
- thordata/tools/social.py +190 -0
- thordata/tools/video.py +81 -0
- thordata/types/__init__.py +77 -0
- thordata/types/common.py +141 -0
- thordata/types/proxy.py +340 -0
- thordata/types/serp.py +224 -0
- thordata/types/task.py +144 -0
- thordata/types/universal.py +66 -0
- thordata/unlimited.py +169 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/METADATA +74 -51
- thordata_sdk-1.5.0.dist-info/RECORD +35 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/WHEEL +1 -1
- thordata_sdk-1.3.0.dist-info/RECORD +0 -16
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/top_level.txt +0 -0
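Highlights of the 1.5.0 layout: client logic is split into core/ (HTTP session, tunneling), tools/ (pre-defined scrapers), and types/ (request/response models), and the client gains context-manager support plus serp and unlimited namespaces. A minimal usage sketch, inferred only from the signatures visible in the diff below; tokens and parameter values are illustrative placeholders, not verified output:

>>> from thordata import ThordataClient
>>> with ThordataClient(
...     scraper_token="your_token",
...     public_token="your_public_token",
...     public_key="your_public_key",
... ) as client:
...     results = client.serp_search("python tutorial", engine="google")
...     page = client.universal_scrape("https://httpbin.org/html", js_render=False)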
thordata/client.py
CHANGED
@@ -3,28 +3,11 @@ Synchronous client for the Thordata API.
 
 This module provides the main ThordataClient class for interacting with
 Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
-
-Example:
-    >>> from thordata import ThordataClient
-    >>>
-    >>> client = ThordataClient(
-    ...     scraper_token="your_token",
-    ...     public_token="your_public_token",
-    ...     public_key="your_public_key"
-    ... )
-    >>>
-    >>> # Use the proxy network
-    >>> response = client.get("https://httpbin.org/ip")
-    >>> print(response.json())
-    >>>
-    >>> # Search with SERP API
-    >>> results = client.serp_search("python tutorial", engine="google")
 """
 
 from __future__ import annotations
 
 import base64
-import contextlib
 import hashlib
 import logging
 import os
@@ -38,37 +21,42 @@ import requests
 import urllib3
 from requests.structures import CaseInsensitiveDict
 
-
-
-try:
-    import socks
-
-    HAS_PYSOCKS = True
-except ImportError:
-    HAS_PYSOCKS = False
-
-from . import __version__ as _sdk_version
+# Import Legacy/Compat
 from ._utils import (
     build_auth_headers,
     build_builder_headers,
     build_public_api_headers,
-    build_user_agent,
     decode_base64_image,
     extract_error_message,
     parse_json_response,
 )
-
+
+# Import Core Components
+from .core.http_client import ThordataHttpSession
+from .core.tunnel import (
+    HAS_PYSOCKS,
+    UpstreamProxySocketFactory,
+    create_tls_in_tls,
+    parse_upstream_proxy,
+    socks5_handshake,
+)
+from .enums import Engine
 from .exceptions import (
     ThordataConfigError,
     ThordataNetworkError,
     ThordataTimeoutError,
     raise_for_code,
 )
-from .models import (
+from .retry import RetryConfig, with_retry
+from .serp_engines import SerpNamespace
+
+# Import Types (Modernized)
+from .types import (
     CommonSettings,
     ProxyConfig,
     ProxyProduct,
     ProxyServer,
+    ProxyType,
     ProxyUserList,
     ScraperTaskConfig,
     SerpRequest,
@@ -76,196 +64,17 @@ from .models import (
     UsageStatistics,
     VideoTaskConfig,
 )
-from .
+from .unlimited import UnlimitedNamespace
 
 logger = logging.getLogger(__name__)
 
-
 # =========================================================================
-#
+# Internal Logic for Upstream Proxies
 # =========================================================================
 
 
 def _parse_upstream_proxy() -> dict[str, Any] | None:
-    """
-    Parse THORDATA_UPSTREAM_PROXY environment variable.
-
-    Supported formats:
-    - http://127.0.0.1:7897
-    - socks5://127.0.0.1:7897
-    - socks5://user:pass@127.0.0.1:7897
-
-    Returns:
-        Dict with proxy config or None if not set.
-    """
-    upstream_url = os.environ.get("THORDATA_UPSTREAM_PROXY", "").strip()
-    if not upstream_url:
-        return None
-
-    parsed = urlparse(upstream_url)
-    scheme = (parsed.scheme or "").lower()
-
-    if scheme not in ("http", "https", "socks5", "socks5h", "socks4"):
-        logger.warning(f"Unsupported upstream proxy scheme: {scheme}")
-        return None
-
-    return {
-        "scheme": scheme,
-        "host": parsed.hostname or "127.0.0.1",
-        "port": parsed.port or (1080 if scheme.startswith("socks") else 7897),
-        "username": parsed.username,
-        "password": parsed.password,
-    }
-
-
-class _UpstreamProxySocketFactory:
-    """
-    Socket factory that creates connections through an upstream proxy.
-    Used for proxy chaining when accessing Thordata from behind a firewall.
-    """
-
-    def __init__(self, upstream_config: dict[str, Any]):
-        self.config = upstream_config
-
-    def create_connection(
-        self,
-        address: tuple[str, int],
-        timeout: float | None = None,
-        source_address: tuple[str, int] | None = None,
-    ) -> socket.socket:
-        """Create a socket connection through the upstream proxy."""
-        scheme = self.config["scheme"]
-
-        if scheme.startswith("socks"):
-            return self._create_socks_connection(address, timeout)
-        else:
-            return self._create_http_tunnel(address, timeout)
-
-    def _create_socks_connection(
-        self,
-        address: tuple[str, int],
-        timeout: float | None = None,
-    ) -> socket.socket:
-        """Create connection through SOCKS proxy."""
-        if not HAS_PYSOCKS:
-            raise RuntimeError(
-                "PySocks is required for SOCKS upstream proxy. "
-                "Install with: pip install PySocks"
-            )
-
-        scheme = self.config["scheme"]
-        proxy_type = socks.SOCKS5 if "socks5" in scheme else socks.SOCKS4
-
-        sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
-        sock.set_proxy(
-            proxy_type,
-            self.config["host"],
-            self.config["port"],
-            rdns=True,
-            username=self.config.get("username"),
-            password=self.config.get("password"),
-        )
-
-        if timeout is not None:
-            sock.settimeout(timeout)
-
-        sock.connect(address)
-        return sock
-
-    def _create_http_tunnel(
-        self,
-        address: tuple[str, int],
-        timeout: float | None = None,
-    ) -> socket.socket:
-        """Create connection through HTTP CONNECT tunnel."""
-        # Connect to upstream proxy
-        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        if timeout is not None:
-            sock.settimeout(timeout)
-
-        sock.connect((self.config["host"], self.config["port"]))
-
-        # Build CONNECT request
-        target_host, target_port = address
-        connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
-        connect_req += f"Host: {target_host}:{target_port}\r\n"
-
-        # Add proxy auth if provided
-        if self.config.get("username"):
-            credentials = f"{self.config['username']}:{self.config.get('password', '')}"
-            encoded = base64.b64encode(credentials.encode()).decode()
-            connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
-
-        connect_req += "\r\n"
-
-        sock.sendall(connect_req.encode())
-
-        # Read response
-        response = b""
-        while b"\r\n\r\n" not in response:
-            chunk = sock.recv(1024)
-            if not chunk:
-                raise ConnectionError("Upstream proxy closed connection")
-            response += chunk
-
-        # Check status
-        status_line = response.split(b"\r\n")[0].decode()
-        if "200" not in status_line:
-            sock.close()
-            raise ConnectionError(f"Upstream proxy CONNECT failed: {status_line}")
-
-        return sock
-
-
-class _TLSInTLSSocket:
-    """
-    A socket-like wrapper for TLS-in-TLS connections.
-
-    Uses SSLObject + MemoryBIO to implement TLS over an existing TLS connection.
-    """
-
-    def __init__(
-        self,
-        outer_sock: ssl.SSLSocket,
-        ssl_obj: ssl.SSLObject,
-        incoming: ssl.MemoryBIO,
-        outgoing: ssl.MemoryBIO,
-    ):
-        self._outer = outer_sock
-        self._ssl = ssl_obj
-        self._incoming = incoming
-        self._outgoing = outgoing
-        self._timeout: float | None = None
-
-    def settimeout(self, timeout: float | None) -> None:
-        self._timeout = timeout
-        self._outer.settimeout(timeout)
-
-    def sendall(self, data: bytes) -> None:
-        """Send data through the inner TLS connection."""
-        self._ssl.write(data)
-        encrypted = self._outgoing.read()
-        if encrypted:
-            self._outer.sendall(encrypted)
-
-    def recv(self, bufsize: int) -> bytes:
-        """Receive data from the inner TLS connection."""
-        while True:
-            try:
-                return self._ssl.read(bufsize)
-            except ssl.SSLWantReadError:
-                self._outer.settimeout(self._timeout)
-                try:
-                    received = self._outer.recv(8192)
-                    if not received:
-                        return b""
-                    self._incoming.write(received)
-                except socket.timeout:
-                    return b""
-
-    def close(self) -> None:
-        with contextlib.suppress(Exception):
-            self._outer.close()
+    return parse_upstream_proxy()
 
 
 # =========================================================================
@@ -274,6 +83,8 @@ class _TLSInTLSSocket:
 
 
 class ThordataClient:
+    """Main client for interacting with Thordata API services."""
+
     # API Endpoints
     BASE_URL = "https://scraperapi.thordata.com"
     UNIVERSAL_URL = "https://universalapi.thordata.com"
@@ -282,7 +93,7 @@ class ThordataClient:
 
     def __init__(
         self,
-        scraper_token: str | None = None,
+        scraper_token: str | None = None,
         public_token: str | None = None,
         public_key: str | None = None,
         proxy_host: str = "pr.thordata.net",
@@ -296,10 +107,6 @@ class ThordataClient:
         web_scraper_api_base_url: str | None = None,
         locations_base_url: str | None = None,
     ) -> None:
-        """Initialize the Thordata Client."""
-
-        self.serp = SerpNamespace(self)
-
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
@@ -316,17 +123,17 @@
                 f"Invalid auth_mode: {auth_mode}. Must be 'bearer' or 'header_token'."
             )
 
+        # Initialize Core HTTP Client for API calls
+        self._http = ThordataHttpSession(
+            timeout=api_timeout, retry_config=self._retry_config
+        )
+
+        # Legacy logic for Proxy Network connections (requests.Session)
         self._proxy_session = requests.Session()
         self._proxy_session.trust_env = False
         self._proxy_managers: dict[str, urllib3.PoolManager] = {}
 
-
-        self._api_session.trust_env = True
-        self._api_session.headers.update(
-            {"User-Agent": build_user_agent(_sdk_version, "requests")}
-        )
-
-        # Base URLs
+        # Base URLs Configuration
         scraperapi_base = (
             scraperapi_base_url
             or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
@@ -351,14 +158,14 @@
             or self.LOCATIONS_URL
         ).rstrip("/")
 
-        gateway_base = os.getenv(
+        self._gateway_base_url = os.getenv(
             "THORDATA_GATEWAY_BASE_URL", "https://api.thordata.com/api/gateway"
         )
-        self._gateway_base_url = gateway_base
         self._child_base_url = os.getenv(
             "THORDATA_CHILD_BASE_URL", "https://api.thordata.com/api/child"
         )
 
+        # URL Construction
         self._serp_url = f"{scraperapi_base}/request"
         self._builder_url = f"{scraperapi_base}/builder"
         self._video_builder_url = f"{scraperapi_base}/video_builder"
@@ -370,12 +177,10 @@
 
         self._locations_base_url = locations_base
 
-
-
-
-        self._proxy_users_url = (
-            f"{locations_base.replace('/locations', '')}/proxy-users"
-        )
+        # Determine shared API base from locations URL
+        shared_api_base = locations_base.replace("/locations", "")
+        self._usage_stats_url = f"{shared_api_base}/account/usage-statistics"
+        self._proxy_users_url = f"{shared_api_base}/proxy-users"
 
         whitelist_base = os.getenv(
             "THORDATA_WHITELIST_BASE_URL", "https://api.thordata.com/api"
@@ -388,6 +193,52 @@
         self._proxy_list_url = f"{proxy_api_base}/proxy/proxy-list"
         self._proxy_expiration_url = f"{proxy_api_base}/proxy/expiration-time"
 
+        # Initialize Namespaces
+        self.serp = SerpNamespace(self)
+        self.unlimited = UnlimitedNamespace(self)
+
+    # =========================================================================
+    # Context Manager
+    # =========================================================================
+
+    def close(self) -> None:
+        """Close the client and release resources."""
+        self._http.close()
+        self._proxy_session.close()
+        for pm in self._proxy_managers.values():
+            pm.clear()
+        self._proxy_managers.clear()
+
+    def __enter__(self) -> ThordataClient:
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()
+
+    # =========================================================================
+    # Internal Helper: API Request Delegation
+    # =========================================================================
+
+    def _api_request_with_retry(
+        self,
+        method: str,
+        url: str,
+        *,
+        data: dict[str, Any] | None = None,
+        headers: dict[str, str] | None = None,
+        params: dict[str, Any] | None = None,
+    ) -> requests.Response:
+        """Delegate to Core HTTP Client."""
+        return self._http.request(
+            method=method, url=url, data=data, headers=headers, params=params
+        )
+
+    def _require_public_credentials(self) -> None:
+        if not self.public_token or not self.public_key:
+            raise ThordataConfigError(
+                "public_token and public_key are required for this operation."
+            )
+
     # =========================================================================
     # Proxy Network Methods
     # =========================================================================
@@ -414,48 +265,6 @@
         logger.debug(f"Proxy POST request: {url}")
         return self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)
 
-    def _proxy_verb(
-        self,
-        method: str,
-        url: str,
-        proxy_config: ProxyConfig | None,
-        timeout: int | None,
-        **kwargs: Any,
-    ) -> requests.Response:
-        timeout = timeout or self._default_timeout
-
-        if proxy_config is None:
-            proxy_config = self._get_default_proxy_config_from_env()
-
-        if proxy_config is None:
-            raise ThordataConfigError(
-                "Proxy credentials are missing. "
-                "Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
-            )
-
-        kwargs.pop("proxies", None)
-
-        @with_retry(self._retry_config)
-        def _do() -> requests.Response:
-            return self._proxy_request_with_proxy_manager(
-                method,
-                url,
-                proxy_config=proxy_config,  # type: ignore
-                timeout=timeout,  # type: ignore
-                headers=kwargs.pop("headers", None),
-                params=kwargs.pop("params", None),
-                data=kwargs.pop("data", None),
-            )
-
-        try:
-            return _do()
-        except requests.Timeout as e:
-            raise ThordataTimeoutError(
-                f"Request timed out: {e}", original_error=e
-            ) from e
-        except Exception as e:
-            raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
-
     def build_proxy_url(
         self,
         username: str,
@@ -483,1190 +292,1269 @@ class ThordataClient:
|
|
|
483
292
|
return config.build_proxy_url()
|
|
484
293
|
|
|
485
294
|
# =========================================================================
|
|
486
|
-
#
|
|
295
|
+
# SERP API Methods
|
|
487
296
|
# =========================================================================
|
|
488
297
|
|
|
489
|
-
def
|
|
298
|
+
def serp_search(
|
|
490
299
|
self,
|
|
491
|
-
|
|
492
|
-
url: str,
|
|
300
|
+
query: str,
|
|
493
301
|
*,
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
timeout=self._api_timeout,
|
|
507
|
-
)
|
|
508
|
-
|
|
509
|
-
try:
|
|
510
|
-
return _do_request()
|
|
511
|
-
except requests.Timeout as e:
|
|
512
|
-
raise ThordataTimeoutError(
|
|
513
|
-
f"API request timed out: {e}", original_error=e
|
|
514
|
-
) from e
|
|
515
|
-
except requests.RequestException as e:
|
|
516
|
-
raise ThordataNetworkError(
|
|
517
|
-
f"API request failed: {e}", original_error=e
|
|
518
|
-
) from e
|
|
302
|
+
engine: Engine | str = Engine.GOOGLE,
|
|
303
|
+
num: int = 10,
|
|
304
|
+
country: str | None = None,
|
|
305
|
+
language: str | None = None,
|
|
306
|
+
search_type: str | None = None,
|
|
307
|
+
device: str | None = None,
|
|
308
|
+
render_js: bool | None = None,
|
|
309
|
+
no_cache: bool | None = None,
|
|
310
|
+
output_format: str = "json",
|
|
311
|
+
**kwargs: Any,
|
|
312
|
+
) -> dict[str, Any]:
|
|
313
|
+
engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
|
|
519
314
|
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
315
|
+
request = SerpRequest(
|
|
316
|
+
query=query,
|
|
317
|
+
engine=engine_str,
|
|
318
|
+
num=num,
|
|
319
|
+
country=country,
|
|
320
|
+
language=language,
|
|
321
|
+
search_type=search_type,
|
|
322
|
+
device=device,
|
|
323
|
+
render_js=render_js,
|
|
324
|
+
no_cache=no_cache,
|
|
325
|
+
output_format=output_format,
|
|
326
|
+
extra_params=kwargs,
|
|
327
|
+
)
|
|
328
|
+
return self.serp_search_advanced(request)
|
|
526
329
|
|
|
527
|
-
def
|
|
528
|
-
self
|
|
529
|
-
|
|
530
|
-
*,
|
|
531
|
-
cache_key: str,
|
|
532
|
-
proxy_headers: dict[str, str] | None = None,
|
|
533
|
-
) -> urllib3.PoolManager:
|
|
534
|
-
"""Get or create a ProxyManager for the given proxy URL (Pooled)."""
|
|
535
|
-
cached = self._proxy_managers.get(cache_key)
|
|
536
|
-
if cached is not None:
|
|
537
|
-
return cached
|
|
330
|
+
def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
|
|
331
|
+
if not self.scraper_token:
|
|
332
|
+
raise ThordataConfigError("scraper_token is required for SERP API")
|
|
538
333
|
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
from urllib3.contrib.socks import SOCKSProxyManager
|
|
542
|
-
except Exception as e:
|
|
543
|
-
raise ThordataConfigError(
|
|
544
|
-
"SOCKS proxy requested but SOCKS dependencies are missing. "
|
|
545
|
-
"Install: pip install 'urllib3[socks]' or pip install PySocks"
|
|
546
|
-
) from e
|
|
547
|
-
|
|
548
|
-
pm_socks = SOCKSProxyManager(
|
|
549
|
-
proxy_url,
|
|
550
|
-
num_pools=10,
|
|
551
|
-
maxsize=10,
|
|
552
|
-
)
|
|
553
|
-
pm = cast(urllib3.PoolManager, pm_socks)
|
|
554
|
-
self._proxy_managers[cache_key] = pm
|
|
555
|
-
return pm
|
|
334
|
+
payload = request.to_payload()
|
|
335
|
+
headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
|
|
556
336
|
|
|
557
|
-
|
|
558
|
-
proxy_ssl_context = None
|
|
559
|
-
if proxy_url.startswith("https://"):
|
|
560
|
-
proxy_ssl_context = ssl.create_default_context()
|
|
337
|
+
logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")
|
|
561
338
|
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
maxsize=10,
|
|
339
|
+
response = self._api_request_with_retry(
|
|
340
|
+
"POST",
|
|
341
|
+
self._serp_url,
|
|
342
|
+
data=payload,
|
|
343
|
+
headers=headers,
|
|
568
344
|
)
|
|
345
|
+
response.raise_for_status()
|
|
569
346
|
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
347
|
+
if request.output_format.lower() == "json":
|
|
348
|
+
data = response.json()
|
|
349
|
+
if isinstance(data, dict):
|
|
350
|
+
code = data.get("code")
|
|
351
|
+
if code is not None and code != 200:
|
|
352
|
+
msg = extract_error_message(data)
|
|
353
|
+
raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
|
|
354
|
+
return parse_json_response(data)
|
|
573
355
|
|
|
574
|
-
|
|
356
|
+
return {"html": response.text}
|
|
357
|
+
|
|
358
|
+
# =========================================================================
|
|
359
|
+
# Universal Scraping API (WEB UNLOCKER) Methods
|
|
360
|
+
# =========================================================================
|
|
361
|
+
|
|
362
|
+
def universal_scrape(
|
|
575
363
|
self,
|
|
576
|
-
method: str,
|
|
577
364
|
url: str,
|
|
578
365
|
*,
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
366
|
+
js_render: bool = False,
|
|
367
|
+
output_format: str = "html",
|
|
368
|
+
country: str | None = None,
|
|
369
|
+
block_resources: str | None = None,
|
|
370
|
+
wait: int | None = None,
|
|
371
|
+
wait_for: str | None = None,
|
|
372
|
+
**kwargs: Any,
|
|
373
|
+
) -> str | bytes:
|
|
374
|
+
request = UniversalScrapeRequest(
|
|
375
|
+
url=url,
|
|
376
|
+
js_render=js_render,
|
|
377
|
+
output_format=output_format,
|
|
378
|
+
country=country,
|
|
379
|
+
block_resources=block_resources,
|
|
380
|
+
wait=wait,
|
|
381
|
+
wait_for=wait_for,
|
|
382
|
+
extra_params=kwargs,
|
|
383
|
+
)
|
|
384
|
+
return self.universal_scrape_advanced(request)
|
|
586
385
|
|
|
587
|
-
|
|
588
|
-
|
|
386
|
+
def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
|
|
387
|
+
if not self.scraper_token:
|
|
388
|
+
raise ThordataConfigError("scraper_token required")
|
|
589
389
|
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
method,
|
|
593
|
-
url,
|
|
594
|
-
proxy_config=proxy_config,
|
|
595
|
-
timeout=timeout,
|
|
596
|
-
headers=headers,
|
|
597
|
-
params=params,
|
|
598
|
-
data=data,
|
|
599
|
-
upstream_config=upstream_config,
|
|
600
|
-
)
|
|
390
|
+
payload = request.to_payload()
|
|
391
|
+
headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
|
|
601
392
|
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
393
|
+
response = self._api_request_with_retry(
|
|
394
|
+
"POST", self._universal_url, data=payload, headers=headers
|
|
395
|
+
)
|
|
396
|
+
response.raise_for_status()
|
|
397
|
+
return self._process_universal_response(response, request.output_format)
|
|
606
398
|
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
399
|
+
# =========================================================================
|
|
400
|
+
# Web Scraper API - Task Management
|
|
401
|
+
# =========================================================================
|
|
402
|
+
|
|
403
|
+
def create_scraper_task(
|
|
404
|
+
self,
|
|
405
|
+
file_name: str,
|
|
406
|
+
spider_id: str,
|
|
407
|
+
spider_name: str,
|
|
408
|
+
parameters: dict[str, Any],
|
|
409
|
+
universal_params: dict[str, Any] | None = None,
|
|
410
|
+
) -> str:
|
|
411
|
+
config = ScraperTaskConfig(
|
|
412
|
+
file_name=file_name,
|
|
413
|
+
spider_id=spider_id,
|
|
414
|
+
spider_name=spider_name,
|
|
415
|
+
parameters=parameters,
|
|
416
|
+
universal_params=universal_params,
|
|
610
417
|
)
|
|
418
|
+
return self.create_scraper_task_advanced(config)
|
|
611
419
|
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
420
|
+
def run_tool(
|
|
421
|
+
self,
|
|
422
|
+
tool_request: Any,
|
|
423
|
+
file_name: str | None = None,
|
|
424
|
+
universal_params: dict[str, Any] | None = None,
|
|
425
|
+
) -> str:
|
|
426
|
+
"""
|
|
427
|
+
Run a specific pre-defined tool.
|
|
428
|
+
Supports both standard Scrapers and Video downloaders.
|
|
429
|
+
"""
|
|
430
|
+
if not hasattr(tool_request, "to_task_parameters") or not hasattr(
|
|
431
|
+
tool_request, "get_spider_id"
|
|
432
|
+
):
|
|
433
|
+
raise ValueError(
|
|
434
|
+
"tool_request must be an instance of a thordata.tools class"
|
|
435
|
+
)
|
|
616
436
|
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
437
|
+
spider_id = tool_request.get_spider_id()
|
|
438
|
+
spider_name = tool_request.get_spider_name()
|
|
439
|
+
params = tool_request.to_task_parameters()
|
|
440
|
+
|
|
441
|
+
if not file_name:
|
|
442
|
+
import uuid
|
|
443
|
+
|
|
444
|
+
short_id = uuid.uuid4().hex[:8]
|
|
445
|
+
file_name = f"{spider_id}_{short_id}"
|
|
446
|
+
|
|
447
|
+
# Check if it's a Video Tool (Duck typing check for common_settings)
|
|
448
|
+
if hasattr(tool_request, "common_settings"):
|
|
449
|
+
# It is a Video Task
|
|
450
|
+
config_video = VideoTaskConfig(
|
|
451
|
+
file_name=file_name,
|
|
452
|
+
spider_id=spider_id,
|
|
453
|
+
spider_name=spider_name,
|
|
454
|
+
parameters=params,
|
|
455
|
+
common_settings=tool_request.common_settings,
|
|
621
456
|
)
|
|
457
|
+
return self.create_video_task_advanced(config_video)
|
|
622
458
|
else:
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
proxy_headers=dict(proxy_headers),
|
|
459
|
+
# It is a Standard Scraper Task
|
|
460
|
+
config = ScraperTaskConfig(
|
|
461
|
+
file_name=file_name,
|
|
462
|
+
spider_id=spider_id,
|
|
463
|
+
spider_name=spider_name,
|
|
464
|
+
parameters=params,
|
|
465
|
+
universal_params=universal_params,
|
|
631
466
|
)
|
|
467
|
+
return self.create_scraper_task_advanced(config)
|
|
632
468
|
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
if
|
|
636
|
-
|
|
637
|
-
body = urlencode({k: str(v) for k, v in data.items()})
|
|
638
|
-
req_headers.setdefault(
|
|
639
|
-
"Content-Type", "application/x-www-form-urlencoded"
|
|
640
|
-
)
|
|
641
|
-
else:
|
|
642
|
-
body = data
|
|
469
|
+
def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
|
|
470
|
+
self._require_public_credentials()
|
|
471
|
+
if not self.scraper_token:
|
|
472
|
+
raise ThordataConfigError("scraper_token is required for Task Builder")
|
|
643
473
|
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
body=body,
|
|
648
|
-
headers=req_headers or None,
|
|
649
|
-
timeout=urllib3.Timeout(connect=timeout, read=timeout),
|
|
650
|
-
retries=False,
|
|
651
|
-
preload_content=True,
|
|
474
|
+
payload = config.to_payload()
|
|
475
|
+
headers = build_builder_headers(
|
|
476
|
+
self.scraper_token, str(self.public_token), str(self.public_key)
|
|
652
477
|
)
|
|
653
478
|
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
# Upstream Proxy Support (Proxy Chaining)
|
|
663
|
-
# =========================================================================
|
|
479
|
+
response = self._api_request_with_retry(
|
|
480
|
+
"POST", self._builder_url, data=payload, headers=headers
|
|
481
|
+
)
|
|
482
|
+
response.raise_for_status()
|
|
483
|
+
data = response.json()
|
|
484
|
+
if data.get("code") != 200:
|
|
485
|
+
raise_for_code("Task creation failed", code=data.get("code"), payload=data)
|
|
486
|
+
return data["data"]["task_id"]
|
|
664
487
|
|
|
665
|
-
def
|
|
488
|
+
def create_video_task(
|
|
666
489
|
self,
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
490
|
+
file_name: str,
|
|
491
|
+
spider_id: str,
|
|
492
|
+
spider_name: str,
|
|
493
|
+
parameters: dict[str, Any],
|
|
494
|
+
common_settings: CommonSettings,
|
|
495
|
+
) -> str:
|
|
496
|
+
config = VideoTaskConfig(
|
|
497
|
+
file_name=file_name,
|
|
498
|
+
spider_id=spider_id,
|
|
499
|
+
spider_name=spider_name,
|
|
500
|
+
parameters=parameters,
|
|
501
|
+
common_settings=common_settings,
|
|
502
|
+
)
|
|
503
|
+
return self.create_video_task_advanced(config)
|
|
504
|
+
|
|
505
|
+
def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
|
|
506
|
+
self._require_public_credentials()
|
|
507
|
+
if not self.scraper_token:
|
|
679
508
|
raise ThordataConfigError(
|
|
680
|
-
"
|
|
681
|
-
"Install with: pip install PySocks"
|
|
509
|
+
"scraper_token is required for Video Task Builder"
|
|
682
510
|
)
|
|
683
511
|
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
512
|
+
payload = config.to_payload()
|
|
513
|
+
headers = build_builder_headers(
|
|
514
|
+
self.scraper_token, str(self.public_token), str(self.public_key)
|
|
515
|
+
)
|
|
687
516
|
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
target_port = parsed_target.port or (
|
|
691
|
-
443 if parsed_target.scheme == "https" else 80
|
|
517
|
+
response = self._api_request_with_retry(
|
|
518
|
+
"POST", self._video_builder_url, data=payload, headers=headers
|
|
692
519
|
)
|
|
693
|
-
|
|
520
|
+
response.raise_for_status()
|
|
521
|
+
data = response.json()
|
|
522
|
+
if data.get("code") != 200:
|
|
523
|
+
raise_for_code(
|
|
524
|
+
"Video task creation failed", code=data.get("code"), payload=data
|
|
525
|
+
)
|
|
526
|
+
return data["data"]["task_id"]
|
|
694
527
|
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
528
|
+
def get_task_status(self, task_id: str) -> str:
|
|
529
|
+
self._require_public_credentials()
|
|
530
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
698
531
|
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
532
|
+
response = self._api_request_with_retry(
|
|
533
|
+
"POST",
|
|
534
|
+
self._status_url,
|
|
535
|
+
data={"tasks_ids": task_id},
|
|
536
|
+
headers=headers,
|
|
537
|
+
)
|
|
538
|
+
response.raise_for_status()
|
|
539
|
+
data = response.json()
|
|
540
|
+
if data.get("code") != 200:
|
|
541
|
+
raise_for_code("Task status error", code=data.get("code"), payload=data)
|
|
703
542
|
|
|
704
|
-
|
|
543
|
+
items = data.get("data") or []
|
|
544
|
+
for item in items:
|
|
545
|
+
if str(item.get("task_id")) == str(task_id):
|
|
546
|
+
return item.get("status", "unknown")
|
|
547
|
+
return "unknown"
|
|
705
548
|
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
)
|
|
549
|
+
def get_latest_task_status(self) -> dict[str, Any]:
|
|
550
|
+
"""
|
|
551
|
+
Get the status of the last task of the specified account.
|
|
552
|
+
"""
|
|
553
|
+
self._require_public_credentials()
|
|
554
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
555
|
+
parsed = urlparse(self._status_url)
|
|
556
|
+
base = f"{parsed.scheme}://{parsed.netloc}"
|
|
557
|
+
endpoint = "/api/web_scraper_api/get_latest_task_status"
|
|
711
558
|
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
559
|
+
response = self._api_request_with_retry(
|
|
560
|
+
"POST",
|
|
561
|
+
f"{base}{endpoint}",
|
|
562
|
+
headers=headers,
|
|
715
563
|
)
|
|
564
|
+
response.raise_for_status()
|
|
565
|
+
data = response.json()
|
|
716
566
|
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
target_host,
|
|
722
|
-
target_port,
|
|
723
|
-
thordata_username,
|
|
724
|
-
thordata_password,
|
|
725
|
-
)
|
|
726
|
-
if target_is_https:
|
|
727
|
-
context = ssl.create_default_context()
|
|
728
|
-
sock = context.wrap_socket(sock, server_hostname=target_host)
|
|
729
|
-
|
|
730
|
-
elif protocol == "https":
|
|
731
|
-
proxy_context = ssl.create_default_context()
|
|
732
|
-
proxy_ssl_sock = proxy_context.wrap_socket(
|
|
733
|
-
raw_sock, server_hostname=thordata_host
|
|
734
|
-
)
|
|
567
|
+
if data.get("code") != 200:
|
|
568
|
+
raise_for_code(
|
|
569
|
+
"Get latest task status failed", code=data.get("code"), payload=data
|
|
570
|
+
)
|
|
735
571
|
|
|
736
|
-
|
|
737
|
-
proxy_ssl_sock,
|
|
738
|
-
target_host,
|
|
739
|
-
target_port,
|
|
740
|
-
thordata_username,
|
|
741
|
-
thordata_password,
|
|
742
|
-
)
|
|
572
|
+
return data.get("data", {})
|
|
743
573
|
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
sock = proxy_ssl_sock
|
|
750
|
-
|
|
751
|
-
else: # HTTP proxy
|
|
752
|
-
self._send_connect_request(
|
|
753
|
-
raw_sock,
|
|
754
|
-
target_host,
|
|
755
|
-
target_port,
|
|
756
|
-
thordata_username,
|
|
757
|
-
thordata_password,
|
|
758
|
-
)
|
|
574
|
+
def safe_get_task_status(self, task_id: str) -> str:
|
|
575
|
+
try:
|
|
576
|
+
return self.get_task_status(task_id)
|
|
577
|
+
except Exception:
|
|
578
|
+
return "error"
|
|
759
579
|
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
else:
|
|
764
|
-
sock = raw_sock
|
|
580
|
+
def get_task_result(self, task_id: str, file_type: str = "json") -> str:
|
|
581
|
+
self._require_public_credentials()
|
|
582
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
765
583
|
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
584
|
+
response = self._api_request_with_retry(
|
|
585
|
+
"POST",
|
|
586
|
+
self._download_url,
|
|
587
|
+
data={"tasks_id": task_id, "type": file_type},
|
|
588
|
+
headers=headers,
|
|
589
|
+
)
|
|
590
|
+
response.raise_for_status()
|
|
591
|
+
data = response.json()
|
|
592
|
+
if data.get("code") == 200 and data.get("data"):
|
|
593
|
+
return data["data"]["download"]
|
|
594
|
+
raise_for_code("Get result failed", code=data.get("code"), payload=data)
|
|
595
|
+
return ""
|
|
596
|
+
|
|
597
|
+
def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
|
|
598
|
+
self._require_public_credentials()
|
|
599
|
+
headers = build_public_api_headers(str(self.public_token), str(self.public_key))
|
|
769
600
|
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
601
|
+
response = self._api_request_with_retry(
|
|
602
|
+
"POST",
|
|
603
|
+
self._list_url,
|
|
604
|
+
data={"page": str(page), "size": str(size)},
|
|
605
|
+
headers=headers,
|
|
606
|
+
)
|
|
607
|
+
response.raise_for_status()
|
|
608
|
+
data = response.json()
|
|
609
|
+
if data.get("code") != 200:
|
|
610
|
+
raise_for_code("List tasks failed", code=data.get("code"), payload=data)
|
|
611
|
+
return data.get("data", {"count": 0, "list": []})
|
|
773
612
|
|
|
774
|
-
def
|
|
613
|
+
def wait_for_task(
|
|
775
614
|
self,
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
"""Send HTTP CONNECT request to proxy and verify response."""
|
|
783
|
-
connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
|
|
784
|
-
connect_req += f"Host: {target_host}:{target_port}\r\n"
|
|
615
|
+
task_id: str,
|
|
616
|
+
*,
|
|
617
|
+
poll_interval: float = 5.0,
|
|
618
|
+
max_wait: float = 600.0,
|
|
619
|
+
) -> str:
|
|
620
|
+
import time
|
|
785
621
|
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
622
|
+
start = time.monotonic()
|
|
623
|
+
while (time.monotonic() - start) < max_wait:
|
|
624
|
+
status = self.get_task_status(task_id)
|
|
625
|
+
if status.lower() in {
|
|
626
|
+
"ready",
|
|
627
|
+
"success",
|
|
628
|
+
"finished",
|
|
629
|
+
"failed",
|
|
630
|
+
"error",
|
|
631
|
+
"cancelled",
|
|
632
|
+
}:
|
|
633
|
+
return status
|
|
634
|
+
time.sleep(poll_interval)
|
|
635
|
+
raise TimeoutError(f"Task {task_id} timeout")
|
|
790
636
|
|
|
791
|
-
|
|
637
|
+
def run_task(
|
|
638
|
+
self,
|
|
639
|
+
file_name: str,
|
|
640
|
+
spider_id: str,
|
|
641
|
+
spider_name: str,
|
|
642
|
+
parameters: dict[str, Any],
|
|
643
|
+
universal_params: dict[str, Any] | None = None,
|
|
644
|
+
*,
|
|
645
|
+
max_wait: float = 600.0,
|
|
646
|
+
initial_poll_interval: float = 2.0,
|
|
647
|
+
max_poll_interval: float = 10.0,
|
|
648
|
+
include_errors: bool = True,
|
|
649
|
+
task_type: str = "web",
|
|
650
|
+
common_settings: CommonSettings | None = None,
|
|
651
|
+
) -> str:
|
|
652
|
+
import time
|
|
792
653
|
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
654
|
+
if task_type == "video":
|
|
655
|
+
if common_settings is None:
|
|
656
|
+
raise ValueError("common_settings is required for video tasks")
|
|
657
|
+
config_video = VideoTaskConfig(
|
|
658
|
+
file_name=file_name,
|
|
659
|
+
spider_id=spider_id,
|
|
660
|
+
spider_name=spider_name,
|
|
661
|
+
parameters=parameters,
|
|
662
|
+
common_settings=common_settings,
|
|
663
|
+
include_errors=include_errors,
|
|
664
|
+
)
|
|
665
|
+
task_id = self.create_video_task_advanced(config_video)
|
|
666
|
+
else:
|
|
667
|
+
config = ScraperTaskConfig(
|
|
668
|
+
file_name=file_name,
|
|
669
|
+
spider_id=spider_id,
|
|
670
|
+
spider_name=spider_name,
|
|
671
|
+
parameters=parameters,
|
|
672
|
+
universal_params=universal_params,
|
|
673
|
+
include_errors=include_errors,
|
|
674
|
+
)
|
|
675
|
+
task_id = self.create_scraper_task_advanced(config)
|
|
799
676
|
|
|
800
|
-
|
|
801
|
-
if "200" not in status_line:
|
|
802
|
-
raise ConnectionError(f"Proxy CONNECT failed: {status_line}")
|
|
677
|
+
logger.info(f"Task created: {task_id}. Polling...")
|
|
803
678
|
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
outer_ssl_sock: ssl.SSLSocket,
|
|
807
|
-
hostname: str,
|
|
808
|
-
timeout: int,
|
|
809
|
-
) -> _TLSInTLSSocket:
|
|
810
|
-
"""Create a TLS connection over an existing TLS connection."""
|
|
811
|
-
context = ssl.create_default_context()
|
|
679
|
+
start_time = time.monotonic()
|
|
680
|
+
current_poll = initial_poll_interval
|
|
812
681
|
|
|
813
|
-
|
|
814
|
-
|
|
682
|
+
while (time.monotonic() - start_time) < max_wait:
|
|
683
|
+
status = self.get_task_status(task_id)
|
|
684
|
+
status_lower = status.lower()
|
|
815
685
|
|
|
816
|
-
|
|
686
|
+
if status_lower in {"ready", "success", "finished"}:
|
|
687
|
+
return self.get_task_result(task_id)
|
|
817
688
|
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
except ssl.SSLWantReadError:
|
|
823
|
-
data_to_send = outgoing.read()
|
|
824
|
-
if data_to_send:
|
|
825
|
-
outer_ssl_sock.sendall(data_to_send)
|
|
689
|
+
if status_lower in {"failed", "error", "cancelled"}:
|
|
690
|
+
raise ThordataNetworkError(
|
|
691
|
+
f"Task {task_id} failed with status: {status}"
|
|
692
|
+
)
|
|
826
693
|
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
received = outer_ssl_sock.recv(8192)
|
|
830
|
-
if not received:
|
|
831
|
-
raise ConnectionError("Connection closed during TLS handshake")
|
|
832
|
-
incoming.write(received)
|
|
833
|
-
except socket.timeout as e:
|
|
834
|
-
raise ConnectionError("Timeout during TLS handshake") from e
|
|
835
|
-
except ssl.SSLWantWriteError:
|
|
836
|
-
data_to_send = outgoing.read()
|
|
837
|
-
if data_to_send:
|
|
838
|
-
outer_ssl_sock.sendall(data_to_send)
|
|
839
|
-
|
|
840
|
-
data_to_send = outgoing.read()
|
|
841
|
-
if data_to_send:
|
|
842
|
-
outer_ssl_sock.sendall(data_to_send)
|
|
843
|
-
|
|
844
|
-
return _TLSInTLSSocket(outer_ssl_sock, ssl_obj, incoming, outgoing)
|
|
845
|
-
|
|
846
|
-
def _send_http_request(
|
|
847
|
-
self,
|
|
848
|
-
sock: socket.socket | ssl.SSLSocket | Any,
|
|
849
|
-
method: str,
|
|
850
|
-
parsed_url: Any,
|
|
851
|
-
headers: dict[str, str] | None,
|
|
852
|
-
data: Any,
|
|
853
|
-
final_url: str,
|
|
854
|
-
timeout: int,
|
|
855
|
-
) -> requests.Response:
|
|
856
|
-
"""Send HTTP request over established connection and parse response."""
|
|
857
|
-
target_host = parsed_url.hostname
|
|
858
|
-
|
|
859
|
-
req_headers = dict(headers or {})
|
|
860
|
-
req_headers.setdefault("Host", target_host)
|
|
861
|
-
req_headers.setdefault("User-Agent", build_user_agent(_sdk_version, "requests"))
|
|
862
|
-
req_headers.setdefault("Connection", "close")
|
|
863
|
-
|
|
864
|
-
path = parsed_url.path or "/"
|
|
865
|
-
if parsed_url.query:
|
|
866
|
-
path += f"?{parsed_url.query}"
|
|
867
|
-
|
|
868
|
-
http_req = f"{method.upper()} {path} HTTP/1.1\r\n"
|
|
869
|
-
for k, v in req_headers.items():
|
|
870
|
-
http_req += f"{k}: {v}\r\n"
|
|
871
|
-
|
|
872
|
-
body = None
|
|
873
|
-
if data is not None:
|
|
874
|
-
if isinstance(data, dict):
|
|
875
|
-
body = urlencode({k: str(v) for k, v in data.items()}).encode()
|
|
876
|
-
http_req += "Content-Type: application/x-www-form-urlencoded\r\n"
|
|
877
|
-
http_req += f"Content-Length: {len(body)}\r\n"
|
|
878
|
-
elif isinstance(data, bytes):
|
|
879
|
-
body = data
|
|
880
|
-
http_req += f"Content-Length: {len(body)}\r\n"
|
|
881
|
-
else:
|
|
882
|
-
body = str(data).encode()
|
|
883
|
-
http_req += f"Content-Length: {len(body)}\r\n"
|
|
884
|
-
|
|
885
|
-
http_req += "\r\n"
|
|
886
|
-
sock.sendall(http_req.encode())
|
|
694
|
+
time.sleep(current_poll)
|
|
695
|
+
current_poll = min(current_poll * 1.5, max_poll_interval)
|
|
887
696
|
|
|
888
|
-
|
|
889
|
-
sock.sendall(body)
|
|
697
|
+
raise ThordataTimeoutError(f"Task {task_id} timed out")
|
|
890
698
|
|
|
891
|
-
|
|
892
|
-
|
|
699
|
+
# =========================================================================
|
|
700
|
+
# Account & Usage Methods
|
|
701
|
+
# =========================================================================
|
|
893
702
|
|
|
894
|
-
|
|
895
|
-
try:
|
|
896
|
-
while True:
|
|
897
|
-
chunk = sock.recv(8192)
|
|
898
|
-
if not chunk:
|
|
899
|
-
break
|
|
900
|
-
response_data += chunk
|
|
901
|
-
if b"\r\n\r\n" in response_data:
|
|
902
|
-
header_end = response_data.index(b"\r\n\r\n") + 4
|
|
903
|
-
headers_part = (
|
|
904
|
-
response_data[:header_end]
|
|
905
|
-
.decode("utf-8", errors="replace")
|
|
906
|
-
.lower()
|
|
907
|
-
)
|
|
908
|
-
if "content-length:" in headers_part:
|
|
909
|
-
for line in headers_part.split("\r\n"):
|
|
910
|
-
if line.startswith("content-length:"):
|
|
911
|
-
content_length = int(line.split(":")[1].strip())
|
|
912
|
-
if len(response_data) >= header_end + content_length:
|
|
913
|
-
break
|
|
914
|
-
elif "transfer-encoding: chunked" not in headers_part:
|
|
915
|
-
break
|
|
916
|
-
except socket.timeout:
|
|
917
|
-
pass
|
|
918
|
-
|
|
919
|
-
return self._parse_http_response(response_data, final_url)
|
|
920
|
-
|
|
921
|
-
def _socks5_handshake(
|
|
703
|
+
def get_usage_statistics(
|
|
922
704
|
self,
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
sock.sendall(b"\x05\x02\x00\x02")
|
|
932
|
-
else:
|
|
933
|
-
sock.sendall(b"\x05\x01\x00")
|
|
934
|
-
|
|
935
|
-
response = sock.recv(2)
|
|
936
|
-
if len(response) < 2:
|
|
937
|
-
raise ConnectionError("SOCKS5 handshake failed: incomplete response")
|
|
705
|
+
from_date: str | date,
|
|
706
|
+
to_date: str | date,
|
|
707
|
+
) -> UsageStatistics:
|
|
708
|
+
self._require_public_credentials()
|
|
709
|
+
if isinstance(from_date, date):
|
|
710
|
+
from_date = from_date.strftime("%Y-%m-%d")
|
|
711
|
+
if isinstance(to_date, date):
|
|
712
|
+
to_date = to_date.strftime("%Y-%m-%d")
|
|
938
713
|
|
|
939
|
-
|
|
940
|
-
|
|
714
|
+
params = {
|
|
715
|
+
"token": self.public_token,
|
|
716
|
+
"key": self.public_key,
|
|
717
|
+
"from_date": from_date,
|
|
718
|
+
"to_date": to_date,
|
|
719
|
+
}
|
|
720
|
+
response = self._api_request_with_retry(
|
|
721
|
+
"GET", self._usage_stats_url, params=params
|
|
722
|
+
)
|
|
723
|
+
response.raise_for_status()
|
|
724
|
+
data = response.json()
|
|
725
|
+
if data.get("code") != 200:
|
|
726
|
+
raise_for_code("Usage stats error", code=data.get("code"), payload=data)
|
|
727
|
+
return UsageStatistics.from_dict(data.get("data", data))
|
|
941
728
|
|
|
942
|
-
|
|
729
|
+
def get_traffic_balance(self) -> float:
|
|
730
|
+
self._require_public_credentials()
|
|
731
|
+
params = {"token": self.public_token, "key": self.public_key}
|
|
732
|
+
api_base = self._locations_base_url.replace("/locations", "")
|
|
733
|
+
response = self._api_request_with_retry(
|
|
734
|
+
"GET", f"{api_base}/account/traffic-balance", params=params
|
|
735
|
+
)
|
|
736
|
+
response.raise_for_status()
|
|
737
|
+
data = response.json()
|
|
738
|
+
if data.get("code") != 200:
|
|
739
|
+
raise_for_code(
|
|
740
|
+
"Get traffic balance failed", code=data.get("code"), payload=data
|
|
741
|
+
)
|
|
742
|
+
return float(data.get("data", {}).get("traffic_balance", 0))
|
|
943
743
|
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
744
|
+
def get_wallet_balance(self) -> float:
|
|
745
|
+
self._require_public_credentials()
|
|
746
|
+
params = {"token": self.public_token, "key": self.public_key}
|
|
747
|
+
api_base = self._locations_base_url.replace("/locations", "")
|
|
748
|
+
response = self._api_request_with_retry(
|
|
749
|
+
"GET", f"{api_base}/account/wallet-balance", params=params
|
|
750
|
+
)
|
|
751
|
+
response.raise_for_status()
|
|
752
|
+
data = response.json()
|
|
753
|
+
if data.get("code") != 200:
|
|
754
|
+
raise_for_code(
|
|
755
|
+
"Get wallet balance failed", code=data.get("code"), payload=data
|
|
756
|
+
)
|
|
757
|
+
return float(data.get("data", {}).get("balance", 0))
|
|
949
758
|
|
|
950
|
-
|
|
951
|
-
auth_req += bytes([len(password)]) + password.encode()
|
|
952
|
-
sock.sendall(auth_req)
|
|
953
|
-
|
|
954
|
-
auth_resp = sock.recv(2)
|
|
955
|
-
if len(auth_resp) < 2 or auth_resp[1] != 0x00:
|
|
956
|
-
raise ConnectionError("SOCKS5 authentication failed")
|
|
957
|
-
|
|
958
|
-
elif auth_method == 0xFF:
|
|
959
|
-
raise ConnectionError("SOCKS5 no acceptable auth method")
|
|
960
|
-
|
|
961
|
-
connect_req = b"\x05\x01\x00\x03"
|
|
962
|
-
connect_req += bytes([len(target_host)]) + target_host.encode()
|
|
963
|
-
connect_req += target_port.to_bytes(2, "big")
|
|
964
|
-
sock.sendall(connect_req)
|
|
965
|
-
|
|
966
|
-
resp = sock.recv(4)
|
|
967
|
-
if len(resp) < 4:
|
|
968
|
-
raise ConnectionError("SOCKS5 connect failed: incomplete response")
|
|
969
|
-
|
|
970
|
-
if resp[1] != 0x00:
|
|
971
|
-
error_codes = {
|
|
972
|
-
0x01: "General failure",
|
|
973
|
-
0x02: "Connection not allowed",
|
|
974
|
-
0x03: "Network unreachable",
|
|
975
|
-
0x04: "Host unreachable",
|
|
976
|
-
0x05: "Connection refused",
|
|
977
|
-
0x06: "TTL expired",
|
|
978
|
-
0x07: "Command not supported",
|
|
979
|
-
0x08: "Address type not supported",
|
|
980
|
-
}
|
|
981
|
-
error_msg = error_codes.get(resp[1], f"Unknown error {resp[1]}")
|
|
982
|
-
raise ConnectionError(f"SOCKS5 connect failed: {error_msg}")
|
|
983
|
-
|
|
984
|
-
addr_type = resp[3]
|
|
985
|
-
if addr_type == 0x01:
|
|
986
|
-
sock.recv(4 + 2)
|
|
987
|
-
elif addr_type == 0x03:
|
|
988
|
-
domain_len = sock.recv(1)[0]
|
|
989
|
-
sock.recv(domain_len + 2)
|
|
990
|
-
elif addr_type == 0x04:
|
|
991
|
-
sock.recv(16 + 2)
|
|
992
|
-
|
|
993
|
-
return sock
|
|
994
|
-
|
|
995
|
-
def _parse_http_response(
|
|
759
|
+
def get_proxy_user_usage(
|
|
996
760
|
self,
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
else
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
status_line = header_lines[0] if header_lines else ""
|
|
1010
|
-
parts = status_line.split(" ", 2)
|
|
1011
|
-
status_code = int(parts[1]) if len(parts) > 1 else 0
|
|
761
|
+
username: str,
|
|
762
|
+
start_date: str | date,
|
|
763
|
+
end_date: str | date,
|
|
764
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
765
|
+
) -> list[dict[str, Any]]:
|
|
766
|
+
self._require_public_credentials()
|
|
767
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
768
|
+
if isinstance(start_date, date):
|
|
769
|
+
start_date = start_date.strftime("%Y-%m-%d")
|
|
770
|
+
if isinstance(end_date, date):
|
|
771
|
+
end_date = end_date.strftime("%Y-%m-%d")
|
|
1012
772
|
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
773
|
+
params = {
|
|
774
|
+
"token": self.public_token,
|
|
775
|
+
"key": self.public_key,
|
|
776
|
+
"proxy_type": str(pt),
|
|
777
|
+
"username": username,
|
|
778
|
+
"from_date": start_date,
|
|
779
|
+
"to_date": end_date,
|
|
780
|
+
}
|
|
781
|
+
response = self._api_request_with_retry(
|
|
782
|
+
"GET", f"{self._proxy_users_url}/usage-statistics", params=params
|
|
783
|
+
)
|
|
784
|
+
response.raise_for_status()
|
|
785
|
+
data = response.json()
|
|
786
|
+
if data.get("code") != 200:
|
|
787
|
+
raise_for_code("Get user usage failed", code=data.get("code"), payload=data)
|
|
788
|
+
return data.get("data", [])
|
|
1018
789
|
|
|
1019
|
-
|
|
1020
|
-
|
|
790
|
+
def get_proxy_user_usage_hour(
|
|
791
|
+
self,
|
|
792
|
+
username: str,
|
|
793
|
+
from_date: str, # Format: yyyy-mm-dd HH
|
|
794
|
+
to_date: str, # Format: yyyy-mm-dd HH
|
|
795
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
796
|
+
) -> list[dict[str, Any]]:
|
|
797
|
+
"""
|
|
798
|
+
Get proxy user traffic usage logs by hour.
|
|
1021
799
|
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
800
|
+
Args:
|
|
801
|
+
username: The proxy username.
|
|
802
|
+
from_date: Start date string (yyyy-mm-dd HH).
|
|
803
|
+
to_date: End date string (yyyy-mm-dd HH).
|
|
804
|
+
proxy_type: Proxy type (default: Residential).
|
|
805
|
+
"""
|
|
806
|
+
self._require_public_credentials()
|
|
807
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1028
808
|
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
809
|
+
params = {
|
|
810
|
+
"token": self.public_token,
|
|
811
|
+
"key": self.public_key,
|
|
812
|
+
"proxy_type": str(pt),
|
|
813
|
+
"username": username,
|
|
814
|
+
"from_date": from_date,
|
|
815
|
+
"to_date": to_date,
|
|
816
|
+
}
|
|
817
|
+
response = self._api_request_with_retry(
|
|
818
|
+
"GET", f"{self._proxy_users_url}/usage-statistics-hour", params=params
|
|
819
|
+
)
|
|
820
|
+
response.raise_for_status()
|
|
821
|
+
data = response.json()
|
|
822
|
+
if data.get("code") != 200:
|
|
823
|
+
raise_for_code(
|
|
824
|
+
"Get hourly usage failed", code=data.get("code"), payload=data
|
|
825
|
+
)
|
|
1040
826
|
|
|
1041
|
-
|
|
1042
|
-
|
|
827
|
+
# API returns { "data": { "data": [...] } } structure
|
|
828
|
+
inner_data = data.get("data", {})
|
|
829
|
+
if isinstance(inner_data, dict):
|
|
830
|
+
return inner_data.get("data", [])
|
|
831
|
+
return []
|
|
1043
832
|
|
|
1044
|
-
|
|
1045
|
-
|
|
833
|
+
def extract_ip_list(
|
|
834
|
+
self,
|
|
835
|
+
num: int = 1,
|
|
836
|
+
country: str | None = None,
|
|
837
|
+
state: str | None = None,
|
|
838
|
+
city: str | None = None,
|
|
839
|
+
time_limit: int | None = None,
|
|
840
|
+
port: int | None = None,
|
|
841
|
+
return_type: str = "txt",
|
|
842
|
+
protocol: str = "http",
|
|
843
|
+
sep: str = "\r\n",
|
|
844
|
+
product: str = "residential",
|
|
845
|
+
) -> list[str]:
|
|
846
|
+
base_url = "https://get-ip.thordata.net"
|
|
847
|
+
endpoint = "/unlimited_api" if product == "unlimited" else "/api"
|
|
848
|
+
params: dict[str, Any] = {
|
|
849
|
+
"num": str(num),
|
|
850
|
+
"return_type": return_type,
|
|
851
|
+
"protocol": protocol,
|
|
852
|
+
"sep": sep,
|
|
853
|
+
}
|
|
854
|
+
if country:
|
|
855
|
+
params["country"] = country
|
|
856
|
+
if state:
|
|
857
|
+
params["state"] = state
|
|
858
|
+
if city:
|
|
859
|
+
params["city"] = city
|
|
860
|
+
if time_limit:
|
|
861
|
+
params["time"] = str(time_limit)
|
|
862
|
+
if port:
|
|
863
|
+
params["port"] = str(port)
|
|
864
|
+
|
|
865
|
+
username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
|
|
866
|
+
if username:
|
|
867
|
+
params["td-customer"] = username
|
|
1046
868
|
|
|
1047
|
-
|
|
1048
|
-
|
|
869
|
+
response = self._api_request_with_retry(
|
|
870
|
+
"GET", f"{base_url}{endpoint}", params=params
|
|
871
|
+
)
|
|
872
|
+
response.raise_for_status()
|
|
1049
873
|
|
|
1050
|
-
|
|
874
|
+
if return_type == "json":
|
|
875
|
+
data = response.json()
|
|
876
|
+
if isinstance(data, dict):
|
|
877
|
+
if data.get("code") in (0, 200):
|
|
878
|
+
raw_list = data.get("data") or []
|
|
879
|
+
return [f"{item['ip']}:{item['port']}" for item in raw_list]
|
|
880
|
+
else:
|
|
881
|
+
raise_for_code(
|
|
882
|
+
"Extract IPs failed", code=data.get("code"), payload=data
|
|
883
|
+
)
|
|
884
|
+
return []
|
|
885
|
+
else:
|
|
886
|
+
text = response.text.strip()
|
|
887
|
+
if text.startswith("{") and "code" in text:
|
|
888
|
+
try:
|
|
889
|
+
err_data = response.json()
|
|
890
|
+
raise_for_code(
|
|
891
|
+
"Extract IPs failed",
|
|
892
|
+
code=err_data.get("code"),
|
|
893
|
+
payload=err_data,
|
|
894
|
+
)
|
|
895
|
+
except ValueError:
|
|
896
|
+
pass
|
|
897
|
+
actual_sep = sep.replace("\\r", "\r").replace("\\n", "\n")
|
|
898
|
+
return [line.strip() for line in text.split(actual_sep) if line.strip()]
|
|
 
     # =========================================================================
-    #
+    # Proxy Users Management
     # =========================================================================
 
-    def
+    def list_proxy_users(
+        self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
+    ) -> ProxyUserList:
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        params = {
+            "token": self.public_token,
+            "key": self.public_key,
+            "proxy_type": str(pt),
+        }
+        response = self._api_request_with_retry(
+            "GET", f"{self._proxy_users_url}/user-list", params=params
+        )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code("List users error", code=data.get("code"), payload=data)
+        return ProxyUserList.from_dict(data.get("data", data))
+
+    def create_proxy_user(
         self,
-
-
-
-
-
-        language: str | None = None,
-        search_type: str | None = None,
-        device: str | None = None,
-        render_js: bool | None = None,
-        no_cache: bool | None = None,
-        output_format: str = "json",
-        **kwargs: Any,
+        username: str,
+        password: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+        traffic_limit: int = 0,
+        status: bool = True,
     ) -> dict[str, Any]:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        headers = build_public_api_headers(str(self.public_token), str(self.public_key))
+        payload = {
+            "proxy_type": str(pt),
+            "username": username,
+            "password": password,
+            "traffic_limit": str(traffic_limit),
+            "status": "true" if status else "false",
+        }
+        response = self._api_request_with_retry(
+            "POST",
+            f"{self._proxy_users_url}/create-user",
+            data=payload,
+            headers=headers,
         )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code("Create user failed", code=data.get("code"), payload=data)
+        return data.get("data", {})
 
-
-
-
-
-
-
-
-
+    def update_proxy_user(
+        self,
+        username: str,
+        password: str,
+        traffic_limit: int | None = None,
+        status: bool | None = None,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+        new_username: str | None = None,  # Added optional new_username
+    ) -> dict[str, Any]:
+        """
+        Update a proxy user.
+        Note: API requires 'new_' prefixed fields and ALL are required.
+        """
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        headers = build_public_api_headers(str(self.public_token), str(self.public_key))
 
-
+        # Defaults
+        limit_val = str(traffic_limit) if traffic_limit is not None else "0"
+        status_val = "true" if (status is None or status) else "false"
 
-
-
-            "POST",
-            self._serp_url,
-            data=payload,
-            headers=headers,
-        )
-        response.raise_for_status()
+        # If new_username is not provided, keep the old one (API requires new_username field)
+        target_username = new_username or username
 
-
-
-
-
-
-
-
-
+        # Mapping to API specific field names (new_...)
+        payload = {
+            "proxy_type": str(pt),
+            "username": username,  # Who to update
+            "new_username": target_username,  # Required field
+            "new_password": password,  # Required field
+            "new_traffic_limit": limit_val,  # Required field
+            "new_status": status_val,  # Required field
+        }
 
-
+        response = self._api_request_with_retry(
+            "POST",
+            f"{self._proxy_users_url}/update-user",
+            data=payload,
+            headers=headers,
+        )
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code("Update user failed", code=data.get("code"), payload=data)
+        return data.get("data", {})
 
-
-
-
-
+    def delete_proxy_user(
+        self,
+        username: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> dict[str, Any]:
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        headers = build_public_api_headers(str(self.public_token), str(self.public_key))
+        payload = {"proxy_type": str(pt), "username": username}
+        response = self._api_request_with_retry(
+            "POST",
+            f"{self._proxy_users_url}/delete-user",
+            data=payload,
+            headers=headers,
+        )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code("Delete user failed", code=data.get("code"), payload=data)
+        return data.get("data", {})
 
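
Reviewer note — `list_proxy_users`, `create_proxy_user`, `update_proxy_user`, and `delete_proxy_user` form a complete CRUD surface over sub-account credentials. A usage sketch (constructor keywords are an assumption based on the attribute names used above; values are placeholders):

    from thordata import ThordataClient

    client = ThordataClient(public_token="...", public_key="...")
    client.create_proxy_user("team-a", "s3cret", traffic_limit=0)
    client.update_proxy_user("team-a", "rotated-s3cret", status=True)
    users = client.list_proxy_users()  # ProxyUserList model, defined elsewhere in the SDK
    client.delete_proxy_user("team-a")

Note that `update_proxy_user` resends every `new_`-prefixed field even when only one changes, because the backend treats all of them as required.
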
     # =========================================================================
-    #
+    # Whitelist IP Management
     # =========================================================================
 
-    def
+    def add_whitelist_ip(
         self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            country=country,
-            block_resources=block_resources,
-            wait=wait,
-            wait_for=wait_for,
-            extra_params=kwargs,
+        ip: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+        status: bool = True,
+    ) -> dict[str, Any]:
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        headers = build_public_api_headers(str(self.public_token), str(self.public_key))
+        payload = {
+            "proxy_type": str(pt),
+            "ip": ip,
+            "status": "true" if status else "false",
+        }
+        response = self._api_request_with_retry(
+            "POST", f"{self._whitelist_url}/add-ip", data=payload, headers=headers
         )
-
-
-
-
-
-
-        payload = request.to_payload()
-        headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
-
-        logger.info(f"Universal Scrape: {request.url}")
-
-        try:
-            response = self._api_request_with_retry(
-                "POST",
-                self._universal_url,
-                data=payload,
-                headers=headers,
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code(
+                "Add whitelist IP failed", code=data.get("code"), payload=data
             )
-
-            return self._process_universal_response(response, request.output_format)
-
-        except requests.Timeout as e:
-            raise ThordataTimeoutError(
-                f"Universal timeout: {e}", original_error=e
-            ) from e
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"Universal failed: {e}", original_error=e
-            ) from e
-
-    def _process_universal_response(
-        self, response: requests.Response, output_format: str
-    ) -> str | bytes:
-        try:
-            resp_json = response.json()
-        except ValueError:
-            return response.content if output_format.lower() == "png" else response.text
+        return data.get("data", {})
 
-
-
-
-
-
+    def delete_whitelist_ip(
+        self,
+        ip: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> dict[str, Any]:
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        headers = build_public_api_headers(str(self.public_token), str(self.public_key))
+        payload = {"proxy_type": str(pt), "ip": ip}
+        response = self._api_request_with_retry(
+            "POST", f"{self._whitelist_url}/delete-ip", data=payload, headers=headers
+        )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code(
+                "Delete whitelist IP failed", code=data.get("code"), payload=data
+            )
+        return data.get("data", {})
 
-
-
-
-
+    def list_whitelist_ips(
+        self,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> list[str]:
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        params = {
+            "token": self.public_token,
+            "key": self.public_key,
+            "proxy_type": str(pt),
+        }
+        response = self._api_request_with_retry(
+            "GET", f"{self._whitelist_url}/ip-list", params=params
+        )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code(
+                "List whitelist IPs failed", code=data.get("code"), payload=data
+            )
 
-
+        items = data.get("data", []) or []
+        result = []
+        for item in items:
+            if isinstance(item, str):
+                result.append(item)
+            elif isinstance(item, dict) and "ip" in item:
+                result.append(str(item["ip"]))
+            else:
+                result.append(str(item))
+        return result
 
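
Reviewer note — the whitelist trio mirrors the user-management pattern: POST mutations with form payloads, GET listing with token/key query parameters. Sketch (IP values are placeholders):

    client.add_whitelist_ip("203.0.113.7")   # defaults to the residential product
    print(client.list_whitelist_ips())       # e.g. ["203.0.113.7", ...]
    client.delete_whitelist_ip("203.0.113.7")

The normalization loop in `list_whitelist_ips` is defensive: it accepts the backend returning either bare strings or {"ip": ...} objects.
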
     # =========================================================================
-    #
+    # Locations & ASN Methods
     # =========================================================================
 
-    def
-        self,
-        file_name: str,
-        spider_id: str,
-        spider_name: str,
-        parameters: dict[str, Any],
-        universal_params: dict[str, Any] | None = None,
-    ) -> str:
-        config = ScraperTaskConfig(
-            file_name=file_name,
-            spider_id=spider_id,
-            spider_name=spider_name,
-            parameters=parameters,
-            universal_params=universal_params,
-        )
-        return self.create_scraper_task_advanced(config)
-
-    def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
+    def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
         self._require_public_credentials()
-
-
-
-
-
+        params = {"token": self.public_token, "key": self.public_key}
+        for k, v in kwargs.items():
+            params[k] = str(v)
+
+        response = self._api_request_with_retry(
+            "GET", f"{self._locations_base_url}/{endpoint}", params=params
         )
+        response.raise_for_status()
+        data = response.json()
 
-
-        response = self._api_request_with_retry(
-            "POST", self._builder_url, data=payload, headers=headers
-        )
-        response.raise_for_status()
-        data = response.json()
+        if isinstance(data, dict):
             if data.get("code") != 200:
-
-
-
-            return data["data"]["task_id"]
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"Task creation failed: {e}", original_error=e
-            ) from e
+                raise RuntimeError(f"Locations error: {data.get('msg')}")
+            return data.get("data") or []
+        return data if isinstance(data, list) else []
 
-    def
+    def list_countries(
+        self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
+    ) -> list[dict[str, Any]]:
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        return self._get_locations("countries", proxy_type=pt)
+
+    def list_states(
         self,
-
-
-
-
-
-    ) -> str:
-        config = VideoTaskConfig(
-            file_name=file_name,
-            spider_id=spider_id,
-            spider_name=spider_name,
-            parameters=parameters,
-            common_settings=common_settings,
-        )
-        return self.create_video_task_advanced(config)
+        country_code: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> list[dict[str, Any]]:
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        return self._get_locations("states", proxy_type=pt, country_code=country_code)
 
-    def
-        self
-
-
-
-
+    def list_cities(
+        self,
+        country_code: str,
+        state_code: str | None = None,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> list[dict[str, Any]]:
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        kwargs = {"proxy_type": pt, "country_code": country_code}
+        if state_code:
+            kwargs["state_code"] = state_code
+        return self._get_locations("cities", **kwargs)
 
-
-
-
-
+    def list_asn(
+        self,
+        country_code: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> list[dict[str, Any]]:
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        return self._get_locations("asn", proxy_type=pt, country_code=country_code)
+
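
Reviewer note — all four public lookups funnel through `_get_locations`, which stringifies every kwarg into query parameters. A drill-down sketch (country and state codes are illustrative):

    countries = client.list_countries()
    states = client.list_states("US")
    cities = client.list_cities("US", state_code="CA")
    asns = client.list_asn("US")

Each call returns the raw list of dicts from the API; no model types are layered on here, unlike the proxy-user endpoints.
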
+    # =========================================================================
+    # ISP & Datacenter Proxy Management
+    # =========================================================================
 
+    def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
+        self._require_public_credentials()
+        params = {
+            "token": self.public_token,
+            "key": self.public_key,
+            "proxy_type": str(proxy_type),
+        }
         response = self._api_request_with_retry(
-            "
+            "GET", self._proxy_list_url, params=params
         )
         response.raise_for_status()
         data = response.json()
         if data.get("code") != 200:
             raise_for_code(
-                "
-            )
-            return data["data"]["task_id"]
-
-    def get_task_status(self, task_id: str) -> str:
-        self._require_public_credentials()
-        headers = build_public_api_headers(
-            self.public_token or "", self.public_key or ""
-        )
-        try:
-            response = self._api_request_with_retry(
-                "POST",
-                self._status_url,
-                data={"tasks_ids": task_id},
-                headers=headers,
+                "List proxy servers error", code=data.get("code"), payload=data
             )
-            response.raise_for_status()
-            data = response.json()
-            if data.get("code") != 200:
-                raise_for_code("Task status error", code=data.get("code"), payload=data)
-
-            items = data.get("data") or []
-            for item in items:
-                if str(item.get("task_id")) == str(task_id):
-                    return item.get("status", "unknown")
-            return "unknown"
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"Status check failed: {e}", original_error=e
-            ) from e
-
-    def safe_get_task_status(self, task_id: str) -> str:
-        try:
-            return self.get_task_status(task_id)
-        except Exception:
-            return "error"
 
-
-
-
-
-
-
-            response = self._api_request_with_retry(
-                "POST",
-                self._download_url,
-                data={"tasks_id": task_id, "type": file_type},
-                headers=headers,
-            )
-            response.raise_for_status()
-            data = response.json()
-            if data.get("code") == 200 and data.get("data"):
-                return data["data"]["download"]
-            raise_for_code("Get result failed", code=data.get("code"), payload=data)
-            return ""
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"Get result failed: {e}", original_error=e
-            ) from e
+        server_list = []
+        if isinstance(data, dict):
+            server_list = data.get("data", data.get("list", []))
+        elif isinstance(data, list):
+            server_list = data
+        return [ProxyServer.from_dict(s) for s in server_list]
 
-    def
+    def get_proxy_expiration(
+        self, ips: str | list[str], proxy_type: int
+    ) -> dict[str, Any]:
         self._require_public_credentials()
-
-
-
+        if isinstance(ips, list):
+            ips = ",".join(ips)
+        params = {
+            "token": self.public_token,
+            "key": self.public_key,
+            "proxy_type": str(proxy_type),
+            "ips": ips,
+        }
         response = self._api_request_with_retry(
-            "
-            self._list_url,
-            data={"page": str(page), "size": str(size)},
-            headers=headers,
+            "GET", self._proxy_expiration_url, params=params
         )
         response.raise_for_status()
         data = response.json()
         if data.get("code") != 200:
-            raise_for_code("
-        return data.get("data",
+            raise_for_code("Get expiration error", code=data.get("code"), payload=data)
+        return data.get("data", data)
 
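
Reviewer note — unlike the residential methods, these two take a bare `int` proxy type with no enum default, presumably because ISP and datacenter products use distinct codes. Sketch (the numeric code below is an assumption; consult ProxyType for real values):

    servers = client.list_proxy_servers(proxy_type=2)  # hypothetical product code
    info = client.get_proxy_expiration(["1.2.3.4", "5.6.7.8"], proxy_type=2)

`get_proxy_expiration` joins a list of IPs into a comma-separated string, so both a single `"1.2.3.4"` and a list are accepted.
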
-
-
-
-        *,
-        poll_interval: float = 5.0,
-        max_wait: float = 600.0,
-    ) -> str:
-        import time
+    # =========================================================================
+    # Helpers needed for compatibility
+    # =========================================================================
 
-
-
-
-
-
-
-
-                "failed",
-                "error",
-                "cancelled",
-            }:
-                return status
-            time.sleep(poll_interval)
-        raise TimeoutError(f"Task {task_id} timeout")
+    def _process_universal_response(
+        self, response: requests.Response, output_format: str
+    ) -> str | bytes:
+        try:
+            resp_json = response.json()
+        except ValueError:
+            return response.content if output_format.lower() == "png" else response.text
 
-
-
-
-
-
-
-
-
-
-
-
-
+        if isinstance(resp_json, dict):
+            code = resp_json.get("code")
+            if code is not None and code != 200:
+                msg = extract_error_message(resp_json)
+                raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
+
+            if "html" in resp_json:
+                return resp_json["html"]
+            if "png" in resp_json:
+                return decode_base64_image(resp_json["png"])
+        return str(resp_json)
+
+    def get_browser_connection_url(
+        self, username: str | None = None, password: str | None = None
     ) -> str:
-
-
+        # User requested modification: ONLY use browser credentials, do not fall back to residential.
+        user = username or os.getenv("THORDATA_BROWSER_USERNAME")
+        pwd = password or os.getenv("THORDATA_BROWSER_PASSWORD")
 
-
-
-
-
+        if not user or not pwd:
+            raise ThordataConfigError(
+                "Browser credentials missing. Set THORDATA_BROWSER_USERNAME/PASSWORD or pass arguments."
+            )
+        prefix = "td-customer-"
+        final_user = f"{prefix}{user}" if not user.startswith(prefix) else user
 
-
-            file_name: Name for the output file.
-            spider_id: Spider identifier from Dashboard.
-            spider_name: Spider name (target domain).
-            parameters: Spider-specific parameters.
-            universal_params: Global spider settings.
-            max_wait: Maximum seconds to wait for task completion (default 600).
-            initial_poll_interval: Starting poll interval in seconds.
-            max_poll_interval: Maximum poll interval cap.
-            include_errors: Whether to include error logs in the task result.
-
-        Returns:
-            str: The download URL for the task result (default JSON).
-
-        Raises:
-            ThordataTimeoutError: If task takes longer than max_wait.
-            ThordataAPIError: If task fails or is cancelled.
-        """
-        import time
+        from urllib.parse import quote
 
-
-
-            file_name=file_name,
-            spider_id=spider_id,
-            spider_name=spider_name,
-            parameters=parameters,
-            universal_params=universal_params,
-            include_errors=include_errors,
-        )
-        task_id = self.create_scraper_task_advanced(config)
-        logger.info(f"Task created successfully: {task_id}. Waiting for completion...")
+        safe_user = quote(final_user, safe="")
+        safe_pass = quote(pwd, safe="")
 
-
-        start_time = time.monotonic()
-        current_poll = initial_poll_interval
+        return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"
 
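
Reviewer note — `get_browser_connection_url` only assembles and percent-encodes a WebSocket endpoint; it does not open a connection. A hedged sketch of how the URL might be consumed with Playwright (whether the endpoint speaks CDP is an assumption, not stated in this diff):

    from playwright.sync_api import sync_playwright

    ws_url = client.get_browser_connection_url()  # uses THORDATA_BROWSER_* env vars
    with sync_playwright() as p:
        browser = p.chromium.connect_over_cdp(ws_url)
        page = browser.new_page()
        page.goto("https://example.com")

The `quote(..., safe="")` calls matter: credentials containing `@`, `:` or `/` would otherwise corrupt the authority section of the wss:// URL.
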
-
-
-
+    # =========================================================================
+    # Proxy Internal Logic
+    # =========================================================================
 
-
-
-
-
+    def _proxy_verb(
+        self,
+        method: str,
+        url: str,
+        proxy_config: ProxyConfig | None,
+        timeout: int | None,
+        **kwargs: Any,
+    ) -> requests.Response:
+        timeout = timeout or self._default_timeout
+        if proxy_config is None:
+            proxy_config = self._get_default_proxy_config_from_env()
+        if proxy_config is None:
+            raise ThordataConfigError("Proxy credentials are missing.")
 
-
-        raise ThordataNetworkError(
-            f"Task {task_id} ended with failed status: {status}"
-        )
+        kwargs.pop("proxies", None)
 
-
-
-
+        @with_retry(self._retry_config)
+        def _do() -> requests.Response:
+            return self._proxy_request_with_proxy_manager(
+                method,
+                url,
+                proxy_config=cast(ProxyConfig, proxy_config),
+                timeout=cast(int, timeout),
+                headers=kwargs.pop("headers", None),
+                params=kwargs.pop("params", None),
+                data=kwargs.pop("data", None),
+            )
 
-
+        try:
+            return _do()
+        except Exception as e:
+            raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
 
-
-
-
+    def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
+        if not userpass:
+            return proxy_endpoint
+        h = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
+        return f"{proxy_endpoint}|auth={h}"
 
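
Reviewer note — hashing the basic-auth string into the pool cache key keeps per-credential connection pools separate without storing plaintext credentials as dict keys. The derivation in isolation:

    import hashlib

    endpoint = "http://pr.thordata.net:9999"
    userpass = "td-customer-demo:secret"  # placeholder credentials
    digest = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
    key = f"{endpoint}|auth={digest}"  # stable per endpoint+credential pair

Twelve hex characters (48 bits) is plenty for a handful of cached managers while keeping keys short and log-safe.
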
-    def
+    def _get_proxy_manager(
         self,
-
-
-
-
-
-
-        if
-
+        proxy_url: str,
+        *,
+        cache_key: str,
+        proxy_headers: dict[str, str] | None = None,
+    ) -> urllib3.PoolManager:
+        cached = self._proxy_managers.get(cache_key)
+        if cached is not None:
+            return cached
+
+        if proxy_url.startswith(("socks5://", "socks5h://", "socks4://", "socks4a://")):
+            if not HAS_PYSOCKS:
+                raise ThordataConfigError(
+                    "SOCKS support requires PySocks/urllib3[socks]"
+                )
+            from urllib3.contrib.socks import SOCKSProxyManager
+
+            pm = cast(
+                urllib3.PoolManager,
+                SOCKSProxyManager(proxy_url, num_pools=10, maxsize=10),
+            )
+            self._proxy_managers[cache_key] = pm
+            return pm
+
+        proxy_ssl_context = (
+            ssl.create_default_context() if proxy_url.startswith("https://") else None
+        )
+        pm = urllib3.ProxyManager(
+            proxy_url,
+            proxy_headers=proxy_headers,
+            proxy_ssl_context=proxy_ssl_context,
+            num_pools=10,
+            maxsize=10,
+        )
+        self._proxy_managers[cache_key] = pm
+        return pm
+
+    def _proxy_request_with_proxy_manager(
+        self,
+        method: str,
+        url: str,
+        *,
+        proxy_config: ProxyConfig,
+        timeout: int,
+        headers: dict[str, str] | None = None,
+        params: dict[str, Any] | None = None,
+        data: Any = None,
+    ) -> requests.Response:
+        upstream = _parse_upstream_proxy()
+        if upstream:
+            return self._proxy_request_with_upstream(
+                method,
+                url,
+                proxy_config=proxy_config,
+                timeout=timeout,
+                headers=headers,
+                params=params,
+                data=data,
+                upstream_config=upstream,
+            )
+
+        req = requests.Request(method=method.upper(), url=url, params=params)
+        prepped = self._proxy_session.prepare_request(req)
+        final_url = prepped.url or url
+
+        proxy_endpoint = proxy_config.build_proxy_endpoint()
+        is_socks = proxy_endpoint.startswith(("socks",))
+
+        if is_socks:
+            proxy_url_for_manager = proxy_config.build_proxy_url()
+            cache_key = proxy_url_for_manager
+            pm = self._get_proxy_manager(proxy_url_for_manager, cache_key=cache_key)
+            req_headers = dict(headers or {})
+        else:
+            userpass = proxy_config.build_proxy_basic_auth()
+            proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
+            cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
+            pm = self._get_proxy_manager(
+                proxy_endpoint, cache_key=cache_key, proxy_headers=dict(proxy_headers)
+            )
+            req_headers = dict(headers or {})
+
+        body = None
+        if data is not None:
+            if isinstance(data, dict):
+                body = urlencode({k: str(v) for k, v in data.items()})
+                req_headers.setdefault(
+                    "Content-Type", "application/x-www-form-urlencoded"
+                )
+            else:
+                body = data
 
-
-
-
-
-
-
-
-
+        http_resp = pm.request(
+            method.upper(),
+            final_url,
+            body=body,
+            headers=req_headers or None,
+            timeout=urllib3.Timeout(connect=timeout, read=timeout),
+            retries=False,
+            preload_content=True,
         )
-        response.raise_for_status()
-        data = response.json()
-        if data.get("code") != 200:
-            raise_for_code("Usage stats error", code=data.get("code"), payload=data)
-        return UsageStatistics.from_dict(data.get("data", data))
 
-
-
-
-
-
-
-            "token": self.public_token,
-            "key": self.public_key,
-            "proxy_type": str(pt),
-        }
-        response = self._api_request_with_retry(
-            "GET", f"{self._proxy_users_url}/user-list", params=params
-        )
-        response.raise_for_status()
-        data = response.json()
-        if data.get("code") != 200:
-            raise_for_code("List users error", code=data.get("code"), payload=data)
-        return ProxyUserList.from_dict(data.get("data", data))
+        r = requests.Response()
+        r.status_code = int(getattr(http_resp, "status", 0))
+        r._content = http_resp.data or b""
+        r.url = final_url
+        r.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
+        return r
 
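
Reviewer note — the tail of `_proxy_request_with_proxy_manager` adapts a urllib3 response into a `requests.Response` so callers keep the familiar `.status_code` / `.json()` surface. The same adapter in isolation (a sketch; `_content` is a private requests attribute, which is why the SDK sets it directly):

    import requests
    import urllib3

    pm = urllib3.ProxyManager("http://127.0.0.1:8080")  # placeholder proxy
    raw = pm.request("GET", "http://httpbin.org/ip", retries=False)
    r = requests.Response()
    r.status_code = raw.status
    r._content = raw.data or b""
    r.url = "http://httpbin.org/ip"
    print(r.status_code, r.text)

Setting `retries=False` leaves retry policy to the SDK's own `with_retry` decorator instead of stacking two retry layers.
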
-    def
+    def _proxy_request_with_upstream(
         self,
-
-
-
-
-
-
-
-
-
-
-
-
-            "proxy_type": str(pt),
-            "username": username,
-            "password": password,
-            "traffic_limit": str(traffic_limit),
-            "status": "true" if status else "false",
-        }
-        response = self._api_request_with_retry(
-            "POST",
-            f"{self._proxy_users_url}/create-user",
-            data=payload,
-            headers=headers,
-        )
-        response.raise_for_status()
-        data = response.json()
-        if data.get("code") != 200:
-            raise_for_code("Create user failed", code=data.get("code"), payload=data)
-        return data.get("data", {})
+        method: str,
+        url: str,
+        *,
+        proxy_config: ProxyConfig,
+        timeout: int,
+        headers: dict[str, str] | None = None,
+        params: dict[str, Any] | None = None,
+        data: Any = None,
+        upstream_config: dict[str, Any],
+    ) -> requests.Response:
+        if not HAS_PYSOCKS:
+            raise ThordataConfigError("PySocks required for upstream proxy support.")
 
-
-        self
-
-
-
-
-
-
-        headers = build_public_api_headers(
-            self.public_token or "", self.public_key or ""
-        )
-        payload = {
-            "proxy_type": str(pt),
-            "ip": ip,
-            "status": "true" if status else "false",
-        }
-        response = self._api_request_with_retry(
-            "POST", f"{self._whitelist_url}/add-ip", data=payload, headers=headers
+        req = requests.Request(method=method.upper(), url=url, params=params)
+        prepped = self._proxy_session.prepare_request(req)
+        final_url = prepped.url or url
+
+        parsed_target = urlparse(final_url)
+        target_host = parsed_target.hostname or ""
+        target_port = parsed_target.port or (
+            443 if parsed_target.scheme == "https" else 80
         )
-        response.raise_for_status()
-        data = response.json()
-        if data.get("code") != 200:
-            raise_for_code(
-                "Add whitelist IP failed", code=data.get("code"), payload=data
-            )
-        return data.get("data", {})
 
-
-
-
-
-
-
-
-
-
+        thordata_host = proxy_config.host or "pr.thordata.net"
+        thordata_port = proxy_config.port or 9999
+        thordata_user = proxy_config.build_username()
+        thordata_pass = proxy_config.password
+
+        # 1. Connect to Upstream -> Thordata Node
+        factory = UpstreamProxySocketFactory(upstream_config)
+        raw_sock = factory.create_connection(
+            (thordata_host, thordata_port),
+            timeout=float(timeout),
         )
-        response.raise_for_status()
-        data = response.json()
-        if data.get("code") != 200:
-            raise_for_code(
-                "List proxy servers error", code=data.get("code"), payload=data
-            )
 
-
-
-            server_list = data.get("data", data.get("list", []))
-        elif isinstance(data, list):
-            server_list = data
+        try:
+            protocol = proxy_config.protocol.lower().replace("socks5", "socks5h")
 
-
+            # 2. Handshake with Thordata
+            if protocol.startswith("socks"):
+                sock = socks5_handshake(
+                    raw_sock, target_host, target_port, thordata_user, thordata_pass
+                )
+                if parsed_target.scheme == "https":
+                    ctx = ssl.create_default_context()
+                    sock = ctx.wrap_socket(sock, server_hostname=target_host)
+            else:
+                # HTTP/HTTPS Tunnel
+                if protocol == "https":
+                    ctx = ssl.create_default_context()
+                    sock = ctx.wrap_socket(raw_sock, server_hostname=thordata_host)
+                else:
+                    sock = raw_sock
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # CONNECT to Thordata
+                connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
+                connect_req += f"Host: {target_host}:{target_port}\r\n"
+                auth = base64.b64encode(
+                    f"{thordata_user}:{thordata_pass}".encode()
+                ).decode()
+                connect_req += f"Proxy-Authorization: Basic {auth}\r\n\r\n"
+                sock.sendall(connect_req.encode())
+
+                resp = b""
+                while b"\r\n\r\n" not in resp:
+                    resp += sock.recv(1024)
+                if b"200" not in resp.split(b"\r\n")[0]:
+                    raise ConnectionError("Thordata CONNECT failed")
+
+                # 3. If Target is HTTPS, wrap TLS inside the tunnel
+                if parsed_target.scheme == "https":
+                    if isinstance(sock, ssl.SSLSocket):
+                        sock = cast(
+                            socket.socket,
+                            create_tls_in_tls(sock, target_host, float(timeout)),
+                        )
+                    else:
+                        ctx = ssl.create_default_context()
+                        sock = ctx.wrap_socket(sock, server_hostname=target_host)
+
+            # 4. Send actual Request
+            return self._send_http_via_socket(
+                sock, method, parsed_target, headers, data, final_url, timeout
+            )
 
-
-
-        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
-        return self._get_locations("countries", proxy_type=pt)
+        except Exception:
+            raw_sock.close()
+            raise
 
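
Reviewer note — the upstream path manually chains two proxies: the caller's upstream proxy first, then a CONNECT (or SOCKS5 handshake) through the Thordata node toward the target. For the HTTP branch, the bytes written at step 2 look like this (illustrative values):

    CONNECT example.com:443 HTTP/1.1
    Host: example.com:443
    Proxy-Authorization: Basic <base64(user:pass)>

followed by a blank line; the code then reads until \r\n\r\n and requires a 200 in the status line before layering TLS toward the target. `UpstreamProxySocketFactory`, `socks5_handshake`, and `create_tls_in_tls` are helpers defined elsewhere in the SDK; their import site is above this section.
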
-    def
+    def _send_http_via_socket(
         self,
-
-
-
-
-
+        sock: socket.socket | Any,  # Fix for TLSInTLSSocket typing issue
+        method: str,
+        parsed: Any,
+        headers: Any,
+        data: Any,
+        final_url: str,
+        timeout: int,
+    ) -> requests.Response:
+        req_headers = dict(headers or {})
+        req_headers.setdefault("Host", parsed.hostname)
+        req_headers.setdefault("User-Agent", "python-thordata-sdk")
+        req_headers.setdefault("Connection", "close")
 
-
-
-
-        state_code: str | None = None,
-        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
-    ) -> list[dict[str, Any]]:
-        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
-        kwargs = {"proxy_type": pt, "country_code": country_code}
-        if state_code:
-            kwargs["state_code"] = state_code
-        return self._get_locations("cities", **kwargs)
+        path = parsed.path or "/"
+        if parsed.query:
+            path += f"?{parsed.query}"
 
-
-
-
-        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
-    ) -> list[dict[str, Any]]:
-        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
-        return self._get_locations("asn", proxy_type=pt, country_code=country_code)
+        msg = f"{method} {path} HTTP/1.1\r\n"
+        for k, v in req_headers.items():
+            msg += f"{k}: {v}\r\n"
 
-
-
-
-
-
+        body = b""
+        if data:
+            if isinstance(data, dict):
+                body = urlencode(data).encode()
+                msg += "Content-Type: application/x-www-form-urlencoded\r\n"
+            elif isinstance(data, bytes):
+                body = data
+            else:
+                body = str(data).encode()
+            msg += f"Content-Length: {len(body)}\r\n"
 
-
-
-
-
-        data = response.json()
-        if isinstance(data, dict):
-            if data.get("code") != 200:
-                raise RuntimeError(f"Locations error: {data.get('msg')}")
-            return data.get("data") or []
-        return data if isinstance(data, list) else []
+        msg += "\r\n"
+        sock.sendall(msg.encode())
+        if body:
+            sock.sendall(body)
 
-
-
-
-
-
+        # Read Response
+        resp_data = b""
+        while True:
+            try:
+                chunk = sock.recv(4096)
+                if not chunk:
+                    break
+                resp_data += chunk
+            except socket.timeout:
+                break
+
+        if b"\r\n\r\n" in resp_data:
+            head, content = resp_data.split(b"\r\n\r\n", 1)
+            status_line = head.split(b"\r\n")[0].decode()
+            try:
+                status_code = int(status_line.split(" ")[1])
+            except (ValueError, IndexError):
+                status_code = 0
+
+            r = requests.Response()
+            r.status_code = status_code
+            r._content = content
+            r.url = final_url
+            return r
+        raise ConnectionError("Empty response from socket")
 
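
Reviewer note — `_send_http_via_socket` is a deliberately small HTTP/1.1 client: `Connection: close` plus read-until-EOF sidesteps keep-alive handling, and the response is framed by the first blank line:

    # Framing sketch: everything before the first CRLF CRLF is the head,
    # everything after is the body. Chunked transfer-encoding is NOT decoded.
    resp_data = b"HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\nhello"
    head, content = resp_data.split(b"\r\n\r\n", 1)
    status_code = int(head.split(b"\r\n")[0].decode().split(" ")[1])
    assert (status_code, content) == (200, b"hello")

One caveat worth flagging: a server that still responds with `Transfer-Encoding: chunked` would leave chunk framing bytes in `r._content`.
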
     def _get_proxy_endpoint_overrides(
         self, product: ProxyProduct
@@ -1681,7 +1569,7 @@ class ThordataClient:
         protocol = (
             os.getenv(f"THORDATA_{prefix}_PROXY_PROTOCOL")
            or os.getenv("THORDATA_PROXY_PROTOCOL")
-            or "
+            or "http"
         )
         port = int(port_raw) if port_raw and port_raw.isdigit() else None
         return host or None, port, protocol
@@ -1706,16 +1594,3 @@ class ThordataClient:
                 protocol=proto,
             )
         return None
-
-    def close(self) -> None:
-        self._proxy_session.close()
-        self._api_session.close()
-        for pm in self._proxy_managers.values():
-            pm.clear()
-        self._proxy_managers.clear()
-
-    def __enter__(self) -> ThordataClient:
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
-        self.close()
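
Reviewer note — `close()`, `__enter__`, and `__exit__` disappear from this class in 1.5.0, so 1.3.0-style lifecycle code is affected (sketch; whether equivalent cleanup now lives elsewhere is not visible in this hunk):

    # Valid against 1.3.0, needs re-checking against 1.5.0:
    client = ThordataClient(public_token="...", public_key="...")
    try:
        ips = client.list_whitelist_ips()
    finally:
        client.close()  # AttributeError on 1.5.0 if no replacement exists

Callers using `with ThordataClient(...) as client:` should verify the context-manager protocol is still provided before upgrading.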