thordata-sdk 0.2.4__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/client.py ADDED
@@ -0,0 +1,1644 @@
1
+ """
2
+ Synchronous client for the Thordata API.
3
+
4
+ This module provides the main ThordataClient class for interacting with
5
+ Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
6
+
7
+ Example:
8
+ >>> from thordata import ThordataClient
9
+ >>>
10
+ >>> client = ThordataClient(
11
+ ... scraper_token="your_token",
12
+ ... public_token="your_public_token",
13
+ ... public_key="your_public_key"
14
+ ... )
15
+ >>>
16
+ >>> # Use the proxy network
17
+ >>> response = client.get("https://httpbin.org/ip")
18
+ >>> print(response.json())
19
+ >>>
20
+ >>> # Search with SERP API
21
+ >>> results = client.serp_search("python tutorial", engine="google")
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import base64
27
+ import contextlib
28
+ import hashlib
29
+ import logging
30
+ import os
31
+ import socket
32
+ import ssl
33
+ from datetime import date
34
+ from typing import Any, cast
35
+ from urllib.parse import urlencode, urlparse
36
+
37
+ import requests
38
+ import urllib3
39
+ from requests.structures import CaseInsensitiveDict
40
+
41
+ from .serp_engines import SerpNamespace
42
+
43
+ try:
44
+ import socks
45
+
46
+ HAS_PYSOCKS = True
47
+ except ImportError:
48
+ HAS_PYSOCKS = False
49
+
50
+ from . import __version__ as _sdk_version
51
+ from ._utils import (
52
+ build_auth_headers,
53
+ build_builder_headers,
54
+ build_public_api_headers,
55
+ build_user_agent,
56
+ decode_base64_image,
57
+ extract_error_message,
58
+ parse_json_response,
59
+ )
60
+ from .enums import Engine, ProxyType
61
+ from .exceptions import (
62
+ ThordataConfigError,
63
+ ThordataNetworkError,
64
+ ThordataTimeoutError,
65
+ raise_for_code,
66
+ )
67
+ from .models import (
68
+ CommonSettings,
69
+ ProxyConfig,
70
+ ProxyProduct,
71
+ ProxyServer,
72
+ ProxyUserList,
73
+ ScraperTaskConfig,
74
+ SerpRequest,
75
+ UniversalScrapeRequest,
76
+ UsageStatistics,
77
+ VideoTaskConfig,
78
+ )
79
+ from .retry import RetryConfig, with_retry
80
+
81
+ logger = logging.getLogger(__name__)
82
+
83
+
84
+ # =========================================================================
85
+ # Upstream Proxy Support (for users behind firewall)
86
+ # =========================================================================
87
+
88
+
89
def _parse_upstream_proxy() -> dict[str, Any] | None:
    """
    Read the upstream proxy settings from ``THORDATA_UPSTREAM_PROXY``.

    Accepted URL forms:
        - http://127.0.0.1:7897
        - socks5://127.0.0.1:7897
        - socks5://user:pass@127.0.0.1:7897

    Returns:
        A dict with ``scheme``, ``host``, ``port``, ``username`` and
        ``password`` keys, or None when the variable is unset/blank or
        names a scheme outside http, https, socks5, socks5h, socks4.
    """
    raw_value = os.environ.get("THORDATA_UPSTREAM_PROXY", "").strip()
    if raw_value == "":
        return None

    url_parts = urlparse(raw_value)
    scheme = (url_parts.scheme or "").lower()

    supported_schemes = {"http", "https", "socks5", "socks5h", "socks4"}
    if scheme not in supported_schemes:
        logger.warning(f"Unsupported upstream proxy scheme: {scheme}")
        return None

    # When the URL carries no port, SOCKS schemes fall back to 1080 and
    # every other scheme to 7897.
    fallback_port = 1080 if scheme.startswith("socks") else 7897

    config: dict[str, Any] = {}
    config["scheme"] = scheme
    config["host"] = url_parts.hostname or "127.0.0.1"
    config["port"] = url_parts.port or fallback_port
    config["username"] = url_parts.username
    config["password"] = url_parts.password
    return config
119
+
120
+
121
class _UpstreamProxySocketFactory:
    """
    Socket factory that creates connections through an upstream proxy.

    Used for proxy chaining when accessing Thordata from behind a firewall.
    ``upstream_config`` is a dict with ``scheme``, ``host``, ``port``,
    ``username`` and ``password`` keys (the shape returned by
    ``_parse_upstream_proxy``).
    """

    def __init__(self, upstream_config: dict[str, Any]):
        self.config = upstream_config

    def create_connection(
        self,
        address: tuple[str, int],
        timeout: float | None = None,
        source_address: tuple[str, int] | None = None,
    ) -> socket.socket:
        """
        Create a socket connection to ``address`` through the upstream proxy.

        Args:
            address: ``(host, port)`` of the destination behind the proxy.
            timeout: Socket timeout in seconds; None leaves it unset.
            source_address: Accepted for signature compatibility; unused.

        Returns:
            A connected socket tunnelled through the upstream proxy.
        """
        if self.config["scheme"].startswith("socks"):
            return self._create_socks_connection(address, timeout)
        return self._create_http_tunnel(address, timeout)

    def _create_socks_connection(
        self,
        address: tuple[str, int],
        timeout: float | None = None,
    ) -> socket.socket:
        """
        Create connection through SOCKS proxy.

        Raises:
            RuntimeError: If PySocks is not installed.
        """
        if not HAS_PYSOCKS:
            raise RuntimeError(
                "PySocks is required for SOCKS upstream proxy. "
                "Install with: pip install PySocks"
            )

        scheme = self.config["scheme"]
        proxy_type = socks.SOCKS5 if "socks5" in scheme else socks.SOCKS4

        sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.set_proxy(
                proxy_type,
                self.config["host"],
                self.config["port"],
                rdns=True,
                username=self.config.get("username"),
                password=self.config.get("password"),
            )

            if timeout is not None:
                sock.settimeout(timeout)

            sock.connect(address)
        except BaseException:
            # Do not leak the descriptor when the proxy handshake fails.
            sock.close()
            raise
        return sock

    def _create_http_tunnel(
        self,
        address: tuple[str, int],
        timeout: float | None = None,
    ) -> socket.socket:
        """
        Create connection through HTTP CONNECT tunnel.

        Raises:
            ConnectionError: If the proxy closes the connection before
                finishing its response or answers with a non-2xx status.
        """
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            if timeout is not None:
                sock.settimeout(timeout)

            # Connect to upstream proxy
            sock.connect((self.config["host"], self.config["port"]))

            # Build CONNECT request
            target_host, target_port = address
            connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
            connect_req += f"Host: {target_host}:{target_port}\r\n"

            # Add proxy auth if provided
            username = self.config.get("username")
            if username:
                # The config always carries a "password" key, possibly None;
                # an absent password must become "", not the text "None".
                password = self.config.get("password") or ""
                credentials = f"{username}:{password}"
                encoded = base64.b64encode(credentials.encode()).decode()
                connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"

            connect_req += "\r\n"
            sock.sendall(connect_req.encode())

            # Read response headers
            response = b""
            while b"\r\n\r\n" not in response:
                chunk = sock.recv(1024)
                if not chunk:
                    raise ConnectionError("Upstream proxy closed connection")
                response += chunk

            # Check the status code itself, not any "200" substring.
            status_line = response.split(b"\r\n", 1)[0].decode(
                "utf-8", errors="replace"
            )
            fields = status_line.split(None, 2)
            code = fields[1] if len(fields) > 1 else ""
            if not (len(code) == 3 and code.isdigit() and code[0] == "2"):
                raise ConnectionError(
                    f"Upstream proxy CONNECT failed: {status_line}"
                )
        except BaseException:
            sock.close()
            raise

        return sock
218
+
219
+
220
class _TLSInTLSSocket:
    """
    Minimal socket-like adapter for a TLS session carried inside another
    TLS connection.

    The inner session is an ``ssl.SSLObject`` bound to two ``MemoryBIO``
    buffers; this wrapper moves ciphertext between those buffers and the
    outer (already encrypted) socket.
    """

    def __init__(
        self,
        outer_sock: ssl.SSLSocket,
        ssl_obj: ssl.SSLObject,
        incoming: ssl.MemoryBIO,
        outgoing: ssl.MemoryBIO,
    ):
        self._outer = outer_sock
        self._ssl = ssl_obj
        self._incoming = incoming
        self._outgoing = outgoing
        # Last timeout requested via settimeout(); re-applied before reads.
        self._timeout: float | None = None

    def settimeout(self, timeout: float | None) -> None:
        """Remember ``timeout`` and apply it to the outer socket."""
        self._timeout = timeout
        self._outer.settimeout(timeout)

    def sendall(self, data: bytes) -> None:
        """Encrypt ``data`` with the inner session and forward the ciphertext."""
        self._ssl.write(data)
        ciphertext = self._outgoing.read()
        if not ciphertext:
            return
        self._outer.sendall(ciphertext)

    def recv(self, bufsize: int) -> bytes:
        """
        Return up to ``bufsize`` decrypted bytes from the inner session.

        Returns ``b""`` when the outer socket reports end of stream or
        times out while more ciphertext is needed.
        """
        plaintext = None
        while plaintext is None:
            try:
                plaintext = self._ssl.read(bufsize)
            except ssl.SSLWantReadError:
                # The inner session needs more ciphertext from the wire.
                self._outer.settimeout(self._timeout)
                try:
                    ciphertext = self._outer.recv(8192)
                except socket.timeout:
                    plaintext = b""
                else:
                    if ciphertext:
                        self._incoming.write(ciphertext)
                    else:
                        plaintext = b""
        return plaintext

    def close(self) -> None:
        """Close the outer socket, ignoring any error it raises."""
        with contextlib.suppress(Exception):
            self._outer.close()
269
+
270
+
271
+ # =========================================================================
272
+ # Main Client Class
273
+ # =========================================================================
274
+
275
+
276
+ class ThordataClient:
277
+ # API Endpoints
278
+ BASE_URL = "https://scraperapi.thordata.com"
279
+ UNIVERSAL_URL = "https://universalapi.thordata.com"
280
+ API_URL = "https://openapi.thordata.com/api/web-scraper-api"
281
+ LOCATIONS_URL = "https://openapi.thordata.com/api/locations"
282
+
283
def __init__(
    self,
    scraper_token: str | None = None,  # Change: Optional
    public_token: str | None = None,
    public_key: str | None = None,
    proxy_host: str = "pr.thordata.net",
    proxy_port: int = 9999,
    timeout: int = 30,
    api_timeout: int = 60,
    retry_config: RetryConfig | None = None,
    auth_mode: str = "bearer",
    scraperapi_base_url: str | None = None,
    universalapi_base_url: str | None = None,
    web_scraper_api_base_url: str | None = None,
    locations_base_url: str | None = None,
) -> None:
    """
    Initialize the Thordata Client.

    Stores credentials and proxy/timeout settings, creates the proxy and
    API ``requests`` sessions, and derives every endpoint URL. For the four
    ``*_base_url`` arguments the first truthy value of argument,
    ``THORDATA_*`` environment variable, class default is used.

    Raises:
        ThordataConfigError: If ``auth_mode`` is neither 'bearer' nor
            'header_token' (case-insensitive).
    """

    def _pick_base(explicit: str | None, env_name: str, fallback: str) -> str:
        # Argument wins over environment, which wins over the class default;
        # a trailing slash is dropped so paths can be appended directly.
        return (explicit or os.getenv(env_name) or fallback).rstrip("/")

    self.serp = SerpNamespace(self)

    self.scraper_token = scraper_token
    self.public_token = public_token
    self.public_key = public_key

    self._proxy_host = proxy_host
    self._proxy_port = proxy_port
    self._default_timeout = timeout
    self._api_timeout = api_timeout
    self._retry_config = retry_config or RetryConfig()

    self._auth_mode = auth_mode.lower()
    if self._auth_mode not in ("bearer", "header_token"):
        raise ThordataConfigError(
            f"Invalid auth_mode: {auth_mode}. Must be 'bearer' or 'header_token'."
        )

    # Session for proxied traffic: ignores environment proxy settings.
    proxy_session = self._proxy_session = requests.Session()
    proxy_session.trust_env = False
    self._proxy_managers: dict[str, urllib3.PoolManager] = {}

    # Session for Thordata API calls: honours environment proxy settings.
    api_session = self._api_session = requests.Session()
    api_session.trust_env = True
    api_session.headers.update(
        {"User-Agent": build_user_agent(_sdk_version, "requests")}
    )

    # Base URLs
    scraperapi_base = _pick_base(
        scraperapi_base_url, "THORDATA_SCRAPERAPI_BASE_URL", self.BASE_URL
    )
    universalapi_base = _pick_base(
        universalapi_base_url, "THORDATA_UNIVERSALAPI_BASE_URL", self.UNIVERSAL_URL
    )
    web_scraper_api_base = _pick_base(
        web_scraper_api_base_url, "THORDATA_WEB_SCRAPER_API_BASE_URL", self.API_URL
    )
    locations_base = _pick_base(
        locations_base_url, "THORDATA_LOCATIONS_BASE_URL", self.LOCATIONS_URL
    )

    # Environment-only overrides (no constructor argument, no slash trim).
    self._gateway_base_url = os.getenv(
        "THORDATA_GATEWAY_BASE_URL", "https://api.thordata.com/api/gateway"
    )
    self._child_base_url = os.getenv(
        "THORDATA_CHILD_BASE_URL", "https://api.thordata.com/api/child"
    )

    self._serp_url = f"{scraperapi_base}/request"
    self._builder_url = f"{scraperapi_base}/builder"
    self._video_builder_url = f"{scraperapi_base}/video_builder"
    self._universal_url = f"{universalapi_base}/request"

    self._status_url = f"{web_scraper_api_base}/tasks-status"
    self._download_url = f"{web_scraper_api_base}/tasks-download"
    self._list_url = f"{web_scraper_api_base}/tasks-list"

    self._locations_base_url = locations_base

    # Account endpoints hang off the locations base with "/locations" removed.
    account_base = locations_base.replace("/locations", "")
    self._usage_stats_url = f"{account_base}/account/usage-statistics"
    self._proxy_users_url = f"{account_base}/proxy-users"

    whitelist_base = os.getenv(
        "THORDATA_WHITELIST_BASE_URL", "https://api.thordata.com/api"
    )
    self._whitelist_url = f"{whitelist_base}/whitelisted-ips"

    proxy_api_base = os.getenv(
        "THORDATA_PROXY_API_BASE_URL", "https://openapi.thordata.com/api"
    )
    self._proxy_list_url = f"{proxy_api_base}/proxy/proxy-list"
    self._proxy_expiration_url = f"{proxy_api_base}/proxy/expiration-time"
390
+
391
+ # =========================================================================
392
+ # Proxy Network Methods
393
+ # =========================================================================
394
+
395
def get(
    self,
    url: str,
    *,
    proxy_config: ProxyConfig | None = None,
    timeout: int | None = None,
    **kwargs: Any,
) -> requests.Response:
    """
    Send a GET request to ``url`` through the proxy network.

    Args:
        url: Target URL.
        proxy_config: Proxy settings; forwarded unchanged to ``_proxy_verb``.
        timeout: Timeout override; forwarded unchanged to ``_proxy_verb``.
        **kwargs: Extra request options forwarded to ``_proxy_verb``.

    Returns:
        The response produced by ``_proxy_verb``.
    """
    logger.debug(f"Proxy GET request: {url}")
    response = self._proxy_verb("GET", url, proxy_config, timeout, **kwargs)
    return response
405
+
406
def post(
    self,
    url: str,
    *,
    proxy_config: ProxyConfig | None = None,
    timeout: int | None = None,
    **kwargs: Any,
) -> requests.Response:
    """
    Send a POST request to ``url`` through the proxy network.

    Args:
        url: Target URL.
        proxy_config: Proxy settings; forwarded unchanged to ``_proxy_verb``.
        timeout: Timeout override; forwarded unchanged to ``_proxy_verb``.
        **kwargs: Extra request options forwarded to ``_proxy_verb``.

    Returns:
        The response produced by ``_proxy_verb``.
    """
    logger.debug(f"Proxy POST request: {url}")
    response = self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)
    return response
416
+
417
def _proxy_verb(
    self,
    method: str,
    url: str,
    proxy_config: ProxyConfig | None,
    timeout: int | None,
    **kwargs: Any,
) -> requests.Response:
    """
    Run one proxied HTTP request with the client's retry policy.

    Args:
        method: HTTP verb.
        url: Target URL.
        proxy_config: Proxy settings; when None the environment defaults
            from ``_get_default_proxy_config_from_env`` are used.
        timeout: Timeout in seconds; a falsy value selects the client's
            default timeout.
        **kwargs: Only ``headers``, ``params`` and ``data`` are used; a
            caller-supplied ``proxies`` entry is discarded.

    Raises:
        ThordataConfigError: No proxy configuration could be resolved.
        ThordataTimeoutError: The request raised ``requests.Timeout``.
        ThordataNetworkError: Any other exception raised by the request.
    """
    timeout = timeout or self._default_timeout

    if proxy_config is None:
        proxy_config = self._get_default_proxy_config_from_env()

    if proxy_config is None:
        raise ThordataConfigError(
            "Proxy credentials are missing. "
            "Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
        )

    kwargs.pop("proxies", None)

    # Extract the request options once, outside the retried closure.
    # Popping them inside `_do` emptied kwargs on the first attempt, so
    # every retry went out without headers, query parameters or body.
    headers = kwargs.pop("headers", None)
    params = kwargs.pop("params", None)
    data = kwargs.pop("data", None)

    @with_retry(self._retry_config)
    def _do() -> requests.Response:
        return self._proxy_request_with_proxy_manager(
            method,
            url,
            proxy_config=proxy_config,  # type: ignore
            timeout=timeout,  # type: ignore
            headers=headers,
            params=params,
            data=data,
        )

    try:
        return _do()
    except requests.Timeout as e:
        raise ThordataTimeoutError(
            f"Request timed out: {e}", original_error=e
        ) from e
    except Exception as e:
        raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
458
+
459
def build_proxy_url(
    self,
    username: str,
    password: str,
    *,
    country: str | None = None,
    state: str | None = None,
    city: str | None = None,
    session_id: str | None = None,
    session_duration: int | None = None,
    product: ProxyProduct | str = ProxyProduct.RESIDENTIAL,
) -> str:
    """
    Build a proxy URL for the given credentials and targeting options.

    The client's configured proxy host and port are combined with the
    arguments into a ``ProxyConfig``, whose ``build_proxy_url()`` result is
    returned.
    """
    settings = {
        "username": username,
        "password": password,
        "host": self._proxy_host,
        "port": self._proxy_port,
        "product": product,
        "country": country,
        "state": state,
        "city": city,
        "session_id": session_id,
        "session_duration": session_duration,
    }
    return ProxyConfig(**settings).build_proxy_url()
484
+
485
+ # =========================================================================
486
+ # Internal Request Helpers
487
+ # =========================================================================
488
+
489
def _api_request_with_retry(
    self,
    method: str,
    url: str,
    *,
    data: dict[str, Any] | None = None,
    headers: dict[str, str] | None = None,
    params: dict[str, Any] | None = None,
) -> requests.Response:
    """
    Call a Thordata API endpoint through the API session with retries.

    Raises:
        ThordataTimeoutError: The request raised ``requests.Timeout``.
        ThordataNetworkError: Any other ``requests.RequestException``.
    """

    @with_retry(self._retry_config)
    def _do_request() -> requests.Response:
        options = {
            "data": data,
            "headers": headers,
            "params": params,
            "timeout": self._api_timeout,
        }
        return self._api_session.request(method, url, **options)

    try:
        response = _do_request()
    except requests.Timeout as e:
        raise ThordataTimeoutError(
            f"API request timed out: {e}", original_error=e
        ) from e
    except requests.RequestException as e:
        raise ThordataNetworkError(
            f"API request failed: {e}", original_error=e
        ) from e
    return response
519
+
520
def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
    """
    Return the cache key used for pooled proxy managers.

    Without credentials the key is the endpoint itself; with credentials
    the first 12 hex digits of their SHA-256 digest are appended so the
    raw credentials never appear in the key.
    """
    if userpass:
        digest = hashlib.sha256(userpass.encode("utf-8")).hexdigest()
        return f"{proxy_endpoint}|auth={digest[:12]}"
    return proxy_endpoint
526
+
527
def _get_proxy_manager(
    self,
    proxy_url: str,
    *,
    cache_key: str,
    proxy_headers: dict[str, str] | None = None,
) -> urllib3.PoolManager:
    """
    Return the pooled manager stored under ``cache_key``, creating it if needed.

    SOCKS proxy URLs get a ``SOCKSProxyManager``; every other URL gets a
    ``urllib3.ProxyManager`` (with a default TLS context when the proxy
    itself is reached over https). New managers are cached before return.

    Raises:
        ThordataConfigError: A SOCKS proxy was requested but urllib3's
            SOCKS support cannot be imported.
    """
    pool = self._proxy_managers.get(cache_key)
    if pool is not None:
        return pool

    socks_prefixes = ("socks5://", "socks5h://", "socks4://", "socks4a://")
    if proxy_url.startswith(socks_prefixes):
        try:
            from urllib3.contrib.socks import SOCKSProxyManager
        except Exception as e:
            raise ThordataConfigError(
                "SOCKS proxy requested but SOCKS dependencies are missing. "
                "Install: pip install 'urllib3[socks]' or pip install PySocks"
            ) from e

        pool = cast(
            urllib3.PoolManager,
            SOCKSProxyManager(proxy_url, num_pools=10, maxsize=10),
        )
    else:
        # HTTP/HTTPS proxies
        if proxy_url.startswith("https://"):
            tls_context = ssl.create_default_context()
        else:
            tls_context = None

        pool = cast(
            urllib3.PoolManager,
            urllib3.ProxyManager(
                proxy_url,
                proxy_headers=proxy_headers,
                proxy_ssl_context=tls_context,
                num_pools=10,
                maxsize=10,
            ),
        )

    self._proxy_managers[cache_key] = pool
    return pool
573
+
574
def _proxy_request_with_proxy_manager(
    self,
    method: str,
    url: str,
    *,
    proxy_config: ProxyConfig,
    timeout: int,
    headers: dict[str, str] | None = None,
    params: dict[str, Any] | None = None,
    data: Any = None,
) -> requests.Response:
    """
    Send one request through the Thordata proxy and wrap the result.

    When ``THORDATA_UPSTREAM_PROXY`` yields a configuration the request is
    delegated to ``_proxy_request_with_upstream``; otherwise it goes
    through a pooled urllib3 proxy manager and the urllib3 response is
    copied into a ``requests.Response`` (status, body, URL, headers).
    """
    upstream_config = _parse_upstream_proxy()
    if upstream_config:
        return self._proxy_request_with_upstream(
            method,
            url,
            proxy_config=proxy_config,
            timeout=timeout,
            headers=headers,
            params=params,
            data=data,
            upstream_config=upstream_config,
        )

    verb = method.upper()

    # Let requests merge `params` into the URL for us.
    prepared = self._proxy_session.prepare_request(
        requests.Request(method=verb, url=url, params=params)
    )
    final_url = prepared.url or url

    proxy_endpoint = proxy_config.build_proxy_endpoint()
    socks_prefixes = ("socks5://", "socks5h://", "socks4://", "socks4a://")

    if proxy_endpoint.startswith(socks_prefixes):
        # SOCKS managers take the full proxy URL and no proxy headers.
        manager_url = proxy_config.build_proxy_url()
        userpass = proxy_config.build_proxy_basic_auth()
        manager_headers = None
    else:
        # HTTP(S) proxies authenticate via a Proxy-Authorization header.
        manager_url = proxy_endpoint
        userpass = proxy_config.build_proxy_basic_auth()
        manager_headers = dict(urllib3.make_headers(proxy_basic_auth=userpass))

    pm = self._get_proxy_manager(
        manager_url,
        cache_key=self._proxy_manager_key(proxy_endpoint, userpass),
        proxy_headers=manager_headers,
    )

    req_headers = dict(headers or {})
    if data is None:
        body = None
    elif isinstance(data, dict):
        # Dict payloads are sent form-encoded with stringified values.
        body = urlencode({k: str(v) for k, v in data.items()})
        req_headers.setdefault(
            "Content-Type", "application/x-www-form-urlencoded"
        )
    else:
        body = data

    http_resp = pm.request(
        verb,
        final_url,
        body=body,
        headers=req_headers or None,
        timeout=urllib3.Timeout(connect=timeout, read=timeout),
        retries=False,
        preload_content=True,
    )

    result = requests.Response()
    result.status_code = int(getattr(http_resp, "status", 0) or 0)
    result._content = http_resp.data or b""
    result.url = final_url
    result.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
    return result
660
+
661
+ # =========================================================================
662
+ # Upstream Proxy Support (Proxy Chaining)
663
+ # =========================================================================
664
+
665
def _proxy_request_with_upstream(
    self,
    method: str,
    url: str,
    *,
    proxy_config: ProxyConfig,
    timeout: int,
    headers: dict[str, str] | None = None,
    params: dict[str, Any] | None = None,
    data: Any = None,
    upstream_config: dict[str, Any],
) -> requests.Response:
    """
    Execute request through proxy chain: Upstream -> Thordata -> Target.

    A connection to the Thordata proxy is opened through the upstream
    proxy, a tunnel to the target is negotiated according to
    ``proxy_config.protocol`` (SOCKS5 handshake, or HTTP CONNECT with or
    without TLS to the proxy), TLS to the target is added for https URLs,
    and the HTTP exchange is done by ``_send_http_request``.

    Raises:
        ThordataConfigError: The upstream proxy is SOCKS but PySocks is
            not installed.
    """
    # PySocks is only needed to reach a SOCKS upstream; an HTTP upstream
    # and the Thordata SOCKS5 handshake use plain sockets.
    upstream_scheme = str(upstream_config.get("scheme", ""))
    if upstream_scheme.startswith("socks") and not HAS_PYSOCKS:
        raise ThordataConfigError(
            "PySocks is required for upstream proxy support. "
            "Install with: pip install PySocks"
        )

    req = requests.Request(method=method.upper(), url=url, params=params)
    prepped = self._proxy_session.prepare_request(req)
    final_url = prepped.url or url

    parsed_target = urlparse(final_url)
    target_is_https = parsed_target.scheme == "https"
    target_host = parsed_target.hostname or ""
    target_port = parsed_target.port or (443 if target_is_https else 80)

    protocol = proxy_config.protocol.lower()
    if protocol == "socks5":
        protocol = "socks5h"

    thordata_host = proxy_config.host or ""
    thordata_port = proxy_config.port or 9999
    thordata_username = proxy_config.build_username()
    thordata_password = proxy_config.password

    socket_factory = _UpstreamProxySocketFactory(upstream_config)

    logger.debug(
        f"Proxy chain: upstream({upstream_config['host']}:{upstream_config['port']}) "
        f"-> thordata({protocol}://{thordata_host}:{thordata_port}) "
        f"-> target({target_host}:{target_port})"
    )

    # Every socket layer created for this request, innermost first. A TLS
    # wrapper takes over the connection from the socket it wraps, so
    # closing only the raw socket would leave the wrapper open.
    opened: list[Any] = []

    try:
        raw_sock = socket_factory.create_connection(
            (thordata_host, thordata_port),
            timeout=float(timeout),
        )
        opened.append(raw_sock)

        if protocol.startswith("socks"):
            sock = self._socks5_handshake(
                raw_sock,
                target_host,
                target_port,
                thordata_username,
                thordata_password,
            )
            if target_is_https:
                context = ssl.create_default_context()
                sock = context.wrap_socket(sock, server_hostname=target_host)
                opened.append(sock)

        elif protocol == "https":
            proxy_context = ssl.create_default_context()
            proxy_ssl_sock = proxy_context.wrap_socket(
                raw_sock, server_hostname=thordata_host
            )
            opened.append(proxy_ssl_sock)

            self._send_connect_request(
                proxy_ssl_sock,
                target_host,
                target_port,
                thordata_username,
                thordata_password,
            )

            if target_is_https:
                sock = self._create_tls_in_tls_socket(
                    proxy_ssl_sock, target_host, timeout
                )  # type: ignore[assignment]
                opened.append(sock)
            else:
                sock = proxy_ssl_sock

        else:  # HTTP proxy
            self._send_connect_request(
                raw_sock,
                target_host,
                target_port,
                thordata_username,
                thordata_password,
            )

            if target_is_https:
                context = ssl.create_default_context()
                sock = context.wrap_socket(raw_sock, server_hostname=target_host)
                opened.append(sock)
            else:
                sock = raw_sock

        return self._send_http_request(
            sock, method, parsed_target, headers, data, final_url, timeout
        )

    finally:
        # Close outermost layer first; errors while closing are ignored.
        for layer in reversed(opened):
            with contextlib.suppress(Exception):
                layer.close()
773
+
774
def _send_connect_request(
    self,
    sock: socket.socket,
    target_host: str,
    target_port: int,
    proxy_username: str,
    proxy_password: str,
) -> None:
    """
    Send HTTP CONNECT request to proxy and verify response.

    Args:
        sock: Connected socket (or socket-like object) to the proxy.
        target_host: Host the tunnel should reach.
        target_port: Port the tunnel should reach.
        proxy_username: Username for Basic proxy authentication.
        proxy_password: Password for Basic proxy authentication.

    Raises:
        ConnectionError: The proxy closed the connection, sent oversized
            response headers, or answered with a non-2xx status.
    """
    credentials = f"{proxy_username}:{proxy_password}"
    encoded = base64.b64encode(credentials.encode()).decode()
    authority = f"{target_host}:{target_port}"

    connect_req = (
        f"CONNECT {authority} HTTP/1.1\r\n"
        f"Host: {authority}\r\n"
        f"Proxy-Authorization: Basic {encoded}\r\n"
        "\r\n"
    )
    sock.sendall(connect_req.encode())

    # Guard against a proxy that never terminates its header block.
    max_header_bytes = 65536
    response = bytearray()
    while b"\r\n\r\n" not in response:
        chunk = sock.recv(4096)
        if not chunk:
            raise ConnectionError("Proxy closed connection during CONNECT")
        response += chunk
        if len(response) > max_header_bytes and b"\r\n\r\n" not in response:
            raise ConnectionError("Proxy CONNECT failed: response headers too large")

    # Tolerate non-UTF-8 bytes in the reason phrase instead of crashing.
    status_line = bytes(response).split(b"\r\n", 1)[0].decode(
        "utf-8", errors="replace"
    )

    # Judge success by the status-code field only; a "200" elsewhere in the
    # line (e.g. "503 retry in 200 ms") must not count as success.
    fields = status_line.split(None, 2)
    code = fields[1] if len(fields) > 1 else ""
    if not (len(code) == 3 and code.isdigit() and code[0] == "2"):
        raise ConnectionError(f"Proxy CONNECT failed: {status_line}")
803
+
804
def _create_tls_in_tls_socket(
    self,
    outer_ssl_sock: ssl.SSLSocket,
    hostname: str,
    timeout: int,
) -> _TLSInTLSSocket:
    """
    Create a TLS connection over an existing TLS connection.

    The inner session is an in-memory ``SSLObject``; this method pumps its
    handshake bytes through ``outer_ssl_sock`` until the handshake
    completes.

    Args:
        outer_ssl_sock: Established TLS socket that carries the inner session.
        hostname: Server name passed to ``wrap_bio`` for the inner session.
        timeout: Seconds applied to the outer socket before each read.

    Returns:
        A ``_TLSInTLSSocket`` wrapping the outer socket and inner session.

    Raises:
        ConnectionError: The outer connection closed or timed out while the
            handshake was in progress.
    """
    context = ssl.create_default_context()

    # Ciphertext received from the peer goes into `incoming`; ciphertext
    # produced by the inner session is collected from `outgoing`.
    incoming = ssl.MemoryBIO()
    outgoing = ssl.MemoryBIO()

    ssl_obj = context.wrap_bio(incoming, outgoing, server_hostname=hostname)

    while True:
        try:
            ssl_obj.do_handshake()
            break
        except ssl.SSLWantReadError:
            # Flush whatever the handshake produced, then wait for the reply.
            data_to_send = outgoing.read()
            if data_to_send:
                outer_ssl_sock.sendall(data_to_send)

            outer_ssl_sock.settimeout(float(timeout))
            try:
                received = outer_ssl_sock.recv(8192)
                if not received:
                    raise ConnectionError("Connection closed during TLS handshake")
                incoming.write(received)
            except socket.timeout as e:
                raise ConnectionError("Timeout during TLS handshake") from e
        except ssl.SSLWantWriteError:
            data_to_send = outgoing.read()
            if data_to_send:
                outer_ssl_sock.sendall(data_to_send)

    # Send any handshake bytes still queued after completion.
    data_to_send = outgoing.read()
    if data_to_send:
        outer_ssl_sock.sendall(data_to_send)

    return _TLSInTLSSocket(outer_ssl_sock, ssl_obj, incoming, outgoing)
845
+
846
def _send_http_request(
    self,
    sock: socket.socket | ssl.SSLSocket | Any,
    method: str,
    parsed_url: Any,
    headers: dict[str, str] | None,
    data: Any,
    final_url: str,
    timeout: int,
) -> requests.Response:
    """
    Send HTTP request over established connection and parse response.

    Args:
        sock: Connected socket-like object (``sendall``/``recv``).
        method: HTTP verb.
        parsed_url: Result of ``urlparse`` for the target URL.
        headers: Caller headers; Host, User-Agent and Connection are added
            only when the caller did not supply them (any letter case).
        data: Request body: dict (form-encoded), bytes, or anything else
            (stringified). None sends no body.
        final_url: URL recorded on the returned response.
        timeout: Read timeout in seconds applied after the request is sent.

    Returns:
        Whatever ``_parse_http_response`` builds from the received bytes.
        A read timeout is not raised; the bytes received so far are parsed.
    """
    verb = method.upper()

    req_headers = dict(headers or {})
    # HTTP header names are case-insensitive, so defaults must not be
    # added next to a caller header that differs only in case.
    supplied = {name.lower() for name in req_headers}
    if "host" not in supplied:
        req_headers["Host"] = parsed_url.hostname
    if "user-agent" not in supplied:
        req_headers["User-Agent"] = build_user_agent(_sdk_version, "requests")
    if "connection" not in supplied:
        req_headers["Connection"] = "close"

    body = None
    if data is not None:
        if isinstance(data, dict):
            body = urlencode({k: str(v) for k, v in data.items()}).encode()
            if "content-type" not in supplied:
                req_headers["Content-Type"] = "application/x-www-form-urlencoded"
        elif isinstance(data, (bytes, bytearray)):
            body = bytes(data)
        else:
            body = str(data).encode()

        # The length always reflects the bytes actually sent; a caller
        # value would otherwise be emitted alongside ours.
        for name in [n for n in req_headers if n.lower() == "content-length"]:
            del req_headers[name]
        req_headers["Content-Length"] = str(len(body))

    path = parsed_url.path or "/"
    if parsed_url.query:
        path += f"?{parsed_url.query}"

    head_lines = [f"{verb} {path} HTTP/1.1"]
    head_lines.extend(f"{k}: {v}" for k, v in req_headers.items())
    sock.sendall(("\r\n".join(head_lines) + "\r\n\r\n").encode())

    if body:
        sock.sendall(body)

    if hasattr(sock, "settimeout"):
        sock.settimeout(float(timeout))

    received = bytearray()
    header_end = -1  # offset just past the blank line ending the headers
    expected_total = None  # full message size when Content-Length applies
    try:
        while True:
            chunk = sock.recv(8192)
            if not chunk:
                break
            received += chunk

            if header_end < 0:
                marker = received.find(b"\r\n\r\n")
                if marker < 0:
                    continue
                header_end = marker + 4

                if verb == "HEAD":
                    # Responses to HEAD carry no body.
                    break

                head_text = (
                    bytes(received[:header_end])
                    .decode("utf-8", errors="replace")
                    .lower()
                )
                length = None
                chunked = False
                for line in head_text.split("\r\n")[1:]:
                    name, sep, value = line.partition(":")
                    if not sep:
                        continue
                    name, value = name.strip(), value.strip()
                    if name == "transfer-encoding" and "chunked" in value:
                        chunked = True
                    elif (
                        name == "content-length"
                        and value.isascii()
                        and value.isdigit()
                    ):
                        length = int(value)

                # Chunked framing overrides Content-Length. Chunked and
                # close-delimited bodies are read until the peer closes.
                if length is not None and not chunked:
                    expected_total = header_end + length

            if expected_total is not None and len(received) >= expected_total:
                break
    except socket.timeout:
        # Best effort: parse whatever arrived before the timeout.
        pass

    return self._parse_http_response(bytes(received), final_url)
920
+
921
def _socks5_handshake(
    self,
    sock: socket.socket,
    target_host: str,
    target_port: int,
    username: str | None,
    password: str | None,
) -> socket.socket:
    """
    Perform SOCKS5 handshake over existing socket.

    Negotiates the authentication method (username/password is offered
    only when both are given), authenticates if the server selects it,
    then issues a CONNECT for ``target_host:target_port`` using the
    domain-name address type.

    Returns:
        The same ``sock``, now tunnelled to the target.

    Raises:
        ConnectionError: On any protocol violation, truncated reply,
            authentication failure or non-zero connect reply code.
    """

    def recv_exact(count: int, failure: str) -> bytes:
        # recv() may return fewer bytes than requested; keep reading.
        buf = bytearray()
        while len(buf) < count:
            piece = sock.recv(count - len(buf))
            if not piece:
                raise ConnectionError(failure)
            buf += piece
        return bytes(buf)

    def length_prefixed(text: str, label: str) -> bytes:
        # The one-byte length counts encoded bytes, not characters.
        raw = text.encode()
        if len(raw) > 255:
            raise ConnectionError(f"SOCKS5 {label} exceeds 255 bytes")
        return bytes([len(raw)]) + raw

    offer_auth = bool(username and password)
    sock.sendall(b"\x05\x02\x00\x02" if offer_auth else b"\x05\x01\x00")

    response = recv_exact(2, "SOCKS5 handshake failed: incomplete response")
    if response[0] != 0x05:
        raise ConnectionError(f"SOCKS5 version mismatch: {response[0]}")

    auth_method = response[1]

    if auth_method == 0x02:
        if not username or not password:
            raise ConnectionError(
                "SOCKS5 server requires auth but no credentials provided"
            )

        auth_req = b"\x01" + length_prefixed(username, "username")
        auth_req += length_prefixed(password, "password")
        sock.sendall(auth_req)

        auth_resp = recv_exact(2, "SOCKS5 authentication failed")
        if auth_resp[1] != 0x00:
            raise ConnectionError("SOCKS5 authentication failed")

    elif auth_method == 0xFF:
        raise ConnectionError("SOCKS5 no acceptable auth method")

    elif auth_method != 0x00:
        # The server picked a method this client cannot speak.
        raise ConnectionError(f"SOCKS5 unsupported auth method: {auth_method}")

    connect_req = b"\x05\x01\x00\x03"
    connect_req += length_prefixed(target_host, "target host")
    connect_req += target_port.to_bytes(2, "big")
    sock.sendall(connect_req)

    incomplete = "SOCKS5 connect failed: incomplete response"
    resp = recv_exact(4, incomplete)

    if resp[1] != 0x00:
        error_codes = {
            0x01: "General failure",
            0x02: "Connection not allowed",
            0x03: "Network unreachable",
            0x04: "Host unreachable",
            0x05: "Connection refused",
            0x06: "TTL expired",
            0x07: "Command not supported",
            0x08: "Address type not supported",
        }
        error_msg = error_codes.get(resp[1], f"Unknown error {resp[1]}")
        raise ConnectionError(f"SOCKS5 connect failed: {error_msg}")

    # Drain the bound address and port so the stream is positioned at
    # the first byte of tunnelled data.
    addr_type = resp[3]
    if addr_type == 0x01:
        recv_exact(4 + 2, incomplete)
    elif addr_type == 0x03:
        domain_len = recv_exact(1, incomplete)[0]
        recv_exact(domain_len + 2, incomplete)
    elif addr_type == 0x04:
        recv_exact(16 + 2, incomplete)
    else:
        raise ConnectionError(
            f"SOCKS5 connect failed: unknown address type {addr_type}"
        )

    return sock
994
+
995
def _parse_http_response(
    self,
    response_data: bytes,
    url: str,
) -> requests.Response:
    """
    Parse raw HTTP response into requests.Response.

    Args:
        response_data: Raw bytes of the response (status line, headers,
            blank line, body).
        url: URL recorded on the returned response.

    Returns:
        A ``requests.Response`` with status code, reason, headers, body
        and URL filled in. A missing or non-numeric status yields 0. A
        chunked body is decoded via ``_decode_chunked``.
    """
    # Without a blank line everything is treated as headers, empty body.
    header_data, _, body = response_data.partition(b"\r\n\r\n")

    header_lines = header_data.decode("utf-8", errors="replace").split("\r\n")

    status_line = header_lines[0] if header_lines else ""
    parts = status_line.split(" ", 2)
    try:
        status_code = int(parts[1]) if len(parts) > 1 else 0
    except ValueError:
        # Malformed status line: report "no status" rather than crash.
        status_code = 0

    # Header names are case-insensitive and the space after the colon is
    # optional, so split on ":" alone and trim surrounding whitespace.
    parsed_headers: CaseInsensitiveDict = CaseInsensitiveDict()
    for line in header_lines[1:]:
        name, sep, value = line.partition(":")
        name = name.strip()
        if sep and name:
            parsed_headers[name] = value.strip()

    # "chunked" may be one of several codings, e.g. "gzip, chunked".
    if "chunked" in parsed_headers.get("Transfer-Encoding", "").lower():
        body = self._decode_chunked(body)

    r = requests.Response()
    r.status_code = status_code
    r.reason = parts[2] if len(parts) > 2 else None
    r._content = body
    r.url = url
    r.headers = parsed_headers
    return r
1028
+
1029
+ def _decode_chunked(self, data: bytes) -> bytes:
1030
+ """Decode chunked transfer encoding."""
1031
+ result = b""
1032
+ while data:
1033
+ if b"\r\n" not in data:
1034
+ break
1035
+ size_line, data = data.split(b"\r\n", 1)
1036
+ try:
1037
+ chunk_size = int(size_line.decode().strip(), 16)
1038
+ except ValueError:
1039
+ break
1040
+
1041
+ if chunk_size == 0:
1042
+ break
1043
+
1044
+ result += data[:chunk_size]
1045
+ data = data[chunk_size:]
1046
+
1047
+ if data.startswith(b"\r\n"):
1048
+ data = data[2:]
1049
+
1050
+ return result
1051
+
1052
+ # =========================================================================
1053
+ # SERP API Methods
1054
+ # =========================================================================
1055
+
1056
def serp_search(
    self,
    query: str,
    *,
    engine: Engine | str = Engine.GOOGLE,
    num: int = 10,
    country: str | None = None,
    language: str | None = None,
    search_type: str | None = None,
    device: str | None = None,
    render_js: bool | None = None,
    no_cache: bool | None = None,
    output_format: str = "json",
    **kwargs: Any,
) -> dict[str, Any]:
    """Run a SERP query described by plain keyword arguments.

    The arguments are bundled into a ``SerpRequest`` and handed to
    ``serp_search_advanced``.

    Args:
        query: Search query text.
        engine: ``Engine`` member (its ``value`` is used) or an engine
            name, which is lower-cased.
        num, country, language, search_type, device, render_js, no_cache,
            output_format: Forwarded unchanged to ``SerpRequest``.
        **kwargs: Forwarded to ``SerpRequest`` as ``extra_params``.

    Returns:
        Whatever ``serp_search_advanced`` returns for the request.
    """
    if isinstance(engine, Engine):
        engine_name = engine.value
    else:
        engine_name = engine.lower()

    return self.serp_search_advanced(
        SerpRequest(
            query=query,
            engine=engine_name,
            num=num,
            country=country,
            language=language,
            search_type=search_type,
            device=device,
            render_js=render_js,
            no_cache=no_cache,
            output_format=output_format,
            extra_params=kwargs,
        )
    )
1088
+
1089
def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
    """Execute the SERP API call described by ``request``.

    POSTs ``request.to_payload()`` to ``self._serp_url``. For the "json"
    output format the body is parsed; a dict body whose ``code`` is set
    and not 200 is handed to ``raise_for_code``, otherwise the parsed body
    goes through ``parse_json_response``. For any other output format the
    raw text is returned under the ``"html"`` key.

    Raises:
        ThordataConfigError: If no ``scraper_token`` is configured.
        ThordataTimeoutError: On ``requests.Timeout``.
        ThordataNetworkError: On any other ``requests.RequestException``.
    """
    if not self.scraper_token:
        raise ThordataConfigError("scraper_token is required for SERP API")

    form = request.to_payload()
    auth_headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)

    logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")

    try:
        response = self._api_request_with_retry(
            "POST", self._serp_url, data=form, headers=auth_headers
        )
        response.raise_for_status()

        # Non-JSON output is passed through untouched.
        if request.output_format.lower() != "json":
            return {"html": response.text}

        body = response.json()
        if isinstance(body, dict):
            api_code = body.get("code")
            if api_code is not None and api_code != 200:
                detail = extract_error_message(body)
                raise_for_code(f"SERP Error: {detail}", code=api_code, payload=body)
        return parse_json_response(body)

    except requests.Timeout as e:
        raise ThordataTimeoutError(f"SERP timeout: {e}", original_error=e) from e
    except requests.RequestException as e:
        raise ThordataNetworkError(f"SERP failed: {e}", original_error=e) from e
1122
+
1123
+ # =========================================================================
1124
+ # Universal Scraping API
1125
+ # =========================================================================
1126
+
1127
def universal_scrape(
    self,
    url: str,
    *,
    js_render: bool = False,
    output_format: str = "html",
    country: str | None = None,
    block_resources: str | None = None,
    wait: int | None = None,
    wait_for: str | None = None,
    **kwargs: Any,
) -> str | bytes:
    """Scrape ``url`` through the Universal Scraping API.

    The arguments are bundled into a ``UniversalScrapeRequest`` (extra
    keyword arguments travel as ``extra_params``) and handed to
    ``universal_scrape_advanced``.

    Returns:
        Whatever ``universal_scrape_advanced`` returns for the request.
    """
    scrape_request = UniversalScrapeRequest(
        url=url,
        js_render=js_render,
        output_format=output_format,
        country=country,
        block_resources=block_resources,
        wait=wait,
        wait_for=wait_for,
        extra_params=kwargs,
    )
    result = self.universal_scrape_advanced(scrape_request)
    return result
1150
+
1151
def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
    """Execute the Universal Scraping API call described by ``request``.

    POSTs ``request.to_payload()`` to ``self._universal_url`` and passes
    the response to ``_process_universal_response`` together with the
    requested output format.

    Raises:
        ThordataConfigError: If no ``scraper_token`` is configured.
        ThordataTimeoutError: On ``requests.Timeout``.
        ThordataNetworkError: On any other ``requests.RequestException``.
    """
    if not self.scraper_token:
        raise ThordataConfigError("scraper_token is required for Universal API")

    form = request.to_payload()
    auth_headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)

    logger.info(f"Universal Scrape: {request.url}")

    try:
        response = self._api_request_with_retry(
            "POST", self._universal_url, data=form, headers=auth_headers
        )
        response.raise_for_status()
        return self._process_universal_response(response, request.output_format)
    except requests.Timeout as e:
        message = f"Universal timeout: {e}"
        raise ThordataTimeoutError(message, original_error=e) from e
    except requests.RequestException as e:
        message = f"Universal failed: {e}"
        raise ThordataNetworkError(message, original_error=e) from e
1178
+
1179
+ def _process_universal_response(
1180
+ self, response: requests.Response, output_format: str
1181
+ ) -> str | bytes:
1182
+ try:
1183
+ resp_json = response.json()
1184
+ except ValueError:
1185
+ return response.content if output_format.lower() == "png" else response.text
1186
+
1187
+ if isinstance(resp_json, dict):
1188
+ code = resp_json.get("code")
1189
+ if code is not None and code != 200:
1190
+ msg = extract_error_message(resp_json)
1191
+ raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
1192
+
1193
+ if "html" in resp_json:
1194
+ return resp_json["html"]
1195
+ if "png" in resp_json:
1196
+ return decode_base64_image(resp_json["png"])
1197
+
1198
+ return str(resp_json)
1199
+
1200
+ # =========================================================================
1201
+ # Web Scraper API (Tasks)
1202
+ # =========================================================================
1203
+
1204
def create_scraper_task(
    self,
    file_name: str,
    spider_id: str,
    spider_name: str,
    parameters: dict[str, Any],
    universal_params: dict[str, Any] | None = None,
) -> str:
    """Create a Web Scraper task from plain arguments.

    The arguments are bundled into a ``ScraperTaskConfig`` and handed to
    ``create_scraper_task_advanced``.

    Returns:
        The value returned by ``create_scraper_task_advanced`` (the task id).
    """
    task_config = ScraperTaskConfig(
        file_name=file_name,
        spider_id=spider_id,
        spider_name=spider_name,
        parameters=parameters,
        universal_params=universal_params,
    )
    task_id = self.create_scraper_task_advanced(task_config)
    return task_id
1220
+
1221
def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
    """Submit a scraper task described by ``config`` to the task builder.

    POSTs ``config.to_payload()`` to ``self._builder_url``. A response
    whose ``code`` is not 200 is handed to ``raise_for_code``.

    Returns:
        ``data["task_id"]`` from the JSON response.

    Raises:
        ThordataConfigError: If public credentials or ``scraper_token``
            are missing.
        ThordataNetworkError: On ``requests.RequestException``.
    """
    self._require_public_credentials()
    if not self.scraper_token:
        raise ThordataConfigError("scraper_token is required for Task Builder")

    form = config.to_payload()
    builder_headers = build_builder_headers(
        self.scraper_token, self.public_token or "", self.public_key or ""
    )

    try:
        response = self._api_request_with_retry(
            "POST",
            self._builder_url,
            data=form,
            headers=builder_headers,
        )
        response.raise_for_status()
        body = response.json()
        api_code = body.get("code")
        if api_code != 200:
            raise_for_code("Task creation failed", code=api_code, payload=body)
        return body["data"]["task_id"]
    except requests.RequestException as e:
        message = f"Task creation failed: {e}"
        raise ThordataNetworkError(message, original_error=e) from e
1245
+
1246
def create_video_task(
    self,
    file_name: str,
    spider_id: str,
    spider_name: str,
    parameters: dict[str, Any],
    common_settings: CommonSettings,
) -> str:
    """Create a video task from plain arguments.

    The arguments are bundled into a ``VideoTaskConfig`` and handed to
    ``create_video_task_advanced``.

    Returns:
        The value returned by ``create_video_task_advanced`` (the task id).
    """
    video_config = VideoTaskConfig(
        file_name=file_name,
        spider_id=spider_id,
        spider_name=spider_name,
        parameters=parameters,
        common_settings=common_settings,
    )
    task_id = self.create_video_task_advanced(video_config)
    return task_id
1262
+
1263
def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
    """Submit a video task described by ``config`` to the video builder.

    POSTs ``config.to_payload()`` to ``self._video_builder_url``. A
    response whose ``code`` is not 200 is handed to ``raise_for_code``.
    Exceptions raised by the HTTP call are not translated here.

    Returns:
        ``data["task_id"]`` from the JSON response.

    Raises:
        ThordataConfigError: If public credentials or ``scraper_token``
            are missing.
    """
    self._require_public_credentials()
    if not self.scraper_token:
        raise ThordataConfigError(
            "scraper_token is required for Video Task Builder"
        )

    form = config.to_payload()
    builder_headers = build_builder_headers(
        self.scraper_token, self.public_token or "", self.public_key or ""
    )

    response = self._api_request_with_retry(
        "POST",
        self._video_builder_url,
        data=form,
        headers=builder_headers,
    )
    response.raise_for_status()

    body = response.json()
    api_code = body.get("code")
    if api_code != 200:
        raise_for_code("Video task creation failed", code=api_code, payload=body)
    return body["data"]["task_id"]
1285
+
1286
def get_task_status(self, task_id: str) -> str:
    """Look up the status of a single task.

    POSTs ``{"tasks_ids": task_id}`` to ``self._status_url`` and scans the
    returned ``data`` list for the entry whose ``task_id`` matches
    (compared as strings).

    Returns:
        The matching entry's ``status``, or ``"unknown"`` when the entry
        has no status or no entry matches.

    Raises:
        ThordataConfigError: If public credentials are missing.
        ThordataNetworkError: On ``requests.RequestException``.
    """
    self._require_public_credentials()
    api_headers = build_public_api_headers(
        self.public_token or "", self.public_key or ""
    )
    try:
        response = self._api_request_with_retry(
            "POST",
            self._status_url,
            data={"tasks_ids": task_id},
            headers=api_headers,
        )
        response.raise_for_status()
        body = response.json()
        api_code = body.get("code")
        if api_code != 200:
            raise_for_code("Task status error", code=api_code, payload=body)

        wanted = str(task_id)
        statuses = (
            entry.get("status", "unknown")
            for entry in (body.get("data") or [])
            if str(entry.get("task_id")) == wanted
        )
        return next(statuses, "unknown")
    except requests.RequestException as e:
        message = f"Status check failed: {e}"
        raise ThordataNetworkError(message, original_error=e) from e
1312
+
1313
def safe_get_task_status(self, task_id: str) -> str:
    """Return the task status, or ``"error"`` if the lookup raises.

    Any ``Exception`` from ``get_task_status`` is swallowed on purpose so
    callers can poll without their own error handling.
    """
    try:
        status = self.get_task_status(task_id)
    except Exception:
        return "error"
    return status
1318
+
1319
def get_task_result(self, task_id: str, file_type: str = "json") -> str:
    """Fetch the download link for a task's result.

    POSTs ``{"tasks_id": task_id, "type": file_type}`` to
    ``self._download_url``.

    Returns:
        ``data["download"]`` when the response ``code`` is 200 and
        ``data`` is non-empty. Otherwise the response is handed to
        ``raise_for_code``; if that call returns, ``""`` is returned.

    Raises:
        ThordataConfigError: If public credentials are missing.
        ThordataNetworkError: On ``requests.RequestException``.
    """
    self._require_public_credentials()
    api_headers = build_public_api_headers(
        self.public_token or "", self.public_key or ""
    )
    form = {"tasks_id": task_id, "type": file_type}
    try:
        response = self._api_request_with_retry(
            "POST", self._download_url, data=form, headers=api_headers
        )
        response.raise_for_status()
        body = response.json()
        api_code = body.get("code")
        if api_code == 200 and body.get("data"):
            return body["data"]["download"]
        raise_for_code("Get result failed", code=body.get("code"), payload=body)
        return ""
    except requests.RequestException as e:
        message = f"Get result failed: {e}"
        raise ThordataNetworkError(message, original_error=e) from e
1341
+
1342
def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
    """List scraper tasks, one page at a time.

    POSTs ``page`` and ``size`` (sent as strings) to ``self._list_url``.
    A response whose ``code`` is not 200 is handed to ``raise_for_code``.

    Returns:
        The response's ``data`` value, or ``{"count": 0, "list": []}``
        when the response has no ``data`` key.

    Raises:
        ThordataConfigError: If public credentials are missing.
    """
    self._require_public_credentials()
    api_headers = build_public_api_headers(
        self.public_token or "", self.public_key or ""
    )
    paging = {"page": str(page), "size": str(size)}

    response = self._api_request_with_retry(
        "POST", self._list_url, data=paging, headers=api_headers
    )
    response.raise_for_status()

    body = response.json()
    api_code = body.get("code")
    if api_code != 200:
        raise_for_code("List tasks failed", code=api_code, payload=body)
    return body.get("data", {"count": 0, "list": []})
1358
+
1359
def wait_for_task(
    self,
    task_id: str,
    *,
    poll_interval: float = 5.0,
    max_wait: float = 600.0,
) -> str:
    """Poll ``get_task_status`` until the task reaches a terminal state.

    Args:
        task_id: Task to watch.
        poll_interval: Seconds to sleep between polls.
        max_wait: Polling stops once this many seconds have elapsed.

    Returns:
        The status string exactly as reported, once its lower-cased form
        is one of ready / success / finished / failed / error / cancelled.

    Raises:
        TimeoutError: If no terminal status is seen within ``max_wait``.
    """
    import time

    terminal_states = ("ready", "success", "finished", "failed", "error", "cancelled")
    began = time.monotonic()
    while (time.monotonic() - began) < max_wait:
        current = self.get_task_status(task_id)
        if current.lower() in terminal_states:
            return current
        time.sleep(poll_interval)
    raise TimeoutError(f"Task {task_id} timeout")
1382
+
1383
+ # =========================================================================
1384
+ # Account / Locations / Utils
1385
+ # =========================================================================
1386
+
1387
def get_usage_statistics(
    self,
    from_date: str | date,
    to_date: str | date,
) -> UsageStatistics:
    """Fetch usage statistics for a date range.

    Args:
        from_date: Range start; a ``date`` is formatted as ``YYYY-MM-DD``,
            a string is sent as given.
        to_date: Range end; handled like ``from_date``.

    Returns:
        ``UsageStatistics.from_dict`` applied to the response's ``data``
        value (or to the whole response when there is no ``data`` key).

    Raises:
        ThordataConfigError: If public credentials are missing.
    """
    self._require_public_credentials()
    start = from_date.strftime("%Y-%m-%d") if isinstance(from_date, date) else from_date
    end = to_date.strftime("%Y-%m-%d") if isinstance(to_date, date) else to_date

    query = {
        "token": self.public_token,
        "key": self.public_key,
        "from_date": start,
        "to_date": end,
    }
    response = self._api_request_with_retry(
        "GET", self._usage_stats_url, params=query
    )
    response.raise_for_status()

    body = response.json()
    api_code = body.get("code")
    if api_code != 200:
        raise_for_code("Usage stats error", code=api_code, payload=body)
    return UsageStatistics.from_dict(body.get("data", body))
1412
+
1413
def list_proxy_users(
    self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
) -> ProxyUserList:
    """List the proxy users for one proxy type.

    Sends a GET to ``{self._proxy_users_url}/user-list`` with the public
    token, key and proxy type as query parameters. A response whose
    ``code`` is not 200 is handed to ``raise_for_code``.

    Returns:
        ``ProxyUserList.from_dict`` applied to the response's ``data``
        value (or to the whole response when there is no ``data`` key).

    Raises:
        ThordataConfigError: If public credentials are missing.
    """
    self._require_public_credentials()
    if isinstance(proxy_type, ProxyType):
        type_code = int(proxy_type)
    else:
        type_code = proxy_type

    query = {
        "token": self.public_token,
        "key": self.public_key,
        "proxy_type": str(type_code),
    }
    endpoint = f"{self._proxy_users_url}/user-list"
    response = self._api_request_with_retry("GET", endpoint, params=query)
    response.raise_for_status()

    body = response.json()
    api_code = body.get("code")
    if api_code != 200:
        raise_for_code("List users error", code=api_code, payload=body)
    return ProxyUserList.from_dict(body.get("data", body))
1431
+
1432
def create_proxy_user(
    self,
    username: str,
    password: str,
    proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
    traffic_limit: int = 0,
    status: bool = True,
) -> dict[str, Any]:
    """Create a proxy sub-user.

    POSTs the user definition to ``{self._proxy_users_url}/create-user``.
    ``proxy_type`` and ``traffic_limit`` are sent as strings and ``status``
    as ``"true"``/``"false"``. A response whose ``code`` is not 200 is
    handed to ``raise_for_code``.

    Returns:
        The response's ``data`` value, or ``{}`` when it has none.

    Raises:
        ThordataConfigError: If public credentials are missing.
    """
    self._require_public_credentials()
    if isinstance(proxy_type, ProxyType):
        type_code = int(proxy_type)
    else:
        type_code = proxy_type

    api_headers = build_public_api_headers(
        self.public_token or "", self.public_key or ""
    )
    form = {
        "proxy_type": str(type_code),
        "username": username,
        "password": password,
        "traffic_limit": str(traffic_limit),
        "status": "true" if status else "false",
    }
    endpoint = f"{self._proxy_users_url}/create-user"
    response = self._api_request_with_retry(
        "POST", endpoint, data=form, headers=api_headers
    )
    response.raise_for_status()

    body = response.json()
    api_code = body.get("code")
    if api_code != 200:
        raise_for_code("Create user failed", code=api_code, payload=body)
    return body.get("data", {})
1463
+
1464
def add_whitelist_ip(
    self,
    ip: str,
    proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
    status: bool = True,
) -> dict[str, Any]:
    """Add an IP address to the proxy whitelist.

    POSTs the entry to ``{self._whitelist_url}/add-ip``. ``proxy_type`` is
    sent as a string and ``status`` as ``"true"``/``"false"``. A response
    whose ``code`` is not 200 is handed to ``raise_for_code``.

    Returns:
        The response's ``data`` value, or ``{}`` when it has none.

    Raises:
        ThordataConfigError: If public credentials are missing.
    """
    self._require_public_credentials()
    if isinstance(proxy_type, ProxyType):
        type_code = int(proxy_type)
    else:
        type_code = proxy_type

    api_headers = build_public_api_headers(
        self.public_token or "", self.public_key or ""
    )
    form = {
        "proxy_type": str(type_code),
        "ip": ip,
        "status": "true" if status else "false",
    }
    endpoint = f"{self._whitelist_url}/add-ip"
    response = self._api_request_with_retry(
        "POST", endpoint, data=form, headers=api_headers
    )
    response.raise_for_status()

    body = response.json()
    api_code = body.get("code")
    if api_code != 200:
        raise_for_code("Add whitelist IP failed", code=api_code, payload=body)
    return body.get("data", {})
1490
+
1491
def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
    """List the proxy servers available for ``proxy_type``.

    Sends a GET to ``self._proxy_list_url`` with the public token, key and
    proxy type as query parameters.

    The response may be a JSON object or a bare JSON list:

    * object: a ``code`` other than 200 is handed to ``raise_for_code``;
      the servers are read from ``data`` (falling back to ``list``).
    * list: used directly as the server list.

    Returns:
        One ``ProxyServer`` (via ``ProxyServer.from_dict``) per entry; an
        empty list when the response carries no servers.

    Raises:
        ThordataConfigError: If public credentials are missing.
    """
    self._require_public_credentials()
    params = {
        "token": self.public_token,
        "key": self.public_key,
        "proxy_type": str(proxy_type),
    }
    response = self._api_request_with_retry(
        "GET", self._proxy_list_url, params=params
    )
    response.raise_for_status()
    data = response.json()

    # Check the shape before touching `.get`: a bare list has no `code`,
    # so the error check only applies to object responses.
    if isinstance(data, dict):
        if data.get("code") != 200:
            raise_for_code(
                "List proxy servers error", code=data.get("code"), payload=data
            )
        # `or []` covers an explicit `"data": null`.
        server_list = data.get("data", data.get("list", [])) or []
    elif isinstance(data, list):
        server_list = data
    else:
        server_list = []

    return [ProxyServer.from_dict(s) for s in server_list]
1515
+
1516
def get_proxy_expiration(
    self, ips: str | list[str], proxy_type: int
) -> dict[str, Any]:
    """Query expiration information for one or more proxy IPs.

    Args:
        ips: A single string, or a list that is joined with commas.
        proxy_type: Proxy type code, sent as a string.

    Returns:
        The response's ``data`` value, or the whole response when it has
        no ``data`` key. A ``code`` other than 200 is handed to
        ``raise_for_code``.

    Raises:
        ThordataConfigError: If public credentials are missing.
    """
    self._require_public_credentials()
    ip_csv = ",".join(ips) if isinstance(ips, list) else ips

    query = {
        "token": self.public_token,
        "key": self.public_key,
        "proxy_type": str(proxy_type),
        "ips": ip_csv,
    }
    response = self._api_request_with_retry(
        "GET", self._proxy_expiration_url, params=query
    )
    response.raise_for_status()

    body = response.json()
    api_code = body.get("code")
    if api_code != 200:
        raise_for_code("Get expiration error", code=api_code, payload=body)
    return body.get("data", body)
1536
+
1537
def list_countries(
    self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
) -> list[dict[str, Any]]:
    """List countries via the ``countries`` locations endpoint.

    A ``ProxyType`` member is converted with ``int()``; a plain value is
    forwarded as given.
    """
    if isinstance(proxy_type, ProxyType):
        type_code = int(proxy_type)
    else:
        type_code = proxy_type
    return self._get_locations("countries", proxy_type=type_code)
1542
+
1543
def list_states(
    self,
    country_code: str,
    proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
) -> list[dict[str, Any]]:
    """List states of ``country_code`` via the ``states`` locations endpoint.

    A ``ProxyType`` member is converted with ``int()``; a plain value is
    forwarded as given.
    """
    if isinstance(proxy_type, ProxyType):
        type_code = int(proxy_type)
    else:
        type_code = proxy_type
    return self._get_locations(
        "states", proxy_type=type_code, country_code=country_code
    )
1550
+
1551
def list_cities(
    self,
    country_code: str,
    state_code: str | None = None,
    proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
) -> list[dict[str, Any]]:
    """List cities of ``country_code`` via the ``cities`` locations endpoint.

    ``state_code`` is only sent when it is truthy. A ``ProxyType`` member
    is converted with ``int()``; a plain value is forwarded as given.
    """
    if isinstance(proxy_type, ProxyType):
        type_code = int(proxy_type)
    else:
        type_code = proxy_type

    filters = {"proxy_type": type_code, "country_code": country_code}
    if state_code:
        filters["state_code"] = state_code
    return self._get_locations("cities", **filters)
1562
+
1563
def list_asn(
    self,
    country_code: str,
    proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
) -> list[dict[str, Any]]:
    """List ASNs of ``country_code`` via the ``asn`` locations endpoint.

    A ``ProxyType`` member is converted with ``int()``; a plain value is
    forwarded as given.
    """
    if isinstance(proxy_type, ProxyType):
        type_code = int(proxy_type)
    else:
        type_code = proxy_type
    return self._get_locations(
        "asn", proxy_type=type_code, country_code=country_code
    )
1570
+
1571
+ def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
1572
+ self._require_public_credentials()
1573
+ params = {"token": self.public_token, "key": self.public_key}
1574
+ for k, v in kwargs.items():
1575
+ params[k] = str(v)
1576
+
1577
+ response = self._api_request_with_retry(
1578
+ "GET", f"{self._locations_base_url}/{endpoint}", params=params
1579
+ )
1580
+ response.raise_for_status()
1581
+ data = response.json()
1582
+ if isinstance(data, dict):
1583
+ if data.get("code") != 200:
1584
+ raise RuntimeError(f"Locations error: {data.get('msg')}")
1585
+ return data.get("data") or []
1586
+ return data if isinstance(data, list) else []
1587
+
1588
+ def _require_public_credentials(self) -> None:
1589
+ if not self.public_token or not self.public_key:
1590
+ raise ThordataConfigError(
1591
+ "public_token and public_key are required for this operation."
1592
+ )
1593
+
1594
+ def _get_proxy_endpoint_overrides(
1595
+ self, product: ProxyProduct
1596
+ ) -> tuple[str | None, int | None, str]:
1597
+ prefix = product.value.upper()
1598
+ host = os.getenv(f"THORDATA_{prefix}_PROXY_HOST") or os.getenv(
1599
+ "THORDATA_PROXY_HOST"
1600
+ )
1601
+ port_raw = os.getenv(f"THORDATA_{prefix}_PROXY_PORT") or os.getenv(
1602
+ "THORDATA_PROXY_PORT"
1603
+ )
1604
+ protocol = (
1605
+ os.getenv(f"THORDATA_{prefix}_PROXY_PROTOCOL")
1606
+ or os.getenv("THORDATA_PROXY_PROTOCOL")
1607
+ or "https"
1608
+ )
1609
+ port = int(port_raw) if port_raw and port_raw.isdigit() else None
1610
+ return host or None, port, protocol
1611
+
1612
def _get_default_proxy_config_from_env(self) -> ProxyConfig | None:
    """Build a default ``ProxyConfig`` from environment credentials.

    Products are tried in the order residential, datacenter, mobile. The
    first one with both ``THORDATA_<PRODUCT>_USERNAME`` and
    ``THORDATA_<PRODUCT>_PASSWORD`` set wins, and its endpoint overrides
    come from ``_get_proxy_endpoint_overrides``.

    Returns:
        The ``ProxyConfig`` for the first product with credentials, or
        ``None`` when no product has both values set.
    """
    candidates = (
        ProxyProduct.RESIDENTIAL,
        ProxyProduct.DATACENTER,
        ProxyProduct.MOBILE,
    )
    for product in candidates:
        env_prefix = product.value.upper()
        username = os.getenv(f"THORDATA_{env_prefix}_USERNAME")
        password = os.getenv(f"THORDATA_{env_prefix}_PASSWORD")
        if not (username and password):
            continue

        host, port, protocol = self._get_proxy_endpoint_overrides(product)
        return ProxyConfig(
            username=username,
            password=password,
            product=product,
            host=host,
            port=port,
            protocol=protocol,
        )
    return None
1632
+
1633
def close(self) -> None:
    """Release the client's network resources.

    Closes the proxy and API sessions, calls ``clear()`` on every cached
    proxy manager, then empties the manager cache itself.
    """
    for session in (self._proxy_session, self._api_session):
        session.close()

    managers = self._proxy_managers
    for manager in managers.values():
        manager.clear()
    managers.clear()
1639
+
1640
+ def __enter__(self) -> ThordataClient:
1641
+ return self
1642
+
1643
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
1644
+ self.close()