thordata-sdk 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/client.py CHANGED
@@ -3,74 +3,60 @@ Synchronous client for the Thordata API.
3
3
 
4
4
  This module provides the main ThordataClient class for interacting with
5
5
  Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
6
-
7
- Example:
8
- >>> from thordata import ThordataClient
9
- >>>
10
- >>> client = ThordataClient(
11
- ... scraper_token="your_token",
12
- ... public_token="your_public_token",
13
- ... public_key="your_public_key"
14
- ... )
15
- >>>
16
- >>> # Use the proxy network
17
- >>> response = client.get("https://httpbin.org/ip")
18
- >>> print(response.json())
19
- >>>
20
- >>> # Search with SERP API
21
- >>> results = client.serp_search("python tutorial", engine="google")
22
6
  """
23
7
 
24
8
  from __future__ import annotations
25
9
 
26
10
  import base64
27
- import contextlib
28
11
  import hashlib
29
- import json
30
12
  import logging
31
13
  import os
32
14
  import socket
33
15
  import ssl
34
16
  from datetime import date
35
17
  from typing import Any, cast
36
- from urllib.parse import quote, urlencode, urlparse
18
+ from urllib.parse import urlencode, urlparse
37
19
 
38
20
  import requests
39
21
  import urllib3
40
22
  from requests.structures import CaseInsensitiveDict
41
23
 
42
- from .serp_engines import SerpNamespace
43
- from .unlimited import UnlimitedNamespace
44
-
45
- try:
46
- import socks
47
-
48
- HAS_PYSOCKS = True
49
- except ImportError:
50
- HAS_PYSOCKS = False
51
-
52
- from . import __version__ as _sdk_version
24
+ # Import Legacy/Compat
53
25
  from ._utils import (
54
26
  build_auth_headers,
55
27
  build_builder_headers,
56
28
  build_public_api_headers,
57
- build_user_agent,
58
29
  decode_base64_image,
59
30
  extract_error_message,
60
31
  parse_json_response,
61
32
  )
62
- from .enums import Engine, ProxyType
33
+
34
+ # Import Core Components
35
+ from .core.http_client import ThordataHttpSession
36
+ from .core.tunnel import (
37
+ HAS_PYSOCKS,
38
+ UpstreamProxySocketFactory,
39
+ create_tls_in_tls,
40
+ parse_upstream_proxy,
41
+ socks5_handshake,
42
+ )
43
+ from .enums import Engine
63
44
  from .exceptions import (
64
45
  ThordataConfigError,
65
46
  ThordataNetworkError,
66
47
  ThordataTimeoutError,
67
48
  raise_for_code,
68
49
  )
69
- from .models import (
50
+ from .retry import RetryConfig, with_retry
51
+ from .serp_engines import SerpNamespace
52
+
53
+ # Import Types (Modernized)
54
+ from .types import (
70
55
  CommonSettings,
71
56
  ProxyConfig,
72
57
  ProxyProduct,
73
58
  ProxyServer,
59
+ ProxyType,
74
60
  ProxyUserList,
75
61
  ScraperTaskConfig,
76
62
  SerpRequest,
@@ -78,196 +64,17 @@ from .models import (
78
64
  UsageStatistics,
79
65
  VideoTaskConfig,
80
66
  )
81
- from .retry import RetryConfig, with_retry
67
+ from .unlimited import UnlimitedNamespace
82
68
 
83
69
  logger = logging.getLogger(__name__)
84
70
 
85
-
86
71
  # =========================================================================
87
- # Upstream Proxy Support (for users behind firewall)
72
+ # Internal Logic for Upstream Proxies
88
73
  # =========================================================================
89
74
 
90
75
 
91
76
  def _parse_upstream_proxy() -> dict[str, Any] | None:
92
- """
93
- Parse THORDATA_UPSTREAM_PROXY environment variable.
94
-
95
- Supported formats:
96
- - http://127.0.0.1:7897
97
- - socks5://127.0.0.1:7897
98
- - socks5://user:pass@127.0.0.1:7897
99
-
100
- Returns:
101
- Dict with proxy config or None if not set.
102
- """
103
- upstream_url = os.environ.get("THORDATA_UPSTREAM_PROXY", "").strip()
104
- if not upstream_url:
105
- return None
106
-
107
- parsed = urlparse(upstream_url)
108
- scheme = (parsed.scheme or "").lower()
109
-
110
- if scheme not in ("http", "https", "socks5", "socks5h", "socks4"):
111
- logger.warning(f"Unsupported upstream proxy scheme: {scheme}")
112
- return None
113
-
114
- return {
115
- "scheme": scheme,
116
- "host": parsed.hostname or "127.0.0.1",
117
- "port": parsed.port or (1080 if scheme.startswith("socks") else 7897),
118
- "username": parsed.username,
119
- "password": parsed.password,
120
- }
121
-
122
-
123
- class _UpstreamProxySocketFactory:
124
- """
125
- Socket factory that creates connections through an upstream proxy.
126
- Used for proxy chaining when accessing Thordata from behind a firewall.
127
- """
128
-
129
- def __init__(self, upstream_config: dict[str, Any]):
130
- self.config = upstream_config
131
-
132
- def create_connection(
133
- self,
134
- address: tuple[str, int],
135
- timeout: float | None = None,
136
- source_address: tuple[str, int] | None = None,
137
- ) -> socket.socket:
138
- """Create a socket connection through the upstream proxy."""
139
- scheme = self.config["scheme"]
140
-
141
- if scheme.startswith("socks"):
142
- return self._create_socks_connection(address, timeout)
143
- else:
144
- return self._create_http_tunnel(address, timeout)
145
-
146
- def _create_socks_connection(
147
- self,
148
- address: tuple[str, int],
149
- timeout: float | None = None,
150
- ) -> socket.socket:
151
- """Create connection through SOCKS proxy."""
152
- if not HAS_PYSOCKS:
153
- raise RuntimeError(
154
- "PySocks is required for SOCKS upstream proxy. "
155
- "Install with: pip install PySocks"
156
- )
157
-
158
- scheme = self.config["scheme"]
159
- proxy_type = socks.SOCKS5 if "socks5" in scheme else socks.SOCKS4
160
-
161
- sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
162
- sock.set_proxy(
163
- proxy_type,
164
- self.config["host"],
165
- self.config["port"],
166
- rdns=True,
167
- username=self.config.get("username"),
168
- password=self.config.get("password"),
169
- )
170
-
171
- if timeout is not None:
172
- sock.settimeout(timeout)
173
-
174
- sock.connect(address)
175
- return sock
176
-
177
- def _create_http_tunnel(
178
- self,
179
- address: tuple[str, int],
180
- timeout: float | None = None,
181
- ) -> socket.socket:
182
- """Create connection through HTTP CONNECT tunnel."""
183
- # Connect to upstream proxy
184
- sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
185
- if timeout is not None:
186
- sock.settimeout(timeout)
187
-
188
- sock.connect((self.config["host"], self.config["port"]))
189
-
190
- # Build CONNECT request
191
- target_host, target_port = address
192
- connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
193
- connect_req += f"Host: {target_host}:{target_port}\r\n"
194
-
195
- # Add proxy auth if provided
196
- if self.config.get("username"):
197
- credentials = f"{self.config['username']}:{self.config.get('password', '')}"
198
- encoded = base64.b64encode(credentials.encode()).decode()
199
- connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
200
-
201
- connect_req += "\r\n"
202
-
203
- sock.sendall(connect_req.encode())
204
-
205
- # Read response
206
- response = b""
207
- while b"\r\n\r\n" not in response:
208
- chunk = sock.recv(1024)
209
- if not chunk:
210
- raise ConnectionError("Upstream proxy closed connection")
211
- response += chunk
212
-
213
- # Check status
214
- status_line = response.split(b"\r\n")[0].decode()
215
- if "200" not in status_line:
216
- sock.close()
217
- raise ConnectionError(f"Upstream proxy CONNECT failed: {status_line}")
218
-
219
- return sock
220
-
221
-
222
- class _TLSInTLSSocket:
223
- """
224
- A socket-like wrapper for TLS-in-TLS connections.
225
-
226
- Uses SSLObject + MemoryBIO to implement TLS over an existing TLS connection.
227
- """
228
-
229
- def __init__(
230
- self,
231
- outer_sock: ssl.SSLSocket,
232
- ssl_obj: ssl.SSLObject,
233
- incoming: ssl.MemoryBIO,
234
- outgoing: ssl.MemoryBIO,
235
- ):
236
- self._outer = outer_sock
237
- self._ssl = ssl_obj
238
- self._incoming = incoming
239
- self._outgoing = outgoing
240
- self._timeout: float | None = None
241
-
242
- def settimeout(self, timeout: float | None) -> None:
243
- self._timeout = timeout
244
- self._outer.settimeout(timeout)
245
-
246
- def sendall(self, data: bytes) -> None:
247
- """Send data through the inner TLS connection."""
248
- self._ssl.write(data)
249
- encrypted = self._outgoing.read()
250
- if encrypted:
251
- self._outer.sendall(encrypted)
252
-
253
- def recv(self, bufsize: int) -> bytes:
254
- """Receive data from the inner TLS connection."""
255
- while True:
256
- try:
257
- return self._ssl.read(bufsize)
258
- except ssl.SSLWantReadError:
259
- self._outer.settimeout(self._timeout)
260
- try:
261
- received = self._outer.recv(8192)
262
- if not received:
263
- return b""
264
- self._incoming.write(received)
265
- except socket.timeout:
266
- return b""
267
-
268
- def close(self) -> None:
269
- with contextlib.suppress(Exception):
270
- self._outer.close()
77
+ return parse_upstream_proxy()
271
78
 
272
79
 
273
80
  # =========================================================================
@@ -300,24 +107,6 @@ class ThordataClient:
300
107
  web_scraper_api_base_url: str | None = None,
301
108
  locations_base_url: str | None = None,
302
109
  ) -> None:
303
- """Initialize the Thordata Client.
304
-
305
- Args:
306
- scraper_token: Token for SERP/Universal scraping APIs.
307
- public_token: Public API token for account/management operations.
308
- public_key: Public API key for account/management operations.
309
- proxy_host: Default proxy host for residential proxies.
310
- proxy_port: Default proxy port for residential proxies.
311
- timeout: Default timeout for proxy requests.
312
- api_timeout: Default timeout for API requests.
313
- retry_config: Configuration for retry behavior.
314
- auth_mode: Authentication mode for scraper_token ("bearer" or "header_token").
315
- scraperapi_base_url: Override base URL for SERP API.
316
- universalapi_base_url: Override base URL for Universal Scraping API.
317
- web_scraper_api_base_url: Override base URL for Web Scraper API.
318
- locations_base_url: Override base URL for Locations API.
319
- """
320
-
321
110
  self.scraper_token = scraper_token
322
111
  self.public_token = public_token
323
112
  self.public_key = public_key
@@ -334,17 +123,17 @@ class ThordataClient:
334
123
  f"Invalid auth_mode: {auth_mode}. Must be 'bearer' or 'header_token'."
335
124
  )
336
125
 
126
+ # Initialize Core HTTP Client for API calls
127
+ self._http = ThordataHttpSession(
128
+ timeout=api_timeout, retry_config=self._retry_config
129
+ )
130
+
131
+ # Legacy logic for Proxy Network connections (requests.Session)
337
132
  self._proxy_session = requests.Session()
338
133
  self._proxy_session.trust_env = False
339
134
  self._proxy_managers: dict[str, urllib3.PoolManager] = {}
340
135
 
341
- self._api_session = requests.Session()
342
- self._api_session.trust_env = True
343
- self._api_session.headers.update(
344
- {"User-Agent": build_user_agent(_sdk_version, "requests")}
345
- )
346
-
347
- # Base URLs
136
+ # Base URLs Configuration
348
137
  scraperapi_base = (
349
138
  scraperapi_base_url
350
139
  or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
@@ -369,14 +158,14 @@ class ThordataClient:
369
158
  or self.LOCATIONS_URL
370
159
  ).rstrip("/")
371
160
 
372
- gateway_base = os.getenv(
161
+ self._gateway_base_url = os.getenv(
373
162
  "THORDATA_GATEWAY_BASE_URL", "https://api.thordata.com/api/gateway"
374
163
  )
375
- self._gateway_base_url = gateway_base
376
164
  self._child_base_url = os.getenv(
377
165
  "THORDATA_CHILD_BASE_URL", "https://api.thordata.com/api/child"
378
166
  )
379
167
 
168
+ # URL Construction
380
169
  self._serp_url = f"{scraperapi_base}/request"
381
170
  self._builder_url = f"{scraperapi_base}/builder"
382
171
  self._video_builder_url = f"{scraperapi_base}/video_builder"
@@ -388,12 +177,10 @@ class ThordataClient:
388
177
 
389
178
  self._locations_base_url = locations_base
390
179
 
391
- self._usage_stats_url = (
392
- f"{locations_base.replace('/locations', '')}/account/usage-statistics"
393
- )
394
- self._proxy_users_url = (
395
- f"{locations_base.replace('/locations', '')}/proxy-users"
396
- )
180
+ # Determine shared API base from locations URL
181
+ shared_api_base = locations_base.replace("/locations", "")
182
+ self._usage_stats_url = f"{shared_api_base}/account/usage-statistics"
183
+ self._proxy_users_url = f"{shared_api_base}/proxy-users"
397
184
 
398
185
  whitelist_base = os.getenv(
399
186
  "THORDATA_WHITELIST_BASE_URL", "https://api.thordata.com/api"
@@ -406,7 +193,7 @@ class ThordataClient:
406
193
  self._proxy_list_url = f"{proxy_api_base}/proxy/proxy-list"
407
194
  self._proxy_expiration_url = f"{proxy_api_base}/proxy/expiration-time"
408
195
 
409
- # Initialize Namespaces AFTER all base URLs are set
196
+ # Initialize Namespaces
410
197
  self.serp = SerpNamespace(self)
411
198
  self.unlimited = UnlimitedNamespace(self)
412
199
 
@@ -416,8 +203,8 @@ class ThordataClient:
416
203
 
417
204
  def close(self) -> None:
418
205
  """Close the client and release resources."""
206
+ self._http.close()
419
207
  self._proxy_session.close()
420
- self._api_session.close()
421
208
  for pm in self._proxy_managers.values():
422
209
  pm.clear()
423
210
  self._proxy_managers.clear()
@@ -428,6 +215,30 @@ class ThordataClient:
428
215
  def __exit__(self, exc_type, exc_val, exc_tb) -> None:
429
216
  self.close()
430
217
 
218
+ # =========================================================================
219
+ # Internal Helper: API Request Delegation
220
+ # =========================================================================
221
+
222
+ def _api_request_with_retry(
223
+ self,
224
+ method: str,
225
+ url: str,
226
+ *,
227
+ data: dict[str, Any] | None = None,
228
+ headers: dict[str, str] | None = None,
229
+ params: dict[str, Any] | None = None,
230
+ ) -> requests.Response:
231
+ """Delegate to Core HTTP Client."""
232
+ return self._http.request(
233
+ method=method, url=url, data=data, headers=headers, params=params
234
+ )
235
+
236
+ def _require_public_credentials(self) -> None:
237
+ if not self.public_token or not self.public_key:
238
+ raise ThordataConfigError(
239
+ "public_token and public_key are required for this operation."
240
+ )
241
+
431
242
  # =========================================================================
432
243
  # Proxy Network Methods
433
244
  # =========================================================================
@@ -440,17 +251,6 @@ class ThordataClient:
440
251
  timeout: int | None = None,
441
252
  **kwargs: Any,
442
253
  ) -> requests.Response:
443
- """Make a GET request through the proxy network.
444
-
445
- Args:
446
- url: Target URL to request.
447
- proxy_config: Proxy configuration. If not provided, uses environment variables.
448
- timeout: Request timeout in seconds.
449
- **kwargs: Additional arguments passed to requests.
450
-
451
- Returns:
452
- Response object.
453
- """
454
254
  logger.debug(f"Proxy GET request: {url}")
455
255
  return self._proxy_verb("GET", url, proxy_config, timeout, **kwargs)
456
256
 
@@ -462,17 +262,6 @@ class ThordataClient:
462
262
  timeout: int | None = None,
463
263
  **kwargs: Any,
464
264
  ) -> requests.Response:
465
- """Make a POST request through the proxy network.
466
-
467
- Args:
468
- url: Target URL to request.
469
- proxy_config: Proxy configuration. If not provided, uses environment variables.
470
- timeout: Request timeout in seconds.
471
- **kwargs: Additional arguments passed to requests.
472
-
473
- Returns:
474
- Response object.
475
- """
476
265
  logger.debug(f"Proxy POST request: {url}")
477
266
  return self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)
478
267
 
@@ -488,21 +277,6 @@ class ThordataClient:
488
277
  session_duration: int | None = None,
489
278
  product: ProxyProduct | str = ProxyProduct.RESIDENTIAL,
490
279
  ) -> str:
491
- """Build a proxy URL with location and session parameters.
492
-
493
- Args:
494
- username: Proxy username.
495
- password: Proxy password.
496
- country: Country code (e.g., "us", "uk").
497
- state: State/region code (e.g., "ca", "ny").
498
- city: City name (e.g., "new-york", "london").
499
- session_id: Session identifier for sticky sessions.
500
- session_duration: Session duration in minutes (1-90).
501
- product: Proxy product type (RESIDENTIAL, DATACENTER, MOBILE).
502
-
503
- Returns:
504
- Formatted proxy URL.
505
- """
506
280
  config = ProxyConfig(
507
281
  username=username,
508
282
  password=password,
@@ -536,24 +310,6 @@ class ThordataClient:
536
310
  output_format: str = "json",
537
311
  **kwargs: Any,
538
312
  ) -> dict[str, Any]:
539
- """Perform a search engine query using SERP API.
540
-
541
- Args:
542
- query: Search query string.
543
- engine: Search engine (GOOGLE, BING, YAHOO, etc.).
544
- num: Number of results to return.
545
- country: Country code for localized results.
546
- language: Language code for interface.
547
- search_type: Type of search (images, news, video, etc.).
548
- device: Device type (desktop, mobile).
549
- render_js: Whether to render JavaScript.
550
- no_cache: Bypass cache.
551
- output_format: Output format ("json" or "html").
552
- **kwargs: Additional engine-specific parameters.
553
-
554
- Returns:
555
- Search results as dictionary.
556
- """
557
313
  engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
558
314
 
559
315
  request = SerpRequest(
@@ -569,18 +325,9 @@ class ThordataClient:
569
325
  output_format=output_format,
570
326
  extra_params=kwargs,
571
327
  )
572
-
573
328
  return self.serp_search_advanced(request)
574
329
 
575
330
  def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
576
- """Perform advanced search with a SerpRequest object.
577
-
578
- Args:
579
- request: SerpRequest object with search parameters.
580
-
581
- Returns:
582
- Search results as dictionary.
583
- """
584
331
  if not self.scraper_token:
585
332
  raise ThordataConfigError("scraper_token is required for SERP API")
586
333
 
@@ -589,30 +336,24 @@ class ThordataClient:
589
336
 
590
337
  logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")
591
338
 
592
- try:
593
- response = self._api_request_with_retry(
594
- "POST",
595
- self._serp_url,
596
- data=payload,
597
- headers=headers,
598
- )
599
- response.raise_for_status()
600
-
601
- if request.output_format.lower() == "json":
602
- data = response.json()
603
- if isinstance(data, dict):
604
- code = data.get("code")
605
- if code is not None and code != 200:
606
- msg = extract_error_message(data)
607
- raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
608
- return parse_json_response(data)
339
+ response = self._api_request_with_retry(
340
+ "POST",
341
+ self._serp_url,
342
+ data=payload,
343
+ headers=headers,
344
+ )
345
+ response.raise_for_status()
609
346
 
610
- return {"html": response.text}
347
+ if request.output_format.lower() == "json":
348
+ data = response.json()
349
+ if isinstance(data, dict):
350
+ code = data.get("code")
351
+ if code is not None and code != 200:
352
+ msg = extract_error_message(data)
353
+ raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
354
+ return parse_json_response(data)
611
355
 
612
- except requests.Timeout as e:
613
- raise ThordataTimeoutError(f"SERP timeout: {e}", original_error=e) from e
614
- except requests.RequestException as e:
615
- raise ThordataNetworkError(f"SERP failed: {e}", original_error=e) from e
356
+ return {"html": response.text}
616
357
 
617
358
  # =========================================================================
618
359
  # Universal Scraping API (WEB UNLOCKER) Methods
@@ -630,21 +371,6 @@ class ThordataClient:
630
371
  wait_for: str | None = None,
631
372
  **kwargs: Any,
632
373
  ) -> str | bytes:
633
- """Scrape a URL using Universal Scraping API.
634
-
635
- Args:
636
- url: Target URL to scrape.
637
- js_render: Whether to render JavaScript.
638
- output_format: Output format ("html" or "png").
639
- country: Country for IP geolocation.
640
- block_resources: Block specific resources (e.g., "script,css").
641
- wait: Wait time in milliseconds before fetching.
642
- wait_for: CSS selector to wait for before fetching.
643
- **kwargs: Additional parameters.
644
-
645
- Returns:
646
- Scraped content as string (HTML) or bytes (PNG).
647
- """
648
374
  request = UniversalScrapeRequest(
649
375
  url=url,
650
376
  js_render=js_render,
@@ -658,40 +384,17 @@ class ThordataClient:
658
384
  return self.universal_scrape_advanced(request)
659
385
 
660
386
  def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
661
- """Scrape with advanced options using UniversalScrapeRequest.
662
-
663
- Args:
664
- request: UniversalScrapeRequest object with scrape parameters.
665
-
666
- Returns:
667
- Scraped content as string (HTML) or bytes (PNG).
668
- """
669
387
  if not self.scraper_token:
670
- raise ThordataConfigError("scraper_token is required for Universal API")
388
+ raise ThordataConfigError("scraper_token required")
671
389
 
672
390
  payload = request.to_payload()
673
391
  headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
674
392
 
675
- logger.info(f"Universal Scrape: {request.url}")
676
-
677
- try:
678
- response = self._api_request_with_retry(
679
- "POST",
680
- self._universal_url,
681
- data=payload,
682
- headers=headers,
683
- )
684
- response.raise_for_status()
685
- return self._process_universal_response(response, request.output_format)
686
-
687
- except requests.Timeout as e:
688
- raise ThordataTimeoutError(
689
- f"Universal timeout: {e}", original_error=e
690
- ) from e
691
- except requests.RequestException as e:
692
- raise ThordataNetworkError(
693
- f"Universal failed: {e}", original_error=e
694
- ) from e
393
+ response = self._api_request_with_retry(
394
+ "POST", self._universal_url, data=payload, headers=headers
395
+ )
396
+ response.raise_for_status()
397
+ return self._process_universal_response(response, request.output_format)
695
398
 
696
399
  # =========================================================================
697
400
  # Web Scraper API - Task Management
@@ -705,18 +408,6 @@ class ThordataClient:
705
408
  parameters: dict[str, Any],
706
409
  universal_params: dict[str, Any] | None = None,
707
410
  ) -> str:
708
- """Create a web scraping task.
709
-
710
- Args:
711
- file_name: Name for the output file (supports {{TasksID}} template).
712
- spider_id: Spider identifier from Dashboard.
713
- spider_name: Spider name (target domain, e.g., "amazon.com").
714
- parameters: Spider-specific parameters.
715
- universal_params: Global spider settings.
716
-
717
- Returns:
718
- Task ID.
719
- """
720
411
  config = ScraperTaskConfig(
721
412
  file_name=file_name,
722
413
  spider_id=spider_id,
@@ -726,38 +417,73 @@ class ThordataClient:
726
417
  )
727
418
  return self.create_scraper_task_advanced(config)
728
419
 
729
- def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
730
- """Create a web scraping task with advanced configuration.
420
+ def run_tool(
421
+ self,
422
+ tool_request: Any,
423
+ file_name: str | None = None,
424
+ universal_params: dict[str, Any] | None = None,
425
+ ) -> str:
426
+ """
427
+ Run a specific pre-defined tool.
428
+ Supports both standard Scrapers and Video downloaders.
429
+ """
430
+ if not hasattr(tool_request, "to_task_parameters") or not hasattr(
431
+ tool_request, "get_spider_id"
432
+ ):
433
+ raise ValueError(
434
+ "tool_request must be an instance of a thordata.tools class"
435
+ )
731
436
 
732
- Args:
733
- config: ScraperTaskConfig object with task configuration.
437
+ spider_id = tool_request.get_spider_id()
438
+ spider_name = tool_request.get_spider_name()
439
+ params = tool_request.to_task_parameters()
734
440
 
735
- Returns:
736
- Task ID.
737
- """
441
+ if not file_name:
442
+ import uuid
443
+
444
+ short_id = uuid.uuid4().hex[:8]
445
+ file_name = f"{spider_id}_{short_id}"
446
+
447
+ # Check if it's a Video Tool (Duck typing check for common_settings)
448
+ if hasattr(tool_request, "common_settings"):
449
+ # It is a Video Task
450
+ config_video = VideoTaskConfig(
451
+ file_name=file_name,
452
+ spider_id=spider_id,
453
+ spider_name=spider_name,
454
+ parameters=params,
455
+ common_settings=tool_request.common_settings,
456
+ )
457
+ return self.create_video_task_advanced(config_video)
458
+ else:
459
+ # It is a Standard Scraper Task
460
+ config = ScraperTaskConfig(
461
+ file_name=file_name,
462
+ spider_id=spider_id,
463
+ spider_name=spider_name,
464
+ parameters=params,
465
+ universal_params=universal_params,
466
+ )
467
+ return self.create_scraper_task_advanced(config)
468
+
469
+ def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
738
470
  self._require_public_credentials()
739
471
  if not self.scraper_token:
740
472
  raise ThordataConfigError("scraper_token is required for Task Builder")
473
+
741
474
  payload = config.to_payload()
742
475
  headers = build_builder_headers(
743
- self.scraper_token, self.public_token or "", self.public_key or ""
476
+ self.scraper_token, str(self.public_token), str(self.public_key)
744
477
  )
745
478
 
746
- try:
747
- response = self._api_request_with_retry(
748
- "POST", self._builder_url, data=payload, headers=headers
749
- )
750
- response.raise_for_status()
751
- data = response.json()
752
- if data.get("code") != 200:
753
- raise_for_code(
754
- "Task creation failed", code=data.get("code"), payload=data
755
- )
756
- return data["data"]["task_id"]
757
- except requests.RequestException as e:
758
- raise ThordataNetworkError(
759
- f"Task creation failed: {e}", original_error=e
760
- ) from e
479
+ response = self._api_request_with_retry(
480
+ "POST", self._builder_url, data=payload, headers=headers
481
+ )
482
+ response.raise_for_status()
483
+ data = response.json()
484
+ if data.get("code") != 200:
485
+ raise_for_code("Task creation failed", code=data.get("code"), payload=data)
486
+ return data["data"]["task_id"]
761
487
 
762
488
  def create_video_task(
763
489
  self,
@@ -767,18 +493,6 @@ class ThordataClient:
767
493
  parameters: dict[str, Any],
768
494
  common_settings: CommonSettings,
769
495
  ) -> str:
770
- """Create a video/audio download task (YouTube, etc.).
771
-
772
- Args:
773
- file_name: Name for the output file.
774
- spider_id: Spider identifier (e.g., "youtube_video_by-url").
775
- spider_name: Target site (e.g., "youtube.com").
776
- parameters: Spider-specific parameters (URLs, etc.).
777
- common_settings: Video/audio settings (resolution, subtitles, etc.).
778
-
779
- Returns:
780
- Task ID.
781
- """
782
496
  config = VideoTaskConfig(
783
497
  file_name=file_name,
784
498
  spider_id=spider_id,
@@ -789,14 +503,6 @@ class ThordataClient:
789
503
  return self.create_video_task_advanced(config)
790
504
 
791
505
  def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
792
- """Create a video task with advanced configuration.
793
-
794
- Args:
795
- config: VideoTaskConfig object with task configuration.
796
-
797
- Returns:
798
- Task ID.
799
- """
800
506
  self._require_public_credentials()
801
507
  if not self.scraper_token:
802
508
  raise ThordataConfigError(
@@ -805,7 +511,7 @@ class ThordataClient:
805
511
 
806
512
  payload = config.to_payload()
807
513
  headers = build_builder_headers(
808
- self.scraper_token, self.public_token or "", self.public_key or ""
514
+ self.scraper_token, str(self.public_token), str(self.public_key)
809
515
  )
810
516
 
811
517
  response = self._api_request_with_retry(
@@ -820,100 +526,78 @@ class ThordataClient:
820
526
  return data["data"]["task_id"]
821
527
 
822
528
  def get_task_status(self, task_id: str) -> str:
823
- """Get the status of a scraping task.
529
+ self._require_public_credentials()
530
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
824
531
 
825
- Args:
826
- task_id: Task identifier.
532
+ response = self._api_request_with_retry(
533
+ "POST",
534
+ self._status_url,
535
+ data={"tasks_ids": task_id},
536
+ headers=headers,
537
+ )
538
+ response.raise_for_status()
539
+ data = response.json()
540
+ if data.get("code") != 200:
541
+ raise_for_code("Task status error", code=data.get("code"), payload=data)
542
+
543
+ items = data.get("data") or []
544
+ for item in items:
545
+ if str(item.get("task_id")) == str(task_id):
546
+ return item.get("status", "unknown")
547
+ return "unknown"
827
548
 
828
- Returns:
829
- Status string (running, success, failed, etc.).
549
+ def get_latest_task_status(self) -> dict[str, Any]:
550
+ """
551
+ Get the status of the last task of the specified account.
830
552
  """
831
553
  self._require_public_credentials()
832
- headers = build_public_api_headers(
833
- self.public_token or "", self.public_key or ""
554
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
555
+ parsed = urlparse(self._status_url)
556
+ base = f"{parsed.scheme}://{parsed.netloc}"
557
+ endpoint = "/api/web_scraper_api/get_latest_task_status"
558
+
559
+ response = self._api_request_with_retry(
560
+ "POST",
561
+ f"{base}{endpoint}",
562
+ headers=headers,
834
563
  )
835
- try:
836
- response = self._api_request_with_retry(
837
- "POST",
838
- self._status_url,
839
- data={"tasks_ids": task_id},
840
- headers=headers,
841
- )
842
- response.raise_for_status()
843
- data = response.json()
844
- if data.get("code") != 200:
845
- raise_for_code("Task status error", code=data.get("code"), payload=data)
846
-
847
- items = data.get("data") or []
848
- for item in items:
849
- if str(item.get("task_id")) == str(task_id):
850
- return item.get("status", "unknown")
851
- return "unknown"
852
- except requests.RequestException as e:
853
- raise ThordataNetworkError(
854
- f"Status check failed: {e}", original_error=e
855
- ) from e
564
+ response.raise_for_status()
565
+ data = response.json()
856
566
 
857
- def safe_get_task_status(self, task_id: str) -> str:
858
- """Get task status with error handling.
567
+ if data.get("code") != 200:
568
+ raise_for_code(
569
+ "Get latest task status failed", code=data.get("code"), payload=data
570
+ )
859
571
 
860
- Args:
861
- task_id: Task identifier.
572
+ return data.get("data", {})
862
573
 
863
- Returns:
864
- Status string or "error" on failure.
865
- """
574
+ def safe_get_task_status(self, task_id: str) -> str:
866
575
  try:
867
576
  return self.get_task_status(task_id)
868
577
  except Exception:
869
578
  return "error"
870
579
 
871
580
  def get_task_result(self, task_id: str, file_type: str = "json") -> str:
872
- """Get the download URL for a completed task.
873
-
874
- Args:
875
- task_id: Task identifier.
876
- file_type: File type to download (json, csv, video, audio, subtitle).
877
-
878
- Returns:
879
- Download URL.
880
- """
881
581
  self._require_public_credentials()
882
- headers = build_public_api_headers(
883
- self.public_token or "", self.public_key or ""
582
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
583
+
584
+ response = self._api_request_with_retry(
585
+ "POST",
586
+ self._download_url,
587
+ data={"tasks_id": task_id, "type": file_type},
588
+ headers=headers,
884
589
  )
885
- try:
886
- response = self._api_request_with_retry(
887
- "POST",
888
- self._download_url,
889
- data={"tasks_id": task_id, "type": file_type},
890
- headers=headers,
891
- )
892
- response.raise_for_status()
893
- data = response.json()
894
- if data.get("code") == 200 and data.get("data"):
895
- return data["data"]["download"]
896
- raise_for_code("Get result failed", code=data.get("code"), payload=data)
897
- return ""
898
- except requests.RequestException as e:
899
- raise ThordataNetworkError(
900
- f"Get result failed: {e}", original_error=e
901
- ) from e
590
+ response.raise_for_status()
591
+ data = response.json()
592
+ if data.get("code") == 200 and data.get("data"):
593
+ return data["data"]["download"]
594
+ raise_for_code("Get result failed", code=data.get("code"), payload=data)
595
+ return ""
902
596
 
903
597
  def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
904
- """List all scraping tasks.
905
-
906
- Args:
907
- page: Page number for pagination.
908
- size: Number of items per page.
909
-
910
- Returns:
911
- Dictionary with count and list of tasks.
912
- """
913
598
  self._require_public_credentials()
914
- headers = build_public_api_headers(
915
- self.public_token or "", self.public_key or ""
916
- )
599
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
600
+
917
601
  response = self._api_request_with_retry(
918
602
  "POST",
919
603
  self._list_url,
@@ -933,16 +617,6 @@ class ThordataClient:
933
617
  poll_interval: float = 5.0,
934
618
  max_wait: float = 600.0,
935
619
  ) -> str:
936
- """Wait for a task to complete.
937
-
938
- Args:
939
- task_id: Task identifier.
940
- poll_interval: Polling interval in seconds.
941
- max_wait: Maximum time to wait in seconds.
942
-
943
- Returns:
944
- Final status of the task.
945
- """
946
620
  import time
947
621
 
948
622
  start = time.monotonic()
@@ -972,42 +646,14 @@ class ThordataClient:
972
646
  initial_poll_interval: float = 2.0,
973
647
  max_poll_interval: float = 10.0,
974
648
  include_errors: bool = True,
975
- # New parameters
976
- task_type: str = "web", # "web" or "video"
649
+ task_type: str = "web",
977
650
  common_settings: CommonSettings | None = None,
978
651
  ) -> str:
979
- """High-level wrapper to run a task and wait for result.
980
-
981
- This method handles the entire lifecycle:
982
- 1. Create Task
983
- 2. Poll status (with exponential backoff)
984
- 3. Get download URL when ready
985
-
986
- Args:
987
- file_name: Name for the output file.
988
- spider_id: Spider identifier from Dashboard.
989
- spider_name: Spider name (target domain).
990
- parameters: Spider-specific parameters.
991
- universal_params: Global spider settings.
992
- max_wait: Maximum seconds to wait for completion.
993
- initial_poll_interval: Starting poll interval in seconds.
994
- max_poll_interval: Maximum poll interval cap.
995
- include_errors: Whether to include error logs.
996
-
997
- Returns:
998
- The download URL for the task result.
999
-
1000
- Raises:
1001
- ThordataTimeoutError: If task takes longer than max_wait.
1002
- ThordataAPIError: If task fails or is cancelled.
1003
- """
1004
652
  import time
1005
653
 
1006
- # 1. Create Task
1007
654
  if task_type == "video":
1008
655
  if common_settings is None:
1009
656
  raise ValueError("common_settings is required for video tasks")
1010
-
1011
657
  config_video = VideoTaskConfig(
1012
658
  file_name=file_name,
1013
659
  spider_id=spider_id,
@@ -1028,9 +674,8 @@ class ThordataClient:
1028
674
  )
1029
675
  task_id = self.create_scraper_task_advanced(config)
1030
676
 
1031
- logger.info(f"Task created successfully: {task_id}. Waiting for completion...")
677
+ logger.info(f"Task created: {task_id}. Polling...")
1032
678
 
1033
- # 2. Poll Status (Smart Backoff)
1034
679
  start_time = time.monotonic()
1035
680
  current_poll = initial_poll_interval
1036
681
 
@@ -1039,20 +684,17 @@ class ThordataClient:
1039
684
  status_lower = status.lower()
1040
685
 
1041
686
  if status_lower in {"ready", "success", "finished"}:
1042
- logger.info(f"Task {task_id} finished. Status: {status}")
1043
- # 3. Get Result
1044
687
  return self.get_task_result(task_id)
1045
688
 
1046
689
  if status_lower in {"failed", "error", "cancelled"}:
1047
690
  raise ThordataNetworkError(
1048
- f"Task {task_id} ended with failed status: {status}"
691
+ f"Task {task_id} failed with status: {status}"
1049
692
  )
1050
693
 
1051
- # Wait and increase interval (capped)
1052
694
  time.sleep(current_poll)
1053
695
  current_poll = min(current_poll * 1.5, max_poll_interval)
1054
696
 
1055
- raise ThordataTimeoutError(f"Task {task_id} timed out after {max_wait} seconds")
697
+ raise ThordataTimeoutError(f"Task {task_id} timed out")
1056
698
 
1057
699
  # =========================================================================
1058
700
  # Account & Usage Methods
@@ -1063,15 +705,6 @@ class ThordataClient:
1063
705
  from_date: str | date,
1064
706
  to_date: str | date,
1065
707
  ) -> UsageStatistics:
1066
- """Get usage statistics for a date range.
1067
-
1068
- Args:
1069
- from_date: Start date (YYYY-MM-DD format or date object).
1070
- to_date: End date (YYYY-MM-DD format or date object).
1071
-
1072
- Returns:
1073
- UsageStatistics object with traffic data.
1074
- """
1075
708
  self._require_public_credentials()
1076
709
  if isinstance(from_date, date):
1077
710
  from_date = from_date.strftime("%Y-%m-%d")
@@ -1094,17 +727,9 @@ class ThordataClient:
1094
727
  return UsageStatistics.from_dict(data.get("data", data))
1095
728
 
1096
729
  def get_traffic_balance(self) -> float:
1097
- """
1098
- Get the current traffic balance in KB via Public API.
1099
- """
1100
730
  self._require_public_credentials()
1101
- # FIX: Auth params must be in Query, NOT Headers
1102
- params = {
1103
- "token": self.public_token,
1104
- "key": self.public_key,
1105
- }
731
+ params = {"token": self.public_token, "key": self.public_key}
1106
732
  api_base = self._locations_base_url.replace("/locations", "")
1107
-
1108
733
  response = self._api_request_with_retry(
1109
734
  "GET", f"{api_base}/account/traffic-balance", params=params
1110
735
  )
@@ -1114,21 +739,12 @@ class ThordataClient:
1114
739
  raise_for_code(
1115
740
  "Get traffic balance failed", code=data.get("code"), payload=data
1116
741
  )
1117
-
1118
742
  return float(data.get("data", {}).get("traffic_balance", 0))
1119
743
 
1120
744
  def get_wallet_balance(self) -> float:
1121
- """
1122
- Get the current wallet balance via Public API.
1123
- """
1124
745
  self._require_public_credentials()
1125
- # FIX: Auth params must be in Query, NOT Headers
1126
- params = {
1127
- "token": self.public_token,
1128
- "key": self.public_key,
1129
- }
746
+ params = {"token": self.public_token, "key": self.public_key}
1130
747
  api_base = self._locations_base_url.replace("/locations", "")
1131
-
1132
748
  response = self._api_request_with_retry(
1133
749
  "GET", f"{api_base}/account/wallet-balance", params=params
1134
750
  )
@@ -1138,7 +754,6 @@ class ThordataClient:
1138
754
  raise_for_code(
1139
755
  "Get wallet balance failed", code=data.get("code"), payload=data
1140
756
  )
1141
-
1142
757
  return float(data.get("data", {}).get("balance", 0))
1143
758
 
1144
759
  def get_proxy_user_usage(
@@ -1148,21 +763,8 @@ class ThordataClient:
1148
763
  end_date: str | date,
1149
764
  proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1150
765
  ) -> list[dict[str, Any]]:
1151
- """
1152
- Get traffic usage statistics for a specific proxy user.
1153
-
1154
- Args:
1155
- username: Sub-account username.
1156
- start_date: Start date (YYYY-MM-DD).
1157
- end_date: End date (YYYY-MM-DD).
1158
- proxy_type: Proxy product type.
1159
-
1160
- Returns:
1161
- List of daily usage records.
1162
- """
1163
766
  self._require_public_credentials()
1164
767
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1165
-
1166
768
  if isinstance(start_date, date):
1167
769
  start_date = start_date.strftime("%Y-%m-%d")
1168
770
  if isinstance(end_date, date):
@@ -1176,7 +778,6 @@ class ThordataClient:
1176
778
  "from_date": start_date,
1177
779
  "to_date": end_date,
1178
780
  }
1179
-
1180
781
  response = self._api_request_with_retry(
1181
782
  "GET", f"{self._proxy_users_url}/usage-statistics", params=params
1182
783
  )
@@ -1184,10 +785,51 @@ class ThordataClient:
1184
785
  data = response.json()
1185
786
  if data.get("code") != 200:
1186
787
  raise_for_code("Get user usage failed", code=data.get("code"), payload=data)
1187
-
1188
- # Structure: { "data": [ { "date": "...", "usage_traffic": ... } ] }
1189
788
  return data.get("data", [])
1190
789
 
790
+ def get_proxy_user_usage_hour(
791
+ self,
792
+ username: str,
793
+ from_date: str, # Format: yyyy-mm-dd HH
794
+ to_date: str, # Format: yyyy-mm-dd HH
795
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
796
+ ) -> list[dict[str, Any]]:
797
+ """
798
+ Get proxy user traffic usage logs by hour.
799
+
800
+ Args:
801
+ username: The proxy username.
802
+ from_date: Start date string (yyyy-mm-dd HH).
803
+ to_date: End date string (yyyy-mm-dd HH).
804
+ proxy_type: Proxy type (default: Residential).
805
+ """
806
+ self._require_public_credentials()
807
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
808
+
809
+ params = {
810
+ "token": self.public_token,
811
+ "key": self.public_key,
812
+ "proxy_type": str(pt),
813
+ "username": username,
814
+ "from_date": from_date,
815
+ "to_date": to_date,
816
+ }
817
+ response = self._api_request_with_retry(
818
+ "GET", f"{self._proxy_users_url}/usage-statistics-hour", params=params
819
+ )
820
+ response.raise_for_status()
821
+ data = response.json()
822
+ if data.get("code") != 200:
823
+ raise_for_code(
824
+ "Get hourly usage failed", code=data.get("code"), payload=data
825
+ )
826
+
827
+ # API returns { "data": { "data": [...] } } structure
828
+ inner_data = data.get("data", {})
829
+ if isinstance(inner_data, dict):
830
+ return inner_data.get("data", [])
831
+ return []
832
+
1191
833
  def extract_ip_list(
1192
834
  self,
1193
835
  num: int = 1,
@@ -1199,40 +841,16 @@ class ThordataClient:
1199
841
  return_type: str = "txt",
1200
842
  protocol: str = "http",
1201
843
  sep: str = "\r\n",
1202
- product: str = "residential", # residential or unlimited
844
+ product: str = "residential",
1203
845
  ) -> list[str]:
1204
- """
1205
- Extract proxy IP list via API (get-ip.thordata.net).
1206
- Requires IP whitelist configuration.
1207
-
1208
- Args:
1209
- num: Number of IPs to extract.
1210
- country: Country code.
1211
- state: State code.
1212
- city: City name.
1213
- time_limit: Session duration (1-90 mins).
1214
- port: Specific port.
1215
- return_type: "txt" or "json".
1216
- protocol: "http" or "socks5".
1217
- sep: Separator for txt output.
1218
- product: "residential" or "unlimited".
1219
-
1220
- Returns:
1221
- List of "IP:Port" strings.
1222
- """
1223
- # Determine endpoint based on product
1224
846
  base_url = "https://get-ip.thordata.net"
1225
847
  endpoint = "/unlimited_api" if product == "unlimited" else "/api"
1226
-
1227
- # Build params
1228
848
  params: dict[str, Any] = {
1229
849
  "num": str(num),
1230
850
  "return_type": return_type,
1231
851
  "protocol": protocol,
1232
852
  "sep": sep,
1233
853
  }
1234
-
1235
- # Add optional params
1236
854
  if country:
1237
855
  params["country"] = country
1238
856
  if state:
@@ -1248,17 +866,15 @@ class ThordataClient:
1248
866
  if username:
1249
867
  params["td-customer"] = username
1250
868
 
1251
- response = self._api_session.get(
1252
- f"{base_url}{endpoint}", params=params, timeout=self._default_timeout
869
+ response = self._api_request_with_retry(
870
+ "GET", f"{base_url}{endpoint}", params=params
1253
871
  )
1254
872
  response.raise_for_status()
1255
873
 
1256
- # Parse result
1257
874
  if return_type == "json":
1258
875
  data = response.json()
1259
- # JSON format: { "code": 0, "data": [ { "ip": "...", "port": ... } ] }
1260
876
  if isinstance(data, dict):
1261
- if data.get("code") == 0 or data.get("code") == 200:
877
+ if data.get("code") in (0, 200):
1262
878
  raw_list = data.get("data") or []
1263
879
  return [f"{item['ip']}:{item['port']}" for item in raw_list]
1264
880
  else:
@@ -1266,40 +882,28 @@ class ThordataClient:
1266
882
  "Extract IPs failed", code=data.get("code"), payload=data
1267
883
  )
1268
884
  return []
1269
-
1270
- else: # txt
885
+ else:
1271
886
  text = response.text.strip()
1272
- # Check for error message in text (often starts with { or contains "error")
1273
887
  if text.startswith("{") and "code" in text:
1274
- # Try parsing as JSON error
1275
888
  try:
1276
- err_data = json.loads(text)
889
+ err_data = response.json()
1277
890
  raise_for_code(
1278
891
  "Extract IPs failed",
1279
892
  code=err_data.get("code"),
1280
893
  payload=err_data,
1281
894
  )
1282
- except json.JSONDecodeError:
895
+ except ValueError:
1283
896
  pass
1284
-
1285
897
  actual_sep = sep.replace("\\r", "\r").replace("\\n", "\n")
1286
898
  return [line.strip() for line in text.split(actual_sep) if line.strip()]
1287
899
 
1288
900
  # =========================================================================
1289
- # Proxy Users Management (Sub-accounts)
901
+ # Proxy Users Management
1290
902
  # =========================================================================
1291
903
 
1292
904
  def list_proxy_users(
1293
905
  self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
1294
906
  ) -> ProxyUserList:
1295
- """List all proxy sub-accounts.
1296
-
1297
- Args:
1298
- proxy_type: Proxy product type.
1299
-
1300
- Returns:
1301
- ProxyUserList with user information.
1302
- """
1303
907
  self._require_public_credentials()
1304
908
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1305
909
  params = {
@@ -1324,23 +928,9 @@ class ThordataClient:
1324
928
  traffic_limit: int = 0,
1325
929
  status: bool = True,
1326
930
  ) -> dict[str, Any]:
1327
- """Create a new proxy sub-account.
1328
-
1329
- Args:
1330
- username: Sub-account username.
1331
- password: Sub-account password.
1332
- proxy_type: Proxy product type.
1333
- traffic_limit: Traffic limit in MB (0 = unlimited).
1334
- status: Enable or disable the account.
1335
-
1336
- Returns:
1337
- API response data.
1338
- """
1339
931
  self._require_public_credentials()
1340
932
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1341
- headers = build_public_api_headers(
1342
- self.public_token or "", self.public_key or ""
1343
- )
933
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
1344
934
  payload = {
1345
935
  "proxy_type": str(pt),
1346
936
  "username": username,
@@ -1363,41 +953,36 @@ class ThordataClient:
1363
953
  def update_proxy_user(
1364
954
  self,
1365
955
  username: str,
1366
- password: str, # Added password as required argument
956
+ password: str,
1367
957
  traffic_limit: int | None = None,
1368
958
  status: bool | None = None,
1369
959
  proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
960
+ new_username: str | None = None, # Added optional new_username
1370
961
  ) -> dict[str, Any]:
1371
962
  """
1372
- Update an existing proxy user's settings.
1373
-
1374
- Note: Password is required by the API even if not changing it.
1375
-
1376
- Args:
1377
- username: The sub-account username.
1378
- password: The sub-account password (required for update).
1379
- traffic_limit: New traffic limit in MB (0 for unlimited). None to keep unchanged.
1380
- status: New status (True=enabled, False=disabled). None to keep unchanged.
1381
- proxy_type: Proxy product type.
1382
-
1383
- Returns:
1384
- API response data.
963
+ Update a proxy user.
964
+ Note: API requires 'new_' prefixed fields and ALL are required.
1385
965
  """
1386
966
  self._require_public_credentials()
1387
967
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1388
- headers = build_public_api_headers(
1389
- self.public_token or "", self.public_key or ""
1390
- )
968
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
969
+
970
+ # Defaults
971
+ limit_val = str(traffic_limit) if traffic_limit is not None else "0"
972
+ status_val = "true" if (status is None or status) else "false"
1391
973
 
974
+ # If new_username is not provided, keep the old one (API requires new_username field)
975
+ target_username = new_username or username
976
+
977
+ # Mapping to API specific field names (new_...)
1392
978
  payload = {
1393
979
  "proxy_type": str(pt),
1394
- "username": username,
1395
- "password": password, # Include password
980
+ "username": username, # Who to update
981
+ "new_username": target_username, # Required field
982
+ "new_password": password, # Required field
983
+ "new_traffic_limit": limit_val, # Required field
984
+ "new_status": status_val, # Required field
1396
985
  }
1397
- if traffic_limit is not None:
1398
- payload["traffic_limit"] = str(traffic_limit)
1399
- if status is not None:
1400
- payload["status"] = "true" if status else "false"
1401
986
 
1402
987
  response = self._api_request_with_retry(
1403
988
  "POST",
@@ -1405,7 +990,6 @@ class ThordataClient:
1405
990
  data=payload,
1406
991
  headers=headers,
1407
992
  )
1408
- response.raise_for_status()
1409
993
  data = response.json()
1410
994
  if data.get("code") != 200:
1411
995
  raise_for_code("Update user failed", code=data.get("code"), payload=data)
@@ -1416,26 +1000,10 @@ class ThordataClient:
1416
1000
  username: str,
1417
1001
  proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1418
1002
  ) -> dict[str, Any]:
1419
- """Delete a proxy user.
1420
-
1421
- Args:
1422
- username: The sub-account username.
1423
- proxy_type: Proxy product type.
1424
-
1425
- Returns:
1426
- API response data.
1427
- """
1428
1003
  self._require_public_credentials()
1429
1004
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1430
- headers = build_public_api_headers(
1431
- self.public_token or "", self.public_key or ""
1432
- )
1433
-
1434
- payload = {
1435
- "proxy_type": str(pt),
1436
- "username": username,
1437
- }
1438
-
1005
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
1006
+ payload = {"proxy_type": str(pt), "username": username}
1439
1007
  response = self._api_request_with_retry(
1440
1008
  "POST",
1441
1009
  f"{self._proxy_users_url}/delete-user",
@@ -1458,21 +1026,9 @@ class ThordataClient:
1458
1026
  proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1459
1027
  status: bool = True,
1460
1028
  ) -> dict[str, Any]:
1461
- """Add an IP to the whitelist.
1462
-
1463
- Args:
1464
- ip: IP address to whitelist.
1465
- proxy_type: Proxy product type.
1466
- status: Enable or disable the whitelist entry.
1467
-
1468
- Returns:
1469
- API response data.
1470
- """
1471
1029
  self._require_public_credentials()
1472
1030
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1473
- headers = build_public_api_headers(
1474
- self.public_token or "", self.public_key or ""
1475
- )
1031
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
1476
1032
  payload = {
1477
1033
  "proxy_type": str(pt),
1478
1034
  "ip": ip,
@@ -1494,24 +1050,10 @@ class ThordataClient:
1494
1050
  ip: str,
1495
1051
  proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1496
1052
  ) -> dict[str, Any]:
1497
- """Delete an IP from the whitelist.
1498
-
1499
- Args:
1500
- ip: The IP address to remove.
1501
- proxy_type: Proxy product type.
1502
-
1503
- Returns:
1504
- API response data.
1505
- """
1506
1053
  self._require_public_credentials()
1507
1054
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1508
- headers = build_public_api_headers(
1509
- self.public_token or "", self.public_key or ""
1510
- )
1511
- payload = {
1512
- "proxy_type": str(pt),
1513
- "ip": ip,
1514
- }
1055
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
1056
+ payload = {"proxy_type": str(pt), "ip": ip}
1515
1057
  response = self._api_request_with_retry(
1516
1058
  "POST", f"{self._whitelist_url}/delete-ip", data=payload, headers=headers
1517
1059
  )
@@ -1527,14 +1069,6 @@ class ThordataClient:
1527
1069
  self,
1528
1070
  proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1529
1071
  ) -> list[str]:
1530
- """List all whitelisted IPs.
1531
-
1532
- Args:
1533
- proxy_type: Proxy product type.
1534
-
1535
- Returns:
1536
- List of IP address strings.
1537
- """
1538
1072
  self._require_public_credentials()
1539
1073
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1540
1074
  params = {
@@ -1552,7 +1086,6 @@ class ThordataClient:
1552
1086
  "List whitelist IPs failed", code=data.get("code"), payload=data
1553
1087
  )
1554
1088
 
1555
- # API usually returns {"data": ["1.1.1.1", ...]} OR {"data": [{"ip": "..."}]}
1556
1089
  items = data.get("data", []) or []
1557
1090
  result = []
1558
1091
  for item in items:
@@ -1568,17 +1101,27 @@ class ThordataClient:
1568
1101
  # Locations & ASN Methods
1569
1102
  # =========================================================================
1570
1103
 
1104
+ def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
1105
+ self._require_public_credentials()
1106
+ params = {"token": self.public_token, "key": self.public_key}
1107
+ for k, v in kwargs.items():
1108
+ params[k] = str(v)
1109
+
1110
+ response = self._api_request_with_retry(
1111
+ "GET", f"{self._locations_base_url}/{endpoint}", params=params
1112
+ )
1113
+ response.raise_for_status()
1114
+ data = response.json()
1115
+
1116
+ if isinstance(data, dict):
1117
+ if data.get("code") != 200:
1118
+ raise RuntimeError(f"Locations error: {data.get('msg')}")
1119
+ return data.get("data") or []
1120
+ return data if isinstance(data, list) else []
1121
+
1571
1122
  def list_countries(
1572
1123
  self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
1573
1124
  ) -> list[dict[str, Any]]:
1574
- """List available countries for proxy locations.
1575
-
1576
- Args:
1577
- proxy_type: Proxy product type.
1578
-
1579
- Returns:
1580
- List of country dictionaries.
1581
- """
1582
1125
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1583
1126
  return self._get_locations("countries", proxy_type=pt)
1584
1127
 
@@ -1587,15 +1130,6 @@ class ThordataClient:
1587
1130
  country_code: str,
1588
1131
  proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1589
1132
  ) -> list[dict[str, Any]]:
1590
- """List available states/provinces for a country.
1591
-
1592
- Args:
1593
- country_code: Country code (e.g., "US", "GB").
1594
- proxy_type: Proxy product type.
1595
-
1596
- Returns:
1597
- List of state dictionaries.
1598
- """
1599
1133
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1600
1134
  return self._get_locations("states", proxy_type=pt, country_code=country_code)
1601
1135
 
@@ -1605,16 +1139,6 @@ class ThordataClient:
1605
1139
  state_code: str | None = None,
1606
1140
  proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1607
1141
  ) -> list[dict[str, Any]]:
1608
- """List available cities for a country/state.
1609
-
1610
- Args:
1611
- country_code: Country code.
1612
- state_code: State code (optional).
1613
- proxy_type: Proxy product type.
1614
-
1615
- Returns:
1616
- List of city dictionaries.
1617
- """
1618
1142
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1619
1143
  kwargs = {"proxy_type": pt, "country_code": country_code}
1620
1144
  if state_code:
@@ -1626,15 +1150,6 @@ class ThordataClient:
1626
1150
  country_code: str,
1627
1151
  proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1628
1152
  ) -> list[dict[str, Any]]:
1629
- """List available ASN numbers for a country.
1630
-
1631
- Args:
1632
- country_code: Country code.
1633
- proxy_type: Proxy product type.
1634
-
1635
- Returns:
1636
- List of ASN dictionaries.
1637
- """
1638
1153
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1639
1154
  return self._get_locations("asn", proxy_type=pt, country_code=country_code)
1640
1155
 
@@ -1643,14 +1158,6 @@ class ThordataClient:
1643
1158
  # =========================================================================
1644
1159
 
1645
1160
  def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
1646
- """List purchased proxy servers (ISP/Datacenter).
1647
-
1648
- Args:
1649
- proxy_type: Proxy type (1=ISP, 2=Datacenter).
1650
-
1651
- Returns:
1652
- List of ProxyServer objects.
1653
- """
1654
1161
  self._require_public_credentials()
1655
1162
  params = {
1656
1163
  "token": self.public_token,
@@ -1672,21 +1179,11 @@ class ThordataClient:
1672
1179
  server_list = data.get("data", data.get("list", []))
1673
1180
  elif isinstance(data, list):
1674
1181
  server_list = data
1675
-
1676
1182
  return [ProxyServer.from_dict(s) for s in server_list]
1677
1183
 
1678
1184
  def get_proxy_expiration(
1679
1185
  self, ips: str | list[str], proxy_type: int
1680
1186
  ) -> dict[str, Any]:
1681
- """Get expiration time for proxy IPs.
1682
-
1683
- Args:
1684
- ips: Single IP or comma-separated list of IPs.
1685
- proxy_type: Proxy type (1=ISP, 2=Datacenter).
1686
-
1687
- Returns:
1688
- Dictionary with IP expiration times.
1689
- """
1690
1187
  self._require_public_credentials()
1691
1188
  if isinstance(ips, list):
1692
1189
  ips = ",".join(ips)
@@ -1706,98 +1203,12 @@ class ThordataClient:
1706
1203
  return data.get("data", data)
1707
1204
 
1708
1205
  # =========================================================================
1709
- # Internal Request Helpers
1206
+ # Helpers needed for compatibility
1710
1207
  # =========================================================================
1711
1208
 
1712
- def _api_request_with_retry(
1713
- self,
1714
- method: str,
1715
- url: str,
1716
- *,
1717
- data: dict[str, Any] | None = None,
1718
- headers: dict[str, str] | None = None,
1719
- params: dict[str, Any] | None = None,
1720
- ) -> requests.Response:
1721
- """Make an API request with retry logic.
1722
-
1723
- Args:
1724
- method: HTTP method.
1725
- url: Request URL.
1726
- data: Request body data.
1727
- headers: Request headers.
1728
- query_params: Query string parameters.
1729
-
1730
- Returns:
1731
- Response object.
1732
- """
1733
-
1734
- @with_retry(self._retry_config)
1735
- def _do_request() -> requests.Response:
1736
- return self._api_session.request(
1737
- method,
1738
- url,
1739
- data=data,
1740
- headers=headers,
1741
- params=params,
1742
- timeout=self._api_timeout,
1743
- )
1744
-
1745
- try:
1746
- return _do_request()
1747
- except requests.Timeout as e:
1748
- raise ThordataTimeoutError(
1749
- f"API request timed out: {e}", original_error=e
1750
- ) from e
1751
- except requests.RequestException as e:
1752
- raise ThordataNetworkError(
1753
- f"API request failed: {e}", original_error=e
1754
- ) from e
1755
-
1756
- def _require_public_credentials(self) -> None:
1757
- """Check that public credentials are set."""
1758
- if not self.public_token or not self.public_key:
1759
- raise ThordataConfigError(
1760
- "public_token and public_key are required for this operation."
1761
- )
1762
-
1763
- def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
1764
- """Internal method to fetch location data.
1765
-
1766
- Args:
1767
- endpoint: Location endpoint (countries, states, cities, asn).
1768
- **kwargs: Query parameters.
1769
-
1770
- Returns:
1771
- List of location dictionaries.
1772
- """
1773
- self._require_public_credentials()
1774
- params = {"token": self.public_token, "key": self.public_key}
1775
- for k, v in kwargs.items():
1776
- params[k] = str(v)
1777
-
1778
- response = self._api_request_with_retry(
1779
- "GET", f"{self._locations_base_url}/{endpoint}", params=params
1780
- )
1781
- response.raise_for_status()
1782
- data = response.json()
1783
- if isinstance(data, dict):
1784
- if data.get("code") != 200:
1785
- raise RuntimeError(f"Locations error: {data.get('msg')}")
1786
- return data.get("data") or []
1787
- return data if isinstance(data, list) else []
1788
-
1789
1209
  def _process_universal_response(
1790
1210
  self, response: requests.Response, output_format: str
1791
1211
  ) -> str | bytes:
1792
- """Process Universal API response.
1793
-
1794
- Args:
1795
- response: Response object.
1796
- output_format: Expected output format.
1797
-
1798
- Returns:
1799
- Processed content.
1800
- """
1801
1212
  try:
1802
1213
  resp_json = response.json()
1803
1214
  except ValueError:
@@ -1813,11 +1224,31 @@ class ThordataClient:
1813
1224
  return resp_json["html"]
1814
1225
  if "png" in resp_json:
1815
1226
  return decode_base64_image(resp_json["png"])
1816
-
1817
1227
  return str(resp_json)
1818
1228
 
1229
+ def get_browser_connection_url(
1230
+ self, username: str | None = None, password: str | None = None
1231
+ ) -> str:
1232
+ # User requested modification: ONLY use browser credentials, do not fall back to residential.
1233
+ user = username or os.getenv("THORDATA_BROWSER_USERNAME")
1234
+ pwd = password or os.getenv("THORDATA_BROWSER_PASSWORD")
1235
+
1236
+ if not user or not pwd:
1237
+ raise ThordataConfigError(
1238
+ "Browser credentials missing. Set THORDATA_BROWSER_USERNAME/PASSWORD or pass arguments."
1239
+ )
1240
+ prefix = "td-customer-"
1241
+ final_user = f"{prefix}{user}" if not user.startswith(prefix) else user
1242
+
1243
+ from urllib.parse import quote
1244
+
1245
+ safe_user = quote(final_user, safe="")
1246
+ safe_pass = quote(pwd, safe="")
1247
+
1248
+ return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"
1249
+
1819
1250
  # =========================================================================
1820
- # Proxy Implementation Details
1251
+ # Proxy Internal Logic
1821
1252
  # =========================================================================
1822
1253
 
1823
1254
  def _proxy_verb(
@@ -1828,17 +1259,11 @@ class ThordataClient:
1828
1259
  timeout: int | None,
1829
1260
  **kwargs: Any,
1830
1261
  ) -> requests.Response:
1831
- """Internal method for proxy requests."""
1832
1262
  timeout = timeout or self._default_timeout
1833
-
1834
1263
  if proxy_config is None:
1835
1264
  proxy_config = self._get_default_proxy_config_from_env()
1836
-
1837
1265
  if proxy_config is None:
1838
- raise ThordataConfigError(
1839
- "Proxy credentials are missing. "
1840
- "Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
1841
- )
1266
+ raise ThordataConfigError("Proxy credentials are missing.")
1842
1267
 
1843
1268
  kwargs.pop("proxies", None)
1844
1269
 
@@ -1847,8 +1272,8 @@ class ThordataClient:
1847
1272
  return self._proxy_request_with_proxy_manager(
1848
1273
  method,
1849
1274
  url,
1850
- proxy_config=proxy_config,
1851
- timeout=timeout,
1275
+ proxy_config=cast(ProxyConfig, proxy_config),
1276
+ timeout=cast(int, timeout),
1852
1277
  headers=kwargs.pop("headers", None),
1853
1278
  params=kwargs.pop("params", None),
1854
1279
  data=kwargs.pop("data", None),
@@ -1856,15 +1281,10 @@ class ThordataClient:
1856
1281
 
1857
1282
  try:
1858
1283
  return _do()
1859
- except requests.Timeout as e:
1860
- raise ThordataTimeoutError(
1861
- f"Request timed out: {e}", original_error=e
1862
- ) from e
1863
1284
  except Exception as e:
1864
1285
  raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
1865
1286
 
1866
1287
  def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
1867
- """Build a stable cache key for ProxyManager instances."""
1868
1288
  if not userpass:
1869
1289
  return proxy_endpoint
1870
1290
  h = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
@@ -1877,43 +1297,34 @@ class ThordataClient:
1877
1297
  cache_key: str,
1878
1298
  proxy_headers: dict[str, str] | None = None,
1879
1299
  ) -> urllib3.PoolManager:
1880
- """Get or create a ProxyManager for the given proxy URL (Pooled)."""
1881
1300
  cached = self._proxy_managers.get(cache_key)
1882
1301
  if cached is not None:
1883
1302
  return cached
1884
1303
 
1885
1304
  if proxy_url.startswith(("socks5://", "socks5h://", "socks4://", "socks4a://")):
1886
- try:
1887
- from urllib3.contrib.socks import SOCKSProxyManager
1888
- except Exception as e:
1305
+ if not HAS_PYSOCKS:
1889
1306
  raise ThordataConfigError(
1890
- "SOCKS proxy requested but SOCKS dependencies are missing. "
1891
- "Install: pip install 'urllib3[socks]' or pip install PySocks"
1892
- ) from e
1893
-
1894
- pm_socks = SOCKSProxyManager(
1895
- proxy_url,
1896
- num_pools=10,
1897
- maxsize=10,
1307
+ "SOCKS support requires PySocks/urllib3[socks]"
1308
+ )
1309
+ from urllib3.contrib.socks import SOCKSProxyManager
1310
+
1311
+ pm = cast(
1312
+ urllib3.PoolManager,
1313
+ SOCKSProxyManager(proxy_url, num_pools=10, maxsize=10),
1898
1314
  )
1899
- pm = cast(urllib3.PoolManager, pm_socks)
1900
1315
  self._proxy_managers[cache_key] = pm
1901
1316
  return pm
1902
1317
 
1903
- # HTTP/HTTPS proxies
1904
- proxy_ssl_context = None
1905
- if proxy_url.startswith("https://"):
1906
- proxy_ssl_context = ssl.create_default_context()
1907
-
1908
- pm_http = urllib3.ProxyManager(
1318
+ proxy_ssl_context = (
1319
+ ssl.create_default_context() if proxy_url.startswith("https://") else None
1320
+ )
1321
+ pm = urllib3.ProxyManager(
1909
1322
  proxy_url,
1910
1323
  proxy_headers=proxy_headers,
1911
1324
  proxy_ssl_context=proxy_ssl_context,
1912
1325
  num_pools=10,
1913
1326
  maxsize=10,
1914
1327
  )
1915
-
1916
- pm = cast(urllib3.PoolManager, pm_http)
1917
1328
  self._proxy_managers[cache_key] = pm
1918
1329
  return pm
1919
1330
 
@@ -1928,12 +1339,8 @@ class ThordataClient:
1928
1339
  params: dict[str, Any] | None = None,
1929
1340
  data: Any = None,
1930
1341
  ) -> requests.Response:
1931
- """Execute request through proxy, with optional upstream proxy support."""
1932
-
1933
- # Check for upstream proxy
1934
- upstream_config = _parse_upstream_proxy()
1935
-
1936
- if upstream_config:
1342
+ upstream = _parse_upstream_proxy()
1343
+ if upstream:
1937
1344
  return self._proxy_request_with_upstream(
1938
1345
  method,
1939
1346
  url,
@@ -1942,41 +1349,30 @@ class ThordataClient:
1942
1349
  headers=headers,
1943
1350
  params=params,
1944
1351
  data=data,
1945
- upstream_config=upstream_config,
1352
+ upstream_config=upstream,
1946
1353
  )
1947
1354
 
1948
- # Original implementation (no upstream proxy)
1949
1355
  req = requests.Request(method=method.upper(), url=url, params=params)
1950
1356
  prepped = self._proxy_session.prepare_request(req)
1951
1357
  final_url = prepped.url or url
1952
1358
 
1953
1359
  proxy_endpoint = proxy_config.build_proxy_endpoint()
1954
- is_socks = proxy_endpoint.startswith(
1955
- ("socks5://", "socks5h://", "socks4://", "socks4a://")
1956
- )
1360
+ is_socks = proxy_endpoint.startswith(("socks",))
1957
1361
 
1958
1362
  if is_socks:
1959
1363
  proxy_url_for_manager = proxy_config.build_proxy_url()
1960
- userpass = proxy_config.build_proxy_basic_auth()
1961
- cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
1962
-
1963
- pm = self._get_proxy_manager(
1964
- proxy_url_for_manager,
1965
- cache_key=cache_key,
1966
- proxy_headers=None,
1967
- )
1364
+ cache_key = proxy_url_for_manager
1365
+ pm = self._get_proxy_manager(proxy_url_for_manager, cache_key=cache_key)
1366
+ req_headers = dict(headers or {})
1968
1367
  else:
1969
1368
  userpass = proxy_config.build_proxy_basic_auth()
1970
1369
  proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
1971
1370
  cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
1972
-
1973
1371
  pm = self._get_proxy_manager(
1974
- proxy_endpoint,
1975
- cache_key=cache_key,
1976
- proxy_headers=dict(proxy_headers),
1372
+ proxy_endpoint, cache_key=cache_key, proxy_headers=dict(proxy_headers)
1977
1373
  )
1374
+ req_headers = dict(headers or {})
1978
1375
 
1979
- req_headers = dict(headers or {})
1980
1376
  body = None
1981
1377
  if data is not None:
1982
1378
  if isinstance(data, dict):
@@ -1998,16 +1394,12 @@ class ThordataClient:
1998
1394
  )
1999
1395
 
2000
1396
  r = requests.Response()
2001
- r.status_code = int(getattr(http_resp, "status", 0) or 0)
1397
+ r.status_code = int(getattr(http_resp, "status", 0))
2002
1398
  r._content = http_resp.data or b""
2003
1399
  r.url = final_url
2004
1400
  r.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
2005
1401
  return r
2006
1402
 
2007
- # =========================================================================
2008
- # Upstream Proxy Support (Proxy Chaining)
2009
- # =========================================================================
2010
-
2011
1403
  def _proxy_request_with_upstream(
2012
1404
  self,
2013
1405
  method: str,
@@ -2020,12 +1412,8 @@ class ThordataClient:
2020
1412
  data: Any = None,
2021
1413
  upstream_config: dict[str, Any],
2022
1414
  ) -> requests.Response:
2023
- """Execute request through proxy chain: Upstream -> Thordata -> Target."""
2024
1415
  if not HAS_PYSOCKS:
2025
- raise ThordataConfigError(
2026
- "PySocks is required for upstream proxy support. "
2027
- "Install with: pip install PySocks"
2028
- )
1416
+ raise ThordataConfigError("PySocks required for upstream proxy support.")
2029
1417
 
2030
1418
  req = requests.Request(method=method.upper(), url=url, params=params)
2031
1419
  prepped = self._proxy_session.prepare_request(req)
@@ -2036,370 +1424,141 @@ class ThordataClient:
2036
1424
  target_port = parsed_target.port or (
2037
1425
  443 if parsed_target.scheme == "https" else 80
2038
1426
  )
2039
- target_is_https = parsed_target.scheme == "https"
2040
1427
 
2041
- protocol = proxy_config.protocol.lower()
2042
- if protocol == "socks5":
2043
- protocol = "socks5h"
2044
-
2045
- thordata_host = proxy_config.host or ""
1428
+ thordata_host = proxy_config.host or "pr.thordata.net"
2046
1429
  thordata_port = proxy_config.port or 9999
2047
- thordata_username = proxy_config.build_username()
2048
- thordata_password = proxy_config.password
2049
-
2050
- socket_factory = _UpstreamProxySocketFactory(upstream_config)
1430
+ thordata_user = proxy_config.build_username()
1431
+ thordata_pass = proxy_config.password
2051
1432
 
2052
- logger.debug(
2053
- f"Proxy chain: upstream({upstream_config['host']}:{upstream_config['port']}) "
2054
- f"-> thordata({protocol}://{thordata_host}:{thordata_port}) "
2055
- f"-> target({target_host}:{target_port})"
2056
- )
2057
-
2058
- raw_sock = socket_factory.create_connection(
1433
+ # 1. Connect to Upstream -> Thordata Node
1434
+ factory = UpstreamProxySocketFactory(upstream_config)
1435
+ raw_sock = factory.create_connection(
2059
1436
  (thordata_host, thordata_port),
2060
1437
  timeout=float(timeout),
2061
1438
  )
2062
1439
 
2063
1440
  try:
2064
- if protocol.startswith("socks"):
2065
- sock = self._socks5_handshake(
2066
- raw_sock,
2067
- target_host,
2068
- target_port,
2069
- thordata_username,
2070
- thordata_password,
2071
- )
2072
- if target_is_https:
2073
- context = ssl.create_default_context()
2074
- sock = context.wrap_socket(sock, server_hostname=target_host)
2075
-
2076
- elif protocol == "https":
2077
- proxy_context = ssl.create_default_context()
2078
- proxy_ssl_sock = proxy_context.wrap_socket(
2079
- raw_sock, server_hostname=thordata_host
2080
- )
1441
+ protocol = proxy_config.protocol.lower().replace("socks5", "socks5h")
2081
1442
 
2082
- self._send_connect_request(
2083
- proxy_ssl_sock,
2084
- target_host,
2085
- target_port,
2086
- thordata_username,
2087
- thordata_password,
2088
- )
2089
-
2090
- if target_is_https:
2091
- # FIX: Add type ignore for MyPy because _TLSInTLSSocket is duck-typed as socket
2092
- sock = self._create_tls_in_tls_socket(
2093
- proxy_ssl_sock, target_host, timeout
2094
- ) # type: ignore[assignment]
2095
- else:
2096
- sock = proxy_ssl_sock
2097
-
2098
- else: # HTTP proxy
2099
- self._send_connect_request(
2100
- raw_sock,
2101
- target_host,
2102
- target_port,
2103
- thordata_username,
2104
- thordata_password,
1443
+ # 2. Handshake with Thordata
1444
+ if protocol.startswith("socks"):
1445
+ sock = socks5_handshake(
1446
+ raw_sock, target_host, target_port, thordata_user, thordata_pass
2105
1447
  )
2106
-
2107
- if target_is_https:
2108
- context = ssl.create_default_context()
2109
- sock = context.wrap_socket(raw_sock, server_hostname=target_host)
1448
+ if parsed_target.scheme == "https":
1449
+ ctx = ssl.create_default_context()
1450
+ sock = ctx.wrap_socket(sock, server_hostname=target_host)
1451
+ else:
1452
+ # HTTP/HTTPS Tunnel
1453
+ if protocol == "https":
1454
+ ctx = ssl.create_default_context()
1455
+ sock = ctx.wrap_socket(raw_sock, server_hostname=thordata_host)
2110
1456
  else:
2111
1457
  sock = raw_sock
2112
1458
 
2113
- return self._send_http_request(
1459
+ # CONNECT to Thordata
1460
+ connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
1461
+ connect_req += f"Host: {target_host}:{target_port}\r\n"
1462
+ auth = base64.b64encode(
1463
+ f"{thordata_user}:{thordata_pass}".encode()
1464
+ ).decode()
1465
+ connect_req += f"Proxy-Authorization: Basic {auth}\r\n\r\n"
1466
+ sock.sendall(connect_req.encode())
1467
+
1468
+ resp = b""
1469
+ while b"\r\n\r\n" not in resp:
1470
+ resp += sock.recv(1024)
1471
+ if b"200" not in resp.split(b"\r\n")[0]:
1472
+ raise ConnectionError("Thordata CONNECT failed")
1473
+
1474
+ # 3. If Target is HTTPS, wrap TLS inside the tunnel
1475
+ if parsed_target.scheme == "https":
1476
+ if isinstance(sock, ssl.SSLSocket):
1477
+ sock = cast(
1478
+ socket.socket,
1479
+ create_tls_in_tls(sock, target_host, float(timeout)),
1480
+ )
1481
+ else:
1482
+ ctx = ssl.create_default_context()
1483
+ sock = ctx.wrap_socket(sock, server_hostname=target_host)
1484
+
1485
+ # 4. Send actual Request
1486
+ return self._send_http_via_socket(
2114
1487
  sock, method, parsed_target, headers, data, final_url, timeout
2115
1488
  )
2116
1489
 
2117
- finally:
2118
- with contextlib.suppress(Exception):
2119
- raw_sock.close()
2120
-
2121
- def _send_connect_request(
2122
- self,
2123
- sock: socket.socket,
2124
- target_host: str,
2125
- target_port: int,
2126
- proxy_username: str,
2127
- proxy_password: str,
2128
- ) -> None:
2129
- """Send HTTP CONNECT request to proxy and verify response."""
2130
- connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
2131
- connect_req += f"Host: {target_host}:{target_port}\r\n"
2132
-
2133
- credentials = f"{proxy_username}:{proxy_password}"
2134
- encoded = base64.b64encode(credentials.encode()).decode()
2135
- connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
2136
- connect_req += "\r\n"
2137
-
2138
- sock.sendall(connect_req.encode())
2139
-
2140
- response = b""
2141
- while b"\r\n\r\n" not in response:
2142
- chunk = sock.recv(4096)
2143
- if not chunk:
2144
- raise ConnectionError("Proxy closed connection during CONNECT")
2145
- response += chunk
2146
-
2147
- status_line = response.split(b"\r\n")[0].decode()
2148
- if "200" not in status_line:
2149
- raise ConnectionError(f"Proxy CONNECT failed: {status_line}")
2150
-
2151
- def _create_tls_in_tls_socket(
2152
- self,
2153
- outer_ssl_sock: ssl.SSLSocket,
2154
- hostname: str,
2155
- timeout: int,
2156
- ) -> _TLSInTLSSocket:
2157
- """Create a TLS connection over an existing TLS connection."""
2158
- context = ssl.create_default_context()
2159
-
2160
- incoming = ssl.MemoryBIO()
2161
- outgoing = ssl.MemoryBIO()
2162
-
2163
- ssl_obj = context.wrap_bio(incoming, outgoing, server_hostname=hostname)
2164
-
2165
- while True:
2166
- try:
2167
- ssl_obj.do_handshake()
2168
- break
2169
- except ssl.SSLWantReadError:
2170
- data_to_send = outgoing.read()
2171
- if data_to_send:
2172
- outer_ssl_sock.sendall(data_to_send)
1490
+ except Exception:
1491
+ raw_sock.close()
1492
+ raise
2173
1493
 
2174
- outer_ssl_sock.settimeout(float(timeout))
2175
- try:
2176
- received = outer_ssl_sock.recv(8192)
2177
- if not received:
2178
- raise ConnectionError("Connection closed during TLS handshake")
2179
- incoming.write(received)
2180
- except socket.timeout as e:
2181
- raise ConnectionError("Timeout during TLS handshake") from e
2182
- except ssl.SSLWantWriteError:
2183
- data_to_send = outgoing.read()
2184
- if data_to_send:
2185
- outer_ssl_sock.sendall(data_to_send)
2186
-
2187
- data_to_send = outgoing.read()
2188
- if data_to_send:
2189
- outer_ssl_sock.sendall(data_to_send)
2190
-
2191
- return _TLSInTLSSocket(outer_ssl_sock, ssl_obj, incoming, outgoing)
2192
-
2193
- def _send_http_request(
1494
+ def _send_http_via_socket(
2194
1495
  self,
2195
- sock: socket.socket | ssl.SSLSocket | Any,
1496
+ sock: socket.socket | Any, # Fix for TLSInTLSSocket typing issue
2196
1497
  method: str,
2197
- parsed_url: Any,
2198
- headers: dict[str, str] | None,
1498
+ parsed: Any,
1499
+ headers: Any,
2199
1500
  data: Any,
2200
1501
  final_url: str,
2201
1502
  timeout: int,
2202
1503
  ) -> requests.Response:
2203
- """Send HTTP request over established connection and parse response."""
2204
- target_host = parsed_url.hostname
2205
-
2206
1504
  req_headers = dict(headers or {})
2207
- req_headers.setdefault("Host", target_host)
2208
- req_headers.setdefault("User-Agent", build_user_agent(_sdk_version, "requests"))
1505
+ req_headers.setdefault("Host", parsed.hostname)
1506
+ req_headers.setdefault("User-Agent", "python-thordata-sdk")
2209
1507
  req_headers.setdefault("Connection", "close")
2210
1508
 
2211
- path = parsed_url.path or "/"
2212
- if parsed_url.query:
2213
- path += f"?{parsed_url.query}"
1509
+ path = parsed.path or "/"
1510
+ if parsed.query:
1511
+ path += f"?{parsed.query}"
2214
1512
 
2215
- http_req = f"{method.upper()} {path} HTTP/1.1\r\n"
1513
+ msg = f"{method} {path} HTTP/1.1\r\n"
2216
1514
  for k, v in req_headers.items():
2217
- http_req += f"{k}: {v}\r\n"
1515
+ msg += f"{k}: {v}\r\n"
2218
1516
 
2219
- body = None
2220
- if data is not None:
1517
+ body = b""
1518
+ if data:
2221
1519
  if isinstance(data, dict):
2222
- body = urlencode({k: str(v) for k, v in data.items()}).encode()
2223
- http_req += "Content-Type: application/x-www-form-urlencoded\r\n"
2224
- http_req += f"Content-Length: {len(body)}\r\n"
1520
+ body = urlencode(data).encode()
1521
+ msg += "Content-Type: application/x-www-form-urlencoded\r\n"
2225
1522
  elif isinstance(data, bytes):
2226
1523
  body = data
2227
- http_req += f"Content-Length: {len(body)}\r\n"
2228
1524
  else:
2229
1525
  body = str(data).encode()
2230
- http_req += f"Content-Length: {len(body)}\r\n"
2231
-
2232
- http_req += "\r\n"
2233
- sock.sendall(http_req.encode())
1526
+ msg += f"Content-Length: {len(body)}\r\n"
2234
1527
 
1528
+ msg += "\r\n"
1529
+ sock.sendall(msg.encode())
2235
1530
  if body:
2236
1531
  sock.sendall(body)
2237
1532
 
2238
- if hasattr(sock, "settimeout"):
2239
- sock.settimeout(float(timeout))
2240
-
2241
- response_data = b""
2242
- try:
2243
- while True:
2244
- chunk = sock.recv(8192)
1533
+ # Read Response
1534
+ resp_data = b""
1535
+ while True:
1536
+ try:
1537
+ chunk = sock.recv(4096)
2245
1538
  if not chunk:
2246
1539
  break
2247
- response_data += chunk
2248
- if b"\r\n\r\n" in response_data:
2249
- header_end = response_data.index(b"\r\n\r\n") + 4
2250
- headers_part = (
2251
- response_data[:header_end]
2252
- .decode("utf-8", errors="replace")
2253
- .lower()
2254
- )
2255
- if "content-length:" in headers_part:
2256
- for line in headers_part.split("\r\n"):
2257
- if line.startswith("content-length:"):
2258
- content_length = int(line.split(":")[1].strip())
2259
- if len(response_data) >= header_end + content_length:
2260
- break
2261
- elif "transfer-encoding: chunked" not in headers_part:
2262
- break
2263
- except socket.timeout:
2264
- pass
2265
-
2266
- return self._parse_http_response(response_data, final_url)
2267
-
2268
- def _socks5_handshake(
2269
- self,
2270
- sock: socket.socket,
2271
- target_host: str,
2272
- target_port: int,
2273
- username: str | None,
2274
- password: str | None,
2275
- ) -> socket.socket:
2276
- """Perform SOCKS5 handshake over existing socket."""
2277
- if username and password:
2278
- sock.sendall(b"\x05\x02\x00\x02")
2279
- else:
2280
- sock.sendall(b"\x05\x01\x00")
2281
-
2282
- response = sock.recv(2)
2283
- if len(response) < 2:
2284
- raise ConnectionError("SOCKS5 handshake failed: incomplete response")
2285
-
2286
- if response[0] != 0x05:
2287
- raise ConnectionError(f"SOCKS5 version mismatch: {response[0]}")
2288
-
2289
- auth_method = response[1]
2290
-
2291
- if auth_method == 0x02:
2292
- if not username or not password:
2293
- raise ConnectionError(
2294
- "SOCKS5 server requires auth but no credentials provided"
2295
- )
2296
-
2297
- auth_req = bytes([0x01, len(username)]) + username.encode()
2298
- auth_req += bytes([len(password)]) + password.encode()
2299
- sock.sendall(auth_req)
2300
-
2301
- auth_resp = sock.recv(2)
2302
- if len(auth_resp) < 2 or auth_resp[1] != 0x00:
2303
- raise ConnectionError("SOCKS5 authentication failed")
2304
-
2305
- elif auth_method == 0xFF:
2306
- raise ConnectionError("SOCKS5 no acceptable auth method")
2307
-
2308
- connect_req = b"\x05\x01\x00\x03"
2309
- connect_req += bytes([len(target_host)]) + target_host.encode()
2310
- connect_req += target_port.to_bytes(2, "big")
2311
- sock.sendall(connect_req)
2312
-
2313
- resp = sock.recv(4)
2314
- if len(resp) < 4:
2315
- raise ConnectionError("SOCKS5 connect failed: incomplete response")
2316
-
2317
- if resp[1] != 0x00:
2318
- error_codes = {
2319
- 0x01: "General failure",
2320
- 0x02: "Connection not allowed",
2321
- 0x03: "Network unreachable",
2322
- 0x04: "Host unreachable",
2323
- 0x05: "Connection refused",
2324
- 0x06: "TTL expired",
2325
- 0x07: "Command not supported",
2326
- 0x08: "Address type not supported",
2327
- }
2328
- error_msg = error_codes.get(resp[1], f"Unknown error {resp[1]}")
2329
- raise ConnectionError(f"SOCKS5 connect failed: {error_msg}")
2330
-
2331
- addr_type = resp[3]
2332
- if addr_type == 0x01:
2333
- sock.recv(4 + 2)
2334
- elif addr_type == 0x03:
2335
- domain_len = sock.recv(1)[0]
2336
- sock.recv(domain_len + 2)
2337
- elif addr_type == 0x04:
2338
- sock.recv(16 + 2)
2339
-
2340
- return sock
2341
-
2342
- def _parse_http_response(
2343
- self,
2344
- response_data: bytes,
2345
- url: str,
2346
- ) -> requests.Response:
2347
- """Parse raw HTTP response into requests.Response."""
2348
- if b"\r\n\r\n" in response_data:
2349
- header_data, body = response_data.split(b"\r\n\r\n", 1)
2350
- else:
2351
- header_data = response_data
2352
- body = b""
2353
-
2354
- header_lines = header_data.decode("utf-8", errors="replace").split("\r\n")
2355
-
2356
- status_line = header_lines[0] if header_lines else ""
2357
- parts = status_line.split(" ", 2)
2358
- status_code = int(parts[1]) if len(parts) > 1 else 0
2359
-
2360
- headers_dict = {}
2361
- for line in header_lines[1:]:
2362
- if ": " in line:
2363
- k, v = line.split(": ", 1)
2364
- headers_dict[k] = v
2365
-
2366
- if headers_dict.get("Transfer-Encoding", "").lower() == "chunked":
2367
- body = self._decode_chunked(body)
2368
-
2369
- r = requests.Response()
2370
- r.status_code = status_code
2371
- r._content = body
2372
- r.url = url
2373
- r.headers = CaseInsensitiveDict(headers_dict)
2374
- return r
2375
-
2376
- def _decode_chunked(self, data: bytes) -> bytes:
2377
- """Decode chunked transfer encoding."""
2378
- result = b""
2379
- while data:
2380
- if b"\r\n" not in data:
2381
- break
2382
- size_line, data = data.split(b"\r\n", 1)
2383
- try:
2384
- chunk_size = int(size_line.decode().strip(), 16)
2385
- except ValueError:
2386
- break
2387
-
2388
- if chunk_size == 0:
1540
+ resp_data += chunk
1541
+ except socket.timeout:
2389
1542
  break
2390
1543
 
2391
- result += data[:chunk_size]
2392
- data = data[chunk_size:]
2393
-
2394
- if data.startswith(b"\r\n"):
2395
- data = data[2:]
1544
+ if b"\r\n\r\n" in resp_data:
1545
+ head, content = resp_data.split(b"\r\n\r\n", 1)
1546
+ status_line = head.split(b"\r\n")[0].decode()
1547
+ try:
1548
+ status_code = int(status_line.split(" ")[1])
1549
+ except (ValueError, IndexError):
1550
+ status_code = 0
2396
1551
 
2397
- return result
1552
+ r = requests.Response()
1553
+ r.status_code = status_code
1554
+ r._content = content
1555
+ r.url = final_url
1556
+ return r
1557
+ raise ConnectionError("Empty response from socket")
2398
1558
 
2399
1559
  def _get_proxy_endpoint_overrides(
2400
1560
  self, product: ProxyProduct
2401
1561
  ) -> tuple[str | None, int | None, str]:
2402
- """Get proxy endpoint overrides from environment variables."""
2403
1562
  prefix = product.value.upper()
2404
1563
  host = os.getenv(f"THORDATA_{prefix}_PROXY_HOST") or os.getenv(
2405
1564
  "THORDATA_PROXY_HOST"
@@ -2410,13 +1569,12 @@ class ThordataClient:
2410
1569
  protocol = (
2411
1570
  os.getenv(f"THORDATA_{prefix}_PROXY_PROTOCOL")
2412
1571
  or os.getenv("THORDATA_PROXY_PROTOCOL")
2413
- or "https"
1572
+ or "http"
2414
1573
  )
2415
1574
  port = int(port_raw) if port_raw and port_raw.isdigit() else None
2416
1575
  return host or None, port, protocol
2417
1576
 
2418
1577
  def _get_default_proxy_config_from_env(self) -> ProxyConfig | None:
2419
- """Get proxy configuration from environment variables."""
2420
1578
  for prod in [
2421
1579
  ProxyProduct.RESIDENTIAL,
2422
1580
  ProxyProduct.DATACENTER,
@@ -2436,44 +1594,3 @@ class ThordataClient:
2436
1594
  protocol=proto,
2437
1595
  )
2438
1596
  return None
2439
-
2440
- def get_browser_connection_url(
2441
- self, username: str | None = None, password: str | None = None
2442
- ) -> str:
2443
- """
2444
- Generate the WebSocket URL for connecting to Scraping Browser.
2445
-
2446
- Args:
2447
- username: Proxy username (without 'td-customer-' prefix).
2448
- Defaults to THORDATA_BROWSER_USERNAME or THORDATA_RESIDENTIAL_USERNAME.
2449
- password: Proxy password.
2450
-
2451
- Returns:
2452
- WSS URL string suitable for playwright.connect_over_cdp().
2453
-
2454
- Raises:
2455
- ThordataConfigError: If credentials are missing.
2456
- """
2457
- user = (
2458
- username
2459
- or os.getenv("THORDATA_BROWSER_USERNAME")
2460
- or os.getenv("THORDATA_RESIDENTIAL_USERNAME")
2461
- )
2462
- pwd = (
2463
- password
2464
- or os.getenv("THORDATA_BROWSER_PASSWORD")
2465
- or os.getenv("THORDATA_RESIDENTIAL_PASSWORD")
2466
- )
2467
-
2468
- if not user or not pwd:
2469
- raise ThordataConfigError(
2470
- "Browser credentials missing. Set THORDATA_BROWSER_USERNAME/PASSWORD or pass arguments."
2471
- )
2472
- prefix = "td-customer-"
2473
- final_user = f"{prefix}{user}" if not user.startswith(prefix) else user
2474
-
2475
- # URL encode
2476
- safe_user = quote(final_user, safe="")
2477
- safe_pass = quote(pwd, safe="")
2478
-
2479
- return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"