thordata-sdk 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
thordata/client.py CHANGED
@@ -3,28 +3,11 @@ Synchronous client for the Thordata API.
 
 This module provides the main ThordataClient class for interacting with
 Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
-
-Example:
-    >>> from thordata import ThordataClient
-    >>>
-    >>> client = ThordataClient(
-    ...     scraper_token="your_token",
-    ...     public_token="your_public_token",
-    ...     public_key="your_public_key"
-    ... )
-    >>>
-    >>> # Use the proxy network
-    >>> response = client.get("https://httpbin.org/ip")
-    >>> print(response.json())
-    >>>
-    >>> # Search with SERP API
-    >>> results = client.serp_search("python tutorial", engine="google")
 """
 
 from __future__ import annotations
 
 import base64
-import contextlib
 import hashlib
 import logging
 import os
@@ -38,37 +21,42 @@ import requests
 import urllib3
 from requests.structures import CaseInsensitiveDict
 
-from .serp_engines import SerpNamespace
-
-try:
-    import socks
-
-    HAS_PYSOCKS = True
-except ImportError:
-    HAS_PYSOCKS = False
-
-from . import __version__ as _sdk_version
+# Import Legacy/Compat
 from ._utils import (
     build_auth_headers,
     build_builder_headers,
     build_public_api_headers,
-    build_user_agent,
     decode_base64_image,
     extract_error_message,
     parse_json_response,
 )
-from .enums import Engine, ProxyType
+
+# Import Core Components
+from .core.http_client import ThordataHttpSession
+from .core.tunnel import (
+    HAS_PYSOCKS,
+    UpstreamProxySocketFactory,
+    create_tls_in_tls,
+    parse_upstream_proxy,
+    socks5_handshake,
+)
+from .enums import Engine
 from .exceptions import (
     ThordataConfigError,
     ThordataNetworkError,
     ThordataTimeoutError,
     raise_for_code,
 )
-from .models import (
+from .retry import RetryConfig, with_retry
+from .serp_engines import SerpNamespace
+
+# Import Types (Modernized)
+from .types import (
     CommonSettings,
     ProxyConfig,
     ProxyProduct,
     ProxyServer,
+    ProxyType,
     ProxyUserList,
     ScraperTaskConfig,
     SerpRequest,
@@ -76,196 +64,17 @@ from .models import (
     UsageStatistics,
     VideoTaskConfig,
 )
-from .retry import RetryConfig, with_retry
+from .unlimited import UnlimitedNamespace
 
 logger = logging.getLogger(__name__)
 
-
 # =========================================================================
-# Upstream Proxy Support (for users behind firewall)
+# Internal Logic for Upstream Proxies
 # =========================================================================
 
 
 def _parse_upstream_proxy() -> dict[str, Any] | None:
-    """
-    Parse THORDATA_UPSTREAM_PROXY environment variable.
-
-    Supported formats:
-    - http://127.0.0.1:7897
-    - socks5://127.0.0.1:7897
-    - socks5://user:pass@127.0.0.1:7897
-
-    Returns:
-        Dict with proxy config or None if not set.
-    """
-    upstream_url = os.environ.get("THORDATA_UPSTREAM_PROXY", "").strip()
-    if not upstream_url:
-        return None
-
-    parsed = urlparse(upstream_url)
-    scheme = (parsed.scheme or "").lower()
-
-    if scheme not in ("http", "https", "socks5", "socks5h", "socks4"):
-        logger.warning(f"Unsupported upstream proxy scheme: {scheme}")
-        return None
-
-    return {
-        "scheme": scheme,
-        "host": parsed.hostname or "127.0.0.1",
-        "port": parsed.port or (1080 if scheme.startswith("socks") else 7897),
-        "username": parsed.username,
-        "password": parsed.password,
-    }
-
-
-class _UpstreamProxySocketFactory:
-    """
-    Socket factory that creates connections through an upstream proxy.
-    Used for proxy chaining when accessing Thordata from behind a firewall.
-    """
-
-    def __init__(self, upstream_config: dict[str, Any]):
-        self.config = upstream_config
-
-    def create_connection(
-        self,
-        address: tuple[str, int],
-        timeout: float | None = None,
-        source_address: tuple[str, int] | None = None,
-    ) -> socket.socket:
-        """Create a socket connection through the upstream proxy."""
-        scheme = self.config["scheme"]
-
-        if scheme.startswith("socks"):
-            return self._create_socks_connection(address, timeout)
-        else:
-            return self._create_http_tunnel(address, timeout)
-
-    def _create_socks_connection(
-        self,
-        address: tuple[str, int],
-        timeout: float | None = None,
-    ) -> socket.socket:
-        """Create connection through SOCKS proxy."""
-        if not HAS_PYSOCKS:
-            raise RuntimeError(
-                "PySocks is required for SOCKS upstream proxy. "
-                "Install with: pip install PySocks"
-            )
-
-        scheme = self.config["scheme"]
-        proxy_type = socks.SOCKS5 if "socks5" in scheme else socks.SOCKS4
-
-        sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
-        sock.set_proxy(
-            proxy_type,
-            self.config["host"],
-            self.config["port"],
-            rdns=True,
-            username=self.config.get("username"),
-            password=self.config.get("password"),
-        )
-
-        if timeout is not None:
-            sock.settimeout(timeout)
-
-        sock.connect(address)
-        return sock
-
-    def _create_http_tunnel(
-        self,
-        address: tuple[str, int],
-        timeout: float | None = None,
-    ) -> socket.socket:
-        """Create connection through HTTP CONNECT tunnel."""
-        # Connect to upstream proxy
-        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        if timeout is not None:
-            sock.settimeout(timeout)
-
-        sock.connect((self.config["host"], self.config["port"]))
-
-        # Build CONNECT request
-        target_host, target_port = address
-        connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
-        connect_req += f"Host: {target_host}:{target_port}\r\n"
-
-        # Add proxy auth if provided
-        if self.config.get("username"):
-            credentials = f"{self.config['username']}:{self.config.get('password', '')}"
-            encoded = base64.b64encode(credentials.encode()).decode()
-            connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
-
-        connect_req += "\r\n"
-
-        sock.sendall(connect_req.encode())
-
-        # Read response
-        response = b""
-        while b"\r\n\r\n" not in response:
-            chunk = sock.recv(1024)
-            if not chunk:
-                raise ConnectionError("Upstream proxy closed connection")
-            response += chunk
-
-        # Check status
-        status_line = response.split(b"\r\n")[0].decode()
-        if "200" not in status_line:
-            sock.close()
-            raise ConnectionError(f"Upstream proxy CONNECT failed: {status_line}")
-
-        return sock
-
-
-class _TLSInTLSSocket:
-    """
-    A socket-like wrapper for TLS-in-TLS connections.
-
-    Uses SSLObject + MemoryBIO to implement TLS over an existing TLS connection.
-    """
-
-    def __init__(
-        self,
-        outer_sock: ssl.SSLSocket,
-        ssl_obj: ssl.SSLObject,
-        incoming: ssl.MemoryBIO,
-        outgoing: ssl.MemoryBIO,
-    ):
-        self._outer = outer_sock
-        self._ssl = ssl_obj
-        self._incoming = incoming
-        self._outgoing = outgoing
-        self._timeout: float | None = None
-
-    def settimeout(self, timeout: float | None) -> None:
-        self._timeout = timeout
-        self._outer.settimeout(timeout)
-
-    def sendall(self, data: bytes) -> None:
-        """Send data through the inner TLS connection."""
-        self._ssl.write(data)
-        encrypted = self._outgoing.read()
-        if encrypted:
-            self._outer.sendall(encrypted)
-
-    def recv(self, bufsize: int) -> bytes:
-        """Receive data from the inner TLS connection."""
-        while True:
-            try:
-                return self._ssl.read(bufsize)
-            except ssl.SSLWantReadError:
-                self._outer.settimeout(self._timeout)
-                try:
-                    received = self._outer.recv(8192)
-                    if not received:
-                        return b""
-                    self._incoming.write(received)
-                except socket.timeout:
-                    return b""
-
-    def close(self) -> None:
-        with contextlib.suppress(Exception):
-            self._outer.close()
+    return parse_upstream_proxy()
 
 
 # =========================================================================
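Note that the `_parse_upstream_proxy()` shim above keeps the 1.3.0 entry point while delegating to the relocated `parse_upstream_proxy()` in `thordata.core.tunnel`. Proxy chaining is still driven entirely by the `THORDATA_UPSTREAM_PROXY` environment variable; a minimal sketch of how a caller behind a firewall would use it (the token value is a placeholder, not part of this diff):

    import os
    from thordata import ThordataClient

    # Supported schemes, per the removed 1.3.0 parser:
    # http, https, socks5, socks5h, socks4.
    os.environ["THORDATA_UPSTREAM_PROXY"] = "socks5://user:pass@127.0.0.1:7897"

    client = ThordataClient(scraper_token="your_token")
    # Proxy-network requests are now chained: upstream proxy -> Thordata -> target.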
@@ -274,6 +83,8 @@ class _TLSInTLSSocket:
 
 
 class ThordataClient:
+    """Main client for interacting with Thordata API services."""
+
     # API Endpoints
     BASE_URL = "https://scraperapi.thordata.com"
     UNIVERSAL_URL = "https://universalapi.thordata.com"
@@ -282,7 +93,7 @@ class ThordataClient:
 
     def __init__(
         self,
-        scraper_token: str | None = None,  # Change: Optional
+        scraper_token: str | None = None,
         public_token: str | None = None,
         public_key: str | None = None,
         proxy_host: str = "pr.thordata.net",
@@ -296,10 +107,6 @@ class ThordataClient:
         web_scraper_api_base_url: str | None = None,
         locations_base_url: str | None = None,
     ) -> None:
-        """Initialize the Thordata Client."""
-
-        self.serp = SerpNamespace(self)
-
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
@@ -316,17 +123,17 @@ class ThordataClient:
                 f"Invalid auth_mode: {auth_mode}. Must be 'bearer' or 'header_token'."
             )
 
+        # Initialize Core HTTP Client for API calls
+        self._http = ThordataHttpSession(
+            timeout=api_timeout, retry_config=self._retry_config
+        )
+
+        # Legacy logic for Proxy Network connections (requests.Session)
         self._proxy_session = requests.Session()
         self._proxy_session.trust_env = False
         self._proxy_managers: dict[str, urllib3.PoolManager] = {}
 
-        self._api_session = requests.Session()
-        self._api_session.trust_env = True
-        self._api_session.headers.update(
-            {"User-Agent": build_user_agent(_sdk_version, "requests")}
-        )
-
-        # Base URLs
+        # Base URLs Configuration
         scraperapi_base = (
             scraperapi_base_url
             or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
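The hunk above swaps the hand-rolled `_api_session` for `ThordataHttpSession` from `thordata.core.http_client`, whose implementation is not part of this diff. A hypothetical sketch of its surface, inferred only from the call sites visible here (the constructor kwargs, keyword-only `request(...)`, and `close()`); the real class presumably also applies the retry policy and User-Agent handling that the deleted inline code performed:

    import requests

    class ThordataHttpSession:
        """Illustrative only; the real class lives in thordata/core/http_client.py."""

        def __init__(self, timeout: int, retry_config) -> None:
            self._timeout = timeout
            self._retry_config = retry_config  # presumably consumed by with_retry
            self._session = requests.Session()

        def request(self, *, method, url, data=None, headers=None, params=None):
            # The real implementation presumably wraps this call in retry logic
            # and translates requests exceptions into Thordata* errors, as the
            # old inline _api_request_with_retry did.
            return self._session.request(
                method, url, data=data, headers=headers, params=params,
                timeout=self._timeout,
            )

        def close(self) -> None:
            self._session.close()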
@@ -351,14 +158,14 @@ class ThordataClient:
             or self.LOCATIONS_URL
         ).rstrip("/")
 
-        gateway_base = os.getenv(
+        self._gateway_base_url = os.getenv(
             "THORDATA_GATEWAY_BASE_URL", "https://api.thordata.com/api/gateway"
         )
-        self._gateway_base_url = gateway_base
         self._child_base_url = os.getenv(
             "THORDATA_CHILD_BASE_URL", "https://api.thordata.com/api/child"
         )
 
+        # URL Construction
         self._serp_url = f"{scraperapi_base}/request"
         self._builder_url = f"{scraperapi_base}/builder"
         self._video_builder_url = f"{scraperapi_base}/video_builder"
@@ -370,12 +177,10 @@ class ThordataClient:
 
         self._locations_base_url = locations_base
 
-        self._usage_stats_url = (
-            f"{locations_base.replace('/locations', '')}/account/usage-statistics"
-        )
-        self._proxy_users_url = (
-            f"{locations_base.replace('/locations', '')}/proxy-users"
-        )
+        # Determine shared API base from locations URL
+        shared_api_base = locations_base.replace("/locations", "")
+        self._usage_stats_url = f"{shared_api_base}/account/usage-statistics"
+        self._proxy_users_url = f"{shared_api_base}/proxy-users"
 
         whitelist_base = os.getenv(
             "THORDATA_WHITELIST_BASE_URL", "https://api.thordata.com/api"
@@ -388,6 +193,52 @@ class ThordataClient:
         self._proxy_list_url = f"{proxy_api_base}/proxy/proxy-list"
         self._proxy_expiration_url = f"{proxy_api_base}/proxy/expiration-time"
 
+        # Initialize Namespaces
+        self.serp = SerpNamespace(self)
+        self.unlimited = UnlimitedNamespace(self)
+
+    # =========================================================================
+    # Context Manager
+    # =========================================================================
+
+    def close(self) -> None:
+        """Close the client and release resources."""
+        self._http.close()
+        self._proxy_session.close()
+        for pm in self._proxy_managers.values():
+            pm.clear()
+        self._proxy_managers.clear()
+
+    def __enter__(self) -> ThordataClient:
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()
+
+    # =========================================================================
+    # Internal Helper: API Request Delegation
+    # =========================================================================
+
+    def _api_request_with_retry(
+        self,
+        method: str,
+        url: str,
+        *,
+        data: dict[str, Any] | None = None,
+        headers: dict[str, str] | None = None,
+        params: dict[str, Any] | None = None,
+    ) -> requests.Response:
+        """Delegate to Core HTTP Client."""
+        return self._http.request(
+            method=method, url=url, data=data, headers=headers, params=params
+        )
+
+    def _require_public_credentials(self) -> None:
+        if not self.public_token or not self.public_key:
+            raise ThordataConfigError(
+                "public_token and public_key are required for this operation."
+            )
+
     # =========================================================================
     # Proxy Network Methods
     # =========================================================================
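The new context-manager protocol makes the resource lifecycle explicit. A short usage sketch, assembled from the removed 1.3.0 docstring example and the methods added in this hunk (token values are placeholders):

    from thordata import ThordataClient

    with ThordataClient(
        scraper_token="your_token",
        public_token="your_public_token",
        public_key="your_public_key",
    ) as client:
        results = client.serp_search("python tutorial", engine="google")
    # On exit, close() releases the core HTTP session, the proxy
    # requests.Session, and every pooled urllib3 ProxyManager.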
@@ -414,48 +265,6 @@ class ThordataClient:
         logger.debug(f"Proxy POST request: {url}")
         return self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)
 
-    def _proxy_verb(
-        self,
-        method: str,
-        url: str,
-        proxy_config: ProxyConfig | None,
-        timeout: int | None,
-        **kwargs: Any,
-    ) -> requests.Response:
-        timeout = timeout or self._default_timeout
-
-        if proxy_config is None:
-            proxy_config = self._get_default_proxy_config_from_env()
-
-        if proxy_config is None:
-            raise ThordataConfigError(
-                "Proxy credentials are missing. "
-                "Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
-            )
-
-        kwargs.pop("proxies", None)
-
-        @with_retry(self._retry_config)
-        def _do() -> requests.Response:
-            return self._proxy_request_with_proxy_manager(
-                method,
-                url,
-                proxy_config=proxy_config,  # type: ignore
-                timeout=timeout,  # type: ignore
-                headers=kwargs.pop("headers", None),
-                params=kwargs.pop("params", None),
-                data=kwargs.pop("data", None),
-            )
-
-        try:
-            return _do()
-        except requests.Timeout as e:
-            raise ThordataTimeoutError(
-                f"Request timed out: {e}", original_error=e
-            ) from e
-        except Exception as e:
-            raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
-
     def build_proxy_url(
         self,
         username: str,
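The context lines above show `get()`/`post()` still delegating to `self._proxy_verb`, so the method is relocated out of this hunk rather than dropped. Calling the proxy network is unchanged; a hedged sketch using the environment-variable fallback named in the removed error message (credential values are placeholders):

    import os
    from thordata import ThordataClient

    # Without an explicit ProxyConfig, credentials fall back to these env vars.
    os.environ["THORDATA_RESIDENTIAL_USERNAME"] = "your_proxy_username"
    os.environ["THORDATA_RESIDENTIAL_PASSWORD"] = "your_proxy_password"

    client = ThordataClient(scraper_token="your_token")
    response = client.get("https://httpbin.org/ip")
    print(response.json())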
@@ -483,1190 +292,1269 @@ class ThordataClient:
483
292
  return config.build_proxy_url()
484
293
 
485
294
  # =========================================================================
486
- # Internal Request Helpers
295
+ # SERP API Methods
487
296
  # =========================================================================
488
297
 
489
- def _api_request_with_retry(
298
+ def serp_search(
490
299
  self,
491
- method: str,
492
- url: str,
300
+ query: str,
493
301
  *,
494
- data: dict[str, Any] | None = None,
495
- headers: dict[str, str] | None = None,
496
- params: dict[str, Any] | None = None,
497
- ) -> requests.Response:
498
- @with_retry(self._retry_config)
499
- def _do_request() -> requests.Response:
500
- return self._api_session.request(
501
- method,
502
- url,
503
- data=data,
504
- headers=headers,
505
- params=params,
506
- timeout=self._api_timeout,
507
- )
508
-
509
- try:
510
- return _do_request()
511
- except requests.Timeout as e:
512
- raise ThordataTimeoutError(
513
- f"API request timed out: {e}", original_error=e
514
- ) from e
515
- except requests.RequestException as e:
516
- raise ThordataNetworkError(
517
- f"API request failed: {e}", original_error=e
518
- ) from e
302
+ engine: Engine | str = Engine.GOOGLE,
303
+ num: int = 10,
304
+ country: str | None = None,
305
+ language: str | None = None,
306
+ search_type: str | None = None,
307
+ device: str | None = None,
308
+ render_js: bool | None = None,
309
+ no_cache: bool | None = None,
310
+ output_format: str = "json",
311
+ **kwargs: Any,
312
+ ) -> dict[str, Any]:
313
+ engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
519
314
 
520
- def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
521
- """Build a stable cache key for ProxyManager instances."""
522
- if not userpass:
523
- return proxy_endpoint
524
- h = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
525
- return f"{proxy_endpoint}|auth={h}"
315
+ request = SerpRequest(
316
+ query=query,
317
+ engine=engine_str,
318
+ num=num,
319
+ country=country,
320
+ language=language,
321
+ search_type=search_type,
322
+ device=device,
323
+ render_js=render_js,
324
+ no_cache=no_cache,
325
+ output_format=output_format,
326
+ extra_params=kwargs,
327
+ )
328
+ return self.serp_search_advanced(request)
526
329
 
527
- def _get_proxy_manager(
528
- self,
529
- proxy_url: str,
530
- *,
531
- cache_key: str,
532
- proxy_headers: dict[str, str] | None = None,
533
- ) -> urllib3.PoolManager:
534
- """Get or create a ProxyManager for the given proxy URL (Pooled)."""
535
- cached = self._proxy_managers.get(cache_key)
536
- if cached is not None:
537
- return cached
330
+ def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
331
+ if not self.scraper_token:
332
+ raise ThordataConfigError("scraper_token is required for SERP API")
538
333
 
539
- if proxy_url.startswith(("socks5://", "socks5h://", "socks4://", "socks4a://")):
540
- try:
541
- from urllib3.contrib.socks import SOCKSProxyManager
542
- except Exception as e:
543
- raise ThordataConfigError(
544
- "SOCKS proxy requested but SOCKS dependencies are missing. "
545
- "Install: pip install 'urllib3[socks]' or pip install PySocks"
546
- ) from e
547
-
548
- pm_socks = SOCKSProxyManager(
549
- proxy_url,
550
- num_pools=10,
551
- maxsize=10,
552
- )
553
- pm = cast(urllib3.PoolManager, pm_socks)
554
- self._proxy_managers[cache_key] = pm
555
- return pm
334
+ payload = request.to_payload()
335
+ headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
556
336
 
557
- # HTTP/HTTPS proxies
558
- proxy_ssl_context = None
559
- if proxy_url.startswith("https://"):
560
- proxy_ssl_context = ssl.create_default_context()
337
+ logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")
561
338
 
562
- pm_http = urllib3.ProxyManager(
563
- proxy_url,
564
- proxy_headers=proxy_headers,
565
- proxy_ssl_context=proxy_ssl_context,
566
- num_pools=10,
567
- maxsize=10,
339
+ response = self._api_request_with_retry(
340
+ "POST",
341
+ self._serp_url,
342
+ data=payload,
343
+ headers=headers,
568
344
  )
345
+ response.raise_for_status()
569
346
 
570
- pm = cast(urllib3.PoolManager, pm_http)
571
- self._proxy_managers[cache_key] = pm
572
- return pm
347
+ if request.output_format.lower() == "json":
348
+ data = response.json()
349
+ if isinstance(data, dict):
350
+ code = data.get("code")
351
+ if code is not None and code != 200:
352
+ msg = extract_error_message(data)
353
+ raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
354
+ return parse_json_response(data)
573
355
 
574
- def _proxy_request_with_proxy_manager(
356
+ return {"html": response.text}
357
+
358
+ # =========================================================================
359
+ # Universal Scraping API (WEB UNLOCKER) Methods
360
+ # =========================================================================
361
+
362
+ def universal_scrape(
575
363
  self,
576
- method: str,
577
364
  url: str,
578
365
  *,
579
- proxy_config: ProxyConfig,
580
- timeout: int,
581
- headers: dict[str, str] | None = None,
582
- params: dict[str, Any] | None = None,
583
- data: Any = None,
584
- ) -> requests.Response:
585
- """Execute request through proxy, with optional upstream proxy support."""
366
+ js_render: bool = False,
367
+ output_format: str = "html",
368
+ country: str | None = None,
369
+ block_resources: str | None = None,
370
+ wait: int | None = None,
371
+ wait_for: str | None = None,
372
+ **kwargs: Any,
373
+ ) -> str | bytes:
374
+ request = UniversalScrapeRequest(
375
+ url=url,
376
+ js_render=js_render,
377
+ output_format=output_format,
378
+ country=country,
379
+ block_resources=block_resources,
380
+ wait=wait,
381
+ wait_for=wait_for,
382
+ extra_params=kwargs,
383
+ )
384
+ return self.universal_scrape_advanced(request)
586
385
 
587
- # Check for upstream proxy
588
- upstream_config = _parse_upstream_proxy()
386
+ def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
387
+ if not self.scraper_token:
388
+ raise ThordataConfigError("scraper_token required")
589
389
 
590
- if upstream_config:
591
- return self._proxy_request_with_upstream(
592
- method,
593
- url,
594
- proxy_config=proxy_config,
595
- timeout=timeout,
596
- headers=headers,
597
- params=params,
598
- data=data,
599
- upstream_config=upstream_config,
600
- )
390
+ payload = request.to_payload()
391
+ headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
601
392
 
602
- # Original implementation (no upstream proxy)
603
- req = requests.Request(method=method.upper(), url=url, params=params)
604
- prepped = self._proxy_session.prepare_request(req)
605
- final_url = prepped.url or url
393
+ response = self._api_request_with_retry(
394
+ "POST", self._universal_url, data=payload, headers=headers
395
+ )
396
+ response.raise_for_status()
397
+ return self._process_universal_response(response, request.output_format)
606
398
 
607
- proxy_endpoint = proxy_config.build_proxy_endpoint()
608
- is_socks = proxy_endpoint.startswith(
609
- ("socks5://", "socks5h://", "socks4://", "socks4a://")
399
+ # =========================================================================
400
+ # Web Scraper API - Task Management
401
+ # =========================================================================
402
+
403
+ def create_scraper_task(
404
+ self,
405
+ file_name: str,
406
+ spider_id: str,
407
+ spider_name: str,
408
+ parameters: dict[str, Any],
409
+ universal_params: dict[str, Any] | None = None,
410
+ ) -> str:
411
+ config = ScraperTaskConfig(
412
+ file_name=file_name,
413
+ spider_id=spider_id,
414
+ spider_name=spider_name,
415
+ parameters=parameters,
416
+ universal_params=universal_params,
610
417
  )
418
+ return self.create_scraper_task_advanced(config)
611
419
 
612
- if is_socks:
613
- proxy_url_for_manager = proxy_config.build_proxy_url()
614
- userpass = proxy_config.build_proxy_basic_auth()
615
- cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
420
+ def run_tool(
421
+ self,
422
+ tool_request: Any,
423
+ file_name: str | None = None,
424
+ universal_params: dict[str, Any] | None = None,
425
+ ) -> str:
426
+ """
427
+ Run a specific pre-defined tool.
428
+ Supports both standard Scrapers and Video downloaders.
429
+ """
430
+ if not hasattr(tool_request, "to_task_parameters") or not hasattr(
431
+ tool_request, "get_spider_id"
432
+ ):
433
+ raise ValueError(
434
+ "tool_request must be an instance of a thordata.tools class"
435
+ )
616
436
 
617
- pm = self._get_proxy_manager(
618
- proxy_url_for_manager,
619
- cache_key=cache_key,
620
- proxy_headers=None,
437
+ spider_id = tool_request.get_spider_id()
438
+ spider_name = tool_request.get_spider_name()
439
+ params = tool_request.to_task_parameters()
440
+
441
+ if not file_name:
442
+ import uuid
443
+
444
+ short_id = uuid.uuid4().hex[:8]
445
+ file_name = f"{spider_id}_{short_id}"
446
+
447
+ # Check if it's a Video Tool (Duck typing check for common_settings)
448
+ if hasattr(tool_request, "common_settings"):
449
+ # It is a Video Task
450
+ config_video = VideoTaskConfig(
451
+ file_name=file_name,
452
+ spider_id=spider_id,
453
+ spider_name=spider_name,
454
+ parameters=params,
455
+ common_settings=tool_request.common_settings,
621
456
  )
457
+ return self.create_video_task_advanced(config_video)
622
458
  else:
623
- userpass = proxy_config.build_proxy_basic_auth()
624
- proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
625
- cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
626
-
627
- pm = self._get_proxy_manager(
628
- proxy_endpoint,
629
- cache_key=cache_key,
630
- proxy_headers=dict(proxy_headers),
459
+ # It is a Standard Scraper Task
460
+ config = ScraperTaskConfig(
461
+ file_name=file_name,
462
+ spider_id=spider_id,
463
+ spider_name=spider_name,
464
+ parameters=params,
465
+ universal_params=universal_params,
631
466
  )
467
+ return self.create_scraper_task_advanced(config)
632
468
 
633
- req_headers = dict(headers or {})
634
- body = None
635
- if data is not None:
636
- if isinstance(data, dict):
637
- body = urlencode({k: str(v) for k, v in data.items()})
638
- req_headers.setdefault(
639
- "Content-Type", "application/x-www-form-urlencoded"
640
- )
641
- else:
642
- body = data
469
+ def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
470
+ self._require_public_credentials()
471
+ if not self.scraper_token:
472
+ raise ThordataConfigError("scraper_token is required for Task Builder")
643
473
 
644
- http_resp = pm.request(
645
- method.upper(),
646
- final_url,
647
- body=body,
648
- headers=req_headers or None,
649
- timeout=urllib3.Timeout(connect=timeout, read=timeout),
650
- retries=False,
651
- preload_content=True,
474
+ payload = config.to_payload()
475
+ headers = build_builder_headers(
476
+ self.scraper_token, str(self.public_token), str(self.public_key)
652
477
  )
653
478
 
654
- r = requests.Response()
655
- r.status_code = int(getattr(http_resp, "status", 0) or 0)
656
- r._content = http_resp.data or b""
657
- r.url = final_url
658
- r.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
659
- return r
660
-
661
- # =========================================================================
662
- # Upstream Proxy Support (Proxy Chaining)
663
- # =========================================================================
479
+ response = self._api_request_with_retry(
480
+ "POST", self._builder_url, data=payload, headers=headers
481
+ )
482
+ response.raise_for_status()
483
+ data = response.json()
484
+ if data.get("code") != 200:
485
+ raise_for_code("Task creation failed", code=data.get("code"), payload=data)
486
+ return data["data"]["task_id"]
664
487
 
665
- def _proxy_request_with_upstream(
488
+ def create_video_task(
666
489
  self,
667
- method: str,
668
- url: str,
669
- *,
670
- proxy_config: ProxyConfig,
671
- timeout: int,
672
- headers: dict[str, str] | None = None,
673
- params: dict[str, Any] | None = None,
674
- data: Any = None,
675
- upstream_config: dict[str, Any],
676
- ) -> requests.Response:
677
- """Execute request through proxy chain: Upstream -> Thordata -> Target."""
678
- if not HAS_PYSOCKS:
490
+ file_name: str,
491
+ spider_id: str,
492
+ spider_name: str,
493
+ parameters: dict[str, Any],
494
+ common_settings: CommonSettings,
495
+ ) -> str:
496
+ config = VideoTaskConfig(
497
+ file_name=file_name,
498
+ spider_id=spider_id,
499
+ spider_name=spider_name,
500
+ parameters=parameters,
501
+ common_settings=common_settings,
502
+ )
503
+ return self.create_video_task_advanced(config)
504
+
505
+ def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
506
+ self._require_public_credentials()
507
+ if not self.scraper_token:
679
508
  raise ThordataConfigError(
680
- "PySocks is required for upstream proxy support. "
681
- "Install with: pip install PySocks"
509
+ "scraper_token is required for Video Task Builder"
682
510
  )
683
511
 
684
- req = requests.Request(method=method.upper(), url=url, params=params)
685
- prepped = self._proxy_session.prepare_request(req)
686
- final_url = prepped.url or url
512
+ payload = config.to_payload()
513
+ headers = build_builder_headers(
514
+ self.scraper_token, str(self.public_token), str(self.public_key)
515
+ )
687
516
 
688
- parsed_target = urlparse(final_url)
689
- target_host = parsed_target.hostname or ""
690
- target_port = parsed_target.port or (
691
- 443 if parsed_target.scheme == "https" else 80
517
+ response = self._api_request_with_retry(
518
+ "POST", self._video_builder_url, data=payload, headers=headers
692
519
  )
693
- target_is_https = parsed_target.scheme == "https"
520
+ response.raise_for_status()
521
+ data = response.json()
522
+ if data.get("code") != 200:
523
+ raise_for_code(
524
+ "Video task creation failed", code=data.get("code"), payload=data
525
+ )
526
+ return data["data"]["task_id"]
694
527
 
695
- protocol = proxy_config.protocol.lower()
696
- if protocol == "socks5":
697
- protocol = "socks5h"
528
+ def get_task_status(self, task_id: str) -> str:
529
+ self._require_public_credentials()
530
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
698
531
 
699
- thordata_host = proxy_config.host or ""
700
- thordata_port = proxy_config.port or 9999
701
- thordata_username = proxy_config.build_username()
702
- thordata_password = proxy_config.password
532
+ response = self._api_request_with_retry(
533
+ "POST",
534
+ self._status_url,
535
+ data={"tasks_ids": task_id},
536
+ headers=headers,
537
+ )
538
+ response.raise_for_status()
539
+ data = response.json()
540
+ if data.get("code") != 200:
541
+ raise_for_code("Task status error", code=data.get("code"), payload=data)
703
542
 
704
- socket_factory = _UpstreamProxySocketFactory(upstream_config)
543
+ items = data.get("data") or []
544
+ for item in items:
545
+ if str(item.get("task_id")) == str(task_id):
546
+ return item.get("status", "unknown")
547
+ return "unknown"
705
548
 
706
- logger.debug(
707
- f"Proxy chain: upstream({upstream_config['host']}:{upstream_config['port']}) "
708
- f"-> thordata({protocol}://{thordata_host}:{thordata_port}) "
709
- f"-> target({target_host}:{target_port})"
710
- )
549
+ def get_latest_task_status(self) -> dict[str, Any]:
550
+ """
551
+ Get the status of the last task of the specified account.
552
+ """
553
+ self._require_public_credentials()
554
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
555
+ parsed = urlparse(self._status_url)
556
+ base = f"{parsed.scheme}://{parsed.netloc}"
557
+ endpoint = "/api/web_scraper_api/get_latest_task_status"
711
558
 
712
- raw_sock = socket_factory.create_connection(
713
- (thordata_host, thordata_port),
714
- timeout=float(timeout),
559
+ response = self._api_request_with_retry(
560
+ "POST",
561
+ f"{base}{endpoint}",
562
+ headers=headers,
715
563
  )
564
+ response.raise_for_status()
565
+ data = response.json()
716
566
 
717
- try:
718
- if protocol.startswith("socks"):
719
- sock = self._socks5_handshake(
720
- raw_sock,
721
- target_host,
722
- target_port,
723
- thordata_username,
724
- thordata_password,
725
- )
726
- if target_is_https:
727
- context = ssl.create_default_context()
728
- sock = context.wrap_socket(sock, server_hostname=target_host)
729
-
730
- elif protocol == "https":
731
- proxy_context = ssl.create_default_context()
732
- proxy_ssl_sock = proxy_context.wrap_socket(
733
- raw_sock, server_hostname=thordata_host
734
- )
567
+ if data.get("code") != 200:
568
+ raise_for_code(
569
+ "Get latest task status failed", code=data.get("code"), payload=data
570
+ )
735
571
 
736
- self._send_connect_request(
737
- proxy_ssl_sock,
738
- target_host,
739
- target_port,
740
- thordata_username,
741
- thordata_password,
742
- )
572
+ return data.get("data", {})
743
573
 
744
- if target_is_https:
745
- sock = self._create_tls_in_tls_socket(
746
- proxy_ssl_sock, target_host, timeout
747
- ) # type: ignore[assignment]
748
- else:
749
- sock = proxy_ssl_sock
750
-
751
- else: # HTTP proxy
752
- self._send_connect_request(
753
- raw_sock,
754
- target_host,
755
- target_port,
756
- thordata_username,
757
- thordata_password,
758
- )
574
+ def safe_get_task_status(self, task_id: str) -> str:
575
+ try:
576
+ return self.get_task_status(task_id)
577
+ except Exception:
578
+ return "error"
759
579
 
760
- if target_is_https:
761
- context = ssl.create_default_context()
762
- sock = context.wrap_socket(raw_sock, server_hostname=target_host)
763
- else:
764
- sock = raw_sock
580
+ def get_task_result(self, task_id: str, file_type: str = "json") -> str:
581
+ self._require_public_credentials()
582
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
765
583
 
766
- return self._send_http_request(
767
- sock, method, parsed_target, headers, data, final_url, timeout
768
- )
584
+ response = self._api_request_with_retry(
585
+ "POST",
586
+ self._download_url,
587
+ data={"tasks_id": task_id, "type": file_type},
588
+ headers=headers,
589
+ )
590
+ response.raise_for_status()
591
+ data = response.json()
592
+ if data.get("code") == 200 and data.get("data"):
593
+ return data["data"]["download"]
594
+ raise_for_code("Get result failed", code=data.get("code"), payload=data)
595
+ return ""
596
+
597
+ def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
598
+ self._require_public_credentials()
599
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
769
600
 
770
- finally:
771
- with contextlib.suppress(Exception):
772
- raw_sock.close()
601
+ response = self._api_request_with_retry(
602
+ "POST",
603
+ self._list_url,
604
+ data={"page": str(page), "size": str(size)},
605
+ headers=headers,
606
+ )
607
+ response.raise_for_status()
608
+ data = response.json()
609
+ if data.get("code") != 200:
610
+ raise_for_code("List tasks failed", code=data.get("code"), payload=data)
611
+ return data.get("data", {"count": 0, "list": []})
773
612
 
774
- def _send_connect_request(
613
+ def wait_for_task(
775
614
  self,
776
- sock: socket.socket,
777
- target_host: str,
778
- target_port: int,
779
- proxy_username: str,
780
- proxy_password: str,
781
- ) -> None:
782
- """Send HTTP CONNECT request to proxy and verify response."""
783
- connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
784
- connect_req += f"Host: {target_host}:{target_port}\r\n"
615
+ task_id: str,
616
+ *,
617
+ poll_interval: float = 5.0,
618
+ max_wait: float = 600.0,
619
+ ) -> str:
620
+ import time
785
621
 
786
- credentials = f"{proxy_username}:{proxy_password}"
787
- encoded = base64.b64encode(credentials.encode()).decode()
788
- connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
789
- connect_req += "\r\n"
622
+ start = time.monotonic()
623
+ while (time.monotonic() - start) < max_wait:
624
+ status = self.get_task_status(task_id)
625
+ if status.lower() in {
626
+ "ready",
627
+ "success",
628
+ "finished",
629
+ "failed",
630
+ "error",
631
+ "cancelled",
632
+ }:
633
+ return status
634
+ time.sleep(poll_interval)
635
+ raise TimeoutError(f"Task {task_id} timeout")
790
636
 
791
- sock.sendall(connect_req.encode())
637
+ def run_task(
638
+ self,
639
+ file_name: str,
640
+ spider_id: str,
641
+ spider_name: str,
642
+ parameters: dict[str, Any],
643
+ universal_params: dict[str, Any] | None = None,
644
+ *,
645
+ max_wait: float = 600.0,
646
+ initial_poll_interval: float = 2.0,
647
+ max_poll_interval: float = 10.0,
648
+ include_errors: bool = True,
649
+ task_type: str = "web",
650
+ common_settings: CommonSettings | None = None,
651
+ ) -> str:
652
+ import time
792
653
 
793
- response = b""
794
- while b"\r\n\r\n" not in response:
795
- chunk = sock.recv(4096)
796
- if not chunk:
797
- raise ConnectionError("Proxy closed connection during CONNECT")
798
- response += chunk
654
+ if task_type == "video":
655
+ if common_settings is None:
656
+ raise ValueError("common_settings is required for video tasks")
657
+ config_video = VideoTaskConfig(
658
+ file_name=file_name,
659
+ spider_id=spider_id,
660
+ spider_name=spider_name,
661
+ parameters=parameters,
662
+ common_settings=common_settings,
663
+ include_errors=include_errors,
664
+ )
665
+ task_id = self.create_video_task_advanced(config_video)
666
+ else:
667
+ config = ScraperTaskConfig(
668
+ file_name=file_name,
669
+ spider_id=spider_id,
670
+ spider_name=spider_name,
671
+ parameters=parameters,
672
+ universal_params=universal_params,
673
+ include_errors=include_errors,
674
+ )
675
+ task_id = self.create_scraper_task_advanced(config)
799
676
 
800
- status_line = response.split(b"\r\n")[0].decode()
801
- if "200" not in status_line:
802
- raise ConnectionError(f"Proxy CONNECT failed: {status_line}")
677
+ logger.info(f"Task created: {task_id}. Polling...")
803
678
 
804
- def _create_tls_in_tls_socket(
805
- self,
806
- outer_ssl_sock: ssl.SSLSocket,
807
- hostname: str,
808
- timeout: int,
809
- ) -> _TLSInTLSSocket:
810
- """Create a TLS connection over an existing TLS connection."""
811
- context = ssl.create_default_context()
679
+ start_time = time.monotonic()
680
+ current_poll = initial_poll_interval
812
681
 
813
- incoming = ssl.MemoryBIO()
814
- outgoing = ssl.MemoryBIO()
682
+ while (time.monotonic() - start_time) < max_wait:
683
+ status = self.get_task_status(task_id)
684
+ status_lower = status.lower()
815
685
 
816
- ssl_obj = context.wrap_bio(incoming, outgoing, server_hostname=hostname)
686
+ if status_lower in {"ready", "success", "finished"}:
687
+ return self.get_task_result(task_id)
817
688
 
818
- while True:
819
- try:
820
- ssl_obj.do_handshake()
821
- break
822
- except ssl.SSLWantReadError:
823
- data_to_send = outgoing.read()
824
- if data_to_send:
825
- outer_ssl_sock.sendall(data_to_send)
689
+ if status_lower in {"failed", "error", "cancelled"}:
690
+ raise ThordataNetworkError(
691
+ f"Task {task_id} failed with status: {status}"
692
+ )
826
693
 
827
- outer_ssl_sock.settimeout(float(timeout))
828
- try:
829
- received = outer_ssl_sock.recv(8192)
830
- if not received:
831
- raise ConnectionError("Connection closed during TLS handshake")
832
- incoming.write(received)
833
- except socket.timeout as e:
834
- raise ConnectionError("Timeout during TLS handshake") from e
835
- except ssl.SSLWantWriteError:
836
- data_to_send = outgoing.read()
837
- if data_to_send:
838
- outer_ssl_sock.sendall(data_to_send)
839
-
840
- data_to_send = outgoing.read()
841
- if data_to_send:
842
- outer_ssl_sock.sendall(data_to_send)
843
-
844
- return _TLSInTLSSocket(outer_ssl_sock, ssl_obj, incoming, outgoing)
845
-
846
- def _send_http_request(
847
- self,
848
- sock: socket.socket | ssl.SSLSocket | Any,
849
- method: str,
850
- parsed_url: Any,
851
- headers: dict[str, str] | None,
852
- data: Any,
853
- final_url: str,
854
- timeout: int,
855
- ) -> requests.Response:
856
- """Send HTTP request over established connection and parse response."""
857
- target_host = parsed_url.hostname
858
-
859
- req_headers = dict(headers or {})
860
- req_headers.setdefault("Host", target_host)
861
- req_headers.setdefault("User-Agent", build_user_agent(_sdk_version, "requests"))
862
- req_headers.setdefault("Connection", "close")
863
-
864
- path = parsed_url.path or "/"
865
- if parsed_url.query:
866
- path += f"?{parsed_url.query}"
867
-
868
- http_req = f"{method.upper()} {path} HTTP/1.1\r\n"
869
- for k, v in req_headers.items():
870
- http_req += f"{k}: {v}\r\n"
871
-
872
- body = None
873
- if data is not None:
874
- if isinstance(data, dict):
875
- body = urlencode({k: str(v) for k, v in data.items()}).encode()
876
- http_req += "Content-Type: application/x-www-form-urlencoded\r\n"
877
- http_req += f"Content-Length: {len(body)}\r\n"
878
- elif isinstance(data, bytes):
879
- body = data
880
- http_req += f"Content-Length: {len(body)}\r\n"
881
- else:
882
- body = str(data).encode()
883
- http_req += f"Content-Length: {len(body)}\r\n"
884
-
885
- http_req += "\r\n"
886
- sock.sendall(http_req.encode())
694
+ time.sleep(current_poll)
695
+ current_poll = min(current_poll * 1.5, max_poll_interval)
887
696
 
888
- if body:
889
- sock.sendall(body)
697
+ raise ThordataTimeoutError(f"Task {task_id} timed out")
890
698
 
891
- if hasattr(sock, "settimeout"):
892
- sock.settimeout(float(timeout))
699
+ # =========================================================================
700
+ # Account & Usage Methods
701
+ # =========================================================================
893
702
 
894
- response_data = b""
895
- try:
896
- while True:
897
- chunk = sock.recv(8192)
898
- if not chunk:
899
- break
900
- response_data += chunk
901
- if b"\r\n\r\n" in response_data:
902
- header_end = response_data.index(b"\r\n\r\n") + 4
903
- headers_part = (
904
- response_data[:header_end]
905
- .decode("utf-8", errors="replace")
906
- .lower()
907
- )
908
- if "content-length:" in headers_part:
909
- for line in headers_part.split("\r\n"):
910
- if line.startswith("content-length:"):
911
- content_length = int(line.split(":")[1].strip())
912
- if len(response_data) >= header_end + content_length:
913
- break
914
- elif "transfer-encoding: chunked" not in headers_part:
915
- break
916
- except socket.timeout:
917
- pass
918
-
919
- return self._parse_http_response(response_data, final_url)
920
-
921
- def _socks5_handshake(
703
+ def get_usage_statistics(
922
704
  self,
923
- sock: socket.socket,
924
- target_host: str,
925
- target_port: int,
926
- username: str | None,
927
- password: str | None,
928
- ) -> socket.socket:
929
- """Perform SOCKS5 handshake over existing socket."""
930
- if username and password:
931
- sock.sendall(b"\x05\x02\x00\x02")
932
- else:
933
- sock.sendall(b"\x05\x01\x00")
934
-
935
- response = sock.recv(2)
936
- if len(response) < 2:
937
- raise ConnectionError("SOCKS5 handshake failed: incomplete response")
705
+ from_date: str | date,
706
+ to_date: str | date,
707
+ ) -> UsageStatistics:
708
+ self._require_public_credentials()
709
+ if isinstance(from_date, date):
710
+ from_date = from_date.strftime("%Y-%m-%d")
711
+ if isinstance(to_date, date):
712
+ to_date = to_date.strftime("%Y-%m-%d")
938
713
 
939
- if response[0] != 0x05:
940
- raise ConnectionError(f"SOCKS5 version mismatch: {response[0]}")
714
+ params = {
715
+ "token": self.public_token,
716
+ "key": self.public_key,
717
+ "from_date": from_date,
718
+ "to_date": to_date,
719
+ }
720
+ response = self._api_request_with_retry(
721
+ "GET", self._usage_stats_url, params=params
722
+ )
723
+ response.raise_for_status()
724
+ data = response.json()
725
+ if data.get("code") != 200:
726
+ raise_for_code("Usage stats error", code=data.get("code"), payload=data)
727
+ return UsageStatistics.from_dict(data.get("data", data))
941
728
 
942
- auth_method = response[1]
729
+ def get_traffic_balance(self) -> float:
730
+ self._require_public_credentials()
731
+ params = {"token": self.public_token, "key": self.public_key}
732
+ api_base = self._locations_base_url.replace("/locations", "")
733
+ response = self._api_request_with_retry(
734
+ "GET", f"{api_base}/account/traffic-balance", params=params
735
+ )
736
+ response.raise_for_status()
737
+ data = response.json()
738
+ if data.get("code") != 200:
739
+ raise_for_code(
740
+ "Get traffic balance failed", code=data.get("code"), payload=data
741
+ )
742
+ return float(data.get("data", {}).get("traffic_balance", 0))
943
743
 
944
- if auth_method == 0x02:
945
- if not username or not password:
946
- raise ConnectionError(
947
- "SOCKS5 server requires auth but no credentials provided"
948
- )
744
+ def get_wallet_balance(self) -> float:
745
+ self._require_public_credentials()
746
+ params = {"token": self.public_token, "key": self.public_key}
747
+ api_base = self._locations_base_url.replace("/locations", "")
748
+ response = self._api_request_with_retry(
749
+ "GET", f"{api_base}/account/wallet-balance", params=params
750
+ )
751
+ response.raise_for_status()
752
+ data = response.json()
753
+ if data.get("code") != 200:
754
+ raise_for_code(
755
+ "Get wallet balance failed", code=data.get("code"), payload=data
756
+ )
757
+ return float(data.get("data", {}).get("balance", 0))
949
758
 
950
- auth_req = bytes([0x01, len(username)]) + username.encode()
951
- auth_req += bytes([len(password)]) + password.encode()
952
- sock.sendall(auth_req)
953
-
954
- auth_resp = sock.recv(2)
955
- if len(auth_resp) < 2 or auth_resp[1] != 0x00:
956
- raise ConnectionError("SOCKS5 authentication failed")
957
-
958
- elif auth_method == 0xFF:
959
- raise ConnectionError("SOCKS5 no acceptable auth method")
960
-
961
- connect_req = b"\x05\x01\x00\x03"
962
- connect_req += bytes([len(target_host)]) + target_host.encode()
963
- connect_req += target_port.to_bytes(2, "big")
964
- sock.sendall(connect_req)
965
-
966
- resp = sock.recv(4)
967
- if len(resp) < 4:
968
- raise ConnectionError("SOCKS5 connect failed: incomplete response")
969
-
970
- if resp[1] != 0x00:
971
- error_codes = {
972
- 0x01: "General failure",
973
- 0x02: "Connection not allowed",
974
- 0x03: "Network unreachable",
975
- 0x04: "Host unreachable",
976
- 0x05: "Connection refused",
977
- 0x06: "TTL expired",
978
- 0x07: "Command not supported",
979
- 0x08: "Address type not supported",
980
- }
981
- error_msg = error_codes.get(resp[1], f"Unknown error {resp[1]}")
982
- raise ConnectionError(f"SOCKS5 connect failed: {error_msg}")
983
-
984
- addr_type = resp[3]
985
- if addr_type == 0x01:
986
- sock.recv(4 + 2)
987
- elif addr_type == 0x03:
988
- domain_len = sock.recv(1)[0]
989
- sock.recv(domain_len + 2)
990
- elif addr_type == 0x04:
991
- sock.recv(16 + 2)
992
-
993
- return sock
994
-
995
- def _parse_http_response(
759
+ def get_proxy_user_usage(
996
760
  self,
997
- response_data: bytes,
998
- url: str,
999
- ) -> requests.Response:
1000
- """Parse raw HTTP response into requests.Response."""
1001
- if b"\r\n\r\n" in response_data:
1002
- header_data, body = response_data.split(b"\r\n\r\n", 1)
1003
- else:
1004
- header_data = response_data
1005
- body = b""
1006
-
1007
- header_lines = header_data.decode("utf-8", errors="replace").split("\r\n")
1008
-
1009
- status_line = header_lines[0] if header_lines else ""
1010
- parts = status_line.split(" ", 2)
1011
- status_code = int(parts[1]) if len(parts) > 1 else 0
761
+ username: str,
762
+ start_date: str | date,
763
+ end_date: str | date,
764
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
765
+ ) -> list[dict[str, Any]]:
766
+ self._require_public_credentials()
767
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
768
+ if isinstance(start_date, date):
769
+ start_date = start_date.strftime("%Y-%m-%d")
770
+ if isinstance(end_date, date):
771
+ end_date = end_date.strftime("%Y-%m-%d")
1012
772
 
1013
- headers_dict = {}
1014
- for line in header_lines[1:]:
1015
- if ": " in line:
1016
- k, v = line.split(": ", 1)
1017
- headers_dict[k] = v
773
+ params = {
774
+ "token": self.public_token,
775
+ "key": self.public_key,
776
+ "proxy_type": str(pt),
777
+ "username": username,
778
+ "from_date": start_date,
779
+ "to_date": end_date,
780
+ }
781
+ response = self._api_request_with_retry(
782
+ "GET", f"{self._proxy_users_url}/usage-statistics", params=params
783
+ )
784
+ response.raise_for_status()
785
+ data = response.json()
786
+ if data.get("code") != 200:
787
+ raise_for_code("Get user usage failed", code=data.get("code"), payload=data)
788
+ return data.get("data", [])
1018
789
 
1019
- if headers_dict.get("Transfer-Encoding", "").lower() == "chunked":
1020
- body = self._decode_chunked(body)
790
+ def get_proxy_user_usage_hour(
791
+ self,
792
+ username: str,
793
+ from_date: str, # Format: yyyy-mm-dd HH
794
+ to_date: str, # Format: yyyy-mm-dd HH
795
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
796
+ ) -> list[dict[str, Any]]:
797
+ """
798
+ Get proxy user traffic usage logs by hour.
1021
799
 
1022
- r = requests.Response()
1023
- r.status_code = status_code
1024
- r._content = body
1025
- r.url = url
1026
- r.headers = CaseInsensitiveDict(headers_dict)
1027
- return r
800
+ Args:
801
+ username: The proxy username.
802
+ from_date: Start date string (yyyy-mm-dd HH).
803
+ to_date: End date string (yyyy-mm-dd HH).
804
+ proxy_type: Proxy type (default: Residential).
805
+ """
806
+ self._require_public_credentials()
807
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1028
808
 
1029
- def _decode_chunked(self, data: bytes) -> bytes:
1030
- """Decode chunked transfer encoding."""
1031
- result = b""
1032
- while data:
1033
- if b"\r\n" not in data:
1034
- break
1035
- size_line, data = data.split(b"\r\n", 1)
1036
- try:
1037
- chunk_size = int(size_line.decode().strip(), 16)
1038
- except ValueError:
1039
- break
809
+ params = {
810
+ "token": self.public_token,
811
+ "key": self.public_key,
812
+ "proxy_type": str(pt),
813
+ "username": username,
814
+ "from_date": from_date,
815
+ "to_date": to_date,
816
+ }
817
+ response = self._api_request_with_retry(
818
+ "GET", f"{self._proxy_users_url}/usage-statistics-hour", params=params
819
+ )
820
+ response.raise_for_status()
821
+ data = response.json()
822
+ if data.get("code") != 200:
823
+ raise_for_code(
824
+ "Get hourly usage failed", code=data.get("code"), payload=data
825
+ )
1040
826
 
1041
- if chunk_size == 0:
1042
- break
827
+ # API returns { "data": { "data": [...] } } structure
828
+ inner_data = data.get("data", {})
829
+ if isinstance(inner_data, dict):
830
+ return inner_data.get("data", [])
831
+ return []
1043
832
 
1044
- result += data[:chunk_size]
1045
- data = data[chunk_size:]
833
+ def extract_ip_list(
834
+ self,
835
+ num: int = 1,
836
+ country: str | None = None,
837
+ state: str | None = None,
838
+ city: str | None = None,
839
+ time_limit: int | None = None,
840
+ port: int | None = None,
841
+ return_type: str = "txt",
842
+ protocol: str = "http",
843
+ sep: str = "\r\n",
844
+ product: str = "residential",
845
+ ) -> list[str]:
846
+ base_url = "https://get-ip.thordata.net"
847
+ endpoint = "/unlimited_api" if product == "unlimited" else "/api"
848
+ params: dict[str, Any] = {
849
+ "num": str(num),
850
+ "return_type": return_type,
851
+ "protocol": protocol,
852
+ "sep": sep,
853
+ }
854
+ if country:
855
+ params["country"] = country
856
+ if state:
857
+ params["state"] = state
858
+ if city:
859
+ params["city"] = city
860
+ if time_limit:
861
+ params["time"] = str(time_limit)
862
+ if port:
863
+ params["port"] = str(port)
864
+
865
+ username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
866
+ if username:
867
+ params["td-customer"] = username
1046
868
 
1047
- if data.startswith(b"\r\n"):
1048
- data = data[2:]
869
+ response = self._api_request_with_retry(
870
+ "GET", f"{base_url}{endpoint}", params=params
871
+ )
872
+ response.raise_for_status()
1049
873
 
1050
- return result
874
+ if return_type == "json":
875
+ data = response.json()
876
+ if isinstance(data, dict):
877
+ if data.get("code") in (0, 200):
878
+ raw_list = data.get("data") or []
879
+ return [f"{item['ip']}:{item['port']}" for item in raw_list]
880
+ else:
881
+ raise_for_code(
882
+ "Extract IPs failed", code=data.get("code"), payload=data
883
+ )
884
+ return []
885
+ else:
886
+ text = response.text.strip()
887
+ if text.startswith("{") and "code" in text:
888
+ try:
889
+ err_data = response.json()
890
+ raise_for_code(
891
+ "Extract IPs failed",
892
+ code=err_data.get("code"),
893
+ payload=err_data,
894
+ )
895
+ except ValueError:
896
+ pass
897
+ actual_sep = sep.replace("\\r", "\r").replace("\\n", "\n")
898
+ return [line.strip() for line in text.split(actual_sep) if line.strip()]
1051
899
 
1052
900
  # =========================================================================
1053
- # SERP API Methods
901
+ # Proxy Users Management
1054
902
  # =========================================================================
1055
903
 
1056
- def serp_search(
904
+ def list_proxy_users(
905
+ self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
906
+ ) -> ProxyUserList:
907
+ self._require_public_credentials()
908
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
909
+ params = {
910
+ "token": self.public_token,
911
+ "key": self.public_key,
912
+ "proxy_type": str(pt),
913
+ }
914
+ response = self._api_request_with_retry(
915
+ "GET", f"{self._proxy_users_url}/user-list", params=params
916
+ )
917
+ response.raise_for_status()
918
+ data = response.json()
919
+ if data.get("code") != 200:
920
+ raise_for_code("List users error", code=data.get("code"), payload=data)
921
+ return ProxyUserList.from_dict(data.get("data", data))
922
+
923
+ def create_proxy_user(
1057
924
  self,
1058
- query: str,
1059
- *,
1060
- engine: Engine | str = Engine.GOOGLE,
1061
- num: int = 10,
1062
- country: str | None = None,
1063
- language: str | None = None,
1064
- search_type: str | None = None,
1065
- device: str | None = None,
1066
- render_js: bool | None = None,
1067
- no_cache: bool | None = None,
1068
- output_format: str = "json",
1069
- **kwargs: Any,
925
+ username: str,
926
+ password: str,
927
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
928
+ traffic_limit: int = 0,
929
+ status: bool = True,
1070
930
  ) -> dict[str, Any]:
1071
- engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
1072
-
1073
- request = SerpRequest(
1074
- query=query,
1075
- engine=engine_str,
1076
- num=num,
1077
- country=country,
1078
- language=language,
1079
- search_type=search_type,
1080
- device=device,
1081
- render_js=render_js,
1082
- no_cache=no_cache,
1083
- output_format=output_format,
1084
- extra_params=kwargs,
931
+ self._require_public_credentials()
932
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
933
+ headers = build_public_api_headers(str(self.public_token), str(self.public_key))
934
+ payload = {
935
+ "proxy_type": str(pt),
936
+ "username": username,
937
+ "password": password,
938
+ "traffic_limit": str(traffic_limit),
939
+ "status": "true" if status else "false",
940
+ }
941
+ response = self._api_request_with_retry(
942
+ "POST",
943
+ f"{self._proxy_users_url}/create-user",
944
+ data=payload,
945
+ headers=headers,
1085
946
  )
947
+ response.raise_for_status()
948
+ data = response.json()
949
+ if data.get("code") != 200:
950
+ raise_for_code("Create user failed", code=data.get("code"), payload=data)
951
+ return data.get("data", {})
1086
952
 
-        return self.serp_search_advanced(request)
-
-    def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
-        if not self.scraper_token:
-            raise ThordataConfigError("scraper_token is required for SERP API")
-
-        payload = request.to_payload()
-        headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
+    def update_proxy_user(
+        self,
+        username: str,
+        password: str,
+        traffic_limit: int | None = None,
+        status: bool | None = None,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+        new_username: str | None = None,  # optional rename; defaults to username
+    ) -> dict[str, Any]:
+        """
+        Update a proxy user.
+        Note: the API expects 'new_'-prefixed fields, and all of them are required.
+        """
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        headers = build_public_api_headers(str(self.public_token), str(self.public_key))

-        logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")
+        # Defaults
+        limit_val = str(traffic_limit) if traffic_limit is not None else "0"
+        status_val = "true" if (status is None or status) else "false"

-        try:
-            response = self._api_request_with_retry(
-                "POST",
-                self._serp_url,
-                data=payload,
-                headers=headers,
-            )
-            response.raise_for_status()
+        # If new_username is not provided, keep the old one (the API requires the new_username field)
+        target_username = new_username or username

-            if request.output_format.lower() == "json":
-                data = response.json()
-                if isinstance(data, dict):
-                    code = data.get("code")
-                    if code is not None and code != 200:
-                        msg = extract_error_message(data)
-                        raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
-                return parse_json_response(data)
+        # Map to the API-specific field names (new_...)
+        payload = {
+            "proxy_type": str(pt),
+            "username": username,  # who to update
+            "new_username": target_username,  # required field
+            "new_password": password,  # required field
+            "new_traffic_limit": limit_val,  # required field
+            "new_status": status_val,  # required field
+        }

-            return {"html": response.text}
+        response = self._api_request_with_retry(
+            "POST",
+            f"{self._proxy_users_url}/update-user",
+            data=payload,
+            headers=headers,
+        )
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code("Update user failed", code=data.get("code"), payload=data)
+        return data.get("data", {})

-        except requests.Timeout as e:
-            raise ThordataTimeoutError(f"SERP timeout: {e}", original_error=e) from e
-        except requests.RequestException as e:
-            raise ThordataNetworkError(f"SERP failed: {e}", original_error=e) from e
+    def delete_proxy_user(
+        self,
+        username: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> dict[str, Any]:
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        headers = build_public_api_headers(str(self.public_token), str(self.public_key))
+        payload = {"proxy_type": str(pt), "username": username}
+        response = self._api_request_with_retry(
+            "POST",
+            f"{self._proxy_users_url}/delete-user",
+            data=payload,
+            headers=headers,
+        )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code("Delete user failed", code=data.get("code"), payload=data)
+        return data.get("data", {})

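A minimal usage sketch of the user-management surface above, assuming the constructor keywords match the attribute names used in the code (credentials and user names below are placeholders):

    from thordata import ThordataClient
    from thordata.types import ProxyType

    client = ThordataClient(public_token="...", public_key="...")

    # Create, update (no rename, so new_username is omitted), then delete.
    client.create_proxy_user("demo-user", "demo-pass", proxy_type=ProxyType.RESIDENTIAL)
    client.update_proxy_user("demo-user", "new-pass", traffic_limit=1024)
    client.delete_proxy_user("demo-user", proxy_type=ProxyType.RESIDENTIAL)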
     # =========================================================================
-    # Universal Scraping API
+    # Whitelist IP Management
     # =========================================================================

-    def universal_scrape(
+    def add_whitelist_ip(
         self,
-        url: str,
-        *,
-        js_render: bool = False,
-        output_format: str = "html",
-        country: str | None = None,
-        block_resources: str | None = None,
-        wait: int | None = None,
-        wait_for: str | None = None,
-        **kwargs: Any,
-    ) -> str | bytes:
-        request = UniversalScrapeRequest(
-            url=url,
-            js_render=js_render,
-            output_format=output_format,
-            country=country,
-            block_resources=block_resources,
-            wait=wait,
-            wait_for=wait_for,
-            extra_params=kwargs,
+        ip: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+        status: bool = True,
+    ) -> dict[str, Any]:
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        headers = build_public_api_headers(str(self.public_token), str(self.public_key))
+        payload = {
+            "proxy_type": str(pt),
+            "ip": ip,
+            "status": "true" if status else "false",
+        }
+        response = self._api_request_with_retry(
+            "POST", f"{self._whitelist_url}/add-ip", data=payload, headers=headers
         )
-        return self.universal_scrape_advanced(request)
-
-    def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
-        if not self.scraper_token:
-            raise ThordataConfigError("scraper_token is required for Universal API")
-
-        payload = request.to_payload()
-        headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
-
-        logger.info(f"Universal Scrape: {request.url}")
-
-        try:
-            response = self._api_request_with_retry(
-                "POST",
-                self._universal_url,
-                data=payload,
-                headers=headers,
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code(
+                "Add whitelist IP failed", code=data.get("code"), payload=data
             )
-            response.raise_for_status()
-            return self._process_universal_response(response, request.output_format)
-
-        except requests.Timeout as e:
-            raise ThordataTimeoutError(
-                f"Universal timeout: {e}", original_error=e
-            ) from e
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"Universal failed: {e}", original_error=e
-            ) from e
-
-    def _process_universal_response(
-        self, response: requests.Response, output_format: str
-    ) -> str | bytes:
-        try:
-            resp_json = response.json()
-        except ValueError:
-            return response.content if output_format.lower() == "png" else response.text
+        return data.get("data", {})

-        if isinstance(resp_json, dict):
-            code = resp_json.get("code")
-            if code is not None and code != 200:
-                msg = extract_error_message(resp_json)
-                raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
+    def delete_whitelist_ip(
+        self,
+        ip: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> dict[str, Any]:
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        headers = build_public_api_headers(str(self.public_token), str(self.public_key))
+        payload = {"proxy_type": str(pt), "ip": ip}
+        response = self._api_request_with_retry(
+            "POST", f"{self._whitelist_url}/delete-ip", data=payload, headers=headers
+        )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code(
+                "Delete whitelist IP failed", code=data.get("code"), payload=data
+            )
+        return data.get("data", {})

-        if "html" in resp_json:
-            return resp_json["html"]
-        if "png" in resp_json:
-            return decode_base64_image(resp_json["png"])
+    def list_whitelist_ips(
+        self,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> list[str]:
+        self._require_public_credentials()
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        params = {
+            "token": self.public_token,
+            "key": self.public_key,
+            "proxy_type": str(pt),
+        }
+        response = self._api_request_with_retry(
+            "GET", f"{self._whitelist_url}/ip-list", params=params
+        )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("code") != 200:
+            raise_for_code(
+                "List whitelist IPs failed", code=data.get("code"), payload=data
+            )

-        return str(resp_json)
+        items = data.get("data", []) or []
+        result = []
+        for item in items:
+            if isinstance(item, str):
+                result.append(item)
+            elif isinstance(item, dict) and "ip" in item:
+                result.append(str(item["ip"]))
+            else:
+                result.append(str(item))
+        return result

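The whitelist trio follows the same form-encoded POST plus code == 200 envelope; list_whitelist_ips additionally normalizes string items, {"ip": ...} dicts, and anything else into plain strings. A sketch with a placeholder address, reusing the client from the earlier sketch:

    client.add_whitelist_ip("203.0.113.7", proxy_type=ProxyType.RESIDENTIAL)
    ips = client.list_whitelist_ips(ProxyType.RESIDENTIAL)  # e.g. ["203.0.113.7"]
    client.delete_whitelist_ip("203.0.113.7")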
 
1200
1100
  # =========================================================================
1201
- # Web Scraper API (Tasks)
1101
+ # Locations & ASN Methods
1202
1102
  # =========================================================================
1203
1103
 
1204
-    def create_scraper_task(
-        self,
-        file_name: str,
-        spider_id: str,
-        spider_name: str,
-        parameters: dict[str, Any],
-        universal_params: dict[str, Any] | None = None,
-    ) -> str:
-        config = ScraperTaskConfig(
-            file_name=file_name,
-            spider_id=spider_id,
-            spider_name=spider_name,
-            parameters=parameters,
-            universal_params=universal_params,
-        )
-        return self.create_scraper_task_advanced(config)
-
-    def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
+    def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
         self._require_public_credentials()
-        if not self.scraper_token:
-            raise ThordataConfigError("scraper_token is required for Task Builder")
-        payload = config.to_payload()
-        headers = build_builder_headers(
-            self.scraper_token, self.public_token or "", self.public_key or ""
+        params = {"token": self.public_token, "key": self.public_key}
+        for k, v in kwargs.items():
+            params[k] = str(v)
+
+        response = self._api_request_with_retry(
+            "GET", f"{self._locations_base_url}/{endpoint}", params=params
         )
+        response.raise_for_status()
+        data = response.json()

-        try:
-            response = self._api_request_with_retry(
-                "POST", self._builder_url, data=payload, headers=headers
-            )
-            response.raise_for_status()
-            data = response.json()
+        if isinstance(data, dict):
             if data.get("code") != 200:
-                raise_for_code(
-                    "Task creation failed", code=data.get("code"), payload=data
-                )
-            return data["data"]["task_id"]
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"Task creation failed: {e}", original_error=e
-            ) from e
+                raise RuntimeError(f"Locations error: {data.get('msg')}")
+            return data.get("data") or []
+        return data if isinstance(data, list) else []

-    def create_video_task(
+    def list_countries(
+        self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
+    ) -> list[dict[str, Any]]:
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        return self._get_locations("countries", proxy_type=pt)
+
+    def list_states(
         self,
-        file_name: str,
-        spider_id: str,
-        spider_name: str,
-        parameters: dict[str, Any],
-        common_settings: CommonSettings,
-    ) -> str:
-        config = VideoTaskConfig(
-            file_name=file_name,
-            spider_id=spider_id,
-            spider_name=spider_name,
-            parameters=parameters,
-            common_settings=common_settings,
-        )
-        return self.create_video_task_advanced(config)
+        country_code: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> list[dict[str, Any]]:
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        return self._get_locations("states", proxy_type=pt, country_code=country_code)

-    def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
-        self._require_public_credentials()
-        if not self.scraper_token:
-            raise ThordataConfigError(
-                "scraper_token is required for Video Task Builder"
-            )
+    def list_cities(
+        self,
+        country_code: str,
+        state_code: str | None = None,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> list[dict[str, Any]]:
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        kwargs = {"proxy_type": pt, "country_code": country_code}
+        if state_code:
+            kwargs["state_code"] = state_code
+        return self._get_locations("cities", **kwargs)

-        payload = config.to_payload()
-        headers = build_builder_headers(
-            self.scraper_token, self.public_token or "", self.public_key or ""
-        )
+    def list_asn(
+        self,
+        country_code: str,
+        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
+    ) -> list[dict[str, Any]]:
+        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        return self._get_locations("asn", proxy_type=pt, country_code=country_code)
+
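All four wrappers funnel into _get_locations, which varies only the endpoint name and query parameters. The drill-down, with "US" and "CA" as placeholder codes:

    countries = client.list_countries(ProxyType.RESIDENTIAL)
    states = client.list_states("US")
    cities = client.list_cities("US", state_code="CA")
    asns = client.list_asn("US")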
+    # =========================================================================
+    # ISP & Datacenter Proxy Management
+    # =========================================================================

+    def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
+        self._require_public_credentials()
+        params = {
+            "token": self.public_token,
+            "key": self.public_key,
+            "proxy_type": str(proxy_type),
+        }
         response = self._api_request_with_retry(
-            "POST", self._video_builder_url, data=payload, headers=headers
+            "GET", self._proxy_list_url, params=params
         )
         response.raise_for_status()
         data = response.json()
         if data.get("code") != 200:
             raise_for_code(
-                "Video task creation failed", code=data.get("code"), payload=data
-            )
-        return data["data"]["task_id"]
-
-    def get_task_status(self, task_id: str) -> str:
-        self._require_public_credentials()
-        headers = build_public_api_headers(
-            self.public_token or "", self.public_key or ""
-        )
-        try:
-            response = self._api_request_with_retry(
-                "POST",
-                self._status_url,
-                data={"tasks_ids": task_id},
-                headers=headers,
+                "List proxy servers error", code=data.get("code"), payload=data
             )
-            response.raise_for_status()
-            data = response.json()
-            if data.get("code") != 200:
-                raise_for_code("Task status error", code=data.get("code"), payload=data)
-
-            items = data.get("data") or []
-            for item in items:
-                if str(item.get("task_id")) == str(task_id):
-                    return item.get("status", "unknown")
-            return "unknown"
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"Status check failed: {e}", original_error=e
-            ) from e
-
-    def safe_get_task_status(self, task_id: str) -> str:
-        try:
-            return self.get_task_status(task_id)
-        except Exception:
-            return "error"

-    def get_task_result(self, task_id: str, file_type: str = "json") -> str:
-        self._require_public_credentials()
-        headers = build_public_api_headers(
-            self.public_token or "", self.public_key or ""
-        )
-        try:
-            response = self._api_request_with_retry(
-                "POST",
-                self._download_url,
-                data={"tasks_id": task_id, "type": file_type},
-                headers=headers,
-            )
-            response.raise_for_status()
-            data = response.json()
-            if data.get("code") == 200 and data.get("data"):
-                return data["data"]["download"]
-            raise_for_code("Get result failed", code=data.get("code"), payload=data)
-            return ""
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"Get result failed: {e}", original_error=e
-            ) from e
+        server_list = []
+        if isinstance(data, dict):
+            server_list = data.get("data", data.get("list", []))
+        elif isinstance(data, list):
+            server_list = data
+        return [ProxyServer.from_dict(s) for s in server_list]

-    def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
+    def get_proxy_expiration(
+        self, ips: str | list[str], proxy_type: int
+    ) -> dict[str, Any]:
         self._require_public_credentials()
-        headers = build_public_api_headers(
-            self.public_token or "", self.public_key or ""
-        )
+        if isinstance(ips, list):
+            ips = ",".join(ips)
+        params = {
+            "token": self.public_token,
+            "key": self.public_key,
+            "proxy_type": str(proxy_type),
+            "ips": ips,
+        }
         response = self._api_request_with_retry(
-            "POST",
-            self._list_url,
-            data={"page": str(page), "size": str(size)},
-            headers=headers,
+            "GET", self._proxy_expiration_url, params=params
         )
         response.raise_for_status()
         data = response.json()
         if data.get("code") != 200:
-            raise_for_code("List tasks failed", code=data.get("code"), payload=data)
-        return data.get("data", {"count": 0, "list": []})
+            raise_for_code("Get expiration error", code=data.get("code"), payload=data)
+        return data.get("data", data)

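Unlike the residential helpers, these two take the raw integer proxy_type that the Public API expects; get_proxy_expiration also accepts a list and joins it with commas. A sketch in which the type code and addresses are purely illustrative:

    servers = client.list_proxy_servers(proxy_type=3)
    expiry = client.get_proxy_expiration(["198.51.100.10", "198.51.100.11"], proxy_type=3)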
-    def wait_for_task(
-        self,
-        task_id: str,
-        *,
-        poll_interval: float = 5.0,
-        max_wait: float = 600.0,
-    ) -> str:
-        import time
+    # =========================================================================
+    # Helpers needed for compatibility
+    # =========================================================================

-        start = time.monotonic()
-        while (time.monotonic() - start) < max_wait:
-            status = self.get_task_status(task_id)
-            if status.lower() in {
-                "ready",
-                "success",
-                "finished",
-                "failed",
-                "error",
-                "cancelled",
-            }:
-                return status
-            time.sleep(poll_interval)
-        raise TimeoutError(f"Task {task_id} timeout")
+    def _process_universal_response(
+        self, response: requests.Response, output_format: str
+    ) -> str | bytes:
+        try:
+            resp_json = response.json()
+        except ValueError:
+            return response.content if output_format.lower() == "png" else response.text

-    def run_task(
-        self,
-        file_name: str,
-        spider_id: str,
-        spider_name: str,
-        parameters: dict[str, Any],
-        universal_params: dict[str, Any] | None = None,
-        *,
-        max_wait: float = 600.0,
-        initial_poll_interval: float = 2.0,
-        max_poll_interval: float = 10.0,
-        include_errors: bool = True,
+        if isinstance(resp_json, dict):
+            code = resp_json.get("code")
+            if code is not None and code != 200:
+                msg = extract_error_message(resp_json)
+                raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
+
+        if "html" in resp_json:
+            return resp_json["html"]
+        if "png" in resp_json:
+            return decode_base64_image(resp_json["png"])
+        return str(resp_json)
+
+    def get_browser_connection_url(
+        self, username: str | None = None, password: str | None = None
     ) -> str:
-        """
-        High-level wrapper to Run a Web Scraper task and wait for the result download URL.
+        # Use browser credentials only; do not fall back to residential credentials.
+        user = username or os.getenv("THORDATA_BROWSER_USERNAME")
+        pwd = password or os.getenv("THORDATA_BROWSER_PASSWORD")

-        This method handles the entire lifecycle:
-        1. Create Task
-        2. Poll status (with exponential backoff)
-        3. Get download URL when ready
+        if not user or not pwd:
+            raise ThordataConfigError(
+                "Browser credentials missing. Set THORDATA_BROWSER_USERNAME/PASSWORD or pass arguments."
+            )
+        prefix = "td-customer-"
+        final_user = f"{prefix}{user}" if not user.startswith(prefix) else user

-        Args:
-            file_name: Name for the output file.
-            spider_id: Spider identifier from Dashboard.
-            spider_name: Spider name (target domain).
-            parameters: Spider-specific parameters.
-            universal_params: Global spider settings.
-            max_wait: Maximum seconds to wait for task completion (default 600).
-            initial_poll_interval: Starting poll interval in seconds.
-            max_poll_interval: Maximum poll interval cap.
-            include_errors: Whether to include error logs in the task result.
-
-        Returns:
-            str: The download URL for the task result (default JSON).
-
-        Raises:
-            ThordataTimeoutError: If task takes longer than max_wait.
-            ThordataAPIError: If task fails or is cancelled.
-        """
-        import time
+        from urllib.parse import quote

-        # 1. Create Task
-        config = ScraperTaskConfig(
-            file_name=file_name,
-            spider_id=spider_id,
-            spider_name=spider_name,
-            parameters=parameters,
-            universal_params=universal_params,
-            include_errors=include_errors,
-        )
-        task_id = self.create_scraper_task_advanced(config)
-        logger.info(f"Task created successfully: {task_id}. Waiting for completion...")
+        safe_user = quote(final_user, safe="")
+        safe_pass = quote(pwd, safe="")

-        # 2. Poll Status (Smart Backoff)
-        start_time = time.monotonic()
-        current_poll = initial_poll_interval
+        return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"

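The returned wss:// URL carries percent-encoded credentials and the td-customer- prefix. The diff only builds the URL, so the consumer below is an assumption: any CDP-capable driver should work, with Playwright shown for illustration:

    from playwright.sync_api import sync_playwright

    url = client.get_browser_connection_url(username="my_user", password="my_pass")
    with sync_playwright() as p:
        browser = p.chromium.connect_over_cdp(url)  # assumes the endpoint speaks CDP
        page = browser.new_page()
        page.goto("https://example.com")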
-        while (time.monotonic() - start_time) < max_wait:
-            status = self.get_task_status(task_id)
-            status_lower = status.lower()
+    # =========================================================================
+    # Proxy Internal Logic
+    # =========================================================================

-            if status_lower in {"ready", "success", "finished"}:
-                logger.info(f"Task {task_id} finished. Status: {status}")
-                # 3. Get Result
-                return self.get_task_result(task_id)
+    def _proxy_verb(
+        self,
+        method: str,
+        url: str,
+        proxy_config: ProxyConfig | None,
+        timeout: int | None,
+        **kwargs: Any,
+    ) -> requests.Response:
+        timeout = timeout or self._default_timeout
+        if proxy_config is None:
+            proxy_config = self._get_default_proxy_config_from_env()
+        if proxy_config is None:
+            raise ThordataConfigError("Proxy credentials are missing.")

-            if status_lower in {"failed", "error", "cancelled"}:
-                raise ThordataNetworkError(
-                    f"Task {task_id} ended with failed status: {status}"
-                )
+        kwargs.pop("proxies", None)

-            # Wait and increase interval (capped)
-            time.sleep(current_poll)
-            current_poll = min(current_poll * 1.5, max_poll_interval)
+        @with_retry(self._retry_config)
+        def _do() -> requests.Response:
+            return self._proxy_request_with_proxy_manager(
+                method,
+                url,
+                proxy_config=cast(ProxyConfig, proxy_config),
+                timeout=cast(int, timeout),
+                headers=kwargs.pop("headers", None),
+                params=kwargs.pop("params", None),
+                data=kwargs.pop("data", None),
+            )

-        raise ThordataTimeoutError(f"Task {task_id} timed out after {max_wait} seconds")
+        try:
+            return _do()
+        except Exception as e:
+            raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e

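_proxy_verb funnels every verb through a zero-argument closure decorated with with_retry, then converts any residual failure into ThordataNetworkError. The same decorate-a-closure pattern in isolation (constructing RetryConfig with defaults is an assumption):

    import requests

    from thordata.retry import RetryConfig, with_retry

    def fetch(url: str) -> requests.Response:
        @with_retry(RetryConfig())  # assumed default-constructible
        def _do() -> requests.Response:
            return requests.get(url, timeout=10)

        return _do()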
-    # =========================================================================
-    # Account / Locations / Utils
-    # =========================================================================
+    def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
+        if not userpass:
+            return proxy_endpoint
+        h = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
+        return f"{proxy_endpoint}|auth={h}"

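The key derivation keeps raw credentials out of cache keys while still giving each endpoint-plus-credential pair its own pool. The same computation stand-alone (endpoint and credentials are placeholders):

    import hashlib

    endpoint = "http://pr.thordata.net:9999"
    userpass = "alice:secret"
    digest = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
    cache_key = f"{endpoint}|auth={digest}"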
-    def get_usage_statistics(
+    def _get_proxy_manager(
         self,
-        from_date: str | date,
-        to_date: str | date,
-    ) -> UsageStatistics:
-        self._require_public_credentials()
-        if isinstance(from_date, date):
-            from_date = from_date.strftime("%Y-%m-%d")
-        if isinstance(to_date, date):
-            to_date = to_date.strftime("%Y-%m-%d")
+        proxy_url: str,
+        *,
+        cache_key: str,
+        proxy_headers: dict[str, str] | None = None,
+    ) -> urllib3.PoolManager:
+        cached = self._proxy_managers.get(cache_key)
+        if cached is not None:
+            return cached
+
+        if proxy_url.startswith(("socks5://", "socks5h://", "socks4://", "socks4a://")):
+            if not HAS_PYSOCKS:
+                raise ThordataConfigError(
+                    "SOCKS support requires PySocks/urllib3[socks]"
+                )
+            from urllib3.contrib.socks import SOCKSProxyManager
+
+            pm = cast(
+                urllib3.PoolManager,
+                SOCKSProxyManager(proxy_url, num_pools=10, maxsize=10),
+            )
+            self._proxy_managers[cache_key] = pm
+            return pm
+
+        proxy_ssl_context = (
+            ssl.create_default_context() if proxy_url.startswith("https://") else None
+        )
+        pm = urllib3.ProxyManager(
+            proxy_url,
+            proxy_headers=proxy_headers,
+            proxy_ssl_context=proxy_ssl_context,
+            num_pools=10,
+            maxsize=10,
+        )
+        self._proxy_managers[cache_key] = pm
+        return pm
+
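Each cache entry is a pooled urllib3 manager bound to one endpoint and one credential set. What a single HTTP entry amounts to, against a placeholder proxy:

    import urllib3

    pm = urllib3.ProxyManager(
        "http://proxy.example.com:8080",  # placeholder endpoint
        proxy_headers=urllib3.make_headers(proxy_basic_auth="user:pass"),
        num_pools=10,
        maxsize=10,
    )
    resp = pm.request("GET", "http://httpbin.org/ip", retries=False)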
+    def _proxy_request_with_proxy_manager(
+        self,
+        method: str,
+        url: str,
+        *,
+        proxy_config: ProxyConfig,
+        timeout: int,
+        headers: dict[str, str] | None = None,
+        params: dict[str, Any] | None = None,
+        data: Any = None,
+    ) -> requests.Response:
+        upstream = _parse_upstream_proxy()
+        if upstream:
+            return self._proxy_request_with_upstream(
+                method,
+                url,
+                proxy_config=proxy_config,
+                timeout=timeout,
+                headers=headers,
+                params=params,
+                data=data,
+                upstream_config=upstream,
+            )
+
+        req = requests.Request(method=method.upper(), url=url, params=params)
+        prepped = self._proxy_session.prepare_request(req)
+        final_url = prepped.url or url
+
+        proxy_endpoint = proxy_config.build_proxy_endpoint()
+        is_socks = proxy_endpoint.startswith(("socks",))
+
+        if is_socks:
+            proxy_url_for_manager = proxy_config.build_proxy_url()
+            cache_key = proxy_url_for_manager
+            pm = self._get_proxy_manager(proxy_url_for_manager, cache_key=cache_key)
+            req_headers = dict(headers or {})
+        else:
+            userpass = proxy_config.build_proxy_basic_auth()
+            proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
+            cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
+            pm = self._get_proxy_manager(
+                proxy_endpoint, cache_key=cache_key, proxy_headers=dict(proxy_headers)
+            )
+            req_headers = dict(headers or {})
+
+        body = None
+        if data is not None:
+            if isinstance(data, dict):
+                body = urlencode({k: str(v) for k, v in data.items()})
+                req_headers.setdefault(
+                    "Content-Type", "application/x-www-form-urlencoded"
+                )
+            else:
+                body = data

-        params = {
-            "token": self.public_token,
-            "key": self.public_key,
-            "from_date": from_date,
-            "to_date": to_date,
-        }
-        response = self._api_request_with_retry(
-            "GET", self._usage_stats_url, params=params
+        http_resp = pm.request(
+            method.upper(),
+            final_url,
+            body=body,
+            headers=req_headers or None,
+            timeout=urllib3.Timeout(connect=timeout, read=timeout),
+            retries=False,
+            preload_content=True,
         )
-        response.raise_for_status()
-        data = response.json()
-        if data.get("code") != 200:
-            raise_for_code("Usage stats error", code=data.get("code"), payload=data)
-        return UsageStatistics.from_dict(data.get("data", data))

-    def list_proxy_users(
-        self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
-    ) -> ProxyUserList:
-        self._require_public_credentials()
-        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
-        params = {
-            "token": self.public_token,
-            "key": self.public_key,
-            "proxy_type": str(pt),
-        }
-        response = self._api_request_with_retry(
-            "GET", f"{self._proxy_users_url}/user-list", params=params
-        )
-        response.raise_for_status()
-        data = response.json()
-        if data.get("code") != 200:
-            raise_for_code("List users error", code=data.get("code"), payload=data)
-        return ProxyUserList.from_dict(data.get("data", data))
+        r = requests.Response()
+        r.status_code = int(getattr(http_resp, "status", 0))
+        r._content = http_resp.data or b""
+        r.url = final_url
+        r.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
+        return r

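The tail of the method adapts the raw urllib3 response into a requests.Response, so callers and the retry layer see a single response type on every code path. The adaptation step on its own:

    import requests
    from requests.structures import CaseInsensitiveDict

    r = requests.Response()
    r.status_code = 200
    r._content = b'{"ok": true}'
    r.url = "https://example.com/"
    r.headers = CaseInsensitiveDict({"Content-Type": "application/json"})
    assert r.json() == {"ok": True}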
- def create_proxy_user(
1403
+ def _proxy_request_with_upstream(
1510
1404
  self,
1511
- username: str,
1512
- password: str,
1513
- proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1514
- traffic_limit: int = 0,
1515
- status: bool = True,
1516
- ) -> dict[str, Any]:
1517
- self._require_public_credentials()
1518
- pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1519
- headers = build_public_api_headers(
1520
- self.public_token or "", self.public_key or ""
1521
- )
1522
- payload = {
1523
- "proxy_type": str(pt),
1524
- "username": username,
1525
- "password": password,
1526
- "traffic_limit": str(traffic_limit),
1527
- "status": "true" if status else "false",
1528
- }
1529
- response = self._api_request_with_retry(
1530
- "POST",
1531
- f"{self._proxy_users_url}/create-user",
1532
- data=payload,
1533
- headers=headers,
1534
- )
1535
- response.raise_for_status()
1536
- data = response.json()
1537
- if data.get("code") != 200:
1538
- raise_for_code("Create user failed", code=data.get("code"), payload=data)
1539
- return data.get("data", {})
1405
+ method: str,
1406
+ url: str,
1407
+ *,
1408
+ proxy_config: ProxyConfig,
1409
+ timeout: int,
1410
+ headers: dict[str, str] | None = None,
1411
+ params: dict[str, Any] | None = None,
1412
+ data: Any = None,
1413
+ upstream_config: dict[str, Any],
1414
+ ) -> requests.Response:
1415
+ if not HAS_PYSOCKS:
1416
+ raise ThordataConfigError("PySocks required for upstream proxy support.")
1540
1417
 
1541
- def add_whitelist_ip(
1542
- self,
1543
- ip: str,
1544
- proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1545
- status: bool = True,
1546
- ) -> dict[str, Any]:
1547
- self._require_public_credentials()
1548
- pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1549
- headers = build_public_api_headers(
1550
- self.public_token or "", self.public_key or ""
1551
- )
1552
- payload = {
1553
- "proxy_type": str(pt),
1554
- "ip": ip,
1555
- "status": "true" if status else "false",
1556
- }
1557
- response = self._api_request_with_retry(
1558
- "POST", f"{self._whitelist_url}/add-ip", data=payload, headers=headers
1418
+ req = requests.Request(method=method.upper(), url=url, params=params)
1419
+ prepped = self._proxy_session.prepare_request(req)
1420
+ final_url = prepped.url or url
1421
+
1422
+ parsed_target = urlparse(final_url)
1423
+ target_host = parsed_target.hostname or ""
1424
+ target_port = parsed_target.port or (
1425
+ 443 if parsed_target.scheme == "https" else 80
1559
1426
  )
1560
- response.raise_for_status()
1561
- data = response.json()
1562
- if data.get("code") != 200:
1563
- raise_for_code(
1564
- "Add whitelist IP failed", code=data.get("code"), payload=data
1565
- )
1566
- return data.get("data", {})
1567
1427
 
1568
- def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
1569
- self._require_public_credentials()
1570
- params = {
1571
- "token": self.public_token,
1572
- "key": self.public_key,
1573
- "proxy_type": str(proxy_type),
1574
- }
1575
- response = self._api_request_with_retry(
1576
- "GET", self._proxy_list_url, params=params
1428
+ thordata_host = proxy_config.host or "pr.thordata.net"
1429
+ thordata_port = proxy_config.port or 9999
1430
+ thordata_user = proxy_config.build_username()
1431
+ thordata_pass = proxy_config.password
1432
+
1433
+ # 1. Connect to Upstream -> Thordata Node
1434
+ factory = UpstreamProxySocketFactory(upstream_config)
1435
+ raw_sock = factory.create_connection(
1436
+ (thordata_host, thordata_port),
1437
+ timeout=float(timeout),
1577
1438
  )
1578
- response.raise_for_status()
1579
- data = response.json()
1580
- if data.get("code") != 200:
1581
- raise_for_code(
1582
- "List proxy servers error", code=data.get("code"), payload=data
1583
- )
1584
1439
 
1585
- server_list = []
1586
- if isinstance(data, dict):
1587
- server_list = data.get("data", data.get("list", []))
1588
- elif isinstance(data, list):
1589
- server_list = data
1440
+ try:
1441
+ protocol = proxy_config.protocol.lower().replace("socks5", "socks5h")
1590
1442
 
1591
- return [ProxyServer.from_dict(s) for s in server_list]
1443
+ # 2. Handshake with Thordata
1444
+ if protocol.startswith("socks"):
1445
+ sock = socks5_handshake(
1446
+ raw_sock, target_host, target_port, thordata_user, thordata_pass
1447
+ )
1448
+ if parsed_target.scheme == "https":
1449
+ ctx = ssl.create_default_context()
1450
+ sock = ctx.wrap_socket(sock, server_hostname=target_host)
1451
+ else:
1452
+ # HTTP/HTTPS Tunnel
1453
+ if protocol == "https":
1454
+ ctx = ssl.create_default_context()
1455
+ sock = ctx.wrap_socket(raw_sock, server_hostname=thordata_host)
1456
+ else:
1457
+ sock = raw_sock
1592
1458
 
1593
- def get_proxy_expiration(
1594
- self, ips: str | list[str], proxy_type: int
1595
- ) -> dict[str, Any]:
1596
- self._require_public_credentials()
1597
- if isinstance(ips, list):
1598
- ips = ",".join(ips)
1599
- params = {
1600
- "token": self.public_token,
1601
- "key": self.public_key,
1602
- "proxy_type": str(proxy_type),
1603
- "ips": ips,
1604
- }
1605
- response = self._api_request_with_retry(
1606
- "GET", self._proxy_expiration_url, params=params
1607
- )
1608
- response.raise_for_status()
1609
- data = response.json()
1610
- if data.get("code") != 200:
1611
- raise_for_code("Get expiration error", code=data.get("code"), payload=data)
1612
- return data.get("data", data)
1459
+ # CONNECT to Thordata
1460
+ connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
1461
+ connect_req += f"Host: {target_host}:{target_port}\r\n"
1462
+ auth = base64.b64encode(
1463
+ f"{thordata_user}:{thordata_pass}".encode()
1464
+ ).decode()
1465
+ connect_req += f"Proxy-Authorization: Basic {auth}\r\n\r\n"
1466
+ sock.sendall(connect_req.encode())
1467
+
1468
+ resp = b""
1469
+ while b"\r\n\r\n" not in resp:
1470
+ resp += sock.recv(1024)
1471
+ if b"200" not in resp.split(b"\r\n")[0]:
1472
+ raise ConnectionError("Thordata CONNECT failed")
1473
+
1474
+ # 3. If Target is HTTPS, wrap TLS inside the tunnel
1475
+ if parsed_target.scheme == "https":
1476
+ if isinstance(sock, ssl.SSLSocket):
1477
+ sock = cast(
1478
+ socket.socket,
1479
+ create_tls_in_tls(sock, target_host, float(timeout)),
1480
+ )
1481
+ else:
1482
+ ctx = ssl.create_default_context()
1483
+ sock = ctx.wrap_socket(sock, server_hostname=target_host)
1484
+
1485
+ # 4. Send actual Request
1486
+ return self._send_http_via_socket(
1487
+ sock, method, parsed_target, headers, data, final_url, timeout
1488
+ )
1613
1489
 
1614
- def list_countries(
1615
- self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
1616
- ) -> list[dict[str, Any]]:
1617
- pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1618
- return self._get_locations("countries", proxy_type=pt)
1490
+ except Exception:
1491
+ raw_sock.close()
1492
+ raise
1619
1493
 
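Steps 1-4 chain two proxies: the upstream hop carries a tunnel to the Thordata node, which in turn tunnels to the target, nesting TLS where needed. A single CONNECT hop reduced to stdlib calls (host, port, and credentials are placeholders):

    import base64
    import socket
    import ssl

    raw = socket.create_connection(("proxy.example.com", 8080), timeout=10)
    token = base64.b64encode(b"user:pass").decode()
    raw.sendall(
        (
            "CONNECT example.com:443 HTTP/1.1\r\n"
            "Host: example.com:443\r\n"
            f"Proxy-Authorization: Basic {token}\r\n\r\n"
        ).encode()
    )
    reply = b""
    while b"\r\n\r\n" not in reply:
        reply += raw.recv(1024)
    if b"200" not in reply.split(b"\r\n")[0]:
        raise ConnectionError("CONNECT refused")
    # TLS toward the target starts only after the proxy answers 200.
    tls = ssl.create_default_context().wrap_socket(raw, server_hostname="example.com")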
-    def list_states(
+    def _send_http_via_socket(
         self,
-        country_code: str,
-        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
-    ) -> list[dict[str, Any]]:
-        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
-        return self._get_locations("states", proxy_type=pt, country_code=country_code)
+        sock: socket.socket | Any,  # Fix for TLSInTLSSocket typing issue
+        method: str,
+        parsed: Any,
+        headers: Any,
+        data: Any,
+        final_url: str,
+        timeout: int,
+    ) -> requests.Response:
+        req_headers = dict(headers or {})
+        req_headers.setdefault("Host", parsed.hostname)
+        req_headers.setdefault("User-Agent", "python-thordata-sdk")
+        req_headers.setdefault("Connection", "close")

-    def list_cities(
-        self,
-        country_code: str,
-        state_code: str | None = None,
-        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
-    ) -> list[dict[str, Any]]:
-        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
-        kwargs = {"proxy_type": pt, "country_code": country_code}
-        if state_code:
-            kwargs["state_code"] = state_code
-        return self._get_locations("cities", **kwargs)
+        path = parsed.path or "/"
+        if parsed.query:
+            path += f"?{parsed.query}"

-    def list_asn(
-        self,
-        country_code: str,
-        proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
-    ) -> list[dict[str, Any]]:
-        pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
-        return self._get_locations("asn", proxy_type=pt, country_code=country_code)
+        msg = f"{method} {path} HTTP/1.1\r\n"
+        for k, v in req_headers.items():
+            msg += f"{k}: {v}\r\n"

-    def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
-        self._require_public_credentials()
-        params = {"token": self.public_token, "key": self.public_key}
-        for k, v in kwargs.items():
-            params[k] = str(v)
+        body = b""
+        if data:
+            if isinstance(data, dict):
+                body = urlencode(data).encode()
+                msg += "Content-Type: application/x-www-form-urlencoded\r\n"
+            elif isinstance(data, bytes):
+                body = data
+            else:
+                body = str(data).encode()
+            msg += f"Content-Length: {len(body)}\r\n"

-        response = self._api_request_with_retry(
-            "GET", f"{self._locations_base_url}/{endpoint}", params=params
-        )
-        response.raise_for_status()
-        data = response.json()
-        if isinstance(data, dict):
-            if data.get("code") != 200:
-                raise RuntimeError(f"Locations error: {data.get('msg')}")
-            return data.get("data") or []
-        return data if isinstance(data, list) else []
+        msg += "\r\n"
+        sock.sendall(msg.encode())
+        if body:
+            sock.sendall(body)

-    def _require_public_credentials(self) -> None:
-        if not self.public_token or not self.public_key:
-            raise ThordataConfigError(
-                "public_token and public_key are required for this operation."
-            )
+        # Read Response
+        resp_data = b""
+        while True:
+            try:
+                chunk = sock.recv(4096)
+                if not chunk:
+                    break
+                resp_data += chunk
+            except socket.timeout:
+                break
+
+        if b"\r\n\r\n" in resp_data:
+            head, content = resp_data.split(b"\r\n\r\n", 1)
+            status_line = head.split(b"\r\n")[0].decode()
+            try:
+                status_code = int(status_line.split(" ")[1])
+            except (ValueError, IndexError):
+                status_code = 0
+
+            r = requests.Response()
+            r.status_code = status_code
+            r._content = content
+            r.url = final_url
+            return r
+        raise ConnectionError("Empty response from socket")

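Because the request goes out with Connection: close, the read loop can simply drain the socket to EOF and split once on the blank line; no chunked decoding is attempted. The framing in miniature:

    raw_response = b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nok"
    head, content = raw_response.split(b"\r\n\r\n", 1)
    status_code = int(head.split(b"\r\n")[0].decode().split(" ")[1])
    assert (status_code, content) == (200, b"ok")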
     def _get_proxy_endpoint_overrides(
         self, product: ProxyProduct
@@ -1681,7 +1569,7 @@ class ThordataClient:
         protocol = (
             os.getenv(f"THORDATA_{prefix}_PROXY_PROTOCOL")
             or os.getenv("THORDATA_PROXY_PROTOCOL")
-            or "https"
+            or "http"
         )
         port = int(port_raw) if port_raw and port_raw.isdigit() else None
         return host or None, port, protocol
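Note the fallback protocol changes from "https" to "http" in 1.5.0; the environment override chain is unchanged. Pinning it explicitly (the RESIDENTIAL prefix is an assumption about how prefix is derived):

    import os

    os.environ["THORDATA_PROXY_PROTOCOL"] = "socks5"            # global override
    os.environ["THORDATA_RESIDENTIAL_PROXY_PROTOCOL"] = "http"  # per-product override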
@@ -1706,16 +1594,3 @@ class ThordataClient:
             protocol=proto,
         )
         return None
-
-    def close(self) -> None:
-        self._proxy_session.close()
-        self._api_session.close()
-        for pm in self._proxy_managers.values():
-            pm.clear()
-        self._proxy_managers.clear()
-
-    def __enter__(self) -> ThordataClient:
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
-        self.close()