thordata-sdk 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to the supported public registry. It is provided for informational purposes only.
thordata/client.py CHANGED
@@ -26,19 +26,21 @@ from __future__ import annotations
 import base64
 import contextlib
 import hashlib
+import json
 import logging
 import os
 import socket
 import ssl
 from datetime import date
 from typing import Any, cast
-from urllib.parse import urlencode, urlparse
+from urllib.parse import quote, urlencode, urlparse
 
 import requests
 import urllib3
 from requests.structures import CaseInsensitiveDict
 
 from .serp_engines import SerpNamespace
+from .unlimited import UnlimitedNamespace
 
 try:
     import socks
@@ -274,6 +276,8 @@ class _TLSInTLSSocket:
 
 
 class ThordataClient:
+    """Main client for interacting with Thordata API services."""
+
     # API Endpoints
     BASE_URL = "https://scraperapi.thordata.com"
     UNIVERSAL_URL = "https://universalapi.thordata.com"
@@ -282,7 +286,7 @@ class ThordataClient:
 
     def __init__(
         self,
-        scraper_token: str | None = None,  # Change: Optional
+        scraper_token: str | None = None,
         public_token: str | None = None,
         public_key: str | None = None,
         proxy_host: str = "pr.thordata.net",
@@ -296,9 +300,23 @@ class ThordataClient:
         web_scraper_api_base_url: str | None = None,
         locations_base_url: str | None = None,
     ) -> None:
-        """Initialize the Thordata Client."""
+        """Initialize the Thordata Client.
 
-        self.serp = SerpNamespace(self)
+        Args:
+            scraper_token: Token for SERP/Universal scraping APIs.
+            public_token: Public API token for account/management operations.
+            public_key: Public API key for account/management operations.
+            proxy_host: Default proxy host for residential proxies.
+            proxy_port: Default proxy port for residential proxies.
+            timeout: Default timeout for proxy requests.
+            api_timeout: Default timeout for API requests.
+            retry_config: Configuration for retry behavior.
+            auth_mode: Authentication mode for scraper_token ("bearer" or "header_token").
+            scraperapi_base_url: Override base URL for SERP API.
+            universalapi_base_url: Override base URL for Universal Scraping API.
+            web_scraper_api_base_url: Override base URL for Web Scraper API.
+            locations_base_url: Override base URL for Locations API.
+        """
 
         self.scraper_token = scraper_token
         self.public_token = public_token
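The documented constructor maps one-to-one onto keyword arguments. A minimal construction sketch, assuming the package exports ThordataClient from the top-level thordata module (token values are placeholders):

    from thordata import ThordataClient

    client = ThordataClient(
        scraper_token="YOUR_SCRAPER_TOKEN",  # SERP / Universal scraping APIs
        public_token="YOUR_PUBLIC_TOKEN",    # account / management APIs
        public_key="YOUR_PUBLIC_KEY",
        auth_mode="bearer",                  # or "header_token", per the docstring
    )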
@@ -388,6 +406,28 @@ class ThordataClient:
         self._proxy_list_url = f"{proxy_api_base}/proxy/proxy-list"
         self._proxy_expiration_url = f"{proxy_api_base}/proxy/expiration-time"
 
+        # Initialize Namespaces AFTER all base URLs are set
+        self.serp = SerpNamespace(self)
+        self.unlimited = UnlimitedNamespace(self)
+
+    # =========================================================================
+    # Context Manager
+    # =========================================================================
+
+    def close(self) -> None:
+        """Close the client and release resources."""
+        self._proxy_session.close()
+        self._api_session.close()
+        for pm in self._proxy_managers.values():
+            pm.clear()
+        self._proxy_managers.clear()
+
+    def __enter__(self) -> ThordataClient:
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()
+
     # =========================================================================
     # Proxy Network Methods
     # =========================================================================
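With close() plus __enter__/__exit__ added in 1.4.0, the client can now be used as a context manager, so the pooled sessions and any cached urllib3 proxy managers are released deterministically. A sketch building on the constructor above:

    with ThordataClient(scraper_token="YOUR_SCRAPER_TOKEN") as client:
        ...  # issue requests here
    # on exit, close() has run: sessions closed, proxy-manager pools cleared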
@@ -400,6 +440,17 @@ class ThordataClient:
         timeout: int | None = None,
         **kwargs: Any,
     ) -> requests.Response:
+        """Make a GET request through the proxy network.
+
+        Args:
+            url: Target URL to request.
+            proxy_config: Proxy configuration. If not provided, uses environment variables.
+            timeout: Request timeout in seconds.
+            **kwargs: Additional arguments passed to requests.
+
+        Returns:
+            Response object.
+        """
         logger.debug(f"Proxy GET request: {url}")
         return self._proxy_verb("GET", url, proxy_config, timeout, **kwargs)
 
@@ -411,50 +462,19 @@ class ThordataClient:
         timeout: int | None = None,
         **kwargs: Any,
     ) -> requests.Response:
-        logger.debug(f"Proxy POST request: {url}")
-        return self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)
-
-    def _proxy_verb(
-        self,
-        method: str,
-        url: str,
-        proxy_config: ProxyConfig | None,
-        timeout: int | None,
-        **kwargs: Any,
-    ) -> requests.Response:
-        timeout = timeout or self._default_timeout
-
-        if proxy_config is None:
-            proxy_config = self._get_default_proxy_config_from_env()
-
-        if proxy_config is None:
-            raise ThordataConfigError(
-                "Proxy credentials are missing. "
-                "Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
-            )
+        """Make a POST request through the proxy network.
 
-        kwargs.pop("proxies", None)
-
-        @with_retry(self._retry_config)
-        def _do() -> requests.Response:
-            return self._proxy_request_with_proxy_manager(
-                method,
-                url,
-                proxy_config=proxy_config,  # type: ignore
-                timeout=timeout,  # type: ignore
-                headers=kwargs.pop("headers", None),
-                params=kwargs.pop("params", None),
-                data=kwargs.pop("data", None),
-            )
+        Args:
+            url: Target URL to request.
+            proxy_config: Proxy configuration. If not provided, uses environment variables.
+            timeout: Request timeout in seconds.
+            **kwargs: Additional arguments passed to requests.
 
-        try:
-            return _do()
-        except requests.Timeout as e:
-            raise ThordataTimeoutError(
-                f"Request timed out: {e}", original_error=e
-            ) from e
-        except Exception as e:
-            raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
+        Returns:
+            Response object.
+        """
+        logger.debug(f"Proxy POST request: {url}")
+        return self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)
 
     def build_proxy_url(
         self,
@@ -468,6 +488,21 @@ class ThordataClient:
         session_duration: int | None = None,
         product: ProxyProduct | str = ProxyProduct.RESIDENTIAL,
     ) -> str:
+        """Build a proxy URL with location and session parameters.
+
+        Args:
+            username: Proxy username.
+            password: Proxy password.
+            country: Country code (e.g., "us", "uk").
+            state: State/region code (e.g., "ca", "ny").
+            city: City name (e.g., "new-york", "london").
+            session_id: Session identifier for sticky sessions.
+            session_duration: Session duration in minutes (1-90).
+            product: Proxy product type (RESIDENTIAL, DATACENTER, MOBILE).
+
+        Returns:
+            Formatted proxy URL.
+        """
        config = ProxyConfig(
            username=username,
            password=password,
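A sketch of the documented parameters in use, building a sticky-session residential proxy URL (all values illustrative):

    proxy_url = client.build_proxy_url(
        username="user",
        password="pass",
        country="us",
        city="new-york",
        session_id="sess-42",   # sticky session identifier
        session_duration=10,    # minutes, 1-90 per the docstring
    )
    proxies = {"http": proxy_url, "https": proxy_url}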
@@ -483,1013 +518,1023 @@ class ThordataClient:
         return config.build_proxy_url()
 
     # =========================================================================
-    # Internal Request Helpers
+    # SERP API Methods
     # =========================================================================
 
-    def _api_request_with_retry(
+    def serp_search(
         self,
-        method: str,
-        url: str,
+        query: str,
         *,
-        data: dict[str, Any] | None = None,
-        headers: dict[str, str] | None = None,
-        params: dict[str, Any] | None = None,
-    ) -> requests.Response:
-        @with_retry(self._retry_config)
-        def _do_request() -> requests.Response:
-            return self._api_session.request(
-                method,
-                url,
-                data=data,
-                headers=headers,
-                params=params,
-                timeout=self._api_timeout,
-            )
-
-        try:
-            return _do_request()
-        except requests.Timeout as e:
-            raise ThordataTimeoutError(
-                f"API request timed out: {e}", original_error=e
-            ) from e
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"API request failed: {e}", original_error=e
-            ) from e
+        engine: Engine | str = Engine.GOOGLE,
+        num: int = 10,
+        country: str | None = None,
+        language: str | None = None,
+        search_type: str | None = None,
+        device: str | None = None,
+        render_js: bool | None = None,
+        no_cache: bool | None = None,
+        output_format: str = "json",
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """Perform a search engine query using SERP API.
 
-    def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
-        """Build a stable cache key for ProxyManager instances."""
-        if not userpass:
-            return proxy_endpoint
-        h = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
-        return f"{proxy_endpoint}|auth={h}"
+        Args:
+            query: Search query string.
+            engine: Search engine (GOOGLE, BING, YAHOO, etc.).
+            num: Number of results to return.
+            country: Country code for localized results.
+            language: Language code for interface.
+            search_type: Type of search (images, news, video, etc.).
+            device: Device type (desktop, mobile).
+            render_js: Whether to render JavaScript.
+            no_cache: Bypass cache.
+            output_format: Output format ("json" or "html").
+            **kwargs: Additional engine-specific parameters.
 
-    def _get_proxy_manager(
-        self,
-        proxy_url: str,
-        *,
-        cache_key: str,
-        proxy_headers: dict[str, str] | None = None,
-    ) -> urllib3.PoolManager:
-        """Get or create a ProxyManager for the given proxy URL (Pooled)."""
-        cached = self._proxy_managers.get(cache_key)
-        if cached is not None:
-            return cached
+        Returns:
+            Search results as dictionary.
+        """
+        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
 
-        if proxy_url.startswith(("socks5://", "socks5h://", "socks4://", "socks4a://")):
-            try:
-                from urllib3.contrib.socks import SOCKSProxyManager
-            except Exception as e:
-                raise ThordataConfigError(
-                    "SOCKS proxy requested but SOCKS dependencies are missing. "
-                    "Install: pip install 'urllib3[socks]' or pip install PySocks"
-                ) from e
+        request = SerpRequest(
+            query=query,
+            engine=engine_str,
+            num=num,
+            country=country,
+            language=language,
+            search_type=search_type,
+            device=device,
+            render_js=render_js,
+            no_cache=no_cache,
+            output_format=output_format,
+            extra_params=kwargs,
+        )
 
-            pm_socks = SOCKSProxyManager(
-                proxy_url,
-                num_pools=10,
-                maxsize=10,
-            )
-            pm = cast(urllib3.PoolManager, pm_socks)
-            self._proxy_managers[cache_key] = pm
-            return pm
+        return self.serp_search_advanced(request)
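serp_search simply packs its arguments into a SerpRequest and delegates to serp_search_advanced. A usage sketch; the result-key names are assumptions, not confirmed by this diff:

    results = client.serp_search(
        "coffee grinders",
        engine="google",
        num=10,
        country="us",
        language="en",
    )
    for item in results.get("organic_results", []):  # key name assumed
        print(item.get("title"), item.get("link"))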
556
574
 
557
- # HTTP/HTTPS proxies
558
- proxy_ssl_context = None
559
- if proxy_url.startswith("https://"):
560
- proxy_ssl_context = ssl.create_default_context()
575
+ def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
576
+ """Perform advanced search with a SerpRequest object.
561
577
 
562
- pm_http = urllib3.ProxyManager(
563
- proxy_url,
564
- proxy_headers=proxy_headers,
565
- proxy_ssl_context=proxy_ssl_context,
566
- num_pools=10,
567
- maxsize=10,
568
- )
578
+ Args:
579
+ request: SerpRequest object with search parameters.
569
580
 
570
- pm = cast(urllib3.PoolManager, pm_http)
571
- self._proxy_managers[cache_key] = pm
572
- return pm
581
+ Returns:
582
+ Search results as dictionary.
583
+ """
584
+ if not self.scraper_token:
585
+ raise ThordataConfigError("scraper_token is required for SERP API")
573
586
 
574
- def _proxy_request_with_proxy_manager(
575
- self,
576
- method: str,
577
- url: str,
578
- *,
579
- proxy_config: ProxyConfig,
580
- timeout: int,
581
- headers: dict[str, str] | None = None,
582
- params: dict[str, Any] | None = None,
583
- data: Any = None,
584
- ) -> requests.Response:
585
- """Execute request through proxy, with optional upstream proxy support."""
587
+ payload = request.to_payload()
588
+ headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
586
589
 
587
- # Check for upstream proxy
588
- upstream_config = _parse_upstream_proxy()
590
+ logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")
589
591
 
590
- if upstream_config:
591
- return self._proxy_request_with_upstream(
592
- method,
593
- url,
594
- proxy_config=proxy_config,
595
- timeout=timeout,
592
+ try:
593
+ response = self._api_request_with_retry(
594
+ "POST",
595
+ self._serp_url,
596
+ data=payload,
596
597
  headers=headers,
597
- params=params,
598
- data=data,
599
- upstream_config=upstream_config,
600
598
  )
599
+ response.raise_for_status()
601
600
 
602
- # Original implementation (no upstream proxy)
603
- req = requests.Request(method=method.upper(), url=url, params=params)
604
- prepped = self._proxy_session.prepare_request(req)
605
- final_url = prepped.url or url
601
+ if request.output_format.lower() == "json":
602
+ data = response.json()
603
+ if isinstance(data, dict):
604
+ code = data.get("code")
605
+ if code is not None and code != 200:
606
+ msg = extract_error_message(data)
607
+ raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
608
+ return parse_json_response(data)
606
609
 
607
- proxy_endpoint = proxy_config.build_proxy_endpoint()
608
- is_socks = proxy_endpoint.startswith(
609
- ("socks5://", "socks5h://", "socks4://", "socks4a://")
610
- )
610
+ return {"html": response.text}
611
611
 
612
- if is_socks:
613
- proxy_url_for_manager = proxy_config.build_proxy_url()
614
- userpass = proxy_config.build_proxy_basic_auth()
615
- cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
612
+ except requests.Timeout as e:
613
+ raise ThordataTimeoutError(f"SERP timeout: {e}", original_error=e) from e
614
+ except requests.RequestException as e:
615
+ raise ThordataNetworkError(f"SERP failed: {e}", original_error=e) from e
616
616
 
617
- pm = self._get_proxy_manager(
618
- proxy_url_for_manager,
619
- cache_key=cache_key,
620
- proxy_headers=None,
621
- )
622
- else:
623
- userpass = proxy_config.build_proxy_basic_auth()
624
- proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
625
- cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
617
+ # =========================================================================
618
+ # Universal Scraping API (WEB UNLOCKER) Methods
619
+ # =========================================================================
626
620
 
627
- pm = self._get_proxy_manager(
628
- proxy_endpoint,
629
- cache_key=cache_key,
630
- proxy_headers=dict(proxy_headers),
631
- )
621
+ def universal_scrape(
622
+ self,
623
+ url: str,
624
+ *,
625
+ js_render: bool = False,
626
+ output_format: str = "html",
627
+ country: str | None = None,
628
+ block_resources: str | None = None,
629
+ wait: int | None = None,
630
+ wait_for: str | None = None,
631
+ **kwargs: Any,
632
+ ) -> str | bytes:
633
+ """Scrape a URL using Universal Scraping API.
632
634
 
633
- req_headers = dict(headers or {})
634
- body = None
635
- if data is not None:
636
- if isinstance(data, dict):
637
- body = urlencode({k: str(v) for k, v in data.items()})
638
- req_headers.setdefault(
639
- "Content-Type", "application/x-www-form-urlencoded"
640
- )
641
- else:
642
- body = data
635
+ Args:
636
+ url: Target URL to scrape.
637
+ js_render: Whether to render JavaScript.
638
+ output_format: Output format ("html" or "png").
639
+ country: Country for IP geolocation.
640
+ block_resources: Block specific resources (e.g., "script,css").
641
+ wait: Wait time in milliseconds before fetching.
642
+ wait_for: CSS selector to wait for before fetching.
643
+ **kwargs: Additional parameters.
643
644
 
644
- http_resp = pm.request(
645
- method.upper(),
646
- final_url,
647
- body=body,
648
- headers=req_headers or None,
649
- timeout=urllib3.Timeout(connect=timeout, read=timeout),
650
- retries=False,
651
- preload_content=True,
645
+ Returns:
646
+ Scraped content as string (HTML) or bytes (PNG).
647
+ """
648
+ request = UniversalScrapeRequest(
649
+ url=url,
650
+ js_render=js_render,
651
+ output_format=output_format,
652
+ country=country,
653
+ block_resources=block_resources,
654
+ wait=wait,
655
+ wait_for=wait_for,
656
+ extra_params=kwargs,
652
657
  )
658
+ return self.universal_scrape_advanced(request)
653
659
 
654
- r = requests.Response()
655
- r.status_code = int(getattr(http_resp, "status", 0) or 0)
656
- r._content = http_resp.data or b""
657
- r.url = final_url
658
- r.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
659
- return r
660
+ def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
661
+ """Scrape with advanced options using UniversalScrapeRequest.
662
+
663
+ Args:
664
+ request: UniversalScrapeRequest object with scrape parameters.
665
+
666
+ Returns:
667
+ Scraped content as string (HTML) or bytes (PNG).
668
+ """
669
+ if not self.scraper_token:
670
+ raise ThordataConfigError("scraper_token is required for Universal API")
671
+
672
+ payload = request.to_payload()
673
+ headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
674
+
675
+ logger.info(f"Universal Scrape: {request.url}")
676
+
677
+ try:
678
+ response = self._api_request_with_retry(
679
+ "POST",
680
+ self._universal_url,
681
+ data=payload,
682
+ headers=headers,
683
+ )
684
+ response.raise_for_status()
685
+ return self._process_universal_response(response, request.output_format)
686
+
687
+ except requests.Timeout as e:
688
+ raise ThordataTimeoutError(
689
+ f"Universal timeout: {e}", original_error=e
690
+ ) from e
691
+ except requests.RequestException as e:
692
+ raise ThordataNetworkError(
693
+ f"Universal failed: {e}", original_error=e
694
+ ) from e
660
695
 
661
696
  # =========================================================================
662
- # Upstream Proxy Support (Proxy Chaining)
697
+ # Web Scraper API - Task Management
663
698
  # =========================================================================
664
699
 
665
- def _proxy_request_with_upstream(
700
+ def create_scraper_task(
666
701
  self,
667
- method: str,
668
- url: str,
669
- *,
670
- proxy_config: ProxyConfig,
671
- timeout: int,
672
- headers: dict[str, str] | None = None,
673
- params: dict[str, Any] | None = None,
674
- data: Any = None,
675
- upstream_config: dict[str, Any],
676
- ) -> requests.Response:
677
- """Execute request through proxy chain: Upstream -> Thordata -> Target."""
678
- if not HAS_PYSOCKS:
679
- raise ThordataConfigError(
680
- "PySocks is required for upstream proxy support. "
681
- "Install with: pip install PySocks"
682
- )
702
+ file_name: str,
703
+ spider_id: str,
704
+ spider_name: str,
705
+ parameters: dict[str, Any],
706
+ universal_params: dict[str, Any] | None = None,
707
+ ) -> str:
708
+ """Create a web scraping task.
683
709
 
684
- req = requests.Request(method=method.upper(), url=url, params=params)
685
- prepped = self._proxy_session.prepare_request(req)
686
- final_url = prepped.url or url
710
+ Args:
711
+ file_name: Name for the output file (supports {{TasksID}} template).
712
+ spider_id: Spider identifier from Dashboard.
713
+ spider_name: Spider name (target domain, e.g., "amazon.com").
714
+ parameters: Spider-specific parameters.
715
+ universal_params: Global spider settings.
687
716
 
688
- parsed_target = urlparse(final_url)
689
- target_host = parsed_target.hostname or ""
690
- target_port = parsed_target.port or (
691
- 443 if parsed_target.scheme == "https" else 80
717
+ Returns:
718
+ Task ID.
719
+ """
720
+ config = ScraperTaskConfig(
721
+ file_name=file_name,
722
+ spider_id=spider_id,
723
+ spider_name=spider_name,
724
+ parameters=parameters,
725
+ universal_params=universal_params,
692
726
  )
693
- target_is_https = parsed_target.scheme == "https"
694
-
695
- protocol = proxy_config.protocol.lower()
696
- if protocol == "socks5":
697
- protocol = "socks5h"
698
-
699
- thordata_host = proxy_config.host or ""
700
- thordata_port = proxy_config.port or 9999
701
- thordata_username = proxy_config.build_username()
702
- thordata_password = proxy_config.password
727
+ return self.create_scraper_task_advanced(config)
703
728
 
704
- socket_factory = _UpstreamProxySocketFactory(upstream_config)
729
+ def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
730
+ """Create a web scraping task with advanced configuration.
705
731
 
706
- logger.debug(
707
- f"Proxy chain: upstream({upstream_config['host']}:{upstream_config['port']}) "
708
- f"-> thordata({protocol}://{thordata_host}:{thordata_port}) "
709
- f"-> target({target_host}:{target_port})"
710
- )
732
+ Args:
733
+ config: ScraperTaskConfig object with task configuration.
711
734
 
712
- raw_sock = socket_factory.create_connection(
713
- (thordata_host, thordata_port),
714
- timeout=float(timeout),
735
+ Returns:
736
+ Task ID.
737
+ """
738
+ self._require_public_credentials()
739
+ if not self.scraper_token:
740
+ raise ThordataConfigError("scraper_token is required for Task Builder")
741
+ payload = config.to_payload()
742
+ headers = build_builder_headers(
743
+ self.scraper_token, self.public_token or "", self.public_key or ""
715
744
  )
716
745
 
717
746
  try:
718
- if protocol.startswith("socks"):
719
- sock = self._socks5_handshake(
720
- raw_sock,
721
- target_host,
722
- target_port,
723
- thordata_username,
724
- thordata_password,
747
+ response = self._api_request_with_retry(
748
+ "POST", self._builder_url, data=payload, headers=headers
749
+ )
750
+ response.raise_for_status()
751
+ data = response.json()
752
+ if data.get("code") != 200:
753
+ raise_for_code(
754
+ "Task creation failed", code=data.get("code"), payload=data
725
755
  )
726
- if target_is_https:
727
- context = ssl.create_default_context()
728
- sock = context.wrap_socket(sock, server_hostname=target_host)
756
+ return data["data"]["task_id"]
757
+ except requests.RequestException as e:
758
+ raise ThordataNetworkError(
759
+ f"Task creation failed: {e}", original_error=e
760
+ ) from e
729
761
 
730
- elif protocol == "https":
731
- proxy_context = ssl.create_default_context()
732
- proxy_ssl_sock = proxy_context.wrap_socket(
733
- raw_sock, server_hostname=thordata_host
734
- )
762
+ def create_video_task(
763
+ self,
764
+ file_name: str,
765
+ spider_id: str,
766
+ spider_name: str,
767
+ parameters: dict[str, Any],
768
+ common_settings: CommonSettings,
769
+ ) -> str:
770
+ """Create a video/audio download task (YouTube, etc.).
735
771
 
736
- self._send_connect_request(
737
- proxy_ssl_sock,
738
- target_host,
739
- target_port,
740
- thordata_username,
741
- thordata_password,
742
- )
772
+ Args:
773
+ file_name: Name for the output file.
774
+ spider_id: Spider identifier (e.g., "youtube_video_by-url").
775
+ spider_name: Target site (e.g., "youtube.com").
776
+ parameters: Spider-specific parameters (URLs, etc.).
777
+ common_settings: Video/audio settings (resolution, subtitles, etc.).
743
778
 
744
- if target_is_https:
745
- sock = self._create_tls_in_tls_socket(
746
- proxy_ssl_sock, target_host, timeout
747
- ) # type: ignore[assignment]
748
- else:
749
- sock = proxy_ssl_sock
779
+ Returns:
780
+ Task ID.
781
+ """
782
+ config = VideoTaskConfig(
783
+ file_name=file_name,
784
+ spider_id=spider_id,
785
+ spider_name=spider_name,
786
+ parameters=parameters,
787
+ common_settings=common_settings,
788
+ )
789
+ return self.create_video_task_advanced(config)
750
790
 
751
- else: # HTTP proxy
752
- self._send_connect_request(
753
- raw_sock,
754
- target_host,
755
- target_port,
756
- thordata_username,
757
- thordata_password,
758
- )
791
+ def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
792
+ """Create a video task with advanced configuration.
759
793
 
760
- if target_is_https:
761
- context = ssl.create_default_context()
762
- sock = context.wrap_socket(raw_sock, server_hostname=target_host)
763
- else:
764
- sock = raw_sock
794
+ Args:
795
+ config: VideoTaskConfig object with task configuration.
765
796
 
766
- return self._send_http_request(
767
- sock, method, parsed_target, headers, data, final_url, timeout
797
+ Returns:
798
+ Task ID.
799
+ """
800
+ self._require_public_credentials()
801
+ if not self.scraper_token:
802
+ raise ThordataConfigError(
803
+ "scraper_token is required for Video Task Builder"
768
804
  )
769
805
 
770
- finally:
771
- with contextlib.suppress(Exception):
772
- raw_sock.close()
806
+ payload = config.to_payload()
807
+ headers = build_builder_headers(
808
+ self.scraper_token, self.public_token or "", self.public_key or ""
809
+ )
773
810
 
774
- def _send_connect_request(
775
- self,
776
- sock: socket.socket,
777
- target_host: str,
778
- target_port: int,
779
- proxy_username: str,
780
- proxy_password: str,
781
- ) -> None:
782
- """Send HTTP CONNECT request to proxy and verify response."""
783
- connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
784
- connect_req += f"Host: {target_host}:{target_port}\r\n"
811
+ response = self._api_request_with_retry(
812
+ "POST", self._video_builder_url, data=payload, headers=headers
813
+ )
814
+ response.raise_for_status()
815
+ data = response.json()
816
+ if data.get("code") != 200:
817
+ raise_for_code(
818
+ "Video task creation failed", code=data.get("code"), payload=data
819
+ )
820
+ return data["data"]["task_id"]
785
821
 
786
- credentials = f"{proxy_username}:{proxy_password}"
787
- encoded = base64.b64encode(credentials.encode()).decode()
788
- connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
789
- connect_req += "\r\n"
822
+ def get_task_status(self, task_id: str) -> str:
823
+ """Get the status of a scraping task.
790
824
 
791
- sock.sendall(connect_req.encode())
825
+ Args:
826
+ task_id: Task identifier.
792
827
 
793
- response = b""
794
- while b"\r\n\r\n" not in response:
795
- chunk = sock.recv(4096)
796
- if not chunk:
797
- raise ConnectionError("Proxy closed connection during CONNECT")
798
- response += chunk
828
+ Returns:
829
+ Status string (running, success, failed, etc.).
830
+ """
831
+ self._require_public_credentials()
832
+ headers = build_public_api_headers(
833
+ self.public_token or "", self.public_key or ""
834
+ )
835
+ try:
836
+ response = self._api_request_with_retry(
837
+ "POST",
838
+ self._status_url,
839
+ data={"tasks_ids": task_id},
840
+ headers=headers,
841
+ )
842
+ response.raise_for_status()
843
+ data = response.json()
844
+ if data.get("code") != 200:
845
+ raise_for_code("Task status error", code=data.get("code"), payload=data)
799
846
 
800
- status_line = response.split(b"\r\n")[0].decode()
801
- if "200" not in status_line:
802
- raise ConnectionError(f"Proxy CONNECT failed: {status_line}")
847
+ items = data.get("data") or []
848
+ for item in items:
849
+ if str(item.get("task_id")) == str(task_id):
850
+ return item.get("status", "unknown")
851
+ return "unknown"
852
+ except requests.RequestException as e:
853
+ raise ThordataNetworkError(
854
+ f"Status check failed: {e}", original_error=e
855
+ ) from e
803
856
 
804
- def _create_tls_in_tls_socket(
805
- self,
806
- outer_ssl_sock: ssl.SSLSocket,
807
- hostname: str,
808
- timeout: int,
809
- ) -> _TLSInTLSSocket:
810
- """Create a TLS connection over an existing TLS connection."""
811
- context = ssl.create_default_context()
857
+ def safe_get_task_status(self, task_id: str) -> str:
858
+ """Get task status with error handling.
812
859
 
813
- incoming = ssl.MemoryBIO()
814
- outgoing = ssl.MemoryBIO()
860
+ Args:
861
+ task_id: Task identifier.
815
862
 
816
- ssl_obj = context.wrap_bio(incoming, outgoing, server_hostname=hostname)
863
+ Returns:
864
+ Status string or "error" on failure.
865
+ """
866
+ try:
867
+ return self.get_task_status(task_id)
868
+ except Exception:
869
+ return "error"
817
870
 
818
- while True:
819
- try:
820
- ssl_obj.do_handshake()
821
- break
822
- except ssl.SSLWantReadError:
823
- data_to_send = outgoing.read()
824
- if data_to_send:
825
- outer_ssl_sock.sendall(data_to_send)
871
+ def get_task_result(self, task_id: str, file_type: str = "json") -> str:
872
+ """Get the download URL for a completed task.
826
873
 
827
- outer_ssl_sock.settimeout(float(timeout))
828
- try:
829
- received = outer_ssl_sock.recv(8192)
830
- if not received:
831
- raise ConnectionError("Connection closed during TLS handshake")
832
- incoming.write(received)
833
- except socket.timeout as e:
834
- raise ConnectionError("Timeout during TLS handshake") from e
835
- except ssl.SSLWantWriteError:
836
- data_to_send = outgoing.read()
837
- if data_to_send:
838
- outer_ssl_sock.sendall(data_to_send)
874
+ Args:
875
+ task_id: Task identifier.
876
+ file_type: File type to download (json, csv, video, audio, subtitle).
839
877
 
840
- data_to_send = outgoing.read()
841
- if data_to_send:
842
- outer_ssl_sock.sendall(data_to_send)
878
+ Returns:
879
+ Download URL.
880
+ """
881
+ self._require_public_credentials()
882
+ headers = build_public_api_headers(
883
+ self.public_token or "", self.public_key or ""
884
+ )
885
+ try:
886
+ response = self._api_request_with_retry(
887
+ "POST",
888
+ self._download_url,
889
+ data={"tasks_id": task_id, "type": file_type},
890
+ headers=headers,
891
+ )
892
+ response.raise_for_status()
893
+ data = response.json()
894
+ if data.get("code") == 200 and data.get("data"):
895
+ return data["data"]["download"]
896
+ raise_for_code("Get result failed", code=data.get("code"), payload=data)
897
+ return ""
898
+ except requests.RequestException as e:
899
+ raise ThordataNetworkError(
900
+ f"Get result failed: {e}", original_error=e
901
+ ) from e
843
902
 
844
- return _TLSInTLSSocket(outer_ssl_sock, ssl_obj, incoming, outgoing)
903
+ def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
904
+ """List all scraping tasks.
845
905
 
846
- def _send_http_request(
906
+ Args:
907
+ page: Page number for pagination.
908
+ size: Number of items per page.
909
+
910
+ Returns:
911
+ Dictionary with count and list of tasks.
912
+ """
913
+ self._require_public_credentials()
914
+ headers = build_public_api_headers(
915
+ self.public_token or "", self.public_key or ""
916
+ )
917
+ response = self._api_request_with_retry(
918
+ "POST",
919
+ self._list_url,
920
+ data={"page": str(page), "size": str(size)},
921
+ headers=headers,
922
+ )
923
+ response.raise_for_status()
924
+ data = response.json()
925
+ if data.get("code") != 200:
926
+ raise_for_code("List tasks failed", code=data.get("code"), payload=data)
927
+ return data.get("data", {"count": 0, "list": []})
928
+
929
+ def wait_for_task(
847
930
  self,
848
- sock: socket.socket | ssl.SSLSocket | Any,
849
- method: str,
850
- parsed_url: Any,
851
- headers: dict[str, str] | None,
852
- data: Any,
853
- final_url: str,
854
- timeout: int,
855
- ) -> requests.Response:
856
- """Send HTTP request over established connection and parse response."""
857
- target_host = parsed_url.hostname
931
+ task_id: str,
932
+ *,
933
+ poll_interval: float = 5.0,
934
+ max_wait: float = 600.0,
935
+ ) -> str:
936
+ """Wait for a task to complete.
858
937
 
859
- req_headers = dict(headers or {})
860
- req_headers.setdefault("Host", target_host)
861
- req_headers.setdefault("User-Agent", build_user_agent(_sdk_version, "requests"))
862
- req_headers.setdefault("Connection", "close")
938
+ Args:
939
+ task_id: Task identifier.
940
+ poll_interval: Polling interval in seconds.
941
+ max_wait: Maximum time to wait in seconds.
863
942
 
864
- path = parsed_url.path or "/"
865
- if parsed_url.query:
866
- path += f"?{parsed_url.query}"
943
+ Returns:
944
+ Final status of the task.
945
+ """
946
+ import time
867
947
 
868
- http_req = f"{method.upper()} {path} HTTP/1.1\r\n"
869
- for k, v in req_headers.items():
870
- http_req += f"{k}: {v}\r\n"
948
+ start = time.monotonic()
949
+ while (time.monotonic() - start) < max_wait:
950
+ status = self.get_task_status(task_id)
951
+ if status.lower() in {
952
+ "ready",
953
+ "success",
954
+ "finished",
955
+ "failed",
956
+ "error",
957
+ "cancelled",
958
+ }:
959
+ return status
960
+ time.sleep(poll_interval)
961
+ raise TimeoutError(f"Task {task_id} timeout")
871
962
 
872
- body = None
873
- if data is not None:
874
- if isinstance(data, dict):
875
- body = urlencode({k: str(v) for k, v in data.items()}).encode()
876
- http_req += "Content-Type: application/x-www-form-urlencoded\r\n"
877
- http_req += f"Content-Length: {len(body)}\r\n"
878
- elif isinstance(data, bytes):
879
- body = data
880
- http_req += f"Content-Length: {len(body)}\r\n"
881
- else:
882
- body = str(data).encode()
883
- http_req += f"Content-Length: {len(body)}\r\n"
963
+ def run_task(
964
+ self,
965
+ file_name: str,
966
+ spider_id: str,
967
+ spider_name: str,
968
+ parameters: dict[str, Any],
969
+ universal_params: dict[str, Any] | None = None,
970
+ *,
971
+ max_wait: float = 600.0,
972
+ initial_poll_interval: float = 2.0,
973
+ max_poll_interval: float = 10.0,
974
+ include_errors: bool = True,
975
+ # New parameters
976
+ task_type: str = "web", # "web" or "video"
977
+ common_settings: CommonSettings | None = None,
978
+ ) -> str:
979
+ """High-level wrapper to run a task and wait for result.
884
980
 
885
- http_req += "\r\n"
886
- sock.sendall(http_req.encode())
981
+ This method handles the entire lifecycle:
982
+ 1. Create Task
983
+ 2. Poll status (with exponential backoff)
984
+ 3. Get download URL when ready
887
985
 
888
- if body:
889
- sock.sendall(body)
986
+ Args:
987
+ file_name: Name for the output file.
988
+ spider_id: Spider identifier from Dashboard.
989
+ spider_name: Spider name (target domain).
990
+ parameters: Spider-specific parameters.
991
+ universal_params: Global spider settings.
992
+ max_wait: Maximum seconds to wait for completion.
993
+ initial_poll_interval: Starting poll interval in seconds.
994
+ max_poll_interval: Maximum poll interval cap.
995
+ include_errors: Whether to include error logs.
890
996
 
891
- if hasattr(sock, "settimeout"):
892
- sock.settimeout(float(timeout))
997
+ Returns:
998
+ The download URL for the task result.
893
999
 
894
- response_data = b""
895
- try:
896
- while True:
897
- chunk = sock.recv(8192)
898
- if not chunk:
899
- break
900
- response_data += chunk
901
- if b"\r\n\r\n" in response_data:
902
- header_end = response_data.index(b"\r\n\r\n") + 4
903
- headers_part = (
904
- response_data[:header_end]
905
- .decode("utf-8", errors="replace")
906
- .lower()
907
- )
908
- if "content-length:" in headers_part:
909
- for line in headers_part.split("\r\n"):
910
- if line.startswith("content-length:"):
911
- content_length = int(line.split(":")[1].strip())
912
- if len(response_data) >= header_end + content_length:
913
- break
914
- elif "transfer-encoding: chunked" not in headers_part:
915
- break
916
- except socket.timeout:
917
- pass
918
-
919
- return self._parse_http_response(response_data, final_url)
1000
+ Raises:
1001
+ ThordataTimeoutError: If task takes longer than max_wait.
1002
+ ThordataAPIError: If task fails or is cancelled.
1003
+ """
1004
+ import time
920
1005
 
921
- def _socks5_handshake(
922
- self,
923
- sock: socket.socket,
924
- target_host: str,
925
- target_port: int,
926
- username: str | None,
927
- password: str | None,
928
- ) -> socket.socket:
929
- """Perform SOCKS5 handshake over existing socket."""
930
- if username and password:
931
- sock.sendall(b"\x05\x02\x00\x02")
1006
+ # 1. Create Task
1007
+ if task_type == "video":
1008
+ if common_settings is None:
1009
+ raise ValueError("common_settings is required for video tasks")
1010
+
1011
+ config_video = VideoTaskConfig(
1012
+ file_name=file_name,
1013
+ spider_id=spider_id,
1014
+ spider_name=spider_name,
1015
+ parameters=parameters,
1016
+ common_settings=common_settings,
1017
+ include_errors=include_errors,
1018
+ )
1019
+ task_id = self.create_video_task_advanced(config_video)
932
1020
  else:
933
- sock.sendall(b"\x05\x01\x00")
1021
+ config = ScraperTaskConfig(
1022
+ file_name=file_name,
1023
+ spider_id=spider_id,
1024
+ spider_name=spider_name,
1025
+ parameters=parameters,
1026
+ universal_params=universal_params,
1027
+ include_errors=include_errors,
1028
+ )
1029
+ task_id = self.create_scraper_task_advanced(config)
934
1030
 
935
- response = sock.recv(2)
936
- if len(response) < 2:
937
- raise ConnectionError("SOCKS5 handshake failed: incomplete response")
1031
+ logger.info(f"Task created successfully: {task_id}. Waiting for completion...")
938
1032
 
939
- if response[0] != 0x05:
940
- raise ConnectionError(f"SOCKS5 version mismatch: {response[0]}")
1033
+ # 2. Poll Status (Smart Backoff)
1034
+ start_time = time.monotonic()
1035
+ current_poll = initial_poll_interval
941
1036
 
942
- auth_method = response[1]
1037
+ while (time.monotonic() - start_time) < max_wait:
1038
+ status = self.get_task_status(task_id)
1039
+ status_lower = status.lower()
943
1040
 
944
- if auth_method == 0x02:
945
- if not username or not password:
946
- raise ConnectionError(
947
- "SOCKS5 server requires auth but no credentials provided"
1041
+ if status_lower in {"ready", "success", "finished"}:
1042
+ logger.info(f"Task {task_id} finished. Status: {status}")
1043
+ # 3. Get Result
1044
+ return self.get_task_result(task_id)
1045
+
1046
+ if status_lower in {"failed", "error", "cancelled"}:
1047
+ raise ThordataNetworkError(
1048
+ f"Task {task_id} ended with failed status: {status}"
948
1049
  )
949
1050
 
950
- auth_req = bytes([0x01, len(username)]) + username.encode()
951
- auth_req += bytes([len(password)]) + password.encode()
952
- sock.sendall(auth_req)
1051
+ # Wait and increase interval (capped)
1052
+ time.sleep(current_poll)
1053
+ current_poll = min(current_poll * 1.5, max_poll_interval)
953
1054
 
954
- auth_resp = sock.recv(2)
955
- if len(auth_resp) < 2 or auth_resp[1] != 0x00:
956
- raise ConnectionError("SOCKS5 authentication failed")
1055
+ raise ThordataTimeoutError(f"Task {task_id} timed out after {max_wait} seconds")
957
1056
 
958
- elif auth_method == 0xFF:
959
- raise ConnectionError("SOCKS5 no acceptable auth method")
1057
+ # =========================================================================
1058
+ # Account & Usage Methods
1059
+ # =========================================================================
960
1060
 
961
- connect_req = b"\x05\x01\x00\x03"
962
- connect_req += bytes([len(target_host)]) + target_host.encode()
963
- connect_req += target_port.to_bytes(2, "big")
964
- sock.sendall(connect_req)
1061
+ def get_usage_statistics(
1062
+ self,
1063
+ from_date: str | date,
1064
+ to_date: str | date,
1065
+ ) -> UsageStatistics:
1066
+ """Get usage statistics for a date range.
965
1067
 
966
- resp = sock.recv(4)
967
- if len(resp) < 4:
968
- raise ConnectionError("SOCKS5 connect failed: incomplete response")
1068
+ Args:
1069
+ from_date: Start date (YYYY-MM-DD format or date object).
1070
+ to_date: End date (YYYY-MM-DD format or date object).
969
1071
 
970
- if resp[1] != 0x00:
971
- error_codes = {
972
- 0x01: "General failure",
973
- 0x02: "Connection not allowed",
974
- 0x03: "Network unreachable",
975
- 0x04: "Host unreachable",
976
- 0x05: "Connection refused",
977
- 0x06: "TTL expired",
978
- 0x07: "Command not supported",
979
- 0x08: "Address type not supported",
980
- }
981
- error_msg = error_codes.get(resp[1], f"Unknown error {resp[1]}")
982
- raise ConnectionError(f"SOCKS5 connect failed: {error_msg}")
1072
+ Returns:
1073
+ UsageStatistics object with traffic data.
1074
+ """
1075
+ self._require_public_credentials()
1076
+ if isinstance(from_date, date):
1077
+ from_date = from_date.strftime("%Y-%m-%d")
1078
+ if isinstance(to_date, date):
1079
+ to_date = to_date.strftime("%Y-%m-%d")
983
1080
 
984
- addr_type = resp[3]
985
- if addr_type == 0x01:
986
- sock.recv(4 + 2)
987
- elif addr_type == 0x03:
988
- domain_len = sock.recv(1)[0]
989
- sock.recv(domain_len + 2)
990
- elif addr_type == 0x04:
991
- sock.recv(16 + 2)
1081
+ params = {
1082
+ "token": self.public_token,
1083
+ "key": self.public_key,
1084
+ "from_date": from_date,
1085
+ "to_date": to_date,
1086
+ }
1087
+ response = self._api_request_with_retry(
1088
+ "GET", self._usage_stats_url, params=params
1089
+ )
1090
+ response.raise_for_status()
1091
+ data = response.json()
1092
+ if data.get("code") != 200:
1093
+ raise_for_code("Usage stats error", code=data.get("code"), payload=data)
1094
+ return UsageStatistics.from_dict(data.get("data", data))
992
1095
 
993
- return sock
1096
+ def get_traffic_balance(self) -> float:
1097
+ """
1098
+ Get the current traffic balance in KB via Public API.
1099
+ """
1100
+ self._require_public_credentials()
1101
+ # FIX: Auth params must be in Query, NOT Headers
1102
+ params = {
1103
+ "token": self.public_token,
1104
+ "key": self.public_key,
1105
+ }
1106
+ api_base = self._locations_base_url.replace("/locations", "")
994
1107
 
995
- def _parse_http_response(
996
- self,
997
- response_data: bytes,
998
- url: str,
999
- ) -> requests.Response:
1000
- """Parse raw HTTP response into requests.Response."""
1001
- if b"\r\n\r\n" in response_data:
1002
- header_data, body = response_data.split(b"\r\n\r\n", 1)
1003
- else:
1004
- header_data = response_data
1005
- body = b""
1108
+ response = self._api_request_with_retry(
1109
+ "GET", f"{api_base}/account/traffic-balance", params=params
1110
+ )
1111
+ response.raise_for_status()
1112
+ data = response.json()
1113
+ if data.get("code") != 200:
1114
+ raise_for_code(
1115
+ "Get traffic balance failed", code=data.get("code"), payload=data
1116
+ )
1006
1117
 
1007
- header_lines = header_data.decode("utf-8", errors="replace").split("\r\n")
1118
+ return float(data.get("data", {}).get("traffic_balance", 0))
1008
1119
 
1009
- status_line = header_lines[0] if header_lines else ""
1010
- parts = status_line.split(" ", 2)
1011
- status_code = int(parts[1]) if len(parts) > 1 else 0
1120
+ def get_wallet_balance(self) -> float:
1121
+ """
1122
+ Get the current wallet balance via Public API.
1123
+ """
1124
+ self._require_public_credentials()
1125
+ # FIX: Auth params must be in Query, NOT Headers
1126
+ params = {
1127
+ "token": self.public_token,
1128
+ "key": self.public_key,
1129
+ }
1130
+ api_base = self._locations_base_url.replace("/locations", "")
1012
1131
 
1013
- headers_dict = {}
1014
- for line in header_lines[1:]:
1015
- if ": " in line:
1016
- k, v = line.split(": ", 1)
1017
- headers_dict[k] = v
1132
+ response = self._api_request_with_retry(
1133
+ "GET", f"{api_base}/account/wallet-balance", params=params
1134
+ )
1135
+ response.raise_for_status()
1136
+ data = response.json()
1137
+ if data.get("code") != 200:
1138
+ raise_for_code(
1139
+ "Get wallet balance failed", code=data.get("code"), payload=data
1140
+ )
1018
1141
 
1019
- if headers_dict.get("Transfer-Encoding", "").lower() == "chunked":
1020
- body = self._decode_chunked(body)
1142
+ return float(data.get("data", {}).get("balance", 0))
1021
1143
 
1022
- r = requests.Response()
1023
- r.status_code = status_code
1024
- r._content = body
1025
- r.url = url
1026
- r.headers = CaseInsensitiveDict(headers_dict)
1027
- return r
1144
+ def get_proxy_user_usage(
1145
+ self,
1146
+ username: str,
1147
+ start_date: str | date,
1148
+ end_date: str | date,
1149
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1150
+ ) -> list[dict[str, Any]]:
1151
+ """
1152
+ Get traffic usage statistics for a specific proxy user.
1028
1153
 
1029
- def _decode_chunked(self, data: bytes) -> bytes:
1030
- """Decode chunked transfer encoding."""
1031
- result = b""
1032
- while data:
1033
- if b"\r\n" not in data:
1034
- break
1035
- size_line, data = data.split(b"\r\n", 1)
1036
- try:
1037
- chunk_size = int(size_line.decode().strip(), 16)
1038
- except ValueError:
1039
- break
1154
+ Args:
1155
+ username: Sub-account username.
1156
+ start_date: Start date (YYYY-MM-DD).
1157
+ end_date: End date (YYYY-MM-DD).
1158
+ proxy_type: Proxy product type.
1040
1159
 
1041
- if chunk_size == 0:
1042
- break
1160
+ Returns:
1161
+ List of daily usage records.
1162
+ """
1163
+ self._require_public_credentials()
1164
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1043
1165
 
1044
- result += data[:chunk_size]
1045
- data = data[chunk_size:]
1166
+ if isinstance(start_date, date):
1167
+ start_date = start_date.strftime("%Y-%m-%d")
1168
+ if isinstance(end_date, date):
1169
+ end_date = end_date.strftime("%Y-%m-%d")
1046
1170
 
1047
- if data.startswith(b"\r\n"):
1048
- data = data[2:]
1171
+ params = {
1172
+ "token": self.public_token,
1173
+ "key": self.public_key,
1174
+ "proxy_type": str(pt),
1175
+ "username": username,
1176
+ "from_date": start_date,
1177
+ "to_date": end_date,
1178
+ }
1049
1179
 
1050
- return result
1180
+ response = self._api_request_with_retry(
1181
+ "GET", f"{self._proxy_users_url}/usage-statistics", params=params
1182
+ )
1183
+ response.raise_for_status()
1184
+ data = response.json()
1185
+ if data.get("code") != 200:
1186
+ raise_for_code("Get user usage failed", code=data.get("code"), payload=data)
1051
1187
 
1052
- # =========================================================================
1053
- # SERP API Methods
1054
- # =========================================================================
1188
+ # Structure: { "data": [ { "date": "...", "usage_traffic": ... } ] }
1189
+ return data.get("data", [])
1055
1190
 
1056
- def serp_search(
1191
+ def extract_ip_list(
1057
1192
  self,
1058
- query: str,
1059
- *,
1060
- engine: Engine | str = Engine.GOOGLE,
1061
- num: int = 10,
1193
+ num: int = 1,
1062
1194
  country: str | None = None,
1063
- language: str | None = None,
1064
- search_type: str | None = None,
1065
- device: str | None = None,
1066
- render_js: bool | None = None,
1067
- no_cache: bool | None = None,
1068
- output_format: str = "json",
1069
- **kwargs: Any,
1070
- ) -> dict[str, Any]:
1071
- engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
1072
-
1073
- request = SerpRequest(
1074
- query=query,
1075
- engine=engine_str,
1076
- num=num,
1077
- country=country,
1078
- language=language,
1079
- search_type=search_type,
1080
- device=device,
1081
- render_js=render_js,
1082
- no_cache=no_cache,
1083
- output_format=output_format,
1084
- extra_params=kwargs,
1085
- )
1195
+ state: str | None = None,
1196
+ city: str | None = None,
1197
+ time_limit: int | None = None,
1198
+ port: int | None = None,
1199
+ return_type: str = "txt",
1200
+ protocol: str = "http",
1201
+ sep: str = "\r\n",
1202
+ product: str = "residential", # residential or unlimited
1203
+ ) -> list[str]:
1204
+ """
1205
+ Extract proxy IP list via API (get-ip.thordata.net).
1206
+ Requires IP whitelist configuration.
1086
1207
 
1087
- return self.serp_search_advanced(request)
1208
+ Args:
1209
+ num: Number of IPs to extract.
1210
+ country: Country code.
1211
+ state: State code.
1212
+ city: City name.
1213
+ time_limit: Session duration (1-90 mins).
1214
+ port: Specific port.
1215
+ return_type: "txt" or "json".
1216
+ protocol: "http" or "socks5".
1217
+ sep: Separator for txt output.
1218
+ product: "residential" or "unlimited".
1088
1219
 
1089
- def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
1090
- if not self.scraper_token:
1091
- raise ThordataConfigError("scraper_token is required for SERP API")
1092
-
1093
- payload = request.to_payload()
1094
- headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
1095
-
1096
- logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")
1097
-
1098
- try:
1099
- response = self._api_request_with_retry(
1100
- "POST",
1101
- self._serp_url,
1102
- data=payload,
1103
- headers=headers,
1104
- )
1105
- response.raise_for_status()
1106
-
1107
- if request.output_format.lower() == "json":
1108
- data = response.json()
1109
- if isinstance(data, dict):
1110
- code = data.get("code")
1111
- if code is not None and code != 200:
1112
- msg = extract_error_message(data)
1113
- raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
1114
- return parse_json_response(data)
1115
-
1116
- return {"html": response.text}
1117
-
1118
- except requests.Timeout as e:
1119
- raise ThordataTimeoutError(f"SERP timeout: {e}", original_error=e) from e
1120
- except requests.RequestException as e:
1121
- raise ThordataNetworkError(f"SERP failed: {e}", original_error=e) from e
1122
-
1123
- # =========================================================================
1124
- # Universal Scraping API
1125
- # =========================================================================
1220
+ Returns:
1221
+ List of "IP:Port" strings.
1222
+ """
1223
+ # Determine endpoint based on product
1224
+ base_url = "https://get-ip.thordata.net"
1225
+ endpoint = "/unlimited_api" if product == "unlimited" else "/api"
1226
+
1227
+ # Build params
1228
+ params: dict[str, Any] = {
1229
+ "num": str(num),
1230
+ "return_type": return_type,
1231
+ "protocol": protocol,
1232
+ "sep": sep,
1233
+ }
1126
1234
 
1127
- def universal_scrape(
1128
- self,
1129
- url: str,
1130
- *,
1131
- js_render: bool = False,
1132
- output_format: str = "html",
1133
- country: str | None = None,
1134
- block_resources: str | None = None,
1135
- wait: int | None = None,
1136
- wait_for: str | None = None,
1137
- **kwargs: Any,
1138
- ) -> str | bytes:
1139
- request = UniversalScrapeRequest(
1140
- url=url,
1141
- js_render=js_render,
1142
- output_format=output_format,
1143
- country=country,
1144
- block_resources=block_resources,
1145
- wait=wait,
1146
- wait_for=wait_for,
1147
- extra_params=kwargs,
1235
+ # Add optional params
1236
+ if country:
1237
+ params["country"] = country
1238
+ if state:
1239
+ params["state"] = state
1240
+ if city:
1241
+ params["city"] = city
1242
+ if time_limit:
1243
+ params["time"] = str(time_limit)
1244
+ if port:
1245
+ params["port"] = str(port)
1246
+
1247
+ username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
1248
+ if username:
1249
+ params["td-customer"] = username
1250
+
1251
+ response = self._api_session.get(
1252
+ f"{base_url}{endpoint}", params=params, timeout=self._default_timeout
1148
1253
  )
1149
- return self.universal_scrape_advanced(request)
1150
-
1151
- def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
1152
- if not self.scraper_token:
1153
- raise ThordataConfigError("scraper_token is required for Universal API")
1154
-
1155
- payload = request.to_payload()
1156
- headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
1157
-
1158
- logger.info(f"Universal Scrape: {request.url}")
1159
-
1160
- try:
1161
- response = self._api_request_with_retry(
1162
- "POST",
1163
- self._universal_url,
1164
- data=payload,
1165
- headers=headers,
1166
- )
1167
- response.raise_for_status()
1168
- return self._process_universal_response(response, request.output_format)
1169
-
1170
- except requests.Timeout as e:
1171
- raise ThordataTimeoutError(
1172
- f"Universal timeout: {e}", original_error=e
1173
- ) from e
1174
- except requests.RequestException as e:
1175
- raise ThordataNetworkError(
1176
- f"Universal failed: {e}", original_error=e
1177
- ) from e
1178
-
1179
- def _process_universal_response(
1180
- self, response: requests.Response, output_format: str
1181
- ) -> str | bytes:
1182
- try:
1183
- resp_json = response.json()
1184
- except ValueError:
1185
- return response.content if output_format.lower() == "png" else response.text
1254
+ response.raise_for_status()
1186
1255
 
1187
- if isinstance(resp_json, dict):
1188
- code = resp_json.get("code")
1189
- if code is not None and code != 200:
1190
- msg = extract_error_message(resp_json)
1191
- raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
1256
+ # Parse result
1257
+ if return_type == "json":
1258
+ data = response.json()
1259
+ # JSON format: { "code": 0, "data": [ { "ip": "...", "port": ... } ] }
1260
+ if isinstance(data, dict):
1261
+ if data.get("code") == 0 or data.get("code") == 200:
1262
+ raw_list = data.get("data") or []
1263
+ return [f"{item['ip']}:{item['port']}" for item in raw_list]
1264
+ else:
1265
+ raise_for_code(
1266
+ "Extract IPs failed", code=data.get("code"), payload=data
1267
+ )
1268
+ return []
1192
1269
 
1193
- if "html" in resp_json:
1194
- return resp_json["html"]
1195
- if "png" in resp_json:
1196
- return decode_base64_image(resp_json["png"])
1270
+ else: # txt
1271
+ text = response.text.strip()
1272
+ # Check for error message in text (often starts with { or contains "error")
1273
+ if text.startswith("{") and "code" in text:
1274
+ # Try parsing as JSON error
1275
+ try:
1276
+ err_data = json.loads(text)
1277
+ raise_for_code(
1278
+ "Extract IPs failed",
1279
+ code=err_data.get("code"),
1280
+ payload=err_data,
1281
+ )
1282
+ except json.JSONDecodeError:
1283
+ pass
1197
1284
 
1198
- return str(resp_json)
1285
+ actual_sep = sep.replace("\\r", "\r").replace("\\n", "\n")
1286
+ return [line.strip() for line in text.split(actual_sep) if line.strip()]
1199
1287
 
1200
1288
  # =========================================================================
1201
- # Web Scraper API (Tasks)
1289
+ # Proxy Users Management (Sub-accounts)
1202
1290
  # =========================================================================
1203
1291
 
1204
- def create_scraper_task(
1205
- self,
1206
- file_name: str,
1207
- spider_id: str,
1208
- spider_name: str,
1209
- parameters: dict[str, Any],
1210
- universal_params: dict[str, Any] | None = None,
1211
- ) -> str:
1212
- config = ScraperTaskConfig(
1213
- file_name=file_name,
1214
- spider_id=spider_id,
1215
- spider_name=spider_name,
1216
- parameters=parameters,
1217
- universal_params=universal_params,
1218
- )
1219
- return self.create_scraper_task_advanced(config)
1292
+ def list_proxy_users(
1293
+ self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
1294
+ ) -> ProxyUserList:
1295
+ """List all proxy sub-accounts.
1220
1296
 
1221
- def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
1297
+ Args:
1298
+ proxy_type: Proxy product type.
1299
+
1300
+ Returns:
1301
+ ProxyUserList with user information.
1302
+ """
1222
1303
  self._require_public_credentials()
1223
- if not self.scraper_token:
1224
- raise ThordataConfigError("scraper_token is required for Task Builder")
1225
- payload = config.to_payload()
1226
- headers = build_builder_headers(
1227
- self.scraper_token, self.public_token or "", self.public_key or ""
1304
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1305
+ params = {
1306
+ "token": self.public_token,
1307
+ "key": self.public_key,
1308
+ "proxy_type": str(pt),
1309
+ }
1310
+ response = self._api_request_with_retry(
1311
+ "GET", f"{self._proxy_users_url}/user-list", params=params
1228
1312
  )
1313
+ response.raise_for_status()
1314
+ data = response.json()
1315
+ if data.get("code") != 200:
1316
+ raise_for_code("List users error", code=data.get("code"), payload=data)
1317
+ return ProxyUserList.from_dict(data.get("data", data))
1229
1318
 
1230
- try:
1231
- response = self._api_request_with_retry(
1232
- "POST", self._builder_url, data=payload, headers=headers
1233
- )
1234
- response.raise_for_status()
1235
- data = response.json()
1236
- if data.get("code") != 200:
1237
- raise_for_code(
1238
- "Task creation failed", code=data.get("code"), payload=data
1239
- )
1240
- return data["data"]["task_id"]
1241
- except requests.RequestException as e:
1242
- raise ThordataNetworkError(
1243
- f"Task creation failed: {e}", original_error=e
1244
- ) from e
1245
-
1246
- def create_video_task(
1319
+ def create_proxy_user(
1247
1320
  self,
1248
- file_name: str,
1249
- spider_id: str,
1250
- spider_name: str,
1251
- parameters: dict[str, Any],
1252
- common_settings: CommonSettings,
1253
- ) -> str:
1254
- config = VideoTaskConfig(
1255
- file_name=file_name,
1256
- spider_id=spider_id,
1257
- spider_name=spider_name,
1258
- parameters=parameters,
1259
- common_settings=common_settings,
1260
- )
1261
- return self.create_video_task_advanced(config)
1321
+ username: str,
1322
+ password: str,
1323
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1324
+ traffic_limit: int = 0,
1325
+ status: bool = True,
1326
+ ) -> dict[str, Any]:
1327
+ """Create a new proxy sub-account.
1262
1328
 
1263
- def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
1264
- self._require_public_credentials()
1265
- if not self.scraper_token:
1266
- raise ThordataConfigError(
1267
- "scraper_token is required for Video Task Builder"
1268
- )
1329
+ Args:
1330
+ username: Sub-account username.
1331
+ password: Sub-account password.
1332
+ proxy_type: Proxy product type.
1333
+ traffic_limit: Traffic limit in MB (0 = unlimited).
1334
+ status: Enable or disable the account.
1269
1335
 
1270
- payload = config.to_payload()
1271
- headers = build_builder_headers(
1272
- self.scraper_token, self.public_token or "", self.public_key or ""
1336
+ Returns:
1337
+ API response data.
1338
+ """
1339
+ self._require_public_credentials()
1340
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1341
+ headers = build_public_api_headers(
1342
+ self.public_token or "", self.public_key or ""
1273
1343
  )
1274
-
1344
+ payload = {
1345
+ "proxy_type": str(pt),
1346
+ "username": username,
1347
+ "password": password,
1348
+ "traffic_limit": str(traffic_limit),
1349
+ "status": "true" if status else "false",
1350
+ }
1275
1351
  response = self._api_request_with_retry(
1276
- "POST", self._video_builder_url, data=payload, headers=headers
1352
+ "POST",
1353
+ f"{self._proxy_users_url}/create-user",
1354
+ data=payload,
1355
+ headers=headers,
1277
1356
  )
1278
1357
  response.raise_for_status()
1279
1358
  data = response.json()
1280
1359
  if data.get("code") != 200:
1281
- raise_for_code(
1282
- "Video task creation failed", code=data.get("code"), payload=data
1283
- )
1284
- return data["data"]["task_id"]
1360
+ raise_for_code("Create user failed", code=data.get("code"), payload=data)
1361
+ return data.get("data", {})
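Creating a sub-account needs only the public credentials plus the new account's settings; a sketch with `client` as constructed in the earlier example (all values are placeholders):

    result = client.create_proxy_user(
        username="team-alpha",
        password="S3cret!",
        traffic_limit=1024,  # MB; 0 means unlimited
    )
    print(result)  # the API's "data" payload for the new sub-account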
1285
1362
 
1286
- def get_task_status(self, task_id: str) -> str:
1363
+ def update_proxy_user(
1364
+ self,
1365
+ username: str,
1366
+ password: str, # required by the API even when it is unchanged
1367
+ traffic_limit: int | None = None,
1368
+ status: bool | None = None,
1369
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1370
+ ) -> dict[str, Any]:
1371
+ """
1372
+ Update an existing proxy user's settings.
1373
+
1374
+ Note: Password is required by the API even if not changing it.
1375
+
1376
+ Args:
1377
+ username: The sub-account username.
1378
+ password: The sub-account password (required for update).
1379
+ traffic_limit: New traffic limit in MB (0 for unlimited). None to keep unchanged.
1380
+ status: New status (True=enabled, False=disabled). None to keep unchanged.
1381
+ proxy_type: Proxy product type.
1382
+
1383
+ Returns:
1384
+ API response data.
1385
+ """
1287
1386
  self._require_public_credentials()
1387
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1288
1388
  headers = build_public_api_headers(
1289
1389
  self.public_token or "", self.public_key or ""
1290
1390
  )
1291
- try:
1292
- response = self._api_request_with_retry(
1293
- "POST",
1294
- self._status_url,
1295
- data={"tasks_ids": task_id},
1296
- headers=headers,
1297
- )
1298
- response.raise_for_status()
1299
- data = response.json()
1300
- if data.get("code") != 200:
1301
- raise_for_code("Task status error", code=data.get("code"), payload=data)
1302
1391
 
1303
- items = data.get("data") or []
1304
- for item in items:
1305
- if str(item.get("task_id")) == str(task_id):
1306
- return item.get("status", "unknown")
1307
- return "unknown"
1308
- except requests.RequestException as e:
1309
- raise ThordataNetworkError(
1310
- f"Status check failed: {e}", original_error=e
1311
- ) from e
1312
-
1313
- def safe_get_task_status(self, task_id: str) -> str:
1314
- try:
1315
- return self.get_task_status(task_id)
1316
- except Exception:
1317
- return "error"
1392
+ payload = {
1393
+ "proxy_type": str(pt),
1394
+ "username": username,
1395
+ "password": password, # Include password
1396
+ }
1397
+ if traffic_limit is not None:
1398
+ payload["traffic_limit"] = str(traffic_limit)
1399
+ if status is not None:
1400
+ payload["status"] = "true" if status else "false"
1318
1401
 
1319
- def get_task_result(self, task_id: str, file_type: str = "json") -> str:
1320
- self._require_public_credentials()
1321
- headers = build_public_api_headers(
1322
- self.public_token or "", self.public_key or ""
1402
+ response = self._api_request_with_retry(
1403
+ "POST",
1404
+ f"{self._proxy_users_url}/update-user",
1405
+ data=payload,
1406
+ headers=headers,
1323
1407
  )
1324
- try:
1325
- response = self._api_request_with_retry(
1326
- "POST",
1327
- self._download_url,
1328
- data={"tasks_id": task_id, "type": file_type},
1329
- headers=headers,
1330
- )
1331
- response.raise_for_status()
1332
- data = response.json()
1333
- if data.get("code") == 200 and data.get("data"):
1334
- return data["data"]["download"]
1335
- raise_for_code("Get result failed", code=data.get("code"), payload=data)
1336
- return ""
1337
- except requests.RequestException as e:
1338
- raise ThordataNetworkError(
1339
- f"Get result failed: {e}", original_error=e
1340
- ) from e
1408
+ response.raise_for_status()
1409
+ data = response.json()
1410
+ if data.get("code") != 200:
1411
+ raise_for_code("Update user failed", code=data.get("code"), payload=data)
1412
+ return data.get("data", {})
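One quirk worth showing: the update endpoint expects the password even when it is not being changed, so a status toggle still resends it (placeholders again, with `client` as above):

    client.update_proxy_user(
        username="team-alpha",
        password="S3cret!",  # resent unchanged; required by the API
        status=False,        # disable the sub-account
    )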
1341
1413
 
1342
- def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
1414
+ def delete_proxy_user(
1415
+ self,
1416
+ username: str,
1417
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1418
+ ) -> dict[str, Any]:
1419
+ """Delete a proxy user.
1420
+
1421
+ Args:
1422
+ username: The sub-account username.
1423
+ proxy_type: Proxy product type.
1424
+
1425
+ Returns:
1426
+ API response data.
1427
+ """
1343
1428
  self._require_public_credentials()
1429
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1344
1430
  headers = build_public_api_headers(
1345
1431
  self.public_token or "", self.public_key or ""
1346
1432
  )
1433
+
1434
+ payload = {
1435
+ "proxy_type": str(pt),
1436
+ "username": username,
1437
+ }
1438
+
1347
1439
  response = self._api_request_with_retry(
1348
1440
  "POST",
1349
- self._list_url,
1350
- data={"page": str(page), "size": str(size)},
1441
+ f"{self._proxy_users_url}/delete-user",
1442
+ data=payload,
1351
1443
  headers=headers,
1352
1444
  )
1353
1445
  response.raise_for_status()
1354
1446
  data = response.json()
1355
1447
  if data.get("code") != 200:
1356
- raise_for_code("List tasks failed", code=data.get("code"), payload=data)
1357
- return data.get("data", {"count": 0, "list": []})
1358
-
1359
- def wait_for_task(
1360
- self,
1361
- task_id: str,
1362
- *,
1363
- poll_interval: float = 5.0,
1364
- max_wait: float = 600.0,
1365
- ) -> str:
1366
- import time
1448
+ raise_for_code("Delete user failed", code=data.get("code"), payload=data)
1449
+ return data.get("data", {})
1367
1450
 
1368
- start = time.monotonic()
1369
- while (time.monotonic() - start) < max_wait:
1370
- status = self.get_task_status(task_id)
1371
- if status.lower() in {
1372
- "ready",
1373
- "success",
1374
- "finished",
1375
- "failed",
1376
- "error",
1377
- "cancelled",
1378
- }:
1379
- return status
1380
- time.sleep(poll_interval)
1381
- raise TimeoutError(f"Task {task_id} timeout")
1451
+ # =========================================================================
1452
+ # Whitelist IP Management
1453
+ # =========================================================================
1382
1454
 
1383
- def run_task(
1455
+ def add_whitelist_ip(
1384
1456
  self,
1385
- file_name: str,
1386
- spider_id: str,
1387
- spider_name: str,
1388
- parameters: dict[str, Any],
1389
- universal_params: dict[str, Any] | None = None,
1390
- *,
1391
- max_wait: float = 600.0,
1392
- initial_poll_interval: float = 2.0,
1393
- max_poll_interval: float = 10.0,
1394
- include_errors: bool = True,
1395
- ) -> str:
1396
- """
1397
- High-level wrapper to Run a Web Scraper task and wait for the result download URL.
1398
-
1399
- This method handles the entire lifecycle:
1400
- 1. Create Task
1401
- 2. Poll status (with exponential backoff)
1402
- 3. Get download URL when ready
1457
+ ip: str,
1458
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1459
+ status: bool = True,
1460
+ ) -> dict[str, Any]:
1461
+ """Add an IP to the whitelist.
1403
1462
 
1404
1463
  Args:
1405
- file_name: Name for the output file.
1406
- spider_id: Spider identifier from Dashboard.
1407
- spider_name: Spider name (target domain).
1408
- parameters: Spider-specific parameters.
1409
- universal_params: Global spider settings.
1410
- max_wait: Maximum seconds to wait for task completion (default 600).
1411
- initial_poll_interval: Starting poll interval in seconds.
1412
- max_poll_interval: Maximum poll interval cap.
1413
- include_errors: Whether to include error logs in the task result.
1464
+ ip: IP address to whitelist.
1465
+ proxy_type: Proxy product type.
1466
+ status: Enable or disable the whitelist entry.
1414
1467
 
1415
1468
  Returns:
1416
- str: The download URL for the task result (default JSON).
1417
-
1418
- Raises:
1419
- ThordataTimeoutError: If task takes longer than max_wait.
1420
- ThordataAPIError: If task fails or is cancelled.
1469
+ API response data.
1421
1470
  """
1422
- import time
1423
-
1424
- # 1. Create Task
1425
- config = ScraperTaskConfig(
1426
- file_name=file_name,
1427
- spider_id=spider_id,
1428
- spider_name=spider_name,
1429
- parameters=parameters,
1430
- universal_params=universal_params,
1431
- include_errors=include_errors,
1471
+ self._require_public_credentials()
1472
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1473
+ headers = build_public_api_headers(
1474
+ self.public_token or "", self.public_key or ""
1432
1475
  )
1433
- task_id = self.create_scraper_task_advanced(config)
1434
- logger.info(f"Task created successfully: {task_id}. Waiting for completion...")
1435
-
1436
- # 2. Poll Status (Smart Backoff)
1437
- start_time = time.monotonic()
1438
- current_poll = initial_poll_interval
1439
-
1440
- while (time.monotonic() - start_time) < max_wait:
1441
- status = self.get_task_status(task_id)
1442
- status_lower = status.lower()
1443
-
1444
- if status_lower in {"ready", "success", "finished"}:
1445
- logger.info(f"Task {task_id} finished. Status: {status}")
1446
- # 3. Get Result
1447
- return self.get_task_result(task_id)
1448
-
1449
- if status_lower in {"failed", "error", "cancelled"}:
1450
- raise ThordataNetworkError(
1451
- f"Task {task_id} ended with failed status: {status}"
1452
- )
1453
-
1454
- # Wait and increase interval (capped)
1455
- time.sleep(current_poll)
1456
- current_poll = min(current_poll * 1.5, max_poll_interval)
1476
+ payload = {
1477
+ "proxy_type": str(pt),
1478
+ "ip": ip,
1479
+ "status": "true" if status else "false",
1480
+ }
1481
+ response = self._api_request_with_retry(
1482
+ "POST", f"{self._whitelist_url}/add-ip", data=payload, headers=headers
1483
+ )
1484
+ response.raise_for_status()
1485
+ data = response.json()
1486
+ if data.get("code") != 200:
1487
+ raise_for_code(
1488
+ "Add whitelist IP failed", code=data.get("code"), payload=data
1489
+ )
1490
+ return data.get("data", {})
1457
1491
 
1458
- raise ThordataTimeoutError(f"Task {task_id} timed out after {max_wait} seconds")
1492
+ def delete_whitelist_ip(
1493
+ self,
1494
+ ip: str,
1495
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1496
+ ) -> dict[str, Any]:
1497
+ """Delete an IP from the whitelist.
1459
1498
 
1460
- # =========================================================================
1461
- # Account / Locations / Utils
1462
- # =========================================================================
1499
+ Args:
1500
+ ip: The IP address to remove.
1501
+ proxy_type: Proxy product type.
1463
1502
 
1464
- def get_usage_statistics(
1465
- self,
1466
- from_date: str | date,
1467
- to_date: str | date,
1468
- ) -> UsageStatistics:
1503
+ Returns:
1504
+ API response data.
1505
+ """
1469
1506
  self._require_public_credentials()
1470
- if isinstance(from_date, date):
1471
- from_date = from_date.strftime("%Y-%m-%d")
1472
- if isinstance(to_date, date):
1473
- to_date = to_date.strftime("%Y-%m-%d")
1474
-
1475
- params = {
1476
- "token": self.public_token,
1477
- "key": self.public_key,
1478
- "from_date": from_date,
1479
- "to_date": to_date,
1507
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1508
+ headers = build_public_api_headers(
1509
+ self.public_token or "", self.public_key or ""
1510
+ )
1511
+ payload = {
1512
+ "proxy_type": str(pt),
1513
+ "ip": ip,
1480
1514
  }
1481
1515
  response = self._api_request_with_retry(
1482
- "GET", self._usage_stats_url, params=params
1516
+ "POST", f"{self._whitelist_url}/delete-ip", data=payload, headers=headers
1483
1517
  )
1484
1518
  response.raise_for_status()
1485
1519
  data = response.json()
1486
1520
  if data.get("code") != 200:
1487
- raise_for_code("Usage stats error", code=data.get("code"), payload=data)
1488
- return UsageStatistics.from_dict(data.get("data", data))
1521
+ raise_for_code(
1522
+ "Delete whitelist IP failed", code=data.get("code"), payload=data
1523
+ )
1524
+ return data.get("data", {})
1489
1525
 
1490
- def list_proxy_users(
1491
- self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
1492
- ) -> ProxyUserList:
1526
+ def list_whitelist_ips(
1527
+ self,
1528
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1529
+ ) -> list[str]:
1530
+ """List all whitelisted IPs.
1531
+
1532
+ Args:
1533
+ proxy_type: Proxy product type.
1534
+
1535
+ Returns:
1536
+ List of IP address strings.
1537
+ """
1493
1538
  self._require_public_credentials()
1494
1539
  pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1495
1540
  params = {
@@ -1498,179 +1543,863 @@ class ThordataClient:
1498
1543
  "proxy_type": str(pt),
1499
1544
  }
1500
1545
  response = self._api_request_with_retry(
1501
- "GET", f"{self._proxy_users_url}/user-list", params=params
1546
+ "GET", f"{self._whitelist_url}/ip-list", params=params
1502
1547
  )
1503
1548
  response.raise_for_status()
1504
1549
  data = response.json()
1505
1550
  if data.get("code") != 200:
1506
- raise_for_code("List users error", code=data.get("code"), payload=data)
1507
- return ProxyUserList.from_dict(data.get("data", data))
1551
+ raise_for_code(
1552
+ "List whitelist IPs failed", code=data.get("code"), payload=data
1553
+ )
1554
+
1555
+ # API usually returns {"data": ["1.1.1.1", ...]} OR {"data": [{"ip": "..."}]}
1556
+ items = data.get("data", []) or []
1557
+ result = []
1558
+ for item in items:
1559
+ if isinstance(item, str):
1560
+ result.append(item)
1561
+ elif isinstance(item, dict) and "ip" in item:
1562
+ result.append(str(item["ip"]))
1563
+ else:
1564
+ result.append(str(item))
1565
+ return result
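A round-trip sketch for the whitelist endpoints above, using a documentation-range placeholder IP and the same `client` as earlier:

    client.add_whitelist_ip("203.0.113.7")
    print(client.list_whitelist_ips())   # e.g. ["203.0.113.7", ...]
    client.delete_whitelist_ip("203.0.113.7")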
1566
+
1567
+ # =========================================================================
1568
+ # Locations & ASN Methods
1569
+ # =========================================================================
1570
+
1571
+ def list_countries(
1572
+ self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
1573
+ ) -> list[dict[str, Any]]:
1574
+ """List available countries for proxy locations.
1575
+
1576
+ Args:
1577
+ proxy_type: Proxy product type.
1578
+
1579
+ Returns:
1580
+ List of country dictionaries.
1581
+ """
1582
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1583
+ return self._get_locations("countries", proxy_type=pt)
1584
+
1585
+ def list_states(
1586
+ self,
1587
+ country_code: str,
1588
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1589
+ ) -> list[dict[str, Any]]:
1590
+ """List available states/provinces for a country.
1591
+
1592
+ Args:
1593
+ country_code: Country code (e.g., "US", "GB").
1594
+ proxy_type: Proxy product type.
1595
+
1596
+ Returns:
1597
+ List of state dictionaries.
1598
+ """
1599
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1600
+ return self._get_locations("states", proxy_type=pt, country_code=country_code)
1601
+
1602
+ def list_cities(
1603
+ self,
1604
+ country_code: str,
1605
+ state_code: str | None = None,
1606
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1607
+ ) -> list[dict[str, Any]]:
1608
+ """List available cities for a country/state.
1609
+
1610
+ Args:
1611
+ country_code: Country code.
1612
+ state_code: State code (optional).
1613
+ proxy_type: Proxy product type.
1614
+
1615
+ Returns:
1616
+ List of city dictionaries.
1617
+ """
1618
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1619
+ kwargs = {"proxy_type": pt, "country_code": country_code}
1620
+ if state_code:
1621
+ kwargs["state_code"] = state_code
1622
+ return self._get_locations("cities", **kwargs)
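The location helpers are meant to be drilled down in order; a sketch (the keys inside the returned dictionaries come from the Locations API and are not shown in this diff):

    countries = client.list_countries()
    states = client.list_states("US")
    cities = client.list_cities("US", state_code="CA")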
1623
+
1624
+ def list_asn(
1625
+ self,
1626
+ country_code: str,
1627
+ proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1628
+ ) -> list[dict[str, Any]]:
1629
+ """List available ASN numbers for a country.
1630
+
1631
+ Args:
1632
+ country_code: Country code.
1633
+ proxy_type: Proxy product type.
1634
+
1635
+ Returns:
1636
+ List of ASN dictionaries.
1637
+ """
1638
+ pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1639
+ return self._get_locations("asn", proxy_type=pt, country_code=country_code)
1640
+
1641
+ # =========================================================================
1642
+ # ISP & Datacenter Proxy Management
1643
+ # =========================================================================
1644
+
1645
+ def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
1646
+ """List purchased proxy servers (ISP/Datacenter).
1647
+
1648
+ Args:
1649
+ proxy_type: Proxy type (1=ISP, 2=Datacenter).
1650
+
1651
+ Returns:
1652
+ List of ProxyServer objects.
1653
+ """
1654
+ self._require_public_credentials()
1655
+ params = {
1656
+ "token": self.public_token,
1657
+ "key": self.public_key,
1658
+ "proxy_type": str(proxy_type),
1659
+ }
1660
+ response = self._api_request_with_retry(
1661
+ "GET", self._proxy_list_url, params=params
1662
+ )
1663
+ response.raise_for_status()
1664
+ data = response.json()
1665
+ if data.get("code") != 200:
1666
+ raise_for_code(
1667
+ "List proxy servers error", code=data.get("code"), payload=data
1668
+ )
1669
+
1670
+ server_list = []
1671
+ if isinstance(data, dict):
1672
+ server_list = data.get("data", data.get("list", []))
1673
+ elif isinstance(data, list):
1674
+ server_list = data
1675
+
1676
+ return [ProxyServer.from_dict(s) for s in server_list]
1677
+
1678
+ def get_proxy_expiration(
1679
+ self, ips: str | list[str], proxy_type: int
1680
+ ) -> dict[str, Any]:
1681
+ """Get expiration time for proxy IPs.
1682
+
1683
+ Args:
1684
+ ips: Single IP or comma-separated list of IPs.
1685
+ proxy_type: Proxy type (1=ISP, 2=Datacenter).
1686
+
1687
+ Returns:
1688
+ Dictionary with IP expiration times.
1689
+ """
1690
+ self._require_public_credentials()
1691
+ if isinstance(ips, list):
1692
+ ips = ",".join(ips)
1693
+ params = {
1694
+ "token": self.public_token,
1695
+ "key": self.public_key,
1696
+ "proxy_type": str(proxy_type),
1697
+ "ips": ips,
1698
+ }
1699
+ response = self._api_request_with_retry(
1700
+ "GET", self._proxy_expiration_url, params=params
1701
+ )
1702
+ response.raise_for_status()
1703
+ data = response.json()
1704
+ if data.get("code") != 200:
1705
+ raise_for_code("Get expiration error", code=data.get("code"), payload=data)
1706
+ return data.get("data", data)
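For the ISP/Datacenter helpers, proxy_type is a plain int (1=ISP, 2=Datacenter per the docstrings); the IPs below are placeholders:

    servers = client.list_proxy_servers(proxy_type=1)
    expiry = client.get_proxy_expiration(
        ["198.51.100.10", "198.51.100.11"],  # or a single "ip1,ip2" string
        proxy_type=1,
    )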
1707
+
1708
+ # =========================================================================
1709
+ # Internal Request Helpers
1710
+ # =========================================================================
1711
+
1712
+ def _api_request_with_retry(
1713
+ self,
1714
+ method: str,
1715
+ url: str,
1716
+ *,
1717
+ data: dict[str, Any] | None = None,
1718
+ headers: dict[str, str] | None = None,
1719
+ params: dict[str, Any] | None = None,
1720
+ ) -> requests.Response:
1721
+ """Make an API request with retry logic.
1722
+
1723
+ Args:
1724
+ method: HTTP method.
1725
+ url: Request URL.
1726
+ data: Request body data.
1727
+ headers: Request headers.
1728
+ params: Query string parameters.
1729
+
1730
+ Returns:
1731
+ Response object.
1732
+ """
1733
+
1734
+ @with_retry(self._retry_config)
1735
+ def _do_request() -> requests.Response:
1736
+ return self._api_session.request(
1737
+ method,
1738
+ url,
1739
+ data=data,
1740
+ headers=headers,
1741
+ params=params,
1742
+ timeout=self._api_timeout,
1743
+ )
1744
+
1745
+ try:
1746
+ return _do_request()
1747
+ except requests.Timeout as e:
1748
+ raise ThordataTimeoutError(
1749
+ f"API request timed out: {e}", original_error=e
1750
+ ) from e
1751
+ except requests.RequestException as e:
1752
+ raise ThordataNetworkError(
1753
+ f"API request failed: {e}", original_error=e
1754
+ ) from e
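The wrapper above delegates to the SDK's with_retry decorator, which is not shown in this diff; the pattern it implements looks roughly like the stand-in below (attempt counts and backoff values are illustrative, not the SDK's actual defaults):

    import time
    from typing import Any, Callable, TypeVar

    T = TypeVar("T")

    def with_retry_sketch(attempts: int = 3, backoff: float = 0.5):
        def decorator(fn: Callable[..., T]) -> Callable[..., T]:
            def wrapper(*args: Any, **kwargs: Any) -> T:
                last_exc: Exception | None = None
                for i in range(attempts):
                    try:
                        return fn(*args, **kwargs)
                    except Exception as exc:  # real code narrows to retryable errors
                        last_exc = exc
                        time.sleep(backoff * (2 ** i))  # exponential backoff
                raise last_exc  # type: ignore[misc]
            return wrapper
        return decorator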
1755
+
1756
+ def _require_public_credentials(self) -> None:
1757
+ """Check that public credentials are set."""
1758
+ if not self.public_token or not self.public_key:
1759
+ raise ThordataConfigError(
1760
+ "public_token and public_key are required for this operation."
1761
+ )
1762
+
1763
+ def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
1764
+ """Internal method to fetch location data.
1765
+
1766
+ Args:
1767
+ endpoint: Location endpoint (countries, states, cities, asn).
1768
+ **kwargs: Query parameters.
1769
+
1770
+ Returns:
1771
+ List of location dictionaries.
1772
+ """
1773
+ self._require_public_credentials()
1774
+ params = {"token": self.public_token, "key": self.public_key}
1775
+ for k, v in kwargs.items():
1776
+ params[k] = str(v)
1777
+
1778
+ response = self._api_request_with_retry(
1779
+ "GET", f"{self._locations_base_url}/{endpoint}", params=params
1780
+ )
1781
+ response.raise_for_status()
1782
+ data = response.json()
1783
+ if isinstance(data, dict):
1784
+ if data.get("code") != 200:
1785
+ raise_for_code("Locations error", code=data.get("code"), payload=data)
1786
+ return data.get("data") or []
1787
+ return data if isinstance(data, list) else []
1788
+
1789
+ def _process_universal_response(
1790
+ self, response: requests.Response, output_format: str
1791
+ ) -> str | bytes:
1792
+ """Process Universal API response.
1793
+
1794
+ Args:
1795
+ response: Response object.
1796
+ output_format: Expected output format.
1797
+
1798
+ Returns:
1799
+ Processed content.
1800
+ """
1801
+ try:
1802
+ resp_json = response.json()
1803
+ except ValueError:
1804
+ return response.content if output_format.lower() == "png" else response.text
1805
+
1806
+ if isinstance(resp_json, dict):
1807
+ code = resp_json.get("code")
1808
+ if code is not None and code != 200:
1809
+ msg = extract_error_message(resp_json)
1810
+ raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
1811
+
1812
+ if "html" in resp_json:
1813
+ return resp_json["html"]
1814
+ if "png" in resp_json:
1815
+ return decode_base64_image(resp_json["png"])
1816
+
1817
+ return str(resp_json)
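The dispatch above means a successful Universal response is either an "html" string or a base64-encoded "png". A sketch of the png branch (decode_base64_image is the SDK's helper; the stand-in below shows the assumed shape of the work):

    import base64

    def decode_png_sketch(encoded: str) -> bytes:
        # Strip an optional data-URI prefix, then decode to raw bytes.
        if encoded.startswith("data:"):
            encoded = encoded.split(",", 1)[1]
        return base64.b64decode(encoded)

    payload = {"code": 200, "png": base64.b64encode(b"\x89PNG...").decode()}
    image_bytes = decode_png_sketch(payload["png"])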
1818
+
1819
+ # =========================================================================
1820
+ # Proxy Implementation Details
1821
+ # =========================================================================
1822
+
1823
+ def _proxy_verb(
1824
+ self,
1825
+ method: str,
1826
+ url: str,
1827
+ proxy_config: ProxyConfig | None,
1828
+ timeout: int | None,
1829
+ **kwargs: Any,
1830
+ ) -> requests.Response:
1831
+ """Internal method for proxy requests."""
1832
+ timeout = timeout or self._default_timeout
1833
+
1834
+ if proxy_config is None:
1835
+ proxy_config = self._get_default_proxy_config_from_env()
1836
+
1837
+ if proxy_config is None:
1838
+ raise ThordataConfigError(
1839
+ "Proxy credentials are missing. "
1840
+ "Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
1841
+ )
1842
+
1843
+ kwargs.pop("proxies", None)
1844
+
1845
+ @with_retry(self._retry_config)
1846
+ def _do() -> requests.Response:
1847
+ return self._proxy_request_with_proxy_manager(
1848
+ method,
1849
+ url,
1850
+ proxy_config=proxy_config,
1851
+ timeout=timeout,
1852
+ headers=kwargs.pop("headers", None),
1853
+ params=kwargs.pop("params", None),
1854
+ data=kwargs.pop("data", None),
1855
+ )
1856
+
1857
+ try:
1858
+ return _do()
1859
+ except requests.Timeout as e:
1860
+ raise ThordataTimeoutError(
1861
+ f"Request timed out: {e}", original_error=e
1862
+ ) from e
1863
+ except Exception as e:
1864
+ raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
1865
+
1866
+ def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
1867
+ """Build a stable cache key for ProxyManager instances."""
1868
+ if not userpass:
1869
+ return proxy_endpoint
1870
+ h = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
1871
+ return f"{proxy_endpoint}|auth={h}"
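The cache key deliberately hashes the credentials instead of embedding them; a quick demonstration that distinct credential sets get distinct pools while secrets stay out of the key (endpoint value is a placeholder):

    import hashlib

    def key_for(endpoint: str, userpass: str | None) -> str:
        if not userpass:
            return endpoint
        digest = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
        return f"{endpoint}|auth={digest}"

    k1 = key_for("http://pr.thordata.net:9999", "alice:pw1")
    k2 = key_for("http://pr.thordata.net:9999", "bob:pw2")
    assert k1 != k2         # one pooled manager per credential set
    assert "pw1" not in k1  # raw secrets never appear in the key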
1872
+
1873
+ def _get_proxy_manager(
1874
+ self,
1875
+ proxy_url: str,
1876
+ *,
1877
+ cache_key: str,
1878
+ proxy_headers: dict[str, str] | None = None,
1879
+ ) -> urllib3.PoolManager:
1880
+ """Get or create a ProxyManager for the given proxy URL (Pooled)."""
1881
+ cached = self._proxy_managers.get(cache_key)
1882
+ if cached is not None:
1883
+ return cached
1884
+
1885
+ if proxy_url.startswith(("socks5://", "socks5h://", "socks4://", "socks4a://")):
1886
+ try:
1887
+ from urllib3.contrib.socks import SOCKSProxyManager
1888
+ except Exception as e:
1889
+ raise ThordataConfigError(
1890
+ "SOCKS proxy requested but SOCKS dependencies are missing. "
1891
+ "Install: pip install 'urllib3[socks]' or pip install PySocks"
1892
+ ) from e
1893
+
1894
+ pm_socks = SOCKSProxyManager(
1895
+ proxy_url,
1896
+ num_pools=10,
1897
+ maxsize=10,
1898
+ )
1899
+ pm = cast(urllib3.PoolManager, pm_socks)
1900
+ self._proxy_managers[cache_key] = pm
1901
+ return pm
1902
+
1903
+ # HTTP/HTTPS proxies
1904
+ proxy_ssl_context = None
1905
+ if proxy_url.startswith("https://"):
1906
+ proxy_ssl_context = ssl.create_default_context()
1907
+
1908
+ pm_http = urllib3.ProxyManager(
1909
+ proxy_url,
1910
+ proxy_headers=proxy_headers,
1911
+ proxy_ssl_context=proxy_ssl_context,
1912
+ num_pools=10,
1913
+ maxsize=10,
1914
+ )
1915
+
1916
+ pm = cast(urllib3.PoolManager, pm_http)
1917
+ self._proxy_managers[cache_key] = pm
1918
+ return pm
1919
+
1920
+ def _proxy_request_with_proxy_manager(
1921
+ self,
1922
+ method: str,
1923
+ url: str,
1924
+ *,
1925
+ proxy_config: ProxyConfig,
1926
+ timeout: int,
1927
+ headers: dict[str, str] | None = None,
1928
+ params: dict[str, Any] | None = None,
1929
+ data: Any = None,
1930
+ ) -> requests.Response:
1931
+ """Execute request through proxy, with optional upstream proxy support."""
1932
+
1933
+ # Check for upstream proxy
1934
+ upstream_config = _parse_upstream_proxy()
1935
+
1936
+ if upstream_config:
1937
+ return self._proxy_request_with_upstream(
1938
+ method,
1939
+ url,
1940
+ proxy_config=proxy_config,
1941
+ timeout=timeout,
1942
+ headers=headers,
1943
+ params=params,
1944
+ data=data,
1945
+ upstream_config=upstream_config,
1946
+ )
1947
+
1948
+ # Original implementation (no upstream proxy)
1949
+ req = requests.Request(method=method.upper(), url=url, params=params)
1950
+ prepped = self._proxy_session.prepare_request(req)
1951
+ final_url = prepped.url or url
1952
+
1953
+ proxy_endpoint = proxy_config.build_proxy_endpoint()
1954
+ is_socks = proxy_endpoint.startswith(
1955
+ ("socks5://", "socks5h://", "socks4://", "socks4a://")
1956
+ )
1957
+
1958
+ if is_socks:
1959
+ proxy_url_for_manager = proxy_config.build_proxy_url()
1960
+ userpass = proxy_config.build_proxy_basic_auth()
1961
+ cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
1962
+
1963
+ pm = self._get_proxy_manager(
1964
+ proxy_url_for_manager,
1965
+ cache_key=cache_key,
1966
+ proxy_headers=None,
1967
+ )
1968
+ else:
1969
+ userpass = proxy_config.build_proxy_basic_auth()
1970
+ proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
1971
+ cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
1972
+
1973
+ pm = self._get_proxy_manager(
1974
+ proxy_endpoint,
1975
+ cache_key=cache_key,
1976
+ proxy_headers=dict(proxy_headers),
1977
+ )
1978
+
1979
+ req_headers = dict(headers or {})
1980
+ body = None
1981
+ if data is not None:
1982
+ if isinstance(data, dict):
1983
+ body = urlencode({k: str(v) for k, v in data.items()})
1984
+ req_headers.setdefault(
1985
+ "Content-Type", "application/x-www-form-urlencoded"
1986
+ )
1987
+ else:
1988
+ body = data
1989
+
1990
+ http_resp = pm.request(
1991
+ method.upper(),
1992
+ final_url,
1993
+ body=body,
1994
+ headers=req_headers or None,
1995
+ timeout=urllib3.Timeout(connect=timeout, read=timeout),
1996
+ retries=False,
1997
+ preload_content=True,
1998
+ )
1999
+
2000
+ r = requests.Response()
2001
+ r.status_code = int(getattr(http_resp, "status", 0) or 0)
2002
+ r._content = http_resp.data or b""
2003
+ r.url = final_url
2004
+ r.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
2005
+ return r
2006
+
2007
+ # =========================================================================
2008
+ # Upstream Proxy Support (Proxy Chaining)
2009
+ # =========================================================================
2010
+
2011
+ def _proxy_request_with_upstream(
2012
+ self,
2013
+ method: str,
2014
+ url: str,
2015
+ *,
2016
+ proxy_config: ProxyConfig,
2017
+ timeout: int,
2018
+ headers: dict[str, str] | None = None,
2019
+ params: dict[str, Any] | None = None,
2020
+ data: Any = None,
2021
+ upstream_config: dict[str, Any],
2022
+ ) -> requests.Response:
2023
+ """Execute request through proxy chain: Upstream -> Thordata -> Target."""
2024
+ if not HAS_PYSOCKS:
2025
+ raise ThordataConfigError(
2026
+ "PySocks is required for upstream proxy support. "
2027
+ "Install with: pip install PySocks"
2028
+ )
2029
+
2030
+ req = requests.Request(method=method.upper(), url=url, params=params)
2031
+ prepped = self._proxy_session.prepare_request(req)
2032
+ final_url = prepped.url or url
2033
+
2034
+ parsed_target = urlparse(final_url)
2035
+ target_host = parsed_target.hostname or ""
2036
+ target_port = parsed_target.port or (
2037
+ 443 if parsed_target.scheme == "https" else 80
2038
+ )
2039
+ target_is_https = parsed_target.scheme == "https"
2040
+
2041
+ protocol = proxy_config.protocol.lower()
2042
+ if protocol == "socks5":
2043
+ protocol = "socks5h"
2044
+
2045
+ thordata_host = proxy_config.host or ""
2046
+ thordata_port = proxy_config.port or 9999
2047
+ thordata_username = proxy_config.build_username()
2048
+ thordata_password = proxy_config.password
2049
+
2050
+ socket_factory = _UpstreamProxySocketFactory(upstream_config)
2051
+
2052
+ logger.debug(
2053
+ f"Proxy chain: upstream({upstream_config['host']}:{upstream_config['port']}) "
2054
+ f"-> thordata({protocol}://{thordata_host}:{thordata_port}) "
2055
+ f"-> target({target_host}:{target_port})"
2056
+ )
2057
+
2058
+ raw_sock = socket_factory.create_connection(
2059
+ (thordata_host, thordata_port),
2060
+ timeout=float(timeout),
2061
+ )
2062
+
2063
+ try:
2064
+ if protocol.startswith("socks"):
2065
+ sock = self._socks5_handshake(
2066
+ raw_sock,
2067
+ target_host,
2068
+ target_port,
2069
+ thordata_username,
2070
+ thordata_password,
2071
+ )
2072
+ if target_is_https:
2073
+ context = ssl.create_default_context()
2074
+ sock = context.wrap_socket(sock, server_hostname=target_host)
2075
+
2076
+ elif protocol == "https":
2077
+ proxy_context = ssl.create_default_context()
2078
+ proxy_ssl_sock = proxy_context.wrap_socket(
2079
+ raw_sock, server_hostname=thordata_host
2080
+ )
2081
+
2082
+ self._send_connect_request(
2083
+ proxy_ssl_sock,
2084
+ target_host,
2085
+ target_port,
2086
+ thordata_username,
2087
+ thordata_password,
2088
+ )
2089
+
2090
+ if target_is_https:
2091
+ # FIX: Add type ignore for MyPy because _TLSInTLSSocket is duck-typed as socket
2092
+ sock = self._create_tls_in_tls_socket(
2093
+ proxy_ssl_sock, target_host, timeout
2094
+ ) # type: ignore[assignment]
2095
+ else:
2096
+ sock = proxy_ssl_sock
2097
+
2098
+ else: # HTTP proxy
2099
+ self._send_connect_request(
2100
+ raw_sock,
2101
+ target_host,
2102
+ target_port,
2103
+ thordata_username,
2104
+ thordata_password,
2105
+ )
2106
+
2107
+ if target_is_https:
2108
+ context = ssl.create_default_context()
2109
+ sock = context.wrap_socket(raw_sock, server_hostname=target_host)
2110
+ else:
2111
+ sock = raw_sock
2112
+
2113
+ return self._send_http_request(
2114
+ sock, method, parsed_target, headers, data, final_url, timeout
2115
+ )
2116
+
2117
+ finally:
2118
+ with contextlib.suppress(Exception):
2119
+ raw_sock.close()
2120
+
2121
+ def _send_connect_request(
2122
+ self,
2123
+ sock: socket.socket,
2124
+ target_host: str,
2125
+ target_port: int,
2126
+ proxy_username: str,
2127
+ proxy_password: str,
2128
+ ) -> None:
2129
+ """Send HTTP CONNECT request to proxy and verify response."""
2130
+ connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
2131
+ connect_req += f"Host: {target_host}:{target_port}\r\n"
2132
+
2133
+ credentials = f"{proxy_username}:{proxy_password}"
2134
+ encoded = base64.b64encode(credentials.encode()).decode()
2135
+ connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
2136
+ connect_req += "\r\n"
2137
+
2138
+ sock.sendall(connect_req.encode())
2139
+
2140
+ response = b""
2141
+ while b"\r\n\r\n" not in response:
2142
+ chunk = sock.recv(4096)
2143
+ if not chunk:
2144
+ raise ConnectionError("Proxy closed connection during CONNECT")
2145
+ response += chunk
2146
+
2147
+ status_line = response.split(b"\r\n")[0].decode()
2148
+ if "200" not in status_line:
2149
+ raise ConnectionError(f"Proxy CONNECT failed: {status_line}")
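For concreteness, the CONNECT preamble built above is plain text; with a hypothetical target and credentials it serializes to:

    import base64

    encoded = base64.b64encode(b"user:pass").decode()
    connect_req = (
        "CONNECT example.com:443 HTTP/1.1\r\n"
        "Host: example.com:443\r\n"
        f"Proxy-Authorization: Basic {encoded}\r\n"
        "\r\n"
    )
    # The proxy answers with a status line such as
    # "HTTP/1.1 200 Connection established" before the tunnel opens.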
2150
+
2151
+ def _create_tls_in_tls_socket(
2152
+ self,
2153
+ outer_ssl_sock: ssl.SSLSocket,
2154
+ hostname: str,
2155
+ timeout: int,
2156
+ ) -> _TLSInTLSSocket:
2157
+ """Create a TLS connection over an existing TLS connection."""
2158
+ context = ssl.create_default_context()
2159
+
2160
+ incoming = ssl.MemoryBIO()
2161
+ outgoing = ssl.MemoryBIO()
2162
+
2163
+ ssl_obj = context.wrap_bio(incoming, outgoing, server_hostname=hostname)
2164
+
2165
+ while True:
2166
+ try:
2167
+ ssl_obj.do_handshake()
2168
+ break
2169
+ except ssl.SSLWantReadError:
2170
+ data_to_send = outgoing.read()
2171
+ if data_to_send:
2172
+ outer_ssl_sock.sendall(data_to_send)
2173
+
2174
+ outer_ssl_sock.settimeout(float(timeout))
2175
+ try:
2176
+ received = outer_ssl_sock.recv(8192)
2177
+ if not received:
2178
+ raise ConnectionError("Connection closed during TLS handshake")
2179
+ incoming.write(received)
2180
+ except socket.timeout as e:
2181
+ raise ConnectionError("Timeout during TLS handshake") from e
2182
+ except ssl.SSLWantWriteError:
2183
+ data_to_send = outgoing.read()
2184
+ if data_to_send:
2185
+ outer_ssl_sock.sendall(data_to_send)
2186
+
2187
+ data_to_send = outgoing.read()
2188
+ if data_to_send:
2189
+ outer_ssl_sock.sendall(data_to_send)
2190
+
2191
+ return _TLSInTLSSocket(outer_ssl_sock, ssl_obj, incoming, outgoing)
2192
+
2193
+ def _send_http_request(
2194
+ self,
2195
+ sock: socket.socket | ssl.SSLSocket | Any,
2196
+ method: str,
2197
+ parsed_url: Any,
2198
+ headers: dict[str, str] | None,
2199
+ data: Any,
2200
+ final_url: str,
2201
+ timeout: int,
2202
+ ) -> requests.Response:
2203
+ """Send HTTP request over established connection and parse response."""
2204
+ target_host = parsed_url.hostname
2205
+
2206
+ req_headers = dict(headers or {})
2207
+ req_headers.setdefault("Host", target_host)
2208
+ req_headers.setdefault("User-Agent", build_user_agent(_sdk_version, "requests"))
2209
+ req_headers.setdefault("Connection", "close")
2210
+
2211
+ path = parsed_url.path or "/"
2212
+ if parsed_url.query:
2213
+ path += f"?{parsed_url.query}"
2214
+
2215
+ http_req = f"{method.upper()} {path} HTTP/1.1\r\n"
2216
+ for k, v in req_headers.items():
2217
+ http_req += f"{k}: {v}\r\n"
2218
+
2219
+ body = None
2220
+ if data is not None:
2221
+ if isinstance(data, dict):
2222
+ body = urlencode({k: str(v) for k, v in data.items()}).encode()
2223
+ http_req += "Content-Type: application/x-www-form-urlencoded\r\n"
2224
+ http_req += f"Content-Length: {len(body)}\r\n"
2225
+ elif isinstance(data, bytes):
2226
+ body = data
2227
+ http_req += f"Content-Length: {len(body)}\r\n"
2228
+ else:
2229
+ body = str(data).encode()
2230
+ http_req += f"Content-Length: {len(body)}\r\n"
2231
+
2232
+ http_req += "\r\n"
2233
+ sock.sendall(http_req.encode())
2234
+
2235
+ if body:
2236
+ sock.sendall(body)
2237
+
2238
+ if hasattr(sock, "settimeout"):
2239
+ sock.settimeout(float(timeout))
2240
+
2241
+ response_data = b""
2242
+ try:
2243
+ while True:
2244
+ chunk = sock.recv(8192)
2245
+ if not chunk:
2246
+ break
2247
+ response_data += chunk
2248
+ if b"\r\n\r\n" in response_data:
2249
+ header_end = response_data.index(b"\r\n\r\n") + 4
2250
+ headers_part = (
2251
+ response_data[:header_end]
2252
+ .decode("utf-8", errors="replace")
2253
+ .lower()
2254
+ )
2255
+ if "content-length:" in headers_part:
2256
+ for line in headers_part.split("\r\n"):
2257
+ if line.startswith("content-length:"):
2258
+ content_length = int(line.split(":")[1].strip())
2259
+ if len(response_data) >= header_end + content_length:
2260
+ break
2261
+ elif "transfer-encoding: chunked" not in headers_part:
2262
+ break
2263
+ except socket.timeout:
2264
+ pass
2265
+
2266
+ return self._parse_http_response(response_data, final_url)
2267
+
2268
+ def _socks5_handshake(
2269
+ self,
2270
+ sock: socket.socket,
2271
+ target_host: str,
2272
+ target_port: int,
2273
+ username: str | None,
2274
+ password: str | None,
2275
+ ) -> socket.socket:
2276
+ """Perform SOCKS5 handshake over existing socket."""
2277
+ if username and password:
2278
+ sock.sendall(b"\x05\x02\x00\x02")
2279
+ else:
2280
+ sock.sendall(b"\x05\x01\x00")
2281
+
2282
+ response = sock.recv(2)
2283
+ if len(response) < 2:
2284
+ raise ConnectionError("SOCKS5 handshake failed: incomplete response")
2285
+
2286
+ if response[0] != 0x05:
2287
+ raise ConnectionError(f"SOCKS5 version mismatch: {response[0]}")
2288
+
2289
+ auth_method = response[1]
2290
+
2291
+ if auth_method == 0x02:
2292
+ if not username or not password:
2293
+ raise ConnectionError(
2294
+ "SOCKS5 server requires auth but no credentials provided"
2295
+ )
2296
+
2297
+ auth_req = bytes([0x01, len(username)]) + username.encode()
2298
+ auth_req += bytes([len(password)]) + password.encode()
2299
+ sock.sendall(auth_req)
1508
2300
 
1509
- def create_proxy_user(
1510
- self,
1511
- username: str,
1512
- password: str,
1513
- proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1514
- traffic_limit: int = 0,
1515
- status: bool = True,
1516
- ) -> dict[str, Any]:
1517
- self._require_public_credentials()
1518
- pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1519
- headers = build_public_api_headers(
1520
- self.public_token or "", self.public_key or ""
1521
- )
1522
- payload = {
1523
- "proxy_type": str(pt),
1524
- "username": username,
1525
- "password": password,
1526
- "traffic_limit": str(traffic_limit),
1527
- "status": "true" if status else "false",
1528
- }
1529
- response = self._api_request_with_retry(
1530
- "POST",
1531
- f"{self._proxy_users_url}/create-user",
1532
- data=payload,
1533
- headers=headers,
1534
- )
1535
- response.raise_for_status()
1536
- data = response.json()
1537
- if data.get("code") != 200:
1538
- raise_for_code("Create user failed", code=data.get("code"), payload=data)
1539
- return data.get("data", {})
2301
+ auth_resp = sock.recv(2)
2302
+ if len(auth_resp) < 2 or auth_resp[1] != 0x00:
2303
+ raise ConnectionError("SOCKS5 authentication failed")
1540
2304
 
1541
- def add_whitelist_ip(
1542
- self,
1543
- ip: str,
1544
- proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1545
- status: bool = True,
1546
- ) -> dict[str, Any]:
1547
- self._require_public_credentials()
1548
- pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1549
- headers = build_public_api_headers(
1550
- self.public_token or "", self.public_key or ""
1551
- )
1552
- payload = {
1553
- "proxy_type": str(pt),
1554
- "ip": ip,
1555
- "status": "true" if status else "false",
1556
- }
1557
- response = self._api_request_with_retry(
1558
- "POST", f"{self._whitelist_url}/add-ip", data=payload, headers=headers
1559
- )
1560
- response.raise_for_status()
1561
- data = response.json()
1562
- if data.get("code") != 200:
1563
- raise_for_code(
1564
- "Add whitelist IP failed", code=data.get("code"), payload=data
1565
- )
1566
- return data.get("data", {})
2305
+ elif auth_method == 0xFF:
2306
+ raise ConnectionError("SOCKS5 no acceptable auth method")
1567
2307
 
1568
- def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
1569
- self._require_public_credentials()
1570
- params = {
1571
- "token": self.public_token,
1572
- "key": self.public_key,
1573
- "proxy_type": str(proxy_type),
1574
- }
1575
- response = self._api_request_with_retry(
1576
- "GET", self._proxy_list_url, params=params
1577
- )
1578
- response.raise_for_status()
1579
- data = response.json()
1580
- if data.get("code") != 200:
1581
- raise_for_code(
1582
- "List proxy servers error", code=data.get("code"), payload=data
1583
- )
2308
+ connect_req = b"\x05\x01\x00\x03"
2309
+ connect_req += bytes([len(target_host)]) + target_host.encode()
2310
+ connect_req += target_port.to_bytes(2, "big")
2311
+ sock.sendall(connect_req)
1584
2312
 
1585
- server_list = []
1586
- if isinstance(data, dict):
1587
- server_list = data.get("data", data.get("list", []))
1588
- elif isinstance(data, list):
1589
- server_list = data
2313
+ resp = sock.recv(4)
2314
+ if len(resp) < 4:
2315
+ raise ConnectionError("SOCKS5 connect failed: incomplete response")
1590
2316
 
1591
- return [ProxyServer.from_dict(s) for s in server_list]
2317
+ if resp[1] != 0x00:
2318
+ error_codes = {
2319
+ 0x01: "General failure",
2320
+ 0x02: "Connection not allowed",
2321
+ 0x03: "Network unreachable",
2322
+ 0x04: "Host unreachable",
2323
+ 0x05: "Connection refused",
2324
+ 0x06: "TTL expired",
2325
+ 0x07: "Command not supported",
2326
+ 0x08: "Address type not supported",
2327
+ }
2328
+ error_msg = error_codes.get(resp[1], f"Unknown error {resp[1]}")
2329
+ raise ConnectionError(f"SOCKS5 connect failed: {error_msg}")
1592
2330
 
1593
- def get_proxy_expiration(
1594
- self, ips: str | list[str], proxy_type: int
1595
- ) -> dict[str, Any]:
1596
- self._require_public_credentials()
1597
- if isinstance(ips, list):
1598
- ips = ",".join(ips)
1599
- params = {
1600
- "token": self.public_token,
1601
- "key": self.public_key,
1602
- "proxy_type": str(proxy_type),
1603
- "ips": ips,
1604
- }
1605
- response = self._api_request_with_retry(
1606
- "GET", self._proxy_expiration_url, params=params
1607
- )
1608
- response.raise_for_status()
1609
- data = response.json()
1610
- if data.get("code") != 200:
1611
- raise_for_code("Get expiration error", code=data.get("code"), payload=data)
1612
- return data.get("data", data)
2331
+ addr_type = resp[3]
2332
+ if addr_type == 0x01:
2333
+ sock.recv(4 + 2)
2334
+ elif addr_type == 0x03:
2335
+ domain_len = sock.recv(1)[0]
2336
+ sock.recv(domain_len + 2)
2337
+ elif addr_type == 0x04:
2338
+ sock.recv(16 + 2)
1613
2339
 
1614
- def list_countries(
1615
- self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
1616
- ) -> list[dict[str, Any]]:
1617
- pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1618
- return self._get_locations("countries", proxy_type=pt)
2340
+ return sock
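The byte frames exchanged above follow RFC 1928/1929; a compact sketch of the two client-side frames for a hypothetical domain target:

    # Greeting: version 5, one method offered, username/password auth (0x02).
    greeting = b"\x05\x01\x02"
    # CONNECT: version 5, command 1, reserved 0, address type 3 (domain name).
    host, port = b"example.com", 443
    connect = b"\x05\x01\x00\x03" + bytes([len(host)]) + host + port.to_bytes(2, "big")
    assert connect[-2:] == b"\x01\xbb"  # 443 in network byte order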
1619
2341
 
1620
- def list_states(
2342
+ def _parse_http_response(
1621
2343
  self,
1622
- country_code: str,
1623
- proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1624
- ) -> list[dict[str, Any]]:
1625
- pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1626
- return self._get_locations("states", proxy_type=pt, country_code=country_code)
2344
+ response_data: bytes,
2345
+ url: str,
2346
+ ) -> requests.Response:
2347
+ """Parse raw HTTP response into requests.Response."""
2348
+ if b"\r\n\r\n" in response_data:
2349
+ header_data, body = response_data.split(b"\r\n\r\n", 1)
2350
+ else:
2351
+ header_data = response_data
2352
+ body = b""
1627
2353
 
1628
- def list_cities(
1629
- self,
1630
- country_code: str,
1631
- state_code: str | None = None,
1632
- proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1633
- ) -> list[dict[str, Any]]:
1634
- pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1635
- kwargs = {"proxy_type": pt, "country_code": country_code}
1636
- if state_code:
1637
- kwargs["state_code"] = state_code
1638
- return self._get_locations("cities", **kwargs)
2354
+ header_lines = header_data.decode("utf-8", errors="replace").split("\r\n")
1639
2355
 
1640
- def list_asn(
1641
- self,
1642
- country_code: str,
1643
- proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
1644
- ) -> list[dict[str, Any]]:
1645
- pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
1646
- return self._get_locations("asn", proxy_type=pt, country_code=country_code)
2356
+ status_line = header_lines[0] if header_lines else ""
2357
+ parts = status_line.split(" ", 2)
2358
+ status_code = int(parts[1]) if len(parts) > 1 else 0
1647
2359
 
1648
- def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
1649
- self._require_public_credentials()
1650
- params = {"token": self.public_token, "key": self.public_key}
1651
- for k, v in kwargs.items():
1652
- params[k] = str(v)
2360
+ headers_dict = {}
2361
+ for line in header_lines[1:]:
2362
+ if ": " in line:
2363
+ k, v = line.split(": ", 1)
2364
+ headers_dict[k] = v
1653
2365
 
1654
- response = self._api_request_with_retry(
1655
- "GET", f"{self._locations_base_url}/{endpoint}", params=params
1656
- )
1657
- response.raise_for_status()
1658
- data = response.json()
1659
- if isinstance(data, dict):
1660
- if data.get("code") != 200:
1661
- raise RuntimeError(f"Locations error: {data.get('msg')}")
1662
- return data.get("data") or []
1663
- return data if isinstance(data, list) else []
2366
+ if headers_dict.get("Transfer-Encoding", "").lower() == "chunked":
2367
+ body = self._decode_chunked(body)
1664
2368
 
1665
- def _require_public_credentials(self) -> None:
1666
- if not self.public_token or not self.public_key:
1667
- raise ThordataConfigError(
1668
- "public_token and public_key are required for this operation."
1669
- )
2369
+ r = requests.Response()
2370
+ r.status_code = status_code
2371
+ r._content = body
2372
+ r.url = url
2373
+ r.headers = CaseInsensitiveDict(headers_dict)
2374
+ return r
2375
+
2376
+ def _decode_chunked(self, data: bytes) -> bytes:
2377
+ """Decode chunked transfer encoding."""
2378
+ result = b""
2379
+ while data:
2380
+ if b"\r\n" not in data:
2381
+ break
2382
+ size_line, data = data.split(b"\r\n", 1)
2383
+ try:
2384
+ chunk_size = int(size_line.decode().strip(), 16)
2385
+ except ValueError:
2386
+ break
2387
+
2388
+ if chunk_size == 0:
2389
+ break
2390
+
2391
+ result += data[:chunk_size]
2392
+ data = data[chunk_size:]
2393
+
2394
+ if data.startswith(b"\r\n"):
2395
+ data = data[2:]
2396
+
2397
+ return result
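A condensed equivalent of the decoder above, handy as a sanity check (chunk extensions and trailers are ignored, as in the method itself):

    def decode_chunked_sketch(data: bytes) -> bytes:
        result = b""
        while b"\r\n" in data:
            size_line, data = data.split(b"\r\n", 1)
            size = int(size_line, 16)
            if size == 0:
                break
            result += data[:size]
            data = data[size:].removeprefix(b"\r\n")
        return result

    assert decode_chunked_sketch(b"4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n") == b"Wikipedia"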
1670
2398
 
1671
2399
  def _get_proxy_endpoint_overrides(
1672
2400
  self, product: ProxyProduct
1673
2401
  ) -> tuple[str | None, int | None, str]:
2402
+ """Get proxy endpoint overrides from environment variables."""
1674
2403
  prefix = product.value.upper()
1675
2404
  host = os.getenv(f"THORDATA_{prefix}_PROXY_HOST") or os.getenv(
1676
2405
  "THORDATA_PROXY_HOST"
@@ -1687,6 +2416,7 @@ class ThordataClient:
1687
2416
  return host or None, port, protocol
1688
2417
 
1689
2418
  def _get_default_proxy_config_from_env(self) -> ProxyConfig | None:
2419
+ """Get proxy configuration from environment variables."""
1690
2420
  for prod in [
1691
2421
  ProxyProduct.RESIDENTIAL,
1692
2422
  ProxyProduct.DATACENTER,
@@ -1707,15 +2437,43 @@ class ThordataClient:
1707
2437
  )
1708
2438
  return None
1709
2439
 
1710
- def close(self) -> None:
1711
- self._proxy_session.close()
1712
- self._api_session.close()
1713
- for pm in self._proxy_managers.values():
1714
- pm.clear()
1715
- self._proxy_managers.clear()
2440
+ def get_browser_connection_url(
2441
+ self, username: str | None = None, password: str | None = None
2442
+ ) -> str:
2443
+ """
2444
+ Generate the WebSocket URL for connecting to Scraping Browser.
1716
2445
 
1717
- def __enter__(self) -> ThordataClient:
1718
- return self
2446
+ Args:
2447
+ username: Proxy username (without 'td-customer-' prefix).
2448
+ Defaults to THORDATA_BROWSER_USERNAME or THORDATA_RESIDENTIAL_USERNAME.
2449
+ password: Proxy password.
1719
2450
 
1720
- def __exit__(self, exc_type, exc_val, exc_tb) -> None:
1721
- self.close()
2451
+ Returns:
2452
+ WSS URL string suitable for playwright.connect_over_cdp().
2453
+
2454
+ Raises:
2455
+ ThordataConfigError: If credentials are missing.
2456
+ """
2457
+ user = (
2458
+ username
2459
+ or os.getenv("THORDATA_BROWSER_USERNAME")
2460
+ or os.getenv("THORDATA_RESIDENTIAL_USERNAME")
2461
+ )
2462
+ pwd = (
2463
+ password
2464
+ or os.getenv("THORDATA_BROWSER_PASSWORD")
2465
+ or os.getenv("THORDATA_RESIDENTIAL_PASSWORD")
2466
+ )
2467
+
2468
+ if not user or not pwd:
2469
+ raise ThordataConfigError(
2470
+ "Browser credentials missing. Set THORDATA_BROWSER_USERNAME/PASSWORD or pass arguments."
2471
+ )
2472
+ prefix = "td-customer-"
2473
+ final_user = f"{prefix}{user}" if not user.startswith(prefix) else user
2474
+
2475
+ # URL encode
2476
+ safe_user = quote(final_user, safe="")
2477
+ safe_pass = quote(pwd, safe="")
2478
+
2479
+ return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"