thordata-sdk 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +1 -1
- thordata/async_client.py +862 -233
- thordata/async_unlimited.py +130 -0
- thordata/client.py +1808 -1050
- thordata/demo.py +2 -2
- thordata/unlimited.py +102 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.4.0.dist-info}/METADATA +2 -2
- thordata_sdk-1.4.0.dist-info/RECORD +18 -0
- thordata_sdk-1.3.0.dist-info/RECORD +0 -16
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.4.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.4.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.4.0.dist-info}/top_level.txt +0 -0
thordata/client.py
CHANGED
@@ -26,19 +26,21 @@ from __future__ import annotations
 import base64
 import contextlib
 import hashlib
+import json
 import logging
 import os
 import socket
 import ssl
 from datetime import date
 from typing import Any, cast
-from urllib.parse import urlencode, urlparse
+from urllib.parse import quote, urlencode, urlparse

 import requests
 import urllib3
 from requests.structures import CaseInsensitiveDict

 from .serp_engines import SerpNamespace
+from .unlimited import UnlimitedNamespace

 try:
     import socks
@@ -274,6 +276,8 @@ class _TLSInTLSSocket:


 class ThordataClient:
+    """Main client for interacting with Thordata API services."""
+
     # API Endpoints
     BASE_URL = "https://scraperapi.thordata.com"
     UNIVERSAL_URL = "https://universalapi.thordata.com"
@@ -282,7 +286,7 @@ class ThordataClient:

     def __init__(
         self,
-        scraper_token: str | None = None,
+        scraper_token: str | None = None,
         public_token: str | None = None,
         public_key: str | None = None,
         proxy_host: str = "pr.thordata.net",
@@ -296,9 +300,23 @@ class ThordataClient:
         web_scraper_api_base_url: str | None = None,
         locations_base_url: str | None = None,
     ) -> None:
-        """Initialize the Thordata Client.
+        """Initialize the Thordata Client.

+        Args:
+            scraper_token: Token for SERP/Universal scraping APIs.
+            public_token: Public API token for account/management operations.
+            public_key: Public API key for account/management operations.
+            proxy_host: Default proxy host for residential proxies.
+            proxy_port: Default proxy port for residential proxies.
+            timeout: Default timeout for proxy requests.
+            api_timeout: Default timeout for API requests.
+            retry_config: Configuration for retry behavior.
+            auth_mode: Authentication mode for scraper_token ("bearer" or "header_token").
+            scraperapi_base_url: Override base URL for SERP API.
+            universalapi_base_url: Override base URL for Universal Scraping API.
+            web_scraper_api_base_url: Override base URL for Web Scraper API.
+            locations_base_url: Override base URL for Locations API.
+        """

         self.scraper_token = scraper_token
         self.public_token = public_token
@@ -388,6 +406,28 @@ class ThordataClient:
         self._proxy_list_url = f"{proxy_api_base}/proxy/proxy-list"
         self._proxy_expiration_url = f"{proxy_api_base}/proxy/expiration-time"

+        # Initialize Namespaces AFTER all base URLs are set
+        self.serp = SerpNamespace(self)
+        self.unlimited = UnlimitedNamespace(self)
+
+    # =========================================================================
+    # Context Manager
+    # =========================================================================
+
+    def close(self) -> None:
+        """Close the client and release resources."""
+        self._proxy_session.close()
+        self._api_session.close()
+        for pm in self._proxy_managers.values():
+            pm.clear()
+        self._proxy_managers.clear()
+
+    def __enter__(self) -> ThordataClient:
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()
+
     # =========================================================================
     # Proxy Network Methods
     # =========================================================================
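Illustrative usage (not from the package diff): the block above makes ThordataClient usable as a context manager, with the serp and unlimited namespaces attached in __init__. A minimal sketch, assuming the client class is exported from the thordata package and the token value is a placeholder:

from thordata import ThordataClient

with ThordataClient(scraper_token="YOUR_SCRAPER_TOKEN") as client:
    print(client.serp)       # SerpNamespace attached after base URLs are set
    print(client.unlimited)  # UnlimitedNamespace is new in 1.4.0
# close() runs on exit: both sessions are closed and pooled proxy managers are cleared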
@@ -400,6 +440,17 @@ class ThordataClient:
         timeout: int | None = None,
         **kwargs: Any,
     ) -> requests.Response:
+        """Make a GET request through the proxy network.
+
+        Args:
+            url: Target URL to request.
+            proxy_config: Proxy configuration. If not provided, uses environment variables.
+            timeout: Request timeout in seconds.
+            **kwargs: Additional arguments passed to requests.
+
+        Returns:
+            Response object.
+        """
         logger.debug(f"Proxy GET request: {url}")
         return self._proxy_verb("GET", url, proxy_config, timeout, **kwargs)

@@ -411,50 +462,19 @@ class ThordataClient:
         timeout: int | None = None,
         **kwargs: Any,
     ) -> requests.Response:
-        return self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)
-
-    def _proxy_verb(
-        self,
-        method: str,
-        url: str,
-        proxy_config: ProxyConfig | None,
-        timeout: int | None,
-        **kwargs: Any,
-    ) -> requests.Response:
-        timeout = timeout or self._default_timeout
-
-        if proxy_config is None:
-            proxy_config = self._get_default_proxy_config_from_env()
-
-        if proxy_config is None:
-            raise ThordataConfigError(
-                "Proxy credentials are missing. "
-                "Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
-            )
+        """Make a POST request through the proxy network.

-                method,
-                url,
-                proxy_config=proxy_config,  # type: ignore
-                timeout=timeout,  # type: ignore
-                headers=kwargs.pop("headers", None),
-                params=kwargs.pop("params", None),
-                data=kwargs.pop("data", None),
-            )
+        Args:
+            url: Target URL to request.
+            proxy_config: Proxy configuration. If not provided, uses environment variables.
+            timeout: Request timeout in seconds.
+            **kwargs: Additional arguments passed to requests.

-            ) from e
-        except Exception as e:
-            raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
+        Returns:
+            Response object.
+        """
+        logger.debug(f"Proxy POST request: {url}")
+        return self._proxy_verb("POST", url, proxy_config, timeout, **kwargs)

     def build_proxy_url(
         self,
@@ -468,6 +488,21 @@ class ThordataClient:
         session_duration: int | None = None,
         product: ProxyProduct | str = ProxyProduct.RESIDENTIAL,
     ) -> str:
+        """Build a proxy URL with location and session parameters.
+
+        Args:
+            username: Proxy username.
+            password: Proxy password.
+            country: Country code (e.g., "us", "uk").
+            state: State/region code (e.g., "ca", "ny").
+            city: City name (e.g., "new-york", "london").
+            session_id: Session identifier for sticky sessions.
+            session_duration: Session duration in minutes (1-90).
+            product: Proxy product type (RESIDENTIAL, DATACENTER, MOBILE).
+
+        Returns:
+            Formatted proxy URL.
+        """
         config = ProxyConfig(
             username=username,
             password=password,
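Illustrative usage (not from the package diff): a minimal sketch of the documented build_proxy_url parameters; the credentials are placeholders and the exact URL format is whatever ProxyConfig.build_proxy_url() produces:

proxy_url = client.build_proxy_url(
    username="PROXY_USER",
    password="PROXY_PASS",
    country="us",
    session_id="abc123",
    session_duration=10,
)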
@@ -483,1013 +518,1023 @@ class ThordataClient:
         return config.build_proxy_url()

     # =========================================================================
-    #
+    # SERP API Methods
     # =========================================================================

-    def
+    def serp_search(
         self,
-        url: str,
+        query: str,
         *,
-                timeout=self._api_timeout,
-            )
-
-        try:
-            return _do_request()
-        except requests.Timeout as e:
-            raise ThordataTimeoutError(
-                f"API request timed out: {e}", original_error=e
-            ) from e
-        except requests.RequestException as e:
-            raise ThordataNetworkError(
-                f"API request failed: {e}", original_error=e
-            ) from e
+        engine: Engine | str = Engine.GOOGLE,
+        num: int = 10,
+        country: str | None = None,
+        language: str | None = None,
+        search_type: str | None = None,
+        device: str | None = None,
+        render_js: bool | None = None,
+        no_cache: bool | None = None,
+        output_format: str = "json",
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """Perform a search engine query using SERP API.

-        return
+        Args:
+            query: Search query string.
+            engine: Search engine (GOOGLE, BING, YAHOO, etc.).
+            num: Number of results to return.
+            country: Country code for localized results.
+            language: Language code for interface.
+            search_type: Type of search (images, news, video, etc.).
+            device: Device type (desktop, mobile).
+            render_js: Whether to render JavaScript.
+            no_cache: Bypass cache.
+            output_format: Output format ("json" or "html").
+            **kwargs: Additional engine-specific parameters.

-        cache_key: str,
-        proxy_headers: dict[str, str] | None = None,
-    ) -> urllib3.PoolManager:
-        """Get or create a ProxyManager for the given proxy URL (Pooled)."""
-        cached = self._proxy_managers.get(cache_key)
-        if cached is not None:
-            return cached
+        Returns:
+            Search results as dictionary.
+        """
+        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()

+        request = SerpRequest(
+            query=query,
+            engine=engine_str,
+            num=num,
+            country=country,
+            language=language,
+            search_type=search_type,
+            device=device,
+            render_js=render_js,
+            no_cache=no_cache,
+            output_format=output_format,
+            extra_params=kwargs,
+        )

-            proxy_url,
-            num_pools=10,
-            maxsize=10,
-        )
-        pm = cast(urllib3.PoolManager, pm_socks)
-        self._proxy_managers[cache_key] = pm
-        return pm
+        return self.serp_search_advanced(request)

-        if proxy_url.startswith("https://"):
-            proxy_ssl_context = ssl.create_default_context()
+    def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
+        """Perform advanced search with a SerpRequest object.

-                proxy_headers=proxy_headers,
-                proxy_ssl_context=proxy_ssl_context,
-                num_pools=10,
-                maxsize=10,
-            )
+        Args:
+            request: SerpRequest object with search parameters.

+        Returns:
+            Search results as dictionary.
+        """
+        if not self.scraper_token:
+            raise ThordataConfigError("scraper_token is required for SERP API")

-        self,
-        method: str,
-        url: str,
-        *,
-        proxy_config: ProxyConfig,
-        timeout: int,
-        headers: dict[str, str] | None = None,
-        params: dict[str, Any] | None = None,
-        data: Any = None,
-    ) -> requests.Response:
-        """Execute request through proxy, with optional upstream proxy support."""
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)

-        upstream_config = _parse_upstream_proxy()
+        logger.info(f"SERP Advanced Search: {request.engine} - {request.query[:50]}")

-                timeout=timeout,
+        try:
+            response = self._api_request_with_retry(
+                "POST",
+                self._serp_url,
+                data=payload,
                 headers=headers,
-                params=params,
-                data=data,
-                upstream_config=upstream_config,
             )
+            response.raise_for_status()

-        is_socks = proxy_endpoint.startswith(
-            ("socks5://", "socks5h://", "socks4://", "socks4a://")
-        )
+            if request.output_format.lower() == "json":
+                data = response.json()
+                if isinstance(data, dict):
+                    code = data.get("code")
+                    if code is not None and code != 200:
+                        msg = extract_error_message(data)
+                        raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
+                return parse_json_response(data)

+            return {"html": response.text}

+        except requests.Timeout as e:
+            raise ThordataTimeoutError(f"SERP timeout: {e}", original_error=e) from e
+        except requests.RequestException as e:
+            raise ThordataNetworkError(f"SERP failed: {e}", original_error=e) from e

|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
proxy_headers=None,
|
|
621
|
-
)
|
|
622
|
-
else:
|
|
623
|
-
userpass = proxy_config.build_proxy_basic_auth()
|
|
624
|
-
proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
|
|
625
|
-
cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
|
|
617
|
+
# =========================================================================
|
|
618
|
+
# Universal Scraping API (WEB UNLOCKER) Methods
|
|
619
|
+
# =========================================================================
|
|
626
620
|
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
621
|
+
def universal_scrape(
|
|
622
|
+
self,
|
|
623
|
+
url: str,
|
|
624
|
+
*,
|
|
625
|
+
js_render: bool = False,
|
|
626
|
+
output_format: str = "html",
|
|
627
|
+
country: str | None = None,
|
|
628
|
+
block_resources: str | None = None,
|
|
629
|
+
wait: int | None = None,
|
|
630
|
+
wait_for: str | None = None,
|
|
631
|
+
**kwargs: Any,
|
|
632
|
+
) -> str | bytes:
|
|
633
|
+
"""Scrape a URL using Universal Scraping API.
|
|
632
634
|
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
body = data
|
|
635
|
+
Args:
|
|
636
|
+
url: Target URL to scrape.
|
|
637
|
+
js_render: Whether to render JavaScript.
|
|
638
|
+
output_format: Output format ("html" or "png").
|
|
639
|
+
country: Country for IP geolocation.
|
|
640
|
+
block_resources: Block specific resources (e.g., "script,css").
|
|
641
|
+
wait: Wait time in milliseconds before fetching.
|
|
642
|
+
wait_for: CSS selector to wait for before fetching.
|
|
643
|
+
**kwargs: Additional parameters.
|
|
643
644
|
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
645
|
+
Returns:
|
|
646
|
+
Scraped content as string (HTML) or bytes (PNG).
|
|
647
|
+
"""
|
|
648
|
+
request = UniversalScrapeRequest(
|
|
649
|
+
url=url,
|
|
650
|
+
js_render=js_render,
|
|
651
|
+
output_format=output_format,
|
|
652
|
+
country=country,
|
|
653
|
+
block_resources=block_resources,
|
|
654
|
+
wait=wait,
|
|
655
|
+
wait_for=wait_for,
|
|
656
|
+
extra_params=kwargs,
|
|
652
657
|
)
|
|
658
|
+
return self.universal_scrape_advanced(request)
|
|
653
659
|
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
+
def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
|
|
661
|
+
"""Scrape with advanced options using UniversalScrapeRequest.
|
|
662
|
+
|
|
663
|
+
Args:
|
|
664
|
+
request: UniversalScrapeRequest object with scrape parameters.
|
|
665
|
+
|
|
666
|
+
Returns:
|
|
667
|
+
Scraped content as string (HTML) or bytes (PNG).
|
|
668
|
+
"""
|
|
669
|
+
if not self.scraper_token:
|
|
670
|
+
raise ThordataConfigError("scraper_token is required for Universal API")
|
|
671
|
+
|
|
672
|
+
payload = request.to_payload()
|
|
673
|
+
headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
|
|
674
|
+
|
|
675
|
+
logger.info(f"Universal Scrape: {request.url}")
|
|
676
|
+
|
|
677
|
+
try:
|
|
678
|
+
response = self._api_request_with_retry(
|
|
679
|
+
"POST",
|
|
680
|
+
self._universal_url,
|
|
681
|
+
data=payload,
|
|
682
|
+
headers=headers,
|
|
683
|
+
)
|
|
684
|
+
response.raise_for_status()
|
|
685
|
+
return self._process_universal_response(response, request.output_format)
|
|
686
|
+
|
|
687
|
+
except requests.Timeout as e:
|
|
688
|
+
raise ThordataTimeoutError(
|
|
689
|
+
f"Universal timeout: {e}", original_error=e
|
|
690
|
+
) from e
|
|
691
|
+
except requests.RequestException as e:
|
|
692
|
+
raise ThordataNetworkError(
|
|
693
|
+
f"Universal failed: {e}", original_error=e
|
|
694
|
+
) from e
|
|
660
695
|
|
|
661
696
|
# =========================================================================
|
|
662
|
-
#
|
|
697
|
+
# Web Scraper API - Task Management
|
|
663
698
|
# =========================================================================
|
|
664
699
|
|
|
665
|
-
def
|
|
700
|
+
def create_scraper_task(
|
|
666
701
|
self,
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
data: Any = None,
|
|
675
|
-
upstream_config: dict[str, Any],
|
|
676
|
-
) -> requests.Response:
|
|
677
|
-
"""Execute request through proxy chain: Upstream -> Thordata -> Target."""
|
|
678
|
-
if not HAS_PYSOCKS:
|
|
679
|
-
raise ThordataConfigError(
|
|
680
|
-
"PySocks is required for upstream proxy support. "
|
|
681
|
-
"Install with: pip install PySocks"
|
|
682
|
-
)
|
|
702
|
+
file_name: str,
|
|
703
|
+
spider_id: str,
|
|
704
|
+
spider_name: str,
|
|
705
|
+
parameters: dict[str, Any],
|
|
706
|
+
universal_params: dict[str, Any] | None = None,
|
|
707
|
+
) -> str:
|
|
708
|
+
"""Create a web scraping task.
|
|
683
709
|
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
710
|
+
Args:
|
|
711
|
+
file_name: Name for the output file (supports {{TasksID}} template).
|
|
712
|
+
spider_id: Spider identifier from Dashboard.
|
|
713
|
+
spider_name: Spider name (target domain, e.g., "amazon.com").
|
|
714
|
+
parameters: Spider-specific parameters.
|
|
715
|
+
universal_params: Global spider settings.
|
|
687
716
|
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
717
|
+
Returns:
|
|
718
|
+
Task ID.
|
|
719
|
+
"""
|
|
720
|
+
config = ScraperTaskConfig(
|
|
721
|
+
file_name=file_name,
|
|
722
|
+
spider_id=spider_id,
|
|
723
|
+
spider_name=spider_name,
|
|
724
|
+
parameters=parameters,
|
|
725
|
+
universal_params=universal_params,
|
|
692
726
|
)
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
protocol = proxy_config.protocol.lower()
|
|
696
|
-
if protocol == "socks5":
|
|
697
|
-
protocol = "socks5h"
|
|
698
|
-
|
|
699
|
-
thordata_host = proxy_config.host or ""
|
|
700
|
-
thordata_port = proxy_config.port or 9999
|
|
701
|
-
thordata_username = proxy_config.build_username()
|
|
702
|
-
thordata_password = proxy_config.password
|
|
727
|
+
return self.create_scraper_task_advanced(config)
|
|
703
728
|
|
|
704
|
-
|
|
729
|
+
def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
|
|
730
|
+
"""Create a web scraping task with advanced configuration.
|
|
705
731
|
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
f"-> thordata({protocol}://{thordata_host}:{thordata_port}) "
|
|
709
|
-
f"-> target({target_host}:{target_port})"
|
|
710
|
-
)
|
|
732
|
+
Args:
|
|
733
|
+
config: ScraperTaskConfig object with task configuration.
|
|
711
734
|
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
735
|
+
Returns:
|
|
736
|
+
Task ID.
|
|
737
|
+
"""
|
|
738
|
+
self._require_public_credentials()
|
|
739
|
+
if not self.scraper_token:
|
|
740
|
+
raise ThordataConfigError("scraper_token is required for Task Builder")
|
|
741
|
+
payload = config.to_payload()
|
|
742
|
+
headers = build_builder_headers(
|
|
743
|
+
self.scraper_token, self.public_token or "", self.public_key or ""
|
|
715
744
|
)
|
|
716
745
|
|
|
717
746
|
try:
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
747
|
+
response = self._api_request_with_retry(
|
|
748
|
+
"POST", self._builder_url, data=payload, headers=headers
|
|
749
|
+
)
|
|
750
|
+
response.raise_for_status()
|
|
751
|
+
data = response.json()
|
|
752
|
+
if data.get("code") != 200:
|
|
753
|
+
raise_for_code(
|
|
754
|
+
"Task creation failed", code=data.get("code"), payload=data
|
|
725
755
|
)
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
756
|
+
return data["data"]["task_id"]
|
|
757
|
+
except requests.RequestException as e:
|
|
758
|
+
raise ThordataNetworkError(
|
|
759
|
+
f"Task creation failed: {e}", original_error=e
|
|
760
|
+
) from e
|
|
729
761
|
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
762
|
+
def create_video_task(
|
|
763
|
+
self,
|
|
764
|
+
file_name: str,
|
|
765
|
+
spider_id: str,
|
|
766
|
+
spider_name: str,
|
|
767
|
+
parameters: dict[str, Any],
|
|
768
|
+
common_settings: CommonSettings,
|
|
769
|
+
) -> str:
|
|
770
|
+
"""Create a video/audio download task (YouTube, etc.).
|
|
735
771
|
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
)
|
|
772
|
+
Args:
|
|
773
|
+
file_name: Name for the output file.
|
|
774
|
+
spider_id: Spider identifier (e.g., "youtube_video_by-url").
|
|
775
|
+
spider_name: Target site (e.g., "youtube.com").
|
|
776
|
+
parameters: Spider-specific parameters (URLs, etc.).
|
|
777
|
+
common_settings: Video/audio settings (resolution, subtitles, etc.).
|
|
743
778
|
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
779
|
+
Returns:
|
|
780
|
+
Task ID.
|
|
781
|
+
"""
|
|
782
|
+
config = VideoTaskConfig(
|
|
783
|
+
file_name=file_name,
|
|
784
|
+
spider_id=spider_id,
|
|
785
|
+
spider_name=spider_name,
|
|
786
|
+
parameters=parameters,
|
|
787
|
+
common_settings=common_settings,
|
|
788
|
+
)
|
|
789
|
+
return self.create_video_task_advanced(config)
|
|
750
790
|
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
raw_sock,
|
|
754
|
-
target_host,
|
|
755
|
-
target_port,
|
|
756
|
-
thordata_username,
|
|
757
|
-
thordata_password,
|
|
758
|
-
)
|
|
791
|
+
def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
|
|
792
|
+
"""Create a video task with advanced configuration.
|
|
759
793
|
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
sock = context.wrap_socket(raw_sock, server_hostname=target_host)
|
|
763
|
-
else:
|
|
764
|
-
sock = raw_sock
|
|
794
|
+
Args:
|
|
795
|
+
config: VideoTaskConfig object with task configuration.
|
|
765
796
|
|
|
766
|
-
|
|
767
|
-
|
|
797
|
+
Returns:
|
|
798
|
+
Task ID.
|
|
799
|
+
"""
|
|
800
|
+
self._require_public_credentials()
|
|
801
|
+
if not self.scraper_token:
|
|
802
|
+
raise ThordataConfigError(
|
|
803
|
+
"scraper_token is required for Video Task Builder"
|
|
768
804
|
)
|
|
769
805
|
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
806
|
+
payload = config.to_payload()
|
|
807
|
+
headers = build_builder_headers(
|
|
808
|
+
self.scraper_token, self.public_token or "", self.public_key or ""
|
|
809
|
+
)
|
|
773
810
|
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
connect_req += f"Host: {target_host}:{target_port}\r\n"
|
|
811
|
+
response = self._api_request_with_retry(
|
|
812
|
+
"POST", self._video_builder_url, data=payload, headers=headers
|
|
813
|
+
)
|
|
814
|
+
response.raise_for_status()
|
|
815
|
+
data = response.json()
|
|
816
|
+
if data.get("code") != 200:
|
|
817
|
+
raise_for_code(
|
|
818
|
+
"Video task creation failed", code=data.get("code"), payload=data
|
|
819
|
+
)
|
|
820
|
+
return data["data"]["task_id"]
|
|
785
821
|
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
|
|
789
|
-
connect_req += "\r\n"
|
|
822
|
+
def get_task_status(self, task_id: str) -> str:
|
|
823
|
+
"""Get the status of a scraping task.
|
|
790
824
|
|
|
791
|
-
|
|
825
|
+
Args:
|
|
826
|
+
task_id: Task identifier.
|
|
792
827
|
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
828
|
+
Returns:
|
|
829
|
+
Status string (running, success, failed, etc.).
|
|
830
|
+
"""
|
|
831
|
+
self._require_public_credentials()
|
|
832
|
+
headers = build_public_api_headers(
|
|
833
|
+
self.public_token or "", self.public_key or ""
|
|
834
|
+
)
|
|
835
|
+
try:
|
|
836
|
+
response = self._api_request_with_retry(
|
|
837
|
+
"POST",
|
|
838
|
+
self._status_url,
|
|
839
|
+
data={"tasks_ids": task_id},
|
|
840
|
+
headers=headers,
|
|
841
|
+
)
|
|
842
|
+
response.raise_for_status()
|
|
843
|
+
data = response.json()
|
|
844
|
+
if data.get("code") != 200:
|
|
845
|
+
raise_for_code("Task status error", code=data.get("code"), payload=data)
|
|
799
846
|
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
847
|
+
items = data.get("data") or []
|
|
848
|
+
for item in items:
|
|
849
|
+
if str(item.get("task_id")) == str(task_id):
|
|
850
|
+
return item.get("status", "unknown")
|
|
851
|
+
return "unknown"
|
|
852
|
+
except requests.RequestException as e:
|
|
853
|
+
raise ThordataNetworkError(
|
|
854
|
+
f"Status check failed: {e}", original_error=e
|
|
855
|
+
) from e
|
|
803
856
|
|
|
804
|
-
def
|
|
805
|
-
|
|
806
|
-
outer_ssl_sock: ssl.SSLSocket,
|
|
807
|
-
hostname: str,
|
|
808
|
-
timeout: int,
|
|
809
|
-
) -> _TLSInTLSSocket:
|
|
810
|
-
"""Create a TLS connection over an existing TLS connection."""
|
|
811
|
-
context = ssl.create_default_context()
|
|
857
|
+
def safe_get_task_status(self, task_id: str) -> str:
|
|
858
|
+
"""Get task status with error handling.
|
|
812
859
|
|
|
813
|
-
|
|
814
|
-
|
|
860
|
+
Args:
|
|
861
|
+
task_id: Task identifier.
|
|
815
862
|
|
|
816
|
-
|
|
863
|
+
Returns:
|
|
864
|
+
Status string or "error" on failure.
|
|
865
|
+
"""
|
|
866
|
+
try:
|
|
867
|
+
return self.get_task_status(task_id)
|
|
868
|
+
except Exception:
|
|
869
|
+
return "error"
|
|
817
870
|
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
ssl_obj.do_handshake()
|
|
821
|
-
break
|
|
822
|
-
except ssl.SSLWantReadError:
|
|
823
|
-
data_to_send = outgoing.read()
|
|
824
|
-
if data_to_send:
|
|
825
|
-
outer_ssl_sock.sendall(data_to_send)
|
|
871
|
+
def get_task_result(self, task_id: str, file_type: str = "json") -> str:
|
|
872
|
+
"""Get the download URL for a completed task.
|
|
826
873
|
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
if not received:
|
|
831
|
-
raise ConnectionError("Connection closed during TLS handshake")
|
|
832
|
-
incoming.write(received)
|
|
833
|
-
except socket.timeout as e:
|
|
834
|
-
raise ConnectionError("Timeout during TLS handshake") from e
|
|
835
|
-
except ssl.SSLWantWriteError:
|
|
836
|
-
data_to_send = outgoing.read()
|
|
837
|
-
if data_to_send:
|
|
838
|
-
outer_ssl_sock.sendall(data_to_send)
|
|
874
|
+
Args:
|
|
875
|
+
task_id: Task identifier.
|
|
876
|
+
file_type: File type to download (json, csv, video, audio, subtitle).
|
|
839
877
|
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
878
|
+
Returns:
|
|
879
|
+
Download URL.
|
|
880
|
+
"""
|
|
881
|
+
self._require_public_credentials()
|
|
882
|
+
headers = build_public_api_headers(
|
|
883
|
+
self.public_token or "", self.public_key or ""
|
|
884
|
+
)
|
|
885
|
+
try:
|
|
886
|
+
response = self._api_request_with_retry(
|
|
887
|
+
"POST",
|
|
888
|
+
self._download_url,
|
|
889
|
+
data={"tasks_id": task_id, "type": file_type},
|
|
890
|
+
headers=headers,
|
|
891
|
+
)
|
|
892
|
+
response.raise_for_status()
|
|
893
|
+
data = response.json()
|
|
894
|
+
if data.get("code") == 200 and data.get("data"):
|
|
895
|
+
return data["data"]["download"]
|
|
896
|
+
raise_for_code("Get result failed", code=data.get("code"), payload=data)
|
|
897
|
+
return ""
|
|
898
|
+
except requests.RequestException as e:
|
|
899
|
+
raise ThordataNetworkError(
|
|
900
|
+
f"Get result failed: {e}", original_error=e
|
|
901
|
+
) from e
|
|
843
902
|
|
|
844
|
-
|
|
903
|
+
def list_tasks(self, page: int = 1, size: int = 20) -> dict[str, Any]:
|
|
904
|
+
"""List all scraping tasks.
|
|
845
905
|
|
|
846
|
-
|
|
906
|
+
Args:
|
|
907
|
+
page: Page number for pagination.
|
|
908
|
+
size: Number of items per page.
|
|
909
|
+
|
|
910
|
+
Returns:
|
|
911
|
+
Dictionary with count and list of tasks.
|
|
912
|
+
"""
|
|
913
|
+
self._require_public_credentials()
|
|
914
|
+
headers = build_public_api_headers(
|
|
915
|
+
self.public_token or "", self.public_key or ""
|
|
916
|
+
)
|
|
917
|
+
response = self._api_request_with_retry(
|
|
918
|
+
"POST",
|
|
919
|
+
self._list_url,
|
|
920
|
+
data={"page": str(page), "size": str(size)},
|
|
921
|
+
headers=headers,
|
|
922
|
+
)
|
|
923
|
+
response.raise_for_status()
|
|
924
|
+
data = response.json()
|
|
925
|
+
if data.get("code") != 200:
|
|
926
|
+
raise_for_code("List tasks failed", code=data.get("code"), payload=data)
|
|
927
|
+
return data.get("data", {"count": 0, "list": []})
|
|
928
|
+
|
|
929
|
+
def wait_for_task(
|
|
847
930
|
self,
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
timeout: int,
|
|
855
|
-
) -> requests.Response:
|
|
856
|
-
"""Send HTTP request over established connection and parse response."""
|
|
857
|
-
target_host = parsed_url.hostname
|
|
931
|
+
task_id: str,
|
|
932
|
+
*,
|
|
933
|
+
poll_interval: float = 5.0,
|
|
934
|
+
max_wait: float = 600.0,
|
|
935
|
+
) -> str:
|
|
936
|
+
"""Wait for a task to complete.
|
|
858
937
|
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
938
|
+
Args:
|
|
939
|
+
task_id: Task identifier.
|
|
940
|
+
poll_interval: Polling interval in seconds.
|
|
941
|
+
max_wait: Maximum time to wait in seconds.
|
|
863
942
|
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
943
|
+
Returns:
|
|
944
|
+
Final status of the task.
|
|
945
|
+
"""
|
|
946
|
+
import time
|
|
867
947
|
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
948
|
+
start = time.monotonic()
|
|
949
|
+
while (time.monotonic() - start) < max_wait:
|
|
950
|
+
status = self.get_task_status(task_id)
|
|
951
|
+
if status.lower() in {
|
|
952
|
+
"ready",
|
|
953
|
+
"success",
|
|
954
|
+
"finished",
|
|
955
|
+
"failed",
|
|
956
|
+
"error",
|
|
957
|
+
"cancelled",
|
|
958
|
+
}:
|
|
959
|
+
return status
|
|
960
|
+
time.sleep(poll_interval)
|
|
961
|
+
raise TimeoutError(f"Task {task_id} timeout")
|
|
871
962
|
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
963
|
+
def run_task(
|
|
964
|
+
self,
|
|
965
|
+
file_name: str,
|
|
966
|
+
spider_id: str,
|
|
967
|
+
spider_name: str,
|
|
968
|
+
parameters: dict[str, Any],
|
|
969
|
+
universal_params: dict[str, Any] | None = None,
|
|
970
|
+
*,
|
|
971
|
+
max_wait: float = 600.0,
|
|
972
|
+
initial_poll_interval: float = 2.0,
|
|
973
|
+
max_poll_interval: float = 10.0,
|
|
974
|
+
include_errors: bool = True,
|
|
975
|
+
# New parameters
|
|
976
|
+
task_type: str = "web", # "web" or "video"
|
|
977
|
+
common_settings: CommonSettings | None = None,
|
|
978
|
+
) -> str:
|
|
979
|
+
"""High-level wrapper to run a task and wait for result.
|
|
884
980
|
|
|
885
|
-
|
|
886
|
-
|
|
981
|
+
This method handles the entire lifecycle:
|
|
982
|
+
1. Create Task
|
|
983
|
+
2. Poll status (with exponential backoff)
|
|
984
|
+
3. Get download URL when ready
|
|
887
985
|
|
|
888
|
-
|
|
889
|
-
|
|
986
|
+
Args:
|
|
987
|
+
file_name: Name for the output file.
|
|
988
|
+
spider_id: Spider identifier from Dashboard.
|
|
989
|
+
spider_name: Spider name (target domain).
|
|
990
|
+
parameters: Spider-specific parameters.
|
|
991
|
+
universal_params: Global spider settings.
|
|
992
|
+
max_wait: Maximum seconds to wait for completion.
|
|
993
|
+
initial_poll_interval: Starting poll interval in seconds.
|
|
994
|
+
max_poll_interval: Maximum poll interval cap.
|
|
995
|
+
include_errors: Whether to include error logs.
|
|
890
996
|
|
|
891
|
-
|
|
892
|
-
|
|
997
|
+
Returns:
|
|
998
|
+
The download URL for the task result.
|
|
893
999
|
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
break
|
|
900
|
-
response_data += chunk
|
|
901
|
-
if b"\r\n\r\n" in response_data:
|
|
902
|
-
header_end = response_data.index(b"\r\n\r\n") + 4
|
|
903
|
-
headers_part = (
|
|
904
|
-
response_data[:header_end]
|
|
905
|
-
.decode("utf-8", errors="replace")
|
|
906
|
-
.lower()
|
|
907
|
-
)
|
|
908
|
-
if "content-length:" in headers_part:
|
|
909
|
-
for line in headers_part.split("\r\n"):
|
|
910
|
-
if line.startswith("content-length:"):
|
|
911
|
-
content_length = int(line.split(":")[1].strip())
|
|
912
|
-
if len(response_data) >= header_end + content_length:
|
|
913
|
-
break
|
|
914
|
-
elif "transfer-encoding: chunked" not in headers_part:
|
|
915
|
-
break
|
|
916
|
-
except socket.timeout:
|
|
917
|
-
pass
|
|
918
|
-
|
|
919
|
-
return self._parse_http_response(response_data, final_url)
|
|
1000
|
+
Raises:
|
|
1001
|
+
ThordataTimeoutError: If task takes longer than max_wait.
|
|
1002
|
+
ThordataAPIError: If task fails or is cancelled.
|
|
1003
|
+
"""
|
|
1004
|
+
import time
|
|
920
1005
|
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
1006
|
+
# 1. Create Task
|
|
1007
|
+
if task_type == "video":
|
|
1008
|
+
if common_settings is None:
|
|
1009
|
+
raise ValueError("common_settings is required for video tasks")
|
|
1010
|
+
|
|
1011
|
+
config_video = VideoTaskConfig(
|
|
1012
|
+
file_name=file_name,
|
|
1013
|
+
spider_id=spider_id,
|
|
1014
|
+
spider_name=spider_name,
|
|
1015
|
+
parameters=parameters,
|
|
1016
|
+
common_settings=common_settings,
|
|
1017
|
+
include_errors=include_errors,
|
|
1018
|
+
)
|
|
1019
|
+
task_id = self.create_video_task_advanced(config_video)
|
|
932
1020
|
else:
|
|
933
|
-
|
|
1021
|
+
config = ScraperTaskConfig(
|
|
1022
|
+
file_name=file_name,
|
|
1023
|
+
spider_id=spider_id,
|
|
1024
|
+
spider_name=spider_name,
|
|
1025
|
+
parameters=parameters,
|
|
1026
|
+
universal_params=universal_params,
|
|
1027
|
+
include_errors=include_errors,
|
|
1028
|
+
)
|
|
1029
|
+
task_id = self.create_scraper_task_advanced(config)
|
|
934
1030
|
|
|
935
|
-
|
|
936
|
-
if len(response) < 2:
|
|
937
|
-
raise ConnectionError("SOCKS5 handshake failed: incomplete response")
|
|
1031
|
+
logger.info(f"Task created successfully: {task_id}. Waiting for completion...")
|
|
938
1032
|
|
|
939
|
-
|
|
940
|
-
|
|
1033
|
+
# 2. Poll Status (Smart Backoff)
|
|
1034
|
+
start_time = time.monotonic()
|
|
1035
|
+
current_poll = initial_poll_interval
|
|
941
1036
|
|
|
942
|
-
|
|
1037
|
+
while (time.monotonic() - start_time) < max_wait:
|
|
1038
|
+
status = self.get_task_status(task_id)
|
|
1039
|
+
status_lower = status.lower()
|
|
943
1040
|
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
1041
|
+
if status_lower in {"ready", "success", "finished"}:
|
|
1042
|
+
logger.info(f"Task {task_id} finished. Status: {status}")
|
|
1043
|
+
# 3. Get Result
|
|
1044
|
+
return self.get_task_result(task_id)
|
|
1045
|
+
|
|
1046
|
+
if status_lower in {"failed", "error", "cancelled"}:
|
|
1047
|
+
raise ThordataNetworkError(
|
|
1048
|
+
f"Task {task_id} ended with failed status: {status}"
|
|
948
1049
|
)
|
|
949
1050
|
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
1051
|
+
# Wait and increase interval (capped)
|
|
1052
|
+
time.sleep(current_poll)
|
|
1053
|
+
current_poll = min(current_poll * 1.5, max_poll_interval)
|
|
953
1054
|
|
|
954
|
-
|
|
955
|
-
if len(auth_resp) < 2 or auth_resp[1] != 0x00:
|
|
956
|
-
raise ConnectionError("SOCKS5 authentication failed")
|
|
1055
|
+
raise ThordataTimeoutError(f"Task {task_id} timed out after {max_wait} seconds")
|
|
957
1056
|
|
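Illustrative usage (not from the package diff): run_task wraps create, poll, and download as described in its docstring. A sketch assuming a configured client; the spider_id and parameters values are placeholders whose exact shape depends on the spider chosen in the Dashboard:

download_url = client.run_task(
    file_name="result_{{TasksID}}",
    spider_id="SPIDER_ID_FROM_DASHBOARD",
    spider_name="amazon.com",
    parameters={"url": "https://www.amazon.com/dp/EXAMPLE"},
    max_wait=900.0,
)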
|
958
|
-
|
|
959
|
-
|
|
1057
|
+
# =========================================================================
|
|
1058
|
+
# Account & Usage Methods
|
|
1059
|
+
# =========================================================================
|
|
960
1060
|
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
1061
|
+
def get_usage_statistics(
|
|
1062
|
+
self,
|
|
1063
|
+
from_date: str | date,
|
|
1064
|
+
to_date: str | date,
|
|
1065
|
+
) -> UsageStatistics:
|
|
1066
|
+
"""Get usage statistics for a date range.
|
|
965
1067
|
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
1068
|
+
Args:
|
|
1069
|
+
from_date: Start date (YYYY-MM-DD format or date object).
|
|
1070
|
+
to_date: End date (YYYY-MM-DD format or date object).
|
|
969
1071
|
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
0x07: "Command not supported",
|
|
979
|
-
0x08: "Address type not supported",
|
|
980
|
-
}
|
|
981
|
-
error_msg = error_codes.get(resp[1], f"Unknown error {resp[1]}")
|
|
982
|
-
raise ConnectionError(f"SOCKS5 connect failed: {error_msg}")
|
|
1072
|
+
Returns:
|
|
1073
|
+
UsageStatistics object with traffic data.
|
|
1074
|
+
"""
|
|
1075
|
+
self._require_public_credentials()
|
|
1076
|
+
if isinstance(from_date, date):
|
|
1077
|
+
from_date = from_date.strftime("%Y-%m-%d")
|
|
1078
|
+
if isinstance(to_date, date):
|
|
1079
|
+
to_date = to_date.strftime("%Y-%m-%d")
|
|
983
1080
|
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
1081
|
+
params = {
|
|
1082
|
+
"token": self.public_token,
|
|
1083
|
+
"key": self.public_key,
|
|
1084
|
+
"from_date": from_date,
|
|
1085
|
+
"to_date": to_date,
|
|
1086
|
+
}
|
|
1087
|
+
response = self._api_request_with_retry(
|
|
1088
|
+
"GET", self._usage_stats_url, params=params
|
|
1089
|
+
)
|
|
1090
|
+
response.raise_for_status()
|
|
1091
|
+
data = response.json()
|
|
1092
|
+
if data.get("code") != 200:
|
|
1093
|
+
raise_for_code("Usage stats error", code=data.get("code"), payload=data)
|
|
1094
|
+
return UsageStatistics.from_dict(data.get("data", data))
|
|
992
1095
|
|
|
993
|
-
|
|
1096
|
+
def get_traffic_balance(self) -> float:
|
|
1097
|
+
"""
|
|
1098
|
+
Get the current traffic balance in KB via Public API.
|
|
1099
|
+
"""
|
|
1100
|
+
self._require_public_credentials()
|
|
1101
|
+
# FIX: Auth params must be in Query, NOT Headers
|
|
1102
|
+
params = {
|
|
1103
|
+
"token": self.public_token,
|
|
1104
|
+
"key": self.public_key,
|
|
1105
|
+
}
|
|
1106
|
+
api_base = self._locations_base_url.replace("/locations", "")
|
|
994
1107
|
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
""
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
header_data = response_data
|
|
1005
|
-
body = b""
|
|
1108
|
+
response = self._api_request_with_retry(
|
|
1109
|
+
"GET", f"{api_base}/account/traffic-balance", params=params
|
|
1110
|
+
)
|
|
1111
|
+
response.raise_for_status()
|
|
1112
|
+
data = response.json()
|
|
1113
|
+
if data.get("code") != 200:
|
|
1114
|
+
raise_for_code(
|
|
1115
|
+
"Get traffic balance failed", code=data.get("code"), payload=data
|
|
1116
|
+
)
|
|
1006
1117
|
|
|
1007
|
-
|
|
1118
|
+
return float(data.get("data", {}).get("traffic_balance", 0))
|
|
1008
1119
|
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1120
|
+
def get_wallet_balance(self) -> float:
|
|
1121
|
+
"""
|
|
1122
|
+
Get the current wallet balance via Public API.
|
|
1123
|
+
"""
|
|
1124
|
+
self._require_public_credentials()
|
|
1125
|
+
# FIX: Auth params must be in Query, NOT Headers
|
|
1126
|
+
params = {
|
|
1127
|
+
"token": self.public_token,
|
|
1128
|
+
"key": self.public_key,
|
|
1129
|
+
}
|
|
1130
|
+
api_base = self._locations_base_url.replace("/locations", "")
|
|
1012
1131
|
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1132
|
+
response = self._api_request_with_retry(
|
|
1133
|
+
"GET", f"{api_base}/account/wallet-balance", params=params
|
|
1134
|
+
)
|
|
1135
|
+
response.raise_for_status()
|
|
1136
|
+
data = response.json()
|
|
1137
|
+
if data.get("code") != 200:
|
|
1138
|
+
raise_for_code(
|
|
1139
|
+
"Get wallet balance failed", code=data.get("code"), payload=data
|
|
1140
|
+
)
|
|
1018
1141
|
|
|
1019
|
-
|
|
1020
|
-
body = self._decode_chunked(body)
|
|
1142
|
+
return float(data.get("data", {}).get("balance", 0))
|
|
1021
1143
|
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1144
|
+
def get_proxy_user_usage(
|
|
1145
|
+
self,
|
|
1146
|
+
username: str,
|
|
1147
|
+
start_date: str | date,
|
|
1148
|
+
end_date: str | date,
|
|
1149
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1150
|
+
) -> list[dict[str, Any]]:
|
|
1151
|
+
"""
|
|
1152
|
+
Get traffic usage statistics for a specific proxy user.
|
|
1028
1153
|
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
break
|
|
1035
|
-
size_line, data = data.split(b"\r\n", 1)
|
|
1036
|
-
try:
|
|
1037
|
-
chunk_size = int(size_line.decode().strip(), 16)
|
|
1038
|
-
except ValueError:
|
|
1039
|
-
break
|
|
1154
|
+
Args:
|
|
1155
|
+
username: Sub-account username.
|
|
1156
|
+
start_date: Start date (YYYY-MM-DD).
|
|
1157
|
+
end_date: End date (YYYY-MM-DD).
|
|
1158
|
+
proxy_type: Proxy product type.
|
|
1040
1159
|
|
|
1041
|
-
|
|
1042
|
-
|
|
1160
|
+
Returns:
|
|
1161
|
+
List of daily usage records.
|
|
1162
|
+
"""
|
|
1163
|
+
self._require_public_credentials()
|
|
1164
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1043
1165
|
|
|
1044
|
-
|
|
1045
|
-
|
|
1166
|
+
if isinstance(start_date, date):
|
|
1167
|
+
start_date = start_date.strftime("%Y-%m-%d")
|
|
1168
|
+
if isinstance(end_date, date):
|
|
1169
|
+
end_date = end_date.strftime("%Y-%m-%d")
|
|
1046
1170
|
|
|
1047
|
-
|
|
1048
|
-
|
|
1171
|
+
params = {
|
|
1172
|
+
"token": self.public_token,
|
|
1173
|
+
"key": self.public_key,
|
|
1174
|
+
"proxy_type": str(pt),
|
|
1175
|
+
"username": username,
|
|
1176
|
+
"from_date": start_date,
|
|
1177
|
+
"to_date": end_date,
|
|
1178
|
+
}
|
|
1049
1179
|
|
|
1050
|
-
|
|
1180
|
+
response = self._api_request_with_retry(
|
|
1181
|
+
"GET", f"{self._proxy_users_url}/usage-statistics", params=params
|
|
1182
|
+
)
|
|
1183
|
+
response.raise_for_status()
|
|
1184
|
+
data = response.json()
|
|
1185
|
+
if data.get("code") != 200:
|
|
1186
|
+
raise_for_code("Get user usage failed", code=data.get("code"), payload=data)
|
|
1051
1187
|
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
# =========================================================================
|
|
1188
|
+
# Structure: { "data": [ { "date": "...", "usage_traffic": ... } ] }
|
|
1189
|
+
return data.get("data", [])
|
|
1055
1190
|
|
|
1056
|
-
def
|
|
1191
|
+
def extract_ip_list(
|
|
1057
1192
|
self,
|
|
1058
|
-
|
|
1059
|
-
*,
|
|
1060
|
-
engine: Engine | str = Engine.GOOGLE,
|
|
1061
|
-
num: int = 10,
|
|
1193
|
+
num: int = 1,
|
|
1062
1194
|
country: str | None = None,
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
engine=engine_str,
|
|
1076
|
-
num=num,
|
|
1077
|
-
country=country,
|
|
1078
|
-
language=language,
|
|
1079
|
-
search_type=search_type,
|
|
1080
|
-
device=device,
|
|
1081
|
-
render_js=render_js,
|
|
1082
|
-
no_cache=no_cache,
|
|
1083
|
-
output_format=output_format,
|
|
1084
|
-
extra_params=kwargs,
|
|
1085
|
-
)
|
|
1195
|
+
state: str | None = None,
|
|
1196
|
+
city: str | None = None,
|
|
1197
|
+
time_limit: int | None = None,
|
|
1198
|
+
port: int | None = None,
|
|
1199
|
+
return_type: str = "txt",
|
|
1200
|
+
protocol: str = "http",
|
|
1201
|
+
sep: str = "\r\n",
|
|
1202
|
+
product: str = "residential", # residential or unlimited
|
|
1203
|
+
) -> list[str]:
|
|
1204
|
+
"""
|
|
1205
|
+
Extract proxy IP list via API (get-ip.thordata.net).
|
|
1206
|
+
Requires IP whitelist configuration.
|
|
1086
1207
|
|
|
1087
|
-
|
|
1208
|
+
Args:
|
|
1209
|
+
num: Number of IPs to extract.
|
|
1210
|
+
country: Country code.
|
|
1211
|
+
state: State code.
|
|
1212
|
+
city: City name.
|
|
1213
|
+
time_limit: Session duration (1-90 mins).
|
|
1214
|
+
port: Specific port.
|
|
1215
|
+
return_type: "txt" or "json".
|
|
1216
|
+
protocol: "http" or "socks5".
|
|
1217
|
+
sep: Separator for txt output.
|
|
1218
|
+
product: "residential" or "unlimited".
|
|
1088
1219
|
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
headers=headers,
|
|
1104
|
-
)
|
|
1105
|
-
response.raise_for_status()
|
|
1106
|
-
|
|
1107
|
-
if request.output_format.lower() == "json":
|
|
1108
|
-
data = response.json()
|
|
1109
|
-
if isinstance(data, dict):
|
|
1110
|
-
code = data.get("code")
|
|
1111
|
-
if code is not None and code != 200:
|
|
1112
|
-
msg = extract_error_message(data)
|
|
1113
|
-
raise_for_code(f"SERP Error: {msg}", code=code, payload=data)
|
|
1114
|
-
return parse_json_response(data)
|
|
1115
|
-
|
|
1116
|
-
return {"html": response.text}
|
|
1117
|
-
|
|
1118
|
-
except requests.Timeout as e:
|
|
1119
|
-
raise ThordataTimeoutError(f"SERP timeout: {e}", original_error=e) from e
|
|
1120
|
-
except requests.RequestException as e:
|
|
1121
|
-
raise ThordataNetworkError(f"SERP failed: {e}", original_error=e) from e
|
|
1122
|
-
|
|
1123
|
-
# =========================================================================
|
|
1124
|
-
# Universal Scraping API
|
|
1125
|
-
# =========================================================================
|
|
1220
|
+
Returns:
|
|
1221
|
+
List of "IP:Port" strings.
|
|
1222
|
+
"""
|
|
1223
|
+
# Determine endpoint based on product
|
|
1224
|
+
base_url = "https://get-ip.thordata.net"
|
|
1225
|
+
endpoint = "/unlimited_api" if product == "unlimited" else "/api"
|
|
1226
|
+
|
|
1227
|
+
# Build params
|
|
1228
|
+
params: dict[str, Any] = {
|
|
1229
|
+
"num": str(num),
|
|
1230
|
+
"return_type": return_type,
|
|
1231
|
+
"protocol": protocol,
|
|
1232
|
+
"sep": sep,
|
|
1233
|
+
}
|
|
1126
1234
|
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
wait=wait,
|
|
1146
|
-
wait_for=wait_for,
|
|
1147
|
-
extra_params=kwargs,
|
|
1235
|
+
# Add optional params
|
|
1236
|
+
if country:
|
|
1237
|
+
params["country"] = country
|
|
1238
|
+
if state:
|
|
1239
|
+
params["state"] = state
|
|
1240
|
+
if city:
|
|
1241
|
+
params["city"] = city
|
|
1242
|
+
if time_limit:
|
|
1243
|
+
params["time"] = str(time_limit)
|
|
1244
|
+
if port:
|
|
1245
|
+
params["port"] = str(port)
|
|
1246
|
+
|
|
1247
|
+
username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
|
|
1248
|
+
if username:
|
|
1249
|
+
params["td-customer"] = username
|
|
1250
|
+
|
|
1251
|
+
response = self._api_session.get(
|
|
1252
|
+
f"{base_url}{endpoint}", params=params, timeout=self._default_timeout
|
|
1148
1253
|
)
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
|
|
1152
|
-
if not self.scraper_token:
|
|
1153
|
-
raise ThordataConfigError("scraper_token is required for Universal API")
|
|
1154
|
-
|
|
1155
|
-
payload = request.to_payload()
|
|
1156
|
-
headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
|
|
1157
|
-
|
|
1158
|
-
logger.info(f"Universal Scrape: {request.url}")
|
|
1159
|
-
|
|
1160
|
-
try:
|
|
1161
|
-
response = self._api_request_with_retry(
|
|
1162
|
-
"POST",
|
|
1163
|
-
self._universal_url,
|
|
1164
|
-
data=payload,
|
|
1165
|
-
headers=headers,
|
|
1166
|
-
)
|
|
1167
|
-
response.raise_for_status()
|
|
1168
|
-
return self._process_universal_response(response, request.output_format)
|
|
1169
|
-
|
|
1170
|
-
except requests.Timeout as e:
|
|
1171
|
-
raise ThordataTimeoutError(
|
|
1172
|
-
f"Universal timeout: {e}", original_error=e
|
|
1173
|
-
) from e
|
|
1174
|
-
except requests.RequestException as e:
|
|
1175
|
-
raise ThordataNetworkError(
|
|
1176
|
-
f"Universal failed: {e}", original_error=e
|
|
1177
|
-
) from e
|
|
1178
|
-
|
|
1179
|
-
def _process_universal_response(
|
|
1180
|
-
self, response: requests.Response, output_format: str
|
|
1181
|
-
) -> str | bytes:
|
|
1182
|
-
try:
|
|
1183
|
-
resp_json = response.json()
|
|
1184
|
-
except ValueError:
|
|
1185
|
-
return response.content if output_format.lower() == "png" else response.text
|
|
1254
|
+
response.raise_for_status()
|
|
1186
1255
|
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1256
|
+
# Parse result
|
|
1257
|
+
if return_type == "json":
|
|
1258
|
+
data = response.json()
|
|
1259
|
+
# JSON format: { "code": 0, "data": [ { "ip": "...", "port": ... } ] }
|
|
1260
|
+
if isinstance(data, dict):
|
|
1261
|
+
if data.get("code") == 0 or data.get("code") == 200:
|
|
1262
|
+
raw_list = data.get("data") or []
|
|
1263
|
+
return [f"{item['ip']}:{item['port']}" for item in raw_list]
|
|
1264
|
+
else:
|
|
1265
|
+
raise_for_code(
|
|
1266
|
+
"Extract IPs failed", code=data.get("code"), payload=data
|
|
1267
|
+
)
|
|
1268
|
+
return []
|
|
1192
1269
|
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1270
|
+
else: # txt
|
|
1271
|
+
text = response.text.strip()
|
|
1272
|
+
# Check for error message in text (often starts with { or contains "error")
|
|
1273
|
+
if text.startswith("{") and "code" in text:
|
|
1274
|
+
# Try parsing as JSON error
|
|
1275
|
+
try:
|
|
1276
|
+
err_data = json.loads(text)
|
|
1277
|
+
raise_for_code(
|
|
1278
|
+
"Extract IPs failed",
|
|
1279
|
+
code=err_data.get("code"),
|
|
1280
|
+
payload=err_data,
|
|
1281
|
+
)
|
|
1282
|
+
except json.JSONDecodeError:
|
|
1283
|
+
pass
|
|
1197
1284
|
|
|
1198
|
-
|
|
1285
|
+
actual_sep = sep.replace("\\r", "\r").replace("\\n", "\n")
|
|
1286
|
+
return [line.strip() for line in text.split(actual_sep) if line.strip()]
|
|
1199
1287
|
|
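Illustrative usage (not from the package diff): extract_ip_list calls get-ip.thordata.net and requires the caller's IP to be whitelisted. A sketch with placeholder filters, assuming a configured client; each returned entry is an "IP:Port" string:

ips = client.extract_ip_list(
    num=5,
    country="us",
    protocol="http",
    return_type="txt",
    product="residential",
)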
|
1200
1288
|
# =========================================================================
|
|
1201
|
-
#
|
|
1289
|
+
# Proxy Users Management (Sub-accounts)
|
|
1202
1290
|
# =========================================================================
|
|
1203
1291
|
|
|
1204
|
-
def
|
|
1205
|
-
self,
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
spider_name: str,
|
|
1209
|
-
parameters: dict[str, Any],
|
|
1210
|
-
universal_params: dict[str, Any] | None = None,
|
|
1211
|
-
) -> str:
|
|
1212
|
-
config = ScraperTaskConfig(
|
|
1213
|
-
file_name=file_name,
|
|
1214
|
-
spider_id=spider_id,
|
|
1215
|
-
spider_name=spider_name,
|
|
1216
|
-
parameters=parameters,
|
|
1217
|
-
universal_params=universal_params,
|
|
1218
|
-
)
|
|
1219
|
-
return self.create_scraper_task_advanced(config)
|
|
1292
|
+
def list_proxy_users(
|
|
1293
|
+
self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
|
|
1294
|
+
) -> ProxyUserList:
|
|
1295
|
+
"""List all proxy sub-accounts.
|
|
1220
1296
|
|
|
1221
|
-
|
|
1297
|
+
Args:
|
|
1298
|
+
proxy_type: Proxy product type.
|
|
1299
|
+
|
|
1300
|
+
Returns:
|
|
1301
|
+
ProxyUserList with user information.
|
|
1302
|
+
"""
|
|
1222
1303
|
self._require_public_credentials()
|
|
1223
|
-
if
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1304
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1305
|
+
params = {
|
|
1306
|
+
"token": self.public_token,
|
|
1307
|
+
"key": self.public_key,
|
|
1308
|
+
"proxy_type": str(pt),
|
|
1309
|
+
}
|
|
1310
|
+
response = self._api_request_with_retry(
|
|
1311
|
+
"GET", f"{self._proxy_users_url}/user-list", params=params
|
|
1228
1312
|
)
|
|
1313
|
+
response.raise_for_status()
|
|
1314
|
+
data = response.json()
|
|
1315
|
+
if data.get("code") != 200:
|
|
1316
|
+
raise_for_code("List users error", code=data.get("code"), payload=data)
|
|
1317
|
+
return ProxyUserList.from_dict(data.get("data", data))
|
|
1229
1318
|
|
|
1230
|
-
|
|
1231
|
-
response = self._api_request_with_retry(
|
|
1232
|
-
"POST", self._builder_url, data=payload, headers=headers
|
|
1233
|
-
)
|
|
1234
|
-
response.raise_for_status()
|
|
1235
|
-
data = response.json()
|
|
1236
|
-
if data.get("code") != 200:
|
|
1237
|
-
raise_for_code(
|
|
1238
|
-
"Task creation failed", code=data.get("code"), payload=data
|
|
1239
|
-
)
|
|
1240
|
-
return data["data"]["task_id"]
|
|
1241
|
-
except requests.RequestException as e:
|
|
1242
|
-
raise ThordataNetworkError(
|
|
1243
|
-
f"Task creation failed: {e}", original_error=e
|
|
1244
|
-
) from e
|
|
1245
|
-
|
|
1246
|
-
def create_video_task(
|
|
1319
|
+
def create_proxy_user(
|
|
1247
1320
|
self,
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
) -> str:
|
|
1254
|
-
|
|
1255
|
-
file_name=file_name,
|
|
1256
|
-
spider_id=spider_id,
|
|
1257
|
-
spider_name=spider_name,
|
|
1258
|
-
parameters=parameters,
|
|
1259
|
-
common_settings=common_settings,
|
|
1260
|
-
)
|
|
1261
|
-
return self.create_video_task_advanced(config)
|
|
1321
|
+
username: str,
|
|
1322
|
+
password: str,
|
|
1323
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1324
|
+
traffic_limit: int = 0,
|
|
1325
|
+
status: bool = True,
|
|
1326
|
+
) -> dict[str, Any]:
|
|
1327
|
+
"""Create a new proxy sub-account.
|
|
1262
1328
|
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1329
|
+
Args:
|
|
1330
|
+
username: Sub-account username.
|
|
1331
|
+
password: Sub-account password.
|
|
1332
|
+
proxy_type: Proxy product type.
|
|
1333
|
+
traffic_limit: Traffic limit in MB (0 = unlimited).
|
|
1334
|
+
status: Enable or disable the account.
|
|
1269
1335
|
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1336
|
+
Returns:
|
|
1337
|
+
API response data.
|
|
1338
|
+
"""
|
|
1339
|
+
self._require_public_credentials()
|
|
1340
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1341
|
+
headers = build_public_api_headers(
|
|
1342
|
+
self.public_token or "", self.public_key or ""
|
|
1273
1343
|
)
|
|
1274
|
-
|
|
1344
|
+
payload = {
|
|
1345
|
+
"proxy_type": str(pt),
|
|
1346
|
+
"username": username,
|
|
1347
|
+
"password": password,
|
|
1348
|
+
"traffic_limit": str(traffic_limit),
|
|
1349
|
+
"status": "true" if status else "false",
|
|
1350
|
+
}
|
|
1275
1351
|
response = self._api_request_with_retry(
|
|
1276
|
-
"POST",
|
|
1352
|
+
"POST",
|
|
1353
|
+
f"{self._proxy_users_url}/create-user",
|
|
1354
|
+
data=payload,
|
|
1355
|
+
headers=headers,
|
|
1277
1356
|
)
|
|
1278
1357
|
response.raise_for_status()
|
|
1279
1358
|
data = response.json()
|
|
1280
1359
|
if data.get("code") != 200:
|
|
1281
|
-
raise_for_code(
|
|
1282
|
-
|
|
1283
|
-
)
|
|
1284
|
-
return data["data"]["task_id"]
|
|
1360
|
+
raise_for_code("Create user failed", code=data.get("code"), payload=data)
|
|
1361
|
+
return data.get("data", {})
|
|
1285
1362
|
|
|
1286
|
-
def
|
|
1363
|
+
def update_proxy_user(
|
|
1364
|
+
self,
|
|
1365
|
+
username: str,
|
|
1366
|
+
password: str, # Added password as required argument
|
|
1367
|
+
traffic_limit: int | None = None,
|
|
1368
|
+
status: bool | None = None,
|
|
1369
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1370
|
+
) -> dict[str, Any]:
|
|
1371
|
+
"""
|
|
1372
|
+
Update an existing proxy user's settings.
|
|
1373
|
+
|
|
1374
|
+
Note: Password is required by the API even if not changing it.
|
|
1375
|
+
|
|
1376
|
+
Args:
|
|
1377
|
+
username: The sub-account username.
|
|
1378
|
+
password: The sub-account password (required for update).
|
|
1379
|
+
traffic_limit: New traffic limit in MB (0 for unlimited). None to keep unchanged.
|
|
1380
|
+
status: New status (True=enabled, False=disabled). None to keep unchanged.
|
|
1381
|
+
proxy_type: Proxy product type.
|
|
1382
|
+
|
|
1383
|
+
Returns:
|
|
1384
|
+
API response data.
|
|
1385
|
+
"""
|
|
1287
1386
|
self._require_public_credentials()
|
|
1387
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1288
1388
|
headers = build_public_api_headers(
|
|
1289
1389
|
self.public_token or "", self.public_key or ""
|
|
1290
1390
|
)
|
|
1291
|
-
try:
|
|
1292
|
-
response = self._api_request_with_retry(
|
|
1293
|
-
"POST",
|
|
1294
|
-
self._status_url,
|
|
1295
|
-
data={"tasks_ids": task_id},
|
|
1296
|
-
headers=headers,
|
|
1297
|
-
)
|
|
1298
|
-
response.raise_for_status()
|
|
1299
|
-
data = response.json()
|
|
1300
|
-
if data.get("code") != 200:
|
|
1301
|
-
raise_for_code("Task status error", code=data.get("code"), payload=data)
|
|
1302
1391
|
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
def safe_get_task_status(self, task_id: str) -> str:
|
|
1314
|
-
try:
|
|
1315
|
-
return self.get_task_status(task_id)
|
|
1316
|
-
except Exception:
|
|
1317
|
-
return "error"
|
|
1392
|
+
payload = {
|
|
1393
|
+
"proxy_type": str(pt),
|
|
1394
|
+
"username": username,
|
|
1395
|
+
"password": password, # Include password
|
|
1396
|
+
}
|
|
1397
|
+
if traffic_limit is not None:
|
|
1398
|
+
payload["traffic_limit"] = str(traffic_limit)
|
|
1399
|
+
if status is not None:
|
|
1400
|
+
payload["status"] = "true" if status else "false"
|
|
1318
1401
|
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1402
|
+
response = self._api_request_with_retry(
|
|
1403
|
+
"POST",
|
|
1404
|
+
f"{self._proxy_users_url}/update-user",
|
|
1405
|
+
data=payload,
|
|
1406
|
+
headers=headers,
|
|
1323
1407
|
)
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
headers=headers,
|
|
1330
|
-
)
|
|
1331
|
-
response.raise_for_status()
|
|
1332
|
-
data = response.json()
|
|
1333
|
-
if data.get("code") == 200 and data.get("data"):
|
|
1334
|
-
return data["data"]["download"]
|
|
1335
|
-
raise_for_code("Get result failed", code=data.get("code"), payload=data)
|
|
1336
|
-
return ""
|
|
1337
|
-
except requests.RequestException as e:
|
|
1338
|
-
raise ThordataNetworkError(
|
|
1339
|
-
f"Get result failed: {e}", original_error=e
|
|
1340
|
-
) from e
|
|
1408
|
+
response.raise_for_status()
|
|
1409
|
+
data = response.json()
|
|
1410
|
+
if data.get("code") != 200:
|
|
1411
|
+
raise_for_code("Update user failed", code=data.get("code"), payload=data)
|
|
1412
|
+
return data.get("data", {})
|
|
1341
1413
|
|
|
1342
|
-
def
|
|
1414
|
+
def delete_proxy_user(
|
|
1415
|
+
self,
|
|
1416
|
+
username: str,
|
|
1417
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1418
|
+
) -> dict[str, Any]:
|
|
1419
|
+
"""Delete a proxy user.
|
|
1420
|
+
|
|
1421
|
+
Args:
|
|
1422
|
+
username: The sub-account username.
|
|
1423
|
+
proxy_type: Proxy product type.
|
|
1424
|
+
|
|
1425
|
+
Returns:
|
|
1426
|
+
API response data.
|
|
1427
|
+
"""
|
|
1343
1428
|
self._require_public_credentials()
|
|
1429
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1344
1430
|
headers = build_public_api_headers(
|
|
1345
1431
|
self.public_token or "", self.public_key or ""
|
|
1346
1432
|
)
|
|
1433
|
+
|
|
1434
|
+
payload = {
|
|
1435
|
+
"proxy_type": str(pt),
|
|
1436
|
+
"username": username,
|
|
1437
|
+
}
|
|
1438
|
+
|
|
1347
1439
|
response = self._api_request_with_retry(
|
|
1348
1440
|
"POST",
|
|
1349
|
-
self.
|
|
1350
|
-
data=
|
|
1441
|
+
f"{self._proxy_users_url}/delete-user",
|
|
1442
|
+
data=payload,
|
|
1351
1443
|
headers=headers,
|
|
1352
1444
|
)
|
|
1353
1445
|
response.raise_for_status()
|
|
1354
1446
|
data = response.json()
|
|
1355
1447
|
if data.get("code") != 200:
|
|
1356
|
-
raise_for_code("
|
|
1357
|
-
return data.get("data", {
|
|
1358
|
-
|
|
1359
|
-
def wait_for_task(
|
|
1360
|
-
self,
|
|
1361
|
-
task_id: str,
|
|
1362
|
-
*,
|
|
1363
|
-
poll_interval: float = 5.0,
|
|
1364
|
-
max_wait: float = 600.0,
|
|
1365
|
-
) -> str:
|
|
1366
|
-
import time
|
|
1448
|
+
raise_for_code("Delete user failed", code=data.get("code"), payload=data)
|
|
1449
|
+
return data.get("data", {})
|
|
1367
1450
|
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
if status.lower() in {
|
|
1372
|
-
"ready",
|
|
1373
|
-
"success",
|
|
1374
|
-
"finished",
|
|
1375
|
-
"failed",
|
|
1376
|
-
"error",
|
|
1377
|
-
"cancelled",
|
|
1378
|
-
}:
|
|
1379
|
-
return status
|
|
1380
|
-
time.sleep(poll_interval)
|
|
1381
|
-
raise TimeoutError(f"Task {task_id} timeout")
|
|
1451
|
+
# =========================================================================
|
|
1452
|
+
# Whitelist IP Management
|
|
1453
|
+
# =========================================================================
|
|
1382
1454
|
|
|
1383
|
-
def
|
|
1455
|
+
def add_whitelist_ip(
|
|
1384
1456
|
self,
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
*,
|
|
1391
|
-
max_wait: float = 600.0,
|
|
1392
|
-
initial_poll_interval: float = 2.0,
|
|
1393
|
-
max_poll_interval: float = 10.0,
|
|
1394
|
-
include_errors: bool = True,
|
|
1395
|
-
) -> str:
|
|
1396
|
-
"""
|
|
1397
|
-
High-level wrapper to Run a Web Scraper task and wait for the result download URL.
|
|
1398
|
-
|
|
1399
|
-
This method handles the entire lifecycle:
|
|
1400
|
-
1. Create Task
|
|
1401
|
-
2. Poll status (with exponential backoff)
|
|
1402
|
-
3. Get download URL when ready
|
|
1457
|
+
ip: str,
|
|
1458
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1459
|
+
status: bool = True,
|
|
1460
|
+
) -> dict[str, Any]:
|
|
1461
|
+
"""Add an IP to the whitelist.
|
|
1403
1462
|
|
|
1404
1463
|
Args:
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
parameters: Spider-specific parameters.
|
|
1409
|
-
universal_params: Global spider settings.
|
|
1410
|
-
max_wait: Maximum seconds to wait for task completion (default 600).
|
|
1411
|
-
initial_poll_interval: Starting poll interval in seconds.
|
|
1412
|
-
max_poll_interval: Maximum poll interval cap.
|
|
1413
|
-
include_errors: Whether to include error logs in the task result.
|
|
1464
|
+
ip: IP address to whitelist.
|
|
1465
|
+
proxy_type: Proxy product type.
|
|
1466
|
+
status: Enable or disable the whitelist entry.
|
|
1414
1467
|
|
|
1415
1468
|
Returns:
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
Raises:
|
|
1419
|
-
ThordataTimeoutError: If task takes longer than max_wait.
|
|
1420
|
-
ThordataAPIError: If task fails or is cancelled.
|
|
1469
|
+
API response data.
|
|
1421
1470
|
"""
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
file_name=file_name,
|
|
1427
|
-
spider_id=spider_id,
|
|
1428
|
-
spider_name=spider_name,
|
|
1429
|
-
parameters=parameters,
|
|
1430
|
-
universal_params=universal_params,
|
|
1431
|
-
include_errors=include_errors,
|
|
1471
|
+
self._require_public_credentials()
|
|
1472
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1473
|
+
headers = build_public_api_headers(
|
|
1474
|
+
self.public_token or "", self.public_key or ""
|
|
1432
1475
|
)
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
if status_lower in {"failed", "error", "cancelled"}:
|
|
1450
|
-
raise ThordataNetworkError(
|
|
1451
|
-
f"Task {task_id} ended with failed status: {status}"
|
|
1452
|
-
)
|
|
1453
|
-
|
|
1454
|
-
# Wait and increase interval (capped)
|
|
1455
|
-
time.sleep(current_poll)
|
|
1456
|
-
current_poll = min(current_poll * 1.5, max_poll_interval)
|
|
1476
|
+
payload = {
|
|
1477
|
+
"proxy_type": str(pt),
|
|
1478
|
+
"ip": ip,
|
|
1479
|
+
"status": "true" if status else "false",
|
|
1480
|
+
}
|
|
1481
|
+
response = self._api_request_with_retry(
|
|
1482
|
+
"POST", f"{self._whitelist_url}/add-ip", data=payload, headers=headers
|
|
1483
|
+
)
|
|
1484
|
+
response.raise_for_status()
|
|
1485
|
+
data = response.json()
|
|
1486
|
+
if data.get("code") != 200:
|
|
1487
|
+
raise_for_code(
|
|
1488
|
+
"Add whitelist IP failed", code=data.get("code"), payload=data
|
|
1489
|
+
)
|
|
1490
|
+
return data.get("data", {})
|
|
1457
1491
|
|
|
1458
|
-
|
|
1492
|
+
def delete_whitelist_ip(
|
|
1493
|
+
self,
|
|
1494
|
+
ip: str,
|
|
1495
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1496
|
+
) -> dict[str, Any]:
|
|
1497
|
+
"""Delete an IP from the whitelist.
|
|
1459
1498
|
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1499
|
+
Args:
|
|
1500
|
+
ip: The IP address to remove.
|
|
1501
|
+
proxy_type: Proxy product type.
|
|
1463
1502
|
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
to_date: str | date,
|
|
1468
|
-
) -> UsageStatistics:
|
|
1503
|
+
Returns:
|
|
1504
|
+
API response data.
|
|
1505
|
+
"""
|
|
1469
1506
|
self._require_public_credentials()
|
|
1470
|
-
if isinstance(
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
"
|
|
1477
|
-
"key": self.public_key,
|
|
1478
|
-
"from_date": from_date,
|
|
1479
|
-
"to_date": to_date,
|
|
1507
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1508
|
+
headers = build_public_api_headers(
|
|
1509
|
+
self.public_token or "", self.public_key or ""
|
|
1510
|
+
)
|
|
1511
|
+
payload = {
|
|
1512
|
+
"proxy_type": str(pt),
|
|
1513
|
+
"ip": ip,
|
|
1480
1514
|
}
|
|
1481
1515
|
response = self._api_request_with_retry(
|
|
1482
|
-
"
|
|
1516
|
+
"POST", f"{self._whitelist_url}/delete-ip", data=payload, headers=headers
|
|
1483
1517
|
)
|
|
1484
1518
|
response.raise_for_status()
|
|
1485
1519
|
data = response.json()
|
|
1486
1520
|
if data.get("code") != 200:
|
|
1487
|
-
raise_for_code(
|
|
1488
|
-
|
|
1521
|
+
raise_for_code(
|
|
1522
|
+
"Delete whitelist IP failed", code=data.get("code"), payload=data
|
|
1523
|
+
)
|
|
1524
|
+
return data.get("data", {})
|
|
1489
1525
|
|
|
1490
|
-
def
|
|
1491
|
-
self,
|
|
1492
|
-
|
|
1526
|
+
def list_whitelist_ips(
|
|
1527
|
+
self,
|
|
1528
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1529
|
+
) -> list[str]:
|
|
1530
|
+
"""List all whitelisted IPs.
|
|
1531
|
+
|
|
1532
|
+
Args:
|
|
1533
|
+
proxy_type: Proxy product type.
|
|
1534
|
+
|
|
1535
|
+
Returns:
|
|
1536
|
+
List of IP address strings.
|
|
1537
|
+
"""
|
|
1493
1538
|
self._require_public_credentials()
|
|
1494
1539
|
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1495
1540
|
params = {
|
|
@@ -1498,179 +1543,863 @@ class ThordataClient:
|
|
|
1498
1543
|
"proxy_type": str(pt),
|
|
1499
1544
|
}
|
|
1500
1545
|
response = self._api_request_with_retry(
|
|
1501
|
-
"GET", f"{self.
|
|
1546
|
+
"GET", f"{self._whitelist_url}/ip-list", params=params
|
|
1502
1547
|
)
|
|
1503
1548
|
response.raise_for_status()
|
|
1504
1549
|
data = response.json()
|
|
1505
1550
|
if data.get("code") != 200:
|
|
1506
|
-
raise_for_code(
|
|
1507
|
-
|
|
1551
|
+
raise_for_code(
|
|
1552
|
+
"List whitelist IPs failed", code=data.get("code"), payload=data
|
|
1553
|
+
)
|
|
1554
|
+
|
|
1555
|
+
# API usually returns {"data": ["1.1.1.1", ...]} OR {"data": [{"ip": "..."}]}
|
|
1556
|
+
items = data.get("data", []) or []
|
|
1557
|
+
result = []
|
|
1558
|
+
for item in items:
|
|
1559
|
+
if isinstance(item, str):
|
|
1560
|
+
result.append(item)
|
|
1561
|
+
elif isinstance(item, dict) and "ip" in item:
|
|
1562
|
+
result.append(str(item["ip"]))
|
|
1563
|
+
else:
|
|
1564
|
+
result.append(str(item))
|
|
1565
|
+
return result
|
|
1566
|
+
|
|
1567
|
+
# =========================================================================
|
|
1568
|
+
# Locations & ASN Methods
|
|
1569
|
+
# =========================================================================
|
|
1570
|
+
|
|
1571
|
+
def list_countries(
|
|
1572
|
+
self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
|
|
1573
|
+
) -> list[dict[str, Any]]:
|
|
1574
|
+
"""List available countries for proxy locations.
|
|
1575
|
+
|
|
1576
|
+
Args:
|
|
1577
|
+
proxy_type: Proxy product type.
|
|
1578
|
+
|
|
1579
|
+
Returns:
|
|
1580
|
+
List of country dictionaries.
|
|
1581
|
+
"""
|
|
1582
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1583
|
+
return self._get_locations("countries", proxy_type=pt)
|
|
1584
|
+
|
|
1585
|
+
def list_states(
|
|
1586
|
+
self,
|
|
1587
|
+
country_code: str,
|
|
1588
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1589
|
+
) -> list[dict[str, Any]]:
|
|
1590
|
+
"""List available states/provinces for a country.
|
|
1591
|
+
|
|
1592
|
+
Args:
|
|
1593
|
+
country_code: Country code (e.g., "US", "GB").
|
|
1594
|
+
proxy_type: Proxy product type.
|
|
1595
|
+
|
|
1596
|
+
Returns:
|
|
1597
|
+
List of state dictionaries.
|
|
1598
|
+
"""
|
|
1599
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1600
|
+
return self._get_locations("states", proxy_type=pt, country_code=country_code)
|
|
1601
|
+
|
|
1602
|
+
def list_cities(
|
|
1603
|
+
self,
|
|
1604
|
+
country_code: str,
|
|
1605
|
+
state_code: str | None = None,
|
|
1606
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1607
|
+
) -> list[dict[str, Any]]:
|
|
1608
|
+
"""List available cities for a country/state.
|
|
1609
|
+
|
|
1610
|
+
Args:
|
|
1611
|
+
country_code: Country code.
|
|
1612
|
+
state_code: State code (optional).
|
|
1613
|
+
proxy_type: Proxy product type.
|
|
1614
|
+
|
|
1615
|
+
Returns:
|
|
1616
|
+
List of city dictionaries.
|
|
1617
|
+
"""
|
|
1618
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1619
|
+
kwargs = {"proxy_type": pt, "country_code": country_code}
|
|
1620
|
+
if state_code:
|
|
1621
|
+
kwargs["state_code"] = state_code
|
|
1622
|
+
return self._get_locations("cities", **kwargs)
|
|
1623
|
+
|
|
1624
|
+
def list_asn(
|
|
1625
|
+
self,
|
|
1626
|
+
country_code: str,
|
|
1627
|
+
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1628
|
+
) -> list[dict[str, Any]]:
|
|
1629
|
+
"""List available ASN numbers for a country.
|
|
1630
|
+
|
|
1631
|
+
Args:
|
|
1632
|
+
country_code: Country code.
|
|
1633
|
+
proxy_type: Proxy product type.
|
|
1634
|
+
|
|
1635
|
+
Returns:
|
|
1636
|
+
List of ASN dictionaries.
|
|
1637
|
+
"""
|
|
1638
|
+
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1639
|
+
return self._get_locations("asn", proxy_type=pt, country_code=country_code)
|
|
1640
|
+
|
|
1641
|
+
# =========================================================================
|
|
1642
|
+
# ISP & Datacenter Proxy Management
|
|
1643
|
+
# =========================================================================
|
|
1644
|
+
|
|
1645
|
+
def list_proxy_servers(self, proxy_type: int) -> list[ProxyServer]:
|
|
1646
|
+
"""List purchased proxy servers (ISP/Datacenter).
|
|
1647
|
+
|
|
1648
|
+
Args:
|
|
1649
|
+
proxy_type: Proxy type (1=ISP, 2=Datacenter).
|
|
1650
|
+
|
|
1651
|
+
Returns:
|
|
1652
|
+
List of ProxyServer objects.
|
|
1653
|
+
"""
|
|
1654
|
+
self._require_public_credentials()
|
|
1655
|
+
params = {
|
|
1656
|
+
"token": self.public_token,
|
|
1657
|
+
"key": self.public_key,
|
|
1658
|
+
"proxy_type": str(proxy_type),
|
|
1659
|
+
}
|
|
1660
|
+
response = self._api_request_with_retry(
|
|
1661
|
+
"GET", self._proxy_list_url, params=params
|
|
1662
|
+
)
|
|
1663
|
+
response.raise_for_status()
|
|
1664
|
+
data = response.json()
|
|
1665
|
+
if data.get("code") != 200:
|
|
1666
|
+
raise_for_code(
|
|
1667
|
+
"List proxy servers error", code=data.get("code"), payload=data
|
|
1668
|
+
)
|
|
1669
|
+
|
|
1670
|
+
server_list = []
|
|
1671
|
+
if isinstance(data, dict):
|
|
1672
|
+
server_list = data.get("data", data.get("list", []))
|
|
1673
|
+
elif isinstance(data, list):
|
|
1674
|
+
server_list = data
|
|
1675
|
+
|
|
1676
|
+
return [ProxyServer.from_dict(s) for s in server_list]
|
|
1677
|
+
|
|
1678
|
+
def get_proxy_expiration(
|
|
1679
|
+
self, ips: str | list[str], proxy_type: int
|
|
1680
|
+
) -> dict[str, Any]:
|
|
1681
|
+
"""Get expiration time for proxy IPs.
|
|
1682
|
+
|
|
1683
|
+
Args:
|
|
1684
|
+
ips: Single IP or comma-separated list of IPs.
|
|
1685
|
+
proxy_type: Proxy type (1=ISP, 2=Datacenter).
|
|
1686
|
+
|
|
1687
|
+
Returns:
|
|
1688
|
+
Dictionary with IP expiration times.
|
|
1689
|
+
"""
|
|
1690
|
+
self._require_public_credentials()
|
|
1691
|
+
if isinstance(ips, list):
|
|
1692
|
+
ips = ",".join(ips)
|
|
1693
|
+
params = {
|
|
1694
|
+
"token": self.public_token,
|
|
1695
|
+
"key": self.public_key,
|
|
1696
|
+
"proxy_type": str(proxy_type),
|
|
1697
|
+
"ips": ips,
|
|
1698
|
+
}
|
|
1699
|
+
response = self._api_request_with_retry(
|
|
1700
|
+
"GET", self._proxy_expiration_url, params=params
|
|
1701
|
+
)
|
|
1702
|
+
response.raise_for_status()
|
|
1703
|
+
data = response.json()
|
|
1704
|
+
if data.get("code") != 200:
|
|
1705
|
+
raise_for_code("Get expiration error", code=data.get("code"), payload=data)
|
|
1706
|
+
return data.get("data", data)
|
|
1707
|
+
|
|
1708
|
+
# =========================================================================
|
|
1709
|
+
# Internal Request Helpers
|
|
1710
|
+
# =========================================================================
|
|
1711
|
+
|
|
1712
|
+
def _api_request_with_retry(
|
|
1713
|
+
self,
|
|
1714
|
+
method: str,
|
|
1715
|
+
url: str,
|
|
1716
|
+
*,
|
|
1717
|
+
data: dict[str, Any] | None = None,
|
|
1718
|
+
headers: dict[str, str] | None = None,
|
|
1719
|
+
params: dict[str, Any] | None = None,
|
|
1720
|
+
) -> requests.Response:
|
|
1721
|
+
"""Make an API request with retry logic.
|
|
1722
|
+
|
|
1723
|
+
Args:
|
|
1724
|
+
method: HTTP method.
|
|
1725
|
+
url: Request URL.
|
|
1726
|
+
data: Request body data.
|
|
1727
|
+
headers: Request headers.
|
|
1728
|
+
query_params: Query string parameters.
|
|
1729
|
+
|
|
1730
|
+
Returns:
|
|
1731
|
+
Response object.
|
|
1732
|
+
"""
|
|
1733
|
+
|
|
1734
|
+
@with_retry(self._retry_config)
|
|
1735
|
+
def _do_request() -> requests.Response:
|
|
1736
|
+
return self._api_session.request(
|
|
1737
|
+
method,
|
|
1738
|
+
url,
|
|
1739
|
+
data=data,
|
|
1740
|
+
headers=headers,
|
|
1741
|
+
params=params,
|
|
1742
|
+
timeout=self._api_timeout,
|
|
1743
|
+
)
|
|
1744
|
+
|
|
1745
|
+
try:
|
|
1746
|
+
return _do_request()
|
|
1747
|
+
except requests.Timeout as e:
|
|
1748
|
+
raise ThordataTimeoutError(
|
|
1749
|
+
f"API request timed out: {e}", original_error=e
|
|
1750
|
+
) from e
|
|
1751
|
+
except requests.RequestException as e:
|
|
1752
|
+
raise ThordataNetworkError(
|
|
1753
|
+
f"API request failed: {e}", original_error=e
|
|
1754
|
+
) from e
|
|
1755
|
+
|
|
1756
|
+
def _require_public_credentials(self) -> None:
|
|
1757
|
+
"""Check that public credentials are set."""
|
|
1758
|
+
if not self.public_token or not self.public_key:
|
|
1759
|
+
raise ThordataConfigError(
|
|
1760
|
+
"public_token and public_key are required for this operation."
|
|
1761
|
+
)
|
|
1762
|
+
|
|
1763
|
+
def _get_locations(self, endpoint: str, **kwargs: Any) -> list[dict[str, Any]]:
|
|
1764
|
+
"""Internal method to fetch location data.
|
|
1765
|
+
|
|
1766
|
+
Args:
|
|
1767
|
+
endpoint: Location endpoint (countries, states, cities, asn).
|
|
1768
|
+
**kwargs: Query parameters.
|
|
1769
|
+
|
|
1770
|
+
Returns:
|
|
1771
|
+
List of location dictionaries.
|
|
1772
|
+
"""
|
|
1773
|
+
self._require_public_credentials()
|
|
1774
|
+
params = {"token": self.public_token, "key": self.public_key}
|
|
1775
|
+
for k, v in kwargs.items():
|
|
1776
|
+
params[k] = str(v)
|
|
1777
|
+
|
|
1778
|
+
response = self._api_request_with_retry(
|
|
1779
|
+
"GET", f"{self._locations_base_url}/{endpoint}", params=params
|
|
1780
|
+
)
|
|
1781
|
+
response.raise_for_status()
|
|
1782
|
+
data = response.json()
|
|
1783
|
+
if isinstance(data, dict):
|
|
1784
|
+
if data.get("code") != 200:
|
|
1785
|
+
raise RuntimeError(f"Locations error: {data.get('msg')}")
|
|
1786
|
+
return data.get("data") or []
|
|
1787
|
+
return data if isinstance(data, list) else []
|
|
1788
|
+
|
|
1789
|
+
def _process_universal_response(
|
|
1790
|
+
self, response: requests.Response, output_format: str
|
|
1791
|
+
) -> str | bytes:
|
|
1792
|
+
"""Process Universal API response.
|
|
1793
|
+
|
|
1794
|
+
Args:
|
|
1795
|
+
response: Response object.
|
|
1796
|
+
output_format: Expected output format.
|
|
1797
|
+
|
|
1798
|
+
Returns:
|
|
1799
|
+
Processed content.
|
|
1800
|
+
"""
|
|
1801
|
+
try:
|
|
1802
|
+
resp_json = response.json()
|
|
1803
|
+
except ValueError:
|
|
1804
|
+
return response.content if output_format.lower() == "png" else response.text
|
|
1805
|
+
|
|
1806
|
+
if isinstance(resp_json, dict):
|
|
1807
|
+
code = resp_json.get("code")
|
|
1808
|
+
if code is not None and code != 200:
|
|
1809
|
+
msg = extract_error_message(resp_json)
|
|
1810
|
+
raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
|
|
1811
|
+
|
|
1812
|
+
if "html" in resp_json:
|
|
1813
|
+
return resp_json["html"]
|
|
1814
|
+
if "png" in resp_json:
|
|
1815
|
+
return decode_base64_image(resp_json["png"])
|
|
1816
|
+
|
|
1817
|
+
return str(resp_json)
|
|
1818
|
+
|
|
1819
|
+
# =========================================================================
|
|
1820
|
+
# Proxy Implementation Details
|
|
1821
|
+
# =========================================================================
|
|
1822
|
+
|
|
1823
|
+
def _proxy_verb(
|
|
1824
|
+
self,
|
|
1825
|
+
method: str,
|
|
1826
|
+
url: str,
|
|
1827
|
+
proxy_config: ProxyConfig | None,
|
|
1828
|
+
timeout: int | None,
|
|
1829
|
+
**kwargs: Any,
|
|
1830
|
+
) -> requests.Response:
|
|
1831
|
+
"""Internal method for proxy requests."""
|
|
1832
|
+
timeout = timeout or self._default_timeout
|
|
1833
|
+
|
|
1834
|
+
if proxy_config is None:
|
|
1835
|
+
proxy_config = self._get_default_proxy_config_from_env()
|
|
1836
|
+
|
|
1837
|
+
if proxy_config is None:
|
|
1838
|
+
raise ThordataConfigError(
|
|
1839
|
+
"Proxy credentials are missing. "
|
|
1840
|
+
"Pass proxy_config or set THORDATA_RESIDENTIAL_USERNAME/PASSWORD env vars."
|
|
1841
|
+
)
|
|
1842
|
+
|
|
1843
|
+
kwargs.pop("proxies", None)
|
|
1844
|
+
|
|
1845
|
+
@with_retry(self._retry_config)
|
|
1846
|
+
def _do() -> requests.Response:
|
|
1847
|
+
return self._proxy_request_with_proxy_manager(
|
|
1848
|
+
method,
|
|
1849
|
+
url,
|
|
1850
|
+
proxy_config=proxy_config,
|
|
1851
|
+
timeout=timeout,
|
|
1852
|
+
headers=kwargs.pop("headers", None),
|
|
1853
|
+
params=kwargs.pop("params", None),
|
|
1854
|
+
data=kwargs.pop("data", None),
|
|
1855
|
+
)
|
|
1856
|
+
|
|
1857
|
+
try:
|
|
1858
|
+
return _do()
|
|
1859
|
+
except requests.Timeout as e:
|
|
1860
|
+
raise ThordataTimeoutError(
|
|
1861
|
+
f"Request timed out: {e}", original_error=e
|
|
1862
|
+
) from e
|
|
1863
|
+
except Exception as e:
|
|
1864
|
+
raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
|
|
1865
|
+
|
|
1866
|
+
def _proxy_manager_key(self, proxy_endpoint: str, userpass: str | None) -> str:
|
|
1867
|
+
"""Build a stable cache key for ProxyManager instances."""
|
|
1868
|
+
if not userpass:
|
|
1869
|
+
return proxy_endpoint
|
|
1870
|
+
h = hashlib.sha256(userpass.encode("utf-8")).hexdigest()[:12]
|
|
1871
|
+
return f"{proxy_endpoint}|auth={h}"
|
|
1872
|
+
|
|
1873
|
+
def _get_proxy_manager(
|
|
1874
|
+
self,
|
|
1875
|
+
proxy_url: str,
|
|
1876
|
+
*,
|
|
1877
|
+
cache_key: str,
|
|
1878
|
+
proxy_headers: dict[str, str] | None = None,
|
|
1879
|
+
) -> urllib3.PoolManager:
|
|
1880
|
+
"""Get or create a ProxyManager for the given proxy URL (Pooled)."""
|
|
1881
|
+
cached = self._proxy_managers.get(cache_key)
|
|
1882
|
+
if cached is not None:
|
|
1883
|
+
return cached
|
|
1884
|
+
|
|
1885
|
+
if proxy_url.startswith(("socks5://", "socks5h://", "socks4://", "socks4a://")):
|
|
1886
|
+
try:
|
|
1887
|
+
from urllib3.contrib.socks import SOCKSProxyManager
|
|
1888
|
+
except Exception as e:
|
|
1889
|
+
raise ThordataConfigError(
|
|
1890
|
+
"SOCKS proxy requested but SOCKS dependencies are missing. "
|
|
1891
|
+
"Install: pip install 'urllib3[socks]' or pip install PySocks"
|
|
1892
|
+
) from e
|
|
1893
|
+
|
|
1894
|
+
pm_socks = SOCKSProxyManager(
|
|
1895
|
+
proxy_url,
|
|
1896
|
+
num_pools=10,
|
|
1897
|
+
maxsize=10,
|
|
1898
|
+
)
|
|
1899
|
+
pm = cast(urllib3.PoolManager, pm_socks)
|
|
1900
|
+
self._proxy_managers[cache_key] = pm
|
|
1901
|
+
return pm
|
|
1902
|
+
|
|
1903
|
+
# HTTP/HTTPS proxies
|
|
1904
|
+
proxy_ssl_context = None
|
|
1905
|
+
if proxy_url.startswith("https://"):
|
|
1906
|
+
proxy_ssl_context = ssl.create_default_context()
|
|
1907
|
+
|
|
1908
|
+
pm_http = urllib3.ProxyManager(
|
|
1909
|
+
proxy_url,
|
|
1910
|
+
proxy_headers=proxy_headers,
|
|
1911
|
+
proxy_ssl_context=proxy_ssl_context,
|
|
1912
|
+
num_pools=10,
|
|
1913
|
+
maxsize=10,
|
|
1914
|
+
)
|
|
1915
|
+
|
|
1916
|
+
pm = cast(urllib3.PoolManager, pm_http)
|
|
1917
|
+
self._proxy_managers[cache_key] = pm
|
|
1918
|
+
return pm
|
|
1919
|
+
|
|
1920
|
+
def _proxy_request_with_proxy_manager(
|
|
1921
|
+
self,
|
|
1922
|
+
method: str,
|
|
1923
|
+
url: str,
|
|
1924
|
+
*,
|
|
1925
|
+
proxy_config: ProxyConfig,
|
|
1926
|
+
timeout: int,
|
|
1927
|
+
headers: dict[str, str] | None = None,
|
|
1928
|
+
params: dict[str, Any] | None = None,
|
|
1929
|
+
data: Any = None,
|
|
1930
|
+
) -> requests.Response:
|
|
1931
|
+
"""Execute request through proxy, with optional upstream proxy support."""
|
|
1932
|
+
|
|
1933
|
+
# Check for upstream proxy
|
|
1934
|
+
upstream_config = _parse_upstream_proxy()
|
|
1935
|
+
|
|
1936
|
+
if upstream_config:
|
|
1937
|
+
return self._proxy_request_with_upstream(
|
|
1938
|
+
method,
|
|
1939
|
+
url,
|
|
1940
|
+
proxy_config=proxy_config,
|
|
1941
|
+
timeout=timeout,
|
|
1942
|
+
headers=headers,
|
|
1943
|
+
params=params,
|
|
1944
|
+
data=data,
|
|
1945
|
+
upstream_config=upstream_config,
|
|
1946
|
+
)
|
|
1947
|
+
|
|
1948
|
+
# Original implementation (no upstream proxy)
|
|
1949
|
+
req = requests.Request(method=method.upper(), url=url, params=params)
|
|
1950
|
+
prepped = self._proxy_session.prepare_request(req)
|
|
1951
|
+
final_url = prepped.url or url
|
|
1952
|
+
|
|
1953
|
+
proxy_endpoint = proxy_config.build_proxy_endpoint()
|
|
1954
|
+
is_socks = proxy_endpoint.startswith(
|
|
1955
|
+
("socks5://", "socks5h://", "socks4://", "socks4a://")
|
|
1956
|
+
)
|
|
1957
|
+
|
|
1958
|
+
if is_socks:
|
|
1959
|
+
proxy_url_for_manager = proxy_config.build_proxy_url()
|
|
1960
|
+
userpass = proxy_config.build_proxy_basic_auth()
|
|
1961
|
+
cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
|
|
1962
|
+
|
|
1963
|
+
pm = self._get_proxy_manager(
|
|
1964
|
+
proxy_url_for_manager,
|
|
1965
|
+
cache_key=cache_key,
|
|
1966
|
+
proxy_headers=None,
|
|
1967
|
+
)
|
|
1968
|
+
else:
|
|
1969
|
+
userpass = proxy_config.build_proxy_basic_auth()
|
|
1970
|
+
proxy_headers = urllib3.make_headers(proxy_basic_auth=userpass)
|
|
1971
|
+
cache_key = self._proxy_manager_key(proxy_endpoint, userpass)
|
|
1972
|
+
|
|
1973
|
+
pm = self._get_proxy_manager(
|
|
1974
|
+
proxy_endpoint,
|
|
1975
|
+
cache_key=cache_key,
|
|
1976
|
+
proxy_headers=dict(proxy_headers),
|
|
1977
|
+
)
|
|
1978
|
+
|
|
1979
|
+
req_headers = dict(headers or {})
|
|
1980
|
+
body = None
|
|
1981
|
+
if data is not None:
|
|
1982
|
+
if isinstance(data, dict):
|
|
1983
|
+
body = urlencode({k: str(v) for k, v in data.items()})
|
|
1984
|
+
req_headers.setdefault(
|
|
1985
|
+
"Content-Type", "application/x-www-form-urlencoded"
|
|
1986
|
+
)
|
|
1987
|
+
else:
|
|
1988
|
+
body = data
|
|
1989
|
+
|
|
1990
|
+
http_resp = pm.request(
|
|
1991
|
+
method.upper(),
|
|
1992
|
+
final_url,
|
|
1993
|
+
body=body,
|
|
1994
|
+
headers=req_headers or None,
|
|
1995
|
+
timeout=urllib3.Timeout(connect=timeout, read=timeout),
|
|
1996
|
+
retries=False,
|
|
1997
|
+
preload_content=True,
|
|
1998
|
+
)
|
|
1999
|
+
|
|
2000
|
+
r = requests.Response()
|
|
2001
|
+
r.status_code = int(getattr(http_resp, "status", 0) or 0)
|
|
2002
|
+
r._content = http_resp.data or b""
|
|
2003
|
+
r.url = final_url
|
|
2004
|
+
r.headers = CaseInsensitiveDict(dict(http_resp.headers or {}))
|
|
2005
|
+
return r
|
|
2006
|
+
|
|
2007
|
+
# =========================================================================
|
|
2008
|
+
# Upstream Proxy Support (Proxy Chaining)
|
|
2009
|
+
# =========================================================================
|
|
2010
|
+
|
|
2011
|
+
def _proxy_request_with_upstream(
|
|
2012
|
+
self,
|
|
2013
|
+
method: str,
|
|
2014
|
+
url: str,
|
|
2015
|
+
*,
|
|
2016
|
+
proxy_config: ProxyConfig,
|
|
2017
|
+
timeout: int,
|
|
2018
|
+
headers: dict[str, str] | None = None,
|
|
2019
|
+
params: dict[str, Any] | None = None,
|
|
2020
|
+
data: Any = None,
|
|
2021
|
+
upstream_config: dict[str, Any],
|
|
2022
|
+
) -> requests.Response:
|
|
2023
|
+
"""Execute request through proxy chain: Upstream -> Thordata -> Target."""
|
|
2024
|
+
if not HAS_PYSOCKS:
|
|
2025
|
+
raise ThordataConfigError(
|
|
2026
|
+
"PySocks is required for upstream proxy support. "
|
|
2027
|
+
"Install with: pip install PySocks"
|
|
2028
|
+
)
|
|
2029
|
+
|
|
2030
|
+
req = requests.Request(method=method.upper(), url=url, params=params)
|
|
2031
|
+
prepped = self._proxy_session.prepare_request(req)
|
|
2032
|
+
final_url = prepped.url or url
|
|
2033
|
+
|
|
2034
|
+
parsed_target = urlparse(final_url)
|
|
2035
|
+
target_host = parsed_target.hostname or ""
|
|
2036
|
+
target_port = parsed_target.port or (
|
|
2037
|
+
443 if parsed_target.scheme == "https" else 80
|
|
2038
|
+
)
|
|
2039
|
+
target_is_https = parsed_target.scheme == "https"
|
|
2040
|
+
|
|
2041
|
+
protocol = proxy_config.protocol.lower()
|
|
2042
|
+
if protocol == "socks5":
|
|
2043
|
+
protocol = "socks5h"
|
|
2044
|
+
|
|
2045
|
+
thordata_host = proxy_config.host or ""
|
|
2046
|
+
thordata_port = proxy_config.port or 9999
|
|
2047
|
+
thordata_username = proxy_config.build_username()
|
|
2048
|
+
thordata_password = proxy_config.password
|
|
2049
|
+
|
|
2050
|
+
socket_factory = _UpstreamProxySocketFactory(upstream_config)
|
|
2051
|
+
|
|
2052
|
+
logger.debug(
|
|
2053
|
+
f"Proxy chain: upstream({upstream_config['host']}:{upstream_config['port']}) "
|
|
2054
|
+
f"-> thordata({protocol}://{thordata_host}:{thordata_port}) "
|
|
2055
|
+
f"-> target({target_host}:{target_port})"
|
|
2056
|
+
)
|
|
2057
|
+
|
|
2058
|
+
raw_sock = socket_factory.create_connection(
|
|
2059
|
+
(thordata_host, thordata_port),
|
|
2060
|
+
timeout=float(timeout),
|
|
2061
|
+
)
|
|
2062
|
+
|
|
2063
|
+
try:
|
|
2064
|
+
if protocol.startswith("socks"):
|
|
2065
|
+
sock = self._socks5_handshake(
|
|
2066
|
+
raw_sock,
|
|
2067
|
+
target_host,
|
|
2068
|
+
target_port,
|
|
2069
|
+
thordata_username,
|
|
2070
|
+
thordata_password,
|
|
2071
|
+
)
|
|
2072
|
+
if target_is_https:
|
|
2073
|
+
context = ssl.create_default_context()
|
|
2074
|
+
sock = context.wrap_socket(sock, server_hostname=target_host)
|
|
2075
|
+
|
|
2076
|
+
elif protocol == "https":
|
|
2077
|
+
proxy_context = ssl.create_default_context()
|
|
2078
|
+
proxy_ssl_sock = proxy_context.wrap_socket(
|
|
2079
|
+
raw_sock, server_hostname=thordata_host
|
|
2080
|
+
)
|
|
2081
|
+
|
|
2082
|
+
self._send_connect_request(
|
|
2083
|
+
proxy_ssl_sock,
|
|
2084
|
+
target_host,
|
|
2085
|
+
target_port,
|
|
2086
|
+
thordata_username,
|
|
2087
|
+
thordata_password,
|
|
2088
|
+
)
|
|
2089
|
+
|
|
2090
|
+
if target_is_https:
|
|
2091
|
+
# FIX: Add type ignore for MyPy because _TLSInTLSSocket is duck-typed as socket
|
|
2092
|
+
sock = self._create_tls_in_tls_socket(
|
|
2093
|
+
proxy_ssl_sock, target_host, timeout
|
|
2094
|
+
) # type: ignore[assignment]
|
|
2095
|
+
else:
|
|
2096
|
+
sock = proxy_ssl_sock
|
|
2097
|
+
|
|
2098
|
+
else: # HTTP proxy
|
|
2099
|
+
self._send_connect_request(
|
|
2100
|
+
raw_sock,
|
|
2101
|
+
target_host,
|
|
2102
|
+
target_port,
|
|
2103
|
+
thordata_username,
|
|
2104
|
+
thordata_password,
|
|
2105
|
+
)
|
|
2106
|
+
|
|
2107
|
+
if target_is_https:
|
|
2108
|
+
context = ssl.create_default_context()
|
|
2109
|
+
sock = context.wrap_socket(raw_sock, server_hostname=target_host)
|
|
2110
|
+
else:
|
|
2111
|
+
sock = raw_sock
|
|
2112
|
+
|
|
2113
|
+
return self._send_http_request(
|
|
2114
|
+
sock, method, parsed_target, headers, data, final_url, timeout
|
|
2115
|
+
)
|
|
2116
|
+
|
|
2117
|
+
finally:
|
|
2118
|
+
with contextlib.suppress(Exception):
|
|
2119
|
+
raw_sock.close()
|
|
2120
|
+
|
|
2121
|
+
def _send_connect_request(
|
|
2122
|
+
self,
|
|
2123
|
+
sock: socket.socket,
|
|
2124
|
+
target_host: str,
|
|
2125
|
+
target_port: int,
|
|
2126
|
+
proxy_username: str,
|
|
2127
|
+
proxy_password: str,
|
|
2128
|
+
) -> None:
|
|
2129
|
+
"""Send HTTP CONNECT request to proxy and verify response."""
|
|
2130
|
+
connect_req = f"CONNECT {target_host}:{target_port} HTTP/1.1\r\n"
|
|
2131
|
+
connect_req += f"Host: {target_host}:{target_port}\r\n"
|
|
2132
|
+
|
|
2133
|
+
credentials = f"{proxy_username}:{proxy_password}"
|
|
2134
|
+
encoded = base64.b64encode(credentials.encode()).decode()
|
|
2135
|
+
connect_req += f"Proxy-Authorization: Basic {encoded}\r\n"
|
|
2136
|
+
connect_req += "\r\n"
|
|
2137
|
+
|
|
2138
|
+
sock.sendall(connect_req.encode())
|
|
2139
|
+
|
|
2140
|
+
response = b""
|
|
2141
|
+
while b"\r\n\r\n" not in response:
|
|
2142
|
+
chunk = sock.recv(4096)
|
|
2143
|
+
if not chunk:
|
|
2144
|
+
raise ConnectionError("Proxy closed connection during CONNECT")
|
|
2145
|
+
response += chunk
|
|
2146
|
+
|
|
2147
|
+
status_line = response.split(b"\r\n")[0].decode()
|
|
2148
|
+
if "200" not in status_line:
|
|
2149
|
+
raise ConnectionError(f"Proxy CONNECT failed: {status_line}")
|
|
2150
|
+
|
|
2151
|
+
def _create_tls_in_tls_socket(
|
|
2152
|
+
self,
|
|
2153
|
+
outer_ssl_sock: ssl.SSLSocket,
|
|
2154
|
+
hostname: str,
|
|
2155
|
+
timeout: int,
|
|
2156
|
+
) -> _TLSInTLSSocket:
|
|
2157
|
+
"""Create a TLS connection over an existing TLS connection."""
|
|
2158
|
+
context = ssl.create_default_context()
|
|
2159
|
+
|
|
2160
|
+
incoming = ssl.MemoryBIO()
|
|
2161
|
+
outgoing = ssl.MemoryBIO()
|
|
2162
|
+
|
|
2163
|
+
ssl_obj = context.wrap_bio(incoming, outgoing, server_hostname=hostname)
|
|
2164
|
+
|
|
2165
|
+
while True:
|
|
2166
|
+
try:
|
|
2167
|
+
ssl_obj.do_handshake()
|
|
2168
|
+
break
|
|
2169
|
+
except ssl.SSLWantReadError:
|
|
2170
|
+
data_to_send = outgoing.read()
|
|
2171
|
+
if data_to_send:
|
|
2172
|
+
outer_ssl_sock.sendall(data_to_send)
|
|
2173
|
+
|
|
2174
|
+
outer_ssl_sock.settimeout(float(timeout))
|
|
2175
|
+
try:
|
|
2176
|
+
received = outer_ssl_sock.recv(8192)
|
|
2177
|
+
if not received:
|
|
2178
|
+
raise ConnectionError("Connection closed during TLS handshake")
|
|
2179
|
+
incoming.write(received)
|
|
2180
|
+
except socket.timeout as e:
|
|
2181
|
+
raise ConnectionError("Timeout during TLS handshake") from e
|
|
2182
|
+
except ssl.SSLWantWriteError:
|
|
2183
|
+
data_to_send = outgoing.read()
|
|
2184
|
+
if data_to_send:
|
|
2185
|
+
outer_ssl_sock.sendall(data_to_send)
|
|
2186
|
+
|
|
2187
|
+
data_to_send = outgoing.read()
|
|
2188
|
+
if data_to_send:
|
|
2189
|
+
outer_ssl_sock.sendall(data_to_send)
|
|
2190
|
+
|
|
2191
|
+
return _TLSInTLSSocket(outer_ssl_sock, ssl_obj, incoming, outgoing)
|
|
2192
|
+
|
|
2193
|
+
def _send_http_request(
|
|
2194
|
+
self,
|
|
2195
|
+
sock: socket.socket | ssl.SSLSocket | Any,
|
|
2196
|
+
method: str,
|
|
2197
|
+
parsed_url: Any,
|
|
2198
|
+
headers: dict[str, str] | None,
|
|
2199
|
+
data: Any,
|
|
2200
|
+
final_url: str,
|
|
2201
|
+
timeout: int,
|
|
2202
|
+
) -> requests.Response:
|
|
2203
|
+
"""Send HTTP request over established connection and parse response."""
|
|
2204
|
+
target_host = parsed_url.hostname
|
|
2205
|
+
|
|
2206
|
+
req_headers = dict(headers or {})
|
|
2207
|
+
req_headers.setdefault("Host", target_host)
|
|
2208
|
+
req_headers.setdefault("User-Agent", build_user_agent(_sdk_version, "requests"))
|
|
2209
|
+
req_headers.setdefault("Connection", "close")
|
|
2210
|
+
|
|
2211
|
+
path = parsed_url.path or "/"
|
|
2212
|
+
if parsed_url.query:
|
|
2213
|
+
path += f"?{parsed_url.query}"
|
|
2214
|
+
|
|
2215
|
+
http_req = f"{method.upper()} {path} HTTP/1.1\r\n"
|
|
2216
|
+
for k, v in req_headers.items():
|
|
2217
|
+
http_req += f"{k}: {v}\r\n"
|
|
2218
|
+
|
|
2219
|
+
body = None
|
|
2220
|
+
if data is not None:
|
|
2221
|
+
if isinstance(data, dict):
|
|
2222
|
+
body = urlencode({k: str(v) for k, v in data.items()}).encode()
|
|
2223
|
+
http_req += "Content-Type: application/x-www-form-urlencoded\r\n"
|
|
2224
|
+
http_req += f"Content-Length: {len(body)}\r\n"
|
|
2225
|
+
elif isinstance(data, bytes):
|
|
2226
|
+
body = data
|
|
2227
|
+
http_req += f"Content-Length: {len(body)}\r\n"
|
|
2228
|
+
else:
|
|
2229
|
+
body = str(data).encode()
|
|
2230
|
+
http_req += f"Content-Length: {len(body)}\r\n"
|
|
2231
|
+
|
|
2232
|
+
http_req += "\r\n"
|
|
2233
|
+
sock.sendall(http_req.encode())
|
|
2234
|
+
|
|
2235
|
+
if body:
|
|
2236
|
+
sock.sendall(body)
|
|
2237
|
+
|
|
2238
|
+
if hasattr(sock, "settimeout"):
|
|
2239
|
+
sock.settimeout(float(timeout))
|
|
2240
|
+
|
|
2241
|
+
response_data = b""
|
|
2242
|
+
try:
|
|
2243
|
+
while True:
|
|
2244
|
+
chunk = sock.recv(8192)
|
|
2245
|
+
if not chunk:
|
|
2246
|
+
break
|
|
2247
|
+
response_data += chunk
|
|
2248
|
+
if b"\r\n\r\n" in response_data:
|
|
2249
|
+
header_end = response_data.index(b"\r\n\r\n") + 4
|
|
2250
|
+
headers_part = (
|
|
2251
|
+
response_data[:header_end]
|
|
2252
|
+
.decode("utf-8", errors="replace")
|
|
2253
|
+
.lower()
|
|
2254
|
+
)
|
|
2255
|
+
if "content-length:" in headers_part:
|
|
2256
|
+
for line in headers_part.split("\r\n"):
|
|
2257
|
+
if line.startswith("content-length:"):
|
|
2258
|
+
content_length = int(line.split(":")[1].strip())
|
|
2259
|
+
if len(response_data) >= header_end + content_length:
|
|
2260
|
+
break
|
|
2261
|
+
elif "transfer-encoding: chunked" not in headers_part:
|
|
2262
|
+
break
|
|
2263
|
+
except socket.timeout:
|
|
2264
|
+
pass
|
|
2265
|
+
|
|
2266
|
+
return self._parse_http_response(response_data, final_url)
|
|
2267
|
+
|
|
2268
|
+
def _socks5_handshake(
|
|
2269
|
+
self,
|
|
2270
|
+
sock: socket.socket,
|
|
2271
|
+
target_host: str,
|
|
2272
|
+
target_port: int,
|
|
2273
|
+
username: str | None,
|
|
2274
|
+
password: str | None,
|
|
2275
|
+
) -> socket.socket:
|
|
2276
|
+
"""Perform SOCKS5 handshake over existing socket."""
|
|
2277
|
+
if username and password:
|
|
2278
|
+
sock.sendall(b"\x05\x02\x00\x02")
|
|
2279
|
+
else:
|
|
2280
|
+
sock.sendall(b"\x05\x01\x00")
|
|
2281
|
+
|
|
2282
|
+
response = sock.recv(2)
|
|
2283
|
+
if len(response) < 2:
|
|
2284
|
+
raise ConnectionError("SOCKS5 handshake failed: incomplete response")
|
|
2285
|
+
|
|
2286
|
+
if response[0] != 0x05:
|
|
2287
|
+
raise ConnectionError(f"SOCKS5 version mismatch: {response[0]}")
|
|
2288
|
+
|
|
2289
|
+
auth_method = response[1]
|
|
2290
|
+
|
|
2291
|
+
if auth_method == 0x02:
|
|
2292
|
+
if not username or not password:
|
|
2293
|
+
raise ConnectionError(
|
|
2294
|
+
"SOCKS5 server requires auth but no credentials provided"
|
|
2295
|
+
)
|
|
2296
|
+
|
|
2297
|
+
auth_req = bytes([0x01, len(username)]) + username.encode()
|
|
2298
|
+
auth_req += bytes([len(password)]) + password.encode()
|
|
2299
|
+
sock.sendall(auth_req)
|
|
1508
2300
|
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
|
|
1512
|
-
password: str,
|
|
1513
|
-
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1514
|
-
traffic_limit: int = 0,
|
|
1515
|
-
status: bool = True,
|
|
1516
|
-
) -> dict[str, Any]:
|
|
1517
|
-
self._require_public_credentials()
|
|
1518
|
-
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1519
|
-
headers = build_public_api_headers(
|
|
1520
|
-
self.public_token or "", self.public_key or ""
|
|
1521
|
-
)
|
|
1522
|
-
payload = {
|
|
1523
|
-
"proxy_type": str(pt),
|
|
1524
|
-
"username": username,
|
|
1525
|
-
"password": password,
|
|
1526
|
-
"traffic_limit": str(traffic_limit),
|
|
1527
|
-
"status": "true" if status else "false",
|
|
1528
|
-
}
|
|
1529
|
-
response = self._api_request_with_retry(
|
|
1530
|
-
"POST",
|
|
1531
|
-
f"{self._proxy_users_url}/create-user",
|
|
1532
|
-
data=payload,
|
|
1533
|
-
headers=headers,
|
|
1534
|
-
)
|
|
1535
|
-
response.raise_for_status()
|
|
1536
|
-
data = response.json()
|
|
1537
|
-
if data.get("code") != 200:
|
|
1538
|
-
raise_for_code("Create user failed", code=data.get("code"), payload=data)
|
|
1539
|
-
return data.get("data", {})
|
|
2301
|
+
auth_resp = sock.recv(2)
|
|
2302
|
+
if len(auth_resp) < 2 or auth_resp[1] != 0x00:
|
|
2303
|
+
raise ConnectionError("SOCKS5 authentication failed")
|
|
1540
2304
|
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
ip: str,
|
|
1544
|
-
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1545
|
-
status: bool = True,
|
|
1546
|
-
) -> dict[str, Any]:
|
|
1547
|
-
self._require_public_credentials()
|
|
1548
|
-
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1549
|
-
headers = build_public_api_headers(
|
|
1550
|
-
self.public_token or "", self.public_key or ""
|
|
1551
|
-
)
|
|
1552
|
-
payload = {
|
|
1553
|
-
"proxy_type": str(pt),
|
|
1554
|
-
"ip": ip,
|
|
1555
|
-
"status": "true" if status else "false",
|
|
1556
|
-
}
|
|
1557
|
-
response = self._api_request_with_retry(
|
|
1558
|
-
"POST", f"{self._whitelist_url}/add-ip", data=payload, headers=headers
|
|
1559
|
-
)
|
|
1560
|
-
response.raise_for_status()
|
|
1561
|
-
data = response.json()
|
|
1562
|
-
if data.get("code") != 200:
|
|
1563
|
-
raise_for_code(
|
|
1564
|
-
"Add whitelist IP failed", code=data.get("code"), payload=data
|
|
1565
|
-
)
|
|
1566
|
-
return data.get("data", {})
|
|
2305
|
+
elif auth_method == 0xFF:
|
|
2306
|
+
raise ConnectionError("SOCKS5 no acceptable auth method")
|
|
1567
2307
|
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
1572
|
-
"key": self.public_key,
|
|
1573
|
-
"proxy_type": str(proxy_type),
|
|
1574
|
-
}
|
|
1575
|
-
response = self._api_request_with_retry(
|
|
1576
|
-
"GET", self._proxy_list_url, params=params
|
|
1577
|
-
)
|
|
1578
|
-
response.raise_for_status()
|
|
1579
|
-
data = response.json()
|
|
1580
|
-
if data.get("code") != 200:
|
|
1581
|
-
raise_for_code(
|
|
1582
|
-
"List proxy servers error", code=data.get("code"), payload=data
|
|
1583
|
-
)
|
|
2308
|
+
connect_req = b"\x05\x01\x00\x03"
|
|
2309
|
+
connect_req += bytes([len(target_host)]) + target_host.encode()
|
|
2310
|
+
connect_req += target_port.to_bytes(2, "big")
|
|
2311
|
+
sock.sendall(connect_req)
|
|
1584
2312
|
|
|
1585
|
-
|
|
1586
|
-
if
|
|
1587
|
-
|
|
1588
|
-
elif isinstance(data, list):
|
|
1589
|
-
server_list = data
|
|
2313
|
+
resp = sock.recv(4)
|
|
2314
|
+
if len(resp) < 4:
|
|
2315
|
+
raise ConnectionError("SOCKS5 connect failed: incomplete response")
|
|
1590
2316
|
|
|
1591
|
-
|
|
2317
|
+
if resp[1] != 0x00:
|
|
2318
|
+
error_codes = {
|
|
2319
|
+
0x01: "General failure",
|
|
2320
|
+
0x02: "Connection not allowed",
|
|
2321
|
+
0x03: "Network unreachable",
|
|
2322
|
+
0x04: "Host unreachable",
|
|
2323
|
+
0x05: "Connection refused",
|
|
2324
|
+
0x06: "TTL expired",
|
|
2325
|
+
0x07: "Command not supported",
|
|
2326
|
+
0x08: "Address type not supported",
|
|
2327
|
+
}
|
|
2328
|
+
error_msg = error_codes.get(resp[1], f"Unknown error {resp[1]}")
|
|
2329
|
+
raise ConnectionError(f"SOCKS5 connect failed: {error_msg}")
|
|
1592
2330
|
|
|
1593
|
-
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
"key": self.public_key,
|
|
1602
|
-
"proxy_type": str(proxy_type),
|
|
1603
|
-
"ips": ips,
|
|
1604
|
-
}
|
|
1605
|
-
response = self._api_request_with_retry(
|
|
1606
|
-
"GET", self._proxy_expiration_url, params=params
|
|
1607
|
-
)
|
|
1608
|
-
response.raise_for_status()
|
|
1609
|
-
data = response.json()
|
|
1610
|
-
if data.get("code") != 200:
|
|
1611
|
-
raise_for_code("Get expiration error", code=data.get("code"), payload=data)
|
|
1612
|
-
return data.get("data", data)
|
|
2331
|
+
addr_type = resp[3]
|
|
2332
|
+
if addr_type == 0x01:
|
|
2333
|
+
sock.recv(4 + 2)
|
|
2334
|
+
elif addr_type == 0x03:
|
|
2335
|
+
domain_len = sock.recv(1)[0]
|
|
2336
|
+
sock.recv(domain_len + 2)
|
|
2337
|
+
elif addr_type == 0x04:
|
|
2338
|
+
sock.recv(16 + 2)
|
|
1613
2339
|
|
|
1614
|
-
|
|
1615
|
-
self, proxy_type: ProxyType | int = ProxyType.RESIDENTIAL
|
|
1616
|
-
) -> list[dict[str, Any]]:
|
|
1617
|
-
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1618
|
-
return self._get_locations("countries", proxy_type=pt)
|
|
2340
|
+
return sock
|
|
1619
2341
|
|
|
1620
|
-
def
|
|
2342
|
+
def _parse_http_response(
|
|
1621
2343
|
self,
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
) ->
|
|
1625
|
-
|
|
1626
|
-
|
|
2344
|
+
response_data: bytes,
|
|
2345
|
+
url: str,
|
|
2346
|
+
) -> requests.Response:
|
|
2347
|
+
"""Parse raw HTTP response into requests.Response."""
|
|
2348
|
+
if b"\r\n\r\n" in response_data:
|
|
2349
|
+
header_data, body = response_data.split(b"\r\n\r\n", 1)
|
|
2350
|
+
else:
|
|
2351
|
+
header_data = response_data
|
|
2352
|
+
body = b""
|
|
1627
2353
|
|
|
1628
|
-
|
|
1629
|
-
self,
|
|
1630
|
-
country_code: str,
|
|
1631
|
-
state_code: str | None = None,
|
|
1632
|
-
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1633
|
-
) -> list[dict[str, Any]]:
|
|
1634
|
-
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1635
|
-
kwargs = {"proxy_type": pt, "country_code": country_code}
|
|
1636
|
-
if state_code:
|
|
1637
|
-
kwargs["state_code"] = state_code
|
|
1638
|
-
return self._get_locations("cities", **kwargs)
|
|
2354
|
+
header_lines = header_data.decode("utf-8", errors="replace").split("\r\n")
|
|
1639
2355
|
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
proxy_type: ProxyType | int = ProxyType.RESIDENTIAL,
|
|
1644
|
-
) -> list[dict[str, Any]]:
|
|
1645
|
-
pt = int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
1646
|
-
return self._get_locations("asn", proxy_type=pt, country_code=country_code)
|
|
2356
|
+
status_line = header_lines[0] if header_lines else ""
|
|
2357
|
+
parts = status_line.split(" ", 2)
|
|
2358
|
+
status_code = int(parts[1]) if len(parts) > 1 else 0
|
|
1647
2359
|
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
2360
|
+
headers_dict = {}
|
|
2361
|
+
for line in header_lines[1:]:
|
|
2362
|
+
if ": " in line:
|
|
2363
|
+
k, v = line.split(": ", 1)
|
|
2364
|
+
headers_dict[k] = v
|
|
1653
2365
|
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
)
|
|
1657
|
-
response.raise_for_status()
|
|
1658
|
-
data = response.json()
|
|
1659
|
-
if isinstance(data, dict):
|
|
1660
|
-
if data.get("code") != 200:
|
|
1661
|
-
raise RuntimeError(f"Locations error: {data.get('msg')}")
|
|
1662
|
-
return data.get("data") or []
|
|
1663
|
-
return data if isinstance(data, list) else []
|
|
2366
|
+
if headers_dict.get("Transfer-Encoding", "").lower() == "chunked":
|
|
2367
|
+
body = self._decode_chunked(body)
|
|
1664
2368
|
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
2369
|
+
r = requests.Response()
|
|
2370
|
+
r.status_code = status_code
|
|
2371
|
+
r._content = body
|
|
2372
|
+
r.url = url
|
|
2373
|
+
r.headers = CaseInsensitiveDict(headers_dict)
|
|
2374
|
+
return r
|
|
2375
|
+
|
|
2376
|
+
def _decode_chunked(self, data: bytes) -> bytes:
|
|
2377
|
+
"""Decode chunked transfer encoding."""
|
|
2378
|
+
result = b""
|
|
2379
|
+
while data:
|
|
2380
|
+
if b"\r\n" not in data:
|
|
2381
|
+
break
|
|
2382
|
+
size_line, data = data.split(b"\r\n", 1)
|
|
2383
|
+
try:
|
|
2384
|
+
chunk_size = int(size_line.decode().strip(), 16)
|
|
2385
|
+
except ValueError:
|
|
2386
|
+
break
|
|
2387
|
+
|
|
2388
|
+
if chunk_size == 0:
|
|
2389
|
+
break
|
|
2390
|
+
|
|
2391
|
+
result += data[:chunk_size]
|
|
2392
|
+
data = data[chunk_size:]
|
|
2393
|
+
|
|
2394
|
+
if data.startswith(b"\r\n"):
|
|
2395
|
+
data = data[2:]
|
|
2396
|
+
|
|
2397
|
+
return result
|
|
1670
2398
|
|
|
1671
2399
|
def _get_proxy_endpoint_overrides(
|
|
1672
2400
|
self, product: ProxyProduct
|
|
1673
2401
|
) -> tuple[str | None, int | None, str]:
|
|
2402
|
+
"""Get proxy endpoint overrides from environment variables."""
|
|
1674
2403
|
prefix = product.value.upper()
|
|
1675
2404
|
host = os.getenv(f"THORDATA_{prefix}_PROXY_HOST") or os.getenv(
|
|
1676
2405
|
"THORDATA_PROXY_HOST"
|
|
@@ -1687,6 +2416,7 @@ class ThordataClient:
|
|
|
1687
2416
|
return host or None, port, protocol
|
|
1688
2417
|
|
|
1689
2418
|
def _get_default_proxy_config_from_env(self) -> ProxyConfig | None:
|
|
2419
|
+
"""Get proxy configuration from environment variables."""
|
|
1690
2420
|
for prod in [
|
|
1691
2421
|
ProxyProduct.RESIDENTIAL,
|
|
1692
2422
|
ProxyProduct.DATACENTER,
|
|
@@ -1707,15 +2437,43 @@ class ThordataClient:
|
|
|
1707
2437
|
)
|
|
1708
2438
|
return None
|
|
1709
2439
|
|
|
1710
|
-
def
|
|
1711
|
-
self
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
|
|
1715
|
-
self._proxy_managers.clear()
|
|
2440
|
+
def get_browser_connection_url(
|
|
2441
|
+
self, username: str | None = None, password: str | None = None
|
|
2442
|
+
) -> str:
|
|
2443
|
+
"""
|
|
2444
|
+
Generate the WebSocket URL for connecting to Scraping Browser.
|
|
1716
2445
|
|
|
1717
|
-
|
|
1718
|
-
|
|
2446
|
+
Args:
|
|
2447
|
+
username: Proxy username (without 'td-customer-' prefix).
|
|
2448
|
+
Defaults to THORDATA_BROWSER_USERNAME or THORDATA_RESIDENTIAL_USERNAME.
|
|
2449
|
+
password: Proxy password.
|
|
1719
2450
|
|
|
1720
|
-
|
|
1721
|
-
|
|
2451
|
+
Returns:
|
|
2452
|
+
WSS URL string suitable for playwright.connect_over_cdp().
|
|
2453
|
+
|
|
2454
|
+
Raises:
|
|
2455
|
+
ThordataConfigError: If credentials are missing.
|
|
2456
|
+
"""
|
|
2457
|
+
user = (
|
|
2458
|
+
username
|
|
2459
|
+
or os.getenv("THORDATA_BROWSER_USERNAME")
|
|
2460
|
+
or os.getenv("THORDATA_RESIDENTIAL_USERNAME")
|
|
2461
|
+
)
|
|
2462
|
+
pwd = (
|
|
2463
|
+
password
|
|
2464
|
+
or os.getenv("THORDATA_BROWSER_PASSWORD")
|
|
2465
|
+
or os.getenv("THORDATA_RESIDENTIAL_PASSWORD")
|
|
2466
|
+
)
|
|
2467
|
+
|
|
2468
|
+
if not user or not pwd:
|
|
2469
|
+
raise ThordataConfigError(
|
|
2470
|
+
"Browser credentials missing. Set THORDATA_BROWSER_USERNAME/PASSWORD or pass arguments."
|
|
2471
|
+
)
|
|
2472
|
+
prefix = "td-customer-"
|
|
2473
|
+
final_user = f"{prefix}{user}" if not user.startswith(prefix) else user
|
|
2474
|
+
|
|
2475
|
+
# URL encode
|
|
2476
|
+
safe_user = quote(final_user, safe="")
|
|
2477
|
+
safe_pass = quote(pwd, safe="")
|
|
2478
|
+
|
|
2479
|
+
return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"
|