thordata-sdk 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

thordata/client.py CHANGED
@@ -1,995 +1,1040 @@
1
- """
2
- Synchronous client for the Thordata API.
3
-
4
- This module provides the main ThordataClient class for interacting with
5
- Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
6
-
7
- Example:
8
- >>> from thordata import ThordataClient
9
- >>>
10
- >>> client = ThordataClient(
11
- ... scraper_token="your_token",
12
- ... public_token="your_public_token",
13
- ... public_key="your_public_key"
14
- ... )
15
- >>>
16
- >>> # Use the proxy network
17
- >>> response = client.get("https://httpbin.org/ip")
18
- >>> print(response.json())
19
- >>>
20
- >>> # Search with SERP API
21
- >>> results = client.serp_search("python tutorial", engine="google")
22
- """
23
-
24
- from __future__ import annotations
25
-
26
- import logging
27
- from typing import Any, Dict, List, Optional, Union
28
-
29
- import os
30
- import requests
31
-
32
- from ._utils import (
33
- build_auth_headers,
34
- build_public_api_headers,
35
- decode_base64_image,
36
- extract_error_message,
37
- parse_json_response,
38
- )
39
- from .enums import Engine, ProxyType
40
- from .exceptions import (
41
- ThordataConfigError,
42
- ThordataNetworkError,
43
- ThordataTimeoutError,
44
- raise_for_code,
45
- )
46
- from .models import (
47
- ProxyConfig,
48
- ProxyProduct,
49
- ScraperTaskConfig,
50
- SerpRequest,
51
- UniversalScrapeRequest,
52
- )
53
- from .retry import RetryConfig, with_retry
54
-
55
- logger = logging.getLogger(__name__)
56
-
57
-
58
- class ThordataClient:
59
- """
60
- The official synchronous Python client for Thordata.
61
-
62
- This client handles authentication and communication with:
63
- - Proxy Network (Residential/Datacenter/Mobile/ISP via HTTP/HTTPS)
64
- - SERP API (Real-time Search Engine Results)
65
- - Universal Scraping API (Web Unlocker - Single Page Rendering)
66
- - Web Scraper API (Async Task Management)
67
-
68
- Args:
69
- scraper_token: The API token from your Dashboard.
70
- public_token: The public API token (for task status, locations).
71
- public_key: The public API key.
72
- proxy_host: Custom proxy gateway host (optional).
73
- proxy_port: Custom proxy gateway port (optional).
74
- timeout: Default request timeout in seconds (default: 30).
75
- retry_config: Configuration for automatic retries (optional).
76
-
77
- Example:
78
- >>> client = ThordataClient(
79
- ... scraper_token="your_scraper_token",
80
- ... public_token="your_public_token",
81
- ... public_key="your_public_key"
82
- ... )
83
- """
84
-
85
- # API Endpoints
86
- BASE_URL = "https://scraperapi.thordata.com"
87
- UNIVERSAL_URL = "https://universalapi.thordata.com"
88
- API_URL = "https://api.thordata.com/api/web-scraper-api"
89
- LOCATIONS_URL = "https://api.thordata.com/api/locations"
90
-
91
- def __init__(
92
- self,
93
- scraper_token: str,
94
- public_token: Optional[str] = None,
95
- public_key: Optional[str] = None,
96
- proxy_host: str = "pr.thordata.net",
97
- proxy_port: int = 9999,
98
- timeout: int = 30,
99
- retry_config: Optional[RetryConfig] = None,
100
- scraperapi_base_url: Optional[str] = None,
101
- universalapi_base_url: Optional[str] = None,
102
- web_scraper_api_base_url: Optional[str] = None,
103
- locations_base_url: Optional[str] = None,
104
- ) -> None:
105
- """Initialize the Thordata Client."""
106
- if not scraper_token:
107
- raise ThordataConfigError("scraper_token is required")
108
-
109
- self.scraper_token = scraper_token
110
- self.public_token = public_token
111
- self.public_key = public_key
112
-
113
- # Proxy configuration
114
- self._proxy_host = proxy_host
115
- self._proxy_port = proxy_port
116
- self._default_timeout = timeout
117
-
118
- # Retry configuration
119
- self._retry_config = retry_config or RetryConfig()
120
-
121
- # Build default proxy URL (for basic usage)
122
- self._default_proxy_url = (
123
- f"http://td-customer-{self.scraper_token}:@{proxy_host}:{proxy_port}"
124
- )
125
-
126
- # Sessions:
127
- # - _proxy_session: used for proxy network traffic to target sites
128
- # - _api_session: used for Thordata APIs (SERP/Universal/Tasks/Locations)
129
- #
130
- # We intentionally do NOT set session-level proxies for _api_session,
131
- # so developers can rely on system proxy settings (e.g., Clash) via env vars.
132
- self._proxy_session = requests.Session()
133
- self._proxy_session.trust_env = False
134
- self._proxy_session.proxies = {
135
- "http": self._default_proxy_url,
136
- "https": self._default_proxy_url,
137
- }
138
-
139
- self._api_session = requests.Session()
140
- self._api_session.trust_env = True
141
-
142
- # Base URLs (allow override via args or env vars for testing and custom routing)
143
- scraperapi_base = (
144
- scraperapi_base_url
145
- or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
146
- or self.BASE_URL
147
- ).rstrip("/")
148
-
149
- universalapi_base = (
150
- universalapi_base_url
151
- or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
152
- or self.UNIVERSAL_URL
153
- ).rstrip("/")
154
-
155
- web_scraper_api_base = (
156
- web_scraper_api_base_url
157
- or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
158
- or self.API_URL
159
- ).rstrip("/")
160
-
161
- locations_base = (
162
- locations_base_url
163
- or os.getenv("THORDATA_LOCATIONS_BASE_URL")
164
- or self.LOCATIONS_URL
165
- ).rstrip("/")
166
-
167
- self._serp_url = f"{scraperapi_base}/request"
168
- self._builder_url = f"{scraperapi_base}/builder"
169
- self._universal_url = f"{universalapi_base}/request"
170
- self._status_url = f"{web_scraper_api_base}/tasks-status"
171
- self._download_url = f"{web_scraper_api_base}/tasks-download"
172
- self._locations_base_url = locations_base
173
-
174
- # =========================================================================
175
- # Proxy Network Methods
176
- # =========================================================================
177
-
178
- def get(
179
- self,
180
- url: str,
181
- *,
182
- proxy_config: Optional[ProxyConfig] = None,
183
- timeout: Optional[int] = None,
184
- **kwargs: Any,
185
- ) -> requests.Response:
186
- """
187
- Send a GET request through the Thordata Proxy Network.
188
-
189
- Args:
190
- url: The target URL.
191
- proxy_config: Custom proxy configuration for geo-targeting/sessions.
192
- timeout: Request timeout in seconds.
193
- **kwargs: Additional arguments to pass to requests.get().
194
-
195
- Returns:
196
- The response object.
197
-
198
- Example:
199
- >>> # Basic request
200
- >>> response = client.get("https://httpbin.org/ip")
201
- >>>
202
- >>> # With geo-targeting
203
- >>> from thordata.models import ProxyConfig
204
- >>> config = ProxyConfig(
205
- ... username="myuser",
206
- ... password="mypass",
207
- ... country="us",
208
- ... city="seattle"
209
- ... )
210
- >>> response = client.get("https://httpbin.org/ip", proxy_config=config)
211
- """
212
- logger.debug(f"Proxy GET request: {url}")
213
-
214
- timeout = timeout or self._default_timeout
215
-
216
- if proxy_config:
217
- proxies = proxy_config.to_proxies_dict()
218
- kwargs["proxies"] = proxies
219
-
220
- return self._request_with_retry("GET", url, timeout=timeout, **kwargs)
221
-
222
- def post(
223
- self,
224
- url: str,
225
- *,
226
- proxy_config: Optional[ProxyConfig] = None,
227
- timeout: Optional[int] = None,
228
- **kwargs: Any,
229
- ) -> requests.Response:
230
- """
231
- Send a POST request through the Thordata Proxy Network.
232
-
233
- Args:
234
- url: The target URL.
235
- proxy_config: Custom proxy configuration.
236
- timeout: Request timeout in seconds.
237
- **kwargs: Additional arguments to pass to requests.post().
238
-
239
- Returns:
240
- The response object.
241
- """
242
- logger.debug(f"Proxy POST request: {url}")
243
-
244
- timeout = timeout or self._default_timeout
245
-
246
- if proxy_config:
247
- proxies = proxy_config.to_proxies_dict()
248
- kwargs["proxies"] = proxies
249
-
250
- return self._request_with_retry("POST", url, timeout=timeout, **kwargs)
251
-
252
- def build_proxy_url(
253
- self,
254
- *,
255
- country: Optional[str] = None,
256
- state: Optional[str] = None,
257
- city: Optional[str] = None,
258
- session_id: Optional[str] = None,
259
- session_duration: Optional[int] = None,
260
- product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL,
261
- ) -> str:
262
- """
263
- Build a proxy URL with custom targeting options.
264
-
265
- This is a convenience method for creating proxy URLs without
266
- manually constructing a ProxyConfig.
267
-
268
- Args:
269
- country: Target country code (e.g., 'us', 'gb').
270
- state: Target state (e.g., 'california').
271
- city: Target city (e.g., 'seattle').
272
- session_id: Session ID for sticky sessions.
273
- session_duration: Session duration in minutes (1-90).
274
- product: Proxy product type.
275
-
276
- Returns:
277
- The proxy URL string.
278
-
279
- Example:
280
- >>> url = client.build_proxy_url(country="us", city="seattle")
281
- >>> proxies = {"http": url, "https": url}
282
- >>> requests.get("https://example.com", proxies=proxies)
283
- """
284
- config = ProxyConfig(
285
- username=self.scraper_token,
286
- password="",
287
- host=self._proxy_host,
288
- port=self._proxy_port,
289
- product=product,
290
- country=country,
291
- state=state,
292
- city=city,
293
- session_id=session_id,
294
- session_duration=session_duration,
295
- )
296
- return config.build_proxy_url()
297
-
298
- # =========================================================================
299
- # SERP API Methods
300
- # =========================================================================
301
-
302
- def serp_search(
303
- self,
304
- query: str,
305
- *,
306
- engine: Union[Engine, str] = Engine.GOOGLE,
307
- num: int = 10,
308
- country: Optional[str] = None,
309
- language: Optional[str] = None,
310
- search_type: Optional[str] = None,
311
- device: Optional[str] = None,
312
- render_js: Optional[bool] = None,
313
- no_cache: Optional[bool] = None,
314
- output_format: str = "json",
315
- **kwargs: Any,
316
- ) -> Dict[str, Any]:
317
- """
318
- Execute a real-time SERP (Search Engine Results Page) search.
319
-
320
- Args:
321
- query: The search keywords.
322
- engine: Search engine (google, bing, yandex, duckduckgo, baidu).
323
- num: Number of results to retrieve (default: 10).
324
- country: Country code for localized results (e.g., 'us').
325
- language: Language code for interface (e.g., 'en').
326
- search_type: Type of search (images, news, shopping, videos, etc.).
327
- device: Device type ('desktop', 'mobile', 'tablet').
328
- render_js: Enable JavaScript rendering in SERP (render_js=True).
329
- no_cache: Disable internal caching (no_cache=True).
330
- output_format: 'json' to return parsed JSON (default),
331
- 'html' to return HTML wrapped in {'html': ...}.
332
- **kwargs: Additional engine-specific parameters.
333
-
334
- Returns:
335
- Dict[str, Any]: Parsed JSON results or a dict with 'html' key.
336
-
337
- Example:
338
- >>> # Basic search
339
- >>> results = client.serp_search("python tutorial")
340
- >>>
341
- >>> # With options
342
- >>> results = client.serp_search(
343
- ... "laptop reviews",
344
- ... engine="google",
345
- ... num=20,
346
- ... country="us",
347
- ... search_type="shopping",
348
- ... device="mobile",
349
- ... render_js=True,
350
- ... no_cache=True,
351
- ... )
352
- """
353
- # Normalize engine
354
- engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
355
-
356
- # Build request using model
357
- request = SerpRequest(
358
- query=query,
359
- engine=engine_str,
360
- num=num,
361
- country=country,
362
- language=language,
363
- search_type=search_type,
364
- device=device,
365
- render_js=render_js,
366
- no_cache=no_cache,
367
- output_format=output_format,
368
- extra_params=kwargs,
369
- )
370
-
371
- payload = request.to_payload()
372
- headers = build_auth_headers(self.scraper_token)
373
-
374
- logger.info(f"SERP Search: {engine_str} - {query}")
375
-
376
- try:
377
- response = self._api_session.post(
378
- self._serp_url,
379
- data=payload,
380
- headers=headers,
381
- timeout=60,
382
- )
383
- response.raise_for_status()
384
-
385
- # JSON mode (default)
386
- if output_format.lower() == "json":
387
- data = response.json()
388
-
389
- if isinstance(data, dict):
390
- code = data.get("code")
391
- if code is not None and code != 200:
392
- msg = extract_error_message(data)
393
- raise_for_code(
394
- f"SERP API Error: {msg}",
395
- code=code,
396
- payload=data,
397
- )
398
-
399
- return parse_json_response(data)
400
-
401
- # HTML mode: wrap as dict to keep return type stable
402
- return {"html": response.text}
403
-
404
- except requests.Timeout as e:
405
- raise ThordataTimeoutError(
406
- f"SERP request timed out: {e}",
407
- original_error=e,
408
- )
409
- except requests.RequestException as e:
410
- raise ThordataNetworkError(
411
- f"SERP request failed: {e}",
412
- original_error=e,
413
- )
414
-
415
- def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
416
- """
417
- Execute a SERP search using a SerpRequest object.
418
-
419
- This method provides full control over all search parameters.
420
-
421
- Args:
422
- request: A SerpRequest object with all parameters configured.
423
-
424
- Returns:
425
- Dict[str, Any]: Parsed JSON results or dict with 'html' key.
426
-
427
- Example:
428
- >>> from thordata.models import SerpRequest
429
- >>> request = SerpRequest(
430
- ... query="python programming",
431
- ... engine="google",
432
- ... num=50,
433
- ... country="us",
434
- ... language="en",
435
- ... search_type="news",
436
- ... time_filter="week",
437
- ... safe_search=True
438
- ... )
439
- >>> results = client.serp_search_advanced(request)
440
- """
441
- payload = request.to_payload()
442
- headers = build_auth_headers(self.scraper_token)
443
-
444
- logger.info(f"SERP Advanced Search: {request.engine} - {request.query}")
445
-
446
- try:
447
- response = self._api_session.post(
448
- self._serp_url,
449
- data=payload,
450
- headers=headers,
451
- timeout=60,
452
- )
453
- response.raise_for_status()
454
-
455
- if request.output_format.lower() == "json":
456
- data = response.json()
457
-
458
- if isinstance(data, dict):
459
- code = data.get("code")
460
- if code is not None and code != 200:
461
- msg = extract_error_message(data)
462
- raise_for_code(
463
- f"SERP API Error: {msg}",
464
- code=code,
465
- payload=data,
466
- )
467
-
468
- return parse_json_response(data)
469
-
470
- return {"html": response.text}
471
-
472
- except requests.Timeout as e:
473
- raise ThordataTimeoutError(
474
- f"SERP request timed out: {e}",
475
- original_error=e,
476
- )
477
- except requests.RequestException as e:
478
- raise ThordataNetworkError(
479
- f"SERP request failed: {e}",
480
- original_error=e,
481
- )
482
-
483
- # =========================================================================
484
- # Universal Scraping API (Web Unlocker) Methods
485
- # =========================================================================
486
-
487
- def universal_scrape(
488
- self,
489
- url: str,
490
- *,
491
- js_render: bool = False,
492
- output_format: str = "html",
493
- country: Optional[str] = None,
494
- block_resources: Optional[str] = None,
495
- wait: Optional[int] = None,
496
- wait_for: Optional[str] = None,
497
- **kwargs: Any,
498
- ) -> Union[str, bytes]:
499
- """
500
- Scrape a URL using the Universal Scraping API (Web Unlocker).
501
-
502
- Automatically bypasses Cloudflare, CAPTCHAs, and antibot systems.
503
-
504
- Args:
505
- url: Target URL.
506
- js_render: Enable JavaScript rendering (headless browser).
507
- output_format: "html" or "png" (screenshot).
508
- country: Geo-targeting country code.
509
- block_resources: Resources to block (e.g., 'script,image').
510
- wait: Wait time in milliseconds after page load.
511
- wait_for: CSS selector to wait for.
512
- **kwargs: Additional parameters.
513
-
514
- Returns:
515
- HTML string or PNG bytes depending on output_format.
516
-
517
- Example:
518
- >>> # Get HTML
519
- >>> html = client.universal_scrape("https://example.com", js_render=True)
520
- >>>
521
- >>> # Get screenshot
522
- >>> png = client.universal_scrape(
523
- ... "https://example.com",
524
- ... js_render=True,
525
- ... output_format="png"
526
- ... )
527
- >>> with open("screenshot.png", "wb") as f:
528
- ... f.write(png)
529
- """
530
- request = UniversalScrapeRequest(
531
- url=url,
532
- js_render=js_render,
533
- output_format=output_format,
534
- country=country,
535
- block_resources=block_resources,
536
- wait=wait,
537
- wait_for=wait_for,
538
- extra_params=kwargs,
539
- )
540
-
541
- return self.universal_scrape_advanced(request)
542
-
543
- def universal_scrape_advanced(
544
- self, request: UniversalScrapeRequest
545
- ) -> Union[str, bytes]:
546
- """
547
- Scrape using a UniversalScrapeRequest object for full control.
548
-
549
- Args:
550
- request: A UniversalScrapeRequest with all parameters.
551
-
552
- Returns:
553
- HTML string or PNG bytes.
554
- """
555
- payload = request.to_payload()
556
- headers = build_auth_headers(self.scraper_token)
557
-
558
- logger.info(
559
- f"Universal Scrape: {request.url} (format: {request.output_format})"
560
- )
561
-
562
- try:
563
- response = self._api_session.post(
564
- self._universal_url,
565
- data=payload,
566
- headers=headers,
567
- timeout=60,
568
- )
569
- response.raise_for_status()
570
-
571
- return self._process_universal_response(response, request.output_format)
572
-
573
- except requests.Timeout as e:
574
- raise ThordataTimeoutError(
575
- f"Universal scrape timed out: {e}", original_error=e
576
- )
577
- except requests.RequestException as e:
578
- raise ThordataNetworkError(
579
- f"Universal scrape failed: {e}", original_error=e
580
- )
581
-
582
- def _process_universal_response(
583
- self, response: requests.Response, output_format: str
584
- ) -> Union[str, bytes]:
585
- """Process the response from Universal API."""
586
- # Try to parse as JSON
587
- try:
588
- resp_json = response.json()
589
- except ValueError:
590
- # Raw content returned
591
- if output_format.lower() == "png":
592
- return response.content
593
- return response.text
594
-
595
- # Check for API-level errors
596
- if isinstance(resp_json, dict):
597
- code = resp_json.get("code")
598
- if code is not None and code != 200:
599
- msg = extract_error_message(resp_json)
600
- raise_for_code(
601
- f"Universal API Error: {msg}", code=code, payload=resp_json
602
- )
603
-
604
- # Extract HTML
605
- if "html" in resp_json:
606
- return resp_json["html"]
607
-
608
- # Extract PNG
609
- if "png" in resp_json:
610
- return decode_base64_image(resp_json["png"])
611
-
612
- # Fallback
613
- return str(resp_json)
614
-
615
- # =========================================================================
616
- # Web Scraper API (Task-based) Methods
617
- # =========================================================================
618
-
619
- def create_scraper_task(
620
- self,
621
- file_name: str,
622
- spider_id: str,
623
- spider_name: str,
624
- parameters: Dict[str, Any],
625
- universal_params: Optional[Dict[str, Any]] = None,
626
- ) -> str:
627
- """
628
- Create an asynchronous Web Scraper task.
629
-
630
- Note: Get spider_id and spider_name from the Thordata Dashboard.
631
-
632
- Args:
633
- file_name: Name for the output file.
634
- spider_id: Spider identifier from Dashboard.
635
- spider_name: Spider name (e.g., "youtube.com").
636
- parameters: Spider-specific parameters.
637
- universal_params: Global spider settings.
638
-
639
- Returns:
640
- The created task_id.
641
-
642
- Example:
643
- >>> task_id = client.create_scraper_task(
644
- ... file_name="youtube_data",
645
- ... spider_id="youtube_video-post_by-url",
646
- ... spider_name="youtube.com",
647
- ... parameters={"url": "https://youtube.com/@channel/videos"}
648
- ... )
649
- """
650
- config = ScraperTaskConfig(
651
- file_name=file_name,
652
- spider_id=spider_id,
653
- spider_name=spider_name,
654
- parameters=parameters,
655
- universal_params=universal_params,
656
- )
657
-
658
- return self.create_scraper_task_advanced(config)
659
-
660
- def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
661
- """
662
- Create a scraper task using a ScraperTaskConfig object.
663
-
664
- Args:
665
- config: Task configuration.
666
-
667
- Returns:
668
- The created task_id.
669
- """
670
- payload = config.to_payload()
671
- headers = build_auth_headers(self.scraper_token)
672
-
673
- logger.info(f"Creating Scraper Task: {config.spider_name}")
674
-
675
- try:
676
- response = self._api_session.post(
677
- self._builder_url,
678
- data=payload,
679
- headers=headers,
680
- timeout=30,
681
- )
682
- response.raise_for_status()
683
-
684
- data = response.json()
685
- code = data.get("code")
686
-
687
- if code != 200:
688
- msg = extract_error_message(data)
689
- raise_for_code(f"Task creation failed: {msg}", code=code, payload=data)
690
-
691
- return data["data"]["task_id"]
692
-
693
- except requests.RequestException as e:
694
- raise ThordataNetworkError(f"Task creation failed: {e}", original_error=e)
695
-
696
- def get_task_status(self, task_id: str) -> str:
697
- """
698
- Check the status of an asynchronous scraping task.
699
-
700
- Args:
701
- task_id: The task ID from create_scraper_task.
702
-
703
- Returns:
704
- Status string (e.g., "running", "ready", "failed").
705
- """
706
- self._require_public_credentials()
707
-
708
- headers = build_public_api_headers(
709
- self.public_token or "", self.public_key or ""
710
- )
711
- payload = {"tasks_ids": task_id}
712
-
713
- try:
714
- response = self._api_session.post(
715
- self._status_url,
716
- data=payload,
717
- headers=headers,
718
- timeout=30,
719
- )
720
- response.raise_for_status()
721
-
722
- data = response.json()
723
-
724
- if data.get("code") == 200 and data.get("data"):
725
- for item in data["data"]:
726
- if str(item.get("task_id")) == str(task_id):
727
- return item.get("status", "unknown")
728
-
729
- return "unknown"
730
-
731
- except Exception as e:
732
- logger.error(f"Status check failed: {e}")
733
- return "error"
734
-
735
- def get_task_result(self, task_id: str, file_type: str = "json") -> str:
736
- """
737
- Get the download URL for a completed task.
738
- """
739
- self._require_public_credentials()
740
-
741
- headers = build_public_api_headers(
742
- self.public_token or "", self.public_key or ""
743
- )
744
- payload = {"tasks_id": task_id, "type": file_type}
745
-
746
- logger.info(f"Getting result URL for Task: {task_id}")
747
-
748
- try:
749
- response = self._api_session.post(
750
- self._download_url,
751
- data=payload,
752
- headers=headers,
753
- timeout=30,
754
- )
755
- response.raise_for_status()
756
-
757
- data = response.json()
758
- code = data.get("code")
759
-
760
- if code == 200 and data.get("data"):
761
- return data["data"]["download"]
762
-
763
- msg = extract_error_message(data)
764
- raise_for_code(f"Get result failed: {msg}", code=code, payload=data)
765
- # This line won't be reached, but satisfies mypy
766
- raise RuntimeError("Unexpected state")
767
-
768
- except requests.RequestException as e:
769
- raise ThordataNetworkError(f"Get result failed: {e}", original_error=e)
770
-
771
- def wait_for_task(
772
- self,
773
- task_id: str,
774
- *,
775
- poll_interval: float = 5.0,
776
- max_wait: float = 600.0,
777
- ) -> str:
778
- """
779
- Wait for a task to complete.
780
-
781
- Args:
782
- task_id: The task ID to wait for.
783
- poll_interval: Seconds between status checks.
784
- max_wait: Maximum seconds to wait.
785
-
786
- Returns:
787
- Final task status.
788
-
789
- Raises:
790
- TimeoutError: If max_wait is exceeded.
791
-
792
- Example:
793
- >>> task_id = client.create_scraper_task(...)
794
- >>> status = client.wait_for_task(task_id, max_wait=300)
795
- >>> if status in ("ready", "success"):
796
- ... url = client.get_task_result(task_id)
797
- """
798
- import time
799
-
800
- elapsed = 0.0
801
-
802
- while elapsed < max_wait:
803
- status = self.get_task_status(task_id)
804
-
805
- logger.debug(f"Task {task_id} status: {status}")
806
-
807
- terminal_statuses = {
808
- "ready",
809
- "success",
810
- "finished",
811
- "failed",
812
- "error",
813
- "cancelled",
814
- }
815
-
816
- if status.lower() in terminal_statuses:
817
- return status
818
-
819
- time.sleep(poll_interval)
820
- elapsed += poll_interval
821
-
822
- raise TimeoutError(f"Task {task_id} did not complete within {max_wait} seconds")
823
-
824
- # =========================================================================
825
- # Location API Methods
826
- # =========================================================================
827
-
828
- def list_countries(
829
- self, proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
830
- ) -> List[Dict[str, Any]]:
831
- """
832
- List supported countries for proxies.
833
-
834
- Args:
835
- proxy_type: 1 for residential, 2 for unlimited.
836
-
837
- Returns:
838
- List of country records with 'country_code' and 'country_name'.
839
- """
840
- return self._get_locations(
841
- "countries",
842
- proxy_type=(
843
- int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
844
- ),
845
- )
846
-
847
- def list_states(
848
- self,
849
- country_code: str,
850
- proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
851
- ) -> List[Dict[str, Any]]:
852
- """
853
- List supported states for a country.
854
-
855
- Args:
856
- country_code: Country code (e.g., 'US').
857
- proxy_type: Proxy type.
858
-
859
- Returns:
860
- List of state records.
861
- """
862
- return self._get_locations(
863
- "states",
864
- proxy_type=(
865
- int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
866
- ),
867
- country_code=country_code,
868
- )
869
-
870
- def list_cities(
871
- self,
872
- country_code: str,
873
- state_code: Optional[str] = None,
874
- proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
875
- ) -> List[Dict[str, Any]]:
876
- """
877
- List supported cities for a country/state.
878
-
879
- Args:
880
- country_code: Country code.
881
- state_code: Optional state code.
882
- proxy_type: Proxy type.
883
-
884
- Returns:
885
- List of city records.
886
- """
887
- kwargs = {
888
- "proxy_type": (
889
- int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
890
- ),
891
- "country_code": country_code,
892
- }
893
- if state_code:
894
- kwargs["state_code"] = state_code
895
-
896
- return self._get_locations("cities", **kwargs)
897
-
898
- def list_asn(
899
- self,
900
- country_code: str,
901
- proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
902
- ) -> List[Dict[str, Any]]:
903
- """
904
- List supported ASNs for a country.
905
-
906
- Args:
907
- country_code: Country code.
908
- proxy_type: Proxy type.
909
-
910
- Returns:
911
- List of ASN records.
912
- """
913
- return self._get_locations(
914
- "asn",
915
- proxy_type=(
916
- int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
917
- ),
918
- country_code=country_code,
919
- )
920
-
921
- def _get_locations(self, endpoint: str, **kwargs: Any) -> List[Dict[str, Any]]:
922
- """Internal method to call locations API."""
923
- self._require_public_credentials()
924
-
925
- params = {
926
- "token": self.public_token,
927
- "key": self.public_key,
928
- }
929
-
930
- for key, value in kwargs.items():
931
- params[key] = str(value)
932
-
933
- url = f"{self._locations_base_url}/{endpoint}"
934
-
935
- logger.debug(f"Locations API request: {url}")
936
-
937
-        # Use the API session directly (no Thordata proxy is applied for this API)
938
- response = self._api_session.get(url, params=params, timeout=30)
939
- response.raise_for_status()
940
-
941
- data = response.json()
942
-
943
- if isinstance(data, dict):
944
- code = data.get("code")
945
- if code is not None and code != 200:
946
- msg = data.get("msg", "")
947
- raise RuntimeError(
948
- f"Locations API error ({endpoint}): code={code}, msg={msg}"
949
- )
950
- return data.get("data") or []
951
-
952
- if isinstance(data, list):
953
- return data
954
-
955
- return []
956
-
957
- # =========================================================================
958
- # Helper Methods
959
- # =========================================================================
960
-
961
- def _require_public_credentials(self) -> None:
962
- """Ensure public API credentials are available."""
963
- if not self.public_token or not self.public_key:
964
- raise ThordataConfigError(
965
- "public_token and public_key are required for this operation. "
966
- "Please provide them when initializing ThordataClient."
967
- )
968
-
969
- def _request_with_retry(
970
- self, method: str, url: str, **kwargs: Any
971
- ) -> requests.Response:
972
- """Make a request with automatic retry."""
973
- kwargs.setdefault("timeout", self._default_timeout)
974
-
975
- @with_retry(self._retry_config)
976
- def _do_request() -> requests.Response:
977
- return self._proxy_session.request(method, url, **kwargs)
978
-
979
- try:
980
- return _do_request()
981
- except requests.Timeout as e:
982
- raise ThordataTimeoutError(f"Request timed out: {e}", original_error=e)
983
- except requests.RequestException as e:
984
- raise ThordataNetworkError(f"Request failed: {e}", original_error=e)
985
-
986
- def close(self) -> None:
987
- """Close the underlying session."""
988
- self._proxy_session.close()
989
- self._api_session.close()
990
-
991
- def __enter__(self) -> ThordataClient:
992
- return self
993
-
994
- def __exit__(self, exc_type, exc_val, exc_tb) -> None:
995
- self.close()
1
+ """
2
+ Synchronous client for the Thordata API.
3
+
4
+ This module provides the main ThordataClient class for interacting with
5
+ Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
6
+
7
+ Example:
8
+ >>> from thordata import ThordataClient
9
+ >>>
10
+ >>> client = ThordataClient(
11
+ ... scraper_token="your_token",
12
+ ... public_token="your_public_token",
13
+ ... public_key="your_public_key"
14
+ ... )
15
+ >>>
16
+ >>> # Use the proxy network
17
+ >>> response = client.get("https://httpbin.org/ip")
18
+ >>> print(response.json())
19
+ >>>
20
+ >>> # Search with SERP API
21
+ >>> results = client.serp_search("python tutorial", engine="google")
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ import os
28
+ from typing import Any, Dict, List, Optional, Union
29
+
30
+ import requests
31
+
32
+ from . import __version__ as _sdk_version
33
+ from ._utils import (
34
+ build_auth_headers,
35
+ build_public_api_headers,
36
+ build_user_agent,
37
+ decode_base64_image,
38
+ extract_error_message,
39
+ parse_json_response,
40
+ )
41
+ from .enums import Engine, ProxyType
42
+ from .exceptions import (
43
+ ThordataConfigError,
44
+ ThordataNetworkError,
45
+ ThordataTimeoutError,
46
+ raise_for_code,
47
+ )
48
+ from .models import (
49
+ ProxyConfig,
50
+ ProxyProduct,
51
+ ScraperTaskConfig,
52
+ SerpRequest,
53
+ UniversalScrapeRequest,
54
+ )
55
+ from .retry import RetryConfig, with_retry
56
+
57
+ logger = logging.getLogger(__name__)
58
+
59
+
60
+ class ThordataClient:
61
+ """
62
+ The official synchronous Python client for Thordata.
63
+
64
+ This client handles authentication and communication with:
65
+ - Proxy Network (Residential/Datacenter/Mobile/ISP via HTTP/HTTPS)
66
+ - SERP API (Real-time Search Engine Results)
67
+ - Universal Scraping API (Web Unlocker - Single Page Rendering)
68
+ - Web Scraper API (Async Task Management)
69
+
70
+ Args:
71
+ scraper_token: The API token from your Dashboard.
72
+ public_token: The public API token (for task status, locations).
73
+ public_key: The public API key.
74
+ proxy_host: Custom proxy gateway host (optional).
75
+ proxy_port: Custom proxy gateway port (optional).
76
+ timeout: Default request timeout in seconds (default: 30).
77
+ retry_config: Configuration for automatic retries (optional).
78
+
79
+ Example:
80
+ >>> client = ThordataClient(
81
+ ... scraper_token="your_scraper_token",
82
+ ... public_token="your_public_token",
83
+ ... public_key="your_public_key"
84
+ ... )
85
+ """
86
+
87
+ # API Endpoints
88
+ BASE_URL = "https://scraperapi.thordata.com"
89
+ UNIVERSAL_URL = "https://universalapi.thordata.com"
90
+ API_URL = "https://api.thordata.com/api/web-scraper-api"
91
+ LOCATIONS_URL = "https://api.thordata.com/api/locations"
92
+
93
+ def __init__(
94
+ self,
95
+ scraper_token: str,
96
+ public_token: Optional[str] = None,
97
+ public_key: Optional[str] = None,
98
+ proxy_host: str = "pr.thordata.net",
99
+ proxy_port: int = 9999,
100
+ timeout: int = 30,
101
+ retry_config: Optional[RetryConfig] = None,
102
+ scraperapi_base_url: Optional[str] = None,
103
+ universalapi_base_url: Optional[str] = None,
104
+ web_scraper_api_base_url: Optional[str] = None,
105
+ locations_base_url: Optional[str] = None,
106
+ ) -> None:
107
+ """Initialize the Thordata Client."""
108
+ if not scraper_token:
109
+ raise ThordataConfigError("scraper_token is required")
110
+
111
+ self.scraper_token = scraper_token
112
+ self.public_token = public_token
113
+ self.public_key = public_key
114
+
115
+ # Proxy configuration
116
+ self._proxy_host = proxy_host
117
+ self._proxy_port = proxy_port
118
+ self._default_timeout = timeout
119
+
120
+ # Retry configuration
121
+ self._retry_config = retry_config or RetryConfig()
122
+
123
+ # Build default proxy URL (for basic usage)
124
+ self._default_proxy_url = (
125
+ f"http://td-customer-{self.scraper_token}:@{proxy_host}:{proxy_port}"
126
+ )
127
+
128
+ # Sessions:
129
+ # - _proxy_session: used for proxy network traffic to target sites
130
+ # - _api_session: used for Thordata APIs (SERP/Universal/Tasks/Locations)
131
+ #
132
+ # We intentionally do NOT set session-level proxies for _api_session,
133
+ # so developers can rely on system proxy settings (e.g., Clash) via env vars.
134
+ self._proxy_session = requests.Session()
135
+ self._proxy_session.trust_env = False
136
+ self._proxy_session.proxies = {
137
+ "http": self._default_proxy_url,
138
+ "https": self._default_proxy_url,
139
+ }
140
+
141
+ self._api_session = requests.Session()
142
+ self._api_session.trust_env = True
143
+
144
+ self._api_session.headers.update(
145
+ {"User-Agent": build_user_agent(_sdk_version, "requests")}
146
+ )
147
+
148
+ # Base URLs (allow override via args or env vars for testing and custom routing)
149
+ scraperapi_base = (
150
+ scraperapi_base_url
151
+ or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
152
+ or self.BASE_URL
153
+ ).rstrip("/")
154
+
155
+ universalapi_base = (
156
+ universalapi_base_url
157
+ or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
158
+ or self.UNIVERSAL_URL
159
+ ).rstrip("/")
160
+
161
+ web_scraper_api_base = (
162
+ web_scraper_api_base_url
163
+ or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
164
+ or self.API_URL
165
+ ).rstrip("/")
166
+
167
+ locations_base = (
168
+ locations_base_url
169
+ or os.getenv("THORDATA_LOCATIONS_BASE_URL")
170
+ or self.LOCATIONS_URL
171
+ ).rstrip("/")
172
+
173
+ self._serp_url = f"{scraperapi_base}/request"
174
+ self._builder_url = f"{scraperapi_base}/builder"
175
+ self._universal_url = f"{universalapi_base}/request"
176
+ self._status_url = f"{web_scraper_api_base}/tasks-status"
177
+ self._download_url = f"{web_scraper_api_base}/tasks-download"
178
+ self._locations_base_url = locations_base
179
+
180
+ # =========================================================================
181
+ # Proxy Network Methods
182
+ # =========================================================================
183
+
184
+ def get(
185
+ self,
186
+ url: str,
187
+ *,
188
+ proxy_config: Optional[ProxyConfig] = None,
189
+ timeout: Optional[int] = None,
190
+ **kwargs: Any,
191
+ ) -> requests.Response:
192
+ """
193
+ Send a GET request through the Thordata Proxy Network.
194
+
195
+ Args:
196
+ url: The target URL.
197
+ proxy_config: Custom proxy configuration for geo-targeting/sessions.
198
+ timeout: Request timeout in seconds.
199
+ **kwargs: Additional arguments to pass to requests.get().
200
+
201
+ Returns:
202
+ The response object.
203
+
204
+ Example:
205
+ >>> # Basic request
206
+ >>> response = client.get("https://httpbin.org/ip")
207
+ >>>
208
+ >>> # With geo-targeting
209
+ >>> from thordata.models import ProxyConfig
210
+ >>> config = ProxyConfig(
211
+ ... username="myuser",
212
+ ... password="mypass",
213
+ ... country="us",
214
+ ... city="seattle"
215
+ ... )
216
+ >>> response = client.get("https://httpbin.org/ip", proxy_config=config)
217
+ """
218
+ logger.debug(f"Proxy GET request: {url}")
219
+
220
+ timeout = timeout or self._default_timeout
221
+
222
+ if proxy_config:
223
+ proxies = proxy_config.to_proxies_dict()
224
+ kwargs["proxies"] = proxies
225
+
226
+ return self._request_with_retry("GET", url, timeout=timeout, **kwargs)
227
+
228
+ def post(
229
+ self,
230
+ url: str,
231
+ *,
232
+ proxy_config: Optional[ProxyConfig] = None,
233
+ timeout: Optional[int] = None,
234
+ **kwargs: Any,
235
+ ) -> requests.Response:
236
+ """
237
+ Send a POST request through the Thordata Proxy Network.
238
+
239
+ Args:
240
+ url: The target URL.
241
+ proxy_config: Custom proxy configuration.
242
+ timeout: Request timeout in seconds.
243
+ **kwargs: Additional arguments to pass to requests.post().
244
+
245
+ Returns:
246
+ The response object.
247
+ """
248
+ logger.debug(f"Proxy POST request: {url}")
249
+
250
+ timeout = timeout or self._default_timeout
251
+
252
+ if proxy_config:
253
+ proxies = proxy_config.to_proxies_dict()
254
+ kwargs["proxies"] = proxies
255
+
256
+ return self._request_with_retry("POST", url, timeout=timeout, **kwargs)
257
+
258
+ def build_proxy_url(
259
+ self,
260
+ *,
261
+ country: Optional[str] = None,
262
+ state: Optional[str] = None,
263
+ city: Optional[str] = None,
264
+ session_id: Optional[str] = None,
265
+ session_duration: Optional[int] = None,
266
+ product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL,
267
+ ) -> str:
268
+ """
269
+ Build a proxy URL with custom targeting options.
270
+
271
+ This is a convenience method for creating proxy URLs without
272
+ manually constructing a ProxyConfig.
273
+
274
+ Args:
275
+ country: Target country code (e.g., 'us', 'gb').
276
+ state: Target state (e.g., 'california').
277
+ city: Target city (e.g., 'seattle').
278
+ session_id: Session ID for sticky sessions.
279
+ session_duration: Session duration in minutes (1-90).
280
+ product: Proxy product type.
281
+
282
+ Returns:
283
+ The proxy URL string.
284
+
285
+ Example:
286
+ >>> url = client.build_proxy_url(country="us", city="seattle")
287
+ >>> proxies = {"http": url, "https": url}
288
+ >>> requests.get("https://example.com", proxies=proxies)
289
+ """
290
+ config = ProxyConfig(
291
+ username=self.scraper_token,
292
+ password="",
293
+ host=self._proxy_host,
294
+ port=self._proxy_port,
295
+ product=product,
296
+ country=country,
297
+ state=state,
298
+ city=city,
299
+ session_id=session_id,
300
+ session_duration=session_duration,
301
+ )
302
+ return config.build_proxy_url()
303
+
304
+ # =========================================================================
305
+ # SERP API Methods
306
+ # =========================================================================
307
+
308
+ def serp_search(
309
+ self,
310
+ query: str,
311
+ *,
312
+ engine: Union[Engine, str] = Engine.GOOGLE,
313
+ num: int = 10,
314
+ country: Optional[str] = None,
315
+ language: Optional[str] = None,
316
+ search_type: Optional[str] = None,
317
+ device: Optional[str] = None,
318
+ render_js: Optional[bool] = None,
319
+ no_cache: Optional[bool] = None,
320
+ output_format: str = "json",
321
+ **kwargs: Any,
322
+ ) -> Dict[str, Any]:
323
+ """
324
+ Execute a real-time SERP (Search Engine Results Page) search.
325
+
326
+ Args:
327
+ query: The search keywords.
328
+ engine: Search engine (google, bing, yandex, duckduckgo, baidu).
329
+ num: Number of results to retrieve (default: 10).
330
+ country: Country code for localized results (e.g., 'us').
331
+ language: Language code for interface (e.g., 'en').
332
+ search_type: Type of search (images, news, shopping, videos, etc.).
333
+ device: Device type ('desktop', 'mobile', 'tablet').
334
+ render_js: Enable JavaScript rendering in SERP (render_js=True).
335
+ no_cache: Disable internal caching (no_cache=True).
336
+ output_format: 'json' to return parsed JSON (default),
337
+ 'html' to return HTML wrapped in {'html': ...}.
338
+ **kwargs: Additional engine-specific parameters.
339
+
340
+ Returns:
341
+ Dict[str, Any]: Parsed JSON results or a dict with 'html' key.
342
+
343
+ Example:
344
+ >>> # Basic search
345
+ >>> results = client.serp_search("python tutorial")
346
+ >>>
347
+ >>> # With options
348
+ >>> results = client.serp_search(
349
+ ... "laptop reviews",
350
+ ... engine="google",
351
+ ... num=20,
352
+ ... country="us",
353
+ ... search_type="shopping",
354
+ ... device="mobile",
355
+ ... render_js=True,
356
+ ... no_cache=True,
357
+ ... )
358
+ """
359
+ # Normalize engine
360
+ engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
361
+
362
+ # Build request using model
363
+ request = SerpRequest(
364
+ query=query,
365
+ engine=engine_str,
366
+ num=num,
367
+ country=country,
368
+ language=language,
369
+ search_type=search_type,
370
+ device=device,
371
+ render_js=render_js,
372
+ no_cache=no_cache,
373
+ output_format=output_format,
374
+ extra_params=kwargs,
375
+ )
376
+
377
+ payload = request.to_payload()
378
+ headers = build_auth_headers(self.scraper_token)
379
+
380
+ logger.info(f"SERP Search: {engine_str} - {query}")
381
+
382
+ try:
383
+ response = self._api_session.post(
384
+ self._serp_url,
385
+ data=payload,
386
+ headers=headers,
387
+ timeout=60,
388
+ )
389
+ response.raise_for_status()
390
+
391
+ # JSON mode (default)
392
+ if output_format.lower() == "json":
393
+ data = response.json()
394
+
395
+ if isinstance(data, dict):
396
+ code = data.get("code")
397
+ if code is not None and code != 200:
398
+ msg = extract_error_message(data)
399
+ raise_for_code(
400
+ f"SERP API Error: {msg}",
401
+ code=code,
402
+ payload=data,
403
+ )
404
+
405
+ return parse_json_response(data)
406
+
407
+ # HTML mode: wrap as dict to keep return type stable
408
+ return {"html": response.text}
409
+
410
+ except requests.Timeout as e:
411
+ raise ThordataTimeoutError(
412
+ f"SERP request timed out: {e}",
413
+ original_error=e,
414
+ ) from e
415
+ except requests.RequestException as e:
416
+ raise ThordataNetworkError(
417
+ f"SERP request failed: {e}",
418
+ original_error=e,
419
+ ) from e
420
+
421
+ def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
422
+ """
423
+ Execute a SERP search using a SerpRequest object.
424
+
425
+ This method provides full control over all search parameters.
426
+
427
+ Args:
428
+ request: A SerpRequest object with all parameters configured.
429
+
430
+ Returns:
431
+ Dict[str, Any]: Parsed JSON results or dict with 'html' key.
432
+
433
+ Example:
434
+ >>> from thordata.models import SerpRequest
435
+ >>> request = SerpRequest(
436
+ ... query="python programming",
437
+ ... engine="google",
438
+ ... num=50,
439
+ ... country="us",
440
+ ... language="en",
441
+ ... search_type="news",
442
+ ... time_filter="week",
443
+ ... safe_search=True
444
+ ... )
445
+ >>> results = client.serp_search_advanced(request)
446
+ """
447
+ payload = request.to_payload()
448
+ headers = build_auth_headers(self.scraper_token)
449
+
450
+ logger.info(f"SERP Advanced Search: {request.engine} - {request.query}")
451
+
452
+ try:
453
+ response = self._api_session.post(
454
+ self._serp_url,
455
+ data=payload,
456
+ headers=headers,
457
+ timeout=60,
458
+ )
459
+ response.raise_for_status()
460
+
461
+ if request.output_format.lower() == "json":
462
+ data = response.json()
463
+
464
+ if isinstance(data, dict):
465
+ code = data.get("code")
466
+ if code is not None and code != 200:
467
+ msg = extract_error_message(data)
468
+ raise_for_code(
469
+ f"SERP API Error: {msg}",
470
+ code=code,
471
+ payload=data,
472
+ )
473
+
474
+ return parse_json_response(data)
475
+
476
+ return {"html": response.text}
477
+
478
+ except requests.Timeout as e:
479
+ raise ThordataTimeoutError(
480
+ f"SERP request timed out: {e}",
481
+ original_error=e,
482
+ ) from e
483
+ except requests.RequestException as e:
484
+ raise ThordataNetworkError(
485
+ f"SERP request failed: {e}",
486
+ original_error=e,
487
+ ) from e
488
+
489
+ # =========================================================================
490
+ # Universal Scraping API (Web Unlocker) Methods
491
+ # =========================================================================
492
+
493
+ def universal_scrape(
494
+ self,
495
+ url: str,
496
+ *,
497
+ js_render: bool = False,
498
+ output_format: str = "html",
499
+ country: Optional[str] = None,
500
+ block_resources: Optional[str] = None,
501
+ wait: Optional[int] = None,
502
+ wait_for: Optional[str] = None,
503
+ **kwargs: Any,
504
+ ) -> Union[str, bytes]:
505
+ """
506
+ Scrape a URL using the Universal Scraping API (Web Unlocker).
507
+
508
+ Automatically bypasses Cloudflare, CAPTCHAs, and antibot systems.
509
+
510
+ Args:
511
+ url: Target URL.
512
+ js_render: Enable JavaScript rendering (headless browser).
513
+ output_format: "html" or "png" (screenshot).
514
+ country: Geo-targeting country code.
515
+ block_resources: Resources to block (e.g., 'script,image').
516
+ wait: Wait time in milliseconds after page load.
517
+ wait_for: CSS selector to wait for.
518
+ **kwargs: Additional parameters.
519
+
520
+ Returns:
521
+ HTML string or PNG bytes depending on output_format.
522
+
523
+ Example:
524
+ >>> # Get HTML
525
+ >>> html = client.universal_scrape("https://example.com", js_render=True)
526
+ >>>
527
+ >>> # Get screenshot
528
+ >>> png = client.universal_scrape(
529
+ ... "https://example.com",
530
+ ... js_render=True,
531
+ ... output_format="png"
532
+ ... )
533
+ >>> with open("screenshot.png", "wb") as f:
534
+ ... f.write(png)
535
+ """
536
+ request = UniversalScrapeRequest(
537
+ url=url,
538
+ js_render=js_render,
539
+ output_format=output_format,
540
+ country=country,
541
+ block_resources=block_resources,
542
+ wait=wait,
543
+ wait_for=wait_for,
544
+ extra_params=kwargs,
545
+ )
546
+
547
+ return self.universal_scrape_advanced(request)
548
+
549
+ def universal_scrape_advanced(
550
+ self, request: UniversalScrapeRequest
551
+ ) -> Union[str, bytes]:
552
+ """
553
+ Scrape using a UniversalScrapeRequest object for full control.
554
+
555
+ Args:
556
+ request: A UniversalScrapeRequest with all parameters.
557
+
558
+ Returns:
559
+ HTML string or PNG bytes.
560
+ """
561
+ payload = request.to_payload()
562
+ headers = build_auth_headers(self.scraper_token)
563
+
564
+ logger.info(
565
+ f"Universal Scrape: {request.url} (format: {request.output_format})"
566
+ )
567
+
568
+ try:
569
+ response = self._api_session.post(
570
+ self._universal_url,
571
+ data=payload,
572
+ headers=headers,
573
+ timeout=60,
574
+ )
575
+ response.raise_for_status()
576
+
577
+ return self._process_universal_response(response, request.output_format)
578
+
579
+ except requests.Timeout as e:
580
+ raise ThordataTimeoutError(
581
+ f"Universal scrape timed out: {e}", original_error=e
582
+ ) from e
583
+ except requests.RequestException as e:
584
+ raise ThordataNetworkError(
585
+ f"Universal scrape failed: {e}", original_error=e
586
+ ) from e
587
+
588
+ def _process_universal_response(
589
+ self, response: requests.Response, output_format: str
590
+ ) -> Union[str, bytes]:
591
+ """Process the response from Universal API."""
592
+ # Try to parse as JSON
593
+ try:
594
+ resp_json = response.json()
595
+ except ValueError:
596
+ # Raw content returned
597
+ if output_format.lower() == "png":
598
+ return response.content
599
+ return response.text
600
+
601
+ # Check for API-level errors
602
+ if isinstance(resp_json, dict):
603
+ code = resp_json.get("code")
604
+ if code is not None and code != 200:
605
+ msg = extract_error_message(resp_json)
606
+ raise_for_code(
607
+ f"Universal API Error: {msg}", code=code, payload=resp_json
608
+ )
609
+
610
+ # Extract HTML
611
+ if "html" in resp_json:
612
+ return resp_json["html"]
613
+
614
+ # Extract PNG
615
+ if "png" in resp_json:
616
+ return decode_base64_image(resp_json["png"])
617
+
618
+ # Fallback
619
+ return str(resp_json)
620
+
621
+ # =========================================================================
622
+ # Web Scraper API (Task-based) Methods
623
+ # =========================================================================
624
+
625
+ def create_scraper_task(
626
+ self,
627
+ file_name: str,
628
+ spider_id: str,
629
+ spider_name: str,
630
+ parameters: Dict[str, Any],
631
+ universal_params: Optional[Dict[str, Any]] = None,
632
+ ) -> str:
633
+ """
634
+ Create an asynchronous Web Scraper task.
635
+
636
+ Note: Get spider_id and spider_name from the Thordata Dashboard.
637
+
638
+ Args:
639
+ file_name: Name for the output file.
640
+ spider_id: Spider identifier from Dashboard.
641
+ spider_name: Spider name (e.g., "youtube.com").
642
+ parameters: Spider-specific parameters.
643
+ universal_params: Global spider settings.
644
+
645
+ Returns:
646
+ The created task_id.
647
+
648
+ Example:
649
+ >>> task_id = client.create_scraper_task(
650
+ ... file_name="youtube_data",
651
+ ... spider_id="youtube_video-post_by-url",
652
+ ... spider_name="youtube.com",
653
+ ... parameters={"url": "https://youtube.com/@channel/videos"}
654
+ ... )
655
+ """
656
+ config = ScraperTaskConfig(
657
+ file_name=file_name,
658
+ spider_id=spider_id,
659
+ spider_name=spider_name,
660
+ parameters=parameters,
661
+ universal_params=universal_params,
662
+ )
663
+
664
+ return self.create_scraper_task_advanced(config)
665
+
666
+ def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
667
+ """
668
+ Create a scraper task using a ScraperTaskConfig object.
669
+
670
+ Args:
671
+ config: Task configuration.
672
+
673
+ Returns:
674
+ The created task_id.
675
+ """
676
+ payload = config.to_payload()
677
+ headers = build_auth_headers(self.scraper_token)
678
+
679
+ logger.info(f"Creating Scraper Task: {config.spider_name}")
680
+
681
+ try:
682
+ response = self._api_session.post(
683
+ self._builder_url,
684
+ data=payload,
685
+ headers=headers,
686
+ timeout=30,
687
+ )
688
+ response.raise_for_status()
689
+
690
+ data = response.json()
691
+ code = data.get("code")
692
+
693
+ if code != 200:
694
+ msg = extract_error_message(data)
695
+ raise_for_code(f"Task creation failed: {msg}", code=code, payload=data)
696
+
697
+ return data["data"]["task_id"]
698
+
699
+ except requests.RequestException as e:
700
+ raise ThordataNetworkError(
701
+ f"Task creation failed: {e}", original_error=e
702
+ ) from e
703
+
704
+ def get_task_status(self, task_id: str) -> str:
705
+ """
706
+ Check the status of an asynchronous scraping task.
707
+
708
+ Returns:
709
+ Status string (e.g., "running", "ready", "failed").
710
+
711
+ Raises:
712
+ ThordataConfigError: If public credentials are missing.
713
+ ThordataAPIError: If API returns a non-200 code in JSON payload.
714
+ ThordataNetworkError: If network/HTTP request fails.
715
+ """
716
+ self._require_public_credentials()
717
+
718
+ headers = build_public_api_headers(
719
+ self.public_token or "", self.public_key or ""
720
+ )
721
+ payload = {"tasks_ids": task_id}
722
+
723
+ try:
724
+ response = self._api_session.post(
725
+ self._status_url,
726
+ data=payload,
727
+ headers=headers,
728
+ timeout=30,
729
+ )
730
+ response.raise_for_status()
731
+ data = response.json()
732
+
733
+ if isinstance(data, dict):
734
+ code = data.get("code")
735
+ if code is not None and code != 200:
736
+ msg = extract_error_message(data)
737
+ raise_for_code(
738
+ f"Task status API Error: {msg}",
739
+ code=code,
740
+ payload=data,
741
+ )
742
+
743
+ items = data.get("data") or []
744
+ for item in items:
745
+ if str(item.get("task_id")) == str(task_id):
746
+ return item.get("status", "unknown")
747
+
748
+ return "unknown"
749
+
750
+ # Unexpected payload type
751
+ raise ThordataNetworkError(
752
+ f"Unexpected task status response type: {type(data).__name__}",
753
+ original_error=None,
754
+ )
755
+
756
+ except requests.Timeout as e:
757
+ raise ThordataTimeoutError(
758
+ f"Status check timed out: {e}", original_error=e
759
+ ) from e
760
+ except requests.RequestException as e:
761
+ raise ThordataNetworkError(
762
+ f"Status check failed: {e}", original_error=e
763
+ ) from e
764
+
765
+ def safe_get_task_status(self, task_id: str) -> str:
766
+ """
767
+ Backward-compatible status check.
768
+
769
+ Returns:
770
+ Status string, or "error" on any exception.
771
+ """
772
+ try:
773
+ return self.get_task_status(task_id)
774
+ except Exception:
775
+ return "error"
776
+
777
+ def get_task_result(self, task_id: str, file_type: str = "json") -> str:
778
+ """
779
+ Get the download URL for a completed task.
780
+ """
781
+ self._require_public_credentials()
782
+
783
+ headers = build_public_api_headers(
784
+ self.public_token or "", self.public_key or ""
785
+ )
786
+ payload = {"tasks_id": task_id, "type": file_type}
787
+
788
+ logger.info(f"Getting result URL for Task: {task_id}")
789
+
790
+ try:
791
+ response = self._api_session.post(
792
+ self._download_url,
793
+ data=payload,
794
+ headers=headers,
795
+ timeout=30,
796
+ )
797
+ response.raise_for_status()
798
+
799
+ data = response.json()
800
+ code = data.get("code")
801
+
802
+ if code == 200 and data.get("data"):
803
+ return data["data"]["download"]
804
+
805
+ msg = extract_error_message(data)
806
+ raise_for_code(f"Get result failed: {msg}", code=code, payload=data)
807
+ # This line won't be reached, but satisfies mypy
808
+ raise RuntimeError("Unexpected state")
809
+
810
+ except requests.RequestException as e:
811
+ raise ThordataNetworkError(
812
+ f"Get result failed: {e}", original_error=e
813
+ ) from e
814
+
815
+ def wait_for_task(
816
+ self,
817
+ task_id: str,
818
+ *,
819
+ poll_interval: float = 5.0,
820
+ max_wait: float = 600.0,
821
+ ) -> str:
822
+ """
823
+ Wait for a task to complete.
824
+
825
+ Args:
826
+ task_id: The task ID to wait for.
827
+ poll_interval: Seconds between status checks.
828
+ max_wait: Maximum seconds to wait.
829
+
830
+ Returns:
831
+ Final task status.
832
+
833
+ Raises:
834
+ TimeoutError: If max_wait is exceeded.
835
+
836
+ Example:
837
+ >>> task_id = client.create_scraper_task(...)
838
+ >>> status = client.wait_for_task(task_id, max_wait=300)
839
+ >>> if status in ("ready", "success"):
840
+ ... url = client.get_task_result(task_id)
841
+ """
842
+ import time
843
+
844
+ start = time.monotonic()
845
+
846
+ while (time.monotonic() - start) < max_wait:
847
+ status = self.get_task_status(task_id)
848
+
849
+ logger.debug(f"Task {task_id} status: {status}")
850
+
851
+ terminal_statuses = {
852
+ "ready",
853
+ "success",
854
+ "finished",
855
+ "failed",
856
+ "error",
857
+ "cancelled",
858
+ }
859
+
860
+ if status.lower() in terminal_statuses:
861
+ return status
862
+
863
+ time.sleep(poll_interval)
864
+
865
+ raise TimeoutError(f"Task {task_id} did not complete within {max_wait} seconds")
866
+
867
+ # =========================================================================
868
+ # Location API Methods
869
+ # =========================================================================
870
+
871
+ def list_countries(
872
+ self, proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
873
+ ) -> List[Dict[str, Any]]:
874
+ """
875
+ List supported countries for proxies.
876
+
877
+ Args:
878
+ proxy_type: 1 for residential, 2 for unlimited.
879
+
880
+ Returns:
881
+ List of country records with 'country_code' and 'country_name'.
882
+ """
883
+ return self._get_locations(
884
+ "countries",
885
+ proxy_type=(
886
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
887
+ ),
888
+ )
889
+
890
+ def list_states(
891
+ self,
892
+ country_code: str,
893
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
894
+ ) -> List[Dict[str, Any]]:
895
+ """
896
+ List supported states for a country.
897
+
898
+ Args:
899
+ country_code: Country code (e.g., 'US').
900
+ proxy_type: Proxy type.
901
+
902
+ Returns:
903
+ List of state records.
904
+ """
905
+ return self._get_locations(
906
+ "states",
907
+ proxy_type=(
908
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
909
+ ),
910
+ country_code=country_code,
911
+ )
912
+
913
+ def list_cities(
914
+ self,
915
+ country_code: str,
916
+ state_code: Optional[str] = None,
917
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
918
+ ) -> List[Dict[str, Any]]:
919
+ """
920
+ List supported cities for a country/state.
921
+
922
+ Args:
923
+ country_code: Country code.
924
+ state_code: Optional state code.
925
+ proxy_type: Proxy type.
926
+
927
+ Returns:
928
+ List of city records.
929
+ """
930
+ kwargs = {
931
+ "proxy_type": (
932
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
933
+ ),
934
+ "country_code": country_code,
935
+ }
936
+ if state_code:
937
+ kwargs["state_code"] = state_code
938
+
939
+ return self._get_locations("cities", **kwargs)
940
+
941
+ def list_asn(
942
+ self,
943
+ country_code: str,
944
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
945
+ ) -> List[Dict[str, Any]]:
946
+ """
947
+ List supported ASNs for a country.
948
+
949
+ Args:
950
+ country_code: Country code.
951
+ proxy_type: Proxy type.
952
+
953
+ Returns:
954
+ List of ASN records.
955
+ """
956
+ return self._get_locations(
957
+ "asn",
958
+ proxy_type=(
959
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
960
+ ),
961
+ country_code=country_code,
962
+ )
963
+
964
+ def _get_locations(self, endpoint: str, **kwargs: Any) -> List[Dict[str, Any]]:
965
+ """Internal method to call locations API."""
966
+ self._require_public_credentials()
967
+
968
+ params = {
969
+ "token": self.public_token,
970
+ "key": self.public_key,
971
+ }
972
+
973
+ for key, value in kwargs.items():
974
+ params[key] = str(value)
975
+
976
+ url = f"{self._locations_base_url}/{endpoint}"
977
+
978
+ logger.debug(f"Locations API request: {url}")
979
+
980
+        # Use the API session directly (no Thordata proxy is applied for this API)
981
+ response = self._api_session.get(url, params=params, timeout=30)
982
+ response.raise_for_status()
983
+
984
+ data = response.json()
985
+
986
+ if isinstance(data, dict):
987
+ code = data.get("code")
988
+ if code is not None and code != 200:
989
+ msg = data.get("msg", "")
990
+ raise RuntimeError(
991
+ f"Locations API error ({endpoint}): code={code}, msg={msg}"
992
+ )
993
+ return data.get("data") or []
994
+
995
+ if isinstance(data, list):
996
+ return data
997
+
998
+ return []
999
+
1000
+ # =========================================================================
1001
+ # Helper Methods
1002
+ # =========================================================================
1003
+
1004
+ def _require_public_credentials(self) -> None:
1005
+ """Ensure public API credentials are available."""
1006
+ if not self.public_token or not self.public_key:
1007
+ raise ThordataConfigError(
1008
+ "public_token and public_key are required for this operation. "
1009
+ "Please provide them when initializing ThordataClient."
1010
+ )
1011
+
1012
+ def _request_with_retry(
1013
+ self, method: str, url: str, **kwargs: Any
1014
+ ) -> requests.Response:
1015
+ """Make a request with automatic retry."""
1016
+ kwargs.setdefault("timeout", self._default_timeout)
1017
+
1018
+ @with_retry(self._retry_config)
1019
+ def _do_request() -> requests.Response:
1020
+ return self._proxy_session.request(method, url, **kwargs)
1021
+
1022
+ try:
1023
+ return _do_request()
1024
+ except requests.Timeout as e:
1025
+ raise ThordataTimeoutError(
1026
+ f"Request timed out: {e}", original_error=e
1027
+ ) from e
1028
+ except requests.RequestException as e:
1029
+ raise ThordataNetworkError(f"Request failed: {e}", original_error=e) from e
1030
+
1031
+ def close(self) -> None:
1032
+ """Close the underlying session."""
1033
+ self._proxy_session.close()
1034
+ self._api_session.close()
1035
+
1036
+ def __enter__(self) -> ThordataClient:
1037
+ return self
1038
+
1039
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
1040
+ self.close()
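
A short usage sketch follows to illustrate one of the more caller-visible changes in this diff: in 0.7.0, get_task_status() raises typed exceptions (ThordataTimeoutError, ThordataNetworkError, and API-level errors via raise_for_code) instead of logging and returning "error", while the new safe_get_task_status() keeps the 0.5.0 behaviour. This sketch is not part of either released wheel; the tokens and spider parameters are placeholders copied from the docstring examples.

```python
# Hypothetical usage sketch, not part of the published package diff.
from thordata import ThordataClient
from thordata.exceptions import ThordataNetworkError, ThordataTimeoutError

client = ThordataClient(
    scraper_token="your_scraper_token",   # placeholder
    public_token="your_public_token",     # placeholder
    public_key="your_public_key",         # placeholder
)

task_id = client.create_scraper_task(
    file_name="youtube_data",
    spider_id="youtube_video-post_by-url",
    spider_name="youtube.com",
    parameters={"url": "https://youtube.com/@channel/videos"},
)

# 0.7.0 behaviour: failures surface as exceptions and can be handled explicitly.
try:
    status = client.get_task_status(task_id)
except (ThordataNetworkError, ThordataTimeoutError) as exc:
    status = "error"
    print(f"Status check failed: {exc}")

# 0.5.0-compatible behaviour: never raises, returns "error" on any failure.
status = client.safe_get_task_status(task_id)

# wait_for_task() keeps its signature; internally 0.7.0 now tracks elapsed
# time with time.monotonic() rather than summing poll intervals.
final_status = client.wait_for_task(task_id, poll_interval=5.0, max_wait=300.0)
if final_status.lower() in ("ready", "success", "finished"):
    print(client.get_task_result(task_id, file_type="json"))

client.close()
```

Raising chained exceptions (raise ... from e, added throughout this version) lets callers distinguish timeouts from other network failures and preserves the original traceback, which the old "error" string return could not convey.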