thordata-sdk 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/client.py CHANGED
@@ -1,13 +1,57 @@
1
- import requests
1
+ """
2
+ Synchronous client for the Thordata API.
3
+
4
+ This module provides the main ThordataClient class for interacting with
5
+ Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
6
+
7
+ Example:
8
+ >>> from thordata import ThordataClient
9
+ >>>
10
+ >>> client = ThordataClient(
11
+ ... scraper_token="your_token",
12
+ ... public_token="your_public_token",
13
+ ... public_key="your_public_key"
14
+ ... )
15
+ >>>
16
+ >>> # Use the proxy network
17
+ >>> response = client.get("https://httpbin.org/ip")
18
+ >>> print(response.json())
19
+ >>>
20
+ >>> # Search with SERP API
21
+ >>> results = client.serp_search("python tutorial", engine="google")
22
+ """
23
+
24
+ from __future__ import annotations
25
+
2
26
  import logging
3
- import json
4
- import base64
5
- from typing import Dict, Any, Union, Optional, List
27
+ from typing import Any, Dict, List, Optional, Union
28
+
29
+ import os
30
+ import requests
6
31
 
7
- from .enums import Engine
8
- from .parameters import normalize_serp_params
32
+ from ._utils import (
33
+ build_auth_headers,
34
+ build_public_api_headers,
35
+ decode_base64_image,
36
+ extract_error_message,
37
+ parse_json_response,
38
+ )
39
+ from .enums import Engine, ProxyType
40
+ from .exceptions import (
41
+ ThordataConfigError,
42
+ ThordataNetworkError,
43
+ ThordataTimeoutError,
44
+ raise_for_code,
45
+ )
46
+ from .models import (
47
+ ProxyConfig,
48
+ ProxyProduct,
49
+ ScraperTaskConfig,
50
+ SerpRequest,
51
+ UniversalScrapeRequest,
52
+ )
53
+ from .retry import RetryConfig, with_retry
9
54
 
10
- # Configure a library-specific logger to avoid interfering with user's logging
11
55
  logger = logging.getLogger(__name__)
12
56
 
13
57
 
@@ -16,471 +60,936 @@ class ThordataClient:
16
60
  The official synchronous Python client for Thordata.
17
61
 
18
62
  This client handles authentication and communication with:
19
- 1. Proxy Network (Residential/Datacenter via HTTP/HTTPS)
20
- 2. SERP API (Real-time Search Engine Results)
21
- 3. Universal Scraping API (Single Page Rendering & Extraction)
22
- 4. Web Scraper API (Async Task Management for large scale jobs)
63
+ - Proxy Network (Residential/Datacenter/Mobile/ISP via HTTP/HTTPS)
64
+ - SERP API (Real-time Search Engine Results)
65
+ - Universal Scraping API (Web Unlocker - Single Page Rendering)
66
+ - Web Scraper API (Async Task Management)
67
+
68
+ Args:
69
+ scraper_token: The API token from your Dashboard.
70
+ public_token: The public API token (for task status, locations).
71
+ public_key: The public API key.
72
+ proxy_host: Custom proxy gateway host (optional).
73
+ proxy_port: Custom proxy gateway port (optional).
74
+ timeout: Default request timeout in seconds (default: 30).
75
+ retry_config: Configuration for automatic retries (optional).
76
+
77
+ Example:
78
+ >>> client = ThordataClient(
79
+ ... scraper_token="your_scraper_token",
80
+ ... public_token="your_public_token",
81
+ ... public_key="your_public_key"
82
+ ... )
23
83
  """
24
84
 
85
+ # API Endpoints
86
+ BASE_URL = "https://scraperapi.thordata.com"
87
+ UNIVERSAL_URL = "https://universalapi.thordata.com"
88
+ API_URL = "https://api.thordata.com/api/web-scraper-api"
89
+ LOCATIONS_URL = "https://api.thordata.com/api/locations"
90
+
25
91
  def __init__(
26
92
  self,
27
93
  scraper_token: str,
28
- public_token: str,
29
- public_key: str,
30
- proxy_host: str = "gate.thordata.com",
31
- proxy_port: int = 22225
32
- ):
33
- """
34
- Initialize the Thordata Client.
94
+ public_token: Optional[str] = None,
95
+ public_key: Optional[str] = None,
96
+ proxy_host: str = "pr.thordata.net",
97
+ proxy_port: int = 9999,
98
+ timeout: int = 30,
99
+ retry_config: Optional[RetryConfig] = None,
100
+ scraperapi_base_url: Optional[str] = None,
101
+ universalapi_base_url: Optional[str] = None,
102
+ web_scraper_api_base_url: Optional[str] = None,
103
+ locations_base_url: Optional[str] = None,
104
+ ) -> None:
105
+ """Initialize the Thordata Client."""
106
+ if not scraper_token:
107
+ raise ThordataConfigError("scraper_token is required")
35
108
 
36
- Args:
37
- scraper_token (str): The secret token found at the bottom of the Dashboard.
38
- public_token (str): The token from the Public API section.
39
- public_key (str): The key from the Public API section.
40
- proxy_host (str): The proxy gateway host (default: gate.thordata.com).
41
- proxy_port (int): The proxy gateway port (default: 22225).
42
- """
43
109
  self.scraper_token = scraper_token
44
110
  self.public_token = public_token
45
111
  self.public_key = public_key
46
112
 
47
- # Proxy Configuration
48
- self.proxy_url = (
49
- f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
113
+ # Proxy configuration
114
+ self._proxy_host = proxy_host
115
+ self._proxy_port = proxy_port
116
+ self._default_timeout = timeout
117
+
118
+ # Retry configuration
119
+ self._retry_config = retry_config or RetryConfig()
120
+
121
+ # Build default proxy URL (for basic usage)
122
+ self._default_proxy_url = (
123
+ f"http://td-customer-{self.scraper_token}:@{proxy_host}:{proxy_port}"
50
124
  )
51
125
 
52
- # API Endpoints Definition
53
- self.base_url = "https://scraperapi.thordata.com"
54
- self.universal_url = "https://universalapi.thordata.com"
55
- self.api_url = "https://api.thordata.com/api/web-scraper-api"
56
- self.locations_url = "https://api.thordata.com/api/locations"
57
-
58
- self.SERP_API_URL = f"{self.base_url}/request"
59
- self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
60
- self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
61
- self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
62
- self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
63
-
64
- # Initialize Session with Proxy settings
65
- self.session = requests.Session()
66
- self.session.proxies = {
67
- "http": self.proxy_url,
68
- "https": self.proxy_url,
126
+ # Sessions:
127
+ # - _proxy_session: used for proxy network traffic to target sites
128
+ # - _api_session: used for Thordata APIs (SERP/Universal/Tasks/Locations)
129
+ #
130
+ # We intentionally do NOT set session-level proxies for _api_session,
131
+ # so developers can rely on system proxy settings (e.g., Clash) via env vars.
132
+ self._proxy_session = requests.Session()
133
+ self._proxy_session.trust_env = False
134
+ self._proxy_session.proxies = {
135
+ "http": self._default_proxy_url,
136
+ "https": self._default_proxy_url,
69
137
  }
70
138
 
71
- def get(self, url: str, **kwargs) -> requests.Response:
139
+ self._api_session = requests.Session()
140
+ self._api_session.trust_env = True
141
+
142
+ # Base URLs (allow override via args or env vars for testing and custom routing)
143
+ scraperapi_base = (
144
+ scraperapi_base_url
145
+ or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
146
+ or self.BASE_URL
147
+ ).rstrip("/")
148
+
149
+ universalapi_base = (
150
+ universalapi_base_url
151
+ or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
152
+ or self.UNIVERSAL_URL
153
+ ).rstrip("/")
154
+
155
+ web_scraper_api_base = (
156
+ web_scraper_api_base_url
157
+ or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
158
+ or self.API_URL
159
+ ).rstrip("/")
160
+
161
+ locations_base = (
162
+ locations_base_url
163
+ or os.getenv("THORDATA_LOCATIONS_BASE_URL")
164
+ or self.LOCATIONS_URL
165
+ ).rstrip("/")
166
+
167
+ self._serp_url = f"{scraperapi_base}/request"
168
+ self._builder_url = f"{scraperapi_base}/builder"
169
+ self._universal_url = f"{universalapi_base}/request"
170
+ self._status_url = f"{web_scraper_api_base}/tasks-status"
171
+ self._download_url = f"{web_scraper_api_base}/tasks-download"
172
+ self._locations_base_url = locations_base
173
+
174
+ # =========================================================================
175
+ # Proxy Network Methods
176
+ # =========================================================================
177
+
178
+ def get(
179
+ self,
180
+ url: str,
181
+ *,
182
+ proxy_config: Optional[ProxyConfig] = None,
183
+ timeout: Optional[int] = None,
184
+ **kwargs: Any,
185
+ ) -> requests.Response:
186
+ """
187
+ Send a GET request through the Thordata Proxy Network.
188
+
189
+ Args:
190
+ url: The target URL.
191
+ proxy_config: Custom proxy configuration for geo-targeting/sessions.
192
+ timeout: Request timeout in seconds.
193
+ **kwargs: Additional arguments to pass to requests.get().
194
+
195
+ Returns:
196
+ The response object.
197
+
198
+ Example:
199
+ >>> # Basic request
200
+ >>> response = client.get("https://httpbin.org/ip")
201
+ >>>
202
+ >>> # With geo-targeting
203
+ >>> from thordata.models import ProxyConfig
204
+ >>> config = ProxyConfig(
205
+ ... username="myuser",
206
+ ... password="mypass",
207
+ ... country="us",
208
+ ... city="seattle"
209
+ ... )
210
+ >>> response = client.get("https://httpbin.org/ip", proxy_config=config)
211
+ """
212
+ logger.debug(f"Proxy GET request: {url}")
213
+
214
+ timeout = timeout or self._default_timeout
215
+
216
+ if proxy_config:
217
+ proxies = proxy_config.to_proxies_dict()
218
+ kwargs["proxies"] = proxies
219
+
220
+ return self._request_with_retry("GET", url, timeout=timeout, **kwargs)
221
+
222
+ def post(
223
+ self,
224
+ url: str,
225
+ *,
226
+ proxy_config: Optional[ProxyConfig] = None,
227
+ timeout: Optional[int] = None,
228
+ **kwargs: Any,
229
+ ) -> requests.Response:
230
+ """
231
+ Send a POST request through the Thordata Proxy Network.
232
+
233
+ Args:
234
+ url: The target URL.
235
+ proxy_config: Custom proxy configuration.
236
+ timeout: Request timeout in seconds.
237
+ **kwargs: Additional arguments to pass to requests.post().
238
+
239
+ Returns:
240
+ The response object.
241
+ """
242
+ logger.debug(f"Proxy POST request: {url}")
243
+
244
+ timeout = timeout or self._default_timeout
245
+
246
+ if proxy_config:
247
+ proxies = proxy_config.to_proxies_dict()
248
+ kwargs["proxies"] = proxies
249
+
250
+ return self._request_with_retry("POST", url, timeout=timeout, **kwargs)
251
+
252
+ def build_proxy_url(
253
+ self,
254
+ *,
255
+ country: Optional[str] = None,
256
+ state: Optional[str] = None,
257
+ city: Optional[str] = None,
258
+ session_id: Optional[str] = None,
259
+ session_duration: Optional[int] = None,
260
+ product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL,
261
+ ) -> str:
72
262
  """
73
- Send a standard GET request through the Thordata Residential Proxy Network.
263
+ Build a proxy URL with custom targeting options.
264
+
265
+ This is a convenience method for creating proxy URLs without
266
+ manually constructing a ProxyConfig.
74
267
 
75
268
  Args:
76
- url (str): The target URL.
77
- **kwargs: Arguments to pass to requests.get().
269
+ country: Target country code (e.g., 'us', 'gb').
270
+ state: Target state (e.g., 'california').
271
+ city: Target city (e.g., 'seattle').
272
+ session_id: Session ID for sticky sessions.
273
+ session_duration: Session duration in minutes (1-90).
274
+ product: Proxy product type.
78
275
 
79
276
  Returns:
80
- requests.Response: The response object.
277
+ The proxy URL string.
278
+
279
+ Example:
280
+ >>> url = client.build_proxy_url(country="us", city="seattle")
281
+ >>> proxies = {"http": url, "https": url}
282
+ >>> requests.get("https://example.com", proxies=proxies)
81
283
  """
82
- logger.debug(f"Proxy Request: {url}")
83
- kwargs.setdefault("timeout", 30)
84
- return self.session.get(url, **kwargs)
284
+ config = ProxyConfig(
285
+ username=self.scraper_token,
286
+ password="",
287
+ host=self._proxy_host,
288
+ port=self._proxy_port,
289
+ product=product,
290
+ country=country,
291
+ state=state,
292
+ city=city,
293
+ session_id=session_id,
294
+ session_duration=session_duration,
295
+ )
296
+ return config.build_proxy_url()
297
+
298
+ # =========================================================================
299
+ # SERP API Methods
300
+ # =========================================================================
85
301
 
86
302
  def serp_search(
87
- self,
88
- query: str,
303
+ self,
304
+ query: str,
305
+ *,
89
306
  engine: Union[Engine, str] = Engine.GOOGLE,
90
- num: int = 10,
91
- **kwargs
307
+ num: int = 10,
308
+ country: Optional[str] = None,
309
+ language: Optional[str] = None,
310
+ search_type: Optional[str] = None,
311
+ device: Optional[str] = None,
312
+ render_js: Optional[bool] = None,
313
+ no_cache: Optional[bool] = None,
314
+ output_format: str = "json",
315
+ **kwargs: Any,
92
316
  ) -> Dict[str, Any]:
93
317
  """
94
318
  Execute a real-time SERP (Search Engine Results Page) search.
95
-
319
+
96
320
  Args:
97
- query (str): The search keywords.
98
- engine (Union[Engine, str]): The search engine (e.g., 'google', 'bing').
99
- num (int): Number of results to retrieve (default 10).
100
- **kwargs: Additional parameters (e.g., type="shopping", location="London").
321
+ query: The search keywords.
322
+ engine: Search engine (google, bing, yandex, duckduckgo, baidu).
323
+ num: Number of results to retrieve (default: 10).
324
+ country: Country code for localized results (e.g., 'us').
325
+ language: Language code for interface (e.g., 'en').
326
+ search_type: Type of search (images, news, shopping, videos, etc.).
327
+ device: Device type ('desktop', 'mobile', 'tablet').
328
+ render_js: Enable JavaScript rendering for the SERP request.
329
+ no_cache: Bypass the internal result cache.
330
+ output_format: 'json' to return parsed JSON (default),
331
+ 'html' to return HTML wrapped in {'html': ...}.
332
+ **kwargs: Additional engine-specific parameters.
101
333
 
102
334
  Returns:
103
- Dict[str, Any]: The parsed JSON result from the search engine.
335
+ Dict[str, Any]: Parsed JSON results or a dict with 'html' key.
336
+
337
+ Example:
338
+ >>> # Basic search
339
+ >>> results = client.serp_search("python tutorial")
340
+ >>>
341
+ >>> # With options
342
+ >>> results = client.serp_search(
343
+ ... "laptop reviews",
344
+ ... engine="google",
345
+ ... num=20,
346
+ ... country="us",
347
+ ... search_type="shopping",
348
+ ... device="mobile",
349
+ ... render_js=True,
350
+ ... no_cache=True,
351
+ ... )
104
352
  """
105
- # Handle Enum or String input for engine
353
+ # Normalize engine
106
354
  engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
107
355
 
108
- # Normalize parameters via internal helper
109
- payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
356
+ # Build request using model
357
+ request = SerpRequest(
358
+ query=query,
359
+ engine=engine_str,
360
+ num=num,
361
+ country=country,
362
+ language=language,
363
+ search_type=search_type,
364
+ device=device,
365
+ render_js=render_js,
366
+ no_cache=no_cache,
367
+ output_format=output_format,
368
+ extra_params=kwargs,
369
+ )
110
370
 
111
- headers = {
112
- "Authorization": f"Bearer {self.scraper_token}",
113
- "Content-Type": "application/x-www-form-urlencoded"
114
- }
371
+ payload = request.to_payload()
372
+ headers = build_auth_headers(self.scraper_token)
115
373
 
116
374
  logger.info(f"SERP Search: {engine_str} - {query}")
375
+
117
376
  try:
118
- response = self.session.post(
119
- self.SERP_API_URL,
377
+ response = self._api_session.post(
378
+ self._serp_url,
120
379
  data=payload,
121
380
  headers=headers,
122
- timeout=60
381
+ timeout=60,
123
382
  )
124
383
  response.raise_for_status()
125
-
126
- data = response.json()
127
- # Handle cases where the API returns a stringified JSON
128
- if isinstance(data, str):
129
- try:
130
- data = json.loads(data)
131
- except json.JSONDecodeError:
132
- pass
133
- return data
134
- except Exception as e:
135
- logger.error(f"SERP Request Failed: {e}")
136
- raise
384
+
385
+ # JSON mode (default)
386
+ if output_format.lower() == "json":
387
+ data = response.json()
388
+
389
+ if isinstance(data, dict):
390
+ code = data.get("code")
391
+ if code is not None and code != 200:
392
+ msg = extract_error_message(data)
393
+ raise_for_code(
394
+ f"SERP API Error: {msg}",
395
+ code=code,
396
+ payload=data,
397
+ )
398
+
399
+ return parse_json_response(data)
400
+
401
+ # HTML mode: wrap as dict to keep return type stable
402
+ return {"html": response.text}
403
+
404
+ except requests.Timeout as e:
405
+ raise ThordataTimeoutError(
406
+ f"SERP request timed out: {e}",
407
+ original_error=e,
408
+ )
409
+ except requests.RequestException as e:
410
+ raise ThordataNetworkError(
411
+ f"SERP request failed: {e}",
412
+ original_error=e,
413
+ )
414
+
415
+ def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
416
+ """
417
+ Execute a SERP search using a SerpRequest object.
418
+
419
+ This method provides full control over all search parameters.
420
+
421
+ Args:
422
+ request: A SerpRequest object with all parameters configured.
423
+
424
+ Returns:
425
+ Dict[str, Any]: Parsed JSON results or dict with 'html' key.
426
+
427
+ Example:
428
+ >>> from thordata.models import SerpRequest
429
+ >>> request = SerpRequest(
430
+ ... query="python programming",
431
+ ... engine="google",
432
+ ... num=50,
433
+ ... country="us",
434
+ ... language="en",
435
+ ... search_type="news",
436
+ ... time_filter="week",
437
+ ... safe_search=True
438
+ ... )
439
+ >>> results = client.serp_search_advanced(request)
440
+ """
441
+ payload = request.to_payload()
442
+ headers = build_auth_headers(self.scraper_token)
443
+
444
+ logger.info(f"SERP Advanced Search: {request.engine} - {request.query}")
445
+
446
+ try:
447
+ response = self._api_session.post(
448
+ self._serp_url,
449
+ data=payload,
450
+ headers=headers,
451
+ timeout=60,
452
+ )
453
+ response.raise_for_status()
454
+
455
+ if request.output_format.lower() == "json":
456
+ data = response.json()
457
+
458
+ if isinstance(data, dict):
459
+ code = data.get("code")
460
+ if code is not None and code != 200:
461
+ msg = extract_error_message(data)
462
+ raise_for_code(
463
+ f"SERP API Error: {msg}",
464
+ code=code,
465
+ payload=data,
466
+ )
467
+
468
+ return parse_json_response(data)
469
+
470
+ return {"html": response.text}
471
+
472
+ except requests.Timeout as e:
473
+ raise ThordataTimeoutError(
474
+ f"SERP request timed out: {e}",
475
+ original_error=e,
476
+ )
477
+ except requests.RequestException as e:
478
+ raise ThordataNetworkError(
479
+ f"SERP request failed: {e}",
480
+ original_error=e,
481
+ )
482
+
483
+ # =========================================================================
484
+ # Universal Scraping API (Web Unlocker) Methods
485
+ # =========================================================================
137
486
 
138
487
  def universal_scrape(
139
488
  self,
140
489
  url: str,
490
+ *,
141
491
  js_render: bool = False,
142
- output_format: str = "HTML",
492
+ output_format: str = "html",
143
493
  country: Optional[str] = None,
144
- block_resources: bool = False
494
+ block_resources: Optional[str] = None,
495
+ wait: Optional[int] = None,
496
+ wait_for: Optional[str] = None,
497
+ **kwargs: Any,
145
498
  ) -> Union[str, bytes]:
146
499
  """
147
- Unlock target pages via the Universal Scraping API.
148
- Bypasses Cloudflare, CAPTCHAs, and antibot systems automatically.
500
+ Scrape a URL using the Universal Scraping API (Web Unlocker).
501
+
502
+ Automatically bypasses Cloudflare, CAPTCHAs, and antibot systems.
149
503
 
150
504
  Args:
151
- url (str): Target URL.
152
- js_render (bool): Whether to render JavaScript (Headless Browser).
153
- output_format (str): "HTML" or "PNG" (screenshot).
154
- country (Optional[str]): Geo-targeting country code (e.g., 'us').
155
- block_resources (bool): Block images/css to speed up loading.
505
+ url: Target URL.
506
+ js_render: Enable JavaScript rendering (headless browser).
507
+ output_format: "html" or "png" (screenshot).
508
+ country: Geo-targeting country code.
509
+ block_resources: Resources to block (e.g., 'script,image').
510
+ wait: Wait time in milliseconds after page load.
511
+ wait_for: CSS selector to wait for.
512
+ **kwargs: Additional parameters.
156
513
 
157
514
  Returns:
158
- Union[str, bytes]: HTML string or PNG bytes.
515
+ HTML string or PNG bytes depending on output_format.
516
+
517
+ Example:
518
+ >>> # Get HTML
519
+ >>> html = client.universal_scrape("https://example.com", js_render=True)
520
+ >>>
521
+ >>> # Get screenshot
522
+ >>> png = client.universal_scrape(
523
+ ... "https://example.com",
524
+ ... js_render=True,
525
+ ... output_format="png"
526
+ ... )
527
+ >>> with open("screenshot.png", "wb") as f:
528
+ ... f.write(png)
159
529
  """
160
- headers = {
161
- "Authorization": f"Bearer {self.scraper_token}",
162
- "Content-Type": "application/x-www-form-urlencoded"
163
- }
530
+ request = UniversalScrapeRequest(
531
+ url=url,
532
+ js_render=js_render,
533
+ output_format=output_format,
534
+ country=country,
535
+ block_resources=block_resources,
536
+ wait=wait,
537
+ wait_for=wait_for,
538
+ extra_params=kwargs,
539
+ )
164
540
 
165
- payload = {
166
- "url": url,
167
- "js_render": "True" if js_render else "False",
168
- "type": output_format.lower(),
169
- "block_resources": "True" if block_resources else "False"
170
- }
171
- if country:
172
- payload["country"] = country
541
+ return self.universal_scrape_advanced(request)
173
542
 
174
- logger.info(f"Universal Scrape: {url} (Format: {output_format})")
543
+ def universal_scrape_advanced(
544
+ self, request: UniversalScrapeRequest
545
+ ) -> Union[str, bytes]:
546
+ """
547
+ Scrape using a UniversalScrapeRequest object for full control.
548
+
549
+ Args:
550
+ request: A UniversalScrapeRequest with all parameters.
551
+
552
+ Returns:
553
+ HTML string or PNG bytes.
554
+ """
555
+ payload = request.to_payload()
556
+ headers = build_auth_headers(self.scraper_token)
557
+
558
+ logger.info(
559
+ f"Universal Scrape: {request.url} (format: {request.output_format})"
560
+ )
175
561
 
176
562
  try:
177
- response = self.session.post(
178
- self.UNIVERSAL_API_URL,
563
+ response = self._api_session.post(
564
+ self._universal_url,
179
565
  data=payload,
180
566
  headers=headers,
181
- timeout=60
567
+ timeout=60,
182
568
  )
183
569
  response.raise_for_status()
184
570
 
185
- # Attempt to parse JSON wrapper
186
- try:
187
- resp_json = response.json()
188
- except json.JSONDecodeError:
189
- # Fallback: if the API returns raw content directly
190
- if output_format.upper() == "PNG":
191
- return response.content
192
- return response.text
193
-
194
- # Check for API-level errors inside the JSON
195
- if isinstance(resp_json, dict) and resp_json.get("code") \
196
- and resp_json.get("code") != 200:
197
- raise Exception(f"Universal API Error: {resp_json}")
198
-
199
- # Case 1: Return HTML
200
- if "html" in resp_json:
201
- return resp_json["html"]
202
-
203
- # Case 2: Return PNG Image
204
- if "png" in resp_json:
205
- png_str = resp_json["png"]
206
- if not png_str:
207
- raise Exception("API returned empty PNG data")
208
-
209
- # Clean Data URI Scheme if present (e.g., data:image/png;base64,...)
210
- if "," in png_str:
211
- png_str = png_str.split(",", 1)[1]
212
-
213
- # Fix Base64 Padding
214
- png_str = png_str.replace("\n", "").replace("\r", "")
215
- missing_padding = len(png_str) % 4
216
- if missing_padding:
217
- png_str += '=' * (4 - missing_padding)
218
-
219
- return base64.b64decode(png_str)
220
-
221
- # Fallback
222
- return str(resp_json)
571
+ return self._process_universal_response(response, request.output_format)
223
572
 
224
- except Exception as e:
225
- logger.error(f"Universal Scrape Failed: {e}")
226
- raise
573
+ except requests.Timeout as e:
574
+ raise ThordataTimeoutError(
575
+ f"Universal scrape timed out: {e}", original_error=e
576
+ )
577
+ except requests.RequestException as e:
578
+ raise ThordataNetworkError(
579
+ f"Universal scrape failed: {e}", original_error=e
580
+ )
581
+
582
+ def _process_universal_response(
583
+ self, response: requests.Response, output_format: str
584
+ ) -> Union[str, bytes]:
585
+ """Process the response from Universal API."""
586
+ # Try to parse as JSON
587
+ try:
588
+ resp_json = response.json()
589
+ except ValueError:
590
+ # Raw content returned
591
+ if output_format.lower() == "png":
592
+ return response.content
593
+ return response.text
594
+
595
+ # Check for API-level errors
596
+ if isinstance(resp_json, dict):
597
+ code = resp_json.get("code")
598
+ if code is not None and code != 200:
599
+ msg = extract_error_message(resp_json)
600
+ raise_for_code(
601
+ f"Universal API Error: {msg}", code=code, payload=resp_json
602
+ )
603
+
604
+ # Extract HTML
605
+ if "html" in resp_json:
606
+ return resp_json["html"]
607
+
608
+ # Extract PNG
609
+ if "png" in resp_json:
610
+ return decode_base64_image(resp_json["png"])
611
+
612
+ # Fallback
613
+ return str(resp_json)
614
+
615
+ # =========================================================================
616
+ # Web Scraper API (Task-based) Methods
617
+ # =========================================================================
227
618
 
228
619
  def create_scraper_task(
229
620
  self,
230
621
  file_name: str,
231
622
  spider_id: str,
232
623
  spider_name: str,
233
- individual_params: Dict[str, Any],
234
- universal_params: Optional[Dict[str, Any]] = None
624
+ parameters: Dict[str, Any],
625
+ universal_params: Optional[Dict[str, Any]] = None,
235
626
  ) -> str:
236
627
  """
237
- Create a generic Web Scraper Task (Async).
238
-
239
- IMPORTANT: You must retrieve the correct 'spider_id' and 'spider_name'
240
- from the Thordata Dashboard before calling this method.
628
+ Create an asynchronous Web Scraper task.
629
+
630
+ Note: Get spider_id and spider_name from the Thordata Dashboard.
241
631
 
242
632
  Args:
243
- file_name (str): Name for the output file.
244
- spider_id (str): The ID of the spider (from Dashboard).
245
- spider_name (str): The name of the spider (e.g., "youtube.com").
246
- individual_params (Dict): Parameters specific to the spider.
247
- universal_params (Optional[Dict]): Global settings for the scraper.
633
+ file_name: Name for the output file.
634
+ spider_id: Spider identifier from Dashboard.
635
+ spider_name: Spider name (e.g., "youtube.com").
636
+ parameters: Spider-specific parameters.
637
+ universal_params: Global spider settings.
248
638
 
249
639
  Returns:
250
- str: The created task_id.
640
+ The created task_id.
641
+
642
+ Example:
643
+ >>> task_id = client.create_scraper_task(
644
+ ... file_name="youtube_data",
645
+ ... spider_id="youtube_video-post_by-url",
646
+ ... spider_name="youtube.com",
647
+ ... parameters={"url": "https://youtube.com/@channel/videos"}
648
+ ... )
251
649
  """
252
- headers = {
253
- "Authorization": f"Bearer {self.scraper_token}",
254
- "Content-Type": "application/x-www-form-urlencoded"
255
- }
650
+ config = ScraperTaskConfig(
651
+ file_name=file_name,
652
+ spider_id=spider_id,
653
+ spider_name=spider_name,
654
+ parameters=parameters,
655
+ universal_params=universal_params,
656
+ )
256
657
 
257
- # Payload construction
258
- payload = {
259
- "spider_name": spider_name,
260
- "spider_id": spider_id,
261
- "spider_parameters": json.dumps([individual_params]),
262
- "spider_errors": "true",
263
- "file_name": file_name
264
- }
265
- if universal_params:
266
- payload["spider_universal"] = json.dumps(universal_params)
658
+ return self.create_scraper_task_advanced(config)
659
+
660
+ def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
661
+ """
662
+ Create a scraper task using a ScraperTaskConfig object.
663
+
664
+ Args:
665
+ config: Task configuration.
666
+
667
+ Returns:
668
+ The created task_id.
669
+ """
670
+ payload = config.to_payload()
671
+ headers = build_auth_headers(self.scraper_token)
672
+
673
+ logger.info(f"Creating Scraper Task: {config.spider_name}")
267
674
 
268
- logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
269
675
  try:
270
- response = self.session.post(
271
- self.SCRAPER_BUILDER_URL,
676
+ response = self._api_session.post(
677
+ self._builder_url,
272
678
  data=payload,
273
- headers=headers
679
+ headers=headers,
680
+ timeout=30,
274
681
  )
275
682
  response.raise_for_status()
683
+
276
684
  data = response.json()
685
+ code = data.get("code")
686
+
687
+ if code != 200:
688
+ msg = extract_error_message(data)
689
+ raise_for_code(f"Task creation failed: {msg}", code=code, payload=data)
277
690
 
278
- if data.get("code") != 200:
279
- raise Exception(f"Creation failed: {data}")
280
691
  return data["data"]["task_id"]
281
- except Exception as e:
282
- logger.error(f"Task Creation Failed: {e}")
283
- raise
692
+
693
+ except requests.RequestException as e:
694
+ raise ThordataNetworkError(f"Task creation failed: {e}", original_error=e)
284
695
 
285
696
  def get_task_status(self, task_id: str) -> str:
286
697
  """
287
698
  Check the status of an asynchronous scraping task.
288
699
 
289
700
  Args:
290
- task_id (str): The ID returned by create_scraper_task.
701
+ task_id: The task ID from create_scraper_task.
291
702
 
292
703
  Returns:
293
- str: The status string (e.g., "finished", "running", "error").
704
+ Status string (e.g., "running", "ready", "failed").
294
705
  """
295
- headers = {
296
- "token": self.public_token,
297
- "key": self.public_key,
298
- "Content-Type": "application/x-www-form-urlencoded"
299
- }
706
+ self._require_public_credentials()
707
+
708
+ headers = build_public_api_headers(
709
+ self.public_token or "", self.public_key or ""
710
+ )
300
711
  payload = {"tasks_ids": task_id}
301
712
 
302
713
  try:
303
- response = self.session.post(
304
- self.SCRAPER_STATUS_URL,
714
+ response = self._api_session.post(
715
+ self._status_url,
305
716
  data=payload,
306
- headers=headers
717
+ headers=headers,
718
+ timeout=30,
307
719
  )
308
720
  response.raise_for_status()
721
+
309
722
  data = response.json()
310
723
 
311
724
  if data.get("code") == 200 and data.get("data"):
312
725
  for item in data["data"]:
313
726
  if str(item.get("task_id")) == str(task_id):
314
- return item["status"]
315
- return "Unknown"
727
+ return item.get("status", "unknown")
728
+
729
+ return "unknown"
730
+
316
731
  except Exception as e:
317
- logger.error(f"Status Check Failed: {e}")
318
- return "Error"
732
+ logger.error(f"Status check failed: {e}")
733
+ return "error"
319
734
 
320
735
  def get_task_result(self, task_id: str, file_type: str = "json") -> str:
321
736
  """
322
- Retrieve the download URL for a completed task.
323
-
324
- Args:
325
- task_id (str): The task ID.
326
- file_type (str): Format required (default "json").
327
-
328
- Returns:
329
- str: The URL to download the result file.
737
+ Get the download URL for a completed task.
330
738
  """
331
- headers = {
332
- "token": self.public_token,
333
- "key": self.public_key,
334
- "Content-Type": "application/x-www-form-urlencoded"
335
- }
739
+ self._require_public_credentials()
740
+
741
+ headers = build_public_api_headers(
742
+ self.public_token or "", self.public_key or ""
743
+ )
336
744
  payload = {"tasks_id": task_id, "type": file_type}
337
745
 
338
746
  logger.info(f"Getting result URL for Task: {task_id}")
747
+
339
748
  try:
340
- response = self.session.post(
341
- self.SCRAPER_DOWNLOAD_URL,
749
+ response = self._api_session.post(
750
+ self._download_url,
342
751
  data=payload,
343
- headers=headers
752
+ headers=headers,
753
+ timeout=30,
344
754
  )
345
755
  response.raise_for_status()
756
+
346
757
  data = response.json()
758
+ code = data.get("code")
347
759
 
348
- if data.get("code") == 200 and data.get("data"):
760
+ if code == 200 and data.get("data"):
349
761
  return data["data"]["download"]
350
- raise Exception(f"API returned error: {data}")
351
- except Exception as e:
352
- logger.error(f"Get Result Failed: {e}")
353
- raise
354
-
355
- def _get_locations(self, endpoint: str, params: Dict[str, str]) -> List[Dict[str, Any]]:
762
+
763
+ msg = extract_error_message(data)
764
+ raise_for_code(f"Get result failed: {msg}", code=code, payload=data)
765
+ # This line won't be reached, but satisfies mypy
766
+ raise RuntimeError("Unexpected state")
767
+
768
+ except requests.RequestException as e:
769
+ raise ThordataNetworkError(f"Get result failed: {e}", original_error=e)
770
+
771
+ def wait_for_task(
772
+ self,
773
+ task_id: str,
774
+ *,
775
+ poll_interval: float = 5.0,
776
+ max_wait: float = 600.0,
777
+ ) -> str:
356
778
  """
357
- Internal helper to call the public locations API.
779
+ Wait for a task to complete.
358
780
 
359
781
  Args:
360
- endpoint: One of 'countries', 'states', 'cities', 'asn'.
361
- params: Query parameters (must include token, key, proxy_type, etc.)
782
+ task_id: The task ID to wait for.
783
+ poll_interval: Seconds between status checks.
784
+ max_wait: Maximum seconds to wait.
362
785
 
363
786
  Returns:
364
- List of location records from the 'data' field.
787
+ Final task status.
365
788
 
366
789
  Raises:
367
- RuntimeError: If token/key are missing or API returns an error code.
790
+ TimeoutError: If max_wait is exceeded.
791
+
792
+ Example:
793
+ >>> task_id = client.create_scraper_task(...)
794
+ >>> status = client.wait_for_task(task_id, max_wait=300)
795
+ >>> if status in ("ready", "success"):
796
+ ... url = client.get_task_result(task_id)
368
797
  """
369
- if not self.public_token or not self.public_key:
370
- raise RuntimeError(
371
- "Public API token/key are required for locations endpoints. "
372
- "Please provide 'public_token' and 'public_key' when "
373
- "initializing ThordataClient."
374
- )
798
+ import time
375
799
 
376
- url = f"{self.locations_url}/{endpoint}"
377
- logger.info("Locations API request: %s", url)
800
+ elapsed = 0.0
378
801
 
379
- # Use a direct requests.get here; no need to go through the proxy gateway.
380
- response = requests.get(
381
- url,
382
- params=params,
383
- timeout=30,
384
- )
385
- response.raise_for_status()
802
+ while elapsed < max_wait:
803
+ status = self.get_task_status(task_id)
386
804
 
387
- data = response.json()
388
- if isinstance(data, dict):
389
- code = data.get("code")
390
- if code is not None and code != 200:
391
- msg = data.get("msg", "")
392
- raise RuntimeError(
393
- f"Locations API error ({endpoint}): code={code}, msg={msg}"
394
- )
395
- return data.get("data") or []
396
- # Fallback: if backend ever returns a list directly
397
- if isinstance(data, list):
398
- return data
399
- return []
400
-
401
- def list_countries(self, proxy_type: int = 1) -> List[Dict[str, Any]]:
805
+ logger.debug(f"Task {task_id} status: {status}")
806
+
807
+ terminal_statuses = {
808
+ "ready",
809
+ "success",
810
+ "finished",
811
+ "failed",
812
+ "error",
813
+ "cancelled",
814
+ }
815
+
816
+ if status.lower() in terminal_statuses:
817
+ return status
818
+
819
+ time.sleep(poll_interval)
820
+ elapsed += poll_interval
821
+
822
+ raise TimeoutError(f"Task {task_id} did not complete within {max_wait} seconds")
823
+
824
+ # =========================================================================
825
+ # Location API Methods
826
+ # =========================================================================
827
+
828
+ def list_countries(
829
+ self, proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
830
+ ) -> List[Dict[str, Any]]:
402
831
  """
403
- List supported countries for Thordata residential or unlimited proxies.
832
+ List supported countries for proxies.
404
833
 
405
834
  Args:
406
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
835
+ proxy_type: 1 for residential, 2 for unlimited.
407
836
 
408
837
  Returns:
409
- List[Dict[str, Any]]: Each record contains 'country_code' and 'country_name'.
838
+ List of country records with 'country_code' and 'country_name'.
410
839
  """
411
- params = {
412
- "token": self.public_token,
413
- "key": self.public_key,
414
- "proxy_type": str(proxy_type),
415
- }
416
- return self._get_locations("countries", params)
840
+ return self._get_locations(
841
+ "countries",
842
+ proxy_type=(
843
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
844
+ ),
845
+ )
417
846
 
418
- def list_states(self, country_code: str, proxy_type: int = 1) -> List[Dict[str, Any]]:
847
+ def list_states(
848
+ self,
849
+ country_code: str,
850
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
851
+ ) -> List[Dict[str, Any]]:
419
852
  """
420
- List supported states for a given country.
853
+ List supported states for a country.
421
854
 
422
855
  Args:
423
- country_code (str): Country code (e.g., 'US').
424
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
856
+ country_code: Country code (e.g., 'US').
857
+ proxy_type: Proxy type.
425
858
 
426
859
  Returns:
427
- List[Dict[str, Any]]: Each record contains 'state_code' and 'state_name'.
860
+ List of state records.
428
861
  """
429
- params = {
430
- "token": self.public_token,
431
- "key": self.public_key,
432
- "proxy_type": str(proxy_type),
433
- "country_code": country_code,
434
- }
435
- return self._get_locations("states", params)
862
+ return self._get_locations(
863
+ "states",
864
+ proxy_type=(
865
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
866
+ ),
867
+ country_code=country_code,
868
+ )
436
869
 
437
870
  def list_cities(
438
871
  self,
439
872
  country_code: str,
440
873
  state_code: Optional[str] = None,
441
- proxy_type: int = 1,
874
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
442
875
  ) -> List[Dict[str, Any]]:
443
876
  """
444
- List supported cities for a given country (and optional state).
877
+ List supported cities for a country/state.
445
878
 
446
879
  Args:
447
- country_code (str): Country code (e.g., 'US').
448
- state_code (Optional[str]): State code (e.g., 'alabama'), if applicable.
449
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
880
+ country_code: Country code.
881
+ state_code: Optional state code.
882
+ proxy_type: Proxy type.
450
883
 
451
884
  Returns:
452
- List[Dict[str, Any]]: Each record contains 'city_code' and 'city_name'.
885
+ List of city records.
453
886
  """
454
- params: Dict[str, str] = {
455
- "token": self.public_token,
456
- "key": self.public_key,
457
- "proxy_type": str(proxy_type),
887
+ kwargs = {
888
+ "proxy_type": (
889
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
890
+ ),
458
891
  "country_code": country_code,
459
892
  }
460
893
  if state_code:
461
- params["state_code"] = state_code
894
+ kwargs["state_code"] = state_code
462
895
 
463
- return self._get_locations("cities", params)
896
+ return self._get_locations("cities", **kwargs)
464
897
 
465
898
  def list_asn(
466
899
  self,
467
900
  country_code: str,
468
- proxy_type: int = 1,
901
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
469
902
  ) -> List[Dict[str, Any]]:
470
903
  """
471
- List supported ASNs for a given country.
904
+ List supported ASNs for a country.
472
905
 
473
906
  Args:
474
- country_code (str): Country code (e.g., 'US').
475
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
907
+ country_code: Country code.
908
+ proxy_type: Proxy type.
476
909
 
477
910
  Returns:
478
- List[Dict[str, Any]]: Each record contains 'asn_code' and 'asn_name'.
911
+ List of ASN records.
479
912
  """
913
+ return self._get_locations(
914
+ "asn",
915
+ proxy_type=(
916
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
917
+ ),
918
+ country_code=country_code,
919
+ )
920
+
921
+ def _get_locations(self, endpoint: str, **kwargs: Any) -> List[Dict[str, Any]]:
922
+ """Internal method to call locations API."""
923
+ self._require_public_credentials()
924
+
480
925
  params = {
481
926
  "token": self.public_token,
482
927
  "key": self.public_key,
483
- "proxy_type": str(proxy_type),
484
- "country_code": country_code,
485
928
  }
486
- return self._get_locations("asn", params)
929
+
930
+ for key, value in kwargs.items():
931
+ params[key] = str(value)
932
+
933
+ url = f"{self._locations_base_url}/{endpoint}"
934
+
935
+ logger.debug(f"Locations API request: {url}")
936
+
937
+ # Use requests.get directly (no proxy needed for this API)
938
+ response = self._api_session.get(url, params=params, timeout=30)
939
+ response.raise_for_status()
940
+
941
+ data = response.json()
942
+
943
+ if isinstance(data, dict):
944
+ code = data.get("code")
945
+ if code is not None and code != 200:
946
+ msg = data.get("msg", "")
947
+ raise RuntimeError(
948
+ f"Locations API error ({endpoint}): code={code}, msg={msg}"
949
+ )
950
+ return data.get("data") or []
951
+
952
+ if isinstance(data, list):
953
+ return data
954
+
955
+ return []
956
+
957
+ # =========================================================================
958
+ # Helper Methods
959
+ # =========================================================================
960
+
961
+ def _require_public_credentials(self) -> None:
962
+ """Ensure public API credentials are available."""
963
+ if not self.public_token or not self.public_key:
964
+ raise ThordataConfigError(
965
+ "public_token and public_key are required for this operation. "
966
+ "Please provide them when initializing ThordataClient."
967
+ )
968
+
969
+ def _request_with_retry(
970
+ self, method: str, url: str, **kwargs: Any
971
+ ) -> requests.Response:
972
+ """Make a request with automatic retry."""
973
+ kwargs.setdefault("timeout", self._default_timeout)
974
+
975
+ @with_retry(self._retry_config)
976
+ def _do_request() -> requests.Response:
977
+ return self._proxy_session.request(method, url, **kwargs)
978
+
979
+ try:
980
+ return _do_request()
981
+ except requests.Timeout as e:
982
+ raise ThordataTimeoutError(f"Request timed out: {e}", original_error=e)
983
+ except requests.RequestException as e:
984
+ raise ThordataNetworkError(f"Request failed: {e}", original_error=e)
985
+
986
+ def close(self) -> None:
987
+ """Close the underlying session."""
988
+ self._proxy_session.close()
989
+ self._api_session.close()
990
+
991
+ def __enter__(self) -> ThordataClient:
992
+ return self
993
+
994
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
995
+ self.close()
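
Taken together, the 0.5.0 additions above change how a typical integration looks. The following is a minimal end-to-end sketch assembled from the docstring examples shown in this diff; the tokens, proxy credentials, spider_id, and target URLs are placeholders rather than values shipped with the SDK, and error handling is omitted for brevity.

from thordata import ThordataClient
from thordata.models import ProxyConfig

# Placeholder credentials -- substitute real values from the Thordata dashboard.
with ThordataClient(
    scraper_token="YOUR_SCRAPER_TOKEN",
    public_token="YOUR_PUBLIC_TOKEN",
    public_key="YOUR_PUBLIC_KEY",
) as client:
    # 1. Proxy network: a plain request, then a geo-targeted one via ProxyConfig.
    print(client.get("https://httpbin.org/ip").json())

    geo = ProxyConfig(
        username="myuser",      # proxy sub-account credentials (placeholders)
        password="mypass",
        country="us",
        city="seattle",
    )
    print(client.get("https://httpbin.org/ip", proxy_config=geo).json())

    # 2. SERP API: parsed JSON results by default.
    results = client.serp_search("python tutorial", engine="google", num=10, country="us")

    # 3. Universal Scraping API: JavaScript-rendered screenshot as PNG bytes.
    png = client.universal_scrape(
        "https://example.com", js_render=True, output_format="png"
    )
    with open("screenshot.png", "wb") as f:
        f.write(png)

    # 4. Web Scraper API: create an async task, poll until it reaches a
    #    terminal status, then fetch the download URL for the result file.
    task_id = client.create_scraper_task(
        file_name="youtube_data",
        spider_id="youtube_video-post_by-url",   # from the Dashboard
        spider_name="youtube.com",
        parameters={"url": "https://youtube.com/@channel/videos"},
    )
    status = client.wait_for_task(task_id, max_wait=300)
    if status.lower() in ("ready", "success", "finished"):
        print(client.get_task_result(task_id))

The context manager form relies on the close()/__enter__/__exit__ support added in this release, so both underlying sessions are released when the block exits.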