thordata-sdk 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/client.py CHANGED
@@ -1,13 +1,55 @@
1
- import requests
2
- import logging
3
- import json
4
- import base64
5
- from typing import Dict, Any, Union, Optional, List
1
+ """
2
+ Synchronous client for the Thordata API.
3
+
4
+ This module provides the main ThordataClient class for interacting with
5
+ Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
6
+
7
+ Example:
8
+ >>> from thordata import ThordataClient
9
+ >>>
10
+ >>> client = ThordataClient(
11
+ ... scraper_token="your_token",
12
+ ... public_token="your_public_token",
13
+ ... public_key="your_public_key"
14
+ ... )
15
+ >>>
16
+ >>> # Use the proxy network
17
+ >>> response = client.get("https://httpbin.org/ip")
18
+ >>> print(response.json())
19
+ >>>
20
+ >>> # Search with SERP API
21
+ >>> results = client.serp_search("python tutorial", engine="google")
22
+ """
23
+
24
+ from __future__ import annotations
6
25
 
7
- from .enums import Engine
8
- from .parameters import normalize_serp_params
26
+ import logging
27
+ import requests
28
+ from typing import Any, Dict, List, Optional, Union
29
+
30
+ from .enums import Engine, ProxyType
31
+ from .exceptions import (
32
+ ThordataConfigError,
33
+ ThordataNetworkError,
34
+ ThordataTimeoutError,
35
+ raise_for_code,
36
+ )
37
+ from .models import (
38
+ ProxyConfig,
39
+ ProxyProduct,
40
+ SerpRequest,
41
+ UniversalScrapeRequest,
42
+ ScraperTaskConfig,
43
+ )
44
+ from .retry import RetryConfig, with_retry
45
+ from ._utils import (
46
+ parse_json_response,
47
+ decode_base64_image,
48
+ build_auth_headers,
49
+ build_public_api_headers,
50
+ extract_error_message,
51
+ )
9
52
 
10
- # Configure a library-specific logger to avoid interfering with user's logging
11
53
  logger = logging.getLogger(__name__)
12
54
 
13
55
 
@@ -16,471 +58,879 @@ class ThordataClient:
16
58
  The official synchronous Python client for Thordata.
17
59
 
18
60
  This client handles authentication and communication with:
19
- 1. Proxy Network (Residential/Datacenter via HTTP/HTTPS)
20
- 2. SERP API (Real-time Search Engine Results)
21
- 3. Universal Scraping API (Single Page Rendering & Extraction)
22
- 4. Web Scraper API (Async Task Management for large scale jobs)
61
+ - Proxy Network (Residential/Datacenter/Mobile/ISP via HTTP/HTTPS)
62
+ - SERP API (Real-time Search Engine Results)
63
+ - Universal Scraping API (Web Unlocker - Single Page Rendering)
64
+ - Web Scraper API (Async Task Management)
65
+
66
+ Args:
67
+ scraper_token: The API token from your Dashboard.
68
+ public_token: The public API token (for task status, locations).
69
+ public_key: The public API key.
70
+ proxy_host: Custom proxy gateway host (optional).
71
+ proxy_port: Custom proxy gateway port (optional).
72
+ timeout: Default request timeout in seconds (default: 30).
73
+ retry_config: Configuration for automatic retries (optional).
74
+
75
+ Example:
76
+ >>> client = ThordataClient(
77
+ ... scraper_token="your_scraper_token",
78
+ ... public_token="your_public_token",
79
+ ... public_key="your_public_key"
80
+ ... )
23
81
  """
24
82
 
83
+ # API Endpoints
84
+ BASE_URL = "https://scraperapi.thordata.com"
85
+ UNIVERSAL_URL = "https://universalapi.thordata.com"
86
+ API_URL = "https://api.thordata.com/api/web-scraper-api"
87
+ LOCATIONS_URL = "https://api.thordata.com/api/locations"
88
+
25
89
  def __init__(
26
90
  self,
27
91
  scraper_token: str,
28
- public_token: str,
29
- public_key: str,
30
- proxy_host: str = "gate.thordata.com",
31
- proxy_port: int = 22225
32
- ):
33
- """
34
- Initialize the Thordata Client.
35
-
36
- Args:
37
- scraper_token (str): The secret token found at the bottom of the Dashboard.
38
- public_token (str): The token from the Public API section.
39
- public_key (str): The key from the Public API section.
40
- proxy_host (str): The proxy gateway host (default: gate.thordata.com).
41
- proxy_port (int): The proxy gateway port (default: 22225).
42
- """
92
+ public_token: Optional[str] = None,
93
+ public_key: Optional[str] = None,
94
+ proxy_host: str = "pr.thordata.net",
95
+ proxy_port: int = 9999,
96
+ timeout: int = 30,
97
+ retry_config: Optional[RetryConfig] = None,
98
+ ) -> None:
99
+ """Initialize the Thordata Client."""
100
+ if not scraper_token:
101
+ raise ThordataConfigError("scraper_token is required")
102
+
43
103
  self.scraper_token = scraper_token
44
104
  self.public_token = public_token
45
105
  self.public_key = public_key
46
-
47
- # Proxy Configuration
48
- self.proxy_url = (
49
- f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
106
+
107
+ # Proxy configuration
108
+ self._proxy_host = proxy_host
109
+ self._proxy_port = proxy_port
110
+ self._default_timeout = timeout
111
+
112
+ # Retry configuration
113
+ self._retry_config = retry_config or RetryConfig()
114
+
115
+ # Build default proxy URL (for basic usage)
116
+ self._default_proxy_url = (
117
+ f"http://td-customer-{self.scraper_token}:@{proxy_host}:{proxy_port}"
50
118
  )
51
-
52
- # API Endpoints Definition
53
- self.base_url = "https://scraperapi.thordata.com"
54
- self.universal_url = "https://universalapi.thordata.com"
55
- self.api_url = "https://api.thordata.com/api/web-scraper-api"
56
- self.locations_url = "https://api.thordata.com/api/locations"
57
-
58
- self.SERP_API_URL = f"{self.base_url}/request"
59
- self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
60
- self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
61
- self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
62
- self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
63
-
64
- # Initialize Session with Proxy settings
65
- self.session = requests.Session()
66
- self.session.proxies = {
67
- "http": self.proxy_url,
68
- "https": self.proxy_url,
119
+
120
+ # Initialize session with default proxy settings
121
+ self._session = requests.Session()
122
+ self._session.proxies = {
123
+ "http": self._default_proxy_url,
124
+ "https": self._default_proxy_url,
69
125
  }
126
+
127
+ # Store endpoint URLs
128
+ self._serp_url = f"{self.BASE_URL}/request"
129
+ self._universal_url = f"{self.UNIVERSAL_URL}/request"
130
+ self._builder_url = f"{self.BASE_URL}/builder"
131
+ self._status_url = f"{self.API_URL}/tasks-status"
132
+ self._download_url = f"{self.API_URL}/tasks-download"
133
+
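For reference, the new constructor options introduced in 0.4.0 can be combined as in the sketch below. It is based only on the signature above; RetryConfig's own fields are not shown in this diff, so it is constructed with its defaults, and the import path simply mirrors the client's relative import from .retry.

    from thordata import ThordataClient
    from thordata.retry import RetryConfig  # module referenced by the import block above

    client = ThordataClient(
        scraper_token="your_scraper_token",
        public_token="your_public_token",   # only needed for task status / locations
        public_key="your_public_key",
        proxy_host="pr.thordata.net",       # defaults shown in the signature above
        proxy_port=9999,
        timeout=30,
        retry_config=RetryConfig(),         # defaults; its fields are not shown in this diff
    )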
134
+ # =========================================================================
135
+ # Proxy Network Methods
136
+ # =========================================================================
137
+
138
+ def get(
139
+ self,
140
+ url: str,
141
+ *,
142
+ proxy_config: Optional[ProxyConfig] = None,
143
+ timeout: Optional[int] = None,
144
+ **kwargs: Any,
145
+ ) -> requests.Response:
146
+ """
147
+ Send a GET request through the Thordata Proxy Network.
70
148
 
71
- def get(self, url: str, **kwargs) -> requests.Response:
149
+ Args:
150
+ url: The target URL.
151
+ proxy_config: Custom proxy configuration for geo-targeting/sessions.
152
+ timeout: Request timeout in seconds.
153
+ **kwargs: Additional arguments to pass to requests.get().
154
+
155
+ Returns:
156
+ The response object.
157
+
158
+ Example:
159
+ >>> # Basic request
160
+ >>> response = client.get("https://httpbin.org/ip")
161
+ >>>
162
+ >>> # With geo-targeting
163
+ >>> from thordata.models import ProxyConfig
164
+ >>> config = ProxyConfig(
165
+ ... username="myuser",
166
+ ... password="mypass",
167
+ ... country="us",
168
+ ... city="seattle"
169
+ ... )
170
+ >>> response = client.get("https://httpbin.org/ip", proxy_config=config)
72
171
  """
73
- Send a standard GET request through the Thordata Residential Proxy Network.
172
+ logger.debug(f"Proxy GET request: {url}")
173
+
174
+ timeout = timeout or self._default_timeout
175
+
176
+ if proxy_config:
177
+ proxies = proxy_config.to_proxies_dict()
178
+ kwargs["proxies"] = proxies
179
+
180
+ return self._request_with_retry("GET", url, timeout=timeout, **kwargs)
181
+
182
+ def post(
183
+ self,
184
+ url: str,
185
+ *,
186
+ proxy_config: Optional[ProxyConfig] = None,
187
+ timeout: Optional[int] = None,
188
+ **kwargs: Any,
189
+ ) -> requests.Response:
190
+ """
191
+ Send a POST request through the Thordata Proxy Network.
74
192
 
75
193
  Args:
76
- url (str): The target URL.
77
- **kwargs: Arguments to pass to requests.get().
194
+ url: The target URL.
195
+ proxy_config: Custom proxy configuration.
196
+ timeout: Request timeout in seconds.
197
+ **kwargs: Additional arguments to pass to requests.post().
78
198
 
79
199
  Returns:
80
- requests.Response: The response object.
200
+ The response object.
81
201
  """
82
- logger.debug(f"Proxy Request: {url}")
83
- kwargs.setdefault("timeout", 30)
84
- return self.session.get(url, **kwargs)
202
+ logger.debug(f"Proxy POST request: {url}")
203
+
204
+ timeout = timeout or self._default_timeout
205
+
206
+ if proxy_config:
207
+ proxies = proxy_config.to_proxies_dict()
208
+ kwargs["proxies"] = proxies
209
+
210
+ return self._request_with_retry("POST", url, timeout=timeout, **kwargs)
211
+
212
+ def build_proxy_url(
213
+ self,
214
+ *,
215
+ country: Optional[str] = None,
216
+ state: Optional[str] = None,
217
+ city: Optional[str] = None,
218
+ session_id: Optional[str] = None,
219
+ session_duration: Optional[int] = None,
220
+ product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL,
221
+ ) -> str:
222
+ """
223
+ Build a proxy URL with custom targeting options.
224
+
225
+ This is a convenience method for creating proxy URLs without
226
+ manually constructing a ProxyConfig.
227
+
228
+ Args:
229
+ country: Target country code (e.g., 'us', 'gb').
230
+ state: Target state (e.g., 'california').
231
+ city: Target city (e.g., 'seattle').
232
+ session_id: Session ID for sticky sessions.
233
+ session_duration: Session duration in minutes (1-90).
234
+ product: Proxy product type.
235
+
236
+ Returns:
237
+ The proxy URL string.
238
+
239
+ Example:
240
+ >>> url = client.build_proxy_url(country="us", city="seattle")
241
+ >>> proxies = {"http": url, "https": url}
242
+ >>> requests.get("https://example.com", proxies=proxies)
243
+ """
244
+ config = ProxyConfig(
245
+ username=self.scraper_token,
246
+ password="",
247
+ host=self._proxy_host,
248
+ port=self._proxy_port,
249
+ product=product,
250
+ country=country,
251
+ state=state,
252
+ city=city,
253
+ session_id=session_id,
254
+ session_duration=session_duration,
255
+ )
256
+ return config.build_proxy_url()
257
+
258
+ # =========================================================================
259
+ # SERP API Methods
260
+ # =========================================================================
85
261
 
86
262
  def serp_search(
87
- self,
88
- query: str,
263
+ self,
264
+ query: str,
265
+ *,
89
266
  engine: Union[Engine, str] = Engine.GOOGLE,
90
- num: int = 10,
91
- **kwargs
267
+ num: int = 10,
268
+ country: Optional[str] = None,
269
+ language: Optional[str] = None,
270
+ search_type: Optional[str] = None,
271
+ **kwargs: Any,
92
272
  ) -> Dict[str, Any]:
93
273
  """
94
274
  Execute a real-time SERP (Search Engine Results Page) search.
95
275
 
96
276
  Args:
97
- query (str): The search keywords.
98
- engine (Union[Engine, str]): The search engine (e.g., 'google', 'bing').
99
- num (int): Number of results to retrieve (default 10).
100
- **kwargs: Additional parameters (e.g., type="shopping", location="London").
277
+ query: The search keywords.
278
+ engine: Search engine (google, bing, yandex, duckduckgo, baidu).
279
+ num: Number of results to retrieve (default: 10).
280
+ country: Country code for localized results (e.g., 'us').
281
+ language: Language code for interface (e.g., 'en').
282
+ search_type: Type of search (images, news, shopping, videos).
283
+ **kwargs: Additional engine-specific parameters.
101
284
 
102
285
  Returns:
103
- Dict[str, Any]: The parsed JSON result from the search engine.
286
+ Parsed JSON results from the search.
287
+
288
+ Example:
289
+ >>> # Basic search
290
+ >>> results = client.serp_search("python tutorial")
291
+ >>>
292
+ >>> # With options
293
+ >>> results = client.serp_search(
294
+ ... "laptop reviews",
295
+ ... engine="google",
296
+ ... num=20,
297
+ ... country="us",
298
+ ... search_type="shopping"
299
+ ... )
104
300
  """
105
- # Handle Enum or String input for engine
301
+ # Normalize engine
106
302
  engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
107
-
108
- # Normalize parameters via internal helper
109
- payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
110
-
111
- headers = {
112
- "Authorization": f"Bearer {self.scraper_token}",
113
- "Content-Type": "application/x-www-form-urlencoded"
114
- }
115
-
303
+
304
+ # Build request using model
305
+ request = SerpRequest(
306
+ query=query,
307
+ engine=engine_str,
308
+ num=num,
309
+ country=country,
310
+ language=language,
311
+ search_type=search_type,
312
+ extra_params=kwargs,
313
+ )
314
+
315
+ payload = request.to_payload()
316
+ headers = build_auth_headers(self.scraper_token)
317
+
116
318
  logger.info(f"SERP Search: {engine_str} - {query}")
319
+
117
320
  try:
118
- response = self.session.post(
119
- self.SERP_API_URL,
321
+ response = self._session.post(
322
+ self._serp_url,
120
323
  data=payload,
121
324
  headers=headers,
122
- timeout=60
325
+ timeout=60,
123
326
  )
124
327
  response.raise_for_status()
125
328
 
126
329
  data = response.json()
127
- # Handle cases where the API returns a stringified JSON
128
- if isinstance(data, str):
129
- try:
130
- data = json.loads(data)
131
- except json.JSONDecodeError:
132
- pass
133
- return data
134
- except Exception as e:
135
- logger.error(f"SERP Request Failed: {e}")
136
- raise
330
+ return parse_json_response(data)
331
+
332
+ except requests.Timeout as e:
333
+ raise ThordataTimeoutError(
334
+ f"SERP request timed out: {e}",
335
+ original_error=e
336
+ )
337
+ except requests.RequestException as e:
338
+ raise ThordataNetworkError(
339
+ f"SERP request failed: {e}",
340
+ original_error=e
341
+ )
342
+
343
+ def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
344
+ """
345
+ Execute a SERP search using a SerpRequest object.
346
+
347
+ This method provides full control over all search parameters.
348
+
349
+ Args:
350
+ request: A SerpRequest object with all parameters configured.
351
+
352
+ Returns:
353
+ Parsed JSON results.
354
+
355
+ Example:
356
+ >>> from thordata.models import SerpRequest
357
+ >>> request = SerpRequest(
358
+ ... query="python programming",
359
+ ... engine="google",
360
+ ... num=50,
361
+ ... country="us",
362
+ ... language="en",
363
+ ... search_type="news",
364
+ ... time_filter="week",
365
+ ... safe_search=True
366
+ ... )
367
+ >>> results = client.serp_search_advanced(request)
368
+ """
369
+ payload = request.to_payload()
370
+ headers = build_auth_headers(self.scraper_token)
371
+
372
+ logger.info(f"SERP Advanced Search: {request.engine} - {request.query}")
373
+
374
+ try:
375
+ response = self._session.post(
376
+ self._serp_url,
377
+ data=payload,
378
+ headers=headers,
379
+ timeout=60,
380
+ )
381
+ response.raise_for_status()
382
+
383
+ data = response.json()
384
+ return parse_json_response(data)
385
+
386
+ except requests.Timeout as e:
387
+ raise ThordataTimeoutError(
388
+ f"SERP request timed out: {e}",
389
+ original_error=e
390
+ )
391
+ except requests.RequestException as e:
392
+ raise ThordataNetworkError(
393
+ f"SERP request failed: {e}",
394
+ original_error=e
395
+ )
396
+
397
+ # =========================================================================
398
+ # Universal Scraping API (Web Unlocker) Methods
399
+ # =========================================================================
137
400
 
138
401
  def universal_scrape(
139
402
  self,
140
403
  url: str,
404
+ *,
141
405
  js_render: bool = False,
142
- output_format: str = "HTML",
406
+ output_format: str = "html",
143
407
  country: Optional[str] = None,
144
- block_resources: bool = False
408
+ block_resources: Optional[str] = None,
409
+ wait: Optional[int] = None,
410
+ wait_for: Optional[str] = None,
411
+ **kwargs: Any,
145
412
  ) -> Union[str, bytes]:
146
413
  """
147
- Unlock target pages via the Universal Scraping API.
148
- Bypasses Cloudflare, CAPTCHAs, and antibot systems automatically.
414
+ Scrape a URL using the Universal Scraping API (Web Unlocker).
415
+
416
+ Automatically bypasses Cloudflare, CAPTCHAs, and antibot systems.
149
417
 
150
418
  Args:
151
- url (str): Target URL.
152
- js_render (bool): Whether to render JavaScript (Headless Browser).
153
- output_format (str): "HTML" or "PNG" (screenshot).
154
- country (Optional[str]): Geo-targeting country code (e.g., 'us').
155
- block_resources (bool): Block images/css to speed up loading.
419
+ url: Target URL.
420
+ js_render: Enable JavaScript rendering (headless browser).
421
+ output_format: "html" or "png" (screenshot).
422
+ country: Geo-targeting country code.
423
+ block_resources: Resources to block (e.g., 'script,image').
424
+ wait: Wait time in milliseconds after page load.
425
+ wait_for: CSS selector to wait for.
426
+ **kwargs: Additional parameters.
156
427
 
157
428
  Returns:
158
- Union[str, bytes]: HTML string or PNG bytes.
429
+ HTML string or PNG bytes depending on output_format.
430
+
431
+ Example:
432
+ >>> # Get HTML
433
+ >>> html = client.universal_scrape("https://example.com", js_render=True)
434
+ >>>
435
+ >>> # Get screenshot
436
+ >>> png = client.universal_scrape(
437
+ ... "https://example.com",
438
+ ... js_render=True,
439
+ ... output_format="png"
440
+ ... )
441
+ >>> with open("screenshot.png", "wb") as f:
442
+ ... f.write(png)
159
443
  """
160
- headers = {
161
- "Authorization": f"Bearer {self.scraper_token}",
162
- "Content-Type": "application/x-www-form-urlencoded"
163
- }
164
-
165
- payload = {
166
- "url": url,
167
- "js_render": "True" if js_render else "False",
168
- "type": output_format.lower(),
169
- "block_resources": "True" if block_resources else "False"
170
- }
171
- if country:
172
- payload["country"] = country
173
-
174
- logger.info(f"Universal Scrape: {url} (Format: {output_format})")
444
+ request = UniversalScrapeRequest(
445
+ url=url,
446
+ js_render=js_render,
447
+ output_format=output_format,
448
+ country=country,
449
+ block_resources=block_resources,
450
+ wait=wait,
451
+ wait_for=wait_for,
452
+ extra_params=kwargs,
453
+ )
454
+
455
+ return self.universal_scrape_advanced(request)
175
456
 
457
+ def universal_scrape_advanced(
458
+ self,
459
+ request: UniversalScrapeRequest
460
+ ) -> Union[str, bytes]:
461
+ """
462
+ Scrape using a UniversalScrapeRequest object for full control.
463
+
464
+ Args:
465
+ request: A UniversalScrapeRequest with all parameters.
466
+
467
+ Returns:
468
+ HTML string or PNG bytes.
469
+ """
470
+ payload = request.to_payload()
471
+ headers = build_auth_headers(self.scraper_token)
472
+
473
+ logger.info(f"Universal Scrape: {request.url} (format: {request.output_format})")
474
+
176
475
  try:
177
- response = self.session.post(
178
- self.UNIVERSAL_API_URL,
476
+ response = self._session.post(
477
+ self._universal_url,
179
478
  data=payload,
180
479
  headers=headers,
181
- timeout=60
480
+ timeout=60,
182
481
  )
183
482
  response.raise_for_status()
483
+
484
+ return self._process_universal_response(response, request.output_format)
485
+
486
+ except requests.Timeout as e:
487
+ raise ThordataTimeoutError(
488
+ f"Universal scrape timed out: {e}",
489
+ original_error=e
490
+ )
491
+ except requests.RequestException as e:
492
+ raise ThordataNetworkError(
493
+ f"Universal scrape failed: {e}",
494
+ original_error=e
495
+ )
184
496
 
185
- # Attempt to parse JSON wrapper
186
- try:
187
- resp_json = response.json()
188
- except json.JSONDecodeError:
189
- # Fallback: if the API returns raw content directly
190
- if output_format.upper() == "PNG":
191
- return response.content
192
- return response.text
193
-
194
- # Check for API-level errors inside the JSON
195
- if isinstance(resp_json, dict) and resp_json.get("code") \
196
- and resp_json.get("code") != 200:
197
- raise Exception(f"Universal API Error: {resp_json}")
198
-
199
- # Case 1: Return HTML
200
- if "html" in resp_json:
201
- return resp_json["html"]
202
-
203
- # Case 2: Return PNG Image
204
- if "png" in resp_json:
205
- png_str = resp_json["png"]
206
- if not png_str:
207
- raise Exception("API returned empty PNG data")
208
-
209
- # Clean Data URI Scheme if present (e.g., data:image/png;base64,...)
210
- if "," in png_str:
211
- png_str = png_str.split(",", 1)[1]
212
-
213
- # Fix Base64 Padding
214
- png_str = png_str.replace("\n", "").replace("\r", "")
215
- missing_padding = len(png_str) % 4
216
- if missing_padding:
217
- png_str += '=' * (4 - missing_padding)
218
-
219
- return base64.b64decode(png_str)
220
-
221
- # Fallback
222
- return str(resp_json)
497
+ def _process_universal_response(
498
+ self,
499
+ response: requests.Response,
500
+ output_format: str
501
+ ) -> Union[str, bytes]:
502
+ """Process the response from Universal API."""
503
+ # Try to parse as JSON
504
+ try:
505
+ resp_json = response.json()
506
+ except ValueError:
507
+ # Raw content returned
508
+ if output_format.lower() == "png":
509
+ return response.content
510
+ return response.text
511
+
512
+ # Check for API-level errors
513
+ if isinstance(resp_json, dict):
514
+ code = resp_json.get("code")
515
+ if code is not None and code != 200:
516
+ msg = extract_error_message(resp_json)
517
+ raise_for_code(
518
+ f"Universal API Error: {msg}",
519
+ code=code,
520
+ payload=resp_json
521
+ )
522
+
523
+ # Extract HTML
524
+ if "html" in resp_json:
525
+ return resp_json["html"]
526
+
527
+ # Extract PNG
528
+ if "png" in resp_json:
529
+ return decode_base64_image(resp_json["png"])
530
+
531
+ # Fallback
532
+ return str(resp_json)
223
533
 
224
- except Exception as e:
225
- logger.error(f"Universal Scrape Failed: {e}")
226
- raise
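The two-step flow above (build a UniversalScrapeRequest, then pass it to universal_scrape_advanced) can be exercised as sketched below. The import path is assumed to mirror the client's relative import from .models, and the wait value and selector are placeholders.

    from thordata import ThordataClient
    from thordata.models import UniversalScrapeRequest  # assumed path, mirrors the .models import above

    client = ThordataClient(scraper_token="your_scraper_token")

    request = UniversalScrapeRequest(
        url="https://example.com",
        js_render=True,
        output_format="png",     # "html" or "png"
        country="us",
        wait=2000,               # milliseconds to wait after page load
        wait_for="#content",     # placeholder CSS selector
    )
    screenshot = client.universal_scrape_advanced(request)
    with open("screenshot.png", "wb") as f:
        f.write(screenshot)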
534
+ # =========================================================================
535
+ # Web Scraper API (Task-based) Methods
536
+ # =========================================================================
227
537
 
228
538
  def create_scraper_task(
229
539
  self,
230
540
  file_name: str,
231
541
  spider_id: str,
232
542
  spider_name: str,
233
- individual_params: Dict[str, Any],
234
- universal_params: Optional[Dict[str, Any]] = None
543
+ parameters: Dict[str, Any],
544
+ universal_params: Optional[Dict[str, Any]] = None,
235
545
  ) -> str:
236
546
  """
237
- Create a generic Web Scraper Task (Async).
547
+ Create an asynchronous Web Scraper task.
238
548
 
239
- IMPORTANT: You must retrieve the correct 'spider_id' and 'spider_name'
240
- from the Thordata Dashboard before calling this method.
549
+ Note: Get spider_id and spider_name from the Thordata Dashboard.
241
550
 
242
551
  Args:
243
- file_name (str): Name for the output file.
244
- spider_id (str): The ID of the spider (from Dashboard).
245
- spider_name (str): The name of the spider (e.g., "youtube.com").
246
- individual_params (Dict): Parameters specific to the spider.
247
- universal_params (Optional[Dict]): Global settings for the scraper.
552
+ file_name: Name for the output file.
553
+ spider_id: Spider identifier from Dashboard.
554
+ spider_name: Spider name (e.g., "youtube.com").
555
+ parameters: Spider-specific parameters.
556
+ universal_params: Global spider settings.
248
557
 
249
558
  Returns:
250
- str: The created task_id.
559
+ The created task_id.
560
+
561
+ Example:
562
+ >>> task_id = client.create_scraper_task(
563
+ ... file_name="youtube_data",
564
+ ... spider_id="youtube_video-post_by-url",
565
+ ... spider_name="youtube.com",
566
+ ... parameters={"url": "https://youtube.com/@channel/videos"}
567
+ ... )
251
568
  """
252
- headers = {
253
- "Authorization": f"Bearer {self.scraper_token}",
254
- "Content-Type": "application/x-www-form-urlencoded"
255
- }
256
-
257
- # Payload construction
258
- payload = {
259
- "spider_name": spider_name,
260
- "spider_id": spider_id,
261
- "spider_parameters": json.dumps([individual_params]),
262
- "spider_errors": "true",
263
- "file_name": file_name
264
- }
265
- if universal_params:
266
- payload["spider_universal"] = json.dumps(universal_params)
569
+ config = ScraperTaskConfig(
570
+ file_name=file_name,
571
+ spider_id=spider_id,
572
+ spider_name=spider_name,
573
+ parameters=parameters,
574
+ universal_params=universal_params,
575
+ )
576
+
577
+ return self.create_scraper_task_advanced(config)
267
578
 
268
- logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
579
+ def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
580
+ """
581
+ Create a scraper task using a ScraperTaskConfig object.
582
+
583
+ Args:
584
+ config: Task configuration.
585
+
586
+ Returns:
587
+ The created task_id.
588
+ """
589
+ payload = config.to_payload()
590
+ headers = build_auth_headers(self.scraper_token)
591
+
592
+ logger.info(f"Creating Scraper Task: {config.spider_name}")
593
+
269
594
  try:
270
- response = self.session.post(
271
- self.SCRAPER_BUILDER_URL,
595
+ response = self._session.post(
596
+ self._builder_url,
272
597
  data=payload,
273
- headers=headers
598
+ headers=headers,
599
+ timeout=30,
274
600
  )
275
601
  response.raise_for_status()
602
+
276
603
  data = response.json()
277
-
278
- if data.get("code") != 200:
279
- raise Exception(f"Creation failed: {data}")
604
+ code = data.get("code")
605
+
606
+ if code != 200:
607
+ msg = extract_error_message(data)
608
+ raise_for_code(
609
+ f"Task creation failed: {msg}",
610
+ code=code,
611
+ payload=data
612
+ )
613
+
280
614
  return data["data"]["task_id"]
281
- except Exception as e:
282
- logger.error(f"Task Creation Failed: {e}")
283
- raise
615
+
616
+ except requests.RequestException as e:
617
+ raise ThordataNetworkError(
618
+ f"Task creation failed: {e}",
619
+ original_error=e
620
+ )
284
621
 
285
622
  def get_task_status(self, task_id: str) -> str:
286
623
  """
287
624
  Check the status of an asynchronous scraping task.
288
625
 
289
626
  Args:
290
- task_id (str): The ID returned by create_scraper_task.
627
+ task_id: The task ID from create_scraper_task.
291
628
 
292
629
  Returns:
293
- str: The status string (e.g., "finished", "running", "error").
630
+ Status string (e.g., "running", "ready", "failed").
294
631
  """
295
- headers = {
296
- "token": self.public_token,
297
- "key": self.public_key,
298
- "Content-Type": "application/x-www-form-urlencoded"
299
- }
632
+ self._require_public_credentials()
633
+
634
+ headers = build_public_api_headers(self.public_token, self.public_key)
300
635
  payload = {"tasks_ids": task_id}
301
-
636
+
302
637
  try:
303
- response = self.session.post(
304
- self.SCRAPER_STATUS_URL,
638
+ response = self._session.post(
639
+ self._status_url,
305
640
  data=payload,
306
- headers=headers
641
+ headers=headers,
642
+ timeout=30,
307
643
  )
308
644
  response.raise_for_status()
645
+
309
646
  data = response.json()
310
-
647
+
311
648
  if data.get("code") == 200 and data.get("data"):
312
649
  for item in data["data"]:
313
650
  if str(item.get("task_id")) == str(task_id):
314
- return item["status"]
315
- return "Unknown"
651
+ return item.get("status", "unknown")
652
+
653
+ return "unknown"
654
+
316
655
  except Exception as e:
317
- logger.error(f"Status Check Failed: {e}")
318
- return "Error"
656
+ logger.error(f"Status check failed: {e}")
657
+ return "error"
319
658
 
320
- def get_task_result(self, task_id: str, file_type: str = "json") -> str:
659
+ def get_task_result(
660
+ self,
661
+ task_id: str,
662
+ file_type: str = "json"
663
+ ) -> str:
321
664
  """
322
- Retrieve the download URL for a completed task.
665
+ Get the download URL for a completed task.
323
666
 
324
667
  Args:
325
- task_id (str): The task ID.
326
- file_type (str): Format required (default "json").
668
+ task_id: The task ID.
669
+ file_type: Output format ("json", "csv", "xlsx").
327
670
 
328
671
  Returns:
329
- str: The URL to download the result file.
672
+ The download URL for the result file.
330
673
  """
331
- headers = {
332
- "token": self.public_token,
333
- "key": self.public_key,
334
- "Content-Type": "application/x-www-form-urlencoded"
335
- }
674
+ self._require_public_credentials()
675
+
676
+ headers = build_public_api_headers(self.public_token, self.public_key)
336
677
  payload = {"tasks_id": task_id, "type": file_type}
337
-
678
+
338
679
  logger.info(f"Getting result URL for Task: {task_id}")
680
+
339
681
  try:
340
- response = self.session.post(
341
- self.SCRAPER_DOWNLOAD_URL,
682
+ response = self._session.post(
683
+ self._download_url,
342
684
  data=payload,
343
- headers=headers
685
+ headers=headers,
686
+ timeout=30,
344
687
  )
345
688
  response.raise_for_status()
689
+
346
690
  data = response.json()
347
-
348
- if data.get("code") == 200 and data.get("data"):
691
+ code = data.get("code")
692
+
693
+ if code == 200 and data.get("data"):
349
694
  return data["data"]["download"]
350
- raise Exception(f"API returned error: {data}")
351
- except Exception as e:
352
- logger.error(f"Get Result Failed: {e}")
353
- raise
354
-
355
- def _get_locations(self, endpoint: str, params: Dict[str, str]) -> List[Dict[str, Any]]:
356
- """
357
- Internal helper to call the public locations API.
695
+
696
+ msg = extract_error_message(data)
697
+ raise_for_code(
698
+ f"Get result failed: {msg}",
699
+ code=code,
700
+ payload=data
701
+ )
702
+
703
+ except requests.RequestException as e:
704
+ raise ThordataNetworkError(
705
+ f"Get result failed: {e}",
706
+ original_error=e
707
+ )
358
708
 
709
+ def wait_for_task(
710
+ self,
711
+ task_id: str,
712
+ *,
713
+ poll_interval: float = 5.0,
714
+ max_wait: float = 600.0,
715
+ ) -> str:
716
+ """
717
+ Wait for a task to complete.
718
+
359
719
  Args:
360
- endpoint: One of 'countries', 'states', 'cities', 'asn'.
361
- params: Query parameters (must include token, key, proxy_type, etc.)
362
-
720
+ task_id: The task ID to wait for.
721
+ poll_interval: Seconds between status checks.
722
+ max_wait: Maximum seconds to wait.
723
+
363
724
  Returns:
364
- List of location records from the 'data' field.
365
-
725
+ Final task status.
726
+
366
727
  Raises:
367
- RuntimeError: If token/key are missing or API returns an error code.
728
+ TimeoutError: If max_wait is exceeded.
729
+
730
+ Example:
731
+ >>> task_id = client.create_scraper_task(...)
732
+ >>> status = client.wait_for_task(task_id, max_wait=300)
733
+ >>> if status in ("ready", "success"):
734
+ ... url = client.get_task_result(task_id)
368
735
  """
369
- if not self.public_token or not self.public_key:
370
- raise RuntimeError(
371
- "Public API token/key are required for locations endpoints. "
372
- "Please provide 'public_token' and 'public_key' when "
373
- "initializing ThordataClient."
374
- )
375
-
376
- url = f"{self.locations_url}/{endpoint}"
377
- logger.info("Locations API request: %s", url)
378
-
379
- # Use a direct requests.get here; no need to go through the proxy gateway.
380
- response = requests.get(
381
- url,
382
- params=params,
383
- timeout=30,
736
+ import time
737
+
738
+ elapsed = 0.0
739
+
740
+ while elapsed < max_wait:
741
+ status = self.get_task_status(task_id)
742
+
743
+ logger.debug(f"Task {task_id} status: {status}")
744
+
745
+ terminal_statuses = {
746
+ "ready", "success", "finished",
747
+ "failed", "error", "cancelled"
748
+ }
749
+
750
+ if status.lower() in terminal_statuses:
751
+ return status
752
+
753
+ time.sleep(poll_interval)
754
+ elapsed += poll_interval
755
+
756
+ raise TimeoutError(
757
+ f"Task {task_id} did not complete within {max_wait} seconds"
384
758
  )
385
- response.raise_for_status()
386
759
 
387
- data = response.json()
388
- if isinstance(data, dict):
389
- code = data.get("code")
390
- if code is not None and code != 200:
391
- msg = data.get("msg", "")
392
- raise RuntimeError(
393
- f"Locations API error ({endpoint}): code={code}, msg={msg}"
394
- )
395
- return data.get("data") or []
396
- # Fallback: if backend ever returns a list directly
397
- if isinstance(data, list):
398
- return data
399
- return []
400
-
401
- def list_countries(self, proxy_type: int = 1) -> List[Dict[str, Any]]:
760
+ # =========================================================================
761
+ # Location API Methods
762
+ # =========================================================================
763
+
764
+ def list_countries(
765
+ self,
766
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
767
+ ) -> List[Dict[str, Any]]:
402
768
  """
403
- List supported countries for Thordata residential or unlimited proxies.
769
+ List supported countries for proxies.
404
770
 
405
771
  Args:
406
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
772
+ proxy_type: 1 for residential, 2 for unlimited.
407
773
 
408
774
  Returns:
409
- List[Dict[str, Any]]: Each record contains 'country_code' and 'country_name'.
775
+ List of country records with 'country_code' and 'country_name'.
410
776
  """
411
- params = {
412
- "token": self.public_token,
413
- "key": self.public_key,
414
- "proxy_type": str(proxy_type),
415
- }
416
- return self._get_locations("countries", params)
777
+ return self._get_locations(
778
+ "countries",
779
+ proxy_type=int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
780
+ )
417
781
 
418
- def list_states(self, country_code: str, proxy_type: int = 1) -> List[Dict[str, Any]]:
782
+ def list_states(
783
+ self,
784
+ country_code: str,
785
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
786
+ ) -> List[Dict[str, Any]]:
419
787
  """
420
- List supported states for a given country.
788
+ List supported states for a country.
421
789
 
422
790
  Args:
423
- country_code (str): Country code (e.g., 'US').
424
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
791
+ country_code: Country code (e.g., 'US').
792
+ proxy_type: Proxy type.
425
793
 
426
794
  Returns:
427
- List[Dict[str, Any]]: Each record contains 'state_code' and 'state_name'.
795
+ List of state records.
428
796
  """
429
- params = {
430
- "token": self.public_token,
431
- "key": self.public_key,
432
- "proxy_type": str(proxy_type),
433
- "country_code": country_code,
434
- }
435
- return self._get_locations("states", params)
797
+ return self._get_locations(
798
+ "states",
799
+ proxy_type=int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type,
800
+ country_code=country_code
801
+ )
436
802
 
437
803
  def list_cities(
438
804
  self,
439
805
  country_code: str,
440
806
  state_code: Optional[str] = None,
441
- proxy_type: int = 1,
807
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
442
808
  ) -> List[Dict[str, Any]]:
443
809
  """
444
- List supported cities for a given country (and optional state).
810
+ List supported cities for a country/state.
445
811
 
446
812
  Args:
447
- country_code (str): Country code (e.g., 'US').
448
- state_code (Optional[str]): State code (e.g., 'alabama'), if applicable.
449
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
813
+ country_code: Country code.
814
+ state_code: Optional state code.
815
+ proxy_type: Proxy type.
450
816
 
451
817
  Returns:
452
- List[Dict[str, Any]]: Each record contains 'city_code' and 'city_name'.
818
+ List of city records.
453
819
  """
454
- params: Dict[str, str] = {
455
- "token": self.public_token,
456
- "key": self.public_key,
457
- "proxy_type": str(proxy_type),
458
- "country_code": country_code,
820
+ kwargs = {
821
+ "proxy_type": int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type,
822
+ "country_code": country_code
459
823
  }
460
824
  if state_code:
461
- params["state_code"] = state_code
462
-
463
- return self._get_locations("cities", params)
825
+ kwargs["state_code"] = state_code
826
+
827
+ return self._get_locations("cities", **kwargs)
464
828
 
465
829
  def list_asn(
466
830
  self,
467
831
  country_code: str,
468
- proxy_type: int = 1,
832
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
469
833
  ) -> List[Dict[str, Any]]:
470
834
  """
471
- List supported ASNs for a given country.
835
+ List supported ASNs for a country.
472
836
 
473
837
  Args:
474
- country_code (str): Country code (e.g., 'US').
475
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
838
+ country_code: Country code.
839
+ proxy_type: Proxy type.
476
840
 
477
841
  Returns:
478
- List[Dict[str, Any]]: Each record contains 'asn_code' and 'asn_name'.
842
+ List of ASN records.
479
843
  """
844
+ return self._get_locations(
845
+ "asn",
846
+ proxy_type=int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type,
847
+ country_code=country_code
848
+ )
849
+
850
+ def _get_locations(
851
+ self,
852
+ endpoint: str,
853
+ **kwargs: Any
854
+ ) -> List[Dict[str, Any]]:
855
+ """Internal method to call locations API."""
856
+ self._require_public_credentials()
857
+
480
858
  params = {
481
859
  "token": self.public_token,
482
860
  "key": self.public_key,
483
- "proxy_type": str(proxy_type),
484
- "country_code": country_code,
485
861
  }
486
- return self._get_locations("asn", params)
862
+
863
+ for key, value in kwargs.items():
864
+ params[key] = str(value)
865
+
866
+ url = f"{self.LOCATIONS_URL}/{endpoint}"
867
+
868
+ logger.debug(f"Locations API request: {url}")
869
+
870
+ # Use requests.get directly (no proxy needed for this API)
871
+ response = requests.get(url, params=params, timeout=30)
872
+ response.raise_for_status()
873
+
874
+ data = response.json()
875
+
876
+ if isinstance(data, dict):
877
+ code = data.get("code")
878
+ if code is not None and code != 200:
879
+ msg = data.get("msg", "")
880
+ raise RuntimeError(
881
+ f"Locations API error ({endpoint}): code={code}, msg={msg}"
882
+ )
883
+ return data.get("data") or []
884
+
885
+ if isinstance(data, list):
886
+ return data
887
+
888
+ return []
889
+
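A sketch of the location lookups above. They require the public credentials; ProxyType is assumed importable from thordata.enums (mirroring the client's relative import), and the record keys follow the docstrings (country_code, state_code, city_name).

    from thordata import ThordataClient
    from thordata.enums import ProxyType  # assumed path, mirrors the .enums import above

    client = ThordataClient(
        scraper_token="your_scraper_token",
        public_token="your_public_token",   # the locations API requires the public credentials
        public_key="your_public_key",
    )

    countries = client.list_countries(proxy_type=ProxyType.RESIDENTIAL)
    print([c["country_code"] for c in countries[:5]])

    states = client.list_states("US")
    if states:
        cities = client.list_cities("US", state_code=states[0]["state_code"])
        print([c["city_name"] for c in cities[:5]])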
890
+ # =========================================================================
891
+ # Helper Methods
892
+ # =========================================================================
893
+
894
+ def _require_public_credentials(self) -> None:
895
+ """Ensure public API credentials are available."""
896
+ if not self.public_token or not self.public_key:
897
+ raise ThordataConfigError(
898
+ "public_token and public_key are required for this operation. "
899
+ "Please provide them when initializing ThordataClient."
900
+ )
901
+
902
+ def _request_with_retry(
903
+ self,
904
+ method: str,
905
+ url: str,
906
+ **kwargs: Any
907
+ ) -> requests.Response:
908
+ """Make a request with automatic retry."""
909
+ kwargs.setdefault("timeout", self._default_timeout)
910
+
911
+ @with_retry(self._retry_config)
912
+ def _do_request() -> requests.Response:
913
+ return self._session.request(method, url, **kwargs)
914
+
915
+ try:
916
+ return _do_request()
917
+ except requests.Timeout as e:
918
+ raise ThordataTimeoutError(
919
+ f"Request timed out: {e}",
920
+ original_error=e
921
+ )
922
+ except requests.RequestException as e:
923
+ raise ThordataNetworkError(
924
+ f"Request failed: {e}",
925
+ original_error=e
926
+ )
927
+
928
+ def close(self) -> None:
929
+ """Close the underlying session."""
930
+ self._session.close()
931
+
932
+ def __enter__(self) -> "ThordataClient":
933
+ return self
934
+
935
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
936
+ self.close()
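Because close, __enter__ and __exit__ are now defined, the client can be used as a context manager so the underlying session is closed automatically. A combined usage sketch, based on the docstring examples in this diff:

    from thordata import ThordataClient
    from thordata.models import ProxyConfig

    with ThordataClient(scraper_token="your_scraper_token") as client:
        # Plain request through the default gateway
        print(client.get("https://httpbin.org/ip").json())

        # Geo-targeted request, mirroring the get() docstring example
        config = ProxyConfig(username="myuser", password="mypass", country="us", city="seattle")
        print(client.get("https://httpbin.org/ip", proxy_config=config).json())

        # Real-time SERP search
        results = client.serp_search("python tutorial", engine="google", num=10, country="us")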