thordata-sdk 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
thordata/async_client.py CHANGED
@@ -1,281 +1,768 @@
-import aiohttp
+"""
+Asynchronous client for the Thordata API.
+
+This module provides the AsyncThordataClient for high-concurrency workloads,
+built on aiohttp.
+
+Example:
+    >>> import asyncio
+    >>> from thordata import AsyncThordataClient
+    >>>
+    >>> async def main():
+    ...     async with AsyncThordataClient(
+    ...         scraper_token="your_token",
+    ...         public_token="your_public_token",
+    ...         public_key="your_public_key"
+    ...     ) as client:
+    ...         response = await client.get("https://httpbin.org/ip")
+    ...         print(await response.json())
+    >>>
+    >>> asyncio.run(main())
+"""
+
+from __future__ import annotations
+
+import asyncio
 import logging
-import json
-import base64
-from typing import Optional, Dict, Any, Union
+from typing import Any, Dict, List, Optional, Union
+
+import os
+import aiohttp
 
-# Import shared logic
-from .enums import Engine
-from .parameters import normalize_serp_params
+from ._utils import (
+    build_auth_headers,
+    build_public_api_headers,
+    decode_base64_image,
+    extract_error_message,
+    parse_json_response,
+)
+from .enums import Engine, ProxyType
+from .exceptions import (
+    ThordataConfigError,
+    ThordataNetworkError,
+    ThordataTimeoutError,
+    raise_for_code,
+)
+from .models import (
+    ProxyConfig,
+    ScraperTaskConfig,
+    SerpRequest,
+    UniversalScrapeRequest,
+)
+from .retry import RetryConfig
 
 logger = logging.getLogger(__name__)
 
 
 class AsyncThordataClient:
     """
-    The official Asynchronous Python client for Thordata (built on aiohttp).
+    The official asynchronous Python client for Thordata.
+
     Designed for high-concurrency AI agents and data pipelines.
+
+    Args:
+        scraper_token: The API token from your Dashboard.
+        public_token: The public API token.
+        public_key: The public API key.
+        proxy_host: Custom proxy gateway host.
+        proxy_port: Custom proxy gateway port.
+        timeout: Default request timeout in seconds.
+        retry_config: Configuration for automatic retries.
+
+    Example:
+        >>> async with AsyncThordataClient(
+        ...     scraper_token="token",
+        ...     public_token="pub_token",
+        ...     public_key="pub_key"
+        ... ) as client:
+        ...     results = await client.serp_search("python")
     """
 
+    # API Endpoints (same as sync client)
+    BASE_URL = "https://scraperapi.thordata.com"
+    UNIVERSAL_URL = "https://universalapi.thordata.com"
+    API_URL = "https://api.thordata.com/api/web-scraper-api"
+    LOCATIONS_URL = "https://api.thordata.com/api/locations"
+
     def __init__(
         self,
         scraper_token: str,
-        public_token: str,
-        public_key: str,
-        proxy_host: str = "gate.thordata.com",
-        proxy_port: int = 22225
-    ):
-        """
-        Initialize the Async Client.
-        """
+        public_token: Optional[str] = None,
+        public_key: Optional[str] = None,
+        proxy_host: str = "pr.thordata.net",
+        proxy_port: int = 9999,
+        timeout: int = 30,
+        retry_config: Optional[RetryConfig] = None,
+        scraperapi_base_url: Optional[str] = None,
+        universalapi_base_url: Optional[str] = None,
+        web_scraper_api_base_url: Optional[str] = None,
+        locations_base_url: Optional[str] = None,
+    ) -> None:
+        """Initialize the Async Thordata Client."""
+        if not scraper_token:
+            raise ThordataConfigError("scraper_token is required")
+
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
 
-        # Pre-calculate proxy auth for performance
-        self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
-        self.proxy_url = f"http://{proxy_host}:{proxy_port}"
-
-        # API Endpoints
-        self.base_url = "https://scraperapi.thordata.com"
-        self.universal_url = "https://universalapi.thordata.com"
-        self.api_url = "https://api.thordata.com/api/web-scraper-api"
-
-        self.SERP_API_URL = f"{self.base_url}/request"
-        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
-        self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
-        self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
-        self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
-
-        # Session is initialized lazily or via context manager
+        # Proxy configuration
+        self._proxy_host = proxy_host
+        self._proxy_port = proxy_port
+        self._default_timeout = aiohttp.ClientTimeout(total=timeout)
+
+        # Retry configuration
+        self._retry_config = retry_config or RetryConfig()
+
+        # Pre-calculate proxy auth
+        self._proxy_url = f"http://{proxy_host}:{proxy_port}"
+        self._proxy_auth = aiohttp.BasicAuth(
+            login=f"td-customer-{scraper_token}", password=""
+        )
+
+        # Base URLs (allow override via args or env vars for testing and custom routing)
+        scraperapi_base = (
+            scraperapi_base_url
+            or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
+            or self.BASE_URL
+        ).rstrip("/")
+
+        universalapi_base = (
+            universalapi_base_url
+            or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
+            or self.UNIVERSAL_URL
+        ).rstrip("/")
+
+        web_scraper_api_base = (
+            web_scraper_api_base_url
+            or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
+            or self.API_URL
+        ).rstrip("/")
+
+        locations_base = (
+            locations_base_url
+            or os.getenv("THORDATA_LOCATIONS_BASE_URL")
+            or self.LOCATIONS_URL
+        ).rstrip("/")
+
+        self._serp_url = f"{scraperapi_base}/request"
+        self._builder_url = f"{scraperapi_base}/builder"
+        self._universal_url = f"{universalapi_base}/request"
+        self._status_url = f"{web_scraper_api_base}/tasks-status"
+        self._download_url = f"{web_scraper_api_base}/tasks-download"
+        self._locations_base_url = locations_base
+
+        # Session initialized lazily
         self._session: Optional[aiohttp.ClientSession] = None
 
-    async def __aenter__(self):
+    async def __aenter__(self) -> AsyncThordataClient:
+        """Async context manager entry."""
         if self._session is None or self._session.closed:
-            self._session = aiohttp.ClientSession(trust_env=True)
+            self._session = aiohttp.ClientSession(
+                timeout=self._default_timeout, trust_env=True
+            )
         return self
 
-    async def __aexit__(self, exc_type, exc, tb):
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Async context manager exit."""
         await self.close()
 
-    async def close(self):
+    async def close(self) -> None:
         """Close the underlying aiohttp session."""
         if self._session and not self._session.closed:
             await self._session.close()
             self._session = None
 
     def _get_session(self) -> aiohttp.ClientSession:
-        """Internal helper to ensure session exists."""
+        """Get the session, raising if not initialized."""
         if self._session is None or self._session.closed:
             raise RuntimeError(
-                "Client session not initialized. Use 'async with ThordataClient(...) as client:'"
+                "Client session not initialized. "
+                "Use 'async with AsyncThordataClient(...) as client:'"
             )
         return self._session
 
-    async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
+    # =========================================================================
+    # Proxy Network Methods
+    # =========================================================================
+
+    async def get(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        **kwargs: Any,
+    ) -> aiohttp.ClientResponse:
         """
         Send an async GET request through the Proxy Network.
+
+        Args:
+            url: The target URL.
+            proxy_config: Custom proxy configuration.
+            **kwargs: Additional aiohttp arguments.
+
+        Returns:
+            The aiohttp response object.
         """
         session = self._get_session()
+
+        logger.debug(f"Async Proxy GET: {url}")
+
+        if proxy_config:
+            proxy_url, proxy_auth = proxy_config.to_aiohttp_config()
+        else:
+            proxy_url = self._proxy_url
+            proxy_auth = self._proxy_auth
+
         try:
-            logger.debug(f"Async Proxy Request: {url}")
             return await session.get(
-                url,
-                proxy=self.proxy_url,
-                proxy_auth=self.proxy_auth,
-                **kwargs
+                url, proxy=proxy_url, proxy_auth=proxy_auth, **kwargs
+            )
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"Async request timed out: {e}", original_error=e
             )
         except aiohttp.ClientError as e:
-            logger.error(f"Async Request failed: {e}")
-            raise
+            raise ThordataNetworkError(f"Async request failed: {e}", original_error=e)
+
+    async def post(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        **kwargs: Any,
+    ) -> aiohttp.ClientResponse:
+        """
+        Send an async POST request through the Proxy Network.
+
+        Args:
+            url: The target URL.
+            proxy_config: Custom proxy configuration.
+            **kwargs: Additional aiohttp arguments.
+
+        Returns:
+            The aiohttp response object.
+        """
+        session = self._get_session()
+
+        logger.debug(f"Async Proxy POST: {url}")
+
+        if proxy_config:
+            proxy_url, proxy_auth = proxy_config.to_aiohttp_config()
+        else:
+            proxy_url = self._proxy_url
+            proxy_auth = self._proxy_auth
+
+        try:
+            return await session.post(
+                url, proxy=proxy_url, proxy_auth=proxy_auth, **kwargs
+            )
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"Async request timed out: {e}", original_error=e
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(f"Async request failed: {e}", original_error=e)
+
+    # =========================================================================
+    # SERP API Methods
+    # =========================================================================
 
     async def serp_search(
-        self,
-        query: str,
-        engine: Union[Engine, str] = Engine.GOOGLE,
-        num: int = 10,
-        **kwargs
+        self,
+        query: str,
+        *,
+        engine: Union[Engine, str] = Engine.GOOGLE,
+        num: int = 10,
+        country: Optional[str] = None,
+        language: Optional[str] = None,
+        search_type: Optional[str] = None,
+        device: Optional[str] = None,
+        render_js: Optional[bool] = None,
+        no_cache: Optional[bool] = None,
+        output_format: str = "json",
+        **kwargs: Any,
     ) -> Dict[str, Any]:
         """
-        Execute a real-time SERP search (Async).
+        Execute an async SERP search.
+
+        Args:
+            query: Search keywords.
+            engine: Search engine.
+            num: Number of results.
+            country: Country code for localization.
+            language: Language code.
+            search_type: Type of search.
+            device: Device type ('desktop', 'mobile', 'tablet').
+            render_js: Enable JavaScript rendering in SERP.
+            no_cache: Disable internal caching.
+            output_format: 'json' or 'html'.
+            **kwargs: Additional parameters.
+
+        Returns:
+            Parsed JSON results or dict with 'html' key.
         """
         session = self._get_session()
 
-        # 1. Handle Enum conversion
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
 
-        # 2. Normalize parameters
-        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
-
-        headers = {
-            "Authorization": f"Bearer {self.scraper_token}",
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        request = SerpRequest(
+            query=query,
+            engine=engine_str,
+            num=num,
+            country=country,
+            language=language,
+            search_type=search_type,
+            device=device,
+            render_js=render_js,
+            no_cache=no_cache,
+            output_format=output_format,
+            extra_params=kwargs,
+        )
+
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
 
-        # 3. Execute Request
         logger.info(f"Async SERP Search: {engine_str} - {query}")
-        async with session.post(
-            self.SERP_API_URL, data=payload, headers=headers
-        ) as response:
-            response.raise_for_status()
-
-            data = await response.json()
-            # Handle double-encoded JSON strings if they occur
-            if isinstance(data, str):
-                try:
-                    data = json.loads(data)
-                except json.JSONDecodeError:
-                    pass
-            return data
+
+        try:
+            async with session.post(
+                self._serp_url,
+                data=payload,
+                headers=headers,
+            ) as response:
+                response.raise_for_status()
+
+                if output_format.lower() == "json":
+                    data = await response.json()
+
+                    if isinstance(data, dict):
+                        code = data.get("code")
+                        if code is not None and code != 200:
+                            msg = extract_error_message(data)
+                            raise_for_code(
+                                f"SERP API Error: {msg}",
+                                code=code,
+                                payload=data,
+                            )
+
+                    return parse_json_response(data)
+
+                text = await response.text()
+                return {"html": text}
+
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e,
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e,
+            )
+
+    async def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
+        """
+        Execute an async SERP search using a SerpRequest object.
+        """
+        session = self._get_session()
+
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"Async SERP Advanced: {request.engine} - {request.query}")
+
+        try:
+            async with session.post(
+                self._serp_url,
+                data=payload,
+                headers=headers,
+            ) as response:
+                response.raise_for_status()
+
+                if request.output_format.lower() == "json":
+                    data = await response.json()
+
+                    if isinstance(data, dict):
+                        code = data.get("code")
+                        if code is not None and code != 200:
+                            msg = extract_error_message(data)
+                            raise_for_code(
+                                f"SERP API Error: {msg}",
+                                code=code,
+                                payload=data,
+                            )
+
+                    return parse_json_response(data)
+
+                text = await response.text()
+                return {"html": text}
+
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e,
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e,
+            )
+
+    # =========================================================================
+    # Universal Scraping API Methods
+    # =========================================================================
 
     async def universal_scrape(
         self,
         url: str,
+        *,
         js_render: bool = False,
-        output_format: str = "HTML",
+        output_format: str = "html",
         country: Optional[str] = None,
-        block_resources: bool = False
+        block_resources: Optional[str] = None,
+        wait: Optional[int] = None,
+        wait_for: Optional[str] = None,
+        **kwargs: Any,
+    ) -> Union[str, bytes]:
+        """
+        Async scrape using Universal API (Web Unlocker).
+
+        Args:
+            url: Target URL.
+            js_render: Enable JavaScript rendering.
+            output_format: "html" or "png".
+            country: Geo-targeting country.
+            block_resources: Resources to block.
+            wait: Wait time in ms.
+            wait_for: CSS selector to wait for.
+
+        Returns:
+            HTML string or PNG bytes.
+        """
+        request = UniversalScrapeRequest(
+            url=url,
+            js_render=js_render,
+            output_format=output_format,
+            country=country,
+            block_resources=block_resources,
+            wait=wait,
+            wait_for=wait_for,
+            extra_params=kwargs,
+        )
+
+        return await self.universal_scrape_advanced(request)
+
+    async def universal_scrape_advanced(
+        self, request: UniversalScrapeRequest
     ) -> Union[str, bytes]:
         """
-        Async Universal Scraping (Bypass Cloudflare/CAPTCHA).
+        Async scrape using a UniversalScrapeRequest object.
         """
         session = self._get_session()
 
-        headers = {
-            "Authorization": f"Bearer {self.scraper_token}",
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
 
-        payload = {
-            "url": url,
-            "js_render": "True" if js_render else "False",
-            "type": output_format.lower(),
-            "block_resources": "True" if block_resources else "False"
-        }
-        if country:
-            payload["country"] = country
-
-        logger.info(f"Async Universal Scrape: {url}")
-        async with session.post(
-            self.UNIVERSAL_API_URL, data=payload, headers=headers
-        ) as response:
-            response.raise_for_status()
-
-            try:
-                resp_json = await response.json()
-            except json.JSONDecodeError:
-                # Fallback for raw content
-                if output_format.upper() == "PNG":
-                    return await response.read()
-                return await response.text()
-
-            # Check API error codes
-            if isinstance(resp_json, dict) and resp_json.get("code") \
-                    and resp_json.get("code") != 200:
-                raise Exception(f"Universal API Error: {resp_json}")
-
-            if "html" in resp_json:
-                return resp_json["html"]
-
-            if "png" in resp_json:
-                png_str = resp_json["png"]
-                if not png_str:
-                    raise Exception("API returned empty PNG data")
-
-                # Clean Data URI Scheme
-                if "," in png_str:
-                    png_str = png_str.split(",", 1)[1]
-
-                # Fix Base64 Padding
-                png_str = png_str.replace("\n", "").replace("\r", "")
-                missing_padding = len(png_str) % 4
-                if missing_padding:
-                    png_str += '=' * (4 - missing_padding)
-
-                return base64.b64decode(png_str)
-
-            return str(resp_json)
+        logger.info(f"Async Universal Scrape: {request.url}")
+
+        try:
+            async with session.post(
+                self._universal_url, data=payload, headers=headers
+            ) as response:
+                response.raise_for_status()
+
+                try:
+                    resp_json = await response.json()
+                except ValueError:
+                    if request.output_format.lower() == "png":
+                        return await response.read()
+                    return await response.text()
+
+                # Check for API errors
+                if isinstance(resp_json, dict):
+                    code = resp_json.get("code")
+                    if code is not None and code != 200:
+                        msg = extract_error_message(resp_json)
+                        raise_for_code(
+                            f"Universal API Error: {msg}", code=code, payload=resp_json
+                        )
+
+                    if "html" in resp_json:
+                        return resp_json["html"]
+
+                    if "png" in resp_json:
+                        return decode_base64_image(resp_json["png"])
+
+                return str(resp_json)
+
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"Universal scrape timed out: {e}", original_error=e
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"Universal scrape failed: {e}", original_error=e
+            )
+
+    # =========================================================================
+    # Web Scraper API Methods
+    # =========================================================================
 
     async def create_scraper_task(
         self,
         file_name: str,
         spider_id: str,
         spider_name: str,
-        individual_params: Dict[str, Any],
-        universal_params: Optional[Dict[str, Any]] = None
+        parameters: Dict[str, Any],
+        universal_params: Optional[Dict[str, Any]] = None,
     ) -> str:
         """
-        Create an Asynchronous Web Scraper Task.
+        Create an async Web Scraper task.
+        """
+        config = ScraperTaskConfig(
+            file_name=file_name,
+            spider_id=spider_id,
+            spider_name=spider_name,
+            parameters=parameters,
+            universal_params=universal_params,
+        )
+
+        return await self.create_scraper_task_advanced(config)
+
+    async def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
+        """
+        Create a task using ScraperTaskConfig.
         """
         session = self._get_session()
 
-        headers = {
-            "Authorization": f"Bearer {self.scraper_token}",
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        payload = config.to_payload()
+        headers = build_auth_headers(self.scraper_token)
 
-        payload = {
-            "file_name": file_name,
-            "spider_id": spider_id,
-            "spider_name": spider_name,
-            "spider_parameters": json.dumps([individual_params]),
-            "spider_errors": "true"
-        }
-        if universal_params:
-            payload["spider_universal"] = json.dumps(universal_params)
-
-        logger.info(f"Async Task Creation: {spider_name}")
-        async with session.post(
-            self.SCRAPER_BUILDER_URL, data=payload, headers=headers
-        ) as response:
-            response.raise_for_status()
-            data = await response.json()
-
-            if data.get("code") != 200:
-                raise Exception(f"Creation failed: {data}")
-            return data["data"]["task_id"]
+        logger.info(f"Async Task Creation: {config.spider_name}")
+
+        try:
+            async with session.post(
+                self._builder_url, data=payload, headers=headers
+            ) as response:
+                response.raise_for_status()
+                data = await response.json()
+
+                code = data.get("code")
+                if code != 200:
+                    msg = extract_error_message(data)
+                    raise_for_code(
+                        f"Task creation failed: {msg}", code=code, payload=data
+                    )
+
+                return data["data"]["task_id"]
+
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(f"Task creation failed: {e}", original_error=e)
 
     async def get_task_status(self, task_id: str) -> str:
         """
-        Check task status.
+        Check async task status.
         """
+        self._require_public_credentials()
         session = self._get_session()
 
-        headers = {
-            "token": self.public_token,
-            "key": self.public_key,
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        headers = build_public_api_headers(
+            self.public_token or "", self.public_key or ""
+        )
         payload = {"tasks_ids": task_id}
 
-        async with session.post(
-            self.SCRAPER_STATUS_URL, data=payload, headers=headers
-        ) as response:
-            data = await response.json()
-            if data.get("code") == 200 and data.get("data"):
-                for item in data["data"]:
-                    if str(item.get("task_id")) == str(task_id):
-                        return item["status"]
-            return "Unknown"
+        try:
+            async with session.post(
+                self._status_url, data=payload, headers=headers
+            ) as response:
+                data = await response.json()
+
+                if data.get("code") == 200 and data.get("data"):
+                    for item in data["data"]:
+                        if str(item.get("task_id")) == str(task_id):
+                            return item.get("status", "unknown")
+
+                return "unknown"
+
+        except Exception as e:
+            logger.error(f"Async status check failed: {e}")
+            return "error"
 
     async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
         """
-        Get the download URL for a finished task.
+        Get download URL for completed task.
         """
+        self._require_public_credentials()
         session = self._get_session()
-
-        headers = {
+
+        headers = build_public_api_headers(
+            self.public_token or "", self.public_key or ""
+        )
+        payload = {"tasks_id": task_id, "type": file_type}
+
+        logger.info(f"Async getting result for Task: {task_id}")
+
+        try:
+            async with session.post(
+                self._download_url, data=payload, headers=headers
+            ) as response:
+                data = await response.json()
+                code = data.get("code")
+
+                if code == 200 and data.get("data"):
+                    return data["data"]["download"]
+
+                msg = extract_error_message(data)
+                raise_for_code(f"Get result failed: {msg}", code=code, payload=data)
+                # This line won't be reached, but satisfies mypy
+                raise RuntimeError("Unexpected state")
+
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(f"Get result failed: {e}", original_error=e)
+
+    async def wait_for_task(
+        self,
+        task_id: str,
+        *,
+        poll_interval: float = 5.0,
+        max_wait: float = 600.0,
+    ) -> str:
+        """
+        Wait for a task to complete.
+        """
+        elapsed = 0.0
+
+        while elapsed < max_wait:
+            status = await self.get_task_status(task_id)
+
+            logger.debug(f"Task {task_id} status: {status}")
+
+            terminal_statuses = {
+                "ready",
+                "success",
+                "finished",
+                "failed",
+                "error",
+                "cancelled",
+            }
+
+            if status.lower() in terminal_statuses:
+                return status
+
+            await asyncio.sleep(poll_interval)
+            elapsed += poll_interval
+
+        raise TimeoutError(f"Task {task_id} did not complete within {max_wait} seconds")
+
+    # =========================================================================
+    # Location API Methods
+    # =========================================================================
+
+    async def list_countries(
+        self, proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+    ) -> List[Dict[str, Any]]:
+        """List supported countries."""
+        return await self._get_locations(
+            "countries",
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+        )
+
+    async def list_states(
+        self,
+        country_code: str,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
+    ) -> List[Dict[str, Any]]:
+        """List supported states for a country."""
+        return await self._get_locations(
+            "states",
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            country_code=country_code,
+        )
+
+    async def list_cities(
+        self,
+        country_code: str,
+        state_code: Optional[str] = None,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
+    ) -> List[Dict[str, Any]]:
+        """List supported cities."""
+        kwargs = {
+            "proxy_type": (
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            "country_code": country_code,
+        }
+        if state_code:
+            kwargs["state_code"] = state_code
+
+        return await self._get_locations("cities", **kwargs)
+
+    async def list_asn(
+        self,
+        country_code: str,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
+    ) -> List[Dict[str, Any]]:
+        """List supported ASNs."""
+        return await self._get_locations(
+            "asn",
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            country_code=country_code,
+        )
+
+    async def _get_locations(
+        self, endpoint: str, **kwargs: Any
+    ) -> List[Dict[str, Any]]:
+        """Internal async locations API call."""
+        self._require_public_credentials()
+
+        params = {
             "token": self.public_token,
             "key": self.public_key,
-            "Content-Type": "application/x-www-form-urlencoded"
         }
-        # Fixed: Use the file_type argument instead of hardcoding "json"
-        payload = {"tasks_id": task_id, "type": file_type}
 
-        async with session.post(
-            self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
-        ) as response:
-            data = await response.json()
-            if data.get("code") == 200 and data.get("data"):
-                return data["data"]["download"]
-            raise Exception(f"Result Error: {data}")
+        for key, value in kwargs.items():
+            params[key] = str(value)
+
+        url = f"{self._locations_base_url}/{endpoint}"
+
+        logger.debug(f"Async Locations API: {url}")
+
+        # Create temporary session for this request (no proxy needed)
+        async with aiohttp.ClientSession(trust_env=True) as temp_session:
+            async with temp_session.get(url, params=params) as response:
+                response.raise_for_status()
+                data = await response.json()
+
+                if isinstance(data, dict):
+                    code = data.get("code")
+                    if code is not None and code != 200:
+                        msg = data.get("msg", "")
+                        raise RuntimeError(
+                            f"Locations API error ({endpoint}): code={code}, msg={msg}"
+                        )
+                    return data.get("data") or []
+
+                if isinstance(data, list):
+                    return data
+
+                return []
+
+    # =========================================================================
+    # Helper Methods
+    # =========================================================================
+
+    def _require_public_credentials(self) -> None:
+        """Ensure public API credentials are available."""
+        if not self.public_token or not self.public_key:
+            raise ThordataConfigError(
+                "public_token and public_key are required for this operation. "
+                "Please provide them when initializing AsyncThordataClient."
+            )
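
For reference, a minimal usage sketch of the 0.5.0 async surface shown in this diff. It assumes the package-level import used in the module docstring (from thordata import AsyncThordataClient); the token values and the country argument are illustrative placeholders, and error handling is trimmed for brevity.

# Minimal usage sketch for the 0.5.0 AsyncThordataClient shown above.
# Token values are placeholders; error handling is trimmed for brevity.
import asyncio

from thordata import AsyncThordataClient
from thordata.enums import Engine


async def main() -> None:
    async with AsyncThordataClient(
        scraper_token="your_token",
        public_token="your_public_token",
        public_key="your_public_key",
    ) as client:
        # Proxy Network: plain GET routed through the proxy gateway
        response = await client.get("https://httpbin.org/ip")
        print(await response.json())

        # SERP API: keyword-only parameters introduced in 0.5.0
        results = await client.serp_search(
            "python web scraping",
            engine=Engine.GOOGLE,
            num=10,
            country="us",
        )
        print(type(results))

        # Universal API: returns HTML as str (or PNG bytes for output_format="png")
        html = await client.universal_scrape(
            "https://example.com",
            js_render=True,
            output_format="html",
        )
        print(len(html))


asyncio.run(main())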
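
And a sketch of the task-based Web Scraper workflow plus the new Location API helpers, under the same assumptions; the spider_id, spider_name, and file_name values below are hypothetical placeholders, and the Web Scraper and Location calls require public_token and public_key.

# Sketch of the Web Scraper task workflow and Location API helpers added in 0.5.0.
# The spider_id / spider_name / file_name values are hypothetical placeholders.
import asyncio

from thordata import AsyncThordataClient
from thordata.enums import ProxyType


async def run_task() -> None:
    async with AsyncThordataClient(
        scraper_token="your_token",
        public_token="your_public_token",
        public_key="your_public_key",
    ) as client:
        task_id = await client.create_scraper_task(
            file_name="products",                    # placeholder
            spider_id="example-spider-id",           # placeholder
            spider_name="example.com",               # placeholder
            parameters={"url": "https://example.com/item/1"},
        )

        # Poll until the task reaches a terminal status (or max_wait elapses)
        status = await client.wait_for_task(task_id, poll_interval=5.0, max_wait=600.0)
        if status.lower() in {"ready", "success", "finished"}:
            download_url = await client.get_task_result(task_id, file_type="json")
            print(download_url)

        # Location API: list supported countries for residential proxies
        countries = await client.list_countries(proxy_type=ProxyType.RESIDENTIAL)
        print(len(countries))


asyncio.run(run_task())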