thordata-sdk 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- thordata/__init__.py +33 -36
- thordata/_utils.py +21 -21
- thordata/async_client.py +230 -192
- thordata/client.py +281 -222
- thordata/enums.py +32 -6
- thordata/exceptions.py +60 -31
- thordata/models.py +173 -146
- thordata/parameters.py +7 -6
- thordata/retry.py +109 -111
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/METADATA +228 -10
- thordata_sdk-0.5.0.dist-info/RECORD +14 -0
- thordata_sdk-0.4.0.dist-info/RECORD +0 -14
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/top_level.txt +0 -0
thordata/async_client.py
CHANGED
```diff
@@ -7,7 +7,7 @@ built on aiohttp.
 Example:
     >>> import asyncio
     >>> from thordata import AsyncThordataClient
-    >>>
+    >>>
     >>> async def main():
     ...     async with AsyncThordataClient(
     ...         scraper_token="your_token",
@@ -16,7 +16,7 @@ Example:
     ...     ) as client:
     ...         response = await client.get("https://httpbin.org/ip")
     ...         print(await response.json())
-    >>>
+    >>>
     >>> asyncio.run(main())
 """
 
@@ -26,8 +26,16 @@ import asyncio
 import logging
 from typing import Any, Dict, List, Optional, Union
 
+import os
 import aiohttp
 
+from ._utils import (
+    build_auth_headers,
+    build_public_api_headers,
+    decode_base64_image,
+    extract_error_message,
+    parse_json_response,
+)
 from .enums import Engine, ProxyType
 from .exceptions import (
     ThordataConfigError,
@@ -37,19 +45,11 @@ from .exceptions import (
 )
 from .models import (
     ProxyConfig,
-
+    ScraperTaskConfig,
     SerpRequest,
     UniversalScrapeRequest,
-    ScraperTaskConfig,
 )
 from .retry import RetryConfig
-from ._utils import (
-    parse_json_response,
-    decode_base64_image,
-    build_auth_headers,
-    build_public_api_headers,
-    extract_error_message,
-)
 
 logger = logging.getLogger(__name__)
 
@@ -59,7 +59,7 @@ class AsyncThordataClient:
     The official asynchronous Python client for Thordata.
 
     Designed for high-concurrency AI agents and data pipelines.
-
+
     Args:
         scraper_token: The API token from your Dashboard.
         public_token: The public API token.
@@ -68,7 +68,7 @@ class AsyncThordataClient:
         proxy_port: Custom proxy gateway port.
         timeout: Default request timeout in seconds.
        retry_config: Configuration for automatic retries.
-
+
    Example:
        >>> async with AsyncThordataClient(
        ...     scraper_token="token",
```
```diff
@@ -93,46 +93,73 @@ class AsyncThordataClient:
         proxy_port: int = 9999,
         timeout: int = 30,
         retry_config: Optional[RetryConfig] = None,
+        scraperapi_base_url: Optional[str] = None,
+        universalapi_base_url: Optional[str] = None,
+        web_scraper_api_base_url: Optional[str] = None,
+        locations_base_url: Optional[str] = None,
     ) -> None:
         """Initialize the Async Thordata Client."""
         if not scraper_token:
             raise ThordataConfigError("scraper_token is required")
-
+
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
-
+
         # Proxy configuration
         self._proxy_host = proxy_host
         self._proxy_port = proxy_port
         self._default_timeout = aiohttp.ClientTimeout(total=timeout)
-
+
         # Retry configuration
         self._retry_config = retry_config or RetryConfig()
-
+
         # Pre-calculate proxy auth
         self._proxy_url = f"http://{proxy_host}:{proxy_port}"
         self._proxy_auth = aiohttp.BasicAuth(
-            login=f"td-customer-{scraper_token}",
-            password=""
+            login=f"td-customer-{scraper_token}", password=""
         )
-
-        #
-
-
-
-
-
-
+
+        # Base URLs (allow override via args or env vars for testing and custom routing)
+        scraperapi_base = (
+            scraperapi_base_url
+            or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
+            or self.BASE_URL
+        ).rstrip("/")
+
+        universalapi_base = (
+            universalapi_base_url
+            or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
+            or self.UNIVERSAL_URL
+        ).rstrip("/")
+
+        web_scraper_api_base = (
+            web_scraper_api_base_url
+            or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
+            or self.API_URL
+        ).rstrip("/")
+
+        locations_base = (
+            locations_base_url
+            or os.getenv("THORDATA_LOCATIONS_BASE_URL")
+            or self.LOCATIONS_URL
+        ).rstrip("/")
+
+        self._serp_url = f"{scraperapi_base}/request"
+        self._builder_url = f"{scraperapi_base}/builder"
+        self._universal_url = f"{universalapi_base}/request"
+        self._status_url = f"{web_scraper_api_base}/tasks-status"
+        self._download_url = f"{web_scraper_api_base}/tasks-download"
+        self._locations_base_url = locations_base
+
         # Session initialized lazily
         self._session: Optional[aiohttp.ClientSession] = None
 
-    async def __aenter__(self) ->
+    async def __aenter__(self) -> AsyncThordataClient:
         """Async context manager entry."""
         if self._session is None or self._session.closed:
             self._session = aiohttp.ClientSession(
-                timeout=self._default_timeout,
-                trust_env=True
+                timeout=self._default_timeout, trust_env=True
             )
         return self
 
```
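The headline change in this hunk: every service endpoint is now derived from an overridable base URL, resolved in order from constructor argument, environment variable, then the class-level default. A minimal sketch of both override styles, using only the parameter and variable names visible above (the localhost URLs are illustrative):

```python
import asyncio
import os

from thordata import AsyncThordataClient

# Env-var override, picked up when no constructor argument is given.
os.environ["THORDATA_SCRAPERAPI_BASE_URL"] = "http://localhost:8080"


async def main() -> None:
    async with AsyncThordataClient(
        scraper_token="your_token",
        # Explicit argument override; takes precedence over the env var.
        universalapi_base_url="http://localhost:8081",
    ) as client:
        ...  # SERP requests now go to http://localhost:8080/request


asyncio.run(main())
```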
```diff
@@ -178,32 +205,25 @@ class AsyncThordataClient:
             The aiohttp response object.
         """
         session = self._get_session()
-
+
         logger.debug(f"Async Proxy GET: {url}")
-
+
         if proxy_config:
             proxy_url, proxy_auth = proxy_config.to_aiohttp_config()
         else:
             proxy_url = self._proxy_url
             proxy_auth = self._proxy_auth
-
+
         try:
             return await session.get(
-                url,
-                proxy=proxy_url,
-                proxy_auth=proxy_auth,
-                **kwargs
+                url, proxy=proxy_url, proxy_auth=proxy_auth, **kwargs
             )
         except asyncio.TimeoutError as e:
             raise ThordataTimeoutError(
-                f"Async request timed out: {e}",
-                original_error=e
+                f"Async request timed out: {e}", original_error=e
             )
         except aiohttp.ClientError as e:
-            raise ThordataNetworkError(
-                f"Async request failed: {e}",
-                original_error=e
-            )
+            raise ThordataNetworkError(f"Async request failed: {e}", original_error=e)
 
     async def post(
         self,
@@ -224,32 +244,25 @@ class AsyncThordataClient:
             The aiohttp response object.
         """
         session = self._get_session()
-
+
         logger.debug(f"Async Proxy POST: {url}")
-
+
         if proxy_config:
             proxy_url, proxy_auth = proxy_config.to_aiohttp_config()
         else:
             proxy_url = self._proxy_url
             proxy_auth = self._proxy_auth
-
+
         try:
             return await session.post(
-                url,
-                proxy=proxy_url,
-                proxy_auth=proxy_auth,
-                **kwargs
+                url, proxy=proxy_url, proxy_auth=proxy_auth, **kwargs
             )
         except asyncio.TimeoutError as e:
             raise ThordataTimeoutError(
-                f"Async request timed out: {e}",
-                original_error=e
+                f"Async request timed out: {e}", original_error=e
             )
         except aiohttp.ClientError as e:
-            raise ThordataNetworkError(
-                f"Async request failed: {e}",
-                original_error=e
-            )
+            raise ThordataNetworkError(f"Async request failed: {e}", original_error=e)
 
     # =========================================================================
     # SERP API Methods
```
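Both proxy helpers now raise in a single consolidated statement, and both still forward `**kwargs` directly to the underlying `aiohttp` call. A short usage sketch (URL and header are illustrative; any `ClientSession.get()` keyword should pass through):

```python
import asyncio

from thordata import AsyncThordataClient


async def main() -> None:
    async with AsyncThordataClient(scraper_token="your_token") as client:
        # Extra keywords ride along to aiohttp; timeouts and connection
        # failures surface as ThordataTimeoutError / ThordataNetworkError.
        response = await client.get(
            "https://httpbin.org/headers",
            headers={"Accept": "application/json"},
        )
        print(await response.json())


asyncio.run(main())
```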
```diff
@@ -264,11 +277,15 @@ class AsyncThordataClient:
         country: Optional[str] = None,
         language: Optional[str] = None,
         search_type: Optional[str] = None,
+        device: Optional[str] = None,
+        render_js: Optional[bool] = None,
+        no_cache: Optional[bool] = None,
+        output_format: str = "json",
         **kwargs: Any,
     ) -> Dict[str, Any]:
         """
         Execute an async SERP search.
-
+
         Args:
             query: Search keywords.
             engine: Search engine.
@@ -276,15 +293,19 @@ class AsyncThordataClient:
             country: Country code for localization.
             language: Language code.
             search_type: Type of search.
+            device: Device type ('desktop', 'mobile', 'tablet').
+            render_js: Enable JavaScript rendering in SERP.
+            no_cache: Disable internal caching.
+            output_format: 'json' or 'html'.
             **kwargs: Additional parameters.
 
         Returns:
-            Parsed JSON results.
+            Parsed JSON results or dict with 'html' key.
         """
         session = self._get_session()
-
+
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
-
+
         request = SerpRequest(
             query=query,
             engine=engine_str,
@@ -292,33 +313,53 @@ class AsyncThordataClient:
             country=country,
             language=language,
             search_type=search_type,
+            device=device,
+            render_js=render_js,
+            no_cache=no_cache,
+            output_format=output_format,
             extra_params=kwargs,
         )
-
+
         payload = request.to_payload()
         headers = build_auth_headers(self.scraper_token)
-
+
         logger.info(f"Async SERP Search: {engine_str} - {query}")
-
+
         try:
             async with session.post(
                 self._serp_url,
                 data=payload,
-                headers=headers
+                headers=headers,
             ) as response:
                 response.raise_for_status()
-
-
-
+
+                if output_format.lower() == "json":
+                    data = await response.json()
+
+                    if isinstance(data, dict):
+                        code = data.get("code")
+                        if code is not None and code != 200:
+                            msg = extract_error_message(data)
+                            raise_for_code(
+                                f"SERP API Error: {msg}",
+                                code=code,
+                                payload=data,
+                            )
+
+                    return parse_json_response(data)
+
+                text = await response.text()
+                return {"html": text}
+
         except asyncio.TimeoutError as e:
             raise ThordataTimeoutError(
                 f"SERP request timed out: {e}",
-                original_error=e
+                original_error=e,
             )
         except aiohttp.ClientError as e:
             raise ThordataNetworkError(
                 f"SERP request failed: {e}",
-                original_error=e
+                original_error=e,
             )
 
     async def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
```
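The four new keywords flow straight into the `SerpRequest`, and `output_format="html"` short-circuits JSON parsing in favor of a `{"html": ...}` wrapper. An illustrative call (the engine string is an assumption; the diff shows only that plain strings are lower-cased):

```python
# Inside an async context with an open AsyncThordataClient:
results = await client.serp_search(
    query="thordata sdk",
    engine="google",       # assumed engine name; Engine enum values also work
    device="mobile",       # new in 0.5.0
    render_js=True,        # new in 0.5.0
    no_cache=True,         # new in 0.5.0
    output_format="html",  # new in 0.5.0: returns {"html": "..."}
)
html = results["html"]
```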
```diff
@@ -326,31 +367,47 @@ class AsyncThordataClient:
         Execute an async SERP search using a SerpRequest object.
         """
         session = self._get_session()
-
+
         payload = request.to_payload()
         headers = build_auth_headers(self.scraper_token)
-
+
         logger.info(f"Async SERP Advanced: {request.engine} - {request.query}")
-
+
         try:
             async with session.post(
                 self._serp_url,
                 data=payload,
-                headers=headers
+                headers=headers,
             ) as response:
                 response.raise_for_status()
-
-
-
+
+                if request.output_format.lower() == "json":
+                    data = await response.json()
+
+                    if isinstance(data, dict):
+                        code = data.get("code")
+                        if code is not None and code != 200:
+                            msg = extract_error_message(data)
+                            raise_for_code(
+                                f"SERP API Error: {msg}",
+                                code=code,
+                                payload=data,
+                            )
+
+                    return parse_json_response(data)
+
+                text = await response.text()
+                return {"html": text}
+
         except asyncio.TimeoutError as e:
             raise ThordataTimeoutError(
                 f"SERP request timed out: {e}",
-                original_error=e
+                original_error=e,
             )
         except aiohttp.ClientError as e:
             raise ThordataNetworkError(
                 f"SERP request failed: {e}",
-                original_error=e
+                original_error=e,
             )
 
     # =========================================================================
```
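`serp_search_advanced` gains the same `output_format` branch and non-200 `code` check, driven by fields on the request object. A sketch built from the `SerpRequest` fields this diff exercises (the top-level import is an assumption; the diff itself only shows the package-internal `from .models import` path):

```python
from thordata import SerpRequest  # top-level re-export assumed

request = SerpRequest(
    query="site reliability engineering",
    engine="bing",         # illustrative engine string
    country="us",
    language="en",
    output_format="json",  # "html" would return {"html": "..."} instead
)
results = await client.serp_search_advanced(request)  # inside async context
```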
```diff
@@ -394,66 +451,59 @@ class AsyncThordataClient:
             wait_for=wait_for,
             extra_params=kwargs,
         )
-
+
         return await self.universal_scrape_advanced(request)
 
     async def universal_scrape_advanced(
-        self,
-        request: UniversalScrapeRequest
+        self, request: UniversalScrapeRequest
     ) -> Union[str, bytes]:
         """
         Async scrape using a UniversalScrapeRequest object.
         """
         session = self._get_session()
-
+
         payload = request.to_payload()
         headers = build_auth_headers(self.scraper_token)
-
+
         logger.info(f"Async Universal Scrape: {request.url}")
-
+
         try:
             async with session.post(
-                self._universal_url,
-                data=payload,
-                headers=headers
+                self._universal_url, data=payload, headers=headers
             ) as response:
                 response.raise_for_status()
-
+
                 try:
                     resp_json = await response.json()
                 except ValueError:
                     if request.output_format.lower() == "png":
                         return await response.read()
                     return await response.text()
-
+
                 # Check for API errors
                 if isinstance(resp_json, dict):
                     code = resp_json.get("code")
                     if code is not None and code != 200:
                         msg = extract_error_message(resp_json)
                         raise_for_code(
-                            f"Universal API Error: {msg}",
-                            code=code,
-                            payload=resp_json
+                            f"Universal API Error: {msg}", code=code, payload=resp_json
                         )
-
+
                 if "html" in resp_json:
                     return resp_json["html"]
-
+
                 if "png" in resp_json:
                     return decode_base64_image(resp_json["png"])
-
+
                 return str(resp_json)
-
+
         except asyncio.TimeoutError as e:
             raise ThordataTimeoutError(
-                f"Universal scrape timed out: {e}",
-                original_error=e
+                f"Universal scrape timed out: {e}", original_error=e
             )
         except aiohttp.ClientError as e:
             raise ThordataNetworkError(
-                f"Universal scrape failed: {e}",
-                original_error=e
+                f"Universal scrape failed: {e}", original_error=e
             )
 
     # =========================================================================
```
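The return type stays `Union[str, bytes]`: HTML comes back as text, while the PNG path base64-decodes the screenshot into raw bytes. A sketch using only the `UniversalScrapeRequest` fields visible in this hunk (the top-level import is again an assumption):

```python
from thordata import UniversalScrapeRequest  # top-level re-export assumed

request = UniversalScrapeRequest(
    url="https://example.com",
    output_format="png",  # the "png" path returns decoded bytes
)
screenshot = await client.universal_scrape_advanced(request)  # bytes here

with open("example.png", "wb") as fh:
    fh.write(screenshot)
```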
```diff
@@ -478,48 +528,38 @@ class AsyncThordataClient:
             parameters=parameters,
             universal_params=universal_params,
         )
-
+
         return await self.create_scraper_task_advanced(config)
 
-    async def create_scraper_task_advanced(
-        self,
-        config: ScraperTaskConfig
-    ) -> str:
+    async def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
         """
         Create a task using ScraperTaskConfig.
         """
         session = self._get_session()
-
+
         payload = config.to_payload()
         headers = build_auth_headers(self.scraper_token)
-
+
         logger.info(f"Async Task Creation: {config.spider_name}")
-
+
         try:
             async with session.post(
-                self._builder_url,
-                data=payload,
-                headers=headers
+                self._builder_url, data=payload, headers=headers
             ) as response:
                 response.raise_for_status()
                 data = await response.json()
-
+
                 code = data.get("code")
                 if code != 200:
                     msg = extract_error_message(data)
                     raise_for_code(
-                        f"Task creation failed: {msg}",
-                        code=code,
-                        payload=data
+                        f"Task creation failed: {msg}", code=code, payload=data
                     )
-
+
                 return data["data"]["task_id"]
-
+
         except aiohttp.ClientError as e:
-            raise ThordataNetworkError(
-                f"Task creation failed: {e}",
-                original_error=e
-            )
+            raise ThordataNetworkError(f"Task creation failed: {e}", original_error=e)
 
     async def get_task_status(self, task_id: str) -> str:
         """
```
```diff
@@ -527,69 +567,60 @@ class AsyncThordataClient:
         """
         self._require_public_credentials()
         session = self._get_session()
-
-        headers = build_public_api_headers(
+
+        headers = build_public_api_headers(
+            self.public_token or "", self.public_key or ""
+        )
         payload = {"tasks_ids": task_id}
-
+
         try:
             async with session.post(
-                self._status_url,
-                data=payload,
-                headers=headers
+                self._status_url, data=payload, headers=headers
             ) as response:
                 data = await response.json()
-
+
                 if data.get("code") == 200 and data.get("data"):
                     for item in data["data"]:
                         if str(item.get("task_id")) == str(task_id):
                             return item.get("status", "unknown")
-
+
                 return "unknown"
-
+
         except Exception as e:
             logger.error(f"Async status check failed: {e}")
             return "error"
 
-    async def get_task_result(
-        self,
-        task_id: str,
-        file_type: str = "json"
-    ) -> str:
+    async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
         """
         Get download URL for completed task.
         """
         self._require_public_credentials()
         session = self._get_session()
-
-        headers = build_public_api_headers(
+
+        headers = build_public_api_headers(
+            self.public_token or "", self.public_key or ""
+        )
         payload = {"tasks_id": task_id, "type": file_type}
-
+
         logger.info(f"Async getting result for Task: {task_id}")
-
+
         try:
             async with session.post(
-                self._download_url,
-                data=payload,
-                headers=headers
+                self._download_url, data=payload, headers=headers
             ) as response:
                 data = await response.json()
                 code = data.get("code")
-
+
                 if code == 200 and data.get("data"):
                     return data["data"]["download"]
-
+
                 msg = extract_error_message(data)
-                raise_for_code(
-
-
-
-                )
-
+                raise_for_code(f"Get result failed: {msg}", code=code, payload=data)
+                # This line won't be reached, but satisfies mypy
+                raise RuntimeError("Unexpected state")
+
         except aiohttp.ClientError as e:
-            raise ThordataNetworkError(
-                f"Get result failed: {e}",
-                original_error=e
-            )
+            raise ThordataNetworkError(f"Get result failed: {e}", original_error=e)
 
     async def wait_for_task(
         self,
```
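Together with `wait_for_task` in the next hunk, these methods form a create, poll, download pipeline. A sketch of that flow, assuming a client constructed with `public_token` and `public_key` (the status and download calls require them); `spider_name` is the only `ScraperTaskConfig` field this diff reveals, so the value below is hypothetical and real tasks likely need more fields:

```python
from thordata import ScraperTaskConfig  # top-level re-export assumed

config = ScraperTaskConfig(spider_name="example_spider")  # hypothetical spider

task_id = await client.create_scraper_task_advanced(config)

# Polls get_task_status until a terminal status or max_wait (values illustrative).
status = await client.wait_for_task(task_id, poll_interval=5.0, max_wait=300.0)

if status.lower() in {"ready", "success", "finished"}:
    download_url = await client.get_task_result(task_id, file_type="json")
```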
```diff
@@ -602,107 +633,114 @@ class AsyncThordataClient:
         Wait for a task to complete.
         """
         elapsed = 0.0
-
+
         while elapsed < max_wait:
             status = await self.get_task_status(task_id)
-
+
             logger.debug(f"Task {task_id} status: {status}")
-
+
             terminal_statuses = {
-                "ready",
-                "
+                "ready",
+                "success",
+                "finished",
+                "failed",
+                "error",
+                "cancelled",
             }
-
+
             if status.lower() in terminal_statuses:
                 return status
-
+
             await asyncio.sleep(poll_interval)
             elapsed += poll_interval
-
-        raise TimeoutError(
-            f"Task {task_id} did not complete within {max_wait} seconds"
-        )
+
+        raise TimeoutError(f"Task {task_id} did not complete within {max_wait} seconds")
 
     # =========================================================================
     # Location API Methods
     # =========================================================================
 
     async def list_countries(
-        self,
-        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+        self, proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
     ) -> List[Dict[str, Any]]:
         """List supported countries."""
         return await self._get_locations(
             "countries",
-            proxy_type=
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
         )
 
     async def list_states(
         self,
         country_code: str,
-        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
     ) -> List[Dict[str, Any]]:
         """List supported states for a country."""
         return await self._get_locations(
             "states",
-            proxy_type=
-
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            country_code=country_code,
         )
 
     async def list_cities(
         self,
         country_code: str,
         state_code: Optional[str] = None,
-        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
     ) -> List[Dict[str, Any]]:
         """List supported cities."""
         kwargs = {
-            "proxy_type":
-
+            "proxy_type": (
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            "country_code": country_code,
         }
         if state_code:
             kwargs["state_code"] = state_code
-
+
         return await self._get_locations("cities", **kwargs)
 
     async def list_asn(
         self,
         country_code: str,
-        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
     ) -> List[Dict[str, Any]]:
         """List supported ASNs."""
         return await self._get_locations(
             "asn",
-            proxy_type=
-
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            country_code=country_code,
         )
 
     async def _get_locations(
-        self,
-        endpoint: str,
-        **kwargs: Any
+        self, endpoint: str, **kwargs: Any
     ) -> List[Dict[str, Any]]:
         """Internal async locations API call."""
         self._require_public_credentials()
-
+
         params = {
             "token": self.public_token,
             "key": self.public_key,
         }
-
+
         for key, value in kwargs.items():
             params[key] = str(value)
-
-        url = f"{self.
-
+
+        url = f"{self._locations_base_url}/{endpoint}"
+
         logger.debug(f"Async Locations API: {url}")
-
+
         # Create temporary session for this request (no proxy needed)
-        async with aiohttp.ClientSession() as temp_session:
+        async with aiohttp.ClientSession(trust_env=True) as temp_session:
             async with temp_session.get(url, params=params) as response:
                 response.raise_for_status()
                 data = await response.json()
-
+
                 if isinstance(data, dict):
                     code = data.get("code")
                     if code is not None and code != 200:
```
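The location helpers now coerce a `ProxyType` enum to its integer code inline and pass `country_code` through explicitly, and `_get_locations` builds its URL from the overridable locations base while honoring proxy-related environment variables via `trust_env=True`. An illustrative sketch (country and state codes are examples; all of these calls require public credentials):

```python
from thordata import AsyncThordataClient, ProxyType  # ProxyType re-export assumed

# Inside an async context with a client built with public_token/public_key:
countries = await client.list_countries(proxy_type=ProxyType.RESIDENTIAL)
states = await client.list_states("US")
cities = await client.list_cities("US", state_code="CA")
asns = await client.list_asn("US", proxy_type=ProxyType.RESIDENTIAL)
```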
```diff
@@ -711,10 +749,10 @@ class AsyncThordataClient:
                         f"Locations API error ({endpoint}): code={code}, msg={msg}"
                     )
                 return data.get("data") or []
-
+
             if isinstance(data, list):
                 return data
-
+
             return []
 
     # =========================================================================
@@ -727,4 +765,4 @@ class AsyncThordataClient:
         raise ThordataConfigError(
             "public_token and public_key are required for this operation. "
             "Please provide them when initializing AsyncThordataClient."
-        )
+        )
```