thordata-sdk 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +131 -9
- thordata/_utils.py +126 -0
- thordata/async_client.py +640 -191
- thordata/client.py +764 -314
- thordata/enums.py +275 -11
- thordata/exceptions.py +315 -0
- thordata/models.py +698 -0
- thordata/retry.py +382 -0
- thordata_sdk-0.4.0.dist-info/METADATA +678 -0
- thordata_sdk-0.4.0.dist-info/RECORD +14 -0
- thordata_sdk-0.4.0.dist-info/licenses/LICENSE +21 -0
- thordata_sdk-0.3.1.dist-info/METADATA +0 -200
- thordata_sdk-0.3.1.dist-info/RECORD +0 -10
- thordata_sdk-0.3.1.dist-info/licenses/LICENSE +0 -201
- {thordata_sdk-0.3.1.dist-info → thordata_sdk-0.4.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.3.1.dist-info → thordata_sdk-0.4.0.dist-info}/top_level.txt +0 -0
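Version 0.4.0 splits the SDK into dedicated modules for request models, typed exceptions, retry configuration, and shared utilities, and substantially expands both the sync and async clients. For orientation, a condensed usage sketch based on the module docstring of the new async_client.py shown below (tokens and the target URL are placeholders, and the snippet is not verified against the live service):

import asyncio
from thordata import AsyncThordataClient

async def main():
    # public_token/public_key are optional in 0.4.0; per the diff they are only
    # needed for the Web Scraper task and Locations endpoints.
    async with AsyncThordataClient(
        scraper_token="your_token",
        public_token="your_public_token",
        public_key="your_public_key",
    ) as client:
        response = await client.get("https://httpbin.org/ip")
        print(await response.json())

asyncio.run(main())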
thordata/async_client.py
CHANGED
@@ -1,281 +1,730 @@
-
+"""
+Asynchronous client for the Thordata API.
+
+This module provides the AsyncThordataClient for high-concurrency workloads,
+built on aiohttp.
+
+Example:
+    >>> import asyncio
+    >>> from thordata import AsyncThordataClient
+    >>>
+    >>> async def main():
+    ...     async with AsyncThordataClient(
+    ...         scraper_token="your_token",
+    ...         public_token="your_public_token",
+    ...         public_key="your_public_key"
+    ...     ) as client:
+    ...         response = await client.get("https://httpbin.org/ip")
+    ...         print(await response.json())
+    >>>
+    >>> asyncio.run(main())
+"""
+
+from __future__ import annotations
+
+import asyncio
 import logging
-import
-
-
+from typing import Any, Dict, List, Optional, Union
+
+import aiohttp
 
-
-from .
-
+from .enums import Engine, ProxyType
+from .exceptions import (
+    ThordataConfigError,
+    ThordataNetworkError,
+    ThordataTimeoutError,
+    raise_for_code,
+)
+from .models import (
+    ProxyConfig,
+    ProxyProduct,
+    SerpRequest,
+    UniversalScrapeRequest,
+    ScraperTaskConfig,
+)
+from .retry import RetryConfig
+from ._utils import (
+    parse_json_response,
+    decode_base64_image,
+    build_auth_headers,
+    build_public_api_headers,
+    extract_error_message,
+)
 
 logger = logging.getLogger(__name__)
 
 
 class AsyncThordataClient:
     """
-    The official
+    The official asynchronous Python client for Thordata.
+
     Designed for high-concurrency AI agents and data pipelines.
+
+    Args:
+        scraper_token: The API token from your Dashboard.
+        public_token: The public API token.
+        public_key: The public API key.
+        proxy_host: Custom proxy gateway host.
+        proxy_port: Custom proxy gateway port.
+        timeout: Default request timeout in seconds.
+        retry_config: Configuration for automatic retries.
+
+    Example:
+        >>> async with AsyncThordataClient(
+        ...     scraper_token="token",
+        ...     public_token="pub_token",
+        ...     public_key="pub_key"
+        ... ) as client:
+        ...     results = await client.serp_search("python")
     """
 
+    # API Endpoints (same as sync client)
+    BASE_URL = "https://scraperapi.thordata.com"
+    UNIVERSAL_URL = "https://universalapi.thordata.com"
+    API_URL = "https://api.thordata.com/api/web-scraper-api"
+    LOCATIONS_URL = "https://api.thordata.com/api/locations"
+
     def __init__(
         self,
         scraper_token: str,
-        public_token: str,
-        public_key: str,
-        proxy_host: str = "
-        proxy_port: int =
-
-
-
-        """
+        public_token: Optional[str] = None,
+        public_key: Optional[str] = None,
+        proxy_host: str = "pr.thordata.net",
+        proxy_port: int = 9999,
+        timeout: int = 30,
+        retry_config: Optional[RetryConfig] = None,
+    ) -> None:
+        """Initialize the Async Thordata Client."""
+        if not scraper_token:
+            raise ThordataConfigError("scraper_token is required")
+
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
-
-        #
-        self.
-        self.
-
-
-
-        self.
-
-
-        self.
-        self.
-
-
-
-
-        #
+
+        # Proxy configuration
+        self._proxy_host = proxy_host
+        self._proxy_port = proxy_port
+        self._default_timeout = aiohttp.ClientTimeout(total=timeout)
+
+        # Retry configuration
+        self._retry_config = retry_config or RetryConfig()
+
+        # Pre-calculate proxy auth
+        self._proxy_url = f"http://{proxy_host}:{proxy_port}"
+        self._proxy_auth = aiohttp.BasicAuth(
+            login=f"td-customer-{scraper_token}",
+            password=""
+        )
+
+        # Store endpoint URLs
+        self._serp_url = f"{self.BASE_URL}/request"
+        self._universal_url = f"{self.UNIVERSAL_URL}/request"
+        self._builder_url = f"{self.BASE_URL}/builder"
+        self._status_url = f"{self.API_URL}/tasks-status"
+        self._download_url = f"{self.API_URL}/tasks-download"
+
+        # Session initialized lazily
         self._session: Optional[aiohttp.ClientSession] = None
 
-    async def __aenter__(self):
+    async def __aenter__(self) -> "AsyncThordataClient":
+        """Async context manager entry."""
         if self._session is None or self._session.closed:
-            self._session = aiohttp.ClientSession(
+            self._session = aiohttp.ClientSession(
+                timeout=self._default_timeout,
+                trust_env=True
+            )
         return self
 
-    async def __aexit__(self, exc_type,
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Async context manager exit."""
         await self.close()
 
-    async def close(self):
+    async def close(self) -> None:
         """Close the underlying aiohttp session."""
         if self._session and not self._session.closed:
             await self._session.close()
             self._session = None
 
     def _get_session(self) -> aiohttp.ClientSession:
-        """
+        """Get the session, raising if not initialized."""
         if self._session is None or self._session.closed:
             raise RuntimeError(
-                "Client session not initialized.
+                "Client session not initialized. "
+                "Use 'async with AsyncThordataClient(...) as client:'"
             )
         return self._session
 
-
+    # =========================================================================
+    # Proxy Network Methods
+    # =========================================================================
+
+    async def get(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        **kwargs: Any,
+    ) -> aiohttp.ClientResponse:
         """
         Send an async GET request through the Proxy Network.
+
+        Args:
+            url: The target URL.
+            proxy_config: Custom proxy configuration.
+            **kwargs: Additional aiohttp arguments.
+
+        Returns:
+            The aiohttp response object.
         """
         session = self._get_session()
+
+        logger.debug(f"Async Proxy GET: {url}")
+
+        if proxy_config:
+            proxy_url, proxy_auth = proxy_config.to_aiohttp_config()
+        else:
+            proxy_url = self._proxy_url
+            proxy_auth = self._proxy_auth
+
         try:
-            logger.debug(f"Async Proxy Request: {url}")
             return await session.get(
                 url,
-                proxy=
-                proxy_auth=
+                proxy=proxy_url,
+                proxy_auth=proxy_auth,
                 **kwargs
             )
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"Async request timed out: {e}",
+                original_error=e
+            )
         except aiohttp.ClientError as e:
-
-
+            raise ThordataNetworkError(
+                f"Async request failed: {e}",
+                original_error=e
+            )
+
+    async def post(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        **kwargs: Any,
+    ) -> aiohttp.ClientResponse:
+        """
+        Send an async POST request through the Proxy Network.
+
+        Args:
+            url: The target URL.
+            proxy_config: Custom proxy configuration.
+            **kwargs: Additional aiohttp arguments.
+
+        Returns:
+            The aiohttp response object.
+        """
+        session = self._get_session()
+
+        logger.debug(f"Async Proxy POST: {url}")
+
+        if proxy_config:
+            proxy_url, proxy_auth = proxy_config.to_aiohttp_config()
+        else:
+            proxy_url = self._proxy_url
+            proxy_auth = self._proxy_auth
+
+        try:
+            return await session.post(
+                url,
+                proxy=proxy_url,
+                proxy_auth=proxy_auth,
+                **kwargs
+            )
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"Async request timed out: {e}",
+                original_error=e
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"Async request failed: {e}",
+                original_error=e
+            )
+
+    # =========================================================================
+    # SERP API Methods
+    # =========================================================================
 
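As a point of reference, a minimal call against the serp_search signature defined just below; the keyword names come from the diff, while the query, engine, and country values are illustrative only:

from thordata import AsyncThordataClient

async def search_example(client: AsyncThordataClient) -> dict:
    # Accepts an Engine enum or a plain string; Engine.GOOGLE is the default.
    return await client.serp_search(
        "python asyncio",
        engine="google",
        num=10,
        country="us",
    )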
     async def serp_search(
-        self,
-        query: str,
-
-
-
+        self,
+        query: str,
+        *,
+        engine: Union[Engine, str] = Engine.GOOGLE,
+        num: int = 10,
+        country: Optional[str] = None,
+        language: Optional[str] = None,
+        search_type: Optional[str] = None,
+        **kwargs: Any,
     ) -> Dict[str, Any]:
         """
-        Execute
+        Execute an async SERP search.
+
+        Args:
+            query: Search keywords.
+            engine: Search engine.
+            num: Number of results.
+            country: Country code for localization.
+            language: Language code.
+            search_type: Type of search.
+            **kwargs: Additional parameters.
+
+        Returns:
+            Parsed JSON results.
         """
         session = self._get_session()
-
-        # 1. Handle Enum conversion
+
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
+
+        request = SerpRequest(
+            query=query,
+            engine=engine_str,
+            num=num,
+            country=country,
+            language=language,
+            search_type=search_type,
+            extra_params=kwargs,
+        )
+
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"Async SERP Search: {engine_str} - {query}")
+
+        try:
+            async with session.post(
+                self._serp_url,
+                data=payload,
+                headers=headers
+            ) as response:
+                response.raise_for_status()
+                data = await response.json()
+                return parse_json_response(data)
+
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e
+            )
 
-
-
-
-
-
-
-
+    async def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
+        """
+        Execute an async SERP search using a SerpRequest object.
+        """
+        session = self._get_session()
+
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"Async SERP Advanced: {request.engine} - {request.query}")
+
+        try:
+            async with session.post(
+                self._serp_url,
+                data=payload,
+                headers=headers
+            ) as response:
+                response.raise_for_status()
+                data = await response.json()
+                return parse_json_response(data)
+
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e
+            )
 
-
-
-
-                self.SERP_API_URL, data=payload, headers=headers
-            ) as response:
-                response.raise_for_status()
-
-                data = await response.json()
-                # Handle double-encoded JSON strings if they occur
-                if isinstance(data, str):
-                    try:
-                        data = json.loads(data)
-                    except json.JSONDecodeError:
-                        pass
-                return data
+    # =========================================================================
+    # Universal Scraping API Methods
+    # =========================================================================
 
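For the Universal Scraping (Web Unlocker) methods defined below, a small sketch; the URL is a placeholder and the return behavior (a string for HTML, bytes for PNG) follows the docstring in the diff:

from thordata import AsyncThordataClient

async def unlock_example(client: AsyncThordataClient) -> None:
    html = await client.universal_scrape("https://example.com", js_render=True)
    png = await client.universal_scrape(
        "https://example.com",
        js_render=True,
        output_format="png",
    )
    print(len(html), len(png))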
     async def universal_scrape(
         self,
         url: str,
+        *,
         js_render: bool = False,
-        output_format: str = "
+        output_format: str = "html",
         country: Optional[str] = None,
-        block_resources:
+        block_resources: Optional[str] = None,
+        wait: Optional[int] = None,
+        wait_for: Optional[str] = None,
+        **kwargs: Any,
     ) -> Union[str, bytes]:
         """
-        Async Universal
+        Async scrape using Universal API (Web Unlocker).
+
+        Args:
+            url: Target URL.
+            js_render: Enable JavaScript rendering.
+            output_format: "html" or "png".
+            country: Geo-targeting country.
+            block_resources: Resources to block.
+            wait: Wait time in ms.
+            wait_for: CSS selector to wait for.
+
+        Returns:
+            HTML string or PNG bytes.
         """
-
-
-
-
-
-
+        request = UniversalScrapeRequest(
+            url=url,
+            js_render=js_render,
+            output_format=output_format,
+            country=country,
+            block_resources=block_resources,
+            wait=wait,
+            wait_for=wait_for,
+            extra_params=kwargs,
+        )
+
+        return await self.universal_scrape_advanced(request)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if
-
+    async def universal_scrape_advanced(
+        self,
+        request: UniversalScrapeRequest
+    ) -> Union[str, bytes]:
+        """
+        Async scrape using a UniversalScrapeRequest object.
+        """
+        session = self._get_session()
+
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"Async Universal Scrape: {request.url}")
+
+        try:
+            async with session.post(
+                self._universal_url,
+                data=payload,
+                headers=headers
+            ) as response:
+                response.raise_for_status()
+
+                try:
+                    resp_json = await response.json()
+                except ValueError:
+                    if request.output_format.lower() == "png":
+                        return await response.read()
+                    return await response.text()
+
+                # Check for API errors
+                if isinstance(resp_json, dict):
+                    code = resp_json.get("code")
+                    if code is not None and code != 200:
+                        msg = extract_error_message(resp_json)
+                        raise_for_code(
+                            f"Universal API Error: {msg}",
+                            code=code,
+                            payload=resp_json
+                        )
+
+                    if "html" in resp_json:
+                        return resp_json["html"]
+
+                    if "png" in resp_json:
+                        return decode_base64_image(resp_json["png"])
+
+                return str(resp_json)
 
-
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"Universal scrape timed out: {e}",
+                original_error=e
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"Universal scrape failed: {e}",
+                original_error=e
+            )
 
-
+    # =========================================================================
+    # Web Scraper API Methods
+    # =========================================================================
 
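The Web Scraper task methods below pair with the new wait_for_task helper further down; a hedged end-to-end sketch (the spider identifiers and the parameters schema are account-specific placeholders):

from thordata import AsyncThordataClient

async def task_example(client: AsyncThordataClient) -> str:
    task_id = await client.create_scraper_task(
        file_name="products",
        spider_id="your_spider_id",
        spider_name="your_spider_name",
        parameters={"url": "https://example.com"},
    )
    status = await client.wait_for_task(task_id, poll_interval=5.0, max_wait=600.0)
    if status.lower() in {"ready", "success", "finished"}:
        # Returns a download URL for the result file.
        return await client.get_task_result(task_id, file_type="json")
    raise RuntimeError(f"Task {task_id} ended with status {status!r}")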
     async def create_scraper_task(
         self,
         file_name: str,
         spider_id: str,
         spider_name: str,
-
-        universal_params: Optional[Dict[str, Any]] = None
+        parameters: Dict[str, Any],
+        universal_params: Optional[Dict[str, Any]] = None,
     ) -> str:
         """
-        Create an
+        Create an async Web Scraper task.
         """
-
-
-
-
-
-
+        config = ScraperTaskConfig(
+            file_name=file_name,
+            spider_id=spider_id,
+            spider_name=spider_name,
+            parameters=parameters,
+            universal_params=universal_params,
+        )
+
+        return await self.create_scraper_task_advanced(config)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    async def create_scraper_task_advanced(
+        self,
+        config: ScraperTaskConfig
+    ) -> str:
+        """
+        Create a task using ScraperTaskConfig.
+        """
+        session = self._get_session()
+
+        payload = config.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"Async Task Creation: {config.spider_name}")
+
+        try:
+            async with session.post(
+                self._builder_url,
+                data=payload,
+                headers=headers
+            ) as response:
+                response.raise_for_status()
+                data = await response.json()
+
+                code = data.get("code")
+                if code != 200:
+                    msg = extract_error_message(data)
+                    raise_for_code(
+                        f"Task creation failed: {msg}",
+                        code=code,
+                        payload=data
+                    )
+
+                return data["data"]["task_id"]
+
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"Task creation failed: {e}",
+                original_error=e
+            )
 
     async def get_task_status(self, task_id: str) -> str:
         """
-        Check task status.
+        Check async task status.
         """
+        self._require_public_credentials()
         session = self._get_session()
-
-        headers =
-            "token": self.public_token,
-            "key": self.public_key,
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+
+        headers = build_public_api_headers(self.public_token, self.public_key)
         payload = {"tasks_ids": task_id}
+
+        try:
+            async with session.post(
+                self._status_url,
+                data=payload,
+                headers=headers
+            ) as response:
+                data = await response.json()
+
+                if data.get("code") == 200 and data.get("data"):
+                    for item in data["data"]:
+                        if str(item.get("task_id")) == str(task_id):
+                            return item.get("status", "unknown")
+
+                return "unknown"
+
+        except Exception as e:
+            logger.error(f"Async status check failed: {e}")
+            return "error"
 
-
-
-
-
-
-                for item in data["data"]:
-                    if str(item.get("task_id")) == str(task_id):
-                        return item["status"]
-        return "Unknown"
-
-    async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
+    async def get_task_result(
+        self,
+        task_id: str,
+        file_type: str = "json"
+    ) -> str:
         """
-        Get
+        Get download URL for completed task.
         """
+        self._require_public_credentials()
         session = self._get_session()
 
-        headers =
+        headers = build_public_api_headers(self.public_token, self.public_key)
+        payload = {"tasks_id": task_id, "type": file_type}
+
+        logger.info(f"Async getting result for Task: {task_id}")
+
+        try:
+            async with session.post(
+                self._download_url,
+                data=payload,
+                headers=headers
+            ) as response:
+                data = await response.json()
+                code = data.get("code")
+
+                if code == 200 and data.get("data"):
+                    return data["data"]["download"]
+
+                msg = extract_error_message(data)
+                raise_for_code(
+                    f"Get result failed: {msg}",
+                    code=code,
+                    payload=data
+                )
+
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"Get result failed: {e}",
+                original_error=e
+            )
+
+    async def wait_for_task(
+        self,
+        task_id: str,
+        *,
+        poll_interval: float = 5.0,
+        max_wait: float = 600.0,
+    ) -> str:
+        """
+        Wait for a task to complete.
+        """
+        elapsed = 0.0
+
+        while elapsed < max_wait:
+            status = await self.get_task_status(task_id)
+
+            logger.debug(f"Task {task_id} status: {status}")
+
+            terminal_statuses = {
+                "ready", "success", "finished",
+                "failed", "error", "cancelled"
+            }
+
+            if status.lower() in terminal_statuses:
+                return status
+
+            await asyncio.sleep(poll_interval)
+            elapsed += poll_interval
+
+        raise TimeoutError(
+            f"Task {task_id} did not complete within {max_wait} seconds"
+        )
+
+    # =========================================================================
+    # Location API Methods
+    # =========================================================================
+
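The Locations helpers below require the public API credentials (see _require_public_credentials at the end of the class); a brief sketch, with the country code purely illustrative:

from thordata import AsyncThordataClient
from thordata.enums import ProxyType

async def locations_example(client: AsyncThordataClient) -> None:
    countries = await client.list_countries(proxy_type=ProxyType.RESIDENTIAL)
    cities = await client.list_cities("us", proxy_type=ProxyType.RESIDENTIAL)
    print(len(countries), len(cities))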
+    async def list_countries(
+        self,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+    ) -> List[Dict[str, Any]]:
+        """List supported countries."""
+        return await self._get_locations(
+            "countries",
+            proxy_type=int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        )
+
+    async def list_states(
+        self,
+        country_code: str,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+    ) -> List[Dict[str, Any]]:
+        """List supported states for a country."""
+        return await self._get_locations(
+            "states",
+            proxy_type=int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type,
+            country_code=country_code
+        )
+
+    async def list_cities(
+        self,
+        country_code: str,
+        state_code: Optional[str] = None,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+    ) -> List[Dict[str, Any]]:
+        """List supported cities."""
+        kwargs = {
+            "proxy_type": int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type,
+            "country_code": country_code
+        }
+        if state_code:
+            kwargs["state_code"] = state_code
+
+        return await self._get_locations("cities", **kwargs)
+
+    async def list_asn(
+        self,
+        country_code: str,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+    ) -> List[Dict[str, Any]]:
+        """List supported ASNs."""
+        return await self._get_locations(
+            "asn",
+            proxy_type=int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type,
+            country_code=country_code
+        )
+
+    async def _get_locations(
+        self,
+        endpoint: str,
+        **kwargs: Any
+    ) -> List[Dict[str, Any]]:
+        """Internal async locations API call."""
+        self._require_public_credentials()
+
+        params = {
             "token": self.public_token,
             "key": self.public_key,
-            "Content-Type": "application/x-www-form-urlencoded"
         }
-
-
-
-
-
-
-
-
-
-
+
+        for key, value in kwargs.items():
+            params[key] = str(value)
+
+        url = f"{self.LOCATIONS_URL}/{endpoint}"
+
+        logger.debug(f"Async Locations API: {url}")
+
+        # Create temporary session for this request (no proxy needed)
+        async with aiohttp.ClientSession() as temp_session:
+            async with temp_session.get(url, params=params) as response:
+                response.raise_for_status()
+                data = await response.json()
+
+                if isinstance(data, dict):
+                    code = data.get("code")
+                    if code is not None and code != 200:
+                        msg = data.get("msg", "")
+                        raise RuntimeError(
+                            f"Locations API error ({endpoint}): code={code}, msg={msg}"
+                        )
+                    return data.get("data") or []
+
+                if isinstance(data, list):
+                    return data
+
+                return []
+
+    # =========================================================================
+    # Helper Methods
+    # =========================================================================
+
+    def _require_public_credentials(self) -> None:
+        """Ensure public API credentials are available."""
+        if not self.public_token or not self.public_key:
+            raise ThordataConfigError(
+                "public_token and public_key are required for this operation. "
+                "Please provide them when initializing AsyncThordataClient."
+            )