thordata-sdk 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- thordata/__init__.py +130 -11
- thordata/_utils.py +126 -0
- thordata/async_client.py +672 -185
- thordata/client.py +809 -300
- thordata/enums.py +301 -11
- thordata/exceptions.py +344 -0
- thordata/models.py +725 -0
- thordata/parameters.py +7 -6
- thordata/retry.py +380 -0
- thordata_sdk-0.5.0.dist-info/METADATA +896 -0
- thordata_sdk-0.5.0.dist-info/RECORD +14 -0
- thordata_sdk-0.5.0.dist-info/licenses/LICENSE +21 -0
- thordata_sdk-0.3.1.dist-info/METADATA +0 -200
- thordata_sdk-0.3.1.dist-info/RECORD +0 -10
- thordata_sdk-0.3.1.dist-info/licenses/LICENSE +0 -201
- {thordata_sdk-0.3.1.dist-info → thordata_sdk-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.3.1.dist-info → thordata_sdk-0.5.0.dist-info}/top_level.txt +0 -0
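Version 0.5.0 splits the SDK into dedicated modules for request/response models (models.py), typed exceptions (exceptions.py), retry policies (retry.py), and shared helpers (_utils.py), and the async client gains SERP, Universal Scraping, Web Scraper, and Locations methods. As a rough orientation before the diff below, here is a minimal usage sketch of the new async surface; the token and query values are placeholders, and the calls are taken from the async_client.py signatures shown in this diff.

```python
import asyncio

from thordata import AsyncThordataClient


async def main() -> None:
    # Only scraper_token is required in 0.5.0; public_token/public_key are
    # needed later for Web Scraper task status/result and Locations calls.
    async with AsyncThordataClient(scraper_token="your_scraper_token") as client:
        # SERP API: defaults to engine=google and output_format="json".
        results = await client.serp_search("python asyncio", num=10, country="us")
        print(results)

        # Universal Scraping API (Web Unlocker): returns HTML by default.
        html = await client.universal_scrape("https://example.com", js_render=True)
        print(html[:200])


asyncio.run(main())
```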
thordata/async_client.py
CHANGED
@@ -1,281 +1,768 @@
-
+"""
+Asynchronous client for the Thordata API.
+
+This module provides the AsyncThordataClient for high-concurrency workloads,
+built on aiohttp.
+
+Example:
+    >>> import asyncio
+    >>> from thordata import AsyncThordataClient
+    >>>
+    >>> async def main():
+    ...     async with AsyncThordataClient(
+    ...         scraper_token="your_token",
+    ...         public_token="your_public_token",
+    ...         public_key="your_public_key"
+    ...     ) as client:
+    ...         response = await client.get("https://httpbin.org/ip")
+    ...         print(await response.json())
+    >>>
+    >>> asyncio.run(main())
+"""
+
+from __future__ import annotations
+
+import asyncio
 import logging
-import
-
-
+from typing import Any, Dict, List, Optional, Union
+
+import os
+import aiohttp
 
-
-
-
+from ._utils import (
+    build_auth_headers,
+    build_public_api_headers,
+    decode_base64_image,
+    extract_error_message,
+    parse_json_response,
+)
+from .enums import Engine, ProxyType
+from .exceptions import (
+    ThordataConfigError,
+    ThordataNetworkError,
+    ThordataTimeoutError,
+    raise_for_code,
+)
+from .models import (
+    ProxyConfig,
+    ScraperTaskConfig,
+    SerpRequest,
+    UniversalScrapeRequest,
+)
+from .retry import RetryConfig
 
 logger = logging.getLogger(__name__)
 
 
 class AsyncThordataClient:
     """
-    The official
+    The official asynchronous Python client for Thordata.
+
     Designed for high-concurrency AI agents and data pipelines.
+
+    Args:
+        scraper_token: The API token from your Dashboard.
+        public_token: The public API token.
+        public_key: The public API key.
+        proxy_host: Custom proxy gateway host.
+        proxy_port: Custom proxy gateway port.
+        timeout: Default request timeout in seconds.
+        retry_config: Configuration for automatic retries.
+
+    Example:
+        >>> async with AsyncThordataClient(
+        ...     scraper_token="token",
+        ...     public_token="pub_token",
+        ...     public_key="pub_key"
+        ... ) as client:
+        ...     results = await client.serp_search("python")
     """
 
+    # API Endpoints (same as sync client)
+    BASE_URL = "https://scraperapi.thordata.com"
+    UNIVERSAL_URL = "https://universalapi.thordata.com"
+    API_URL = "https://api.thordata.com/api/web-scraper-api"
+    LOCATIONS_URL = "https://api.thordata.com/api/locations"
+
     def __init__(
         self,
         scraper_token: str,
-        public_token: str,
-        public_key: str,
-        proxy_host: str = "
-        proxy_port: int =
-
-
-
-
+        public_token: Optional[str] = None,
+        public_key: Optional[str] = None,
+        proxy_host: str = "pr.thordata.net",
+        proxy_port: int = 9999,
+        timeout: int = 30,
+        retry_config: Optional[RetryConfig] = None,
+        scraperapi_base_url: Optional[str] = None,
+        universalapi_base_url: Optional[str] = None,
+        web_scraper_api_base_url: Optional[str] = None,
+        locations_base_url: Optional[str] = None,
+    ) -> None:
+        """Initialize the Async Thordata Client."""
+        if not scraper_token:
+            raise ThordataConfigError("scraper_token is required")
+
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
 
-        #
-        self.
-        self.
-
-
-
-        self.
-
-
-        self.
-        self.
-
-
-
-
-
+        # Proxy configuration
+        self._proxy_host = proxy_host
+        self._proxy_port = proxy_port
+        self._default_timeout = aiohttp.ClientTimeout(total=timeout)
+
+        # Retry configuration
+        self._retry_config = retry_config or RetryConfig()
+
+        # Pre-calculate proxy auth
+        self._proxy_url = f"http://{proxy_host}:{proxy_port}"
+        self._proxy_auth = aiohttp.BasicAuth(
+            login=f"td-customer-{scraper_token}", password=""
+        )
+
+        # Base URLs (allow override via args or env vars for testing and custom routing)
+        scraperapi_base = (
+            scraperapi_base_url
+            or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
+            or self.BASE_URL
+        ).rstrip("/")
+
+        universalapi_base = (
+            universalapi_base_url
+            or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
+            or self.UNIVERSAL_URL
+        ).rstrip("/")
+
+        web_scraper_api_base = (
+            web_scraper_api_base_url
+            or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
+            or self.API_URL
+        ).rstrip("/")
+
+        locations_base = (
+            locations_base_url
+            or os.getenv("THORDATA_LOCATIONS_BASE_URL")
+            or self.LOCATIONS_URL
+        ).rstrip("/")
+
+        self._serp_url = f"{scraperapi_base}/request"
+        self._builder_url = f"{scraperapi_base}/builder"
+        self._universal_url = f"{universalapi_base}/request"
+        self._status_url = f"{web_scraper_api_base}/tasks-status"
+        self._download_url = f"{web_scraper_api_base}/tasks-download"
+        self._locations_base_url = locations_base
+
+        # Session initialized lazily
         self._session: Optional[aiohttp.ClientSession] = None
 
-    async def __aenter__(self):
+    async def __aenter__(self) -> AsyncThordataClient:
+        """Async context manager entry."""
         if self._session is None or self._session.closed:
-            self._session = aiohttp.ClientSession(
+            self._session = aiohttp.ClientSession(
+                timeout=self._default_timeout, trust_env=True
+            )
         return self
 
-    async def __aexit__(self, exc_type,
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Async context manager exit."""
         await self.close()
 
-    async def close(self):
+    async def close(self) -> None:
         """Close the underlying aiohttp session."""
         if self._session and not self._session.closed:
             await self._session.close()
             self._session = None
 
     def _get_session(self) -> aiohttp.ClientSession:
-        """
+        """Get the session, raising if not initialized."""
         if self._session is None or self._session.closed:
             raise RuntimeError(
-                "Client session not initialized.
+                "Client session not initialized. "
+                "Use 'async with AsyncThordataClient(...) as client:'"
             )
         return self._session
 
-
+    # =========================================================================
+    # Proxy Network Methods
+    # =========================================================================
+
+    async def get(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        **kwargs: Any,
+    ) -> aiohttp.ClientResponse:
         """
         Send an async GET request through the Proxy Network.
+
+        Args:
+            url: The target URL.
+            proxy_config: Custom proxy configuration.
+            **kwargs: Additional aiohttp arguments.
+
+        Returns:
+            The aiohttp response object.
         """
         session = self._get_session()
+
+        logger.debug(f"Async Proxy GET: {url}")
+
+        if proxy_config:
+            proxy_url, proxy_auth = proxy_config.to_aiohttp_config()
+        else:
+            proxy_url = self._proxy_url
+            proxy_auth = self._proxy_auth
+
         try:
-            logger.debug(f"Async Proxy Request: {url}")
             return await session.get(
-                url,
-
-
-
+                url, proxy=proxy_url, proxy_auth=proxy_auth, **kwargs
+            )
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"Async request timed out: {e}", original_error=e
             )
         except aiohttp.ClientError as e:
-
-
+            raise ThordataNetworkError(f"Async request failed: {e}", original_error=e)
+
+    async def post(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        **kwargs: Any,
+    ) -> aiohttp.ClientResponse:
+        """
+        Send an async POST request through the Proxy Network.
+
+        Args:
+            url: The target URL.
+            proxy_config: Custom proxy configuration.
+            **kwargs: Additional aiohttp arguments.
+
+        Returns:
+            The aiohttp response object.
+        """
+        session = self._get_session()
+
+        logger.debug(f"Async Proxy POST: {url}")
+
+        if proxy_config:
+            proxy_url, proxy_auth = proxy_config.to_aiohttp_config()
+        else:
+            proxy_url = self._proxy_url
+            proxy_auth = self._proxy_auth
+
+        try:
+            return await session.post(
+                url, proxy=proxy_url, proxy_auth=proxy_auth, **kwargs
+            )
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"Async request timed out: {e}", original_error=e
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(f"Async request failed: {e}", original_error=e)
+
+    # =========================================================================
+    # SERP API Methods
+    # =========================================================================
 
     async def serp_search(
-        self,
-        query: str,
-
-
-
+        self,
+        query: str,
+        *,
+        engine: Union[Engine, str] = Engine.GOOGLE,
+        num: int = 10,
+        country: Optional[str] = None,
+        language: Optional[str] = None,
+        search_type: Optional[str] = None,
+        device: Optional[str] = None,
+        render_js: Optional[bool] = None,
+        no_cache: Optional[bool] = None,
+        output_format: str = "json",
+        **kwargs: Any,
     ) -> Dict[str, Any]:
         """
-        Execute
+        Execute an async SERP search.
+
+        Args:
+            query: Search keywords.
+            engine: Search engine.
+            num: Number of results.
+            country: Country code for localization.
+            language: Language code.
+            search_type: Type of search.
+            device: Device type ('desktop', 'mobile', 'tablet').
+            render_js: Enable JavaScript rendering in SERP.
+            no_cache: Disable internal caching.
+            output_format: 'json' or 'html'.
+            **kwargs: Additional parameters.
+
+        Returns:
+            Parsed JSON results or dict with 'html' key.
         """
         session = self._get_session()
 
-        # 1. Handle Enum conversion
        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
 
-
-
-
-
-
-
-
+        request = SerpRequest(
+            query=query,
+            engine=engine_str,
+            num=num,
+            country=country,
+            language=language,
+            search_type=search_type,
+            device=device,
+            render_js=render_js,
+            no_cache=no_cache,
+            output_format=output_format,
+            extra_params=kwargs,
+        )
+
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
 
-        # 3. Execute Request
         logger.info(f"Async SERP Search: {engine_str} - {query}")
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        try:
+            async with session.post(
+                self._serp_url,
+                data=payload,
+                headers=headers,
+            ) as response:
+                response.raise_for_status()
+
+                if output_format.lower() == "json":
+                    data = await response.json()
+
+                    if isinstance(data, dict):
+                        code = data.get("code")
+                        if code is not None and code != 200:
+                            msg = extract_error_message(data)
+                            raise_for_code(
+                                f"SERP API Error: {msg}",
+                                code=code,
+                                payload=data,
+                            )
+
+                    return parse_json_response(data)
+
+                text = await response.text()
+                return {"html": text}
+
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e,
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e,
+            )
+
+    async def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
+        """
+        Execute an async SERP search using a SerpRequest object.
+        """
+        session = self._get_session()
+
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"Async SERP Advanced: {request.engine} - {request.query}")
+
+        try:
+            async with session.post(
+                self._serp_url,
+                data=payload,
+                headers=headers,
+            ) as response:
+                response.raise_for_status()
+
+                if request.output_format.lower() == "json":
+                    data = await response.json()
+
+                    if isinstance(data, dict):
+                        code = data.get("code")
+                        if code is not None and code != 200:
+                            msg = extract_error_message(data)
+                            raise_for_code(
+                                f"SERP API Error: {msg}",
+                                code=code,
+                                payload=data,
+                            )
+
+                    return parse_json_response(data)
+
+                text = await response.text()
+                return {"html": text}
+
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e,
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e,
+            )
+
+    # =========================================================================
+    # Universal Scraping API Methods
+    # =========================================================================
 
     async def universal_scrape(
         self,
         url: str,
+        *,
         js_render: bool = False,
-        output_format: str = "
+        output_format: str = "html",
         country: Optional[str] = None,
-        block_resources:
+        block_resources: Optional[str] = None,
+        wait: Optional[int] = None,
+        wait_for: Optional[str] = None,
+        **kwargs: Any,
+    ) -> Union[str, bytes]:
+        """
+        Async scrape using Universal API (Web Unlocker).
+
+        Args:
+            url: Target URL.
+            js_render: Enable JavaScript rendering.
+            output_format: "html" or "png".
+            country: Geo-targeting country.
+            block_resources: Resources to block.
+            wait: Wait time in ms.
+            wait_for: CSS selector to wait for.
+
+        Returns:
+            HTML string or PNG bytes.
+        """
+        request = UniversalScrapeRequest(
+            url=url,
+            js_render=js_render,
+            output_format=output_format,
+            country=country,
+            block_resources=block_resources,
+            wait=wait,
+            wait_for=wait_for,
+            extra_params=kwargs,
+        )
+
+        return await self.universal_scrape_advanced(request)
+
+    async def universal_scrape_advanced(
+        self, request: UniversalScrapeRequest
     ) -> Union[str, bytes]:
         """
-        Async
+        Async scrape using a UniversalScrapeRequest object.
         """
         session = self._get_session()
 
-
-
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        resp_json
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    png_str += '=' * (4 - missing_padding)
-
-                return base64.b64decode(png_str)
-
-        return str(resp_json)
+        logger.info(f"Async Universal Scrape: {request.url}")
+
+        try:
+            async with session.post(
+                self._universal_url, data=payload, headers=headers
+            ) as response:
+                response.raise_for_status()
+
+                try:
+                    resp_json = await response.json()
+                except ValueError:
+                    if request.output_format.lower() == "png":
+                        return await response.read()
+                    return await response.text()
+
+                # Check for API errors
+                if isinstance(resp_json, dict):
+                    code = resp_json.get("code")
+                    if code is not None and code != 200:
+                        msg = extract_error_message(resp_json)
+                        raise_for_code(
+                            f"Universal API Error: {msg}", code=code, payload=resp_json
+                        )
+
+                    if "html" in resp_json:
+                        return resp_json["html"]
+
+                    if "png" in resp_json:
+                        return decode_base64_image(resp_json["png"])
+
+                return str(resp_json)
+
+        except asyncio.TimeoutError as e:
+            raise ThordataTimeoutError(
+                f"Universal scrape timed out: {e}", original_error=e
+            )
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(
+                f"Universal scrape failed: {e}", original_error=e
+            )
+
+    # =========================================================================
+    # Web Scraper API Methods
+    # =========================================================================
 
     async def create_scraper_task(
         self,
         file_name: str,
         spider_id: str,
         spider_name: str,
-
-        universal_params: Optional[Dict[str, Any]] = None
+        parameters: Dict[str, Any],
+        universal_params: Optional[Dict[str, Any]] = None,
     ) -> str:
         """
-        Create an
+        Create an async Web Scraper task.
+        """
+        config = ScraperTaskConfig(
+            file_name=file_name,
+            spider_id=spider_id,
+            spider_name=spider_name,
+            parameters=parameters,
+            universal_params=universal_params,
+        )
+
+        return await self.create_scraper_task_advanced(config)
+
+    async def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
+        """
+        Create a task using ScraperTaskConfig.
         """
         session = self._get_session()
 
-
-
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        payload = config.to_payload()
+        headers = build_auth_headers(self.scraper_token)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        logger.info(f"Async Task Creation: {config.spider_name}")
+
+        try:
+            async with session.post(
+                self._builder_url, data=payload, headers=headers
+            ) as response:
+                response.raise_for_status()
+                data = await response.json()
+
+                code = data.get("code")
+                if code != 200:
+                    msg = extract_error_message(data)
+                    raise_for_code(
+                        f"Task creation failed: {msg}", code=code, payload=data
+                    )
+
+                return data["data"]["task_id"]
+
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(f"Task creation failed: {e}", original_error=e)
 
     async def get_task_status(self, task_id: str) -> str:
         """
-        Check task status.
+        Check async task status.
         """
+        self._require_public_credentials()
         session = self._get_session()
 
-        headers =
-            "
-
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        headers = build_public_api_headers(
+            self.public_token or "", self.public_key or ""
+        )
         payload = {"tasks_ids": task_id}
 
-
-
-
-
-
-
-
-
-
+        try:
+            async with session.post(
+                self._status_url, data=payload, headers=headers
+            ) as response:
+                data = await response.json()
+
+                if data.get("code") == 200 and data.get("data"):
+                    for item in data["data"]:
+                        if str(item.get("task_id")) == str(task_id):
+                            return item.get("status", "unknown")
+
+                return "unknown"
+
+        except Exception as e:
+            logger.error(f"Async status check failed: {e}")
+            return "error"
 
     async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
         """
-        Get
+        Get download URL for completed task.
         """
+        self._require_public_credentials()
         session = self._get_session()
-
-        headers =
+
+        headers = build_public_api_headers(
+            self.public_token or "", self.public_key or ""
+        )
+        payload = {"tasks_id": task_id, "type": file_type}
+
+        logger.info(f"Async getting result for Task: {task_id}")
+
+        try:
+            async with session.post(
+                self._download_url, data=payload, headers=headers
+            ) as response:
+                data = await response.json()
+                code = data.get("code")
+
+                if code == 200 and data.get("data"):
+                    return data["data"]["download"]
+
+                msg = extract_error_message(data)
+                raise_for_code(f"Get result failed: {msg}", code=code, payload=data)
+                # This line won't be reached, but satisfies mypy
+                raise RuntimeError("Unexpected state")
+
+        except aiohttp.ClientError as e:
+            raise ThordataNetworkError(f"Get result failed: {e}", original_error=e)
+
+    async def wait_for_task(
+        self,
+        task_id: str,
+        *,
+        poll_interval: float = 5.0,
+        max_wait: float = 600.0,
+    ) -> str:
+        """
+        Wait for a task to complete.
+        """
+        elapsed = 0.0
+
+        while elapsed < max_wait:
+            status = await self.get_task_status(task_id)
+
+            logger.debug(f"Task {task_id} status: {status}")
+
+            terminal_statuses = {
+                "ready",
+                "success",
+                "finished",
+                "failed",
+                "error",
+                "cancelled",
+            }
+
+            if status.lower() in terminal_statuses:
+                return status
+
+            await asyncio.sleep(poll_interval)
+            elapsed += poll_interval
+
+        raise TimeoutError(f"Task {task_id} did not complete within {max_wait} seconds")
+
+    # =========================================================================
+    # Location API Methods
+    # =========================================================================
+
+    async def list_countries(
+        self, proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+    ) -> List[Dict[str, Any]]:
+        """List supported countries."""
+        return await self._get_locations(
+            "countries",
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+        )
+
+    async def list_states(
+        self,
+        country_code: str,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
+    ) -> List[Dict[str, Any]]:
+        """List supported states for a country."""
+        return await self._get_locations(
+            "states",
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            country_code=country_code,
+        )
+
+    async def list_cities(
+        self,
+        country_code: str,
+        state_code: Optional[str] = None,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
+    ) -> List[Dict[str, Any]]:
+        """List supported cities."""
+        kwargs = {
+            "proxy_type": (
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            "country_code": country_code,
+        }
+        if state_code:
+            kwargs["state_code"] = state_code
+
+        return await self._get_locations("cities", **kwargs)
+
+    async def list_asn(
+        self,
+        country_code: str,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
+    ) -> List[Dict[str, Any]]:
+        """List supported ASNs."""
+        return await self._get_locations(
+            "asn",
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            country_code=country_code,
+        )
+
+    async def _get_locations(
+        self, endpoint: str, **kwargs: Any
+    ) -> List[Dict[str, Any]]:
+        """Internal async locations API call."""
+        self._require_public_credentials()
+
+        params = {
             "token": self.public_token,
             "key": self.public_key,
-            "Content-Type": "application/x-www-form-urlencoded"
         }
-        # Fixed: Use the file_type argument instead of hardcoding "json"
-        payload = {"tasks_id": task_id, "type": file_type}
 
-
-
-
-
-
-
-
+        for key, value in kwargs.items():
+            params[key] = str(value)
+
+        url = f"{self._locations_base_url}/{endpoint}"
+
+        logger.debug(f"Async Locations API: {url}")
+
+        # Create temporary session for this request (no proxy needed)
+        async with aiohttp.ClientSession(trust_env=True) as temp_session:
+            async with temp_session.get(url, params=params) as response:
+                response.raise_for_status()
+                data = await response.json()
+
+                if isinstance(data, dict):
+                    code = data.get("code")
+                    if code is not None and code != 200:
+                        msg = data.get("msg", "")
+                        raise RuntimeError(
+                            f"Locations API error ({endpoint}): code={code}, msg={msg}"
+                        )
+                    return data.get("data") or []
+
+                if isinstance(data, list):
+                    return data
+
+                return []
+
+    # =========================================================================
+    # Helper Methods
+    # =========================================================================
+
+    def _require_public_credentials(self) -> None:
+        """Ensure public API credentials are available."""
+        if not self.public_token or not self.public_key:
+            raise ThordataConfigError(
+                "public_token and public_key are required for this operation. "
+                "Please provide them when initializing AsyncThordataClient."
+            )
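For the Web Scraper API specifically, the new methods are designed to be chained: create a task, poll it until it reaches a terminal status, then fetch the download URL. Here is a minimal sketch of that flow based on the signatures in the diff above; the credentials, spider_id, spider_name, and parameters values are placeholders (the real values are account-specific and are not defined in this diff).

```python
import asyncio

from thordata import AsyncThordataClient


async def run_task() -> None:
    async with AsyncThordataClient(
        scraper_token="your_scraper_token",
        public_token="your_public_token",  # required by get_task_status / get_task_result
        public_key="your_public_key",
    ) as client:
        task_id = await client.create_scraper_task(
            file_name="products",                        # placeholder
            spider_id="your_spider_id",                  # placeholder
            spider_name="your_spider_name",              # placeholder
            parameters={"url": "https://example.com"},   # placeholder schema
        )

        # Polls get_task_status until a terminal status or the max_wait deadline.
        status = await client.wait_for_task(task_id, poll_interval=5.0, max_wait=600.0)

        if status.lower() in {"ready", "success", "finished"}:
            download_url = await client.get_task_result(task_id, file_type="json")
            print(download_url)
        else:
            print(f"Task {task_id} ended with status: {status}")


asyncio.run(run_task())
```

Note that wait_for_task treats "ready", "success", "finished", "failed", "error", and "cancelled" all as terminal, so the status check after it is what distinguishes success from failure.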