thordata-sdk 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +33 -36
- thordata/_utils.py +21 -21
- thordata/async_client.py +230 -192
- thordata/client.py +281 -222
- thordata/enums.py +32 -6
- thordata/exceptions.py +60 -31
- thordata/models.py +173 -146
- thordata/parameters.py +7 -6
- thordata/retry.py +109 -111
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/METADATA +228 -10
- thordata_sdk-0.5.0.dist-info/RECORD +14 -0
- thordata_sdk-0.4.0.dist-info/RECORD +0 -14
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/top_level.txt +0 -0
thordata/client.py
CHANGED
|
@@ -6,17 +6,17 @@ Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
|
|
|
6
6
|
|
|
7
7
|
Example:
|
|
8
8
|
>>> from thordata import ThordataClient
|
|
9
|
-
>>>
|
|
9
|
+
>>>
|
|
10
10
|
>>> client = ThordataClient(
|
|
11
11
|
... scraper_token="your_token",
|
|
12
12
|
... public_token="your_public_token",
|
|
13
13
|
... public_key="your_public_key"
|
|
14
14
|
... )
|
|
15
|
-
>>>
|
|
15
|
+
>>>
|
|
16
16
|
>>> # Use the proxy network
|
|
17
17
|
>>> response = client.get("https://httpbin.org/ip")
|
|
18
18
|
>>> print(response.json())
|
|
19
|
-
>>>
|
|
19
|
+
>>>
|
|
20
20
|
>>> # Search with SERP API
|
|
21
21
|
>>> results = client.serp_search("python tutorial", engine="google")
|
|
22
22
|
"""
|
|
@@ -24,9 +24,18 @@ Example:
|
|
|
24
24
|
from __future__ import annotations
|
|
25
25
|
|
|
26
26
|
import logging
|
|
27
|
-
import requests
|
|
28
27
|
from typing import Any, Dict, List, Optional, Union
|
|
29
28
|
|
|
29
|
+
import os
|
|
30
|
+
import requests
|
|
31
|
+
|
|
32
|
+
from ._utils import (
|
|
33
|
+
build_auth_headers,
|
|
34
|
+
build_public_api_headers,
|
|
35
|
+
decode_base64_image,
|
|
36
|
+
extract_error_message,
|
|
37
|
+
parse_json_response,
|
|
38
|
+
)
|
|
30
39
|
from .enums import Engine, ProxyType
|
|
31
40
|
from .exceptions import (
|
|
32
41
|
ThordataConfigError,
|
|
@@ -37,18 +46,11 @@ from .exceptions import (
|
|
|
37
46
|
from .models import (
|
|
38
47
|
ProxyConfig,
|
|
39
48
|
ProxyProduct,
|
|
49
|
+
ScraperTaskConfig,
|
|
40
50
|
SerpRequest,
|
|
41
51
|
UniversalScrapeRequest,
|
|
42
|
-
ScraperTaskConfig,
|
|
43
52
|
)
|
|
44
53
|
from .retry import RetryConfig, with_retry
|
|
45
|
-
from ._utils import (
|
|
46
|
-
parse_json_response,
|
|
47
|
-
decode_base64_image,
|
|
48
|
-
build_auth_headers,
|
|
49
|
-
build_public_api_headers,
|
|
50
|
-
extract_error_message,
|
|
51
|
-
)
|
|
52
54
|
|
|
53
55
|
logger = logging.getLogger(__name__)
|
|
54
56
|
|
|
@@ -62,7 +64,7 @@ class ThordataClient:
|
|
|
62
64
|
- SERP API (Real-time Search Engine Results)
|
|
63
65
|
- Universal Scraping API (Web Unlocker - Single Page Rendering)
|
|
64
66
|
- Web Scraper API (Async Task Management)
|
|
65
|
-
|
|
67
|
+
|
|
66
68
|
Args:
|
|
67
69
|
scraper_token: The API token from your Dashboard.
|
|
68
70
|
public_token: The public API token (for task status, locations).
|
|
@@ -71,7 +73,7 @@ class ThordataClient:
|
|
|
71
73
|
proxy_port: Custom proxy gateway port (optional).
|
|
72
74
|
timeout: Default request timeout in seconds (default: 30).
|
|
73
75
|
retry_config: Configuration for automatic retries (optional).
|
|
74
|
-
|
|
76
|
+
|
|
75
77
|
Example:
|
|
76
78
|
>>> client = ThordataClient(
|
|
77
79
|
... scraper_token="your_scraper_token",
|
|
@@ -95,41 +97,79 @@ class ThordataClient:
|
|
|
95
97
|
proxy_port: int = 9999,
|
|
96
98
|
timeout: int = 30,
|
|
97
99
|
retry_config: Optional[RetryConfig] = None,
|
|
100
|
+
scraperapi_base_url: Optional[str] = None,
|
|
101
|
+
universalapi_base_url: Optional[str] = None,
|
|
102
|
+
web_scraper_api_base_url: Optional[str] = None,
|
|
103
|
+
locations_base_url: Optional[str] = None,
|
|
98
104
|
) -> None:
|
|
99
105
|
"""Initialize the Thordata Client."""
|
|
100
106
|
if not scraper_token:
|
|
101
107
|
raise ThordataConfigError("scraper_token is required")
|
|
102
|
-
|
|
108
|
+
|
|
103
109
|
self.scraper_token = scraper_token
|
|
104
110
|
self.public_token = public_token
|
|
105
111
|
self.public_key = public_key
|
|
106
|
-
|
|
112
|
+
|
|
107
113
|
# Proxy configuration
|
|
108
114
|
self._proxy_host = proxy_host
|
|
109
115
|
self._proxy_port = proxy_port
|
|
110
116
|
self._default_timeout = timeout
|
|
111
|
-
|
|
117
|
+
|
|
112
118
|
# Retry configuration
|
|
113
119
|
self._retry_config = retry_config or RetryConfig()
|
|
114
|
-
|
|
120
|
+
|
|
115
121
|
# Build default proxy URL (for basic usage)
|
|
116
122
|
self._default_proxy_url = (
|
|
117
123
|
f"http://td-customer-{self.scraper_token}:@{proxy_host}:{proxy_port}"
|
|
118
124
|
)
|
|
119
|
-
|
|
120
|
-
#
|
|
121
|
-
|
|
122
|
-
|
|
125
|
+
|
|
126
|
+
# Sessions:
|
|
127
|
+
# - _proxy_session: used for proxy network traffic to target sites
|
|
128
|
+
# - _api_session: used for Thordata APIs (SERP/Universal/Tasks/Locations)
|
|
129
|
+
#
|
|
130
|
+
# We intentionally do NOT set session-level proxies for _api_session,
|
|
131
|
+
# so developers can rely on system proxy settings (e.g., Clash) via env vars.
|
|
132
|
+
self._proxy_session = requests.Session()
|
|
133
|
+
self._proxy_session.trust_env = False
|
|
134
|
+
self._proxy_session.proxies = {
|
|
123
135
|
"http": self._default_proxy_url,
|
|
124
136
|
"https": self._default_proxy_url,
|
|
125
137
|
}
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
self.
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
138
|
+
|
|
139
|
+
self._api_session = requests.Session()
|
|
140
|
+
self._api_session.trust_env = True
|
|
141
|
+
|
|
142
|
+
# Base URLs (allow override via args or env vars for testing and custom routing)
|
|
143
|
+
scraperapi_base = (
|
|
144
|
+
scraperapi_base_url
|
|
145
|
+
or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
|
|
146
|
+
or self.BASE_URL
|
|
147
|
+
).rstrip("/")
|
|
148
|
+
|
|
149
|
+
universalapi_base = (
|
|
150
|
+
universalapi_base_url
|
|
151
|
+
or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
|
|
152
|
+
or self.UNIVERSAL_URL
|
|
153
|
+
).rstrip("/")
|
|
154
|
+
|
|
155
|
+
web_scraper_api_base = (
|
|
156
|
+
web_scraper_api_base_url
|
|
157
|
+
or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
|
|
158
|
+
or self.API_URL
|
|
159
|
+
).rstrip("/")
|
|
160
|
+
|
|
161
|
+
locations_base = (
|
|
162
|
+
locations_base_url
|
|
163
|
+
or os.getenv("THORDATA_LOCATIONS_BASE_URL")
|
|
164
|
+
or self.LOCATIONS_URL
|
|
165
|
+
).rstrip("/")
|
|
166
|
+
|
|
167
|
+
self._serp_url = f"{scraperapi_base}/request"
|
|
168
|
+
self._builder_url = f"{scraperapi_base}/builder"
|
|
169
|
+
self._universal_url = f"{universalapi_base}/request"
|
|
170
|
+
self._status_url = f"{web_scraper_api_base}/tasks-status"
|
|
171
|
+
self._download_url = f"{web_scraper_api_base}/tasks-download"
|
|
172
|
+
self._locations_base_url = locations_base
|
|
133
173
|
|
|
134
174
|
# =========================================================================
|
|
135
175
|
# Proxy Network Methods
|
|
@@ -154,11 +194,11 @@ class ThordataClient:
|
|
|
154
194
|
|
|
155
195
|
Returns:
|
|
156
196
|
The response object.
|
|
157
|
-
|
|
197
|
+
|
|
158
198
|
Example:
|
|
159
199
|
>>> # Basic request
|
|
160
200
|
>>> response = client.get("https://httpbin.org/ip")
|
|
161
|
-
>>>
|
|
201
|
+
>>>
|
|
162
202
|
>>> # With geo-targeting
|
|
163
203
|
>>> from thordata.models import ProxyConfig
|
|
164
204
|
>>> config = ProxyConfig(
|
|
@@ -170,13 +210,13 @@ class ThordataClient:
|
|
|
170
210
|
>>> response = client.get("https://httpbin.org/ip", proxy_config=config)
|
|
171
211
|
"""
|
|
172
212
|
logger.debug(f"Proxy GET request: {url}")
|
|
173
|
-
|
|
213
|
+
|
|
174
214
|
timeout = timeout or self._default_timeout
|
|
175
|
-
|
|
215
|
+
|
|
176
216
|
if proxy_config:
|
|
177
217
|
proxies = proxy_config.to_proxies_dict()
|
|
178
218
|
kwargs["proxies"] = proxies
|
|
179
|
-
|
|
219
|
+
|
|
180
220
|
return self._request_with_retry("GET", url, timeout=timeout, **kwargs)
|
|
181
221
|
|
|
182
222
|
def post(
|
|
@@ -200,13 +240,13 @@ class ThordataClient:
|
|
|
200
240
|
The response object.
|
|
201
241
|
"""
|
|
202
242
|
logger.debug(f"Proxy POST request: {url}")
|
|
203
|
-
|
|
243
|
+
|
|
204
244
|
timeout = timeout or self._default_timeout
|
|
205
|
-
|
|
245
|
+
|
|
206
246
|
if proxy_config:
|
|
207
247
|
proxies = proxy_config.to_proxies_dict()
|
|
208
248
|
kwargs["proxies"] = proxies
|
|
209
|
-
|
|
249
|
+
|
|
210
250
|
return self._request_with_retry("POST", url, timeout=timeout, **kwargs)
|
|
211
251
|
|
|
212
252
|
def build_proxy_url(
|
|
@@ -221,10 +261,10 @@ class ThordataClient:
|
|
|
221
261
|
) -> str:
|
|
222
262
|
"""
|
|
223
263
|
Build a proxy URL with custom targeting options.
|
|
224
|
-
|
|
264
|
+
|
|
225
265
|
This is a convenience method for creating proxy URLs without
|
|
226
266
|
manually constructing a ProxyConfig.
|
|
227
|
-
|
|
267
|
+
|
|
228
268
|
Args:
|
|
229
269
|
country: Target country code (e.g., 'us', 'gb').
|
|
230
270
|
state: Target state (e.g., 'california').
|
|
@@ -232,10 +272,10 @@ class ThordataClient:
|
|
|
232
272
|
session_id: Session ID for sticky sessions.
|
|
233
273
|
session_duration: Session duration in minutes (1-90).
|
|
234
274
|
product: Proxy product type.
|
|
235
|
-
|
|
275
|
+
|
|
236
276
|
Returns:
|
|
237
277
|
The proxy URL string.
|
|
238
|
-
|
|
278
|
+
|
|
239
279
|
Example:
|
|
240
280
|
>>> url = client.build_proxy_url(country="us", city="seattle")
|
|
241
281
|
>>> proxies = {"http": url, "https": url}
|
|
@@ -268,39 +308,51 @@ class ThordataClient:
|
|
|
268
308
|
country: Optional[str] = None,
|
|
269
309
|
language: Optional[str] = None,
|
|
270
310
|
search_type: Optional[str] = None,
|
|
311
|
+
device: Optional[str] = None,
|
|
312
|
+
render_js: Optional[bool] = None,
|
|
313
|
+
no_cache: Optional[bool] = None,
|
|
314
|
+
output_format: str = "json",
|
|
271
315
|
**kwargs: Any,
|
|
272
316
|
) -> Dict[str, Any]:
|
|
273
317
|
"""
|
|
274
318
|
Execute a real-time SERP (Search Engine Results Page) search.
|
|
275
|
-
|
|
319
|
+
|
|
276
320
|
Args:
|
|
277
321
|
query: The search keywords.
|
|
278
322
|
engine: Search engine (google, bing, yandex, duckduckgo, baidu).
|
|
279
323
|
num: Number of results to retrieve (default: 10).
|
|
280
324
|
country: Country code for localized results (e.g., 'us').
|
|
281
325
|
language: Language code for interface (e.g., 'en').
|
|
282
|
-
search_type: Type of search (images, news, shopping, videos).
|
|
326
|
+
search_type: Type of search (images, news, shopping, videos, etc.).
|
|
327
|
+
device: Device type ('desktop', 'mobile', 'tablet').
|
|
328
|
+
render_js: Enable JavaScript rendering in SERP (render_js=True).
|
|
329
|
+
no_cache: Disable internal caching (no_cache=True).
|
|
330
|
+
output_format: 'json' to return parsed JSON (default),
|
|
331
|
+
'html' to return HTML wrapped in {'html': ...}.
|
|
283
332
|
**kwargs: Additional engine-specific parameters.
|
|
284
333
|
|
|
285
334
|
Returns:
|
|
286
|
-
Parsed JSON results
|
|
287
|
-
|
|
335
|
+
Dict[str, Any]: Parsed JSON results or a dict with 'html' key.
|
|
336
|
+
|
|
288
337
|
Example:
|
|
289
338
|
>>> # Basic search
|
|
290
339
|
>>> results = client.serp_search("python tutorial")
|
|
291
|
-
>>>
|
|
340
|
+
>>>
|
|
292
341
|
>>> # With options
|
|
293
342
|
>>> results = client.serp_search(
|
|
294
343
|
... "laptop reviews",
|
|
295
344
|
... engine="google",
|
|
296
345
|
... num=20,
|
|
297
346
|
... country="us",
|
|
298
|
-
... search_type="shopping"
|
|
347
|
+
... search_type="shopping",
|
|
348
|
+
... device="mobile",
|
|
349
|
+
... render_js=True,
|
|
350
|
+
... no_cache=True,
|
|
299
351
|
... )
|
|
300
352
|
"""
|
|
301
353
|
# Normalize engine
|
|
302
354
|
engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
|
|
303
|
-
|
|
355
|
+
|
|
304
356
|
# Build request using model
|
|
305
357
|
request = SerpRequest(
|
|
306
358
|
query=query,
|
|
@@ -309,49 +361,69 @@ class ThordataClient:
|
|
|
309
361
|
country=country,
|
|
310
362
|
language=language,
|
|
311
363
|
search_type=search_type,
|
|
364
|
+
device=device,
|
|
365
|
+
render_js=render_js,
|
|
366
|
+
no_cache=no_cache,
|
|
367
|
+
output_format=output_format,
|
|
312
368
|
extra_params=kwargs,
|
|
313
369
|
)
|
|
314
|
-
|
|
370
|
+
|
|
315
371
|
payload = request.to_payload()
|
|
316
372
|
headers = build_auth_headers(self.scraper_token)
|
|
317
|
-
|
|
373
|
+
|
|
318
374
|
logger.info(f"SERP Search: {engine_str} - {query}")
|
|
319
|
-
|
|
375
|
+
|
|
320
376
|
try:
|
|
321
|
-
response = self.
|
|
377
|
+
response = self._api_session.post(
|
|
322
378
|
self._serp_url,
|
|
323
379
|
data=payload,
|
|
324
380
|
headers=headers,
|
|
325
381
|
timeout=60,
|
|
326
382
|
)
|
|
327
383
|
response.raise_for_status()
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
384
|
+
|
|
385
|
+
# JSON mode (default)
|
|
386
|
+
if output_format.lower() == "json":
|
|
387
|
+
data = response.json()
|
|
388
|
+
|
|
389
|
+
if isinstance(data, dict):
|
|
390
|
+
code = data.get("code")
|
|
391
|
+
if code is not None and code != 200:
|
|
392
|
+
msg = extract_error_message(data)
|
|
393
|
+
raise_for_code(
|
|
394
|
+
f"SERP API Error: {msg}",
|
|
395
|
+
code=code,
|
|
396
|
+
payload=data,
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
return parse_json_response(data)
|
|
400
|
+
|
|
401
|
+
# HTML mode: wrap as dict to keep return type stable
|
|
402
|
+
return {"html": response.text}
|
|
403
|
+
|
|
332
404
|
except requests.Timeout as e:
|
|
333
405
|
raise ThordataTimeoutError(
|
|
334
406
|
f"SERP request timed out: {e}",
|
|
335
|
-
original_error=e
|
|
407
|
+
original_error=e,
|
|
336
408
|
)
|
|
337
409
|
except requests.RequestException as e:
|
|
338
410
|
raise ThordataNetworkError(
|
|
339
411
|
f"SERP request failed: {e}",
|
|
340
|
-
original_error=e
|
|
412
|
+
original_error=e,
|
|
341
413
|
)
|
|
342
414
|
|
|
343
415
|
def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
|
|
344
416
|
"""
|
|
345
417
|
Execute a SERP search using a SerpRequest object.
|
|
346
|
-
|
|
418
|
+
|
|
347
419
|
This method provides full control over all search parameters.
|
|
348
|
-
|
|
420
|
+
|
|
349
421
|
Args:
|
|
350
422
|
request: A SerpRequest object with all parameters configured.
|
|
351
|
-
|
|
423
|
+
|
|
352
424
|
Returns:
|
|
353
|
-
Parsed JSON results.
|
|
354
|
-
|
|
425
|
+
Dict[str, Any]: Parsed JSON results or dict with 'html' key.
|
|
426
|
+
|
|
355
427
|
Example:
|
|
356
428
|
>>> from thordata.models import SerpRequest
|
|
357
429
|
>>> request = SerpRequest(
|
|
@@ -368,30 +440,44 @@ class ThordataClient:
|
|
|
368
440
|
"""
|
|
369
441
|
payload = request.to_payload()
|
|
370
442
|
headers = build_auth_headers(self.scraper_token)
|
|
371
|
-
|
|
443
|
+
|
|
372
444
|
logger.info(f"SERP Advanced Search: {request.engine} - {request.query}")
|
|
373
|
-
|
|
445
|
+
|
|
374
446
|
try:
|
|
375
|
-
response = self.
|
|
447
|
+
response = self._api_session.post(
|
|
376
448
|
self._serp_url,
|
|
377
449
|
data=payload,
|
|
378
450
|
headers=headers,
|
|
379
451
|
timeout=60,
|
|
380
452
|
)
|
|
381
453
|
response.raise_for_status()
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
454
|
+
|
|
455
|
+
if request.output_format.lower() == "json":
|
|
456
|
+
data = response.json()
|
|
457
|
+
|
|
458
|
+
if isinstance(data, dict):
|
|
459
|
+
code = data.get("code")
|
|
460
|
+
if code is not None and code != 200:
|
|
461
|
+
msg = extract_error_message(data)
|
|
462
|
+
raise_for_code(
|
|
463
|
+
f"SERP API Error: {msg}",
|
|
464
|
+
code=code,
|
|
465
|
+
payload=data,
|
|
466
|
+
)
|
|
467
|
+
|
|
468
|
+
return parse_json_response(data)
|
|
469
|
+
|
|
470
|
+
return {"html": response.text}
|
|
471
|
+
|
|
386
472
|
except requests.Timeout as e:
|
|
387
473
|
raise ThordataTimeoutError(
|
|
388
474
|
f"SERP request timed out: {e}",
|
|
389
|
-
original_error=e
|
|
475
|
+
original_error=e,
|
|
390
476
|
)
|
|
391
477
|
except requests.RequestException as e:
|
|
392
478
|
raise ThordataNetworkError(
|
|
393
479
|
f"SERP request failed: {e}",
|
|
394
|
-
original_error=e
|
|
480
|
+
original_error=e,
|
|
395
481
|
)
|
|
396
482
|
|
|
397
483
|
# =========================================================================
|
|
@@ -412,7 +498,7 @@ class ThordataClient:
|
|
|
412
498
|
) -> Union[str, bytes]:
|
|
413
499
|
"""
|
|
414
500
|
Scrape a URL using the Universal Scraping API (Web Unlocker).
|
|
415
|
-
|
|
501
|
+
|
|
416
502
|
Automatically bypasses Cloudflare, CAPTCHAs, and antibot systems.
|
|
417
503
|
|
|
418
504
|
Args:
|
|
@@ -427,11 +513,11 @@ class ThordataClient:
|
|
|
427
513
|
|
|
428
514
|
Returns:
|
|
429
515
|
HTML string or PNG bytes depending on output_format.
|
|
430
|
-
|
|
516
|
+
|
|
431
517
|
Example:
|
|
432
518
|
>>> # Get HTML
|
|
433
519
|
>>> html = client.universal_scrape("https://example.com", js_render=True)
|
|
434
|
-
>>>
|
|
520
|
+
>>>
|
|
435
521
|
>>> # Get screenshot
|
|
436
522
|
>>> png = client.universal_scrape(
|
|
437
523
|
... "https://example.com",
|
|
@@ -451,53 +537,50 @@ class ThordataClient:
|
|
|
451
537
|
wait_for=wait_for,
|
|
452
538
|
extra_params=kwargs,
|
|
453
539
|
)
|
|
454
|
-
|
|
540
|
+
|
|
455
541
|
return self.universal_scrape_advanced(request)
|
|
456
542
|
|
|
457
543
|
def universal_scrape_advanced(
|
|
458
|
-
self,
|
|
459
|
-
request: UniversalScrapeRequest
|
|
544
|
+
self, request: UniversalScrapeRequest
|
|
460
545
|
) -> Union[str, bytes]:
|
|
461
546
|
"""
|
|
462
547
|
Scrape using a UniversalScrapeRequest object for full control.
|
|
463
|
-
|
|
548
|
+
|
|
464
549
|
Args:
|
|
465
550
|
request: A UniversalScrapeRequest with all parameters.
|
|
466
|
-
|
|
551
|
+
|
|
467
552
|
Returns:
|
|
468
553
|
HTML string or PNG bytes.
|
|
469
554
|
"""
|
|
470
555
|
payload = request.to_payload()
|
|
471
556
|
headers = build_auth_headers(self.scraper_token)
|
|
472
|
-
|
|
473
|
-
logger.info(
|
|
474
|
-
|
|
557
|
+
|
|
558
|
+
logger.info(
|
|
559
|
+
f"Universal Scrape: {request.url} (format: {request.output_format})"
|
|
560
|
+
)
|
|
561
|
+
|
|
475
562
|
try:
|
|
476
|
-
response = self.
|
|
563
|
+
response = self._api_session.post(
|
|
477
564
|
self._universal_url,
|
|
478
565
|
data=payload,
|
|
479
566
|
headers=headers,
|
|
480
567
|
timeout=60,
|
|
481
568
|
)
|
|
482
569
|
response.raise_for_status()
|
|
483
|
-
|
|
570
|
+
|
|
484
571
|
return self._process_universal_response(response, request.output_format)
|
|
485
|
-
|
|
572
|
+
|
|
486
573
|
except requests.Timeout as e:
|
|
487
574
|
raise ThordataTimeoutError(
|
|
488
|
-
f"Universal scrape timed out: {e}",
|
|
489
|
-
original_error=e
|
|
575
|
+
f"Universal scrape timed out: {e}", original_error=e
|
|
490
576
|
)
|
|
491
577
|
except requests.RequestException as e:
|
|
492
578
|
raise ThordataNetworkError(
|
|
493
|
-
f"Universal scrape failed: {e}",
|
|
494
|
-
original_error=e
|
|
579
|
+
f"Universal scrape failed: {e}", original_error=e
|
|
495
580
|
)
|
|
496
581
|
|
|
497
582
|
def _process_universal_response(
|
|
498
|
-
self,
|
|
499
|
-
response: requests.Response,
|
|
500
|
-
output_format: str
|
|
583
|
+
self, response: requests.Response, output_format: str
|
|
501
584
|
) -> Union[str, bytes]:
|
|
502
585
|
"""Process the response from Universal API."""
|
|
503
586
|
# Try to parse as JSON
|
|
@@ -508,26 +591,24 @@ class ThordataClient:
|
|
|
508
591
|
if output_format.lower() == "png":
|
|
509
592
|
return response.content
|
|
510
593
|
return response.text
|
|
511
|
-
|
|
594
|
+
|
|
512
595
|
# Check for API-level errors
|
|
513
596
|
if isinstance(resp_json, dict):
|
|
514
597
|
code = resp_json.get("code")
|
|
515
598
|
if code is not None and code != 200:
|
|
516
599
|
msg = extract_error_message(resp_json)
|
|
517
600
|
raise_for_code(
|
|
518
|
-
f"Universal API Error: {msg}",
|
|
519
|
-
code=code,
|
|
520
|
-
payload=resp_json
|
|
601
|
+
f"Universal API Error: {msg}", code=code, payload=resp_json
|
|
521
602
|
)
|
|
522
|
-
|
|
603
|
+
|
|
523
604
|
# Extract HTML
|
|
524
605
|
if "html" in resp_json:
|
|
525
606
|
return resp_json["html"]
|
|
526
|
-
|
|
607
|
+
|
|
527
608
|
# Extract PNG
|
|
528
609
|
if "png" in resp_json:
|
|
529
610
|
return decode_base64_image(resp_json["png"])
|
|
530
|
-
|
|
611
|
+
|
|
531
612
|
# Fallback
|
|
532
613
|
return str(resp_json)
|
|
533
614
|
|
|
@@ -545,7 +626,7 @@ class ThordataClient:
|
|
|
545
626
|
) -> str:
|
|
546
627
|
"""
|
|
547
628
|
Create an asynchronous Web Scraper task.
|
|
548
|
-
|
|
629
|
+
|
|
549
630
|
Note: Get spider_id and spider_name from the Thordata Dashboard.
|
|
550
631
|
|
|
551
632
|
Args:
|
|
@@ -557,7 +638,7 @@ class ThordataClient:
|
|
|
557
638
|
|
|
558
639
|
Returns:
|
|
559
640
|
The created task_id.
|
|
560
|
-
|
|
641
|
+
|
|
561
642
|
Example:
|
|
562
643
|
>>> task_id = client.create_scraper_task(
|
|
563
644
|
... file_name="youtube_data",
|
|
@@ -573,51 +654,44 @@ class ThordataClient:
|
|
|
573
654
|
parameters=parameters,
|
|
574
655
|
universal_params=universal_params,
|
|
575
656
|
)
|
|
576
|
-
|
|
657
|
+
|
|
577
658
|
return self.create_scraper_task_advanced(config)
|
|
578
659
|
|
|
579
660
|
def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
|
|
580
661
|
"""
|
|
581
662
|
Create a scraper task using a ScraperTaskConfig object.
|
|
582
|
-
|
|
663
|
+
|
|
583
664
|
Args:
|
|
584
665
|
config: Task configuration.
|
|
585
|
-
|
|
666
|
+
|
|
586
667
|
Returns:
|
|
587
668
|
The created task_id.
|
|
588
669
|
"""
|
|
589
670
|
payload = config.to_payload()
|
|
590
671
|
headers = build_auth_headers(self.scraper_token)
|
|
591
|
-
|
|
672
|
+
|
|
592
673
|
logger.info(f"Creating Scraper Task: {config.spider_name}")
|
|
593
|
-
|
|
674
|
+
|
|
594
675
|
try:
|
|
595
|
-
response = self.
|
|
676
|
+
response = self._api_session.post(
|
|
596
677
|
self._builder_url,
|
|
597
678
|
data=payload,
|
|
598
679
|
headers=headers,
|
|
599
680
|
timeout=30,
|
|
600
681
|
)
|
|
601
682
|
response.raise_for_status()
|
|
602
|
-
|
|
683
|
+
|
|
603
684
|
data = response.json()
|
|
604
685
|
code = data.get("code")
|
|
605
|
-
|
|
686
|
+
|
|
606
687
|
if code != 200:
|
|
607
688
|
msg = extract_error_message(data)
|
|
608
|
-
raise_for_code(
|
|
609
|
-
|
|
610
|
-
code=code,
|
|
611
|
-
payload=data
|
|
612
|
-
)
|
|
613
|
-
|
|
689
|
+
raise_for_code(f"Task creation failed: {msg}", code=code, payload=data)
|
|
690
|
+
|
|
614
691
|
return data["data"]["task_id"]
|
|
615
|
-
|
|
692
|
+
|
|
616
693
|
except requests.RequestException as e:
|
|
617
|
-
raise ThordataNetworkError(
|
|
618
|
-
f"Task creation failed: {e}",
|
|
619
|
-
original_error=e
|
|
620
|
-
)
|
|
694
|
+
raise ThordataNetworkError(f"Task creation failed: {e}", original_error=e)
|
|
621
695
|
|
|
622
696
|
def get_task_status(self, task_id: str) -> str:
|
|
623
697
|
"""
|
|
@@ -630,81 +704,69 @@ class ThordataClient:
|
|
|
630
704
|
Status string (e.g., "running", "ready", "failed").
|
|
631
705
|
"""
|
|
632
706
|
self._require_public_credentials()
|
|
633
|
-
|
|
634
|
-
headers = build_public_api_headers(
|
|
707
|
+
|
|
708
|
+
headers = build_public_api_headers(
|
|
709
|
+
self.public_token or "", self.public_key or ""
|
|
710
|
+
)
|
|
635
711
|
payload = {"tasks_ids": task_id}
|
|
636
|
-
|
|
712
|
+
|
|
637
713
|
try:
|
|
638
|
-
response = self.
|
|
714
|
+
response = self._api_session.post(
|
|
639
715
|
self._status_url,
|
|
640
716
|
data=payload,
|
|
641
717
|
headers=headers,
|
|
642
718
|
timeout=30,
|
|
643
719
|
)
|
|
644
720
|
response.raise_for_status()
|
|
645
|
-
|
|
721
|
+
|
|
646
722
|
data = response.json()
|
|
647
|
-
|
|
723
|
+
|
|
648
724
|
if data.get("code") == 200 and data.get("data"):
|
|
649
725
|
for item in data["data"]:
|
|
650
726
|
if str(item.get("task_id")) == str(task_id):
|
|
651
727
|
return item.get("status", "unknown")
|
|
652
|
-
|
|
728
|
+
|
|
653
729
|
return "unknown"
|
|
654
|
-
|
|
730
|
+
|
|
655
731
|
except Exception as e:
|
|
656
732
|
logger.error(f"Status check failed: {e}")
|
|
657
733
|
return "error"
|
|
658
734
|
|
|
659
|
-
def get_task_result(
|
|
660
|
-
self,
|
|
661
|
-
task_id: str,
|
|
662
|
-
file_type: str = "json"
|
|
663
|
-
) -> str:
|
|
735
|
+
def get_task_result(self, task_id: str, file_type: str = "json") -> str:
|
|
664
736
|
"""
|
|
665
737
|
Get the download URL for a completed task.
|
|
666
|
-
|
|
667
|
-
Args:
|
|
668
|
-
task_id: The task ID.
|
|
669
|
-
file_type: Output format ("json", "csv", "xlsx").
|
|
670
|
-
|
|
671
|
-
Returns:
|
|
672
|
-
The download URL for the result file.
|
|
673
738
|
"""
|
|
674
739
|
self._require_public_credentials()
|
|
675
|
-
|
|
676
|
-
headers = build_public_api_headers(
|
|
740
|
+
|
|
741
|
+
headers = build_public_api_headers(
|
|
742
|
+
self.public_token or "", self.public_key or ""
|
|
743
|
+
)
|
|
677
744
|
payload = {"tasks_id": task_id, "type": file_type}
|
|
678
|
-
|
|
745
|
+
|
|
679
746
|
logger.info(f"Getting result URL for Task: {task_id}")
|
|
680
|
-
|
|
747
|
+
|
|
681
748
|
try:
|
|
682
|
-
response = self.
|
|
749
|
+
response = self._api_session.post(
|
|
683
750
|
self._download_url,
|
|
684
751
|
data=payload,
|
|
685
752
|
headers=headers,
|
|
686
753
|
timeout=30,
|
|
687
754
|
)
|
|
688
755
|
response.raise_for_status()
|
|
689
|
-
|
|
756
|
+
|
|
690
757
|
data = response.json()
|
|
691
758
|
code = data.get("code")
|
|
692
|
-
|
|
759
|
+
|
|
693
760
|
if code == 200 and data.get("data"):
|
|
694
761
|
return data["data"]["download"]
|
|
695
|
-
|
|
762
|
+
|
|
696
763
|
msg = extract_error_message(data)
|
|
697
|
-
raise_for_code(
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
)
|
|
702
|
-
|
|
764
|
+
raise_for_code(f"Get result failed: {msg}", code=code, payload=data)
|
|
765
|
+
# This line won't be reached, but satisfies mypy
|
|
766
|
+
raise RuntimeError("Unexpected state")
|
|
767
|
+
|
|
703
768
|
except requests.RequestException as e:
|
|
704
|
-
raise ThordataNetworkError(
|
|
705
|
-
f"Get result failed: {e}",
|
|
706
|
-
original_error=e
|
|
707
|
-
)
|
|
769
|
+
raise ThordataNetworkError(f"Get result failed: {e}", original_error=e)
|
|
708
770
|
|
|
709
771
|
def wait_for_task(
|
|
710
772
|
self,
|
|
@@ -715,18 +777,18 @@ class ThordataClient:
|
|
|
715
777
|
) -> str:
|
|
716
778
|
"""
|
|
717
779
|
Wait for a task to complete.
|
|
718
|
-
|
|
780
|
+
|
|
719
781
|
Args:
|
|
720
782
|
task_id: The task ID to wait for.
|
|
721
783
|
poll_interval: Seconds between status checks.
|
|
722
784
|
max_wait: Maximum seconds to wait.
|
|
723
|
-
|
|
785
|
+
|
|
724
786
|
Returns:
|
|
725
787
|
Final task status.
|
|
726
|
-
|
|
788
|
+
|
|
727
789
|
Raises:
|
|
728
790
|
TimeoutError: If max_wait is exceeded.
|
|
729
|
-
|
|
791
|
+
|
|
730
792
|
Example:
|
|
731
793
|
>>> task_id = client.create_scraper_task(...)
|
|
732
794
|
>>> status = client.wait_for_task(task_id, max_wait=300)
|
|
@@ -734,36 +796,37 @@ class ThordataClient:
|
|
|
734
796
|
... url = client.get_task_result(task_id)
|
|
735
797
|
"""
|
|
736
798
|
import time
|
|
737
|
-
|
|
799
|
+
|
|
738
800
|
elapsed = 0.0
|
|
739
|
-
|
|
801
|
+
|
|
740
802
|
while elapsed < max_wait:
|
|
741
803
|
status = self.get_task_status(task_id)
|
|
742
|
-
|
|
804
|
+
|
|
743
805
|
logger.debug(f"Task {task_id} status: {status}")
|
|
744
|
-
|
|
806
|
+
|
|
745
807
|
terminal_statuses = {
|
|
746
|
-
"ready",
|
|
747
|
-
"
|
|
808
|
+
"ready",
|
|
809
|
+
"success",
|
|
810
|
+
"finished",
|
|
811
|
+
"failed",
|
|
812
|
+
"error",
|
|
813
|
+
"cancelled",
|
|
748
814
|
}
|
|
749
|
-
|
|
815
|
+
|
|
750
816
|
if status.lower() in terminal_statuses:
|
|
751
817
|
return status
|
|
752
|
-
|
|
818
|
+
|
|
753
819
|
time.sleep(poll_interval)
|
|
754
820
|
elapsed += poll_interval
|
|
755
|
-
|
|
756
|
-
raise TimeoutError(
|
|
757
|
-
f"Task {task_id} did not complete within {max_wait} seconds"
|
|
758
|
-
)
|
|
821
|
+
|
|
822
|
+
raise TimeoutError(f"Task {task_id} did not complete within {max_wait} seconds")
|
|
759
823
|
|
|
760
824
|
# =========================================================================
|
|
761
825
|
# Location API Methods
|
|
762
826
|
# =========================================================================
|
|
763
827
|
|
|
764
828
|
def list_countries(
|
|
765
|
-
self,
|
|
766
|
-
proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
|
|
829
|
+
self, proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
|
|
767
830
|
) -> List[Dict[str, Any]]:
|
|
768
831
|
"""
|
|
769
832
|
List supported countries for proxies.
|
|
@@ -776,13 +839,15 @@ class ThordataClient:
|
|
|
776
839
|
"""
|
|
777
840
|
return self._get_locations(
|
|
778
841
|
"countries",
|
|
779
|
-
proxy_type=
|
|
842
|
+
proxy_type=(
|
|
843
|
+
int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
844
|
+
),
|
|
780
845
|
)
|
|
781
846
|
|
|
782
847
|
def list_states(
|
|
783
848
|
self,
|
|
784
849
|
country_code: str,
|
|
785
|
-
proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
|
|
850
|
+
proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
|
|
786
851
|
) -> List[Dict[str, Any]]:
|
|
787
852
|
"""
|
|
788
853
|
List supported states for a country.
|
|
@@ -796,15 +861,17 @@ class ThordataClient:
|
|
|
796
861
|
"""
|
|
797
862
|
return self._get_locations(
|
|
798
863
|
"states",
|
|
799
|
-
proxy_type=
|
|
800
|
-
|
|
864
|
+
proxy_type=(
|
|
865
|
+
int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
866
|
+
),
|
|
867
|
+
country_code=country_code,
|
|
801
868
|
)
|
|
802
869
|
|
|
803
870
|
def list_cities(
|
|
804
871
|
self,
|
|
805
872
|
country_code: str,
|
|
806
873
|
state_code: Optional[str] = None,
|
|
807
|
-
proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
|
|
874
|
+
proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
|
|
808
875
|
) -> List[Dict[str, Any]]:
|
|
809
876
|
"""
|
|
810
877
|
List supported cities for a country/state.
|
|
@@ -818,18 +885,20 @@ class ThordataClient:
|
|
|
818
885
|
List of city records.
|
|
819
886
|
"""
|
|
820
887
|
kwargs = {
|
|
821
|
-
"proxy_type":
|
|
822
|
-
|
|
888
|
+
"proxy_type": (
|
|
889
|
+
int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
890
|
+
),
|
|
891
|
+
"country_code": country_code,
|
|
823
892
|
}
|
|
824
893
|
if state_code:
|
|
825
894
|
kwargs["state_code"] = state_code
|
|
826
|
-
|
|
895
|
+
|
|
827
896
|
return self._get_locations("cities", **kwargs)
|
|
828
897
|
|
|
829
898
|
def list_asn(
|
|
830
899
|
self,
|
|
831
900
|
country_code: str,
|
|
832
|
-
proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
|
|
901
|
+
proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
|
|
833
902
|
) -> List[Dict[str, Any]]:
|
|
834
903
|
"""
|
|
835
904
|
List supported ASNs for a country.
|
|
@@ -843,36 +912,34 @@ class ThordataClient:
|
|
|
843
912
|
"""
|
|
844
913
|
return self._get_locations(
|
|
845
914
|
"asn",
|
|
846
|
-
proxy_type=
|
|
847
|
-
|
|
915
|
+
proxy_type=(
|
|
916
|
+
int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
|
|
917
|
+
),
|
|
918
|
+
country_code=country_code,
|
|
848
919
|
)
|
|
849
920
|
|
|
850
|
-
def _get_locations(
|
|
851
|
-
self,
|
|
852
|
-
endpoint: str,
|
|
853
|
-
**kwargs: Any
|
|
854
|
-
) -> List[Dict[str, Any]]:
|
|
921
|
+
def _get_locations(self, endpoint: str, **kwargs: Any) -> List[Dict[str, Any]]:
|
|
855
922
|
"""Internal method to call locations API."""
|
|
856
923
|
self._require_public_credentials()
|
|
857
|
-
|
|
924
|
+
|
|
858
925
|
params = {
|
|
859
926
|
"token": self.public_token,
|
|
860
927
|
"key": self.public_key,
|
|
861
928
|
}
|
|
862
|
-
|
|
929
|
+
|
|
863
930
|
for key, value in kwargs.items():
|
|
864
931
|
params[key] = str(value)
|
|
865
|
-
|
|
866
|
-
url = f"{self.
|
|
867
|
-
|
|
932
|
+
|
|
933
|
+
url = f"{self._locations_base_url}/{endpoint}"
|
|
934
|
+
|
|
868
935
|
logger.debug(f"Locations API request: {url}")
|
|
869
|
-
|
|
936
|
+
|
|
870
937
|
# Use requests.get directly (no proxy needed for this API)
|
|
871
|
-
response =
|
|
938
|
+
response = self._api_session.get(url, params=params, timeout=30)
|
|
872
939
|
response.raise_for_status()
|
|
873
|
-
|
|
940
|
+
|
|
874
941
|
data = response.json()
|
|
875
|
-
|
|
942
|
+
|
|
876
943
|
if isinstance(data, dict):
|
|
877
944
|
code = data.get("code")
|
|
878
945
|
if code is not None and code != 200:
|
|
@@ -881,10 +948,10 @@ class ThordataClient:
|
|
|
881
948
|
f"Locations API error ({endpoint}): code={code}, msg={msg}"
|
|
882
949
|
)
|
|
883
950
|
return data.get("data") or []
|
|
884
|
-
|
|
951
|
+
|
|
885
952
|
if isinstance(data, list):
|
|
886
953
|
return data
|
|
887
|
-
|
|
954
|
+
|
|
888
955
|
return []
|
|
889
956
|
|
|
890
957
|
# =========================================================================
|
|
@@ -900,37 +967,29 @@ class ThordataClient:
|
|
|
900
967
|
)
|
|
901
968
|
|
|
902
969
|
def _request_with_retry(
|
|
903
|
-
self,
|
|
904
|
-
method: str,
|
|
905
|
-
url: str,
|
|
906
|
-
**kwargs: Any
|
|
970
|
+
self, method: str, url: str, **kwargs: Any
|
|
907
971
|
) -> requests.Response:
|
|
908
972
|
"""Make a request with automatic retry."""
|
|
909
973
|
kwargs.setdefault("timeout", self._default_timeout)
|
|
910
|
-
|
|
974
|
+
|
|
911
975
|
@with_retry(self._retry_config)
|
|
912
976
|
def _do_request() -> requests.Response:
|
|
913
|
-
return self.
|
|
914
|
-
|
|
977
|
+
return self._proxy_session.request(method, url, **kwargs)
|
|
978
|
+
|
|
915
979
|
try:
|
|
916
980
|
return _do_request()
|
|
917
981
|
except requests.Timeout as e:
|
|
918
|
-
raise ThordataTimeoutError(
|
|
919
|
-
f"Request timed out: {e}",
|
|
920
|
-
original_error=e
|
|
921
|
-
)
|
|
982
|
+
raise ThordataTimeoutError(f"Request timed out: {e}", original_error=e)
|
|
922
983
|
except requests.RequestException as e:
|
|
923
|
-
raise ThordataNetworkError(
|
|
924
|
-
f"Request failed: {e}",
|
|
925
|
-
original_error=e
|
|
926
|
-
)
|
|
984
|
+
raise ThordataNetworkError(f"Request failed: {e}", original_error=e)
|
|
927
985
|
|
|
928
986
|
def close(self) -> None:
|
|
929
987
|
"""Close the underlying session."""
|
|
930
|
-
self.
|
|
988
|
+
self._proxy_session.close()
|
|
989
|
+
self._api_session.close()
|
|
931
990
|
|
|
932
|
-
def __enter__(self) ->
|
|
991
|
+
def __enter__(self) -> ThordataClient:
|
|
933
992
|
return self
|
|
934
993
|
|
|
935
994
|
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
|
936
|
-
self.close()
|
|
995
|
+
self.close()
|