thordata-sdk 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
- thordata/__init__.py +1 -1
- thordata/async_client.py +55 -13
- thordata/client.py +64 -13
- thordata/enums.py +2 -2
- thordata/exceptions.py +80 -20
- thordata/models.py +1 -1
- thordata/retry.py +1 -1
- thordata/tools/__init__.py +11 -1
- thordata/tools/code.py +17 -4
- thordata/tools/ecommerce.py +194 -10
- thordata/tools/professional.py +155 -0
- thordata/tools/search.py +47 -5
- thordata/tools/social.py +225 -41
- thordata/tools/travel.py +100 -0
- thordata/tools/video.py +80 -7
- thordata/types/serp.py +6 -2
- thordata/types/task.py +75 -9
- thordata/types/universal.py +37 -5
- {thordata_sdk-1.5.0.dist-info → thordata_sdk-1.7.0.dist-info}/METADATA +63 -7
- thordata_sdk-1.7.0.dist-info/RECORD +35 -0
- {thordata_sdk-1.5.0.dist-info → thordata_sdk-1.7.0.dist-info}/WHEEL +1 -1
- thordata/_example_utils.py +0 -77
- thordata/demo.py +0 -138
- thordata_sdk-1.5.0.dist-info/RECORD +0 -35
- {thordata_sdk-1.5.0.dist-info → thordata_sdk-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.5.0.dist-info → thordata_sdk-1.7.0.dist-info}/top_level.txt +0 -0
thordata/__init__.py
CHANGED

@@ -5,7 +5,7 @@ Official Python client for Thordata's Proxy Network, SERP API,
 Universal Scraping API (Web Unlocker), and Web Scraper API.
 """
 
-__version__ = "1.5.0"
+__version__ = "1.6.0"
 __author__ = "Thordata Developer Team/Kael Odin"
 __email__ = "support@thordata.com"
 
thordata/async_client.py
CHANGED

@@ -124,10 +124,10 @@ class AsyncThordataClient:
         ).rstrip("/")
 
         self._gateway_base_url = os.getenv(
-            "THORDATA_GATEWAY_BASE_URL", "https://
+            "THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
         )
         self._child_base_url = os.getenv(
-            "THORDATA_CHILD_BASE_URL", "https://
+            "THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
         )
 
         # URL Construction
@@ -145,7 +145,7 @@ class AsyncThordataClient:
         self._proxy_users_url = f"{shared_api_base}/proxy-users"
 
         whitelist_base = os.getenv(
-            "THORDATA_WHITELIST_BASE_URL", "https://
+            "THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
         )
         self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
 
@@ -293,28 +293,36 @@ class AsyncThordataClient:
         url: str,
         *,
         js_render: bool = False,
-        output_format: str = "html",
+        output_format: str | list[str] = "html",
         country: str | None = None,
         block_resources: str | None = None,
+        clean_content: str | None = None,
         wait: int | None = None,
         wait_for: str | None = None,
+        follow_redirect: bool | None = None,
+        headers: list[dict[str, str]] | None = None,
+        cookies: list[dict[str, str]] | None = None,
         **kwargs: Any,
-    ) -> str | bytes:
+    ) -> str | bytes | dict[str, str | bytes]:
         request = UniversalScrapeRequest(
             url=url,
             js_render=js_render,
             output_format=output_format,
             country=country,
             block_resources=block_resources,
+            clean_content=clean_content,
             wait=wait,
             wait_for=wait_for,
+            follow_redirect=follow_redirect,
+            headers=headers,
+            cookies=cookies,
             extra_params=kwargs,
         )
         return await self.universal_scrape_advanced(request)
 
     async def universal_scrape_advanced(
         self, request: UniversalScrapeRequest
-    ) -> str | bytes:
+    ) -> str | bytes | dict[str, str | bytes]:
         if not self.scraper_token:
             raise ThordataConfigError("scraper_token required")
         payload = request.to_payload()
@@ -327,9 +335,17 @@ class AsyncThordataClient:
         try:
             resp_json = await response.json()
         except ValueError:
-
-
-
+            # If not JSON, return raw content based on format
+            if isinstance(request.output_format, list) or (
+                isinstance(request.output_format, str) and "," in request.output_format
+            ):
+                return {"raw": await response.read()}
+            fmt = (
+                request.output_format.lower()
+                if isinstance(request.output_format, str)
+                else str(request.output_format).lower()
+            )
+            return await response.read() if fmt == "png" else await response.text()
 
         if isinstance(resp_json, dict):
             code = resp_json.get("code")
@@ -337,6 +353,27 @@ class AsyncThordataClient:
             msg = extract_error_message(resp_json)
             raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
 
+        # Handle multiple output formats
+        if isinstance(request.output_format, list) or (
+            isinstance(request.output_format, str) and "," in request.output_format
+        ):
+            result: dict[str, str | bytes] = {}
+            formats = (
+                request.output_format
+                if isinstance(request.output_format, list)
+                else [f.strip() for f in request.output_format.split(",")]
+            )
+
+            for fmt in formats:
+                fmt_lower = fmt.lower()
+                if fmt_lower == "html" and "html" in resp_json:
+                    result["html"] = resp_json["html"]
+                elif fmt_lower == "png" and "png" in resp_json:
+                    result["png"] = decode_base64_image(resp_json["png"])
+
+            if result:
+                return result
+
         if "html" in resp_json:
             return resp_json["html"]
         if "png" in resp_json:
@@ -352,7 +389,7 @@ class AsyncThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
     ) -> str:
         config = ScraperTaskConfig(
@@ -434,7 +471,7 @@ class AsyncThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         common_settings: CommonSettings,
     ) -> str:
         config = VideoTaskConfig(
@@ -550,7 +587,7 @@ class AsyncThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
         *,
         max_wait: float = 600.0,
@@ -971,7 +1008,12 @@ class AsyncThordataClient:
         if port:
             params["port"] = str(port)
 
-
+        if product == "unlimited":
+            username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
+                "THORDATA_RESIDENTIAL_USERNAME"
+            )
+        else:
+            username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
         if username:
             params["td-customer"] = username
 
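A minimal sketch of the new multi-format behavior from the caller's side. The universal_scrape parameters and the dict return shape come from the hunks above; the import path and constructor arguments are assumptions:

    import asyncio

    from thordata import AsyncThordataClient  # import path assumed

    async def main() -> None:
        client = AsyncThordataClient(scraper_token="YOUR_TOKEN")  # args assumed
        # Passing a list (or a comma-separated string like "html,png") now
        # yields a dict keyed by format instead of a single str/bytes value.
        result = await client.universal_scrape(
            "https://example.com",
            js_render=True,
            output_format=["html", "png"],
        )
        if isinstance(result, dict):
            html = result["html"]  # str
            png = result["png"]    # bytes (base64-decoded by the client)
            print(len(html))
            with open("page.png", "wb") as f:
                f.write(png)

    asyncio.run(main())

Single-format calls keep the old str | bytes return type, so existing callers are unaffected.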
thordata/client.py
CHANGED

@@ -53,6 +53,7 @@ from .serp_engines import SerpNamespace
 # Import Types (Modernized)
 from .types import (
     CommonSettings,
+    DataFormat,
     ProxyConfig,
     ProxyProduct,
     ProxyServer,
@@ -159,10 +160,10 @@ class ThordataClient:
         ).rstrip("/")
 
         self._gateway_base_url = os.getenv(
-            "THORDATA_GATEWAY_BASE_URL", "https://
+            "THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
         )
         self._child_base_url = os.getenv(
-            "THORDATA_CHILD_BASE_URL", "https://
+            "THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
         )
 
         # URL Construction
@@ -183,7 +184,7 @@ class ThordataClient:
         self._proxy_users_url = f"{shared_api_base}/proxy-users"
 
         whitelist_base = os.getenv(
-            "THORDATA_WHITELIST_BASE_URL", "https://
+            "THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
         )
         self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
 
@@ -364,26 +365,36 @@ class ThordataClient:
         url: str,
         *,
         js_render: bool = False,
-        output_format: str = "html",
+        output_format: str | list[str] = "html",
         country: str | None = None,
         block_resources: str | None = None,
+        clean_content: str | None = None,
         wait: int | None = None,
         wait_for: str | None = None,
+        follow_redirect: bool | None = None,
+        headers: list[dict[str, str]] | None = None,
+        cookies: list[dict[str, str]] | None = None,
         **kwargs: Any,
-    ) -> str | bytes:
+    ) -> str | bytes | dict[str, str | bytes]:
         request = UniversalScrapeRequest(
             url=url,
             js_render=js_render,
             output_format=output_format,
             country=country,
             block_resources=block_resources,
+            clean_content=clean_content,
             wait=wait,
             wait_for=wait_for,
+            follow_redirect=follow_redirect,
+            headers=headers,
+            cookies=cookies,
             extra_params=kwargs,
         )
         return self.universal_scrape_advanced(request)
 
-    def universal_scrape_advanced(
+    def universal_scrape_advanced(
+        self, request: UniversalScrapeRequest
+    ) -> str | bytes | dict[str, str | bytes]:
         if not self.scraper_token:
             raise ThordataConfigError("scraper_token required")
 
@@ -405,7 +416,7 @@ class ThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
     ) -> str:
         config = ScraperTaskConfig(
@@ -490,7 +501,7 @@ class ThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         common_settings: CommonSettings,
     ) -> str:
         config = VideoTaskConfig(
@@ -639,7 +650,7 @@ class ThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
         *,
         max_wait: float = 600.0,
@@ -648,6 +659,7 @@ class ThordataClient:
         include_errors: bool = True,
         task_type: str = "web",
         common_settings: CommonSettings | None = None,
+        data_format: DataFormat | str | None = None,
     ) -> str:
         import time
 
@@ -671,6 +683,7 @@ class ThordataClient:
             parameters=parameters,
             universal_params=universal_params,
             include_errors=include_errors,
+            data_format=data_format,
         )
         task_id = self.create_scraper_task_advanced(config)
 
@@ -862,7 +875,12 @@ class ThordataClient:
         if port:
             params["port"] = str(port)
 
-
+        if product == "unlimited":
+            username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
+                "THORDATA_RESIDENTIAL_USERNAME"
+            )
+        else:
+            username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
         if username:
             params["td-customer"] = username
 
@@ -1207,12 +1225,22 @@ class ThordataClient:
     # =========================================================================
 
     def _process_universal_response(
-        self, response: requests.Response, output_format: str
-    ) -> str | bytes:
+        self, response: requests.Response, output_format: str | list[str]
+    ) -> str | bytes | dict[str, str | bytes]:
+        """Process universal scrape response. Returns single value or dict if multiple formats requested."""
         try:
             resp_json = response.json()
         except ValueError:
-
+            # If not JSON, return raw content based on format
+            if isinstance(output_format, list):
+                # Multiple formats requested but got non-JSON response
+                return {"raw": response.content}
+            fmt = (
+                output_format.lower()
+                if isinstance(output_format, str)
+                else str(output_format).lower()
+            )
+            return response.content if fmt == "png" else response.text
 
         if isinstance(resp_json, dict):
             code = resp_json.get("code")
@@ -1220,6 +1248,29 @@ class ThordataClient:
             msg = extract_error_message(resp_json)
             raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
 
+        # Handle multiple output formats
+        if isinstance(output_format, list) or (
+            isinstance(output_format, str) and "," in output_format
+        ):
+            result: dict[str, str | bytes] = {}
+            formats = (
+                output_format
+                if isinstance(output_format, list)
+                else [f.strip() for f in output_format.split(",")]
+            )
+
+            for fmt in formats:
+                fmt_lower = fmt.lower()
+                if fmt_lower == "html" and "html" in resp_json:
+                    result["html"] = resp_json["html"]
+                elif fmt_lower == "png" and "png" in resp_json:
+                    result["png"] = decode_base64_image(resp_json["png"])
+
+            # If we got results, return dict; otherwise return single value for backward compatibility
+            if result:
+                return result
+
+        # Single format (backward compatibility)
         if "html" in resp_json:
             return resp_json["html"]
         if "png" in resp_json:
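A sketch of the sync client's expanded request surface. The parameter names come from the signature above; the import path, constructor arguments, and the exact dict shapes for headers/cookies are assumptions:

    from thordata import ThordataClient  # import path assumed

    client = ThordataClient(scraper_token="YOUR_TOKEN")  # args assumed

    html = client.universal_scrape(
        "https://example.com/product",
        js_render=True,
        clean_content="true",                              # forwarded as given (str per the signature)
        follow_redirect=True,
        headers=[{"User-Agent": "Mozilla/5.0"}],           # dict shape assumed
        cookies=[{"name": "session", "value": "abc123"}],  # dict shape assumed
    )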
thordata/enums.py
CHANGED

@@ -1,6 +1,6 @@
 """
 Enumerations for the Thordata Python SDK.
-Moved to thordata.types in v1.
+Moved to thordata.types in v1.6.0.
 This file is kept for backward compatibility.
 """
 
@@ -21,7 +21,7 @@ from .types import (
     SessionType,
     TaskStatus,
     TimeRange,
-    normalize_enum_value,
+    normalize_enum_value,
 )
 
 __all__ = [
thordata/exceptions.py
CHANGED

@@ -15,6 +15,7 @@ Exception Hierarchy:
 
 from __future__ import annotations
 
+from collections.abc import Mapping
 from typing import Any
 
 # =============================================================================
@@ -235,6 +236,46 @@ class ThordataNotCollectedError(ThordataAPIError):
 # =============================================================================
 
 
+def _extract_request_id(payload: Any) -> str | None:
+    if isinstance(payload, Mapping):
+        for key in ("request_id", "requestId", "x_request_id", "x-request-id"):
+            val = payload.get(key)
+            if val is not None:
+                return str(val)
+    return None
+
+
+def _extract_retry_after(payload: Any) -> int | None:
+    if isinstance(payload, Mapping):
+        for key in ("retry_after", "retryAfter", "retry-after"):
+            val = payload.get(key)
+            if isinstance(val, int):
+                return val
+            if isinstance(val, str) and val.isdigit():
+                return int(val)
+    return None
+
+
+def _build_error_message(
+    message: str,
+    *,
+    status_code: int | None,
+    code: int | None,
+    request_id: str | None,
+) -> str:
+    parts: list[str] = [message]
+    meta: list[str] = []
+    if status_code is not None:
+        meta.append(f"http={status_code}")
+    if code is not None and code != status_code:
+        meta.append(f"code={code}")
+    if request_id:
+        meta.append(f"request_id={request_id}")
+    if meta:
+        parts.append("(" + ", ".join(meta) + ")")
+    return " ".join(parts)
+
+
 def raise_for_code(
     message: str,
     *,
@@ -266,49 +307,59 @@ def raise_for_code(
     # Determine the effective error code.
     # Prefer payload `code` when present and not success (200),
     # otherwise fall back to HTTP status when it indicates an error.
+    # Determine the effective error code for routing.
     effective_code: int | None = None
-
     if code is not None and code != 200:
         effective_code = code
-    elif status_code is not None and status_code
+    elif status_code is not None and status_code >= 400:
         effective_code = status_code
     else:
         effective_code = code if code is not None else status_code
 
+    # Extract additional context from payload
+    final_request_id = request_id or _extract_request_id(payload)
+
+    # Build a consistent, informative error message
+    final_message = _build_error_message(
+        message,
+        status_code=status_code,
+        code=code,
+        request_id=final_request_id,
+    )
+
+    # Prepare common arguments for exception constructors
     kwargs = {
         "status_code": status_code,
         "code": code,
         "payload": payload,
-        "request_id":
+        "request_id": final_request_id,
     }
 
+    # --- Route to the correct exception class ---
+
     # Not collected (API payload code 300, often retryable, not billed)
-    # Check this FIRST since 300 is in API_CODES, not HTTP_STATUS_CODES
     if effective_code in ThordataNotCollectedError.API_CODES:
-        raise ThordataNotCollectedError(
+        raise ThordataNotCollectedError(final_message, **kwargs)
 
-    # Auth errors
+    # Auth errors (401, 403)
     if effective_code in ThordataAuthError.HTTP_STATUS_CODES:
-        raise ThordataAuthError(
+        raise ThordataAuthError(final_message, **kwargs)
 
-    # Rate limit errors
+    # Rate limit errors (429, 402)
     if effective_code in ThordataRateLimitError.HTTP_STATUS_CODES:
-
-        retry_after
-        if isinstance(payload, dict):
-            retry_after = payload.get("retry_after")
-        raise ThordataRateLimitError(message, retry_after=retry_after, **kwargs)
+        retry_after = _extract_retry_after(payload)
+        raise ThordataRateLimitError(final_message, retry_after=retry_after, **kwargs)
 
-    # Server errors
+    # Server errors (5xx)
     if effective_code is not None and 500 <= effective_code < 600:
-        raise ThordataServerError(
+        raise ThordataServerError(final_message, **kwargs)
 
-    # Validation errors
+    # Validation errors (400, 422)
    if effective_code in ThordataValidationError.HTTP_STATUS_CODES:
-        raise ThordataValidationError(
+        raise ThordataValidationError(final_message, **kwargs)
 
-    #
-    raise ThordataAPIError(
+    # Fallback to generic API error if no specific match
+    raise ThordataAPIError(final_message, **kwargs)
 
 
 # =============================================================================
@@ -339,7 +390,16 @@ def is_retryable_exception(exc: Exception) -> bool:
     try:
         import requests
 
-
+        # requests exposes SSLError under requests.exceptions.SSLError (not requests.SSLError)
+        ssl_error = getattr(getattr(requests, "exceptions", None), "SSLError", None)
+        retryable: tuple[type[BaseException], ...] = (
+            requests.Timeout,
+            requests.ConnectionError,
+        )
+        if ssl_error is not None:
+            retryable = retryable + (ssl_error,)
+
+        if isinstance(exc, retryable):
             return True
     except ImportError:
         pass
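A sketch of the richer errors this refactor produces. raise_for_code and its keyword names appear in the diff; the formatted string follows _build_error_message above:

    from thordata.exceptions import ThordataNotCollectedError, raise_for_code

    try:
        raise_for_code(
            "Universal Error: target not collected",
            status_code=200,
            code=300,
            payload={"code": 300, "requestId": "abc-123"},
        )
    except ThordataNotCollectedError as exc:
        # request_id is now pulled from the payload and folded into the message:
        # "Universal Error: target not collected (http=200, code=300, request_id=abc-123)"
        print(exc)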
thordata/models.py
CHANGED
thordata/retry.py
CHANGED

@@ -186,7 +186,7 @@ def with_retry(
         if isinstance(e, ThordataRateLimitError) and e.retry_after:
             delay = max(delay, e.retry_after)
 
-        logger.
+        logger.info(
             f"Retry attempt {attempt + 1}/{config.max_retries} "
             f"after {delay:.2f}s due to: {e}"
         )
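Retry attempts are now logged at INFO, so they surface under standard logging configuration; a sketch (the logger name is assumed):

    import logging

    logging.basicConfig(level=logging.INFO)
    logging.getLogger("thordata").setLevel(logging.INFO)  # logger name assumed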
thordata/tools/__init__.py
CHANGED

@@ -5,15 +5,19 @@ High-level abstractions for specific scraping targets.
 
 from .base import ToolRequest, VideoToolRequest
 from .code import GitHub
-from .ecommerce import Amazon
+from .ecommerce import Amazon, Walmart, eBay
+from .professional import Crunchbase, Glassdoor, Indeed
 from .search import GoogleMaps, GooglePlay, GoogleShopping
 from .social import Facebook, Instagram, LinkedIn, Reddit, TikTok, Twitter
+from .travel import Airbnb, Booking, Zillow
 from .video import YouTube
 
 __all__ = [
     "ToolRequest",
     "VideoToolRequest",
     "Amazon",
+    "eBay",
+    "Walmart",
     "GoogleMaps",
     "GoogleShopping",
     "GooglePlay",
@@ -25,4 +29,10 @@ __all__ = [
     "Reddit",
     "YouTube",
     "GitHub",
+    "Indeed",
+    "Glassdoor",
+    "Crunchbase",
+    "Booking",
+    "Zillow",
+    "Airbnb",
 ]
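The new namespaces are importable alongside the existing ones; each groups dataclass-based ToolRequest definitions for one site (the GitHub classes in the next file show the pattern):

    from thordata.tools import (
        Airbnb,
        Booking,
        Crunchbase,
        Glassdoor,
        Indeed,
        Walmart,
        Zillow,
        eBay,
    )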
thordata/tools/code.py
CHANGED

@@ -14,13 +14,26 @@ class GitHub:
 
     @dataclass
     class Repository(ToolRequest):
-        """Github Repository Scraper"""
+        """Github Repository Scraper by Repo URL"""
 
         SPIDER_ID = "github_repository_by-repo-url"
         SPIDER_NAME = "github.com"
-
         repo_url: str
-
-
+
+    @dataclass
+    class RepositoryBySearchUrl(ToolRequest):
+        """Github Repository Scraper by Search URL"""
+
+        SPIDER_ID = "github_repository_by-search-url"
+        SPIDER_NAME = "github.com"
+        search_url: str
         page_turning: int | None = None
         max_num: int | None = None
+
+    @dataclass
+    class RepositoryByUrl(ToolRequest):
+        """Github Repository Scraper by URL"""
+
+        SPIDER_ID = "github_repository_by-url"
+        SPIDER_NAME = "github.com"
+        url: str