thordata-sdk 1.6.0__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {thordata_sdk-1.6.0/src/thordata_sdk.egg-info → thordata_sdk-1.7.0}/PKG-INFO +1 -1
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/pyproject.toml +2 -1
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/async_client.py +43 -6
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/client.py +52 -6
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/exceptions.py +10 -1
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/serp.py +6 -2
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/task.py +63 -9
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/universal.py +37 -5
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0/src/thordata_sdk.egg-info}/PKG-INFO +1 -1
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/LICENSE +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/README.md +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/setup.cfg +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/__init__.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/_utils.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/async_unlimited.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/core/__init__.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/core/async_http_client.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/core/http_client.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/core/tunnel.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/enums.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/models.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/retry.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/serp_engines.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/__init__.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/base.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/code.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/ecommerce.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/professional.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/search.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/social.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/travel.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/video.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/__init__.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/common.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/proxy.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/unlimited.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/SOURCES.txt +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/dependency_links.txt +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/requires.txt +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/top_level.txt +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_async_client.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_async_client_errors.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_batch_creation.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_client.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_client_errors.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_enums.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_examples.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_exceptions.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_integration_proxy_protocols.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_models.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_retry.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_spec_parity.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_task_status_and_wait.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_tools.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_tools_coverage.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_unlimited.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_user_agent.py +0 -0
- {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_utils.py +0 -0
|
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "thordata-sdk"
|
|
8
|
-
version = "1.6.0"
|
|
8
|
+
version = "1.7.0"
|
|
9
9
|
description = "The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network."
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
requires-python = ">=3.9"
|
|
@@ -115,6 +115,7 @@ check_untyped_defs = false
|
|
|
115
115
|
strict_optional = false
|
|
116
116
|
show_error_codes = true
|
|
117
117
|
ignore_missing_imports = true
|
|
118
|
+
follow_imports = "skip"
|
|
118
119
|
|
|
119
120
|
[[tool.mypy.overrides]]
|
|
120
121
|
module = ["aiohttp.*", "requests.*"]
|
|
@@ -293,28 +293,36 @@ class AsyncThordataClient:
|
|
|
293
293
|
url: str,
|
|
294
294
|
*,
|
|
295
295
|
js_render: bool = False,
|
|
296
|
-
output_format: str = "html",
|
|
296
|
+
output_format: str | list[str] = "html",
|
|
297
297
|
country: str | None = None,
|
|
298
298
|
block_resources: str | None = None,
|
|
299
|
+
clean_content: str | None = None,
|
|
299
300
|
wait: int | None = None,
|
|
300
301
|
wait_for: str | None = None,
|
|
302
|
+
follow_redirect: bool | None = None,
|
|
303
|
+
headers: list[dict[str, str]] | None = None,
|
|
304
|
+
cookies: list[dict[str, str]] | None = None,
|
|
301
305
|
**kwargs: Any,
|
|
302
|
-
) -> str | bytes:
|
|
306
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
303
307
|
request = UniversalScrapeRequest(
|
|
304
308
|
url=url,
|
|
305
309
|
js_render=js_render,
|
|
306
310
|
output_format=output_format,
|
|
307
311
|
country=country,
|
|
308
312
|
block_resources=block_resources,
|
|
313
|
+
clean_content=clean_content,
|
|
309
314
|
wait=wait,
|
|
310
315
|
wait_for=wait_for,
|
|
316
|
+
follow_redirect=follow_redirect,
|
|
317
|
+
headers=headers,
|
|
318
|
+
cookies=cookies,
|
|
311
319
|
extra_params=kwargs,
|
|
312
320
|
)
|
|
313
321
|
return await self.universal_scrape_advanced(request)
|
|
314
322
|
|
|
315
323
|
async def universal_scrape_advanced(
|
|
316
324
|
self, request: UniversalScrapeRequest
|
|
317
|
-
) -> str | bytes:
|
|
325
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
318
326
|
if not self.scraper_token:
|
|
319
327
|
raise ThordataConfigError("scraper_token required")
|
|
320
328
|
payload = request.to_payload()
|
|
@@ -327,9 +335,17 @@ class AsyncThordataClient:
|
|
|
327
335
|
try:
|
|
328
336
|
resp_json = await response.json()
|
|
329
337
|
except ValueError:
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
338
|
+
# If not JSON, return raw content based on format
|
|
339
|
+
if isinstance(request.output_format, list) or (
|
|
340
|
+
isinstance(request.output_format, str) and "," in request.output_format
|
|
341
|
+
):
|
|
342
|
+
return {"raw": await response.read()}
|
|
343
|
+
fmt = (
|
|
344
|
+
request.output_format.lower()
|
|
345
|
+
if isinstance(request.output_format, str)
|
|
346
|
+
else str(request.output_format).lower()
|
|
347
|
+
)
|
|
348
|
+
return await response.read() if fmt == "png" else await response.text()
|
|
333
349
|
|
|
334
350
|
if isinstance(resp_json, dict):
|
|
335
351
|
code = resp_json.get("code")
|
|
@@ -337,6 +353,27 @@ class AsyncThordataClient:
|
|
|
337
353
|
msg = extract_error_message(resp_json)
|
|
338
354
|
raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
|
|
339
355
|
|
|
356
|
+
# Handle multiple output formats
|
|
357
|
+
if isinstance(request.output_format, list) or (
|
|
358
|
+
isinstance(request.output_format, str) and "," in request.output_format
|
|
359
|
+
):
|
|
360
|
+
result: dict[str, str | bytes] = {}
|
|
361
|
+
formats = (
|
|
362
|
+
request.output_format
|
|
363
|
+
if isinstance(request.output_format, list)
|
|
364
|
+
else [f.strip() for f in request.output_format.split(",")]
|
|
365
|
+
)
|
|
366
|
+
|
|
367
|
+
for fmt in formats:
|
|
368
|
+
fmt_lower = fmt.lower()
|
|
369
|
+
if fmt_lower == "html" and "html" in resp_json:
|
|
370
|
+
result["html"] = resp_json["html"]
|
|
371
|
+
elif fmt_lower == "png" and "png" in resp_json:
|
|
372
|
+
result["png"] = decode_base64_image(resp_json["png"])
|
|
373
|
+
|
|
374
|
+
if result:
|
|
375
|
+
return result
|
|
376
|
+
|
|
340
377
|
if "html" in resp_json:
|
|
341
378
|
return resp_json["html"]
|
|
342
379
|
if "png" in resp_json:
|
|
@@ -53,6 +53,7 @@ from .serp_engines import SerpNamespace
|
|
|
53
53
|
# Import Types (Modernized)
|
|
54
54
|
from .types import (
|
|
55
55
|
CommonSettings,
|
|
56
|
+
DataFormat,
|
|
56
57
|
ProxyConfig,
|
|
57
58
|
ProxyProduct,
|
|
58
59
|
ProxyServer,
|
|
@@ -364,26 +365,36 @@ class ThordataClient:
|
|
|
364
365
|
url: str,
|
|
365
366
|
*,
|
|
366
367
|
js_render: bool = False,
|
|
367
|
-
output_format: str = "html",
|
|
368
|
+
output_format: str | list[str] = "html",
|
|
368
369
|
country: str | None = None,
|
|
369
370
|
block_resources: str | None = None,
|
|
371
|
+
clean_content: str | None = None,
|
|
370
372
|
wait: int | None = None,
|
|
371
373
|
wait_for: str | None = None,
|
|
374
|
+
follow_redirect: bool | None = None,
|
|
375
|
+
headers: list[dict[str, str]] | None = None,
|
|
376
|
+
cookies: list[dict[str, str]] | None = None,
|
|
372
377
|
**kwargs: Any,
|
|
373
|
-
) -> str | bytes:
|
|
378
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
374
379
|
request = UniversalScrapeRequest(
|
|
375
380
|
url=url,
|
|
376
381
|
js_render=js_render,
|
|
377
382
|
output_format=output_format,
|
|
378
383
|
country=country,
|
|
379
384
|
block_resources=block_resources,
|
|
385
|
+
clean_content=clean_content,
|
|
380
386
|
wait=wait,
|
|
381
387
|
wait_for=wait_for,
|
|
388
|
+
follow_redirect=follow_redirect,
|
|
389
|
+
headers=headers,
|
|
390
|
+
cookies=cookies,
|
|
382
391
|
extra_params=kwargs,
|
|
383
392
|
)
|
|
384
393
|
return self.universal_scrape_advanced(request)
|
|
385
394
|
|
|
386
|
-
def universal_scrape_advanced(
|
|
395
|
+
def universal_scrape_advanced(
|
|
396
|
+
self, request: UniversalScrapeRequest
|
|
397
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
387
398
|
if not self.scraper_token:
|
|
388
399
|
raise ThordataConfigError("scraper_token required")
|
|
389
400
|
|
|
@@ -648,6 +659,7 @@ class ThordataClient:
|
|
|
648
659
|
include_errors: bool = True,
|
|
649
660
|
task_type: str = "web",
|
|
650
661
|
common_settings: CommonSettings | None = None,
|
|
662
|
+
data_format: DataFormat | str | None = None,
|
|
651
663
|
) -> str:
|
|
652
664
|
import time
|
|
653
665
|
|
|
@@ -671,6 +683,7 @@ class ThordataClient:
|
|
|
671
683
|
parameters=parameters,
|
|
672
684
|
universal_params=universal_params,
|
|
673
685
|
include_errors=include_errors,
|
|
686
|
+
data_format=data_format,
|
|
674
687
|
)
|
|
675
688
|
task_id = self.create_scraper_task_advanced(config)
|
|
676
689
|
|
|
@@ -1212,12 +1225,22 @@ class ThordataClient:
|
|
|
1212
1225
|
# =========================================================================
|
|
1213
1226
|
|
|
1214
1227
|
def _process_universal_response(
|
|
1215
|
-
self, response: requests.Response, output_format: str
|
|
1216
|
-
) -> str | bytes:
|
|
1228
|
+
self, response: requests.Response, output_format: str | list[str]
|
|
1229
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
1230
|
+
"""Process universal scrape response. Returns single value or dict if multiple formats requested."""
|
|
1217
1231
|
try:
|
|
1218
1232
|
resp_json = response.json()
|
|
1219
1233
|
except ValueError:
|
|
1220
|
-
|
|
1234
|
+
# If not JSON, return raw content based on format
|
|
1235
|
+
if isinstance(output_format, list):
|
|
1236
|
+
# Multiple formats requested but got non-JSON response
|
|
1237
|
+
return {"raw": response.content}
|
|
1238
|
+
fmt = (
|
|
1239
|
+
output_format.lower()
|
|
1240
|
+
if isinstance(output_format, str)
|
|
1241
|
+
else str(output_format).lower()
|
|
1242
|
+
)
|
|
1243
|
+
return response.content if fmt == "png" else response.text
|
|
1221
1244
|
|
|
1222
1245
|
if isinstance(resp_json, dict):
|
|
1223
1246
|
code = resp_json.get("code")
|
|
@@ -1225,6 +1248,29 @@ class ThordataClient:
|
|
|
1225
1248
|
msg = extract_error_message(resp_json)
|
|
1226
1249
|
raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
|
|
1227
1250
|
|
|
1251
|
+
# Handle multiple output formats
|
|
1252
|
+
if isinstance(output_format, list) or (
|
|
1253
|
+
isinstance(output_format, str) and "," in output_format
|
|
1254
|
+
):
|
|
1255
|
+
result: dict[str, str | bytes] = {}
|
|
1256
|
+
formats = (
|
|
1257
|
+
output_format
|
|
1258
|
+
if isinstance(output_format, list)
|
|
1259
|
+
else [f.strip() for f in output_format.split(",")]
|
|
1260
|
+
)
|
|
1261
|
+
|
|
1262
|
+
for fmt in formats:
|
|
1263
|
+
fmt_lower = fmt.lower()
|
|
1264
|
+
if fmt_lower == "html" and "html" in resp_json:
|
|
1265
|
+
result["html"] = resp_json["html"]
|
|
1266
|
+
elif fmt_lower == "png" and "png" in resp_json:
|
|
1267
|
+
result["png"] = decode_base64_image(resp_json["png"])
|
|
1268
|
+
|
|
1269
|
+
# If we got results, return dict; otherwise return single value for backward compatibility
|
|
1270
|
+
if result:
|
|
1271
|
+
return result
|
|
1272
|
+
|
|
1273
|
+
# Single format (backward compatibility)
|
|
1228
1274
|
if "html" in resp_json:
|
|
1229
1275
|
return resp_json["html"]
|
|
1230
1276
|
if "png" in resp_json:
|
|
@@ -390,7 +390,16 @@ def is_retryable_exception(exc: Exception) -> bool:
|
|
|
390
390
|
try:
|
|
391
391
|
import requests
|
|
392
392
|
|
|
393
|
-
|
|
393
|
+
# requests exposes SSLError under requests.exceptions.SSLError (not requests.SSLError)
|
|
394
|
+
ssl_error = getattr(getattr(requests, "exceptions", None), "SSLError", None)
|
|
395
|
+
retryable: tuple[type[BaseException], ...] = (
|
|
396
|
+
requests.Timeout,
|
|
397
|
+
requests.ConnectionError,
|
|
398
|
+
)
|
|
399
|
+
if ssl_error is not None:
|
|
400
|
+
retryable = retryable + (ssl_error,)
|
|
401
|
+
|
|
402
|
+
if isinstance(exc, retryable):
|
|
394
403
|
return True
|
|
395
404
|
except ImportError:
|
|
396
405
|
pass
|
|
@@ -117,7 +117,7 @@ class SerpRequest(ThordataBaseConfig):
|
|
|
117
117
|
render_js: bool | None = None
|
|
118
118
|
no_cache: bool | None = None
|
|
119
119
|
|
|
120
|
-
# Output
|
|
120
|
+
# Output format: "json" (json=1), "html" (json=3), "light_json" (json=4), or "both" (json=2)
|
|
121
121
|
output_format: str = "json"
|
|
122
122
|
|
|
123
123
|
# Advanced Google
|
|
@@ -155,13 +155,17 @@ class SerpRequest(ThordataBaseConfig):
|
|
|
155
155
|
}
|
|
156
156
|
|
|
157
157
|
# JSON output handling
|
|
158
|
+
# Dashboard mapping: json=1 (json), json=3 (html), json=4 (light json), json=2 (both)
|
|
158
159
|
fmt = self.output_format.lower()
|
|
159
160
|
if fmt == "json":
|
|
160
161
|
payload["json"] = "1"
|
|
161
162
|
elif fmt == "html":
|
|
162
|
-
|
|
163
|
+
payload["json"] = "3"
|
|
164
|
+
elif fmt in ("light_json", "light-json", "lightjson"):
|
|
165
|
+
payload["json"] = "4"
|
|
163
166
|
elif fmt in ("2", "both", "json+html"):
|
|
164
167
|
payload["json"] = "2"
|
|
168
|
+
# If no json param is set, default to HTML (legacy behavior)
|
|
165
169
|
|
|
166
170
|
# Query param handling
|
|
167
171
|
if engine == "yandex":
|
|
@@ -8,6 +8,7 @@ import json
|
|
|
8
8
|
from dataclasses import dataclass
|
|
9
9
|
from enum import Enum
|
|
10
10
|
from typing import Any
|
|
11
|
+
from urllib.parse import unquote
|
|
11
12
|
|
|
12
13
|
from .common import CommonSettings, ThordataBaseConfig
|
|
13
14
|
|
|
@@ -49,6 +50,52 @@ class DataFormat(str, Enum):
|
|
|
49
50
|
XLSX = "xlsx"
|
|
50
51
|
|
|
51
52
|
|
|
53
|
+
def _normalize_url_value(value: Any) -> Any:
|
|
54
|
+
if not isinstance(value, str):
|
|
55
|
+
return value
|
|
56
|
+
# Decode all percent-encoded characters to match Dashboard format
|
|
57
|
+
# Dashboard expects URLs in their raw/decoded form, not URL-encoded
|
|
58
|
+
# This ensures API/SDK submissions match manual Dashboard input exactly
|
|
59
|
+
try:
|
|
60
|
+
# Check if URL contains any percent-encoded characters
|
|
61
|
+
if "%" in value:
|
|
62
|
+
# Fully decode the URL to match Dashboard format
|
|
63
|
+
decoded = unquote(value)
|
|
64
|
+
# If decoding changed the value, use decoded version
|
|
65
|
+
# This handles cases like %26 -> &, %3A -> :, %2F -> /, etc.
|
|
66
|
+
if decoded != value:
|
|
67
|
+
return decoded
|
|
68
|
+
except Exception:
|
|
69
|
+
# If decoding fails, return original value
|
|
70
|
+
pass
|
|
71
|
+
return value
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _normalize_parameters(params: dict[str, Any]) -> dict[str, Any]:
|
|
75
|
+
# All parameter keys that contain URLs and should be normalized
|
|
76
|
+
# This ensures API/SDK submissions match Dashboard format exactly
|
|
77
|
+
url_keys = {
|
|
78
|
+
"url",
|
|
79
|
+
"domain",
|
|
80
|
+
"profileurl",
|
|
81
|
+
"posturl",
|
|
82
|
+
"seller_url",
|
|
83
|
+
# Additional URL-related keys that may be used
|
|
84
|
+
"link",
|
|
85
|
+
"href",
|
|
86
|
+
"page_url",
|
|
87
|
+
"product_url",
|
|
88
|
+
"category_url",
|
|
89
|
+
}
|
|
90
|
+
out: dict[str, Any] = {}
|
|
91
|
+
for k, v in params.items():
|
|
92
|
+
if k in url_keys:
|
|
93
|
+
out[k] = _normalize_url_value(v)
|
|
94
|
+
else:
|
|
95
|
+
out[k] = v
|
|
96
|
+
return out
|
|
97
|
+
|
|
98
|
+
|
|
52
99
|
@dataclass
|
|
53
100
|
class ScraperTaskConfig(ThordataBaseConfig):
|
|
54
101
|
file_name: str
|
|
@@ -57,13 +104,18 @@ class ScraperTaskConfig(ThordataBaseConfig):
|
|
|
57
104
|
parameters: dict[str, Any] | list[dict[str, Any]]
|
|
58
105
|
universal_params: dict[str, Any] | None = None
|
|
59
106
|
include_errors: bool = True
|
|
107
|
+
data_format: DataFormat | str | None = (
|
|
108
|
+
None # Support json, csv, xlsx output formats
|
|
109
|
+
)
|
|
60
110
|
|
|
61
111
|
def to_payload(self) -> dict[str, Any]:
|
|
62
|
-
#
|
|
112
|
+
# Normalize parameters: decode percent-encoded URLs to reduce API/Dashboard divergence
|
|
63
113
|
if isinstance(self.parameters, list):
|
|
64
|
-
|
|
114
|
+
normalized_list = [_normalize_parameters(p) for p in self.parameters]
|
|
115
|
+
params_json = json.dumps(normalized_list)
|
|
65
116
|
else:
|
|
66
|
-
|
|
117
|
+
normalized_one = _normalize_parameters(self.parameters)
|
|
118
|
+
params_json = json.dumps([normalized_one])
|
|
67
119
|
|
|
68
120
|
payload: dict[str, Any] = {
|
|
69
121
|
"file_name": self.file_name,
|
|
@@ -74,6 +126,14 @@ class ScraperTaskConfig(ThordataBaseConfig):
|
|
|
74
126
|
}
|
|
75
127
|
if self.universal_params:
|
|
76
128
|
payload["spider_universal"] = json.dumps(self.universal_params)
|
|
129
|
+
# Add data_format if specified (for json/csv/xlsx output)
|
|
130
|
+
if self.data_format:
|
|
131
|
+
fmt = (
|
|
132
|
+
self.data_format.value
|
|
133
|
+
if isinstance(self.data_format, DataFormat)
|
|
134
|
+
else str(self.data_format).lower()
|
|
135
|
+
)
|
|
136
|
+
payload["data_format"] = fmt
|
|
77
137
|
return payload
|
|
78
138
|
|
|
79
139
|
|
|
@@ -87,7 +147,6 @@ class VideoTaskConfig(ThordataBaseConfig):
|
|
|
87
147
|
include_errors: bool = True
|
|
88
148
|
|
|
89
149
|
def to_payload(self) -> dict[str, Any]:
|
|
90
|
-
# Handle batch parameters
|
|
91
150
|
if isinstance(self.parameters, list):
|
|
92
151
|
params_json = json.dumps(self.parameters)
|
|
93
152
|
else:
|
|
@@ -99,13 +158,8 @@ class VideoTaskConfig(ThordataBaseConfig):
|
|
|
99
158
|
"spider_name": self.spider_name,
|
|
100
159
|
"spider_parameters": params_json,
|
|
101
160
|
"spider_errors": "true" if self.include_errors else "false",
|
|
102
|
-
# v2.0 Doc explicitly requires 'spider_universal' key for video tasks too sometimes,
|
|
103
|
-
# but usually it's passed as 'common_settings' or 'spider_universal'.
|
|
104
|
-
# Sticking to original models.py key logic for now to ensure stability.
|
|
105
161
|
"spider_universal": self.common_settings.to_json(),
|
|
106
162
|
}
|
|
107
|
-
# Note: If API expects 'common_settings' key specifically, adjust here.
|
|
108
|
-
# Based on v2 context, video builder often uses spider_universal.
|
|
109
163
|
return payload
|
|
110
164
|
|
|
111
165
|
|
|
@@ -15,12 +15,15 @@ from .common import ThordataBaseConfig
|
|
|
15
15
|
class UniversalScrapeRequest(ThordataBaseConfig):
|
|
16
16
|
url: str
|
|
17
17
|
js_render: bool = False
|
|
18
|
-
output_format: str
|
|
18
|
+
output_format: str | list[str] = (
|
|
19
|
+
"html" # 'html', 'png', or ['png', 'html'] for both
|
|
20
|
+
)
|
|
19
21
|
country: str | None = None
|
|
20
|
-
block_resources: str | None = None # 'script,image'
|
|
22
|
+
block_resources: str | None = None # 'script,image,video'
|
|
21
23
|
clean_content: str | None = None # 'js,css'
|
|
22
24
|
wait: int | None = None # ms
|
|
23
25
|
wait_for: str | None = None # selector
|
|
26
|
+
follow_redirect: bool | None = None # Follow redirects
|
|
24
27
|
|
|
25
28
|
# Headers/Cookies must be serialized to JSON in payload
|
|
26
29
|
headers: list[dict[str, str]] | None = None
|
|
@@ -29,12 +32,26 @@ class UniversalScrapeRequest(ThordataBaseConfig):
|
|
|
29
32
|
extra_params: dict[str, Any] = field(default_factory=dict)
|
|
30
33
|
|
|
31
34
|
def __post_init__(self) -> None:
|
|
35
|
+
# Normalize output_format to list for easier handling
|
|
36
|
+
if isinstance(self.output_format, str):
|
|
37
|
+
formats = [f.strip().lower() for f in self.output_format.split(",")]
|
|
38
|
+
else:
|
|
39
|
+
formats = [
|
|
40
|
+
f.lower() if isinstance(f, str) else str(f).lower()
|
|
41
|
+
for f in self.output_format
|
|
42
|
+
]
|
|
43
|
+
|
|
32
44
|
valid_formats = {"html", "png"}
|
|
33
|
-
if
|
|
45
|
+
invalid = [f for f in formats if f not in valid_formats]
|
|
46
|
+
if invalid:
|
|
34
47
|
raise ValueError(
|
|
35
|
-
f"Invalid output_format: {
|
|
48
|
+
f"Invalid output_format: {invalid}. Must be one or more of: {valid_formats}. "
|
|
49
|
+
f"Use comma-separated string like 'png,html' or list ['png', 'html'] for multiple formats."
|
|
36
50
|
)
|
|
37
51
|
|
|
52
|
+
# Store as list for to_payload
|
|
53
|
+
self._output_formats = formats
|
|
54
|
+
|
|
38
55
|
if self.wait is not None and (self.wait < 0 or self.wait > 100000):
|
|
39
56
|
raise ValueError("wait must be between 0 and 100000 milliseconds")
|
|
40
57
|
|
|
@@ -42,9 +59,22 @@ class UniversalScrapeRequest(ThordataBaseConfig):
|
|
|
42
59
|
payload: dict[str, Any] = {
|
|
43
60
|
"url": self.url,
|
|
44
61
|
"js_render": "True" if self.js_render else "False",
|
|
45
|
-
"type": self.output_format.lower(),
|
|
46
62
|
}
|
|
47
63
|
|
|
64
|
+
# Handle output format: support single or multiple formats (e.g., "png,html")
|
|
65
|
+
if hasattr(self, "_output_formats") and self._output_formats:
|
|
66
|
+
if len(self._output_formats) == 1:
|
|
67
|
+
payload["type"] = self._output_formats[0]
|
|
68
|
+
else:
|
|
69
|
+
# Multiple formats: join with comma (e.g., "png,html")
|
|
70
|
+
payload["type"] = ",".join(self._output_formats)
|
|
71
|
+
else:
|
|
72
|
+
# Fallback for backward compatibility
|
|
73
|
+
if isinstance(self.output_format, str):
|
|
74
|
+
payload["type"] = self.output_format.lower()
|
|
75
|
+
else:
|
|
76
|
+
payload["type"] = ",".join([str(f).lower() for f in self.output_format])
|
|
77
|
+
|
|
48
78
|
if self.country:
|
|
49
79
|
payload["country"] = self.country.lower()
|
|
50
80
|
if self.block_resources:
|
|
@@ -55,6 +85,8 @@ class UniversalScrapeRequest(ThordataBaseConfig):
|
|
|
55
85
|
payload["wait"] = str(self.wait)
|
|
56
86
|
if self.wait_for:
|
|
57
87
|
payload["wait_for"] = self.wait_for
|
|
88
|
+
if self.follow_redirect is not None:
|
|
89
|
+
payload["follow_redirect"] = "True" if self.follow_redirect else "False"
|
|
58
90
|
|
|
59
91
|
# Serialize complex objects as JSON strings
|
|
60
92
|
if self.headers:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|