thordata-sdk 1.6.0__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {thordata_sdk-1.6.0/src/thordata_sdk.egg-info → thordata_sdk-1.7.0}/PKG-INFO +1 -1
  2. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/pyproject.toml +2 -1
  3. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/async_client.py +43 -6
  4. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/client.py +52 -6
  5. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/exceptions.py +10 -1
  6. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/serp.py +6 -2
  7. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/task.py +63 -9
  8. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/universal.py +37 -5
  9. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0/src/thordata_sdk.egg-info}/PKG-INFO +1 -1
  10. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/LICENSE +0 -0
  11. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/README.md +0 -0
  12. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/setup.cfg +0 -0
  13. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/__init__.py +0 -0
  14. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/_utils.py +0 -0
  15. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/async_unlimited.py +0 -0
  16. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/core/__init__.py +0 -0
  17. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/core/async_http_client.py +0 -0
  18. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/core/http_client.py +0 -0
  19. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/core/tunnel.py +0 -0
  20. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/enums.py +0 -0
  21. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/models.py +0 -0
  22. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/retry.py +0 -0
  23. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/serp_engines.py +0 -0
  24. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/__init__.py +0 -0
  25. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/base.py +0 -0
  26. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/code.py +0 -0
  27. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/ecommerce.py +0 -0
  28. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/professional.py +0 -0
  29. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/search.py +0 -0
  30. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/social.py +0 -0
  31. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/travel.py +0 -0
  32. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/tools/video.py +0 -0
  33. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/__init__.py +0 -0
  34. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/common.py +0 -0
  35. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/types/proxy.py +0 -0
  36. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata/unlimited.py +0 -0
  37. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/SOURCES.txt +0 -0
  38. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/dependency_links.txt +0 -0
  39. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/requires.txt +0 -0
  40. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/top_level.txt +0 -0
  41. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_async_client.py +0 -0
  42. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_async_client_errors.py +0 -0
  43. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_batch_creation.py +0 -0
  44. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_client.py +0 -0
  45. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_client_errors.py +0 -0
  46. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_enums.py +0 -0
  47. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_examples.py +0 -0
  48. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_exceptions.py +0 -0
  49. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_integration_proxy_protocols.py +0 -0
  50. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_models.py +0 -0
  51. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_retry.py +0 -0
  52. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_spec_parity.py +0 -0
  53. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_task_status_and_wait.py +0 -0
  54. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_tools.py +0 -0
  55. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_tools_coverage.py +0 -0
  56. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_unlimited.py +0 -0
  57. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_user_agent.py +0 -0
  58. {thordata_sdk-1.6.0 → thordata_sdk-1.7.0}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: thordata-sdk
3
- Version: 1.6.0
3
+ Version: 1.7.0
4
4
  Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
5
5
  Author-email: Thordata Developer Team <support@thordata.com>
6
6
  License: MIT
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
5
5
 
6
6
  [project]
7
7
  name = "thordata-sdk"
8
- version = "1.6.0"
8
+ version = "1.7.0"
9
9
  description = "The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network."
10
10
  readme = "README.md"
11
11
  requires-python = ">=3.9"
@@ -115,6 +115,7 @@ check_untyped_defs = false
115
115
  strict_optional = false
116
116
  show_error_codes = true
117
117
  ignore_missing_imports = true
118
+ follow_imports = "skip"
118
119
 
119
120
  [[tool.mypy.overrides]]
120
121
  module = ["aiohttp.*", "requests.*"]
@@ -293,28 +293,36 @@ class AsyncThordataClient:
293
293
  url: str,
294
294
  *,
295
295
  js_render: bool = False,
296
- output_format: str = "html",
296
+ output_format: str | list[str] = "html",
297
297
  country: str | None = None,
298
298
  block_resources: str | None = None,
299
+ clean_content: str | None = None,
299
300
  wait: int | None = None,
300
301
  wait_for: str | None = None,
302
+ follow_redirect: bool | None = None,
303
+ headers: list[dict[str, str]] | None = None,
304
+ cookies: list[dict[str, str]] | None = None,
301
305
  **kwargs: Any,
302
- ) -> str | bytes:
306
+ ) -> str | bytes | dict[str, str | bytes]:
303
307
  request = UniversalScrapeRequest(
304
308
  url=url,
305
309
  js_render=js_render,
306
310
  output_format=output_format,
307
311
  country=country,
308
312
  block_resources=block_resources,
313
+ clean_content=clean_content,
309
314
  wait=wait,
310
315
  wait_for=wait_for,
316
+ follow_redirect=follow_redirect,
317
+ headers=headers,
318
+ cookies=cookies,
311
319
  extra_params=kwargs,
312
320
  )
313
321
  return await self.universal_scrape_advanced(request)
314
322
 
315
323
  async def universal_scrape_advanced(
316
324
  self, request: UniversalScrapeRequest
317
- ) -> str | bytes:
325
+ ) -> str | bytes | dict[str, str | bytes]:
318
326
  if not self.scraper_token:
319
327
  raise ThordataConfigError("scraper_token required")
320
328
  payload = request.to_payload()
@@ -327,9 +335,17 @@ class AsyncThordataClient:
327
335
  try:
328
336
  resp_json = await response.json()
329
337
  except ValueError:
330
- if request.output_format.lower() == "png":
331
- return await response.read()
332
- return await response.text()
338
+ # If not JSON, return raw content based on format
339
+ if isinstance(request.output_format, list) or (
340
+ isinstance(request.output_format, str) and "," in request.output_format
341
+ ):
342
+ return {"raw": await response.read()}
343
+ fmt = (
344
+ request.output_format.lower()
345
+ if isinstance(request.output_format, str)
346
+ else str(request.output_format).lower()
347
+ )
348
+ return await response.read() if fmt == "png" else await response.text()
333
349
 
334
350
  if isinstance(resp_json, dict):
335
351
  code = resp_json.get("code")
@@ -337,6 +353,27 @@ class AsyncThordataClient:
337
353
  msg = extract_error_message(resp_json)
338
354
  raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
339
355
 
356
+ # Handle multiple output formats
357
+ if isinstance(request.output_format, list) or (
358
+ isinstance(request.output_format, str) and "," in request.output_format
359
+ ):
360
+ result: dict[str, str | bytes] = {}
361
+ formats = (
362
+ request.output_format
363
+ if isinstance(request.output_format, list)
364
+ else [f.strip() for f in request.output_format.split(",")]
365
+ )
366
+
367
+ for fmt in formats:
368
+ fmt_lower = fmt.lower()
369
+ if fmt_lower == "html" and "html" in resp_json:
370
+ result["html"] = resp_json["html"]
371
+ elif fmt_lower == "png" and "png" in resp_json:
372
+ result["png"] = decode_base64_image(resp_json["png"])
373
+
374
+ if result:
375
+ return result
376
+
340
377
  if "html" in resp_json:
341
378
  return resp_json["html"]
342
379
  if "png" in resp_json:
@@ -53,6 +53,7 @@ from .serp_engines import SerpNamespace
53
53
  # Import Types (Modernized)
54
54
  from .types import (
55
55
  CommonSettings,
56
+ DataFormat,
56
57
  ProxyConfig,
57
58
  ProxyProduct,
58
59
  ProxyServer,
@@ -364,26 +365,36 @@ class ThordataClient:
364
365
  url: str,
365
366
  *,
366
367
  js_render: bool = False,
367
- output_format: str = "html",
368
+ output_format: str | list[str] = "html",
368
369
  country: str | None = None,
369
370
  block_resources: str | None = None,
371
+ clean_content: str | None = None,
370
372
  wait: int | None = None,
371
373
  wait_for: str | None = None,
374
+ follow_redirect: bool | None = None,
375
+ headers: list[dict[str, str]] | None = None,
376
+ cookies: list[dict[str, str]] | None = None,
372
377
  **kwargs: Any,
373
- ) -> str | bytes:
378
+ ) -> str | bytes | dict[str, str | bytes]:
374
379
  request = UniversalScrapeRequest(
375
380
  url=url,
376
381
  js_render=js_render,
377
382
  output_format=output_format,
378
383
  country=country,
379
384
  block_resources=block_resources,
385
+ clean_content=clean_content,
380
386
  wait=wait,
381
387
  wait_for=wait_for,
388
+ follow_redirect=follow_redirect,
389
+ headers=headers,
390
+ cookies=cookies,
382
391
  extra_params=kwargs,
383
392
  )
384
393
  return self.universal_scrape_advanced(request)
385
394
 
386
- def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
395
+ def universal_scrape_advanced(
396
+ self, request: UniversalScrapeRequest
397
+ ) -> str | bytes | dict[str, str | bytes]:
387
398
  if not self.scraper_token:
388
399
  raise ThordataConfigError("scraper_token required")
389
400
 
@@ -648,6 +659,7 @@ class ThordataClient:
648
659
  include_errors: bool = True,
649
660
  task_type: str = "web",
650
661
  common_settings: CommonSettings | None = None,
662
+ data_format: DataFormat | str | None = None,
651
663
  ) -> str:
652
664
  import time
653
665
 
@@ -671,6 +683,7 @@ class ThordataClient:
671
683
  parameters=parameters,
672
684
  universal_params=universal_params,
673
685
  include_errors=include_errors,
686
+ data_format=data_format,
674
687
  )
675
688
  task_id = self.create_scraper_task_advanced(config)
676
689
 
@@ -1212,12 +1225,22 @@ class ThordataClient:
1212
1225
  # =========================================================================
1213
1226
 
1214
1227
  def _process_universal_response(
1215
- self, response: requests.Response, output_format: str
1216
- ) -> str | bytes:
1228
+ self, response: requests.Response, output_format: str | list[str]
1229
+ ) -> str | bytes | dict[str, str | bytes]:
1230
+ """Process universal scrape response. Returns single value or dict if multiple formats requested."""
1217
1231
  try:
1218
1232
  resp_json = response.json()
1219
1233
  except ValueError:
1220
- return response.content if output_format.lower() == "png" else response.text
1234
+ # If not JSON, return raw content based on format
1235
+ if isinstance(output_format, list):
1236
+ # Multiple formats requested but got non-JSON response
1237
+ return {"raw": response.content}
1238
+ fmt = (
1239
+ output_format.lower()
1240
+ if isinstance(output_format, str)
1241
+ else str(output_format).lower()
1242
+ )
1243
+ return response.content if fmt == "png" else response.text
1221
1244
 
1222
1245
  if isinstance(resp_json, dict):
1223
1246
  code = resp_json.get("code")
@@ -1225,6 +1248,29 @@ class ThordataClient:
1225
1248
  msg = extract_error_message(resp_json)
1226
1249
  raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
1227
1250
 
1251
+ # Handle multiple output formats
1252
+ if isinstance(output_format, list) or (
1253
+ isinstance(output_format, str) and "," in output_format
1254
+ ):
1255
+ result: dict[str, str | bytes] = {}
1256
+ formats = (
1257
+ output_format
1258
+ if isinstance(output_format, list)
1259
+ else [f.strip() for f in output_format.split(",")]
1260
+ )
1261
+
1262
+ for fmt in formats:
1263
+ fmt_lower = fmt.lower()
1264
+ if fmt_lower == "html" and "html" in resp_json:
1265
+ result["html"] = resp_json["html"]
1266
+ elif fmt_lower == "png" and "png" in resp_json:
1267
+ result["png"] = decode_base64_image(resp_json["png"])
1268
+
1269
+ # If we got results, return dict; otherwise return single value for backward compatibility
1270
+ if result:
1271
+ return result
1272
+
1273
+ # Single format (backward compatibility)
1228
1274
  if "html" in resp_json:
1229
1275
  return resp_json["html"]
1230
1276
  if "png" in resp_json:
@@ -390,7 +390,16 @@ def is_retryable_exception(exc: Exception) -> bool:
390
390
  try:
391
391
  import requests
392
392
 
393
- if isinstance(exc, (requests.Timeout, requests.ConnectionError)):
393
+ # requests exposes SSLError under requests.exceptions.SSLError (not requests.SSLError)
394
+ ssl_error = getattr(getattr(requests, "exceptions", None), "SSLError", None)
395
+ retryable: tuple[type[BaseException], ...] = (
396
+ requests.Timeout,
397
+ requests.ConnectionError,
398
+ )
399
+ if ssl_error is not None:
400
+ retryable = retryable + (ssl_error,)
401
+
402
+ if isinstance(exc, retryable):
394
403
  return True
395
404
  except ImportError:
396
405
  pass
@@ -117,7 +117,7 @@ class SerpRequest(ThordataBaseConfig):
117
117
  render_js: bool | None = None
118
118
  no_cache: bool | None = None
119
119
 
120
- # Output
120
+ # Output format: "json" (json=1), "html" (json=3), "light_json" (json=4), or "both" (json=2)
121
121
  output_format: str = "json"
122
122
 
123
123
  # Advanced Google
@@ -155,13 +155,17 @@ class SerpRequest(ThordataBaseConfig):
155
155
  }
156
156
 
157
157
  # JSON output handling
158
+ # Dashboard mapping: json=1 (json), json=3 (html), json=4 (light json), json=2 (both)
158
159
  fmt = self.output_format.lower()
159
160
  if fmt == "json":
160
161
  payload["json"] = "1"
161
162
  elif fmt == "html":
162
- pass # No json param means HTML
163
+ payload["json"] = "3"
164
+ elif fmt in ("light_json", "light-json", "lightjson"):
165
+ payload["json"] = "4"
163
166
  elif fmt in ("2", "both", "json+html"):
164
167
  payload["json"] = "2"
168
+ # If no json param is set, default to HTML (legacy behavior)
165
169
 
166
170
  # Query param handling
167
171
  if engine == "yandex":
@@ -8,6 +8,7 @@ import json
8
8
  from dataclasses import dataclass
9
9
  from enum import Enum
10
10
  from typing import Any
11
+ from urllib.parse import unquote
11
12
 
12
13
  from .common import CommonSettings, ThordataBaseConfig
13
14
 
@@ -49,6 +50,52 @@ class DataFormat(str, Enum):
49
50
  XLSX = "xlsx"
50
51
 
51
52
 
53
+ def _normalize_url_value(value: Any) -> Any:
54
+ if not isinstance(value, str):
55
+ return value
56
+ # Decode all percent-encoded characters to match Dashboard format
57
+ # Dashboard expects URLs in their raw/decoded form, not URL-encoded
58
+ # This ensures API/SDK submissions match manual Dashboard input exactly
59
+ try:
60
+ # Check if URL contains any percent-encoded characters
61
+ if "%" in value:
62
+ # Fully decode the URL to match Dashboard format
63
+ decoded = unquote(value)
64
+ # If decoding changed the value, use decoded version
65
+ # This handles cases like %26 -> &, %3A -> :, %2F -> /, etc.
66
+ if decoded != value:
67
+ return decoded
68
+ except Exception:
69
+ # If decoding fails, return original value
70
+ pass
71
+ return value
72
+
73
+
74
+ def _normalize_parameters(params: dict[str, Any]) -> dict[str, Any]:
75
+ # All parameter keys that contain URLs and should be normalized
76
+ # This ensures API/SDK submissions match Dashboard format exactly
77
+ url_keys = {
78
+ "url",
79
+ "domain",
80
+ "profileurl",
81
+ "posturl",
82
+ "seller_url",
83
+ # Additional URL-related keys that may be used
84
+ "link",
85
+ "href",
86
+ "page_url",
87
+ "product_url",
88
+ "category_url",
89
+ }
90
+ out: dict[str, Any] = {}
91
+ for k, v in params.items():
92
+ if k in url_keys:
93
+ out[k] = _normalize_url_value(v)
94
+ else:
95
+ out[k] = v
96
+ return out
97
+
98
+
52
99
  @dataclass
53
100
  class ScraperTaskConfig(ThordataBaseConfig):
54
101
  file_name: str
@@ -57,13 +104,18 @@ class ScraperTaskConfig(ThordataBaseConfig):
57
104
  parameters: dict[str, Any] | list[dict[str, Any]]
58
105
  universal_params: dict[str, Any] | None = None
59
106
  include_errors: bool = True
107
+ data_format: DataFormat | str | None = (
108
+ None # Support json, csv, xlsx output formats
109
+ )
60
110
 
61
111
  def to_payload(self) -> dict[str, Any]:
62
- # Handle batch parameters: if list, use as is; if dict, wrap in list
112
+ # Normalize parameters: decode percent-encoded URLs to reduce API/Dashboard divergence
63
113
  if isinstance(self.parameters, list):
64
- params_json = json.dumps(self.parameters)
114
+ normalized_list = [_normalize_parameters(p) for p in self.parameters]
115
+ params_json = json.dumps(normalized_list)
65
116
  else:
66
- params_json = json.dumps([self.parameters])
117
+ normalized_one = _normalize_parameters(self.parameters)
118
+ params_json = json.dumps([normalized_one])
67
119
 
68
120
  payload: dict[str, Any] = {
69
121
  "file_name": self.file_name,
@@ -74,6 +126,14 @@ class ScraperTaskConfig(ThordataBaseConfig):
74
126
  }
75
127
  if self.universal_params:
76
128
  payload["spider_universal"] = json.dumps(self.universal_params)
129
+ # Add data_format if specified (for json/csv/xlsx output)
130
+ if self.data_format:
131
+ fmt = (
132
+ self.data_format.value
133
+ if isinstance(self.data_format, DataFormat)
134
+ else str(self.data_format).lower()
135
+ )
136
+ payload["data_format"] = fmt
77
137
  return payload
78
138
 
79
139
 
@@ -87,7 +147,6 @@ class VideoTaskConfig(ThordataBaseConfig):
87
147
  include_errors: bool = True
88
148
 
89
149
  def to_payload(self) -> dict[str, Any]:
90
- # Handle batch parameters
91
150
  if isinstance(self.parameters, list):
92
151
  params_json = json.dumps(self.parameters)
93
152
  else:
@@ -99,13 +158,8 @@ class VideoTaskConfig(ThordataBaseConfig):
99
158
  "spider_name": self.spider_name,
100
159
  "spider_parameters": params_json,
101
160
  "spider_errors": "true" if self.include_errors else "false",
102
- # v2.0 Doc explicitly requires 'spider_universal' key for video tasks too sometimes,
103
- # but usually it's passed as 'common_settings' or 'spider_universal'.
104
- # Sticking to original models.py key logic for now to ensure stability.
105
161
  "spider_universal": self.common_settings.to_json(),
106
162
  }
107
- # Note: If API expects 'common_settings' key specifically, adjust here.
108
- # Based on v2 context, video builder often uses spider_universal.
109
163
  return payload
110
164
 
111
165
 
@@ -15,12 +15,15 @@ from .common import ThordataBaseConfig
15
15
  class UniversalScrapeRequest(ThordataBaseConfig):
16
16
  url: str
17
17
  js_render: bool = False
18
- output_format: str = "html" # 'html' or 'png'
18
+ output_format: str | list[str] = (
19
+ "html" # 'html', 'png', or ['png', 'html'] for both
20
+ )
19
21
  country: str | None = None
20
- block_resources: str | None = None # 'script,image'
22
+ block_resources: str | None = None # 'script,image,video'
21
23
  clean_content: str | None = None # 'js,css'
22
24
  wait: int | None = None # ms
23
25
  wait_for: str | None = None # selector
26
+ follow_redirect: bool | None = None # Follow redirects
24
27
 
25
28
  # Headers/Cookies must be serialized to JSON in payload
26
29
  headers: list[dict[str, str]] | None = None
@@ -29,12 +32,26 @@ class UniversalScrapeRequest(ThordataBaseConfig):
29
32
  extra_params: dict[str, Any] = field(default_factory=dict)
30
33
 
31
34
  def __post_init__(self) -> None:
35
+ # Normalize output_format to list for easier handling
36
+ if isinstance(self.output_format, str):
37
+ formats = [f.strip().lower() for f in self.output_format.split(",")]
38
+ else:
39
+ formats = [
40
+ f.lower() if isinstance(f, str) else str(f).lower()
41
+ for f in self.output_format
42
+ ]
43
+
32
44
  valid_formats = {"html", "png"}
33
- if self.output_format.lower() not in valid_formats:
45
+ invalid = [f for f in formats if f not in valid_formats]
46
+ if invalid:
34
47
  raise ValueError(
35
- f"Invalid output_format: {self.output_format}. Must be one of: {valid_formats}"
48
+ f"Invalid output_format: {invalid}. Must be one or more of: {valid_formats}. "
49
+ f"Use comma-separated string like 'png,html' or list ['png', 'html'] for multiple formats."
36
50
  )
37
51
 
52
+ # Store as list for to_payload
53
+ self._output_formats = formats
54
+
38
55
  if self.wait is not None and (self.wait < 0 or self.wait > 100000):
39
56
  raise ValueError("wait must be between 0 and 100000 milliseconds")
40
57
 
@@ -42,9 +59,22 @@ class UniversalScrapeRequest(ThordataBaseConfig):
42
59
  payload: dict[str, Any] = {
43
60
  "url": self.url,
44
61
  "js_render": "True" if self.js_render else "False",
45
- "type": self.output_format.lower(),
46
62
  }
47
63
 
64
+ # Handle output format: support single or multiple formats (e.g., "png,html")
65
+ if hasattr(self, "_output_formats") and self._output_formats:
66
+ if len(self._output_formats) == 1:
67
+ payload["type"] = self._output_formats[0]
68
+ else:
69
+ # Multiple formats: join with comma (e.g., "png,html")
70
+ payload["type"] = ",".join(self._output_formats)
71
+ else:
72
+ # Fallback for backward compatibility
73
+ if isinstance(self.output_format, str):
74
+ payload["type"] = self.output_format.lower()
75
+ else:
76
+ payload["type"] = ",".join([str(f).lower() for f in self.output_format])
77
+
48
78
  if self.country:
49
79
  payload["country"] = self.country.lower()
50
80
  if self.block_resources:
@@ -55,6 +85,8 @@ class UniversalScrapeRequest(ThordataBaseConfig):
55
85
  payload["wait"] = str(self.wait)
56
86
  if self.wait_for:
57
87
  payload["wait_for"] = self.wait_for
88
+ if self.follow_redirect is not None:
89
+ payload["follow_redirect"] = "True" if self.follow_redirect else "False"
58
90
 
59
91
  # Serialize complex objects as JSON strings
60
92
  if self.headers:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: thordata-sdk
3
- Version: 1.6.0
3
+ Version: 1.7.0
4
4
  Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
5
5
  Author-email: Thordata Developer Team <support@thordata.com>
6
6
  License: MIT
File without changes
File without changes
File without changes