thordata-sdk 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/__init__.py CHANGED
@@ -5,7 +5,7 @@ Official Python client for Thordata's Proxy Network, SERP API,
 Universal Scraping API (Web Unlocker), and Web Scraper API.
 """
 
-__version__ = "1.5.0"
+__version__ = "1.6.0"
 __author__ = "Thordata Developer Team/Kael Odin"
 __email__ = "support@thordata.com"
 
thordata/async_client.py CHANGED
@@ -124,10 +124,10 @@ class AsyncThordataClient:
         ).rstrip("/")
 
         self._gateway_base_url = os.getenv(
-            "THORDATA_GATEWAY_BASE_URL", "https://api.thordata.com/api/gateway"
+            "THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
         )
         self._child_base_url = os.getenv(
-            "THORDATA_CHILD_BASE_URL", "https://api.thordata.com/api/child"
+            "THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
         )
 
         # URL Construction
@@ -145,7 +145,7 @@ class AsyncThordataClient:
         self._proxy_users_url = f"{shared_api_base}/proxy-users"
 
         whitelist_base = os.getenv(
-            "THORDATA_WHITELIST_BASE_URL", "https://api.thordata.com/api"
+            "THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
         )
         self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
 
@@ -352,7 +352,7 @@ class AsyncThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
     ) -> str:
         config = ScraperTaskConfig(
@@ -434,7 +434,7 @@ class AsyncThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         common_settings: CommonSettings,
     ) -> str:
         config = VideoTaskConfig(
@@ -550,7 +550,7 @@ class AsyncThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
         *,
         max_wait: float = 600.0,
@@ -971,7 +971,12 @@ class AsyncThordataClient:
         if port:
            params["port"] = str(port)
 
-        username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
+        if product == "unlimited":
+            username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
+                "THORDATA_RESIDENTIAL_USERNAME"
+            )
+        else:
+            username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
         if username:
             params["td-customer"] = username
 
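Note: `parameters` on the task-creation methods now also accepts a list of dicts, so one task can batch several inputs. A minimal sketch, assuming the hunks above belong to a method named `create_scraper_task` (the method name sits outside the diff context) and using a spider id defined later in this diff:

    import asyncio
    from thordata import AsyncThordataClient

    async def main() -> None:
        client = AsyncThordataClient()  # credentials assumed to come from env vars
        task_id = await client.create_scraper_task(
            file_name="asin_batch",
            spider_id="amazon_product_by-asin",
            spider_name="amazon.com",
            # New in 1.6.0: a list of parameter dicts instead of a single dict
            parameters=[{"asin": "B000000001"}, {"asin": "B000000002"}],
        )
        print(task_id)

    asyncio.run(main())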
thordata/client.py CHANGED
@@ -159,10 +159,10 @@ class ThordataClient:
         ).rstrip("/")
 
         self._gateway_base_url = os.getenv(
-            "THORDATA_GATEWAY_BASE_URL", "https://api.thordata.com/api/gateway"
+            "THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
         )
         self._child_base_url = os.getenv(
-            "THORDATA_CHILD_BASE_URL", "https://api.thordata.com/api/child"
+            "THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
         )
 
         # URL Construction
@@ -183,7 +183,7 @@ class ThordataClient:
         self._proxy_users_url = f"{shared_api_base}/proxy-users"
 
         whitelist_base = os.getenv(
-            "THORDATA_WHITELIST_BASE_URL", "https://api.thordata.com/api"
+            "THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
         )
         self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
 
@@ -405,7 +405,7 @@ class ThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
     ) -> str:
         config = ScraperTaskConfig(
@@ -490,7 +490,7 @@ class ThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         common_settings: CommonSettings,
     ) -> str:
         config = VideoTaskConfig(
@@ -639,7 +639,7 @@ class ThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
         *,
         max_wait: float = 600.0,
@@ -862,7 +862,12 @@ class ThordataClient:
         if port:
            params["port"] = str(port)
 
-        username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
+        if product == "unlimited":
+            username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
+                "THORDATA_RESIDENTIAL_USERNAME"
+            )
+        else:
+            username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
         if username:
             params["td-customer"] = username
 
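The synchronous client mirrors the async changes: the default endpoints move from api.thordata.com to openapi.thordata.com, and the proxy-username lookup gains a product-specific branch. The observable configuration surface is environment variables; a sketch using only names taken from this diff (values are placeholders):

    import os

    # Optional overrides; the new defaults point at openapi.thordata.com
    os.environ["THORDATA_GATEWAY_BASE_URL"] = "https://openapi.thordata.com/api/gateway"
    os.environ["THORDATA_CHILD_BASE_URL"] = "https://openapi.thordata.com/api/child"
    os.environ["THORDATA_WHITELIST_BASE_URL"] = "https://openapi.thordata.com/api"

    # New in 1.6.0: when product == "unlimited", THORDATA_UNLIMITED_USERNAME
    # is checked first; THORDATA_RESIDENTIAL_USERNAME remains the fallback
    # used for the td-customer proxy parameter.
    os.environ["THORDATA_UNLIMITED_USERNAME"] = "my-unlimited-user"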
thordata/enums.py CHANGED
@@ -1,6 +1,6 @@
 """
 Enumerations for the Thordata Python SDK.
-Moved to thordata.types in v1.5.0.
+Moved to thordata.types in v1.6.0.
 This file is kept for backward compatibility.
 """
 
@@ -21,7 +21,7 @@ from .types import (
     SessionType,
     TaskStatus,
     TimeRange,
-    normalize_enum_value,  # 新增
+    normalize_enum_value,
 )
 
 __all__ = [
thordata/exceptions.py CHANGED
@@ -15,6 +15,7 @@ Exception Hierarchy:
 
 from __future__ import annotations
 
+from collections.abc import Mapping
 from typing import Any
 
 # =============================================================================
@@ -235,6 +236,46 @@ class ThordataNotCollectedError(ThordataAPIError):
 # =============================================================================
 
 
+def _extract_request_id(payload: Any) -> str | None:
+    if isinstance(payload, Mapping):
+        for key in ("request_id", "requestId", "x_request_id", "x-request-id"):
+            val = payload.get(key)
+            if val is not None:
+                return str(val)
+    return None
+
+
+def _extract_retry_after(payload: Any) -> int | None:
+    if isinstance(payload, Mapping):
+        for key in ("retry_after", "retryAfter", "retry-after"):
+            val = payload.get(key)
+            if isinstance(val, int):
+                return val
+            if isinstance(val, str) and val.isdigit():
+                return int(val)
+    return None
+
+
+def _build_error_message(
+    message: str,
+    *,
+    status_code: int | None,
+    code: int | None,
+    request_id: str | None,
+) -> str:
+    parts: list[str] = [message]
+    meta: list[str] = []
+    if status_code is not None:
+        meta.append(f"http={status_code}")
+    if code is not None and code != status_code:
+        meta.append(f"code={code}")
+    if request_id:
+        meta.append(f"request_id={request_id}")
+    if meta:
+        parts.append("(" + ", ".join(meta) + ")")
+    return " ".join(parts)
+
+
 def raise_for_code(
     message: str,
     *,
@@ -266,49 +307,59 @@ def raise_for_code(
     # Determine the effective error code.
     # Prefer payload `code` when present and not success (200),
     # otherwise fall back to HTTP status when it indicates an error.
+    # Determine the effective error code for routing.
     effective_code: int | None = None
-
     if code is not None and code != 200:
         effective_code = code
-    elif status_code is not None and status_code != 200:
+    elif status_code is not None and status_code >= 400:
         effective_code = status_code
     else:
         effective_code = code if code is not None else status_code
 
+    # Extract additional context from payload
+    final_request_id = request_id or _extract_request_id(payload)
+
+    # Build a consistent, informative error message
+    final_message = _build_error_message(
+        message,
+        status_code=status_code,
+        code=code,
+        request_id=final_request_id,
+    )
+
+    # Prepare common arguments for exception constructors
     kwargs = {
         "status_code": status_code,
         "code": code,
         "payload": payload,
-        "request_id": request_id,
+        "request_id": final_request_id,
     }
 
+    # --- Route to the correct exception class ---
+
     # Not collected (API payload code 300, often retryable, not billed)
-    # Check this FIRST since 300 is in API_CODES, not HTTP_STATUS_CODES
     if effective_code in ThordataNotCollectedError.API_CODES:
-        raise ThordataNotCollectedError(message, **kwargs)
+        raise ThordataNotCollectedError(final_message, **kwargs)
 
-    # Auth errors
+    # Auth errors (401, 403)
     if effective_code in ThordataAuthError.HTTP_STATUS_CODES:
-        raise ThordataAuthError(message, **kwargs)
+        raise ThordataAuthError(final_message, **kwargs)
 
-    # Rate limit errors
+    # Rate limit errors (429, 402)
     if effective_code in ThordataRateLimitError.HTTP_STATUS_CODES:
-        # Try to extract retry_after from payload
-        retry_after = None
-        if isinstance(payload, dict):
-            retry_after = payload.get("retry_after")
-        raise ThordataRateLimitError(message, retry_after=retry_after, **kwargs)
+        retry_after = _extract_retry_after(payload)
+        raise ThordataRateLimitError(final_message, retry_after=retry_after, **kwargs)
 
-    # Server errors
+    # Server errors (5xx)
     if effective_code is not None and 500 <= effective_code < 600:
-        raise ThordataServerError(message, **kwargs)
+        raise ThordataServerError(final_message, **kwargs)
 
-    # Validation errors
+    # Validation errors (400, 422)
    if effective_code in ThordataValidationError.HTTP_STATUS_CODES:
-        raise ThordataValidationError(message, **kwargs)
+        raise ThordataValidationError(final_message, **kwargs)
 
-    # Generic API error
-    raise ThordataAPIError(message, **kwargs)
+    # Fallback to generic API error if no specific match
+    raise ThordataAPIError(final_message, **kwargs)
 
 
 # =============================================================================
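The net effect of the new helpers: every exception raised by raise_for_code now carries an enriched message, and retry_after / request_id are recovered from several common payload spellings. A small sketch exercising the 429 path with keys the helpers actually check:

    from thordata.exceptions import ThordataRateLimitError, raise_for_code

    payload = {"retryAfter": "5", "requestId": "req-abc123"}
    try:
        raise_for_code("Too many requests", status_code=429, code=429, payload=payload)
    except ThordataRateLimitError as exc:
        # "5" is coerced to int 5 by _extract_retry_after; the message reads
        # "Too many requests (http=429, request_id=req-abc123)"
        print(exc.retry_after, exc)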
thordata/models.py CHANGED
@@ -1,6 +1,6 @@
 """
 Data models for the Thordata Python SDK.
-Moved to thordata.types in v1.5.0.
+Moved to thordata.types in v1.6.0.
 This file is kept for backward compatibility.
 """
 
thordata/retry.py CHANGED
@@ -186,7 +186,7 @@ def with_retry(
            if isinstance(e, ThordataRateLimitError) and e.retry_after:
                delay = max(delay, e.retry_after)
 
-            logger.warning(
+            logger.info(
                f"Retry attempt {attempt + 1}/{config.max_retries} "
                f"after {delay:.2f}s due to: {e}"
            )
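Retry attempts are now logged at INFO rather than WARNING, so they disappear under the default logging setup. To keep seeing them, raise the log level explicitly, e.g.:

    import logging

    # Retry messages moved from WARNING to INFO in 1.6.0
    logging.basicConfig(level=logging.INFO)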
thordata/tools/__init__.py CHANGED
@@ -5,15 +5,19 @@ High-level abstractions for specific scraping targets.
 
 from .base import ToolRequest, VideoToolRequest
 from .code import GitHub
-from .ecommerce import Amazon
+from .ecommerce import Amazon, Walmart, eBay
+from .professional import Crunchbase, Glassdoor, Indeed
 from .search import GoogleMaps, GooglePlay, GoogleShopping
 from .social import Facebook, Instagram, LinkedIn, Reddit, TikTok, Twitter
+from .travel import Airbnb, Booking, Zillow
 from .video import YouTube
 
 __all__ = [
     "ToolRequest",
     "VideoToolRequest",
     "Amazon",
+    "eBay",
+    "Walmart",
     "GoogleMaps",
     "GoogleShopping",
     "GooglePlay",
@@ -25,4 +29,10 @@ __all__ = [
     "Reddit",
     "YouTube",
     "GitHub",
+    "Indeed",
+    "Glassdoor",
+    "Crunchbase",
+    "Booking",
+    "Zillow",
+    "Airbnb",
 ]
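All the new namespaces are re-exported from thordata.tools. A quick construction sketch using only fields visible in this diff (ToolRequest may define further base fields not shown here):

    from thordata.tools import Walmart, eBay

    walmart_req = Walmart.ProductBySku(sku="123456789")
    ebay_req = eBay.ProductByKeywords(keywords="mechanical keyboard", count="20")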
thordata/tools/code.py CHANGED
@@ -14,13 +14,26 @@ class GitHub:
 
     @dataclass
     class Repository(ToolRequest):
-        """Github Repository Scraper"""
+        """Github Repository Scraper by Repo URL"""
 
         SPIDER_ID = "github_repository_by-repo-url"
         SPIDER_NAME = "github.com"
-
         repo_url: str
-        search_url: str | None = None
-        url: str | None = None  # The generic URL param
+
+    @dataclass
+    class RepositoryBySearchUrl(ToolRequest):
+        """Github Repository Scraper by Search URL"""
+
+        SPIDER_ID = "github_repository_by-search-url"
+        SPIDER_NAME = "github.com"
+        search_url: str
         page_turning: int | None = None
         max_num: int | None = None
+
+    @dataclass
+    class RepositoryByUrl(ToolRequest):
+        """Github Repository Scraper by URL"""
+
+        SPIDER_ID = "github_repository_by-url"
+        SPIDER_NAME = "github.com"
+        url: str
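The catch-all Repository class is split into one request class per GitHub spider, each carrying only the fields that spider accepts. For example:

    from thordata.tools import GitHub

    by_repo = GitHub.Repository(repo_url="https://github.com/psf/requests")
    by_search = GitHub.RepositoryBySearchUrl(
        search_url="https://github.com/search?q=http+client",
        page_turning=1,
        max_num=50,
    )
    by_url = GitHub.RepositoryByUrl(url="https://github.com/psf/requests")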
thordata/tools/ecommerce.py CHANGED
@@ -12,9 +12,10 @@ from .base import ToolRequest
 class Amazon:
     """Namespace for Amazon tools."""
 
+    # --- Product Details (5 methods) ---
     @dataclass
-    class Product(ToolRequest):
-        """Amazon Product Details Scraper"""
+    class ProductByAsin(ToolRequest):
+        """Amazon Product Details Scraper by ASIN."""
 
         SPIDER_ID = "amazon_product_by-asin"
         SPIDER_NAME = "amazon.com"
@@ -22,16 +23,112 @@ class Amazon:
         asin: str
         domain: str = "amazon.com"
 
+    # Backward compatible alias
+    Product = ProductByAsin
+
     @dataclass
-    class GlobalProduct(ToolRequest):
-        """Amazon Global Product Details Scraper"""
+    class ProductByUrl(ToolRequest):
+        """Amazon Product Details Scraper by URL."""
 
-        SPIDER_ID = "amazon_global-product_by-url"
+        SPIDER_ID = "amazon_product_by-url"
         SPIDER_NAME = "amazon.com"
 
         url: str
         zip_code: str | None = None
 
+    @dataclass
+    class ProductByKeywords(ToolRequest):
+        """Amazon Product Details Scraper by Keywords."""
+
+        SPIDER_ID = "amazon_product_by-keywords"
+        SPIDER_NAME = "amazon.com"
+
+        keyword: str
+        page_turning: int | None = None
+        lowest_price: float | None = None
+        highest_price: float | None = None
+
+    @dataclass
+    class ProductByCategoryUrl(ToolRequest):
+        """Amazon Product Details Scraper by Category URL."""
+
+        SPIDER_ID = "amazon_product_by-category-url"
+        SPIDER_NAME = "amazon.com"
+
+        url: str
+        sort_by: str | None = None
+        page_turning: int | None = None
+
+    @dataclass
+    class ProductByBestSellers(ToolRequest):
+        """Amazon Product Details Scraper by Best Sellers URL."""
+
+        SPIDER_ID = "amazon_product_by-best-sellers"
+        SPIDER_NAME = "amazon.com"
+
+        url: str
+        page_turning: int | None = None
+
+    # --- Other Amazon Tools ---
+
+    @dataclass
+    class GlobalProductByUrl(ToolRequest):
+        """Amazon Global Product Details Scraper by URL"""
+
+        SPIDER_ID = "amazon_global-product_by-url"
+        SPIDER_NAME = "amazon.com"
+
+        url: str
+
+    # Backward compatible alias
+    GlobalProduct = GlobalProductByUrl
+
+    @dataclass
+    class GlobalProductByCategoryUrl(ToolRequest):
+        """Amazon Global Product Details Scraper by Category URL"""
+
+        SPIDER_ID = "amazon_global-product_by-category-url"
+        SPIDER_NAME = "amazon.com"
+
+        url: str
+        sort_by: str | None = None
+        get_sponsored: str | None = None
+        maximum: int | None = None
+
+    @dataclass
+    class GlobalProductBySellerUrl(ToolRequest):
+        """Amazon Global Product Details Scraper by Seller URL"""
+
+        SPIDER_ID = "amazon_global-product_by-seller-url"
+        SPIDER_NAME = "amazon.com"
+
+        url: str
+        maximum: int | None = None
+
+    @dataclass
+    class GlobalProductByKeywords(ToolRequest):
+        """Amazon Global Product Details Scraper by Keywords"""
+
+        SPIDER_ID = "amazon_global-product_by-keywords"
+        SPIDER_NAME = "amazon.com"
+
+        keyword: str
+        domain: str = "https://www.amazon.com"
+        lowest_price: str | None = None
+        highest_price: str | None = None
+        page_turning: int | None = None
+
+    @dataclass
+    class GlobalProductByKeywordsBrand(ToolRequest):
+        """Amazon Global Product Details Scraper by Keywords and Brand"""
+
+        SPIDER_ID = "amazon_global-product_by-keywords-brand"
+        SPIDER_NAME = "amazon.com"
+
+        keyword: str
+        brands: str
+        page_turning: int | None = None
+
     @dataclass
     class Review(ToolRequest):
         """Amazon Product Review Scraper"""
@@ -59,9 +156,96 @@ class Amazon:
         SPIDER_NAME = "amazon.com"
 
         keyword: str
-        domain: str = "amazon.com"
+        domain: str = "https://www.amazon.com/"
         page_turning: int = 1
-        sort_by: str | None = None  # Best Sellers, Newest Arrivals, etc.
-        min_price: float | None = None
-        max_price: float | None = None
-        get_sponsored: bool | None = None
+
+
+class eBay:
+    """Namespace for eBay tools."""
+
+    @dataclass
+    class ProductByUrl(ToolRequest):
+        """eBay Information Scraper by URL"""
+
+        SPIDER_ID = "ebay_ebay_by-url"
+        SPIDER_NAME = "ebay.com"
+        url: str
+
+    @dataclass
+    class ProductByCategoryUrl(ToolRequest):
+        """eBay Information Scraper by Category URL"""
+
+        SPIDER_ID = "ebay_ebay_by-category-url"
+        SPIDER_NAME = "ebay.com"
+        url: str
+        count: str | None = None
+
+    @dataclass
+    class ProductByKeywords(ToolRequest):
+        """eBay Information Scraper by Keywords"""
+
+        SPIDER_ID = "ebay_ebay_by-keywords"
+        SPIDER_NAME = "ebay.com"
+        keywords: str
+        count: str | None = None
+
+    @dataclass
+    class ProductByListUrl(ToolRequest):
+        """eBay Information Scraper by List URL"""
+
+        SPIDER_ID = "ebay_ebay_by-listurl"
+        SPIDER_NAME = "ebay.com"
+        url: str
+        count: str | None = None
+
+
+class Walmart:
+    """Namespace for Walmart tools."""
+
+    @dataclass
+    class ProductByUrl(ToolRequest):
+        """Walmart Product Information Scraper by URL"""
+
+        SPIDER_ID = "walmart_product_by-url"
+        SPIDER_NAME = "walmart.com"
+        url: str
+        all_variations: str | None = None
+
+    @dataclass
+    class ProductByCategoryUrl(ToolRequest):
+        """Walmart Product Information Scraper by Category URL"""
+
+        SPIDER_ID = "walmart_product_by-category-url"
+        SPIDER_NAME = "walmart.com"
+        category_url: str
+        all_variations: str | None = None
+        page_turning: int | None = None
+
+    @dataclass
+    class ProductBySku(ToolRequest):
+        """Walmart Product Information Scraper by SKU"""
+
+        SPIDER_ID = "walmart_product_by-sku"
+        SPIDER_NAME = "walmart.com"
+        sku: str
+        all_variations: str | None = None
+
+    @dataclass
+    class ProductByKeywords(ToolRequest):
+        """Walmart Product Information Scraper by Keywords"""
+
+        SPIDER_ID = "walmart_product_by-keywords"
+        SPIDER_NAME = "walmart.com"
+        keyword: str
+        domain: str = "https://www.walmart.com/"
+        all_variations: str | None = None
+        page_turning: int | None = None
+
+    @dataclass
+    class ProductByZipcodes(ToolRequest):
+        """Walmart Product Information Scraper by Zipcodes"""
+
+        SPIDER_ID = "walmart_product_by-zipcodes"
+        SPIDER_NAME = "walmart.com"
+        url: str
+        zip_code: str | None = None
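Amazon.Product and Amazon.GlobalProduct survive as class aliases, so 1.5.x call sites keep working while new code can name the target spider explicitly:

    from thordata.tools import Amazon

    assert Amazon.Product is Amazon.ProductByAsin  # backward-compatible alias
    req = Amazon.ProductByAsin(asin="B0EXAMPLE")   # domain defaults to "amazon.com"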