thordata-sdk 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
- thordata/__init__.py +1 -1
- thordata/async_client.py +55 -13
- thordata/client.py +64 -13
- thordata/enums.py +2 -2
- thordata/exceptions.py +80 -20
- thordata/models.py +1 -1
- thordata/retry.py +1 -1
- thordata/tools/__init__.py +11 -1
- thordata/tools/code.py +17 -4
- thordata/tools/ecommerce.py +194 -10
- thordata/tools/professional.py +155 -0
- thordata/tools/search.py +47 -5
- thordata/tools/social.py +225 -41
- thordata/tools/travel.py +100 -0
- thordata/tools/video.py +80 -7
- thordata/types/serp.py +6 -2
- thordata/types/task.py +75 -9
- thordata/types/universal.py +37 -5
- {thordata_sdk-1.5.0.dist-info → thordata_sdk-1.7.0.dist-info}/METADATA +63 -7
- thordata_sdk-1.7.0.dist-info/RECORD +35 -0
- {thordata_sdk-1.5.0.dist-info → thordata_sdk-1.7.0.dist-info}/WHEEL +1 -1
- thordata/_example_utils.py +0 -77
- thordata/demo.py +0 -138
- thordata_sdk-1.5.0.dist-info/RECORD +0 -35
- {thordata_sdk-1.5.0.dist-info → thordata_sdk-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.5.0.dist-info → thordata_sdk-1.7.0.dist-info}/top_level.txt +0 -0
thordata/__init__.py
CHANGED

@@ -5,7 +5,7 @@ Official Python client for Thordata's Proxy Network, SERP API,
 Universal Scraping API (Web Unlocker), and Web Scraper API.
 """
 
-__version__ = "1.5.0"
+__version__ = "1.6.0"
 __author__ = "Thordata Developer Team/Kael Odin"
 __email__ = "support@thordata.com"
 
thordata/async_client.py
CHANGED

@@ -124,10 +124,10 @@ class AsyncThordataClient:
         ).rstrip("/")
 
         self._gateway_base_url = os.getenv(
-            "THORDATA_GATEWAY_BASE_URL", "https://
+            "THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
         )
         self._child_base_url = os.getenv(
-            "THORDATA_CHILD_BASE_URL", "https://
+            "THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
         )
 
         # URL Construction
@@ -145,7 +145,7 @@ class AsyncThordataClient:
         self._proxy_users_url = f"{shared_api_base}/proxy-users"
 
         whitelist_base = os.getenv(
-            "THORDATA_WHITELIST_BASE_URL", "https://
+            "THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
         )
         self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
 
@@ -293,28 +293,36 @@ class AsyncThordataClient:
         url: str,
         *,
         js_render: bool = False,
-        output_format: str = "html",
+        output_format: str | list[str] = "html",
         country: str | None = None,
         block_resources: str | None = None,
+        clean_content: str | None = None,
         wait: int | None = None,
         wait_for: str | None = None,
+        follow_redirect: bool | None = None,
+        headers: list[dict[str, str]] | None = None,
+        cookies: list[dict[str, str]] | None = None,
         **kwargs: Any,
-    ) -> str | bytes:
+    ) -> str | bytes | dict[str, str | bytes]:
         request = UniversalScrapeRequest(
             url=url,
             js_render=js_render,
             output_format=output_format,
             country=country,
             block_resources=block_resources,
+            clean_content=clean_content,
             wait=wait,
             wait_for=wait_for,
+            follow_redirect=follow_redirect,
+            headers=headers,
+            cookies=cookies,
             extra_params=kwargs,
         )
         return await self.universal_scrape_advanced(request)
 
     async def universal_scrape_advanced(
         self, request: UniversalScrapeRequest
-    ) -> str | bytes:
+    ) -> str | bytes | dict[str, str | bytes]:
         if not self.scraper_token:
             raise ThordataConfigError("scraper_token required")
         payload = request.to_payload()
@@ -327,9 +335,17 @@ class AsyncThordataClient:
         try:
             resp_json = await response.json()
         except ValueError:
-
-
-
+            # If not JSON, return raw content based on format
+            if isinstance(request.output_format, list) or (
+                isinstance(request.output_format, str) and "," in request.output_format
+            ):
+                return {"raw": await response.read()}
+            fmt = (
+                request.output_format.lower()
+                if isinstance(request.output_format, str)
+                else str(request.output_format).lower()
+            )
+            return await response.read() if fmt == "png" else await response.text()
 
         if isinstance(resp_json, dict):
             code = resp_json.get("code")
@@ -337,6 +353,27 @@ class AsyncThordataClient:
             msg = extract_error_message(resp_json)
             raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
 
+        # Handle multiple output formats
+        if isinstance(request.output_format, list) or (
+            isinstance(request.output_format, str) and "," in request.output_format
+        ):
+            result: dict[str, str | bytes] = {}
+            formats = (
+                request.output_format
+                if isinstance(request.output_format, list)
+                else [f.strip() for f in request.output_format.split(",")]
+            )
+
+            for fmt in formats:
+                fmt_lower = fmt.lower()
+                if fmt_lower == "html" and "html" in resp_json:
+                    result["html"] = resp_json["html"]
+                elif fmt_lower == "png" and "png" in resp_json:
+                    result["png"] = decode_base64_image(resp_json["png"])
+
+            if result:
+                return result
+
         if "html" in resp_json:
             return resp_json["html"]
         if "png" in resp_json:
@@ -352,7 +389,7 @@ class AsyncThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
     ) -> str:
         config = ScraperTaskConfig(
@@ -434,7 +471,7 @@ class AsyncThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         common_settings: CommonSettings,
     ) -> str:
         config = VideoTaskConfig(
@@ -550,7 +587,7 @@ class AsyncThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
         *,
         max_wait: float = 600.0,
@@ -971,7 +1008,12 @@ class AsyncThordataClient:
         if port:
             params["port"] = str(port)
 
-
+        if product == "unlimited":
+            username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
+                "THORDATA_RESIDENTIAL_USERNAME"
+            )
+        else:
+            username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
         if username:
             params["td-customer"] = username
 
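A minimal sketch of the new multi-format behavior from the caller's side. The universal_scrape parameters and the dict return shape come from the hunks above; the import path and constructor arguments are assumptions:

    import asyncio

    from thordata import AsyncThordataClient  # import path assumed

    async def main() -> None:
        client = AsyncThordataClient(scraper_token="YOUR_TOKEN")  # args assumed
        # Passing a list (or a comma-separated string like "html,png") now
        # yields a dict keyed by format instead of a single str/bytes value.
        result = await client.universal_scrape(
            "https://example.com",
            js_render=True,
            output_format=["html", "png"],
        )
        if isinstance(result, dict):
            html = result["html"]  # str
            png = result["png"]    # bytes (base64-decoded by the client)
            print(len(html))
            with open("page.png", "wb") as f:
                f.write(png)

    asyncio.run(main())

Single-format calls keep the old str | bytes return type, so existing callers are unaffected.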
thordata/client.py
CHANGED

@@ -53,6 +53,7 @@ from .serp_engines import SerpNamespace
 # Import Types (Modernized)
 from .types import (
     CommonSettings,
+    DataFormat,
     ProxyConfig,
     ProxyProduct,
     ProxyServer,
@@ -159,10 +160,10 @@ class ThordataClient:
         ).rstrip("/")
 
         self._gateway_base_url = os.getenv(
-            "THORDATA_GATEWAY_BASE_URL", "https://
+            "THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
         )
         self._child_base_url = os.getenv(
-            "THORDATA_CHILD_BASE_URL", "https://
+            "THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
         )
 
         # URL Construction
@@ -183,7 +184,7 @@ class ThordataClient:
         self._proxy_users_url = f"{shared_api_base}/proxy-users"
 
         whitelist_base = os.getenv(
-            "THORDATA_WHITELIST_BASE_URL", "https://
+            "THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
         )
         self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
 
@@ -364,26 +365,36 @@ class ThordataClient:
         url: str,
         *,
         js_render: bool = False,
-        output_format: str = "html",
+        output_format: str | list[str] = "html",
         country: str | None = None,
         block_resources: str | None = None,
+        clean_content: str | None = None,
         wait: int | None = None,
         wait_for: str | None = None,
+        follow_redirect: bool | None = None,
+        headers: list[dict[str, str]] | None = None,
+        cookies: list[dict[str, str]] | None = None,
         **kwargs: Any,
-    ) -> str | bytes:
+    ) -> str | bytes | dict[str, str | bytes]:
         request = UniversalScrapeRequest(
             url=url,
             js_render=js_render,
             output_format=output_format,
             country=country,
             block_resources=block_resources,
+            clean_content=clean_content,
             wait=wait,
             wait_for=wait_for,
+            follow_redirect=follow_redirect,
+            headers=headers,
+            cookies=cookies,
             extra_params=kwargs,
         )
         return self.universal_scrape_advanced(request)
 
-    def universal_scrape_advanced(
+    def universal_scrape_advanced(
+        self, request: UniversalScrapeRequest
+    ) -> str | bytes | dict[str, str | bytes]:
         if not self.scraper_token:
             raise ThordataConfigError("scraper_token required")
 
@@ -405,7 +416,7 @@ class ThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
     ) -> str:
         config = ScraperTaskConfig(
@@ -490,7 +501,7 @@ class ThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         common_settings: CommonSettings,
     ) -> str:
         config = VideoTaskConfig(
@@ -639,7 +650,7 @@ class ThordataClient:
         file_name: str,
         spider_id: str,
         spider_name: str,
-        parameters: dict[str, Any],
+        parameters: dict[str, Any] | list[dict[str, Any]],
         universal_params: dict[str, Any] | None = None,
         *,
         max_wait: float = 600.0,
@@ -648,6 +659,7 @@ class ThordataClient:
         include_errors: bool = True,
         task_type: str = "web",
         common_settings: CommonSettings | None = None,
+        data_format: DataFormat | str | None = None,
     ) -> str:
         import time
 
@@ -671,6 +683,7 @@ class ThordataClient:
             parameters=parameters,
             universal_params=universal_params,
             include_errors=include_errors,
+            data_format=data_format,
         )
         task_id = self.create_scraper_task_advanced(config)
 
@@ -862,7 +875,12 @@ class ThordataClient:
         if port:
             params["port"] = str(port)
 
-
+        if product == "unlimited":
+            username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
+                "THORDATA_RESIDENTIAL_USERNAME"
+            )
+        else:
+            username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
         if username:
             params["td-customer"] = username
 
@@ -1207,12 +1225,22 @@ class ThordataClient:
     # =========================================================================
 
     def _process_universal_response(
-        self, response: requests.Response, output_format: str
-    ) -> str | bytes:
+        self, response: requests.Response, output_format: str | list[str]
+    ) -> str | bytes | dict[str, str | bytes]:
+        """Process universal scrape response. Returns single value or dict if multiple formats requested."""
         try:
             resp_json = response.json()
         except ValueError:
-
+            # If not JSON, return raw content based on format
+            if isinstance(output_format, list):
+                # Multiple formats requested but got non-JSON response
+                return {"raw": response.content}
+            fmt = (
+                output_format.lower()
+                if isinstance(output_format, str)
+                else str(output_format).lower()
+            )
+            return response.content if fmt == "png" else response.text
 
         if isinstance(resp_json, dict):
             code = resp_json.get("code")
@@ -1220,6 +1248,29 @@ class ThordataClient:
             msg = extract_error_message(resp_json)
             raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
 
+        # Handle multiple output formats
+        if isinstance(output_format, list) or (
+            isinstance(output_format, str) and "," in output_format
+        ):
+            result: dict[str, str | bytes] = {}
+            formats = (
+                output_format
+                if isinstance(output_format, list)
+                else [f.strip() for f in output_format.split(",")]
+            )
+
+            for fmt in formats:
+                fmt_lower = fmt.lower()
+                if fmt_lower == "html" and "html" in resp_json:
+                    result["html"] = resp_json["html"]
+                elif fmt_lower == "png" and "png" in resp_json:
+                    result["png"] = decode_base64_image(resp_json["png"])
+
+            # If we got results, return dict; otherwise return single value for backward compatibility
+            if result:
+                return result
+
+        # Single format (backward compatibility)
         if "html" in resp_json:
             return resp_json["html"]
         if "png" in resp_json:
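A sketch of the sync client's expanded request surface. The parameter names come from the signature above; the import path, constructor arguments, and the exact dict shapes for headers/cookies are assumptions:

    from thordata import ThordataClient  # import path assumed

    client = ThordataClient(scraper_token="YOUR_TOKEN")  # args assumed

    html = client.universal_scrape(
        "https://example.com/product",
        js_render=True,
        clean_content="true",                              # forwarded as given (str per the signature)
        follow_redirect=True,
        headers=[{"User-Agent": "Mozilla/5.0"}],           # dict shape assumed
        cookies=[{"name": "session", "value": "abc123"}],  # dict shape assumed
    )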
thordata/enums.py
CHANGED

@@ -1,6 +1,6 @@
 """
 Enumerations for the Thordata Python SDK.
-Moved to thordata.types in v1.
+Moved to thordata.types in v1.6.0.
 This file is kept for backward compatibility.
 """
 
@@ -21,7 +21,7 @@ from .types import (
     SessionType,
     TaskStatus,
     TimeRange,
-    normalize_enum_value,
+    normalize_enum_value,
 )
 
 __all__ = [
thordata/exceptions.py
CHANGED

@@ -15,6 +15,7 @@ Exception Hierarchy:
 
 from __future__ import annotations
 
+from collections.abc import Mapping
 from typing import Any
 
 # =============================================================================
@@ -235,6 +236,46 @@ class ThordataNotCollectedError(ThordataAPIError):
 # =============================================================================
 
 
+def _extract_request_id(payload: Any) -> str | None:
+    if isinstance(payload, Mapping):
+        for key in ("request_id", "requestId", "x_request_id", "x-request-id"):
+            val = payload.get(key)
+            if val is not None:
+                return str(val)
+    return None
+
+
+def _extract_retry_after(payload: Any) -> int | None:
+    if isinstance(payload, Mapping):
+        for key in ("retry_after", "retryAfter", "retry-after"):
+            val = payload.get(key)
+            if isinstance(val, int):
+                return val
+            if isinstance(val, str) and val.isdigit():
+                return int(val)
+    return None
+
+
+def _build_error_message(
+    message: str,
+    *,
+    status_code: int | None,
+    code: int | None,
+    request_id: str | None,
+) -> str:
+    parts: list[str] = [message]
+    meta: list[str] = []
+    if status_code is not None:
+        meta.append(f"http={status_code}")
+    if code is not None and code != status_code:
+        meta.append(f"code={code}")
+    if request_id:
+        meta.append(f"request_id={request_id}")
+    if meta:
+        parts.append("(" + ", ".join(meta) + ")")
+    return " ".join(parts)
+
+
 def raise_for_code(
     message: str,
     *,
@@ -266,49 +307,59 @@ def raise_for_code(
     # Determine the effective error code.
     # Prefer payload `code` when present and not success (200),
     # otherwise fall back to HTTP status when it indicates an error.
+    # Determine the effective error code for routing.
     effective_code: int | None = None
-
     if code is not None and code != 200:
         effective_code = code
-    elif status_code is not None and status_code
+    elif status_code is not None and status_code >= 400:
         effective_code = status_code
     else:
         effective_code = code if code is not None else status_code
 
+    # Extract additional context from payload
+    final_request_id = request_id or _extract_request_id(payload)
+
+    # Build a consistent, informative error message
+    final_message = _build_error_message(
+        message,
+        status_code=status_code,
+        code=code,
+        request_id=final_request_id,
+    )
+
+    # Prepare common arguments for exception constructors
     kwargs = {
         "status_code": status_code,
         "code": code,
         "payload": payload,
-        "request_id":
+        "request_id": final_request_id,
     }
 
+    # --- Route to the correct exception class ---
+
     # Not collected (API payload code 300, often retryable, not billed)
-    # Check this FIRST since 300 is in API_CODES, not HTTP_STATUS_CODES
     if effective_code in ThordataNotCollectedError.API_CODES:
-        raise ThordataNotCollectedError(
+        raise ThordataNotCollectedError(final_message, **kwargs)
 
-    # Auth errors
+    # Auth errors (401, 403)
     if effective_code in ThordataAuthError.HTTP_STATUS_CODES:
-        raise ThordataAuthError(
+        raise ThordataAuthError(final_message, **kwargs)
 
-    # Rate limit errors
+    # Rate limit errors (429, 402)
     if effective_code in ThordataRateLimitError.HTTP_STATUS_CODES:
-
-        retry_after
-        if isinstance(payload, dict):
-            retry_after = payload.get("retry_after")
-        raise ThordataRateLimitError(message, retry_after=retry_after, **kwargs)
+        retry_after = _extract_retry_after(payload)
+        raise ThordataRateLimitError(final_message, retry_after=retry_after, **kwargs)
 
-    # Server errors
+    # Server errors (5xx)
     if effective_code is not None and 500 <= effective_code < 600:
-        raise ThordataServerError(
+        raise ThordataServerError(final_message, **kwargs)
 
-    # Validation errors
+    # Validation errors (400, 422)
    if effective_code in ThordataValidationError.HTTP_STATUS_CODES:
-        raise ThordataValidationError(
+        raise ThordataValidationError(final_message, **kwargs)
 
-    #
-    raise ThordataAPIError(
+    # Fallback to generic API error if no specific match
+    raise ThordataAPIError(final_message, **kwargs)
 
 
 # =============================================================================
@@ -339,7 +390,16 @@ def is_retryable_exception(exc: Exception) -> bool:
     try:
         import requests
 
-
+        # requests exposes SSLError under requests.exceptions.SSLError (not requests.SSLError)
+        ssl_error = getattr(getattr(requests, "exceptions", None), "SSLError", None)
+        retryable: tuple[type[BaseException], ...] = (
+            requests.Timeout,
+            requests.ConnectionError,
+        )
+        if ssl_error is not None:
+            retryable = retryable + (ssl_error,)
+
+        if isinstance(exc, retryable):
             return True
     except ImportError:
         pass
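A sketch of the richer errors this refactor produces. raise_for_code and its keyword names appear in the diff; the formatted string follows _build_error_message above:

    from thordata.exceptions import ThordataNotCollectedError, raise_for_code

    try:
        raise_for_code(
            "Universal Error: target not collected",
            status_code=200,
            code=300,
            payload={"code": 300, "requestId": "abc-123"},
        )
    except ThordataNotCollectedError as exc:
        # request_id is now pulled from the payload and folded into the message:
        # "Universal Error: target not collected (http=200, code=300, request_id=abc-123)"
        print(exc)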
thordata/models.py
CHANGED
thordata/retry.py
CHANGED

@@ -186,7 +186,7 @@ def with_retry(
         if isinstance(e, ThordataRateLimitError) and e.retry_after:
             delay = max(delay, e.retry_after)
 
-        logger.
+        logger.info(
             f"Retry attempt {attempt + 1}/{config.max_retries} "
             f"after {delay:.2f}s due to: {e}"
         )
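Retry attempts are now logged at INFO, so they surface under standard logging configuration; a sketch (the logger name is assumed):

    import logging

    logging.basicConfig(level=logging.INFO)
    logging.getLogger("thordata").setLevel(logging.INFO)  # logger name assumed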
thordata/tools/__init__.py
CHANGED

@@ -5,15 +5,19 @@ High-level abstractions for specific scraping targets.
 
 from .base import ToolRequest, VideoToolRequest
 from .code import GitHub
-from .ecommerce import Amazon
+from .ecommerce import Amazon, Walmart, eBay
+from .professional import Crunchbase, Glassdoor, Indeed
 from .search import GoogleMaps, GooglePlay, GoogleShopping
 from .social import Facebook, Instagram, LinkedIn, Reddit, TikTok, Twitter
+from .travel import Airbnb, Booking, Zillow
 from .video import YouTube
 
 __all__ = [
     "ToolRequest",
     "VideoToolRequest",
     "Amazon",
+    "eBay",
+    "Walmart",
     "GoogleMaps",
     "GoogleShopping",
     "GooglePlay",
@@ -25,4 +29,10 @@ __all__ = [
     "Reddit",
     "YouTube",
     "GitHub",
+    "Indeed",
+    "Glassdoor",
+    "Crunchbase",
+    "Booking",
+    "Zillow",
+    "Airbnb",
 ]
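The new namespaces are importable alongside the existing ones; each groups dataclass-based ToolRequest definitions for one site (the GitHub classes in the next file show the pattern):

    from thordata.tools import (
        Airbnb,
        Booking,
        Crunchbase,
        Glassdoor,
        Indeed,
        Walmart,
        Zillow,
        eBay,
    )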
thordata/tools/code.py
CHANGED

@@ -14,13 +14,26 @@ class GitHub:
 
     @dataclass
     class Repository(ToolRequest):
-        """Github Repository Scraper"""
+        """Github Repository Scraper by Repo URL"""
 
         SPIDER_ID = "github_repository_by-repo-url"
         SPIDER_NAME = "github.com"
-
         repo_url: str
-
-
+
+    @dataclass
+    class RepositoryBySearchUrl(ToolRequest):
+        """Github Repository Scraper by Search URL"""
+
+        SPIDER_ID = "github_repository_by-search-url"
+        SPIDER_NAME = "github.com"
+        search_url: str
         page_turning: int | None = None
         max_num: int | None = None
+
+    @dataclass
+    class RepositoryByUrl(ToolRequest):
+        """Github Repository Scraper by URL"""
+
+        SPIDER_ID = "github_repository_by-url"
+        SPIDER_NAME = "github.com"
+        url: str