thordata-sdk 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +33 -36
- thordata/_utils.py +21 -21
- thordata/async_client.py +230 -192
- thordata/client.py +281 -222
- thordata/enums.py +32 -6
- thordata/exceptions.py +60 -31
- thordata/models.py +173 -146
- thordata/parameters.py +7 -6
- thordata/retry.py +109 -111
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/METADATA +228 -10
- thordata_sdk-0.5.0.dist-info/RECORD +14 -0
- thordata_sdk-0.4.0.dist-info/RECORD +0 -14
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/top_level.txt +0 -0
thordata/models.py
CHANGED
@@ -7,16 +7,16 @@ IDE autocomplete and reduces parameter errors.

 Example:
     >>> from thordata.models import ProxyConfig, SerpRequest
-    >>>
+    >>>
     >>> # Build a proxy URL with geo-targeting
     >>> proxy = ProxyConfig(
     ...     username="myuser",
-    ...     password="mypass",
+    ...     password="mypass",
     ...     country="us",
     ...     city="seattle"
     ... )
     >>> print(proxy.build_proxy_url())
-
+
     >>> # Configure a SERP request
     >>> serp = SerpRequest(query="python tutorial", engine="google", num=20)
     >>> print(serp.to_payload())
@@ -24,29 +24,30 @@ Example:

 from __future__ import annotations

+import json
 import re
 import uuid
-import json
 from dataclasses import dataclass, field
-from typing import Optional, Dict, Any, List, Union
 from enum import Enum
-
+from typing import Any, Dict, List, Optional, Union

 # =============================================================================
 # Proxy Product Types
 # =============================================================================

+
 class ProxyProduct(str, Enum):
     """
     Thordata proxy product types with their default ports.
-
+
     Each product type has a specific port on the proxy gateway.
     """
+
     RESIDENTIAL = "residential"
     MOBILE = "mobile"
     DATACENTER = "datacenter"
     ISP = "isp"
-
+
     @property
     def default_port(self) -> int:
         """Get the default port for this proxy product."""
@@ -63,14 +64,15 @@ class ProxyProduct(str, Enum):
 # Proxy Configuration Models
 # =============================================================================

+
 @dataclass
 class ProxyConfig:
     """
     Configuration for building a Thordata proxy URL.
-
+
     This class handles the complex username format required by Thordata proxies,
     where geo-targeting and session parameters are embedded in the username.
-
+
     Args:
         username: Your Thordata account username (the part after 'td-customer-').
         password: Your Thordata account password.
@@ -78,18 +80,18 @@ class ProxyConfig:
         host: Proxy gateway host. If None, uses default based on product.
         port: Proxy gateway port. If None, uses default based on product.
         protocol: Proxy protocol - 'http' or 'https'.
-
+
         # Geo-targeting (all optional)
         continent: Target continent code (af/an/as/eu/na/oc/sa).
         country: Target country code in ISO 3166-1 alpha-2 format.
         state: Target state name in lowercase.
         city: Target city name in lowercase.
         asn: Target ASN code (e.g., 'AS12322'). Must be used with country.
-
+
         # Session control (optional)
         session_id: Session identifier for sticky sessions.
         session_duration: Session duration in minutes (1-90).
-
+
     Example:
         >>> config = ProxyConfig(
         ...     username="GnrqUwwu3obt",
@@ -103,45 +105,45 @@ class ProxyConfig:
         >>> print(config.build_proxy_url())
         http://td-customer-GnrqUwwu3obt-country-us-state-california-sessid-mysession123-sesstime-10:PkCSzvt30iww@....pr.thordata.net:9999
     """
-
+
     username: str
     password: str
     product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL
     host: Optional[str] = None
     port: Optional[int] = None
     protocol: str = "http"
-
+
     # Geo-targeting
     continent: Optional[str] = None
     country: Optional[str] = None
     state: Optional[str] = None
     city: Optional[str] = None
     asn: Optional[str] = None
-
+
     # Session control
     session_id: Optional[str] = None
    session_duration: Optional[int] = None  # minutes, 1-90
-
+
     # Valid continent codes
     VALID_CONTINENTS = {"af", "an", "as", "eu", "na", "oc", "sa"}
-
+
     def __post_init__(self) -> None:
         """Validate configuration after initialization."""
         # Normalize product to enum
         if isinstance(self.product, str):
             self.product = ProxyProduct(self.product.lower())
-
+
         # Set default host and port based on product
         if self.host is None:
             # Extract user prefix from username if available
             # Default to generic host
             self.host = "pr.thordata.net"
-
+
         if self.port is None:
             self.port = self.product.default_port
-
+
         self._validate()
-
+
     def _validate(self) -> None:
         """Validate the proxy configuration."""
         # Validate protocol
@@ -149,7 +151,7 @@ class ProxyConfig:
             raise ValueError(
                 f"Invalid protocol: {self.protocol}. Must be 'http' or 'https'."
             )
-
+
         # Validate session duration
         if self.session_duration is not None:
             if not 1 <= self.session_duration <= 90:
@@ -159,96 +161,96 @@ class ProxyConfig:
                 )
             if not self.session_id:
                 raise ValueError("session_duration requires session_id to be set")
-
+
         # Validate ASN requires country
         if self.asn and not self.country:
             raise ValueError("ASN targeting requires country to be specified")
-
+
         # Validate continent code
         if self.continent and self.continent.lower() not in self.VALID_CONTINENTS:
             raise ValueError(
                 f"Invalid continent code: {self.continent}. "
                 f"Must be one of: {', '.join(sorted(self.VALID_CONTINENTS))}"
             )
-
+
         # Validate country code format (2 letters)
         if self.country and not re.match(r"^[a-zA-Z]{2}$", self.country):
             raise ValueError(
                 f"Invalid country code: {self.country}. "
                 "Must be a 2-letter ISO 3166-1 alpha-2 code."
             )
-
+
     def build_username(self) -> str:
         """
         Build the complete username string with embedded parameters.
-
+
         Returns:
             The formatted username string for proxy authentication.
         """
         parts = [f"td-customer-{self.username}"]
-
+
         # Add geo-targeting parameters (order matters)
         if self.continent:
             parts.append(f"continent-{self.continent.lower()}")
-
+
         if self.country:
             parts.append(f"country-{self.country.lower()}")
-
+
         if self.state:
             parts.append(f"state-{self.state.lower()}")
-
+
         if self.city:
             parts.append(f"city-{self.city.lower()}")
-
+
         if self.asn:
             # Ensure ASN has correct format
             asn_value = self.asn.upper()
             if not asn_value.startswith("AS"):
                 asn_value = f"AS{asn_value}"
             parts.append(f"asn-{asn_value}")
-
+
         # Add session parameters
         if self.session_id:
             parts.append(f"sessid-{self.session_id}")
-
+
         if self.session_duration:
             parts.append(f"sesstime-{self.session_duration}")
-
+
         return "-".join(parts)
-
+
     def build_proxy_url(self) -> str:
         """
         Build the complete proxy URL.
-
+
         Returns:
             The formatted proxy URL for use with requests/aiohttp.
         """
         username = self.build_username()
         return f"{self.protocol}://{username}:{self.password}@{self.host}:{self.port}"
-
+
     def to_proxies_dict(self) -> Dict[str, str]:
         """
         Build a proxies dict suitable for the requests library.
-
+
         Returns:
             Dict with 'http' and 'https' keys pointing to the proxy URL.
         """
         url = self.build_proxy_url()
         return {"http": url, "https": url}
-
+
     def to_aiohttp_config(self) -> tuple:
         """
         Get proxy configuration for aiohttp.
-
+
         Returns:
             Tuple of (proxy_url, proxy_auth) for aiohttp.
         """
         try:
             import aiohttp
+
             proxy_url = f"{self.protocol}://{self.host}:{self.port}"
             proxy_auth = aiohttp.BasicAuth(
-                login=self.build_username(),
-                password=self.password
+                login=self.build_username(), password=self.password
             )
             return proxy_url, proxy_auth
         except ImportError:
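This hunk is mostly whitespace cleanup (plus the aiohttp.BasicAuth call collapsed onto one line), but it shows the full ProxyConfig surface: build_username() embeds geo and session parameters into the username, and build_proxy_url() / to_proxies_dict() wrap it for use with requests. A minimal usage sketch based only on fields and methods visible in this diff (credentials are placeholders):

    from thordata.models import ProxyConfig

    config = ProxyConfig(
        username="myuser",       # account username (the part after 'td-customer-')
        password="mypass",
        country="us",
        session_id="job42",
        session_duration=10,     # sticky IP for 10 minutes
    )
    # e.g. td-customer-myuser-country-us-sessid-job42-sesstime-10
    print(config.build_username())
    # {'http': <proxy url>, 'https': <proxy url>} for requests' proxies= argument
    print(config.to_proxies_dict())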
@@ -259,14 +261,14 @@ class ProxyConfig:
 class StickySession(ProxyConfig):
     """
     Convenience class for creating sticky session proxy configurations.
-
+
     A sticky session keeps the same IP address for a specified duration,
     useful for multi-step operations that require IP consistency.
-
+
     Args:
         duration_minutes: How long to keep the same IP (1-90 minutes).
         auto_session_id: If True, automatically generates a unique session ID.
-
+
     Example:
         >>> session = StickySession(
         ...     username="myuser",
@@ -277,18 +279,18 @@ class StickySession(ProxyConfig):
         >>> # Each call to build_proxy_url() uses the same session
         >>> url = session.build_proxy_url()
     """
-
+
     duration_minutes: int = 10
     auto_session_id: bool = True
-
+
     def __post_init__(self) -> None:
         # Auto-generate session ID if requested and not provided
         if self.auto_session_id and not self.session_id:
             self.session_id = uuid.uuid4().hex[:12]
-
+
         # Set session_duration from duration_minutes
         self.session_duration = self.duration_minutes
-
+
         # Call parent post_init
         super().__post_init__()

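StickySession only pre-fills ProxyConfig's session fields: __post_init__ generates a 12-character hex session_id via uuid.uuid4() when auto_session_id is set, copies duration_minutes into session_duration, then defers to the parent validation. A short sketch with the same placeholder credentials as above:

    from thordata.models import StickySession

    session = StickySession(
        username="myuser",
        password="mypass",
        country="us",
        duration_minutes=30,  # keep the same exit IP for up to 30 minutes
    )
    # The auto-generated session_id is stable, so every URL reuses one identity
    assert session.build_proxy_url() == session.build_proxy_url()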
@@ -297,86 +299,94 @@ class StickySession(ProxyConfig):
 # SERP API Models
 # =============================================================================

+
 @dataclass
 class SerpRequest:
     """
     Configuration for a SERP API request.
-
+
     Supports Google, Bing, Yandex, DuckDuckGo, and Baidu search engines.
-
+
     Args:
         query: The search query string (required).
         engine: Search engine to use (default: 'google').
         num: Number of results per page (default: 10).
         start: Result offset for pagination (default: 0).
-
+
         # Localization
         country: Country code for results (gl parameter for Google).
         language: Language code for interface (hl parameter for Google).
         google_domain: Google domain to use (e.g., 'google.co.uk').
-
+
         # Geo-targeting
         location: Location name for geo-targeting.
         uule: Encoded location parameter (use with location).
-
+
         # Search type
         search_type: Type of search (images, news, shopping, videos, etc.).
-
+
         # Filters
         safe_search: Enable safe search filtering.
         time_filter: Time range filter (hour, day, week, month, year).
-
+        no_autocorrect: Disable automatic spelling correction (nfpr).
+        filter_duplicates: Enable/disable duplicate filtering.
+
+        # Device & Rendering
+        device: Device type ('desktop', 'mobile', 'tablet').
+        render_js: Enable JavaScript rendering in SERP (render_js=True/False).
+        no_cache: Disable internal caching (no_cache=True/False).
+
+        # Output
+        output_format: 'json' (default) or 'html'.
+
         # Advanced
-
-
-
-    Example:
-        >>> req = SerpRequest(
-        ...     query="python programming",
-        ...     engine="google",
-        ...     num=20,
-        ...     country="us",
-        ...     language="en",
-        ...     search_type="news"
-        ... )
-        >>> payload = req.to_payload()
+        ludocid: Google Place ID.
+        kgmid: Google Knowledge Graph ID.
+
+        # Extra
+        extra_params: Additional parameters to pass through (ibp, lsig, si, uds, ...).
     """
-
+
     query: str
     engine: str = "google"
     num: int = 10
     start: int = 0
-
+
     # Localization
     country: Optional[str] = None  # 'gl' for Google
     language: Optional[str] = None  # 'hl' for Google
     google_domain: Optional[str] = None
     countries_filter: Optional[str] = None  # 'cr' parameter
     languages_filter: Optional[str] = None  # 'lr' parameter
-
+
     # Geo-targeting
     location: Optional[str] = None
     uule: Optional[str] = None  # Encoded location
-
+
     # Search type
-    search_type: Optional[str] = None  # tbm parameter (isch, shop, nws, vid)
-
+    search_type: Optional[str] = None  # tbm parameter (isch, shop, nws, vid, ...)
+
     # Filters
     safe_search: Optional[bool] = None
-    time_filter: Optional[str] = None  # tbs parameter
+    time_filter: Optional[str] = None  # tbs parameter (time part)
     no_autocorrect: bool = False  # nfpr parameter
-    filter_duplicates: Optional[bool] = None
-
-    #
-    device: Optional[str] = None
-
+    filter_duplicates: Optional[bool] = None  # filter parameter
+
+    # Device & Rendering
+    device: Optional[str] = None  # 'desktop', 'mobile', 'tablet'
+    render_js: Optional[bool] = None  # render_js parameter
+    no_cache: Optional[bool] = None  # no_cache parameter
+
+    # Output format
+    output_format: str = "json"  # 'json' or 'html'
+
     # Advanced Google parameters
     ludocid: Optional[str] = None  # Google Place ID
     kgmid: Optional[str] = None  # Knowledge Graph ID
-
+
     # Pass-through
     extra_params: Dict[str, Any] = field(default_factory=dict)
-
+
     # Search type mappings for tbm parameter
     SEARCH_TYPE_MAP = {
         "images": "isch",
@@ -389,7 +399,7 @@ class SerpRequest:
         "nws": "nws",
         "vid": "vid",
     }
-
+
     # Time filter mappings for tbs parameter
     TIME_FILTER_MAP = {
         "hour": "qdr:h",
@@ -398,7 +408,7 @@ class SerpRequest:
         "month": "qdr:m",
         "year": "qdr:y",
     }
-
+
     # Engine URL defaults
     ENGINE_URLS = {
         "google": "google.com",
@@ -407,93 +417,103 @@ class SerpRequest:
         "duckduckgo": "duckduckgo.com",
         "baidu": "baidu.com",
     }
-
+
     def to_payload(self) -> Dict[str, Any]:
         """
         Convert to API request payload.
-
+
         Returns:
             Dictionary ready to be sent to the SERP API.
         """
         engine = self.engine.lower()
-
+
         payload: Dict[str, Any] = {
             "engine": engine,
             "num": str(self.num),
-
+            # output_format: json=1 for JSON, json=0 for raw HTML
+            "json": "1" if self.output_format.lower() == "json" else "0",
         }
-
+
         # Handle query parameter (Yandex uses 'text', others use 'q')
         if engine == "yandex":
             payload["text"] = self.query
         else:
             payload["q"] = self.query
-
-        # Set URL based on google_domain or engine default
+
+        # Set URL / domain based on google_domain or engine default
         if self.google_domain:
+            # Explicitly set the google_domain parameter, and set url as well
+            payload["google_domain"] = self.google_domain
             payload["url"] = self.google_domain
         elif engine in self.ENGINE_URLS:
             payload["url"] = self.ENGINE_URLS[engine]
-
+
         # Pagination
         if self.start > 0:
             payload["start"] = str(self.start)
-
+
         # Localization
         if self.country:
             payload["gl"] = self.country.lower()
-
+
         if self.language:
             payload["hl"] = self.language.lower()
-
+
         if self.countries_filter:
             payload["cr"] = self.countries_filter
-
+
         if self.languages_filter:
             payload["lr"] = self.languages_filter
-
+
         # Geo-targeting
         if self.location:
             payload["location"] = self.location
-
+
         if self.uule:
             payload["uule"] = self.uule
-
-        # Search type
+
+        # Search type (tbm)
         if self.search_type:
             search_type_lower = self.search_type.lower()
             tbm_value = self.SEARCH_TYPE_MAP.get(search_type_lower, search_type_lower)
             payload["tbm"] = tbm_value
-
+
         # Filters
         if self.safe_search is not None:
             payload["safe"] = "active" if self.safe_search else "off"
-
+
         if self.time_filter:
             time_lower = self.time_filter.lower()
             tbs_value = self.TIME_FILTER_MAP.get(time_lower, time_lower)
             payload["tbs"] = tbs_value
-
+
         if self.no_autocorrect:
             payload["nfpr"] = "1"
-
+
         if self.filter_duplicates is not None:
             payload["filter"] = "1" if self.filter_duplicates else "0"
-
+
         # Device
         if self.device:
             payload["device"] = self.device.lower()
-
+
+        # Rendering & cache control
+        if self.render_js is not None:
+            payload["render_js"] = "True" if self.render_js else "False"
+
+        if self.no_cache is not None:
+            payload["no_cache"] = "True" if self.no_cache else "False"
+
         # Advanced Google parameters
         if self.ludocid:
             payload["ludocid"] = self.ludocid
-
+
         if self.kgmid:
             payload["kgmid"] = self.kgmid
-
-        # Extra parameters
+
+        # Extra parameters (ibp, lsig, si, uds, etc.)
         payload.update(self.extra_params)
-
+
         return payload


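The 0.5.0 additions to to_payload() all serialize as strings: render_js and no_cache become the literals "True"/"False" (matching the existing js_render convention in UniversalScrapeRequest), and output_format becomes json=1 or json=0. A sketch exercising just the new knobs, with assertions derived from the hunk above:

    from thordata.models import SerpRequest

    req = SerpRequest(
        query="python tutorial",
        engine="google",
        country="us",
        render_js=True,        # new in 0.5.0
        no_cache=True,         # new in 0.5.0
        output_format="html",  # new in 0.5.0: requests raw HTML via json=0
    )
    payload = req.to_payload()
    assert payload["render_js"] == "True"
    assert payload["no_cache"] == "True"
    assert payload["json"] == "0"
    assert payload["gl"] == "us"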
@@ -501,13 +521,14 @@ class SerpRequest:
 # Universal Scraper (Web Unlocker) Models
 # =============================================================================

+
 @dataclass
 class UniversalScrapeRequest:
     """
     Configuration for a Universal Scraping API (Web Unlocker) request.
-
+
     This API bypasses anti-bot protections like Cloudflare, CAPTCHAs, etc.
-
+
     Args:
         url: Target URL to scrape (required).
         js_render: Enable JavaScript rendering with headless browser.
@@ -520,7 +541,7 @@ class UniversalScrapeRequest:
         headers: Custom request headers as list of {name, value} dicts.
         cookies: Custom cookies as list of {name, value} dicts.
         extra_params: Additional parameters to pass through.
-
+
     Example:
         >>> req = UniversalScrapeRequest(
         ...     url="https://example.com",
@@ -532,7 +553,7 @@ class UniversalScrapeRequest:
         ... )
         >>> payload = req.to_payload()
     """
-
+
     url: str
     js_render: bool = False
     output_format: str = "html"  # 'html' or 'png'
@@ -544,7 +565,7 @@ class UniversalScrapeRequest:
     headers: Optional[List[Dict[str, str]]] = None  # [{"name": "...", "value": "..."}]
     cookies: Optional[List[Dict[str, str]]] = None  # [{"name": "...", "value": "..."}]
     extra_params: Dict[str, Any] = field(default_factory=dict)  # must use field() here
-
+
     def __post_init__(self) -> None:
         """Validate configuration."""
         valid_formats = {"html", "png"}
@@ -553,16 +574,16 @@ class UniversalScrapeRequest:
                 f"Invalid output_format: {self.output_format}. "
                 f"Must be one of: {', '.join(valid_formats)}"
             )
-
+
         if self.wait is not None and (self.wait < 0 or self.wait > 100000):
             raise ValueError(
                 f"wait must be between 0 and 100000 milliseconds, got {self.wait}"
             )
-
+
     def to_payload(self) -> Dict[str, Any]:
         """
         Convert to API request payload.
-
+
         Returns:
             Dictionary ready to be sent to the Universal API.
         """
@@ -571,30 +592,30 @@ class UniversalScrapeRequest:
             "js_render": "True" if self.js_render else "False",
             "type": self.output_format.lower(),
         }
-
+
         if self.country:
             payload["country"] = self.country.lower()
-
+
         if self.block_resources:
             payload["block_resources"] = self.block_resources
-
+
         if self.clean_content:
             payload["clean_content"] = self.clean_content
-
+
         if self.wait is not None:
             payload["wait"] = str(self.wait)
-
+
         if self.wait_for:
             payload["wait_for"] = self.wait_for
-
+
         if self.headers:
             payload["headers"] = json.dumps(self.headers)
-
+
         if self.cookies:
             payload["cookies"] = json.dumps(self.cookies)
-
+
         payload.update(self.extra_params)
-
+
         return payload


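Although this hunk is whitespace-only, it is the clearest view of the payload contract: headers and cookies are JSON-encoded strings (via json.dumps), not nested objects, and booleans use the "True"/"False" literals. A sketch using only fields shown in this diff:

    from thordata.models import UniversalScrapeRequest

    req = UniversalScrapeRequest(
        url="https://example.com",
        js_render=True,
        output_format="png",  # 'html' or 'png'
        wait=3000,            # milliseconds, validated to 0..100000
        headers=[{"name": "Accept-Language", "value": "en-US"}],
    )
    payload = req.to_payload()
    assert payload["js_render"] == "True"
    assert payload["type"] == "png"
    assert isinstance(payload["headers"], str)  # JSON string, not a list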
@@ -602,13 +623,14 @@ class UniversalScrapeRequest:
 # Web Scraper Task Models
 # =============================================================================

+
 @dataclass
 class ScraperTaskConfig:
     """
     Configuration for creating a Web Scraper API task.
-
+
     Note: You must get spider_id and spider_name from the Thordata Dashboard.
-
+
     Args:
         file_name: Name for the output file.
         spider_id: Spider identifier from Dashboard.
@@ -616,7 +638,7 @@ class ScraperTaskConfig:
         parameters: Spider-specific parameters.
         universal_params: Global spider settings.
         include_errors: Include error details in output.
-
+
     Example:
         >>> config = ScraperTaskConfig(
         ...     file_name="youtube_data",
@@ -629,18 +651,18 @@ class ScraperTaskConfig:
         ... )
         >>> payload = config.to_payload()
     """
-
+
     file_name: str
     spider_id: str
     spider_name: str
     parameters: Dict[str, Any]
     universal_params: Optional[Dict[str, Any]] = None
     include_errors: bool = True
-
+
     def to_payload(self) -> Dict[str, Any]:
         """
         Convert to API request payload.
-
+
         Returns:
             Dictionary ready to be sent to the Web Scraper API.
         """
@@ -651,10 +673,10 @@ class ScraperTaskConfig:
             "spider_parameters": json.dumps([self.parameters]),
             "spider_errors": "true" if self.include_errors else "false",
         }
-
+
         if self.universal_params:
             payload["spider_universal"] = json.dumps(self.universal_params)
-
+
         return payload


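Note the wrapping here: parameters is placed in a single-element list before JSON encoding ("spider_parameters": json.dumps([self.parameters])), while spider_universal is encoded directly. A sketch with hypothetical spider values (real spider_id / spider_name come from the Thordata Dashboard):

    from thordata.models import ScraperTaskConfig

    config = ScraperTaskConfig(
        file_name="youtube_data",
        spider_id="spider-123",  # placeholder - copy from Dashboard
        spider_name="youtube",   # placeholder - copy from Dashboard
        parameters={"url": "https://www.youtube.com/watch?v=..."},
    )
    payload = config.to_payload()
    assert payload["spider_parameters"].startswith("[")  # list-wrapped JSON
    assert payload["spider_errors"] == "true"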
@@ -662,37 +684,42 @@ class ScraperTaskConfig:
 # Response Models
 # =============================================================================

+
 @dataclass
 class TaskStatusResponse:
     """
     Response from task status check.
-
+
     Attributes:
         task_id: The task identifier.
         status: Current task status.
         progress: Optional progress percentage.
         message: Optional status message.
     """
-
+
     task_id: str
     status: str
     progress: Optional[int] = None
     message: Optional[str] = None
-
+
     def is_complete(self) -> bool:
         """Check if the task has completed (success or failure)."""
         terminal_statuses = {
-            "ready",
-            "
+            "ready",
+            "success",
+            "finished",
+            "failed",
+            "error",
+            "cancelled",
         }
         return self.status.lower() in terminal_statuses
-
+
     def is_success(self) -> bool:
         """Check if the task completed successfully."""
         success_statuses = {"ready", "success", "finished"}
         return self.status.lower() in success_statuses
-
+
     def is_failed(self) -> bool:
         """Check if the task failed."""
         failure_statuses = {"failed", "error"}
-        return self.status.lower() in failure_statuses
+        return self.status.lower() in failure_statuses