thordata-sdk 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/client.py CHANGED
@@ -1,13 +1,57 @@
1
- import requests
1
+ """
2
+ Synchronous client for the Thordata API.
3
+
4
+ This module provides the main ThordataClient class for interacting with
5
+ Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
6
+
7
+ Example:
8
+ >>> from thordata import ThordataClient
9
+ >>>
10
+ >>> client = ThordataClient(
11
+ ... scraper_token="your_token",
12
+ ... public_token="your_public_token",
13
+ ... public_key="your_public_key"
14
+ ... )
15
+ >>>
16
+ >>> # Use the proxy network
17
+ >>> response = client.get("https://httpbin.org/ip")
18
+ >>> print(response.json())
19
+ >>>
20
+ >>> # Search with SERP API
21
+ >>> results = client.serp_search("python tutorial", engine="google")
22
+ """
23
+
24
+ from __future__ import annotations
25
+
2
26
  import logging
3
- import json
4
- import base64
5
- from typing import Dict, Any, Union, Optional, List
27
+ from typing import Any, Dict, List, Optional, Union
28
+
29
+ import os
30
+ import requests
6
31
 
7
- from .enums import Engine
8
- from .parameters import normalize_serp_params
32
+ from ._utils import (
33
+ build_auth_headers,
34
+ build_public_api_headers,
35
+ decode_base64_image,
36
+ extract_error_message,
37
+ parse_json_response,
38
+ )
39
+ from .enums import Engine, ProxyType
40
+ from .exceptions import (
41
+ ThordataConfigError,
42
+ ThordataNetworkError,
43
+ ThordataTimeoutError,
44
+ raise_for_code,
45
+ )
46
+ from .models import (
47
+ ProxyConfig,
48
+ ProxyProduct,
49
+ ScraperTaskConfig,
50
+ SerpRequest,
51
+ UniversalScrapeRequest,
52
+ )
53
+ from .retry import RetryConfig, with_retry
9
54
 
10
- # Configure a library-specific logger to avoid interfering with user's logging
11
55
  logger = logging.getLogger(__name__)
12
56
 
13
57
 
@@ -16,471 +60,936 @@ class ThordataClient:
16
60
  The official synchronous Python client for Thordata.
17
61
 
18
62
  This client handles authentication and communication with:
19
- 1. Proxy Network (Residential/Datacenter via HTTP/HTTPS)
20
- 2. SERP API (Real-time Search Engine Results)
21
- 3. Universal Scraping API (Single Page Rendering & Extraction)
22
- 4. Web Scraper API (Async Task Management for large scale jobs)
63
+ - Proxy Network (Residential/Datacenter/Mobile/ISP via HTTP/HTTPS)
64
+ - SERP API (Real-time Search Engine Results)
65
+ - Universal Scraping API (Web Unlocker - Single Page Rendering)
66
+ - Web Scraper API (Async Task Management)
67
+
68
+ Args:
69
+ scraper_token: The API token from your Dashboard.
70
+ public_token: The public API token (for task status, locations).
71
+ public_key: The public API key.
72
+ proxy_host: Custom proxy gateway host (optional).
73
+ proxy_port: Custom proxy gateway port (optional).
74
+ timeout: Default request timeout in seconds (default: 30).
75
+ retry_config: Configuration for automatic retries (optional).
76
+
77
+ Example:
78
+ >>> client = ThordataClient(
79
+ ... scraper_token="your_scraper_token",
80
+ ... public_token="your_public_token",
81
+ ... public_key="your_public_key"
82
+ ... )
23
83
  """
24
84
 
85
+ # API Endpoints
86
+ BASE_URL = "https://scraperapi.thordata.com"
87
+ UNIVERSAL_URL = "https://universalapi.thordata.com"
88
+ API_URL = "https://api.thordata.com/api/web-scraper-api"
89
+ LOCATIONS_URL = "https://api.thordata.com/api/locations"
90
+
25
91
  def __init__(
26
92
  self,
27
93
  scraper_token: str,
28
- public_token: str,
29
- public_key: str,
30
- proxy_host: str = "gate.thordata.com",
31
- proxy_port: int = 22225
32
- ):
33
- """
34
- Initialize the Thordata Client.
94
+ public_token: Optional[str] = None,
95
+ public_key: Optional[str] = None,
96
+ proxy_host: str = "pr.thordata.net",
97
+ proxy_port: int = 9999,
98
+ timeout: int = 30,
99
+ retry_config: Optional[RetryConfig] = None,
100
+ scraperapi_base_url: Optional[str] = None,
101
+ universalapi_base_url: Optional[str] = None,
102
+ web_scraper_api_base_url: Optional[str] = None,
103
+ locations_base_url: Optional[str] = None,
104
+ ) -> None:
105
+ """Initialize the Thordata Client."""
106
+ if not scraper_token:
107
+ raise ThordataConfigError("scraper_token is required")
35
108
 
36
- Args:
37
- scraper_token (str): The secret token found at the bottom of the Dashboard.
38
- public_token (str): The token from the Public API section.
39
- public_key (str): The key from the Public API section.
40
- proxy_host (str): The proxy gateway host (default: gate.thordata.com).
41
- proxy_port (int): The proxy gateway port (default: 22225).
42
- """
43
109
  self.scraper_token = scraper_token
44
110
  self.public_token = public_token
45
111
  self.public_key = public_key
46
112
 
47
- # Proxy Configuration
48
- self.proxy_url = (
49
- f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
113
+ # Proxy configuration
114
+ self._proxy_host = proxy_host
115
+ self._proxy_port = proxy_port
116
+ self._default_timeout = timeout
117
+
118
+ # Retry configuration
119
+ self._retry_config = retry_config or RetryConfig()
120
+
121
+ # Build default proxy URL (for basic usage)
122
+ self._default_proxy_url = (
123
+ f"http://td-customer-{self.scraper_token}:@{proxy_host}:{proxy_port}"
50
124
  )
51
125
 
52
- # API Endpoints Definition
53
- self.base_url = "https://scraperapi.thordata.com"
54
- self.universal_url = "https://universalapi.thordata.com"
55
- self.api_url = "https://api.thordata.com/api/web-scraper-api"
56
- self.locations_url = "https://api.thordata.com/api/locations"
57
-
58
- self.SERP_API_URL = f"{self.base_url}/request"
59
- self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
60
- self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
61
- self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
62
- self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
63
-
64
- # Initialize Session with Proxy settings
65
- self.session = requests.Session()
66
- self.session.proxies = {
67
- "http": self.proxy_url,
68
- "https": self.proxy_url,
126
+ # Sessions:
127
+ # - _proxy_session: used for proxy network traffic to target sites
128
+ # - _api_session: used for Thordata APIs (SERP/Universal/Tasks/Locations)
129
+ #
130
+ # We intentionally do NOT set session-level proxies for _api_session,
131
+ # so developers can rely on system proxy settings (e.g., Clash) via env vars.
132
+ self._proxy_session = requests.Session()
133
+ self._proxy_session.trust_env = False
134
+ self._proxy_session.proxies = {
135
+ "http": self._default_proxy_url,
136
+ "https": self._default_proxy_url,
69
137
  }
70
138
 
71
- def get(self, url: str, **kwargs) -> requests.Response:
139
+ self._api_session = requests.Session()
140
+ self._api_session.trust_env = True
141
+
142
+ # Base URLs (allow override via args or env vars for testing and custom routing)
143
+ scraperapi_base = (
144
+ scraperapi_base_url
145
+ or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
146
+ or self.BASE_URL
147
+ ).rstrip("/")
148
+
149
+ universalapi_base = (
150
+ universalapi_base_url
151
+ or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
152
+ or self.UNIVERSAL_URL
153
+ ).rstrip("/")
154
+
155
+ web_scraper_api_base = (
156
+ web_scraper_api_base_url
157
+ or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
158
+ or self.API_URL
159
+ ).rstrip("/")
160
+
161
+ locations_base = (
162
+ locations_base_url
163
+ or os.getenv("THORDATA_LOCATIONS_BASE_URL")
164
+ or self.LOCATIONS_URL
165
+ ).rstrip("/")
166
+
167
+ self._serp_url = f"{scraperapi_base}/request"
168
+ self._builder_url = f"{scraperapi_base}/builder"
169
+ self._universal_url = f"{universalapi_base}/request"
170
+ self._status_url = f"{web_scraper_api_base}/tasks-status"
171
+ self._download_url = f"{web_scraper_api_base}/tasks-download"
172
+ self._locations_base_url = locations_base
173
+
174
+ # =========================================================================
175
+ # Proxy Network Methods
176
+ # =========================================================================
177
+
178
+ def get(
179
+ self,
180
+ url: str,
181
+ *,
182
+ proxy_config: Optional[ProxyConfig] = None,
183
+ timeout: Optional[int] = None,
184
+ **kwargs: Any,
185
+ ) -> requests.Response:
186
+ """
187
+ Send a GET request through the Thordata Proxy Network.
188
+
189
+ Args:
190
+ url: The target URL.
191
+ proxy_config: Custom proxy configuration for geo-targeting/sessions.
192
+ timeout: Request timeout in seconds.
193
+ **kwargs: Additional arguments to pass to requests.get().
194
+
195
+ Returns:
196
+ The response object.
197
+
198
+ Example:
199
+ >>> # Basic request
200
+ >>> response = client.get("https://httpbin.org/ip")
201
+ >>>
202
+ >>> # With geo-targeting
203
+ >>> from thordata.models import ProxyConfig
204
+ >>> config = ProxyConfig(
205
+ ... username="myuser",
206
+ ... password="mypass",
207
+ ... country="us",
208
+ ... city="seattle"
209
+ ... )
210
+ >>> response = client.get("https://httpbin.org/ip", proxy_config=config)
211
+ """
212
+ logger.debug(f"Proxy GET request: {url}")
213
+
214
+ timeout = timeout or self._default_timeout
215
+
216
+ if proxy_config:
217
+ proxies = proxy_config.to_proxies_dict()
218
+ kwargs["proxies"] = proxies
219
+
220
+ return self._request_with_retry("GET", url, timeout=timeout, **kwargs)
221
+
222
+ def post(
223
+ self,
224
+ url: str,
225
+ *,
226
+ proxy_config: Optional[ProxyConfig] = None,
227
+ timeout: Optional[int] = None,
228
+ **kwargs: Any,
229
+ ) -> requests.Response:
230
+ """
231
+ Send a POST request through the Thordata Proxy Network.
232
+
233
+ Args:
234
+ url: The target URL.
235
+ proxy_config: Custom proxy configuration.
236
+ timeout: Request timeout in seconds.
237
+ **kwargs: Additional arguments to pass to requests.post().
238
+
239
+ Returns:
240
+ The response object.
241
+ """
242
+ logger.debug(f"Proxy POST request: {url}")
243
+
244
+ timeout = timeout or self._default_timeout
245
+
246
+ if proxy_config:
247
+ proxies = proxy_config.to_proxies_dict()
248
+ kwargs["proxies"] = proxies
249
+
250
+ return self._request_with_retry("POST", url, timeout=timeout, **kwargs)
251
+
252
+ def build_proxy_url(
253
+ self,
254
+ *,
255
+ country: Optional[str] = None,
256
+ state: Optional[str] = None,
257
+ city: Optional[str] = None,
258
+ session_id: Optional[str] = None,
259
+ session_duration: Optional[int] = None,
260
+ product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL,
261
+ ) -> str:
72
262
  """
73
- Send a standard GET request through the Thordata Residential Proxy Network.
263
+ Build a proxy URL with custom targeting options.
264
+
265
+ This is a convenience method for creating proxy URLs without
266
+ manually constructing a ProxyConfig.
74
267
 
75
268
  Args:
76
- url (str): The target URL.
77
- **kwargs: Arguments to pass to requests.get().
269
+ country: Target country code (e.g., 'us', 'gb').
270
+ state: Target state (e.g., 'california').
271
+ city: Target city (e.g., 'seattle').
272
+ session_id: Session ID for sticky sessions.
273
+ session_duration: Session duration in minutes (1-90).
274
+ product: Proxy product type.
78
275
 
79
276
  Returns:
80
- requests.Response: The response object.
277
+ The proxy URL string.
278
+
279
+ Example:
280
+ >>> url = client.build_proxy_url(country="us", city="seattle")
281
+ >>> proxies = {"http": url, "https": url}
282
+ >>> requests.get("https://example.com", proxies=proxies)
81
283
  """
82
- logger.debug(f"Proxy Request: {url}")
83
- kwargs.setdefault("timeout", 30)
84
- return self.session.get(url, **kwargs)
284
+ config = ProxyConfig(
285
+ username=self.scraper_token,
286
+ password="",
287
+ host=self._proxy_host,
288
+ port=self._proxy_port,
289
+ product=product,
290
+ country=country,
291
+ state=state,
292
+ city=city,
293
+ session_id=session_id,
294
+ session_duration=session_duration,
295
+ )
296
+ return config.build_proxy_url()
297
+
298
+ # =========================================================================
299
+ # SERP API Methods
300
+ # =========================================================================
85
301
 
86
302
  def serp_search(
87
- self,
88
- query: str,
303
+ self,
304
+ query: str,
305
+ *,
89
306
  engine: Union[Engine, str] = Engine.GOOGLE,
90
- num: int = 10,
91
- **kwargs
307
+ num: int = 10,
308
+ country: Optional[str] = None,
309
+ language: Optional[str] = None,
310
+ search_type: Optional[str] = None,
311
+ device: Optional[str] = None,
312
+ render_js: Optional[bool] = None,
313
+ no_cache: Optional[bool] = None,
314
+ output_format: str = "json",
315
+ **kwargs: Any,
92
316
  ) -> Dict[str, Any]:
93
317
  """
94
318
  Execute a real-time SERP (Search Engine Results Page) search.
95
-
319
+
96
320
  Args:
97
- query (str): The search keywords.
98
- engine (Union[Engine, str]): The search engine (e.g., 'google', 'bing').
99
- num (int): Number of results to retrieve (default 10).
100
- **kwargs: Additional parameters (e.g., type="shopping", location="London").
321
+ query: The search keywords.
322
+ engine: Search engine (google, bing, yandex, duckduckgo, baidu).
323
+ num: Number of results to retrieve (default: 10).
324
+ country: Country code for localized results (e.g., 'us').
325
+ language: Language code for interface (e.g., 'en').
326
+ search_type: Type of search (images, news, shopping, videos, etc.).
327
+ device: Device type ('desktop', 'mobile', 'tablet').
328
+ render_js: Enable JavaScript rendering for the SERP request.
329
+ no_cache: Bypass the internal result cache.
330
+ output_format: 'json' to return parsed JSON (default),
331
+ 'html' to return HTML wrapped in {'html': ...}.
332
+ **kwargs: Additional engine-specific parameters.
101
333
 
102
334
  Returns:
103
- Dict[str, Any]: The parsed JSON result from the search engine.
335
+ Dict[str, Any]: Parsed JSON results or a dict with 'html' key.
336
+
337
+ Example:
338
+ >>> # Basic search
339
+ >>> results = client.serp_search("python tutorial")
340
+ >>>
341
+ >>> # With options
342
+ >>> results = client.serp_search(
343
+ ... "laptop reviews",
344
+ ... engine="google",
345
+ ... num=20,
346
+ ... country="us",
347
+ ... search_type="shopping",
348
+ ... device="mobile",
349
+ ... render_js=True,
350
+ ... no_cache=True,
351
+ ... )
104
352
  """
105
- # Handle Enum or String input for engine
353
+ # Normalize engine
106
354
  engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
107
355
 
108
- # Normalize parameters via internal helper
109
- payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
356
+ # Build request using model
357
+ request = SerpRequest(
358
+ query=query,
359
+ engine=engine_str,
360
+ num=num,
361
+ country=country,
362
+ language=language,
363
+ search_type=search_type,
364
+ device=device,
365
+ render_js=render_js,
366
+ no_cache=no_cache,
367
+ output_format=output_format,
368
+ extra_params=kwargs,
369
+ )
110
370
 
111
- headers = {
112
- "Authorization": f"Bearer {self.scraper_token}",
113
- "Content-Type": "application/x-www-form-urlencoded"
114
- }
371
+ payload = request.to_payload()
372
+ headers = build_auth_headers(self.scraper_token)
115
373
 
116
374
  logger.info(f"SERP Search: {engine_str} - {query}")
375
+
117
376
  try:
118
- response = self.session.post(
119
- self.SERP_API_URL,
377
+ response = self._api_session.post(
378
+ self._serp_url,
120
379
  data=payload,
121
380
  headers=headers,
122
- timeout=60
381
+ timeout=60,
123
382
  )
124
383
  response.raise_for_status()
125
-
126
- data = response.json()
127
- # Handle cases where the API returns a stringified JSON
128
- if isinstance(data, str):
129
- try:
130
- data = json.loads(data)
131
- except json.JSONDecodeError:
132
- pass
133
- return data
134
- except Exception as e:
135
- logger.error(f"SERP Request Failed: {e}")
136
- raise
384
+
385
+ # JSON mode (default)
386
+ if output_format.lower() == "json":
387
+ data = response.json()
388
+
389
+ if isinstance(data, dict):
390
+ code = data.get("code")
391
+ if code is not None and code != 200:
392
+ msg = extract_error_message(data)
393
+ raise_for_code(
394
+ f"SERP API Error: {msg}",
395
+ code=code,
396
+ payload=data,
397
+ )
398
+
399
+ return parse_json_response(data)
400
+
401
+ # HTML mode: wrap as dict to keep return type stable
402
+ return {"html": response.text}
403
+
404
+ except requests.Timeout as e:
405
+ raise ThordataTimeoutError(
406
+ f"SERP request timed out: {e}",
407
+ original_error=e,
408
+ )
409
+ except requests.RequestException as e:
410
+ raise ThordataNetworkError(
411
+ f"SERP request failed: {e}",
412
+ original_error=e,
413
+ )
414
+
415
+ def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
416
+ """
417
+ Execute a SERP search using a SerpRequest object.
418
+
419
+ This method provides full control over all search parameters.
420
+
421
+ Args:
422
+ request: A SerpRequest object with all parameters configured.
423
+
424
+ Returns:
425
+ Dict[str, Any]: Parsed JSON results or dict with 'html' key.
426
+
427
+ Example:
428
+ >>> from thordata.models import SerpRequest
429
+ >>> request = SerpRequest(
430
+ ... query="python programming",
431
+ ... engine="google",
432
+ ... num=50,
433
+ ... country="us",
434
+ ... language="en",
435
+ ... search_type="news",
436
+ ... time_filter="week",
437
+ ... safe_search=True
438
+ ... )
439
+ >>> results = client.serp_search_advanced(request)
440
+ """
441
+ payload = request.to_payload()
442
+ headers = build_auth_headers(self.scraper_token)
443
+
444
+ logger.info(f"SERP Advanced Search: {request.engine} - {request.query}")
445
+
446
+ try:
447
+ response = self._api_session.post(
448
+ self._serp_url,
449
+ data=payload,
450
+ headers=headers,
451
+ timeout=60,
452
+ )
453
+ response.raise_for_status()
454
+
455
+ if request.output_format.lower() == "json":
456
+ data = response.json()
457
+
458
+ if isinstance(data, dict):
459
+ code = data.get("code")
460
+ if code is not None and code != 200:
461
+ msg = extract_error_message(data)
462
+ raise_for_code(
463
+ f"SERP API Error: {msg}",
464
+ code=code,
465
+ payload=data,
466
+ )
467
+
468
+ return parse_json_response(data)
469
+
470
+ return {"html": response.text}
471
+
472
+ except requests.Timeout as e:
473
+ raise ThordataTimeoutError(
474
+ f"SERP request timed out: {e}",
475
+ original_error=e,
476
+ )
477
+ except requests.RequestException as e:
478
+ raise ThordataNetworkError(
479
+ f"SERP request failed: {e}",
480
+ original_error=e,
481
+ )
482
+
483
+ # =========================================================================
484
+ # Universal Scraping API (Web Unlocker) Methods
485
+ # =========================================================================
137
486
 
138
487
  def universal_scrape(
139
488
  self,
140
489
  url: str,
490
+ *,
141
491
  js_render: bool = False,
142
- output_format: str = "HTML",
492
+ output_format: str = "html",
143
493
  country: Optional[str] = None,
144
- block_resources: bool = False
494
+ block_resources: Optional[str] = None,
495
+ wait: Optional[int] = None,
496
+ wait_for: Optional[str] = None,
497
+ **kwargs: Any,
145
498
  ) -> Union[str, bytes]:
146
499
  """
147
- Unlock target pages via the Universal Scraping API.
148
- Bypasses Cloudflare, CAPTCHAs, and antibot systems automatically.
500
+ Scrape a URL using the Universal Scraping API (Web Unlocker).
501
+
502
+ Automatically bypasses Cloudflare, CAPTCHAs, and antibot systems.
149
503
 
150
504
  Args:
151
- url (str): Target URL.
152
- js_render (bool): Whether to render JavaScript (Headless Browser).
153
- output_format (str): "HTML" or "PNG" (screenshot).
154
- country (Optional[str]): Geo-targeting country code (e.g., 'us').
155
- block_resources (bool): Block images/css to speed up loading.
505
+ url: Target URL.
506
+ js_render: Enable JavaScript rendering (headless browser).
507
+ output_format: "html" or "png" (screenshot).
508
+ country: Geo-targeting country code.
509
+ block_resources: Resources to block (e.g., 'script,image').
510
+ wait: Wait time in milliseconds after page load.
511
+ wait_for: CSS selector to wait for.
512
+ **kwargs: Additional parameters.
156
513
 
157
514
  Returns:
158
- Union[str, bytes]: HTML string or PNG bytes.
515
+ HTML string or PNG bytes depending on output_format.
516
+
517
+ Example:
518
+ >>> # Get HTML
519
+ >>> html = client.universal_scrape("https://example.com", js_render=True)
520
+ >>>
521
+ >>> # Get screenshot
522
+ >>> png = client.universal_scrape(
523
+ ... "https://example.com",
524
+ ... js_render=True,
525
+ ... output_format="png"
526
+ ... )
527
+ >>> with open("screenshot.png", "wb") as f:
528
+ ... f.write(png)
159
529
  """
160
- headers = {
161
- "Authorization": f"Bearer {self.scraper_token}",
162
- "Content-Type": "application/x-www-form-urlencoded"
163
- }
530
+ request = UniversalScrapeRequest(
531
+ url=url,
532
+ js_render=js_render,
533
+ output_format=output_format,
534
+ country=country,
535
+ block_resources=block_resources,
536
+ wait=wait,
537
+ wait_for=wait_for,
538
+ extra_params=kwargs,
539
+ )
164
540
 
165
- payload = {
166
- "url": url,
167
- "js_render": "True" if js_render else "False",
168
- "type": output_format.lower(),
169
- "block_resources": "True" if block_resources else "False"
170
- }
171
- if country:
172
- payload["country"] = country
541
+ return self.universal_scrape_advanced(request)
173
542
 
174
- logger.info(f"Universal Scrape: {url} (Format: {output_format})")
543
+ def universal_scrape_advanced(
544
+ self, request: UniversalScrapeRequest
545
+ ) -> Union[str, bytes]:
546
+ """
547
+ Scrape using a UniversalScrapeRequest object for full control.
548
+
549
+ Args:
550
+ request: A UniversalScrapeRequest with all parameters.
551
+
552
+ Returns:
553
+ HTML string or PNG bytes.
554
+ """
555
+ payload = request.to_payload()
556
+ headers = build_auth_headers(self.scraper_token)
557
+
558
+ logger.info(
559
+ f"Universal Scrape: {request.url} (format: {request.output_format})"
560
+ )
175
561
 
176
562
  try:
177
- response = self.session.post(
178
- self.UNIVERSAL_API_URL,
563
+ response = self._api_session.post(
564
+ self._universal_url,
179
565
  data=payload,
180
566
  headers=headers,
181
- timeout=60
567
+ timeout=60,
182
568
  )
183
569
  response.raise_for_status()
184
570
 
185
- # Attempt to parse JSON wrapper
186
- try:
187
- resp_json = response.json()
188
- except json.JSONDecodeError:
189
- # Fallback: if the API returns raw content directly
190
- if output_format.upper() == "PNG":
191
- return response.content
192
- return response.text
193
-
194
- # Check for API-level errors inside the JSON
195
- if isinstance(resp_json, dict) and resp_json.get("code") \
196
- and resp_json.get("code") != 200:
197
- raise Exception(f"Universal API Error: {resp_json}")
198
-
199
- # Case 1: Return HTML
200
- if "html" in resp_json:
201
- return resp_json["html"]
202
-
203
- # Case 2: Return PNG Image
204
- if "png" in resp_json:
205
- png_str = resp_json["png"]
206
- if not png_str:
207
- raise Exception("API returned empty PNG data")
208
-
209
- # Clean Data URI Scheme if present (e.g., data:image/png;base64,...)
210
- if "," in png_str:
211
- png_str = png_str.split(",", 1)[1]
212
-
213
- # Fix Base64 Padding
214
- png_str = png_str.replace("\n", "").replace("\r", "")
215
- missing_padding = len(png_str) % 4
216
- if missing_padding:
217
- png_str += '=' * (4 - missing_padding)
218
-
219
- return base64.b64decode(png_str)
220
-
221
- # Fallback
222
- return str(resp_json)
571
+ return self._process_universal_response(response, request.output_format)
223
572
 
224
- except Exception as e:
225
- logger.error(f"Universal Scrape Failed: {e}")
226
- raise
573
+ except requests.Timeout as e:
574
+ raise ThordataTimeoutError(
575
+ f"Universal scrape timed out: {e}", original_error=e
576
+ )
577
+ except requests.RequestException as e:
578
+ raise ThordataNetworkError(
579
+ f"Universal scrape failed: {e}", original_error=e
580
+ )
581
+
582
+ def _process_universal_response(
583
+ self, response: requests.Response, output_format: str
584
+ ) -> Union[str, bytes]:
585
+ """Process the response from Universal API."""
586
+ # Try to parse as JSON
587
+ try:
588
+ resp_json = response.json()
589
+ except ValueError:
590
+ # Raw content returned
591
+ if output_format.lower() == "png":
592
+ return response.content
593
+ return response.text
594
+
595
+ # Check for API-level errors
596
+ if isinstance(resp_json, dict):
597
+ code = resp_json.get("code")
598
+ if code is not None and code != 200:
599
+ msg = extract_error_message(resp_json)
600
+ raise_for_code(
601
+ f"Universal API Error: {msg}", code=code, payload=resp_json
602
+ )
603
+
604
+ # Extract HTML
605
+ if "html" in resp_json:
606
+ return resp_json["html"]
607
+
608
+ # Extract PNG
609
+ if "png" in resp_json:
610
+ return decode_base64_image(resp_json["png"])
611
+
612
+ # Fallback
613
+ return str(resp_json)
614
+
615
+ # =========================================================================
616
+ # Web Scraper API (Task-based) Methods
617
+ # =========================================================================
227
618
 
228
619
  def create_scraper_task(
229
620
  self,
230
621
  file_name: str,
231
622
  spider_id: str,
232
623
  spider_name: str,
233
- individual_params: Dict[str, Any],
234
- universal_params: Optional[Dict[str, Any]] = None
624
+ parameters: Dict[str, Any],
625
+ universal_params: Optional[Dict[str, Any]] = None,
235
626
  ) -> str:
236
627
  """
237
- Create a generic Web Scraper Task (Async).
238
-
239
- IMPORTANT: You must retrieve the correct 'spider_id' and 'spider_name'
240
- from the Thordata Dashboard before calling this method.
628
+ Create an asynchronous Web Scraper task.
629
+
630
+ Note: Get spider_id and spider_name from the Thordata Dashboard.
241
631
 
242
632
  Args:
243
- file_name (str): Name for the output file.
244
- spider_id (str): The ID of the spider (from Dashboard).
245
- spider_name (str): The name of the spider (e.g., "youtube.com").
246
- individual_params (Dict): Parameters specific to the spider.
247
- universal_params (Optional[Dict]): Global settings for the scraper.
633
+ file_name: Name for the output file.
634
+ spider_id: Spider identifier from Dashboard.
635
+ spider_name: Spider name (e.g., "youtube.com").
636
+ parameters: Spider-specific parameters.
637
+ universal_params: Global spider settings.
248
638
 
249
639
  Returns:
250
- str: The created task_id.
640
+ The created task_id.
641
+
642
+ Example:
643
+ >>> task_id = client.create_scraper_task(
644
+ ... file_name="youtube_data",
645
+ ... spider_id="youtube_video-post_by-url",
646
+ ... spider_name="youtube.com",
647
+ ... parameters={"url": "https://youtube.com/@channel/videos"}
648
+ ... )
251
649
  """
252
- headers = {
253
- "Authorization": f"Bearer {self.scraper_token}",
254
- "Content-Type": "application/x-www-form-urlencoded"
255
- }
650
+ config = ScraperTaskConfig(
651
+ file_name=file_name,
652
+ spider_id=spider_id,
653
+ spider_name=spider_name,
654
+ parameters=parameters,
655
+ universal_params=universal_params,
656
+ )
256
657
 
257
- # Payload construction
258
- payload = {
259
- "spider_name": spider_name,
260
- "spider_id": spider_id,
261
- "spider_parameters": json.dumps([individual_params]),
262
- "spider_errors": "true",
263
- "file_name": file_name
264
- }
265
- if universal_params:
266
- payload["spider_universal"] = json.dumps(universal_params)
658
+ return self.create_scraper_task_advanced(config)
659
+
660
+ def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
661
+ """
662
+ Create a scraper task using a ScraperTaskConfig object.
663
+
664
+ Args:
665
+ config: Task configuration.
666
+
667
+ Returns:
668
+ The created task_id.
669
+ """
670
+ payload = config.to_payload()
671
+ headers = build_auth_headers(self.scraper_token)
672
+
673
+ logger.info(f"Creating Scraper Task: {config.spider_name}")
267
674
 
268
- logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
269
675
  try:
270
- response = self.session.post(
271
- self.SCRAPER_BUILDER_URL,
676
+ response = self._api_session.post(
677
+ self._builder_url,
272
678
  data=payload,
273
- headers=headers
679
+ headers=headers,
680
+ timeout=30,
274
681
  )
275
682
  response.raise_for_status()
683
+
276
684
  data = response.json()
685
+ code = data.get("code")
686
+
687
+ if code != 200:
688
+ msg = extract_error_message(data)
689
+ raise_for_code(f"Task creation failed: {msg}", code=code, payload=data)
277
690
 
278
- if data.get("code") != 200:
279
- raise Exception(f"Creation failed: {data}")
280
691
  return data["data"]["task_id"]
281
- except Exception as e:
282
- logger.error(f"Task Creation Failed: {e}")
283
- raise
692
+
693
+ except requests.RequestException as e:
694
+ raise ThordataNetworkError(f"Task creation failed: {e}", original_error=e)
284
695
 
285
696
  def get_task_status(self, task_id: str) -> str:
286
697
  """
287
698
  Check the status of an asynchronous scraping task.
288
699
 
289
700
  Args:
290
- task_id (str): The ID returned by create_scraper_task.
701
+ task_id: The task ID from create_scraper_task.
291
702
 
292
703
  Returns:
293
- str: The status string (e.g., "finished", "running", "error").
704
+ Status string (e.g., "running", "ready", "failed").
294
705
  """
295
- headers = {
296
- "token": self.public_token,
297
- "key": self.public_key,
298
- "Content-Type": "application/x-www-form-urlencoded"
299
- }
706
+ self._require_public_credentials()
707
+
708
+ headers = build_public_api_headers(
709
+ self.public_token or "", self.public_key or ""
710
+ )
300
711
  payload = {"tasks_ids": task_id}
301
712
 
302
713
  try:
303
- response = self.session.post(
304
- self.SCRAPER_STATUS_URL,
714
+ response = self._api_session.post(
715
+ self._status_url,
305
716
  data=payload,
306
- headers=headers
717
+ headers=headers,
718
+ timeout=30,
307
719
  )
308
720
  response.raise_for_status()
721
+
309
722
  data = response.json()
310
723
 
311
724
  if data.get("code") == 200 and data.get("data"):
312
725
  for item in data["data"]:
313
726
  if str(item.get("task_id")) == str(task_id):
314
- return item["status"]
315
- return "Unknown"
727
+ return item.get("status", "unknown")
728
+
729
+ return "unknown"
730
+
316
731
  except Exception as e:
317
- logger.error(f"Status Check Failed: {e}")
318
- return "Error"
732
+ logger.error(f"Status check failed: {e}")
733
+ return "error"
319
734
 
320
735
  def get_task_result(self, task_id: str, file_type: str = "json") -> str:
321
736
  """
322
- Retrieve the download URL for a completed task.
323
-
324
- Args:
325
- task_id (str): The task ID.
326
- file_type (str): Format required (default "json").
327
-
328
- Returns:
329
- str: The URL to download the result file.
737
+ Get the download URL for a completed task.
330
738
  """
331
- headers = {
332
- "token": self.public_token,
333
- "key": self.public_key,
334
- "Content-Type": "application/x-www-form-urlencoded"
335
- }
739
+ self._require_public_credentials()
740
+
741
+ headers = build_public_api_headers(
742
+ self.public_token or "", self.public_key or ""
743
+ )
336
744
  payload = {"tasks_id": task_id, "type": file_type}
337
745
 
338
746
  logger.info(f"Getting result URL for Task: {task_id}")
747
+
339
748
  try:
340
- response = self.session.post(
341
- self.SCRAPER_DOWNLOAD_URL,
749
+ response = self._api_session.post(
750
+ self._download_url,
342
751
  data=payload,
343
- headers=headers
752
+ headers=headers,
753
+ timeout=30,
344
754
  )
345
755
  response.raise_for_status()
756
+
346
757
  data = response.json()
758
+ code = data.get("code")
347
759
 
348
- if data.get("code") == 200 and data.get("data"):
760
+ if code == 200 and data.get("data"):
349
761
  return data["data"]["download"]
350
- raise Exception(f"API returned error: {data}")
351
- except Exception as e:
352
- logger.error(f"Get Result Failed: {e}")
353
- raise
354
-
355
- def _get_locations(self, endpoint: str, params: Dict[str, str]) -> List[Dict[str, Any]]:
762
+
763
+ msg = extract_error_message(data)
764
+ raise_for_code(f"Get result failed: {msg}", code=code, payload=data)
765
+ # This line won't be reached, but satisfies mypy
766
+ raise RuntimeError("Unexpected state")
767
+
768
+ except requests.RequestException as e:
769
+ raise ThordataNetworkError(f"Get result failed: {e}", original_error=e)
770
+
771
+ def wait_for_task(
772
+ self,
773
+ task_id: str,
774
+ *,
775
+ poll_interval: float = 5.0,
776
+ max_wait: float = 600.0,
777
+ ) -> str:
356
778
  """
357
- Internal helper to call the public locations API.
779
+ Wait for a task to complete.
358
780
 
359
781
  Args:
360
- endpoint: One of 'countries', 'states', 'cities', 'asn'.
361
- params: Query parameters (must include token, key, proxy_type, etc.)
782
+ task_id: The task ID to wait for.
783
+ poll_interval: Seconds between status checks.
784
+ max_wait: Maximum seconds to wait.
362
785
 
363
786
  Returns:
364
- List of location records from the 'data' field.
787
+ Final task status.
365
788
 
366
789
  Raises:
367
- RuntimeError: If token/key are missing or API returns an error code.
790
+ TimeoutError: If max_wait is exceeded.
791
+
792
+ Example:
793
+ >>> task_id = client.create_scraper_task(...)
794
+ >>> status = client.wait_for_task(task_id, max_wait=300)
795
+ >>> if status in ("ready", "success"):
796
+ ... url = client.get_task_result(task_id)
368
797
  """
369
- if not self.public_token or not self.public_key:
370
- raise RuntimeError(
371
- "Public API token/key are required for locations endpoints. "
372
- "Please provide 'public_token' and 'public_key' when "
373
- "initializing ThordataClient."
374
- )
798
+ import time
375
799
 
376
- url = f"{self.locations_url}/{endpoint}"
377
- logger.info("Locations API request: %s", url)
800
+ elapsed = 0.0
378
801
 
379
- # Use a direct requests.get here; no need to go through the proxy gateway.
380
- response = requests.get(
381
- url,
382
- params=params,
383
- timeout=30,
384
- )
385
- response.raise_for_status()
802
+ while elapsed < max_wait:
803
+ status = self.get_task_status(task_id)
386
804
 
387
- data = response.json()
388
- if isinstance(data, dict):
389
- code = data.get("code")
390
- if code is not None and code != 200:
391
- msg = data.get("msg", "")
392
- raise RuntimeError(
393
- f"Locations API error ({endpoint}): code={code}, msg={msg}"
394
- )
395
- return data.get("data") or []
396
- # Fallback: if backend ever returns a list directly
397
- if isinstance(data, list):
398
- return data
399
- return []
400
-
401
- def list_countries(self, proxy_type: int = 1) -> List[Dict[str, Any]]:
805
+ logger.debug(f"Task {task_id} status: {status}")
806
+
807
+ terminal_statuses = {
808
+ "ready",
809
+ "success",
810
+ "finished",
811
+ "failed",
812
+ "error",
813
+ "cancelled",
814
+ }
815
+
816
+ if status.lower() in terminal_statuses:
817
+ return status
818
+
819
+ time.sleep(poll_interval)
820
+ elapsed += poll_interval
821
+
822
+ raise TimeoutError(f"Task {task_id} did not complete within {max_wait} seconds")
823
+
824
+ # =========================================================================
825
+ # Location API Methods
826
+ # =========================================================================
827
+
828
+ def list_countries(
829
+ self, proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
830
+ ) -> List[Dict[str, Any]]:
402
831
  """
403
- List supported countries for Thordata residential or unlimited proxies.
832
+ List supported countries for proxies.
404
833
 
405
834
  Args:
406
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
835
+ proxy_type: 1 for residential, 2 for unlimited.
407
836
 
408
837
  Returns:
409
- List[Dict[str, Any]]: Each record contains 'country_code' and 'country_name'.
838
+ List of country records with 'country_code' and 'country_name'.
410
839
  """
411
- params = {
412
- "token": self.public_token,
413
- "key": self.public_key,
414
- "proxy_type": str(proxy_type),
415
- }
416
- return self._get_locations("countries", params)
840
+ return self._get_locations(
841
+ "countries",
842
+ proxy_type=(
843
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
844
+ ),
845
+ )
417
846
 
418
- def list_states(self, country_code: str, proxy_type: int = 1) -> List[Dict[str, Any]]:
847
+ def list_states(
848
+ self,
849
+ country_code: str,
850
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
851
+ ) -> List[Dict[str, Any]]:
419
852
  """
420
- List supported states for a given country.
853
+ List supported states for a country.
421
854
 
422
855
  Args:
423
- country_code (str): Country code (e.g., 'US').
424
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
856
+ country_code: Country code (e.g., 'US').
857
+ proxy_type: Proxy type.
425
858
 
426
859
  Returns:
427
- List[Dict[str, Any]]: Each record contains 'state_code' and 'state_name'.
860
+ List of state records.
428
861
  """
429
- params = {
430
- "token": self.public_token,
431
- "key": self.public_key,
432
- "proxy_type": str(proxy_type),
433
- "country_code": country_code,
434
- }
435
- return self._get_locations("states", params)
862
+ return self._get_locations(
863
+ "states",
864
+ proxy_type=(
865
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
866
+ ),
867
+ country_code=country_code,
868
+ )
436
869
 
437
870
  def list_cities(
438
871
  self,
439
872
  country_code: str,
440
873
  state_code: Optional[str] = None,
441
- proxy_type: int = 1,
874
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
442
875
  ) -> List[Dict[str, Any]]:
443
876
  """
444
- List supported cities for a given country (and optional state).
877
+ List supported cities for a country/state.
445
878
 
446
879
  Args:
447
- country_code (str): Country code (e.g., 'US').
448
- state_code (Optional[str]): State code (e.g., 'alabama'), if applicable.
449
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
880
+ country_code: Country code.
881
+ state_code: Optional state code.
882
+ proxy_type: Proxy type.
450
883
 
451
884
  Returns:
452
- List[Dict[str, Any]]: Each record contains 'city_code' and 'city_name'.
885
+ List of city records.
453
886
  """
454
- params: Dict[str, str] = {
455
- "token": self.public_token,
456
- "key": self.public_key,
457
- "proxy_type": str(proxy_type),
887
+ kwargs = {
888
+ "proxy_type": (
889
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
890
+ ),
458
891
  "country_code": country_code,
459
892
  }
460
893
  if state_code:
461
- params["state_code"] = state_code
894
+ kwargs["state_code"] = state_code
462
895
 
463
- return self._get_locations("cities", params)
896
+ return self._get_locations("cities", **kwargs)
464
897
 
465
898
  def list_asn(
466
899
  self,
467
900
  country_code: str,
468
- proxy_type: int = 1,
901
+ proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
469
902
  ) -> List[Dict[str, Any]]:
470
903
  """
471
- List supported ASNs for a given country.
904
+ List supported ASNs for a country.
472
905
 
473
906
  Args:
474
- country_code (str): Country code (e.g., 'US').
475
- proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
907
+ country_code: Country code.
908
+ proxy_type: Proxy type.
476
909
 
477
910
  Returns:
478
- List[Dict[str, Any]]: Each record contains 'asn_code' and 'asn_name'.
911
+ List of ASN records.
479
912
  """
913
+ return self._get_locations(
914
+ "asn",
915
+ proxy_type=(
916
+ int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
917
+ ),
918
+ country_code=country_code,
919
+ )
920
+
921
+ def _get_locations(self, endpoint: str, **kwargs: Any) -> List[Dict[str, Any]]:
922
+ """Internal method to call locations API."""
923
+ self._require_public_credentials()
924
+
480
925
  params = {
481
926
  "token": self.public_token,
482
927
  "key": self.public_key,
483
- "proxy_type": str(proxy_type),
484
- "country_code": country_code,
485
928
  }
486
- return self._get_locations("asn", params)
929
+
930
+ for key, value in kwargs.items():
931
+ params[key] = str(value)
932
+
933
+ url = f"{self._locations_base_url}/{endpoint}"
934
+
935
+ logger.debug(f"Locations API request: {url}")
936
+
937
+ # Use requests.get directly (no proxy needed for this API)
938
+ response = self._api_session.get(url, params=params, timeout=30)
939
+ response.raise_for_status()
940
+
941
+ data = response.json()
942
+
943
+ if isinstance(data, dict):
944
+ code = data.get("code")
945
+ if code is not None and code != 200:
946
+ msg = data.get("msg", "")
947
+ raise RuntimeError(
948
+ f"Locations API error ({endpoint}): code={code}, msg={msg}"
949
+ )
950
+ return data.get("data") or []
951
+
952
+ if isinstance(data, list):
953
+ return data
954
+
955
+ return []
956
+
957
+ # =========================================================================
958
+ # Helper Methods
959
+ # =========================================================================
960
+
961
+ def _require_public_credentials(self) -> None:
962
+ """Ensure public API credentials are available."""
963
+ if not self.public_token or not self.public_key:
964
+ raise ThordataConfigError(
965
+ "public_token and public_key are required for this operation. "
966
+ "Please provide them when initializing ThordataClient."
967
+ )
968
+
969
+ def _request_with_retry(
970
+ self, method: str, url: str, **kwargs: Any
971
+ ) -> requests.Response:
972
+ """Make a request with automatic retry."""
973
+ kwargs.setdefault("timeout", self._default_timeout)
974
+
975
+ @with_retry(self._retry_config)
976
+ def _do_request() -> requests.Response:
977
+ return self._proxy_session.request(method, url, **kwargs)
978
+
979
+ try:
980
+ return _do_request()
981
+ except requests.Timeout as e:
982
+ raise ThordataTimeoutError(f"Request timed out: {e}", original_error=e)
983
+ except requests.RequestException as e:
984
+ raise ThordataNetworkError(f"Request failed: {e}", original_error=e)
985
+
986
+ def close(self) -> None:
987
+ """Close the underlying session."""
988
+ self._proxy_session.close()
989
+ self._api_session.close()
990
+
991
+ def __enter__(self) -> ThordataClient:
992
+ return self
993
+
994
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
995
+ self.close()
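
Taken together, the 0.5.0 additions above change how a typical integration looks. The following is a minimal end-to-end sketch assembled from the docstring examples shown in this diff; the tokens, proxy credentials, spider_id, and target URLs are placeholders rather than values shipped with the SDK, and error handling is omitted for brevity.

from thordata import ThordataClient
from thordata.models import ProxyConfig

# Placeholder credentials -- substitute real values from the Thordata dashboard.
with ThordataClient(
    scraper_token="YOUR_SCRAPER_TOKEN",
    public_token="YOUR_PUBLIC_TOKEN",
    public_key="YOUR_PUBLIC_KEY",
) as client:
    # 1. Proxy network: a plain request, then a geo-targeted one via ProxyConfig.
    print(client.get("https://httpbin.org/ip").json())

    geo = ProxyConfig(
        username="myuser",      # proxy sub-account credentials (placeholders)
        password="mypass",
        country="us",
        city="seattle",
    )
    print(client.get("https://httpbin.org/ip", proxy_config=geo).json())

    # 2. SERP API: parsed JSON results by default.
    results = client.serp_search("python tutorial", engine="google", num=10, country="us")

    # 3. Universal Scraping API: JavaScript-rendered screenshot as PNG bytes.
    png = client.universal_scrape(
        "https://example.com", js_render=True, output_format="png"
    )
    with open("screenshot.png", "wb") as f:
        f.write(png)

    # 4. Web Scraper API: create an async task, poll until it reaches a
    #    terminal status, then fetch the download URL for the result file.
    task_id = client.create_scraper_task(
        file_name="youtube_data",
        spider_id="youtube_video-post_by-url",   # from the Dashboard
        spider_name="youtube.com",
        parameters={"url": "https://youtube.com/@channel/videos"},
    )
    status = client.wait_for_task(task_id, max_wait=300)
    if status.lower() in ("ready", "success", "finished"):
        print(client.get_task_result(task_id))

The context manager form relies on the close()/__enter__/__exit__ support added in this release, so both underlying sessions are released when the block exits.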