thordata-sdk 0.2.2__tar.gz → 0.2.3__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the registry.
- {thordata_sdk-0.2.2/thordata_sdk.egg-info → thordata_sdk-0.2.3}/PKG-INFO +1 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/setup.py +1 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/__init__.py +1 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/async_client.py +66 -32
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/client.py +80 -13
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3/thordata_sdk.egg-info}/PKG-INFO +1 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/LICENSE +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/README.md +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/setup.cfg +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/tests/test_async_client.py +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/tests/test_client.py +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk.egg-info/SOURCES.txt +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk.egg-info/dependency_links.txt +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk.egg-info/requires.txt +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk.egg-info/top_level.txt +0 -0
{thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='thordata_sdk',
-    version='0.2.2',
+    version='0.2.3',  # Bump version due to breaking auth changes
     packages=find_packages(include=['thordata_sdk', 'thordata_sdk.*']),
     install_requires=[
         'requests>=2.25.0',  # Standard synchronous HTTP
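The version comment cites breaking auth changes. Elsewhere in this diff, proxy requests authenticate with the scraper token as the proxy username and an empty password, while the new Universal API uses a Bearer header. A minimal sketch of both shapes, with a placeholder token:

# Placeholder token for illustration only.
scraper_token = "YOUR_SCRAPER_TOKEN"

# Proxy auth (as in the client.py hunk below): token as username, empty password.
proxy_url = f"http://{scraper_token}:@gate.thordata.com:22225"

# Universal API auth (as in both universal_scrape additions): Bearer token header.
headers = {"Authorization": f"Bearer {scraper_token}"}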
{thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/async_client.py

@@ -1,7 +1,8 @@
 import aiohttp
 import logging
 import json
-
+import base64
+from typing import Optional, Dict, Any, Union
 
 logger = logging.getLogger(__name__)
 
@@ -9,11 +10,6 @@ logger = logging.getLogger(__name__)
 class AsyncThordataClient:
     """
     Thordata Asynchronous Client (built on aiohttp).
-    Designed for high-concurrency and low-latency data collection tasks.
-
-    Usage:
-        async with AsyncThordataClient(...) as client:
-            await client.get("http://example.com")
     """
 
     def __init__(
@@ -24,22 +20,19 @@ class AsyncThordataClient:
         proxy_host: str = "gate.thordata.com",
         proxy_port: int = 22225
     ):
-        """
-        Initialize the asynchronous client.
-        """
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
 
-        # Proxy Authentication
         self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
         self.proxy_url = f"http://{proxy_host}:{proxy_port}"
 
-        # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
+        self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
 
         self.SERP_API_URL = f"{self.base_url}/request"
+        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
@@ -55,21 +48,14 @@ class AsyncThordataClient:
         await self.close()
 
     async def close(self):
-        """Close the underlying aiohttp session."""
         if self._session and not self._session.closed:
             await self._session.close()
             self._session = None
 
-    # --- Proxy
-
+    # --- Proxy ---
     async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-        """
-        Send an asynchronous GET request through the Thordata Proxy.
-        """
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
-
-        logger.debug(f"Async Proxy Request: {url}")
         try:
             return await self._session.get(
                 url,
@@ -81,21 +67,16 @@ class AsyncThordataClient:
             logger.error(f"Async Request failed: {e}")
             raise
 
-    # --- SERP
-
+    # --- SERP ---
     async def serp_search(
         self, query: str, engine: str = "google", num: int = 10, **kwargs
     ) -> Dict[str, Any]:
-        """Async SERP search."""
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
 
         payload = {
-            "q": query,
-            "num": str(num),
-            "json": "1",
-            "engine": engine.lower(),
-            **kwargs
+            "q": query, "num": str(num), "json": "1",
+            "engine": engine.lower(), **kwargs
         }
         if engine.lower() == 'yandex':
             payload['text'] = payload.pop('q')
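As a quick illustration of the consolidated payload logic above, including the Yandex rename of "q" to "text" (values are made up):

# Sketch of the payload construction shown in the hunk above.
query, num, engine = "thordata", 10, "yandex"
payload = {
    "q": query, "num": str(num), "json": "1",
    "engine": engine.lower(),
}
if engine.lower() == "yandex":
    # Yandex expects the query under "text" rather than "q".
    payload["text"] = payload.pop("q")
assert payload["text"] == "thordata"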
@@ -117,7 +98,6 @@ class AsyncThordataClient:
         ) as response:
             response.raise_for_status()
             data = await response.json()
-            # Handle double-encoding
             if isinstance(data, str):
                 try:
                     data = json.loads(data)
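The isinstance(data, str) guard exists because the API can return double-encoded JSON: a JSON string whose content is itself JSON. A self-contained demonstration of the recovery step:

import json

# A double-encoded response: the JSON body is a string containing JSON.
raw = json.dumps(json.dumps({"results": []}))
data = json.loads(raw)           # First decode yields a str, not a dict.
if isinstance(data, str):
    try:
        data = json.loads(data)  # Second decode recovers the dict.
    except json.JSONDecodeError:
        pass
assert data == {"results": []}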
@@ -125,8 +105,65 @@ class AsyncThordataClient:
                     pass
             return data
 
-    # ---
+    # --- Universal ---
+    async def universal_scrape(
+        self,
+        url: str,
+        js_render: bool = False,
+        output_format: str = "HTML",
+        country: str = None,
+        block_resources: bool = False
+    ) -> Union[str, bytes]:
+        if self._session is None:
+            raise RuntimeError("Client session not initialized.")
+
+        headers = {
+            "Authorization": f"Bearer {self.scraper_token}",
+            "Content-Type": "application/x-www-form-urlencoded"
+        }
+
+        payload = {
+            "url": url,
+            "js_render": "True" if js_render else "False",
+            "type": output_format.lower(),
+            "block_resources": "True" if block_resources else "False"
+        }
+        if country:
+            payload["country"] = country
+
+        async with self._session.post(
+            self.UNIVERSAL_API_URL, data=payload, headers=headers
+        ) as response:
+            response.raise_for_status()
+
+            try:
+                resp_json = await response.json()
+            except Exception:
+                if output_format.upper() == "PNG":
+                    return await response.read()
+                return await response.text()
+
+            if isinstance(resp_json, dict) and resp_json.get("code") \
+                    and resp_json.get("code") != 200:
+                raise Exception(f"Universal API Error: {resp_json}")
+
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            if "png" in resp_json:
+                png_str = resp_json["png"]
+                if not png_str:
+                    raise Exception("API returned empty PNG data")
+
+                png_str = png_str.replace("\n", "").replace("\r", "")
+                missing_padding = len(png_str) % 4
+                if missing_padding:
+                    png_str += '=' * (4 - missing_padding)
+                return base64.b64decode(png_str)
+
+            return str(resp_json)
 
+    # --- Web Scraper ---
     async def create_scraper_task(
         self,
         file_name: str,
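Given the context-manager support retained in this file and the new method above, async usage would look roughly like the following sketch. The import path assumes a top-level export, and the credential values are placeholders:

import asyncio

from thordata_sdk import AsyncThordataClient  # assumed top-level export

async def main():
    # Placeholder credentials; the constructor fields match the diff above.
    async with AsyncThordataClient(
        scraper_token="YOUR_SCRAPER_TOKEN",
        public_token="YOUR_PUBLIC_TOKEN",
        public_key="YOUR_PUBLIC_KEY",
    ) as client:
        # HTML render of a page, with JavaScript execution enabled.
        html = await client.universal_scrape(
            "https://example.com", js_render=True, output_format="HTML"
        )
        print(html[:200])

asyncio.run(main())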
@@ -135,7 +172,6 @@ class AsyncThordataClient:
         spider_name: str = "youtube.com",
         universal_params: Dict[str, Any] = None
     ) -> str:
-        """Create an async scraping task."""
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
 
@@ -164,7 +200,6 @@ class AsyncThordataClient:
             return data["data"]["task_id"]
 
     async def get_task_status(self, task_id: str) -> str:
-        """Check task status."""
         headers = {
             "token": self.public_token,
             "key": self.public_key,
@@ -183,7 +218,6 @@ class AsyncThordataClient:
             return "Unknown"
 
     async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
-        """Get download link."""
         headers = {
             "token": self.public_token,
             "key": self.public_key,
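The three task methods above form a submit-poll-download loop. A hedged sketch of that workflow (the create_scraper_task arguments shown are illustrative and omit any required fields not visible in this diff; status strings come from the sync client's docstring):

import asyncio

async def run_task(client) -> str:
    task_id = await client.create_scraper_task(
        file_name="demo", spider_name="youtube.com"  # illustrative arguments
    )
    status = "Running"
    while status == "Running":
        await asyncio.sleep(5)  # poll interval is a guess, not an SDK default
        status = await client.get_task_status(task_id)
    if status != "Ready":
        raise RuntimeError(f"Task ended with status: {status}")
    return await client.get_task_result(task_id, file_type="json")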
{thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/client.py

@@ -1,7 +1,8 @@
 import requests
 import logging
 import json
-
+import base64
+from typing import Dict, Any, Union
 
 # Configure a library-specific logger
 logger = logging.getLogger(__name__)
@@ -14,7 +15,8 @@ class ThordataClient:
     Handles authentication for:
     1. Proxy Network (HTTP/HTTPS)
     2. SERP API (Real-time Search)
-    3. Web Scraper API (Async Task Management)
+    3. Universal Scraping API (Single Page)
+    4. Web Scraper API (Async Task Management)
     """
 
     def __init__(
@@ -39,16 +41,18 @@ class ThordataClient:
         self.public_token = public_token
         self.public_key = public_key
 
-        # Proxy Configuration
+        # Proxy Configuration
         self.proxy_url = (
            f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
        )
 
         # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
+        self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
 
         self.SERP_API_URL = f"{self.base_url}/request"
+        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
@@ -62,13 +66,6 @@ class ThordataClient:
     def get(self, url: str, **kwargs) -> requests.Response:
         """
         Send a GET request through the Thordata Proxy Network.
-
-        Args:
-            url (str): The target URL.
-            **kwargs: Additional arguments passed to requests.get().
-
-        Returns:
-            requests.Response: The HTTP response.
         """
         logger.debug(f"Proxy Request: {url}")
         kwargs.setdefault("timeout", 30)
@@ -88,7 +85,6 @@ class ThordataClient:
             **kwargs
         }
 
-        # Engine-specific parameter adjustments
         if engine.lower() == 'yandex':
             payload['text'] = payload.pop('q')
             if 'url' not in payload:
@@ -117,7 +113,6 @@ class ThordataClient:
             response.raise_for_status()
             data = response.json()
 
-            # Handle potential double-encoded JSON strings
             if isinstance(data, str):
                 try:
                     data = json.loads(data)
@@ -128,6 +123,79 @@ class ThordataClient:
             logger.error(f"SERP Request Failed: {e}")
             raise
 
+    def universal_scrape(
+        self,
+        url: str,
+        js_render: bool = False,
+        output_format: str = "HTML",
+        country: str = None,
+        block_resources: bool = False
+    ) -> Union[str, bytes]:
+        """
+        Unlock target pages via the Universal Scraping API.
+        """
+        headers = {
+            "Authorization": f"Bearer {self.scraper_token}",
+            "Content-Type": "application/x-www-form-urlencoded"
+        }
+
+        payload = {
+            "url": url,
+            "js_render": "True" if js_render else "False",
+            "type": output_format.lower(),
+            "block_resources": "True" if block_resources else "False"
+        }
+        if country:
+            payload["country"] = country
+
+        logger.info(f"Universal Scrape: {url}")
+
+        try:
+            response = self.session.post(
+                self.UNIVERSAL_API_URL,
+                data=payload,
+                headers=headers,
+                timeout=60
+            )
+            response.raise_for_status()
+
+            # Parse JSON wrapper
+            try:
+                resp_json = response.json()
+            except json.JSONDecodeError:
+                # Fallback for raw response
+                if output_format.upper() == "PNG":
+                    return response.content
+                return response.text
+
+            # Check API errors
+            if isinstance(resp_json, dict) and resp_json.get("code") \
+                    and resp_json.get("code") != 200:
+                raise Exception(f"Universal API Error: {resp_json}")
+
+            # Extract HTML
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            # Extract PNG (Base64 decoding with padding fix)
+            if "png" in resp_json:
+                png_str = resp_json["png"]
+                if not png_str:
+                    raise Exception("API returned empty PNG data")
+
+                png_str = png_str.replace("\n", "").replace("\r", "")
+                missing_padding = len(png_str) % 4
+                if missing_padding:
+                    png_str += '=' * (4 - missing_padding)
+
+                return base64.b64decode(png_str)
+
+            return str(resp_json)
+
+        except Exception as e:
+            logger.error(f"Universal Scrape Failed: {e}")
+            raise
+
     def create_scraper_task(
         self,
         file_name: str,
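The synchronous variant mirrors the async one. Since output_format="PNG" returns base64-decoded bytes (with the padding fix above restoring any '=' characters stripped in transit, so the string length is a multiple of 4 before b64decode), a screenshot can be written straight to disk. A sketch with an assumed top-level import and placeholder credentials:

from thordata_sdk import ThordataClient  # assumed top-level export

# Placeholder credentials for illustration.
client = ThordataClient(
    scraper_token="YOUR_SCRAPER_TOKEN",
    public_token="YOUR_PUBLIC_TOKEN",
    public_key="YOUR_PUBLIC_KEY",
)

# PNG output: universal_scrape returns decoded bytes, per the hunk above.
png_bytes = client.universal_scrape(
    "https://example.com", js_render=True, output_format="PNG"
)
with open("screenshot.png", "wb") as f:
    f.write(png_bytes)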
@@ -174,7 +242,6 @@ class ThordataClient:
     def get_task_status(self, task_id: str) -> str:
         """
         Check the status of a task.
-        Returns: 'Running', 'Ready', 'Failed', or 'Unknown'.
         """
         headers = {
             "token": self.public_token,