thordata-sdk 0.2.4__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +16 -0
- {thordata_sdk → thordata}/async_client.py +67 -33
- thordata/client.py +486 -0
- thordata/enums.py +25 -0
- thordata/parameters.py +52 -0
- thordata_sdk-0.3.1.dist-info/METADATA +200 -0
- thordata_sdk-0.3.1.dist-info/RECORD +10 -0
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-0.3.1.dist-info}/WHEEL +1 -1
- thordata_sdk-0.3.1.dist-info/top_level.txt +1 -0
- thordata_sdk/__init__.py +0 -9
- thordata_sdk/client.py +0 -303
- thordata_sdk/enums.py +0 -20
- thordata_sdk/parameters.py +0 -41
- thordata_sdk-0.2.4.dist-info/METADATA +0 -113
- thordata_sdk-0.2.4.dist-info/RECORD +0 -10
- thordata_sdk-0.2.4.dist-info/top_level.txt +0 -1
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-0.3.1.dist-info/licenses}/LICENSE +0 -0
thordata/__init__.py
ADDED
@@ -0,0 +1,16 @@
+# src/thordata/__init__.py
+
+from .client import ThordataClient
+from .async_client import AsyncThordataClient
+from .enums import Engine, GoogleSearchType
+
+# Package version
+__version__ = "0.3.1"
+
+# Explicitly export classes to simplify user imports
+__all__ = [
+    "ThordataClient",
+    "AsyncThordataClient",
+    "Engine",
+    "GoogleSearchType"
+]
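With the new top-level package, imports move from thordata_sdk to thordata. A minimal usage sketch based on the __all__ and __version__ values above:

from thordata import ThordataClient, AsyncThordataClient, Engine, GoogleSearchType
import thordata

print(thordata.__version__)  # expected: "0.3.1"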
{thordata_sdk → thordata}/async_client.py
RENAMED

@@ -4,7 +4,7 @@ import json
 import base64
 from typing import Optional, Dict, Any, Union
 
-#
+# Import shared logic
 from .enums import Engine
 from .parameters import normalize_serp_params
 
@@ -13,7 +13,8 @@ logger = logging.getLogger(__name__)
 
 class AsyncThordataClient:
     """
-
+    The official Asynchronous Python client for Thordata (built on aiohttp).
+    Designed for high-concurrency AI agents and data pipelines.
    """
 
    def __init__(
@@ -24,13 +25,18 @@ class AsyncThordataClient:
        proxy_host: str = "gate.thordata.com",
        proxy_port: int = 22225
    ):
+        """
+        Initialize the Async Client.
+        """
        self.scraper_token = scraper_token
        self.public_token = public_token
        self.public_key = public_key
 
+        # Pre-calculate proxy auth for performance
        self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
        self.proxy_url = f"http://{proxy_host}:{proxy_port}"
 
+        # API Endpoints
        self.base_url = "https://scraperapi.thordata.com"
        self.universal_url = "https://universalapi.thordata.com"
        self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -41,6 +47,7 @@ class AsyncThordataClient:
        self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
        self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
 
+        # Session is initialized lazily or via context manager
        self._session: Optional[aiohttp.ClientSession] = None
 
    async def __aenter__(self):
@@ -52,16 +59,27 @@ class AsyncThordataClient:
        await self.close()
 
    async def close(self):
+        """Close the underlying aiohttp session."""
        if self._session and not self._session.closed:
            await self._session.close()
            self._session = None
 
-
+    def _get_session(self) -> aiohttp.ClientSession:
+        """Internal helper to ensure session exists."""
+        if self._session is None or self._session.closed:
+            raise RuntimeError(
+                "Client session not initialized. Use 'async with ThordataClient(...) as client:'"
+            )
+        return self._session
+
    async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-
-
+        """
+        Send an async GET request through the Proxy Network.
+        """
+        session = self._get_session()
        try:
-
+            logger.debug(f"Async Proxy Request: {url}")
+            return await session.get(
                url,
                proxy=self.proxy_url,
                proxy_auth=self.proxy_auth,
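The new _get_session helper raises a RuntimeError unless the client is used inside its async context manager, so 0.3.1 code should follow the 'async with' pattern named in that error message. A minimal sketch, assuming __aenter__ opens the aiohttp session as the comments above indicate (token values are placeholders):

import asyncio

from thordata import AsyncThordataClient


async def main():
    # Entering the context manager creates the session; exiting awaits close().
    async with AsyncThordataClient(
        scraper_token="YOUR_SCRAPER_TOKEN",
        public_token="YOUR_PUBLIC_TOKEN",
        public_key="YOUR_PUBLIC_KEY",
    ) as client:
        # get() routes the request through the Thordata proxy gateway.
        response = await client.get("https://httpbin.org/ip")
        print(response.status)
        print(await response.text())


asyncio.run(main())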
@@ -71,7 +89,6 @@ class AsyncThordataClient:
            logger.error(f"Async Request failed: {e}")
            raise
 
-    # --- SERP (Optimized) ---
    async def serp_search(
        self,
        query: str,
@@ -82,13 +99,12 @@ class AsyncThordataClient:
        """
        Execute a real-time SERP search (Async).
        """
-
-        raise RuntimeError("Client session not initialized.")
+        session = self._get_session()
 
-        # 1.
+        # 1. Handle Enum conversion
        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
 
-        # 2.
+        # 2. Normalize parameters
        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
 
        headers = {
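A hedged sketch of calling the reworked serp_search, reusing a client opened via 'async with' as in the earlier example. The engine and num keyword names come from the method body above; the "google" value and num=10 are illustrative assumptions, since the Engine members live in enums.py, which is not shown in this diff:

from thordata import AsyncThordataClient


async def search_example(client: AsyncThordataClient):
    # engine accepts an Engine member or a plain string (strings are lowercased);
    # "google" and num=10 are assumptions for illustration only.
    return await client.serp_search("thordata sdk", engine="google", num=10)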
@@ -96,30 +112,34 @@ class AsyncThordataClient:
            "Content-Type": "application/x-www-form-urlencoded"
        }
 
-        # 3.
-
+        # 3. Execute Request
+        logger.info(f"Async SERP Search: {engine_str} - {query}")
+        async with session.post(
            self.SERP_API_URL, data=payload, headers=headers
        ) as response:
            response.raise_for_status()
+
            data = await response.json()
+            # Handle double-encoded JSON strings if they occur
            if isinstance(data, str):
                try:
                    data = json.loads(data)
-                except
+                except json.JSONDecodeError:
                    pass
            return data
 
-    # --- Universal (Unchanged) ---
    async def universal_scrape(
        self,
        url: str,
        js_render: bool = False,
        output_format: str = "HTML",
-        country: str = None,
+        country: Optional[str] = None,
        block_resources: bool = False
    ) -> Union[str, bytes]:
-
-
+        """
+        Async Universal Scraping (Bypass Cloudflare/CAPTCHA).
+        """
+        session = self._get_session()
 
        headers = {
            "Authorization": f"Bearer {self.scraper_token}",
@@ -135,18 +155,21 @@ class AsyncThordataClient:
        if country:
            payload["country"] = country
 
-
+        logger.info(f"Async Universal Scrape: {url}")
+        async with session.post(
            self.UNIVERSAL_API_URL, data=payload, headers=headers
        ) as response:
            response.raise_for_status()
 
            try:
                resp_json = await response.json()
-            except
+            except json.JSONDecodeError:
+                # Fallback for raw content
                if output_format.upper() == "PNG":
                    return await response.read()
                return await response.text()
 
+            # Check API error codes
            if isinstance(resp_json, dict) and resp_json.get("code") \
                    and resp_json.get("code") != 200:
                raise Exception(f"Universal API Error: {resp_json}")
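Given the fallback and base64 handling above, a PNG scrape is expected to come back as bytes, while the default HTML output comes back as a string. A minimal sketch, again assuming an already-opened client; the country value is an illustrative assumption:

from thordata import AsyncThordataClient


async def screenshot_example(client: AsyncThordataClient):
    # PNG output is decoded to bytes by the client; HTML output would be a str.
    png_bytes = await client.universal_scrape(
        "https://example.com",
        js_render=True,
        output_format="PNG",
        country="us",            # optional geo targeting; value assumed
        block_resources=False,
    )
    with open("example.png", "wb") as f:
        f.write(png_bytes)
    return png_bytes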
@@ -159,39 +182,38 @@ class AsyncThordataClient:
                if not png_str:
                    raise Exception("API returned empty PNG data")
 
-                #
+                # Clean Data URI Scheme
                if "," in png_str:
                    png_str = png_str.split(",", 1)[1]
 
+                # Fix Base64 Padding
                png_str = png_str.replace("\n", "").replace("\r", "")
                missing_padding = len(png_str) % 4
                if missing_padding:
                    png_str += '=' * (4 - missing_padding)
+
                return base64.b64decode(png_str)
 
            return str(resp_json)
 
-    # --- Web Scraper (Optimized) ---
    async def create_scraper_task(
        self,
        file_name: str,
        spider_id: str,
        spider_name: str,
        individual_params: Dict[str, Any],
-        universal_params: Dict[str, Any] = None
+        universal_params: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Create an Asynchronous Web Scraper Task.
        """
-
-        raise RuntimeError("Client session not initialized.")
+        session = self._get_session()
 
        headers = {
            "Authorization": f"Bearer {self.scraper_token}",
            "Content-Type": "application/x-www-form-urlencoded"
        }
 
-        # Simplify payload construction and remove unnecessary checks
        payload = {
            "file_name": file_name,
            "spider_id": spider_id,
@@ -202,17 +224,23 @@ class AsyncThordataClient:
        if universal_params:
            payload["spider_universal"] = json.dumps(universal_params)
 
-
+        logger.info(f"Async Task Creation: {spider_name}")
+        async with session.post(
            self.SCRAPER_BUILDER_URL, data=payload, headers=headers
        ) as response:
            response.raise_for_status()
            data = await response.json()
+
            if data.get("code") != 200:
                raise Exception(f"Creation failed: {data}")
            return data["data"]["task_id"]
 
-    # --- Status & Result (Unchanged) ---
    async def get_task_status(self, task_id: str) -> str:
+        """
+        Check task status.
+        """
+        session = self._get_session()
+
        headers = {
            "token": self.public_token,
            "key": self.public_key,
@@ -220,28 +248,34 @@ class AsyncThordataClient:
        }
        payload = {"tasks_ids": task_id}
 
-        async with
+        async with session.post(
            self.SCRAPER_STATUS_URL, data=payload, headers=headers
        ) as response:
            data = await response.json()
            if data.get("code") == 200 and data.get("data"):
                for item in data["data"]:
-                    if str(item
+                    if str(item.get("task_id")) == str(task_id):
                        return item["status"]
            return "Unknown"
 
    async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
+        """
+        Get the download URL for a finished task.
+        """
+        session = self._get_session()
+
        headers = {
            "token": self.public_token,
            "key": self.public_key,
            "Content-Type": "application/x-www-form-urlencoded"
        }
-
+        # Fixed: Use the file_type argument instead of hardcoding "json"
+        payload = {"tasks_id": task_id, "type": file_type}
 
-        async with
+        async with session.post(
            self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
        ) as response:
            data = await response.json()
-            if data.get("code") == 200:
+            if data.get("code") == 200 and data.get("data"):
                return data["data"]["download"]
            raise Exception(f"Result Error: {data}")
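Taken together, the task endpoints suggest a create / poll / download workflow. A sketch under assumptions: the spider_id and spider_name placeholders, the shape of individual_params, the terminal status strings, and the 5-second polling interval are illustrative and not documented in this diff:

import asyncio

from thordata import AsyncThordataClient


async def run_scraper_task(client: AsyncThordataClient) -> str:
    # create_scraper_task returns the new task_id on success.
    task_id = await client.create_scraper_task(
        file_name="products",
        spider_id="YOUR_SPIDER_ID",      # assumption: taken from your dashboard
        spider_name="YOUR_SPIDER_NAME",
        individual_params={"url": "https://example.com/item/1"},
    )

    # Poll until the task reaches a terminal state (status values assumed).
    while True:
        status = await client.get_task_status(task_id)
        if status.lower() in ("success", "finished", "failed"):
            break
        await asyncio.sleep(5)

    # get_task_result returns a download URL for the finished task.
    return await client.get_task_result(task_id, file_type="json")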