PyPI - thordata-sdk - Versions diffs - 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl - Mend

thordata-sdk 0.2.3 (py3-none-any.whl) → 0.2.4 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

thordata_sdk/__init__.py +2 -1
thordata_sdk/async_client.py +32 -19
thordata_sdk/client.py +38 -32
thordata_sdk/enums.py +20 -0
thordata_sdk/parameters.py +41 -0
{thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/METADATA +3 -15
thordata_sdk-0.2.4.dist-info/RECORD +10 -0
{thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/WHEEL +1 -1
thordata_sdk-0.2.3.dist-info/RECORD +0 -8
{thordata_sdk-0.2.3.dist-info/licenses → thordata_sdk-0.2.4.dist-info}/LICENSE +0 -0
{thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/top_level.txt +0 -0

thordata_sdk/__init__.py CHANGED Viewed

@@ -1,8 +1,9 @@
 # Expose main clients
 from .client import ThordataClient
 from .async_client import AsyncThordataClient
+from .enums import Engine, GoogleSearchType
 # Version of the thordata-sdk package
-__version__ = "0.2.3"
+__version__ = "0.2.4"
 __all__ = ["ThordataClient", "AsyncThordataClient"]

thordata_sdk/async_client.py CHANGED Viewed

@@ -4,6 +4,10 @@ import json
 import base64
 from typing import Optional, Dict, Any, Union
+# Reuse the shared parameter logic and enums we just wrote
+from .enums import Engine
+from .parameters import normalize_serp_params
 logger = logging.getLogger(__name__)
@@ -52,7 +56,7 @@ class AsyncThordataClient:
             await self._session.close()
             self._session = None
-    # --- Proxy ---
+    # --- Proxy (Unchanged) ---
     async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
@@ -67,32 +71,32 @@ class AsyncThordataClient:
             logger.error(f"Async Request failed: {e}")
             raise
-    # --- SERP ---
+    # --- SERP (Optimized) ---
     async def serp_search(
-        self, query: str, engine: str = "google", num: int = 10, **kwargs
+        self,
+        query: str,
+        engine: Union[Engine, str] = Engine.GOOGLE,
+        num: int = 10,
+        **kwargs
     ) -> Dict[str, Any]:
+        """
+        Execute a real-time SERP search (Async).
+        """
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
-        payload = {
-            "q": query, "num": str(num), "json": "1",
-            "engine": engine.lower(), **kwargs
-        }
-        if engine.lower() == 'yandex':
-            payload['text'] = payload.pop('q')
-            if 'url' not in payload:
-                payload['url'] = "yandex.com"
-        elif 'url' not in payload:
-            if engine == 'google':
-                payload['url'] = "google.com"
-            elif engine == 'bing':
-                payload['url'] = "bing.com"
+        # 1. 转换枚举
+        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
+        # 2. 调用 parameters.py 复用逻辑 (Don't Repeat Yourself!)
+        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }
+        # 3. 发送请求
         async with self._session.post(
             self.SERP_API_URL, data=payload, headers=headers
         ) as response:
@@ -105,7 +109,7 @@ class AsyncThordataClient:
                     pass
             return data
-    # --- Universal ---
+    # --- Universal (Unchanged) ---
     async def universal_scrape(
         self,
         url: str,
@@ -155,6 +159,10 @@ class AsyncThordataClient:
                 if not png_str:
                     raise Exception("API returned empty PNG data")
+                # 🛠️ FIX: 移除 Data URI Scheme 前缀
+                if "," in png_str:
+                    png_str = png_str.split(",", 1)[1]
                 png_str = png_str.replace("\n", "").replace("\r", "")
                 missing_padding = len(png_str) % 4
                 if missing_padding:
@@ -163,15 +171,18 @@ class AsyncThordataClient:
             return str(resp_json)
-    # --- Web Scraper ---
+    # --- Web Scraper (Optimized) ---
     async def create_scraper_task(
         self,
         file_name: str,
         spider_id: str,
+        spider_name: str,
         individual_params: Dict[str, Any],
-        spider_name: str = "youtube.com",
         universal_params: Dict[str, Any] = None
     ) -> str:
+        """
+        Create an Asynchronous Web Scraper Task.
+        """
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
@@ -180,6 +191,7 @@ class AsyncThordataClient:
             "Content-Type": "application/x-www-form-urlencoded"
         }
+        # 简化 Payload 构建，移除不必要的检查
         payload = {
             "file_name": file_name,
             "spider_id": spider_id,
@@ -199,6 +211,7 @@ class AsyncThordataClient:
                 raise Exception(f"Creation failed: {data}")
             return data["data"]["task_id"]
+    # --- Status & Result (Unchanged) ---
     async def get_task_status(self, task_id: str) -> str:
         headers = {
             "token": self.public_token,

thordata_sdk/client.py CHANGED Viewed

@@ -2,7 +2,10 @@ import requests
 import logging
 import json
 import base64
-from typing import Dict, Any, Union
+from typing import Dict, Any, Union, Optional
+from .enums import Engine
+from .parameters import normalize_serp_params
 # Configure a library-specific logger
 logger = logging.getLogger(__name__)
@@ -72,37 +75,33 @@ class ThordataClient:
         return self.session.get(url, **kwargs)
     def serp_search(
-        self, query: str, engine: str = "google", num: int = 10, **kwargs
+        self,
+        query: str,
+        engine: Union[Engine, str] = Engine.GOOGLE, # 既可以是枚举，也可以是字符串
+        num: int = 10,
+        **kwargs # 这里接收所有额外参数 (比如 type="maps")
     ) -> Dict[str, Any]:
         """
         Execute a real-time SERP search.
+        Args:
+            query: Keywords
+            engine: 'google', 'bing', 'yandex' etc.
+            num: Number of results (default 10)
+            **kwargs: Extra parameters (e.g., type="shopping", location="London")
         """
-        payload = {
-            "q": query,
-            "num": str(num),
-            "json": "1",
-            "engine": engine.lower(),
-            **kwargs
-        }
+        # 兼容处理：如果用户传的是枚举对象，取它的值；如果是字符串，转小写
+        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
-        if engine.lower() == 'yandex':
-            payload['text'] = payload.pop('q')
-            if 'url' not in payload:
-                payload['url'] = "yandex.com"
-        elif 'url' not in payload:
-            if engine == 'google':
-                payload['url'] = "google.com"
-            elif engine == 'bing':
-                payload['url'] = "bing.com"
-            elif engine == 'duckduckgo':
-                payload['url'] = "duckduckgo.com"
+        # 调用 parameters.py 里的逻辑
+        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }
-        logger.info(f"SERP Search: {engine} - {query}")
+        logger.info(f"SERP Search: {engine_str} - {query}")
         try:
             response = self.session.post(
                 self.SERP_API_URL,
@@ -111,18 +110,17 @@ class ThordataClient:
                 timeout=60
             )
             response.raise_for_status()
             data = response.json()
             if isinstance(data, str):
-                try:
-                    data = json.loads(data)
-                except json.JSONDecodeError:
-                    pass
+                try: data = json.loads(data)
+                except: pass
             return data
         except Exception as e:
             logger.error(f"SERP Request Failed: {e}")
             raise
     def universal_scrape(
         self,
         url: str,
@@ -177,12 +175,17 @@ class ThordataClient:
             if "html" in resp_json:
                 return resp_json["html"]
-            # Extract PNG (Base64 decoding with padding fix)
+            # Extract PNG
             if "png" in resp_json:
                 png_str = resp_json["png"]
                 if not png_str:
                     raise Exception("API returned empty PNG data")
+                # 🛠️ FIX: 移除 Data URI Scheme 前缀 (data:image/png;base64,)
+                if "," in png_str:
+                    png_str = png_str.split(",", 1)[1]
+                # Base64 解码 (处理 padding)
                 png_str = png_str.replace("\n", "").replace("\r", "")
                 missing_padding = len(png_str) % 4
                 if missing_padding:
@@ -199,19 +202,22 @@ class ThordataClient:
     def create_scraper_task(
         self,
         file_name: str,
-        spider_id: str,
-        individual_params: Dict[str, Any],
-        spider_name: str = "youtube.com",
+        spider_id: str,     # 必须传，用户从仪表板获取
+        spider_name: str,   # 必须传，例如 "youtube.com"
+        individual_params: Dict[str, Any], # 用户把具体的参数打包在这个字典里传进来
         universal_params: Dict[str, Any] = None
     ) -> str:
         """
-        Create an Asynchronous Web Scraper Task.
+        Create a generic Web Scraper Task.
+        Note: Check the Thordata Dashboard to get the correct 'spider_id' and 'spider_name'.
         """
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }
+        # 直接打包发送，不替用户做太多复杂的校验，保证兼容性
         payload = {
             "spider_name": spider_name,
             "spider_id": spider_id,
@@ -222,7 +228,7 @@ class ThordataClient:
         if universal_params:
             payload["spider_universal"] = json.dumps(universal_params)
-        logger.info(f"Creating Scraper Task: {spider_id}")
+        logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
         try:
             response = self.session.post(
                 self.SCRAPER_BUILDER_URL,

thordata_sdk/enums.py ADDED Viewed

@@ -0,0 +1,20 @@
+# thordata_sdk/enums.py
+from enum import Enum
+class Engine(str, Enum):
+    """Search engines that can be selected for SERP requests.
+
+    Members also subclass ``str``, so each one compares equal to its
+    lowercase engine name (e.g. ``Engine.GOOGLE == "google"``).
+    """
+    GOOGLE = "google"
+    BING = "bing"
+    YANDEX = "yandex"
+    DUCKDUCKGO = "duckduckgo"
+    BAIDU = "baidu"
+class GoogleSearchType(str, Enum):
+    """Common Google search sub-types.
+
+    NOTE(review): presumably meant to be passed as the ``type`` keyword of
+    ``serp_search`` -- confirm against the client code.
+    """
+    SEARCH = "search"      # default web search
+    MAPS = "maps"          # maps
+    SHOPPING = "shopping"  # shopping
+    NEWS = "news"          # news
+    IMAGES = "images"      # images
+    VIDEOS = "videos"      # videos
+    # Less common types are left out for now; callers can pass them as plain strings

thordata_sdk/parameters.py ADDED Viewed

@@ -0,0 +1,41 @@
+# thordata_sdk/parameters.py
+from typing import Dict, Any
+def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
+    """
+    Build a SERP request payload, smoothing over per-engine parameter differences.
+
+    Args:
+        engine: Engine name. It is compared against lowercase literals such
+            as "yandex" and "google", so it should already be lowercased.
+        query: Search keywords. Stored under "text" for Yandex and under
+            "q" for every other engine.
+        **kwargs: Extra request parameters. "num" (default 10) is converted
+            to a string; a caller-supplied "url" suppresses the per-engine
+            default; the keys "num", "engine", "q" and "text" are not copied
+            by the pass-through loop; every other key is copied as-is.
+
+    Returns:
+        A dict holding "num", "json", "engine", the query key, a "url" when
+        one was supplied or a default exists for the engine, plus the
+        passed-through extras.
+    """
+    # 1. Base parameters
+    payload = {
+        "num": str(kwargs.get("num", 10)),
+        "json": "1",
+        "engine": engine,
+    }
+    # 2. Query keyword (Yandex uses "text"; other engines use "q")
+    if engine == "yandex":
+        payload["text"] = query
+        # Fall back to a default url when the caller did not pass one
+        if "url" not in kwargs:
+            payload["url"] = "yandex.com"
+    else:
+        payload["q"] = query
+        # 3. Default URL (only when the caller did not pass one)
+        if "url" not in kwargs:
+            defaults = {
+                "google": "google.com",
+                "bing": "bing.com",
+                "duckduckgo": "duckduckgo.com",
+                "baidu": "baidu.com"
+            }
+            # Engines without an entry here get no default "url" at all
+            if engine in defaults:
+                payload["url"] = defaults[engine]
+    # 4. Pass every other caller-supplied parameter through (e.g. type="shopping", google_domain="google.co.uk")
+    # so the dozens of search types need no explicit definitions; whatever the caller passes is sent as-is
+    for k, v in kwargs.items():
+        if k not in ["num", "engine", "q", "text"]: # avoid overwriting the keys set above
+            payload[k] = v
+    return payload

{thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.4
-Name: thordata_sdk
-Version: 0.2.3
+Metadata-Version: 2.1
+Name: thordata-sdk
+Version: 0.2.4
 Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
 Home-page: https://github.com/Thordata/thordata-python-sdk
 Author: Thordata Developer Team
@@ -24,18 +24,6 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests>=2.25.0
 Requires-Dist: aiohttp>=3.8.0
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: license-file
-Dynamic: project-url
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
 # Thordata Python SDK

thordata_sdk-0.2.4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+thordata_sdk/__init__.py,sha256=TpVRMWiWSkxq6MUoX1LCkfmuZTH9FWC65JbaALeVoVY,268
+thordata_sdk/async_client.py,sha256=YIIKddghCzGAvrx2Bqy8XkGcgFLbCPgzkQw-jcq2WH8,8612
+thordata_sdk/client.py,sha256=UyRLjRFKep2SLOWExjAJ5EB0ED0BUiBlfWGwts3sykw,10372
+thordata_sdk/enums.py,sha256=gKpaqV-_OO7w1LCg9PTuSUiJJq_q4ad5k6f88UlTPQw,639
+thordata_sdk/parameters.py,sha256=3ck0XP0lZaUYs4eEZoLLo6zDTClRRrLO9TlggesMmwI,1384
+thordata_sdk-0.2.4.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+thordata_sdk-0.2.4.dist-info/METADATA,sha256=mluyngNHvMXlRfAgA4F7JHC6Sc1f0z4cuut3CI42yow,3734
+thordata_sdk-0.2.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+thordata_sdk-0.2.4.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
+thordata_sdk-0.2.4.dist-info/RECORD,,

{thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: bdist_wheel (0.45.1)
 Root-Is-Purelib: true
 Tag: py3-none-any

thordata_sdk-0.2.3.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-thordata_sdk/__init__.py,sha256=aZ2P8F15HJlnnuMRYA1R-ENcZRVQ7eo0r1SD4a_1UbI,223
-thordata_sdk/async_client.py,sha256=fwoDSQA2GdikkNHrbKAoLwjqmn-zafEoe2HGf-j8bp8,8202
-thordata_sdk/client.py,sha256=drlhRHCCUoYiwmaJHLsYQZrfj7rB5wsK2P2yn2DkhqQ,9732
-thordata_sdk-0.2.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-thordata_sdk-0.2.3.dist-info/METADATA,sha256=X_b16_FfyQmV7VS9Wy_QRtgXp8JVYhxSatt0HpAA9QU,4003
-thordata_sdk-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-thordata_sdk-0.2.3.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
-thordata_sdk-0.2.3.dist-info/RECORD,,

{thordata_sdk-0.2.3.dist-info/licenses → thordata_sdk-0.2.4.dist-info}/LICENSE RENAMED Viewed

File without changes

{thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

thordata-sdk 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

thordata-sdk 0.2.3 (py3-none-any.whl) → 0.2.4 (py3-none-any.whl)