thordata-sdk 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata_sdk/__init__.py CHANGED
@@ -1,8 +1,9 @@
1
1
  # Expose main clients
2
2
  from .client import ThordataClient
3
3
  from .async_client import AsyncThordataClient
4
+ from .enums import Engine, GoogleSearchType
4
5
 
5
6
  # Version of the thordata-sdk package
6
- __version__ = "0.2.3"
7
+ __version__ = "0.2.4"
7
8
 
8
9
  __all__ = ["ThordataClient", "AsyncThordataClient"]
thordata_sdk/async_client.py CHANGED
@@ -4,6 +4,10 @@ import json
4
4
  import base64
5
5
  from typing import Optional, Dict, Any, Union
6
6
 
7
+ # 复用我们刚刚写好的逻辑和枚举
8
+ from .enums import Engine
9
+ from .parameters import normalize_serp_params
10
+
7
11
  logger = logging.getLogger(__name__)
8
12
 
9
13
 
@@ -52,7 +56,7 @@ class AsyncThordataClient:
52
56
  await self._session.close()
53
57
  self._session = None
54
58
 
55
- # --- Proxy ---
59
+ # --- Proxy (Unchanged) ---
56
60
  async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
57
61
  if self._session is None:
58
62
  raise RuntimeError("Client session not initialized.")
@@ -67,32 +71,32 @@ class AsyncThordataClient:
67
71
  logger.error(f"Async Request failed: {e}")
68
72
  raise
69
73
 
70
- # --- SERP ---
74
+ # --- SERP (Optimized) ---
71
75
  async def serp_search(
72
- self, query: str, engine: str = "google", num: int = 10, **kwargs
76
+ self,
77
+ query: str,
78
+ engine: Union[Engine, str] = Engine.GOOGLE,
79
+ num: int = 10,
80
+ **kwargs
73
81
  ) -> Dict[str, Any]:
82
+ """
83
+ Execute a real-time SERP search (Async).
84
+ """
74
85
  if self._session is None:
75
86
  raise RuntimeError("Client session not initialized.")
76
87
 
77
- payload = {
78
- "q": query, "num": str(num), "json": "1",
79
- "engine": engine.lower(), **kwargs
80
- }
81
- if engine.lower() == 'yandex':
82
- payload['text'] = payload.pop('q')
83
- if 'url' not in payload:
84
- payload['url'] = "yandex.com"
85
- elif 'url' not in payload:
86
- if engine == 'google':
87
- payload['url'] = "google.com"
88
- elif engine == 'bing':
89
- payload['url'] = "bing.com"
88
+ # 1. 转换枚举
89
+ engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
90
+
91
+ # 2. 调用 parameters.py 复用逻辑 (Don't Repeat Yourself!)
92
+ payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
90
93
 
91
94
  headers = {
92
95
  "Authorization": f"Bearer {self.scraper_token}",
93
96
  "Content-Type": "application/x-www-form-urlencoded"
94
97
  }
95
98
 
99
+ # 3. 发送请求
96
100
  async with self._session.post(
97
101
  self.SERP_API_URL, data=payload, headers=headers
98
102
  ) as response:
@@ -105,7 +109,7 @@ class AsyncThordataClient:
105
109
  pass
106
110
  return data
107
111
 
108
- # --- Universal ---
112
+ # --- Universal (Unchanged) ---
109
113
  async def universal_scrape(
110
114
  self,
111
115
  url: str,
@@ -155,6 +159,10 @@ class AsyncThordataClient:
155
159
  if not png_str:
156
160
  raise Exception("API returned empty PNG data")
157
161
 
162
+ # 🛠️ FIX: 移除 Data URI Scheme 前缀
163
+ if "," in png_str:
164
+ png_str = png_str.split(",", 1)[1]
165
+
158
166
  png_str = png_str.replace("\n", "").replace("\r", "")
159
167
  missing_padding = len(png_str) % 4
160
168
  if missing_padding:
@@ -163,15 +171,18 @@ class AsyncThordataClient:
163
171
 
164
172
  return str(resp_json)
165
173
 
166
- # --- Web Scraper ---
174
+ # --- Web Scraper (Optimized) ---
167
175
  async def create_scraper_task(
168
176
  self,
169
177
  file_name: str,
170
178
  spider_id: str,
179
+ spider_name: str,
171
180
  individual_params: Dict[str, Any],
172
- spider_name: str = "youtube.com",
173
181
  universal_params: Dict[str, Any] = None
174
182
  ) -> str:
183
+ """
184
+ Create an Asynchronous Web Scraper Task.
185
+ """
175
186
  if self._session is None:
176
187
  raise RuntimeError("Client session not initialized.")
177
188
 
@@ -180,6 +191,7 @@ class AsyncThordataClient:
180
191
  "Content-Type": "application/x-www-form-urlencoded"
181
192
  }
182
193
 
194
+ # 简化 Payload 构建,移除不必要的检查
183
195
  payload = {
184
196
  "file_name": file_name,
185
197
  "spider_id": spider_id,
@@ -199,6 +211,7 @@ class AsyncThordataClient:
199
211
  raise Exception(f"Creation failed: {data}")
200
212
  return data["data"]["task_id"]
201
213
 
214
+ # --- Status & Result (Unchanged) ---
202
215
  async def get_task_status(self, task_id: str) -> str:
203
216
  headers = {
204
217
  "token": self.public_token,
thordata_sdk/client.py CHANGED
@@ -2,7 +2,10 @@ import requests
2
2
  import logging
3
3
  import json
4
4
  import base64
5
- from typing import Dict, Any, Union
5
+ from typing import Dict, Any, Union, Optional
6
+
7
+ from .enums import Engine
8
+ from .parameters import normalize_serp_params
6
9
 
7
10
  # Configure a library-specific logger
8
11
  logger = logging.getLogger(__name__)
@@ -72,37 +75,33 @@ class ThordataClient:
72
75
  return self.session.get(url, **kwargs)
73
76
 
74
77
  def serp_search(
75
- self, query: str, engine: str = "google", num: int = 10, **kwargs
78
+ self,
79
+ query: str,
80
+ engine: Union[Engine, str] = Engine.GOOGLE, # 既可以是枚举,也可以是字符串
81
+ num: int = 10,
82
+ **kwargs # 这里接收所有额外参数 (比如 type="maps")
76
83
  ) -> Dict[str, Any]:
77
84
  """
78
85
  Execute a real-time SERP search.
86
+
87
+ Args:
88
+ query: Keywords
89
+ engine: 'google', 'bing', 'yandex' etc.
90
+ num: Number of results (default 10)
91
+ **kwargs: Extra parameters (e.g., type="shopping", location="London")
79
92
  """
80
- payload = {
81
- "q": query,
82
- "num": str(num),
83
- "json": "1",
84
- "engine": engine.lower(),
85
- **kwargs
86
- }
93
+ # 兼容处理:如果用户传的是枚举对象,取它的值;如果是字符串,转小写
94
+ engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
87
95
 
88
- if engine.lower() == 'yandex':
89
- payload['text'] = payload.pop('q')
90
- if 'url' not in payload:
91
- payload['url'] = "yandex.com"
92
- elif 'url' not in payload:
93
- if engine == 'google':
94
- payload['url'] = "google.com"
95
- elif engine == 'bing':
96
- payload['url'] = "bing.com"
97
- elif engine == 'duckduckgo':
98
- payload['url'] = "duckduckgo.com"
96
+ # 调用 parameters.py 里的逻辑
97
+ payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
99
98
 
100
99
  headers = {
101
100
  "Authorization": f"Bearer {self.scraper_token}",
102
101
  "Content-Type": "application/x-www-form-urlencoded"
103
102
  }
104
103
 
105
- logger.info(f"SERP Search: {engine} - {query}")
104
+ logger.info(f"SERP Search: {engine_str} - {query}")
106
105
  try:
107
106
  response = self.session.post(
108
107
  self.SERP_API_URL,
@@ -111,18 +110,17 @@ class ThordataClient:
111
110
  timeout=60
112
111
  )
113
112
  response.raise_for_status()
113
+
114
114
  data = response.json()
115
-
116
115
  if isinstance(data, str):
117
- try:
118
- data = json.loads(data)
119
- except json.JSONDecodeError:
120
- pass
116
+ try: data = json.loads(data)
117
+ except: pass
121
118
  return data
122
119
  except Exception as e:
123
120
  logger.error(f"SERP Request Failed: {e}")
124
121
  raise
125
122
 
123
+
126
124
  def universal_scrape(
127
125
  self,
128
126
  url: str,
@@ -177,12 +175,17 @@ class ThordataClient:
177
175
  if "html" in resp_json:
178
176
  return resp_json["html"]
179
177
 
180
- # Extract PNG (Base64 decoding with padding fix)
178
+ # Extract PNG
181
179
  if "png" in resp_json:
182
180
  png_str = resp_json["png"]
183
181
  if not png_str:
184
182
  raise Exception("API returned empty PNG data")
185
183
 
184
+ # 🛠️ FIX: 移除 Data URI Scheme 前缀 (data:image/png;base64,)
185
+ if "," in png_str:
186
+ png_str = png_str.split(",", 1)[1]
187
+
188
+ # Base64 解码 (处理 padding)
186
189
  png_str = png_str.replace("\n", "").replace("\r", "")
187
190
  missing_padding = len(png_str) % 4
188
191
  if missing_padding:
@@ -199,19 +202,22 @@ class ThordataClient:
199
202
  def create_scraper_task(
200
203
  self,
201
204
  file_name: str,
202
- spider_id: str,
203
- individual_params: Dict[str, Any],
204
- spider_name: str = "youtube.com",
205
+ spider_id: str, # 必须传,用户从仪表板获取
206
+ spider_name: str, # 必须传,例如 "youtube.com"
207
+ individual_params: Dict[str, Any], # 用户把具体的参数打包在这个字典里传进来
205
208
  universal_params: Dict[str, Any] = None
206
209
  ) -> str:
207
210
  """
208
- Create an Asynchronous Web Scraper Task.
211
+ Create a generic Web Scraper Task.
212
+
213
+ Note: Check the Thordata Dashboard to get the correct 'spider_id' and 'spider_name'.
209
214
  """
210
215
  headers = {
211
216
  "Authorization": f"Bearer {self.scraper_token}",
212
217
  "Content-Type": "application/x-www-form-urlencoded"
213
218
  }
214
219
 
220
+ # 直接打包发送,不替用户做太多复杂的校验,保证兼容性
215
221
  payload = {
216
222
  "spider_name": spider_name,
217
223
  "spider_id": spider_id,
@@ -222,7 +228,7 @@ class ThordataClient:
222
228
  if universal_params:
223
229
  payload["spider_universal"] = json.dumps(universal_params)
224
230
 
225
- logger.info(f"Creating Scraper Task: {spider_id}")
231
+ logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
226
232
  try:
227
233
  response = self.session.post(
228
234
  self.SCRAPER_BUILDER_URL,
thordata_sdk/enums.py ADDED
@@ -0,0 +1,20 @@
1
+ # thordata_sdk/enums.py
2
+ from enum import Enum
3
+
4
+ class Engine(str, Enum):
5
+ """SERP 核心支持的四大引擎"""
6
+ GOOGLE = "google"
7
+ BING = "bing"
8
+ YANDEX = "yandex"
9
+ DUCKDUCKGO = "duckduckgo"
10
+ BAIDU = "baidu"
11
+
12
+ class GoogleSearchType(str, Enum):
13
+ """Google 搜索的常见子类型 (参考你的截图)"""
14
+ SEARCH = "search" # 默认网页搜索
15
+ MAPS = "maps" # 地图
16
+ SHOPPING = "shopping" # 购物
17
+ NEWS = "news" # 新闻
18
+ IMAGES = "images" # 图片
19
+ VIDEOS = "videos" # 视频
20
+ # 其他冷门的先不写,用户可以通过字符串传参
thordata_sdk/parameters.py ADDED
@@ -0,0 +1,41 @@
1
+ # thordata_sdk/parameters.py
2
+ from typing import Dict, Any
3
+
4
+ def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
5
+ """
6
+ 统一不同搜索引擎的参数差异。
7
+ """
8
+ # 1. 基础参数
9
+ payload = {
10
+ "num": str(kwargs.get("num", 10)),
11
+ "json": "1",
12
+ "engine": engine,
13
+ }
14
+
15
+ # 2. 处理查询关键词 (Yandex 用 text,其他用 q)
16
+ if engine == "yandex":
17
+ payload["text"] = query
18
+ # 如果用户没传 url,给个默认的
19
+ if "url" not in kwargs:
20
+ payload["url"] = "yandex.com"
21
+ else:
22
+ payload["q"] = query
23
+
24
+ # 3. 处理默认 URL (如果用户没传)
25
+ if "url" not in kwargs:
26
+ defaults = {
27
+ "google": "google.com",
28
+ "bing": "bing.com",
29
+ "duckduckgo": "duckduckgo.com",
30
+ "baidu": "baidu.com"
31
+ }
32
+ if engine in defaults:
33
+ payload["url"] = defaults[engine]
34
+
35
+ # 4. 把用户传入的其他所有参数(比如 type="shopping", google_domain="google.co.uk")都透传进去
36
+ # 这样你就不用去定义那几十种类型了,用户传啥就是啥
37
+ for k, v in kwargs.items():
38
+ if k not in ["num", "engine", "q", "text"]: # 避免覆盖
39
+ payload[k] = v
40
+
41
+ return payload
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.4
2
- Name: thordata_sdk
3
- Version: 0.2.3
1
+ Metadata-Version: 2.1
2
+ Name: thordata-sdk
3
+ Version: 0.2.4
4
4
  Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
5
5
  Home-page: https://github.com/Thordata/thordata-python-sdk
6
6
  Author: Thordata Developer Team
@@ -24,18 +24,6 @@ Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: requests>=2.25.0
26
26
  Requires-Dist: aiohttp>=3.8.0
27
- Dynamic: author
28
- Dynamic: author-email
29
- Dynamic: classifier
30
- Dynamic: description
31
- Dynamic: description-content-type
32
- Dynamic: home-page
33
- Dynamic: license
34
- Dynamic: license-file
35
- Dynamic: project-url
36
- Dynamic: requires-dist
37
- Dynamic: requires-python
38
- Dynamic: summary
39
27
 
40
28
  # Thordata Python SDK
41
29
 
thordata_sdk-0.2.4.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
1
+ thordata_sdk/__init__.py,sha256=TpVRMWiWSkxq6MUoX1LCkfmuZTH9FWC65JbaALeVoVY,268
2
+ thordata_sdk/async_client.py,sha256=YIIKddghCzGAvrx2Bqy8XkGcgFLbCPgzkQw-jcq2WH8,8612
3
+ thordata_sdk/client.py,sha256=UyRLjRFKep2SLOWExjAJ5EB0ED0BUiBlfWGwts3sykw,10372
4
+ thordata_sdk/enums.py,sha256=gKpaqV-_OO7w1LCg9PTuSUiJJq_q4ad5k6f88UlTPQw,639
5
+ thordata_sdk/parameters.py,sha256=3ck0XP0lZaUYs4eEZoLLo6zDTClRRrLO9TlggesMmwI,1384
6
+ thordata_sdk-0.2.4.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
7
+ thordata_sdk-0.2.4.dist-info/METADATA,sha256=mluyngNHvMXlRfAgA4F7JHC6Sc1f0z4cuut3CI42yow,3734
8
+ thordata_sdk-0.2.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
9
+ thordata_sdk-0.2.4.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
10
+ thordata_sdk-0.2.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: bdist_wheel (0.45.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- thordata_sdk/__init__.py,sha256=aZ2P8F15HJlnnuMRYA1R-ENcZRVQ7eo0r1SD4a_1UbI,223
2
- thordata_sdk/async_client.py,sha256=fwoDSQA2GdikkNHrbKAoLwjqmn-zafEoe2HGf-j8bp8,8202
3
- thordata_sdk/client.py,sha256=drlhRHCCUoYiwmaJHLsYQZrfj7rB5wsK2P2yn2DkhqQ,9732
4
- thordata_sdk-0.2.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
5
- thordata_sdk-0.2.3.dist-info/METADATA,sha256=X_b16_FfyQmV7VS9Wy_QRtgXp8JVYhxSatt0HpAA9QU,4003
6
- thordata_sdk-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- thordata_sdk-0.2.3.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
8
- thordata_sdk-0.2.3.dist-info/RECORD,,