thordata-sdk 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata_sdk/__init__.py +2 -1
- thordata_sdk/async_client.py +32 -19
- thordata_sdk/client.py +38 -32
- thordata_sdk/enums.py +20 -0
- thordata_sdk/parameters.py +41 -0
- {thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/METADATA +3 -15
- thordata_sdk-0.2.4.dist-info/RECORD +10 -0
- {thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/WHEEL +1 -1
- thordata_sdk-0.2.3.dist-info/RECORD +0 -8
- {thordata_sdk-0.2.3.dist-info/licenses → thordata_sdk-0.2.4.dist-info}/LICENSE +0 -0
- {thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/top_level.txt +0 -0
thordata_sdk/__init__.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# Expose main clients
|
|
2
2
|
from .client import ThordataClient
|
|
3
3
|
from .async_client import AsyncThordataClient
|
|
4
|
+
from .enums import Engine, GoogleSearchType
|
|
4
5
|
|
|
5
6
|
# Version of the thordata-sdk package
|
|
6
|
-
__version__ = "0.2.3"
|
|
7
|
+
__version__ = "0.2.4"
|
|
7
8
|
|
|
8
9
|
__all__ = ["ThordataClient", "AsyncThordataClient"]
|
thordata_sdk/async_client.py
CHANGED
|
@@ -4,6 +4,10 @@ import json
|
|
|
4
4
|
import base64
|
|
5
5
|
from typing import Optional, Dict, Any, Union
|
|
6
6
|
|
|
7
|
+
# 复用我们刚刚写好的逻辑和枚举
|
|
8
|
+
from .enums import Engine
|
|
9
|
+
from .parameters import normalize_serp_params
|
|
10
|
+
|
|
7
11
|
logger = logging.getLogger(__name__)
|
|
8
12
|
|
|
9
13
|
|
|
@@ -52,7 +56,7 @@ class AsyncThordataClient:
|
|
|
52
56
|
await self._session.close()
|
|
53
57
|
self._session = None
|
|
54
58
|
|
|
55
|
-
# --- Proxy ---
|
|
59
|
+
# --- Proxy (Unchanged) ---
|
|
56
60
|
async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
|
|
57
61
|
if self._session is None:
|
|
58
62
|
raise RuntimeError("Client session not initialized.")
|
|
@@ -67,32 +71,32 @@ class AsyncThordataClient:
|
|
|
67
71
|
logger.error(f"Async Request failed: {e}")
|
|
68
72
|
raise
|
|
69
73
|
|
|
70
|
-
# --- SERP ---
|
|
74
|
+
# --- SERP (Optimized) ---
|
|
71
75
|
async def serp_search(
|
|
72
|
-
self,
|
|
76
|
+
self,
|
|
77
|
+
query: str,
|
|
78
|
+
engine: Union[Engine, str] = Engine.GOOGLE,
|
|
79
|
+
num: int = 10,
|
|
80
|
+
**kwargs
|
|
73
81
|
) -> Dict[str, Any]:
|
|
82
|
+
"""
|
|
83
|
+
Execute a real-time SERP search (Async).
|
|
84
|
+
"""
|
|
74
85
|
if self._session is None:
|
|
75
86
|
raise RuntimeError("Client session not initialized.")
|
|
76
87
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
payload['text'] = payload.pop('q')
|
|
83
|
-
if 'url' not in payload:
|
|
84
|
-
payload['url'] = "yandex.com"
|
|
85
|
-
elif 'url' not in payload:
|
|
86
|
-
if engine == 'google':
|
|
87
|
-
payload['url'] = "google.com"
|
|
88
|
-
elif engine == 'bing':
|
|
89
|
-
payload['url'] = "bing.com"
|
|
88
|
+
# 1. 转换枚举
|
|
89
|
+
engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
|
|
90
|
+
|
|
91
|
+
# 2. 调用 parameters.py 复用逻辑 (Don't Repeat Yourself!)
|
|
92
|
+
payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
|
|
90
93
|
|
|
91
94
|
headers = {
|
|
92
95
|
"Authorization": f"Bearer {self.scraper_token}",
|
|
93
96
|
"Content-Type": "application/x-www-form-urlencoded"
|
|
94
97
|
}
|
|
95
98
|
|
|
99
|
+
# 3. 发送请求
|
|
96
100
|
async with self._session.post(
|
|
97
101
|
self.SERP_API_URL, data=payload, headers=headers
|
|
98
102
|
) as response:
|
|
@@ -105,7 +109,7 @@ class AsyncThordataClient:
|
|
|
105
109
|
pass
|
|
106
110
|
return data
|
|
107
111
|
|
|
108
|
-
# --- Universal ---
|
|
112
|
+
# --- Universal (Unchanged) ---
|
|
109
113
|
async def universal_scrape(
|
|
110
114
|
self,
|
|
111
115
|
url: str,
|
|
@@ -155,6 +159,10 @@ class AsyncThordataClient:
|
|
|
155
159
|
if not png_str:
|
|
156
160
|
raise Exception("API returned empty PNG data")
|
|
157
161
|
|
|
162
|
+
# 🛠️ FIX: 移除 Data URI Scheme 前缀
|
|
163
|
+
if "," in png_str:
|
|
164
|
+
png_str = png_str.split(",", 1)[1]
|
|
165
|
+
|
|
158
166
|
png_str = png_str.replace("\n", "").replace("\r", "")
|
|
159
167
|
missing_padding = len(png_str) % 4
|
|
160
168
|
if missing_padding:
|
|
@@ -163,15 +171,18 @@ class AsyncThordataClient:
|
|
|
163
171
|
|
|
164
172
|
return str(resp_json)
|
|
165
173
|
|
|
166
|
-
# --- Web Scraper ---
|
|
174
|
+
# --- Web Scraper (Optimized) ---
|
|
167
175
|
async def create_scraper_task(
|
|
168
176
|
self,
|
|
169
177
|
file_name: str,
|
|
170
178
|
spider_id: str,
|
|
179
|
+
spider_name: str,
|
|
171
180
|
individual_params: Dict[str, Any],
|
|
172
|
-
spider_name: str = "youtube.com",
|
|
173
181
|
universal_params: Dict[str, Any] = None
|
|
174
182
|
) -> str:
|
|
183
|
+
"""
|
|
184
|
+
Create an Asynchronous Web Scraper Task.
|
|
185
|
+
"""
|
|
175
186
|
if self._session is None:
|
|
176
187
|
raise RuntimeError("Client session not initialized.")
|
|
177
188
|
|
|
@@ -180,6 +191,7 @@ class AsyncThordataClient:
|
|
|
180
191
|
"Content-Type": "application/x-www-form-urlencoded"
|
|
181
192
|
}
|
|
182
193
|
|
|
194
|
+
# 简化 Payload 构建,移除不必要的检查
|
|
183
195
|
payload = {
|
|
184
196
|
"file_name": file_name,
|
|
185
197
|
"spider_id": spider_id,
|
|
@@ -199,6 +211,7 @@ class AsyncThordataClient:
|
|
|
199
211
|
raise Exception(f"Creation failed: {data}")
|
|
200
212
|
return data["data"]["task_id"]
|
|
201
213
|
|
|
214
|
+
# --- Status & Result (Unchanged) ---
|
|
202
215
|
async def get_task_status(self, task_id: str) -> str:
|
|
203
216
|
headers = {
|
|
204
217
|
"token": self.public_token,
|
thordata_sdk/client.py
CHANGED
|
@@ -2,7 +2,10 @@ import requests
|
|
|
2
2
|
import logging
|
|
3
3
|
import json
|
|
4
4
|
import base64
|
|
5
|
-
from typing import Dict, Any, Union
|
|
5
|
+
from typing import Dict, Any, Union, Optional
|
|
6
|
+
|
|
7
|
+
from .enums import Engine
|
|
8
|
+
from .parameters import normalize_serp_params
|
|
6
9
|
|
|
7
10
|
# Configure a library-specific logger
|
|
8
11
|
logger = logging.getLogger(__name__)
|
|
@@ -72,37 +75,33 @@ class ThordataClient:
|
|
|
72
75
|
return self.session.get(url, **kwargs)
|
|
73
76
|
|
|
74
77
|
def serp_search(
|
|
75
|
-
self,
|
|
78
|
+
self,
|
|
79
|
+
query: str,
|
|
80
|
+
engine: Union[Engine, str] = Engine.GOOGLE, # 既可以是枚举,也可以是字符串
|
|
81
|
+
num: int = 10,
|
|
82
|
+
**kwargs # 这里接收所有额外参数 (比如 type="maps")
|
|
76
83
|
) -> Dict[str, Any]:
|
|
77
84
|
"""
|
|
78
85
|
Execute a real-time SERP search.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
query: Keywords
|
|
89
|
+
engine: 'google', 'bing', 'yandex' etc.
|
|
90
|
+
num: Number of results (default 10)
|
|
91
|
+
**kwargs: Extra parameters (e.g., type="shopping", location="London")
|
|
79
92
|
"""
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
"num": str(num),
|
|
83
|
-
"json": "1",
|
|
84
|
-
"engine": engine.lower(),
|
|
85
|
-
**kwargs
|
|
86
|
-
}
|
|
93
|
+
# 兼容处理:如果用户传的是枚举对象,取它的值;如果是字符串,转小写
|
|
94
|
+
engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
|
|
87
95
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
if 'url' not in payload:
|
|
91
|
-
payload['url'] = "yandex.com"
|
|
92
|
-
elif 'url' not in payload:
|
|
93
|
-
if engine == 'google':
|
|
94
|
-
payload['url'] = "google.com"
|
|
95
|
-
elif engine == 'bing':
|
|
96
|
-
payload['url'] = "bing.com"
|
|
97
|
-
elif engine == 'duckduckgo':
|
|
98
|
-
payload['url'] = "duckduckgo.com"
|
|
96
|
+
# 调用 parameters.py 里的逻辑
|
|
97
|
+
payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
|
|
99
98
|
|
|
100
99
|
headers = {
|
|
101
100
|
"Authorization": f"Bearer {self.scraper_token}",
|
|
102
101
|
"Content-Type": "application/x-www-form-urlencoded"
|
|
103
102
|
}
|
|
104
103
|
|
|
105
|
-
logger.info(f"SERP Search: {
|
|
104
|
+
logger.info(f"SERP Search: {engine_str} - {query}")
|
|
106
105
|
try:
|
|
107
106
|
response = self.session.post(
|
|
108
107
|
self.SERP_API_URL,
|
|
@@ -111,18 +110,17 @@ class ThordataClient:
|
|
|
111
110
|
timeout=60
|
|
112
111
|
)
|
|
113
112
|
response.raise_for_status()
|
|
113
|
+
|
|
114
114
|
data = response.json()
|
|
115
|
-
|
|
116
115
|
if isinstance(data, str):
|
|
117
|
-
try:
|
|
118
|
-
|
|
119
|
-
except json.JSONDecodeError:
|
|
120
|
-
pass
|
|
116
|
+
try: data = json.loads(data)
|
|
117
|
+
except: pass
|
|
121
118
|
return data
|
|
122
119
|
except Exception as e:
|
|
123
120
|
logger.error(f"SERP Request Failed: {e}")
|
|
124
121
|
raise
|
|
125
122
|
|
|
123
|
+
|
|
126
124
|
def universal_scrape(
|
|
127
125
|
self,
|
|
128
126
|
url: str,
|
|
@@ -177,12 +175,17 @@ class ThordataClient:
|
|
|
177
175
|
if "html" in resp_json:
|
|
178
176
|
return resp_json["html"]
|
|
179
177
|
|
|
180
|
-
# Extract PNG
|
|
178
|
+
# Extract PNG
|
|
181
179
|
if "png" in resp_json:
|
|
182
180
|
png_str = resp_json["png"]
|
|
183
181
|
if not png_str:
|
|
184
182
|
raise Exception("API returned empty PNG data")
|
|
185
183
|
|
|
184
|
+
# 🛠️ FIX: 移除 Data URI Scheme 前缀 (data:image/png;base64,)
|
|
185
|
+
if "," in png_str:
|
|
186
|
+
png_str = png_str.split(",", 1)[1]
|
|
187
|
+
|
|
188
|
+
# Base64 解码 (处理 padding)
|
|
186
189
|
png_str = png_str.replace("\n", "").replace("\r", "")
|
|
187
190
|
missing_padding = len(png_str) % 4
|
|
188
191
|
if missing_padding:
|
|
@@ -199,19 +202,22 @@ class ThordataClient:
|
|
|
199
202
|
def create_scraper_task(
|
|
200
203
|
self,
|
|
201
204
|
file_name: str,
|
|
202
|
-
spider_id: str,
|
|
203
|
-
|
|
204
|
-
|
|
205
|
+
spider_id: str, # 必须传,用户从仪表板获取
|
|
206
|
+
spider_name: str, # 必须传,例如 "youtube.com"
|
|
207
|
+
individual_params: Dict[str, Any], # 用户把具体的参数打包在这个字典里传进来
|
|
205
208
|
universal_params: Dict[str, Any] = None
|
|
206
209
|
) -> str:
|
|
207
210
|
"""
|
|
208
|
-
Create
|
|
211
|
+
Create a generic Web Scraper Task.
|
|
212
|
+
|
|
213
|
+
Note: Check the Thordata Dashboard to get the correct 'spider_id' and 'spider_name'.
|
|
209
214
|
"""
|
|
210
215
|
headers = {
|
|
211
216
|
"Authorization": f"Bearer {self.scraper_token}",
|
|
212
217
|
"Content-Type": "application/x-www-form-urlencoded"
|
|
213
218
|
}
|
|
214
219
|
|
|
220
|
+
# 直接打包发送,不替用户做太多复杂的校验,保证兼容性
|
|
215
221
|
payload = {
|
|
216
222
|
"spider_name": spider_name,
|
|
217
223
|
"spider_id": spider_id,
|
|
@@ -222,7 +228,7 @@ class ThordataClient:
|
|
|
222
228
|
if universal_params:
|
|
223
229
|
payload["spider_universal"] = json.dumps(universal_params)
|
|
224
230
|
|
|
225
|
-
logger.info(f"Creating Scraper Task: {spider_id}")
|
|
231
|
+
logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
|
|
226
232
|
try:
|
|
227
233
|
response = self.session.post(
|
|
228
234
|
self.SCRAPER_BUILDER_URL,
|
thordata_sdk/enums.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# thordata_sdk/enums.py
|
|
2
|
+
from enum import Enum
|
|
3
|
+
|
|
4
|
+
class Engine(str, Enum):
|
|
5
|
+
"""SERP 核心支持的四大引擎"""
|
|
6
|
+
GOOGLE = "google"
|
|
7
|
+
BING = "bing"
|
|
8
|
+
YANDEX = "yandex"
|
|
9
|
+
DUCKDUCKGO = "duckduckgo"
|
|
10
|
+
BAIDU = "baidu"
|
|
11
|
+
|
|
12
|
+
class GoogleSearchType(str, Enum):
|
|
13
|
+
"""Google 搜索的常见子类型 (参考你的截图)"""
|
|
14
|
+
SEARCH = "search" # 默认网页搜索
|
|
15
|
+
MAPS = "maps" # 地图
|
|
16
|
+
SHOPPING = "shopping" # 购物
|
|
17
|
+
NEWS = "news" # 新闻
|
|
18
|
+
IMAGES = "images" # 图片
|
|
19
|
+
VIDEOS = "videos" # 视频
|
|
20
|
+
# 其他冷门的先不写,用户可以通过字符串传参
|
|
thordata_sdk/parameters.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# thordata_sdk/parameters.py
|
|
2
|
+
from typing import Dict, Any
|
|
3
|
+
|
|
4
|
+
def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
|
|
5
|
+
"""
|
|
6
|
+
统一不同搜索引擎的参数差异。
|
|
7
|
+
"""
|
|
8
|
+
# 1. 基础参数
|
|
9
|
+
payload = {
|
|
10
|
+
"num": str(kwargs.get("num", 10)),
|
|
11
|
+
"json": "1",
|
|
12
|
+
"engine": engine,
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
# 2. 处理查询关键词 (Yandex 用 text,其他用 q)
|
|
16
|
+
if engine == "yandex":
|
|
17
|
+
payload["text"] = query
|
|
18
|
+
# 如果用户没传 url,给个默认的
|
|
19
|
+
if "url" not in kwargs:
|
|
20
|
+
payload["url"] = "yandex.com"
|
|
21
|
+
else:
|
|
22
|
+
payload["q"] = query
|
|
23
|
+
|
|
24
|
+
# 3. 处理默认 URL (如果用户没传)
|
|
25
|
+
if "url" not in kwargs:
|
|
26
|
+
defaults = {
|
|
27
|
+
"google": "google.com",
|
|
28
|
+
"bing": "bing.com",
|
|
29
|
+
"duckduckgo": "duckduckgo.com",
|
|
30
|
+
"baidu": "baidu.com"
|
|
31
|
+
}
|
|
32
|
+
if engine in defaults:
|
|
33
|
+
payload["url"] = defaults[engine]
|
|
34
|
+
|
|
35
|
+
# 4. 把用户传入的其他所有参数(比如 type="shopping", google_domain="google.co.uk")都透传进去
|
|
36
|
+
# 这样你就不用去定义那几十种类型了,用户传啥就是啥
|
|
37
|
+
for k, v in kwargs.items():
|
|
38
|
+
if k not in ["num", "engine", "q", "text"]: # 避免覆盖
|
|
39
|
+
payload[k] = v
|
|
40
|
+
|
|
41
|
+
return payload
|
|
{thordata_sdk-0.2.3.dist-info → thordata_sdk-0.2.4.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name:
|
|
3
|
-
Version: 0.2.3
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: thordata-sdk
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
|
|
5
5
|
Home-page: https://github.com/Thordata/thordata-python-sdk
|
|
6
6
|
Author: Thordata Developer Team
|
|
@@ -24,18 +24,6 @@ Description-Content-Type: text/markdown
|
|
|
24
24
|
License-File: LICENSE
|
|
25
25
|
Requires-Dist: requests>=2.25.0
|
|
26
26
|
Requires-Dist: aiohttp>=3.8.0
|
|
27
|
-
Dynamic: author
|
|
28
|
-
Dynamic: author-email
|
|
29
|
-
Dynamic: classifier
|
|
30
|
-
Dynamic: description
|
|
31
|
-
Dynamic: description-content-type
|
|
32
|
-
Dynamic: home-page
|
|
33
|
-
Dynamic: license
|
|
34
|
-
Dynamic: license-file
|
|
35
|
-
Dynamic: project-url
|
|
36
|
-
Dynamic: requires-dist
|
|
37
|
-
Dynamic: requires-python
|
|
38
|
-
Dynamic: summary
|
|
39
27
|
|
|
40
28
|
# Thordata Python SDK
|
|
41
29
|
|
|
thordata_sdk-0.2.4.dist-info/RECORD
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
thordata_sdk/__init__.py,sha256=TpVRMWiWSkxq6MUoX1LCkfmuZTH9FWC65JbaALeVoVY,268
|
|
2
|
+
thordata_sdk/async_client.py,sha256=YIIKddghCzGAvrx2Bqy8XkGcgFLbCPgzkQw-jcq2WH8,8612
|
|
3
|
+
thordata_sdk/client.py,sha256=UyRLjRFKep2SLOWExjAJ5EB0ED0BUiBlfWGwts3sykw,10372
|
|
4
|
+
thordata_sdk/enums.py,sha256=gKpaqV-_OO7w1LCg9PTuSUiJJq_q4ad5k6f88UlTPQw,639
|
|
5
|
+
thordata_sdk/parameters.py,sha256=3ck0XP0lZaUYs4eEZoLLo6zDTClRRrLO9TlggesMmwI,1384
|
|
6
|
+
thordata_sdk-0.2.4.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
7
|
+
thordata_sdk-0.2.4.dist-info/METADATA,sha256=mluyngNHvMXlRfAgA4F7JHC6Sc1f0z4cuut3CI42yow,3734
|
|
8
|
+
thordata_sdk-0.2.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
9
|
+
thordata_sdk-0.2.4.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
|
|
10
|
+
thordata_sdk-0.2.4.dist-info/RECORD,,
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
thordata_sdk/__init__.py,sha256=aZ2P8F15HJlnnuMRYA1R-ENcZRVQ7eo0r1SD4a_1UbI,223
|
|
2
|
-
thordata_sdk/async_client.py,sha256=fwoDSQA2GdikkNHrbKAoLwjqmn-zafEoe2HGf-j8bp8,8202
|
|
3
|
-
thordata_sdk/client.py,sha256=drlhRHCCUoYiwmaJHLsYQZrfj7rB5wsK2P2yn2DkhqQ,9732
|
|
4
|
-
thordata_sdk-0.2.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
5
|
-
thordata_sdk-0.2.3.dist-info/METADATA,sha256=X_b16_FfyQmV7VS9Wy_QRtgXp8JVYhxSatt0HpAA9QU,4003
|
|
6
|
-
thordata_sdk-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
-
thordata_sdk-0.2.3.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
|
|
8
|
-
thordata_sdk-0.2.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|