thordata-sdk 0.2.4__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +16 -0
- {thordata_sdk → thordata}/async_client.py +67 -33
- thordata/client.py +486 -0
- thordata/enums.py +25 -0
- thordata/parameters.py +52 -0
- thordata_sdk-0.3.1.dist-info/METADATA +200 -0
- thordata_sdk-0.3.1.dist-info/RECORD +10 -0
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-0.3.1.dist-info}/WHEEL +1 -1
- thordata_sdk-0.3.1.dist-info/top_level.txt +1 -0
- thordata_sdk/__init__.py +0 -9
- thordata_sdk/client.py +0 -303
- thordata_sdk/enums.py +0 -20
- thordata_sdk/parameters.py +0 -41
- thordata_sdk-0.2.4.dist-info/METADATA +0 -113
- thordata_sdk-0.2.4.dist-info/RECORD +0 -10
- thordata_sdk-0.2.4.dist-info/top_level.txt +0 -1
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-0.3.1.dist-info/licenses}/LICENSE +0 -0
thordata/__init__.py
ADDED
@@ -0,0 +1,16 @@
+# src/thordata/__init__.py
+
+from .client import ThordataClient
+from .async_client import AsyncThordataClient
+from .enums import Engine, GoogleSearchType
+
+# Package version
+__version__ = "0.3.1"
+
+# Explicitly export classes to simplify user imports
+__all__ = [
+    "ThordataClient",
+    "AsyncThordataClient",
+    "Engine",
+    "GoogleSearchType"
+]
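With the new top-level package, imports move from thordata_sdk to thordata. A minimal usage sketch based on the __all__ and __version__ values above:

from thordata import ThordataClient, AsyncThordataClient, Engine, GoogleSearchType
import thordata

print(thordata.__version__)  # expected: "0.3.1"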
{thordata_sdk → thordata}/async_client.py
RENAMED

@@ -4,7 +4,7 @@ import json
 import base64
 from typing import Optional, Dict, Any, Union
 
-#
+# Import shared logic
 from .enums import Engine
 from .parameters import normalize_serp_params
 
@@ -13,7 +13,8 @@ logger = logging.getLogger(__name__)
 
 class AsyncThordataClient:
     """
-
+    The official Asynchronous Python client for Thordata (built on aiohttp).
+    Designed for high-concurrency AI agents and data pipelines.
    """
 
    def __init__(
@@ -24,13 +25,18 @@ class AsyncThordataClient:
        proxy_host: str = "gate.thordata.com",
        proxy_port: int = 22225
    ):
+        """
+        Initialize the Async Client.
+        """
        self.scraper_token = scraper_token
        self.public_token = public_token
        self.public_key = public_key
 
+        # Pre-calculate proxy auth for performance
        self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
        self.proxy_url = f"http://{proxy_host}:{proxy_port}"
 
+        # API Endpoints
        self.base_url = "https://scraperapi.thordata.com"
        self.universal_url = "https://universalapi.thordata.com"
        self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -41,6 +47,7 @@ class AsyncThordataClient:
        self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
        self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
 
+        # Session is initialized lazily or via context manager
        self._session: Optional[aiohttp.ClientSession] = None
 
    async def __aenter__(self):
@@ -52,16 +59,27 @@ class AsyncThordataClient:
        await self.close()
 
    async def close(self):
+        """Close the underlying aiohttp session."""
        if self._session and not self._session.closed:
            await self._session.close()
            self._session = None
 
-
+    def _get_session(self) -> aiohttp.ClientSession:
+        """Internal helper to ensure session exists."""
+        if self._session is None or self._session.closed:
+            raise RuntimeError(
+                "Client session not initialized. Use 'async with ThordataClient(...) as client:'"
+            )
+        return self._session
+
    async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-
-
+        """
+        Send an async GET request through the Proxy Network.
+        """
+        session = self._get_session()
        try:
-
+            logger.debug(f"Async Proxy Request: {url}")
+            return await session.get(
                url,
                proxy=self.proxy_url,
                proxy_auth=self.proxy_auth,
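The new _get_session helper raises a RuntimeError unless the client is used inside its async context manager, so 0.3.1 code should follow the 'async with' pattern named in that error message. A minimal sketch, assuming __aenter__ opens the aiohttp session as the comments above indicate (token values are placeholders):

import asyncio

from thordata import AsyncThordataClient


async def main():
    # Entering the context manager creates the session; exiting awaits close().
    async with AsyncThordataClient(
        scraper_token="YOUR_SCRAPER_TOKEN",
        public_token="YOUR_PUBLIC_TOKEN",
        public_key="YOUR_PUBLIC_KEY",
    ) as client:
        # get() routes the request through the Thordata proxy gateway.
        response = await client.get("https://httpbin.org/ip")
        print(response.status)
        print(await response.text())


asyncio.run(main())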
@@ -71,7 +89,6 @@ class AsyncThordataClient:
            logger.error(f"Async Request failed: {e}")
            raise
 
-    # --- SERP (Optimized) ---
    async def serp_search(
        self,
        query: str,
@@ -82,13 +99,12 @@ class AsyncThordataClient:
        """
        Execute a real-time SERP search (Async).
        """
-
-        raise RuntimeError("Client session not initialized.")
+        session = self._get_session()
 
-        # 1.
+        # 1. Handle Enum conversion
        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
 
-        # 2.
+        # 2. Normalize parameters
        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
 
        headers = {
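A hedged sketch of calling the reworked serp_search, reusing a client opened via 'async with' as in the earlier example. The engine and num keyword names come from the method body above; the "google" value and num=10 are illustrative assumptions, since the Engine members live in enums.py, which is not shown in this diff:

from thordata import AsyncThordataClient


async def search_example(client: AsyncThordataClient):
    # engine accepts an Engine member or a plain string (strings are lowercased);
    # "google" and num=10 are assumptions for illustration only.
    return await client.serp_search("thordata sdk", engine="google", num=10)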
@@ -96,30 +112,34 @@ class AsyncThordataClient:
            "Content-Type": "application/x-www-form-urlencoded"
        }
 
-        # 3.
-
+        # 3. Execute Request
+        logger.info(f"Async SERP Search: {engine_str} - {query}")
+        async with session.post(
            self.SERP_API_URL, data=payload, headers=headers
        ) as response:
            response.raise_for_status()
+
            data = await response.json()
+            # Handle double-encoded JSON strings if they occur
            if isinstance(data, str):
                try:
                    data = json.loads(data)
-                except
+                except json.JSONDecodeError:
                    pass
            return data
 
-    # --- Universal (Unchanged) ---
    async def universal_scrape(
        self,
        url: str,
        js_render: bool = False,
        output_format: str = "HTML",
-        country: str = None,
+        country: Optional[str] = None,
        block_resources: bool = False
    ) -> Union[str, bytes]:
-
-
+        """
+        Async Universal Scraping (Bypass Cloudflare/CAPTCHA).
+        """
+        session = self._get_session()
 
        headers = {
            "Authorization": f"Bearer {self.scraper_token}",
@@ -135,18 +155,21 @@ class AsyncThordataClient:
        if country:
            payload["country"] = country
 
-
+        logger.info(f"Async Universal Scrape: {url}")
+        async with session.post(
            self.UNIVERSAL_API_URL, data=payload, headers=headers
        ) as response:
            response.raise_for_status()
 
            try:
                resp_json = await response.json()
-            except
+            except json.JSONDecodeError:
+                # Fallback for raw content
                if output_format.upper() == "PNG":
                    return await response.read()
                return await response.text()
 
+            # Check API error codes
            if isinstance(resp_json, dict) and resp_json.get("code") \
                    and resp_json.get("code") != 200:
                raise Exception(f"Universal API Error: {resp_json}")
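Given the fallback and base64 handling above, a PNG scrape is expected to come back as bytes, while the default HTML output comes back as a string. A minimal sketch, again assuming an already-opened client; the country value is an illustrative assumption:

from thordata import AsyncThordataClient


async def screenshot_example(client: AsyncThordataClient):
    # PNG output is decoded to bytes by the client; HTML output would be a str.
    png_bytes = await client.universal_scrape(
        "https://example.com",
        js_render=True,
        output_format="PNG",
        country="us",            # optional geo targeting; value assumed
        block_resources=False,
    )
    with open("example.png", "wb") as f:
        f.write(png_bytes)
    return png_bytes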
@@ -159,39 +182,38 @@ class AsyncThordataClient:
                if not png_str:
                    raise Exception("API returned empty PNG data")
 
-                #
+                # Clean Data URI Scheme
                if "," in png_str:
                    png_str = png_str.split(",", 1)[1]
 
+                # Fix Base64 Padding
                png_str = png_str.replace("\n", "").replace("\r", "")
                missing_padding = len(png_str) % 4
                if missing_padding:
                    png_str += '=' * (4 - missing_padding)
+
                return base64.b64decode(png_str)
 
            return str(resp_json)
 
-    # --- Web Scraper (Optimized) ---
    async def create_scraper_task(
        self,
        file_name: str,
        spider_id: str,
        spider_name: str,
        individual_params: Dict[str, Any],
-        universal_params: Dict[str, Any] = None
+        universal_params: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Create an Asynchronous Web Scraper Task.
        """
-
-        raise RuntimeError("Client session not initialized.")
+        session = self._get_session()
 
        headers = {
            "Authorization": f"Bearer {self.scraper_token}",
            "Content-Type": "application/x-www-form-urlencoded"
        }
 
-        # Simplify payload construction and remove unnecessary checks
        payload = {
            "file_name": file_name,
            "spider_id": spider_id,
@@ -202,17 +224,23 @@ class AsyncThordataClient:
        if universal_params:
            payload["spider_universal"] = json.dumps(universal_params)
 
-
+        logger.info(f"Async Task Creation: {spider_name}")
+        async with session.post(
            self.SCRAPER_BUILDER_URL, data=payload, headers=headers
        ) as response:
            response.raise_for_status()
            data = await response.json()
+
            if data.get("code") != 200:
                raise Exception(f"Creation failed: {data}")
            return data["data"]["task_id"]
 
-    # --- Status & Result (Unchanged) ---
    async def get_task_status(self, task_id: str) -> str:
+        """
+        Check task status.
+        """
+        session = self._get_session()
+
        headers = {
            "token": self.public_token,
            "key": self.public_key,
@@ -220,28 +248,34 @@ class AsyncThordataClient:
        }
        payload = {"tasks_ids": task_id}
 
-        async with
+        async with session.post(
            self.SCRAPER_STATUS_URL, data=payload, headers=headers
        ) as response:
            data = await response.json()
            if data.get("code") == 200 and data.get("data"):
                for item in data["data"]:
-                    if str(item
+                    if str(item.get("task_id")) == str(task_id):
                        return item["status"]
            return "Unknown"
 
    async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
+        """
+        Get the download URL for a finished task.
+        """
+        session = self._get_session()
+
        headers = {
            "token": self.public_token,
            "key": self.public_key,
            "Content-Type": "application/x-www-form-urlencoded"
        }
-
+        # Fixed: Use the file_type argument instead of hardcoding "json"
+        payload = {"tasks_id": task_id, "type": file_type}
 
-        async with
+        async with session.post(
            self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
        ) as response:
            data = await response.json()
-            if data.get("code") == 200:
+            if data.get("code") == 200 and data.get("data"):
                return data["data"]["download"]
            raise Exception(f"Result Error: {data}")
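Taken together, the task endpoints suggest a create / poll / download workflow. A sketch under assumptions: the spider_id and spider_name placeholders, the shape of individual_params, the terminal status strings, and the 5-second polling interval are illustrative and not documented in this diff:

import asyncio

from thordata import AsyncThordataClient


async def run_scraper_task(client: AsyncThordataClient) -> str:
    # create_scraper_task returns the new task_id on success.
    task_id = await client.create_scraper_task(
        file_name="products",
        spider_id="YOUR_SPIDER_ID",      # assumption: taken from your dashboard
        spider_name="YOUR_SPIDER_NAME",
        individual_params={"url": "https://example.com/item/1"},
    )

    # Poll until the task reaches a terminal state (status values assumed).
    while True:
        status = await client.get_task_status(task_id)
        if status.lower() in ("success", "finished", "failed"):
            break
        await asyncio.sleep(5)

    # get_task_result returns a download URL for the finished task.
    return await client.get_task_result(task_id, file_type="json")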