thordata-sdk 0.2.4__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/__init__.py ADDED
@@ -0,0 +1,16 @@
+ # src/thordata/__init__.py
+
+ from .client import ThordataClient
+ from .async_client import AsyncThordataClient
+ from .enums import Engine, GoogleSearchType
+
+ # Package version
+ __version__ = "0.3.1"
+
+ # Explicitly export classes to simplify user imports
+ __all__ = [
+     "ThordataClient",
+     "AsyncThordataClient",
+     "Engine",
+     "GoogleSearchType"
+ ]
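
The new `__init__.py` flattens the import path: both clients and the enums are now available at the package root. A minimal sketch of the resulting import surface (names and version taken verbatim from the file above):

```python
# Sketch: the import surface defined by thordata/__init__.py in 0.3.1.
from thordata import ThordataClient, AsyncThordataClient, Engine, GoogleSearchType

import thordata
print(thordata.__version__)  # "0.3.1"
```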
thordata/async_client.py CHANGED
@@ -4,7 +4,7 @@ import json
  import base64
  from typing import Optional, Dict, Any, Union

- # Reuse the logic and enums we just wrote
+ # Import shared logic
  from .enums import Engine
  from .parameters import normalize_serp_params

@@ -13,7 +13,8 @@ logger = logging.getLogger(__name__)

  class AsyncThordataClient:
      """
-     Thordata Asynchronous Client (built on aiohttp).
+     The official Asynchronous Python client for Thordata (built on aiohttp).
+     Designed for high-concurrency AI agents and data pipelines.
      """

      def __init__(
@@ -24,13 +25,18 @@ class AsyncThordataClient:
          proxy_host: str = "gate.thordata.com",
          proxy_port: int = 22225
      ):
+         """
+         Initialize the Async Client.
+         """
          self.scraper_token = scraper_token
          self.public_token = public_token
          self.public_key = public_key

+         # Pre-calculate proxy auth for performance
          self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
          self.proxy_url = f"http://{proxy_host}:{proxy_port}"

+         # API Endpoints
          self.base_url = "https://scraperapi.thordata.com"
          self.universal_url = "https://universalapi.thordata.com"
          self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -41,6 +47,7 @@ class AsyncThordataClient:
          self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
          self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"

+         # Session is initialized lazily or via context manager
          self._session: Optional[aiohttp.ClientSession] = None

      async def __aenter__(self):
@@ -52,16 +59,27 @@ class AsyncThordataClient:
          await self.close()

      async def close(self):
+         """Close the underlying aiohttp session."""
          if self._session and not self._session.closed:
              await self._session.close()
          self._session = None

-     # --- Proxy (Unchanged) ---
+     def _get_session(self) -> aiohttp.ClientSession:
+         """Internal helper to ensure session exists."""
+         if self._session is None or self._session.closed:
+             raise RuntimeError(
+                 "Client session not initialized. Use 'async with ThordataClient(...) as client:'"
+             )
+         return self._session
+
      async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-         if self._session is None:
-             raise RuntimeError("Client session not initialized.")
+         """
+         Send an async GET request through the Proxy Network.
+         """
+         session = self._get_session()
          try:
-             return await self._session.get(
+             logger.debug(f"Async Proxy Request: {url}")
+             return await session.get(
                  url,
                  proxy=self.proxy_url,
                  proxy_auth=self.proxy_auth,
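
Since the refactored `get` resolves its session via `_get_session`, the client must be used as an async context manager, as the new error message spells out. A minimal usage sketch, assuming `__aenter__` opens the session; credentials and the target URL are placeholders:

```python
# Sketch: a proxied GET through AsyncThordataClient.
# Credentials and the target URL are placeholders.
import asyncio
from thordata import AsyncThordataClient

async def main() -> None:
    async with AsyncThordataClient(
        scraper_token="YOUR_SCRAPER_TOKEN",
        public_token="YOUR_PUBLIC_TOKEN",
        public_key="YOUR_PUBLIC_KEY",
    ) as client:
        # Routed through http://gate.thordata.com:22225 with Basic proxy auth.
        response = await client.get("https://httpbin.org/ip")
        print(await response.text())

asyncio.run(main())
```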
@@ -71,7 +89,6 @@ class AsyncThordataClient:
              logger.error(f"Async Request failed: {e}")
              raise

-     # --- SERP (Optimized) ---
      async def serp_search(
          self,
          query: str,
@@ -82,13 +99,12 @@ class AsyncThordataClient:
          """
          Execute a real-time SERP search (Async).
          """
-         if self._session is None:
-             raise RuntimeError("Client session not initialized.")
+         session = self._get_session()

-         # 1. Convert the enum
+         # 1. Handle Enum conversion
          engine_str = engine.value if isinstance(engine, Engine) else engine.lower()

-         # 2. Call the shared logic in parameters.py (Don't Repeat Yourself!)
+         # 2. Normalize parameters
          payload = normalize_serp_params(engine_str, query, num=num, **kwargs)

          headers = {
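
With the duplicated session check and parameter handling gone, a SERP call reduces to one method call. A usage sketch (credentials are placeholders; `Engine.GOOGLE` is an assumed member of the `Engine` enum):

```python
# Sketch: real-time SERP search. Credentials are placeholders;
# Engine.GOOGLE is an assumed enum member.
import asyncio
from thordata import AsyncThordataClient, Engine

async def main() -> None:
    async with AsyncThordataClient(
        scraper_token="YOUR_SCRAPER_TOKEN",
        public_token="YOUR_PUBLIC_TOKEN",
        public_key="YOUR_PUBLIC_KEY",
    ) as client:
        # engine accepts an Engine member or a plain string such as "google".
        results = await client.serp_search("web scraping", engine=Engine.GOOGLE, num=10)
        print(results)

asyncio.run(main())
```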
@@ -96,30 +112,34 @@ class AsyncThordataClient:
              "Content-Type": "application/x-www-form-urlencoded"
          }

-         # 3. Send the request
-         async with self._session.post(
+         # 3. Execute Request
+         logger.info(f"Async SERP Search: {engine_str} - {query}")
+         async with session.post(
              self.SERP_API_URL, data=payload, headers=headers
          ) as response:
              response.raise_for_status()
+
              data = await response.json()
+             # Handle double-encoded JSON strings if they occur
              if isinstance(data, str):
                  try:
                      data = json.loads(data)
-                 except Exception:
+                 except json.JSONDecodeError:
                      pass
              return data

-     # --- Universal (Unchanged) ---
      async def universal_scrape(
          self,
          url: str,
          js_render: bool = False,
          output_format: str = "HTML",
-         country: str = None,
+         country: Optional[str] = None,
          block_resources: bool = False
      ) -> Union[str, bytes]:
-         if self._session is None:
-             raise RuntimeError("Client session not initialized.")
+         """
+         Async Universal Scraping (Bypass Cloudflare/CAPTCHA).
+         """
+         session = self._get_session()

          headers = {
              "Authorization": f"Bearer {self.scraper_token}",
@@ -135,18 +155,21 @@ class AsyncThordataClient:
          if country:
              payload["country"] = country

-         async with self._session.post(
+         logger.info(f"Async Universal Scrape: {url}")
+         async with session.post(
              self.UNIVERSAL_API_URL, data=payload, headers=headers
          ) as response:
              response.raise_for_status()

              try:
                  resp_json = await response.json()
-             except Exception:
+             except json.JSONDecodeError:
+                 # Fallback for raw content
                  if output_format.upper() == "PNG":
                      return await response.read()
                  return await response.text()

+             # Check API error codes
              if isinstance(resp_json, dict) and resp_json.get("code") \
                      and resp_json.get("code") != 200:
                  raise Exception(f"Universal API Error: {resp_json}")
@@ -159,39 +182,38 @@ class AsyncThordataClient:
              if not png_str:
                  raise Exception("API returned empty PNG data")

-             # 🛠️ FIX: Strip the Data URI scheme prefix
+             # Clean Data URI Scheme
              if "," in png_str:
                  png_str = png_str.split(",", 1)[1]

+             # Fix Base64 Padding
              png_str = png_str.replace("\n", "").replace("\r", "")
              missing_padding = len(png_str) % 4
              if missing_padding:
                  png_str += '=' * (4 - missing_padding)
+
              return base64.b64decode(png_str)

          return str(resp_json)

-     # --- Web Scraper (Optimized) ---
      async def create_scraper_task(
          self,
          file_name: str,
          spider_id: str,
          spider_name: str,
          individual_params: Dict[str, Any],
-         universal_params: Dict[str, Any] = None
+         universal_params: Optional[Dict[str, Any]] = None
      ) -> str:
          """
          Create an Asynchronous Web Scraper Task.
          """
-         if self._session is None:
-             raise RuntimeError("Client session not initialized.")
+         session = self._get_session()

          headers = {
              "Authorization": f"Bearer {self.scraper_token}",
              "Content-Type": "application/x-www-form-urlencoded"
          }

-         # Simplify payload construction; remove unnecessary checks
          payload = {
              "file_name": file_name,
              "spider_id": spider_id,
@@ -202,17 +224,23 @@ class AsyncThordataClient:
          if universal_params:
              payload["spider_universal"] = json.dumps(universal_params)

-         async with self._session.post(
+         logger.info(f"Async Task Creation: {spider_name}")
+         async with session.post(
              self.SCRAPER_BUILDER_URL, data=payload, headers=headers
          ) as response:
              response.raise_for_status()
              data = await response.json()
+
              if data.get("code") != 200:
                  raise Exception(f"Creation failed: {data}")
              return data["data"]["task_id"]

-     # --- Status & Result (Unchanged) ---
      async def get_task_status(self, task_id: str) -> str:
+         """
+         Check task status.
+         """
+         session = self._get_session()
+
          headers = {
              "token": self.public_token,
              "key": self.public_key,
@@ -220,28 +248,34 @@ class AsyncThordataClient:
          }
          payload = {"tasks_ids": task_id}

-         async with self._session.post(
+         async with session.post(
              self.SCRAPER_STATUS_URL, data=payload, headers=headers
          ) as response:
              data = await response.json()
              if data.get("code") == 200 and data.get("data"):
                  for item in data["data"]:
-                     if str(item["task_id"]) == str(task_id):
+                     if str(item.get("task_id")) == str(task_id):
                          return item["status"]
              return "Unknown"

      async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
+         """
+         Get the download URL for a finished task.
+         """
+         session = self._get_session()
+
          headers = {
              "token": self.public_token,
              "key": self.public_key,
              "Content-Type": "application/x-www-form-urlencoded"
          }
-         payload = {"tasks_id": task_id, "type": "json"}
+         # Fixed: Use the file_type argument instead of hardcoding "json"
+         payload = {"tasks_id": task_id, "type": file_type}

-         async with self._session.post(
+         async with session.post(
              self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
          ) as response:
              data = await response.json()
-             if data.get("code") == 200:
+             if data.get("code") == 200 and data.get("data"):
                  return data["data"]["download"]
              raise Exception(f"Result Error: {data}")