thordata-sdk 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # src/thordata/__init__.py
2
+
3
+ from .client import ThordataClient
4
+ from .async_client import AsyncThordataClient
5
+ from .enums import Engine, GoogleSearchType
6
+
7
+ # Package version
8
+ __version__ = "0.3.0"
9
+
10
+ # Explicitly export classes to simplify user imports
11
+ __all__ = [
12
+ "ThordataClient",
13
+ "AsyncThordataClient",
14
+ "Engine",
15
+ "GoogleSearchType"
16
+ ]
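The top-level package is renamed from `thordata_sdk` to `thordata`, and the enums now ship in `__all__`. A minimal migration sketch (assuming 0.3.0 is installed; the printed values follow directly from the file above):

```python
# 0.2.4: from thordata_sdk import ThordataClient
# 0.3.0: the package root is now `thordata`
from thordata import ThordataClient, AsyncThordataClient, Engine, GoogleSearchType
import thordata

print(thordata.__version__)  # "0.3.0"
print(Engine.GOOGLE.value)   # "google"
```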
thordata/async_client.py CHANGED
@@ -4,7 +4,7 @@ import json
4
4
  import base64
5
5
  from typing import Optional, Dict, Any, Union
6
6
 
7
- # Reuse the logic and enums we just wrote
7
+ # Import shared logic
8
8
  from .enums import Engine
9
9
  from .parameters import normalize_serp_params
10
10
 
@@ -13,7 +13,8 @@ logger = logging.getLogger(__name__)
13
13
 
14
14
  class AsyncThordataClient:
15
15
  """
16
- Thordata Asynchronous Client (built on aiohttp).
16
+ The official Asynchronous Python client for Thordata (built on aiohttp).
17
+ Designed for high-concurrency AI agents and data pipelines.
17
18
  """
18
19
 
19
20
  def __init__(
@@ -24,13 +25,18 @@ class AsyncThordataClient:
24
25
  proxy_host: str = "gate.thordata.com",
25
26
  proxy_port: int = 22225
26
27
  ):
28
+ """
29
+ Initialize the Async Client.
30
+ """
27
31
  self.scraper_token = scraper_token
28
32
  self.public_token = public_token
29
33
  self.public_key = public_key
30
34
 
35
+ # Pre-calculate proxy auth for performance
31
36
  self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
32
37
  self.proxy_url = f"http://{proxy_host}:{proxy_port}"
33
38
 
39
+ # API Endpoints
34
40
  self.base_url = "https://scraperapi.thordata.com"
35
41
  self.universal_url = "https://universalapi.thordata.com"
36
42
  self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -41,6 +47,7 @@ class AsyncThordataClient:
41
47
  self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
42
48
  self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
43
49
 
50
+ # Session is initialized lazily or via context manager
44
51
  self._session: Optional[aiohttp.ClientSession] = None
45
52
 
46
53
  async def __aenter__(self):
@@ -52,16 +59,27 @@ class AsyncThordataClient:
52
59
  await self.close()
53
60
 
54
61
  async def close(self):
62
+ """Close the underlying aiohttp session."""
55
63
  if self._session and not self._session.closed:
56
64
  await self._session.close()
57
65
  self._session = None
58
66
 
59
- # --- Proxy (Unchanged) ---
67
+ def _get_session(self) -> aiohttp.ClientSession:
68
+ """Internal helper to ensure session exists."""
69
+ if self._session is None or self._session.closed:
70
+ raise RuntimeError(
71
+ "Client session not initialized. Use 'async with ThordataClient(...) as client:'"
72
+ )
73
+ return self._session
74
+
60
75
  async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
61
- if self._session is None:
62
- raise RuntimeError("Client session not initialized.")
76
+ """
77
+ Send an async GET request through the Proxy Network.
78
+ """
79
+ session = self._get_session()
63
80
  try:
64
- return await self._session.get(
81
+ logger.debug(f"Async Proxy Request: {url}")
82
+ return await session.get(
65
83
  url,
66
84
  proxy=self.proxy_url,
67
85
  proxy_auth=self.proxy_auth,
@@ -71,7 +89,6 @@ class AsyncThordataClient:
71
89
  logger.error(f"Async Request failed: {e}")
72
90
  raise
73
91
 
74
- # --- SERP (Optimized) ---
75
92
  async def serp_search(
76
93
  self,
77
94
  query: str,
@@ -82,13 +99,12 @@ class AsyncThordataClient:
82
99
  """
83
100
  Execute a real-time SERP search (Async).
84
101
  """
85
- if self._session is None:
86
- raise RuntimeError("Client session not initialized.")
102
+ session = self._get_session()
87
103
 
88
- # 1. Convert the enum
104
+ # 1. Handle Enum conversion
89
105
  engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
90
106
 
91
- # 2. Call the shared logic in parameters.py (Don't Repeat Yourself!)
107
+ # 2. Normalize parameters
92
108
  payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
93
109
 
94
110
  headers = {
@@ -96,30 +112,34 @@ class AsyncThordataClient:
96
112
  "Content-Type": "application/x-www-form-urlencoded"
97
113
  }
98
114
 
99
- # 3. Send the request
100
- async with self._session.post(
115
+ # 3. Execute Request
116
+ logger.info(f"Async SERP Search: {engine_str} - {query}")
117
+ async with session.post(
101
118
  self.SERP_API_URL, data=payload, headers=headers
102
119
  ) as response:
103
120
  response.raise_for_status()
121
+
104
122
  data = await response.json()
123
+ # Handle double-encoded JSON strings if they occur
105
124
  if isinstance(data, str):
106
125
  try:
107
126
  data = json.loads(data)
108
- except Exception:
127
+ except json.JSONDecodeError:
109
128
  pass
110
129
  return data
111
130
 
112
- # --- Universal (Unchanged) ---
113
131
  async def universal_scrape(
114
132
  self,
115
133
  url: str,
116
134
  js_render: bool = False,
117
135
  output_format: str = "HTML",
118
- country: str = None,
136
+ country: Optional[str] = None,
119
137
  block_resources: bool = False
120
138
  ) -> Union[str, bytes]:
121
- if self._session is None:
122
- raise RuntimeError("Client session not initialized.")
139
+ """
140
+ Async Universal Scraping (Bypass Cloudflare/CAPTCHA).
141
+ """
142
+ session = self._get_session()
123
143
 
124
144
  headers = {
125
145
  "Authorization": f"Bearer {self.scraper_token}",
@@ -135,18 +155,21 @@ class AsyncThordataClient:
135
155
  if country:
136
156
  payload["country"] = country
137
157
 
138
- async with self._session.post(
158
+ logger.info(f"Async Universal Scrape: {url}")
159
+ async with session.post(
139
160
  self.UNIVERSAL_API_URL, data=payload, headers=headers
140
161
  ) as response:
141
162
  response.raise_for_status()
142
163
 
143
164
  try:
144
165
  resp_json = await response.json()
145
- except Exception:
166
+ except json.JSONDecodeError:
167
+ # Fallback for raw content
146
168
  if output_format.upper() == "PNG":
147
169
  return await response.read()
148
170
  return await response.text()
149
171
 
172
+ # Check API error codes
150
173
  if isinstance(resp_json, dict) and resp_json.get("code") \
151
174
  and resp_json.get("code") != 200:
152
175
  raise Exception(f"Universal API Error: {resp_json}")
@@ -159,39 +182,38 @@ class AsyncThordataClient:
159
182
  if not png_str:
160
183
  raise Exception("API returned empty PNG data")
161
184
 
162
- # 🛠️ FIX: strip the Data URI Scheme prefix
185
+ # Clean Data URI Scheme
163
186
  if "," in png_str:
164
187
  png_str = png_str.split(",", 1)[1]
165
188
 
189
+ # Fix Base64 Padding
166
190
  png_str = png_str.replace("\n", "").replace("\r", "")
167
191
  missing_padding = len(png_str) % 4
168
192
  if missing_padding:
169
193
  png_str += '=' * (4 - missing_padding)
194
+
170
195
  return base64.b64decode(png_str)
171
196
 
172
197
  return str(resp_json)
173
198
 
174
- # --- Web Scraper (Optimized) ---
175
199
  async def create_scraper_task(
176
200
  self,
177
201
  file_name: str,
178
202
  spider_id: str,
179
203
  spider_name: str,
180
204
  individual_params: Dict[str, Any],
181
- universal_params: Dict[str, Any] = None
205
+ universal_params: Optional[Dict[str, Any]] = None
182
206
  ) -> str:
183
207
  """
184
208
  Create an Asynchronous Web Scraper Task.
185
209
  """
186
- if self._session is None:
187
- raise RuntimeError("Client session not initialized.")
210
+ session = self._get_session()
188
211
 
189
212
  headers = {
190
213
  "Authorization": f"Bearer {self.scraper_token}",
191
214
  "Content-Type": "application/x-www-form-urlencoded"
192
215
  }
193
216
 
194
- # Simplify payload construction and drop unnecessary checks
195
217
  payload = {
196
218
  "file_name": file_name,
197
219
  "spider_id": spider_id,
@@ -202,17 +224,23 @@ class AsyncThordataClient:
202
224
  if universal_params:
203
225
  payload["spider_universal"] = json.dumps(universal_params)
204
226
 
205
- async with self._session.post(
227
+ logger.info(f"Async Task Creation: {spider_name}")
228
+ async with session.post(
206
229
  self.SCRAPER_BUILDER_URL, data=payload, headers=headers
207
230
  ) as response:
208
231
  response.raise_for_status()
209
232
  data = await response.json()
233
+
210
234
  if data.get("code") != 200:
211
235
  raise Exception(f"Creation failed: {data}")
212
236
  return data["data"]["task_id"]
213
237
 
214
- # --- Status & Result (Unchanged) ---
215
238
  async def get_task_status(self, task_id: str) -> str:
239
+ """
240
+ Check task status.
241
+ """
242
+ session = self._get_session()
243
+
216
244
  headers = {
217
245
  "token": self.public_token,
218
246
  "key": self.public_key,
@@ -220,28 +248,34 @@ class AsyncThordataClient:
220
248
  }
221
249
  payload = {"tasks_ids": task_id}
222
250
 
223
- async with self._session.post(
251
+ async with session.post(
224
252
  self.SCRAPER_STATUS_URL, data=payload, headers=headers
225
253
  ) as response:
226
254
  data = await response.json()
227
255
  if data.get("code") == 200 and data.get("data"):
228
256
  for item in data["data"]:
229
- if str(item["task_id"]) == str(task_id):
257
+ if str(item.get("task_id")) == str(task_id):
230
258
  return item["status"]
231
259
  return "Unknown"
232
260
 
233
261
  async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
262
+ """
263
+ Get the download URL for a finished task.
264
+ """
265
+ session = self._get_session()
266
+
234
267
  headers = {
235
268
  "token": self.public_token,
236
269
  "key": self.public_key,
237
270
  "Content-Type": "application/x-www-form-urlencoded"
238
271
  }
239
- payload = {"tasks_id": task_id, "type": "json"}
272
+ # Fixed: Use the file_type argument instead of hardcoding "json"
273
+ payload = {"tasks_id": task_id, "type": file_type}
240
274
 
241
- async with self._session.post(
275
+ async with session.post(
242
276
  self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
243
277
  ) as response:
244
278
  data = await response.json()
245
- if data.get("code") == 200:
279
+ if data.get("code") == 200 and data.get("data"):
246
280
  return data["data"]["download"]
247
281
  raise Exception(f"Result Error: {data}")
thordata/client.py CHANGED
@@ -7,7 +7,7 @@ from typing import Dict, Any, Union, Optional
7
7
  from .enums import Engine
8
8
  from .parameters import normalize_serp_params
9
9
 
10
- # Configure a library-specific logger
10
+ # Configure a library-specific logger to avoid interfering with user's logging
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
13
 
@@ -15,11 +15,11 @@ class ThordataClient:
15
15
  """
16
16
  The official synchronous Python client for Thordata.
17
17
 
18
- Handles authentication for:
19
- 1. Proxy Network (HTTP/HTTPS)
20
- 2. SERP API (Real-time Search)
21
- 3. Universal Scraping API (Single Page)
22
- 4. Web Scraper API (Async Task Management)
18
+ This client handles authentication and communication with:
19
+ 1. Proxy Network (Residential/Datacenter via HTTP/HTTPS)
20
+ 2. SERP API (Real-time Search Engine Results)
21
+ 3. Universal Scraping API (Single Page Rendering & Extraction)
22
+ 4. Web Scraper API (Async Task Management for large scale jobs)
23
23
  """
24
24
 
25
25
  def __init__(
@@ -34,11 +34,11 @@ class ThordataClient:
34
34
  Initialize the Thordata Client.
35
35
 
36
36
  Args:
37
- scraper_token (str): Token from Dashboard bottom.
38
- public_token (str): Token from Public API section.
39
- public_key (str): Key from Public API section.
40
- proxy_host (str): Proxy gateway host.
41
- proxy_port (int): Proxy gateway port.
37
+ scraper_token (str): The secret token found at the bottom of the Dashboard.
38
+ public_token (str): The token from the Public API section.
39
+ public_key (str): The key from the Public API section.
40
+ proxy_host (str): The proxy gateway host (default: gate.thordata.com).
41
+ proxy_port (int): The proxy gateway port (default: 22225).
42
42
  """
43
43
  self.scraper_token = scraper_token
44
44
  self.public_token = public_token
@@ -49,7 +49,7 @@ class ThordataClient:
49
49
  f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
50
50
  )
51
51
 
52
- # API Endpoints
52
+ # API Endpoints Definition
53
53
  self.base_url = "https://scraperapi.thordata.com"
54
54
  self.universal_url = "https://universalapi.thordata.com"
55
55
  self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -60,6 +60,7 @@ class ThordataClient:
60
60
  self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
61
61
  self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
62
62
 
63
+ # Initialize Session with Proxy settings
63
64
  self.session = requests.Session()
64
65
  self.session.proxies = {
65
66
  "http": self.proxy_url,
@@ -68,7 +69,14 @@ class ThordataClient:
68
69
 
69
70
  def get(self, url: str, **kwargs) -> requests.Response:
70
71
  """
71
- Send a GET request through the Thordata Proxy Network.
72
+ Send a standard GET request through the Thordata Residential Proxy Network.
73
+
74
+ Args:
75
+ url (str): The target URL.
76
+ **kwargs: Arguments to pass to requests.get().
77
+
78
+ Returns:
79
+ requests.Response: The response object.
72
80
  """
73
81
  logger.debug(f"Proxy Request: {url}")
74
82
  kwargs.setdefault("timeout", 30)
@@ -77,23 +85,26 @@ class ThordataClient:
77
85
  def serp_search(
78
86
  self,
79
87
  query: str,
80
- engine: Union[Engine, str] = Engine.GOOGLE, # Accepts either an enum or a string
88
+ engine: Union[Engine, str] = Engine.GOOGLE,
81
89
  num: int = 10,
82
- **kwargs # Receives all extra parameters (e.g., type="maps")
90
+ **kwargs
83
91
  ) -> Dict[str, Any]:
84
92
  """
85
- Execute a real-time SERP search.
93
+ Execute a real-time SERP (Search Engine Results Page) search.
86
94
 
87
95
  Args:
88
- query: Keywords
89
- engine: 'google', 'bing', 'yandex' etc.
90
- num: Number of results (default 10)
91
- **kwargs: Extra parameters (e.g., type="shopping", location="London")
96
+ query (str): The search keywords.
97
+ engine (Union[Engine, str]): The search engine (e.g., 'google', 'bing').
98
+ num (int): Number of results to retrieve (default 10).
99
+ **kwargs: Additional parameters (e.g., type="shopping", location="London").
100
+
101
+ Returns:
102
+ Dict[str, Any]: The parsed JSON result from the search engine.
92
103
  """
93
- # Compatibility: if the user passed an enum, take its value; if a string, lowercase it
104
+ # Handle Enum or String input for engine
94
105
  engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
95
106
 
96
- # Call the logic in parameters.py
107
+ # Normalize parameters via internal helper
97
108
  payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
98
109
 
99
110
  headers = {
@@ -112,25 +123,38 @@ class ThordataClient:
112
123
  response.raise_for_status()
113
124
 
114
125
  data = response.json()
126
+ # Handle cases where the API returns a stringified JSON
115
127
  if isinstance(data, str):
116
- try: data = json.loads(data)
117
- except: pass
128
+ try:
129
+ data = json.loads(data)
130
+ except json.JSONDecodeError:
131
+ pass
118
132
  return data
119
133
  except Exception as e:
120
134
  logger.error(f"SERP Request Failed: {e}")
121
135
  raise
122
136
 
123
-
124
137
  def universal_scrape(
125
138
  self,
126
139
  url: str,
127
140
  js_render: bool = False,
128
141
  output_format: str = "HTML",
129
- country: str = None,
142
+ country: Optional[str] = None,
130
143
  block_resources: bool = False
131
144
  ) -> Union[str, bytes]:
132
145
  """
133
146
  Unlock target pages via the Universal Scraping API.
147
+ Bypasses Cloudflare, CAPTCHAs, and antibot systems automatically.
148
+
149
+ Args:
150
+ url (str): Target URL.
151
+ js_render (bool): Whether to render JavaScript (Headless Browser).
152
+ output_format (str): "HTML" or "PNG" (screenshot).
153
+ country (Optional[str]): Geo-targeting country code (e.g., 'us').
154
+ block_resources (bool): Block images/css to speed up loading.
155
+
156
+ Returns:
157
+ Union[str, bytes]: HTML string or PNG bytes.
134
158
  """
135
159
  headers = {
136
160
  "Authorization": f"Bearer {self.scraper_token}",
@@ -146,7 +170,7 @@ class ThordataClient:
146
170
  if country:
147
171
  payload["country"] = country
148
172
 
149
- logger.info(f"Universal Scrape: {url}")
173
+ logger.info(f"Universal Scrape: {url} (Format: {output_format})")
150
174
 
151
175
  try:
152
176
  response = self.session.post(
@@ -157,35 +181,35 @@ class ThordataClient:
157
181
  )
158
182
  response.raise_for_status()
159
183
 
160
- # Parse JSON wrapper
184
+ # Attempt to parse JSON wrapper
161
185
  try:
162
186
  resp_json = response.json()
163
187
  except json.JSONDecodeError:
164
- # Fallback for raw response
188
+ # Fallback: if the API returns raw content directly
165
189
  if output_format.upper() == "PNG":
166
190
  return response.content
167
191
  return response.text
168
192
 
169
- # Check API errors
193
+ # Check for API-level errors inside the JSON
170
194
  if isinstance(resp_json, dict) and resp_json.get("code") \
171
195
  and resp_json.get("code") != 200:
172
196
  raise Exception(f"Universal API Error: {resp_json}")
173
197
 
174
- # Extract HTML
198
+ # Case 1: Return HTML
175
199
  if "html" in resp_json:
176
200
  return resp_json["html"]
177
201
 
178
- # Extract PNG
202
+ # Case 2: Return PNG Image
179
203
  if "png" in resp_json:
180
204
  png_str = resp_json["png"]
181
205
  if not png_str:
182
206
  raise Exception("API returned empty PNG data")
183
207
 
184
- # 🛠️ FIX: strip the Data URI Scheme prefix (data:image/png;base64,)
208
+ # Clean Data URI Scheme if present (e.g., data:image/png;base64,...)
185
209
  if "," in png_str:
186
210
  png_str = png_str.split(",", 1)[1]
187
211
 
188
- # Base64 decode (handle padding)
212
+ # Fix Base64 Padding
189
213
  png_str = png_str.replace("\n", "").replace("\r", "")
190
214
  missing_padding = len(png_str) % 4
191
215
  if missing_padding:
@@ -193,6 +217,7 @@ class ThordataClient:
193
217
 
194
218
  return base64.b64decode(png_str)
195
219
 
220
+ # Fallback
196
221
  return str(resp_json)
197
222
 
198
223
  except Exception as e:
@@ -202,22 +227,33 @@ class ThordataClient:
202
227
  def create_scraper_task(
203
228
  self,
204
229
  file_name: str,
205
- spider_id: str, # Required; obtained from the Dashboard
206
- spider_name: str, # Required; e.g. "youtube.com"
207
- individual_params: Dict[str, Any], # The user packs the spider-specific parameters into this dict
208
- universal_params: Dict[str, Any] = None
230
+ spider_id: str,
231
+ spider_name: str,
232
+ individual_params: Dict[str, Any],
233
+ universal_params: Optional[Dict[str, Any]] = None
209
234
  ) -> str:
210
235
  """
211
- Create a generic Web Scraper Task.
236
+ Create a generic Web Scraper Task (Async).
212
237
 
213
- Note: Check the Thordata Dashboard to get the correct 'spider_id' and 'spider_name'.
238
+ IMPORTANT: You must retrieve the correct 'spider_id' and 'spider_name'
239
+ from the Thordata Dashboard before calling this method.
240
+
241
+ Args:
242
+ file_name (str): Name for the output file.
243
+ spider_id (str): The ID of the spider (from Dashboard).
244
+ spider_name (str): The name of the spider (e.g., "youtube.com").
245
+ individual_params (Dict): Parameters specific to the spider.
246
+ universal_params (Optional[Dict]): Global settings for the scraper.
247
+
248
+ Returns:
249
+ str: The created task_id.
214
250
  """
215
251
  headers = {
216
252
  "Authorization": f"Bearer {self.scraper_token}",
217
253
  "Content-Type": "application/x-www-form-urlencoded"
218
254
  }
219
255
 
220
- # Send the payload as-is; do not perform complex validation on the user's behalf, to keep compatibility
256
+ # Payload construction
221
257
  payload = {
222
258
  "spider_name": spider_name,
223
259
  "spider_id": spider_id,
@@ -247,7 +283,13 @@ class ThordataClient:
247
283
 
248
284
  def get_task_status(self, task_id: str) -> str:
249
285
  """
250
- Check the status of a task.
286
+ Check the status of an asynchronous scraping task.
287
+
288
+ Args:
289
+ task_id (str): The ID returned by create_scraper_task.
290
+
291
+ Returns:
292
+ str: The status string (e.g., "finished", "running", "error").
251
293
  """
252
294
  headers = {
253
295
  "token": self.public_token,
@@ -277,6 +319,13 @@ class ThordataClient:
277
319
  def get_task_result(self, task_id: str, file_type: str = "json") -> str:
278
320
  """
279
321
  Retrieve the download URL for a completed task.
322
+
323
+ Args:
324
+ task_id (str): The task ID.
325
+ file_type (str): Format required (default "json").
326
+
327
+ Returns:
328
+ str: The URL to download the result file.
280
329
  """
281
330
  headers = {
282
331
  "token": self.public_token,
@@ -285,7 +334,7 @@ class ThordataClient:
285
334
  }
286
335
  payload = {"tasks_id": task_id, "type": file_type}
287
336
 
288
- logger.info(f"Getting result URL: {task_id}")
337
+ logger.info(f"Getting result URL for Task: {task_id}")
289
338
  try:
290
339
  response = self.session.post(
291
340
  self.SCRAPER_DOWNLOAD_URL,
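Because the PNG branch above strips the Data-URI prefix and fixes base64 padding before decoding, `universal_scrape` hands back raw image bytes that can be written straight to disk. A short sketch (placeholder tokens; the target URL is arbitrary):

```python
from thordata import ThordataClient

client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")

# output_format="PNG" returns decoded bytes; "HTML" returns a string.
png_bytes = client.universal_scrape(
    url="https://example.com",
    js_render=True,
    output_format="PNG",
)

with open("screenshot.png", "wb") as f:
    f.write(png_bytes)
```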
thordata/enums.py ADDED
@@ -0,0 +1,25 @@
1
+ # src/thordata/enums.py
2
+
3
+ from enum import Enum
4
+
5
+ class Engine(str, Enum):
6
+ """
7
+ Supported Search Engines for SERP API.
8
+ """
9
+ GOOGLE = "google"
10
+ BING = "bing"
11
+ YANDEX = "yandex"
12
+ DUCKDUCKGO = "duckduckgo"
13
+ BAIDU = "baidu"
14
+
15
+ class GoogleSearchType(str, Enum):
16
+ """
17
+ Specific search types for Google Engine.
18
+ """
19
+ SEARCH = "search" # Default web search
20
+ MAPS = "maps" # Google Maps
21
+ SHOPPING = "shopping" # Google Shopping
22
+ NEWS = "news" # Google News
23
+ IMAGES = "images" # Google Images
24
+ VIDEOS = "videos" # Google Videos
25
+ # Users can pass other strings manually if needed
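Both enums subclass `str`, so existing string-based call sites keep working while typed constants become available. A hedged sketch of combining them with `serp_search`, where `type` is simply forwarded through `**kwargs`:

```python
from thordata import ThordataClient, Engine, GoogleSearchType

client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")

results = client.serp_search(
    query="coffee near me",
    engine=Engine.GOOGLE,
    type=GoogleSearchType.MAPS.value,  # forwarded to the API as type="maps"
)
```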
thordata/parameters.py ADDED
@@ -0,0 +1,52 @@
1
+ # src/thordata/parameters.py
2
+
3
+ from typing import Dict, Any, Optional
4
+
5
+ def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
6
+ """
7
+ Normalizes parameters across different search engines to ensure a unified API surface.
8
+
9
+ Args:
10
+ engine (str): The search engine to use (e.g., 'google', 'yandex').
11
+ query (str): The search query string.
12
+ **kwargs: Additional parameters to pass to the API.
13
+
14
+ Returns:
15
+ Dict[str, Any]: The constructed payload for the API request.
16
+ """
17
+ # 1. Base parameters
18
+ payload = {
19
+ "num": str(kwargs.get("num", 10)), # Default to 10 results
20
+ "json": "1", # Force JSON response
21
+ "engine": engine,
22
+ }
23
+
24
+ # 2. Handle Query Parameter Differences (Yandex uses 'text', others use 'q')
25
+ if engine == "yandex":
26
+ payload["text"] = query
27
+ # Set default URL for Yandex if not provided
28
+ if "url" not in kwargs:
29
+ payload["url"] = "yandex.com"
30
+ else:
31
+ payload["q"] = query
32
+
33
+ # 3. Handle Default URLs for other engines
34
+ if "url" not in kwargs:
35
+ defaults = {
36
+ "google": "google.com",
37
+ "bing": "bing.com",
38
+ "duckduckgo": "duckduckgo.com",
39
+ "baidu": "baidu.com"
40
+ }
41
+ if engine in defaults:
42
+ payload["url"] = defaults[engine]
43
+
44
+ # 4. Passthrough for all other user-provided arguments
45
+ # This allows support for engine-specific parameters (e.g., tbm, uule, gl)
46
+ # without explicitly defining them all.
47
+ protected_keys = {"num", "engine", "q", "text"}
48
+ for key, value in kwargs.items():
49
+ if key not in protected_keys:
50
+ payload[key] = value
51
+
52
+ return payload
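For illustration, this is roughly what the helper builds for a Yandex query; `page` here is just an arbitrary passthrough kwarg, not a documented parameter:

```python
from thordata.parameters import normalize_serp_params

payload = normalize_serp_params("yandex", "rust async runtime", num=20, page=2)
print(payload)
# {'num': '20', 'json': '1', 'engine': 'yandex',
#  'text': 'rust async runtime', 'url': 'yandex.com', 'page': 2}
```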
thordata_sdk-0.3.0.dist-info/METADATA ADDED
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.4
2
+ Name: thordata-sdk
3
+ Version: 0.3.0
4
+ Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
5
+ Author-email: Thordata Developer Team <support@thordata.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://www.thordata.com
8
+ Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
9
+ Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
10
+ Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
11
+ Keywords: web scraping,proxy,ai,llm,data-mining,serp,thordata
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Classifier: Topic :: Internet :: WWW/HTTP
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: License :: OSI Approved :: Apache Software License
22
+ Classifier: Operating System :: OS Independent
23
+ Requires-Python: >=3.8
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: requests>=2.25.0
27
+ Requires-Dist: aiohttp>=3.8.0
28
+ Dynamic: license-file
29
+
30
+ # Thordata Python SDK
31
+
32
+ <h4 align="center">
33
+ Official Python client for Thordata's Proxy Network, SERP API, Universal Scraping API, and Web Scraper API.
34
+ <br>
35
+ <i>Async-ready, built for AI agents and large-scale data collection.</i>
36
+ </h4>
37
+
38
+ <p align="center">
39
+ <a href="https://pypi.org/project/thordata-sdk/">
40
+ <img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version">
41
+ </a>
42
+ <a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE">
43
+ <img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License">
44
+ </a>
45
+ <a href="https://python.org">
46
+ <img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions">
47
+ </a>
48
+ </p>
49
+
50
+ ---
51
+
52
+ ## Installation
53
+
54
+ ```bash
55
+ pip install thordata-sdk
56
+ ```
57
+
58
+ ## Quick Start
59
+
60
+ All examples below use the unified client:
61
+
62
+ ```python
63
+ from thordata import ThordataClient, AsyncThordataClient
64
+ ```
65
+
66
+ You can copy `examples/.env.example` to `.env` and fill in your tokens from the Thordata Dashboard.
67
+
68
+ ### 1. Proxy Network (Simple GET)
69
+
70
+ ```python
71
+ import os
72
+ from dotenv import load_dotenv
73
+ from thordata import ThordataClient
74
+
75
+ load_dotenv()
76
+
77
+ client = ThordataClient(
78
+ scraper_token=os.getenv("THORDATA_SCRAPER_TOKEN"),
79
+ public_token=os.getenv("THORDATA_PUBLIC_TOKEN"),
80
+ public_key=os.getenv("THORDATA_PUBLIC_KEY"),
81
+ )
82
+
83
+ resp = client.get("http://httpbin.org/ip")
84
+ print(resp.json())
85
+ ```
86
+
87
+ ### 2. SERP API (Google, Bing, Yandex, DuckDuckGo)
88
+
89
+ ```python
90
+ from thordata import ThordataClient, Engine
91
+
92
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
93
+
94
+ results = client.serp_search(
95
+ query="Thordata technology",
96
+ engine=Engine.GOOGLE,
97
+ num=10,
98
+ # Any engine-specific parameters are passed via **kwargs
99
+ # e.g. type="shopping", location="United States"
100
+ )
101
+
102
+ print(len(results.get("organic", [])))
103
+ ```
104
+
105
+ ### 3. Universal Scraping API
106
+
107
+ ```python
108
+ from thordata import ThordataClient
109
+
110
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
111
+
112
+ html = client.universal_scrape(
113
+ url="https://www.google.com",
114
+ js_render=True,
115
+ output_format="HTML",
116
+ )
117
+ print(html[:200])
118
+ ```
119
+
120
+ ### 4. Web Scraper API (Task-based)
121
+
122
+ ```python
123
+ import time
124
+ from thordata import ThordataClient
125
+
126
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
127
+
128
+ task_id = client.create_scraper_task(
129
+ file_name="demo_youtube_data",
130
+ spider_id="youtube_video-post_by-url",
131
+ spider_name="youtube.com",
132
+ individual_params={
133
+ "url": "https://www.youtube.com/@stephcurry/videos",
134
+ "order_by": "",
135
+ "num_of_posts": ""
136
+ },
137
+ )
138
+
139
+ for _ in range(10):
140
+ status = client.get_task_status(task_id)
141
+ print("Status:", status)
142
+ if status in ["Ready", "Success"]:
143
+ break
144
+ if status == "Failed":
145
+ raise RuntimeError("Task failed")
146
+ time.sleep(3)
147
+
148
+ download_url = client.get_task_result(task_id)
149
+ print("Download URL:", download_url)
150
+ ```
151
+
152
+ ### 5. Asynchronous Usage (High Concurrency)
153
+
154
+ ```python
155
+ import asyncio
156
+ from thordata import AsyncThordataClient
157
+
158
+ async def main():
159
+ async with AsyncThordataClient(
160
+ scraper_token="SCRAPER_TOKEN",
161
+ public_token="PUBLIC_TOKEN",
162
+ public_key="PUBLIC_KEY",
163
+ ) as client:
164
+ resp = await client.get("http://httpbin.org/ip")
165
+ print(await resp.json())
166
+
167
+ asyncio.run(main())
168
+ ```
169
+
170
+ More examples are available in the `examples/` directory.
171
+
172
+ ---
173
+
174
+ ## Features
175
+
176
+ | Feature | Status | Description |
177
+ |---------|--------|-------------|
178
+ | Proxy Network | Stable | Residential, ISP, Mobile, Datacenter via HTTP/HTTPS gateway. |
179
+ | SERP API | Stable | Google / Bing / Yandex / DuckDuckGo, flexible parameters. |
180
+ | Universal Scraping API | Stable | JS rendering, HTML / PNG output, antibot bypass. |
181
+ | Web Scraper API | Stable | Task-based scraping for complex sites (YouTube, E-commerce). |
182
+ | Async Client | Stable | aiohttp-based client for high-concurrency workloads. |
183
+
184
+ ---
185
+
186
+ ## Development & Contributing
187
+
188
+ See `CONTRIBUTING.md` for local development and contribution guidelines.
189
+
190
+ ## License
191
+
192
+ This project is licensed under the Apache License 2.0.
193
+
194
+ ## Support
195
+
196
+ For technical support, please contact support@thordata.com
197
+ or verify your tokens and quotas in the Thordata Dashboard.
thordata_sdk-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
1
+ thordata/__init__.py,sha256=HVb6cHBsYRFoA1Sf_y_WSZ88vGV3DsT67rCdbZSuUYE,365
2
+ thordata/async_client.py,sha256=cpBtRIzr8oH6GuZs8gTh505tGYYV1aRFBUzbtmFOfEg,9717
3
+ thordata/client.py,sha256=w_EXs6CLM2qFtFPNU-x_Li66LEH1j7pQb2ca2MDKqyA,12432
4
+ thordata/enums.py,sha256=PGUCQX3jw5a9mX8_JfhuyoR1WriWjWQpAgibVP_bpdM,679
5
+ thordata/parameters.py,sha256=1lNx_BSS8ztBKEj_MXZMaIQQ9_W3EAlS-VFiBqSWb9E,1841
6
+ thordata_sdk-0.3.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
7
+ thordata_sdk-0.3.0.dist-info/METADATA,sha256=Yj6W3vSLkkUhSXTj6AK4AaMfdlJvGOVaK6cFI2MNqV8,5697
8
+ thordata_sdk-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ thordata_sdk-0.3.0.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
10
+ thordata_sdk-0.3.0.dist-info/RECORD,,
thordata_sdk-0.3.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.45.1)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
thordata_sdk-0.3.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ thordata
thordata_sdk/__init__.py DELETED
@@ -1,9 +0,0 @@
1
- # Expose main clients
2
- from .client import ThordataClient
3
- from .async_client import AsyncThordataClient
4
- from .enums import Engine, GoogleSearchType
5
-
6
- # Version of the thordata-sdk package
7
- __version__ = "0.2.4"
8
-
9
- __all__ = ["ThordataClient", "AsyncThordataClient"]
thordata_sdk/enums.py DELETED
@@ -1,20 +0,0 @@
1
- # thordata_sdk/enums.py
2
- from enum import Enum
3
-
4
- class Engine(str, Enum):
5
- """SERP 核心支持的四大引擎"""
6
- GOOGLE = "google"
7
- BING = "bing"
8
- YANDEX = "yandex"
9
- DUCKDUCKGO = "duckduckgo"
10
- BAIDU = "baidu"
11
-
12
- class GoogleSearchType(str, Enum):
13
- """Google 搜索的常见子类型 (参考你的截图)"""
14
- SEARCH = "search" # 默认网页搜索
15
- MAPS = "maps" # 地图
16
- SHOPPING = "shopping" # 购物
17
- NEWS = "news" # 新闻
18
- IMAGES = "images" # 图片
19
- VIDEOS = "videos" # 视频
20
- # 其他冷门的先不写,用户可以通过字符串传参
thordata_sdk/parameters.py DELETED
@@ -1,41 +0,0 @@
1
- # thordata_sdk/parameters.py
2
- from typing import Dict, Any
3
-
4
- def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
5
- """
6
- 统一不同搜索引擎的参数差异。
7
- """
8
- # 1. 基础参数
9
- payload = {
10
- "num": str(kwargs.get("num", 10)),
11
- "json": "1",
12
- "engine": engine,
13
- }
14
-
15
- # 2. 处理查询关键词 (Yandex 用 text,其他用 q)
16
- if engine == "yandex":
17
- payload["text"] = query
18
- # 如果用户没传 url,给个默认的
19
- if "url" not in kwargs:
20
- payload["url"] = "yandex.com"
21
- else:
22
- payload["q"] = query
23
-
24
- # 3. 处理默认 URL (如果用户没传)
25
- if "url" not in kwargs:
26
- defaults = {
27
- "google": "google.com",
28
- "bing": "bing.com",
29
- "duckduckgo": "duckduckgo.com",
30
- "baidu": "baidu.com"
31
- }
32
- if engine in defaults:
33
- payload["url"] = defaults[engine]
34
-
35
- # 4. 把用户传入的其他所有参数(比如 type="shopping", google_domain="google.co.uk")都透传进去
36
- # 这样你就不用去定义那几十种类型了,用户传啥就是啥
37
- for k, v in kwargs.items():
38
- if k not in ["num", "engine", "q", "text"]: # 避免覆盖
39
- payload[k] = v
40
-
41
- return payload
thordata_sdk-0.2.4.dist-info/METADATA DELETED
@@ -1,113 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: thordata-sdk
3
- Version: 0.2.4
4
- Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
5
- Home-page: https://github.com/Thordata/thordata-python-sdk
6
- Author: Thordata Developer Team
7
- Author-email: support@thordata.com
8
- License: Apache License 2.0
9
- Project-URL: Bug Tracker, https://github.com/Thordata/thordata-python-sdk/issues
10
- Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
11
- Classifier: Development Status :: 4 - Beta
12
- Classifier: Intended Audience :: Developers
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.8
15
- Classifier: Programming Language :: Python :: 3.9
16
- Classifier: Programming Language :: Python :: 3.10
17
- Classifier: Programming Language :: Python :: 3.11
18
- Classifier: License :: OSI Approved :: Apache Software License
19
- Classifier: Operating System :: OS Independent
20
- Classifier: Topic :: Internet :: WWW/HTTP
21
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
- Requires-Python: >=3.8
23
- Description-Content-Type: text/markdown
24
- License-File: LICENSE
25
- Requires-Dist: requests>=2.25.0
26
- Requires-Dist: aiohttp>=3.8.0
27
-
28
- # Thordata Python SDK
29
-
30
- <h4 align="center">
31
- The Official Python Client for the Thordata Proxy Network & Web Scraper API.
32
- <br>
33
- <i>High-performance, async-ready, designed for AI Agents and large-scale data collection.</i>
34
- </h4>
35
-
36
- <p align="center">
37
- <a href="https://pypi.org/project/thordata-sdk/"><img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version"></a>
38
- <a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License"></a>
39
- <a href="https://python.org"><img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions"></a>
40
- </p>
41
-
42
- ---
43
-
44
- ## 🛠 Installation
45
-
46
- Install via pip:
47
-
48
- ```bash
49
- pip install thordata-sdk
50
- ```
51
-
52
- ## ⚡ Quick Start
53
-
54
- ### 1. Proxy Usage (Simple GET Request)
55
-
56
- **Python**
57
-
58
- ```python
59
- from thordata_sdk import ThordataClient
60
-
61
- # Initialize with your credentials from the Thordata Dashboard
62
- client = ThordataClient(
63
- scraper_token="YOUR_SCRAPER_TOKEN", # From "Scraping Tool Token"
64
- public_token="YOUR_PUBLIC_TOKEN", # From "Public API"
65
- public_key="YOUR_PUBLIC_KEY" # From "Public API"
66
- )
67
-
68
- # Send a request through the proxy
69
- response = client.get("http://httpbin.org/ip")
70
- print(response.json())
71
- ```
72
-
73
- ### 2. Real-time SERP Search
74
-
75
- **Python**
76
-
77
- ```python
78
- results = client.serp_search("Thordata technology", engine="google")
79
- print(f"Results found: {len(results.get('organic', []))}")
80
- ```
81
-
82
- ### 3. Asynchronous Usage (High Concurrency)
83
-
84
- **Python**
85
-
86
- ```python
87
- import asyncio
88
- from thordata_sdk import AsyncThordataClient
89
-
90
- async def main():
91
- async with AsyncThordataClient(scraper_token="...", public_token="...", public_key="...") as client:
92
- response = await client.get("http://httpbin.org/ip")
93
- print(await response.json())
94
-
95
- asyncio.run(main())
96
- ```
97
-
98
- ## ⚙️ Features Status
99
-
100
- | Feature | Status | Description |
101
- |---------|--------|-------------|
102
- | Proxy Network | ✅ Stable | Synchronous & Asynchronous support via aiohttp. |
103
- | SERP API | ✅ Stable | Real-time Google/Bing/Yandex search results. |
104
- | Web Scraper | ✅ Stable | Async task management for scraping complex sites (e.g., YouTube). |
105
- | Authentication | ✅ Secure | Dual-token system for enhanced security. |
106
-
107
- ## 📄 License
108
-
109
- This project is licensed under the Apache License 2.0.
110
-
111
- ## 📞 Support
112
-
113
- For technical assistance, please contact support@thordata.com or verify your tokens in the Thordata Dashboard.
thordata_sdk-0.2.4.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
1
- thordata_sdk/__init__.py,sha256=TpVRMWiWSkxq6MUoX1LCkfmuZTH9FWC65JbaALeVoVY,268
2
- thordata_sdk/async_client.py,sha256=YIIKddghCzGAvrx2Bqy8XkGcgFLbCPgzkQw-jcq2WH8,8612
3
- thordata_sdk/client.py,sha256=UyRLjRFKep2SLOWExjAJ5EB0ED0BUiBlfWGwts3sykw,10372
4
- thordata_sdk/enums.py,sha256=gKpaqV-_OO7w1LCg9PTuSUiJJq_q4ad5k6f88UlTPQw,639
5
- thordata_sdk/parameters.py,sha256=3ck0XP0lZaUYs4eEZoLLo6zDTClRRrLO9TlggesMmwI,1384
6
- thordata_sdk-0.2.4.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
7
- thordata_sdk-0.2.4.dist-info/METADATA,sha256=mluyngNHvMXlRfAgA4F7JHC6Sc1f0z4cuut3CI42yow,3734
8
- thordata_sdk-0.2.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
9
- thordata_sdk-0.2.4.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
10
- thordata_sdk-0.2.4.dist-info/RECORD,,
thordata_sdk-0.2.4.dist-info/top_level.txt DELETED
@@ -1 +0,0 @@
1
- thordata_sdk