thordata-sdk 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/__init__.py ADDED
@@ -0,0 +1,16 @@
+ # src/thordata/__init__.py
+
+ from .client import ThordataClient
+ from .async_client import AsyncThordataClient
+ from .enums import Engine, GoogleSearchType
+
+ # Package version
+ __version__ = "0.3.0"
+
+ # Explicitly export classes to simplify user imports
+ __all__ = [
+     "ThordataClient",
+     "AsyncThordataClient",
+     "Engine",
+     "GoogleSearchType"
+ ]
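
For orientation while reading the rest of the diff, a minimal sketch (not part of the package) of the 0.3.0 import surface declared above, assuming the new wheel is installed:

```python
# Hypothetical usage of the exports added in thordata/__init__.py (0.3.0).
import thordata
from thordata import ThordataClient, AsyncThordataClient, Engine, GoogleSearchType

print(thordata.__version__)         # "0.3.0"
print(Engine.GOOGLE.value)          # "google"
print(GoogleSearchType.NEWS.value)  # "news"
```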
thordata/async_client.py RENAMED
@@ -4,12 +4,17 @@ import json
  import base64
  from typing import Optional, Dict, Any, Union

+ # Import shared logic
+ from .enums import Engine
+ from .parameters import normalize_serp_params
+
  logger = logging.getLogger(__name__)


  class AsyncThordataClient:
  """
- Thordata Asynchronous Client (built on aiohttp).
+ The official Asynchronous Python client for Thordata (built on aiohttp).
+ Designed for high-concurrency AI agents and data pipelines.
  """

  def __init__(
@@ -20,13 +25,18 @@ class AsyncThordataClient:
  proxy_host: str = "gate.thordata.com",
  proxy_port: int = 22225
  ):
+ """
+ Initialize the Async Client.
+ """
  self.scraper_token = scraper_token
  self.public_token = public_token
  self.public_key = public_key

+ # Pre-calculate proxy auth for performance
  self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
  self.proxy_url = f"http://{proxy_host}:{proxy_port}"

+ # API Endpoints
  self.base_url = "https://scraperapi.thordata.com"
  self.universal_url = "https://universalapi.thordata.com"
  self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -37,6 +47,7 @@ class AsyncThordataClient:
  self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
  self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"

+ # Session is initialized lazily or via context manager
  self._session: Optional[aiohttp.ClientSession] = None

  async def __aenter__(self):
@@ -48,16 +59,27 @@ class AsyncThordataClient:
  await self.close()

  async def close(self):
+ """Close the underlying aiohttp session."""
  if self._session and not self._session.closed:
  await self._session.close()
  self._session = None

- # --- Proxy ---
+ def _get_session(self) -> aiohttp.ClientSession:
+ """Internal helper to ensure session exists."""
+ if self._session is None or self._session.closed:
+ raise RuntimeError(
+ "Client session not initialized. Use 'async with ThordataClient(...) as client:'"
+ )
+ return self._session
+
  async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
- if self._session is None:
- raise RuntimeError("Client session not initialized.")
+ """
+ Send an async GET request through the Proxy Network.
+ """
+ session = self._get_session()
  try:
- return await self._session.get(
+ logger.debug(f"Async Proxy Request: {url}")
+ return await session.get(
  url,
  proxy=self.proxy_url,
  proxy_auth=self.proxy_auth,
@@ -67,55 +89,57 @@ class AsyncThordataClient:
  logger.error(f"Async Request failed: {e}")
  raise

- # --- SERP ---
  async def serp_search(
- self, query: str, engine: str = "google", num: int = 10, **kwargs
+ self,
+ query: str,
+ engine: Union[Engine, str] = Engine.GOOGLE,
+ num: int = 10,
+ **kwargs
  ) -> Dict[str, Any]:
- if self._session is None:
- raise RuntimeError("Client session not initialized.")
+ """
+ Execute a real-time SERP search (Async).
+ """
+ session = self._get_session()

- payload = {
- "q": query, "num": str(num), "json": "1",
- "engine": engine.lower(), **kwargs
- }
- if engine.lower() == 'yandex':
- payload['text'] = payload.pop('q')
- if 'url' not in payload:
- payload['url'] = "yandex.com"
- elif 'url' not in payload:
- if engine == 'google':
- payload['url'] = "google.com"
- elif engine == 'bing':
- payload['url'] = "bing.com"
+ # 1. Handle Enum conversion
+ engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
+
+ # 2. Normalize parameters
+ payload = normalize_serp_params(engine_str, query, num=num, **kwargs)

  headers = {
  "Authorization": f"Bearer {self.scraper_token}",
  "Content-Type": "application/x-www-form-urlencoded"
  }

- async with self._session.post(
+ # 3. Execute Request
+ logger.info(f"Async SERP Search: {engine_str} - {query}")
+ async with session.post(
  self.SERP_API_URL, data=payload, headers=headers
  ) as response:
  response.raise_for_status()
+
  data = await response.json()
+ # Handle double-encoded JSON strings if they occur
  if isinstance(data, str):
  try:
  data = json.loads(data)
- except Exception:
+ except json.JSONDecodeError:
  pass
  return data

- # --- Universal ---
  async def universal_scrape(
  self,
  url: str,
  js_render: bool = False,
  output_format: str = "HTML",
- country: str = None,
+ country: Optional[str] = None,
  block_resources: bool = False
  ) -> Union[str, bytes]:
- if self._session is None:
- raise RuntimeError("Client session not initialized.")
+ """
+ Async Universal Scraping (Bypass Cloudflare/CAPTCHA).
+ """
+ session = self._get_session()

  headers = {
  "Authorization": f"Bearer {self.scraper_token}",
@@ -131,18 +155,21 @@ class AsyncThordataClient:
  if country:
  payload["country"] = country

- async with self._session.post(
+ logger.info(f"Async Universal Scrape: {url}")
+ async with session.post(
  self.UNIVERSAL_API_URL, data=payload, headers=headers
  ) as response:
  response.raise_for_status()

  try:
  resp_json = await response.json()
- except Exception:
+ except json.JSONDecodeError:
+ # Fallback for raw content
  if output_format.upper() == "PNG":
  return await response.read()
  return await response.text()

+ # Check API error codes
  if isinstance(resp_json, dict) and resp_json.get("code") \
  and resp_json.get("code") != 200:
  raise Exception(f"Universal API Error: {resp_json}")
@@ -155,25 +182,32 @@ class AsyncThordataClient:
  if not png_str:
  raise Exception("API returned empty PNG data")

+ # Clean Data URI Scheme
+ if "," in png_str:
+ png_str = png_str.split(",", 1)[1]
+
+ # Fix Base64 Padding
  png_str = png_str.replace("\n", "").replace("\r", "")
  missing_padding = len(png_str) % 4
  if missing_padding:
  png_str += '=' * (4 - missing_padding)
+
  return base64.b64decode(png_str)

  return str(resp_json)

- # --- Web Scraper ---
  async def create_scraper_task(
  self,
  file_name: str,
  spider_id: str,
+ spider_name: str,
  individual_params: Dict[str, Any],
- spider_name: str = "youtube.com",
- universal_params: Dict[str, Any] = None
+ universal_params: Optional[Dict[str, Any]] = None
  ) -> str:
- if self._session is None:
- raise RuntimeError("Client session not initialized.")
+ """
+ Create an Asynchronous Web Scraper Task.
+ """
+ session = self._get_session()

  headers = {
  "Authorization": f"Bearer {self.scraper_token}",
@@ -190,16 +224,23 @@ class AsyncThordataClient:
  if universal_params:
  payload["spider_universal"] = json.dumps(universal_params)

- async with self._session.post(
+ logger.info(f"Async Task Creation: {spider_name}")
+ async with session.post(
  self.SCRAPER_BUILDER_URL, data=payload, headers=headers
  ) as response:
  response.raise_for_status()
  data = await response.json()
+
  if data.get("code") != 200:
  raise Exception(f"Creation failed: {data}")
  return data["data"]["task_id"]

  async def get_task_status(self, task_id: str) -> str:
+ """
+ Check task status.
+ """
+ session = self._get_session()
+
  headers = {
  "token": self.public_token,
  "key": self.public_key,
@@ -207,28 +248,34 @@ class AsyncThordataClient:
  }
  payload = {"tasks_ids": task_id}

- async with self._session.post(
+ async with session.post(
  self.SCRAPER_STATUS_URL, data=payload, headers=headers
  ) as response:
  data = await response.json()
  if data.get("code") == 200 and data.get("data"):
  for item in data["data"]:
- if str(item["task_id"]) == str(task_id):
+ if str(item.get("task_id")) == str(task_id):
  return item["status"]
  return "Unknown"

  async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
+ """
+ Get the download URL for a finished task.
+ """
+ session = self._get_session()
+
  headers = {
  "token": self.public_token,
  "key": self.public_key,
  "Content-Type": "application/x-www-form-urlencoded"
  }
- payload = {"tasks_id": task_id, "type": "json"}
+ # Fixed: Use the file_type argument instead of hardcoding "json"
+ payload = {"tasks_id": task_id, "type": file_type}

- async with self._session.post(
+ async with session.post(
  self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
  ) as response:
  data = await response.json()
- if data.get("code") == 200:
+ if data.get("code") == 200 and data.get("data"):
  return data["data"]["download"]
  raise Exception(f"Result Error: {data}")
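
A brief, hypothetical usage sketch of the reworked async client (token values are placeholders): in 0.3.0 the aiohttp session is only created by the context manager, so calling a request method on an un-entered client raises the RuntimeError from `_get_session()` shown above.

```python
import asyncio
from thordata import AsyncThordataClient, Engine

async def main():
    # Placeholder credentials; real tokens come from the Thordata Dashboard.
    async with AsyncThordataClient(
        scraper_token="SCRAPER_TOKEN",
        public_token="PUBLIC_TOKEN",
        public_key="PUBLIC_KEY",
    ) as client:
        # engine now accepts the Engine enum or a plain string.
        results = await client.serp_search("thordata", engine=Engine.BING, num=5)
        print(type(results))

asyncio.run(main())
```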
thordata/client.py RENAMED
@@ -2,9 +2,12 @@ import requests
  import logging
  import json
  import base64
- from typing import Dict, Any, Union
+ from typing import Dict, Any, Union, Optional

- # Configure a library-specific logger
+ from .enums import Engine
+ from .parameters import normalize_serp_params
+
+ # Configure a library-specific logger to avoid interfering with user's logging
  logger = logging.getLogger(__name__)


@@ -12,11 +15,11 @@ class ThordataClient:
  """
  The official synchronous Python client for Thordata.

- Handles authentication for:
- 1. Proxy Network (HTTP/HTTPS)
- 2. SERP API (Real-time Search)
- 3. Universal Scraping API (Single Page)
- 4. Web Scraper API (Async Task Management)
+ This client handles authentication and communication with:
+ 1. Proxy Network (Residential/Datacenter via HTTP/HTTPS)
+ 2. SERP API (Real-time Search Engine Results)
+ 3. Universal Scraping API (Single Page Rendering & Extraction)
+ 4. Web Scraper API (Async Task Management for large scale jobs)
  """

  def __init__(
@@ -31,11 +34,11 @@ class ThordataClient:
  Initialize the Thordata Client.

  Args:
- scraper_token (str): Token from Dashboard bottom.
- public_token (str): Token from Public API section.
- public_key (str): Key from Public API section.
- proxy_host (str): Proxy gateway host.
- proxy_port (int): Proxy gateway port.
+ scraper_token (str): The secret token found at the bottom of the Dashboard.
+ public_token (str): The token from the Public API section.
+ public_key (str): The key from the Public API section.
+ proxy_host (str): The proxy gateway host (default: gate.thordata.com).
+ proxy_port (int): The proxy gateway port (default: 22225).
  """
  self.scraper_token = scraper_token
  self.public_token = public_token
@@ -46,7 +49,7 @@ class ThordataClient:
  f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
  )

- # API Endpoints
+ # API Endpoints Definition
  self.base_url = "https://scraperapi.thordata.com"
  self.universal_url = "https://universalapi.thordata.com"
  self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -57,6 +60,7 @@ class ThordataClient:
  self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
  self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"

+ # Initialize Session with Proxy settings
  self.session = requests.Session()
  self.session.proxies = {
  "http": self.proxy_url,
@@ -65,44 +69,50 @@ class ThordataClient:

  def get(self, url: str, **kwargs) -> requests.Response:
  """
- Send a GET request through the Thordata Proxy Network.
+ Send a standard GET request through the Thordata Residential Proxy Network.
+
+ Args:
+ url (str): The target URL.
+ **kwargs: Arguments to pass to requests.get().
+
+ Returns:
+ requests.Response: The response object.
  """
  logger.debug(f"Proxy Request: {url}")
  kwargs.setdefault("timeout", 30)
  return self.session.get(url, **kwargs)

  def serp_search(
- self, query: str, engine: str = "google", num: int = 10, **kwargs
+ self,
+ query: str,
+ engine: Union[Engine, str] = Engine.GOOGLE,
+ num: int = 10,
+ **kwargs
  ) -> Dict[str, Any]:
  """
- Execute a real-time SERP search.
+ Execute a real-time SERP (Search Engine Results Page) search.
+
+ Args:
+ query (str): The search keywords.
+ engine (Union[Engine, str]): The search engine (e.g., 'google', 'bing').
+ num (int): Number of results to retrieve (default 10).
+ **kwargs: Additional parameters (e.g., type="shopping", location="London").
+
+ Returns:
+ Dict[str, Any]: The parsed JSON result from the search engine.
  """
- payload = {
- "q": query,
- "num": str(num),
- "json": "1",
- "engine": engine.lower(),
- **kwargs
- }
+ # Handle Enum or String input for engine
+ engine_str = engine.value if isinstance(engine, Engine) else engine.lower()

- if engine.lower() == 'yandex':
- payload['text'] = payload.pop('q')
- if 'url' not in payload:
- payload['url'] = "yandex.com"
- elif 'url' not in payload:
- if engine == 'google':
- payload['url'] = "google.com"
- elif engine == 'bing':
- payload['url'] = "bing.com"
- elif engine == 'duckduckgo':
- payload['url'] = "duckduckgo.com"
+ # Normalize parameters via internal helper
+ payload = normalize_serp_params(engine_str, query, num=num, **kwargs)

  headers = {
  "Authorization": f"Bearer {self.scraper_token}",
  "Content-Type": "application/x-www-form-urlencoded"
  }

- logger.info(f"SERP Search: {engine} - {query}")
+ logger.info(f"SERP Search: {engine_str} - {query}")
  try:
  response = self.session.post(
  self.SERP_API_URL,
@@ -111,12 +121,13 @@ class ThordataClient:
  timeout=60
  )
  response.raise_for_status()
+
  data = response.json()
-
+ # Handle cases where the API returns a stringified JSON
  if isinstance(data, str):
- try:
+ try:
  data = json.loads(data)
- except json.JSONDecodeError:
+ except json.JSONDecodeError:
  pass
  return data
  except Exception as e:
@@ -128,11 +139,22 @@ class ThordataClient:
  url: str,
  js_render: bool = False,
  output_format: str = "HTML",
- country: str = None,
+ country: Optional[str] = None,
  block_resources: bool = False
  ) -> Union[str, bytes]:
  """
  Unlock target pages via the Universal Scraping API.
+ Bypasses Cloudflare, CAPTCHAs, and antibot systems automatically.
+
+ Args:
+ url (str): Target URL.
+ js_render (bool): Whether to render JavaScript (Headless Browser).
+ output_format (str): "HTML" or "PNG" (screenshot).
+ country (Optional[str]): Geo-targeting country code (e.g., 'us').
+ block_resources (bool): Block images/css to speed up loading.
+
+ Returns:
+ Union[str, bytes]: HTML string or PNG bytes.
  """
  headers = {
  "Authorization": f"Bearer {self.scraper_token}",
@@ -148,7 +170,7 @@ class ThordataClient:
  if country:
  payload["country"] = country

- logger.info(f"Universal Scrape: {url}")
+ logger.info(f"Universal Scrape: {url} (Format: {output_format})")

  try:
  response = self.session.post(
@@ -159,30 +181,35 @@ class ThordataClient:
  )
  response.raise_for_status()

- # Parse JSON wrapper
+ # Attempt to parse JSON wrapper
  try:
  resp_json = response.json()
  except json.JSONDecodeError:
- # Fallback for raw response
+ # Fallback: if the API returns raw content directly
  if output_format.upper() == "PNG":
  return response.content
  return response.text

- # Check API errors
+ # Check for API-level errors inside the JSON
  if isinstance(resp_json, dict) and resp_json.get("code") \
  and resp_json.get("code") != 200:
  raise Exception(f"Universal API Error: {resp_json}")

- # Extract HTML
+ # Case 1: Return HTML
  if "html" in resp_json:
  return resp_json["html"]

- # Extract PNG (Base64 decoding with padding fix)
+ # Case 2: Return PNG Image
  if "png" in resp_json:
  png_str = resp_json["png"]
  if not png_str:
  raise Exception("API returned empty PNG data")

+ # Clean Data URI Scheme if present (e.g., data:image/png;base64,...)
+ if "," in png_str:
+ png_str = png_str.split(",", 1)[1]
+
+ # Fix Base64 Padding
  png_str = png_str.replace("\n", "").replace("\r", "")
  missing_padding = len(png_str) % 4
  if missing_padding:
@@ -190,6 +217,7 @@ class ThordataClient:

  return base64.b64decode(png_str)

+ # Fallback
  return str(resp_json)

  except Exception as e:
@@ -200,18 +228,32 @@ class ThordataClient:
  self,
  file_name: str,
  spider_id: str,
+ spider_name: str,
  individual_params: Dict[str, Any],
- spider_name: str = "youtube.com",
- universal_params: Dict[str, Any] = None
+ universal_params: Optional[Dict[str, Any]] = None
  ) -> str:
  """
- Create an Asynchronous Web Scraper Task.
+ Create a generic Web Scraper Task (Async).
+
+ IMPORTANT: You must retrieve the correct 'spider_id' and 'spider_name'
+ from the Thordata Dashboard before calling this method.
+
+ Args:
+ file_name (str): Name for the output file.
+ spider_id (str): The ID of the spider (from Dashboard).
+ spider_name (str): The name of the spider (e.g., "youtube.com").
+ individual_params (Dict): Parameters specific to the spider.
+ universal_params (Optional[Dict]): Global settings for the scraper.
+
+ Returns:
+ str: The created task_id.
  """
  headers = {
  "Authorization": f"Bearer {self.scraper_token}",
  "Content-Type": "application/x-www-form-urlencoded"
  }

+ # Payload construction
  payload = {
  "spider_name": spider_name,
  "spider_id": spider_id,
@@ -222,7 +264,7 @@ class ThordataClient:
  if universal_params:
  payload["spider_universal"] = json.dumps(universal_params)

- logger.info(f"Creating Scraper Task: {spider_id}")
+ logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
  try:
  response = self.session.post(
  self.SCRAPER_BUILDER_URL,
@@ -241,7 +283,13 @@ class ThordataClient:

  def get_task_status(self, task_id: str) -> str:
  """
- Check the status of a task.
+ Check the status of an asynchronous scraping task.
+
+ Args:
+ task_id (str): The ID returned by create_scraper_task.
+
+ Returns:
+ str: The status string (e.g., "finished", "running", "error").
  """
  headers = {
  "token": self.public_token,
@@ -271,6 +319,13 @@ class ThordataClient:
  def get_task_result(self, task_id: str, file_type: str = "json") -> str:
  """
  Retrieve the download URL for a completed task.
+
+ Args:
+ task_id (str): The task ID.
+ file_type (str): Format required (default "json").
+
+ Returns:
+ str: The URL to download the result file.
  """
  headers = {
  "token": self.public_token,
@@ -279,7 +334,7 @@ class ThordataClient:
  }
  payload = {"tasks_id": task_id, "type": file_type}

- logger.info(f"Getting result URL: {task_id}")
+ logger.info(f"Getting result URL for Task: {task_id}")
  try:
  response = self.session.post(
  self.SCRAPER_DOWNLOAD_URL,
thordata/enums.py ADDED
@@ -0,0 +1,25 @@
+ # src/thordata/enums.py
+
+ from enum import Enum
+
+ class Engine(str, Enum):
+     """
+     Supported Search Engines for SERP API.
+     """
+     GOOGLE = "google"
+     BING = "bing"
+     YANDEX = "yandex"
+     DUCKDUCKGO = "duckduckgo"
+     BAIDU = "baidu"
+
+ class GoogleSearchType(str, Enum):
+     """
+     Specific search types for Google Engine.
+     """
+     SEARCH = "search"      # Default web search
+     MAPS = "maps"          # Google Maps
+     SHOPPING = "shopping"  # Google Shopping
+     NEWS = "news"          # Google News
+     IMAGES = "images"      # Google Images
+     VIDEOS = "videos"      # Google Videos
+     # Users can pass other strings manually if needed
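
Because both enums above subclass `str`, their members compare equal to their literal values, which is why the clients accept either form. A small illustrative check (not part of the package):

```python
from thordata import Engine, GoogleSearchType

# Members are simultaneously Enum members and plain strings.
assert isinstance(Engine.GOOGLE, str)
assert Engine.GOOGLE == "google"
assert Engine.YANDEX.value == "yandex"

# GoogleSearchType behaves the same; unlisted types can still be passed as raw strings.
assert GoogleSearchType.SHOPPING == "shopping"
```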
thordata/parameters.py ADDED
@@ -0,0 +1,52 @@
+ # src/thordata/parameters.py
+
+ from typing import Dict, Any, Optional
+
+ def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
+     """
+     Normalizes parameters across different search engines to ensure a unified API surface.
+
+     Args:
+         engine (str): The search engine to use (e.g., 'google', 'yandex').
+         query (str): The search query string.
+         **kwargs: Additional parameters to pass to the API.
+
+     Returns:
+         Dict[str, Any]: The constructed payload for the API request.
+     """
+     # 1. Base parameters
+     payload = {
+         "num": str(kwargs.get("num", 10)),  # Default to 10 results
+         "json": "1",                        # Force JSON response
+         "engine": engine,
+     }
+
+     # 2. Handle Query Parameter Differences (Yandex uses 'text', others use 'q')
+     if engine == "yandex":
+         payload["text"] = query
+         # Set default URL for Yandex if not provided
+         if "url" not in kwargs:
+             payload["url"] = "yandex.com"
+     else:
+         payload["q"] = query
+
+     # 3. Handle Default URLs for other engines
+     if "url" not in kwargs:
+         defaults = {
+             "google": "google.com",
+             "bing": "bing.com",
+             "duckduckgo": "duckduckgo.com",
+             "baidu": "baidu.com"
+         }
+         if engine in defaults:
+             payload["url"] = defaults[engine]
+
+     # 4. Passthrough for all other user-provided arguments
+     # This allows support for engine-specific parameters (e.g., tbm, uule, gl)
+     # without explicitly defining them all.
+     protected_keys = {"num", "engine", "q", "text"}
+     for key, value in kwargs.items():
+         if key not in protected_keys:
+             payload[key] = value
+
+     return payload
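
To make the normalization rules above concrete, here is a small sketch of what the helper produces for two engines; the outputs are derived by reading the function, and key order is illustrative only:

```python
from thordata.parameters import normalize_serp_params

# Google: the query goes in "q" and a default "url" is filled in; extra kwargs pass through.
print(normalize_serp_params("google", "pizza", num=5, gl="us"))
# {'num': '5', 'json': '1', 'engine': 'google', 'q': 'pizza', 'url': 'google.com', 'gl': 'us'}

# Yandex: the query key becomes "text" and the default URL is yandex.com.
print(normalize_serp_params("yandex", "pizza"))
# {'num': '10', 'json': '1', 'engine': 'yandex', 'text': 'pizza', 'url': 'yandex.com'}
```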
thordata_sdk-0.3.0.dist-info/METADATA ADDED
@@ -0,0 +1,197 @@
+ Metadata-Version: 2.4
+ Name: thordata-sdk
+ Version: 0.3.0
+ Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
+ Author-email: Thordata Developer Team <support@thordata.com>
+ License: Apache-2.0
+ Project-URL: Homepage, https://www.thordata.com
+ Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
+ Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
+ Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
+ Keywords: web scraping,proxy,ai,llm,data-mining,serp,thordata
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Internet :: WWW/HTTP
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: requests>=2.25.0
+ Requires-Dist: aiohttp>=3.8.0
+ Dynamic: license-file
+
+ # Thordata Python SDK
+
+ <h4 align="center">
+ Official Python client for Thordata's Proxy Network, SERP API, Universal Scraping API, and Web Scraper API.
+ <br>
+ <i>Async-ready, built for AI agents and large-scale data collection.</i>
+ </h4>
+
+ <p align="center">
+ <a href="https://pypi.org/project/thordata-sdk/">
+ <img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version">
+ </a>
+ <a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE">
+ <img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License">
+ </a>
+ <a href="https://python.org">
+ <img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions">
+ </a>
+ </p>
+
+ ---
+
+ ## Installation
+
+ ```bash
+ pip install thordata-sdk
+ ```
+
+ ## Quick Start
+
+ All examples below use the unified client:
+
+ ```python
+ from thordata import ThordataClient, AsyncThordataClient
+ ```
+
+ You can copy `examples/.env.example` to `.env` and fill in your tokens from the Thordata Dashboard.
+
+ ### 1. Proxy Network (Simple GET)
+
+ ```python
+ import os
+ from dotenv import load_dotenv
+ from thordata import ThordataClient
+
+ load_dotenv()
+
+ client = ThordataClient(
+     scraper_token=os.getenv("THORDATA_SCRAPER_TOKEN"),
+     public_token=os.getenv("THORDATA_PUBLIC_TOKEN"),
+     public_key=os.getenv("THORDATA_PUBLIC_KEY"),
+ )
+
+ resp = client.get("http://httpbin.org/ip")
+ print(resp.json())
+ ```
+
+ ### 2. SERP API (Google, Bing, Yandex, DuckDuckGo)
+
+ ```python
+ from thordata import ThordataClient, Engine
+
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+ results = client.serp_search(
+     query="Thordata technology",
+     engine=Engine.GOOGLE,
+     num=10,
+     # Any engine-specific parameters are passed via **kwargs
+     # e.g. type="shopping", location="United States"
+ )
+
+ print(len(results.get("organic", [])))
+ ```
+
+ ### 3. Universal Scraping API
+
+ ```python
+ from thordata import ThordataClient
+
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+ html = client.universal_scrape(
+     url="https://www.google.com",
+     js_render=True,
+     output_format="HTML",
+ )
+ print(html[:200])
+ ```
+
+ ### 4. Web Scraper API (Task-based)
+
+ ```python
+ import time
+ from thordata import ThordataClient
+
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+ task_id = client.create_scraper_task(
+     file_name="demo_youtube_data",
+     spider_id="youtube_video-post_by-url",
+     spider_name="youtube.com",
+     individual_params={
+         "url": "https://www.youtube.com/@stephcurry/videos",
+         "order_by": "",
+         "num_of_posts": ""
+     },
+ )
+
+ for _ in range(10):
+     status = client.get_task_status(task_id)
+     print("Status:", status)
+     if status in ["Ready", "Success"]:
+         break
+     if status == "Failed":
+         raise RuntimeError("Task failed")
+     time.sleep(3)
+
+ download_url = client.get_task_result(task_id)
+ print("Download URL:", download_url)
+ ```
+
+ ### 5. Asynchronous Usage (High Concurrency)
+
+ ```python
+ import asyncio
+ from thordata import AsyncThordataClient
+
+ async def main():
+     async with AsyncThordataClient(
+         scraper_token="SCRAPER_TOKEN",
+         public_token="PUBLIC_TOKEN",
+         public_key="PUBLIC_KEY",
+     ) as client:
+         resp = await client.get("http://httpbin.org/ip")
+         print(await resp.json())
+
+ asyncio.run(main())
+ ```
+
+ More examples are available in the `examples/` directory.
+
+ ---
+
+ ## Features
+
+ | Feature | Status | Description |
+ |---------|--------|-------------|
+ | Proxy Network | Stable | Residential, ISP, Mobile, Datacenter via HTTP/HTTPS gateway. |
+ | SERP API | Stable | Google / Bing / Yandex / DuckDuckGo, flexible parameters. |
+ | Universal Scraping API | Stable | JS rendering, HTML / PNG output, antibot bypass. |
+ | Web Scraper API | Stable | Task-based scraping for complex sites (YouTube, E-commerce). |
+ | Async Client | Stable | aiohttp-based client for high-concurrency workloads. |
+
+ ---
+
+ ## Development & Contributing
+
+ See `CONTRIBUTING.md` for local development and contribution guidelines.
+
+ ## License
+
+ This project is licensed under the Apache License 2.0.
+
+ ## Support
+
+ For technical support, please contact support@thordata.com
+ or verify your tokens and quotas in the Thordata Dashboard.
thordata_sdk-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+ thordata/__init__.py,sha256=HVb6cHBsYRFoA1Sf_y_WSZ88vGV3DsT67rCdbZSuUYE,365
+ thordata/async_client.py,sha256=cpBtRIzr8oH6GuZs8gTh505tGYYV1aRFBUzbtmFOfEg,9717
+ thordata/client.py,sha256=w_EXs6CLM2qFtFPNU-x_Li66LEH1j7pQb2ca2MDKqyA,12432
+ thordata/enums.py,sha256=PGUCQX3jw5a9mX8_JfhuyoR1WriWjWQpAgibVP_bpdM,679
+ thordata/parameters.py,sha256=1lNx_BSS8ztBKEj_MXZMaIQQ9_W3EAlS-VFiBqSWb9E,1841
+ thordata_sdk-0.3.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+ thordata_sdk-0.3.0.dist-info/METADATA,sha256=Yj6W3vSLkkUhSXTj6AK4AaMfdlJvGOVaK6cFI2MNqV8,5697
+ thordata_sdk-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ thordata_sdk-0.3.0.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
+ thordata_sdk-0.3.0.dist-info/RECORD,,
thordata_sdk-0.3.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ thordata
thordata_sdk/__init__.py DELETED
@@ -1,8 +0,0 @@
- # Expose main clients
- from .client import ThordataClient
- from .async_client import AsyncThordataClient
-
- # Version of the thordata-sdk package
- __version__ = "0.2.3"
-
- __all__ = ["ThordataClient", "AsyncThordataClient"]
thordata_sdk-0.2.3.dist-info/METADATA DELETED
@@ -1,125 +0,0 @@
- Metadata-Version: 2.4
- Name: thordata_sdk
- Version: 0.2.3
- Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
- Home-page: https://github.com/Thordata/thordata-python-sdk
- Author: Thordata Developer Team
- Author-email: support@thordata.com
- License: Apache License 2.0
- Project-URL: Bug Tracker, https://github.com/Thordata/thordata-python-sdk/issues
- Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
- Classifier: Development Status :: 4 - Beta
- Classifier: Intended Audience :: Developers
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: License :: OSI Approved :: Apache Software License
- Classifier: Operating System :: OS Independent
- Classifier: Topic :: Internet :: WWW/HTTP
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
- Requires-Python: >=3.8
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: requests>=2.25.0
- Requires-Dist: aiohttp>=3.8.0
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: license
- Dynamic: license-file
- Dynamic: project-url
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary
-
- # Thordata Python SDK
-
- <h4 align="center">
- The Official Python Client for the Thordata Proxy Network & Web Scraper API.
- <br>
- <i>High-performance, async-ready, designed for AI Agents and large-scale data collection.</i>
- </h4>
-
- <p align="center">
- <a href="https://pypi.org/project/thordata-sdk/"><img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version"></a>
- <a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License"></a>
- <a href="https://python.org"><img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions"></a>
- </p>
-
- ---
-
- ## 🛠 Installation
-
- Install via pip:
-
- ```bash
- pip install thordata-sdk
- ```
-
- ## ⚡ Quick Start
-
- ### 1. Proxy Usage (Simple GET Request)
-
- **Python**
-
- ```python
- from thordata_sdk import ThordataClient
-
- # Initialize with your credentials from the Thordata Dashboard
- client = ThordataClient(
-     scraper_token="YOUR_SCRAPER_TOKEN",  # From "Scraping Tool Token"
-     public_token="YOUR_PUBLIC_TOKEN",    # From "Public API"
-     public_key="YOUR_PUBLIC_KEY"         # From "Public API"
- )
-
- # Send a request through the proxy
- response = client.get("http://httpbin.org/ip")
- print(response.json())
- ```
-
- ### 2. Real-time SERP Search
-
- **Python**
-
- ```python
- results = client.serp_search("Thordata technology", engine="google")
- print(f"Results found: {len(results.get('organic', []))}")
- ```
-
- ### 3. Asynchronous Usage (High Concurrency)
-
- **Python**
-
- ```python
- import asyncio
- from thordata_sdk import AsyncThordataClient
-
- async def main():
-     async with AsyncThordataClient(scraper_token="...", public_token="...", public_key="...") as client:
-         response = await client.get("http://httpbin.org/ip")
-         print(await response.json())
-
- asyncio.run(main())
- ```
-
- ## ⚙️ Features Status
-
- | Feature | Status | Description |
- |---------|--------|-------------|
- | Proxy Network | ✅ Stable | Synchronous & Asynchronous support via aiohttp. |
- | SERP API | ✅ Stable | Real-time Google/Bing/Yandex search results. |
- | Web Scraper | ✅ Stable | Async task management for scraping complex sites (e.g., YouTube). |
- | Authentication | ✅ Secure | Dual-token system for enhanced security. |
-
- ## 📄 License
-
- This project is licensed under the Apache License 2.0.
-
- ## 📞 Support
-
- For technical assistance, please contact support@thordata.com or verify your tokens in the Thordata Dashboard.
thordata_sdk-0.2.3.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
- thordata_sdk/__init__.py,sha256=aZ2P8F15HJlnnuMRYA1R-ENcZRVQ7eo0r1SD4a_1UbI,223
- thordata_sdk/async_client.py,sha256=fwoDSQA2GdikkNHrbKAoLwjqmn-zafEoe2HGf-j8bp8,8202
- thordata_sdk/client.py,sha256=drlhRHCCUoYiwmaJHLsYQZrfj7rB5wsK2P2yn2DkhqQ,9732
- thordata_sdk-0.2.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
- thordata_sdk-0.2.3.dist-info/METADATA,sha256=X_b16_FfyQmV7VS9Wy_QRtgXp8JVYhxSatt0HpAA9QU,4003
- thordata_sdk-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- thordata_sdk-0.2.3.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
- thordata_sdk-0.2.3.dist-info/RECORD,,
thordata_sdk-0.2.3.dist-info/top_level.txt DELETED
@@ -1 +0,0 @@
- thordata_sdk