thordata-sdk 0.2.4__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/client.py ADDED
@@ -0,0 +1,486 @@
+ import requests
+ import logging
+ import json
+ import base64
+ from typing import Dict, Any, Union, Optional, List
+
+ from .enums import Engine
+ from .parameters import normalize_serp_params
+
+ # Configure a library-specific logger to avoid interfering with user's logging
+ logger = logging.getLogger(__name__)
+
+
+ class ThordataClient:
+     """
+     The official synchronous Python client for Thordata.
+
+     This client handles authentication and communication with:
+     1. Proxy Network (Residential/Datacenter via HTTP/HTTPS)
+     2. SERP API (Real-time Search Engine Results)
+     3. Universal Scraping API (Single Page Rendering & Extraction)
+     4. Web Scraper API (Async Task Management for large scale jobs)
+     """
+
+     def __init__(
+         self,
+         scraper_token: str,
+         public_token: str,
+         public_key: str,
+         proxy_host: str = "gate.thordata.com",
+         proxy_port: int = 22225
+     ):
+         """
+         Initialize the Thordata Client.
+
+         Args:
+             scraper_token (str): The secret token found at the bottom of the Dashboard.
+             public_token (str): The token from the Public API section.
+             public_key (str): The key from the Public API section.
+             proxy_host (str): The proxy gateway host (default: gate.thordata.com).
+             proxy_port (int): The proxy gateway port (default: 22225).
+         """
+         self.scraper_token = scraper_token
+         self.public_token = public_token
+         self.public_key = public_key
+
+         # Proxy Configuration
+         self.proxy_url = (
+             f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
+         )
+
+         # API Endpoints Definition
+         self.base_url = "https://scraperapi.thordata.com"
+         self.universal_url = "https://universalapi.thordata.com"
+         self.api_url = "https://api.thordata.com/api/web-scraper-api"
+         self.locations_url = "https://api.thordata.com/api/locations"
+
+         self.SERP_API_URL = f"{self.base_url}/request"
+         self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
+         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
+         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
+         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
+
+         # Initialize Session with Proxy settings
+         self.session = requests.Session()
+         self.session.proxies = {
+             "http": self.proxy_url,
+             "https": self.proxy_url,
+         }
+
+     def get(self, url: str, **kwargs) -> requests.Response:
+         """
+         Send a standard GET request through the Thordata Residential Proxy Network.
+
+         Args:
+             url (str): The target URL.
+             **kwargs: Arguments to pass to requests.get().
+
+         Returns:
+             requests.Response: The response object.
+         """
+         logger.debug(f"Proxy Request: {url}")
+         kwargs.setdefault("timeout", 30)
+         return self.session.get(url, **kwargs)
+
+     def serp_search(
+         self,
+         query: str,
+         engine: Union[Engine, str] = Engine.GOOGLE,
+         num: int = 10,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Execute a real-time SERP (Search Engine Results Page) search.
+
+         Args:
+             query (str): The search keywords.
+             engine (Union[Engine, str]): The search engine (e.g., 'google', 'bing').
+             num (int): Number of results to retrieve (default 10).
+             **kwargs: Additional parameters (e.g., type="shopping", location="London").
+
+         Returns:
+             Dict[str, Any]: The parsed JSON result from the search engine.
+         """
+         # Handle Enum or String input for engine
+         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
+
+         # Normalize parameters via internal helper
+         payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
+
+         headers = {
+             "Authorization": f"Bearer {self.scraper_token}",
+             "Content-Type": "application/x-www-form-urlencoded"
+         }
+
+         logger.info(f"SERP Search: {engine_str} - {query}")
+         try:
+             response = self.session.post(
+                 self.SERP_API_URL,
+                 data=payload,
+                 headers=headers,
+                 timeout=60
+             )
+             response.raise_for_status()
+
+             data = response.json()
+             # Handle cases where the API returns a stringified JSON
+             if isinstance(data, str):
+                 try:
+                     data = json.loads(data)
+                 except json.JSONDecodeError:
+                     pass
+             return data
+         except Exception as e:
+             logger.error(f"SERP Request Failed: {e}")
+             raise
+
+     def universal_scrape(
+         self,
+         url: str,
+         js_render: bool = False,
+         output_format: str = "HTML",
+         country: Optional[str] = None,
+         block_resources: bool = False
+     ) -> Union[str, bytes]:
+         """
+         Unlock target pages via the Universal Scraping API.
+         Bypasses Cloudflare, CAPTCHAs, and antibot systems automatically.
+
+         Args:
+             url (str): Target URL.
+             js_render (bool): Whether to render JavaScript (Headless Browser).
+             output_format (str): "HTML" or "PNG" (screenshot).
+             country (Optional[str]): Geo-targeting country code (e.g., 'us').
+             block_resources (bool): Block images/css to speed up loading.
+
+         Returns:
+             Union[str, bytes]: HTML string or PNG bytes.
+         """
+         headers = {
+             "Authorization": f"Bearer {self.scraper_token}",
+             "Content-Type": "application/x-www-form-urlencoded"
+         }
+
+         payload = {
+             "url": url,
+             "js_render": "True" if js_render else "False",
+             "type": output_format.lower(),
+             "block_resources": "True" if block_resources else "False"
+         }
+         if country:
+             payload["country"] = country
+
+         logger.info(f"Universal Scrape: {url} (Format: {output_format})")
+
+         try:
+             response = self.session.post(
+                 self.UNIVERSAL_API_URL,
+                 data=payload,
+                 headers=headers,
+                 timeout=60
+             )
+             response.raise_for_status()
+
+             # Attempt to parse JSON wrapper
+             try:
+                 resp_json = response.json()
+             except json.JSONDecodeError:
+                 # Fallback: if the API returns raw content directly
+                 if output_format.upper() == "PNG":
+                     return response.content
+                 return response.text
+
+             # Check for API-level errors inside the JSON
+             if isinstance(resp_json, dict) and resp_json.get("code") \
+                     and resp_json.get("code") != 200:
+                 raise Exception(f"Universal API Error: {resp_json}")
+
+             # Case 1: Return HTML
+             if "html" in resp_json:
+                 return resp_json["html"]
+
+             # Case 2: Return PNG Image
+             if "png" in resp_json:
+                 png_str = resp_json["png"]
+                 if not png_str:
+                     raise Exception("API returned empty PNG data")
+
+                 # Clean Data URI Scheme if present (e.g., data:image/png;base64,...)
+                 if "," in png_str:
+                     png_str = png_str.split(",", 1)[1]
+
+                 # Fix Base64 Padding
+                 png_str = png_str.replace("\n", "").replace("\r", "")
+                 missing_padding = len(png_str) % 4
+                 if missing_padding:
+                     png_str += '=' * (4 - missing_padding)
+
+                 return base64.b64decode(png_str)
+
+             # Fallback
+             return str(resp_json)
+
+         except Exception as e:
+             logger.error(f"Universal Scrape Failed: {e}")
+             raise
+
+     def create_scraper_task(
+         self,
+         file_name: str,
+         spider_id: str,
+         spider_name: str,
+         individual_params: Dict[str, Any],
+         universal_params: Optional[Dict[str, Any]] = None
+     ) -> str:
+         """
+         Create a generic Web Scraper Task (Async).
+
+         IMPORTANT: You must retrieve the correct 'spider_id' and 'spider_name'
+         from the Thordata Dashboard before calling this method.
+
+         Args:
+             file_name (str): Name for the output file.
+             spider_id (str): The ID of the spider (from Dashboard).
+             spider_name (str): The name of the spider (e.g., "youtube.com").
+             individual_params (Dict): Parameters specific to the spider.
+             universal_params (Optional[Dict]): Global settings for the scraper.
+
+         Returns:
+             str: The created task_id.
+         """
+         headers = {
+             "Authorization": f"Bearer {self.scraper_token}",
+             "Content-Type": "application/x-www-form-urlencoded"
+         }
+
+         # Payload construction
+         payload = {
+             "spider_name": spider_name,
+             "spider_id": spider_id,
+             "spider_parameters": json.dumps([individual_params]),
+             "spider_errors": "true",
+             "file_name": file_name
+         }
+         if universal_params:
+             payload["spider_universal"] = json.dumps(universal_params)
+
+         logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
+         try:
+             response = self.session.post(
+                 self.SCRAPER_BUILDER_URL,
+                 data=payload,
+                 headers=headers
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             if data.get("code") != 200:
+                 raise Exception(f"Creation failed: {data}")
+             return data["data"]["task_id"]
+         except Exception as e:
+             logger.error(f"Task Creation Failed: {e}")
+             raise
+
+     def get_task_status(self, task_id: str) -> str:
+         """
+         Check the status of an asynchronous scraping task.
+
+         Args:
+             task_id (str): The ID returned by create_scraper_task.
+
+         Returns:
+             str: The status string (e.g., "finished", "running", "error").
+         """
+         headers = {
+             "token": self.public_token,
+             "key": self.public_key,
+             "Content-Type": "application/x-www-form-urlencoded"
+         }
+         payload = {"tasks_ids": task_id}
+
+         try:
+             response = self.session.post(
+                 self.SCRAPER_STATUS_URL,
+                 data=payload,
+                 headers=headers
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             if data.get("code") == 200 and data.get("data"):
+                 for item in data["data"]:
+                     if str(item.get("task_id")) == str(task_id):
+                         return item["status"]
+             return "Unknown"
+         except Exception as e:
+             logger.error(f"Status Check Failed: {e}")
+             return "Error"
+
+     def get_task_result(self, task_id: str, file_type: str = "json") -> str:
+         """
+         Retrieve the download URL for a completed task.
+
+         Args:
+             task_id (str): The task ID.
+             file_type (str): Format required (default "json").
+
+         Returns:
+             str: The URL to download the result file.
+         """
+         headers = {
+             "token": self.public_token,
+             "key": self.public_key,
+             "Content-Type": "application/x-www-form-urlencoded"
+         }
+         payload = {"tasks_id": task_id, "type": file_type}
+
+         logger.info(f"Getting result URL for Task: {task_id}")
+         try:
+             response = self.session.post(
+                 self.SCRAPER_DOWNLOAD_URL,
+                 data=payload,
+                 headers=headers
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             if data.get("code") == 200 and data.get("data"):
+                 return data["data"]["download"]
+             raise Exception(f"API returned error: {data}")
+         except Exception as e:
+             logger.error(f"Get Result Failed: {e}")
+             raise
+
+     def _get_locations(self, endpoint: str, params: Dict[str, str]) -> List[Dict[str, Any]]:
+         """
+         Internal helper to call the public locations API.
+
+         Args:
+             endpoint: One of 'countries', 'states', 'cities', 'asn'.
+             params: Query parameters (must include token, key, proxy_type, etc.)
+
+         Returns:
+             List of location records from the 'data' field.
+
+         Raises:
+             RuntimeError: If token/key are missing or API returns an error code.
+         """
+         if not self.public_token or not self.public_key:
+             raise RuntimeError(
+                 "Public API token/key are required for locations endpoints. "
+                 "Please provide 'public_token' and 'public_key' when "
+                 "initializing ThordataClient."
+             )
+
+         url = f"{self.locations_url}/{endpoint}"
+         logger.info("Locations API request: %s", url)
+
+         # Use a direct requests.get here; no need to go through the proxy gateway.
+         response = requests.get(
+             url,
+             params=params,
+             timeout=30,
+         )
+         response.raise_for_status()
+
+         data = response.json()
+         if isinstance(data, dict):
+             code = data.get("code")
+             if code is not None and code != 200:
+                 msg = data.get("msg", "")
+                 raise RuntimeError(
+                     f"Locations API error ({endpoint}): code={code}, msg={msg}"
+                 )
+             return data.get("data") or []
+         # Fallback: if backend ever returns a list directly
+         if isinstance(data, list):
+             return data
+         return []
+
+     def list_countries(self, proxy_type: int = 1) -> List[Dict[str, Any]]:
+         """
+         List supported countries for Thordata residential or unlimited proxies.
+
+         Args:
+             proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
+
+         Returns:
+             List[Dict[str, Any]]: Each record contains 'country_code' and 'country_name'.
+         """
+         params = {
+             "token": self.public_token,
+             "key": self.public_key,
+             "proxy_type": str(proxy_type),
+         }
+         return self._get_locations("countries", params)
+
+     def list_states(self, country_code: str, proxy_type: int = 1) -> List[Dict[str, Any]]:
+         """
+         List supported states for a given country.
+
+         Args:
+             country_code (str): Country code (e.g., 'US').
+             proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
+
+         Returns:
+             List[Dict[str, Any]]: Each record contains 'state_code' and 'state_name'.
+         """
+         params = {
+             "token": self.public_token,
+             "key": self.public_key,
+             "proxy_type": str(proxy_type),
+             "country_code": country_code,
+         }
+         return self._get_locations("states", params)
+
+     def list_cities(
+         self,
+         country_code: str,
+         state_code: Optional[str] = None,
+         proxy_type: int = 1,
+     ) -> List[Dict[str, Any]]:
+         """
+         List supported cities for a given country (and optional state).
+
+         Args:
+             country_code (str): Country code (e.g., 'US').
+             state_code (Optional[str]): State code (e.g., 'alabama'), if applicable.
+             proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
+
+         Returns:
+             List[Dict[str, Any]]: Each record contains 'city_code' and 'city_name'.
+         """
+         params: Dict[str, str] = {
+             "token": self.public_token,
+             "key": self.public_key,
+             "proxy_type": str(proxy_type),
+             "country_code": country_code,
+         }
+         if state_code:
+             params["state_code"] = state_code
+
+         return self._get_locations("cities", params)
+
+     def list_asn(
+         self,
+         country_code: str,
+         proxy_type: int = 1,
+     ) -> List[Dict[str, Any]]:
+         """
+         List supported ASNs for a given country.
+
+         Args:
+             country_code (str): Country code (e.g., 'US').
+             proxy_type (int): 1 for residential proxies, 2 for unlimited proxies.
+
+         Returns:
+             List[Dict[str, Any]]: Each record contains 'asn_code' and 'asn_name'.
+         """
+         params = {
+             "token": self.public_token,
+             "key": self.public_key,
+             "proxy_type": str(proxy_type),
+             "country_code": country_code,
+         }
+         return self._get_locations("asn", params)
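
Example usage (not part of the package contents): a minimal sketch of how the new client fits together, based only on the methods shown above. The tokens, spider ID, spider parameters, and target URLs are placeholders.

# Minimal usage sketch for thordata/client.py (placeholder credentials and IDs).
from thordata.client import ThordataClient
from thordata.enums import Engine

client = ThordataClient(
    scraper_token="YOUR_SCRAPER_TOKEN",   # placeholder
    public_token="YOUR_PUBLIC_TOKEN",     # placeholder
    public_key="YOUR_PUBLIC_KEY",         # placeholder
)

# Plain GET routed through the proxy gateway configured in __init__.
ip_info = client.get("https://httpbin.org/ip").text

# Real-time SERP search; returns the parsed JSON payload.
serp = client.serp_search("thordata", engine=Engine.GOOGLE, num=10)

# Universal Scraping API: rendered HTML as str, or a PNG screenshot as bytes.
html = client.universal_scrape("https://example.com", js_render=True)
png = client.universal_scrape("https://example.com", output_format="PNG")

# Async Web Scraper workflow: create a task, check its status, fetch the download URL.
task_id = client.create_scraper_task(
    file_name="demo",
    spider_id="SPIDER_ID_FROM_DASHBOARD",  # placeholder, taken from the Dashboard
    spider_name="example.com",             # placeholder
    individual_params={"url": "https://example.com"},  # spider-specific (placeholder)
)
if client.get_task_status(task_id) == "finished":
    download_url = client.get_task_result(task_id, file_type="json")

# Locations helpers (require the public token/key).
countries = client.list_countries(proxy_type=1)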
thordata/enums.py ADDED
@@ -0,0 +1,25 @@
+ # src/thordata/enums.py
+
+ from enum import Enum
+
+ class Engine(str, Enum):
+     """
+     Supported Search Engines for SERP API.
+     """
+     GOOGLE = "google"
+     BING = "bing"
+     YANDEX = "yandex"
+     DUCKDUCKGO = "duckduckgo"
+     BAIDU = "baidu"
+
+ class GoogleSearchType(str, Enum):
+     """
+     Specific search types for Google Engine.
+     """
+     SEARCH = "search"        # Default web search
+     MAPS = "maps"            # Google Maps
+     SHOPPING = "shopping"    # Google Shopping
+     NEWS = "news"            # Google News
+     IMAGES = "images"        # Google Images
+     VIDEOS = "videos"        # Google Videos
+     # Users can pass other strings manually if needed
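
Because both enums subclass str, callers can pass either an enum member or its raw value to the client; a quick illustration (not part of the diff):

from thordata.enums import Engine, GoogleSearchType

# str-backed enums compare equal to their underlying values, so
# serp_search(engine=Engine.BING) and serp_search(engine="bing") are equivalent.
assert Engine.BING == "bing"
assert Engine.BING.value == "bing"

# Google search types are passed through serp_search's **kwargs, e.g. type="shopping".
assert GoogleSearchType.SHOPPING == "shopping"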
thordata/parameters.py ADDED
@@ -0,0 +1,52 @@
+ # src/thordata/parameters.py
+
+ from typing import Dict, Any, Optional
+
+ def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
+     """
+     Normalizes parameters across different search engines to ensure a unified API surface.
+
+     Args:
+         engine (str): The search engine to use (e.g., 'google', 'yandex').
+         query (str): The search query string.
+         **kwargs: Additional parameters to pass to the API.
+
+     Returns:
+         Dict[str, Any]: The constructed payload for the API request.
+     """
+     # 1. Base parameters
+     payload = {
+         "num": str(kwargs.get("num", 10)),  # Default to 10 results
+         "json": "1",  # Force JSON response
+         "engine": engine,
+     }
+
+     # 2. Handle Query Parameter Differences (Yandex uses 'text', others use 'q')
+     if engine == "yandex":
+         payload["text"] = query
+         # Set default URL for Yandex if not provided
+         if "url" not in kwargs:
+             payload["url"] = "yandex.com"
+     else:
+         payload["q"] = query
+
+     # 3. Handle Default URLs for other engines
+     if "url" not in kwargs:
+         defaults = {
+             "google": "google.com",
+             "bing": "bing.com",
+             "duckduckgo": "duckduckgo.com",
+             "baidu": "baidu.com"
+         }
+         if engine in defaults:
+             payload["url"] = defaults[engine]
+
+     # 4. Passthrough for all other user-provided arguments
+     # This allows support for engine-specific parameters (e.g., tbm, uule, gl)
+     # without explicitly defining them all.
+     protected_keys = {"num", "engine", "q", "text"}
+     for key, value in kwargs.items():
+         if key not in protected_keys:
+             payload[key] = value
+
+     return payload
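
For reference (not part of the diff), here are the payloads the helper builds for a Google and a Yandex query; the "gl" parameter is one of the engine-specific passthrough arguments the function's own comment mentions.

from thordata.parameters import normalize_serp_params

google = normalize_serp_params("google", "coffee", num=20, gl="uk")
# {"num": "20", "json": "1", "engine": "google", "q": "coffee", "url": "google.com", "gl": "uk"}

yandex = normalize_serp_params("yandex", "coffee")
# {"num": "10", "json": "1", "engine": "yandex", "text": "coffee", "url": "yandex.com"}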