thordata-sdk 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +16 -0
- {thordata_sdk → thordata}/async_client.py +88 -41
- {thordata_sdk → thordata}/client.py +106 -51
- thordata/enums.py +25 -0
- thordata/parameters.py +52 -0
- thordata_sdk-0.3.0.dist-info/METADATA +197 -0
- thordata_sdk-0.3.0.dist-info/RECORD +10 -0
- thordata_sdk-0.3.0.dist-info/top_level.txt +1 -0
- thordata_sdk/__init__.py +0 -8
- thordata_sdk-0.2.3.dist-info/METADATA +0 -125
- thordata_sdk-0.2.3.dist-info/RECORD +0 -8
- thordata_sdk-0.2.3.dist-info/top_level.txt +0 -1
- {thordata_sdk-0.2.3.dist-info → thordata_sdk-0.3.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.2.3.dist-info → thordata_sdk-0.3.0.dist-info}/licenses/LICENSE +0 -0
thordata/__init__.py
ADDED
@@ -0,0 +1,16 @@
+# src/thordata/__init__.py
+
+from .client import ThordataClient
+from .async_client import AsyncThordataClient
+from .enums import Engine, GoogleSearchType
+
+# Package version
+__version__ = "0.3.0"
+
+# Explicitly export classes to simplify user imports
+__all__ = [
+    "ThordataClient",
+    "AsyncThordataClient",
+    "Engine",
+    "GoogleSearchType"
+]
{thordata_sdk → thordata}/async_client.py

@@ -4,12 +4,17 @@ import json
 import base64
 from typing import Optional, Dict, Any, Union
 
+# Import shared logic
+from .enums import Engine
+from .parameters import normalize_serp_params
+
 logger = logging.getLogger(__name__)
 
 
 class AsyncThordataClient:
     """
-
+    The official Asynchronous Python client for Thordata (built on aiohttp).
+    Designed for high-concurrency AI agents and data pipelines.
     """
 
     def __init__(
@@ -20,13 +25,18 @@ class AsyncThordataClient:
         proxy_host: str = "gate.thordata.com",
         proxy_port: int = 22225
     ):
+        """
+        Initialize the Async Client.
+        """
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
 
+        # Pre-calculate proxy auth for performance
         self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
         self.proxy_url = f"http://{proxy_host}:{proxy_port}"
 
+        # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
         self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -37,6 +47,7 @@ class AsyncThordataClient:
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
 
+        # Session is initialized lazily or via context manager
         self._session: Optional[aiohttp.ClientSession] = None
 
     async def __aenter__(self):
@@ -48,16 +59,27 @@ class AsyncThordataClient:
         await self.close()
 
     async def close(self):
+        """Close the underlying aiohttp session."""
        if self._session and not self._session.closed:
            await self._session.close()
            self._session = None
 
-
+    def _get_session(self) -> aiohttp.ClientSession:
+        """Internal helper to ensure session exists."""
+        if self._session is None or self._session.closed:
+            raise RuntimeError(
+                "Client session not initialized. Use 'async with ThordataClient(...) as client:'"
+            )
+        return self._session
+
     async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-
-
+        """
+        Send an async GET request through the Proxy Network.
+        """
+        session = self._get_session()
         try:
-
+            logger.debug(f"Async Proxy Request: {url}")
+            return await session.get(
                 url,
                 proxy=self.proxy_url,
                 proxy_auth=self.proxy_auth,
@@ -67,55 +89,57 @@ class AsyncThordataClient:
             logger.error(f"Async Request failed: {e}")
             raise
 
-    # --- SERP ---
     async def serp_search(
-        self,
+        self,
+        query: str,
+        engine: Union[Engine, str] = Engine.GOOGLE,
+        num: int = 10,
+        **kwargs
     ) -> Dict[str, Any]:
-
-
+        """
+        Execute a real-time SERP search (Async).
+        """
+        session = self._get_session()
 
-
-
-
-
-
-            payload['text'] = payload.pop('q')
-            if 'url' not in payload:
-                payload['url'] = "yandex.com"
-        elif 'url' not in payload:
-            if engine == 'google':
-                payload['url'] = "google.com"
-            elif engine == 'bing':
-                payload['url'] = "bing.com"
+        # 1. Handle Enum conversion
+        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
+
+        # 2. Normalize parameters
+        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
 
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }
 
-
+        # 3. Execute Request
+        logger.info(f"Async SERP Search: {engine_str} - {query}")
+        async with session.post(
            self.SERP_API_URL, data=payload, headers=headers
        ) as response:
            response.raise_for_status()
+
            data = await response.json()
+            # Handle double-encoded JSON strings if they occur
            if isinstance(data, str):
                try:
                    data = json.loads(data)
-                except
+                except json.JSONDecodeError:
                    pass
            return data
 
-    # --- Universal ---
     async def universal_scrape(
         self,
         url: str,
         js_render: bool = False,
         output_format: str = "HTML",
-        country: str = None,
+        country: Optional[str] = None,
         block_resources: bool = False
     ) -> Union[str, bytes]:
-
-
+        """
+        Async Universal Scraping (Bypass Cloudflare/CAPTCHA).
+        """
+        session = self._get_session()
 
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
@@ -131,18 +155,21 @@ class AsyncThordataClient:
         if country:
             payload["country"] = country
 
-
+        logger.info(f"Async Universal Scrape: {url}")
+        async with session.post(
            self.UNIVERSAL_API_URL, data=payload, headers=headers
        ) as response:
            response.raise_for_status()
 
            try:
                resp_json = await response.json()
-            except
+            except json.JSONDecodeError:
+                # Fallback for raw content
                if output_format.upper() == "PNG":
                    return await response.read()
                return await response.text()
 
+            # Check API error codes
            if isinstance(resp_json, dict) and resp_json.get("code") \
                    and resp_json.get("code") != 200:
                raise Exception(f"Universal API Error: {resp_json}")
@@ -155,25 +182,32 @@ class AsyncThordataClient:
                if not png_str:
                    raise Exception("API returned empty PNG data")
 
+                # Clean Data URI Scheme
+                if "," in png_str:
+                    png_str = png_str.split(",", 1)[1]
+
+                # Fix Base64 Padding
                png_str = png_str.replace("\n", "").replace("\r", "")
                missing_padding = len(png_str) % 4
                if missing_padding:
                    png_str += '=' * (4 - missing_padding)
+
                return base64.b64decode(png_str)
 
            return str(resp_json)
 
-    # --- Web Scraper ---
     async def create_scraper_task(
         self,
         file_name: str,
         spider_id: str,
+        spider_name: str,
         individual_params: Dict[str, Any],
-
-        universal_params: Dict[str, Any] = None
+        universal_params: Optional[Dict[str, Any]] = None
     ) -> str:
-
-
+        """
+        Create an Asynchronous Web Scraper Task.
+        """
+        session = self._get_session()
 
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
@@ -190,16 +224,23 @@ class AsyncThordataClient:
         if universal_params:
             payload["spider_universal"] = json.dumps(universal_params)
 
-
+        logger.info(f"Async Task Creation: {spider_name}")
+        async with session.post(
            self.SCRAPER_BUILDER_URL, data=payload, headers=headers
        ) as response:
            response.raise_for_status()
            data = await response.json()
+
            if data.get("code") != 200:
                raise Exception(f"Creation failed: {data}")
            return data["data"]["task_id"]
 
    async def get_task_status(self, task_id: str) -> str:
+        """
+        Check task status.
+        """
+        session = self._get_session()
+
        headers = {
            "token": self.public_token,
            "key": self.public_key,
@@ -207,28 +248,34 @@ class AsyncThordataClient:
        }
        payload = {"tasks_ids": task_id}
 
-        async with
+        async with session.post(
            self.SCRAPER_STATUS_URL, data=payload, headers=headers
        ) as response:
            data = await response.json()
            if data.get("code") == 200 and data.get("data"):
                for item in data["data"]:
-                    if str(item
+                    if str(item.get("task_id")) == str(task_id):
                        return item["status"]
            return "Unknown"
 
    async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
+        """
+        Get the download URL for a finished task.
+        """
+        session = self._get_session()
+
        headers = {
            "token": self.public_token,
            "key": self.public_key,
            "Content-Type": "application/x-www-form-urlencoded"
        }
-
+        # Fixed: Use the file_type argument instead of hardcoding "json"
+        payload = {"tasks_id": task_id, "type": file_type}
 
-        async with
+        async with session.post(
            self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
        ) as response:
            data = await response.json()
-            if data.get("code") == 200:
+            if data.get("code") == 200 and data.get("data"):
                return data["data"]["download"]
            raise Exception(f"Result Error: {data}")
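Because the async client keeps one lazily-created aiohttp session and exposes `serp_search` as a coroutine, several searches can be dispatched concurrently. A minimal usage sketch built on the 0.3.0 signatures shown above (not part of the packaged diff; credentials are placeholders):

```python
import asyncio
from thordata import AsyncThordataClient, Engine

async def main():
    # The context manager creates and later closes the aiohttp session.
    async with AsyncThordataClient(
        scraper_token="SCRAPER_TOKEN",
        public_token="PUBLIC_TOKEN",
        public_key="PUBLIC_KEY",
    ) as client:
        queries = ["thordata", "residential proxies", "serp api"]
        # Each coroutine posts to the SERP endpoint and returns parsed JSON.
        results = await asyncio.gather(
            *(client.serp_search(query=q, engine=Engine.GOOGLE, num=10) for q in queries)
        )
        for q, data in zip(queries, results):
            print(q, len(data.get("organic", [])))

asyncio.run(main())
```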
{thordata_sdk → thordata}/client.py

@@ -2,9 +2,12 @@ import requests
 import logging
 import json
 import base64
-from typing import Dict, Any, Union
+from typing import Dict, Any, Union, Optional
 
-
+from .enums import Engine
+from .parameters import normalize_serp_params
+
+# Configure a library-specific logger to avoid interfering with user's logging
 logger = logging.getLogger(__name__)
 
 
@@ -12,11 +15,11 @@ class ThordataClient:
     """
     The official synchronous Python client for Thordata.
 
-
-    1. Proxy Network (HTTP/HTTPS)
-    2. SERP API (Real-time Search)
-    3. Universal Scraping API (Single Page)
-    4. Web Scraper API (Async Task Management)
+    This client handles authentication and communication with:
+    1. Proxy Network (Residential/Datacenter via HTTP/HTTPS)
+    2. SERP API (Real-time Search Engine Results)
+    3. Universal Scraping API (Single Page Rendering & Extraction)
+    4. Web Scraper API (Async Task Management for large scale jobs)
     """
 
     def __init__(
@@ -31,11 +34,11 @@ class ThordataClient:
         Initialize the Thordata Client.
 
         Args:
-            scraper_token (str):
-            public_token (str):
-            public_key (str):
-            proxy_host (str):
-            proxy_port (int):
+            scraper_token (str): The secret token found at the bottom of the Dashboard.
+            public_token (str): The token from the Public API section.
+            public_key (str): The key from the Public API section.
+            proxy_host (str): The proxy gateway host (default: gate.thordata.com).
+            proxy_port (int): The proxy gateway port (default: 22225).
         """
         self.scraper_token = scraper_token
         self.public_token = public_token
@@ -46,7 +49,7 @@ class ThordataClient:
             f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
         )
 
-        # API Endpoints
+        # API Endpoints Definition
         self.base_url = "https://scraperapi.thordata.com"
         self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -57,6 +60,7 @@ class ThordataClient:
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
 
+        # Initialize Session with Proxy settings
         self.session = requests.Session()
        self.session.proxies = {
            "http": self.proxy_url,
@@ -65,44 +69,50 @@ class ThordataClient:
 
     def get(self, url: str, **kwargs) -> requests.Response:
         """
-        Send a GET request through the Thordata Proxy Network.
+        Send a standard GET request through the Thordata Residential Proxy Network.
+
+        Args:
+            url (str): The target URL.
+            **kwargs: Arguments to pass to requests.get().
+
+        Returns:
+            requests.Response: The response object.
         """
         logger.debug(f"Proxy Request: {url}")
         kwargs.setdefault("timeout", 30)
         return self.session.get(url, **kwargs)
 
     def serp_search(
-        self,
+        self,
+        query: str,
+        engine: Union[Engine, str] = Engine.GOOGLE,
+        num: int = 10,
+        **kwargs
     ) -> Dict[str, Any]:
         """
-        Execute a real-time SERP search.
+        Execute a real-time SERP (Search Engine Results Page) search.
+
+        Args:
+            query (str): The search keywords.
+            engine (Union[Engine, str]): The search engine (e.g., 'google', 'bing').
+            num (int): Number of results to retrieve (default 10).
+            **kwargs: Additional parameters (e.g., type="shopping", location="London").
+
+        Returns:
+            Dict[str, Any]: The parsed JSON result from the search engine.
         """
-
-
-            "num": str(num),
-            "json": "1",
-            "engine": engine.lower(),
-            **kwargs
-        }
+        # Handle Enum or String input for engine
+        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
 
-
-
-        if 'url' not in payload:
-            payload['url'] = "yandex.com"
-        elif 'url' not in payload:
-            if engine == 'google':
-                payload['url'] = "google.com"
-            elif engine == 'bing':
-                payload['url'] = "bing.com"
-            elif engine == 'duckduckgo':
-                payload['url'] = "duckduckgo.com"
+        # Normalize parameters via internal helper
+        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
 
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }
 
-        logger.info(f"SERP Search: {
+        logger.info(f"SERP Search: {engine_str} - {query}")
         try:
             response = self.session.post(
                 self.SERP_API_URL,
@@ -111,12 +121,13 @@ class ThordataClient:
                 timeout=60
             )
             response.raise_for_status()
+
             data = response.json()
-
+            # Handle cases where the API returns a stringified JSON
             if isinstance(data, str):
-                try:
+                try:
                     data = json.loads(data)
-                except json.JSONDecodeError:
+                except json.JSONDecodeError:
                     pass
             return data
         except Exception as e:
@@ -128,11 +139,22 @@ class ThordataClient:
         url: str,
         js_render: bool = False,
         output_format: str = "HTML",
-        country: str = None,
+        country: Optional[str] = None,
         block_resources: bool = False
     ) -> Union[str, bytes]:
         """
         Unlock target pages via the Universal Scraping API.
+        Bypasses Cloudflare, CAPTCHAs, and antibot systems automatically.
+
+        Args:
+            url (str): Target URL.
+            js_render (bool): Whether to render JavaScript (Headless Browser).
+            output_format (str): "HTML" or "PNG" (screenshot).
+            country (Optional[str]): Geo-targeting country code (e.g., 'us').
+            block_resources (bool): Block images/css to speed up loading.
+
+        Returns:
+            Union[str, bytes]: HTML string or PNG bytes.
         """
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
@@ -148,7 +170,7 @@ class ThordataClient:
         if country:
             payload["country"] = country
 
-        logger.info(f"Universal Scrape: {url}")
+        logger.info(f"Universal Scrape: {url} (Format: {output_format})")
 
         try:
             response = self.session.post(
@@ -159,30 +181,35 @@ class ThordataClient:
             )
             response.raise_for_status()
 
-            #
+            # Attempt to parse JSON wrapper
             try:
                 resp_json = response.json()
             except json.JSONDecodeError:
-                # Fallback
+                # Fallback: if the API returns raw content directly
                 if output_format.upper() == "PNG":
                     return response.content
                 return response.text
 
-            # Check API errors
             if isinstance(resp_json, dict) and resp_json.get("code") \
                     and resp_json.get("code") != 200:
                 raise Exception(f"Universal API Error: {resp_json}")
 
-            #
+            # Check for API-level errors inside the JSON
+
+            # Case 1: Return HTML
            if "html" in resp_json:
                return resp_json["html"]
 
-            #
+            # Case 2: Return PNG Image
            if "png" in resp_json:
                png_str = resp_json["png"]
                if not png_str:
                    raise Exception("API returned empty PNG data")
 
+                # Clean Data URI Scheme if present (e.g., data:image/png;base64,...)
+                if "," in png_str:
+                    png_str = png_str.split(",", 1)[1]
+
+                # Fix Base64 Padding
                png_str = png_str.replace("\n", "").replace("\r", "")
                missing_padding = len(png_str) % 4
                if missing_padding:
@@ -190,6 +217,7 @@ class ThordataClient:
 
                return base64.b64decode(png_str)
 
+            # Fallback
            return str(resp_json)
 
        except Exception as e:
@@ -200,18 +228,32 @@ class ThordataClient:
         self,
         file_name: str,
         spider_id: str,
+        spider_name: str,
         individual_params: Dict[str, Any],
-
-        universal_params: Dict[str, Any] = None
+        universal_params: Optional[Dict[str, Any]] = None
     ) -> str:
         """
-        Create
+        Create a generic Web Scraper Task (Async).
+
+        IMPORTANT: You must retrieve the correct 'spider_id' and 'spider_name'
+        from the Thordata Dashboard before calling this method.
+
+        Args:
+            file_name (str): Name for the output file.
+            spider_id (str): The ID of the spider (from Dashboard).
+            spider_name (str): The name of the spider (e.g., "youtube.com").
+            individual_params (Dict): Parameters specific to the spider.
+            universal_params (Optional[Dict]): Global settings for the scraper.
+
+        Returns:
+            str: The created task_id.
         """
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }
 
+        # Payload construction
         payload = {
             "spider_name": spider_name,
             "spider_id": spider_id,
@@ -222,7 +264,7 @@ class ThordataClient:
         if universal_params:
             payload["spider_universal"] = json.dumps(universal_params)
 
-        logger.info(f"Creating Scraper Task: {spider_id}")
+        logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
         try:
             response = self.session.post(
                 self.SCRAPER_BUILDER_URL,
@@ -241,7 +283,13 @@ class ThordataClient:
 
     def get_task_status(self, task_id: str) -> str:
         """
-        Check the status of
+        Check the status of an asynchronous scraping task.
+
+        Args:
+            task_id (str): The ID returned by create_scraper_task.
+
+        Returns:
+            str: The status string (e.g., "finished", "running", "error").
         """
         headers = {
             "token": self.public_token,
@@ -271,6 +319,13 @@ class ThordataClient:
     def get_task_result(self, task_id: str, file_type: str = "json") -> str:
         """
         Retrieve the download URL for a completed task.
+
+        Args:
+            task_id (str): The task ID.
+            file_type (str): Format required (default "json").
+
+        Returns:
+            str: The URL to download the result file.
         """
         headers = {
             "token": self.public_token,
@@ -279,7 +334,7 @@ class ThordataClient:
         }
         payload = {"tasks_id": task_id, "type": file_type}
 
-        logger.info(f"Getting result URL: {task_id}")
+        logger.info(f"Getting result URL for Task: {task_id}")
         try:
             response = self.session.post(
                 self.SCRAPER_DOWNLOAD_URL,
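The PNG branch of `universal_scrape` strips an optional data-URI prefix and repairs Base64 padding before decoding. A standalone illustration of that clean-up (not part of the diff; the sample string is made up):

```python
import base64

png_str = "data:image/png;base64,iVBORw0KGgo"  # hypothetical, deliberately unpadded sample
if "," in png_str:
    png_str = png_str.split(",", 1)[1]          # drop the "data:image/png;base64," prefix
png_str = png_str.replace("\n", "").replace("\r", "")
missing_padding = len(png_str) % 4
if missing_padding:
    png_str += "=" * (4 - missing_padding)      # Base64 length must be a multiple of 4
print(base64.b64decode(png_str))                # b'\x89PNG\r\n\x1a\n' -- the PNG signature
```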
thordata/enums.py
ADDED
@@ -0,0 +1,25 @@
+# src/thordata/enums.py
+
+from enum import Enum
+
+class Engine(str, Enum):
+    """
+    Supported Search Engines for SERP API.
+    """
+    GOOGLE = "google"
+    BING = "bing"
+    YANDEX = "yandex"
+    DUCKDUCKGO = "duckduckgo"
+    BAIDU = "baidu"
+
+class GoogleSearchType(str, Enum):
+    """
+    Specific search types for Google Engine.
+    """
+    SEARCH = "search"      # Default web search
+    MAPS = "maps"          # Google Maps
+    SHOPPING = "shopping"  # Google Shopping
+    NEWS = "news"          # Google News
+    IMAGES = "images"      # Google Images
+    VIDEOS = "videos"      # Google Videos
+    # Users can pass other strings manually if needed
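Because `Engine` subclasses `str`, the clients accept either the enum member or a plain string, and `engine.value` yields the lowercase name sent to the API. A short illustration (not part of the diff):

```python
from thordata import Engine

print(Engine.GOOGLE.value)                # "google" -- the string sent to the SERP API
print(isinstance(Engine.BING, str))       # True, since Engine mixes in str
print(Engine("yandex") is Engine.YANDEX)  # lookup by value works for user-supplied strings
```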
thordata/parameters.py
ADDED
@@ -0,0 +1,52 @@
+# src/thordata/parameters.py
+
+from typing import Dict, Any, Optional
+
+def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
+    """
+    Normalizes parameters across different search engines to ensure a unified API surface.
+
+    Args:
+        engine (str): The search engine to use (e.g., 'google', 'yandex').
+        query (str): The search query string.
+        **kwargs: Additional parameters to pass to the API.
+
+    Returns:
+        Dict[str, Any]: The constructed payload for the API request.
+    """
+    # 1. Base parameters
+    payload = {
+        "num": str(kwargs.get("num", 10)),  # Default to 10 results
+        "json": "1",  # Force JSON response
+        "engine": engine,
+    }
+
+    # 2. Handle Query Parameter Differences (Yandex uses 'text', others use 'q')
+    if engine == "yandex":
+        payload["text"] = query
+        # Set default URL for Yandex if not provided
+        if "url" not in kwargs:
+            payload["url"] = "yandex.com"
+    else:
+        payload["q"] = query
+
+    # 3. Handle Default URLs for other engines
+    if "url" not in kwargs:
+        defaults = {
+            "google": "google.com",
+            "bing": "bing.com",
+            "duckduckgo": "duckduckgo.com",
+            "baidu": "baidu.com"
+        }
+        if engine in defaults:
+            payload["url"] = defaults[engine]
+
+    # 4. Passthrough for all other user-provided arguments
+    # This allows support for engine-specific parameters (e.g., tbm, uule, gl)
+    # without explicitly defining them all.
+    protected_keys = {"num", "engine", "q", "text"}
+    for key, value in kwargs.items():
+        if key not in protected_keys:
+            payload[key] = value
+
+    return payload
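For reference, a hypothetical call showing what `normalize_serp_params` produces for Yandex, where the query key becomes `text` and a default `url` is filled in (illustration only, not part of the diff):

```python
from thordata.parameters import normalize_serp_params

payload = normalize_serp_params("yandex", "python sdk", num=5, gl="us")
print(payload)
# {'num': '5', 'json': '1', 'engine': 'yandex', 'text': 'python sdk',
#  'url': 'yandex.com', 'gl': 'us'}
```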
thordata_sdk-0.3.0.dist-info/METADATA
ADDED

@@ -0,0 +1,197 @@
+Metadata-Version: 2.4
+Name: thordata-sdk
+Version: 0.3.0
+Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
+Author-email: Thordata Developer Team <support@thordata.com>
+License: Apache-2.0
+Project-URL: Homepage, https://www.thordata.com
+Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
+Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
+Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
+Keywords: web scraping,proxy,ai,llm,data-mining,serp,thordata
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests>=2.25.0
+Requires-Dist: aiohttp>=3.8.0
+Dynamic: license-file
+
+# Thordata Python SDK
+
+<h4 align="center">
+  Official Python client for Thordata's Proxy Network, SERP API, Universal Scraping API, and Web Scraper API.
+  <br>
+  <i>Async-ready, built for AI agents and large-scale data collection.</i>
+</h4>
+
+<p align="center">
+  <a href="https://pypi.org/project/thordata-sdk/">
+    <img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version">
+  </a>
+  <a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE">
+    <img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License">
+  </a>
+  <a href="https://python.org">
+    <img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions">
+  </a>
+</p>
+
+---
+
+## Installation
+
+```bash
+pip install thordata-sdk
+```
+
+## Quick Start
+
+All examples below use the unified client:
+
+```python
+from thordata import ThordataClient, AsyncThordataClient
+```
+
+You can copy `examples/.env.example` to `.env` and fill in your tokens from the Thordata Dashboard.
+
+### 1. Proxy Network (Simple GET)
+
+```python
+import os
+from dotenv import load_dotenv
+from thordata import ThordataClient
+
+load_dotenv()
+
+client = ThordataClient(
+    scraper_token=os.getenv("THORDATA_SCRAPER_TOKEN"),
+    public_token=os.getenv("THORDATA_PUBLIC_TOKEN"),
+    public_key=os.getenv("THORDATA_PUBLIC_KEY"),
+)
+
+resp = client.get("http://httpbin.org/ip")
+print(resp.json())
+```
+
+### 2. SERP API (Google, Bing, Yandex, DuckDuckGo)
+
+```python
+from thordata import ThordataClient, Engine
+
+client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+results = client.serp_search(
+    query="Thordata technology",
+    engine=Engine.GOOGLE,
+    num=10,
+    # Any engine-specific parameters are passed via **kwargs
+    # e.g. type="shopping", location="United States"
+)
+
+print(len(results.get("organic", [])))
+```
+
+### 3. Universal Scraping API
+
+```python
+from thordata import ThordataClient
+
+client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+html = client.universal_scrape(
+    url="https://www.google.com",
+    js_render=True,
+    output_format="HTML",
+)
+print(html[:200])
+```
+
+### 4. Web Scraper API (Task-based)
+
+```python
+import time
+from thordata import ThordataClient
+
+client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+task_id = client.create_scraper_task(
+    file_name="demo_youtube_data",
+    spider_id="youtube_video-post_by-url",
+    spider_name="youtube.com",
+    individual_params={
+        "url": "https://www.youtube.com/@stephcurry/videos",
+        "order_by": "",
+        "num_of_posts": ""
+    },
+)
+
+for _ in range(10):
+    status = client.get_task_status(task_id)
+    print("Status:", status)
+    if status in ["Ready", "Success"]:
+        break
+    if status == "Failed":
+        raise RuntimeError("Task failed")
+    time.sleep(3)
+
+download_url = client.get_task_result(task_id)
+print("Download URL:", download_url)
+```
+
+### 5. Asynchronous Usage (High Concurrency)
+
+```python
+import asyncio
+from thordata import AsyncThordataClient
+
+async def main():
+    async with AsyncThordataClient(
+        scraper_token="SCRAPER_TOKEN",
+        public_token="PUBLIC_TOKEN",
+        public_key="PUBLIC_KEY",
+    ) as client:
+        resp = await client.get("http://httpbin.org/ip")
+        print(await resp.json())
+
+asyncio.run(main())
+```
+
+More examples are available in the `examples/` directory.
+
+---
+
+## Features
+
+| Feature | Status | Description |
+|---------|--------|-------------|
+| Proxy Network | Stable | Residential, ISP, Mobile, Datacenter via HTTP/HTTPS gateway. |
+| SERP API | Stable | Google / Bing / Yandex / DuckDuckGo, flexible parameters. |
+| Universal Scraping API | Stable | JS rendering, HTML / PNG output, antibot bypass. |
+| Web Scraper API | Stable | Task-based scraping for complex sites (YouTube, E-commerce). |
+| Async Client | Stable | aiohttp-based client for high-concurrency workloads. |
+
+---
+
+## Development & Contributing
+
+See `CONTRIBUTING.md` for local development and contribution guidelines.
+
+## License
+
+This project is licensed under the Apache License 2.0.
+
+## Support
+
+For technical support, please contact support@thordata.com
+or verify your tokens and quotas in the Thordata Dashboard.
thordata_sdk-0.3.0.dist-info/RECORD
ADDED

@@ -0,0 +1,10 @@
+thordata/__init__.py,sha256=HVb6cHBsYRFoA1Sf_y_WSZ88vGV3DsT67rCdbZSuUYE,365
+thordata/async_client.py,sha256=cpBtRIzr8oH6GuZs8gTh505tGYYV1aRFBUzbtmFOfEg,9717
+thordata/client.py,sha256=w_EXs6CLM2qFtFPNU-x_Li66LEH1j7pQb2ca2MDKqyA,12432
+thordata/enums.py,sha256=PGUCQX3jw5a9mX8_JfhuyoR1WriWjWQpAgibVP_bpdM,679
+thordata/parameters.py,sha256=1lNx_BSS8ztBKEj_MXZMaIQQ9_W3EAlS-VFiBqSWb9E,1841
+thordata_sdk-0.3.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+thordata_sdk-0.3.0.dist-info/METADATA,sha256=Yj6W3vSLkkUhSXTj6AK4AaMfdlJvGOVaK6cFI2MNqV8,5697
+thordata_sdk-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thordata_sdk-0.3.0.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
+thordata_sdk-0.3.0.dist-info/RECORD,,
thordata_sdk-0.3.0.dist-info/top_level.txt
ADDED

@@ -0,0 +1 @@
+thordata
thordata_sdk/__init__.py
DELETED
thordata_sdk-0.2.3.dist-info/METADATA
DELETED

@@ -1,125 +0,0 @@
-Metadata-Version: 2.4
-Name: thordata_sdk
-Version: 0.2.3
-Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
-Home-page: https://github.com/Thordata/thordata-python-sdk
-Author: Thordata Developer Team
-Author-email: support@thordata.com
-License: Apache License 2.0
-Project-URL: Bug Tracker, https://github.com/Thordata/thordata-python-sdk/issues
-Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Operating System :: OS Independent
-Classifier: Topic :: Internet :: WWW/HTTP
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: requests>=2.25.0
-Requires-Dist: aiohttp>=3.8.0
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: license-file
-Dynamic: project-url
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
-
-# Thordata Python SDK
-
-<h4 align="center">
-  The Official Python Client for the Thordata Proxy Network & Web Scraper API.
-  <br>
-  <i>High-performance, async-ready, designed for AI Agents and large-scale data collection.</i>
-</h4>
-
-<p align="center">
-  <a href="https://pypi.org/project/thordata-sdk/"><img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version"></a>
-  <a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License"></a>
-  <a href="https://python.org"><img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions"></a>
-</p>
-
----
-
-## 🛠 Installation
-
-Install via pip:
-
-```bash
-pip install thordata-sdk
-```
-
-## ⚡ Quick Start
-
-### 1. Proxy Usage (Simple GET Request)
-
-**Python**
-
-```python
-from thordata_sdk import ThordataClient
-
-# Initialize with your credentials from the Thordata Dashboard
-client = ThordataClient(
-    scraper_token="YOUR_SCRAPER_TOKEN",  # From "Scraping Tool Token"
-    public_token="YOUR_PUBLIC_TOKEN",    # From "Public API"
-    public_key="YOUR_PUBLIC_KEY"         # From "Public API"
-)
-
-# Send a request through the proxy
-response = client.get("http://httpbin.org/ip")
-print(response.json())
-```
-
-### 2. Real-time SERP Search
-
-**Python**
-
-```python
-results = client.serp_search("Thordata technology", engine="google")
-print(f"Results found: {len(results.get('organic', []))}")
-```
-
-### 3. Asynchronous Usage (High Concurrency)
-
-**Python**
-
-```python
-import asyncio
-from thordata_sdk import AsyncThordataClient
-
-async def main():
-    async with AsyncThordataClient(scraper_token="...", public_token="...", public_key="...") as client:
-        response = await client.get("http://httpbin.org/ip")
-        print(await response.json())
-
-asyncio.run(main())
-```
-
-## ⚙️ Features Status
-
-| Feature | Status | Description |
-|---------|--------|-------------|
-| Proxy Network | ✅ Stable | Synchronous & Asynchronous support via aiohttp. |
-| SERP API | ✅ Stable | Real-time Google/Bing/Yandex search results. |
-| Web Scraper | ✅ Stable | Async task management for scraping complex sites (e.g., YouTube). |
-| Authentication | ✅ Secure | Dual-token system for enhanced security. |
-
-## 📄 License
-
-This project is licensed under the Apache License 2.0.
-
-## 📞 Support
-
-For technical assistance, please contact support@thordata.com or verify your tokens in the Thordata Dashboard.
thordata_sdk-0.2.3.dist-info/RECORD
DELETED

@@ -1,8 +0,0 @@
-thordata_sdk/__init__.py,sha256=aZ2P8F15HJlnnuMRYA1R-ENcZRVQ7eo0r1SD4a_1UbI,223
-thordata_sdk/async_client.py,sha256=fwoDSQA2GdikkNHrbKAoLwjqmn-zafEoe2HGf-j8bp8,8202
-thordata_sdk/client.py,sha256=drlhRHCCUoYiwmaJHLsYQZrfj7rB5wsK2P2yn2DkhqQ,9732
-thordata_sdk-0.2.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-thordata_sdk-0.2.3.dist-info/METADATA,sha256=X_b16_FfyQmV7VS9Wy_QRtgXp8JVYhxSatt0HpAA9QU,4003
-thordata_sdk-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-thordata_sdk-0.2.3.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
-thordata_sdk-0.2.3.dist-info/RECORD,,
thordata_sdk-0.2.3.dist-info/top_level.txt
DELETED

@@ -1 +0,0 @@
-thordata_sdk
{thordata_sdk-0.2.3.dist-info → thordata_sdk-0.3.0.dist-info}/WHEEL
File without changes

{thordata_sdk-0.2.3.dist-info → thordata_sdk-0.3.0.dist-info}/licenses/LICENSE
File without changes