thordata-sdk 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +16 -0
- {thordata_sdk → thordata}/async_client.py +67 -33
- {thordata_sdk → thordata}/client.py +92 -43
- thordata/enums.py +25 -0
- thordata/parameters.py +52 -0
- thordata_sdk-0.3.0.dist-info/METADATA +197 -0
- thordata_sdk-0.3.0.dist-info/RECORD +10 -0
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-0.3.0.dist-info}/WHEEL +1 -1
- thordata_sdk-0.3.0.dist-info/top_level.txt +1 -0
- thordata_sdk/__init__.py +0 -9
- thordata_sdk/enums.py +0 -20
- thordata_sdk/parameters.py +0 -41
- thordata_sdk-0.2.4.dist-info/METADATA +0 -113
- thordata_sdk-0.2.4.dist-info/RECORD +0 -10
- thordata_sdk-0.2.4.dist-info/top_level.txt +0 -1
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-0.3.0.dist-info/licenses}/LICENSE +0 -0
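The most visible change in 0.3.0 is the rename of the import package from `thordata_sdk` to `thordata` (compare the two `top_level.txt` entries above). A minimal migration sketch, assuming the 0.3.0 wheel is installed:

```python
# 0.2.4 imported from the old top-level package:
# from thordata_sdk import ThordataClient, AsyncThordataClient

# 0.3.0 renames the package and also exports the enums at the root:
import thordata
from thordata import ThordataClient, AsyncThordataClient, Engine, GoogleSearchType

print(thordata.__version__)  # "0.3.0"
```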
thordata/__init__.py
ADDED
@@ -0,0 +1,16 @@
+# src/thordata/__init__.py
+
+from .client import ThordataClient
+from .async_client import AsyncThordataClient
+from .enums import Engine, GoogleSearchType
+
+# Package version
+__version__ = "0.3.0"
+
+# Explicitly export classes to simplify user imports
+__all__ = [
+    "ThordataClient",
+    "AsyncThordataClient",
+    "Engine",
+    "GoogleSearchType"
+]
{thordata_sdk → thordata}/async_client.py
RENAMED

@@ -4,7 +4,7 @@ import json
 import base64
 from typing import Optional, Dict, Any, Union

-#
+# Import shared logic
 from .enums import Engine
 from .parameters import normalize_serp_params

@@ -13,7 +13,8 @@ logger = logging.getLogger(__name__)

 class AsyncThordataClient:
     """
-
+    The official Asynchronous Python client for Thordata (built on aiohttp).
+    Designed for high-concurrency AI agents and data pipelines.
     """

     def __init__(
@@ -24,13 +25,18 @@ class AsyncThordataClient:
         proxy_host: str = "gate.thordata.com",
         proxy_port: int = 22225
     ):
+        """
+        Initialize the Async Client.
+        """
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key

+        # Pre-calculate proxy auth for performance
         self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
         self.proxy_url = f"http://{proxy_host}:{proxy_port}"

+        # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
         self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -41,6 +47,7 @@ class AsyncThordataClient:
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"

+        # Session is initialized lazily or via context manager
         self._session: Optional[aiohttp.ClientSession] = None

     async def __aenter__(self):
@@ -52,16 +59,27 @@ class AsyncThordataClient:
         await self.close()

     async def close(self):
+        """Close the underlying aiohttp session."""
         if self._session and not self._session.closed:
             await self._session.close()
             self._session = None

-
+    def _get_session(self) -> aiohttp.ClientSession:
+        """Internal helper to ensure session exists."""
+        if self._session is None or self._session.closed:
+            raise RuntimeError(
+                "Client session not initialized. Use 'async with ThordataClient(...) as client:'"
+            )
+        return self._session
+
     async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-
-
+        """
+        Send an async GET request through the Proxy Network.
+        """
+        session = self._get_session()
         try:
-
+            logger.debug(f"Async Proxy Request: {url}")
+            return await session.get(
                 url,
                 proxy=self.proxy_url,
                 proxy_auth=self.proxy_auth,
@@ -71,7 +89,6 @@ class AsyncThordataClient:
             logger.error(f"Async Request failed: {e}")
             raise

-    # --- SERP (Optimized) ---
     async def serp_search(
         self,
         query: str,
@@ -82,13 +99,12 @@ class AsyncThordataClient:
         """
         Execute a real-time SERP search (Async).
         """
-
-            raise RuntimeError("Client session not initialized.")
+        session = self._get_session()

-        # 1.
+        # 1. Handle Enum conversion
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()

-        # 2.
+        # 2. Normalize parameters
         payload = normalize_serp_params(engine_str, query, num=num, **kwargs)

         headers = {
@@ -96,30 +112,34 @@ class AsyncThordataClient:
             "Content-Type": "application/x-www-form-urlencoded"
         }

-        # 3.
-
+        # 3. Execute Request
+        logger.info(f"Async SERP Search: {engine_str} - {query}")
+        async with session.post(
             self.SERP_API_URL, data=payload, headers=headers
         ) as response:
             response.raise_for_status()
+
             data = await response.json()
+            # Handle double-encoded JSON strings if they occur
             if isinstance(data, str):
                 try:
                     data = json.loads(data)
-                except
+                except json.JSONDecodeError:
                     pass
             return data

-    # --- Universal (Unchanged) ---
     async def universal_scrape(
         self,
         url: str,
         js_render: bool = False,
         output_format: str = "HTML",
-        country: str = None,
+        country: Optional[str] = None,
         block_resources: bool = False
     ) -> Union[str, bytes]:
-
-
+        """
+        Async Universal Scraping (Bypass Cloudflare/CAPTCHA).
+        """
+        session = self._get_session()

         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
@@ -135,18 +155,21 @@ class AsyncThordataClient:
         if country:
             payload["country"] = country

-
+        logger.info(f"Async Universal Scrape: {url}")
+        async with session.post(
             self.UNIVERSAL_API_URL, data=payload, headers=headers
         ) as response:
             response.raise_for_status()

             try:
                 resp_json = await response.json()
-            except
+            except json.JSONDecodeError:
+                # Fallback for raw content
                 if output_format.upper() == "PNG":
                     return await response.read()
                 return await response.text()

+            # Check API error codes
             if isinstance(resp_json, dict) and resp_json.get("code") \
                     and resp_json.get("code") != 200:
                 raise Exception(f"Universal API Error: {resp_json}")
@@ -159,39 +182,38 @@ class AsyncThordataClient:
                 if not png_str:
                     raise Exception("API returned empty PNG data")

-                #
+                # Clean Data URI Scheme
                 if "," in png_str:
                     png_str = png_str.split(",", 1)[1]

+                # Fix Base64 Padding
                 png_str = png_str.replace("\n", "").replace("\r", "")
                 missing_padding = len(png_str) % 4
                 if missing_padding:
                     png_str += '=' * (4 - missing_padding)
+
                 return base64.b64decode(png_str)

             return str(resp_json)

-    # --- Web Scraper (Optimized) ---
     async def create_scraper_task(
         self,
         file_name: str,
         spider_id: str,
         spider_name: str,
         individual_params: Dict[str, Any],
-        universal_params: Dict[str, Any] = None
+        universal_params: Optional[Dict[str, Any]] = None
     ) -> str:
         """
         Create an Asynchronous Web Scraper Task.
         """
-
-            raise RuntimeError("Client session not initialized.")
+        session = self._get_session()

         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }

-        # Simplify payload construction; remove unnecessary checks
         payload = {
             "file_name": file_name,
             "spider_id": spider_id,
@@ -202,17 +224,23 @@ class AsyncThordataClient:
         if universal_params:
             payload["spider_universal"] = json.dumps(universal_params)

-
+        logger.info(f"Async Task Creation: {spider_name}")
+        async with session.post(
             self.SCRAPER_BUILDER_URL, data=payload, headers=headers
         ) as response:
             response.raise_for_status()
             data = await response.json()
+
             if data.get("code") != 200:
                 raise Exception(f"Creation failed: {data}")
             return data["data"]["task_id"]

-    # --- Status & Result (Unchanged) ---
     async def get_task_status(self, task_id: str) -> str:
+        """
+        Check task status.
+        """
+        session = self._get_session()
+
         headers = {
             "token": self.public_token,
             "key": self.public_key,
@@ -220,28 +248,34 @@ class AsyncThordataClient:
         }
         payload = {"tasks_ids": task_id}

-        async with
+        async with session.post(
             self.SCRAPER_STATUS_URL, data=payload, headers=headers
         ) as response:
             data = await response.json()
             if data.get("code") == 200 and data.get("data"):
                 for item in data["data"]:
-                    if str(item
+                    if str(item.get("task_id")) == str(task_id):
                         return item["status"]
         return "Unknown"

     async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
+        """
+        Get the download URL for a finished task.
+        """
+        session = self._get_session()
+
         headers = {
             "token": self.public_token,
             "key": self.public_key,
             "Content-Type": "application/x-www-form-urlencoded"
         }
-
+        # Fixed: Use the file_type argument instead of hardcoding "json"
+        payload = {"tasks_id": task_id, "type": file_type}

-        async with
+        async with session.post(
             self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
         ) as response:
             data = await response.json()
-            if data.get("code") == 200:
+            if data.get("code") == 200 and data.get("data"):
                 return data["data"]["download"]
             raise Exception(f"Result Error: {data}")
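The recurring edit in `async_client.py` is that every request method now obtains its session via the new `_get_session()` helper instead of ad-hoc inline checks. A short sketch of the resulting failure mode, assuming 0.3.0 is installed and using placeholder tokens:

```python
import asyncio
from thordata import AsyncThordataClient

async def main():
    # Constructed outside 'async with', so no aiohttp session exists yet.
    client = AsyncThordataClient(
        scraper_token="SCRAPER_TOKEN",  # placeholder
        public_token="PUBLIC_TOKEN",    # placeholder
        public_key="PUBLIC_KEY",        # placeholder
    )
    try:
        await client.serp_search("thordata")
    except RuntimeError as exc:
        # _get_session() raises a clear error instead of failing deep inside aiohttp
        print(exc)

asyncio.run(main())
```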
{thordata_sdk → thordata}/client.py
RENAMED

@@ -7,7 +7,7 @@ from typing import Dict, Any, Union, Optional
 from .enums import Engine
 from .parameters import normalize_serp_params

-# Configure a library-specific logger
+# Configure a library-specific logger to avoid interfering with user's logging
 logger = logging.getLogger(__name__)


@@ -15,11 +15,11 @@ class ThordataClient:
     """
     The official synchronous Python client for Thordata.

-
-    1. Proxy Network (HTTP/HTTPS)
-    2. SERP API (Real-time Search)
-    3. Universal Scraping API (Single Page)
-    4. Web Scraper API (Async Task Management)
+    This client handles authentication and communication with:
+    1. Proxy Network (Residential/Datacenter via HTTP/HTTPS)
+    2. SERP API (Real-time Search Engine Results)
+    3. Universal Scraping API (Single Page Rendering & Extraction)
+    4. Web Scraper API (Async Task Management for large scale jobs)
     """

     def __init__(
@@ -34,11 +34,11 @@ class ThordataClient:
         Initialize the Thordata Client.

         Args:
-            scraper_token (str):
-            public_token (str):
-            public_key (str):
-            proxy_host (str):
-            proxy_port (int):
+            scraper_token (str): The secret token found at the bottom of the Dashboard.
+            public_token (str): The token from the Public API section.
+            public_key (str): The key from the Public API section.
+            proxy_host (str): The proxy gateway host (default: gate.thordata.com).
+            proxy_port (int): The proxy gateway port (default: 22225).
         """
         self.scraper_token = scraper_token
         self.public_token = public_token
@@ -49,7 +49,7 @@ class ThordataClient:
             f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
         )

-        # API Endpoints
+        # API Endpoints Definition
         self.base_url = "https://scraperapi.thordata.com"
         self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
@@ -60,6 +60,7 @@ class ThordataClient:
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"

+        # Initialize Session with Proxy settings
         self.session = requests.Session()
         self.session.proxies = {
             "http": self.proxy_url,
@@ -68,7 +69,14 @@ class ThordataClient:

     def get(self, url: str, **kwargs) -> requests.Response:
         """
-        Send a GET request through the Thordata Proxy Network.
+        Send a standard GET request through the Thordata Residential Proxy Network.
+
+        Args:
+            url (str): The target URL.
+            **kwargs: Arguments to pass to requests.get().
+
+        Returns:
+            requests.Response: The response object.
         """
         logger.debug(f"Proxy Request: {url}")
         kwargs.setdefault("timeout", 30)
@@ -77,23 +85,26 @@ class ThordataClient:
     def serp_search(
         self,
         query: str,
-        engine: Union[Engine, str] = Engine.GOOGLE,
+        engine: Union[Engine, str] = Engine.GOOGLE,
         num: int = 10,
-        **kwargs
+        **kwargs
     ) -> Dict[str, Any]:
         """
-        Execute a real-time SERP search.
+        Execute a real-time SERP (Search Engine Results Page) search.

         Args:
-            query:
-            engine: 'google', 'bing'
-            num: Number of results (default 10)
-            **kwargs:
+            query (str): The search keywords.
+            engine (Union[Engine, str]): The search engine (e.g., 'google', 'bing').
+            num (int): Number of results to retrieve (default 10).
+            **kwargs: Additional parameters (e.g., type="shopping", location="London").
+
+        Returns:
+            Dict[str, Any]: The parsed JSON result from the search engine.
         """
-        #
+        # Handle Enum or String input for engine
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()

-        #
+        # Normalize parameters via internal helper
         payload = normalize_serp_params(engine_str, query, num=num, **kwargs)

         headers = {
@@ -112,25 +123,38 @@ class ThordataClient:
             response.raise_for_status()

             data = response.json()
+            # Handle cases where the API returns a stringified JSON
             if isinstance(data, str):
-                try:
-
+                try:
+                    data = json.loads(data)
+                except json.JSONDecodeError:
+                    pass
             return data
         except Exception as e:
             logger.error(f"SERP Request Failed: {e}")
             raise

-
     def universal_scrape(
         self,
         url: str,
         js_render: bool = False,
         output_format: str = "HTML",
-        country: str = None,
+        country: Optional[str] = None,
         block_resources: bool = False
     ) -> Union[str, bytes]:
         """
         Unlock target pages via the Universal Scraping API.
+        Bypasses Cloudflare, CAPTCHAs, and antibot systems automatically.
+
+        Args:
+            url (str): Target URL.
+            js_render (bool): Whether to render JavaScript (Headless Browser).
+            output_format (str): "HTML" or "PNG" (screenshot).
+            country (Optional[str]): Geo-targeting country code (e.g., 'us').
+            block_resources (bool): Block images/css to speed up loading.
+
+        Returns:
+            Union[str, bytes]: HTML string or PNG bytes.
         """
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
@@ -146,7 +170,7 @@ class ThordataClient:
         if country:
             payload["country"] = country

-        logger.info(f"Universal Scrape: {url}")
+        logger.info(f"Universal Scrape: {url} (Format: {output_format})")

         try:
             response = self.session.post(
@@ -157,35 +181,35 @@ class ThordataClient:
             )
             response.raise_for_status()

-            #
+            # Attempt to parse JSON wrapper
             try:
                 resp_json = response.json()
             except json.JSONDecodeError:
-                # Fallback
+                # Fallback: if the API returns raw content directly
                 if output_format.upper() == "PNG":
                     return response.content
                 return response.text

-            # Check API errors
+            # Check for API-level errors inside the JSON
             if isinstance(resp_json, dict) and resp_json.get("code") \
                     and resp_json.get("code") != 200:
                 raise Exception(f"Universal API Error: {resp_json}")

-            #
+            # Case 1: Return HTML
             if "html" in resp_json:
                 return resp_json["html"]

-            #
+            # Case 2: Return PNG Image
             if "png" in resp_json:
                 png_str = resp_json["png"]
                 if not png_str:
                     raise Exception("API returned empty PNG data")

-                #
+                # Clean Data URI Scheme if present (e.g., data:image/png;base64,...)
                 if "," in png_str:
                     png_str = png_str.split(",", 1)[1]

-                # Base64
+                # Fix Base64 Padding
                 png_str = png_str.replace("\n", "").replace("\r", "")
                 missing_padding = len(png_str) % 4
                 if missing_padding:
@@ -193,6 +217,7 @@ class ThordataClient:

                 return base64.b64decode(png_str)

+            # Fallback
             return str(resp_json)

         except Exception as e:
@@ -202,22 +227,33 @@ class ThordataClient:
     def create_scraper_task(
         self,
         file_name: str,
-        spider_id: str,
-        spider_name: str,
-        individual_params: Dict[str, Any],
-        universal_params: Dict[str, Any] = None
+        spider_id: str,
+        spider_name: str,
+        individual_params: Dict[str, Any],
+        universal_params: Optional[Dict[str, Any]] = None
     ) -> str:
         """
-        Create a generic Web Scraper Task.
+        Create a generic Web Scraper Task (Async).

-
+        IMPORTANT: You must retrieve the correct 'spider_id' and 'spider_name'
+        from the Thordata Dashboard before calling this method.
+
+        Args:
+            file_name (str): Name for the output file.
+            spider_id (str): The ID of the spider (from Dashboard).
+            spider_name (str): The name of the spider (e.g., "youtube.com").
+            individual_params (Dict): Parameters specific to the spider.
+            universal_params (Optional[Dict]): Global settings for the scraper.
+
+        Returns:
+            str: The created task_id.
         """
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }

-        #
+        # Payload construction
         payload = {
             "spider_name": spider_name,
             "spider_id": spider_id,
@@ -247,7 +283,13 @@ class ThordataClient:

     def get_task_status(self, task_id: str) -> str:
         """
-        Check the status of
+        Check the status of an asynchronous scraping task.
+
+        Args:
+            task_id (str): The ID returned by create_scraper_task.
+
+        Returns:
+            str: The status string (e.g., "finished", "running", "error").
         """
         headers = {
             "token": self.public_token,
@@ -277,6 +319,13 @@ class ThordataClient:
     def get_task_result(self, task_id: str, file_type: str = "json") -> str:
         """
         Retrieve the download URL for a completed task.
+
+        Args:
+            task_id (str): The task ID.
+            file_type (str): Format required (default "json").
+
+        Returns:
+            str: The URL to download the result file.
         """
         headers = {
             "token": self.public_token,
@@ -285,7 +334,7 @@ class ThordataClient:
         }
         payload = {"tasks_id": task_id, "type": file_type}

-        logger.info(f"Getting result URL: {task_id}")
+        logger.info(f"Getting result URL for Task: {task_id}")
         try:
             response = self.session.post(
                 self.SCRAPER_DOWNLOAD_URL,
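Both clients share the screenshot-decoding steps seen above: strip any data-URI prefix, drop line breaks, restore the `=` padding, then base64-decode. The same logic as a standalone sketch (`decode_png_field` is a name invented here for illustration):

```python
import base64

def decode_png_field(png_str: str) -> bytes:
    # Strip a data-URI prefix such as "data:image/png;base64,..."
    if "," in png_str:
        png_str = png_str.split(",", 1)[1]
    # Remove line breaks, then restore the '=' padding base64 requires
    png_str = png_str.replace("\n", "").replace("\r", "")
    missing_padding = len(png_str) % 4
    if missing_padding:
        png_str += "=" * (4 - missing_padding)
    return base64.b64decode(png_str)

print(decode_png_field("data:image/png;base64,UE5H"))  # b'PNG'
```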
thordata/enums.py
ADDED
@@ -0,0 +1,25 @@
+# src/thordata/enums.py
+
+from enum import Enum
+
+class Engine(str, Enum):
+    """
+    Supported Search Engines for SERP API.
+    """
+    GOOGLE = "google"
+    BING = "bing"
+    YANDEX = "yandex"
+    DUCKDUCKGO = "duckduckgo"
+    BAIDU = "baidu"
+
+class GoogleSearchType(str, Enum):
+    """
+    Specific search types for Google Engine.
+    """
+    SEARCH = "search"      # Default web search
+    MAPS = "maps"          # Google Maps
+    SHOPPING = "shopping"  # Google Shopping
+    NEWS = "news"          # Google News
+    IMAGES = "images"      # Google Images
+    VIDEOS = "videos"      # Google Videos
+    # Users can pass other strings manually if needed
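Because both enums subclass `str`, members compare equal to their plain-string values, which is why `serp_search` can accept either an `Engine` member or a raw string. For illustration, assuming 0.3.0 is installed:

```python
from thordata import Engine, GoogleSearchType

print(Engine.GOOGLE.value)              # google
print(Engine.GOOGLE == "google")        # True: a str-subclass enum equals its raw value
print(GoogleSearchType.SHOPPING.value)  # shopping, usable as serp_search(..., type="shopping")
```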
thordata/parameters.py
ADDED
@@ -0,0 +1,52 @@
+# src/thordata/parameters.py
+
+from typing import Dict, Any, Optional
+
+def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
+    """
+    Normalizes parameters across different search engines to ensure a unified API surface.
+
+    Args:
+        engine (str): The search engine to use (e.g., 'google', 'yandex').
+        query (str): The search query string.
+        **kwargs: Additional parameters to pass to the API.
+
+    Returns:
+        Dict[str, Any]: The constructed payload for the API request.
+    """
+    # 1. Base parameters
+    payload = {
+        "num": str(kwargs.get("num", 10)),  # Default to 10 results
+        "json": "1",  # Force JSON response
+        "engine": engine,
+    }
+
+    # 2. Handle Query Parameter Differences (Yandex uses 'text', others use 'q')
+    if engine == "yandex":
+        payload["text"] = query
+        # Set default URL for Yandex if not provided
+        if "url" not in kwargs:
+            payload["url"] = "yandex.com"
+    else:
+        payload["q"] = query
+
+    # 3. Handle Default URLs for other engines
+    if "url" not in kwargs:
+        defaults = {
+            "google": "google.com",
+            "bing": "bing.com",
+            "duckduckgo": "duckduckgo.com",
+            "baidu": "baidu.com"
+        }
+        if engine in defaults:
+            payload["url"] = defaults[engine]
+
+    # 4. Passthrough for all other user-provided arguments
+    # This allows support for engine-specific parameters (e.g., tbm, uule, gl)
+    # without explicitly defining them all.
+    protected_keys = {"num", "engine", "q", "text"}
+    for key, value in kwargs.items():
+        if key not in protected_keys:
+            payload[key] = value
+
+    return payload
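`normalize_serp_params` is a pure function, so its behavior is easy to pin down: the Yandex branch is the only engine-specific rule, and everything else passes through untouched. Expected payloads, assuming the module above:

```python
from thordata.parameters import normalize_serp_params

print(normalize_serp_params("google", "thordata", num=5, gl="uk"))
# {'num': '5', 'json': '1', 'engine': 'google', 'q': 'thordata', 'url': 'google.com', 'gl': 'uk'}

print(normalize_serp_params("yandex", "thordata"))
# {'num': '10', 'json': '1', 'engine': 'yandex', 'text': 'thordata', 'url': 'yandex.com'}
```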
thordata_sdk-0.3.0.dist-info/METADATA
ADDED

@@ -0,0 +1,197 @@
+Metadata-Version: 2.4
+Name: thordata-sdk
+Version: 0.3.0
+Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
+Author-email: Thordata Developer Team <support@thordata.com>
+License: Apache-2.0
+Project-URL: Homepage, https://www.thordata.com
+Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
+Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
+Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
+Keywords: web scraping,proxy,ai,llm,data-mining,serp,thordata
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests>=2.25.0
+Requires-Dist: aiohttp>=3.8.0
+Dynamic: license-file
+
+# Thordata Python SDK
+
+<h4 align="center">
+Official Python client for Thordata's Proxy Network, SERP API, Universal Scraping API, and Web Scraper API.
+<br>
+<i>Async-ready, built for AI agents and large-scale data collection.</i>
+</h4>
+
+<p align="center">
+<a href="https://pypi.org/project/thordata-sdk/">
+<img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version">
+</a>
+<a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE">
+<img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License">
+</a>
+<a href="https://python.org">
+<img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions">
+</a>
+</p>
+
+---
+
+## Installation
+
+```bash
+pip install thordata-sdk
+```
+
+## Quick Start
+
+All examples below use the unified client:
+
+```python
+from thordata import ThordataClient, AsyncThordataClient
+```
+
+You can copy `examples/.env.example` to `.env` and fill in your tokens from the Thordata Dashboard.
+
+### 1. Proxy Network (Simple GET)
+
+```python
+import os
+from dotenv import load_dotenv
+from thordata import ThordataClient
+
+load_dotenv()
+
+client = ThordataClient(
+    scraper_token=os.getenv("THORDATA_SCRAPER_TOKEN"),
+    public_token=os.getenv("THORDATA_PUBLIC_TOKEN"),
+    public_key=os.getenv("THORDATA_PUBLIC_KEY"),
+)
+
+resp = client.get("http://httpbin.org/ip")
+print(resp.json())
+```
+
+### 2. SERP API (Google, Bing, Yandex, DuckDuckGo)
+
+```python
+from thordata import ThordataClient, Engine
+
+client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+results = client.serp_search(
+    query="Thordata technology",
+    engine=Engine.GOOGLE,
+    num=10,
+    # Any engine-specific parameters are passed via **kwargs
+    # e.g. type="shopping", location="United States"
+)
+
+print(len(results.get("organic", [])))
+```
+
+### 3. Universal Scraping API
+
+```python
+from thordata import ThordataClient
+
+client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+html = client.universal_scrape(
+    url="https://www.google.com",
+    js_render=True,
+    output_format="HTML",
+)
+print(html[:200])
+```
+
+### 4. Web Scraper API (Task-based)
+
+```python
+import time
+from thordata import ThordataClient
+
+client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+task_id = client.create_scraper_task(
+    file_name="demo_youtube_data",
+    spider_id="youtube_video-post_by-url",
+    spider_name="youtube.com",
+    individual_params={
+        "url": "https://www.youtube.com/@stephcurry/videos",
+        "order_by": "",
+        "num_of_posts": ""
+    },
+)
+
+for _ in range(10):
+    status = client.get_task_status(task_id)
+    print("Status:", status)
+    if status in ["Ready", "Success"]:
+        break
+    if status == "Failed":
+        raise RuntimeError("Task failed")
+    time.sleep(3)
+
+download_url = client.get_task_result(task_id)
+print("Download URL:", download_url)
+```
+
+### 5. Asynchronous Usage (High Concurrency)
+
+```python
+import asyncio
+from thordata import AsyncThordataClient
+
+async def main():
+    async with AsyncThordataClient(
+        scraper_token="SCRAPER_TOKEN",
+        public_token="PUBLIC_TOKEN",
+        public_key="PUBLIC_KEY",
+    ) as client:
+        resp = await client.get("http://httpbin.org/ip")
+        print(await resp.json())
+
+asyncio.run(main())
+```
+
+More examples are available in the `examples/` directory.
+
+---
+
+## Features
+
+| Feature | Status | Description |
+|---------|--------|-------------|
+| Proxy Network | Stable | Residential, ISP, Mobile, Datacenter via HTTP/HTTPS gateway. |
+| SERP API | Stable | Google / Bing / Yandex / DuckDuckGo, flexible parameters. |
+| Universal Scraping API | Stable | JS rendering, HTML / PNG output, antibot bypass. |
+| Web Scraper API | Stable | Task-based scraping for complex sites (YouTube, E-commerce). |
+| Async Client | Stable | aiohttp-based client for high-concurrency workloads. |
+
+---
+
+## Development & Contributing
+
+See `CONTRIBUTING.md` for local development and contribution guidelines.
+
+## License
+
+This project is licensed under the Apache License 2.0.
+
+## Support
+
+For technical support, please contact support@thordata.com
+or verify your tokens and quotas in the Thordata Dashboard.
thordata_sdk-0.3.0.dist-info/RECORD
ADDED

@@ -0,0 +1,10 @@
+thordata/__init__.py,sha256=HVb6cHBsYRFoA1Sf_y_WSZ88vGV3DsT67rCdbZSuUYE,365
+thordata/async_client.py,sha256=cpBtRIzr8oH6GuZs8gTh505tGYYV1aRFBUzbtmFOfEg,9717
+thordata/client.py,sha256=w_EXs6CLM2qFtFPNU-x_Li66LEH1j7pQb2ca2MDKqyA,12432
+thordata/enums.py,sha256=PGUCQX3jw5a9mX8_JfhuyoR1WriWjWQpAgibVP_bpdM,679
+thordata/parameters.py,sha256=1lNx_BSS8ztBKEj_MXZMaIQQ9_W3EAlS-VFiBqSWb9E,1841
+thordata_sdk-0.3.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+thordata_sdk-0.3.0.dist-info/METADATA,sha256=Yj6W3vSLkkUhSXTj6AK4AaMfdlJvGOVaK6cFI2MNqV8,5697
+thordata_sdk-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thordata_sdk-0.3.0.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
+thordata_sdk-0.3.0.dist-info/RECORD,,
thordata_sdk-0.3.0.dist-info/top_level.txt
ADDED

@@ -0,0 +1 @@
+thordata
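`top_level.txt` confirms the wheel now installs a single `thordata` package, so the old name should stop resolving after an upgrade. A quick sanity check, assuming only 0.3.0 is installed:

```python
import importlib.util

print(importlib.util.find_spec("thordata") is not None)      # True
print(importlib.util.find_spec("thordata_sdk") is not None)  # False
```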
thordata_sdk/__init__.py
DELETED
@@ -1,9 +0,0 @@
-# Expose main clients
-from .client import ThordataClient
-from .async_client import AsyncThordataClient
-from .enums import Engine, GoogleSearchType
-
-# Version of the thordata-sdk package
-__version__ = "0.2.4"
-
-__all__ = ["ThordataClient", "AsyncThordataClient"]

thordata_sdk/enums.py
DELETED
@@ -1,20 +0,0 @@
-# thordata_sdk/enums.py
-from enum import Enum
-
-class Engine(str, Enum):
-    """The major engines supported by the SERP core"""
-    GOOGLE = "google"
-    BING = "bing"
-    YANDEX = "yandex"
-    DUCKDUCKGO = "duckduckgo"
-    BAIDU = "baidu"
-
-class GoogleSearchType(str, Enum):
-    """Common sub-types of Google search"""
-    SEARCH = "search"      # Default web search
-    MAPS = "maps"          # Maps
-    SHOPPING = "shopping"  # Shopping
-    NEWS = "news"          # News
-    IMAGES = "images"      # Images
-    VIDEOS = "videos"      # Videos
-    # Less common types are omitted for now; users can pass them as strings

thordata_sdk/parameters.py
DELETED
@@ -1,41 +0,0 @@
-# thordata_sdk/parameters.py
-from typing import Dict, Any
-
-def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
-    """
-    Unify parameter differences across search engines.
-    """
-    # 1. Base parameters
-    payload = {
-        "num": str(kwargs.get("num", 10)),
-        "json": "1",
-        "engine": engine,
-    }
-
-    # 2. Handle the query keyword (Yandex uses 'text', others use 'q')
-    if engine == "yandex":
-        payload["text"] = query
-        # Provide a default if the user did not pass a url
-        if "url" not in kwargs:
-            payload["url"] = "yandex.com"
-    else:
-        payload["q"] = query
-
-    # 3. Handle default URLs (if the user did not pass one)
-    if "url" not in kwargs:
-        defaults = {
-            "google": "google.com",
-            "bing": "bing.com",
-            "duckduckgo": "duckduckgo.com",
-            "baidu": "baidu.com"
-        }
-        if engine in defaults:
-            payload["url"] = defaults[engine]
-
-    # 4. Pass through all other user-supplied parameters (e.g. type="shopping", google_domain="google.co.uk")
-    # so the SDK does not have to enumerate every engine-specific option
-    for k, v in kwargs.items():
-        if k not in ["num", "engine", "q", "text"]:  # Avoid overwriting
-            payload[k] = v
-
-    return payload
thordata_sdk-0.2.4.dist-info/METADATA
DELETED

@@ -1,113 +0,0 @@
-Metadata-Version: 2.1
-Name: thordata-sdk
-Version: 0.2.4
-Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
-Home-page: https://github.com/Thordata/thordata-python-sdk
-Author: Thordata Developer Team
-Author-email: support@thordata.com
-License: Apache License 2.0
-Project-URL: Bug Tracker, https://github.com/Thordata/thordata-python-sdk/issues
-Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Operating System :: OS Independent
-Classifier: Topic :: Internet :: WWW/HTTP
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: requests>=2.25.0
-Requires-Dist: aiohttp>=3.8.0
-
-# Thordata Python SDK
-
-<h4 align="center">
-The Official Python Client for the Thordata Proxy Network & Web Scraper API.
-<br>
-<i>High-performance, async-ready, designed for AI Agents and large-scale data collection.</i>
-</h4>
-
-<p align="center">
-<a href="https://pypi.org/project/thordata-sdk/"><img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version"></a>
-<a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License"></a>
-<a href="https://python.org"><img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions"></a>
-</p>
-
----
-
-## 🛠 Installation
-
-Install via pip:
-
-```bash
-pip install thordata-sdk
-```
-
-## ⚡ Quick Start
-
-### 1. Proxy Usage (Simple GET Request)
-
-**Python**
-
-```python
-from thordata_sdk import ThordataClient
-
-# Initialize with your credentials from the Thordata Dashboard
-client = ThordataClient(
-    scraper_token="YOUR_SCRAPER_TOKEN", # From "Scraping Tool Token"
-    public_token="YOUR_PUBLIC_TOKEN",   # From "Public API"
-    public_key="YOUR_PUBLIC_KEY"        # From "Public API"
-)
-
-# Send a request through the proxy
-response = client.get("http://httpbin.org/ip")
-print(response.json())
-```
-
-### 2. Real-time SERP Search
-
-**Python**
-
-```python
-results = client.serp_search("Thordata technology", engine="google")
-print(f"Results found: {len(results.get('organic', []))}")
-```
-
-### 3. Asynchronous Usage (High Concurrency)
-
-**Python**
-
-```python
-import asyncio
-from thordata_sdk import AsyncThordataClient
-
-async def main():
-    async with AsyncThordataClient(scraper_token="...", public_token="...", public_key="...") as client:
-        response = await client.get("http://httpbin.org/ip")
-        print(await response.json())
-
-asyncio.run(main())
-```
-
-## ⚙️ Features Status
-
-| Feature | Status | Description |
-|---------|--------|-------------|
-| Proxy Network | ✅ Stable | Synchronous & Asynchronous support via aiohttp. |
-| SERP API | ✅ Stable | Real-time Google/Bing/Yandex search results. |
-| Web Scraper | ✅ Stable | Async task management for scraping complex sites (e.g., YouTube). |
-| Authentication | ✅ Secure | Dual-token system for enhanced security. |
-
-## 📄 License
-
-This project is licensed under the Apache License 2.0.
-
-## 📞 Support
-
-For technical assistance, please contact support@thordata.com or verify your tokens in the Thordata Dashboard.
thordata_sdk-0.2.4.dist-info/RECORD
DELETED

@@ -1,10 +0,0 @@
-thordata_sdk/__init__.py,sha256=TpVRMWiWSkxq6MUoX1LCkfmuZTH9FWC65JbaALeVoVY,268
-thordata_sdk/async_client.py,sha256=YIIKddghCzGAvrx2Bqy8XkGcgFLbCPgzkQw-jcq2WH8,8612
-thordata_sdk/client.py,sha256=UyRLjRFKep2SLOWExjAJ5EB0ED0BUiBlfWGwts3sykw,10372
-thordata_sdk/enums.py,sha256=gKpaqV-_OO7w1LCg9PTuSUiJJq_q4ad5k6f88UlTPQw,639
-thordata_sdk/parameters.py,sha256=3ck0XP0lZaUYs4eEZoLLo6zDTClRRrLO9TlggesMmwI,1384
-thordata_sdk-0.2.4.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-thordata_sdk-0.2.4.dist-info/METADATA,sha256=mluyngNHvMXlRfAgA4F7JHC6Sc1f0z4cuut3CI42yow,3734
-thordata_sdk-0.2.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-thordata_sdk-0.2.4.dist-info/top_level.txt,sha256=0b2NvIf8zEcLXLF0alJAeurAEeB-2e9qh72bLukM6zI,13
-thordata_sdk-0.2.4.dist-info/RECORD,,
thordata_sdk-0.2.4.dist-info/top_level.txt
DELETED

@@ -1 +0,0 @@
-thordata_sdk

{thordata_sdk-0.2.4.dist-info → thordata_sdk-0.3.0.dist-info/licenses}/LICENSE
File without changes