thordata-sdk 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +130 -11
- thordata/_utils.py +126 -0
- thordata/async_client.py +672 -185
- thordata/client.py +809 -300
- thordata/enums.py +301 -11
- thordata/exceptions.py +344 -0
- thordata/models.py +725 -0
- thordata/parameters.py +7 -6
- thordata/retry.py +380 -0
- thordata_sdk-0.5.0.dist-info/METADATA +896 -0
- thordata_sdk-0.5.0.dist-info/RECORD +14 -0
- thordata_sdk-0.5.0.dist-info/licenses/LICENSE +21 -0
- thordata_sdk-0.3.1.dist-info/METADATA +0 -200
- thordata_sdk-0.3.1.dist-info/RECORD +0 -10
- thordata_sdk-0.3.1.dist-info/licenses/LICENSE +0 -201
- {thordata_sdk-0.3.1.dist-info → thordata_sdk-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.3.1.dist-info → thordata_sdk-0.5.0.dist-info}/top_level.txt +0 -0
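The new modules in this list (models.py, exceptions.py, retry.py, _utils.py) back a reworked client surface in 0.5.0. As an orientation aid, here is a minimal usage sketch assembled only from the new client.py shown in the diff below; the credential strings are placeholders, and RetryConfig() is constructed with defaults because its parameters live in the new retry.py, which this section does not show.

```python
from thordata import ThordataClient
from thordata.models import ProxyConfig
from thordata.retry import RetryConfig

# Context-manager support and the retry_config argument are new in 0.5.0
# (see __enter__/__exit__ and __init__ in the client.py diff below).
with ThordataClient(
    scraper_token="your_scraper_token",   # placeholder
    public_token="your_public_token",     # placeholder
    public_key="your_public_key",         # placeholder
    retry_config=RetryConfig(),           # defaults; options are defined in retry.py
) as client:
    # Proxy-network request with geo-targeting (ProxyConfig comes from models.py)
    config = ProxyConfig(username="myuser", password="mypass", country="us")
    response = client.get("https://httpbin.org/ip", proxy_config=config)
    print(response.json())

    # Real-time SERP search
    results = client.serp_search("python tutorial", engine="google")
```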
thordata/client.py
CHANGED
@@ -1,13 +1,57 @@
-
+"""
+Synchronous client for the Thordata API.
+
+This module provides the main ThordataClient class for interacting with
+Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
+
+Example:
+    >>> from thordata import ThordataClient
+    >>>
+    >>> client = ThordataClient(
+    ...     scraper_token="your_token",
+    ...     public_token="your_public_token",
+    ...     public_key="your_public_key"
+    ... )
+    >>>
+    >>> # Use the proxy network
+    >>> response = client.get("https://httpbin.org/ip")
+    >>> print(response.json())
+    >>>
+    >>> # Search with SERP API
+    >>> results = client.serp_search("python tutorial", engine="google")
+"""
+
+from __future__ import annotations
+
 import logging
-import
-
-
+from typing import Any, Dict, List, Optional, Union
+
+import os
+import requests
 
-from .
-
+from ._utils import (
+    build_auth_headers,
+    build_public_api_headers,
+    decode_base64_image,
+    extract_error_message,
+    parse_json_response,
+)
+from .enums import Engine, ProxyType
+from .exceptions import (
+    ThordataConfigError,
+    ThordataNetworkError,
+    ThordataTimeoutError,
+    raise_for_code,
+)
+from .models import (
+    ProxyConfig,
+    ProxyProduct,
+    ScraperTaskConfig,
+    SerpRequest,
+    UniversalScrapeRequest,
+)
+from .retry import RetryConfig, with_retry
 
-# Configure a library-specific logger to avoid interfering with user's logging
 logger = logging.getLogger(__name__)
 
 
@@ -16,471 +60,936 @@ class ThordataClient:
     The official synchronous Python client for Thordata.
 
     This client handles authentication and communication with:
-
-
-
-
+    - Proxy Network (Residential/Datacenter/Mobile/ISP via HTTP/HTTPS)
+    - SERP API (Real-time Search Engine Results)
+    - Universal Scraping API (Web Unlocker - Single Page Rendering)
+    - Web Scraper API (Async Task Management)
+
+    Args:
+        scraper_token: The API token from your Dashboard.
+        public_token: The public API token (for task status, locations).
+        public_key: The public API key.
+        proxy_host: Custom proxy gateway host (optional).
+        proxy_port: Custom proxy gateway port (optional).
+        timeout: Default request timeout in seconds (default: 30).
+        retry_config: Configuration for automatic retries (optional).
+
+    Example:
+        >>> client = ThordataClient(
+        ...     scraper_token="your_scraper_token",
+        ...     public_token="your_public_token",
+        ...     public_key="your_public_key"
+        ... )
     """
 
+    # API Endpoints
+    BASE_URL = "https://scraperapi.thordata.com"
+    UNIVERSAL_URL = "https://universalapi.thordata.com"
+    API_URL = "https://api.thordata.com/api/web-scraper-api"
+    LOCATIONS_URL = "https://api.thordata.com/api/locations"
+
     def __init__(
         self,
         scraper_token: str,
-        public_token: str,
-        public_key: str,
-        proxy_host: str = "
-        proxy_port: int =
-
-
-
+        public_token: Optional[str] = None,
+        public_key: Optional[str] = None,
+        proxy_host: str = "pr.thordata.net",
+        proxy_port: int = 9999,
+        timeout: int = 30,
+        retry_config: Optional[RetryConfig] = None,
+        scraperapi_base_url: Optional[str] = None,
+        universalapi_base_url: Optional[str] = None,
+        web_scraper_api_base_url: Optional[str] = None,
+        locations_base_url: Optional[str] = None,
+    ) -> None:
+        """Initialize the Thordata Client."""
+        if not scraper_token:
+            raise ThordataConfigError("scraper_token is required")
 
-        Args:
-            scraper_token (str): The secret token found at the bottom of the Dashboard.
-            public_token (str): The token from the Public API section.
-            public_key (str): The key from the Public API section.
-            proxy_host (str): The proxy gateway host (default: gate.thordata.com).
-            proxy_port (int): The proxy gateway port (default: 22225).
-        """
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
 
-        # Proxy
-        self.
-
+        # Proxy configuration
+        self._proxy_host = proxy_host
+        self._proxy_port = proxy_port
+        self._default_timeout = timeout
+
+        # Retry configuration
+        self._retry_config = retry_config or RetryConfig()
+
+        # Build default proxy URL (for basic usage)
+        self._default_proxy_url = (
+            f"http://td-customer-{self.scraper_token}:@{proxy_host}:{proxy_port}"
         )
 
-        #
-
-
-
-
-
-        self.
-        self.
-        self.
-
-
-
-        # Initialize Session with Proxy settings
-        self.session = requests.Session()
-        self.session.proxies = {
-            "http": self.proxy_url,
-            "https": self.proxy_url,
+        # Sessions:
+        # - _proxy_session: used for proxy network traffic to target sites
+        # - _api_session: used for Thordata APIs (SERP/Universal/Tasks/Locations)
+        #
+        # We intentionally do NOT set session-level proxies for _api_session,
+        # so developers can rely on system proxy settings (e.g., Clash) via env vars.
+        self._proxy_session = requests.Session()
+        self._proxy_session.trust_env = False
+        self._proxy_session.proxies = {
+            "http": self._default_proxy_url,
+            "https": self._default_proxy_url,
         }
 
-
+        self._api_session = requests.Session()
+        self._api_session.trust_env = True
+
+        # Base URLs (allow override via args or env vars for testing and custom routing)
+        scraperapi_base = (
+            scraperapi_base_url
+            or os.getenv("THORDATA_SCRAPERAPI_BASE_URL")
+            or self.BASE_URL
+        ).rstrip("/")
+
+        universalapi_base = (
+            universalapi_base_url
+            or os.getenv("THORDATA_UNIVERSALAPI_BASE_URL")
+            or self.UNIVERSAL_URL
+        ).rstrip("/")
+
+        web_scraper_api_base = (
+            web_scraper_api_base_url
+            or os.getenv("THORDATA_WEB_SCRAPER_API_BASE_URL")
+            or self.API_URL
+        ).rstrip("/")
+
+        locations_base = (
+            locations_base_url
+            or os.getenv("THORDATA_LOCATIONS_BASE_URL")
+            or self.LOCATIONS_URL
+        ).rstrip("/")
+
+        self._serp_url = f"{scraperapi_base}/request"
+        self._builder_url = f"{scraperapi_base}/builder"
+        self._universal_url = f"{universalapi_base}/request"
+        self._status_url = f"{web_scraper_api_base}/tasks-status"
+        self._download_url = f"{web_scraper_api_base}/tasks-download"
+        self._locations_base_url = locations_base
+
+    # =========================================================================
+    # Proxy Network Methods
+    # =========================================================================
+
+    def get(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        timeout: Optional[int] = None,
+        **kwargs: Any,
+    ) -> requests.Response:
+        """
+        Send a GET request through the Thordata Proxy Network.
+
+        Args:
+            url: The target URL.
+            proxy_config: Custom proxy configuration for geo-targeting/sessions.
+            timeout: Request timeout in seconds.
+            **kwargs: Additional arguments to pass to requests.get().
+
+        Returns:
+            The response object.
+
+        Example:
+            >>> # Basic request
+            >>> response = client.get("https://httpbin.org/ip")
+            >>>
+            >>> # With geo-targeting
+            >>> from thordata.models import ProxyConfig
+            >>> config = ProxyConfig(
+            ...     username="myuser",
+            ...     password="mypass",
+            ...     country="us",
+            ...     city="seattle"
+            ... )
+            >>> response = client.get("https://httpbin.org/ip", proxy_config=config)
+        """
+        logger.debug(f"Proxy GET request: {url}")
+
+        timeout = timeout or self._default_timeout
+
+        if proxy_config:
+            proxies = proxy_config.to_proxies_dict()
+            kwargs["proxies"] = proxies
+
+        return self._request_with_retry("GET", url, timeout=timeout, **kwargs)
+
+    def post(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        timeout: Optional[int] = None,
+        **kwargs: Any,
+    ) -> requests.Response:
+        """
+        Send a POST request through the Thordata Proxy Network.
+
+        Args:
+            url: The target URL.
+            proxy_config: Custom proxy configuration.
+            timeout: Request timeout in seconds.
+            **kwargs: Additional arguments to pass to requests.post().
+
+        Returns:
+            The response object.
+        """
+        logger.debug(f"Proxy POST request: {url}")
+
+        timeout = timeout or self._default_timeout
+
+        if proxy_config:
+            proxies = proxy_config.to_proxies_dict()
+            kwargs["proxies"] = proxies
+
+        return self._request_with_retry("POST", url, timeout=timeout, **kwargs)
+
+    def build_proxy_url(
+        self,
+        *,
+        country: Optional[str] = None,
+        state: Optional[str] = None,
+        city: Optional[str] = None,
+        session_id: Optional[str] = None,
+        session_duration: Optional[int] = None,
+        product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL,
+    ) -> str:
         """
-
+        Build a proxy URL with custom targeting options.
+
+        This is a convenience method for creating proxy URLs without
+        manually constructing a ProxyConfig.
 
         Args:
-
-
+            country: Target country code (e.g., 'us', 'gb').
+            state: Target state (e.g., 'california').
+            city: Target city (e.g., 'seattle').
+            session_id: Session ID for sticky sessions.
+            session_duration: Session duration in minutes (1-90).
+            product: Proxy product type.
 
         Returns:
-
+            The proxy URL string.
+
+        Example:
+            >>> url = client.build_proxy_url(country="us", city="seattle")
+            >>> proxies = {"http": url, "https": url}
+            >>> requests.get("https://example.com", proxies=proxies)
         """
-
-
-
+        config = ProxyConfig(
+            username=self.scraper_token,
+            password="",
+            host=self._proxy_host,
+            port=self._proxy_port,
+            product=product,
+            country=country,
+            state=state,
+            city=city,
+            session_id=session_id,
+            session_duration=session_duration,
+        )
+        return config.build_proxy_url()
+
+    # =========================================================================
+    # SERP API Methods
+    # =========================================================================
 
     def serp_search(
-        self,
-        query: str,
+        self,
+        query: str,
+        *,
         engine: Union[Engine, str] = Engine.GOOGLE,
-        num: int = 10,
-
+        num: int = 10,
+        country: Optional[str] = None,
+        language: Optional[str] = None,
+        search_type: Optional[str] = None,
+        device: Optional[str] = None,
+        render_js: Optional[bool] = None,
+        no_cache: Optional[bool] = None,
+        output_format: str = "json",
+        **kwargs: Any,
     ) -> Dict[str, Any]:
         """
         Execute a real-time SERP (Search Engine Results Page) search.
-
+
         Args:
-            query
-            engine
-            num
-
+            query: The search keywords.
+            engine: Search engine (google, bing, yandex, duckduckgo, baidu).
+            num: Number of results to retrieve (default: 10).
+            country: Country code for localized results (e.g., 'us').
+            language: Language code for interface (e.g., 'en').
+            search_type: Type of search (images, news, shopping, videos, etc.).
+            device: Device type ('desktop', 'mobile', 'tablet').
+            render_js: Enable JavaScript rendering in SERP (render_js=True).
+            no_cache: Disable internal caching (no_cache=True).
+            output_format: 'json' to return parsed JSON (default),
+                'html' to return HTML wrapped in {'html': ...}.
+            **kwargs: Additional engine-specific parameters.
 
         Returns:
-            Dict[str, Any]:
+            Dict[str, Any]: Parsed JSON results or a dict with 'html' key.
+
+        Example:
+            >>> # Basic search
+            >>> results = client.serp_search("python tutorial")
+            >>>
+            >>> # With options
+            >>> results = client.serp_search(
+            ...     "laptop reviews",
+            ...     engine="google",
+            ...     num=20,
+            ...     country="us",
+            ...     search_type="shopping",
+            ...     device="mobile",
+            ...     render_js=True,
+            ...     no_cache=True,
+            ... )
         """
-        #
+        # Normalize engine
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
 
-        #
-
+        # Build request using model
+        request = SerpRequest(
+            query=query,
+            engine=engine_str,
+            num=num,
+            country=country,
+            language=language,
+            search_type=search_type,
+            device=device,
+            render_js=render_js,
+            no_cache=no_cache,
+            output_format=output_format,
+            extra_params=kwargs,
+        )
 
-
-
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
 
         logger.info(f"SERP Search: {engine_str} - {query}")
+
         try:
-            response = self.
-                self.
+            response = self._api_session.post(
+                self._serp_url,
                 data=payload,
                 headers=headers,
-                timeout=60
+                timeout=60,
             )
             response.raise_for_status()
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # JSON mode (default)
+            if output_format.lower() == "json":
+                data = response.json()
+
+                if isinstance(data, dict):
+                    code = data.get("code")
+                    if code is not None and code != 200:
+                        msg = extract_error_message(data)
+                        raise_for_code(
+                            f"SERP API Error: {msg}",
+                            code=code,
+                            payload=data,
+                        )
+
+                return parse_json_response(data)
+
+            # HTML mode: wrap as dict to keep return type stable
+            return {"html": response.text}
+
+        except requests.Timeout as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e,
+            )
+        except requests.RequestException as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e,
+            )
+
+    def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
+        """
+        Execute a SERP search using a SerpRequest object.
+
+        This method provides full control over all search parameters.
+
+        Args:
+            request: A SerpRequest object with all parameters configured.
+
+        Returns:
+            Dict[str, Any]: Parsed JSON results or dict with 'html' key.
+
+        Example:
+            >>> from thordata.models import SerpRequest
+            >>> request = SerpRequest(
+            ...     query="python programming",
+            ...     engine="google",
+            ...     num=50,
+            ...     country="us",
+            ...     language="en",
+            ...     search_type="news",
+            ...     time_filter="week",
+            ...     safe_search=True
+            ... )
+            >>> results = client.serp_search_advanced(request)
+        """
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"SERP Advanced Search: {request.engine} - {request.query}")
+
+        try:
+            response = self._api_session.post(
+                self._serp_url,
+                data=payload,
+                headers=headers,
+                timeout=60,
+            )
+            response.raise_for_status()
+
+            if request.output_format.lower() == "json":
+                data = response.json()
+
+                if isinstance(data, dict):
+                    code = data.get("code")
+                    if code is not None and code != 200:
+                        msg = extract_error_message(data)
+                        raise_for_code(
+                            f"SERP API Error: {msg}",
+                            code=code,
+                            payload=data,
+                        )
+
+                return parse_json_response(data)
+
+            return {"html": response.text}
+
+        except requests.Timeout as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e,
+            )
+        except requests.RequestException as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e,
+            )
+
+    # =========================================================================
+    # Universal Scraping API (Web Unlocker) Methods
+    # =========================================================================
 
     def universal_scrape(
         self,
         url: str,
+        *,
         js_render: bool = False,
-        output_format: str = "
+        output_format: str = "html",
         country: Optional[str] = None,
-        block_resources:
+        block_resources: Optional[str] = None,
+        wait: Optional[int] = None,
+        wait_for: Optional[str] = None,
+        **kwargs: Any,
     ) -> Union[str, bytes]:
         """
-
-
+        Scrape a URL using the Universal Scraping API (Web Unlocker).
+
+        Automatically bypasses Cloudflare, CAPTCHAs, and antibot systems.
 
         Args:
-            url
-            js_render
-            output_format
-            country
-            block_resources
+            url: Target URL.
+            js_render: Enable JavaScript rendering (headless browser).
+            output_format: "html" or "png" (screenshot).
+            country: Geo-targeting country code.
+            block_resources: Resources to block (e.g., 'script,image').
+            wait: Wait time in milliseconds after page load.
+            wait_for: CSS selector to wait for.
+            **kwargs: Additional parameters.
 
         Returns:
-
+            HTML string or PNG bytes depending on output_format.
+
+        Example:
+            >>> # Get HTML
+            >>> html = client.universal_scrape("https://example.com", js_render=True)
+            >>>
+            >>> # Get screenshot
+            >>> png = client.universal_scrape(
+            ...     "https://example.com",
+            ...     js_render=True,
+            ...     output_format="png"
+            ... )
+            >>> with open("screenshot.png", "wb") as f:
+            ...     f.write(png)
         """
-
-
-
-
+        request = UniversalScrapeRequest(
+            url=url,
+            js_render=js_render,
+            output_format=output_format,
+            country=country,
+            block_resources=block_resources,
+            wait=wait,
+            wait_for=wait_for,
+            extra_params=kwargs,
+        )
 
-
-            "url": url,
-            "js_render": "True" if js_render else "False",
-            "type": output_format.lower(),
-            "block_resources": "True" if block_resources else "False"
-        }
-        if country:
-            payload["country"] = country
+        return self.universal_scrape_advanced(request)
 
-
+    def universal_scrape_advanced(
+        self, request: UniversalScrapeRequest
+    ) -> Union[str, bytes]:
+        """
+        Scrape using a UniversalScrapeRequest object for full control.
+
+        Args:
+            request: A UniversalScrapeRequest with all parameters.
+
+        Returns:
+            HTML string or PNG bytes.
+        """
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(
+            f"Universal Scrape: {request.url} (format: {request.output_format})"
+        )
 
         try:
-            response = self.
-                self.
+            response = self._api_session.post(
+                self._universal_url,
                 data=payload,
                 headers=headers,
-                timeout=60
+                timeout=60,
             )
             response.raise_for_status()
 
-
-            try:
-                resp_json = response.json()
-            except json.JSONDecodeError:
-                # Fallback: if the API returns raw content directly
-                if output_format.upper() == "PNG":
-                    return response.content
-                return response.text
-
-            # Check for API-level errors inside the JSON
-            if isinstance(resp_json, dict) and resp_json.get("code") \
-                    and resp_json.get("code") != 200:
-                raise Exception(f"Universal API Error: {resp_json}")
-
-            # Case 1: Return HTML
-            if "html" in resp_json:
-                return resp_json["html"]
-
-            # Case 2: Return PNG Image
-            if "png" in resp_json:
-                png_str = resp_json["png"]
-                if not png_str:
-                    raise Exception("API returned empty PNG data")
-
-                # Clean Data URI Scheme if present (e.g., data:image/png;base64,...)
-                if "," in png_str:
-                    png_str = png_str.split(",", 1)[1]
-
-                # Fix Base64 Padding
-                png_str = png_str.replace("\n", "").replace("\r", "")
-                missing_padding = len(png_str) % 4
-                if missing_padding:
-                    png_str += '=' * (4 - missing_padding)
-
-                return base64.b64decode(png_str)
-
-            # Fallback
-            return str(resp_json)
+            return self._process_universal_response(response, request.output_format)
 
-        except
-
-
+        except requests.Timeout as e:
+            raise ThordataTimeoutError(
+                f"Universal scrape timed out: {e}", original_error=e
+            )
+        except requests.RequestException as e:
+            raise ThordataNetworkError(
+                f"Universal scrape failed: {e}", original_error=e
+            )
+
+    def _process_universal_response(
+        self, response: requests.Response, output_format: str
+    ) -> Union[str, bytes]:
+        """Process the response from Universal API."""
+        # Try to parse as JSON
+        try:
+            resp_json = response.json()
+        except ValueError:
+            # Raw content returned
+            if output_format.lower() == "png":
+                return response.content
+            return response.text
+
+        # Check for API-level errors
+        if isinstance(resp_json, dict):
+            code = resp_json.get("code")
+            if code is not None and code != 200:
+                msg = extract_error_message(resp_json)
+                raise_for_code(
+                    f"Universal API Error: {msg}", code=code, payload=resp_json
+                )
+
+            # Extract HTML
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            # Extract PNG
+            if "png" in resp_json:
+                return decode_base64_image(resp_json["png"])
+
+        # Fallback
+        return str(resp_json)
+
+    # =========================================================================
+    # Web Scraper API (Task-based) Methods
+    # =========================================================================
 
     def create_scraper_task(
         self,
         file_name: str,
         spider_id: str,
         spider_name: str,
-
-        universal_params: Optional[Dict[str, Any]] = None
+        parameters: Dict[str, Any],
+        universal_params: Optional[Dict[str, Any]] = None,
     ) -> str:
         """
-        Create
-
-
-        from the Thordata Dashboard before calling this method.
+        Create an asynchronous Web Scraper task.
+
+        Note: Get spider_id and spider_name from the Thordata Dashboard.
 
         Args:
-            file_name
-            spider_id
-            spider_name
-
-            universal_params
+            file_name: Name for the output file.
+            spider_id: Spider identifier from Dashboard.
+            spider_name: Spider name (e.g., "youtube.com").
+            parameters: Spider-specific parameters.
+            universal_params: Global spider settings.
 
         Returns:
-
+            The created task_id.
+
+        Example:
+            >>> task_id = client.create_scraper_task(
+            ...     file_name="youtube_data",
+            ...     spider_id="youtube_video-post_by-url",
+            ...     spider_name="youtube.com",
+            ...     parameters={"url": "https://youtube.com/@channel/videos"}
+            ... )
         """
-
-
-
-
+        config = ScraperTaskConfig(
+            file_name=file_name,
+            spider_id=spider_id,
+            spider_name=spider_name,
+            parameters=parameters,
+            universal_params=universal_params,
+        )
 
-
-
-
-
-
-
-
-
-
-
+        return self.create_scraper_task_advanced(config)
+
+    def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
+        """
+        Create a scraper task using a ScraperTaskConfig object.
+
+        Args:
+            config: Task configuration.
+
+        Returns:
+            The created task_id.
+        """
+        payload = config.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"Creating Scraper Task: {config.spider_name}")
 
-        logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
        try:
-            response = self.
-                self.
+            response = self._api_session.post(
+                self._builder_url,
                 data=payload,
-                headers=headers
+                headers=headers,
+                timeout=30,
             )
             response.raise_for_status()
+
             data = response.json()
+            code = data.get("code")
+
+            if code != 200:
+                msg = extract_error_message(data)
+                raise_for_code(f"Task creation failed: {msg}", code=code, payload=data)
 
-            if data.get("code") != 200:
-                raise Exception(f"Creation failed: {data}")
             return data["data"]["task_id"]
-
-
-            raise
+
+        except requests.RequestException as e:
+            raise ThordataNetworkError(f"Task creation failed: {e}", original_error=e)
 
     def get_task_status(self, task_id: str) -> str:
         """
         Check the status of an asynchronous scraping task.
 
         Args:
-            task_id
+            task_id: The task ID from create_scraper_task.
 
         Returns:
-
+            Status string (e.g., "running", "ready", "failed").
         """
-
-
-
-        "
-
+        self._require_public_credentials()
+
+        headers = build_public_api_headers(
+            self.public_token or "", self.public_key or ""
+        )
         payload = {"tasks_ids": task_id}
 
         try:
-            response = self.
-                self.
+            response = self._api_session.post(
+                self._status_url,
                 data=payload,
-                headers=headers
+                headers=headers,
+                timeout=30,
            )
            response.raise_for_status()
+
            data = response.json()
 
            if data.get("code") == 200 and data.get("data"):
                for item in data["data"]:
                    if str(item.get("task_id")) == str(task_id):
-                        return item
-
+                        return item.get("status", "unknown")
+
+            return "unknown"
+
         except Exception as e:
-            logger.error(f"Status
-            return "
+            logger.error(f"Status check failed: {e}")
+            return "error"
 
     def get_task_result(self, task_id: str, file_type: str = "json") -> str:
         """
-
-
-        Args:
-            task_id (str): The task ID.
-            file_type (str): Format required (default "json").
-
-        Returns:
-            str: The URL to download the result file.
+        Get the download URL for a completed task.
         """
-
-
-
-        "
-
+        self._require_public_credentials()
+
+        headers = build_public_api_headers(
+            self.public_token or "", self.public_key or ""
+        )
         payload = {"tasks_id": task_id, "type": file_type}
 
         logger.info(f"Getting result URL for Task: {task_id}")
+
         try:
-            response = self.
-                self.
+            response = self._api_session.post(
+                self._download_url,
                 data=payload,
-                headers=headers
+                headers=headers,
+                timeout=30,
             )
             response.raise_for_status()
+
             data = response.json()
+            code = data.get("code")
 
-            if
+            if code == 200 and data.get("data"):
                 return data["data"]["download"]
-
-
-
-
-
-
+
+            msg = extract_error_message(data)
+            raise_for_code(f"Get result failed: {msg}", code=code, payload=data)
+            # This line won't be reached, but satisfies mypy
+            raise RuntimeError("Unexpected state")
+
+        except requests.RequestException as e:
+            raise ThordataNetworkError(f"Get result failed: {e}", original_error=e)
+
+    def wait_for_task(
+        self,
+        task_id: str,
+        *,
+        poll_interval: float = 5.0,
+        max_wait: float = 600.0,
+    ) -> str:
         """
-
+        Wait for a task to complete.
 
         Args:
-
-
+            task_id: The task ID to wait for.
+            poll_interval: Seconds between status checks.
+            max_wait: Maximum seconds to wait.
 
         Returns:
-
+            Final task status.
 
         Raises:
-
+            TimeoutError: If max_wait is exceeded.
+
+        Example:
+            >>> task_id = client.create_scraper_task(...)
+            >>> status = client.wait_for_task(task_id, max_wait=300)
+            >>> if status in ("ready", "success"):
+            ...     url = client.get_task_result(task_id)
         """
-
-        raise RuntimeError(
-            "Public API token/key are required for locations endpoints. "
-            "Please provide 'public_token' and 'public_key' when "
-            "initializing ThordataClient."
-        )
+        import time
 
-
-        logger.info("Locations API request: %s", url)
+        elapsed = 0.0
 
-
-
-            url,
-            params=params,
-            timeout=30,
-        )
-        response.raise_for_status()
+        while elapsed < max_wait:
+            status = self.get_task_status(task_id)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            logger.debug(f"Task {task_id} status: {status}")
+
+            terminal_statuses = {
+                "ready",
+                "success",
+                "finished",
+                "failed",
+                "error",
+                "cancelled",
+            }
+
+            if status.lower() in terminal_statuses:
+                return status
+
+            time.sleep(poll_interval)
+            elapsed += poll_interval
+
+        raise TimeoutError(f"Task {task_id} did not complete within {max_wait} seconds")
+
+    # =========================================================================
+    # Location API Methods
+    # =========================================================================
+
+    def list_countries(
+        self, proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+    ) -> List[Dict[str, Any]]:
         """
-        List supported countries for
+        List supported countries for proxies.
 
         Args:
-            proxy_type
+            proxy_type: 1 for residential, 2 for unlimited.
 
         Returns:
-            List
+            List of country records with 'country_code' and 'country_name'.
         """
-
-        "
-
-
-
-
+        return self._get_locations(
+            "countries",
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+        )
 
-    def list_states(
+    def list_states(
+        self,
+        country_code: str,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
+    ) -> List[Dict[str, Any]]:
         """
-        List supported states for a
+        List supported states for a country.
 
         Args:
-            country_code
-            proxy_type
+            country_code: Country code (e.g., 'US').
+            proxy_type: Proxy type.
 
         Returns:
-            List
+            List of state records.
         """
-
-        "
-
-
-
-
-
+        return self._get_locations(
+            "states",
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            country_code=country_code,
+        )
 
     def list_cities(
         self,
         country_code: str,
         state_code: Optional[str] = None,
-        proxy_type: int =
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
     ) -> List[Dict[str, Any]]:
         """
-        List supported cities for a
+        List supported cities for a country/state.
 
         Args:
-            country_code
-            state_code
-            proxy_type
+            country_code: Country code.
+            state_code: Optional state code.
+            proxy_type: Proxy type.
 
         Returns:
-            List
+            List of city records.
         """
-
-        "
-
-
+        kwargs = {
+            "proxy_type": (
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
             "country_code": country_code,
         }
         if state_code:
-
+            kwargs["state_code"] = state_code
 
-        return self._get_locations("cities",
+        return self._get_locations("cities", **kwargs)
 
     def list_asn(
         self,
         country_code: str,
-        proxy_type: int =
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL,
     ) -> List[Dict[str, Any]]:
         """
-        List supported ASNs for a
+        List supported ASNs for a country.
 
         Args:
-            country_code
-            proxy_type
+            country_code: Country code.
+            proxy_type: Proxy type.
 
         Returns:
-            List
+            List of ASN records.
         """
+        return self._get_locations(
+            "asn",
+            proxy_type=(
+                int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+            ),
+            country_code=country_code,
+        )
+
+    def _get_locations(self, endpoint: str, **kwargs: Any) -> List[Dict[str, Any]]:
+        """Internal method to call locations API."""
+        self._require_public_credentials()
+
         params = {
             "token": self.public_token,
             "key": self.public_key,
-            "proxy_type": str(proxy_type),
-            "country_code": country_code,
         }
-
+
+        for key, value in kwargs.items():
+            params[key] = str(value)
+
+        url = f"{self._locations_base_url}/{endpoint}"
+
+        logger.debug(f"Locations API request: {url}")
+
+        # Use requests.get directly (no proxy needed for this API)
+        response = self._api_session.get(url, params=params, timeout=30)
+        response.raise_for_status()
+
+        data = response.json()
+
+        if isinstance(data, dict):
+            code = data.get("code")
+            if code is not None and code != 200:
+                msg = data.get("msg", "")
+                raise RuntimeError(
+                    f"Locations API error ({endpoint}): code={code}, msg={msg}"
+                )
+            return data.get("data") or []
+
+        if isinstance(data, list):
+            return data
+
+        return []
+
+    # =========================================================================
+    # Helper Methods
+    # =========================================================================
+
+    def _require_public_credentials(self) -> None:
+        """Ensure public API credentials are available."""
+        if not self.public_token or not self.public_key:
+            raise ThordataConfigError(
+                "public_token and public_key are required for this operation. "
+                "Please provide them when initializing ThordataClient."
+            )
+
+    def _request_with_retry(
+        self, method: str, url: str, **kwargs: Any
+    ) -> requests.Response:
+        """Make a request with automatic retry."""
+        kwargs.setdefault("timeout", self._default_timeout)
+
+        @with_retry(self._retry_config)
+        def _do_request() -> requests.Response:
+            return self._proxy_session.request(method, url, **kwargs)
+
+        try:
+            return _do_request()
+        except requests.Timeout as e:
+            raise ThordataTimeoutError(f"Request timed out: {e}", original_error=e)
+        except requests.RequestException as e:
+            raise ThordataNetworkError(f"Request failed: {e}", original_error=e)
+
+    def close(self) -> None:
+        """Close the underlying session."""
+        self._proxy_session.close()
+        self._api_session.close()
+
+    def __enter__(self) -> ThordataClient:
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()