thordata-sdk 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +131 -9
- thordata/_utils.py +126 -0
- thordata/async_client.py +640 -191
- thordata/client.py +764 -314
- thordata/enums.py +275 -11
- thordata/exceptions.py +315 -0
- thordata/models.py +698 -0
- thordata/retry.py +382 -0
- thordata_sdk-0.4.0.dist-info/METADATA +678 -0
- thordata_sdk-0.4.0.dist-info/RECORD +14 -0
- thordata_sdk-0.4.0.dist-info/licenses/LICENSE +21 -0
- thordata_sdk-0.3.1.dist-info/METADATA +0 -200
- thordata_sdk-0.3.1.dist-info/RECORD +0 -10
- thordata_sdk-0.3.1.dist-info/licenses/LICENSE +0 -201
- {thordata_sdk-0.3.1.dist-info → thordata_sdk-0.4.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.3.1.dist-info → thordata_sdk-0.4.0.dist-info}/top_level.txt +0 -0
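A quick way to check which build is installed before reading the diff below — a minimal sketch using only the standard library (Python 3.8+); `thordata-sdk` is the distribution name from this page's title:

```python
# Sketch only: confirm the installed thordata-sdk build before comparing
# it against this 0.3.1 -> 0.4.0 diff.
from importlib.metadata import version

print(version("thordata-sdk"))  # "0.4.0" after upgrading
```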
thordata/client.py
CHANGED
@@ -1,13 +1,55 @@
-
-
-
-
-
+"""
+Synchronous client for the Thordata API.
+
+This module provides the main ThordataClient class for interacting with
+Thordata's proxy network, SERP API, Universal Scraping API, and Web Scraper API.
+
+Example:
+    >>> from thordata import ThordataClient
+    >>>
+    >>> client = ThordataClient(
+    ...     scraper_token="your_token",
+    ...     public_token="your_public_token",
+    ...     public_key="your_public_key"
+    ... )
+    >>>
+    >>> # Use the proxy network
+    >>> response = client.get("https://httpbin.org/ip")
+    >>> print(response.json())
+    >>>
+    >>> # Search with SERP API
+    >>> results = client.serp_search("python tutorial", engine="google")
+"""
+
+from __future__ import annotations

-
-
+import logging
+import requests
+from typing import Any, Dict, List, Optional, Union
+
+from .enums import Engine, ProxyType
+from .exceptions import (
+    ThordataConfigError,
+    ThordataNetworkError,
+    ThordataTimeoutError,
+    raise_for_code,
+)
+from .models import (
+    ProxyConfig,
+    ProxyProduct,
+    SerpRequest,
+    UniversalScrapeRequest,
+    ScraperTaskConfig,
+)
+from .retry import RetryConfig, with_retry
+from ._utils import (
+    parse_json_response,
+    decode_base64_image,
+    build_auth_headers,
+    build_public_api_headers,
+    extract_error_message,
+)

-# Configure a library-specific logger to avoid interfering with user's logging
 logger = logging.getLogger(__name__)

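A minimal sketch of the 0.4.0 constructor and proxy GET documented in the module docstring above. The token values are placeholders; `public_token`/`public_key` may be omitted unless the Public API (task status, locations) is used:

```python
# Sketch only: construct the 0.4.0 client and make a basic proxied request.
from thordata import ThordataClient

client = ThordataClient(
    scraper_token="your_scraper_token",
    public_token="your_public_token",  # optional in 0.4.0
    public_key="your_public_key",      # optional in 0.4.0
)

response = client.get("https://httpbin.org/ip")
print(response.json())
client.close()
```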
@@ -16,471 +58,879 @@ class ThordataClient:
     The official synchronous Python client for Thordata.

     This client handles authentication and communication with:
-
-
-
-
+    - Proxy Network (Residential/Datacenter/Mobile/ISP via HTTP/HTTPS)
+    - SERP API (Real-time Search Engine Results)
+    - Universal Scraping API (Web Unlocker - Single Page Rendering)
+    - Web Scraper API (Async Task Management)
+
+    Args:
+        scraper_token: The API token from your Dashboard.
+        public_token: The public API token (for task status, locations).
+        public_key: The public API key.
+        proxy_host: Custom proxy gateway host (optional).
+        proxy_port: Custom proxy gateway port (optional).
+        timeout: Default request timeout in seconds (default: 30).
+        retry_config: Configuration for automatic retries (optional).
+
+    Example:
+        >>> client = ThordataClient(
+        ...     scraper_token="your_scraper_token",
+        ...     public_token="your_public_token",
+        ...     public_key="your_public_key"
+        ... )
     """

+    # API Endpoints
+    BASE_URL = "https://scraperapi.thordata.com"
+    UNIVERSAL_URL = "https://universalapi.thordata.com"
+    API_URL = "https://api.thordata.com/api/web-scraper-api"
+    LOCATIONS_URL = "https://api.thordata.com/api/locations"
+
     def __init__(
         self,
         scraper_token: str,
-        public_token: str,
-        public_key: str,
-        proxy_host: str = "
-        proxy_port: int =
-
-
-
-
-
-
-
-            public_key (str): The key from the Public API section.
-            proxy_host (str): The proxy gateway host (default: gate.thordata.com).
-            proxy_port (int): The proxy gateway port (default: 22225).
-        """
+        public_token: Optional[str] = None,
+        public_key: Optional[str] = None,
+        proxy_host: str = "pr.thordata.net",
+        proxy_port: int = 9999,
+        timeout: int = 30,
+        retry_config: Optional[RetryConfig] = None,
+    ) -> None:
+        """Initialize the Thordata Client."""
+        if not scraper_token:
+            raise ThordataConfigError("scraper_token is required")
+
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
-
-        # Proxy
-        self.
-
+
+        # Proxy configuration
+        self._proxy_host = proxy_host
+        self._proxy_port = proxy_port
+        self._default_timeout = timeout
+
+        # Retry configuration
+        self._retry_config = retry_config or RetryConfig()
+
+        # Build default proxy URL (for basic usage)
+        self._default_proxy_url = (
+            f"http://td-customer-{self.scraper_token}:@{proxy_host}:{proxy_port}"
         )
-
-        #
-        self.
-        self.
-
-
-
-        self.SERP_API_URL = f"{self.base_url}/request"
-        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
-        self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
-        self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
-        self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
-
-        # Initialize Session with Proxy settings
-        self.session = requests.Session()
-        self.session.proxies = {
-            "http": self.proxy_url,
-            "https": self.proxy_url,
+
+        # Initialize session with default proxy settings
+        self._session = requests.Session()
+        self._session.proxies = {
+            "http": self._default_proxy_url,
+            "https": self._default_proxy_url,
         }
+
+        # Store endpoint URLs
+        self._serp_url = f"{self.BASE_URL}/request"
+        self._universal_url = f"{self.UNIVERSAL_URL}/request"
+        self._builder_url = f"{self.BASE_URL}/builder"
+        self._status_url = f"{self.API_URL}/tasks-status"
+        self._download_url = f"{self.API_URL}/tasks-download"
+
+    # =========================================================================
+    # Proxy Network Methods
+    # =========================================================================
+
+    def get(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        timeout: Optional[int] = None,
+        **kwargs: Any,
+    ) -> requests.Response:
+        """
+        Send a GET request through the Thordata Proxy Network.

-
+        Args:
+            url: The target URL.
+            proxy_config: Custom proxy configuration for geo-targeting/sessions.
+            timeout: Request timeout in seconds.
+            **kwargs: Additional arguments to pass to requests.get().
+
+        Returns:
+            The response object.
+
+        Example:
+            >>> # Basic request
+            >>> response = client.get("https://httpbin.org/ip")
+            >>>
+            >>> # With geo-targeting
+            >>> from thordata.models import ProxyConfig
+            >>> config = ProxyConfig(
+            ...     username="myuser",
+            ...     password="mypass",
+            ...     country="us",
+            ...     city="seattle"
+            ... )
+            >>> response = client.get("https://httpbin.org/ip", proxy_config=config)
         """
-
+        logger.debug(f"Proxy GET request: {url}")
+
+        timeout = timeout or self._default_timeout
+
+        if proxy_config:
+            proxies = proxy_config.to_proxies_dict()
+            kwargs["proxies"] = proxies
+
+        return self._request_with_retry("GET", url, timeout=timeout, **kwargs)
+
+    def post(
+        self,
+        url: str,
+        *,
+        proxy_config: Optional[ProxyConfig] = None,
+        timeout: Optional[int] = None,
+        **kwargs: Any,
+    ) -> requests.Response:
+        """
+        Send a POST request through the Thordata Proxy Network.

         Args:
-            url
-
+            url: The target URL.
+            proxy_config: Custom proxy configuration.
+            timeout: Request timeout in seconds.
+            **kwargs: Additional arguments to pass to requests.post().

         Returns:
-
+            The response object.
         """
-        logger.debug(f"Proxy
-
-
+        logger.debug(f"Proxy POST request: {url}")
+
+        timeout = timeout or self._default_timeout
+
+        if proxy_config:
+            proxies = proxy_config.to_proxies_dict()
+            kwargs["proxies"] = proxies
+
+        return self._request_with_retry("POST", url, timeout=timeout, **kwargs)
+
+    def build_proxy_url(
+        self,
+        *,
+        country: Optional[str] = None,
+        state: Optional[str] = None,
+        city: Optional[str] = None,
+        session_id: Optional[str] = None,
+        session_duration: Optional[int] = None,
+        product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL,
+    ) -> str:
+        """
+        Build a proxy URL with custom targeting options.
+
+        This is a convenience method for creating proxy URLs without
+        manually constructing a ProxyConfig.
+
+        Args:
+            country: Target country code (e.g., 'us', 'gb').
+            state: Target state (e.g., 'california').
+            city: Target city (e.g., 'seattle').
+            session_id: Session ID for sticky sessions.
+            session_duration: Session duration in minutes (1-90).
+            product: Proxy product type.
+
+        Returns:
+            The proxy URL string.
+
+        Example:
+            >>> url = client.build_proxy_url(country="us", city="seattle")
+            >>> proxies = {"http": url, "https": url}
+            >>> requests.get("https://example.com", proxies=proxies)
+        """
+        config = ProxyConfig(
+            username=self.scraper_token,
+            password="",
+            host=self._proxy_host,
+            port=self._proxy_port,
+            product=product,
+            country=country,
+            state=state,
+            city=city,
+            session_id=session_id,
+            session_duration=session_duration,
+        )
+        return config.build_proxy_url()
+
+    # =========================================================================
+    # SERP API Methods
+    # =========================================================================

     def serp_search(
-        self,
-        query: str,
+        self,
+        query: str,
+        *,
         engine: Union[Engine, str] = Engine.GOOGLE,
-        num: int = 10,
-
+        num: int = 10,
+        country: Optional[str] = None,
+        language: Optional[str] = None,
+        search_type: Optional[str] = None,
+        **kwargs: Any,
     ) -> Dict[str, Any]:
         """
         Execute a real-time SERP (Search Engine Results Page) search.

         Args:
-            query
-            engine
-            num
-
+            query: The search keywords.
+            engine: Search engine (google, bing, yandex, duckduckgo, baidu).
+            num: Number of results to retrieve (default: 10).
+            country: Country code for localized results (e.g., 'us').
+            language: Language code for interface (e.g., 'en').
+            search_type: Type of search (images, news, shopping, videos).
+            **kwargs: Additional engine-specific parameters.

         Returns:
-
+            Parsed JSON results from the search.
+
+        Example:
+            >>> # Basic search
+            >>> results = client.serp_search("python tutorial")
+            >>>
+            >>> # With options
+            >>> results = client.serp_search(
+            ...     "laptop reviews",
+            ...     engine="google",
+            ...     num=20,
+            ...     country="us",
+            ...     search_type="shopping"
+            ... )
         """
-        #
+        # Normalize engine
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
-
-        #
-
-
-
-
-
-
-
+
+        # Build request using model
+        request = SerpRequest(
+            query=query,
+            engine=engine_str,
+            num=num,
+            country=country,
+            language=language,
+            search_type=search_type,
+            extra_params=kwargs,
+        )
+
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
         logger.info(f"SERP Search: {engine_str} - {query}")
+
         try:
-            response = self.
-                self.
+            response = self._session.post(
+                self._serp_url,
                 data=payload,
                 headers=headers,
-                timeout=60
+                timeout=60,
             )
             response.raise_for_status()

             data = response.json()
-
-
-
-
-
-
-
-        except
-
-
+            return parse_json_response(data)
+
+        except requests.Timeout as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e
+            )
+        except requests.RequestException as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e
+            )
+
+    def serp_search_advanced(self, request: SerpRequest) -> Dict[str, Any]:
+        """
+        Execute a SERP search using a SerpRequest object.
+
+        This method provides full control over all search parameters.
+
+        Args:
+            request: A SerpRequest object with all parameters configured.
+
+        Returns:
+            Parsed JSON results.
+
+        Example:
+            >>> from thordata.models import SerpRequest
+            >>> request = SerpRequest(
+            ...     query="python programming",
+            ...     engine="google",
+            ...     num=50,
+            ...     country="us",
+            ...     language="en",
+            ...     search_type="news",
+            ...     time_filter="week",
+            ...     safe_search=True
+            ... )
+            >>> results = client.serp_search_advanced(request)
+        """
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"SERP Advanced Search: {request.engine} - {request.query}")
+
+        try:
+            response = self._session.post(
+                self._serp_url,
+                data=payload,
+                headers=headers,
+                timeout=60,
+            )
+            response.raise_for_status()
+
+            data = response.json()
+            return parse_json_response(data)
+
+        except requests.Timeout as e:
+            raise ThordataTimeoutError(
+                f"SERP request timed out: {e}",
+                original_error=e
+            )
+        except requests.RequestException as e:
+            raise ThordataNetworkError(
+                f"SERP request failed: {e}",
+                original_error=e
+            )
+
+    # =========================================================================
+    # Universal Scraping API (Web Unlocker) Methods
+    # =========================================================================

     def universal_scrape(
         self,
         url: str,
+        *,
         js_render: bool = False,
-        output_format: str = "
+        output_format: str = "html",
         country: Optional[str] = None,
-        block_resources:
+        block_resources: Optional[str] = None,
+        wait: Optional[int] = None,
+        wait_for: Optional[str] = None,
+        **kwargs: Any,
     ) -> Union[str, bytes]:
         """
-
-
+        Scrape a URL using the Universal Scraping API (Web Unlocker).
+
+        Automatically bypasses Cloudflare, CAPTCHAs, and antibot systems.

         Args:
-            url
-            js_render
-            output_format
-            country
-            block_resources
+            url: Target URL.
+            js_render: Enable JavaScript rendering (headless browser).
+            output_format: "html" or "png" (screenshot).
+            country: Geo-targeting country code.
+            block_resources: Resources to block (e.g., 'script,image').
+            wait: Wait time in milliseconds after page load.
+            wait_for: CSS selector to wait for.
+            **kwargs: Additional parameters.

         Returns:
-
+            HTML string or PNG bytes depending on output_format.
+
+        Example:
+            >>> # Get HTML
+            >>> html = client.universal_scrape("https://example.com", js_render=True)
+            >>>
+            >>> # Get screenshot
+            >>> png = client.universal_scrape(
+            ...     "https://example.com",
+            ...     js_render=True,
+            ...     output_format="png"
+            ... )
+            >>> with open("screenshot.png", "wb") as f:
+            ...     f.write(png)
         """
-
-
-
-
-
-
-
-
-
-
-
-
-            payload["country"] = country
-
-        logger.info(f"Universal Scrape: {url} (Format: {output_format})")
+        request = UniversalScrapeRequest(
+            url=url,
+            js_render=js_render,
+            output_format=output_format,
+            country=country,
+            block_resources=block_resources,
+            wait=wait,
+            wait_for=wait_for,
+            extra_params=kwargs,
+        )
+
+        return self.universal_scrape_advanced(request)

+    def universal_scrape_advanced(
+        self,
+        request: UniversalScrapeRequest
+    ) -> Union[str, bytes]:
+        """
+        Scrape using a UniversalScrapeRequest object for full control.
+
+        Args:
+            request: A UniversalScrapeRequest with all parameters.
+
+        Returns:
+            HTML string or PNG bytes.
+        """
+        payload = request.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"Universal Scrape: {request.url} (format: {request.output_format})")
+
         try:
-            response = self.
-                self.
+            response = self._session.post(
+                self._universal_url,
                 data=payload,
                 headers=headers,
-                timeout=60
+                timeout=60,
             )
             response.raise_for_status()
+
+            return self._process_universal_response(response, request.output_format)
+
+        except requests.Timeout as e:
+            raise ThordataTimeoutError(
+                f"Universal scrape timed out: {e}",
+                original_error=e
+            )
+        except requests.RequestException as e:
+            raise ThordataNetworkError(
+                f"Universal scrape failed: {e}",
+                original_error=e
+            )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # Fallback
-        return str(resp_json)
+    def _process_universal_response(
+        self,
+        response: requests.Response,
+        output_format: str
+    ) -> Union[str, bytes]:
+        """Process the response from Universal API."""
+        # Try to parse as JSON
+        try:
+            resp_json = response.json()
+        except ValueError:
+            # Raw content returned
+            if output_format.lower() == "png":
+                return response.content
+            return response.text
+
+        # Check for API-level errors
+        if isinstance(resp_json, dict):
+            code = resp_json.get("code")
+            if code is not None and code != 200:
+                msg = extract_error_message(resp_json)
+                raise_for_code(
+                    f"Universal API Error: {msg}",
+                    code=code,
+                    payload=resp_json
+                )
+
+            # Extract HTML
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            # Extract PNG
+            if "png" in resp_json:
+                return decode_base64_image(resp_json["png"])
+
+        # Fallback
+        return str(resp_json)

-
-
-
+    # =========================================================================
+    # Web Scraper API (Task-based) Methods
+    # =========================================================================

     def create_scraper_task(
         self,
         file_name: str,
         spider_id: str,
         spider_name: str,
-
-        universal_params: Optional[Dict[str, Any]] = None
+        parameters: Dict[str, Any],
+        universal_params: Optional[Dict[str, Any]] = None,
     ) -> str:
         """
-        Create
+        Create an asynchronous Web Scraper task.

-
-        from the Thordata Dashboard before calling this method.
+        Note: Get spider_id and spider_name from the Thordata Dashboard.

         Args:
-            file_name
-            spider_id
-            spider_name
-
-            universal_params
+            file_name: Name for the output file.
+            spider_id: Spider identifier from Dashboard.
+            spider_name: Spider name (e.g., "youtube.com").
+            parameters: Spider-specific parameters.
+            universal_params: Global spider settings.

         Returns:
-
+            The created task_id.
+
+        Example:
+            >>> task_id = client.create_scraper_task(
+            ...     file_name="youtube_data",
+            ...     spider_id="youtube_video-post_by-url",
+            ...     spider_name="youtube.com",
+            ...     parameters={"url": "https://youtube.com/@channel/videos"}
+            ... )
         """
-
-
-
-
-
-
-
-
-
-            "spider_parameters": json.dumps([individual_params]),
-            "spider_errors": "true",
-            "file_name": file_name
-        }
-        if universal_params:
-            payload["spider_universal"] = json.dumps(universal_params)
+        config = ScraperTaskConfig(
+            file_name=file_name,
+            spider_id=spider_id,
+            spider_name=spider_name,
+            parameters=parameters,
+            universal_params=universal_params,
+        )
+
+        return self.create_scraper_task_advanced(config)

-
+    def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
+        """
+        Create a scraper task using a ScraperTaskConfig object.
+
+        Args:
+            config: Task configuration.
+
+        Returns:
+            The created task_id.
+        """
+        payload = config.to_payload()
+        headers = build_auth_headers(self.scraper_token)
+
+        logger.info(f"Creating Scraper Task: {config.spider_name}")
+
         try:
-            response = self.
-                self.
+            response = self._session.post(
+                self._builder_url,
                 data=payload,
-                headers=headers
+                headers=headers,
+                timeout=30,
             )
             response.raise_for_status()
+
             data = response.json()
-
-
-
+            code = data.get("code")
+
+            if code != 200:
+                msg = extract_error_message(data)
+                raise_for_code(
+                    f"Task creation failed: {msg}",
+                    code=code,
+                    payload=data
+                )
+
             return data["data"]["task_id"]
-
-
-            raise
+
+        except requests.RequestException as e:
+            raise ThordataNetworkError(
+                f"Task creation failed: {e}",
+                original_error=e
+            )

     def get_task_status(self, task_id: str) -> str:
         """
         Check the status of an asynchronous scraping task.

         Args:
-            task_id
+            task_id: The task ID from create_scraper_task.

         Returns:
-
+            Status string (e.g., "running", "ready", "failed").
         """
-
-
-
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        self._require_public_credentials()
+
+        headers = build_public_api_headers(self.public_token, self.public_key)
         payload = {"tasks_ids": task_id}
-
+
         try:
-            response = self.
-                self.
+            response = self._session.post(
+                self._status_url,
                 data=payload,
-                headers=headers
+                headers=headers,
+                timeout=30,
             )
             response.raise_for_status()
+
             data = response.json()
-
+
             if data.get("code") == 200 and data.get("data"):
                 for item in data["data"]:
                     if str(item.get("task_id")) == str(task_id):
-                        return item
-
+                        return item.get("status", "unknown")
+
+            return "unknown"
+
         except Exception as e:
-            logger.error(f"Status
-            return "
+            logger.error(f"Status check failed: {e}")
+            return "error"

-    def get_task_result(
+    def get_task_result(
+        self,
+        task_id: str,
+        file_type: str = "json"
+    ) -> str:
         """
-
+        Get the download URL for a completed task.

         Args:
-            task_id
-            file_type
+            task_id: The task ID.
+            file_type: Output format ("json", "csv", "xlsx").

         Returns:
-
+            The download URL for the result file.
         """
-
-
-
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
+        self._require_public_credentials()
+
+        headers = build_public_api_headers(self.public_token, self.public_key)
         payload = {"tasks_id": task_id, "type": file_type}
-
+
         logger.info(f"Getting result URL for Task: {task_id}")
+
         try:
-            response = self.
-                self.
+            response = self._session.post(
+                self._download_url,
                 data=payload,
-                headers=headers
+                headers=headers,
+                timeout=30,
             )
             response.raise_for_status()
+
             data = response.json()
-
-
+            code = data.get("code")
+
+            if code == 200 and data.get("data"):
                 return data["data"]["download"]
-
-
-
-
-
-
-
-
+
+            msg = extract_error_message(data)
+            raise_for_code(
+                f"Get result failed: {msg}",
+                code=code,
+                payload=data
+            )
+
+        except requests.RequestException as e:
+            raise ThordataNetworkError(
+                f"Get result failed: {e}",
+                original_error=e
+            )

+    def wait_for_task(
+        self,
+        task_id: str,
+        *,
+        poll_interval: float = 5.0,
+        max_wait: float = 600.0,
+    ) -> str:
+        """
+        Wait for a task to complete.
+
         Args:
-
-
-
+            task_id: The task ID to wait for.
+            poll_interval: Seconds between status checks.
+            max_wait: Maximum seconds to wait.
+
         Returns:
-
-
+            Final task status.
+
         Raises:
-
+            TimeoutError: If max_wait is exceeded.
+
+        Example:
+            >>> task_id = client.create_scraper_task(...)
+            >>> status = client.wait_for_task(task_id, max_wait=300)
+            >>> if status in ("ready", "success"):
+            ...     url = client.get_task_result(task_id)
         """
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
+        import time
+
+        elapsed = 0.0
+
+        while elapsed < max_wait:
+            status = self.get_task_status(task_id)
+
+            logger.debug(f"Task {task_id} status: {status}")
+
+            terminal_statuses = {
+                "ready", "success", "finished",
+                "failed", "error", "cancelled"
+            }
+
+            if status.lower() in terminal_statuses:
+                return status
+
+            time.sleep(poll_interval)
+            elapsed += poll_interval
+
+        raise TimeoutError(
+            f"Task {task_id} did not complete within {max_wait} seconds"
         )
-        response.raise_for_status()

-
-
-
-
-
-
-
-
-            return data.get("data") or []
-        # Fallback: if backend ever returns a list directly
-        if isinstance(data, list):
-            return data
-        return []
-
-    def list_countries(self, proxy_type: int = 1) -> List[Dict[str, Any]]:
+    # =========================================================================
+    # Location API Methods
+    # =========================================================================
+
+    def list_countries(
+        self,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+    ) -> List[Dict[str, Any]]:
         """
-        List supported countries for
+        List supported countries for proxies.

         Args:
-            proxy_type
+            proxy_type: 1 for residential, 2 for unlimited.

         Returns:
-            List
+            List of country records with 'country_code' and 'country_name'.
         """
-
-            "
-
-
-        }
-        return self._get_locations("countries", params)
+        return self._get_locations(
+            "countries",
+            proxy_type=int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type
+        )

-    def list_states(
+    def list_states(
+        self,
+        country_code: str,
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
+    ) -> List[Dict[str, Any]]:
         """
-        List supported states for a
+        List supported states for a country.

         Args:
-            country_code
-            proxy_type
+            country_code: Country code (e.g., 'US').
+            proxy_type: Proxy type.

         Returns:
-            List
+            List of state records.
         """
-
-            "
-
-
-
-        }
-        return self._get_locations("states", params)
+        return self._get_locations(
+            "states",
+            proxy_type=int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type,
+            country_code=country_code
+        )

     def list_cities(
         self,
         country_code: str,
         state_code: Optional[str] = None,
-        proxy_type: int =
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
     ) -> List[Dict[str, Any]]:
         """
-        List supported cities for a
+        List supported cities for a country/state.

         Args:
-            country_code
-            state_code
-            proxy_type
+            country_code: Country code.
+            state_code: Optional state code.
+            proxy_type: Proxy type.

         Returns:
-            List
+            List of city records.
         """
-
-            "
-            "
-            "proxy_type": str(proxy_type),
-            "country_code": country_code,
+        kwargs = {
+            "proxy_type": int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type,
+            "country_code": country_code
         }
         if state_code:
-
-
-        return self._get_locations("cities",
+            kwargs["state_code"] = state_code
+
+        return self._get_locations("cities", **kwargs)

     def list_asn(
         self,
         country_code: str,
-        proxy_type: int =
+        proxy_type: Union[ProxyType, int] = ProxyType.RESIDENTIAL
     ) -> List[Dict[str, Any]]:
         """
-        List supported ASNs for a
+        List supported ASNs for a country.

         Args:
-            country_code
-            proxy_type
+            country_code: Country code.
+            proxy_type: Proxy type.

         Returns:
-            List
+            List of ASN records.
         """
+        return self._get_locations(
+            "asn",
+            proxy_type=int(proxy_type) if isinstance(proxy_type, ProxyType) else proxy_type,
+            country_code=country_code
+        )
+
+    def _get_locations(
+        self,
+        endpoint: str,
+        **kwargs: Any
+    ) -> List[Dict[str, Any]]:
+        """Internal method to call locations API."""
+        self._require_public_credentials()
+
         params = {
             "token": self.public_token,
             "key": self.public_key,
-            "proxy_type": str(proxy_type),
-            "country_code": country_code,
         }
-
+
+        for key, value in kwargs.items():
+            params[key] = str(value)
+
+        url = f"{self.LOCATIONS_URL}/{endpoint}"
+
+        logger.debug(f"Locations API request: {url}")
+
+        # Use requests.get directly (no proxy needed for this API)
+        response = requests.get(url, params=params, timeout=30)
+        response.raise_for_status()
+
+        data = response.json()
+
+        if isinstance(data, dict):
+            code = data.get("code")
+            if code is not None and code != 200:
+                msg = data.get("msg", "")
+                raise RuntimeError(
+                    f"Locations API error ({endpoint}): code={code}, msg={msg}"
+                )
+            return data.get("data") or []
+
+        if isinstance(data, list):
+            return data
+
+        return []
+
+    # =========================================================================
+    # Helper Methods
+    # =========================================================================
+
+    def _require_public_credentials(self) -> None:
+        """Ensure public API credentials are available."""
+        if not self.public_token or not self.public_key:
+            raise ThordataConfigError(
+                "public_token and public_key are required for this operation. "
+                "Please provide them when initializing ThordataClient."
+            )
+
+    def _request_with_retry(
+        self,
+        method: str,
+        url: str,
+        **kwargs: Any
+    ) -> requests.Response:
+        """Make a request with automatic retry."""
+        kwargs.setdefault("timeout", self._default_timeout)
+
+        @with_retry(self._retry_config)
+        def _do_request() -> requests.Response:
+            return self._session.request(method, url, **kwargs)
+
+        try:
+            return _do_request()
+        except requests.Timeout as e:
+            raise ThordataTimeoutError(
+                f"Request timed out: {e}",
+                original_error=e
+            )
+        except requests.RequestException as e:
+            raise ThordataNetworkError(
+                f"Request failed: {e}",
+                original_error=e
+            )
+
+    def close(self) -> None:
+        """Close the underlying session."""
+        self._session.close()
+
+    def __enter__(self) -> "ThordataClient":
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()
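A usage sketch for the reworked serp_search introduced in 0.4.0 and the new exception types it raises; the token and query values are placeholders, and the shape of the returned dict depends on the engine:

```python
# Sketch only: SERP search with the new keyword arguments and error types.
from thordata import ThordataClient
from thordata.exceptions import ThordataNetworkError, ThordataTimeoutError

client = ThordataClient(scraper_token="your_scraper_token")

try:
    results = client.serp_search(
        "laptop reviews",
        engine="google",
        num=20,
        country="us",
    )
    print(list(results))  # top-level keys vary by engine
except ThordataTimeoutError:
    print("SERP request timed out")
except ThordataNetworkError as exc:
    print(f"SERP request failed: {exc}")
```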
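A screenshot sketch for the Universal Scraping API wrapper added in 0.4.0; the target URL, token, and wait value are placeholders:

```python
# Sketch only: render a page and save the PNG screenshot returned by the API.
from thordata import ThordataClient

client = ThordataClient(scraper_token="your_scraper_token")

png_bytes = client.universal_scrape(
    "https://example.com",
    js_render=True,
    output_format="png",
    wait=2000,  # milliseconds after page load
)
with open("screenshot.png", "wb") as f:
    f.write(png_bytes)
```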
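A sketch of creating a Web Scraper task with the new keyword form; spider_id and spider_name come from the Thordata Dashboard, and every value below is a placeholder:

```python
# Sketch only: submit an asynchronous Web Scraper task (0.4.0 signature).
from thordata import ThordataClient

client = ThordataClient(
    scraper_token="your_scraper_token",
    public_token="your_public_token",
    public_key="your_public_key",
)

task_id = client.create_scraper_task(
    file_name="youtube_data",
    spider_id="youtube_video-post_by-url",
    spider_name="youtube.com",
    parameters={"url": "https://youtube.com/@channel/videos"},
)
print(task_id)
```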
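A polling sketch that pairs the new wait_for_task helper with get_task_result; the credentials and task_id are placeholders (the task_id is whatever create_scraper_task returned):

```python
# Sketch only: block until a task reaches a terminal status, then fetch the
# download URL for its result file.
from thordata import ThordataClient

client = ThordataClient(
    scraper_token="your_scraper_token",
    public_token="your_public_token",
    public_key="your_public_key",
)

task_id = "your_task_id"  # returned by create_scraper_task

status = client.wait_for_task(task_id, poll_interval=10.0, max_wait=300.0)
if status.lower() in ("ready", "success", "finished"):
    print(client.get_task_result(task_id, file_type="json"))
else:
    print(f"Task ended with status: {status}")
```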
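A sketch of the reworked location helpers, which now accept the ProxyType enum as their default; the credentials and country code are placeholders:

```python
# Sketch only: query supported proxy locations via the Public API methods.
from thordata import ThordataClient
from thordata.enums import ProxyType

client = ThordataClient(
    scraper_token="your_scraper_token",
    public_token="your_public_token",
    public_key="your_public_key",
)

countries = client.list_countries(proxy_type=ProxyType.RESIDENTIAL)
print(countries[:3])

cities = client.list_cities("US", proxy_type=ProxyType.RESIDENTIAL)
print(len(cities))
```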
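A context-manager sketch using the new __enter__/__exit__ support and the retry hook added in 0.4.0; RetryConfig() is left at its defaults because its individual options are not part of this file's diff, and the token is a placeholder:

```python
# Sketch only: the 0.4.0 client as a context manager with retry enabled.
from thordata import ThordataClient
from thordata.retry import RetryConfig

with ThordataClient(
    scraper_token="your_scraper_token",
    timeout=20,
    retry_config=RetryConfig(),
) as client:
    print(client.get("https://httpbin.org/ip").json())
# The underlying session is closed automatically on exit.
```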