thordata-sdk 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +33 -36
- thordata/_utils.py +21 -21
- thordata/async_client.py +230 -192
- thordata/client.py +281 -222
- thordata/enums.py +32 -6
- thordata/exceptions.py +60 -31
- thordata/models.py +173 -146
- thordata/parameters.py +7 -6
- thordata/retry.py +109 -111
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/METADATA +228 -10
- thordata_sdk-0.5.0.dist-info/RECORD +14 -0
- thordata_sdk-0.4.0.dist-info/RECORD +0 -14
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/top_level.txt +0 -0
thordata/models.py
CHANGED
@@ -7,16 +7,16 @@ IDE autocomplete and reduces parameter errors.

 Example:
     >>> from thordata.models import ProxyConfig, SerpRequest
-    >>>
+    >>>
     >>> # Build a proxy URL with geo-targeting
     >>> proxy = ProxyConfig(
     ...     username="myuser",
-    ...     password="mypass",
+    ...     password="mypass",
     ...     country="us",
     ...     city="seattle"
     ... )
     >>> print(proxy.build_proxy_url())
-
+
     >>> # Configure a SERP request
     >>> serp = SerpRequest(query="python tutorial", engine="google", num=20)
     >>> print(serp.to_payload())
@@ -24,29 +24,30 @@ Example:

 from __future__ import annotations

+import json
 import re
 import uuid
-import json
 from dataclasses import dataclass, field
-from typing import Optional, Dict, Any, List, Union
 from enum import Enum
-
+from typing import Any, Dict, List, Optional, Union

 # =============================================================================
 # Proxy Product Types
 # =============================================================================

+
 class ProxyProduct(str, Enum):
     """
     Thordata proxy product types with their default ports.
-
+
     Each product type has a specific port on the proxy gateway.
     """
+
     RESIDENTIAL = "residential"
     MOBILE = "mobile"
     DATACENTER = "datacenter"
     ISP = "isp"
-
+
     @property
     def default_port(self) -> int:
         """Get the default port for this proxy product."""
@@ -63,14 +64,15 @@ class ProxyProduct(str, Enum):
 # Proxy Configuration Models
 # =============================================================================

+
 @dataclass
 class ProxyConfig:
     """
     Configuration for building a Thordata proxy URL.
-
+
     This class handles the complex username format required by Thordata proxies,
     where geo-targeting and session parameters are embedded in the username.
-
+
     Args:
         username: Your Thordata account username (the part after 'td-customer-').
         password: Your Thordata account password.
@@ -78,18 +80,18 @@ class ProxyConfig:
         host: Proxy gateway host. If None, uses default based on product.
         port: Proxy gateway port. If None, uses default based on product.
         protocol: Proxy protocol - 'http' or 'https'.
-
+
         # Geo-targeting (all optional)
         continent: Target continent code (af/an/as/eu/na/oc/sa).
         country: Target country code in ISO 3166-1 alpha-2 format.
         state: Target state name in lowercase.
         city: Target city name in lowercase.
         asn: Target ASN code (e.g., 'AS12322'). Must be used with country.
-
+
         # Session control (optional)
         session_id: Session identifier for sticky sessions.
         session_duration: Session duration in minutes (1-90).
-
+
     Example:
         >>> config = ProxyConfig(
         ...     username="GnrqUwwu3obt",
@@ -103,45 +105,45 @@ class ProxyConfig:
         >>> print(config.build_proxy_url())
         http://td-customer-GnrqUwwu3obt-country-us-state-california-sessid-mysession123-sesstime-10:PkCSzvt30iww@....pr.thordata.net:9999
     """
-
+
     username: str
     password: str
     product: Union[ProxyProduct, str] = ProxyProduct.RESIDENTIAL
     host: Optional[str] = None
     port: Optional[int] = None
     protocol: str = "http"
-
+
     # Geo-targeting
     continent: Optional[str] = None
     country: Optional[str] = None
     state: Optional[str] = None
     city: Optional[str] = None
     asn: Optional[str] = None
-
+
     # Session control
     session_id: Optional[str] = None
    session_duration: Optional[int] = None  # minutes, 1-90
-
+
     # Valid continent codes
     VALID_CONTINENTS = {"af", "an", "as", "eu", "na", "oc", "sa"}
-
+
     def __post_init__(self) -> None:
         """Validate configuration after initialization."""
         # Normalize product to enum
         if isinstance(self.product, str):
             self.product = ProxyProduct(self.product.lower())
-
+
         # Set default host and port based on product
         if self.host is None:
             # Extract user prefix from username if available
             # Default to generic host
             self.host = "pr.thordata.net"
-
+
         if self.port is None:
             self.port = self.product.default_port
-
+
         self._validate()
-
+
     def _validate(self) -> None:
         """Validate the proxy configuration."""
         # Validate protocol
@@ -149,7 +151,7 @@ class ProxyConfig:
             raise ValueError(
                 f"Invalid protocol: {self.protocol}. Must be 'http' or 'https'."
             )
-
+
         # Validate session duration
         if self.session_duration is not None:
             if not 1 <= self.session_duration <= 90:
@@ -159,96 +161,96 @@ class ProxyConfig:
                 )
             if not self.session_id:
                 raise ValueError("session_duration requires session_id to be set")
-
+
         # Validate ASN requires country
         if self.asn and not self.country:
             raise ValueError("ASN targeting requires country to be specified")
-
+
         # Validate continent code
         if self.continent and self.continent.lower() not in self.VALID_CONTINENTS:
             raise ValueError(
                 f"Invalid continent code: {self.continent}. "
                 f"Must be one of: {', '.join(sorted(self.VALID_CONTINENTS))}"
             )
-
+
         # Validate country code format (2 letters)
         if self.country and not re.match(r"^[a-zA-Z]{2}$", self.country):
             raise ValueError(
                 f"Invalid country code: {self.country}. "
                 "Must be a 2-letter ISO 3166-1 alpha-2 code."
             )
-
+
     def build_username(self) -> str:
         """
         Build the complete username string with embedded parameters.
-
+
         Returns:
             The formatted username string for proxy authentication.
         """
         parts = [f"td-customer-{self.username}"]
-
+
         # Add geo-targeting parameters (order matters)
         if self.continent:
             parts.append(f"continent-{self.continent.lower()}")
-
+
         if self.country:
             parts.append(f"country-{self.country.lower()}")
-
+
         if self.state:
             parts.append(f"state-{self.state.lower()}")
-
+
         if self.city:
             parts.append(f"city-{self.city.lower()}")
-
+
         if self.asn:
             # Ensure ASN has correct format
             asn_value = self.asn.upper()
             if not asn_value.startswith("AS"):
                 asn_value = f"AS{asn_value}"
             parts.append(f"asn-{asn_value}")
-
+
         # Add session parameters
         if self.session_id:
             parts.append(f"sessid-{self.session_id}")
-
+
         if self.session_duration:
             parts.append(f"sesstime-{self.session_duration}")
-
+
         return "-".join(parts)
-
+
     def build_proxy_url(self) -> str:
         """
         Build the complete proxy URL.
-
+
         Returns:
             The formatted proxy URL for use with requests/aiohttp.
         """
         username = self.build_username()
         return f"{self.protocol}://{username}:{self.password}@{self.host}:{self.port}"
-
+
     def to_proxies_dict(self) -> Dict[str, str]:
         """
         Build a proxies dict suitable for the requests library.
-
+
         Returns:
             Dict with 'http' and 'https' keys pointing to the proxy URL.
         """
         url = self.build_proxy_url()
         return {"http": url, "https": url}
-
+
     def to_aiohttp_config(self) -> tuple:
         """
         Get proxy configuration for aiohttp.
-
+
         Returns:
             Tuple of (proxy_url, proxy_auth) for aiohttp.
         """
         try:
             import aiohttp
+
             proxy_url = f"{self.protocol}://{self.host}:{self.port}"
             proxy_auth = aiohttp.BasicAuth(
-                login=self.build_username(),
-                password=self.password
+                login=self.build_username(), password=self.password
             )
             return proxy_url, proxy_auth
         except ImportError:
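This hunk is mostly whitespace cleanup (plus the aiohttp.BasicAuth call collapsed onto one line), but it shows the full ProxyConfig surface: build_username() embeds geo and session parameters into the username, and build_proxy_url() / to_proxies_dict() wrap it for use with requests. A minimal usage sketch based only on fields and methods visible in this diff (credentials are placeholders):

    from thordata.models import ProxyConfig

    config = ProxyConfig(
        username="myuser",       # account username (the part after 'td-customer-')
        password="mypass",
        country="us",
        session_id="job42",
        session_duration=10,     # sticky IP for 10 minutes
    )
    # e.g. td-customer-myuser-country-us-sessid-job42-sesstime-10
    print(config.build_username())
    # {'http': <proxy url>, 'https': <proxy url>} for requests' proxies= argument
    print(config.to_proxies_dict())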
@@ -259,14 +261,14 @@ class ProxyConfig:
 class StickySession(ProxyConfig):
     """
     Convenience class for creating sticky session proxy configurations.
-
+
     A sticky session keeps the same IP address for a specified duration,
     useful for multi-step operations that require IP consistency.
-
+
     Args:
         duration_minutes: How long to keep the same IP (1-90 minutes).
         auto_session_id: If True, automatically generates a unique session ID.
-
+
     Example:
         >>> session = StickySession(
         ...     username="myuser",
@@ -277,18 +279,18 @@ class StickySession(ProxyConfig):
         >>> # Each call to build_proxy_url() uses the same session
         >>> url = session.build_proxy_url()
     """
-
+
     duration_minutes: int = 10
     auto_session_id: bool = True
-
+
     def __post_init__(self) -> None:
         # Auto-generate session ID if requested and not provided
         if self.auto_session_id and not self.session_id:
             self.session_id = uuid.uuid4().hex[:12]
-
+
         # Set session_duration from duration_minutes
         self.session_duration = self.duration_minutes
-
+
         # Call parent post_init
         super().__post_init__()

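StickySession only pre-fills ProxyConfig's session fields: __post_init__ generates a 12-character hex session_id via uuid.uuid4() when auto_session_id is set, copies duration_minutes into session_duration, then defers to the parent validation. A short sketch with the same placeholder credentials as above:

    from thordata.models import StickySession

    session = StickySession(
        username="myuser",
        password="mypass",
        country="us",
        duration_minutes=30,  # keep the same exit IP for up to 30 minutes
    )
    # The auto-generated session_id is stable, so every URL reuses one identity
    assert session.build_proxy_url() == session.build_proxy_url()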
@@ -297,86 +299,94 @@ class StickySession(ProxyConfig):
 # SERP API Models
 # =============================================================================

+
 @dataclass
 class SerpRequest:
     """
     Configuration for a SERP API request.
-
+
     Supports Google, Bing, Yandex, DuckDuckGo, and Baidu search engines.
-
+
     Args:
         query: The search query string (required).
         engine: Search engine to use (default: 'google').
         num: Number of results per page (default: 10).
         start: Result offset for pagination (default: 0).
-
+
         # Localization
         country: Country code for results (gl parameter for Google).
         language: Language code for interface (hl parameter for Google).
         google_domain: Google domain to use (e.g., 'google.co.uk').
-
+
         # Geo-targeting
         location: Location name for geo-targeting.
         uule: Encoded location parameter (use with location).
-
+
         # Search type
         search_type: Type of search (images, news, shopping, videos, etc.).
-
+
         # Filters
         safe_search: Enable safe search filtering.
         time_filter: Time range filter (hour, day, week, month, year).
-
+        no_autocorrect: Disable automatic spelling correction (nfpr).
+        filter_duplicates: Enable/disable duplicate filtering.
+
+        # Device & Rendering
+        device: Device type ('desktop', 'mobile', 'tablet').
+        render_js: Enable JavaScript rendering in SERP (render_js=True/False).
+        no_cache: Disable internal caching (no_cache=True/False).
+
+        # Output
+        output_format: 'json' (default) or 'html'.
+
         # Advanced
-
-
-
-    Example:
-        >>> req = SerpRequest(
-        ...     query="python programming",
-        ...     engine="google",
-        ...     num=20,
-        ...     country="us",
-        ...     language="en",
-        ...     search_type="news"
-        ... )
-        >>> payload = req.to_payload()
+        ludocid: Google Place ID.
+        kgmid: Google Knowledge Graph ID.
+
+        # Extra
+        extra_params: Additional parameters to pass through (ibp, lsig, si, uds, ...).
     """
-
+
     query: str
     engine: str = "google"
     num: int = 10
     start: int = 0
-
+
     # Localization
     country: Optional[str] = None  # 'gl' for Google
     language: Optional[str] = None  # 'hl' for Google
     google_domain: Optional[str] = None
     countries_filter: Optional[str] = None  # 'cr' parameter
     languages_filter: Optional[str] = None  # 'lr' parameter
-
+
     # Geo-targeting
     location: Optional[str] = None
     uule: Optional[str] = None  # Encoded location
-
+
     # Search type
-    search_type: Optional[str] = None  # tbm parameter (isch, shop, nws, vid)
-
+    search_type: Optional[str] = None  # tbm parameter (isch, shop, nws, vid, ...)
+
     # Filters
     safe_search: Optional[bool] = None
-    time_filter: Optional[str] = None  # tbs parameter
+    time_filter: Optional[str] = None  # tbs parameter (time part)
     no_autocorrect: bool = False  # nfpr parameter
-    filter_duplicates: Optional[bool] = None
-
-    #
-    device: Optional[str] = None
-
+    filter_duplicates: Optional[bool] = None  # filter parameter
+
+    # Device & Rendering
+    device: Optional[str] = None  # 'desktop', 'mobile', 'tablet'
+    render_js: Optional[bool] = None  # render_js parameter
+    no_cache: Optional[bool] = None  # no_cache parameter
+
+    # Output format
+    output_format: str = "json"  # 'json' or 'html'
+
     # Advanced Google parameters
     ludocid: Optional[str] = None  # Google Place ID
     kgmid: Optional[str] = None  # Knowledge Graph ID
-
+
     # Pass-through
     extra_params: Dict[str, Any] = field(default_factory=dict)
-
+
     # Search type mappings for tbm parameter
     SEARCH_TYPE_MAP = {
         "images": "isch",
@@ -389,7 +399,7 @@ class SerpRequest:
         "nws": "nws",
         "vid": "vid",
     }
-
+
     # Time filter mappings for tbs parameter
     TIME_FILTER_MAP = {
         "hour": "qdr:h",
@@ -398,7 +408,7 @@ class SerpRequest:
         "month": "qdr:m",
         "year": "qdr:y",
     }
-
+
     # Engine URL defaults
     ENGINE_URLS = {
         "google": "google.com",
@@ -407,93 +417,103 @@ class SerpRequest:
         "duckduckgo": "duckduckgo.com",
         "baidu": "baidu.com",
     }
-
+
     def to_payload(self) -> Dict[str, Any]:
         """
         Convert to API request payload.
-
+
         Returns:
             Dictionary ready to be sent to the SERP API.
         """
         engine = self.engine.lower()
-
+
         payload: Dict[str, Any] = {
             "engine": engine,
             "num": str(self.num),
-
+            # output_format: json=1 for JSON, json=0 for raw HTML
+            "json": "1" if self.output_format.lower() == "json" else "0",
         }
-
+
         # Handle query parameter (Yandex uses 'text', others use 'q')
         if engine == "yandex":
             payload["text"] = self.query
         else:
             payload["q"] = self.query
-
-        # Set URL based on google_domain or engine default
+
+        # Set URL / domain based on google_domain or engine default
         if self.google_domain:
+            # Explicitly set the google_domain parameter, and set url as well
+            payload["google_domain"] = self.google_domain
             payload["url"] = self.google_domain
         elif engine in self.ENGINE_URLS:
             payload["url"] = self.ENGINE_URLS[engine]
-
+
         # Pagination
         if self.start > 0:
             payload["start"] = str(self.start)
-
+
         # Localization
         if self.country:
             payload["gl"] = self.country.lower()
-
+
         if self.language:
             payload["hl"] = self.language.lower()
-
+
         if self.countries_filter:
             payload["cr"] = self.countries_filter
-
+
         if self.languages_filter:
             payload["lr"] = self.languages_filter
-
+
         # Geo-targeting
         if self.location:
             payload["location"] = self.location
-
+
         if self.uule:
             payload["uule"] = self.uule
-
-        # Search type
+
+        # Search type (tbm)
         if self.search_type:
             search_type_lower = self.search_type.lower()
             tbm_value = self.SEARCH_TYPE_MAP.get(search_type_lower, search_type_lower)
             payload["tbm"] = tbm_value
-
+
         # Filters
         if self.safe_search is not None:
             payload["safe"] = "active" if self.safe_search else "off"
-
+
         if self.time_filter:
             time_lower = self.time_filter.lower()
             tbs_value = self.TIME_FILTER_MAP.get(time_lower, time_lower)
             payload["tbs"] = tbs_value
-
+
         if self.no_autocorrect:
             payload["nfpr"] = "1"
-
+
         if self.filter_duplicates is not None:
             payload["filter"] = "1" if self.filter_duplicates else "0"
-
+
         # Device
         if self.device:
             payload["device"] = self.device.lower()
-
+
+        # Rendering & cache control
+        if self.render_js is not None:
+            payload["render_js"] = "True" if self.render_js else "False"
+
+        if self.no_cache is not None:
+            payload["no_cache"] = "True" if self.no_cache else "False"
+
         # Advanced Google parameters
         if self.ludocid:
             payload["ludocid"] = self.ludocid
-
+
         if self.kgmid:
             payload["kgmid"] = self.kgmid
-
-        # Extra parameters
+
+        # Extra parameters (ibp, lsig, si, uds, etc.)
         payload.update(self.extra_params)
-
+
         return payload


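The 0.5.0 additions to to_payload() all serialize as strings: render_js and no_cache become the literals "True"/"False" (matching the existing js_render convention in UniversalScrapeRequest), and output_format becomes json=1 or json=0. A sketch exercising just the new knobs, with assertions derived from the hunk above:

    from thordata.models import SerpRequest

    req = SerpRequest(
        query="python tutorial",
        engine="google",
        country="us",
        render_js=True,        # new in 0.5.0
        no_cache=True,         # new in 0.5.0
        output_format="html",  # new in 0.5.0: requests raw HTML via json=0
    )
    payload = req.to_payload()
    assert payload["render_js"] == "True"
    assert payload["no_cache"] == "True"
    assert payload["json"] == "0"
    assert payload["gl"] == "us"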
@@ -501,13 +521,14 @@ class SerpRequest:
 # Universal Scraper (Web Unlocker) Models
 # =============================================================================

+
 @dataclass
 class UniversalScrapeRequest:
     """
     Configuration for a Universal Scraping API (Web Unlocker) request.
-
+
     This API bypasses anti-bot protections like Cloudflare, CAPTCHAs, etc.
-
+
     Args:
         url: Target URL to scrape (required).
         js_render: Enable JavaScript rendering with headless browser.
@@ -520,7 +541,7 @@ class UniversalScrapeRequest:
         headers: Custom request headers as list of {name, value} dicts.
         cookies: Custom cookies as list of {name, value} dicts.
         extra_params: Additional parameters to pass through.
-
+
     Example:
         >>> req = UniversalScrapeRequest(
         ...     url="https://example.com",
@@ -532,7 +553,7 @@ class UniversalScrapeRequest:
         ... )
         >>> payload = req.to_payload()
     """
-
+
     url: str
     js_render: bool = False
     output_format: str = "html"  # 'html' or 'png'
@@ -544,7 +565,7 @@ class UniversalScrapeRequest:
     headers: Optional[List[Dict[str, str]]] = None  # [{"name": "...", "value": "..."}]
     cookies: Optional[List[Dict[str, str]]] = None  # [{"name": "...", "value": "..."}]
     extra_params: Dict[str, Any] = field(default_factory=dict)  # must use field() here
-
+
     def __post_init__(self) -> None:
         """Validate configuration."""
         valid_formats = {"html", "png"}
@@ -553,16 +574,16 @@ class UniversalScrapeRequest:
                 f"Invalid output_format: {self.output_format}. "
                 f"Must be one of: {', '.join(valid_formats)}"
             )
-
+
         if self.wait is not None and (self.wait < 0 or self.wait > 100000):
             raise ValueError(
                 f"wait must be between 0 and 100000 milliseconds, got {self.wait}"
             )
-
+
     def to_payload(self) -> Dict[str, Any]:
         """
         Convert to API request payload.
-
+
         Returns:
             Dictionary ready to be sent to the Universal API.
         """
@@ -571,30 +592,30 @@ class UniversalScrapeRequest:
             "js_render": "True" if self.js_render else "False",
             "type": self.output_format.lower(),
         }
-
+
         if self.country:
             payload["country"] = self.country.lower()
-
+
         if self.block_resources:
             payload["block_resources"] = self.block_resources
-
+
         if self.clean_content:
             payload["clean_content"] = self.clean_content
-
+
         if self.wait is not None:
             payload["wait"] = str(self.wait)
-
+
         if self.wait_for:
             payload["wait_for"] = self.wait_for
-
+
         if self.headers:
             payload["headers"] = json.dumps(self.headers)
-
+
         if self.cookies:
             payload["cookies"] = json.dumps(self.cookies)
-
+
         payload.update(self.extra_params)
-
+
         return payload


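Although this hunk is whitespace-only, it is the clearest view of the payload contract: headers and cookies are JSON-encoded strings (via json.dumps), not nested objects, and booleans use the "True"/"False" literals. A sketch using only fields shown in this diff:

    from thordata.models import UniversalScrapeRequest

    req = UniversalScrapeRequest(
        url="https://example.com",
        js_render=True,
        output_format="png",  # 'html' or 'png'
        wait=3000,            # milliseconds, validated to 0..100000
        headers=[{"name": "Accept-Language", "value": "en-US"}],
    )
    payload = req.to_payload()
    assert payload["js_render"] == "True"
    assert payload["type"] == "png"
    assert isinstance(payload["headers"], str)  # JSON string, not a list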
@@ -602,13 +623,14 @@ class UniversalScrapeRequest:
 # Web Scraper Task Models
 # =============================================================================

+
 @dataclass
 class ScraperTaskConfig:
     """
     Configuration for creating a Web Scraper API task.
-
+
     Note: You must get spider_id and spider_name from the Thordata Dashboard.
-
+
     Args:
         file_name: Name for the output file.
         spider_id: Spider identifier from Dashboard.
@@ -616,7 +638,7 @@ class ScraperTaskConfig:
         parameters: Spider-specific parameters.
         universal_params: Global spider settings.
         include_errors: Include error details in output.
-
+
     Example:
         >>> config = ScraperTaskConfig(
         ...     file_name="youtube_data",
@@ -629,18 +651,18 @@ class ScraperTaskConfig:
         ... )
         >>> payload = config.to_payload()
     """
-
+
     file_name: str
     spider_id: str
     spider_name: str
     parameters: Dict[str, Any]
     universal_params: Optional[Dict[str, Any]] = None
     include_errors: bool = True
-
+
     def to_payload(self) -> Dict[str, Any]:
         """
         Convert to API request payload.
-
+
         Returns:
             Dictionary ready to be sent to the Web Scraper API.
         """
@@ -651,10 +673,10 @@ class ScraperTaskConfig:
             "spider_parameters": json.dumps([self.parameters]),
             "spider_errors": "true" if self.include_errors else "false",
         }
-
+
         if self.universal_params:
             payload["spider_universal"] = json.dumps(self.universal_params)
-
+
         return payload


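Note the wrapping here: parameters is placed in a single-element list before JSON encoding ("spider_parameters": json.dumps([self.parameters])), while spider_universal is encoded directly. A sketch with hypothetical spider values (real spider_id / spider_name come from the Thordata Dashboard):

    from thordata.models import ScraperTaskConfig

    config = ScraperTaskConfig(
        file_name="youtube_data",
        spider_id="spider-123",  # placeholder - copy from Dashboard
        spider_name="youtube",   # placeholder - copy from Dashboard
        parameters={"url": "https://www.youtube.com/watch?v=..."},
    )
    payload = config.to_payload()
    assert payload["spider_parameters"].startswith("[")  # list-wrapped JSON
    assert payload["spider_errors"] == "true"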
@@ -662,37 +684,42 @@ class ScraperTaskConfig:
 # Response Models
 # =============================================================================

+
 @dataclass
 class TaskStatusResponse:
     """
     Response from task status check.
-
+
     Attributes:
         task_id: The task identifier.
         status: Current task status.
         progress: Optional progress percentage.
         message: Optional status message.
     """
-
+
     task_id: str
     status: str
     progress: Optional[int] = None
     message: Optional[str] = None
-
+
     def is_complete(self) -> bool:
         """Check if the task has completed (success or failure)."""
         terminal_statuses = {
-            "ready",
-            "
+            "ready",
+            "success",
+            "finished",
+            "failed",
+            "error",
+            "cancelled",
         }
         return self.status.lower() in terminal_statuses
-
+
     def is_success(self) -> bool:
         """Check if the task completed successfully."""
         success_statuses = {"ready", "success", "finished"}
         return self.status.lower() in success_statuses
-
+
     def is_failed(self) -> bool:
         """Check if the task failed."""
         failure_statuses = {"failed", "error"}
-        return self.status.lower() in failure_statuses
+        return self.status.lower() in failure_statuses