thordata-sdk 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +4 -40
- thordata/async_client.py +572 -1241
- thordata/async_unlimited.py +130 -0
- thordata/client.py +1184 -1309
- thordata/core/__init__.py +23 -0
- thordata/core/async_http_client.py +91 -0
- thordata/core/http_client.py +79 -0
- thordata/core/tunnel.py +287 -0
- thordata/demo.py +2 -2
- thordata/enums.py +41 -380
- thordata/models.py +37 -1193
- thordata/tools/__init__.py +28 -0
- thordata/tools/base.py +42 -0
- thordata/tools/code.py +26 -0
- thordata/tools/ecommerce.py +67 -0
- thordata/tools/search.py +73 -0
- thordata/tools/social.py +190 -0
- thordata/tools/video.py +81 -0
- thordata/types/__init__.py +77 -0
- thordata/types/common.py +141 -0
- thordata/types/proxy.py +340 -0
- thordata/types/serp.py +224 -0
- thordata/types/task.py +144 -0
- thordata/types/universal.py +66 -0
- thordata/unlimited.py +169 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/METADATA +74 -51
- thordata_sdk-1.5.0.dist-info/RECORD +35 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/WHEEL +1 -1
- thordata_sdk-1.3.0.dist-info/RECORD +0 -16
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/top_level.txt +0 -0
thordata/models.py
CHANGED
|
@@ -1,1197 +1,41 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Data models for the Thordata Python SDK.
|
|
3
|
-
|
|
4
|
-
This
|
|
5
|
-
SERP API calls, and Universal Scraping requests. Using these models enables
|
|
6
|
-
IDE autocomplete and reduces parameter errors.
|
|
7
|
-
|
|
8
|
-
Example:
|
|
9
|
-
>>> from thordata.models import ProxyConfig, SerpRequest
|
|
10
|
-
>>>
|
|
11
|
-
>>> # Build a proxy URL with geo-targeting
|
|
12
|
-
>>> proxy = ProxyConfig(
|
|
13
|
-
... username="myuser",
|
|
14
|
-
... password="mypass",
|
|
15
|
-
... country="us",
|
|
16
|
-
... city="seattle"
|
|
17
|
-
... )
|
|
18
|
-
>>> print(proxy.build_proxy_url())
|
|
19
|
-
|
|
20
|
-
>>> # Configure a SERP request
|
|
21
|
-
>>> serp = SerpRequest(query="python tutorial", engine="google", num=20)
|
|
22
|
-
>>> print(serp.to_payload())
|
|
3
|
+
Moved to thordata.types in v1.5.0.
|
|
4
|
+
This file is kept for backward compatibility.
|
|
23
5
|
"""
|
|
24
6
|
|
|
25
|
-
from
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
""
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
}
|
|
61
|
-
return ports[self.value]
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
# =============================================================================
|
|
65
|
-
# Proxy Configuration Models
|
|
66
|
-
# =============================================================================
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
@dataclass
|
|
70
|
-
class ProxyConfig:
|
|
71
|
-
"""
|
|
72
|
-
Configuration for building a Thordata proxy URL.
|
|
73
|
-
|
|
74
|
-
This class handles the complex username format required by Thordata proxies,
|
|
75
|
-
where geo-targeting and session parameters are embedded in the username.
|
|
76
|
-
|
|
77
|
-
Args:
|
|
78
|
-
username: Your Thordata account username (the part after 'td-customer-').
|
|
79
|
-
password: Your Thordata account password.
|
|
80
|
-
product: Proxy product type (residential, mobile, datacenter, isp).
|
|
81
|
-
host: Proxy gateway host. If None, uses default based on product.
|
|
82
|
-
port: Proxy gateway port. If None, uses default based on product.
|
|
83
|
-
protocol: Proxy protocol - 'http' or 'https'.
|
|
84
|
-
|
|
85
|
-
# Geo-targeting (all optional)
|
|
86
|
-
continent: Target continent code (af/an/as/eu/na/oc/sa).
|
|
87
|
-
country: Target country code in ISO 3166-1 alpha-2 format.
|
|
88
|
-
state: Target state name in lowercase.
|
|
89
|
-
city: Target city name in lowercase.
|
|
90
|
-
asn: Target ASN code (e.g., 'AS12322'). Must be used with country.
|
|
91
|
-
|
|
92
|
-
# Session control (optional)
|
|
93
|
-
session_id: Session identifier for sticky sessions.
|
|
94
|
-
session_duration: Session duration in minutes (1-90).
|
|
95
|
-
|
|
96
|
-
Example:
|
|
97
|
-
>>> config = ProxyConfig(
|
|
98
|
-
... username="GnrqUwwu3obt",
|
|
99
|
-
... password="PkCSzvt30iww",
|
|
100
|
-
... product=ProxyProduct.RESIDENTIAL,
|
|
101
|
-
... country="us",
|
|
102
|
-
... state="california",
|
|
103
|
-
... session_id="mysession123",
|
|
104
|
-
... session_duration=10
|
|
105
|
-
... )
|
|
106
|
-
>>> print(config.build_proxy_url())
|
|
107
|
-
http://td-customer-GnrqUwwu3obt-country-us-state-california-sessid-mysession123-sesstime-10:PkCSzvt30iww@....pr.thordata.net:9999
|
|
108
|
-
"""
|
|
109
|
-
|
|
110
|
-
username: str
|
|
111
|
-
password: str
|
|
112
|
-
product: ProxyProduct | str = ProxyProduct.RESIDENTIAL
|
|
113
|
-
host: str | None = None
|
|
114
|
-
port: int | None = None
|
|
115
|
-
protocol: str = "https"
|
|
116
|
-
|
|
117
|
-
# Geo-targeting
|
|
118
|
-
continent: str | None = None
|
|
119
|
-
country: str | None = None
|
|
120
|
-
state: str | None = None
|
|
121
|
-
city: str | None = None
|
|
122
|
-
asn: str | None = None
|
|
123
|
-
|
|
124
|
-
# Session control
|
|
125
|
-
session_id: str | None = None
|
|
126
|
-
session_duration: int | None = None # minutes, 1-90
|
|
127
|
-
|
|
128
|
-
# Valid continent codes
|
|
129
|
-
VALID_CONTINENTS = {"af", "an", "as", "eu", "na", "oc", "sa"}
|
|
130
|
-
|
|
131
|
-
def __post_init__(self) -> None:
|
|
132
|
-
"""Validate configuration after initialization."""
|
|
133
|
-
# Normalize product to enum
|
|
134
|
-
if isinstance(self.product, str):
|
|
135
|
-
self.product = ProxyProduct(self.product.lower())
|
|
136
|
-
|
|
137
|
-
# Set default host and port based on product
|
|
138
|
-
if self.host is None:
|
|
139
|
-
# Set host based on product type
|
|
140
|
-
host_map = {
|
|
141
|
-
# User&Pass auth entry (docs examples use t.pr.thordata.net for authenticated proxy)
|
|
142
|
-
ProxyProduct.RESIDENTIAL: "t.pr.thordata.net",
|
|
143
|
-
ProxyProduct.DATACENTER: "dc.pr.thordata.net",
|
|
144
|
-
ProxyProduct.MOBILE: "m.pr.thordata.net",
|
|
145
|
-
ProxyProduct.ISP: "isp.pr.thordata.net",
|
|
146
|
-
}
|
|
147
|
-
self.host = host_map.get(self.product, "pr.thordata.net")
|
|
148
|
-
|
|
149
|
-
if self.port is None:
|
|
150
|
-
self.port = self.product.default_port
|
|
151
|
-
|
|
152
|
-
self._validate()
|
|
153
|
-
|
|
154
|
-
def _validate(self) -> None:
|
|
155
|
-
"""Validate the proxy configuration."""
|
|
156
|
-
# Validate protocol
|
|
157
|
-
if self.protocol not in ("http", "https", "socks5", "socks5h"):
|
|
158
|
-
raise ValueError(
|
|
159
|
-
f"Invalid protocol: {self.protocol}. Must be 'http', 'https', 'socks5', or 'socks5h'."
|
|
160
|
-
)
|
|
161
|
-
|
|
162
|
-
# Validate session duration
|
|
163
|
-
if self.session_duration is not None:
|
|
164
|
-
if not 1 <= self.session_duration <= 90:
|
|
165
|
-
raise ValueError(
|
|
166
|
-
f"session_duration must be between 1 and 90 minutes, "
|
|
167
|
-
f"got {self.session_duration}"
|
|
168
|
-
)
|
|
169
|
-
if not self.session_id:
|
|
170
|
-
raise ValueError("session_duration requires session_id to be set")
|
|
171
|
-
|
|
172
|
-
# Validate ASN requires country
|
|
173
|
-
if self.asn and not self.country:
|
|
174
|
-
raise ValueError("ASN targeting requires country to be specified")
|
|
175
|
-
|
|
176
|
-
# Validate continent code
|
|
177
|
-
if self.continent and self.continent.lower() not in self.VALID_CONTINENTS:
|
|
178
|
-
raise ValueError(
|
|
179
|
-
f"Invalid continent code: {self.continent}. "
|
|
180
|
-
f"Must be one of: {', '.join(sorted(self.VALID_CONTINENTS))}"
|
|
181
|
-
)
|
|
182
|
-
|
|
183
|
-
# Validate country code format (2 letters)
|
|
184
|
-
if self.country and not re.match(r"^[a-zA-Z]{2}$", self.country):
|
|
185
|
-
raise ValueError(
|
|
186
|
-
f"Invalid country code: {self.country}. "
|
|
187
|
-
"Must be a 2-letter ISO 3166-1 alpha-2 code."
|
|
188
|
-
)
|
|
189
|
-
|
|
190
|
-
def build_username(self) -> str:
|
|
191
|
-
"""
|
|
192
|
-
Build the complete username string with embedded parameters.
|
|
193
|
-
|
|
194
|
-
Returns:
|
|
195
|
-
The formatted username string for proxy authentication.
|
|
196
|
-
"""
|
|
197
|
-
parts = [f"td-customer-{self.username}"]
|
|
198
|
-
|
|
199
|
-
# Add geo-targeting parameters (order matters)
|
|
200
|
-
if self.continent:
|
|
201
|
-
parts.append(f"continent-{self.continent.lower()}")
|
|
202
|
-
|
|
203
|
-
if self.country:
|
|
204
|
-
parts.append(f"country-{self.country.lower()}")
|
|
205
|
-
|
|
206
|
-
if self.state:
|
|
207
|
-
parts.append(f"state-{self.state.lower()}")
|
|
208
|
-
|
|
209
|
-
if self.city:
|
|
210
|
-
parts.append(f"city-{self.city.lower()}")
|
|
211
|
-
|
|
212
|
-
if self.asn:
|
|
213
|
-
# Ensure ASN has correct format
|
|
214
|
-
asn_value = self.asn.upper()
|
|
215
|
-
if not asn_value.startswith("AS"):
|
|
216
|
-
asn_value = f"AS{asn_value}"
|
|
217
|
-
parts.append(f"asn-{asn_value}")
|
|
218
|
-
|
|
219
|
-
# Add session parameters
|
|
220
|
-
if self.session_id:
|
|
221
|
-
parts.append(f"sessid-{self.session_id}")
|
|
222
|
-
|
|
223
|
-
if self.session_duration:
|
|
224
|
-
parts.append(f"sesstime-{self.session_duration}")
|
|
225
|
-
|
|
226
|
-
return "-".join(parts)
|
|
227
|
-
|
|
228
|
-
def build_proxy_url(self) -> str:
|
|
229
|
-
username = self.build_username()
|
|
230
|
-
|
|
231
|
-
proto = self.protocol
|
|
232
|
-
if proto == "socks5":
|
|
233
|
-
proto = "socks5h"
|
|
234
|
-
|
|
235
|
-
# IMPORTANT: SOCKS URLs must URL-encode credentials, otherwise special chars
|
|
236
|
-
# like @ : / ? # will break parsing and often show up as timeouts.
|
|
237
|
-
if proto.startswith("socks"):
|
|
238
|
-
username_enc = quote(username, safe="")
|
|
239
|
-
password_enc = quote(self.password, safe="")
|
|
240
|
-
return f"{proto}://{username_enc}:{password_enc}@{self.host}:{self.port}"
|
|
241
|
-
|
|
242
|
-
return f"{proto}://{username}:{self.password}@{self.host}:{self.port}"
|
|
243
|
-
|
|
244
|
-
def build_proxy_endpoint(self) -> str:
|
|
245
|
-
proto = self.protocol
|
|
246
|
-
if proto == "socks5":
|
|
247
|
-
proto = "socks5h"
|
|
248
|
-
return f"{self.protocol}://{self.host}:{self.port}"
|
|
249
|
-
|
|
250
|
-
def build_proxy_basic_auth(self) -> str:
|
|
251
|
-
"""Basic auth string 'username:password' for Proxy-Authorization."""
|
|
252
|
-
return f"{self.build_username()}:{self.password}"
|
|
253
|
-
|
|
254
|
-
def to_proxies_dict(self) -> dict[str, str]:
|
|
255
|
-
"""
|
|
256
|
-
Build a proxies dict suitable for the requests library.
|
|
257
|
-
|
|
258
|
-
Returns:
|
|
259
|
-
Dict with 'http' and 'https' keys pointing to the proxy URL.
|
|
260
|
-
"""
|
|
261
|
-
url = self.build_proxy_url()
|
|
262
|
-
return {"http": url, "https": url}
|
|
263
|
-
|
|
264
|
-
def to_aiohttp_config(self) -> tuple:
|
|
265
|
-
"""
|
|
266
|
-
Get proxy configuration for aiohttp.
|
|
267
|
-
|
|
268
|
-
Returns:
|
|
269
|
-
Tuple of (proxy_url, proxy_auth) for aiohttp.
|
|
270
|
-
"""
|
|
271
|
-
try:
|
|
272
|
-
import aiohttp
|
|
273
|
-
|
|
274
|
-
proxy_url = f"{self.protocol}://{self.host}:{self.port}"
|
|
275
|
-
proxy_auth = aiohttp.BasicAuth(
|
|
276
|
-
login=self.build_username(), password=self.password
|
|
277
|
-
)
|
|
278
|
-
return proxy_url, proxy_auth
|
|
279
|
-
except ImportError as e:
|
|
280
|
-
raise ImportError(
|
|
281
|
-
"aiohttp is required for async proxy configuration"
|
|
282
|
-
) from e
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
@dataclass
|
|
286
|
-
class WhitelistProxyConfig:
|
|
287
|
-
"""
|
|
288
|
-
Proxy config for IP-whitelist authentication mode (no username/password).
|
|
289
|
-
|
|
290
|
-
In whitelist mode, you do NOT pass proxy auth.
|
|
291
|
-
You only connect to the proxy entry node (host:port).
|
|
292
|
-
|
|
293
|
-
Examples (from docs):
|
|
294
|
-
- Global random: pr.thordata.net:9999
|
|
295
|
-
- Country nodes: us-pr.thordata.net:10000, etc.
|
|
296
|
-
"""
|
|
297
|
-
|
|
298
|
-
host: str = "pr.thordata.net"
|
|
299
|
-
port: int = 9999
|
|
300
|
-
protocol: str = "https" # use http for proxy scheme; target URL can still be https
|
|
301
|
-
|
|
302
|
-
def __post_init__(self) -> None:
|
|
303
|
-
if self.protocol not in ("http", "https"):
|
|
304
|
-
raise ValueError("protocol must be 'http' or 'https'")
|
|
305
|
-
|
|
306
|
-
def build_proxy_url(self) -> str:
|
|
307
|
-
return f"{self.protocol}://{self.host}:{self.port}"
|
|
308
|
-
|
|
309
|
-
def to_proxies_dict(self) -> dict[str, str]:
|
|
310
|
-
url = self.build_proxy_url()
|
|
311
|
-
return {"http": url, "https": url}
|
|
312
|
-
|
|
313
|
-
def to_aiohttp_config(self) -> tuple:
|
|
314
|
-
# aiohttp: proxy_auth should be None in whitelist mode
|
|
315
|
-
return self.build_proxy_url(), None
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
@dataclass
|
|
319
|
-
class StaticISPProxy:
|
|
320
|
-
"""
|
|
321
|
-
Configuration for static ISP proxy with direct IP connection.
|
|
322
|
-
|
|
323
|
-
Static ISP proxies connect directly to a purchased IP address,
|
|
324
|
-
not through the gateway.
|
|
325
|
-
|
|
326
|
-
Args:
|
|
327
|
-
host: The static IP address you purchased.
|
|
328
|
-
username: Your ISP proxy username.
|
|
329
|
-
password: Your ISP proxy password.
|
|
330
|
-
port: Port number (default: 6666).
|
|
331
|
-
protocol: Proxy protocol - 'http' or 'https'.
|
|
332
|
-
|
|
333
|
-
Example:
|
|
334
|
-
>>> proxy = StaticISPProxy(
|
|
335
|
-
... host="xx.xxx.xxx.xxx",
|
|
336
|
-
... username="myuser",
|
|
337
|
-
... password="mypass"
|
|
338
|
-
... )
|
|
339
|
-
>>> print(proxy.build_proxy_url())
|
|
340
|
-
http://myuser:mypass@xx.xxx.xxx.xxx:6666
|
|
341
|
-
"""
|
|
342
|
-
|
|
343
|
-
host: str
|
|
344
|
-
username: str
|
|
345
|
-
password: str
|
|
346
|
-
port: int = 6666
|
|
347
|
-
protocol: str = "https"
|
|
348
|
-
|
|
349
|
-
def __post_init__(self) -> None:
|
|
350
|
-
"""Validate configuration."""
|
|
351
|
-
if self.protocol not in ("http", "https", "socks5", "socks5h"):
|
|
352
|
-
raise ValueError(
|
|
353
|
-
f"Invalid protocol: {self.protocol}. Must be 'http', 'https', 'socks5', or 'socks5h'."
|
|
354
|
-
)
|
|
355
|
-
|
|
356
|
-
def build_proxy_url(self) -> str:
|
|
357
|
-
"""
|
|
358
|
-
Build the complete proxy URL for direct connection.
|
|
359
|
-
|
|
360
|
-
Returns:
|
|
361
|
-
The formatted proxy URL.
|
|
362
|
-
"""
|
|
363
|
-
proto = self.protocol
|
|
364
|
-
if proto == "socks5":
|
|
365
|
-
proto = "socks5h"
|
|
366
|
-
|
|
367
|
-
if proto.startswith("socks"):
|
|
368
|
-
u = quote(self.username, safe="")
|
|
369
|
-
p = quote(self.password, safe="")
|
|
370
|
-
return f"{proto}://{u}:{p}@{self.host}:{self.port}"
|
|
371
|
-
|
|
372
|
-
return f"{proto}://{self.username}:{self.password}@{self.host}:{self.port}"
|
|
373
|
-
|
|
374
|
-
def to_proxies_dict(self) -> dict[str, str]:
|
|
375
|
-
"""
|
|
376
|
-
Build a proxies dict suitable for the requests library.
|
|
377
|
-
|
|
378
|
-
Returns:
|
|
379
|
-
Dict with 'http' and 'https' keys pointing to the proxy URL.
|
|
380
|
-
"""
|
|
381
|
-
url = self.build_proxy_url()
|
|
382
|
-
return {"http": url, "https": url}
|
|
383
|
-
|
|
384
|
-
def to_aiohttp_config(self) -> tuple:
|
|
385
|
-
"""
|
|
386
|
-
Get proxy configuration for aiohttp.
|
|
387
|
-
|
|
388
|
-
Returns:
|
|
389
|
-
Tuple of (proxy_url, proxy_auth) for aiohttp.
|
|
390
|
-
"""
|
|
391
|
-
try:
|
|
392
|
-
import aiohttp
|
|
393
|
-
|
|
394
|
-
proxy_url = f"{self.protocol}://{self.host}:{self.port}"
|
|
395
|
-
proxy_auth = aiohttp.BasicAuth(login=self.username, password=self.password)
|
|
396
|
-
return proxy_url, proxy_auth
|
|
397
|
-
except ImportError as e:
|
|
398
|
-
raise ImportError(
|
|
399
|
-
"aiohttp is required for async proxy configuration"
|
|
400
|
-
) from e
|
|
401
|
-
|
|
402
|
-
@classmethod
|
|
403
|
-
def from_env(cls) -> StaticISPProxy:
|
|
404
|
-
"""
|
|
405
|
-
Create StaticISPProxy from environment variables.
|
|
406
|
-
|
|
407
|
-
Required env vars:
|
|
408
|
-
- THORDATA_ISP_HOST
|
|
409
|
-
- THORDATA_ISP_USERNAME
|
|
410
|
-
- THORDATA_ISP_PASSWORD
|
|
411
|
-
|
|
412
|
-
Returns:
|
|
413
|
-
Configured StaticISPProxy instance.
|
|
414
|
-
|
|
415
|
-
Raises:
|
|
416
|
-
ValueError: If required environment variables are missing.
|
|
417
|
-
"""
|
|
418
|
-
import os
|
|
419
|
-
|
|
420
|
-
host = os.getenv("THORDATA_ISP_HOST")
|
|
421
|
-
username = os.getenv("THORDATA_ISP_USERNAME")
|
|
422
|
-
password = os.getenv("THORDATA_ISP_PASSWORD")
|
|
423
|
-
|
|
424
|
-
if not all([host, username, password]):
|
|
425
|
-
raise ValueError(
|
|
426
|
-
"THORDATA_ISP_HOST, THORDATA_ISP_USERNAME, and "
|
|
427
|
-
"THORDATA_ISP_PASSWORD are required"
|
|
428
|
-
)
|
|
429
|
-
|
|
430
|
-
return cls(host=host, username=username, password=password)
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
@dataclass
|
|
434
|
-
class StickySession(ProxyConfig):
|
|
435
|
-
"""
|
|
436
|
-
Convenience class for creating sticky session proxy configurations.
|
|
437
|
-
|
|
438
|
-
A sticky session keeps the same IP address for a specified duration,
|
|
439
|
-
useful for multi-step operations that require IP consistency.
|
|
440
|
-
|
|
441
|
-
Args:
|
|
442
|
-
duration_minutes: How long to keep the same IP (1-90 minutes).
|
|
443
|
-
auto_session_id: If True, automatically generates a unique session ID.
|
|
444
|
-
|
|
445
|
-
Example:
|
|
446
|
-
>>> session = StickySession(
|
|
447
|
-
... username="myuser",
|
|
448
|
-
... password="mypass",
|
|
449
|
-
... country="us",
|
|
450
|
-
... duration_minutes=15
|
|
451
|
-
... )
|
|
452
|
-
>>> # Each call to build_proxy_url() uses the same session
|
|
453
|
-
>>> url = session.build_proxy_url()
|
|
454
|
-
"""
|
|
455
|
-
|
|
456
|
-
duration_minutes: int = 10
|
|
457
|
-
auto_session_id: bool = True
|
|
458
|
-
|
|
459
|
-
def __post_init__(self) -> None:
|
|
460
|
-
# Auto-generate session ID if requested and not provided
|
|
461
|
-
if self.auto_session_id and not self.session_id:
|
|
462
|
-
self.session_id = uuid.uuid4().hex[:12]
|
|
463
|
-
|
|
464
|
-
# Set session_duration from duration_minutes
|
|
465
|
-
self.session_duration = self.duration_minutes
|
|
466
|
-
|
|
467
|
-
# Call parent post_init
|
|
468
|
-
super().__post_init__()
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
# =============================================================================
|
|
472
|
-
# SERP API Models
|
|
473
|
-
# =============================================================================
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
@dataclass
|
|
477
|
-
class SerpRequest:
|
|
478
|
-
"""
|
|
479
|
-
Configuration for a SERP API request.
|
|
480
|
-
|
|
481
|
-
Supports Google, Bing, Yandex, DuckDuckGo, and Baidu search engines.
|
|
482
|
-
|
|
483
|
-
Args:
|
|
484
|
-
query: The search query string (required).
|
|
485
|
-
engine: Search engine to use (default: 'google').
|
|
486
|
-
num: Number of results per page (default: 10).
|
|
487
|
-
start: Result offset for pagination (default: 0).
|
|
488
|
-
|
|
489
|
-
# Localization
|
|
490
|
-
country: Country code for results (gl parameter for Google).
|
|
491
|
-
language: Language code for interface (hl parameter for Google).
|
|
492
|
-
google_domain: Google domain to use (e.g., 'google.co.uk').
|
|
493
|
-
|
|
494
|
-
# Geo-targeting
|
|
495
|
-
location: Location name for geo-targeting.
|
|
496
|
-
uule: Encoded location parameter (use with location).
|
|
497
|
-
|
|
498
|
-
# Search type
|
|
499
|
-
search_type: Type of search (images, news, shopping, videos, etc.).
|
|
500
|
-
|
|
501
|
-
# Filters
|
|
502
|
-
safe_search: Enable safe search filtering.
|
|
503
|
-
time_filter: Time range filter (hour, day, week, month, year).
|
|
504
|
-
no_autocorrect: Disable automatic spelling correction (nfpr).
|
|
505
|
-
filter_duplicates: Enable/disable duplicate filtering.
|
|
506
|
-
|
|
507
|
-
# Device & Rendering
|
|
508
|
-
device: Device type ('desktop', 'mobile', 'tablet').
|
|
509
|
-
render_js: Enable JavaScript rendering in SERP (render_js=True/False).
|
|
510
|
-
no_cache: Disable internal caching (no_cache=True/False).
|
|
511
|
-
|
|
512
|
-
# Output
|
|
513
|
-
output_format: 'json' (default) or 'html'.
|
|
514
|
-
|
|
515
|
-
# Advanced
|
|
516
|
-
ludocid: Google Place ID.
|
|
517
|
-
kgmid: Google Knowledge Graph ID.
|
|
518
|
-
|
|
519
|
-
# Extra
|
|
520
|
-
extra_params: Additional parameters to pass through (ibp, lsig, si, uds, ...).
|
|
521
|
-
"""
|
|
522
|
-
|
|
523
|
-
query: str
|
|
524
|
-
engine: str = "google"
|
|
525
|
-
num: int = 10
|
|
526
|
-
start: int = 0
|
|
527
|
-
|
|
528
|
-
# Localization
|
|
529
|
-
country: str | None = None # 'gl' for Google
|
|
530
|
-
language: str | None = None # 'hl' for Google
|
|
531
|
-
google_domain: str | None = None
|
|
532
|
-
countries_filter: str | None = None # 'cr' parameter
|
|
533
|
-
languages_filter: str | None = None # 'lr' parameter
|
|
534
|
-
|
|
535
|
-
# Geo-targeting
|
|
536
|
-
location: str | None = None
|
|
537
|
-
uule: str | None = None # Encoded location
|
|
538
|
-
|
|
539
|
-
# Search type
|
|
540
|
-
search_type: str | None = None # tbm parameter (isch, shop, nws, vid, ...)
|
|
541
|
-
|
|
542
|
-
# Filters
|
|
543
|
-
safe_search: bool | None = None
|
|
544
|
-
time_filter: str | None = None # tbs parameter (time part)
|
|
545
|
-
no_autocorrect: bool = False # nfpr parameter
|
|
546
|
-
filter_duplicates: bool | None = None # filter parameter
|
|
547
|
-
|
|
548
|
-
# Device & Rendering
|
|
549
|
-
device: str | None = None # 'desktop', 'mobile', 'tablet'
|
|
550
|
-
render_js: bool | None = None # render_js parameter
|
|
551
|
-
no_cache: bool | None = None # no_cache parameter
|
|
552
|
-
|
|
553
|
-
# Output format
|
|
554
|
-
output_format: str = "json" # 'json' or 'html'
|
|
555
|
-
|
|
556
|
-
# Advanced Google parameters
|
|
557
|
-
ludocid: str | None = None # Google Place ID
|
|
558
|
-
kgmid: str | None = None # Knowledge Graph ID
|
|
559
|
-
|
|
560
|
-
# Pass-through
|
|
561
|
-
extra_params: dict[str, Any] = field(default_factory=dict)
|
|
562
|
-
|
|
563
|
-
# Search type mappings for tbm parameter
|
|
564
|
-
SEARCH_TYPE_MAP = {
|
|
565
|
-
"images": "isch",
|
|
566
|
-
"shopping": "shop",
|
|
567
|
-
"news": "nws",
|
|
568
|
-
"videos": "vid",
|
|
569
|
-
# Direct values also work
|
|
570
|
-
"isch": "isch",
|
|
571
|
-
"shop": "shop",
|
|
572
|
-
"nws": "nws",
|
|
573
|
-
"vid": "vid",
|
|
574
|
-
}
|
|
575
|
-
|
|
576
|
-
# Time filter mappings for tbs parameter
|
|
577
|
-
TIME_FILTER_MAP = {
|
|
578
|
-
"hour": "qdr:h",
|
|
579
|
-
"day": "qdr:d",
|
|
580
|
-
"week": "qdr:w",
|
|
581
|
-
"month": "qdr:m",
|
|
582
|
-
"year": "qdr:y",
|
|
583
|
-
}
|
|
584
|
-
|
|
585
|
-
# Engine URL defaults
|
|
586
|
-
ENGINE_URLS = {
|
|
587
|
-
"google": "google.com",
|
|
588
|
-
"bing": "bing.com",
|
|
589
|
-
"yandex": "yandex.com",
|
|
590
|
-
"duckduckgo": "duckduckgo.com",
|
|
591
|
-
"baidu": "baidu.com",
|
|
592
|
-
}
|
|
593
|
-
|
|
594
|
-
def to_payload(self) -> dict[str, Any]:
|
|
595
|
-
"""
|
|
596
|
-
Convert to API request payload.
|
|
597
|
-
|
|
598
|
-
Returns:
|
|
599
|
-
Dictionary ready to be sent to the SERP API.
|
|
600
|
-
"""
|
|
601
|
-
engine = self.engine.lower()
|
|
602
|
-
|
|
603
|
-
payload: dict[str, Any] = {
|
|
604
|
-
"engine": engine,
|
|
605
|
-
"num": str(self.num),
|
|
606
|
-
}
|
|
607
|
-
|
|
608
|
-
fmt = self.output_format.lower()
|
|
609
|
-
if fmt == "json":
|
|
610
|
-
payload["json"] = "1"
|
|
611
|
-
elif fmt == "html":
|
|
612
|
-
# omit "json" to get raw HTML (per docs: no json -> HTML)
|
|
613
|
-
pass
|
|
614
|
-
else:
|
|
615
|
-
# keep backward compatibility: if user passes "2"/"both"/etc.
|
|
616
|
-
if fmt in ("2", "both", "json+html", "json_html"):
|
|
617
|
-
payload["json"] = "2"
|
|
618
|
-
|
|
619
|
-
# Handle query parameter (Yandex uses 'text', others use 'q')
|
|
620
|
-
if engine == "yandex":
|
|
621
|
-
payload["text"] = self.query
|
|
622
|
-
else:
|
|
623
|
-
payload["q"] = self.query
|
|
624
|
-
|
|
625
|
-
# Domain overrides (preferred by docs)
|
|
626
|
-
if self.google_domain:
|
|
627
|
-
payload["google_domain"] = self.google_domain
|
|
628
|
-
|
|
629
|
-
# Pagination
|
|
630
|
-
if self.start > 0:
|
|
631
|
-
payload["start"] = str(self.start)
|
|
632
|
-
|
|
633
|
-
# Localization
|
|
634
|
-
if self.country:
|
|
635
|
-
payload["gl"] = self.country.lower()
|
|
636
|
-
|
|
637
|
-
if self.language:
|
|
638
|
-
payload["hl"] = self.language.lower()
|
|
639
|
-
|
|
640
|
-
if self.countries_filter:
|
|
641
|
-
payload["cr"] = self.countries_filter
|
|
642
|
-
|
|
643
|
-
if self.languages_filter:
|
|
644
|
-
payload["lr"] = self.languages_filter
|
|
645
|
-
|
|
646
|
-
# Geo-targeting
|
|
647
|
-
if self.location:
|
|
648
|
-
payload["location"] = self.location
|
|
649
|
-
|
|
650
|
-
if self.uule:
|
|
651
|
-
payload["uule"] = self.uule
|
|
652
|
-
|
|
653
|
-
# Search type (tbm)
|
|
654
|
-
if self.search_type:
|
|
655
|
-
search_type_lower = self.search_type.lower()
|
|
656
|
-
tbm_value = self.SEARCH_TYPE_MAP.get(search_type_lower, search_type_lower)
|
|
657
|
-
payload["tbm"] = tbm_value
|
|
658
|
-
|
|
659
|
-
# Filters
|
|
660
|
-
if self.safe_search is not None:
|
|
661
|
-
payload["safe"] = "active" if self.safe_search else "off"
|
|
662
|
-
|
|
663
|
-
if self.time_filter:
|
|
664
|
-
time_lower = self.time_filter.lower()
|
|
665
|
-
tbs_value = self.TIME_FILTER_MAP.get(time_lower, time_lower)
|
|
666
|
-
payload["tbs"] = tbs_value
|
|
667
|
-
|
|
668
|
-
if self.no_autocorrect:
|
|
669
|
-
payload["nfpr"] = "1"
|
|
670
|
-
|
|
671
|
-
if self.filter_duplicates is not None:
|
|
672
|
-
payload["filter"] = "1" if self.filter_duplicates else "0"
|
|
673
|
-
|
|
674
|
-
# Device
|
|
675
|
-
if self.device:
|
|
676
|
-
payload["device"] = self.device.lower()
|
|
677
|
-
|
|
678
|
-
# Rendering & cache control
|
|
679
|
-
if self.render_js is not None:
|
|
680
|
-
payload["render_js"] = "True" if self.render_js else "False"
|
|
681
|
-
|
|
682
|
-
if self.no_cache is not None:
|
|
683
|
-
payload["no_cache"] = "True" if self.no_cache else "False"
|
|
684
|
-
|
|
685
|
-
# Advanced Google parameters
|
|
686
|
-
if self.ludocid:
|
|
687
|
-
payload["ludocid"] = self.ludocid
|
|
688
|
-
|
|
689
|
-
if self.kgmid:
|
|
690
|
-
payload["kgmid"] = self.kgmid
|
|
691
|
-
|
|
692
|
-
# Extra parameters (ibp, lsig, si, uds, etc.)
|
|
693
|
-
payload.update(self.extra_params)
|
|
694
|
-
|
|
695
|
-
return payload
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
# =============================================================================
|
|
699
|
-
# Universal Scraper (Web Unlocker) Models
|
|
700
|
-
# =============================================================================
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
@dataclass
|
|
704
|
-
class UniversalScrapeRequest:
|
|
705
|
-
"""
|
|
706
|
-
Configuration for a Universal Scraping API (Web Unlocker) request.
|
|
707
|
-
|
|
708
|
-
This API bypasses anti-bot protections like Cloudflare, CAPTCHAs, etc.
|
|
709
|
-
|
|
710
|
-
Args:
|
|
711
|
-
url: Target URL to scrape (required).
|
|
712
|
-
js_render: Enable JavaScript rendering with headless browser.
|
|
713
|
-
output_format: Output format - 'html' or 'png' (screenshot).
|
|
714
|
-
country: Country code for geo-targeting the request.
|
|
715
|
-
block_resources: Block specific resources (e.g., 'script', 'image').
|
|
716
|
-
clean_content: Remove JS/CSS from returned content (e.g., 'js,css').
|
|
717
|
-
wait: Wait time in milliseconds after page load (max 100000).
|
|
718
|
-
wait_for: CSS selector to wait for before returning.
|
|
719
|
-
headers: Custom request headers as list of {name, value} dicts.
|
|
720
|
-
cookies: Custom cookies as list of {name, value} dicts.
|
|
721
|
-
extra_params: Additional parameters to pass through.
|
|
722
|
-
|
|
723
|
-
Example:
|
|
724
|
-
>>> req = UniversalScrapeRequest(
|
|
725
|
-
... url="https://example.com",
|
|
726
|
-
... js_render=True,
|
|
727
|
-
... output_format="html",
|
|
728
|
-
... country="us",
|
|
729
|
-
... wait=5000,
|
|
730
|
-
... wait_for=".content"
|
|
731
|
-
... )
|
|
732
|
-
>>> payload = req.to_payload()
|
|
733
|
-
"""
|
|
734
|
-
|
|
735
|
-
url: str
|
|
736
|
-
js_render: bool = False
|
|
737
|
-
output_format: str = "html" # 'html' or 'png'
|
|
738
|
-
country: str | None = None
|
|
739
|
-
block_resources: str | None = None # e.g., 'script', 'image', 'script,image'
|
|
740
|
-
clean_content: str | None = None # e.g., 'js', 'css', 'js,css'
|
|
741
|
-
wait: int | None = None # Milliseconds, max 100000
|
|
742
|
-
wait_for: str | None = None # CSS selector
|
|
743
|
-
headers: list[dict[str, str]] | None = None # [{"name": "...", "value": "..."}]
|
|
744
|
-
cookies: list[dict[str, str]] | None = None # [{"name": "...", "value": "..."}]
|
|
745
|
-
extra_params: dict[str, Any] = field(default_factory=dict) # 这个必须用 field()
|
|
746
|
-
|
|
747
|
-
def __post_init__(self) -> None:
|
|
748
|
-
"""Validate configuration."""
|
|
749
|
-
valid_formats = {"html", "png"}
|
|
750
|
-
if self.output_format.lower() not in valid_formats:
|
|
751
|
-
raise ValueError(
|
|
752
|
-
f"Invalid output_format: {self.output_format}. "
|
|
753
|
-
f"Must be one of: {', '.join(valid_formats)}"
|
|
754
|
-
)
|
|
755
|
-
|
|
756
|
-
if self.wait is not None and (self.wait < 0 or self.wait > 100000):
|
|
757
|
-
raise ValueError(
|
|
758
|
-
f"wait must be between 0 and 100000 milliseconds, got {self.wait}"
|
|
759
|
-
)
|
|
760
|
-
|
|
761
|
-
def to_payload(self) -> dict[str, Any]:
|
|
762
|
-
"""
|
|
763
|
-
Convert to API request payload.
|
|
764
|
-
|
|
765
|
-
Returns:
|
|
766
|
-
Dictionary ready to be sent to the Universal API.
|
|
767
|
-
"""
|
|
768
|
-
payload: dict[str, Any] = {
|
|
769
|
-
"url": self.url,
|
|
770
|
-
"js_render": "True" if self.js_render else "False",
|
|
771
|
-
"type": self.output_format.lower(),
|
|
772
|
-
}
|
|
773
|
-
|
|
774
|
-
if self.country:
|
|
775
|
-
payload["country"] = self.country.lower()
|
|
776
|
-
|
|
777
|
-
if self.block_resources:
|
|
778
|
-
payload["block_resources"] = self.block_resources
|
|
779
|
-
|
|
780
|
-
if self.clean_content:
|
|
781
|
-
payload["clean_content"] = self.clean_content
|
|
782
|
-
|
|
783
|
-
if self.wait is not None:
|
|
784
|
-
payload["wait"] = str(self.wait)
|
|
785
|
-
|
|
786
|
-
if self.wait_for:
|
|
787
|
-
payload["wait_for"] = self.wait_for
|
|
788
|
-
|
|
789
|
-
if self.headers:
|
|
790
|
-
payload["headers"] = json.dumps(self.headers)
|
|
791
|
-
|
|
792
|
-
if self.cookies:
|
|
793
|
-
payload["cookies"] = json.dumps(self.cookies)
|
|
794
|
-
|
|
795
|
-
payload.update(self.extra_params)
|
|
796
|
-
|
|
797
|
-
return payload
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
# =============================================================================
|
|
801
|
-
# Web Scraper Task Models
|
|
802
|
-
# =============================================================================
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
@dataclass
class ScraperTaskConfig:
    """
    Configuration for creating a Web Scraper API task.

    Note: You must get spider_id and spider_name from the Thordata Dashboard.

    Args:
        file_name: Name for the output file.
        spider_id: Spider identifier from Dashboard.
        spider_name: Spider name (usually the target domain).
        parameters: Spider-specific parameters.
        universal_params: Global spider settings.
        include_errors: Include error details in output.

    Example:
        >>> config = ScraperTaskConfig(
        ...     file_name="youtube_data",
        ...     spider_id="youtube_video-post_by-url",
        ...     spider_name="youtube.com",
        ...     parameters={
        ...         "url": "https://youtube.com/@channel/videos",
        ...         "num_of_posts": "50"
        ...     }
        ... )
        >>> payload = config.to_payload()
    """

    file_name: str
    spider_id: str
    spider_name: str
    parameters: dict[str, Any]
    universal_params: dict[str, Any] | None = None
    include_errors: bool = True

    def to_payload(self) -> dict[str, Any]:
        """
        Serialize this configuration into a Web Scraper API request body.

        Returns:
            Dictionary ready to be sent to the Web Scraper API.
        """
        body: dict[str, Any] = {
            "file_name": self.file_name,
            "spider_id": self.spider_id,
            "spider_name": self.spider_name,
            # The API expects a JSON-encoded *list* of parameter objects.
            "spider_parameters": json.dumps([self.parameters]),
            "spider_errors": "true" if self.include_errors else "false",
        }
        if self.universal_params:
            body["spider_universal"] = json.dumps(self.universal_params)
        return body
|
860
|
-
|
|
861
|
-
@dataclass
class CommonSettings:
    """
    Common settings for YouTube video/audio downloads.

    Used by the /video_builder endpoint as the ``common_settings`` parameter
    (called ``spider_universal`` in some documentation).

    Args:
        resolution: Video resolution (360p/480p/720p/1080p/1440p/2160p).
        audio_format: Audio format (opus/mp3).
        bitrate: Audio bitrate (48/64/128/160/256/320 or with Kbps suffix).
        is_subtitles: Whether to download subtitles ("true"/"false").
        subtitles_language: Subtitle language code (e.g., "en", "zh-Hans").

    Example for video:
        >>> settings = CommonSettings(
        ...     resolution="1080p",
        ...     is_subtitles="true",
        ...     subtitles_language="en"
        ... )

    Example for audio:
        >>> settings = CommonSettings(
        ...     audio_format="mp3",
        ...     bitrate="320",
        ...     is_subtitles="true",
        ...     subtitles_language="en"
        ... )
    """

    # Video settings
    resolution: str | None = None

    # Audio settings
    audio_format: str | None = None
    bitrate: str | None = None

    # Subtitle settings (shared by video and audio)
    is_subtitles: str | None = None
    subtitles_language: str | None = None

    # Accepted values (unannotated on purpose: plain class attributes,
    # not dataclass fields).
    VALID_RESOLUTIONS = {"360p", "480p", "720p", "1080p", "1440p", "2160p"}
    VALID_AUDIO_FORMATS = {"opus", "mp3"}

    def to_dict(self) -> dict[str, Any]:
        """Return only the settings that are actually set (non-None)."""
        candidates = (
            ("resolution", self.resolution),
            ("audio_format", self.audio_format),
            ("bitrate", self.bitrate),
            ("is_subtitles", self.is_subtitles),
            ("subtitles_language", self.subtitles_language),
        )
        return {name: value for name, value in candidates if value is not None}

    def to_json(self) -> str:
        """Serialize the non-None settings as a JSON string for form submission."""
        return json.dumps(self.to_dict())
|
926
|
-
|
|
927
|
-
@dataclass
class VideoTaskConfig:
    """
    Configuration for creating a YouTube video/audio download task.

    Uses the /video_builder endpoint.

    Args:
        file_name: Name for the output file. Supports {{TasksID}}, {{VideoID}}.
        spider_id: Spider identifier (e.g., "youtube_video_by-url", "youtube_audio_by-url").
        spider_name: Spider name (typically "youtube.com").
        parameters: Spider-specific parameters (e.g., video URL).
        common_settings: Video/audio settings (resolution, format, subtitles).
        include_errors: Include error details in output.

    Example:
        >>> config = VideoTaskConfig(
        ...     file_name="{{VideoID}}",
        ...     spider_id="youtube_video_by-url",
        ...     spider_name="youtube.com",
        ...     parameters={"url": "https://www.youtube.com/watch?v=xxx"},
        ...     common_settings=CommonSettings(
        ...         resolution="1080p",
        ...         is_subtitles="true",
        ...         subtitles_language="en"
        ...     )
        ... )
    """

    file_name: str
    spider_id: str
    spider_name: str
    parameters: dict[str, Any]
    common_settings: CommonSettings
    include_errors: bool = True

    def to_payload(self) -> dict[str, Any]:
        """
        Serialize this configuration into a video_builder request body.

        Returns:
            Dictionary ready to be sent to the video_builder API.
        """
        return {
            "file_name": self.file_name,
            "spider_id": self.spider_id,
            "spider_name": self.spider_name,
            # The API expects a JSON-encoded *list* of parameter objects.
            "spider_parameters": json.dumps([self.parameters]),
            "spider_errors": "true" if self.include_errors else "false",
            "common_settings": self.common_settings.to_json(),
        }
|
980
|
-
|
|
981
|
-
# =============================================================================
|
|
982
|
-
# Response Models
|
|
983
|
-
# =============================================================================
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
@dataclass
class TaskStatusResponse:
    """
    Response from task status check.

    Attributes:
        task_id: The task identifier.
        status: Current task status.
        progress: Optional progress percentage.
        message: Optional status message.
    """

    task_id: str
    status: str
    progress: int | None = None
    message: str | None = None

    def is_success(self) -> bool:
        """Check if the task completed successfully."""
        return self.status.lower() in {"ready", "success", "finished"}

    def is_failed(self) -> bool:
        """Check if the task failed."""
        return self.status.lower() in {"failed", "error"}

    def is_complete(self) -> bool:
        """Check if the task has completed (success, failure, or cancelled)."""
        # Terminal states are: success states, failure states, and "cancelled".
        return (
            self.is_success()
            or self.is_failed()
            or self.status.lower() == "cancelled"
        )
|
1025
|
-
|
|
1026
|
-
@dataclass
class UsageStatistics:
    """
    Response model for account usage statistics.

    Attributes:
        total_usage_traffic: Total traffic used (KB).
        traffic_balance: Remaining traffic balance (KB).
        query_days: Number of days in the query range.
        range_usage_traffic: Traffic used in the specified date range (KB).
        data: Daily usage breakdown.
    """

    total_usage_traffic: float
    traffic_balance: float
    query_days: int
    range_usage_traffic: float
    data: list[dict[str, Any]]

    # KB per GB; unannotated so it is a class attribute, not a dataclass field.
    _KB_PER_GB = 1024 * 1024

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> UsageStatistics:
        """Create from API response dict (missing keys default to zero/empty)."""
        return cls(
            total_usage_traffic=float(data.get("total_usage_traffic", 0)),
            traffic_balance=float(data.get("traffic_balance", 0)),
            query_days=int(data.get("query_days", 0)),
            range_usage_traffic=float(data.get("range_usage_traffic", 0)),
            data=data.get("data", []),
        )

    def total_usage_gb(self) -> float:
        """Total usage converted from KB to GB."""
        return self.total_usage_traffic / self._KB_PER_GB

    def balance_gb(self) -> float:
        """Remaining balance converted from KB to GB."""
        return self.traffic_balance / self._KB_PER_GB

    def range_usage_gb(self) -> float:
        """Range usage converted from KB to GB."""
        return self.range_usage_traffic / self._KB_PER_GB
|
1068
|
-
|
|
1069
|
-
@dataclass
class ProxyUser:
    """
    Proxy user (sub-account) information.

    Attributes:
        username: User's username.
        password: User's password.
        status: User status (True=enabled, False=disabled).
        traffic_limit: Traffic limit in MB (0 = unlimited).
        usage_traffic: Traffic used in KB.
    """

    username: str
    password: str
    status: bool
    traffic_limit: int
    usage_traffic: float

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ProxyUser:
        """Create from API response dict."""
        # The API may report status as bool True, the string "true", or 1.
        raw_status = data.get("status")
        return cls(
            username=data.get("username", ""),
            password=data.get("password", ""),
            status=raw_status in (True, "true", 1),
            traffic_limit=int(data.get("traffic_limit", 0)),
            usage_traffic=float(data.get("usage_traffic", 0)),
        )

    def usage_gb(self) -> float:
        """Usage converted from KB to GB."""
        return self.usage_traffic / (1024 * 1024)

    def limit_gb(self) -> float:
        """Limit converted from MB to GB; 0 means unlimited."""
        return 0 if self.traffic_limit == 0 else self.traffic_limit / 1024
|
1109
|
-
|
|
1110
|
-
@dataclass
class ProxyUserList:
    """
    Response model for proxy user list.

    Attributes:
        limit: Total traffic limit (KB).
        remaining_limit: Remaining traffic limit (KB).
        user_count: Number of users.
        users: List of proxy users.
    """

    limit: float
    remaining_limit: float
    user_count: int
    users: list[ProxyUser]

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ProxyUserList:
        """Create from API response dict."""
        members = [ProxyUser.from_dict(entry) for entry in data.get("list", [])]
        return cls(
            limit=float(data.get("limit", 0)),
            remaining_limit=float(data.get("remaining_limit", 0)),
            # Fall back to counting parsed entries if the API omits user_count.
            user_count=int(data.get("user_count", len(members))),
            users=members,
        )
|
1140
|
-
|
|
1141
|
-
@dataclass
class ProxyServer:
    """
    ISP or Datacenter proxy server information.

    Attributes:
        ip: Proxy server IP address.
        port: Proxy server port.
        username: Authentication username.
        password: Authentication password.
        expiration_time: Expiration timestamp (Unix timestamp or datetime string).
        region: Server region (optional).
    """

    ip: str
    port: int
    username: str
    password: str
    expiration_time: int | str | None = None
    region: str | None = None

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ProxyServer:
        """Create from API response dict (accepts legacy "user"/"pwd"/"expireTime" keys)."""
        return cls(
            ip=data.get("ip", ""),
            port=int(data.get("port", 0)),
            username=data.get("username", data.get("user", "")),
            password=data.get("password", data.get("pwd", "")),
            expiration_time=data.get("expiration_time", data.get("expireTime")),
            region=data.get("region"),
        )

    def to_proxy_url(self, protocol: str = "https") -> str:
        """
        Build proxy URL for this server.

        Args:
            protocol: Proxy protocol (http/https/socks5).

        Returns:
            Complete proxy URL (``scheme://user:pass@ip:port``).
        """
        credentials = f"{self.username}:{self.password}"
        endpoint = f"{self.ip}:{self.port}"
        return f"{protocol}://{credentials}@{endpoint}"

    def is_expired(self) -> bool:
        """
        Check if proxy has expired (if expiration_time is available).

        Only integer Unix timestamps are evaluated; a missing value or a
        string timestamp (which would need datetime parsing) counts as
        not expired.
        """
        if not isinstance(self.expiration_time, int):
            return False

        import time

        return time.time() > self.expiration_time
|
|
7
|
+
from .types.common import CommonSettings
|
|
8
|
+
from .types.proxy import (
|
|
9
|
+
ProxyConfig,
|
|
10
|
+
ProxyProduct,
|
|
11
|
+
ProxyServer,
|
|
12
|
+
ProxyUser,
|
|
13
|
+
ProxyUserList,
|
|
14
|
+
StaticISPProxy,
|
|
15
|
+
StickySession,
|
|
16
|
+
)
|
|
17
|
+
from .types.serp import SerpRequest
|
|
18
|
+
from .types.task import (
|
|
19
|
+
ScraperTaskConfig,
|
|
20
|
+
TaskStatusResponse,
|
|
21
|
+
UsageStatistics,
|
|
22
|
+
VideoTaskConfig,
|
|
23
|
+
)
|
|
24
|
+
from .types.universal import UniversalScrapeRequest
|
|
25
|
+
|
|
26
|
+
# Public names re-exported by this module for backward compatibility:
# the definitions now live in the thordata.types.* submodules.
__all__ = [
    "ProxyProduct",
    "ProxyConfig",
    "StickySession",
    "StaticISPProxy",
    "ProxyUser",
    "ProxyUserList",
    "ProxyServer",
    "SerpRequest",
    "UniversalScrapeRequest",
    "ScraperTaskConfig",
    "CommonSettings",
    "VideoTaskConfig",
    "TaskStatusResponse",
    "UsageStatistics",
]