thordata-sdk 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +4 -40
- thordata/async_client.py +572 -1241
- thordata/async_unlimited.py +130 -0
- thordata/client.py +1184 -1309
- thordata/core/__init__.py +23 -0
- thordata/core/async_http_client.py +91 -0
- thordata/core/http_client.py +79 -0
- thordata/core/tunnel.py +287 -0
- thordata/demo.py +2 -2
- thordata/enums.py +41 -380
- thordata/models.py +37 -1193
- thordata/tools/__init__.py +28 -0
- thordata/tools/base.py +42 -0
- thordata/tools/code.py +26 -0
- thordata/tools/ecommerce.py +67 -0
- thordata/tools/search.py +73 -0
- thordata/tools/social.py +190 -0
- thordata/tools/video.py +81 -0
- thordata/types/__init__.py +77 -0
- thordata/types/common.py +141 -0
- thordata/types/proxy.py +340 -0
- thordata/types/serp.py +224 -0
- thordata/types/task.py +144 -0
- thordata/types/universal.py +66 -0
- thordata/unlimited.py +169 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/METADATA +74 -51
- thordata_sdk-1.5.0.dist-info/RECORD +35 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/WHEEL +1 -1
- thordata_sdk-1.3.0.dist-info/RECORD +0 -16
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.3.0.dist-info → thordata_sdk-1.5.0.dist-info}/top_level.txt +0 -0
thordata/types/common.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Common types shared across different modules.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ThordataBaseConfig:
    """Base class for config objects that can be turned into a request payload."""

    def to_payload(self) -> dict[str, Any]:
        """Return this config as a payload dict.

        The base implementation always raises; subclasses provide the
        actual conversion.

        Raises:
            NotImplementedError: Always, when called on the base class.
        """
        raise NotImplementedError
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Device(str, Enum):
    """Device kinds, as lowercase string values."""

    DESKTOP = "desktop"
    MOBILE = "mobile"
    TABLET = "tablet"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class OutputFormat(str, Enum):
    """Output format names, as lowercase string values."""

    HTML = "html"
    PNG = "png"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class CommonSettings:
    """
    Common settings for video/audio downloads.

    Field names double as the keys of the serialized payload (the original
    author notes they are aligned with the Thordata Video Builder API).
    Fields left as ``None`` are omitted when serializing.
    """

    resolution: str | None = None
    video_codec: str | None = None  # vp9, avc
    audio_format: str | None = None  # opus, mp3
    bitrate: str | None = None
    selected_only: str | bool | None = None
    is_subtitles: str | bool | None = None
    subtitles_language: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return every non-``None`` attribute as a ``{name: string}`` mapping.

        Booleans are written as the explicit strings ``"true"``/``"false"``;
        any other value is converted with ``str()``.
        """
        return {
            name: (
                ("true" if value else "false")
                if isinstance(value, bool)
                else str(value)
            )
            for name, value in vars(self).items()
            if value is not None
        }

    def to_json(self) -> str:
        """Return the ``to_dict()`` mapping encoded as a JSON string."""
        serializable = self.to_dict()
        return json.dumps(serializable)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def normalize_enum_value(value: object, enum_class: type) -> str:
    """
    Return the lowercase string form of an enum member or a plain string.

    Args:
        value: An instance of ``enum_class`` or a ``str``.
        enum_class: The class ``value`` is checked against first.

    Returns:
        For an ``enum_class`` instance, its ``value`` attribute (or the
        object itself when it has none) converted with ``str()`` and
        lowercased; for a plain string, the lowercased string.

    Raises:
        TypeError: If ``value`` is neither an ``enum_class`` instance nor a
            ``str``.
    """
    if isinstance(value, enum_class):
        underlying = getattr(value, "value", value)
        return str(underlying).lower()

    if not isinstance(value, str):
        raise TypeError(
            f"Expected {enum_class.__name__} or str, got {type(value).__name__}"
        )

    return value.lower()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# --- Geography Enums ---
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class Continent(str, Enum):
    """Continents, each mapped to a two-letter lowercase code."""

    AFRICA = "af"
    ANTARCTICA = "an"
    ASIA = "as"
    EUROPE = "eu"
    NORTH_AMERICA = "na"
    OCEANIA = "oc"
    SOUTH_AMERICA = "sa"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class Country(str, Enum):
    """Countries, each member named by its uppercase two-letter code.

    Every member's value is the lowercase form of its name.
    """

    # North America
    US = "us"
    CA = "ca"
    MX = "mx"

    # Europe
    GB = "gb"
    DE = "de"
    FR = "fr"
    ES = "es"
    IT = "it"
    NL = "nl"
    PL = "pl"
    RU = "ru"
    UA = "ua"
    SE = "se"
    NO = "no"
    DK = "dk"
    FI = "fi"
    CH = "ch"
    AT = "at"
    BE = "be"
    PT = "pt"
    IE = "ie"
    CZ = "cz"
    GR = "gr"

    # Asia-Pacific
    CN = "cn"
    JP = "jp"
    KR = "kr"
    IN = "in"
    AU = "au"
    NZ = "nz"
    SG = "sg"
    HK = "hk"
    TW = "tw"
    TH = "th"
    VN = "vn"
    ID = "id"
    MY = "my"
    PH = "ph"
    PK = "pk"
    BD = "bd"

    # South America
    BR = "br"
    AR = "ar"
    CL = "cl"
    CO = "co"
    PE = "pe"
    VE = "ve"

    # Middle East
    AE = "ae"
    SA = "sa"
    IL = "il"
    TR = "tr"

    # Africa
    ZA = "za"
    EG = "eg"
    NG = "ng"
    KE = "ke"
    MA = "ma"
|
thordata/types/proxy.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Proxy related types and configurations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import uuid
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from enum import Enum, IntEnum
|
|
11
|
+
from typing import Any
|
|
12
|
+
from urllib.parse import quote
|
|
13
|
+
|
|
14
|
+
# Import geography from common to avoid circular issues
|
|
15
|
+
from .common import Continent
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ProxyProduct(str, Enum):
    """Proxy product names, each with an associated default port."""

    RESIDENTIAL = "residential"
    MOBILE = "mobile"
    DATACENTER = "datacenter"
    ISP = "isp"

    @property
    def default_port(self) -> int:
        """Return the port associated with this product.

        Falls back to 9999 for a value missing from the lookup table.
        """
        port_by_product = {
            "residential": 9999,
            "mobile": 5555,
            "datacenter": 7777,
            "isp": 6666,
        }
        fallback = 9999
        return port_by_product.get(self.value, fallback)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ProxyType(IntEnum):
    """Integer codes for proxy types.

    NOTE(review): what consumes these codes is not visible in this module —
    presumably they are API identifiers; confirm against the callers.
    """

    RESIDENTIAL = 1
    UNLIMITED = 2
    DATACENTER = 3
    ISP = 4
    MOBILE = 5
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class SessionType(str, Enum):
    """Session mode names: rotating or sticky."""

    ROTATING = "rotating"
    STICKY = "sticky"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class ProxyHost(str, Enum):
    """Named proxy hostnames (a default plus two regional hosts)."""

    DEFAULT = "pr.thordata.net"
    NORTH_AMERICA = "t.na.thordata.net"
    EUROPE = "t.eu.thordata.net"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ProxyPort(IntEnum):
    """Port numbers, one per proxy product name."""

    RESIDENTIAL = 9999
    MOBILE = 5555
    DATACENTER = 7777
    ISP = 6666
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class ProxyConfig:
    """
    Credentials plus targeting and session options for one proxy connection.

    On construction the product is normalized to a ``ProxyProduct``, a
    missing host or port is filled in from the product, and the settings
    are validated.

    Raises:
        ValueError: If the product name is unknown, or if the protocol,
            session settings, ASN/country combination, continent code, or
            country code fails validation.
    """

    username: str
    password: str
    product: ProxyProduct | str = ProxyProduct.RESIDENTIAL
    host: str | None = None
    port: int | None = None
    protocol: str = "https"

    # Geo-targeting
    continent: str | None = None
    country: str | None = None
    state: str | None = None
    city: str | None = None
    asn: str | None = None

    # Session control
    session_id: str | None = None
    session_duration: int | None = None  # minutes, 1-90

    # Deliberately unannotated: a plain class attribute used for validation,
    # not a dataclass field.
    _VALID_CONTINENTS = {v.value for v in Continent}

    def __post_init__(self) -> None:
        """Normalize the product, fill in default host/port, then validate."""
        if isinstance(self.product, str):
            # Also covers ProxyProduct members (they are str subclasses);
            # an unknown name raises ValueError from the enum lookup.
            self.product = ProxyProduct(self.product.lower())

        if self.host is None:
            host_map = {
                ProxyProduct.RESIDENTIAL: "pr.thordata.net",
                ProxyProduct.DATACENTER: "dc.pr.thordata.net",
                ProxyProduct.MOBILE: "m.pr.thordata.net",
                ProxyProduct.ISP: "isp.pr.thordata.net",
            }
            self.host = host_map.get(self.product, "pr.thordata.net")

        if self.port is None:
            self.port = self.product.default_port

        self._validate()

    def _validate(self) -> None:
        """Check field values and combinations.

        Raises:
            ValueError: On an unsupported protocol, a session duration
                outside 1-90 or given without a session id, an ASN without
                a country, an unknown continent code, or a country code
                that is not exactly two ASCII letters.
        """
        if self.protocol not in ("http", "https", "socks5", "socks5h"):
            raise ValueError(f"Invalid protocol: {self.protocol}")

        if self.session_duration is not None:
            if not 1 <= self.session_duration <= 90:
                raise ValueError("session_duration must be between 1 and 90 minutes")
            if not self.session_id:
                raise ValueError("session_duration requires session_id")

        if self.asn and not self.country:
            raise ValueError("ASN targeting requires country")

        if self.continent and self.continent.lower() not in self._VALID_CONTINENTS:
            raise ValueError(f"Invalid continent code: {self.continent}")

        # fullmatch, not match with "$": "$" also matches just before a
        # trailing newline, which would let "us\n" through and put the
        # newline into the generated username.
        if self.country and not re.fullmatch(r"[a-zA-Z]{2}", self.country):
            raise ValueError("Invalid country code")

    def build_username(self) -> str:
        """Return the username with targeting and session options appended.

        The base name gets a ``td-customer-`` prefix unless it already has
        one; each set option is then appended as a ``-key-value`` segment,
        in a fixed order.
        """
        base = self.username
        if not base.startswith("td-customer-"):
            base = f"td-customer-{base}"

        parts = [base]

        if self.continent:
            parts.append(f"continent-{self.continent.lower()}")
        if self.country:
            parts.append(f"country-{self.country.lower()}")
        if self.state:
            parts.append(f"state-{self.state.lower()}")
        if self.city:
            parts.append(f"city-{self.city.lower()}")
        if self.asn:
            # Uppercase the ASN and make sure it carries the "AS" prefix.
            asn_val = self.asn.upper()
            if not asn_val.startswith("AS"):
                asn_val = f"AS{asn_val}"
            parts.append(f"asn-{asn_val}")
        if self.session_id:
            parts.append(f"sessid-{self.session_id}")
        if self.session_duration:
            parts.append(f"sesstime-{self.session_duration}")

        return "-".join(parts)

    def build_proxy_url(self) -> str:
        """Return ``scheme://user:password@host:port``.

        Credentials are percent-encoded; ``socks5`` is written as
        ``socks5h``.
        """
        user = self.build_username()
        proto = "socks5h" if self.protocol == "socks5" else self.protocol

        safe_user = quote(user, safe="")
        safe_pass = quote(self.password, safe="")

        return f"{proto}://{safe_user}:{safe_pass}@{self.host}:{self.port}"

    def build_proxy_endpoint(self) -> str:
        """Return ``scheme://host:port`` without credentials (``socks5`` becomes ``socks5h``)."""
        proto = "socks5h" if self.protocol == "socks5" else self.protocol
        return f"{proto}://{self.host}:{self.port}"

    def build_proxy_basic_auth(self) -> str:
        """Return the unencoded ``username:password`` pair, using the built username."""
        return f"{self.build_username()}:{self.password}"

    def to_proxies_dict(self) -> dict[str, str]:
        """Return the proxy URL under both the ``"http"`` and ``"https"`` keys."""
        url = self.build_proxy_url()
        return {"http": url, "https": url}

    def to_aiohttp_config(self) -> tuple:
        """Return ``(proxy_url, aiohttp.BasicAuth)`` for this proxy.

        The URL uses ``self.protocol`` as given (no ``socks5`` rewrite).

        Raises:
            ImportError: If ``aiohttp`` is not installed.
        """
        # Keep the try body to the import so only a missing dependency is
        # reported as "aiohttp required".
        try:
            import aiohttp
        except ImportError as e:
            raise ImportError("aiohttp required") from e

        return (
            f"{self.protocol}://{self.host}:{self.port}",
            aiohttp.BasicAuth(login=self.build_username(), password=self.password),
        )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@dataclass
class StaticISPProxy:
    """
    Connection details for a static ISP proxy with an explicit host.

    Unlike the username built for other products, the username here is
    used exactly as given.

    Raises:
        ValueError: On construction, if ``protocol`` is not one of
            ``http``, ``https``, ``socks5``, ``socks5h``.
    """

    host: str
    username: str
    password: str
    port: int = 6666
    protocol: str = "https"

    def __post_init__(self) -> None:
        """Reject unsupported protocols."""
        supported = ("http", "https", "socks5", "socks5h")
        if self.protocol not in supported:
            raise ValueError(f"Invalid protocol: {self.protocol}")

    def build_username(self) -> str:
        """Return the raw username; no prefix or parameters are added."""
        return self.username

    def build_proxy_endpoint(self) -> str:
        """Return ``scheme://host:port`` without credentials (``socks5`` becomes ``socks5h``)."""
        scheme = self.protocol
        if scheme == "socks5":
            scheme = "socks5h"
        return f"{scheme}://{self.host}:{self.port}"

    def build_proxy_basic_auth(self) -> str:
        """Return the unencoded ``username:password`` pair."""
        return f"{self.username}:{self.password}"

    def build_proxy_url(self) -> str:
        """Return ``scheme://user:password@host:port`` with percent-encoded credentials."""
        scheme = self.protocol
        if scheme == "socks5":
            scheme = "socks5h"
        encoded_user = quote(self.username, safe="")
        encoded_password = quote(self.password, safe="")
        return f"{scheme}://{encoded_user}:{encoded_password}@{self.host}:{self.port}"

    def to_proxies_dict(self) -> dict[str, str]:
        """Return the proxy URL under both the ``"http"`` and ``"https"`` keys."""
        return dict.fromkeys(("http", "https"), self.build_proxy_url())

    def to_aiohttp_config(self) -> tuple:
        """Return ``(proxy_url, aiohttp.BasicAuth)`` for this proxy.

        The URL uses ``self.protocol`` as given (no ``socks5`` rewrite).

        Raises:
            ImportError: If ``aiohttp`` is not installed.
        """
        try:
            import aiohttp

            endpoint = f"{self.protocol}://{self.host}:{self.port}"
            credentials = aiohttp.BasicAuth(
                login=self.username, password=self.password
            )
            return (endpoint, credentials)
        except ImportError as e:
            # Chain the original error so the missing-module detail is kept.
            raise ImportError("aiohttp required") from e

    @classmethod
    def from_env(cls) -> StaticISPProxy:
        """Build an instance from ``THORDATA_ISP_*`` environment variables.

        Port and protocol keep their defaults.

        Raises:
            ValueError: If host, username, or password is unset or empty.
        """
        import os

        host = os.environ.get("THORDATA_ISP_HOST")
        username = os.environ.get("THORDATA_ISP_USERNAME")
        password = os.environ.get("THORDATA_ISP_PASSWORD")
        if not (host and username and password):
            raise ValueError(
                "THORDATA_ISP_HOST, THORDATA_ISP_USERNAME, and THORDATA_ISP_PASSWORD are required"
            )
        return cls(host=host, username=username, password=password)  # type: ignore
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@dataclass
class StickySession(ProxyConfig):
    """
    A ``ProxyConfig`` preset for sticky sessions.

    ``duration_minutes`` always overwrites ``session_duration``. When
    ``auto_session_id`` is true and no ``session_id`` was given, a random
    12-character hex id is generated before the base validation runs.
    """

    duration_minutes: int = 10
    auto_session_id: bool = True

    def __post_init__(self) -> None:
        """Fill in the session fields, then run the base-class setup."""
        needs_generated_id = self.auto_session_id and not self.session_id
        if needs_generated_id:
            # First 12 hex characters of a random UUID.
            self.session_id = uuid.uuid4().hex[:12]

        self.session_duration = self.duration_minutes
        super().__post_init__()
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
@dataclass
class ProxyUser:
    """
    One proxy sub-user record with its traffic limit and usage.

    NOTE(review): the ``*_gb`` helpers divide ``usage_traffic`` by 1024*1024
    and ``traffic_limit`` by 1024, which suggests usage is reported in KB
    and the limit in MB — confirm against the API documentation.
    """

    username: str
    password: str
    status: bool
    traffic_limit: int
    usage_traffic: float

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ProxyUser:
        """Build a ``ProxyUser`` from a response dict.

        Missing keys and explicit ``None`` values both fall back to a
        default (empty string or zero), so a JSON ``null`` neither raises
        nor turns into the string ``"None"``. ``status`` is true only when
        its string form, lowercased, is ``"true"`` or ``"1"``.
        """

        def value_or(key: str, default: Any) -> Any:
            # dict.get's default only covers a missing key, not a null value.
            value = data.get(key)
            return default if value is None else value

        return cls(
            username=str(value_or("username", "")),
            password=str(value_or("password", "")),
            status=str(data.get("status")).lower() in ("true", "1"),
            traffic_limit=int(value_or("traffic_limit", 0)),
            usage_traffic=float(value_or("usage_traffic", 0)),
        )

    def usage_gb(self) -> float:
        """Return ``usage_traffic`` divided by 1024 * 1024."""
        return self.usage_traffic / (1024 * 1024)

    def limit_gb(self) -> float:
        """Return ``traffic_limit`` divided by 1024 (``0.0`` when the limit is zero)."""
        if self.traffic_limit == 0:
            return 0.0
        return self.traffic_limit / 1024.0
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
@dataclass
class ProxyUserList:
    """A list of proxy sub-users together with the overall limit figures."""

    limit: float
    remaining_limit: float
    user_count: int
    users: list[ProxyUser]

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ProxyUserList:
        """Build a ``ProxyUserList`` from a response dict.

        The user records are read from ``data["list"]``, or from
        ``data["data"]`` when ``"list"`` is absent or ``None``; anything
        that is not a list is treated as no users. Missing or ``None``
        numeric fields default to zero, and ``user_count`` defaults to the
        number of parsed users.
        """
        user_list_raw = data.get("list")
        if user_list_raw is None:
            user_list_raw = data.get("data")
        if not isinstance(user_list_raw, list):
            user_list_raw = []

        users = [ProxyUser.from_dict(u) for u in user_list_raw]

        def value_or(key: str, default: Any) -> Any:
            # dict.get's default only covers a missing key, not a null value.
            value = data.get(key)
            return default if value is None else value

        return cls(
            limit=float(value_or("limit", 0)),
            remaining_limit=float(value_or("remaining_limit", 0)),
            user_count=int(value_or("user_count", len(users))),
            users=users,
        )
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
@dataclass
class ProxyServer:
    """One proxy server entry: address, credentials, optional expiry and region."""

    ip: str
    port: int
    username: str
    password: str
    expiration_time: int | str | None = None
    region: str | None = None

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ProxyServer:
        """Build a ``ProxyServer`` from a response dict.

        Accepts ``user``/``pwd``/``expireTime`` as alternative keys for
        ``username``/``password``/``expiration_time``. A missing, ``None``
        or empty port becomes 0; an empty or missing region becomes
        ``None``. The expiration value is stored as received.
        """
        return cls(
            ip=str(data.get("ip", "")),
            # "or 0" also covers an explicit null, which int() would reject.
            port=int(data.get("port") or 0),
            username=str(data.get("username", data.get("user", ""))),
            password=str(data.get("password", data.get("pwd", ""))),
            expiration_time=data.get("expiration_time", data.get("expireTime")),
            region=str(data.get("region")) if data.get("region") else None,
        )

    def to_proxy_url(self, protocol: str = "https") -> str:
        """Return ``protocol://user:password@ip:port``.

        Credentials are percent-encoded so characters such as ``@``, ``:``
        or ``/`` cannot break the URL structure.
        """
        safe_user = quote(self.username, safe="")
        safe_pass = quote(self.password, safe="")
        return f"{protocol}://{safe_user}:{safe_pass}@{self.ip}:{self.port}"

    def is_expired(self) -> bool:
        """Return True when ``expiration_time`` lies before the current time.

        Numeric values, including numeric strings, are compared against
        ``time.time()``. ``None`` and strings that are not plain numbers
        (for example formatted dates, whose format is not known here) are
        reported as not expired.
        """
        import time

        expires = self.expiration_time
        if expires is None:
            return False

        if isinstance(expires, str):
            # assumes a numeric string is the same epoch-seconds value the
            # int form carries — TODO confirm against the API
            try:
                expires = float(expires)
            except ValueError:
                return False

        if isinstance(expires, (int, float)):
            return time.time() > expires
        return False
|
thordata/types/serp.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SERP (Search Engine Results Page) related types and configurations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from .common import ThordataBaseConfig
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Engine(str, Enum):
    """Search engine identifiers, grouped by provider."""

    # Google
    GOOGLE = "google"
    GOOGLE_NEWS = "google_news"
    GOOGLE_SHOPPING = "google_shopping"
    GOOGLE_VIDEOS = "google_videos"
    GOOGLE_IMAGES = "google_images"
    GOOGLE_MAPS = "google_maps"
    GOOGLE_JOBS = "google_jobs"
    GOOGLE_PLAY = "google_play"
    GOOGLE_TRENDS = "google_trends"
    GOOGLE_SCHOLAR = "google_scholar"
    GOOGLE_PATENTS = "google_patents"
    GOOGLE_FINANCE = "google_finance"
    GOOGLE_FLIGHTS = "google_flights"
    GOOGLE_LENS = "google_lens"
    GOOGLE_HOTELS = "google_hotels"

    # Bing
    BING = "bing"
    BING_NEWS = "bing_news"
    BING_SHOPPING = "bing_shopping"
    BING_IMAGES = "bing_images"
    BING_VIDEOS = "bing_videos"
    BING_MAPS = "bing_maps"

    # Others
    YANDEX = "yandex"
    DUCKDUCKGO = "duckduckgo"
    BAIDU = "baidu"

    # Legacy / Compatibility Aliases
    GOOGLE_SEARCH = "google_search"
    GOOGLE_WEB = "google_web"
    GOOGLE_LOCAL = "google_local"
    # mapped to shopping with product_id internally usually
    GOOGLE_PRODUCT = "google_product"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class GoogleSearchType(str, Enum):
    """Google search categories, as lowercase names."""

    SEARCH = "search"
    NEWS = "news"
    SHOPPING = "shopping"
    IMAGES = "images"
    VIDEOS = "videos"
    MAPS = "maps"
    # Add others as needed
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class BingSearchType(str, Enum):
    """Bing search categories, as lowercase names."""

    SEARCH = "search"
    NEWS = "news"
    SHOPPING = "shopping"
    IMAGES = "images"
    VIDEOS = "videos"
    MAPS = "maps"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class GoogleTbm(str, Enum):
    """Short Google ``tbm`` codes for the non-default search categories."""

    NEWS = "nws"
    SHOPPING = "shop"
    IMAGES = "isch"
    VIDEOS = "vid"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class TimeRange(str, Enum):
    """Named time windows, from one hour up to one year."""

    HOUR = "hour"
    DAY = "day"
    WEEK = "week"
    MONTH = "month"
    YEAR = "year"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class SerpRequest(ThordataBaseConfig):
    """
    Parameters for one SERP query, convertible to a flat request payload.

    ``to_payload`` lowercases the engine name, renames fields to the short
    parameter names noted beside each field, and leaves out unset options.
    """

    query: str
    engine: str = "google"
    num: int = 10
    start: int = 0

    # Localization
    country: str | None = None  # 'gl'
    language: str | None = None  # 'hl'
    google_domain: str | None = None
    countries_filter: str | None = None  # 'cr'
    languages_filter: str | None = None  # 'lr'

    # Geo-targeting
    location: str | None = None
    uule: str | None = None

    # Search type
    search_type: str | None = None  # 'tbm'

    # Filters
    safe_search: bool | None = None
    time_filter: str | None = None  # 'tbs'
    no_autocorrect: bool = False  # 'nfpr'
    filter_duplicates: bool | None = None  # 'filter'

    # Device & Rendering
    device: str | None = None
    render_js: bool | None = None
    no_cache: bool | None = None

    # Output
    output_format: str = "json"

    # Advanced Google
    ludocid: str | None = None
    kgmid: str | None = None

    # Pass-through for any other param
    extra_params: dict[str, Any] = field(default_factory=dict)

    # Lookup tables (unannotated, so class attributes rather than fields).
    # Friendly names map to short codes; short codes map to themselves.
    SEARCH_TYPE_MAP = {
        "images": "isch",
        "shopping": "shop",
        "news": "nws",
        "videos": "vid",
        "isch": "isch",
        "shop": "shop",
        "nws": "nws",
        "vid": "vid",
    }

    TIME_FILTER_MAP = {
        "hour": "qdr:h",
        "day": "qdr:d",
        "week": "qdr:w",
        "month": "qdr:m",
        "year": "qdr:y",
    }

    def to_payload(self) -> dict[str, Any]:
        """Return the request parameters for this search as a dict.

        Only ``engine``, ``num`` and the query are always present; every
        other key is added only when its field is set. ``extra_params`` is
        merged last, so it can override any generated key.
        """
        engine_name = self.engine.lower()
        params: dict[str, Any] = {"engine": engine_name, "num": str(self.num)}

        # Output mode: "1" for JSON, "2" for JSON plus HTML. HTML (and any
        # unrecognized format) sends no "json" key at all.
        output = self.output_format.lower()
        if output == "json":
            params["json"] = "1"
        elif output in ("2", "both", "json+html"):
            params["json"] = "2"

        # Yandex takes the query under "text"; every other engine uses "q".
        query_key = "text" if engine_name == "yandex" else "q"
        params[query_key] = self.query

        if self.google_domain:
            params["google_domain"] = self.google_domain
        if self.start > 0:
            params["start"] = str(self.start)
        if self.country:
            params["gl"] = self.country.lower()
        if self.language:
            params["hl"] = self.language.lower()

        # Fields forwarded unchanged when set.
        for key, value in (
            ("cr", self.countries_filter),
            ("lr", self.languages_filter),
            ("location", self.location),
            ("uule", self.uule),
        ):
            if value:
                params[key] = value

        # Search type: translate friendly names, pass unknown values through.
        if self.search_type:
            kind = self.search_type.lower()
            params["tbm"] = self.SEARCH_TYPE_MAP.get(kind, kind)

        if self.safe_search is not None:
            params["safe"] = "active" if self.safe_search else "off"

        # Time filter: translate friendly names, pass unknown values through.
        if self.time_filter:
            window = self.time_filter.lower()
            params["tbs"] = self.TIME_FILTER_MAP.get(window, window)

        if self.no_autocorrect:
            params["nfpr"] = "1"
        if self.filter_duplicates is not None:
            params["filter"] = "1" if self.filter_duplicates else "0"

        if self.device:
            params["device"] = self.device.lower()
        # These two flags are sent as the capitalized strings "True"/"False".
        if self.render_js is not None:
            params["render_js"] = str(bool(self.render_js))
        if self.no_cache is not None:
            params["no_cache"] = str(bool(self.no_cache))

        for key, value in (("ludocid", self.ludocid), ("kgmid", self.kgmid)):
            if value:
                params[key] = value

        params.update(self.extra_params)
        return params
|