thordata-sdk 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,141 @@
1
+ """
2
+ Common types shared across different modules.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ from dataclasses import dataclass
9
+ from enum import Enum
10
+ from typing import Any
11
+
12
+
13
class ThordataBaseConfig:
    """Parent class for config objects that can be rendered as a payload.

    Subclasses are expected to override :meth:`to_payload`; the version
    defined here only signals that no implementation exists.
    """

    def to_payload(self) -> dict[str, Any]:
        """Return the request payload as a dict.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError
18
+
19
+
20
class Device(str, Enum):
    """Device profile names; each member is also a plain ``str``."""

    DESKTOP = "desktop"
    MOBILE = "mobile"
    TABLET = "tablet"
24
+
25
+
26
class OutputFormat(str, Enum):
    """Output format names; each member is also a plain ``str``."""

    HTML = "html"
    PNG = "png"
29
+
30
+
31
@dataclass
class CommonSettings:
    """Shared option bag for video/audio download requests.

    Field names double as the keys emitted by :meth:`to_dict`; per the
    original author's note they mirror the Thordata Video Builder API.
    Every field is optional and is omitted from the output while ``None``.
    """

    resolution: str | None = None
    video_codec: str | None = None  # e.g. vp9, avc
    audio_format: str | None = None  # e.g. opus, mp3
    bitrate: str | None = None
    selected_only: str | bool | None = None
    is_subtitles: str | bool | None = None
    subtitles_language: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the non-``None`` settings as a dict of strings.

        Booleans become the literal strings ``"true"``/``"false"`` (the API
        expects explicit strings); every other value goes through ``str``.
        """

        def _encode(value: Any) -> str:
            if isinstance(value, bool):
                return "true" if value else "false"
            return str(value)

        return {
            name: _encode(value)
            for name, value in self.__dict__.items()
            if value is not None
        }

    def to_json(self) -> str:
        """Serialize the output of :meth:`to_dict` as a JSON string."""
        encoded = self.to_dict()
        return json.dumps(encoded)
59
+
60
+
61
def normalize_enum_value(value: object, enum_class: type) -> str:
    """Return the lowercase string form of an enum member or a plain string.

    Args:
        value: An instance of ``enum_class`` or a ``str``.
        enum_class: The class whose instances are accepted besides ``str``.

    Returns:
        For an ``enum_class`` instance, its ``value`` attribute (or the
        object itself when it has none) stringified and lowercased; for a
        plain string, the lowercased string.

    Raises:
        TypeError: if ``value`` is neither an ``enum_class`` instance nor a
            ``str``.
    """
    # The enum check comes first so str-mixin members resolve via ``.value``.
    if isinstance(value, enum_class):
        raw = getattr(value, "value", value)
        return str(raw).lower()
    if not isinstance(value, str):
        raise TypeError(
            f"Expected {enum_class.__name__} or str, got {type(value).__name__}"
        )
    return value.lower()
72
+
73
+
74
+ # --- Geography Enums ---
75
+
76
+
77
class Continent(str, Enum):
    """Two-letter lowercase continent codes; members are also ``str``."""

    AFRICA = "af"
    ANTARCTICA = "an"
    ASIA = "as"
    EUROPE = "eu"
    NORTH_AMERICA = "na"
    OCEANIA = "oc"
    SOUTH_AMERICA = "sa"
85
+
86
+
87
class Country(str, Enum):
    """Two-letter lowercase country codes; members are also ``str``.

    This is a fixed selection of countries, not an exhaustive list.
    """

    # North America
    US = "us"
    CA = "ca"
    MX = "mx"

    # Europe
    GB = "gb"
    DE = "de"
    FR = "fr"
    ES = "es"
    IT = "it"
    NL = "nl"
    PL = "pl"
    RU = "ru"
    UA = "ua"
    SE = "se"
    NO = "no"
    DK = "dk"
    FI = "fi"
    CH = "ch"
    AT = "at"
    BE = "be"
    PT = "pt"
    IE = "ie"
    CZ = "cz"
    GR = "gr"

    # Asia-Pacific
    CN = "cn"
    JP = "jp"
    KR = "kr"
    IN = "in"
    AU = "au"
    NZ = "nz"
    SG = "sg"
    HK = "hk"
    TW = "tw"
    TH = "th"
    VN = "vn"
    ID = "id"
    MY = "my"
    PH = "ph"
    PK = "pk"
    BD = "bd"

    # South America
    BR = "br"
    AR = "ar"
    CL = "cl"
    CO = "co"
    PE = "pe"
    VE = "ve"

    # Middle East / West Asia
    AE = "ae"
    SA = "sa"
    IL = "il"
    TR = "tr"

    # Africa
    ZA = "za"
    EG = "eg"
    NG = "ng"
    KE = "ke"
    MA = "ma"
@@ -0,0 +1,340 @@
1
+ """
2
+ Proxy related types and configurations.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import re
8
+ import uuid
9
+ from dataclasses import dataclass
10
+ from enum import Enum, IntEnum
11
+ from typing import Any
12
+ from urllib.parse import quote
13
+
14
+ # Import geography from common to avoid circular issues
15
+ from .common import Continent
16
+
17
+
18
class ProxyProduct(str, Enum):
    """Proxy product lines; each member is also a plain ``str``."""

    RESIDENTIAL = "residential"
    MOBILE = "mobile"
    DATACENTER = "datacenter"
    ISP = "isp"

    @property
    def default_port(self) -> int:
        """Return the port used when no explicit port is configured.

        Any value without a dedicated entry falls back to 9999.
        """
        product = self.value
        if product == "mobile":
            return 5555
        if product == "datacenter":
            return 7777
        if product == "isp":
            return 6666
        # "residential" and any unrecognised value share the same port.
        return 9999
33
+
34
+
35
class ProxyType(IntEnum):
    """Integer identifiers for proxy types; members compare equal to ``int``."""

    RESIDENTIAL = 1
    UNLIMITED = 2
    DATACENTER = 3
    ISP = 4
    MOBILE = 5
41
+
42
+
43
class SessionType(str, Enum):
    """Proxy session modes; each member is also a plain ``str``."""

    ROTATING = "rotating"
    STICKY = "sticky"
46
+
47
+
48
class ProxyHost(str, Enum):
    """Named proxy gateway hostnames; each member is also a plain ``str``."""

    DEFAULT = "pr.thordata.net"
    NORTH_AMERICA = "t.na.thordata.net"
    EUROPE = "t.eu.thordata.net"
52
+
53
+
54
class ProxyPort(IntEnum):
    """Port numbers keyed by product name; members compare equal to ``int``."""

    RESIDENTIAL = 9999
    MOBILE = 5555
    DATACENTER = 7777
    ISP = 6666
59
+
60
+
61
@dataclass
class ProxyConfig:
    """Credentials, endpoint and targeting options for a gateway proxy.

    On construction ``product`` is normalised to a :class:`ProxyProduct`,
    a missing ``host``/``port`` is filled in from the product, and the
    resulting configuration is validated.

    Raises:
        ValueError: from ``__post_init__`` when ``product`` is not a known
            product name or when any check in :meth:`_validate` fails.
    """

    username: str
    password: str
    product: ProxyProduct | str = ProxyProduct.RESIDENTIAL
    host: str | None = None
    port: int | None = None
    protocol: str = "https"

    # Geo-targeting
    continent: str | None = None
    country: str | None = None
    state: str | None = None
    city: str | None = None
    asn: str | None = None

    # Session control
    session_id: str | None = None
    session_duration: int | None = None  # minutes, 1-90

    # Un-annotated on purpose: a plain class attribute, not a dataclass field.
    _VALID_CONTINENTS = {v.value for v in Continent}

    def __post_init__(self) -> None:
        """Normalise ``product``, default ``host``/``port``, then validate."""
        if isinstance(self.product, str):
            self.product = ProxyProduct(self.product.lower())

        if self.host is None:
            host_map = {
                ProxyProduct.RESIDENTIAL: "pr.thordata.net",
                ProxyProduct.DATACENTER: "dc.pr.thordata.net",
                ProxyProduct.MOBILE: "m.pr.thordata.net",
                ProxyProduct.ISP: "isp.pr.thordata.net",
            }
            self.host = host_map.get(self.product, "pr.thordata.net")

        if self.port is None:
            self.port = self.product.default_port

        self._validate()

    def _validate(self) -> None:
        """Check protocol, session and geo-targeting fields.

        Raises:
            ValueError: on an unsupported protocol, a ``session_duration``
                outside 1-90 or given without ``session_id``, an ``asn``
                given without ``country``, an unknown continent code, or a
                country that is not exactly two ASCII letters.
        """
        if self.protocol not in ("http", "https", "socks5", "socks5h"):
            raise ValueError(f"Invalid protocol: {self.protocol}")

        if self.session_duration is not None:
            if not 1 <= self.session_duration <= 90:
                raise ValueError("session_duration must be between 1 and 90 minutes")
            if not self.session_id:
                raise ValueError("session_duration requires session_id")

        if self.asn and not self.country:
            raise ValueError("ASN targeting requires country")

        if self.continent and self.continent.lower() not in self._VALID_CONTINENTS:
            raise ValueError(f"Invalid continent code: {self.continent}")

        # fullmatch, not match with "$": "$" also matches just before a
        # trailing newline, which would let "us\n" through into the username.
        if self.country and not re.fullmatch(r"[a-zA-Z]{2}", self.country):
            raise ValueError("Invalid country code")

    def _url_scheme(self) -> str:
        """Return the URL scheme, rewriting ``socks5`` to ``socks5h``."""
        # NOTE(review): presumably so hostnames are resolved on the proxy side.
        return "socks5h" if self.protocol == "socks5" else self.protocol

    def build_username(self) -> str:
        """Return the gateway username with targeting options appended.

        The base username gets a ``td-customer-`` prefix unless it already
        has one; each configured option is appended as ``-<key>-<value>``
        in the order continent, country, state, city, asn, sessid, sesstime.
        """
        base = self.username
        if not base.startswith("td-customer-"):
            base = f"td-customer-{base}"

        parts = [base]

        if self.continent:
            parts.append(f"continent-{self.continent.lower()}")
        if self.country:
            parts.append(f"country-{self.country.lower()}")
        if self.state:
            parts.append(f"state-{self.state.lower()}")
        if self.city:
            parts.append(f"city-{self.city.lower()}")
        if self.asn:
            # Upper-case the value and ensure it carries exactly one "AS" prefix.
            asn_val = self.asn.upper()
            if not asn_val.startswith("AS"):
                asn_val = f"AS{asn_val}"
            parts.append(f"asn-{asn_val}")
        if self.session_id:
            parts.append(f"sessid-{self.session_id}")
        if self.session_duration:
            parts.append(f"sesstime-{self.session_duration}")

        return "-".join(parts)

    def build_proxy_url(self) -> str:
        """Return ``scheme://user:pass@host:port`` with credentials percent-encoded."""
        safe_user = quote(self.build_username(), safe="")
        safe_pass = quote(self.password, safe="")
        return f"{self._url_scheme()}://{safe_user}:{safe_pass}@{self.host}:{self.port}"

    def build_proxy_endpoint(self) -> str:
        """Return ``scheme://host:port`` without credentials."""
        return f"{self._url_scheme()}://{self.host}:{self.port}"

    def build_proxy_basic_auth(self) -> str:
        """Return ``<built username>:<password>`` (not encoded)."""
        return f"{self.build_username()}:{self.password}"

    def to_proxies_dict(self) -> dict[str, str]:
        """Return the proxy URL under both the ``http`` and ``https`` keys."""
        url = self.build_proxy_url()
        return {"http": url, "https": url}

    def to_aiohttp_config(self) -> tuple:
        """Return ``(proxy_url, aiohttp.BasicAuth)`` for use with aiohttp.

        The URL uses ``protocol`` as configured (no ``socks5h`` rewrite).

        Raises:
            ImportError: if ``aiohttp`` is not installed.
        """
        # Only the import is guarded, so an error raised later is not relabelled.
        try:
            import aiohttp
        except ImportError as e:
            raise ImportError("aiohttp required") from e

        return (
            f"{self.protocol}://{self.host}:{self.port}",
            aiohttp.BasicAuth(login=self.build_username(), password=self.password),
        )
181
+
182
+
183
@dataclass
class StaticISPProxy:
    """Connection details for a static ISP proxy with a fixed host.

    The username is used verbatim: no ``td-customer-`` prefix and no
    targeting parameters are added.

    Raises:
        ValueError: on construction, if ``protocol`` is not one of
            ``http``, ``https``, ``socks5`` or ``socks5h``.
    """

    host: str
    username: str
    password: str
    port: int = 6666
    protocol: str = "https"

    def __post_init__(self) -> None:
        """Reject unsupported protocols."""
        supported = ("http", "https", "socks5", "socks5h")
        if self.protocol not in supported:
            raise ValueError(f"Invalid protocol: {self.protocol}")

    def build_username(self) -> str:
        """Return the raw username unchanged."""
        return self.username

    def build_proxy_endpoint(self) -> str:
        """Return ``scheme://host:port``; ``socks5`` is rewritten to ``socks5h``."""
        scheme = self.protocol
        if scheme == "socks5":
            scheme = "socks5h"
        return f"{scheme}://{self.host}:{self.port}"

    def build_proxy_basic_auth(self) -> str:
        """Return ``username:password`` (not encoded)."""
        return f"{self.username}:{self.password}"

    def build_proxy_url(self) -> str:
        """Return ``scheme://user:pass@host:port`` with credentials percent-encoded."""
        scheme = self.protocol
        if scheme == "socks5":
            scheme = "socks5h"
        user_part = quote(self.username, safe="")
        pass_part = quote(self.password, safe="")
        return f"{scheme}://{user_part}:{pass_part}@{self.host}:{self.port}"

    def to_proxies_dict(self) -> dict[str, str]:
        """Return the proxy URL under both the ``http`` and ``https`` keys."""
        proxy_url = self.build_proxy_url()
        return dict(http=proxy_url, https=proxy_url)

    def to_aiohttp_config(self) -> tuple:
        """Return ``(proxy_url, aiohttp.BasicAuth)`` for use with aiohttp.

        Raises:
            ImportError: if ``aiohttp`` cannot be imported.
        """
        try:
            import aiohttp

            endpoint = f"{self.protocol}://{self.host}:{self.port}"
            auth = aiohttp.BasicAuth(login=self.username, password=self.password)
            return (endpoint, auth)
        except ImportError as e:
            # Chain the original error so the cause stays visible.
            raise ImportError("aiohttp required") from e

    @classmethod
    def from_env(cls) -> StaticISPProxy:
        """Build an instance from ``THORDATA_ISP_*`` environment variables.

        Reads ``THORDATA_ISP_HOST``, ``THORDATA_ISP_USERNAME`` and
        ``THORDATA_ISP_PASSWORD``; port and protocol keep their defaults.

        Raises:
            ValueError: if any of the three variables is unset or empty.
        """
        import os

        host = os.getenv("THORDATA_ISP_HOST")
        username = os.getenv("THORDATA_ISP_USERNAME")
        password = os.getenv("THORDATA_ISP_PASSWORD")
        if not (host and username and password):
            raise ValueError(
                "THORDATA_ISP_HOST, THORDATA_ISP_USERNAME, and THORDATA_ISP_PASSWORD are required"
            )
        return cls(host=host, username=username, password=password)
243
+
244
+
245
@dataclass
class StickySession(ProxyConfig):
    """A :class:`ProxyConfig` preset for sticky sessions.

    ``session_duration`` is always overwritten with ``duration_minutes``.
    When ``auto_session_id`` is true and no ``session_id`` was supplied, a
    12-character hex id is generated before the base-class validation runs.
    """

    duration_minutes: int = 10
    auto_session_id: bool = True

    def __post_init__(self) -> None:
        """Fill in session fields, then run the base initialisation."""
        needs_generated_id = self.auto_session_id and not self.session_id
        if needs_generated_id:
            random_hex = uuid.uuid4().hex
            self.session_id = random_hex[:12]
        self.session_duration = self.duration_minutes
        super().__post_init__()
255
+
256
+
257
@dataclass
class ProxyUser:
    """One proxy sub-user record as returned by the API.

    NOTE(review): the conversions in ``usage_gb``/``limit_gb`` imply
    ``usage_traffic`` is in KB and ``traffic_limit`` in MB — confirm against
    the API documentation.
    """

    username: str
    password: str
    status: bool
    traffic_limit: int
    usage_traffic: float

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ProxyUser:
        """Build a user from a raw API dict.

        Missing keys and explicit ``None`` values are treated alike: text
        fields default to ``""`` and numeric fields to ``0``. ``status`` is
        true only when its string form is ``"true"`` (any case) or ``"1"``.

        Raises:
            ValueError: if a numeric field holds a non-numeric value.
        """

        def _value(key: str, default: Any) -> Any:
            # JSON null arrives as None; int(None)/float(None) would raise
            # TypeError and str(None) would yield the literal "None".
            found = data.get(key)
            return default if found is None else found

        return cls(
            username=str(_value("username", "")),
            password=str(_value("password", "")),
            status=str(data.get("status")).lower() in ("true", "1"),
            traffic_limit=int(_value("traffic_limit", 0)),
            usage_traffic=float(_value("usage_traffic", 0)),
        )

    def usage_gb(self) -> float:
        """Return ``usage_traffic`` divided by 1024 * 1024."""
        return self.usage_traffic / (1024 * 1024)

    def limit_gb(self) -> float:
        """Return ``traffic_limit`` divided by 1024 (``0.0`` for a zero limit)."""
        return self.traffic_limit / 1024.0
282
+
283
+
284
@dataclass
class ProxyUserList:
    """Account-level limits together with the list of proxy sub-users."""

    limit: float
    remaining_limit: float
    user_count: int
    users: list[ProxyUser]

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ProxyUserList:
        """Build the list from a raw API dict.

        Users are read from ``data["list"]``, falling back to
        ``data["data"]``; anything that is not a list counts as empty.
        Missing or ``None`` numeric fields default to ``0``, and
        ``user_count`` defaults to the number of parsed users.

        Raises:
            ValueError: if a numeric field holds a non-numeric value.
        """
        raw_users = data.get("list")
        if raw_users is None:
            raw_users = data.get("data")
        if not isinstance(raw_users, list):
            raw_users = []

        users = [ProxyUser.from_dict(entry) for entry in raw_users]

        def _value(key: str, default: Any) -> Any:
            # JSON null arrives as None; float(None)/int(None) raise TypeError.
            found = data.get(key)
            return default if found is None else found

        return cls(
            limit=float(_value("limit", 0)),
            remaining_limit=float(_value("remaining_limit", 0)),
            user_count=int(_value("user_count", len(users))),
            users=users,
        )
308
+
309
+
310
@dataclass
class ProxyServer:
    """A single proxy server record (address, credentials, expiry)."""

    ip: str
    port: int
    username: str
    password: str
    expiration_time: int | str | None = None
    region: str | None = None

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ProxyServer:
        """Build a server from a raw API dict.

        Accepts ``user``/``pwd`` as fallbacks for ``username``/``password``
        and ``expireTime`` for ``expiration_time``. Missing or ``None``
        values default to ``""`` (text), ``0`` (port) and ``None``
        (expiry and region).

        Raises:
            ValueError: if ``port`` holds a non-numeric value.
        """

        def _first(*keys: str) -> Any:
            # First value that is present and not None, else None.
            for key in keys:
                found = data.get(key)
                if found is not None:
                    return found
            return None

        def _text(*keys: str) -> str:
            found = _first(*keys)
            return "" if found is None else str(found)

        region = data.get("region")
        return cls(
            ip=_text("ip"),
            # `or 0` covers both a missing key and an explicit null.
            port=int(data.get("port") or 0),
            username=_text("username", "user"),
            password=_text("password", "pwd"),
            expiration_time=_first("expiration_time", "expireTime"),
            region=str(region) if region else None,
        )

    def to_proxy_url(self, protocol: str = "https") -> str:
        """Return ``protocol://user:pass@ip:port``.

        Credentials are percent-encoded so characters such as ``@``, ``:``
        or ``/`` cannot break the URL structure.
        """
        safe_user = quote(self.username, safe="")
        safe_pass = quote(self.password, safe="")
        return f"{protocol}://{safe_user}:{safe_pass}@{self.ip}:{self.port}"

    def is_expired(self) -> bool:
        """Return whether ``expiration_time`` lies in the past.

        Numeric values, including numeric strings, are compared against
        ``time.time()``. ``None`` and non-numeric strings are reported as
        not expired because they cannot be compared.
        """
        import time

        expiry: Any = self.expiration_time
        if expiry is None:
            return False
        if isinstance(expiry, str):
            try:
                expiry = float(expiry.strip())
            except ValueError:
                # Not a number (e.g. a formatted date): cannot be compared.
                return False
        if isinstance(expiry, (int, float)):
            return time.time() > expiry
        return False
thordata/types/serp.py ADDED
@@ -0,0 +1,224 @@
1
+ """
2
+ SERP (Search Engine Results Page) related types and configurations.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass, field
8
+ from enum import Enum
9
+ from typing import Any
10
+
11
+ from .common import ThordataBaseConfig
12
+
13
+
14
class Engine(str, Enum):
    """Search engine identifiers; each member is also a plain ``str``."""

    # --- Google ---
    GOOGLE = "google"
    GOOGLE_NEWS = "google_news"
    GOOGLE_SHOPPING = "google_shopping"
    GOOGLE_VIDEOS = "google_videos"
    GOOGLE_IMAGES = "google_images"
    GOOGLE_MAPS = "google_maps"
    GOOGLE_JOBS = "google_jobs"
    GOOGLE_PLAY = "google_play"
    GOOGLE_TRENDS = "google_trends"
    GOOGLE_SCHOLAR = "google_scholar"
    GOOGLE_PATENTS = "google_patents"
    GOOGLE_FINANCE = "google_finance"
    GOOGLE_FLIGHTS = "google_flights"
    GOOGLE_LENS = "google_lens"
    GOOGLE_HOTELS = "google_hotels"

    # --- Bing ---
    BING = "bing"
    BING_NEWS = "bing_news"
    BING_SHOPPING = "bing_shopping"
    BING_IMAGES = "bing_images"
    BING_VIDEOS = "bing_videos"
    BING_MAPS = "bing_maps"

    # --- Other engines ---
    YANDEX = "yandex"
    DUCKDUCKGO = "duckduckgo"
    BAIDU = "baidu"

    # --- Legacy / compatibility aliases ---
    GOOGLE_SEARCH = "google_search"
    GOOGLE_WEB = "google_web"
    GOOGLE_LOCAL = "google_local"
    # Usually mapped internally to shopping with a product_id.
    GOOGLE_PRODUCT = "google_product"
52
+
53
+
54
class GoogleSearchType(str, Enum):
    """Google search verticals; each member is also a plain ``str``.

    The list is not exhaustive and can be extended as needed.
    """

    SEARCH = "search"
    NEWS = "news"
    SHOPPING = "shopping"
    IMAGES = "images"
    VIDEOS = "videos"
    MAPS = "maps"
62
+
63
+
64
class BingSearchType(str, Enum):
    """Bing search verticals; each member is also a plain ``str``."""

    SEARCH = "search"
    NEWS = "news"
    SHOPPING = "shopping"
    IMAGES = "images"
    VIDEOS = "videos"
    MAPS = "maps"
71
+
72
+
73
class GoogleTbm(str, Enum):
    """Short codes for Google search types; members are also ``str``."""

    NEWS = "nws"
    SHOPPING = "shop"
    IMAGES = "isch"
    VIDEOS = "vid"
78
+
79
+
80
class TimeRange(str, Enum):
    """Named time windows; each member is also a plain ``str``."""

    HOUR = "hour"
    DAY = "day"
    WEEK = "week"
    MONTH = "month"
    YEAR = "year"
86
+
87
+
88
@dataclass
class SerpRequest(ThordataBaseConfig):
    """Parameters of one SERP query, convertible to an API payload.

    The quoted names in the trailing comments are the payload keys that
    :meth:`to_payload` emits for the corresponding field.
    """

    query: str
    engine: str = "google"
    num: int = 10
    start: int = 0

    # Localization
    country: str | None = None  # 'gl'
    language: str | None = None  # 'hl'
    google_domain: str | None = None
    countries_filter: str | None = None  # 'cr'
    languages_filter: str | None = None  # 'lr'

    # Geo-targeting
    location: str | None = None
    uule: str | None = None

    # Search type
    search_type: str | None = None  # 'tbm'

    # Filters
    safe_search: bool | None = None
    time_filter: str | None = None  # 'tbs'
    no_autocorrect: bool = False  # 'nfpr'
    filter_duplicates: bool | None = None  # 'filter'

    # Device & Rendering
    device: str | None = None
    render_js: bool | None = None
    no_cache: bool | None = None

    # Output
    output_format: str = "json"

    # Advanced Google
    ludocid: str | None = None
    kgmid: str | None = None

    # Pass-through for any other param
    extra_params: dict[str, Any] = field(default_factory=dict)

    # Friendly name -> 'tbm' code; codes map to themselves so both are accepted.
    SEARCH_TYPE_MAP = {
        "images": "isch",
        "shopping": "shop",
        "news": "nws",
        "videos": "vid",
        "isch": "isch",
        "shop": "shop",
        "nws": "nws",
        "vid": "vid",
    }

    # Friendly name -> 'tbs' value.
    TIME_FILTER_MAP = {
        "hour": "qdr:h",
        "day": "qdr:d",
        "week": "qdr:w",
        "month": "qdr:m",
        "year": "qdr:y",
    }

    def to_payload(self) -> dict[str, Any]:
        """Return the request as a flat dict of API parameters.

        Unset optional fields are left out. ``search_type`` and
        ``time_filter`` are translated through the class-level maps, with
        unknown values passed through lowercased. ``extra_params`` is merged
        last, so it can override any key produced here.
        """
        engine_name = self.engine.lower()
        payload: dict[str, Any] = {"engine": engine_name, "num": str(self.num)}

        # Output format: "json" -> 1, combined forms -> 2, anything else
        # (including "html") sends no json flag at all.
        output = self.output_format.lower()
        if output == "json":
            payload["json"] = "1"
        elif output in ("2", "both", "json+html"):
            payload["json"] = "2"

        # Yandex takes the query under "text"; every other engine uses "q".
        query_key = "text" if engine_name == "yandex" else "q"
        payload[query_key] = self.query

        if self.google_domain:
            payload["google_domain"] = self.google_domain
        if self.start > 0:
            payload["start"] = str(self.start)
        if self.country:
            payload["gl"] = self.country.lower()
        if self.language:
            payload["hl"] = self.language.lower()

        # Plain string fields copied verbatim when set.
        verbatim_fields = (
            ("cr", self.countries_filter),
            ("lr", self.languages_filter),
            ("location", self.location),
            ("uule", self.uule),
        )
        for key, text in verbatim_fields:
            if text:
                payload[key] = text

        if self.search_type:
            type_key = self.search_type.lower()
            payload["tbm"] = self.SEARCH_TYPE_MAP.get(type_key, type_key)

        if self.safe_search is not None:
            payload["safe"] = "active" if self.safe_search else "off"

        if self.time_filter:
            range_key = self.time_filter.lower()
            payload["tbs"] = self.TIME_FILTER_MAP.get(range_key, range_key)

        if self.no_autocorrect:
            payload["nfpr"] = "1"
        if self.filter_duplicates is not None:
            payload["filter"] = "1" if self.filter_duplicates else "0"

        if self.device:
            payload["device"] = self.device.lower()

        # Tri-state flags: sent as "True"/"False" only when explicitly set.
        for key, flag in (("render_js", self.render_js), ("no_cache", self.no_cache)):
            if flag is not None:
                payload[key] = "True" if flag else "False"

        for key, text in (("ludocid", self.ludocid), ("kgmid", self.kgmid)):
            if text:
                payload[key] = text

        payload.update(self.extra_params)
        return payload