thordata-sdk 0.7.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
thordata/enums.py CHANGED
@@ -34,12 +34,13 @@ class Continent(str, Enum):
34
34
  class ProxyHost(str, Enum):
35
35
  """
36
36
  Available proxy gateway hosts.
37
+
38
+ Note: Dashboard provides user-specific hosts like {shard}.{region}.thordata.net
37
39
  """
38
40
 
39
41
  DEFAULT = "pr.thordata.net"
40
42
  NORTH_AMERICA = "t.na.thordata.net"
41
43
  EUROPE = "t.eu.thordata.net"
42
- GATE = "gate.thordata.com"
43
44
 
44
45
 
45
46
  class ProxyPort(IntEnum):
@@ -47,11 +48,10 @@ class ProxyPort(IntEnum):
47
48
  Available proxy gateway ports.
48
49
  """
49
50
 
50
- DEFAULT = 9999
51
+ RESIDENTIAL = 9999
51
52
  MOBILE = 5555
52
53
  DATACENTER = 7777
53
54
  ISP = 6666
54
- ALTERNATIVE = 22225
55
55
 
56
56
 
57
57
  # =============================================================================
thordata/exceptions.py CHANGED
@@ -222,7 +222,8 @@ class ThordataNotCollectedError(ThordataAPIError):
222
222
  This error is often transient and typically safe to retry.
223
223
  """
224
224
 
225
- HTTP_STATUS_CODES = {300}
225
+ API_CODES = {300}
226
+ HTTP_STATUS_CODES: Set[int] = set()
226
227
 
227
228
  @property
228
229
  def is_retryable(self) -> bool:
@@ -262,8 +263,17 @@ def raise_for_code(
262
263
  ThordataValidationError: For 400/422 codes.
263
264
  ThordataAPIError: For all other error codes.
264
265
  """
265
- # Use the code from payload if status_code not available
266
- effective_code = status_code or code
266
+ # Determine the effective error code.
267
+ # Prefer payload `code` when present and not success (200),
268
+ # otherwise fall back to HTTP status when it indicates an error.
269
+ effective_code: Optional[int] = None
270
+
271
+ if code is not None and code != 200:
272
+ effective_code = code
273
+ elif status_code is not None and status_code != 200:
274
+ effective_code = status_code
275
+ else:
276
+ effective_code = code if code is not None else status_code
267
277
 
268
278
  kwargs = {
269
279
  "status_code": status_code,
@@ -272,8 +282,9 @@ def raise_for_code(
272
282
  "request_id": request_id,
273
283
  }
274
284
 
275
- # Not collected (often retryable, not billed)
276
- if effective_code in ThordataNotCollectedError.HTTP_STATUS_CODES:
285
+ # Not collected (API payload code 300, often retryable, not billed)
286
+ # Check this FIRST since 300 is in API_CODES, not HTTP_STATUS_CODES
287
+ if effective_code in ThordataNotCollectedError.API_CODES:
277
288
  raise ThordataNotCollectedError(message, **kwargs)
278
289
 
279
290
  # Auth errors
thordata/models.py CHANGED
@@ -26,11 +26,14 @@ from __future__ import annotations
26
26
 
27
27
  import json
28
28
  import re
29
+ import ssl
29
30
  import uuid
30
31
  from dataclasses import dataclass, field
31
32
  from enum import Enum
32
33
  from typing import Any, Dict, List, Optional, Union
33
34
 
35
+ import urllib3
36
+
34
37
  # =============================================================================
35
38
  # Proxy Product Types
36
39
  # =============================================================================
@@ -137,6 +140,7 @@ class ProxyConfig:
137
140
  if self.host is None:
138
141
  # Set host based on product type
139
142
  host_map = {
143
+ # User&Pass auth entry (docs examples use t.pr.thordata.net for authenticated proxy)
140
144
  ProxyProduct.RESIDENTIAL: "t.pr.thordata.net",
141
145
  ProxyProduct.DATACENTER: "dc.pr.thordata.net",
142
146
  ProxyProduct.MOBILE: "m.pr.thordata.net",
@@ -233,6 +237,14 @@ class ProxyConfig:
233
237
  username = self.build_username()
234
238
  return f"{self.protocol}://{username}:{self.password}@{self.host}:{self.port}"
235
239
 
240
+ def build_proxy_endpoint(self) -> str:
241
+ """Proxy endpoint without credentials, for HTTPS proxy managers."""
242
+ return f"{self.protocol}://{self.host}:{self.port}"
243
+
244
+ def build_proxy_basic_auth(self) -> str:
245
+ """Basic auth string 'username:password' for Proxy-Authorization."""
246
+ return f"{self.build_username()}:{self.password}"
247
+
236
248
  def to_proxies_dict(self) -> Dict[str, str]:
237
249
  """
238
250
  Build a proxies dict suitable for the requests library.
@@ -264,6 +276,39 @@ class ProxyConfig:
264
276
  ) from e
265
277
 
266
278
 
279
+ @dataclass
280
+ class WhitelistProxyConfig:
281
+ """
282
+ Proxy config for IP-whitelist authentication mode (no username/password).
283
+
284
+ In whitelist mode, you do NOT pass proxy auth.
285
+ You only connect to the proxy entry node (host:port).
286
+
287
+ Examples (from docs):
288
+ - Global random: pr.thordata.net:9999
289
+ - Country nodes: us-pr.thordata.net:10000, etc.
290
+ """
291
+
292
+ host: str = "pr.thordata.net"
293
+ port: int = 9999
294
+ protocol: str = "http" # use http for proxy scheme; target URL can still be https
295
+
296
+ def __post_init__(self) -> None:
297
+ if self.protocol not in ("http", "https"):
298
+ raise ValueError("protocol must be 'http' or 'https'")
299
+
300
+ def build_proxy_url(self) -> str:
301
+ return f"{self.protocol}://{self.host}:{self.port}"
302
+
303
+ def to_proxies_dict(self) -> Dict[str, str]:
304
+ url = self.build_proxy_url()
305
+ return {"http": url, "https": url}
306
+
307
+ def to_aiohttp_config(self) -> tuple:
308
+ # aiohttp: proxy_auth should be None in whitelist mode
309
+ return self.build_proxy_url(), None
310
+
311
+
267
312
  @dataclass
268
313
  class StaticISPProxy:
269
314
  """
@@ -545,23 +590,28 @@ class SerpRequest:
545
590
  payload: Dict[str, Any] = {
546
591
  "engine": engine,
547
592
  "num": str(self.num),
548
- # output_format: json=1 for JSON, json=0 for raw HTML
549
- "json": "1" if self.output_format.lower() == "json" else "0",
550
593
  }
551
594
 
595
+ fmt = self.output_format.lower()
596
+ if fmt == "json":
597
+ payload["json"] = "1"
598
+ elif fmt == "html":
599
+ # omit "json" to get raw HTML (per docs: no json -> HTML)
600
+ pass
601
+ else:
602
+ # keep backward compatibility: if user passes "2"/"both"/etc.
603
+ if fmt in ("2", "both", "json+html", "json_html"):
604
+ payload["json"] = "2"
605
+
552
606
  # Handle query parameter (Yandex uses 'text', others use 'q')
553
607
  if engine == "yandex":
554
608
  payload["text"] = self.query
555
609
  else:
556
610
  payload["q"] = self.query
557
611
 
558
- # Set URL / domain based on google_domain or engine default
612
+ # Domain overrides (preferred by docs)
559
613
  if self.google_domain:
560
- # 显式设置 google_domain 参数,同时设置 url
561
614
  payload["google_domain"] = self.google_domain
562
- payload["url"] = self.google_domain
563
- elif engine in self.ENGINE_URLS:
564
- payload["url"] = self.ENGINE_URLS[engine]
565
615
 
566
616
  # Pagination
567
617
  if self.start > 0:
@@ -795,6 +845,126 @@ class ScraperTaskConfig:
795
845
  return payload
796
846
 
797
847
 
848
+ @dataclass
849
+ class CommonSettings:
850
+ """
851
+ Common settings for YouTube video/audio downloads.
852
+
853
+ Used by /video_builder endpoint as `common_settings` parameter.
854
+ Also known as `spider_universal` in some documentation.
855
+
856
+ Args:
857
+ resolution: Video resolution (360p/480p/720p/1080p/1440p/2160p).
858
+ audio_format: Audio format (opus/mp3).
859
+ bitrate: Audio bitrate (48/64/128/160/256/320 or with Kbps suffix).
860
+ is_subtitles: Whether to download subtitles ("true"/"false").
861
+ subtitles_language: Subtitle language code (e.g., "en", "zh-Hans").
862
+
863
+ Example for video:
864
+ >>> settings = CommonSettings(
865
+ ... resolution="1080p",
866
+ ... is_subtitles="true",
867
+ ... subtitles_language="en"
868
+ ... )
869
+
870
+ Example for audio:
871
+ >>> settings = CommonSettings(
872
+ ... audio_format="mp3",
873
+ ... bitrate="320",
874
+ ... is_subtitles="true",
875
+ ... subtitles_language="en"
876
+ ... )
877
+ """
878
+
879
+ # Video settings
880
+ resolution: Optional[str] = None
881
+
882
+ # Audio settings
883
+ audio_format: Optional[str] = None
884
+ bitrate: Optional[str] = None
885
+
886
+ # Subtitle settings (used by both video and audio)
887
+ is_subtitles: Optional[str] = None
888
+ subtitles_language: Optional[str] = None
889
+
890
+ # Valid values for validation
891
+ VALID_RESOLUTIONS = {"360p", "480p", "720p", "1080p", "1440p", "2160p"}
892
+ VALID_AUDIO_FORMATS = {"opus", "mp3"}
893
+
894
+ def to_dict(self) -> Dict[str, Any]:
895
+ """Convert to dictionary, excluding None values."""
896
+ result = {}
897
+ if self.resolution is not None:
898
+ result["resolution"] = self.resolution
899
+ if self.audio_format is not None:
900
+ result["audio_format"] = self.audio_format
901
+ if self.bitrate is not None:
902
+ result["bitrate"] = self.bitrate
903
+ if self.is_subtitles is not None:
904
+ result["is_subtitles"] = self.is_subtitles
905
+ if self.subtitles_language is not None:
906
+ result["subtitles_language"] = self.subtitles_language
907
+ return result
908
+
909
+ def to_json(self) -> str:
910
+ """Convert to JSON string for form submission."""
911
+ return json.dumps(self.to_dict())
912
+
913
+
914
+ @dataclass
915
+ class VideoTaskConfig:
916
+ """
917
+ Configuration for creating a YouTube video/audio download task.
918
+
919
+ Uses the /video_builder endpoint.
920
+
921
+ Args:
922
+ file_name: Name for the output file. Supports {{TasksID}}, {{VideoID}}.
923
+ spider_id: Spider identifier (e.g., "youtube_video_by-url", "youtube_audio_by-url").
924
+ spider_name: Spider name (typically "youtube.com").
925
+ parameters: Spider-specific parameters (e.g., video URL).
926
+ common_settings: Video/audio settings (resolution, format, subtitles).
927
+ include_errors: Include error details in output.
928
+
929
+ Example:
930
+ >>> config = VideoTaskConfig(
931
+ ... file_name="{{VideoID}}",
932
+ ... spider_id="youtube_video_by-url",
933
+ ... spider_name="youtube.com",
934
+ ... parameters={"url": "https://www.youtube.com/watch?v=xxx"},
935
+ ... common_settings=CommonSettings(
936
+ ... resolution="1080p",
937
+ ... is_subtitles="true",
938
+ ... subtitles_language="en"
939
+ ... )
940
+ ... )
941
+ """
942
+
943
+ file_name: str
944
+ spider_id: str
945
+ spider_name: str
946
+ parameters: Dict[str, Any]
947
+ common_settings: CommonSettings
948
+ include_errors: bool = True
949
+
950
+ def to_payload(self) -> Dict[str, Any]:
951
+ """
952
+ Convert to API request payload.
953
+
954
+ Returns:
955
+ Dictionary ready to be sent to the video_builder API.
956
+ """
957
+ payload: Dict[str, Any] = {
958
+ "file_name": self.file_name,
959
+ "spider_id": self.spider_id,
960
+ "spider_name": self.spider_name,
961
+ "spider_parameters": json.dumps([self.parameters]),
962
+ "spider_errors": "true" if self.include_errors else "false",
963
+ "common_settings": self.common_settings.to_json(),
964
+ }
965
+ return payload
966
+
967
+
798
968
  # =============================================================================
799
969
  # Response Models
800
970
  # =============================================================================
@@ -838,3 +1008,177 @@ class TaskStatusResponse:
838
1008
  """Check if the task failed."""
839
1009
  failure_statuses = {"failed", "error"}
840
1010
  return self.status.lower() in failure_statuses
1011
+
1012
+
1013
+ @dataclass
1014
+ class UsageStatistics:
1015
+ """
1016
+ Response model for account usage statistics.
1017
+
1018
+ Attributes:
1019
+ total_usage_traffic: Total traffic used (KB).
1020
+ traffic_balance: Remaining traffic balance (KB).
1021
+ query_days: Number of days in the query range.
1022
+ range_usage_traffic: Traffic used in the specified date range (KB).
1023
+ data: Daily usage breakdown.
1024
+ """
1025
+
1026
+ total_usage_traffic: float
1027
+ traffic_balance: float
1028
+ query_days: int
1029
+ range_usage_traffic: float
1030
+ data: List[Dict[str, Any]]
1031
+
1032
+ @classmethod
1033
+ def from_dict(cls, data: Dict[str, Any]) -> "UsageStatistics":
1034
+ """Create from API response dict."""
1035
+ return cls(
1036
+ total_usage_traffic=float(data.get("total_usage_traffic", 0)),
1037
+ traffic_balance=float(data.get("traffic_balance", 0)),
1038
+ query_days=int(data.get("query_days", 0)),
1039
+ range_usage_traffic=float(data.get("range_usage_traffic", 0)),
1040
+ data=data.get("data", []),
1041
+ )
1042
+
1043
+ def total_usage_gb(self) -> float:
1044
+ """Get total usage in GB."""
1045
+ return self.total_usage_traffic / (1024 * 1024)
1046
+
1047
+ def balance_gb(self) -> float:
1048
+ """Get balance in GB."""
1049
+ return self.traffic_balance / (1024 * 1024)
1050
+
1051
+ def range_usage_gb(self) -> float:
1052
+ """Get range usage in GB."""
1053
+ return self.range_usage_traffic / (1024 * 1024)
1054
+
1055
+
1056
+ @dataclass
1057
+ class ProxyUser:
1058
+ """
1059
+ Proxy user (sub-account) information.
1060
+
1061
+ Attributes:
1062
+ username: User's username.
1063
+ password: User's password.
1064
+ status: User status (True=enabled, False=disabled).
1065
+ traffic_limit: Traffic limit in MB (0 = unlimited).
1066
+ usage_traffic: Traffic used in KB.
1067
+ """
1068
+
1069
+ username: str
1070
+ password: str
1071
+ status: bool
1072
+ traffic_limit: int
1073
+ usage_traffic: float
1074
+
1075
+ @classmethod
1076
+ def from_dict(cls, data: Dict[str, Any]) -> "ProxyUser":
1077
+ """Create from API response dict."""
1078
+ return cls(
1079
+ username=data.get("username", ""),
1080
+ password=data.get("password", ""),
1081
+ status=data.get("status") in (True, "true", 1),
1082
+ traffic_limit=int(data.get("traffic_limit", 0)),
1083
+ usage_traffic=float(data.get("usage_traffic", 0)),
1084
+ )
1085
+
1086
+ def usage_gb(self) -> float:
1087
+ """Get usage in GB."""
1088
+ return self.usage_traffic / (1024 * 1024)
1089
+
1090
+ def limit_gb(self) -> float:
1091
+ """Get limit in GB (0 means unlimited)."""
1092
+ if self.traffic_limit == 0:
1093
+ return 0
1094
+ return self.traffic_limit / 1024
1095
+
1096
+
1097
+ @dataclass
1098
+ class ProxyUserList:
1099
+ """
1100
+ Response model for proxy user list.
1101
+
1102
+ Attributes:
1103
+ limit: Total traffic limit (KB).
1104
+ remaining_limit: Remaining traffic limit (KB).
1105
+ user_count: Number of users.
1106
+ users: List of proxy users.
1107
+ """
1108
+
1109
+ limit: float
1110
+ remaining_limit: float
1111
+ user_count: int
1112
+ users: List[ProxyUser]
1113
+
1114
+ @classmethod
1115
+ def from_dict(cls, data: Dict[str, Any]) -> "ProxyUserList":
1116
+ """Create from API response dict."""
1117
+ user_list = data.get("list", [])
1118
+ users = [ProxyUser.from_dict(u) for u in user_list]
1119
+
1120
+ return cls(
1121
+ limit=float(data.get("limit", 0)),
1122
+ remaining_limit=float(data.get("remaining_limit", 0)),
1123
+ user_count=int(data.get("user_count", len(users))),
1124
+ users=users,
1125
+ )
1126
+
1127
+
1128
+ @dataclass
1129
+ class ProxyServer:
1130
+ """
1131
+ ISP or Datacenter proxy server information.
1132
+
1133
+ Attributes:
1134
+ ip: Proxy server IP address.
1135
+ port: Proxy server port.
1136
+ username: Authentication username.
1137
+ password: Authentication password.
1138
+ expiration_time: Expiration timestamp (Unix timestamp or datetime string).
1139
+ region: Server region (optional).
1140
+ """
1141
+
1142
+ ip: str
1143
+ port: int
1144
+ username: str
1145
+ password: str
1146
+ expiration_time: Optional[Union[int, str]] = None
1147
+ region: Optional[str] = None
1148
+
1149
+ @classmethod
1150
+ def from_dict(cls, data: Dict[str, Any]) -> "ProxyServer":
1151
+ """Create from API response dict."""
1152
+ return cls(
1153
+ ip=data.get("ip", ""),
1154
+ port=int(data.get("port", 0)),
1155
+ username=data.get("username", data.get("user", "")),
1156
+ password=data.get("password", data.get("pwd", "")),
1157
+ expiration_time=data.get("expiration_time", data.get("expireTime")),
1158
+ region=data.get("region"),
1159
+ )
1160
+
1161
+ def to_proxy_url(self, protocol: str = "http") -> str:
1162
+ """
1163
+ Build proxy URL for this server.
1164
+
1165
+ Args:
1166
+ protocol: Proxy protocol (http/https/socks5).
1167
+
1168
+ Returns:
1169
+ Complete proxy URL.
1170
+ """
1171
+ return f"{protocol}://{self.username}:{self.password}@{self.ip}:{self.port}"
1172
+
1173
+ def is_expired(self) -> bool:
1174
+ """Check if proxy has expired (if expiration_time is available)."""
1175
+ if self.expiration_time is None:
1176
+ return False
1177
+
1178
+ import time
1179
+
1180
+ if isinstance(self.expiration_time, int):
1181
+ return time.time() > self.expiration_time
1182
+
1183
+ # String timestamp handling would need datetime parsing
1184
+ return False
thordata/retry.py CHANGED
@@ -16,6 +16,7 @@ Example:
16
16
 
17
17
  from __future__ import annotations
18
18
 
19
+ import inspect
19
20
  import logging
20
21
  import random
21
22
  import time
@@ -64,7 +65,10 @@ class RetryConfig:
64
65
 
65
66
  # Status codes to retry on (5xx server errors + 429 rate limit)
66
67
  retry_on_status_codes: Set[int] = field(
67
- default_factory=lambda: {300, 429, 500, 502, 503, 504}
68
+ default_factory=lambda: {429, 500, 502, 503, 504}
69
+ )
70
+ retry_on_api_codes: Set[int] = field(
71
+ default_factory=lambda: {300} # API response body code
68
72
  )
69
73
 
70
74
  # Exception types to always retry on
@@ -198,8 +202,6 @@ def with_retry(
198
202
 
199
203
  @wraps(func)
200
204
  async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
201
- import asyncio
202
-
203
205
  last_exception: Optional[Exception] = None
204
206
 
205
207
  for attempt in range(config.max_retries + 1):
@@ -235,7 +237,7 @@ def with_retry(
235
237
  # Check if the function is async
236
238
  import asyncio
237
239
 
238
- if asyncio.iscoroutinefunction(func):
240
+ if inspect.iscoroutinefunction(func):
239
241
  return async_wrapper
240
242
  return sync_wrapper
241
243