thordata-sdk 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
thordata/enums.py CHANGED
@@ -34,12 +34,13 @@ class Continent(str, Enum):
34
34
  class ProxyHost(str, Enum):
35
35
  """
36
36
  Available proxy gateway hosts.
37
+
38
+ Note: Dashboard provides user-specific hosts like {shard}.{region}.thordata.net
37
39
  """
38
40
 
39
41
  DEFAULT = "pr.thordata.net"
40
42
  NORTH_AMERICA = "t.na.thordata.net"
41
43
  EUROPE = "t.eu.thordata.net"
42
- GATE = "gate.thordata.com"
43
44
 
44
45
 
45
46
  class ProxyPort(IntEnum):
@@ -47,11 +48,10 @@ class ProxyPort(IntEnum):
47
48
  Available proxy gateway ports.
48
49
  """
49
50
 
50
- DEFAULT = 9999
51
+ RESIDENTIAL = 9999
51
52
  MOBILE = 5555
52
53
  DATACENTER = 7777
53
54
  ISP = 6666
54
- ALTERNATIVE = 22225
55
55
 
56
56
 
57
57
  # =============================================================================
@@ -62,33 +62,90 @@ class ProxyPort(IntEnum):
62
62
  class Engine(str, Enum):
63
63
  """
64
64
  Supported search engines for SERP API.
65
+
66
+ Engine naming convention:
67
+ - Base search: {engine} for basic web search (google, bing, yandex, duckduckgo)
68
+ - Verticals: {engine}_{vertical} (e.g., google_news, bing_images)
69
+ - Sub-verticals: {engine}_{vertical}_{sub} (e.g., google_scholar_cite)
65
70
  """
66
71
 
72
+ # ===================
73
+ # Google
74
+ # ===================
67
75
  GOOGLE = "google"
76
+ GOOGLE_SEARCH = "google_search"
77
+ GOOGLE_AI_MODE = "google_ai_mode"
78
+ GOOGLE_WEB = "google_web"
79
+ GOOGLE_SHOPPING = "google_shopping"
80
+ GOOGLE_LOCAL = "google_local"
81
+ GOOGLE_VIDEOS = "google_videos"
82
+ GOOGLE_NEWS = "google_news"
83
+ GOOGLE_FLIGHTS = "google_flights"
84
+ GOOGLE_IMAGES = "google_images"
85
+ GOOGLE_LENS = "google_lens"
86
+ GOOGLE_TRENDS = "google_trends"
87
+ GOOGLE_HOTELS = "google_hotels"
88
+ GOOGLE_PLAY = "google_play"
89
+ GOOGLE_JOBS = "google_jobs"
90
+ GOOGLE_SCHOLAR = "google_scholar"
91
+ GOOGLE_SCHOLAR_CITE = "google_scholar_cite"
92
+ GOOGLE_SCHOLAR_AUTHOR = "google_scholar_author"
93
+ GOOGLE_MAPS = "google_maps"
94
+ GOOGLE_FINANCE = "google_finance"
95
+ GOOGLE_FINANCE_MARKETS = "google_finance_markets"
96
+ GOOGLE_PATENTS = "google_patents"
97
+ GOOGLE_PATENTS_DETAILS = "google_patents_details"
98
+
99
+ # ===================
100
+ # Bing
101
+ # ===================
68
102
  BING = "bing"
103
+ BING_SEARCH = "bing_search"
104
+ BING_IMAGES = "bing_images"
105
+ BING_VIDEOS = "bing_videos"
106
+ BING_NEWS = "bing_news"
107
+ BING_MAPS = "bing_maps"
108
+ BING_SHOPPING = "bing_shopping"
109
+
110
+ # ===================
111
+ # Yandex
112
+ # ===================
69
113
  YANDEX = "yandex"
114
+ YANDEX_SEARCH = "yandex_search"
115
+
116
+ # ===================
117
+ # DuckDuckGo
118
+ # ===================
70
119
  DUCKDUCKGO = "duckduckgo"
71
- BAIDU = "baidu"
72
- YAHOO = "yahoo"
73
- NAVER = "naver"
120
+ DUCKDUCKGO_SEARCH = "duckduckgo_search"
74
121
 
75
122
 
76
123
  class GoogleSearchType(str, Enum):
77
124
  """
78
125
  Search types specific to Google.
126
+
127
+ These map to the second part of Google engine names.
128
+ For example, GOOGLE + NEWS = google_news
79
129
  """
80
130
 
81
131
  SEARCH = "search"
82
- MAPS = "maps"
132
+ AI_MODE = "ai_mode"
133
+ WEB = "web"
83
134
  SHOPPING = "shopping"
135
+ LOCAL = "local"
136
+ VIDEOS = "videos"
84
137
  NEWS = "news"
138
+ FLIGHTS = "flights"
85
139
  IMAGES = "images"
86
- VIDEOS = "videos"
87
- SCHOLAR = "scholar"
88
- PATENTS = "patents"
140
+ LENS = "lens"
141
+ TRENDS = "trends"
142
+ HOTELS = "hotels"
143
+ PLAY = "play"
89
144
  JOBS = "jobs"
90
- FLIGHTS = "flights"
145
+ SCHOLAR = "scholar"
146
+ MAPS = "maps"
91
147
  FINANCE = "finance"
148
+ PATENTS = "patents"
92
149
 
93
150
 
94
151
  class BingSearchType(str, Enum):
@@ -101,6 +158,20 @@ class BingSearchType(str, Enum):
101
158
  VIDEOS = "videos"
102
159
  NEWS = "news"
103
160
  MAPS = "maps"
161
+ SHOPPING = "shopping"
162
+
163
+
164
+ class GoogleTbm(str, Enum):
165
+ """
166
+ Google tbm (to be matched) parameter values.
167
+
168
+ Only available when using specific Google engines that support tbm.
169
+ """
170
+
171
+ NEWS = "nws"
172
+ SHOPPING = "shop"
173
+ IMAGES = "isch"
174
+ VIDEOS = "vid"
104
175
 
105
176
 
106
177
  class Device(str, Enum):
@@ -159,13 +230,12 @@ class SessionType(str, Enum):
159
230
  class OutputFormat(str, Enum):
160
231
  """
161
232
  Output formats for Universal Scraping API.
233
+
234
+ Currently supported: html, png
162
235
  """
163
236
 
164
237
  HTML = "html"
165
238
  PNG = "png"
166
- PDF = "pdf"
167
- MARKDOWN = "markdown"
168
- TEXT = "text"
169
239
 
170
240
 
171
241
  class DataFormat(str, Enum):
@@ -222,7 +292,7 @@ class TaskStatus(str, Enum):
222
292
 
223
293
 
224
294
  # =============================================================================
225
- # Country Enum (常用国家)
295
+ # Country Enum (Common Countries)
226
296
  # =============================================================================
227
297
 
228
298
 
@@ -306,7 +376,6 @@ def normalize_enum_value(value: object, enum_class: type) -> str:
306
376
  Safely convert an enum or string to its string value.
307
377
  """
308
378
  if isinstance(value, enum_class):
309
- # value is an enum member, get its .value
310
379
  return str(getattr(value, "value", value)).lower()
311
380
  if isinstance(value, str):
312
381
  return value.lower()
thordata/exceptions.py CHANGED
@@ -222,7 +222,8 @@ class ThordataNotCollectedError(ThordataAPIError):
222
222
  This error is often transient and typically safe to retry.
223
223
  """
224
224
 
225
- HTTP_STATUS_CODES = {300}
225
+ API_CODES = {300}
226
+ HTTP_STATUS_CODES: Set[int] = set()
226
227
 
227
228
  @property
228
229
  def is_retryable(self) -> bool:
@@ -262,8 +263,17 @@ def raise_for_code(
262
263
  ThordataValidationError: For 400/422 codes.
263
264
  ThordataAPIError: For all other error codes.
264
265
  """
265
- # Use the code from payload if status_code not available
266
- effective_code = status_code or code
266
+ # Determine the effective error code.
267
+ # Prefer payload `code` when present and not success (200),
268
+ # otherwise fall back to HTTP status when it indicates an error.
269
+ effective_code: Optional[int] = None
270
+
271
+ if code is not None and code != 200:
272
+ effective_code = code
273
+ elif status_code is not None and status_code != 200:
274
+ effective_code = status_code
275
+ else:
276
+ effective_code = code if code is not None else status_code
267
277
 
268
278
  kwargs = {
269
279
  "status_code": status_code,
@@ -272,8 +282,9 @@ def raise_for_code(
272
282
  "request_id": request_id,
273
283
  }
274
284
 
275
- # Not collected (often retryable, not billed)
276
- if effective_code in ThordataNotCollectedError.HTTP_STATUS_CODES:
285
+ # Not collected (API payload code 300, often retryable, not billed)
286
+ # Check this FIRST since 300 is in API_CODES, not HTTP_STATUS_CODES
287
+ if effective_code in ThordataNotCollectedError.API_CODES:
277
288
  raise ThordataNotCollectedError(message, **kwargs)
278
289
 
279
290
  # Auth errors
thordata/models.py CHANGED
@@ -795,6 +795,126 @@ class ScraperTaskConfig:
795
795
  return payload
796
796
 
797
797
 
798
+ @dataclass
799
+ class CommonSettings:
800
+ """
801
+ Common settings for YouTube video/audio downloads.
802
+
803
+ Used by /video_builder endpoint as `common_settings` parameter.
804
+ Also known as `spider_universal` in some documentation.
805
+
806
+ Args:
807
+ resolution: Video resolution (360p/480p/720p/1080p/1440p/2160p).
808
+ audio_format: Audio format (opus/mp3).
809
+ bitrate: Audio bitrate (48/64/128/160/256/320 or with Kbps suffix).
810
+ is_subtitles: Whether to download subtitles ("true"/"false").
811
+ subtitles_language: Subtitle language code (e.g., "en", "zh-Hans").
812
+
813
+ Example for video:
814
+ >>> settings = CommonSettings(
815
+ ... resolution="1080p",
816
+ ... is_subtitles="true",
817
+ ... subtitles_language="en"
818
+ ... )
819
+
820
+ Example for audio:
821
+ >>> settings = CommonSettings(
822
+ ... audio_format="mp3",
823
+ ... bitrate="320",
824
+ ... is_subtitles="true",
825
+ ... subtitles_language="en"
826
+ ... )
827
+ """
828
+
829
+ # Video settings
830
+ resolution: Optional[str] = None
831
+
832
+ # Audio settings
833
+ audio_format: Optional[str] = None
834
+ bitrate: Optional[str] = None
835
+
836
+ # Subtitle settings (used by both video and audio)
837
+ is_subtitles: Optional[str] = None
838
+ subtitles_language: Optional[str] = None
839
+
840
+ # Valid values for validation
841
+ VALID_RESOLUTIONS = {"360p", "480p", "720p", "1080p", "1440p", "2160p"}
842
+ VALID_AUDIO_FORMATS = {"opus", "mp3"}
843
+
844
+ def to_dict(self) -> Dict[str, Any]:
845
+ """Convert to dictionary, excluding None values."""
846
+ result = {}
847
+ if self.resolution is not None:
848
+ result["resolution"] = self.resolution
849
+ if self.audio_format is not None:
850
+ result["audio_format"] = self.audio_format
851
+ if self.bitrate is not None:
852
+ result["bitrate"] = self.bitrate
853
+ if self.is_subtitles is not None:
854
+ result["is_subtitles"] = self.is_subtitles
855
+ if self.subtitles_language is not None:
856
+ result["subtitles_language"] = self.subtitles_language
857
+ return result
858
+
859
+ def to_json(self) -> str:
860
+ """Convert to JSON string for form submission."""
861
+ return json.dumps(self.to_dict())
862
+
863
+
864
+ @dataclass
865
+ class VideoTaskConfig:
866
+ """
867
+ Configuration for creating a YouTube video/audio download task.
868
+
869
+ Uses the /video_builder endpoint.
870
+
871
+ Args:
872
+ file_name: Name for the output file. Supports {{TasksID}}, {{VideoID}}.
873
+ spider_id: Spider identifier (e.g., "youtube_video_by-url", "youtube_audio_by-url").
874
+ spider_name: Spider name (typically "youtube.com").
875
+ parameters: Spider-specific parameters (e.g., video URL).
876
+ common_settings: Video/audio settings (resolution, format, subtitles).
877
+ include_errors: Include error details in output.
878
+
879
+ Example:
880
+ >>> config = VideoTaskConfig(
881
+ ... file_name="{{VideoID}}",
882
+ ... spider_id="youtube_video_by-url",
883
+ ... spider_name="youtube.com",
884
+ ... parameters={"url": "https://www.youtube.com/watch?v=xxx"},
885
+ ... common_settings=CommonSettings(
886
+ ... resolution="1080p",
887
+ ... is_subtitles="true",
888
+ ... subtitles_language="en"
889
+ ... )
890
+ ... )
891
+ """
892
+
893
+ file_name: str
894
+ spider_id: str
895
+ spider_name: str
896
+ parameters: Dict[str, Any]
897
+ common_settings: CommonSettings
898
+ include_errors: bool = True
899
+
900
+ def to_payload(self) -> Dict[str, Any]:
901
+ """
902
+ Convert to API request payload.
903
+
904
+ Returns:
905
+ Dictionary ready to be sent to the video_builder API.
906
+ """
907
+ payload: Dict[str, Any] = {
908
+ "file_name": self.file_name,
909
+ "spider_id": self.spider_id,
910
+ "spider_name": self.spider_name,
911
+ "spider_parameters": json.dumps([self.parameters]),
912
+ "spider_errors": "true" if self.include_errors else "false",
913
+ "common_settings": self.common_settings.to_json(),
914
+ }
915
+ return payload
916
+
917
+
798
918
  # =============================================================================
799
919
  # Response Models
800
920
  # =============================================================================
@@ -838,3 +958,177 @@ class TaskStatusResponse:
838
958
  """Check if the task failed."""
839
959
  failure_statuses = {"failed", "error"}
840
960
  return self.status.lower() in failure_statuses
961
+
962
+
963
+ @dataclass
964
+ class UsageStatistics:
965
+ """
966
+ Response model for account usage statistics.
967
+
968
+ Attributes:
969
+ total_usage_traffic: Total traffic used (KB).
970
+ traffic_balance: Remaining traffic balance (KB).
971
+ query_days: Number of days in the query range.
972
+ range_usage_traffic: Traffic used in the specified date range (KB).
973
+ data: Daily usage breakdown.
974
+ """
975
+
976
+ total_usage_traffic: float
977
+ traffic_balance: float
978
+ query_days: int
979
+ range_usage_traffic: float
980
+ data: List[Dict[str, Any]]
981
+
982
+ @classmethod
983
+ def from_dict(cls, data: Dict[str, Any]) -> "UsageStatistics":
984
+ """Create from API response dict."""
985
+ return cls(
986
+ total_usage_traffic=float(data.get("total_usage_traffic", 0)),
987
+ traffic_balance=float(data.get("traffic_balance", 0)),
988
+ query_days=int(data.get("query_days", 0)),
989
+ range_usage_traffic=float(data.get("range_usage_traffic", 0)),
990
+ data=data.get("data", []),
991
+ )
992
+
993
+ def total_usage_gb(self) -> float:
994
+ """Get total usage in GB."""
995
+ return self.total_usage_traffic / (1024 * 1024)
996
+
997
+ def balance_gb(self) -> float:
998
+ """Get balance in GB."""
999
+ return self.traffic_balance / (1024 * 1024)
1000
+
1001
+ def range_usage_gb(self) -> float:
1002
+ """Get range usage in GB."""
1003
+ return self.range_usage_traffic / (1024 * 1024)
1004
+
1005
+
1006
+ @dataclass
1007
+ class ProxyUser:
1008
+ """
1009
+ Proxy user (sub-account) information.
1010
+
1011
+ Attributes:
1012
+ username: User's username.
1013
+ password: User's password.
1014
+ status: User status (True=enabled, False=disabled).
1015
+ traffic_limit: Traffic limit in MB (0 = unlimited).
1016
+ usage_traffic: Traffic used in KB.
1017
+ """
1018
+
1019
+ username: str
1020
+ password: str
1021
+ status: bool
1022
+ traffic_limit: int
1023
+ usage_traffic: float
1024
+
1025
+ @classmethod
1026
+ def from_dict(cls, data: Dict[str, Any]) -> "ProxyUser":
1027
+ """Create from API response dict."""
1028
+ return cls(
1029
+ username=data.get("username", ""),
1030
+ password=data.get("password", ""),
1031
+ status=data.get("status") in (True, "true", 1),
1032
+ traffic_limit=int(data.get("traffic_limit", 0)),
1033
+ usage_traffic=float(data.get("usage_traffic", 0)),
1034
+ )
1035
+
1036
+ def usage_gb(self) -> float:
1037
+ """Get usage in GB."""
1038
+ return self.usage_traffic / (1024 * 1024)
1039
+
1040
+ def limit_gb(self) -> float:
1041
+ """Get limit in GB (0 means unlimited)."""
1042
+ if self.traffic_limit == 0:
1043
+ return 0
1044
+ return self.traffic_limit / 1024
1045
+
1046
+
1047
+ @dataclass
1048
+ class ProxyUserList:
1049
+ """
1050
+ Response model for proxy user list.
1051
+
1052
+ Attributes:
1053
+ limit: Total traffic limit (KB).
1054
+ remaining_limit: Remaining traffic limit (KB).
1055
+ user_count: Number of users.
1056
+ users: List of proxy users.
1057
+ """
1058
+
1059
+ limit: float
1060
+ remaining_limit: float
1061
+ user_count: int
1062
+ users: List[ProxyUser]
1063
+
1064
+ @classmethod
1065
+ def from_dict(cls, data: Dict[str, Any]) -> "ProxyUserList":
1066
+ """Create from API response dict."""
1067
+ user_list = data.get("list", [])
1068
+ users = [ProxyUser.from_dict(u) for u in user_list]
1069
+
1070
+ return cls(
1071
+ limit=float(data.get("limit", 0)),
1072
+ remaining_limit=float(data.get("remaining_limit", 0)),
1073
+ user_count=int(data.get("user_count", len(users))),
1074
+ users=users,
1075
+ )
1076
+
1077
+
1078
+ @dataclass
1079
+ class ProxyServer:
1080
+ """
1081
+ ISP or Datacenter proxy server information.
1082
+
1083
+ Attributes:
1084
+ ip: Proxy server IP address.
1085
+ port: Proxy server port.
1086
+ username: Authentication username.
1087
+ password: Authentication password.
1088
+ expiration_time: Expiration timestamp (Unix timestamp or datetime string).
1089
+ region: Server region (optional).
1090
+ """
1091
+
1092
+ ip: str
1093
+ port: int
1094
+ username: str
1095
+ password: str
1096
+ expiration_time: Optional[Union[int, str]] = None
1097
+ region: Optional[str] = None
1098
+
1099
+ @classmethod
1100
+ def from_dict(cls, data: Dict[str, Any]) -> "ProxyServer":
1101
+ """Create from API response dict."""
1102
+ return cls(
1103
+ ip=data.get("ip", ""),
1104
+ port=int(data.get("port", 0)),
1105
+ username=data.get("username", data.get("user", "")),
1106
+ password=data.get("password", data.get("pwd", "")),
1107
+ expiration_time=data.get("expiration_time", data.get("expireTime")),
1108
+ region=data.get("region"),
1109
+ )
1110
+
1111
+ def to_proxy_url(self, protocol: str = "http") -> str:
1112
+ """
1113
+ Build proxy URL for this server.
1114
+
1115
+ Args:
1116
+ protocol: Proxy protocol (http/https/socks5).
1117
+
1118
+ Returns:
1119
+ Complete proxy URL.
1120
+ """
1121
+ return f"{protocol}://{self.username}:{self.password}@{self.ip}:{self.port}"
1122
+
1123
+ def is_expired(self) -> bool:
1124
+ """Check if proxy has expired (if expiration_time is available)."""
1125
+ if self.expiration_time is None:
1126
+ return False
1127
+
1128
+ import time
1129
+
1130
+ if isinstance(self.expiration_time, int):
1131
+ return time.time() > self.expiration_time
1132
+
1133
+ # String timestamp handling would need datetime parsing
1134
+ return False
thordata/retry.py CHANGED
@@ -64,7 +64,10 @@ class RetryConfig:
64
64
 
65
65
  # Status codes to retry on (5xx server errors + 429 rate limit)
66
66
  retry_on_status_codes: Set[int] = field(
67
- default_factory=lambda: {300, 429, 500, 502, 503, 504}
67
+ default_factory=lambda: {429, 500, 502, 503, 504}
68
+ )
69
+ retry_on_api_codes: Set[int] = field(
70
+ default_factory=lambda: {300} # API response body code
68
71
  )
69
72
 
70
73
  # Exception types to always retry on