thordata-sdk 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +13 -1
- thordata/_utils.py +66 -3
- thordata/async_client.py +787 -8
- thordata/client.py +851 -33
- thordata/enums.py +3 -3
- thordata/exceptions.py +16 -5
- thordata/models.py +294 -0
- thordata/retry.py +4 -1
- thordata_sdk-0.8.0.dist-info/METADATA +212 -0
- thordata_sdk-0.8.0.dist-info/RECORD +14 -0
- thordata/parameters.py +0 -53
- thordata_sdk-0.7.0.dist-info/METADATA +0 -1053
- thordata_sdk-0.7.0.dist-info/RECORD +0 -15
- {thordata_sdk-0.7.0.dist-info → thordata_sdk-0.8.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.7.0.dist-info → thordata_sdk-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-0.7.0.dist-info → thordata_sdk-0.8.0.dist-info}/top_level.txt +0 -0
thordata/enums.py
CHANGED
|
@@ -34,12 +34,13 @@ class Continent(str, Enum):
|
|
|
34
34
|
class ProxyHost(str, Enum):
|
|
35
35
|
"""
|
|
36
36
|
Available proxy gateway hosts.
|
|
37
|
+
|
|
38
|
+
Note: Dashboard provides user-specific hosts like {shard}.{region}.thordata.net
|
|
37
39
|
"""
|
|
38
40
|
|
|
39
41
|
DEFAULT = "pr.thordata.net"
|
|
40
42
|
NORTH_AMERICA = "t.na.thordata.net"
|
|
41
43
|
EUROPE = "t.eu.thordata.net"
|
|
42
|
-
GATE = "gate.thordata.com"
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
class ProxyPort(IntEnum):
|
|
@@ -47,11 +48,10 @@ class ProxyPort(IntEnum):
|
|
|
47
48
|
Available proxy gateway ports.
|
|
48
49
|
"""
|
|
49
50
|
|
|
50
|
-
|
|
51
|
+
RESIDENTIAL = 9999
|
|
51
52
|
MOBILE = 5555
|
|
52
53
|
DATACENTER = 7777
|
|
53
54
|
ISP = 6666
|
|
54
|
-
ALTERNATIVE = 22225
|
|
55
55
|
|
|
56
56
|
|
|
57
57
|
# =============================================================================
|
thordata/exceptions.py
CHANGED
|
@@ -222,7 +222,8 @@ class ThordataNotCollectedError(ThordataAPIError):
|
|
|
222
222
|
This error is often transient and typically safe to retry.
|
|
223
223
|
"""
|
|
224
224
|
|
|
225
|
-
|
|
225
|
+
API_CODES = {300}
|
|
226
|
+
HTTP_STATUS_CODES: Set[int] = set()
|
|
226
227
|
|
|
227
228
|
@property
|
|
228
229
|
def is_retryable(self) -> bool:
|
|
@@ -262,8 +263,17 @@ def raise_for_code(
|
|
|
262
263
|
ThordataValidationError: For 400/422 codes.
|
|
263
264
|
ThordataAPIError: For all other error codes.
|
|
264
265
|
"""
|
|
265
|
-
#
|
|
266
|
-
|
|
266
|
+
# Determine the effective error code.
|
|
267
|
+
# Prefer payload `code` when present and not success (200),
|
|
268
|
+
# otherwise fall back to HTTP status when it indicates an error.
|
|
269
|
+
effective_code: Optional[int] = None
|
|
270
|
+
|
|
271
|
+
if code is not None and code != 200:
|
|
272
|
+
effective_code = code
|
|
273
|
+
elif status_code is not None and status_code != 200:
|
|
274
|
+
effective_code = status_code
|
|
275
|
+
else:
|
|
276
|
+
effective_code = code if code is not None else status_code
|
|
267
277
|
|
|
268
278
|
kwargs = {
|
|
269
279
|
"status_code": status_code,
|
|
@@ -272,8 +282,9 @@ def raise_for_code(
|
|
|
272
282
|
"request_id": request_id,
|
|
273
283
|
}
|
|
274
284
|
|
|
275
|
-
# Not collected (often retryable, not billed)
|
|
276
|
-
|
|
285
|
+
# Not collected (API payload code 300, often retryable, not billed)
|
|
286
|
+
# Check this FIRST since 300 is in API_CODES, not HTTP_STATUS_CODES
|
|
287
|
+
if effective_code in ThordataNotCollectedError.API_CODES:
|
|
277
288
|
raise ThordataNotCollectedError(message, **kwargs)
|
|
278
289
|
|
|
279
290
|
# Auth errors
|
thordata/models.py
CHANGED
|
@@ -795,6 +795,126 @@ class ScraperTaskConfig:
|
|
|
795
795
|
return payload
|
|
796
796
|
|
|
797
797
|
|
|
798
|
+
@dataclass
|
|
799
|
+
class CommonSettings:
|
|
800
|
+
"""
|
|
801
|
+
Common settings for YouTube video/audio downloads.
|
|
802
|
+
|
|
803
|
+
Used by /video_builder endpoint as `common_settings` parameter.
|
|
804
|
+
Also known as `spider_universal` in some documentation.
|
|
805
|
+
|
|
806
|
+
Args:
|
|
807
|
+
resolution: Video resolution (360p/480p/720p/1080p/1440p/2160p).
|
|
808
|
+
audio_format: Audio format (opus/mp3).
|
|
809
|
+
bitrate: Audio bitrate (48/64/128/160/256/320 or with Kbps suffix).
|
|
810
|
+
is_subtitles: Whether to download subtitles ("true"/"false").
|
|
811
|
+
subtitles_language: Subtitle language code (e.g., "en", "zh-Hans").
|
|
812
|
+
|
|
813
|
+
Example for video:
|
|
814
|
+
>>> settings = CommonSettings(
|
|
815
|
+
... resolution="1080p",
|
|
816
|
+
... is_subtitles="true",
|
|
817
|
+
... subtitles_language="en"
|
|
818
|
+
... )
|
|
819
|
+
|
|
820
|
+
Example for audio:
|
|
821
|
+
>>> settings = CommonSettings(
|
|
822
|
+
... audio_format="mp3",
|
|
823
|
+
... bitrate="320",
|
|
824
|
+
... is_subtitles="true",
|
|
825
|
+
... subtitles_language="en"
|
|
826
|
+
... )
|
|
827
|
+
"""
|
|
828
|
+
|
|
829
|
+
# Video settings
|
|
830
|
+
resolution: Optional[str] = None
|
|
831
|
+
|
|
832
|
+
# Audio settings
|
|
833
|
+
audio_format: Optional[str] = None
|
|
834
|
+
bitrate: Optional[str] = None
|
|
835
|
+
|
|
836
|
+
# Subtitle settings (used by both video and audio)
|
|
837
|
+
is_subtitles: Optional[str] = None
|
|
838
|
+
subtitles_language: Optional[str] = None
|
|
839
|
+
|
|
840
|
+
# Valid values for validation
|
|
841
|
+
VALID_RESOLUTIONS = {"360p", "480p", "720p", "1080p", "1440p", "2160p"}
|
|
842
|
+
VALID_AUDIO_FORMATS = {"opus", "mp3"}
|
|
843
|
+
|
|
844
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
845
|
+
"""Convert to dictionary, excluding None values."""
|
|
846
|
+
result = {}
|
|
847
|
+
if self.resolution is not None:
|
|
848
|
+
result["resolution"] = self.resolution
|
|
849
|
+
if self.audio_format is not None:
|
|
850
|
+
result["audio_format"] = self.audio_format
|
|
851
|
+
if self.bitrate is not None:
|
|
852
|
+
result["bitrate"] = self.bitrate
|
|
853
|
+
if self.is_subtitles is not None:
|
|
854
|
+
result["is_subtitles"] = self.is_subtitles
|
|
855
|
+
if self.subtitles_language is not None:
|
|
856
|
+
result["subtitles_language"] = self.subtitles_language
|
|
857
|
+
return result
|
|
858
|
+
|
|
859
|
+
def to_json(self) -> str:
|
|
860
|
+
"""Convert to JSON string for form submission."""
|
|
861
|
+
return json.dumps(self.to_dict())
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
@dataclass
|
|
865
|
+
class VideoTaskConfig:
|
|
866
|
+
"""
|
|
867
|
+
Configuration for creating a YouTube video/audio download task.
|
|
868
|
+
|
|
869
|
+
Uses the /video_builder endpoint.
|
|
870
|
+
|
|
871
|
+
Args:
|
|
872
|
+
file_name: Name for the output file. Supports {{TasksID}}, {{VideoID}}.
|
|
873
|
+
spider_id: Spider identifier (e.g., "youtube_video_by-url", "youtube_audio_by-url").
|
|
874
|
+
spider_name: Spider name (typically "youtube.com").
|
|
875
|
+
parameters: Spider-specific parameters (e.g., video URL).
|
|
876
|
+
common_settings: Video/audio settings (resolution, format, subtitles).
|
|
877
|
+
include_errors: Include error details in output.
|
|
878
|
+
|
|
879
|
+
Example:
|
|
880
|
+
>>> config = VideoTaskConfig(
|
|
881
|
+
... file_name="{{VideoID}}",
|
|
882
|
+
... spider_id="youtube_video_by-url",
|
|
883
|
+
... spider_name="youtube.com",
|
|
884
|
+
... parameters={"url": "https://www.youtube.com/watch?v=xxx"},
|
|
885
|
+
... common_settings=CommonSettings(
|
|
886
|
+
... resolution="1080p",
|
|
887
|
+
... is_subtitles="true",
|
|
888
|
+
... subtitles_language="en"
|
|
889
|
+
... )
|
|
890
|
+
... )
|
|
891
|
+
"""
|
|
892
|
+
|
|
893
|
+
file_name: str
|
|
894
|
+
spider_id: str
|
|
895
|
+
spider_name: str
|
|
896
|
+
parameters: Dict[str, Any]
|
|
897
|
+
common_settings: CommonSettings
|
|
898
|
+
include_errors: bool = True
|
|
899
|
+
|
|
900
|
+
def to_payload(self) -> Dict[str, Any]:
|
|
901
|
+
"""
|
|
902
|
+
Convert to API request payload.
|
|
903
|
+
|
|
904
|
+
Returns:
|
|
905
|
+
Dictionary ready to be sent to the video_builder API.
|
|
906
|
+
"""
|
|
907
|
+
payload: Dict[str, Any] = {
|
|
908
|
+
"file_name": self.file_name,
|
|
909
|
+
"spider_id": self.spider_id,
|
|
910
|
+
"spider_name": self.spider_name,
|
|
911
|
+
"spider_parameters": json.dumps([self.parameters]),
|
|
912
|
+
"spider_errors": "true" if self.include_errors else "false",
|
|
913
|
+
"common_settings": self.common_settings.to_json(),
|
|
914
|
+
}
|
|
915
|
+
return payload
|
|
916
|
+
|
|
917
|
+
|
|
798
918
|
# =============================================================================
|
|
799
919
|
# Response Models
|
|
800
920
|
# =============================================================================
|
|
@@ -838,3 +958,177 @@ class TaskStatusResponse:
|
|
|
838
958
|
"""Check if the task failed."""
|
|
839
959
|
failure_statuses = {"failed", "error"}
|
|
840
960
|
return self.status.lower() in failure_statuses
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
@dataclass
|
|
964
|
+
class UsageStatistics:
|
|
965
|
+
"""
|
|
966
|
+
Response model for account usage statistics.
|
|
967
|
+
|
|
968
|
+
Attributes:
|
|
969
|
+
total_usage_traffic: Total traffic used (KB).
|
|
970
|
+
traffic_balance: Remaining traffic balance (KB).
|
|
971
|
+
query_days: Number of days in the query range.
|
|
972
|
+
range_usage_traffic: Traffic used in the specified date range (KB).
|
|
973
|
+
data: Daily usage breakdown.
|
|
974
|
+
"""
|
|
975
|
+
|
|
976
|
+
total_usage_traffic: float
|
|
977
|
+
traffic_balance: float
|
|
978
|
+
query_days: int
|
|
979
|
+
range_usage_traffic: float
|
|
980
|
+
data: List[Dict[str, Any]]
|
|
981
|
+
|
|
982
|
+
@classmethod
|
|
983
|
+
def from_dict(cls, data: Dict[str, Any]) -> "UsageStatistics":
|
|
984
|
+
"""Create from API response dict."""
|
|
985
|
+
return cls(
|
|
986
|
+
total_usage_traffic=float(data.get("total_usage_traffic", 0)),
|
|
987
|
+
traffic_balance=float(data.get("traffic_balance", 0)),
|
|
988
|
+
query_days=int(data.get("query_days", 0)),
|
|
989
|
+
range_usage_traffic=float(data.get("range_usage_traffic", 0)),
|
|
990
|
+
data=data.get("data", []),
|
|
991
|
+
)
|
|
992
|
+
|
|
993
|
+
def total_usage_gb(self) -> float:
|
|
994
|
+
"""Get total usage in GB."""
|
|
995
|
+
return self.total_usage_traffic / (1024 * 1024)
|
|
996
|
+
|
|
997
|
+
def balance_gb(self) -> float:
|
|
998
|
+
"""Get balance in GB."""
|
|
999
|
+
return self.traffic_balance / (1024 * 1024)
|
|
1000
|
+
|
|
1001
|
+
def range_usage_gb(self) -> float:
|
|
1002
|
+
"""Get range usage in GB."""
|
|
1003
|
+
return self.range_usage_traffic / (1024 * 1024)
|
|
1004
|
+
|
|
1005
|
+
|
|
1006
|
+
@dataclass
|
|
1007
|
+
class ProxyUser:
|
|
1008
|
+
"""
|
|
1009
|
+
Proxy user (sub-account) information.
|
|
1010
|
+
|
|
1011
|
+
Attributes:
|
|
1012
|
+
username: User's username.
|
|
1013
|
+
password: User's password.
|
|
1014
|
+
status: User status (True=enabled, False=disabled).
|
|
1015
|
+
traffic_limit: Traffic limit in MB (0 = unlimited).
|
|
1016
|
+
usage_traffic: Traffic used in KB.
|
|
1017
|
+
"""
|
|
1018
|
+
|
|
1019
|
+
username: str
|
|
1020
|
+
password: str
|
|
1021
|
+
status: bool
|
|
1022
|
+
traffic_limit: int
|
|
1023
|
+
usage_traffic: float
|
|
1024
|
+
|
|
1025
|
+
@classmethod
|
|
1026
|
+
def from_dict(cls, data: Dict[str, Any]) -> "ProxyUser":
|
|
1027
|
+
"""Create from API response dict."""
|
|
1028
|
+
return cls(
|
|
1029
|
+
username=data.get("username", ""),
|
|
1030
|
+
password=data.get("password", ""),
|
|
1031
|
+
status=data.get("status") in (True, "true", 1),
|
|
1032
|
+
traffic_limit=int(data.get("traffic_limit", 0)),
|
|
1033
|
+
usage_traffic=float(data.get("usage_traffic", 0)),
|
|
1034
|
+
)
|
|
1035
|
+
|
|
1036
|
+
def usage_gb(self) -> float:
|
|
1037
|
+
"""Get usage in GB."""
|
|
1038
|
+
return self.usage_traffic / (1024 * 1024)
|
|
1039
|
+
|
|
1040
|
+
def limit_gb(self) -> float:
|
|
1041
|
+
"""Get limit in GB (0 means unlimited)."""
|
|
1042
|
+
if self.traffic_limit == 0:
|
|
1043
|
+
return 0
|
|
1044
|
+
return self.traffic_limit / 1024
|
|
1045
|
+
|
|
1046
|
+
|
|
1047
|
+
@dataclass
|
|
1048
|
+
class ProxyUserList:
|
|
1049
|
+
"""
|
|
1050
|
+
Response model for proxy user list.
|
|
1051
|
+
|
|
1052
|
+
Attributes:
|
|
1053
|
+
limit: Total traffic limit (KB).
|
|
1054
|
+
remaining_limit: Remaining traffic limit (KB).
|
|
1055
|
+
user_count: Number of users.
|
|
1056
|
+
users: List of proxy users.
|
|
1057
|
+
"""
|
|
1058
|
+
|
|
1059
|
+
limit: float
|
|
1060
|
+
remaining_limit: float
|
|
1061
|
+
user_count: int
|
|
1062
|
+
users: List[ProxyUser]
|
|
1063
|
+
|
|
1064
|
+
@classmethod
|
|
1065
|
+
def from_dict(cls, data: Dict[str, Any]) -> "ProxyUserList":
|
|
1066
|
+
"""Create from API response dict."""
|
|
1067
|
+
user_list = data.get("list", [])
|
|
1068
|
+
users = [ProxyUser.from_dict(u) for u in user_list]
|
|
1069
|
+
|
|
1070
|
+
return cls(
|
|
1071
|
+
limit=float(data.get("limit", 0)),
|
|
1072
|
+
remaining_limit=float(data.get("remaining_limit", 0)),
|
|
1073
|
+
user_count=int(data.get("user_count", len(users))),
|
|
1074
|
+
users=users,
|
|
1075
|
+
)
|
|
1076
|
+
|
|
1077
|
+
|
|
1078
|
+
@dataclass
|
|
1079
|
+
class ProxyServer:
|
|
1080
|
+
"""
|
|
1081
|
+
ISP or Datacenter proxy server information.
|
|
1082
|
+
|
|
1083
|
+
Attributes:
|
|
1084
|
+
ip: Proxy server IP address.
|
|
1085
|
+
port: Proxy server port.
|
|
1086
|
+
username: Authentication username.
|
|
1087
|
+
password: Authentication password.
|
|
1088
|
+
expiration_time: Expiration timestamp (Unix timestamp or datetime string).
|
|
1089
|
+
region: Server region (optional).
|
|
1090
|
+
"""
|
|
1091
|
+
|
|
1092
|
+
ip: str
|
|
1093
|
+
port: int
|
|
1094
|
+
username: str
|
|
1095
|
+
password: str
|
|
1096
|
+
expiration_time: Optional[Union[int, str]] = None
|
|
1097
|
+
region: Optional[str] = None
|
|
1098
|
+
|
|
1099
|
+
@classmethod
|
|
1100
|
+
def from_dict(cls, data: Dict[str, Any]) -> "ProxyServer":
|
|
1101
|
+
"""Create from API response dict."""
|
|
1102
|
+
return cls(
|
|
1103
|
+
ip=data.get("ip", ""),
|
|
1104
|
+
port=int(data.get("port", 0)),
|
|
1105
|
+
username=data.get("username", data.get("user", "")),
|
|
1106
|
+
password=data.get("password", data.get("pwd", "")),
|
|
1107
|
+
expiration_time=data.get("expiration_time", data.get("expireTime")),
|
|
1108
|
+
region=data.get("region"),
|
|
1109
|
+
)
|
|
1110
|
+
|
|
1111
|
+
def to_proxy_url(self, protocol: str = "http") -> str:
|
|
1112
|
+
"""
|
|
1113
|
+
Build proxy URL for this server.
|
|
1114
|
+
|
|
1115
|
+
Args:
|
|
1116
|
+
protocol: Proxy protocol (http/https/socks5).
|
|
1117
|
+
|
|
1118
|
+
Returns:
|
|
1119
|
+
Complete proxy URL.
|
|
1120
|
+
"""
|
|
1121
|
+
return f"{protocol}://{self.username}:{self.password}@{self.ip}:{self.port}"
|
|
1122
|
+
|
|
1123
|
+
def is_expired(self) -> bool:
|
|
1124
|
+
"""Check if proxy has expired (if expiration_time is available)."""
|
|
1125
|
+
if self.expiration_time is None:
|
|
1126
|
+
return False
|
|
1127
|
+
|
|
1128
|
+
import time
|
|
1129
|
+
|
|
1130
|
+
if isinstance(self.expiration_time, int):
|
|
1131
|
+
return time.time() > self.expiration_time
|
|
1132
|
+
|
|
1133
|
+
# String timestamp handling would need datetime parsing
|
|
1134
|
+
return False
|
thordata/retry.py
CHANGED
|
@@ -64,7 +64,10 @@ class RetryConfig:
|
|
|
64
64
|
|
|
65
65
|
# Status codes to retry on (5xx server errors + 429 rate limit)
|
|
66
66
|
retry_on_status_codes: Set[int] = field(
|
|
67
|
-
default_factory=lambda: {
|
|
67
|
+
default_factory=lambda: {429, 500, 502, 503, 504}
|
|
68
|
+
)
|
|
69
|
+
retry_on_api_codes: Set[int] = field(
|
|
70
|
+
default_factory=lambda: {300} # API response body code
|
|
68
71
|
)
|
|
69
72
|
|
|
70
73
|
# Exception types to always retry on
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: thordata-sdk
|
|
3
|
+
Version: 0.8.0
|
|
4
|
+
Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
|
|
5
|
+
Author-email: Thordata Developer Team <support@thordata.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://www.thordata.com
|
|
8
|
+
Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
|
|
9
|
+
Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
|
|
10
|
+
Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/Thordata/thordata-python-sdk/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: web scraping,proxy,residential proxy,datacenter proxy,ai,llm,data-mining,serp,thordata,web scraper,anti-bot bypass
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
16
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
+
Classifier: Topic :: Internet :: Proxy Servers
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
24
|
+
Classifier: Operating System :: OS Independent
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: >=3.9
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: requests>=2.25.0
|
|
30
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest-httpserver>=1.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
39
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: types-requests>=2.28.0; extra == "dev"
|
|
41
|
+
Requires-Dist: aioresponses>=0.7.6; extra == "dev"
|
|
42
|
+
Dynamic: license-file
|
|
43
|
+
|
|
44
|
+
# Thordata Python SDK
|
|
45
|
+
|
|
46
|
+
<div align="center">
|
|
47
|
+
|
|
48
|
+
**Official Python client for Thordata's Proxy Network, SERP API, Web Unlocker, and Web Scraper API.**
|
|
49
|
+
|
|
50
|
+
*Async-ready, type-safe, built for AI agents and large-scale data collection.*
|
|
51
|
+
|
|
52
|
+
[PyPI package](https://pypi.org/project/thordata-sdk/)
|
|
53
|
+
[Python 3.9+](https://python.org)
|
|
54
|
+
[MIT License](LICENSE)
|
|
55
|
+
|
|
56
|
+
[Documentation](https://doc.thordata.com) • [Dashboard](https://www.thordata.com) • [Examples](examples/)
|
|
57
|
+
|
|
58
|
+
</div>
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## ✨ Features
|
|
63
|
+
|
|
64
|
+
- 🌐 **Proxy Network**: Residential, Mobile, Datacenter, ISP proxies with geo-targeting
|
|
65
|
+
- 🔍 **SERP API**: Google, Bing, Yandex, DuckDuckGo search results
|
|
66
|
+
- 🔓 **Web Unlocker**: Bypass Cloudflare, CAPTCHAs, anti-bot systems
|
|
67
|
+
- 🕷️ **Web Scraper API**: Async task-based scraping (Text & Video/Audio)
|
|
68
|
+
- 📊 **Account Management**: Usage stats, sub-users, IP whitelist
|
|
69
|
+
- ⚡ **Async Support**: Full async/await support with aiohttp
|
|
70
|
+
- 🔄 **Auto Retry**: Configurable retry with exponential backoff
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## 📦 Installation
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install thordata-sdk
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## 🔐 Configuration
|
|
83
|
+
|
|
84
|
+
Set environment variables:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Required for Scraper APIs (SERP, Universal, Tasks)
|
|
88
|
+
export THORDATA_SCRAPER_TOKEN=your_token
|
|
89
|
+
|
|
90
|
+
# Required for Public/Location APIs (Dashboard -> My Account)
|
|
91
|
+
export THORDATA_PUBLIC_TOKEN=your_public_token
|
|
92
|
+
export THORDATA_PUBLIC_KEY=your_public_key
|
|
93
|
+
|
|
94
|
+
# Optional: dedicated credentials for Public API NEW (Dashboard -> Public API NEW)
|
|
95
|
+
# If not set, SDK falls back to PUBLIC_TOKEN/KEY
|
|
96
|
+
export THORDATA_SIGN=your_sign
|
|
97
|
+
export THORDATA_API_KEY=your_api_key
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## 🚀 Quick Start
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from thordata import ThordataClient, Engine
|
|
106
|
+
|
|
107
|
+
# Initialize with explicit credentials (the SDK can also read them from the env vars above)
|
|
108
|
+
client = ThordataClient(
|
|
109
|
+
scraper_token="your_token",
|
|
110
|
+
public_token="pub_token",
|
|
111
|
+
public_key="pub_key"
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# SERP Search
|
|
115
|
+
results = client.serp_search("python tutorial", engine=Engine.GOOGLE)
|
|
116
|
+
print(f"Found {len(results.get('organic', []))} results")
|
|
117
|
+
|
|
118
|
+
# Universal Scrape
|
|
119
|
+
html = client.universal_scrape("https://httpbin.org/html")
|
|
120
|
+
print(html[:100])
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## 📖 Feature Guide
|
|
126
|
+
|
|
127
|
+
### SERP API
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from thordata import SerpRequest
|
|
131
|
+
|
|
132
|
+
# Advanced search
|
|
133
|
+
results = client.serp_search_advanced(SerpRequest(
|
|
134
|
+
query="pizza",
|
|
135
|
+
engine="google_local",
|
|
136
|
+
country="us",
|
|
137
|
+
location="New York",
|
|
138
|
+
num=10
|
|
139
|
+
))
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Web Scraper API (Async Tasks)
|
|
143
|
+
|
|
144
|
+
**Create Task:**
|
|
145
|
+
```python
|
|
146
|
+
task_id = client.create_scraper_task(
|
|
147
|
+
file_name="my_task",
|
|
148
|
+
spider_id="universal",
|
|
149
|
+
spider_name="universal",
|
|
150
|
+
parameters={"url": "https://example.com"}
|
|
151
|
+
)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
**Video Download (New):**
|
|
155
|
+
```python
|
|
156
|
+
from thordata import CommonSettings
|
|
157
|
+
|
|
158
|
+
task_id = client.create_video_task(
|
|
159
|
+
file_name="{{VideoID}}",
|
|
160
|
+
spider_id="youtube_video_by-url",
|
|
161
|
+
spider_name="youtube.com",
|
|
162
|
+
parameters={"url": "https://youtube.com/watch?v=..."},
|
|
163
|
+
common_settings=CommonSettings(resolution="1080p")
|
|
164
|
+
)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
**Wait & Download:**
|
|
168
|
+
```python
|
|
169
|
+
status = client.wait_for_task(task_id)
|
|
170
|
+
if status == "ready":
|
|
171
|
+
url = client.get_task_result(task_id)
|
|
172
|
+
print(url)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Account Management
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
# Usage Statistics
|
|
179
|
+
stats = client.get_usage_statistics("2024-01-01", "2024-01-31")
|
|
180
|
+
print(f"Balance: {stats.balance_gb():.2f} GB")
|
|
181
|
+
|
|
182
|
+
# Proxy Users
|
|
183
|
+
users = client.list_proxy_users()
|
|
184
|
+
print(f"Sub-users: {users.user_count}")
|
|
185
|
+
|
|
186
|
+
# Whitelist IP
|
|
187
|
+
client.add_whitelist_ip("1.2.3.4")
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Proxy Network
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
from thordata import ProxyConfig
|
|
194
|
+
|
|
195
|
+
# Generate Proxy URL
|
|
196
|
+
proxy_url = client.build_proxy_url(
|
|
197
|
+
username="proxy_user",
|
|
198
|
+
password="proxy_pass",
|
|
199
|
+
country="us",
|
|
200
|
+
city="ny"
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
# Use with requests
|
|
204
|
+
import requests
|
|
205
|
+
requests.get("https://httpbin.org/ip", proxies={"http": proxy_url, "https": proxy_url})
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## 📄 License
|
|
211
|
+
|
|
212
|
+
MIT License
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
thordata/__init__.py,sha256=yaIxW1T_nsCeiPE6iIHunjRzPrtbiN0BciveICgL4dM,3195
|
|
2
|
+
thordata/_utils.py,sha256=epF-ewHyk7McdejlhHNAfxhIQ8sN3TlIjUJ9H4HOaUE,5254
|
|
3
|
+
thordata/async_client.py,sha256=tC9y1wmcO6RsXCysBo0a0GNRZR3QQjJlCmEwG5HVukQ,53169
|
|
4
|
+
thordata/client.py,sha256=VN5Jm3er7fdZDfT2G9g4siBSYNo0ZWj4WOi6TAiAZcE,59638
|
|
5
|
+
thordata/demo.py,sha256=zmG4I4cHXnbmQfbr063SeRK7_9IXrfof9QFoGqGTVm8,3806
|
|
6
|
+
thordata/enums.py,sha256=MpZnS9_8sg2vtcFqM6UicB94cKZm5R1t83L3ejNSbLs,8502
|
|
7
|
+
thordata/exceptions.py,sha256=IgMsFuh49cPxU5YofsKP1UhP5A_snhtuN6xD1yZWLiI,10018
|
|
8
|
+
thordata/models.py,sha256=NG4wn1bq4-FC4Aex8vwBOldiHovwg0JzhdtBsI1mL_8,36118
|
|
9
|
+
thordata/retry.py,sha256=nkh17ca2TIEcTc-uNo-xcNdJPuxZ_VGlMbC70X6p-_Q,11518
|
|
10
|
+
thordata_sdk-0.8.0.dist-info/licenses/LICENSE,sha256=bAxpWgQIzb-5jl3nhLdOwOJ_vlbHLtSG7yev2B7vioY,1088
|
|
11
|
+
thordata_sdk-0.8.0.dist-info/METADATA,sha256=IgL554I6mzya9FdbqCxKdvO3r-bywiHJjZi1xdk8W48,5850
|
|
12
|
+
thordata_sdk-0.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
+
thordata_sdk-0.8.0.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
|
|
14
|
+
thordata_sdk-0.8.0.dist-info/RECORD,,
|
thordata/parameters.py
DELETED
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
# src/thordata/parameters.py
|
|
2
|
-
|
|
3
|
-
from typing import Any, Dict
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
|
|
7
|
-
"""
|
|
8
|
-
Normalizes parameters across different search engines to ensure a unified API surface.
|
|
9
|
-
|
|
10
|
-
Args:
|
|
11
|
-
engine (str): The search engine to use (e.g., 'google', 'yandex').
|
|
12
|
-
query (str): The search query string.
|
|
13
|
-
**kwargs: Additional parameters to pass to the API.
|
|
14
|
-
|
|
15
|
-
Returns:
|
|
16
|
-
Dict[str, Any]: The constructed payload for the API request.
|
|
17
|
-
"""
|
|
18
|
-
# 1. Base parameters
|
|
19
|
-
payload = {
|
|
20
|
-
"num": str(kwargs.get("num", 10)), # Default to 10 results
|
|
21
|
-
"json": "1", # Force JSON response
|
|
22
|
-
"engine": engine,
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
# 2. Handle Query Parameter Differences (Yandex uses 'text', others use 'q')
|
|
26
|
-
if engine == "yandex":
|
|
27
|
-
payload["text"] = query
|
|
28
|
-
# Set default URL for Yandex if not provided
|
|
29
|
-
if "url" not in kwargs:
|
|
30
|
-
payload["url"] = "yandex.com"
|
|
31
|
-
else:
|
|
32
|
-
payload["q"] = query
|
|
33
|
-
|
|
34
|
-
# 3. Handle Default URLs for other engines
|
|
35
|
-
if "url" not in kwargs:
|
|
36
|
-
defaults = {
|
|
37
|
-
"google": "google.com",
|
|
38
|
-
"bing": "bing.com",
|
|
39
|
-
"duckduckgo": "duckduckgo.com",
|
|
40
|
-
"baidu": "baidu.com",
|
|
41
|
-
}
|
|
42
|
-
if engine in defaults:
|
|
43
|
-
payload["url"] = defaults[engine]
|
|
44
|
-
|
|
45
|
-
# 4. Passthrough for all other user-provided arguments
|
|
46
|
-
# This allows support for engine-specific parameters (e.g., tbm, uule, gl)
|
|
47
|
-
# without explicitly defining them all.
|
|
48
|
-
protected_keys = {"num", "engine", "q", "text"}
|
|
49
|
-
for key, value in kwargs.items():
|
|
50
|
-
if key not in protected_keys:
|
|
51
|
-
payload[key] = value
|
|
52
|
-
|
|
53
|
-
return payload
|