thordata-sdk 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +1 -1
- thordata/_example_utils.py +3 -2
- thordata/_utils.py +4 -4
- thordata/async_client.py +80 -79
- thordata/client.py +69 -72
- thordata/demo.py +1 -3
- thordata/exceptions.py +12 -12
- thordata/models.py +67 -70
- thordata/retry.py +13 -13
- thordata_sdk-1.1.0.dist-info/METADATA +271 -0
- thordata_sdk-1.1.0.dist-info/RECORD +15 -0
- thordata_sdk-1.0.1.dist-info/METADATA +0 -208
- thordata_sdk-1.0.1.dist-info/RECORD +0 -15
- {thordata_sdk-1.0.1.dist-info → thordata_sdk-1.1.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-1.0.1.dist-info → thordata_sdk-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.0.1.dist-info → thordata_sdk-1.1.0.dist-info}/top_level.txt +0 -0
thordata/models.py
CHANGED
@@ -26,13 +26,10 @@ from __future__ import annotations

 import json
 import re
-import ssl
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any
-
-import urllib3
+from typing import Any

 # =============================================================================
 # Proxy Product Types
@@ -111,21 +108,21 @@ class ProxyConfig:

     username: str
     password: str
-    product:
-    host:
-    port:
+    product: ProxyProduct | str = ProxyProduct.RESIDENTIAL
+    host: str | None = None
+    port: int | None = None
     protocol: str = "http"

     # Geo-targeting
-    continent:
-    country:
-    state:
-    city:
-    asn:
+    continent: str | None = None
+    country: str | None = None
+    state: str | None = None
+    city: str | None = None
+    asn: str | None = None

     # Session control
-    session_id:
-    session_duration:
+    session_id: str | None = None
+    session_duration: int | None = None  # minutes, 1-90

     # Valid continent codes
     VALID_CONTINENTS = {"af", "an", "as", "eu", "na", "oc", "sa"}
@@ -245,7 +242,7 @@ class ProxyConfig:
         """Basic auth string 'username:password' for Proxy-Authorization."""
         return f"{self.build_username()}:{self.password}"

-    def to_proxies_dict(self) ->
+    def to_proxies_dict(self) -> dict[str, str]:
         """
         Build a proxies dict suitable for the requests library.

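With the return type now `dict[str, str]`, the mapping from `to_proxies_dict()` drops straight into `requests`. A minimal sketch, assuming placeholder credentials (the field names come from the `ProxyConfig` hunk above):

```python
import requests

from thordata import ProxyConfig

# Placeholder credentials; geo-targeting fields are from the hunk above.
config = ProxyConfig(username="user", password="pass", country="us")

# to_proxies_dict() builds {"http": ..., "https": ...} for requests.
response = requests.get(
    "https://httpbin.org/ip",
    proxies=config.to_proxies_dict(),
    timeout=30,
)
print(response.json())
```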
@@ -300,7 +297,7 @@ class WhitelistProxyConfig:
     def build_proxy_url(self) -> str:
         return f"{self.protocol}://{self.host}:{self.port}"

-    def to_proxies_dict(self) ->
+    def to_proxies_dict(self) -> dict[str, str]:
         url = self.build_proxy_url()
         return {"http": url, "https": url}

@@ -358,7 +355,7 @@ class StaticISPProxy:
             f"{self.protocol}://{self.username}:{self.password}@{self.host}:{self.port}"
         )

-    def to_proxies_dict(self) ->
+    def to_proxies_dict(self) -> dict[str, str]:
         """
         Build a proxies dict suitable for the requests library.

@@ -387,7 +384,7 @@ class StaticISPProxy:
         ) from e

     @classmethod
-    def from_env(cls) ->
+    def from_env(cls) -> StaticISPProxy:
         """
         Create StaticISPProxy from environment variables.

@@ -513,39 +510,39 @@ class SerpRequest:
     start: int = 0

     # Localization
-    country:
-    language:
-    google_domain:
-    countries_filter:
-    languages_filter:
+    country: str | None = None  # 'gl' for Google
+    language: str | None = None  # 'hl' for Google
+    google_domain: str | None = None
+    countries_filter: str | None = None  # 'cr' parameter
+    languages_filter: str | None = None  # 'lr' parameter

     # Geo-targeting
-    location:
-    uule:
+    location: str | None = None
+    uule: str | None = None  # Encoded location

     # Search type
-    search_type:
+    search_type: str | None = None  # tbm parameter (isch, shop, nws, vid, ...)

     # Filters
-    safe_search:
-    time_filter:
+    safe_search: bool | None = None
+    time_filter: str | None = None  # tbs parameter (time part)
     no_autocorrect: bool = False  # nfpr parameter
-    filter_duplicates:
+    filter_duplicates: bool | None = None  # filter parameter

     # Device & Rendering
-    device:
-    render_js:
-    no_cache:
+    device: str | None = None  # 'desktop', 'mobile', 'tablet'
+    render_js: bool | None = None  # render_js parameter
+    no_cache: bool | None = None  # no_cache parameter

     # Output format
     output_format: str = "json"  # 'json' or 'html'

     # Advanced Google parameters
-    ludocid:
-    kgmid:
+    ludocid: str | None = None  # Google Place ID
+    kgmid: str | None = None  # Knowledge Graph ID

     # Pass-through
-    extra_params:
+    extra_params: dict[str, Any] = field(default_factory=dict)

     # Search type mappings for tbm parameter
     SEARCH_TYPE_MAP = {
@@ -578,7 +575,7 @@ class SerpRequest:
         "baidu": "baidu.com",
     }

-    def to_payload(self) ->
+    def to_payload(self) -> dict[str, Any]:
         """
         Convert to API request payload.

@@ -587,7 +584,7 @@ class SerpRequest:
         """
         engine = self.engine.lower()

-        payload:
+        payload: dict[str, Any] = {
             "engine": engine,
             "num": str(self.num),
         }
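As a sketch of the payload contract: the hunk above guarantees that `to_payload()` lowercases the engine and stringifies `num`; the other constructor arguments follow the README examples further down, so treat them as illustrative rather than exhaustive:

```python
from thordata import SerpRequest

request = SerpRequest(
    query="python tutorial",
    engine="Google",   # lowercased by to_payload()
    num=10,
    country="us",      # sent as Google's 'gl' parameter (per the field comment)
    language="en",     # sent as 'hl'
)

payload = request.to_payload()
assert payload["engine"] == "google"  # engine.lower()
assert payload["num"] == "10"         # str(self.num)
```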
@@ -722,14 +719,14 @@ class UniversalScrapeRequest:
     url: str
     js_render: bool = False
     output_format: str = "html"  # 'html' or 'png'
-    country:
-    block_resources:
-    clean_content:
-    wait:
-    wait_for:
-    headers:
-    cookies:
-    extra_params:
+    country: str | None = None
+    block_resources: str | None = None  # e.g., 'script', 'image', 'script,image'
+    clean_content: str | None = None  # e.g., 'js', 'css', 'js,css'
+    wait: int | None = None  # Milliseconds, max 100000
+    wait_for: str | None = None  # CSS selector
+    headers: list[dict[str, str]] | None = None  # [{"name": "...", "value": "..."}]
+    cookies: list[dict[str, str]] | None = None  # [{"name": "...", "value": "..."}]
+    extra_params: dict[str, Any] = field(default_factory=dict)  # mutable default must use field()

     def __post_init__(self) -> None:
         """Validate configuration."""
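The comment on `extra_params` (translated from Chinese above) points at a dataclass rule worth spelling out: a mutable default such as `{}` must go through `field(default_factory=...)`, otherwise `@dataclass` raises at class-definition time. A standalone illustration, not SDK code:

```python
from dataclasses import dataclass, field
from typing import Any


@dataclass
class Example:
    # Writing `extra: dict[str, Any] = {}` would raise
    # "ValueError: mutable default ... use default_factory".
    extra: dict[str, Any] = field(default_factory=dict)


a, b = Example(), Example()
a.extra["k"] = "v"
assert b.extra == {}  # each instance gets its own dict
```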
@@ -745,14 +742,14 @@ class UniversalScrapeRequest:
                 f"wait must be between 0 and 100000 milliseconds, got {self.wait}"
             )

-    def to_payload(self) ->
+    def to_payload(self) -> dict[str, Any]:
         """
         Convert to API request payload.

         Returns:
             Dictionary ready to be sent to the Universal API.
         """
-        payload:
+        payload: dict[str, Any] = {
             "url": self.url,
             "js_render": "True" if self.js_render else "False",
             "type": self.output_format.lower(),
@@ -820,18 +817,18 @@ class ScraperTaskConfig:
     file_name: str
     spider_id: str
     spider_name: str
-    parameters:
-    universal_params:
+    parameters: dict[str, Any]
+    universal_params: dict[str, Any] | None = None
     include_errors: bool = True

-    def to_payload(self) ->
+    def to_payload(self) -> dict[str, Any]:
         """
         Convert to API request payload.

         Returns:
             Dictionary ready to be sent to the Web Scraper API.
         """
-        payload:
+        payload: dict[str, Any] = {
             "file_name": self.file_name,
             "spider_id": self.spider_id,
             "spider_name": self.spider_name,
@@ -877,21 +874,21 @@ class CommonSettings:
     """

     # Video settings
-    resolution:
+    resolution: str | None = None

     # Audio settings
-    audio_format:
-    bitrate:
+    audio_format: str | None = None
+    bitrate: str | None = None

     # Subtitle settings (used by both video and audio)
-    is_subtitles:
-    subtitles_language:
+    is_subtitles: str | None = None
+    subtitles_language: str | None = None

     # Valid values for validation
     VALID_RESOLUTIONS = {"360p", "480p", "720p", "1080p", "1440p", "2160p"}
     VALID_AUDIO_FORMATS = {"opus", "mp3"}

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary, excluding None values."""
         result = {}
         if self.resolution is not None:
@@ -943,18 +940,18 @@ class VideoTaskConfig:
     file_name: str
     spider_id: str
     spider_name: str
-    parameters:
+    parameters: dict[str, Any]
     common_settings: CommonSettings
     include_errors: bool = True

-    def to_payload(self) ->
+    def to_payload(self) -> dict[str, Any]:
         """
         Convert to API request payload.

         Returns:
             Dictionary ready to be sent to the video_builder API.
         """
-        payload:
+        payload: dict[str, Any] = {
             "file_name": self.file_name,
             "spider_id": self.spider_id,
             "spider_name": self.spider_name,
@@ -984,8 +981,8 @@ class TaskStatusResponse:

     task_id: str
     status: str
-    progress:
-    message:
+    progress: int | None = None
+    message: str | None = None

     def is_complete(self) -> bool:
         """Check if the task has completed (success or failure)."""
@@ -1027,10 +1024,10 @@ class UsageStatistics:
     traffic_balance: float
     query_days: int
     range_usage_traffic: float
-    data:
+    data: list[dict[str, Any]]

     @classmethod
-    def from_dict(cls, data:
+    def from_dict(cls, data: dict[str, Any]) -> UsageStatistics:
         """Create from API response dict."""
         return cls(
             total_usage_traffic=float(data.get("total_usage_traffic", 0)),
@@ -1073,7 +1070,7 @@ class ProxyUser:
     usage_traffic: float

     @classmethod
-    def from_dict(cls, data:
+    def from_dict(cls, data: dict[str, Any]) -> ProxyUser:
         """Create from API response dict."""
         return cls(
             username=data.get("username", ""),
@@ -1109,10 +1106,10 @@ class ProxyUserList:
     limit: float
     remaining_limit: float
     user_count: int
-    users:
+    users: list[ProxyUser]

     @classmethod
-    def from_dict(cls, data:
+    def from_dict(cls, data: dict[str, Any]) -> ProxyUserList:
         """Create from API response dict."""
         user_list = data.get("list", [])
         users = [ProxyUser.from_dict(u) for u in user_list]
@@ -1143,11 +1140,11 @@ class ProxyServer:
     port: int
     username: str
     password: str
-    expiration_time:
-    region:
+    expiration_time: int | str | None = None
+    region: str | None = None

     @classmethod
-    def from_dict(cls, data:
+    def from_dict(cls, data: dict[str, Any]) -> ProxyServer:
         """Create from API response dict."""
         return cls(
             ip=data.get("ip", ""),
thordata/retry.py
CHANGED
@@ -22,7 +22,7 @@ import random
 import time
 from dataclasses import dataclass, field
 from functools import wraps
-from typing import Any, Callable
+from typing import Any, Callable

 from .exceptions import (
     ThordataNetworkError,
@@ -64,15 +64,15 @@ class RetryConfig:
     jitter_factor: float = 0.1

     # Status codes to retry on (5xx server errors + 429 rate limit)
-    retry_on_status_codes:
+    retry_on_status_codes: set[int] = field(
         default_factory=lambda: {429, 500, 502, 503, 504}
     )
-    retry_on_api_codes:
+    retry_on_api_codes: set[int] = field(
         default_factory=lambda: {300}  # API response body code
     )

     # Exception types to always retry on
-    retry_on_exceptions:
+    retry_on_exceptions: tuple[type, ...] = field(
         default_factory=lambda: (
             ThordataNetworkError,
             ThordataServerError,
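`RetryConfig` now carries typed `set[int]`/`tuple[type, ...]` defaults alongside `backoff_factor` and `jitter_factor`. The diff never shows the delay formula itself (only its `return delay` in the next hunk), so the following is a conventional capped-exponential-backoff-with-jitter reading of those fields; `max_delay` and the exact arithmetic are assumptions:

```python
import random


def compute_delay(attempt: int, backoff_factor: float = 0.5,
                  max_delay: float = 60.0, jitter_factor: float = 0.1) -> float:
    """Hypothetical reconstruction; not the SDK's actual implementation."""
    delay = min(backoff_factor * (2 ** attempt), max_delay)  # exponential, capped
    delay += delay * jitter_factor * random.random()         # jitter avoids retry bursts
    return delay
```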
@@ -104,7 +104,7 @@ class RetryConfig:
         return delay

     def should_retry(
-        self, exception: Exception, attempt: int, status_code:
+        self, exception: Exception, attempt: int, status_code: int | None = None
     ) -> bool:
         """
         Determine if a request should be retried.
@@ -138,8 +138,8 @@ class RetryConfig:


 def with_retry(
-    config:
-    on_retry:
+    config: RetryConfig | None = None,
+    on_retry: Callable[[int, Exception, float], None] | None = None,
 ) -> Callable:
     """
     Decorator to add retry logic to a function.
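A usage sketch for the decorator, with the callback shape read off the `on_retry` annotation above; whether `with_retry` is re-exported from the package root is not shown in this diff, so the import uses the module path:

```python
from thordata import RetryConfig
from thordata.retry import with_retry


def log_retry(attempt: int, exc: Exception, delay: float) -> None:
    # Matches Callable[[int, Exception, float], None] from the signature above.
    print(f"retry #{attempt} after {exc!r}; sleeping {delay:.1f}s")


@with_retry(config=RetryConfig(max_retries=3), on_retry=log_retry)
def fetch_page() -> str:
    ...  # any call that can raise a retryable Thordata error
```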
@@ -168,7 +168,7 @@ def with_retry(
     def decorator(func: Callable) -> Callable:
         @wraps(func)
         def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
-            last_exception:
+            last_exception: Exception | None = None

             for attempt in range(config.max_retries + 1):
                 try:
@@ -202,7 +202,7 @@ def with_retry(

         @wraps(func)
         async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
-            last_exception:
+            last_exception: Exception | None = None

             for attempt in range(config.max_retries + 1):
                 try:
@@ -244,7 +244,7 @@ def with_retry(
     return decorator


-def _extract_status_code(exception: Exception) ->
+def _extract_status_code(exception: Exception) -> int | None:
     """
     Extract HTTP status code from various exception types.

@@ -302,10 +302,10 @@ class RetryableRequest:
     ...         retry.wait()
     """

-    def __init__(self, config:
+    def __init__(self, config: RetryConfig | None = None) -> None:
         self.config = config or RetryConfig()
         self.attempt = 0
-        self.last_exception:
+        self.last_exception: Exception | None = None

     def __enter__(self) -> RetryableRequest:
         return self
@@ -314,7 +314,7 @@ class RetryableRequest:
         pass

     def should_continue(
-        self, exception: Exception, status_code:
+        self, exception: Exception, status_code: int | None = None
     ) -> bool:
         """
         Check if we should continue retrying.
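Putting `should_continue()` together with the `retry.wait()` seen in the class docstring above, a manual retry loop could look like the following sketch; `wait()` is assumed to sleep for the computed backoff delay:

```python
import requests

from thordata.retry import RetryableRequest, RetryConfig

with RetryableRequest(RetryConfig(max_retries=3)) as retry:
    while True:
        try:
            response = requests.get("https://httpbin.org/status/503", timeout=10)
            response.raise_for_status()
            break
        except requests.RequestException as exc:
            status = exc.response.status_code if exc.response is not None else None
            if not retry.should_continue(exc, status):
                raise
            retry.wait()  # assumed to sleep per the RetryConfig backoff
```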
thordata_sdk-1.1.0.dist-info/METADATA
ADDED
Metadata-Version: 2.4
Name: thordata-sdk
Version: 1.1.0
Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
Author-email: Thordata Developer Team <support@thordata.com>
License: MIT
Project-URL: Homepage, https://www.thordata.com
Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
Project-URL: Changelog, https://github.com/Thordata/thordata-python-sdk/blob/main/CHANGELOG.md
Keywords: web scraping,proxy,residential proxy,datacenter proxy,ai,llm,data-mining,serp,thordata,web scraper,anti-bot bypass
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Internet :: Proxy Servers
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Typing :: Typed
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: requests>=2.25.0
Requires-Dist: aiohttp>=3.9.0
Provides-Extra: dev
Requires-Dist: pytest>=7.0.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
Requires-Dist: pytest-httpserver>=1.0.0; extra == "dev"
Requires-Dist: python-dotenv>=1.0.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"
Requires-Dist: mypy>=1.0.0; extra == "dev"
Requires-Dist: types-requests>=2.28.0; extra == "dev"
Requires-Dist: aioresponses>=0.7.6; extra == "dev"
Dynamic: license-file

# Thordata Python SDK

<div align="center">

**Official Python Client for Thordata APIs**

*Proxy Network • SERP API • Web Unlocker • Web Scraper API*

[PyPI](https://pypi.org/project/thordata-sdk/) · [License](LICENSE)

</div>

---

## 📦 Installation

```bash
pip install thordata-sdk
```

Optional dependency for the Scraping Browser examples:

```bash
pip install playwright
```

## 🔐 Configuration

Set the following environment variables (recommended):

```bash
# Required for SERP, Universal, and Proxy Network
export THORDATA_SCRAPER_TOKEN="your_scraper_token"

# Required for Web Scraper Tasks & Account Management
export THORDATA_PUBLIC_TOKEN="your_public_token"
export THORDATA_PUBLIC_KEY="your_public_key"

# Optional: Default Proxy Credentials (for Proxy Network)
export THORDATA_RESIDENTIAL_USERNAME="user"
export THORDATA_RESIDENTIAL_PASSWORD="pass"
```

## 🚀 Quick Start

```python
from thordata import ThordataClient

# Initialize (credentials loaded from env)
client = ThordataClient(scraper_token="...")

# 1. SERP Search
print("--- SERP Search ---")
results = client.serp_search("python tutorial", engine="google")
print(f"Title: {results['organic'][0]['title']}")

# 2. Universal Scrape (Web Unlocker)
print("\n--- Universal Scrape ---")
html = client.universal_scrape("https://httpbin.org/html")
print(f"HTML Length: {len(html)}")
```

## 📚 Core Features

### 🌐 Proxy Network

Easily generate proxy URLs with geo-targeting and sticky sessions. The SDK handles connection pooling automatically.

```python
from thordata import ProxyConfig, ProxyProduct

# Create a proxy configuration
proxy = ProxyConfig(
    username="user",
    password="pass",
    product=ProxyProduct.RESIDENTIAL,
    country="us",
    city="new_york",
    session_id="session123",
    session_duration=10  # Sticky for 10 mins
)

# Use with the client (high performance)
response = client.get("https://httpbin.org/ip", proxy_config=proxy)
print(response.json())

# Or get the URL string for other libs (requests, scrapy, etc.)
proxy_url = proxy.build_proxy_url()
print(f"Proxy URL: {proxy_url}")
```

### 🔍 SERP API

Real-time search results from Google, Bing, Yandex, etc.

```python
from thordata import SerpRequest, Engine

# Simple
results = client.serp_search(
    query="pizza near me",
    engine=Engine.GOOGLE_MAPS,
    country="us"
)

# Advanced (Strongly Typed)
request = SerpRequest(
    query="AI news",
    engine="google_news",
    num=50,
    time_filter="week",
    location="San Francisco",
    render_js=True
)
results = client.serp_search_advanced(request)
```

### 🔓 Universal Scraping API (Web Unlocker)

Bypass Cloudflare, CAPTCHAs, and anti-bot systems.

```python
html = client.universal_scrape(
    url="https://example.com/protected",
    js_render=True,
    wait_for=".content",
    country="gb",
    output_format="html"
)
```

### 🕷️ Web Scraper API (Async Tasks)

Manage asynchronous scraping tasks for massive scale.

```python
# 1. Create Task
task_id = client.create_scraper_task(
    file_name="my_task",
    spider_id="universal",
    spider_name="universal",
    parameters={"url": "https://example.com"}
)
print(f"Task Created: {task_id}")

# 2. Wait for Completion
status = client.wait_for_task(task_id, max_wait=600)

# 3. Get Result
if status == "ready":
    download_url = client.get_task_result(task_id)
    print(f"Result: {download_url}")
```

### 📹 Video/Audio Tasks

Download content from YouTube and other supported platforms.

```python
from thordata import CommonSettings

task_id = client.create_video_task(
    file_name="video_{{VideoID}}",
    spider_id="youtube_video_by-url",
    spider_name="youtube.com",
    parameters={"url": "https://youtube.com/watch?v=..."},
    common_settings=CommonSettings(resolution="1080p")
)
```

### 📊 Account Management

Access usage statistics, manage sub-users, and whitelist IPs.

```python
# Get Usage Stats
stats = client.get_usage_statistics("2024-01-01", "2024-01-31")
print(f"Balance: {stats.balance_gb():.2f} GB")

# List Proxy Users
users = client.list_proxy_users()
print(f"Active Sub-users: {users.user_count}")

# Whitelist IP
client.add_whitelist_ip("1.2.3.4")
```

## ⚙️ Advanced Usage

### Async Client

For high-concurrency applications, use `AsyncThordataClient`.

```python
import asyncio
from thordata import AsyncThordataClient

async def main():
    async with AsyncThordataClient(scraper_token="...") as client:
        # SERP
        results = await client.serp_search("async python")

        # Universal
        html = await client.universal_scrape("https://example.com")

asyncio.run(main())
```

Note: `AsyncThordataClient` does not support HTTPS proxy tunneling (TLS-in-TLS) due to `aiohttp` limitations. For proxy network requests, use the sync client.

### Custom Retry Configuration

```python
from thordata import RetryConfig

retry = RetryConfig(
    max_retries=5,
    backoff_factor=1.5,
    retry_on_status_codes={429, 500, 502, 503, 504}
)

client = ThordataClient(..., retry_config=retry)
```

## 📄 License

MIT License