thordata-sdk 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +4 -40
- thordata/async_client.py +503 -1796
- thordata/client.py +444 -1322
- thordata/core/__init__.py +23 -0
- thordata/core/async_http_client.py +91 -0
- thordata/core/http_client.py +79 -0
- thordata/core/tunnel.py +287 -0
- thordata/enums.py +41 -380
- thordata/exceptions.py +70 -19
- thordata/models.py +37 -1193
- thordata/retry.py +1 -1
- thordata/tools/__init__.py +38 -0
- thordata/tools/base.py +42 -0
- thordata/tools/code.py +39 -0
- thordata/tools/ecommerce.py +251 -0
- thordata/tools/professional.py +155 -0
- thordata/tools/search.py +115 -0
- thordata/tools/social.py +374 -0
- thordata/tools/travel.py +100 -0
- thordata/tools/video.py +154 -0
- thordata/types/__init__.py +77 -0
- thordata/types/common.py +141 -0
- thordata/types/proxy.py +340 -0
- thordata/types/serp.py +224 -0
- thordata/types/task.py +156 -0
- thordata/types/universal.py +66 -0
- thordata/unlimited.py +67 -0
- thordata_sdk-1.6.0.dist-info/METADATA +287 -0
- thordata_sdk-1.6.0.dist-info/RECORD +35 -0
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.6.0.dist-info}/WHEEL +1 -1
- thordata/_example_utils.py +0 -77
- thordata/demo.py +0 -138
- thordata_sdk-1.4.0.dist-info/METADATA +0 -208
- thordata_sdk-1.4.0.dist-info/RECORD +0 -18
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.4.0.dist-info → thordata_sdk-1.6.0.dist-info}/top_level.txt +0 -0
thordata/types/task.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Web Scraper Task types.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from .common import CommonSettings, ThordataBaseConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TaskStatus(str, Enum):
    """Lifecycle states reported for a Web Scraper task.

    Mixes in ``str`` so members compare equal to their raw API values
    (``TaskStatus.READY == "ready"``). Note, however, that ``Enum``
    hashes members by *name*, so a raw string is NOT found by set
    membership against members. The predicates below therefore
    normalize their argument first, accepting either a member or its
    raw string value; unrecognized strings simply yield ``False``.
    """

    PENDING = "pending"
    RUNNING = "running"
    READY = "ready"
    SUCCESS = "success"
    FINISHED = "finished"
    FAILED = "failed"
    ERROR = "error"
    CANCELLED = "cancelled"
    UNKNOWN = "unknown"

    @classmethod
    def _coerce(cls, status: "TaskStatus | str") -> "TaskStatus | None":
        # Map a member or raw API string to a member; None if unknown,
        # so membership tests below fall through to False.
        try:
            return cls(status)
        except ValueError:
            return None

    @classmethod
    def is_terminal(cls, status: "TaskStatus | str") -> bool:
        """Return True if *status* means the task will not change again."""
        return cls._coerce(status) in {
            cls.READY,
            cls.SUCCESS,
            cls.FINISHED,
            cls.FAILED,
            cls.ERROR,
            cls.CANCELLED,
        }

    @classmethod
    def is_success(cls, status: "TaskStatus | str") -> bool:
        """Return True if *status* is a successful terminal state."""
        return cls._coerce(status) in {cls.READY, cls.SUCCESS, cls.FINISHED}

    @classmethod
    def is_failure(cls, status: "TaskStatus | str") -> bool:
        """Return True if *status* is a failed terminal state."""
        return cls._coerce(status) in {cls.FAILED, cls.ERROR}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DataFormat(str, Enum):
    """File formats in which scraper task results can be downloaded."""

    JSON = "json"
    CSV = "csv"
    XLSX = "xlsx"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class ScraperTaskConfig(ThordataBaseConfig):
    """Configuration for creating a Web Scraper task.

    Fields map one-to-one onto the task-creation form payload; see
    :meth:`to_payload` for the exact serialization.
    """

    file_name: str
    spider_id: str
    spider_name: str
    # A single parameter set, or a list of sets for a batch task.
    parameters: dict[str, Any] | list[dict[str, Any]]
    universal_params: dict[str, Any] | None = None
    include_errors: bool = True

    def to_payload(self) -> dict[str, Any]:
        """Serialize this config into the form-field payload the API expects."""
        # The API wants a JSON-encoded *list* of parameter sets, so a
        # single dict is wrapped before encoding.
        raw = self.parameters
        batch = raw if isinstance(raw, list) else [raw]

        payload: dict[str, Any] = {
            "file_name": self.file_name,
            "spider_id": self.spider_id,
            "spider_name": self.spider_name,
            "spider_parameters": json.dumps(batch),
            "spider_errors": "true" if self.include_errors else "false",
        }
        if self.universal_params:
            payload["spider_universal"] = json.dumps(self.universal_params)
        return payload
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class VideoTaskConfig(ThordataBaseConfig):
    """Configuration for a video-platform scraper task.

    Unlike :class:`ScraperTaskConfig`, the common media settings are
    required here and are always sent under ``spider_universal``.
    """

    file_name: str
    spider_id: str
    spider_name: str
    # A single parameter set, or a list of sets for a batch task.
    parameters: dict[str, Any] | list[dict[str, Any]]
    common_settings: CommonSettings
    include_errors: bool = True

    def to_payload(self) -> dict[str, Any]:
        """Serialize this config into the form-field payload the API expects."""
        params = self.parameters
        if not isinstance(params, list):
            # The API expects a JSON list even for a single parameter set.
            params = [params]

        return {
            "file_name": self.file_name,
            "spider_id": self.spider_id,
            "spider_name": self.spider_name,
            "spider_parameters": json.dumps(params),
            "spider_errors": "true" if self.include_errors else "false",
            # NOTE(review): some docs mention a 'common_settings' key for
            # video tasks; keeping 'spider_universal' to match the original
            # models.py logic for stability — confirm against the API.
            "spider_universal": self.common_settings.to_json(),
        }
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@dataclass
class TaskStatusResponse:
    """Status snapshot of a scraper task as reported by the API."""

    task_id: str
    # Raw status string from the API; compared case-insensitively below.
    status: str
    progress: int | None = None
    message: str | None = None

    def is_complete(self) -> bool:
        """Return True once the task has reached any terminal state."""
        normalized = self.status.lower()
        return normalized in (
            "ready",
            "success",
            "finished",
            "failed",
            "error",
            "cancelled",
        )

    def is_success(self) -> bool:
        """Return True if the task ended in a successful state."""
        return self.status.lower() in ("ready", "success", "finished")
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass
class UsageStatistics:
    """Aggregated traffic-usage report returned by the statistics API."""

    total_usage_traffic: float
    traffic_balance: float
    query_days: int
    range_usage_traffic: float
    # Per-interval breakdown rows, passed through from the API verbatim.
    data: list[dict[str, Any]]

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "UsageStatistics":
        """Build an instance from a raw API dict, defaulting absent fields to zero/empty."""
        get = data.get
        return cls(
            total_usage_traffic=float(get("total_usage_traffic", 0)),
            traffic_balance=float(get("traffic_balance", 0)),
            query_days=int(get("query_days", 0)),
            range_usage_traffic=float(get("range_usage_traffic", 0)),
            data=get("data", []),
        )

    def total_usage_gb(self) -> float:
        """Total usage converted by a fixed 1024**2 factor.

        NOTE(review): assumes the API reports traffic in KB so that
        dividing by 1024*1024 yields GB — confirm the unit with the API docs.
        """
        return self.total_usage_traffic / (1024 * 1024)

    def balance_gb(self) -> float:
        """Remaining balance, same unit assumption as :meth:`total_usage_gb`."""
        return self.traffic_balance / (1024 * 1024)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Universal Scraping (Web Unlocker) types.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from .common import ThordataBaseConfig
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class UniversalScrapeRequest(ThordataBaseConfig):
    """Request parameters for the Universal Scraping (Web Unlocker) API.

    Validates its inputs on construction and flattens everything into a
    string-valued form payload via :meth:`to_payload`.
    """

    url: str
    js_render: bool = False
    output_format: str = "html"  # 'html' or 'png'
    country: str | None = None
    block_resources: str | None = None  # comma list, e.g. 'script,image'
    clean_content: str | None = None  # comma list, e.g. 'js,css'
    wait: int | None = None  # fixed wait, milliseconds
    wait_for: str | None = None  # CSS selector to wait for

    # Serialized to JSON strings in the payload.
    headers: list[dict[str, str]] | None = None
    cookies: list[dict[str, str]] | None = None

    extra_params: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Reject unsupported output formats and out-of-range waits.

        Raises:
            ValueError: If ``output_format`` is not 'html'/'png', or if
                ``wait`` falls outside 0..100000 ms.
        """
        valid_formats = {"html", "png"}
        if self.output_format.lower() not in valid_formats:
            raise ValueError(
                f"Invalid output_format: {self.output_format}. Must be one of: {valid_formats}"
            )

        if self.wait is not None and (self.wait < 0 or self.wait > 100000):
            raise ValueError("wait must be between 0 and 100000 milliseconds")

    def to_payload(self) -> dict[str, Any]:
        """Build the flat form payload; booleans and complex values are stringified."""
        payload: dict[str, Any] = {
            "url": self.url,
            # The API expects capitalized string booleans for this field.
            "js_render": "True" if self.js_render else "False",
            "type": self.output_format.lower(),
        }

        # Optional scalar fields: only emitted when set (empty strings and
        # empty lists are treated as unset, matching truthiness checks).
        optional: dict[str, Any] = {
            "country": self.country.lower() if self.country else None,
            "block_resources": self.block_resources or None,
            "clean_content": self.clean_content or None,
            "wait": str(self.wait) if self.wait is not None else None,
            "wait_for": self.wait_for or None,
            # Complex objects go over the wire as JSON strings.
            "headers": json.dumps(self.headers) if self.headers else None,
            "cookies": json.dumps(self.cookies) if self.cookies else None,
        }
        payload.update({k: v for k, v in optional.items() if v is not None})

        payload.update(self.extra_params)
        return payload
|
thordata/unlimited.py
CHANGED
|
@@ -100,3 +100,70 @@ class UnlimitedNamespace:
|
|
|
100
100
|
f"Action {endpoint} failed", code=data.get("code"), payload=data
|
|
101
101
|
)
|
|
102
102
|
return data.get("data", {})
|
|
103
|
+
|
|
104
|
+
def get_server_monitor(
    self,
    ins_id: str,
    region: str,
    start_time: int,
    end_time: int,
    period: int = 300,
) -> dict[str, Any]:
    """Fetch Unlimited Proxy server metrics (CPU, memory, TCP, bandwidth).

    Args:
        ins_id: Instance identifier.
        region: Region code of the instance.
        start_time: Start of the query window (epoch seconds, per SDK examples).
        end_time: End of the query window (epoch seconds, per SDK examples).
        period: Sampling period in seconds; defaults to 300.

    Returns:
        The ``data`` object from the API response, or an empty dict.
    """
    self._client._require_public_credentials()
    query = {
        "token": self._client.public_token,
        "key": self._client.public_key,
        "ins_id": ins_id,
        "region": region,
        "start_time": str(start_time),
        "end_time": str(end_time),
        "period": str(period),
    }
    # Endpoint: /api/unlimited/server-monitor
    response = self._client._api_request_with_retry(
        "GET", f"{self._api_base}/unlimited/server-monitor", params=query
    )
    response.raise_for_status()
    body = response.json()
    if body.get("code") != 200:
        raise_for_code(
            "Get server monitor failed", code=body.get("code"), payload=body
        )
    return body.get("data", {})
|
|
136
|
+
|
|
137
|
+
def get_balancing_monitor(
    self,
    ins_id: str,
    region: str,
    start_time: int,
    end_time: int,
    period: int = 300,
) -> dict[str, Any]:
    """Fetch Unlimited Residential Proxy load-balancer machine metrics.

    Args:
        ins_id: Instance identifier.
        region: Region code of the instance.
        start_time: Start of the query window (epoch seconds, per SDK examples).
        end_time: End of the query window (epoch seconds, per SDK examples).
        period: Sampling period in seconds; defaults to 300.

    Returns:
        The ``payload`` object from the API response, or an empty dict.
    """
    self._client._require_public_credentials()
    query = {
        "token": self._client.public_token,
        "key": self._client.public_key,
        "ins_id": ins_id,
        "region": region,
        "start_time": str(start_time),
        "end_time": str(end_time),
        "period": str(period),
    }
    response = self._client._api_request_with_retry(
        "GET", f"{self._api_base}/unlimited/balancing-monitor", params=query
    )
    response.raise_for_status()
    body = response.json()

    # Unlike its siblings, this endpoint reports 'status_code' at the
    # root; fall back to 'code' just in case.
    code = body.get("status_code", body.get("code"))
    if code != 200:
        raise_for_code("Get balancing monitor failed", code=code, payload=body)

    # The result also lives under 'payload' here rather than 'data'.
    return body.get("payload", {})
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: thordata-sdk
|
|
3
|
+
Version: 1.6.0
|
|
4
|
+
Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
|
|
5
|
+
Author-email: Thordata Developer Team <support@thordata.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://www.thordata.com
|
|
8
|
+
Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
|
|
9
|
+
Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
|
|
10
|
+
Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/Thordata/thordata-python-sdk/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: web scraping,proxy,residential proxy,datacenter proxy,ai,llm,data-mining,serp,thordata,web scraper,anti-bot bypass
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
16
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
+
Classifier: Topic :: Internet :: Proxy Servers
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
24
|
+
Classifier: Operating System :: OS Independent
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: >=3.9
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: requests>=2.25.0
|
|
30
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
31
|
+
Requires-Dist: PySocks>=1.7.1
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-httpserver>=1.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: black>=25.11.0; extra == "dev"
|
|
39
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
40
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: types-requests>=2.28.0; extra == "dev"
|
|
42
|
+
Requires-Dist: aioresponses>=0.7.6; extra == "dev"
|
|
43
|
+
Dynamic: license-file
|
|
44
|
+
|
|
45
|
+
# Thordata Python SDK
|
|
46
|
+
|
|
47
|
+
<div align="center">
|
|
48
|
+
|
|
49
|
+
<img src="https://img.shields.io/badge/Thordata-AI%20Infrastructure-blue?style=for-the-badge" alt="Thordata Logo">
|
|
50
|
+
|
|
51
|
+
**The Official Python Client for Thordata APIs**
|
|
52
|
+
|
|
53
|
+
*Proxy Network • SERP API • Web Unlocker • Web Scraper API*
|
|
54
|
+
|
|
55
|
+
[![PyPI version](https://img.shields.io/pypi/v/thordata-sdk.svg)](https://pypi.org/project/thordata-sdk/)
[![Python versions](https://img.shields.io/pypi/pyversions/thordata-sdk.svg)](https://pypi.org/project/thordata-sdk/)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
[![CI](https://github.com/Thordata/thordata-python-sdk/actions/workflows/ci.yml/badge.svg)](https://github.com/Thordata/thordata-python-sdk/actions)
|
|
59
|
+
|
|
60
|
+
</div>
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## 📖 Introduction
|
|
65
|
+
|
|
66
|
+
The **Thordata Python SDK v1.6.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
|
|
67
|
+
|
|
68
|
+
**Why v1.6.0?**
|
|
69
|
+
* **🛡️ Bulletproof Networking**: Custom core handles `HTTP`, `HTTPS`, and `SOCKS5h` (Remote DNS) tunneling, solving common SSL/TLS handshake issues in complex network environments.
|
|
70
|
+
* **⚡ Async First**: First-class `asyncio` support with `aiohttp` for high-concurrency scraping (1000+ RPS).
|
|
71
|
+
* **🧩 100% API Coverage**: Every endpoint documented by Thordata (including Hourly Usage, Server Monitor, and Task Management) is implemented.
|
|
72
|
+
* **🤖 Type Safe**: Fully typed (`mypy` strict) for excellent IDE autocompletion and error checking.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## 📦 Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install thordata-sdk
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## 🔐 Configuration
|
|
85
|
+
|
|
86
|
+
Set environment variables to avoid hardcoding credentials. **Full reference:** copy [.env.example](.env.example) to `.env` and fill in values.
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# [Scraping APIs]
|
|
90
|
+
export THORDATA_SCRAPER_TOKEN="your_scraper_token"
|
|
91
|
+
|
|
92
|
+
# [Management APIs]
|
|
93
|
+
export THORDATA_PUBLIC_TOKEN="your_public_token"
|
|
94
|
+
export THORDATA_PUBLIC_KEY="your_public_key"
|
|
95
|
+
|
|
96
|
+
# [Proxy: Residential / Unlimited / Datacenter / Mobile / ISP]
|
|
97
|
+
export THORDATA_RESIDENTIAL_USERNAME="your_username"
|
|
98
|
+
export THORDATA_RESIDENTIAL_PASSWORD="your_password"
|
|
99
|
+
# Optional: Unlimited (high-bandwidth) if your plan has separate credentials
|
|
100
|
+
# export THORDATA_UNLIMITED_USERNAME="..."
|
|
101
|
+
# export THORDATA_UNLIMITED_PASSWORD="..."
|
|
102
|
+
|
|
103
|
+
# Optional: Upstream proxy when behind firewall (e.g. Clash Verge port 7897)
|
|
104
|
+
# export THORDATA_UPSTREAM_PROXY="http://127.0.0.1:7897"
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Default proxy port is **9999** (residential); other products use different ports (see `.env.example`).
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## 🚀 Quick Start
|
|
112
|
+
|
|
113
|
+
### 1. SERP Search (Google/Bing)
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from thordata import ThordataClient, Engine
|
|
117
|
+
|
|
118
|
+
client = ThordataClient()
|
|
119
|
+
|
|
120
|
+
# Search Google
|
|
121
|
+
results = client.serp_search(
|
|
122
|
+
query="latest AI trends",
|
|
123
|
+
engine=Engine.GOOGLE,
|
|
124
|
+
num=10,
|
|
125
|
+
location="United States"
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
for item in results.get("organic", []):
|
|
129
|
+
print(f"{item['title']} - {item['link']}")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### 2. Universal Scrape (Web Unlocker)
|
|
133
|
+
|
|
134
|
+
Automatically handles JS rendering, CAPTCHAs, and fingerprinting.
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
html = client.universal_scrape(
|
|
138
|
+
url="https://example.com",
|
|
139
|
+
js_render=True,
|
|
140
|
+
country="us",
|
|
141
|
+
wait_for=".content-loaded" # Smart waiting
|
|
142
|
+
)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### 3. High-Performance Proxy Tunneling
|
|
146
|
+
|
|
147
|
+
Use Thordata's residential IPs directly with `requests` (Sync) or `aiohttp` (Async). The SDK handles the complex authentication and rotation logic.
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from thordata import ProxyConfig, ProxyProduct
|
|
151
|
+
|
|
152
|
+
# Config is optional if env vars are set
|
|
153
|
+
proxy = ProxyConfig(
|
|
154
|
+
product=ProxyProduct.RESIDENTIAL,
|
|
155
|
+
country="jp",
|
|
156
|
+
session_duration=10 # Sticky IP for 10 mins
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
# The client automatically routes this through Thordata's network
|
|
160
|
+
response = client.get("https://httpbin.org/ip", proxy_config=proxy)
|
|
161
|
+
print(response.json())
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## ⚙️ Advanced Usage
|
|
167
|
+
|
|
168
|
+
### Async High-Concurrency
|
|
169
|
+
|
|
170
|
+
Perfect for building high-throughput AI agents.
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
import asyncio
|
|
174
|
+
from thordata import AsyncThordataClient
|
|
175
|
+
|
|
176
|
+
async def main():
|
|
177
|
+
async with AsyncThordataClient() as client:
|
|
178
|
+
# Fire off 10 requests in parallel
|
|
179
|
+
tasks = [client.serp_search(f"query {i}") for i in range(10)]
|
|
180
|
+
results = await asyncio.gather(*tasks)
|
|
181
|
+
print(f"Completed {len(results)} searches")
|
|
182
|
+
|
|
183
|
+
asyncio.run(main())
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Task Management (Batch Scraping)
|
|
187
|
+
|
|
188
|
+
Handle large-scale scraping jobs asynchronously.
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
# 1. Create a task
|
|
192
|
+
task_id = client.create_scraper_task(
|
|
193
|
+
file_name="daily_scrape",
|
|
194
|
+
spider_id="universal",
|
|
195
|
+
spider_name="universal",
|
|
196
|
+
parameters={"url": "https://example.com"}
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
# 2. Poll for completion (Helper method)
|
|
200
|
+
status = client.wait_for_task(task_id, max_wait=600)
|
|
201
|
+
|
|
202
|
+
# 3. Download results
|
|
203
|
+
if status == "finished":
|
|
204
|
+
data_url = client.get_task_result(task_id)
|
|
205
|
+
print(f"Download: {data_url}")
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### Web Scraper Tools (120+ Pre-built Tools)
|
|
209
|
+
|
|
210
|
+
Use pre-built tools for popular platforms. See [Tool Coverage Matrix](docs/TOOL_COVERAGE_MATRIX.md) for full list.
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from thordata import ThordataClient
|
|
214
|
+
from thordata.tools import Amazon, GoogleMaps, YouTube, TikTok, eBay, Walmart
|
|
215
|
+
|
|
216
|
+
client = ThordataClient()
|
|
217
|
+
|
|
218
|
+
# Amazon Product by ASIN
|
|
219
|
+
task_id = client.run_tool(Amazon.ProductByAsin(asin="B0BZYCJK89"))
|
|
220
|
+
|
|
221
|
+
# Google Maps by Place ID
|
|
222
|
+
task_id = client.run_tool(GoogleMaps.DetailsByPlaceId(place_id="ChIJPTacEpBQwokRKwIlDXelxkA"))
|
|
223
|
+
|
|
224
|
+
# YouTube Video Download
|
|
225
|
+
from thordata import CommonSettings
|
|
226
|
+
settings = CommonSettings(resolution="<=360p", video_codec="vp9")
|
|
227
|
+
task_id = client.run_tool(YouTube.VideoDownload(
|
|
228
|
+
url="https://www.youtube.com/watch?v=jNQXAC9IVRw",
|
|
229
|
+
common_settings=settings
|
|
230
|
+
))
|
|
231
|
+
|
|
232
|
+
# Wait and get results
|
|
233
|
+
status = client.wait_for_task(task_id, max_wait=300)
|
|
234
|
+
if status == "ready":
|
|
235
|
+
download_url = client.get_task_result(task_id)
|
|
236
|
+
print(f"Results: {download_url}")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
**Available Platforms:**
|
|
240
|
+
- **E-Commerce**: Amazon, eBay, Walmart
|
|
241
|
+
- **Social Media**: TikTok, Instagram, Facebook, Twitter/X, Reddit, LinkedIn
|
|
242
|
+
- **Search**: Google Maps, Google Shopping, Google Play
|
|
243
|
+
- **Video**: YouTube (download, info, subtitles)
|
|
244
|
+
- **Code**: GitHub
|
|
245
|
+
- **Professional**: Indeed, Glassdoor, Crunchbase
|
|
246
|
+
- **Travel/Real Estate**: Booking, Airbnb, Zillow
|
|
247
|
+
|
|
248
|
+
See `examples/tools/` for more examples.
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## 🛠️ Management APIs
|
|
253
|
+
|
|
254
|
+
Manage your infrastructure programmatically.
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
# Check Balance
|
|
258
|
+
balance = client.get_traffic_balance()
|
|
259
|
+
|
|
260
|
+
# Manage Whitelist
|
|
261
|
+
client.add_whitelist_ip("1.2.3.4")
|
|
262
|
+
|
|
263
|
+
# Create Sub-users
|
|
264
|
+
client.create_proxy_user("new_user", "pass123", traffic_limit=500)
|
|
265
|
+
|
|
266
|
+
# Monitor Unlimited Proxies
|
|
267
|
+
monitor = client.unlimited.get_server_monitor(
|
|
268
|
+
ins_id="ins-123",
|
|
269
|
+
region="us",
|
|
270
|
+
start_time=1700000000,
|
|
271
|
+
end_time=1700003600
|
|
272
|
+
)
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## 🧪 Development & Testing
|
|
278
|
+
|
|
279
|
+
- **Full env reference**: Copy [.env.example](.env.example) to `.env` and fill in credentials.
|
|
280
|
+
- **Unit tests** (no network): `pytest` or `python -m coverage run -m pytest -p no:cov tests && python -m coverage report -m`
|
|
281
|
+
- **Integration tests** (live API/proxy): Set `THORDATA_INTEGRATION=true` in `.env`; optional `THORDATA_UPSTREAM_PROXY` (e.g. Clash) if behind a firewall. See [CONTRIBUTING.md](CONTRIBUTING.md#-testing-guidelines).
|
|
282
|
+
|
|
283
|
+
---
|
|
284
|
+
|
|
285
|
+
## 📄 License
|
|
286
|
+
|
|
287
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
thordata/__init__.py,sha256=FMOku6d17GrFjiJlRhvkx-JmhLLD7VlaADLC3FP6hHg,2287
|
|
2
|
+
thordata/_utils.py,sha256=Acr_6sHgdZXU7SQozd6FEYTZV6iHw__nlhpBTDwb66U,4917
|
|
3
|
+
thordata/async_client.py,sha256=F_t5EeYUM8BYM9tOQb2lzrcO81whGfO1g53Qagxcyq8,39713
|
|
4
|
+
thordata/async_unlimited.py,sha256=kzTksFkN21rDM21Pwy3hcayjfyGYNGGyGR3fRLtZC6I,4510
|
|
5
|
+
thordata/client.py,sha256=fG7X9JpFS0HKlWZl_6R_Phzt_o2hV25rVUUyCXhioYM,56991
|
|
6
|
+
thordata/enums.py,sha256=dO5QWpPFLpYP2GfLAdoFtxMTemhGNdr_NPqBoYfSFkk,764
|
|
7
|
+
thordata/exceptions.py,sha256=foAtH5U2pLUXM6u1C_63AVVh4-afuwt5y5MO7jDF0s8,11585
|
|
8
|
+
thordata/models.py,sha256=7GshQklo5aqke_ZQ2QIXiz9Ac5v6IRtvjWIjsBKEq6A,853
|
|
9
|
+
thordata/retry.py,sha256=X6Sa5IIb5EWD5fUJjKyhvWJyWQGPVgxLB3-vKoWfa5Q,11453
|
|
10
|
+
thordata/serp_engines.py,sha256=iuMWncelcGOskCHXFzpcPMMTL5qfiLkazHB1uj3zpZo,5985
|
|
11
|
+
thordata/unlimited.py,sha256=RzrtwcotYlbOWuSLysDyI75IkMVL7ygdfE9HKNoe02M,6087
|
|
12
|
+
thordata/core/__init__.py,sha256=EFT6mZpSdec_7uFUpSpDDHVwbTxy314uxJC_uprR6J4,500
|
|
13
|
+
thordata/core/async_http_client.py,sha256=KKsmhXN6bWRTDFvqa0H-WRf4R-TWH8WSgpDBRv6TEvg,3052
|
|
14
|
+
thordata/core/http_client.py,sha256=8lSwclmVweM-Go1qMW36zYnMKAUT_9RyDdPF7qMS4-Y,2280
|
|
15
|
+
thordata/core/tunnel.py,sha256=rbM_4zGwY4FXqdxYmCOURQw2s1EuAWFBVBM-1joNjGI,8373
|
|
16
|
+
thordata/tools/__init__.py,sha256=_Sr042bW-OMMj-WruA93YeQ6FfeIXvWmHoHMAFQ72a8,840
|
|
17
|
+
thordata/tools/base.py,sha256=fHuCp53y8eB59DuCdA1wHcbMVmsd5ikL9KlT5m_jJn0,1006
|
|
18
|
+
thordata/tools/code.py,sha256=fGuLEn_CydIq79XgMw5-EJDcp-nq2fenWVp7hKpsRNw,930
|
|
19
|
+
thordata/tools/ecommerce.py,sha256=8iZ7f46CYovPDfAS3lZhRXpXEyJ9PSFBw9w99-Zw8Qs,6584
|
|
20
|
+
thordata/tools/professional.py,sha256=2RJ76Sx1seftFpwgD4VRfRinoo-HAqYZucTnuIdV4Kw,4350
|
|
21
|
+
thordata/tools/search.py,sha256=2HLQaYK6JiGvzOFF9or9ORXNrzv6nDQUaEt83YbqiQA,2903
|
|
22
|
+
thordata/tools/social.py,sha256=6gcj1GUWJvDALpBMeobohIn6yPVo-LsqDsuUroNpHG8,10465
|
|
23
|
+
thordata/tools/travel.py,sha256=vRJAU-uzFVvLQ5Tc58vp3CY7OPWd2lcWh_9MvWMc1fs,2725
|
|
24
|
+
thordata/tools/video.py,sha256=HUFqdue-dtWmTVlYtmf5ffzuYDIzw5l3wk3Vr7AXQW0,4689
|
|
25
|
+
thordata/types/__init__.py,sha256=hlLt5UCVm7QdeOCN5_YWXS4Vy8tJUhIp0XbWjAoQiQg,1357
|
|
26
|
+
thordata/types/common.py,sha256=hkTZ1QtokpE1yT9BvTmYfQz9AUjeCIIPvjib2pnq_Ag,2818
|
|
27
|
+
thordata/types/proxy.py,sha256=IU45wQHCBOIlbdcCN9veypAkDT0q9NIikLu674CudOU,10438
|
|
28
|
+
thordata/types/serp.py,sha256=NO52I1NprjVBgKQe4o2xEp82a3Oy9wCBYG-2Q0oegnU,5817
|
|
29
|
+
thordata/types/task.py,sha256=b9TzcFigWUJDsr2t1hvaDv_CU1xk2d2cMrthmwPn7VU,4602
|
|
30
|
+
thordata/types/universal.py,sha256=Kw8lf_2ElXIfylsNfVosLE1MvlEQkryv4fWEaQw6ecg,2161
|
|
31
|
+
thordata_sdk-1.6.0.dist-info/licenses/LICENSE,sha256=bAxpWgQIzb-5jl3nhLdOwOJ_vlbHLtSG7yev2B7vioY,1088
|
|
32
|
+
thordata_sdk-1.6.0.dist-info/METADATA,sha256=bBy6xzDLWZ9l5bGLu0Jh91X9GtYVjlKCtpp13OZchmU,9308
|
|
33
|
+
thordata_sdk-1.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
34
|
+
thordata_sdk-1.6.0.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
|
|
35
|
+
thordata_sdk-1.6.0.dist-info/RECORD,,
|