thordata-sdk 1.5.0__tar.gz → 1.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/PKG-INFO +63 -7
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/README.md +62 -6
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/pyproject.toml +3 -3
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/__init__.py +1 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/async_client.py +12 -7
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/client.py +12 -7
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/enums.py +2 -2
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/exceptions.py +70 -19
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/models.py +1 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/retry.py +1 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/tools/__init__.py +11 -1
- thordata_sdk-1.6.0/src/thordata/tools/code.py +39 -0
- thordata_sdk-1.6.0/src/thordata/tools/ecommerce.py +251 -0
- thordata_sdk-1.6.0/src/thordata/tools/professional.py +155 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/tools/search.py +47 -5
- thordata_sdk-1.6.0/src/thordata/tools/social.py +374 -0
- thordata_sdk-1.6.0/src/thordata/tools/travel.py +100 -0
- thordata_sdk-1.6.0/src/thordata/tools/video.py +154 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/types/task.py +16 -4
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata_sdk.egg-info/PKG-INFO +63 -7
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata_sdk.egg-info/SOURCES.txt +8 -3
- thordata_sdk-1.6.0/tests/test_async_client.py +424 -0
- thordata_sdk-1.6.0/tests/test_batch_creation.py +116 -0
- thordata_sdk-1.6.0/tests/test_client.py +606 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_enums.py +1 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_examples.py +4 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_integration_proxy_protocols.py +2 -3
- thordata_sdk-1.6.0/tests/test_retry.py +317 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_spec_parity.py +36 -2
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_tools.py +5 -3
- thordata_sdk-1.6.0/tests/test_tools_coverage.py +102 -0
- thordata_sdk-1.6.0/tests/test_unlimited.py +184 -0
- thordata_sdk-1.6.0/tests/test_utils.py +126 -0
- thordata_sdk-1.5.0/src/thordata/_example_utils.py +0 -77
- thordata_sdk-1.5.0/src/thordata/demo.py +0 -138
- thordata_sdk-1.5.0/src/thordata/tools/code.py +0 -26
- thordata_sdk-1.5.0/src/thordata/tools/ecommerce.py +0 -67
- thordata_sdk-1.5.0/src/thordata/tools/social.py +0 -190
- thordata_sdk-1.5.0/src/thordata/tools/video.py +0 -81
- thordata_sdk-1.5.0/tests/test_async_client.py +0 -111
- thordata_sdk-1.5.0/tests/test_client.py +0 -121
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/LICENSE +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/setup.cfg +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/_utils.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/async_unlimited.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/core/__init__.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/core/async_http_client.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/core/http_client.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/core/tunnel.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/serp_engines.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/tools/base.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/types/__init__.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/types/common.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/types/proxy.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/types/serp.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/types/universal.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata/unlimited.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata_sdk.egg-info/dependency_links.txt +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata_sdk.egg-info/requires.txt +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/src/thordata_sdk.egg-info/top_level.txt +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_async_client_errors.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_client_errors.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_exceptions.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_models.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_task_status_and_wait.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.6.0}/tests/test_user_agent.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: thordata-sdk
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.6.0
|
|
4
4
|
Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
|
|
5
5
|
Author-email: Thordata Developer Team <support@thordata.com>
|
|
6
6
|
License: MIT
|
|
@@ -63,9 +63,9 @@ Dynamic: license-file
|
|
|
63
63
|
|
|
64
64
|
## 📖 Introduction
|
|
65
65
|
|
|
66
|
-
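The **Thordata Python SDK v1.5.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.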
|
|
66
|
+
The **Thordata Python SDK v1.6.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
|
|
67
67
|
|
|
68
|
-
**Why v1.5.0?**
|
|
68
|
+
**Why v1.6.0?**
|
|
69
69
|
* **🛡️ Bulletproof Networking**: Custom core handles `HTTP`, `HTTPS`, and `SOCKS5h` (Remote DNS) tunneling, solving common SSL/TLS handshake issues in complex network environments.
|
|
70
70
|
* **⚡ Async First**: First-class `asyncio` support with `aiohttp` for high-concurrency scraping (1000+ RPS).
|
|
71
71
|
* **🧩 100% API Coverage**: Every endpoint documented by Thordata (including Hourly Usage, Server Monitor, and Task Management) is implemented.
|
|
@@ -83,7 +83,7 @@ pip install thordata-sdk
|
|
|
83
83
|
|
|
84
84
|
## 🔐 Configuration
|
|
85
85
|
|
|
86
|
-
Set environment variables to avoid hardcoding credentials.
|
|
86
|
+
Set environment variables to avoid hardcoding credentials. **Full reference:** copy [.env.example](.env.example) to `.env` and fill in values.
|
|
87
87
|
|
|
88
88
|
```bash
|
|
89
89
|
# [Scraping APIs]
|
|
@@ -93,13 +93,19 @@ export THORDATA_SCRAPER_TOKEN="your_scraper_token"
|
|
|
93
93
|
export THORDATA_PUBLIC_TOKEN="your_public_token"
|
|
94
94
|
export THORDATA_PUBLIC_KEY="your_public_key"
|
|
95
95
|
|
|
96
|
-
# [Proxy
|
|
96
|
+
# [Proxy: Residential / Unlimited / Datacenter / Mobile / ISP]
|
|
97
97
|
export THORDATA_RESIDENTIAL_USERNAME="your_username"
|
|
98
98
|
export THORDATA_RESIDENTIAL_PASSWORD="your_password"
|
|
99
|
-
# Optional:
|
|
100
|
-
# export
|
|
99
|
+
# Optional: Unlimited (high-bandwidth) if your plan has separate credentials
|
|
100
|
+
# export THORDATA_UNLIMITED_USERNAME="..."
|
|
101
|
+
# export THORDATA_UNLIMITED_PASSWORD="..."
|
|
102
|
+
|
|
103
|
+
# Optional: Upstream proxy when behind firewall (e.g. Clash Verge port 7897)
|
|
104
|
+
# export THORDATA_UPSTREAM_PROXY="http://127.0.0.1:7897"
|
|
101
105
|
```
|
|
102
106
|
|
|
107
|
+
Default proxy port is **9999** (residential); other products use different ports (see `.env.example`).
|
|
108
|
+
|
|
103
109
|
---
|
|
104
110
|
|
|
105
111
|
## 🚀 Quick Start
|
|
@@ -199,6 +205,48 @@ if status == "finished":
|
|
|
199
205
|
print(f"Download: {data_url}")
|
|
200
206
|
```
|
|
201
207
|
|
|
208
|
+
### Web Scraper Tools (120+ Pre-built Tools)
|
|
209
|
+
|
|
210
|
+
Use pre-built tools for popular platforms. See [Tool Coverage Matrix](docs/TOOL_COVERAGE_MATRIX.md) for full list.
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from thordata import ThordataClient
|
|
214
|
+
from thordata.tools import Amazon, GoogleMaps, YouTube, TikTok, eBay, Walmart
|
|
215
|
+
|
|
216
|
+
client = ThordataClient()
|
|
217
|
+
|
|
218
|
+
# Amazon Product by ASIN
|
|
219
|
+
task_id = client.run_tool(Amazon.ProductByAsin(asin="B0BZYCJK89"))
|
|
220
|
+
|
|
221
|
+
# Google Maps by Place ID
|
|
222
|
+
task_id = client.run_tool(GoogleMaps.DetailsByPlaceId(place_id="ChIJPTacEpBQwokRKwIlDXelxkA"))
|
|
223
|
+
|
|
224
|
+
# YouTube Video Download
|
|
225
|
+
from thordata import CommonSettings
|
|
226
|
+
settings = CommonSettings(resolution="<=360p", video_codec="vp9")
|
|
227
|
+
task_id = client.run_tool(YouTube.VideoDownload(
|
|
228
|
+
url="https://www.youtube.com/watch?v=jNQXAC9IVRw",
|
|
229
|
+
common_settings=settings
|
|
230
|
+
))
|
|
231
|
+
|
|
232
|
+
# Wait and get results
|
|
233
|
+
status = client.wait_for_task(task_id, max_wait=300)
|
|
234
|
+
if status == "ready":
|
|
235
|
+
download_url = client.get_task_result(task_id)
|
|
236
|
+
print(f"Results: {download_url}")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
**Available Platforms:**
|
|
240
|
+
- **E-Commerce**: Amazon, eBay, Walmart
|
|
241
|
+
- **Social Media**: TikTok, Instagram, Facebook, Twitter/X, Reddit, LinkedIn
|
|
242
|
+
- **Search**: Google Maps, Google Shopping, Google Play
|
|
243
|
+
- **Video**: YouTube (download, info, subtitles)
|
|
244
|
+
- **Code**: GitHub
|
|
245
|
+
- **Professional**: Indeed, Glassdoor, Crunchbase
|
|
246
|
+
- **Travel/Real Estate**: Booking, Airbnb, Zillow
|
|
247
|
+
|
|
248
|
+
See `examples/tools/` for more examples.
|
|
249
|
+
|
|
202
250
|
---
|
|
203
251
|
|
|
204
252
|
## 🛠️ Management APIs
|
|
@@ -226,6 +274,14 @@ monitor = client.unlimited.get_server_monitor(
|
|
|
226
274
|
|
|
227
275
|
---
|
|
228
276
|
|
|
277
|
+
## 🧪 Development & Testing
|
|
278
|
+
|
|
279
|
+
- **Full env reference**: Copy [.env.example](.env.example) to `.env` and fill in credentials.
|
|
280
|
+
- **Unit tests** (no network): `pytest` or `python -m coverage run -m pytest -p no:cov tests && python -m coverage report -m`
|
|
281
|
+
- **Integration tests** (live API/proxy): Set `THORDATA_INTEGRATION=true` in `.env`; optional `THORDATA_UPSTREAM_PROXY` (e.g. Clash) if behind a firewall. See [CONTRIBUTING.md](CONTRIBUTING.md#-testing-guidelines).
|
|
282
|
+
|
|
283
|
+
---
|
|
284
|
+
|
|
229
285
|
## 📄 License
|
|
230
286
|
|
|
231
287
|
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -19,9 +19,9 @@
|
|
|
19
19
|
|
|
20
20
|
## 📖 Introduction
|
|
21
21
|
|
|
22
|
-
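The **Thordata Python SDK v1.5.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.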
|
|
22
|
+
The **Thordata Python SDK v1.6.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
|
|
23
23
|
|
|
24
|
-
**Why v1.5.0?**
|
|
24
|
+
**Why v1.6.0?**
|
|
25
25
|
* **🛡️ Bulletproof Networking**: Custom core handles `HTTP`, `HTTPS`, and `SOCKS5h` (Remote DNS) tunneling, solving common SSL/TLS handshake issues in complex network environments.
|
|
26
26
|
* **⚡ Async First**: First-class `asyncio` support with `aiohttp` for high-concurrency scraping (1000+ RPS).
|
|
27
27
|
* **🧩 100% API Coverage**: Every endpoint documented by Thordata (including Hourly Usage, Server Monitor, and Task Management) is implemented.
|
|
@@ -39,7 +39,7 @@ pip install thordata-sdk
|
|
|
39
39
|
|
|
40
40
|
## 🔐 Configuration
|
|
41
41
|
|
|
42
|
-
Set environment variables to avoid hardcoding credentials.
|
|
42
|
+
Set environment variables to avoid hardcoding credentials. **Full reference:** copy [.env.example](.env.example) to `.env` and fill in values.
|
|
43
43
|
|
|
44
44
|
```bash
|
|
45
45
|
# [Scraping APIs]
|
|
@@ -49,13 +49,19 @@ export THORDATA_SCRAPER_TOKEN="your_scraper_token"
|
|
|
49
49
|
export THORDATA_PUBLIC_TOKEN="your_public_token"
|
|
50
50
|
export THORDATA_PUBLIC_KEY="your_public_key"
|
|
51
51
|
|
|
52
|
-
# [Proxy
|
|
52
|
+
# [Proxy: Residential / Unlimited / Datacenter / Mobile / ISP]
|
|
53
53
|
export THORDATA_RESIDENTIAL_USERNAME="your_username"
|
|
54
54
|
export THORDATA_RESIDENTIAL_PASSWORD="your_password"
|
|
55
|
-
# Optional:
|
|
56
|
-
# export
|
|
55
|
+
# Optional: Unlimited (high-bandwidth) if your plan has separate credentials
|
|
56
|
+
# export THORDATA_UNLIMITED_USERNAME="..."
|
|
57
|
+
# export THORDATA_UNLIMITED_PASSWORD="..."
|
|
58
|
+
|
|
59
|
+
# Optional: Upstream proxy when behind firewall (e.g. Clash Verge port 7897)
|
|
60
|
+
# export THORDATA_UPSTREAM_PROXY="http://127.0.0.1:7897"
|
|
57
61
|
```
|
|
58
62
|
|
|
63
|
+
Default proxy port is **9999** (residential); other products use different ports (see `.env.example`).
|
|
64
|
+
|
|
59
65
|
---
|
|
60
66
|
|
|
61
67
|
## 🚀 Quick Start
|
|
@@ -155,6 +161,48 @@ if status == "finished":
|
|
|
155
161
|
print(f"Download: {data_url}")
|
|
156
162
|
```
|
|
157
163
|
|
|
164
|
+
### Web Scraper Tools (120+ Pre-built Tools)
|
|
165
|
+
|
|
166
|
+
Use pre-built tools for popular platforms. See [Tool Coverage Matrix](docs/TOOL_COVERAGE_MATRIX.md) for full list.
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
from thordata import ThordataClient
|
|
170
|
+
from thordata.tools import Amazon, GoogleMaps, YouTube, TikTok, eBay, Walmart
|
|
171
|
+
|
|
172
|
+
client = ThordataClient()
|
|
173
|
+
|
|
174
|
+
# Amazon Product by ASIN
|
|
175
|
+
task_id = client.run_tool(Amazon.ProductByAsin(asin="B0BZYCJK89"))
|
|
176
|
+
|
|
177
|
+
# Google Maps by Place ID
|
|
178
|
+
task_id = client.run_tool(GoogleMaps.DetailsByPlaceId(place_id="ChIJPTacEpBQwokRKwIlDXelxkA"))
|
|
179
|
+
|
|
180
|
+
# YouTube Video Download
|
|
181
|
+
from thordata import CommonSettings
|
|
182
|
+
settings = CommonSettings(resolution="<=360p", video_codec="vp9")
|
|
183
|
+
task_id = client.run_tool(YouTube.VideoDownload(
|
|
184
|
+
url="https://www.youtube.com/watch?v=jNQXAC9IVRw",
|
|
185
|
+
common_settings=settings
|
|
186
|
+
))
|
|
187
|
+
|
|
188
|
+
# Wait and get results
|
|
189
|
+
status = client.wait_for_task(task_id, max_wait=300)
|
|
190
|
+
if status == "ready":
|
|
191
|
+
download_url = client.get_task_result(task_id)
|
|
192
|
+
print(f"Results: {download_url}")
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
**Available Platforms:**
|
|
196
|
+
- **E-Commerce**: Amazon, eBay, Walmart
|
|
197
|
+
- **Social Media**: TikTok, Instagram, Facebook, Twitter/X, Reddit, LinkedIn
|
|
198
|
+
- **Search**: Google Maps, Google Shopping, Google Play
|
|
199
|
+
- **Video**: YouTube (download, info, subtitles)
|
|
200
|
+
- **Code**: GitHub
|
|
201
|
+
- **Professional**: Indeed, Glassdoor, Crunchbase
|
|
202
|
+
- **Travel/Real Estate**: Booking, Airbnb, Zillow
|
|
203
|
+
|
|
204
|
+
See `examples/tools/` for more examples.
|
|
205
|
+
|
|
158
206
|
---
|
|
159
207
|
|
|
160
208
|
## 🛠️ Management APIs
|
|
@@ -182,6 +230,14 @@ monitor = client.unlimited.get_server_monitor(
|
|
|
182
230
|
|
|
183
231
|
---
|
|
184
232
|
|
|
233
|
+
## 🧪 Development & Testing
|
|
234
|
+
|
|
235
|
+
- **Full env reference**: Copy [.env.example](.env.example) to `.env` and fill in credentials.
|
|
236
|
+
- **Unit tests** (no network): `pytest` or `python -m coverage run -m pytest -p no:cov tests && python -m coverage report -m`
|
|
237
|
+
- **Integration tests** (live API/proxy): Set `THORDATA_INTEGRATION=true` in `.env`; optional `THORDATA_UPSTREAM_PROXY` (e.g. Clash) if behind a firewall. See [CONTRIBUTING.md](CONTRIBUTING.md#-testing-guidelines).
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
185
241
|
## 📄 License
|
|
186
242
|
|
|
187
243
|
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "thordata-sdk"
|
|
8
|
-
version = "1.5.0"
|
|
8
|
+
version = "1.6.0"
|
|
9
9
|
description = "The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network."
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
requires-python = ">=3.9"
|
|
@@ -120,11 +120,11 @@ ignore_missing_imports = true
|
|
|
120
120
|
module = ["aiohttp.*", "requests.*"]
|
|
121
121
|
ignore_missing_imports = true
|
|
122
122
|
|
|
123
|
-
# Pytest setup
|
|
123
|
+
# Pytest setup (coverage is run separately via coverage CLI for reliability)
|
|
124
124
|
[tool.pytest.ini_options]
|
|
125
125
|
testpaths = ["tests"]
|
|
126
126
|
asyncio_mode = "auto"
|
|
127
|
-
addopts = "-v
|
|
127
|
+
addopts = "-v"
|
|
128
128
|
markers = ["integration: live tests that require real credentials"]
|
|
129
129
|
|
|
130
130
|
# Coverage setup
|
|
@@ -5,7 +5,7 @@ Official Python client for Thordata's Proxy Network, SERP API,
|
|
|
5
5
|
Universal Scraping API (Web Unlocker), and Web Scraper API.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
__version__ = "1.5.0"
|
|
8
|
+
__version__ = "1.6.0"
|
|
9
9
|
__author__ = "Thordata Developer Team/Kael Odin"
|
|
10
10
|
__email__ = "support@thordata.com"
|
|
11
11
|
|
|
@@ -124,10 +124,10 @@ class AsyncThordataClient:
|
|
|
124
124
|
).rstrip("/")
|
|
125
125
|
|
|
126
126
|
self._gateway_base_url = os.getenv(
|
|
127
|
-
"THORDATA_GATEWAY_BASE_URL", "https://
|
|
127
|
+
"THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
|
|
128
128
|
)
|
|
129
129
|
self._child_base_url = os.getenv(
|
|
130
|
-
"THORDATA_CHILD_BASE_URL", "https://
|
|
130
|
+
"THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
|
|
131
131
|
)
|
|
132
132
|
|
|
133
133
|
# URL Construction
|
|
@@ -145,7 +145,7 @@ class AsyncThordataClient:
|
|
|
145
145
|
self._proxy_users_url = f"{shared_api_base}/proxy-users"
|
|
146
146
|
|
|
147
147
|
whitelist_base = os.getenv(
|
|
148
|
-
"THORDATA_WHITELIST_BASE_URL", "https://
|
|
148
|
+
"THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
|
|
149
149
|
)
|
|
150
150
|
self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
|
|
151
151
|
|
|
@@ -352,7 +352,7 @@ class AsyncThordataClient:
|
|
|
352
352
|
file_name: str,
|
|
353
353
|
spider_id: str,
|
|
354
354
|
spider_name: str,
|
|
355
|
-
parameters: dict[str, Any],
|
|
355
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
356
356
|
universal_params: dict[str, Any] | None = None,
|
|
357
357
|
) -> str:
|
|
358
358
|
config = ScraperTaskConfig(
|
|
@@ -434,7 +434,7 @@ class AsyncThordataClient:
|
|
|
434
434
|
file_name: str,
|
|
435
435
|
spider_id: str,
|
|
436
436
|
spider_name: str,
|
|
437
|
-
parameters: dict[str, Any],
|
|
437
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
438
438
|
common_settings: CommonSettings,
|
|
439
439
|
) -> str:
|
|
440
440
|
config = VideoTaskConfig(
|
|
@@ -550,7 +550,7 @@ class AsyncThordataClient:
|
|
|
550
550
|
file_name: str,
|
|
551
551
|
spider_id: str,
|
|
552
552
|
spider_name: str,
|
|
553
|
-
parameters: dict[str, Any],
|
|
553
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
554
554
|
universal_params: dict[str, Any] | None = None,
|
|
555
555
|
*,
|
|
556
556
|
max_wait: float = 600.0,
|
|
@@ -971,7 +971,12 @@ class AsyncThordataClient:
|
|
|
971
971
|
if port:
|
|
972
972
|
params["port"] = str(port)
|
|
973
973
|
|
|
974
|
-
|
|
974
|
+
if product == "unlimited":
|
|
975
|
+
username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
|
|
976
|
+
"THORDATA_RESIDENTIAL_USERNAME"
|
|
977
|
+
)
|
|
978
|
+
else:
|
|
979
|
+
username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
|
|
975
980
|
if username:
|
|
976
981
|
params["td-customer"] = username
|
|
977
982
|
|
|
@@ -159,10 +159,10 @@ class ThordataClient:
|
|
|
159
159
|
).rstrip("/")
|
|
160
160
|
|
|
161
161
|
self._gateway_base_url = os.getenv(
|
|
162
|
-
"THORDATA_GATEWAY_BASE_URL", "https://
|
|
162
|
+
"THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
|
|
163
163
|
)
|
|
164
164
|
self._child_base_url = os.getenv(
|
|
165
|
-
"THORDATA_CHILD_BASE_URL", "https://
|
|
165
|
+
"THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
|
|
166
166
|
)
|
|
167
167
|
|
|
168
168
|
# URL Construction
|
|
@@ -183,7 +183,7 @@ class ThordataClient:
|
|
|
183
183
|
self._proxy_users_url = f"{shared_api_base}/proxy-users"
|
|
184
184
|
|
|
185
185
|
whitelist_base = os.getenv(
|
|
186
|
-
"THORDATA_WHITELIST_BASE_URL", "https://
|
|
186
|
+
"THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
|
|
187
187
|
)
|
|
188
188
|
self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
|
|
189
189
|
|
|
@@ -405,7 +405,7 @@ class ThordataClient:
|
|
|
405
405
|
file_name: str,
|
|
406
406
|
spider_id: str,
|
|
407
407
|
spider_name: str,
|
|
408
|
-
parameters: dict[str, Any],
|
|
408
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
409
409
|
universal_params: dict[str, Any] | None = None,
|
|
410
410
|
) -> str:
|
|
411
411
|
config = ScraperTaskConfig(
|
|
@@ -490,7 +490,7 @@ class ThordataClient:
|
|
|
490
490
|
file_name: str,
|
|
491
491
|
spider_id: str,
|
|
492
492
|
spider_name: str,
|
|
493
|
-
parameters: dict[str, Any],
|
|
493
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
494
494
|
common_settings: CommonSettings,
|
|
495
495
|
) -> str:
|
|
496
496
|
config = VideoTaskConfig(
|
|
@@ -639,7 +639,7 @@ class ThordataClient:
|
|
|
639
639
|
file_name: str,
|
|
640
640
|
spider_id: str,
|
|
641
641
|
spider_name: str,
|
|
642
|
-
parameters: dict[str, Any],
|
|
642
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
643
643
|
universal_params: dict[str, Any] | None = None,
|
|
644
644
|
*,
|
|
645
645
|
max_wait: float = 600.0,
|
|
@@ -862,7 +862,12 @@ class ThordataClient:
|
|
|
862
862
|
if port:
|
|
863
863
|
params["port"] = str(port)
|
|
864
864
|
|
|
865
|
-
|
|
865
|
+
if product == "unlimited":
|
|
866
|
+
username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
|
|
867
|
+
"THORDATA_RESIDENTIAL_USERNAME"
|
|
868
|
+
)
|
|
869
|
+
else:
|
|
870
|
+
username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
|
|
866
871
|
if username:
|
|
867
872
|
params["td-customer"] = username
|
|
868
873
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Enumerations for the Thordata Python SDK.
|
|
3
|
-
Moved to thordata.types in v1.
|
|
3
|
+
Moved to thordata.types in v1.6.0.
|
|
4
4
|
This file is kept for backward compatibility.
|
|
5
5
|
"""
|
|
6
6
|
|
|
@@ -21,7 +21,7 @@ from .types import (
|
|
|
21
21
|
SessionType,
|
|
22
22
|
TaskStatus,
|
|
23
23
|
TimeRange,
|
|
24
|
-
normalize_enum_value,
|
|
24
|
+
normalize_enum_value,
|
|
25
25
|
)
|
|
26
26
|
|
|
27
27
|
__all__ = [
|
|
@@ -15,6 +15,7 @@ Exception Hierarchy:
|
|
|
15
15
|
|
|
16
16
|
from __future__ import annotations
|
|
17
17
|
|
|
18
|
+
from collections.abc import Mapping
|
|
18
19
|
from typing import Any
|
|
19
20
|
|
|
20
21
|
# =============================================================================
|
|
@@ -235,6 +236,46 @@ class ThordataNotCollectedError(ThordataAPIError):
|
|
|
235
236
|
# =============================================================================
|
|
236
237
|
|
|
237
238
|
|
|
239
|
+
def _extract_request_id(payload: Any) -> str | None:
|
|
240
|
+
if isinstance(payload, Mapping):
|
|
241
|
+
for key in ("request_id", "requestId", "x_request_id", "x-request-id"):
|
|
242
|
+
val = payload.get(key)
|
|
243
|
+
if val is not None:
|
|
244
|
+
return str(val)
|
|
245
|
+
return None
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _extract_retry_after(payload: Any) -> int | None:
|
|
249
|
+
if isinstance(payload, Mapping):
|
|
250
|
+
for key in ("retry_after", "retryAfter", "retry-after"):
|
|
251
|
+
val = payload.get(key)
|
|
252
|
+
if isinstance(val, int):
|
|
253
|
+
return val
|
|
254
|
+
if isinstance(val, str) and val.isdigit():
|
|
255
|
+
return int(val)
|
|
256
|
+
return None
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _build_error_message(
|
|
260
|
+
message: str,
|
|
261
|
+
*,
|
|
262
|
+
status_code: int | None,
|
|
263
|
+
code: int | None,
|
|
264
|
+
request_id: str | None,
|
|
265
|
+
) -> str:
|
|
266
|
+
parts: list[str] = [message]
|
|
267
|
+
meta: list[str] = []
|
|
268
|
+
if status_code is not None:
|
|
269
|
+
meta.append(f"http={status_code}")
|
|
270
|
+
if code is not None and code != status_code:
|
|
271
|
+
meta.append(f"code={code}")
|
|
272
|
+
if request_id:
|
|
273
|
+
meta.append(f"request_id={request_id}")
|
|
274
|
+
if meta:
|
|
275
|
+
parts.append("(" + ", ".join(meta) + ")")
|
|
276
|
+
return " ".join(parts)
|
|
277
|
+
|
|
278
|
+
|
|
238
279
|
def raise_for_code(
|
|
239
280
|
message: str,
|
|
240
281
|
*,
|
|
@@ -266,49 +307,59 @@ def raise_for_code(
|
|
|
266
307
|
# Determine the effective error code.
|
|
267
308
|
# Prefer payload `code` when present and not success (200),
|
|
268
309
|
# otherwise fall back to HTTP status when it indicates an error.
|
|
310
|
+
# Determine the effective error code for routing.
|
|
269
311
|
effective_code: int | None = None
|
|
270
|
-
|
|
271
312
|
if code is not None and code != 200:
|
|
272
313
|
effective_code = code
|
|
273
|
-
elif status_code is not None and status_code
|
|
314
|
+
elif status_code is not None and status_code >= 400:
|
|
274
315
|
effective_code = status_code
|
|
275
316
|
else:
|
|
276
317
|
effective_code = code if code is not None else status_code
|
|
277
318
|
|
|
319
|
+
# Extract additional context from payload
|
|
320
|
+
final_request_id = request_id or _extract_request_id(payload)
|
|
321
|
+
|
|
322
|
+
# Build a consistent, informative error message
|
|
323
|
+
final_message = _build_error_message(
|
|
324
|
+
message,
|
|
325
|
+
status_code=status_code,
|
|
326
|
+
code=code,
|
|
327
|
+
request_id=final_request_id,
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
# Prepare common arguments for exception constructors
|
|
278
331
|
kwargs = {
|
|
279
332
|
"status_code": status_code,
|
|
280
333
|
"code": code,
|
|
281
334
|
"payload": payload,
|
|
282
|
-
"request_id":
|
|
335
|
+
"request_id": final_request_id,
|
|
283
336
|
}
|
|
284
337
|
|
|
338
|
+
# --- Route to the correct exception class ---
|
|
339
|
+
|
|
285
340
|
# Not collected (API payload code 300, often retryable, not billed)
|
|
286
|
-
# Check this FIRST since 300 is in API_CODES, not HTTP_STATUS_CODES
|
|
287
341
|
if effective_code in ThordataNotCollectedError.API_CODES:
|
|
288
|
-
raise ThordataNotCollectedError(
|
|
342
|
+
raise ThordataNotCollectedError(final_message, **kwargs)
|
|
289
343
|
|
|
290
|
-
# Auth errors
|
|
344
|
+
# Auth errors (401, 403)
|
|
291
345
|
if effective_code in ThordataAuthError.HTTP_STATUS_CODES:
|
|
292
|
-
raise ThordataAuthError(
|
|
346
|
+
raise ThordataAuthError(final_message, **kwargs)
|
|
293
347
|
|
|
294
|
-
# Rate limit errors
|
|
348
|
+
# Rate limit errors (429, 402)
|
|
295
349
|
if effective_code in ThordataRateLimitError.HTTP_STATUS_CODES:
|
|
296
|
-
|
|
297
|
-
retry_after
|
|
298
|
-
if isinstance(payload, dict):
|
|
299
|
-
retry_after = payload.get("retry_after")
|
|
300
|
-
raise ThordataRateLimitError(message, retry_after=retry_after, **kwargs)
|
|
350
|
+
retry_after = _extract_retry_after(payload)
|
|
351
|
+
raise ThordataRateLimitError(final_message, retry_after=retry_after, **kwargs)
|
|
301
352
|
|
|
302
|
-
# Server errors
|
|
353
|
+
# Server errors (5xx)
|
|
303
354
|
if effective_code is not None and 500 <= effective_code < 600:
|
|
304
|
-
raise ThordataServerError(
|
|
355
|
+
raise ThordataServerError(final_message, **kwargs)
|
|
305
356
|
|
|
306
|
-
# Validation errors
|
|
357
|
+
# Validation errors (400, 422)
|
|
307
358
|
if effective_code in ThordataValidationError.HTTP_STATUS_CODES:
|
|
308
|
-
raise ThordataValidationError(
|
|
359
|
+
raise ThordataValidationError(final_message, **kwargs)
|
|
309
360
|
|
|
310
|
-
#
|
|
311
|
-
raise ThordataAPIError(
|
|
361
|
+
# Fallback to generic API error if no specific match
|
|
362
|
+
raise ThordataAPIError(final_message, **kwargs)
|
|
312
363
|
|
|
313
364
|
|
|
314
365
|
# =============================================================================
|
|
@@ -186,7 +186,7 @@ def with_retry(
|
|
|
186
186
|
if isinstance(e, ThordataRateLimitError) and e.retry_after:
|
|
187
187
|
delay = max(delay, e.retry_after)
|
|
188
188
|
|
|
189
|
-
logger.
|
|
189
|
+
logger.info(
|
|
190
190
|
f"Retry attempt {attempt + 1}/{config.max_retries} "
|
|
191
191
|
f"after {delay:.2f}s due to: {e}"
|
|
192
192
|
)
|
|
@@ -5,15 +5,19 @@ High-level abstractions for specific scraping targets.
|
|
|
5
5
|
|
|
6
6
|
from .base import ToolRequest, VideoToolRequest
|
|
7
7
|
from .code import GitHub
|
|
8
|
-
from .ecommerce import Amazon
|
|
8
|
+
from .ecommerce import Amazon, Walmart, eBay
|
|
9
|
+
from .professional import Crunchbase, Glassdoor, Indeed
|
|
9
10
|
from .search import GoogleMaps, GooglePlay, GoogleShopping
|
|
10
11
|
from .social import Facebook, Instagram, LinkedIn, Reddit, TikTok, Twitter
|
|
12
|
+
from .travel import Airbnb, Booking, Zillow
|
|
11
13
|
from .video import YouTube
|
|
12
14
|
|
|
13
15
|
__all__ = [
|
|
14
16
|
"ToolRequest",
|
|
15
17
|
"VideoToolRequest",
|
|
16
18
|
"Amazon",
|
|
19
|
+
"eBay",
|
|
20
|
+
"Walmart",
|
|
17
21
|
"GoogleMaps",
|
|
18
22
|
"GoogleShopping",
|
|
19
23
|
"GooglePlay",
|
|
@@ -25,4 +29,10 @@ __all__ = [
|
|
|
25
29
|
"Reddit",
|
|
26
30
|
"YouTube",
|
|
27
31
|
"GitHub",
|
|
32
|
+
"Indeed",
|
|
33
|
+
"Glassdoor",
|
|
34
|
+
"Crunchbase",
|
|
35
|
+
"Booking",
|
|
36
|
+
"Zillow",
|
|
37
|
+
"Airbnb",
|
|
28
38
|
]
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Code Repository Scraper Tools (GitHub, etc.)
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
from .base import ToolRequest
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class GitHub:
|
|
13
|
+
"""Namespace for GitHub tools."""
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class Repository(ToolRequest):
|
|
17
|
+
"""Github Repository Scraper by Repo URL"""
|
|
18
|
+
|
|
19
|
+
SPIDER_ID = "github_repository_by-repo-url"
|
|
20
|
+
SPIDER_NAME = "github.com"
|
|
21
|
+
repo_url: str
|
|
22
|
+
|
|
23
|
+
@dataclass
|
|
24
|
+
class RepositoryBySearchUrl(ToolRequest):
|
|
25
|
+
"""Github Repository Scraper by Search URL"""
|
|
26
|
+
|
|
27
|
+
SPIDER_ID = "github_repository_by-search-url"
|
|
28
|
+
SPIDER_NAME = "github.com"
|
|
29
|
+
search_url: str
|
|
30
|
+
page_turning: int | None = None
|
|
31
|
+
max_num: int | None = None
|
|
32
|
+
|
|
33
|
+
@dataclass
|
|
34
|
+
class RepositoryByUrl(ToolRequest):
|
|
35
|
+
"""Github Repository Scraper by URL"""
|
|
36
|
+
|
|
37
|
+
SPIDER_ID = "github_repository_by-url"
|
|
38
|
+
SPIDER_NAME = "github.com"
|
|
39
|
+
url: str
|