thordata-sdk 1.5.0__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/PKG-INFO +63 -7
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/README.md +62 -6
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/pyproject.toml +4 -3
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/__init__.py +1 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/async_client.py +55 -13
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/client.py +64 -13
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/enums.py +2 -2
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/exceptions.py +80 -20
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/models.py +1 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/retry.py +1 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/tools/__init__.py +11 -1
- thordata_sdk-1.7.0/src/thordata/tools/code.py +39 -0
- thordata_sdk-1.7.0/src/thordata/tools/ecommerce.py +251 -0
- thordata_sdk-1.7.0/src/thordata/tools/professional.py +155 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/tools/search.py +47 -5
- thordata_sdk-1.7.0/src/thordata/tools/social.py +374 -0
- thordata_sdk-1.7.0/src/thordata/tools/travel.py +100 -0
- thordata_sdk-1.7.0/src/thordata/tools/video.py +154 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/types/serp.py +6 -2
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/types/task.py +75 -9
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/types/universal.py +37 -5
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/PKG-INFO +63 -7
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/SOURCES.txt +8 -3
- thordata_sdk-1.7.0/tests/test_async_client.py +424 -0
- thordata_sdk-1.7.0/tests/test_batch_creation.py +116 -0
- thordata_sdk-1.7.0/tests/test_client.py +606 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_enums.py +1 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_examples.py +4 -1
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_integration_proxy_protocols.py +2 -3
- thordata_sdk-1.7.0/tests/test_retry.py +317 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_spec_parity.py +36 -2
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_tools.py +5 -3
- thordata_sdk-1.7.0/tests/test_tools_coverage.py +102 -0
- thordata_sdk-1.7.0/tests/test_unlimited.py +184 -0
- thordata_sdk-1.7.0/tests/test_utils.py +126 -0
- thordata_sdk-1.5.0/src/thordata/_example_utils.py +0 -77
- thordata_sdk-1.5.0/src/thordata/demo.py +0 -138
- thordata_sdk-1.5.0/src/thordata/tools/code.py +0 -26
- thordata_sdk-1.5.0/src/thordata/tools/ecommerce.py +0 -67
- thordata_sdk-1.5.0/src/thordata/tools/social.py +0 -190
- thordata_sdk-1.5.0/src/thordata/tools/video.py +0 -81
- thordata_sdk-1.5.0/tests/test_async_client.py +0 -111
- thordata_sdk-1.5.0/tests/test_client.py +0 -121
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/LICENSE +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/setup.cfg +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/_utils.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/async_unlimited.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/core/__init__.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/core/async_http_client.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/core/http_client.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/core/tunnel.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/serp_engines.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/tools/base.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/types/__init__.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/types/common.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/types/proxy.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata/unlimited.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/dependency_links.txt +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/requires.txt +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/src/thordata_sdk.egg-info/top_level.txt +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_async_client_errors.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_client_errors.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_exceptions.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_models.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_task_status_and_wait.py +0 -0
- {thordata_sdk-1.5.0 → thordata_sdk-1.7.0}/tests/test_user_agent.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: thordata-sdk
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
|
|
5
5
|
Author-email: Thordata Developer Team <support@thordata.com>
|
|
6
6
|
License: MIT
|
|
@@ -63,9 +63,9 @@ Dynamic: license-file
|
|
|
63
63
|
|
|
64
64
|
## 📖 Introduction
|
|
65
65
|
|
|
66
|
-
The **Thordata Python SDK v1.
|
|
66
|
+
The **Thordata Python SDK v1.6.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
|
|
67
67
|
|
|
68
|
-
**Why v1.
|
|
68
|
+
**Why v1.6.0?**
|
|
69
69
|
* **🛡️ Bulletproof Networking**: Custom core handles `HTTP`, `HTTPS`, and `SOCKS5h` (Remote DNS) tunneling, solving common SSL/TLS handshake issues in complex network environments.
|
|
70
70
|
* **⚡ Async First**: First-class `asyncio` support with `aiohttp` for high-concurrency scraping (1000+ RPS).
|
|
71
71
|
* **🧩 100% API Coverage**: Every endpoint documented by Thordata (including Hourly Usage, Server Monitor, and Task Management) is implemented.
|
|
@@ -83,7 +83,7 @@ pip install thordata-sdk
|
|
|
83
83
|
|
|
84
84
|
## 🔐 Configuration
|
|
85
85
|
|
|
86
|
-
Set environment variables to avoid hardcoding credentials.
|
|
86
|
+
Set environment variables to avoid hardcoding credentials. **Full reference:** copy [.env.example](.env.example) to `.env` and fill in values.
|
|
87
87
|
|
|
88
88
|
```bash
|
|
89
89
|
# [Scraping APIs]
|
|
@@ -93,13 +93,19 @@ export THORDATA_SCRAPER_TOKEN="your_scraper_token"
|
|
|
93
93
|
export THORDATA_PUBLIC_TOKEN="your_public_token"
|
|
94
94
|
export THORDATA_PUBLIC_KEY="your_public_key"
|
|
95
95
|
|
|
96
|
-
# [Proxy
|
|
96
|
+
# [Proxy: Residential / Unlimited / Datacenter / Mobile / ISP]
|
|
97
97
|
export THORDATA_RESIDENTIAL_USERNAME="your_username"
|
|
98
98
|
export THORDATA_RESIDENTIAL_PASSWORD="your_password"
|
|
99
|
-
# Optional:
|
|
100
|
-
# export
|
|
99
|
+
# Optional: Unlimited (high-bandwidth) if your plan has separate credentials
|
|
100
|
+
# export THORDATA_UNLIMITED_USERNAME="..."
|
|
101
|
+
# export THORDATA_UNLIMITED_PASSWORD="..."
|
|
102
|
+
|
|
103
|
+
# Optional: Upstream proxy when behind firewall (e.g. Clash Verge port 7897)
|
|
104
|
+
# export THORDATA_UPSTREAM_PROXY="http://127.0.0.1:7897"
|
|
101
105
|
```
|
|
102
106
|
|
|
107
|
+
Default proxy port is **9999** (residential); other products use different ports (see `.env.example`).
|
|
108
|
+
|
|
103
109
|
---
|
|
104
110
|
|
|
105
111
|
## 🚀 Quick Start
|
|
@@ -199,6 +205,48 @@ if status == "finished":
|
|
|
199
205
|
print(f"Download: {data_url}")
|
|
200
206
|
```
|
|
201
207
|
|
|
208
|
+
### Web Scraper Tools (120+ Pre-built Tools)
|
|
209
|
+
|
|
210
|
+
Use pre-built tools for popular platforms. See [Tool Coverage Matrix](docs/TOOL_COVERAGE_MATRIX.md) for full list.
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from thordata import ThordataClient
|
|
214
|
+
from thordata.tools import Amazon, GoogleMaps, YouTube, TikTok, eBay, Walmart
|
|
215
|
+
|
|
216
|
+
client = ThordataClient()
|
|
217
|
+
|
|
218
|
+
# Amazon Product by ASIN
|
|
219
|
+
task_id = client.run_tool(Amazon.ProductByAsin(asin="B0BZYCJK89"))
|
|
220
|
+
|
|
221
|
+
# Google Maps by Place ID
|
|
222
|
+
task_id = client.run_tool(GoogleMaps.DetailsByPlaceId(place_id="ChIJPTacEpBQwokRKwIlDXelxkA"))
|
|
223
|
+
|
|
224
|
+
# YouTube Video Download
|
|
225
|
+
from thordata import CommonSettings
|
|
226
|
+
settings = CommonSettings(resolution="<=360p", video_codec="vp9")
|
|
227
|
+
task_id = client.run_tool(YouTube.VideoDownload(
|
|
228
|
+
url="https://www.youtube.com/watch?v=jNQXAC9IVRw",
|
|
229
|
+
common_settings=settings
|
|
230
|
+
))
|
|
231
|
+
|
|
232
|
+
# Wait and get results
|
|
233
|
+
status = client.wait_for_task(task_id, max_wait=300)
|
|
234
|
+
if status == "ready":
|
|
235
|
+
download_url = client.get_task_result(task_id)
|
|
236
|
+
print(f"Results: {download_url}")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
**Available Platforms:**
|
|
240
|
+
- **E-Commerce**: Amazon, eBay, Walmart
|
|
241
|
+
- **Social Media**: TikTok, Instagram, Facebook, Twitter/X, Reddit, LinkedIn
|
|
242
|
+
- **Search**: Google Maps, Google Shopping, Google Play
|
|
243
|
+
- **Video**: YouTube (download, info, subtitles)
|
|
244
|
+
- **Code**: GitHub
|
|
245
|
+
- **Professional**: Indeed, Glassdoor, Crunchbase
|
|
246
|
+
- **Travel/Real Estate**: Booking, Airbnb, Zillow
|
|
247
|
+
|
|
248
|
+
See `examples/tools/` for more examples.
|
|
249
|
+
|
|
202
250
|
---
|
|
203
251
|
|
|
204
252
|
## 🛠️ Management APIs
|
|
@@ -226,6 +274,14 @@ monitor = client.unlimited.get_server_monitor(
|
|
|
226
274
|
|
|
227
275
|
---
|
|
228
276
|
|
|
277
|
+
## 🧪 Development & Testing
|
|
278
|
+
|
|
279
|
+
- **Full env reference**: Copy [.env.example](.env.example) to `.env` and fill in credentials.
|
|
280
|
+
- **Unit tests** (no network): `pytest` or `python -m coverage run -m pytest -p no:cov tests && python -m coverage report -m`
|
|
281
|
+
- **Integration tests** (live API/proxy): Set `THORDATA_INTEGRATION=true` in `.env`; optional `THORDATA_UPSTREAM_PROXY` (e.g. Clash) if behind a firewall. See [CONTRIBUTING.md](CONTRIBUTING.md#-testing-guidelines).
|
|
282
|
+
|
|
283
|
+
---
|
|
284
|
+
|
|
229
285
|
## 📄 License
|
|
230
286
|
|
|
231
287
|
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -19,9 +19,9 @@
|
|
|
19
19
|
|
|
20
20
|
## 📖 Introduction
|
|
21
21
|
|
|
22
|
-
The **Thordata Python SDK v1.
|
|
22
|
+
The **Thordata Python SDK v1.6.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
|
|
23
23
|
|
|
24
|
-
**Why v1.
|
|
24
|
+
**Why v1.6.0?**
|
|
25
25
|
* **🛡️ Bulletproof Networking**: Custom core handles `HTTP`, `HTTPS`, and `SOCKS5h` (Remote DNS) tunneling, solving common SSL/TLS handshake issues in complex network environments.
|
|
26
26
|
* **⚡ Async First**: First-class `asyncio` support with `aiohttp` for high-concurrency scraping (1000+ RPS).
|
|
27
27
|
* **🧩 100% API Coverage**: Every endpoint documented by Thordata (including Hourly Usage, Server Monitor, and Task Management) is implemented.
|
|
@@ -39,7 +39,7 @@ pip install thordata-sdk
|
|
|
39
39
|
|
|
40
40
|
## 🔐 Configuration
|
|
41
41
|
|
|
42
|
-
Set environment variables to avoid hardcoding credentials.
|
|
42
|
+
Set environment variables to avoid hardcoding credentials. **Full reference:** copy [.env.example](.env.example) to `.env` and fill in values.
|
|
43
43
|
|
|
44
44
|
```bash
|
|
45
45
|
# [Scraping APIs]
|
|
@@ -49,13 +49,19 @@ export THORDATA_SCRAPER_TOKEN="your_scraper_token"
|
|
|
49
49
|
export THORDATA_PUBLIC_TOKEN="your_public_token"
|
|
50
50
|
export THORDATA_PUBLIC_KEY="your_public_key"
|
|
51
51
|
|
|
52
|
-
# [Proxy
|
|
52
|
+
# [Proxy: Residential / Unlimited / Datacenter / Mobile / ISP]
|
|
53
53
|
export THORDATA_RESIDENTIAL_USERNAME="your_username"
|
|
54
54
|
export THORDATA_RESIDENTIAL_PASSWORD="your_password"
|
|
55
|
-
# Optional:
|
|
56
|
-
# export
|
|
55
|
+
# Optional: Unlimited (high-bandwidth) if your plan has separate credentials
|
|
56
|
+
# export THORDATA_UNLIMITED_USERNAME="..."
|
|
57
|
+
# export THORDATA_UNLIMITED_PASSWORD="..."
|
|
58
|
+
|
|
59
|
+
# Optional: Upstream proxy when behind firewall (e.g. Clash Verge port 7897)
|
|
60
|
+
# export THORDATA_UPSTREAM_PROXY="http://127.0.0.1:7897"
|
|
57
61
|
```
|
|
58
62
|
|
|
63
|
+
Default proxy port is **9999** (residential); other products use different ports (see `.env.example`).
|
|
64
|
+
|
|
59
65
|
---
|
|
60
66
|
|
|
61
67
|
## 🚀 Quick Start
|
|
@@ -155,6 +161,48 @@ if status == "finished":
|
|
|
155
161
|
print(f"Download: {data_url}")
|
|
156
162
|
```
|
|
157
163
|
|
|
164
|
+
### Web Scraper Tools (120+ Pre-built Tools)
|
|
165
|
+
|
|
166
|
+
Use pre-built tools for popular platforms. See [Tool Coverage Matrix](docs/TOOL_COVERAGE_MATRIX.md) for full list.
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
from thordata import ThordataClient
|
|
170
|
+
from thordata.tools import Amazon, GoogleMaps, YouTube, TikTok, eBay, Walmart
|
|
171
|
+
|
|
172
|
+
client = ThordataClient()
|
|
173
|
+
|
|
174
|
+
# Amazon Product by ASIN
|
|
175
|
+
task_id = client.run_tool(Amazon.ProductByAsin(asin="B0BZYCJK89"))
|
|
176
|
+
|
|
177
|
+
# Google Maps by Place ID
|
|
178
|
+
task_id = client.run_tool(GoogleMaps.DetailsByPlaceId(place_id="ChIJPTacEpBQwokRKwIlDXelxkA"))
|
|
179
|
+
|
|
180
|
+
# YouTube Video Download
|
|
181
|
+
from thordata import CommonSettings
|
|
182
|
+
settings = CommonSettings(resolution="<=360p", video_codec="vp9")
|
|
183
|
+
task_id = client.run_tool(YouTube.VideoDownload(
|
|
184
|
+
url="https://www.youtube.com/watch?v=jNQXAC9IVRw",
|
|
185
|
+
common_settings=settings
|
|
186
|
+
))
|
|
187
|
+
|
|
188
|
+
# Wait and get results
|
|
189
|
+
status = client.wait_for_task(task_id, max_wait=300)
|
|
190
|
+
if status == "ready":
|
|
191
|
+
download_url = client.get_task_result(task_id)
|
|
192
|
+
print(f"Results: {download_url}")
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
**Available Platforms:**
|
|
196
|
+
- **E-Commerce**: Amazon, eBay, Walmart
|
|
197
|
+
- **Social Media**: TikTok, Instagram, Facebook, Twitter/X, Reddit, LinkedIn
|
|
198
|
+
- **Search**: Google Maps, Google Shopping, Google Play
|
|
199
|
+
- **Video**: YouTube (download, info, subtitles)
|
|
200
|
+
- **Code**: GitHub
|
|
201
|
+
- **Professional**: Indeed, Glassdoor, Crunchbase
|
|
202
|
+
- **Travel/Real Estate**: Booking, Airbnb, Zillow
|
|
203
|
+
|
|
204
|
+
See `examples/tools/` for more examples.
|
|
205
|
+
|
|
158
206
|
---
|
|
159
207
|
|
|
160
208
|
## 🛠️ Management APIs
|
|
@@ -182,6 +230,14 @@ monitor = client.unlimited.get_server_monitor(
|
|
|
182
230
|
|
|
183
231
|
---
|
|
184
232
|
|
|
233
|
+
## 🧪 Development & Testing
|
|
234
|
+
|
|
235
|
+
- **Full env reference**: Copy [.env.example](.env.example) to `.env` and fill in credentials.
|
|
236
|
+
- **Unit tests** (no network): `pytest` or `python -m coverage run -m pytest -p no:cov tests && python -m coverage report -m`
|
|
237
|
+
- **Integration tests** (live API/proxy): Set `THORDATA_INTEGRATION=true` in `.env`; optional `THORDATA_UPSTREAM_PROXY` (e.g. Clash) if behind a firewall. See [CONTRIBUTING.md](CONTRIBUTING.md#-testing-guidelines).
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
185
241
|
## 📄 License
|
|
186
242
|
|
|
187
243
|
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "thordata-sdk"
|
|
8
|
-
version = "1.5.0"
|
|
8
|
+
version = "1.7.0"
|
|
9
9
|
description = "The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network."
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
requires-python = ">=3.9"
|
|
@@ -115,16 +115,17 @@ check_untyped_defs = false
|
|
|
115
115
|
strict_optional = false
|
|
116
116
|
show_error_codes = true
|
|
117
117
|
ignore_missing_imports = true
|
|
118
|
+
follow_imports = "skip"
|
|
118
119
|
|
|
119
120
|
[[tool.mypy.overrides]]
|
|
120
121
|
module = ["aiohttp.*", "requests.*"]
|
|
121
122
|
ignore_missing_imports = true
|
|
122
123
|
|
|
123
|
-
# Pytest setup
|
|
124
|
+
# Pytest setup (coverage is run separately via coverage CLI for reliability)
|
|
124
125
|
[tool.pytest.ini_options]
|
|
125
126
|
testpaths = ["tests"]
|
|
126
127
|
asyncio_mode = "auto"
|
|
127
|
-
addopts = "-v
|
|
128
|
+
addopts = "-v"
|
|
128
129
|
markers = ["integration: live tests that require real credentials"]
|
|
129
130
|
|
|
130
131
|
# Coverage setup
|
|
@@ -5,7 +5,7 @@ Official Python client for Thordata's Proxy Network, SERP API,
|
|
|
5
5
|
Universal Scraping API (Web Unlocker), and Web Scraper API.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
__version__ = "1.
|
|
8
|
+
__version__ = "1.6.0"
|
|
9
9
|
__author__ = "Thordata Developer Team/Kael Odin"
|
|
10
10
|
__email__ = "support@thordata.com"
|
|
11
11
|
|
|
@@ -124,10 +124,10 @@ class AsyncThordataClient:
|
|
|
124
124
|
).rstrip("/")
|
|
125
125
|
|
|
126
126
|
self._gateway_base_url = os.getenv(
|
|
127
|
-
"THORDATA_GATEWAY_BASE_URL", "https://
|
|
127
|
+
"THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
|
|
128
128
|
)
|
|
129
129
|
self._child_base_url = os.getenv(
|
|
130
|
-
"THORDATA_CHILD_BASE_URL", "https://
|
|
130
|
+
"THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
|
|
131
131
|
)
|
|
132
132
|
|
|
133
133
|
# URL Construction
|
|
@@ -145,7 +145,7 @@ class AsyncThordataClient:
|
|
|
145
145
|
self._proxy_users_url = f"{shared_api_base}/proxy-users"
|
|
146
146
|
|
|
147
147
|
whitelist_base = os.getenv(
|
|
148
|
-
"THORDATA_WHITELIST_BASE_URL", "https://
|
|
148
|
+
"THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
|
|
149
149
|
)
|
|
150
150
|
self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
|
|
151
151
|
|
|
@@ -293,28 +293,36 @@ class AsyncThordataClient:
|
|
|
293
293
|
url: str,
|
|
294
294
|
*,
|
|
295
295
|
js_render: bool = False,
|
|
296
|
-
output_format: str = "html",
|
|
296
|
+
output_format: str | list[str] = "html",
|
|
297
297
|
country: str | None = None,
|
|
298
298
|
block_resources: str | None = None,
|
|
299
|
+
clean_content: str | None = None,
|
|
299
300
|
wait: int | None = None,
|
|
300
301
|
wait_for: str | None = None,
|
|
302
|
+
follow_redirect: bool | None = None,
|
|
303
|
+
headers: list[dict[str, str]] | None = None,
|
|
304
|
+
cookies: list[dict[str, str]] | None = None,
|
|
301
305
|
**kwargs: Any,
|
|
302
|
-
) -> str | bytes:
|
|
306
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
303
307
|
request = UniversalScrapeRequest(
|
|
304
308
|
url=url,
|
|
305
309
|
js_render=js_render,
|
|
306
310
|
output_format=output_format,
|
|
307
311
|
country=country,
|
|
308
312
|
block_resources=block_resources,
|
|
313
|
+
clean_content=clean_content,
|
|
309
314
|
wait=wait,
|
|
310
315
|
wait_for=wait_for,
|
|
316
|
+
follow_redirect=follow_redirect,
|
|
317
|
+
headers=headers,
|
|
318
|
+
cookies=cookies,
|
|
311
319
|
extra_params=kwargs,
|
|
312
320
|
)
|
|
313
321
|
return await self.universal_scrape_advanced(request)
|
|
314
322
|
|
|
315
323
|
async def universal_scrape_advanced(
|
|
316
324
|
self, request: UniversalScrapeRequest
|
|
317
|
-
) -> str | bytes:
|
|
325
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
318
326
|
if not self.scraper_token:
|
|
319
327
|
raise ThordataConfigError("scraper_token required")
|
|
320
328
|
payload = request.to_payload()
|
|
@@ -327,9 +335,17 @@ class AsyncThordataClient:
|
|
|
327
335
|
try:
|
|
328
336
|
resp_json = await response.json()
|
|
329
337
|
except ValueError:
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
338
|
+
# If not JSON, return raw content based on format
|
|
339
|
+
if isinstance(request.output_format, list) or (
|
|
340
|
+
isinstance(request.output_format, str) and "," in request.output_format
|
|
341
|
+
):
|
|
342
|
+
return {"raw": await response.read()}
|
|
343
|
+
fmt = (
|
|
344
|
+
request.output_format.lower()
|
|
345
|
+
if isinstance(request.output_format, str)
|
|
346
|
+
else str(request.output_format).lower()
|
|
347
|
+
)
|
|
348
|
+
return await response.read() if fmt == "png" else await response.text()
|
|
333
349
|
|
|
334
350
|
if isinstance(resp_json, dict):
|
|
335
351
|
code = resp_json.get("code")
|
|
@@ -337,6 +353,27 @@ class AsyncThordataClient:
|
|
|
337
353
|
msg = extract_error_message(resp_json)
|
|
338
354
|
raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
|
|
339
355
|
|
|
356
|
+
# Handle multiple output formats
|
|
357
|
+
if isinstance(request.output_format, list) or (
|
|
358
|
+
isinstance(request.output_format, str) and "," in request.output_format
|
|
359
|
+
):
|
|
360
|
+
result: dict[str, str | bytes] = {}
|
|
361
|
+
formats = (
|
|
362
|
+
request.output_format
|
|
363
|
+
if isinstance(request.output_format, list)
|
|
364
|
+
else [f.strip() for f in request.output_format.split(",")]
|
|
365
|
+
)
|
|
366
|
+
|
|
367
|
+
for fmt in formats:
|
|
368
|
+
fmt_lower = fmt.lower()
|
|
369
|
+
if fmt_lower == "html" and "html" in resp_json:
|
|
370
|
+
result["html"] = resp_json["html"]
|
|
371
|
+
elif fmt_lower == "png" and "png" in resp_json:
|
|
372
|
+
result["png"] = decode_base64_image(resp_json["png"])
|
|
373
|
+
|
|
374
|
+
if result:
|
|
375
|
+
return result
|
|
376
|
+
|
|
340
377
|
if "html" in resp_json:
|
|
341
378
|
return resp_json["html"]
|
|
342
379
|
if "png" in resp_json:
|
|
@@ -352,7 +389,7 @@ class AsyncThordataClient:
|
|
|
352
389
|
file_name: str,
|
|
353
390
|
spider_id: str,
|
|
354
391
|
spider_name: str,
|
|
355
|
-
parameters: dict[str, Any],
|
|
392
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
356
393
|
universal_params: dict[str, Any] | None = None,
|
|
357
394
|
) -> str:
|
|
358
395
|
config = ScraperTaskConfig(
|
|
@@ -434,7 +471,7 @@ class AsyncThordataClient:
|
|
|
434
471
|
file_name: str,
|
|
435
472
|
spider_id: str,
|
|
436
473
|
spider_name: str,
|
|
437
|
-
parameters: dict[str, Any],
|
|
474
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
438
475
|
common_settings: CommonSettings,
|
|
439
476
|
) -> str:
|
|
440
477
|
config = VideoTaskConfig(
|
|
@@ -550,7 +587,7 @@ class AsyncThordataClient:
|
|
|
550
587
|
file_name: str,
|
|
551
588
|
spider_id: str,
|
|
552
589
|
spider_name: str,
|
|
553
|
-
parameters: dict[str, Any],
|
|
590
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
554
591
|
universal_params: dict[str, Any] | None = None,
|
|
555
592
|
*,
|
|
556
593
|
max_wait: float = 600.0,
|
|
@@ -971,7 +1008,12 @@ class AsyncThordataClient:
|
|
|
971
1008
|
if port:
|
|
972
1009
|
params["port"] = str(port)
|
|
973
1010
|
|
|
974
|
-
|
|
1011
|
+
if product == "unlimited":
|
|
1012
|
+
username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
|
|
1013
|
+
"THORDATA_RESIDENTIAL_USERNAME"
|
|
1014
|
+
)
|
|
1015
|
+
else:
|
|
1016
|
+
username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
|
|
975
1017
|
if username:
|
|
976
1018
|
params["td-customer"] = username
|
|
977
1019
|
|
|
@@ -53,6 +53,7 @@ from .serp_engines import SerpNamespace
|
|
|
53
53
|
# Import Types (Modernized)
|
|
54
54
|
from .types import (
|
|
55
55
|
CommonSettings,
|
|
56
|
+
DataFormat,
|
|
56
57
|
ProxyConfig,
|
|
57
58
|
ProxyProduct,
|
|
58
59
|
ProxyServer,
|
|
@@ -159,10 +160,10 @@ class ThordataClient:
|
|
|
159
160
|
).rstrip("/")
|
|
160
161
|
|
|
161
162
|
self._gateway_base_url = os.getenv(
|
|
162
|
-
"THORDATA_GATEWAY_BASE_URL", "https://
|
|
163
|
+
"THORDATA_GATEWAY_BASE_URL", "https://openapi.thordata.com/api/gateway"
|
|
163
164
|
)
|
|
164
165
|
self._child_base_url = os.getenv(
|
|
165
|
-
"THORDATA_CHILD_BASE_URL", "https://
|
|
166
|
+
"THORDATA_CHILD_BASE_URL", "https://openapi.thordata.com/api/child"
|
|
166
167
|
)
|
|
167
168
|
|
|
168
169
|
# URL Construction
|
|
@@ -183,7 +184,7 @@ class ThordataClient:
|
|
|
183
184
|
self._proxy_users_url = f"{shared_api_base}/proxy-users"
|
|
184
185
|
|
|
185
186
|
whitelist_base = os.getenv(
|
|
186
|
-
"THORDATA_WHITELIST_BASE_URL", "https://
|
|
187
|
+
"THORDATA_WHITELIST_BASE_URL", "https://openapi.thordata.com/api"
|
|
187
188
|
)
|
|
188
189
|
self._whitelist_url = f"{whitelist_base}/whitelisted-ips"
|
|
189
190
|
|
|
@@ -364,26 +365,36 @@ class ThordataClient:
|
|
|
364
365
|
url: str,
|
|
365
366
|
*,
|
|
366
367
|
js_render: bool = False,
|
|
367
|
-
output_format: str = "html",
|
|
368
|
+
output_format: str | list[str] = "html",
|
|
368
369
|
country: str | None = None,
|
|
369
370
|
block_resources: str | None = None,
|
|
371
|
+
clean_content: str | None = None,
|
|
370
372
|
wait: int | None = None,
|
|
371
373
|
wait_for: str | None = None,
|
|
374
|
+
follow_redirect: bool | None = None,
|
|
375
|
+
headers: list[dict[str, str]] | None = None,
|
|
376
|
+
cookies: list[dict[str, str]] | None = None,
|
|
372
377
|
**kwargs: Any,
|
|
373
|
-
) -> str | bytes:
|
|
378
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
374
379
|
request = UniversalScrapeRequest(
|
|
375
380
|
url=url,
|
|
376
381
|
js_render=js_render,
|
|
377
382
|
output_format=output_format,
|
|
378
383
|
country=country,
|
|
379
384
|
block_resources=block_resources,
|
|
385
|
+
clean_content=clean_content,
|
|
380
386
|
wait=wait,
|
|
381
387
|
wait_for=wait_for,
|
|
388
|
+
follow_redirect=follow_redirect,
|
|
389
|
+
headers=headers,
|
|
390
|
+
cookies=cookies,
|
|
382
391
|
extra_params=kwargs,
|
|
383
392
|
)
|
|
384
393
|
return self.universal_scrape_advanced(request)
|
|
385
394
|
|
|
386
|
-
def universal_scrape_advanced(
|
|
395
|
+
def universal_scrape_advanced(
|
|
396
|
+
self, request: UniversalScrapeRequest
|
|
397
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
387
398
|
if not self.scraper_token:
|
|
388
399
|
raise ThordataConfigError("scraper_token required")
|
|
389
400
|
|
|
@@ -405,7 +416,7 @@ class ThordataClient:
|
|
|
405
416
|
file_name: str,
|
|
406
417
|
spider_id: str,
|
|
407
418
|
spider_name: str,
|
|
408
|
-
parameters: dict[str, Any],
|
|
419
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
409
420
|
universal_params: dict[str, Any] | None = None,
|
|
410
421
|
) -> str:
|
|
411
422
|
config = ScraperTaskConfig(
|
|
@@ -490,7 +501,7 @@ class ThordataClient:
|
|
|
490
501
|
file_name: str,
|
|
491
502
|
spider_id: str,
|
|
492
503
|
spider_name: str,
|
|
493
|
-
parameters: dict[str, Any],
|
|
504
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
494
505
|
common_settings: CommonSettings,
|
|
495
506
|
) -> str:
|
|
496
507
|
config = VideoTaskConfig(
|
|
@@ -639,7 +650,7 @@ class ThordataClient:
|
|
|
639
650
|
file_name: str,
|
|
640
651
|
spider_id: str,
|
|
641
652
|
spider_name: str,
|
|
642
|
-
parameters: dict[str, Any],
|
|
653
|
+
parameters: dict[str, Any] | list[dict[str, Any]],
|
|
643
654
|
universal_params: dict[str, Any] | None = None,
|
|
644
655
|
*,
|
|
645
656
|
max_wait: float = 600.0,
|
|
@@ -648,6 +659,7 @@ class ThordataClient:
|
|
|
648
659
|
include_errors: bool = True,
|
|
649
660
|
task_type: str = "web",
|
|
650
661
|
common_settings: CommonSettings | None = None,
|
|
662
|
+
data_format: DataFormat | str | None = None,
|
|
651
663
|
) -> str:
|
|
652
664
|
import time
|
|
653
665
|
|
|
@@ -671,6 +683,7 @@ class ThordataClient:
|
|
|
671
683
|
parameters=parameters,
|
|
672
684
|
universal_params=universal_params,
|
|
673
685
|
include_errors=include_errors,
|
|
686
|
+
data_format=data_format,
|
|
674
687
|
)
|
|
675
688
|
task_id = self.create_scraper_task_advanced(config)
|
|
676
689
|
|
|
@@ -862,7 +875,12 @@ class ThordataClient:
|
|
|
862
875
|
if port:
|
|
863
876
|
params["port"] = str(port)
|
|
864
877
|
|
|
865
|
-
|
|
878
|
+
if product == "unlimited":
|
|
879
|
+
username = os.getenv("THORDATA_UNLIMITED_USERNAME") or os.getenv(
|
|
880
|
+
"THORDATA_RESIDENTIAL_USERNAME"
|
|
881
|
+
)
|
|
882
|
+
else:
|
|
883
|
+
username = os.getenv("THORDATA_RESIDENTIAL_USERNAME")
|
|
866
884
|
if username:
|
|
867
885
|
params["td-customer"] = username
|
|
868
886
|
|
|
@@ -1207,12 +1225,22 @@ class ThordataClient:
|
|
|
1207
1225
|
# =========================================================================
|
|
1208
1226
|
|
|
1209
1227
|
def _process_universal_response(
|
|
1210
|
-
self, response: requests.Response, output_format: str
|
|
1211
|
-
) -> str | bytes:
|
|
1228
|
+
self, response: requests.Response, output_format: str | list[str]
|
|
1229
|
+
) -> str | bytes | dict[str, str | bytes]:
|
|
1230
|
+
"""Process universal scrape response. Returns single value or dict if multiple formats requested."""
|
|
1212
1231
|
try:
|
|
1213
1232
|
resp_json = response.json()
|
|
1214
1233
|
except ValueError:
|
|
1215
|
-
|
|
1234
|
+
# If not JSON, return raw content based on format
|
|
1235
|
+
if isinstance(output_format, list):
|
|
1236
|
+
# Multiple formats requested but got non-JSON response
|
|
1237
|
+
return {"raw": response.content}
|
|
1238
|
+
fmt = (
|
|
1239
|
+
output_format.lower()
|
|
1240
|
+
if isinstance(output_format, str)
|
|
1241
|
+
else str(output_format).lower()
|
|
1242
|
+
)
|
|
1243
|
+
return response.content if fmt == "png" else response.text
|
|
1216
1244
|
|
|
1217
1245
|
if isinstance(resp_json, dict):
|
|
1218
1246
|
code = resp_json.get("code")
|
|
@@ -1220,6 +1248,29 @@ class ThordataClient:
|
|
|
1220
1248
|
msg = extract_error_message(resp_json)
|
|
1221
1249
|
raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
|
|
1222
1250
|
|
|
1251
|
+
# Handle multiple output formats
|
|
1252
|
+
if isinstance(output_format, list) or (
|
|
1253
|
+
isinstance(output_format, str) and "," in output_format
|
|
1254
|
+
):
|
|
1255
|
+
result: dict[str, str | bytes] = {}
|
|
1256
|
+
formats = (
|
|
1257
|
+
output_format
|
|
1258
|
+
if isinstance(output_format, list)
|
|
1259
|
+
else [f.strip() for f in output_format.split(",")]
|
|
1260
|
+
)
|
|
1261
|
+
|
|
1262
|
+
for fmt in formats:
|
|
1263
|
+
fmt_lower = fmt.lower()
|
|
1264
|
+
if fmt_lower == "html" and "html" in resp_json:
|
|
1265
|
+
result["html"] = resp_json["html"]
|
|
1266
|
+
elif fmt_lower == "png" and "png" in resp_json:
|
|
1267
|
+
result["png"] = decode_base64_image(resp_json["png"])
|
|
1268
|
+
|
|
1269
|
+
# If we got results, return dict; otherwise return single value for backward compatibility
|
|
1270
|
+
if result:
|
|
1271
|
+
return result
|
|
1272
|
+
|
|
1273
|
+
# Single format (backward compatibility)
|
|
1223
1274
|
if "html" in resp_json:
|
|
1224
1275
|
return resp_json["html"]
|
|
1225
1276
|
if "png" in resp_json:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Enumerations for the Thordata Python SDK.
|
|
3
|
-
Moved to thordata.types in v1.
|
|
3
|
+
Moved to thordata.types in v1.6.0.
|
|
4
4
|
This file is kept for backward compatibility.
|
|
5
5
|
"""
|
|
6
6
|
|
|
@@ -21,7 +21,7 @@ from .types import (
|
|
|
21
21
|
SessionType,
|
|
22
22
|
TaskStatus,
|
|
23
23
|
TimeRange,
|
|
24
|
-
normalize_enum_value,
|
|
24
|
+
normalize_enum_value,
|
|
25
25
|
)
|
|
26
26
|
|
|
27
27
|
__all__ = [
|