thordata-sdk 0.2.4__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +151 -0
- thordata/_example_utils.py +77 -0
- thordata/_utils.py +190 -0
- thordata/async_client.py +1675 -0
- thordata/client.py +1644 -0
- thordata/demo.py +138 -0
- thordata/enums.py +384 -0
- thordata/exceptions.py +355 -0
- thordata/models.py +1197 -0
- thordata/retry.py +382 -0
- thordata/serp_engines.py +166 -0
- thordata_sdk-1.2.0.dist-info/METADATA +208 -0
- thordata_sdk-1.2.0.dist-info/RECORD +16 -0
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-1.2.0.dist-info}/WHEEL +1 -1
- thordata_sdk-1.2.0.dist-info/licenses/LICENSE +21 -0
- thordata_sdk-1.2.0.dist-info/top_level.txt +1 -0
- thordata_sdk/__init__.py +0 -9
- thordata_sdk/async_client.py +0 -247
- thordata_sdk/client.py +0 -303
- thordata_sdk/enums.py +0 -20
- thordata_sdk/parameters.py +0 -41
- thordata_sdk-0.2.4.dist-info/LICENSE +0 -201
- thordata_sdk-0.2.4.dist-info/METADATA +0 -113
- thordata_sdk-0.2.4.dist-info/RECORD +0 -10
- thordata_sdk-0.2.4.dist-info/top_level.txt +0 -1
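
The headline change in this release is the rename of the import package: 0.2.4 installed a top-level `thordata_sdk` package, while 1.2.0 ships a `thordata` package (compare the old and new `top_level.txt` entries and the deleted `thordata_sdk/*` modules below). A minimal migration sketch, assuming the import surface shown in the bundled README (`ThordataClient`, `Engine`, `serp_search`) and that the `Engine.GOOGLE` member from 0.2.4 carried over:

```python
# 0.2.4 — the wheel installed a top-level `thordata_sdk` package:
# from thordata_sdk import ThordataClient, AsyncThordataClient

# 1.2.0 — the top-level package is now `thordata`, and credentials can be
# picked up from THORDATA_* environment variables (per the README below).
from thordata import Engine, ThordataClient

client = ThordataClient()  # reads THORDATA_SCRAPER_TOKEN from the environment
results = client.serp_search("thordata", engine=Engine.GOOGLE)
print(sorted(results.keys()))  # inspect whichever result sections the API returned
```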
thordata_sdk-1.2.0.dist-info/METADATA
@@ -0,0 +1,208 @@
+Metadata-Version: 2.4
+Name: thordata-sdk
+Version: 1.2.0
+Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
+Author-email: Thordata Developer Team <support@thordata.com>
+License: MIT
+Project-URL: Homepage, https://www.thordata.com
+Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
+Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
+Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
+Project-URL: Changelog, https://github.com/Thordata/thordata-python-sdk/blob/main/CHANGELOG.md
+Keywords: web scraping,proxy,residential proxy,datacenter proxy,ai,llm,data-mining,serp,thordata,web scraper,anti-bot bypass
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Internet :: Proxy Servers
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Typing :: Typed
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests>=2.25.0
+Requires-Dist: aiohttp>=3.9.0
+Requires-Dist: PySocks>=1.7.1
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: pytest-httpserver>=1.0.0; extra == "dev"
+Requires-Dist: python-dotenv>=1.0.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+Requires-Dist: mypy>=1.0.0; extra == "dev"
+Requires-Dist: types-requests>=2.28.0; extra == "dev"
+Requires-Dist: aioresponses>=0.7.6; extra == "dev"
+Dynamic: license-file
+
+# Thordata Python SDK
+
+<div align="center">
+
+<img src="https://img.shields.io/badge/Thordata-AI%20Infrastructure-blue?style=for-the-badge" alt="Thordata Logo">
+
+**The Official Python Client for Thordata APIs**
+
+*Proxy Network • SERP API • Web Unlocker • Web Scraper API*
+
+[](https://pypi.org/project/thordata-sdk/)
+[](https://pypi.org/project/thordata-sdk/)
+[](LICENSE)
+[](https://github.com/Thordata/thordata-python-sdk/actions)
+
+</div>
+
+---
+
+## 📖 Introduction
+
+This SDK provides a robust, high-performance interface to Thordata's AI data infrastructure. It is designed for high-concurrency scraping, reliable proxy tunneling, and seamless data extraction.
+
+**Key Features:**
+* **🚀 Production Ready:** Built on `urllib3` connection pooling for low-latency proxy requests.
+* **⚡ Async Support:** Native `aiohttp` client for high-concurrency SERP/Universal scraping.
+* **🛡️ Robust:** Handles TLS-in-TLS tunneling, retries, and error parsing automatically.
+* **✨ Developer Experience:** Fully typed (`mypy` compatible) with intuitive IDE autocomplete.
+* **🧩 Lazy Validation:** Only validate credentials for the features you actually use.
+
+---
+
+## 📦 Installation
+
+```bash
+pip install thordata-sdk
+```
+
+---
+
+## 🔐 Configuration
+
+Set environment variables to avoid hardcoding credentials. You only need to set the variables for the features you use.
+
+```bash
+# [Required for SERP & Web Unlocker]
+export THORDATA_SCRAPER_TOKEN="your_token_here"
+
+# [Required for Proxy Network]
+export THORDATA_RESIDENTIAL_USERNAME="your_username"
+export THORDATA_RESIDENTIAL_PASSWORD="your_password"
+export THORDATA_PROXY_HOST="vpnXXXX.pr.thordata.net"
+
+# [Required for Task Management]
+export THORDATA_PUBLIC_TOKEN="public_token"
+export THORDATA_PUBLIC_KEY="public_key"
+```
+
+---
+
+## 🚀 Quick Start
+
+### 1. SERP Search (Google/Bing/Yandex)
+
+```python
+from thordata import ThordataClient, Engine
+
+client = ThordataClient()  # Loads THORDATA_SCRAPER_TOKEN from env
+
+# Simple Search
+print("Searching...")
+results = client.serp_search("latest AI trends", engine=Engine.GOOGLE_NEWS)
+
+for news in results.get("news_results", [])[:3]:
+    print(f"- {news['title']} ({news['source']})")
+```
+
+### 2. Universal Scrape (Web Unlocker)
+
+Bypass Cloudflare/Akamai and render JavaScript automatically.
+
+```python
+html = client.universal_scrape(
+    url="https://example.com/protected-page",
+    js_render=True,
+    wait_for=".content-loaded",
+    country="us"
+)
+print(f"Scraped {len(html)} bytes")
+```
+
+### 3. High-Performance Proxy
+
+Use Thordata's residential IPs with automatic connection pooling.
+
+```python
+from thordata import ProxyConfig, ProxyProduct
+
+# Config is optional if env vars are set, but allows granular control
+proxy = ProxyConfig(
+    product=ProxyProduct.RESIDENTIAL,
+    country="jp",
+    city="tokyo",
+    session_id="session-001",
+    session_duration=10  # Sticky IP for 10 mins
+)
+
+# Use the client to make requests (Reuses TCP connections)
+response = client.get("https://httpbin.org/ip", proxy_config=proxy)
+print(response.json())
+```
+
+---
+
+## ⚙️ Advanced Usage
+
+### Async Client (High Concurrency)
+
+For building AI agents or high-throughput spiders.
+
+```python
+import asyncio
+from thordata import AsyncThordataClient
+
+async def main():
+    async with AsyncThordataClient() as client:
+        # Fire off multiple requests in parallel
+        tasks = [
+            client.serp_search(f"query {i}")
+            for i in range(5)
+        ]
+        results = await asyncio.gather(*tasks)
+        print(f"Completed {len(results)} searches")
+
+asyncio.run(main())
+```
+
+### Web Scraper API (Task Management)
+
+Create and manage large-scale scraping tasks asynchronously.
+
+```python
+# 1. Create a task
+task_id = client.create_scraper_task(
+    file_name="daily_scrape",
+    spider_id="universal",
+    spider_name="universal",
+    parameters={"url": "https://example.com"}
+)
+
+# 2. Wait for completion (Polling)
+status = client.wait_for_task(task_id)
+
+# 3. Get results
+if status == "ready":
+    url = client.get_task_result(task_id)
+    print(f"Download Data: {url}")
+```
+
+---
+
+## 📄 License
+
+MIT License. See [LICENSE](LICENSE) for details.
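
The Web Scraper API example in the README above blocks on `client.wait_for_task(task_id)`. For readers who want to see what that polling amounts to, here is a rough manual equivalent; `get_task_status` is hypothetical for the 1.2.0 sync client (it mirrors the method on the deleted 0.2.4 async client further down), so treat this as a sketch rather than the SDK's actual helper:

```python
import time


def wait_for_task_manually(client, task_id: str, interval: float = 5.0, timeout: float = 600.0) -> str:
    """Poll a scraper task until it reports "ready" (the status the README checks) or time out."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        # Hypothetical call: the deleted 0.2.4 async client exposed get_task_status();
        # the 1.2.0 client may name or shape this differently.
        status = client.get_task_status(task_id)
        if status == "ready":
            return status
        time.sleep(interval)  # pause between polls instead of hammering the status endpoint
    raise TimeoutError(f"Task {task_id} was not ready within {timeout} seconds")
```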
thordata_sdk-1.2.0.dist-info/RECORD
@@ -0,0 +1,16 @@
+thordata/__init__.py,sha256=Ac7vBgJx4idvcw-Q5HFI0K9xkpSjtavQ_CRN5fZnBPk,3195
+thordata/_example_utils.py,sha256=T9QtVq9BHhubOShgtGp2GSusYYd-ZFUJFJAw7ubIsa4,2199
+thordata/_utils.py,sha256=Acr_6sHgdZXU7SQozd6FEYTZV6iHw__nlhpBTDwb66U,4917
+thordata/async_client.py,sha256=Cd94u7Si8WRClZMzVfPfUO0a9U1vEDOXysuBQMEjvjs,56075
+thordata/client.py,sha256=KRveHA62hF3NzbJHYQSQdIsEYeozhL7rdDHg--N7GaM,56159
+thordata/demo.py,sha256=HQzgaUM33bWD7mBQ6HEkK5K6zqFnSAHLvaam6BwPgFA,3762
+thordata/enums.py,sha256=MpZnS9_8sg2vtcFqM6UicB94cKZm5R1t83L3ejNSbLs,8502
+thordata/exceptions.py,sha256=P9czrxkFhT439DxW3LE5W-koS595ObH4-mAQOfaDM18,9976
+thordata/models.py,sha256=qtB7jE0v5zNEQfSpmOqdiacB5DgM2QfVR2PaYs-DisM,38206
+thordata/retry.py,sha256=5kRwULl3X68Nx8PlSzr9benfyCL0nRSpVQXrwjWr45M,11456
+thordata/serp_engines.py,sha256=iuMWncelcGOskCHXFzpcPMMTL5qfiLkazHB1uj3zpZo,5985
+thordata_sdk-1.2.0.dist-info/licenses/LICENSE,sha256=bAxpWgQIzb-5jl3nhLdOwOJ_vlbHLtSG7yev2B7vioY,1088
+thordata_sdk-1.2.0.dist-info/METADATA,sha256=fFDZQ8nh5_4RM8kUS7CAMylANPKROscmDusJId21ZjU,6600
+thordata_sdk-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thordata_sdk-1.2.0.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
+thordata_sdk-1.2.0.dist-info/RECORD,,
thordata_sdk-1.2.0.dist-info/licenses/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Thordata · AI Proxy & Web Data
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
thordata_sdk-1.2.0.dist-info/top_level.txt
@@ -0,0 +1 @@
+thordata
thordata_sdk/__init__.py
DELETED
@@ -1,9 +0,0 @@
-# Expose main clients
-from .client import ThordataClient
-from .async_client import AsyncThordataClient
-from .enums import Engine, GoogleSearchType
-
-# Version of the thordata-sdk package
-__version__ = "0.2.4"
-
-__all__ = ["ThordataClient", "AsyncThordataClient"]
thordata_sdk/async_client.py
DELETED
@@ -1,247 +0,0 @@
-import aiohttp
-import logging
-import json
-import base64
-from typing import Optional, Dict, Any, Union
-
-# Reuse the logic and enums we just wrote
-from .enums import Engine
-from .parameters import normalize_serp_params
-
-logger = logging.getLogger(__name__)
-
-
-class AsyncThordataClient:
-    """
-    Thordata Asynchronous Client (built on aiohttp).
-    """
-
-    def __init__(
-        self,
-        scraper_token: str,
-        public_token: str,
-        public_key: str,
-        proxy_host: str = "gate.thordata.com",
-        proxy_port: int = 22225
-    ):
-        self.scraper_token = scraper_token
-        self.public_token = public_token
-        self.public_key = public_key
-
-        self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
-        self.proxy_url = f"http://{proxy_host}:{proxy_port}"
-
-        self.base_url = "https://scraperapi.thordata.com"
-        self.universal_url = "https://universalapi.thordata.com"
-        self.api_url = "https://api.thordata.com/api/web-scraper-api"
-
-        self.SERP_API_URL = f"{self.base_url}/request"
-        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
-        self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
-        self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
-        self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
-
-        self._session: Optional[aiohttp.ClientSession] = None
-
-    async def __aenter__(self):
-        if self._session is None or self._session.closed:
-            self._session = aiohttp.ClientSession(trust_env=True)
-        return self
-
-    async def __aexit__(self, exc_type, exc, tb):
-        await self.close()
-
-    async def close(self):
-        if self._session and not self._session.closed:
-            await self._session.close()
-        self._session = None
-
-    # --- Proxy (Unchanged) ---
-    async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-        if self._session is None:
-            raise RuntimeError("Client session not initialized.")
-        try:
-            return await self._session.get(
-                url,
-                proxy=self.proxy_url,
-                proxy_auth=self.proxy_auth,
-                **kwargs
-            )
-        except aiohttp.ClientError as e:
-            logger.error(f"Async Request failed: {e}")
-            raise
-
-    # --- SERP (Optimized) ---
-    async def serp_search(
-        self,
-        query: str,
-        engine: Union[Engine, str] = Engine.GOOGLE,
-        num: int = 10,
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Execute a real-time SERP search (Async).
-        """
-        if self._session is None:
-            raise RuntimeError("Client session not initialized.")
-
-        # 1. Convert the engine enum
-        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
-
-        # 2. Reuse the normalization logic in parameters.py (Don't Repeat Yourself!)
-        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
-
-        headers = {
-            "Authorization": f"Bearer {self.scraper_token}",
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-
-        # 3. Send the request
-        async with self._session.post(
-            self.SERP_API_URL, data=payload, headers=headers
-        ) as response:
-            response.raise_for_status()
-            data = await response.json()
-            if isinstance(data, str):
-                try:
-                    data = json.loads(data)
-                except Exception:
-                    pass
-            return data
-
-    # --- Universal (Unchanged) ---
-    async def universal_scrape(
-        self,
-        url: str,
-        js_render: bool = False,
-        output_format: str = "HTML",
-        country: str = None,
-        block_resources: bool = False
-    ) -> Union[str, bytes]:
-        if self._session is None:
-            raise RuntimeError("Client session not initialized.")
-
-        headers = {
-            "Authorization": f"Bearer {self.scraper_token}",
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-
-        payload = {
-            "url": url,
-            "js_render": "True" if js_render else "False",
-            "type": output_format.lower(),
-            "block_resources": "True" if block_resources else "False"
-        }
-        if country:
-            payload["country"] = country
-
-        async with self._session.post(
-            self.UNIVERSAL_API_URL, data=payload, headers=headers
-        ) as response:
-            response.raise_for_status()
-
-            try:
-                resp_json = await response.json()
-            except Exception:
-                if output_format.upper() == "PNG":
-                    return await response.read()
-                return await response.text()
-
-            if isinstance(resp_json, dict) and resp_json.get("code") \
-                    and resp_json.get("code") != 200:
-                raise Exception(f"Universal API Error: {resp_json}")
-
-            if "html" in resp_json:
-                return resp_json["html"]
-
-            if "png" in resp_json:
-                png_str = resp_json["png"]
-                if not png_str:
-                    raise Exception("API returned empty PNG data")
-
-                # 🛠️ FIX: Strip the Data URI scheme prefix
-                if "," in png_str:
-                    png_str = png_str.split(",", 1)[1]
-
-                png_str = png_str.replace("\n", "").replace("\r", "")
-                missing_padding = len(png_str) % 4
-                if missing_padding:
-                    png_str += '=' * (4 - missing_padding)
-                return base64.b64decode(png_str)
-
-            return str(resp_json)
-
-    # --- Web Scraper (Optimized) ---
-    async def create_scraper_task(
-        self,
-        file_name: str,
-        spider_id: str,
-        spider_name: str,
-        individual_params: Dict[str, Any],
-        universal_params: Dict[str, Any] = None
-    ) -> str:
-        """
-        Create an Asynchronous Web Scraper Task.
-        """
-        if self._session is None:
-            raise RuntimeError("Client session not initialized.")
-
-        headers = {
-            "Authorization": f"Bearer {self.scraper_token}",
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-
-        # Simplified payload construction; removed unnecessary checks
-        payload = {
-            "file_name": file_name,
-            "spider_id": spider_id,
-            "spider_name": spider_name,
-            "spider_parameters": json.dumps([individual_params]),
-            "spider_errors": "true"
-        }
-        if universal_params:
-            payload["spider_universal"] = json.dumps(universal_params)
-
-        async with self._session.post(
-            self.SCRAPER_BUILDER_URL, data=payload, headers=headers
-        ) as response:
-            response.raise_for_status()
-            data = await response.json()
-            if data.get("code") != 200:
-                raise Exception(f"Creation failed: {data}")
-            return data["data"]["task_id"]
-
-    # --- Status & Result (Unchanged) ---
-    async def get_task_status(self, task_id: str) -> str:
-        headers = {
-            "token": self.public_token,
-            "key": self.public_key,
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-        payload = {"tasks_ids": task_id}
-
-        async with self._session.post(
-            self.SCRAPER_STATUS_URL, data=payload, headers=headers
-        ) as response:
-            data = await response.json()
-            if data.get("code") == 200 and data.get("data"):
-                for item in data["data"]:
-                    if str(item["task_id"]) == str(task_id):
-                        return item["status"]
-            return "Unknown"
-
-    async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
-        headers = {
-            "token": self.public_token,
-            "key": self.public_key,
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-        payload = {"tasks_id": task_id, "type": "json"}
-
-        async with self._session.post(
-            self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
-        ) as response:
-            data = await response.json()
-            if data.get("code") == 200:
-                return data["data"]["download"]
-            raise Exception(f"Result Error: {data}")
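
One way to read the deleted client above against the new README: the 0.2.4 `AsyncThordataClient` required `scraper_token`, `public_token`, and `public_key` as constructor arguments and raised bare `Exception`s, whereas the 1.2.0 README constructs the client with no arguments and the new wheel ships a dedicated `exceptions.py` module. A short before/after sketch of the async entry point, with the 1.2.0 side following the README and otherwise hedged as an assumption:

```python
import asyncio

from thordata import AsyncThordataClient

# 0.2.4 required explicit credentials:
#     async with AsyncThordataClient(
#         scraper_token="...", public_token="...", public_key="..."
#     ) as client:
#         ...


async def main() -> None:
    # 1.2.0 (per the README above): no-argument construction, credentials come
    # from THORDATA_* environment variables.
    async with AsyncThordataClient() as client:
        html = await client.universal_scrape(
            url="https://example.com",
            js_render=True,  # assumption: same keyword as the sync example in the README
        )
        print(f"Scraped {len(html)} bytes")


asyncio.run(main())
```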