thordata-sdk 0.2.4__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the registry.
- thordata/__init__.py +16 -0
- {thordata_sdk → thordata}/async_client.py +67 -33
- thordata/client.py +486 -0
- thordata/enums.py +25 -0
- thordata/parameters.py +52 -0
- thordata_sdk-0.3.1.dist-info/METADATA +200 -0
- thordata_sdk-0.3.1.dist-info/RECORD +10 -0
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-0.3.1.dist-info}/WHEEL +1 -1
- thordata_sdk-0.3.1.dist-info/top_level.txt +1 -0
- thordata_sdk/__init__.py +0 -9
- thordata_sdk/client.py +0 -303
- thordata_sdk/enums.py +0 -20
- thordata_sdk/parameters.py +0 -41
- thordata_sdk-0.2.4.dist-info/METADATA +0 -113
- thordata_sdk-0.2.4.dist-info/RECORD +0 -10
- thordata_sdk-0.2.4.dist-info/top_level.txt +0 -1
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-0.3.1.dist-info/licenses}/LICENSE +0 -0
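The headline change in this release is the rename of the import package from `thordata_sdk` to `thordata` (the PyPI distribution name stays `thordata-sdk`). A minimal migration sketch, based only on the removed `thordata_sdk/__init__.py` shown below and the imports used in the new README; the token strings are placeholders taken from the README Quick Start:

```python
# 0.2.4: the clients were exposed from the thordata_sdk package
# from thordata_sdk import ThordataClient, AsyncThordataClient

# 0.3.1: the same names are imported from the renamed thordata package
from thordata import ThordataClient, AsyncThordataClient

client = ThordataClient(
    scraper_token="SCRAPER_TOKEN",  # placeholder values, as in the README examples
    public_token="PUBLIC_TOKEN",
    public_key="PUBLIC_KEY",
)
```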
thordata_sdk-0.3.1.dist-info/METADATA
ADDED
@@ -0,0 +1,200 @@
+Metadata-Version: 2.4
+Name: thordata-sdk
+Version: 0.3.1
+Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
+Author-email: Thordata Developer Team <support@thordata.com>
+License: Apache-2.0
+Project-URL: Homepage, https://www.thordata.com
+Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
+Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
+Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
+Keywords: web scraping,proxy,ai,llm,data-mining,serp,thordata
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests>=2.25.0
+Requires-Dist: aiohttp>=3.8.0
+Dynamic: license-file
+
+# Thordata Python SDK
+
+<h4 align="center">
+  Official Python client for Thordata's Proxy Network, SERP API, Universal Scraping API, and Web Scraper API.
+  <br>
+  <i>Async-ready, built for AI agents and large-scale data collection.</i>
+</h4>
+
+<p align="center">
+  <a href="https://github.com/Thordata/thordata-python-sdk/actions/workflows/ci.yml">
+    <img src="https://github.com/Thordata/thordata-python-sdk/actions/workflows/ci.yml/badge.svg" alt="CI">
+  </a>
+  <a href="https://pypi.org/project/thordata-sdk/">
+    <img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version">
+  </a>
+  <a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE">
+    <img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License">
+  </a>
+  <a href="https://python.org">
+    <img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions">
+  </a>
+</p>
+
+---
+
+## Installation
+
+```bash
+pip install thordata-sdk
+```
+
+## Quick Start
+
+All examples below use the unified client:
+
+```python
+from thordata import ThordataClient, AsyncThordataClient
+```
+
+You can copy `examples/.env.example` to `.env` and fill in your tokens from the Thordata Dashboard.
+
+### 1. Proxy Network (Simple GET)
+
+```python
+import os
+from dotenv import load_dotenv
+from thordata import ThordataClient
+
+load_dotenv()
+
+client = ThordataClient(
+    scraper_token=os.getenv("THORDATA_SCRAPER_TOKEN"),
+    public_token=os.getenv("THORDATA_PUBLIC_TOKEN"),
+    public_key=os.getenv("THORDATA_PUBLIC_KEY"),
+)
+
+resp = client.get("http://httpbin.org/ip")
+print(resp.json())
+```
+
+### 2. SERP API (Google, Bing, Yandex, DuckDuckGo)
+
+```python
+from thordata import ThordataClient, Engine
+
+client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+results = client.serp_search(
+    query="Thordata technology",
+    engine=Engine.GOOGLE,
+    num=10,
+    # Any engine-specific parameters are passed via **kwargs
+    # e.g. type="shopping", location="United States"
+)
+
+print(len(results.get("organic", [])))
+```
+
+### 3. Universal Scraping API
+
+```python
+from thordata import ThordataClient
+
+client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+html = client.universal_scrape(
+    url="https://www.google.com",
+    js_render=True,
+    output_format="HTML",
+)
+print(html[:200])
+```
+
+### 4. Web Scraper API (Task-based)
+
+```python
+import time
+from thordata import ThordataClient
+
+client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+task_id = client.create_scraper_task(
+    file_name="demo_youtube_data",
+    spider_id="youtube_video-post_by-url",
+    spider_name="youtube.com",
+    individual_params={
+        "url": "https://www.youtube.com/@stephcurry/videos",
+        "order_by": "",
+        "num_of_posts": ""
+    },
+)
+
+for _ in range(10):
+    status = client.get_task_status(task_id)
+    print("Status:", status)
+    if status in ["Ready", "Success"]:
+        break
+    if status == "Failed":
+        raise RuntimeError("Task failed")
+    time.sleep(3)
+
+download_url = client.get_task_result(task_id)
+print("Download URL:", download_url)
+```
+
+### 5. Asynchronous Usage (High Concurrency)
+
+```python
+import asyncio
+from thordata import AsyncThordataClient
+
+async def main():
+    async with AsyncThordataClient(
+        scraper_token="SCRAPER_TOKEN",
+        public_token="PUBLIC_TOKEN",
+        public_key="PUBLIC_KEY",
+    ) as client:
+        resp = await client.get("http://httpbin.org/ip")
+        print(await resp.json())
+
+asyncio.run(main())
+```
+
+More examples are available in the `examples/` directory.
+
+---
+
+## Features
+
+| Feature | Status | Description |
+|---------|--------|-------------|
+| Proxy Network | Stable | Residential, ISP, Mobile, Datacenter via HTTP/HTTPS gateway. |
+| SERP API | Stable | Google / Bing / Yandex / DuckDuckGo, flexible parameters. |
+| Universal Scraping API | Stable | JS rendering, HTML / PNG output, antibot bypass. |
+| Web Scraper API | Stable | Task-based scraping for complex sites (YouTube, E-commerce). |
+| Async Client | Stable | aiohttp-based client for high-concurrency workloads. |
+
+---
+
+## Development & Contributing
+
+See `CONTRIBUTING.md` for local development and contribution guidelines.
+
+## License
+
+This project is licensed under the Apache License 2.0.
+
+## Support
+
+For technical support, please contact support@thordata.com
+or verify your tokens and quotas in the Thordata Dashboard.
thordata_sdk-0.3.1.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+thordata/__init__.py,sha256=iv2luaDxUmcWTqScu08gGJfocUZr6pSwtzJs2akZ1Gg,365
+thordata/async_client.py,sha256=cpBtRIzr8oH6GuZs8gTh505tGYYV1aRFBUzbtmFOfEg,9717
+thordata/client.py,sha256=WVIvIZTACEpw9NaTbGtIkMGUlfliFL7kNGdCoTxxsUI,17193
+thordata/enums.py,sha256=PGUCQX3jw5a9mX8_JfhuyoR1WriWjWQpAgibVP_bpdM,679
+thordata/parameters.py,sha256=1lNx_BSS8ztBKEj_MXZMaIQQ9_W3EAlS-VFiBqSWb9E,1841
+thordata_sdk-0.3.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+thordata_sdk-0.3.1.dist-info/METADATA,sha256=NMq7Be240zn2q3MlUUg2Dmo4NFoQtDMgkRAGzjg_yjc,5901
+thordata_sdk-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thordata_sdk-0.3.1.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
+thordata_sdk-0.3.1.dist-info/RECORD,,
thordata_sdk-0.3.1.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+thordata
thordata_sdk/__init__.py
DELETED
@@ -1,9 +0,0 @@
-# Expose main clients
-from .client import ThordataClient
-from .async_client import AsyncThordataClient
-from .enums import Engine, GoogleSearchType
-
-# Version of the thordata-sdk package
-__version__ = "0.2.4"
-
-__all__ = ["ThordataClient", "AsyncThordataClient"]
thordata_sdk/client.py
DELETED
@@ -1,303 +0,0 @@
-import requests
-import logging
-import json
-import base64
-from typing import Dict, Any, Union, Optional
-
-from .enums import Engine
-from .parameters import normalize_serp_params
-
-# Configure a library-specific logger
-logger = logging.getLogger(__name__)
-
-
-class ThordataClient:
-    """
-    The official synchronous Python client for Thordata.
-
-    Handles authentication for:
-    1. Proxy Network (HTTP/HTTPS)
-    2. SERP API (Real-time Search)
-    3. Universal Scraping API (Single Page)
-    4. Web Scraper API (Async Task Management)
-    """
-
-    def __init__(
-        self,
-        scraper_token: str,
-        public_token: str,
-        public_key: str,
-        proxy_host: str = "gate.thordata.com",
-        proxy_port: int = 22225
-    ):
-        """
-        Initialize the Thordata Client.
-
-        Args:
-            scraper_token (str): Token from the bottom of the Dashboard.
-            public_token (str): Token from Public API section.
-            public_key (str): Key from Public API section.
-            proxy_host (str): Proxy gateway host.
-            proxy_port (int): Proxy gateway port.
-        """
-        self.scraper_token = scraper_token
-        self.public_token = public_token
-        self.public_key = public_key
-
-        # Proxy Configuration
-        self.proxy_url = (
-            f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
-        )
-
-        # API Endpoints
-        self.base_url = "https://scraperapi.thordata.com"
-        self.universal_url = "https://universalapi.thordata.com"
-        self.api_url = "https://api.thordata.com/api/web-scraper-api"
-
-        self.SERP_API_URL = f"{self.base_url}/request"
-        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
-        self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
-        self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
-        self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
-
-        self.session = requests.Session()
-        self.session.proxies = {
-            "http": self.proxy_url,
-            "https": self.proxy_url,
-        }
-
-    def get(self, url: str, **kwargs) -> requests.Response:
-        """
-        Send a GET request through the Thordata Proxy Network.
-        """
-        logger.debug(f"Proxy Request: {url}")
-        kwargs.setdefault("timeout", 30)
-        return self.session.get(url, **kwargs)
-
-    def serp_search(
-        self,
-        query: str,
-        engine: Union[Engine, str] = Engine.GOOGLE,  # accepts an Engine member or a plain string
-        num: int = 10,
-        **kwargs  # collects all extra parameters (e.g. type="maps")
-    ) -> Dict[str, Any]:
-        """
-        Execute a real-time SERP search.
-
-        Args:
-            query: Keywords
-            engine: 'google', 'bing', 'yandex' etc.
-            num: Number of results (default 10)
-            **kwargs: Extra parameters (e.g., type="shopping", location="London")
-        """
-        # Compatibility: take .value for an Engine member, lowercase a plain string
-        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
-
-        # Delegate to the logic in parameters.py
-        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
-
-        headers = {
-            "Authorization": f"Bearer {self.scraper_token}",
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-
-        logger.info(f"SERP Search: {engine_str} - {query}")
-        try:
-            response = self.session.post(
-                self.SERP_API_URL,
-                data=payload,
-                headers=headers,
-                timeout=60
-            )
-            response.raise_for_status()
-
-            data = response.json()
-            if isinstance(data, str):
-                try: data = json.loads(data)
-                except: pass
-            return data
-        except Exception as e:
-            logger.error(f"SERP Request Failed: {e}")
-            raise
-
-
-    def universal_scrape(
-        self,
-        url: str,
-        js_render: bool = False,
-        output_format: str = "HTML",
-        country: str = None,
-        block_resources: bool = False
-    ) -> Union[str, bytes]:
-        """
-        Unlock target pages via the Universal Scraping API.
-        """
-        headers = {
-            "Authorization": f"Bearer {self.scraper_token}",
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-
-        payload = {
-            "url": url,
-            "js_render": "True" if js_render else "False",
-            "type": output_format.lower(),
-            "block_resources": "True" if block_resources else "False"
-        }
-        if country:
-            payload["country"] = country
-
-        logger.info(f"Universal Scrape: {url}")
-
-        try:
-            response = self.session.post(
-                self.UNIVERSAL_API_URL,
-                data=payload,
-                headers=headers,
-                timeout=60
-            )
-            response.raise_for_status()
-
-            # Parse JSON wrapper
-            try:
-                resp_json = response.json()
-            except json.JSONDecodeError:
-                # Fallback for raw response
-                if output_format.upper() == "PNG":
-                    return response.content
-                return response.text
-
-            # Check API errors
-            if isinstance(resp_json, dict) and resp_json.get("code") \
-                    and resp_json.get("code") != 200:
-                raise Exception(f"Universal API Error: {resp_json}")
-
-            # Extract HTML
-            if "html" in resp_json:
-                return resp_json["html"]
-
-            # Extract PNG
-            if "png" in resp_json:
-                png_str = resp_json["png"]
-                if not png_str:
-                    raise Exception("API returned empty PNG data")
-
-                # 🛠️ FIX: strip the Data URI scheme prefix (data:image/png;base64,)
-                if "," in png_str:
-                    png_str = png_str.split(",", 1)[1]
-
-                # Base64 decode (fix padding if needed)
-                png_str = png_str.replace("\n", "").replace("\r", "")
-                missing_padding = len(png_str) % 4
-                if missing_padding:
-                    png_str += '=' * (4 - missing_padding)
-
-                return base64.b64decode(png_str)
-
-            return str(resp_json)
-
-        except Exception as e:
-            logger.error(f"Universal Scrape Failed: {e}")
-            raise
-
-    def create_scraper_task(
-        self,
-        file_name: str,
-        spider_id: str,  # required; obtained from the dashboard
-        spider_name: str,  # required; e.g. "youtube.com"
-        individual_params: Dict[str, Any],  # the caller packs the spider-specific parameters into this dict
-        universal_params: Dict[str, Any] = None
-    ) -> str:
-        """
-        Create a generic Web Scraper Task.
-
-        Note: Check the Thordata Dashboard to get the correct 'spider_id' and 'spider_name'.
-        """
-        headers = {
-            "Authorization": f"Bearer {self.scraper_token}",
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-
-        # Pack and send the parameters as-is; no heavy validation, to keep compatibility
-        payload = {
-            "spider_name": spider_name,
-            "spider_id": spider_id,
-            "spider_parameters": json.dumps([individual_params]),
-            "spider_errors": "true",
-            "file_name": file_name
-        }
-        if universal_params:
-            payload["spider_universal"] = json.dumps(universal_params)
-
-        logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
-        try:
-            response = self.session.post(
-                self.SCRAPER_BUILDER_URL,
-                data=payload,
-                headers=headers
-            )
-            response.raise_for_status()
-            data = response.json()
-
-            if data.get("code") != 200:
-                raise Exception(f"Creation failed: {data}")
-            return data["data"]["task_id"]
-        except Exception as e:
-            logger.error(f"Task Creation Failed: {e}")
-            raise
-
-    def get_task_status(self, task_id: str) -> str:
-        """
-        Check the status of a task.
-        """
-        headers = {
-            "token": self.public_token,
-            "key": self.public_key,
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-        payload = {"tasks_ids": task_id}
-
-        try:
-            response = self.session.post(
-                self.SCRAPER_STATUS_URL,
-                data=payload,
-                headers=headers
-            )
-            response.raise_for_status()
-            data = response.json()
-
-            if data.get("code") == 200 and data.get("data"):
-                for item in data["data"]:
-                    if str(item.get("task_id")) == str(task_id):
-                        return item["status"]
-            return "Unknown"
-        except Exception as e:
-            logger.error(f"Status Check Failed: {e}")
-            return "Error"
-
-    def get_task_result(self, task_id: str, file_type: str = "json") -> str:
-        """
-        Retrieve the download URL for a completed task.
-        """
-        headers = {
-            "token": self.public_token,
-            "key": self.public_key,
-            "Content-Type": "application/x-www-form-urlencoded"
-        }
-        payload = {"tasks_id": task_id, "type": file_type}
-
-        logger.info(f"Getting result URL: {task_id}")
-        try:
-            response = self.session.post(
-                self.SCRAPER_DOWNLOAD_URL,
-                data=payload,
-                headers=headers
-            )
-            response.raise_for_status()
-            data = response.json()
-
-            if data.get("code") == 200 and data.get("data"):
-                return data["data"]["download"]
-            raise Exception(f"API returned error: {data}")
-        except Exception as e:
-            logger.error(f"Get Result Failed: {e}")
-            raise
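One detail of the removed client worth noting: `universal_scrape` returns an HTML string for `output_format="HTML"` and, for `"PNG"`, base64-decoded image bytes (after stripping any data-URI prefix). A minimal usage sketch for the PNG path, assuming the 0.3.1 client in `thordata/client.py` keeps the same return types; the target URL and output filename are placeholders:

```python
from thordata import ThordataClient  # 0.3.1 import path; PNG behavior assumed to match the 0.2.4 code above

client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")

# With output_format="PNG" the client is expected to return raw image bytes,
# ready to be written straight to disk.
png_bytes = client.universal_scrape(
    url="https://www.example.com",  # placeholder target
    js_render=True,
    output_format="PNG",
)

with open("screenshot.png", "wb") as f:
    f.write(png_bytes)
```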
thordata_sdk/enums.py
DELETED
@@ -1,20 +0,0 @@
-# thordata_sdk/enums.py
-from enum import Enum
-
-class Engine(str, Enum):
-    """Core search engines supported by the SERP API"""
-    GOOGLE = "google"
-    BING = "bing"
-    YANDEX = "yandex"
-    DUCKDUCKGO = "duckduckgo"
-    BAIDU = "baidu"
-
-class GoogleSearchType(str, Enum):
-    """Common sub-types of Google search (see the screenshot)"""
-    SEARCH = "search"      # default web search
-    MAPS = "maps"          # Maps
-    SHOPPING = "shopping"  # Shopping
-    NEWS = "news"          # News
-    IMAGES = "images"      # Images
-    VIDEOS = "videos"      # Videos
-    # Less common types are omitted; they can still be passed as plain strings
thordata_sdk/parameters.py
DELETED
@@ -1,41 +0,0 @@
-# thordata_sdk/parameters.py
-from typing import Dict, Any
-
-def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
-    """
-    Normalize the parameter differences between search engines.
-    """
-    # 1. Base parameters
-    payload = {
-        "num": str(kwargs.get("num", 10)),
-        "json": "1",
-        "engine": engine,
-    }
-
-    # 2. Query keyword handling (Yandex uses 'text', the others use 'q')
-    if engine == "yandex":
-        payload["text"] = query
-        # Provide a default url if the caller did not pass one
-        if "url" not in kwargs:
-            payload["url"] = "yandex.com"
-    else:
-        payload["q"] = query
-
-    # 3. Default URL handling (if the caller did not pass one)
-    if "url" not in kwargs:
-        defaults = {
-            "google": "google.com",
-            "bing": "bing.com",
-            "duckduckgo": "duckduckgo.com",
-            "baidu": "baidu.com"
-        }
-        if engine in defaults:
-            payload["url"] = defaults[engine]
-
-    # 4. Pass every other caller-supplied parameter straight through (e.g. type="shopping",
-    #    google_domain="google.co.uk"), so the dozens of engine-specific options never need to be declared here
-    for k, v in kwargs.items():
-        if k not in ["num", "engine", "q", "text"]:  # avoid overwriting
-            payload[k] = v
-
-    return payload
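For reference, the payloads built by the removed `normalize_serp_params` helper follow directly from the code above; the 0.3.1 replacement in `thordata/parameters.py` is not shown in this diff, so this sketch only illustrates the 0.2.4 behavior:

```python
from thordata_sdk.parameters import normalize_serp_params  # 0.2.4 module shown above

# Google: the query is sent as "q", a default url of "google.com" is filled in,
# and extra keyword arguments are passed through untouched.
print(normalize_serp_params("google", "thordata", num=5, type="shopping"))
# {'num': '5', 'json': '1', 'engine': 'google', 'q': 'thordata',
#  'url': 'google.com', 'type': 'shopping'}

# Yandex: the query is sent as "text" and the default url becomes "yandex.com".
print(normalize_serp_params("yandex", "thordata"))
# {'num': '10', 'json': '1', 'engine': 'yandex', 'text': 'thordata', 'url': 'yandex.com'}
```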