thordata-sdk 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/PKG-INFO +2 -16
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/setup.py +1 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk/__init__.py +2 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk/async_client.py +92 -45
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk/client.py +116 -43
- thordata_sdk-0.2.4/thordata_sdk/enums.py +20 -0
- thordata_sdk-0.2.4/thordata_sdk/parameters.py +41 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk.egg-info/PKG-INFO +3 -17
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk.egg-info/SOURCES.txt +2 -2
- thordata_sdk-0.2.2/tests/test_async_client.py +0 -59
- thordata_sdk-0.2.2/tests/test_client.py +0 -53
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/LICENSE +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/README.md +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/setup.cfg +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk.egg-info/dependency_links.txt +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk.egg-info/requires.txt +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk.egg-info/top_level.txt +0 -0

{thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: thordata_sdk
-Version: 0.2.2
+Version: 0.2.4
 Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
 Home-page: https://github.com/Thordata/thordata-python-sdk
 Author: Thordata Developer Team
@@ -22,20 +22,6 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: requests>=2.25.0
-Requires-Dist: aiohttp>=3.8.0
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: license-file
-Dynamic: project-url
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
 
 # Thordata Python SDK
 

{thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='thordata_sdk',
-    version='0.2.2',
+    version='0.2.4',  # Bump version due to breaking auth changes
     packages=find_packages(include=['thordata_sdk', 'thordata_sdk.*']),
     install_requires=[
         'requests>=2.25.0',  # Standard synchronous HTTP

{thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk/__init__.py

@@ -1,8 +1,9 @@
 # Expose main clients
 from .client import ThordataClient
 from .async_client import AsyncThordataClient
+from .enums import Engine, GoogleSearchType
 
 # Version of the thordata-sdk package
-__version__ = "0.2.2"
+__version__ = "0.2.4"
 
 __all__ = ["ThordataClient", "AsyncThordataClient"]
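
Since `__init__.py` now re-exports the enums, everything can be imported from the package root. A minimal sketch of what that looks like in 0.2.4:

```python
# Minimal sketch: the package root now also exposes the enums (0.2.4).
from thordata_sdk import ThordataClient, AsyncThordataClient, Engine, GoogleSearchType

print(Engine.GOOGLE.value)              # "google"
print(GoogleSearchType.SHOPPING.value)  # "shopping"
```
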

{thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk/async_client.py

@@ -1,7 +1,12 @@
 import aiohttp
 import logging
 import json
-
+import base64
+from typing import Optional, Dict, Any, Union
+
+# Reuse the shared logic and enums
+from .enums import Engine
+from .parameters import normalize_serp_params
 
 logger = logging.getLogger(__name__)
 
@@ -9,11 +14,6 @@ logger = logging.getLogger(__name__)
 class AsyncThordataClient:
     """
     Thordata Asynchronous Client (built on aiohttp).
-    Designed for high-concurrency and low-latency data collection tasks.
-
-    Usage:
-        async with AsyncThordataClient(...) as client:
-            await client.get("http://example.com")
     """
 
     def __init__(
@@ -24,22 +24,19 @@ class AsyncThordataClient:
         proxy_host: str = "gate.thordata.com",
         proxy_port: int = 22225
     ):
-        """
-        Initialize the asynchronous client.
-        """
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
 
-        # Proxy Authentication
         self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
         self.proxy_url = f"http://{proxy_host}:{proxy_port}"
 
-        # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
+        self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
 
         self.SERP_API_URL = f"{self.base_url}/request"
+        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
@@ -55,21 +52,14 @@ class AsyncThordataClient:
         await self.close()
 
     async def close(self):
-        """Close the underlying aiohttp session."""
         if self._session and not self._session.closed:
             await self._session.close()
         self._session = None
 
-    # --- Proxy
-
+    # --- Proxy (Unchanged) ---
    async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-        """
-        Send an asynchronous GET request through the Thordata Proxy.
-        """
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
-
-        logger.debug(f"Async Proxy Request: {url}")
         try:
             return await self._session.get(
                 url,
@@ -81,43 +71,37 @@ class AsyncThordataClient:
             logger.error(f"Async Request failed: {e}")
             raise
 
-    # --- SERP
-
+    # --- SERP (Optimized) ---
     async def serp_search(
-        self,
+        self,
+        query: str,
+        engine: Union[Engine, str] = Engine.GOOGLE,
+        num: int = 10,
+        **kwargs
     ) -> Dict[str, Any]:
-        """
+        """
+        Execute a real-time SERP search (Async).
+        """
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
 
-        payload = {
-            "q": query,
-            "num": str(num),
-            "json": "1",
-            "engine": engine.lower(),
-            **kwargs
-        }
-        if engine.lower() == 'yandex':
-            payload['text'] = payload.pop('q')
-            if 'url' not in payload:
-                payload['url'] = "yandex.com"
-        elif 'url' not in payload:
-            if engine == 'google':
-                payload['url'] = "google.com"
-            elif engine == 'bing':
-                payload['url'] = "bing.com"
+        # 1. Convert the enum
+        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
+
+        # 2. Reuse the logic in parameters.py (Don't Repeat Yourself!)
+        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
 
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }
 
+        # 3. Send the request
         async with self._session.post(
             self.SERP_API_URL, data=payload, headers=headers
         ) as response:
             response.raise_for_status()
             data = await response.json()
-            # Handle double-encoding
             if isinstance(data, str):
                 try:
                     data = json.loads(data)
@@ -125,17 +109,80 @@ class AsyncThordataClient:
                     pass
             return data
 
-    # ---
+    # --- Universal (Unchanged) ---
+    async def universal_scrape(
+        self,
+        url: str,
+        js_render: bool = False,
+        output_format: str = "HTML",
+        country: str = None,
+        block_resources: bool = False
+    ) -> Union[str, bytes]:
+        if self._session is None:
+            raise RuntimeError("Client session not initialized.")
+
+        headers = {
+            "Authorization": f"Bearer {self.scraper_token}",
+            "Content-Type": "application/x-www-form-urlencoded"
+        }
+
+        payload = {
+            "url": url,
+            "js_render": "True" if js_render else "False",
+            "type": output_format.lower(),
+            "block_resources": "True" if block_resources else "False"
+        }
+        if country:
+            payload["country"] = country
+
+        async with self._session.post(
+            self.UNIVERSAL_API_URL, data=payload, headers=headers
+        ) as response:
+            response.raise_for_status()
 
+            try:
+                resp_json = await response.json()
+            except Exception:
+                if output_format.upper() == "PNG":
+                    return await response.read()
+                return await response.text()
+
+            if isinstance(resp_json, dict) and resp_json.get("code") \
+                    and resp_json.get("code") != 200:
+                raise Exception(f"Universal API Error: {resp_json}")
+
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            if "png" in resp_json:
+                png_str = resp_json["png"]
+                if not png_str:
+                    raise Exception("API returned empty PNG data")
+
+                # FIX: strip the Data URI scheme prefix
+                if "," in png_str:
+                    png_str = png_str.split(",", 1)[1]
+
+                png_str = png_str.replace("\n", "").replace("\r", "")
+                missing_padding = len(png_str) % 4
+                if missing_padding:
+                    png_str += '=' * (4 - missing_padding)
+                return base64.b64decode(png_str)
+
+            return str(resp_json)
+
+    # --- Web Scraper (Optimized) ---
     async def create_scraper_task(
         self,
         file_name: str,
         spider_id: str,
+        spider_name: str,
         individual_params: Dict[str, Any],
-        spider_name: str = "youtube.com",
         universal_params: Dict[str, Any] = None
     ) -> str:
-        """
+        """
+        Create an Asynchronous Web Scraper Task.
+        """
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
 
@@ -144,6 +191,7 @@ class AsyncThordataClient:
             "Content-Type": "application/x-www-form-urlencoded"
         }
 
+        # Simplified payload construction; unnecessary checks removed
         payload = {
             "file_name": file_name,
             "spider_id": spider_id,
@@ -163,8 +211,8 @@ class AsyncThordataClient:
             raise Exception(f"Creation failed: {data}")
         return data["data"]["task_id"]
 
+    # --- Status & Result (Unchanged) ---
     async def get_task_status(self, task_id: str) -> str:
-        """Check task status."""
         headers = {
             "token": self.public_token,
             "key": self.public_key,
@@ -183,7 +231,6 @@ class AsyncThordataClient:
             return "Unknown"
 
     async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
-        """Get download link."""
         headers = {
             "token": self.public_token,
             "key": self.public_key,
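
Taken together, the async client changes above add a keyword-driven `serp_search` and a new `universal_scrape` method. A minimal usage sketch based on the diff above; the tokens and target URLs are placeholders, not working values:

```python
# Sketch of the new async surface, based on the diff above.
# Tokens and URLs are placeholders, not working credentials.
import asyncio

from thordata_sdk import AsyncThordataClient, Engine

async def main():
    async with AsyncThordataClient(
        scraper_token="YOUR_SCRAPER_TOKEN",
        public_token="YOUR_PUBLIC_TOKEN",
        public_key="YOUR_PUBLIC_KEY",
    ) as client:
        # Real-time SERP search; extra engine-specific options go through **kwargs.
        results = await client.serp_search("thordata", engine=Engine.BING, num=5)
        print(type(results))  # dict parsed from the JSON response

        # Universal Scraping API: HTML comes back as str, PNG screenshots as bytes.
        html = await client.universal_scrape("https://example.com", js_render=True)
        print(len(html))

asyncio.run(main())
```
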

{thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk/client.py

@@ -1,7 +1,11 @@
 import requests
 import logging
 import json
-
+import base64
+from typing import Dict, Any, Union, Optional
+
+from .enums import Engine
+from .parameters import normalize_serp_params
 
 # Configure a library-specific logger
 logger = logging.getLogger(__name__)
@@ -14,7 +18,8 @@ class ThordataClient:
     Handles authentication for:
     1. Proxy Network (HTTP/HTTPS)
     2. SERP API (Real-time Search)
-    3.
+    3. Universal Scraping API (Single Page)
+    4. Web Scraper API (Async Task Management)
     """
 
     def __init__(
@@ -39,16 +44,18 @@ class ThordataClient:
         self.public_token = public_token
         self.public_key = public_key
 
-        # Proxy Configuration
+        # Proxy Configuration
         self.proxy_url = (
             f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
         )
 
         # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
+        self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
 
         self.SERP_API_URL = f"{self.base_url}/request"
+        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
@@ -62,51 +69,39 @@ class ThordataClient:
     def get(self, url: str, **kwargs) -> requests.Response:
         """
         Send a GET request through the Thordata Proxy Network.
-
-        Args:
-            url (str): The target URL.
-            **kwargs: Additional arguments passed to requests.get().
-
-        Returns:
-            requests.Response: The HTTP response.
         """
         logger.debug(f"Proxy Request: {url}")
         kwargs.setdefault("timeout", 30)
         return self.session.get(url, **kwargs)
 
     def serp_search(
-        self,
+        self,
+        query: str,
+        engine: Union[Engine, str] = Engine.GOOGLE,  # Accepts either the enum or a plain string
+        num: int = 10,
+        **kwargs  # Collects all extra parameters (e.g. type="maps")
     ) -> Dict[str, Any]:
         """
         Execute a real-time SERP search.
+
+        Args:
+            query: Keywords
+            engine: 'google', 'bing', 'yandex' etc.
+            num: Number of results (default 10)
+            **kwargs: Extra parameters (e.g., type="shopping", location="London")
         """
-        payload = {
-            "q": query,
-            "num": str(num),
-            "json": "1",
-            "engine": engine.lower(),
-            **kwargs
-        }
+        # Compatibility: take .value from an Engine member, otherwise lowercase the string
+        engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
 
-        #
-        if engine.lower() == 'yandex':
-            payload['text'] = payload.pop('q')
-            if 'url' not in payload:
-                payload['url'] = "yandex.com"
-        elif 'url' not in payload:
-            if engine == 'google':
-                payload['url'] = "google.com"
-            elif engine == 'bing':
-                payload['url'] = "bing.com"
-            elif engine == 'duckduckgo':
-                payload['url'] = "duckduckgo.com"
+        # Delegate to the logic in parameters.py
+        payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
 
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }
 
-        logger.info(f"SERP Search: {engine} - {query}")
+        logger.info(f"SERP Search: {engine_str} - {query}")
         try:
             response = self.session.post(
                 self.SERP_API_URL,
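
The reworked synchronous `serp_search` accepts either an `Engine` member or a plain string for `engine` and forwards any extra keyword arguments into the request payload. A short sketch under that assumption; the tokens and query strings are placeholders:

```python
# Sketch: calling the reworked synchronous serp_search (placeholder tokens).
from thordata_sdk import ThordataClient, Engine, GoogleSearchType

client = ThordataClient(
    scraper_token="YOUR_SCRAPER_TOKEN",
    public_token="YOUR_PUBLIC_TOKEN",
    public_key="YOUR_PUBLIC_KEY",
)

# `engine` accepts an Engine member or a plain string.
data = client.serp_search("coffee shops", engine=Engine.GOOGLE, num=10)

# Extra keyword arguments are forwarded into the request payload,
# e.g. a Google Shopping search on a specific Google domain.
shopping = client.serp_search(
    "espresso machine",
    engine="google",
    type=GoogleSearchType.SHOPPING.value,
    google_domain="google.co.uk",
)
print(list(data), list(shopping))
```
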
@@ -115,35 +110,114 @@ class ThordataClient:
                 timeout=60
             )
             response.raise_for_status()
+
             data = response.json()
-
-            # Handle potential double-encoded JSON strings
             if isinstance(data, str):
-                try:
-                    data = json.loads(data)
-                except json.JSONDecodeError:
-                    pass
+                try: data = json.loads(data)
+                except: pass
             return data
         except Exception as e:
             logger.error(f"SERP Request Failed: {e}")
             raise
 
+
+    def universal_scrape(
+        self,
+        url: str,
+        js_render: bool = False,
+        output_format: str = "HTML",
+        country: str = None,
+        block_resources: bool = False
+    ) -> Union[str, bytes]:
+        """
+        Unlock target pages via the Universal Scraping API.
+        """
+        headers = {
+            "Authorization": f"Bearer {self.scraper_token}",
+            "Content-Type": "application/x-www-form-urlencoded"
+        }
+
+        payload = {
+            "url": url,
+            "js_render": "True" if js_render else "False",
+            "type": output_format.lower(),
+            "block_resources": "True" if block_resources else "False"
+        }
+        if country:
+            payload["country"] = country
+
+        logger.info(f"Universal Scrape: {url}")
+
+        try:
+            response = self.session.post(
+                self.UNIVERSAL_API_URL,
+                data=payload,
+                headers=headers,
+                timeout=60
+            )
+            response.raise_for_status()
+
+            # Parse JSON wrapper
+            try:
+                resp_json = response.json()
+            except json.JSONDecodeError:
+                # Fallback for raw response
+                if output_format.upper() == "PNG":
+                    return response.content
+                return response.text
+
+            # Check API errors
+            if isinstance(resp_json, dict) and resp_json.get("code") \
+                    and resp_json.get("code") != 200:
+                raise Exception(f"Universal API Error: {resp_json}")
+
+            # Extract HTML
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            # Extract PNG
+            if "png" in resp_json:
+                png_str = resp_json["png"]
+                if not png_str:
+                    raise Exception("API returned empty PNG data")
+
+                # FIX: strip the Data URI scheme prefix (data:image/png;base64,)
+                if "," in png_str:
+                    png_str = png_str.split(",", 1)[1]
+
+                # Base64 decode (handle padding)
+                png_str = png_str.replace("\n", "").replace("\r", "")
+                missing_padding = len(png_str) % 4
+                if missing_padding:
+                    png_str += '=' * (4 - missing_padding)
+
+                return base64.b64decode(png_str)
+
+            return str(resp_json)
+
+        except Exception as e:
+            logger.error(f"Universal Scrape Failed: {e}")
+            raise
+
     def create_scraper_task(
         self,
         file_name: str,
-        spider_id: str,
-        individual_params: Dict[str, Any],
-        spider_name: str = "youtube.com",
+        spider_id: str,  # Required; obtained from the dashboard
+        spider_name: str,  # Required, e.g. "youtube.com"
+        individual_params: Dict[str, Any],  # Spider-specific parameters packed into this dict
         universal_params: Dict[str, Any] = None
     ) -> str:
         """
-        Create
+        Create a generic Web Scraper Task.
+
+        Note: Check the Thordata Dashboard to get the correct 'spider_id' and 'spider_name'.
         """
         headers = {
             "Authorization": f"Bearer {self.scraper_token}",
             "Content-Type": "application/x-www-form-urlencoded"
         }
 
+        # Pack and send as-is; avoid heavy client-side validation to stay compatible
         payload = {
             "spider_name": spider_name,
             "spider_id": spider_id,
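
The new synchronous `universal_scrape` returns the page HTML as a string, or decoded raw bytes when a PNG screenshot is requested. A hedged sketch; the tokens are placeholders and the `"us"` country-code format is an assumption not confirmed by the diff:

```python
# Sketch: the synchronous Universal Scraping API (placeholder tokens;
# the "us" country code format is an assumption, not confirmed by the diff).
from thordata_sdk import ThordataClient

client = ThordataClient(
    scraper_token="YOUR_SCRAPER_TOKEN",
    public_token="YOUR_PUBLIC_TOKEN",
    public_key="YOUR_PUBLIC_KEY",
)

# HTML of a JS-rendered page routed through a specific country.
html = client.universal_scrape("https://example.com", js_render=True, country="us")
print(html[:200])

# PNG screenshots arrive base64-encoded in the JSON wrapper; the client decodes
# them to raw bytes, so the result can be written straight to disk.
png_bytes = client.universal_scrape("https://example.com", output_format="PNG")
with open("screenshot.png", "wb") as f:
    f.write(png_bytes)
```
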
@@ -154,7 +228,7 @@ class ThordataClient:
         if universal_params:
             payload["spider_universal"] = json.dumps(universal_params)
 
-        logger.info(f"Creating Scraper Task: {spider_id}")
+        logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
         try:
             response = self.session.post(
                 self.SCRAPER_BUILDER_URL,
@@ -174,7 +248,6 @@ class ThordataClient:
     def get_task_status(self, task_id: str) -> str:
         """
         Check the status of a task.
-        Returns: 'Running', 'Ready', 'Failed', or 'Unknown'.
         """
         headers = {
             "token": self.public_token,
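
For the Web Scraper API, the typical flow is create a task, poll its status, then fetch the download link. A sketch of that flow; the spider values, `individual_params` keys, and file name below are illustrative placeholders that would come from the Thordata dashboard, and the 'Running'/'Ready' status strings are taken from the docstring removed above:

```python
# Sketch of the Web Scraper task flow. spider_id, spider_name, the individual_params
# keys and the file name are illustrative placeholders, not real values.
import time

from thordata_sdk import ThordataClient

client = ThordataClient(
    scraper_token="YOUR_SCRAPER_TOKEN",
    public_token="YOUR_PUBLIC_TOKEN",
    public_key="YOUR_PUBLIC_KEY",
)

task_id = client.create_scraper_task(
    file_name="my_export",
    spider_id="SPIDER_ID_FROM_DASHBOARD",
    spider_name="youtube.com",
    individual_params={"url": "https://www.youtube.com/watch?v=EXAMPLE"},
)

# Poll until the task leaves the 'Running' state, then fetch the download link.
while client.get_task_status(task_id) == "Running":
    time.sleep(10)

download_url = client.get_task_result(task_id, file_type="json")
print(download_url)
```
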

thordata_sdk-0.2.4/thordata_sdk/enums.py (new file)

@@ -0,0 +1,20 @@
+# thordata_sdk/enums.py
+from enum import Enum
+
+class Engine(str, Enum):
+    """Core engines supported by the SERP API"""
+    GOOGLE = "google"
+    BING = "bing"
+    YANDEX = "yandex"
+    DUCKDUCKGO = "duckduckgo"
+    BAIDU = "baidu"
+
+class GoogleSearchType(str, Enum):
+    """Common sub-types of Google search"""
+    SEARCH = "search"      # Default web search
+    MAPS = "maps"          # Maps
+    SHOPPING = "shopping"  # Shopping
+    NEWS = "news"          # News
+    IMAGES = "images"      # Images
+    VIDEOS = "videos"      # Videos
+    # Less common types are omitted; pass them through as plain strings
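
Because both enums subclass `str`, their members compare and serialize like the plain strings they wrap, which is why they can be dropped straight into form payloads. A small sketch:

```python
# Sketch: the enums subclass str, so members behave like the strings they wrap.
from thordata_sdk.enums import Engine, GoogleSearchType

assert Engine.YANDEX == "yandex"
assert GoogleSearchType.NEWS.value == "news"

# Either form works as a request parameter; unknown sub-types can still be
# passed as plain strings because the SDK forwards extra values unchanged.
print(Engine.DUCKDUCKGO.value, GoogleSearchType.MAPS.value)  # duckduckgo maps
```
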

thordata_sdk-0.2.4/thordata_sdk/parameters.py (new file)

@@ -0,0 +1,41 @@
+# thordata_sdk/parameters.py
+from typing import Dict, Any
+
+def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
+    """
+    Normalize the parameter differences between search engines.
+    """
+    # 1. Base parameters
+    payload = {
+        "num": str(kwargs.get("num", 10)),
+        "json": "1",
+        "engine": engine,
+    }
+
+    # 2. Handle the query keyword (Yandex uses 'text', the others use 'q')
+    if engine == "yandex":
+        payload["text"] = query
+        # Default the url if the caller did not pass one
+        if "url" not in kwargs:
+            payload["url"] = "yandex.com"
+    else:
+        payload["q"] = query
+
+    # 3. Apply a default URL (if the caller did not pass one)
+    if "url" not in kwargs:
+        defaults = {
+            "google": "google.com",
+            "bing": "bing.com",
+            "duckduckgo": "duckduckgo.com",
+            "baidu": "baidu.com"
+        }
+        if engine in defaults:
+            payload["url"] = defaults[engine]
+
+    # 4. Pass every other caller-supplied parameter straight through
+    #    (e.g. type="shopping", google_domain="google.co.uk"), so engine-specific options need no modelling here
+    for k, v in kwargs.items():
+        if k not in ["num", "engine", "q", "text"]:  # Avoid overwriting core keys
+            payload[k] = v
+
+    return payload
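
`normalize_serp_params` is a pure function, so the payloads it produces can be inspected directly. A sketch of what it returns for two engines; the expected dictionaries follow from the code above:

```python
# Sketch: inspecting the payloads that normalize_serp_params() builds.
from thordata_sdk.parameters import normalize_serp_params

# Google: the query goes into "q", a default url is filled in, extras pass through.
print(normalize_serp_params("google", "laptops", num=5, type="shopping"))
# {'num': '5', 'json': '1', 'engine': 'google', 'q': 'laptops',
#  'url': 'google.com', 'type': 'shopping'}

# Yandex: the keyword is sent as "text" instead of "q".
print(normalize_serp_params("yandex", "weather"))
# {'num': '10', 'json': '1', 'engine': 'yandex', 'text': 'weather', 'url': 'yandex.com'}
```
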

{thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
-Name: thordata_sdk
-Version: 0.2.2
+Metadata-Version: 2.1
+Name: thordata-sdk
+Version: 0.2.4
 Summary: The official Python SDK for Thordata Proxy & Scraper Infrastructure.
 Home-page: https://github.com/Thordata/thordata-python-sdk
 Author: Thordata Developer Team
@@ -22,20 +22,6 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: requests>=2.25.0
-Requires-Dist: aiohttp>=3.8.0
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: license-file
-Dynamic: project-url
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
 
 # Thordata Python SDK
 

{thordata_sdk-0.2.2 → thordata_sdk-0.2.4}/thordata_sdk.egg-info/SOURCES.txt

@@ -1,11 +1,11 @@
 LICENSE
 README.md
 setup.py
-tests/test_async_client.py
-tests/test_client.py
 thordata_sdk/__init__.py
 thordata_sdk/async_client.py
 thordata_sdk/client.py
+thordata_sdk/enums.py
+thordata_sdk/parameters.py
 thordata_sdk.egg-info/PKG-INFO
 thordata_sdk.egg-info/SOURCES.txt
 thordata_sdk.egg-info/dependency_links.txt

thordata_sdk-0.2.2/tests/test_async_client.py (removed)

@@ -1,59 +0,0 @@
-import pytest
-import aiohttp
-from aioresponses import aioresponses
-from thordata_sdk import AsyncThordataClient
-
-# Mark all tests in this module as async
-pytestmark = pytest.mark.asyncio
-
-# Mock Credentials
-TEST_SCRAPER = "async_scraper_token"
-TEST_PUB_TOKEN = "async_public_token"
-TEST_PUB_KEY = "async_key"
-TEST_HOST = "gate.thordata.com"
-TEST_PORT = 22225
-
-@pytest.fixture
-async def async_client():
-    """Fixture for AsyncThordataClient with context management."""
-    client = AsyncThordataClient(
-        scraper_token=TEST_SCRAPER,
-        public_token=TEST_PUB_TOKEN,
-        public_key=TEST_PUB_KEY,
-        proxy_host=TEST_HOST,
-        proxy_port=TEST_PORT
-    )
-    async with client:
-        yield client
-
-async def test_async_client_initialization(async_client):
-    """Test async client properties."""
-    expected_url = f"http://{TEST_HOST}:{TEST_PORT}"
-
-    assert async_client.proxy_url == expected_url
-    assert isinstance(async_client.proxy_auth, aiohttp.BasicAuth)
-    assert async_client.proxy_auth.login == TEST_SCRAPER
-
-async def test_async_successful_request(async_client):
-    """Test successful async proxy request."""
-    mock_url = "http://example.com/async_test"
-    mock_data = {"status": "async_ok"}
-
-    with aioresponses() as m:
-        m.get(mock_url, status=200, payload=mock_data)
-
-        response = await async_client.get(mock_url)
-
-        assert response.status == 200
-        data = await response.json()
-        assert data == mock_data
-
-async def test_async_http_error_handling(async_client):
-    """Test async HTTP error."""
-    error_url = "http://example.com/async_error"
-
-    with aioresponses() as m:
-        m.get(error_url, status=401)
-
-        response = await async_client.get(error_url)
-        assert response.status == 401

thordata_sdk-0.2.2/tests/test_client.py (removed)

@@ -1,53 +0,0 @@
-import requests
-import requests_mock
-import pytest
-from thordata_sdk.client import ThordataClient
-
-# Mock Credentials
-TEST_SCRAPER = "mock_scraper_token"
-TEST_PUB_TOKEN = "mock_public_token"
-TEST_PUB_KEY = "mock_public_key"
-TEST_HOST = "gate.thordata.com"
-TEST_PORT = 22225
-
-@pytest.fixture
-def client():
-    """Fixture to create a ThordataClient instance."""
-    return ThordataClient(
-        scraper_token=TEST_SCRAPER,
-        public_token=TEST_PUB_TOKEN,
-        public_key=TEST_PUB_KEY,
-        proxy_host=TEST_HOST,
-        proxy_port=TEST_PORT
-    )
-
-def test_client_initialization(client):
-    """Test client initialization and proxy URL construction."""
-    expected_url = f"http://{TEST_SCRAPER}:@{TEST_HOST}:{TEST_PORT}"
-
-    # Verify proxy configuration in session
-    assert client.session.proxies["http"] == expected_url
-    assert client.session.proxies["https"] == expected_url
-
-def test_successful_request(client):
-    """Test a successful proxy request (200 OK)."""
-    mock_url = "http://example.com/test"
-    mock_data = {"status": "ok"}
-
-    with requests_mock.Mocker() as m:
-        m.get(mock_url, status_code=200, json=mock_data)
-
-        response = client.get(mock_url)
-
-        assert response.status_code == 200
-        assert response.json() == mock_data
-
-def test_http_error_handling(client):
-    """Test handling of HTTP errors (e.g., 403 Forbidden)."""
-    error_url = "http://example.com/error"
-
-    with requests_mock.Mocker() as m:
-        m.get(error_url, status_code=403)
-
-        response = client.get(error_url)
-        assert response.status_code == 403

The remaining files are unchanged between 0.2.2 and 0.2.4: LICENSE, README.md, setup.cfg, thordata_sdk.egg-info/dependency_links.txt, thordata_sdk.egg-info/requires.txt, and thordata_sdk.egg-info/top_level.txt.