thordata-sdk 0.2.2__tar.gz → 0.2.3__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the registry.
- {thordata_sdk-0.2.2/thordata_sdk.egg-info → thordata_sdk-0.2.3}/PKG-INFO +1 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/setup.py +1 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/__init__.py +1 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/async_client.py +66 -32
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/client.py +80 -13
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3/thordata_sdk.egg-info}/PKG-INFO +1 -1
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/LICENSE +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/README.md +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/setup.cfg +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/tests/test_async_client.py +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/tests/test_client.py +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk.egg-info/SOURCES.txt +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk.egg-info/dependency_links.txt +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk.egg-info/requires.txt +0 -0
- {thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk.egg-info/top_level.txt +0 -0
{thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='thordata_sdk',
-    version='0.2.2',
+    version='0.2.3',  # Bump version due to breaking auth changes
     packages=find_packages(include=['thordata_sdk', 'thordata_sdk.*']),
     install_requires=[
         'requests>=2.25.0',  # Standard synchronous HTTP
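The version comment cites breaking auth changes. Elsewhere in this diff, proxy requests authenticate with the scraper token as the proxy username and an empty password, while the new Universal API uses a Bearer header. A minimal sketch of both shapes, with a placeholder token:

# Placeholder token for illustration only.
scraper_token = "YOUR_SCRAPER_TOKEN"

# Proxy auth (as in the client.py hunk below): token as username, empty password.
proxy_url = f"http://{scraper_token}:@gate.thordata.com:22225"

# Universal API auth (as in both universal_scrape additions): Bearer token header.
headers = {"Authorization": f"Bearer {scraper_token}"}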
{thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/async_client.py

@@ -1,7 +1,8 @@
 import aiohttp
 import logging
 import json
-
+import base64
+from typing import Optional, Dict, Any, Union
 
 logger = logging.getLogger(__name__)
 
@@ -9,11 +10,6 @@ logger = logging.getLogger(__name__)
 class AsyncThordataClient:
     """
     Thordata Asynchronous Client (built on aiohttp).
-    Designed for high-concurrency and low-latency data collection tasks.
-
-    Usage:
-        async with AsyncThordataClient(...) as client:
-            await client.get("http://example.com")
     """
 
     def __init__(
@@ -24,22 +20,19 @@ class AsyncThordataClient:
         proxy_host: str = "gate.thordata.com",
         proxy_port: int = 22225
     ):
-        """
-        Initialize the asynchronous client.
-        """
         self.scraper_token = scraper_token
         self.public_token = public_token
         self.public_key = public_key
 
-        # Proxy Authentication
         self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
         self.proxy_url = f"http://{proxy_host}:{proxy_port}"
 
-        # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
+        self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
 
         self.SERP_API_URL = f"{self.base_url}/request"
+        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
@@ -55,21 +48,14 @@ class AsyncThordataClient:
         await self.close()
 
     async def close(self):
-        """Close the underlying aiohttp session."""
         if self._session and not self._session.closed:
             await self._session.close()
             self._session = None
 
-    # --- Proxy
-
+    # --- Proxy ---
     async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-        """
-        Send an asynchronous GET request through the Thordata Proxy.
-        """
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
-
-        logger.debug(f"Async Proxy Request: {url}")
         try:
             return await self._session.get(
                 url,
@@ -81,21 +67,16 @@ class AsyncThordataClient:
             logger.error(f"Async Request failed: {e}")
             raise
 
-    # --- SERP
-
+    # --- SERP ---
     async def serp_search(
         self, query: str, engine: str = "google", num: int = 10, **kwargs
     ) -> Dict[str, Any]:
-        """Async SERP search."""
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
 
         payload = {
-            "q": query,
-            "num": str(num),
-            "json": "1",
-            "engine": engine.lower(),
-            **kwargs
+            "q": query, "num": str(num), "json": "1",
+            "engine": engine.lower(), **kwargs
         }
         if engine.lower() == 'yandex':
             payload['text'] = payload.pop('q')
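As a quick illustration of the consolidated payload logic above, including the Yandex rename of "q" to "text" (values are made up):

# Sketch of the payload construction shown in the hunk above.
query, num, engine = "thordata", 10, "yandex"
payload = {
    "q": query, "num": str(num), "json": "1",
    "engine": engine.lower(),
}
if engine.lower() == "yandex":
    # Yandex expects the query under "text" rather than "q".
    payload["text"] = payload.pop("q")
assert payload["text"] == "thordata"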
@@ -117,7 +98,6 @@ class AsyncThordataClient:
         ) as response:
             response.raise_for_status()
             data = await response.json()
-            # Handle double-encoding
             if isinstance(data, str):
                 try:
                     data = json.loads(data)
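The isinstance(data, str) guard exists because the API can return double-encoded JSON: a JSON string whose content is itself JSON. A self-contained demonstration of the recovery step:

import json

# A double-encoded response: the JSON body is a string containing JSON.
raw = json.dumps(json.dumps({"results": []}))
data = json.loads(raw)           # First decode yields a str, not a dict.
if isinstance(data, str):
    try:
        data = json.loads(data)  # Second decode recovers the dict.
    except json.JSONDecodeError:
        pass
assert data == {"results": []}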
@@ -125,8 +105,65 @@ class AsyncThordataClient:
                     pass
             return data
 
-    # ---
+    # --- Universal ---
+    async def universal_scrape(
+        self,
+        url: str,
+        js_render: bool = False,
+        output_format: str = "HTML",
+        country: str = None,
+        block_resources: bool = False
+    ) -> Union[str, bytes]:
+        if self._session is None:
+            raise RuntimeError("Client session not initialized.")
+
+        headers = {
+            "Authorization": f"Bearer {self.scraper_token}",
+            "Content-Type": "application/x-www-form-urlencoded"
+        }
+
+        payload = {
+            "url": url,
+            "js_render": "True" if js_render else "False",
+            "type": output_format.lower(),
+            "block_resources": "True" if block_resources else "False"
+        }
+        if country:
+            payload["country"] = country
+
+        async with self._session.post(
+            self.UNIVERSAL_API_URL, data=payload, headers=headers
+        ) as response:
+            response.raise_for_status()
+
+            try:
+                resp_json = await response.json()
+            except Exception:
+                if output_format.upper() == "PNG":
+                    return await response.read()
+                return await response.text()
+
+            if isinstance(resp_json, dict) and resp_json.get("code") \
+                    and resp_json.get("code") != 200:
+                raise Exception(f"Universal API Error: {resp_json}")
+
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            if "png" in resp_json:
+                png_str = resp_json["png"]
+                if not png_str:
+                    raise Exception("API returned empty PNG data")
+
+                png_str = png_str.replace("\n", "").replace("\r", "")
+                missing_padding = len(png_str) % 4
+                if missing_padding:
+                    png_str += '=' * (4 - missing_padding)
+                return base64.b64decode(png_str)
+
+            return str(resp_json)
 
+    # --- Web Scraper ---
     async def create_scraper_task(
         self,
         file_name: str,
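Given the context-manager support retained in this file and the new method above, async usage would look roughly like the following sketch. The import path assumes a top-level export, and the credential values are placeholders:

import asyncio

from thordata_sdk import AsyncThordataClient  # assumed top-level export

async def main():
    # Placeholder credentials; the constructor fields match the diff above.
    async with AsyncThordataClient(
        scraper_token="YOUR_SCRAPER_TOKEN",
        public_token="YOUR_PUBLIC_TOKEN",
        public_key="YOUR_PUBLIC_KEY",
    ) as client:
        # HTML render of a page, with JavaScript execution enabled.
        html = await client.universal_scrape(
            "https://example.com", js_render=True, output_format="HTML"
        )
        print(html[:200])

asyncio.run(main())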
@@ -135,7 +172,6 @@ class AsyncThordataClient:
         spider_name: str = "youtube.com",
         universal_params: Dict[str, Any] = None
     ) -> str:
-        """Create an async scraping task."""
         if self._session is None:
             raise RuntimeError("Client session not initialized.")
 
@@ -164,7 +200,6 @@ class AsyncThordataClient:
             return data["data"]["task_id"]
 
     async def get_task_status(self, task_id: str) -> str:
-        """Check task status."""
         headers = {
             "token": self.public_token,
             "key": self.public_key,
@@ -183,7 +218,6 @@ class AsyncThordataClient:
             return "Unknown"
 
     async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
-        """Get download link."""
         headers = {
             "token": self.public_token,
             "key": self.public_key,
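The three task methods above form a submit-poll-download loop. A hedged sketch of that workflow (the create_scraper_task arguments shown are illustrative and omit any required fields not visible in this diff; status strings come from the sync client's docstring):

import asyncio

async def run_task(client) -> str:
    task_id = await client.create_scraper_task(
        file_name="demo", spider_name="youtube.com"  # illustrative arguments
    )
    status = "Running"
    while status == "Running":
        await asyncio.sleep(5)  # poll interval is a guess, not an SDK default
        status = await client.get_task_status(task_id)
    if status != "Ready":
        raise RuntimeError(f"Task ended with status: {status}")
    return await client.get_task_result(task_id, file_type="json")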
{thordata_sdk-0.2.2 → thordata_sdk-0.2.3}/thordata_sdk/client.py

@@ -1,7 +1,8 @@
 import requests
 import logging
 import json
-
+import base64
+from typing import Dict, Any, Union
 
 # Configure a library-specific logger
 logger = logging.getLogger(__name__)
@@ -14,7 +15,8 @@ class ThordataClient:
     Handles authentication for:
     1. Proxy Network (HTTP/HTTPS)
     2. SERP API (Real-time Search)
-    3. Web Scraper API (Async Task Management)
+    3. Universal Scraping API (Single Page)
+    4. Web Scraper API (Async Task Management)
     """
 
     def __init__(
@@ -39,16 +41,18 @@ class ThordataClient:
         self.public_token = public_token
         self.public_key = public_key
 
-        # Proxy Configuration
+        # Proxy Configuration
         self.proxy_url = (
            f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
        )
 
         # API Endpoints
         self.base_url = "https://scraperapi.thordata.com"
+        self.universal_url = "https://universalapi.thordata.com"
         self.api_url = "https://api.thordata.com/api/web-scraper-api"
 
         self.SERP_API_URL = f"{self.base_url}/request"
+        self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
@@ -62,13 +66,6 @@ class ThordataClient:
     def get(self, url: str, **kwargs) -> requests.Response:
         """
         Send a GET request through the Thordata Proxy Network.
-
-        Args:
-            url (str): The target URL.
-            **kwargs: Additional arguments passed to requests.get().
-
-        Returns:
-            requests.Response: The HTTP response.
         """
         logger.debug(f"Proxy Request: {url}")
         kwargs.setdefault("timeout", 30)
@@ -88,7 +85,6 @@ class ThordataClient:
             **kwargs
         }
 
-        # Engine-specific parameter adjustments
         if engine.lower() == 'yandex':
             payload['text'] = payload.pop('q')
             if 'url' not in payload:
@@ -117,7 +113,6 @@ class ThordataClient:
             response.raise_for_status()
             data = response.json()
 
-            # Handle potential double-encoded JSON strings
             if isinstance(data, str):
                 try:
                     data = json.loads(data)
@@ -128,6 +123,79 @@ class ThordataClient:
             logger.error(f"SERP Request Failed: {e}")
             raise
 
+    def universal_scrape(
+        self,
+        url: str,
+        js_render: bool = False,
+        output_format: str = "HTML",
+        country: str = None,
+        block_resources: bool = False
+    ) -> Union[str, bytes]:
+        """
+        Unlock target pages via the Universal Scraping API.
+        """
+        headers = {
+            "Authorization": f"Bearer {self.scraper_token}",
+            "Content-Type": "application/x-www-form-urlencoded"
+        }
+
+        payload = {
+            "url": url,
+            "js_render": "True" if js_render else "False",
+            "type": output_format.lower(),
+            "block_resources": "True" if block_resources else "False"
+        }
+        if country:
+            payload["country"] = country
+
+        logger.info(f"Universal Scrape: {url}")
+
+        try:
+            response = self.session.post(
+                self.UNIVERSAL_API_URL,
+                data=payload,
+                headers=headers,
+                timeout=60
+            )
+            response.raise_for_status()
+
+            # Parse JSON wrapper
+            try:
+                resp_json = response.json()
+            except json.JSONDecodeError:
+                # Fallback for raw response
+                if output_format.upper() == "PNG":
+                    return response.content
+                return response.text
+
+            # Check API errors
+            if isinstance(resp_json, dict) and resp_json.get("code") \
+                    and resp_json.get("code") != 200:
+                raise Exception(f"Universal API Error: {resp_json}")
+
+            # Extract HTML
+            if "html" in resp_json:
+                return resp_json["html"]
+
+            # Extract PNG (Base64 decoding with padding fix)
+            if "png" in resp_json:
+                png_str = resp_json["png"]
+                if not png_str:
+                    raise Exception("API returned empty PNG data")
+
+                png_str = png_str.replace("\n", "").replace("\r", "")
+                missing_padding = len(png_str) % 4
+                if missing_padding:
+                    png_str += '=' * (4 - missing_padding)
+
+                return base64.b64decode(png_str)
+
+            return str(resp_json)
+
+        except Exception as e:
+            logger.error(f"Universal Scrape Failed: {e}")
+            raise
+
     def create_scraper_task(
         self,
         file_name: str,
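The synchronous variant mirrors the async one. Since output_format="PNG" returns base64-decoded bytes (with the padding fix above restoring any '=' characters stripped in transit, so the string length is a multiple of 4 before b64decode), a screenshot can be written straight to disk. A sketch with an assumed top-level import and placeholder credentials:

from thordata_sdk import ThordataClient  # assumed top-level export

# Placeholder credentials for illustration.
client = ThordataClient(
    scraper_token="YOUR_SCRAPER_TOKEN",
    public_token="YOUR_PUBLIC_TOKEN",
    public_key="YOUR_PUBLIC_KEY",
)

# PNG output: universal_scrape returns decoded bytes, per the hunk above.
png_bytes = client.universal_scrape(
    "https://example.com", js_render=True, output_format="PNG"
)
with open("screenshot.png", "wb") as f:
    f.write(png_bytes)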
@@ -174,7 +242,6 @@ class ThordataClient:
     def get_task_status(self, task_id: str) -> str:
         """
         Check the status of a task.
-        Returns: 'Running', 'Ready', 'Failed', or 'Unknown'.
         """
         headers = {
             "token": self.public_token,