thordata-sdk 0.2.4__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only and reflects the changes between those versions.
@@ -0,0 +1,200 @@
+ Metadata-Version: 2.4
+ Name: thordata-sdk
+ Version: 0.3.1
+ Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
+ Author-email: Thordata Developer Team <support@thordata.com>
+ License: Apache-2.0
+ Project-URL: Homepage, https://www.thordata.com
+ Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
+ Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
+ Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
+ Keywords: web scraping,proxy,ai,llm,data-mining,serp,thordata
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Internet :: WWW/HTTP
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: requests>=2.25.0
+ Requires-Dist: aiohttp>=3.8.0
+ Dynamic: license-file
+
+ # Thordata Python SDK
+
+ <h4 align="center">
+   Official Python client for Thordata's Proxy Network, SERP API, Universal Scraping API, and Web Scraper API.
+   <br>
+   <i>Async-ready, built for AI agents and large-scale data collection.</i>
+ </h4>
+
+ <p align="center">
+   <a href="https://github.com/Thordata/thordata-python-sdk/actions/workflows/ci.yml">
+     <img src="https://github.com/Thordata/thordata-python-sdk/actions/workflows/ci.yml/badge.svg" alt="CI">
+   </a>
+   <a href="https://pypi.org/project/thordata-sdk/">
+     <img src="https://img.shields.io/pypi/v/thordata-sdk?color=blue" alt="PyPI version">
+   </a>
+   <a href="https://github.com/Thordata/thordata-python-sdk/blob/main/LICENSE">
+     <img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License">
+   </a>
+   <a href="https://python.org">
+     <img src="https://img.shields.io/badge/python-3.8+-blue" alt="Python Versions">
+   </a>
+ </p>
+
+ ---
+
+ ## Installation
+
+ ```bash
+ pip install thordata-sdk
+ ```
+
+ ## Quick Start
+
+ All examples below use the unified client:
+
+ ```python
+ from thordata import ThordataClient, AsyncThordataClient
+ ```
+
+ You can copy `examples/.env.example` to `.env` and fill in your tokens from the Thordata Dashboard.
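+
+ For example, a minimal `.env` could look like this (the variable names match the snippets below; the values are placeholders):
+
+ ```bash
+ THORDATA_SCRAPER_TOKEN=your_scraper_token
+ THORDATA_PUBLIC_TOKEN=your_public_token
+ THORDATA_PUBLIC_KEY=your_public_key
+ ```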
+
+ ### 1. Proxy Network (Simple GET)
+
+ ```python
+ import os
+ from dotenv import load_dotenv
+ from thordata import ThordataClient
+
+ load_dotenv()
+
+ client = ThordataClient(
+     scraper_token=os.getenv("THORDATA_SCRAPER_TOKEN"),
+     public_token=os.getenv("THORDATA_PUBLIC_TOKEN"),
+     public_key=os.getenv("THORDATA_PUBLIC_KEY"),
+ )
+
+ resp = client.get("http://httpbin.org/ip")
+ print(resp.json())
+ ```
+
+ ### 2. SERP API (Google, Bing, Yandex, DuckDuckGo)
+
+ ```python
+ from thordata import ThordataClient, Engine
+
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+ results = client.serp_search(
+     query="Thordata technology",
+     engine=Engine.GOOGLE,
+     num=10,
+     # Any engine-specific parameters are passed via **kwargs
+     # e.g. type="shopping", location="United States"
+ )
+
+ print(len(results.get("organic", [])))
+ ```
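+
+ The same call works for the other engines, and engine-specific keys are forwarded untouched; a short sketch (the `type` and `location` values here are illustrative):
+
+ ```python
+ from thordata import ThordataClient, Engine
+
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+ # Bing search through the same interface (engine accepts an Engine member or a plain string)
+ bing_results = client.serp_search(query="Thordata technology", engine=Engine.BING)
+
+ # Google Shopping: extra keys are passed straight through via **kwargs
+ shopping_results = client.serp_search(
+     query="running shoes",
+     engine=Engine.GOOGLE,
+     type="shopping",
+     location="United States",
+ )
+ print(list(shopping_results.keys()))  # inspect the returned sections
+ ```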
+
+ ### 3. Universal Scraping API
+
+ ```python
+ from thordata import ThordataClient
+
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+ html = client.universal_scrape(
+     url="https://www.google.com",
+     js_render=True,
+     output_format="HTML",
+ )
+ print(html[:200])
+ ```
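+
+ To capture a screenshot instead, request PNG output; this sketch assumes the PNG comes back as raw bytes, as in the 0.2.4 client shown further down in this diff:
+
+ ```python
+ from thordata import ThordataClient
+
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+ # Screenshot sketch: PNG output is expected to be returned as raw bytes
+ png_bytes = client.universal_scrape(
+     url="https://www.google.com",
+     js_render=True,
+     output_format="PNG",
+ )
+ with open("screenshot.png", "wb") as f:
+     f.write(png_bytes)
+ ```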
+
+ ### 4. Web Scraper API (Task-based)
+
+ ```python
+ import time
+ from thordata import ThordataClient
+
+ client = ThordataClient("SCRAPER_TOKEN", "PUBLIC_TOKEN", "PUBLIC_KEY")
+
+ task_id = client.create_scraper_task(
+     file_name="demo_youtube_data",
+     spider_id="youtube_video-post_by-url",
+     spider_name="youtube.com",
+     individual_params={
+         "url": "https://www.youtube.com/@stephcurry/videos",
+         "order_by": "",
+         "num_of_posts": ""
+     },
+ )
+
+ for _ in range(10):
+     status = client.get_task_status(task_id)
+     print("Status:", status)
+     if status in ["Ready", "Success"]:
+         break
+     if status == "Failed":
+         raise RuntimeError("Task failed")
+     time.sleep(3)
+
+ download_url = client.get_task_result(task_id)
+ print("Download URL:", download_url)
+ ```
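+
+ Continuing from the snippet above, the returned URL can be fetched like any ordinary file; a minimal sketch using `requests` (already a dependency of the SDK), assuming the default JSON result type:
+
+ ```python
+ import requests
+
+ # Fetch the result file from the returned download URL (JSON is the default result type)
+ resp = requests.get(download_url, timeout=60)
+ resp.raise_for_status()
+ with open("demo_youtube_data.json", "wb") as f:
+     f.write(resp.content)
+ ```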
+
+ ### 5. Asynchronous Usage (High Concurrency)
+
+ ```python
+ import asyncio
+ from thordata import AsyncThordataClient
+
+ async def main():
+     async with AsyncThordataClient(
+         scraper_token="SCRAPER_TOKEN",
+         public_token="PUBLIC_TOKEN",
+         public_key="PUBLIC_KEY",
+     ) as client:
+         resp = await client.get("http://httpbin.org/ip")
+         print(await resp.json())
+
+ asyncio.run(main())
+ ```
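+
+ For genuinely concurrent workloads, the same client can fan out many requests with `asyncio.gather`; a minimal sketch (the URL list is illustrative):
+
+ ```python
+ import asyncio
+ from thordata import AsyncThordataClient
+
+ async def fetch_all(urls):
+     async with AsyncThordataClient(
+         scraper_token="SCRAPER_TOKEN",
+         public_token="PUBLIC_TOKEN",
+         public_key="PUBLIC_KEY",
+     ) as client:
+         # Issue all requests concurrently through the proxy network
+         responses = await asyncio.gather(*(client.get(u) for u in urls))
+         return [await r.json() for r in responses]
+
+ results = asyncio.run(fetch_all(["http://httpbin.org/ip"] * 5))
+ print(results)
+ ```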
+
+ More examples are available in the `examples/` directory.
+
+ ---
+
+ ## Features
+
+ | Feature | Status | Description |
+ |---------|--------|-------------|
+ | Proxy Network | Stable | Residential, ISP, Mobile, Datacenter via HTTP/HTTPS gateway. |
+ | SERP API | Stable | Google / Bing / Yandex / DuckDuckGo, flexible parameters. |
+ | Universal Scraping API | Stable | JS rendering, HTML / PNG output, antibot bypass. |
+ | Web Scraper API | Stable | Task-based scraping for complex sites (YouTube, E-commerce). |
+ | Async Client | Stable | aiohttp-based client for high-concurrency workloads. |
+
+ ---
+
+ ## Development & Contributing
+
+ See `CONTRIBUTING.md` for local development and contribution guidelines.
+
+ ## License
+
+ This project is licensed under the Apache License 2.0.
+
+ ## Support
+
+ For technical support, please contact support@thordata.com, or verify your tokens and quotas in the Thordata Dashboard.
@@ -0,0 +1,10 @@
+ thordata/__init__.py,sha256=iv2luaDxUmcWTqScu08gGJfocUZr6pSwtzJs2akZ1Gg,365
+ thordata/async_client.py,sha256=cpBtRIzr8oH6GuZs8gTh505tGYYV1aRFBUzbtmFOfEg,9717
+ thordata/client.py,sha256=WVIvIZTACEpw9NaTbGtIkMGUlfliFL7kNGdCoTxxsUI,17193
+ thordata/enums.py,sha256=PGUCQX3jw5a9mX8_JfhuyoR1WriWjWQpAgibVP_bpdM,679
+ thordata/parameters.py,sha256=1lNx_BSS8ztBKEj_MXZMaIQQ9_W3EAlS-VFiBqSWb9E,1841
+ thordata_sdk-0.3.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+ thordata_sdk-0.3.1.dist-info/METADATA,sha256=NMq7Be240zn2q3MlUUg2Dmo4NFoQtDMgkRAGzjg_yjc,5901
+ thordata_sdk-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ thordata_sdk-0.3.1.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
+ thordata_sdk-0.3.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.45.1)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
@@ -0,0 +1 @@
+ thordata
thordata_sdk/__init__.py DELETED
@@ -1,9 +0,0 @@
- # Expose main clients
- from .client import ThordataClient
- from .async_client import AsyncThordataClient
- from .enums import Engine, GoogleSearchType
-
- # Version of the thordata-sdk package
- __version__ = "0.2.4"
-
- __all__ = ["ThordataClient", "AsyncThordataClient"]
thordata_sdk/client.py DELETED
@@ -1,303 +0,0 @@
- import requests
- import logging
- import json
- import base64
- from typing import Dict, Any, Union, Optional
-
- from .enums import Engine
- from .parameters import normalize_serp_params
-
- # Configure a library-specific logger
- logger = logging.getLogger(__name__)
-
-
- class ThordataClient:
-     """
-     The official synchronous Python client for Thordata.
-
-     Handles authentication for:
-     1. Proxy Network (HTTP/HTTPS)
-     2. SERP API (Real-time Search)
-     3. Universal Scraping API (Single Page)
-     4. Web Scraper API (Async Task Management)
-     """
-
-     def __init__(
-         self,
-         scraper_token: str,
-         public_token: str,
-         public_key: str,
-         proxy_host: str = "gate.thordata.com",
-         proxy_port: int = 22225
-     ):
-         """
-         Initialize the Thordata Client.
-
-         Args:
-             scraper_token (str): Token from the bottom of the Dashboard.
-             public_token (str): Token from the Public API section.
-             public_key (str): Key from the Public API section.
-             proxy_host (str): Proxy gateway host.
-             proxy_port (int): Proxy gateway port.
-         """
-         self.scraper_token = scraper_token
-         self.public_token = public_token
-         self.public_key = public_key
-
-         # Proxy Configuration
-         self.proxy_url = (
-             f"http://{self.scraper_token}:@{proxy_host}:{proxy_port}"
-         )
-
-         # API Endpoints
-         self.base_url = "https://scraperapi.thordata.com"
-         self.universal_url = "https://universalapi.thordata.com"
-         self.api_url = "https://api.thordata.com/api/web-scraper-api"
-
-         self.SERP_API_URL = f"{self.base_url}/request"
-         self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
-         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
-         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
-         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
-
-         self.session = requests.Session()
-         self.session.proxies = {
-             "http": self.proxy_url,
-             "https": self.proxy_url,
-         }
-
-     def get(self, url: str, **kwargs) -> requests.Response:
-         """
-         Send a GET request through the Thordata Proxy Network.
-         """
-         logger.debug(f"Proxy Request: {url}")
-         kwargs.setdefault("timeout", 30)
-         return self.session.get(url, **kwargs)
-
-     def serp_search(
-         self,
-         query: str,
-         engine: Union[Engine, str] = Engine.GOOGLE,  # accepts an Engine member or a plain string
-         num: int = 10,
-         **kwargs  # collects any extra engine-specific parameters (e.g. type="maps")
-     ) -> Dict[str, Any]:
-         """
-         Execute a real-time SERP search.
-
-         Args:
-             query: Keywords
-             engine: 'google', 'bing', 'yandex' etc.
-             num: Number of results (default 10)
-             **kwargs: Extra parameters (e.g., type="shopping", location="London")
-         """
-         # Compatibility handling: take .value from an Engine member, lowercase a plain string
-         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
-
-         # Delegate parameter normalization to parameters.py
-         payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
-
-         headers = {
-             "Authorization": f"Bearer {self.scraper_token}",
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-
-         logger.info(f"SERP Search: {engine_str} - {query}")
-         try:
-             response = self.session.post(
-                 self.SERP_API_URL,
-                 data=payload,
-                 headers=headers,
-                 timeout=60
-             )
-             response.raise_for_status()
-
-             data = response.json()
-             if isinstance(data, str):
-                 try: data = json.loads(data)
-                 except: pass
-             return data
-         except Exception as e:
-             logger.error(f"SERP Request Failed: {e}")
-             raise
-
-
-     def universal_scrape(
-         self,
-         url: str,
-         js_render: bool = False,
-         output_format: str = "HTML",
-         country: str = None,
-         block_resources: bool = False
-     ) -> Union[str, bytes]:
-         """
-         Unlock target pages via the Universal Scraping API.
-         """
-         headers = {
-             "Authorization": f"Bearer {self.scraper_token}",
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-
-         payload = {
-             "url": url,
-             "js_render": "True" if js_render else "False",
-             "type": output_format.lower(),
-             "block_resources": "True" if block_resources else "False"
-         }
-         if country:
-             payload["country"] = country
-
-         logger.info(f"Universal Scrape: {url}")
-
-         try:
-             response = self.session.post(
-                 self.UNIVERSAL_API_URL,
-                 data=payload,
-                 headers=headers,
-                 timeout=60
-             )
-             response.raise_for_status()
-
-             # Parse JSON wrapper
-             try:
-                 resp_json = response.json()
-             except json.JSONDecodeError:
-                 # Fallback for raw response
-                 if output_format.upper() == "PNG":
-                     return response.content
-                 return response.text
-
-             # Check API errors
-             if isinstance(resp_json, dict) and resp_json.get("code") \
-                     and resp_json.get("code") != 200:
-                 raise Exception(f"Universal API Error: {resp_json}")
-
-             # Extract HTML
-             if "html" in resp_json:
-                 return resp_json["html"]
-
-             # Extract PNG
-             if "png" in resp_json:
-                 png_str = resp_json["png"]
-                 if not png_str:
-                     raise Exception("API returned empty PNG data")
-
-                 # FIX: strip the data URI scheme prefix (data:image/png;base64,)
-                 if "," in png_str:
-                     png_str = png_str.split(",", 1)[1]
-
-                 # Base64 decode (restoring any missing padding)
-                 png_str = png_str.replace("\n", "").replace("\r", "")
-                 missing_padding = len(png_str) % 4
-                 if missing_padding:
-                     png_str += '=' * (4 - missing_padding)
-
-                 return base64.b64decode(png_str)
-
-             return str(resp_json)
-
-         except Exception as e:
-             logger.error(f"Universal Scrape Failed: {e}")
-             raise
-
-     def create_scraper_task(
-         self,
-         file_name: str,
-         spider_id: str,  # required; obtain it from the Dashboard
-         spider_name: str,  # required, e.g. "youtube.com"
-         individual_params: Dict[str, Any],  # per-task parameters packed into this dict
-         universal_params: Dict[str, Any] = None
-     ) -> str:
-         """
-         Create a generic Web Scraper Task.
-
-         Note: Check the Thordata Dashboard to get the correct 'spider_id' and 'spider_name'.
-         """
-         headers = {
-             "Authorization": f"Bearer {self.scraper_token}",
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-
-         # Send the payload as-is, without heavy validation, to keep compatibility broad
-         payload = {
-             "spider_name": spider_name,
-             "spider_id": spider_id,
-             "spider_parameters": json.dumps([individual_params]),
-             "spider_errors": "true",
-             "file_name": file_name
-         }
-         if universal_params:
-             payload["spider_universal"] = json.dumps(universal_params)
-
-         logger.info(f"Creating Scraper Task: {spider_name} (ID: {spider_id})")
-         try:
-             response = self.session.post(
-                 self.SCRAPER_BUILDER_URL,
-                 data=payload,
-                 headers=headers
-             )
-             response.raise_for_status()
-             data = response.json()
-
-             if data.get("code") != 200:
-                 raise Exception(f"Creation failed: {data}")
-             return data["data"]["task_id"]
-         except Exception as e:
-             logger.error(f"Task Creation Failed: {e}")
-             raise
-
-     def get_task_status(self, task_id: str) -> str:
-         """
-         Check the status of a task.
-         """
-         headers = {
-             "token": self.public_token,
-             "key": self.public_key,
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-         payload = {"tasks_ids": task_id}
-
-         try:
-             response = self.session.post(
-                 self.SCRAPER_STATUS_URL,
-                 data=payload,
-                 headers=headers
-             )
-             response.raise_for_status()
-             data = response.json()
-
-             if data.get("code") == 200 and data.get("data"):
-                 for item in data["data"]:
-                     if str(item.get("task_id")) == str(task_id):
-                         return item["status"]
-             return "Unknown"
-         except Exception as e:
-             logger.error(f"Status Check Failed: {e}")
-             return "Error"
-
-     def get_task_result(self, task_id: str, file_type: str = "json") -> str:
-         """
-         Retrieve the download URL for a completed task.
-         """
-         headers = {
-             "token": self.public_token,
-             "key": self.public_key,
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-         payload = {"tasks_id": task_id, "type": file_type}
-
-         logger.info(f"Getting result URL: {task_id}")
-         try:
-             response = self.session.post(
-                 self.SCRAPER_DOWNLOAD_URL,
-                 data=payload,
-                 headers=headers
-             )
-             response.raise_for_status()
-             data = response.json()
-
-             if data.get("code") == 200 and data.get("data"):
-                 return data["data"]["download"]
-             raise Exception(f"API returned error: {data}")
-         except Exception as e:
-             logger.error(f"Get Result Failed: {e}")
-             raise
thordata_sdk/enums.py DELETED
@@ -1,20 +0,0 @@
- # thordata_sdk/enums.py
- from enum import Enum
-
- class Engine(str, Enum):
-     """Core search engines supported by the SERP API"""
-     GOOGLE = "google"
-     BING = "bing"
-     YANDEX = "yandex"
-     DUCKDUCKGO = "duckduckgo"
-     BAIDU = "baidu"
-
- class GoogleSearchType(str, Enum):
-     """Common sub-types of Google search"""
-     SEARCH = "search"      # default web search
-     MAPS = "maps"          # maps
-     SHOPPING = "shopping"  # shopping
-     NEWS = "news"          # news
-     IMAGES = "images"      # images
-     VIDEOS = "videos"      # videos
-     # Less common types are omitted; pass them as plain strings instead
@@ -1,41 +0,0 @@
- # thordata_sdk/parameters.py
- from typing import Dict, Any
-
- def normalize_serp_params(engine: str, query: str, **kwargs) -> Dict[str, Any]:
-     """
-     Normalize parameter differences across search engines.
-     """
-     # 1. Base parameters
-     payload = {
-         "num": str(kwargs.get("num", 10)),
-         "json": "1",
-         "engine": engine,
-     }
-
-     # 2. Query keyword handling (Yandex uses 'text', everything else uses 'q')
-     if engine == "yandex":
-         payload["text"] = query
-         # Provide a default url if the caller did not pass one
-         if "url" not in kwargs:
-             payload["url"] = "yandex.com"
-     else:
-         payload["q"] = query
-
-     # 3. Default URL handling (when the caller did not pass one)
-     if "url" not in kwargs:
-         defaults = {
-             "google": "google.com",
-             "bing": "bing.com",
-             "duckduckgo": "duckduckgo.com",
-             "baidu": "baidu.com"
-         }
-         if engine in defaults:
-             payload["url"] = defaults[engine]
-
-     # 4. Pass through every other caller-supplied parameter unchanged
-     #    (e.g. type="shopping", google_domain="google.co.uk"); no need to enumerate each type.
-     for k, v in kwargs.items():
-         if k not in ["num", "engine", "q", "text"]:  # avoid overwriting
-             payload[k] = v
-
-     return payload