thordata-sdk 0.2.4__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata_sdk-1.2.0.dist-info/METADATA ADDED
@@ -0,0 +1,208 @@
+ Metadata-Version: 2.4
+ Name: thordata-sdk
+ Version: 1.2.0
+ Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
+ Author-email: Thordata Developer Team <support@thordata.com>
+ License: MIT
+ Project-URL: Homepage, https://www.thordata.com
+ Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
+ Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
+ Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
+ Project-URL: Changelog, https://github.com/Thordata/thordata-python-sdk/blob/main/CHANGELOG.md
+ Keywords: web scraping,proxy,residential proxy,datacenter proxy,ai,llm,data-mining,serp,thordata,web scraper,anti-bot bypass
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Internet :: WWW/HTTP
+ Classifier: Topic :: Internet :: Proxy Servers
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Typing :: Typed
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: requests>=2.25.0
+ Requires-Dist: aiohttp>=3.9.0
+ Requires-Dist: PySocks>=1.7.1
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+ Requires-Dist: pytest-httpserver>=1.0.0; extra == "dev"
+ Requires-Dist: python-dotenv>=1.0.0; extra == "dev"
+ Requires-Dist: black>=23.0.0; extra == "dev"
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+ Requires-Dist: types-requests>=2.28.0; extra == "dev"
+ Requires-Dist: aioresponses>=0.7.6; extra == "dev"
+ Dynamic: license-file
+
+ # Thordata Python SDK
+
+ <div align="center">
+
+ <img src="https://img.shields.io/badge/Thordata-AI%20Infrastructure-blue?style=for-the-badge" alt="Thordata Logo">
+
+ **The Official Python Client for Thordata APIs**
+
+ *Proxy Network • SERP API • Web Unlocker • Web Scraper API*
+
+ [![PyPI version](https://img.shields.io/pypi/v/thordata-sdk.svg?style=flat-square)](https://pypi.org/project/thordata-sdk/)
+ [![Python Versions](https://img.shields.io/pypi/pyversions/thordata-sdk.svg?style=flat-square)](https://pypi.org/project/thordata-sdk/)
+ [![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE)
+ [![CI Status](https://img.shields.io/github/actions/workflow/status/Thordata/thordata-python-sdk/ci.yml?branch=main&style=flat-square)](https://github.com/Thordata/thordata-python-sdk/actions)
+
+ </div>
+
+ ---
+
+ ## 📖 Introduction
+
+ This SDK provides a robust, high-performance interface to Thordata's AI data infrastructure. It is designed for high-concurrency scraping, reliable proxy tunneling, and seamless data extraction.
+
+ **Key Features:**
+ * **🚀 Production Ready:** Built on `urllib3` connection pooling for low-latency proxy requests.
+ * **⚡ Async Support:** Native `aiohttp` client for high-concurrency SERP/Universal scraping.
+ * **🛡️ Robust:** Handles TLS-in-TLS tunneling, retries, and error parsing automatically.
+ * **✨ Developer Experience:** Fully typed (`mypy` compatible) with intuitive IDE autocomplete.
+ * **🧩 Lazy Validation:** Validates credentials only for the features you actually use.
+
+ ---
+
+ ## 📦 Installation
+
+ ```bash
+ pip install thordata-sdk
+ ```
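+
+ To work on the SDK itself, the optional `dev` extra pulls in the test and lint tooling declared in the package metadata (pytest, ruff, mypy, and friends):
+
+ ```bash
+ # Optional: development and test tooling
+ pip install "thordata-sdk[dev]"
+ ```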
+
+ ---
+
+ ## 🔐 Configuration
+
+ Set environment variables to avoid hardcoding credentials. You only need to set the variables for the features you use.
+
+ ```bash
+ # [Required for SERP & Web Unlocker]
+ export THORDATA_SCRAPER_TOKEN="your_token_here"
+
+ # [Required for Proxy Network]
+ export THORDATA_RESIDENTIAL_USERNAME="your_username"
+ export THORDATA_RESIDENTIAL_PASSWORD="your_password"
+ export THORDATA_PROXY_HOST="vpnXXXX.pr.thordata.net"
+
+ # [Required for Task Management]
+ export THORDATA_PUBLIC_TOKEN="public_token"
+ export THORDATA_PUBLIC_KEY="public_key"
+ ```
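+
+ For local development, one option is to keep these values in a `.env` file and load them before constructing a client. A minimal sketch using `python-dotenv` (shipped in the `dev` extra):
+
+ ```python
+ import os
+
+ from dotenv import load_dotenv  # from the python-dotenv dev dependency
+
+ load_dotenv()  # picks up a local .env file, if present
+
+ # Sanity-check the token used by the SERP and Web Unlocker examples below
+ if not os.getenv("THORDATA_SCRAPER_TOKEN"):
+     raise RuntimeError("THORDATA_SCRAPER_TOKEN is not configured")
+ ```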
+
+ ---
+
+ ## 🚀 Quick Start
+
+ ### 1. SERP Search (Google/Bing/Yandex)
+
+ ```python
+ from thordata import ThordataClient, Engine
+
+ client = ThordataClient()  # Loads THORDATA_SCRAPER_TOKEN from env
+
+ # Simple Search
+ print("Searching...")
+ results = client.serp_search("latest AI trends", engine=Engine.GOOGLE_NEWS)
+
+ for news in results.get("news_results", [])[:3]:
+     print(f"- {news['title']} ({news['source']})")
+ ```
+
+ ### 2. Universal Scrape (Web Unlocker)
+
+ Bypass Cloudflare/Akamai and render JavaScript automatically.
+
+ ```python
+ html = client.universal_scrape(
+     url="https://example.com/protected-page",
+     js_render=True,
+     wait_for=".content-loaded",
+     country="us"
+ )
+ print(f"Scraped {len(html)} bytes")
+ ```
+
+ ### 3. High-Performance Proxy
+
+ Use Thordata's residential IPs with automatic connection pooling.
+
+ ```python
+ from thordata import ProxyConfig, ProxyProduct
+
+ # Config is optional if env vars are set, but allows granular control
+ proxy = ProxyConfig(
+     product=ProxyProduct.RESIDENTIAL,
+     country="jp",
+     city="tokyo",
+     session_id="session-001",
+     session_duration=10  # Sticky IP for 10 mins
+ )
+
+ # Use the client to make requests (Reuses TCP connections)
+ response = client.get("https://httpbin.org/ip", proxy_config=proxy)
+ print(response.json())
+ ```
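+
+ Because `session_id` pins a sticky IP, reusing the same `ProxyConfig` object keeps consecutive requests on one exit node. A small illustrative loop (it should report the same origin IP while the session lasts):
+
+ ```python
+ # Reuse the sticky session configured above across several requests
+ for attempt in range(3):
+     response = client.get("https://httpbin.org/ip", proxy_config=proxy)
+     print(f"Attempt {attempt + 1}: {response.json()['origin']}")
+ ```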
+
+ ---
+
+ ## ⚙️ Advanced Usage
+
+ ### Async Client (High Concurrency)
+
+ For building AI agents or high-throughput spiders.
+
+ ```python
+ import asyncio
+ from thordata import AsyncThordataClient
+
+ async def main():
+     async with AsyncThordataClient() as client:
+         # Fire off multiple requests in parallel
+         tasks = [
+             client.serp_search(f"query {i}")
+             for i in range(5)
+         ]
+         results = await asyncio.gather(*tasks)
+         print(f"Completed {len(results)} searches")
+
+ asyncio.run(main())
+ ```
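+
+ When running hundreds of searches, it is usually worth capping how many are in flight at once. A sketch of the same pattern bounded by a plain `asyncio.Semaphore`:
+
+ ```python
+ import asyncio
+
+ from thordata import AsyncThordataClient
+
+ async def bounded_main():
+     semaphore = asyncio.Semaphore(10)  # at most 10 concurrent searches
+
+     async with AsyncThordataClient() as client:
+
+         async def search(query: str):
+             async with semaphore:
+                 return await client.serp_search(query)
+
+         results = await asyncio.gather(*(search(f"query {i}") for i in range(100)))
+         print(f"Completed {len(results)} searches")
+
+ asyncio.run(bounded_main())
+ ```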
+
+ ### Web Scraper API (Task Management)
+
+ Create and manage large-scale scraping tasks asynchronously.
+
+ ```python
+ # 1. Create a task
+ task_id = client.create_scraper_task(
+     file_name="daily_scrape",
+     spider_id="universal",
+     spider_name="universal",
+     parameters={"url": "https://example.com"}
+ )
+
+ # 2. Wait for completion (Polling)
+ status = client.wait_for_task(task_id)
+
+ # 3. Get results
+ if status == "ready":
+     url = client.get_task_result(task_id)
+     print(f"Download Data: {url}")
+ ```
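+
+ `get_task_result` returns a download URL, so the finished dataset can be fetched with any HTTP client. A sketch using `requests` (already a dependency), assuming the link is directly downloadable:
+
+ ```python
+ import requests
+
+ if status == "ready":
+     download_url = client.get_task_result(task_id)
+     resp = requests.get(download_url, timeout=60)
+     resp.raise_for_status()
+     with open("daily_scrape_result.json", "wb") as fh:  # hypothetical local filename
+         fh.write(resp.content)
+ ```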
+
+ ---
+
+ ## 📄 License
+
+ MIT License. See [LICENSE](LICENSE) for details.
thordata_sdk-1.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ thordata/__init__.py,sha256=Ac7vBgJx4idvcw-Q5HFI0K9xkpSjtavQ_CRN5fZnBPk,3195
+ thordata/_example_utils.py,sha256=T9QtVq9BHhubOShgtGp2GSusYYd-ZFUJFJAw7ubIsa4,2199
+ thordata/_utils.py,sha256=Acr_6sHgdZXU7SQozd6FEYTZV6iHw__nlhpBTDwb66U,4917
+ thordata/async_client.py,sha256=Cd94u7Si8WRClZMzVfPfUO0a9U1vEDOXysuBQMEjvjs,56075
+ thordata/client.py,sha256=KRveHA62hF3NzbJHYQSQdIsEYeozhL7rdDHg--N7GaM,56159
+ thordata/demo.py,sha256=HQzgaUM33bWD7mBQ6HEkK5K6zqFnSAHLvaam6BwPgFA,3762
+ thordata/enums.py,sha256=MpZnS9_8sg2vtcFqM6UicB94cKZm5R1t83L3ejNSbLs,8502
+ thordata/exceptions.py,sha256=P9czrxkFhT439DxW3LE5W-koS595ObH4-mAQOfaDM18,9976
+ thordata/models.py,sha256=qtB7jE0v5zNEQfSpmOqdiacB5DgM2QfVR2PaYs-DisM,38206
+ thordata/retry.py,sha256=5kRwULl3X68Nx8PlSzr9benfyCL0nRSpVQXrwjWr45M,11456
+ thordata/serp_engines.py,sha256=iuMWncelcGOskCHXFzpcPMMTL5qfiLkazHB1uj3zpZo,5985
+ thordata_sdk-1.2.0.dist-info/licenses/LICENSE,sha256=bAxpWgQIzb-5jl3nhLdOwOJ_vlbHLtSG7yev2B7vioY,1088
+ thordata_sdk-1.2.0.dist-info/METADATA,sha256=fFDZQ8nh5_4RM8kUS7CAMylANPKROscmDusJId21ZjU,6600
+ thordata_sdk-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ thordata_sdk-1.2.0.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
+ thordata_sdk-1.2.0.dist-info/RECORD,,
thordata_sdk-1.2.0.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.45.1)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

thordata_sdk-1.2.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Thordata · AI Proxy & Web Data
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
thordata_sdk-1.2.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ thordata
thordata_sdk/__init__.py DELETED
@@ -1,9 +0,0 @@
- # Expose main clients
- from .client import ThordataClient
- from .async_client import AsyncThordataClient
- from .enums import Engine, GoogleSearchType
-
- # Version of the thordata-sdk package
- __version__ = "0.2.4"
-
- __all__ = ["ThordataClient", "AsyncThordataClient"]
thordata_sdk/async_client.py DELETED
@@ -1,247 +0,0 @@
- import aiohttp
- import logging
- import json
- import base64
- from typing import Optional, Dict, Any, Union
-
- # Reuse the logic and enums we just wrote
- from .enums import Engine
- from .parameters import normalize_serp_params
-
- logger = logging.getLogger(__name__)
-
-
- class AsyncThordataClient:
-     """
-     Thordata Asynchronous Client (built on aiohttp).
-     """
-
-     def __init__(
-         self,
-         scraper_token: str,
-         public_token: str,
-         public_key: str,
-         proxy_host: str = "gate.thordata.com",
-         proxy_port: int = 22225
-     ):
-         self.scraper_token = scraper_token
-         self.public_token = public_token
-         self.public_key = public_key
-
-         self.proxy_auth = aiohttp.BasicAuth(login=scraper_token, password='')
-         self.proxy_url = f"http://{proxy_host}:{proxy_port}"
-
-         self.base_url = "https://scraperapi.thordata.com"
-         self.universal_url = "https://universalapi.thordata.com"
-         self.api_url = "https://api.thordata.com/api/web-scraper-api"
-
-         self.SERP_API_URL = f"{self.base_url}/request"
-         self.UNIVERSAL_API_URL = f"{self.universal_url}/request"
-         self.SCRAPER_BUILDER_URL = f"{self.base_url}/builder"
-         self.SCRAPER_STATUS_URL = f"{self.api_url}/tasks-status"
-         self.SCRAPER_DOWNLOAD_URL = f"{self.api_url}/tasks-download"
-
-         self._session: Optional[aiohttp.ClientSession] = None
-
-     async def __aenter__(self):
-         if self._session is None or self._session.closed:
-             self._session = aiohttp.ClientSession(trust_env=True)
-         return self
-
-     async def __aexit__(self, exc_type, exc, tb):
-         await self.close()
-
-     async def close(self):
-         if self._session and not self._session.closed:
-             await self._session.close()
-         self._session = None
-
-     # --- Proxy (Unchanged) ---
-     async def get(self, url: str, **kwargs) -> aiohttp.ClientResponse:
-         if self._session is None:
-             raise RuntimeError("Client session not initialized.")
-         try:
-             return await self._session.get(
-                 url,
-                 proxy=self.proxy_url,
-                 proxy_auth=self.proxy_auth,
-                 **kwargs
-             )
-         except aiohttp.ClientError as e:
-             logger.error(f"Async Request failed: {e}")
-             raise
-
-     # --- SERP (Optimized) ---
-     async def serp_search(
-         self,
-         query: str,
-         engine: Union[Engine, str] = Engine.GOOGLE,
-         num: int = 10,
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Execute a real-time SERP search (Async).
-         """
-         if self._session is None:
-             raise RuntimeError("Client session not initialized.")
-
-         # 1. Convert the enum to its string value
-         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
-
-         # 2. Reuse the shared logic in parameters.py (Don't Repeat Yourself!)
-         payload = normalize_serp_params(engine_str, query, num=num, **kwargs)
-
-         headers = {
-             "Authorization": f"Bearer {self.scraper_token}",
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-
-         # 3. Send the request
-         async with self._session.post(
-             self.SERP_API_URL, data=payload, headers=headers
-         ) as response:
-             response.raise_for_status()
-             data = await response.json()
-             if isinstance(data, str):
-                 try:
-                     data = json.loads(data)
-                 except Exception:
-                     pass
-             return data
-
-     # --- Universal (Unchanged) ---
-     async def universal_scrape(
-         self,
-         url: str,
-         js_render: bool = False,
-         output_format: str = "HTML",
-         country: str = None,
-         block_resources: bool = False
-     ) -> Union[str, bytes]:
-         if self._session is None:
-             raise RuntimeError("Client session not initialized.")
-
-         headers = {
-             "Authorization": f"Bearer {self.scraper_token}",
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-
-         payload = {
-             "url": url,
-             "js_render": "True" if js_render else "False",
-             "type": output_format.lower(),
-             "block_resources": "True" if block_resources else "False"
-         }
-         if country:
-             payload["country"] = country
-
-         async with self._session.post(
-             self.UNIVERSAL_API_URL, data=payload, headers=headers
-         ) as response:
-             response.raise_for_status()
-
-             try:
-                 resp_json = await response.json()
-             except Exception:
-                 if output_format.upper() == "PNG":
-                     return await response.read()
-                 return await response.text()
-
-             if isinstance(resp_json, dict) and resp_json.get("code") \
-                     and resp_json.get("code") != 200:
-                 raise Exception(f"Universal API Error: {resp_json}")
-
-             if "html" in resp_json:
-                 return resp_json["html"]
-
-             if "png" in resp_json:
-                 png_str = resp_json["png"]
-                 if not png_str:
-                     raise Exception("API returned empty PNG data")
-
-                 # 🛠️ FIX: strip the Data URI scheme prefix
-                 if "," in png_str:
-                     png_str = png_str.split(",", 1)[1]
-
-                 png_str = png_str.replace("\n", "").replace("\r", "")
-                 missing_padding = len(png_str) % 4
-                 if missing_padding:
-                     png_str += '=' * (4 - missing_padding)
-                 return base64.b64decode(png_str)
-
-             return str(resp_json)
-
-     # --- Web Scraper (Optimized) ---
-     async def create_scraper_task(
-         self,
-         file_name: str,
-         spider_id: str,
-         spider_name: str,
-         individual_params: Dict[str, Any],
-         universal_params: Dict[str, Any] = None
-     ) -> str:
-         """
-         Create an Asynchronous Web Scraper Task.
-         """
-         if self._session is None:
-             raise RuntimeError("Client session not initialized.")
-
-         headers = {
-             "Authorization": f"Bearer {self.scraper_token}",
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-
-         # Simplified payload construction; unnecessary checks removed
-         payload = {
-             "file_name": file_name,
-             "spider_id": spider_id,
-             "spider_name": spider_name,
-             "spider_parameters": json.dumps([individual_params]),
-             "spider_errors": "true"
-         }
-         if universal_params:
-             payload["spider_universal"] = json.dumps(universal_params)
-
-         async with self._session.post(
-             self.SCRAPER_BUILDER_URL, data=payload, headers=headers
-         ) as response:
-             response.raise_for_status()
-             data = await response.json()
-             if data.get("code") != 200:
-                 raise Exception(f"Creation failed: {data}")
-             return data["data"]["task_id"]
-
-     # --- Status & Result (Unchanged) ---
-     async def get_task_status(self, task_id: str) -> str:
-         headers = {
-             "token": self.public_token,
-             "key": self.public_key,
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-         payload = {"tasks_ids": task_id}
-
-         async with self._session.post(
-             self.SCRAPER_STATUS_URL, data=payload, headers=headers
-         ) as response:
-             data = await response.json()
-             if data.get("code") == 200 and data.get("data"):
-                 for item in data["data"]:
-                     if str(item["task_id"]) == str(task_id):
-                         return item["status"]
-             return "Unknown"
-
-     async def get_task_result(self, task_id: str, file_type: str = "json") -> str:
-         headers = {
-             "token": self.public_token,
-             "key": self.public_key,
-             "Content-Type": "application/x-www-form-urlencoded"
-         }
-         payload = {"tasks_id": task_id, "type": "json"}
-
-         async with self._session.post(
-             self.SCRAPER_DOWNLOAD_URL, data=payload, headers=headers
-         ) as response:
-             data = await response.json()
-             if data.get("code") == 200:
-                 return data["data"]["download"]
-             raise Exception(f"Result Error: {data}")