thordata-sdk 1.0.1__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. thordata_sdk-1.2.0/PKG-INFO +208 -0
  2. thordata_sdk-1.2.0/README.md +164 -0
  3. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/pyproject.toml +8 -3
  4. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/__init__.py +1 -1
  5. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/_example_utils.py +3 -2
  6. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/_utils.py +4 -4
  7. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/async_client.py +106 -86
  8. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/client.py +782 -118
  9. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/demo.py +1 -3
  10. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/exceptions.py +12 -12
  11. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/models.py +102 -89
  12. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/retry.py +13 -13
  13. thordata_sdk-1.2.0/src/thordata/serp_engines.py +166 -0
  14. thordata_sdk-1.2.0/src/thordata_sdk.egg-info/PKG-INFO +208 -0
  15. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata_sdk.egg-info/SOURCES.txt +2 -0
  16. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata_sdk.egg-info/requires.txt +1 -0
  17. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_async_client.py +14 -2
  18. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_async_client_errors.py +5 -9
  19. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_client.py +9 -2
  20. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_client_errors.py +24 -22
  21. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_examples.py +0 -1
  22. thordata_sdk-1.2.0/tests/test_integration_proxy_protocols.py +113 -0
  23. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_task_status_and_wait.py +1 -5
  24. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_user_agent.py +0 -2
  25. thordata_sdk-1.0.1/PKG-INFO +0 -208
  26. thordata_sdk-1.0.1/README.md +0 -165
  27. thordata_sdk-1.0.1/src/thordata_sdk.egg-info/PKG-INFO +0 -208
  28. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/LICENSE +0 -0
  29. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/setup.cfg +0 -0
  30. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata/enums.py +0 -0
  31. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata_sdk.egg-info/dependency_links.txt +0 -0
  32. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/src/thordata_sdk.egg-info/top_level.txt +0 -0
  33. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_enums.py +0 -0
  34. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_exceptions.py +0 -0
  35. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_models.py +0 -0
  36. {thordata_sdk-1.0.1 → thordata_sdk-1.2.0}/tests/test_spec_parity.py +0 -0
@@ -0,0 +1,208 @@
1
+ Metadata-Version: 2.4
2
+ Name: thordata-sdk
3
+ Version: 1.2.0
4
+ Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
5
+ Author-email: Thordata Developer Team <support@thordata.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://www.thordata.com
8
+ Project-URL: Documentation, https://github.com/Thordata/thordata-python-sdk#readme
9
+ Project-URL: Source, https://github.com/Thordata/thordata-python-sdk
10
+ Project-URL: Tracker, https://github.com/Thordata/thordata-python-sdk/issues
11
+ Project-URL: Changelog, https://github.com/Thordata/thordata-python-sdk/blob/main/CHANGELOG.md
12
+ Keywords: web scraping,proxy,residential proxy,datacenter proxy,ai,llm,data-mining,serp,thordata,web scraper,anti-bot bypass
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
16
+ Classifier: Topic :: Internet :: WWW/HTTP
17
+ Classifier: Topic :: Internet :: Proxy Servers
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: License :: OSI Approved :: MIT License
24
+ Classifier: Operating System :: OS Independent
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.9
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: requests>=2.25.0
30
+ Requires-Dist: aiohttp>=3.9.0
31
+ Requires-Dist: PySocks>=1.7.1
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
34
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
35
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
36
+ Requires-Dist: pytest-httpserver>=1.0.0; extra == "dev"
37
+ Requires-Dist: python-dotenv>=1.0.0; extra == "dev"
38
+ Requires-Dist: black>=23.0.0; extra == "dev"
39
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
40
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
41
+ Requires-Dist: types-requests>=2.28.0; extra == "dev"
42
+ Requires-Dist: aioresponses>=0.7.6; extra == "dev"
43
+ Dynamic: license-file
44
+
45
+ # Thordata Python SDK
46
+
47
+ <div align="center">
48
+
49
+ <img src="https://img.shields.io/badge/Thordata-AI%20Infrastructure-blue?style=for-the-badge" alt="Thordata Logo">
50
+
51
+ **The Official Python Client for Thordata APIs**
52
+
53
+ *Proxy Network • SERP API • Web Unlocker • Web Scraper API*
54
+
55
+ [![PyPI version](https://img.shields.io/pypi/v/thordata-sdk.svg?style=flat-square)](https://pypi.org/project/thordata-sdk/)
56
+ [![Python Versions](https://img.shields.io/pypi/pyversions/thordata-sdk.svg?style=flat-square)](https://pypi.org/project/thordata-sdk/)
57
+ [![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE)
58
+ [![CI Status](https://img.shields.io/github/actions/workflow/status/Thordata/thordata-python-sdk/ci.yml?branch=main&style=flat-square)](https://github.com/Thordata/thordata-python-sdk/actions)
59
+
60
+ </div>
61
+
62
+ ---
63
+
64
+ ## 📖 Introduction
65
+
66
+ This SDK provides a robust, high-performance interface to Thordata's AI data infrastructure. It is designed for high-concurrency scraping, reliable proxy tunneling, and seamless data extraction.
67
+
68
+ **Key Features:**
69
+ * **🚀 Production Ready:** Built on `urllib3` connection pooling for low-latency proxy requests.
70
+ * **⚡ Async Support:** Native `aiohttp` client for high-concurrency SERP/Universal scraping.
71
+ * **🛡️ Robust:** Handles TLS-in-TLS tunneling, retries, and error parsing automatically.
72
+ * **✨ Developer Experience:** Fully typed (`mypy` compatible) with intuitive IDE autocomplete.
73
+ * **🧩 Lazy Validation:** Only validate credentials for the features you actually use.
74
+
75
+ ---
76
+
77
+ ## 📦 Installation
78
+
79
+ ```bash
80
+ pip install thordata-sdk
81
+ ```
82
+
83
+ ---
84
+
85
+ ## 🔐 Configuration
86
+
87
+ Set environment variables to avoid hardcoding credentials. You only need to set the variables for the features you use.
88
+
89
+ ```bash
90
+ # [Required for SERP & Web Unlocker]
91
+ export THORDATA_SCRAPER_TOKEN="your_token_here"
92
+
93
+ # [Required for Proxy Network]
94
+ export THORDATA_RESIDENTIAL_USERNAME="your_username"
95
+ export THORDATA_RESIDENTIAL_PASSWORD="your_password"
96
+ export THORDATA_PROXY_HOST="vpnXXXX.pr.thordata.net"
97
+
98
+ # [Required for Task Management]
99
+ export THORDATA_PUBLIC_TOKEN="public_token"
100
+ export THORDATA_PUBLIC_KEY="public_key"
101
+ ```
102
+
103
+ ---
104
+
105
+ ## 🚀 Quick Start
106
+
107
+ ### 1. SERP Search (Google/Bing/Yandex)
108
+
109
+ ```python
110
+ from thordata import ThordataClient, Engine
111
+
112
+ client = ThordataClient() # Loads THORDATA_SCRAPER_TOKEN from env
113
+
114
+ # Simple Search
115
+ print("Searching...")
116
+ results = client.serp_search("latest AI trends", engine=Engine.GOOGLE_NEWS)
117
+
118
+ for news in results.get("news_results", [])[:3]:
119
+ print(f"- {news['title']} ({news['source']})")
120
+ ```
121
+
122
+ ### 2. Universal Scrape (Web Unlocker)
123
+
124
+ Bypass Cloudflare/Akamai and render JavaScript automatically.
125
+
126
+ ```python
127
+ html = client.universal_scrape(
128
+ url="https://example.com/protected-page",
129
+ js_render=True,
130
+ wait_for=".content-loaded",
131
+ country="us"
132
+ )
133
+ print(f"Scraped {len(html)} bytes")
134
+ ```
135
+
136
+ ### 3. High-Performance Proxy
137
+
138
+ Use Thordata's residential IPs with automatic connection pooling.
139
+
140
+ ```python
141
+ from thordata import ProxyConfig, ProxyProduct
142
+
143
+ # Config is optional if env vars are set, but allows granular control
144
+ proxy = ProxyConfig(
145
+ product=ProxyProduct.RESIDENTIAL,
146
+ country="jp",
147
+ city="tokyo",
148
+ session_id="session-001",
149
+ session_duration=10 # Sticky IP for 10 mins
150
+ )
151
+
152
+ # Use the client to make requests (Reuses TCP connections)
153
+ response = client.get("https://httpbin.org/ip", proxy_config=proxy)
154
+ print(response.json())
155
+ ```
156
+
157
+ ---
158
+
159
+ ## ⚙️ Advanced Usage
160
+
161
+ ### Async Client (High Concurrency)
162
+
163
+ For building AI agents or high-throughput spiders.
164
+
165
+ ```python
166
+ import asyncio
167
+ from thordata import AsyncThordataClient
168
+
169
+ async def main():
170
+ async with AsyncThordataClient() as client:
171
+ # Fire off multiple requests in parallel
172
+ tasks = [
173
+ client.serp_search(f"query {i}")
174
+ for i in range(5)
175
+ ]
176
+ results = await asyncio.gather(*tasks)
177
+ print(f"Completed {len(results)} searches")
178
+
179
+ asyncio.run(main())
180
+ ```
181
+
182
+ ### Web Scraper API (Task Management)
183
+
184
+ Create and manage large-scale scraping tasks asynchronously.
185
+
186
+ ```python
187
+ # 1. Create a task
188
+ task_id = client.create_scraper_task(
189
+ file_name="daily_scrape",
190
+ spider_id="universal",
191
+ spider_name="universal",
192
+ parameters={"url": "https://example.com"}
193
+ )
194
+
195
+ # 2. Wait for completion (Polling)
196
+ status = client.wait_for_task(task_id)
197
+
198
+ # 3. Get results
199
+ if status == "ready":
200
+ url = client.get_task_result(task_id)
201
+ print(f"Download Data: {url}")
202
+ ```
203
+
204
+ ---
205
+
206
+ ## 📄 License
207
+
208
+ MIT License. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,164 @@
1
+ # Thordata Python SDK
2
+
3
+ <div align="center">
4
+
5
+ <img src="https://img.shields.io/badge/Thordata-AI%20Infrastructure-blue?style=for-the-badge" alt="Thordata Logo">
6
+
7
+ **The Official Python Client for Thordata APIs**
8
+
9
+ *Proxy Network • SERP API • Web Unlocker • Web Scraper API*
10
+
11
+ [![PyPI version](https://img.shields.io/pypi/v/thordata-sdk.svg?style=flat-square)](https://pypi.org/project/thordata-sdk/)
12
+ [![Python Versions](https://img.shields.io/pypi/pyversions/thordata-sdk.svg?style=flat-square)](https://pypi.org/project/thordata-sdk/)
13
+ [![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE)
14
+ [![CI Status](https://img.shields.io/github/actions/workflow/status/Thordata/thordata-python-sdk/ci.yml?branch=main&style=flat-square)](https://github.com/Thordata/thordata-python-sdk/actions)
15
+
16
+ </div>
17
+
18
+ ---
19
+
20
+ ## 📖 Introduction
21
+
22
+ This SDK provides a robust, high-performance interface to Thordata's AI data infrastructure. It is designed for high-concurrency scraping, reliable proxy tunneling, and seamless data extraction.
23
+
24
+ **Key Features:**
25
+ * **🚀 Production Ready:** Built on `urllib3` connection pooling for low-latency proxy requests.
26
+ * **⚡ Async Support:** Native `aiohttp` client for high-concurrency SERP/Universal scraping.
27
+ * **🛡️ Robust:** Handles TLS-in-TLS tunneling, retries, and error parsing automatically.
28
+ * **✨ Developer Experience:** Fully typed (`mypy` compatible) with intuitive IDE autocomplete.
29
+ * **🧩 Lazy Validation:** Only validate credentials for the features you actually use.
30
+
31
+ ---
32
+
33
+ ## 📦 Installation
34
+
35
+ ```bash
36
+ pip install thordata-sdk
37
+ ```
38
+
39
+ ---
40
+
41
+ ## 🔐 Configuration
42
+
43
+ Set environment variables to avoid hardcoding credentials. You only need to set the variables for the features you use.
44
+
45
+ ```bash
46
+ # [Required for SERP & Web Unlocker]
47
+ export THORDATA_SCRAPER_TOKEN="your_token_here"
48
+
49
+ # [Required for Proxy Network]
50
+ export THORDATA_RESIDENTIAL_USERNAME="your_username"
51
+ export THORDATA_RESIDENTIAL_PASSWORD="your_password"
52
+ export THORDATA_PROXY_HOST="vpnXXXX.pr.thordata.net"
53
+
54
+ # [Required for Task Management]
55
+ export THORDATA_PUBLIC_TOKEN="public_token"
56
+ export THORDATA_PUBLIC_KEY="public_key"
57
+ ```
58
+
59
+ ---
60
+
61
+ ## 🚀 Quick Start
62
+
63
+ ### 1. SERP Search (Google/Bing/Yandex)
64
+
65
+ ```python
66
+ from thordata import ThordataClient, Engine
67
+
68
+ client = ThordataClient() # Loads THORDATA_SCRAPER_TOKEN from env
69
+
70
+ # Simple Search
71
+ print("Searching...")
72
+ results = client.serp_search("latest AI trends", engine=Engine.GOOGLE_NEWS)
73
+
74
+ for news in results.get("news_results", [])[:3]:
75
+ print(f"- {news['title']} ({news['source']})")
76
+ ```
77
+
78
+ ### 2. Universal Scrape (Web Unlocker)
79
+
80
+ Bypass Cloudflare/Akamai and render JavaScript automatically.
81
+
82
+ ```python
83
+ html = client.universal_scrape(
84
+ url="https://example.com/protected-page",
85
+ js_render=True,
86
+ wait_for=".content-loaded",
87
+ country="us"
88
+ )
89
+ print(f"Scraped {len(html)} bytes")
90
+ ```
91
+
92
+ ### 3. High-Performance Proxy
93
+
94
+ Use Thordata's residential IPs with automatic connection pooling.
95
+
96
+ ```python
97
+ from thordata import ProxyConfig, ProxyProduct
98
+
99
+ # Config is optional if env vars are set, but allows granular control
100
+ proxy = ProxyConfig(
101
+ product=ProxyProduct.RESIDENTIAL,
102
+ country="jp",
103
+ city="tokyo",
104
+ session_id="session-001",
105
+ session_duration=10 # Sticky IP for 10 mins
106
+ )
107
+
108
+ # Use the client to make requests (Reuses TCP connections)
109
+ response = client.get("https://httpbin.org/ip", proxy_config=proxy)
110
+ print(response.json())
111
+ ```
112
+
113
+ ---
114
+
115
+ ## ⚙️ Advanced Usage
116
+
117
+ ### Async Client (High Concurrency)
118
+
119
+ For building AI agents or high-throughput spiders.
120
+
121
+ ```python
122
+ import asyncio
123
+ from thordata import AsyncThordataClient
124
+
125
+ async def main():
126
+ async with AsyncThordataClient() as client:
127
+ # Fire off multiple requests in parallel
128
+ tasks = [
129
+ client.serp_search(f"query {i}")
130
+ for i in range(5)
131
+ ]
132
+ results = await asyncio.gather(*tasks)
133
+ print(f"Completed {len(results)} searches")
134
+
135
+ asyncio.run(main())
136
+ ```
137
+
138
+ ### Web Scraper API (Task Management)
139
+
140
+ Create and manage large-scale scraping tasks asynchronously.
141
+
142
+ ```python
143
+ # 1. Create a task
144
+ task_id = client.create_scraper_task(
145
+ file_name="daily_scrape",
146
+ spider_id="universal",
147
+ spider_name="universal",
148
+ parameters={"url": "https://example.com"}
149
+ )
150
+
151
+ # 2. Wait for completion (Polling)
152
+ status = client.wait_for_task(task_id)
153
+
154
+ # 3. Get results
155
+ if status == "ready":
156
+ url = client.get_task_result(task_id)
157
+ print(f"Download Data: {url}")
158
+ ```
159
+
160
+ ---
161
+
162
+ ## 📄 License
163
+
164
+ MIT License. See [LICENSE](LICENSE) for details.
@@ -1,10 +1,11 @@
1
+ # thordata-python-sdk/pyproject.toml
1
2
  [build-system]
2
3
  requires = ["setuptools>=61.0", "wheel"]
3
4
  build-backend = "setuptools.build_meta"
4
5
 
5
6
  [project]
6
7
  name = "thordata-sdk"
7
- version = "1.0.1"
8
+ version = "1.2.0"
8
9
  description = "The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network."
9
10
  readme = "README.md"
10
11
  requires-python = ">=3.9"
@@ -44,6 +45,7 @@ classifiers = [
44
45
  dependencies = [
45
46
  "requests>=2.25.0",
46
47
  "aiohttp>=3.9.0",
48
+ "PySocks>=1.7.1",
47
49
  ]
48
50
 
49
51
  [project.optional-dependencies]
@@ -82,6 +84,7 @@ include = '\.pyi?$'
82
84
  [tool.ruff]
83
85
  line-length = 88
84
86
  target-version = "py39"
87
+ extend-exclude = ["sdk-spec"]
85
88
 
86
89
  [tool.ruff.lint]
87
90
  select = [
@@ -90,11 +93,12 @@ select = [
90
93
  "F", # pyflakes
91
94
  "I", # isort (import sorting)
92
95
  "B", # flake8-bugbear
96
+ "UP", # pyupgrade
97
+ "SIM", # flake8-simplify
93
98
  ]
94
99
  ignore = [
95
100
  "E501", # line too long (handled by black)
96
101
  "E731", # do not assign a lambda expression
97
- "F401", # imported but unused (we have some intentional re-exports)
98
102
  ]
99
103
 
100
104
  [tool.ruff.lint.isort]
@@ -121,6 +125,7 @@ ignore_missing_imports = true
121
125
  testpaths = ["tests"]
122
126
  asyncio_mode = "auto"
123
127
  addopts = "-v --cov=thordata --cov-report=term-missing"
128
+ markers = ["integration: live tests that require real credentials"]
124
129
 
125
130
  # Coverage setup
126
131
  [tool.coverage.run]
@@ -133,4 +138,4 @@ exclude_lines = [
133
138
  "def __repr__",
134
139
  "raise NotImplementedError",
135
140
  "if TYPE_CHECKING:",
136
- ]
141
+ ]
@@ -35,7 +35,7 @@ Async Usage:
35
35
  >>> asyncio.run(main())
36
36
  """
37
37
 
38
- __version__ = "1.0.1"
38
+ __version__ = "1.2.0"
39
39
  __author__ = "Thordata Developer Team"
40
40
  __email__ = "support@thordata.com"
41
41
 
@@ -2,8 +2,9 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  import os
5
+ from collections.abc import Iterable
5
6
  from pathlib import Path
6
- from typing import Any, Iterable, Optional
7
+ from typing import Any
7
8
 
8
9
  try:
9
10
  from dotenv import load_dotenv
@@ -23,7 +24,7 @@ def env(name: str) -> str:
23
24
  return (os.getenv(name) or "").strip()
24
25
 
25
26
 
26
- def skip_if_missing(required: Iterable[str], *, tip: Optional[str] = None) -> bool:
27
+ def skip_if_missing(required: Iterable[str], *, tip: str | None = None) -> bool:
27
28
  missing = [k for k in required if not env(k)]
28
29
  if not missing:
29
30
  return False
@@ -10,7 +10,7 @@ import base64
10
10
  import json
11
11
  import logging
12
12
  import platform
13
- from typing import Any, Dict
13
+ from typing import Any
14
14
 
15
15
  logger = logging.getLogger(__name__)
16
16
 
@@ -71,7 +71,7 @@ def decode_base64_image(png_str: str) -> bytes:
71
71
  raise ValueError(f"Failed to decode base64 image: {e}") from e
72
72
 
73
73
 
74
- def build_auth_headers(token: str, mode: str = "bearer") -> Dict[str, str]:
74
+ def build_auth_headers(token: str, mode: str = "bearer") -> dict[str, str]:
75
75
  """
76
76
  Build authorization headers for API requests.
77
77
 
@@ -105,7 +105,7 @@ def build_builder_headers(
105
105
  scraper_token: str,
106
106
  public_token: str,
107
107
  public_key: str,
108
- ) -> Dict[str, str]:
108
+ ) -> dict[str, str]:
109
109
  """
110
110
  Build headers for Web Scraper builder API.
111
111
 
@@ -130,7 +130,7 @@ def build_builder_headers(
130
130
  }
131
131
 
132
132
 
133
- def build_public_api_headers(public_token: str, public_key: str) -> Dict[str, str]:
133
+ def build_public_api_headers(public_token: str, public_key: str) -> dict[str, str]:
134
134
  """
135
135
  Build headers for public API requests (task status, locations, etc.)
136
136