thordata-sdk 0.2.4__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +151 -0
- thordata/_example_utils.py +77 -0
- thordata/_utils.py +190 -0
- thordata/async_client.py +1675 -0
- thordata/client.py +1644 -0
- thordata/demo.py +138 -0
- thordata/enums.py +384 -0
- thordata/exceptions.py +355 -0
- thordata/models.py +1197 -0
- thordata/retry.py +382 -0
- thordata/serp_engines.py +166 -0
- thordata_sdk-1.2.0.dist-info/METADATA +208 -0
- thordata_sdk-1.2.0.dist-info/RECORD +16 -0
- {thordata_sdk-0.2.4.dist-info → thordata_sdk-1.2.0.dist-info}/WHEEL +1 -1
- thordata_sdk-1.2.0.dist-info/licenses/LICENSE +21 -0
- thordata_sdk-1.2.0.dist-info/top_level.txt +1 -0
- thordata_sdk/__init__.py +0 -9
- thordata_sdk/async_client.py +0 -247
- thordata_sdk/client.py +0 -303
- thordata_sdk/enums.py +0 -20
- thordata_sdk/parameters.py +0 -41
- thordata_sdk-0.2.4.dist-info/LICENSE +0 -201
- thordata_sdk-0.2.4.dist-info/METADATA +0 -113
- thordata_sdk-0.2.4.dist-info/RECORD +0 -10
- thordata_sdk-0.2.4.dist-info/top_level.txt +0 -1
thordata/__init__.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Thordata Python SDK
|
|
3
|
+
|
|
4
|
+
Official Python client for Thordata's Proxy Network, SERP API,
|
|
5
|
+
Universal Scraping API (Web Unlocker), and Web Scraper API.
|
|
6
|
+
|
|
7
|
+
Basic Usage:
|
|
8
|
+
>>> from thordata import ThordataClient
|
|
9
|
+
>>>
|
|
10
|
+
>>> client = ThordataClient(
|
|
11
|
+
... scraper_token="your_token",
|
|
12
|
+
... public_token="your_public_token",
|
|
13
|
+
... public_key="your_public_key"
|
|
14
|
+
... )
|
|
15
|
+
>>>
|
|
16
|
+
>>> # Proxy request
|
|
17
|
+
>>> response = client.get("https://httpbin.org/ip")
|
|
18
|
+
>>>
|
|
19
|
+
>>> # SERP search
|
|
20
|
+
>>> results = client.serp_search("python tutorial", engine="google")
|
|
21
|
+
>>>
|
|
22
|
+
>>> # Universal scrape
|
|
23
|
+
>>> html = client.universal_scrape("https://example.com", js_render=True)
|
|
24
|
+
|
|
25
|
+
Async Usage:
|
|
26
|
+
>>> from thordata import AsyncThordataClient
|
|
27
|
+
>>> import asyncio
|
|
28
|
+
>>>
|
|
29
|
+
>>> async def main():
|
|
30
|
+
... async with AsyncThordataClient(
|
|
31
|
+
... scraper_token="your_token"
|
|
32
|
+
... ) as client:
|
|
33
|
+
... response = await client.get("https://httpbin.org/ip")
|
|
34
|
+
>>>
|
|
35
|
+
>>> asyncio.run(main())
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
__version__ = "1.2.0"
|
|
39
|
+
__author__ = "Thordata Developer Team"
|
|
40
|
+
__email__ = "support@thordata.com"
|
|
41
|
+
|
|
42
|
+
# Main clients
|
|
43
|
+
from .async_client import AsyncThordataClient
|
|
44
|
+
from .client import ThordataClient
|
|
45
|
+
|
|
46
|
+
# Enums
|
|
47
|
+
from .enums import (
|
|
48
|
+
BingSearchType,
|
|
49
|
+
Continent,
|
|
50
|
+
Country,
|
|
51
|
+
DataFormat,
|
|
52
|
+
Device,
|
|
53
|
+
Engine,
|
|
54
|
+
GoogleSearchType,
|
|
55
|
+
GoogleTbm,
|
|
56
|
+
OutputFormat,
|
|
57
|
+
ProxyHost,
|
|
58
|
+
ProxyPort,
|
|
59
|
+
ProxyType,
|
|
60
|
+
SessionType,
|
|
61
|
+
TaskStatus,
|
|
62
|
+
TimeRange,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
# Exceptions
|
|
66
|
+
from .exceptions import (
|
|
67
|
+
ThordataAPIError,
|
|
68
|
+
ThordataAuthError,
|
|
69
|
+
ThordataConfigError,
|
|
70
|
+
ThordataError,
|
|
71
|
+
ThordataNetworkError,
|
|
72
|
+
ThordataNotCollectedError,
|
|
73
|
+
ThordataRateLimitError,
|
|
74
|
+
ThordataServerError,
|
|
75
|
+
ThordataTimeoutError,
|
|
76
|
+
ThordataValidationError,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
# Models
|
|
80
|
+
from .models import (
|
|
81
|
+
CommonSettings,
|
|
82
|
+
ProxyConfig,
|
|
83
|
+
ProxyProduct,
|
|
84
|
+
ProxyServer,
|
|
85
|
+
ProxyUser,
|
|
86
|
+
ProxyUserList,
|
|
87
|
+
ScraperTaskConfig,
|
|
88
|
+
SerpRequest,
|
|
89
|
+
StaticISPProxy,
|
|
90
|
+
StickySession,
|
|
91
|
+
TaskStatusResponse,
|
|
92
|
+
UniversalScrapeRequest,
|
|
93
|
+
UsageStatistics,
|
|
94
|
+
VideoTaskConfig,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Retry utilities
|
|
98
|
+
from .retry import RetryConfig
|
|
99
|
+
|
|
100
|
+
# Public API
|
|
101
|
+
__all__ = [
|
|
102
|
+
# Version
|
|
103
|
+
"__version__",
|
|
104
|
+
# Clients
|
|
105
|
+
"ThordataClient",
|
|
106
|
+
"AsyncThordataClient",
|
|
107
|
+
# Enums
|
|
108
|
+
"Engine",
|
|
109
|
+
"GoogleSearchType",
|
|
110
|
+
"BingSearchType",
|
|
111
|
+
"ProxyType",
|
|
112
|
+
"SessionType",
|
|
113
|
+
"Continent",
|
|
114
|
+
"Country",
|
|
115
|
+
"OutputFormat",
|
|
116
|
+
"DataFormat",
|
|
117
|
+
"TaskStatus",
|
|
118
|
+
"Device",
|
|
119
|
+
"TimeRange",
|
|
120
|
+
"ProxyHost",
|
|
121
|
+
"ProxyPort",
|
|
122
|
+
"GoogleTbm",
|
|
123
|
+
# Models
|
|
124
|
+
"ProxyConfig",
|
|
125
|
+
"ProxyProduct",
|
|
126
|
+
"ProxyServer",
|
|
127
|
+
"ProxyUser",
|
|
128
|
+
"ProxyUserList",
|
|
129
|
+
"UsageStatistics",
|
|
130
|
+
"StaticISPProxy",
|
|
131
|
+
"StickySession",
|
|
132
|
+
"SerpRequest",
|
|
133
|
+
"UniversalScrapeRequest",
|
|
134
|
+
"ScraperTaskConfig",
|
|
135
|
+
"CommonSettings",
|
|
136
|
+
"VideoTaskConfig",
|
|
137
|
+
"TaskStatusResponse",
|
|
138
|
+
# Exceptions
|
|
139
|
+
"ThordataError",
|
|
140
|
+
"ThordataConfigError",
|
|
141
|
+
"ThordataNetworkError",
|
|
142
|
+
"ThordataTimeoutError",
|
|
143
|
+
"ThordataAPIError",
|
|
144
|
+
"ThordataAuthError",
|
|
145
|
+
"ThordataRateLimitError",
|
|
146
|
+
"ThordataServerError",
|
|
147
|
+
"ThordataValidationError",
|
|
148
|
+
"ThordataNotCollectedError",
|
|
149
|
+
# Retry
|
|
150
|
+
"RetryConfig",
|
|
151
|
+
]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from collections.abc import Iterable
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from dotenv import load_dotenv
|
|
11
|
+
except Exception: # pragma: no cover
|
|
12
|
+
load_dotenv = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def load_env() -> None:
    """Populate the environment from ``<repo root>/.env`` when python-dotenv is available."""
    if load_dotenv is not None:
        # The repo root is taken to be two directories above this file's folder.
        dotenv_file = Path(__file__).resolve().parents[2] / ".env"
        load_dotenv(dotenv_path=dotenv_file)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def env(name: str) -> str:
    """Return the environment variable *name* with surrounding whitespace removed ("" if unset)."""
    value = os.environ.get(name)
    if not value:
        return ""
    return value.strip()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def skip_if_missing(required: Iterable[str], *, tip: str | None = None) -> bool:
    """Report whether a live example must be skipped because env variables are unset.

    Returns False when every name in *required* has a non-blank value.
    Otherwise prints the missing names followed by *tip* (or a default hint)
    and returns True.
    """
    absent = []
    for key in required:
        if not env(key):
            absent.append(key)

    if absent:
        print("Skipping live example: missing env:", ", ".join(absent))
        print(tip if tip else "Tip: copy .env.example to .env and fill values, then re-run.")
        return True
    return False
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def parse_json_env(name: str, default: str = "{}") -> Any:
    """Decode the JSON held in env variable *name*, using *default* when it is unset or blank."""
    text = env(name)
    if not text:
        text = default
    return json.loads(text)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def normalize_task_parameters(raw: Any) -> dict[str, Any]:
    """Coerce decoded task parameters into one dict.

    A dict is returned unchanged; for a list, its first element is used.
    Raises ValueError for an empty list or when the selected value is not a dict.
    """
    candidate = raw
    if isinstance(candidate, list):
        if not candidate:
            raise ValueError("Task parameters JSON array must not be empty")
        # Only the first entry of an array is used.
        candidate = candidate[0]

    if isinstance(candidate, dict):
        return candidate
    raise ValueError("Task parameters must be a JSON object (or array of objects)")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def output_dir() -> Path:
    """Return (creating it if needed) the directory where examples write their output.

    ``THORDATA_OUTPUT_DIR`` takes precedence; otherwise ``examples/output``
    under the repo root is used.
    """
    repo_root = Path(__file__).resolve().parents[2]
    override = env("THORDATA_OUTPUT_DIR")
    target = Path(override) if override else repo_root / "examples" / "output"
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def write_text(filename: str, content: str) -> Path:
    """Write *content* as UTF-8 to *filename* inside the output directory and return its path."""
    destination = output_dir() / filename
    # Characters that cannot be encoded are replaced rather than raising.
    destination.write_text(content, encoding="utf-8", errors="replace")
    return destination
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def write_json(filename: str, data: Any) -> Path:
    """Serialize *data* as indented JSON into *filename* in the output directory; return its path."""
    destination = output_dir() / filename
    # Non-ASCII characters are kept as-is in the output (ensure_ascii=False).
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    destination.write_text(serialized, encoding="utf-8", errors="replace")
    return destination
|
thordata/_utils.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Internal utility functions for the Thordata Python SDK.
|
|
3
|
+
|
|
4
|
+
These are not part of the public API and may change without notice.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import platform
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def parse_json_response(data: Any) -> Any:
    """
    Decode a response value that may itself be a JSON-encoded string.

    Args:
        data: The response data to inspect.

    Returns:
        The decoded value when ``data`` is a string containing valid JSON;
        otherwise ``data`` unchanged.
    """
    if not isinstance(data, str):
        return data

    try:
        decoded = json.loads(data)
    except json.JSONDecodeError:
        # Not JSON after all: hand the raw string back.
        return data
    return decoded
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def decode_base64_image(png_str: str) -> bytes:
    """
    Decode a base64-encoded PNG image.

    Accepts either a bare base64 payload or a Data URI
    (``data:image/png;base64,...``). All whitespace is removed and missing
    ``=`` padding is restored before decoding.

    Args:
        png_str: Base64-encoded string, possibly with a Data URI prefix.

    Returns:
        Decoded image bytes.

    Raises:
        ValueError: If the input is empty, if nothing is left after removing
            the Data URI prefix and whitespace, or if the payload cannot be
            decoded as base64.
    """
    if not png_str:
        raise ValueError("Empty PNG data received")

    # "," is not part of the base64 alphabet, so when present it marks the
    # end of a Data URI header; keep only what follows the first one.
    _, separator, remainder = png_str.partition(",")
    payload = remainder if separator else png_str

    # Drop every whitespace character (tabs included) *before* computing the
    # padding, otherwise stray whitespace skews the length and yields the
    # wrong number of "=" characters.
    payload = "".join(payload.split())
    if not payload:
        # e.g. "data:image/png;base64," with nothing after the header.
        raise ValueError("Empty PNG data received")

    # Restore padding so the length is a multiple of 4.
    payload += "=" * (-len(payload) % 4)

    try:
        return base64.b64decode(payload)
    except ValueError as e:
        # binascii.Error is a ValueError subclass, so this also covers
        # malformed base64 as well as non-ASCII input.
        raise ValueError(f"Failed to decode base64 image: {e}") from e
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def build_auth_headers(token: str, mode: str = "bearer") -> dict[str, str]:
    """
    Build the headers used to authenticate an API request.

    Args:
        token: The scraper token.
        mode: ``"header_token"`` sends the token in a ``token`` header.
            ``"bearer"`` and any other value send
            ``Authorization: Bearer <token>``.

    Returns:
        Headers dict containing Content-Type plus either ``token`` or
        ``Authorization``.
    """
    result = {"Content-Type": "application/x-www-form-urlencoded"}

    if mode == "header_token":
        result["token"] = token
    else:
        # "bearer" and any unrecognised mode both use the Bearer scheme.
        result["Authorization"] = f"Bearer {token}"

    return result
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def build_builder_headers(
    scraper_token: str,
    public_token: str,
    public_key: str,
) -> dict[str, str]:
    """
    Build the headers for the Web Scraper builder API.

    Three credentials are sent together:
    - token: the public token
    - key: the public key
    - Authorization: ``Bearer <scraper_token>``

    Args:
        scraper_token: The scraper API token.
        public_token: The public API token.
        public_key: The public API key.

    Returns:
        Headers dict with the three auth headers and Content-Type.
    """
    result: dict[str, str] = {}
    result["token"] = public_token
    result["key"] = public_key
    result["Authorization"] = f"Bearer {scraper_token}"
    result["Content-Type"] = "application/x-www-form-urlencoded"
    return result
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def build_public_api_headers(public_token: str, public_key: str) -> dict[str, str]:
    """
    Build the headers for public API requests (task status, locations, etc.).

    Args:
        public_token: The public API token.
        public_key: The public API key.

    Returns:
        Headers dict with ``token``, ``key`` and Content-Type.
    """
    result: dict[str, str] = {}
    result["token"] = public_token
    result["key"] = public_key
    result["Content-Type"] = "application/x-www-form-urlencoded"
    return result
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def extract_error_message(payload: Any) -> str:
    """
    Pull a human-readable error message out of an API response payload.

    Args:
        payload: The API response payload.

    Returns:
        For a dict, the string form of the first of ``msg``, ``message``,
        ``error``, ``detail``, ``description`` that is present; otherwise
        the string form of the whole payload.
    """
    candidate_fields = ("msg", "message", "error", "detail", "description")

    if isinstance(payload, dict):
        # Fields are checked in priority order; presence alone decides.
        found = next((name for name in candidate_fields if name in payload), None)
        if found is not None:
            return str(payload[found])

    return str(payload)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def build_user_agent(sdk_version: str, http_client: str) -> str:
    """
    Build the User-Agent string sent by the SDK.

    Format:
    thordata-python-sdk/{version} python/{py_ver} ({system}/{release}; {machine}; {http_client})

    Args:
        sdk_version: Version of this SDK.
        http_client: Identifier of the HTTP client in use.

    Returns:
        The assembled User-Agent string. Platform fields that come back
        empty are reported as "unknown".
    """
    # ";" separates the fields inside the parentheses, so it is removed
    # from the system name.
    os_name = (platform.system() or "unknown").replace(";", "").strip()
    os_release = platform.release() or "unknown"
    arch = platform.machine() or "unknown"
    runtime = platform.python_version()

    details = f"{os_name}/{os_release}; {arch}; {http_client}"
    return f"thordata-python-sdk/{sdk_version} python/{runtime} ({details})"
|