thordata-sdk 0.2.4__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata/__init__.py ADDED
@@ -0,0 +1,151 @@
1
+ """
2
+ Thordata Python SDK
3
+
4
+ Official Python client for Thordata's Proxy Network, SERP API,
5
+ Universal Scraping API (Web Unlocker), and Web Scraper API.
6
+
7
+ Basic Usage:
8
+ >>> from thordata import ThordataClient
9
+ >>>
10
+ >>> client = ThordataClient(
11
+ ... scraper_token="your_token",
12
+ ... public_token="your_public_token",
13
+ ... public_key="your_public_key"
14
+ ... )
15
+ >>>
16
+ >>> # Proxy request
17
+ >>> response = client.get("https://httpbin.org/ip")
18
+ >>>
19
+ >>> # SERP search
20
+ >>> results = client.serp_search("python tutorial", engine="google")
21
+ >>>
22
+ >>> # Universal scrape
23
+ >>> html = client.universal_scrape("https://example.com", js_render=True)
24
+
25
+ Async Usage:
26
+ >>> from thordata import AsyncThordataClient
27
+ >>> import asyncio
28
+ >>>
29
+ >>> async def main():
30
+ ... async with AsyncThordataClient(
31
+ ... scraper_token="your_token"
32
+ ... ) as client:
33
+ ... response = await client.get("https://httpbin.org/ip")
34
+ >>>
35
+ >>> asyncio.run(main())
36
+ """
37
+
38
# Package metadata. ``__version__`` is also listed in ``__all__`` below.
__version__ = "1.2.0"
__author__ = "Thordata Developer Team"
__email__ = "support@thordata.com"
41
+
42
+ # Main clients
43
+ from .async_client import AsyncThordataClient
44
+ from .client import ThordataClient
45
+
46
+ # Enums
47
+ from .enums import (
48
+ BingSearchType,
49
+ Continent,
50
+ Country,
51
+ DataFormat,
52
+ Device,
53
+ Engine,
54
+ GoogleSearchType,
55
+ GoogleTbm,
56
+ OutputFormat,
57
+ ProxyHost,
58
+ ProxyPort,
59
+ ProxyType,
60
+ SessionType,
61
+ TaskStatus,
62
+ TimeRange,
63
+ )
64
+
65
+ # Exceptions
66
+ from .exceptions import (
67
+ ThordataAPIError,
68
+ ThordataAuthError,
69
+ ThordataConfigError,
70
+ ThordataError,
71
+ ThordataNetworkError,
72
+ ThordataNotCollectedError,
73
+ ThordataRateLimitError,
74
+ ThordataServerError,
75
+ ThordataTimeoutError,
76
+ ThordataValidationError,
77
+ )
78
+
79
+ # Models
80
+ from .models import (
81
+ CommonSettings,
82
+ ProxyConfig,
83
+ ProxyProduct,
84
+ ProxyServer,
85
+ ProxyUser,
86
+ ProxyUserList,
87
+ ScraperTaskConfig,
88
+ SerpRequest,
89
+ StaticISPProxy,
90
+ StickySession,
91
+ TaskStatusResponse,
92
+ UniversalScrapeRequest,
93
+ UsageStatistics,
94
+ VideoTaskConfig,
95
+ )
96
+
97
+ # Retry utilities
98
+ from .retry import RetryConfig
99
+
100
+ # Public API
101
# Names exported by ``from thordata import *``. Every entry is either defined
# in this module (``__version__``) or imported above from a submodule.
__all__ = [
    # Version
    "__version__",
    # Clients (from .client / .async_client)
    "ThordataClient",
    "AsyncThordataClient",
    # Enums (from .enums)
    "Engine",
    "GoogleSearchType",
    "BingSearchType",
    "ProxyType",
    "SessionType",
    "Continent",
    "Country",
    "OutputFormat",
    "DataFormat",
    "TaskStatus",
    "Device",
    "TimeRange",
    "ProxyHost",
    "ProxyPort",
    "GoogleTbm",
    # Models (from .models)
    "ProxyConfig",
    "ProxyProduct",
    "ProxyServer",
    "ProxyUser",
    "ProxyUserList",
    "UsageStatistics",
    "StaticISPProxy",
    "StickySession",
    "SerpRequest",
    "UniversalScrapeRequest",
    "ScraperTaskConfig",
    "CommonSettings",
    "VideoTaskConfig",
    "TaskStatusResponse",
    # Exceptions (from .exceptions)
    "ThordataError",
    "ThordataConfigError",
    "ThordataNetworkError",
    "ThordataTimeoutError",
    "ThordataAPIError",
    "ThordataAuthError",
    "ThordataRateLimitError",
    "ThordataServerError",
    "ThordataValidationError",
    "ThordataNotCollectedError",
    # Retry (from .retry)
    "RetryConfig",
]
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from collections.abc import Iterable
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ try:
10
+ from dotenv import load_dotenv
11
+ except Exception: # pragma: no cover
12
+ load_dotenv = None
13
+
14
+
15
def load_env() -> None:
    """Load ``.env`` from the repo root when python-dotenv is available.

    Does nothing if the optional ``dotenv`` import failed, in which case the
    module-level ``load_dotenv`` is ``None``. The repo root is taken to be
    ``parents[2]`` of this file's resolved path.
    """
    if load_dotenv is not None:
        dotenv_file = Path(__file__).resolve().parents[2] / ".env"
        load_dotenv(dotenv_path=dotenv_file)
21
+
22
+
23
def env(name: str) -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    An unset or empty variable yields the empty string.
    """
    value = os.getenv(name)
    if not value:
        return ""
    return value.strip()
25
+
26
+
27
def skip_if_missing(required: Iterable[str], *, tip: str | None = None) -> bool:
    """Report whether a live example should be skipped for lack of configuration.

    Args:
        required: Names of environment variables that must be non-blank.
        tip: Optional hint printed in place of the default ``.env`` hint.

    Returns:
        ``True`` (after printing the missing names and a hint) when at least
        one variable is unset or blank, ``False`` otherwise.
    """
    absent = [key for key in required if not env(key)]
    if absent:
        print("Skipping live example: missing env:", ", ".join(absent))
        hint = tip if tip else "Tip: copy .env.example to .env and fill values, then re-run."
        print(hint)
        return True
    return False
37
+
38
+
39
def parse_json_env(name: str, default: str = "{}") -> Any:
    """Decode the JSON text stored in environment variable *name*.

    When the variable is unset or blank, *default* is decoded instead.
    Errors raised by ``json.loads`` are not caught here.
    """
    text = env(name)
    if not text:
        text = default
    return json.loads(text)
42
+
43
+
44
def normalize_task_parameters(raw: Any) -> dict[str, Any]:
    """Coerce decoded task parameters into a single dict for create_scraper_task(parameters=...).

    Args:
        raw: Either a dict, or a list whose first element is a dict.

    Returns:
        The dict itself, or the first element of the list. Later list
        elements are ignored.

    Raises:
        ValueError: If the list is empty, or the selected value is not a dict.
    """
    candidate = raw
    if isinstance(candidate, list):
        if not candidate:
            raise ValueError("Task parameters JSON array must not be empty")
        candidate = candidate[0]
    if isinstance(candidate, dict):
        return candidate
    raise ValueError("Task parameters must be a JSON object (or array of objects)")
53
+
54
+
55
def output_dir() -> Path:
    """Return the directory used for example output, creating it if necessary.

    ``THORDATA_OUTPUT_DIR`` takes precedence when set; otherwise the default
    is ``examples/output`` under the repo root (``parents[2]`` of this file).
    """
    fallback = Path(__file__).resolve().parents[2] / "examples" / "output"
    override = env("THORDATA_OUTPUT_DIR")
    target = Path(override) if override else fallback
    # Create intermediate directories too; an existing directory is fine.
    target.mkdir(parents=True, exist_ok=True)
    return target
62
+
63
+
64
def write_text(filename: str, content: str) -> Path:
    """Write *content* as UTF-8 to *filename* inside ``output_dir()``.

    Characters that cannot be encoded are replaced rather than raising.

    Returns:
        The path of the written file.
    """
    destination = output_dir().joinpath(filename)
    destination.write_text(content, encoding="utf-8", errors="replace")
    return destination
68
+
69
+
70
def write_json(filename: str, data: Any) -> Path:
    """Serialize *data* as indented JSON into *filename* inside ``output_dir()``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) in UTF-8.

    Returns:
        The path of the written file.
    """
    destination = output_dir().joinpath(filename)
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    destination.write_text(serialized, encoding="utf-8", errors="replace")
    return destination
thordata/_utils.py ADDED
@@ -0,0 +1,190 @@
1
+ """
2
+ Internal utility functions for the Thordata Python SDK.
3
+
4
+ These are not part of the public API and may change without notice.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import base64
10
+ import json
11
+ import logging
12
+ import platform
13
+ from typing import Any
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def parse_json_response(data: Any) -> Any:
    """
    Decode a response value that may itself be a JSON-encoded string.

    Some API endpoints return JSON as a string inside JSON.

    Args:
        data: The response data to parse.

    Returns:
        The decoded value when ``data`` is a string holding valid JSON;
        otherwise ``data`` unchanged (non-string input, or a string that is
        not valid JSON).
    """
    if not isinstance(data, str):
        return data

    try:
        decoded = json.loads(data)
    except json.JSONDecodeError:
        # Not JSON after all: hand the original string back.
        return data
    return decoded
36
+
37
+
38
def decode_base64_image(png_str: str) -> bytes:
    """
    Decode a base64-encoded PNG image.

    Handles Data URI scheme (data:image/png;base64,...), ignores embedded
    whitespace, and restores missing ``=`` padding.

    Args:
        png_str: Base64-encoded string, possibly with Data URI prefix.

    Returns:
        Decoded PNG bytes.

    Raises:
        ValueError: If the string is empty (including when nothing is left
            after removing the Data URI prefix and whitespace) or cannot be
            decoded.
    """
    if not png_str:
        raise ValueError("Empty PNG data received")

    # Remove Data URI scheme if present ("," is not part of the base64 alphabet)
    if "," in png_str:
        png_str = png_str.split(",", 1)[1]

    # Drop every whitespace character (newlines, spaces, tabs, ...) so the
    # length used for the padding computation counts payload characters only.
    png_str = "".join(png_str.split())

    # A bare prefix or whitespace-only input carries no image data; without
    # this check it would silently decode to b"".
    if not png_str:
        raise ValueError("Empty PNG data received")

    # Fix Base64 padding
    remainder = len(png_str) % 4
    if remainder:
        png_str += "=" * (4 - remainder)

    try:
        return base64.b64decode(png_str)
    except ValueError as e:  # binascii.Error is a ValueError subclass
        raise ValueError(f"Failed to decode base64 image: {e}") from e
72
+
73
+
74
def build_auth_headers(token: str, mode: str = "bearer") -> dict[str, str]:
    """
    Build authorization headers for API requests.

    Two header layouts are produced:
        - ``header_token``: ``token: <token>``
        - ``bearer`` (and any other value of ``mode``):
          ``Authorization: Bearer <token>``

    Args:
        token: The scraper token.
        mode: Authentication mode ("bearer" or "header_token"). Unrecognised
            values are treated as "bearer" for compatibility.

    Returns:
        Headers dict with Content-Type plus either Authorization or token.
    """
    content_type = {"Content-Type": "application/x-www-form-urlencoded"}

    if mode == "header_token":
        return {**content_type, "token": token}

    # "bearer" and the compatibility fallback share the same header.
    return {**content_type, "Authorization": f"Bearer {token}"}
102
+
103
+
104
def build_builder_headers(
    scraper_token: str,
    public_token: str,
    public_key: str,
) -> dict[str, str]:
    """
    Build headers for Web Scraper builder API.

    Three auth headers are sent together:
        - token: the public token
        - key: the public key
        - Authorization: ``Bearer`` followed by the scraper token

    Args:
        scraper_token: The scraper API token.
        public_token: The public API token.
        public_key: The public API key.

    Returns:
        Headers dict with the three auth headers and a form-encoded
        Content-Type.
    """
    headers: dict[str, str] = {}
    headers["token"] = public_token
    headers["key"] = public_key
    headers["Authorization"] = f"Bearer {scraper_token}"
    headers["Content-Type"] = "application/x-www-form-urlencoded"
    return headers
131
+
132
+
133
def build_public_api_headers(public_token: str, public_key: str) -> dict[str, str]:
    """
    Build headers for public API requests (task status, locations, etc.)

    Args:
        public_token: The public API token, sent as the ``token`` header.
        public_key: The public API key, sent as the ``key`` header.

    Returns:
        Headers dict with token, key, and a form-encoded Content-Type.
    """
    headers: dict[str, str] = {}
    headers["token"] = public_token
    headers["key"] = public_key
    headers["Content-Type"] = "application/x-www-form-urlencoded"
    return headers
149
+
150
+
151
def extract_error_message(payload: Any) -> str:
    """
    Extract a human-readable error message from an API response.

    For a dict payload the first key present among ``msg``, ``message``,
    ``error``, ``detail`` and ``description`` (checked in that order) is
    used. Anything else is rendered with ``str()``.

    Args:
        payload: The API response payload.

    Returns:
        Error message string.
    """
    if isinstance(payload, dict):
        candidates = ("msg", "message", "error", "detail", "description")
        found = next((name for name in candidates if name in payload), None)
        if found is not None:
            return str(payload[found])

    # Non-dict payloads, and dicts without a known field, are stringified whole.
    return str(payload)
171
+
172
+
173
def build_user_agent(sdk_version: str, http_client: str) -> str:
    """
    Build a standardized User-Agent for the SDK.

    Format:
        thordata-python-sdk/{version} python/{py_ver} ({system}/{release}; {machine}; {http_client})

    Args:
        sdk_version: SDK version placed after ``thordata-python-sdk/``.
        http_client: Label for the HTTP library, placed last inside the
            parentheses.

    Returns:
        The User-Agent string. ``;``, carriage returns and newlines are
        removed from the platform fields and from ``http_client``.
    """

    def _clean(value: str) -> str:
        # ";" separates the fields inside the parentheses, and CR/LF must
        # not end up inside a header value.
        return value.replace(";", "").replace("\r", "").replace("\n", "").strip()

    py_ver = platform.python_version()
    # Fall back to "unknown" when the platform reports nothing usable.
    system = _clean(platform.system()) or "unknown"
    release = _clean(platform.release()) or "unknown"
    machine = _clean(platform.machine()) or "unknown"
    client = _clean(http_client)

    return (
        f"thordata-python-sdk/{sdk_version} "
        f"python/{py_ver} "
        f"({system}/{release}; {machine}; {client})"
    )