thordata-sdk 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +137 -0
- thordata/_utils.py +144 -0
- thordata/async_client.py +815 -0
- thordata/client.py +1040 -0
- thordata/demo.py +140 -0
- thordata/enums.py +315 -0
- thordata/exceptions.py +344 -0
- thordata/models.py +840 -0
- thordata/parameters.py +53 -0
- thordata/retry.py +380 -0
- thordata_sdk-0.6.0.dist-info/METADATA +1053 -0
- thordata_sdk-0.6.0.dist-info/RECORD +15 -0
- thordata_sdk-0.6.0.dist-info/WHEEL +5 -0
- thordata_sdk-0.6.0.dist-info/licenses/LICENSE +21 -0
- thordata_sdk-0.6.0.dist-info/top_level.txt +1 -0
thordata/__init__.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Thordata Python SDK
|
|
3
|
+
|
|
4
|
+
Official Python client for Thordata's Proxy Network, SERP API,
|
|
5
|
+
Universal Scraping API (Web Unlocker), and Web Scraper API.
|
|
6
|
+
|
|
7
|
+
Basic Usage:
|
|
8
|
+
>>> from thordata import ThordataClient
|
|
9
|
+
>>>
|
|
10
|
+
>>> client = ThordataClient(
|
|
11
|
+
... scraper_token="your_token",
|
|
12
|
+
... public_token="your_public_token",
|
|
13
|
+
... public_key="your_public_key"
|
|
14
|
+
... )
|
|
15
|
+
>>>
|
|
16
|
+
>>> # Proxy request
|
|
17
|
+
>>> response = client.get("https://httpbin.org/ip")
|
|
18
|
+
>>>
|
|
19
|
+
>>> # SERP search
|
|
20
|
+
>>> results = client.serp_search("python tutorial", engine="google")
|
|
21
|
+
>>>
|
|
22
|
+
>>> # Universal scrape
|
|
23
|
+
>>> html = client.universal_scrape("https://example.com", js_render=True)
|
|
24
|
+
|
|
25
|
+
Async Usage:
|
|
26
|
+
>>> from thordata import AsyncThordataClient
|
|
27
|
+
>>> import asyncio
|
|
28
|
+
>>>
|
|
29
|
+
>>> async def main():
|
|
30
|
+
... async with AsyncThordataClient(
|
|
31
|
+
... scraper_token="your_token"
|
|
32
|
+
... ) as client:
|
|
33
|
+
... response = await client.get("https://httpbin.org/ip")
|
|
34
|
+
>>>
|
|
35
|
+
>>> asyncio.run(main())
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
# Package metadata.
__version__ = "0.6.0"
__author__ = "Thordata Developer Team"
__email__ = "support@thordata.com"
|
|
41
|
+
|
|
42
|
+
# Main clients
|
|
43
|
+
from .async_client import AsyncThordataClient
|
|
44
|
+
from .client import ThordataClient
|
|
45
|
+
|
|
46
|
+
# Enums
|
|
47
|
+
from .enums import (
|
|
48
|
+
BingSearchType,
|
|
49
|
+
Continent,
|
|
50
|
+
Country,
|
|
51
|
+
DataFormat,
|
|
52
|
+
Device,
|
|
53
|
+
Engine,
|
|
54
|
+
GoogleSearchType,
|
|
55
|
+
OutputFormat,
|
|
56
|
+
ProxyHost,
|
|
57
|
+
ProxyPort,
|
|
58
|
+
ProxyType,
|
|
59
|
+
SessionType,
|
|
60
|
+
TaskStatus,
|
|
61
|
+
TimeRange,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Exceptions
|
|
65
|
+
from .exceptions import (
|
|
66
|
+
ThordataAPIError,
|
|
67
|
+
ThordataAuthError,
|
|
68
|
+
ThordataConfigError,
|
|
69
|
+
ThordataError,
|
|
70
|
+
ThordataNetworkError,
|
|
71
|
+
ThordataNotCollectedError,
|
|
72
|
+
ThordataRateLimitError,
|
|
73
|
+
ThordataServerError,
|
|
74
|
+
ThordataTimeoutError,
|
|
75
|
+
ThordataValidationError,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Models
|
|
79
|
+
from .models import (
|
|
80
|
+
ProxyConfig,
|
|
81
|
+
ProxyProduct,
|
|
82
|
+
ScraperTaskConfig,
|
|
83
|
+
SerpRequest,
|
|
84
|
+
StaticISPProxy,
|
|
85
|
+
StickySession,
|
|
86
|
+
TaskStatusResponse,
|
|
87
|
+
UniversalScrapeRequest,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Retry utilities
|
|
91
|
+
from .retry import RetryConfig
|
|
92
|
+
|
|
93
|
+
# Public API: the names exported by ``from thordata import *``.
# Every entry is either defined in this module or imported above.
__all__ = [
    # Version
    "__version__",
    # Clients
    "ThordataClient",
    "AsyncThordataClient",
    # Enums
    "Engine",
    "GoogleSearchType",
    "BingSearchType",
    "ProxyType",
    "SessionType",
    "Continent",
    "Country",
    "OutputFormat",
    "DataFormat",
    "TaskStatus",
    "Device",
    "TimeRange",
    "ProxyHost",
    "ProxyPort",
    # Models
    "ProxyConfig",
    "ProxyProduct",
    "StaticISPProxy",
    "StickySession",
    "SerpRequest",
    "UniversalScrapeRequest",
    "ScraperTaskConfig",
    "TaskStatusResponse",
    # Exceptions
    "ThordataError",
    "ThordataConfigError",
    "ThordataNetworkError",
    "ThordataTimeoutError",
    "ThordataAPIError",
    "ThordataAuthError",
    "ThordataRateLimitError",
    "ThordataServerError",
    "ThordataValidationError",
    "ThordataNotCollectedError",
    # Retry
    "RetryConfig",
]
|
thordata/_utils.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Internal utility functions for the Thordata Python SDK.
|
|
3
|
+
|
|
4
|
+
These are not part of the public API and may change without notice.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Any, Dict
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def parse_json_response(data: Any) -> Any:
    """
    Decode a string payload as JSON when possible.

    Some API endpoints return JSON as a string inside JSON, so a value that
    has already been parsed once may still be a JSON document in text form.

    Args:
        data: The response data to parse.

    Returns:
        The decoded JSON value when ``data`` is a string holding valid JSON;
        otherwise ``data`` itself, unchanged.
    """
    # Anything that is not text has nothing left to decode.
    if not isinstance(data, str):
        return data

    try:
        decoded = json.loads(data)
    except json.JSONDecodeError:
        # Plain text that is not JSON is handed back as-is.
        return data
    return decoded
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def decode_base64_image(png_str: str) -> bytes:
    """
    Decode a base64-encoded PNG image.

    Handles the Data URI scheme (``data:image/png;base64,...``), removes any
    embedded whitespace, and restores missing ``=`` padding before decoding.

    Args:
        png_str: Base64-encoded string, possibly with a Data URI prefix.

    Returns:
        Decoded PNG bytes.

    Raises:
        ValueError: If the string is empty, contains no base64 payload
            (e.g. a bare Data URI prefix or only whitespace), or cannot
            be decoded.
    """
    if not png_str:
        raise ValueError("Empty PNG data received")

    # Drop the Data URI header if present; everything after the first comma
    # is the payload (the base64 alphabet itself never contains a comma).
    _, comma, after_comma = png_str.partition(",")
    payload = after_comma if comma else png_str

    # Remove every kind of whitespace (newlines, spaces, tabs, ...) so that
    # the padding calculation below only counts real payload characters.
    payload = "".join(payload.split())

    # A prefix-only Data URI or whitespace-only input would otherwise decode
    # silently to b"" instead of reporting the missing image data.
    if not payload:
        raise ValueError("Empty PNG data received")

    # Restore padding that was stripped in transit.
    remainder = len(payload) % 4
    if remainder:
        payload += "=" * (4 - remainder)

    try:
        return base64.b64decode(payload)
    except ValueError as e:
        # binascii.Error (malformed base64) is a ValueError subclass, and
        # non-ASCII text raises ValueError directly.
        raise ValueError(f"Failed to decode base64 image: {e}") from e
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def build_auth_headers(token: str) -> Dict[str, str]:
    """
    Assemble the headers used for token-authenticated API requests.

    Args:
        token: The scraper token, sent as a Bearer credential.

    Returns:
        A new dict holding the ``Authorization`` and ``Content-Type``
        headers, in that order.
    """
    headers: Dict[str, str] = {}
    headers["Authorization"] = f"Bearer {token}"
    # Request bodies are sent form-encoded.
    headers["Content-Type"] = "application/x-www-form-urlencoded"
    return headers
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def build_public_api_headers(public_token: str, public_key: str) -> Dict[str, str]:
    """
    Assemble the headers used for public API requests (task status,
    locations, etc.).

    Args:
        public_token: The public API token, sent in the ``token`` header.
        public_key: The public API key, sent in the ``key`` header.

    Returns:
        A new dict holding the ``token``, ``key`` and ``Content-Type``
        headers, in that order.
    """
    form_content_type = "application/x-www-form-urlencoded"
    return dict(
        [
            ("token", public_token),
            ("key", public_key),
            ("Content-Type", form_content_type),
        ]
    )
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def extract_error_message(payload: Any) -> str:
    """
    Extract a human-readable error message from an API response.

    For dict payloads the common message fields are tried in order
    (``msg``, ``message``, ``error``, ``detail``, ``description``) and the
    first one that actually carries a value is returned. Fields that are
    missing, ``None``, or blank strings are skipped so that a placeholder in
    one field does not hide a real message in a later one.

    Args:
        payload: The API response payload.

    Returns:
        The first usable message field as a string, or ``str(payload)``
        when the payload is not a dict or no field holds a message.
    """
    if isinstance(payload, dict):
        # Try common error message fields, most specific first.
        for key in ("msg", "message", "error", "detail", "description"):
            value = payload.get(key)
            if value is None:
                continue
            if isinstance(value, str) and not value.strip():
                continue
            return str(value)

    # Fall back to the full payload so the caller still sees something.
    return str(payload)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def build_user_agent(sdk_version: str, http_client: str) -> str:
    """
    Compose the SDK's default User-Agent string.

    Args:
        sdk_version: SDK version string.
        http_client: "requests" or "aiohttp" (or any identifier).

    Returns:
        A User-Agent of the form
        ``thordata-python-sdk/<version> (python <py>; <system>; <client>)``.
    """
    import platform

    # Runtime details reported inside the parenthesised comment section.
    environment = (
        f"python {platform.python_version()}; {platform.system()}; {http_client}"
    )
    return f"thordata-python-sdk/{sdk_version} ({environment})"
|