thordata-sdk 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +33 -36
- thordata/_utils.py +21 -21
- thordata/async_client.py +230 -192
- thordata/client.py +281 -222
- thordata/enums.py +32 -6
- thordata/exceptions.py +60 -31
- thordata/models.py +173 -146
- thordata/parameters.py +7 -6
- thordata/retry.py +109 -111
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/METADATA +228 -10
- thordata_sdk-0.5.0.dist-info/RECORD +14 -0
- thordata_sdk-0.4.0.dist-info/RECORD +0 -14
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-0.4.0.dist-info → thordata_sdk-0.5.0.dist-info}/top_level.txt +0 -0
thordata/__init__.py
CHANGED
@@ -6,83 +6,84 @@ Universal Scraping API (Web Unlocker), and Web Scraper API.
 
 Basic Usage:
     >>> from thordata import ThordataClient
-    >>>
+    >>>
     >>> client = ThordataClient(
     ...     scraper_token="your_token",
     ...     public_token="your_public_token",
     ...     public_key="your_public_key"
     ... )
-    >>>
+    >>>
     >>> # Proxy request
     >>> response = client.get("https://httpbin.org/ip")
-    >>>
+    >>>
     >>> # SERP search
     >>> results = client.serp_search("python tutorial", engine="google")
-    >>>
+    >>>
     >>> # Universal scrape
     >>> html = client.universal_scrape("https://example.com", js_render=True)
 
 Async Usage:
     >>> from thordata import AsyncThordataClient
     >>> import asyncio
-    >>>
+    >>>
     >>> async def main():
     ...     async with AsyncThordataClient(
     ...         scraper_token="your_token"
     ...     ) as client:
     ...         response = await client.get("https://httpbin.org/ip")
-    >>>
+    >>>
     >>> asyncio.run(main())
 """
 
-__version__ = "0.4.0"
+__version__ = "0.5.0"
__author__ = "Thordata Developer Team"
__email__ = "support@thordata.com"
 
 # Main clients
-from .client import ThordataClient
 from .async_client import AsyncThordataClient
+from .client import ThordataClient
 
 # Enums
 from .enums import (
-    Engine,
-    GoogleSearchType,
     BingSearchType,
-    ProxyType,
-    SessionType,
     Continent,
     Country,
-    OutputFormat,
     DataFormat,
-    TaskStatus,
     Device,
-    TimeRange,
+    Engine,
+    GoogleSearchType,
+    OutputFormat,
     ProxyHost,
     ProxyPort,
-)
-
-# Models
-from .models import (
-    ProxyConfig,
-    ProxyProduct,
-    StickySession,
-    SerpRequest,
-    UniversalScrapeRequest,
-    ScraperTaskConfig,
-    TaskStatusResponse,
+    ProxyType,
+    SessionType,
+    TaskStatus,
+    TimeRange,
 )
 
 # Exceptions
 from .exceptions import (
-    ThordataError,
-    ThordataConfigError,
-    ThordataNetworkError,
-    ThordataTimeoutError,
     ThordataAPIError,
     ThordataAuthError,
+    ThordataConfigError,
+    ThordataError,
+    ThordataNetworkError,
     ThordataRateLimitError,
     ThordataServerError,
+    ThordataTimeoutError,
     ThordataValidationError,
+    ThordataNotCollectedError,
+)
+
+# Models
+from .models import (
+    ProxyConfig,
+    ProxyProduct,
+    ScraperTaskConfig,
+    SerpRequest,
+    StickySession,
+    TaskStatusResponse,
+    UniversalScrapeRequest,
 )
 
 # Retry utilities
@@ -92,11 +93,9 @@ from .retry import RetryConfig
 __all__ = [
     # Version
     "__version__",
-
     # Clients
     "ThordataClient",
     "AsyncThordataClient",
-
     # Enums
     "Engine",
     "GoogleSearchType",
@@ -112,7 +111,6 @@ __all__ = [
     "TimeRange",
     "ProxyHost",
     "ProxyPort",
-
     # Models
     "ProxyConfig",
     "ProxyProduct",
@@ -121,7 +119,6 @@ __all__ = [
     "UniversalScrapeRequest",
     "ScraperTaskConfig",
     "TaskStatusResponse",
-
    # Exceptions
     "ThordataError",
     "ThordataConfigError",
@@ -132,7 +129,7 @@ __all__ = [
     "ThordataRateLimitError",
     "ThordataServerError",
     "ThordataValidationError",
-
+    "ThordataNotCollectedError",
     # Retry
     "RetryConfig",
-]
+]
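The visible API change in this file is the new ThordataNotCollectedError export, alongside alphabetized import blocks and the 0.5.0 version bump. Below is a minimal sketch of how the reshaped exception surface might be used. The client constructor, universal_scrape call, and exception names all come from the diff above; the idea that ThordataNotCollectedError signals a request whose data was not collected is inferred from its name, and the sketch assumes ThordataError is still the SDK's base exception.

from thordata import (
    ThordataClient,
    ThordataError,
    ThordataNotCollectedError,
    ThordataRateLimitError,
)

client = ThordataClient(
    scraper_token="your_token",
    public_token="your_public_token",
    public_key="your_public_key",
)

try:
    html = client.universal_scrape("https://example.com", js_render=True)
except ThordataNotCollectedError:
    # New in 0.5.0; assumed to mean no data was collected for this request.
    print("no data collected for this URL")
except ThordataRateLimitError:
    print("rate limited; back off and retry")
except ThordataError as exc:
    # Assuming ThordataError is the common base, this catches any other SDK error.
    print(f"request failed: {exc}")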
thordata/_utils.py
CHANGED
@@ -9,7 +9,7 @@ from __future__ import annotations
 import base64
 import json
 import logging
-from typing import Any, Dict
+from typing import Any, Dict
 
 logger = logging.getLogger(__name__)
 
@@ -17,12 +17,12 @@ logger = logging.getLogger(__name__)
 def parse_json_response(data: Any) -> Any:
     """
     Parse a response that might be double-encoded JSON.
-
+
     Some API endpoints return JSON as a string inside JSON.
-
+
     Args:
         data: The response data to parse.
-
+
     Returns:
         Parsed data.
     """
@@ -37,33 +37,33 @@ def parse_json_response(data: Any) -> Any:
 def decode_base64_image(png_str: str) -> bytes:
     """
     Decode a base64-encoded PNG image.
-
+
     Handles Data URI scheme (data:image/png;base64,...) and fixes padding.
-
+
     Args:
         png_str: Base64-encoded string, possibly with Data URI prefix.
-
+
     Returns:
         Decoded PNG bytes.
-
+
     Raises:
         ValueError: If the string is empty or cannot be decoded.
     """
     if not png_str:
         raise ValueError("Empty PNG data received")
-
+
     # Remove Data URI scheme if present
     if "," in png_str:
         png_str = png_str.split(",", 1)[1]
-
+
     # Clean up whitespace
     png_str = png_str.replace("\n", "").replace("\r", "").replace(" ", "")
-
+
     # Fix Base64 padding
     missing_padding = len(png_str) % 4
     if missing_padding:
         png_str += "=" * (4 - missing_padding)
-
+
     try:
         return base64.b64decode(png_str)
     except Exception as e:
@@ -73,10 +73,10 @@ def decode_base64_image(png_str: str) -> bytes:
 def build_auth_headers(token: str) -> Dict[str, str]:
     """
     Build authorization headers for API requests.
-
+
     Args:
         token: The scraper token.
-
+
     Returns:
         Headers dict with Authorization and Content-Type.
     """
@@ -89,11 +89,11 @@ def build_auth_headers(token: str) -> Dict[str, str]:
 def build_public_api_headers(public_token: str, public_key: str) -> Dict[str, str]:
     """
     Build headers for public API requests (task status, locations, etc.)
-
+
     Args:
         public_token: The public API token.
         public_key: The public API key.
-
+
     Returns:
         Headers dict with token, key, and Content-Type.
     """
@@ -107,10 +107,10 @@ def build_public_api_headers(public_token: str, public_key: str) -> Dict[str, str]:
 def extract_error_message(payload: Any) -> str:
     """
     Extract a human-readable error message from an API response.
-
+
     Args:
         payload: The API response payload.
-
+
     Returns:
         Error message string.
     """
@@ -119,8 +119,8 @@ def extract_error_message(payload: Any) -> str:
         for key in ("msg", "message", "error", "detail", "description"):
             if key in payload:
                 return str(payload[key])
-
+
         # Fall back to full payload
         return str(payload)
-
-    return str(payload)
+
+    return str(payload)