thordata-sdk 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata/__init__.py +1 -1
- thordata/async_client.py +76 -10
- thordata/browser/__init__.py +16 -0
- thordata/browser/exceptions.py +23 -0
- thordata/browser/session.py +469 -0
- thordata/client.py +55 -7
- thordata/exceptions.py +10 -1
- thordata/types/serp.py +35 -3
- thordata/types/task.py +63 -9
- thordata/types/universal.py +37 -5
- {thordata_sdk-1.6.0.dist-info → thordata_sdk-1.8.0.dist-info}/METADATA +4 -2
- {thordata_sdk-1.6.0.dist-info → thordata_sdk-1.8.0.dist-info}/RECORD +15 -12
- {thordata_sdk-1.6.0.dist-info → thordata_sdk-1.8.0.dist-info}/WHEEL +0 -0
- {thordata_sdk-1.6.0.dist-info → thordata_sdk-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {thordata_sdk-1.6.0.dist-info → thordata_sdk-1.8.0.dist-info}/top_level.txt +0 -0
thordata/__init__.py
CHANGED
@@ -5,7 +5,7 @@ Official Python client for Thordata's Proxy Network, SERP API,
 Universal Scraping API (Web Unlocker), and Web Scraper API.
 """
 
-__version__ = "1.6.0"
+__version__ = "1.8.0"
 __author__ = "Thordata Developer Team/Kael Odin"
 __email__ = "support@thordata.com"
 
thordata/async_client.py
CHANGED
@@ -244,6 +244,7 @@ class AsyncThordataClient:
         render_js: bool | None = None,
         no_cache: bool | None = None,
         output_format: str = "json",
+        ai_overview: bool = False,
         **kwargs: Any,
     ) -> dict[str, Any]:
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
@@ -258,13 +259,14 @@
             render_js=render_js,
             no_cache=no_cache,
             output_format=output_format,
+            ai_overview=ai_overview,
             extra_params=kwargs,
         )
         return await self.serp_search_advanced(request)
 
     async def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError("scraper_token is required for SERP API")
         payload = request.to_payload()
         headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
         logger.info(f"Async SERP: {request.engine} - {request.query}")
@@ -293,30 +295,38 @@
         url: str,
         *,
         js_render: bool = False,
-        output_format: str = "html",
+        output_format: str | list[str] = "html",
         country: str | None = None,
         block_resources: str | None = None,
+        clean_content: str | None = None,
         wait: int | None = None,
         wait_for: str | None = None,
+        follow_redirect: bool | None = None,
+        headers: list[dict[str, str]] | None = None,
+        cookies: list[dict[str, str]] | None = None,
         **kwargs: Any,
-    ) -> str | bytes:
+    ) -> str | bytes | dict[str, str | bytes]:
         request = UniversalScrapeRequest(
             url=url,
             js_render=js_render,
             output_format=output_format,
             country=country,
             block_resources=block_resources,
+            clean_content=clean_content,
             wait=wait,
             wait_for=wait_for,
+            follow_redirect=follow_redirect,
+            headers=headers,
+            cookies=cookies,
             extra_params=kwargs,
         )
         return await self.universal_scrape_advanced(request)
 
     async def universal_scrape_advanced(
         self, request: UniversalScrapeRequest
-    ) -> str | bytes:
+    ) -> str | bytes | dict[str, str | bytes]:
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError("scraper_token is required for Universal API")
         payload = request.to_payload()
         headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
 
@@ -327,9 +337,17 @@
         try:
            resp_json = await response.json()
         except ValueError:
-            [3 removed lines not shown in this diff view]
+            # If not JSON, return raw content based on format
+            if isinstance(request.output_format, list) or (
+                isinstance(request.output_format, str) and "," in request.output_format
+            ):
+                return {"raw": await response.read()}
+            fmt = (
+                request.output_format.lower()
+                if isinstance(request.output_format, str)
+                else str(request.output_format).lower()
+            )
+            return await response.read() if fmt == "png" else await response.text()
 
         if isinstance(resp_json, dict):
             code = resp_json.get("code")
@@ -337,6 +355,27 @@
                 msg = extract_error_message(resp_json)
                 raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
 
+            # Handle multiple output formats
+            if isinstance(request.output_format, list) or (
+                isinstance(request.output_format, str) and "," in request.output_format
+            ):
+                result: dict[str, str | bytes] = {}
+                formats = (
+                    request.output_format
+                    if isinstance(request.output_format, list)
+                    else [f.strip() for f in request.output_format.split(",")]
+                )
+
+                for fmt in formats:
+                    fmt_lower = fmt.lower()
+                    if fmt_lower == "html" and "html" in resp_json:
+                        result["html"] = resp_json["html"]
+                    elif fmt_lower == "png" and "png" in resp_json:
+                        result["png"] = decode_base64_image(resp_json["png"])
+
+                if result:
+                    return result
+
             if "html" in resp_json:
                 return resp_json["html"]
             if "png" in resp_json:
@@ -411,7 +450,7 @@
     async def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
         self._require_public_credentials()
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError("scraper_token is required for Task Builder")
         payload = config.to_payload()
         headers = build_builder_headers(
             self.scraper_token, str(self.public_token), str(self.public_key)
@@ -449,7 +488,9 @@
     async def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
         self._require_public_credentials()
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError(
+                "scraper_token is required for Video Task Builder"
+            )
         payload = config.to_payload()
         headers = build_builder_headers(
             self.scraper_token, str(self.public_token), str(self.public_key)
@@ -1067,3 +1108,28 @@
         safe_user = quote(final_user, safe="")
         safe_pass = quote(pwd, safe="")
         return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"
+
+    @property
+    def browser(self):
+        """Get a browser session for automation.
+
+        Requires playwright: pip install thordata[browser]
+
+        Returns:
+            BrowserSession instance
+
+        Example:
+            async with AsyncThordataClient() as client:
+                session = client.browser
+                await session.navigate("https://example.com")
+                snapshot = await session.snapshot()
+        """
+        try:
+            from .browser import BrowserSession
+
+            return BrowserSession(self)
+        except ImportError as e:
+            raise ImportError(
+                "Playwright is required for browser automation. "
+                "Install it with: pip install thordata[browser]"
+            ) from e
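A minimal async usage sketch of the widened Universal Scraping signature above. The `scraper_token=` constructor keyword is an assumption (only `self.scraper_token` appears in this diff); the context-manager usage mirrors the `browser` docstring example, and the URL is illustrative.

import asyncio

from thordata.async_client import AsyncThordataClient
from thordata.types.universal import UniversalScrapeRequest


async def main() -> None:
    # scraper_token= is assumed; credentials may also come from environment configuration.
    async with AsyncThordataClient(scraper_token="YOUR_SCRAPER_TOKEN") as client:
        request = UniversalScrapeRequest(
            url="https://example.com",        # illustrative target
            js_render=True,
            output_format=["html", "png"],    # new in 1.8.0: several formats at once
            follow_redirect=True,             # new field, forwarded as "True"/"False"
        )
        result = await client.universal_scrape_advanced(request)
        if isinstance(result, dict):          # multi-format path returns a dict keyed by format
            html, png_bytes = result.get("html"), result.get("png")
        else:                                 # single-format path is unchanged
            html = result


asyncio.run(main())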
thordata/browser/__init__.py
ADDED
"""Browser automation module for Thordata Scraping Browser.

This module provides high-level browser automation capabilities using Playwright.
Requires optional dependency: pip install thordata[browser]
"""

from __future__ import annotations

try:
    from .exceptions import BrowserConnectionError, BrowserError
    from .session import BrowserSession

    __all__ = ["BrowserSession", "BrowserError", "BrowserConnectionError"]
except ImportError:
    # Playwright not installed
    __all__ = []
thordata/browser/exceptions.py
ADDED
"""Browser automation exceptions."""

from __future__ import annotations

from ..exceptions import ThordataError


class BrowserError(ThordataError):
    """Base exception for browser automation errors."""

    pass


class BrowserConnectionError(BrowserError):
    """Raised when browser connection fails."""

    pass


class BrowserSessionError(BrowserError):
    """Raised when browser session operations fail."""

    pass
thordata/browser/session.py
ADDED
"""Browser session management for Thordata Scraping Browser.

This module provides a high-level wrapper around Playwright connected to
Thordata's Scraping Browser.
"""

from __future__ import annotations

import logging
from typing import Any
from urllib.parse import urlparse

try:
    from playwright.async_api import Browser, Page, Playwright, async_playwright
except ImportError as e:
    raise ImportError(
        "Playwright is required for browser automation. "
        "Install it with: pip install thordata[browser]"
    ) from e

from ..async_client import AsyncThordataClient
from .exceptions import BrowserConnectionError, BrowserSessionError

logger = logging.getLogger(__name__)


class BrowserSession:
    """Domain-aware browser session wrapper for Thordata Scraping Browser."""

    def __init__(
        self,
        client: AsyncThordataClient,
        username: str | None = None,
        password: str | None = None,
    ) -> None:
        """Initialize browser session.

        Args:
            client: AsyncThordataClient instance
            username: Browser username (optional, can use env var)
            password: Browser password (optional, can use env var)
        """
        self._client = client
        self._username = username
        self._password = password
        self._playwright: Playwright | None = None
        self._browsers: dict[str, Browser] = {}
        self._pages: dict[str, Page] = {}
        self._current_domain: str = "default"

    @staticmethod
    def _get_domain(url: str) -> str:
        """Extract domain from URL."""
        try:
            parsed = urlparse(url)
            return parsed.hostname or "default"
        except Exception:
            return "default"

    async def _ensure_playwright(self) -> Playwright:
        """Ensure Playwright is started."""
        if self._playwright is None:
            self._playwright = await async_playwright().start()
        return self._playwright

    async def get_browser(self, domain: str = "default") -> Browser:
        """Get or create a browser instance for a given domain."""
        existing = self._browsers.get(domain)
        if existing and existing.is_connected():
            return existing

        # Clean up stale browser/page
        if existing is not None:
            logger.info("Browser for domain %s disconnected, recreating", domain)
            self._browsers.pop(domain, None)
            self._pages.pop(domain, None)

        playwright = await self._ensure_playwright()

        logger.info("Connecting to Thordata Scraping Browser for domain %s", domain)

        # Get browser credentials
        import os

        user = self._username or os.getenv("THORDATA_BROWSER_USERNAME")
        pwd = self._password or os.getenv("THORDATA_BROWSER_PASSWORD")

        if not user or not pwd:
            raise BrowserConnectionError(
                "Missing browser credentials. Set THORDATA_BROWSER_USERNAME and "
                "THORDATA_BROWSER_PASSWORD or pass them to BrowserSession."
            )

        # Retry logic with exponential backoff
        max_retries = 3
        last_error = None

        for attempt in range(max_retries):
            try:
                ws_url = self._client.get_browser_connection_url(
                    username=user, password=pwd
                )
                logger.debug(
                    "Attempt %d/%d: Connecting to %s...",
                    attempt + 1,
                    max_retries,
                    ws_url[:50],
                )
                browser = await playwright.chromium.connect_over_cdp(ws_url)
                logger.info("Successfully connected to browser for domain %s", domain)
                self._browsers[domain] = browser
                return browser
            except Exception as e:
                last_error = e
                logger.warning(
                    "Browser connection attempt %d/%d failed: %s",
                    attempt + 1,
                    max_retries,
                    e,
                )

                if attempt < max_retries - 1:
                    import asyncio

                    wait_time = 2**attempt  # Exponential backoff: 1s, 2s, 4s
                    logger.info("Retrying in %d seconds...", wait_time)
                    await asyncio.sleep(wait_time)

        # If all retries failed, raise the last error
        raise BrowserConnectionError(
            f"Failed to connect to Thordata Scraping Browser after {max_retries} attempts. "
            f"Last error: {last_error}"
        ) from last_error

    async def get_page(self, url: str | None = None) -> Page:
        """Get or create a page for the current (or provided) domain."""
        if url:
            self._current_domain = self._get_domain(url)
        domain = self._current_domain

        existing = self._pages.get(domain)
        if existing and not existing.is_closed():
            return existing

        browser = await self.get_browser(domain)
        contexts = browser.contexts
        if not contexts:
            context = await browser.new_context()
        else:
            context = contexts[0]

        pages = context.pages
        if pages:
            page = pages[0]
        else:
            page = await context.new_page()

        self._pages[domain] = page
        return page

    async def navigate(self, url: str, timeout: int = 120000) -> dict[str, Any]:
        """Navigate to a URL.

        Args:
            url: Target URL
            timeout: Navigation timeout in milliseconds

        Returns:
            Dictionary with url and title
        """
        page = await self.get_page(url)
        if page.url != url:
            await page.goto(url, timeout=timeout)
        title = await page.title()
        return {"url": page.url, "title": title}

    async def snapshot(
        self, filtered: bool = True, max_items: int = 80
    ) -> dict[str, Any]:
        """Capture an ARIA-like snapshot of the current page.

        Args:
            filtered: Whether to filter to interactive elements only
            max_items: Maximum number of elements to include

        Returns:
            Dictionary with url, title, and aria_snapshot
        """
        page = await self.get_page()
        full_snapshot = await self._get_interactive_snapshot(page)

        if not filtered:
            return {
                "url": page.url,
                "title": await page.title(),
                "aria_snapshot": full_snapshot,
            }

        # Filter and limit
        filtered_snapshot = self._filter_snapshot(full_snapshot)
        filtered_snapshot = self._limit_snapshot_items(
            filtered_snapshot, max_items=max_items
        )

        return {
            "url": page.url,
            "title": await page.title(),
            "aria_snapshot": filtered_snapshot,
        }

    async def click_ref(
        self, ref: str, wait_for_navigation_ms: int | None = None
    ) -> dict[str, Any]:
        """Click an element by its ref ID.

        Args:
            ref: The ref ID from snapshot (e.g., "1" or "dom-1")
            wait_for_navigation_ms: Optional wait time in ms to detect navigation

        Returns:
            Dictionary with click result information
        """
        page = await self.get_page()
        url_before = page.url

        try:
            locator = page.locator(f'[data-fastmcp-ref="{ref}"]').first
            await locator.click(timeout=5000)

            # Check for navigation if requested
            did_navigate = False
            url_after = url_before
            if wait_for_navigation_ms and wait_for_navigation_ms > 0:
                import asyncio

                await asyncio.sleep(wait_for_navigation_ms / 1000)
                url_after = page.url
                did_navigate = url_after != url_before

            return {
                "message": "Successfully clicked element",
                "ref": ref,
                "url_before": url_before,
                "url_after": url_after,
                "did_navigate": did_navigate,
            }
        except Exception as e:
            raise BrowserSessionError(f"Failed to click element: {e}") from e

    async def type_ref(
        self, ref: str, text: str, submit: bool = False
    ) -> dict[str, Any]:
        """Type text into an element by its ref ID.

        Args:
            ref: The ref ID from snapshot
            text: Text to type
            submit: Whether to press Enter after typing

        Returns:
            Dictionary with type result information
        """
        page = await self.get_page()
        url_before = page.url

        try:
            locator = page.locator(f'[data-fastmcp-ref="{ref}"]').first
            await locator.fill(text)
            if submit:
                await locator.press("Enter")

            return {
                "message": "Typed into element" + (" and submitted" if submit else ""),
                "ref": ref,
                "url_before": url_before,
                "url_after": page.url,
            }
        except Exception as e:
            raise BrowserSessionError(f"Failed to type into element: {e}") from e

    async def screenshot_page(self, full_page: bool = False) -> bytes:
        """Take a screenshot of the current page.

        Args:
            full_page: Whether to capture full page or viewport only

        Returns:
            Screenshot as bytes (PNG format)
        """
        page = await self.get_page()
        return await page.screenshot(full_page=full_page)

    async def get_html(self, full_page: bool = False) -> str:
        """Get the HTML content of the current page.

        Args:
            full_page: Whether to get full page HTML or body only

        Returns:
            HTML content as string
        """
        page = await self.get_page()
        if full_page:
            return await page.content()
        else:
            try:
                return await page.evaluate("document.body.innerHTML")
            except Exception:
                return await page.content()

    async def scroll(self) -> dict[str, Any]:
        """Scroll to the bottom of the page.

        Returns:
            Dictionary with scroll result
        """
        page = await self.get_page()
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        return {"message": "Scrolled to bottom"}

    async def go_back(self) -> dict[str, Any]:
        """Navigate back in browser history.

        Returns:
            Dictionary with new URL
        """
        page = await self.get_page()
        await page.go_back()
        return {"url": page.url}

    async def _get_interactive_snapshot(self, page: Page) -> str:
        """Generate a text snapshot of interactive elements with refs."""
        script = """
        () => {
            function getSnapshot() {
                const lines = [];
                let refCounter = 0;

                function normalizeRole(tag, explicitRole) {
                    const role = (explicitRole || '').toLowerCase();
                    const t = (tag || '').toLowerCase();
                    if (role) return role;
                    if (t === 'a') return 'link';
                    if (t === 'button') return 'button';
                    if (t === 'input') return 'textbox';
                    if (t === 'select') return 'combobox';
                    if (t === 'textarea') return 'textbox';
                    return t;
                }

                function traverse(node) {
                    if (node.nodeType === Node.ELEMENT_NODE) {
                        const tag = node.tagName.toLowerCase();
                        const interactiveTag = ['a', 'button', 'input', 'select', 'textarea'].includes(tag);
                        const role = normalizeRole(tag, node.getAttribute('role'));
                        const interactiveRole = ['button', 'link', 'textbox', 'searchbox', 'combobox', 'checkbox', 'radio', 'switch', 'tab', 'menuitem', 'option'].includes(role);

                        if (interactiveTag || interactiveRole) {
                            if (!node.dataset.fastmcpRef) {
                                node.dataset.fastmcpRef = (++refCounter).toString();
                            }
                            let name = node.innerText || node.getAttribute('aria-label') || '';
                            name = (name || '').replace(/\\s+/g, ' ').trim().substring(0, 80);

                            lines.push(`- ${role} "${name}" [ref=${node.dataset.fastmcpRef}]`);
                            if (node.href) {
                                lines.push(`  /url: "${node.href}"`);
                            }
                        }
                    }

                    node.childNodes.forEach(child => traverse(child));
                }

                traverse(document.body);
                return lines.join('\\n');
            }
            return getSnapshot();
        }
        """
        return await page.evaluate(script)

    @staticmethod
    def _filter_snapshot(snapshot_text: str) -> str:
        """Filter snapshot to interactive elements only."""
        import re

        lines = snapshot_text.split("\n")
        filtered = []
        i = 0
        while i < len(lines):
            line = lines[i]
            trimmed = line.strip()

            if not trimmed or not trimmed.startswith("-"):
                i += 1
                continue

            # Extract role
            role_match = re.match(r"^-\s+([a-zA-Z]+)", trimmed)
            if not role_match:
                i += 1
                continue

            role = role_match.group(1).lower()
            interactive_roles = {
                "button",
                "link",
                "textbox",
                "searchbox",
                "combobox",
                "checkbox",
                "radio",
                "switch",
                "tab",
                "menuitem",
                "option",
            }

            if role in interactive_roles:
                filtered.append(line)
                # Include next line if it's a URL
                if i + 1 < len(lines) and "/url:" in lines[i + 1]:
                    filtered.append(lines[i + 1])
                    i += 1

            i += 1

        return "\n".join(filtered)

    @staticmethod
    def _limit_snapshot_items(text: str, *, max_items: int) -> str:
        """Limit snapshot to the first N interactive element blocks."""
        if max_items <= 0:
            return ""
        if not text:
            return text

        lines = text.splitlines()
        out: list[str] = []
        items = 0
        for line in lines:
            if line.startswith("- ") or line.startswith("["):
                if items >= max_items:
                    break
                items += 1
            if items > 0:
                out.append(line)
        return "\n".join(out).strip()

    async def close(self) -> None:
        """Cleanly close all pages, browsers, and Playwright."""
        import contextlib

        for page in list(self._pages.values()):
            with contextlib.suppress(Exception):
                await page.close()
        self._pages.clear()

        for browser in list(self._browsers.values()):
            with contextlib.suppress(Exception):
                await browser.close()
        self._browsers.clear()

        if self._playwright is not None:
            try:
                await self._playwright.stop()
            finally:
                self._playwright = None
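A usage sketch assembled from the `client.browser` property and the `BrowserSession` methods above. It assumes Playwright is installed via the `browser` extra and that credentials are supplied through the environment variables read in `get_browser`; the URL and ref id are illustrative.

import asyncio

from thordata.async_client import AsyncThordataClient


async def run() -> None:
    # Needs the optional extra: pip install thordata[browser]
    # Credentials are read from THORDATA_BROWSER_USERNAME / THORDATA_BROWSER_PASSWORD
    # unless passed to BrowserSession directly.
    async with AsyncThordataClient() as client:
        session = client.browser
        try:
            await session.navigate("https://example.com")            # illustrative URL
            snap = await session.snapshot(filtered=True, max_items=40)
            print(snap["aria_snapshot"])   # lines like: - link "More information" [ref=1]

            await session.click_ref("1", wait_for_navigation_ms=1500)  # ref id from the snapshot
            png_bytes = await session.screenshot_page(full_page=True)
        finally:
            await session.close()


asyncio.run(run())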
thordata/client.py
CHANGED
@@ -53,6 +53,7 @@ from .serp_engines import SerpNamespace
 # Import Types (Modernized)
 from .types import (
     CommonSettings,
+    DataFormat,
     ProxyConfig,
     ProxyProduct,
     ProxyServer,
@@ -308,6 +309,7 @@ class ThordataClient:
         render_js: bool | None = None,
         no_cache: bool | None = None,
         output_format: str = "json",
+        ai_overview: bool = False,
         **kwargs: Any,
     ) -> dict[str, Any]:
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
@@ -323,6 +325,7 @@ class ThordataClient:
             render_js=render_js,
             no_cache=no_cache,
             output_format=output_format,
+            ai_overview=ai_overview,
             extra_params=kwargs,
         )
         return self.serp_search_advanced(request)
@@ -364,28 +367,38 @@ class ThordataClient:
         url: str,
         *,
         js_render: bool = False,
-        output_format: str = "html",
+        output_format: str | list[str] = "html",
         country: str | None = None,
         block_resources: str | None = None,
+        clean_content: str | None = None,
         wait: int | None = None,
         wait_for: str | None = None,
+        follow_redirect: bool | None = None,
+        headers: list[dict[str, str]] | None = None,
+        cookies: list[dict[str, str]] | None = None,
         **kwargs: Any,
-    ) -> str | bytes:
+    ) -> str | bytes | dict[str, str | bytes]:
         request = UniversalScrapeRequest(
             url=url,
             js_render=js_render,
             output_format=output_format,
             country=country,
             block_resources=block_resources,
+            clean_content=clean_content,
             wait=wait,
             wait_for=wait_for,
+            follow_redirect=follow_redirect,
+            headers=headers,
+            cookies=cookies,
             extra_params=kwargs,
         )
         return self.universal_scrape_advanced(request)
 
-    def universal_scrape_advanced(self, request: UniversalScrapeRequest) -> str | bytes:
+    def universal_scrape_advanced(
+        self, request: UniversalScrapeRequest
+    ) -> str | bytes | dict[str, str | bytes]:
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError("scraper_token is required for Universal API")
 
         payload = request.to_payload()
         headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
@@ -648,6 +661,7 @@ class ThordataClient:
         include_errors: bool = True,
         task_type: str = "web",
         common_settings: CommonSettings | None = None,
+        data_format: DataFormat | str | None = None,
     ) -> str:
         import time
 
@@ -671,6 +685,7 @@ class ThordataClient:
             parameters=parameters,
             universal_params=universal_params,
             include_errors=include_errors,
+            data_format=data_format,
         )
         task_id = self.create_scraper_task_advanced(config)
 
@@ -1212,12 +1227,22 @@ class ThordataClient:
     # =========================================================================
 
     def _process_universal_response(
-        self, response: requests.Response, output_format: str
-    ) -> str | bytes:
+        self, response: requests.Response, output_format: str | list[str]
+    ) -> str | bytes | dict[str, str | bytes]:
+        """Process universal scrape response. Returns single value or dict if multiple formats requested."""
        try:
            resp_json = response.json()
        except ValueError:
-            [1 removed line not shown in this diff view]
+            # If not JSON, return raw content based on format
+            if isinstance(output_format, list):
+                # Multiple formats requested but got non-JSON response
+                return {"raw": response.content}
+            fmt = (
+                output_format.lower()
+                if isinstance(output_format, str)
+                else str(output_format).lower()
+            )
+            return response.content if fmt == "png" else response.text
 
        if isinstance(resp_json, dict):
            code = resp_json.get("code")
@@ -1225,6 +1250,29 @@ class ThordataClient:
                msg = extract_error_message(resp_json)
                raise_for_code(f"Universal Error: {msg}", code=code, payload=resp_json)
 
+            # Handle multiple output formats
+            if isinstance(output_format, list) or (
+                isinstance(output_format, str) and "," in output_format
+            ):
+                result: dict[str, str | bytes] = {}
+                formats = (
+                    output_format
+                    if isinstance(output_format, list)
+                    else [f.strip() for f in output_format.split(",")]
+                )
+
+                for fmt in formats:
+                    fmt_lower = fmt.lower()
+                    if fmt_lower == "html" and "html" in resp_json:
+                        result["html"] = resp_json["html"]
+                    elif fmt_lower == "png" and "png" in resp_json:
+                        result["png"] = decode_base64_image(resp_json["png"])
+
+                # If we got results, return dict; otherwise return single value for backward compatibility
+                if result:
+                    return result
+
+            # Single format (backward compatibility)
            if "html" in resp_json:
                return resp_json["html"]
            if "png" in resp_json:
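A corresponding sync sketch for the new `ai_overview` flag, using the request/advanced-call pair this diff does name (`SerpRequest` and `serp_search_advanced`). The `scraper_token=` keyword and the exact set of required `SerpRequest` fields are assumptions.

from thordata.client import ThordataClient
from thordata.types.serp import Engine, SerpRequest

# scraper_token= is assumed; this diff only shows the attribute self.scraper_token.
client = ThordataClient(scraper_token="YOUR_SCRAPER_TOKEN")

# ai_overview is validated in SerpRequest.to_payload(): any engine other than plain
# "google" raises ValueError, otherwise the payload gains ai_overview="true".
request = SerpRequest(engine=Engine.GOOGLE, query="best espresso machine", ai_overview=True)
results = client.serp_search_advanced(request)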
thordata/exceptions.py
CHANGED
@@ -390,7 +390,16 @@ def is_retryable_exception(exc: Exception) -> bool:
    try:
        import requests

-        [1 removed line not shown in this diff view]
+        # requests exposes SSLError under requests.exceptions.SSLError (not requests.SSLError)
+        ssl_error = getattr(getattr(requests, "exceptions", None), "SSLError", None)
+        retryable: tuple[type[BaseException], ...] = (
+            requests.Timeout,
+            requests.ConnectionError,
+        )
+        if ssl_error is not None:
+            retryable = retryable + (ssl_error,)
+
+        if isinstance(exc, retryable):
            return True
    except ImportError:
        pass
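An illustrative retry loop (not part of the SDK) showing how the rebuilt `is_retryable_exception` is meant to be consumed: it now treats `requests.Timeout`, `requests.ConnectionError`, and `requests.exceptions.SSLError` (when available) as transient.

import time

import requests

from thordata.exceptions import is_retryable_exception


def fetch_with_retries(url: str, attempts: int = 3) -> requests.Response:
    # Retry only on exceptions the SDK classifies as transient.
    for attempt in range(attempts):
        try:
            return requests.get(url, timeout=10)
        except Exception as exc:
            if attempt == attempts - 1 or not is_retryable_exception(exc):
                raise
            time.sleep(2 ** attempt)  # simple backoff: 1s, 2s, ...
    raise RuntimeError("unreachable")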
thordata/types/serp.py
CHANGED
@@ -14,6 +14,7 @@ from .common import ThordataBaseConfig
 class Engine(str, Enum):
     # Google
     GOOGLE = "google"
+    GOOGLE_AI_MODE = "google_ai_mode"
     GOOGLE_NEWS = "google_news"
     GOOGLE_SHOPPING = "google_shopping"
     GOOGLE_VIDEOS = "google_videos"
@@ -21,10 +22,18 @@ class Engine(str, Enum):
     GOOGLE_MAPS = "google_maps"
     GOOGLE_JOBS = "google_jobs"
     GOOGLE_PLAY = "google_play"
+    GOOGLE_PLAY_PRODUCT = "google_play_product"
+    GOOGLE_PLAY_GAMES = "google_play_games"
+    GOOGLE_PLAY_MOVIES = "google_play_movies"
+    GOOGLE_PLAY_BOOKS = "google_play_books"
     GOOGLE_TRENDS = "google_trends"
     GOOGLE_SCHOLAR = "google_scholar"
+    GOOGLE_SCHOLAR_CITE = "google_scholar_cite"
+    GOOGLE_SCHOLAR_AUTHOR = "google_scholar_author"
     GOOGLE_PATENTS = "google_patents"
+    GOOGLE_PATENTS_DETAILS = "google_patents_details"
     GOOGLE_FINANCE = "google_finance"
+    GOOGLE_FINANCE_MARKETS = "google_finance_markets"
     GOOGLE_FLIGHTS = "google_flights"
     GOOGLE_LENS = "google_lens"
     GOOGLE_HOTELS = "google_hotels"
@@ -40,7 +49,7 @@ class Engine(str, Enum):
     # Others
     YANDEX = "yandex"
     DUCKDUCKGO = "duckduckgo"
-    BAIDU = "baidu"
+    BAIDU = "baidu"  # Deprecated: Not supported by Dashboard
 
     # Legacy / Compatibility Aliases
     GOOGLE_SEARCH = "google_search"
@@ -117,12 +126,14 @@ class SerpRequest(ThordataBaseConfig):
     render_js: bool | None = None
     no_cache: bool | None = None
 
-    # Output
+    # Output format: "json" (json=1), "html" (json=3), "light_json" (json=4)
+    # Note: "both" (json=2) format is not supported by Dashboard
     output_format: str = "json"
 
     # Advanced Google
     ludocid: str | None = None
     kgmid: str | None = None
+    ai_overview: bool = False  # Only supported for engine=google
 
     # Pass-through for any other param
     extra_params: dict[str, Any] = field(default_factory=dict)
@@ -155,13 +166,26 @@ class SerpRequest(ThordataBaseConfig):
        }
 
        # JSON output handling
+        # Dashboard mapping: json=1 (json), json=3 (html), json=4 (light json)
+        # Note: json=2 (both) format is not supported by Dashboard
        fmt = self.output_format.lower()
        if fmt == "json":
            payload["json"] = "1"
        elif fmt == "html":
-            [1 removed line not shown in this diff view]
+            payload["json"] = "3"
+        elif fmt in ("light_json", "light-json", "lightjson"):
+            payload["json"] = "4"
        elif fmt in ("2", "both", "json+html"):
+            import warnings
+
+            warnings.warn(
+                "The 'both' output format (json=2) is not supported by Dashboard. "
+                "Use 'json' or 'html' instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
            payload["json"] = "2"
+        # If no json param is set, default to HTML (legacy behavior)
 
        # Query param handling
        if engine == "yandex":
@@ -219,6 +243,14 @@ class SerpRequest(ThordataBaseConfig):
        if self.kgmid:
            payload["kgmid"] = self.kgmid
 
+        # AI Overview (only for Google engine)
+        if self.ai_overview:
+            if engine != "google":
+                raise ValueError(
+                    "ai_overview parameter is only supported for engine=google"
+                )
+            payload["ai_overview"] = "true"
+
        # Merge extras
        payload.update(self.extra_params)
        return payload
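A short sketch of the new output-format mapping in `SerpRequest.to_payload()`, assuming a request can be built from just `engine` and `query`:

import warnings

from thordata.types.serp import Engine, SerpRequest

# Dashboard mapping in to_payload(): "json" -> json=1, "html" -> json=3, "light_json" -> json=4.
req = SerpRequest(engine=Engine.GOOGLE, query="site reliability", output_format="light_json")
assert req.to_payload()["json"] == "4"

# The legacy "both" spelling still emits json=2, but now warns that the Dashboard does not support it.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    SerpRequest(engine=Engine.GOOGLE, query="coffee", output_format="both").to_payload()
assert any(issubclass(w.category, DeprecationWarning) for w in caught)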
thordata/types/task.py
CHANGED
@@ -8,6 +8,7 @@ import json
 from dataclasses import dataclass
 from enum import Enum
 from typing import Any
+from urllib.parse import unquote
 
 from .common import CommonSettings, ThordataBaseConfig
 
@@ -49,6 +50,52 @@ class DataFormat(str, Enum):
     XLSX = "xlsx"
 
 
+def _normalize_url_value(value: Any) -> Any:
+    if not isinstance(value, str):
+        return value
+    # Decode all percent-encoded characters to match Dashboard format
+    # Dashboard expects URLs in their raw/decoded form, not URL-encoded
+    # This ensures API/SDK submissions match manual Dashboard input exactly
+    try:
+        # Check if URL contains any percent-encoded characters
+        if "%" in value:
+            # Fully decode the URL to match Dashboard format
+            decoded = unquote(value)
+            # If decoding changed the value, use decoded version
+            # This handles cases like %26 -> &, %3A -> :, %2F -> /, etc.
+            if decoded != value:
+                return decoded
+    except Exception:
+        # If decoding fails, return original value
+        pass
+    return value
+
+
+def _normalize_parameters(params: dict[str, Any]) -> dict[str, Any]:
+    # All parameter keys that contain URLs and should be normalized
+    # This ensures API/SDK submissions match Dashboard format exactly
+    url_keys = {
+        "url",
+        "domain",
+        "profileurl",
+        "posturl",
+        "seller_url",
+        # Additional URL-related keys that may be used
+        "link",
+        "href",
+        "page_url",
+        "product_url",
+        "category_url",
+    }
+    out: dict[str, Any] = {}
+    for k, v in params.items():
+        if k in url_keys:
+            out[k] = _normalize_url_value(v)
+        else:
+            out[k] = v
+    return out
+
+
 @dataclass
 class ScraperTaskConfig(ThordataBaseConfig):
     file_name: str
@@ -57,13 +104,18 @@ class ScraperTaskConfig(ThordataBaseConfig):
     parameters: dict[str, Any] | list[dict[str, Any]]
     universal_params: dict[str, Any] | None = None
     include_errors: bool = True
+    data_format: DataFormat | str | None = (
+        None  # Support json, csv, xlsx output formats
+    )
 
     def to_payload(self) -> dict[str, Any]:
-        # [removed comment not shown in this diff view]
+        # Normalize parameters: decode percent-encoded URLs to reduce API/Dashboard divergence
        if isinstance(self.parameters, list):
-            [1 removed line not shown in this diff view]
+            normalized_list = [_normalize_parameters(p) for p in self.parameters]
+            params_json = json.dumps(normalized_list)
        else:
-            [1 removed line not shown in this diff view]
+            normalized_one = _normalize_parameters(self.parameters)
+            params_json = json.dumps([normalized_one])
 
        payload: dict[str, Any] = {
            "file_name": self.file_name,
@@ -74,6 +126,14 @@ class ScraperTaskConfig(ThordataBaseConfig):
        }
        if self.universal_params:
            payload["spider_universal"] = json.dumps(self.universal_params)
+        # Add data_format if specified (for json/csv/xlsx output)
+        if self.data_format:
+            fmt = (
+                self.data_format.value
+                if isinstance(self.data_format, DataFormat)
+                else str(self.data_format).lower()
+            )
+            payload["data_format"] = fmt
        return payload
 
 
@@ -87,7 +147,6 @@ class VideoTaskConfig(ThordataBaseConfig):
     include_errors: bool = True
 
     def to_payload(self) -> dict[str, Any]:
-        # Handle batch parameters
        if isinstance(self.parameters, list):
            params_json = json.dumps(self.parameters)
        else:
@@ -99,13 +158,8 @@ class VideoTaskConfig(ThordataBaseConfig):
            "spider_name": self.spider_name,
            "spider_parameters": params_json,
            "spider_errors": "true" if self.include_errors else "false",
-            # v2.0 Doc explicitly requires 'spider_universal' key for video tasks too sometimes,
-            # but usually it's passed as 'common_settings' or 'spider_universal'.
-            # Sticking to original models.py key logic for now to ensure stability.
            "spider_universal": self.common_settings.to_json(),
        }
-        # Note: If API expects 'common_settings' key specifically, adjust here.
-        # Based on v2 context, video builder often uses spider_universal.
        return payload
 
 
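A sketch of the new URL-normalization helpers; both are module-internal (underscore-prefixed), so this only illustrates the behaviour `ScraperTaskConfig.to_payload()` now applies to URL-bearing parameter keys:

from thordata.types.task import _normalize_parameters, _normalize_url_value

# Percent-encoded URLs are decoded so API/SDK submissions match manual Dashboard input.
assert _normalize_url_value("https://example.com/?a=1%26b%3D2") == "https://example.com/?a=1&b=2"

params = {"url": "https://example.com/p%2Fq", "keyword": "red%20shoes"}
normalized = _normalize_parameters(params)
assert normalized["url"] == "https://example.com/p/q"   # "url" is one of the normalized keys
assert normalized["keyword"] == "red%20shoes"            # non-URL keys are passed through untouched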
thordata/types/universal.py
CHANGED
@@ -15,12 +15,15 @@ from .common import ThordataBaseConfig
 class UniversalScrapeRequest(ThordataBaseConfig):
     url: str
     js_render: bool = False
-    output_format: str
+    output_format: str | list[str] = (
+        "html"  # 'html', 'png', or ['png', 'html'] for both
+    )
     country: str | None = None
-    block_resources: str | None = None  # 'script,image'
+    block_resources: str | None = None  # 'script,image,video'
     clean_content: str | None = None  # 'js,css'
     wait: int | None = None  # ms
     wait_for: str | None = None  # selector
+    follow_redirect: bool | None = None  # Follow redirects
 
     # Headers/Cookies must be serialized to JSON in payload
     headers: list[dict[str, str]] | None = None
@@ -29,12 +32,26 @@ class UniversalScrapeRequest(ThordataBaseConfig):
     extra_params: dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self) -> None:
+        # Normalize output_format to list for easier handling
+        if isinstance(self.output_format, str):
+            formats = [f.strip().lower() for f in self.output_format.split(",")]
+        else:
+            formats = [
+                f.lower() if isinstance(f, str) else str(f).lower()
+                for f in self.output_format
+            ]
+
        valid_formats = {"html", "png"}
-        if [rest of removed line not shown in this diff view]
+        invalid = [f for f in formats if f not in valid_formats]
+        if invalid:
            raise ValueError(
-                f"Invalid output_format: { [rest of removed line not shown in this diff view]
+                f"Invalid output_format: {invalid}. Must be one or more of: {valid_formats}. "
+                f"Use comma-separated string like 'png,html' or list ['png', 'html'] for multiple formats."
            )
 
+        # Store as list for to_payload
+        self._output_formats = formats
+
        if self.wait is not None and (self.wait < 0 or self.wait > 100000):
            raise ValueError("wait must be between 0 and 100000 milliseconds")
 
@@ -42,9 +59,22 @@ class UniversalScrapeRequest(ThordataBaseConfig):
        payload: dict[str, Any] = {
            "url": self.url,
            "js_render": "True" if self.js_render else "False",
-            "type": self.output_format.lower(),
        }
 
+        # Handle output format: support single or multiple formats (e.g., "png,html")
+        if hasattr(self, "_output_formats") and self._output_formats:
+            if len(self._output_formats) == 1:
+                payload["type"] = self._output_formats[0]
+            else:
+                # Multiple formats: join with comma (e.g., "png,html")
+                payload["type"] = ",".join(self._output_formats)
+        else:
+            # Fallback for backward compatibility
+            if isinstance(self.output_format, str):
+                payload["type"] = self.output_format.lower()
+            else:
+                payload["type"] = ",".join([str(f).lower() for f in self.output_format])
+
        if self.country:
            payload["country"] = self.country.lower()
        if self.block_resources:
@@ -55,6 +85,8 @@ class UniversalScrapeRequest(ThordataBaseConfig):
            payload["wait"] = str(self.wait)
        if self.wait_for:
            payload["wait_for"] = self.wait_for
+        if self.follow_redirect is not None:
+            payload["follow_redirect"] = "True" if self.follow_redirect else "False"
 
        # Serialize complex objects as JSON strings
        if self.headers:
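A sketch of the multi-format handling added to `UniversalScrapeRequest`; both spellings normalize to the same comma-joined `type` value in the payload:

from thordata.types.universal import UniversalScrapeRequest

# A comma-separated string and a list normalize to the same payload value.
req_a = UniversalScrapeRequest(url="https://example.com", output_format="png,html")
req_b = UniversalScrapeRequest(url="https://example.com", output_format=["png", "html"])
assert req_a.to_payload()["type"] == "png,html"
assert req_b.to_payload()["type"] == "png,html"

# Anything outside {"html", "png"} is rejected in __post_init__.
try:
    UniversalScrapeRequest(url="https://example.com", output_format="pdf")
except ValueError as err:
    print(err)  # Invalid output_format: ['pdf']. Must be one or more of: {'html', 'png'} ...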
{thordata_sdk-1.6.0.dist-info → thordata_sdk-1.8.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: thordata-sdk
-Version: 1.6.0
+Version: 1.8.0
 Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
 Author-email: Thordata Developer Team <support@thordata.com>
 License: MIT
@@ -40,6 +40,8 @@ Requires-Dist: ruff>=0.1.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
 Requires-Dist: types-requests>=2.28.0; extra == "dev"
 Requires-Dist: aioresponses>=0.7.6; extra == "dev"
+Provides-Extra: browser
+Requires-Dist: playwright>=1.40.0; extra == "browser"
 Dynamic: license-file
 
 # Thordata Python SDK
@@ -63,7 +65,7 @@ Dynamic: license-file
 
 ## 📖 Introduction
 
-The **Thordata Python SDK v1.6.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
+The **Thordata Python SDK v1.8.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
 
 **Why v1.6.0?**
 * **🛡️ Bulletproof Networking**: Custom core handles `HTTP`, `HTTPS`, and `SOCKS5h` (Remote DNS) tunneling, solving common SSL/TLS handshake issues in complex network environments.
{thordata_sdk-1.6.0.dist-info → thordata_sdk-1.8.0.dist-info}/RECORD
CHANGED
@@ -1,14 +1,17 @@
-thordata/__init__.py,sha256=
+thordata/__init__.py,sha256=easXVqOOb6hZk0NtsaSJ0lOCzmKu6O24-Y68fukCwcY,2287
 thordata/_utils.py,sha256=Acr_6sHgdZXU7SQozd6FEYTZV6iHw__nlhpBTDwb66U,4917
-thordata/async_client.py,sha256=
+thordata/async_client.py,sha256=3tb9zg7eqTxxaC46Npei21a-oeLgSi3aaUTwqJygp88,42308
 thordata/async_unlimited.py,sha256=kzTksFkN21rDM21Pwy3hcayjfyGYNGGyGR3fRLtZC6I,4510
-thordata/client.py,sha256=
+thordata/client.py,sha256=DARFPtxJdkz0tjh8cKjP0nnwN-oQEkPZXclseCUJ2LY,59074
 thordata/enums.py,sha256=dO5QWpPFLpYP2GfLAdoFtxMTemhGNdr_NPqBoYfSFkk,764
-thordata/exceptions.py,sha256=
+thordata/exceptions.py,sha256=ntiq3F5sxAiEDmCnlcfS2GNb3Qa7DpRvMrhmgXhAGIg,11947
 thordata/models.py,sha256=7GshQklo5aqke_ZQ2QIXiz9Ac5v6IRtvjWIjsBKEq6A,853
 thordata/retry.py,sha256=X6Sa5IIb5EWD5fUJjKyhvWJyWQGPVgxLB3-vKoWfa5Q,11453
 thordata/serp_engines.py,sha256=iuMWncelcGOskCHXFzpcPMMTL5qfiLkazHB1uj3zpZo,5985
 thordata/unlimited.py,sha256=RzrtwcotYlbOWuSLysDyI75IkMVL7ygdfE9HKNoe02M,6087
+thordata/browser/__init__.py,sha256=nu4cUeQDOhBV4LIJmgOZlsSdSXkmuuGhdXQBHo2yCvU,498
+thordata/browser/exceptions.py,sha256=IRHmH-4dbXWbV6wDMKxDUsKuFGRseZdsYBgspzhcsyM,428
+thordata/browser/session.py,sha256=FuwdUNpEzCIBVBv60gQO2c7bZ3S-MZ8bhGScG_NtDVQ,15686
 thordata/core/__init__.py,sha256=EFT6mZpSdec_7uFUpSpDDHVwbTxy314uxJC_uprR6J4,500
 thordata/core/async_http_client.py,sha256=KKsmhXN6bWRTDFvqa0H-WRf4R-TWH8WSgpDBRv6TEvg,3052
 thordata/core/http_client.py,sha256=8lSwclmVweM-Go1qMW36zYnMKAUT_9RyDdPF7qMS4-Y,2280
@@ -25,11 +28,11 @@ thordata/tools/video.py,sha256=HUFqdue-dtWmTVlYtmf5ffzuYDIzw5l3wk3Vr7AXQW0,4689
 thordata/types/__init__.py,sha256=hlLt5UCVm7QdeOCN5_YWXS4Vy8tJUhIp0XbWjAoQiQg,1357
 thordata/types/common.py,sha256=hkTZ1QtokpE1yT9BvTmYfQz9AUjeCIIPvjib2pnq_Ag,2818
 thordata/types/proxy.py,sha256=IU45wQHCBOIlbdcCN9veypAkDT0q9NIikLu674CudOU,10438
-thordata/types/serp.py,sha256=
-thordata/types/task.py,sha256=
-thordata/types/universal.py,sha256=
-thordata_sdk-1.6.0.dist-info/… (five 1.6.0 dist-info entries, hashes not shown in this diff view)
+thordata/types/serp.py,sha256=2pOw2mrl1RzMZMSR9ZJCZ2ggYj8f4zRg0TKy8l2ye0o,7340
+thordata/types/task.py,sha256=PVKNyzXZHNunThVb5dwHDtMZ9WmFpbaePDnkeV754CQ,6264
+thordata/types/universal.py,sha256=8OIZs239fBxzSuLEe3VB9qCp1ddN5XKAXbpVm9MJuls,3631
+thordata_sdk-1.8.0.dist-info/licenses/LICENSE,sha256=bAxpWgQIzb-5jl3nhLdOwOJ_vlbHLtSG7yev2B7vioY,1088
+thordata_sdk-1.8.0.dist-info/METADATA,sha256=OI-f2KDEjN96Kt8KtZcbtPWcSJse34MfdlWmUSuNp0s,9386
+thordata_sdk-1.8.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+thordata_sdk-1.8.0.dist-info/top_level.txt,sha256=Z8R_07m0lXCCSb1hapL9_nxMtyO3rf_9wOvq4n9u2Hg,9
+thordata_sdk-1.8.0.dist-info/RECORD,,
File without changes
|
|
File without changes
|
|
File without changes
|