thordata-sdk 1.7.0__tar.gz → 1.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {thordata_sdk-1.7.0/src/thordata_sdk.egg-info → thordata_sdk-1.8.1}/PKG-INFO +4 -2
  2. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/README.md +1 -1
  3. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/pyproject.toml +4 -1
  4. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/__init__.py +1 -1
  5. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/async_client.py +33 -4
  6. thordata_sdk-1.8.1/src/thordata/browser/__init__.py +17 -0
  7. thordata_sdk-1.8.1/src/thordata/browser/exceptions.py +23 -0
  8. thordata_sdk-1.8.1/src/thordata/browser/session.py +469 -0
  9. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/client.py +3 -1
  10. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/types/serp.py +31 -3
  11. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1/src/thordata_sdk.egg-info}/PKG-INFO +4 -2
  12. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata_sdk.egg-info/SOURCES.txt +4 -0
  13. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata_sdk.egg-info/requires.txt +3 -0
  14. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_async_client.py +3 -1
  15. thordata_sdk-1.8.1/tests/test_browser.py +108 -0
  16. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/LICENSE +0 -0
  17. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/setup.cfg +0 -0
  18. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/_utils.py +0 -0
  19. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/async_unlimited.py +0 -0
  20. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/core/__init__.py +0 -0
  21. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/core/async_http_client.py +0 -0
  22. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/core/http_client.py +0 -0
  23. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/core/tunnel.py +0 -0
  24. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/enums.py +0 -0
  25. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/exceptions.py +0 -0
  26. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/models.py +0 -0
  27. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/retry.py +0 -0
  28. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/serp_engines.py +0 -0
  29. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/tools/__init__.py +0 -0
  30. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/tools/base.py +0 -0
  31. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/tools/code.py +0 -0
  32. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/tools/ecommerce.py +0 -0
  33. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/tools/professional.py +0 -0
  34. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/tools/search.py +0 -0
  35. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/tools/social.py +0 -0
  36. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/tools/travel.py +0 -0
  37. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/tools/video.py +0 -0
  38. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/types/__init__.py +0 -0
  39. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/types/common.py +0 -0
  40. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/types/proxy.py +0 -0
  41. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/types/task.py +0 -0
  42. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/types/universal.py +0 -0
  43. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata/unlimited.py +0 -0
  44. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata_sdk.egg-info/dependency_links.txt +0 -0
  45. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/src/thordata_sdk.egg-info/top_level.txt +0 -0
  46. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_async_client_errors.py +0 -0
  47. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_batch_creation.py +0 -0
  48. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_client.py +0 -0
  49. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_client_errors.py +0 -0
  50. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_enums.py +0 -0
  51. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_examples.py +0 -0
  52. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_exceptions.py +0 -0
  53. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_integration_proxy_protocols.py +0 -0
  54. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_models.py +0 -0
  55. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_retry.py +0 -0
  56. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_spec_parity.py +0 -0
  57. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_task_status_and_wait.py +0 -0
  58. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_tools.py +0 -0
  59. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_tools_coverage.py +0 -0
  60. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_unlimited.py +0 -0
  61. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_user_agent.py +0 -0
  62. {thordata_sdk-1.7.0 → thordata_sdk-1.8.1}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: thordata-sdk
-Version: 1.7.0
+Version: 1.8.1
 Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
 Author-email: Thordata Developer Team <support@thordata.com>
 License: MIT
@@ -40,6 +40,8 @@ Requires-Dist: ruff>=0.1.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
 Requires-Dist: types-requests>=2.28.0; extra == "dev"
 Requires-Dist: aioresponses>=0.7.6; extra == "dev"
+Provides-Extra: browser
+Requires-Dist: playwright>=1.40.0; extra == "browser"
 Dynamic: license-file
 
 # Thordata Python SDK
@@ -63,7 +65,7 @@ Dynamic: license-file
 
 ## 📖 Introduction
 
-The **Thordata Python SDK v1.6.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
+The **Thordata Python SDK v1.8.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
 
 **Why v1.6.0?**
 * **🛡️ Bulletproof Networking**: Custom core handles `HTTP`, `HTTPS`, and `SOCKS5h` (Remote DNS) tunneling, solving common SSL/TLS handshake issues in complex network environments.
@@ -19,7 +19,7 @@
 
 ## 📖 Introduction
 
-The **Thordata Python SDK v1.6.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
+The **Thordata Python SDK v1.8.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
 
 **Why v1.6.0?**
 * **🛡️ Bulletproof Networking**: Custom core handles `HTTP`, `HTTPS`, and `SOCKS5h` (Remote DNS) tunneling, solving common SSL/TLS handshake issues in complex network environments.
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "thordata-sdk"
-version = "1.7.0"
+version = "1.8.1"
 description = "The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network."
 readme = "README.md"
 requires-python = ">=3.9"
@@ -61,6 +61,9 @@ dev = [
     "types-requests>=2.28.0",
     "aioresponses>=0.7.6",
 ]
+browser = [
+    "playwright>=1.40.0",
+]
 
 [project.urls]
 "Homepage" = "https://www.thordata.com"
@@ -5,7 +5,7 @@ Official Python client for Thordata's Proxy Network, SERP API,
 Universal Scraping API (Web Unlocker), and Web Scraper API.
 """
 
-__version__ = "1.6.0"
+__version__ = "1.8.1"
__author__ = "Thordata Developer Team/Kael Odin"
 __email__ = "support@thordata.com"
 
@@ -244,6 +244,7 @@ class AsyncThordataClient:
         render_js: bool | None = None,
         no_cache: bool | None = None,
         output_format: str = "json",
+        ai_overview: bool = False,
         **kwargs: Any,
     ) -> dict[str, Any]:
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
@@ -258,13 +259,14 @@ class AsyncThordataClient:
             render_js=render_js,
             no_cache=no_cache,
             output_format=output_format,
+            ai_overview=ai_overview,
             extra_params=kwargs,
         )
         return await self.serp_search_advanced(request)
 
     async def serp_search_advanced(self, request: SerpRequest) -> dict[str, Any]:
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError("scraper_token is required for SERP API")
         payload = request.to_payload()
         headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
         logger.info(f"Async SERP: {request.engine} - {request.query}")
@@ -324,7 +326,7 @@ class AsyncThordataClient:
         self, request: UniversalScrapeRequest
     ) -> str | bytes | dict[str, str | bytes]:
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError("scraper_token is required for Universal API")
         payload = request.to_payload()
         headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
 
@@ -448,7 +450,7 @@ class AsyncThordataClient:
     async def create_scraper_task_advanced(self, config: ScraperTaskConfig) -> str:
         self._require_public_credentials()
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError("scraper_token is required for Task Builder")
         payload = config.to_payload()
         headers = build_builder_headers(
             self.scraper_token, str(self.public_token), str(self.public_key)
@@ -486,7 +488,9 @@ class AsyncThordataClient:
     async def create_video_task_advanced(self, config: VideoTaskConfig) -> str:
         self._require_public_credentials()
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError(
+                "scraper_token is required for Video Task Builder"
+            )
         payload = config.to_payload()
         headers = build_builder_headers(
             self.scraper_token, str(self.public_token), str(self.public_key)
@@ -1104,3 +1108,28 @@ class AsyncThordataClient:
         safe_user = quote(final_user, safe="")
         safe_pass = quote(pwd, safe="")
         return f"wss://{safe_user}:{safe_pass}@ws-browser.thordata.com"
+
+    @property
+    def browser(self):
+        """Get a browser session for automation.
+
+        Requires playwright: pip install thordata[browser]
+
+        Returns:
+            BrowserSession instance
+
+        Example:
+            async with AsyncThordataClient() as client:
+                session = client.browser
+                await session.navigate("https://example.com")
+                snapshot = await session.snapshot()
+        """
+        try:
+            from .browser import BrowserSession
+
+            return BrowserSession(self)
+        except ImportError as e:
+            raise ImportError(
+                "Playwright is required for browser automation. "
+                "Install it with: pip install thordata[browser]"
+            ) from e
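
For context, a minimal end-to-end sketch of the new `browser` property. The method names come from this diff and from `session.py` below; the token value, target URL, and the presence of the env-var credentials are assumptions:

```python
import asyncio

from thordata import AsyncThordataClient


async def main() -> None:
    # Assumes THORDATA_BROWSER_USERNAME / THORDATA_BROWSER_PASSWORD are set,
    # as required by BrowserSession.get_browser() in the session.py diff below.
    async with AsyncThordataClient(scraper_token="...") as client:  # placeholder token
        session = client.browser  # raises ImportError without the [browser] extra
        try:
            await session.navigate("https://example.com")  # hypothetical target
            snapshot = await session.snapshot(max_items=20)
            print(snapshot["title"])
        finally:
            await session.close()  # closes pages, browsers, and Playwright


asyncio.run(main())
```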
@@ -0,0 +1,17 @@
+"""Browser automation module for Thordata Scraping Browser.
+
+This module provides high-level browser automation capabilities using Playwright.
+Requires optional dependency: pip install thordata[browser]
+"""
+
+from __future__ import annotations
+
+from .exceptions import BrowserConnectionError, BrowserError
+
+try:
+    from .session import BrowserSession
+
+    __all__ = ["BrowserSession", "BrowserError", "BrowserConnectionError"]
+except ImportError:
+    # Playwright not installed - BrowserSession not available
+    __all__ = ["BrowserError", "BrowserConnectionError"]
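
Because `__all__` shrinks when Playwright is absent, callers can feature-detect the optional extra instead of catching `ImportError`; a minimal sketch assuming only the names exported above:

```python
# Hedged sketch: detect whether the optional [browser] extra is usable.
import thordata.browser as browser

if "BrowserSession" in browser.__all__:
    print("Playwright installed: BrowserSession is available")
else:
    # Only BrowserError / BrowserConnectionError are exported in this case.
    print("Install the extra: pip install thordata[browser]")
```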
@@ -0,0 +1,23 @@
+"""Browser automation exceptions."""
+
+from __future__ import annotations
+
+from ..exceptions import ThordataError
+
+
+class BrowserError(ThordataError):
+    """Base exception for browser automation errors."""
+
+    pass
+
+
+class BrowserConnectionError(BrowserError):
+    """Raised when browser connection fails."""
+
+    pass
+
+
+class BrowserSessionError(BrowserError):
+    """Raised when browser session operations fail."""
+
+    pass
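
Since both new exceptions derive from `BrowserError` (itself a `ThordataError`), one handler can cover connection and session failures alike; a sketch, with the session call purely illustrative:

```python
from thordata.browser import BrowserConnectionError, BrowserError


async def safe_click(session, ref: str) -> None:
    try:
        await session.click_ref(ref)
    except BrowserConnectionError:
        # Raised by get_browser() after its retries are exhausted.
        print("could not reach the Scraping Browser")
    except BrowserError as e:
        # Catches BrowserSessionError and any future subclasses.
        print(f"browser operation failed: {e}")
```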
@@ -0,0 +1,469 @@
+"""Browser session management for Thordata Scraping Browser.
+
+This module provides a high-level wrapper around Playwright connected to
+Thordata's Scraping Browser.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+from urllib.parse import urlparse
+
+try:
+    from playwright.async_api import Browser, Page, Playwright, async_playwright
+except ImportError as e:
+    raise ImportError(
+        "Playwright is required for browser automation. "
+        "Install it with: pip install thordata[browser]"
+    ) from e
+
+from ..async_client import AsyncThordataClient
+from .exceptions import BrowserConnectionError, BrowserSessionError
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserSession:
+    """Domain-aware browser session wrapper for Thordata Scraping Browser."""
+
+    def __init__(
+        self,
+        client: AsyncThordataClient,
+        username: str | None = None,
+        password: str | None = None,
+    ) -> None:
+        """Initialize browser session.
+
+        Args:
+            client: AsyncThordataClient instance
+            username: Browser username (optional, can use env var)
+            password: Browser password (optional, can use env var)
+        """
+        self._client = client
+        self._username = username
+        self._password = password
+        self._playwright: Playwright | None = None
+        self._browsers: dict[str, Browser] = {}
+        self._pages: dict[str, Page] = {}
+        self._current_domain: str = "default"
+
+    @staticmethod
+    def _get_domain(url: str) -> str:
+        """Extract domain from URL."""
+        try:
+            parsed = urlparse(url)
+            return parsed.hostname or "default"
+        except Exception:
+            return "default"
+
+    async def _ensure_playwright(self) -> Playwright:
+        """Ensure Playwright is started."""
+        if self._playwright is None:
+            self._playwright = await async_playwright().start()
+        return self._playwright
+
+    async def get_browser(self, domain: str = "default") -> Browser:
+        """Get or create a browser instance for a given domain."""
+        existing = self._browsers.get(domain)
+        if existing and existing.is_connected():
+            return existing
+
+        # Clean up stale browser/page
+        if existing is not None:
+            logger.info("Browser for domain %s disconnected, recreating", domain)
+            self._browsers.pop(domain, None)
+            self._pages.pop(domain, None)
+
+        playwright = await self._ensure_playwright()
+
+        logger.info("Connecting to Thordata Scraping Browser for domain %s", domain)
+
+        # Get browser credentials
+        import os
+
+        user = self._username or os.getenv("THORDATA_BROWSER_USERNAME")
+        pwd = self._password or os.getenv("THORDATA_BROWSER_PASSWORD")
+
+        if not user or not pwd:
+            raise BrowserConnectionError(
+                "Missing browser credentials. Set THORDATA_BROWSER_USERNAME and "
+                "THORDATA_BROWSER_PASSWORD or pass them to BrowserSession."
+            )
+
+        # Retry logic with exponential backoff
+        max_retries = 3
+        last_error = None
+
+        for attempt in range(max_retries):
+            try:
+                ws_url = self._client.get_browser_connection_url(
+                    username=user, password=pwd
+                )
+                logger.debug(
+                    "Attempt %d/%d: Connecting to %s...",
+                    attempt + 1,
+                    max_retries,
+                    ws_url[:50],
+                )
+                browser = await playwright.chromium.connect_over_cdp(ws_url)
+                logger.info("Successfully connected to browser for domain %s", domain)
+                self._browsers[domain] = browser
+                return browser
+            except Exception as e:
+                last_error = e
+                logger.warning(
+                    "Browser connection attempt %d/%d failed: %s",
+                    attempt + 1,
+                    max_retries,
+                    e,
+                )
+
+                if attempt < max_retries - 1:
+                    import asyncio
+
+                    wait_time = 2**attempt  # Exponential backoff: 1s, 2s, 4s
+                    logger.info("Retrying in %d seconds...", wait_time)
+                    await asyncio.sleep(wait_time)
+
+        # If all retries failed, raise the last error
+        raise BrowserConnectionError(
+            f"Failed to connect to Thordata Scraping Browser after {max_retries} attempts. "
+            f"Last error: {last_error}"
+        ) from last_error
+
+    async def get_page(self, url: str | None = None) -> Page:
+        """Get or create a page for the current (or provided) domain."""
+        if url:
+            self._current_domain = self._get_domain(url)
+        domain = self._current_domain
+
+        existing = self._pages.get(domain)
+        if existing and not existing.is_closed():
+            return existing
+
+        browser = await self.get_browser(domain)
+        contexts = browser.contexts
+        if not contexts:
+            context = await browser.new_context()
+        else:
+            context = contexts[0]
+
+        pages = context.pages
+        if pages:
+            page = pages[0]
+        else:
+            page = await context.new_page()
+
+        self._pages[domain] = page
+        return page
+
+    async def navigate(self, url: str, timeout: int = 120000) -> dict[str, Any]:
+        """Navigate to a URL.
+
+        Args:
+            url: Target URL
+            timeout: Navigation timeout in milliseconds
+
+        Returns:
+            Dictionary with url and title
+        """
+        page = await self.get_page(url)
+        if page.url != url:
+            await page.goto(url, timeout=timeout)
+        title = await page.title()
+        return {"url": page.url, "title": title}
+
+    async def snapshot(
+        self, filtered: bool = True, max_items: int = 80
+    ) -> dict[str, Any]:
+        """Capture an ARIA-like snapshot of the current page.
+
+        Args:
+            filtered: Whether to filter to interactive elements only
+            max_items: Maximum number of elements to include
+
+        Returns:
+            Dictionary with url, title, and aria_snapshot
+        """
+        page = await self.get_page()
+        full_snapshot = await self._get_interactive_snapshot(page)
+
+        if not filtered:
+            return {
+                "url": page.url,
+                "title": await page.title(),
+                "aria_snapshot": full_snapshot,
+            }
+
+        # Filter and limit
+        filtered_snapshot = self._filter_snapshot(full_snapshot)
+        filtered_snapshot = self._limit_snapshot_items(
+            filtered_snapshot, max_items=max_items
+        )
+
+        return {
+            "url": page.url,
+            "title": await page.title(),
+            "aria_snapshot": filtered_snapshot,
+        }
+
+    async def click_ref(
+        self, ref: str, wait_for_navigation_ms: int | None = None
+    ) -> dict[str, Any]:
+        """Click an element by its ref ID.
+
+        Args:
+            ref: The ref ID from snapshot (e.g., "1" or "dom-1")
+            wait_for_navigation_ms: Optional wait time in ms to detect navigation
+
+        Returns:
+            Dictionary with click result information
+        """
+        page = await self.get_page()
+        url_before = page.url
+
+        try:
+            locator = page.locator(f'[data-fastmcp-ref="{ref}"]').first
+            await locator.click(timeout=5000)
+
+            # Check for navigation if requested
+            did_navigate = False
+            url_after = url_before
+            if wait_for_navigation_ms and wait_for_navigation_ms > 0:
+                import asyncio
+
+                await asyncio.sleep(wait_for_navigation_ms / 1000)
+                url_after = page.url
+                did_navigate = url_after != url_before
+
+            return {
+                "message": "Successfully clicked element",
+                "ref": ref,
+                "url_before": url_before,
+                "url_after": url_after,
+                "did_navigate": did_navigate,
+            }
+        except Exception as e:
+            raise BrowserSessionError(f"Failed to click element: {e}") from e
+
+    async def type_ref(
+        self, ref: str, text: str, submit: bool = False
+    ) -> dict[str, Any]:
+        """Type text into an element by its ref ID.
+
+        Args:
+            ref: The ref ID from snapshot
+            text: Text to type
+            submit: Whether to press Enter after typing
+
+        Returns:
+            Dictionary with type result information
+        """
+        page = await self.get_page()
+        url_before = page.url
+
+        try:
+            locator = page.locator(f'[data-fastmcp-ref="{ref}"]').first
+            await locator.fill(text)
+            if submit:
+                await locator.press("Enter")
+
+            return {
+                "message": "Typed into element" + (" and submitted" if submit else ""),
+                "ref": ref,
+                "url_before": url_before,
+                "url_after": page.url,
+            }
+        except Exception as e:
+            raise BrowserSessionError(f"Failed to type into element: {e}") from e
+
+    async def screenshot_page(self, full_page: bool = False) -> bytes:
+        """Take a screenshot of the current page.
+
+        Args:
+            full_page: Whether to capture full page or viewport only
+
+        Returns:
+            Screenshot as bytes (PNG format)
+        """
+        page = await self.get_page()
+        return await page.screenshot(full_page=full_page)
+
+    async def get_html(self, full_page: bool = False) -> str:
+        """Get the HTML content of the current page.
+
+        Args:
+            full_page: Whether to get full page HTML or body only
+
+        Returns:
+            HTML content as string
+        """
+        page = await self.get_page()
+        if full_page:
+            return await page.content()
+        else:
+            try:
+                return await page.evaluate("document.body.innerHTML")
+            except Exception:
+                return await page.content()
+
+    async def scroll(self) -> dict[str, Any]:
+        """Scroll to the bottom of the page.
+
+        Returns:
+            Dictionary with scroll result
+        """
+        page = await self.get_page()
+        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+        return {"message": "Scrolled to bottom"}
+
+    async def go_back(self) -> dict[str, Any]:
+        """Navigate back in browser history.
+
+        Returns:
+            Dictionary with new URL
+        """
+        page = await self.get_page()
+        await page.go_back()
+        return {"url": page.url}
+
+    async def _get_interactive_snapshot(self, page: Page) -> str:
+        """Generate a text snapshot of interactive elements with refs."""
+        script = """
+        () => {
+            function getSnapshot() {
+                const lines = [];
+                let refCounter = 0;
+
+                function normalizeRole(tag, explicitRole) {
+                    const role = (explicitRole || '').toLowerCase();
+                    const t = (tag || '').toLowerCase();
+                    if (role) return role;
+                    if (t === 'a') return 'link';
+                    if (t === 'button') return 'button';
+                    if (t === 'input') return 'textbox';
+                    if (t === 'select') return 'combobox';
+                    if (t === 'textarea') return 'textbox';
+                    return t;
+                }
+
+                function traverse(node) {
+                    if (node.nodeType === Node.ELEMENT_NODE) {
+                        const tag = node.tagName.toLowerCase();
+                        const interactiveTag = ['a', 'button', 'input', 'select', 'textarea'].includes(tag);
+                        const role = normalizeRole(tag, node.getAttribute('role'));
+                        const interactiveRole = ['button', 'link', 'textbox', 'searchbox', 'combobox', 'checkbox', 'radio', 'switch', 'tab', 'menuitem', 'option'].includes(role);
+
+                        if (interactiveTag || interactiveRole) {
+                            if (!node.dataset.fastmcpRef) {
+                                node.dataset.fastmcpRef = (++refCounter).toString();
+                            }
+                            let name = node.innerText || node.getAttribute('aria-label') || '';
+                            name = (name || '').replace(/\\s+/g, ' ').trim().substring(0, 80);
+
+                            lines.push(`- ${role} "${name}" [ref=${node.dataset.fastmcpRef}]`);
+                            if (node.href) {
+                                lines.push(`  /url: "${node.href}"`);
+                            }
+                        }
+                    }
+
+                    node.childNodes.forEach(child => traverse(child));
+                }
+
+                traverse(document.body);
+                return lines.join('\\n');
+            }
+            return getSnapshot();
+        }
+        """
+        return await page.evaluate(script)
+
+    @staticmethod
+    def _filter_snapshot(snapshot_text: str) -> str:
+        """Filter snapshot to interactive elements only."""
+        import re
+
+        lines = snapshot_text.split("\n")
+        filtered = []
+        i = 0
+        while i < len(lines):
+            line = lines[i]
+            trimmed = line.strip()
+
+            if not trimmed or not trimmed.startswith("-"):
+                i += 1
+                continue
+
+            # Extract role
+            role_match = re.match(r"^-\s+([a-zA-Z]+)", trimmed)
+            if not role_match:
+                i += 1
+                continue
+
+            role = role_match.group(1).lower()
+            interactive_roles = {
+                "button",
+                "link",
+                "textbox",
+                "searchbox",
+                "combobox",
+                "checkbox",
+                "radio",
+                "switch",
+                "tab",
+                "menuitem",
+                "option",
+            }
+
+            if role in interactive_roles:
+                filtered.append(line)
+                # Include next line if it's a URL
+                if i + 1 < len(lines) and "/url:" in lines[i + 1]:
+                    filtered.append(lines[i + 1])
+                    i += 1
+
+            i += 1
+
+        return "\n".join(filtered)
+
+    @staticmethod
+    def _limit_snapshot_items(text: str, *, max_items: int) -> str:
+        """Limit snapshot to the first N interactive element blocks."""
+        if max_items <= 0:
+            return ""
+        if not text:
+            return text
+
+        lines = text.splitlines()
+        out: list[str] = []
+        items = 0
+        for line in lines:
+            if line.startswith("- ") or line.startswith("["):
+                if items >= max_items:
+                    break
+                items += 1
+            if items > 0:
+                out.append(line)
+        return "\n".join(out).strip()
+
+    async def close(self) -> None:
+        """Cleanly close all pages, browsers, and Playwright."""
+        import contextlib
+
+        for page in list(self._pages.values()):
+            with contextlib.suppress(Exception):
+                await page.close()
+        self._pages.clear()
+
+        for browser in list(self._browsers.values()):
+            with contextlib.suppress(Exception):
+                await browser.close()
+        self._browsers.clear()
+
+        if self._playwright is not None:
+            try:
+                await self._playwright.stop()
+            finally:
+                self._playwright = None
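
The snapshot/ref cycle above is the intended driving loop: take a snapshot, pick a `[ref=N]` from the `aria_snapshot` text, then act on it. A sketch under the assumption that credentials are configured and that the target page actually exposes the referenced elements:

```python
# Hedged sketch of the snapshot -> act loop; refs "1" and "2" and the search
# text are placeholders that depend on what the target page exposes.
async def search_flow(session) -> bytes:
    await session.navigate("https://example.com")
    result = await session.snapshot(filtered=True, max_items=40)
    print(result["aria_snapshot"])  # lines like: - textbox "Search" [ref=1]
    await session.type_ref("1", "thordata", submit=True)
    await session.click_ref("2", wait_for_navigation_ms=2000)
    return await session.screenshot_page(full_page=True)
```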
@@ -309,6 +309,7 @@ class ThordataClient:
         render_js: bool | None = None,
         no_cache: bool | None = None,
         output_format: str = "json",
+        ai_overview: bool = False,
         **kwargs: Any,
     ) -> dict[str, Any]:
         engine_str = engine.value if isinstance(engine, Engine) else engine.lower()
@@ -324,6 +325,7 @@ class ThordataClient:
             render_js=render_js,
             no_cache=no_cache,
             output_format=output_format,
+            ai_overview=ai_overview,
             extra_params=kwargs,
         )
         return self.serp_search_advanced(request)
@@ -396,7 +398,7 @@ class ThordataClient:
         self, request: UniversalScrapeRequest
     ) -> str | bytes | dict[str, str | bytes]:
         if not self.scraper_token:
-            raise ThordataConfigError("scraper_token required")
+            raise ThordataConfigError("scraper_token is required for Universal API")
 
         payload = request.to_payload()
         headers = build_auth_headers(self.scraper_token, mode=self._auth_mode)
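
A minimal sketch of the new parameter on the sync client (the token value is a placeholder, and per the serp.py change below, `ai_overview` is accepted only for `engine="google"`):

```python
from thordata import ThordataClient

client = ThordataClient(scraper_token="...")  # placeholder token
results = client.serp_search(
    "what is ai data infrastructure",
    engine="google",   # ai_overview is rejected for other engines
    ai_overview=True,  # adds ai_overview=true to the request payload
)
```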
@@ -14,6 +14,7 @@ from .common import ThordataBaseConfig
 class Engine(str, Enum):
     # Google
     GOOGLE = "google"
+    GOOGLE_AI_MODE = "google_ai_mode"
     GOOGLE_NEWS = "google_news"
     GOOGLE_SHOPPING = "google_shopping"
     GOOGLE_VIDEOS = "google_videos"
@@ -21,10 +22,18 @@ class Engine(str, Enum):
     GOOGLE_MAPS = "google_maps"
     GOOGLE_JOBS = "google_jobs"
     GOOGLE_PLAY = "google_play"
+    GOOGLE_PLAY_PRODUCT = "google_play_product"
+    GOOGLE_PLAY_GAMES = "google_play_games"
+    GOOGLE_PLAY_MOVIES = "google_play_movies"
+    GOOGLE_PLAY_BOOKS = "google_play_books"
     GOOGLE_TRENDS = "google_trends"
     GOOGLE_SCHOLAR = "google_scholar"
+    GOOGLE_SCHOLAR_CITE = "google_scholar_cite"
+    GOOGLE_SCHOLAR_AUTHOR = "google_scholar_author"
     GOOGLE_PATENTS = "google_patents"
+    GOOGLE_PATENTS_DETAILS = "google_patents_details"
     GOOGLE_FINANCE = "google_finance"
+    GOOGLE_FINANCE_MARKETS = "google_finance_markets"
     GOOGLE_FLIGHTS = "google_flights"
     GOOGLE_LENS = "google_lens"
     GOOGLE_HOTELS = "google_hotels"
@@ -40,7 +49,7 @@ class Engine(str, Enum):
     # Others
     YANDEX = "yandex"
     DUCKDUCKGO = "duckduckgo"
-    BAIDU = "baidu"
+    BAIDU = "baidu"  # Deprecated: Not supported by Dashboard
 
     # Legacy / Compatibility Aliases
     GOOGLE_SEARCH = "google_search"
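
Since `Engine` is a `str` enum, the new members drop straight into `serp_search`; a short sketch using two of the values added above (import path taken from the file layout; whether a top-level re-export exists is not shown in this diff):

```python
from thordata import ThordataClient
from thordata.types.serp import Engine

client = ThordataClient(scraper_token="...")  # placeholder token
# New engines behave like the existing ones; Engine members are plain strings.
scholar = client.serp_search("proxy networks", engine=Engine.GOOGLE_SCHOLAR_AUTHOR)
markets = client.serp_search("NASDAQ", engine=Engine.GOOGLE_FINANCE_MARKETS)
```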
@@ -117,12 +126,14 @@ class SerpRequest(ThordataBaseConfig):
     render_js: bool | None = None
     no_cache: bool | None = None
 
-    # Output format: "json" (json=1), "html" (json=3), "light_json" (json=4), or "both" (json=2)
+    # Output format: "json" (json=1), "html" (json=3), "light_json" (json=4)
+    # Note: "both" (json=2) format is not supported by Dashboard
     output_format: str = "json"
 
     # Advanced Google
     ludocid: str | None = None
     kgmid: str | None = None
+    ai_overview: bool = False  # Only supported for engine=google
 
     # Pass-through for any other param
     extra_params: dict[str, Any] = field(default_factory=dict)
@@ -155,7 +166,8 @@ class SerpRequest(ThordataBaseConfig):
         }
 
         # JSON output handling
-        # Dashboard mapping: json=1 (json), json=3 (html), json=4 (light json), json=2 (both)
+        # Dashboard mapping: json=1 (json), json=3 (html), json=4 (light json)
+        # Note: json=2 (both) format is not supported by Dashboard
         fmt = self.output_format.lower()
         if fmt == "json":
             payload["json"] = "1"
@@ -164,6 +176,14 @@ class SerpRequest(ThordataBaseConfig):
         elif fmt in ("light_json", "light-json", "lightjson"):
             payload["json"] = "4"
         elif fmt in ("2", "both", "json+html"):
+            import warnings
+
+            warnings.warn(
+                "The 'both' output format (json=2) is not supported by Dashboard. "
+                "Use 'json' or 'html' instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
             payload["json"] = "2"
         # If no json param is set, default to HTML (legacy behavior)
 
@@ -223,6 +243,14 @@ class SerpRequest(ThordataBaseConfig):
         if self.kgmid:
             payload["kgmid"] = self.kgmid
 
+        # AI Overview (only for Google engine)
+        if self.ai_overview:
+            if engine != "google":
+                raise ValueError(
+                    "ai_overview parameter is only supported for engine=google"
+                )
+            payload["ai_overview"] = "true"
+
         # Merge extras
         payload.update(self.extra_params)
         return payload
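
The guard above makes the invalid combination fail at payload-build time rather than at the API. A sketch of both outcomes; the `query` and `engine` field names are inferred from the client code earlier in this diff (`request.query`, `request.engine`):

```python
from thordata.types.serp import SerpRequest

# Valid: ai_overview on the Google engine adds ai_overview="true".
payload = SerpRequest(query="llm agents", engine="google", ai_overview=True).to_payload()
assert payload["ai_overview"] == "true"

# Invalid: any other engine raises before a request is ever sent.
try:
    SerpRequest(query="llm agents", engine="bing", ai_overview=True).to_payload()
except ValueError as e:
    print(e)  # ai_overview parameter is only supported for engine=google
```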
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: thordata-sdk
-Version: 1.7.0
+Version: 1.8.1
 Summary: The Official Python SDK for Thordata - AI Data Infrastructure & Proxy Network.
 Author-email: Thordata Developer Team <support@thordata.com>
 License: MIT
@@ -40,6 +40,8 @@ Requires-Dist: ruff>=0.1.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
 Requires-Dist: types-requests>=2.28.0; extra == "dev"
 Requires-Dist: aioresponses>=0.7.6; extra == "dev"
+Provides-Extra: browser
+Requires-Dist: playwright>=1.40.0; extra == "browser"
 Dynamic: license-file
 
 # Thordata Python SDK
@@ -63,7 +65,7 @@ Dynamic: license-file
 
 ## 📖 Introduction
 
-The **Thordata Python SDK v1.6.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
+The **Thordata Python SDK v1.8.0** is a production-ready wrapper for Thordata's AI data infrastructure. It is architected for high reliability, strict type safety, and maximum performance.
 
 **Why v1.6.0?**
 * **🛡️ Bulletproof Networking**: Custom core handles `HTTP`, `HTTPS`, and `SOCKS5h` (Remote DNS) tunneling, solving common SSL/TLS handshake issues in complex network environments.
@@ -12,6 +12,9 @@ src/thordata/models.py
 src/thordata/retry.py
 src/thordata/serp_engines.py
 src/thordata/unlimited.py
+src/thordata/browser/__init__.py
+src/thordata/browser/exceptions.py
+src/thordata/browser/session.py
 src/thordata/core/__init__.py
 src/thordata/core/async_http_client.py
 src/thordata/core/http_client.py
@@ -39,6 +42,7 @@ src/thordata_sdk.egg-info/top_level.txt
 tests/test_async_client.py
 tests/test_async_client_errors.py
 tests/test_batch_creation.py
+tests/test_browser.py
 tests/test_client.py
 tests/test_client_errors.py
 tests/test_enums.py
@@ -2,6 +2,9 @@ requests>=2.25.0
 aiohttp>=3.9.0
 PySocks>=1.7.1
 
+[browser]
+playwright>=1.40.0
+
 [dev]
 pytest>=7.0.0
 pytest-asyncio>=0.21.0
@@ -127,7 +127,9 @@ async def test_async_missing_scraper_token():
     async with client:
         # 3. Method call should fail
         # Updated match string to match actual code in async_client.py
-        with pytest.raises(ThordataConfigError, match="scraper_token required"):
+        with pytest.raises(
+            ThordataConfigError, match="scraper_token is required for SERP API"
+        ):
             await client.serp_search("test")
 
 
@@ -0,0 +1,108 @@
+"""Tests for browser automation module."""
+
+from __future__ import annotations
+
+import pytest
+
+try:
+    import playwright.async_api  # noqa: F401
+
+    PLAYWRIGHT_AVAILABLE = True
+except ImportError:
+    PLAYWRIGHT_AVAILABLE = False
+
+from thordata import AsyncThordataClient
+
+if PLAYWRIGHT_AVAILABLE:
+    from thordata.browser import BrowserConnectionError, BrowserError, BrowserSession
+else:
+    from thordata.browser import BrowserConnectionError, BrowserError
+
+
+@pytest.mark.skipif(not PLAYWRIGHT_AVAILABLE, reason="Playwright not installed")
+class TestBrowserSession:
+    """Tests for BrowserSession class."""
+
+    @pytest.fixture
+    def client(self):
+        """Create a test client."""
+        return AsyncThordataClient(scraper_token="test_token")
+
+    def test_browser_session_init(self, client):
+        """Test BrowserSession initialization."""
+        session = BrowserSession(client)
+        assert session._client == client
+        assert session._playwright is None
+
+    def test_browser_session_with_credentials(self, client):
+        """Test BrowserSession with credentials."""
+        session = BrowserSession(client, username="test_user", password="test_pass")
+        assert session._username == "test_user"
+        assert session._password == "test_pass"
+
+    def test_get_domain(self):
+        """Test domain extraction."""
+        assert BrowserSession._get_domain("https://example.com/page") == "example.com"
+        assert BrowserSession._get_domain("http://test.org") == "test.org"
+        assert BrowserSession._get_domain("invalid") == "default"
+
+    def test_filter_snapshot(self):
+        """Test snapshot filtering."""
+        snapshot = """
+        - button "Click me" [ref=1]
+          /url: "https://example.com"
+        - div "Not interactive" [ref=2]
+        - link "Go here" [ref=3]
+          /url: "https://example.com/page"
+        """
+        filtered = BrowserSession._filter_snapshot(snapshot)
+        assert "button" in filtered
+        assert "link" in filtered
+        assert "div" not in filtered
+
+    def test_limit_snapshot_items(self):
+        """Test snapshot item limiting."""
+        snapshot = '- button "1" [ref=1]\n- button "2" [ref=2]\n- button "3" [ref=3]'
+        limited = BrowserSession._limit_snapshot_items(snapshot, max_items=2)
+        assert 'button "1"' in limited
+        assert 'button "2"' in limited
+        assert 'button "3"' not in limited
+
+
+@pytest.mark.skipif(not PLAYWRIGHT_AVAILABLE, reason="Playwright not installed")
+class TestBrowserClientIntegration:
+    """Tests for browser integration with AsyncThordataClient."""
+
+    @pytest.fixture
+    def client(self):
+        """Create a test client."""
+        return AsyncThordataClient(scraper_token="test_token")
+
+    def test_browser_property(self, client):
+        """Test browser property access."""
+        session = client.browser
+        assert isinstance(session, BrowserSession)
+        assert session._client == client
+
+    def test_browser_property_import_error(self, monkeypatch):
+        """Test browser property raises ImportError when playwright is not available."""
+        # This test verifies the error message, but since playwright might be installed
+        # in the test environment, we'll just verify the property exists
+        # The actual import error will be raised at runtime when playwright is missing
+        pass
+
+
+class TestBrowserExceptions:
+    """Tests for browser exceptions."""
+
+    def test_browser_error(self):
+        """Test BrowserError exception."""
+        error = BrowserError("Test error")
+        assert str(error) == "Test error"
+        assert isinstance(error, Exception)
+
+    def test_browser_connection_error(self):
+        """Test BrowserConnectionError exception."""
+        error = BrowserConnectionError("Connection failed")
+        assert str(error) == "Connection failed"
+        assert isinstance(error, BrowserError)