thordata-mcp-server 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thordata_mcp/__init__.py +1 -1
- thordata_mcp/browser_session.py +157 -12
- thordata_mcp/config.py +14 -3
- thordata_mcp/context.py +1 -1
- thordata_mcp/tools/data/browser.py +124 -18
- thordata_mcp/tools/debug.py +125 -0
- thordata_mcp/tools/params_utils.py +107 -0
- thordata_mcp/tools/product.py +83 -5
- thordata_mcp/tools/product_compact.py +2108 -962
- thordata_mcp/tools/utils.py +2 -0
- thordata_mcp/utils.py +393 -322
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/METADATA +29 -54
- thordata_mcp_server-0.5.0.dist-info/RECORD +26 -0
- thordata_mcp_server-0.4.4.dist-info/RECORD +0 -24
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/entry_points.txt +0 -0
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/top_level.txt +0 -0
thordata_mcp/__init__.py
CHANGED
thordata_mcp/browser_session.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
"""Browser session management for Thordata Scraping Browser.
|
|
2
2
|
|
|
3
3
|
This module provides a high-level wrapper around Playwright connected to
|
|
4
|
-
Thordata's Scraping Browser (via `AsyncThordataClient.get_browser_connection_url`)
|
|
5
|
-
inspired by Bright Data's browser session design but implemented in Python.
|
|
4
|
+
Thordata's Scraping Browser (via `AsyncThordataClient.get_browser_connection_url`).
|
|
6
5
|
|
|
7
6
|
Design goals:
|
|
8
7
|
- Domain-scoped browser sessions (one browser/page per domain).
|
|
@@ -18,6 +17,8 @@ from urllib.parse import urlparse
|
|
|
18
17
|
|
|
19
18
|
from playwright.async_api import Browser, Page, Playwright, async_playwright
|
|
20
19
|
|
|
20
|
+
import time
|
|
21
|
+
|
|
21
22
|
from thordata.async_client import AsyncThordataClient
|
|
22
23
|
|
|
23
24
|
from .aria_snapshot import AriaSnapshotFilter
|
|
@@ -37,6 +38,11 @@ class BrowserSession:
|
|
|
37
38
|
self._requests: Dict[str, Dict[Any, Any]] = {}
|
|
38
39
|
self._dom_refs: Set[str] = set()
|
|
39
40
|
self._current_domain: str = "default"
|
|
41
|
+
# Console and network diagnostics cache
|
|
42
|
+
self._console_messages: Dict[str, List[Dict[str, Any]]] = {}
|
|
43
|
+
self._network_requests: Dict[str, List[Dict[str, Any]]] = {}
|
|
44
|
+
self._max_console_messages = 10
|
|
45
|
+
self._max_network_requests = 20
|
|
40
46
|
|
|
41
47
|
@staticmethod
|
|
42
48
|
def _get_domain(url: str) -> str:
|
|
@@ -139,11 +145,26 @@ class BrowserSession:
|
|
|
139
145
|
|
|
140
146
|
# Reset network tracking for this domain
|
|
141
147
|
self._requests[domain] = {}
|
|
148
|
+
self._console_messages[domain] = []
|
|
149
|
+
self._network_requests[domain] = []
|
|
142
150
|
|
|
143
151
|
async def on_request(request: Any) -> None:
|
|
144
152
|
if domain in self._requests:
|
|
145
153
|
self._requests[domain][request] = None
|
|
146
|
-
|
|
154
|
+
try:
|
|
155
|
+
self._network_requests.setdefault(domain, [])
|
|
156
|
+
self._network_requests[domain].append(
|
|
157
|
+
{
|
|
158
|
+
"url": request.url,
|
|
159
|
+
"method": request.method,
|
|
160
|
+
"resourceType": getattr(request, "resource_type", None),
|
|
161
|
+
"timestamp": int(time.time() * 1000),
|
|
162
|
+
}
|
|
163
|
+
)
|
|
164
|
+
self._network_requests[domain] = self._network_requests[domain][-self._max_network_requests :]
|
|
165
|
+
except Exception:
|
|
166
|
+
pass
|
|
167
|
+
|
|
147
168
|
async def on_response(response: Any) -> None:
|
|
148
169
|
if domain in self._requests:
|
|
149
170
|
try:
|
|
@@ -151,15 +172,78 @@ class BrowserSession:
|
|
|
151
172
|
except Exception:
|
|
152
173
|
# Best-effort, non-fatal
|
|
153
174
|
pass
|
|
175
|
+
try:
|
|
176
|
+
# Update last matching request with status
|
|
177
|
+
req = response.request
|
|
178
|
+
url = getattr(req, "url", None)
|
|
179
|
+
if url and domain in self._network_requests:
|
|
180
|
+
for item in reversed(self._network_requests[domain]):
|
|
181
|
+
if item.get("url") == url and item.get("statusCode") is None:
|
|
182
|
+
item["statusCode"] = response.status
|
|
183
|
+
break
|
|
184
|
+
except Exception:
|
|
185
|
+
pass
|
|
154
186
|
|
|
155
187
|
page.on("request", on_request)
|
|
156
188
|
page.on("response", on_response)
|
|
157
|
-
|
|
189
|
+
|
|
190
|
+
# Console message tracking
|
|
191
|
+
async def on_console(msg: Any) -> None:
|
|
192
|
+
try:
|
|
193
|
+
self._console_messages.setdefault(domain, [])
|
|
194
|
+
self._console_messages[domain].append(
|
|
195
|
+
{
|
|
196
|
+
"type": msg.type,
|
|
197
|
+
"message": msg.text,
|
|
198
|
+
"timestamp": int(time.time() * 1000),
|
|
199
|
+
}
|
|
200
|
+
)
|
|
201
|
+
self._console_messages[domain] = self._console_messages[domain][-self._max_console_messages :]
|
|
202
|
+
except Exception:
|
|
203
|
+
pass
|
|
204
|
+
|
|
205
|
+
page.on("console", on_console)
|
|
206
|
+
|
|
158
207
|
self._pages[domain] = page
|
|
159
208
|
return page
|
|
160
209
|
|
|
161
|
-
|
|
162
|
-
"""
|
|
210
|
+
def get_console_tail(self, n: int = 10, domain: Optional[str] = None) -> List[Dict[str, Any]]:
|
|
211
|
+
"""Return recent console messages for the given domain."""
|
|
212
|
+
d = domain or self._current_domain
|
|
213
|
+
items = self._console_messages.get(d, [])
|
|
214
|
+
return items[-max(0, int(n)) :]
|
|
215
|
+
|
|
216
|
+
def get_network_tail(self, n: int = 20, domain: Optional[str] = None) -> List[Dict[str, Any]]:
|
|
217
|
+
"""Return recent network request summaries for the given domain."""
|
|
218
|
+
d = domain or self._current_domain
|
|
219
|
+
items = self._network_requests.get(d, [])
|
|
220
|
+
return items[-max(0, int(n)) :]
|
|
221
|
+
|
|
222
|
+
def reset_page(self, domain: Optional[str] = None) -> None:
|
|
223
|
+
"""Drop cached page for a domain so the next call recreates it."""
|
|
224
|
+
d = domain or self._current_domain
|
|
225
|
+
self._pages.pop(d, None)
|
|
226
|
+
self._requests.pop(d, None)
|
|
227
|
+
self._console_messages.pop(d, None)
|
|
228
|
+
self._network_requests.pop(d, None)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
async def capture_snapshot(
|
|
232
|
+
self,
|
|
233
|
+
*,
|
|
234
|
+
filtered: bool = True,
|
|
235
|
+
mode: str = "compact",
|
|
236
|
+
max_items: int = 80,
|
|
237
|
+
include_dom: bool = False,
|
|
238
|
+
) -> Dict[str, Any]:
|
|
239
|
+
"""Capture an ARIA-like snapshot and optional DOM snapshot.
|
|
240
|
+
|
|
241
|
+
Args:
|
|
242
|
+
filtered: Whether to apply AriaSnapshotFilter (legacy, kept for compatibility).
|
|
243
|
+
mode: "compact" | "full". Compact returns minimal interactive elements.
|
|
244
|
+
max_items: Maximum number of interactive elements to include (compact mode only).
|
|
245
|
+
include_dom: Whether to include dom_snapshot (compact mode defaults to False).
|
|
246
|
+
"""
|
|
163
247
|
page = await self.get_page()
|
|
164
248
|
|
|
165
249
|
try:
|
|
@@ -175,16 +259,64 @@ class BrowserSession:
|
|
|
175
259
|
"aria_snapshot": full_snapshot,
|
|
176
260
|
}
|
|
177
261
|
|
|
262
|
+
if mode == "compact":
|
|
263
|
+
# Compact: return only filtered interactive elements, optionally without dom_snapshot
|
|
264
|
+
filtered_snapshot = AriaSnapshotFilter.filter_snapshot(full_snapshot)
|
|
265
|
+
filtered_snapshot = self._limit_aria_snapshot_items(filtered_snapshot, max_items=max_items)
|
|
266
|
+
dom_snapshot = None
|
|
267
|
+
if include_dom:
|
|
268
|
+
dom_snapshot_raw = await self._capture_dom_snapshot(page)
|
|
269
|
+
self._dom_refs = {el["ref"] for el in dom_snapshot_raw}
|
|
270
|
+
dom_snapshot = AriaSnapshotFilter.format_dom_elements(dom_snapshot_raw)
|
|
271
|
+
return {
|
|
272
|
+
"url": page.url,
|
|
273
|
+
"title": await page.title(),
|
|
274
|
+
"aria_snapshot": filtered_snapshot,
|
|
275
|
+
"dom_snapshot": dom_snapshot,
|
|
276
|
+
"_meta": {"mode": mode, "max_items": max_items, "include_dom": include_dom},
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
# Full mode: include both filtered aria and dom_snapshot (legacy behavior)
|
|
178
280
|
filtered_snapshot = AriaSnapshotFilter.filter_snapshot(full_snapshot)
|
|
179
|
-
|
|
180
|
-
self._dom_refs = {el["ref"] for el in
|
|
181
|
-
|
|
281
|
+
dom_snapshot_raw = await self._capture_dom_snapshot(page)
|
|
282
|
+
self._dom_refs = {el["ref"] for el in dom_snapshot_raw}
|
|
182
283
|
return {
|
|
183
284
|
"url": page.url,
|
|
184
285
|
"title": await page.title(),
|
|
185
286
|
"aria_snapshot": filtered_snapshot,
|
|
186
|
-
"dom_snapshot": AriaSnapshotFilter.format_dom_elements(
|
|
287
|
+
"dom_snapshot": AriaSnapshotFilter.format_dom_elements(dom_snapshot_raw),
|
|
288
|
+
"_meta": {"mode": mode},
|
|
187
289
|
}
|
|
290
|
+
|
|
291
|
+
@staticmethod
|
|
292
|
+
def _limit_aria_snapshot_items(text: str, *, max_items: int) -> str:
|
|
293
|
+
"""Limit snapshot to the first N interactive element blocks.
|
|
294
|
+
|
|
295
|
+
The snapshot format is a list where each element starts with a line beginning
|
|
296
|
+
with '- ' (Playwright raw) or '[' (AriaSnapshotFilter compact), and may include
|
|
297
|
+
one or more indented continuation lines.
|
|
298
|
+
"""
|
|
299
|
+
try:
|
|
300
|
+
n = int(max_items)
|
|
301
|
+
except Exception:
|
|
302
|
+
n = 80
|
|
303
|
+
if n <= 0:
|
|
304
|
+
return ""
|
|
305
|
+
if not text:
|
|
306
|
+
return text
|
|
307
|
+
|
|
308
|
+
lines = text.splitlines()
|
|
309
|
+
out: list[str] = []
|
|
310
|
+
items = 0
|
|
311
|
+
for line in lines:
|
|
312
|
+
if line.startswith("- ") or line.startswith("["):
|
|
313
|
+
if items >= n:
|
|
314
|
+
break
|
|
315
|
+
items += 1
|
|
316
|
+
# Include continuation lines only if we've started collecting items.
|
|
317
|
+
if items > 0:
|
|
318
|
+
out.append(line)
|
|
319
|
+
return "\n".join(out).strip()
|
|
188
320
|
|
|
189
321
|
async def _get_interactive_snapshot(self, page: Page) -> str:
|
|
190
322
|
"""Generate a text snapshot of interactive elements with refs."""
|
|
@@ -194,12 +326,25 @@ class BrowserSession:
|
|
|
194
326
|
const lines = [];
|
|
195
327
|
let refCounter = 0;
|
|
196
328
|
|
|
329
|
+
function normalizeRole(tag, explicitRole) {
|
|
330
|
+
const role = (explicitRole || '').toLowerCase();
|
|
331
|
+
const t = (tag || '').toLowerCase();
|
|
332
|
+
if (role) return role;
|
|
333
|
+
// Map common interactive tags to standard ARIA roles
|
|
334
|
+
if (t === 'a') return 'link';
|
|
335
|
+
if (t === 'button') return 'button';
|
|
336
|
+
if (t === 'input') return 'textbox';
|
|
337
|
+
if (t === 'select') return 'combobox';
|
|
338
|
+
if (t === 'textarea') return 'textbox';
|
|
339
|
+
return t;
|
|
340
|
+
}
|
|
341
|
+
|
|
197
342
|
function traverse(node) {
|
|
198
343
|
if (node.nodeType === Node.ELEMENT_NODE) {
|
|
199
|
-
const role = node.getAttribute('role') || node.tagName.toLowerCase();
|
|
200
344
|
const tag = node.tagName.toLowerCase();
|
|
201
345
|
const interactiveTag = ['a', 'button', 'input', 'select', 'textarea'].includes(tag);
|
|
202
|
-
const
|
|
346
|
+
const role = normalizeRole(tag, node.getAttribute('role'));
|
|
347
|
+
const interactiveRole = ['button', 'link', 'textbox', 'searchbox', 'combobox', 'checkbox', 'radio', 'switch', 'tab', 'menuitem', 'option'].includes(role);
|
|
203
348
|
|
|
204
349
|
if (interactiveTag || interactiveRole) {
|
|
205
350
|
if (!node.dataset.fastmcpRef) {
|
thordata_mcp/config.py
CHANGED
|
@@ -6,6 +6,14 @@ from pydantic_settings import BaseSettings
|
|
|
6
6
|
class Settings(BaseSettings):
|
|
7
7
|
"""Environment-driven configuration for the MCP server."""
|
|
8
8
|
|
|
9
|
+
# MCP tool exposure mode (BrightData-like)
|
|
10
|
+
# - rapid: minimal core tools
|
|
11
|
+
# - pro: all tools
|
|
12
|
+
# - custom: enable by THORDATA_GROUPS and THORDATA_TOOLS
|
|
13
|
+
THORDATA_MODE: str = "rapid"
|
|
14
|
+
THORDATA_GROUPS: str | None = None
|
|
15
|
+
THORDATA_TOOLS: str | None = None
|
|
16
|
+
|
|
9
17
|
# Thordata credentials
|
|
10
18
|
THORDATA_SCRAPER_TOKEN: str | None = None
|
|
11
19
|
THORDATA_PUBLIC_TOKEN: str | None = None
|
|
@@ -20,9 +28,9 @@ class Settings(BaseSettings):
|
|
|
20
28
|
# Tasks discovery UX (to avoid dumping hundreds of tools to the client by default)
|
|
21
29
|
# - mode=curated: only return tools from THORDATA_TASKS_GROUPS, with pagination
|
|
22
30
|
# - mode=all: return all discovered tools
|
|
23
|
-
# Default to
|
|
24
|
-
THORDATA_TASKS_LIST_MODE: str = "
|
|
25
|
-
THORDATA_TASKS_LIST_DEFAULT_LIMIT: int =
|
|
31
|
+
# Default to curated mode to reduce tool selection noise for LLMs.
|
|
32
|
+
THORDATA_TASKS_LIST_MODE: str = "curated"
|
|
33
|
+
THORDATA_TASKS_LIST_DEFAULT_LIMIT: int = 60
|
|
26
34
|
THORDATA_TASKS_GROUPS: str = "ecommerce,social,video,search,travel,code,professional"
|
|
27
35
|
|
|
28
36
|
# Optional: restrict which SDK tool_keys are allowed to execute (safety/UX)
|
|
@@ -49,6 +57,9 @@ class Settings(BaseSettings):
|
|
|
49
57
|
# Logging
|
|
50
58
|
LOG_LEVEL: str = "INFO"
|
|
51
59
|
|
|
60
|
+
# Debug tools exposure
|
|
61
|
+
THORDATA_DEBUG_TOOLS: bool = False
|
|
62
|
+
|
|
52
63
|
class Config:
|
|
53
64
|
env_file = ".env"
|
|
54
65
|
extra = "ignore"
|
thordata_mcp/context.py
CHANGED
|
@@ -298,16 +298,96 @@ def register(mcp: FastMCP) -> None:
|
|
|
298
298
|
|
|
299
299
|
@mcp.tool(name="browser.click_ref", description="Click an element by its ref ID")
@handle_mcp_errors
async def browser_click_ref(
    ref: str,
    element: str = "element",
    wait_for_navigation_ms: Optional[int] = None,
) -> dict[str, Any]:
    """Click an element using the [ref=X] ID from the snapshot.

    Args:
        ref: The ref ID from snapshot (e.g., ref-w545663wqs)
        element: Description of the element for error messages
        wait_for_navigation_ms: Optional wait time in ms to detect navigation after click

    Returns:
        An ``ok_response`` payload with the URL before/after the click and a
        ``did_navigate`` flag, or an ``error_response`` payload (code E5001)
        carrying a hint plus recent console/network diagnostics.
    """
    session = await ServerContext.get_browser_session()
    page = await session.get_page()

    url_before = page.url
    try:
        locator = await session.ref_locator(ref, element)
        await locator.click(timeout=5_000)

        # Navigation is only probed when the caller asked for a wait; otherwise
        # the "after" URL is reported as unchanged.
        did_navigate = False
        url_after = url_before
        if wait_for_navigation_ms and wait_for_navigation_ms > 0:
            import asyncio

            await asyncio.sleep(wait_for_navigation_ms / 1000)
            url_after = page.url
            did_navigate = url_after != url_before

        return ok_response(
            tool="browser.click_ref",
            input={"ref": ref, "element": element, "wait_for_navigation_ms": wait_for_navigation_ms},
            output={
                "message": f"Successfully clicked {element}",
                "ref": ref,
                "url_before": url_before,
                "url_after": url_after,
                "did_navigate": did_navigate,
            },
        )
    except Exception as e:
        # Enhanced error with diagnostics + self-heal for common browser lifecycle issues
        from ...utils import error_response

        # Read the diagnostics caches BEFORE any reset: reset_page() discards
        # the console/network caches for the domain, so reading afterwards
        # would always report empty tails in exactly the case they matter.
        try:
            console_tail = session.get_console_tail(n=10)
        except Exception:
            console_tail = []
        try:
            network_tail = session.get_network_tail(n=20)
        except Exception:
            network_tail = []

        err_s = str(e).lower()
        lifecycle_markers = (
            "target closed",
            "page closed",
            "browser has been closed",
            "execution context was destroyed",
            "has been disposed",
        )
        did_reset = False
        if any(marker in err_s for marker in lifecycle_markers):
            try:
                session.reset_page()
                did_reset = True
            except Exception:
                # Self-heal is best-effort; the original error is still reported.
                did_reset = False

        hint = "Try taking a new snapshot to get fresh refs, or check if the element is still visible"
        if did_reset:
            hint = "Browser page was closed/reset. Take a new snapshot to get fresh refs, then retry the click."

        return error_response(
            tool="browser.click_ref",
            input={"ref": ref, "element": element, "wait_for_navigation_ms": wait_for_navigation_ms},
            error_type="browser_interaction_error",
            code="E5001",
            message=f"Failed to click element: {str(e)}",
            details={
                "ref": ref,
                "element": element,
                "url_before": url_before,
                "url_after": page.url,
                "did_reset": did_reset,
                "hint": hint,
                "console_tail": console_tail,
                "network_tail": network_tail,
            },
        )
|
|
311
391
|
|
|
312
392
|
@mcp.tool(
|
|
313
393
|
name="browser.type_ref",
|
|
@@ -322,15 +402,41 @@ def register(mcp: FastMCP) -> None:
|
|
|
322
402
|
) -> dict[str, Any]:
|
|
323
403
|
"""Type text into an element using the [ref=X] ID."""
|
|
324
404
|
session = await ServerContext.get_browser_session()
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
405
|
+
page = await session.get_page()
|
|
406
|
+
url_before = page.url
|
|
407
|
+
|
|
408
|
+
try:
|
|
409
|
+
locator = await session.ref_locator(ref, element)
|
|
410
|
+
await locator.fill(text)
|
|
411
|
+
if submit:
|
|
412
|
+
await locator.press("Enter")
|
|
413
|
+
|
|
414
|
+
return ok_response(
|
|
415
|
+
tool="browser.type_ref",
|
|
416
|
+
input={"ref": ref, "text": text, "submit": submit, "element": element},
|
|
417
|
+
output={
|
|
418
|
+
"message": "Typed into element" + (" and submitted" if submit else ""),
|
|
419
|
+
"ref": ref,
|
|
420
|
+
"url_before": url_before,
|
|
421
|
+
"url_after": page.url,
|
|
422
|
+
},
|
|
423
|
+
)
|
|
424
|
+
except Exception as e:
|
|
425
|
+
from ...utils import error_response
|
|
426
|
+
return error_response(
|
|
427
|
+
tool="browser.type_ref",
|
|
428
|
+
input={"ref": ref, "text": text, "submit": submit, "element": element},
|
|
429
|
+
error_type="browser_interaction_error",
|
|
430
|
+
code="E5002",
|
|
431
|
+
message=f"Failed to type into element: {str(e)}",
|
|
432
|
+
details={
|
|
433
|
+
"ref": ref,
|
|
434
|
+
"element": element,
|
|
435
|
+
"url_before": url_before,
|
|
436
|
+
"url_after": page.url,
|
|
437
|
+
"hint": "Try taking a new snapshot to get fresh refs, or check if the element is still visible and editable",
|
|
438
|
+
},
|
|
439
|
+
)
|
|
334
440
|
|
|
335
441
|
@mcp.tool(name="browser.screenshot_page", description="Take a screenshot of the current browser page")
|
|
336
442
|
@handle_mcp_errors
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from mcp.server.fastmcp import FastMCP
|
|
7
|
+
|
|
8
|
+
from thordata_mcp.config import settings
|
|
9
|
+
from thordata_mcp.context import ServerContext
|
|
10
|
+
from thordata_mcp.utils import ok_response
|
|
11
|
+
from thordata_mcp.tools.params_utils import normalize_params
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def register(mcp: FastMCP) -> None:
    """Register the debug tools on ``mcp``.

    Adds three tools: ``debug.status``, ``browser.diagnostics`` and
    ``debug.self_test``.
    """

    @mcp.tool(name="debug.status", description="Return server status and effective configuration (no secrets).")
    async def debug_status() -> dict[str, Any]:
        """Report the interpreter version and masked configuration values."""
        import sys

        def _mask(v: str | None) -> dict[str, Any]:
            """Describe a sensitive value without echoing it back."""
            if not v:
                return {"set": False}
            return {
                "set": True,
                "length": len(v),
                # A 4-character suffix of a short value is most (or all) of the
                # value, so the suffix is only exposed for longer values.
                "tail4": v[-4:] if len(v) > 8 else None,
            }

        return ok_response(
            tool="debug.status",
            input={},
            output={
                "python": sys.version,
                "settings": {
                    "THORDATA_SCRAPER_TOKEN": _mask(settings.THORDATA_SCRAPER_TOKEN),
                    "THORDATA_PUBLIC_TOKEN": _mask(settings.THORDATA_PUBLIC_TOKEN),
                    "THORDATA_PUBLIC_KEY": _mask(settings.THORDATA_PUBLIC_KEY),
                    "THORDATA_BROWSER_USERNAME": _mask(settings.THORDATA_BROWSER_USERNAME),
                    "THORDATA_BROWSER_PASSWORD": _mask(settings.THORDATA_BROWSER_PASSWORD),
                    "THORDATA_TASKS_LIST_MODE": settings.THORDATA_TASKS_LIST_MODE,
                    "THORDATA_TASKS_LIST_DEFAULT_LIMIT": settings.THORDATA_TASKS_LIST_DEFAULT_LIMIT,
                },
            },
        )

    @mcp.tool(name="browser.diagnostics", description="Return recent browser console/network diagnostics for the current session.")
    async def browser_diagnostics(
        console_limit: int = 10,
        network_limit: int = 20,
    ) -> dict[str, Any]:
        """Return the current page URL/title plus recent console and network entries."""
        session = await ServerContext.get_browser_session()
        page = await session.get_page()

        return ok_response(
            tool="browser.diagnostics",
            input={"console_limit": console_limit, "network_limit": network_limit},
            output={
                "url": page.url,
                "title": await page.title(),
                "console_tail": session.get_console_tail(n=console_limit),
                "network_tail": session.get_network_tail(n=network_limit),
            },
        )

    @mcp.tool(
        name="debug.self_test",
        description=(
            "Run a small, non-destructive smoke test suite for core scraping capabilities and return a compact report. "
            "Useful after restarting the MCP server. Params: {\"timeout_s\": 30}."
        ),
    )
    async def debug_self_test(*, params: Any = None) -> dict[str, Any]:
        """Run the SERP, unlocker and browser-snapshot checks concurrently.

        ``params`` may carry ``timeout_s`` (per-check timeout, clamped to
        5..120 seconds; default 30). Unparseable params or timeout values fall
        back to the default rather than failing the whole self test.
        """
        try:
            p = normalize_params(params, "debug.self_test", "run")
        except Exception:
            p = {}

        try:
            timeout_s = int(p.get("timeout_s", 30))
        except (TypeError, ValueError):
            timeout_s = 30
        timeout_s = max(5, min(timeout_s, 120))

        async def _run(name: str, fn) -> dict[str, Any]:
            """Await one check under the timeout and fold failures into a result dict."""
            try:
                out = await asyncio.wait_for(fn(), timeout=timeout_s)
                return {"check": name, "ok": True, "detail": out}
            except Exception as e:
                # Some exceptions (notably asyncio timeouts) stringify to "",
                # so fall back to the exception type name.
                return {"check": name, "ok": False, "error": str(e) or type(e).__name__}

        client = await ServerContext.get_client()

        async def _check_serp() -> dict[str, Any]:
            from thordata.types import SerpRequest

            req = SerpRequest(query="thordata", engine="google", num=3, output_format="light_json")
            data = await client.serp_search_advanced(req)
            organic = data.get("organic") if isinstance(data, dict) else None
            return {"has_organic": isinstance(organic, list) and len(organic) > 0, "organic_count": len(organic) if isinstance(organic, list) else None}

        async def _check_unlocker() -> dict[str, Any]:
            html = await client.universal_scrape(url="https://example.com", js_render=True, output_format="html")
            s = html if isinstance(html, str) else str(html)
            return {"html_len": len(s), "contains_example_domain": "Example Domain" in s}

        async def _check_browser_snapshot() -> dict[str, Any]:
            session = await ServerContext.get_browser_session()
            # capture_snapshot() accepts no ``url`` argument, so the page is
            # navigated explicitly before snapshotting.
            # NOTE(review): this navigates the session's current page; confirm
            # that is acceptable for a "non-destructive" self test.
            page = await session.get_page()
            await page.goto("https://example.com")
            snap = await session.capture_snapshot(filtered=True, max_items=20)
            aria = snap.get("aria_snapshot") if isinstance(snap, dict) else None
            return {"aria_non_empty": bool(aria), "aria_len": len(aria) if isinstance(aria, str) else None, "url": snap.get("url") if isinstance(snap, dict) else None}

        results = await asyncio.gather(
            _run("serp.search", _check_serp),
            _run("unlocker.fetch(html,js_render=true)", _check_unlocker),
            _run("browser.snapshot(filtered,max_items=20)", _check_browser_snapshot),
        )

        summary = []
        ok_all = True
        for r in results:
            if r.get("ok"):
                summary.append({"check": r.get("check"), "ok": True})
            else:
                ok_all = False
                summary.append({"check": r.get("check"), "ok": False, "error": r.get("error")})

        return ok_response(
            tool="debug.self_test",
            input={"params": {"timeout_s": timeout_s}},
            output={"ok_all": ok_all, "summary": summary, "_meta": {"timeout_s": timeout_s}},
        )
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Common parameter normalization utilities for thordata MCP tools."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, Dict, Optional
|
|
7
|
+
|
|
8
|
+
from thordata_mcp.utils import error_response
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def normalize_params(params: Any, tool_name: str, action: Optional[str] = None) -> Dict[str, Any]:
    """Coerce a tool's ``params`` argument into a dictionary.

    Clients sometimes send ``params`` as a JSON string instead of an object;
    such strings are decoded here. Anything that cannot be turned into a
    dictionary is rejected with a message that shows the expected shape.

    Args:
        params: The params value passed to the tool.
        tool_name: Name of the tool (accepted for call-site symmetry; not
            used when building the message).
        action: Optional action name (likewise not used in the message).

    Returns:
        ``{}`` for ``None``, the same dict object for a dict, or the decoded
        dict for a JSON object string.

    Raises:
        ValueError: If ``params`` is a string that is not valid JSON, a JSON
            string that does not decode to an object, or any other type.
    """
    if params is None:
        return {}

    if isinstance(params, dict):
        return params

    if isinstance(params, str):
        preview = f"{params[:100]}{'...' if len(params) > 100 else ''}"
        try:
            parsed = json.loads(params)
        except json.JSONDecodeError as e:
            error_msg = (
                f"Invalid JSON in params: {e}. "
                f"Params should be a dictionary object, not a string. "
                f"Example: params={{'url': 'https://example.com'}}. "
                f"Received: {preview}"
            )
            # Chain the decoder error so the position details stay reachable.
            raise ValueError(error_msg) from e
        if not isinstance(parsed, dict):
            # Valid JSON of the wrong shape (list, number, string, ...).
            raise ValueError(
                f"params JSON must decode to a dictionary object, not {type(parsed).__name__}. "
                f"Example: params={{'url': 'https://example.com'}}. "
                f"Received: {preview}"
            )
        return parsed

    # Handle other types (list, number, etc.)
    as_text = str(params)
    error_msg = (
        f"params must be a dictionary object, not {type(params).__name__}. "
        f"Example: params={{'url': 'https://example.com'}}. "
        f"Received: {as_text[:100]}{'...' if len(as_text) > 100 else ''}"
    )
    raise ValueError(error_msg)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def create_params_error(tool_name: str, action: str, params: Any, error_message: str) -> Dict[str, Any]:
    """Build the standard ``validation_error`` (E4001) response for bad params.

    Args:
        tool_name: Tool the error is reported for.
        action: Action that was being performed.
        params: The rejected params value, echoed back in the response input.
        error_message: Human-readable explanation of the failure.

    Returns:
        Whatever ``error_response`` produces for these fields.
    """
    echoed_input = {"action": action, "params": params}
    response = error_response(
        tool=tool_name,
        input=echoed_input,
        error_type="validation_error",
        code="E4001",
        message=error_message,
    )
    return response
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def create_json_error(tool_name: str, action: str, params: str, error_detail: str) -> Dict[str, Any]:
    """Build the standard ``json_error`` (E4002) response for unparseable params.

    Args:
        tool_name: Tool the error is reported for.
        action: Action that was being performed.
        params: The invalid JSON text. Non-string values are tolerated and
            rendered with ``str()`` for the message preview.
        error_detail: Detail text from the JSON parser.

    Returns:
        Whatever ``error_response`` produces for these fields; the original
        ``params`` value is echoed unchanged in the response input.
    """
    # Slicing/len() only work on sequences; coerce so that a stray non-string
    # value yields an error response instead of raising TypeError here.
    text = params if isinstance(params, str) else str(params)
    error_message = (
        f"Invalid JSON in params: {error_detail}. "
        f"Use dictionary format: params={{'url': 'https://example.com'}} "
        f"or valid JSON string: params='{{\"url\":\"https://example.com\"}}'. "
        f"Received: {text[:100]}{'...' if len(text) > 100 else ''}"
    )

    return error_response(
        tool=tool_name,
        input={"action": action, "params": params},
        error_type="json_error",
        code="E4002",
        message=error_message,
    )
|