thordata-mcp-server 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
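For orientation before the raw diff: the changed file is a FastMCP tool-registration module, and a minimal, hypothetical sketch of how such a module is mounted on a server is shown here. The module path `thordata_mcp.tools.rapid` is an assumption for illustration only (the diff does not name the file), and this snippet is not part of either package version.

    from mcp.server.fastmcp import FastMCP

    from thordata_mcp.tools.rapid import register  # assumed module path, not confirmed by the diff

    mcp = FastMCP("thordata")
    register(mcp)  # 0.4.4 registers serp/unlocker/web_scraper/browser/smart_scrape;
                   # 0.5.0 gates the surface via THORDATA_TOOLS / THORDATA_MODE

    if __name__ == "__main__":
        mcp.run()  # serve the registered tools over stdio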
@@ -1,962 +1,2108 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import json
5
- from typing import Any, Optional
6
-
7
- from mcp.server.fastmcp import Context, FastMCP
8
-
9
- from thordata_mcp.config import settings
10
- from thordata_mcp.context import ServerContext
11
- from thordata_mcp.monitoring import PerformanceTimer
12
- from thordata_mcp.utils import (
13
- error_response,
14
- handle_mcp_errors,
15
- html_to_markdown_clean,
16
- ok_response,
17
- safe_ctx_info,
18
- truncate_content,
19
- )
20
-
21
- # Tool schema helper (for catalog)
22
- from .utils import tool_schema # noqa: E402
23
-
24
- # Reuse battle-tested helpers from the full product module
25
- from .product import ( # noqa: E402
26
- _catalog,
27
- _candidate_tools_for_url,
28
- _extract_structured_from_html,
29
- _fetch_json_preview,
30
- _guess_tool_for_url,
31
- _hostname,
32
- _normalize_extracted,
33
- _normalize_record,
34
- _run_web_scraper_tool,
35
- _to_light_json,
36
- )
37
-
38
-
39
- def register(mcp: FastMCP) -> None:
40
- """Register the compact product surface (competitor-style).
41
-
42
- Only 5 top-level tools are exposed:
43
- - serp
44
- - unlocker
45
- - web_scraper
46
- - browser
47
- - smart_scrape
48
- """
49
-
50
- # -------------------------
51
- # SERP (compact)
52
- # -------------------------
53
- @mcp.tool(name="serp")
54
- @handle_mcp_errors
55
- async def serp(
56
- action: str,
57
- *,
58
- params: dict[str, Any] | None = None,
59
- ctx: Optional[Context] = None,
60
- ) -> dict[str, Any]:
61
- """SERP SCRAPER: action in {search, batch_search}.
62
-
63
- Args:
64
- action: Action to perform - "search" or "batch_search"
65
- params: Parameters dictionary. For "search": {"q": "query", "num": 10, "engine": "google", ...}
66
- For "batch_search": {"requests": [{"q": "query1"}, ...], "concurrency": 5}
67
-
68
- Examples:
69
- serp(action="search", params={"q": "Python programming", "num": 10})
70
- serp(action="batch_search", params={"requests": [{"q": "query1"}, {"q": "query2"}], "concurrency": 5})
71
- """
72
- # Normalize params: handle None, empty dict, or string (JSON)
73
- if params is None:
74
- p = {}
75
- elif isinstance(params, str):
76
- try:
77
- p = json.loads(params)
78
- except json.JSONDecodeError as e:
79
- return error_response(
80
- tool="serp",
81
- input={"action": action, "params": params},
82
- error_type="json_error",
83
- code="E4002",
84
- message=f"Invalid JSON in params: {e}",
85
- )
86
- elif isinstance(params, dict):
87
- p = params
88
- else:
89
- return error_response(
90
- tool="serp",
91
- input={"action": action, "params": params},
92
- error_type="validation_error",
93
- code="E4001",
94
- message="params must be a dictionary or JSON string",
95
- )
96
-
97
- a = (action or "").strip().lower()
98
- if not a:
99
- return error_response(
100
- tool="serp",
101
- input={"action": action, "params": p},
102
- error_type="validation_error",
103
- code="E4001",
104
- message="action is required",
105
- )
106
-
107
- client = await ServerContext.get_client()
108
-
109
- if a == "search":
110
- # Mirror serp.search product contract
111
- q = str(p.get("q", ""))
112
- if not q:
113
- return error_response(tool="serp", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing q")
114
- engine = str(p.get("engine", "google"))
115
- num = int(p.get("num", 10))
116
- start = int(p.get("start", 0))
117
- fmt = str(p.get("format", "json")).strip().lower()
118
- # Leverage SerpRequest mapping via SDK by calling full tool through request object
119
- from thordata.types import SerpRequest
120
- from thordata.types import Engine as EngineEnum
121
-
122
- sdk_fmt = "json" if fmt in {"json", "light_json", "light"} else ("both" if fmt in {"both", "json+html", "2"} else "html")
123
- extra_params = p.get("extra_params") if isinstance(p.get("extra_params"), dict) else {}
124
- if p.get("ai_overview") is not None:
125
- extra_params = dict(extra_params)
126
- extra_params["ai_overview"] = p.get("ai_overview")
127
- req = SerpRequest(
128
- query=q,
129
- engine=getattr(EngineEnum, engine.upper(), EngineEnum.GOOGLE),
130
- num=num,
131
- start=start,
132
- device=p.get("device"),
133
- output_format=sdk_fmt,
134
- render_js=p.get("render_js"),
135
- no_cache=p.get("no_cache"),
136
- google_domain=p.get("google_domain"),
137
- country=p.get("gl"),
138
- language=p.get("hl"),
139
- countries_filter=p.get("cr"),
140
- languages_filter=p.get("lr"),
141
- location=p.get("location"),
142
- uule=p.get("uule"),
143
- search_type=p.get("tbm"),
144
- ludocid=p.get("ludocid"),
145
- kgmid=p.get("kgmid"),
146
- extra_params=extra_params,
147
- )
148
- await safe_ctx_info(ctx, f"serp.search q={q!r} engine={engine} num={num} start={start} format={fmt}")
149
- data = await client.serp_search_advanced(req)
150
- if fmt in {"light_json", "light"}:
151
- data = _to_light_json(data)
152
- return ok_response(tool="serp", input={"action": "search", "params": p}, output=data)
153
-
154
- if a == "batch_search":
155
- reqs = p.get("requests")
156
- if not isinstance(reqs, list) or not reqs:
157
- return error_response(tool="serp", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing requests[]")
158
- concurrency = int(p.get("concurrency", 5))
159
- concurrency = max(1, min(concurrency, 20))
160
- fmt = str(p.get("format", "json")).strip().lower()
161
- sdk_fmt = "json" if fmt in {"json", "light_json", "light"} else ("both" if fmt in {"both", "json+html", "2"} else "html")
162
- from thordata.types import SerpRequest
163
- from thordata.types import Engine as EngineEnum
164
-
165
- sem = asyncio.Semaphore(concurrency)
166
-
167
- async def _one(i: int, r: dict[str, Any]) -> dict[str, Any]:
168
- q = str(r.get("q", r.get("query", "")))
169
- if not q:
170
- return {"index": i, "ok": False, "error": {"type": "validation_error", "message": "Missing q"}}
171
- try:
172
- engine = str(r.get("engine", "google"))
173
- num = int(r.get("num", 10))
174
- start = int(r.get("start", 0))
175
- extra_params = r.get("extra_params") if isinstance(r.get("extra_params"), dict) else {}
176
- if r.get("ai_overview") is not None:
177
- extra_params = dict(extra_params)
178
- extra_params["ai_overview"] = r.get("ai_overview")
179
- async with sem:
180
- req = SerpRequest(
181
- query=q,
182
- engine=getattr(EngineEnum, engine.upper(), EngineEnum.GOOGLE),
183
- num=num,
184
- start=start,
185
- device=r.get("device"),
186
- output_format=sdk_fmt,
187
- render_js=r.get("render_js"),
188
- no_cache=r.get("no_cache"),
189
- google_domain=r.get("google_domain"),
190
- country=r.get("gl"),
191
- language=r.get("hl"),
192
- countries_filter=r.get("cr"),
193
- languages_filter=r.get("lr"),
194
- location=r.get("location"),
195
- uule=r.get("uule"),
196
- search_type=r.get("tbm"),
197
- ludocid=r.get("ludocid"),
198
- kgmid=r.get("kgmid"),
199
- extra_params=extra_params,
200
- )
201
- data = await client.serp_search_advanced(req)
202
- if fmt in {"light_json", "light"}:
203
- data = _to_light_json(data)
204
- return {"index": i, "ok": True, "q": q, "output": data}
205
- except Exception as e:
206
- return {"index": i, "ok": False, "q": q, "error": str(e)}
207
-
208
- await safe_ctx_info(ctx, f"serp.batch_search count={len(reqs)} concurrency={concurrency} format={fmt}")
209
- results = await asyncio.gather(*[_one(i, r if isinstance(r, dict) else {}) for i, r in enumerate(reqs)], return_exceptions=False)
210
- return ok_response(tool="serp", input={"action": "batch_search", "params": p}, output={"results": results})
211
-
212
- return error_response(
213
- tool="serp",
214
- input={"action": action, "params": p},
215
- error_type="validation_error",
216
- code="E4001",
217
- message=f"Unknown action '{action}'. Supported actions: 'search', 'batch_search'",
218
- )
219
-
220
- # -------------------------
221
- # WEB UNLOCKER (compact)
222
- # -------------------------
223
- @mcp.tool(name="unlocker")
224
- @handle_mcp_errors
225
- async def unlocker(
226
- action: str,
227
- *,
228
- params: dict[str, Any] | None = None,
229
- ctx: Optional[Context] = None,
230
- ) -> dict[str, Any]:
231
- """WEB UNLOCKER: action in {fetch, batch_fetch}.
232
-
233
- Args:
234
- action: Action to perform - "fetch" or "batch_fetch"
235
- params: Parameters dictionary. For "fetch": {"url": "https://...", "js_render": true, "output_format": "html", ...}
236
- For "batch_fetch": {"requests": [{"url": "https://..."}, ...], "concurrency": 5}
237
-
238
- Examples:
239
- unlocker(action="fetch", params={"url": "https://www.google.com", "js_render": true})
240
- unlocker(action="batch_fetch", params={"requests": [{"url": "https://example.com"}], "concurrency": 5})
241
- """
242
- # Normalize params: handle None, empty dict, or string (JSON)
243
- if params is None:
244
- p = {}
245
- elif isinstance(params, str):
246
- try:
247
- p = json.loads(params)
248
- except json.JSONDecodeError as e:
249
- return error_response(
250
- tool="unlocker",
251
- input={"action": action, "params": params},
252
- error_type="json_error",
253
- code="E4002",
254
- message=f"Invalid JSON in params: {e}",
255
- )
256
- elif isinstance(params, dict):
257
- p = params
258
- else:
259
- return error_response(
260
- tool="unlocker",
261
- input={"action": action, "params": params},
262
- error_type="validation_error",
263
- code="E4001",
264
- message="params must be a dictionary or JSON string",
265
- )
266
-
267
- a = (action or "").strip().lower()
268
- if not a:
269
- return error_response(
270
- tool="unlocker",
271
- input={"action": action, "params": p},
272
- error_type="validation_error",
273
- code="E4001",
274
- message="action is required",
275
- )
276
-
277
- client = await ServerContext.get_client()
278
-
279
- if a == "fetch":
280
- url = str(p.get("url", ""))
281
- if not url:
282
- return error_response(tool="unlocker", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing url")
283
- fmt = str(p.get("output_format", "html")).strip().lower()
284
- js_render = bool(p.get("js_render", True))
285
- wait_ms = p.get("wait_ms")
286
- wait_seconds = int(wait_ms / 1000) if isinstance(wait_ms, (int, float)) else None
287
- country = p.get("country")
288
- block_resources = p.get("block_resources")
289
- wait_for = p.get("wait_for")
290
- max_chars = int(p.get("max_chars", 20_000))
291
- clean_content = p.get("clean_content") # e.g., "js,css" or ["js", "css"]
292
- headers = p.get("headers") # Custom headers (list of dicts or dict)
293
- cookies = p.get("cookies") # Custom cookies (list of dicts or string)
294
- extra_params = p.get("extra_params") if isinstance(p.get("extra_params"), dict) else {}
295
-
296
- # Handle clean_content: can be string (comma-separated) or list
297
- if clean_content:
298
- if isinstance(clean_content, str):
299
- clean_content_list = [c.strip() for c in clean_content.split(",")]
300
- elif isinstance(clean_content, list):
301
- clean_content_list = clean_content
302
- else:
303
- clean_content_list = None
304
- if clean_content_list:
305
- extra_params["clean_content"] = ",".join(clean_content_list)
306
-
307
- # Handle headers: can be list of dicts [{"name": "...", "value": "..."}] or dict
308
- if headers:
309
- if isinstance(headers, list):
310
- # Convert list of dicts to proper format if needed
311
- extra_params["headers"] = headers
312
- elif isinstance(headers, dict):
313
- # Convert dict to list format
314
- extra_params["headers"] = [{"name": k, "value": v} for k, v in headers.items()]
315
-
316
- # Handle cookies: can be list of dicts [{"name": "...", "value": "..."}] or string
317
- if cookies:
318
- if isinstance(cookies, str):
319
- extra_params["cookies"] = cookies
320
- elif isinstance(cookies, list):
321
- # Convert list of dicts to string format if needed
322
- cookie_strs = []
323
- for c in cookies:
324
- if isinstance(c, dict):
325
- cookie_strs.append(f"{c.get('name', '')}={c.get('value', '')}")
326
- else:
327
- cookie_strs.append(str(c))
328
- extra_params["cookies"] = "; ".join(cookie_strs)
329
-
330
- fetch_format = "html" if fmt in {"markdown", "md"} else fmt
331
- await safe_ctx_info(ctx, f"unlocker.fetch url={url!r} format={fmt} js_render={js_render}")
332
- with PerformanceTimer(tool="unlocker.fetch", url=url):
333
- data = await client.universal_scrape(
334
- url=url,
335
- js_render=js_render,
336
- output_format=fetch_format,
337
- country=country,
338
- block_resources=block_resources,
339
- wait=wait_seconds,
340
- wait_for=wait_for,
341
- **extra_params,
342
- )
343
- if fetch_format == "png":
344
- import base64
345
-
346
- if isinstance(data, (bytes, bytearray)):
347
- png_base64 = base64.b64encode(data).decode("utf-8")
348
- size = len(data)
349
- else:
350
- png_base64 = str(data)
351
- size = None
352
- return ok_response(tool="unlocker", input={"action": "fetch", "params": p}, output={"png_base64": png_base64, "size": size, "format": "png"})
353
- html = str(data) if not isinstance(data, str) else data
354
- if fmt in {"markdown", "md"}:
355
- md = html_to_markdown_clean(html)
356
- md = truncate_content(md, max_length=max_chars)
357
- return ok_response(tool="unlocker", input={"action": "fetch", "params": p}, output={"markdown": md})
358
- return ok_response(tool="unlocker", input={"action": "fetch", "params": p}, output={"html": html})
359
-
360
- if a == "batch_fetch":
361
- reqs = p.get("requests")
362
- if not isinstance(reqs, list) or not reqs:
363
- return error_response(tool="unlocker", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing requests[]")
364
- concurrency = int(p.get("concurrency", 5))
365
- concurrency = max(1, min(concurrency, 20))
366
- sem = asyncio.Semaphore(concurrency)
367
-
368
- async def _one(i: int, r: dict[str, Any]) -> dict[str, Any]:
369
- url = str(r.get("url", ""))
370
- if not url:
371
- return {"index": i, "ok": False, "error": {"type": "validation_error", "message": "Missing url"}}
372
- fmt = str(r.get("output_format", "html")).strip().lower()
373
- fetch_format = "html" if fmt in {"markdown", "md"} else fmt
374
- js_render = bool(r.get("js_render", True))
375
- wait_ms = r.get("wait_ms")
376
- wait_seconds = int(wait_ms / 1000) if isinstance(wait_ms, (int, float)) else None
377
- extra_params = r.get("extra_params") if isinstance(r.get("extra_params"), dict) else {}
378
- async with sem:
379
- with PerformanceTimer(tool="unlocker.batch_fetch", url=url):
380
- data = await client.universal_scrape(
381
- url=url,
382
- js_render=js_render,
383
- output_format=fetch_format,
384
- country=r.get("country"),
385
- block_resources=r.get("block_resources"),
386
- wait=wait_seconds,
387
- wait_for=r.get("wait_for"),
388
- **extra_params,
389
- )
390
- if fetch_format == "png":
391
- import base64
392
-
393
- if isinstance(data, (bytes, bytearray)):
394
- png_base64 = base64.b64encode(data).decode("utf-8")
395
- size = len(data)
396
- else:
397
- png_base64 = str(data)
398
- size = None
399
- return {"index": i, "ok": True, "url": url, "output": {"png_base64": png_base64, "size": size, "format": "png"}}
400
- html = str(data) if not isinstance(data, str) else data
401
- if fmt in {"markdown", "md"}:
402
- md = html_to_markdown_clean(html)
403
- md = truncate_content(md, max_length=int(r.get("max_chars", 20_000)))
404
- return {"index": i, "ok": True, "url": url, "output": {"markdown": md}}
405
- return {"index": i, "ok": True, "url": url, "output": {"html": html}}
406
-
407
- await safe_ctx_info(ctx, f"unlocker.batch_fetch count={len(reqs)} concurrency={concurrency}")
408
- results = await asyncio.gather(*[_one(i, r if isinstance(r, dict) else {}) for i, r in enumerate(reqs)])
409
- return ok_response(tool="unlocker", input={"action": "batch_fetch", "params": p}, output={"results": results})
410
-
411
- return error_response(
412
- tool="unlocker",
413
- input={"action": action, "params": p},
414
- error_type="validation_error",
415
- code="E4001",
416
- message=f"Unknown action '{action}'. Supported actions: 'fetch', 'batch_fetch'",
417
- )
418
-
419
- # -------------------------
420
- # WEB SCRAPER (compact)
421
- # -------------------------
422
- @mcp.tool(name="web_scraper")
423
- @handle_mcp_errors
424
- async def web_scraper(
425
- action: str,
426
- *,
427
- params: dict[str, Any] | None = None,
428
- ctx: Optional[Context] = None,
429
- ) -> dict[str, Any]:
430
- """WEB SCRAPER: action covers catalog/groups/run/batch_run/status/wait/result/list_tasks and batch helpers.
431
-
432
- Args:
433
- action: Action to perform - "catalog", "groups", "run", "batch_run", "status", "wait", "result", "list_tasks", etc.
434
- params: Parameters dictionary. Varies by action:
435
- - "catalog": {"group": "...", "keyword": "...", "limit": 100, "offset": 0}
436
- - "run": {"tool": "tool_key", "params": {...}, "wait": true, "file_type": "json"}
437
- - "status": {"task_id": "..."}
438
- - etc.
439
-
440
- Examples:
441
- web_scraper(action="catalog", params={"limit": 20})
442
- web_scraper(action="run", params={"tool": "thordata.tools.ecommerce.Amazon.ProductByUrl", "params": {"url": "https://amazon.com/..."}})
443
- """
444
- # Normalize params: handle None, empty dict, or string (JSON)
445
- if params is None:
446
- p = {}
447
- elif isinstance(params, str):
448
- try:
449
- p = json.loads(params)
450
- except json.JSONDecodeError as e:
451
- return error_response(
452
- tool="web_scraper",
453
- input={"action": action, "params": params},
454
- error_type="json_error",
455
- code="E4002",
456
- message=f"Invalid JSON in params: {e}",
457
- )
458
- elif isinstance(params, dict):
459
- p = params
460
- else:
461
- return error_response(
462
- tool="web_scraper",
463
- input={"action": action, "params": params},
464
- error_type="validation_error",
465
- code="E4001",
466
- message="params must be a dictionary or JSON string",
467
- )
468
-
469
- a = (action or "").strip().lower()
470
- if not a:
471
- return error_response(
472
- tool="web_scraper",
473
- input={"action": action, "params": p},
474
- error_type="validation_error",
475
- code="E4001",
476
- message="action is required",
477
- )
478
-
479
- client = await ServerContext.get_client()
480
-
481
- if a == "groups":
482
- # Reuse helper via full module: simply call web_scraper.groups by computing from catalog
483
- # We use web_scraper.catalog meta/groups via _catalog
484
- page, meta = _catalog(group=None, keyword=None, limit=1, offset=0)
485
- return ok_response(tool="web_scraper", input={"action": "groups", "params": p}, output={"groups": meta.get("groups"), "total": meta.get("total")})
486
-
487
- if a == "catalog":
488
- limit = max(1, min(int(p.get("limit", 100)), 500))
489
- offset = max(0, int(p.get("offset", 0)))
490
- page, meta = _catalog(group=p.get("group"), keyword=p.get("keyword"), limit=limit, offset=offset)
491
- return ok_response(tool="web_scraper", input={"action": "catalog", "params": p}, output={"tools": [tool_schema(t) for t in page], "meta": meta})
492
-
493
- if a == "run":
494
- tool = str(p.get("tool", ""))
495
- if not tool:
496
- return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing tool")
497
- params_dict = p.get("params") if isinstance(p.get("params"), dict) else None
498
- param_json = p.get("param_json")
499
- if params_dict is None:
500
- if isinstance(param_json, str) and param_json:
501
- try:
502
- params_dict = json.loads(param_json)
503
- except json.JSONDecodeError as e:
504
- return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="json_error", code="E4002", message=str(e))
505
- else:
506
- params_dict = {}
507
- wait = bool(p.get("wait", True))
508
- max_wait_seconds = int(p.get("max_wait_seconds", 300))
509
- file_type = str(p.get("file_type", "json"))
510
- return await _run_web_scraper_tool(tool=tool, params=params_dict, wait=wait, max_wait_seconds=max_wait_seconds, file_type=file_type, ctx=ctx)
511
-
512
- if a == "batch_run":
513
- reqs = p.get("requests")
514
- if not isinstance(reqs, list) or not reqs:
515
- return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing requests[]")
516
- concurrency = max(1, min(int(p.get("concurrency", 5)), 20))
517
- wait = bool(p.get("wait", True))
518
- max_wait_seconds = int(p.get("max_wait_seconds", 300))
519
- file_type = str(p.get("file_type", "json"))
520
- sem = asyncio.Semaphore(concurrency)
521
-
522
- async def _one(i: int, r: dict[str, Any]) -> dict[str, Any]:
523
- tool = str(r.get("tool", ""))
524
- if not tool:
525
- return {"index": i, "ok": False, "error": {"type": "validation_error", "message": "Missing tool"}}
526
- params_dict = r.get("params") if isinstance(r.get("params"), dict) else {}
527
- async with sem:
528
- out = await _run_web_scraper_tool(tool=tool, params=params_dict, wait=wait, max_wait_seconds=max_wait_seconds, file_type=file_type, ctx=ctx)
529
- # compact per-item
530
- if out.get("ok") is True and isinstance(out.get("output"), dict):
531
- o = out["output"]
532
- out["output"] = {k: o.get(k) for k in ("task_id", "spider_id", "spider_name", "status", "download_url") if k in o}
533
- return {"index": i, **out}
534
-
535
- await safe_ctx_info(ctx, f"web_scraper.batch_run count={len(reqs)} concurrency={concurrency}")
536
- results = await asyncio.gather(*[_one(i, r if isinstance(r, dict) else {}) for i, r in enumerate(reqs)])
537
- return ok_response(tool="web_scraper", input={"action": "batch_run", "params": p}, output={"results": results})
538
-
539
- if a == "list_tasks":
540
- page = max(1, int(p.get("page", 1)))
541
- size = max(1, min(int(p.get("size", 20)), 200))
542
- data = await client.list_tasks(page=page, size=size)
543
- return ok_response(tool="web_scraper", input={"action": "list_tasks", "params": p}, output=data)
544
-
545
- if a == "status":
546
- tid = str(p.get("task_id", ""))
547
- if not tid:
548
- return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_id")
549
- s = await client.get_task_status(tid)
550
- return ok_response(tool="web_scraper", input={"action": "status", "params": p}, output={"task_id": tid, "status": str(s)})
551
-
552
- if a == "status_batch":
553
- tids = p.get("task_ids")
554
- if not isinstance(tids, list) or not tids:
555
- return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_ids[]")
556
- results = []
557
- for tid in [str(x) for x in tids[:200]]:
558
- try:
559
- s = await client.get_task_status(tid)
560
- results.append({"task_id": tid, "ok": True, "status": str(s)})
561
- except Exception as e:
562
- results.append({"task_id": tid, "ok": False, "error": {"message": str(e)}})
563
- return ok_response(tool="web_scraper", input={"action": "status_batch", "params": {"count": len(tids)}}, output={"results": results})
564
-
565
- if a == "wait":
566
- tid = str(p.get("task_id", ""))
567
- if not tid:
568
- return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_id")
569
- poll = float(p.get("poll_interval_seconds", 5.0))
570
- max_wait = float(p.get("max_wait_seconds", 600.0))
571
- s = await client.wait_for_task(tid, poll_interval=poll, max_wait=max_wait)
572
- return ok_response(tool="web_scraper", input={"action": "wait", "params": p}, output={"task_id": tid, "status": str(s)})
573
-
574
- if a == "result":
575
- tid = str(p.get("task_id", ""))
576
- if not tid:
577
- return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_id")
578
- file_type = str(p.get("file_type", "json"))
579
- preview = bool(p.get("preview", True))
580
- preview_max_chars = int(p.get("preview_max_chars", 20_000))
581
- dl = await client.get_task_result(tid, file_type=file_type)
582
- from thordata_mcp.utils import enrich_download_url
583
-
584
- dl = enrich_download_url(dl, task_id=tid, file_type=file_type)
585
- preview_obj = None
586
- structured = None
587
- if preview and file_type.lower() == "json":
588
- preview_obj = await _fetch_json_preview(dl, max_chars=preview_max_chars)
589
- if preview_obj.get("ok") is True:
590
- data = preview_obj.get("data")
591
- if isinstance(data, list) and data:
592
- structured = _normalize_record(data[0])
593
- elif isinstance(data, dict):
594
- structured = _normalize_record(data)
595
- return ok_response(tool="web_scraper", input={"action": "result", "params": p}, output={"task_id": tid, "download_url": dl, "preview": preview_obj, "structured": structured})
596
-
597
- if a == "result_batch":
598
- tids = p.get("task_ids")
599
- if not isinstance(tids, list) or not tids:
600
- return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_ids[]")
601
- file_type = str(p.get("file_type", "json"))
602
- preview = bool(p.get("preview", False))
603
- preview_max_chars = int(p.get("preview_max_chars", 20_000))
604
- from thordata_mcp.utils import enrich_download_url
605
-
606
- results = []
607
- for tid in [str(x) for x in tids[:100]]:
608
- try:
609
- dl = await client.get_task_result(tid, file_type=file_type)
610
- dl = enrich_download_url(dl, task_id=tid, file_type=file_type)
611
- prev = None
612
- structured = None
613
- if preview and file_type.lower() == "json":
614
- prev = await _fetch_json_preview(dl, max_chars=preview_max_chars)
615
- if prev.get("ok") is True:
616
- data = prev.get("data")
617
- if isinstance(data, list) and data:
618
- structured = _normalize_record(data[0])
619
- elif isinstance(data, dict):
620
- structured = _normalize_record(data)
621
- results.append({"task_id": tid, "ok": True, "download_url": dl, "preview": prev, "structured": structured})
622
- except Exception as e:
623
- results.append({"task_id": tid, "ok": False, "error": {"message": str(e)}})
624
- return ok_response(tool="web_scraper", input={"action": "result_batch", "params": {"count": len(tids)}}, output={"results": results})
625
-
626
- if a == "cancel":
627
- # Public spec currently doesn't provide cancel; keep clear error
628
- tid = str(p.get("task_id", ""))
629
- return error_response(tool="web_scraper", input={"action": "cancel", "params": p}, error_type="not_supported", code="E4005", message="Cancel endpoint not available in public Tasks API.", details={"task_id": tid})
630
-
631
- return error_response(
632
- tool="web_scraper",
633
- input={"action": action, "params": p},
634
- error_type="validation_error",
635
- code="E4001",
636
- message=f"Unknown action '{action}'. Supported actions: 'catalog', 'groups', 'run', 'batch_run', 'status', 'wait', 'result', 'list_tasks', 'status_batch', 'result_batch', 'cancel'",
637
- )
638
-
639
- # -------------------------
640
- # BROWSER SCRAPER (compact)
641
- # -------------------------
642
- @mcp.tool(name="browser")
643
- @handle_mcp_errors
644
- async def browser(
645
- action: str,
646
- *,
647
- params: dict[str, Any] | None = None,
648
- ctx: Optional[Context] = None,
649
- ) -> dict[str, Any]:
650
- """BROWSER SCRAPER: action in {navigate, snapshot}.
651
-
652
- Args:
653
- action: Action to perform - "navigate" or "snapshot"
654
- params: Parameters dictionary. For "navigate": {"url": "https://..."}
655
- For "snapshot": {"filtered": true}
656
-
657
- Examples:
658
- browser(action="navigate", params={"url": "https://www.google.com"})
659
- browser(action="snapshot", params={"filtered": true})
660
- """
661
- # Normalize params: handle None, empty dict, or string (JSON)
662
- if params is None:
663
- p = {}
664
- elif isinstance(params, str):
665
- try:
666
- p = json.loads(params)
667
- except json.JSONDecodeError as e:
668
- return error_response(
669
- tool="browser",
670
- input={"action": action, "params": params},
671
- error_type="json_error",
672
- code="E4002",
673
- message=f"Invalid JSON in params: {e}",
674
- )
675
- elif isinstance(params, dict):
676
- p = params
677
- else:
678
- return error_response(
679
- tool="browser",
680
- input={"action": action, "params": params},
681
- error_type="validation_error",
682
- code="E4001",
683
- message="params must be a dictionary or JSON string",
684
- )
685
-
686
- a = (action or "").strip().lower()
687
- if not a:
688
- return error_response(
689
- tool="browser",
690
- input={"action": action, "params": p},
691
- error_type="validation_error",
692
- code="E4001",
693
- message="action is required",
694
- )
695
-
696
- # Credentials check
697
- user = settings.THORDATA_BROWSER_USERNAME
698
- pwd = settings.THORDATA_BROWSER_PASSWORD
699
- if not user or not pwd:
700
- return error_response(
701
- tool="browser",
702
- input={"action": action, "params": p},
703
- error_type="config_error",
704
- code="E1001",
705
- message="Missing browser credentials. Set THORDATA_BROWSER_USERNAME and THORDATA_BROWSER_PASSWORD.",
706
- )
707
- session = await ServerContext.get_browser_session()
708
- if a == "navigate":
709
- url = str(p.get("url", ""))
710
- if not url:
711
- return error_response(tool="browser", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing url")
712
- page = await session.get_page(url)
713
- if page.url != url:
714
- await page.goto(url, timeout=120_000)
715
- title = await page.title()
716
- return ok_response(tool="browser", input={"action": "navigate", "params": p}, output={"url": page.url, "title": title})
717
- if a == "snapshot":
718
- filtered = bool(p.get("filtered", True))
719
- data = await session.capture_snapshot(filtered=filtered)
720
- aria_snapshot = truncate_content(str(data.get("aria_snapshot", "")))
721
- dom_snapshot = data.get("dom_snapshot")
722
- dom_snapshot = truncate_content(str(dom_snapshot)) if dom_snapshot else None
723
- return ok_response(
724
- tool="browser",
725
- input={"action": "snapshot", "params": p},
726
- output={
727
- "url": data.get("url"),
728
- "title": data.get("title"),
729
- "aria_snapshot": aria_snapshot,
730
- "dom_snapshot": dom_snapshot,
731
- },
732
- )
733
- return error_response(
734
- tool="browser",
735
- input={"action": action, "params": p},
736
- error_type="validation_error",
737
- code="E4001",
738
- message=f"Unknown action '{action}'. Supported actions: 'navigate', 'snapshot'",
739
- )
740
-
741
- # -------------------------
742
- # SMART SCRAPE (compact)
743
- # -------------------------
744
- @mcp.tool(name="smart_scrape")
745
- @handle_mcp_errors
746
- async def smart_scrape(
747
- url: str,
748
- *,
749
- prefer_structured: bool = True,
750
- preview: bool = True,
751
- preview_max_chars: int = 20_000,
752
- max_wait_seconds: int = 300,
753
- ctx: Optional[Context] = None,
754
- ) -> dict[str, Any]:
755
- """Auto-pick a Web Scraper task for URL; fallback to Unlocker. Always returns structured."""
756
- await safe_ctx_info(ctx, f"smart_scrape url={url!r} prefer_structured={prefer_structured}")
757
- host = _hostname(url)
758
- url_lower = url.lower()
759
-
760
- # Special-case: Google search pages are best handled by SERP (more reliable than Unlocker).
761
- if prefer_structured:
762
- from .product import _extract_google_search_query as _extract_q
763
- from .product import _is_google_search_url as _is_gsearch
764
-
765
- if _is_gsearch(url):
766
- q = _extract_q(url)
767
- await safe_ctx_info(ctx, f"smart_scrape: Google search detected, routing to SERP q={q!r}")
768
- try:
769
- from thordata.types import SerpRequest
770
- from thordata.types import Engine as EngineEnum
771
- client = await ServerContext.get_client()
772
- req = SerpRequest(
773
- query=str(q or ""),
774
- engine=EngineEnum.GOOGLE,
775
- num=10,
776
- start=0,
777
- country=None,
778
- language=None,
779
- google_domain="google.com",
780
- gl=None,
781
- hl=None,
782
- location=None,
783
- uule=None,
784
- ludocid=None,
785
- kgmid=None,
786
- extra_params={},
787
- )
788
- data = await client.serp_search_advanced(req)
789
- return ok_response(
790
- tool="smart_scrape",
791
- input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
792
- output={
793
- "path": "SERP",
794
- "serp": {"engine": "google", "q": q, "num": 10, "start": 0},
795
- "result": data,
796
- "structured": {"url": url, "query": q, "engine": "google"},
797
- "candidates": [],
798
- "tried": [],
799
- },
800
- )
801
- except Exception as e:
802
- await safe_ctx_info(ctx, f"smart_scrape: SERP routing failed, falling back. err={e}")
803
-
804
- # Match product.py behavior: for certain URLs, don't even attempt Web Scraper.
805
- # - Google search pages: prefer SERP / Unlocker
806
- # - Generic/example domains: never pick marketplace/product tools
807
- skip_web_scraper = False
808
- if host == "google.com" and "/search" in url_lower:
809
- skip_web_scraper = True
810
- generic_domains = {"example.com", "example.org", "example.net", "test.com", "localhost"}
811
- if host in generic_domains or (host and host.endswith(".example.com")):
812
- skip_web_scraper = True
813
-
814
- selected_tool: str | None = None
815
- selected_params: dict[str, Any] = {}
816
- candidates: list[tuple[str, dict[str, Any]]] = []
817
- if not skip_web_scraper:
818
- selected_tool, selected_params = _guess_tool_for_url(url)
819
- # Only keep guessed tool if it exists in tool map (avoid invalid hardcode drift)
820
- from .product import _ensure_tools as _ensure # local import to avoid cycles
821
-
822
- _, tools_map = _ensure()
823
- if selected_tool and selected_tool in tools_map:
824
- candidates.append((selected_tool, selected_params))
825
-
826
- if not candidates:
827
- candidate_keys = _candidate_tools_for_url(url, limit=3)
828
- # Filter out obviously wrong tools (like GitHub for non-GitHub URLs)
829
- filtered_candidates: list[str] = []
830
- for k in candidate_keys:
831
- lk = k.lower()
832
- if "github" in lk and host and "github" not in host.lower():
833
- continue
834
- if "repository" in lk and host and "github" not in host.lower() and "gitlab" not in host.lower():
835
- continue
836
- if "amazon" in lk and host and "amazon" not in host.lower():
837
- continue
838
- if "walmart" in lk and host and "walmart" not in host.lower():
839
- continue
840
- if ("googleshopping" in lk or "google.shopping" in lk) and (host == "google.com" or "/search" in url_lower):
841
- continue
842
- filtered_candidates.append(k)
843
-
844
- for k in filtered_candidates:
845
- candidates.append((k, {"url": url}))
846
- else:
847
- await safe_ctx_info(ctx, f"smart_scrape: skipping Web Scraper for host={host!r} url={url!r}")
848
-
849
- if prefer_structured and candidates:
850
- tried: list[dict[str, Any]] = []
851
- for tool, params in candidates[:3]:
852
- r = await _run_web_scraper_tool(tool=tool, params=params, wait=True, max_wait_seconds=max_wait_seconds, file_type="json", ctx=ctx)
853
- # Check if task succeeded (status should be Ready/Success, not Failed)
854
- result_obj = r.get("output") if isinstance(r.get("output"), dict) else {}
855
- status = result_obj.get("status", "").lower() if isinstance(result_obj, dict) else ""
856
-
857
- # If status is Failed, don't try more Web Scraper tools - go to Unlocker
858
- # Also check if r.get("ok") is False, which indicates the tool call itself failed
859
- if status == "failed" or r.get("ok") is False:
860
- await safe_ctx_info(ctx, f"smart_scrape: Web Scraper tool {tool} failed (status={status}, ok={r.get('ok')}), falling back to Unlocker")
861
- tried.append({
862
- "tool": tool,
863
- "ok": r.get("ok"),
864
- "status": status,
865
- "error": r.get("error"),
866
- })
867
- break # Exit loop and go to Unlocker fallback
868
-
869
- # Only return success if both ok is True AND status is not failed
870
- if r.get("ok") is True and status not in {"failed", "error", "failure"}:
871
- out = r.get("output") if isinstance(r.get("output"), dict) else {}
872
- dl = out.get("download_url") if isinstance(out, dict) else None
873
- preview_obj = None
874
- structured = {"url": url}
875
- if preview and isinstance(dl, str) and dl:
876
- preview_obj = await _fetch_json_preview(dl, max_chars=int(preview_max_chars))
877
- # Try to use preview data even if JSON parsing failed but we have raw data
878
- if preview_obj.get("ok") is True:
879
- data = preview_obj.get("data")
880
- if isinstance(data, list) and data:
881
- structured = _normalize_record(data[0], url=url)
882
- elif isinstance(data, dict):
883
- structured = _normalize_record(data, url=url)
884
- elif preview_obj.get("status") == 200 and preview_obj.get("raw"):
885
- # JSON parsing failed but we have raw data - try to extract basic info
886
- raw = preview_obj.get("raw", "")
887
- if raw:
888
- # Try to extract basic fields from raw text if possible
889
- structured = {"url": url, "raw_preview": raw[:500]} # Limit raw preview size
890
- return ok_response(
891
- tool="smart_scrape",
892
- input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
893
- output={"path": "WEB_SCRAPER", "selected_tool": tool, "selected_params": params, "result": out, "structured": structured, "preview": preview_obj, "tried": tried},
894
- )
895
- tried.append({"tool": tool, "ok": r.get("ok"), "status": status, "error": r.get("error")})
896
-
897
- client = await ServerContext.get_client()
898
- try:
899
- with PerformanceTimer(tool="smart_scrape.unlocker", url=url):
900
- html = await client.universal_scrape(url=url, js_render=True, output_format="html")
901
- html_str = str(html) if not isinstance(html, str) else html
902
- extracted = _extract_structured_from_html(html_str) if html_str else {}
903
- structured = _normalize_extracted(extracted, url=url)
904
- return ok_response(
905
- tool="smart_scrape",
906
- input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
907
- output={
908
- "path": "WEB_UNLOCKER",
909
- "unlocker": {"html": html_str},
910
- "extracted": extracted,
911
- "structured": structured,
912
- "selected_tool": selected_tool,
913
- "selected_params": selected_params,
914
- "candidates": [c[0] for c in candidates],
915
- "tried": tried if "tried" in locals() else [],
916
- },
917
- )
918
- except asyncio.TimeoutError as e:
919
- # Handle timeout specifically
920
- await safe_ctx_info(ctx, f"smart_scrape: Unlocker timed out: {e}")
921
- return error_response(
922
- tool="smart_scrape",
923
- input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
924
- error_type="timeout_error",
925
- code="E2003",
926
- message=f"Unlocker request timed out. The page may be slow to load or blocked.",
927
- details={
928
- "selected_tool": selected_tool,
929
- "candidates": [c[0] for c in candidates],
930
- "tried": tried if "tried" in locals() else [],
931
- },
932
- )
933
- except Exception as e:
934
- # If Unlocker also fails, return error with context
935
- await safe_ctx_info(ctx, f"smart_scrape: Unlocker also failed: {e}")
936
- error_msg = str(e)
937
- # Extract more useful error information
938
- if "504" in error_msg or "Gateway Timeout" in error_msg:
939
- error_type = "timeout_error"
940
- error_code = "E2003"
941
- error_message = f"Unlocker request timed out (504 Gateway Timeout). The page may be slow to load or blocked."
942
- elif "timeout" in error_msg.lower():
943
- error_type = "timeout_error"
944
- error_code = "E2003"
945
- error_message = f"Unlocker request timed out: {error_msg}"
946
- else:
947
- error_type = "network_error"
948
- error_code = "E2002"
949
- error_message = f"Both Web Scraper and Unlocker failed. Last error: {error_msg}"
950
- return error_response(
951
- tool="smart_scrape",
952
- input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
953
- error_type=error_type,
954
- code=error_code,
955
- message=error_message,
956
- details={
957
- "selected_tool": selected_tool,
958
- "candidates": [c[0] for c in candidates],
959
- "tried": tried if "tried" in locals() else [],
960
- },
961
- )
962
-
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ from typing import Any, Optional
6
+
7
+ from thordata_mcp.tools.params_utils import create_params_error, normalize_params
8
+ from thordata_mcp.tools.debug import register as register_debug
9
+ from thordata_mcp.config import get_settings
10
+
11
+ from mcp.server.fastmcp import Context, FastMCP
12
+
13
+ from thordata_mcp.config import settings
14
+ from thordata_mcp.context import ServerContext
15
+ from thordata_mcp.monitoring import PerformanceTimer
16
+ from thordata_mcp.utils import (
17
+ error_response,
18
+ handle_mcp_errors,
19
+ html_to_markdown_clean,
20
+ ok_response,
21
+ safe_ctx_info,
22
+ truncate_content,
23
+ )
24
+
25
+ # Tool schema helper (for catalog)
26
+ from .utils import tool_schema # noqa: E402
27
+
28
+ # Reuse battle-tested helpers from the full product module
29
+ from .product import ( # noqa: E402
30
+ _catalog,
31
+ _candidate_tools_for_url,
32
+ _extract_structured_from_html,
33
+ _fetch_json_preview,
34
+ _guess_tool_for_url,
35
+ _hostname,
36
+ _normalize_extracted,
37
+ _normalize_record,
38
+ _run_web_scraper_tool,
39
+ _to_light_json,
40
+ )
41
+
42
+ def _build_params_template(schema: dict[str, Any]) -> dict[str, Any]:
43
+ """Build a minimal runnable params template from a tool_schema() dict.
44
+
45
+ We do NOT include URL examples; we only provide placeholders and defaults.
46
+ """
47
+ fields = schema.get("fields") if isinstance(schema, dict) else None
48
+ if not isinstance(fields, dict):
49
+ return {}
50
+
51
+ template: dict[str, Any] = {}
52
+ for k, meta in fields.items():
53
+ if k in {"SPIDER_ID", "SPIDER_NAME"}:
54
+ continue
55
+ if not isinstance(meta, dict):
56
+ continue
57
+ required = bool(meta.get("required"))
58
+ default = meta.get("default")
59
+ typ = str(meta.get("type") or "")
60
+
61
+ # Always special-case common_settings for video tools, regardless of required/optional.
62
+ if k == "common_settings":
63
+ try:
64
+ from thordata.types.common import CommonSettings
65
+
66
+ cs_fields = getattr(CommonSettings, "__dataclass_fields__", {}) # type: ignore[attr-defined]
67
+ cs_template: dict[str, Any] = {}
68
+ for ck, cf in cs_fields.items():
69
+ # Keep all optional keys visible; user fills what they need.
70
+ if ck.startswith("_"):
71
+ continue
72
+ # default is always None in SDK, keep placeholder to make schema explicit
73
+ cs_template[ck] = f"<{ck}>"
74
+ template[k] = cs_template
75
+ except Exception:
76
+ # Fall back to a generic dict placeholder if SDK shape changes.
77
+ template[k] = {}
78
+ continue
79
+
80
+ # For required fields without defaults, provide a clear placeholder.
81
+ if required and default is None:
82
+ template[k] = f"<{k}>"
83
+ continue
84
+
85
+ # For optional fields, include default only if it's not None.
86
+ if default is not None:
87
+ template[k] = default
88
+ continue
89
+
90
+ # For some known shapes, provide a sensible empty structure.
91
+ if "dict" in typ:
92
+ template[k] = {}
93
+ elif "list" in typ:
94
+ template[k] = []
95
+ # else: omit
96
+
97
+ return template
98
+
99
+
100
+ def register(mcp: FastMCP) -> None:
101
+ """Register the compact product surface (competitor-style).
102
+
103
+ Core tools are exposed:
104
+ - serp
105
+ - search_engine / search_engine_batch (minimal web search)
106
+ - unlocker
107
+ - web_scraper
108
+ - browser
109
+ - smart_scrape
110
+
111
+ Plus optional debug helper:
112
+ - debug.status
113
+
114
+ Tool exposure can be controlled via environment variables:
115
+ - THORDATA_TOOLS: comma-separated tool names to explicitly enable (optional)
116
+ - THORDATA_MODE / THORDATA_GROUPS: legacy knobs (kept for backward-compat)
117
+ """
118
+
119
+ cfg = get_settings()
120
+ mode = str(getattr(cfg, "THORDATA_MODE", "rapid")).strip().lower()
121
+ groups = [g.strip().lower() for g in (getattr(cfg, "THORDATA_GROUPS", "") or "").split(",") if g.strip()]
122
+ tools = [t.strip().lower() for t in (getattr(cfg, "THORDATA_TOOLS", "") or "").split(",") if t.strip()]
123
+
124
+ # Register debug helper tools (read-only) only when enabled
125
+ if getattr(cfg, "THORDATA_DEBUG_TOOLS", False):
126
+ register_debug(mcp)
127
+
128
+ # Decide which tools to register.
129
+ # Competitor-style defaults: keep tool surface small for LLMs.
130
+ # We always expose a small base set; advanced tools require explicit allowlisting via THORDATA_TOOLS.
131
+ all_tools = {
132
+ "search_engine",
133
+ "search_engine_batch",
134
+ "serp",
135
+ "unlocker",
136
+ "web_scraper",
137
+ "web_scraper.help",
138
+ "browser",
139
+ "smart_scrape",
140
+ }
141
+ base_tools = {"search_engine", "unlocker", "browser", "smart_scrape"}
142
+
143
+ # Legacy note:
144
+ # We keep THORDATA_MODE/THORDATA_GROUPS for backward-compat, but avoid relying on multi-tier modes.
145
+ # If someone explicitly sets THORDATA_MODE=pro, we still honor it for now.
146
+ if mode == "pro":
147
+ allowed_tools = set(all_tools)
148
+ else:
149
+ allowed_tools = set(base_tools)
150
+ allowed_tools |= {t for t in tools if t in all_tools}
151
+
152
+ def _allow(name: str) -> bool:
153
+ return name.lower() in allowed_tools
154
+
155
+ # -------------------------
156
+ # SERP (compact)
157
+ # -------------------------
158
+ # Web search aliases
159
+ # - search_engine: single query web search
160
+ # - search_engine_batch: batch web search
161
+ if _allow("search_engine"):
162
+ @mcp.tool(
163
+ name="search_engine",
164
+ description=(
165
+ "Web search with AI-optimized results. "
166
+ 'Params example: {"q": "Python", "num": 10, "engine": "google", "format": "light_json"}. '
167
+ "Returns a minimal, LLM-friendly subset: title/link/description."
168
+ ),
169
+ )
170
+ @handle_mcp_errors
171
+ async def search_engine(
172
+ *,
173
+ params: Any = None,
174
+ ctx: Optional[Context] = None,
175
+ ) -> dict[str, Any]:
176
+ # Schema-friendly normalization: accept q/query, set sensible defaults.
177
+ try:
178
+ p = normalize_params(params, "search_engine", "search")
179
+ except ValueError as e:
180
+ return create_params_error("search_engine", "search", params, str(e))
181
+
182
+ q = str(p.get("q", "") or p.get("query", "")).strip()
183
+ if not q:
184
+ return error_response(
185
+ tool="search_engine",
186
+ input={"params": p},
187
+ error_type="validation_error",
188
+ code="E4001",
189
+ message="Missing q (provide params.q or params.query)",
190
+ details={"params_example": {"q": "Python web scraping", "num": 10, "engine": "google"}},
191
+ )
192
+
193
+ # Normalize basic options with defaults (schema-style).
194
+ engine = str(p.get("engine", "google") or "google").strip()
195
+ num = int(p.get("num", 10) or 10)
196
+ start = int(p.get("start", 0) or 0)
197
+ fmt = str(p.get("format", "light_json") or "light_json").strip().lower()
198
+ if num <= 0 or num > 50:
199
+ return error_response(
200
+ tool="search_engine",
201
+ input={"params": p},
202
+ error_type="validation_error",
203
+ code="E4001",
204
+ message="num must be between 1 and 50",
205
+ details={"num": num},
206
+ )
207
+
208
+ # Delegate to serp.search
209
+ await safe_ctx_info(ctx, f"search_engine q={q!r} engine={engine!r} num={num} start={start}")
210
+ out = await serp(
211
+ action="search",
212
+ params={"q": q, "engine": engine, "num": num, "start": start, "format": fmt, **{k: v for k, v in p.items() if k not in {"q", "query", "engine", "num", "start", "format"}}},
213
+ ctx=ctx,
214
+ )
215
+ if out.get("ok") is not True:
216
+ return out
217
+
218
+ data = out.get("output")
219
+ organic = data.get("organic") if isinstance(data, dict) else None
220
+ results = []
221
+ if isinstance(organic, list):
222
+ for r in organic[:num]:
223
+ if not isinstance(r, dict):
224
+ continue
225
+ results.append(
226
+ {
227
+ "title": r.get("title"),
228
+ "link": r.get("link"),
229
+ "description": r.get("description"),
230
+ }
231
+ )
232
+
233
+ return ok_response(
234
+ tool="search_engine",
235
+ input={"params": p},
236
+ output={
237
+ "query": q,
238
+ "engine": engine,
239
+ "results": results,
240
+ "_meta": data.get("_meta") if isinstance(data, dict) else None,
241
+ },
242
+ )
243
+
244
+ if _allow("search_engine_batch"):
245
+ @mcp.tool(
246
+ name="search_engine_batch",
247
+ description=(
248
+ "Batch web search. "
249
+ 'Params example: {"requests": [{"q": "q1"}, {"q": "q2"}], "concurrency": 5, "engine": "google"}.'
250
+ ),
251
+ )
252
+ @handle_mcp_errors
253
+ async def search_engine_batch(
254
+ *,
255
+ params: Any = None,
256
+ ctx: Optional[Context] = None,
257
+ ) -> dict[str, Any]:
258
+ try:
259
+ p = normalize_params(params, "search_engine_batch", "batch_search")
260
+ except ValueError as e:
261
+ return create_params_error("search_engine_batch", "batch_search", params, str(e))
262
+
263
+ reqs = p.get("requests")
264
+ if not isinstance(reqs, list) or not reqs:
265
+ return error_response(
266
+ tool="search_engine_batch",
267
+ input={"params": p},
268
+ error_type="validation_error",
269
+ code="E4001",
270
+ message="Missing requests[] (array of {q,...} objects)",
271
+ )
272
+
273
+ # Optional shared defaults for engine/num/start
274
+ default_engine = str(p.get("engine", "google") or "google").strip()
275
+ default_num = int(p.get("num", 10) or 10)
276
+ if default_num <= 0 or default_num > 50:
277
+ return error_response(
278
+ tool="search_engine_batch",
279
+ input={"params": p},
280
+ error_type="validation_error",
281
+ code="E4001",
282
+ message="num must be between 1 and 50",
283
+ details={"num": default_num},
284
+ )
285
+
286
+ # Delegate to serp.batch_search
287
+ await safe_ctx_info(ctx, f"search_engine_batch count={len(reqs)}")
288
+ out = await serp(
289
+ action="batch_search",
290
+ params={
291
+ **p,
292
+ "requests": [
293
+ {
294
+ **r,
295
+ "q": str((r.get("q") if isinstance(r, dict) else "") or (r.get("query") if isinstance(r, dict) else "")).strip(),
296
+ "engine": str((r.get("engine") if isinstance(r, dict) else "") or default_engine),
297
+ "num": int((r.get("num") if isinstance(r, dict) else 0) or default_num),
298
+ }
299
+ for r in reqs if isinstance(r, dict)
300
+ ],
301
+ },
302
+ ctx=ctx,
303
+ )
304
+ if out.get("ok") is not True:
305
+ return out
306
+
307
+ data = out.get("output")
308
+ results = []
309
+ if isinstance(data, dict):
310
+ for item in data.get("results", []) if isinstance(data.get("results"), list) else []:
311
+ if not isinstance(item, dict):
312
+ continue
313
+ o = item.get("output")
314
+ organic = o.get("organic") if isinstance(o, dict) else None
315
+ mapped = []
316
+ if isinstance(organic, list):
317
+ for r in organic:
318
+ if not isinstance(r, dict):
319
+ continue
320
+ mapped.append({"title": r.get("title"), "link": r.get("link"), "description": r.get("description")})
321
+ results.append(
322
+ {
323
+ "index": item.get("index"),
324
+ "ok": bool(item.get("ok")),
325
+ "input": {"q": item.get("q"), "engine": item.get("engine"), "num": item.get("num")},
326
+ "results": mapped if item.get("ok") else None,
327
+ "error": item.get("error") if not item.get("ok") else None,
328
+ }
329
+ )
330
+
331
+ return ok_response(tool="search_engine_batch", input={"params": p}, output={"results": results})
332
+
333
+ # -------------------------
334
+ # Low-level SERP (advanced users; not exposed by default)
335
+ @handle_mcp_errors
336
+ async def serp(
337
+ action: str,
338
+ *,
339
+ params: Any = None,
340
+ ctx: Optional[Context] = None,
341
+ ) -> dict[str, Any]:
342
+ """SERP SCRAPER: action in {search, batch_search}.
343
+
344
+ Args:
345
+ action: Action to perform - "search" or "batch_search"
346
+ params: Parameters dictionary. For "search": {"q": "query", "num": 10, "engine": "google", ...}
347
+ For "batch_search": {"requests": [{"q": "query1"}, ...], "concurrency": 5}
348
+
349
+ Examples:
350
+ serp(action="search", params={"q": "Python programming", "num": 10})
351
+ serp(action="batch_search", params={"requests": [{"q": "query1"}, {"q": "query2"}], "concurrency": 5})
352
+ """
353
+ # Normalize params with enhanced error messages
354
+ try:
355
+ p = normalize_params(params, "serp", action)
356
+ except ValueError as e:
357
+ if "JSON" in str(e):
358
+ return create_params_error("serp", action, params, str(e))
359
+ else:
360
+ return create_params_error("serp", action, params, str(e))
361
+
362
+ a = (action or "").strip().lower()
363
+ if not a:
364
+ return error_response(
365
+ tool="serp",
366
+ input={"action": action, "params": p},
367
+ error_type="validation_error",
368
+ code="E4001",
369
+ message="action is required",
370
+ )
371
+
372
+ client = await ServerContext.get_client()
373
+
374
+ if a == "search":
375
+ # Mirror serp.search product contract
376
+ q = str(p.get("q", ""))
377
+ if not q:
378
+ return error_response(tool="serp", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing q")
379
+ engine_in = str(p.get("engine", "google")).strip() or "google"
380
+ num = int(p.get("num", 10))
381
+ start = int(p.get("start", 0))
382
+ fmt = str(p.get("format", "json")).strip().lower()
383
+ # Backend contract nuance:
384
+ # - Some engines support "mode" via engine name (google_images/news/videos/shopping/ai_mode)
385
+ # - For engine=google, passing tbm often breaks on some backends. We route to a specific engine when possible.
386
+ tbm_raw = p.get("tbm")
387
+ tbm_lower = tbm_raw.strip().lower() if isinstance(tbm_raw, str) else None
388
+ engine = engine_in
389
+ if engine_in.lower() == "google" and tbm_lower in {"images", "news", "videos", "shops", "shopping"}:
390
+ # Map tbm-style mode to dedicated engine.
391
+ engine_map = {
392
+ "images": "google_images",
393
+ "news": "google_news",
394
+ "videos": "google_videos",
395
+ "shops": "google_shopping",
396
+ "shopping": "google_shopping",
397
+ }
398
+ engine = engine_map[tbm_lower]
399
+
400
+ # For engines that explicitly support tbm modes, keep tbm as-is but normalize common aliases
401
+ # (do NOT convert to isch/nws/vid/shop here; those are Google UI tbm values and may differ from backend contract).
402
+ if isinstance(tbm_raw, str):
403
+ tbm_alias = {"image": "images", "video": "videos", "shop": "shops"}
404
+ tbm_norm = tbm_alias.get(tbm_lower)
405
+ if tbm_norm:
406
+ p = dict(p)
407
+ p["tbm"] = tbm_norm
408
+ # Leverage SerpRequest mapping via SDK by calling full tool through request object
409
+ from thordata.types import SerpRequest
410
+
411
+ sdk_fmt = "json" if fmt in {"json", "light_json", "light"} else ("both" if fmt in {"both", "json+html", "2"} else "html")
412
+ extra_params = p.get("extra_params") if isinstance(p.get("extra_params"), dict) else {}
413
+ if p.get("ai_overview") is not None:
414
+ extra_params = dict(extra_params)
415
+ extra_params["ai_overview"] = p.get("ai_overview")
416
+ # Dashboard-style passthrough parameters (kept in extra_params)
417
+ for k in ("safe", "nfpr", "filter", "tbs", "ibp", "lsig", "si", "uds"):
418
+ if p.get(k) is not None:
419
+ extra_params = dict(extra_params)
420
+ extra_params[k] = p.get(k)
421
+ req = SerpRequest(
422
+ query=q,
423
+ engine=engine,
424
+ num=num,
425
+ start=start,
426
+ device=p.get("device"),
427
+ output_format=sdk_fmt,
428
+ render_js=p.get("render_js"),
429
+ no_cache=p.get("no_cache"),
430
+ google_domain=p.get("google_domain"),
431
+ country=p.get("gl"),
432
+ language=p.get("hl"),
433
+ countries_filter=p.get("cr"),
434
+ languages_filter=p.get("lr"),
435
+ location=p.get("location"),
436
+ uule=p.get("uule"),
437
+ search_type=p.get("tbm"),
438
+ ludocid=p.get("ludocid"),
439
+ kgmid=p.get("kgmid"),
440
+ extra_params=extra_params,
441
+ )
442
+ await safe_ctx_info(ctx, f"serp.search q={q!r} engine={engine} (input={engine_in}) num={num} start={start} format={fmt}")
443
+ try:
444
+ data = await client.serp_search_advanced(req)
445
+ except Exception as e:
446
+ msg = str(e)
447
+ if "Invalid tbm parameter" in msg or "invalid tbm parameter" in msg:
448
+ return error_response(
449
+ tool="serp",
450
+ input={"action": "search", "params": p},
451
+ error_type="validation_error",
452
+ code="E4001",
453
+ message="Invalid tbm (search type) parameter for SERP.",
454
+ details={
455
+ "tbm": p.get("tbm"),
456
+ "engine": engine,
457
+ "engine_input": engine_in,
458
+ "hint": "The upstream SERP endpoint rejected 'tbm'. Try removing tbm/search_type, or use engine-specific modes (e.g. google_images/google_news/google_videos/google_shopping).",
459
+ "examples": {"engine": ["google", "google_images", "google_news", "google_videos", "google_shopping"], "tbm": ["images", "news", "videos", "shops", "local", "patents"]},
460
+ },
461
+ )
462
+ raise
463
+ if fmt in {"light_json", "light"}:
464
+ data = _to_light_json(data)
465
+
466
+ # Add diagnostics for empty/no-result responses (common UX issue)
467
+ organic = None
468
+ if isinstance(data, dict):
469
+ organic = data.get("organic")
470
+ meta = {
471
+ "engine": engine,
472
+ "q": q,
473
+ "num": num,
474
+ "start": start,
475
+ "format": fmt,
476
+ "has_organic": isinstance(organic, list) and len(organic) > 0,
477
+ "organic_count": len(organic) if isinstance(organic, list) else None,
478
+ }
479
+
480
+ if isinstance(data, dict):
487
+ return ok_response(
488
+ tool="serp",
489
+ input={"action": "search", "params": p},
490
+ output={"_meta": meta, **data},
491
+ )
492
+ return ok_response(tool="serp", input={"action": "search", "params": p}, output={"_meta": meta, "data": data})
493
+
494
+ if a == "batch_search":
495
+ reqs = p.get("requests")
496
+ if not isinstance(reqs, list) or not reqs:
497
+ return error_response(tool="serp", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing requests[]")
498
+ concurrency = int(p.get("concurrency", 5))
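+ # Bound batch concurrency to the 1-20 range so one call cannot spawn unbounded parallel SERP requests.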
499
+ concurrency = max(1, min(concurrency, 20))
500
+ fmt = str(p.get("format", "json")).strip().lower()
501
+ sdk_fmt = "json" if fmt in {"json", "light_json", "light"} else ("both" if fmt in {"both", "json+html", "2"} else "html")
502
+ from thordata.types import SerpRequest
503
+
504
+ sem = asyncio.Semaphore(concurrency)
505
+
506
+ async def _one(i: int, r: dict[str, Any]) -> dict[str, Any]:
507
+ q = str(r.get("q", r.get("query", "")))
508
+ if not q:
509
+ return {"index": i, "ok": False, "error": {"type": "validation_error", "message": "Missing q"}}
510
+ try:
511
+ engine_in = str(r.get("engine", "google")).strip() or "google"
512
+ num = int(r.get("num", 10))
513
+ start = int(r.get("start", 0))
514
+ tbm_raw = r.get("tbm")
515
+ tbm_lower = tbm_raw.strip().lower() if isinstance(tbm_raw, str) else None
516
+ engine = engine_in
517
+ if engine_in.lower() == "google" and tbm_lower in {"images", "news", "videos", "shops", "shopping"}:
518
+ engine_map = {
519
+ "images": "google_images",
520
+ "news": "google_news",
521
+ "videos": "google_videos",
522
+ "shops": "google_shopping",
523
+ "shopping": "google_shopping",
524
+ }
525
+ engine = engine_map[tbm_lower]
526
+ if isinstance(tbm_raw, str):
527
+ tbm_alias = {"image": "images", "video": "videos", "shop": "shops"}
528
+ tbm_norm = tbm_alias.get(tbm_lower)
529
+ if tbm_norm:
530
+ r = dict(r)
531
+ r["tbm"] = tbm_norm
532
+ extra_params = r.get("extra_params") if isinstance(r.get("extra_params"), dict) else {}
533
+ if r.get("ai_overview") is not None:
534
+ extra_params = dict(extra_params)
535
+ extra_params["ai_overview"] = r.get("ai_overview")
536
+ for k in ("safe", "nfpr", "filter", "tbs", "ibp", "lsig", "si", "uds"):
537
+ if r.get(k) is not None:
538
+ extra_params = dict(extra_params)
539
+ extra_params[k] = r.get(k)
540
+ async with sem:
541
+ req = SerpRequest(
542
+ query=q,
543
+ engine=engine,
544
+ num=num,
545
+ start=start,
546
+ device=r.get("device"),
547
+ output_format=sdk_fmt,
548
+ render_js=r.get("render_js"),
549
+ no_cache=r.get("no_cache"),
550
+ google_domain=r.get("google_domain"),
551
+ country=r.get("gl"),
552
+ language=r.get("hl"),
553
+ countries_filter=r.get("cr"),
554
+ languages_filter=r.get("lr"),
555
+ location=r.get("location"),
556
+ uule=r.get("uule"),
557
+ search_type=r.get("tbm"),
558
+ ludocid=r.get("ludocid"),
559
+ kgmid=r.get("kgmid"),
560
+ extra_params=extra_params,
561
+ )
562
+ try:
563
+ data = await client.serp_search_advanced(req)
564
+ except Exception as e:
565
+ msg = str(e)
566
+ if "Invalid tbm parameter" in msg or "invalid tbm parameter" in msg:
567
+ return {
568
+ "index": i,
569
+ "ok": False,
570
+ "q": q,
571
+ "error": {
572
+ "type": "validation_error",
573
+ "message": "Invalid tbm (search type) parameter for SERP.",
574
+ "details": {"tbm": r.get("tbm")},
575
+ },
576
+ }
577
+ raise
578
+ if fmt in {"light_json", "light"}:
579
+ data = _to_light_json(data)
580
+ return {"index": i, "ok": True, "q": q, "output": data}
581
+ except Exception as e:
582
+ return {"index": i, "ok": False, "q": q, "error": str(e)}
583
+
584
+ await safe_ctx_info(ctx, f"serp.batch_search count={len(reqs)} concurrency={concurrency} format={fmt}")
585
+ results = await asyncio.gather(*[_one(i, r if isinstance(r, dict) else {}) for i, r in enumerate(reqs)], return_exceptions=False)
586
+ return ok_response(tool="serp", input={"action": "batch_search", "params": p}, output={"results": results})
587
+
588
+ return error_response(
589
+ tool="serp",
590
+ input={"action": action, "params": p},
591
+ error_type="validation_error",
592
+ code="E4001",
593
+ message=f"Unknown action '{action}'. Supported actions: 'search', 'batch_search'",
594
+ )
595
+
596
+ if _allow("serp"):
597
+ mcp.tool(
598
+ name="serp",
599
+ description=(
600
+ "Low-level SERP scraper with full parameter control. "
601
+ 'Action in {search, batch_search}. Example: {"q": "Python", "num": 10, "engine": "google", "format": "light_json"}. '
602
+ "Prefer search_engine for minimal, LLM-friendly output."
603
+ ),
604
+ )(serp)
605
+
606
+ # -------------------------
607
+ # WEB UNLOCKER (compact)
608
+ # -------------------------
609
+ @mcp.tool(
610
+ name="unlocker",
611
+ description=(
612
+ "WEB UNLOCKER (Universal Scrape): action in {fetch, batch_fetch}. "
613
+ 'Use fetch for a single URL: {"url": "https://example.com", "output_format": "markdown", "js_render": true}. '
614
+ 'Use batch_fetch for multiple URLs: {"requests": [{"url": "..."}, ...], "concurrency": 5}.'
615
+ ),
616
+ )
617
+ @handle_mcp_errors
618
+ async def unlocker(
619
+ action: str,
620
+ *,
621
+ params: Any = None,
622
+ ctx: Optional[Context] = None,
623
+ ) -> dict[str, Any]:
624
+ """WEB UNLOCKER: action in {fetch, batch_fetch}.
625
+
626
+ Args:
627
+ action: Action to perform - "fetch" or "batch_fetch"
628
+ params: Parameters dictionary. For "fetch": {"url": "https://...", "js_render": true, "output_format": "html", ...}
629
+ For "batch_fetch": {"requests": [{"url": "https://..."}, ...], "concurrency": 5}
630
+
631
+ Examples:
632
+ unlocker(action="fetch", params={"url": "https://www.google.com", "js_render": true})
633
+ unlocker(action="batch_fetch", params={"requests": [{"url": "https://example.com"}], "concurrency": 5})
634
+ """
635
+ # Normalize params with enhanced error messages
636
+ try:
637
+ p = normalize_params(params, "unlocker", action)
638
+ except ValueError as e:
639
+ if "JSON" in str(e):
640
+ return create_params_error("unlocker", action, params, str(e))
641
+ else:
642
+ return create_params_error("unlocker", action, params, str(e))
643
+
644
+ a = (action or "").strip().lower()
645
+ if not a:
646
+ return error_response(
647
+ tool="unlocker",
648
+ input={"action": action, "params": p},
649
+ error_type="validation_error",
650
+ code="E4001",
651
+ message="action is required",
652
+ )
653
+
654
+ client = await ServerContext.get_client()
655
+
656
+ if a == "fetch":
657
+ url = str(p.get("url", "")).strip()
658
+ if not url:
659
+ return error_response(
660
+ tool="unlocker",
661
+ input={"action": action, "params": p},
662
+ error_type="validation_error",
663
+ code="E4001",
664
+ message="Missing url",
665
+ details={"params_example": {"url": "https://example.com", "output_format": "markdown", "js_render": True}},
666
+ )
667
+ fmt = str(p.get("output_format", "html") or "html").strip().lower()
668
+ js_render = bool(p.get("js_render", True))
669
+ wait_ms = p.get("wait_ms")
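+ # wait_ms is expressed in milliseconds; it is converted to whole seconds for the universal_scrape wait parameter.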
670
+ wait_seconds = int(wait_ms / 1000) if isinstance(wait_ms, (int, float)) else None
671
+ country = p.get("country")
672
+ # Validate block_resources (allowed: script, image, video)
673
+ block_resources_raw = p.get("block_resources")
674
+ block_resources = None
675
+ if block_resources_raw is not None:
676
+ if isinstance(block_resources_raw, str):
677
+ items = [x.strip() for x in block_resources_raw.split(",") if x.strip()]
678
+ elif isinstance(block_resources_raw, list):
679
+ items = [str(x).strip() for x in block_resources_raw]
680
+ else:
681
+ items = []
682
+ allowed = {"script", "image", "video"}
683
+ invalid = [x for x in items if x not in allowed]
684
+ if invalid:
685
+ return error_response(
686
+ tool="unlocker",
687
+ input={"action": action, "params": p},
688
+ error_type="validation_error",
689
+ code="E4001",
690
+ message="Invalid block_resources values.",
691
+ details={
692
+ "allowed": ["script", "image", "video"],
693
+ "invalid": invalid,
694
+ },
695
+ )
696
+ block_resources = ",".join(items) if items else None
697
+
698
+ # Validate clean_content (allowed: js, css)
699
+ clean_content_raw = p.get("clean_content")
700
+ clean_content = None
701
+ if clean_content_raw is not None:
702
+ if isinstance(clean_content_raw, str):
703
+ items = [x.strip() for x in clean_content_raw.split(",") if x.strip()]
704
+ elif isinstance(clean_content_raw, list):
705
+ items = [str(x).strip() for x in clean_content_raw]
706
+ else:
707
+ items = []
708
+ allowed = {"js", "css"}
709
+ invalid = [x for x in items if x not in allowed]
710
+ if invalid:
711
+ return error_response(
712
+ tool="unlocker",
713
+ input={"action": action, "params": p},
714
+ error_type="validation_error",
715
+ code="E4001",
716
+ message="Invalid clean_content values.",
717
+ details={
718
+ "allowed": ["js", "css"],
719
+ "invalid": invalid,
720
+ },
721
+ )
722
+ clean_content = ",".join(items) if items else None
723
+
724
+ # Default wait_for to .content if not provided
725
+ wait_for = p.get("wait_for") or ".content"
726
+ max_chars = int(p.get("max_chars", 20_000))
727
+ headers = p.get("headers") # Custom headers (list[{'name','value'}] or dict)
728
+ cookies = p.get("cookies") # Custom cookies (list[{'name','value'}])
729
+ extra_params = p.get("extra_params") if isinstance(p.get("extra_params"), dict) else {}
730
+
731
+ # Apply validated clean_content (allowed: js, css)
732
+ if clean_content:
733
+ extra_params["clean_content"] = clean_content
734
+
735
+ # Headers: accept list[{name,value}] or dict
736
+ if headers is not None:
737
+ if isinstance(headers, list):
738
+ bad = [h for h in headers if not (isinstance(h, dict) and "name" in h and "value" in h)]
739
+ if bad:
740
+ return error_response(
741
+ tool="unlocker",
742
+ input={"action": action, "params": p},
743
+ error_type="validation_error",
744
+ code="E4001",
745
+ message="Invalid headers format.",
746
+ details={"expected": "list[{name,value}] or dict", "example": [{"name": "User-Agent", "value": "..."}]},
747
+ )
748
+ extra_params["headers"] = headers
749
+ elif isinstance(headers, dict):
750
+ extra_params["headers"] = [{"name": k, "value": v} for k, v in headers.items()]
751
+ else:
752
+ return error_response(
753
+ tool="unlocker",
754
+ input={"action": action, "params": p},
755
+ error_type="validation_error",
756
+ code="E4001",
757
+ message="Invalid headers type.",
758
+ details={"expected": "list or dict"},
759
+ )
760
+
761
+ # Cookies: accept list[{name,value}] (panel format) or dict
762
+ if cookies is not None:
763
+ if isinstance(cookies, list):
764
+ bad = [c for c in cookies if not (isinstance(c, dict) and "name" in c and "value" in c)]
765
+ if bad:
766
+ return error_response(
767
+ tool="unlocker",
768
+ input={"action": action, "params": p},
769
+ error_type="validation_error",
770
+ code="E4001",
771
+ message="Invalid cookies format.",
772
+ details={"expected": "list[{name,value}]", "example": [{"name": "__csrf_token", "value": "..."}]},
773
+ )
774
+ extra_params["cookies"] = cookies
775
+ elif isinstance(cookies, dict):
776
+ extra_params["cookies"] = [{"name": k, "value": v} for k, v in cookies.items()]
777
+ else:
778
+ return error_response(
779
+ tool="unlocker",
780
+ input={"action": action, "params": p},
781
+ error_type="validation_error",
782
+ code="E4001",
783
+ message="Invalid cookies type.",
784
+ details={"expected": "list or dict"},
785
+ )
786
+
787
+ fetch_format = "html" if fmt in {"markdown", "md"} else fmt
788
+
789
+ # If the user asked for Markdown, we still fetch HTML from Unlocker and convert locally.
790
+ # Default: strip JS/CSS in the same request (avoid double network calls).
791
+ raw_markdown = bool(p.get("raw_markdown", False)) if fmt in {"markdown", "md"} else False
792
+ if fmt in {"markdown", "md"} and not raw_markdown:
793
+ cc = extra_params.get("clean_content")
794
+ if isinstance(cc, str) and cc.strip():
795
+ parts = [x.strip() for x in cc.split(",") if x.strip()]
796
+ else:
797
+ parts = []
798
+ for x in ("js", "css"):
799
+ if x not in parts:
800
+ parts.append(x)
801
+ extra_params["clean_content"] = ",".join(parts)
802
+
803
+ await safe_ctx_info(ctx, f"unlocker.fetch url={url!r} format={fmt} js_render={js_render} raw_markdown={raw_markdown}")
804
+ with PerformanceTimer(tool="unlocker.fetch", url=url):
805
+ try:
806
+ data = await client.universal_scrape(
807
+ url=url,
808
+ js_render=js_render,
809
+ output_format=fetch_format,
810
+ country=country,
811
+ block_resources=block_resources,
812
+ wait=wait_seconds,
813
+ wait_for=wait_for,
814
+ **extra_params,
815
+ )
816
+ except Exception as e:
817
+ msg = str(e)
818
+ # Some upstream failures return HTML (e.g. gateway errors) which can trigger JSON decode errors in the SDK.
819
+ if "Attempt to decode JSON" in msg or "unexpected mimetype: text/html" in msg:
820
+ return error_response(
821
+ tool="unlocker",
822
+ input={"action": action, "params": p},
823
+ error_type="upstream_internal_error",
824
+ code="E2106",
825
+ message="Universal API returned a non-JSON error page (likely gateway/upstream failure).",
826
+ details={"url": url, "output_format": fetch_format, "js_render": js_render, "error": msg},
827
+ )
828
+ raise
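+ # PNG screenshots come back as raw bytes; base64-encode them so the MCP response stays JSON-serializable.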
829
+ if fetch_format == "png":
830
+ import base64
831
+
832
+ if isinstance(data, (bytes, bytearray)):
833
+ png_base64 = base64.b64encode(data).decode("utf-8")
834
+ size = len(data)
835
+ else:
836
+ png_base64 = str(data)
837
+ size = None
838
+ return ok_response(tool="unlocker", input={"action": "fetch", "params": p}, output={"png_base64": png_base64, "size": size, "format": "png"})
839
+ html = str(data) if not isinstance(data, str) else data
840
+ if fmt in {"markdown", "md"}:
841
+ # raw_markdown and the clean_content defaults were already applied to extra_params before the
+ # universal_scrape call above, so here we only convert the fetched HTML to Markdown and truncate it.
856
+ md = html_to_markdown_clean(html)
857
+ md = truncate_content(md, max_length=max_chars)
858
+ return ok_response(
859
+ tool="unlocker",
860
+ input={"action": "fetch", "params": p},
861
+ output={"markdown": md, "_meta": {"raw_markdown": raw_markdown}},
862
+ )
863
+ return ok_response(tool="unlocker", input={"action": "fetch", "params": p}, output={"html": html})
864
+
865
+ if a == "batch_fetch":
866
+ reqs = p.get("requests")
867
+ if not isinstance(reqs, list) or not reqs:
868
+ return error_response(
869
+ tool="unlocker",
870
+ input={"action": action, "params": p},
871
+ error_type="validation_error",
872
+ code="E4001",
873
+ message="Missing requests[] (array of {url,...} objects)",
874
+ )
875
+ concurrency = int(p.get("concurrency", 5))
876
+ concurrency = max(1, min(concurrency, 20))
877
+ sem = asyncio.Semaphore(concurrency)
878
+
879
+ async def _one(i: int, r: dict[str, Any]) -> dict[str, Any]:
880
+ url = str(r.get("url", ""))
881
+ if not url:
882
+ return {"index": i, "ok": False, "error": {"type": "validation_error", "message": "Missing url"}}
883
+ fmt = str(r.get("output_format", "html")).strip().lower()
884
+ fetch_format = "html" if fmt in {"markdown", "md"} else fmt
885
+ js_render = bool(r.get("js_render", True))
886
+ wait_ms = r.get("wait_ms")
887
+ wait_seconds = int(wait_ms / 1000) if isinstance(wait_ms, (int, float)) else None
888
+ # Per-request params normalization to match unlocker.fetch
889
+ country = r.get("country")
890
+
891
+ # Validate block_resources (allowed: script, image, video)
892
+ block_resources_raw = r.get("block_resources")
893
+ block_resources = None
894
+ if block_resources_raw is not None:
895
+ if isinstance(block_resources_raw, str):
896
+ items = [x.strip() for x in block_resources_raw.split(",") if x.strip()]
897
+ elif isinstance(block_resources_raw, list):
898
+ items = [str(x).strip() for x in block_resources_raw]
899
+ else:
900
+ items = []
901
+ allowed = {"script", "image", "video"}
902
+ invalid = [x for x in items if x not in allowed]
903
+ if invalid:
904
+ return {
905
+ "index": i,
906
+ "ok": False,
907
+ "url": url,
908
+ "error": {
909
+ "type": "validation_error",
910
+ "message": "Invalid block_resources values.",
911
+ "details": {"allowed": ["script", "image", "video"], "invalid": invalid},
912
+ },
913
+ }
914
+ block_resources = ",".join(items) if items else None
915
+
916
+ # Validate clean_content (allowed: js, css)
917
+ clean_content_raw = r.get("clean_content")
918
+ clean_content = None
919
+ if clean_content_raw is not None:
920
+ if isinstance(clean_content_raw, str):
921
+ items = [x.strip() for x in clean_content_raw.split(",") if x.strip()]
922
+ elif isinstance(clean_content_raw, list):
923
+ items = [str(x).strip() for x in clean_content_raw]
924
+ else:
925
+ items = []
926
+ allowed = {"js", "css"}
927
+ invalid = [x for x in items if x not in allowed]
928
+ if invalid:
929
+ return {
930
+ "index": i,
931
+ "ok": False,
932
+ "url": url,
933
+ "error": {
934
+ "type": "validation_error",
935
+ "message": "Invalid clean_content values.",
936
+ "details": {"allowed": ["js", "css"], "invalid": invalid},
937
+ },
938
+ }
939
+ clean_content = ",".join(items) if items else None
940
+
941
+ # Default wait_for to .content if not provided
942
+ wait_for = r.get("wait_for") or ".content"
943
+
944
+ headers = r.get("headers")
945
+ cookies = r.get("cookies")
946
+ extra_params = r.get("extra_params") if isinstance(r.get("extra_params"), dict) else {}
947
+
948
+ # Apply validated clean_content
949
+ if clean_content:
950
+ extra_params["clean_content"] = clean_content
951
+
952
+ # Headers: accept list[{name,value}] or dict
953
+ if headers is not None:
954
+ if isinstance(headers, list):
955
+ bad = [h for h in headers if not (isinstance(h, dict) and "name" in h and "value" in h)]
956
+ if bad:
957
+ return {
958
+ "index": i,
959
+ "ok": False,
960
+ "url": url,
961
+ "error": {
962
+ "type": "validation_error",
963
+ "message": "Invalid headers format.",
964
+ "details": {"expected": "list[{name,value}] or dict", "example": [{"name": "User-Agent", "value": "..."}]},
965
+ },
966
+ }
967
+ extra_params["headers"] = headers
968
+ elif isinstance(headers, dict):
969
+ extra_params["headers"] = [{"name": k, "value": v} for k, v in headers.items()]
970
+ else:
971
+ return {
972
+ "index": i,
973
+ "ok": False,
974
+ "url": url,
975
+ "error": {"type": "validation_error", "message": "Invalid headers type.", "details": {"expected": "list or dict"}},
976
+ }
977
+
978
+ # Cookies: accept list[{name,value}] or dict
979
+ if cookies is not None:
980
+ if isinstance(cookies, list):
981
+ bad = [c for c in cookies if not (isinstance(c, dict) and "name" in c and "value" in c)]
982
+ if bad:
983
+ return {
984
+ "index": i,
985
+ "ok": False,
986
+ "url": url,
987
+ "error": {
988
+ "type": "validation_error",
989
+ "message": "Invalid cookies format.",
990
+ "details": {"expected": "list[{name,value}]", "example": [{"name": "__csrf_token", "value": "..."}]},
991
+ },
992
+ }
993
+ extra_params["cookies"] = cookies
994
+ elif isinstance(cookies, dict):
995
+ extra_params["cookies"] = [{"name": k, "value": v} for k, v in cookies.items()]
996
+ else:
997
+ return {
998
+ "index": i,
999
+ "ok": False,
1000
+ "url": url,
1001
+ "error": {"type": "validation_error", "message": "Invalid cookies type.", "details": {"expected": "list or dict"}},
1002
+ }
1003
+
1004
+ # If the user asked for Markdown, we still fetch HTML from Unlocker and convert locally.
1005
+ raw_markdown = bool(r.get("raw_markdown", False)) if fmt in {"markdown", "md"} else False
1006
+ if fmt in {"markdown", "md"} and not raw_markdown:
1007
+ cc = extra_params.get("clean_content")
1008
+ if isinstance(cc, str) and cc.strip():
1009
+ parts = [x.strip() for x in cc.split(",") if x.strip()]
1010
+ else:
1011
+ parts = []
1012
+ for x in ("js", "css"):
1013
+ if x not in parts:
1014
+ parts.append(x)
1015
+ extra_params["clean_content"] = ",".join(parts)
1016
+ async with sem:
1017
+ with PerformanceTimer(tool="unlocker.batch_fetch", url=url):
1018
+ try:
1019
+ data = await client.universal_scrape(
1020
+ url=url,
1021
+ js_render=js_render,
1022
+ output_format=fetch_format,
1023
+ country=country,
1024
+ block_resources=block_resources,
1025
+ wait=wait_seconds,
1026
+ wait_for=wait_for,
1027
+ **extra_params,
1028
+ )
1029
+ except Exception as e:
1030
+ msg = str(e)
1031
+ if "Attempt to decode JSON" in msg or "unexpected mimetype: text/html" in msg:
1032
+ return {
1033
+ "index": i,
1034
+ "ok": False,
1035
+ "url": url,
1036
+ "error": {
1037
+ "type": "upstream_internal_error",
1038
+ "code": "E2106",
1039
+ "message": "Universal API returned a non-JSON error page (likely gateway/upstream failure).",
1040
+ "details": {"url": url, "output_format": fetch_format, "js_render": js_render, "error": msg},
1041
+ },
1042
+ }
1043
+ # Ensure batch_fetch never fails the whole batch on a single upstream error.
1044
+ return {
1045
+ "index": i,
1046
+ "ok": False,
1047
+ "url": url,
1048
+ "error": {
1049
+ "type": "upstream_internal_error",
1050
+ "code": "E2106",
1051
+ "message": "Universal API request failed.",
1052
+ "details": {"url": url, "output_format": fetch_format, "js_render": js_render, "error": msg},
1053
+ },
1054
+ }
1055
+ if fetch_format == "png":
1056
+ import base64
1057
+
1058
+ if isinstance(data, (bytes, bytearray)):
1059
+ png_base64 = base64.b64encode(data).decode("utf-8")
1060
+ size = len(data)
1061
+ else:
1062
+ png_base64 = str(data)
1063
+ size = None
1064
+ return {"index": i, "ok": True, "url": url, "output": {"png_base64": png_base64, "size": size, "format": "png"}}
1065
+ html = str(data) if not isinstance(data, str) else data
1066
+ if fmt in {"markdown", "md"}:
1067
+ md = html_to_markdown_clean(html)
1068
+ md = truncate_content(md, max_length=int(r.get("max_chars", 20_000)))
1069
+ return {"index": i, "ok": True, "url": url, "output": {"markdown": md}}
1070
+ return {"index": i, "ok": True, "url": url, "output": {"html": html}}
1071
+
1072
+ await safe_ctx_info(ctx, f"unlocker.batch_fetch count={len(reqs)} concurrency={concurrency}")
1073
+ results = await asyncio.gather(*[_one(i, r if isinstance(r, dict) else {}) for i, r in enumerate(reqs)])
1074
+ return ok_response(tool="unlocker", input={"action": "batch_fetch", "params": p}, output={"results": results})
1075
+
1076
+ return error_response(
1077
+ tool="unlocker",
1078
+ input={"action": action, "params": p},
1079
+ error_type="validation_error",
1080
+ code="E4001",
1081
+ message=f"Unknown action '{action}'. Supported actions: 'fetch', 'batch_fetch'",
1082
+ )
1083
+
1084
+ # -------------------------
1085
+ # WEB SCRAPER (compact)
1086
+ # -------------------------
1087
+ async def web_scraper(
1088
+ action: str,
1089
+ *,
1090
+ params: Any = None,
1091
+ ctx: Optional[Context] = None,
1092
+ ) -> dict[str, Any]:
1093
+ """WEB SCRAPER: action covers catalog/groups/run/batch_run/status/wait/result/list_tasks and batch helpers.
1094
+
1095
+ Args:
1096
+ action: Action to perform - "catalog", "groups", "run", "batch_run", "status", "wait", "result", "list_tasks", etc.
1097
+ params: Parameters dictionary. Varies by action:
1098
+ - "catalog": {"group": "...", "keyword": "...", "limit": 100, "offset": 0}
1099
+ - "run": {"tool": "tool_key", "params": {...}, "wait": true, "file_type": "json"}
1100
+ - "status": {"task_id": "..."}
1101
+ - etc.
1102
+
1103
+ Examples:
1104
+ web_scraper(action="catalog", params={"limit": 20})
1105
+ web_scraper(action="run", params={"tool": "thordata.tools.ecommerce.Amazon.ProductByUrl", "params": {"url": "https://amazon.com/..."}})
1106
+ """
1107
+ # Normalize params with enhanced error messages
1108
+ try:
1109
+ p = normalize_params(params, "web_scraper", action)
1110
+ except ValueError as e:
1111
+ if "JSON" in str(e):
1112
+ return create_params_error("web_scraper", action, params, str(e))
1113
+ else:
1114
+ return create_params_error("web_scraper", action, params, str(e))
1115
+
1116
+ a = (action or "").strip().lower()
1117
+ if not a:
1118
+ return error_response(
1119
+ tool="web_scraper",
1120
+ input={"action": action, "params": p},
1121
+ error_type="validation_error",
1122
+ code="E4001",
1123
+ message="action is required",
1124
+ )
1125
+
1126
+ client = await ServerContext.get_client()
1127
+
1128
+ if a == "groups":
1129
+ # Groups are derived from the catalog metadata via the _catalog helper (reused from the full product module).
1131
+ page, meta = _catalog(group=None, keyword=None, limit=1, offset=0)
1132
+ return ok_response(tool="web_scraper", input={"action": "groups", "params": p}, output={"groups": meta.get("groups"), "total": meta.get("total")})
1133
+
1134
+ if a in {"spiders", "spider_ids", "ids"}:
1135
+ # Convenience: return the full list of spider_id mappings without huge field schemas.
1136
+ limit = max(1, min(int(p.get("limit", 500)), 2000))
1137
+ offset = max(0, int(p.get("offset", 0)))
1138
+ page, meta = _catalog(group=p.get("group"), keyword=p.get("keyword"), limit=limit, offset=offset)
1139
+ items = []
1140
+ for t in page:
1141
+ s = tool_schema(t)
1142
+ items.append(
1143
+ {
1144
+ "tool_key": s.get("tool_key"),
1145
+ "spider_id": s.get("spider_id"),
1146
+ "spider_name": s.get("spider_name"),
1147
+ "group": s.get("group"),
1148
+ }
1149
+ )
1150
+ return ok_response(tool="web_scraper", input={"action": a, "params": p}, output={"items": items, "meta": meta})
1151
+
1152
+ if a == "catalog":
1153
+ # Tool discovery is configurable to reduce LLM tool selection noise.
1154
+ # - mode=curated: only allow groups from THORDATA_TASKS_GROUPS
1155
+ # - mode=all: list everything
1156
+ cfg = get_settings()
1157
+ mode = str(getattr(cfg, "THORDATA_TASKS_LIST_MODE", "curated") or "curated").strip().lower()
1158
+ groups_allow = [g.strip().lower() for g in (getattr(cfg, "THORDATA_TASKS_GROUPS", "") or "").split(",") if g.strip()]
1159
+
1160
+ # Respect explicit group filter provided by user
1161
+ group_in = p.get("group")
1162
+ group = str(group_in).strip() if group_in is not None else None
1163
+ group = group or None
1164
+
1165
+ # If curated, and no group provided, default to first allowed group to keep list small.
1166
+ # Users can still browse other groups by passing params.group.
1167
+ if mode == "curated" and not group and groups_allow:
1168
+ group = groups_allow[0]
1169
+
1170
+ # If curated + group provided but not allowed, return helpful error
1171
+ if mode == "curated" and group and groups_allow and group.lower() not in groups_allow:
1172
+ return error_response(
1173
+ tool="web_scraper",
1174
+ input={"action": "catalog", "params": p},
1175
+ error_type="not_allowed",
1176
+ code="E4010",
1177
+ message="Group not allowed in curated mode.",
1178
+ details={
1179
+ "mode": mode,
1180
+ "allowed_groups": groups_allow,
1181
+ "requested_group": group,
1182
+ "tip": "Set THORDATA_TASKS_LIST_MODE=all to browse all groups, or update THORDATA_TASKS_GROUPS.",
1183
+ },
1184
+ )
1185
+
1186
+ limit_default = int(getattr(cfg, "THORDATA_TASKS_LIST_DEFAULT_LIMIT", 60) or 60)
1187
+ limit = max(1, min(int(p.get("limit", limit_default)), 500))
1188
+ offset = max(0, int(p.get("offset", 0)))
1189
+ page, meta = _catalog(group=group, keyword=p.get("keyword"), limit=limit, offset=offset)
1190
+
1191
+ meta = dict(meta)
1192
+ meta.update(
1193
+ {
1194
+ "mode": mode,
1195
+ "allowed_groups": groups_allow,
1196
+ "effective_group": group,
1197
+ "how_to_show_all": "Set THORDATA_TASKS_LIST_MODE=all",
1198
+ }
1199
+ )
1200
+
1201
+ return ok_response(
1202
+ tool="web_scraper",
1203
+ input={"action": "catalog", "params": {**p, "group": group} if group else p},
1204
+ output={"tools": [tool_schema(t) for t in page], "meta": meta},
1205
+ )
1206
+
1207
+ if a in {"example", "template"}:
1208
+ tool = str(p.get("tool", "")) or str(p.get("tool_key", ""))
1209
+ if not tool:
1210
+ return error_response(tool="web_scraper", input={"action": a, "params": p}, error_type="validation_error", code="E4001", message="Missing tool (tool_key)")
1211
+ # Ensure tool exists and produce its schema + minimal params template.
1212
+ from .product import _ensure_tools as _ensure # local import to avoid cycles
1213
+ _, tools_map = _ensure()
1214
+ t = tools_map.get(tool)
1215
+ if not t:
1216
+ return error_response(tool="web_scraper", input={"action": a, "params": p}, error_type="invalid_tool", code="E4003", message="Unknown tool key. Use web_scraper.catalog to discover valid keys.")
1217
+ schema = tool_schema(t)
1218
+ params_template = _build_params_template(schema)
1219
+ spider_id = schema.get("spider_id")
1220
+ spider_name = schema.get("spider_name")
1221
+
1222
+ # LLM-oriented notes: explain the two main calling styles.
1223
+ notes: list[str] = [
1224
+ "Step 1: Use web_scraper.catalog to discover tools (filter by keyword/group).",
1225
+ "Step 2: Use web_scraper.example to get this params_template, then fill placeholders like <field> with real values.",
1226
+ "Step 3: Call web_scraper.run with {'tool': tool_key, 'params': {...}, 'wait': true} for a single task, or web_scraper.batch_run for many.",
1227
+ ]
1228
+ if spider_id and spider_name:
1229
+ # Many dashboard examples in documentation use builder/video_builder + spider_id.
1230
+ notes.append(
1231
+ "Alternative: For full Dashboard parity, you can call web_scraper.raw_run with "
1232
+ "{'builder': 'builder' or 'video_builder', 'spider_name': spider_name, "
1233
+ "'spider_id': spider_id, 'spider_parameters': [...]}. Use this to mirror curl examples "
1234
+ "from the official web scraper tasks documentation."
1235
+ )
1236
+
1237
+ return ok_response(
1238
+ tool="web_scraper",
1239
+ input={"action": a, "params": {"tool": tool}},
1240
+ output={
1241
+ "tool": tool,
1242
+ "spider_id": spider_id,
1243
+ "spider_name": spider_name,
1244
+ "group": schema.get("group"),
1245
+ "params_template": params_template,
1246
+ "notes": notes,
1247
+ },
1248
+ )
1249
+
1250
+ if a in {"raw_run", "raw_batch_run"}:
1251
+ # Ultimate fallback for 100% Dashboard parity: run by spider_id/spider_name directly,
1252
+ # even if SDK doesn't provide a ToolRequest class for it.
1253
+ client = await ServerContext.get_client()
1254
+
1255
+ async def _one(raw: dict[str, Any]) -> dict[str, Any]:
1256
+ spider_name = str(raw.get("spider_name", "") or raw.get("name", ""))
1257
+ spider_id = str(raw.get("spider_id", "") or raw.get("id", ""))
1258
+ if not spider_name or not spider_id:
1259
+ return {"ok": False, "error": {"type": "validation_error", "message": "Missing spider_name or spider_id"}}
1260
+
1261
+ builder = str(raw.get("builder", "builder")).strip().lower()
1262
+ wait = bool(raw.get("wait", True))
1263
+ max_wait_seconds = int(raw.get("max_wait_seconds", 300))
1264
+ file_type = str(raw.get("file_type", "json"))
1265
+ include_errors = bool(raw.get("include_errors", True))
1266
+ file_name = raw.get("file_name")
1267
+
1268
+ # spider_parameters can be dict/list or JSON string
1269
+ sp = raw.get("spider_parameters", raw.get("parameters"))
1270
+ if isinstance(sp, str):
1271
+ try:
1272
+ sp = json.loads(sp) if sp else {}
1273
+ except Exception:
1274
+ sp = {"raw": sp}
1275
+ if isinstance(sp, dict):
1276
+ sp_list: list[dict[str, Any]] = [sp]
1277
+ elif isinstance(sp, list):
1278
+ sp_list = [x for x in sp if isinstance(x, dict)]
1279
+ if not sp_list:
1280
+ sp_list = [{}]
1281
+ else:
1282
+ sp_list = [{}]
1283
+
1284
+ # spider_universal: for builder universal params or video common_settings
1285
+ su = raw.get("spider_universal") or raw.get("universal_params") or raw.get("common_settings")
1286
+ if isinstance(su, str):
1287
+ try:
1288
+ su = json.loads(su) if su else None
1289
+ except Exception:
1290
+ su = None
1291
+ su_dict = su if isinstance(su, dict) else None
1292
+
1293
+ # Lazy import types from SDK
1294
+ from thordata.types.task import ScraperTaskConfig, VideoTaskConfig
1295
+ from thordata.types.common import CommonSettings
1296
+
1297
+ # Generate file_name if missing (mirror SDK behavior)
1298
+ if not file_name:
1299
+ import uuid
1300
+ short_id = uuid.uuid4().hex[:8]
1301
+ file_name = f"{spider_id}_{short_id}"
1302
+
1303
+ await safe_ctx_info(ctx, f"web_scraper.{a} spider_id={spider_id} builder={builder} wait={wait}")
1304
+
1305
+ # Create task via correct builder endpoint
1306
+ if builder in {"video_builder", "video"}:
1307
+ # Defensive filtering: CommonSettings in the SDK may not include every
1308
+ # key shown in external documentation (e.g. some newer fields like
1309
+ # "kilohertz" / "bitrate" may not yet exist in this SDK version).
1310
+ # Passing unknown keys would raise "unexpected keyword argument" errors,
1311
+ # so we restrict to the dataclass' declared fields.
1312
+ cs_input: dict[str, Any] = {}
1313
+ if su_dict:
1314
+ allowed_keys = getattr(CommonSettings, "__dataclass_fields__", {}).keys()
1315
+ cs_input = {k: v for k, v in su_dict.items() if k in allowed_keys}
1316
+ cs = CommonSettings(**cs_input)
1317
+ config = VideoTaskConfig(
1318
+ file_name=str(file_name),
1319
+ spider_id=spider_id,
1320
+ spider_name=spider_name,
1321
+ parameters=sp_list if len(sp_list) > 1 else sp_list[0],
1322
+ common_settings=cs,
1323
+ include_errors=include_errors,
1324
+ )
1325
+ task_id = await client.create_video_task_advanced(config)
1326
+ else:
1327
+ config = ScraperTaskConfig(
1328
+ file_name=str(file_name),
1329
+ spider_id=spider_id,
1330
+ spider_name=spider_name,
1331
+ parameters=sp_list if len(sp_list) > 1 else sp_list[0],
1332
+ universal_params=su_dict,
1333
+ include_errors=include_errors,
1334
+ )
1335
+ task_id = await client.create_scraper_task_advanced(config)
1336
+
1337
+ result: dict[str, Any] = {"task_id": task_id, "spider_id": spider_id, "spider_name": spider_name}
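+ # When wait=True, poll until the task reaches a terminal state and attach a download URL on success.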
1338
+ if wait:
1339
+ status = await client.wait_for_task(task_id, max_wait=max_wait_seconds)
1340
+ status_s = str(status)
1341
+ result["status"] = status_s
1342
+ if status_s.strip().lower() in {"ready", "success", "finished", "succeeded", "task succeeded", "task_succeeded"}:
1343
+ dl = await client.get_task_result(task_id, file_type=file_type)
1344
+ from thordata_mcp.utils import enrich_download_url
1345
+ result["download_url"] = enrich_download_url(dl, task_id=task_id, file_type=file_type)
1346
+ return {"ok": True, "output": result}
1347
+
1348
+ if a == "raw_run":
1349
+ out = await _one(p)
1350
+ if out.get("ok") is True:
1351
+ return ok_response(tool="web_scraper", input={"action": a, "params": p}, output=out["output"])
1352
+ return error_response(tool="web_scraper", input={"action": a, "params": p}, error_type="validation_error", code="E4001", message="raw_run failed", details=out.get("error"))
1353
+
1354
+ reqs = p.get("requests")
1355
+ if not isinstance(reqs, list) or not reqs:
1356
+ return error_response(tool="web_scraper", input={"action": a, "params": p}, error_type="validation_error", code="E4001", message="Missing requests[]")
1357
+ concurrency = max(1, min(int(p.get("concurrency", 5)), 20))
1358
+ sem = asyncio.Semaphore(concurrency)
1359
+
1360
+ async def _wrap(i: int, r: Any) -> dict[str, Any]:
1361
+ raw = r if isinstance(r, dict) else {}
1362
+ async with sem:
1363
+ one = await _one(raw)
1364
+ return {"index": i, **one}
1365
+
1366
+ results = await asyncio.gather(*[_wrap(i, r) for i, r in enumerate(reqs)], return_exceptions=False)
1367
+ return ok_response(tool="web_scraper", input={"action": a, "params": {"count": len(reqs), "concurrency": concurrency}}, output={"results": results})
1368
+
1369
+ if a == "run":
1370
+ tool = str(p.get("tool", ""))
1371
+ if not tool:
1372
+ return error_response(
1373
+ tool="web_scraper",
1374
+ input={"action": action, "params": p},
1375
+ error_type="validation_error",
1376
+ code="E4001",
1377
+ message="Missing tool",
1378
+ details={
1379
+ "missing_fields": ["tool"],
1380
+ "next_step": "Call web_scraper(action='catalog', params={'keyword': '...'}) to discover tool_key",
1381
+ },
1382
+ )
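+ # Tool parameters may be supplied either as a dict ("params") or as a JSON string ("param_json").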
1383
+ params_dict = p.get("params") if isinstance(p.get("params"), dict) else None
1384
+ param_json = p.get("param_json")
1385
+ if params_dict is None:
1386
+ if isinstance(param_json, str) and param_json:
1387
+ try:
1388
+ params_dict = json.loads(param_json)
1389
+ except json.JSONDecodeError as e:
1390
+ return error_response(
1391
+ tool="web_scraper",
1392
+ input={"action": action, "params": p},
1393
+ error_type="json_error",
1394
+ code="E4002",
1395
+ message=str(e),
1396
+ )
1397
+ else:
1398
+ params_dict = {}
1399
+ wait = bool(p.get("wait", True))
1400
+
1401
+ # Validate required fields based on tool schema
1402
+ from .product import _ensure_tools as _ensure
1403
+ _, tools_map = _ensure()
1404
+ t = tools_map.get(tool)
1405
+ if not t:
1406
+ return error_response(
1407
+ tool="web_scraper",
1408
+ input={"action": action, "params": p},
1409
+ error_type="invalid_tool",
1410
+ code="E4003",
1411
+ message="Unknown tool key. Use web_scraper.catalog to discover valid keys.",
1412
+ )
1413
+ schema = tool_schema(t)
1414
+ fields = schema.get("fields", {})
1415
+ missing_fields = []
1416
+ params_template = {}
1417
+ for key, meta in fields.items():
1418
+ required = bool(meta.get("required"))
1419
+ if required and (params_dict is None or key not in params_dict or params_dict.get(key) in (None, "", [])):
1420
+ missing_fields.append(key)
1421
+ # Build minimal template for missing fields
1422
+ if required and key not in (params_dict or {}):
1423
+ default = meta.get("default")
1424
+ typ = str(meta.get("type", "")).lower()
1425
+ if "dict" in typ:
1426
+ params_template[key] = {}
1427
+ elif "list" in typ:
1428
+ params_template[key] = []
1429
+ elif default is not None:
1430
+ params_template[key] = default
1431
+ else:
1432
+ params_template[key] = f"<{key}>"
1433
+
1434
+ if missing_fields:
1435
+ return error_response(
1436
+ tool="web_scraper",
1437
+ input={"action": action, "params": p},
1438
+ error_type="validation_error",
1439
+ code="E4001",
1440
+ message="Missing required fields for tool params",
1441
+ details={
1442
+ "tool": tool,
1443
+ "missing_fields": missing_fields,
1444
+ "params_template": params_template,
1445
+ "tip": f"Run web_scraper(action='example', params={{'tool': '{tool}'}}) to see full template",
1446
+ },
1447
+ )
1448
+ wait = bool(p.get("wait", True))
1449
+
1450
+ # Execution-layer allowlist (optional safety)
1451
+ allowlist = getattr(settings, "THORDATA_TASKS_ALLOWLIST", "")
1452
+ if allowlist and allowlist.strip():
1453
+ allowed_prefixes = [prefix.strip().lower() for prefix in allowlist.split(",") if prefix.strip()]
1454
+ allowed_exact = [exact.strip().lower() for exact in allowlist.split(",") if exact.strip()]
1455
+ tool_lower = tool.lower()
1456
+ if not any(tool_lower.startswith(prefix) for prefix in allowed_prefixes) and tool_lower not in allowed_exact:
1457
+ return error_response(
1458
+ tool="web_scraper",
1459
+ input={"action": action, "params": p},
1460
+ error_type="not_allowed",
1461
+ code="E4011",
1462
+ message="Tool not allowed by allowlist.",
1463
+ details={
1464
+ "tool": tool,
1465
+ "allowlist": allowlist,
1466
+ "tip": "Update THORDATA_TASKS_ALLOWLIST or set THORDATA_TASKS_LIST_MODE=all to bypass.",
1467
+ },
1468
+ )
1469
+ max_wait_seconds = int(p.get("max_wait_seconds", 300))
1471
+ file_type = str(p.get("file_type", "json"))
1472
+ return await _run_web_scraper_tool(tool=tool, params=params_dict, wait=wait, max_wait_seconds=max_wait_seconds, file_type=file_type, ctx=ctx)
1473
+
1474
+ if a == "batch_run":
1475
+ reqs = p.get("requests")
1476
+ if not isinstance(reqs, list) or not reqs:
1477
+ return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing requests[]")
1478
+ concurrency = max(1, min(int(p.get("concurrency", 5)), 20))
1479
+ wait = bool(p.get("wait", True))
1480
+ max_wait_seconds = int(p.get("max_wait_seconds", 300))
1481
+ file_type = str(p.get("file_type", "json"))
1482
+ sem = asyncio.Semaphore(concurrency)
1483
+
1484
+ async def _one(i: int, r: dict[str, Any]) -> dict[str, Any]:
1485
+ tool = str(r.get("tool", ""))
1486
+ if not tool:
1487
+ return {"index": i, "ok": False, "error": {"type": "validation_error", "message": "Missing tool"}}
1488
+ params_dict = r.get("params") if isinstance(r.get("params"), dict) else {}
1489
+ async with sem:
1490
+ out = await _run_web_scraper_tool(tool=tool, params=params_dict, wait=wait, max_wait_seconds=max_wait_seconds, file_type=file_type, ctx=ctx)
1491
+ # compact per-item
1492
+ if out.get("ok") is True and isinstance(out.get("output"), dict):
1493
+ o = out["output"]
1494
+ out["output"] = {k: o.get(k) for k in ("task_id", "spider_id", "spider_name", "status", "download_url") if k in o}
1495
+ return {"index": i, **out}
1496
+
1497
+ await safe_ctx_info(ctx, f"web_scraper.batch_run count={len(reqs)} concurrency={concurrency}")
1498
+ results = await asyncio.gather(*[_one(i, r if isinstance(r, dict) else {}) for i, r in enumerate(reqs)])
1499
+ return ok_response(tool="web_scraper", input={"action": "batch_run", "params": p}, output={"results": results})
1500
+
1501
+ if a == "list_tasks":
1502
+ page = max(1, int(p.get("page", 1)))
1503
+ size = max(1, min(int(p.get("size", 20)), 200))
1504
+ data = await client.list_tasks(page=page, size=size)
1505
+ return ok_response(tool="web_scraper", input={"action": "list_tasks", "params": p}, output=data)
1506
+
1507
+ if a == "status":
1508
+ tid = str(p.get("task_id", ""))
1509
+ if not tid:
1510
+ return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_id")
1511
+ s = await client.get_task_status(tid)
1512
+ return ok_response(tool="web_scraper", input={"action": "status", "params": p}, output={"task_id": tid, "status": str(s)})
1513
+
1514
+ if a == "status_batch":
1515
+ tids = p.get("task_ids")
1516
+ if not isinstance(tids, list) or not tids:
1517
+ return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_ids[]")
1518
+ results = []
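+ # Check at most 200 task ids per call; each id is polled sequentially and failures are reported per item.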
1519
+ for tid in [str(x) for x in tids[:200]]:
1520
+ try:
1521
+ s = await client.get_task_status(tid)
1522
+ results.append({"task_id": tid, "ok": True, "status": str(s)})
1523
+ except Exception as e:
1524
+ results.append({"task_id": tid, "ok": False, "error": {"message": str(e)}})
1525
+ return ok_response(tool="web_scraper", input={"action": "status_batch", "params": {"count": len(tids)}}, output={"results": results})
1526
+
1527
+ if a == "wait":
1528
+ tid = str(p.get("task_id", ""))
1529
+ if not tid:
1530
+ return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_id")
1531
+ poll = float(p.get("poll_interval_seconds", 5.0))
1532
+ max_wait = float(p.get("max_wait_seconds", 600.0))
1533
+ s = await client.wait_for_task(tid, poll_interval=poll, max_wait=max_wait)
1534
+ return ok_response(tool="web_scraper", input={"action": "wait", "params": p}, output={"task_id": tid, "status": str(s)})
1535
+
1536
+ if a == "result":
1537
+ tid = str(p.get("task_id", ""))
1538
+ if not tid:
1539
+ return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_id")
1540
+ file_type = str(p.get("file_type", "json"))
1541
+ preview = bool(p.get("preview", True))
1542
+ preview_max_chars = int(p.get("preview_max_chars", 20_000))
1543
+ dl = await client.get_task_result(tid, file_type=file_type)
1544
+ from thordata_mcp.utils import enrich_download_url
1545
+
1546
+ dl = enrich_download_url(dl, task_id=tid, file_type=file_type)
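+ # For JSON results, optionally fetch a preview and normalize the first record into a compact "structured" view.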
1547
+ preview_obj = None
1548
+ structured = None
1549
+ if preview and file_type.lower() == "json":
1550
+ preview_obj = await _fetch_json_preview(dl, max_chars=preview_max_chars)
1551
+ if preview_obj.get("ok") is True:
1552
+ data = preview_obj.get("data")
1553
+ if isinstance(data, list) and data:
1554
+ structured = _normalize_record(data[0])
1555
+ elif isinstance(data, dict):
1556
+ structured = _normalize_record(data)
1557
+ return ok_response(tool="web_scraper", input={"action": "result", "params": p}, output={"task_id": tid, "download_url": dl, "preview": preview_obj, "structured": structured})
1558
+
1559
+ if a == "result_batch":
1560
+ tids = p.get("task_ids")
1561
+ if not isinstance(tids, list) or not tids:
1562
+ return error_response(tool="web_scraper", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing task_ids[]")
1563
+ file_type = str(p.get("file_type", "json"))
1564
+ preview = bool(p.get("preview", False))
1565
+ preview_max_chars = int(p.get("preview_max_chars", 20_000))
1566
+ from thordata_mcp.utils import enrich_download_url
1567
+
1568
+ results = []
1569
+ for tid in [str(x) for x in tids[:100]]:
1570
+ try:
1571
+ dl = await client.get_task_result(tid, file_type=file_type)
1572
+ dl = enrich_download_url(dl, task_id=tid, file_type=file_type)
1573
+ prev = None
1574
+ structured = None
1575
+ if preview and file_type.lower() == "json":
1576
+ prev = await _fetch_json_preview(dl, max_chars=preview_max_chars)
1577
+ if prev.get("ok") is True:
1578
+ data = prev.get("data")
1579
+ if isinstance(data, list) and data:
1580
+ structured = _normalize_record(data[0])
1581
+ elif isinstance(data, dict):
1582
+ structured = _normalize_record(data)
1583
+ results.append({"task_id": tid, "ok": True, "download_url": dl, "preview": prev, "structured": structured})
1584
+ except Exception as e:
1585
+ results.append({"task_id": tid, "ok": False, "error": {"message": str(e)}})
1586
+ return ok_response(tool="web_scraper", input={"action": "result_batch", "params": {"count": len(tids)}}, output={"results": results})
1587
+
1588
+ if a == "cancel":
1589
+ # Public spec currently doesn't provide cancel; keep clear error
1590
+ tid = str(p.get("task_id", ""))
1591
+ return error_response(tool="web_scraper", input={"action": "cancel", "params": p}, error_type="not_supported", code="E4005", message="Cancel endpoint not available in public Tasks API.", details={"task_id": tid})
1592
+
1593
+ return error_response(
1594
+ tool="web_scraper",
1595
+ input={"action": action, "params": p},
1596
+ error_type="validation_error",
1597
+ code="E4001",
1598
+ message=(
1599
+ f"Unknown action '{action}'. Supported actions: "
1600
+ "'catalog', 'groups', 'spiders', 'spider_ids', 'ids', "
1601
+ "'example', 'template', "
1602
+ "'run', 'batch_run', "
1603
+ "'raw_run', 'raw_batch_run', "
1604
+ "'list_tasks', 'status', 'status_batch', 'wait', "
1605
+ "'result', 'result_batch', 'cancel'"
1606
+ ),
1607
+ )
1608
+
1609
+ # Conditionally register WEB SCRAPER tools (kept out of default rapid mode to reduce surface area).
1610
+ if _allow("web_scraper"):
1611
+ mcp.tool(
1612
+ name="web_scraper",
1613
+ description=(
1614
+ "WEB SCRAPER TASKS: action in {catalog, groups, spiders, example, run, batch_run, "
1615
+ "raw_run, raw_batch_run, list_tasks, status, status_batch, wait, result, result_batch, cancel}. "
1616
+ "Typical flow: catalog → example (params_template) → run / batch_run, or use raw_run for direct "
1617
+ "builder/video_builder spider_id calls that mirror Dashboard curl examples."
1618
+ ),
1619
+ )(handle_mcp_errors(web_scraper))
1620
+
1621
+ # -------------------------
1622
+ # WEB SCRAPER HELP (UX helper)
1623
+ # -------------------------
1624
+ if _allow("web_scraper.help"):
1625
+ mcp.tool(
1626
+ name="web_scraper.help",
1627
+ description=(
1628
+ "Explain how to use web_scraper actions (catalog/example/run/batch_run/raw_run/...). "
1629
+ "Use this as a quick reference for LLMs and users."
1630
+ ),
1631
+ )(handle_mcp_errors(web_scraper_help))
1632
+ async def web_scraper_help() -> dict[str, Any]:
1633
+ """Return a high-level usage guide for web_scraper.* actions."""
1634
+ guide = {
1635
+ "recommended_flow": [
1636
+ "1. Discover tools: call web_scraper with action='catalog' (and optional group/keyword/limit/offset).",
1637
+ "2. Inspect a tool: call web_scraper with action='example' to get params_template and metadata.",
1638
+ "3. Run a single task: call web_scraper with action='run' and provide tool + params.",
1639
+ "4. Run many tasks: call web_scraper with action='batch_run' and a list of {tool, params}.",
1640
+ "5. Get status/result: call web_scraper with action='status'/'wait'/'result' (or their *_batch variants).",
1641
+ ],
1642
+ "quick_example": {
1643
+ "catalog": {"action": "catalog", "params": {"keyword": "amazon_product_by-url", "limit": 5}},
1644
+ "example": {"action": "example", "params": {"tool": "<tool_key_from_catalog>"}},
1645
+ "run": {
1646
+ "action": "run",
1647
+ "params": {
1648
+ "tool": "<tool_key_from_catalog>",
1649
+ "params": {"<field>": "<value>"},
1650
+ "wait": True,
1651
+ "file_type": "json",
1652
+ },
1653
+ },
1654
+ "result": {"action": "result", "params": {"task_id": "<task_id>", "file_type": "json", "preview": True}},
1655
+ },
1656
+ "when_to_use_raw_run": [
1657
+ "Use action='raw_run' or 'raw_batch_run' when you only know spider_name/spider_id from Dashboard docs, "
1658
+ "or when a spider does not yet have a dedicated SDK ToolRequest.",
1659
+ "These actions mirror the 'builder' / 'video_builder' curl examples: you pass spider_id, spider_name, "
1660
+ "spider_parameters and optional spider_universal/common_settings directly.",
1661
+ ],
1662
+ "raw_run_cheatsheet": {
1663
+ "builder": {
1664
+ "action": "raw_run",
1665
+ "params": {
1666
+ "builder": "builder",
1667
+ "spider_name": "<spider_name>",
1668
+ "spider_id": "<spider_id>",
1669
+ "spider_parameters": [{"<param>": "<value>"}],
1670
+ "spider_universal": {"<universal_param>": "<value>"},
1671
+ "wait": True,
1672
+ "file_type": "json",
1673
+ "include_errors": True,
1674
+ },
1675
+ },
1676
+ "video_builder": {
1677
+ "action": "raw_run",
1678
+ "params": {
1679
+ "builder": "video_builder",
1680
+ "spider_name": "<spider_name>",
1681
+ "spider_id": "<spider_id>",
1682
+ "spider_parameters": [{"<param>": "<value>"}],
1683
+ "common_settings": {"<common_setting>": "<value>"},
1684
+ "wait": True,
1685
+ "file_type": "json",
1686
+ "include_errors": True,
1687
+ },
1688
+ },
1689
+ "curl_mapping": [
1690
+ "curl builder/video_builder → params.builder",
1691
+ "curl spider_name → params.spider_name",
1692
+ "curl spider_id → params.spider_id",
1693
+ "curl spider_parameters → params.spider_parameters (dict or list[dict])",
1694
+ "curl spider_universal → params.spider_universal (builder only)",
1695
+ "curl common_settings → params.common_settings (video_builder only)",
1696
+ ],
1697
+ },
1698
+ "llm_tips": [
1699
+ "If you know a tool_key: catalog → example → run/batch_run (best schema, safer defaults).",
1700
+ "If you only have a URL and you're unsure which task fits: try smart_scrape(url=...) first (structured if possible, else unlocker).",
1701
+ "If catalog cannot find a matching tool by keyword/group: try web_scraper.spiders with a broader keyword (e.g. domain name) to confirm whether the spider_id exists in this MCP build.",
1702
+ "If the spider_id is not present in catalog/spiders: treat it as NOT SUPPORTED by this MCP build. Next best action is to use unlocker.fetch (or smart_scrape with prefer_structured=false) to still get content, then extract fields from HTML/Markdown.",
1703
+ "When a structured task fails but unlocker succeeds: include the URL + tool_key/spider_id + error.message in your report; it usually indicates site changes or anti-bot and we can improve routing/tool defaults.",
1704
+ "If run/raw_run returns task_id: use web_scraper.status / wait / result to poll and fetch outputs.",
1705
+ ],
1706
+ }
1707
+ return ok_response(tool="web_scraper.help", input={}, output=guide)
1708
+
1709
+ # -------------------------
1710
+ # BROWSER SCRAPER (compact)
1711
+ # -------------------------
1712
+ @mcp.tool(
1713
+ name="browser",
1714
+ description=(
1715
+ "BROWSER SCRAPER (Playwright): action in {navigate, snapshot}. "
1716
+ 'Use navigate with {"url": "..."} to open a page, then snapshot with {"filtered": true} to get ARIA refs '
1717
+ "for click/type tools from the separate browser.* namespace."
1718
+ ),
1719
+ )
1720
+ @handle_mcp_errors
1721
+ async def browser(
1722
+ action: str,
1723
+ *,
1724
+ params: Any = None,
1725
+ ctx: Optional[Context] = None,
1726
+ ) -> dict[str, Any]:
1727
+ """BROWSER SCRAPER: action in {navigate, snapshot}.
1728
+
1729
+ Args:
1730
+ action: Action to perform - "navigate" or "snapshot"
1731
+ params: Parameters dictionary. For "navigate": {"url": "https://..."}
1732
+ For "snapshot": {"filtered": true}
1733
+
1734
+ Examples:
1735
+ browser(action="navigate", params={"url": "https://www.google.com"})
1736
+ browser(action="snapshot", params={"filtered": true})
1737
+ """
1738
+ # Normalize params with enhanced error messages
1739
+ try:
1740
+ p = normalize_params(params, "browser", action)
1741
+ except ValueError as e:
1742
+ if "JSON" in str(e):
1743
+ return create_params_error("browser", action, params, str(e))
1744
+ else:
1745
+ return create_params_error("browser", action, params, str(e))
1746
+
1747
+ a = (action or "").strip().lower()
1748
+ if not a:
1749
+ return error_response(
1750
+ tool="browser",
1751
+ input={"action": action, "params": p},
1752
+ error_type="validation_error",
1753
+ code="E4001",
1754
+ message="action is required",
1755
+ )
1756
+
1757
+ # Credentials check
1758
+ user = settings.THORDATA_BROWSER_USERNAME
1759
+ pwd = settings.THORDATA_BROWSER_PASSWORD
1760
+ if not user or not pwd:
1761
+ return error_response(
1762
+ tool="browser",
1763
+ input={"action": action, "params": p},
1764
+ error_type="config_error",
1765
+ code="E1001",
1766
+ message="Missing browser credentials. Set THORDATA_BROWSER_USERNAME and THORDATA_BROWSER_PASSWORD.",
1767
+ )
1768
+ session = await ServerContext.get_browser_session()
1769
+ if a == "navigate":
1770
+ url = str(p.get("url", ""))
1771
+ if not url:
1772
+ return error_response(tool="browser", input={"action": action, "params": p}, error_type="validation_error", code="E4001", message="Missing url")
1773
+ page = await session.get_page(url)
1774
+ if page.url != url:
1775
+ await page.goto(url, timeout=120_000)
1776
+ title = await page.title()
1777
+ return ok_response(tool="browser", input={"action": "navigate", "params": p}, output={"url": page.url, "title": title})
1778
+ if a == "snapshot":
1779
+ filtered = bool(p.get("filtered", True))
1780
+ mode = str(p.get("mode", "compact") or "compact").strip().lower()
1781
+ max_items = int(p.get("max_items", 80) or 80)
1782
+ if max_items <= 0 or max_items > 500:
1783
+ return error_response(
1784
+ tool="browser",
1785
+ input={"action": action, "params": p},
1786
+ error_type="validation_error",
1787
+ code="E4001",
1788
+ message="max_items must be between 1 and 500",
1789
+ details={"max_items": max_items},
1790
+ )
1791
+ include_dom = bool(p.get("include_dom", False))
1792
+ # Optional: allow snapshot to navigate when url is provided (better UX)
1793
+ url = p.get("url")
1794
+ if isinstance(url, str) and url.strip():
1795
+ page = await session.get_page(url)
1796
+ if page.url != url:
1797
+ await page.goto(url, timeout=120_000)
1798
+ data = await session.capture_snapshot(filtered=filtered, mode=mode, max_items=max_items, include_dom=include_dom)
1799
+ # Apply an additional safety max_chars guard to avoid flooding context.
1800
+ max_chars = int(p.get("max_chars", 20_000) or 20_000)
1801
+ aria_snapshot = truncate_content(str(data.get("aria_snapshot", "")), max_length=max_chars)
1802
+ dom_snapshot = data.get("dom_snapshot")
1803
+ dom_snapshot = truncate_content(str(dom_snapshot), max_length=max_chars) if dom_snapshot else None
1804
+ meta = data.get("_meta") if isinstance(data, dict) else None
1805
+ return ok_response(
1806
+ tool="browser",
1807
+ input={"action": "snapshot", "params": p},
1808
+ output={
1809
+ "url": data.get("url"),
1810
+ "title": data.get("title"),
1811
+ "aria_snapshot": aria_snapshot,
1812
+ "dom_snapshot": dom_snapshot,
1813
+ "_meta": meta,
1814
+ },
1815
+ )
1816
+ return error_response(
1817
+ tool="browser",
1818
+ input={"action": action, "params": p},
1819
+ error_type="validation_error",
1820
+ code="E4001",
1821
+ message=f"Unknown action '{action}'. Supported actions: 'navigate', 'snapshot'",
1822
+ )
1823
+
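# [Editor's sketch -- illustrative usage, not part of the package diff] Grounded in the handler
# above: navigate opens (or reuses) a Playwright page, then snapshot returns ARIA/DOM text capped
# by max_chars. Parameter names come from the code; the call style assumes an MCP client that
# passes action/params through unchanged.
#
#   await browser(action="navigate", params={"url": "https://www.google.com"})
#   snap = await browser(
#       action="snapshot",
#       params={"filtered": True, "mode": "compact", "max_items": 80,
#               "include_dom": False, "max_chars": 20_000},
#   )
#   # snap["output"] carries: url, title, aria_snapshot, dom_snapshot, _meta
#   # snapshot also accepts a "url" param to navigate and capture in a single call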
1824
+ # -------------------------
1825
+ # SMART SCRAPE (compact)
1826
+ # -------------------------
1827
+ @mcp.tool(
1828
+ name="smart_scrape",
1829
+ description=(
1830
+ "Auto-pick a Web Scraper task for URL; fallback to Unlocker. "
1831
+ "Always returns a structured summary plus raw HTML/JSON preview when possible."
1832
+ ),
1833
+ )
1834
+ @handle_mcp_errors
1835
+ async def smart_scrape(
1836
+ url: str,
1837
+ *,
1838
+ prefer_structured: bool = True,
1839
+ preview: bool = True,
1840
+ preview_max_chars: int = 20_000,
1841
+ max_wait_seconds: int = 300,
1842
+ unlocker_output: str = "markdown",
1843
+ ctx: Optional[Context] = None,
1844
+ ) -> dict[str, Any]:
1845
+ """Auto-pick a Web Scraper task for URL; fallback to Unlocker. Always returns structured."""
1846
+ # Basic schema-style guards for numeric params
1847
+ if preview_max_chars <= 0 or preview_max_chars > 100_000:
1848
+ return error_response(
1849
+ tool="smart_scrape",
1850
+ input={"url": url, "prefer_structured": prefer_structured, "preview": preview, "preview_max_chars": preview_max_chars},
1851
+ error_type="validation_error",
1852
+ code="E4001",
1853
+ message="preview_max_chars must be between 1 and 100000",
1854
+ details={"preview_max_chars": preview_max_chars},
1855
+ )
1856
+ if max_wait_seconds <= 0 or max_wait_seconds > 600:
1857
+ return error_response(
1858
+ tool="smart_scrape",
1859
+ input={"url": url, "prefer_structured": prefer_structured, "preview": preview, "max_wait_seconds": max_wait_seconds},
1860
+ error_type="validation_error",
1861
+ code="E4001",
1862
+ message="max_wait_seconds must be between 1 and 600",
1863
+ details={"max_wait_seconds": max_wait_seconds},
1864
+ )
1865
+ await safe_ctx_info(ctx, f"smart_scrape url={url!r} prefer_structured={prefer_structured}")
1866
+ host = _hostname(url)
1867
+ url_lower = url.lower()
1868
+ tried: list[dict[str, Any]] = []
1869
+
1870
+ # Special-case: Google search pages are best handled by SERP (more reliable than Unlocker).
1871
+ if prefer_structured:
1872
+ def _is_google_search_local(u: str) -> tuple[bool, str | None]:
1873
+ try:
1874
+ from urllib.parse import urlparse, parse_qs
1875
+
1876
+ p0 = urlparse(u)
1877
+ h0 = (p0.hostname or "").lower()
1878
+ if h0.startswith("www."):
1879
+ h0 = h0[4:]
1880
+ if h0 != "google.com":
1881
+ return (False, None)
1882
+ if p0.path != "/search":
1883
+ return (False, None)
1884
+ qs0 = parse_qs(p0.query or "")
1885
+ q0 = (qs0.get("q") or [""])[0].strip()
1886
+ return (bool(q0), q0 or None)
1887
+ except Exception:
1888
+ return (False, None)
1889
+
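# [Editor's note -- expected behavior of the helper above, shown for illustration]
#   "https://google.com/search?q=mcp"        -> (True, "mcp")
#   "https://www.google.com/search?q=mcp"    -> (True, "mcp")      # leading "www." is stripped
#   "https://google.com/search"              -> (False, None)      # no non-empty q parameter
#   "https://www.google.com/maps?q=mcp"      -> (False, None)      # path must be exactly /search
#   "https://www.google.co.uk/search?q=mcp"  -> (False, None)      # only google.com is matched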
1890
+ is_g, q = _is_google_search_local(url)
1891
+ if is_g:
1892
+ await safe_ctx_info(ctx, f"smart_scrape: Google search detected, routing to SERP q={q!r}")
1893
+ try:
1894
+ from thordata.types import SerpRequest
1895
+ from thordata.types import Engine as EngineEnum
1896
+ client = await ServerContext.get_client()
1897
+ req = SerpRequest(
1898
+ query=str(q or ""),
1899
+ engine=EngineEnum.GOOGLE,
1900
+ num=10,
1901
+ start=0,
1902
+ country=None,
1903
+ language=None,
1904
+ google_domain="google.com",
1905
+ extra_params={},
1906
+ )
1907
+ data = await client.serp_search_advanced(req)
1908
+ serp_preview = None
1909
+ if preview:
1910
+ raw = truncate_content(str(data), max_length=int(preview_max_chars))
1911
+ serp_preview = {"format": "light_json", "raw": raw}
1912
+ return ok_response(
1913
+ tool="smart_scrape",
1914
+ input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
1915
+ output={
1916
+ "path": "SERP",
1917
+ "serp": {"engine": "google", "q": q, "num": 10, "start": 0},
1918
+ "result": data,
1919
+ "structured": {"url": url, "query": q, "engine": "google"},
1920
+ "preview": serp_preview,
1921
+ "candidates": [],
1922
+ "tried": tried,
1923
+ },
1924
+ )
1925
+ except Exception as e:
1926
+ err_msg = str(e)
1927
+ tried.append({"path": "SERP", "engine": "google", "q": q, "ok": False, "error": err_msg})
1928
+ await safe_ctx_info(ctx, f"smart_scrape: SERP routing failed, falling back. err={e}")
1929
+
1930
+ # Match product.py behavior: for certain URLs, don't even attempt Web Scraper.
1931
+ # - Google search pages: prefer SERP / Unlocker
1932
+ # - Generic/example domains: never pick marketplace/product tools
1933
+ skip_web_scraper = False
1934
+ if host == "google.com" and "/search" in url_lower:
1935
+ skip_web_scraper = True
1936
+ generic_domains = {"example.com", "example.org", "example.net", "test.com", "localhost"}
1937
+ if host in generic_domains or (host and host.endswith(".example.com")):
1938
+ skip_web_scraper = True
1939
+
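# [Editor's note -- illustration of the routing above, not part of the package diff]
#   https://google.com/search?q=mcp   -> Web Scraper skipped (SERP / Unlocker handle it)
#   https://example.com/anything      -> Web Scraper skipped (generic/test domain)
#   https://shop.example.com/item/1   -> Web Scraper skipped (*.example.com)
#   other hosts                       -> structured Web Scraper candidates are still considered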
1940
+ selected_tool: str | None = None
1941
+ selected_params: dict[str, Any] = {}
1942
+ candidates: list[tuple[str, dict[str, Any]]] = []
1943
+ if not skip_web_scraper:
1944
+ selected_tool, selected_params = _guess_tool_for_url(url)
1945
+ # Only keep guessed tool if it exists in tool map (avoid invalid hardcode drift)
1946
+ from .product import _ensure_tools as _ensure # local import to avoid cycles
1947
+
1948
+ _, tools_map = _ensure()
1949
+ if selected_tool and selected_tool in tools_map:
1950
+ candidates.append((selected_tool, selected_params))
1951
+
1952
+ if not candidates:
1953
+ candidate_keys = _candidate_tools_for_url(url, limit=3)
1954
+ # Filter out obviously wrong tools (like GitHub for non-GitHub URLs)
1955
+ filtered_candidates: list[str] = []
1956
+ for k in candidate_keys:
1957
+ lk = k.lower()
1958
+ if "github" in lk and host and "github" not in host.lower():
1959
+ continue
1960
+ if "repository" in lk and host and "github" not in host.lower() and "gitlab" not in host.lower():
1961
+ continue
1962
+ if "amazon" in lk and host and "amazon" not in host.lower():
1963
+ continue
1964
+ if "walmart" in lk and host and "walmart" not in host.lower():
1965
+ continue
1966
+ if ("googleshopping" in lk or "google.shopping" in lk) and (host == "google.com" or "/search" in url_lower):
1967
+ continue
1968
+ filtered_candidates.append(k)
1969
+
1970
+ for k in filtered_candidates:
1971
+ candidates.append((k, {"url": url}))
1972
+ else:
1973
+ await safe_ctx_info(ctx, f"smart_scrape: skipping Web Scraper for host={host!r} url={url!r}")
1974
+
1975
+ if prefer_structured and candidates:
1976
+ for tool, params in candidates[:3]:
1977
+ r = await _run_web_scraper_tool(tool=tool, params=params, wait=True, max_wait_seconds=max_wait_seconds, file_type="json", ctx=ctx)
1978
+ # Check if task succeeded (status should be Ready/Success, not Failed)
1979
+ result_obj = r.get("output") if isinstance(r.get("output"), dict) else {}
1980
+ status = result_obj.get("status", "").lower() if isinstance(result_obj, dict) else ""
1981
+
1982
+ # If status is Failed, don't try more Web Scraper tools - go to Unlocker
1983
+ # Also check if r.get("ok") is False, which indicates the tool call itself failed
1984
+ if status == "failed" or r.get("ok") is False:
1985
+ await safe_ctx_info(ctx, f"smart_scrape: Web Scraper tool {tool} failed (status={status}, ok={r.get('ok')}), falling back to Unlocker")
1986
+ tried.append({
1987
+ "tool": tool,
1988
+ "ok": r.get("ok"),
1989
+ "status": status,
1990
+ "error": r.get("error"),
1991
+ })
1992
+ break # Exit loop and go to Unlocker fallback
1993
+
1994
+ # Only return success if both ok is True AND status is not failed
1995
+ if r.get("ok") is True and status not in {"failed", "error", "failure"}:
1996
+ out = r.get("output") if isinstance(r.get("output"), dict) else {}
1997
+ dl = out.get("download_url") if isinstance(out, dict) else None
1998
+ preview_obj = None
1999
+ structured = {"url": url}
2000
+ if preview and isinstance(dl, str) and dl:
2001
+ preview_obj = await _fetch_json_preview(dl, max_chars=int(preview_max_chars))
2002
+ # Try to use preview data even if JSON parsing failed but we have raw data
2003
+ if preview_obj.get("ok") is True:
2004
+ data = preview_obj.get("data")
2005
+ if isinstance(data, list) and data:
2006
+ structured = _normalize_record(data[0], url=url)
2007
+ elif isinstance(data, dict):
2008
+ structured = _normalize_record(data, url=url)
2009
+ elif preview_obj.get("status") == 200 and preview_obj.get("raw"):
2010
+ # JSON parsing failed but we have raw data - try to extract basic info
2011
+ raw = preview_obj.get("raw", "")
2012
+ if raw:
2013
+ # JSON parsing failed; fall back to a short raw-text preview
2014
+ structured = {"url": url, "raw_preview": raw[:500]} # Limit raw preview size
2015
+ return ok_response(
2016
+ tool="smart_scrape",
2017
+ input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
2018
+ output={
2019
+ "path": "WEB_SCRAPER",
2020
+ "selected_tool": tool,
2021
+ "selected_params": params,
2022
+ "result": out,
2023
+ "structured": structured,
2024
+ "preview": preview_obj,
2025
+ "candidates": [c[0] for c in candidates],
2026
+ "tried": tried,
2027
+ },
2028
+ )
2029
+ tried.append({"tool": tool, "ok": r.get("ok"), "status": status, "error": r.get("error")})
2030
+
2031
+ client = await ServerContext.get_client()
2032
+ try:
2033
+ with PerformanceTimer(tool="smart_scrape.unlocker", url=url):
2034
+ html = await client.universal_scrape(url=url, js_render=True, output_format="html", wait_for=".content")
2035
+ html_str = str(html) if not isinstance(html, str) else html
2036
+ extracted = _extract_structured_from_html(html_str) if html_str else {}
2037
+ structured = _normalize_extracted(extracted, url=url)
2038
+ # Token-efficient preview
2039
+ preview_obj: dict[str, Any] | None = None
2040
+ out_mode = (unlocker_output or "markdown").strip().lower()
2041
+ if out_mode not in {"markdown", "md", "html"}:
2042
+ out_mode = "markdown"
2043
+ if preview:
2044
+ if out_mode in {"markdown", "md"}:
2045
+ md = html_to_markdown_clean(html_str)
2046
+ md = truncate_content(md, max_length=int(preview_max_chars))
2047
+ preview_obj = {"format": "markdown", "raw": md}
2048
+ else:
2049
+ preview_obj = {"format": "html", "raw": truncate_content(html_str, max_length=int(preview_max_chars))}
2050
+ return ok_response(
2051
+ tool="smart_scrape",
2052
+ input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
2053
+ output={
2054
+ "path": "WEB_UNLOCKER",
2055
+ "preview": preview_obj,
2056
+ "extracted": extracted,
2057
+ "structured": structured,
2058
+ "selected_tool": selected_tool,
2059
+ "selected_params": selected_params,
2060
+ "candidates": [c[0] for c in candidates],
2061
+ "tried": tried,
2062
+ },
2063
+ )
2064
+ except asyncio.TimeoutError as e:
2065
+ # Handle timeout specifically
2066
+ await safe_ctx_info(ctx, f"smart_scrape: Unlocker timed out: {e}")
2067
+ return error_response(
2068
+ tool="smart_scrape",
2069
+ input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
2070
+ error_type="timeout_error",
2071
+ code="E2003",
2072
+ message=f"Unlocker request timed out. The page may be slow to load or blocked.",
2073
+ details={
2074
+ "selected_tool": selected_tool,
2075
+ "candidates": [c[0] for c in candidates],
2076
+ "tried": tried,
2077
+ },
2078
+ )
2079
+ except Exception as e:
2080
+ # If Unlocker also fails, return error with context
2081
+ await safe_ctx_info(ctx, f"smart_scrape: Unlocker also failed: {e}")
2082
+ error_msg = str(e)
2083
+ # Extract more useful error information
2084
+ if "504" in error_msg or "Gateway Timeout" in error_msg:
2085
+ error_type = "timeout_error"
2086
+ error_code = "E2003"
2087
+ error_message = f"Unlocker request timed out (504 Gateway Timeout). The page may be slow to load or blocked."
2088
+ elif "timeout" in error_msg.lower():
2089
+ error_type = "timeout_error"
2090
+ error_code = "E2003"
2091
+ error_message = f"Unlocker request timed out: {error_msg}"
2092
+ else:
2093
+ error_type = "network_error"
2094
+ error_code = "E2002"
2095
+ error_message = f"Both Web Scraper and Unlocker failed. Last error: {error_msg}"
2096
+ return error_response(
2097
+ tool="smart_scrape",
2098
+ input={"url": url, "prefer_structured": prefer_structured, "preview": preview},
2099
+ error_type=error_type,
2100
+ code=error_code,
2101
+ message=error_message,
2102
+ details={
2103
+ "selected_tool": selected_tool,
2104
+ "candidates": [c[0] for c in candidates],
2105
+ "tried": tried,
2106
+ },
2107
+ )
2108
+
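# [Editor's sketch -- illustrative usage of smart_scrape, not part of the package diff]
# Grounded in the handler above: output["path"] reports which route answered -- "SERP" for
# google.com/search URLs, "WEB_SCRAPER" when a structured spider succeeded, or "WEB_UNLOCKER"
# for the HTML/markdown fallback. The call style assumes an MCP client passing keyword
# arguments straight through; the error payload shape is inferred from the error_response
# arguments and is an assumption.
#
#   r = await smart_scrape(
#       url="https://example.com/article",
#       prefer_structured=True,
#       preview=True,
#       preview_max_chars=20_000,    # validated to 1..100_000
#       max_wait_seconds=300,        # validated to 1..600
#       unlocker_output="markdown",  # or "html"
#   )
#   if r.get("ok"):
#       path = r["output"]["path"]             # "SERP" | "WEB_SCRAPER" | "WEB_UNLOCKER"
#       structured = r["output"]["structured"]
#       preview_text = (r["output"].get("preview") or {}).get("raw")
#   else:
#       # codes used above: E4001 validation, E2003 timeout, E2002 network
#       ...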