thordata-mcp-server 0.4.4__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/PKG-INFO +29 -54
  2. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/README.md +27 -52
  3. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/pyproject.toml +2 -2
  4. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/__init__.py +1 -1
  5. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/browser_session.py +157 -12
  6. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/config.py +14 -3
  7. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/context.py +1 -1
  8. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/browser.py +124 -18
  9. thordata_mcp_server-0.5.0/src/thordata_mcp/tools/debug.py +125 -0
  10. thordata_mcp_server-0.5.0/src/thordata_mcp/tools/params_utils.py +107 -0
  11. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/product.py +83 -5
  12. thordata_mcp_server-0.5.0/src/thordata_mcp/tools/product_compact.py +2108 -0
  13. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/utils.py +2 -0
  14. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/utils.py +393 -322
  15. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/PKG-INFO +29 -54
  16. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/SOURCES.txt +2 -0
  17. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/requires.txt +1 -1
  18. thordata_mcp_server-0.4.4/src/thordata_mcp/tools/product_compact.py +0 -962
  19. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/setup.cfg +0 -0
  20. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/aria_snapshot.py +0 -0
  21. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/debug_http.py +0 -0
  22. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/main.py +0 -0
  23. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/monitoring.py +0 -0
  24. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/registry.py +0 -0
  25. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/__init__.py +0 -0
  26. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/__init__.py +0 -0
  27. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/serp.py +0 -0
  28. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/tasks.py +0 -0
  29. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/universal.py +0 -0
  30. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/dependency_links.txt +0 -0
  31. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/entry_points.txt +0 -0
  32. {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/top_level.txt +0 -0
{thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: thordata-mcp-server
- Version: 0.4.4
+ Version: 0.5.0
  Summary: Official MCP Server for Thordata.
  Author-email: Thordata Developer Team <support@thordata.com>
  License-Expression: MIT
@@ -8,7 +8,7 @@ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  Requires-Dist: mcp[cli]>=1.0.0
  Requires-Dist: sse-starlette>=1.6.1
- Requires-Dist: thordata-sdk>=1.6.0
+ Requires-Dist: thordata-sdk>=1.7.0
  Requires-Dist: pydantic-settings
  Requires-Dist: markdownify
  Requires-Dist: html2text
@@ -23,14 +23,14 @@ Requires-Dist: uvicorn

  **Give your AI Agents real-time web scraping superpowers.**

- This MCP Server version has been **streamlined to focus on scraping**, concentrating on four core products:
+ This MCP Server version has been **streamlined to focus on scraping**, concentrating on a compact, LLM-friendly tool surface:

- - **SERP API** (Search result scraping)
+ - **Search Engine** (LLM-friendly web search wrapper)
+ - **SERP API** (Search result scraping, internal plumbing)
  - **Web Unlocker / Universal Scraper** (Universal page unlocking & scraping)
- - **Web Scraper API** (Structured task flow)
  - **Scraping Browser** (Browser-level scraping)

- Earlier versions exposed `proxy.*` / `account.*` / `proxy_users.*` proxy and account management tools. This version has removed these control plane interfaces, keeping only scraping-related capabilities for a clean tool surface in Cursor / MCP clients.
+ Earlier versions exposed `proxy.*` / `account.*` / `proxy_users.*` proxy and account management tools, and a large `web_scraper` task surface. This version removes those control plane interfaces from MCP, keeping only scraping-related capabilities that are easy for LLMs to use.

  ## 🚀 Features

@@ -76,38 +76,15 @@ THORDATA_BROWSER_PASSWORD=your_password

  ### Tool Exposure Modes

- Current implementation provides **streamlined scraping tool surface only**, no longer exposing proxy and account management tools:
+ The current implementation provides a **compact scraping tool surface**, optimized for Cursor / LLM tool callers:

- - **SERP SCRAPER**: `serp` (actions: `search`, `batch_search`)
- - **WEB UNLOCKER**: `unlocker` (actions: `fetch`, `batch_fetch`)
- - **WEB SCRAPER (100+ structured tasks + task management)**: `web_scraper` (actions: `catalog`, `groups`, `run`, `batch_run`, `status`, `status_batch`, `wait`, `result`, `result_batch`, `list_tasks`, `cancel`)
- - **BROWSER SCRAPER**: `browser` (actions: `navigate`, `snapshot`)
- - **Smart (auto tool + fallback)**: `smart_scrape`
+ - **`search_engine`** (recommended for LLMs): high-level web search wrapper that returns a light `results[]` array with `title/link/description`. Internally delegates to the SERP backend.
+ - **`search_engine_batch`**: batch variant of `search_engine` with per-item `ok/error` results.
+ - **`unlocker`**: actions `fetch` / `batch_fetch` for universal page unlocking & content extraction (HTML/Markdown), with per-item error reporting for batches.
+ - **`browser`**: action `snapshot` navigates (optional `url`) and captures an ARIA-focused snapshot of interactive elements.
+ - **`smart_scrape`**: auto-picks the best scraper (SERP, Web Scraper, Unlocker) for a given URL and returns a unified, LLM-friendly response.

- > Note: This version focuses on scraping functionality and no longer includes `proxy.*` / `account.*` control plane tools.
-
- ### Web Scraper discovery (100+ tools, no extra env required)
-
- Use `web_scraper` with `action="catalog"` / `action="groups"` to discover tools.
- This keeps Cursor/LLMs usable while still supporting **100+ tools** under a single entrypoint.
-
- ```env
- # Default: curated + limit 60
- THORDATA_TASKS_LIST_MODE=curated
- THORDATA_TASKS_LIST_DEFAULT_LIMIT=60
-
- # Which groups are included when mode=curated
- THORDATA_TASKS_GROUPS=ecommerce,social,video,search,travel,code,professional
-
- # Optional safety/UX: restrict which tools can actually run
- # (comma-separated prefixes or exact tool keys)
- # Example:
- # THORDATA_TASKS_ALLOWLIST=thordata.tools.video.,thordata.tools.ecommerce.Amazon.ProductByAsin
- THORDATA_TASKS_ALLOWLIST=
- ```
-
- If you want Cursor to **never** see the full 300+ tool list, keep `THORDATA_TASKS_LIST_MODE=curated`
- and optionally set `THORDATA_TASKS_ALLOWLIST` to the small subset you actually want to support.
+ Internally, the server still uses structured SERP and Web Scraper capabilities, but they are not exposed as large tool surfaces by default, to avoid overwhelming LLMs.

  ### Deployment (Optional)

@@ -162,19 +139,17 @@ Add this to your `claude_desktop_config.json`:
  Notes:
  - `THORDATA_BROWSER_USERNAME` / `THORDATA_BROWSER_PASSWORD` are required for `browser.*` tools (Scraping Browser).

- ## 🛠️ Available Tools
-
- ### Available Tools (All directly related to scraping)
+ ## 🛠️ Available Tools (Compact Surface)

- Current MCP Server only exposes the following **5 scraping-related tools**:
+ By default, the MCP server exposes a **small, LLM-friendly tool set**:

- - **`serp`**: action `search`, `batch_search`
- - **`unlocker`**: action `fetch`, `batch_fetch`
- - **`web_scraper`**: action `catalog`, `groups`, `run`, `batch_run`, `status`, `status_batch`, `wait`, `result`, `result_batch`, `list_tasks`, `cancel`
- - **`browser`**: action `navigate`, `snapshot`
- - **`smart_scrape`**: auto-pick structured tool; fallback to unlocker
+ - **`search_engine`**: single-query web search (`params.q`, optional `params.num`, `params.engine`).
+ - **`search_engine_batch`**: batch web search with per-item `ok/error` in `results[]`.
+ - **`unlocker`**: universal scraping via `fetch` / `batch_fetch`.
+ - **`browser`**: `snapshot` with optional `url`, `max_items`, and `max_chars`.
+ - **`smart_scrape`**: smart router for a `url`, with optional preview limit parameters.

- > Proxy network related APIs can still be used via other Thordata SDKs / HTTP APIs, but are not exposed through MCP to avoid introducing complex management operations in LLMs.
+ Advanced / internal tools (e.g. the low-level `serp.*` and full `web_scraper.*` surfaces, and the proxy/account control plane) remain available via HTTP APIs and SDKs, but are not exposed directly as MCP tools, keeping the surface manageable for agents and LLMs.

  ## 🏗️ Architecture

@@ -189,14 +164,14 @@ thordata_mcp/
  ├── utils.py           # Common utilities (error handling, responses)
  ├── browser_session.py # Browser session management (Playwright)
  ├── aria_snapshot.py   # ARIA snapshot filtering
- └── tools/
-     ├── product_compact.py # Streamlined 5-tool entry point (serp/unlocker/web_scraper/browser/smart_scrape)
-     ├── product.py         # Full product implementation for internal use (reused by compact version)
-     ├── data/              # Data plane tools (only scraping-related namespaces retained)
-     │   ├── serp.py        # serp.*
-     │   ├── universal.py   # universal.*
-     │   ├── browser.py     # browser.*
-     │   └── tasks.py       # tasks.*
+ └── tools/
+     ├── product_compact.py # Streamlined MCP entrypoint (search_engine / unlocker / browser / smart_scrape, plus batch variants)
+     ├── product.py         # Full product implementation for internal use (reused by compact version)
+     ├── data/              # Data plane tools (only scraping-related namespaces retained)
+     │   ├── serp.py        # SERP backend integration
+     │   ├── universal.py   # Universal / Unlocker backend integration
+     │   ├── browser.py     # Browser / Playwright helpers
+     │   └── tasks.py       # Structured scraping tasks (used by smart_scrape and internal flows)
  ```

  ## 🎯 Design Principles
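For orientation, here is a minimal sketch of what calling the new compact surface looks like from the MCP Python SDK. The `thordata-mcp-server` launch command, the `params.q` / `params.num` argument names, and the `results[]` shape follow the README text in the hunks above; treat the exact wiring as an assumption rather than a verified recipe.

```python
# Minimal sketch: calling the compact `search_engine` tool over stdio.
# Assumptions: the server binary is launched as `thordata-mcp-server`, and it
# accepts a `params` object with `q`/`num` as described in the README above.
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    server = StdioServerParameters(
        command="thordata-mcp-server",
        env={"THORDATA_SCRAPER_TOKEN": "your_token"},  # placeholder credential
    )
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "search_engine",
                arguments={"params": {"q": "thordata mcp server", "num": 5}},
            )
            # Expected (per the README): a light results[] array with
            # title/link/description per item.
            print(result.content)


asyncio.run(main())
```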
{thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/README.md

@@ -2,14 +2,14 @@

  **Give your AI Agents real-time web scraping superpowers.**

- This MCP Server version has been **streamlined to focus on scraping**, concentrating on four core products:
+ This MCP Server version has been **streamlined to focus on scraping**, concentrating on a compact, LLM-friendly tool surface:

- - **SERP API** (Search result scraping)
+ - **Search Engine** (LLM-friendly web search wrapper)
+ - **SERP API** (Search result scraping, internal plumbing)
  - **Web Unlocker / Universal Scraper** (Universal page unlocking & scraping)
- - **Web Scraper API** (Structured task flow)
  - **Scraping Browser** (Browser-level scraping)

- Earlier versions exposed `proxy.*` / `account.*` / `proxy_users.*` proxy and account management tools. This version has removed these control plane interfaces, keeping only scraping-related capabilities for a clean tool surface in Cursor / MCP clients.
+ Earlier versions exposed `proxy.*` / `account.*` / `proxy_users.*` proxy and account management tools, and a large `web_scraper` task surface. This version removes those control plane interfaces from MCP, keeping only scraping-related capabilities that are easy for LLMs to use.

  ## 🚀 Features

@@ -55,38 +55,15 @@ THORDATA_BROWSER_PASSWORD=your_password

  ### Tool Exposure Modes

- Current implementation provides **streamlined scraping tool surface only**, no longer exposing proxy and account management tools:
+ The current implementation provides a **compact scraping tool surface**, optimized for Cursor / LLM tool callers:

- - **SERP SCRAPER**: `serp` (actions: `search`, `batch_search`)
- - **WEB UNLOCKER**: `unlocker` (actions: `fetch`, `batch_fetch`)
- - **WEB SCRAPER (100+ structured tasks + task management)**: `web_scraper` (actions: `catalog`, `groups`, `run`, `batch_run`, `status`, `status_batch`, `wait`, `result`, `result_batch`, `list_tasks`, `cancel`)
- - **BROWSER SCRAPER**: `browser` (actions: `navigate`, `snapshot`)
- - **Smart (auto tool + fallback)**: `smart_scrape`
+ - **`search_engine`** (recommended for LLMs): high-level web search wrapper that returns a light `results[]` array with `title/link/description`. Internally delegates to the SERP backend.
+ - **`search_engine_batch`**: batch variant of `search_engine` with per-item `ok/error` results.
+ - **`unlocker`**: actions `fetch` / `batch_fetch` for universal page unlocking & content extraction (HTML/Markdown), with per-item error reporting for batches.
+ - **`browser`**: action `snapshot` navigates (optional `url`) and captures an ARIA-focused snapshot of interactive elements.
+ - **`smart_scrape`**: auto-picks the best scraper (SERP, Web Scraper, Unlocker) for a given URL and returns a unified, LLM-friendly response.

- > Note: This version focuses on scraping functionality and no longer includes `proxy.*` / `account.*` control plane tools.
-
- ### Web Scraper discovery (100+ tools, no extra env required)
-
- Use `web_scraper` with `action="catalog"` / `action="groups"` to discover tools.
- This keeps Cursor/LLMs usable while still supporting **100+ tools** under a single entrypoint.
-
- ```env
- # Default: curated + limit 60
- THORDATA_TASKS_LIST_MODE=curated
- THORDATA_TASKS_LIST_DEFAULT_LIMIT=60
-
- # Which groups are included when mode=curated
- THORDATA_TASKS_GROUPS=ecommerce,social,video,search,travel,code,professional
-
- # Optional safety/UX: restrict which tools can actually run
- # (comma-separated prefixes or exact tool keys)
- # Example:
- # THORDATA_TASKS_ALLOWLIST=thordata.tools.video.,thordata.tools.ecommerce.Amazon.ProductByAsin
- THORDATA_TASKS_ALLOWLIST=
- ```
-
- If you want Cursor to **never** see the full 300+ tool list, keep `THORDATA_TASKS_LIST_MODE=curated`
- and optionally set `THORDATA_TASKS_ALLOWLIST` to the small subset you actually want to support.
+ Internally, the server still uses structured SERP and Web Scraper capabilities, but they are not exposed as large tool surfaces by default, to avoid overwhelming LLMs.

  ### Deployment (Optional)

@@ -141,19 +118,17 @@ Add this to your `claude_desktop_config.json`:
  Notes:
  - `THORDATA_BROWSER_USERNAME` / `THORDATA_BROWSER_PASSWORD` are required for `browser.*` tools (Scraping Browser).

- ## 🛠️ Available Tools
-
- ### Available Tools (All directly related to scraping)
+ ## 🛠️ Available Tools (Compact Surface)

- Current MCP Server only exposes the following **5 scraping-related tools**:
+ By default, the MCP server exposes a **small, LLM-friendly tool set**:

- - **`serp`**: action `search`, `batch_search`
- - **`unlocker`**: action `fetch`, `batch_fetch`
- - **`web_scraper`**: action `catalog`, `groups`, `run`, `batch_run`, `status`, `status_batch`, `wait`, `result`, `result_batch`, `list_tasks`, `cancel`
- - **`browser`**: action `navigate`, `snapshot`
- - **`smart_scrape`**: auto-pick structured tool; fallback to unlocker
+ - **`search_engine`**: single-query web search (`params.q`, optional `params.num`, `params.engine`).
+ - **`search_engine_batch`**: batch web search with per-item `ok/error` in `results[]`.
+ - **`unlocker`**: universal scraping via `fetch` / `batch_fetch`.
+ - **`browser`**: `snapshot` with optional `url`, `max_items`, and `max_chars`.
+ - **`smart_scrape`**: smart router for a `url`, with optional preview limit parameters.

- > Proxy network related APIs can still be used via other Thordata SDKs / HTTP APIs, but are not exposed through MCP to avoid introducing complex management operations in LLMs.
+ Advanced / internal tools (e.g. the low-level `serp.*` and full `web_scraper.*` surfaces, and the proxy/account control plane) remain available via HTTP APIs and SDKs, but are not exposed directly as MCP tools, keeping the surface manageable for agents and LLMs.

  ## 🏗️ Architecture

@@ -168,14 +143,14 @@ thordata_mcp/
  ├── utils.py           # Common utilities (error handling, responses)
  ├── browser_session.py # Browser session management (Playwright)
  ├── aria_snapshot.py   # ARIA snapshot filtering
- └── tools/
-     ├── product_compact.py # Streamlined 5-tool entry point (serp/unlocker/web_scraper/browser/smart_scrape)
-     ├── product.py         # Full product implementation for internal use (reused by compact version)
-     ├── data/              # Data plane tools (only scraping-related namespaces retained)
-     │   ├── serp.py        # serp.*
-     │   ├── universal.py   # universal.*
-     │   ├── browser.py     # browser.*
-     │   └── tasks.py       # tasks.*
+ └── tools/
+     ├── product_compact.py # Streamlined MCP entrypoint (search_engine / unlocker / browser / smart_scrape, plus batch variants)
+     ├── product.py         # Full product implementation for internal use (reused by compact version)
+     ├── data/              # Data plane tools (only scraping-related namespaces retained)
+     │   ├── serp.py        # SERP backend integration
+     │   ├── universal.py   # Universal / Unlocker backend integration
+     │   ├── browser.py     # Browser / Playwright helpers
+     │   └── tasks.py       # Structured scraping tasks (used by smart_scrape and internal flows)
  ```

  ## 🎯 Design Principles
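Both README copies stress per-item `ok/error` reporting for the batch tools, so a short sketch of consuming such a result may help. The `queries` argument name and the payload keys (`ok`, `error`, `query`) are assumptions based on the bullet text above, not a documented schema.

```python
# Sketch: consuming per-item ok/error results from a batch tool call.
# `session` is an initialized mcp.ClientSession (see the earlier example);
# the `queries` argument and the results[] item keys are assumptions.
import json


async def run_batch(session, queries: list[str]) -> None:
    result = await session.call_tool(
        "search_engine_batch",
        arguments={"queries": [{"q": q} for q in queries]},
    )
    payload = json.loads(result.content[0].text)  # tool output arrives as text
    for item in payload.get("results", []):
        if item.get("ok"):
            print("ok:", item.get("query"), len(item.get("results", [])), "hits")
        else:
            print("failed:", item.get("query"), "-", item.get("error"))
```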
{thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "thordata-mcp-server"
- version = "0.4.4"
+ version = "0.5.0"
  description = "Official MCP Server for Thordata."
  authors = [{name = "Thordata Developer Team", email = "support@thordata.com"}]
  readme = "README.md"
@@ -13,7 +13,7 @@ license = "MIT"
  dependencies = [
      "mcp[cli]>=1.0.0",
      "sse-starlette>=1.6.1",
-     "thordata-sdk>=1.6.0",
+     "thordata-sdk>=1.7.0",
      "pydantic-settings",
      "markdownify",
      "html2text",
{thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/__init__.py

@@ -1,4 +1,4 @@
  """
  Thordata MCP Server package.
  """
- __version__ = "0.4.4"
+ __version__ = "0.5.0"
{thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/browser_session.py

@@ -1,8 +1,7 @@
  """Browser session management for Thordata Scraping Browser.

  This module provides a high-level wrapper around Playwright connected to
- Thordata's Scraping Browser (via `AsyncThordataClient.get_browser_connection_url`),
- inspired by Bright Data's browser session design but implemented in Python.
+ Thordata's Scraping Browser (via `AsyncThordataClient.get_browser_connection_url`).

  Design goals:
  - Domain-scoped browser sessions (one browser/page per domain).
@@ -18,6 +17,8 @@ from urllib.parse import urlparse

  from playwright.async_api import Browser, Page, Playwright, async_playwright

+ import time
+
  from thordata.async_client import AsyncThordataClient
  from .aria_snapshot import AriaSnapshotFilter

@@ -37,6 +38,11 @@ class BrowserSession:
          self._requests: Dict[str, Dict[Any, Any]] = {}
          self._dom_refs: Set[str] = set()
          self._current_domain: str = "default"
+         # Console and network diagnostics cache
+         self._console_messages: Dict[str, List[Dict[str, Any]]] = {}
+         self._network_requests: Dict[str, List[Dict[str, Any]]] = {}
+         self._max_console_messages = 10
+         self._max_network_requests = 20

      @staticmethod
      def _get_domain(url: str) -> str:
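The new `_max_console_messages` / `_max_network_requests` caps implement a simple bounded-tail cache: append, then keep only the newest N entries. A standalone sketch of that idiom (the names here are illustrative, not from the package):

```python
# Bounded-tail idiom used by the diagnostics caches above: after every
# append, trim the list down to its newest MAX_EVENTS entries so a
# long-lived page cannot grow the cache without bound.
MAX_EVENTS = 20
events: list[dict] = []


def record(event: dict) -> None:
    events.append(event)
    # Delete everything except the last MAX_EVENTS items; a shorter
    # list is left unchanged because the slice is empty.
    del events[:-MAX_EVENTS]


for i in range(50):
    record({"seq": i})
assert len(events) == MAX_EVENTS and events[0]["seq"] == 30
```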
@@ -139,11 +145,26 @@

          # Reset network tracking for this domain
          self._requests[domain] = {}
+         self._console_messages[domain] = []
+         self._network_requests[domain] = []

          async def on_request(request: Any) -> None:
              if domain in self._requests:
                  self._requests[domain][request] = None
-
+             try:
+                 self._network_requests.setdefault(domain, [])
+                 self._network_requests[domain].append(
+                     {
+                         "url": request.url,
+                         "method": request.method,
+                         "resourceType": getattr(request, "resource_type", None),
+                         "timestamp": int(time.time() * 1000),
+                     }
+                 )
+                 self._network_requests[domain] = self._network_requests[domain][-self._max_network_requests :]
+             except Exception:
+                 pass
+
          async def on_response(response: Any) -> None:
              if domain in self._requests:
                  try:
@@ -151,15 +172,78 @@
                  except Exception:
                      # Best-effort, non-fatal
                      pass
+             try:
+                 # Update last matching request with status
+                 req = response.request
+                 url = getattr(req, "url", None)
+                 if url and domain in self._network_requests:
+                     for item in reversed(self._network_requests[domain]):
+                         if item.get("url") == url and item.get("statusCode") is None:
+                             item["statusCode"] = response.status
+                             break
+             except Exception:
+                 pass

          page.on("request", on_request)
          page.on("response", on_response)
-
+
+         # Console message tracking
+         async def on_console(msg: Any) -> None:
+             try:
+                 self._console_messages.setdefault(domain, [])
+                 self._console_messages[domain].append(
+                     {
+                         "type": msg.type,
+                         "message": msg.text,
+                         "timestamp": int(time.time() * 1000),
+                     }
+                 )
+                 self._console_messages[domain] = self._console_messages[domain][-self._max_console_messages :]
+             except Exception:
+                 pass
+
+         page.on("console", on_console)
+
          self._pages[domain] = page
          return page

-     async def capture_snapshot(self, filtered: bool = True) -> Dict[str, Any]:
-         """Capture an ARIA-like snapshot and optional DOM snapshot."""
+     def get_console_tail(self, n: int = 10, domain: Optional[str] = None) -> List[Dict[str, Any]]:
+         """Return recent console messages for the given domain."""
+         d = domain or self._current_domain
+         items = self._console_messages.get(d, [])
+         return items[-max(0, int(n)) :]
+
+     def get_network_tail(self, n: int = 20, domain: Optional[str] = None) -> List[Dict[str, Any]]:
+         """Return recent network request summaries for the given domain."""
+         d = domain or self._current_domain
+         items = self._network_requests.get(d, [])
+         return items[-max(0, int(n)) :]
+
+     def reset_page(self, domain: Optional[str] = None) -> None:
+         """Drop cached page for a domain so the next call recreates it."""
+         d = domain or self._current_domain
+         self._pages.pop(d, None)
+         self._requests.pop(d, None)
+         self._console_messages.pop(d, None)
+         self._network_requests.pop(d, None)
+
+
+     async def capture_snapshot(
+         self,
+         *,
+         filtered: bool = True,
+         mode: str = "compact",
+         max_items: int = 80,
+         include_dom: bool = False,
+     ) -> Dict[str, Any]:
+         """Capture an ARIA-like snapshot and optional DOM snapshot.
+
+         Args:
+             filtered: Whether to apply AriaSnapshotFilter (legacy, kept for compatibility).
+             mode: "compact" | "full". Compact returns minimal interactive elements.
+             max_items: Maximum number of interactive elements to include (compact mode only).
+             include_dom: Whether to include dom_snapshot (compact mode defaults to False).
+         """
          page = await self.get_page()

          try:
@@ -175,16 +259,64 @@
                  "aria_snapshot": full_snapshot,
              }

+         if mode == "compact":
+             # Compact: return only filtered interactive elements, optionally without dom_snapshot
+             filtered_snapshot = AriaSnapshotFilter.filter_snapshot(full_snapshot)
+             filtered_snapshot = self._limit_aria_snapshot_items(filtered_snapshot, max_items=max_items)
+             dom_snapshot = None
+             if include_dom:
+                 dom_snapshot_raw = await self._capture_dom_snapshot(page)
+                 self._dom_refs = {el["ref"] for el in dom_snapshot_raw}
+                 dom_snapshot = AriaSnapshotFilter.format_dom_elements(dom_snapshot_raw)
+             return {
+                 "url": page.url,
+                 "title": await page.title(),
+                 "aria_snapshot": filtered_snapshot,
+                 "dom_snapshot": dom_snapshot,
+                 "_meta": {"mode": mode, "max_items": max_items, "include_dom": include_dom},
+             }
+
+         # Full mode: include both filtered aria and dom_snapshot (legacy behavior)
          filtered_snapshot = AriaSnapshotFilter.filter_snapshot(full_snapshot)
-         dom_snapshot = await self._capture_dom_snapshot(page)
-         self._dom_refs = {el["ref"] for el in dom_snapshot}
-
+         dom_snapshot_raw = await self._capture_dom_snapshot(page)
+         self._dom_refs = {el["ref"] for el in dom_snapshot_raw}
          return {
              "url": page.url,
              "title": await page.title(),
              "aria_snapshot": filtered_snapshot,
-             "dom_snapshot": AriaSnapshotFilter.format_dom_elements(dom_snapshot),
+             "dom_snapshot": AriaSnapshotFilter.format_dom_elements(dom_snapshot_raw),
+             "_meta": {"mode": mode},
          }
+
+     @staticmethod
+     def _limit_aria_snapshot_items(text: str, *, max_items: int) -> str:
+         """Limit snapshot to the first N interactive element blocks.
+
+         The snapshot format is a list where each element starts with a line beginning
+         with '- ' (Playwright raw) or '[' (AriaSnapshotFilter compact), and may include
+         one or more indented continuation lines.
+         """
+         try:
+             n = int(max_items)
+         except Exception:
+             n = 80
+         if n <= 0:
+             return ""
+         if not text:
+             return text
+
+         lines = text.splitlines()
+         out: list[str] = []
+         items = 0
+         for line in lines:
+             if line.startswith("- ") or line.startswith("["):
+                 if items >= n:
+                     break
+                 items += 1
+             # Include continuation lines only if we've started collecting items.
+             if items > 0:
+                 out.append(line)
+         return "\n".join(out).strip()

      async def _get_interactive_snapshot(self, page: Page) -> str:
          """Generate a text snapshot of interactive elements with refs."""
@@ -194,12 +326,25 @@
              const lines = [];
              let refCounter = 0;

+             function normalizeRole(tag, explicitRole) {
+                 const role = (explicitRole || '').toLowerCase();
+                 const t = (tag || '').toLowerCase();
+                 if (role) return role;
+                 // Map common interactive tags to standard ARIA roles
+                 if (t === 'a') return 'link';
+                 if (t === 'button') return 'button';
+                 if (t === 'input') return 'textbox';
+                 if (t === 'select') return 'combobox';
+                 if (t === 'textarea') return 'textbox';
+                 return t;
+             }
+
              function traverse(node) {
                  if (node.nodeType === Node.ELEMENT_NODE) {
-                     const role = node.getAttribute('role') || node.tagName.toLowerCase();
                      const tag = node.tagName.toLowerCase();
                      const interactiveTag = ['a', 'button', 'input', 'select', 'textarea'].includes(tag);
-                     const interactiveRole = ['button', 'link', 'textbox', 'checkbox'].includes(role);
+                     const role = normalizeRole(tag, node.getAttribute('role'));
+                     const interactiveRole = ['button', 'link', 'textbox', 'searchbox', 'combobox', 'checkbox', 'radio', 'switch', 'tab', 'menuitem', 'option'].includes(role);

                      if (interactiveTag || interactiveRole) {
                          if (!node.dataset.fastmcpRef) {
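Taken together, the browser_session.py changes suggest a usage pattern like the following. This is a sketch only: it assumes a `BrowserSession` wired to a configured `AsyncThordataClient`, and the `get_page(url)` signature is inferred from the surrounding hunks rather than shown in this diff.

```python
# Sketch: compact snapshot plus the new diagnostics tails.
from thordata_mcp.browser_session import BrowserSession


async def inspect(session: BrowserSession, url: str) -> None:
    await session.get_page(url)  # navigate; attaches the listeners added above (signature assumed)
    snap = await session.capture_snapshot(mode="compact", max_items=40)
    print(snap["title"], snap["_meta"])
    for msg in session.get_console_tail(5):    # newest console messages
        print(msg["type"], msg["message"])
    for req in session.get_network_tail(10):   # newest network requests
        print(req["method"], req["url"], req.get("statusCode"))
```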
{thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/config.py

@@ -6,6 +6,14 @@ from pydantic_settings import BaseSettings
  class Settings(BaseSettings):
      """Environment-driven configuration for the MCP server."""

+     # MCP tool exposure mode (BrightData-like)
+     # - rapid: minimal core tools
+     # - pro: all tools
+     # - custom: enable by THORDATA_GROUPS and THORDATA_TOOLS
+     THORDATA_MODE: str = "rapid"
+     THORDATA_GROUPS: str | None = None
+     THORDATA_TOOLS: str | None = None
+
      # Thordata credentials
      THORDATA_SCRAPER_TOKEN: str | None = None
      THORDATA_PUBLIC_TOKEN: str | None = None
@@ -20,9 +28,9 @@
      # Tasks discovery UX (to avoid dumping hundreds of tools to the client by default)
      # - mode=curated: only return tools from THORDATA_TASKS_GROUPS, with pagination
      # - mode=all: return all discovered tools
-     # Default to listing ALL Web Scraper tasks, but paginated (no env changes required for “100+ tools” use-case).
-     THORDATA_TASKS_LIST_MODE: str = "all"
-     THORDATA_TASKS_LIST_DEFAULT_LIMIT: int = 100
+     # Default to curated mode to reduce tool selection noise for LLMs.
+     THORDATA_TASKS_LIST_MODE: str = "curated"
+     THORDATA_TASKS_LIST_DEFAULT_LIMIT: int = 60
      THORDATA_TASKS_GROUPS: str = "ecommerce,social,video,search,travel,code,professional"

      # Optional: restrict which SDK tool_keys are allowed to execute (safety/UX)
@@ -49,6 +57,9 @@
      # Logging
      LOG_LEVEL: str = "INFO"

+     # Debug tools exposure
+     THORDATA_DEBUG_TOOLS: bool = False
+
      class Config:
          env_file = ".env"
          extra = "ignore"
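A hedged `.env` sketch for the exposure knobs added above. Only the variable names and the rapid/pro/custom mode values come from the diff; the group and tool values are illustrative placeholders.

```env
# Tool exposure mode: rapid (minimal core tools), pro (all tools), or custom
THORDATA_MODE=custom
# Consulted only when THORDATA_MODE=custom (placeholder values)
THORDATA_GROUPS=search,unlocker
THORDATA_TOOLS=search_engine,smart_scrape
# Opt-in debug tools (off by default)
THORDATA_DEBUG_TOOLS=false
```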
{thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/context.py

@@ -35,4 +35,4 @@

          if cls._client:
              await cls._client.close()
-             cls._client = None
+             cls._client = None