thordata-mcp-server 0.4.4__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/PKG-INFO +29 -54
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/README.md +27 -52
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/pyproject.toml +2 -2
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/__init__.py +1 -1
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/browser_session.py +157 -12
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/config.py +14 -3
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/context.py +1 -1
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/browser.py +124 -18
- thordata_mcp_server-0.5.0/src/thordata_mcp/tools/debug.py +125 -0
- thordata_mcp_server-0.5.0/src/thordata_mcp/tools/params_utils.py +107 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/product.py +83 -5
- thordata_mcp_server-0.5.0/src/thordata_mcp/tools/product_compact.py +2108 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/utils.py +2 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/utils.py +393 -322
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/PKG-INFO +29 -54
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/SOURCES.txt +2 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/requires.txt +1 -1
- thordata_mcp_server-0.4.4/src/thordata_mcp/tools/product_compact.py +0 -962
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/setup.cfg +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/aria_snapshot.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/debug_http.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/main.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/monitoring.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/registry.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/__init__.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/__init__.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/serp.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/tasks.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp/tools/data/universal.py +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/dependency_links.txt +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/entry_points.txt +0 -0
- {thordata_mcp_server-0.4.4 → thordata_mcp_server-0.5.0}/src/thordata_mcp_server.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: thordata-mcp-server
|
|
3
|
-
Version: 0.4.4
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Official MCP Server for Thordata.
|
|
5
5
|
Author-email: Thordata Developer Team <support@thordata.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,7 +8,7 @@ Requires-Python: >=3.10
|
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: mcp[cli]>=1.0.0
|
|
10
10
|
Requires-Dist: sse-starlette>=1.6.1
|
|
11
|
-
Requires-Dist: thordata-sdk>=1.
|
|
11
|
+
Requires-Dist: thordata-sdk>=1.7.0
|
|
12
12
|
Requires-Dist: pydantic-settings
|
|
13
13
|
Requires-Dist: markdownify
|
|
14
14
|
Requires-Dist: html2text
|
|
@@ -23,14 +23,14 @@ Requires-Dist: uvicorn
|
|
|
23
23
|
|
|
24
24
|
**Give your AI Agents real-time web scraping superpowers.**
|
|
25
25
|
|
|
26
|
-
This MCP Server version has been **streamlined to focus on scraping**, concentrating on
|
|
26
|
+
This MCP Server version has been **streamlined to focus on scraping**, concentrating on a compact, LLM‑friendly tool surface:
|
|
27
27
|
|
|
28
|
-
- **
|
|
28
|
+
- **Search Engine** (LLM-friendly web search wrapper)
|
|
29
|
+
- **SERP API** (Search result scraping, internal plumbing)
|
|
29
30
|
- **Web Unlocker / Universal Scraper** (Universal page unlocking & scraping)
|
|
30
|
-
- **Web Scraper API** (Structured task flow)
|
|
31
31
|
- **Scraping Browser** (Browser-level scraping)
|
|
32
32
|
|
|
33
|
-
Earlier versions exposed `proxy.*` / `account.*` / `proxy_users.*` proxy and account management tools. This version
|
|
33
|
+
Earlier versions exposed `proxy.*` / `account.*` / `proxy_users.*` proxy and account management tools, and a large `web_scraper` task surface. This version removes those control plane interfaces from MCP, keeping only scraping-related capabilities that are easy for LLMs to use.
|
|
34
34
|
|
|
35
35
|
## 🚀 Features
|
|
36
36
|
|
|
@@ -76,38 +76,15 @@ THORDATA_BROWSER_PASSWORD=your_password
|
|
|
76
76
|
|
|
77
77
|
### Tool Exposure Modes
|
|
78
78
|
|
|
79
|
-
Current implementation provides **
|
|
79
|
+
Current implementation provides a **compact scraping tool surface**, optimized for Cursor / LLM tool callers:
|
|
80
80
|
|
|
81
|
-
-
|
|
82
|
-
-
|
|
83
|
-
-
|
|
84
|
-
-
|
|
85
|
-
-
|
|
81
|
+
- **`search_engine`** (recommended for LLMs): high-level web search wrapper, returns a light `results[]` array with `title/link/description`. Internally delegates to the SERP backend.
|
|
82
|
+
- **`search_engine_batch`**: batch variant of `search_engine` with per-item `ok/error` results.
|
|
83
|
+
- **`unlocker`**: actions `fetch`, `batch_fetch` – universal page unlock & content extraction (HTML/Markdown), with per-item error reporting for batch.
|
|
84
|
+
- **`browser`**: action `snapshot` – navigate (optional `url`) and capture an ARIA-focused snapshot for interactive elements.
|
|
85
|
+
- **`smart_scrape`**: auto-picks the best scraper (SERP, Web Scraper, Unlocker) for a given URL and returns a unified, LLM-friendly response.
|
|
86
86
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
### Web Scraper discovery (100+ tools, no extra env required)
|
|
90
|
-
|
|
91
|
-
Use `web_scraper` with `action="catalog"` / `action="groups"` to discover tools.
|
|
92
|
-
This keeps Cursor/LLMs usable while still supporting **100+ tools** under a single entrypoint.
|
|
93
|
-
|
|
94
|
-
```env
|
|
95
|
-
# Default: curated + limit 60
|
|
96
|
-
THORDATA_TASKS_LIST_MODE=curated
|
|
97
|
-
THORDATA_TASKS_LIST_DEFAULT_LIMIT=60
|
|
98
|
-
|
|
99
|
-
# Which groups are included when mode=curated
|
|
100
|
-
THORDATA_TASKS_GROUPS=ecommerce,social,video,search,travel,code,professional
|
|
101
|
-
|
|
102
|
-
# Optional safety/UX: restrict which tools can actually run
|
|
103
|
-
# (comma-separated prefixes or exact tool keys)
|
|
104
|
-
# Example:
|
|
105
|
-
# THORDATA_TASKS_ALLOWLIST=thordata.tools.video.,thordata.tools.ecommerce.Amazon.ProductByAsin
|
|
106
|
-
THORDATA_TASKS_ALLOWLIST=
|
|
107
|
-
```
|
|
108
|
-
|
|
109
|
-
If you want Cursor to **never** see the full 300+ tool list, keep `THORDATA_TASKS_LIST_MODE=curated`
|
|
110
|
-
and optionally set `THORDATA_TASKS_ALLOWLIST` to the small subset you actually want to support.
|
|
87
|
+
Internally, the server still uses structured SERP and Web Scraper capabilities, but they are not exposed as large tool surfaces by default to avoid overwhelming LLMs.
|
|
111
88
|
|
|
112
89
|
### Deployment (Optional)
|
|
113
90
|
|
|
@@ -162,19 +139,17 @@ Add this to your `claude_desktop_config.json`:
|
|
|
162
139
|
Notes:
|
|
163
140
|
- `THORDATA_BROWSER_USERNAME` / `THORDATA_BROWSER_PASSWORD` are required for `browser.*` tools (Scraping Browser).
|
|
164
141
|
|
|
165
|
-
## 🛠️ Available Tools
|
|
166
|
-
|
|
167
|
-
### Available Tools (All directly related to scraping)
|
|
142
|
+
## 🛠️ Available Tools (Compact Surface)
|
|
168
143
|
|
|
169
|
-
|
|
144
|
+
By default, the MCP server exposes a **small, LLM-friendly tool set**:
|
|
170
145
|
|
|
171
|
-
- **`
|
|
172
|
-
- **`
|
|
173
|
-
- **`
|
|
174
|
-
- **`browser`**:
|
|
175
|
-
- **`smart_scrape`**:
|
|
146
|
+
- **`search_engine`**: single-query web search (`params.q`, optional `params.num`, `params.engine`).
|
|
147
|
+
- **`search_engine_batch`**: batch web search with per-item `ok/error` in `results[]`.
|
|
148
|
+
- **`unlocker`**: universal scraping via `fetch` / `batch_fetch`.
|
|
149
|
+
- **`browser`**: `snapshot` with optional `url`, `max_items`, and `max_chars`.
|
|
150
|
+
- **`smart_scrape`**: smart router for `url` with optional preview limit parameters.
|
|
176
151
|
|
|
177
|
-
|
|
152
|
+
Advanced / internal tools (e.g. low-level `serp.*`, full `web_scraper.*` surfaces, proxy/account control plane) remain available via HTTP APIs and SDKs, but are not exposed directly as MCP tools to keep the surface manageable for agents and LLMs.
|
|
178
153
|
|
|
179
154
|
## 🏗️ Architecture
|
|
180
155
|
|
|
@@ -189,14 +164,14 @@ thordata_mcp/
|
|
|
189
164
|
├── utils.py # Common utilities (error handling, responses)
|
|
190
165
|
├── browser_session.py # Browser session management (Playwright)
|
|
191
166
|
├── aria_snapshot.py # ARIA snapshot filtering
|
|
192
|
-
└── tools/
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
167
|
+
└── tools/
|
|
168
|
+
├── product_compact.py # Streamlined MCP entrypoint (search_engine / unlocker / browser / smart_scrape, plus batch variants)
|
|
169
|
+
├── product.py # Full product implementation for internal use (reused by compact version)
|
|
170
|
+
├── data/ # Data plane tools (only scraping-related namespaces retained)
|
|
171
|
+
│ ├── serp.py # SERP backend integration
|
|
172
|
+
│ ├── universal.py # Universal / Unlocker backend integration
|
|
173
|
+
│ ├── browser.py # Browser / Playwright helpers
|
|
174
|
+
│ └── tasks.py # Structured scraping tasks (used by smart_scrape and internal flows)
|
|
200
175
|
```
|
|
201
176
|
|
|
202
177
|
## 🎯 Design Principles
|
|
@@ -2,14 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
**Give your AI Agents real-time web scraping superpowers.**
|
|
4
4
|
|
|
5
|
-
This MCP Server version has been **streamlined to focus on scraping**, concentrating on
|
|
5
|
+
This MCP Server version has been **streamlined to focus on scraping**, concentrating on a compact, LLM‑friendly tool surface:
|
|
6
6
|
|
|
7
|
-
- **
|
|
7
|
+
- **Search Engine** (LLM-friendly web search wrapper)
|
|
8
|
+
- **SERP API** (Search result scraping, internal plumbing)
|
|
8
9
|
- **Web Unlocker / Universal Scraper** (Universal page unlocking & scraping)
|
|
9
|
-
- **Web Scraper API** (Structured task flow)
|
|
10
10
|
- **Scraping Browser** (Browser-level scraping)
|
|
11
11
|
|
|
12
|
-
Earlier versions exposed `proxy.*` / `account.*` / `proxy_users.*` proxy and account management tools. This version
|
|
12
|
+
Earlier versions exposed `proxy.*` / `account.*` / `proxy_users.*` proxy and account management tools, and a large `web_scraper` task surface. This version removes those control plane interfaces from MCP, keeping only scraping-related capabilities that are easy for LLMs to use.
|
|
13
13
|
|
|
14
14
|
## 🚀 Features
|
|
15
15
|
|
|
@@ -55,38 +55,15 @@ THORDATA_BROWSER_PASSWORD=your_password
|
|
|
55
55
|
|
|
56
56
|
### Tool Exposure Modes
|
|
57
57
|
|
|
58
|
-
Current implementation provides **
|
|
58
|
+
Current implementation provides a **compact scraping tool surface**, optimized for Cursor / LLM tool callers:
|
|
59
59
|
|
|
60
|
-
-
|
|
61
|
-
-
|
|
62
|
-
-
|
|
63
|
-
-
|
|
64
|
-
-
|
|
60
|
+
- **`search_engine`** (recommended for LLMs): high-level web search wrapper, returns a light `results[]` array with `title/link/description`. Internally delegates to the SERP backend.
|
|
61
|
+
- **`search_engine_batch`**: batch variant of `search_engine` with per-item `ok/error` results.
|
|
62
|
+
- **`unlocker`**: actions `fetch`, `batch_fetch` – universal page unlock & content extraction (HTML/Markdown), with per-item error reporting for batch.
|
|
63
|
+
- **`browser`**: action `snapshot` – navigate (optional `url`) and capture an ARIA-focused snapshot for interactive elements.
|
|
64
|
+
- **`smart_scrape`**: auto-picks the best scraper (SERP, Web Scraper, Unlocker) for a given URL and returns a unified, LLM-friendly response.
|
|
65
65
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
### Web Scraper discovery (100+ tools, no extra env required)
|
|
69
|
-
|
|
70
|
-
Use `web_scraper` with `action="catalog"` / `action="groups"` to discover tools.
|
|
71
|
-
This keeps Cursor/LLMs usable while still supporting **100+ tools** under a single entrypoint.
|
|
72
|
-
|
|
73
|
-
```env
|
|
74
|
-
# Default: curated + limit 60
|
|
75
|
-
THORDATA_TASKS_LIST_MODE=curated
|
|
76
|
-
THORDATA_TASKS_LIST_DEFAULT_LIMIT=60
|
|
77
|
-
|
|
78
|
-
# Which groups are included when mode=curated
|
|
79
|
-
THORDATA_TASKS_GROUPS=ecommerce,social,video,search,travel,code,professional
|
|
80
|
-
|
|
81
|
-
# Optional safety/UX: restrict which tools can actually run
|
|
82
|
-
# (comma-separated prefixes or exact tool keys)
|
|
83
|
-
# Example:
|
|
84
|
-
# THORDATA_TASKS_ALLOWLIST=thordata.tools.video.,thordata.tools.ecommerce.Amazon.ProductByAsin
|
|
85
|
-
THORDATA_TASKS_ALLOWLIST=
|
|
86
|
-
```
|
|
87
|
-
|
|
88
|
-
If you want Cursor to **never** see the full 300+ tool list, keep `THORDATA_TASKS_LIST_MODE=curated`
|
|
89
|
-
and optionally set `THORDATA_TASKS_ALLOWLIST` to the small subset you actually want to support.
|
|
66
|
+
Internally, the server still uses structured SERP and Web Scraper capabilities, but they are not exposed as large tool surfaces by default to avoid overwhelming LLMs.
|
|
90
67
|
|
|
91
68
|
### Deployment (Optional)
|
|
92
69
|
|
|
@@ -141,19 +118,17 @@ Add this to your `claude_desktop_config.json`:
|
|
|
141
118
|
Notes:
|
|
142
119
|
- `THORDATA_BROWSER_USERNAME` / `THORDATA_BROWSER_PASSWORD` are required for `browser.*` tools (Scraping Browser).
|
|
143
120
|
|
|
144
|
-
## 🛠️ Available Tools
|
|
145
|
-
|
|
146
|
-
### Available Tools (All directly related to scraping)
|
|
121
|
+
## 🛠️ Available Tools (Compact Surface)
|
|
147
122
|
|
|
148
|
-
|
|
123
|
+
By default, the MCP server exposes a **small, LLM-friendly tool set**:
|
|
149
124
|
|
|
150
|
-
- **`
|
|
151
|
-
- **`
|
|
152
|
-
- **`
|
|
153
|
-
- **`browser`**:
|
|
154
|
-
- **`smart_scrape`**:
|
|
125
|
+
- **`search_engine`**: single-query web search (`params.q`, optional `params.num`, `params.engine`).
|
|
126
|
+
- **`search_engine_batch`**: batch web search with per-item `ok/error` in `results[]`.
|
|
127
|
+
- **`unlocker`**: universal scraping via `fetch` / `batch_fetch`.
|
|
128
|
+
- **`browser`**: `snapshot` with optional `url`, `max_items`, and `max_chars`.
|
|
129
|
+
- **`smart_scrape`**: smart router for `url` with optional preview limit parameters.
|
|
155
130
|
|
|
156
|
-
|
|
131
|
+
Advanced / internal tools (e.g. low-level `serp.*`, full `web_scraper.*` surfaces, proxy/account control plane) remain available via HTTP APIs and SDKs, but are not exposed directly as MCP tools to keep the surface manageable for agents and LLMs.
|
|
157
132
|
|
|
158
133
|
## 🏗️ Architecture
|
|
159
134
|
|
|
@@ -168,14 +143,14 @@ thordata_mcp/
|
|
|
168
143
|
├── utils.py # Common utilities (error handling, responses)
|
|
169
144
|
├── browser_session.py # Browser session management (Playwright)
|
|
170
145
|
├── aria_snapshot.py # ARIA snapshot filtering
|
|
171
|
-
└── tools/
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
146
|
+
└── tools/
|
|
147
|
+
├── product_compact.py # Streamlined MCP entrypoint (search_engine / unlocker / browser / smart_scrape, plus batch variants)
|
|
148
|
+
├── product.py # Full product implementation for internal use (reused by compact version)
|
|
149
|
+
├── data/ # Data plane tools (only scraping-related namespaces retained)
|
|
150
|
+
│ ├── serp.py # SERP backend integration
|
|
151
|
+
│ ├── universal.py # Universal / Unlocker backend integration
|
|
152
|
+
│ ├── browser.py # Browser / Playwright helpers
|
|
153
|
+
│ └── tasks.py # Structured scraping tasks (used by smart_scrape and internal flows)
|
|
179
154
|
```
|
|
180
155
|
|
|
181
156
|
## 🎯 Design Principles
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "thordata-mcp-server"
|
|
7
|
-
version = "0.4.4"
|
|
7
|
+
version = "0.5.0"
|
|
8
8
|
description = "Official MCP Server for Thordata."
|
|
9
9
|
authors = [{name = "Thordata Developer Team", email = "support@thordata.com"}]
|
|
10
10
|
readme = "README.md"
|
|
@@ -13,7 +13,7 @@ license = "MIT"
|
|
|
13
13
|
dependencies = [
|
|
14
14
|
"mcp[cli]>=1.0.0",
|
|
15
15
|
"sse-starlette>=1.6.1",
|
|
16
|
-
"thordata-sdk>=1.
|
|
16
|
+
"thordata-sdk>=1.7.0",
|
|
17
17
|
"pydantic-settings",
|
|
18
18
|
"markdownify",
|
|
19
19
|
"html2text",
|
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
"""Browser session management for Thordata Scraping Browser.
|
|
2
2
|
|
|
3
3
|
This module provides a high-level wrapper around Playwright connected to
|
|
4
|
-
Thordata's Scraping Browser (via `AsyncThordataClient.get_browser_connection_url`)
|
|
5
|
-
inspired by Bright Data's browser session design but implemented in Python.
|
|
4
|
+
Thordata's Scraping Browser (via `AsyncThordataClient.get_browser_connection_url`).
|
|
6
5
|
|
|
7
6
|
Design goals:
|
|
8
7
|
- Domain-scoped browser sessions (one browser/page per domain).
|
|
@@ -18,6 +17,8 @@ from urllib.parse import urlparse
|
|
|
18
17
|
|
|
19
18
|
from playwright.async_api import Browser, Page, Playwright, async_playwright
|
|
20
19
|
|
|
20
|
+
import time
|
|
21
|
+
|
|
21
22
|
from thordata.async_client import AsyncThordataClient
|
|
22
23
|
|
|
23
24
|
from .aria_snapshot import AriaSnapshotFilter
|
|
@@ -37,6 +38,11 @@ class BrowserSession:
|
|
|
37
38
|
self._requests: Dict[str, Dict[Any, Any]] = {}
|
|
38
39
|
self._dom_refs: Set[str] = set()
|
|
39
40
|
self._current_domain: str = "default"
|
|
41
|
+
# Console and network diagnostics cache
|
|
42
|
+
self._console_messages: Dict[str, List[Dict[str, Any]]] = {}
|
|
43
|
+
self._network_requests: Dict[str, List[Dict[str, Any]]] = {}
|
|
44
|
+
self._max_console_messages = 10
|
|
45
|
+
self._max_network_requests = 20
|
|
40
46
|
|
|
41
47
|
@staticmethod
|
|
42
48
|
def _get_domain(url: str) -> str:
|
|
@@ -139,11 +145,26 @@ class BrowserSession:
|
|
|
139
145
|
|
|
140
146
|
# Reset network tracking for this domain
|
|
141
147
|
self._requests[domain] = {}
|
|
148
|
+
self._console_messages[domain] = []
|
|
149
|
+
self._network_requests[domain] = []
|
|
142
150
|
|
|
143
151
|
async def on_request(request: Any) -> None:
|
|
144
152
|
if domain in self._requests:
|
|
145
153
|
self._requests[domain][request] = None
|
|
146
|
-
|
|
154
|
+
try:
|
|
155
|
+
self._network_requests.setdefault(domain, [])
|
|
156
|
+
self._network_requests[domain].append(
|
|
157
|
+
{
|
|
158
|
+
"url": request.url,
|
|
159
|
+
"method": request.method,
|
|
160
|
+
"resourceType": getattr(request, "resource_type", None),
|
|
161
|
+
"timestamp": int(time.time() * 1000),
|
|
162
|
+
}
|
|
163
|
+
)
|
|
164
|
+
self._network_requests[domain] = self._network_requests[domain][-self._max_network_requests :]
|
|
165
|
+
except Exception:
|
|
166
|
+
pass
|
|
167
|
+
|
|
147
168
|
async def on_response(response: Any) -> None:
|
|
148
169
|
if domain in self._requests:
|
|
149
170
|
try:
|
|
@@ -151,15 +172,78 @@ class BrowserSession:
|
|
|
151
172
|
except Exception:
|
|
152
173
|
# Best-effort, non-fatal
|
|
153
174
|
pass
|
|
175
|
+
try:
|
|
176
|
+
# Update last matching request with status
|
|
177
|
+
req = response.request
|
|
178
|
+
url = getattr(req, "url", None)
|
|
179
|
+
if url and domain in self._network_requests:
|
|
180
|
+
for item in reversed(self._network_requests[domain]):
|
|
181
|
+
if item.get("url") == url and item.get("statusCode") is None:
|
|
182
|
+
item["statusCode"] = response.status
|
|
183
|
+
break
|
|
184
|
+
except Exception:
|
|
185
|
+
pass
|
|
154
186
|
|
|
155
187
|
page.on("request", on_request)
|
|
156
188
|
page.on("response", on_response)
|
|
157
|
-
|
|
189
|
+
|
|
190
|
+
# Console message tracking
|
|
191
|
+
async def on_console(msg: Any) -> None:
|
|
192
|
+
try:
|
|
193
|
+
self._console_messages.setdefault(domain, [])
|
|
194
|
+
self._console_messages[domain].append(
|
|
195
|
+
{
|
|
196
|
+
"type": msg.type,
|
|
197
|
+
"message": msg.text,
|
|
198
|
+
"timestamp": int(time.time() * 1000),
|
|
199
|
+
}
|
|
200
|
+
)
|
|
201
|
+
self._console_messages[domain] = self._console_messages[domain][-self._max_console_messages :]
|
|
202
|
+
except Exception:
|
|
203
|
+
pass
|
|
204
|
+
|
|
205
|
+
page.on("console", on_console)
|
|
206
|
+
|
|
158
207
|
self._pages[domain] = page
|
|
159
208
|
return page
|
|
160
209
|
|
|
161
|
-
|
|
162
|
-
"""
|
|
210
|
+
def get_console_tail(self, n: int = 10, domain: Optional[str] = None) -> List[Dict[str, Any]]:
|
|
211
|
+
"""Return recent console messages for the given domain."""
|
|
212
|
+
d = domain or self._current_domain
|
|
213
|
+
items = self._console_messages.get(d, [])
|
|
214
|
+
return items[-max(0, int(n)) :]
|
|
215
|
+
|
|
216
|
+
def get_network_tail(self, n: int = 20, domain: Optional[str] = None) -> List[Dict[str, Any]]:
|
|
217
|
+
"""Return recent network request summaries for the given domain."""
|
|
218
|
+
d = domain or self._current_domain
|
|
219
|
+
items = self._network_requests.get(d, [])
|
|
220
|
+
return items[-max(0, int(n)) :]
|
|
221
|
+
|
|
222
|
+
def reset_page(self, domain: Optional[str] = None) -> None:
|
|
223
|
+
"""Drop cached page for a domain so the next call recreates it."""
|
|
224
|
+
d = domain or self._current_domain
|
|
225
|
+
self._pages.pop(d, None)
|
|
226
|
+
self._requests.pop(d, None)
|
|
227
|
+
self._console_messages.pop(d, None)
|
|
228
|
+
self._network_requests.pop(d, None)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
async def capture_snapshot(
|
|
232
|
+
self,
|
|
233
|
+
*,
|
|
234
|
+
filtered: bool = True,
|
|
235
|
+
mode: str = "compact",
|
|
236
|
+
max_items: int = 80,
|
|
237
|
+
include_dom: bool = False,
|
|
238
|
+
) -> Dict[str, Any]:
|
|
239
|
+
"""Capture an ARIA-like snapshot and optional DOM snapshot.
|
|
240
|
+
|
|
241
|
+
Args:
|
|
242
|
+
filtered: Whether to apply AriaSnapshotFilter (legacy, kept for compatibility).
|
|
243
|
+
mode: "compact" | "full". Compact returns minimal interactive elements.
|
|
244
|
+
max_items: Maximum number of interactive elements to include (compact mode only).
|
|
245
|
+
include_dom: Whether to include dom_snapshot (compact mode defaults to False).
|
|
246
|
+
"""
|
|
163
247
|
page = await self.get_page()
|
|
164
248
|
|
|
165
249
|
try:
|
|
@@ -175,16 +259,64 @@ class BrowserSession:
|
|
|
175
259
|
"aria_snapshot": full_snapshot,
|
|
176
260
|
}
|
|
177
261
|
|
|
262
|
+
if mode == "compact":
|
|
263
|
+
# Compact: return only filtered interactive elements, optionally without dom_snapshot
|
|
264
|
+
filtered_snapshot = AriaSnapshotFilter.filter_snapshot(full_snapshot)
|
|
265
|
+
filtered_snapshot = self._limit_aria_snapshot_items(filtered_snapshot, max_items=max_items)
|
|
266
|
+
dom_snapshot = None
|
|
267
|
+
if include_dom:
|
|
268
|
+
dom_snapshot_raw = await self._capture_dom_snapshot(page)
|
|
269
|
+
self._dom_refs = {el["ref"] for el in dom_snapshot_raw}
|
|
270
|
+
dom_snapshot = AriaSnapshotFilter.format_dom_elements(dom_snapshot_raw)
|
|
271
|
+
return {
|
|
272
|
+
"url": page.url,
|
|
273
|
+
"title": await page.title(),
|
|
274
|
+
"aria_snapshot": filtered_snapshot,
|
|
275
|
+
"dom_snapshot": dom_snapshot,
|
|
276
|
+
"_meta": {"mode": mode, "max_items": max_items, "include_dom": include_dom},
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
# Full mode: include both filtered aria and dom_snapshot (legacy behavior)
|
|
178
280
|
filtered_snapshot = AriaSnapshotFilter.filter_snapshot(full_snapshot)
|
|
179
|
-
|
|
180
|
-
self._dom_refs = {el["ref"] for el in
|
|
181
|
-
|
|
281
|
+
dom_snapshot_raw = await self._capture_dom_snapshot(page)
|
|
282
|
+
self._dom_refs = {el["ref"] for el in dom_snapshot_raw}
|
|
182
283
|
return {
|
|
183
284
|
"url": page.url,
|
|
184
285
|
"title": await page.title(),
|
|
185
286
|
"aria_snapshot": filtered_snapshot,
|
|
186
|
-
"dom_snapshot": AriaSnapshotFilter.format_dom_elements(
|
|
287
|
+
"dom_snapshot": AriaSnapshotFilter.format_dom_elements(dom_snapshot_raw),
|
|
288
|
+
"_meta": {"mode": mode},
|
|
187
289
|
}
|
|
290
|
+
|
|
291
|
+
@staticmethod
|
|
292
|
+
def _limit_aria_snapshot_items(text: str, *, max_items: int) -> str:
|
|
293
|
+
"""Limit snapshot to the first N interactive element blocks.
|
|
294
|
+
|
|
295
|
+
The snapshot format is a list where each element starts with a line beginning
|
|
296
|
+
with '- ' (Playwright raw) or '[' (AriaSnapshotFilter compact), and may include
|
|
297
|
+
one or more indented continuation lines.
|
|
298
|
+
"""
|
|
299
|
+
try:
|
|
300
|
+
n = int(max_items)
|
|
301
|
+
except Exception:
|
|
302
|
+
n = 80
|
|
303
|
+
if n <= 0:
|
|
304
|
+
return ""
|
|
305
|
+
if not text:
|
|
306
|
+
return text
|
|
307
|
+
|
|
308
|
+
lines = text.splitlines()
|
|
309
|
+
out: list[str] = []
|
|
310
|
+
items = 0
|
|
311
|
+
for line in lines:
|
|
312
|
+
if line.startswith("- ") or line.startswith("["):
|
|
313
|
+
if items >= n:
|
|
314
|
+
break
|
|
315
|
+
items += 1
|
|
316
|
+
# Include continuation lines only if we've started collecting items.
|
|
317
|
+
if items > 0:
|
|
318
|
+
out.append(line)
|
|
319
|
+
return "\n".join(out).strip()
|
|
188
320
|
|
|
189
321
|
async def _get_interactive_snapshot(self, page: Page) -> str:
|
|
190
322
|
"""Generate a text snapshot of interactive elements with refs."""
|
|
@@ -194,12 +326,25 @@ class BrowserSession:
|
|
|
194
326
|
const lines = [];
|
|
195
327
|
let refCounter = 0;
|
|
196
328
|
|
|
329
|
+
function normalizeRole(tag, explicitRole) {
|
|
330
|
+
const role = (explicitRole || '').toLowerCase();
|
|
331
|
+
const t = (tag || '').toLowerCase();
|
|
332
|
+
if (role) return role;
|
|
333
|
+
// Map common interactive tags to standard ARIA roles
|
|
334
|
+
if (t === 'a') return 'link';
|
|
335
|
+
if (t === 'button') return 'button';
|
|
336
|
+
if (t === 'input') return 'textbox';
|
|
337
|
+
if (t === 'select') return 'combobox';
|
|
338
|
+
if (t === 'textarea') return 'textbox';
|
|
339
|
+
return t;
|
|
340
|
+
}
|
|
341
|
+
|
|
197
342
|
function traverse(node) {
|
|
198
343
|
if (node.nodeType === Node.ELEMENT_NODE) {
|
|
199
|
-
const role = node.getAttribute('role') || node.tagName.toLowerCase();
|
|
200
344
|
const tag = node.tagName.toLowerCase();
|
|
201
345
|
const interactiveTag = ['a', 'button', 'input', 'select', 'textarea'].includes(tag);
|
|
202
|
-
const
|
|
346
|
+
const role = normalizeRole(tag, node.getAttribute('role'));
|
|
347
|
+
const interactiveRole = ['button', 'link', 'textbox', 'searchbox', 'combobox', 'checkbox', 'radio', 'switch', 'tab', 'menuitem', 'option'].includes(role);
|
|
203
348
|
|
|
204
349
|
if (interactiveTag || interactiveRole) {
|
|
205
350
|
if (!node.dataset.fastmcpRef) {
|
|
@@ -6,6 +6,14 @@ from pydantic_settings import BaseSettings
|
|
|
6
6
|
class Settings(BaseSettings):
|
|
7
7
|
"""Environment-driven configuration for the MCP server."""
|
|
8
8
|
|
|
9
|
+
# MCP tool exposure mode (BrightData-like)
|
|
10
|
+
# - rapid: minimal core tools
|
|
11
|
+
# - pro: all tools
|
|
12
|
+
# - custom: enable by THORDATA_GROUPS and THORDATA_TOOLS
|
|
13
|
+
THORDATA_MODE: str = "rapid"
|
|
14
|
+
THORDATA_GROUPS: str | None = None
|
|
15
|
+
THORDATA_TOOLS: str | None = None
|
|
16
|
+
|
|
9
17
|
# Thordata credentials
|
|
10
18
|
THORDATA_SCRAPER_TOKEN: str | None = None
|
|
11
19
|
THORDATA_PUBLIC_TOKEN: str | None = None
|
|
@@ -20,9 +28,9 @@ class Settings(BaseSettings):
|
|
|
20
28
|
# Tasks discovery UX (to avoid dumping hundreds of tools to the client by default)
|
|
21
29
|
# - mode=curated: only return tools from THORDATA_TASKS_GROUPS, with pagination
|
|
22
30
|
# - mode=all: return all discovered tools
|
|
23
|
-
# Default to
|
|
24
|
-
THORDATA_TASKS_LIST_MODE: str = "
|
|
25
|
-
THORDATA_TASKS_LIST_DEFAULT_LIMIT: int =
|
|
31
|
+
# Default to curated mode to reduce tool selection noise for LLMs.
|
|
32
|
+
THORDATA_TASKS_LIST_MODE: str = "curated"
|
|
33
|
+
THORDATA_TASKS_LIST_DEFAULT_LIMIT: int = 60
|
|
26
34
|
THORDATA_TASKS_GROUPS: str = "ecommerce,social,video,search,travel,code,professional"
|
|
27
35
|
|
|
28
36
|
# Optional: restrict which SDK tool_keys are allowed to execute (safety/UX)
|
|
@@ -49,6 +57,9 @@ class Settings(BaseSettings):
|
|
|
49
57
|
# Logging
|
|
50
58
|
LOG_LEVEL: str = "INFO"
|
|
51
59
|
|
|
60
|
+
# Debug tools exposure
|
|
61
|
+
THORDATA_DEBUG_TOOLS: bool = False
|
|
62
|
+
|
|
52
63
|
class Config:
|
|
53
64
|
env_file = ".env"
|
|
54
65
|
extra = "ignore"
|