@browserless.io/mcp 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +557 -0
- package/README.md +280 -0
- package/bin/cli.js +2 -0
- package/build/src/@types/types.d.ts +538 -0
- package/build/src/config.d.ts +3 -0
- package/build/src/config.js +42 -0
- package/build/src/index.d.ts +4 -0
- package/build/src/index.js +153 -0
- package/build/src/lib/account-resolver.d.ts +17 -0
- package/build/src/lib/account-resolver.js +78 -0
- package/build/src/lib/agent-client.d.ts +58 -0
- package/build/src/lib/agent-client.js +530 -0
- package/build/src/lib/agent-format.d.ts +35 -0
- package/build/src/lib/agent-format.js +155 -0
- package/build/src/lib/amplitude.d.ts +11 -0
- package/build/src/lib/amplitude.js +65 -0
- package/build/src/lib/analytics.d.ts +18 -0
- package/build/src/lib/analytics.js +79 -0
- package/build/src/lib/api-client.d.ts +17 -0
- package/build/src/lib/api-client.js +357 -0
- package/build/src/lib/bounded-event-store.d.ts +22 -0
- package/build/src/lib/bounded-event-store.js +69 -0
- package/build/src/lib/cache.d.ts +12 -0
- package/build/src/lib/cache.js +49 -0
- package/build/src/lib/define-tool.d.ts +71 -0
- package/build/src/lib/define-tool.js +71 -0
- package/build/src/lib/error-classifier.d.ts +4 -0
- package/build/src/lib/error-classifier.js +125 -0
- package/build/src/lib/redis-oauth-proxy.d.ts +13 -0
- package/build/src/lib/redis-oauth-proxy.js +214 -0
- package/build/src/lib/retry.d.ts +2 -0
- package/build/src/lib/retry.js +19 -0
- package/build/src/lib/schema-fields.d.ts +10 -0
- package/build/src/lib/schema-fields.js +27 -0
- package/build/src/lib/supabase-token-patch.d.ts +6 -0
- package/build/src/lib/supabase-token-patch.js +33 -0
- package/build/src/lib/utils.d.ts +27 -0
- package/build/src/lib/utils.js +67 -0
- package/build/src/prompts/extract-content.d.ts +2 -0
- package/build/src/prompts/extract-content.js +33 -0
- package/build/src/prompts/scrape-url.d.ts +2 -0
- package/build/src/prompts/scrape-url.js +36 -0
- package/build/src/resources/api-docs.d.ts +3 -0
- package/build/src/resources/api-docs.js +54 -0
- package/build/src/resources/status.d.ts +3 -0
- package/build/src/resources/status.js +30 -0
- package/build/src/skills/autonomous-login.md +95 -0
- package/build/src/skills/captchas.md +48 -0
- package/build/src/skills/cookie-consent.md +50 -0
- package/build/src/skills/dynamic-content.md +72 -0
- package/build/src/skills/index.d.ts +9 -0
- package/build/src/skills/index.js +221 -0
- package/build/src/skills/modals.md +56 -0
- package/build/src/skills/screenshots.md +53 -0
- package/build/src/skills/shadow-dom.md +64 -0
- package/build/src/skills/snapshot-misses.md +67 -0
- package/build/src/skills/system-prompt.d.ts +2 -0
- package/build/src/skills/system-prompt.js +128 -0
- package/build/src/skills/tabs.md +77 -0
- package/build/src/tools/agent.d.ts +15 -0
- package/build/src/tools/agent.js +299 -0
- package/build/src/tools/crawl.d.ts +75 -0
- package/build/src/tools/crawl.js +426 -0
- package/build/src/tools/download.d.ts +11 -0
- package/build/src/tools/download.js +92 -0
- package/build/src/tools/export.d.ts +28 -0
- package/build/src/tools/export.js +129 -0
- package/build/src/tools/function.d.ts +24 -0
- package/build/src/tools/function.js +144 -0
- package/build/src/tools/map.d.ts +23 -0
- package/build/src/tools/map.js +129 -0
- package/build/src/tools/performance.d.ts +25 -0
- package/build/src/tools/performance.js +103 -0
- package/build/src/tools/schemas.d.ts +466 -0
- package/build/src/tools/schemas.js +487 -0
- package/build/src/tools/search.d.ts +67 -0
- package/build/src/tools/search.js +184 -0
- package/build/src/tools/smartscraper.d.ts +42 -0
- package/build/src/tools/smartscraper.js +136 -0
- package/package.json +111 -0
- package/patches/mcp-proxy+6.4.0.patch +31 -0
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
import { UserError } from 'fastmcp';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
import { defineTool, validateHttpUrl } from '../lib/define-tool.js';
|
|
4
|
+
import { profileField } from './schemas.js';
|
|
5
|
+
/** Status values a crawl as a whole can report. */
export const CrawlStatusSchema = z.enum(['in-progress', 'completed', 'failed', 'cancelled']);
|
|
11
|
+
/** Status values a single crawled page can report. */
export const PageStatusSchema = z.enum(['queued', 'in-progress', 'completed', 'failed', 'cancelled']);
|
|
18
|
+
export const CrawlSitemapModeSchema = z.enum(['auto', 'force', 'skip']);
|
|
19
|
+
export const CrawlFormatSchema = z.enum(['markdown', 'html', 'rawText']);
|
|
20
|
+
/**
 * Per-page scrape settings accepted under the crawl tool's `scrapeOptions`.
 * `formats`, `onlyMainContent` and `waitFor` declare defaults; every other
 * field is simply optional.
 */
export const CrawlScrapeOptionsSchema = z.object({
    formats: z.array(CrawlFormatSchema).optional().default(['markdown']).describe('Output formats for scraped content'),
    onlyMainContent: z.boolean().optional().default(true).describe('Extract only the main content using Readability'),
    includeTags: z.array(z.string()).optional().describe('HTML tag selectors to include'),
    excludeTags: z.array(z.string()).optional().describe('HTML tag selectors to exclude'),
    waitFor: z.number().int().nonnegative().optional().default(0).describe('Time in ms to wait after page load before scraping'),
    headers: z.record(z.string(), z.string()).optional().describe('Custom HTTP headers to send with each request'),
    timeout: z.number().int().positive().optional().describe('Navigation timeout in milliseconds'),
});
|
|
57
|
+
/**
 * Input schema of the `browserless_crawl` tool.
 */
export const CrawlParamsSchema = z.object({
    // --- Crawl scope: forwarded to client.crawl by the tool's run handler ---
    url: z.url().describe('The URL to crawl (must be http or https)'),
    limit: z.number().int().positive().max(10000).optional().default(100).describe('Maximum number of pages to crawl (default: 100)'),
    maxDepth: z.number().int().nonnegative().optional().default(5).describe('Maximum link-follow depth from the root URL (default: 5)'),
    maxRetries: z.number().int().nonnegative().optional().default(1).describe('Number of retry attempts per failed page (default: 1)'),
    allowExternalLinks: z.boolean().optional().default(false).describe('Whether to follow links to external domains'),
    allowSubdomains: z.boolean().optional().default(false).describe('Whether to follow links to subdomains'),
    sitemap: CrawlSitemapModeSchema.optional().default('auto').describe('Sitemap handling: "auto" (default), "force", "skip"'),
    includePaths: z.array(z.string()).optional().describe('Regex patterns for URL paths to include'),
    excludePaths: z.array(z.string()).optional().describe('Regex patterns for URL paths to exclude'),
    delay: z.number().int().nonnegative().optional().default(200).describe('Delay between requests in milliseconds (default: 200)'),
    scrapeOptions: CrawlScrapeOptionsSchema.optional().describe('Options controlling how each page is scraped'),
    // --- Waiting behaviour: read by the tool's own polling loop, not sent to client.crawl ---
    waitForCompletion: z.boolean().optional().default(true).describe('Whether to wait for crawl completion (default: true). If false, returns immediately with crawl ID.'),
    pollInterval: z.number().int().positive().optional().default(5000).describe('Polling interval in ms when waiting for completion (default: 5000)'),
    maxWaitTime: z.number().int().positive().optional().default(300000).describe('Maximum time in ms to wait for crawl completion when waitForCompletion is true (default: 300000 = 5 minutes)'),
    // --- Request-level settings ---
    // No schema-level default is declared for timeout; the figure in the description is not enforced here.
    timeout: z.number().int().positive().optional().describe('HTTP request timeout in milliseconds for API calls (default: 30000)'),
    profile: profileField('before each page is scraped'),
});
|
|
137
|
+
/** Crawl statuses that end the polling loop. */
const TERMINAL_STATUSES = new Set(['completed', 'failed', 'cancelled']);

/** Upper bound on how many completed pages get their full content fetched. */
const MAX_CONTENT_PAGES = 50;

/** Per-page character budget; content beyond it is cut and marked as truncated. */
const MAX_CONTENT_LENGTH = 10000;

/** Upper bound on the number of entries listed in the crawled-URLs section. */
const MAX_URL_LIST = 200;
|
|
144
|
+
/**
 * Fetch the actual scraped content from an S3 signed URL.
 *
 * Deliberately best-effort: a non-OK response, a network error, a timeout or
 * a body that is not valid JSON all produce `null` rather than an exception.
 *
 * @param {string} contentUrl - Signed URL of the stored page content.
 * @param {number} [timeoutMs=10000] - Abort the request after this many milliseconds.
 * @returns {Promise<object|null>} The parsed JSON body, or `null` on any failure.
 */
async function fetchPageContent(contentUrl, timeoutMs = 10000) {
    try {
        const res = await fetch(contentUrl, { signal: AbortSignal.timeout(timeoutMs) });
        if (!res.ok) {
            // The body is never read on this path; cancel it so the
            // connection is released promptly instead of waiting for GC.
            await res.body?.cancel();
            return null;
        }
        return await res.json();
    }
    catch {
        return null;
    }
}
|
|
159
|
+
/**
 * Register the `browserless_crawl` tool through `defineTool`.
 *
 * The run handler starts a crawl with `client.crawl`. Unless
 * `waitForCompletion` is false it polls `client.getCrawl` until the crawl
 * reaches a terminal status, pages through the results, and downloads content
 * for the first MAX_CONTENT_PAGES completed pages. The format handler then
 * renders the outcome as markdown text blocks.
 *
 * @param server - Forwarded to `defineTool`.
 * @param config - Forwarded to `defineTool`.
 * @param analytics - Forwarded to `defineTool`.
 */
export function registerCrawlTool(server, config, analytics) {
    defineTool(server, config, analytics, {
        name: 'browserless_crawl',
        description: 'Crawl a website and scrape every discovered page using Browserless. ' +
            'Starts from a seed URL and follows links up to a configurable depth. ' +
            'Supports sitemap discovery, path filtering, subdomain handling, and custom scrape options. ' +
            'Returns scraped content (markdown/HTML) for each page along with metadata. ' +
            'Useful for comprehensive site analysis, content extraction, and data gathering.',
        parameters: CrawlParamsSchema,
        annotations: {
            title: 'Browserless Crawl',
            readOnlyHint: true,
            destructiveHint: false,
            openWorldHint: true,
        },
        validateUrl: (p) => validateHttpUrl(p.url),
        profileNotFoundMessage: (profile) => `Profile "${profile}" was not found for the configured API ` +
            `token. Create the profile with Browserless.saveProfile in a ` +
            `live session first, or omit the profile parameter to crawl ` +
            `anonymously.`,
        // crawl fires its own analytics events at multiple points (start failure,
        // timeout, async-return, success), so we skip defineTool's end-of-run fire.
        run: async ({ client, params, log, analytics, token, apiUrl, reportProgress, }) => {
            // Properties shared by every analytics event fired from this handler.
            const analyticsBase = {
                url: params.url,
                limit: params.limit ?? 100,
                api_url: apiUrl,
                profile_used: !!params.profile,
            };
            // Start the crawl (ProfileNotFoundError propagates to defineTool)
            const startResponse = await client.crawl({
                url: params.url,
                limit: params.limit,
                maxDepth: params.maxDepth,
                maxRetries: params.maxRetries,
                allowExternalLinks: params.allowExternalLinks,
                allowSubdomains: params.allowSubdomains,
                sitemap: params.sitemap,
                includePaths: params.includePaths,
                excludePaths: params.excludePaths,
                delay: params.delay,
                scrapeOptions: params.scrapeOptions,
                timeout: params.timeout,
                profile: params.profile,
            });
            if (!startResponse.success) {
                analytics?.fireToolRequest(token, 'browserless_crawl', {
                    ...analyticsBase,
                    success: false,
                    error: startResponse.error ?? 'Unknown error',
                });
                throw new UserError(`Failed to start crawl: ${startResponse.error ?? 'Unknown error'}`);
            }
            const crawlId = startResponse.id;
            log.debug(`Crawl started: id=${crawlId}, url=${params.url}`);
            // Async return — caller polls externally
            if (params.waitForCompletion === false) {
                analytics?.fireToolRequest(token, 'browserless_crawl', {
                    ...analyticsBase,
                    success: true,
                    crawl_id: crawlId,
                    wait_for_completion: false,
                });
                return { kind: 'started', crawlId, startResponse };
            }
            // Poll for completion
            const pollInterval = params.pollInterval ?? 5000;
            const maxWaitTime = params.maxWaitTime ?? 300000;
            const startTime = Date.now();
            let statusResponse;
            let isFirstPoll = true;
            do {
                if (Date.now() - startTime > maxWaitTime) {
                    analytics?.fireToolRequest(token, 'browserless_crawl', {
                        ...analyticsBase,
                        success: false,
                        crawl_id: crawlId,
                        timeout: true,
                    });
                    throw new UserError(`Crawl exceeded max wait time of ${maxWaitTime}ms. Crawl ID: ${crawlId}. ` +
                        'The crawl may still be running. You can check its status later using the crawl ID.');
                }
                // The first status request goes out immediately; later ones are spaced by pollInterval.
                if (!isFirstPoll) {
                    await new Promise((resolve) => setTimeout(resolve, pollInterval));
                }
                isFirstPoll = false;
                statusResponse = await client.getCrawl(crawlId);
                if (statusResponse.total > 0) {
                    // Progress is capped at 95 out of 100 while polling.
                    const progress = Math.min(Math.floor((statusResponse.completed / statusResponse.total) * 95), 95);
                    await reportProgress({ progress, total: 100 });
                }
                log.debug(`Crawl status: ${statusResponse.status}, ` +
                    `completed=${statusResponse.completed}/${statusResponse.total}, ` +
                    `failed=${statusResponse.failed}`);
            } while (!TERMINAL_STATUSES.has(statusResponse.status));
            // Fetch all pages (paginated); the offset of the next request is
            // simply the number of pages collected so far.
            const allPages = [...(statusResponse.data ?? [])];
            let hasNext = Boolean(statusResponse.next);
            while (hasNext && allPages.length < statusResponse.total) {
                const nextResponse = await client.getCrawl(crawlId, allPages.length);
                const batch = nextResponse.data ?? [];
                // An empty batch would leave the offset unchanged, so asking
                // again would repeat the same request forever. Stop instead.
                if (batch.length === 0) {
                    break;
                }
                allPages.push(...batch);
                hasNext = Boolean(nextResponse.next);
            }
            analytics?.fireToolRequest(token, 'browserless_crawl', {
                ...analyticsBase,
                success: statusResponse.status === 'completed',
                crawl_id: crawlId,
                status: statusResponse.status,
                total_pages: statusResponse.total,
                completed_pages: statusResponse.completed,
                failed_pages: statusResponse.failed,
            });
            // Fetch page content for completed pages (so format() stays sync)
            const completedPages = allPages.filter((p) => p.status === 'completed');
            const pagesToFetch = completedPages.slice(0, MAX_CONTENT_PAGES);
            log.debug(`Fetching content for ${pagesToFetch.length} pages...`);
            const fetched = await Promise.all(pagesToFetch.map(async (page) => ({
                page,
                content: page.contentUrl
                    ? await fetchPageContent(page.contentUrl)
                    : null,
            })));
            // Re-pair: keep order with allPages so failed pages render too.
            // The map is keyed by the page object itself (identity), not by URL.
            const contentByPage = new Map(fetched.map(({ page, content }) => [page, content]));
            const pages = allPages.map((page) => ({
                page,
                content: contentByPage.get(page) ?? null,
            }));
            log.debug(`Crawl completed: id=${crawlId}, pages=${allPages.length}`);
            return { kind: 'completed', crawlId, statusResponse, pages };
        },
        format: (result, params) => {
            if (result.kind === 'started') {
                return [
                    {
                        type: 'text',
                        text: [
                            '## Crawl Started',
                            '',
                            `**Crawl ID:** ${result.crawlId}`,
                            `**Status URL:** ${result.startResponse.url}`,
                            `**Target URL:** ${params.url}`,
                            '',
                            'The crawl is running asynchronously. Use the crawl ID to check status.',
                        ].join('\n'),
                    },
                ];
            }
            const { crawlId, statusResponse, pages } = result;
            if (statusResponse.status === 'failed') {
                throw new UserError(`Crawl failed. Crawl ID: ${crawlId}. ` +
                    `Completed: ${statusResponse.completed}/${statusResponse.total} pages.`);
            }
            if (statusResponse.status === 'cancelled') {
                throw new UserError(`Crawl was cancelled. Crawl ID: ${crawlId}. ` +
                    `Completed: ${statusResponse.completed}/${statusResponse.total} pages.`);
            }
            const blocks = [];
            // Summary header; the expiry line is dropped when no expiry is reported.
            blocks.push({
                type: 'text',
                text: [
                    `## Crawl Results for ${params.url}`,
                    '',
                    `**Status:** ${statusResponse.status}`,
                    `**Total Pages:** ${statusResponse.total}`,
                    `**Completed:** ${statusResponse.completed}`,
                    `**Failed:** ${statusResponse.failed}`,
                    statusResponse.expiresAt
                        ? `**Results Expire:** ${statusResponse.expiresAt}`
                        : '',
                ]
                    .filter(Boolean)
                    .join('\n'),
            });
            const completedPages = pages.filter((p) => p.page.status === 'completed');
            const failedPages = pages.filter((p) => p.page.status === 'failed');
            if (completedPages.length > 0) {
                const renderable = completedPages.slice(0, MAX_CONTENT_PAGES);
                const pageList = renderable
                    .map(({ page, content }, index) => {
                    const lines = [`### ${index + 1}. ${page.metadata.sourceURL}`];
                    if (page.metadata.title)
                        lines.push(`**Title:** ${page.metadata.title}`);
                    if (page.metadata.statusCode)
                        lines.push(`**Status Code:** ${page.metadata.statusCode}`);
                    if (content) {
                        // Preference order: markdown, then raw text, then HTML.
                        let textContent = content.markdown ?? content.rawText ?? content.html;
                        if (textContent) {
                            if (textContent.length > MAX_CONTENT_LENGTH) {
                                textContent =
                                    textContent.slice(0, MAX_CONTENT_LENGTH) +
                                        `\n\n... [Content truncated at ${MAX_CONTENT_LENGTH} characters]`;
                            }
                            lines.push('');
                            lines.push('**Content:**');
                            lines.push('```');
                            lines.push(textContent);
                            lines.push('```');
                        }
                    }
                    else if (page.contentUrl) {
                        lines.push('');
                        lines.push('*[Content could not be fetched - URL may have expired]*');
                    }
                    return lines.join('\n');
                })
                    .join('\n\n---\n\n');
                blocks.push({
                    type: 'text',
                    text: `## Scraped Pages (${completedPages.length})\n\n${pageList}`,
                });
                if (completedPages.length > MAX_CONTENT_PAGES) {
                    blocks.push({
                        type: 'text',
                        text: `\n*Note: Content shown for first ${MAX_CONTENT_PAGES} pages. ` +
                            `${completedPages.length - MAX_CONTENT_PAGES} additional pages were crawled but content not included to avoid response size limits.*`,
                    });
                }
            }
            if (failedPages.length > 0) {
                // Only the first 20 failures are itemised; the rest are counted.
                const failedList = failedPages
                    .slice(0, 20)
                    .map(({ page }, index) => {
                    return `${index + 1}. ${page.metadata.sourceURL}\n Error: ${page.metadata.error ?? 'Unknown error'}`;
                })
                    .join('\n');
                blocks.push({
                    type: 'text',
                    text: `## Failed Pages (${failedPages.length})\n\n${failedList}${failedPages.length > 20 ? `\n... and ${failedPages.length - 20} more` : ''}`,
                });
            }
            if (completedPages.length === 0 && failedPages.length === 0) {
                blocks.push({
                    type: 'text',
                    text: 'No pages were successfully crawled.',
                });
            }
            else {
                const urlsToShow = pages.slice(0, MAX_URL_LIST);
                const urlList = urlsToShow
                    .map(({ page }) => page.metadata.sourceURL)
                    .join('\n');
                const urlListSuffix = pages.length > MAX_URL_LIST
                    ? `\n\n... and ${pages.length - MAX_URL_LIST} more URLs`
                    : '';
                blocks.push({
                    type: 'text',
                    text: `## All Crawled URLs\n\n${urlList}${urlListSuffix}`,
                });
            }
            // Trailing metadata block echoing the request parameters.
            blocks.push({
                type: 'text',
                text: [
                    '---',
                    `Crawl ID: ${crawlId}`,
                    `Target URL: ${params.url}`,
                    `Max Depth: ${params.maxDepth ?? 5}`,
                    `Page Limit: ${params.limit ?? 100}`,
                    `Sitemap Mode: ${params.sitemap ?? 'auto'}`,
                    '---',
                ].join('\n'),
            });
            return blocks;
        },
    });
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { FastMCP } from 'fastmcp';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
import { AnalyticsHelper } from '../lib/analytics.js';
|
|
4
|
+
import type { McpConfig } from '../@types/types.js';
|
|
5
|
+
/**
 * Type of the download tool's input schema: a required `code` string plus
 * optional `context`, `timeout` and `profile`.
 */
export declare const DownloadParamsSchema: z.ZodObject<{
    code: z.ZodString;
    context: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
    timeout: z.ZodOptional<z.ZodNumber>;
    profile: z.ZodOptional<z.ZodString>;
}, z.core.$strip>;
|
|
11
|
+
/** Registers the download tool on the given FastMCP server; `analytics` may be omitted. */
export declare function registerDownloadTool(server: FastMCP, config: McpConfig, analytics?: AnalyticsHelper): void;
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { UserError } from 'fastmcp';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
import { defineTool } from '../lib/define-tool.js';
|
|
4
|
+
import { profileField } from './schemas.js';
|
|
5
|
+
/**
 * Input schema of the `browserless_download` tool. Only `code` is required.
 */
export const DownloadParamsSchema = z.object({
    code: z.string().describe('JavaScript (ESM) code to execute. The default export receives ' +
        '{ page, context }. During execution the code should trigger a ' +
        'file download in the browser (e.g. clicking a download link).'),
    context: z.record(z.string(), z.unknown()).optional().describe('Optional context object passed to the function.'),
    timeout: z.number().int().positive().optional().describe('Request timeout in milliseconds'),
    profile: profileField('before the download script runs'),
});
|
|
23
|
+
/**
 * Extract a filename from a Content-Disposition header value.
 *
 * The extended `filename*=charset'lang'pct-encoded` form is preferred and
 * percent-decoded; otherwise the plain `filename=` parameter is used.
 *
 * @param {string|undefined|null} contentDisposition - Raw header value.
 * @returns {string|undefined} The filename, or `undefined` when absent or empty.
 */
function parseDownloadFilename(contentDisposition) {
    if (!contentDisposition) {
        return undefined;
    }
    const extended = contentDisposition
        .match(/filename\*\s*=\s*[^']*'[^']*'([^;\n]+)/i)?.[1]
        ?.trim();
    if (extended) {
        try {
            return decodeURIComponent(extended);
        }
        catch {
            // Not decodable as UTF-8 percent-encoding; the raw value is still usable as a label.
            return extended;
        }
    }
    const plain = contentDisposition
        .match(/filename\s*=\s*["']?([^"';\n]*)["']?/i)?.[1]
        ?.trim();
    // An empty filename="" counts as "no filename" so the caller's fallback applies.
    return plain || undefined;
}
/**
 * Register the `browserless_download` tool through `defineTool`.
 *
 * The run handler forwards the parameters to `client.download` and returns
 * its response unchanged. The format handler throws a `UserError` for a
 * non-OK response, and otherwise emits the payload followed by a metadata
 * block.
 *
 * @param server - Forwarded to `defineTool`.
 * @param config - Forwarded to `defineTool`.
 * @param analytics - Forwarded to `defineTool`.
 */
export function registerDownloadTool(server, config, analytics) {
    defineTool(server, config, analytics, {
        name: 'browserless_download',
        description: 'Run custom Puppeteer code on Browserless and return the file that ' +
            'Chrome downloads during execution. Your code should trigger a file ' +
            'download (e.g. clicking a download link). The downloaded file is ' +
            'returned with its original Content-Type. Useful for downloading ' +
            'CSVs, PDFs, images, or any file from a website.',
        parameters: DownloadParamsSchema,
        annotations: {
            title: 'Browserless Download',
            readOnlyHint: false,
            destructiveHint: true,
            openWorldHint: true,
        },
        profileNotFoundMessage: (profile) => `Profile "${profile}" was not found for the configured API ` +
            `token. Create the profile with Browserless.saveProfile in a ` +
            `live session first, or omit the profile parameter to run the ` +
            `download anonymously.`,
        run: async ({ client, params, log }) => {
            const response = await client.download({
                code: params.code,
                context: params.context,
                timeout: params.timeout,
                profile: params.profile,
            });
            log.debug(`Download response: ok=${response.ok}, status=${response.statusCode}, ` +
                `contentType=${response.contentType}, size=${response.size}, ` +
                `disposition=${response.contentDisposition}`);
            return response;
        },
        analyticsProps: (params, result) => ({
            ok: result.ok,
            status_code: result.statusCode,
            content_type: result.contentType,
            size: result.size,
            profile_used: !!params.profile,
        }),
        format: (response) => {
            if (!response.ok) {
                // Coerce defensively: an error response is not guaranteed to carry a string body.
                throw new UserError(`Download failed (status ${response.statusCode}): ${String(response.data ?? '').slice(0, 500)}`);
            }
            const filename = parseDownloadFilename(response.contentDisposition) ?? 'downloaded-file';
            const blocks = [];
            if (response.isBinary) {
                blocks.push({
                    type: 'text',
                    text: `[Downloaded file: "${filename}" – ${response.contentType}, ` +
                        `${response.size} bytes, base64-encoded]\n${response.data}`,
                });
            }
            else {
                blocks.push({ type: 'text', text: response.data });
            }
            blocks.push({
                type: 'text',
                text: [
                    '---',
                    `Filename: ${filename}`,
                    `Content-Type: ${response.contentType}`,
                    `Status: ${response.statusCode}`,
                    `Size: ${response.size} bytes`,
                    '---',
                ].join('\n'),
            });
            return blocks;
        },
    });
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { FastMCP } from 'fastmcp';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
import { AnalyticsHelper } from '../lib/analytics.js';
|
|
4
|
+
import type { McpConfig } from '../@types/types.js';
|
|
5
|
+
/**
 * Type of the export tool's input schema: a required `url` plus optional
 * navigation and export settings.
 */
export declare const ExportParamsSchema: z.ZodObject<{
    url: z.ZodURL;
    gotoOptions: z.ZodOptional<z.ZodObject<{
        /** Either a single navigation event name or an array of them. */
        waitUntil: z.ZodOptional<z.ZodUnion<readonly [z.ZodEnum<{
            load: "load";
            domcontentloaded: "domcontentloaded";
            networkidle0: "networkidle0";
            networkidle2: "networkidle2";
        }>, z.ZodArray<z.ZodEnum<{
            load: "load";
            domcontentloaded: "domcontentloaded";
            networkidle0: "networkidle0";
            networkidle2: "networkidle2";
        }>>]>>;
        timeout: z.ZodOptional<z.ZodNumber>;
        referer: z.ZodOptional<z.ZodString>;
    }, z.core.$strip>>;
    bestAttempt: z.ZodOptional<z.ZodBoolean>;
    includeResources: z.ZodOptional<z.ZodBoolean>;
    waitForTimeout: z.ZodOptional<z.ZodNumber>;
    timeout: z.ZodOptional<z.ZodNumber>;
    profile: z.ZodOptional<z.ZodString>;
}, z.core.$strip>;
|
|
28
|
+
/** Registers the export tool on the given FastMCP server; `analytics` may be omitted. */
export declare function registerExportTool(server: FastMCP, config: McpConfig, analytics?: AnalyticsHelper): void;
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import { UserError } from 'fastmcp';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
import { defineTool, validateHttpUrl } from '../lib/define-tool.js';
|
|
4
|
+
import { profileField } from './schemas.js';
|
|
5
|
+
/** Navigation lifecycle event names accepted by `gotoOptions.waitUntil`. */
const EXPORT_NAVIGATION_EVENTS = ['load', 'domcontentloaded', 'networkidle0', 'networkidle2'];
/**
 * Input schema of the `browserless_export` tool. Only `url` is required.
 */
export const ExportParamsSchema = z.object({
    url: z.url().describe('The URL to export (must be http or https)'),
    gotoOptions: z
        .object({
        // A single event or a list of events; each branch gets its own enum instance.
        waitUntil: z
            .union([
            z.enum(EXPORT_NAVIGATION_EVENTS),
            z.array(z.enum(EXPORT_NAVIGATION_EVENTS)),
        ])
            .optional()
            .describe('When to consider navigation complete'),
        timeout: z.number().optional().describe('Navigation timeout in milliseconds'),
        referer: z.string().optional().describe('Referer header value'),
    })
        .optional()
        .describe('Puppeteer Page.goto() options for navigation'),
    bestAttempt: z.boolean().optional().describe('When true, proceed even if awaited events fail or timeout.'),
    includeResources: z.boolean().optional().describe('When true, bundle all linked resources (CSS, JS, images) into a ZIP file.'),
    waitForTimeout: z.number().int().nonnegative().optional().describe('Milliseconds to wait after page load before exporting'),
    timeout: z.number().int().positive().optional().describe('Request timeout in milliseconds'),
    profile: profileField('before the page is exported'),
});
|
|
51
|
+
/**
 * Extract a filename from a Content-Disposition header value.
 *
 * The extended `filename*=charset'lang'pct-encoded` form is preferred and
 * percent-decoded; otherwise the plain `filename=` parameter is used.
 *
 * @param {string|undefined|null} contentDisposition - Raw header value.
 * @returns {string|undefined} The filename, or `undefined` when absent or empty.
 */
function parseExportFilename(contentDisposition) {
    if (!contentDisposition) {
        return undefined;
    }
    const extended = contentDisposition
        .match(/filename\*\s*=\s*[^']*'[^']*'([^;\n]+)/i)?.[1]
        ?.trim();
    if (extended) {
        try {
            return decodeURIComponent(extended);
        }
        catch {
            // Not decodable as UTF-8 percent-encoding; the raw value is still usable as a label.
            return extended;
        }
    }
    const plain = contentDisposition
        .match(/filename\s*=\s*["']?([^"';\n]*)["']?/i)?.[1]
        ?.trim();
    // An empty filename="" counts as "no filename" so the caller's fallback applies.
    return plain || undefined;
}
/**
 * Register the `browserless_export` tool through `defineTool`.
 *
 * The run handler forwards the parameters to `client.exportPage` and returns
 * its response unchanged. The format handler throws a `UserError` for a
 * non-OK response, and otherwise emits the payload followed by a metadata
 * block.
 *
 * @param server - Forwarded to `defineTool`.
 * @param config - Forwarded to `defineTool`.
 * @param analytics - Forwarded to `defineTool`.
 */
export function registerExportTool(server, config, analytics) {
    defineTool(server, config, analytics, {
        name: 'browserless_export',
        description: 'Export a webpage from a URL via the Browserless /export API. ' +
            'Fetches the URL and returns its content in the native format ' +
            '(HTML, PDF, image, etc.). Automatically detects the content type. ' +
            'Set includeResources=true to bundle all page assets (CSS, JS, images) ' +
            'into a ZIP archive for offline use.',
        parameters: ExportParamsSchema,
        annotations: {
            title: 'Browserless Export',
            readOnlyHint: true,
            destructiveHint: false,
            openWorldHint: true,
        },
        validateUrl: (p) => validateHttpUrl(p.url),
        profileNotFoundMessage: (profile) => `Profile "${profile}" was not found for the configured API ` +
            `token. Create the profile with Browserless.saveProfile in a ` +
            `live session first, or omit the profile parameter to export ` +
            `the page anonymously.`,
        run: async ({ client, params, log }) => {
            const response = await client.exportPage({
                url: params.url,
                gotoOptions: params.gotoOptions,
                bestAttempt: params.bestAttempt,
                includeResources: params.includeResources,
                waitForTimeout: params.waitForTimeout,
                timeout: params.timeout,
                profile: params.profile,
            });
            log.debug(`Export response: ok=${response.ok}, status=${response.statusCode}, ` +
                `contentType=${response.contentType}, size=${response.size}`);
            return response;
        },
        analyticsProps: (params, result) => ({
            url: params.url,
            ok: result.ok,
            status_code: result.statusCode,
            content_type: result.contentType,
            size: result.size,
            include_resources: params.includeResources ?? false,
            profile_used: !!params.profile,
        }),
        format: (response, params) => {
            if (!response.ok) {
                // Coerce defensively: an error response is not guaranteed to carry a string body.
                throw new UserError(`Export failed (status ${response.statusCode}): ${String(response.data ?? '').slice(0, 500)}`);
            }
            // Without a usable header filename, label the export with the target hostname.
            const filename = parseExportFilename(response.contentDisposition) ?? new URL(params.url).hostname;
            const blocks = [];
            if (response.isBinary) {
                blocks.push({
                    type: 'text',
                    text: `[Exported "${filename}" – ${response.contentType}, ` +
                        `${response.size} bytes, base64-encoded]\n${response.data}`,
                });
            }
            else {
                blocks.push({ type: 'text', text: response.data });
            }
            blocks.push({
                type: 'text',
                text: [
                    '---',
                    `URL: ${params.url}`,
                    `Filename: ${filename}`,
                    `Content-Type: ${response.contentType}`,
                    `Status: ${response.statusCode}`,
                    `Size: ${response.size} bytes`,
                    // The empty string is removed by filter(Boolean) below.
                    params.includeResources ? 'Resources: included (ZIP)' : '',
                    '---',
                ]
                    .filter(Boolean)
                    .join('\n'),
            });
            return blocks;
        },
    });
}
|