webpeel 0.13.4 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -162
- package/dist/cli-auth.js +7 -7
- package/dist/cli-auth.js.map +1 -1
- package/dist/cli.js +197 -26
- package/dist/cli.js.map +1 -1
- package/dist/core/auto-extract.d.ts +83 -0
- package/dist/core/auto-extract.d.ts.map +1 -0
- package/dist/core/auto-extract.js +565 -0
- package/dist/core/auto-extract.js.map +1 -0
- package/dist/core/deep-fetch.d.ts +75 -0
- package/dist/core/deep-fetch.d.ts.map +1 -0
- package/dist/core/deep-fetch.js +406 -0
- package/dist/core/deep-fetch.js.map +1 -0
- package/dist/core/domain-extractors.d.ts +34 -0
- package/dist/core/domain-extractors.d.ts.map +1 -0
- package/dist/core/domain-extractors.js +654 -0
- package/dist/core/domain-extractors.js.map +1 -0
- package/dist/core/markdown.d.ts +8 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +25 -0
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/quick-answer.d.ts +28 -0
- package/dist/core/quick-answer.d.ts.map +1 -0
- package/dist/core/quick-answer.js +288 -0
- package/dist/core/quick-answer.js.map +1 -0
- package/dist/core/readability.d.ts +58 -0
- package/dist/core/readability.d.ts.map +1 -0
- package/dist/core/readability.js +496 -0
- package/dist/core/readability.js.map +1 -0
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +3 -6
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +70 -5
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/watch-manager.d.ts +140 -0
- package/dist/core/watch-manager.d.ts.map +1 -0
- package/dist/core/watch-manager.js +348 -0
- package/dist/core/watch-manager.js.map +1 -0
- package/dist/core/youtube.d.ts +91 -0
- package/dist/core/youtube.d.ts.map +1 -0
- package/dist/core/youtube.js +380 -0
- package/dist/core/youtube.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +103 -0
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +58 -16
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +19 -1
- package/dist/server/app.js.map +1 -1
- package/dist/server/routes/deep-fetch.d.ts +9 -0
- package/dist/server/routes/deep-fetch.d.ts.map +1 -0
- package/dist/server/routes/deep-fetch.js +38 -0
- package/dist/server/routes/deep-fetch.js.map +1 -0
- package/dist/server/routes/extract.d.ts.map +1 -1
- package/dist/server/routes/extract.js +11 -0
- package/dist/server/routes/extract.js.map +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +45 -19
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/mcp.d.ts +2 -1
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +307 -38
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/quick-answer.d.ts +9 -0
- package/dist/server/routes/quick-answer.d.ts.map +1 -0
- package/dist/server/routes/quick-answer.js +84 -0
- package/dist/server/routes/quick-answer.js.map +1 -0
- package/dist/server/routes/watch.d.ts +16 -0
- package/dist/server/routes/watch.d.ts.map +1 -0
- package/dist/server/routes/watch.js +219 -0
- package/dist/server/routes/watch.js.map +1 -0
- package/dist/server/routes/youtube.d.ts +7 -0
- package/dist/server/routes/youtube.d.ts.map +1 -0
- package/dist/server/routes/youtube.js +87 -0
- package/dist/server/routes/youtube.js.map +1 -0
- package/dist/types.d.ts +18 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +14 -5
- package/package.json +1 -1
|
@@ -14,6 +14,7 @@ import { Router } from 'express';
|
|
|
14
14
|
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
15
15
|
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
|
|
16
16
|
import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
17
|
+
import { LRUCache } from 'lru-cache';
|
|
17
18
|
import { peel, peelBatch } from '../../index.js';
|
|
18
19
|
import { normalizeActions } from '../../core/actions.js';
|
|
19
20
|
import { runAgent } from '../../core/agent.js';
|
|
@@ -30,6 +31,12 @@ try {
|
|
|
30
31
|
pkgVersion = pkg.version;
|
|
31
32
|
}
|
|
32
33
|
catch { /* fallback */ }
|
|
34
|
+
/**
 * Process-wide LRU cache for `webpeel_fetch` results served over MCP.
 *
 * Bounded three ways: at most 500 entries, a 5-minute default TTL (callers may
 * override the TTL per entry when storing), and a total size budget of
 * 100 * 1024 * 1024 units. An entry's size is the length of its JSON
 * serialization, i.e. a character count, so the "100MB" label is an
 * approximation rather than an exact byte measurement.
 */
const mcpFetchCache = new LRUCache({
    maxSize: 100 * 1024 * 1024, // 100MB
    sizeCalculation(entry) {
        return JSON.stringify(entry).length;
    },
    ttl: 5 * 60 * 1000, // 5 minutes default
    max: 500,
});
|
|
33
40
|
// ---------------------------------------------------------------------------
|
|
34
41
|
// Helper functions for brand extraction
|
|
35
42
|
// ---------------------------------------------------------------------------
|
|
@@ -58,17 +65,21 @@ function getTools() {
|
|
|
58
65
|
return [
|
|
59
66
|
{
|
|
60
67
|
name: 'webpeel_fetch',
|
|
61
|
-
description: 'Fetch
|
|
68
|
+
description: 'Fetch any URL and return clean markdown content. Handles JavaScript rendering, bot detection, and content extraction automatically. Set readable=true for article-only content.',
|
|
62
69
|
annotations: { title: 'Fetch Web Page', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
63
70
|
inputSchema: {
|
|
64
71
|
type: 'object',
|
|
65
72
|
properties: {
|
|
66
|
-
url: { type: 'string', description: '
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
73
|
+
url: { type: 'string', description: 'URL to fetch' },
|
|
74
|
+
format: { type: 'string', enum: ['markdown', 'html', 'text'], description: 'Output format (default: markdown)', default: 'markdown' },
|
|
75
|
+
render: { type: 'boolean', description: 'Use browser rendering for JavaScript-heavy sites', default: false },
|
|
76
|
+
stealth: { type: 'boolean', description: 'Stealth mode for bot-protected sites (Amazon, LinkedIn, etc.)', default: false },
|
|
77
|
+
readable: { type: 'boolean', description: 'Reader mode — extract only article content, strip all noise', default: false },
|
|
78
|
+
question: { type: 'string', description: 'Ask a question about the content (BM25, no LLM needed). Returns the most relevant passages.' },
|
|
79
|
+
budget: { type: 'number', description: 'Smart token budget — distill content to N tokens' },
|
|
71
80
|
selector: { type: 'string', description: 'CSS selector to extract specific content' },
|
|
81
|
+
screenshot: { type: 'boolean', description: 'Also take a screenshot', default: false },
|
|
82
|
+
wait: { type: 'number', description: 'Milliseconds to wait for dynamic content', default: 0 },
|
|
72
83
|
maxTokens: { type: 'number', description: 'Maximum token count for output' },
|
|
73
84
|
images: { type: 'boolean', description: 'Extract image URLs', default: false },
|
|
74
85
|
inlineExtract: {
|
|
@@ -108,7 +119,7 @@ function getTools() {
|
|
|
108
119
|
},
|
|
109
120
|
{
|
|
110
121
|
name: 'webpeel_search',
|
|
111
|
-
description: 'Search the web and return results with titles, URLs, and snippets.',
|
|
122
|
+
description: 'Search the web and return structured results with titles, URLs, and snippets. No API key needed.',
|
|
112
123
|
annotations: { title: 'Search the Web', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
113
124
|
inputSchema: {
|
|
114
125
|
type: 'object',
|
|
@@ -121,7 +132,7 @@ function getTools() {
|
|
|
121
132
|
},
|
|
122
133
|
{
|
|
123
134
|
name: 'webpeel_crawl',
|
|
124
|
-
description: 'Crawl a website
|
|
135
|
+
description: 'Crawl a website starting from a URL. Returns content for all discovered pages up to the specified depth/limit.',
|
|
125
136
|
annotations: { title: 'Crawl Website', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
126
137
|
inputSchema: {
|
|
127
138
|
type: 'object',
|
|
@@ -136,7 +147,7 @@ function getTools() {
|
|
|
136
147
|
},
|
|
137
148
|
{
|
|
138
149
|
name: 'webpeel_map',
|
|
139
|
-
description: 'Discover all URLs on a domain
|
|
150
|
+
description: 'Discover all URLs on a domain via sitemap and link crawling. Returns a structured URL list.',
|
|
140
151
|
annotations: { title: 'Map Website URLs', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
141
152
|
inputSchema: {
|
|
142
153
|
type: 'object',
|
|
@@ -149,7 +160,7 @@ function getTools() {
|
|
|
149
160
|
},
|
|
150
161
|
{
|
|
151
162
|
name: 'webpeel_extract',
|
|
152
|
-
description: 'Extract structured data from a
|
|
163
|
+
description: 'Extract structured data from a URL using CSS selectors, JSON Schema, or LLM. Returns typed key-value pairs.',
|
|
153
164
|
annotations: { title: 'Extract Structured Data', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
154
165
|
inputSchema: {
|
|
155
166
|
type: 'object',
|
|
@@ -164,7 +175,7 @@ function getTools() {
|
|
|
164
175
|
},
|
|
165
176
|
{
|
|
166
177
|
name: 'webpeel_batch',
|
|
167
|
-
description: 'Fetch multiple URLs
|
|
178
|
+
description: 'Fetch multiple URLs concurrently. Pass an array of URLs, get back an array of results.',
|
|
168
179
|
annotations: { title: 'Batch Fetch URLs', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
169
180
|
inputSchema: {
|
|
170
181
|
type: 'object',
|
|
@@ -178,7 +189,7 @@ function getTools() {
|
|
|
178
189
|
},
|
|
179
190
|
{
|
|
180
191
|
name: 'webpeel_research',
|
|
181
|
-
description: '
|
|
192
|
+
description: 'Multi-step web research: searches the web, fetches top sources, follows leads, and synthesizes findings into a report with citations.',
|
|
182
193
|
annotations: { title: 'Deep Research Agent', readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
183
194
|
inputSchema: {
|
|
184
195
|
type: 'object',
|
|
@@ -197,7 +208,7 @@ function getTools() {
|
|
|
197
208
|
},
|
|
198
209
|
{
|
|
199
210
|
name: 'webpeel_screenshot',
|
|
200
|
-
description: 'Take a screenshot of
|
|
211
|
+
description: 'Take a screenshot of any URL. Returns a PNG image. Supports full-page capture and viewport sizing.',
|
|
201
212
|
annotations: { title: 'Take Screenshot', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
202
213
|
inputSchema: {
|
|
203
214
|
type: 'object',
|
|
@@ -217,7 +228,7 @@ function getTools() {
|
|
|
217
228
|
},
|
|
218
229
|
{
|
|
219
230
|
name: 'webpeel_summarize',
|
|
220
|
-
description: 'Generate an AI
|
|
231
|
+
description: 'Generate an AI summary of a URL\'s content. Requires an LLM API key (BYOK).',
|
|
221
232
|
annotations: { title: 'Summarize Page', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
222
233
|
inputSchema: {
|
|
223
234
|
type: 'object',
|
|
@@ -234,7 +245,7 @@ function getTools() {
|
|
|
234
245
|
},
|
|
235
246
|
{
|
|
236
247
|
name: 'webpeel_answer',
|
|
237
|
-
description: 'Ask a question
|
|
248
|
+
description: 'Ask a question about a URL and get an AI-generated answer with citations. Requires an LLM API key (BYOK). For LLM-free Q&A, use webpeel_quick_answer instead.',
|
|
238
249
|
annotations: { title: 'Answer a Question', readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
239
250
|
inputSchema: {
|
|
240
251
|
type: 'object',
|
|
@@ -252,7 +263,7 @@ function getTools() {
|
|
|
252
263
|
},
|
|
253
264
|
{
|
|
254
265
|
name: 'webpeel_brand',
|
|
255
|
-
description: 'Extract branding
|
|
266
|
+
description: 'Extract branding assets from a URL: logo, colors, fonts, and social links.',
|
|
256
267
|
annotations: { title: 'Extract Branding', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
257
268
|
inputSchema: {
|
|
258
269
|
type: 'object',
|
|
@@ -265,7 +276,7 @@ function getTools() {
|
|
|
265
276
|
},
|
|
266
277
|
{
|
|
267
278
|
name: 'webpeel_change_track',
|
|
268
|
-
description: 'Track changes on a URL
|
|
279
|
+
description: 'Track content changes on a URL. First call saves a snapshot, subsequent calls show what changed.',
|
|
269
280
|
annotations: { title: 'Track Page Changes', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
270
281
|
inputSchema: {
|
|
271
282
|
type: 'object',
|
|
@@ -278,18 +289,73 @@ function getTools() {
|
|
|
278
289
|
},
|
|
279
290
|
{
|
|
280
291
|
name: 'webpeel_deep_fetch',
|
|
281
|
-
description: 'Search
|
|
292
|
+
description: 'Search + fetch + analyze in one call. Fetches multiple sources for a query, scores by relevance, deduplicates facts, and merges into structured intelligence. No LLM key needed. Supports \'comparison\' format for vs-queries.',
|
|
282
293
|
annotations: { title: 'Deep Fetch Research', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
283
294
|
inputSchema: {
|
|
284
295
|
type: 'object',
|
|
285
296
|
properties: {
|
|
286
297
|
query: { type: 'string', description: 'Search query to research' },
|
|
287
298
|
count: { type: 'number', description: 'Number of top results to fetch (default: 5, max: 10)', default: 5, minimum: 1, maximum: 10 },
|
|
288
|
-
format: { type: 'string', enum: ['markdown', 'text'], description: 'Content format
|
|
299
|
+
format: { type: 'string', enum: ['markdown', 'text', 'comparison'], description: 'Content format (default: markdown). Use "comparison" for vs-queries to get a side-by-side structure.', default: 'markdown' },
|
|
289
300
|
},
|
|
290
301
|
required: ['query'],
|
|
291
302
|
},
|
|
292
303
|
},
|
|
304
|
+
{
|
|
305
|
+
name: 'webpeel_youtube',
|
|
306
|
+
description: 'Extract the full transcript from a YouTube video. Returns timestamped segments and video metadata. No API key needed. Supports all YouTube URL formats.',
|
|
307
|
+
annotations: { title: 'Extract YouTube Transcript', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
308
|
+
inputSchema: {
|
|
309
|
+
type: 'object',
|
|
310
|
+
properties: {
|
|
311
|
+
url: { type: 'string', description: 'YouTube video URL (supports youtube.com/watch, youtu.be, embed, shorts, and mobile URLs)' },
|
|
312
|
+
language: { type: 'string', description: 'Preferred transcript language code (default: en). Falls back to any available language if not found.' },
|
|
313
|
+
},
|
|
314
|
+
required: ['url'],
|
|
315
|
+
},
|
|
316
|
+
},
|
|
317
|
+
{
|
|
318
|
+
name: 'webpeel_auto_extract',
|
|
319
|
+
description: 'Detect page type and extract structured JSON automatically. Supports pricing pages, product listings, contact info, articles, and API documentation. No LLM needed.',
|
|
320
|
+
inputSchema: {
|
|
321
|
+
type: 'object',
|
|
322
|
+
properties: {
|
|
323
|
+
url: { type: 'string', description: 'URL to fetch and auto-extract structured data from' },
|
|
324
|
+
},
|
|
325
|
+
required: ['url'],
|
|
326
|
+
},
|
|
327
|
+
},
|
|
328
|
+
{
|
|
329
|
+
name: 'webpeel_quick_answer',
|
|
330
|
+
description: 'Ask a question about a URL\'s content — no LLM key needed. Uses BM25 relevance scoring to find and return the most relevant passages. Returns answer text with confidence score.',
|
|
331
|
+
annotations: { title: 'Quick Answer (No LLM)', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
332
|
+
inputSchema: {
|
|
333
|
+
type: 'object',
|
|
334
|
+
properties: {
|
|
335
|
+
url: { type: 'string', description: 'URL to fetch and search' },
|
|
336
|
+
question: { type: 'string', description: 'Question to answer from the page content' },
|
|
337
|
+
maxPassages: { type: 'number', description: 'Maximum number of relevant passages to return (default: 3)', default: 3, minimum: 1, maximum: 10 },
|
|
338
|
+
render: { type: 'boolean', description: 'Use browser rendering', default: false },
|
|
339
|
+
},
|
|
340
|
+
required: ['url', 'question'],
|
|
341
|
+
},
|
|
342
|
+
},
|
|
343
|
+
{
|
|
344
|
+
name: 'webpeel_watch',
|
|
345
|
+
description: 'Monitor a URL for changes with webhook notifications. Create persistent watchers that check on a schedule and alert when content changes.',
|
|
346
|
+
inputSchema: {
|
|
347
|
+
type: 'object',
|
|
348
|
+
properties: {
|
|
349
|
+
action: { type: 'string', enum: ['create', 'list', 'check', 'delete'], description: 'Watch action to perform' },
|
|
350
|
+
url: { type: 'string', description: 'URL to monitor (for create)' },
|
|
351
|
+
id: { type: 'string', description: 'Watch ID (for check/delete)' },
|
|
352
|
+
webhookUrl: { type: 'string', description: 'Webhook URL to notify on changes (for create)' },
|
|
353
|
+
intervalMinutes: { type: 'number', description: 'Check interval in minutes (default: 60)' },
|
|
354
|
+
selector: { type: 'string', description: 'CSS selector to monitor specific content (optional)' },
|
|
355
|
+
},
|
|
356
|
+
required: ['action'],
|
|
357
|
+
},
|
|
358
|
+
},
|
|
293
359
|
];
|
|
294
360
|
}
|
|
295
361
|
// ---------------------------------------------------------------------------
|
|
@@ -303,7 +369,7 @@ function safeStringify(obj) {
|
|
|
303
369
|
return JSON.stringify({ error: 'serialization_error', message: 'Failed to serialize result' });
|
|
304
370
|
}
|
|
305
371
|
}
|
|
306
|
-
async function handleToolCall(name, args) {
|
|
372
|
+
async function handleToolCall(name, args, pool, req) {
|
|
307
373
|
try {
|
|
308
374
|
// webpeel_fetch
|
|
309
375
|
if (name === 'webpeel_fetch') {
|
|
@@ -323,8 +389,55 @@ async function handleToolCall(name, args) {
|
|
|
323
389
|
selector: args.selector,
|
|
324
390
|
maxTokens: args.maxTokens,
|
|
325
391
|
images: args.images,
|
|
392
|
+
readable: args.readable || false,
|
|
393
|
+
budget: args.budget,
|
|
394
|
+
question: args.question,
|
|
395
|
+
screenshot: args.screenshot || false,
|
|
326
396
|
actions: parsedActions,
|
|
327
397
|
};
|
|
398
|
+
// Cache key and bypass logic
|
|
399
|
+
const mcpNoCache = args.noCache === true;
|
|
400
|
+
const mcpCacheTtlMs = typeof args.cacheTtl === 'number' ? args.cacheTtl * 1000 : 5 * 60 * 1000;
|
|
401
|
+
const mcpActionsKey = parsedActions ? JSON.stringify(parsedActions) : '';
|
|
402
|
+
const mcpCacheKey = `mcp:fetch:${url}:${options.render}:${options.wait}:${options.format}:${options.selector}:${options.images}:${mcpActionsKey}`;
|
|
403
|
+
// Check cache (skip for noCache or inline extraction requests)
|
|
404
|
+
const hasInlineExtract = args.inlineExtract && (args.inlineExtract.schema || args.inlineExtract.prompt);
|
|
405
|
+
if (!mcpNoCache && !hasInlineExtract) {
|
|
406
|
+
const cached = mcpFetchCache.get(mcpCacheKey);
|
|
407
|
+
if (cached) {
|
|
408
|
+
const cacheAge = Date.now() - cached.timestamp;
|
|
409
|
+
if (cacheAge < mcpCacheTtlMs) {
|
|
410
|
+
const r = cached.result;
|
|
411
|
+
const cachedOutput = {
|
|
412
|
+
url: r.url || url,
|
|
413
|
+
title: r.title || r.metadata?.title || '',
|
|
414
|
+
tokens: r.tokens || 0,
|
|
415
|
+
content: r.content,
|
|
416
|
+
_cache: 'HIT',
|
|
417
|
+
_cacheAge: Math.floor(cacheAge / 1000),
|
|
418
|
+
};
|
|
419
|
+
if (r.metadata && Object.keys(r.metadata).length > 0)
|
|
420
|
+
cachedOutput.metadata = r.metadata;
|
|
421
|
+
if (r.domainData)
|
|
422
|
+
cachedOutput.domainData = r.domainData;
|
|
423
|
+
if (r.readability)
|
|
424
|
+
cachedOutput.readability = { readingTime: r.readability.readingTime, wordCount: r.readability.wordCount };
|
|
425
|
+
if (r.quickAnswer)
|
|
426
|
+
cachedOutput.quickAnswer = r.quickAnswer;
|
|
427
|
+
if (r.json)
|
|
428
|
+
cachedOutput.json = r.json;
|
|
429
|
+
if (r.extracted)
|
|
430
|
+
cachedOutput.extracted = r.extracted;
|
|
431
|
+
if (r.images && r.images.length > 0)
|
|
432
|
+
cachedOutput.images = r.images;
|
|
433
|
+
if (r.screenshot)
|
|
434
|
+
cachedOutput.screenshot = r.screenshot;
|
|
435
|
+
if (r.fingerprint)
|
|
436
|
+
cachedOutput.fingerprint = r.fingerprint;
|
|
437
|
+
return ok(safeStringify(cachedOutput));
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
}
|
|
328
441
|
const result = await Promise.race([
|
|
329
442
|
peel(url, options),
|
|
330
443
|
timeout(60000, 'Fetch timed out'),
|
|
@@ -348,7 +461,45 @@ async function handleToolCall(name, args) {
|
|
|
348
461
|
result.extractTokensUsed = extractResult.tokensUsed;
|
|
349
462
|
}
|
|
350
463
|
}
|
|
351
|
-
|
|
464
|
+
// Store in cache (skip for inline extraction results — they depend on user's LLM keys)
|
|
465
|
+
if (!mcpNoCache && !hasInlineExtract) {
|
|
466
|
+
mcpFetchCache.set(mcpCacheKey, { result, timestamp: Date.now() }, { ttl: mcpCacheTtlMs });
|
|
467
|
+
}
|
|
468
|
+
// Build consistent output — always include url, title, tokens
|
|
469
|
+
const output = {
|
|
470
|
+
url: result.url || url,
|
|
471
|
+
title: result.title || result.metadata?.title || '',
|
|
472
|
+
tokens: result.tokens || 0,
|
|
473
|
+
content: result.content,
|
|
474
|
+
};
|
|
475
|
+
if (result.metadata && Object.keys(result.metadata).length > 0)
|
|
476
|
+
output.metadata = result.metadata;
|
|
477
|
+
if (result.domainData)
|
|
478
|
+
output.domainData = result.domainData;
|
|
479
|
+
if (result.readability)
|
|
480
|
+
output.readability = {
|
|
481
|
+
readingTime: result.readability.readingTime,
|
|
482
|
+
wordCount: result.readability.wordCount,
|
|
483
|
+
};
|
|
484
|
+
if (result.quickAnswer)
|
|
485
|
+
output.quickAnswer = result.quickAnswer;
|
|
486
|
+
if (result.json)
|
|
487
|
+
output.json = result.json;
|
|
488
|
+
if (result.extracted)
|
|
489
|
+
output.extracted = result.extracted;
|
|
490
|
+
if (result.images && result.images.length > 0)
|
|
491
|
+
output.images = result.images;
|
|
492
|
+
if (result.screenshot)
|
|
493
|
+
output.screenshot = result.screenshot;
|
|
494
|
+
if (result.fingerprint)
|
|
495
|
+
output.fingerprint = result.fingerprint;
|
|
496
|
+
if (result.extractTokensUsed)
|
|
497
|
+
output.extractTokensUsed = result.extractTokensUsed;
|
|
498
|
+
if (result._cache)
|
|
499
|
+
output._cache = result._cache;
|
|
500
|
+
if (result._cacheAge !== undefined)
|
|
501
|
+
output._cacheAge = result._cacheAge;
|
|
502
|
+
return ok(safeStringify(output));
|
|
352
503
|
}
|
|
353
504
|
// webpeel_search
|
|
354
505
|
if (name === 'webpeel_search') {
|
|
@@ -358,11 +509,19 @@ async function handleToolCall(name, args) {
|
|
|
358
509
|
const { getBestSearchProvider } = await import('../../core/search-provider.js');
|
|
359
510
|
const { provider, apiKey } = getBestSearchProvider();
|
|
360
511
|
const count = Math.min(Math.max(args.count || 5, 1), 10);
|
|
361
|
-
const
|
|
512
|
+
const rawResults = await Promise.race([
|
|
362
513
|
provider.searchWeb(query, { count, apiKey }),
|
|
363
514
|
timeout(30000, 'Search timed out'),
|
|
364
515
|
]);
|
|
365
|
-
|
|
516
|
+
// Normalize to consistent format
|
|
517
|
+
const resultsList = Array.isArray(rawResults) ? rawResults : (rawResults?.results ?? []);
|
|
518
|
+
const normalizedResults = resultsList.map((r) => ({
|
|
519
|
+
title: r.title || '',
|
|
520
|
+
url: r.url || r.link || '',
|
|
521
|
+
snippet: r.snippet || r.description || r.body || '',
|
|
522
|
+
...(r.favicon ? { favicon: r.favicon } : {}),
|
|
523
|
+
}));
|
|
524
|
+
return ok(safeStringify({ query, count: normalizedResults.length, results: normalizedResults }));
|
|
366
525
|
}
|
|
367
526
|
// webpeel_crawl
|
|
368
527
|
if (name === 'webpeel_crawl') {
|
|
@@ -635,7 +794,9 @@ async function handleToolCall(name, args) {
|
|
|
635
794
|
if (!query || typeof query !== 'string')
|
|
636
795
|
throw new Error('Invalid query');
|
|
637
796
|
const count = Math.min(Math.max(args.count || 5, 1), 10);
|
|
638
|
-
const
|
|
797
|
+
const rawFormat = args.format || 'markdown';
|
|
798
|
+
const isComparison = rawFormat === 'comparison';
|
|
799
|
+
const format = (isComparison ? 'markdown' : rawFormat);
|
|
639
800
|
// Step 1: Search for the query using best available provider
|
|
640
801
|
const { getBestSearchProvider } = await import('../../core/search-provider.js');
|
|
641
802
|
const { provider, apiKey } = getBestSearchProvider();
|
|
@@ -661,25 +822,132 @@ async function handleToolCall(name, args) {
|
|
|
661
822
|
for (let i = 0; i < pages.length; i++) {
|
|
662
823
|
const page = pages[i];
|
|
663
824
|
const searchResult = topResults[i];
|
|
664
|
-
const
|
|
665
|
-
const title = page?.title || searchResult?.title ||
|
|
666
|
-
|
|
825
|
+
const pageUrl = urls[i];
|
|
826
|
+
const title = page?.title || searchResult?.title || pageUrl;
|
|
827
|
+
// Position-based relevance score (top result = 1.0, decreasing)
|
|
828
|
+
const relevanceScore = Math.round((1 - i / Math.max(pages.length, 1)) * 100) / 100;
|
|
829
|
+
sources.push({ url: pageUrl, title, relevanceScore, ...(searchResult?.snippet ? { snippet: searchResult.snippet } : {}) });
|
|
667
830
|
if (page?.content) {
|
|
668
|
-
contentParts.push(`## Source ${i + 1}: ${title}\n**URL:** ${
|
|
831
|
+
contentParts.push(`## Source ${i + 1}: ${title}\n**URL:** ${pageUrl}\n\n${page.content}\n\n---\n`);
|
|
669
832
|
totalTokens += page.tokens || 0;
|
|
670
833
|
}
|
|
671
834
|
else if (page?.error) {
|
|
672
|
-
contentParts.push(`## Source ${i + 1}: ${title}\n**URL:** ${
|
|
835
|
+
contentParts.push(`## Source ${i + 1}: ${title}\n**URL:** ${pageUrl}\n\n*(Failed to fetch: ${page.error})*\n\n---\n`);
|
|
673
836
|
}
|
|
674
837
|
}
|
|
675
838
|
const mergedContent = contentParts.join('\n');
|
|
676
|
-
|
|
839
|
+
const deepFetchOutput = {
|
|
677
840
|
query,
|
|
678
841
|
sources,
|
|
679
842
|
content: mergedContent,
|
|
680
843
|
totalTokens,
|
|
844
|
+
};
|
|
845
|
+
// For comparison format, add a structured comparison hint
|
|
846
|
+
if (isComparison) {
|
|
847
|
+
deepFetchOutput.format = 'comparison';
|
|
848
|
+
deepFetchOutput.comparisonNote = 'Sources fetched and ranked by relevance. Review sources array and content sections for side-by-side comparison.';
|
|
849
|
+
}
|
|
850
|
+
return ok(safeStringify(deepFetchOutput));
|
|
851
|
+
}
|
|
852
|
+
// webpeel_quick_answer
|
|
853
|
+
if (name === 'webpeel_quick_answer') {
|
|
854
|
+
const url = args.url;
|
|
855
|
+
const question = args.question;
|
|
856
|
+
if (!url || typeof url !== 'string')
|
|
857
|
+
throw new Error('Invalid URL');
|
|
858
|
+
if (url.length > 2048)
|
|
859
|
+
throw new Error('URL too long');
|
|
860
|
+
if (!question || typeof question !== 'string')
|
|
861
|
+
throw new Error('Invalid question');
|
|
862
|
+
if (question.length > 1000)
|
|
863
|
+
throw new Error('Question too long (max 1000 characters)');
|
|
864
|
+
const maxPassages = typeof args.maxPassages === 'number' ? Math.min(Math.max(args.maxPassages, 1), 10) : 3;
|
|
865
|
+
const peelResult = await Promise.race([
|
|
866
|
+
peel(url, {
|
|
867
|
+
render: args.render || false,
|
|
868
|
+
format: 'markdown',
|
|
869
|
+
budget: 8000,
|
|
870
|
+
}),
|
|
871
|
+
timeout(60000, 'Quick answer fetch timed out'),
|
|
872
|
+
]);
|
|
873
|
+
const { quickAnswer } = await import('../../core/quick-answer.js');
|
|
874
|
+
const qa = quickAnswer({
|
|
875
|
+
question,
|
|
876
|
+
content: peelResult.content || '',
|
|
877
|
+
url: peelResult.url || url,
|
|
878
|
+
maxPassages,
|
|
879
|
+
});
|
|
880
|
+
return ok(safeStringify({
|
|
881
|
+
url: peelResult.url || url,
|
|
882
|
+
title: peelResult.title,
|
|
883
|
+
question: qa.question,
|
|
884
|
+
answer: qa.answer,
|
|
885
|
+
confidence: qa.confidence,
|
|
886
|
+
passages: qa.passages,
|
|
887
|
+
method: qa.method,
|
|
681
888
|
}));
|
|
682
889
|
}
|
|
890
|
+
// webpeel_youtube
|
|
891
|
+
if (name === 'webpeel_youtube') {
|
|
892
|
+
const url = args.url;
|
|
893
|
+
if (!url || typeof url !== 'string')
|
|
894
|
+
throw new Error('Invalid URL');
|
|
895
|
+
const { getYouTubeTranscript } = await import('../../core/youtube.js');
|
|
896
|
+
const transcript = await Promise.race([
|
|
897
|
+
getYouTubeTranscript(url, {
|
|
898
|
+
language: args.language ?? 'en',
|
|
899
|
+
}),
|
|
900
|
+
timeout(60000, 'YouTube transcript extraction timed out'),
|
|
901
|
+
]);
|
|
902
|
+
return ok(safeStringify(transcript));
|
|
903
|
+
}
|
|
904
|
+
// webpeel_auto_extract
|
|
905
|
+
if (name === 'webpeel_auto_extract') {
|
|
906
|
+
const url = args.url;
|
|
907
|
+
if (!url)
|
|
908
|
+
return { content: [{ type: 'text', text: JSON.stringify({ error: 'Missing url parameter' }) }] };
|
|
909
|
+
const { autoExtract } = await import('../../core/auto-extract.js');
|
|
910
|
+
const result = await peel(url, { format: 'html' });
|
|
911
|
+
const extracted = autoExtract(result.content || '', url);
|
|
912
|
+
return {
|
|
913
|
+
content: [{ type: 'text', text: JSON.stringify({
|
|
914
|
+
url,
|
|
915
|
+
pageType: extracted.type,
|
|
916
|
+
structured: extracted,
|
|
917
|
+
}, null, 2) }],
|
|
918
|
+
};
|
|
919
|
+
}
|
|
920
|
+
// webpeel_watch
|
|
921
|
+
if (name === 'webpeel_watch') {
|
|
922
|
+
const action = args.action;
|
|
923
|
+
if (!pool) {
|
|
924
|
+
return { content: [{ type: 'text', text: JSON.stringify({ error: 'Watch feature requires database connection. Use the REST API at /v1/watch instead.' }) }] };
|
|
925
|
+
}
|
|
926
|
+
const { WatchManager } = await import('../../core/watch-manager.js');
|
|
927
|
+
const wm = new WatchManager(pool);
|
|
928
|
+
const accountId = req?.auth?.keyInfo?.accountId || req?.auth?.keyInfo?.userId || 'anonymous';
|
|
929
|
+
if (action === 'create') {
|
|
930
|
+
const watch = await wm.create(accountId, args.url, {
|
|
931
|
+
webhookUrl: args.webhookUrl,
|
|
932
|
+
checkIntervalMinutes: args.intervalMinutes || 60,
|
|
933
|
+
selector: args.selector,
|
|
934
|
+
});
|
|
935
|
+
return { content: [{ type: 'text', text: JSON.stringify(watch, null, 2) }] };
|
|
936
|
+
}
|
|
937
|
+
if (action === 'list') {
|
|
938
|
+
const watches = await wm.list(accountId);
|
|
939
|
+
return { content: [{ type: 'text', text: JSON.stringify(watches, null, 2) }] };
|
|
940
|
+
}
|
|
941
|
+
if (action === 'check') {
|
|
942
|
+
const result = await wm.check(args.id);
|
|
943
|
+
return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] };
|
|
944
|
+
}
|
|
945
|
+
if (action === 'delete') {
|
|
946
|
+
await wm.delete(args.id);
|
|
947
|
+
return { content: [{ type: 'text', text: JSON.stringify({ success: true }) }] };
|
|
948
|
+
}
|
|
949
|
+
return { content: [{ type: 'text', text: JSON.stringify({ error: `Unknown watch action: ${action}` }) }] };
|
|
950
|
+
}
|
|
683
951
|
throw new Error(`Unknown tool: ${name}`);
|
|
684
952
|
}
|
|
685
953
|
catch (error) {
|
|
@@ -699,13 +967,13 @@ function timeout(ms, msg) {
|
|
|
699
967
|
// ---------------------------------------------------------------------------
|
|
700
968
|
// Create a fresh MCP server instance (stateless — one per request)
|
|
701
969
|
// ---------------------------------------------------------------------------
|
|
702
|
-
function createMcpServer() {
|
|
970
|
+
/**
 * Build a fresh MCP server instance for a single request.
 *
 * Registers two handlers: one that lists the tools returned by `getTools()`,
 * and one that forwards a tool call to `handleToolCall` together with the
 * database pool and the originating HTTP request.
 *
 * @param pool - Database pool passed through to tool handlers (may be absent).
 * @param req - The HTTP request that triggered this MCP exchange.
 * @returns The configured server instance.
 */
function createMcpServer(pool, req) {
    const serverInfo = { name: 'webpeel', version: pkgVersion };
    const serverOptions = { capabilities: { tools: {} } };
    const mcpServer = new Server(serverInfo, serverOptions);
    const toolList = getTools();
    const listTools = async () => ({ tools: toolList });
    // A call without arguments is treated as a call with an empty object.
    const callTool = async (request) => {
        const toolName = request.params.name;
        const toolArgs = request.params.arguments ?? {};
        return handleToolCall(toolName, toolArgs, pool, req);
    };
    mcpServer.setRequestHandler(ListToolsRequestSchema, listTools);
    mcpServer.setRequestHandler(CallToolRequestSchema, callTool);
    return mcpServer;
}
|
|
@@ -715,7 +983,7 @@ function createMcpServer() {
|
|
|
715
983
|
// ---------------------------------------------------------------------------
|
|
716
984
|
// Shared MCP handler logic
|
|
717
985
|
// ---------------------------------------------------------------------------
|
|
718
|
-
async function handleMcpPost(req, res) {
|
|
986
|
+
async function handleMcpPost(req, res, pool) {
|
|
719
987
|
// Require authentication — reject unauthenticated requests.
|
|
720
988
|
// The /:apiKey/v2/mcp path validates the key before calling this handler.
|
|
721
989
|
// The /mcp and /v2/mcp paths rely on the global auth middleware (Bearer token).
|
|
@@ -728,7 +996,7 @@ async function handleMcpPost(req, res) {
|
|
|
728
996
|
return;
|
|
729
997
|
}
|
|
730
998
|
try {
|
|
731
|
-
const server = createMcpServer();
|
|
999
|
+
const server = createMcpServer(pool, req);
|
|
732
1000
|
const transport = new StreamableHTTPServerTransport({
|
|
733
1001
|
sessionIdGenerator: undefined, // stateless
|
|
734
1002
|
});
|
|
@@ -768,16 +1036,17 @@ function mcpDeleteOk(_req, res) {
|
|
|
768
1036
|
// ---------------------------------------------------------------------------
|
|
769
1037
|
// Express router
|
|
770
1038
|
// ---------------------------------------------------------------------------
|
|
771
|
-
export function createMcpRouter(authStore) {
|
|
1039
|
+
export function createMcpRouter(authStore, pool) {
|
|
772
1040
|
const router = Router();
|
|
1041
|
+
const boundHandler = (req, res) => handleMcpPost(req, res, pool);
|
|
773
1042
|
// POST /mcp — legacy path, MCP Streamable HTTP transport
|
|
774
|
-
router.post('/mcp',
|
|
1043
|
+
router.post('/mcp', boundHandler);
|
|
775
1044
|
router.get('/mcp', mcpMethodNotAllowed);
|
|
776
1045
|
router.delete('/mcp', mcpDeleteOk);
|
|
777
1046
|
// POST /v2/mcp — canonical v2 path; auth via Authorization: Bearer <key> header
|
|
778
1047
|
// The global auth middleware already validates the Bearer token, so no extra
|
|
779
1048
|
// validation is needed here.
|
|
780
|
-
router.post('/v2/mcp',
|
|
1049
|
+
router.post('/v2/mcp', boundHandler);
|
|
781
1050
|
router.get('/v2/mcp', mcpMethodNotAllowed);
|
|
782
1051
|
router.delete('/v2/mcp', mcpDeleteOk);
|
|
783
1052
|
// POST /:apiKey/v2/mcp — Firecrawl-style: API key embedded in URL path
|
|
@@ -825,7 +1094,7 @@ export function createMcpRouter(authStore) {
|
|
|
825
1094
|
return;
|
|
826
1095
|
}
|
|
827
1096
|
}
|
|
828
|
-
return handleMcpPost(req, res);
|
|
1097
|
+
return handleMcpPost(req, res, pool);
|
|
829
1098
|
});
|
|
830
1099
|
router.get('/:apiKey/v2/mcp', mcpMethodNotAllowed);
|
|
831
1100
|
router.delete('/:apiKey/v2/mcp', mcpDeleteOk);
|