webpeel 0.13.4 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/README.md +120 -162
  2. package/dist/cli-auth.js +7 -7
  3. package/dist/cli-auth.js.map +1 -1
  4. package/dist/cli.js +197 -26
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/auto-extract.d.ts +83 -0
  7. package/dist/core/auto-extract.d.ts.map +1 -0
  8. package/dist/core/auto-extract.js +565 -0
  9. package/dist/core/auto-extract.js.map +1 -0
  10. package/dist/core/deep-fetch.d.ts +75 -0
  11. package/dist/core/deep-fetch.d.ts.map +1 -0
  12. package/dist/core/deep-fetch.js +406 -0
  13. package/dist/core/deep-fetch.js.map +1 -0
  14. package/dist/core/domain-extractors.d.ts +34 -0
  15. package/dist/core/domain-extractors.d.ts.map +1 -0
  16. package/dist/core/domain-extractors.js +654 -0
  17. package/dist/core/domain-extractors.js.map +1 -0
  18. package/dist/core/markdown.d.ts +8 -0
  19. package/dist/core/markdown.d.ts.map +1 -1
  20. package/dist/core/markdown.js +25 -0
  21. package/dist/core/markdown.js.map +1 -1
  22. package/dist/core/quick-answer.d.ts +28 -0
  23. package/dist/core/quick-answer.d.ts.map +1 -0
  24. package/dist/core/quick-answer.js +288 -0
  25. package/dist/core/quick-answer.js.map +1 -0
  26. package/dist/core/readability.d.ts +58 -0
  27. package/dist/core/readability.d.ts.map +1 -0
  28. package/dist/core/readability.js +496 -0
  29. package/dist/core/readability.js.map +1 -0
  30. package/dist/core/search-provider.d.ts.map +1 -1
  31. package/dist/core/search-provider.js +3 -6
  32. package/dist/core/search-provider.js.map +1 -1
  33. package/dist/core/strategies.d.ts.map +1 -1
  34. package/dist/core/strategies.js +70 -5
  35. package/dist/core/strategies.js.map +1 -1
  36. package/dist/core/watch-manager.d.ts +140 -0
  37. package/dist/core/watch-manager.d.ts.map +1 -0
  38. package/dist/core/watch-manager.js +348 -0
  39. package/dist/core/watch-manager.js.map +1 -0
  40. package/dist/core/youtube.d.ts +91 -0
  41. package/dist/core/youtube.d.ts.map +1 -0
  42. package/dist/core/youtube.js +380 -0
  43. package/dist/core/youtube.js.map +1 -0
  44. package/dist/index.d.ts +4 -0
  45. package/dist/index.d.ts.map +1 -1
  46. package/dist/index.js +103 -0
  47. package/dist/index.js.map +1 -1
  48. package/dist/mcp/server.js +58 -16
  49. package/dist/mcp/server.js.map +1 -1
  50. package/dist/server/app.d.ts.map +1 -1
  51. package/dist/server/app.js +19 -1
  52. package/dist/server/app.js.map +1 -1
  53. package/dist/server/routes/deep-fetch.d.ts +9 -0
  54. package/dist/server/routes/deep-fetch.d.ts.map +1 -0
  55. package/dist/server/routes/deep-fetch.js +38 -0
  56. package/dist/server/routes/deep-fetch.js.map +1 -0
  57. package/dist/server/routes/extract.d.ts.map +1 -1
  58. package/dist/server/routes/extract.js +11 -0
  59. package/dist/server/routes/extract.js.map +1 -1
  60. package/dist/server/routes/fetch.d.ts.map +1 -1
  61. package/dist/server/routes/fetch.js +45 -19
  62. package/dist/server/routes/fetch.js.map +1 -1
  63. package/dist/server/routes/mcp.d.ts +2 -1
  64. package/dist/server/routes/mcp.d.ts.map +1 -1
  65. package/dist/server/routes/mcp.js +307 -38
  66. package/dist/server/routes/mcp.js.map +1 -1
  67. package/dist/server/routes/quick-answer.d.ts +9 -0
  68. package/dist/server/routes/quick-answer.d.ts.map +1 -0
  69. package/dist/server/routes/quick-answer.js +84 -0
  70. package/dist/server/routes/quick-answer.js.map +1 -0
  71. package/dist/server/routes/watch.d.ts +16 -0
  72. package/dist/server/routes/watch.d.ts.map +1 -0
  73. package/dist/server/routes/watch.js +219 -0
  74. package/dist/server/routes/watch.js.map +1 -0
  75. package/dist/server/routes/youtube.d.ts +7 -0
  76. package/dist/server/routes/youtube.d.ts.map +1 -0
  77. package/dist/server/routes/youtube.js +87 -0
  78. package/dist/server/routes/youtube.js.map +1 -0
  79. package/dist/types.d.ts +18 -0
  80. package/dist/types.d.ts.map +1 -1
  81. package/dist/types.js.map +1 -1
  82. package/llms.txt +14 -5
  83. package/package.json +1 -1
@@ -14,6 +14,7 @@ import { Router } from 'express';
14
14
  import { Server } from '@modelcontextprotocol/sdk/server/index.js';
15
15
  import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
16
16
  import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
17
+ import { LRUCache } from 'lru-cache';
17
18
  import { peel, peelBatch } from '../../index.js';
18
19
  import { normalizeActions } from '../../core/actions.js';
19
20
  import { runAgent } from '../../core/agent.js';
@@ -30,6 +31,12 @@ try {
30
31
  pkgVersion = pkg.version;
31
32
  }
32
33
  catch { /* fallback */ }
34
+ const mcpFetchCache = new LRUCache({
35
+ max: 500,
36
+ ttl: 5 * 60 * 1000, // 5 minutes default
37
+ maxSize: 100 * 1024 * 1024, // 100MB
38
+ sizeCalculation: (entry) => JSON.stringify(entry).length,
39
+ });
33
40
  // ---------------------------------------------------------------------------
34
41
  // Helper functions for brand extraction
35
42
  // ---------------------------------------------------------------------------
@@ -58,17 +65,21 @@ function getTools() {
58
65
  return [
59
66
  {
60
67
  name: 'webpeel_fetch',
61
- description: 'Fetch a URL and return clean, AI-ready markdown content. Handles JavaScript rendering and anti-bot protections.',
68
+ description: 'Fetch any URL and return clean markdown content. Handles JavaScript rendering, bot detection, and content extraction automatically. Set readable=true for article-only content.',
62
69
  annotations: { title: 'Fetch Web Page', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
63
70
  inputSchema: {
64
71
  type: 'object',
65
72
  properties: {
66
- url: { type: 'string', description: 'The URL to fetch' },
67
- render: { type: 'boolean', description: 'Force browser rendering', default: false },
68
- stealth: { type: 'boolean', description: 'Stealth mode to bypass bot detection', default: false },
69
- wait: { type: 'number', description: 'Milliseconds to wait for dynamic content', default: 0 },
70
- format: { type: 'string', enum: ['markdown', 'text', 'html'], default: 'markdown' },
73
+ url: { type: 'string', description: 'URL to fetch' },
74
+ format: { type: 'string', enum: ['markdown', 'html', 'text'], description: 'Output format (default: markdown)', default: 'markdown' },
75
+ render: { type: 'boolean', description: 'Use browser rendering for JavaScript-heavy sites', default: false },
76
+ stealth: { type: 'boolean', description: 'Stealth mode for bot-protected sites (Amazon, LinkedIn, etc.)', default: false },
77
+ readable: { type: 'boolean', description: 'Reader mode — extract only article content, strip all noise', default: false },
78
+ question: { type: 'string', description: 'Ask a question about the content (BM25, no LLM needed). Returns the most relevant passages.' },
79
+ budget: { type: 'number', description: 'Smart token budget — distill content to N tokens' },
71
80
  selector: { type: 'string', description: 'CSS selector to extract specific content' },
81
+ screenshot: { type: 'boolean', description: 'Also take a screenshot', default: false },
82
+ wait: { type: 'number', description: 'Milliseconds to wait for dynamic content', default: 0 },
72
83
  maxTokens: { type: 'number', description: 'Maximum token count for output' },
73
84
  images: { type: 'boolean', description: 'Extract image URLs', default: false },
74
85
  inlineExtract: {
@@ -108,7 +119,7 @@ function getTools() {
108
119
  },
109
120
  {
110
121
  name: 'webpeel_search',
111
- description: 'Search the web and return results with titles, URLs, and snippets.',
122
+ description: 'Search the web and return structured results with titles, URLs, and snippets. No API key needed.',
112
123
  annotations: { title: 'Search the Web', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
113
124
  inputSchema: {
114
125
  type: 'object',
@@ -121,7 +132,7 @@ function getTools() {
121
132
  },
122
133
  {
123
134
  name: 'webpeel_crawl',
124
- description: 'Crawl a website following links and extracting content.',
135
+ description: 'Crawl a website starting from a URL. Returns content for all discovered pages up to the specified depth/limit.',
125
136
  annotations: { title: 'Crawl Website', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
126
137
  inputSchema: {
127
138
  type: 'object',
@@ -136,7 +147,7 @@ function getTools() {
136
147
  },
137
148
  {
138
149
  name: 'webpeel_map',
139
- description: 'Discover all URLs on a domain using sitemap.xml and link crawling.',
150
+ description: 'Discover all URLs on a domain via sitemap and link crawling. Returns a structured URL list.',
140
151
  annotations: { title: 'Map Website URLs', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
141
152
  inputSchema: {
142
153
  type: 'object',
@@ -149,7 +160,7 @@ function getTools() {
149
160
  },
150
161
  {
151
162
  name: 'webpeel_extract',
152
- description: 'Extract structured data from a webpage using CSS selectors or AI.',
163
+ description: 'Extract structured data from a URL using CSS selectors, JSON Schema, or LLM. Returns typed key-value pairs.',
153
164
  annotations: { title: 'Extract Structured Data', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
154
165
  inputSchema: {
155
166
  type: 'object',
@@ -164,7 +175,7 @@ function getTools() {
164
175
  },
165
176
  {
166
177
  name: 'webpeel_batch',
167
- description: 'Fetch multiple URLs in batch with concurrency control.',
178
+ description: 'Fetch multiple URLs concurrently. Pass an array of URLs, get back an array of results.',
168
179
  annotations: { title: 'Batch Fetch URLs', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
169
180
  inputSchema: {
170
181
  type: 'object',
@@ -178,7 +189,7 @@ function getTools() {
178
189
  },
179
190
  {
180
191
  name: 'webpeel_research',
181
- description: 'Conduct autonomous multi-step web research on a topic. Searches the web, fetches top sources, extracts relevant content, and synthesizes a comprehensive report with citations. Returns a markdown report and structured source list. Requires LLM API key for synthesis; without one it returns raw extracted source content.',
192
+ description: 'Multi-step web research: searches the web, fetches top sources, follows leads, and synthesizes findings into a report with citations.',
182
193
  annotations: { title: 'Deep Research Agent', readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
183
194
  inputSchema: {
184
195
  type: 'object',
@@ -197,7 +208,7 @@ function getTools() {
197
208
  },
198
209
  {
199
210
  name: 'webpeel_screenshot',
200
- description: 'Take a screenshot of a URL and return a base64-encoded image. Supports full page or viewport capture, custom dimensions, PNG/JPEG format, and page actions before capture.',
211
+ description: 'Take a screenshot of any URL. Returns a PNG image. Supports full-page capture and viewport sizing.',
201
212
  annotations: { title: 'Take Screenshot', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
202
213
  inputSchema: {
203
214
  type: 'object',
@@ -217,7 +228,7 @@ function getTools() {
217
228
  },
218
229
  {
219
230
  name: 'webpeel_summarize',
220
- description: 'Generate an AI-powered summary of a webpage using an LLM. Requires an OpenAI-compatible API key.',
231
+ description: 'Generate an AI summary of a URL\'s content. Requires an LLM API key (BYOK).',
221
232
  annotations: { title: 'Summarize Page', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
222
233
  inputSchema: {
223
234
  type: 'object',
@@ -234,7 +245,7 @@ function getTools() {
234
245
  },
235
246
  {
236
247
  name: 'webpeel_answer',
237
- description: 'Ask a question, search the web, fetch top results, and generate a cited answer using an LLM (BYOK). Returns an answer with [1], [2] source citations. Supports OpenAI, Anthropic, and Google LLMs.',
248
+ description: 'Ask a question about a URL and get an AI-generated answer with citations. Requires an LLM API key (BYOK). For LLM-free Q&A, use webpeel_quick_answer instead.',
238
249
  annotations: { title: 'Answer a Question', readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
239
250
  inputSchema: {
240
251
  type: 'object',
@@ -252,7 +263,7 @@ function getTools() {
252
263
  },
253
264
  {
254
265
  name: 'webpeel_brand',
255
- description: 'Extract branding and design system from a URL. Returns colors, fonts, typography, and visual identity elements.',
266
+ description: 'Extract branding assets from a URL: logo, colors, fonts, and social links.',
256
267
  annotations: { title: 'Extract Branding', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
257
268
  inputSchema: {
258
269
  type: 'object',
@@ -265,7 +276,7 @@ function getTools() {
265
276
  },
266
277
  {
267
278
  name: 'webpeel_change_track',
268
- description: 'Track changes on a URL by generating a content fingerprint. Use this to detect when a page has been updated.',
279
+ description: 'Track content changes on a URL. First call saves a snapshot, subsequent calls show what changed.',
269
280
  annotations: { title: 'Track Page Changes', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
270
281
  inputSchema: {
271
282
  type: 'object',
@@ -278,18 +289,73 @@ function getTools() {
278
289
  },
279
290
  {
280
291
  name: 'webpeel_deep_fetch',
281
- description: 'Search for a query and fetch the top N results in parallel, merging all content into one combined document with source attribution. No LLM API key required pure web fetching + merging. Ideal for AI agents that need comprehensive research content on a topic.',
292
+ description: 'Search + fetch + analyze in one call. Fetches multiple sources for a query, scores by relevance, deduplicates facts, and merges into structured intelligence. No LLM key needed. Supports \'comparison\' format for vs-queries.',
282
293
  annotations: { title: 'Deep Fetch Research', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
283
294
  inputSchema: {
284
295
  type: 'object',
285
296
  properties: {
286
297
  query: { type: 'string', description: 'Search query to research' },
287
298
  count: { type: 'number', description: 'Number of top results to fetch (default: 5, max: 10)', default: 5, minimum: 1, maximum: 10 },
288
- format: { type: 'string', enum: ['markdown', 'text'], description: 'Content format for fetched pages (default: markdown)', default: 'markdown' },
299
+ format: { type: 'string', enum: ['markdown', 'text', 'comparison'], description: 'Content format (default: markdown). Use "comparison" for vs-queries to get a side-by-side structure.', default: 'markdown' },
289
300
  },
290
301
  required: ['query'],
291
302
  },
292
303
  },
304
+ {
305
+ name: 'webpeel_youtube',
306
+ description: 'Extract the full transcript from a YouTube video. Returns timestamped segments and video metadata. No API key needed. Supports all YouTube URL formats.',
307
+ annotations: { title: 'Extract YouTube Transcript', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
308
+ inputSchema: {
309
+ type: 'object',
310
+ properties: {
311
+ url: { type: 'string', description: 'YouTube video URL (supports youtube.com/watch, youtu.be, embed, shorts, and mobile URLs)' },
312
+ language: { type: 'string', description: 'Preferred transcript language code (default: en). Falls back to any available language if not found.' },
313
+ },
314
+ required: ['url'],
315
+ },
316
+ },
317
+ {
318
+ name: 'webpeel_auto_extract',
319
+ description: 'Detect page type and extract structured JSON automatically. Supports pricing pages, product listings, contact info, articles, and API documentation. No LLM needed.',
320
+ inputSchema: {
321
+ type: 'object',
322
+ properties: {
323
+ url: { type: 'string', description: 'URL to fetch and auto-extract structured data from' },
324
+ },
325
+ required: ['url'],
326
+ },
327
+ },
328
+ {
329
+ name: 'webpeel_quick_answer',
330
+ description: 'Ask a question about a URL\'s content — no LLM key needed. Uses BM25 relevance scoring to find and return the most relevant passages. Returns answer text with confidence score.',
331
+ annotations: { title: 'Quick Answer (No LLM)', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
332
+ inputSchema: {
333
+ type: 'object',
334
+ properties: {
335
+ url: { type: 'string', description: 'URL to fetch and search' },
336
+ question: { type: 'string', description: 'Question to answer from the page content' },
337
+ maxPassages: { type: 'number', description: 'Maximum number of relevant passages to return (default: 3)', default: 3, minimum: 1, maximum: 10 },
338
+ render: { type: 'boolean', description: 'Use browser rendering', default: false },
339
+ },
340
+ required: ['url', 'question'],
341
+ },
342
+ },
343
+ {
344
+ name: 'webpeel_watch',
345
+ description: 'Monitor a URL for changes with webhook notifications. Create persistent watchers that check on a schedule and alert when content changes.',
346
+ inputSchema: {
347
+ type: 'object',
348
+ properties: {
349
+ action: { type: 'string', enum: ['create', 'list', 'check', 'delete'], description: 'Watch action to perform' },
350
+ url: { type: 'string', description: 'URL to monitor (for create)' },
351
+ id: { type: 'string', description: 'Watch ID (for check/delete)' },
352
+ webhookUrl: { type: 'string', description: 'Webhook URL to notify on changes (for create)' },
353
+ intervalMinutes: { type: 'number', description: 'Check interval in minutes (default: 60)' },
354
+ selector: { type: 'string', description: 'CSS selector to monitor specific content (optional)' },
355
+ },
356
+ required: ['action'],
357
+ },
358
+ },
293
359
  ];
294
360
  }
295
361
  // ---------------------------------------------------------------------------
@@ -303,7 +369,7 @@ function safeStringify(obj) {
303
369
  return JSON.stringify({ error: 'serialization_error', message: 'Failed to serialize result' });
304
370
  }
305
371
  }
306
- async function handleToolCall(name, args) {
372
+ async function handleToolCall(name, args, pool, req) {
307
373
  try {
308
374
  // webpeel_fetch
309
375
  if (name === 'webpeel_fetch') {
@@ -323,8 +389,55 @@ async function handleToolCall(name, args) {
323
389
  selector: args.selector,
324
390
  maxTokens: args.maxTokens,
325
391
  images: args.images,
392
+ readable: args.readable || false,
393
+ budget: args.budget,
394
+ question: args.question,
395
+ screenshot: args.screenshot || false,
326
396
  actions: parsedActions,
327
397
  };
398
+ // Cache key and bypass logic
399
+ const mcpNoCache = args.noCache === true;
400
+ const mcpCacheTtlMs = typeof args.cacheTtl === 'number' ? args.cacheTtl * 1000 : 5 * 60 * 1000;
401
+ const mcpActionsKey = parsedActions ? JSON.stringify(parsedActions) : '';
402
+ const mcpCacheKey = `mcp:fetch:${url}:${options.render}:${options.wait}:${options.format}:${options.selector}:${options.images}:${mcpActionsKey}`;
403
+ // Check cache (skip for noCache or inline extraction requests)
404
+ const hasInlineExtract = args.inlineExtract && (args.inlineExtract.schema || args.inlineExtract.prompt);
405
+ if (!mcpNoCache && !hasInlineExtract) {
406
+ const cached = mcpFetchCache.get(mcpCacheKey);
407
+ if (cached) {
408
+ const cacheAge = Date.now() - cached.timestamp;
409
+ if (cacheAge < mcpCacheTtlMs) {
410
+ const r = cached.result;
411
+ const cachedOutput = {
412
+ url: r.url || url,
413
+ title: r.title || r.metadata?.title || '',
414
+ tokens: r.tokens || 0,
415
+ content: r.content,
416
+ _cache: 'HIT',
417
+ _cacheAge: Math.floor(cacheAge / 1000),
418
+ };
419
+ if (r.metadata && Object.keys(r.metadata).length > 0)
420
+ cachedOutput.metadata = r.metadata;
421
+ if (r.domainData)
422
+ cachedOutput.domainData = r.domainData;
423
+ if (r.readability)
424
+ cachedOutput.readability = { readingTime: r.readability.readingTime, wordCount: r.readability.wordCount };
425
+ if (r.quickAnswer)
426
+ cachedOutput.quickAnswer = r.quickAnswer;
427
+ if (r.json)
428
+ cachedOutput.json = r.json;
429
+ if (r.extracted)
430
+ cachedOutput.extracted = r.extracted;
431
+ if (r.images && r.images.length > 0)
432
+ cachedOutput.images = r.images;
433
+ if (r.screenshot)
434
+ cachedOutput.screenshot = r.screenshot;
435
+ if (r.fingerprint)
436
+ cachedOutput.fingerprint = r.fingerprint;
437
+ return ok(safeStringify(cachedOutput));
438
+ }
439
+ }
440
+ }
328
441
  const result = await Promise.race([
329
442
  peel(url, options),
330
443
  timeout(60000, 'Fetch timed out'),
@@ -348,7 +461,45 @@ async function handleToolCall(name, args) {
348
461
  result.extractTokensUsed = extractResult.tokensUsed;
349
462
  }
350
463
  }
351
- return ok(safeStringify(result));
464
+ // Store in cache (skip for inline extraction results — they depend on user's LLM keys)
465
+ if (!mcpNoCache && !hasInlineExtract) {
466
+ mcpFetchCache.set(mcpCacheKey, { result, timestamp: Date.now() }, { ttl: mcpCacheTtlMs });
467
+ }
468
+ // Build consistent output — always include url, title, tokens
469
+ const output = {
470
+ url: result.url || url,
471
+ title: result.title || result.metadata?.title || '',
472
+ tokens: result.tokens || 0,
473
+ content: result.content,
474
+ };
475
+ if (result.metadata && Object.keys(result.metadata).length > 0)
476
+ output.metadata = result.metadata;
477
+ if (result.domainData)
478
+ output.domainData = result.domainData;
479
+ if (result.readability)
480
+ output.readability = {
481
+ readingTime: result.readability.readingTime,
482
+ wordCount: result.readability.wordCount,
483
+ };
484
+ if (result.quickAnswer)
485
+ output.quickAnswer = result.quickAnswer;
486
+ if (result.json)
487
+ output.json = result.json;
488
+ if (result.extracted)
489
+ output.extracted = result.extracted;
490
+ if (result.images && result.images.length > 0)
491
+ output.images = result.images;
492
+ if (result.screenshot)
493
+ output.screenshot = result.screenshot;
494
+ if (result.fingerprint)
495
+ output.fingerprint = result.fingerprint;
496
+ if (result.extractTokensUsed)
497
+ output.extractTokensUsed = result.extractTokensUsed;
498
+ if (result._cache)
499
+ output._cache = result._cache;
500
+ if (result._cacheAge !== undefined)
501
+ output._cacheAge = result._cacheAge;
502
+ return ok(safeStringify(output));
352
503
  }
353
504
  // webpeel_search
354
505
  if (name === 'webpeel_search') {
@@ -358,11 +509,19 @@ async function handleToolCall(name, args) {
358
509
  const { getBestSearchProvider } = await import('../../core/search-provider.js');
359
510
  const { provider, apiKey } = getBestSearchProvider();
360
511
  const count = Math.min(Math.max(args.count || 5, 1), 10);
361
- const results = await Promise.race([
512
+ const rawResults = await Promise.race([
362
513
  provider.searchWeb(query, { count, apiKey }),
363
514
  timeout(30000, 'Search timed out'),
364
515
  ]);
365
- return ok(safeStringify(results));
516
+ // Normalize to consistent format
517
+ const resultsList = Array.isArray(rawResults) ? rawResults : (rawResults?.results ?? []);
518
+ const normalizedResults = resultsList.map((r) => ({
519
+ title: r.title || '',
520
+ url: r.url || r.link || '',
521
+ snippet: r.snippet || r.description || r.body || '',
522
+ ...(r.favicon ? { favicon: r.favicon } : {}),
523
+ }));
524
+ return ok(safeStringify({ query, count: normalizedResults.length, results: normalizedResults }));
366
525
  }
367
526
  // webpeel_crawl
368
527
  if (name === 'webpeel_crawl') {
@@ -635,7 +794,9 @@ async function handleToolCall(name, args) {
635
794
  if (!query || typeof query !== 'string')
636
795
  throw new Error('Invalid query');
637
796
  const count = Math.min(Math.max(args.count || 5, 1), 10);
638
- const format = args.format || 'markdown';
797
+ const rawFormat = args.format || 'markdown';
798
+ const isComparison = rawFormat === 'comparison';
799
+ const format = (isComparison ? 'markdown' : rawFormat);
639
800
  // Step 1: Search for the query using best available provider
640
801
  const { getBestSearchProvider } = await import('../../core/search-provider.js');
641
802
  const { provider, apiKey } = getBestSearchProvider();
@@ -661,25 +822,132 @@ async function handleToolCall(name, args) {
661
822
  for (let i = 0; i < pages.length; i++) {
662
823
  const page = pages[i];
663
824
  const searchResult = topResults[i];
664
- const url = urls[i];
665
- const title = page?.title || searchResult?.title || url;
666
- sources.push({ url, title });
825
+ const pageUrl = urls[i];
826
+ const title = page?.title || searchResult?.title || pageUrl;
827
+ // Position-based relevance score (top result = 1.0, decreasing)
828
+ const relevanceScore = Math.round((1 - i / Math.max(pages.length, 1)) * 100) / 100;
829
+ sources.push({ url: pageUrl, title, relevanceScore, ...(searchResult?.snippet ? { snippet: searchResult.snippet } : {}) });
667
830
  if (page?.content) {
668
- contentParts.push(`## Source ${i + 1}: ${title}\n**URL:** ${url}\n\n${page.content}\n\n---\n`);
831
+ contentParts.push(`## Source ${i + 1}: ${title}\n**URL:** ${pageUrl}\n\n${page.content}\n\n---\n`);
669
832
  totalTokens += page.tokens || 0;
670
833
  }
671
834
  else if (page?.error) {
672
- contentParts.push(`## Source ${i + 1}: ${title}\n**URL:** ${url}\n\n*(Failed to fetch: ${page.error})*\n\n---\n`);
835
+ contentParts.push(`## Source ${i + 1}: ${title}\n**URL:** ${pageUrl}\n\n*(Failed to fetch: ${page.error})*\n\n---\n`);
673
836
  }
674
837
  }
675
838
  const mergedContent = contentParts.join('\n');
676
- return ok(safeStringify({
839
+ const deepFetchOutput = {
677
840
  query,
678
841
  sources,
679
842
  content: mergedContent,
680
843
  totalTokens,
844
+ };
845
+ // For comparison format, add a structured comparison hint
846
+ if (isComparison) {
847
+ deepFetchOutput.format = 'comparison';
848
+ deepFetchOutput.comparisonNote = 'Sources fetched and ranked by relevance. Review sources array and content sections for side-by-side comparison.';
849
+ }
850
+ return ok(safeStringify(deepFetchOutput));
851
+ }
852
+ // webpeel_quick_answer
853
+ if (name === 'webpeel_quick_answer') {
854
+ const url = args.url;
855
+ const question = args.question;
856
+ if (!url || typeof url !== 'string')
857
+ throw new Error('Invalid URL');
858
+ if (url.length > 2048)
859
+ throw new Error('URL too long');
860
+ if (!question || typeof question !== 'string')
861
+ throw new Error('Invalid question');
862
+ if (question.length > 1000)
863
+ throw new Error('Question too long (max 1000 characters)');
864
+ const maxPassages = typeof args.maxPassages === 'number' ? Math.min(Math.max(args.maxPassages, 1), 10) : 3;
865
+ const peelResult = await Promise.race([
866
+ peel(url, {
867
+ render: args.render || false,
868
+ format: 'markdown',
869
+ budget: 8000,
870
+ }),
871
+ timeout(60000, 'Quick answer fetch timed out'),
872
+ ]);
873
+ const { quickAnswer } = await import('../../core/quick-answer.js');
874
+ const qa = quickAnswer({
875
+ question,
876
+ content: peelResult.content || '',
877
+ url: peelResult.url || url,
878
+ maxPassages,
879
+ });
880
+ return ok(safeStringify({
881
+ url: peelResult.url || url,
882
+ title: peelResult.title,
883
+ question: qa.question,
884
+ answer: qa.answer,
885
+ confidence: qa.confidence,
886
+ passages: qa.passages,
887
+ method: qa.method,
681
888
  }));
682
889
  }
890
+ // webpeel_youtube
891
+ if (name === 'webpeel_youtube') {
892
+ const url = args.url;
893
+ if (!url || typeof url !== 'string')
894
+ throw new Error('Invalid URL');
895
+ const { getYouTubeTranscript } = await import('../../core/youtube.js');
896
+ const transcript = await Promise.race([
897
+ getYouTubeTranscript(url, {
898
+ language: args.language ?? 'en',
899
+ }),
900
+ timeout(60000, 'YouTube transcript extraction timed out'),
901
+ ]);
902
+ return ok(safeStringify(transcript));
903
+ }
904
+ // webpeel_auto_extract
905
+ if (name === 'webpeel_auto_extract') {
906
+ const url = args.url;
907
+ if (!url)
908
+ return { content: [{ type: 'text', text: JSON.stringify({ error: 'Missing url parameter' }) }] };
909
+ const { autoExtract } = await import('../../core/auto-extract.js');
910
+ const result = await peel(url, { format: 'html' });
911
+ const extracted = autoExtract(result.content || '', url);
912
+ return {
913
+ content: [{ type: 'text', text: JSON.stringify({
914
+ url,
915
+ pageType: extracted.type,
916
+ structured: extracted,
917
+ }, null, 2) }],
918
+ };
919
+ }
920
+ // webpeel_watch
921
+ if (name === 'webpeel_watch') {
922
+ const action = args.action;
923
+ if (!pool) {
924
+ return { content: [{ type: 'text', text: JSON.stringify({ error: 'Watch feature requires database connection. Use the REST API at /v1/watch instead.' }) }] };
925
+ }
926
+ const { WatchManager } = await import('../../core/watch-manager.js');
927
+ const wm = new WatchManager(pool);
928
+ const accountId = req?.auth?.keyInfo?.accountId || req?.auth?.keyInfo?.userId || 'anonymous';
929
+ if (action === 'create') {
930
+ const watch = await wm.create(accountId, args.url, {
931
+ webhookUrl: args.webhookUrl,
932
+ checkIntervalMinutes: args.intervalMinutes || 60,
933
+ selector: args.selector,
934
+ });
935
+ return { content: [{ type: 'text', text: JSON.stringify(watch, null, 2) }] };
936
+ }
937
+ if (action === 'list') {
938
+ const watches = await wm.list(accountId);
939
+ return { content: [{ type: 'text', text: JSON.stringify(watches, null, 2) }] };
940
+ }
941
+ if (action === 'check') {
942
+ const result = await wm.check(args.id);
943
+ return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] };
944
+ }
945
+ if (action === 'delete') {
946
+ await wm.delete(args.id);
947
+ return { content: [{ type: 'text', text: JSON.stringify({ success: true }) }] };
948
+ }
949
+ return { content: [{ type: 'text', text: JSON.stringify({ error: `Unknown watch action: ${action}` }) }] };
950
+ }
683
951
  throw new Error(`Unknown tool: ${name}`);
684
952
  }
685
953
  catch (error) {
@@ -699,13 +967,13 @@ function timeout(ms, msg) {
699
967
  // ---------------------------------------------------------------------------
700
968
  // Create a fresh MCP server instance (stateless — one per request)
701
969
  // ---------------------------------------------------------------------------
702
- function createMcpServer() {
970
+ function createMcpServer(pool, req) {
703
971
  const server = new Server({ name: 'webpeel', version: pkgVersion }, { capabilities: { tools: {} } });
704
972
  const tools = getTools();
705
973
  server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools }));
706
974
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
707
975
  const { name, arguments: args } = request.params;
708
- return handleToolCall(name, (args ?? {}));
976
+ return handleToolCall(name, (args ?? {}), pool, req);
709
977
  });
710
978
  return server;
711
979
  }
@@ -715,7 +983,7 @@ function createMcpServer() {
715
983
  // ---------------------------------------------------------------------------
716
984
  // Shared MCP handler logic
717
985
  // ---------------------------------------------------------------------------
718
- async function handleMcpPost(req, res) {
986
+ async function handleMcpPost(req, res, pool) {
719
987
  // Require authentication — reject unauthenticated requests.
720
988
  // The /:apiKey/v2/mcp path validates the key before calling this handler.
721
989
  // The /mcp and /v2/mcp paths rely on the global auth middleware (Bearer token).
@@ -728,7 +996,7 @@ async function handleMcpPost(req, res) {
728
996
  return;
729
997
  }
730
998
  try {
731
- const server = createMcpServer();
999
+ const server = createMcpServer(pool, req);
732
1000
  const transport = new StreamableHTTPServerTransport({
733
1001
  sessionIdGenerator: undefined, // stateless
734
1002
  });
@@ -768,16 +1036,17 @@ function mcpDeleteOk(_req, res) {
768
1036
  // ---------------------------------------------------------------------------
769
1037
  // Express router
770
1038
  // ---------------------------------------------------------------------------
771
- export function createMcpRouter(authStore) {
1039
+ export function createMcpRouter(authStore, pool) {
772
1040
  const router = Router();
1041
+ const boundHandler = (req, res) => handleMcpPost(req, res, pool);
773
1042
  // POST /mcp — legacy path, MCP Streamable HTTP transport
774
- router.post('/mcp', handleMcpPost);
1043
+ router.post('/mcp', boundHandler);
775
1044
  router.get('/mcp', mcpMethodNotAllowed);
776
1045
  router.delete('/mcp', mcpDeleteOk);
777
1046
  // POST /v2/mcp — canonical v2 path; auth via Authorization: Bearer <key> header
778
1047
  // The global auth middleware already validates the Bearer token, so no extra
779
1048
  // validation is needed here.
780
- router.post('/v2/mcp', handleMcpPost);
1049
+ router.post('/v2/mcp', boundHandler);
781
1050
  router.get('/v2/mcp', mcpMethodNotAllowed);
782
1051
  router.delete('/v2/mcp', mcpDeleteOk);
783
1052
  // POST /:apiKey/v2/mcp — Firecrawl-style: API key embedded in URL path
@@ -825,7 +1094,7 @@ export function createMcpRouter(authStore) {
825
1094
  return;
826
1095
  }
827
1096
  }
828
- return handleMcpPost(req, res);
1097
+ return handleMcpPost(req, res, pool);
829
1098
  });
830
1099
  router.get('/:apiKey/v2/mcp', mcpMethodNotAllowed);
831
1100
  router.delete('/:apiKey/v2/mcp', mcpDeleteOk);