webpeel 0.20.4 → 0.20.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,12 +7,51 @@ import { getProfilePath, loadStorageState, touchProfile } from '../../core/profi
7
7
  import { peel, cleanup } from '../../index.js';
8
8
  import { checkUsage, showUsageFooter, loadConfig } from '../../cli-auth.js';
9
9
  import { getCache, setCache, parseTTL } from '../../cache.js';
10
- import { estimateTokens } from '../../core/markdown.js';
10
+ import { estimateTokens, htmlToMarkdown } from '../../core/markdown.js';
11
11
  import { distillToBudget, budgetListings } from '../../core/budget.js';
12
12
  import { parseActions, formatError, fetchViaApi, outputResult, writeStdout, buildEnvelope, classifyErrorCode, formatListingsCsv, normaliseExtractedToRows, } from '../utils.js';
13
+ // ─── readStdin ────────────────────────────────────────────────────────────────
14
+ async function readStdin() {
15
+ const chunks = [];
16
+ for await (const chunk of process.stdin) {
17
+ chunks.push(Buffer.from(chunk));
18
+ }
19
+ return Buffer.concat(chunks).toString('utf-8');
20
+ }
21
+ // ─── runStdin ─────────────────────────────────────────────────────────────────
22
+ // Read HTML from stdin, convert to markdown, and output
23
+ async function runStdin(options) {
24
+ try {
25
+ const html = await readStdin();
26
+ if (!html.trim()) {
27
+ process.stderr.write('Error: No input received on stdin\n');
28
+ process.exit(1);
29
+ }
30
+ const markdown = htmlToMarkdown(html, { raw: false, prune: true });
31
+ if (options.json) {
32
+ const tokens = estimateTokens(markdown);
33
+ process.stdout.write(JSON.stringify({ success: true, content: markdown, tokens }) + '\n');
34
+ }
35
+ else {
36
+ process.stdout.write(markdown + '\n');
37
+ }
38
+ }
39
+ catch (err) {
40
+ process.stderr.write(`Error: ${err.message}\n`);
41
+ process.exit(1);
42
+ }
43
+ }
13
44
  // ─── runFetch ─────────────────────────────────────────────────────────────────
14
45
  // Main fetch handler — shared with the `pipe` and `ask` subcommands
15
46
  export async function runFetch(url, options) {
47
+ // --content-only: override all output flags — we just want raw content
48
+ if (options.contentOnly) {
49
+ options.silent = true;
50
+ // Disable json/text/html — we output content directly
51
+ options.json = false;
52
+ options.html = false;
53
+ options.text = false;
54
+ }
16
55
  // Handle --format flag: maps to existing boolean flags
17
56
  if (options.format) {
18
57
  const fmt = options.format.toLowerCase();
@@ -30,9 +69,10 @@ export async function runFetch(url, options) {
30
69
  }
31
70
  // Smart defaults: when piped (not a TTY), default to silent JSON + budget
32
71
  // BUT respect explicit --format flag (user chose the output format)
72
+ // AND respect --content-only (raw content output, no JSON wrapper)
33
73
  const isPiped = !process.stdout.isTTY;
34
74
  const hasExplicitFormat = options.format && ['text', 'html', 'markdown', 'md'].includes(options.format.toLowerCase());
35
- if (isPiped && !options.html && !options.text && !hasExplicitFormat) {
75
+ if (isPiped && !options.html && !options.text && !hasExplicitFormat && !options.contentOnly) {
36
76
  if (!options.json)
37
77
  options.json = true;
38
78
  if (!options.silent)
@@ -284,11 +324,38 @@ export async function runFetch(url, options) {
284
324
  cachedResult.extracted = extractedCached;
285
325
  }
286
326
  }
287
- await outputResult(cachedResult, options, { cached: true });
327
+ if (options.contentOnly) {
328
+ await writeStdout(cachedResult.content + '\n');
329
+ }
330
+ else {
331
+ await outputResult(cachedResult, options, { cached: true });
332
+ }
288
333
  process.exit(0);
289
334
  }
290
335
  }
291
- const spinner = options.silent ? null : ora('Fetching...').start();
336
+ // --progress: show escalation steps on stderr (overrides spinner)
337
+ let progressInterval;
338
+ const progressStart = Date.now();
339
+ if (options.progress) {
340
+ process.stderr.write(`[simple] Fetching ${url}...\n`);
341
+ // Show escalation hints based on elapsed time (best-effort approximations)
342
+ const progressSteps = [
343
+ { afterMs: 2500, message: '[simple] Waiting for response...' },
344
+ { afterMs: 6000, message: '[browser] Simple too slow — escalating to browser render...' },
345
+ { afterMs: 12000, message: '[browser] Rendering with Chromium...' },
346
+ { afterMs: 20000, message: '[stealth] Escalating to stealth mode...' },
347
+ ];
348
+ let stepIdx = 0;
349
+ progressInterval = setInterval(() => {
350
+ const elapsed = Date.now() - progressStart;
351
+ while (stepIdx < progressSteps.length && elapsed >= progressSteps[stepIdx].afterMs) {
352
+ process.stderr.write(`${progressSteps[stepIdx].message}\n`);
353
+ stepIdx++;
354
+ }
355
+ }, 500);
356
+ }
357
+ // Suppress spinner when --progress is active (progress lines replace it)
358
+ const spinner = (options.silent || options.progress) ? null : ora('Fetching...').start();
292
359
  try {
293
360
  // Validate options
294
361
  if (options.wait && (options.wait < 0 || options.wait > 60000)) {
@@ -528,7 +595,22 @@ export async function runFetch(url, options) {
528
595
  if (resolvedProfileName) {
529
596
  touchProfile(resolvedProfileName);
530
597
  }
531
- if (spinner) {
598
+ // Stop progress interval and show final result
599
+ if (progressInterval) {
600
+ clearInterval(progressInterval);
601
+ progressInterval = undefined;
602
+ }
603
+ if (options.progress) {
604
+ const method = result.method || 'simple';
605
+ const elapsedSec = ((result.elapsed || (Date.now() - progressStart)) / 1000).toFixed(1);
606
+ const tokenCount = (result.tokens || 0).toLocaleString();
607
+ // Show escalation arrow if browser/stealth was needed
608
+ if (method !== 'simple') {
609
+ process.stderr.write(`[simple] → [${method}] escalated\n`);
610
+ }
611
+ process.stderr.write(`[${method}] Done — ${tokenCount} tokens in ${elapsedSec}s\n`);
612
+ }
613
+ else if (spinner) {
532
614
  const domainTag = result.domainData
533
615
  ? ` [${result.domainData.domain}:${result.domainData.type}]`
534
616
  : '';
@@ -866,11 +948,27 @@ export async function runFetch(url, options) {
866
948
  result.extracted = extracted;
867
949
  }
868
950
  }
869
- // Output results (default path)
870
- await outputResult(result, options, {
871
- cached: false,
872
- truncated: contentTruncated || undefined,
873
- });
951
+ // --content-only: output raw content only, no wrapper
952
+ if (options.contentOnly) {
953
+ await writeStdout(result.content + '\n');
954
+ }
955
+ else {
956
+ // Output results (default path)
957
+ await outputResult(result, options, {
958
+ cached: false,
959
+ truncated: contentTruncated || undefined,
960
+ });
961
+ // Token savings display (our unique selling point)
962
+ if (!options.json && !options.silent && result.tokenSavingsPercent) {
963
+ const savings = result.tokenSavingsPercent;
964
+ const raw = result.rawTokenEstimate;
965
+ const optimized = result.tokens || 0;
966
+ if (savings > 0) {
967
+ const rawStr = raw ? `${raw.toLocaleString()}→${optimized.toLocaleString()} tokens` : `${optimized.toLocaleString()} tokens`;
968
+ process.stderr.write(`\x1b[32m💰 Token savings: ${savings}% smaller than raw HTML (${rawStr})\x1b[0m\n`);
969
+ }
970
+ }
971
+ }
874
972
  }
875
973
  // Clean up and exit
876
974
  await cleanup();
@@ -976,7 +1074,14 @@ export function registerFetchCommands(program) {
976
1074
  .option('--wait-selector <css>', 'Wait for CSS selector before extracting (auto-enables --render)')
977
1075
  .option('--block-resources <types>', 'Block resource types, comma-separated: image,stylesheet,font,media,script (auto-enables --render)')
978
1076
  .option('--format <type>', 'Output format: markdown (default), text, html, json')
1077
+ .option('--content-only', 'Output only the raw content field (no metadata, no JSON wrapper) — ideal for piping to LLMs')
1078
+ .option('--progress', 'Show engine escalation steps (simple → browser → stealth) with timing')
1079
+ .option('--stdin', 'Read HTML from stdin instead of fetching a URL — converts to markdown')
979
1080
  .action(async (url, options) => {
1081
+ if (options.stdin) {
1082
+ await runStdin(options);
1083
+ return;
1084
+ }
980
1085
  await runFetch(url, options);
981
1086
  });
982
1087
  // ── read subcommand (explicit readable mode) ─────────────────────────────
@@ -23,6 +23,7 @@ export function registerSearchCommands(program) {
23
23
  .option('--budget <n>', 'Token budget for site-search result content', parseInt)
24
24
  .option('-s, --silent', 'Silent mode')
25
25
  .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
26
+ .option('--fetch', 'Also fetch and include content from each result URL')
26
27
  .option('--agent', 'Agent mode: sets --json, --silent, and --budget 4000 (override with --budget N)')
27
28
  .action(async (query, options) => {
28
29
  // --agent sets sensible defaults for AI agents; explicit flags override
@@ -178,9 +179,61 @@ export function registerSearchCommands(program) {
178
179
  const searchData = await searchRes.json();
179
180
  // API returns { success: true, data: { web: [...] } } or { results: [...] }
180
181
  let results = searchData.data?.web || searchData.data?.results || searchData.results || [];
182
+ // Client-side ad filtering: remove DuckDuckGo ads that slip through the server
183
+ results = results.filter(r => {
184
+ // Drop duckduckgo.com URLs, URLs carrying ad-tracking params, and URLs that fail to parse
185
+ try {
186
+ const parsed = new URL(r.url);
187
+ if (parsed.hostname === 'duckduckgo.com')
188
+ return false;
189
+ if (parsed.searchParams.has('ad_domain') ||
190
+ parsed.searchParams.has('ad_provider') ||
191
+ parsed.searchParams.has('ad_type'))
192
+ return false;
193
+ }
194
+ catch {
195
+ return false;
196
+ }
197
+ // Filter ad snippets
198
+ if (r.snippet && (r.snippet.includes('Ad ·') ||
199
+ r.snippet.includes('Ad Viewing ads is privacy protected by DuckDuckGo') ||
200
+ r.snippet.toLowerCase().startsWith('ad ·')))
201
+ return false;
202
+ return true;
203
+ });
181
204
  if (spinner) {
182
205
  spinner.succeed(`Found ${results.length} results`);
183
206
  }
207
+ // --fetch: fetch content from each result
208
+ if (options.fetch && results.length > 0) {
209
+ const fetchCfg = loadConfig();
210
+ const fetchApiKey = fetchCfg.apiKey || process.env.WEBPEEL_API_KEY;
211
+ const fetchApiUrl = process.env.WEBPEEL_API_URL || 'https://api.webpeel.dev';
212
+ if (fetchApiKey) {
213
+ const fetchSpinner = isSilent ? null : ora(`Fetching content from ${results.length} results...`).start();
214
+ await Promise.all(results.map(async (result) => {
215
+ try {
216
+ const fetchParams = new URLSearchParams({ url: result.url });
217
+ if (options.budget)
218
+ fetchParams.set('budget', String(options.budget || 2000));
219
+ const fetchRes = await fetch(`${fetchApiUrl}/v1/fetch?${fetchParams}`, {
220
+ headers: { Authorization: `Bearer ${fetchApiKey}` },
221
+ signal: AbortSignal.timeout(20000),
222
+ });
223
+ if (fetchRes.ok) {
224
+ const fetchData = await fetchRes.json();
225
+ result.content = fetchData.content || fetchData.data?.content || '';
226
+ }
227
+ }
228
+ catch { /* skip on error */ }
229
+ }));
230
+ if (fetchSpinner)
231
+ fetchSpinner.succeed('Content fetched');
232
+ }
233
+ else if (!isSilent) {
234
+ console.error('Warning: --fetch requires API key (run: webpeel auth <key>)');
235
+ }
236
+ }
184
237
  // Show usage footer for free/anonymous users
185
238
  if (usageCheck.usageInfo && !isSilent) {
186
239
  showUsageFooter(usageCheck.usageInfo, usageCheck.isAnonymous || false, false);
@@ -196,10 +249,24 @@ export function registerSearchCommands(program) {
196
249
  await writeStdout(jsonStr + '\n');
197
250
  }
198
251
  else {
199
- for (const result of results) {
200
- console.log(`\n${result.title}`);
201
- console.log(result.url);
202
- console.log(result.snippet);
252
+ // Human-readable numbered results
253
+ if (results.length === 0) {
254
+ await writeStdout('No results found.\n');
255
+ }
256
+ else {
257
+ await writeStdout(`\n`);
258
+ for (const [i, result] of results.entries()) {
259
+ await writeStdout(`${i + 1}. ${result.title}\n`);
260
+ await writeStdout(` ${result.url}\n`);
261
+ if (result.snippet) {
262
+ await writeStdout(` ${result.snippet}\n`);
263
+ }
264
+ if (result.content) {
265
+ const preview = result.content.slice(0, 500);
266
+ await writeStdout(`\n --- Content ---\n${preview}${result.content.length > 500 ? '\n [...]' : ''}\n`);
267
+ }
268
+ await writeStdout('\n');
269
+ }
203
270
  }
204
271
  }
205
272
  process.exit(0);
package/dist/cli/utils.js CHANGED
@@ -508,13 +508,11 @@ export async function outputResult(result, options, extra = {}) {
508
508
  // Default: full output
509
509
  if (options.json) {
510
510
  // Build clean JSON output with guaranteed top-level fields
511
+ // Note: elapsed/method/tokens are placed at the END of the object so they are the last fields printed (the final line is the closing brace, so `tail -4` shows all three)
511
512
  const output = {
512
513
  url: result.url,
513
514
  title: result.metadata?.title || result.title || null,
514
- tokens: result.tokens || 0,
515
515
  fetchedAt: new Date().toISOString(),
516
- method: result.method || 'simple',
517
- elapsed: result.elapsed,
518
516
  content: result.content,
519
517
  };
520
518
  // Add optional fields only if present (filter out undefined/null values from metadata)
@@ -529,6 +527,10 @@ export async function outputResult(result, options, extra = {}) {
529
527
  }
530
528
  if (result.links?.length)
531
529
  output.links = result.links;
530
+ if (result.tokenSavingsPercent !== undefined)
531
+ output.tokenSavingsPercent = result.tokenSavingsPercent;
532
+ if (result.rawTokenEstimate !== undefined)
533
+ output.rawTokenEstimate = result.rawTokenEstimate;
532
534
  if (result.images?.length)
533
535
  output.images = result.images;
534
536
  if (result.structured)
@@ -562,6 +564,10 @@ export async function outputResult(result, options, extra = {}) {
562
564
  if (extra.totalAvailable !== undefined)
563
565
  output.totalAvailable = extra.totalAvailable;
564
566
  output._meta = { version: cliVersion, method: result.method || 'simple', timing: result.timing, serverMarkdown: result.serverMarkdown || false };
567
+ // Perf metrics at the end — last fields of the JSON: elapsed | method | tokens (`tail -4` also includes the closing brace)
568
+ output.elapsed = result.elapsed;
569
+ output.method = result.method || 'simple';
570
+ output.tokens = result.tokens || 0;
565
571
  await writeStdout(JSON.stringify(output, null, 2) + '\n');
566
572
  }
567
573
  else {
@@ -586,10 +592,11 @@ export async function outputResult(result, options, extra = {}) {
586
592
  }
587
593
  // Stream content immediately to stdout — consumer gets it without waiting
588
594
  await writeStdout(result.content + '\n');
589
- // Append timing summary to stderr so it doesn't pollute piped content
590
- if (!options.silent) {
595
+ // Always append the timing summary; it goes to stderr, so it doesn't pollute the stdout pipe
596
+ {
591
597
  const totalMs = result.timing?.total ?? result.elapsed;
592
- process.stderr.write(`\n--- ${result.tokens} tokens · ${totalMs}ms ---\n`);
598
+ const method = result.method || 'simple';
599
+ process.stderr.write(`\n--- ${totalMs}ms | ${method} | ${result.tokens} tokens ---\n`);
593
600
  }
594
601
  }
595
602
  }
@@ -83,7 +83,7 @@ export declare const providerStats: ProviderStatsTracker;
83
83
  export declare class StealthSearchProvider implements SearchProvider {
84
84
  readonly id: SearchProviderId;
85
85
  readonly requiresApiKey = false;
86
- /** Validate and normalize a URL; returns null if invalid/non-http */
86
+ /** Validate and normalize a URL; returns null if invalid, non-http(s), hosted on duckduckgo.com, or carrying ad-tracking query params */
87
87
  private validateUrl;
88
88
  /**
89
89
  * Scrape DuckDuckGo HTML endpoint with stealth browser.
@@ -145,7 +145,7 @@ export declare class GoogleSearchProvider implements SearchProvider {
145
145
  * m[n]=past n months, y[n]=past n years.
146
146
  */
147
147
  private mapFreshnessToDateRestrict;
148
- /** Validate URL; returns null if invalid/non-http */
148
+ /** Validate URL; returns null if invalid, non-http(s), hosted on duckduckgo.com, or carrying ad-tracking query params */
149
149
  private validateUrl;
150
150
  /**
151
151
  * Stealth browser scrape of google.com/search.
@@ -84,6 +84,30 @@ function decodeDdgUrl(rawUrl) {
84
84
  return rawUrl;
85
85
  }
86
86
  }
87
+ /** Returns true if a URL is hosted on duckduckgo.com or carries an ad-tracking query param (ad_domain, ad_provider, ad_type); returns false if the URL cannot be parsed */
88
+ function isDdgAdUrl(url) {
89
+ try {
90
+ const parsed = new URL(url);
91
+ // Any URL hosted on duckduckgo.com (internal links, ad redirects) — matched by hostname, not path
92
+ if (parsed.hostname === 'duckduckgo.com')
93
+ return true;
94
+ // URLs with known ad tracking query params
95
+ if (parsed.searchParams.has('ad_domain') ||
96
+ parsed.searchParams.has('ad_provider') ||
97
+ parsed.searchParams.has('ad_type'))
98
+ return true;
99
+ return false;
100
+ }
101
+ catch {
102
+ return false;
103
+ }
104
+ }
105
+ /** Returns true if a snippet is a DuckDuckGo ad snippet */
106
+ function isDdgAdSnippet(snippet) {
107
+ return snippet.includes('Ad ·') ||
108
+ snippet.includes('Ad Viewing ads is privacy protected by DuckDuckGo') ||
109
+ snippet.toLowerCase().startsWith('ad ·');
110
+ }
87
111
  class ProviderStatsTracker {
88
112
  history = new Map();
89
113
  windowSize;
@@ -182,14 +206,19 @@ function normalizeUrlForDedupe(rawUrl) {
182
206
  export class StealthSearchProvider {
183
207
  id = 'stealth';
184
208
  requiresApiKey = false;
185
- /** Validate and normalize a URL; returns null if invalid/non-http */
209
+ /** Validate and normalize a URL; returns null if invalid, non-http(s), hosted on duckduckgo.com, or carrying ad-tracking query params */
186
210
  validateUrl(rawUrl) {
187
211
  try {
188
212
  const parsed = new URL(rawUrl);
189
213
  if (!['http:', 'https:'].includes(parsed.protocol))
190
214
  return null;
191
- // Filter DuckDuckGo ad redirect URLs (e.g. duckduckgo.com/y.js?ad_domain=...)
192
- if (parsed.hostname === 'duckduckgo.com' && parsed.pathname === '/y.js')
215
+ // Filter all DuckDuckGo URLs (internal links, ad redirects, etc.)
216
+ if (parsed.hostname === 'duckduckgo.com')
217
+ return null;
218
+ // Filter URLs with ad tracking query params
219
+ if (parsed.searchParams.has('ad_domain') ||
220
+ parsed.searchParams.has('ad_provider') ||
221
+ parsed.searchParams.has('ad_type'))
193
222
  return null;
194
223
  return parsed.href;
195
224
  }
@@ -236,10 +265,16 @@ export class StealthSearchProvider {
236
265
  const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
237
266
  if (!title || !rawUrl)
238
267
  return;
268
+ // Filter ad snippets
269
+ if (isDdgAdSnippet(snippet))
270
+ return;
239
271
  // Extract real URL from DDG redirect param
240
272
  const finalUrl = decodeDdgUrl(rawUrl);
241
273
  if (!finalUrl)
242
274
  return; // filtered out (DDG internal link)
275
+ // Filter ad URLs
276
+ if (isDdgAdUrl(finalUrl))
277
+ return;
243
278
  const validated = this.validateUrl(finalUrl);
244
279
  if (!validated)
245
280
  return;
@@ -532,10 +567,16 @@ export class DuckDuckGoProvider {
532
567
  let snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
533
568
  if (!title || !rawUrl)
534
569
  return;
570
+ // Filter ad snippets (DuckDuckGo injects ad labels into snippets)
571
+ if (isDdgAdSnippet(snippet))
572
+ return;
535
573
  // Extract actual URL from DuckDuckGo redirect; filter DDG internal/ad URLs
536
574
  const decoded = decodeDdgUrl(rawUrl);
537
575
  if (!decoded)
538
576
  return; // filtered out (DDG internal link or ad redirect)
577
+ // Filter ad URLs
578
+ if (isDdgAdUrl(decoded))
579
+ return;
539
580
  // SECURITY: Validate and sanitize results — only allow HTTP/HTTPS URLs
540
581
  let url;
541
582
  try {
@@ -813,14 +854,19 @@ export class GoogleSearchProvider {
813
854
  };
814
855
  return map[tbs];
815
856
  }
816
- /** Validate URL; returns null if invalid/non-http */
857
+ /** Validate URL; returns null if invalid, non-http(s), hosted on duckduckgo.com, or carrying ad-tracking query params */
817
858
  validateUrl(rawUrl) {
818
859
  try {
819
860
  const parsed = new URL(rawUrl);
820
861
  if (!['http:', 'https:'].includes(parsed.protocol))
821
862
  return null;
822
- // Filter DuckDuckGo ad redirect URLs (e.g. duckduckgo.com/y.js?ad_domain=...)
823
- if (parsed.hostname === 'duckduckgo.com' && parsed.pathname === '/y.js')
863
+ // Filter all DuckDuckGo URLs (internal links, ad redirects, etc.)
864
+ if (parsed.hostname === 'duckduckgo.com')
865
+ return null;
866
+ // Filter URLs with ad tracking query params
867
+ if (parsed.searchParams.has('ad_domain') ||
868
+ parsed.searchParams.has('ad_provider') ||
869
+ parsed.searchParams.has('ad_type'))
824
870
  return null;
825
871
  return parsed.href;
826
872
  }
@@ -597,8 +597,12 @@ export async function smartFetch(url, options = {}) {
597
597
  .then((result) => ({ type: 'simple-success', result }))
598
598
  .catch((error) => ({ type: 'simple-error', error }));
599
599
  if (simpleResult.type === 'simple-success') {
600
- // Check if the content is suspiciously thin or has SPA indicators escalate to browser if so
601
- if (shouldEscalateForLowContent(simpleResult.result) || hasSpaIndicators(simpleResult.result.html)) {
600
+ // Check if the content is suspiciously thin, looks like an SPA shell, or is a shell page
601
+ // (looksLikeShellPage catches partial renders with 200-500 visible chars that
602
+ // shouldEscalateForLowContent misses — improves consistency on sites like China Daily)
603
+ if (shouldEscalateForLowContent(simpleResult.result) ||
604
+ hasSpaIndicators(simpleResult.result.html) ||
605
+ looksLikeShellPage(simpleResult.result)) {
602
606
  shouldUseBrowser = true;
603
607
  }
604
608
  else {
@@ -1,5 +1,7 @@
1
1
  /**
2
- * POST /v1/agent
2
+ * POST /v1/agent — single autonomous agent query
3
+ * POST /v1/agent/batch — parallel batch of agent queries (max 50)
4
+ * GET /v1/agent/batch/:id — poll batch job status
3
5
  *
4
6
  * Autonomous web agent — search → fetch → extract (LLM or BM25)
5
7
  *
@@ -11,9 +13,7 @@
11
13
  *
12
14
  * Returns: { success, data|answer, sources, method, elapsed, tokensUsed }
13
15
  *
14
- * Two modes:
15
- * - agent-llm: schema + llmApiKey → LLM extraction (BYOK)
16
- * - agent-bm25: no LLM key → BM25 text answer (always free)
16
+ * Webhook support: pass `webhook` URL to get async delivery with HMAC-SHA256 signing.
17
17
  *
18
18
  * 5-minute in-memory cache. Max 10 sources per request.
19
19
  */
@@ -1,5 +1,7 @@
1
1
  /**
2
- * POST /v1/agent
2
+ * POST /v1/agent — single autonomous agent query
3
+ * POST /v1/agent/batch — parallel batch of agent queries (max 50)
4
+ * GET /v1/agent/batch/:id — poll batch job status
3
5
  *
4
6
  * Autonomous web agent — search → fetch → extract (LLM or BM25)
5
7
  *
@@ -11,9 +13,7 @@
11
13
  *
12
14
  * Returns: { success, data|answer, sources, method, elapsed, tokensUsed }
13
15
  *
14
- * Two modes:
15
- * - agent-llm: schema + llmApiKey → LLM extraction (BYOK)
16
- * - agent-bm25: no LLM key → BM25 text answer (always free)
16
+ * Webhook support: pass `webhook` URL to get async delivery with HMAC-SHA256 signing.
17
17
  *
18
18
  * 5-minute in-memory cache. Max 10 sources per request.
19
19
  */
@@ -22,9 +22,42 @@ import { peel } from '../../index.js';
22
22
  import { extractWithLLM } from '../../core/llm-extract.js';
23
23
  import { getBestSearchProvider } from '../../core/search-provider.js';
24
24
  import { quickAnswer } from '../../core/quick-answer.js';
25
+ import { sendWebhook } from './webhooks.js';
25
26
  import { createLogger } from '../../core/logger.js';
26
27
  import crypto from 'crypto';
27
28
  const log = createLogger('agent');
29
+ const batchJobs = new Map();
30
+ const BATCH_TTL = 60 * 60 * 1000; // 1 hour
31
+ // GC stale batch jobs every 10 minutes
32
+ setInterval(() => {
33
+ const now = Date.now();
34
+ for (const [id, job] of batchJobs) {
35
+ if (now - job.createdAt > BATCH_TTL)
36
+ batchJobs.delete(id);
37
+ }
38
+ }, 10 * 60 * 1000).unref();
39
+ // Simple concurrency limiter
40
+ class Semaphore {
41
+ max;
42
+ queue = [];
43
+ running = 0;
44
+ constructor(max) {
45
+ this.max = max;
46
+ }
47
+ async acquire() {
48
+ if (this.running < this.max) {
49
+ this.running++;
50
+ return;
51
+ }
52
+ return new Promise((resolve) => this.queue.push(() => { this.running++; resolve(); }));
53
+ }
54
+ release() {
55
+ this.running--;
56
+ const next = this.queue.shift();
57
+ if (next)
58
+ next();
59
+ }
60
+ }
28
61
  const cache = new Map();
29
62
  const CACHE_TTL = 5 * 60 * 1000; // 5 minutes
30
63
  function getCached(key) {
@@ -48,191 +81,175 @@ function setCache(key, result) {
48
81
  }
49
82
  cache.set(key, { result, expiresAt: Date.now() + CACHE_TTL });
50
83
  }
84
/**
 * Fetch one source page for the agent, giving up after `timeoutMs`.
 *
 * peel() is called with `render: false` and `noEscalate: true`. Any failure
 * (including the timeout) yields `null` so one bad source never fails the
 * whole query. The timeout timer is always cleared, so a fast fetch does not
 * leave a pending timer behind.
 *
 * @param {{ url: string, title?: string }} source - Page to fetch.
 * @param {number} timeoutMs - Upper bound for this fetch, in milliseconds.
 * @returns {Promise<{url: string, title: string, content: string, tokens: number} | null>}
 */
async function fetchAgentSource(source, timeoutMs) {
    let timer;
    try {
        const deadline = new Promise((_, reject) => {
            timer = setTimeout(() => reject(new Error('per-source timeout')), timeoutMs);
        });
        const page = await Promise.race([
            peel(source.url, { render: false, noEscalate: true, format: 'markdown', timeout: timeoutMs, budget: 3000 }),
            deadline,
        ]);
        return {
            url: source.url,
            title: page.title || source.title || '',
            content: (page.content || '').slice(0, 15000),
            tokens: page.tokens || 0,
        };
    }
    catch {
        return null;
    }
    finally {
        clearTimeout(timer);
    }
}
/**
 * Run one agent query: resolve source pages, fetch them, then extract/answer.
 *
 * Steps:
 *   1. Use the caller-supplied `urls` (capped at 10), or search the web for
 *      `prompt` and keep the first `sources` hits.
 *   2. Fetch all sources in parallel, 5 s per source.
 *   3. When both `schema` and `llmApiKey` are present, call extractWithLLM();
 *      otherwise answer with quickAnswer().
 *
 * Successful results are stored via setCache(). The cache key covers the
 * prompt, schema, explicit URLs, source count and extraction mode, so a
 * request with different URLs or a different mode never receives another
 * request's cached answer. The API key itself is never part of the key.
 *
 * @param {object} params
 * @param {string} params.prompt - What to find; must be a non-empty string.
 * @param {object} [params.schema] - Extraction schema for the LLM path.
 * @param {string} [params.llmApiKey] - Caller's LLM key; enables the LLM path.
 * @param {string} [params.llmProvider] - Defaults to 'openai'.
 * @param {string} [params.llmModel]
 * @param {string[]} [params.urls] - Explicit pages to read instead of searching.
 * @param {number} [params.sources] - Search hits to use (1–10, default 5).
 * @returns {Promise<object>} On success `{ success: true, data | answer,
 *   sources, method, tokensUsed, elapsed, ... }` (plus `cached: true` on a
 *   cache hit); on failure `{ success: false, error: { type, message }, ... }`
 *   with type `no_sources` or `fetch_failed`.
 * @throws {TypeError} If `prompt` is not a non-empty string.
 * @throws Errors raised by getBestSearchProvider() or extractWithLLM()
 *   propagate to the caller.
 */
async function runAgentQuery(params) {
    const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources } = params;
    if (typeof prompt !== 'string' || !prompt.trim()) {
        throw new TypeError('prompt must be a non-empty string');
    }
    const startMs = Date.now();
    const MAX_SOURCES = 10;
    const DEFAULT_SOURCES = 5;
    const PER_SOURCE_TIMEOUT_MS = 5000;
    // Clamp to an integer in [1, 10]; anything unusable falls back to the default.
    const requested = Number(maxSources);
    const numSources = Number.isFinite(requested) && requested >= 1
        ? Math.min(Math.floor(requested), MAX_SOURCES)
        : DEFAULT_SOURCES;
    // Caller-provided URLs are subject to the same hard cap as search results.
    const explicitUrls = Array.isArray(urls) ? urls.slice(0, MAX_SOURCES) : [];
    const useLlm = Boolean(schema && llmApiKey);
    // Cache check
    const cacheKey = JSON.stringify([
        prompt.trim(),
        schema || {},
        explicitUrls,
        numSources,
        useLlm ? `${llmProvider || 'openai'}:${llmModel || ''}` : 'bm25',
    ]);
    const cached = getCached(cacheKey);
    if (cached) {
        return { ...cached, cached: true };
    }
    // Step 1: Resolve source URLs
    let sourceUrls = [];
    if (explicitUrls.length > 0) {
        sourceUrls = explicitUrls.map((u) => ({ url: u }));
    }
    else {
        log.info(`Searching web for: "${prompt}"`);
        const { provider, apiKey: searchApiKey } = getBestSearchProvider();
        try {
            const searchResults = await provider.searchWeb(prompt.trim(), { count: numSources, apiKey: searchApiKey });
            sourceUrls = searchResults.slice(0, numSources).map((r) => ({ url: r.url, title: r.title, snippet: r.snippet }));
        }
        catch (err) {
            // A failed search is reported below as `no_sources`, not thrown.
            log.warn('Search failed:', err.message);
        }
    }
    if (sourceUrls.length === 0) {
        return { success: false, error: { type: 'no_sources', message: 'Could not find relevant pages for this query' }, prompt, elapsed: Date.now() - startMs };
    }
    // Step 2: Fetch pages in parallel
    log.info(`Fetching ${sourceUrls.length} sources in parallel`);
    const fetched = await Promise.all(sourceUrls.map((source) => fetchAgentSource(source, PER_SOURCE_TIMEOUT_MS)));
    const fetchResults = fetched.filter(Boolean);
    if (fetchResults.length === 0) {
        return { success: false, error: { type: 'fetch_failed', message: 'Could not fetch any of the found pages' }, prompt, sources: sourceUrls.map((s) => ({ url: s.url })), elapsed: Date.now() - startMs };
    }
    // Step 3: Extract or answer
    const combinedContent = fetchResults.map((r) => `### ${r.title || r.url}\nURL: ${r.url}\n\n${r.content}`).join('\n\n---\n\n');
    const totalTokens = fetchResults.reduce((sum, r) => sum + r.tokens, 0);
    const sourceList = fetchResults.map((r) => ({ url: r.url, title: r.title }));
    let result;
    if (useLlm) {
        log.info('Using LLM extraction');
        const extracted = await extractWithLLM({
            content: combinedContent.slice(0, 30000),
            schema,
            llmApiKey,
            llmProvider: (llmProvider || 'openai'),
            llmModel,
            prompt: `Based on these web pages, ${prompt}`,
            url: fetchResults[0].url,
        });
        const llmTokensUsed = (extracted.tokensUsed?.input ?? 0) + (extracted.tokensUsed?.output ?? 0);
        result = {
            success: true,
            data: extracted.items,
            sources: sourceList,
            method: 'agent-llm',
            llm: { provider: extracted.provider || llmProvider || 'openai', model: extracted.model || llmModel || 'default' },
            tokensUsed: totalTokens + llmTokensUsed,
            elapsed: Date.now() - startMs,
        };
    }
    else {
        log.info('Using BM25 text extraction');
        const qa = quickAnswer({ question: prompt, content: combinedContent, maxPassages: 3, maxChars: 2000 });
        result = {
            success: true,
            answer: qa.answer || combinedContent.slice(0, 2000),
            confidence: qa.confidence ?? 0,
            sources: sourceList,
            method: 'agent-bm25',
            tokensUsed: totalTokens,
            elapsed: Date.now() - startMs,
        };
    }
    setCache(cacheKey, result);
    return result;
}
51
156
  // ---------------------------------------------------------------------------
52
157
  // Route factory
53
158
  // ---------------------------------------------------------------------------
54
159
  export function createAgentRouter() {
55
160
  const router = Router();
161
+ // ── POST /v1/agent — single query (with optional webhook) ──────────────
56
162
  router.post('/', async (req, res) => {
57
- const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources, } = req.body || {};
58
- // Validate required param
163
+ const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources, webhook } = req.body || {};
164
+ const requestId = req.requestId || crypto.randomUUID();
59
165
  if (!prompt?.trim()) {
60
166
  return res.status(400).json({
61
167
  success: false,
62
- error: {
63
- type: 'missing_prompt',
64
- message: 'Provide a prompt describing what you want to find',
65
- hint: 'POST /v1/agent { "prompt": "Find Stripe pricing plans" }',
66
- docs: 'https://webpeel.dev/docs/api-reference',
67
- },
68
- requestId: req.requestId || crypto.randomUUID(),
168
+ error: { type: 'missing_prompt', message: 'Provide a prompt describing what you want to find',
169
+ hint: 'POST /v1/agent { "prompt": "Find Stripe pricing plans" }', docs: 'https://webpeel.dev/docs/api-reference' },
170
+ requestId,
69
171
  });
70
172
  }
71
- const startMs = Date.now();
72
- const numSources = Math.min(maxSources || 5, 10);
73
- const requestId = req.requestId || crypto.randomUUID();
74
- // Cache check
75
- const cacheKey = `${prompt.trim()}:${JSON.stringify(schema || {})}`;
76
- const cached = getCached(cacheKey);
77
- if (cached) {
78
- return res.json({ ...cached, cached: true, requestId });
173
+ // Async mode: webhook provided → return immediately, deliver result later
174
+ if (webhook) {
175
+ const jobId = crypto.randomUUID();
176
+ res.json({ success: true, id: jobId, status: 'processing', requestId });
177
+ // Fire-and-forget agent query + webhook delivery
178
+ runAgentQuery({ prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources })
179
+ .then((result) => sendWebhook(webhook, 'agent.completed', { id: jobId, ...result, requestId }))
180
+ .catch((err) => {
181
+ log.error('Async agent error:', err.message);
182
+ sendWebhook(webhook, 'agent.failed', { id: jobId, error: err.message, requestId }).catch(() => { });
183
+ });
184
+ return;
79
185
  }
186
+ // Synchronous mode: wait for result
80
187
  try {
81
- // -----------------------------------------------------------------------
82
- // Step 1: Resolve source URLs — use caller-provided or search the web
83
- // -----------------------------------------------------------------------
84
- let sourceUrls = [];
85
- if (Array.isArray(urls) && urls.length > 0) {
86
- sourceUrls = urls.map((u) => ({ url: u }));
87
- }
88
- else {
89
- log.info(`Searching web for: "${prompt}"`);
90
- const { provider, apiKey: searchApiKey } = getBestSearchProvider();
91
- let searchResults = [];
92
- try {
93
- searchResults = await provider.searchWeb(prompt.trim(), {
94
- count: numSources,
95
- apiKey: searchApiKey,
96
- });
97
- }
98
- catch (err) {
99
- log.warn('Search failed:', err.message);
100
- }
101
- sourceUrls = searchResults.slice(0, numSources).map((r) => ({
102
- url: r.url,
103
- title: r.title,
104
- snippet: r.snippet,
105
- }));
106
- }
107
- if (sourceUrls.length === 0) {
108
- return res.json({
109
- success: false,
110
- error: {
111
- type: 'no_sources',
112
- message: 'Could not find relevant pages for this query',
113
- },
114
- prompt,
115
- elapsed: Date.now() - startMs,
116
- requestId,
117
- });
118
- }
119
- // -----------------------------------------------------------------------
120
- // Step 2: Fetch pages in parallel (HTTP only, no browser, 5s timeout)
121
- // -----------------------------------------------------------------------
122
- log.info(`Fetching ${sourceUrls.length} sources in parallel`);
123
- const PER_SOURCE_TIMEOUT_MS = 5000;
124
- const fetchPromises = sourceUrls.map(async (source) => {
125
- try {
126
- const result = await Promise.race([
127
- peel(source.url, {
128
- render: false,
129
- noEscalate: true,
130
- format: 'markdown',
131
- timeout: PER_SOURCE_TIMEOUT_MS,
132
- budget: 3000,
133
- }),
134
- new Promise((_, reject) => setTimeout(() => reject(new Error('per-source timeout')), PER_SOURCE_TIMEOUT_MS)),
135
- ]);
136
- return {
137
- url: source.url,
138
- title: result.title || source.title || '',
139
- content: (result.content || '').slice(0, 15000),
140
- tokens: result.tokens || 0,
141
- };
142
- }
143
- catch {
144
- return null;
145
- }
146
- });
147
- const fetchSettled = await Promise.allSettled(fetchPromises);
148
- const fetchResults = fetchSettled
149
- .map((r) => (r.status === 'fulfilled' ? r.value : null))
150
- .filter(Boolean);
151
- if (fetchResults.length === 0) {
152
- return res.json({
153
- success: false,
154
- error: {
155
- type: 'fetch_failed',
156
- message: 'Could not fetch any of the found pages',
157
- },
158
- prompt,
159
- sources: sourceUrls.map((s) => ({ url: s.url })),
160
- elapsed: Date.now() - startMs,
161
- requestId,
162
- });
163
- }
164
- // -----------------------------------------------------------------------
165
- // Step 3: Extract or answer
166
- // -----------------------------------------------------------------------
167
- const combinedContent = fetchResults
168
- .map((r) => `### ${r.title || r.url}\nURL: ${r.url}\n\n${r.content}`)
169
- .join('\n\n---\n\n');
170
- const totalTokens = fetchResults.reduce((sum, r) => sum + r.tokens, 0);
171
- let result;
172
- if (schema && llmApiKey) {
173
- // ── LLM extraction path ──────────────────────────────────────────────
174
- log.info('Using LLM extraction');
175
- const extracted = await extractWithLLM({
176
- content: combinedContent.slice(0, 30000),
177
- schema,
178
- llmApiKey,
179
- llmProvider: llmProvider || 'openai',
180
- llmModel,
181
- prompt: `Based on these web pages, ${prompt}`,
182
- url: fetchResults[0].url,
183
- });
184
- const llmTokensUsed = (extracted.tokensUsed?.input ?? 0) + (extracted.tokensUsed?.output ?? 0);
185
- result = {
186
- success: true,
187
- data: extracted.items,
188
- sources: fetchResults.map((r) => ({ url: r.url, title: r.title })),
189
- method: 'agent-llm',
190
- llm: {
191
- provider: extracted.provider || llmProvider || 'openai',
192
- model: extracted.model || llmModel || 'default',
193
- },
194
- tokensUsed: totalTokens + llmTokensUsed,
195
- elapsed: Date.now() - startMs,
196
- requestId,
197
- };
198
- }
199
- else {
200
- // ── BM25 text answer path (no LLM needed) ───────────────────────────
201
- log.info('Using BM25 text extraction');
202
- const qa = quickAnswer({
203
- question: prompt,
204
- content: combinedContent,
205
- maxPassages: 3,
206
- maxChars: 2000,
207
- });
208
- result = {
209
- success: true,
210
- answer: qa.answer || combinedContent.slice(0, 2000),
211
- confidence: qa.confidence ?? 0,
212
- sources: fetchResults.map((r) => ({ url: r.url, title: r.title })),
213
- method: 'agent-bm25',
214
- tokensUsed: totalTokens,
215
- elapsed: Date.now() - startMs,
216
- requestId,
217
- };
218
- }
219
- // Cache the result
220
- setCache(cacheKey, result);
221
- return res.json(result);
188
+ const result = await runAgentQuery({ prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources });
189
+ return res.json({ ...result, requestId });
222
190
  }
223
191
  catch (err) {
224
192
  log.error('Agent error:', err.message);
225
193
  return res.status(500).json({
226
- success: false,
227
- error: {
228
- type: 'agent_error',
229
- message: err.message || 'An unexpected error occurred',
230
- },
231
- prompt,
232
- elapsed: Date.now() - startMs,
233
- requestId,
194
+ success: false, error: { type: 'agent_error', message: err.message || 'An unexpected error occurred' },
195
+ prompt, elapsed: 0, requestId,
196
+ });
197
+ }
198
+ });
199
+ // ── POST /v1/agent/batch — parallel batch queries ─────────────────────
200
+ router.post('/batch', async (req, res) => {
201
+ const { prompts, schema, llmApiKey, llmProvider, llmModel, sources, webhook } = req.body || {};
202
+ const requestId = req.requestId || crypto.randomUUID();
203
+ if (!Array.isArray(prompts) || prompts.length === 0) {
204
+ return res.status(400).json({
205
+ success: false, error: { type: 'missing_prompts', message: 'Provide an array of prompts',
206
+ hint: 'POST /v1/agent/batch { "prompts": ["Find X", "Find Y"] }' }, requestId,
207
+ });
208
+ }
209
+ if (prompts.length > 50) {
210
+ return res.status(400).json({
211
+ success: false, error: { type: 'too_many_prompts', message: `Max 50 prompts per batch (got ${prompts.length})` }, requestId,
234
212
  });
235
213
  }
214
+ const jobId = crypto.randomUUID();
215
+ const job = { id: jobId, status: 'processing', total: prompts.length, completed: 0, results: [], webhook, createdAt: Date.now() };
216
+ batchJobs.set(jobId, job);
217
+ // Return immediately, then process in background
218
+ res.json({ success: true, id: jobId, status: 'processing', total: prompts.length, requestId });
219
+ // Process in background with concurrency limit of 5
220
+ // eslint-disable-next-line @typescript-eslint/no-floating-promises
221
+ const sem = new Semaphore(5);
222
+ const tasks = prompts.map(async (prompt) => {
223
+ await sem.acquire();
224
+ try {
225
+ const result = await runAgentQuery({ prompt, schema, llmApiKey, llmProvider, llmModel, sources });
226
+ job.results.push({ prompt, success: !!result.success, answer: result.answer,
227
+ data: result.data, sources: result.sources, method: result.method, elapsed: result.elapsed });
228
+ }
229
+ catch (err) {
230
+ job.results.push({ prompt, success: false, error: err.message });
231
+ }
232
+ finally {
233
+ job.completed++;
234
+ sem.release();
235
+ }
236
+ });
237
+ Promise.allSettled(tasks).then(() => {
238
+ job.status = job.results.every((r) => r.success) ? 'completed' : 'completed';
239
+ if (webhook) {
240
+ sendWebhook(webhook, 'agent.batch.completed', { id: jobId, total: job.total, completed: job.completed, results: job.results })
241
+ .catch((err) => log.error('Batch webhook failed:', err.message));
242
+ }
243
+ });
244
+ return;
245
+ });
246
+ // ── GET /v1/agent/batch/:id — poll batch status ───────────────────────
247
+ router.get('/batch/:id', async (req, res) => {
248
+ const job = batchJobs.get(req.params.id);
249
+ if (!job) {
250
+ return res.status(404).json({ success: false, error: { type: 'not_found', message: 'Batch job not found or expired' } });
251
+ }
252
+ return res.json({ success: true, id: job.id, status: job.status, total: job.total, completed: job.completed, results: job.results });
236
253
  });
237
254
  return router;
238
255
  }
@@ -6,9 +6,11 @@
6
6
  * POST /v1/session/:id/navigate → navigate to URL { url }
7
7
  * POST /v1/session/:id/act → execute PageActions array
8
8
  * GET /v1/session/:id/screenshot → take screenshot (image/png)
9
+ * GET /v1/session/:id/cookies → export cookies from session context
10
+ * POST /v1/session/:id/cookies → inject cookies into session context
9
11
  * DELETE /v1/session/:id → close session
10
12
  *
11
- * Use cases: login flows, multi-step automation, UI testing.
13
+ * Use cases: login flows, multi-step automation, UI testing, cookie persistence.
12
14
  * This is what Browserbase charges $500/mo for — built into WebPeel.
13
15
  */
14
16
  import { Router } from 'express';
@@ -6,9 +6,11 @@
6
6
  * POST /v1/session/:id/navigate → navigate to URL { url }
7
7
  * POST /v1/session/:id/act → execute PageActions array
8
8
  * GET /v1/session/:id/screenshot → take screenshot (image/png)
9
+ * GET /v1/session/:id/cookies → export cookies from session context
10
+ * POST /v1/session/:id/cookies → inject cookies into session context
9
11
  * DELETE /v1/session/:id → close session
10
12
  *
11
- * Use cases: login flows, multi-step automation, UI testing.
13
+ * Use cases: login flows, multi-step automation, UI testing, cookie persistence.
12
14
  * This is what Browserbase charges $500/mo for — built into WebPeel.
13
15
  */
14
16
  import { Router } from 'express';
@@ -17,13 +19,15 @@ import { normalizeActions, executeActions } from '../../core/actions.js';
17
19
  import { ANTI_DETECTION_ARGS, getRandomViewport, getRandomUserAgent, applyStealthScripts, } from '../../core/browser-pool.js';
18
20
  import { extractReadableContent } from '../../core/readability.js';
19
21
  const sessions = new Map();
20
- const SESSION_TTL_MS = 5 * 60 * 1000; // 5 minutes idle TTL
22
+ const DEFAULT_SESSION_TTL_MS = 5 * 60 * 1000; // 5 minutes idle TTL (default)
23
+ const MAX_SESSION_TTL_MS = 60 * 60 * 1000; // 60 minutes (persist / max)
24
+ const MIN_SESSION_TTL_MS = 1 * 60 * 1000; // 1 minute minimum
21
25
  const MAX_SESSIONS_PER_USER = 3; // prevent abuse
22
26
  // Cleanup expired sessions every minute
23
27
  const _cleanupInterval = setInterval(() => {
24
28
  const now = Date.now();
25
29
  for (const [id, session] of sessions) {
26
- if (now - session.lastUsedAt > SESSION_TTL_MS) {
30
+ if (now - session.lastUsedAt > session.ttlMs) {
27
31
  session.browser.close().catch(() => { });
28
32
  sessions.delete(id);
29
33
  }
@@ -73,7 +77,18 @@ function extractReadableText(html, url) {
73
77
  // ── Router ────────────────────────────────────────────────────────────────────
74
78
  export function createSessionRouter() {
75
79
  const router = Router();
76
- // ── POST /v1/session — create session ────────────────────────────────────────
80
+ /**
81
+ * POST /v1/session — create a stateful browser session
82
+ *
83
+ * Body params:
84
+ * url? {string} Initial URL to navigate to (optional).
85
+ * ttl? {number} Session idle TTL in minutes (1–60, default 5).
86
+ * Timer resets on every request that touches the session.
87
+ * persist? {boolean} Shorthand for ttl=60. Enables long-lived sessions
88
+ * for login flows where cookies must persist.
89
+ *
90
+ * Returns: { sessionId, currentUrl, expiresAt, ttlMinutes }
91
+ */
77
92
  router.post('/v1/session', async (req, res) => {
78
93
  const ownerId = getOwnerId(req);
79
94
  if (!ownerId) {
@@ -95,7 +110,15 @@ export function createSessionRouter() {
95
110
  });
96
111
  return;
97
112
  }
98
- const { url } = req.body;
113
+ const { url, ttl, persist } = req.body;
114
+ // Resolve TTL: persist=true → 60 min max, ttl overrides default, clamp to [1, 60] min
115
+ let ttlMs = DEFAULT_SESSION_TTL_MS;
116
+ if (persist) {
117
+ ttlMs = MAX_SESSION_TTL_MS;
118
+ }
119
+ else if (typeof ttl === 'number') {
120
+ ttlMs = Math.min(MAX_SESSION_TTL_MS, Math.max(MIN_SESSION_TTL_MS, ttl * 60 * 1000));
121
+ }
99
122
  let browser = null;
100
123
  try {
101
124
  browser = await launchBrowser();
@@ -137,11 +160,13 @@ export function createSessionRouter() {
137
160
  createdAt: now,
138
161
  lastUsedAt: now,
139
162
  currentUrl: page.url(),
163
+ ttlMs,
140
164
  });
141
165
  res.status(201).json({
142
166
  sessionId: id,
143
167
  currentUrl: page.url(),
144
- expiresAt: new Date(now + SESSION_TTL_MS).toISOString(),
168
+ expiresAt: new Date(now + ttlMs).toISOString(),
169
+ ttlMinutes: ttlMs / 60_000,
145
170
  });
146
171
  }
147
172
  catch (err) {
@@ -188,7 +213,8 @@ export function createSessionRouter() {
188
213
  currentUrl: session.page.url(),
189
214
  title,
190
215
  content,
191
- expiresAt: new Date(session.lastUsedAt + SESSION_TTL_MS).toISOString(),
216
+ expiresAt: new Date(session.lastUsedAt + session.ttlMs).toISOString(),
217
+ ttlMinutes: session.ttlMs / 60_000,
192
218
  });
193
219
  }
194
220
  catch (err) {
@@ -242,7 +268,8 @@ export function createSessionRouter() {
242
268
  res.json({
243
269
  currentUrl: session.page.url(),
244
270
  title: await session.page.title(),
245
- expiresAt: new Date(session.lastUsedAt + SESSION_TTL_MS).toISOString(),
271
+ expiresAt: new Date(session.lastUsedAt + session.ttlMs).toISOString(),
272
+ ttlMinutes: session.ttlMs / 60_000,
246
273
  });
247
274
  }
248
275
  catch (err) {
@@ -327,7 +354,8 @@ export function createSessionRouter() {
327
354
  title,
328
355
  screenshot,
329
356
  actionsExecuted: normalizedActions.length,
330
- expiresAt: new Date(session.lastUsedAt + SESSION_TTL_MS).toISOString(),
357
+ expiresAt: new Date(session.lastUsedAt + session.ttlMs).toISOString(),
358
+ ttlMinutes: session.ttlMs / 60_000,
331
359
  });
332
360
  }
333
361
  catch (err) {
@@ -367,6 +395,7 @@ export function createSessionRouter() {
367
395
  session.lastUsedAt = Date.now();
368
396
  res.setHeader('Content-Type', 'image/png');
369
397
  res.setHeader('Cache-Control', 'no-store');
398
+ res.setHeader('X-Session-Expires-At', new Date(session.lastUsedAt + session.ttlMs).toISOString());
370
399
  res.send(buf);
371
400
  }
372
401
  catch (err) {
@@ -382,6 +411,128 @@ export function createSessionRouter() {
382
411
  });
383
412
  }
384
413
  });
414
+ /**
415
+ * GET /v1/session/:id/cookies — export all cookies from the session's browser context
416
+ *
417
+ * Returns: { sessionId, cookies: Cookie[], count: number, expiresAt: string }
418
+ *
419
+ * Each cookie follows the Playwright Cookie shape:
420
+ * { name, value, domain, path, expires, httpOnly, secure, sameSite }
421
+ *
422
+ * Use this to snapshot cookies after a login flow, then re-inject them later
423
+ * via POST /v1/session/:id/cookies to skip re-authentication.
424
+ */
425
router.get('/v1/session/:id/cookies', async (req, res) => {
    const caller = getOwnerId(req);
    const activeSession = getSession(req.params['id'], caller);
    // getSession() found nothing for this id/owner pair → 404.
    if (!activeSession) {
        const notFound = {
            type: 'session_not_found',
            message: 'Session not found or has expired.',
            hint: 'Create a new session via POST /v1/session.',
            docs: 'https://webpeel.dev/docs/errors#session-not-found',
        };
        res.status(404).json({
            success: false,
            error: notFound,
            requestId: req.requestId || randomUUID(),
        });
        return;
    }
    try {
        // cookies() is called without a URL filter.
        const jar = await activeSession.context.cookies();
        // Reading cookies counts as activity, so the idle timer restarts here.
        const touchedAt = Date.now();
        activeSession.lastUsedAt = touchedAt;
        const expiry = new Date(touchedAt + activeSession.ttlMs);
        res.json({
            sessionId: activeSession.id,
            cookies: jar,
            count: jar.length,
            expiresAt: expiry.toISOString(),
        });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        const exportFailed = {
            type: 'cookie_export_failed',
            message: reason,
            docs: 'https://webpeel.dev/docs/errors#cookie-export-failed',
        };
        res.status(500).json({
            success: false,
            error: exportFailed,
            requestId: req.requestId || randomUUID(),
        });
    }
});
465
+ /**
466
+ * POST /v1/session/:id/cookies — inject cookies into the session's browser context
467
+ *
468
+ * Body params:
469
+ * cookies {Cookie[]} Array of Playwright-compatible cookie objects.
470
+ * Required fields: name, value, domain (or url).
471
+ * Optional: path, expires, httpOnly, secure, sameSite.
472
+ *
473
+ * Returns: { sessionId, injected: number, expiresAt: string }
474
+ *
475
+ * Typical cookie-persistence workflow:
476
+ * 1. POST /v1/session { url: "https://example.com", persist: true }
477
+ * 2. POST /v1/session/:id/act (complete login flow)
478
+ * 3. GET /v1/session/:id/cookies → save cookies array to your storage
479
+ * 4. Later: POST /v1/session/:id/cookies { cookies: [...] }
480
+ * 5. GET /v1/session/:id → page loads authenticated (no re-login needed)
481
+ */
482
router.post('/v1/session/:id/cookies', async (req, res) => {
    const caller = getOwnerId(req);
    const activeSession = getSession(req.params['id'], caller);
    // getSession() found nothing for this id/owner pair → 404.
    if (!activeSession) {
        const notFound = {
            type: 'session_not_found',
            message: 'Session not found or has expired.',
            hint: 'Create a new session via POST /v1/session.',
            docs: 'https://webpeel.dev/docs/errors#session-not-found',
        };
        res.status(404).json({
            success: false,
            error: notFound,
            requestId: req.requestId || randomUUID(),
        });
        return;
    }
    const cookies = req.body.cookies;
    // Only a non-empty array is accepted; per-cookie shape is left to addCookies().
    const hasCookies = Array.isArray(cookies) && cookies.length > 0;
    if (!hasCookies) {
        const badRequest = {
            type: 'bad_request',
            message: '`cookies` must be a non-empty array of cookie objects.',
            hint: 'Pass cookies exported from GET /v1/session/:id/cookies or a compatible Cookie[] array.',
            docs: 'https://webpeel.dev/docs/errors#bad-request',
        };
        res.status(400).json({
            success: false,
            error: badRequest,
            requestId: req.requestId || randomUUID(),
        });
        return;
    }
    try {
        // A rejection from addCookies() is reported to the caller as a 400 below.
        await activeSession.context.addCookies(cookies);
        // Injecting cookies counts as activity, so the idle timer restarts here.
        const touchedAt = Date.now();
        activeSession.lastUsedAt = touchedAt;
        const expiry = new Date(touchedAt + activeSession.ttlMs);
        res.json({
            sessionId: activeSession.id,
            injected: cookies.length,
            expiresAt: expiry.toISOString(),
        });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        const injectFailed = {
            type: 'cookie_inject_failed',
            message: reason,
            hint: 'Ensure each cookie has at minimum: name, value, and domain (or url).',
            docs: 'https://webpeel.dev/docs/errors#cookie-inject-failed',
        };
        res.status(400).json({
            success: false,
            error: injectFailed,
            requestId: req.requestId || randomUUID(),
        });
    }
});
385
536
  // ── DELETE /v1/session/:id ───────────────────────────────────────────────────
386
537
  router.delete('/v1/session/:id', async (req, res) => {
387
538
  const ownerId = getOwnerId(req);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.20.4",
3
+ "version": "0.20.6",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",