webpeel 0.20.5 → 0.20.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -7,9 +7,40 @@ import { getProfilePath, loadStorageState, touchProfile } from '../../core/profi
|
|
|
7
7
|
import { peel, cleanup } from '../../index.js';
|
|
8
8
|
import { checkUsage, showUsageFooter, loadConfig } from '../../cli-auth.js';
|
|
9
9
|
import { getCache, setCache, parseTTL } from '../../cache.js';
|
|
10
|
-
import { estimateTokens } from '../../core/markdown.js';
|
|
10
|
+
import { estimateTokens, htmlToMarkdown } from '../../core/markdown.js';
|
|
11
11
|
import { distillToBudget, budgetListings } from '../../core/budget.js';
|
|
12
12
|
import { parseActions, formatError, fetchViaApi, outputResult, writeStdout, buildEnvelope, classifyErrorCode, formatListingsCsv, normaliseExtractedToRows, } from '../utils.js';
|
|
13
|
+
// ─── readStdin ────────────────────────────────────────────────────────────────
|
|
14
|
+
/**
 * Drain `process.stdin` until end-of-stream and return everything that was
 * read as a single UTF-8 decoded string.
 *
 * Each chunk is copied into a Buffer and decoding happens once, on the
 * concatenated bytes, after the stream has ended.
 *
 * @returns {Promise<string>} The complete stdin contents ('' when nothing was piped in).
 */
async function readStdin() {
    const pieces = [];
    // Drive the stream's async iterator by hand; it finishes when stdin reaches EOF.
    const reader = process.stdin[Symbol.asyncIterator]();
    for (let step = await reader.next(); !step.done; step = await reader.next()) {
        pieces.push(Buffer.from(step.value));
    }
    const everything = Buffer.concat(pieces);
    return everything.toString('utf-8');
}
|
|
21
|
+
// ─── runStdin ─────────────────────────────────────────────────────────────────
|
|
22
|
+
// Read HTML from stdin, convert to markdown, and output
|
|
23
|
+
/**
 * Write `message` to stderr, wait until the write has been handed off, and
 * only then terminate the process with `code`.
 *
 * Calling `process.exit()` straight after `process.stderr.write()` can drop
 * the message when stderr is written asynchronously, because exit does not
 * wait for pending writes.
 *
 * @param {string} message - Text to emit on stderr (include the trailing newline).
 * @param {number} code - Process exit code.
 * @returns {Promise<void>}
 */
async function flushStderrAndExit(message, code) {
    await new Promise((flushed) => process.stderr.write(message, () => flushed()));
    process.exit(code);
}
/**
 * Handler for `--stdin`: read HTML from stdin, convert it to markdown and
 * print the result on stdout.
 *
 * With `options.json` set, a single JSON line
 * `{ success: true, content, tokens }` is written; otherwise the markdown is
 * written followed by a newline. Empty (whitespace-only) input and any
 * failure are reported on stderr as `Error: …` and the process exits with
 * code 1.
 *
 * @param {{ json?: boolean }} options - Parsed CLI options; only `json` is read here.
 * @returns {Promise<void>}
 */
async function runStdin(options) {
    try {
        const html = await readStdin();
        if (!html.trim()) {
            await flushStderrAndExit('Error: No input received on stdin\n', 1);
            return; // nothing to convert
        }
        const markdown = htmlToMarkdown(html, { raw: false, prune: true });
        if (options.json) {
            const tokens = estimateTokens(markdown);
            process.stdout.write(JSON.stringify({ success: true, content: markdown, tokens }) + '\n');
        }
        else {
            process.stdout.write(markdown + '\n');
        }
    }
    catch (err) {
        // Non-Error values can be thrown too; avoid printing "Error: undefined".
        const reason = err instanceof Error ? err.message : String(err);
        await flushStderrAndExit(`Error: ${reason}\n`, 1);
    }
}
|
|
13
44
|
// ─── runFetch ─────────────────────────────────────────────────────────────────
|
|
14
45
|
// Main fetch handler — shared with the `pipe` and `ask` subcommands
|
|
15
46
|
export async function runFetch(url, options) {
|
|
@@ -1045,7 +1076,12 @@ export function registerFetchCommands(program) {
|
|
|
1045
1076
|
.option('--format <type>', 'Output format: markdown (default), text, html, json')
|
|
1046
1077
|
.option('--content-only', 'Output only the raw content field (no metadata, no JSON wrapper) — ideal for piping to LLMs')
|
|
1047
1078
|
.option('--progress', 'Show engine escalation steps (simple → browser → stealth) with timing')
|
|
1079
|
+
.option('--stdin', 'Read HTML from stdin instead of fetching a URL — converts to markdown')
|
|
1048
1080
|
.action(async (url, options) => {
|
|
1081
|
+
if (options.stdin) {
|
|
1082
|
+
await runStdin(options);
|
|
1083
|
+
return;
|
|
1084
|
+
}
|
|
1049
1085
|
await runFetch(url, options);
|
|
1050
1086
|
});
|
|
1051
1087
|
// ── read subcommand (explicit readable mode) ─────────────────────────────
|
|
@@ -324,19 +324,87 @@ async function twitterExtractor(html, url) {
|
|
|
324
324
|
source: 'fxtwitter',
|
|
325
325
|
};
|
|
326
326
|
// Try to fetch recent tweets from Twitter's public syndication endpoint
|
|
327
|
+
// NOTE: simpleFetch sends too many Sec-* headers that trigger 429. Use https directly.
|
|
327
328
|
let recentTweets = '';
|
|
328
329
|
try {
|
|
329
|
-
const
|
|
330
|
-
const
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
330
|
+
const { default: httpsModule } = await import('https');
|
|
331
|
+
const syndicationHtml = await new Promise((resolve, reject) => {
|
|
332
|
+
const req = httpsModule.request({
|
|
333
|
+
hostname: 'syndication.twitter.com',
|
|
334
|
+
path: `/srv/timeline-profile/screen-name/${u.screen_name}`,
|
|
335
|
+
method: 'GET',
|
|
336
|
+
headers: {
|
|
337
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
338
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
339
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
340
|
+
},
|
|
341
|
+
}, (res) => {
|
|
342
|
+
if (res.statusCode !== 200) {
|
|
343
|
+
reject(new Error(`HTTP ${res.statusCode}`));
|
|
344
|
+
res.resume();
|
|
345
|
+
return;
|
|
346
|
+
}
|
|
347
|
+
let body = '';
|
|
348
|
+
res.on('data', (chunk) => body += chunk.toString());
|
|
349
|
+
res.on('end', () => resolve(body));
|
|
350
|
+
});
|
|
351
|
+
req.on('error', reject);
|
|
352
|
+
setTimeout(() => req.destroy(new Error('timeout')), 12000);
|
|
353
|
+
req.end();
|
|
354
|
+
});
|
|
355
|
+
if (syndicationHtml) {
|
|
356
|
+
// Parse __NEXT_DATA__ JSON from the syndication page for rich tweet data
|
|
357
|
+
const nextDataMatch = syndicationHtml.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
|
|
358
|
+
if (nextDataMatch) {
|
|
359
|
+
const nextData = tryParseJson(nextDataMatch[1]);
|
|
360
|
+
const entries = nextData?.props?.pageProps?.timeline?.entries || [];
|
|
361
|
+
const tweetSections = [];
|
|
362
|
+
for (const entry of entries) {
|
|
363
|
+
if (tweetSections.length >= 8)
|
|
364
|
+
break;
|
|
365
|
+
const tweet = entry?.content?.tweet;
|
|
366
|
+
if (!tweet?.full_text)
|
|
367
|
+
continue;
|
|
368
|
+
const text = tweet.full_text.replace(/\\n/g, '\n').replace(/\\"/g, '"').trim();
|
|
369
|
+
// Skip retweets and pure-URL-only tweets without media
|
|
370
|
+
if (text.startsWith('RT @'))
|
|
371
|
+
continue;
|
|
372
|
+
const media = tweet.extended_entities?.media || tweet.entities?.media || [];
|
|
373
|
+
const isUrlOnly = /^https?:\/\/t\.co\/\S+$/.test(text.trim()) || /^https?:\/\/t\.co\/\S+\s*$/.test(text.trim());
|
|
374
|
+
if (isUrlOnly && media.length === 0)
|
|
375
|
+
continue;
|
|
376
|
+
// Format date
|
|
377
|
+
const dateStr = tweet.created_at ? (() => {
|
|
378
|
+
try {
|
|
379
|
+
return new Date(tweet.created_at).toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' });
|
|
380
|
+
}
|
|
381
|
+
catch {
|
|
382
|
+
return tweet.created_at;
|
|
383
|
+
}
|
|
384
|
+
})() : '';
|
|
385
|
+
const likes = tweet.favorite_count ?? 0;
|
|
386
|
+
const retweets = tweet.retweet_count ?? 0;
|
|
387
|
+
const replies = tweet.reply_count ?? 0;
|
|
388
|
+
const fmtNum = (n) => n >= 1000000 ? (n / 1000000).toFixed(1) + 'M' : n >= 1000 ? (n / 1000).toFixed(1) + 'K' : String(n);
|
|
389
|
+
const mediaLine = media.length > 0 ? `\n📷 ${media.map((m) => m.media_url_https || m.media_url).filter(Boolean).join(', ')}` : '';
|
|
390
|
+
// Clean t.co URLs from text when they have real media
|
|
391
|
+
const cleanText = media.length > 0 ? text.replace(/https?:\/\/t\.co\/\S+/g, '').trim() : text;
|
|
392
|
+
tweetSections.push(`### ${dateStr}\n${cleanText}${mediaLine}\n♻️ ${fmtNum(retweets)} | ❤️ ${fmtNum(likes)} | 💬 ${fmtNum(replies)}`);
|
|
393
|
+
}
|
|
394
|
+
if (tweetSections.length > 0) {
|
|
395
|
+
recentTweets = '\n\n## Recent Tweets\n\n' + tweetSections.join('\n\n---\n\n');
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
else {
|
|
399
|
+
// Fallback: simple regex extraction without metrics
|
|
400
|
+
const tweetMatches = [...syndicationHtml.matchAll(/"full_text":"((?:[^"\\]|\\.)*)"/g)];
|
|
401
|
+
const tweets = tweetMatches
|
|
402
|
+
.slice(0, 5)
|
|
403
|
+
.map(m => m[1].replace(/\\n/g, ' ').replace(/\\"/g, '"').trim())
|
|
404
|
+
.filter(t => t.length > 10 && !t.startsWith('RT @'));
|
|
405
|
+
if (tweets.length > 0) {
|
|
406
|
+
recentTweets = '\n\n## Recent Tweets\n\n' + tweets.map(t => `> ${t}`).join('\n\n');
|
|
407
|
+
}
|
|
340
408
|
}
|
|
341
409
|
}
|
|
342
410
|
}
|
|
@@ -344,7 +412,7 @@ async function twitterExtractor(html, url) {
|
|
|
344
412
|
const websiteLine = structured.website ? `\n🌐 ${structured.website}` : '';
|
|
345
413
|
const joinedLine = structured.created ? `\n📅 Joined: ${structured.created}` : '';
|
|
346
414
|
const likesLine = structured.likes ? ` | ❤️ Likes: ${structured.likes?.toLocaleString() || 0}` : '';
|
|
347
|
-
const cleanContent =
|
|
415
|
+
const cleanContent = `# @${(structured.handle || '').replace('@', '')} on X/Twitter\n\n**${structured.name}**${structured.verified ? ' ✓' : ''}\n\n${structured.bio || ''}\n\n📍 ${structured.location || 'N/A'}${websiteLine}${joinedLine}\n👥 Followers: ${structured.followers?.toLocaleString() || 0} | Following: ${structured.following?.toLocaleString() || 0} | Tweets: ${structured.tweets?.toLocaleString() || 0}${likesLine}${recentTweets}`;
|
|
348
416
|
return { domain, type: 'profile', structured, cleanContent };
|
|
349
417
|
}
|
|
350
418
|
}
|
|
@@ -1961,7 +2029,16 @@ async function linkedinExtractor(html, url) {
|
|
|
1961
2029
|
try {
|
|
1962
2030
|
const { load } = await import('cheerio');
|
|
1963
2031
|
const $ = load(html);
|
|
1964
|
-
//
|
|
2032
|
+
// Detect page type from URL first
|
|
2033
|
+
const urlObj = new URL(url);
|
|
2034
|
+
const pathParts = urlObj.pathname.split('/').filter(Boolean);
|
|
2035
|
+
const pageType = pathParts[0] === 'company' ? 'company'
|
|
2036
|
+
: pathParts[0] === 'in' ? 'profile'
|
|
2037
|
+
: pathParts[0] === 'jobs' ? 'job'
|
|
2038
|
+
: 'page';
|
|
2039
|
+
// Detect if we're on the authwall (LinkedIn redirects unauthenticated requests)
|
|
2040
|
+
const isAuthwall = html.includes('authwall') || html.includes('Join LinkedIn') || html.includes('Sign in') && !html.includes('linkedin.com/in/');
|
|
2041
|
+
// --- Try parsing meta tags / JSON-LD from the HTML ---
|
|
1965
2042
|
let jsonLd = null;
|
|
1966
2043
|
$('script[type="application/ld+json"]').each((_, el) => {
|
|
1967
2044
|
if (jsonLd)
|
|
@@ -1974,30 +2051,83 @@ async function linkedinExtractor(html, url) {
|
|
|
1974
2051
|
const ogTitle = $('meta[property="og:title"]').attr('content') || '';
|
|
1975
2052
|
const ogDescription = $('meta[property="og:description"]').attr('content') || '';
|
|
1976
2053
|
const ogImage = $('meta[property="og:image"]').attr('content') || '';
|
|
1977
|
-
const
|
|
2054
|
+
const metaDescription = $('meta[name="description"]').attr('content') || '';
|
|
2055
|
+
let name = jsonLd?.name || ogTitle.replace(/ \| LinkedIn$/, '').replace(/Sign Up \| LinkedIn$/, '').trim() || '';
|
|
2056
|
+
// When on authwall, discard authwall-specific meta data
|
|
2057
|
+
let headline = isAuthwall ? (jsonLd?.jobTitle || '') : (jsonLd?.jobTitle || metaDescription?.split('|')?.[0]?.trim() || ogDescription || '');
|
|
2058
|
+
let description = isAuthwall ? (jsonLd?.description || '') : (jsonLd?.description || ogDescription || '');
|
|
2059
|
+
let location = $('[class*="location"]').first().text().trim() || jsonLd?.address?.addressLocality || '';
|
|
2060
|
+
// --- If authwall or no useful data, try direct HTTPS fetch with minimal headers ---
|
|
2061
|
+
// LinkedIn returns rich og: meta tags when fetched with a plain browser UA (no Sec-Fetch-* noise)
|
|
2062
|
+
if (!name || isAuthwall || name.toLowerCase().includes('sign up') || name.toLowerCase().includes('linkedin')) {
|
|
2063
|
+
try {
|
|
2064
|
+
const { default: httpsLI } = await import('https');
|
|
2065
|
+
const { gunzip } = await import('zlib');
|
|
2066
|
+
const linkedInHtml = await new Promise((resolve, reject) => {
|
|
2067
|
+
const req = httpsLI.request({
|
|
2068
|
+
hostname: 'www.linkedin.com',
|
|
2069
|
+
path: urlObj.pathname,
|
|
2070
|
+
method: 'GET',
|
|
2071
|
+
headers: {
|
|
2072
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
2073
|
+
'Accept': 'text/html,application/xhtml+xml',
|
|
2074
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
2075
|
+
'Accept-Encoding': 'gzip, deflate',
|
|
2076
|
+
},
|
|
2077
|
+
}, (res) => {
|
|
2078
|
+
if (res.statusCode && res.statusCode >= 400) {
|
|
2079
|
+
reject(new Error(`HTTP ${res.statusCode}`));
|
|
2080
|
+
res.resume();
|
|
2081
|
+
return;
|
|
2082
|
+
}
|
|
2083
|
+
const chunks = [];
|
|
2084
|
+
res.on('data', (chunk) => chunks.push(chunk));
|
|
2085
|
+
res.on('end', () => {
|
|
2086
|
+
const buf = Buffer.concat(chunks);
|
|
2087
|
+
const enc = res.headers['content-encoding'] || '';
|
|
2088
|
+
if (enc === 'gzip') {
|
|
2089
|
+
gunzip(buf, (err, decoded) => err ? reject(err) : resolve(decoded.toString('utf8')));
|
|
2090
|
+
}
|
|
2091
|
+
else {
|
|
2092
|
+
resolve(buf.toString('utf8'));
|
|
2093
|
+
}
|
|
2094
|
+
});
|
|
2095
|
+
});
|
|
2096
|
+
req.on('error', reject);
|
|
2097
|
+
setTimeout(() => req.destroy(new Error('timeout')), 10000);
|
|
2098
|
+
req.end();
|
|
2099
|
+
});
|
|
2100
|
+
if (linkedInHtml) {
|
|
2101
|
+
const $li = load(linkedInHtml);
|
|
2102
|
+
const liOgTitle = $li('meta[property="og:title"]').attr('content') || '';
|
|
2103
|
+
const liOgDesc = $li('meta[property="og:description"]').attr('content') || '';
|
|
2104
|
+
// Only use if it has real profile data (not authwall)
|
|
2105
|
+
if (liOgTitle && !liOgTitle.toLowerCase().includes('sign up') && !liOgTitle.toLowerCase().includes('join linkedin')) {
|
|
2106
|
+
// "Name - Headline | LinkedIn" or "Name | LinkedIn"
|
|
2107
|
+
const titleParts = liOgTitle.replace(/ \| LinkedIn$/, '').split(/\s*[-–]\s*/);
|
|
2108
|
+
if (titleParts[0])
|
|
2109
|
+
name = titleParts[0].trim();
|
|
2110
|
+
if (titleParts[1])
|
|
2111
|
+
headline = titleParts[1].trim();
|
|
2112
|
+
if (liOgDesc)
|
|
2113
|
+
description = liOgDesc;
|
|
2114
|
+
}
|
|
2115
|
+
}
|
|
2116
|
+
}
|
|
2117
|
+
catch { /* direct fetch optional */ }
|
|
2118
|
+
}
|
|
1978
2119
|
if (!name)
|
|
1979
2120
|
return null;
|
|
1980
|
-
const headline = jsonLd?.jobTitle ||
|
|
1981
|
-
$('meta[name="description"]').attr('content')?.split('|')?.[0]?.trim() ||
|
|
1982
|
-
ogDescription || '';
|
|
1983
|
-
const description = jsonLd?.description || ogDescription || '';
|
|
1984
|
-
// Try to detect page type from URL
|
|
1985
|
-
const pathParts = new URL(url).pathname.split('/').filter(Boolean);
|
|
1986
|
-
const pageType = pathParts[0] === 'company' ? 'company'
|
|
1987
|
-
: pathParts[0] === 'in' ? 'profile'
|
|
1988
|
-
: pathParts[0] === 'jobs' ? 'job'
|
|
1989
|
-
: 'page';
|
|
1990
|
-
// Extract any visible structured info from the HTML
|
|
1991
|
-
const location = $('[class*="location"]').first().text().trim() ||
|
|
1992
|
-
jsonLd?.address?.addressLocality || '';
|
|
1993
2121
|
const structured = {
|
|
1994
2122
|
name, headline, description, location, pageType,
|
|
1995
2123
|
image: ogImage, url,
|
|
1996
2124
|
};
|
|
1997
2125
|
const typeLine = pageType === 'company' ? '🏢' : pageType === 'profile' ? '👤' : '🔗';
|
|
1998
2126
|
const locationLine = location ? `\n📍 ${location}` : '';
|
|
1999
|
-
const headlineLine = headline ? `\n*${headline}*` : '';
|
|
2000
|
-
const
|
|
2127
|
+
const headlineLine = headline && headline !== name ? `\n*${headline}*` : '';
|
|
2128
|
+
const descriptionLine = description ? `\n\n${description}` : '';
|
|
2129
|
+
const authNote = '\n\n⚠️ Full LinkedIn profiles require authentication. Use /v1/session to log in first.';
|
|
2130
|
+
const cleanContent = `# ${typeLine} ${name} — LinkedIn${headlineLine}${locationLine}${descriptionLine}${authNote}`;
|
|
2001
2131
|
return { domain: 'linkedin.com', type: pageType, structured, cleanContent };
|
|
2002
2132
|
}
|
|
2003
2133
|
catch {
|
|
@@ -2541,8 +2671,73 @@ async function soundcloudExtractor(_html, url) {
|
|
|
2541
2671
|
// 29. Instagram extractor (oEmbed)
|
|
2542
2672
|
// ---------------------------------------------------------------------------
|
|
2543
2673
|
async function instagramExtractor(_html, url) {
|
|
2674
|
+
const pathParts = new URL(url).pathname.split('/').filter(Boolean);
|
|
2675
|
+
const contentType = pathParts[0] === 'p' ? 'post' : pathParts[0] === 'reel' ? 'reel' : pathParts[0] === 'tv' ? 'igtv' : pathParts.length === 1 ? 'profile' : 'post';
|
|
2676
|
+
// --- Profile extraction via Instagram internal API (no auth needed) ---
|
|
2677
|
+
if (contentType === 'profile' && pathParts.length === 1) {
|
|
2678
|
+
const username = pathParts[0];
|
|
2679
|
+
try {
|
|
2680
|
+
const apiUrl = `https://www.instagram.com/api/v1/users/web_profile_info/?username=${encodeURIComponent(username)}`;
|
|
2681
|
+
const igHeaders = {
|
|
2682
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
2683
|
+
'X-IG-App-ID': '936619743392459',
|
|
2684
|
+
'Accept': '*/*',
|
|
2685
|
+
'Referer': 'https://www.instagram.com/',
|
|
2686
|
+
};
|
|
2687
|
+
const apiResult = await simpleFetch(apiUrl, igHeaders['User-Agent'], 12000, igHeaders);
|
|
2688
|
+
const data = tryParseJson(apiResult?.html || '');
|
|
2689
|
+
const user = data?.data?.user;
|
|
2690
|
+
if (user && user.username) {
|
|
2691
|
+
const followers = user.edge_followed_by?.count ?? 0;
|
|
2692
|
+
const following = user.edge_follow?.count ?? 0;
|
|
2693
|
+
const postCount = user.edge_owner_to_timeline_media?.count ?? 0;
|
|
2694
|
+
const fmtNum = (n) => n >= 1000000 ? (n / 1000000).toFixed(1) + 'M' : n >= 1000 ? (n / 1000).toFixed(1) + 'K' : String(n);
|
|
2695
|
+
const structured = {
|
|
2696
|
+
username: user.username,
|
|
2697
|
+
fullName: user.full_name || '',
|
|
2698
|
+
bio: user.biography || '',
|
|
2699
|
+
followers,
|
|
2700
|
+
following,
|
|
2701
|
+
posts: postCount,
|
|
2702
|
+
verified: user.is_verified || false,
|
|
2703
|
+
isPrivate: user.is_private || false,
|
|
2704
|
+
profilePic: user.profile_pic_url_hd || user.profile_pic_url || '',
|
|
2705
|
+
externalUrl: user.external_url || (user.bio_links?.[0]?.url) || '',
|
|
2706
|
+
contentType: 'profile',
|
|
2707
|
+
};
|
|
2708
|
+
// Recent posts
|
|
2709
|
+
const edges = user.edge_owner_to_timeline_media?.edges || [];
|
|
2710
|
+
const postSections = [];
|
|
2711
|
+
for (const edge of edges.slice(0, 6)) {
|
|
2712
|
+
const node = edge?.node;
|
|
2713
|
+
if (!node)
|
|
2714
|
+
continue;
|
|
2715
|
+
const caption = node.edge_media_to_caption?.edges?.[0]?.node?.text || '';
|
|
2716
|
+
const likes = node.edge_liked_by?.count ?? node.edge_media_preview_like?.count ?? 0;
|
|
2717
|
+
const comments = node.edge_media_to_comment?.count ?? 0;
|
|
2718
|
+
const isVideo = node.is_video;
|
|
2719
|
+
const mediaType = isVideo ? '🎬' : '📸';
|
|
2720
|
+
const timestamp = node.taken_at_timestamp ? new Date(node.taken_at_timestamp * 1000).toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' }) : '';
|
|
2721
|
+
const imgUrl = node.thumbnail_src || node.display_url || '';
|
|
2722
|
+
const captionSnippet = caption ? caption.slice(0, 150) + (caption.length > 150 ? '…' : '') : '';
|
|
2723
|
+
postSections.push(`### ${mediaType} ${timestamp}\n${captionSnippet}\n❤️ ${fmtNum(likes)} | 💬 ${fmtNum(comments)}${imgUrl ? `\n🖼 ${imgUrl}` : ''}`);
|
|
2724
|
+
}
|
|
2725
|
+
const verifiedBadge = structured.verified ? ' ✓' : '';
|
|
2726
|
+
const privateBadge = structured.isPrivate ? ' 🔒' : '';
|
|
2727
|
+
const bioLine = structured.bio ? `\n\n${structured.bio}` : '';
|
|
2728
|
+
const externalLine = structured.externalUrl ? `\n🌐 ${structured.externalUrl}` : '';
|
|
2729
|
+
const postsSection = postSections.length > 0 ? '\n\n## Recent Posts\n\n' + postSections.join('\n\n---\n\n') : '';
|
|
2730
|
+
const cleanContent = `# @${structured.username} on Instagram${verifiedBadge}${privateBadge}\n\n**${structured.fullName || structured.username}**${bioLine}${externalLine}\n\n👥 ${fmtNum(followers)} Followers | ${fmtNum(following)} Following | ${fmtNum(postCount)} Posts${postsSection}`;
|
|
2731
|
+
return { domain: 'instagram.com', type: 'profile', structured, cleanContent };
|
|
2732
|
+
}
|
|
2733
|
+
}
|
|
2734
|
+
catch (e) {
|
|
2735
|
+
if (process.env.DEBUG)
|
|
2736
|
+
console.debug('[webpeel]', 'Instagram profile API failed:', e instanceof Error ? e.message : e);
|
|
2737
|
+
}
|
|
2738
|
+
}
|
|
2739
|
+
// --- Post/Reel/IGTV: Try oEmbed API ---
|
|
2544
2740
|
try {
|
|
2545
|
-
// Instagram official oEmbed (no access token needed for basic data)
|
|
2546
2741
|
const oembedUrl = `https://graph.facebook.com/v22.0/instagram_oembed?url=${encodeURIComponent(url)}&fields=title,author_name,provider_name,thumbnail_url`;
|
|
2547
2742
|
const data = await fetchJson(oembedUrl);
|
|
2548
2743
|
// Also try noembed.com as fallback
|
|
@@ -2553,8 +2748,6 @@ async function instagramExtractor(_html, url) {
|
|
|
2553
2748
|
}
|
|
2554
2749
|
if (!resolvedData || resolvedData.error)
|
|
2555
2750
|
return null;
|
|
2556
|
-
const pathParts = new URL(url).pathname.split('/').filter(Boolean);
|
|
2557
|
-
const contentType = pathParts[0] === 'p' ? 'post' : pathParts[0] === 'reel' ? 'reel' : pathParts[0] === 'tv' ? 'igtv' : pathParts.length === 1 ? 'profile' : 'post';
|
|
2558
2751
|
const structured = {
|
|
2559
2752
|
title: resolvedData.title || '',
|
|
2560
2753
|
author: resolvedData.author_name || '',
|
|
@@ -2563,7 +2756,7 @@ async function instagramExtractor(_html, url) {
|
|
|
2563
2756
|
contentType,
|
|
2564
2757
|
provider: 'Instagram',
|
|
2565
2758
|
};
|
|
2566
|
-
const typeEmoji = contentType === 'reel' ? '🎬' : contentType === 'post' ? '📸' :
|
|
2759
|
+
const typeEmoji = contentType === 'reel' ? '🎬' : contentType === 'post' ? '📸' : '📱';
|
|
2567
2760
|
const titleText = structured.title || `Instagram ${contentType} by ${structured.author}`;
|
|
2568
2761
|
const cleanContent = `## ${typeEmoji} Instagram ${contentType}: ${titleText}\n\n**Creator:** @${structured.author.replace('@', '')}\n**URL:** ${url}`;
|
|
2569
2762
|
return { domain: 'instagram.com', type: contentType, structured, cleanContent };
|
|
@@ -2575,59 +2768,106 @@ async function instagramExtractor(_html, url) {
|
|
|
2575
2768
|
}
|
|
2576
2769
|
}
|
|
2577
2770
|
// ---------------------------------------------------------------------------
|
|
2578
|
-
// 30. PDF extractor (URL-based detection)
|
|
2771
|
+
// 30. PDF extractor (URL-based detection) — downloads and extracts real text
|
|
2579
2772
|
// ---------------------------------------------------------------------------
|
|
2773
|
+
const PDF_MAX_BYTES = 50 * 1024 * 1024; // 50 MB
|
|
2774
|
+
const PDF_TRUNCATE_CHARS = 100_000;
|
|
2580
2775
|
async function pdfExtractor(_html, url) {
|
|
2581
2776
|
try {
|
|
2582
2777
|
const urlObj = new URL(url);
|
|
2583
2778
|
const filename = urlObj.pathname.split('/').pop() || 'document.pdf';
|
|
2584
2779
|
const hostname = urlObj.hostname;
|
|
2585
|
-
//
|
|
2586
|
-
let
|
|
2587
|
-
let
|
|
2780
|
+
// Download the PDF
|
|
2781
|
+
let buffer;
|
|
2782
|
+
let finalContentType = 'application/pdf';
|
|
2588
2783
|
try {
|
|
2589
|
-
const
|
|
2590
|
-
|
|
2591
|
-
|
|
2592
|
-
await new Promise((resolve) => {
|
|
2593
|
-
const req = client.request(url, { method: 'HEAD', timeout: 5000 }, (res) => {
|
|
2594
|
-
contentType = res.headers['content-type'] || 'application/pdf';
|
|
2595
|
-
contentLength = res.headers['content-length'] || '';
|
|
2596
|
-
resolve();
|
|
2597
|
-
res.resume();
|
|
2598
|
-
});
|
|
2599
|
-
req.on('error', () => resolve());
|
|
2600
|
-
req.on('timeout', () => { req.destroy(); resolve(); });
|
|
2601
|
-
req.end();
|
|
2784
|
+
const response = await fetch(url, {
|
|
2785
|
+
headers: { 'User-Agent': 'Mozilla/5.0 (compatible; WebPeel/1.0)' },
|
|
2786
|
+
signal: AbortSignal.timeout(30000),
|
|
2602
2787
|
});
|
|
2788
|
+
if (!response.ok) {
|
|
2789
|
+
if (process.env.DEBUG)
|
|
2790
|
+
console.debug('[webpeel]', `PDF download failed: HTTP ${response.status}`);
|
|
2791
|
+
return null; // Let the normal pipeline handle it
|
|
2792
|
+
}
|
|
2793
|
+
finalContentType = response.headers.get('content-type') || 'application/pdf';
|
|
2794
|
+
// Verify it's actually a PDF (content-type or URL)
|
|
2795
|
+
const isPdf = finalContentType.toLowerCase().includes('pdf') || /\.pdf(\?|$|#)/i.test(url);
|
|
2796
|
+
if (!isPdf)
|
|
2797
|
+
return null;
|
|
2798
|
+
const arrayBuffer = await response.arrayBuffer();
|
|
2799
|
+
buffer = Buffer.from(arrayBuffer);
|
|
2800
|
+
}
|
|
2801
|
+
catch (downloadErr) {
|
|
2802
|
+
if (process.env.DEBUG)
|
|
2803
|
+
console.debug('[webpeel]', 'PDF download error:', downloadErr instanceof Error ? downloadErr.message : downloadErr);
|
|
2804
|
+
return null; // Let the normal pipeline handle it
|
|
2603
2805
|
}
|
|
2604
|
-
|
|
2605
|
-
|
|
2606
|
-
|
|
2806
|
+
// Size guard
|
|
2807
|
+
if (buffer.length > PDF_MAX_BYTES) {
|
|
2808
|
+
if (process.env.DEBUG)
|
|
2809
|
+
console.debug('[webpeel]', `PDF too large (${buffer.length} bytes), falling back to stub`);
|
|
2607
2810
|
return null;
|
|
2608
|
-
|
|
2609
|
-
|
|
2610
|
-
|
|
2611
|
-
|
|
2612
|
-
|
|
2613
|
-
|
|
2614
|
-
|
|
2615
|
-
|
|
2811
|
+
}
|
|
2812
|
+
// Extract text via pdf-parse
|
|
2813
|
+
const { extractPdf } = await import('./pdf.js');
|
|
2814
|
+
let pdf;
|
|
2815
|
+
try {
|
|
2816
|
+
pdf = await extractPdf(buffer);
|
|
2817
|
+
}
|
|
2818
|
+
catch (parseErr) {
|
|
2819
|
+
if (process.env.DEBUG)
|
|
2820
|
+
console.debug('[webpeel]', 'PDF parse failed:', parseErr instanceof Error ? parseErr.message : parseErr);
|
|
2821
|
+
return null; // Let the normal pipeline handle it
|
|
2822
|
+
}
|
|
2823
|
+
// Normalize whitespace (pdf-parse emits lots of blank lines)
|
|
2824
|
+
let text = (pdf.text || '')
|
|
2825
|
+
.replace(/\r\n/g, '\n')
|
|
2826
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
2827
|
+
.replace(/[ \t]+/g, ' ')
|
|
2828
|
+
.trim();
|
|
2829
|
+
// Truncate very large documents
|
|
2830
|
+
let truncated = false;
|
|
2831
|
+
if (text.length > PDF_TRUNCATE_CHARS) {
|
|
2832
|
+
text = text.slice(0, PDF_TRUNCATE_CHARS);
|
|
2833
|
+
truncated = true;
|
|
2834
|
+
}
|
|
2835
|
+
if (!text) {
|
|
2836
|
+
// Scanned/image-only PDF — return a clear message rather than empty content
|
|
2837
|
+
const emptyNote = `## 📄 ${filename}\n\n*This PDF appears to be a scanned document (image-only). No extractable text was found.*\n\n**Source:** ${url}`;
|
|
2838
|
+
return {
|
|
2839
|
+
domain: hostname,
|
|
2840
|
+
type: 'pdf',
|
|
2841
|
+
structured: { title: filename, url, pages: pdf.pages, contentType: finalContentType },
|
|
2842
|
+
cleanContent: emptyNote,
|
|
2843
|
+
};
|
|
2844
|
+
}
|
|
2845
|
+
// Build markdown output
|
|
2846
|
+
const titleRaw = pdf.metadata?.title || '';
|
|
2847
|
+
const title = titleRaw || filename.replace(/\.pdf$/i, '') || 'PDF Document';
|
|
2848
|
+
const metaParts = [];
|
|
2849
|
+
if (pdf.metadata?.author)
|
|
2850
|
+
metaParts.push(`**Author:** ${pdf.metadata.author}`);
|
|
2851
|
+
if (pdf.pages)
|
|
2852
|
+
metaParts.push(`**Pages:** ${pdf.pages}`);
|
|
2853
|
+
metaParts.push(`**Source:** ${url}`);
|
|
2854
|
+
const header = titleRaw ? `# ${titleRaw}\n\n` : '';
|
|
2855
|
+
const metaBlock = metaParts.join(' | ') + '\n\n';
|
|
2856
|
+
const truncNote = truncated ? '\n\n*[Content truncated — document exceeds 100,000 characters]*' : '';
|
|
2857
|
+
const cleanContent = header + metaBlock + text + truncNote;
|
|
2858
|
+
return {
|
|
2859
|
+
domain: hostname,
|
|
2860
|
+
type: 'pdf',
|
|
2861
|
+
structured: {
|
|
2862
|
+
title,
|
|
2863
|
+
filename,
|
|
2864
|
+
url,
|
|
2865
|
+
pages: pdf.pages,
|
|
2866
|
+
contentType: finalContentType,
|
|
2867
|
+
...pdf.metadata,
|
|
2868
|
+
},
|
|
2869
|
+
cleanContent,
|
|
2616
2870
|
};
|
|
2617
|
-
const sizeStr = fileSizeKb ? ` (${fileSizeKb > 1024 ? (fileSizeKb / 1024).toFixed(1) + ' MB' : fileSizeKb + ' KB'})` : '';
|
|
2618
|
-
const cleanContent = `## 📄 PDF Document: ${filename}
|
|
2619
|
-
|
|
2620
|
-
**URL:** ${url}
|
|
2621
|
-
**Host:** ${hostname}${sizeStr ? `\n**Size:** ${sizeStr}` : ''}
|
|
2622
|
-
|
|
2623
|
-
> **Note:** This is a PDF document. Binary PDF content cannot be directly extracted as text through standard web fetching. To extract the full text, consider:
|
|
2624
|
-
>
|
|
2625
|
-
> 1. Use a dedicated PDF extraction service (e.g., Adobe PDF Extract API, pdfminer, PyMuPDF)
|
|
2626
|
-
> 2. Download the file and process locally with \`pdf-parse\` (Node.js) or \`pdfplumber\` (Python)
|
|
2627
|
-
> 3. For academic PDFs, check if an HTML version is available at the same URL without \`.pdf\`
|
|
2628
|
-
|
|
2629
|
-
**Direct download URL:** ${url}`;
|
|
2630
|
-
return { domain: hostname, type: 'pdf', structured, cleanContent };
|
|
2631
2871
|
}
|
|
2632
2872
|
catch (e) {
|
|
2633
2873
|
if (process.env.DEBUG)
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* POST /v1/agent
|
|
2
|
+
* POST /v1/agent — single autonomous agent query
|
|
3
|
+
* POST /v1/agent/batch — parallel batch of agent queries (max 50)
|
|
4
|
+
* GET /v1/agent/batch/:id — poll batch job status
|
|
3
5
|
*
|
|
4
6
|
* Autonomous web agent — search → fetch → extract (LLM or BM25)
|
|
5
7
|
*
|
|
@@ -11,9 +13,7 @@
|
|
|
11
13
|
*
|
|
12
14
|
* Returns: { success, data|answer, sources, method, elapsed, tokensUsed }
|
|
13
15
|
*
|
|
14
|
-
*
|
|
15
|
-
* - agent-llm: schema + llmApiKey → LLM extraction (BYOK)
|
|
16
|
-
* - agent-bm25: no LLM key → BM25 text answer (always free)
|
|
16
|
+
* Webhook support: pass `webhook` URL to get async delivery with HMAC-SHA256 signing.
|
|
17
17
|
*
|
|
18
18
|
* 5-minute in-memory cache. Max 10 sources per request.
|
|
19
19
|
*/
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* POST /v1/agent
|
|
2
|
+
* POST /v1/agent — single autonomous agent query
|
|
3
|
+
* POST /v1/agent/batch — parallel batch of agent queries (max 50)
|
|
4
|
+
* GET /v1/agent/batch/:id — poll batch job status
|
|
3
5
|
*
|
|
4
6
|
* Autonomous web agent — search → fetch → extract (LLM or BM25)
|
|
5
7
|
*
|
|
@@ -11,9 +13,7 @@
|
|
|
11
13
|
*
|
|
12
14
|
* Returns: { success, data|answer, sources, method, elapsed, tokensUsed }
|
|
13
15
|
*
|
|
14
|
-
*
|
|
15
|
-
* - agent-llm: schema + llmApiKey → LLM extraction (BYOK)
|
|
16
|
-
* - agent-bm25: no LLM key → BM25 text answer (always free)
|
|
16
|
+
* Webhook support: pass `webhook` URL to get async delivery with HMAC-SHA256 signing.
|
|
17
17
|
*
|
|
18
18
|
* 5-minute in-memory cache. Max 10 sources per request.
|
|
19
19
|
*/
|
|
@@ -22,9 +22,42 @@ import { peel } from '../../index.js';
|
|
|
22
22
|
import { extractWithLLM } from '../../core/llm-extract.js';
|
|
23
23
|
import { getBestSearchProvider } from '../../core/search-provider.js';
|
|
24
24
|
import { quickAnswer } from '../../core/quick-answer.js';
|
|
25
|
+
import { sendWebhook } from './webhooks.js';
|
|
25
26
|
import { createLogger } from '../../core/logger.js';
|
|
26
27
|
import crypto from 'crypto';
|
|
27
28
|
const log = createLogger('agent');
|
|
29
|
+
// In-memory registry of batch jobs, keyed by job id. Entries are expected to
// carry a `createdAt` epoch-millisecond timestamp, which the sweep below reads.
const batchJobs = new Map();
// How long a batch job stays in the registry before it becomes eligible for eviction.
const BATCH_TTL = 60 * 60 * 1000; // 1 hour
// Every 10 minutes, evict jobs older than BATCH_TTL. The timer is unref'd so
// it never keeps the process alive on its own.
setInterval(() => {
    const cutoff = Date.now() - BATCH_TTL;
    batchJobs.forEach((job, id) => {
        if (job.createdAt < cutoff) {
            batchJobs.delete(id);
        }
    });
}, 10 * 60 * 1000).unref();
|
|
39
|
+
// Simple concurrency limiter
|
|
40
|
+
/**
 * Minimal counting semaphore used to cap how many async tasks run at once.
 *
 * Usage: `await sem.acquire()` before starting work and `sem.release()` when
 * the work is finished. Callers that arrive while all `max` slots are taken
 * are parked in FIFO order and woken one at a time by `release()`.
 */
class Semaphore {
    /** Maximum number of slots that may be held at the same time. */
    max;
    /** FIFO list of wake-up callbacks for callers waiting for a slot. */
    queue = [];
    /** Number of slots currently held. */
    running = 0;
    /**
     * @param {number} max - Maximum number of concurrent holders.
     */
    constructor(max) {
        this.max = max;
    }
    /**
     * Take a slot, waiting if none is free.
     *
     * @returns {Promise<void>} Resolves once the caller holds a slot.
     */
    async acquire() {
        const hasFreeSlot = this.running < this.max;
        if (!hasFreeSlot) {
            // Saturated: park until release() hands this waiter its turn.
            return new Promise((wake) => {
                this.queue.push(() => {
                    this.running++;
                    wake();
                });
            });
        }
        this.running++;
    }
    /**
     * Give a slot back and, if anyone is waiting, wake the oldest waiter.
     */
    release() {
        this.running -= 1;
        const wakeNext = this.queue.shift();
        wakeNext?.();
    }
}
|
|
28
61
|
const cache = new Map();
|
|
29
62
|
const CACHE_TTL = 5 * 60 * 1000; // 5 minutes
|
|
30
63
|
function getCached(key) {
|
|
@@ -48,191 +81,175 @@ function setCache(key, result) {
|
|
|
48
81
|
}
|
|
49
82
|
cache.set(key, { result, expiresAt: Date.now() + CACHE_TTL });
|
|
50
83
|
}
|
|
84
|
+
/**
 * Fetch one source page as markdown, giving up after `timeoutMs`.
 *
 * Best-effort by design: any failure (network error, timeout) yields `null`
 * so a single bad source cannot fail the whole query.
 *
 * @param {{ url: string, title?: string }} source - Page to fetch.
 * @param {number} timeoutMs - Per-source deadline in milliseconds.
 * @returns {Promise<{ url: string, title: string, content: string, tokens: number } | null>}
 */
async function fetchAgentSource(source, timeoutMs) {
    let timer;
    try {
        const deadline = new Promise((_, reject) => {
            timer = setTimeout(() => reject(new Error('per-source timeout')), timeoutMs);
        });
        const result = await Promise.race([
            peel(source.url, { render: false, noEscalate: true, format: 'markdown', timeout: timeoutMs, budget: 3000 }),
            deadline,
        ]);
        return { url: source.url, title: result.title || source.title || '', content: (result.content || '').slice(0, 15000), tokens: result.tokens || 0 };
    }
    catch {
        return null;
    }
    finally {
        // Always cancel the deadline timer so it does not linger after the fetch settles.
        clearTimeout(timer);
    }
}
/**
 * Run one agent query: resolve sources, fetch them, then extract or answer.
 *
 * @param {object} params
 * @param {string} params.prompt - What to find; also used as the web search query.
 * @param {object} [params.schema] - Extraction schema; enables the LLM path together with `llmApiKey`.
 * @param {string} [params.llmApiKey] - Caller-supplied LLM key.
 * @param {string} [params.llmProvider] - LLM provider; falls back to 'openai'.
 * @param {string} [params.llmModel] - LLM model name.
 * @param {string[]} [params.urls] - Explicit pages to use instead of searching (at most 10 are used).
 * @param {number} [params.sources] - Number of search results to use (1-10, default 5).
 * @returns {Promise<object>} On success `{ success: true, data | answer, sources, method, tokensUsed, elapsed }`
 *   (plus `cached: true` on a cache hit); on failure `{ success: false, error: { type, message }, prompt, elapsed }`.
 * @throws {TypeError} If `prompt` is not a string.
 * @throws Whatever `extractWithLLM` throws on the LLM path.
 */
async function runAgentQuery(params) {
    const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources } = params;
    if (typeof prompt !== 'string') {
        throw new TypeError('prompt must be a string');
    }
    const startMs = Date.now();
    const MAX_SOURCES = 10;
    // Clamp to [1, 10]; missing, zero or non-numeric values fall back to 5.
    const numSources = Math.min(Math.max(Math.trunc(Number(maxSources)) || 5, 1), MAX_SOURCES);
    const explicitUrls = Array.isArray(urls)
        ? urls.filter((u) => typeof u === 'string' && u.trim() !== '').slice(0, MAX_SOURCES)
        : [];
    const useLlm = Boolean(schema && llmApiKey);
    // Cache check. The key covers every input that changes the result, so a
    // BM25 answer is never served for an LLM request or for a different URL set.
    // The API key itself is deliberately kept out of the key.
    const cacheKey = JSON.stringify([
        prompt.trim(),
        schema || {},
        explicitUrls,
        numSources,
        useLlm ? [llmProvider || 'openai', llmModel || ''] : null,
    ]);
    const cached = getCached(cacheKey);
    if (cached)
        return { ...cached, cached: true };
    // Step 1: Resolve source URLs
    let sourceUrls = [];
    if (explicitUrls.length > 0) {
        sourceUrls = explicitUrls.map((u) => ({ url: u }));
    }
    else {
        log.info(`Searching web for: "${prompt}"`);
        const { provider, apiKey: searchApiKey } = getBestSearchProvider();
        try {
            const searchResults = await provider.searchWeb(prompt.trim(), { count: numSources, apiKey: searchApiKey });
            sourceUrls = searchResults.slice(0, numSources).map((r) => ({ url: r.url, title: r.title, snippet: r.snippet }));
        }
        catch (err) {
            // A failed search is reported as "no sources" below rather than thrown.
            log.warn('Search failed:', err.message);
        }
    }
    if (sourceUrls.length === 0) {
        return { success: false, error: { type: 'no_sources', message: 'Could not find relevant pages for this query' }, prompt, elapsed: Date.now() - startMs };
    }
    // Step 2: Fetch pages in parallel
    log.info(`Fetching ${sourceUrls.length} sources in parallel`);
    const PER_SOURCE_TIMEOUT_MS = 5000;
    // fetchAgentSource never rejects, so Promise.all cannot fail fast here.
    const fetchResults = (await Promise.all(sourceUrls.map((source) => fetchAgentSource(source, PER_SOURCE_TIMEOUT_MS))))
        .filter(Boolean);
    if (fetchResults.length === 0) {
        return { success: false, error: { type: 'fetch_failed', message: 'Could not fetch any of the found pages' }, prompt, sources: sourceUrls.map((s) => ({ url: s.url })), elapsed: Date.now() - startMs };
    }
    // Step 3: Extract or answer
    const combinedContent = fetchResults.map((r) => `### ${r.title || r.url}\nURL: ${r.url}\n\n${r.content}`).join('\n\n---\n\n');
    const totalTokens = fetchResults.reduce((sum, r) => sum + r.tokens, 0);
    const resultSources = fetchResults.map((r) => ({ url: r.url, title: r.title }));
    let result;
    if (useLlm) {
        log.info('Using LLM extraction');
        const extracted = await extractWithLLM({
            content: combinedContent.slice(0, 30000), schema, llmApiKey, llmProvider: (llmProvider || 'openai'), llmModel,
            prompt: `Based on these web pages, ${prompt}`, url: fetchResults[0].url,
        });
        const llmTokensUsed = (extracted.tokensUsed?.input ?? 0) + (extracted.tokensUsed?.output ?? 0);
        result = { success: true, data: extracted.items, sources: resultSources, method: 'agent-llm',
            llm: { provider: extracted.provider || llmProvider || 'openai', model: extracted.model || llmModel || 'default' }, tokensUsed: totalTokens + llmTokensUsed, elapsed: Date.now() - startMs };
    }
    else {
        log.info('Using BM25 text extraction');
        const qa = quickAnswer({ question: prompt, content: combinedContent, maxPassages: 3, maxChars: 2000 });
        result = { success: true, answer: qa.answer || combinedContent.slice(0, 2000), confidence: qa.confidence ?? 0,
            sources: resultSources, method: 'agent-bm25', tokensUsed: totalTokens, elapsed: Date.now() - startMs };
    }
    // Only successful results are cached; failures are retried on the next call.
    setCache(cacheKey, result);
    return result;
}
|
|
51
156
|
// ---------------------------------------------------------------------------
|
|
52
157
|
// Route factory
|
|
53
158
|
// ---------------------------------------------------------------------------
|
|
54
159
|
/**
 * Build the Express router for the agent API.
 *
 * Routes (relative to the mount point):
 *   POST /           single query; with `webhook` the result is delivered asynchronously
 *   POST /batch      up to 50 prompts processed in the background, 5 at a time
 *   GET  /batch/:id  poll a batch job created by POST /batch
 *
 * @returns {import('express').Router}
 */
export function createAgentRouter() {
    const router = Router();
    // Deliver a webhook without ever throwing: a delivery failure (sync or
    // async) is logged under `logLabel` and must not be mistaken for a failed query.
    const deliverWebhook = async (url, event, payload, logLabel) => {
        try {
            await sendWebhook(url, event, payload);
        }
        catch (err) {
            log.error(logLabel, err?.message ?? String(err));
        }
    };
    // ── POST /v1/agent — single query (with optional webhook) ──────────────
    router.post('/', async (req, res) => {
        const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources, webhook } = req.body || {};
        const requestId = req.requestId || crypto.randomUUID();
        const startMs = Date.now();
        // typeof guard: calling .trim() on a non-string prompt would throw
        // outside the try block and leave the request unanswered.
        if (typeof prompt !== 'string' || !prompt.trim()) {
            return res.status(400).json({
                success: false,
                error: { type: 'missing_prompt', message: 'Provide a prompt describing what you want to find',
                    hint: 'POST /v1/agent { "prompt": "Find Stripe pricing plans" }', docs: 'https://webpeel.dev/docs/api-reference' },
                requestId,
            });
        }
        const query = { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources };
        // Async mode: webhook provided → return immediately, deliver result later
        if (webhook) {
            const jobId = crypto.randomUUID();
            res.json({ success: true, id: jobId, status: 'processing', requestId });
            // Intentionally not awaited: the HTTP response has already been sent.
            void (async () => {
                let result;
                try {
                    result = await runAgentQuery(query);
                }
                catch (err) {
                    log.error('Async agent error:', err.message);
                    await deliverWebhook(webhook, 'agent.failed', { id: jobId, error: err.message, requestId }, 'Async agent webhook failed:');
                    return;
                }
                await deliverWebhook(webhook, 'agent.completed', { id: jobId, ...result, requestId }, 'Async agent webhook failed:');
            })();
            return;
        }
        // Synchronous mode: wait for result
        try {
            const result = await runAgentQuery(query);
            return res.json({ ...result, requestId });
        }
        catch (err) {
            log.error('Agent error:', err.message);
            return res.status(500).json({
                success: false, error: { type: 'agent_error', message: err.message || 'An unexpected error occurred' },
                prompt, elapsed: Date.now() - startMs, requestId,
            });
        }
    });
    // ── POST /v1/agent/batch — parallel batch queries ─────────────────────
    router.post('/batch', async (req, res) => {
        const { prompts, schema, llmApiKey, llmProvider, llmModel, sources, webhook } = req.body || {};
        const requestId = req.requestId || crypto.randomUUID();
        if (!Array.isArray(prompts) || prompts.length === 0) {
            return res.status(400).json({
                success: false, error: { type: 'missing_prompts', message: 'Provide an array of prompts',
                    hint: 'POST /v1/agent/batch { "prompts": ["Find X", "Find Y"] }' }, requestId,
            });
        }
        if (prompts.length > 50) {
            return res.status(400).json({
                success: false, error: { type: 'too_many_prompts', message: `Max 50 prompts per batch (got ${prompts.length})` }, requestId,
            });
        }
        const jobId = crypto.randomUUID();
        const job = { id: jobId, status: 'processing', total: prompts.length, completed: 0, results: [], webhook, createdAt: Date.now() };
        batchJobs.set(jobId, job);
        // Return immediately, then process in background
        res.json({ success: true, id: jobId, status: 'processing', total: prompts.length, requestId });
        // Process in background with concurrency limit of 5
        const sem = new Semaphore(5);
        const tasks = prompts.map(async (prompt) => {
            await sem.acquire();
            try {
                const result = await runAgentQuery({ prompt, schema, llmApiKey, llmProvider, llmModel, sources });
                // Results are appended in completion order; `prompt` identifies each entry.
                // `error` is carried over so pollers can see why a query was unsuccessful.
                job.results.push({ prompt, success: !!result.success, answer: result.answer,
                    data: result.data, sources: result.sources, method: result.method, elapsed: result.elapsed, error: result.error });
            }
            catch (err) {
                job.results.push({ prompt, success: false, error: err.message });
            }
            finally {
                job.completed++;
                sem.release();
            }
        });
        // Intentionally not awaited: the HTTP response has already been sent.
        void Promise.allSettled(tasks).then(async () => {
            // The job is 'completed' once every prompt has been processed,
            // whether or not the individual queries succeeded.
            job.status = 'completed';
            if (webhook) {
                await deliverWebhook(webhook, 'agent.batch.completed', { id: jobId, total: job.total, completed: job.completed, results: job.results }, 'Batch webhook failed:');
            }
        });
    });
    // ── GET /v1/agent/batch/:id — poll batch status ───────────────────────
    router.get('/batch/:id', (req, res) => {
        const requestId = req.requestId || crypto.randomUUID();
        const job = batchJobs.get(req.params.id);
        if (!job) {
            return res.status(404).json({ success: false, error: { type: 'not_found', message: 'Batch job not found or expired' }, requestId });
        }
        return res.json({ success: true, id: job.id, status: job.status, total: job.total, completed: job.completed, results: job.results, requestId });
    });
    return router;
}
|
|
@@ -6,9 +6,11 @@
|
|
|
6
6
|
* POST /v1/session/:id/navigate → navigate to URL { url }
|
|
7
7
|
* POST /v1/session/:id/act → execute PageActions array
|
|
8
8
|
* GET /v1/session/:id/screenshot → take screenshot (image/png)
|
|
9
|
+
* GET /v1/session/:id/cookies → export cookies from session context
|
|
10
|
+
* POST /v1/session/:id/cookies → inject cookies into session context
|
|
9
11
|
* DELETE /v1/session/:id → close session
|
|
10
12
|
*
|
|
11
|
-
* Use cases: login flows, multi-step automation, UI testing.
|
|
13
|
+
* Use cases: login flows, multi-step automation, UI testing, cookie persistence.
|
|
12
14
|
* This is what Browserbase charges $500/mo for — built into WebPeel.
|
|
13
15
|
*/
|
|
14
16
|
import { Router } from 'express';
|
|
@@ -6,9 +6,11 @@
|
|
|
6
6
|
* POST /v1/session/:id/navigate → navigate to URL { url }
|
|
7
7
|
* POST /v1/session/:id/act → execute PageActions array
|
|
8
8
|
* GET /v1/session/:id/screenshot → take screenshot (image/png)
|
|
9
|
+
* GET /v1/session/:id/cookies → export cookies from session context
|
|
10
|
+
* POST /v1/session/:id/cookies → inject cookies into session context
|
|
9
11
|
* DELETE /v1/session/:id → close session
|
|
10
12
|
*
|
|
11
|
-
* Use cases: login flows, multi-step automation, UI testing.
|
|
13
|
+
* Use cases: login flows, multi-step automation, UI testing, cookie persistence.
|
|
12
14
|
* This is what Browserbase charges $500/mo for — built into WebPeel.
|
|
13
15
|
*/
|
|
14
16
|
import { Router } from 'express';
|
|
@@ -17,13 +19,15 @@ import { normalizeActions, executeActions } from '../../core/actions.js';
|
|
|
17
19
|
import { ANTI_DETECTION_ARGS, getRandomViewport, getRandomUserAgent, applyStealthScripts, } from '../../core/browser-pool.js';
|
|
18
20
|
import { extractReadableContent } from '../../core/readability.js';
|
|
19
21
|
const sessions = new Map();
|
|
20
|
-
const
|
|
22
|
+
const DEFAULT_SESSION_TTL_MS = 5 * 60 * 1000; // 5 minutes idle TTL (default)
|
|
23
|
+
const MAX_SESSION_TTL_MS = 60 * 60 * 1000; // 60 minutes (persist / max)
|
|
24
|
+
const MIN_SESSION_TTL_MS = 1 * 60 * 1000; // 1 minute minimum
|
|
21
25
|
const MAX_SESSIONS_PER_USER = 3; // prevent abuse
|
|
22
26
|
// Cleanup expired sessions every minute
|
|
23
27
|
const _cleanupInterval = setInterval(() => {
|
|
24
28
|
const now = Date.now();
|
|
25
29
|
for (const [id, session] of sessions) {
|
|
26
|
-
if (now - session.lastUsedAt >
|
|
30
|
+
if (now - session.lastUsedAt > session.ttlMs) {
|
|
27
31
|
session.browser.close().catch(() => { });
|
|
28
32
|
sessions.delete(id);
|
|
29
33
|
}
|
|
@@ -73,7 +77,18 @@ function extractReadableText(html, url) {
|
|
|
73
77
|
// ── Router ────────────────────────────────────────────────────────────────────
|
|
74
78
|
export function createSessionRouter() {
|
|
75
79
|
const router = Router();
|
|
76
|
-
|
|
80
|
+
/**
|
|
81
|
+
* POST /v1/session — create a stateful browser session
|
|
82
|
+
*
|
|
83
|
+
* Body params:
|
|
84
|
+
* url? {string} Initial URL to navigate to (optional).
|
|
85
|
+
* ttl? {number} Session idle TTL in minutes (1–60, default 5).
|
|
86
|
+
* Timer resets on every request that touches the session.
|
|
87
|
+
* persist? {boolean} Shorthand for ttl=60. Enables long-lived sessions
|
|
88
|
+
* for login flows where cookies must persist.
|
|
89
|
+
*
|
|
90
|
+
* Returns: { sessionId, currentUrl, expiresAt, ttlMinutes }
|
|
91
|
+
*/
|
|
77
92
|
router.post('/v1/session', async (req, res) => {
|
|
78
93
|
const ownerId = getOwnerId(req);
|
|
79
94
|
if (!ownerId) {
|
|
@@ -95,7 +110,15 @@ export function createSessionRouter() {
|
|
|
95
110
|
});
|
|
96
111
|
return;
|
|
97
112
|
}
|
|
98
|
-
const { url } = req.body;
|
|
113
|
+
const { url, ttl, persist } = req.body;
|
|
114
|
+
// Resolve TTL: persist=true → 60 min max, ttl overrides default, clamp to [1, 60] min
|
|
115
|
+
let ttlMs = DEFAULT_SESSION_TTL_MS;
|
|
116
|
+
if (persist) {
|
|
117
|
+
ttlMs = MAX_SESSION_TTL_MS;
|
|
118
|
+
}
|
|
119
|
+
else if (typeof ttl === 'number') {
|
|
120
|
+
ttlMs = Math.min(MAX_SESSION_TTL_MS, Math.max(MIN_SESSION_TTL_MS, ttl * 60 * 1000));
|
|
121
|
+
}
|
|
99
122
|
let browser = null;
|
|
100
123
|
try {
|
|
101
124
|
browser = await launchBrowser();
|
|
@@ -137,11 +160,13 @@ export function createSessionRouter() {
|
|
|
137
160
|
createdAt: now,
|
|
138
161
|
lastUsedAt: now,
|
|
139
162
|
currentUrl: page.url(),
|
|
163
|
+
ttlMs,
|
|
140
164
|
});
|
|
141
165
|
res.status(201).json({
|
|
142
166
|
sessionId: id,
|
|
143
167
|
currentUrl: page.url(),
|
|
144
|
-
expiresAt: new Date(now +
|
|
168
|
+
expiresAt: new Date(now + ttlMs).toISOString(),
|
|
169
|
+
ttlMinutes: ttlMs / 60_000,
|
|
145
170
|
});
|
|
146
171
|
}
|
|
147
172
|
catch (err) {
|
|
@@ -188,7 +213,8 @@ export function createSessionRouter() {
|
|
|
188
213
|
currentUrl: session.page.url(),
|
|
189
214
|
title,
|
|
190
215
|
content,
|
|
191
|
-
expiresAt: new Date(session.lastUsedAt +
|
|
216
|
+
expiresAt: new Date(session.lastUsedAt + session.ttlMs).toISOString(),
|
|
217
|
+
ttlMinutes: session.ttlMs / 60_000,
|
|
192
218
|
});
|
|
193
219
|
}
|
|
194
220
|
catch (err) {
|
|
@@ -242,7 +268,8 @@ export function createSessionRouter() {
|
|
|
242
268
|
res.json({
|
|
243
269
|
currentUrl: session.page.url(),
|
|
244
270
|
title: await session.page.title(),
|
|
245
|
-
expiresAt: new Date(session.lastUsedAt +
|
|
271
|
+
expiresAt: new Date(session.lastUsedAt + session.ttlMs).toISOString(),
|
|
272
|
+
ttlMinutes: session.ttlMs / 60_000,
|
|
246
273
|
});
|
|
247
274
|
}
|
|
248
275
|
catch (err) {
|
|
@@ -327,7 +354,8 @@ export function createSessionRouter() {
|
|
|
327
354
|
title,
|
|
328
355
|
screenshot,
|
|
329
356
|
actionsExecuted: normalizedActions.length,
|
|
330
|
-
expiresAt: new Date(session.lastUsedAt +
|
|
357
|
+
expiresAt: new Date(session.lastUsedAt + session.ttlMs).toISOString(),
|
|
358
|
+
ttlMinutes: session.ttlMs / 60_000,
|
|
331
359
|
});
|
|
332
360
|
}
|
|
333
361
|
catch (err) {
|
|
@@ -367,6 +395,7 @@ export function createSessionRouter() {
|
|
|
367
395
|
session.lastUsedAt = Date.now();
|
|
368
396
|
res.setHeader('Content-Type', 'image/png');
|
|
369
397
|
res.setHeader('Cache-Control', 'no-store');
|
|
398
|
+
res.setHeader('X-Session-Expires-At', new Date(session.lastUsedAt + session.ttlMs).toISOString());
|
|
370
399
|
res.send(buf);
|
|
371
400
|
}
|
|
372
401
|
catch (err) {
|
|
@@ -382,6 +411,128 @@ export function createSessionRouter() {
|
|
|
382
411
|
});
|
|
383
412
|
}
|
|
384
413
|
});
|
|
414
|
+
/**
|
|
415
|
+
* GET /v1/session/:id/cookies — export all cookies from the session's browser context
|
|
416
|
+
*
|
|
417
|
+
* Returns: { sessionId, cookies: Cookie[], count: number, expiresAt: string }
|
|
418
|
+
*
|
|
419
|
+
* Each cookie follows the Playwright Cookie shape:
|
|
420
|
+
* { name, value, domain, path, expires, httpOnly, secure, sameSite }
|
|
421
|
+
*
|
|
422
|
+
* Use this to snapshot cookies after a login flow, then re-inject them later
|
|
423
|
+
* via POST /v1/session/:id/cookies to skip re-authentication.
|
|
424
|
+
*/
|
|
425
|
+
router.get('/v1/session/:id/cookies', async (req, res) => {
|
|
426
|
+
const ownerId = getOwnerId(req);
|
|
427
|
+
const session = getSession(req.params['id'], ownerId);
|
|
428
|
+
if (!session) {
|
|
429
|
+
res.status(404).json({
|
|
430
|
+
success: false,
|
|
431
|
+
error: {
|
|
432
|
+
type: 'session_not_found',
|
|
433
|
+
message: 'Session not found or has expired.',
|
|
434
|
+
hint: 'Create a new session via POST /v1/session.',
|
|
435
|
+
docs: 'https://webpeel.dev/docs/errors#session-not-found',
|
|
436
|
+
},
|
|
437
|
+
requestId: req.requestId || randomUUID(),
|
|
438
|
+
});
|
|
439
|
+
return;
|
|
440
|
+
}
|
|
441
|
+
try {
|
|
442
|
+
// Playwright context.cookies() returns all cookies for all URLs by default
|
|
443
|
+
const cookies = await session.context.cookies();
|
|
444
|
+
session.lastUsedAt = Date.now();
|
|
445
|
+
res.json({
|
|
446
|
+
sessionId: session.id,
|
|
447
|
+
cookies,
|
|
448
|
+
count: cookies.length,
|
|
449
|
+
expiresAt: new Date(session.lastUsedAt + session.ttlMs).toISOString(),
|
|
450
|
+
});
|
|
451
|
+
}
|
|
452
|
+
catch (err) {
|
|
453
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
454
|
+
res.status(500).json({
|
|
455
|
+
success: false,
|
|
456
|
+
error: {
|
|
457
|
+
type: 'cookie_export_failed',
|
|
458
|
+
message: msg,
|
|
459
|
+
docs: 'https://webpeel.dev/docs/errors#cookie-export-failed',
|
|
460
|
+
},
|
|
461
|
+
requestId: req.requestId || randomUUID(),
|
|
462
|
+
});
|
|
463
|
+
}
|
|
464
|
+
});
|
|
465
|
+
/**
|
|
466
|
+
* POST /v1/session/:id/cookies — inject cookies into the session's browser context
|
|
467
|
+
*
|
|
468
|
+
* Body params:
|
|
469
|
+
* cookies {Cookie[]} Array of Playwright-compatible cookie objects.
|
|
470
|
+
* Required fields: name, value, domain (or url).
|
|
471
|
+
* Optional: path, expires, httpOnly, secure, sameSite.
|
|
472
|
+
*
|
|
473
|
+
* Returns: { sessionId, injected: number, expiresAt: string }
|
|
474
|
+
*
|
|
475
|
+
* Typical cookie-persistence workflow:
|
|
476
|
+
* 1. POST /v1/session { url: "https://example.com", persist: true }
|
|
477
|
+
* 2. POST /v1/session/:id/act (complete login flow)
|
|
478
|
+
* 3. GET /v1/session/:id/cookies → save cookies array to your storage
|
|
479
|
+
* 4. Later: POST /v1/session/:id/cookies { cookies: [...] }
|
|
480
|
+
* 5. GET /v1/session/:id → page loads authenticated (no re-login needed)
|
|
481
|
+
*/
|
|
482
|
+
router.post('/v1/session/:id/cookies', async (req, res) => {
|
|
483
|
+
const ownerId = getOwnerId(req);
|
|
484
|
+
const session = getSession(req.params['id'], ownerId);
|
|
485
|
+
if (!session) {
|
|
486
|
+
res.status(404).json({
|
|
487
|
+
success: false,
|
|
488
|
+
error: {
|
|
489
|
+
type: 'session_not_found',
|
|
490
|
+
message: 'Session not found or has expired.',
|
|
491
|
+
hint: 'Create a new session via POST /v1/session.',
|
|
492
|
+
docs: 'https://webpeel.dev/docs/errors#session-not-found',
|
|
493
|
+
},
|
|
494
|
+
requestId: req.requestId || randomUUID(),
|
|
495
|
+
});
|
|
496
|
+
return;
|
|
497
|
+
}
|
|
498
|
+
const { cookies } = req.body;
|
|
499
|
+
if (!Array.isArray(cookies) || cookies.length === 0) {
|
|
500
|
+
res.status(400).json({
|
|
501
|
+
success: false,
|
|
502
|
+
error: {
|
|
503
|
+
type: 'bad_request',
|
|
504
|
+
message: '`cookies` must be a non-empty array of cookie objects.',
|
|
505
|
+
hint: 'Pass cookies exported from GET /v1/session/:id/cookies or a compatible Cookie[] array.',
|
|
506
|
+
docs: 'https://webpeel.dev/docs/errors#bad-request',
|
|
507
|
+
},
|
|
508
|
+
requestId: req.requestId || randomUUID(),
|
|
509
|
+
});
|
|
510
|
+
return;
|
|
511
|
+
}
|
|
512
|
+
try {
|
|
513
|
+
// Playwright's addCookies validates the shape internally; invalid cookies will throw
|
|
514
|
+
await session.context.addCookies(cookies);
|
|
515
|
+
session.lastUsedAt = Date.now();
|
|
516
|
+
res.json({
|
|
517
|
+
sessionId: session.id,
|
|
518
|
+
injected: cookies.length,
|
|
519
|
+
expiresAt: new Date(session.lastUsedAt + session.ttlMs).toISOString(),
|
|
520
|
+
});
|
|
521
|
+
}
|
|
522
|
+
catch (err) {
|
|
523
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
524
|
+
res.status(400).json({
|
|
525
|
+
success: false,
|
|
526
|
+
error: {
|
|
527
|
+
type: 'cookie_inject_failed',
|
|
528
|
+
message: msg,
|
|
529
|
+
hint: 'Ensure each cookie has at minimum: name, value, and domain (or url).',
|
|
530
|
+
docs: 'https://webpeel.dev/docs/errors#cookie-inject-failed',
|
|
531
|
+
},
|
|
532
|
+
requestId: req.requestId || randomUUID(),
|
|
533
|
+
});
|
|
534
|
+
}
|
|
535
|
+
});
|
|
385
536
|
// ── DELETE /v1/session/:id ───────────────────────────────────────────────────
|
|
386
537
|
router.delete('/v1/session/:id', async (req, res) => {
|
|
387
538
|
const ownerId = getOwnerId(req);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.20.5",
|
|
3
|
+
"version": "0.20.7",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|