webpeel 0.20.6 → 0.20.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/auth.js +30 -5
- package/dist/cli/commands/interact.js +1 -0
- package/dist/core/domain-extractors.js +315 -75
- package/dist/core/profiles.d.ts +15 -1
- package/dist/core/profiles.js +137 -2
- package/dist/server/routes/agent.d.ts +1 -0
- package/dist/server/routes/agent.js +98 -5
- package/dist/server/routes/crawl.js +74 -1
- package/package.json +1 -1
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
*/
|
|
4
4
|
import { handleLogin, handleLogout, handleUsage, loadConfig, saveConfig } from '../../cli-auth.js';
|
|
5
5
|
import { clearCache, cacheStats } from '../../cache.js';
|
|
6
|
+
import { loginToProfile } from '../../core/profiles.js';
|
|
6
7
|
import { cliVersion } from '../utils.js';
|
|
7
8
|
export function registerAuthCommands(program) {
|
|
8
9
|
// ── auth command ──────────────────────────────────────────────────────────
|
|
@@ -254,13 +255,37 @@ export function registerAuthCommands(program) {
|
|
|
254
255
|
console.log(' Try: webpeel "https://news.ycombinator.com" --json');
|
|
255
256
|
});
|
|
256
257
|
// ── login command ─────────────────────────────────────────────────────────
|
|
258
|
+
// Two modes:
|
|
259
|
+
// webpeel login — interactive API key authentication (existing)
|
|
260
|
+
// webpeel login <domain> — browser login: open site, log in, save cookies as profile
|
|
257
261
|
program
|
|
258
|
-
.command('login')
|
|
259
|
-
.description('Authenticate
|
|
260
|
-
.
|
|
262
|
+
.command('login [domain]')
|
|
263
|
+
.description('Authenticate: no args = API key auth; with domain = browser login (saves cookies as a named profile)')
|
|
264
|
+
.option('--profile <name>', 'Profile name to save under (defaults to the domain)')
|
|
265
|
+
.action(async (domain, opts) => {
|
|
261
266
|
try {
|
|
262
|
-
|
|
263
|
-
|
|
267
|
+
if (domain) {
|
|
268
|
+
// ── Browser login mode ──────────────────────────────────────────
|
|
269
|
+
const url = domain.startsWith('http') ? domain : `https://${domain}`;
|
|
270
|
+
// Extract hostname for profile name default (e.g. "instagram.com" from "https://www.instagram.com/")
|
|
271
|
+
let defaultProfileName;
|
|
272
|
+
try {
|
|
273
|
+
const hostname = new URL(url).hostname;
|
|
274
|
+
// Strip "www." prefix for cleaner profile names
|
|
275
|
+
defaultProfileName = hostname.replace(/^www\./, '');
|
|
276
|
+
}
|
|
277
|
+
catch {
|
|
278
|
+
defaultProfileName = domain;
|
|
279
|
+
}
|
|
280
|
+
const profileName = opts.profile || defaultProfileName;
|
|
281
|
+
await loginToProfile(url, profileName);
|
|
282
|
+
process.exit(0);
|
|
283
|
+
}
|
|
284
|
+
else {
|
|
285
|
+
// ── API key auth mode (original behavior) ───────────────────────
|
|
286
|
+
await handleLogin();
|
|
287
|
+
process.exit(0);
|
|
288
|
+
}
|
|
264
289
|
}
|
|
265
290
|
catch (error) {
|
|
266
291
|
console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
@@ -310,6 +310,7 @@ export function registerInteractCommands(program) {
|
|
|
310
310
|
.option('--schema <json>', 'Schema template name (e.g. product, article) or JSON schema for structured output')
|
|
311
311
|
.option('-s, --silent', 'Silent mode (no spinner)')
|
|
312
312
|
.option('--json', 'Output as JSON')
|
|
313
|
+
.option('--stream', 'Stream progress via SSE (calls API endpoint, requires API key)')
|
|
313
314
|
.action(async (prompt, options) => {
|
|
314
315
|
const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
|
|
315
316
|
const urls = options.urls ? options.urls.split(',').map((u) => u.trim()) : undefined;
|
|
@@ -324,19 +324,87 @@ async function twitterExtractor(html, url) {
|
|
|
324
324
|
source: 'fxtwitter',
|
|
325
325
|
};
|
|
326
326
|
// Try to fetch recent tweets from Twitter's public syndication endpoint
|
|
327
|
+
// NOTE: simpleFetch sends too many Sec-* headers that trigger 429. Use https directly.
|
|
327
328
|
let recentTweets = '';
|
|
328
329
|
try {
|
|
329
|
-
const
|
|
330
|
-
const
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
330
|
+
const { default: httpsModule } = await import('https');
|
|
331
|
+
const syndicationHtml = await new Promise((resolve, reject) => {
|
|
332
|
+
const req = httpsModule.request({
|
|
333
|
+
hostname: 'syndication.twitter.com',
|
|
334
|
+
path: `/srv/timeline-profile/screen-name/${u.screen_name}`,
|
|
335
|
+
method: 'GET',
|
|
336
|
+
headers: {
|
|
337
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
338
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
339
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
340
|
+
},
|
|
341
|
+
}, (res) => {
|
|
342
|
+
if (res.statusCode !== 200) {
|
|
343
|
+
reject(new Error(`HTTP ${res.statusCode}`));
|
|
344
|
+
res.resume();
|
|
345
|
+
return;
|
|
346
|
+
}
|
|
347
|
+
let body = '';
|
|
348
|
+
res.on('data', (chunk) => body += chunk.toString());
|
|
349
|
+
res.on('end', () => resolve(body));
|
|
350
|
+
});
|
|
351
|
+
req.on('error', reject);
|
|
352
|
+
setTimeout(() => req.destroy(new Error('timeout')), 12000);
|
|
353
|
+
req.end();
|
|
354
|
+
});
|
|
355
|
+
if (syndicationHtml) {
|
|
356
|
+
// Parse __NEXT_DATA__ JSON from the syndication page for rich tweet data
|
|
357
|
+
const nextDataMatch = syndicationHtml.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
|
|
358
|
+
if (nextDataMatch) {
|
|
359
|
+
const nextData = tryParseJson(nextDataMatch[1]);
|
|
360
|
+
const entries = nextData?.props?.pageProps?.timeline?.entries || [];
|
|
361
|
+
const tweetSections = [];
|
|
362
|
+
for (const entry of entries) {
|
|
363
|
+
if (tweetSections.length >= 8)
|
|
364
|
+
break;
|
|
365
|
+
const tweet = entry?.content?.tweet;
|
|
366
|
+
if (!tweet?.full_text)
|
|
367
|
+
continue;
|
|
368
|
+
const text = tweet.full_text.replace(/\\n/g, '\n').replace(/\\"/g, '"').trim();
|
|
369
|
+
// Skip retweets and pure-URL-only tweets without media
|
|
370
|
+
if (text.startsWith('RT @'))
|
|
371
|
+
continue;
|
|
372
|
+
const media = tweet.extended_entities?.media || tweet.entities?.media || [];
|
|
373
|
+
const isUrlOnly = /^https?:\/\/t\.co\/\S+$/.test(text.trim()) || /^https?:\/\/t\.co\/\S+\s*$/.test(text.trim());
|
|
374
|
+
if (isUrlOnly && media.length === 0)
|
|
375
|
+
continue;
|
|
376
|
+
// Format date
|
|
377
|
+
const dateStr = tweet.created_at ? (() => {
|
|
378
|
+
try {
|
|
379
|
+
return new Date(tweet.created_at).toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' });
|
|
380
|
+
}
|
|
381
|
+
catch {
|
|
382
|
+
return tweet.created_at;
|
|
383
|
+
}
|
|
384
|
+
})() : '';
|
|
385
|
+
const likes = tweet.favorite_count ?? 0;
|
|
386
|
+
const retweets = tweet.retweet_count ?? 0;
|
|
387
|
+
const replies = tweet.reply_count ?? 0;
|
|
388
|
+
const fmtNum = (n) => n >= 1000000 ? (n / 1000000).toFixed(1) + 'M' : n >= 1000 ? (n / 1000).toFixed(1) + 'K' : String(n);
|
|
389
|
+
const mediaLine = media.length > 0 ? `\n📷 ${media.map((m) => m.media_url_https || m.media_url).filter(Boolean).join(', ')}` : '';
|
|
390
|
+
// Clean t.co URLs from text when they have real media
|
|
391
|
+
const cleanText = media.length > 0 ? text.replace(/https?:\/\/t\.co\/\S+/g, '').trim() : text;
|
|
392
|
+
tweetSections.push(`### ${dateStr}\n${cleanText}${mediaLine}\n♻️ ${fmtNum(retweets)} | ❤️ ${fmtNum(likes)} | 💬 ${fmtNum(replies)}`);
|
|
393
|
+
}
|
|
394
|
+
if (tweetSections.length > 0) {
|
|
395
|
+
recentTweets = '\n\n## Recent Tweets\n\n' + tweetSections.join('\n\n---\n\n');
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
else {
|
|
399
|
+
// Fallback: simple regex extraction without metrics
|
|
400
|
+
const tweetMatches = [...syndicationHtml.matchAll(/"full_text":"((?:[^"\\]|\\.)*)"/g)];
|
|
401
|
+
const tweets = tweetMatches
|
|
402
|
+
.slice(0, 5)
|
|
403
|
+
.map(m => m[1].replace(/\\n/g, ' ').replace(/\\"/g, '"').trim())
|
|
404
|
+
.filter(t => t.length > 10 && !t.startsWith('RT @'));
|
|
405
|
+
if (tweets.length > 0) {
|
|
406
|
+
recentTweets = '\n\n## Recent Tweets\n\n' + tweets.map(t => `> ${t}`).join('\n\n');
|
|
407
|
+
}
|
|
340
408
|
}
|
|
341
409
|
}
|
|
342
410
|
}
|
|
@@ -344,7 +412,7 @@ async function twitterExtractor(html, url) {
|
|
|
344
412
|
const websiteLine = structured.website ? `\n🌐 ${structured.website}` : '';
|
|
345
413
|
const joinedLine = structured.created ? `\n📅 Joined: ${structured.created}` : '';
|
|
346
414
|
const likesLine = structured.likes ? ` | ❤️ Likes: ${structured.likes?.toLocaleString() || 0}` : '';
|
|
347
|
-
const cleanContent =
|
|
415
|
+
const cleanContent = `# @${(structured.handle || '').replace('@', '')} on X/Twitter\n\n**${structured.name}**${structured.verified ? ' ✓' : ''}\n\n${structured.bio || ''}\n\n📍 ${structured.location || 'N/A'}${websiteLine}${joinedLine}\n👥 Followers: ${structured.followers?.toLocaleString() || 0} | Following: ${structured.following?.toLocaleString() || 0} | Tweets: ${structured.tweets?.toLocaleString() || 0}${likesLine}${recentTweets}`;
|
|
348
416
|
return { domain, type: 'profile', structured, cleanContent };
|
|
349
417
|
}
|
|
350
418
|
}
|
|
@@ -1961,7 +2029,16 @@ async function linkedinExtractor(html, url) {
|
|
|
1961
2029
|
try {
|
|
1962
2030
|
const { load } = await import('cheerio');
|
|
1963
2031
|
const $ = load(html);
|
|
1964
|
-
//
|
|
2032
|
+
// Detect page type from URL first
|
|
2033
|
+
const urlObj = new URL(url);
|
|
2034
|
+
const pathParts = urlObj.pathname.split('/').filter(Boolean);
|
|
2035
|
+
const pageType = pathParts[0] === 'company' ? 'company'
|
|
2036
|
+
: pathParts[0] === 'in' ? 'profile'
|
|
2037
|
+
: pathParts[0] === 'jobs' ? 'job'
|
|
2038
|
+
: 'page';
|
|
2039
|
+
// Detect if we're on the authwall (LinkedIn redirects unauthenticated requests)
|
|
2040
|
+
const isAuthwall = html.includes('authwall') || html.includes('Join LinkedIn') || html.includes('Sign in') && !html.includes('linkedin.com/in/');
|
|
2041
|
+
// --- Try parsing meta tags / JSON-LD from the HTML ---
|
|
1965
2042
|
let jsonLd = null;
|
|
1966
2043
|
$('script[type="application/ld+json"]').each((_, el) => {
|
|
1967
2044
|
if (jsonLd)
|
|
@@ -1974,30 +2051,83 @@ async function linkedinExtractor(html, url) {
|
|
|
1974
2051
|
const ogTitle = $('meta[property="og:title"]').attr('content') || '';
|
|
1975
2052
|
const ogDescription = $('meta[property="og:description"]').attr('content') || '';
|
|
1976
2053
|
const ogImage = $('meta[property="og:image"]').attr('content') || '';
|
|
1977
|
-
const
|
|
2054
|
+
const metaDescription = $('meta[name="description"]').attr('content') || '';
|
|
2055
|
+
let name = jsonLd?.name || ogTitle.replace(/ \| LinkedIn$/, '').replace(/Sign Up \| LinkedIn$/, '').trim() || '';
|
|
2056
|
+
// When on authwall, discard authwall-specific meta data
|
|
2057
|
+
let headline = isAuthwall ? (jsonLd?.jobTitle || '') : (jsonLd?.jobTitle || metaDescription?.split('|')?.[0]?.trim() || ogDescription || '');
|
|
2058
|
+
let description = isAuthwall ? (jsonLd?.description || '') : (jsonLd?.description || ogDescription || '');
|
|
2059
|
+
let location = $('[class*="location"]').first().text().trim() || jsonLd?.address?.addressLocality || '';
|
|
2060
|
+
// --- If authwall or no useful data, try direct HTTPS fetch with minimal headers ---
|
|
2061
|
+
// LinkedIn returns rich og: meta tags when fetched with a plain browser UA (no Sec-Fetch-* noise)
|
|
2062
|
+
if (!name || isAuthwall || name.toLowerCase().includes('sign up') || name.toLowerCase().includes('linkedin')) {
|
|
2063
|
+
try {
|
|
2064
|
+
const { default: httpsLI } = await import('https');
|
|
2065
|
+
const { gunzip } = await import('zlib');
|
|
2066
|
+
const linkedInHtml = await new Promise((resolve, reject) => {
|
|
2067
|
+
const req = httpsLI.request({
|
|
2068
|
+
hostname: 'www.linkedin.com',
|
|
2069
|
+
path: urlObj.pathname,
|
|
2070
|
+
method: 'GET',
|
|
2071
|
+
headers: {
|
|
2072
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
2073
|
+
'Accept': 'text/html,application/xhtml+xml',
|
|
2074
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
2075
|
+
'Accept-Encoding': 'gzip, deflate',
|
|
2076
|
+
},
|
|
2077
|
+
}, (res) => {
|
|
2078
|
+
if (res.statusCode && res.statusCode >= 400) {
|
|
2079
|
+
reject(new Error(`HTTP ${res.statusCode}`));
|
|
2080
|
+
res.resume();
|
|
2081
|
+
return;
|
|
2082
|
+
}
|
|
2083
|
+
const chunks = [];
|
|
2084
|
+
res.on('data', (chunk) => chunks.push(chunk));
|
|
2085
|
+
res.on('end', () => {
|
|
2086
|
+
const buf = Buffer.concat(chunks);
|
|
2087
|
+
const enc = res.headers['content-encoding'] || '';
|
|
2088
|
+
if (enc === 'gzip') {
|
|
2089
|
+
gunzip(buf, (err, decoded) => err ? reject(err) : resolve(decoded.toString('utf8')));
|
|
2090
|
+
}
|
|
2091
|
+
else {
|
|
2092
|
+
resolve(buf.toString('utf8'));
|
|
2093
|
+
}
|
|
2094
|
+
});
|
|
2095
|
+
});
|
|
2096
|
+
req.on('error', reject);
|
|
2097
|
+
setTimeout(() => req.destroy(new Error('timeout')), 10000);
|
|
2098
|
+
req.end();
|
|
2099
|
+
});
|
|
2100
|
+
if (linkedInHtml) {
|
|
2101
|
+
const $li = load(linkedInHtml);
|
|
2102
|
+
const liOgTitle = $li('meta[property="og:title"]').attr('content') || '';
|
|
2103
|
+
const liOgDesc = $li('meta[property="og:description"]').attr('content') || '';
|
|
2104
|
+
// Only use if it has real profile data (not authwall)
|
|
2105
|
+
if (liOgTitle && !liOgTitle.toLowerCase().includes('sign up') && !liOgTitle.toLowerCase().includes('join linkedin')) {
|
|
2106
|
+
// "Name - Headline | LinkedIn" or "Name | LinkedIn"
|
|
2107
|
+
const titleParts = liOgTitle.replace(/ \| LinkedIn$/, '').split(/\s*[-–]\s*/);
|
|
2108
|
+
if (titleParts[0])
|
|
2109
|
+
name = titleParts[0].trim();
|
|
2110
|
+
if (titleParts[1])
|
|
2111
|
+
headline = titleParts[1].trim();
|
|
2112
|
+
if (liOgDesc)
|
|
2113
|
+
description = liOgDesc;
|
|
2114
|
+
}
|
|
2115
|
+
}
|
|
2116
|
+
}
|
|
2117
|
+
catch { /* direct fetch optional */ }
|
|
2118
|
+
}
|
|
1978
2119
|
if (!name)
|
|
1979
2120
|
return null;
|
|
1980
|
-
const headline = jsonLd?.jobTitle ||
|
|
1981
|
-
$('meta[name="description"]').attr('content')?.split('|')?.[0]?.trim() ||
|
|
1982
|
-
ogDescription || '';
|
|
1983
|
-
const description = jsonLd?.description || ogDescription || '';
|
|
1984
|
-
// Try to detect page type from URL
|
|
1985
|
-
const pathParts = new URL(url).pathname.split('/').filter(Boolean);
|
|
1986
|
-
const pageType = pathParts[0] === 'company' ? 'company'
|
|
1987
|
-
: pathParts[0] === 'in' ? 'profile'
|
|
1988
|
-
: pathParts[0] === 'jobs' ? 'job'
|
|
1989
|
-
: 'page';
|
|
1990
|
-
// Extract any visible structured info from the HTML
|
|
1991
|
-
const location = $('[class*="location"]').first().text().trim() ||
|
|
1992
|
-
jsonLd?.address?.addressLocality || '';
|
|
1993
2121
|
const structured = {
|
|
1994
2122
|
name, headline, description, location, pageType,
|
|
1995
2123
|
image: ogImage, url,
|
|
1996
2124
|
};
|
|
1997
2125
|
const typeLine = pageType === 'company' ? '🏢' : pageType === 'profile' ? '👤' : '🔗';
|
|
1998
2126
|
const locationLine = location ? `\n📍 ${location}` : '';
|
|
1999
|
-
const headlineLine = headline ? `\n*${headline}*` : '';
|
|
2000
|
-
const
|
|
2127
|
+
const headlineLine = headline && headline !== name ? `\n*${headline}*` : '';
|
|
2128
|
+
const descriptionLine = description ? `\n\n${description}` : '';
|
|
2129
|
+
const authNote = '\n\n⚠️ Full LinkedIn profiles require authentication. Use /v1/session to log in first.';
|
|
2130
|
+
const cleanContent = `# ${typeLine} ${name} — LinkedIn${headlineLine}${locationLine}${descriptionLine}${authNote}`;
|
|
2001
2131
|
return { domain: 'linkedin.com', type: pageType, structured, cleanContent };
|
|
2002
2132
|
}
|
|
2003
2133
|
catch {
|
|
@@ -2541,8 +2671,73 @@ async function soundcloudExtractor(_html, url) {
|
|
|
2541
2671
|
// 29. Instagram extractor (oEmbed)
|
|
2542
2672
|
// ---------------------------------------------------------------------------
|
|
2543
2673
|
async function instagramExtractor(_html, url) {
|
|
2674
|
+
const pathParts = new URL(url).pathname.split('/').filter(Boolean);
|
|
2675
|
+
const contentType = pathParts[0] === 'p' ? 'post' : pathParts[0] === 'reel' ? 'reel' : pathParts[0] === 'tv' ? 'igtv' : pathParts.length === 1 ? 'profile' : 'post';
|
|
2676
|
+
// --- Profile extraction via Instagram internal API (no auth needed) ---
|
|
2677
|
+
if (contentType === 'profile' && pathParts.length === 1) {
|
|
2678
|
+
const username = pathParts[0];
|
|
2679
|
+
try {
|
|
2680
|
+
const apiUrl = `https://www.instagram.com/api/v1/users/web_profile_info/?username=${encodeURIComponent(username)}`;
|
|
2681
|
+
const igHeaders = {
|
|
2682
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
2683
|
+
'X-IG-App-ID': '936619743392459',
|
|
2684
|
+
'Accept': '*/*',
|
|
2685
|
+
'Referer': 'https://www.instagram.com/',
|
|
2686
|
+
};
|
|
2687
|
+
const apiResult = await simpleFetch(apiUrl, igHeaders['User-Agent'], 12000, igHeaders);
|
|
2688
|
+
const data = tryParseJson(apiResult?.html || '');
|
|
2689
|
+
const user = data?.data?.user;
|
|
2690
|
+
if (user && user.username) {
|
|
2691
|
+
const followers = user.edge_followed_by?.count ?? 0;
|
|
2692
|
+
const following = user.edge_follow?.count ?? 0;
|
|
2693
|
+
const postCount = user.edge_owner_to_timeline_media?.count ?? 0;
|
|
2694
|
+
const fmtNum = (n) => n >= 1000000 ? (n / 1000000).toFixed(1) + 'M' : n >= 1000 ? (n / 1000).toFixed(1) + 'K' : String(n);
|
|
2695
|
+
const structured = {
|
|
2696
|
+
username: user.username,
|
|
2697
|
+
fullName: user.full_name || '',
|
|
2698
|
+
bio: user.biography || '',
|
|
2699
|
+
followers,
|
|
2700
|
+
following,
|
|
2701
|
+
posts: postCount,
|
|
2702
|
+
verified: user.is_verified || false,
|
|
2703
|
+
isPrivate: user.is_private || false,
|
|
2704
|
+
profilePic: user.profile_pic_url_hd || user.profile_pic_url || '',
|
|
2705
|
+
externalUrl: user.external_url || (user.bio_links?.[0]?.url) || '',
|
|
2706
|
+
contentType: 'profile',
|
|
2707
|
+
};
|
|
2708
|
+
// Recent posts
|
|
2709
|
+
const edges = user.edge_owner_to_timeline_media?.edges || [];
|
|
2710
|
+
const postSections = [];
|
|
2711
|
+
for (const edge of edges.slice(0, 6)) {
|
|
2712
|
+
const node = edge?.node;
|
|
2713
|
+
if (!node)
|
|
2714
|
+
continue;
|
|
2715
|
+
const caption = node.edge_media_to_caption?.edges?.[0]?.node?.text || '';
|
|
2716
|
+
const likes = node.edge_liked_by?.count ?? node.edge_media_preview_like?.count ?? 0;
|
|
2717
|
+
const comments = node.edge_media_to_comment?.count ?? 0;
|
|
2718
|
+
const isVideo = node.is_video;
|
|
2719
|
+
const mediaType = isVideo ? '🎬' : '📸';
|
|
2720
|
+
const timestamp = node.taken_at_timestamp ? new Date(node.taken_at_timestamp * 1000).toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' }) : '';
|
|
2721
|
+
const imgUrl = node.thumbnail_src || node.display_url || '';
|
|
2722
|
+
const captionSnippet = caption ? caption.slice(0, 150) + (caption.length > 150 ? '…' : '') : '';
|
|
2723
|
+
postSections.push(`### ${mediaType} ${timestamp}\n${captionSnippet}\n❤️ ${fmtNum(likes)} | 💬 ${fmtNum(comments)}${imgUrl ? `\n🖼 ${imgUrl}` : ''}`);
|
|
2724
|
+
}
|
|
2725
|
+
const verifiedBadge = structured.verified ? ' ✓' : '';
|
|
2726
|
+
const privateBadge = structured.isPrivate ? ' 🔒' : '';
|
|
2727
|
+
const bioLine = structured.bio ? `\n\n${structured.bio}` : '';
|
|
2728
|
+
const externalLine = structured.externalUrl ? `\n🌐 ${structured.externalUrl}` : '';
|
|
2729
|
+
const postsSection = postSections.length > 0 ? '\n\n## Recent Posts\n\n' + postSections.join('\n\n---\n\n') : '';
|
|
2730
|
+
const cleanContent = `# @${structured.username} on Instagram${verifiedBadge}${privateBadge}\n\n**${structured.fullName || structured.username}**${bioLine}${externalLine}\n\n👥 ${fmtNum(followers)} Followers | ${fmtNum(following)} Following | ${fmtNum(postCount)} Posts${postsSection}`;
|
|
2731
|
+
return { domain: 'instagram.com', type: 'profile', structured, cleanContent };
|
|
2732
|
+
}
|
|
2733
|
+
}
|
|
2734
|
+
catch (e) {
|
|
2735
|
+
if (process.env.DEBUG)
|
|
2736
|
+
console.debug('[webpeel]', 'Instagram profile API failed:', e instanceof Error ? e.message : e);
|
|
2737
|
+
}
|
|
2738
|
+
}
|
|
2739
|
+
// --- Post/Reel/IGTV: Try oEmbed API ---
|
|
2544
2740
|
try {
|
|
2545
|
-
// Instagram official oEmbed (no access token needed for basic data)
|
|
2546
2741
|
const oembedUrl = `https://graph.facebook.com/v22.0/instagram_oembed?url=${encodeURIComponent(url)}&fields=title,author_name,provider_name,thumbnail_url`;
|
|
2547
2742
|
const data = await fetchJson(oembedUrl);
|
|
2548
2743
|
// Also try noembed.com as fallback
|
|
@@ -2553,8 +2748,6 @@ async function instagramExtractor(_html, url) {
|
|
|
2553
2748
|
}
|
|
2554
2749
|
if (!resolvedData || resolvedData.error)
|
|
2555
2750
|
return null;
|
|
2556
|
-
const pathParts = new URL(url).pathname.split('/').filter(Boolean);
|
|
2557
|
-
const contentType = pathParts[0] === 'p' ? 'post' : pathParts[0] === 'reel' ? 'reel' : pathParts[0] === 'tv' ? 'igtv' : pathParts.length === 1 ? 'profile' : 'post';
|
|
2558
2751
|
const structured = {
|
|
2559
2752
|
title: resolvedData.title || '',
|
|
2560
2753
|
author: resolvedData.author_name || '',
|
|
@@ -2563,7 +2756,7 @@ async function instagramExtractor(_html, url) {
|
|
|
2563
2756
|
contentType,
|
|
2564
2757
|
provider: 'Instagram',
|
|
2565
2758
|
};
|
|
2566
|
-
const typeEmoji = contentType === 'reel' ? '🎬' : contentType === 'post' ? '📸' :
|
|
2759
|
+
const typeEmoji = contentType === 'reel' ? '🎬' : contentType === 'post' ? '📸' : '📱';
|
|
2567
2760
|
const titleText = structured.title || `Instagram ${contentType} by ${structured.author}`;
|
|
2568
2761
|
const cleanContent = `## ${typeEmoji} Instagram ${contentType}: ${titleText}\n\n**Creator:** @${structured.author.replace('@', '')}\n**URL:** ${url}`;
|
|
2569
2762
|
return { domain: 'instagram.com', type: contentType, structured, cleanContent };
|
|
@@ -2575,59 +2768,106 @@ async function instagramExtractor(_html, url) {
|
|
|
2575
2768
|
}
|
|
2576
2769
|
}
|
|
2577
2770
|
// ---------------------------------------------------------------------------
|
|
2578
|
-
// 30. PDF extractor (URL-based detection)
|
|
2771
|
+
// 30. PDF extractor (URL-based detection) — downloads and extracts real text
|
|
2579
2772
|
// ---------------------------------------------------------------------------
|
|
2773
|
+
const PDF_MAX_BYTES = 50 * 1024 * 1024; // 50 MB
|
|
2774
|
+
const PDF_TRUNCATE_CHARS = 100_000;
|
|
2580
2775
|
async function pdfExtractor(_html, url) {
|
|
2581
2776
|
try {
|
|
2582
2777
|
const urlObj = new URL(url);
|
|
2583
2778
|
const filename = urlObj.pathname.split('/').pop() || 'document.pdf';
|
|
2584
2779
|
const hostname = urlObj.hostname;
|
|
2585
|
-
//
|
|
2586
|
-
let
|
|
2587
|
-
let
|
|
2780
|
+
// Download the PDF
|
|
2781
|
+
let buffer;
|
|
2782
|
+
let finalContentType = 'application/pdf';
|
|
2588
2783
|
try {
|
|
2589
|
-
const
|
|
2590
|
-
|
|
2591
|
-
|
|
2592
|
-
await new Promise((resolve) => {
|
|
2593
|
-
const req = client.request(url, { method: 'HEAD', timeout: 5000 }, (res) => {
|
|
2594
|
-
contentType = res.headers['content-type'] || 'application/pdf';
|
|
2595
|
-
contentLength = res.headers['content-length'] || '';
|
|
2596
|
-
resolve();
|
|
2597
|
-
res.resume();
|
|
2598
|
-
});
|
|
2599
|
-
req.on('error', () => resolve());
|
|
2600
|
-
req.on('timeout', () => { req.destroy(); resolve(); });
|
|
2601
|
-
req.end();
|
|
2784
|
+
const response = await fetch(url, {
|
|
2785
|
+
headers: { 'User-Agent': 'Mozilla/5.0 (compatible; WebPeel/1.0)' },
|
|
2786
|
+
signal: AbortSignal.timeout(30000),
|
|
2602
2787
|
});
|
|
2788
|
+
if (!response.ok) {
|
|
2789
|
+
if (process.env.DEBUG)
|
|
2790
|
+
console.debug('[webpeel]', `PDF download failed: HTTP ${response.status}`);
|
|
2791
|
+
return null; // Let the normal pipeline handle it
|
|
2792
|
+
}
|
|
2793
|
+
finalContentType = response.headers.get('content-type') || 'application/pdf';
|
|
2794
|
+
// Verify it's actually a PDF (content-type or URL)
|
|
2795
|
+
const isPdf = finalContentType.toLowerCase().includes('pdf') || /\.pdf(\?|$|#)/i.test(url);
|
|
2796
|
+
if (!isPdf)
|
|
2797
|
+
return null;
|
|
2798
|
+
const arrayBuffer = await response.arrayBuffer();
|
|
2799
|
+
buffer = Buffer.from(arrayBuffer);
|
|
2800
|
+
}
|
|
2801
|
+
catch (downloadErr) {
|
|
2802
|
+
if (process.env.DEBUG)
|
|
2803
|
+
console.debug('[webpeel]', 'PDF download error:', downloadErr instanceof Error ? downloadErr.message : downloadErr);
|
|
2804
|
+
return null; // Let the normal pipeline handle it
|
|
2603
2805
|
}
|
|
2604
|
-
|
|
2605
|
-
|
|
2606
|
-
|
|
2806
|
+
// Size guard
|
|
2807
|
+
if (buffer.length > PDF_MAX_BYTES) {
|
|
2808
|
+
if (process.env.DEBUG)
|
|
2809
|
+
console.debug('[webpeel]', `PDF too large (${buffer.length} bytes), falling back to stub`);
|
|
2607
2810
|
return null;
|
|
2608
|
-
|
|
2609
|
-
|
|
2610
|
-
|
|
2611
|
-
|
|
2612
|
-
|
|
2613
|
-
|
|
2614
|
-
|
|
2615
|
-
|
|
2811
|
+
}
|
|
2812
|
+
// Extract text via pdf-parse
|
|
2813
|
+
const { extractPdf } = await import('./pdf.js');
|
|
2814
|
+
let pdf;
|
|
2815
|
+
try {
|
|
2816
|
+
pdf = await extractPdf(buffer);
|
|
2817
|
+
}
|
|
2818
|
+
catch (parseErr) {
|
|
2819
|
+
if (process.env.DEBUG)
|
|
2820
|
+
console.debug('[webpeel]', 'PDF parse failed:', parseErr instanceof Error ? parseErr.message : parseErr);
|
|
2821
|
+
return null; // Let the normal pipeline handle it
|
|
2822
|
+
}
|
|
2823
|
+
// Normalize whitespace (pdf-parse emits lots of blank lines)
|
|
2824
|
+
let text = (pdf.text || '')
|
|
2825
|
+
.replace(/\r\n/g, '\n')
|
|
2826
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
2827
|
+
.replace(/[ \t]+/g, ' ')
|
|
2828
|
+
.trim();
|
|
2829
|
+
// Truncate very large documents
|
|
2830
|
+
let truncated = false;
|
|
2831
|
+
if (text.length > PDF_TRUNCATE_CHARS) {
|
|
2832
|
+
text = text.slice(0, PDF_TRUNCATE_CHARS);
|
|
2833
|
+
truncated = true;
|
|
2834
|
+
}
|
|
2835
|
+
if (!text) {
|
|
2836
|
+
// Scanned/image-only PDF — return a clear message rather than empty content
|
|
2837
|
+
const emptyNote = `## 📄 ${filename}\n\n*This PDF appears to be a scanned document (image-only). No extractable text was found.*\n\n**Source:** ${url}`;
|
|
2838
|
+
return {
|
|
2839
|
+
domain: hostname,
|
|
2840
|
+
type: 'pdf',
|
|
2841
|
+
structured: { title: filename, url, pages: pdf.pages, contentType: finalContentType },
|
|
2842
|
+
cleanContent: emptyNote,
|
|
2843
|
+
};
|
|
2844
|
+
}
|
|
2845
|
+
// Build markdown output
|
|
2846
|
+
const titleRaw = pdf.metadata?.title || '';
|
|
2847
|
+
const title = titleRaw || filename.replace(/\.pdf$/i, '') || 'PDF Document';
|
|
2848
|
+
const metaParts = [];
|
|
2849
|
+
if (pdf.metadata?.author)
|
|
2850
|
+
metaParts.push(`**Author:** ${pdf.metadata.author}`);
|
|
2851
|
+
if (pdf.pages)
|
|
2852
|
+
metaParts.push(`**Pages:** ${pdf.pages}`);
|
|
2853
|
+
metaParts.push(`**Source:** ${url}`);
|
|
2854
|
+
const header = titleRaw ? `# ${titleRaw}\n\n` : '';
|
|
2855
|
+
const metaBlock = metaParts.join(' | ') + '\n\n';
|
|
2856
|
+
const truncNote = truncated ? '\n\n*[Content truncated — document exceeds 100,000 characters]*' : '';
|
|
2857
|
+
const cleanContent = header + metaBlock + text + truncNote;
|
|
2858
|
+
return {
|
|
2859
|
+
domain: hostname,
|
|
2860
|
+
type: 'pdf',
|
|
2861
|
+
structured: {
|
|
2862
|
+
title,
|
|
2863
|
+
filename,
|
|
2864
|
+
url,
|
|
2865
|
+
pages: pdf.pages,
|
|
2866
|
+
contentType: finalContentType,
|
|
2867
|
+
...pdf.metadata,
|
|
2868
|
+
},
|
|
2869
|
+
cleanContent,
|
|
2616
2870
|
};
|
|
2617
|
-
const sizeStr = fileSizeKb ? ` (${fileSizeKb > 1024 ? (fileSizeKb / 1024).toFixed(1) + ' MB' : fileSizeKb + ' KB'})` : '';
|
|
2618
|
-
const cleanContent = `## 📄 PDF Document: ${filename}
|
|
2619
|
-
|
|
2620
|
-
**URL:** ${url}
|
|
2621
|
-
**Host:** ${hostname}${sizeStr ? `\n**Size:** ${sizeStr}` : ''}
|
|
2622
|
-
|
|
2623
|
-
> **Note:** This is a PDF document. Binary PDF content cannot be directly extracted as text through standard web fetching. To extract the full text, consider:
|
|
2624
|
-
>
|
|
2625
|
-
> 1. Use a dedicated PDF extraction service (e.g., Adobe PDF Extract API, pdfminer, PyMuPDF)
|
|
2626
|
-
> 2. Download the file and process locally with \`pdf-parse\` (Node.js) or \`pdfplumber\` (Python)
|
|
2627
|
-
> 3. For academic PDFs, check if an HTML version is available at the same URL without \`.pdf\`
|
|
2628
|
-
|
|
2629
|
-
**Direct download URL:** ${url}`;
|
|
2630
|
-
return { domain: hostname, type: 'pdf', structured, cleanContent };
|
|
2631
2871
|
}
|
|
2632
2872
|
catch (e) {
|
|
2633
2873
|
if (process.env.DEBUG)
|
package/dist/core/profiles.d.ts
CHANGED
|
@@ -14,7 +14,8 @@ export interface ProfileMetadata {
|
|
|
14
14
|
description?: string;
|
|
15
15
|
}
|
|
16
16
|
/**
|
|
17
|
-
* Valid profile names: letters, digits, hyphens
|
|
17
|
+
* Valid profile names: letters, digits, hyphens, and dots. No spaces or special chars.
|
|
18
|
+
* Dots are allowed so domain names like "instagram.com" work as profile names.
|
|
18
19
|
*/
|
|
19
20
|
export declare function isValidProfileName(name: string): boolean;
|
|
20
21
|
/**
|
|
@@ -45,3 +46,16 @@ export declare function deleteProfile(name: string): boolean;
|
|
|
45
46
|
* 3. On browser close or Ctrl+C, captures storage state and saves the profile
|
|
46
47
|
*/
|
|
47
48
|
export declare function createProfile(name: string, description?: string): Promise<void>;
|
|
49
|
+
/**
|
|
50
|
+
* Open a headed browser, navigate to `url`, and wait for the user to log in.
|
|
51
|
+
* Pressing Enter (or closing the browser) saves the session as a named profile.
|
|
52
|
+
*
|
|
53
|
+
* Unlike `createProfile()` (which opens to about:blank and waits for browser close),
|
|
54
|
+
* this function:
|
|
55
|
+
* 1. Navigates directly to the given URL on launch
|
|
56
|
+
* 2. Waits for the user to press Enter (or close the browser) to save
|
|
57
|
+
* 3. Saves storage state AND creates metadata under ~/.webpeel/profiles/<name>/
|
|
58
|
+
*
|
|
59
|
+
* Profile names may contain letters, digits, hyphens, and dots (e.g. "instagram.com").
|
|
60
|
+
*/
|
|
61
|
+
export declare function loginToProfile(url: string, profileName: string, description?: string): Promise<void>;
|
package/dist/core/profiles.js
CHANGED
|
@@ -19,10 +19,11 @@ function ensureProfilesDir() {
|
|
|
19
19
|
}
|
|
20
20
|
// ─── Name validation ─────────────────────────────────────────────────────────
|
|
21
21
|
/**
|
|
22
|
-
* Valid profile names: letters, digits, hyphens
|
|
22
|
+
* Valid profile names: letters, digits, hyphens, and dots. No spaces or special chars.
|
|
23
|
+
* Dots are allowed so domain names like "instagram.com" work as profile names.
|
|
23
24
|
*/
|
|
24
25
|
export function isValidProfileName(name) {
|
|
25
|
-
return /^[a-zA-Z0-9
|
|
26
|
+
return /^[a-zA-Z0-9\-.]+$/.test(name) && name.length > 0 && name.length <= 64;
|
|
26
27
|
}
|
|
27
28
|
// ─── Core helpers ─────────────────────────────────────────────────────────────
|
|
28
29
|
/**
|
|
@@ -213,3 +214,137 @@ export async function createProfile(name, description) {
|
|
|
213
214
|
});
|
|
214
215
|
});
|
|
215
216
|
}
|
|
217
|
+
// ─── Browser-based login helper ───────────────────────────────────────────────
|
|
218
|
+
/**
 * Open a headed browser, navigate to `url`, and wait for the user to log in.
 * Pressing Enter, closing the browser, or pressing Ctrl+C triggers a save of
 * the session as a named profile.
 *
 * Steps:
 *  1. Validates `profileName` and creates `<PROFILES_DIR>/<profileName>/`.
 *  2. Launches headed Chromium and navigates to `url` (a navigation failure is
 *     non-fatal; the user can navigate manually).
 *  3. Waits for the first of: a line on stdin, the browser's `disconnected`
 *     event, or SIGINT.
 *  4. Writes `storage-state.json` and `metadata.json` into the profile
 *     directory, then closes the browser. When `metadata.json` already existed,
 *     its fields are preserved and only `name`, `lastUsed`, `domains` (and
 *     `description`, if given) are overwritten.
 *
 * A failure while saving is logged as a warning and does not reject; for a new
 * profile the partially created directory is removed.
 *
 * Profile names may contain letters, digits, hyphens, and dots (e.g. "instagram.com").
 *
 * @param {string} url - Page to open for the login.
 * @param {string} profileName - Name of the profile directory to save into.
 * @param {string} [description] - Optional description stored in the metadata.
 * @returns {Promise<void>} Resolves once the save attempt and browser shutdown finish.
 * @throws {Error} If `profileName` is invalid, or if the browser, context, or
 *   page cannot be created.
 */
export async function loginToProfile(url, profileName, description) {
    if (!isValidProfileName(profileName)) {
        throw new Error(`Invalid profile name "${profileName}". Use only letters, numbers, hyphens, and dots (no spaces).`);
    }
    ensureProfilesDir();
    const profileDir = path.join(PROFILES_DIR, profileName);
    const metadataPath = path.join(profileDir, 'metadata.json');
    const isUpdate = existsSync(profileDir) && existsSync(metadataPath);
    mkdirSync(profileDir, { recursive: true });
    // Best-effort removal of the directory for a profile that never got saved.
    // Existing profiles (isUpdate) are never touched.
    const discardNewProfileDir = () => {
        if (isUpdate) {
            return;
        }
        try {
            rmSync(profileDir, { recursive: true, force: true });
        }
        catch {
            // ignore cleanup errors
        }
    };
    let browser;
    let context;
    let page;
    try {
        browser = await chromium.launch({ headless: false });
        context = await browser.newContext();
        page = await context.newPage();
    }
    catch (e) {
        // Setup failed part-way: don't leave a stray browser or an empty profile dir behind.
        if (browser) {
            try {
                await browser.close();
            }
            catch {
                // ignore close errors
            }
        }
        discardNewProfileDir();
        throw e;
    }
    try {
        await page.goto(url);
    }
    catch (e) {
        // Non-fatal — browser is open, user can navigate manually
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'initial navigation error:', e instanceof Error ? e.message : e);
    }
    console.log('');
    console.log('╔══════════════════════════════════════════════════════╗');
    console.log(`║ WebPeel Browser Login`);
    console.log(`║ URL: ${url}`);
    console.log(`║ Profile: ${profileName}`);
    console.log('║ ║');
    console.log('║ Log in, then press Enter here to save your session. ║');
    console.log('║ (Or close the browser window — same effect.) ║');
    console.log('╚══════════════════════════════════════════════════════╝');
    console.log('');
    let saved = false;
    // Capture the storage state, write profile files, and shut the browser down.
    // Runs at most once; never rejects for save or close failures.
    const saveAndClose = async () => {
        if (saved)
            return;
        saved = true;
        console.log('\nCapturing browser session...');
        try {
            // NOTE(review): when this runs because the browser already disconnected,
            // storageState() presumably fails and lands in the catch below — confirm
            // that "close the browser window" really saves the session.
            const storageState = await context.storageState();
            writeFileSync(path.join(profileDir, 'storage-state.json'), JSON.stringify(storageState, null, 2));
            // Extract unique domains from cookies (strip leading dot)
            const domains = [
                ...new Set((storageState.cookies ?? [])
                    .map((c) => (c.domain ?? '').replace(/^\./, ''))
                    .filter(Boolean)),
            ];
            const now = new Date().toISOString();
            let meta;
            if (isUpdate) {
                // Preserve existing fields (notably the original creation date) on update;
                // an unreadable metadata file is treated as empty.
                let previous = {};
                try {
                    previous = JSON.parse(readFileSync(metadataPath, 'utf-8'));
                }
                catch {
                    previous = {};
                }
                meta = {
                    ...previous,
                    name: profileName,
                    lastUsed: now,
                    domains,
                    ...(description ? { description } : {}),
                };
            }
            else {
                meta = {
                    name: profileName,
                    created: now,
                    lastUsed: now,
                    domains,
                    ...(description ? { description } : {}),
                };
            }
            writeFileSync(metadataPath, JSON.stringify(meta, null, 2));
            console.log(`✅ Profile "${profileName}" ${isUpdate ? 'updated' : 'saved'}!`);
            if (domains.length > 0) {
                console.log(` Domains: ${domains.join(', ')}`);
            }
            else {
                console.log(' No login sessions detected (no cookies captured).');
                console.log(' Make sure you completed the login before pressing Enter.');
            }
            console.log('');
            console.log(` Use with: webpeel "${url}" --profile ${profileName}`);
        }
        catch (e) {
            console.error('Warning: Failed to save storage state:', e instanceof Error ? e.message : String(e));
            // Clean up partial directory if this was a new profile
            discardNewProfileDir();
        }
        try {
            await browser.close();
        }
        catch {
            // ignore close errors
        }
    };
    // Three ways to save: Enter key, browser close, or Ctrl+C
    await new Promise((resolve) => {
        let resolved = false;
        const onTrigger = () => {
            void done();
        };
        const done = async () => {
            if (resolved)
                return;
            resolved = true;
            // Detach every trigger so nothing fires a second time, and pause stdin so
            // the resumed stream does not keep the process alive after we return.
            process.stdin.removeListener('data', onTrigger);
            process.stdin.pause();
            process.removeListener('SIGINT', onTrigger);
            browser.removeListener('disconnected', onTrigger);
            await saveAndClose();
            resolve();
        };
        // Wait for Enter key on stdin (line mode, so input arrives on Enter)
        if (process.stdin.isTTY) {
            process.stdin.setRawMode(false);
        }
        process.stdin.resume();
        process.stdin.once('data', onTrigger);
        // Browser closed by user
        browser.on('disconnected', onTrigger);
        // Ctrl+C
        process.once('SIGINT', onTrigger);
    });
}
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
* Returns: { success, data|answer, sources, method, elapsed, tokensUsed }
|
|
15
15
|
*
|
|
16
16
|
* Webhook support: pass `webhook` URL to get async delivery with HMAC-SHA256 signing.
|
|
17
|
+
* Streaming support: pass `stream: true` to get SSE events instead of polling.
|
|
17
18
|
*
|
|
18
19
|
* 5-minute in-memory cache. Max 10 sources per request.
|
|
19
20
|
*/
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
* Returns: { success, data|answer, sources, method, elapsed, tokensUsed }
|
|
15
15
|
*
|
|
16
16
|
* Webhook support: pass `webhook` URL to get async delivery with HMAC-SHA256 signing.
|
|
17
|
+
* Streaming support: pass `stream: true` to get SSE events instead of polling.
|
|
17
18
|
*
|
|
18
19
|
* 5-minute in-memory cache. Max 10 sources per request.
|
|
19
20
|
*/
|
|
@@ -81,8 +82,14 @@ function setCache(key, result) {
|
|
|
81
82
|
}
|
|
82
83
|
cache.set(key, { result, expiresAt: Date.now() + CACHE_TTL });
|
|
83
84
|
}
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
// SSE helpers
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
/**
 * Write one Server-Sent Events frame to the response.
 *
 * The frame has the form `event: <event>\ndata: <json>\n\n`, where `<json>` is
 * `JSON.stringify(data)`.
 *
 * @param {object} res - Response-like object exposing `write()` (and optionally
 *   `writableEnded` / `destroyed`).
 * @param {string} event - SSE event name.
 * @param {*} data - JSON-serializable payload.
 * @returns {void}
 */
function sseWrite(res, event, data) {
    // Progress callbacks can fire after the response has been ended or the client
    // has gone away; drop the frame instead of writing to a dead stream.
    if (res.writableEnded || res.destroyed) {
        return;
    }
    res.write(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`);
}
|
|
84
91
|
async function runAgentQuery(params) {
|
|
85
|
-
const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources } = params;
|
|
92
|
+
const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources, onSearching, onFetching, onExtracting } = params;
|
|
86
93
|
const startMs = Date.now();
|
|
87
94
|
const numSources = Math.min(maxSources || 5, 10);
|
|
88
95
|
// Cache check
|
|
@@ -97,6 +104,8 @@ async function runAgentQuery(params) {
|
|
|
97
104
|
}
|
|
98
105
|
else {
|
|
99
106
|
log.info(`Searching web for: "${prompt}"`);
|
|
107
|
+
if (onSearching)
|
|
108
|
+
onSearching();
|
|
100
109
|
const { provider, apiKey: searchApiKey } = getBestSearchProvider();
|
|
101
110
|
try {
|
|
102
111
|
const searchResults = await provider.searchWeb(prompt.trim(), { count: numSources, apiKey: searchApiKey });
|
|
@@ -111,6 +120,8 @@ async function runAgentQuery(params) {
|
|
|
111
120
|
}
|
|
112
121
|
// Step 2: Fetch pages in parallel
|
|
113
122
|
log.info(`Fetching ${sourceUrls.length} sources in parallel`);
|
|
123
|
+
if (onFetching)
|
|
124
|
+
onFetching(sourceUrls.length);
|
|
114
125
|
const PER_SOURCE_TIMEOUT_MS = 5000;
|
|
115
126
|
const fetchPromises = sourceUrls.map(async (source) => {
|
|
116
127
|
try {
|
|
@@ -136,6 +147,8 @@ async function runAgentQuery(params) {
|
|
|
136
147
|
let result;
|
|
137
148
|
if (schema && llmApiKey) {
|
|
138
149
|
log.info('Using LLM extraction');
|
|
150
|
+
if (onExtracting)
|
|
151
|
+
onExtracting('llm');
|
|
139
152
|
const extracted = await extractWithLLM({
|
|
140
153
|
content: combinedContent.slice(0, 30000), schema, llmApiKey, llmProvider: (llmProvider || 'openai'), llmModel,
|
|
141
154
|
prompt: `Based on these web pages, ${prompt}`, url: fetchResults[0].url,
|
|
@@ -146,6 +159,8 @@ async function runAgentQuery(params) {
|
|
|
146
159
|
}
|
|
147
160
|
else {
|
|
148
161
|
log.info('Using BM25 text extraction');
|
|
162
|
+
if (onExtracting)
|
|
163
|
+
onExtracting('bm25');
|
|
149
164
|
const qa = quickAnswer({ question: prompt, content: combinedContent, maxPassages: 3, maxChars: 2000 });
|
|
150
165
|
result = { success: true, answer: qa.answer || combinedContent.slice(0, 2000), confidence: qa.confidence ?? 0,
|
|
151
166
|
sources: fetchResults.map((r) => ({ url: r.url, title: r.title })), method: 'agent-bm25', tokensUsed: totalTokens, elapsed: Date.now() - startMs };
|
|
@@ -158,9 +173,9 @@ async function runAgentQuery(params) {
|
|
|
158
173
|
// ---------------------------------------------------------------------------
|
|
159
174
|
export function createAgentRouter() {
|
|
160
175
|
const router = Router();
|
|
161
|
-
// ── POST /v1/agent — single query (with optional webhook)
|
|
176
|
+
// ── POST /v1/agent — single query (with optional webhook or stream) ──────
|
|
162
177
|
router.post('/', async (req, res) => {
|
|
163
|
-
const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources, webhook } = req.body || {};
|
|
178
|
+
const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources, webhook, stream } = req.body || {};
|
|
164
179
|
const requestId = req.requestId || crypto.randomUUID();
|
|
165
180
|
if (!prompt?.trim()) {
|
|
166
181
|
return res.status(400).json({
|
|
@@ -170,6 +185,34 @@ export function createAgentRouter() {
|
|
|
170
185
|
requestId,
|
|
171
186
|
});
|
|
172
187
|
}
|
|
188
|
+
// ── Streaming mode (SSE) ─────────────────────────────────────────────
|
|
189
|
+
if (stream === true) {
|
|
190
|
+
res.setHeader('Content-Type', 'text/event-stream');
|
|
191
|
+
res.setHeader('Cache-Control', 'no-cache');
|
|
192
|
+
res.setHeader('Connection', 'keep-alive');
|
|
193
|
+
res.setHeader('X-Accel-Buffering', 'no');
|
|
194
|
+
res.flushHeaders();
|
|
195
|
+
try {
|
|
196
|
+
const result = await runAgentQuery({
|
|
197
|
+
prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources,
|
|
198
|
+
onSearching: () => {
|
|
199
|
+
sseWrite(res, 'searching', { message: 'Searching the web...' });
|
|
200
|
+
},
|
|
201
|
+
onFetching: (count) => {
|
|
202
|
+
sseWrite(res, 'fetching', { message: `Fetching ${count} sources...`, count });
|
|
203
|
+
},
|
|
204
|
+
onExtracting: (method) => {
|
|
205
|
+
sseWrite(res, 'extracting', { message: method === 'llm' ? 'Extracting with LLM...' : 'Analyzing with BM25...', method });
|
|
206
|
+
},
|
|
207
|
+
});
|
|
208
|
+
sseWrite(res, 'done', { ...result, requestId });
|
|
209
|
+
}
|
|
210
|
+
catch (err) {
|
|
211
|
+
sseWrite(res, 'error', { message: err.message || 'An unexpected error occurred', requestId });
|
|
212
|
+
}
|
|
213
|
+
res.end();
|
|
214
|
+
return;
|
|
215
|
+
}
|
|
173
216
|
// Async mode: webhook provided → return immediately, deliver result later
|
|
174
217
|
if (webhook) {
|
|
175
218
|
const jobId = crypto.randomUUID();
|
|
@@ -198,7 +241,7 @@ export function createAgentRouter() {
|
|
|
198
241
|
});
|
|
199
242
|
// ── POST /v1/agent/batch — parallel batch queries ─────────────────────
|
|
200
243
|
router.post('/batch', async (req, res) => {
|
|
201
|
-
const { prompts, schema, llmApiKey, llmProvider, llmModel, sources, webhook } = req.body || {};
|
|
244
|
+
const { prompts, schema, llmApiKey, llmProvider, llmModel, sources, webhook, stream } = req.body || {};
|
|
202
245
|
const requestId = req.requestId || crypto.randomUUID();
|
|
203
246
|
if (!Array.isArray(prompts) || prompts.length === 0) {
|
|
204
247
|
return res.status(400).json({
|
|
@@ -214,7 +257,57 @@ export function createAgentRouter() {
|
|
|
214
257
|
const jobId = crypto.randomUUID();
|
|
215
258
|
const job = { id: jobId, status: 'processing', total: prompts.length, completed: 0, results: [], webhook, createdAt: Date.now() };
|
|
216
259
|
batchJobs.set(jobId, job);
|
|
217
|
-
//
|
|
260
|
+
// ── Streaming mode (SSE) — keep connection open ──────────────────────
|
|
261
|
+
if (stream === true) {
|
|
262
|
+
res.setHeader('Content-Type', 'text/event-stream');
|
|
263
|
+
res.setHeader('Cache-Control', 'no-cache');
|
|
264
|
+
res.setHeader('Connection', 'keep-alive');
|
|
265
|
+
res.setHeader('X-Accel-Buffering', 'no');
|
|
266
|
+
res.flushHeaders();
|
|
267
|
+
// Send start event
|
|
268
|
+
sseWrite(res, 'start', { id: jobId, total: prompts.length, requestId });
|
|
269
|
+
const sem = new Semaphore(5);
|
|
270
|
+
const tasks = prompts.map(async (prompt) => {
|
|
271
|
+
await sem.acquire();
|
|
272
|
+
try {
|
|
273
|
+
const result = await runAgentQuery({ prompt, schema, llmApiKey, llmProvider, llmModel, sources });
|
|
274
|
+
const entry = {
|
|
275
|
+
prompt,
|
|
276
|
+
success: !!result.success,
|
|
277
|
+
answer: result.answer,
|
|
278
|
+
data: result.data,
|
|
279
|
+
sources: result.sources,
|
|
280
|
+
method: result.method,
|
|
281
|
+
elapsed: result.elapsed,
|
|
282
|
+
};
|
|
283
|
+
job.results.push(entry);
|
|
284
|
+
job.completed++;
|
|
285
|
+
// Send per-prompt progress event
|
|
286
|
+
sseWrite(res, 'progress', { completed: job.completed, total: job.total, result: entry });
|
|
287
|
+
}
|
|
288
|
+
catch (err) {
|
|
289
|
+
const entry = { prompt, success: false, error: err.message };
|
|
290
|
+
job.results.push(entry);
|
|
291
|
+
job.completed++;
|
|
292
|
+
sseWrite(res, 'progress', { completed: job.completed, total: job.total, result: entry });
|
|
293
|
+
}
|
|
294
|
+
finally {
|
|
295
|
+
sem.release();
|
|
296
|
+
}
|
|
297
|
+
});
|
|
298
|
+
await Promise.allSettled(tasks);
|
|
299
|
+
job.status = 'completed';
|
|
300
|
+
// Send done event
|
|
301
|
+
sseWrite(res, 'done', { id: jobId, total: job.total, completed: job.completed, requestId });
|
|
302
|
+
res.end();
|
|
303
|
+
// Fire webhook if configured
|
|
304
|
+
if (webhook) {
|
|
305
|
+
sendWebhook(webhook, 'agent.batch.completed', { id: jobId, total: job.total, completed: job.completed, results: job.results })
|
|
306
|
+
.catch((err) => log.error('Batch webhook failed:', err.message));
|
|
307
|
+
}
|
|
308
|
+
return;
|
|
309
|
+
}
|
|
310
|
+
// Non-streaming mode: Return immediately, then process in background
|
|
218
311
|
res.json({ success: true, id: jobId, status: 'processing', total: prompts.length, requestId });
|
|
219
312
|
// Process in background with concurrency limit of 5
|
|
220
313
|
// eslint-disable-next-line @typescript-eslint/no-floating-promises
|
|
@@ -11,12 +11,14 @@ import { Router } from 'express';
|
|
|
11
11
|
import '../types.js'; // Augments Express.Request with requestId
|
|
12
12
|
import { crawl } from '../../core/crawler.js';
|
|
13
13
|
import { validateUrlForSSRF, SSRFError } from '../middleware/url-validator.js';
|
|
14
|
+
import crypto from 'crypto';
|
|
14
15
|
export function createCrawlRouter(jobQueue) {
|
|
15
16
|
const router = Router();
|
|
16
17
|
/**
|
|
17
18
|
* POST /v1/crawl
|
|
18
19
|
*
|
|
19
20
|
* Start an async crawl job. Returns a job ID immediately; poll GET /v1/crawl/:id for status.
|
|
21
|
+
* With stream:true, keeps the connection open and sends SSE events per page.
|
|
20
22
|
*
|
|
21
23
|
* Body:
|
|
22
24
|
* url {string} Required. Starting URL.
|
|
@@ -26,10 +28,11 @@ export function createCrawlRouter(jobQueue) {
|
|
|
26
28
|
* excludePatterns {string[]} Regex patterns — skip matching URLs.
|
|
27
29
|
* formats {string[]} Content formats: 'markdown' | 'text' (default: ['markdown']).
|
|
28
30
|
* webhook {object} Optional webhook to POST results to when done.
|
|
31
|
+
* stream {boolean} If true, respond with SSE events (start → progress → done).
|
|
29
32
|
*/
|
|
30
33
|
router.post('/', async (req, res) => {
|
|
31
34
|
try {
|
|
32
|
-
const { url, maxPages = 10, maxDepth = 2, includePatterns = [], excludePatterns = [], webhook, } = req.body ?? {};
|
|
35
|
+
const { url, maxPages = 10, maxDepth = 2, includePatterns = [], excludePatterns = [], webhook, stream, } = req.body ?? {};
|
|
33
36
|
// Validate URL
|
|
34
37
|
if (!url || typeof url !== 'string') {
|
|
35
38
|
res.status(400).json({
|
|
@@ -78,6 +81,76 @@ export function createCrawlRouter(jobQueue) {
|
|
|
78
81
|
throw error;
|
|
79
82
|
}
|
|
80
83
|
const ownerId = req.auth?.keyInfo?.accountId;
|
|
84
|
+
// ── Streaming mode (SSE) — keep connection open ──────────────────────
|
|
85
|
+
if (stream === true) {
|
|
86
|
+
res.setHeader('Content-Type', 'text/event-stream');
|
|
87
|
+
res.setHeader('Cache-Control', 'no-cache');
|
|
88
|
+
res.setHeader('Connection', 'keep-alive');
|
|
89
|
+
res.setHeader('X-Accel-Buffering', 'no');
|
|
90
|
+
res.flushHeaders();
|
|
91
|
+
const jobId = crypto.randomUUID();
|
|
92
|
+
// Send start event (total unknown until crawl runs)
|
|
93
|
+
res.write(`event: start\ndata: ${JSON.stringify({ id: jobId, url, maxPages, requestId: req.requestId })}\n\n`);
|
|
94
|
+
const crawlOptions = {
|
|
95
|
+
maxPages,
|
|
96
|
+
maxDepth,
|
|
97
|
+
tier: req.auth?.tier,
|
|
98
|
+
onProgress: (progress) => {
|
|
99
|
+
const total = progress.crawled + progress.queued;
|
|
100
|
+
res.write(`event: progress\ndata: ${JSON.stringify({
|
|
101
|
+
id: jobId,
|
|
102
|
+
completed: progress.crawled,
|
|
103
|
+
total,
|
|
104
|
+
queued: progress.queued,
|
|
105
|
+
currentUrl: progress.currentUrl,
|
|
106
|
+
})}\n\n`);
|
|
107
|
+
},
|
|
108
|
+
};
|
|
109
|
+
if (Array.isArray(includePatterns) && includePatterns.length > 0) {
|
|
110
|
+
crawlOptions.includePatterns = includePatterns;
|
|
111
|
+
}
|
|
112
|
+
if (Array.isArray(excludePatterns) && excludePatterns.length > 0) {
|
|
113
|
+
crawlOptions.excludePatterns = excludePatterns;
|
|
114
|
+
}
|
|
115
|
+
try {
|
|
116
|
+
const results = await crawl(url, crawlOptions);
|
|
117
|
+
const data = results.map(r => ({
|
|
118
|
+
url: r.url,
|
|
119
|
+
title: r.title,
|
|
120
|
+
content: r.markdown,
|
|
121
|
+
links: r.links,
|
|
122
|
+
elapsed: r.elapsed,
|
|
123
|
+
}));
|
|
124
|
+
res.write(`event: done\ndata: ${JSON.stringify({
|
|
125
|
+
id: jobId,
|
|
126
|
+
total: results.length,
|
|
127
|
+
completed: results.length,
|
|
128
|
+
results: data,
|
|
129
|
+
requestId: req.requestId,
|
|
130
|
+
})}\n\n`);
|
|
131
|
+
// Fire webhook if configured
|
|
132
|
+
if (webhook) {
|
|
133
|
+
Promise.resolve(jobQueue.createJob('crawl', webhook, ownerId)).then((job) => {
|
|
134
|
+
jobQueue.updateJob(job.id, {
|
|
135
|
+
status: 'completed',
|
|
136
|
+
data,
|
|
137
|
+
total: results.length,
|
|
138
|
+
completed: results.length,
|
|
139
|
+
creditsUsed: results.length,
|
|
140
|
+
});
|
|
141
|
+
}).catch(() => { });
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
catch (error) {
|
|
145
|
+
res.write(`event: error\ndata: ${JSON.stringify({
|
|
146
|
+
id: jobId,
|
|
147
|
+
message: error.message || 'Crawl failed',
|
|
148
|
+
requestId: req.requestId,
|
|
149
|
+
})}\n\n`);
|
|
150
|
+
}
|
|
151
|
+
res.end();
|
|
152
|
+
return;
|
|
153
|
+
}
|
|
81
154
|
const job = await jobQueue.createJob('crawl', webhook, ownerId);
|
|
82
155
|
// Start crawl in background
|
|
83
156
|
setImmediate(async () => {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.20.
|
|
3
|
+
"version": "0.20.8",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|