webpeel 0.13.4 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -162
- package/dist/cli-auth.js +7 -7
- package/dist/cli-auth.js.map +1 -1
- package/dist/cli.js +197 -26
- package/dist/cli.js.map +1 -1
- package/dist/core/auto-extract.d.ts +83 -0
- package/dist/core/auto-extract.d.ts.map +1 -0
- package/dist/core/auto-extract.js +565 -0
- package/dist/core/auto-extract.js.map +1 -0
- package/dist/core/deep-fetch.d.ts +75 -0
- package/dist/core/deep-fetch.d.ts.map +1 -0
- package/dist/core/deep-fetch.js +406 -0
- package/dist/core/deep-fetch.js.map +1 -0
- package/dist/core/domain-extractors.d.ts +34 -0
- package/dist/core/domain-extractors.d.ts.map +1 -0
- package/dist/core/domain-extractors.js +654 -0
- package/dist/core/domain-extractors.js.map +1 -0
- package/dist/core/markdown.d.ts +8 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +25 -0
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/quick-answer.d.ts +28 -0
- package/dist/core/quick-answer.d.ts.map +1 -0
- package/dist/core/quick-answer.js +288 -0
- package/dist/core/quick-answer.js.map +1 -0
- package/dist/core/readability.d.ts +58 -0
- package/dist/core/readability.d.ts.map +1 -0
- package/dist/core/readability.js +496 -0
- package/dist/core/readability.js.map +1 -0
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +3 -6
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +70 -5
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/watch-manager.d.ts +140 -0
- package/dist/core/watch-manager.d.ts.map +1 -0
- package/dist/core/watch-manager.js +348 -0
- package/dist/core/watch-manager.js.map +1 -0
- package/dist/core/youtube.d.ts +91 -0
- package/dist/core/youtube.d.ts.map +1 -0
- package/dist/core/youtube.js +380 -0
- package/dist/core/youtube.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +103 -0
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +58 -16
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +19 -1
- package/dist/server/app.js.map +1 -1
- package/dist/server/routes/deep-fetch.d.ts +9 -0
- package/dist/server/routes/deep-fetch.d.ts.map +1 -0
- package/dist/server/routes/deep-fetch.js +38 -0
- package/dist/server/routes/deep-fetch.js.map +1 -0
- package/dist/server/routes/extract.d.ts.map +1 -1
- package/dist/server/routes/extract.js +11 -0
- package/dist/server/routes/extract.js.map +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +45 -19
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/mcp.d.ts +2 -1
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +307 -38
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/quick-answer.d.ts +9 -0
- package/dist/server/routes/quick-answer.d.ts.map +1 -0
- package/dist/server/routes/quick-answer.js +84 -0
- package/dist/server/routes/quick-answer.js.map +1 -0
- package/dist/server/routes/watch.d.ts +16 -0
- package/dist/server/routes/watch.d.ts.map +1 -0
- package/dist/server/routes/watch.js +219 -0
- package/dist/server/routes/watch.js.map +1 -0
- package/dist/server/routes/youtube.d.ts +7 -0
- package/dist/server/routes/youtube.d.ts.map +1 -0
- package/dist/server/routes/youtube.js +87 -0
- package/dist/server/routes/youtube.js.map +1 -0
- package/dist/types.d.ts +18 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +14 -5
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -127,6 +127,53 @@ function parseActions(actionStrings) {
|
|
|
127
127
|
}
|
|
128
128
|
});
|
|
129
129
|
}
|
|
130
|
+
/**
 * Format an error for terminal output and append actionable suggestions
 * chosen from keywords found in the error message.
 *
 * The first line is always the message in red, prefixed with "✖". Zero or
 * more yellow "💡" hint lines follow. Keyword matching is case-insensitive
 * and the first matching category wins, checked in this order: DNS,
 * connection, timeout, blocked, empty content, CAPTCHA, rate limit,
 * SSL/TLS, usage/quota.
 *
 * @param {Error|*} error - The failure. `error.message` is used when
 *   present; otherwise the value is stringified.
 * @param {string} _url - Unused; kept for call-site compatibility.
 * @param {object} [options] - Parsed CLI options. Only `render` and
 *   `stealth` are read, to avoid suggesting a flag that is already set.
 *   May be omitted.
 * @returns {string} Newline-joined, ANSI-coloured lines.
 */
function formatError(error, _url, options) {
    const msg = error?.message || String(error);
    // Tolerate a missing options object: this runs inside an error handler,
    // so it must never throw itself.
    const opts = options ?? {};
    const lower = msg.toLowerCase();
    const has = (...needles) => needles.some((needle) => lower.includes(needle));
    const hint = (text) => `\x1b[33m💡 ${text}\x1b[0m`;
    const lines = [`\x1b[31m✖ ${msg}\x1b[0m`];
    // DNS failures are tested before generic connection errors; otherwise
    // "getaddrinfo ENOTFOUND <host>" would never reach the hostname hint.
    if (has('enotfound', 'getaddrinfo')) {
        lines.push(hint('Could not resolve hostname. Check the URL is correct.'));
    }
    else if (has('net::err_', 'econnrefused', 'econnreset')) {
        lines.push(hint('Check the URL is correct and the site is accessible.'));
    }
    else if (has('timeout', 'timed out', 'etimedout')) {
        lines.push(hint('Try increasing timeout: --timeout 60000'));
        if (!opts.render) {
            lines.push(hint('Site may need browser rendering: --render'));
        }
    }
    else if (has('blocked', '403', 'access denied', 'challenge', 'cloudflare')) {
        if (!opts.stealth) {
            lines.push(hint('Try stealth mode to bypass bot detection: --stealth'));
        }
        lines.push(hint('Try a different user agent: --ua "Mozilla/5.0..."'));
    }
    else if (has('empty', 'no content', '0 tokens')) {
        if (!opts.render) {
            lines.push(hint('Page may be JavaScript-rendered. Try: --render'));
        }
        else if (!opts.stealth) {
            lines.push(hint('Content may be behind bot detection. Try: --stealth'));
        }
        lines.push(hint('Try waiting longer for content: --wait 5000'));
    }
    else if (has('captcha')) {
        lines.push(hint('This site requires CAPTCHA solving. Try a browser profile: --profile mysite --headed'));
    }
    // Must precede the usage/quota branch, which also matches "limit".
    else if (has('rate limit', '429')) {
        lines.push(hint('Rate limited. Wait a moment and try again, or use --proxy.'));
    }
    else if (has('certificate', 'ssl', 'tls')) {
        lines.push(hint('SSL/TLS error. The site may have an invalid certificate.'));
    }
    else if (has('usage', 'quota', 'limit')) {
        lines.push(hint('Run `webpeel usage` to check your quota, or `webpeel login` to authenticate.'));
    }
    return lines.join('\n');
}
|
|
130
177
|
program
|
|
131
178
|
.argument('[url]', 'URL to fetch')
|
|
132
179
|
.option('-r, --render', 'Use headless browser (for JS-heavy sites)')
|
|
@@ -147,6 +194,7 @@ program
|
|
|
147
194
|
.option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
|
|
148
195
|
.option('--only-main-content', 'Shortcut for --include-tags main,article')
|
|
149
196
|
.option('--full-content', 'Return full page content (disable automatic content density pruning)')
|
|
197
|
+
.option('--readable', 'Reader mode — extract only the main article content, strip all noise (like browser Reader Mode)')
|
|
150
198
|
.option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
|
|
151
199
|
.option('--chunk <size>', 'Split content into N-token chunks for LLM processing (default strategy: semantic)', parseInt)
|
|
152
200
|
.option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 200)', parseInt)
|
|
@@ -181,7 +229,51 @@ program
|
|
|
181
229
|
.option('--pages <n>', 'Follow pagination "Next" links for N pages (max 10)', (v) => parseInt(v, 10))
|
|
182
230
|
.option('--profile <path>', 'Use a persistent browser profile directory (cookies/sessions survive between calls)')
|
|
183
231
|
.option('--headed', 'Run browser in headed (visible) mode — useful for profile setup and debugging')
|
|
184
|
-
.option('--
|
|
232
|
+
.option('-q, --question <q>', 'Ask a question about the page content (BM25-powered, no LLM key needed)')
|
|
233
|
+
.option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)');
|
|
234
|
+
program.configureHelp({
|
|
235
|
+
sortSubcommands: true,
|
|
236
|
+
showGlobalOptions: false,
|
|
237
|
+
});
|
|
238
|
+
program.addHelpText('afterAll', `
|
|
239
|
+
Output Formats:
|
|
240
|
+
--json JSON output with full metadata
|
|
241
|
+
--html Raw HTML output
|
|
242
|
+
--text Plain text output
|
|
243
|
+
--csv / --table Tabular output for extractions
|
|
244
|
+
-s, --silent No spinner or progress output
|
|
245
|
+
|
|
246
|
+
Content Control:
|
|
247
|
+
--readable Reader mode — clean article content only
|
|
248
|
+
--budget <n> Smart token budget (no LLM key needed)
|
|
249
|
+
--focus <query> BM25 query-focused filtering
|
|
250
|
+
--selector <css> Extract specific CSS selector
|
|
251
|
+
--only-main-content Just main/article content
|
|
252
|
+
--full-content Disable content pruning
|
|
253
|
+
-q, --question <q> Ask a question about the content
|
|
254
|
+
|
|
255
|
+
Rendering:
|
|
256
|
+
-r, --render Browser rendering for JS-heavy sites
|
|
257
|
+
--stealth Stealth mode for bot-protected sites
|
|
258
|
+
--profile <path> Persistent browser profile
|
|
259
|
+
--headed Visible browser (for debugging)
|
|
260
|
+
--action <actions> Browser automation (click, type, scroll...)
|
|
261
|
+
|
|
262
|
+
Extraction:
|
|
263
|
+
--extract <json> CSS selector extraction
|
|
264
|
+
--extract-all Auto-detect listing items
|
|
265
|
+
--schema <name> Named extraction schema
|
|
266
|
+
--llm-extract [inst] LLM-powered extraction (BYOK)
|
|
267
|
+
|
|
268
|
+
Examples:
|
|
269
|
+
$ webpeel "https://example.com" Basic fetch
|
|
270
|
+
$ webpeel "https://youtube.com/watch?v=..." --json YouTube transcript
|
|
271
|
+
$ webpeel "https://openai.com/pricing" -q "GPT-4 cost?" Quick answer
|
|
272
|
+
$ webpeel "https://nytimes.com/article" --readable Reader mode
|
|
273
|
+
$ webpeel search "best restaurants in NYC" Web search
|
|
274
|
+
$ webpeel hotels "Manhattan" --checkin tomorrow Hotel search
|
|
275
|
+
`);
|
|
276
|
+
program
|
|
185
277
|
.action(async (url, options) => {
|
|
186
278
|
// --agent sets sensible defaults for AI agents; explicit flags override
|
|
187
279
|
if (options.agent) {
|
|
@@ -505,6 +597,7 @@ program
|
|
|
505
597
|
storageState: resolvedStorageState,
|
|
506
598
|
proxy: options.proxy,
|
|
507
599
|
fullPage: options.fullContent || false,
|
|
600
|
+
readable: options.readable || false,
|
|
508
601
|
// Smart auto-scroll (bare --scroll-extract flag)
|
|
509
602
|
autoScroll: isAutoScroll
|
|
510
603
|
? { timeout: options.scrollExtractTimeout }
|
|
@@ -540,7 +633,23 @@ program
|
|
|
540
633
|
touchProfile(resolvedProfileName);
|
|
541
634
|
}
|
|
542
635
|
if (spinner) {
|
|
543
|
-
|
|
636
|
+
const domainTag = result.domainData
|
|
637
|
+
? ` [${result.domainData.domain}:${result.domainData.type}]`
|
|
638
|
+
: '';
|
|
639
|
+
spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
|
|
640
|
+
}
|
|
641
|
+
// Show metadata header
|
|
642
|
+
const pageTitle = result.metadata?.title || result.title;
|
|
643
|
+
if (!options.silent && !options.json && pageTitle) {
|
|
644
|
+
const parts = [];
|
|
645
|
+
if (result.metadata?.author)
|
|
646
|
+
parts.push(`by ${result.metadata.author}`);
|
|
647
|
+
if (result.readability?.readingTime)
|
|
648
|
+
parts.push(result.readability.readingTime);
|
|
649
|
+
if (result.tokens)
|
|
650
|
+
parts.push(`${result.tokens.toLocaleString()} tokens`);
|
|
651
|
+
const subtitle = parts.length ? ` · ${parts.join(' · ')}` : '';
|
|
652
|
+
console.error(`\x1b[36m📄 ${pageTitle}${subtitle}\x1b[0m`);
|
|
544
653
|
}
|
|
545
654
|
// Show usage footer for free/anonymous users
|
|
546
655
|
if (usageCheck.usageInfo && !options.silent) {
|
|
@@ -596,6 +705,36 @@ program
|
|
|
596
705
|
result.focusReduction = focusResult.reductionPercent;
|
|
597
706
|
}
|
|
598
707
|
}
|
|
708
|
+
// --- LLM-free Quick Answer ---
|
|
709
|
+
if (options.question && result.content) {
|
|
710
|
+
const { quickAnswer } = await import('./core/quick-answer.js');
|
|
711
|
+
const qa = quickAnswer({
|
|
712
|
+
question: options.question,
|
|
713
|
+
content: result.content,
|
|
714
|
+
url: result.url,
|
|
715
|
+
});
|
|
716
|
+
result.quickAnswer = qa;
|
|
717
|
+
if (!isJson) {
|
|
718
|
+
// Display answer prominently in human-readable mode
|
|
719
|
+
const conf = (qa.confidence * 100).toFixed(0);
|
|
720
|
+
await writeStdout(`\n\x1b[36m📋 ${qa.question}\x1b[0m\n\n`);
|
|
721
|
+
if (qa.answer) {
|
|
722
|
+
await writeStdout(`\x1b[32m💡 Answer (${conf}% confidence):\x1b[0m\n${qa.answer}\n`);
|
|
723
|
+
}
|
|
724
|
+
else {
|
|
725
|
+
await writeStdout(`\x1b[33m💡 No relevant answer found (${conf}% confidence)\x1b[0m\n`);
|
|
726
|
+
}
|
|
727
|
+
if (qa.passages && qa.passages.length > 1) {
|
|
728
|
+
await writeStdout(`\n\x1b[33m📝 Supporting evidence:\x1b[0m\n`);
|
|
729
|
+
for (const p of qa.passages.slice(1, 4)) {
|
|
730
|
+
await writeStdout(` • [${(p.score * 100).toFixed(0)}%] ${p.text.substring(0, 200)}${p.text.length > 200 ? '...' : ''}\n`);
|
|
731
|
+
}
|
|
732
|
+
}
|
|
733
|
+
await writeStdout('\n');
|
|
734
|
+
await cleanup();
|
|
735
|
+
process.exit(0);
|
|
736
|
+
}
|
|
737
|
+
}
|
|
599
738
|
// --- Smart Chunking ---
|
|
600
739
|
if (options.chunk && options.chunk > 0 && result.content) {
|
|
601
740
|
const { chunkContent } = await import('./core/chunking.js');
|
|
@@ -851,30 +990,10 @@ program
|
|
|
851
990
|
process.exit(1);
|
|
852
991
|
}
|
|
853
992
|
if (error instanceof Error) {
|
|
854
|
-
console.error(
|
|
855
|
-
// Provide actionable hints based on error type
|
|
856
|
-
const msg = error.message.toLowerCase();
|
|
857
|
-
if (msg.includes('timeout') || msg.includes('timed out')) {
|
|
858
|
-
console.error('\n💡 Hint: Try --render for JS-heavy sites, or --wait 5000 to wait longer.');
|
|
859
|
-
}
|
|
860
|
-
else if (msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare')) {
|
|
861
|
-
console.error('\n💡 Hint: Try --stealth to bypass bot detection (uses more credits).');
|
|
862
|
-
}
|
|
863
|
-
else if (msg.includes('enotfound') || msg.includes('getaddrinfo')) {
|
|
864
|
-
console.error('\n💡 Hint: Could not resolve hostname. Check the URL is correct.');
|
|
865
|
-
}
|
|
866
|
-
else if (msg.includes('econnrefused') || msg.includes('econnreset')) {
|
|
867
|
-
console.error('\n💡 Hint: Connection refused. The site may be down or blocking requests.');
|
|
868
|
-
}
|
|
869
|
-
else if (msg.includes('certificate') || msg.includes('ssl') || msg.includes('tls')) {
|
|
870
|
-
console.error('\n💡 Hint: SSL/TLS error. The site may have an invalid certificate.');
|
|
871
|
-
}
|
|
872
|
-
else if (msg.includes('usage') || msg.includes('quota') || msg.includes('limit')) {
|
|
873
|
-
console.error('\n💡 Hint: Run `webpeel usage` to check your quota, or `webpeel login` to authenticate.');
|
|
874
|
-
}
|
|
993
|
+
console.error('\n' + formatError(error, url || '', options));
|
|
875
994
|
}
|
|
876
995
|
else {
|
|
877
|
-
console.error('\
|
|
996
|
+
console.error('\x1b[31m✖ Unknown error occurred\x1b[0m');
|
|
878
997
|
}
|
|
879
998
|
await cleanup();
|
|
880
999
|
process.exit(1);
|
|
@@ -3399,8 +3518,60 @@ async function outputResult(result, options, extra = {}) {
|
|
|
3399
3518
|
}
|
|
3400
3519
|
// Default: full output
|
|
3401
3520
|
if (options.json) {
|
|
3402
|
-
|
|
3403
|
-
|
|
3521
|
+
// Build clean JSON output with guaranteed top-level fields
|
|
3522
|
+
const output = {
|
|
3523
|
+
url: result.url,
|
|
3524
|
+
title: result.metadata?.title || result.title || null,
|
|
3525
|
+
tokens: result.tokens || 0,
|
|
3526
|
+
fetchedAt: new Date().toISOString(),
|
|
3527
|
+
method: result.method || 'simple',
|
|
3528
|
+
elapsed: result.elapsed,
|
|
3529
|
+
content: result.content,
|
|
3530
|
+
};
|
|
3531
|
+
// Add optional fields only if present (filter out undefined/null values from metadata)
|
|
3532
|
+
if (result.metadata) {
|
|
3533
|
+
const cleanMeta = {};
|
|
3534
|
+
for (const [k, v] of Object.entries(result.metadata)) {
|
|
3535
|
+
if (v !== undefined && v !== null)
|
|
3536
|
+
cleanMeta[k] = v;
|
|
3537
|
+
}
|
|
3538
|
+
if (Object.keys(cleanMeta).length > 0)
|
|
3539
|
+
output.metadata = cleanMeta;
|
|
3540
|
+
}
|
|
3541
|
+
if (result.links?.length)
|
|
3542
|
+
output.links = result.links;
|
|
3543
|
+
if (result.images?.length)
|
|
3544
|
+
output.images = result.images;
|
|
3545
|
+
if (result.structured)
|
|
3546
|
+
output.structured = result.structured;
|
|
3547
|
+
if (result.domainData)
|
|
3548
|
+
output.domainData = result.domainData;
|
|
3549
|
+
if (result.readability)
|
|
3550
|
+
output.readability = result.readability;
|
|
3551
|
+
if (result.quickAnswer)
|
|
3552
|
+
output.quickAnswer = result.quickAnswer;
|
|
3553
|
+
if (result.quality)
|
|
3554
|
+
output.quality = result.quality;
|
|
3555
|
+
if (result.contentType)
|
|
3556
|
+
output.contentType = result.contentType;
|
|
3557
|
+
if (result.chunks)
|
|
3558
|
+
output.chunks = result.chunks;
|
|
3559
|
+
if (result.totalChunks)
|
|
3560
|
+
output.totalChunks = result.totalChunks;
|
|
3561
|
+
if (result.warning)
|
|
3562
|
+
output.warning = result.warning;
|
|
3563
|
+
if (result.focusQuery)
|
|
3564
|
+
output.focusQuery = result.focusQuery;
|
|
3565
|
+
if (result.focusReduction)
|
|
3566
|
+
output.focusReduction = result.focusReduction;
|
|
3567
|
+
if (extra.cached)
|
|
3568
|
+
output.cached = true;
|
|
3569
|
+
if (extra.truncated)
|
|
3570
|
+
output.truncated = true;
|
|
3571
|
+
if (extra.totalAvailable !== undefined)
|
|
3572
|
+
output.totalAvailable = extra.totalAvailable;
|
|
3573
|
+
output._meta = { version: cliVersion, method: result.method || 'simple' };
|
|
3574
|
+
await writeStdout(JSON.stringify(output, null, 2) + '\n');
|
|
3404
3575
|
}
|
|
3405
3576
|
else {
|
|
3406
3577
|
await writeStdout(result.content + '\n');
|