webpeel 0.13.4 → 0.14.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/README.md +120 -162
  2. package/dist/cli-auth.js +7 -7
  3. package/dist/cli-auth.js.map +1 -1
  4. package/dist/cli.js +197 -26
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/auto-extract.d.ts +83 -0
  7. package/dist/core/auto-extract.d.ts.map +1 -0
  8. package/dist/core/auto-extract.js +565 -0
  9. package/dist/core/auto-extract.js.map +1 -0
  10. package/dist/core/deep-fetch.d.ts +75 -0
  11. package/dist/core/deep-fetch.d.ts.map +1 -0
  12. package/dist/core/deep-fetch.js +406 -0
  13. package/dist/core/deep-fetch.js.map +1 -0
  14. package/dist/core/domain-extractors.d.ts +34 -0
  15. package/dist/core/domain-extractors.d.ts.map +1 -0
  16. package/dist/core/domain-extractors.js +654 -0
  17. package/dist/core/domain-extractors.js.map +1 -0
  18. package/dist/core/markdown.d.ts +8 -0
  19. package/dist/core/markdown.d.ts.map +1 -1
  20. package/dist/core/markdown.js +25 -0
  21. package/dist/core/markdown.js.map +1 -1
  22. package/dist/core/quick-answer.d.ts +28 -0
  23. package/dist/core/quick-answer.d.ts.map +1 -0
  24. package/dist/core/quick-answer.js +288 -0
  25. package/dist/core/quick-answer.js.map +1 -0
  26. package/dist/core/readability.d.ts +58 -0
  27. package/dist/core/readability.d.ts.map +1 -0
  28. package/dist/core/readability.js +496 -0
  29. package/dist/core/readability.js.map +1 -0
  30. package/dist/core/search-provider.d.ts.map +1 -1
  31. package/dist/core/search-provider.js +3 -6
  32. package/dist/core/search-provider.js.map +1 -1
  33. package/dist/core/strategies.d.ts.map +1 -1
  34. package/dist/core/strategies.js +70 -5
  35. package/dist/core/strategies.js.map +1 -1
  36. package/dist/core/watch-manager.d.ts +140 -0
  37. package/dist/core/watch-manager.d.ts.map +1 -0
  38. package/dist/core/watch-manager.js +348 -0
  39. package/dist/core/watch-manager.js.map +1 -0
  40. package/dist/core/youtube.d.ts +91 -0
  41. package/dist/core/youtube.d.ts.map +1 -0
  42. package/dist/core/youtube.js +380 -0
  43. package/dist/core/youtube.js.map +1 -0
  44. package/dist/index.d.ts +4 -0
  45. package/dist/index.d.ts.map +1 -1
  46. package/dist/index.js +103 -0
  47. package/dist/index.js.map +1 -1
  48. package/dist/mcp/server.js +58 -16
  49. package/dist/mcp/server.js.map +1 -1
  50. package/dist/server/app.d.ts.map +1 -1
  51. package/dist/server/app.js +19 -1
  52. package/dist/server/app.js.map +1 -1
  53. package/dist/server/routes/deep-fetch.d.ts +9 -0
  54. package/dist/server/routes/deep-fetch.d.ts.map +1 -0
  55. package/dist/server/routes/deep-fetch.js +38 -0
  56. package/dist/server/routes/deep-fetch.js.map +1 -0
  57. package/dist/server/routes/extract.d.ts.map +1 -1
  58. package/dist/server/routes/extract.js +11 -0
  59. package/dist/server/routes/extract.js.map +1 -1
  60. package/dist/server/routes/fetch.d.ts.map +1 -1
  61. package/dist/server/routes/fetch.js +45 -19
  62. package/dist/server/routes/fetch.js.map +1 -1
  63. package/dist/server/routes/mcp.d.ts +2 -1
  64. package/dist/server/routes/mcp.d.ts.map +1 -1
  65. package/dist/server/routes/mcp.js +307 -38
  66. package/dist/server/routes/mcp.js.map +1 -1
  67. package/dist/server/routes/quick-answer.d.ts +9 -0
  68. package/dist/server/routes/quick-answer.d.ts.map +1 -0
  69. package/dist/server/routes/quick-answer.js +84 -0
  70. package/dist/server/routes/quick-answer.js.map +1 -0
  71. package/dist/server/routes/watch.d.ts +16 -0
  72. package/dist/server/routes/watch.d.ts.map +1 -0
  73. package/dist/server/routes/watch.js +219 -0
  74. package/dist/server/routes/watch.js.map +1 -0
  75. package/dist/server/routes/youtube.d.ts +7 -0
  76. package/dist/server/routes/youtube.d.ts.map +1 -0
  77. package/dist/server/routes/youtube.js +87 -0
  78. package/dist/server/routes/youtube.js.map +1 -0
  79. package/dist/types.d.ts +18 -0
  80. package/dist/types.d.ts.map +1 -1
  81. package/dist/types.js.map +1 -1
  82. package/llms.txt +14 -5
  83. package/package.json +1 -1
package/dist/cli.js CHANGED
@@ -127,6 +127,53 @@ function parseActions(actionStrings) {
127
127
  }
128
128
  });
129
129
  }
130
+ /**
131
+ * Format an error with actionable suggestions based on error type
132
+ */
133
+ function formatError(error, _url, options) {
134
+ const msg = error.message || String(error);
135
+ const lines = [`\x1b[31m✖ ${msg}\x1b[0m`];
136
+ if (msg.includes('net::ERR_') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
137
+ lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
138
+ }
139
+ else if (msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
140
+ lines.push('\x1b[33m💡 Try increasing timeout: --timeout 60000\x1b[0m');
141
+ if (!options.render) {
142
+ lines.push('\x1b[33m💡 Site may need browser rendering: --render\x1b[0m');
143
+ }
144
+ }
145
+ else if (msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
146
+ if (!options.stealth) {
147
+ lines.push('\x1b[33m💡 Try stealth mode to bypass bot detection: --stealth\x1b[0m');
148
+ }
149
+ lines.push('\x1b[33m💡 Try a different user agent: --ua "Mozilla/5.0..."\x1b[0m');
150
+ }
151
+ else if (msg.includes('empty') || msg.includes('no content') || msg.includes('0 tokens')) {
152
+ if (!options.render) {
153
+ lines.push('\x1b[33m💡 Page may be JavaScript-rendered. Try: --render\x1b[0m');
154
+ }
155
+ else if (!options.stealth) {
156
+ lines.push('\x1b[33m💡 Content may be behind bot detection. Try: --stealth\x1b[0m');
157
+ }
158
+ lines.push('\x1b[33m💡 Try waiting longer for content: --wait 5000\x1b[0m');
159
+ }
160
+ else if (msg.includes('captcha') || msg.includes('CAPTCHA') || msg.includes('Captcha')) {
161
+ lines.push('\x1b[33m💡 This site requires CAPTCHA solving. Try a browser profile: --profile mysite --headed\x1b[0m');
162
+ }
163
+ else if (msg.includes('rate limit') || msg.includes('429')) {
164
+ lines.push('\x1b[33m💡 Rate limited. Wait a moment and try again, or use --proxy.\x1b[0m');
165
+ }
166
+ else if (msg.toLowerCase().includes('enotfound') || msg.toLowerCase().includes('getaddrinfo')) {
167
+ lines.push('\x1b[33m💡 Could not resolve hostname. Check the URL is correct.\x1b[0m');
168
+ }
169
+ else if (msg.toLowerCase().includes('certificate') || msg.toLowerCase().includes('ssl') || msg.toLowerCase().includes('tls')) {
170
+ lines.push('\x1b[33m💡 SSL/TLS error. The site may have an invalid certificate.\x1b[0m');
171
+ }
172
+ else if (msg.toLowerCase().includes('usage') || msg.toLowerCase().includes('quota') || msg.toLowerCase().includes('limit')) {
173
+ lines.push('\x1b[33m💡 Run `webpeel usage` to check your quota, or `webpeel login` to authenticate.\x1b[0m');
174
+ }
175
+ return lines.join('\n');
176
+ }
130
177
  program
131
178
  .argument('[url]', 'URL to fetch')
132
179
  .option('-r, --render', 'Use headless browser (for JS-heavy sites)')
@@ -147,6 +194,7 @@ program
147
194
  .option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
148
195
  .option('--only-main-content', 'Shortcut for --include-tags main,article')
149
196
  .option('--full-content', 'Return full page content (disable automatic content density pruning)')
197
+ .option('--readable', 'Reader mode — extract only the main article content, strip all noise (like browser Reader Mode)')
150
198
  .option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
151
199
  .option('--chunk <size>', 'Split content into N-token chunks for LLM processing (default strategy: semantic)', parseInt)
152
200
  .option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 200)', parseInt)
@@ -181,7 +229,51 @@ program
181
229
  .option('--pages <n>', 'Follow pagination "Next" links for N pages (max 10)', (v) => parseInt(v, 10))
182
230
  .option('--profile <path>', 'Use a persistent browser profile directory (cookies/sessions survive between calls)')
183
231
  .option('--headed', 'Run browser in headed (visible) mode — useful for profile setup and debugging')
184
- .option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)')
232
+ .option('-q, --question <q>', 'Ask a question about the page content (BM25-powered, no LLM key needed)')
233
+ .option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)');
234
+ program.configureHelp({
235
+ sortSubcommands: true,
236
+ showGlobalOptions: false,
237
+ });
238
+ program.addHelpText('afterAll', `
239
+ Output Formats:
240
+ --json JSON output with full metadata
241
+ --html Raw HTML output
242
+ --text Plain text output
243
+ --csv / --table Tabular output for extractions
244
+ -s, --silent No spinner or progress output
245
+
246
+ Content Control:
247
+ --readable Reader mode — clean article content only
248
+ --budget <n> Smart token budget (no LLM key needed)
249
+ --focus <query> BM25 query-focused filtering
250
+ --selector <css> Extract specific CSS selector
251
+ --only-main-content Just main/article content
252
+ --full-content Disable content pruning
253
+ -q, --question <q> Ask a question about the content
254
+
255
+ Rendering:
256
+ -r, --render Browser rendering for JS-heavy sites
257
+ --stealth Stealth mode for bot-protected sites
258
+ --profile <path> Persistent browser profile
259
+ --headed Visible browser (for debugging)
260
+ --action <actions> Browser automation (click, type, scroll...)
261
+
262
+ Extraction:
263
+ --extract <json> CSS selector extraction
264
+ --extract-all Auto-detect listing items
265
+ --schema <name> Named extraction schema
266
+ --llm-extract [inst] LLM-powered extraction (BYOK)
267
+
268
+ Examples:
269
+ $ webpeel "https://example.com" Basic fetch
270
+ $ webpeel "https://youtube.com/watch?v=..." --json YouTube transcript
271
+ $ webpeel "https://openai.com/pricing" -q "GPT-4 cost?" Quick answer
272
+ $ webpeel "https://nytimes.com/article" --readable Reader mode
273
+ $ webpeel search "best restaurants in NYC" Web search
274
+ $ webpeel hotels "Manhattan" --checkin tomorrow Hotel search
275
+ `);
276
+ program
185
277
  .action(async (url, options) => {
186
278
  // --agent sets sensible defaults for AI agents; explicit flags override
187
279
  if (options.agent) {
@@ -505,6 +597,7 @@ program
505
597
  storageState: resolvedStorageState,
506
598
  proxy: options.proxy,
507
599
  fullPage: options.fullContent || false,
600
+ readable: options.readable || false,
508
601
  // Smart auto-scroll (bare --scroll-extract flag)
509
602
  autoScroll: isAutoScroll
510
603
  ? { timeout: options.scrollExtractTimeout }
@@ -540,7 +633,23 @@ program
540
633
  touchProfile(resolvedProfileName);
541
634
  }
542
635
  if (spinner) {
543
- spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method`);
636
+ const domainTag = result.domainData
637
+ ? ` [${result.domainData.domain}:${result.domainData.type}]`
638
+ : '';
639
+ spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
640
+ }
641
+ // Show metadata header
642
+ const pageTitle = result.metadata?.title || result.title;
643
+ if (!options.silent && !options.json && pageTitle) {
644
+ const parts = [];
645
+ if (result.metadata?.author)
646
+ parts.push(`by ${result.metadata.author}`);
647
+ if (result.readability?.readingTime)
648
+ parts.push(result.readability.readingTime);
649
+ if (result.tokens)
650
+ parts.push(`${result.tokens.toLocaleString()} tokens`);
651
+ const subtitle = parts.length ? ` · ${parts.join(' · ')}` : '';
652
+ console.error(`\x1b[36m📄 ${pageTitle}${subtitle}\x1b[0m`);
544
653
  }
545
654
  // Show usage footer for free/anonymous users
546
655
  if (usageCheck.usageInfo && !options.silent) {
@@ -596,6 +705,36 @@ program
596
705
  result.focusReduction = focusResult.reductionPercent;
597
706
  }
598
707
  }
708
+ // --- LLM-free Quick Answer ---
709
+ if (options.question && result.content) {
710
+ const { quickAnswer } = await import('./core/quick-answer.js');
711
+ const qa = quickAnswer({
712
+ question: options.question,
713
+ content: result.content,
714
+ url: result.url,
715
+ });
716
+ result.quickAnswer = qa;
717
+ if (!isJson) {
718
+ // Display answer prominently in human-readable mode
719
+ const conf = (qa.confidence * 100).toFixed(0);
720
+ await writeStdout(`\n\x1b[36m📋 ${qa.question}\x1b[0m\n\n`);
721
+ if (qa.answer) {
722
+ await writeStdout(`\x1b[32m💡 Answer (${conf}% confidence):\x1b[0m\n${qa.answer}\n`);
723
+ }
724
+ else {
725
+ await writeStdout(`\x1b[33m💡 No relevant answer found (${conf}% confidence)\x1b[0m\n`);
726
+ }
727
+ if (qa.passages && qa.passages.length > 1) {
728
+ await writeStdout(`\n\x1b[33m📝 Supporting evidence:\x1b[0m\n`);
729
+ for (const p of qa.passages.slice(1, 4)) {
730
+ await writeStdout(` • [${(p.score * 100).toFixed(0)}%] ${p.text.substring(0, 200)}${p.text.length > 200 ? '...' : ''}\n`);
731
+ }
732
+ }
733
+ await writeStdout('\n');
734
+ await cleanup();
735
+ process.exit(0);
736
+ }
737
+ }
599
738
  // --- Smart Chunking ---
600
739
  if (options.chunk && options.chunk > 0 && result.content) {
601
740
  const { chunkContent } = await import('./core/chunking.js');
@@ -851,30 +990,10 @@ program
851
990
  process.exit(1);
852
991
  }
853
992
  if (error instanceof Error) {
854
- console.error(`\nError: ${error.message}`);
855
- // Provide actionable hints based on error type
856
- const msg = error.message.toLowerCase();
857
- if (msg.includes('timeout') || msg.includes('timed out')) {
858
- console.error('\n💡 Hint: Try --render for JS-heavy sites, or --wait 5000 to wait longer.');
859
- }
860
- else if (msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare')) {
861
- console.error('\n💡 Hint: Try --stealth to bypass bot detection (uses more credits).');
862
- }
863
- else if (msg.includes('enotfound') || msg.includes('getaddrinfo')) {
864
- console.error('\n💡 Hint: Could not resolve hostname. Check the URL is correct.');
865
- }
866
- else if (msg.includes('econnrefused') || msg.includes('econnreset')) {
867
- console.error('\n💡 Hint: Connection refused. The site may be down or blocking requests.');
868
- }
869
- else if (msg.includes('certificate') || msg.includes('ssl') || msg.includes('tls')) {
870
- console.error('\n💡 Hint: SSL/TLS error. The site may have an invalid certificate.');
871
- }
872
- else if (msg.includes('usage') || msg.includes('quota') || msg.includes('limit')) {
873
- console.error('\n💡 Hint: Run `webpeel usage` to check your quota, or `webpeel login` to authenticate.');
874
- }
993
+ console.error('\n' + formatError(error, url || '', options));
875
994
  }
876
995
  else {
877
- console.error('\nError: Unknown error occurred');
996
+ console.error('\x1b[31m✖ Unknown error occurred\x1b[0m');
878
997
  }
879
998
  await cleanup();
880
999
  process.exit(1);
@@ -3399,8 +3518,60 @@ async function outputResult(result, options, extra = {}) {
3399
3518
  }
3400
3519
  // Default: full output
3401
3520
  if (options.json) {
3402
- const envelope = buildEnvelope(result, extra);
3403
- await writeStdout(JSON.stringify(envelope, null, 2) + '\n');
3521
+ // Build clean JSON output with guaranteed top-level fields
3522
+ const output = {
3523
+ url: result.url,
3524
+ title: result.metadata?.title || result.title || null,
3525
+ tokens: result.tokens || 0,
3526
+ fetchedAt: new Date().toISOString(),
3527
+ method: result.method || 'simple',
3528
+ elapsed: result.elapsed,
3529
+ content: result.content,
3530
+ };
3531
+ // Add optional fields only if present (filter out undefined/null values from metadata)
3532
+ if (result.metadata) {
3533
+ const cleanMeta = {};
3534
+ for (const [k, v] of Object.entries(result.metadata)) {
3535
+ if (v !== undefined && v !== null)
3536
+ cleanMeta[k] = v;
3537
+ }
3538
+ if (Object.keys(cleanMeta).length > 0)
3539
+ output.metadata = cleanMeta;
3540
+ }
3541
+ if (result.links?.length)
3542
+ output.links = result.links;
3543
+ if (result.images?.length)
3544
+ output.images = result.images;
3545
+ if (result.structured)
3546
+ output.structured = result.structured;
3547
+ if (result.domainData)
3548
+ output.domainData = result.domainData;
3549
+ if (result.readability)
3550
+ output.readability = result.readability;
3551
+ if (result.quickAnswer)
3552
+ output.quickAnswer = result.quickAnswer;
3553
+ if (result.quality)
3554
+ output.quality = result.quality;
3555
+ if (result.contentType)
3556
+ output.contentType = result.contentType;
3557
+ if (result.chunks)
3558
+ output.chunks = result.chunks;
3559
+ if (result.totalChunks)
3560
+ output.totalChunks = result.totalChunks;
3561
+ if (result.warning)
3562
+ output.warning = result.warning;
3563
+ if (result.focusQuery)
3564
+ output.focusQuery = result.focusQuery;
3565
+ if (result.focusReduction)
3566
+ output.focusReduction = result.focusReduction;
3567
+ if (extra.cached)
3568
+ output.cached = true;
3569
+ if (extra.truncated)
3570
+ output.truncated = true;
3571
+ if (extra.totalAvailable !== undefined)
3572
+ output.totalAvailable = extra.totalAvailable;
3573
+ output._meta = { version: cliVersion, method: result.method || 'simple' };
3574
+ await writeStdout(JSON.stringify(output, null, 2) + '\n');
3404
3575
  }
3405
3576
  else {
3406
3577
  await writeStdout(result.content + '\n');