webpeel 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/LICENSE +11 -657
  2. package/README.md +246 -325
  3. package/dist/cli.js +330 -73
  4. package/dist/cli.js.map +1 -1
  5. package/dist/core/browser-fetch.d.ts +12 -0
  6. package/dist/core/browser-fetch.d.ts.map +1 -1
  7. package/dist/core/browser-fetch.js +70 -17
  8. package/dist/core/browser-fetch.js.map +1 -1
  9. package/dist/core/cf-worker-proxy.d.ts +33 -0
  10. package/dist/core/cf-worker-proxy.d.ts.map +1 -0
  11. package/dist/core/cf-worker-proxy.js +88 -0
  12. package/dist/core/cf-worker-proxy.js.map +1 -0
  13. package/dist/core/chunker.d.ts +47 -0
  14. package/dist/core/chunker.d.ts.map +1 -0
  15. package/dist/core/chunker.js +250 -0
  16. package/dist/core/chunker.js.map +1 -0
  17. package/dist/core/cloak-fetch.d.ts +43 -0
  18. package/dist/core/cloak-fetch.d.ts.map +1 -0
  19. package/dist/core/cloak-fetch.js +141 -0
  20. package/dist/core/cloak-fetch.js.map +1 -0
  21. package/dist/core/crawl-checkpoint.d.ts +55 -0
  22. package/dist/core/crawl-checkpoint.d.ts.map +1 -0
  23. package/dist/core/crawl-checkpoint.js +105 -0
  24. package/dist/core/crawl-checkpoint.js.map +1 -0
  25. package/dist/core/crawler.d.ts +5 -1
  26. package/dist/core/crawler.d.ts.map +1 -1
  27. package/dist/core/crawler.js +60 -5
  28. package/dist/core/crawler.js.map +1 -1
  29. package/dist/core/cycle-fetch.d.ts +27 -0
  30. package/dist/core/cycle-fetch.d.ts.map +1 -0
  31. package/dist/core/cycle-fetch.js +99 -0
  32. package/dist/core/cycle-fetch.js.map +1 -0
  33. package/dist/core/domain-extractors.d.ts.map +1 -1
  34. package/dist/core/domain-extractors.js +754 -14
  35. package/dist/core/domain-extractors.js.map +1 -1
  36. package/dist/core/google-cache.d.ts +30 -0
  37. package/dist/core/google-cache.d.ts.map +1 -0
  38. package/dist/core/google-cache.js +181 -0
  39. package/dist/core/google-cache.js.map +1 -0
  40. package/dist/core/markdown.d.ts +11 -0
  41. package/dist/core/markdown.d.ts.map +1 -1
  42. package/dist/core/markdown.js +43 -0
  43. package/dist/core/markdown.js.map +1 -1
  44. package/dist/core/peel-tls.d.ts +26 -0
  45. package/dist/core/peel-tls.d.ts.map +1 -0
  46. package/dist/core/peel-tls.js +221 -0
  47. package/dist/core/peel-tls.js.map +1 -0
  48. package/dist/core/pipeline.d.ts +5 -1
  49. package/dist/core/pipeline.d.ts.map +1 -1
  50. package/dist/core/pipeline.js +269 -21
  51. package/dist/core/pipeline.js.map +1 -1
  52. package/dist/core/schema-postprocess.d.ts +33 -0
  53. package/dist/core/schema-postprocess.d.ts.map +1 -0
  54. package/dist/core/schema-postprocess.js +470 -0
  55. package/dist/core/schema-postprocess.js.map +1 -0
  56. package/dist/core/schema-templates.d.ts +20 -0
  57. package/dist/core/schema-templates.d.ts.map +1 -0
  58. package/dist/core/schema-templates.js +131 -0
  59. package/dist/core/schema-templates.js.map +1 -0
  60. package/dist/core/search-fallback.d.ts +28 -0
  61. package/dist/core/search-fallback.d.ts.map +1 -0
  62. package/dist/core/search-fallback.js +185 -0
  63. package/dist/core/search-fallback.js.map +1 -0
  64. package/dist/core/search-provider.d.ts +47 -4
  65. package/dist/core/search-provider.d.ts.map +1 -1
  66. package/dist/core/search-provider.js +278 -7
  67. package/dist/core/search-provider.js.map +1 -1
  68. package/dist/core/stealth-patches.d.ts +58 -0
  69. package/dist/core/stealth-patches.d.ts.map +1 -0
  70. package/dist/core/stealth-patches.js +340 -0
  71. package/dist/core/stealth-patches.js.map +1 -0
  72. package/dist/core/strategies.d.ts +20 -0
  73. package/dist/core/strategies.d.ts.map +1 -1
  74. package/dist/core/strategies.js +284 -48
  75. package/dist/core/strategies.js.map +1 -1
  76. package/dist/core/strategy-hooks.d.ts +1 -1
  77. package/dist/core/strategy-hooks.d.ts.map +1 -1
  78. package/dist/index.d.ts +11 -0
  79. package/dist/index.d.ts.map +1 -1
  80. package/dist/index.js +37 -15
  81. package/dist/index.js.map +1 -1
  82. package/dist/mcp/server.js +109 -4
  83. package/dist/mcp/server.js.map +1 -1
  84. package/dist/server/app.d.ts.map +1 -1
  85. package/dist/server/app.js +29 -0
  86. package/dist/server/app.js.map +1 -1
  87. package/dist/server/middleware/rate-limit.d.ts +2 -1
  88. package/dist/server/middleware/rate-limit.d.ts.map +1 -1
  89. package/dist/server/middleware/rate-limit.js +24 -8
  90. package/dist/server/middleware/rate-limit.js.map +1 -1
  91. package/dist/server/routes/agent.d.ts +4 -0
  92. package/dist/server/routes/agent.d.ts.map +1 -1
  93. package/dist/server/routes/agent.js +196 -9
  94. package/dist/server/routes/agent.js.map +1 -1
  95. package/dist/server/routes/batch.js +5 -5
  96. package/dist/server/routes/batch.js.map +1 -1
  97. package/dist/server/routes/compat.d.ts.map +1 -1
  98. package/dist/server/routes/compat.js +1 -0
  99. package/dist/server/routes/compat.js.map +1 -1
  100. package/dist/server/routes/fetch.d.ts.map +1 -1
  101. package/dist/server/routes/fetch.js +60 -6
  102. package/dist/server/routes/fetch.js.map +1 -1
  103. package/dist/server/routes/mcp.d.ts.map +1 -1
  104. package/dist/server/routes/mcp.js +103 -2
  105. package/dist/server/routes/mcp.js.map +1 -1
  106. package/dist/server/routes/search.js +1 -1
  107. package/dist/server/routes/search.js.map +1 -1
  108. package/dist/types.d.ts +55 -4
  109. package/dist/types.d.ts.map +1 -1
  110. package/dist/types.js +4 -1
  111. package/dist/types.js.map +1 -1
  112. package/llms.txt +55 -125
  113. package/package.json +15 -1
package/dist/cli.js CHANGED
@@ -21,6 +21,7 @@ import { checkUsage, showUsageFooter, handleLogin, handleLogout, handleUsage, lo
21
21
  import { getCache, setCache, parseTTL, clearCache, cacheStats } from './cache.js';
22
22
  import { estimateTokens } from './core/markdown.js';
23
23
  import { distillToBudget, budgetListings } from './core/budget.js';
24
+ import { SCHEMA_TEMPLATES, getSchemaTemplate, listSchemaTemplates } from './core/schema-templates.js';
24
25
  const program = new Command();
25
26
  // Read version from package.json dynamically
26
27
  import { fileURLToPath } from 'url';
@@ -178,10 +179,15 @@ program
178
179
  .argument('[url]', 'URL to fetch')
179
180
  .option('-r, --render', 'Use headless browser (for JS-heavy sites)')
180
181
  .option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
182
+ .option('--cloaked', 'Use CloakBrowser stealth (requires: npm install cloakbrowser)')
183
+ .option('--tls', 'Use PeelTLS TLS fingerprint spoofing (built-in, no install needed)')
184
+ .option('--cycle', 'Use PeelTLS TLS fingerprint spoofing (alias for --tls)', false)
181
185
  .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
186
+ .option('--proxies <urls>', 'Comma-separated list of proxy URLs for rotation (tried in order on failure)', (val) => val.split(',').map((s) => s.trim()).filter(Boolean))
182
187
  .option('-w, --wait <ms>', 'Wait time after page load (ms)', parseInt)
183
188
  .option('--html', 'Output raw HTML instead of markdown')
184
189
  .option('--text', 'Output plain text instead of markdown')
190
+ .option('--clean', 'Output clean text optimized for AI (strips URLs, keeps structure)')
185
191
  .option('--json', 'Output as JSON')
186
192
  .option('-t, --timeout <ms>', 'Request timeout (ms)', (v) => parseInt(v, 10), 30000)
187
193
  .option('--ua <agent>', 'Custom user agent')
@@ -196,9 +202,10 @@ program
196
202
  .option('--full-content', 'Return full page content (disable automatic content density pruning)')
197
203
  .option('--readable', 'Reader mode — extract only the main article content, strip all noise (like browser Reader Mode)')
198
204
  .option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
199
- .option('--chunk <size>', 'Split content into N-token chunks for LLM processing (default strategy: semantic)', parseInt)
200
- .option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 200)', parseInt)
201
- .option('--chunk-strategy <strategy>', 'Chunking strategy: fixed, semantic (default), paragraph')
205
+ .option('--chunk', 'Split content into RAG-ready chunks')
206
+ .option('--chunk-size <tokens>', 'Max tokens per chunk (default: 512)', parseInt)
207
+ .option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 50)', parseInt)
208
+ .option('--chunk-strategy <strategy>', 'Chunking strategy: section (default), paragraph, fixed')
202
209
  .option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
203
210
  .option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
204
211
  .option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
@@ -231,7 +238,15 @@ program
231
238
  .option('--profile <path>', 'Use a persistent browser profile directory (cookies/sessions survive between calls)')
232
239
  .option('--headed', 'Run browser in headed (visible) mode — useful for profile setup and debugging')
233
240
  .option('-q, --question <q>', 'Ask a question about the page content (BM25-powered, no LLM key needed)')
234
- .option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)');
241
+ .option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)')
242
+ .option('--device <type>', 'Device emulation: desktop (default), mobile, tablet (auto-enables --render)')
243
+ .option('--viewport <WxH>', 'Browser viewport size (e.g., "1920x1080") (auto-enables --render)', (val) => {
244
+ const [w, h] = val.split('x').map(Number);
245
+ return { width: w, height: h };
246
+ })
247
+ .option('--wait-until <event>', 'Page load event: domcontentloaded, networkidle, load, commit (auto-enables --render)')
248
+ .option('--wait-selector <css>', 'Wait for CSS selector before extracting (auto-enables --render)')
249
+ .option('--block-resources <types>', 'Block resource types, comma-separated: image,stylesheet,font,media,script (auto-enables --render)');
235
250
  program.configureHelp({
236
251
  sortSubcommands: true,
237
252
  showGlobalOptions: false,
@@ -273,9 +288,24 @@ Examples:
273
288
  $ webpeel "https://nytimes.com/article" --readable Reader mode
274
289
  $ webpeel search "best restaurants in NYC" Web search
275
290
  $ webpeel hotels "Manhattan" --checkin tomorrow Hotel search
291
+
292
+ Agent Integration:
293
+ $ webpeel mcp Start MCP server
294
+ $ cat urls.txt | webpeel batch Batch from stdin
295
+ $ webpeel pipe "https://example.com" | jq .content Pipe-friendly JSON
296
+ $ webpeel "https://site.com" --json --silent Same as pipe
297
+ $ curl https://webpeel.dev/llms.txt AI-readable docs
276
298
  `);
277
- program
278
- .action(async (url, options) => {
299
+ // Main fetch handler — shared with the `pipe` subcommand
300
+ async function runFetch(url, options) {
301
+ // Smart defaults: when piped (not a TTY), default to silent JSON
302
+ const isPiped = !process.stdout.isTTY;
303
+ if (isPiped && !options.html && !options.text) {
304
+ if (!options.json)
305
+ options.json = true;
306
+ if (!options.silent)
307
+ options.silent = true;
308
+ }
279
309
  // --agent sets sensible defaults for AI agents; explicit flags override
280
310
  if (options.agent) {
281
311
  if (!options.json)
@@ -471,6 +501,21 @@ program
471
501
  process.exit(0);
472
502
  }
473
503
  }
504
+ // --- BM25 Schema Template Extraction (cached path) ---
505
+ if (options.schema && cachedResult.content) {
506
+ const { getSchemaTemplate: getSchTmplCached } = await import('./core/schema-templates.js');
507
+ const schTemplateCached = getSchTmplCached(options.schema);
508
+ if (schTemplateCached) {
509
+ const { quickAnswer: qaCached } = await import('./core/quick-answer.js');
510
+ const { smartExtractSchemaFields: smartExtractCached } = await import('./core/schema-postprocess.js');
511
+ const extractedCached = smartExtractCached(cachedResult.content, schTemplateCached.fields, qaCached, {
512
+ pageTitle: cachedResult.title,
513
+ pageUrl: cachedResult.url,
514
+ metadata: cachedResult.metadata,
515
+ });
516
+ cachedResult.extracted = extractedCached;
517
+ }
518
+ }
474
519
  await outputResult(cachedResult, options, { cached: true });
475
520
  process.exit(0);
476
521
  }
@@ -592,7 +637,13 @@ program
592
637
  const scrollExtractCount = isAutoScroll
593
638
  ? 0
594
639
  : (scrollExtractRaw !== undefined ? scrollExtractRaw : 0);
595
- const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || isAutoScroll || false;
640
+ const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || isAutoScroll
641
+ || (options.device && options.device !== 'desktop')
642
+ || !!options.viewport
643
+ || !!options.waitUntil
644
+ || !!options.waitSelector
645
+ || !!options.blockResources
646
+ || false;
596
647
  // Inject scroll actions when --scroll-extract N (fixed count) is used
597
648
  if (scrollExtractCount > 0) {
598
649
  const scrollActions = [];
@@ -630,13 +681,34 @@ program
630
681
  headed: options.headed || false,
631
682
  storageState: resolvedStorageState,
632
683
  proxy: options.proxy,
684
+ proxies: options.proxies,
633
685
  fullPage: options.fullContent || false,
634
686
  readable: options.readable || false,
635
687
  // Smart auto-scroll (bare --scroll-extract flag)
636
688
  autoScroll: isAutoScroll
637
689
  ? { timeout: options.scrollExtractTimeout }
638
690
  : undefined,
691
+ device: options.device,
692
+ viewportWidth: options.viewport ? options.viewport.width : undefined,
693
+ viewportHeight: options.viewport ? options.viewport.height : undefined,
694
+ waitUntil: options.waitUntil,
695
+ waitSelector: options.waitSelector,
696
+ blockResources: options.blockResources ? options.blockResources.split(',').map((s) => s.trim()) : undefined,
697
+ cloaked: options.cloaked ? true : undefined,
698
+ cycle: options.cycle ? true : undefined,
699
+ tls: (options.tls || options.cycle) ? true : undefined,
639
700
  };
701
+ if (options.cloaked) {
702
+ peelOptions.render = true; // CloakBrowser is a browser
703
+ }
704
+ // Add chunk option if requested
705
+ if (options.chunk) {
706
+ peelOptions.chunk = {
707
+ maxTokens: options.chunkSize || 512,
708
+ overlap: options.chunkOverlap || 50,
709
+ strategy: options.chunkStrategy || 'section',
710
+ };
711
+ }
640
712
  // Add summary option if requested
641
713
  if (options.summary) {
642
714
  const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
@@ -657,6 +729,9 @@ program
657
729
  else if (options.text) {
658
730
  peelOptions.format = 'text';
659
731
  }
732
+ else if (options.clean) {
733
+ peelOptions.format = 'clean';
734
+ }
660
735
  else {
661
736
  peelOptions.format = 'markdown';
662
737
  }
@@ -769,28 +844,15 @@ program
769
844
  process.exit(0);
770
845
  }
771
846
  }
772
- // --- Smart Chunking ---
773
- if (options.chunk && options.chunk > 0 && result.content) {
774
- const { chunkContent } = await import('./core/chunking.js');
775
- const chunkResult = chunkContent(result.content, {
776
- chunkSize: options.chunk,
777
- overlap: options.chunkOverlap || 200,
778
- strategy: options.chunkStrategy || 'semantic',
779
- });
780
- // Replace content with chunked output
781
- if (isJson) {
782
- result.chunks = chunkResult.chunks;
783
- result.totalChunks = chunkResult.totalChunks;
784
- result.originalTokens = chunkResult.originalTokens;
785
- // Keep content as first chunk for non-JSON fallback
786
- result.content = chunkResult.chunks[0]?.content || '';
787
- result.tokens = chunkResult.chunks[0]?.tokens || 0;
788
- }
789
- else {
790
- // Plain text mode: output chunks separated by markers
791
- const chunkOutput = chunkResult.chunks.map((c, i) => `--- Chunk ${i + 1}/${chunkResult.totalChunks} (${c.tokens} tokens) ---\n${c.content}`).join('\n\n');
792
- result.content = chunkOutput;
793
- result.tokens = chunkResult.totalTokens;
847
+ // --- RAG Chunking output (chunks come from pipeline via peelOptions.chunk) ---
848
+ if (result.chunks && result.chunks.length > 0 && !isJson) {
849
+ console.log(`\n${'─'.repeat(60)}`);
850
+ console.log(`📦 ${result.chunks.length} chunks (${options.chunkStrategy || 'section'} strategy)\n`);
851
+ for (const chunk of result.chunks) {
852
+ const sectionLabel = chunk.section ? ` [${chunk.section}]` : '';
853
+ console.log(`── Chunk ${chunk.index + 1}${sectionLabel} (${chunk.tokenCount} tokens, ${chunk.wordCount} words) ──`);
854
+ console.log(chunk.text.substring(0, 200) + (chunk.text.length > 200 ? '...' : ''));
855
+ console.log('');
794
856
  }
795
857
  }
796
858
  // --- #4: Content quality warning ---
@@ -1001,6 +1063,21 @@ program
1001
1063
  }
1002
1064
  }
1003
1065
  else {
1066
+ // --- BM25 Schema Template Extraction (no LLM needed) ---
1067
+ if (options.schema && result.content) {
1068
+ const { getSchemaTemplate: getSchTmpl } = await import('./core/schema-templates.js');
1069
+ const schTemplate = getSchTmpl(options.schema);
1070
+ if (schTemplate) {
1071
+ const { quickAnswer: qa } = await import('./core/quick-answer.js');
1072
+ const { smartExtractSchemaFields } = await import('./core/schema-postprocess.js');
1073
+ const extracted = smartExtractSchemaFields(result.content, schTemplate.fields, qa, {
1074
+ pageTitle: result.title,
1075
+ pageUrl: result.url,
1076
+ metadata: result.metadata,
1077
+ });
1078
+ result.extracted = extracted;
1079
+ }
1080
+ }
1004
1081
  // Output results (default path)
1005
1082
  await outputResult(result, options, {
1006
1083
  cached: false,
@@ -1032,6 +1109,10 @@ program
1032
1109
  await cleanup();
1033
1110
  process.exit(1);
1034
1111
  }
1112
+ }
1113
+ program
1114
+ .action(async (url, options) => {
1115
+ await runFetch(url, options);
1035
1116
  });
1036
1117
  // Search command
1037
1118
  program
@@ -1423,6 +1504,7 @@ program
1423
1504
  .option('--stealth', 'Use stealth mode for all pages')
1424
1505
  .option('-s, --silent', 'Silent mode (no spinner)')
1425
1506
  .option('--json', 'Output as JSON')
1507
+ .option('--resume', 'Resume an interrupted crawl from its last checkpoint')
1426
1508
  .action(async (url, options) => {
1427
1509
  // Check usage quota
1428
1510
  const usageCheck = await checkUsage();
@@ -1442,6 +1524,7 @@ program
1442
1524
  rateLimitMs: options.rateLimit,
1443
1525
  render: options.render || false,
1444
1526
  stealth: options.stealth || false,
1527
+ resume: options.resume || false,
1445
1528
  });
1446
1529
  if (spinner) {
1447
1530
  spinner.succeed(`Crawled ${results.length} pages`);
@@ -1782,6 +1865,23 @@ program
1782
1865
  .action(async () => {
1783
1866
  await import('./mcp/server.js');
1784
1867
  });
1868
+ // Pipe command — always JSON, no UI (agent-friendly)
1869
+ program
1870
+ .command('pipe <url>')
1871
+ .description('Pipe-friendly fetch (always JSON, no UI). Alias for: webpeel <url> --json --silent')
1872
+ .option('-r, --render', 'Use headless browser')
1873
+ .option('--stealth', 'Stealth mode')
1874
+ .option('--budget <n>', 'Token budget', parseInt)
1875
+ .option('--clean', 'Clean format for AI')
1876
+ .option('-q, --question <q>', 'Quick answer')
1877
+ .option('--proxy <url>', 'Proxy URL')
1878
+ .option('--timeout <ms>', 'Timeout in ms', parseInt)
1879
+ .action(async (url, opts) => {
1880
+ // Force JSON + silent — always, unconditionally
1881
+ opts.json = true;
1882
+ opts.silent = true;
1883
+ await runFetch(url, opts);
1884
+ });
1785
1885
  // Config command — webpeel config [get|set] [key] [value]
1786
1886
  program
1787
1887
  .command('config')
@@ -2078,69 +2178,210 @@ program
2078
2178
  // Agent command - autonomous web research
2079
2179
  program
2080
2180
  .command('agent <prompt>')
2081
- .description('Autonomous web research — finds and extracts data from the web using AI')
2181
+ .description('Web research agent — LLM-free by default, add --llm-key for AI synthesis')
2082
2182
  .option('--llm-key <key>', 'LLM API key (or use OPENAI_API_KEY env var)')
2083
2183
  .option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
2084
2184
  .option('--llm-base-url <url>', 'LLM API base URL')
2085
2185
  .option('--urls <urls>', 'Comma-separated starting URLs')
2086
2186
  .option('--max-pages <n>', 'Maximum pages to visit (default: 10)', '10')
2087
- .option('--schema <json>', 'JSON schema for structured output')
2187
+ .option('--schema <json>', 'Schema template name (e.g. product, article) or JSON schema for structured output')
2088
2188
  .option('-s, --silent', 'Silent mode (no spinner)')
2089
2189
  .option('--json', 'Output as JSON')
2090
2190
  .action(async (prompt, options) => {
2091
2191
  const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
2092
- if (!llmApiKey) {
2093
- console.error('Error: --llm-key or OPENAI_API_KEY environment variable is required');
2094
- process.exit(1);
2095
- }
2096
- const spinner = options.silent ? null : ora('Running agent research...').start();
2097
- try {
2098
- const { runAgent } = await import('./core/agent.js');
2099
- let schema;
2100
- if (options.schema) {
2192
+ const urls = options.urls ? options.urls.split(',').map((u) => u.trim()) : undefined;
2193
+ // Parse schema (support templates)
2194
+ let schema;
2195
+ if (options.schema) {
2196
+ const template = getSchemaTemplate(options.schema);
2197
+ if (template) {
2198
+ schema = template.fields;
2199
+ }
2200
+ else {
2101
2201
  try {
2102
2202
  schema = JSON.parse(options.schema);
2103
2203
  }
2104
2204
  catch {
2105
- console.error('Error: --schema must be valid JSON');
2205
+ console.error(`Error: --schema must be a template name (${listSchemaTemplates().join(', ')}) or valid JSON`);
2106
2206
  process.exit(1);
2107
2207
  }
2108
2208
  }
2109
- const result = await runAgent({
2110
- prompt,
2111
- urls: options.urls ? options.urls.split(',').map((u) => u.trim()) : undefined,
2112
- schema,
2113
- llmApiKey,
2114
- llmModel: options.llmModel,
2115
- llmApiBase: options.llmBaseUrl,
2116
- maxPages: parseInt(options.maxPages, 10),
2117
- onProgress: (progress) => {
2118
- if (spinner) {
2119
- spinner.text = progress.message;
2120
- }
2121
- },
2122
- });
2123
- if (spinner) {
2124
- spinner.succeed(`Agent finished: ${result.pagesVisited} pages, ${result.creditsUsed} credits`);
2125
- }
2126
- if (options.json) {
2127
- console.log(JSON.stringify(result, null, 2));
2209
+ }
2210
+ if (llmApiKey) {
2211
+ // Full LLM agent mode (existing code)
2212
+ const spinner = options.silent ? null : ora('Running agent research...').start();
2213
+ try {
2214
+ const { runAgent } = await import('./core/agent.js');
2215
+ const result = await runAgent({
2216
+ prompt,
2217
+ urls,
2218
+ schema,
2219
+ llmApiKey,
2220
+ llmModel: options.llmModel,
2221
+ llmApiBase: options.llmBaseUrl,
2222
+ maxPages: parseInt(options.maxPages, 10),
2223
+ onProgress: (progress) => {
2224
+ if (spinner)
2225
+ spinner.text = progress.message;
2226
+ },
2227
+ });
2228
+ if (spinner)
2229
+ spinner.succeed(`Agent finished: ${result.pagesVisited} pages`);
2230
+ if (options.json) {
2231
+ console.log(JSON.stringify(result, null, 2));
2232
+ }
2233
+ else {
2234
+ console.log(`\nSources (${result.sources.length}):`);
2235
+ result.sources.forEach(s => console.log(` • ${s}`));
2236
+ console.log(`\nResults:`);
2237
+ console.log(JSON.stringify(result.data, null, 2));
2238
+ }
2239
+ await cleanup();
2240
+ process.exit(0);
2128
2241
  }
2129
- else {
2130
- console.log(`\nSources (${result.sources.length}):`);
2131
- result.sources.forEach(s => console.log(` • ${s}`));
2132
- console.log(`\nResults:`);
2133
- console.log(JSON.stringify(result.data, null, 2));
2242
+ catch (e) {
2243
+ if (spinner)
2244
+ spinner.fail('Agent failed');
2245
+ console.error(e instanceof Error ? e.message : e);
2246
+ await cleanup();
2247
+ process.exit(1);
2134
2248
  }
2135
- await cleanup();
2136
- process.exit(0);
2137
2249
  }
2138
- catch (error) {
2139
- if (spinner)
2140
- spinner.fail('Agent research failed');
2141
- console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
2142
- await cleanup();
2143
- process.exit(1);
2250
+ else {
2251
+ // LLM-free mode: search + fetch + BM25 extraction
2252
+ const spinner = options.silent ? null : ora('Running LLM-free research...').start();
2253
+ try {
2254
+ // Import needed modules
2255
+ const { quickAnswer } = await import('./core/quick-answer.js');
2256
+ // Step 1: Get URLs to process
2257
+ let targetUrls = urls || [];
2258
+ // If no URLs, search the web
2259
+ if (targetUrls.length === 0) {
2260
+ if (spinner)
2261
+ spinner.text = 'Searching the web...';
2262
+ try {
2263
+ const { getBestSearchProvider } = await import('./core/search-provider.js');
2264
+ const { provider, apiKey: searchApiKey } = getBestSearchProvider();
2265
+ const searchResults = await provider.searchWeb(prompt, {
2266
+ count: Math.min(parseInt(options.maxPages, 10) || 5, 10),
2267
+ apiKey: searchApiKey,
2268
+ });
2269
+ targetUrls = searchResults.map((r) => r.url);
2270
+ }
2271
+ catch {
2272
+ // Fallback: try DuckDuckGo HTML
2273
+ if (spinner)
2274
+ spinner.text = 'Searching via DuckDuckGo...';
2275
+ try {
2276
+ const duckUrl = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(prompt)}`;
2277
+ const searchResult = await peel(duckUrl, { budget: 4000 });
2278
+ // Extract URLs from search results content
2279
+ const urlMatches = searchResult.content.match(/https?:\/\/[^\s\)]+/g) || [];
2280
+ targetUrls = urlMatches
2281
+ .filter((u) => !u.includes('duckduckgo.com'))
2282
+ .slice(0, parseInt(options.maxPages, 10) || 5);
2283
+ }
2284
+ catch {
2285
+ // No search results
2286
+ }
2287
+ }
2288
+ }
2289
+ if (targetUrls.length === 0) {
2290
+ if (spinner)
2291
+ spinner.fail('No URLs found. Provide --urls or a more specific prompt.');
2292
+ process.exit(1);
2293
+ }
2294
+ if (spinner)
2295
+ spinner.text = `Processing ${targetUrls.length} pages...`;
2296
+ // Step 2: Fetch and extract from each URL
2297
+ const results = [];
2298
+ for (const url of targetUrls) {
2299
+ try {
2300
+ if (spinner)
2301
+ spinner.text = `Fetching: ${url.substring(0, 60)}...`;
2302
+ const pageResult = await peel(url, { budget: 4000 });
2303
+ let extracted = null;
2304
+ let confidence = 0;
2305
+ if (schema) {
2306
+ // Extract each schema field using smartExtractSchemaFields
2307
+ const { smartExtractSchemaFields: smartExtractResearch } = await import('./core/schema-postprocess.js');
2308
+ extracted = smartExtractResearch(pageResult.content, schema, quickAnswer, {
2309
+ pageTitle: pageResult.title,
2310
+ pageUrl: url,
2311
+ metadata: pageResult.metadata,
2312
+ });
2313
+ // Calculate confidence from quickAnswer for any field
2314
+ for (const question of Object.values(schema)) {
2315
+ try {
2316
+ const qa = quickAnswer({ content: pageResult.content, question: typeof question === 'string' ? question : '' });
2317
+ confidence = Math.max(confidence, qa.confidence || 0);
2318
+ }
2319
+ catch { /* ignore */ }
2320
+ break; // just need one confidence estimate
2321
+ }
2322
+ }
2323
+ else {
2324
+ // Answer the prompt directly
2325
+ try {
2326
+ const qa = quickAnswer({ content: pageResult.content, question: prompt });
2327
+ extracted = { answer: qa.answer || '' };
2328
+ confidence = qa.confidence || 0;
2329
+ }
2330
+ catch {
2331
+ extracted = null;
2332
+ }
2333
+ }
2334
+ results.push({
2335
+ url,
2336
+ title: pageResult.metadata?.title || url,
2337
+ extracted,
2338
+ content: pageResult.content.substring(0, 500),
2339
+ confidence,
2340
+ });
2341
+ }
2342
+ catch (e) {
2343
+ // Skip failed URLs
2344
+ if (process.env.DEBUG) {
2345
+ console.debug('[webpeel]', `Failed to fetch ${url}:`, e instanceof Error ? e.message : e);
2346
+ }
2347
+ }
2348
+ }
2349
+ if (spinner)
2350
+ spinner.succeed(`Processed ${results.length}/${targetUrls.length} pages (LLM-free)`);
2351
+ if (options.json) {
2352
+ console.log(JSON.stringify({
2353
+ mode: 'llm-free',
2354
+ prompt,
2355
+ schema: schema || null,
2356
+ results,
2357
+ sources: results.map(r => r.url),
2358
+ pagesVisited: results.length,
2359
+ }, null, 2));
2360
+ }
2361
+ else {
2362
+ console.log(`\n📊 Results (${results.length} pages, LLM-free):\n`);
2363
+ for (const r of results) {
2364
+ console.log(`── ${r.title} ──`);
2365
+ console.log(` ${r.url}`);
2366
+ if (r.extracted) {
2367
+ for (const [k, v] of Object.entries(r.extracted)) {
2368
+ if (v)
2369
+ console.log(` ${k}: ${v}`);
2370
+ }
2371
+ }
2372
+ console.log(` Confidence: ${(r.confidence * 100).toFixed(0)}%\n`);
2373
+ }
2374
+ }
2375
+ await cleanup();
2376
+ process.exit(0);
2377
+ }
2378
+ catch (e) {
2379
+ if (spinner)
2380
+ spinner.fail('Research failed');
2381
+ console.error(e instanceof Error ? e.message : e);
2382
+ await cleanup();
2383
+ process.exit(1);
2384
+ }
2144
2385
  }
2145
2386
  });
2146
2387
  // ── Jobs command group ─────────────────────────────────────────────────────
@@ -3403,6 +3644,20 @@ program
3403
3644
  process.exit(1);
3404
3645
  }
3405
3646
  });
3647
+ // Schema templates listing command
3648
+ program
3649
+ .command('schemas')
3650
+ .description('List available extraction schema templates')
3651
+ .action(() => {
3652
+ console.log('\nAvailable schema templates:\n');
3653
+ for (const [key, template] of Object.entries(SCHEMA_TEMPLATES)) {
3654
+ console.log(` ${key.padEnd(12)} ${template.description}`);
3655
+ console.log(` ${''.padEnd(12)} Fields: ${Object.keys(template.fields).join(', ')}`);
3656
+ console.log('');
3657
+ }
3658
+ console.log('Usage: webpeel "https://example.com" --schema product');
3659
+ console.log(' webpeel "https://example.com" --schema \'{"field":"description"}\'');
3660
+ });
3406
3661
  program.parse();
3407
3662
  // ============================================================
3408
3663
  // Time formatting helper
@@ -3598,6 +3853,8 @@ async function outputResult(result, options, extra = {}) {
3598
3853
  output.focusQuery = result.focusQuery;
3599
3854
  if (result.focusReduction)
3600
3855
  output.focusReduction = result.focusReduction;
3856
+ if (result.extracted)
3857
+ output.extracted = result.extracted;
3601
3858
  if (extra.cached)
3602
3859
  output.cached = true;
3603
3860
  if (extra.truncated)