webpeel 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/cli.js +86 -3
  2. package/dist/cli.js.map +1 -1
  3. package/dist/core/bm25-filter.d.ts +57 -0
  4. package/dist/core/bm25-filter.d.ts.map +1 -0
  5. package/dist/core/bm25-filter.js +249 -0
  6. package/dist/core/bm25-filter.js.map +1 -0
  7. package/dist/core/chunking.d.ts +43 -0
  8. package/dist/core/chunking.d.ts.map +1 -0
  9. package/dist/core/chunking.js +182 -0
  10. package/dist/core/chunking.js.map +1 -0
  11. package/dist/core/content-pruner.d.ts +33 -0
  12. package/dist/core/content-pruner.d.ts.map +1 -0
  13. package/dist/core/content-pruner.js +249 -0
  14. package/dist/core/content-pruner.js.map +1 -0
  15. package/dist/core/fetcher.d.ts +8 -1
  16. package/dist/core/fetcher.d.ts.map +1 -1
  17. package/dist/core/fetcher.js +33 -7
  18. package/dist/core/fetcher.js.map +1 -1
  19. package/dist/core/hotel-search.d.ts +2 -0
  20. package/dist/core/hotel-search.d.ts.map +1 -1
  21. package/dist/core/hotel-search.js +2 -0
  22. package/dist/core/hotel-search.js.map +1 -1
  23. package/dist/core/llm-extract.d.ts +15 -1
  24. package/dist/core/llm-extract.d.ts.map +1 -1
  25. package/dist/core/llm-extract.js +127 -7
  26. package/dist/core/llm-extract.js.map +1 -1
  27. package/dist/core/markdown.d.ts +4 -1
  28. package/dist/core/markdown.d.ts.map +1 -1
  29. package/dist/core/markdown.js +11 -2
  30. package/dist/core/markdown.js.map +1 -1
  31. package/dist/core/strategies.d.ts +6 -0
  32. package/dist/core/strategies.d.ts.map +1 -1
  33. package/dist/core/strategies.js +10 -5
  34. package/dist/core/strategies.js.map +1 -1
  35. package/dist/index.d.ts.map +1 -1
  36. package/dist/index.js +17 -2
  37. package/dist/index.js.map +1 -1
  38. package/dist/mcp/server.js +47 -3
  39. package/dist/mcp/server.js.map +1 -1
  40. package/dist/types.d.ts +19 -0
  41. package/dist/types.d.ts.map +1 -1
  42. package/dist/types.js.map +1 -1
  43. package/package.json +1 -1
package/dist/cli.js CHANGED
@@ -131,6 +131,7 @@ program
131
131
  .argument('[url]', 'URL to fetch')
132
132
  .option('-r, --render', 'Use headless browser (for JS-heavy sites)')
133
133
  .option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
134
+ .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
134
135
  .option('-w, --wait <ms>', 'Wait time after page load (ms)', parseInt)
135
136
  .option('--html', 'Output raw HTML instead of markdown')
136
137
  .option('--text', 'Output plain text instead of markdown')
@@ -145,6 +146,11 @@ program
145
146
  .option('--include-tags <tags>', 'Comma-separated HTML tags/selectors to include (e.g., "main,article,.content")')
146
147
  .option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
147
148
  .option('--only-main-content', 'Shortcut for --include-tags main,article')
149
+ .option('--full-content', 'Return full page content (disable automatic content density pruning)')
150
+ .option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
151
+ .option('--chunk <size>', 'Split content into N-token chunks for LLM processing (default strategy: semantic)', parseInt)
152
+ .option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 200)', parseInt)
153
+ .option('--chunk-strategy <strategy>', 'Chunking strategy: fixed, semantic (default), paragraph')
148
154
  .option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
149
155
  .option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
150
156
  .option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
@@ -156,6 +162,7 @@ program
156
162
  .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
157
163
  .option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
158
164
  .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
165
+ .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
159
166
  .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
160
167
  .option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
161
168
  .option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
@@ -296,7 +303,7 @@ program
296
303
  cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
297
304
  }
298
305
  // LLM extraction from cached content
299
- if (options.llmExtract) {
306
+ if (options.llmExtract || options.extractSchema) {
300
307
  const { extractWithLLM } = await import('./core/llm-extract.js');
301
308
  const llmCfgCached = loadConfig();
302
309
  const llmApiKeyCached = options.llmKey || llmCfgCached.llm?.apiKey || process.env.OPENAI_API_KEY;
@@ -307,9 +314,25 @@ program
307
314
  const llmModelCached = options.llmModel || llmCfgCached.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
308
315
  const llmBaseUrlCached = options.llmBaseUrl || llmCfgCached.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
309
316
  const llmInstructionCached = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
317
+ // Parse schema if provided
318
+ let llmSchemaCached;
319
+ if (options.extractSchema) {
320
+ let schemaStr = options.extractSchema;
321
+ if (schemaStr.startsWith('@')) {
322
+ schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
323
+ }
324
+ try {
325
+ llmSchemaCached = JSON.parse(schemaStr);
326
+ }
327
+ catch {
328
+ console.error('Error: --extract-schema must be valid JSON or a valid @file.json path');
329
+ process.exit(1);
330
+ }
331
+ }
310
332
  const llmResultCached = await extractWithLLM({
311
333
  content: cachedResult.content,
312
334
  instruction: llmInstructionCached,
335
+ schema: llmSchemaCached,
313
336
  apiKey: llmApiKeyCached,
314
337
  model: llmModelCached,
315
338
  baseUrl: llmBaseUrlCached,
@@ -356,9 +379,13 @@ program
356
379
  throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
357
380
  }
358
381
  }
382
+ // --extract-schema auto-enables JSON output
383
+ if (options.extractSchema) {
384
+ options.json = true;
385
+ }
359
386
  // Parse extract
360
387
  let extract;
361
- if (options.llmExtract) {
388
+ if (options.llmExtract || options.extractSchema) {
362
389
  // LLM-based extraction is handled post-fetch (after peel returns markdown).
363
390
  // Early-validate that an API key is available so we fail fast.
364
391
  const llmCfg = loadConfig();
@@ -470,6 +497,8 @@ program
470
497
  profileDir: resolvedProfileDir,
471
498
  headed: options.headed || false,
472
499
  storageState: resolvedStorageState,
500
+ proxy: options.proxy,
501
+ fullPage: options.fullContent || false,
473
502
  };
474
503
  // Add summary option if requested
475
504
  if (options.summary) {
@@ -546,6 +575,41 @@ program
546
575
  result.tokens = estimateTokens(distilled);
547
576
  }
548
577
  }
578
+ // --- BM25 Query-Focused Filtering ---
579
+ if (options.focus && result.content) {
580
+ const { filterByRelevance } = await import('./core/bm25-filter.js');
581
+ const focusResult = filterByRelevance(result.content, { query: options.focus });
582
+ result.content = focusResult.content;
583
+ result.tokens = estimateTokens(focusResult.content);
584
+ if (isJson) {
585
+ result.focusQuery = options.focus;
586
+ result.focusReduction = focusResult.reductionPercent;
587
+ }
588
+ }
589
+ // --- Smart Chunking ---
590
+ if (options.chunk && options.chunk > 0 && result.content) {
591
+ const { chunkContent } = await import('./core/chunking.js');
592
+ const chunkResult = chunkContent(result.content, {
593
+ chunkSize: options.chunk,
594
+ overlap: options.chunkOverlap || 200,
595
+ strategy: options.chunkStrategy || 'semantic',
596
+ });
597
+ // Replace content with chunked output
598
+ if (isJson) {
599
+ result.chunks = chunkResult.chunks;
600
+ result.totalChunks = chunkResult.totalChunks;
601
+ result.originalTokens = chunkResult.originalTokens;
602
+ // Keep content as first chunk for non-JSON fallback
603
+ result.content = chunkResult.chunks[0]?.content || '';
604
+ result.tokens = chunkResult.chunks[0]?.tokens || 0;
605
+ }
606
+ else {
607
+ // Plain text mode: output chunks separated by markers
608
+ const chunkOutput = chunkResult.chunks.map((c, i) => `--- Chunk ${i + 1}/${chunkResult.totalChunks} (${c.tokens} tokens) ---\n${c.content}`).join('\n\n');
609
+ result.content = chunkOutput;
610
+ result.tokens = chunkResult.totalTokens;
611
+ }
612
+ }
549
613
  // --- #4: Content quality warning ---
550
614
  const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
551
615
  const isRedirect = false; // peel() follows redirects — final result is always 200
@@ -559,16 +623,31 @@ program
559
623
  }
560
624
  }
561
625
  // --- LLM-based extraction (post-peel) ---
562
- if (options.llmExtract) {
626
+ if (options.llmExtract || options.extractSchema) {
563
627
  const { extractWithLLM } = await import('./core/llm-extract.js');
564
628
  const llmCfg = loadConfig();
565
629
  const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
566
630
  const llmModel = options.llmModel || llmCfg.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
567
631
  const llmBaseUrl = options.llmBaseUrl || llmCfg.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
568
632
  const llmInstruction = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
633
+ // Parse --extract-schema if provided
634
+ let llmSchema;
635
+ if (options.extractSchema) {
636
+ let schemaStr = options.extractSchema;
637
+ if (schemaStr.startsWith('@')) {
638
+ schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
639
+ }
640
+ try {
641
+ llmSchema = JSON.parse(schemaStr);
642
+ }
643
+ catch {
644
+ exitWithJsonError('--extract-schema must be valid JSON or a valid @file.json path', 'FETCH_FAILED');
645
+ }
646
+ }
569
647
  const llmResult = await extractWithLLM({
570
648
  content: result.content,
571
649
  instruction: llmInstruction,
650
+ schema: llmSchema,
572
651
  apiKey: llmApiKey,
573
652
  model: llmModel,
574
653
  baseUrl: llmBaseUrl,
@@ -806,6 +885,7 @@ program
806
885
  .option('--csv', 'Output site-search results as CSV (requires --site)')
807
886
  .option('--budget <n>', 'Token budget for site-search result content', parseInt)
808
887
  .option('-s, --silent', 'Silent mode')
888
+ .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
809
889
  .option('--agent', 'Agent mode: sets --json, --silent, and --budget 4000 (override with --budget N)')
810
890
  .action(async (query, options) => {
811
891
  // --agent sets sensible defaults for AI agents; explicit flags override
@@ -837,6 +917,7 @@ program
837
917
  const htmlResult = await peel(siteResult.url, {
838
918
  format: 'html',
839
919
  timeout: 30000,
920
+ proxy: options.proxy,
840
921
  });
841
922
  if (spinner) {
842
923
  spinner.succeed(`Fetched ${siteResult.site} in ${htmlResult.elapsed}ms`);
@@ -2953,6 +3034,7 @@ program
2953
3034
  .option('--source <name...>', 'Only use specific source(s): kayak, booking, google (repeatable)')
2954
3035
  .option('--json', 'Output as JSON')
2955
3036
  .option('--stealth', 'Use stealth mode for all sources')
3037
+ .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
2956
3038
  .option('-s, --silent', 'Suppress progress messages')
2957
3039
  .action(async (destination, options) => {
2958
3040
  const isJson = options.json;
@@ -3005,6 +3087,7 @@ program
3005
3087
  sources,
3006
3088
  stealth: options.stealth,
3007
3089
  silent: isSilent,
3090
+ proxy: options.proxy,
3008
3091
  });
3009
3092
  if (searchSpinner)
3010
3093
  searchSpinner.stop();