webpeel 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -146,6 +146,11 @@ program
146
146
  .option('--include-tags <tags>', 'Comma-separated HTML tags/selectors to include (e.g., "main,article,.content")')
147
147
  .option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
148
148
  .option('--only-main-content', 'Shortcut for --include-tags main,article')
149
+ .option('--full-content', 'Return full page content (disable automatic content density pruning)')
150
+ .option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
151
+ .option('--chunk <size>', 'Split content into N-token chunks for LLM processing (default strategy: semantic)', parseInt)
152
+ .option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 200)', parseInt)
153
+ .option('--chunk-strategy <strategy>', 'Chunking strategy: fixed, semantic (default), paragraph')
149
154
  .option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
150
155
  .option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
151
156
  .option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
@@ -493,6 +498,7 @@ program
493
498
  headed: options.headed || false,
494
499
  storageState: resolvedStorageState,
495
500
  proxy: options.proxy,
501
+ fullPage: options.fullContent || false,
496
502
  };
497
503
  // Add summary option if requested
498
504
  if (options.summary) {
@@ -569,6 +575,41 @@ program
569
575
  result.tokens = estimateTokens(distilled);
570
576
  }
571
577
  }
578
+ // --- BM25 Query-Focused Filtering ---
579
+ if (options.focus && result.content) {
580
+ const { filterByRelevance } = await import('./core/bm25-filter.js');
581
+ const focusResult = filterByRelevance(result.content, { query: options.focus });
582
+ result.content = focusResult.content;
583
+ result.tokens = estimateTokens(focusResult.content);
584
+ if (isJson) {
585
+ result.focusQuery = options.focus;
586
+ result.focusReduction = focusResult.reductionPercent;
587
+ }
588
+ }
589
+ // --- Smart Chunking ---
590
+ if (options.chunk && options.chunk > 0 && result.content) {
591
+ const { chunkContent } = await import('./core/chunking.js');
592
+ const chunkResult = chunkContent(result.content, {
593
+ chunkSize: options.chunk,
594
+ overlap: options.chunkOverlap || 200,
595
+ strategy: options.chunkStrategy || 'semantic',
596
+ });
597
+ // Replace content with chunked output
598
+ if (isJson) {
599
+ result.chunks = chunkResult.chunks;
600
+ result.totalChunks = chunkResult.totalChunks;
601
+ result.originalTokens = chunkResult.originalTokens;
602
+ // Keep content as first chunk for non-JSON fallback
603
+ result.content = chunkResult.chunks[0]?.content || '';
604
+ result.tokens = chunkResult.chunks[0]?.tokens || 0;
605
+ }
606
+ else {
607
+ // Plain text mode: output chunks separated by markers
608
+ const chunkOutput = chunkResult.chunks.map((c, i) => `--- Chunk ${i + 1}/${chunkResult.totalChunks} (${c.tokens} tokens) ---\n${c.content}`).join('\n\n');
609
+ result.content = chunkOutput;
610
+ result.tokens = chunkResult.totalTokens;
611
+ }
612
+ }
572
613
  // --- #4: Content quality warning ---
573
614
  const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
574
615
  const isRedirect = false; // peel() follows redirects — final result is always 200