webpeel 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +41 -0
- package/dist/cli.js.map +1 -1
- package/dist/core/bm25-filter.d.ts +57 -0
- package/dist/core/bm25-filter.d.ts.map +1 -0
- package/dist/core/bm25-filter.js +249 -0
- package/dist/core/bm25-filter.js.map +1 -0
- package/dist/core/chunking.d.ts +43 -0
- package/dist/core/chunking.d.ts.map +1 -0
- package/dist/core/chunking.js +182 -0
- package/dist/core/chunking.js.map +1 -0
- package/dist/core/content-pruner.d.ts +33 -0
- package/dist/core/content-pruner.d.ts.map +1 -0
- package/dist/core/content-pruner.js +249 -0
- package/dist/core/content-pruner.js.map +1 -0
- package/dist/core/markdown.d.ts +4 -1
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +11 -2
- package/dist/core/markdown.js.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +16 -2
- package/dist/index.js.map +1 -1
- package/dist/types.d.ts +9 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -146,6 +146,11 @@ program
|
|
|
146
146
|
.option('--include-tags <tags>', 'Comma-separated HTML tags/selectors to include (e.g., "main,article,.content")')
|
|
147
147
|
.option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
|
|
148
148
|
.option('--only-main-content', 'Shortcut for --include-tags main,article')
|
|
149
|
+
.option('--full-content', 'Return full page content (disable automatic content density pruning)')
|
|
150
|
+
.option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
|
|
151
|
+
.option('--chunk <size>', 'Split content into N-token chunks for LLM processing (default strategy: semantic)', parseInt)
|
|
152
|
+
.option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 200)', parseInt)
|
|
153
|
+
.option('--chunk-strategy <strategy>', 'Chunking strategy: fixed, semantic (default), paragraph')
|
|
149
154
|
.option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
|
|
150
155
|
.option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
|
|
151
156
|
.option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
|
|
@@ -493,6 +498,7 @@ program
|
|
|
493
498
|
headed: options.headed || false,
|
|
494
499
|
storageState: resolvedStorageState,
|
|
495
500
|
proxy: options.proxy,
|
|
501
|
+
fullPage: options.fullContent || false,
|
|
496
502
|
};
|
|
497
503
|
// Add summary option if requested
|
|
498
504
|
if (options.summary) {
|
|
@@ -569,6 +575,41 @@ program
|
|
|
569
575
|
result.tokens = estimateTokens(distilled);
|
|
570
576
|
}
|
|
571
577
|
}
|
|
578
|
+
// --- BM25 Query-Focused Filtering ---
|
|
579
|
+
if (options.focus && result.content) {
|
|
580
|
+
const { filterByRelevance } = await import('./core/bm25-filter.js');
|
|
581
|
+
const focusResult = filterByRelevance(result.content, { query: options.focus });
|
|
582
|
+
result.content = focusResult.content;
|
|
583
|
+
result.tokens = estimateTokens(focusResult.content);
|
|
584
|
+
if (isJson) {
|
|
585
|
+
result.focusQuery = options.focus;
|
|
586
|
+
result.focusReduction = focusResult.reductionPercent;
|
|
587
|
+
}
|
|
588
|
+
}
|
|
589
|
+
// --- Smart Chunking ---
|
|
590
|
+
if (options.chunk && options.chunk > 0 && result.content) {
|
|
591
|
+
const { chunkContent } = await import('./core/chunking.js');
|
|
592
|
+
const chunkResult = chunkContent(result.content, {
|
|
593
|
+
chunkSize: options.chunk,
|
|
594
|
+
overlap: options.chunkOverlap || 200,
|
|
595
|
+
strategy: options.chunkStrategy || 'semantic',
|
|
596
|
+
});
|
|
597
|
+
// Replace content with chunked output
|
|
598
|
+
if (isJson) {
|
|
599
|
+
result.chunks = chunkResult.chunks;
|
|
600
|
+
result.totalChunks = chunkResult.totalChunks;
|
|
601
|
+
result.originalTokens = chunkResult.originalTokens;
|
|
602
|
+
// Keep content as first chunk for non-JSON fallback
|
|
603
|
+
result.content = chunkResult.chunks[0]?.content || '';
|
|
604
|
+
result.tokens = chunkResult.chunks[0]?.tokens || 0;
|
|
605
|
+
}
|
|
606
|
+
else {
|
|
607
|
+
// Plain text mode: output chunks separated by markers
|
|
608
|
+
const chunkOutput = chunkResult.chunks.map((c, i) => `--- Chunk ${i + 1}/${chunkResult.totalChunks} (${c.tokens} tokens) ---\n${c.content}`).join('\n\n');
|
|
609
|
+
result.content = chunkOutput;
|
|
610
|
+
result.tokens = chunkResult.totalTokens;
|
|
611
|
+
}
|
|
612
|
+
}
|
|
572
613
|
// --- #4: Content quality warning ---
|
|
573
614
|
const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
|
|
574
615
|
const isRedirect = false; // peel() follows redirects — final result is always 200
|