webpeel 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +86 -3
- package/dist/cli.js.map +1 -1
- package/dist/core/bm25-filter.d.ts +57 -0
- package/dist/core/bm25-filter.d.ts.map +1 -0
- package/dist/core/bm25-filter.js +249 -0
- package/dist/core/bm25-filter.js.map +1 -0
- package/dist/core/chunking.d.ts +43 -0
- package/dist/core/chunking.d.ts.map +1 -0
- package/dist/core/chunking.js +182 -0
- package/dist/core/chunking.js.map +1 -0
- package/dist/core/content-pruner.d.ts +33 -0
- package/dist/core/content-pruner.d.ts.map +1 -0
- package/dist/core/content-pruner.js +249 -0
- package/dist/core/content-pruner.js.map +1 -0
- package/dist/core/fetcher.d.ts +8 -1
- package/dist/core/fetcher.d.ts.map +1 -1
- package/dist/core/fetcher.js +33 -7
- package/dist/core/fetcher.js.map +1 -1
- package/dist/core/hotel-search.d.ts +2 -0
- package/dist/core/hotel-search.d.ts.map +1 -1
- package/dist/core/hotel-search.js +2 -0
- package/dist/core/hotel-search.js.map +1 -1
- package/dist/core/llm-extract.d.ts +15 -1
- package/dist/core/llm-extract.d.ts.map +1 -1
- package/dist/core/llm-extract.js +127 -7
- package/dist/core/llm-extract.js.map +1 -1
- package/dist/core/markdown.d.ts +4 -1
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +11 -2
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/strategies.d.ts +6 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +10 -5
- package/dist/core/strategies.js.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +17 -2
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +47 -3
- package/dist/mcp/server.js.map +1 -1
- package/dist/types.d.ts +19 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -131,6 +131,7 @@ program
|
|
|
131
131
|
.argument('[url]', 'URL to fetch')
|
|
132
132
|
.option('-r, --render', 'Use headless browser (for JS-heavy sites)')
|
|
133
133
|
.option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
|
|
134
|
+
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
134
135
|
.option('-w, --wait <ms>', 'Wait time after page load (ms)', parseInt)
|
|
135
136
|
.option('--html', 'Output raw HTML instead of markdown')
|
|
136
137
|
.option('--text', 'Output plain text instead of markdown')
|
|
@@ -145,6 +146,11 @@ program
|
|
|
145
146
|
.option('--include-tags <tags>', 'Comma-separated HTML tags/selectors to include (e.g., "main,article,.content")')
|
|
146
147
|
.option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
|
|
147
148
|
.option('--only-main-content', 'Shortcut for --include-tags main,article')
|
|
149
|
+
.option('--full-content', 'Return full page content (disable automatic content density pruning)')
|
|
150
|
+
.option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
|
|
151
|
+
.option('--chunk <size>', 'Split content into N-token chunks for LLM processing (default strategy: semantic)', parseInt)
|
|
152
|
+
.option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 200)', parseInt)
|
|
153
|
+
.option('--chunk-strategy <strategy>', 'Chunking strategy: fixed, semantic (default), paragraph')
|
|
148
154
|
.option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
|
|
149
155
|
.option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
|
|
150
156
|
.option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
|
|
@@ -156,6 +162,7 @@ program
|
|
|
156
162
|
.option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
|
|
157
163
|
.option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
|
|
158
164
|
.option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
|
|
165
|
+
.option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
|
|
159
166
|
.option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
|
|
160
167
|
.option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
|
|
161
168
|
.option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
|
|
@@ -296,7 +303,7 @@ program
|
|
|
296
303
|
cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
|
|
297
304
|
}
|
|
298
305
|
// LLM extraction from cached content
|
|
299
|
-
if (options.llmExtract) {
|
|
306
|
+
if (options.llmExtract || options.extractSchema) {
|
|
300
307
|
const { extractWithLLM } = await import('./core/llm-extract.js');
|
|
301
308
|
const llmCfgCached = loadConfig();
|
|
302
309
|
const llmApiKeyCached = options.llmKey || llmCfgCached.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
@@ -307,9 +314,25 @@ program
|
|
|
307
314
|
const llmModelCached = options.llmModel || llmCfgCached.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
|
|
308
315
|
const llmBaseUrlCached = options.llmBaseUrl || llmCfgCached.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
|
|
309
316
|
const llmInstructionCached = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
|
|
317
|
+
// Parse schema if provided
|
|
318
|
+
let llmSchemaCached;
|
|
319
|
+
if (options.extractSchema) {
|
|
320
|
+
let schemaStr = options.extractSchema;
|
|
321
|
+
if (schemaStr.startsWith('@')) {
|
|
322
|
+
schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
|
|
323
|
+
}
|
|
324
|
+
try {
|
|
325
|
+
llmSchemaCached = JSON.parse(schemaStr);
|
|
326
|
+
}
|
|
327
|
+
catch {
|
|
328
|
+
console.error('Error: --extract-schema must be valid JSON or a valid @file.json path');
|
|
329
|
+
process.exit(1);
|
|
330
|
+
}
|
|
331
|
+
}
|
|
310
332
|
const llmResultCached = await extractWithLLM({
|
|
311
333
|
content: cachedResult.content,
|
|
312
334
|
instruction: llmInstructionCached,
|
|
335
|
+
schema: llmSchemaCached,
|
|
313
336
|
apiKey: llmApiKeyCached,
|
|
314
337
|
model: llmModelCached,
|
|
315
338
|
baseUrl: llmBaseUrlCached,
|
|
@@ -356,9 +379,13 @@ program
|
|
|
356
379
|
throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
|
|
357
380
|
}
|
|
358
381
|
}
|
|
382
|
+
// --extract-schema auto-enables JSON output
|
|
383
|
+
if (options.extractSchema) {
|
|
384
|
+
options.json = true;
|
|
385
|
+
}
|
|
359
386
|
// Parse extract
|
|
360
387
|
let extract;
|
|
361
|
-
if (options.llmExtract) {
|
|
388
|
+
if (options.llmExtract || options.extractSchema) {
|
|
362
389
|
// LLM-based extraction is handled post-fetch (after peel returns markdown).
|
|
363
390
|
// Early-validate that an API key is available so we fail fast.
|
|
364
391
|
const llmCfg = loadConfig();
|
|
@@ -470,6 +497,8 @@ program
|
|
|
470
497
|
profileDir: resolvedProfileDir,
|
|
471
498
|
headed: options.headed || false,
|
|
472
499
|
storageState: resolvedStorageState,
|
|
500
|
+
proxy: options.proxy,
|
|
501
|
+
fullPage: options.fullContent || false,
|
|
473
502
|
};
|
|
474
503
|
// Add summary option if requested
|
|
475
504
|
if (options.summary) {
|
|
@@ -546,6 +575,41 @@ program
|
|
|
546
575
|
result.tokens = estimateTokens(distilled);
|
|
547
576
|
}
|
|
548
577
|
}
|
|
578
|
+
// --- BM25 Query-Focused Filtering ---
|
|
579
|
+
if (options.focus && result.content) {
|
|
580
|
+
const { filterByRelevance } = await import('./core/bm25-filter.js');
|
|
581
|
+
const focusResult = filterByRelevance(result.content, { query: options.focus });
|
|
582
|
+
result.content = focusResult.content;
|
|
583
|
+
result.tokens = estimateTokens(focusResult.content);
|
|
584
|
+
if (isJson) {
|
|
585
|
+
result.focusQuery = options.focus;
|
|
586
|
+
result.focusReduction = focusResult.reductionPercent;
|
|
587
|
+
}
|
|
588
|
+
}
|
|
589
|
+
// --- Smart Chunking ---
|
|
590
|
+
if (options.chunk && options.chunk > 0 && result.content) {
|
|
591
|
+
const { chunkContent } = await import('./core/chunking.js');
|
|
592
|
+
const chunkResult = chunkContent(result.content, {
|
|
593
|
+
chunkSize: options.chunk,
|
|
594
|
+
overlap: options.chunkOverlap || 200,
|
|
595
|
+
strategy: options.chunkStrategy || 'semantic',
|
|
596
|
+
});
|
|
597
|
+
// Replace content with chunked output
|
|
598
|
+
if (isJson) {
|
|
599
|
+
result.chunks = chunkResult.chunks;
|
|
600
|
+
result.totalChunks = chunkResult.totalChunks;
|
|
601
|
+
result.originalTokens = chunkResult.originalTokens;
|
|
602
|
+
// Keep content as first chunk for non-JSON fallback
|
|
603
|
+
result.content = chunkResult.chunks[0]?.content || '';
|
|
604
|
+
result.tokens = chunkResult.chunks[0]?.tokens || 0;
|
|
605
|
+
}
|
|
606
|
+
else {
|
|
607
|
+
// Plain text mode: output chunks separated by markers
|
|
608
|
+
const chunkOutput = chunkResult.chunks.map((c, i) => `--- Chunk ${i + 1}/${chunkResult.totalChunks} (${c.tokens} tokens) ---\n${c.content}`).join('\n\n');
|
|
609
|
+
result.content = chunkOutput;
|
|
610
|
+
result.tokens = chunkResult.totalTokens;
|
|
611
|
+
}
|
|
612
|
+
}
|
|
549
613
|
// --- #4: Content quality warning ---
|
|
550
614
|
const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
|
|
551
615
|
const isRedirect = false; // peel() follows redirects — final result is always 200
|
|
@@ -559,16 +623,31 @@ program
|
|
|
559
623
|
}
|
|
560
624
|
}
|
|
561
625
|
// --- LLM-based extraction (post-peel) ---
|
|
562
|
-
if (options.llmExtract) {
|
|
626
|
+
if (options.llmExtract || options.extractSchema) {
|
|
563
627
|
const { extractWithLLM } = await import('./core/llm-extract.js');
|
|
564
628
|
const llmCfg = loadConfig();
|
|
565
629
|
const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
566
630
|
const llmModel = options.llmModel || llmCfg.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
|
|
567
631
|
const llmBaseUrl = options.llmBaseUrl || llmCfg.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
|
|
568
632
|
const llmInstruction = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
|
|
633
|
+
// Parse --extract-schema if provided
|
|
634
|
+
let llmSchema;
|
|
635
|
+
if (options.extractSchema) {
|
|
636
|
+
let schemaStr = options.extractSchema;
|
|
637
|
+
if (schemaStr.startsWith('@')) {
|
|
638
|
+
schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
|
|
639
|
+
}
|
|
640
|
+
try {
|
|
641
|
+
llmSchema = JSON.parse(schemaStr);
|
|
642
|
+
}
|
|
643
|
+
catch {
|
|
644
|
+
exitWithJsonError('--extract-schema must be valid JSON or a valid @file.json path', 'FETCH_FAILED');
|
|
645
|
+
}
|
|
646
|
+
}
|
|
569
647
|
const llmResult = await extractWithLLM({
|
|
570
648
|
content: result.content,
|
|
571
649
|
instruction: llmInstruction,
|
|
650
|
+
schema: llmSchema,
|
|
572
651
|
apiKey: llmApiKey,
|
|
573
652
|
model: llmModel,
|
|
574
653
|
baseUrl: llmBaseUrl,
|
|
@@ -806,6 +885,7 @@ program
|
|
|
806
885
|
.option('--csv', 'Output site-search results as CSV (requires --site)')
|
|
807
886
|
.option('--budget <n>', 'Token budget for site-search result content', parseInt)
|
|
808
887
|
.option('-s, --silent', 'Silent mode')
|
|
888
|
+
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
809
889
|
.option('--agent', 'Agent mode: sets --json, --silent, and --budget 4000 (override with --budget N)')
|
|
810
890
|
.action(async (query, options) => {
|
|
811
891
|
// --agent sets sensible defaults for AI agents; explicit flags override
|
|
@@ -837,6 +917,7 @@ program
|
|
|
837
917
|
const htmlResult = await peel(siteResult.url, {
|
|
838
918
|
format: 'html',
|
|
839
919
|
timeout: 30000,
|
|
920
|
+
proxy: options.proxy,
|
|
840
921
|
});
|
|
841
922
|
if (spinner) {
|
|
842
923
|
spinner.succeed(`Fetched ${siteResult.site} in ${htmlResult.elapsed}ms`);
|
|
@@ -2953,6 +3034,7 @@ program
|
|
|
2953
3034
|
.option('--source <name...>', 'Only use specific source(s): kayak, booking, google (repeatable)')
|
|
2954
3035
|
.option('--json', 'Output as JSON')
|
|
2955
3036
|
.option('--stealth', 'Use stealth mode for all sources')
|
|
3037
|
+
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
2956
3038
|
.option('-s, --silent', 'Suppress progress messages')
|
|
2957
3039
|
.action(async (destination, options) => {
|
|
2958
3040
|
const isJson = options.json;
|
|
@@ -3005,6 +3087,7 @@ program
|
|
|
3005
3087
|
sources,
|
|
3006
3088
|
stealth: options.stealth,
|
|
3007
3089
|
silent: isSilent,
|
|
3090
|
+
proxy: options.proxy,
|
|
3008
3091
|
});
|
|
3009
3092
|
if (searchSpinner)
|
|
3010
3093
|
searchSpinner.stop();
|