npm - webpeel - Versions diffs - 0.10.0 → 0.12.0 - Mend

webpeel 0.10.0 → 0.12.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

package/dist/cli.js +86 -3
package/dist/cli.js.map +1 -1
package/dist/core/bm25-filter.d.ts +57 -0
package/dist/core/bm25-filter.d.ts.map +1 -0
package/dist/core/bm25-filter.js +249 -0
package/dist/core/bm25-filter.js.map +1 -0
package/dist/core/chunking.d.ts +43 -0
package/dist/core/chunking.d.ts.map +1 -0
package/dist/core/chunking.js +182 -0
package/dist/core/chunking.js.map +1 -0
package/dist/core/content-pruner.d.ts +33 -0
package/dist/core/content-pruner.d.ts.map +1 -0
package/dist/core/content-pruner.js +249 -0
package/dist/core/content-pruner.js.map +1 -0
package/dist/core/fetcher.d.ts +8 -1
package/dist/core/fetcher.d.ts.map +1 -1
package/dist/core/fetcher.js +33 -7
package/dist/core/fetcher.js.map +1 -1
package/dist/core/hotel-search.d.ts +2 -0
package/dist/core/hotel-search.d.ts.map +1 -1
package/dist/core/hotel-search.js +2 -0
package/dist/core/hotel-search.js.map +1 -1
package/dist/core/llm-extract.d.ts +15 -1
package/dist/core/llm-extract.d.ts.map +1 -1
package/dist/core/llm-extract.js +127 -7
package/dist/core/llm-extract.js.map +1 -1
package/dist/core/markdown.d.ts +4 -1
package/dist/core/markdown.d.ts.map +1 -1
package/dist/core/markdown.js +11 -2
package/dist/core/markdown.js.map +1 -1
package/dist/core/strategies.d.ts +6 -0
package/dist/core/strategies.d.ts.map +1 -1
package/dist/core/strategies.js +10 -5
package/dist/core/strategies.js.map +1 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +17 -2
package/dist/index.js.map +1 -1
package/dist/mcp/server.js +47 -3
package/dist/mcp/server.js.map +1 -1
package/dist/types.d.ts +19 -0
package/dist/types.d.ts.map +1 -1
package/dist/types.js.map +1 -1
package/package.json +1 -1

package/dist/cli.js CHANGED Viewed

@@ -131,6 +131,7 @@ program
     .argument('[url]', 'URL to fetch')
     .option('-r, --render', 'Use headless browser (for JS-heavy sites)')
     .option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
+    .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
     .option('-w, --wait <ms>', 'Wait time after page load (ms)', parseInt)
     .option('--html', 'Output raw HTML instead of markdown')
     .option('--text', 'Output plain text instead of markdown')
@@ -145,6 +146,11 @@ program
     .option('--include-tags <tags>', 'Comma-separated HTML tags/selectors to include (e.g., "main,article,.content")')
     .option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
     .option('--only-main-content', 'Shortcut for --include-tags main,article')
+    .option('--full-content', 'Return full page content (disable automatic content density pruning)')
+    .option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
+    .option('--chunk <size>', 'Split content into N-token chunks for LLM processing (default strategy: semantic)', parseInt)
+    .option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 200)', parseInt)
+    .option('--chunk-strategy <strategy>', 'Chunking strategy: fixed, semantic (default), paragraph')
     .option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
     .option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
     .option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
@@ -156,6 +162,7 @@ program
     .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
     .option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
     .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
+    .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
     .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
     .option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
     .option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
@@ -296,7 +303,7 @@ program
                 cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
             }
             // LLM extraction from cached content
-            if (options.llmExtract) {
+            if (options.llmExtract || options.extractSchema) {
                 const { extractWithLLM } = await import('./core/llm-extract.js');
                 const llmCfgCached = loadConfig();
                 const llmApiKeyCached = options.llmKey || llmCfgCached.llm?.apiKey || process.env.OPENAI_API_KEY;
@@ -307,9 +314,25 @@ program
                 const llmModelCached = options.llmModel || llmCfgCached.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
                 const llmBaseUrlCached = options.llmBaseUrl || llmCfgCached.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
                 const llmInstructionCached = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
+                // Parse schema if provided
+                let llmSchemaCached;
+                if (options.extractSchema) {
+                    let schemaStr = options.extractSchema;
+                    if (schemaStr.startsWith('@')) {
+                        schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
+                    }
+                    try {
+                        llmSchemaCached = JSON.parse(schemaStr);
+                    }
+                    catch {
+                        console.error('Error: --extract-schema must be valid JSON or a valid @file.json path');
+                        process.exit(1);
+                    }
+                }
                 const llmResultCached = await extractWithLLM({
                     content: cachedResult.content,
                     instruction: llmInstructionCached,
+                    schema: llmSchemaCached,
                     apiKey: llmApiKeyCached,
                     model: llmModelCached,
                     baseUrl: llmBaseUrlCached,
@@ -356,9 +379,13 @@ program
                 throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
             }
         }
+        // --extract-schema auto-enables JSON output
+        if (options.extractSchema) {
+            options.json = true;
+        }
         // Parse extract
         let extract;
-        if (options.llmExtract) {
+        if (options.llmExtract || options.extractSchema) {
             // LLM-based extraction is handled post-fetch (after peel returns markdown).
             // Early-validate that an API key is available so we fail fast.
             const llmCfg = loadConfig();
@@ -470,6 +497,8 @@ program
             profileDir: resolvedProfileDir,
             headed: options.headed || false,
             storageState: resolvedStorageState,
+            proxy: options.proxy,
+            fullPage: options.fullContent || false,
         };
         // Add summary option if requested
         if (options.summary) {
@@ -546,6 +575,41 @@ program
                 result.tokens = estimateTokens(distilled);
             }
         }
+        // --- BM25 Query-Focused Filtering ---
+        if (options.focus && result.content) {
+            const { filterByRelevance } = await import('./core/bm25-filter.js');
+            const focusResult = filterByRelevance(result.content, { query: options.focus });
+            result.content = focusResult.content;
+            result.tokens = estimateTokens(focusResult.content);
+            if (isJson) {
+                result.focusQuery = options.focus;
+                result.focusReduction = focusResult.reductionPercent;
+            }
+        }
+        // --- Smart Chunking ---
+        if (options.chunk && options.chunk > 0 && result.content) {
+            const { chunkContent } = await import('./core/chunking.js');
+            const chunkResult = chunkContent(result.content, {
+                chunkSize: options.chunk,
+                overlap: options.chunkOverlap || 200,
+                strategy: options.chunkStrategy || 'semantic',
+            });
+            // Replace content with chunked output
+            if (isJson) {
+                result.chunks = chunkResult.chunks;
+                result.totalChunks = chunkResult.totalChunks;
+                result.originalTokens = chunkResult.originalTokens;
+                // Keep content as first chunk for non-JSON fallback
+                result.content = chunkResult.chunks[0]?.content || '';
+                result.tokens = chunkResult.chunks[0]?.tokens || 0;
+            }
+            else {
+                // Plain text mode: output chunks separated by markers
+                const chunkOutput = chunkResult.chunks.map((c, i) => `--- Chunk ${i + 1}/${chunkResult.totalChunks} (${c.tokens} tokens) ---\n${c.content}`).join('\n\n');
+                result.content = chunkOutput;
+                result.tokens = chunkResult.totalTokens;
+            }
+        }
         // --- #4: Content quality warning ---
         const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
         const isRedirect = false; // peel() follows redirects — final result is always 200
@@ -559,16 +623,31 @@ program
             }
         }
         // --- LLM-based extraction (post-peel) ---
-        if (options.llmExtract) {
+        if (options.llmExtract || options.extractSchema) {
             const { extractWithLLM } = await import('./core/llm-extract.js');
             const llmCfg = loadConfig();
             const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
             const llmModel = options.llmModel || llmCfg.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
             const llmBaseUrl = options.llmBaseUrl || llmCfg.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
             const llmInstruction = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
+            // Parse --extract-schema if provided
+            let llmSchema;
+            if (options.extractSchema) {
+                let schemaStr = options.extractSchema;
+                if (schemaStr.startsWith('@')) {
+                    schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
+                }
+                try {
+                    llmSchema = JSON.parse(schemaStr);
+                }
+                catch {
+                    exitWithJsonError('--extract-schema must be valid JSON or a valid @file.json path', 'FETCH_FAILED');
+                }
+            }
             const llmResult = await extractWithLLM({
                 content: result.content,
                 instruction: llmInstruction,
+                schema: llmSchema,
                 apiKey: llmApiKey,
                 model: llmModel,
                 baseUrl: llmBaseUrl,
@@ -806,6 +885,7 @@ program
     .option('--csv', 'Output site-search results as CSV (requires --site)')
     .option('--budget <n>', 'Token budget for site-search result content', parseInt)
     .option('-s, --silent', 'Silent mode')
+    .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
     .option('--agent', 'Agent mode: sets --json, --silent, and --budget 4000 (override with --budget N)')
     .action(async (query, options) => {
     // --agent sets sensible defaults for AI agents; explicit flags override
@@ -837,6 +917,7 @@ program
             const htmlResult = await peel(siteResult.url, {
                 format: 'html',
                 timeout: 30000,
+                proxy: options.proxy,
             });
             if (spinner) {
                 spinner.succeed(`Fetched ${siteResult.site} in ${htmlResult.elapsed}ms`);
@@ -2953,6 +3034,7 @@ program
     .option('--source <name...>', 'Only use specific source(s): kayak, booking, google (repeatable)')
     .option('--json', 'Output as JSON')
     .option('--stealth', 'Use stealth mode for all sources')
+    .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
     .option('-s, --silent', 'Suppress progress messages')
     .action(async (destination, options) => {
     const isJson = options.json;
@@ -3005,6 +3087,7 @@ program
             sources,
             stealth: options.stealth,
             silent: isSilent,
+            proxy: options.proxy,
         });
         if (searchSpinner)
             searchSpinner.stop();