webpeel 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +11 -657
- package/README.md +246 -325
- package/dist/cli.js +330 -73
- package/dist/cli.js.map +1 -1
- package/dist/core/browser-fetch.d.ts +12 -0
- package/dist/core/browser-fetch.d.ts.map +1 -1
- package/dist/core/browser-fetch.js +70 -17
- package/dist/core/browser-fetch.js.map +1 -1
- package/dist/core/cf-worker-proxy.d.ts +33 -0
- package/dist/core/cf-worker-proxy.d.ts.map +1 -0
- package/dist/core/cf-worker-proxy.js +88 -0
- package/dist/core/cf-worker-proxy.js.map +1 -0
- package/dist/core/chunker.d.ts +47 -0
- package/dist/core/chunker.d.ts.map +1 -0
- package/dist/core/chunker.js +250 -0
- package/dist/core/chunker.js.map +1 -0
- package/dist/core/cloak-fetch.d.ts +43 -0
- package/dist/core/cloak-fetch.d.ts.map +1 -0
- package/dist/core/cloak-fetch.js +141 -0
- package/dist/core/cloak-fetch.js.map +1 -0
- package/dist/core/crawl-checkpoint.d.ts +55 -0
- package/dist/core/crawl-checkpoint.d.ts.map +1 -0
- package/dist/core/crawl-checkpoint.js +105 -0
- package/dist/core/crawl-checkpoint.js.map +1 -0
- package/dist/core/crawler.d.ts +5 -1
- package/dist/core/crawler.d.ts.map +1 -1
- package/dist/core/crawler.js +60 -5
- package/dist/core/crawler.js.map +1 -1
- package/dist/core/cycle-fetch.d.ts +27 -0
- package/dist/core/cycle-fetch.d.ts.map +1 -0
- package/dist/core/cycle-fetch.js +99 -0
- package/dist/core/cycle-fetch.js.map +1 -0
- package/dist/core/domain-extractors.d.ts.map +1 -1
- package/dist/core/domain-extractors.js +754 -14
- package/dist/core/domain-extractors.js.map +1 -1
- package/dist/core/google-cache.d.ts +30 -0
- package/dist/core/google-cache.d.ts.map +1 -0
- package/dist/core/google-cache.js +181 -0
- package/dist/core/google-cache.js.map +1 -0
- package/dist/core/markdown.d.ts +11 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +43 -0
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/peel-tls.d.ts +26 -0
- package/dist/core/peel-tls.d.ts.map +1 -0
- package/dist/core/peel-tls.js +221 -0
- package/dist/core/peel-tls.js.map +1 -0
- package/dist/core/pipeline.d.ts +5 -1
- package/dist/core/pipeline.d.ts.map +1 -1
- package/dist/core/pipeline.js +269 -21
- package/dist/core/pipeline.js.map +1 -1
- package/dist/core/schema-postprocess.d.ts +33 -0
- package/dist/core/schema-postprocess.d.ts.map +1 -0
- package/dist/core/schema-postprocess.js +470 -0
- package/dist/core/schema-postprocess.js.map +1 -0
- package/dist/core/schema-templates.d.ts +20 -0
- package/dist/core/schema-templates.d.ts.map +1 -0
- package/dist/core/schema-templates.js +131 -0
- package/dist/core/schema-templates.js.map +1 -0
- package/dist/core/search-fallback.d.ts +28 -0
- package/dist/core/search-fallback.d.ts.map +1 -0
- package/dist/core/search-fallback.js +185 -0
- package/dist/core/search-fallback.js.map +1 -0
- package/dist/core/search-provider.d.ts +47 -4
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +278 -7
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/stealth-patches.d.ts +58 -0
- package/dist/core/stealth-patches.d.ts.map +1 -0
- package/dist/core/stealth-patches.js +340 -0
- package/dist/core/stealth-patches.js.map +1 -0
- package/dist/core/strategies.d.ts +20 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +284 -48
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/strategy-hooks.d.ts +1 -1
- package/dist/core/strategy-hooks.d.ts.map +1 -1
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +37 -15
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +109 -4
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +29 -0
- package/dist/server/app.js.map +1 -1
- package/dist/server/middleware/rate-limit.d.ts +2 -1
- package/dist/server/middleware/rate-limit.d.ts.map +1 -1
- package/dist/server/middleware/rate-limit.js +24 -8
- package/dist/server/middleware/rate-limit.js.map +1 -1
- package/dist/server/routes/agent.d.ts +4 -0
- package/dist/server/routes/agent.d.ts.map +1 -1
- package/dist/server/routes/agent.js +196 -9
- package/dist/server/routes/agent.js.map +1 -1
- package/dist/server/routes/batch.js +5 -5
- package/dist/server/routes/batch.js.map +1 -1
- package/dist/server/routes/compat.d.ts.map +1 -1
- package/dist/server/routes/compat.js +1 -0
- package/dist/server/routes/compat.js.map +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +60 -6
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +103 -2
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/search.js +1 -1
- package/dist/server/routes/search.js.map +1 -1
- package/dist/types.d.ts +55 -4
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +4 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +55 -125
- package/package.json +15 -1
package/dist/cli.js
CHANGED
|
@@ -21,6 +21,7 @@ import { checkUsage, showUsageFooter, handleLogin, handleLogout, handleUsage, lo
|
|
|
21
21
|
import { getCache, setCache, parseTTL, clearCache, cacheStats } from './cache.js';
|
|
22
22
|
import { estimateTokens } from './core/markdown.js';
|
|
23
23
|
import { distillToBudget, budgetListings } from './core/budget.js';
|
|
24
|
+
import { SCHEMA_TEMPLATES, getSchemaTemplate, listSchemaTemplates } from './core/schema-templates.js';
|
|
24
25
|
const program = new Command();
|
|
25
26
|
// Read version from package.json dynamically
|
|
26
27
|
import { fileURLToPath } from 'url';
|
|
@@ -178,10 +179,15 @@ program
|
|
|
178
179
|
.argument('[url]', 'URL to fetch')
|
|
179
180
|
.option('-r, --render', 'Use headless browser (for JS-heavy sites)')
|
|
180
181
|
.option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
|
|
182
|
+
.option('--cloaked', 'Use CloakBrowser stealth (requires: npm install cloakbrowser)')
|
|
183
|
+
.option('--tls', 'Use PeelTLS TLS fingerprint spoofing (built-in, no install needed)')
|
|
184
|
+
.option('--cycle', 'Use PeelTLS TLS fingerprint spoofing (alias for --tls)', false)
|
|
181
185
|
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
186
|
+
.option('--proxies <urls>', 'Comma-separated list of proxy URLs for rotation (tried in order on failure)', (val) => val.split(',').map((s) => s.trim()).filter(Boolean))
|
|
182
187
|
.option('-w, --wait <ms>', 'Wait time after page load (ms)', parseInt)
|
|
183
188
|
.option('--html', 'Output raw HTML instead of markdown')
|
|
184
189
|
.option('--text', 'Output plain text instead of markdown')
|
|
190
|
+
.option('--clean', 'Output clean text optimized for AI (strips URLs, keeps structure)')
|
|
185
191
|
.option('--json', 'Output as JSON')
|
|
186
192
|
.option('-t, --timeout <ms>', 'Request timeout (ms)', (v) => parseInt(v, 10), 30000)
|
|
187
193
|
.option('--ua <agent>', 'Custom user agent')
|
|
@@ -196,9 +202,10 @@ program
|
|
|
196
202
|
.option('--full-content', 'Return full page content (disable automatic content density pruning)')
|
|
197
203
|
.option('--readable', 'Reader mode ā extract only the main article content, strip all noise (like browser Reader Mode)')
|
|
198
204
|
.option('--focus <query>', 'Query-focused filtering ā only return content relevant to this query (BM25 ranking)')
|
|
199
|
-
.option('--chunk
|
|
200
|
-
.option('--chunk-
|
|
201
|
-
.option('--chunk-
|
|
205
|
+
.option('--chunk', 'Split content into RAG-ready chunks')
|
|
206
|
+
.option('--chunk-size <tokens>', 'Max tokens per chunk (default: 512)', parseInt)
|
|
207
|
+
.option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 50)', parseInt)
|
|
208
|
+
.option('--chunk-strategy <strategy>', 'Chunking strategy: section (default), paragraph, fixed')
|
|
202
209
|
.option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
|
|
203
210
|
.option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
|
|
204
211
|
.option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") ā default: 5m')
|
|
@@ -231,7 +238,15 @@ program
|
|
|
231
238
|
.option('--profile <path>', 'Use a persistent browser profile directory (cookies/sessions survive between calls)')
|
|
232
239
|
.option('--headed', 'Run browser in headed (visible) mode ā useful for profile setup and debugging')
|
|
233
240
|
.option('-q, --question <q>', 'Ask a question about the page content (BM25-powered, no LLM key needed)')
|
|
234
|
-
.option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)')
|
|
241
|
+
.option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)')
|
|
242
|
+
.option('--device <type>', 'Device emulation: desktop (default), mobile, tablet (auto-enables --render)')
|
|
243
|
+
.option('--viewport <WxH>', 'Browser viewport size (e.g., "1920x1080") (auto-enables --render)', (val) => {
|
|
244
|
+
const [w, h] = val.split('x').map(Number);
|
|
245
|
+
return { width: w, height: h };
|
|
246
|
+
})
|
|
247
|
+
.option('--wait-until <event>', 'Page load event: domcontentloaded, networkidle, load, commit (auto-enables --render)')
|
|
248
|
+
.option('--wait-selector <css>', 'Wait for CSS selector before extracting (auto-enables --render)')
|
|
249
|
+
.option('--block-resources <types>', 'Block resource types, comma-separated: image,stylesheet,font,media,script (auto-enables --render)');
|
|
235
250
|
program.configureHelp({
|
|
236
251
|
sortSubcommands: true,
|
|
237
252
|
showGlobalOptions: false,
|
|
@@ -273,9 +288,24 @@ Examples:
|
|
|
273
288
|
$ webpeel "https://nytimes.com/article" --readable Reader mode
|
|
274
289
|
$ webpeel search "best restaurants in NYC" Web search
|
|
275
290
|
$ webpeel hotels "Manhattan" --checkin tomorrow Hotel search
|
|
291
|
+
|
|
292
|
+
Agent Integration:
|
|
293
|
+
$ webpeel mcp Start MCP server
|
|
294
|
+
$ cat urls.txt | webpeel batch Batch from stdin
|
|
295
|
+
$ webpeel pipe "https://example.com" | jq .content Pipe-friendly JSON
|
|
296
|
+
$ webpeel "https://site.com" --json --silent Same as pipe
|
|
297
|
+
$ curl https://webpeel.dev/llms.txt AI-readable docs
|
|
276
298
|
`);
|
|
277
|
-
|
|
278
|
-
|
|
299
|
+
// Main fetch handler — shared with the `pipe` subcommand
|
|
300
|
+
async function runFetch(url, options) {
|
|
301
|
+
// Smart defaults: when piped (not a TTY), default to silent JSON
|
|
302
|
+
const isPiped = !process.stdout.isTTY;
|
|
303
|
+
if (isPiped && !options.html && !options.text) {
|
|
304
|
+
if (!options.json)
|
|
305
|
+
options.json = true;
|
|
306
|
+
if (!options.silent)
|
|
307
|
+
options.silent = true;
|
|
308
|
+
}
|
|
279
309
|
// --agent sets sensible defaults for AI agents; explicit flags override
|
|
280
310
|
if (options.agent) {
|
|
281
311
|
if (!options.json)
|
|
@@ -471,6 +501,21 @@ program
|
|
|
471
501
|
process.exit(0);
|
|
472
502
|
}
|
|
473
503
|
}
|
|
504
|
+
// --- BM25 Schema Template Extraction (cached path) ---
|
|
505
|
+
if (options.schema && cachedResult.content) {
|
|
506
|
+
const { getSchemaTemplate: getSchTmplCached } = await import('./core/schema-templates.js');
|
|
507
|
+
const schTemplateCached = getSchTmplCached(options.schema);
|
|
508
|
+
if (schTemplateCached) {
|
|
509
|
+
const { quickAnswer: qaCached } = await import('./core/quick-answer.js');
|
|
510
|
+
const { smartExtractSchemaFields: smartExtractCached } = await import('./core/schema-postprocess.js');
|
|
511
|
+
const extractedCached = smartExtractCached(cachedResult.content, schTemplateCached.fields, qaCached, {
|
|
512
|
+
pageTitle: cachedResult.title,
|
|
513
|
+
pageUrl: cachedResult.url,
|
|
514
|
+
metadata: cachedResult.metadata,
|
|
515
|
+
});
|
|
516
|
+
cachedResult.extracted = extractedCached;
|
|
517
|
+
}
|
|
518
|
+
}
|
|
474
519
|
await outputResult(cachedResult, options, { cached: true });
|
|
475
520
|
process.exit(0);
|
|
476
521
|
}
|
|
@@ -592,7 +637,13 @@ program
|
|
|
592
637
|
const scrollExtractCount = isAutoScroll
|
|
593
638
|
? 0
|
|
594
639
|
: (scrollExtractRaw !== undefined ? scrollExtractRaw : 0);
|
|
595
|
-
const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || isAutoScroll
|
|
640
|
+
const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || isAutoScroll
|
|
641
|
+
|| (options.device && options.device !== 'desktop')
|
|
642
|
+
|| !!options.viewport
|
|
643
|
+
|| !!options.waitUntil
|
|
644
|
+
|| !!options.waitSelector
|
|
645
|
+
|| !!options.blockResources
|
|
646
|
+
|| false;
|
|
596
647
|
// Inject scroll actions when --scroll-extract N (fixed count) is used
|
|
597
648
|
if (scrollExtractCount > 0) {
|
|
598
649
|
const scrollActions = [];
|
|
@@ -630,13 +681,34 @@ program
|
|
|
630
681
|
headed: options.headed || false,
|
|
631
682
|
storageState: resolvedStorageState,
|
|
632
683
|
proxy: options.proxy,
|
|
684
|
+
proxies: options.proxies,
|
|
633
685
|
fullPage: options.fullContent || false,
|
|
634
686
|
readable: options.readable || false,
|
|
635
687
|
// Smart auto-scroll (bare --scroll-extract flag)
|
|
636
688
|
autoScroll: isAutoScroll
|
|
637
689
|
? { timeout: options.scrollExtractTimeout }
|
|
638
690
|
: undefined,
|
|
691
|
+
device: options.device,
|
|
692
|
+
viewportWidth: options.viewport ? options.viewport.width : undefined,
|
|
693
|
+
viewportHeight: options.viewport ? options.viewport.height : undefined,
|
|
694
|
+
waitUntil: options.waitUntil,
|
|
695
|
+
waitSelector: options.waitSelector,
|
|
696
|
+
blockResources: options.blockResources ? options.blockResources.split(',').map((s) => s.trim()) : undefined,
|
|
697
|
+
cloaked: options.cloaked ? true : undefined,
|
|
698
|
+
cycle: options.cycle ? true : undefined,
|
|
699
|
+
tls: (options.tls || options.cycle) ? true : undefined,
|
|
639
700
|
};
|
|
701
|
+
if (options.cloaked) {
|
|
702
|
+
peelOptions.render = true; // CloakBrowser is a browser
|
|
703
|
+
}
|
|
704
|
+
// Add chunk option if requested
|
|
705
|
+
if (options.chunk) {
|
|
706
|
+
peelOptions.chunk = {
|
|
707
|
+
maxTokens: options.chunkSize || 512,
|
|
708
|
+
overlap: options.chunkOverlap || 50,
|
|
709
|
+
strategy: options.chunkStrategy || 'section',
|
|
710
|
+
};
|
|
711
|
+
}
|
|
640
712
|
// Add summary option if requested
|
|
641
713
|
if (options.summary) {
|
|
642
714
|
const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
|
|
@@ -657,6 +729,9 @@ program
|
|
|
657
729
|
else if (options.text) {
|
|
658
730
|
peelOptions.format = 'text';
|
|
659
731
|
}
|
|
732
|
+
else if (options.clean) {
|
|
733
|
+
peelOptions.format = 'clean';
|
|
734
|
+
}
|
|
660
735
|
else {
|
|
661
736
|
peelOptions.format = 'markdown';
|
|
662
737
|
}
|
|
@@ -769,28 +844,15 @@ program
|
|
|
769
844
|
process.exit(0);
|
|
770
845
|
}
|
|
771
846
|
}
|
|
772
|
-
// ---
|
|
773
|
-
if (
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
if (isJson) {
|
|
782
|
-
result.chunks = chunkResult.chunks;
|
|
783
|
-
result.totalChunks = chunkResult.totalChunks;
|
|
784
|
-
result.originalTokens = chunkResult.originalTokens;
|
|
785
|
-
// Keep content as first chunk for non-JSON fallback
|
|
786
|
-
result.content = chunkResult.chunks[0]?.content || '';
|
|
787
|
-
result.tokens = chunkResult.chunks[0]?.tokens || 0;
|
|
788
|
-
}
|
|
789
|
-
else {
|
|
790
|
-
// Plain text mode: output chunks separated by markers
|
|
791
|
-
const chunkOutput = chunkResult.chunks.map((c, i) => `--- Chunk ${i + 1}/${chunkResult.totalChunks} (${c.tokens} tokens) ---\n${c.content}`).join('\n\n');
|
|
792
|
-
result.content = chunkOutput;
|
|
793
|
-
result.tokens = chunkResult.totalTokens;
|
|
847
|
+
// --- RAG Chunking output (chunks come from pipeline via peelOptions.chunk) ---
|
|
848
|
+
if (result.chunks && result.chunks.length > 0 && !isJson) {
|
|
849
|
+
console.log(`\n${'ā'.repeat(60)}`);
|
|
850
|
+
console.log(`š¦ ${result.chunks.length} chunks (${options.chunkStrategy || 'section'} strategy)\n`);
|
|
851
|
+
for (const chunk of result.chunks) {
|
|
852
|
+
const sectionLabel = chunk.section ? ` [${chunk.section}]` : '';
|
|
853
|
+
console.log(`āā Chunk ${chunk.index + 1}${sectionLabel} (${chunk.tokenCount} tokens, ${chunk.wordCount} words) āā`);
|
|
854
|
+
console.log(chunk.text.substring(0, 200) + (chunk.text.length > 200 ? '...' : ''));
|
|
855
|
+
console.log('');
|
|
794
856
|
}
|
|
795
857
|
}
|
|
796
858
|
// --- #4: Content quality warning ---
|
|
@@ -1001,6 +1063,21 @@ program
|
|
|
1001
1063
|
}
|
|
1002
1064
|
}
|
|
1003
1065
|
else {
|
|
1066
|
+
// --- BM25 Schema Template Extraction (no LLM needed) ---
|
|
1067
|
+
if (options.schema && result.content) {
|
|
1068
|
+
const { getSchemaTemplate: getSchTmpl } = await import('./core/schema-templates.js');
|
|
1069
|
+
const schTemplate = getSchTmpl(options.schema);
|
|
1070
|
+
if (schTemplate) {
|
|
1071
|
+
const { quickAnswer: qa } = await import('./core/quick-answer.js');
|
|
1072
|
+
const { smartExtractSchemaFields } = await import('./core/schema-postprocess.js');
|
|
1073
|
+
const extracted = smartExtractSchemaFields(result.content, schTemplate.fields, qa, {
|
|
1074
|
+
pageTitle: result.title,
|
|
1075
|
+
pageUrl: result.url,
|
|
1076
|
+
metadata: result.metadata,
|
|
1077
|
+
});
|
|
1078
|
+
result.extracted = extracted;
|
|
1079
|
+
}
|
|
1080
|
+
}
|
|
1004
1081
|
// Output results (default path)
|
|
1005
1082
|
await outputResult(result, options, {
|
|
1006
1083
|
cached: false,
|
|
@@ -1032,6 +1109,10 @@ program
|
|
|
1032
1109
|
await cleanup();
|
|
1033
1110
|
process.exit(1);
|
|
1034
1111
|
}
|
|
1112
|
+
}
|
|
1113
|
+
program
|
|
1114
|
+
.action(async (url, options) => {
|
|
1115
|
+
await runFetch(url, options);
|
|
1035
1116
|
});
|
|
1036
1117
|
// Search command
|
|
1037
1118
|
program
|
|
@@ -1423,6 +1504,7 @@ program
|
|
|
1423
1504
|
.option('--stealth', 'Use stealth mode for all pages')
|
|
1424
1505
|
.option('-s, --silent', 'Silent mode (no spinner)')
|
|
1425
1506
|
.option('--json', 'Output as JSON')
|
|
1507
|
+
.option('--resume', 'Resume an interrupted crawl from its last checkpoint')
|
|
1426
1508
|
.action(async (url, options) => {
|
|
1427
1509
|
// Check usage quota
|
|
1428
1510
|
const usageCheck = await checkUsage();
|
|
@@ -1442,6 +1524,7 @@ program
|
|
|
1442
1524
|
rateLimitMs: options.rateLimit,
|
|
1443
1525
|
render: options.render || false,
|
|
1444
1526
|
stealth: options.stealth || false,
|
|
1527
|
+
resume: options.resume || false,
|
|
1445
1528
|
});
|
|
1446
1529
|
if (spinner) {
|
|
1447
1530
|
spinner.succeed(`Crawled ${results.length} pages`);
|
|
@@ -1782,6 +1865,23 @@ program
|
|
|
1782
1865
|
.action(async () => {
|
|
1783
1866
|
await import('./mcp/server.js');
|
|
1784
1867
|
});
|
|
1868
|
+
// Pipe command — always JSON, no UI (agent-friendly)
|
|
1869
|
+
program
|
|
1870
|
+
.command('pipe <url>')
|
|
1871
|
+
.description('Pipe-friendly fetch (always JSON, no UI). Alias for: webpeel <url> --json --silent')
|
|
1872
|
+
.option('-r, --render', 'Use headless browser')
|
|
1873
|
+
.option('--stealth', 'Stealth mode')
|
|
1874
|
+
.option('--budget <n>', 'Token budget', parseInt)
|
|
1875
|
+
.option('--clean', 'Clean format for AI')
|
|
1876
|
+
.option('-q, --question <q>', 'Quick answer')
|
|
1877
|
+
.option('--proxy <url>', 'Proxy URL')
|
|
1878
|
+
.option('--timeout <ms>', 'Timeout in ms', parseInt)
|
|
1879
|
+
.action(async (url, opts) => {
|
|
1880
|
+
// Force JSON + silent ā always, unconditionally
|
|
1881
|
+
opts.json = true;
|
|
1882
|
+
opts.silent = true;
|
|
1883
|
+
await runFetch(url, opts);
|
|
1884
|
+
});
|
|
1785
1885
|
// Config command — webpeel config [get|set] [key] [value]
|
|
1786
1886
|
program
|
|
1787
1887
|
.command('config')
|
|
@@ -2078,69 +2178,210 @@ program
|
|
|
2078
2178
|
// Agent command - autonomous web research
|
|
2079
2179
|
program
|
|
2080
2180
|
.command('agent <prompt>')
|
|
2081
|
-
.description('
|
|
2181
|
+
.description('Web research agent ā LLM-free by default, add --llm-key for AI synthesis')
|
|
2082
2182
|
.option('--llm-key <key>', 'LLM API key (or use OPENAI_API_KEY env var)')
|
|
2083
2183
|
.option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
|
|
2084
2184
|
.option('--llm-base-url <url>', 'LLM API base URL')
|
|
2085
2185
|
.option('--urls <urls>', 'Comma-separated starting URLs')
|
|
2086
2186
|
.option('--max-pages <n>', 'Maximum pages to visit (default: 10)', '10')
|
|
2087
|
-
.option('--schema <json>', 'JSON schema for structured output')
|
|
2187
|
+
.option('--schema <json>', 'Schema template name (e.g. product, article) or JSON schema for structured output')
|
|
2088
2188
|
.option('-s, --silent', 'Silent mode (no spinner)')
|
|
2089
2189
|
.option('--json', 'Output as JSON')
|
|
2090
2190
|
.action(async (prompt, options) => {
|
|
2091
2191
|
const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2192
|
+
const urls = options.urls ? options.urls.split(',').map((u) => u.trim()) : undefined;
|
|
2193
|
+
// Parse schema (support templates)
|
|
2194
|
+
let schema;
|
|
2195
|
+
if (options.schema) {
|
|
2196
|
+
const template = getSchemaTemplate(options.schema);
|
|
2197
|
+
if (template) {
|
|
2198
|
+
schema = template.fields;
|
|
2199
|
+
}
|
|
2200
|
+
else {
|
|
2101
2201
|
try {
|
|
2102
2202
|
schema = JSON.parse(options.schema);
|
|
2103
2203
|
}
|
|
2104
2204
|
catch {
|
|
2105
|
-
console.error(
|
|
2205
|
+
console.error(`Error: --schema must be a template name (${listSchemaTemplates().join(', ')}) or valid JSON`);
|
|
2106
2206
|
process.exit(1);
|
|
2107
2207
|
}
|
|
2108
2208
|
}
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2209
|
+
}
|
|
2210
|
+
if (llmApiKey) {
|
|
2211
|
+
// Full LLM agent mode (existing code)
|
|
2212
|
+
const spinner = options.silent ? null : ora('Running agent research...').start();
|
|
2213
|
+
try {
|
|
2214
|
+
const { runAgent } = await import('./core/agent.js');
|
|
2215
|
+
const result = await runAgent({
|
|
2216
|
+
prompt,
|
|
2217
|
+
urls,
|
|
2218
|
+
schema,
|
|
2219
|
+
llmApiKey,
|
|
2220
|
+
llmModel: options.llmModel,
|
|
2221
|
+
llmApiBase: options.llmBaseUrl,
|
|
2222
|
+
maxPages: parseInt(options.maxPages, 10),
|
|
2223
|
+
onProgress: (progress) => {
|
|
2224
|
+
if (spinner)
|
|
2225
|
+
spinner.text = progress.message;
|
|
2226
|
+
},
|
|
2227
|
+
});
|
|
2228
|
+
if (spinner)
|
|
2229
|
+
spinner.succeed(`Agent finished: ${result.pagesVisited} pages`);
|
|
2230
|
+
if (options.json) {
|
|
2231
|
+
console.log(JSON.stringify(result, null, 2));
|
|
2232
|
+
}
|
|
2233
|
+
else {
|
|
2234
|
+
console.log(`\nSources (${result.sources.length}):`);
|
|
2235
|
+
result.sources.forEach(s => console.log(` ⢠${s}`));
|
|
2236
|
+
console.log(`\nResults:`);
|
|
2237
|
+
console.log(JSON.stringify(result.data, null, 2));
|
|
2238
|
+
}
|
|
2239
|
+
await cleanup();
|
|
2240
|
+
process.exit(0);
|
|
2128
2241
|
}
|
|
2129
|
-
|
|
2130
|
-
|
|
2131
|
-
|
|
2132
|
-
console.
|
|
2133
|
-
|
|
2242
|
+
catch (e) {
|
|
2243
|
+
if (spinner)
|
|
2244
|
+
spinner.fail('Agent failed');
|
|
2245
|
+
console.error(e instanceof Error ? e.message : e);
|
|
2246
|
+
await cleanup();
|
|
2247
|
+
process.exit(1);
|
|
2134
2248
|
}
|
|
2135
|
-
await cleanup();
|
|
2136
|
-
process.exit(0);
|
|
2137
2249
|
}
|
|
2138
|
-
|
|
2139
|
-
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2250
|
+
else {
|
|
2251
|
+
// LLM-free mode: search + fetch + BM25 extraction
|
|
2252
|
+
const spinner = options.silent ? null : ora('Running LLM-free research...').start();
|
|
2253
|
+
try {
|
|
2254
|
+
// Import needed modules
|
|
2255
|
+
const { quickAnswer } = await import('./core/quick-answer.js');
|
|
2256
|
+
// Step 1: Get URLs to process
|
|
2257
|
+
let targetUrls = urls || [];
|
|
2258
|
+
// If no URLs, search the web
|
|
2259
|
+
if (targetUrls.length === 0) {
|
|
2260
|
+
if (spinner)
|
|
2261
|
+
spinner.text = 'Searching the web...';
|
|
2262
|
+
try {
|
|
2263
|
+
const { getBestSearchProvider } = await import('./core/search-provider.js');
|
|
2264
|
+
const { provider, apiKey: searchApiKey } = getBestSearchProvider();
|
|
2265
|
+
const searchResults = await provider.searchWeb(prompt, {
|
|
2266
|
+
count: Math.min(parseInt(options.maxPages, 10) || 5, 10),
|
|
2267
|
+
apiKey: searchApiKey,
|
|
2268
|
+
});
|
|
2269
|
+
targetUrls = searchResults.map((r) => r.url);
|
|
2270
|
+
}
|
|
2271
|
+
catch {
|
|
2272
|
+
// Fallback: try DuckDuckGo HTML
|
|
2273
|
+
if (spinner)
|
|
2274
|
+
spinner.text = 'Searching via DuckDuckGo...';
|
|
2275
|
+
try {
|
|
2276
|
+
const duckUrl = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(prompt)}`;
|
|
2277
|
+
const searchResult = await peel(duckUrl, { budget: 4000 });
|
|
2278
|
+
// Extract URLs from search results content
|
|
2279
|
+
const urlMatches = searchResult.content.match(/https?:\/\/[^\s\)]+/g) || [];
|
|
2280
|
+
targetUrls = urlMatches
|
|
2281
|
+
.filter((u) => !u.includes('duckduckgo.com'))
|
|
2282
|
+
.slice(0, parseInt(options.maxPages, 10) || 5);
|
|
2283
|
+
}
|
|
2284
|
+
catch {
|
|
2285
|
+
// No search results
|
|
2286
|
+
}
|
|
2287
|
+
}
|
|
2288
|
+
}
|
|
2289
|
+
if (targetUrls.length === 0) {
|
|
2290
|
+
if (spinner)
|
|
2291
|
+
spinner.fail('No URLs found. Provide --urls or a more specific prompt.');
|
|
2292
|
+
process.exit(1);
|
|
2293
|
+
}
|
|
2294
|
+
if (spinner)
|
|
2295
|
+
spinner.text = `Processing ${targetUrls.length} pages...`;
|
|
2296
|
+
// Step 2: Fetch and extract from each URL
|
|
2297
|
+
const results = [];
|
|
2298
|
+
for (const url of targetUrls) {
|
|
2299
|
+
try {
|
|
2300
|
+
if (spinner)
|
|
2301
|
+
spinner.text = `Fetching: ${url.substring(0, 60)}...`;
|
|
2302
|
+
const pageResult = await peel(url, { budget: 4000 });
|
|
2303
|
+
let extracted = null;
|
|
2304
|
+
let confidence = 0;
|
|
2305
|
+
if (schema) {
|
|
2306
|
+
// Extract each schema field using smartExtractSchemaFields
|
|
2307
|
+
const { smartExtractSchemaFields: smartExtractResearch } = await import('./core/schema-postprocess.js');
|
|
2308
|
+
extracted = smartExtractResearch(pageResult.content, schema, quickAnswer, {
|
|
2309
|
+
pageTitle: pageResult.title,
|
|
2310
|
+
pageUrl: url,
|
|
2311
|
+
metadata: pageResult.metadata,
|
|
2312
|
+
});
|
|
2313
|
+
// Calculate confidence from quickAnswer for any field
|
|
2314
|
+
for (const question of Object.values(schema)) {
|
|
2315
|
+
try {
|
|
2316
|
+
const qa = quickAnswer({ content: pageResult.content, question: typeof question === 'string' ? question : '' });
|
|
2317
|
+
confidence = Math.max(confidence, qa.confidence || 0);
|
|
2318
|
+
}
|
|
2319
|
+
catch { /* ignore */ }
|
|
2320
|
+
break; // just need one confidence estimate
|
|
2321
|
+
}
|
|
2322
|
+
}
|
|
2323
|
+
else {
|
|
2324
|
+
// Answer the prompt directly
|
|
2325
|
+
try {
|
|
2326
|
+
const qa = quickAnswer({ content: pageResult.content, question: prompt });
|
|
2327
|
+
extracted = { answer: qa.answer || '' };
|
|
2328
|
+
confidence = qa.confidence || 0;
|
|
2329
|
+
}
|
|
2330
|
+
catch {
|
|
2331
|
+
extracted = null;
|
|
2332
|
+
}
|
|
2333
|
+
}
|
|
2334
|
+
results.push({
|
|
2335
|
+
url,
|
|
2336
|
+
title: pageResult.metadata?.title || url,
|
|
2337
|
+
extracted,
|
|
2338
|
+
content: pageResult.content.substring(0, 500),
|
|
2339
|
+
confidence,
|
|
2340
|
+
});
|
|
2341
|
+
}
|
|
2342
|
+
catch (e) {
|
|
2343
|
+
// Skip failed URLs
|
|
2344
|
+
if (process.env.DEBUG) {
|
|
2345
|
+
console.debug('[webpeel]', `Failed to fetch ${url}:`, e instanceof Error ? e.message : e);
|
|
2346
|
+
}
|
|
2347
|
+
}
|
|
2348
|
+
}
|
|
2349
|
+
if (spinner)
|
|
2350
|
+
spinner.succeed(`Processed ${results.length}/${targetUrls.length} pages (LLM-free)`);
|
|
2351
|
+
if (options.json) {
|
|
2352
|
+
console.log(JSON.stringify({
|
|
2353
|
+
mode: 'llm-free',
|
|
2354
|
+
prompt,
|
|
2355
|
+
schema: schema || null,
|
|
2356
|
+
results,
|
|
2357
|
+
sources: results.map(r => r.url),
|
|
2358
|
+
pagesVisited: results.length,
|
|
2359
|
+
}, null, 2));
|
|
2360
|
+
}
|
|
2361
|
+
else {
|
|
2362
|
+
console.log(`\nš Results (${results.length} pages, LLM-free):\n`);
|
|
2363
|
+
for (const r of results) {
|
|
2364
|
+
console.log(`āā ${r.title} āā`);
|
|
2365
|
+
console.log(` ${r.url}`);
|
|
2366
|
+
if (r.extracted) {
|
|
2367
|
+
for (const [k, v] of Object.entries(r.extracted)) {
|
|
2368
|
+
if (v)
|
|
2369
|
+
console.log(` ${k}: ${v}`);
|
|
2370
|
+
}
|
|
2371
|
+
}
|
|
2372
|
+
console.log(` Confidence: ${(r.confidence * 100).toFixed(0)}%\n`);
|
|
2373
|
+
}
|
|
2374
|
+
}
|
|
2375
|
+
await cleanup();
|
|
2376
|
+
process.exit(0);
|
|
2377
|
+
}
|
|
2378
|
+
catch (e) {
|
|
2379
|
+
if (spinner)
|
|
2380
|
+
spinner.fail('Research failed');
|
|
2381
|
+
console.error(e instanceof Error ? e.message : e);
|
|
2382
|
+
await cleanup();
|
|
2383
|
+
process.exit(1);
|
|
2384
|
+
}
|
|
2144
2385
|
}
|
|
2145
2386
|
});
|
|
2146
2387
|
// ── Jobs command group ─────────────────────────────────────────────────────
|
|
@@ -3403,6 +3644,20 @@ program
|
|
|
3403
3644
|
process.exit(1);
|
|
3404
3645
|
}
|
|
3405
3646
|
});
|
|
3647
|
+
// Schema templates listing command
|
|
3648
|
+
program
|
|
3649
|
+
.command('schemas')
|
|
3650
|
+
.description('List available extraction schema templates')
|
|
3651
|
+
.action(() => {
|
|
3652
|
+
console.log('\nAvailable schema templates:\n');
|
|
3653
|
+
for (const [key, template] of Object.entries(SCHEMA_TEMPLATES)) {
|
|
3654
|
+
console.log(` ${key.padEnd(12)} ${template.description}`);
|
|
3655
|
+
console.log(` ${''.padEnd(12)} Fields: ${Object.keys(template.fields).join(', ')}`);
|
|
3656
|
+
console.log('');
|
|
3657
|
+
}
|
|
3658
|
+
console.log('Usage: webpeel "https://example.com" --schema product');
|
|
3659
|
+
console.log(' webpeel "https://example.com" --schema \'{"field":"description"}\'');
|
|
3660
|
+
});
|
|
3406
3661
|
program.parse();
|
|
3407
3662
|
// ============================================================
|
|
3408
3663
|
// Time formatting helper
|
|
@@ -3598,6 +3853,8 @@ async function outputResult(result, options, extra = {}) {
|
|
|
3598
3853
|
output.focusQuery = result.focusQuery;
|
|
3599
3854
|
if (result.focusReduction)
|
|
3600
3855
|
output.focusReduction = result.focusReduction;
|
|
3856
|
+
if (result.extracted)
|
|
3857
|
+
output.extracted = result.extracted;
|
|
3601
3858
|
if (extra.cached)
|
|
3602
3859
|
output.cached = true;
|
|
3603
3860
|
if (extra.truncated)
|