webpeel 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -4
- package/dist/cli-auth.d.ts +6 -0
- package/dist/cli-auth.d.ts.map +1 -1
- package/dist/cli-auth.js.map +1 -1
- package/dist/cli.js +506 -23
- package/dist/cli.js.map +1 -1
- package/dist/core/challenge-detection.d.ts.map +1 -1
- package/dist/core/challenge-detection.js +39 -6
- package/dist/core/challenge-detection.js.map +1 -1
- package/dist/core/extract-listings.d.ts.map +1 -1
- package/dist/core/extract-listings.js +167 -36
- package/dist/core/extract-listings.js.map +1 -1
- package/dist/core/fetcher.d.ts +14 -1
- package/dist/core/fetcher.d.ts.map +1 -1
- package/dist/core/fetcher.js +176 -14
- package/dist/core/fetcher.js.map +1 -1
- package/dist/core/hotel-search.d.ts +123 -0
- package/dist/core/hotel-search.d.ts.map +1 -0
- package/dist/core/hotel-search.js +383 -0
- package/dist/core/hotel-search.js.map +1 -0
- package/dist/core/llm-extract.d.ts +56 -0
- package/dist/core/llm-extract.d.ts.map +1 -0
- package/dist/core/llm-extract.js +264 -0
- package/dist/core/llm-extract.js.map +1 -0
- package/dist/core/profiles.d.ts +48 -0
- package/dist/core/profiles.d.ts.map +1 -0
- package/dist/core/profiles.js +211 -0
- package/dist/core/profiles.js.map +1 -0
- package/dist/core/schema-extraction.d.ts +67 -0
- package/dist/core/schema-extraction.d.ts.map +1 -0
- package/dist/core/schema-extraction.js +353 -0
- package/dist/core/schema-extraction.js.map +1 -0
- package/dist/core/strategies.d.ts +11 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +17 -5
- package/dist/core/strategies.js.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -1
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +47 -3
- package/dist/mcp/server.js.map +1 -1
- package/dist/types.d.ts +16 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -14,7 +14,8 @@
|
|
|
14
14
|
*/
|
|
15
15
|
import { Command } from 'commander';
|
|
16
16
|
import ora from 'ora';
|
|
17
|
-
import { writeFileSync, readFileSync } from 'fs';
|
|
17
|
+
import { writeFileSync, readFileSync, existsSync } from 'fs';
|
|
18
|
+
import { getProfilePath, loadStorageState, touchProfile, listProfiles, deleteProfile, createProfile } from './core/profiles.js';
|
|
18
19
|
import { peel, peelBatch, cleanup } from './index.js';
|
|
19
20
|
import { checkUsage, showUsageFooter, handleLogin, handleLogout, handleUsage, loadConfig, saveConfig } from './cli-auth.js';
|
|
20
21
|
import { getCache, setCache, parseTTL, clearCache, cacheStats } from './cache.js';
|
|
@@ -130,6 +131,7 @@ program
|
|
|
130
131
|
.argument('[url]', 'URL to fetch')
|
|
131
132
|
.option('-r, --render', 'Use headless browser (for JS-heavy sites)')
|
|
132
133
|
.option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
|
|
134
|
+
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
133
135
|
.option('-w, --wait <ms>', 'Wait time after page load (ms)', parseInt)
|
|
134
136
|
.option('--html', 'Output raw HTML instead of markdown')
|
|
135
137
|
.option('--text', 'Output plain text instead of markdown')
|
|
@@ -154,14 +156,19 @@ program
|
|
|
154
156
|
.option('--raw', 'Return full page without smart content extraction')
|
|
155
157
|
.option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
|
|
156
158
|
.option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
|
|
157
|
-
.option('--llm-extract
|
|
159
|
+
.option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
|
|
160
|
+
.option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
|
|
158
161
|
.option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
|
|
162
|
+
.option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
|
|
163
|
+
.option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
|
|
159
164
|
.option('--summary', 'Generate AI summary of content (requires --llm-key or OPENAI_API_KEY)')
|
|
160
165
|
.option('--location <country>', 'ISO country code for geo-targeting (e.g., "US", "DE", "JP")')
|
|
161
166
|
.option('--language <lang>', 'Language preference (e.g., "en", "de", "ja")')
|
|
162
167
|
.option('--max-tokens <n>', 'Maximum token count for output (truncate if exceeded)', parseInt)
|
|
163
168
|
.option('--budget <n>', 'Smart token budget — distill content to fit within N tokens (heuristic, no LLM key needed)', parseInt)
|
|
164
169
|
.option('--extract-all', 'Auto-detect and extract repeated listing items (e.g., search results)')
|
|
170
|
+
.option('--schema <name>', 'Force a specific extraction schema by name or domain (e.g., "booking.com", "amazon")')
|
|
171
|
+
.option('--list-schemas', 'List all available extraction schemas and their supported domains')
|
|
165
172
|
.option('--scroll-extract [count]', 'Scroll page N times to load lazy content, then extract (implies --render)', (v) => parseInt(v, 10))
|
|
166
173
|
.option('--csv', 'Output extraction results as CSV')
|
|
167
174
|
.option('--table', 'Output extraction results as a formatted table')
|
|
@@ -182,6 +189,31 @@ program
|
|
|
182
189
|
options.budget = 4000;
|
|
183
190
|
}
|
|
184
191
|
const isJson = options.json;
|
|
192
|
+
// --- --list-schemas: print all available schemas and exit ---
|
|
193
|
+
if (options.listSchemas) {
|
|
194
|
+
const { loadBundledSchemas } = await import('./core/schema-extraction.js');
|
|
195
|
+
const schemas = loadBundledSchemas();
|
|
196
|
+
if (isJson) {
|
|
197
|
+
await writeStdout(JSON.stringify(schemas.map(s => ({
|
|
198
|
+
name: s.name,
|
|
199
|
+
version: s.version,
|
|
200
|
+
domains: s.domains,
|
|
201
|
+
urlPatterns: s.urlPatterns,
|
|
202
|
+
})), null, 2) + '\n');
|
|
203
|
+
}
|
|
204
|
+
else {
|
|
205
|
+
console.log(`\nAvailable extraction schemas (${schemas.length}):\n`);
|
|
206
|
+
for (const s of schemas) {
|
|
207
|
+
console.log(` ${s.name} (v${s.version})`);
|
|
208
|
+
console.log(` Domains: ${s.domains.join(', ')}`);
|
|
209
|
+
if (s.urlPatterns && s.urlPatterns.length > 0) {
|
|
210
|
+
console.log(` URL patterns: ${s.urlPatterns.join(', ')}`);
|
|
211
|
+
}
|
|
212
|
+
console.log('');
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
process.exit(0);
|
|
216
|
+
}
|
|
185
217
|
// --- #5: Concise error for missing URL (no help dump) ---
|
|
186
218
|
if (!url || url.trim() === '') {
|
|
187
219
|
if (isJson) {
|
|
@@ -265,6 +297,49 @@ program
|
|
|
265
297
|
cachedResult.content = distillToBudget(cachedResult.content, options.budget, fmt);
|
|
266
298
|
cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
|
|
267
299
|
}
|
|
300
|
+
// LLM extraction from cached content
|
|
301
|
+
if (options.llmExtract || options.extractSchema) {
|
|
302
|
+
const { extractWithLLM } = await import('./core/llm-extract.js');
|
|
303
|
+
const llmCfgCached = loadConfig();
|
|
304
|
+
const llmApiKeyCached = options.llmKey || llmCfgCached.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
305
|
+
if (!llmApiKeyCached) {
|
|
306
|
+
console.error('Error: LLM extraction requires an API key.\nSet OPENAI_API_KEY environment variable or use --llm-key <key>');
|
|
307
|
+
process.exit(1);
|
|
308
|
+
}
|
|
309
|
+
const llmModelCached = options.llmModel || llmCfgCached.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
|
|
310
|
+
const llmBaseUrlCached = options.llmBaseUrl || llmCfgCached.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
|
|
311
|
+
const llmInstructionCached = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
|
|
312
|
+
// Parse schema if provided
|
|
313
|
+
let llmSchemaCached;
|
|
314
|
+
if (options.extractSchema) {
|
|
315
|
+
let schemaStr = options.extractSchema;
|
|
316
|
+
if (schemaStr.startsWith('@')) {
|
|
317
|
+
schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
|
|
318
|
+
}
|
|
319
|
+
try {
|
|
320
|
+
llmSchemaCached = JSON.parse(schemaStr);
|
|
321
|
+
}
|
|
322
|
+
catch {
|
|
323
|
+
console.error('Error: --extract-schema must be valid JSON or a valid @file.json path');
|
|
324
|
+
process.exit(1);
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
const llmResultCached = await extractWithLLM({
|
|
328
|
+
content: cachedResult.content,
|
|
329
|
+
instruction: llmInstructionCached,
|
|
330
|
+
schema: llmSchemaCached,
|
|
331
|
+
apiKey: llmApiKeyCached,
|
|
332
|
+
model: llmModelCached,
|
|
333
|
+
baseUrl: llmBaseUrlCached,
|
|
334
|
+
});
|
|
335
|
+
await writeStdout(JSON.stringify(llmResultCached.items, null, 2) + '\n');
|
|
336
|
+
if (!options.silent) {
|
|
337
|
+
const { input, output } = llmResultCached.tokensUsed;
|
|
338
|
+
const costStr = llmResultCached.cost !== undefined ? ` | Est. cost: $${llmResultCached.cost.toFixed(6)}` : '';
|
|
339
|
+
console.error(`\n🤖 LLM extraction: ${llmResultCached.items.length} items | ${input} input + ${output} output tokens${costStr} | model: ${llmResultCached.model}`);
|
|
340
|
+
}
|
|
341
|
+
process.exit(0);
|
|
342
|
+
}
|
|
268
343
|
await outputResult(cachedResult, options, { cached: true });
|
|
269
344
|
process.exit(0);
|
|
270
345
|
}
|
|
@@ -299,19 +374,22 @@ program
|
|
|
299
374
|
throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
|
|
300
375
|
}
|
|
301
376
|
}
|
|
377
|
+
// --extract-schema auto-enables JSON output
|
|
378
|
+
if (options.extractSchema) {
|
|
379
|
+
options.json = true;
|
|
380
|
+
}
|
|
302
381
|
// Parse extract
|
|
303
382
|
let extract;
|
|
304
|
-
if (options.llmExtract) {
|
|
305
|
-
// LLM-based extraction
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
if (!extract.llmApiKey) {
|
|
313
|
-
throw Object.assign(new Error('--llm-extract requires OPENAI_API_KEY environment variable'), { _code: 'FETCH_FAILED' });
|
|
383
|
+
if (options.llmExtract || options.extractSchema) {
|
|
384
|
+
// LLM-based extraction is handled post-fetch (after peel returns markdown).
|
|
385
|
+
// Early-validate that an API key is available so we fail fast.
|
|
386
|
+
const llmCfg = loadConfig();
|
|
387
|
+
const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
388
|
+
if (!llmApiKey) {
|
|
389
|
+
throw Object.assign(new Error('LLM extraction requires an API key.\n' +
|
|
390
|
+
'Set OPENAI_API_KEY environment variable or use --llm-key <key>'), { _code: 'FETCH_FAILED' });
|
|
314
391
|
}
|
|
392
|
+
// Do NOT set extract here — peel runs normally, LLM extraction happens below.
|
|
315
393
|
}
|
|
316
394
|
else if (options.extract) {
|
|
317
395
|
// CSS-based extraction
|
|
@@ -351,6 +429,26 @@ program
|
|
|
351
429
|
locationOptions.languages = [options.language];
|
|
352
430
|
}
|
|
353
431
|
}
|
|
432
|
+
// ── Resolve --profile: name → path + storage state ─────────────────
|
|
433
|
+
let resolvedProfileDir;
|
|
434
|
+
let resolvedStorageState;
|
|
435
|
+
let resolvedProfileName;
|
|
436
|
+
if (options.profile) {
|
|
437
|
+
const profilePath = getProfilePath(options.profile);
|
|
438
|
+
if (profilePath) {
|
|
439
|
+
// It's a named profile in ~/.webpeel/profiles/
|
|
440
|
+
resolvedProfileDir = profilePath;
|
|
441
|
+
resolvedStorageState = loadStorageState(options.profile) ?? undefined;
|
|
442
|
+
resolvedProfileName = options.profile;
|
|
443
|
+
}
|
|
444
|
+
else if (existsSync(options.profile)) {
|
|
445
|
+
// It's a raw directory path (backward compat)
|
|
446
|
+
resolvedProfileDir = options.profile;
|
|
447
|
+
}
|
|
448
|
+
else {
|
|
449
|
+
exitWithJsonError(`Profile "${options.profile}" not found. Run "webpeel profile list" to see available profiles.`, 'PROFILE_NOT_FOUND');
|
|
450
|
+
}
|
|
451
|
+
}
|
|
354
452
|
// Build peel options
|
|
355
453
|
// --stealth auto-enables --render (stealth requires browser)
|
|
356
454
|
// --action auto-enables --render (actions require browser)
|
|
@@ -391,8 +489,10 @@ program
|
|
|
391
489
|
extract,
|
|
392
490
|
images: options.images || false,
|
|
393
491
|
location: locationOptions,
|
|
394
|
-
profileDir:
|
|
492
|
+
profileDir: resolvedProfileDir,
|
|
395
493
|
headed: options.headed || false,
|
|
494
|
+
storageState: resolvedStorageState,
|
|
495
|
+
proxy: options.proxy,
|
|
396
496
|
};
|
|
397
497
|
// Add summary option if requested
|
|
398
498
|
if (options.summary) {
|
|
@@ -419,6 +519,10 @@ program
|
|
|
419
519
|
}
|
|
420
520
|
// Fetch the page
|
|
421
521
|
const result = await peel(url, peelOptions);
|
|
522
|
+
// Update lastUsed timestamp for named profiles
|
|
523
|
+
if (resolvedProfileName) {
|
|
524
|
+
touchProfile(resolvedProfileName);
|
|
525
|
+
}
|
|
422
526
|
if (spinner) {
|
|
423
527
|
spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method`);
|
|
424
528
|
}
|
|
@@ -477,19 +581,92 @@ program
|
|
|
477
581
|
console.error(`⚠ ${warningMsg}`);
|
|
478
582
|
}
|
|
479
583
|
}
|
|
584
|
+
// --- LLM-based extraction (post-peel) ---
|
|
585
|
+
if (options.llmExtract || options.extractSchema) {
|
|
586
|
+
const { extractWithLLM } = await import('./core/llm-extract.js');
|
|
587
|
+
const llmCfg = loadConfig();
|
|
588
|
+
const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
589
|
+
const llmModel = options.llmModel || llmCfg.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
|
|
590
|
+
const llmBaseUrl = options.llmBaseUrl || llmCfg.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
|
|
591
|
+
const llmInstruction = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
|
|
592
|
+
// Parse --extract-schema if provided
|
|
593
|
+
let llmSchema;
|
|
594
|
+
if (options.extractSchema) {
|
|
595
|
+
let schemaStr = options.extractSchema;
|
|
596
|
+
if (schemaStr.startsWith('@')) {
|
|
597
|
+
schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
|
|
598
|
+
}
|
|
599
|
+
try {
|
|
600
|
+
llmSchema = JSON.parse(schemaStr);
|
|
601
|
+
}
|
|
602
|
+
catch {
|
|
603
|
+
exitWithJsonError('--extract-schema must be valid JSON or a valid @file.json path', 'FETCH_FAILED');
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
const llmResult = await extractWithLLM({
|
|
607
|
+
content: result.content,
|
|
608
|
+
instruction: llmInstruction,
|
|
609
|
+
schema: llmSchema,
|
|
610
|
+
apiKey: llmApiKey,
|
|
611
|
+
model: llmModel,
|
|
612
|
+
baseUrl: llmBaseUrl,
|
|
613
|
+
});
|
|
614
|
+
// Output structured items as JSON
|
|
615
|
+
await writeStdout(JSON.stringify(llmResult.items, null, 2) + '\n');
|
|
616
|
+
// Show token usage and estimated cost
|
|
617
|
+
if (!options.silent) {
|
|
618
|
+
const { input, output } = llmResult.tokensUsed;
|
|
619
|
+
const costStr = llmResult.cost !== undefined
|
|
620
|
+
? ` | Est. cost: $${llmResult.cost.toFixed(6)}`
|
|
621
|
+
: '';
|
|
622
|
+
console.error(`\n🤖 LLM extraction: ${llmResult.items.length} items | ${input} input + ${output} output tokens${costStr} | model: ${llmResult.model}`);
|
|
623
|
+
}
|
|
624
|
+
await cleanup();
|
|
625
|
+
process.exit(0);
|
|
626
|
+
}
|
|
480
627
|
// --- Extract-all / pagination / output formatting ---
|
|
481
628
|
const wantsExtractAll = options.extractAll || options.scrollExtract !== undefined;
|
|
482
629
|
const pagesCount = Math.min(Math.max(options.pages || 1, 1), 10);
|
|
483
630
|
if (wantsExtractAll) {
|
|
484
631
|
const { extractListings } = await import('./core/extract-listings.js');
|
|
485
632
|
const { findNextPageUrl } = await import('./core/paginate.js');
|
|
633
|
+
const { findSchemaForUrl, extractWithSchema, loadBundledSchemas } = await import('./core/schema-extraction.js');
|
|
634
|
+
// Resolve which schema to use (explicit --schema flag or auto-detect)
|
|
635
|
+
let activeSchema = null;
|
|
636
|
+
if (options.schema) {
|
|
637
|
+
// Find schema by name or domain match
|
|
638
|
+
const schemaQuery = options.schema.toLowerCase();
|
|
639
|
+
const allSchemas = loadBundledSchemas();
|
|
640
|
+
activeSchema = allSchemas.find(s => s.name.toLowerCase().includes(schemaQuery) ||
|
|
641
|
+
s.domains.some(d => d.toLowerCase().includes(schemaQuery))) ?? null;
|
|
642
|
+
if (!activeSchema && !options.silent) {
|
|
643
|
+
console.error(`Warning: No schema found for "${options.schema}", falling back to auto-detection`);
|
|
644
|
+
}
|
|
645
|
+
}
|
|
646
|
+
else {
|
|
647
|
+
// Auto-detect from URL
|
|
648
|
+
activeSchema = findSchemaForUrl(result.url || url);
|
|
649
|
+
}
|
|
486
650
|
// We need the raw HTML for extraction. Re-fetch with format=html if needed.
|
|
487
651
|
let allListings = [];
|
|
488
652
|
// Fetch HTML for extraction
|
|
489
653
|
const htmlResult = peelOptions.format === 'html'
|
|
490
654
|
? result
|
|
491
655
|
: await peel(url, { ...peelOptions, format: 'html', maxTokens: undefined });
|
|
492
|
-
|
|
656
|
+
// Try schema extraction first, fall back to generic
|
|
657
|
+
if (activeSchema) {
|
|
658
|
+
const schemaListings = extractWithSchema(htmlResult.content, activeSchema, result.url);
|
|
659
|
+
if (schemaListings.length > 0) {
|
|
660
|
+
allListings.push(...schemaListings);
|
|
661
|
+
}
|
|
662
|
+
else {
|
|
663
|
+
// Schema returned nothing — fall back to generic
|
|
664
|
+
allListings.push(...extractListings(htmlResult.content, result.url));
|
|
665
|
+
}
|
|
666
|
+
}
|
|
667
|
+
else {
|
|
668
|
+
allListings.push(...extractListings(htmlResult.content, result.url));
|
|
669
|
+
}
|
|
493
670
|
// Pagination: follow "Next" links
|
|
494
671
|
if (pagesCount > 1) {
|
|
495
672
|
let currentHtml = htmlResult.content;
|
|
@@ -500,7 +677,16 @@ program
|
|
|
500
677
|
break;
|
|
501
678
|
try {
|
|
502
679
|
const nextResult = await peel(nextUrl, { ...peelOptions, format: 'html', maxTokens: undefined });
|
|
503
|
-
|
|
680
|
+
let pageListings;
|
|
681
|
+
if (activeSchema) {
|
|
682
|
+
const schemaPage = extractWithSchema(nextResult.content, activeSchema, nextResult.url);
|
|
683
|
+
pageListings = schemaPage.length > 0
|
|
684
|
+
? schemaPage
|
|
685
|
+
: extractListings(nextResult.content, nextResult.url);
|
|
686
|
+
}
|
|
687
|
+
else {
|
|
688
|
+
pageListings = extractListings(nextResult.content, nextResult.url);
|
|
689
|
+
}
|
|
504
690
|
allListings.push(...pageListings);
|
|
505
691
|
currentHtml = nextResult.content;
|
|
506
692
|
currentUrl = nextResult.url;
|
|
@@ -658,7 +844,18 @@ program
|
|
|
658
844
|
.option('--csv', 'Output site-search results as CSV (requires --site)')
|
|
659
845
|
.option('--budget <n>', 'Token budget for site-search result content', parseInt)
|
|
660
846
|
.option('-s, --silent', 'Silent mode')
|
|
847
|
+
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
848
|
+
.option('--agent', 'Agent mode: sets --json, --silent, and --budget 4000 (override with --budget N)')
|
|
661
849
|
.action(async (query, options) => {
|
|
850
|
+
// --agent sets sensible defaults for AI agents; explicit flags override
|
|
851
|
+
if (options.agent) {
|
|
852
|
+
if (!options.json)
|
|
853
|
+
options.json = true;
|
|
854
|
+
if (!options.silent)
|
|
855
|
+
options.silent = true;
|
|
856
|
+
if (options.budget === undefined)
|
|
857
|
+
options.budget = 4000;
|
|
858
|
+
}
|
|
662
859
|
const isJson = options.json;
|
|
663
860
|
const isSilent = options.silent;
|
|
664
861
|
// --top overrides --count when both are provided
|
|
@@ -679,6 +876,7 @@ program
|
|
|
679
876
|
const htmlResult = await peel(siteResult.url, {
|
|
680
877
|
format: 'html',
|
|
681
878
|
timeout: 30000,
|
|
879
|
+
proxy: options.proxy,
|
|
682
880
|
});
|
|
683
881
|
if (spinner) {
|
|
684
882
|
spinner.succeed(`Fetched ${siteResult.site} in ${htmlResult.elapsed}ms`);
|
|
@@ -773,10 +971,24 @@ program
|
|
|
773
971
|
|| config.braveApiKey
|
|
774
972
|
|| undefined;
|
|
775
973
|
const provider = getSearchProvider(providerId);
|
|
776
|
-
|
|
974
|
+
let results = await provider.searchWeb(query, {
|
|
777
975
|
count: Math.min(Math.max(count, 1), 10),
|
|
778
976
|
apiKey,
|
|
779
977
|
});
|
|
978
|
+
// Apply budget to search results if requested (trim results to fit token budget)
|
|
979
|
+
if (options.budget && options.budget > 0 && results.length > 0) {
|
|
980
|
+
let totalTokens = 0;
|
|
981
|
+
let maxResults = 0;
|
|
982
|
+
for (const r of results) {
|
|
983
|
+
// Estimate ~4 chars per token for title + url + snippet
|
|
984
|
+
const resultTokens = Math.ceil((`${r.title || ''}\n${r.url || ''}\n${r.snippet || ''}`).length / 4);
|
|
985
|
+
if (totalTokens + resultTokens > options.budget)
|
|
986
|
+
break;
|
|
987
|
+
totalTokens += resultTokens;
|
|
988
|
+
maxResults++;
|
|
989
|
+
}
|
|
990
|
+
results = results.slice(0, Math.max(maxResults, 1));
|
|
991
|
+
}
|
|
780
992
|
if (spinner) {
|
|
781
993
|
spinner.succeed(`Found ${results.length} results (${providerId})`);
|
|
782
994
|
}
|
|
@@ -1370,24 +1582,52 @@ program
|
|
|
1370
1582
|
program
|
|
1371
1583
|
.command('config')
|
|
1372
1584
|
.description('View or update CLI configuration')
|
|
1373
|
-
.argument('[action]', '"get <key>", "set <key> <value>", or omit for overview')
|
|
1585
|
+
.argument('[action]', '"list", "get <key>", "set <key> <value>", or omit for overview')
|
|
1374
1586
|
.argument('[key]', 'Config key')
|
|
1375
1587
|
.argument('[value]', 'Value to set')
|
|
1376
1588
|
.action(async (action, key, value) => {
|
|
1377
1589
|
const config = loadConfig();
|
|
1378
1590
|
// Settable config keys (safe for user modification)
|
|
1591
|
+
// Supports dot-notation for nested keys (e.g., llm.apiKey)
|
|
1379
1592
|
const SETTABLE_KEYS = {
|
|
1380
1593
|
braveApiKey: 'Brave Search API key',
|
|
1594
|
+
'llm.apiKey': 'LLM API key for AI-powered extraction (OpenAI-compatible)',
|
|
1595
|
+
'llm.model': 'LLM model name (default: gpt-4o-mini)',
|
|
1596
|
+
'llm.baseUrl': 'LLM API base URL (default: https://api.openai.com/v1)',
|
|
1381
1597
|
};
|
|
1382
1598
|
const maskSecret = (k, v) => {
|
|
1383
1599
|
if (!v)
|
|
1384
1600
|
return '(not set)';
|
|
1385
|
-
if (k === 'apiKey' || k === 'braveApiKey')
|
|
1601
|
+
if (k === 'apiKey' || k === 'braveApiKey' || k === 'llm.apiKey') {
|
|
1386
1602
|
return v.slice(0, 4) + '...' + v.slice(-4);
|
|
1603
|
+
}
|
|
1387
1604
|
return String(v);
|
|
1388
1605
|
};
|
|
1389
|
-
|
|
1390
|
-
|
|
1606
|
+
/**
 * Read a possibly nested value using dot-notation (e.g. "llm.apiKey").
 *
 * Only own properties are followed. Without that restriction a path such as
 * "llm.constructor" or "llm.toString" would resolve to a built-in inherited
 * from Object.prototype and be reported as if it were a stored setting.
 *
 * @param {object} obj - Root object to read from.
 * @param {string} path - Dot-separated key path.
 * @returns {*} The value stored at `path`, or `undefined` when any segment
 *   is missing or an intermediate value is not an object.
 */
function getNestedValue(obj, path) {
    let cur = obj;
    for (const part of path.split('.')) {
        // Cannot descend into null/undefined or primitives.
        if (cur == null || typeof cur !== 'object') {
            return undefined;
        }
        // Ignore anything that only exists on the prototype chain.
        if (!Object.prototype.hasOwnProperty.call(cur, part)) {
            return undefined;
        }
        cur = cur[part];
    }
    return cur;
}
|
|
1617
|
+
/**
 * Write a possibly nested value using dot-notation (e.g. "llm.apiKey").
 *
 * Missing or non-object intermediate segments are replaced with fresh empty
 * objects so the final assignment always has a container to land in.
 *
 * Segments named "__proto__", "constructor" or "prototype" are rejected
 * before anything is mutated: following "__proto__" would otherwise walk into
 * Object.prototype and write the value onto every object in the process.
 *
 * @param {object} obj - Root object to mutate in place.
 * @param {string} path - Dot-separated key path.
 * @param {*} val - Value to store at `path`.
 * @returns {void}
 * @throws {Error} If any path segment is one of the unsafe names above.
 */
function setNestedValue(obj, path, val) {
    const parts = path.split('.');
    // Validate the whole path first so a rejected key leaves `obj` untouched.
    for (const part of parts) {
        if (part === '__proto__' || part === 'constructor' || part === 'prototype') {
            throw new Error(`Invalid config key "${path}": "${part}" is not allowed`);
        }
    }
    let cur = obj;
    for (const part of parts.slice(0, -1)) {
        // Create (or overwrite a scalar with) a container for the next level.
        if (cur[part] == null || typeof cur[part] !== 'object') {
            cur[part] = {};
        }
        cur = cur[part];
    }
    cur[parts[parts.length - 1]] = val;
}
|
|
1629
|
+
if (!action || action === 'list') {
|
|
1630
|
+
// Show all config (also triggered by `webpeel config list`)
|
|
1391
1631
|
console.log('WebPeel CLI Configuration');
|
|
1392
1632
|
console.log(` Config file: ~/.webpeel/config.json`);
|
|
1393
1633
|
console.log('');
|
|
@@ -1395,6 +1635,11 @@ program
|
|
|
1395
1635
|
console.log(` braveApiKey: ${maskSecret('braveApiKey', config.braveApiKey)}`);
|
|
1396
1636
|
console.log(` planTier: ${config.planTier || 'free'}`);
|
|
1397
1637
|
console.log(` anonymousUsage: ${config.anonymousUsage}`);
|
|
1638
|
+
console.log('');
|
|
1639
|
+
console.log(' LLM:');
|
|
1640
|
+
console.log(` llm.apiKey: ${maskSecret('llm.apiKey', config.llm?.apiKey)}`);
|
|
1641
|
+
console.log(` llm.model: ${config.llm?.model || '(not set, default: gpt-4o-mini)'}`);
|
|
1642
|
+
console.log(` llm.baseUrl: ${config.llm?.baseUrl || '(not set, default: https://api.openai.com/v1)'}`);
|
|
1398
1643
|
const stats = cacheStats();
|
|
1399
1644
|
console.log('');
|
|
1400
1645
|
console.log(' Cache:');
|
|
@@ -1420,14 +1665,14 @@ program
|
|
|
1420
1665
|
console.error(`Usage: webpeel config set ${key} <value>`);
|
|
1421
1666
|
process.exit(1);
|
|
1422
1667
|
}
|
|
1423
|
-
config
|
|
1668
|
+
setNestedValue(config, key, value);
|
|
1424
1669
|
saveConfig(config);
|
|
1425
1670
|
console.log(`✓ ${key} saved`);
|
|
1426
1671
|
process.exit(0);
|
|
1427
1672
|
}
|
|
1428
1673
|
if (action === 'get') {
|
|
1429
1674
|
const lookupKey = key || '';
|
|
1430
|
-
const val = config[lookupKey];
|
|
1675
|
+
const val = getNestedValue(config, lookupKey) ?? config[lookupKey];
|
|
1431
1676
|
if (val !== undefined) {
|
|
1432
1677
|
console.log(maskSecret(lookupKey, String(val)));
|
|
1433
1678
|
}
|
|
@@ -1438,7 +1683,7 @@ program
|
|
|
1438
1683
|
process.exit(0);
|
|
1439
1684
|
}
|
|
1440
1685
|
// Legacy: `webpeel config <key>` — treat action as the key name
|
|
1441
|
-
const val = config[action];
|
|
1686
|
+
const val = getNestedValue(config, action) ?? config[action];
|
|
1442
1687
|
if (val !== undefined) {
|
|
1443
1688
|
console.log(maskSecret(action, String(val)));
|
|
1444
1689
|
}
|
|
@@ -2635,6 +2880,244 @@ applyCmd
|
|
|
2635
2880
|
process.exit(1);
|
|
2636
2881
|
}
|
|
2637
2882
|
});
|
|
2883
|
+
// ============================================================
|
|
2884
|
+
// Profile management commands
|
|
2885
|
+
// ============================================================
|
|
2886
|
+
// Parent `profile` command; the sub-commands below hang off this object.
const profileCmd = program.command('profile');
profileCmd.description('Manage named browser profiles (saved login sessions)');
// `webpeel profile create <name>`: hands off to createProfile and maps the
// outcome to the process exit code (0 on success, 1 on any thrown error).
profileCmd
    .command('create <name>')
    .description('Create a new profile interactively (launches browser, log in, press Ctrl+C when done)')
    .option('--description <text>', 'Optional description for this profile')
    .action(async (name, opts) => {
        try {
            const { description } = opts;
            await createProfile(name, description);
            process.exit(0);
        }
        catch (failure) {
            // Non-Error throwables are stringified so something is always printed.
            const reason = failure instanceof Error ? failure.message : String(failure);
            console.error(`Error: ${reason}`);
            process.exit(1);
        }
    });
|
|
2903
|
+
// `webpeel profile list`: prints a fixed-width table of saved profiles, or a
// short getting-started hint when none exist. Always exits with status 0.
profileCmd
    .command('list')
    .description('List all saved browser profiles')
    .action(() => {
        const profiles = listProfiles();
        if (profiles.length === 0) {
            const hint = [
                'No profiles found.',
                '',
                'Create one with:',
                ' webpeel profile create <name>',
                '',
                'Then use it with:',
                ' webpeel <url> --profile <name>',
            ];
            for (const line of hint) {
                console.log(line);
            }
            process.exit(0);
        }
        for (const line of ['', 'Saved profiles:', '']) {
            console.log(line);
        }
        // Column widths: at least as wide as the heading, grown to the longest cell.
        const nameW = profiles.reduce((width, p) => Math.max(width, p.name.length), 8);
        const domainsW = profiles.reduce((width, p) => Math.max(width, (p.domains.join(', ') || '(none)').length), 10);
        const header = [
            'Name'.padEnd(nameW),
            'Domains'.padEnd(domainsW),
            'Last Used'.padEnd(12),
            'Created',
        ].join(' ');
        console.log(header);
        console.log('─'.repeat(header.length + 4));
        profiles.forEach((p) => {
            const domainsStr = p.domains.length > 0 ? p.domains.join(', ') : '(none)';
            const lastUsed = formatRelativeTime(new Date(p.lastUsed));
            // Date part only (YYYY-MM-DD) of the ISO timestamp.
            const created = new Date(p.created).toISOString().split('T')[0];
            const cells = [
                p.name.padEnd(nameW),
                domainsStr.padEnd(domainsW),
                lastUsed.padEnd(12),
                created,
            ];
            console.log(cells.join(' '));
        });
        console.log('');
        process.exit(0);
    });
|
|
2942
|
+
// `webpeel profile show <name>`: resolves the profile directory, reads its
// metadata.json and prints the fields. Exits 1 when the profile is unknown
// or the metadata cannot be read or parsed, 0 otherwise.
profileCmd
    .command('show <name>')
    .description('Show details for a profile')
    .action((name) => {
        const profilePath = getProfilePath(name);
        if (!profilePath) {
            console.error(`Error: Profile "${name}" not found.`);
            console.error('Run "webpeel profile list" to see available profiles.');
            process.exit(1);
        }
        try {
            const rawMetadata = readFileSync(`${profilePath}/metadata.json`, 'utf-8');
            const metadata = JSON.parse(rawMetadata);
            console.log('');
            console.log(`Profile: ${metadata.name}`);
            // Description is optional, so the line is omitted when it is empty.
            if (metadata.description) {
                console.log(`Description: ${metadata.description}`);
            }
            const createdAt = new Date(metadata.created);
            console.log(`Created: ${createdAt.toLocaleString()}`);
            const lastUsedAt = new Date(metadata.lastUsed);
            console.log(`Last used: ${lastUsedAt.toLocaleString()}`);
            const domainList = metadata.domains.length > 0 ? metadata.domains.join(', ') : '(none)';
            console.log(`Domains: ${domainList}`);
            console.log(`Directory: ${profilePath}`);
            console.log('');
            process.exit(0);
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            console.error(`Error reading profile: ${reason}`);
            process.exit(1);
        }
    });
|
|
2970
|
+
// `webpeel profile delete <name>`: removes a saved profile via deleteProfile.
// Exit status is 0 when deleteProfile reports success and 1 when it does not.
profileCmd
    .command('delete <name>')
    .description('Delete a saved profile')
    .action((name) => {
        const wasRemoved = deleteProfile(name);
        if (!wasRemoved) {
            console.error(`Error: Profile "${name}" not found.`);
            console.error('Run "webpeel profile list" to see available profiles.');
            process.exit(1);
        }
        else {
            console.log(`Profile "${name}" deleted.`);
            process.exit(0);
        }
    });
|
|
2985
|
+
// ── Hotels command ─────────────────────────────────────────────────────────────
|
|
2986
|
+
/**
 * `webpeel hotels <destination>` — query the hotel-search module and print
 * the merged results.
 *
 * Flow:
 *   1. Resolve check-in / check-out via parseDate()/addDays(); on failure
 *      report INVALID_DATE (JSON on stdout when --json, otherwise stderr)
 *      and exit 1.
 *   2. Normalise --sort (unknown values fall back to 'price'), --limit
 *      (non-numeric or 0 falls back to 20, minimum 1) and --source.
 *   3. Run searchHotels(); show a spinner unless --silent or --json.
 *   4. Emit either pretty-printed JSON on stdout or a per-source status on
 *      stderr followed by a fixed-width table on stdout.
 *
 * Exit status: 0 on success, 1 on date-parse or search failure.
 */
program
    .command('hotels <destination>')
    .description('Search multiple travel sites for hotels (Kayak, Booking.com, Google Travel)')
    .option('--checkin <date>', 'Check-in date (ISO or relative, e.g. "tomorrow", "2026-02-20"). Default: tomorrow')
    .option('--checkout <date>', 'Check-out date (ISO or relative). Default: checkin + 1 day')
    .option('--sort <method>', 'Sort by: price, rating, value (default: price)', 'price')
    .option('--limit <n>', 'Max results (default: 20)', '20')
    .option('--source <name...>', 'Only use specific source(s): kayak, booking, google (repeatable)')
    .option('--json', 'Output as JSON')
    .option('--stealth', 'Use stealth mode for all sources')
    .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
    .option('-s, --silent', 'Suppress progress messages')
    .action(async (destination, options) => {
        const isJson = options.json;
        const isSilent = options.silent;
        // Load everything needed from the hotel-search module in one import
        // (previously the same module was imported twice).
        const { parseDate, addDays: hotelAddDays, searchHotels } = await import('./core/hotel-search.js');
        // Build checkin/checkout
        let checkinStr;
        let checkoutStr;
        try {
            checkinStr = parseDate(options.checkin ?? 'tomorrow');
            checkoutStr = options.checkout
                ? parseDate(options.checkout)
                : hotelAddDays(checkinStr, 1);
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            if (isJson) {
                await writeStdout(JSON.stringify({ error: msg, code: 'INVALID_DATE' }) + '\n');
            }
            else {
                console.error(`Error: ${msg}`);
            }
            process.exit(1);
        }
        // Unknown sort methods silently fall back to the default.
        const sortMethod = ['price', 'rating', 'value'].includes(options.sort)
            ? options.sort
            : 'price';
        // NaN and 0 both fall back to 20; negative values are clamped to 1.
        const limit = Math.max(1, Number.parseInt(options.limit, 10) || 20);
        const sources = options.source
            ? (Array.isArray(options.source) ? options.source : [options.source])
            : undefined;
        // Progress spinner (non-silent, non-JSON only). The former
        // `else if` fallback repeated this exact condition and could never
        // run, so it has been removed.
        let searchSpinner = null;
        if (!isSilent && !isJson) {
            searchSpinner = ora(`Searching hotels in ${destination}...`).start();
        }
        try {
            const result = await searchHotels({
                destination,
                checkin: checkinStr,
                checkout: checkoutStr,
                sort: sortMethod,
                limit,
                sources,
                stealth: options.stealth,
                silent: isSilent,
                proxy: options.proxy,
            });
            if (searchSpinner) {
                searchSpinner.stop();
            }
            // Show per-source status (stderr, so stdout stays clean)
            if (!isSilent && !isJson) {
                for (const src of result.sources) {
                    if (src.status === 'ok') {
                        console.error(`✅ ${src.name}: ${src.count} hotels found`);
                    }
                    else {
                        console.error(`❌ ${src.name}: ${src.status}${src.error ? ' — ' + src.error : ''}`);
                    }
                }
            }
            if (isJson) {
                await writeStdout(JSON.stringify(result, null, 2) + '\n');
                await cleanup();
                process.exit(0);
            }
            // Human-readable table output
            // Anchor at noon UTC and format in UTC so the printed calendar
            // day does not depend on the local time zone.
            const fmtDate = (iso) => new Date(`${iso}T12:00:00Z`).toLocaleDateString('en-US', { month: 'short', day: 'numeric', year: 'numeric', timeZone: 'UTC' });
            const ci = fmtDate(result.checkin);
            const co = fmtDate(result.checkout);
            console.log(`\n🏨 Hotels in ${result.destination}`);
            console.log(` ${ci} → ${co} | Sorted by ${sortMethod}\n`);
            if (result.results.length === 0) {
                console.log(' No hotels found.\n');
            }
            else {
                const colNum = 3;
                const colName = 42;
                const colPrice = 8;
                const colRating = 8;
                const colSource = 10;
                // Truncate with an ellipsis when too long, otherwise right-pad.
                const padEnd = (s, w) => s.length > w ? s.slice(0, w - 1) + '…' : s.padEnd(w);
                const padStart = (s, w) => s.padStart(w);
                console.log(` ${padStart('#', colNum)} ${padEnd('Hotel', colName)} ${padEnd('Price', colPrice)} ${padEnd('Rating', colRating)} ${padEnd('Source', colSource)}`);
                result.results.forEach((hotel, i) => {
                    const priceStr = hotel.priceDisplay || '—';
                    // `!= null` also covers a missing (undefined) rating, which
                    // would otherwise be printed as the text "undefined".
                    const ratingStr = hotel.rating != null ? String(hotel.rating) : '—';
                    console.log(` ${padStart(String(i + 1), colNum)} ${padEnd(hotel.name, colName)} ${padEnd(priceStr, colPrice)} ${padEnd(ratingStr, colRating)} ${padEnd(hotel.source, colSource)}`);
                });
                console.log('');
                const sourceSummary = result.sources
                    .map(s => `${s.name} (${s.count} ${s.status === 'ok' ? '✅' : s.status === 'blocked' ? '🚫' : '❌'})`)
                    .join(' | ');
                console.log(`Sources: ${sourceSummary}`);
            }
            console.log('');
            await cleanup();
            process.exit(0);
        }
        catch (error) {
            if (searchSpinner) {
                searchSpinner.fail('Hotel search failed');
            }
            const msg = error instanceof Error ? error.message : 'Unknown error';
            if (isJson) {
                await writeStdout(JSON.stringify({ error: msg, code: 'FETCH_FAILED' }) + '\n');
            }
            else {
                console.error(`\nError: ${msg}`);
            }
            await cleanup();
            process.exit(1);
        }
    });
|
|
2638
3121
|
// Hand control to the CLI parser so the command handlers registered above can run.
// NOTE(review): `program` looks like a commander-style Command and several
// handlers are async — confirm whether parseAsync() is the intended entry point.
program.parse();
|
|
2639
3122
|
// ============================================================
|
|
2640
3123
|
// Time formatting helper
|