webpeel 0.10.0 → 0.11.0

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. The information is provided for informational purposes only.
package/dist/cli.js CHANGED
@@ -131,6 +131,7 @@ program
131
131
  .argument('[url]', 'URL to fetch')
132
132
  .option('-r, --render', 'Use headless browser (for JS-heavy sites)')
133
133
  .option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
134
+ .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
134
135
  .option('-w, --wait <ms>', 'Wait time after page load (ms)', parseInt)
135
136
  .option('--html', 'Output raw HTML instead of markdown')
136
137
  .option('--text', 'Output plain text instead of markdown')
@@ -156,6 +157,7 @@ program
156
157
  .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
157
158
  .option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
158
159
  .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
160
+ .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
159
161
  .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
160
162
  .option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
161
163
  .option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
@@ -296,7 +298,7 @@ program
296
298
  cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
297
299
  }
298
300
  // LLM extraction from cached content
299
- if (options.llmExtract) {
301
+ if (options.llmExtract || options.extractSchema) {
300
302
  const { extractWithLLM } = await import('./core/llm-extract.js');
301
303
  const llmCfgCached = loadConfig();
302
304
  const llmApiKeyCached = options.llmKey || llmCfgCached.llm?.apiKey || process.env.OPENAI_API_KEY;
@@ -307,9 +309,25 @@ program
307
309
  const llmModelCached = options.llmModel || llmCfgCached.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
308
310
  const llmBaseUrlCached = options.llmBaseUrl || llmCfgCached.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
309
311
  const llmInstructionCached = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
312
+ // Parse schema if provided
313
+ let llmSchemaCached;
314
+ if (options.extractSchema) {
315
+ let schemaStr = options.extractSchema;
316
+ if (schemaStr.startsWith('@')) {
317
+ schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
318
+ }
319
+ try {
320
+ llmSchemaCached = JSON.parse(schemaStr);
321
+ }
322
+ catch {
323
+ console.error('Error: --extract-schema must be valid JSON or a valid @file.json path');
324
+ process.exit(1);
325
+ }
326
+ }
310
327
  const llmResultCached = await extractWithLLM({
311
328
  content: cachedResult.content,
312
329
  instruction: llmInstructionCached,
330
+ schema: llmSchemaCached,
313
331
  apiKey: llmApiKeyCached,
314
332
  model: llmModelCached,
315
333
  baseUrl: llmBaseUrlCached,
@@ -356,9 +374,13 @@ program
356
374
  throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
357
375
  }
358
376
  }
377
+ // --extract-schema auto-enables JSON output
378
+ if (options.extractSchema) {
379
+ options.json = true;
380
+ }
359
381
  // Parse extract
360
382
  let extract;
361
- if (options.llmExtract) {
383
+ if (options.llmExtract || options.extractSchema) {
362
384
  // LLM-based extraction is handled post-fetch (after peel returns markdown).
363
385
  // Early-validate that an API key is available so we fail fast.
364
386
  const llmCfg = loadConfig();
@@ -470,6 +492,7 @@ program
470
492
  profileDir: resolvedProfileDir,
471
493
  headed: options.headed || false,
472
494
  storageState: resolvedStorageState,
495
+ proxy: options.proxy,
473
496
  };
474
497
  // Add summary option if requested
475
498
  if (options.summary) {
@@ -559,16 +582,31 @@ program
559
582
  }
560
583
  }
561
584
  // --- LLM-based extraction (post-peel) ---
562
- if (options.llmExtract) {
585
+ if (options.llmExtract || options.extractSchema) {
563
586
  const { extractWithLLM } = await import('./core/llm-extract.js');
564
587
  const llmCfg = loadConfig();
565
588
  const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
566
589
  const llmModel = options.llmModel || llmCfg.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
567
590
  const llmBaseUrl = options.llmBaseUrl || llmCfg.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
568
591
  const llmInstruction = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
592
+ // Parse --extract-schema if provided
593
+ let llmSchema;
594
+ if (options.extractSchema) {
595
+ let schemaStr = options.extractSchema;
596
+ if (schemaStr.startsWith('@')) {
597
+ schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
598
+ }
599
+ try {
600
+ llmSchema = JSON.parse(schemaStr);
601
+ }
602
+ catch {
603
+ exitWithJsonError('--extract-schema must be valid JSON or a valid @file.json path', 'FETCH_FAILED');
604
+ }
605
+ }
569
606
  const llmResult = await extractWithLLM({
570
607
  content: result.content,
571
608
  instruction: llmInstruction,
609
+ schema: llmSchema,
572
610
  apiKey: llmApiKey,
573
611
  model: llmModel,
574
612
  baseUrl: llmBaseUrl,
@@ -806,6 +844,7 @@ program
806
844
  .option('--csv', 'Output site-search results as CSV (requires --site)')
807
845
  .option('--budget <n>', 'Token budget for site-search result content', parseInt)
808
846
  .option('-s, --silent', 'Silent mode')
847
+ .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
809
848
  .option('--agent', 'Agent mode: sets --json, --silent, and --budget 4000 (override with --budget N)')
810
849
  .action(async (query, options) => {
811
850
  // --agent sets sensible defaults for AI agents; explicit flags override
@@ -837,6 +876,7 @@ program
837
876
  const htmlResult = await peel(siteResult.url, {
838
877
  format: 'html',
839
878
  timeout: 30000,
879
+ proxy: options.proxy,
840
880
  });
841
881
  if (spinner) {
842
882
  spinner.succeed(`Fetched ${siteResult.site} in ${htmlResult.elapsed}ms`);
@@ -2953,6 +2993,7 @@ program
2953
2993
  .option('--source <name...>', 'Only use specific source(s): kayak, booking, google (repeatable)')
2954
2994
  .option('--json', 'Output as JSON')
2955
2995
  .option('--stealth', 'Use stealth mode for all sources')
2996
+ .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
2956
2997
  .option('-s, --silent', 'Suppress progress messages')
2957
2998
  .action(async (destination, options) => {
2958
2999
  const isJson = options.json;
@@ -3005,6 +3046,7 @@ program
3005
3046
  sources,
3006
3047
  stealth: options.stealth,
3007
3048
  silent: isSilent,
3049
+ proxy: options.proxy,
3008
3050
  });
3009
3051
  if (searchSpinner)
3010
3052
  searchSpinner.stop();