webpeel 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +45 -3
- package/dist/cli.js.map +1 -1
- package/dist/core/fetcher.d.ts +8 -1
- package/dist/core/fetcher.d.ts.map +1 -1
- package/dist/core/fetcher.js +33 -7
- package/dist/core/fetcher.js.map +1 -1
- package/dist/core/hotel-search.d.ts +2 -0
- package/dist/core/hotel-search.d.ts.map +1 -1
- package/dist/core/hotel-search.js +2 -0
- package/dist/core/hotel-search.js.map +1 -1
- package/dist/core/llm-extract.d.ts +15 -1
- package/dist/core/llm-extract.d.ts.map +1 -1
- package/dist/core/llm-extract.js +127 -7
- package/dist/core/llm-extract.js.map +1 -1
- package/dist/core/strategies.d.ts +6 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +10 -5
- package/dist/core/strategies.js.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +47 -3
- package/dist/mcp/server.js.map +1 -1
- package/dist/types.d.ts +10 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -131,6 +131,7 @@ program
|
|
|
131
131
|
.argument('[url]', 'URL to fetch')
|
|
132
132
|
.option('-r, --render', 'Use headless browser (for JS-heavy sites)')
|
|
133
133
|
.option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
|
|
134
|
+
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
134
135
|
.option('-w, --wait <ms>', 'Wait time after page load (ms)', parseInt)
|
|
135
136
|
.option('--html', 'Output raw HTML instead of markdown')
|
|
136
137
|
.option('--text', 'Output plain text instead of markdown')
|
|
@@ -156,6 +157,7 @@ program
|
|
|
156
157
|
.option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
|
|
157
158
|
.option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
|
|
158
159
|
.option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
|
|
160
|
+
.option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
|
|
159
161
|
.option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
|
|
160
162
|
.option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
|
|
161
163
|
.option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
|
|
@@ -296,7 +298,7 @@ program
|
|
|
296
298
|
cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
|
|
297
299
|
}
|
|
298
300
|
// LLM extraction from cached content
|
|
299
|
-
if (options.llmExtract) {
|
|
301
|
+
if (options.llmExtract || options.extractSchema) {
|
|
300
302
|
const { extractWithLLM } = await import('./core/llm-extract.js');
|
|
301
303
|
const llmCfgCached = loadConfig();
|
|
302
304
|
const llmApiKeyCached = options.llmKey || llmCfgCached.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
@@ -307,9 +309,25 @@ program
|
|
|
307
309
|
const llmModelCached = options.llmModel || llmCfgCached.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
|
|
308
310
|
const llmBaseUrlCached = options.llmBaseUrl || llmCfgCached.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
|
|
309
311
|
const llmInstructionCached = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
|
|
312
|
+
// Parse schema if provided
|
|
313
|
+
let llmSchemaCached;
|
|
314
|
+
if (options.extractSchema) {
|
|
315
|
+
let schemaStr = options.extractSchema;
|
|
316
|
+
if (schemaStr.startsWith('@')) {
|
|
317
|
+
schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
|
|
318
|
+
}
|
|
319
|
+
try {
|
|
320
|
+
llmSchemaCached = JSON.parse(schemaStr);
|
|
321
|
+
}
|
|
322
|
+
catch {
|
|
323
|
+
console.error('Error: --extract-schema must be valid JSON or a valid @file.json path');
|
|
324
|
+
process.exit(1);
|
|
325
|
+
}
|
|
326
|
+
}
|
|
310
327
|
const llmResultCached = await extractWithLLM({
|
|
311
328
|
content: cachedResult.content,
|
|
312
329
|
instruction: llmInstructionCached,
|
|
330
|
+
schema: llmSchemaCached,
|
|
313
331
|
apiKey: llmApiKeyCached,
|
|
314
332
|
model: llmModelCached,
|
|
315
333
|
baseUrl: llmBaseUrlCached,
|
|
@@ -356,9 +374,13 @@ program
|
|
|
356
374
|
throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
|
|
357
375
|
}
|
|
358
376
|
}
|
|
377
|
+
// --extract-schema auto-enables JSON output
|
|
378
|
+
if (options.extractSchema) {
|
|
379
|
+
options.json = true;
|
|
380
|
+
}
|
|
359
381
|
// Parse extract
|
|
360
382
|
let extract;
|
|
361
|
-
if (options.llmExtract) {
|
|
383
|
+
if (options.llmExtract || options.extractSchema) {
|
|
362
384
|
// LLM-based extraction is handled post-fetch (after peel returns markdown).
|
|
363
385
|
// Early-validate that an API key is available so we fail fast.
|
|
364
386
|
const llmCfg = loadConfig();
|
|
@@ -470,6 +492,7 @@ program
|
|
|
470
492
|
profileDir: resolvedProfileDir,
|
|
471
493
|
headed: options.headed || false,
|
|
472
494
|
storageState: resolvedStorageState,
|
|
495
|
+
proxy: options.proxy,
|
|
473
496
|
};
|
|
474
497
|
// Add summary option if requested
|
|
475
498
|
if (options.summary) {
|
|
@@ -559,16 +582,31 @@ program
|
|
|
559
582
|
}
|
|
560
583
|
}
|
|
561
584
|
// --- LLM-based extraction (post-peel) ---
|
|
562
|
-
if (options.llmExtract) {
|
|
585
|
+
if (options.llmExtract || options.extractSchema) {
|
|
563
586
|
const { extractWithLLM } = await import('./core/llm-extract.js');
|
|
564
587
|
const llmCfg = loadConfig();
|
|
565
588
|
const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
566
589
|
const llmModel = options.llmModel || llmCfg.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
|
|
567
590
|
const llmBaseUrl = options.llmBaseUrl || llmCfg.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
|
|
568
591
|
const llmInstruction = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
|
|
592
|
+
// Parse --extract-schema if provided
|
|
593
|
+
let llmSchema;
|
|
594
|
+
if (options.extractSchema) {
|
|
595
|
+
let schemaStr = options.extractSchema;
|
|
596
|
+
if (schemaStr.startsWith('@')) {
|
|
597
|
+
schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
|
|
598
|
+
}
|
|
599
|
+
try {
|
|
600
|
+
llmSchema = JSON.parse(schemaStr);
|
|
601
|
+
}
|
|
602
|
+
catch {
|
|
603
|
+
exitWithJsonError('--extract-schema must be valid JSON or a valid @file.json path', 'FETCH_FAILED');
|
|
604
|
+
}
|
|
605
|
+
}
|
|
569
606
|
const llmResult = await extractWithLLM({
|
|
570
607
|
content: result.content,
|
|
571
608
|
instruction: llmInstruction,
|
|
609
|
+
schema: llmSchema,
|
|
572
610
|
apiKey: llmApiKey,
|
|
573
611
|
model: llmModel,
|
|
574
612
|
baseUrl: llmBaseUrl,
|
|
@@ -806,6 +844,7 @@ program
|
|
|
806
844
|
.option('--csv', 'Output site-search results as CSV (requires --site)')
|
|
807
845
|
.option('--budget <n>', 'Token budget for site-search result content', parseInt)
|
|
808
846
|
.option('-s, --silent', 'Silent mode')
|
|
847
|
+
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
809
848
|
.option('--agent', 'Agent mode: sets --json, --silent, and --budget 4000 (override with --budget N)')
|
|
810
849
|
.action(async (query, options) => {
|
|
811
850
|
// --agent sets sensible defaults for AI agents; explicit flags override
|
|
@@ -837,6 +876,7 @@ program
|
|
|
837
876
|
const htmlResult = await peel(siteResult.url, {
|
|
838
877
|
format: 'html',
|
|
839
878
|
timeout: 30000,
|
|
879
|
+
proxy: options.proxy,
|
|
840
880
|
});
|
|
841
881
|
if (spinner) {
|
|
842
882
|
spinner.succeed(`Fetched ${siteResult.site} in ${htmlResult.elapsed}ms`);
|
|
@@ -2953,6 +2993,7 @@ program
|
|
|
2953
2993
|
.option('--source <name...>', 'Only use specific source(s): kayak, booking, google (repeatable)')
|
|
2954
2994
|
.option('--json', 'Output as JSON')
|
|
2955
2995
|
.option('--stealth', 'Use stealth mode for all sources')
|
|
2996
|
+
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
2956
2997
|
.option('-s, --silent', 'Suppress progress messages')
|
|
2957
2998
|
.action(async (destination, options) => {
|
|
2958
2999
|
const isJson = options.json;
|
|
@@ -3005,6 +3046,7 @@ program
|
|
|
3005
3046
|
sources,
|
|
3006
3047
|
stealth: options.stealth,
|
|
3007
3048
|
silent: isSilent,
|
|
3049
|
+
proxy: options.proxy,
|
|
3008
3050
|
});
|
|
3009
3051
|
if (searchSpinner)
|
|
3010
3052
|
searchSpinner.stop();
|