webpeel 0.21.78 → 0.21.79
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -631,6 +631,122 @@ export function registerSearchCommands(program) {
|
|
|
631
631
|
process.exit(1);
|
|
632
632
|
}
|
|
633
633
|
});
|
|
634
|
+
// ── rental command ────────────────────────────────────────────────────────
// Registers `rental <query>` (alias `car-rental`). The free-text query is split
// into a location and an optional date range, turned into a Kayak car-search
// URL, fetched through peel() with browser rendering, and printed either as a
// JSON envelope (--json) or as the raw extracted content.
program
    .command('rental <query>')
    .alias('car-rental')
    .description('Search for car rentals via Kayak — e.g. "Punta Gorda FL Apr 1-3"')
    .option('--json', 'Output as JSON')
    .option('-s, --silent', 'Silent mode')
    .action(async (query, options) => {
        const monthAbbrevs = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
        // Month name (full or abbreviated) → 1-based month number.
        const monthNumber = (name) => monthAbbrevs.indexOf(name.toLowerCase().slice(0, 3)) + 1;
        const pad2 = (value) => String(value).padStart(2, '0');
        // Parse location: everything before the first "<month> <day>" token.
        const location = query.replace(/\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\s+\d+.*/i, '').trim();
        if (!location) {
            // Without a location the URL would contain an empty path segment ("/cars//...").
            console.error('Error: Could not find a pickup location in the query — e.g. "Punta Gorda FL Apr 1-3"');
            await cleanup(); // mirror the other exit paths of this command
            process.exit(1);
            return;
        }
        const encodedLocation = encodeURIComponent(location.replace(/\s+/g, '-'));
        // Parse dates: "Apr 1-3", "Apr 1 – 3", "Apr 1 to 3" or "Apr 1 to May 3".
        // The separator is an explicit alternation (dash run or the word "to"),
        // not a character class, so stray letters such as "ot"/"tt" no longer match.
        // When no range is present the search falls back to April 1–3.
        let startMonth = 4;
        let startDay = 1;
        let endMonth = 4;
        let endDay = 3;
        const rangeMatch = query.match(/\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\s+(\d{1,2})\s*(?:[-–—]+|to)\s*(?:(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\s+)?(\d{1,2})(?!\d)/i);
        if (rangeMatch) {
            startMonth = monthNumber(rangeMatch[1]);
            startDay = Number(rangeMatch[2]);
            // The end month is optional; "Apr 1-3" reuses the start month.
            endMonth = rangeMatch[3] ? monthNumber(rangeMatch[3]) : startMonth;
            endDay = Number(rangeMatch[4]);
        }
        // Pick the year: a pickup day that has already passed this year is taken
        // to mean next year, and a range whose end month precedes its start month
        // (e.g. "Dec 30 - Jan 2") wraps into the following year.
        const now = new Date();
        const today = new Date(now.getFullYear(), now.getMonth(), now.getDate());
        let pickupYear = today.getFullYear();
        if (new Date(pickupYear, startMonth - 1, startDay) < today) {
            pickupYear += 1;
        }
        const returnYear = endMonth < startMonth ? pickupYear + 1 : pickupYear;
        const pickupDate = `${pickupYear}-${pad2(startMonth)}-${pad2(startDay)}`;
        const returnDate = `${returnYear}-${pad2(endMonth)}-${pad2(endDay)}`;
        const searchUrl = `https://www.kayak.com/cars/${encodedLocation}/${pickupDate}/${returnDate}?sort=price_a`;
        // The spinner is created only when not in silent mode; ora is loaded lazily.
        const spinner = options.silent ? null : (await import('ora')).default(`Searching car rentals: ${query}...`).start();
        try {
            const result = await peel(searchUrl, { render: true, timeout: 40000 });
            if (spinner)
                spinner.succeed('Car rentals loaded');
            if (options.json) {
                console.log(JSON.stringify({
                    query,
                    location,
                    pickupDate,
                    returnDate,
                    url: searchUrl,
                    content: result.content,
                    tokens: result.tokens,
                }, null, 2));
            }
            else {
                console.log(result.content);
            }
            await cleanup();
            process.exit(0);
        }
        catch (error) {
            if (spinner)
                spinner.fail('Car rental search failed');
            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
            await cleanup();
            process.exit(1);
        }
    });
|
|
693
|
+
// ── cars command ──────────────────────────────────────────────────────────
// Registers `cars <query>`. Builds a Cars.com shopping-results URL from the
// keyword plus the ZIP / distance / price options, fetches it through peel(),
// and prints either a JSON envelope (--json) or the raw extracted content.
program
    .command('cars <query>')
    .description('Search for cars to buy via Cars.com — e.g. "Honda Civic"')
    .option('--zip <zip>', 'ZIP code for local search', '10001')
    .option('--distance <miles>', 'Max distance in miles', '30')
    .option('--max-price <price>', 'Maximum listing price')
    .option('--min-price <price>', 'Minimum listing price')
    .option('--json', 'Output as JSON')
    .option('-s, --silent', 'Silent mode')
    .action(async (query, options) => {
        // `||` also replaces an explicitly empty value (e.g. --zip "") with the default.
        const zip = options.zip || '10001';
        const distance = options.distance || '30';
        const maxPrice = options.maxPrice || '';
        const minPrice = options.minPrice || '';
        const params = new URLSearchParams({
            keyword: query,
            sort: 'list_price',
            stock_type: 'all',
            zip,
            maximum_distance: distance,
        });
        // Price bounds are only sent when the user supplied them.
        if (maxPrice)
            params.set('list_price_max', maxPrice);
        if (minPrice)
            params.set('list_price_min', minPrice);
        const url = `https://www.cars.com/shopping/results/?${params.toString()}`;
        // The spinner is created only when not in silent mode; ora is loaded lazily.
        const spinner = options.silent ? null : (await import('ora')).default(`Searching cars: ${query}...`).start();
        try {
            const result = await peel(url, { timeout: 25000 });
            if (spinner)
                spinner.succeed('Cars loaded');
            if (options.json) {
                console.log(JSON.stringify({
                    query,
                    zip,
                    distance,
                    maxPrice,
                    // Echo the lower bound too, so the envelope reflects every filter applied to `url`.
                    minPrice,
                    url,
                    content: result.content,
                    tokens: result.tokens,
                }, null, 2));
            }
            else {
                console.log(result.content);
            }
            await cleanup();
            process.exit(0);
        }
        catch (error) {
            if (spinner)
                spinner.fail('Car search failed');
            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
            await cleanup();
            process.exit(1);
        }
    });
|
|
634
750
|
// ── extractors command ────────────────────────────────────────────────────
|
|
635
751
|
program
|
|
636
752
|
.command('extractors')
|
|
@@ -56,6 +56,12 @@ export declare function browserFetch(url: string, options?: {
|
|
|
56
56
|
waitSelector?: string;
|
|
57
57
|
/** Block resource types for faster loading: 'image', 'stylesheet', 'font', 'media', 'script' */
|
|
58
58
|
blockResources?: string[];
|
|
59
|
+
/**
|
|
60
|
+
* Whether the target is a Single-Page Application (Kayak, Google Flights, Expedia, etc).
|
|
61
|
+
* When true, the DOM stability check uses a longer timeout (12s) to wait for async data loads.
|
|
62
|
+
* When false (default), a shorter 3s stability window is used.
|
|
63
|
+
*/
|
|
64
|
+
isSPA?: boolean;
|
|
59
65
|
}): Promise<FetchResult>;
|
|
60
66
|
/**
|
|
61
67
|
* Capture a screenshot of a URL using headless Chromium via Playwright.
|
|
@@ -30,7 +30,7 @@ let activePagesCount = 0;
|
|
|
30
30
|
export async function browserFetch(url, options = {}) {
|
|
31
31
|
// SECURITY: Validate URL to prevent SSRF
|
|
32
32
|
validateUrl(url);
|
|
33
|
-
const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, profileDir, headed = false, storageState, proxy, device = 'desktop', viewportWidth: optViewportWidth, viewportHeight: optViewportHeight, waitUntil: optWaitUntil, waitSelector, blockResources, } = options;
|
|
33
|
+
const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, profileDir, headed = false, storageState, proxy, device = 'desktop', viewportWidth: optViewportWidth, viewportHeight: optViewportHeight, waitUntil: optWaitUntil, waitSelector, blockResources, isSPA = false, } = options;
|
|
34
34
|
// Device emulation profiles
|
|
35
35
|
const deviceProfiles = {
|
|
36
36
|
desktop: { width: 1920, height: 1080, userAgent: undefined },
|
|
@@ -317,27 +317,37 @@ export async function browserFetch(url, options = {}) {
|
|
|
317
317
|
throwIfAborted();
|
|
318
318
|
}
|
|
319
319
|
// DOM stability check: wait for SPA hydration to settle.
|
|
320
|
-
// Polls innerText length every 500ms — if still growing, keep waiting
|
|
320
|
+
// Polls innerText length every 500ms — if still growing, keep waiting.
|
|
321
|
+
// SPAs (Kayak, Google Flights, Expedia) get a longer timeout to allow async data loads.
|
|
321
322
|
{
|
|
322
323
|
const stabilityStart = Date.now();
|
|
323
|
-
|
|
324
|
+
// SPA sites (Kayak, Google Flights, Expedia) need up to 12s for results to load.
|
|
325
|
+
// Normal rendered pages need just 3s extra.
|
|
326
|
+
const MAX_STABILITY_WAIT_MS = isSPA ? 12000 : 3000;
|
|
327
|
+
// SPA: must be stable for 2s (4 consecutive 500ms checks). Normal: 1s (2 checks).
|
|
328
|
+
const STABLE_CHECKS_REQUIRED = isSPA ? 4 : 2;
|
|
324
329
|
const POLL_INTERVAL_MS = 500;
|
|
330
|
+
const MIN_CONTENT_LENGTH = 200; // Don't consider near-empty pages stable
|
|
325
331
|
let prevLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
|
|
326
332
|
let stableCount = 0;
|
|
327
333
|
while (Date.now() - stabilityStart < MAX_STABILITY_WAIT_MS) {
|
|
328
334
|
throwIfAborted();
|
|
329
335
|
await page.waitForTimeout(POLL_INTERVAL_MS);
|
|
330
336
|
const curLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
|
|
331
|
-
if (curLength
|
|
332
|
-
|
|
333
|
-
if (stableCount >= 2)
|
|
334
|
-
break; // stable for 2 consecutive checks (~1s)
|
|
335
|
-
}
|
|
336
|
-
else {
|
|
337
|
+
if (curLength !== prevLength) {
|
|
338
|
+
// Content changed — reset stability counter
|
|
337
339
|
stableCount = 0;
|
|
338
340
|
}
|
|
341
|
+
else if (curLength >= MIN_CONTENT_LENGTH) {
|
|
342
|
+
stableCount++;
|
|
343
|
+
if (stableCount >= STABLE_CHECKS_REQUIRED)
|
|
344
|
+
break; // stable long enough
|
|
345
|
+
}
|
|
339
346
|
prevLength = curLength;
|
|
340
347
|
}
|
|
348
|
+
if (isSPA) {
|
|
349
|
+
log.debug(`SPA stability check: ${Date.now() - stabilityStart}ms, length=${prevLength}`);
|
|
350
|
+
}
|
|
341
351
|
}
|
|
342
352
|
const finalUrl = page.url();
|
|
343
353
|
const contentType = response?.headers()?.['content-type'] || '';
|
package/dist/core/strategies.js
CHANGED
|
@@ -232,7 +232,7 @@ function prefetchDns(url) {
|
|
|
232
232
|
}
|
|
233
233
|
}
|
|
234
234
|
async function fetchWithBrowserStrategy(url, options) {
|
|
235
|
-
const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, storageState, proxy, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, } = options;
|
|
235
|
+
const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, storageState, proxy, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, isSPA, } = options;
|
|
236
236
|
try {
|
|
237
237
|
const result = await browserFetch(url, {
|
|
238
238
|
userAgent,
|
|
@@ -256,6 +256,7 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
256
256
|
waitUntil,
|
|
257
257
|
waitSelector,
|
|
258
258
|
blockResources,
|
|
259
|
+
isSPA,
|
|
259
260
|
});
|
|
260
261
|
return {
|
|
261
262
|
...result,
|
|
@@ -500,6 +501,21 @@ export async function smartFetch(url, options = {}) {
|
|
|
500
501
|
if (storageState) {
|
|
501
502
|
effectiveForceBrowser = true;
|
|
502
503
|
}
|
|
504
|
+
// Detect SPA for smarter DOM stability wait.
// A URL counts as an SPA when its hostname is in the allow-list below, or when
// its host + path matches one of the Google product patterns. The result is
// forwarded to the browser fetch as `isSPA`.
const SPA_FETCH_DOMAINS = new Set([
    'www.google.com', 'flights.google.com', 'www.airbnb.com', 'www.booking.com',
    'www.expedia.com', 'www.kayak.com', 'www.skyscanner.com', 'www.tripadvisor.com',
    'www.indeed.com', 'www.glassdoor.com', 'www.zillow.com', 'app.webpeel.dev',
]);
// Anchored at the start of "<hostname><pathname>" so only google.com (or one of
// its subdomains) followed by the product path matches.
const SPA_FETCH_URL_PATTERNS = [
    /^(?:[a-z0-9-]+\.)*google\.com\/travel/i,
    /^(?:[a-z0-9-]+\.)*google\.com\/maps/i,
    /^(?:[a-z0-9-]+\.)*google\.com\/shopping/i,
];
let isSPAUrl = false;
try {
    const { hostname, pathname } = new URL(url);
    // Match against host + path only: testing the raw URL string would also hit
    // query strings or fragments on unrelated sites (e.g. "?next=google.com/maps").
    const hostAndPath = `${hostname}${pathname}`;
    isSPAUrl = SPA_FETCH_DOMAINS.has(hostname) || SPA_FETCH_URL_PATTERNS.some((pattern) => pattern.test(hostAndPath));
}
catch { /* invalid URL — leave isSPAUrl false */ }
|
|
503
519
|
const browserOptions = {
|
|
504
520
|
userAgent,
|
|
505
521
|
waitMs,
|
|
@@ -521,6 +537,7 @@ export async function smartFetch(url, options = {}) {
|
|
|
521
537
|
waitUntil,
|
|
522
538
|
waitSelector,
|
|
523
539
|
blockResources,
|
|
540
|
+
isSPA: isSPAUrl,
|
|
524
541
|
};
|
|
525
542
|
/* ---- Strategy: simple fetch (with optional race) --------------------- */
|
|
526
543
|
if (!shouldUseBrowser) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.78",
|
|
3
|
+
"version": "0.21.79",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|