webpeel 0.21.78 → 0.21.79

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -631,6 +631,122 @@ export function registerSearchCommands(program) {
631
631
  process.exit(1);
632
632
  }
633
633
  });
// ── rental command ────────────────────────────────────────────────────────
// Builds a Kayak car-rental search URL from a free-form "<location> <dates>"
// query (e.g. "Punta Gorda FL Apr 1-3" or "Miami Dec 30 to Jan 2"), fetches it
// with `peel`, and prints the page content (or a JSON envelope with --json).
program
    .command('rental <query>')
    .alias('car-rental')
    .description('Search for car rentals via Kayak — e.g. "Punta Gorda FL Apr 1-3"')
    .option('--json', 'Output as JSON')
    .option('-s, --silent', 'Silent mode')
    .action(async (query, options) => {
        const MONTH_NUMBERS = {
            jan: '01', feb: '02', mar: '03', apr: '04', may: '05', jun: '06',
            jul: '07', aug: '08', sep: '09', oct: '10', nov: '11', dec: '12',
        };
        // One capturing group: the three-letter month prefix; the rest of the word is ignored.
        const MONTH_TOKEN = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\\w*';
        const pad2 = (value) => String(value).padStart(2, '0');
        // True only when the parts name a real calendar date (rejects e.g. Feb 30).
        const isRealDate = (yyyy, mm, dd) => {
            const probe = new Date(yyyy, Number(mm) - 1, Number(dd));
            return probe.getFullYear() === yyyy
                && probe.getMonth() === Number(mm) - 1
                && probe.getDate() === Number(dd);
        };
        // Declared outside the try so the catch handler can mark it failed if it was started.
        let spinner = null;
        try {
            // Location = everything before the first "<month> <digits>" occurrence.
            const location = query.replace(new RegExp(`\\b${MONTH_TOKEN}\\s+\\d+.*`, 'i'), '').trim();
            if (!location) {
                throw new Error('No pickup location found in query — expected e.g. "Punta Gorda FL Apr 1-3"');
            }
            const encodedLocation = encodeURIComponent(location.replace(/\s+/g, '-'));
            // Date range: "Apr 1-3", "Apr 1 – 3" or "Apr 1 to May 3".
            // The separator is an alternation (a dash or the word "to"), not a character class,
            // so stray runs of the letters "t"/"o" are no longer accepted.
            const rangeMatch = query.match(new RegExp(`\\b${MONTH_TOKEN}\\s+(\\d{1,2})\\s*(?:[-–—]|to)\\s*(?:${MONTH_TOKEN}\\s+)?(\\d{1,2})\\b`, 'i'));
            // Fallback window (kept from the original behaviour) when no range is recognised.
            let startMonth = '04';
            let startDay = '01';
            let endMonth = '04';
            let endDay = '03';
            if (rangeMatch) {
                startMonth = MONTH_NUMBERS[rangeMatch[1].toLowerCase().slice(0, 3)];
                startDay = rangeMatch[2].padStart(2, '0');
                // A missing second month means the range stays within the first month.
                endMonth = rangeMatch[3] ? MONTH_NUMBERS[rangeMatch[3].toLowerCase().slice(0, 3)] : startMonth;
                endDay = rangeMatch[4].padStart(2, '0');
            }
            else if (!options.silent) {
                console.error('Warning: no date range found in query — defaulting to Apr 1-3');
            }
            // The query carries no year: use the current one, rolling forward when the
            // pickup day has already passed, and again when the range wraps (Dec → Jan).
            const now = new Date();
            const todayIso = `${now.getFullYear()}-${pad2(now.getMonth() + 1)}-${pad2(now.getDate())}`;
            let pickupYear = now.getFullYear();
            if (`${pickupYear}-${startMonth}-${startDay}` < todayIso) {
                pickupYear += 1;
            }
            const returnYear = endMonth < startMonth ? pickupYear + 1 : pickupYear;
            if (!isRealDate(pickupYear, startMonth, startDay) || !isRealDate(returnYear, endMonth, endDay)) {
                throw new Error(`Invalid rental dates in query: "${query}"`);
            }
            const pickupDate = `${pickupYear}-${startMonth}-${startDay}`;
            const returnDate = `${returnYear}-${endMonth}-${endDay}`;
            // Zero-padded ISO dates compare correctly as plain strings.
            if (returnDate < pickupDate) {
                throw new Error(`Return date ${returnDate} is before pickup date ${pickupDate}`);
            }
            const searchUrl = `https://www.kayak.com/cars/${encodedLocation}/${pickupDate}/${returnDate}?sort=price_a`;
            if (!options.silent) {
                spinner = (await import('ora')).default(`Searching car rentals: ${query}...`).start();
            }
            const result = await peel(searchUrl, { render: true, timeout: 40000 });
            if (spinner)
                spinner.succeed('Car rentals loaded');
            if (options.json) {
                console.log(JSON.stringify({
                    query,
                    location,
                    pickupDate,
                    returnDate,
                    url: searchUrl,
                    content: result.content,
                    tokens: result.tokens,
                }, null, 2));
            }
            else {
                console.log(result.content);
            }
            await cleanup();
            process.exit(0);
        }
        catch (error) {
            if (spinner)
                spinner.fail('Car rental search failed');
            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
            await cleanup();
            process.exit(1);
        }
    });
// ── cars command ──────────────────────────────────────────────────────────
// Builds a Cars.com results URL from the keyword query plus the ZIP, distance
// and price flags, fetches it with `peel`, and prints the page content (or a
// JSON envelope with --json).
program
    .command('cars <query>')
    .description('Search for cars to buy via Cars.com — e.g. "Honda Civic"')
    .option('--zip <zip>', 'ZIP code for local search', '10001')
    .option('--distance <miles>', 'Max distance in miles', '30')
    .option('--max-price <price>', 'Maximum listing price')
    .option('--min-price <price>', 'Minimum listing price')
    .option('--json', 'Output as JSON')
    .option('-s, --silent', 'Silent mode')
    .action(async (query, options) => {
        // Accepts "20000", "$20,000" or "20 000"; returns '' when the flag was not given
        // and throws on anything that is not a plain number after cleanup.
        const normalizePrice = (raw, flag) => {
            if (raw === undefined || raw === null || raw === '') {
                return '';
            }
            const cleaned = String(raw).replace(/[$,\s]/g, '');
            if (!/^\d+(\.\d+)?$/.test(cleaned)) {
                throw new Error(`Invalid value for ${flag}: "${raw}" (expected a number)`);
            }
            return cleaned;
        };
        // Declared outside the try so the catch handler can mark it failed if it was started.
        let spinner = null;
        try {
            // `||` (not `??`) on purpose: an empty-string flag value falls back to the default too.
            const zip = options.zip || '10001';
            const distance = options.distance || '30';
            const maxPrice = normalizePrice(options.maxPrice, '--max-price');
            const minPrice = normalizePrice(options.minPrice, '--min-price');
            if (maxPrice && minPrice && Number(minPrice) > Number(maxPrice)) {
                throw new Error(`--min-price (${minPrice}) is greater than --max-price (${maxPrice})`);
            }
            const params = new URLSearchParams({
                keyword: query,
                sort: 'list_price',
                stock_type: 'all',
                zip,
                maximum_distance: distance,
            });
            // Price bounds are only sent when the user supplied them.
            if (maxPrice)
                params.set('list_price_max', maxPrice);
            if (minPrice)
                params.set('list_price_min', minPrice);
            const url = `https://www.cars.com/shopping/results/?${params.toString()}`;
            if (!options.silent) {
                spinner = (await import('ora')).default(`Searching cars: ${query}...`).start();
            }
            const result = await peel(url, { timeout: 25000 });
            if (spinner)
                spinner.succeed('Cars loaded');
            if (options.json) {
                console.log(JSON.stringify({
                    query,
                    zip,
                    distance,
                    maxPrice,
                    // Reported alongside maxPrice so the envelope reflects every filter applied.
                    minPrice,
                    url,
                    content: result.content,
                    tokens: result.tokens,
                }, null, 2));
            }
            else {
                console.log(result.content);
            }
            await cleanup();
            process.exit(0);
        }
        catch (error) {
            if (spinner)
                spinner.fail('Car search failed');
            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
            await cleanup();
            process.exit(1);
        }
    });
634
750
  // ── extractors command ────────────────────────────────────────────────────
635
751
  program
636
752
  .command('extractors')
@@ -56,6 +56,12 @@ export declare function browserFetch(url: string, options?: {
56
56
  waitSelector?: string;
57
57
  /** Block resource types for faster loading: 'image', 'stylesheet', 'font', 'media', 'script' */
58
58
  blockResources?: string[];
59
+ /**
60
+ * Whether the target is a Single-Page Application (Kayak, Google Flights, Expedia, etc).
61
+ * When true, the DOM stability check waits up to 12s and requires the content to stay unchanged for 2s, to allow async data loads.
62
+ * When false (default), it waits up to 3s and requires 1s of unchanged content.
63
+ */
64
+ isSPA?: boolean;
59
65
  }): Promise<FetchResult>;
60
66
  /**
61
67
  * Capture a screenshot of a URL using headless Chromium via Playwright.
@@ -30,7 +30,7 @@ let activePagesCount = 0;
30
30
  export async function browserFetch(url, options = {}) {
31
31
  // SECURITY: Validate URL to prevent SSRF
32
32
  validateUrl(url);
33
- const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, profileDir, headed = false, storageState, proxy, device = 'desktop', viewportWidth: optViewportWidth, viewportHeight: optViewportHeight, waitUntil: optWaitUntil, waitSelector, blockResources, } = options;
33
+ const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, profileDir, headed = false, storageState, proxy, device = 'desktop', viewportWidth: optViewportWidth, viewportHeight: optViewportHeight, waitUntil: optWaitUntil, waitSelector, blockResources, isSPA = false, } = options;
34
34
  // Device emulation profiles
35
35
  const deviceProfiles = {
36
36
  desktop: { width: 1920, height: 1080, userAgent: undefined },
@@ -317,27 +317,37 @@ export async function browserFetch(url, options = {}) {
317
317
  throwIfAborted();
318
318
  }
319
319
  // DOM stability check: wait for SPA hydration to settle.
320
- // Polls innerText length every 500ms — if still growing, keep waiting (max 3s extra).
320
+ // Polls innerText length every 500ms — if still growing, keep waiting.
321
+ // SPAs (Kayak, Google Flights, Expedia) get a longer timeout to allow async data loads.
321
322
  {
322
323
  const stabilityStart = Date.now();
323
- const MAX_STABILITY_WAIT_MS = 3000;
324
+ // SPA sites (Kayak, Google Flights, Expedia) need up to 12s for results to load.
325
+ // Normal rendered pages need just 3s extra.
326
+ const MAX_STABILITY_WAIT_MS = isSPA ? 12000 : 3000;
327
+ // SPA: must be stable for 2s (4 consecutive 500ms checks). Normal: 1s (2 checks).
328
+ const STABLE_CHECKS_REQUIRED = isSPA ? 4 : 2;
324
329
  const POLL_INTERVAL_MS = 500;
330
+ const MIN_CONTENT_LENGTH = 200; // Don't consider near-empty pages stable
325
331
  let prevLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
326
332
  let stableCount = 0;
327
333
  while (Date.now() - stabilityStart < MAX_STABILITY_WAIT_MS) {
328
334
  throwIfAborted();
329
335
  await page.waitForTimeout(POLL_INTERVAL_MS);
330
336
  const curLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
331
- if (curLength === prevLength) {
332
- stableCount++;
333
- if (stableCount >= 2)
334
- break; // stable for 2 consecutive checks (~1s)
335
- }
336
- else {
337
+ if (curLength !== prevLength) {
338
+ // Content changed — reset stability counter
337
339
  stableCount = 0;
338
340
  }
341
+ else if (curLength >= MIN_CONTENT_LENGTH) {
342
+ stableCount++;
343
+ if (stableCount >= STABLE_CHECKS_REQUIRED)
344
+ break; // stable long enough
345
+ }
339
346
  prevLength = curLength;
340
347
  }
348
+ if (isSPA) {
349
+ log.debug(`SPA stability check: ${Date.now() - stabilityStart}ms, length=${prevLength}`);
350
+ }
341
351
  }
342
352
  const finalUrl = page.url();
343
353
  const contentType = response?.headers()?.['content-type'] || '';
@@ -232,7 +232,7 @@ function prefetchDns(url) {
232
232
  }
233
233
  }
234
234
  async function fetchWithBrowserStrategy(url, options) {
235
- const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, storageState, proxy, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, } = options;
235
+ const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, storageState, proxy, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, isSPA, } = options;
236
236
  try {
237
237
  const result = await browserFetch(url, {
238
238
  userAgent,
@@ -256,6 +256,7 @@ async function fetchWithBrowserStrategy(url, options) {
256
256
  waitUntil,
257
257
  waitSelector,
258
258
  blockResources,
259
+ isSPA,
259
260
  });
260
261
  return {
261
262
  ...result,
@@ -500,6 +501,21 @@ export async function smartFetch(url, options = {}) {
500
501
  if (storageState) {
501
502
  effectiveForceBrowser = true;
502
503
  }
504
+ // Detect SPA for smarter DOM stability wait
505
+ const SPA_FETCH_DOMAINS = new Set([
506
+ 'www.google.com', 'flights.google.com', 'www.airbnb.com', 'www.booking.com',
507
+ 'www.expedia.com', 'www.kayak.com', 'www.skyscanner.com', 'www.tripadvisor.com',
508
+ 'www.indeed.com', 'www.glassdoor.com', 'www.zillow.com', 'app.webpeel.dev',
509
+ ]);
510
+ const SPA_FETCH_URL_PATTERNS = [
511
+ /google\.com\/travel/, /google\.com\/maps/, /google\.com\/shopping/,
512
+ ];
513
+ let isSPAUrl = false;
514
+ try {
515
+ const parsedHostname = new URL(url).hostname;
516
+ isSPAUrl = SPA_FETCH_DOMAINS.has(parsedHostname) || SPA_FETCH_URL_PATTERNS.some(p => p.test(url));
517
+ }
518
+ catch { /* invalid URL — ignore */ }
503
519
  const browserOptions = {
504
520
  userAgent,
505
521
  waitMs,
@@ -521,6 +537,7 @@ export async function smartFetch(url, options = {}) {
521
537
  waitUntil,
522
538
  waitSelector,
523
539
  blockResources,
540
+ isSPA: isSPAUrl,
524
541
  };
525
542
  /* ---- Strategy: simple fetch (with optional race) --------------------- */
526
543
  if (!shouldUseBrowser) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.78",
3
+ "version": "0.21.79",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",