webpeel 0.21.78 → 0.21.80
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/search.js +222 -0
- package/dist/cli.js +0 -0
- package/dist/core/browser-fetch.d.ts +6 -0
- package/dist/core/browser-fetch.js +19 -9
- package/dist/core/domain-extractors.js +271 -0
- package/dist/core/pipeline.js +16 -0
- package/dist/core/strategies.js +18 -1
- package/package.json +1 -1
|
@@ -6,6 +6,31 @@ import { readFileSync } from 'fs';
|
|
|
6
6
|
import { peel, peelBatch, cleanup } from '../../index.js';
|
|
7
7
|
import { checkUsage, showUsageFooter, loadConfig } from '../../cli-auth.js';
|
|
8
8
|
import { writeStdout, formatListingsCsv } from '../utils.js';
|
|
9
|
+
/**
 * Parse a date range string like "Mar29-Apr4" into an array of date strings.
 * Returns ["Mar 29", "Mar 30", ..., "Apr 4"] (both endpoints included).
 *
 * Accepted forms: three-letter or full English month names, optional
 * whitespace, and a hyphen, en/em dash or the word "to" as the separator,
 * e.g. "Mar29-Apr4", "Mar 29 – Apr 4", "March 29 to April 4".
 *
 * No year is parsed; the current calendar year is assumed. When the end
 * month is earlier than the start month (e.g. "Dec30-Jan2") the range is
 * taken to cross into the following year.
 *
 * @param {string} range - Raw value of the date range.
 * @returns {string[]} One "Mon D" label per day, or [] when the input cannot
 *   be parsed or the range is reversed within a single month.
 */
function parseDateRange(range) {
    if (typeof range !== 'string')
        return [];
    // Separator is an alternation, not a character class: "[-–to]+" would
    // also accept stray "t"/"o" characters.
    const match = range.match(/([a-z]{3})[a-z]*\.?\s*(\d{1,2})(?!\d)\s*(?:[-–—]+|to)\s*([a-z]{3})[a-z]*\.?\s*(\d{1,2})(?!\d)/i);
    if (!match)
        return [];
    const months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
    const monthIndex = (name) => months.findIndex((m) => m.toLowerCase() === name.toLowerCase());
    const startMonthIdx = monthIndex(match[1]);
    const endMonthIdx = monthIndex(match[3]);
    if (startMonthIdx === -1 || endMonthIdx === -1)
        return [];
    const startDay = Number.parseInt(match[2], 10);
    const endDay = Number.parseInt(match[4], 10);
    if (startDay < 1 || startDay > 31 || endDay < 1 || endDay > 31)
        return [];
    const year = new Date().getFullYear();
    // Work in UTC so that stepping one day at a time is never skewed by a
    // local daylight-saving transition.
    const MS_PER_DAY = 24 * 60 * 60 * 1000;
    const startMs = Date.UTC(year, startMonthIdx, startDay);
    let endMs = Date.UTC(year, endMonthIdx, endDay);
    if (endMs < startMs) {
        // Only a later start *month* is treated as a year wrap; a reversed
        // range inside one month is most likely a typo.
        if (startMonthIdx <= endMonthIdx)
            return [];
        endMs = Date.UTC(year + 1, endMonthIdx, endDay);
    }
    const dates = [];
    for (let t = startMs; t <= endMs; t += MS_PER_DAY) {
        const d = new Date(t);
        dates.push(`${months[d.getUTCMonth()]} ${d.getUTCDate()}`);
    }
    return dates;
}
|
|
9
34
|
export function registerSearchCommands(program) {
|
|
10
35
|
// ── search command ────────────────────────────────────────────────────────
|
|
11
36
|
program
|
|
@@ -595,9 +620,90 @@ export function registerSearchCommands(program) {
|
|
|
595
620
|
.option('--one-way', 'One-way flight (default)')
|
|
596
621
|
.option('--round-trip', 'Round-trip flight')
|
|
597
622
|
.option('-n, --count <n>', 'Max flights to show', '10')
|
|
623
|
+
.option('--dates <range>', 'Compare prices across date range (e.g., "Mar29-Apr4")')
|
|
598
624
|
.option('--json', 'Output as JSON')
|
|
599
625
|
.option('-s, --silent', 'Silent mode')
|
|
600
626
|
.action(async (query, options) => {
|
|
627
|
+
// ── --dates: compare cheapest flight across a date range ──────────────
|
|
628
|
+
if (options.dates) {
|
|
629
|
+
const dates = parseDateRange(options.dates);
|
|
630
|
+
if (dates.length === 0) {
|
|
631
|
+
console.error('Could not parse date range. Format: "Mar29-Apr4"');
|
|
632
|
+
process.exit(1);
|
|
633
|
+
}
|
|
634
|
+
const spinner = options.silent ? null : ora(`Comparing flights across ${dates.length} dates...`).start();
|
|
635
|
+
const tripType = options.roundTrip ? '' : ' one way';
|
|
636
|
+
const rows = [];
|
|
637
|
+
for (const date of dates) {
|
|
638
|
+
if (spinner)
|
|
639
|
+
spinner.text = `Fetching flights for ${date}...`;
|
|
640
|
+
try {
|
|
641
|
+
const dateQuery = `Flights from ${query} ${date}${tripType}`;
|
|
642
|
+
const encoded = encodeURIComponent(dateQuery);
|
|
643
|
+
const url = `https://www.google.com/travel/flights?q=${encoded}`;
|
|
644
|
+
const result = await peel(url, { render: true, timeout: 30000 });
|
|
645
|
+
// Try to extract cheapest flight from structured data or content
|
|
646
|
+
let price = null;
|
|
647
|
+
let airline = null;
|
|
648
|
+
let time = null;
|
|
649
|
+
const flights = result.domainData?.structured?.flights || [];
|
|
650
|
+
if (flights.length > 0) {
|
|
651
|
+
const cheapest = flights.reduce((a, b) => {
|
|
652
|
+
const ap = parseFloat(String(a.price || '').replace(/[^0-9.]/g, '')) || Infinity;
|
|
653
|
+
const bp = parseFloat(String(b.price || '').replace(/[^0-9.]/g, '')) || Infinity;
|
|
654
|
+
return ap <= bp ? a : b;
|
|
655
|
+
});
|
|
656
|
+
price = cheapest.priceStr || (cheapest.price ? `$${cheapest.price}` : null);
|
|
657
|
+
airline = cheapest.airline || cheapest.carrier || null;
|
|
658
|
+
time = cheapest.departTime && cheapest.arriveTime
|
|
659
|
+
? `${cheapest.departTime} → ${cheapest.arriveTime}`
|
|
660
|
+
: (cheapest.time || cheapest.departure || null);
|
|
661
|
+
}
|
|
662
|
+
else {
|
|
663
|
+
// Extract from markdown content — look for price patterns
|
|
664
|
+
const priceMatch = result.content.match(/\$(\d+)/);
|
|
665
|
+
if (priceMatch)
|
|
666
|
+
price = `$${priceMatch[1]}`;
|
|
667
|
+
const airlineMatch = result.content.match(/\b(American|Delta|United|Southwest|Spirit|JetBlue|Alaska|Frontier|Allegiant|Sun Country)\b/i);
|
|
668
|
+
if (airlineMatch)
|
|
669
|
+
airline = airlineMatch[1];
|
|
670
|
+
const timeMatch = result.content.match(/(\d{1,2}:\d{2}\s*(?:AM|PM))\s*[–—→]\s*(\d{1,2}:\d{2}\s*(?:AM|PM))/i);
|
|
671
|
+
if (timeMatch)
|
|
672
|
+
time = `${timeMatch[1]} → ${timeMatch[2]}`;
|
|
673
|
+
}
|
|
674
|
+
const priceNum = price ? parseFloat(price.replace(/[^0-9.]/g, '')) || Infinity : Infinity;
|
|
675
|
+
rows.push({ date, price, airline, time, priceNum });
|
|
676
|
+
}
|
|
677
|
+
catch {
|
|
678
|
+
rows.push({ date, price: null, airline: null, time: null, priceNum: Infinity });
|
|
679
|
+
}
|
|
680
|
+
}
|
|
681
|
+
if (spinner)
|
|
682
|
+
spinner.succeed(`Compared ${rows.length} dates`);
|
|
683
|
+
if (options.json) {
|
|
684
|
+
console.log(JSON.stringify({ query, dateRange: options.dates, rows }, null, 2));
|
|
685
|
+
}
|
|
686
|
+
else {
|
|
687
|
+
// Find best price
|
|
688
|
+
const best = rows.reduce((a, b) => a.priceNum <= b.priceNum ? a : b);
|
|
689
|
+
console.log(`\n# ✈️ Flight Price Comparison — ${query}\n`);
|
|
690
|
+
console.log('| Date | Airline | Time | Price |');
|
|
691
|
+
console.log('|------|---------|------|-------|');
|
|
692
|
+
for (const row of rows) {
|
|
693
|
+
const star = row.priceNum === best.priceNum ? ' ⭐' : '';
|
|
694
|
+
const priceStr = row.price ? `${row.price}${star}` : 'N/A';
|
|
695
|
+
const airlineStr = row.airline || 'Unknown';
|
|
696
|
+
const timeStr = row.time || '—';
|
|
697
|
+
console.log(`| ${row.date} | ${airlineStr} | ${timeStr} | ${priceStr} |`);
|
|
698
|
+
}
|
|
699
|
+
if (best.price) {
|
|
700
|
+
console.log(`\n⭐ Best price: ${best.date} — ${best.airline || 'Unknown'} ${best.price}`);
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
await cleanup();
|
|
704
|
+
process.exit(0);
|
|
705
|
+
}
|
|
706
|
+
// ── Single date (default) ─────────────────────────────────────────────
|
|
601
707
|
const tripType = options.roundTrip ? '' : ' one way';
|
|
602
708
|
const encoded = encodeURIComponent(`Flights from ${query}${tripType}`);
|
|
603
709
|
const url = `https://www.google.com/travel/flights?q=${encoded}`;
|
|
@@ -631,6 +737,122 @@ export function registerSearchCommands(program) {
|
|
|
631
737
|
process.exit(1);
|
|
632
738
|
}
|
|
633
739
|
});
|
|
740
|
+
// ── rental command ────────────────────────────────────────────────────────
|
|
741
|
+
program
|
|
742
|
+
.command('rental <query>')
|
|
743
|
+
.alias('car-rental')
|
|
744
|
+
.description('Search for car rentals via Kayak — e.g. "Punta Gorda FL Apr 1-3"')
|
|
745
|
+
.option('--json', 'Output as JSON')
|
|
746
|
+
.option('-s, --silent', 'Silent mode')
|
|
747
|
+
.action(async (query, options) => {
|
|
748
|
+
// Parse location: strip date portion from query
|
|
749
|
+
const location = query.replace(/\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\s+\d+.*/i, '').trim();
|
|
750
|
+
const encodedLocation = encodeURIComponent(location.replace(/\s+/g, '-'));
|
|
751
|
+
// Parse dates: try "Apr 1-3" or "Apr 1 to Apr 3" patterns
|
|
752
|
+
const year = new Date().getFullYear();
|
|
753
|
+
let pickupDate = `${year}-04-01`;
|
|
754
|
+
let returnDate = `${year}-04-03`;
|
|
755
|
+
const rangeMatch = query.match(/\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\s+(\d+)\s*[-–to]+\s*(?:(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\s+)?(\d+)/i);
|
|
756
|
+
if (rangeMatch) {
|
|
757
|
+
const months = {
|
|
758
|
+
jan: '01', feb: '02', mar: '03', apr: '04', may: '05', jun: '06',
|
|
759
|
+
jul: '07', aug: '08', sep: '09', oct: '10', nov: '11', dec: '12',
|
|
760
|
+
};
|
|
761
|
+
const startMonth = months[rangeMatch[1].toLowerCase().slice(0, 3)];
|
|
762
|
+
const startDay = rangeMatch[2].padStart(2, '0');
|
|
763
|
+
const endMonth = rangeMatch[3] ? months[rangeMatch[3].toLowerCase().slice(0, 3)] : startMonth;
|
|
764
|
+
const endDay = rangeMatch[4].padStart(2, '0');
|
|
765
|
+
pickupDate = `${year}-${startMonth}-${startDay}`;
|
|
766
|
+
returnDate = `${year}-${endMonth}-${endDay}`;
|
|
767
|
+
}
|
|
768
|
+
const searchUrl = `https://www.kayak.com/cars/${encodedLocation}/${pickupDate}/${returnDate}?sort=price_a`;
|
|
769
|
+
const spinner = options.silent ? null : (await import('ora')).default(`Searching car rentals: ${query}...`).start();
|
|
770
|
+
try {
|
|
771
|
+
const result = await peel(searchUrl, { render: true, timeout: 40000 });
|
|
772
|
+
if (spinner)
|
|
773
|
+
spinner.succeed('Car rentals loaded');
|
|
774
|
+
if (options.json) {
|
|
775
|
+
console.log(JSON.stringify({
|
|
776
|
+
query,
|
|
777
|
+
location,
|
|
778
|
+
pickupDate,
|
|
779
|
+
returnDate,
|
|
780
|
+
url: searchUrl,
|
|
781
|
+
content: result.content,
|
|
782
|
+
tokens: result.tokens,
|
|
783
|
+
}, null, 2));
|
|
784
|
+
}
|
|
785
|
+
else {
|
|
786
|
+
console.log(result.content);
|
|
787
|
+
}
|
|
788
|
+
await cleanup();
|
|
789
|
+
process.exit(0);
|
|
790
|
+
}
|
|
791
|
+
catch (error) {
|
|
792
|
+
if (spinner)
|
|
793
|
+
spinner.fail('Car rental search failed');
|
|
794
|
+
console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
795
|
+
await cleanup();
|
|
796
|
+
process.exit(1);
|
|
797
|
+
}
|
|
798
|
+
});
|
|
799
|
+
// ── cars command ──────────────────────────────────────────────────────────
|
|
800
|
+
program
|
|
801
|
+
.command('cars <query>')
|
|
802
|
+
.description('Search for cars to buy via Cars.com — e.g. "Honda Civic"')
|
|
803
|
+
.option('--zip <zip>', 'ZIP code for local search', '10001')
|
|
804
|
+
.option('--distance <miles>', 'Max distance in miles', '30')
|
|
805
|
+
.option('--max-price <price>', 'Maximum listing price')
|
|
806
|
+
.option('--min-price <price>', 'Minimum listing price')
|
|
807
|
+
.option('--json', 'Output as JSON')
|
|
808
|
+
.option('-s, --silent', 'Silent mode')
|
|
809
|
+
.action(async (query, options) => {
|
|
810
|
+
const zip = options.zip || '10001';
|
|
811
|
+
const distance = options.distance || '30';
|
|
812
|
+
const maxPrice = options.maxPrice || '';
|
|
813
|
+
const minPrice = options.minPrice || '';
|
|
814
|
+
const params = new URLSearchParams({
|
|
815
|
+
keyword: query,
|
|
816
|
+
sort: 'list_price',
|
|
817
|
+
stock_type: 'all',
|
|
818
|
+
zip,
|
|
819
|
+
maximum_distance: distance,
|
|
820
|
+
});
|
|
821
|
+
if (maxPrice)
|
|
822
|
+
params.set('list_price_max', maxPrice);
|
|
823
|
+
if (minPrice)
|
|
824
|
+
params.set('list_price_min', minPrice);
|
|
825
|
+
const url = `https://www.cars.com/shopping/results/?${params.toString()}`;
|
|
826
|
+
const spinner = options.silent ? null : (await import('ora')).default(`Searching cars: ${query}...`).start();
|
|
827
|
+
try {
|
|
828
|
+
const result = await peel(url, { timeout: 25000 });
|
|
829
|
+
if (spinner)
|
|
830
|
+
spinner.succeed('Cars loaded');
|
|
831
|
+
if (options.json) {
|
|
832
|
+
console.log(JSON.stringify({
|
|
833
|
+
query,
|
|
834
|
+
zip,
|
|
835
|
+
distance,
|
|
836
|
+
maxPrice,
|
|
837
|
+
url,
|
|
838
|
+
content: result.content,
|
|
839
|
+
tokens: result.tokens,
|
|
840
|
+
}, null, 2));
|
|
841
|
+
}
|
|
842
|
+
else {
|
|
843
|
+
console.log(result.content);
|
|
844
|
+
}
|
|
845
|
+
await cleanup();
|
|
846
|
+
process.exit(0);
|
|
847
|
+
}
|
|
848
|
+
catch (error) {
|
|
849
|
+
if (spinner)
|
|
850
|
+
spinner.fail('Car search failed');
|
|
851
|
+
console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
852
|
+
await cleanup();
|
|
853
|
+
process.exit(1);
|
|
854
|
+
}
|
|
855
|
+
});
|
|
634
856
|
// ── extractors command ────────────────────────────────────────────────────
|
|
635
857
|
program
|
|
636
858
|
.command('extractors')
|
package/dist/cli.js
CHANGED
|
File without changes
|
|
@@ -56,6 +56,12 @@ export declare function browserFetch(url: string, options?: {
|
|
|
56
56
|
waitSelector?: string;
|
|
57
57
|
/** Block resource types for faster loading: 'image', 'stylesheet', 'font', 'media', 'script' */
|
|
58
58
|
blockResources?: string[];
|
|
59
|
+
/**
|
|
60
|
+
* Whether the target is a Single-Page Application (Kayak, Google Flights, Expedia, etc).
|
|
61
|
+
* When true, the DOM stability check uses a longer timeout (12s) to wait for async data loads.
|
|
62
|
+
* When false (default), a shorter 3s stability window is used.
|
|
63
|
+
*/
|
|
64
|
+
isSPA?: boolean;
|
|
59
65
|
}): Promise<FetchResult>;
|
|
60
66
|
/**
|
|
61
67
|
* Capture a screenshot of a URL using headless Chromium via Playwright.
|
|
@@ -30,7 +30,7 @@ let activePagesCount = 0;
|
|
|
30
30
|
export async function browserFetch(url, options = {}) {
|
|
31
31
|
// SECURITY: Validate URL to prevent SSRF
|
|
32
32
|
validateUrl(url);
|
|
33
|
-
const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, profileDir, headed = false, storageState, proxy, device = 'desktop', viewportWidth: optViewportWidth, viewportHeight: optViewportHeight, waitUntil: optWaitUntil, waitSelector, blockResources, } = options;
|
|
33
|
+
const { userAgent, waitMs = 0, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, stealth = false, actions, keepPageOpen = false, signal, profileDir, headed = false, storageState, proxy, device = 'desktop', viewportWidth: optViewportWidth, viewportHeight: optViewportHeight, waitUntil: optWaitUntil, waitSelector, blockResources, isSPA = false, } = options;
|
|
34
34
|
// Device emulation profiles
|
|
35
35
|
const deviceProfiles = {
|
|
36
36
|
desktop: { width: 1920, height: 1080, userAgent: undefined },
|
|
@@ -317,27 +317,37 @@ export async function browserFetch(url, options = {}) {
|
|
|
317
317
|
throwIfAborted();
|
|
318
318
|
}
|
|
319
319
|
// DOM stability check: wait for SPA hydration to settle.
|
|
320
|
-
// Polls innerText length every 500ms — if still growing, keep waiting
|
|
320
|
+
// Polls innerText length every 500ms — if still growing, keep waiting.
|
|
321
|
+
// SPAs (Kayak, Google Flights, Expedia) get a longer timeout to allow async data loads.
|
|
321
322
|
{
|
|
322
323
|
const stabilityStart = Date.now();
|
|
323
|
-
|
|
324
|
+
// SPA sites (Kayak, Google Flights, Expedia) need up to 12s for results to load.
|
|
325
|
+
// Normal rendered pages need just 3s extra.
|
|
326
|
+
const MAX_STABILITY_WAIT_MS = isSPA ? 12000 : 3000;
|
|
327
|
+
// SPA: must be stable for 2s (4 consecutive 500ms checks). Normal: 1s (2 checks).
|
|
328
|
+
const STABLE_CHECKS_REQUIRED = isSPA ? 4 : 2;
|
|
324
329
|
const POLL_INTERVAL_MS = 500;
|
|
330
|
+
const MIN_CONTENT_LENGTH = 200; // Don't consider near-empty pages stable
|
|
325
331
|
let prevLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
|
|
326
332
|
let stableCount = 0;
|
|
327
333
|
while (Date.now() - stabilityStart < MAX_STABILITY_WAIT_MS) {
|
|
328
334
|
throwIfAborted();
|
|
329
335
|
await page.waitForTimeout(POLL_INTERVAL_MS);
|
|
330
336
|
const curLength = await page.evaluate('document.body?.innerText?.length || 0').catch(() => 0);
|
|
331
|
-
if (curLength
|
|
332
|
-
|
|
333
|
-
if (stableCount >= 2)
|
|
334
|
-
break; // stable for 2 consecutive checks (~1s)
|
|
335
|
-
}
|
|
336
|
-
else {
|
|
337
|
+
if (curLength !== prevLength) {
|
|
338
|
+
// Content changed — reset stability counter
|
|
337
339
|
stableCount = 0;
|
|
338
340
|
}
|
|
341
|
+
else if (curLength >= MIN_CONTENT_LENGTH) {
|
|
342
|
+
stableCount++;
|
|
343
|
+
if (stableCount >= STABLE_CHECKS_REQUIRED)
|
|
344
|
+
break; // stable long enough
|
|
345
|
+
}
|
|
339
346
|
prevLength = curLength;
|
|
340
347
|
}
|
|
348
|
+
if (isSPA) {
|
|
349
|
+
log.debug(`SPA stability check: ${Date.now() - stabilityStart}ms, length=${prevLength}`);
|
|
350
|
+
}
|
|
341
351
|
}
|
|
342
352
|
const finalUrl = page.url();
|
|
343
353
|
const contentType = response?.headers()?.['content-type'] || '';
|
|
@@ -123,6 +123,7 @@ const REGISTRY = [
|
|
|
123
123
|
{ match: (h) => h === 'redfin.com' || h === 'www.redfin.com', extractor: redfinExtractor },
|
|
124
124
|
// ── Travel ──────────────────────────────────────────────────────────────
|
|
125
125
|
{ match: (h, url = '') => (h === 'www.google.com' || h === 'google.com') && url.includes('/travel/flights'), extractor: googleFlightsExtractor },
|
|
126
|
+
{ match: (h, url = '') => (h === 'www.kayak.com' || h === 'kayak.com') && url.includes('/cars/'), extractor: kayakCarRentalExtractor },
|
|
126
127
|
];
|
|
127
128
|
/**
|
|
128
129
|
* Returns the domain extractor for a URL, or null if none matches.
|
|
@@ -6069,3 +6070,273 @@ async function googleFlightsExtractor(_html, url) {
|
|
|
6069
6070
|
cleanContent: md.join('\n'),
|
|
6070
6071
|
};
|
|
6071
6072
|
}
|
|
6073
|
+
// ---------------------------------------------------------------------------
|
|
6074
|
+
// Kayak Car Rental extractor
|
|
6075
|
+
// ---------------------------------------------------------------------------
|
|
6076
|
+
/**
 * Extract car-rental listings from a Kayak car search results page.
 *
 * The page text is scanned line by line. A line of the form
 * "or similar {Class}" marks a listing; the line before it is the car name,
 * and the following lines (up to the next marker, at most 30) are searched
 * for pick-up location, distance, rating, company, cancellation policy and
 * the total price. Listings without a total price are discarded.
 *
 * @param {string} _html - Raw HTML or already-extracted page text.
 * @param {string} url - Page URL; expected shape
 *   `/cars/{Location}/{YYYY-MM-DD}/{YYYY-MM-DD}`.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`
 *   or null when the URL is not a car search, the input is not text, or no
 *   priced listing was found.
 */
async function kayakCarRentalExtractor(_html, url) {
    if (typeof url !== 'string' || !url.includes('/cars/'))
        return null;
    if (typeof _html !== 'string')
        return null;
    const FALLBACK_COMPANY_URL = 'https://www.kayak.com';
    const LOOKAHEAD_LINES = 30;
    const MS_PER_DAY = 24 * 60 * 60 * 1000;
    const MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
    // Rental company homepage URLs. A Map (not a plain object) so that scraped
    // company names such as "constructor" can never resolve to inherited keys.
    const rentalCompanyUrls = new Map([
        ['Hertz', 'https://www.hertz.com'],
        ['Budget', 'https://www.budget.com'],
        ['Avis', 'https://www.avis.com'],
        ['Enterprise', 'https://www.enterprise.com'],
        ['National', 'https://www.nationalcar.com'],
        ['Alamo', 'https://www.alamo.com'],
        ['Dollar', 'https://www.dollar.com'],
        ['Thrifty', 'https://www.thrifty.com'],
        ['Sixt', 'https://www.sixt.com'],
        ['Fox', 'https://www.foxrentacar.com'],
        ['Payless', 'https://www.paylesscar.com'],
        ['Turn', 'https://www.turn.com'],
        ['EconomyBookings', 'https://www.economybookings.com'],
        ['Priceline', 'https://www.priceline.com'],
        ['Expedia', 'https://www.expedia.com'],
        ['Turo', 'https://www.turo.com'],
        ['KAYAK', 'https://www.kayak.com'],
        ['Booking.com', 'https://www.booking.com'],
        ['DiscoverCars', 'https://www.discovercars.com'],
        ['RentalCars', 'https://www.rentalcars.com'],
        ['Car Rental 8', 'https://www.carrental8.com'],
        ['Hotwire', 'https://www.hotwire.com'],
    ]);
    function getCompanyUrl(company) {
        return rentalCompanyUrls.get(company) || FALLBACK_COMPANY_URL;
    }
    // "1,234,567" → 1234567 (every thousands separator removed, not just the first).
    function parsePrice(digits) {
        return Number.parseInt(digits.replace(/,/g, ''), 10);
    }
    // Parse dates from URL: /cars/Location/YYYY-MM-DD/YYYY-MM-DD
    let numDays = 1;
    let pickupDate = '';
    let dropoffDate = '';
    let locationName = '';
    const dateMatch = url.match(/\/cars\/([^/]+)\/(\d{4}-\d{2}-\d{2})\/(\d{4}-\d{2}-\d{2})/);
    if (dateMatch) {
        try {
            locationName = decodeURIComponent(dateMatch[1]);
        }
        catch {
            // Malformed percent-escape: fall back to the raw path segment.
            locationName = dateMatch[1];
        }
        pickupDate = dateMatch[2];
        dropoffDate = dateMatch[3];
        // Date-only ISO strings parse as UTC midnight, so the difference is whole days.
        const spanMs = Date.parse(dropoffDate) - Date.parse(pickupDate);
        if (Number.isFinite(spanMs)) {
            numDays = Math.max(1, Math.round(spanMs / MS_PER_DAY));
        }
    }
    // Format date range for display (e.g. "Apr 1–3")
    function formatDateRange(from, to) {
        if (!from || !to)
            return '';
        // Parse AND read in UTC; mixing a local-time parse with UTC getters
        // shifts the day in zones 12+ hours away from UTC.
        const fromDate = new Date(`${from}T12:00:00Z`);
        const toDate = new Date(`${to}T12:00:00Z`);
        if (Number.isNaN(fromDate.getTime()) || Number.isNaN(toDate.getTime()))
            return '';
        const fromMonth = MONTHS[fromDate.getUTCMonth()];
        const toMonth = MONTHS[toDate.getUTCMonth()];
        const fromDay = fromDate.getUTCDate();
        const toDay = toDate.getUTCDate();
        if (fromMonth === toMonth)
            return `${fromMonth} ${fromDay}–${toDay}`;
        return `${fromMonth} ${fromDay}–${toMonth} ${toDay}`;
    }
    // Process content: strip HTML if needed
    let text = _html;
    if (text.includes('<!DOCTYPE') || text.includes('<html')) {
        text = text
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, '\n')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&#\d+;/g, '')
            // "&amp;" is decoded last so "&amp;lt;" is not decoded twice.
            .replace(/&amp;/g, '&')
            .replace(/\n{2,}/g, '\n');
    }
    const lines = text.split('\n').map((l) => l.trim()).filter(Boolean);
    const listings = [];
    for (let i = 0; i < lines.length; i++) {
        // Detect: "or similar {Class}" — this signals a standard car rental listing.
        // The car name is the line BEFORE "or similar".
        const orSimilarMatch = lines[i].match(/^or similar\s+(.+)$/);
        if (!orSimilarMatch)
            continue;
        const carClass = orSimilarMatch[1].trim();
        const carName = i > 0 ? lines[i - 1] : '';
        if (!carName || carName.length > 60)
            continue;
        // Look ahead for: pickup location, rating, company, price
        let location = '';
        let distanceFromCenter = '';
        let rating = null;
        let company = '';
        let totalPrice = 0;
        let cancellation = '';
        const lookaheadEnd = Math.min(i + LOOKAHEAD_LINES, lines.length);
        for (let j = i + 1; j < lookaheadEnd; j++) {
            const l = lines[j];
            // Pickup location
            if (!location && l.startsWith('Pick-up')) {
                const locMatch = l.match(/Pick-up (?:city|airport):\s*(.+)/);
                if (locMatch)
                    location = locMatch[1].trim();
                continue;
            }
            // Distance from center
            if (!distanceFromCenter) {
                const distM = l.match(/^([\d.]+)\s+mi\s+from\s+city\s+center/);
                if (distM) {
                    distanceFromCenter = `${distM[1]} mi from city center`;
                    continue;
                }
            }
            // Rating (a bare decimal such as "9.2", "8.5", "7.2")
            if (rating === null) {
                const ratingM = l.match(/^(\d+\.\d+)$/);
                if (ratingM) {
                    rating = Number.parseFloat(ratingM[1]);
                    continue;
                }
            }
            // Company from "N offer(s) from {Company}" or a standalone known name
            if (!company) {
                const offerMatch = l.match(/offers? from (.+)$/);
                if (offerMatch) {
                    company = offerMatch[1].trim();
                    continue;
                }
                if (rentalCompanyUrls.has(l)) {
                    company = l;
                    continue;
                }
            }
            // Cancellation policy
            if (!cancellation && (l.includes('Free cancellation') || l.includes('No free cancellation'))) {
                cancellation = l;
                continue;
            }
            // Price — "$NNN" on its own line, followed by a line containing "Total"
            const priceM = l.match(/^\$(\d[\d,]*)$/);
            if (priceM && (lines[j + 1] || '').includes('Total')) {
                totalPrice = parsePrice(priceM[1]);
                break;
            }
            // Also catch price and "Total" on the same line
            const inlinePriceM = l.match(/\$(\d[\d,]*)\s*Total/);
            if (inlinePriceM) {
                totalPrice = parsePrice(inlinePriceM[1]);
                break;
            }
            // Stop if we hit another car listing marker
            if (/^or similar\s/.test(l) || l === 'Show more results')
                break;
        }
        if (totalPrice > 0) {
            listings.push({
                name: carName,
                carClass: carClass.replace('Full size', 'Full-size'),
                totalPrice,
                perDayPrice: Math.round(totalPrice / numDays),
                company: company || 'Unknown',
                location: location || 'See booking',
                distanceFromCenter,
                rating,
                cancellation,
                isTuro: false,
            });
        }
    }
    // Deduplicate by name+price; when two entries collide, prefer the one
    // that carries a real company over "Unknown".
    const byKey = new Map();
    for (const c of listings) {
        const key = `${c.name.toLowerCase()}-${c.totalPrice}`;
        const existing = byKey.get(key);
        if (!existing || (existing.company === 'Unknown' && c.company !== 'Unknown')) {
            byKey.set(key, c);
        }
    }
    const unique = Array.from(byKey.values());
    if (unique.length === 0)
        return null;
    // Drop "Unknown"-company entries whenever at least one listing has a
    // known company; otherwise keep everything that was found.
    const knownCompanyListings = unique.filter((c) => c.company !== 'Unknown');
    const finalListings = knownCompanyListings.length > 0 ? knownCompanyListings : unique;
    // Sort by price
    finalListings.sort((a, b) => a.totalPrice - b.totalPrice);
    // Get total count from page if mentioned
    let totalFound = unique.length;
    for (const l of lines) {
        const m = l.match(/^(\d+)\s+results?$/) || l.match(/(\d+)\s+cars?\s+found/);
        if (m) {
            totalFound = Number.parseInt(m[1], 10);
            break;
        }
    }
    // Format location name nicely (e.g. "Punta-Gorda,FL-c34451" → "Punta Gorda, FL")
    function formatLocation(loc) {
        return loc
            .replace(/-c\d+$/, '') // remove trailing "-c12345"
            .replace(/-/g, ' ') // hyphens to spaces
            .replace(/,(\S)/g, ', $1'); // ensure space after comma
    }
    const dateRange = formatDateRange(pickupDate, dropoffDate);
    const displayLocation = formatLocation(locationName);
    const daysLabel = numDays === 1 ? '1 day' : `${numDays} days`;
    const md = [
        `# 🚗 Car Rentals — ${displayLocation} · ${dateRange} (${daysLabel})`,
        '',
        `*${totalFound} cars found · Source: [Kayak](${url})*`,
        `*Free cancellation available on most rentals*`,
        '',
    ];
    finalListings.forEach((c, idx) => {
        md.push(`## ${idx + 1}. ${c.name} (${c.carClass}) — $${c.totalPrice} total · $${c.perDayPrice}/day`);
        if (c.distanceFromCenter) {
            md.push(`📍 ${c.distanceFromCenter}`);
        }
        else if (c.location && c.location !== 'See booking') {
            md.push(`📍 ${c.location}`);
        }
        const ratingStr = c.rating !== null ? ` · Rating: ${c.rating}` : '';
        md.push(`🏪 via ${c.company}${ratingStr}`);
        if (c.cancellation)
            md.push(`✅ ${c.cancellation}`);
        md.push(`🔍 [See price on Kayak](${url})`);
        md.push(`🛒 [Book on ${c.company}](${getCompanyUrl(c.company)})`);
        md.push('');
    });
    md.push('---');
    md.push(`📌 *Prices verified via [Kayak](${url}). Click "See price" to confirm current rate, then book with the rental company.*`);
    return {
        domain: 'kayak.com/cars',
        type: 'car-rental',
        structured: {
            cars: finalListings,
            location: displayLocation,
            pickupDate,
            dropoffDate,
            numDays,
            totalFound,
            source: 'Kayak',
            sourceUrl: url,
        },
        cleanContent: md.join('\n'),
    };
}
|
package/dist/core/pipeline.js
CHANGED
|
@@ -426,6 +426,22 @@ export async function fetchContent(ctx) {
|
|
|
426
426
|
// @ts-ignore — proprietary module, gitignored
|
|
427
427
|
const { searchFallback } = await import('./search-fallback.js');
|
|
428
428
|
const searchResult = await searchFallback(ctx.url);
|
|
429
|
+
// If DDG/primary returned very little, also try Bing for richer snippets
|
|
430
|
+
if (!searchResult.cachedContent || searchResult.cachedContent.length < 400) {
|
|
431
|
+
try {
|
|
432
|
+
const { simpleFetch } = await import('./http-fetch.js');
|
|
433
|
+
const bingUrl = `https://www.bing.com/search?q=${encodeURIComponent(ctx.url)}`;
|
|
434
|
+
const bingResult = await simpleFetch(bingUrl, ctx.userAgent, 8000);
|
|
435
|
+
if (bingResult.html && bingResult.html.length > 500) {
|
|
436
|
+
const snippetMatch = bingResult.html.match(/<p[^>]*class="[^"]*snippet[^"]*"[^>]*>(.*?)<\/p>/gi);
|
|
437
|
+
if (snippetMatch) {
|
|
438
|
+
const bingSnippet = snippetMatch.map(s => s.replace(/<[^>]+>/g, '')).join('\n');
|
|
439
|
+
searchResult.cachedContent = (searchResult.cachedContent || '') + '\n\n---\n*Additional context from Bing:*\n' + bingSnippet;
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
catch { /* Bing fallback is best-effort */ }
|
|
444
|
+
}
|
|
429
445
|
if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
|
|
430
446
|
ctx.timer.end('fetch');
|
|
431
447
|
ctx.content = searchResult.cachedContent;
|
package/dist/core/strategies.js
CHANGED
|
@@ -232,7 +232,7 @@ function prefetchDns(url) {
|
|
|
232
232
|
}
|
|
233
233
|
}
|
|
234
234
|
async function fetchWithBrowserStrategy(url, options) {
|
|
235
|
-
const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, storageState, proxy, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, } = options;
|
|
235
|
+
const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, storageState, proxy, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, isSPA, } = options;
|
|
236
236
|
try {
|
|
237
237
|
const result = await browserFetch(url, {
|
|
238
238
|
userAgent,
|
|
@@ -256,6 +256,7 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
256
256
|
waitUntil,
|
|
257
257
|
waitSelector,
|
|
258
258
|
blockResources,
|
|
259
|
+
isSPA,
|
|
259
260
|
});
|
|
260
261
|
return {
|
|
261
262
|
...result,
|
|
@@ -500,6 +501,21 @@ export async function smartFetch(url, options = {}) {
|
|
|
500
501
|
if (storageState) {
|
|
501
502
|
effectiveForceBrowser = true;
|
|
502
503
|
}
|
|
504
|
+
// Detect SPA for smarter DOM stability wait
|
|
505
|
+
const SPA_FETCH_DOMAINS = new Set([
|
|
506
|
+
'www.google.com', 'flights.google.com', 'www.airbnb.com', 'www.booking.com',
|
|
507
|
+
'www.expedia.com', 'www.kayak.com', 'www.skyscanner.com', 'www.tripadvisor.com',
|
|
508
|
+
'www.indeed.com', 'www.glassdoor.com', 'www.zillow.com', 'app.webpeel.dev',
|
|
509
|
+
]);
|
|
510
|
+
const SPA_FETCH_URL_PATTERNS = [
|
|
511
|
+
/google\.com\/travel/, /google\.com\/maps/, /google\.com\/shopping/,
|
|
512
|
+
];
|
|
513
|
+
let isSPAUrl = false;
|
|
514
|
+
try {
|
|
515
|
+
const parsedHostname = new URL(url).hostname;
|
|
516
|
+
isSPAUrl = SPA_FETCH_DOMAINS.has(parsedHostname) || SPA_FETCH_URL_PATTERNS.some(p => p.test(url));
|
|
517
|
+
}
|
|
518
|
+
catch { /* invalid URL — ignore */ }
|
|
503
519
|
const browserOptions = {
|
|
504
520
|
userAgent,
|
|
505
521
|
waitMs,
|
|
@@ -521,6 +537,7 @@ export async function smartFetch(url, options = {}) {
|
|
|
521
537
|
waitUntil,
|
|
522
538
|
waitSelector,
|
|
523
539
|
blockResources,
|
|
540
|
+
isSPA: isSPAUrl,
|
|
524
541
|
};
|
|
525
542
|
/* ---- Strategy: simple fetch (with optional race) --------------------- */
|
|
526
543
|
if (!shouldUseBrowser) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.78",
|
|
3
|
+
"version": "0.21.80",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|