webpeel 0.21.73 → 0.21.75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/search.js +43 -0
- package/dist/core/domain-extractors.js +176 -0
- package/dist/core/pipeline.js +43 -1
- package/package.json +1 -1
|
@@ -588,6 +588,49 @@ export function registerSearchCommands(program) {
|
|
|
588
588
|
process.exit(1);
|
|
589
589
|
}
|
|
590
590
|
});
|
|
591
|
+
// ── flights command ───────────────────────────────────────────────────────
|
|
592
|
+
program
    .command('flights <query>')
    .description('Search for flights (via Google Flights) — e.g. "NYC to Fort Myers Apr 4"')
    .option('--one-way', 'One-way flight (default)')
    .option('--round-trip', 'Round-trip flight')
    .option('-n, --count <n>', 'Max flights to show', '10')
    .option('--json', 'Output as JSON')
    .option('-s, --silent', 'Silent mode')
    /**
     * Runs a Google Flights search for the free-text query and prints the result.
     *
     * @param {string} query - Free-text route/date description, e.g. "NYC to Fort Myers Apr 4".
     * @param {object} options - Parsed CLI flags (roundTrip, count, json, silent).
     */
    .action(async (query, options) => {
        // One-way is the default: only --round-trip drops the " one way" suffix.
        const tripType = options.roundTrip ? '' : ' one way';
        const encoded = encodeURIComponent(`Flights from ${query}${tripType}`);
        const url = `https://www.google.com/travel/flights?q=${encoded}`;
        // --count arrives as a string; fall back to the declared default (10)
        // when it is not a positive integer.
        const parsedCount = Number.parseInt(options.count, 10);
        const maxFlights = Number.isInteger(parsedCount) && parsedCount > 0 ? parsedCount : 10;
        const spinner = options.silent ? null : ora(`Searching flights: ${query}...`).start();
        try {
            // render is forced automatically by SPA auto-detect, but be explicit here
            const result = await peel(url, { render: true, timeout: 30000 });
            if (spinner) {
                spinner.succeed('Flights loaded');
            }
            if (options.json) {
                const extracted = result.domainData?.structured?.flights;
                // Honor --count on the structured list. The text output below is
                // pre-rendered markdown and is printed as-is.
                const flights = Array.isArray(extracted) ? extracted.slice(0, maxFlights) : [];
                console.log(JSON.stringify({
                    query,
                    url,
                    flights,
                    source: 'Google Flights',
                    content: result.content,
                    tokens: result.tokens,
                }, null, 2));
            }
            else {
                console.log(result.content);
            }
            await cleanup();
            process.exit(0);
        }
        catch (error) {
            if (spinner) {
                spinner.fail('Flight search failed');
            }
            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
            await cleanup();
            process.exit(1);
        }
    });
|
|
591
634
|
// ── extractors command ────────────────────────────────────────────────────
|
|
592
635
|
program
|
|
593
636
|
.command('extractors')
|
|
@@ -121,6 +121,8 @@ const REGISTRY = [
|
|
|
121
121
|
{ match: (h) => h === 'yelp.com' || h === 'www.yelp.com', extractor: yelpExtractor },
|
|
122
122
|
{ match: (h) => h === 'zillow.com' || h === 'www.zillow.com', extractor: zillowExtractor },
|
|
123
123
|
{ match: (h) => h === 'redfin.com' || h === 'www.redfin.com', extractor: redfinExtractor },
|
|
124
|
+
// ── Travel ──────────────────────────────────────────────────────────────
|
|
125
|
+
{ match: (h, url = '') => (h === 'www.google.com' || h === 'google.com') && url.includes('/travel/flights'), extractor: googleFlightsExtractor },
|
|
124
126
|
];
|
|
125
127
|
/**
|
|
126
128
|
* Returns the domain extractor for a URL, or null if none matches.
|
|
@@ -5860,3 +5862,177 @@ async function redfinExtractor(_html, url) {
|
|
|
5860
5862
|
return null;
|
|
5861
5863
|
}
|
|
5862
5864
|
}
|
|
5865
|
+
// ---------------------------------------------------------------------------
|
|
5866
|
+
// Google Flights extractor
|
|
5867
|
+
// ---------------------------------------------------------------------------
|
|
5868
|
+
/**
 * Extracts flight listings from a Google Flights results page.
 *
 * The input may be raw HTML or the line-oriented text/markdown form of the page;
 * raw HTML is reduced to one text fragment per line before parsing.
 *
 * @param {string} _html - Page content (raw HTML or already-processed text).
 * @param {string} url - Page URL; must contain "/travel/flights".
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }` where
 *   `structured.flights` is sorted by ascending price, or `null` when the URL does
 *   not match, the content is not a string, or no complete flight could be parsed.
 */
async function googleFlightsExtractor(_html, url) {
    if (typeof url !== 'string' || !url.includes('/travel/flights'))
        return null;
    // Return null (rather than throw) on missing content so the caller can
    // retry with a different representation of the page.
    if (typeof _html !== 'string' || _html === '')
        return null;
    // Google Flights is a SPA. The _html parameter is usually readability-processed markdown
    // (from the pipeline's post-fetch processing), which looks like:
    //   - 7:15 PM
    //   7:15 PM on Sat, Apr 4
    //   – 10:29 PM
    //   United
    //   3 hr 14 min
    //   EWR
    //   ...
    //   $188
    //
    // This markdown is much easier to parse than raw HTML.
    let text = _html;
    // If this is raw HTML (contains <!DOCTYPE or <html), strip HTML tags
    if (text.includes('<!DOCTYPE') || text.includes('<html')) {
        text = text
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, '\n')
            // Decode &amp; after &lt;/&gt; so "&amp;lt;" is not decoded twice.
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&amp;/g, '&')
            .replace(/&#\d+;/g, '')
            .replace(/\n{2,}/g, '\n');
    }
    const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
    const AIRLINES = ['United', 'Delta', 'American', 'JetBlue', 'Spirit', 'Frontier', 'Southwest', 'Breeze', 'Alaska', 'Hawaiian', 'Sun Country', 'Avelo'];
    const flights = [];
    for (let i = 0; i < lines.length; i++) {
        // A flight record starts at a line that is only a time, optionally
        // prefixed by a markdown list dash.
        const departMatch = lines[i].match(/^(?:-\s+)?(\d{1,2}:\d{2}\s*[AP]M)$/);
        if (!departMatch)
            continue;
        const departTime = departMatch[1];
        let departDate = '';
        let arriveTime = '';
        let airline = '';
        let duration = '';
        let fromAirport = '';
        let toAirport = '';
        let stops = '';
        let bags = '';
        let price = 0;
        // Scan a bounded window after the departure line; the first price
        // found closes the record.
        const windowEnd = Math.min(i + 45, lines.length);
        for (let j = i + 1; j < windowEnd; j++) {
            const l = lines[j];
            // Date
            const dateM = l.match(/on\s+(\w+,\s+\w+\s+\d+)/);
            if (dateM && !departDate) {
                departDate = dateM[1];
                continue;
            }
            // Arrival time: "– 10:29 PM" (hyphen, figure/en/em dash or minus sign)
            // or "10:29 PM on Sat, Apr 4" once the departure date is already known.
            const arrM = l.match(/^[\u2012\u2013\u2014\u2212-]\s*(\d{1,2}:\d{2}\s*[AP]M)$/)
                || l.match(/^(\d{1,2}:\d{2}\s*[AP]M)\s+on\s/);
            if (arrM && !arriveTime) {
                arriveTime = arrM[1];
                continue;
            }
            // Airline
            if (!airline) {
                const found = AIRLINES.find(a => l === a || l.startsWith(a + 'Operated') || l.startsWith(a + ' '));
                if (found) {
                    airline = found;
                    continue;
                }
            }
            // Duration
            if (!duration && /^\d+\s+hr\s+\d+\s+min$/.test(l)) {
                duration = l;
                continue;
            }
            // Airport codes: first three-letter code is the origin, the next
            // different one is the destination.
            if (/^[A-Z]{3}$/.test(l)) {
                if (!fromAirport) {
                    fromAirport = l;
                    continue;
                }
                if (!toAirport && l !== fromAirport) {
                    toAirport = l;
                    continue;
                }
            }
            // Stops
            if (!stops && (l === 'Nonstop' || /^\d+\s+stop/.test(l))) {
                stops = l;
                continue;
            }
            // Bags
            if (l.includes('carry-on bag') && !bags) {
                bags = l.includes('not included') ? 'Carry-on NOT included (extra fee)' : 'Carry-on included';
                continue;
            }
            // Price — first occurrence only
            const priceM = l.match(/^\$(\d[\d,]*)$/);
            if (priceM && !price) {
                // Strip every thousands separator, not just the first one.
                price = Number.parseInt(priceM[1].replace(/,/g, ''), 10);
                break;
            }
        }
        if (arriveTime && airline && price) {
            flights.push({ departTime, arriveTime, departDate, airline, duration, fromAirport, toAirport, stops: stops || 'Unknown', price, priceStr: `$${price}`, bags });
        }
    }
    // Deduplicate
    const seen = new Set();
    const unique = flights.filter(f => {
        const key = `${f.departTime}-${f.airline}-${f.price}`;
        if (seen.has(key))
            return false;
        seen.add(key);
        return true;
    });
    if (unique.length === 0)
        return null;
    unique.sort((a, b) => a.price - b.price);
    // Helper: get airline booking URL
    function getAirlineBookingUrl(airline, from, to) {
        const fromLower = from.toLowerCase();
        const toLower = to.toLowerCase();
        const urlMap = {
            'United': `https://www.united.com/en-us/flights-from-${fromLower}-to-${toLower}`,
            'Delta': `https://www.delta.com/flight-search/search`,
            'JetBlue': `https://www.jetblue.com/booking/flights`,
            'American': `https://www.aa.com/booking/find-flights`,
            'Spirit': `https://www.spirit.com/book/flights`,
            'Frontier': `https://www.flyfrontier.com/booking/`,
            'Southwest': `https://www.southwest.com/air/booking/`,
            'Breeze': `https://www.flybreeze.com/home`,
            'Alaska': `https://www.alaskaair.com/booking/flights`,
            'Hawaiian': `https://www.hawaiianairlines.com/book-a-trip`,
            'Sun Country': `https://www.suncountry.com/booking/search`,
            'Avelo': `https://www.aveloair.com/book`,
        };
        return urlMap[airline] || `https://www.google.com/search?q=${encodeURIComponent(`${airline} flights ${from} to ${to}`)}`;
    }
    // Parse route from URL
    let rawQuery = '';
    try {
        rawQuery = new URL(url).searchParams.get('q') || '';
    }
    catch {
        // Relative or malformed URL: keep the parsed flights and use the generic title.
    }
    const query = rawQuery.replace(/Flights?\s+(from\s+)?/i, '').replace(/\s+one\s+way/i, '').trim();
    const md = [
        `# ✈️ Flights — ${query || 'Search Results'}`,
        '',
        `*${unique.length} flights found · Source: [Google Flights](${url})*`,
        `*Prices include taxes + fees for 1 adult. Book directly via airline.*`,
        '',
    ];
    for (let idx = 0; idx < unique.length; idx++) {
        const f = unique[idx];
        const bookingUrl = getAirlineBookingUrl(f.airline, f.fromAirport, f.toAirport);
        md.push(`## ${idx + 1}. ${f.airline} — ${f.priceStr}`);
        md.push(`🕐 Depart **${f.departTime}** → Arrive **${f.arriveTime}**${f.departDate ? ` · ${f.departDate}` : ''}`);
        md.push(`🛫 ${f.fromAirport} → ${f.toAirport} · ${f.duration} · ${f.stops}`);
        if (f.bags)
            md.push(`🧳 ${f.bags}`);
        md.push(`🔗 [Book on ${f.airline}](${bookingUrl})`);
        md.push('');
    }
    md.push('---');
    md.push(`📌 *Prices change frequently. [View live prices on Google Flights](${url})*`);
    return {
        domain: 'google.com/travel/flights',
        type: 'flights',
        structured: { flights: unique, route: query, source: 'Google Flights', sourceUrl: url },
        cleanContent: md.join('\n'),
    };
}
|
package/dist/core/pipeline.js
CHANGED
|
@@ -146,6 +146,43 @@ export function normalizeOptions(ctx) {
|
|
|
146
146
|
if (autoScrollOpts) {
|
|
147
147
|
ctx.render = true;
|
|
148
148
|
}
|
|
149
|
+
// Auto-detect SPAs that require browser rendering (no --render flag needed)
|
|
150
|
+
if (!ctx.render) {
|
|
151
|
+
const SPA_DOMAINS = new Set([
|
|
152
|
+
'www.google.com', // Google Flights, Maps, Shopping etc.
|
|
153
|
+
'flights.google.com',
|
|
154
|
+
'www.airbnb.com',
|
|
155
|
+
'www.booking.com',
|
|
156
|
+
'www.expedia.com',
|
|
157
|
+
'www.kayak.com',
|
|
158
|
+
'www.skyscanner.com',
|
|
159
|
+
'www.tripadvisor.com',
|
|
160
|
+
'www.indeed.com',
|
|
161
|
+
'www.glassdoor.com',
|
|
162
|
+
'www.zillow.com', // already handled but backup
|
|
163
|
+
'app.webpeel.dev', // our own dashboard is a SPA
|
|
164
|
+
]);
|
|
165
|
+
// More specific: some google.com paths need render, not all
|
|
166
|
+
const SPA_URL_PATTERNS = [
|
|
167
|
+
/google\.com\/travel/,
|
|
168
|
+
/google\.com\/maps/,
|
|
169
|
+
/google\.com\/shopping/,
|
|
170
|
+
];
|
|
171
|
+
try {
|
|
172
|
+
const hostname = new URL(ctx.url).hostname;
|
|
173
|
+
if (SPA_DOMAINS.has(hostname)) {
|
|
174
|
+
ctx.render = true;
|
|
175
|
+
log.debug(`Auto-enabling render: SPA domain detected (${hostname})`);
|
|
176
|
+
}
|
|
177
|
+
else if (SPA_URL_PATTERNS.some(p => p.test(ctx.url))) {
|
|
178
|
+
ctx.render = true;
|
|
179
|
+
log.debug(`Auto-enabling render: SPA URL pattern matched`);
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
catch {
|
|
183
|
+
// Invalid URL — skip SPA detection
|
|
184
|
+
}
|
|
185
|
+
}
|
|
149
186
|
}
|
|
150
187
|
// ---------------------------------------------------------------------------
|
|
151
188
|
// Stage 2: handleYouTube
|
|
@@ -991,7 +1028,12 @@ export async function postProcess(ctx) {
|
|
|
991
1028
|
if (getDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
|
|
992
1029
|
try {
|
|
993
1030
|
ctx.timer.mark('domainExtract');
|
|
994
|
-
|
|
1031
|
+
// Try raw HTML first, then fall back to readability-processed content
|
|
1032
|
+
// (some SPAs like Google Flights have data only after readability processing)
|
|
1033
|
+
let ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
|
|
1034
|
+
if (!ddResult && ctx.content) {
|
|
1035
|
+
ddResult = await extractDomainData(ctx.content, fetchResult.url);
|
|
1036
|
+
}
|
|
995
1037
|
ctx.timer.end('domainExtract');
|
|
996
1038
|
if (ddResult) {
|
|
997
1039
|
ctx.domainData = ddResult;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.73",
|
|
3
|
+
"version": "0.21.75",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|