webpeel 0.21.73 → 0.21.75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -588,6 +588,49 @@ export function registerSearchCommands(program) {
588
588
  process.exit(1);
589
589
  }
590
590
  });
591
+ // ── flights command ───────────────────────────────────────────────────────
592
program
    .command('flights <query>')
    .description('Search for flights (via Google Flights) — e.g. "NYC to Fort Myers Apr 4"')
    .option('--one-way', 'One-way flight (default)')
    .option('--round-trip', 'Round-trip flight')
    .option('-n, --count <n>', 'Max flights to show', '10')
    .option('--json', 'Output as JSON')
    .option('-s, --silent', 'Silent mode')
    /**
     * Runs a Google Flights search for the free-text `query` and prints the result.
     *
     * One-way is the default; `--round-trip` drops the " one way" suffix from the
     * search phrase. With `--json` the structured flight list (capped at `--count`)
     * is printed together with the page content; otherwise the content returned by
     * peel() is printed unchanged. The process exits with 0 on success and 1 on error.
     */
    .action(async (query, options) => {
        const tripType = options.roundTrip ? '' : ' one way';
        const encoded = encodeURIComponent(`Flights from ${query}${tripType}`);
        const url = `https://www.google.com/travel/flights?q=${encoded}`;
        // --count arrives as a string; fall back to the declared default (10) when it
        // is not a positive integer so a typo does not silently produce an empty list.
        const requestedCount = Number.parseInt(options.count, 10);
        const maxFlights = Number.isInteger(requestedCount) && requestedCount > 0 ? requestedCount : 10;
        const spinner = options.silent ? null : ora(`Searching flights: ${query}...`).start();
        try {
            // render is forced automatically by SPA auto-detect, but be explicit here
            const result = await peel(url, { render: true, timeout: 30000 });
            if (spinner)
                spinner.succeed('Flights loaded');
            if (options.json) {
                const extracted = result.domainData?.structured?.flights;
                const flights = Array.isArray(extracted) ? extracted.slice(0, maxFlights) : [];
                console.log(JSON.stringify({
                    query,
                    url,
                    flights,
                    source: 'Google Flights',
                    content: result.content,
                    tokens: result.tokens,
                }, null, 2));
            }
            else {
                // Text mode prints the content exactly as returned by peel();
                // --count is only applied to the structured JSON list.
                console.log(result.content);
            }
            await cleanup();
            process.exit(0);
        }
        catch (error) {
            if (spinner)
                spinner.fail('Flight search failed');
            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
            await cleanup();
            process.exit(1);
        }
    });
591
634
  // ── extractors command ────────────────────────────────────────────────────
592
635
  program
593
636
  .command('extractors')
@@ -121,6 +121,8 @@ const REGISTRY = [
121
121
  { match: (h) => h === 'yelp.com' || h === 'www.yelp.com', extractor: yelpExtractor },
122
122
  { match: (h) => h === 'zillow.com' || h === 'www.zillow.com', extractor: zillowExtractor },
123
123
  { match: (h) => h === 'redfin.com' || h === 'www.redfin.com', extractor: redfinExtractor },
124
+ // ── Travel ──────────────────────────────────────────────────────────────
125
+ { match: (h, url = '') => (h === 'www.google.com' || h === 'google.com') && url.includes('/travel/flights'), extractor: googleFlightsExtractor },
124
126
  ];
125
127
  /**
126
128
  * Returns the domain extractor for a URL, or null if none matches.
@@ -5860,3 +5862,177 @@ async function redfinExtractor(_html, url) {
5860
5862
  return null;
5861
5863
  }
5862
5864
  }
5865
+ // ---------------------------------------------------------------------------
5866
+ // Google Flights extractor
5867
+ // ---------------------------------------------------------------------------
5868
/**
 * Domain extractor for Google Flights result pages.
 *
 * Google Flights is a SPA, so the input is usually readability-processed
 * markdown where every flight is a run of short lines, e.g.:
 *
 *   - 7:15 PM
 *   7:15 PM on Sat, Apr 4
 *   – 10:29 PM
 *   United
 *   3 hr 14 min
 *   EWR
 *   ...
 *   $188
 *
 * Raw HTML is also accepted: tags are stripped first so the same line-based
 * parser can run on it.
 *
 * @param {string} _html - Page content (markdown or raw HTML).
 * @param {string} url - Page URL; must contain `/travel/flights`.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`
 *   with flights sorted by ascending price, or null when the URL does not
 *   match, the input is not a string, or no complete flight could be parsed.
 */
async function googleFlightsExtractor(_html, url) {
    if (typeof url !== 'string' || !url.includes('/travel/flights'))
        return null;
    if (typeof _html !== 'string')
        return null;
    let text = _html;
    // Raw HTML (any casing of <!DOCTYPE / <html): reduce it to one text fragment per line.
    if (/<!doctype|<html/i.test(text)) {
        text = text
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, '\n')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&#\d+;/g, '')
            // &amp; is decoded last so "&amp;lt;" stays the literal text "&lt;"
            .replace(/&amp;/g, '&')
            .replace(/\n{2,}/g, '\n');
    }
    const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
    const AIRLINES = ['United', 'Delta', 'American', 'JetBlue', 'Spirit', 'Frontier', 'Southwest', 'Breeze', 'Alaska', 'Hawaiian', 'Sun Country', 'Avelo'];
    const DEPART_RE = /^(?:-\s+)?(\d{1,2}:\d{2}\s*[AP]M)$/;
    const DATE_RE = /on\s+(\w+,\s+\w+\s+\d+)/;
    // Arrival is either "<dash> 10:29 PM" (en dash, em dash or hyphen) or "10:29 PM on Sat, Apr 4"
    const ARRIVE_DASH_RE = /^[\u2013\u2014-]\s*(\d{1,2}:\d{2}\s*[AP]M)$/;
    const ARRIVE_DATED_RE = /^(\d{1,2}:\d{2}\s*[AP]M)\s+on\s/;
    const DURATION_RE = /^\d+\s+hr\s+\d+\s+min$/;
    const AIRPORT_RE = /^[A-Z]{3}$/;
    const STOPS_RE = /^\d+\s+stop/;
    const PRICE_RE = /^\$(\d[\d,]*)$/;
    const flights = [];
    for (let i = 0; i < lines.length; i++) {
        // A bare time line (optionally bulleted) starts a new flight record
        const departMatch = lines[i].match(DEPART_RE);
        if (!departMatch)
            continue;
        const departTime = departMatch[1];
        let departDate = '';
        let arriveTime = '';
        let airline = '';
        let duration = '';
        let fromAirport = '';
        let toAirport = '';
        let stops = '';
        let bags = '';
        let price = 0;
        // Scan a bounded window after the departure line; the first price ends the record
        const windowEnd = Math.min(i + 45, lines.length);
        for (let j = i + 1; j < windowEnd; j++) {
            const l = lines[j];
            // Date: the first "... on Sat, Apr 4" line belongs to the departure
            const dateM = l.match(DATE_RE);
            if (dateM && !departDate) {
                departDate = dateM[1];
                continue;
            }
            // Arrival time
            const arrM = l.match(ARRIVE_DASH_RE) || l.match(ARRIVE_DATED_RE);
            if (arrM && !arriveTime) {
                arriveTime = arrM[1];
                continue;
            }
            // Airline
            if (!airline) {
                const found = AIRLINES.find(a => l === a || l.startsWith(a + 'Operated') || l.startsWith(a + ' '));
                if (found) {
                    airline = found;
                    continue;
                }
            }
            // Duration
            if (!duration && DURATION_RE.test(l)) {
                duration = l;
                continue;
            }
            // Airport codes: first code is the origin, the next different one the destination
            if (AIRPORT_RE.test(l)) {
                if (!fromAirport) {
                    fromAirport = l;
                    continue;
                }
                if (!toAirport && l !== fromAirport) {
                    toAirport = l;
                    continue;
                }
            }
            // Stops
            if (!stops && (l === 'Nonstop' || STOPS_RE.test(l))) {
                stops = l;
                continue;
            }
            // Bags
            if (l.includes('carry-on bag') && !bags) {
                bags = l.includes('not included') ? 'Carry-on NOT included (extra fee)' : 'Carry-on included';
                continue;
            }
            // Price — first occurrence only; strip every thousands separator
            const priceM = l.match(PRICE_RE);
            if (priceM && !price) {
                price = Number.parseInt(priceM[1].replace(/,/g, ''), 10);
                break;
            }
        }
        // Only keep records with the fields needed for a meaningful listing
        if (arriveTime && airline && price) {
            flights.push({ departTime, arriveTime, departDate, airline, duration, fromAirport, toAirport, stops: stops || 'Unknown', price, priceStr: `$${price}`, bags });
        }
    }
    // Deduplicate on departure time + airline + price
    const seen = new Set();
    const unique = flights.filter(f => {
        const key = `${f.departTime}-${f.airline}-${f.price}`;
        if (seen.has(key))
            return false;
        seen.add(key);
        return true;
    });
    if (unique.length === 0)
        return null;
    unique.sort((a, b) => a.price - b.price);
    // Helper: get airline booking URL
    function getAirlineBookingUrl(airline, from, to) {
        // The United deep link embeds both airport codes; without them it would be
        // the malformed ".../flights-from--to-", so use the homepage instead.
        const unitedUrl = from && to
            ? `https://www.united.com/en-us/flights-from-${from.toLowerCase()}-to-${to.toLowerCase()}`
            : 'https://www.united.com/';
        const urlMap = {
            'United': unitedUrl,
            'Delta': `https://www.delta.com/flight-search/search`,
            'JetBlue': `https://www.jetblue.com/booking/flights`,
            'American': `https://www.aa.com/booking/find-flights`,
            'Spirit': `https://www.spirit.com/book/flights`,
            'Frontier': `https://www.flyfrontier.com/booking/`,
            'Southwest': `https://www.southwest.com/air/booking/`,
            'Breeze': `https://www.flybreeze.com/home`,
            'Alaska': `https://www.alaskaair.com/booking/flights`,
            'Hawaiian': `https://www.hawaiianairlines.com/book-a-trip`,
            'Sun Country': `https://www.suncountry.com/booking/search`,
            'Avelo': `https://www.aveloair.com/book`,
        };
        return urlMap[airline] || `https://www.google.com/search?q=${encodeURIComponent(`${airline} flights ${from} to ${to}`)}`;
    }
    // Parse route from URL; an unparseable URL just yields the generic title
    let rawQuery = '';
    try {
        rawQuery = new URL(url).searchParams.get('q') || '';
    }
    catch {
        rawQuery = '';
    }
    const query = rawQuery.replace(/Flights?\s+(from\s+)?/i, '').replace(/\s+one\s+way/i, '').trim();
    const md = [
        `# ✈️ Flights — ${query || 'Search Results'}`,
        '',
        `*${unique.length} flights found · Source: [Google Flights](${url})*`,
        `*Prices include taxes + fees for 1 adult. Book directly via airline.*`,
        '',
    ];
    unique.forEach((f, idx) => {
        const bookingUrl = getAirlineBookingUrl(f.airline, f.fromAirport, f.toAirport);
        // Skip fields that were not parsed so the line has no dangling separators
        const route = f.fromAirport && f.toAirport
            ? `${f.fromAirport} → ${f.toAirport}`
            : (f.fromAirport || f.toAirport);
        const details = [route, f.duration, f.stops].filter(Boolean).join(' · ');
        md.push(`## ${idx + 1}. ${f.airline} — ${f.priceStr}`);
        md.push(`🕐 Depart **${f.departTime}** → Arrive **${f.arriveTime}**${f.departDate ? ` · ${f.departDate}` : ''}`);
        md.push(`🛫 ${details}`);
        if (f.bags)
            md.push(`🧳 ${f.bags}`);
        md.push(`🔗 [Book on ${f.airline}](${bookingUrl})`);
        md.push('');
    });
    md.push('---');
    md.push(`📌 *Prices change frequently. [View live prices on Google Flights](${url})*`);
    return {
        domain: 'google.com/travel/flights',
        type: 'flights',
        structured: { flights: unique, route: query, source: 'Google Flights', sourceUrl: url },
        cleanContent: md.join('\n'),
    };
}
@@ -146,6 +146,43 @@ export function normalizeOptions(ctx) {
146
146
  if (autoScrollOpts) {
147
147
  ctx.render = true;
148
148
  }
149
+ // Auto-detect SPAs that require browser rendering (no --render flag needed)
150
if (!ctx.render) {
    // Hosts whose pages are client-rendered; an exact hostname match enables rendering.
    // NOTE(review): 'www.google.com' here turns on rendering for every www.google.com
    // URL, so the path patterns below only add coverage for other google.com hosts —
    // confirm that is intended.
    const SPA_DOMAINS = new Set([
        'www.google.com', // Google Flights, Maps, Shopping etc.
        'flights.google.com',
        'www.airbnb.com',
        'www.booking.com',
        'www.expedia.com',
        'www.kayak.com',
        'www.skyscanner.com',
        'www.tripadvisor.com',
        'www.indeed.com',
        'www.glassdoor.com',
        'www.zillow.com', // already handled but backup
        'app.webpeel.dev', // our own dashboard is a SPA
    ]);
    // More specific: some google.com paths need render, not all.
    // Anchored to the host so they are matched against "<hostname><pathname>" only;
    // a URL that merely mentions google.com/maps in its query string or path
    // (redirects, archive links) no longer triggers rendering.
    const SPA_URL_PATTERNS = [
        /^(?:[a-z0-9-]+\.)*google\.com\/travel/,
        /^(?:[a-z0-9-]+\.)*google\.com\/maps/,
        /^(?:[a-z0-9-]+\.)*google\.com\/shopping/,
    ];
    try {
        const { hostname, pathname } = new URL(ctx.url);
        const hostAndPath = `${hostname}${pathname}`;
        if (SPA_DOMAINS.has(hostname)) {
            ctx.render = true;
            log.debug(`Auto-enabling render: SPA domain detected (${hostname})`);
        }
        else if (SPA_URL_PATTERNS.some(p => p.test(hostAndPath))) {
            ctx.render = true;
            log.debug(`Auto-enabling render: SPA URL pattern matched`);
        }
    }
    catch {
        // Invalid URL — skip SPA detection
    }
}
149
186
  }
150
187
  // ---------------------------------------------------------------------------
151
188
  // Stage 2: handleYouTube
@@ -991,7 +1028,12 @@ export async function postProcess(ctx) {
991
1028
  if (getDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
992
1029
  try {
993
1030
  ctx.timer.mark('domainExtract');
994
- const ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
1031
+ // Try raw HTML first, then fall back to readability-processed content
1032
+ // (some SPAs like Google Flights have data only after readability processing)
1033
+ let ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
1034
+ if (!ddResult && ctx.content) {
1035
+ ddResult = await extractDomainData(ctx.content, fetchResult.url);
1036
+ }
995
1037
  ctx.timer.end('domainExtract');
996
1038
  if (ddResult) {
997
1039
  ctx.domainData = ddResult;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.73",
3
+ "version": "0.21.75",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",