webpeel 0.21.73 → 0.21.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -121,6 +121,8 @@ const REGISTRY = [
121
121
  { match: (h) => h === 'yelp.com' || h === 'www.yelp.com', extractor: yelpExtractor },
122
122
  { match: (h) => h === 'zillow.com' || h === 'www.zillow.com', extractor: zillowExtractor },
123
123
  { match: (h) => h === 'redfin.com' || h === 'www.redfin.com', extractor: redfinExtractor },
124
+ // ── Travel ──────────────────────────────────────────────────────────────
125
+ { match: (h, url = '') => (h === 'www.google.com' || h === 'google.com') && url.includes('/travel/flights'), extractor: googleFlightsExtractor },
124
126
  ];
125
127
  /**
126
128
  * Returns the domain extractor for a URL, or null if none matches.
@@ -5860,3 +5862,155 @@ async function redfinExtractor(_html, url) {
5860
5862
  return null;
5861
5863
  }
5862
5864
  }
5865
+ // ---------------------------------------------------------------------------
5866
+ // Google Flights extractor
5867
+ // ---------------------------------------------------------------------------
5868
/**
 * Domain extractor for Google Flights search-result pages.
 *
 * The input is scanned line by line. A line consisting only of a clock time
 * (optionally prefixed by a markdown list dash) starts a flight record; the
 * following lines (up to a fixed look-ahead window, or until the first price
 * line) are inspected for date, arrival time, airline, duration, airport
 * codes, stops, carry-on info and price. A record is kept only when departure
 * time, arrival time, airline and a non-zero price were all found.
 *
 * @param {string} _html - Page content: either text/markdown with one field
 *   per line, or raw HTML (detected by a doctype / <html marker), in which
 *   case tags are stripped first.
 * @param {string} url - URL of the page; must contain `/travel/flights`.
 * @returns {Promise<null | {domain: string, type: string, structured: object, cleanContent: string}>}
 *   `null` when the URL is not a flights URL, the content is not a non-empty
 *   string, or no complete flight record could be parsed.
 */
async function googleFlightsExtractor(_html, url) {
    if (typeof url !== 'string' || !url.includes('/travel/flights'))
        return null;
    // Guard against missing content instead of throwing on `.includes`.
    if (typeof _html !== 'string' || _html === '')
        return null;

    const DEPART_RE = /^(?:-\s+)?(\d{1,2}:\d{2}\s*[AP]M)$/;
    const DATE_RE = /on\s+(\w+,\s+\w+\s+\d+)/;
    const ARRIVE_DASH_RE = /^[–\-–—]\s*(\d{1,2}:\d{2}\s*[AP]M)$/;
    const ARRIVE_DATED_RE = /^(\d{1,2}:\d{2}\s*[AP]M)\s+on\s/;
    // Accepts "3 hr 14 min", whole hours ("2 hr") and sub-hour ("45 min").
    const DURATION_RE = /^(?:\d+\s+hr(?:\s+\d+\s+min)?|\d+\s+min)$/;
    const AIRPORT_RE = /^[A-Z]{3}$/;
    const STOPS_RE = /^\d+\s+stop/;
    const PRICE_RE = /^\$(\d[\d,]*)$/;
    const AIRLINES = ['United', 'Delta', 'American', 'JetBlue', 'Spirit', 'Frontier', 'Southwest', 'Breeze', 'Alaska', 'Hawaiian', 'Sun Country', 'Avelo'];
    // Maximum number of lines scanned after a departure time for one record.
    const LOOKAHEAD = 45;

    let text = _html;
    // Raw HTML: drop scripts/styles, turn every tag into a line break.
    // Detection is case-insensitive so `<!doctype html>` is recognised too.
    if (/<!doctype|<html/i.test(text)) {
        text = text
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, '\n')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&#\d+;/g, '')
            // `&amp;` is decoded last so that e.g. `&amp;lt;` is not double-decoded.
            .replace(/&amp;/g, '&')
            .replace(/\n{2,}/g, '\n');
    }
    const lines = text.split('\n').map((raw) => raw.trim()).filter(Boolean);

    const flights = [];
    for (let i = 0; i < lines.length; i++) {
        const departMatch = lines[i].match(DEPART_RE);
        if (!departMatch)
            continue;
        const departTime = departMatch[1];
        let departDate = '';
        let arriveTime = '';
        let airline = '';
        let duration = '';
        let fromAirport = '';
        let toAirport = '';
        let stops = '';
        let bags = '';
        let price = 0;
        const end = Math.min(i + LOOKAHEAD, lines.length);
        for (let j = i + 1; j < end; j++) {
            const l = lines[j];
            // Date: the first "<time> on <Day, Mon D>" line carries the departure date.
            const dateM = l.match(DATE_RE);
            if (dateM && !departDate) {
                departDate = dateM[1];
                continue;
            }
            // Arrival time: either "– 10:29 PM" or a later "<time> on <date>" line.
            const arrM = l.match(ARRIVE_DASH_RE) || l.match(ARRIVE_DATED_RE);
            if (arrM && !arriveTime) {
                arriveTime = arrM[1];
                continue;
            }
            // Airline: exact name, or name followed by a space / "Operated ...".
            if (!airline) {
                const found = AIRLINES.find((a) => l === a || l.startsWith(a + 'Operated') || l.startsWith(a + ' '));
                if (found) {
                    airline = found;
                    continue;
                }
            }
            // Duration
            if (!duration && DURATION_RE.test(l)) {
                duration = l;
                continue;
            }
            // Airport codes: first three-letter code is the origin, the next
            // different one is the destination.
            if (AIRPORT_RE.test(l)) {
                if (!fromAirport) {
                    fromAirport = l;
                    continue;
                }
                if (!toAirport && l !== fromAirport) {
                    toAirport = l;
                    continue;
                }
            }
            // Stops
            if (!stops && (l === 'Nonstop' || STOPS_RE.test(l))) {
                stops = l;
                continue;
            }
            // Bags
            if (l.includes('carry-on bag') && !bags) {
                bags = l.includes('not included') ? 'Carry-on NOT included (extra fee)' : 'Carry-on included';
                continue;
            }
            // Price — the first price line terminates the record.
            const priceM = l.match(PRICE_RE);
            if (priceM && !price) {
                // Strip every thousands separator (not just the first) and parse base 10.
                price = Number.parseInt(priceM[1].replace(/,/g, ''), 10);
                break;
            }
        }
        if (arriveTime && airline && price) {
            flights.push({ departTime, arriveTime, departDate, airline, duration, fromAirport, toAirport, stops: stops || 'Unknown', price, priceStr: `$${price}`, bags });
        }
    }

    // Deduplicate on departure time + airline + price.
    const seen = new Set();
    const unique = flights.filter((f) => {
        const key = `${f.departTime}-${f.airline}-${f.price}`;
        if (seen.has(key))
            return false;
        seen.add(key);
        return true;
    });
    if (unique.length === 0)
        return null;
    unique.sort((a, b) => a.price - b.price);

    // Parse route from the URL's `q` parameter. The route is cosmetic, so an
    // unparseable URL falls back to an empty route instead of discarding the
    // flights that were already extracted.
    let query = '';
    try {
        const q = new URL(url).searchParams.get('q') || '';
        query = q.replace(/Flights?\s+(from\s+)?/i, '').replace(/\s+one\s+way/i, '').trim();
    }
    catch {
        query = '';
    }

    const md = [
        `# ✈️ Flights — ${query || 'Search Results'}`,
        '',
        `*${unique.length} flights found · Source: [Google Flights](${url})*`,
        `*Prices include taxes + fees for 1 adult. Book directly via airline.*`,
        '',
    ];
    unique.forEach((f, idx) => {
        md.push(`## ${idx + 1}. ${f.airline} — ${f.priceStr}`);
        md.push(`🕐 Depart **${f.departTime}** → Arrive **${f.arriveTime}**${f.departDate ? ` · ${f.departDate}` : ''}`);
        md.push(`🛫 ${f.fromAirport} → ${f.toAirport} · ${f.duration} · ${f.stops}`);
        if (f.bags)
            md.push(`🧳 ${f.bags}`);
        md.push('');
    });
    md.push('---');
    md.push(`📌 *Prices change frequently. [View live prices on Google Flights](${url})*`);
    return {
        domain: 'google.com/travel/flights',
        type: 'flights',
        structured: { flights: unique, route: query, source: 'Google Flights', sourceUrl: url },
        cleanContent: md.join('\n'),
    };
}
@@ -991,7 +991,12 @@ export async function postProcess(ctx) {
991
991
  if (getDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
992
992
  try {
993
993
  ctx.timer.mark('domainExtract');
994
- const ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
994
+ // Try raw HTML first, then fall back to readability-processed content
995
+ // (some SPAs like Google Flights have data only after readability processing)
996
+ let ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
997
+ if (!ddResult && ctx.content) {
998
+ ddResult = await extractDomainData(ctx.content, fetchResult.url);
999
+ }
995
1000
  ctx.timer.end('domainExtract');
996
1001
  if (ddResult) {
997
1002
  ctx.domainData = ddResult;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.73",
3
+ "version": "0.21.74",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",