webpeel 0.21.73 → 0.21.74
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.js +154 -0
- package/dist/core/pipeline.js +6 -1
- package/package.json +1 -1
|
@@ -121,6 +121,8 @@ const REGISTRY = [
|
|
|
121
121
|
{ match: (h) => h === 'yelp.com' || h === 'www.yelp.com', extractor: yelpExtractor },
|
|
122
122
|
{ match: (h) => h === 'zillow.com' || h === 'www.zillow.com', extractor: zillowExtractor },
|
|
123
123
|
{ match: (h) => h === 'redfin.com' || h === 'www.redfin.com', extractor: redfinExtractor },
|
|
124
|
+
// ── Travel ──────────────────────────────────────────────────────────────
|
|
125
|
+
{ match: (h, url = '') => (h === 'www.google.com' || h === 'google.com') && url.includes('/travel/flights'), extractor: googleFlightsExtractor },
|
|
124
126
|
];
|
|
125
127
|
/**
|
|
126
128
|
* Returns the domain extractor for a URL, or null if none matches.
|
|
@@ -5860,3 +5862,155 @@ async function redfinExtractor(_html, url) {
|
|
|
5860
5862
|
return null;
|
|
5861
5863
|
}
|
|
5862
5864
|
}
|
|
5865
|
+
// ---------------------------------------------------------------------------
|
|
5866
|
+
// Google Flights extractor
|
|
5867
|
+
// ---------------------------------------------------------------------------
|
|
5868
|
+
/**
 * Extracts flight options from a Google Flights results page.
 *
 * The input is scanned line by line. Every line that consists solely of a
 * clock time (optionally prefixed by a markdown list dash) starts a candidate
 * flight; the following lines (bounded look-ahead) are inspected for the
 * departure date, arrival time, airline, duration, airport codes, stops,
 * carry-on information and finally the price, which closes the candidate.
 *
 * @param {string} _html - Page content. Either plain text / markdown, or raw
 *   HTML (detected by a doctype or `<html` tag), in which case tags are
 *   stripped and basic entities decoded before parsing.
 * @param {string} url - Page URL. Must contain `/travel/flights`; its `q`
 *   query parameter (if any) is used as the human-readable route.
 * @returns {Promise<null | {domain: string, type: string, structured: object, cleanContent: string}>}
 *   `null` when the URL is not a flights URL, the content is not a non-empty
 *   string, or no complete flight (departure, arrival, airline, price) was
 *   found; otherwise the de-duplicated flights sorted by ascending price plus
 *   a markdown summary.
 */
async function googleFlightsExtractor(_html, url) {
    if (typeof url !== 'string' || !url.includes('/travel/flights'))
        return null;
    // Guard against missing content instead of throwing on `.includes`.
    if (typeof _html !== 'string' || _html === '')
        return null;

    const AIRLINES = ['United', 'Delta', 'American', 'JetBlue', 'Spirit', 'Frontier', 'Southwest', 'Breeze', 'Alaska', 'Hawaiian', 'Sun Country', 'Avelo'];
    const LOOKAHEAD_LINES = 45;
    const DEPART_RE = /^(?:-\s+)?(\d{1,2}:\d{2}\s*[AP]M)$/;
    const DATE_RE = /on\s+(\w+,\s+\w+\s+\d+)/;
    const ARRIVE_DASH_RE = /^[-–—]\s*(\d{1,2}:\d{2}\s*[AP]M)$/;
    const ARRIVE_DATED_RE = /^(\d{1,2}:\d{2}\s*[AP]M)\s+on\s/;
    // Whole-hour durations ("2 hr") are accepted as well as "3 hr 14 min".
    const DURATION_RE = /^\d+\s+hr(?:\s+\d+\s+min)?$/;
    const AIRPORT_RE = /^[A-Z]{3}$/;
    const STOPS_RE = /^\d+\s+stop/;
    const PRICE_RE = /^\$(\d[\d,]*)$/;

    // Converts a numeric character reference to its character; invalid or
    // out-of-range code points are dropped rather than throwing.
    const codePointToChar = (cp) => (Number.isInteger(cp) && cp > 0 && cp <= 0x10ffff ? String.fromCodePoint(cp) : '');

    let text = _html;
    // Raw HTML: drop scripts/styles, turn every tag into a line break and
    // decode the entities that matter for parsing. The doctype check is
    // case-insensitive because `<!doctype html>` is the common spelling.
    if (/<!doctype|<html/i.test(text)) {
        text = text
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, '\n')
            .replace(/&#(\d+);/g, (_m, dec) => codePointToChar(Number.parseInt(dec, 10)))
            .replace(/&#x([0-9a-f]+);/gi, (_m, hex) => codePointToChar(Number.parseInt(hex, 16)))
            .replace(/&nbsp;/g, ' ')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            // `&amp;` is decoded last so "&amp;lt;" stays the literal text "&lt;".
            .replace(/&amp;/g, '&');
    }
    const lines = text.split('\n').map((l) => l.trim()).filter(Boolean);

    const flights = [];
    for (let i = 0; i < lines.length; i++) {
        const departMatch = DEPART_RE.exec(lines[i]);
        if (!departMatch)
            continue;
        const departTime = departMatch[1];
        let departDate = '';
        let arriveTime = '';
        let airline = '';
        let duration = '';
        let fromAirport = '';
        let toAirport = '';
        let stops = '';
        let bags = '';
        let price = 0;
        const end = Math.min(i + LOOKAHEAD_LINES, lines.length);
        for (let j = i + 1; j < end; j++) {
            const l = lines[j];
            // Departure date: first "... on Sat, Apr 4" line.
            if (!departDate) {
                const dateM = DATE_RE.exec(l);
                if (dateM) {
                    departDate = dateM[1];
                    continue;
                }
            }
            // Arrival time: "– 10:29 PM" or "10:29 PM on Sat, Apr 4".
            if (!arriveTime) {
                const arrM = ARRIVE_DASH_RE.exec(l) || ARRIVE_DATED_RE.exec(l);
                if (arrM) {
                    arriveTime = arrM[1];
                    continue;
                }
            }
            // Airline: exact name, or name followed by a space / "Operated by".
            if (!airline) {
                const found = AIRLINES.find((a) => l === a || l.startsWith(`${a}Operated`) || l.startsWith(`${a} `));
                if (found) {
                    airline = found;
                    continue;
                }
            }
            if (!duration && DURATION_RE.test(l)) {
                duration = l;
                continue;
            }
            // Airport codes: first code is the origin, next different one the destination.
            if (AIRPORT_RE.test(l)) {
                if (!fromAirport) {
                    fromAirport = l;
                    continue;
                }
                if (!toAirport && l !== fromAirport) {
                    toAirport = l;
                    continue;
                }
            }
            if (!stops && (l === 'Nonstop' || STOPS_RE.test(l))) {
                stops = l;
                continue;
            }
            if (!bags && l.includes('carry-on bag')) {
                bags = l.includes('not included') ? 'Carry-on NOT included (extra fee)' : 'Carry-on included';
                continue;
            }
            // Price closes the candidate; strip ALL thousands separators.
            const priceM = PRICE_RE.exec(l);
            if (priceM) {
                price = Number.parseInt(priceM[1].replace(/,/g, ''), 10);
                break;
            }
        }
        if (arriveTime && airline && price) {
            flights.push({ departTime, arriveTime, departDate, airline, duration, fromAirport, toAirport, stops: stops || 'Unknown', price, priceStr: `$${price}`, bags });
        }
    }

    // Deduplicate on departure time + airline + price.
    const seen = new Set();
    const unique = flights.filter((f) => {
        const key = `${f.departTime}-${f.airline}-${f.price}`;
        if (seen.has(key))
            return false;
        seen.add(key);
        return true;
    });
    if (unique.length === 0)
        return null;
    unique.sort((a, b) => a.price - b.price);

    // Route label from the `q` parameter. The label is cosmetic, so an
    // unparseable URL falls back to an empty route instead of discarding
    // the flights that were already extracted.
    let query = '';
    try {
        const rawQuery = new URL(url).searchParams.get('q') || '';
        query = rawQuery.replace(/Flights?\s+(from\s+)?/i, '').replace(/\s+one\s+way/i, '').trim();
    }
    catch (_err) {
        query = '';
    }

    const md = [
        `# ✈️ Flights — ${query || 'Search Results'}`,
        '',
        `*${unique.length} flights found · Source: [Google Flights](${url})*`,
        `*Prices include taxes + fees for 1 adult. Book directly via airline.*`,
        '',
    ];
    unique.forEach((f, idx) => {
        md.push(`## ${idx + 1}. ${f.airline} — ${f.priceStr}`);
        md.push(`🕐 Depart **${f.departTime}** → Arrive **${f.arriveTime}**${f.departDate ? ` · ${f.departDate}` : ''}`);
        md.push(`🛫 ${f.fromAirport} → ${f.toAirport} · ${f.duration} · ${f.stops}`);
        if (f.bags)
            md.push(`🧳 ${f.bags}`);
        md.push('');
    });
    md.push('---');
    md.push(`📌 *Prices change frequently. [View live prices on Google Flights](${url})*`);
    return {
        domain: 'google.com/travel/flights',
        type: 'flights',
        structured: { flights: unique, route: query, source: 'Google Flights', sourceUrl: url },
        cleanContent: md.join('\n'),
    };
}
|
package/dist/core/pipeline.js
CHANGED
|
@@ -991,7 +991,12 @@ export async function postProcess(ctx) {
|
|
|
991
991
|
if (getDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
|
|
992
992
|
try {
|
|
993
993
|
ctx.timer.mark('domainExtract');
|
|
994
|
-
|
|
994
|
+
// Try raw HTML first, then fall back to readability-processed content
|
|
995
|
+
// (some SPAs like Google Flights have data only after readability processing)
|
|
996
|
+
let ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
|
|
997
|
+
if (!ddResult && ctx.content) {
|
|
998
|
+
ddResult = await extractDomainData(ctx.content, fetchResult.url);
|
|
999
|
+
}
|
|
995
1000
|
ctx.timer.end('domainExtract');
|
|
996
1001
|
if (ddResult) {
|
|
997
1002
|
ctx.domainData = ddResult;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.73",
|
|
3
|
+
"version": "0.21.74",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|