webpeel 0.21.83 → 0.21.85
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/pipeline.d.ts +1 -1
- package/dist/core/pipeline.js +7 -38
- package/dist/core/strategy-hooks.d.ts +1 -1
- package/dist/index.d.ts +1 -2
- package/dist/index.js +3 -1
- package/dist/server/app.js +7 -5
- package/dist/server/routes/smart-search.d.ts +35 -0
- package/dist/server/routes/smart-search.js +358 -0
- package/dist/types.d.ts +1 -1
- package/package.json +1 -1
package/dist/core/pipeline.d.ts
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* mutable PipelineContext. The stages are called in order by peel().
|
|
6
6
|
*/
|
|
7
7
|
import { type AutoScrollOptions } from './actions.js';
|
|
8
|
-
import { type DomainExtractResult } from '
|
|
8
|
+
import { type DomainExtractResult } from '../ee/domain-extractors.js';
|
|
9
9
|
import { type ReadabilityResult } from './readability.js';
|
|
10
10
|
import { type QuickAnswerResult } from './quick-answer.js';
|
|
11
11
|
import { Timer } from './timing.js';
|
package/dist/core/pipeline.js
CHANGED
|
@@ -14,34 +14,8 @@ import { autoScroll as runAutoScroll } from './actions.js';
|
|
|
14
14
|
import { extractStructured } from './extract.js';
|
|
15
15
|
import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
|
|
16
16
|
import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
|
|
17
|
-
import {
|
|
17
|
+
import { extractDomainData, getDomainExtractor } from '../ee/domain-extractors.js';
|
|
18
18
|
import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
|
|
19
|
-
// ---------------------------------------------------------------------------
|
|
20
|
-
// Domain extraction — lazy-load full extractors from compiled JS
|
|
21
|
-
// ---------------------------------------------------------------------------
|
|
22
|
-
// The compiled domain-extractors.js (312KB) ships in the npm package.
|
|
23
|
-
// TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
|
|
24
|
-
// If compiled JS is missing (bare repo clone without proprietary files),
|
|
25
|
-
// falls back to basic stub (no domain extraction, just standard markdown).
|
|
26
|
-
// Server premium hooks can override for additional caching/intelligence.
|
|
27
|
-
let _extractorsLoaded = false;
|
|
28
|
-
let _extractDomainData = null;
|
|
29
|
-
let _getDomainExtractor = null;
|
|
30
|
-
async function loadExtractors() {
|
|
31
|
-
if (_extractorsLoaded)
|
|
32
|
-
return;
|
|
33
|
-
_extractorsLoaded = true;
|
|
34
|
-
try {
|
|
35
|
-
const mod = await import('./domain-extractors.js');
|
|
36
|
-
_extractDomainData = mod.extractDomainData;
|
|
37
|
-
_getDomainExtractor = mod.getDomainExtractor;
|
|
38
|
-
}
|
|
39
|
-
catch {
|
|
40
|
-
// Compiled JS not available (bare repo clone) — basic stub will be used
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
// Start loading immediately (non-blocking)
|
|
44
|
-
loadExtractors();
|
|
45
19
|
import { extractReadableContent } from './readability.js';
|
|
46
20
|
import { quickAnswer as runQuickAnswer } from './quick-answer.js';
|
|
47
21
|
import { Timer } from './timing.js';
|
|
@@ -56,28 +30,23 @@ const log = createLogger('pipeline');
|
|
|
56
30
|
// ---------------------------------------------------------------------------
|
|
57
31
|
/**
|
|
58
32
|
* Check if a URL has a domain extractor.
|
|
59
|
-
* Priority: premium hook →
|
|
33
|
+
* Priority: premium hook → ee/domain-extractors.
|
|
60
34
|
*/
|
|
61
35
|
function hasDomainExtractor(url) {
|
|
62
36
|
const hookFn = getDomainExtractorHook();
|
|
63
37
|
if (hookFn)
|
|
64
38
|
return hookFn(url) !== null;
|
|
65
|
-
|
|
66
|
-
return _getDomainExtractor(url) !== null;
|
|
67
|
-
return getDomainExtractorBasic(url) !== null;
|
|
39
|
+
return getDomainExtractor(url) !== null;
|
|
68
40
|
}
|
|
69
41
|
/**
|
|
70
42
|
* Run domain extraction on HTML/URL.
|
|
71
|
-
* Priority: premium hook →
|
|
43
|
+
* Priority: premium hook → ee/domain-extractors.
|
|
72
44
|
*/
|
|
73
45
|
async function runDomainExtract(html, url) {
|
|
74
46
|
const hookFn = getDomainExtractHook();
|
|
75
47
|
if (hookFn)
|
|
76
48
|
return hookFn(html, url);
|
|
77
|
-
|
|
78
|
-
if (_extractDomainData)
|
|
79
|
-
return _extractDomainData(html, url);
|
|
80
|
-
return extractDomainDataBasic(html, url);
|
|
49
|
+
return extractDomainData(html, url);
|
|
81
50
|
}
|
|
82
51
|
/** Create the initial PipelineContext with defaults */
|
|
83
52
|
export function createContext(url, options) {
|
|
@@ -609,7 +578,7 @@ export async function fetchContent(ctx) {
|
|
|
609
578
|
const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
|
|
610
579
|
if (canSolve) {
|
|
611
580
|
try {
|
|
612
|
-
const { solveChallenge } = await import('
|
|
581
|
+
const { solveChallenge } = await import('../ee/challenge-solver.js');
|
|
613
582
|
const { detectChallenge } = await import('./challenge-detection.js');
|
|
614
583
|
const rawHtml = fetchResult.html || '';
|
|
615
584
|
const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
|
|
@@ -1179,7 +1148,7 @@ export async function postProcess(ctx) {
|
|
|
1179
1148
|
const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
|
|
1180
1149
|
if (canSolve && ctx.fetchResult?.html) {
|
|
1181
1150
|
try {
|
|
1182
|
-
const { solveChallenge } = await import('
|
|
1151
|
+
const { solveChallenge } = await import('../ee/challenge-solver.js');
|
|
1183
1152
|
const { detectChallenge } = await import('./challenge-detection.js');
|
|
1184
1153
|
const rawHtml = ctx.fetchResult.html;
|
|
1185
1154
|
const detectionResult = detectChallenge(rawHtml, ctx.fetchResult.statusCode);
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* All hook methods are optional — unset hooks are simply skipped.
|
|
11
11
|
*/
|
|
12
12
|
import type { FetchResult } from './fetcher.js';
|
|
13
|
-
import type { DomainExtractResult } from '
|
|
13
|
+
import type { DomainExtractResult } from '../ee/domain-extractors.js';
|
|
14
14
|
export interface StrategyResult extends FetchResult {
|
|
15
15
|
method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
|
|
16
16
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -6,8 +6,7 @@
|
|
|
6
6
|
import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
|
|
7
7
|
import type { PeelOptions, PeelResult } from './types.js';
|
|
8
8
|
export * from './types.js';
|
|
9
|
-
export type
|
|
10
|
-
export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
|
|
9
|
+
export { getDomainExtractor, extractDomainData, type DomainExtractResult, type DomainExtractor } from './ee/domain-extractors.js';
|
|
11
10
|
export { crawl, type CrawlOptions, type CrawlResult, type CrawlProgress } from './core/crawler.js';
|
|
12
11
|
export { discoverSitemap, type SitemapUrl, type SitemapResult } from './core/sitemap.js';
|
|
13
12
|
export { mapDomain, type MapOptions, type MapResult } from './core/map.js';
|
package/dist/index.js
CHANGED
|
@@ -7,7 +7,9 @@ import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from '
|
|
|
7
7
|
import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
|
|
8
8
|
import { checkUrlSafety } from './core/safe-browsing.js';
|
|
9
9
|
export * from './types.js';
|
|
10
|
-
|
|
10
|
+
// Domain extractors — compiled JS ships in npm, TypeScript source is .gitignore'd.
|
|
11
|
+
// Runtime functions are re-exported directly from the ee module; the matching types are exported from index.d.ts.
|
|
12
|
+
export { getDomainExtractor, extractDomainData } from './ee/domain-extractors.js';
|
|
11
13
|
export { crawl } from './core/crawler.js';
|
|
12
14
|
export { discoverSitemap } from './core/sitemap.js';
|
|
13
15
|
export { mapDomain } from './core/map.js';
|
package/dist/server/app.js
CHANGED
|
@@ -18,6 +18,7 @@ import { createRateLimitMiddleware, RateLimiter } from './middleware/rate-limit.
|
|
|
18
18
|
import { createHealthRouter } from './routes/health.js';
|
|
19
19
|
import { createFetchRouter } from './routes/fetch.js';
|
|
20
20
|
import { createSearchRouter } from './routes/search.js';
|
|
21
|
+
import { createSmartSearchRouter } from './routes/smart-search.js';
|
|
21
22
|
import { createUserRouter } from './routes/users.js';
|
|
22
23
|
import { createStripeRouter, createBillingPortalRouter } from './routes/stripe.js';
|
|
23
24
|
import { createOAuthRouter } from './routes/oauth.js';
|
|
@@ -55,19 +56,18 @@ import { requireScope } from './middleware/scope-guard.js';
|
|
|
55
56
|
import { createCacheWarmRouter, startCacheWarmer } from './routes/cache-warm.js';
|
|
56
57
|
import { warmup, cleanup as cleanupFetcher } from '../core/fetcher.js';
|
|
57
58
|
// Proprietary modules — loaded dynamically so the build works without TypeScript source.
|
|
58
|
-
// Compiled JS ships in npm/Docker. TypeScript source is .gitignore'd (not on GitHub).
|
|
59
59
|
let setExtractorRedis;
|
|
60
60
|
let registerPremiumHooks;
|
|
61
61
|
try {
|
|
62
|
-
const de = await import('../
|
|
62
|
+
const de = await import('../ee/domain-extractors.js');
|
|
63
63
|
setExtractorRedis = de.setExtractorRedis;
|
|
64
64
|
}
|
|
65
|
-
catch { /*
|
|
65
|
+
catch { /* ee module not available */ }
|
|
66
66
|
try {
|
|
67
|
-
const ph = await import('
|
|
67
|
+
const ph = await import('../ee/premium-hooks.js');
|
|
68
68
|
registerPremiumHooks = ph.registerPremiumHooks;
|
|
69
69
|
}
|
|
70
|
-
catch { /*
|
|
70
|
+
catch { /* ee module not available */ }
|
|
71
71
|
import { readFileSync } from 'fs';
|
|
72
72
|
import { join, dirname } from 'path';
|
|
73
73
|
import { fileURLToPath } from 'url';
|
|
@@ -315,6 +315,8 @@ export function createApp(config = {}) {
|
|
|
315
315
|
app.use('/v1/screenshot', requireScope('full', 'read'));
|
|
316
316
|
app.use(createScreenshotRouter(authStore));
|
|
317
317
|
app.use(createSearchRouter(authStore));
|
|
318
|
+
// /v1/search/smart — intent detection + travel/commerce routing (POST)
|
|
319
|
+
app.use(createSmartSearchRouter(authStore));
|
|
318
320
|
// /v1/research — lightweight research (search → fetch → compile), BYOK LLM optional
|
|
319
321
|
app.use('/v1/research', requireScope('full', 'read'));
|
|
320
322
|
app.use(createResearchRouter());
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart Search endpoint — intent detection + travel/commerce routing
|
|
3
|
+
* POST /v1/search/smart
|
|
4
|
+
*
|
|
5
|
+
* Detects user intent from natural language and routes to the best source:
|
|
6
|
+
* - cars → Cars.com with browser rendering + structured extraction
|
|
7
|
+
* - flights → Google Flights with browser rendering + flight extractor
|
|
8
|
+
* - hotels → Google Hotels with browser rendering
|
|
9
|
+
* - rental → Kayak with browser rendering + rental extractor
|
|
10
|
+
* - restaurants → Yelp Fusion API extractor
|
|
11
|
+
* - general → SearXNG with smart enrichment (peel() for top 2)
|
|
12
|
+
*/
|
|
13
|
+
import { Router } from 'express';
|
|
14
|
+
import '../types.js';
|
|
15
|
+
import { AuthStore } from '../auth-store.js';
|
|
16
|
+
/**
 * Result of classifying a natural-language query.
 */
export interface SearchIntent {
    /** Detected intent category; `general` is the fallback when nothing else matches. */
    type: 'cars' | 'flights' | 'hotels' | 'rental' | 'restaurants' | 'general';
    /** The lower-cased query text that the intent handlers build their URLs from. */
    query: string;
    /** Extracted parameters. Populated with `maxPrice` and `zip` for `cars`; empty otherwise. */
    params: Record<string, string>;
    /**
     * Optional location hint. The router copies the request body's `location`
     * here for `restaurants` intents.
     */
    location?: string;
}
|
|
21
|
+
/**
 * Payload returned under `data` by POST /v1/search/smart.
 */
export interface SmartSearchResult {
    /** Intent category that produced this result (same union as `SearchIntent.type`). */
    type: SearchIntent['type'];
    /** Human-readable source name, e.g. 'Cars.com', 'Google Flights', 'Web Search'. */
    source: string;
    /** URL that was fetched (or, for general search, a search URL for the query). */
    sourceUrl: string;
    /** Main textual content: the fetched page content, or a formatted list of web results. */
    content: string;
    /** Page title, when the handler fetched a single page. */
    title?: string;
    /** Domain extractor output of the fetched page, when present. */
    domainData?: any;
    /** Shortcut to `domainData.structured`, when present. */
    structured?: any;
    /** Ranked web results; only set by the general-search handler. */
    results?: any[];
    /** Token count reported for `content`. */
    tokens: number;
    /** Wall-clock time spent in the handler, in milliseconds. */
    fetchTimeMs: number;
    /** Progress text for frontends; filled in by the router from the intent type. */
    loadingMessage?: string;
}
|
|
34
|
+
/**
 * Classify a natural-language query into a search intent using keyword patterns.
 *
 * @param query Raw user query.
 * @returns The detected intent; `type` is `general` when no pattern matches.
 */
export declare function detectSearchIntent(query: string): SearchIntent;
|
|
35
|
+
/**
 * Build the Express router that serves POST /v1/search/smart.
 *
 * @param authStore Store used for usage tracking after a successful search.
 * @returns A router with the smart-search route registered.
 */
export declare function createSmartSearchRouter(authStore: AuthStore): Router;
|
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart Search endpoint — intent detection + travel/commerce routing
|
|
3
|
+
* POST /v1/search/smart
|
|
4
|
+
*
|
|
5
|
+
* Detects user intent from natural language and routes to the best source:
|
|
6
|
+
* - cars → Cars.com with browser rendering + structured extraction
|
|
7
|
+
* - flights → Google Flights with browser rendering + flight extractor
|
|
8
|
+
* - hotels → Google Hotels with browser rendering
|
|
9
|
+
* - rental → Kayak with browser rendering + rental extractor
|
|
10
|
+
* - restaurants → Yelp Fusion API extractor
|
|
11
|
+
* - general → SearXNG with smart enrichment (peel() for top 2)
|
|
12
|
+
*/
|
|
13
|
+
import { Router } from 'express';
|
|
14
|
+
import '../types.js'; // Augments Express.Request with requestId, auth
|
|
15
|
+
import { peel } from '../../index.js';
|
|
16
|
+
import { getBestSearchProvider, } from '../../core/search-provider.js';
|
|
17
|
+
import { getSourceCredibility } from '../../core/source-credibility.js';
|
|
18
|
+
// ─── Intent Detection ──────────────────────────────────────────────────────
|
|
19
|
+
/**
 * Classify a natural-language query into one of the supported search intents.
 *
 * Checks run in a fixed order (cars, flights, hotels, rental, restaurants) and
 * the first match wins; anything unmatched is reported as `general`.
 *
 * @param {string} query - Raw user query. Non-string input is coerced to a string.
 * @returns {{type: string, query: string, params: Record<string, string>}}
 *   The intent type, the lower-cased query, and extracted parameters
 *   (`maxPrice` and `zip` for `cars`; an empty object for every other type).
 */
export function detectSearchIntent(query) {
    const q = String(query ?? '').toLowerCase();
    // A rental query ("cheap car rental in miami") also contains a vehicle word and
    // often a price word, so it must be recognised before the car-purchase check.
    // An explicit purchase signal ("buy", "for sale") still means a purchase.
    const isRental = (/\b(rent|rental|rentals)\b.*\b(cars?|vehicles?|suvs?)\b/.test(q) ||
        /\bcars?\s+rentals?\b/.test(q)) &&
        !/\b(buy|for sale)\b/.test(q);
    // Cars: vehicle name/type + buying signals
    if (!isRental &&
        /\b(car|cars|vehicle|sedan|suv|truck|honda|toyota|tesla|bmw|ford|chevy|chevrolet|nissan|hyundai|kia|mazda|subaru|lexus|audi|mercedes|volkswagen|jeep|dodge|ram|buick|cadillac|gmc|chrysler|acura|infiniti|volvo|porsche|mini|fiat|mitsubishi)\b/.test(q) &&
        /\b(buy|cheap|under|budget|price|used|new|for sale|listing|deal)\b/.test(q)) {
        // Optional trailing "k" means thousands: "under 20k" -> 20000.
        const priceMatch = q.match(/(?:under|\$|budget|max)\s*\$?(\d[\d,]*)(k\b)?/);
        let priceValue = '';
        if (priceMatch) {
            const digits = priceMatch[1].replace(/,/g, '');
            priceValue = priceMatch[2] ? `${digits}000` : digits;
        }
        // Find all 5-digit numbers, pick the one that isn't the price
        const allZips = [...q.matchAll(/\b(\d{5})\b/g)].map((m) => m[1]);
        const finalZip = allZips.find((z) => z !== priceValue) || '10001';
        return {
            type: 'cars',
            query: q,
            params: {
                maxPrice: priceValue,
                zip: finalZip,
            },
        };
    }
    // Flights: explicit flight words, or a from/to pair together with a date
    // (month abbreviation, full month name, or a numeric M/D).
    if (/\b(flight|flights|fly|flying|airline|plane)\b/.test(q) ||
        (/\b(from|to)\b.*\b(to|from)\b/.test(q) &&
            /\b(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?|\d{1,2}\/\d{1,2})\b/.test(q))) {
        return { type: 'flights', query: q, params: {} };
    }
    // Hotels: lodging word + location/quality signal
    if (/\b(hotel|hotels|motel|stay|accommodation|lodging|inn|resort|airbnb|hostel)\b/.test(q) &&
        /\b(in|near|at|around|cheap|best|book)\b/.test(q)) {
        return { type: 'hotels', query: q, params: {} };
    }
    // Car rental: "rent a car", "car rental", "rental cars"
    if (isRental) {
        return { type: 'rental', query: q, params: {} };
    }
    // Restaurants: food/dining + location/quality signal
    if (/\b(restaurant|restaurants|food|eat|dinner|lunch|pizza|sushi|burger|cafe|bar|bistro|brunch|breakfast)\b/.test(q) &&
        /\b(in|near|best|top|good|cheap)\b/.test(q)) {
        return { type: 'restaurants', query: q, params: {} };
    }
    return { type: 'general', query: q, params: {} };
}
|
|
60
|
+
// ─── Intent Handlers ───────────────────────────────────────────────────────
|
|
61
|
+
/**
 * Run a car-purchase search by fetching a rendered Cars.com results page.
 *
 * @param {{query: string, params: {zip?: string, maxPrice?: string}}} intent
 *   Detected `cars` intent.
 * @returns {Promise<object>} Result with the page content, title, domain data,
 *   token count and elapsed time.
 * @throws {Error} When the fetch fails; the original error is attached as `cause`.
 */
async function handleCarSearch(intent) {
    const t0 = Date.now();
    const zip = intent.params.zip || '10001';
    // The price and ZIP are sent as dedicated query parameters below, so drop them
    // from the free-text keyword instead of leaving "$15,000 90210" in it.
    let text = intent.query.replace(/(?:under|\$|budget|max)\s*\$?\d[\d,]*(?:k\b)?/gi, ' ');
    if (/^\d{5}$/.test(zip)) {
        text = text.replace(new RegExp(`\\b${zip}\\b`, 'g'), ' ');
    }
    // Strip the common car/buy/deal words to surface the actual vehicle name.
    const keyword = text
        .replace(/\b(buy|cheap|under|budget|price|used|new|for sale|listing|deal|car|cars)\b/gi, '')
        .replace(/\s+/g, ' ')
        .trim();
    const params = new URLSearchParams({
        keyword,
        sort: 'list_price',
        stock_type: 'all',
        zip,
        maximum_distance: '50',
    });
    if (intent.params.maxPrice) {
        params.set('list_price_max', intent.params.maxPrice);
    }
    const url = `https://www.cars.com/shopping/results/?${params.toString()}`;
    try {
        const result = await peel(url, { render: true, timeout: 25000 });
        return {
            type: 'cars',
            source: 'Cars.com',
            sourceUrl: url,
            content: result.content,
            title: result.title,
            domainData: result.domainData,
            structured: result.domainData?.structured,
            tokens: result.tokens,
            fetchTimeMs: Date.now() - t0,
        };
    }
    catch (err) {
        // Keep the original error (and its stack) reachable for logging.
        throw new Error(`Cars.com search failed: ${err.message}`, { cause: err });
    }
}
|
|
96
|
+
/**
 * Run a flight search by fetching a rendered Google Flights page for the query.
 *
 * @param {{query: string}} intent - Detected `flights` intent.
 * @returns {Promise<object>} Result with the page content, title, domain data,
 *   token count and elapsed time.
 * @throws {Error} When the fetch fails; the original error is attached as `cause`.
 */
async function handleFlightSearch(intent) {
    const t0 = Date.now();
    const gfUrl = `https://www.google.com/travel/flights?q=Flights+${encodeURIComponent(intent.query)}+one+way`;
    try {
        const result = await peel(gfUrl, { render: true, timeout: 30000 });
        return {
            type: 'flights',
            source: 'Google Flights',
            sourceUrl: gfUrl,
            content: result.content,
            title: result.title,
            domainData: result.domainData,
            structured: result.domainData?.structured,
            tokens: result.tokens,
            fetchTimeMs: Date.now() - t0,
        };
    }
    catch (err) {
        // Keep the original error (and its stack) reachable for logging.
        throw new Error(`Google Flights search failed: ${err.message}`, { cause: err });
    }
}
|
|
117
|
+
/**
 * Run a hotel search by fetching a rendered Google Hotels page for the query.
 *
 * @param {{query: string}} intent - Detected `hotels` intent.
 * @returns {Promise<object>} Result with the page content, title, domain data,
 *   token count and elapsed time.
 * @throws {Error} When the fetch fails; the original error is attached as `cause`.
 */
async function handleHotelSearch(intent) {
    const t0 = Date.now();
    const ghUrl = `https://www.google.com/travel/hotels?q=${encodeURIComponent(intent.query)}`;
    try {
        const result = await peel(ghUrl, { render: true, timeout: 30000 });
        return {
            type: 'hotels',
            source: 'Google Hotels',
            sourceUrl: ghUrl,
            content: result.content,
            title: result.title,
            domainData: result.domainData,
            structured: result.domainData?.structured,
            tokens: result.tokens,
            fetchTimeMs: Date.now() - t0,
        };
    }
    catch (err) {
        // Keep the original error (and its stack) reachable for logging.
        throw new Error(`Google Hotels search failed: ${err.message}`, { cause: err });
    }
}
|
|
138
|
+
/**
 * Run a car-rental search by fetching a rendered Kayak results page.
 *
 * The Kayak URL has the shape /cars/<location>/<pickup>/<dropoff>/. The query
 * carries no dates, so a three-day rental starting one week from now is used.
 *
 * @param {{query: string}} intent - Detected `rental` intent.
 * @returns {Promise<object>} Result with the page content, title, domain data,
 *   token count and elapsed time.
 * @throws {Error} When the fetch fails; the original error is attached as `cause`.
 */
async function handleRentalSearch(intent) {
    const t0 = Date.now();
    const DAY_MS = 24 * 60 * 60 * 1000;
    const toIsoDay = (ms) => new Date(ms).toISOString().slice(0, 10);
    // Fixed calendar dates go stale; derive the window from the current time.
    const pickupDate = toIsoDay(t0 + 7 * DAY_MS);
    const dropoffDate = toIsoDay(t0 + 10 * DAY_MS);
    // Remove the rental vocabulary so that mostly the location remains, then tidy
    // the leftover whitespace and a leading preposition ("in miami" -> "miami").
    const location = intent.query
        .replace(/\b(rent|rental|car|a|vehicle|suv)\b/gi, '')
        .replace(/\s+/g, ' ')
        .trim()
        .replace(/^(in|at|near|from)\s+/i, '');
    const encodedQuery = encodeURIComponent(location || intent.query);
    const kayakUrl = `https://www.kayak.com/cars/${encodedQuery}/${pickupDate}/${dropoffDate}/`;
    try {
        const result = await peel(kayakUrl, { render: true, timeout: 30000 });
        return {
            type: 'rental',
            source: 'Kayak',
            sourceUrl: kayakUrl,
            content: result.content,
            title: result.title,
            domainData: result.domainData,
            structured: result.domainData?.structured,
            tokens: result.tokens,
            fetchTimeMs: Date.now() - t0,
        };
    }
    catch (err) {
        // Keep the original error (and its stack) reachable for logging.
        throw new Error(`Kayak car rental search failed: ${err.message}`, { cause: err });
    }
}
|
|
162
|
+
/**
 * Run a restaurant search by fetching a rendered Yelp search page.
 *
 * @param {{query: string, location?: string}} intent - Detected `restaurants`
 *   intent. `location` (set by the router from the request body) selects the
 *   Yelp search area; 'New York, NY' is used when it is absent.
 * @returns {Promise<object>} Result with the page content, title, domain data,
 *   token count and elapsed time.
 * @throws {Error} When the fetch fails; the original error is attached as `cause`.
 */
async function handleRestaurantSearch(intent) {
    const t0 = Date.now();
    // Honour the caller-supplied location instead of always searching New York.
    const hasLocation = typeof intent.location === 'string' && intent.location.trim() !== '';
    const findLoc = hasLocation ? intent.location.trim() : 'New York, NY';
    const findDesc = intent.query.replace(/\b(best|top|good|cheap|near me)\b/gi, '').trim();
    const yelpUrl = `https://www.yelp.com/search?find_desc=${encodeURIComponent(findDesc)}&find_loc=${encodeURIComponent(findLoc)}`;
    try {
        const result = await peel(yelpUrl, { render: true, timeout: 25000 });
        return {
            type: 'restaurants',
            source: 'Yelp',
            sourceUrl: yelpUrl,
            content: result.content,
            title: result.title,
            domainData: result.domainData,
            structured: result.domainData?.structured,
            tokens: result.tokens,
            fetchTimeMs: Date.now() - t0,
        };
    }
    catch (err) {
        // Keep the original error (and its stack) reachable for logging.
        throw new Error(`Yelp search failed: ${err.message}`, { cause: err });
    }
}
|
|
183
|
+
/**
 * Run a general web search, order the results by source credibility, and
 * enrich the top two with fetched page content.
 *
 * @param {string} query - Search query as typed by the user.
 * @returns {Promise<object>} Result whose `results` are the ranked entries and
 *   whose `content` is a formatted list of them.
 */
async function handleGeneralSearch(query) {
    const t0 = Date.now();
    const { provider: searchProvider } = getBestSearchProvider();
    const rawResults = await searchProvider.searchWeb(query, { count: 10 });
    const getDomain = (url) => {
        try {
            return new URL(url).hostname.replace(/^www\./, '');
        }
        catch {
            return '';
        }
    };
    const tierOrder = { official: 0, established: 1, community: 2, new: 3, suspicious: 4 };
    // Unknown or missing tiers sort alongside 'new'.
    const tierRank = (r) => tierOrder[r.credibility?.tier || 'new'] ?? 3;
    // Tolerate a provider that returns nothing instead of crashing on .map().
    const results = (Array.isArray(rawResults) ? rawResults : [])
        .map((r) => ({
            title: r.title,
            url: r.url,
            snippet: r.snippet,
            domain: getDomain(r.url),
            credibility: getSourceCredibility(r.url),
        }))
        .sort((a, b) => tierRank(a) - tierRank(b))
        .map((r, i) => ({ ...r, rank: i + 1 }));
    // Enrich top 2 results with peel() for richer content. Each entry is updated
    // in place, so results sharing a URL cannot receive each other's content.
    await Promise.all(results.slice(0, 2).map(async (r) => {
        try {
            const peeled = await peel(r.url, { render: true, timeout: 15000, maxTokens: 2000 });
            const excerpt = peeled.content?.substring(0, 1500);
            if (excerpt) {
                r.content = excerpt;
                r.fetchTimeMs = peeled.elapsed;
            }
        }
        catch {
            // Enrichment is best-effort: the result keeps its search snippet.
        }
    }));
    const content = results
        .map((r, i) => `${i + 1}. **${r.title}**\n ${r.url}\n ${r.snippet}`)
        .join('\n\n');
    return {
        type: 'general',
        source: 'Web Search',
        sourceUrl: `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`,
        content,
        results,
        // ''.split(/\s+/) yields [''], which would report one token for no content.
        tokens: content ? content.split(/\s+/).length : 0,
        fetchTimeMs: Date.now() - t0,
    };
}
|
|
246
|
+
// ─── Loading message by intent type ────────────────────────────────────────
|
|
247
|
+
/**
 * Return the progress message a frontend can show for an intent type.
 *
 * @param {string} type - Intent type (`cars`, `flights`, `hotels`, `rental`,
 *   `restaurants`, `general`).
 * @returns {string} The matching message, or 'Searching…' for any other value.
 */
function getLoadingMessage(type) {
    const msgs = {
        cars: 'Searching cars on Cars.com…',
        flights: 'Finding flights on Google Flights…',
        hotels: 'Looking up hotels on Google Hotels…',
        rental: 'Searching rental cars on Kayak…',
        restaurants: 'Finding restaurants on Yelp…',
        general: 'Searching the web…',
    };
    // Own-property check: a plain lookup would return inherited members such as
    // `constructor` or `toString` (functions) instead of the fallback string.
    return Object.prototype.hasOwnProperty.call(msgs, type) ? msgs[type] : 'Searching…';
}
|
|
258
|
+
// ─── Router ────────────────────────────────────────────────────────────────
|
|
259
|
+
/**
 * Build the Express router that serves POST /v1/search/smart.
 *
 * Request body: `{ q: string, location?: string, zip?: string | number }`.
 * The handler authenticates the caller, detects the intent of `q`, dispatches
 * to the matching intent handler, tracks usage, and responds with
 * `{ success: true, data }`. Failures respond with `{ success: false, error }`
 * and status 401 (no credentials), 400 (bad `q`) or 500 (search failure).
 *
 * @param {object} authStore - Auth store; `trackUsage` / `trackBurstUsage` are
 *   called when the store provides them.
 * @returns {import('express').Router} Router with the smart-search route registered.
 */
export function createSmartSearchRouter(authStore) {
    const router = Router();
    router.post('/v1/search/smart', async (req, res) => {
        try {
            // Require authentication
            const authId = req.auth?.keyInfo?.accountId || req.user?.userId;
            if (!authId) {
                res.status(401).json({
                    success: false,
                    error: {
                        type: 'authentication_required',
                        message: 'API key required. Get one free at https://app.webpeel.dev',
                        docs: 'https://webpeel.dev/docs/api-reference#authentication',
                    },
                    requestId: req.requestId,
                });
                return;
            }
            // A request without a parsed body must yield the 400 below, not a
            // destructuring TypeError that surfaces as a 500.
            const { q, location, zip } = req.body ?? {};
            if (!q || typeof q !== 'string' || !q.trim()) {
                res.status(400).json({
                    success: false,
                    error: {
                        type: 'invalid_request',
                        message: 'Missing or invalid "q" field in request body',
                        hint: 'POST /v1/search/smart with JSON body: { "q": "your search query" }',
                        docs: 'https://webpeel.dev/docs/api-reference#smart-search',
                    },
                    requestId: req.requestId,
                });
                return;
            }
            const query = q.trim();
            const intent = detectSearchIntent(query);
            // Override zip from request body if provided. Only a 5-digit ZIP
            // (optionally ZIP+4) is accepted; anything else is ignored so that
            // arbitrary body values never reach the upstream URL.
            const zipText = typeof zip === 'string' || typeof zip === 'number' ? String(zip).trim() : '';
            const zipMatch = /^(\d{5})(?:-\d{4})?$/.exec(zipText);
            if (zipMatch && intent.params) {
                intent.params.zip = zipMatch[1];
            }
            // Pass the caller-supplied location on to the restaurant handler.
            if (typeof location === 'string' && location.trim() && intent.type === 'restaurants') {
                intent.location = location.trim();
            }
            let smartResult;
            switch (intent.type) {
                case 'cars':
                    smartResult = await handleCarSearch(intent);
                    break;
                case 'flights':
                    smartResult = await handleFlightSearch(intent);
                    break;
                case 'hotels':
                    smartResult = await handleHotelSearch(intent);
                    break;
                case 'rental':
                    smartResult = await handleRentalSearch(intent);
                    break;
                case 'restaurants':
                    smartResult = await handleRestaurantSearch(intent);
                    break;
                default:
                    smartResult = await handleGeneralSearch(query);
            }
            // Add loading message hint for frontend UX
            smartResult.loadingMessage = getLoadingMessage(intent.type);
            // Track usage
            const pgStore = authStore;
            if (req.auth?.keyInfo?.key && typeof pgStore.trackUsage === 'function') {
                if (typeof pgStore.trackBurstUsage === 'function') {
                    await pgStore.trackBurstUsage(req.auth.keyInfo.key);
                }
                if (!req.auth?.softLimited) {
                    await pgStore.trackUsage(req.auth.keyInfo.key, 'smart-search');
                }
            }
            res.setHeader('X-Intent-Type', intent.type);
            res.setHeader('X-Source', smartResult.source);
            res.setHeader('X-Processing-Time', smartResult.fetchTimeMs.toString());
            res.setHeader('Cache-Control', 'no-store');
            res.json({
                success: true,
                data: smartResult,
            });
        }
        catch (error) {
            const err = error;
            console.error('Smart search error:', err.message, err.stack);
            res.status(500).json({
                success: false,
                error: {
                    type: 'smart_search_failed',
                    message: err.message || 'Smart search failed. Please try again.',
                    docs: 'https://webpeel.dev/docs/api-reference#smart-search',
                },
                requestId: req.requestId,
            });
        }
    });
    return router;
}
|
package/dist/types.d.ts
CHANGED
|
@@ -309,7 +309,7 @@ export interface PeelResult {
|
|
|
309
309
|
*/
|
|
310
310
|
readability?: import('./core/readability.js').ReadabilityResult;
|
|
311
311
|
/** Domain-aware structured data (Twitter, Reddit, GitHub, HN). Present when URL matches a known domain. */
|
|
312
|
-
domainData?: import('./
|
|
312
|
+
domainData?: import('./ee/domain-extractors.js').DomainExtractResult;
|
|
313
313
|
/** Quick answer result (when question option is set). BM25-powered, no LLM needed. */
|
|
314
314
|
quickAnswer?: import('./core/quick-answer.js').QuickAnswerResult;
|
|
315
315
|
/** Per-stage timing breakdown in milliseconds. */
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.85",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|