webpeel 0.21.52 → 0.21.53
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory LRU fetch cache for WebPeel
|
|
3
|
+
*
|
|
4
|
+
* Caches pipeline results to avoid redundant fetches for identical requests.
|
|
5
|
+
* Supports TTL-based expiry and LRU eviction when maxEntries is exceeded.
|
|
6
|
+
* Exported as a singleton: import { fetchCache } from './fetch-cache.js'
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Shape of a single value stored in a {@link FetchCache}.
 */
export interface FetchCacheEntry {
    /** Cached payload as text. */
    content: string;
    /** Title associated with the payload. */
    title: string;
    /** Free-form metadata; its shape is not constrained by this type. */
    metadata: any;
    /** Label describing how the payload was produced (for example `'search'`). */
    method: string;
    /** Token count for the payload — presumably an estimate; confirm against the producer. */
    tokens: number;
    /** Optional list of links; the element shape is not constrained by this type. */
    links?: Array<any>;
    /** Creation time in epoch milliseconds (`Date.now()`); the cache measures entry age from it. */
    timestamp: number;
}
|
|
17
|
+
/**
 * Snapshot of cache counters as returned by `FetchCache.stats()`.
 */
export interface FetchCacheStats {
    /** Number of entries currently held (expired entries stay counted until they are looked up). */
    size: number;
    /** Lookups that returned a live entry. */
    hits: number;
    /** Lookups that found nothing, or found an expired entry. */
    misses: number;
    /** `hits / (hits + misses)`, rounded to two decimals; `0` when there have been no lookups. */
    hitRate: number;
}
|
|
23
|
+
/**
 * In-memory cache with TTL-based expiry and least-recently-used eviction.
 */
export declare class FetchCache {
    /** Backing store for the cached entries. */
    private cache;
    /** Upper bound on the number of entries kept before eviction. */
    private maxEntries;
    /** Lifetime applied to every entry. */
    private defaultTTL;
    /** Running count of successful lookups. */
    private hits;
    /** Running count of failed lookups. */
    private misses;
    /**
     * @param maxEntries - Capacity of the cache (optional).
     * @param defaultTTLSeconds - Entry lifetime in seconds (optional).
     */
    constructor(maxEntries?: number, defaultTTLSeconds?: number);
    /**
     * Build a deterministic key from a URL and the fetch options that affect the result.
     * Requests that differ in any of these options map to different keys.
     *
     * @param url - Request URL (or any other base identifier).
     * @param options - Optional `render`, `stealth` and `budget` settings.
     * @returns The cache key.
     */
    getKey(url: string, options?: { render?: boolean; stealth?: boolean; budget?: number; }): string;
    /**
     * Look up an entry.
     * A successful lookup also marks the entry as most recently used.
     *
     * @param key - Key produced by {@link FetchCache.getKey}.
     * @returns The entry, or `null` when it is absent or has expired.
     */
    get(key: string): FetchCacheEntry | null;
    /**
     * Insert or replace an entry.
     * When the cache is over capacity the least recently used entry is dropped.
     *
     * @param key - Key produced by {@link FetchCache.getKey}.
     * @param entry - Value to store.
     */
    set(key: string, entry: FetchCacheEntry): void;
    /** Remove every entry and zero the hit/miss counters. */
    clear(): void;
    /**
     * Report current size and hit/miss counters.
     *
     * @returns A stats snapshot whose `hitRate` lies in [0, 1].
     */
    stats(): FetchCacheStats;
}
|
|
54
|
+
/** Process-wide cache for fetch results, shared by every request (500 entries, 5-minute TTL). */
export declare const fetchCache: FetchCache;
/** Process-wide cache for search results; uses a shorter 60-second TTL because results go stale faster. */
export declare const searchCache: FetchCache;
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory LRU fetch cache for WebPeel
|
|
3
|
+
*
|
|
4
|
+
* Caches pipeline results to avoid redundant fetches for identical requests.
|
|
5
|
+
* Supports TTL-based expiry and LRU eviction when maxEntries is exceeded.
|
|
6
|
+
* Exported as a singleton: import { fetchCache } from './fetch-cache.js'
|
|
7
|
+
*/
|
|
8
|
+
export class FetchCache {
|
|
9
|
+
cache;
|
|
10
|
+
maxEntries;
|
|
11
|
+
defaultTTL; // ms
|
|
12
|
+
hits;
|
|
13
|
+
misses;
|
|
14
|
+
constructor(maxEntries = 500, defaultTTLSeconds = 300) {
|
|
15
|
+
this.cache = new Map();
|
|
16
|
+
this.maxEntries = maxEntries;
|
|
17
|
+
this.defaultTTL = defaultTTLSeconds * 1000;
|
|
18
|
+
this.hits = 0;
|
|
19
|
+
this.misses = 0;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Generate a stable cache key from url + relevant fetch options.
|
|
23
|
+
* Different option combinations produce different cache entries.
|
|
24
|
+
*/
|
|
25
|
+
getKey(url, options = {}) {
|
|
26
|
+
const render = options.render ? '1' : '0';
|
|
27
|
+
const stealth = options.stealth ? '1' : '0';
|
|
28
|
+
const budget = options.budget !== undefined ? String(options.budget) : '';
|
|
29
|
+
return `${url}|r:${render}|s:${stealth}|b:${budget}`;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Retrieve a cached entry. Returns null if missing or expired.
|
|
33
|
+
* On hit: entry is moved to the end of the Map (LRU refresh).
|
|
34
|
+
*/
|
|
35
|
+
get(key) {
|
|
36
|
+
const entry = this.cache.get(key);
|
|
37
|
+
if (!entry) {
|
|
38
|
+
this.misses++;
|
|
39
|
+
return null;
|
|
40
|
+
}
|
|
41
|
+
const ageMs = Date.now() - entry.timestamp;
|
|
42
|
+
if (ageMs > this.defaultTTL) {
|
|
43
|
+
// Expired — evict and return null
|
|
44
|
+
this.cache.delete(key);
|
|
45
|
+
this.misses++;
|
|
46
|
+
return null;
|
|
47
|
+
}
|
|
48
|
+
// LRU touch: move to end
|
|
49
|
+
this.cache.delete(key);
|
|
50
|
+
this.cache.set(key, entry);
|
|
51
|
+
this.hits++;
|
|
52
|
+
return entry;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Store an entry in the cache.
|
|
56
|
+
* If the cache is at capacity, the least recently used entry is evicted.
|
|
57
|
+
*/
|
|
58
|
+
set(key, entry) {
|
|
59
|
+
// Remove existing to refresh position
|
|
60
|
+
if (this.cache.has(key)) {
|
|
61
|
+
this.cache.delete(key);
|
|
62
|
+
}
|
|
63
|
+
this.cache.set(key, entry);
|
|
64
|
+
// LRU eviction: remove oldest entry (first in Map iteration order)
|
|
65
|
+
while (this.cache.size > this.maxEntries) {
|
|
66
|
+
const oldestKey = this.cache.keys().next().value;
|
|
67
|
+
if (oldestKey !== undefined) {
|
|
68
|
+
this.cache.delete(oldestKey);
|
|
69
|
+
}
|
|
70
|
+
else {
|
|
71
|
+
break;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
/** Clear all entries and reset stats. */
|
|
76
|
+
clear() {
|
|
77
|
+
this.cache.clear();
|
|
78
|
+
this.hits = 0;
|
|
79
|
+
this.misses = 0;
|
|
80
|
+
}
|
|
81
|
+
/** Return cache stats. hitRate is in [0, 1]. */
|
|
82
|
+
stats() {
|
|
83
|
+
const total = this.hits + this.misses;
|
|
84
|
+
return {
|
|
85
|
+
size: this.cache.size,
|
|
86
|
+
hits: this.hits,
|
|
87
|
+
misses: this.misses,
|
|
88
|
+
hitRate: total === 0 ? 0 : Math.round((this.hits / total) * 100) / 100,
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
/** Shared cache for fetch results, used by every request: up to 500 entries, each valid for 300 s (5 min). */
const fetchCache = new FetchCache(500, 300);
/** Shared cache for search results: up to 500 entries, valid for only 60 s because results change faster. */
const searchCache = new FetchCache(500, 60);
export { fetchCache, searchCache };
|
package/dist/core/pipeline.js
CHANGED
|
@@ -1217,6 +1217,52 @@ export async function finalize(ctx) {
|
|
|
1217
1217
|
log.error('Change tracking failed:', error);
|
|
1218
1218
|
}
|
|
1219
1219
|
}
|
|
1220
|
+
// ── Auto-escalation: retry thin content with browser rendering ──────────────
|
|
1221
|
+
// If simple fetch returned very little content and user didn't explicitly disable render,
|
|
1222
|
+
// automatically retry with browser rendering to handle JS-heavy/paywalled sites.
|
|
1223
|
+
const preEscalationWords = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1224
|
+
const escalationFetchMethod = fetchResult?.method || 'unknown';
|
|
1225
|
+
const alreadyTriedBrowser = escalationFetchMethod === 'browser' || escalationFetchMethod === 'stealth'
|
|
1226
|
+
|| options.render || options.stealth;
|
|
1227
|
+
const userDisabledRender = options.render === false;
|
|
1228
|
+
const escalationCandidate = preEscalationWords < 200 && preEscalationWords > 0
|
|
1229
|
+
&& escalationFetchMethod === 'simple' && !alreadyTriedBrowser && !userDisabledRender
|
|
1230
|
+
&& !ctx._escalated;
|
|
1231
|
+
if (escalationCandidate) {
|
|
1232
|
+
log.info(`thin content (${preEscalationWords}w) from simple fetch, auto-escalating to browser render for ${ctx.url}`);
|
|
1233
|
+
ctx._escalated = true;
|
|
1234
|
+
try {
|
|
1235
|
+
const { smartFetch } = await import('./strategies.js');
|
|
1236
|
+
const browserResult = await smartFetch(ctx.url, {
|
|
1237
|
+
forceBrowser: true,
|
|
1238
|
+
stealth: false,
|
|
1239
|
+
timeoutMs: options.timeout || 15000,
|
|
1240
|
+
proxy: options.proxy,
|
|
1241
|
+
});
|
|
1242
|
+
if (browserResult.html && browserResult.html.length > (fetchResult?.html?.length || 0)) {
|
|
1243
|
+
const { htmlToMarkdown } = await import('./markdown.js');
|
|
1244
|
+
const browserContent = htmlToMarkdown(browserResult.html);
|
|
1245
|
+
const browserWords = browserContent.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1246
|
+
if (browserWords > preEscalationWords) {
|
|
1247
|
+
log.info(`browser escalation improved content: ${preEscalationWords}w → ${browserWords}w`);
|
|
1248
|
+
ctx.content = browserContent;
|
|
1249
|
+
ctx.fetchResult = browserResult;
|
|
1250
|
+
ctx.fetchResult.method = 'browser-escalation';
|
|
1251
|
+
}
|
|
1252
|
+
else {
|
|
1253
|
+
log.debug(`browser escalation did not improve (${browserWords}w vs ${preEscalationWords}w)`);
|
|
1254
|
+
}
|
|
1255
|
+
// Always clean up browser resources
|
|
1256
|
+
if (browserResult.page)
|
|
1257
|
+
await browserResult.page.close().catch(() => { });
|
|
1258
|
+
if (browserResult.browser)
|
|
1259
|
+
await browserResult.browser.close().catch(() => { });
|
|
1260
|
+
}
|
|
1261
|
+
}
|
|
1262
|
+
catch (e) {
|
|
1263
|
+
log.debug('browser escalation failed:', e instanceof Error ? e.message : e);
|
|
1264
|
+
}
|
|
1265
|
+
}
|
|
1220
1266
|
// Generate AI summary if requested
|
|
1221
1267
|
if (options.summary && options.llm) {
|
|
1222
1268
|
try {
|
|
@@ -7,6 +7,7 @@ import { Router } from 'express';
|
|
|
7
7
|
import { readFileSync } from 'fs';
|
|
8
8
|
import { join, dirname } from 'path';
|
|
9
9
|
import { fileURLToPath } from 'url';
|
|
10
|
+
import { fetchCache, searchCache } from '../../core/fetch-cache.js';
|
|
10
11
|
const startTime = Date.now();
|
|
11
12
|
// Read version once at startup
|
|
12
13
|
let version = 'unknown';
|
|
@@ -26,11 +27,23 @@ export function createHealthRouter() {
|
|
|
26
27
|
const router = Router();
|
|
27
28
|
router.get('/health', (_req, res) => {
|
|
28
29
|
const uptime = Math.floor((Date.now() - startTime) / 1000);
|
|
30
|
+
const fetchStats = fetchCache.stats();
|
|
31
|
+
const searchStats = searchCache.stats();
|
|
29
32
|
res.json({
|
|
30
33
|
status: 'healthy',
|
|
31
34
|
version,
|
|
32
35
|
uptime,
|
|
33
36
|
timestamp: new Date().toISOString(),
|
|
37
|
+
cache: {
|
|
38
|
+
fetch: {
|
|
39
|
+
size: fetchStats.size,
|
|
40
|
+
hitRate: fetchStats.hitRate,
|
|
41
|
+
},
|
|
42
|
+
search: {
|
|
43
|
+
size: searchStats.size,
|
|
44
|
+
hitRate: searchStats.hitRate,
|
|
45
|
+
},
|
|
46
|
+
},
|
|
34
47
|
});
|
|
35
48
|
});
|
|
36
49
|
return router;
|
|
@@ -7,6 +7,7 @@ import { load } from 'cheerio';
|
|
|
7
7
|
import { LRUCache } from 'lru-cache';
|
|
8
8
|
import { peel } from '../../index.js';
|
|
9
9
|
import { simpleFetch } from '../../core/fetcher.js';
|
|
10
|
+
import { searchCache } from '../../core/fetch-cache.js';
|
|
10
11
|
import { getSearchProvider, getBestSearchProvider, } from '../../core/search-provider.js';
|
|
11
12
|
export function createSearchRouter(authStore) {
|
|
12
13
|
const router = Router();
|
|
@@ -47,9 +48,9 @@ export function createSearchRouter(authStore) {
|
|
|
47
48
|
return;
|
|
48
49
|
}
|
|
49
50
|
// Parse and validate count
|
|
50
|
-
const resultCount = count ? parseInt(count, 10) :
|
|
51
|
-
if (isNaN(resultCount) || resultCount < 1 || resultCount >
|
|
52
|
-
res.status(400).json({ success: false, error: { type: 'invalid_request', message: 'Invalid "count" parameter: must be between 1 and
|
|
51
|
+
const resultCount = count ? parseInt(count, 10) : 10;
|
|
52
|
+
if (isNaN(resultCount) || resultCount < 1 || resultCount > 20) {
|
|
53
|
+
res.status(400).json({ success: false, error: { type: 'invalid_request', message: 'Invalid "count" parameter: must be between 1 and 20', hint: 'Use a count value between 1 and 20', docs: 'https://webpeel.dev/docs/errors#invalid_request' }, requestId: req.requestId });
|
|
53
54
|
return;
|
|
54
55
|
}
|
|
55
56
|
// Parse sources parameter (comma-separated: web,news,images)
|
|
@@ -64,10 +65,12 @@ export function createSearchRouter(authStore) {
|
|
|
64
65
|
// Build cache key (include all parameters)
|
|
65
66
|
const enrichCount = enrich ? Math.min(Math.max(parseInt(enrich, 10) || 0, 0), 5) : 0;
|
|
66
67
|
const cacheKey = `search:${providerId}:${q}:${resultCount}:${sourcesStr}:${shouldScrape}:${enrichCount}:${categoriesStr}:${tbsStr}:${countryStr}:${locationStr}`;
|
|
67
|
-
|
|
68
|
+
const sharedCacheKey = searchCache.getKey(cacheKey, {});
|
|
69
|
+
// Check cache (local LRU first, then shared singleton)
|
|
68
70
|
const cached = cache.get(cacheKey);
|
|
69
71
|
if (cached) {
|
|
70
72
|
res.setHeader('X-Cache', 'HIT');
|
|
73
|
+
res.setHeader('X-Cache-Status', 'HIT');
|
|
71
74
|
res.setHeader('X-Cache-Age', Math.floor((Date.now() - cached.timestamp) / 1000).toString());
|
|
72
75
|
res.json({
|
|
73
76
|
success: true,
|
|
@@ -75,6 +78,19 @@ export function createSearchRouter(authStore) {
|
|
|
75
78
|
});
|
|
76
79
|
return;
|
|
77
80
|
}
|
|
81
|
+
// Also check shared searchCache singleton (used for /health stats)
|
|
82
|
+
const sharedCached = searchCache.get(sharedCacheKey);
|
|
83
|
+
if (sharedCached) {
|
|
84
|
+
const age = Math.floor((Date.now() - sharedCached.timestamp) / 1000);
|
|
85
|
+
res.setHeader('X-Cache', 'HIT');
|
|
86
|
+
res.setHeader('X-Cache-Status', 'HIT');
|
|
87
|
+
res.setHeader('X-Cache-Age', age.toString());
|
|
88
|
+
res.json({
|
|
89
|
+
success: true,
|
|
90
|
+
data: sharedCached.content ? JSON.parse(sharedCached.content) : {},
|
|
91
|
+
});
|
|
92
|
+
return;
|
|
93
|
+
}
|
|
78
94
|
const startTime = Date.now();
|
|
79
95
|
const data = {};
|
|
80
96
|
// Fetch web results via the search-provider abstraction
|
|
@@ -317,13 +333,22 @@ export function createSearchRouter(authStore) {
|
|
|
317
333
|
await pgStore.trackUsage(req.auth.keyInfo.key, 'search');
|
|
318
334
|
}
|
|
319
335
|
}
|
|
320
|
-
// Cache results
|
|
336
|
+
// Cache results (local LRU + shared singleton for /health stats)
|
|
321
337
|
cache.set(cacheKey, {
|
|
322
338
|
data,
|
|
323
339
|
timestamp: Date.now(),
|
|
324
340
|
});
|
|
341
|
+
searchCache.set(sharedCacheKey, {
|
|
342
|
+
content: JSON.stringify(data),
|
|
343
|
+
title: q,
|
|
344
|
+
metadata: {},
|
|
345
|
+
method: 'search',
|
|
346
|
+
tokens: 0,
|
|
347
|
+
timestamp: Date.now(),
|
|
348
|
+
});
|
|
325
349
|
// Add headers
|
|
326
350
|
res.setHeader('X-Cache', 'MISS');
|
|
351
|
+
res.setHeader('X-Cache-Status', 'MISS');
|
|
327
352
|
res.setHeader('X-Credits-Used', '1');
|
|
328
353
|
res.setHeader('X-Processing-Time', elapsed.toString());
|
|
329
354
|
res.setHeader('X-Fetch-Type', 'search');
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.53",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|