webpeel 0.21.52 → 0.21.56
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
 * In-memory LRU fetch cache for WebPeel
 *
 * Caches pipeline results to avoid redundant fetches for identical requests.
 * Supports TTL-based expiry and LRU eviction when maxEntries is exceeded.
 * Exported as a singleton: import { fetchCache } from './fetch-cache.js'
 */
/** A single cached pipeline result plus the bookkeeping needed for TTL expiry. */
export interface FetchCacheEntry {
    /** Extracted/serialized content (the search route stores JSON.stringify'd results here). */
    content: string;
    /** Page title (the search route stores the query string here). */
    title: string;
    /** Pipeline metadata — shape not specified at this layer. NOTE(review): typed `any`; consider tightening to a concrete interface. */
    metadata: any;
    /** Fetch method that produced the entry (e.g. 'search'). */
    method: string;
    /** Token count of the content (0 when not applicable). */
    tokens: number;
    /** Extracted links, when the pipeline collected them. NOTE(review): element shape not specified here. */
    links?: any[];
    /** Creation time in ms since epoch; used by the cache for TTL expiry. */
    timestamp: number;
}
/** Snapshot of cache counters as reported by FetchCache.stats(). */
export interface FetchCacheStats {
    /** Current number of live entries. */
    size: number;
    /** Lookups that returned a fresh (non-expired) entry. */
    hits: number;
    /** Lookups that found nothing, or found an expired entry. */
    misses: number;
    /** hits / (hits + misses), rounded to 2 decimals; 0 when no lookups yet. */
    hitRate: number;
}
export declare class FetchCache {
    private cache;
    private maxEntries;
    private defaultTTL;
    private hits;
    private misses;
    constructor(maxEntries?: number, defaultTTLSeconds?: number);
    /**
     * Generate a stable cache key from url + relevant fetch options.
     * Different option combinations produce different cache entries.
     */
    getKey(url: string, options?: {
        render?: boolean;
        stealth?: boolean;
        budget?: number;
    }): string;
    /**
     * Retrieve a cached entry. Returns null if missing or expired.
     * On hit: entry is moved to the end of the Map (LRU refresh).
     */
    get(key: string): FetchCacheEntry | null;
    /**
     * Store an entry in the cache.
     * If the cache is at capacity, the least recently used entry is evicted.
     */
    set(key: string, entry: FetchCacheEntry): void;
    /** Clear all entries and reset stats. */
    clear(): void;
    /** Return cache stats. hitRate is in [0, 1]. */
    stats(): FetchCacheStats;
}
/** Singleton fetch cache — shared across all requests (5 min TTL, 500 entries). */
export declare const fetchCache: FetchCache;
/** Singleton search cache — shorter TTL since results change faster (60 s). */
export declare const searchCache: FetchCache;
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
 * In-memory LRU fetch cache for WebPeel
 *
 * Caches pipeline results to avoid redundant fetches for identical requests.
 * Entries expire after a fixed TTL, and the least recently used entry is
 * evicted once maxEntries is exceeded. The backing Map's insertion order
 * doubles as the recency order (oldest entry first).
 */
export class FetchCache {
    cache;
    maxEntries;
    defaultTTL; // ms
    hits;
    misses;
    constructor(maxEntries = 500, defaultTTLSeconds = 300) {
        this.cache = new Map();
        this.maxEntries = maxEntries;
        this.defaultTTL = defaultTTLSeconds * 1000;
        this.hits = 0;
        this.misses = 0;
    }
    /**
     * Build a stable cache key from the url plus the fetch options that
     * affect the result. Distinct option combinations yield distinct keys.
     */
    getKey(url, options = {}) {
        const flags = [
            `r:${options.render ? '1' : '0'}`,
            `s:${options.stealth ? '1' : '0'}`,
            `b:${options.budget !== undefined ? String(options.budget) : ''}`,
        ];
        return `${url}|${flags.join('|')}`;
    }
    /**
     * Look up a cached entry; returns null when absent or past its TTL.
     * A hit refreshes the entry's LRU position (re-inserted at the end).
     */
    get(key) {
        const hit = this.cache.get(key);
        if (hit === undefined) {
            this.misses += 1;
            return null;
        }
        if (Date.now() - hit.timestamp > this.defaultTTL) {
            // Stale — drop it and count the lookup as a miss.
            this.cache.delete(key);
            this.misses += 1;
            return null;
        }
        // Re-insert so the Map's iteration order reflects recency.
        this.cache.delete(key);
        this.cache.set(key, hit);
        this.hits += 1;
        return hit;
    }
    /**
     * Insert (or refresh) an entry, then evict least-recently-used
     * entries while the cache is over capacity.
     */
    set(key, entry) {
        // Deleting first refreshes the position when the key already exists.
        this.cache.delete(key);
        this.cache.set(key, entry);
        while (this.cache.size > this.maxEntries) {
            // Oldest entry is first in Map iteration order.
            const lru = this.cache.keys().next().value;
            if (lru === undefined) {
                break;
            }
            this.cache.delete(lru);
        }
    }
    /** Drop every entry and zero the hit/miss counters. */
    clear() {
        this.cache.clear();
        this.hits = 0;
        this.misses = 0;
    }
    /** Report size plus hit/miss counters; hitRate is in [0, 1], rounded to 2 decimals. */
    stats() {
        const lookups = this.hits + this.misses;
        const hitRate = lookups === 0 ? 0 : Math.round((this.hits / lookups) * 100) / 100;
        return {
            size: this.cache.size,
            hits: this.hits,
            misses: this.misses,
            hitRate,
        };
    }
}
/** Singleton fetch cache — shared across all requests (5 min TTL, 500 entries). */
export const fetchCache = new FetchCache(500, 300);
/** Singleton search cache — shorter TTL since results change faster (60 s). */
export const searchCache = new FetchCache(500, 60);
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
 * Source credibility scoring — lightweight, zero dependencies.
 *
 * Classifies URLs by trustworthiness:
 * - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
 * - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
 * - General (★): Everything else
 */
/** Result of a credibility assessment for a single URL. */
export interface SourceCredibility {
    /** Trust tier, most to least trustworthy: 'official' > 'verified' > 'general'. */
    tier: 'official' | 'verified' | 'general';
    /** Star rating matching the tier: 3 (official), 2 (verified), 1 (general). */
    stars: number;
    /** Human-readable badge: 'OFFICIAL SOURCE', 'VERIFIED', or 'UNVERIFIED'. */
    label: string;
}
/**
 * Assess the credibility of a source URL.
 * Never throws: URLs that fail to parse are classified as general/UNVERIFIED.
 */
export declare function getSourceCredibility(url: string): SourceCredibility;
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
 * Source credibility scoring — lightweight, zero dependencies.
 *
 * Classifies URLs by trustworthiness:
 * - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
 * - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
 * - General (★): Everything else
 */
/** Official TLDs and hostnames that indicate high-authority sources */
const OFFICIAL_TLDS = new Set(['.gov', '.edu', '.mil']);
const OFFICIAL_HOSTNAMES = new Set([
    // Academic / research
    'arxiv.org', 'scholar.google.com', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
    'jstor.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org', 'bmj.com',
    'thelancet.com', 'plos.org', 'springer.com', 'elsevier.com',
    // International organisations
    'who.int', 'un.org', 'worldbank.org', 'imf.org', 'oecd.org', 'europa.eu',
    // Official tech documentation
    'docs.python.org', 'developer.mozilla.org', 'nodejs.org', 'rust-lang.org',
    'docs.microsoft.com', 'learn.microsoft.com', 'developer.apple.com',
    'developer.android.com', 'php.net', 'ruby-lang.org', 'golang.org', 'go.dev',
    // Health / medicine
    'cdc.gov', 'nih.gov', 'fda.gov', 'mayoclinic.org', 'clevelandclinic.org',
    'webmd.com', 'medlineplus.gov',
    // Standards / specs
    'w3.org', 'ietf.org', 'rfc-editor.org', 'iso.org',
]);
const VERIFIED_HOSTNAMES = new Set([
    // Encyclopaedia / reference
    'wikipedia.org', 'en.wikipedia.org', 'britannica.com',
    // Reputable news agencies
    'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'nytimes.com',
    'washingtonpost.com', 'theguardian.com', 'economist.com', 'ft.com',
    'cnn.com', 'npr.org', 'pbs.org',
    // Developer resources
    'github.com', 'stackoverflow.com', 'npmjs.com', 'pypi.org',
    'crates.io', 'docs.rs', 'packagist.org', 'rubygems.org',
    // Official cloud / vendor docs
    'docs.aws.amazon.com', 'cloud.google.com', 'docs.github.com',
    'azure.microsoft.com', 'registry.terraform.io',
    // Reputable tech publications
    'arstechnica.com', 'wired.com', 'techcrunch.com', 'theverge.com',
    // National Geographic, Smithsonian
    'nationalgeographic.com', 'smithsonianmag.com',
]);
/**
 * Assess the credibility of a source URL.
 *
 * Returns { tier, stars, label }: official (3★) for .gov/.edu/.mil and
 * known authoritative hosts, verified (2★) for known reputable hosts,
 * general (1★) for everything else. Subdomains inherit the rating of any
 * listed parent domain. Never throws — unparseable URLs are 'general'.
 */
export function getSourceCredibility(url) {
    try {
        const hostname = new URL(url).hostname.toLowerCase().replace(/^www\./, '');
        // Official TLDs (.gov/.edu/.mil) always win.
        for (const tld of OFFICIAL_TLDS) {
            if (hostname.endsWith(tld)) {
                return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
            }
        }
        // Check the hostname and every parent dot-suffix, longest first
        // (e.g. news.bbc.co.uk → bbc.co.uk → co.uk), so subdomains of a
        // listed host — including multi-label hosts like bbc.co.uk, which
        // a fixed last-two-labels check would miss — inherit its rating.
        // Official is checked before verified at each level to keep the
        // tier priority.
        const labels = hostname.split('.');
        for (let i = 0; i < labels.length - 1; i++) {
            const candidate = labels.slice(i).join('.');
            if (OFFICIAL_HOSTNAMES.has(candidate)) {
                return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
            }
            if (VERIFIED_HOSTNAMES.has(candidate)) {
                return { tier: 'verified', stars: 2, label: 'VERIFIED' };
            }
        }
        // Everything else
        return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
    }
    catch {
        // new URL() threw — not a parseable URL; treat as unverified.
        return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
    }
}
|
|
@@ -7,6 +7,7 @@ import { Router } from 'express';
|
|
|
7
7
|
import { readFileSync } from 'fs';
|
|
8
8
|
import { join, dirname } from 'path';
|
|
9
9
|
import { fileURLToPath } from 'url';
|
|
10
|
+
import { fetchCache, searchCache } from '../../core/fetch-cache.js';
|
|
10
11
|
const startTime = Date.now();
|
|
11
12
|
// Read version once at startup
|
|
12
13
|
let version = 'unknown';
|
|
@@ -26,11 +27,23 @@ export function createHealthRouter() {
|
|
|
26
27
|
const router = Router();
|
|
27
28
|
router.get('/health', (_req, res) => {
|
|
28
29
|
const uptime = Math.floor((Date.now() - startTime) / 1000);
|
|
30
|
+
const fetchStats = fetchCache.stats();
|
|
31
|
+
const searchStats = searchCache.stats();
|
|
29
32
|
res.json({
|
|
30
33
|
status: 'healthy',
|
|
31
34
|
version,
|
|
32
35
|
uptime,
|
|
33
36
|
timestamp: new Date().toISOString(),
|
|
37
|
+
cache: {
|
|
38
|
+
fetch: {
|
|
39
|
+
size: fetchStats.size,
|
|
40
|
+
hitRate: fetchStats.hitRate,
|
|
41
|
+
},
|
|
42
|
+
search: {
|
|
43
|
+
size: searchStats.size,
|
|
44
|
+
hitRate: searchStats.hitRate,
|
|
45
|
+
},
|
|
46
|
+
},
|
|
34
47
|
});
|
|
35
48
|
});
|
|
36
49
|
return router;
|
|
@@ -7,7 +7,9 @@ import { load } from 'cheerio';
|
|
|
7
7
|
import { LRUCache } from 'lru-cache';
|
|
8
8
|
import { peel } from '../../index.js';
|
|
9
9
|
import { simpleFetch } from '../../core/fetcher.js';
|
|
10
|
+
import { searchCache } from '../../core/fetch-cache.js';
|
|
10
11
|
import { getSearchProvider, getBestSearchProvider, } from '../../core/search-provider.js';
|
|
12
|
+
import { getSourceCredibility } from '../../core/source-credibility.js';
|
|
11
13
|
export function createSearchRouter(authStore) {
|
|
12
14
|
const router = Router();
|
|
13
15
|
// LRU cache: 15 minute TTL, max 500 entries, 50MB total size
|
|
@@ -47,9 +49,9 @@ export function createSearchRouter(authStore) {
|
|
|
47
49
|
return;
|
|
48
50
|
}
|
|
49
51
|
// Parse and validate count
|
|
50
|
-
const resultCount = count ? parseInt(count, 10) :
|
|
51
|
-
if (isNaN(resultCount) || resultCount < 1 || resultCount >
|
|
52
|
-
res.status(400).json({ success: false, error: { type: 'invalid_request', message: 'Invalid "count" parameter: must be between 1 and
|
|
52
|
+
const resultCount = count ? parseInt(count, 10) : 10;
|
|
53
|
+
if (isNaN(resultCount) || resultCount < 1 || resultCount > 20) {
|
|
54
|
+
res.status(400).json({ success: false, error: { type: 'invalid_request', message: 'Invalid "count" parameter: must be between 1 and 20', hint: 'Use a count value between 1 and 20', docs: 'https://webpeel.dev/docs/errors#invalid_request' }, requestId: req.requestId });
|
|
53
55
|
return;
|
|
54
56
|
}
|
|
55
57
|
// Parse sources parameter (comma-separated: web,news,images)
|
|
@@ -64,10 +66,12 @@ export function createSearchRouter(authStore) {
|
|
|
64
66
|
// Build cache key (include all parameters)
|
|
65
67
|
const enrichCount = enrich ? Math.min(Math.max(parseInt(enrich, 10) || 0, 0), 5) : 0;
|
|
66
68
|
const cacheKey = `search:${providerId}:${q}:${resultCount}:${sourcesStr}:${shouldScrape}:${enrichCount}:${categoriesStr}:${tbsStr}:${countryStr}:${locationStr}`;
|
|
67
|
-
|
|
69
|
+
const sharedCacheKey = searchCache.getKey(cacheKey, {});
|
|
70
|
+
// Check cache (local LRU first, then shared singleton)
|
|
68
71
|
const cached = cache.get(cacheKey);
|
|
69
72
|
if (cached) {
|
|
70
73
|
res.setHeader('X-Cache', 'HIT');
|
|
74
|
+
res.setHeader('X-Cache-Status', 'HIT');
|
|
71
75
|
res.setHeader('X-Cache-Age', Math.floor((Date.now() - cached.timestamp) / 1000).toString());
|
|
72
76
|
res.json({
|
|
73
77
|
success: true,
|
|
@@ -75,6 +79,19 @@ export function createSearchRouter(authStore) {
|
|
|
75
79
|
});
|
|
76
80
|
return;
|
|
77
81
|
}
|
|
82
|
+
// Also check shared searchCache singleton (used for /health stats)
|
|
83
|
+
const sharedCached = searchCache.get(sharedCacheKey);
|
|
84
|
+
if (sharedCached) {
|
|
85
|
+
const age = Math.floor((Date.now() - sharedCached.timestamp) / 1000);
|
|
86
|
+
res.setHeader('X-Cache', 'HIT');
|
|
87
|
+
res.setHeader('X-Cache-Status', 'HIT');
|
|
88
|
+
res.setHeader('X-Cache-Age', age.toString());
|
|
89
|
+
res.json({
|
|
90
|
+
success: true,
|
|
91
|
+
data: sharedCached.content ? JSON.parse(sharedCached.content) : {},
|
|
92
|
+
});
|
|
93
|
+
return;
|
|
94
|
+
}
|
|
78
95
|
const startTime = Date.now();
|
|
79
96
|
const data = {};
|
|
80
97
|
// Fetch web results via the search-provider abstraction
|
|
@@ -194,6 +211,19 @@ export function createSearchRouter(authStore) {
|
|
|
194
211
|
}
|
|
195
212
|
}
|
|
196
213
|
}
|
|
214
|
+
// Add credibility scores and sort by trustworthiness
|
|
215
|
+
const tierOrder = { official: 0, verified: 1, general: 2 };
|
|
216
|
+
results = results
|
|
217
|
+
.map(r => {
|
|
218
|
+
const cred = getSourceCredibility(r.url);
|
|
219
|
+
return { ...r, credibility: cred };
|
|
220
|
+
})
|
|
221
|
+
.sort((a, b) => {
|
|
222
|
+
const aTier = tierOrder[a.credibility?.tier || 'general'] ?? 2;
|
|
223
|
+
const bTier = tierOrder[b.credibility?.tier || 'general'] ?? 2;
|
|
224
|
+
return aTier - bTier; // Official first, then verified, then general
|
|
225
|
+
})
|
|
226
|
+
.map((r, i) => ({ ...r, rank: i + 1 }));
|
|
197
227
|
data.web = results;
|
|
198
228
|
}
|
|
199
229
|
// Fetch news results (DDG only — Brave news is not supported via HTML scraping)
|
|
@@ -317,13 +347,22 @@ export function createSearchRouter(authStore) {
|
|
|
317
347
|
await pgStore.trackUsage(req.auth.keyInfo.key, 'search');
|
|
318
348
|
}
|
|
319
349
|
}
|
|
320
|
-
// Cache results
|
|
350
|
+
// Cache results (local LRU + shared singleton for /health stats)
|
|
321
351
|
cache.set(cacheKey, {
|
|
322
352
|
data,
|
|
323
353
|
timestamp: Date.now(),
|
|
324
354
|
});
|
|
355
|
+
searchCache.set(sharedCacheKey, {
|
|
356
|
+
content: JSON.stringify(data),
|
|
357
|
+
title: q,
|
|
358
|
+
metadata: {},
|
|
359
|
+
method: 'search',
|
|
360
|
+
tokens: 0,
|
|
361
|
+
timestamp: Date.now(),
|
|
362
|
+
});
|
|
325
363
|
// Add headers
|
|
326
364
|
res.setHeader('X-Cache', 'MISS');
|
|
365
|
+
res.setHeader('X-Cache-Status', 'MISS');
|
|
327
366
|
res.setHeader('X-Credits-Used', '1');
|
|
328
367
|
res.setHeader('X-Processing-Time', elapsed.toString());
|
|
329
368
|
res.setHeader('X-Fetch-Type', 'search');
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.56",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|