webpeel 0.21.52 → 0.21.56

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ /**
2
+ * In-memory LRU fetch cache for WebPeel
3
+ *
4
+ * Caches pipeline results to avoid redundant fetches for identical requests.
5
+ * Supports TTL-based expiry and LRU eviction when maxEntries is exceeded.
6
+ * Exported as a singleton: import { fetchCache } from './fetch-cache.js'
7
+ */
8
+ export interface FetchCacheEntry {
9
+ content: string;
10
+ title: string;
11
+ metadata: any;
12
+ method: string;
13
+ tokens: number;
14
+ links?: any[];
15
+ timestamp: number;
16
+ }
17
+ export interface FetchCacheStats {
18
+ size: number;
19
+ hits: number;
20
+ misses: number;
21
+ hitRate: number;
22
+ }
23
+ export declare class FetchCache {
24
+ private cache;
25
+ private maxEntries;
26
+ private defaultTTL;
27
+ private hits;
28
+ private misses;
29
+ constructor(maxEntries?: number, defaultTTLSeconds?: number);
30
+ /**
31
+ * Generate a stable cache key from url + relevant fetch options.
32
+ * Different option combinations produce different cache entries.
33
+ */
34
+ getKey(url: string, options?: {
35
+ render?: boolean;
36
+ stealth?: boolean;
37
+ budget?: number;
38
+ }): string;
39
+ /**
40
+ * Retrieve a cached entry. Returns null if missing or expired.
41
+ * On hit: entry is moved to the end of the Map (LRU refresh).
42
+ */
43
+ get(key: string): FetchCacheEntry | null;
44
+ /**
45
+ * Store an entry in the cache.
46
+ * If the cache is at capacity, the least recently used entry is evicted.
47
+ */
48
+ set(key: string, entry: FetchCacheEntry): void;
49
+ /** Clear all entries and reset stats. */
50
+ clear(): void;
51
+ /** Return cache stats. hitRate is in [0, 1]. */
52
+ stats(): FetchCacheStats;
53
+ }
54
+ /** Singleton fetch cache — shared across all requests (5 min TTL, 500 entries). */
55
+ export declare const fetchCache: FetchCache;
56
+ /** Singleton search cache — shorter TTL since results change faster (60 s). */
57
+ export declare const searchCache: FetchCache;
@@ -0,0 +1,95 @@
1
/**
 * In-memory LRU fetch cache for WebPeel.
 *
 * Stores pipeline results keyed by URL + fetch options so identical
 * requests can be served without refetching. Entries expire after a
 * fixed TTL, and the least recently used entry is evicted whenever the
 * cache grows past maxEntries.
 * Exported as a singleton: import { fetchCache } from './fetch-cache.js'
 */
export class FetchCache {
    cache;        // Map<string, entry>; insertion order doubles as LRU order
    maxEntries;   // eviction threshold
    defaultTTL;   // entry lifetime in milliseconds
    hits;
    misses;
    constructor(maxEntries = 500, defaultTTLSeconds = 300) {
        this.cache = new Map();
        this.maxEntries = maxEntries;
        this.defaultTTL = defaultTTLSeconds * 1000;
        this.hits = 0;
        this.misses = 0;
    }
    /**
     * Build a stable cache key from the URL plus the fetch options that
     * affect the result; differing option combinations yield differing keys.
     */
    getKey(url, options = {}) {
        const renderFlag = options.render ? '1' : '0';
        const stealthFlag = options.stealth ? '1' : '0';
        const budgetPart = options.budget === undefined ? '' : String(options.budget);
        return [url, `r:${renderFlag}`, `s:${stealthFlag}`, `b:${budgetPart}`].join('|');
    }
    /**
     * Look up a cached entry. Absent or expired keys return null and count
     * as a miss; a hit refreshes the entry's LRU position (moved to the
     * tail of the Map) and counts toward the hit rate.
     */
    get(key) {
        const found = this.cache.get(key);
        if (found === undefined) {
            this.misses += 1;
            return null;
        }
        if (Date.now() - found.timestamp > this.defaultTTL) {
            // Past its TTL: drop the stale entry and report a miss.
            this.cache.delete(key);
            this.misses += 1;
            return null;
        }
        // Delete + re-insert moves the key to the end of the iteration order.
        this.cache.delete(key);
        this.cache.set(key, found);
        this.hits += 1;
        return found;
    }
    /**
     * Insert (or refresh) an entry, then evict least-recently-used
     * entries until the cache fits within maxEntries.
     */
    set(key, entry) {
        this.cache.delete(key); // refresh LRU position if the key already exists
        this.cache.set(key, entry);
        // Oldest entries come first in Map iteration order.
        for (const oldest of this.cache.keys()) {
            if (this.cache.size <= this.maxEntries) {
                break;
            }
            this.cache.delete(oldest);
        }
    }
    /** Drop every entry and zero the hit/miss counters. */
    clear() {
        this.cache.clear();
        this.hits = 0;
        this.misses = 0;
    }
    /** Report current size and hit/miss counts; hitRate is in [0, 1]. */
    stats() {
        const lookups = this.hits + this.misses;
        const hitRate = lookups === 0 ? 0 : Math.round((this.hits / lookups) * 100) / 100;
        return { size: this.cache.size, hits: this.hits, misses: this.misses, hitRate };
    }
}
/** Singleton fetch cache — shared across all requests (5 min TTL, 500 entries). */
export const fetchCache = new FetchCache(500, 300);
/** Singleton search cache — shorter TTL since results change faster (60 s). */
export const searchCache = new FetchCache(500, 60);
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Source credibility scoring — lightweight, zero dependencies.
3
+ *
4
+ * Classifies URLs by trustworthiness:
5
+ * - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
6
+ * - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
7
+ * - General (★): Everything else
8
+ */
9
+ export interface SourceCredibility {
10
+ tier: 'official' | 'verified' | 'general';
11
+ stars: number;
12
+ label: string;
13
+ }
14
+ /**
15
+ * Assess the credibility of a source URL.
16
+ */
17
+ export declare function getSourceCredibility(url: string): SourceCredibility;
@@ -0,0 +1,83 @@
1
/**
 * Source credibility scoring — lightweight, zero dependencies.
 *
 * Classifies URLs by trustworthiness:
 * - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
 * - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
 * - General (★): Everything else
 */
/** Official TLDs and hostnames that indicate high-authority sources */
const OFFICIAL_TLDS = new Set(['.gov', '.edu', '.mil']);
const OFFICIAL_HOSTNAMES = new Set([
    // Academic / research
    'arxiv.org', 'scholar.google.com', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
    'jstor.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org', 'bmj.com',
    'thelancet.com', 'plos.org', 'springer.com', 'elsevier.com',
    // International organisations
    'who.int', 'un.org', 'worldbank.org', 'imf.org', 'oecd.org', 'europa.eu',
    // Official tech documentation
    'docs.python.org', 'developer.mozilla.org', 'nodejs.org', 'rust-lang.org',
    'docs.microsoft.com', 'learn.microsoft.com', 'developer.apple.com',
    'developer.android.com', 'php.net', 'ruby-lang.org', 'golang.org', 'go.dev',
    // Health / medicine
    'cdc.gov', 'nih.gov', 'fda.gov', 'mayoclinic.org', 'clevelandclinic.org',
    'webmd.com', 'medlineplus.gov',
    // Standards / specs
    'w3.org', 'ietf.org', 'rfc-editor.org', 'iso.org',
]);
const VERIFIED_HOSTNAMES = new Set([
    // Encyclopaedia / reference
    'wikipedia.org', 'en.wikipedia.org', 'britannica.com',
    // Reputable news agencies
    'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'nytimes.com',
    'washingtonpost.com', 'theguardian.com', 'economist.com', 'ft.com',
    'cnn.com', 'npr.org', 'pbs.org',
    // Developer resources
    'github.com', 'stackoverflow.com', 'npmjs.com', 'pypi.org',
    'crates.io', 'docs.rs', 'packagist.org', 'rubygems.org',
    // Official cloud / vendor docs
    'docs.aws.amazon.com', 'cloud.google.com', 'docs.github.com',
    'azure.microsoft.com', 'registry.terraform.io',
    // Reputable tech publications
    'arstechnica.com', 'wired.com', 'techcrunch.com', 'theverge.com',
    // National Geographic, Smithsonian
    'nationalgeographic.com', 'smithsonianmag.com',
]);
// Factories return a fresh object per call so callers may safely mutate results.
const officialResult = () => ({ tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' });
const verifiedResult = () => ({ tier: 'verified', stars: 2, label: 'VERIFIED' });
const generalResult = () => ({ tier: 'general', stars: 1, label: 'UNVERIFIED' });
/**
 * Assess the credibility of a source URL.
 * Unparseable URLs fall back to the general tier.
 */
export function getSourceCredibility(url) {
    let host;
    try {
        host = new URL(url).hostname.toLowerCase();
    }
    catch {
        // Not a valid URL — treat as an unverified general source.
        return generalResult();
    }
    if (host.startsWith('www.')) {
        host = host.slice(4);
    }
    // High-authority TLDs (.gov/.edu/.mil) win outright.
    for (const tld of OFFICIAL_TLDS) {
        if (host.endsWith(tld)) {
            return officialResult();
        }
    }
    // Exact match against known official hostnames.
    if (OFFICIAL_HOSTNAMES.has(host)) {
        return officialResult();
    }
    // Subdomains inherit their parent's rating (en.wikipedia.org → wikipedia.org).
    const labels = host.split('.');
    if (labels.length > 2) {
        const parent = labels.slice(-2).join('.');
        if (OFFICIAL_HOSTNAMES.has(parent)) {
            return officialResult();
        }
        if (VERIFIED_HOSTNAMES.has(parent)) {
            return verifiedResult();
        }
    }
    // Exact match against known verified hostnames.
    if (VERIFIED_HOSTNAMES.has(host)) {
        return verifiedResult();
    }
    // Everything else.
    return generalResult();
}
@@ -7,6 +7,7 @@ import { Router } from 'express';
7
7
  import { readFileSync } from 'fs';
8
8
  import { join, dirname } from 'path';
9
9
  import { fileURLToPath } from 'url';
10
+ import { fetchCache, searchCache } from '../../core/fetch-cache.js';
10
11
  const startTime = Date.now();
11
12
  // Read version once at startup
12
13
  let version = 'unknown';
@@ -26,11 +27,23 @@ export function createHealthRouter() {
26
27
  const router = Router();
27
28
  router.get('/health', (_req, res) => {
28
29
  const uptime = Math.floor((Date.now() - startTime) / 1000);
30
+ const fetchStats = fetchCache.stats();
31
+ const searchStats = searchCache.stats();
29
32
  res.json({
30
33
  status: 'healthy',
31
34
  version,
32
35
  uptime,
33
36
  timestamp: new Date().toISOString(),
37
+ cache: {
38
+ fetch: {
39
+ size: fetchStats.size,
40
+ hitRate: fetchStats.hitRate,
41
+ },
42
+ search: {
43
+ size: searchStats.size,
44
+ hitRate: searchStats.hitRate,
45
+ },
46
+ },
34
47
  });
35
48
  });
36
49
  return router;
@@ -7,7 +7,9 @@ import { load } from 'cheerio';
7
7
  import { LRUCache } from 'lru-cache';
8
8
  import { peel } from '../../index.js';
9
9
  import { simpleFetch } from '../../core/fetcher.js';
10
+ import { searchCache } from '../../core/fetch-cache.js';
10
11
  import { getSearchProvider, getBestSearchProvider, } from '../../core/search-provider.js';
12
+ import { getSourceCredibility } from '../../core/source-credibility.js';
11
13
  export function createSearchRouter(authStore) {
12
14
  const router = Router();
13
15
  // LRU cache: 15 minute TTL, max 500 entries, 50MB total size
@@ -47,9 +49,9 @@ export function createSearchRouter(authStore) {
47
49
  return;
48
50
  }
49
51
  // Parse and validate count
50
- const resultCount = count ? parseInt(count, 10) : 5;
51
- if (isNaN(resultCount) || resultCount < 1 || resultCount > 10) {
52
- res.status(400).json({ success: false, error: { type: 'invalid_request', message: 'Invalid "count" parameter: must be between 1 and 10', hint: 'Use a count value between 1 and 10', docs: 'https://webpeel.dev/docs/errors#invalid_request' }, requestId: req.requestId });
52
+ const resultCount = count ? parseInt(count, 10) : 10;
53
+ if (isNaN(resultCount) || resultCount < 1 || resultCount > 20) {
54
+ res.status(400).json({ success: false, error: { type: 'invalid_request', message: 'Invalid "count" parameter: must be between 1 and 20', hint: 'Use a count value between 1 and 20', docs: 'https://webpeel.dev/docs/errors#invalid_request' }, requestId: req.requestId });
53
55
  return;
54
56
  }
55
57
  // Parse sources parameter (comma-separated: web,news,images)
@@ -64,10 +66,12 @@ export function createSearchRouter(authStore) {
64
66
  // Build cache key (include all parameters)
65
67
  const enrichCount = enrich ? Math.min(Math.max(parseInt(enrich, 10) || 0, 0), 5) : 0;
66
68
  const cacheKey = `search:${providerId}:${q}:${resultCount}:${sourcesStr}:${shouldScrape}:${enrichCount}:${categoriesStr}:${tbsStr}:${countryStr}:${locationStr}`;
67
- // Check cache
69
+ const sharedCacheKey = searchCache.getKey(cacheKey, {});
70
+ // Check cache (local LRU first, then shared singleton)
68
71
  const cached = cache.get(cacheKey);
69
72
  if (cached) {
70
73
  res.setHeader('X-Cache', 'HIT');
74
+ res.setHeader('X-Cache-Status', 'HIT');
71
75
  res.setHeader('X-Cache-Age', Math.floor((Date.now() - cached.timestamp) / 1000).toString());
72
76
  res.json({
73
77
  success: true,
@@ -75,6 +79,19 @@ export function createSearchRouter(authStore) {
75
79
  });
76
80
  return;
77
81
  }
82
+ // Also check shared searchCache singleton (used for /health stats)
83
+ const sharedCached = searchCache.get(sharedCacheKey);
84
+ if (sharedCached) {
85
+ const age = Math.floor((Date.now() - sharedCached.timestamp) / 1000);
86
+ res.setHeader('X-Cache', 'HIT');
87
+ res.setHeader('X-Cache-Status', 'HIT');
88
+ res.setHeader('X-Cache-Age', age.toString());
89
+ res.json({
90
+ success: true,
91
+ data: sharedCached.content ? JSON.parse(sharedCached.content) : {},
92
+ });
93
+ return;
94
+ }
78
95
  const startTime = Date.now();
79
96
  const data = {};
80
97
  // Fetch web results via the search-provider abstraction
@@ -194,6 +211,19 @@ export function createSearchRouter(authStore) {
194
211
  }
195
212
  }
196
213
  }
214
+ // Add credibility scores and sort by trustworthiness
215
+ const tierOrder = { official: 0, verified: 1, general: 2 };
216
+ results = results
217
+ .map(r => {
218
+ const cred = getSourceCredibility(r.url);
219
+ return { ...r, credibility: cred };
220
+ })
221
+ .sort((a, b) => {
222
+ const aTier = tierOrder[a.credibility?.tier || 'general'] ?? 2;
223
+ const bTier = tierOrder[b.credibility?.tier || 'general'] ?? 2;
224
+ return aTier - bTier; // Official first, then verified, then general
225
+ })
226
+ .map((r, i) => ({ ...r, rank: i + 1 }));
197
227
  data.web = results;
198
228
  }
199
229
  // Fetch news results (DDG only — Brave news is not supported via HTML scraping)
@@ -317,13 +347,22 @@ export function createSearchRouter(authStore) {
317
347
  await pgStore.trackUsage(req.auth.keyInfo.key, 'search');
318
348
  }
319
349
  }
320
- // Cache results
350
+ // Cache results (local LRU + shared singleton for /health stats)
321
351
  cache.set(cacheKey, {
322
352
  data,
323
353
  timestamp: Date.now(),
324
354
  });
355
+ searchCache.set(sharedCacheKey, {
356
+ content: JSON.stringify(data),
357
+ title: q,
358
+ metadata: {},
359
+ method: 'search',
360
+ tokens: 0,
361
+ timestamp: Date.now(),
362
+ });
325
363
  // Add headers
326
364
  res.setHeader('X-Cache', 'MISS');
365
+ res.setHeader('X-Cache-Status', 'MISS');
327
366
  res.setHeader('X-Credits-Used', '1');
328
367
  res.setHeader('X-Processing-Time', elapsed.toString());
329
368
  res.setHeader('X-Fetch-Type', 'search');
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.52",
3
+ "version": "0.21.56",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",