webpeel 0.21.1 → 0.21.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -49,6 +49,7 @@ import { createAgentRouter } from './routes/agent.js';
49
49
  import { createSessionRouter } from './routes/session.js';
50
50
  import { createSentryHooks } from './sentry.js';
51
51
  import { requireScope } from './middleware/scope-guard.js';
52
+ import { createCacheWarmRouter, startCacheWarmer } from './routes/cache-warm.js';
52
53
  import { warmup, cleanup as cleanupFetcher } from '../core/fetcher.js';
53
54
  import { registerPremiumHooks } from './premium/index.js';
54
55
  import { readFileSync } from 'fs';
@@ -238,6 +239,9 @@ export function createApp(config = {}) {
238
239
  app.get('/docs/api', (_req, res) => {
239
240
  res.redirect('/openapi.yaml');
240
241
  });
242
+ // Internal cache-warming endpoints — unauthenticated (self-auth via bearer token)
243
+ // Must be BEFORE auth middleware so the CF Worker can call without an API key
244
+ app.use(createCacheWarmRouter(pool));
241
245
  // Demo endpoint — unauthenticated, must be before auth middleware
242
246
  app.use(createDemoRouter());
243
247
  // Playground endpoint — unauthenticated, CORS-locked to webpeel.dev/localhost
@@ -398,9 +402,22 @@ export function startServer(config = {}) {
398
402
  void warmup().catch((error) => {
399
403
  log.warn('Browser warmup failed', { error: error instanceof Error ? error.message : String(error) });
400
404
  });
405
+ // Build a dedicated pool for the cache warmer (separate from the app pool inside createApp)
406
+ const warmerPool = process.env.DATABASE_URL
407
+ ? new pg.Pool({
408
+ connectionString: process.env.DATABASE_URL,
409
+ ssl: process.env.NODE_ENV === 'production' ? { rejectUnauthorized: true } : false,
410
+ max: 2, // small pool — warmer only needs occasional queries
411
+ })
412
+ : null;
401
413
  const server = app.listen(port, () => {
402
414
  log.info(`WebPeel API server listening on port ${port}`);
403
415
  log.info(`Health: http://localhost:${port}/health Fetch: /v1/fetch Search: /v1/search`);
416
+ // Start cache warmer only when opted-in
417
+ if (process.env.ENABLE_CACHE_WARM === 'true') {
418
+ log.info('Cache warming enabled (ENABLE_CACHE_WARM=true)');
419
+ startCacheWarmer(warmerPool);
420
+ }
404
421
  });
405
422
  // Graceful shutdown
406
423
  const shutdown = () => {
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Cache Pre-Warming Routes
3
+ *
4
+ * GET /internal/popular-urls — Top N URLs fetched in the last 24h (for CF Worker)
5
+ * GET /internal/cache-status — Current warmer state (warmed URLs, last run time)
6
+ *
7
+ * Both routes are mounted BEFORE auth middleware so they're accessible internally.
8
+ * /internal/popular-urls is protected by CACHE_WARM_SECRET bearer token when set.
9
+ *
10
+ * startCacheWarmer() — server-side self-warming (opt-in via ENABLE_CACHE_WARM=true)
11
+ */
12
+ import { Router } from 'express';
13
+ import pg from 'pg';
14
+ export declare function createCacheWarmRouter(pool: pg.Pool | null): Router;
15
+ /**
16
+ * startCacheWarmer — server-side self-warming (fallback when no CF Worker).
17
+ *
18
+ * Every `intervalMs` (default 2 min):
19
+ * 1. Runs the popular-URL query directly against the DB (same logic as /internal/popular-urls, no HTTP call)
20
+ * 2. Fetches each URL through /r/<url> with concurrency 5
21
+ * 3. Updates warmerState for /internal/cache-status
22
+ *
23
+ * Only started if ENABLE_CACHE_WARM=true.
24
+ */
25
+ export declare function startCacheWarmer(pool: pg.Pool | null, intervalMs?: number): void;
@@ -0,0 +1,210 @@
1
+ /**
2
+ * Cache Pre-Warming Routes
3
+ *
4
+ * GET /internal/popular-urls — Top N URLs fetched in the last 24h (for CF Worker)
5
+ * GET /internal/cache-status — Current warmer state (warmed URLs, last run time)
6
+ *
7
+ * Both routes are mounted BEFORE the auth middleware, so they can be called without an API key.
8
+ * /internal/popular-urls requires the CACHE_WARM_SECRET bearer token when that variable is set; /internal/cache-status is never authenticated.
9
+ *
10
+ * startCacheWarmer() — server-side self-warming (opt-in via ENABLE_CACHE_WARM=true)
11
+ */
12
import { createHash, timingSafeEqual } from 'crypto';
import { Router } from 'express';
import { createLogger } from '../logger.js';
14
// Logger scoped to this module; entries are tagged 'cache-warm'.
const log = createLogger('cache-warm');

/**
 * Static seed list of URLs.
 *
 * Substituted for the popular-URL query result whenever that result is empty:
 * no pool configured, the query failed, or usage_logs had no matching rows.
 * Entries are served and warmed in the order written here.
 */
const FALLBACK_URLS = [
    'https://www.bbc.com/news',
    'https://news.ycombinator.com',
    'https://github.com',
    'https://en.wikipedia.org/wiki/Main_Page',
    'https://www.reuters.com',
    'https://techcrunch.com',
    'https://stripe.com/docs',
    'https://developer.mozilla.org',
    'https://react.dev',
    'https://docs.python.org/3/',
    'https://nodejs.org/en/docs',
    'https://www.npmjs.com',
    'https://vercel.com/docs',
    'https://nextjs.org/docs',
    'https://tailwindcss.com/docs',
    'https://www.typescriptlang.org/docs/',
    'https://docs.render.com',
    'https://cloudflare.com/docs',
    'https://aws.amazon.com/documentation/',
    'https://docs.github.com',
    'https://www.nytimes.com',
    'https://www.theguardian.com',
    'https://arstechnica.com',
    'https://www.wired.com',
    'https://www.bloomberg.com/technology',
    'https://lobste.rs',
    'https://www.producthunt.com',
    'https://stackoverflow.com',
    'https://css-tricks.com',
    'https://web.dev',
];

/**
 * Process-local record of the most recent warm cycle, reported by
 * GET /internal/cache-status. Held in memory only.
 *
 * @type {{ warmedUrls: Set<string>, lastWarmTime: Date | null }}
 */
const warmerState = {
    // URLs whose warm request returned an OK response in the last cycle.
    warmedUrls: new Set(),
    // Completion time of the last cycle; null until one has finished.
    lastWarmTime: null,
};
54
+ // ─── Router ──────────────────────────────────────────────────────────────────
55
/**
 * Compares a presented bearer token with the configured secret in constant time.
 *
 * Both inputs are reduced to SHA-256 digests first so that timingSafeEqual
 * always receives equal-length buffers (it throws on a length mismatch) and
 * the comparison time does not depend on how many leading characters match.
 *
 * @param {string} presented - Token taken from the Authorization header ('' when absent).
 * @param {string} expected - Value of CACHE_WARM_SECRET.
 * @returns {boolean} true only when the two strings are identical.
 */
function isValidWarmToken(presented, expected) {
    const digest = (value) => createHash('sha256').update(value).digest();
    return timingSafeEqual(digest(presented), digest(expected));
}

/**
 * Builds the router exposing the cache-warming endpoints.
 *
 *   GET /internal/popular-urls — up to 50 most-fetched URLs of the last 24h,
 *                                or the static fallback list when no usage
 *                                data is available. Requires
 *                                `Authorization: Bearer <CACHE_WARM_SECRET>`
 *                                when that environment variable is set.
 *   GET /internal/cache-status — current in-memory warmer state; performs no
 *                                auth check of its own.
 *
 * @param pool - pg connection pool, or null to always serve the fallback list.
 * @returns An Express router with both routes registered.
 */
export function createCacheWarmRouter(pool) {
    const router = Router();

    // GET /internal/popular-urls
    router.get('/internal/popular-urls', async (req, res) => {
        // Auth check — only enforced when CACHE_WARM_SECRET is configured.
        const secret = process.env.CACHE_WARM_SECRET;
        if (secret) {
            const authHeader = req.headers['authorization'] || '';
            const token = authHeader.startsWith('Bearer ') ? authHeader.slice(7) : '';
            // Constant-time comparison: a plain `!==` leaks, via response timing,
            // how much of the secret an attacker has guessed correctly.
            if (!isValidWarmToken(token, secret)) {
                res.status(401).json({
                    success: false,
                    error: { type: 'unauthorized', message: 'Invalid or missing bearer token.' },
                });
                return;
            }
        }

        let urls = [];

        // Query DB if available
        if (pool) {
            try {
                const result = await pool.query(`
      SELECT url, COUNT(*) as fetch_count
      FROM usage_logs
      WHERE created_at > NOW() - INTERVAL '24 hours'
      AND url IS NOT NULL
      AND status_code >= 200 AND status_code < 300
      AND url NOT LIKE '%localhost%'
      AND url NOT LIKE '%127.0.0.1%'
      AND url NOT LIKE '%169.254%'
      GROUP BY url
      ORDER BY fetch_count DESC
      LIMIT 50
      `);
                urls = result.rows.map((row) => ({
                    url: row.url,
                    // fetch_count is parsed explicitly so `count` is always a number.
                    count: Number.parseInt(row.fetch_count, 10),
                }));
            }
            catch (err) {
                // Best-effort: a DB failure degrades to the static list instead of a 5xx.
                log.warn('Failed to query usage_logs, falling back to static list', {
                    error: err instanceof Error ? err.message : String(err),
                });
            }
        }

        // Fall back to static list if no DB data
        if (urls.length === 0) {
            log.info('No usage data found, using fallback URL list');
            urls = FALLBACK_URLS.map((url) => ({ url, count: 0 }));
        }

        res.json({
            urls,
            total: urls.length,
            generatedAt: new Date().toISOString(),
        });
    });

    // GET /internal/cache-status
    router.get('/internal/cache-status', (_req, res) => {
        res.json({
            warmedUrls: Array.from(warmerState.warmedUrls),
            urlCount: warmerState.warmedUrls.size,
            lastWarmTime: warmerState.lastWarmTime?.toISOString() ?? null,
        });
    });

    return router;
}
121
+ // ─── Self-Warming ─────────────────────────────────────────────────────────────
122
/**
 * startCacheWarmer — server-side self-warming (fallback when no CF Worker).
 *
 * Runs one warm cycle immediately and then one every `intervalMs`:
 *   1. Runs the popular-URL query directly against the DB (same logic as
 *      /internal/popular-urls, no HTTP call); uses FALLBACK_URLS when the
 *      pool is null, the query fails, or it returns no rows.
 *   2. Requests each URL through /r/<url> in batches of 5, with a 15 s
 *      timeout per request.
 *   3. Replaces warmerState (read by /internal/cache-status) with the URLs
 *      that returned an OK response and the completion time.
 *
 * A tick that fires while the previous cycle is still running is skipped, so
 * at most one cycle is in flight at any time.
 *
 * @param pool - pg connection pool, or null to warm the fallback list only.
 * @param intervalMs - Delay between cycle starts in milliseconds (default 2 min).
 */
export function startCacheWarmer(pool, intervalMs = 120_000) {
    log.info('Cache warmer started', { intervalMs });

    const runWarm = async () => {
        const t0 = Date.now();
        log.info('Cache warm cycle starting');

        // Determine base URL
        const base = process.env.RENDER_EXTERNAL_URL?.replace(/\/$/, '') ||
            `http://localhost:${process.env.PORT || 3000}`;

        // Step 1: Fetch popular URLs (same logic as the endpoint)
        let urls = [];
        if (pool) {
            try {
                const result = await pool.query(`
      SELECT url, COUNT(*) as fetch_count
      FROM usage_logs
      WHERE created_at > NOW() - INTERVAL '24 hours'
      AND url IS NOT NULL
      AND status_code >= 200 AND status_code < 300
      AND url NOT LIKE '%localhost%'
      AND url NOT LIKE '%127.0.0.1%'
      AND url NOT LIKE '%169.254%'
      GROUP BY url
      ORDER BY fetch_count DESC
      LIMIT 50
      `);
                urls = result.rows.map((row) => ({
                    url: row.url,
                    count: Number.parseInt(row.fetch_count, 10),
                }));
            }
            catch (err) {
                log.warn('Warm cycle: DB query failed, using fallback', {
                    error: err instanceof Error ? err.message : String(err),
                });
            }
        }
        if (urls.length === 0) {
            urls = FALLBACK_URLS.map((u) => ({ url: u, count: 0 }));
        }

        // Step 2: Warm each URL with concurrency 5
        const concurrency = 5;
        let warmed = 0;
        let failed = 0;
        const newWarmedSet = new Set();
        for (let i = 0; i < urls.length; i += concurrency) {
            const batch = urls.slice(i, i + concurrency);
            const results = await Promise.allSettled(batch.map(async ({ url }) => {
                const response = await fetch(`${base}/r/${encodeURIComponent(url)}`, {
                    headers: { 'User-Agent': 'WebPeel-CacheWarmer/1.0' },
                    signal: AbortSignal.timeout(15_000),
                });
                // Drain the body so the underlying connection is released instead of
                // lingering until GC or the timeout. The outcome is decided by the
                // status line alone, so a failure while draining is ignored.
                try {
                    await response.arrayBuffer();
                }
                catch {
                    // status already known — nothing to do
                }
                return { url, ok: response.ok };
            }));
            for (const outcome of results) {
                if (outcome.status === 'fulfilled' && outcome.value.ok) {
                    newWarmedSet.add(outcome.value.url);
                    warmed++;
                }
                else {
                    // Non-OK responses and rejected requests (network error, timeout)
                    // both count as failures.
                    failed++;
                }
            }
        }

        // Step 3: Update state
        warmerState.warmedUrls = newWarmedSet;
        warmerState.lastWarmTime = new Date();
        const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
        log.info(`Warmed ${warmed}/${urls.length} URLs in ${elapsed}s`, { failed });
    };

    // Worst case for one cycle is 50 URLs / 5 per batch * 15 s timeout = 150 s,
    // which exceeds the default 120 s interval. Without this guard, cycles would
    // overlap and race on warmerState while doubling load on the server.
    let cycleInFlight = false;
    const runGuarded = () => {
        if (cycleInFlight) {
            log.warn('Cache warm cycle skipped: previous cycle still running');
            return;
        }
        cycleInFlight = true;
        void runWarm()
            .catch((err) => {
                log.error('Cache warm cycle error', {
                    error: err instanceof Error ? err.message : String(err),
                });
            })
            .finally(() => {
                cycleInFlight = false;
            });
    };

    // Run once immediately, then on interval
    runGuarded();
    setInterval(runGuarded, intervalMs);
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.1",
3
+ "version": "0.21.2",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",