webpeel 0.21.53 → 0.21.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/apply.js +0 -1
- package/dist/core/pipeline.js +0 -46
- package/dist/core/source-credibility.d.ts +17 -0
- package/dist/core/source-credibility.js +83 -0
- package/dist/server/app.js +12 -2
- package/dist/server/bull-queues.d.ts +57 -0
- package/dist/server/bull-queues.js +86 -0
- package/dist/server/middleware/auth.js +1 -1
- package/dist/server/middleware/rate-limit.js +1 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +231 -0
- package/dist/server/routes/health.d.ts +6 -2
- package/dist/server/routes/health.js +70 -2
- package/dist/server/routes/search.js +14 -0
- package/package.json +3 -1
package/dist/core/apply.js
CHANGED
|
@@ -590,7 +590,6 @@ export async function applyToJob(options) {
|
|
|
590
590
|
try {
|
|
591
591
|
// ── 3. Launch persistent browser ──────────────────────────────
|
|
592
592
|
progress('navigating', 'Launching browser with persistent session...');
|
|
593
|
-
// @ts-expect-error rebrowser-playwright types diverge from upstream playwright-core
|
|
594
593
|
context = await stealthChromium.launchPersistentContext(sessionDir, {
|
|
595
594
|
headless: false, // visible so user can monitor (or log in on first run)
|
|
596
595
|
viewport: { width: 1440, height: 900 },
|
package/dist/core/pipeline.js
CHANGED
|
@@ -1217,52 +1217,6 @@ export async function finalize(ctx) {
|
|
|
1217
1217
|
log.error('Change tracking failed:', error);
|
|
1218
1218
|
}
|
|
1219
1219
|
}
|
|
1220
|
-
// ── Auto-escalation: retry thin content with browser rendering ──────────────
|
|
1221
|
-
// If simple fetch returned very little content and user didn't explicitly disable render,
|
|
1222
|
-
// automatically retry with browser rendering to handle JS-heavy/paywalled sites.
|
|
1223
|
-
const preEscalationWords = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1224
|
-
const escalationFetchMethod = fetchResult?.method || 'unknown';
|
|
1225
|
-
const alreadyTriedBrowser = escalationFetchMethod === 'browser' || escalationFetchMethod === 'stealth'
|
|
1226
|
-
|| options.render || options.stealth;
|
|
1227
|
-
const userDisabledRender = options.render === false;
|
|
1228
|
-
const escalationCandidate = preEscalationWords < 200 && preEscalationWords > 0
|
|
1229
|
-
&& escalationFetchMethod === 'simple' && !alreadyTriedBrowser && !userDisabledRender
|
|
1230
|
-
&& !ctx._escalated;
|
|
1231
|
-
if (escalationCandidate) {
|
|
1232
|
-
log.info(`thin content (${preEscalationWords}w) from simple fetch, auto-escalating to browser render for ${ctx.url}`);
|
|
1233
|
-
ctx._escalated = true;
|
|
1234
|
-
try {
|
|
1235
|
-
const { smartFetch } = await import('./strategies.js');
|
|
1236
|
-
const browserResult = await smartFetch(ctx.url, {
|
|
1237
|
-
forceBrowser: true,
|
|
1238
|
-
stealth: false,
|
|
1239
|
-
timeoutMs: options.timeout || 15000,
|
|
1240
|
-
proxy: options.proxy,
|
|
1241
|
-
});
|
|
1242
|
-
if (browserResult.html && browserResult.html.length > (fetchResult?.html?.length || 0)) {
|
|
1243
|
-
const { htmlToMarkdown } = await import('./markdown.js');
|
|
1244
|
-
const browserContent = htmlToMarkdown(browserResult.html);
|
|
1245
|
-
const browserWords = browserContent.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1246
|
-
if (browserWords > preEscalationWords) {
|
|
1247
|
-
log.info(`browser escalation improved content: ${preEscalationWords}w → ${browserWords}w`);
|
|
1248
|
-
ctx.content = browserContent;
|
|
1249
|
-
ctx.fetchResult = browserResult;
|
|
1250
|
-
ctx.fetchResult.method = 'browser-escalation';
|
|
1251
|
-
}
|
|
1252
|
-
else {
|
|
1253
|
-
log.debug(`browser escalation did not improve (${browserWords}w vs ${preEscalationWords}w)`);
|
|
1254
|
-
}
|
|
1255
|
-
// Always clean up browser resources
|
|
1256
|
-
if (browserResult.page)
|
|
1257
|
-
await browserResult.page.close().catch(() => { });
|
|
1258
|
-
if (browserResult.browser)
|
|
1259
|
-
await browserResult.browser.close().catch(() => { });
|
|
1260
|
-
}
|
|
1261
|
-
}
|
|
1262
|
-
catch (e) {
|
|
1263
|
-
log.debug('browser escalation failed:', e instanceof Error ? e.message : e);
|
|
1264
|
-
}
|
|
1265
|
-
}
|
|
1266
1220
|
// Generate AI summary if requested
|
|
1267
1221
|
if (options.summary && options.llm) {
|
|
1268
1222
|
try {
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Source credibility scoring — lightweight, zero dependencies.
|
|
3
|
+
*
|
|
4
|
+
* Classifies URLs by trustworthiness:
|
|
5
|
+
* - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
|
|
6
|
+
* - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
|
|
7
|
+
* - General (★): Everything else
|
|
8
|
+
*/
|
|
9
|
+
export interface SourceCredibility {
|
|
10
|
+
tier: 'official' | 'verified' | 'general';
|
|
11
|
+
stars: number;
|
|
12
|
+
label: string;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Assess the credibility of a source URL.
|
|
16
|
+
*/
|
|
17
|
+
export declare function getSourceCredibility(url: string): SourceCredibility;
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Source credibility scoring — lightweight, zero dependencies.
|
|
3
|
+
*
|
|
4
|
+
* Classifies URLs by trustworthiness:
|
|
5
|
+
* - Official (★★★): .gov, .edu, .mil, WHO, NIH, academic journals
|
|
6
|
+
* - Verified (★★): Wikipedia, Reuters, BBC, GitHub, StackOverflow
|
|
7
|
+
* - General (★): Everything else
|
|
8
|
+
*/
|
|
9
|
+
/** Official TLDs and hostnames that indicate high-authority sources */
|
|
10
|
+
const OFFICIAL_TLDS = new Set(['.gov', '.edu', '.mil']);
|
|
11
|
+
const OFFICIAL_HOSTNAMES = new Set([
|
|
12
|
+
// Academic / research
|
|
13
|
+
'arxiv.org', 'scholar.google.com', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
|
|
14
|
+
'jstor.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org', 'bmj.com',
|
|
15
|
+
'thelancet.com', 'plos.org', 'springer.com', 'elsevier.com',
|
|
16
|
+
// International organisations
|
|
17
|
+
'who.int', 'un.org', 'worldbank.org', 'imf.org', 'oecd.org', 'europa.eu',
|
|
18
|
+
// Official tech documentation
|
|
19
|
+
'docs.python.org', 'developer.mozilla.org', 'nodejs.org', 'rust-lang.org',
|
|
20
|
+
'docs.microsoft.com', 'learn.microsoft.com', 'developer.apple.com',
|
|
21
|
+
'developer.android.com', 'php.net', 'ruby-lang.org', 'golang.org', 'go.dev',
|
|
22
|
+
// Health / medicine
|
|
23
|
+
'cdc.gov', 'nih.gov', 'fda.gov', 'mayoclinic.org', 'clevelandclinic.org',
|
|
24
|
+
'webmd.com', 'medlineplus.gov',
|
|
25
|
+
// Standards / specs
|
|
26
|
+
'w3.org', 'ietf.org', 'rfc-editor.org', 'iso.org',
|
|
27
|
+
]);
|
|
28
|
+
const VERIFIED_HOSTNAMES = new Set([
|
|
29
|
+
// Encyclopaedia / reference
|
|
30
|
+
'wikipedia.org', 'en.wikipedia.org', 'britannica.com',
|
|
31
|
+
// Reputable news agencies
|
|
32
|
+
'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'nytimes.com',
|
|
33
|
+
'washingtonpost.com', 'theguardian.com', 'economist.com', 'ft.com',
|
|
34
|
+
'cnn.com', 'npr.org', 'pbs.org',
|
|
35
|
+
// Developer resources
|
|
36
|
+
'github.com', 'stackoverflow.com', 'npmjs.com', 'pypi.org',
|
|
37
|
+
'crates.io', 'docs.rs', 'packagist.org', 'rubygems.org',
|
|
38
|
+
// Official cloud / vendor docs
|
|
39
|
+
'docs.aws.amazon.com', 'cloud.google.com', 'docs.github.com',
|
|
40
|
+
'azure.microsoft.com', 'registry.terraform.io',
|
|
41
|
+
// Reputable tech publications
|
|
42
|
+
'arstechnica.com', 'wired.com', 'techcrunch.com', 'theverge.com',
|
|
43
|
+
// National Geographic, Smithsonian
|
|
44
|
+
'nationalgeographic.com', 'smithsonianmag.com',
|
|
45
|
+
]);
|
|
46
|
+
/**
|
|
47
|
+
* Assess the credibility of a source URL.
|
|
48
|
+
*/
|
|
49
|
+
export function getSourceCredibility(url) {
|
|
50
|
+
try {
|
|
51
|
+
const hostname = new URL(url).hostname.toLowerCase().replace(/^www\./, '');
|
|
52
|
+
// Check official TLDs
|
|
53
|
+
for (const tld of OFFICIAL_TLDS) {
|
|
54
|
+
if (hostname.endsWith(tld)) {
|
|
55
|
+
return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
// Check known official hostnames
|
|
59
|
+
if (OFFICIAL_HOSTNAMES.has(hostname)) {
|
|
60
|
+
return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
|
|
61
|
+
}
|
|
62
|
+
// Check parent domain (e.g. en.wikipedia.org → wikipedia.org)
|
|
63
|
+
const parts = hostname.split('.');
|
|
64
|
+
if (parts.length > 2) {
|
|
65
|
+
const parentDomain = parts.slice(-2).join('.');
|
|
66
|
+
if (OFFICIAL_HOSTNAMES.has(parentDomain)) {
|
|
67
|
+
return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
|
|
68
|
+
}
|
|
69
|
+
if (VERIFIED_HOSTNAMES.has(parentDomain)) {
|
|
70
|
+
return { tier: 'verified', stars: 2, label: 'VERIFIED' };
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
// Check known verified hostnames
|
|
74
|
+
if (VERIFIED_HOSTNAMES.has(hostname)) {
|
|
75
|
+
return { tier: 'verified', stars: 2, label: 'VERIFIED' };
|
|
76
|
+
}
|
|
77
|
+
// Everything else
|
|
78
|
+
return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
|
|
79
|
+
}
|
|
80
|
+
catch {
|
|
81
|
+
return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
|
|
82
|
+
}
|
|
83
|
+
}
|
package/dist/server/app.js
CHANGED
|
@@ -43,6 +43,7 @@ import { createPlaygroundRouter } from './routes/playground.js';
|
|
|
43
43
|
import { createReaderRouter } from './routes/reader.js';
|
|
44
44
|
import { createSharePublicRouter, createShareRouter } from './routes/share.js';
|
|
45
45
|
import { createJobQueue } from './job-queue.js';
|
|
46
|
+
import { createQueueFetchRouter } from './routes/fetch-queue.js';
|
|
46
47
|
import { createCompatRouter } from './routes/compat.js';
|
|
47
48
|
import { createCrawlRouter } from './routes/crawl.js';
|
|
48
49
|
import { createMapRouter } from './routes/map.js';
|
|
@@ -229,7 +230,8 @@ export function createApp(config = {}) {
|
|
|
229
230
|
}, 5 * 60 * 1000);
|
|
230
231
|
// Health check MUST be before auth/rate-limit middleware
|
|
231
232
|
// Render hits /health every ~30s; rate-limiting it causes 429 → service marked as failed
|
|
232
|
-
|
|
233
|
+
// Pass pool so /ready can check DB connectivity
|
|
234
|
+
app.use(createHealthRouter(pool));
|
|
233
235
|
// OpenAPI spec — public, no auth required
|
|
234
236
|
app.get('/openapi.yaml', (_req, res) => {
|
|
235
237
|
res.setHeader('Content-Type', 'application/yaml; charset=utf-8');
|
|
@@ -287,7 +289,15 @@ export function createApp(config = {}) {
|
|
|
287
289
|
app.use(createWatchRouter(pool));
|
|
288
290
|
}
|
|
289
291
|
// /v1/fetch, /v1/search — all scopes allowed, no guard needed
|
|
290
|
-
|
|
292
|
+
// In queue mode (API_MODE=queue), /v1/fetch and /v1/render are replaced by
|
|
293
|
+
// queue-backed endpoints that enqueue Bull jobs and return { jobId, status }.
|
|
294
|
+
// GET /v1/jobs/:id is also provided by the queue router for result polling.
|
|
295
|
+
if (process.env.API_MODE === 'queue') {
|
|
296
|
+
app.use(createQueueFetchRouter());
|
|
297
|
+
}
|
|
298
|
+
else {
|
|
299
|
+
app.use(createFetchRouter(authStore));
|
|
300
|
+
}
|
|
291
301
|
// /v1/screenshot — full or read only (router uses absolute paths, guard before router)
|
|
292
302
|
app.use('/v1/screenshot', requireScope('full', 'read'));
|
|
293
303
|
app.use(createScreenshotRouter(authStore));
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bull queue setup for WebPeel microservices.
|
|
3
|
+
*
|
|
4
|
+
* Used by:
|
|
5
|
+
* - API container (API_MODE=queue) — to enqueue fetch/render jobs
|
|
6
|
+
* - Worker container (WORKER_MODE=1) — to process jobs and write results
|
|
7
|
+
*
|
|
8
|
+
* Queue names:
|
|
9
|
+
* - "webpeel:fetch" — HTTP-only fetches (no browser)
|
|
10
|
+
* - "webpeel:render" — Browser/Playwright fetches (render=true)
|
|
11
|
+
*
|
|
12
|
+
* Job result format stored in Redis:
|
|
13
|
+
* key: webpeel:result:<jobId>
|
|
14
|
+
* value: JSON string of { status, result?, error? }
|
|
15
|
+
* TTL: 24h
|
|
16
|
+
*
|
|
17
|
+
* No secrets in code. All config via env vars:
|
|
18
|
+
* REDIS_URL — e.g. redis://redis:6379 (default)
|
|
19
|
+
* REDIS_PASSWORD — optional password
|
|
20
|
+
*/
|
|
21
|
+
import Bull from 'bull';
|
|
22
|
+
export interface FetchJobPayload {
|
|
23
|
+
jobId: string;
|
|
24
|
+
url: string;
|
|
25
|
+
format?: 'markdown' | 'text' | 'html' | 'clean';
|
|
26
|
+
render?: boolean;
|
|
27
|
+
wait?: number;
|
|
28
|
+
maxTokens?: number;
|
|
29
|
+
budget?: number;
|
|
30
|
+
stealth?: boolean;
|
|
31
|
+
screenshot?: boolean;
|
|
32
|
+
fullPage?: boolean;
|
|
33
|
+
selector?: string;
|
|
34
|
+
exclude?: string[];
|
|
35
|
+
includeTags?: string[];
|
|
36
|
+
excludeTags?: string[];
|
|
37
|
+
images?: boolean;
|
|
38
|
+
actions?: any[];
|
|
39
|
+
timeout?: number;
|
|
40
|
+
lite?: boolean;
|
|
41
|
+
raw?: boolean;
|
|
42
|
+
readable?: boolean;
|
|
43
|
+
question?: string;
|
|
44
|
+
userId?: string;
|
|
45
|
+
}
|
|
46
|
+
export declare function getFetchQueue(): Bull.Queue<FetchJobPayload>;
|
|
47
|
+
export declare function getRenderQueue(): Bull.Queue<FetchJobPayload>;
|
|
48
|
+
export declare const RESULT_KEY_PREFIX = "webpeel:result:";
|
|
49
|
+
export declare const RESULT_TTL_SECONDS = 86400;
|
|
50
|
+
export interface JobResult {
|
|
51
|
+
status: 'queued' | 'processing' | 'completed' | 'failed';
|
|
52
|
+
result?: any;
|
|
53
|
+
error?: string;
|
|
54
|
+
startedAt?: string;
|
|
55
|
+
completedAt?: string;
|
|
56
|
+
}
|
|
57
|
+
export declare function closeQueues(): Promise<void>;
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bull queue setup for WebPeel microservices.
|
|
3
|
+
*
|
|
4
|
+
* Used by:
|
|
5
|
+
* - API container (API_MODE=queue) — to enqueue fetch/render jobs
|
|
6
|
+
* - Worker container (WORKER_MODE=1) — to process jobs and write results
|
|
7
|
+
*
|
|
8
|
+
* Queue names:
|
|
9
|
+
* - "webpeel:fetch" — HTTP-only fetches (no browser)
|
|
10
|
+
* - "webpeel:render" — Browser/Playwright fetches (render=true)
|
|
11
|
+
*
|
|
12
|
+
* Job result format stored in Redis:
|
|
13
|
+
* key: webpeel:result:<jobId>
|
|
14
|
+
* value: JSON string of { status, result?, error? }
|
|
15
|
+
* TTL: 24h
|
|
16
|
+
*
|
|
17
|
+
* No secrets in code. All config via env vars:
|
|
18
|
+
* REDIS_URL — e.g. redis://redis:6379 (default)
|
|
19
|
+
* REDIS_PASSWORD — optional password
|
|
20
|
+
*/
|
|
21
|
+
import Bull from 'bull';
|
|
22
|
+
// ─── Redis connection config ─────────────────────────────────────────────────
|
|
23
|
+
function getRedisConfig() {
|
|
24
|
+
const url = process.env.REDIS_URL || 'redis://redis:6379';
|
|
25
|
+
const password = process.env.REDIS_PASSWORD || undefined;
|
|
26
|
+
// Parse the URL to extract host/port (Bull accepts host+port or full URL)
|
|
27
|
+
try {
|
|
28
|
+
const parsed = new URL(url);
|
|
29
|
+
return {
|
|
30
|
+
host: parsed.hostname,
|
|
31
|
+
port: parseInt(parsed.port || '6379', 10),
|
|
32
|
+
password,
|
|
33
|
+
db: parseInt(parsed.pathname?.slice(1) || '0', 10) || 0,
|
|
34
|
+
};
|
|
35
|
+
}
|
|
36
|
+
catch {
|
|
37
|
+
// Fallback defaults
|
|
38
|
+
return {
|
|
39
|
+
host: 'redis',
|
|
40
|
+
port: 6379,
|
|
41
|
+
password,
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
/** Per-job defaults applied to everything enqueued on either queue. */
const defaultJobOptions = {
    // Up to three tries, with exponential backoff starting from a 2s delay.
    attempts: 3,
    backoff: { type: 'exponential', delay: 2000 },
    // Finished and failed jobs stay in Bull (keep for result lookup).
    removeOnComplete: false,
    removeOnFail: false,
    // 2 min hard timeout per job
    timeout: 120_000,
};
/** Options object handed to every `new Bull(...)` call in this module. */
const sharedOpts = {
    redis: getRedisConfig(),
    defaultJobOptions,
};
|
|
58
|
+
// ─── Queue singletons ────────────────────────────────────────────────────────
|
|
59
|
+
// Lazily created queue instances; null until first requested (and again after closeQueues()).
let _fetchQueue = null;
let _renderQueue = null;
/**
 * Return the process-wide "webpeel:fetch" queue, creating it on first use.
 */
export function getFetchQueue() {
    _fetchQueue = _fetchQueue || new Bull('webpeel:fetch', sharedOpts);
    return _fetchQueue;
}
/**
 * Return the process-wide "webpeel:render" queue, creating it on first use.
 */
export function getRenderQueue() {
    _renderQueue = _renderQueue || new Bull('webpeel:render', sharedOpts);
    return _renderQueue;
}
|
|
73
|
+
// ─── Result helpers (Redis key = webpeel:result:<jobId>) ─────────────────────
|
|
74
|
+
export const RESULT_KEY_PREFIX = 'webpeel:result:';
|
|
75
|
+
export const RESULT_TTL_SECONDS = 86_400; // 24 hours
|
|
76
|
+
// ─── Graceful teardown ───────────────────────────────────────────────────────
|
|
77
|
+
/**
 * Close every queue that has been created and reset the singletons.
 *
 * The singleton references are cleared before closing starts, so a failing
 * `close()` can no longer leave a dead queue cached for later callers of
 * getFetchQueue()/getRenderQueue(). All closes are awaited even if one of
 * them rejects; the first rejection is then re-thrown.
 *
 * @throws whatever the first failing `queue.close()` rejected with.
 */
export async function closeQueues() {
    const openQueues = [_fetchQueue, _renderQueue].filter((queue) => Boolean(queue));
    _fetchQueue = null;
    _renderQueue = null;
    const outcomes = await Promise.allSettled(openQueues.map((queue) => queue.close()));
    for (const outcome of outcomes) {
        if (outcome.status === 'rejected') {
            throw outcome.reason;
        }
    }
}
|
|
@@ -20,7 +20,7 @@ export function createAuthMiddleware(authStore) {
|
|
|
20
20
|
const apiKeyHeader = req.headers['x-api-key'];
|
|
21
21
|
// SECURITY: Skip API key auth for public/JWT-protected endpoints
|
|
22
22
|
// These routes either need no auth or use their own JWT middleware
|
|
23
|
-
const isPublicEndpoint = req.path === '/health' ||
|
|
23
|
+
const isPublicEndpoint = req.path === '/health' || req.path === '/ready' ||
|
|
24
24
|
req.path.startsWith('/v1/auth/') ||
|
|
25
25
|
req.path === '/v1/webhooks/stripe' ||
|
|
26
26
|
req.path === '/v1/me' ||
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Queue-backed /v1/fetch and /v1/render endpoints.
|
|
3
|
+
*
|
|
4
|
+
* Used when API_MODE=queue (microservices mode).
|
|
5
|
+
* Instead of calling peel() directly, jobs are enqueued in Bull
|
|
6
|
+
* and results are polled from Redis via GET /v1/jobs/:id.
|
|
7
|
+
*
|
|
8
|
+
* POST /v1/fetch → enqueue in webpeel:fetch queue → return { jobId, status }
|
|
9
|
+
* POST /v1/render → enqueue in webpeel:render queue → return { jobId, status }
|
|
10
|
+
* GET /v1/jobs/:id → return job status + result from Redis
|
|
11
|
+
*/
|
|
12
|
+
import { Router } from 'express';
|
|
13
|
+
export declare function createQueueFetchRouter(): Router;
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Queue-backed /v1/fetch and /v1/render endpoints.
|
|
3
|
+
*
|
|
4
|
+
* Used when API_MODE=queue (microservices mode).
|
|
5
|
+
* Instead of calling peel() directly, jobs are enqueued in Bull
|
|
6
|
+
* and results are polled from Redis via GET /v1/jobs/:id.
|
|
7
|
+
*
|
|
8
|
+
* POST /v1/fetch → enqueue in webpeel:fetch queue → return { jobId, status }
|
|
9
|
+
* POST /v1/render → enqueue in webpeel:render queue → return { jobId, status }
|
|
10
|
+
* GET /v1/jobs/:id → return job status + result from Redis
|
|
11
|
+
*/
|
|
12
|
+
import { Router } from 'express';
|
|
13
|
+
import { randomUUID } from 'crypto';
|
|
14
|
+
import { validateUrlForSSRF, SSRFError } from '../middleware/url-validator.js';
|
|
15
|
+
import { getFetchQueue, getRenderQueue, RESULT_KEY_PREFIX, } from '../bull-queues.js';
|
|
16
|
+
// @ts-ignore — ioredis CJS/ESM interop
|
|
17
|
+
import IoRedisModule from "ioredis";
|
|
18
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
19
|
+
const IoRedis = IoRedisModule.default ?? IoRedisModule;
|
|
20
|
+
// ─── Redis client for result reads ──────────────────────────────────────────
|
|
21
|
+
/**
 * Create the ioredis client used to read and write job result records.
 *
 * Connection settings come from REDIS_URL (default `redis://redis:6379`) and
 * REDIS_PASSWORD. REDIS_PASSWORD wins when set; otherwise a password embedded
 * in the URL is used, and a `rediss://` URL enables TLS. If REDIS_URL cannot
 * be parsed the client targets `redis:6379`.
 *
 * The client is created with `lazyConnect`, so no connection is opened here.
 */
function buildRedisClient() {
    const url = process.env.REDIS_URL || 'redis://redis:6379';
    const envPassword = process.env.REDIS_PASSWORD || undefined;
    let connection;
    try {
        const parsed = new URL(url);
        // URL keeps credentials percent-encoded; decode, but tolerate a stray "%".
        let urlPassword;
        if (parsed.password) {
            try {
                urlPassword = decodeURIComponent(parsed.password);
            }
            catch {
                urlPassword = parsed.password;
            }
        }
        connection = {
            host: parsed.hostname,
            port: parseInt(parsed.port || '6379', 10),
            password: envPassword ?? urlPassword,
            // Path "/2" selects database 2; anything non-numeric falls back to 0.
            db: parseInt(parsed.pathname?.slice(1) || '0', 10) || 0,
            ...(parsed.protocol === 'rediss:' ? { tls: {} } : {}),
        };
    }
    catch {
        connection = { host: 'redis', port: 6379, password: envPassword };
    }
    return new IoRedis({ ...connection, lazyConnect: true, maxRetriesPerRequest: 3 });
}
|
|
39
|
+
// Module-wide Redis client, built the first time getRedis() is called.
let _redis = null;
/** Return the shared Redis client, constructing it on first use. */
function getRedis() {
    if (_redis) {
        return _redis;
    }
    _redis = buildRedisClient();
    return _redis;
}
|
|
45
|
+
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
46
|
+
/**
 * Load the stored record for a job from Redis.
 *
 * @param jobId Job identifier; the record lives under RESULT_KEY_PREFIX + jobId.
 * @returns The parsed record, or null when the key is missing/empty or its
 *          value is not valid JSON. Redis errors propagate to the caller.
 */
async function getJobResult(jobId) {
    const key = `${RESULT_KEY_PREFIX}${jobId}`;
    const serialized = await getRedis().get(key);
    if (serialized) {
        try {
            return JSON.parse(serialized);
        }
        catch {
            // Corrupt payload — report it the same way as a missing record.
        }
    }
    return null;
}
|
|
57
|
+
/**
 * Validate the `url` field of an incoming request.
 *
 * On any failure a 400 JSON error is written to `res` and null is returned,
 * so the caller only has to stop when it gets null back.
 *
 * @param url       Candidate value taken from the request body.
 * @param res       Express response used to send the 400 error.
 * @param requestId Correlation id echoed in the error payload.
 * @returns The URL unchanged when it is acceptable, otherwise null.
 * @throws Any error raised by validateUrlForSSRF that is not an SSRFError.
 */
function validateUrl(url, res, requestId) {
    // Every rejection has the same envelope; only the `error` object differs.
    const reject = (error) => {
        res.status(400).json({ success: false, error, requestId });
        return null;
    };
    if (typeof url !== 'string' || !url) {
        return reject({
            type: 'invalid_request',
            message: 'Missing or invalid "url" parameter.',
            hint: 'Send JSON: { "url": "https://example.com" }',
            docs: 'https://webpeel.dev/docs/api-reference#fetch',
        });
    }
    if (url.length > 2048) {
        return reject({ type: 'invalid_url', message: 'URL too long (max 2048 characters)' });
    }
    let parsable = true;
    try {
        new URL(url);
    }
    catch {
        parsable = false;
    }
    if (!parsable) {
        return reject({
            type: 'invalid_url',
            message: 'Invalid URL format',
            hint: 'Ensure the URL includes a scheme (https://) and a valid hostname',
            docs: 'https://webpeel.dev/docs/api-reference#fetch',
        });
    }
    try {
        validateUrlForSSRF(url);
    }
    catch (e) {
        if (!(e instanceof SSRFError)) {
            throw e;
        }
        return reject({
            type: 'forbidden_url',
            message: 'Cannot fetch localhost, private networks, or non-HTTP URLs',
        });
    }
    return url;
}
|
|
114
|
+
// ─── Router factory ──────────────────────────────────────────────────────────
|
|
115
|
+
/**
 * Build the Express router for queue mode.
 *
 * Routes:
 *   POST /v1/fetch    — enqueue a job on the fetch queue
 *   POST /v1/render   — enqueue a job on the render queue
 *   GET  /v1/jobs/:id — read a job's status/result record from Redis
 *
 * @returns A router exposing the three routes above.
 */
export function createQueueFetchRouter() {
    const router = Router();
    /**
     * Shared handler for POST /v1/fetch and POST /v1/render.
     *
     * Validates the URL, requires an authenticated caller, writes an initial
     * `{ status: 'queued' }` record and enqueues the job. It is invoked
     * fire-and-forget by the route callbacks, so it must never reject: every
     * failure is converted into an HTTP response here.
     *
     * @param req        Express request (body carries the fetch options).
     * @param res        Express response.
     * @param renderMode true → render queue, false → fetch queue.
     */
    async function handleEnqueue(req, res, renderMode) {
        const requestId = req.requestId || randomUUID();
        // Key of a "queued" record that has been written but whose job is not
        // enqueued yet; used to clean up if enqueueing fails.
        let pendingKey = null;
        try {
            const url = validateUrl(req.body?.url, res, requestId);
            if (!url)
                return;
            const userId = req.auth?.keyInfo?.accountId || req.user?.userId;
            if (!userId) {
                res.status(401).json({
                    success: false,
                    error: {
                        type: 'unauthorized',
                        message: 'API key required. Get one free at https://app.webpeel.dev/keys',
                        docs: 'https://webpeel.dev/docs/errors#unauthorized',
                    },
                    requestId,
                });
                return;
            }
            const jobId = randomUUID();
            const payload = {
                jobId,
                url,
                render: renderMode,
                format: req.body.format || 'markdown',
                wait: req.body.wait,
                maxTokens: req.body.maxTokens,
                budget: req.body.budget,
                stealth: req.body.stealth,
                screenshot: req.body.screenshot,
                fullPage: req.body.fullPage,
                selector: req.body.selector,
                exclude: req.body.exclude,
                includeTags: req.body.includeTags,
                excludeTags: req.body.excludeTags,
                images: req.body.images,
                actions: req.body.actions,
                timeout: req.body.timeout,
                lite: req.body.lite,
                raw: req.body.raw,
                readable: req.body.readable,
                question: req.body.question,
                userId,
            };
            // Write initial queued status to Redis immediately so polling works right away.
            // 86_400 s = 24 h expiry for the record.
            pendingKey = `${RESULT_KEY_PREFIX}${jobId}`;
            await getRedis().set(pendingKey, JSON.stringify({ status: 'queued' }), 'EX', 86_400);
            // Enqueue in the appropriate Bull queue
            const queue = renderMode ? getRenderQueue() : getFetchQueue();
            await queue.add(payload, {
                jobId, // use our own UUID as Bull job id for easy lookup
            });
            pendingKey = null;
            res.status(202).json({
                success: true,
                jobId,
                status: 'queued',
                pollUrl: `/v1/jobs/${jobId}`,
            });
        }
        catch (err) {
            console.error(`[fetch-queue] enqueue failed (requestId=${requestId}):`, err);
            if (pendingKey) {
                // The job never reached the queue: drop the record so pollers do
                // not see "queued" for 24h. Best effort — Redis may be the culprit.
                try {
                    await getRedis().del(pendingKey);
                }
                catch {
                    // nothing more we can do; the record expires on its own
                }
            }
            if (!res.headersSent) {
                res.status(500).json({
                    success: false,
                    error: { type: 'internal_error', message: 'Failed to enqueue job' },
                    requestId,
                });
            }
        }
    }
    router.post('/v1/fetch', (req, res) => void handleEnqueue(req, res, false));
    router.post('/v1/render', (req, res) => void handleEnqueue(req, res, true));
    /**
     * GET /v1/jobs/:id — return job status + result (or error).
     * Responds 200 for every stored record, including failed jobs, so pollers
     * inspect `status` rather than the HTTP code; 404 when no record exists.
     *
     * NOTE(review): the lookup is keyed by job id only and this handler does
     * not compare the caller with the job's owner — confirm that is intended.
     */
    router.get('/v1/jobs/:id', async (req, res) => {
        const { id } = req.params;
        const requestId = req.requestId || randomUUID();
        if (!id || typeof id !== 'string') {
            res.status(400).json({
                success: false,
                error: { type: 'invalid_request', message: 'Missing job id' },
                requestId,
            });
            return;
        }
        try {
            const job = await getJobResult(id);
            if (!job) {
                res.status(404).json({
                    success: false,
                    error: {
                        type: 'not_found',
                        message: 'Job not found or expired',
                        hint: 'Jobs expire after 24h. Check the jobId.',
                    },
                    requestId,
                });
                return;
            }
            res.status(200).json({
                success: true,
                jobId: id,
                status: job.status,
                result: job.result,
                error: job.error,
                startedAt: job.startedAt,
                completedAt: job.completedAt,
            });
        }
        catch (err) {
            console.error(`[fetch-queue] job lookup failed (requestId=${requestId}):`, err);
            res.status(500).json({
                success: false,
                error: { type: 'internal_error', message: 'Failed to retrieve job result' },
                requestId,
            });
        }
    });
    return router;
}
|
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Health check
|
|
2
|
+
* Health check endpoints
|
|
3
3
|
* NOTE: This route is mounted BEFORE auth/rate-limit middleware in app.ts
|
|
4
4
|
* so it's never blocked by rate limiting (Render hits it every ~30s).
|
|
5
|
+
*
|
|
6
|
+
* GET /health — liveness probe: always returns 200 if process is alive
|
|
7
|
+
* GET /ready — readiness probe: checks DB + job queue; returns 503 if not ready
|
|
5
8
|
*/
|
|
6
9
|
import { Router } from 'express';
|
|
7
|
-
|
|
10
|
+
import type pg from 'pg';
|
|
11
|
+
export declare function createHealthRouter(pool?: pg.Pool | null): Router;
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Health check
|
|
2
|
+
* Health check endpoints
|
|
3
3
|
* NOTE: This route is mounted BEFORE auth/rate-limit middleware in app.ts
|
|
4
4
|
* so it's never blocked by rate limiting (Render hits it every ~30s).
|
|
5
|
+
*
|
|
6
|
+
* GET /health — liveness probe: always returns 200 if process is alive
|
|
7
|
+
* GET /ready — readiness probe: checks DB + job queue; returns 503 if not ready
|
|
5
8
|
*/
|
|
6
9
|
import { Router } from 'express';
|
|
7
10
|
import { readFileSync } from 'fs';
|
|
@@ -23,8 +26,12 @@ catch {
|
|
|
23
26
|
}
|
|
24
27
|
catch { /* keep 'unknown' */ }
|
|
25
28
|
}
|
|
26
|
-
export function createHealthRouter() {
|
|
29
|
+
export function createHealthRouter(pool) {
|
|
27
30
|
const router = Router();
|
|
31
|
+
// ------------------------------------------------------------------
|
|
32
|
+
// GET /health — liveness probe
|
|
33
|
+
// K8s: if this fails, pod is restarted
|
|
34
|
+
// ------------------------------------------------------------------
|
|
28
35
|
router.get('/health', (_req, res) => {
|
|
29
36
|
const uptime = Math.floor((Date.now() - startTime) / 1000);
|
|
30
37
|
const fetchStats = fetchCache.stats();
|
|
@@ -46,5 +53,66 @@ export function createHealthRouter() {
|
|
|
46
53
|
},
|
|
47
54
|
});
|
|
48
55
|
});
|
|
56
|
+
// ------------------------------------------------------------------
|
|
57
|
+
// GET /ready — readiness probe
|
|
58
|
+
// K8s: if this fails, pod is removed from service endpoints (no traffic)
|
|
59
|
+
// Checks: database connectivity + queue (job table) reachability
|
|
60
|
+
// ------------------------------------------------------------------
|
|
61
|
+
router.get('/ready', async (_req, res) => {
|
|
62
|
+
const checks = {};
|
|
63
|
+
let allOk = true;
|
|
64
|
+
// --- Database check ---
|
|
65
|
+
if (pool) {
|
|
66
|
+
const t0 = Date.now();
|
|
67
|
+
try {
|
|
68
|
+
await pool.query('SELECT 1');
|
|
69
|
+
checks.database = { ok: true, latencyMs: Date.now() - t0 };
|
|
70
|
+
}
|
|
71
|
+
catch (err) {
|
|
72
|
+
checks.database = { ok: false, latencyMs: Date.now() - t0, error: err?.message ?? 'unknown' };
|
|
73
|
+
allOk = false;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
else {
|
|
77
|
+
// No pool configured (in-memory mode / local dev without DATABASE_URL)
|
|
78
|
+
checks.database = { ok: true, latencyMs: 0 };
|
|
79
|
+
}
|
|
80
|
+
// --- Job queue check (probe the jobs table via DATABASE_URL directly) ---
|
|
81
|
+
if (process.env.DATABASE_URL) {
|
|
82
|
+
const t0 = Date.now();
|
|
83
|
+
try {
|
|
84
|
+
// Reuse the same pool if available, or do a lightweight table existence check
|
|
85
|
+
if (pool) {
|
|
86
|
+
await pool.query('SELECT COUNT(*) FROM jobs WHERE status = $1 LIMIT 1', ['queued']);
|
|
87
|
+
checks.queue = { ok: true, latencyMs: Date.now() - t0 };
|
|
88
|
+
}
|
|
89
|
+
else {
|
|
90
|
+
checks.queue = { ok: true, latencyMs: 0 };
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
catch (err) {
|
|
94
|
+
// Table may not exist in early boot; treat as non-fatal
|
|
95
|
+
const msg = err?.message ?? '';
|
|
96
|
+
if (msg.includes('relation "jobs" does not exist')) {
|
|
97
|
+
checks.queue = { ok: true, latencyMs: Date.now() - t0 };
|
|
98
|
+
}
|
|
99
|
+
else {
|
|
100
|
+
checks.queue = { ok: false, latencyMs: Date.now() - t0, error: msg };
|
|
101
|
+
allOk = false;
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
else {
|
|
106
|
+
checks.queue = { ok: true, latencyMs: 0 };
|
|
107
|
+
}
|
|
108
|
+
const status = allOk ? 200 : 503;
|
|
109
|
+
res.status(status).json({
|
|
110
|
+
status: allOk ? 'ready' : 'not_ready',
|
|
111
|
+
version,
|
|
112
|
+
uptime: Math.floor((Date.now() - startTime) / 1000),
|
|
113
|
+
timestamp: new Date().toISOString(),
|
|
114
|
+
checks,
|
|
115
|
+
});
|
|
116
|
+
});
|
|
49
117
|
return router;
|
|
50
118
|
}
|
|
@@ -9,6 +9,7 @@ import { peel } from '../../index.js';
|
|
|
9
9
|
import { simpleFetch } from '../../core/fetcher.js';
|
|
10
10
|
import { searchCache } from '../../core/fetch-cache.js';
|
|
11
11
|
import { getSearchProvider, getBestSearchProvider, } from '../../core/search-provider.js';
|
|
12
|
+
import { getSourceCredibility } from '../../core/source-credibility.js';
|
|
12
13
|
export function createSearchRouter(authStore) {
|
|
13
14
|
const router = Router();
|
|
14
15
|
// LRU cache: 15 minute TTL, max 500 entries, 50MB total size
|
|
@@ -210,6 +211,19 @@ export function createSearchRouter(authStore) {
|
|
|
210
211
|
}
|
|
211
212
|
}
|
|
212
213
|
}
|
|
214
|
+
// Add credibility scores and sort by trustworthiness
|
|
215
|
+
const tierOrder = { official: 0, verified: 1, general: 2 };
|
|
216
|
+
results = results
|
|
217
|
+
.map(r => {
|
|
218
|
+
const cred = getSourceCredibility(r.url);
|
|
219
|
+
return { ...r, credibility: cred };
|
|
220
|
+
})
|
|
221
|
+
.sort((a, b) => {
|
|
222
|
+
const aTier = tierOrder[a.credibility?.tier || 'general'] ?? 2;
|
|
223
|
+
const bTier = tierOrder[b.credibility?.tier || 'general'] ?? 2;
|
|
224
|
+
return aTier - bTier; // Official first, then verified, then general
|
|
225
|
+
})
|
|
226
|
+
.map((r, i) => ({ ...r, rank: i + 1 }));
|
|
213
227
|
data.web = results;
|
|
214
228
|
}
|
|
215
229
|
// Fetch news results (DDG only — Brave news is not supported via HTML scraping)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.53",
|
|
3
|
+
"version": "0.21.57",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|
|
@@ -96,10 +96,12 @@
|
|
|
96
96
|
],
|
|
97
97
|
"dependencies": {
|
|
98
98
|
"@modelcontextprotocol/sdk": "^1.0.4",
|
|
99
|
+
"bull": "^4.16.5",
|
|
99
100
|
"cheerio": "^1.0.0",
|
|
100
101
|
"cloakbrowser": "^0.1.8",
|
|
101
102
|
"commander": "^12.0.0",
|
|
102
103
|
"cycletls": "^2.0.5",
|
|
104
|
+
"ioredis": "^5.10.0",
|
|
103
105
|
"lru-cache": "^11.0.2",
|
|
104
106
|
"mammoth": "^1.11.0",
|
|
105
107
|
"nodemailer": "^8.0.1",
|