crawlforge-mcp-server 3.0.17 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -0
- package/README.md +1 -0
- package/package.json +6 -2
- package/server.js +192 -1277
- package/src/constants/config.js +2 -1
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +230 -32
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/ResearchOrchestrator.js +86 -5
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/core/endpointGuard.js +37 -0
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +295 -0
- package/src/tools/research/deepResearch.js +33 -8
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
package/src/constants/config.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import dotenv from 'dotenv';
|
|
2
2
|
import { fileURLToPath } from 'url';
|
|
3
3
|
import { dirname, join } from 'path';
|
|
4
|
+
import { resolveApiEndpoint } from '../core/endpointGuard.js';
|
|
4
5
|
|
|
5
6
|
// Load environment variables
|
|
6
7
|
const __filename = fileURLToPath(import.meta.url);
|
|
@@ -11,7 +12,7 @@ export const config = {
|
|
|
11
12
|
// CrawlForge API Configuration
|
|
12
13
|
crawlforge: {
|
|
13
14
|
apiKey: process.env.CRAWLFORGE_API_KEY || '',
|
|
14
|
-
apiBaseUrl: process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev'
|
|
15
|
+
apiBaseUrl: resolveApiEndpoint(process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev')
|
|
15
16
|
},
|
|
16
17
|
|
|
17
18
|
// Performance
|
|
@@ -171,49 +171,8 @@ export class ActionExecutor extends EventEmitter {
|
|
|
171
171
|
actualChainConfig = chainConfig;
|
|
172
172
|
}
|
|
173
173
|
|
|
174
|
-
//
|
|
175
|
-
|
|
176
|
-
const actions = Array.isArray(chainConfig) ? chainConfig : actualChainConfig.actions;
|
|
177
|
-
const mockResults = actions.map((action, index) => {
|
|
178
|
-
const baseResult = {
|
|
179
|
-
id: `mock_action_${index}`,
|
|
180
|
-
type: action.type,
|
|
181
|
-
success: true,
|
|
182
|
-
executionTime: 10,
|
|
183
|
-
timestamp: Date.now(),
|
|
184
|
-
description: `Mock ${action.type} action`
|
|
185
|
-
};
|
|
186
|
-
|
|
187
|
-
if (action.type === 'wait') {
|
|
188
|
-
const waitTime = action.duration || action.milliseconds || 100;
|
|
189
|
-
baseResult.result = { waited: waitTime };
|
|
190
|
-
} else if (action.type === 'click') {
|
|
191
|
-
baseResult.result = { selector: action.selector, button: 'left' };
|
|
192
|
-
} else {
|
|
193
|
-
baseResult.result = { mockResult: true };
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
return baseResult;
|
|
197
|
-
});
|
|
198
|
-
|
|
199
|
-
return {
|
|
200
|
-
success: true,
|
|
201
|
-
chainId,
|
|
202
|
-
url,
|
|
203
|
-
executionTime: Date.now() - startTime,
|
|
204
|
-
results: mockResults,
|
|
205
|
-
screenshots: [],
|
|
206
|
-
metadata: {
|
|
207
|
-
userAgent: 'mock-agent',
|
|
208
|
-
viewport: { width: 1280, height: 720 }
|
|
209
|
-
},
|
|
210
|
-
stats: {
|
|
211
|
-
totalActions: mockResults.length,
|
|
212
|
-
successfulActions: mockResults.filter(r => r.success).length,
|
|
213
|
-
failedActions: mockResults.filter(r => !r.success).length
|
|
214
|
-
}
|
|
215
|
-
};
|
|
216
|
-
}
|
|
174
|
+
// (v3.0.19 cleanup) The legacy example.com mock branch was removed — no
|
|
175
|
+
// test depended on it and it short-circuited real validation. See §A3.
|
|
217
176
|
|
|
218
177
|
// Validate chain configuration
|
|
219
178
|
const validatedChain = ActionChainSchema.parse(actualChainConfig);
|
package/src/core/AuthManager.js
CHANGED
|
@@ -6,16 +6,21 @@
|
|
|
6
6
|
// Using native fetch (Node.js 18+)
|
|
7
7
|
import fs from 'fs/promises';
|
|
8
8
|
import path from 'path';
|
|
9
|
+
import { randomUUID } from 'crypto';
|
|
9
10
|
import { isCreatorModeVerified } from './creatorMode.js';
|
|
11
|
+
import { resolveApiEndpoint } from './endpointGuard.js';
|
|
12
|
+
import { logger } from '../utils/Logger.js';
|
|
10
13
|
|
|
11
14
|
class AuthManager {
|
|
12
15
|
constructor() {
|
|
13
|
-
this.apiEndpoint = process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev';
|
|
16
|
+
this.apiEndpoint = resolveApiEndpoint(process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev');
|
|
14
17
|
this.configPath = path.join(process.env.HOME || process.env.USERPROFILE, '.crawlforge', 'config.json');
|
|
18
|
+
this.pendingUsagePath = path.join(process.env.HOME || process.env.USERPROFILE, '.crawlforge', 'pending-usage.json');
|
|
15
19
|
this.config = null;
|
|
16
20
|
this.creditCache = new Map();
|
|
17
21
|
this.lastCreditCheck = null;
|
|
18
|
-
this.
|
|
22
|
+
this.lastSuccessfulCreditCheck = new Map();
|
|
23
|
+
this.CREDIT_CHECK_INTERVAL = 15000;
|
|
19
24
|
this.initialized = false;
|
|
20
25
|
// NOTE: Don't read creator mode in constructor - it's set dynamically in server.js
|
|
21
26
|
}
|
|
@@ -30,17 +35,23 @@ class AuthManager {
|
|
|
30
35
|
|
|
31
36
|
/**
|
|
32
37
|
* Initialize the auth manager and load stored config
|
|
38
|
+
*
|
|
39
|
+
* Audit phase 5: re-validate the stored API key against the backend at startup.
|
|
40
|
+
* If the backend explicitly reports the key as revoked/invalid, we throw —
|
|
41
|
+
* the server must refuse to start rather than silently run with a dead key.
|
|
42
|
+
* Network failures are tolerated (we already have a cached config and the
|
|
43
|
+
* fail-closed credit check from audit phase 2 handles runtime revocation).
|
|
33
44
|
*/
|
|
34
45
|
async initialize() {
|
|
35
46
|
if (this.initialized) return;
|
|
36
|
-
|
|
47
|
+
|
|
37
48
|
// Skip config loading in creator mode
|
|
38
49
|
if (this.isCreatorMode()) {
|
|
39
50
|
console.log('🚀 Creator Mode Active - Unlimited Access Enabled');
|
|
40
51
|
this.initialized = true;
|
|
41
52
|
return;
|
|
42
53
|
}
|
|
43
|
-
|
|
54
|
+
|
|
44
55
|
try {
|
|
45
56
|
await this.loadConfig();
|
|
46
57
|
this.initialized = true;
|
|
@@ -48,6 +59,45 @@ class AuthManager {
|
|
|
48
59
|
console.log('No existing CrawlForge configuration found. Run setup to configure.');
|
|
49
60
|
this.initialized = true;
|
|
50
61
|
}
|
|
62
|
+
|
|
63
|
+
// Phase 5: re-validate cached API key with backend. Refuse to start if revoked.
|
|
64
|
+
if (this.config?.apiKey && process.env.CRAWLFORGE_SKIP_STARTUP_VALIDATION !== 'true') {
|
|
65
|
+
const validation = await this.validateApiKey(this.config.apiKey);
|
|
66
|
+
if (!validation.valid) {
|
|
67
|
+
const lower = (validation.error || '').toLowerCase();
|
|
68
|
+
const isExplicitReject =
|
|
69
|
+
lower.includes('invalid') ||
|
|
70
|
+
lower.includes('revoked') ||
|
|
71
|
+
lower.includes('not found') ||
|
|
72
|
+
lower.includes('expired') ||
|
|
73
|
+
lower.includes('unauthorized');
|
|
74
|
+
if (isExplicitReject) {
|
|
75
|
+
const rejectErr = new Error(
|
|
76
|
+
`CrawlForge API key rejected by backend at startup: ${validation.error}. ` +
|
|
77
|
+
`Run \`npm run setup\` with a current key, or set CRAWLFORGE_SKIP_STARTUP_VALIDATION=true to bypass.`
|
|
78
|
+
);
|
|
79
|
+
logger.error('Startup API key validation rejected by backend', rejectErr, {
|
|
80
|
+
backendError: validation.error
|
|
81
|
+
});
|
|
82
|
+
throw rejectErr;
|
|
83
|
+
}
|
|
84
|
+
// Connection error — tolerate, log, continue. Runtime credit check will fail closed.
|
|
85
|
+
logger.warn('Startup API key validation skipped (backend unreachable)', {
|
|
86
|
+
error: validation.error
|
|
87
|
+
});
|
|
88
|
+
} else {
|
|
89
|
+
logger.info('Startup API key validation OK', {
|
|
90
|
+
userId: validation.userId,
|
|
91
|
+
creditsRemaining: validation.creditsRemaining
|
|
92
|
+
});
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
try {
|
|
97
|
+
await this._flushPendingUsage();
|
|
98
|
+
} catch {
|
|
99
|
+
// Best-effort flush — do not block startup
|
|
100
|
+
}
|
|
51
101
|
}
|
|
52
102
|
|
|
53
103
|
/**
|
|
@@ -192,20 +242,16 @@ class AuthManager {
|
|
|
192
242
|
const data = await response.json();
|
|
193
243
|
this.creditCache.set(this.config.userId, data.creditsRemaining);
|
|
194
244
|
this.lastCreditCheck = now;
|
|
245
|
+
this.lastSuccessfulCreditCheck.set(this.config.userId, now);
|
|
195
246
|
return data.creditsRemaining >= estimatedCredits;
|
|
196
247
|
}
|
|
197
248
|
} catch (error) {
|
|
198
249
|
console.error('Failed to check credits:', error.message);
|
|
199
250
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
// failing closed when there's no cached data (no free usage bypass)
|
|
251
|
+
const lastOk = this.lastSuccessfulCreditCheck.get(this.config.userId) ?? 0;
|
|
252
|
+
const withinGrace = Date.now() - lastOk < 30_000;
|
|
203
253
|
const cached = this.creditCache.get(this.config.userId);
|
|
204
|
-
if (cached !== undefined && cached >= estimatedCredits)
|
|
205
|
-
console.warn('Using cached credits due to network error — will re-verify on next call');
|
|
206
|
-
return true;
|
|
207
|
-
}
|
|
208
|
-
|
|
254
|
+
if (withinGrace && cached !== undefined && cached >= estimatedCredits) return true;
|
|
209
255
|
throw new Error('Unable to verify credits. Please check your connection and try again.');
|
|
210
256
|
}
|
|
211
257
|
}
|
|
@@ -218,39 +264,188 @@ class AuthManager {
|
|
|
218
264
|
if (this.isCreatorMode()) {
|
|
219
265
|
return;
|
|
220
266
|
}
|
|
221
|
-
|
|
267
|
+
|
|
222
268
|
if (!this.config) {
|
|
223
269
|
return; // Silently skip if not configured
|
|
224
270
|
}
|
|
225
271
|
|
|
226
|
-
|
|
227
|
-
const payload = {
|
|
228
|
-
tool,
|
|
229
|
-
creditsUsed,
|
|
230
|
-
requestData,
|
|
231
|
-
responseStatus,
|
|
232
|
-
processingTime,
|
|
233
|
-
timestamp: new Date().toISOString(),
|
|
234
|
-
version: '3.0.3'
|
|
235
|
-
};
|
|
272
|
+
const userId = this.config.userId;
|
|
236
273
|
|
|
274
|
+
// Pre-decrement cache before fetch so network failures still deplete credits
|
|
275
|
+
const cached = this.creditCache.get(userId);
|
|
276
|
+
if (cached !== undefined) {
|
|
277
|
+
this.creditCache.set(userId, Math.max(0, cached - creditsUsed));
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// Audit phase A2: every usage report gets a request ID and idempotency key
|
|
281
|
+
// so retries (in-memory or via pending-usage.json) are safe to replay.
|
|
282
|
+
const requestId = randomUUID();
|
|
283
|
+
const idempotencyKey = randomUUID();
|
|
284
|
+
|
|
285
|
+
const payload = {
|
|
286
|
+
tool,
|
|
287
|
+
creditsUsed,
|
|
288
|
+
requestData,
|
|
289
|
+
responseStatus,
|
|
290
|
+
processingTime,
|
|
291
|
+
timestamp: new Date().toISOString(),
|
|
292
|
+
requestId,
|
|
293
|
+
idempotencyKey,
|
|
294
|
+
version: '3.0.3'
|
|
295
|
+
};
|
|
296
|
+
|
|
297
|
+
try {
|
|
237
298
|
await fetch(`${this.apiEndpoint}/api/v1/usage`, {
|
|
238
299
|
method: 'POST',
|
|
239
300
|
headers: {
|
|
240
301
|
'Content-Type': 'application/json',
|
|
241
|
-
'X-API-Key': this.config.apiKey
|
|
302
|
+
'X-API-Key': this.config.apiKey,
|
|
303
|
+
'Idempotency-Key': idempotencyKey
|
|
242
304
|
},
|
|
243
|
-
body: JSON.stringify(payload)
|
|
305
|
+
body: JSON.stringify(payload),
|
|
306
|
+
signal: AbortSignal.timeout(5000)
|
|
244
307
|
});
|
|
245
308
|
|
|
246
|
-
|
|
247
|
-
const cached = this.creditCache.get(this.config.userId);
|
|
248
|
-
if (cached !== undefined) {
|
|
249
|
-
this.creditCache.set(this.config.userId, Math.max(0, cached - creditsUsed));
|
|
250
|
-
}
|
|
309
|
+
await this._flushPendingUsage();
|
|
251
310
|
} catch (error) {
|
|
252
311
|
// Log but don't throw - usage reporting should not break tool execution
|
|
253
|
-
|
|
312
|
+
logger.error('Failed to report usage; queued for retry', error, {
|
|
313
|
+
tool,
|
|
314
|
+
creditsUsed,
|
|
315
|
+
requestId,
|
|
316
|
+
idempotencyKey
|
|
317
|
+
});
|
|
318
|
+
await this._appendPendingUsage({
|
|
319
|
+
toolName: tool,
|
|
320
|
+
creditsUsed,
|
|
321
|
+
userId,
|
|
322
|
+
timestamp: payload.timestamp,
|
|
323
|
+
requestId,
|
|
324
|
+
idempotencyKey
|
|
325
|
+
});
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
async _appendPendingUsage(entry) {
|
|
330
|
+
try {
|
|
331
|
+
const configDir = path.dirname(this.pendingUsagePath);
|
|
332
|
+
await fs.mkdir(configDir, { recursive: true });
|
|
333
|
+
|
|
334
|
+
let entries = [];
|
|
335
|
+
try {
|
|
336
|
+
const raw = await fs.readFile(this.pendingUsagePath, 'utf-8');
|
|
337
|
+
entries = JSON.parse(raw);
|
|
338
|
+
} catch {
|
|
339
|
+
// File absent or corrupt — start fresh
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
// Audit phase A2: stamp every pending entry with a request ID and idempotency key
|
|
343
|
+
// so the backend (when it ships support) can dedupe, and so we can log dropped
|
|
344
|
+
// entries by ID when the flush retry path fails permanently.
|
|
345
|
+
const stamped = {
|
|
346
|
+
requestId: entry.requestId || randomUUID(),
|
|
347
|
+
idempotencyKey: entry.idempotencyKey || randomUUID(),
|
|
348
|
+
...entry
|
|
349
|
+
};
|
|
350
|
+
|
|
351
|
+
entries.push(stamped);
|
|
352
|
+
|
|
353
|
+
// Cap at 1 MB — drop oldest entries until serialized size fits
|
|
354
|
+
let serialized = JSON.stringify(entries);
|
|
355
|
+
const dropped = [];
|
|
356
|
+
while (serialized.length > 1_048_576 && entries.length > 1) {
|
|
357
|
+
dropped.push(entries.shift());
|
|
358
|
+
serialized = JSON.stringify(entries);
|
|
359
|
+
}
|
|
360
|
+
if (dropped.length > 0) {
|
|
361
|
+
logger.warn('Pending usage queue truncated to 1 MB cap', {
|
|
362
|
+
droppedCount: dropped.length,
|
|
363
|
+
droppedIds: dropped.map(d => d.requestId).filter(Boolean)
|
|
364
|
+
});
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
await fs.writeFile(this.pendingUsagePath, serialized, { mode: 0o600 });
|
|
368
|
+
} catch (error) {
|
|
369
|
+
logger.error('Failed to append pending usage', error, {
|
|
370
|
+
toolName: entry?.toolName,
|
|
371
|
+
requestId: entry?.requestId
|
|
372
|
+
});
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
async _flushPendingUsage() {
|
|
377
|
+
if (!this.config) return;
|
|
378
|
+
|
|
379
|
+
let entries;
|
|
380
|
+
try {
|
|
381
|
+
const raw = await fs.readFile(this.pendingUsagePath, 'utf-8');
|
|
382
|
+
entries = JSON.parse(raw);
|
|
383
|
+
} catch (err) {
|
|
384
|
+
// ENOENT is normal (nothing pending). Anything else is corruption — log it.
|
|
385
|
+
if (err && err.code !== 'ENOENT') {
|
|
386
|
+
logger.warn('Pending usage file unreadable; treating as empty', {
|
|
387
|
+
error: err.message,
|
|
388
|
+
path: this.pendingUsagePath
|
|
389
|
+
});
|
|
390
|
+
}
|
|
391
|
+
return;
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
if (!Array.isArray(entries) || entries.length === 0) return;
|
|
395
|
+
|
|
396
|
+
const remaining = [];
|
|
397
|
+
const flushedIds = [];
|
|
398
|
+
const failedIds = [];
|
|
399
|
+
for (const entry of entries) {
|
|
400
|
+
try {
|
|
401
|
+
const idempotencyKey = entry.idempotencyKey || randomUUID();
|
|
402
|
+
await fetch(`${this.apiEndpoint}/api/v1/usage`, {
|
|
403
|
+
method: 'POST',
|
|
404
|
+
headers: {
|
|
405
|
+
'Content-Type': 'application/json',
|
|
406
|
+
'X-API-Key': this.config.apiKey,
|
|
407
|
+
'Idempotency-Key': idempotencyKey
|
|
408
|
+
},
|
|
409
|
+
body: JSON.stringify({
|
|
410
|
+
tool: entry.toolName,
|
|
411
|
+
creditsUsed: entry.creditsUsed,
|
|
412
|
+
timestamp: entry.timestamp,
|
|
413
|
+
requestId: entry.requestId,
|
|
414
|
+
idempotencyKey,
|
|
415
|
+
version: '3.0.3'
|
|
416
|
+
}),
|
|
417
|
+
signal: AbortSignal.timeout(5000)
|
|
418
|
+
});
|
|
419
|
+
flushedIds.push(entry.requestId);
|
|
420
|
+
} catch (err) {
|
|
421
|
+
failedIds.push(entry.requestId);
|
|
422
|
+
remaining.push(entry);
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
if (flushedIds.length > 0) {
|
|
427
|
+
logger.info('Flushed pending usage entries', {
|
|
428
|
+
count: flushedIds.length,
|
|
429
|
+
requestIds: flushedIds.filter(Boolean)
|
|
430
|
+
});
|
|
431
|
+
}
|
|
432
|
+
if (failedIds.length > 0) {
|
|
433
|
+
logger.warn('Pending usage entries failed to flush; retained for next attempt', {
|
|
434
|
+
count: failedIds.length,
|
|
435
|
+
requestIds: failedIds.filter(Boolean)
|
|
436
|
+
});
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
try {
|
|
440
|
+
if (remaining.length === 0) {
|
|
441
|
+
await fs.unlink(this.pendingUsagePath);
|
|
442
|
+
} else {
|
|
443
|
+
await fs.writeFile(this.pendingUsagePath, JSON.stringify(remaining), { mode: 0o600 });
|
|
444
|
+
}
|
|
445
|
+
} catch (error) {
|
|
446
|
+
logger.error('Failed to update pending usage file', error, {
|
|
447
|
+
path: this.pendingUsagePath
|
|
448
|
+
});
|
|
254
449
|
}
|
|
255
450
|
}
|
|
256
451
|
|
|
@@ -287,7 +482,10 @@ class AuthManager {
|
|
|
287
482
|
track_changes: 3,
|
|
288
483
|
|
|
289
484
|
// Phase 1: LLM-Powered Structured Extraction
|
|
290
|
-
extract_structured: 4
|
|
485
|
+
extract_structured: 4,
|
|
486
|
+
|
|
487
|
+
// Phase C5: Natural-language LLM extraction (external paid API call per invocation)
|
|
488
|
+
extract_with_llm: 5
|
|
291
489
|
};
|
|
292
490
|
|
|
293
491
|
return costs[tool] || 1;
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BrowserContextPool — bounded Playwright browser-context pool.
|
|
3
|
+
*
|
|
4
|
+
* Replaces the unbounded `this.contexts = new Map()` in StealthBrowserManager
|
|
5
|
+
* with a pool that:
|
|
6
|
+
* - enforces a hard cap (MAX_BROWSER_CONTEXTS, default 10)
|
|
7
|
+
* - disposes contexts after N uses (periodicRefreshAfter, default 200)
|
|
8
|
+
* - closes idle contexts after a configurable timeout
|
|
9
|
+
* - maintains a concurrency wait-queue so excess callers fail fast
|
|
10
|
+
* (timeout: waitTimeoutMs, default 10 000 ms) rather than accumulating
|
|
11
|
+
*
|
|
12
|
+
* The Map-compatible surface (get/set/delete/entries/clear/size) lets
|
|
13
|
+
* StealthBrowserManager adopt it with minimal changes.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
const DEFAULT_MAX_CONTEXTS = parseInt(process.env.MAX_BROWSER_CONTEXTS || '10', 10);
|
|
17
|
+
const DEFAULT_PERIODIC_REFRESH_AFTER = 200; // context uses before forced close+relaunch
|
|
18
|
+
const DEFAULT_CLOSE_IDLE_AFTER_MS = 5 * 60 * 1000; // 5 minutes
|
|
19
|
+
const DEFAULT_WAIT_TIMEOUT_MS = 10_000;
|
|
20
|
+
|
|
21
|
+
export class BrowserContextPool {
|
|
22
|
+
/**
|
|
23
|
+
* @param {Object} [opts]
|
|
24
|
+
* @param {number} [opts.maxContexts]
|
|
25
|
+
* @param {number} [opts.periodicRefreshAfter] — max uses per context before disposal
|
|
26
|
+
* @param {number} [opts.closeIdleAfterMs]
|
|
27
|
+
* @param {number} [opts.waitTimeoutMs] — max wait for a free slot; fails fast after
|
|
28
|
+
* @param {Function} [opts.onContextExpired] — async (contextId, contextData) => void
|
|
29
|
+
*/
|
|
30
|
+
constructor(opts = {}) {
|
|
31
|
+
this._maxContexts = opts.maxContexts ?? DEFAULT_MAX_CONTEXTS;
|
|
32
|
+
this._periodicRefreshAfter = opts.periodicRefreshAfter ?? DEFAULT_PERIODIC_REFRESH_AFTER;
|
|
33
|
+
this._closeIdleAfterMs = opts.closeIdleAfterMs ?? DEFAULT_CLOSE_IDLE_AFTER_MS;
|
|
34
|
+
this._waitTimeoutMs = opts.waitTimeoutMs ?? DEFAULT_WAIT_TIMEOUT_MS;
|
|
35
|
+
this._onContextExpired = opts.onContextExpired || null;
|
|
36
|
+
|
|
37
|
+
/** @type {Map<string, { context: any, fingerprint: any, config: any, uses: number, lastUsed: number, created: number }>} */
|
|
38
|
+
this._contexts = new Map();
|
|
39
|
+
|
|
40
|
+
/** Pending callers waiting for a free slot */
|
|
41
|
+
this._waitQueue = [];
|
|
42
|
+
|
|
43
|
+
/** Periodic idle-checker timer */
|
|
44
|
+
this._idleTimer = setInterval(() => this._closeIdleContexts(), Math.min(this._closeIdleAfterMs, 60_000));
|
|
45
|
+
this._idleTimer.unref?.(); // don't prevent process exit
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
// ── Map-compatible surface ──────────────────────────────────────────────────
|
|
49
|
+
|
|
50
|
+
get size() { return this._contexts.size; }
|
|
51
|
+
|
|
52
|
+
get(contextId) { return this._contexts.get(contextId) ?? undefined; }
|
|
53
|
+
|
|
54
|
+
has(contextId) { return this._contexts.has(contextId); }
|
|
55
|
+
|
|
56
|
+
entries() { return this._contexts.entries(); }
|
|
57
|
+
|
|
58
|
+
keys() { return this._contexts.keys(); }
|
|
59
|
+
|
|
60
|
+
values() { return this._contexts.values(); }
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Register a context. Throws if the pool is full and no slot becomes
|
|
64
|
+
* available within waitTimeoutMs.
|
|
65
|
+
*/
|
|
66
|
+
async set(contextId, contextData) {
|
|
67
|
+
if (this._contexts.size >= this._maxContexts) {
|
|
68
|
+
await this._waitForSlot();
|
|
69
|
+
}
|
|
70
|
+
this._contexts.set(contextId, {
|
|
71
|
+
...contextData,
|
|
72
|
+
uses: 0,
|
|
73
|
+
lastUsed: Date.now(),
|
|
74
|
+
created: Date.now()
|
|
75
|
+
});
|
|
76
|
+
return this;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* Synchronous set — for callers that already verified there is capacity.
|
|
81
|
+
* Throws immediately if pool is at capacity.
|
|
82
|
+
*/
|
|
83
|
+
setSync(contextId, contextData) {
|
|
84
|
+
if (this._contexts.size >= this._maxContexts) {
|
|
85
|
+
throw new Error(`BrowserContextPool is at capacity (${this._maxContexts} contexts). Use await pool.set() to wait for a free slot.`);
|
|
86
|
+
}
|
|
87
|
+
this._contexts.set(contextId, {
|
|
88
|
+
...contextData,
|
|
89
|
+
uses: 0,
|
|
90
|
+
lastUsed: Date.now(),
|
|
91
|
+
created: Date.now()
|
|
92
|
+
});
|
|
93
|
+
return this;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
delete(contextId) {
|
|
97
|
+
const deleted = this._contexts.delete(contextId);
|
|
98
|
+
if (deleted) this._notifyWaiter();
|
|
99
|
+
return deleted;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
clear() {
|
|
103
|
+
this._contexts.clear();
|
|
104
|
+
// Drain any waiters with rejections so they don't hang
|
|
105
|
+
const waiters = this._waitQueue.splice(0);
|
|
106
|
+
waiters.forEach(({ reject }) => reject(new Error('BrowserContextPool cleared')));
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// ── Pool-specific API ───────────────────────────────────────────────────────
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* Record a use for the given context.
|
|
113
|
+
* Returns true if the context should be closed and re-created (refresh needed).
|
|
114
|
+
* @param {string} contextId
|
|
115
|
+
*/
|
|
116
|
+
recordUse(contextId) {
|
|
117
|
+
const entry = this._contexts.get(contextId);
|
|
118
|
+
if (!entry) return false;
|
|
119
|
+
entry.uses++;
|
|
120
|
+
entry.lastUsed = Date.now();
|
|
121
|
+
return entry.uses >= this._periodicRefreshAfter;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* Dispose a context (close it + remove from pool).
|
|
126
|
+
* @param {string} contextId
|
|
127
|
+
*/
|
|
128
|
+
async dispose(contextId) {
|
|
129
|
+
const entry = this._contexts.get(contextId);
|
|
130
|
+
if (!entry) return;
|
|
131
|
+
this._contexts.delete(contextId);
|
|
132
|
+
this._notifyWaiter();
|
|
133
|
+
try {
|
|
134
|
+
await entry.context?.close();
|
|
135
|
+
} catch {
|
|
136
|
+
// ignore close errors
|
|
137
|
+
}
|
|
138
|
+
this._onContextExpired?.(contextId, entry);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/**
|
|
142
|
+
* Close all idle contexts (lastUsed > closeIdleAfterMs ago).
|
|
143
|
+
*/
|
|
144
|
+
async _closeIdleContexts() {
|
|
145
|
+
const now = Date.now();
|
|
146
|
+
for (const [contextId, entry] of this._contexts.entries()) {
|
|
147
|
+
if (now - entry.lastUsed > this._closeIdleAfterMs) {
|
|
148
|
+
await this.dispose(contextId);
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
/**
|
|
154
|
+
* Wait until a slot becomes available (or time out).
|
|
155
|
+
*/
|
|
156
|
+
_waitForSlot() {
|
|
157
|
+
return new Promise((resolve, reject) => {
|
|
158
|
+
const timer = setTimeout(() => {
|
|
159
|
+
const idx = this._waitQueue.findIndex(w => w.resolve === resolve);
|
|
160
|
+
if (idx !== -1) this._waitQueue.splice(idx, 1);
|
|
161
|
+
reject(new Error(`BrowserContextPool: timed out waiting for a free context slot after ${this._waitTimeoutMs}ms`));
|
|
162
|
+
}, this._waitTimeoutMs);
|
|
163
|
+
|
|
164
|
+
this._waitQueue.push({ resolve, reject, timer });
|
|
165
|
+
});
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/** Notify the oldest pending waiter that a slot is now free. */
|
|
169
|
+
_notifyWaiter() {
|
|
170
|
+
if (this._waitQueue.length === 0) return;
|
|
171
|
+
const { resolve, timer } = this._waitQueue.shift();
|
|
172
|
+
clearTimeout(timer);
|
|
173
|
+
resolve();
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
/** Destroy the pool — closes all contexts and clears the idle timer. */
|
|
177
|
+
async destroy() {
|
|
178
|
+
clearInterval(this._idleTimer);
|
|
179
|
+
for (const contextId of Array.from(this._contexts.keys())) {
|
|
180
|
+
await this.dispose(contextId);
|
|
181
|
+
}
|
|
182
|
+
const waiters = this._waitQueue.splice(0);
|
|
183
|
+
waiters.forEach(({ reject }) => reject(new Error('BrowserContextPool destroyed')));
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
export default BrowserContextPool;
|
package/src/core/JobManager.js
CHANGED
|
@@ -573,11 +573,13 @@ export class JobManager extends EventEmitter {
|
|
|
573
573
|
* @returns {boolean} Whether job is valid
|
|
574
574
|
*/
|
|
575
575
|
validateJob(job) {
|
|
576
|
-
return
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
576
|
+
return Boolean(
|
|
577
|
+
job &&
|
|
578
|
+
typeof job.id === 'string' &&
|
|
579
|
+
typeof job.type === 'string' &&
|
|
580
|
+
typeof job.status === 'string' &&
|
|
581
|
+
Object.values(this.JOB_STATES).includes(job.status)
|
|
582
|
+
);
|
|
581
583
|
}
|
|
582
584
|
|
|
583
585
|
/**
|