crawlforge-mcp-server 3.4.0 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/README.md +28 -2
  2. package/package.json +6 -4
  3. package/server.js +166 -32
  4. package/src/cli/commands/actions.js +36 -0
  5. package/src/cli/commands/analyze.js +19 -0
  6. package/src/cli/commands/batch.js +45 -0
  7. package/src/cli/commands/crawl.js +30 -0
  8. package/src/cli/commands/extract.js +45 -0
  9. package/src/cli/commands/install-skills.js +46 -0
  10. package/src/cli/commands/llmstxt.js +24 -0
  11. package/src/cli/commands/localize.js +29 -0
  12. package/src/cli/commands/map.js +26 -0
  13. package/src/cli/commands/monitor.js +29 -0
  14. package/src/cli/commands/research.js +26 -0
  15. package/src/cli/commands/scrape.js +37 -0
  16. package/src/cli/commands/search.js +28 -0
  17. package/src/cli/commands/stealth.js +29 -0
  18. package/src/cli/commands/template.js +26 -0
  19. package/src/cli/commands/track.js +24 -0
  20. package/src/cli/commands/uninstall-skills.js +35 -0
  21. package/src/cli/formatter.js +57 -0
  22. package/src/cli/index.js +94 -0
  23. package/src/cli/lib/runTool.js +40 -0
  24. package/src/core/ActionExecutor.js +8 -6
  25. package/src/core/AuthManager.js +103 -3
  26. package/src/core/ChangeTracker.js +34 -0
  27. package/src/core/ElicitationHelper.js +112 -0
  28. package/src/core/JobManager.js +36 -2
  29. package/src/core/LocalizationManager.js +19 -5
  30. package/src/core/PerformanceManager.js +53 -17
  31. package/src/core/ResearchOrchestrator.js +40 -5
  32. package/src/core/SamplingClient.js +191 -0
  33. package/src/core/StealthBrowserManager.js +248 -2
  34. package/src/core/WebhookDispatcher.js +18 -10
  35. package/src/prompts/PromptRegistry.js +199 -0
  36. package/src/resources/ResourceRegistry.js +273 -0
  37. package/src/server/transports/streamableHttp.js +6 -6
  38. package/src/server/withAuth.js +25 -0
  39. package/src/skills/crawlforge-cli.md +157 -0
  40. package/src/skills/crawlforge-mcp.md +80 -0
  41. package/src/skills/crawlforge-research.md +104 -0
  42. package/src/skills/crawlforge-stealth.md +98 -0
  43. package/src/skills/installer.js +141 -0
  44. package/src/tools/advanced/batchScrape/index.js +30 -0
  45. package/src/tools/advanced/batchScrape/schema.js +1 -1
  46. package/src/tools/basic/extractText.js +19 -8
  47. package/src/tools/crawl/crawlDeep.js +27 -0
  48. package/src/tools/extract/extractContent.js +5 -17
  49. package/src/tools/extract/extractStructured.js +8 -0
  50. package/src/tools/extract/extractWithLlm.js +35 -25
  51. package/src/tools/extract/listOllamaModels.js +66 -0
  52. package/src/tools/extract/processDocument.js +7 -1
  53. package/src/tools/extract/summarizeContent.js +17 -0
  54. package/src/tools/research/deepResearch.js +34 -0
  55. package/src/tools/templates/ScrapeTemplateTool.js +68 -0
  56. package/src/tools/templates/TemplateRegistry.js +311 -0
  57. package/src/utils/Logger.js +15 -0
  58. package/src/utils/htmlToMarkdown.js +54 -0
  59. package/src/utils/secretMask.js +86 -0
@@ -0,0 +1,40 @@
1
+ /**
2
+ * runTool.js — Thin wrapper that invokes a tool's execute() method directly
3
+ * and formats the output according to global CLI flags.
4
+ *
5
+ * This intentionally does NOT replicate withAuth credit logic — CLI invocations
6
+ * go through the same AuthManager path as MCP calls when a real API key is set.
7
+ * In creator mode (CRAWLFORGE_CREATOR_SECRET set) credits are skipped automatically.
8
+ */
9
+
10
+ import { formatResult, formatError } from '../formatter.js';
11
+
12
/**
 * Run a tool and print formatted output.
 *
 * A successful result is rendered with formatResult() and written to stdout
 * (nothing is written when the formatter returns an empty value). A failure —
 * either an MCP-style `{ isError: true }` result or an exception thrown while
 * executing/formatting — is rendered with formatError() and written to stderr.
 *
 * @param {object} tool — tool instance with execute(params) method
 * @param {object} params — tool parameters
 * @param {object} cliFlags — { json, pretty, quiet }
 * @param {object} [options]
 * @param {boolean} [options.exitOnError=true] — exit the process with status 1 after reporting a failure
 * @returns {Promise<void>}
 */
export async function runTool(tool, params, cliFlags, options = {}) {
  const { exitOnError = true } = options;

  // Single failure path: print to stderr, then (optionally) exit.
  // process.exit() does not wait for pending stdio writes, so the write is
  // awaited first; otherwise the message can be truncated when stderr is
  // asynchronous (e.g. a pipe on Windows).
  const reportFailure = async (failure) => {
    const text = formatError(failure, cliFlags) + '\n';
    await new Promise((resolve) => {
      process.stderr.write(text, () => resolve());
    });
    if (exitOnError) process.exit(1);
  };

  try {
    const result = await tool.execute(params);

    // MCP-style error response: the tool resolved, but flagged the result as an error.
    if (result && result.isError) {
      await reportFailure(result.content?.[0]?.text ?? 'Tool returned an error');
      return;
    }

    const output = formatResult(result, cliFlags);
    if (output) process.stdout.write(output + '\n');
  } catch (error) {
    await reportFailure(error);
  }
}
@@ -202,13 +202,15 @@ export class ActionExecutor extends EventEmitter {
202
202
  this.activeChains.set(chainId, executionContext);
203
203
  this.emit('chainStarted', executionContext);
204
204
 
205
- // Initialize browser and navigate to page
206
- const page = await this.initializePage(url, browserOptions);
207
- executionContext.page = page;
208
-
205
+ // D2.4: initialize page INSIDE try/finally so it is always closed even on
206
+ // errors thrown between acquisition and the inner try block.
207
+ let page = null;
209
208
  let chainResult;
210
209
 
211
210
  try {
211
+ page = await this.initializePage(url, browserOptions);
212
+ executionContext.page = page;
213
+
212
214
  // Execute chain with potential retries
213
215
  chainResult = await this.executeChainWithRetries(executionContext);
214
216
 
@@ -235,9 +237,9 @@ export class ActionExecutor extends EventEmitter {
235
237
 
236
238
  throw error;
237
239
  } finally {
238
- // Clean up page
240
+ // D2.4: always close page to prevent leaks
239
241
  if (page) {
240
- await page.close();
242
+ try { await page.close(); } catch (_) { /* ignore close errors */ }
241
243
  }
242
244
 
243
245
  // Update execution time
@@ -10,6 +10,15 @@ import { randomUUID } from 'crypto';
10
10
  import { isCreatorModeVerified } from './creatorMode.js';
11
11
  import { resolveApiEndpoint } from './endpointGuard.js';
12
12
  import { logger } from '../utils/Logger.js';
13
// D1.4: Module-level slot for an ElicitationHelper (low-credit warnings).
// Nothing in this block assigns the slot and no import is performed here, so
// the accessor below yields null until other code populates it.
// NOTE(review): AuthManager#setElicitation stores its helper on the instance,
// not in this slot — confirm whether this accessor is still needed.
let _ElicitationHelper = null;

/**
 * Return the module-level ElicitationHelper, or null when none is set.
 * @returns {object|null}
 */
function getElicitationHelper() {
  return _ElicitationHelper || null;
}
13
22
 
14
23
  class AuthManager {
15
24
  constructor() {
@@ -22,9 +31,22 @@ class AuthManager {
22
31
  this.lastSuccessfulCreditCheck = new Map();
23
32
  this.CREDIT_CHECK_INTERVAL = 15000;
24
33
  this.initialized = false;
34
+ // D2.1: simple async mutex to prevent concurrent reportUsage calls from
35
+ // double-decrementing the credit cache before the backend ack arrives.
36
+ this._usageQueue = Promise.resolve();
37
+ // D1.4: Elicitation helper for low-credit warnings
38
+ this._elicitation = null;
25
39
  // NOTE: Don't read creator mode in constructor - it's set dynamically in server.js
26
40
  }
27
41
 
42
+ /**
43
+ * D1.4: Set elicitation helper for low-credit warnings.
44
+ * @param {object} elicitation - ElicitationHelper instance
45
+ */
46
+ setElicitation(elicitation) {
47
+ this._elicitation = elicitation;
48
+ }
49
+
28
50
  /**
29
51
  * Check if running in creator mode (unlimited access, no API required)
30
52
  * Uses module-scoped verified flag from server.js - cannot be bypassed via env vars
@@ -243,6 +265,24 @@ class AuthManager {
243
265
  this.creditCache.set(this.config.userId, data.creditsRemaining);
244
266
  this.lastCreditCheck = now;
245
267
  this.lastSuccessfulCreditCheck.set(this.config.userId, now);
268
+
269
+ // D1.4: If credits are close to running out, elicit confirmation instead of hard-failing
270
+ if (data.creditsRemaining < estimatedCredits) {
271
+ if (this._elicitation) {
272
+ const proceed = await this._elicitation.confirm(
273
+ `Low credits: ${data.creditsRemaining} remaining, this tool needs ~${estimatedCredits}. Proceed anyway?`,
274
+ {
275
+ credits_remaining: data.creditsRemaining,
276
+ credits_needed: estimatedCredits,
277
+ note: 'Top up at https://www.crawlforge.dev/dashboard',
278
+ }
279
+ );
280
+ if (!proceed) return false;
281
+ return true; // user confirmed — let tool attempt it
282
+ }
283
+ return false; // no elicitation — standard hard-fail behavior
284
+ }
285
+
246
286
  return data.creditsRemaining >= estimatedCredits;
247
287
  }
248
288
  } catch (error) {
@@ -269,9 +309,18 @@ class AuthManager {
269
309
  return; // Silently skip if not configured
270
310
  }
271
311
 
312
+ // D2.1: serialize via promise queue so concurrent tool calls do not race
313
+ // on creditCache and double-decrement before the backend ack arrives.
314
+ this._usageQueue = this._usageQueue.then(() =>
315
+ this._reportUsageOnce(tool, creditsUsed, requestData, responseStatus, processingTime)
316
+ );
317
+ return this._usageQueue;
318
+ }
319
+
320
+ async _reportUsageOnce(tool, creditsUsed, requestData = {}, responseStatus = 200, processingTime = 0) {
272
321
  const userId = this.config.userId;
273
322
 
274
- // Pre-decrement cache before fetch so network failures still deplete credits
323
+ // Decrement only inside the serialized task -- no concurrent races
275
324
  const cached = this.creditCache.get(userId);
276
325
  if (cached !== undefined) {
277
326
  this.creditCache.set(userId, Math.max(0, cached - creditsUsed));
@@ -484,13 +533,64 @@ class AuthManager {
484
533
  // Phase 1: LLM-Powered Structured Extraction
485
534
  extract_structured: 4,
486
535
 
487
- // Phase C5: Natural-language LLM extraction (external paid API call per invocation)
488
- extract_with_llm: 5
536
+ // Phase C5: Natural-language LLM extraction (external paid API call per invocation)
537
+ extract_with_llm: 5,
538
+
539
+ // D3.3: Pre-built site templates (1 credit per template scrape)
540
+ scrape_template: 1
489
541
  };
490
542
 
491
543
  return costs[tool] || 1;
492
544
  }
493
545
 
546
+ /**
547
+ * D3.5: Project the cost of calling a tool with given params.
548
+ *
549
+ * Returns a lower-bound estimate. Dynamic tools (deep_research, crawl_deep)
550
+ * have variable costs that depend on runtime behaviour (e.g. how many URLs
551
+ * get fetched). The projection is a MINIMUM — actual cost may be higher.
552
+ * Accuracy caveats are documented in each tool description.
553
+ *
554
+ * @param {string} toolName
555
+ * @param {object} params
556
+ * @returns {{ projected: number, note: string }}
557
+ */
558
+ projectCost(toolName, params) {
559
+ const base = this.getToolCost(toolName);
560
+
561
+ // Override for tools whose cost scales with params
562
+ let projected = base;
563
+ let note = 'Fixed cost per invocation.';
564
+
565
+ switch (toolName) {
566
+ case 'batch_scrape': {
567
+ const urlCount = Array.isArray(params?.urls) ? params.urls.length : 1;
568
+ projected = Math.max(base, Math.ceil(urlCount / 10));
569
+ note = `Estimated from ${urlCount} URLs. Actual may be higher for slow/large pages.`;
570
+ break;
571
+ }
572
+ case 'deep_research': {
573
+ const maxUrls = params?.maxUrls || params?.options?.maxUrls || 20;
574
+ projected = Math.max(base, Math.ceil(maxUrls / 5) + base);
575
+ note = `Lower-bound estimate. deep_research cost grows with source count (${maxUrls} max URLs).`;
576
+ break;
577
+ }
578
+ case 'crawl_deep': {
579
+ const maxPages = params?.maxPages || params?.options?.maxPages || 10;
580
+ projected = Math.max(base, Math.ceil(maxPages / 20) * base);
581
+ note = `Lower-bound estimate. crawl_deep cost grows with page count (${maxPages} max).`;
582
+ break;
583
+ }
584
+ case 'extract_with_llm':
585
+ note = 'Includes external LLM API call cost (not billed in credits, billed by your LLM provider).';
586
+ break;
587
+ default:
588
+ note = 'Fixed cost per invocation.';
589
+ }
590
+
591
+ return { projected, note };
592
+ }
593
+
494
594
  /**
495
595
  * Check if authenticated
496
596
  */
@@ -6,6 +6,7 @@ import crypto from "crypto";
6
6
  */
7
7
 
8
8
  import { createHash } from 'crypto';
9
+ import { Worker } from 'worker_threads';
9
10
  import { z } from 'zod';
10
11
  import { EventEmitter } from 'events';
11
12
  import { load } from 'cheerio';
@@ -828,6 +829,39 @@ export class ChangeTracker extends EventEmitter {
828
829
  .update(content || '')
829
830
  .digest('hex');
830
831
  }
832
+
833
+ /**
834
+ * D2.8: Hash large content (>256KB) off the main thread to avoid event-loop blocking.
835
+ * Falls back to synchronous hashContent for smaller payloads.
836
+ * @param {string} content
837
+ * @returns {Promise<string>}
838
+ */
839
+ async hashContentAsync(content) {
840
+ const THRESHOLD = 256 * 1024; // 256 KB
841
+ const str = content || '';
842
+ if (str.length <= THRESHOLD) {
843
+ return this.hashContent(str);
844
+ }
845
+
846
+ const algorithm = this.options.hashAlgorithm || 'sha256';
847
+ return new Promise((resolve, reject) => {
848
+ const workerCode = `
849
+ const { createHash } = require('crypto');
850
+ const { workerData, parentPort } = require('worker_threads');
851
+ const hash = createHash(workerData.algorithm).update(workerData.content).digest('hex');
852
+ parentPort.postMessage(hash);
853
+ `;
854
+ const worker = new Worker(workerCode, {
855
+ eval: true,
856
+ workerData: { content: str, algorithm }
857
+ });
858
+ worker.once('message', resolve);
859
+ worker.once('error', (err) => {
860
+ // Fallback to sync on worker error
861
+ try { resolve(this.hashContent(str)); } catch (e) { reject(e); }
862
+ });
863
+ });
864
+ }
831
865
 
832
866
  calculateSimilarity(hash1, hash2) {
833
867
  if (hash1 === hash2) return 1;
@@ -0,0 +1,112 @@
1
+ /**
2
+ * ElicitationHelper — MCP Elicitation for CrawlForge
3
+ *
4
+ * Allows tools to request user confirmation or input mid-execution for
5
+ * expensive or ambiguous operations. Falls back gracefully when the
6
+ * MCP client does not support elicitation.
7
+ *
8
+ * MCP Spec 2025-11-25: client/elicit request with requestedSchema
9
+ */
10
+
11
export class ElicitationHelper {
  /**
   * @param {object} options
   * @param {object|null} options.mcpServer - McpServer instance (its `.server` is used to send requests)
   * @param {object|null} options.logger - logger exposing warn(); a no-op logger is used when omitted
   */
  constructor({ mcpServer, logger } = {}) {
    this._mcpServer = mcpServer || null;
    this._logger = logger || { warn: () => {}, info: () => {} };
  }

  /**
   * Whether elicitation can be attempted.
   *
   * True when the underlying server exposes an elicitation method (`elicit`
   * or the MCP SDK's `elicitInput`) and — if the server can report client
   * capabilities — the client declared the `elicitation` capability. When
   * capabilities are not available, the method's presence alone decides.
   * @returns {boolean}
   */
  get supported() {
    const server = this._mcpServer?.server;
    if (!server || !this._elicitMethodName(server)) return false;

    const capabilities = typeof server.getClientCapabilities === 'function'
      ? server.getClientCapabilities()
      : undefined;
    return capabilities ? Boolean(capabilities.elicitation) : true;
  }

  /**
   * Name of the elicitation method available on the server, or null.
   * `elicit` is preferred so servers that already worked keep their behaviour.
   * @param {object} server
   * @returns {'elicit'|'elicitInput'|null}
   */
  _elicitMethodName(server) {
    if (typeof server.elicit === 'function') return 'elicit';
    if (typeof server.elicitInput === 'function') return 'elicitInput';
    return null;
  }

  /**
   * Send one elicitation request through whichever method the server exposes.
   * @param {object} request - { message, requestedSchema }
   * @returns {Promise<object>} the client's elicitation result
   */
  _elicit(request) {
    const server = this._mcpServer.server;
    return server[this._elicitMethodName(server)](request);
  }

  /**
   * True when the result carries an explicit action other than "accept"
   * (i.e. the user declined or cancelled).
   * @param {object} result
   * @returns {boolean}
   */
  _wasRefused(result) {
    return result?.action !== undefined && result.action !== 'accept';
  }

  /**
   * Ask for user confirmation before proceeding with an expensive operation.
   * Returns true if confirmed, and also when elicitation is unsupported or
   * the request itself fails (fail-open, so tools keep working in
   * non-elicitation clients). An explicit decline/cancel returns false.
   *
   * @param {string} message - Human-readable explanation of what requires confirmation
   * @param {object} [details] - Additional context (projected cost, URL count, etc.)
   * @returns {Promise<boolean>} - true = proceed, false = cancel
   */
  async confirm(message, details = {}) {
    if (!this.supported) {
      this._logger.warn('Elicitation not supported by client — proceeding without confirmation', { message });
      return true;
    }

    try {
      const detailLines = Object.entries(details ?? {})
        .map(([k, v]) => ` ${k}: ${v}`)
        .join('\n');
      const fullMessage = detailLines ? `${message}\n\n${detailLines}` : message;

      const result = await this._elicit({
        message: fullMessage,
        requestedSchema: {
          type: 'object',
          properties: {
            confirmed: {
              type: 'boolean',
              title: 'Proceed?',
              description: 'Confirm to proceed with the operation',
            },
          },
          required: ['confirmed'],
        },
      });

      if (this._wasRefused(result)) return false;
      return result?.content?.confirmed === true;
    } catch (err) {
      // err may be a non-Error rejection; never let the logging itself throw.
      this._logger.warn('Elicitation request failed — proceeding without confirmation', { error: err?.message ?? String(err) });
      return true; // fail-open
    }
  }

  /**
   * Ask the user to provide a string value (e.g. missing schema field).
   *
   * Resolution order: the user's value; otherwise `defaultValue` when
   * elicitation is unsupported, fails, or the answer is empty; otherwise null.
   * An explicit decline/cancel always yields null so a refusal is never
   * silently replaced by the default.
   *
   * @param {string} message
   * @param {object} [options]
   * @param {string} [options.fieldName]
   * @param {string} [options.fieldDescription]
   * @param {string} [options.defaultValue]
   * @returns {Promise<string|null>} - The user-provided value, the default, or null
   */
  async requestString(message, { fieldName = 'value', fieldDescription = '', defaultValue } = {}) {
    if (!this.supported) {
      this._logger.warn('Elicitation not supported — using default value', { fieldName, defaultValue });
      return defaultValue || null;
    }

    try {
      const result = await this._elicit({
        message,
        requestedSchema: {
          type: 'object',
          properties: {
            [fieldName]: {
              type: 'string',
              title: fieldName,
              description: fieldDescription,
              ...(defaultValue ? { default: defaultValue } : {}),
            },
          },
          required: [fieldName],
        },
      });

      if (this._wasRefused(result)) return null;
      return result?.content?.[fieldName] || defaultValue || null;
    } catch (err) {
      this._logger.warn('Elicitation request failed', { error: err?.message ?? String(err) });
      return defaultValue || null;
    }
  }
}
@@ -139,6 +139,28 @@ export class JobManager extends EventEmitter {
139
139
  logs: []
140
140
  };
141
141
 
142
+ // D2.6: LRU eviction -- remove oldest completed/failed/cancelled job when at capacity
143
+ if (this.jobs.size >= this.maxJobs) {
144
+ let evicted = false;
145
+ for (const [eid, ejob] of this.jobs) {
146
+ if ([this.JOB_STATES.COMPLETED, this.JOB_STATES.FAILED, this.JOB_STATES.CANCELLED].includes(ejob.status)) {
147
+ this.jobs.delete(eid);
148
+ this.jobsByStatus.get(ejob.status).delete(eid);
149
+ evicted = true;
150
+ break;
151
+ }
152
+ }
153
+ if (!evicted) {
154
+ // All jobs are active -- evict the oldest regardless of state
155
+ const oldestId = this.jobs.keys().next().value;
156
+ const oldest = this.jobs.get(oldestId);
157
+ if (oldest) {
158
+ this.jobs.delete(oldestId);
159
+ this.jobsByStatus.get(oldest.status).delete(oldestId);
160
+ }
161
+ }
162
+ }
163
+
142
164
  // Store job
143
165
  this.jobs.set(jobId, job);
144
166
  this.jobsByStatus.get(this.JOB_STATES.PENDING).add(jobId);
@@ -345,7 +367,17 @@ export class JobManager extends EventEmitter {
345
367
 
346
368
  await this.updateJobStatus(jobId, this.JOB_STATES.CANCELLED);
347
369
  this.emit('jobCancelled', job);
348
-
370
+
371
+ // D2.6: cascade-cancel all jobs that depend on this one
372
+ for (const [depId, depJob] of this.jobs) {
373
+ if (depJob.dependencies && depJob.dependencies.includes(jobId)) {
374
+ if (![this.JOB_STATES.COMPLETED, this.JOB_STATES.FAILED, this.JOB_STATES.CANCELLED].includes(depJob.status)) {
375
+ await this.updateJobStatus(depId, this.JOB_STATES.CANCELLED);
376
+ this.emit('jobCancelled', depJob);
377
+ }
378
+ }
379
+ }
380
+
349
381
  return job;
350
382
  }
351
383
 
@@ -456,8 +488,10 @@ export class JobManager extends EventEmitter {
456
488
  const now = Date.now();
457
489
  const expiredJobs = [];
458
490
 
491
+ // D2.6: expire ALL jobs past their TTL regardless of state (was previously only checking expiresAt)
459
492
  for (const [jobId, job] of this.jobs) {
460
- if (job.expiresAt && now > job.expiresAt) {
493
+ const expiry = job.expiresAt || (job.createdAt + (job.ttl || this.defaultTtl));
494
+ if (now > expiry) {
461
495
  expiredJobs.push(jobId);
462
496
  }
463
497
  }
@@ -157,11 +157,25 @@ export class LocalizationManager extends EventEmitter {
157
157
  };
158
158
 
159
159
  this.currentSettings = { ...this.defaultSettings, ...options };
160
- this.localeCache = new Map();
161
- this.geoLocationCache = new Map();
162
- this.timezoneCache = new Map();
163
- this.proxyCache = new Map();
164
- this.translationCache = new Map();
160
+ // D2.8: cap all caches to prevent unbounded growth under long-lived sessions.
161
+ const MAX_CACHE = parseInt(process.env.LOCALIZATION_CACHE_MAX || '500', 10);
162
// Build a size-capped Map. Only set() is overridden: inserting a NEW key while
// the map already holds `max` entries first drops the oldest-inserted entry.
// Overwriting an existing key never evicts (it cannot grow the map) and keeps
// that key's original position. Reads do not refresh an entry's age, so
// eviction is by insertion order despite the "LRU" name.
const makeLRUMap = (max) => {
  const m = new Map();
  m._max = max;
  const origSet = m.set.bind(m);
  m.set = (k, v) => {
    if (!m.has(k) && m.size >= m._max) {
      m.delete(m.keys().next().value); // evict oldest
    }
    return origSet(k, v);
  };
  return m;
};
174
+ this.localeCache = makeLRUMap(MAX_CACHE);
175
+ this.geoLocationCache = makeLRUMap(MAX_CACHE);
176
+ this.timezoneCache = makeLRUMap(MAX_CACHE);
177
+ this.proxyCache = makeLRUMap(MAX_CACHE);
178
+ this.translationCache = makeLRUMap(MAX_CACHE);
165
179
 
166
180
  // Proxy management
167
181
  this.proxyManager = {
@@ -206,19 +206,40 @@ export class PerformanceManager extends EventEmitter {
206
206
  return this.taskRouting[taskType];
207
207
  }
208
208
 
209
- // Auto-select based on task characteristics
210
- const dataSize = this.getDataSize(data);
211
- const isLargeDataset = dataSize > 10 * 1024 * 1024; // 10MB
212
- const isCpuIntensive = this.isCpuIntensive(data);
213
- const isNetworkOperation = this.isNetworkOperation(taskType);
214
-
215
- if (isLargeDataset && !isCpuIntensive) {
216
- return 'stream';
217
- } else if (isCpuIntensive) {
218
- return 'worker';
219
- } else if (isNetworkOperation) {
220
- return 'connection';
221
- } else {
209
+ // D2.7: route by live queue depth + wait time, not just static heuristics
210
+ try {
211
+ const workerStats = this.workerPool.getStats ? this.workerPool.getStats() : {};
212
+ const connStats = this.connectionPool.getStats ? this.connectionPool.getStats() : {};
213
+ const workerDepth = workerStats.pendingCount || workerStats.queueDepth || 0;
214
+ const connDepth = connStats.pendingCount || connStats.activeConnections || 0;
215
+ const workerAvgWait = workerStats.averageWaitMs || workerStats.avgWaitTime || 0;
216
+ const connAvgWait = connStats.averageWaitMs || connStats.avgWaitTime || 0;
217
+
218
+ const dataSize = this.getDataSize(data);
219
+ const isLargeDataset = dataSize > 10 * 1024 * 1024; // 10MB
220
+ const isCpuIntensive = this.isCpuIntensive(data);
221
+ const isNetworkOperation = this.isNetworkOperation(taskType);
222
+
223
+ if (isNetworkOperation) {
224
+ if (connDepth < 50 && connAvgWait < 2000) return 'connection';
225
+ if (workerDepth < connDepth) return 'worker';
226
+ return 'queue';
227
+ }
228
+ if (isCpuIntensive) {
229
+ if (workerDepth < 20 && workerAvgWait < 5000) return 'worker';
230
+ return 'queue';
231
+ }
232
+ if (isLargeDataset) return 'stream';
233
+ return 'queue';
234
+ } catch (_statsErr) {
235
+ // Stats API unavailable -- fall back to static heuristics
236
+ const dataSize = this.getDataSize(data);
237
+ const isLargeDataset = dataSize > 10 * 1024 * 1024;
238
+ const isCpuIntensive = this.isCpuIntensive(data);
239
+ const isNetworkOperation = this.isNetworkOperation(taskType);
240
+ if (isLargeDataset && !isCpuIntensive) return 'stream';
241
+ if (isCpuIntensive) return 'worker';
242
+ if (isNetworkOperation) return 'connection';
222
243
  return 'queue';
223
244
  }
224
245
  }
@@ -809,16 +830,31 @@ export class PerformanceManager extends EventEmitter {
809
830
  async shutdown() {
810
831
  this.emit('shutdown');
811
832
 
833
+ // D2.7: signal all in-flight tasks to abort via AbortController
834
+ if (this._shutdownController) {
835
+ this._shutdownController.abort();
836
+ }
837
+ this._shutdownController = new AbortController();
838
+
812
839
  // Stop metrics collection
813
840
  if (this.metricsTimer) {
814
841
  clearInterval(this.metricsTimer);
815
842
  }
816
843
 
817
- // Shutdown all components
844
+ // Shutdown all components with a 5-second timeout each
845
+ const shutdownWithTimeout = (component, name) => {
846
+ const timeout = new Promise((_, reject) =>
847
+ setTimeout(() => reject(new Error(`${name} shutdown timed out`)), 5000)
848
+ );
849
+ return Promise.race([component.shutdown(), timeout]).catch(err => {
850
+ console.error(`PerformanceManager: ${err.message}`);
851
+ });
852
+ };
853
+
818
854
  await Promise.all([
819
- this.workerPool.shutdown(),
820
- this.connectionPool.shutdown(),
821
- this.streamProcessor.shutdown()
855
+ shutdownWithTimeout(this.workerPool, 'WorkerPool'),
856
+ shutdownWithTimeout(this.connectionPool, 'ConnectionPool'),
857
+ shutdownWithTimeout(this.streamProcessor, 'StreamProcessor')
822
858
  ]);
823
859
 
824
860
  this.emit('shutdownComplete');
@@ -177,6 +177,8 @@ export class ResearchOrchestrator extends EventEmitter {
177
177
  * Initialize research session state
178
178
  */
179
179
  initializeResearchSession(sessionId, topic, startTime) {
180
+ // D2.3: per-session token budget (approx 4 chars = 1 token, 1M char cap = ~250K tokens)
181
+ const TOKEN_BUDGET_CHARS = parseInt(process.env.RESEARCH_TOKEN_BUDGET_CHARS || String(1_000_000), 10);
180
182
  this.researchState = {
181
183
  sessionId,
182
184
  topic,
@@ -188,7 +190,11 @@ export class ResearchOrchestrator extends EventEmitter {
188
190
  researchFindings: [],
189
191
  credibilityScores: new Map(),
190
192
  conflictMap: new Map(),
191
- activityLog: []
193
+ activityLog: [],
194
+ // D2.3 token budget tracking
195
+ tokenBudgetChars: TOKEN_BUDGET_CHARS,
196
+ tokenBudgetUsed: 0,
197
+ tokenBudgetExceeded: false
192
198
  };
193
199
 
194
200
  // Reset metrics
@@ -461,7 +467,9 @@ export class ResearchOrchestrator extends EventEmitter {
461
467
  const batchPromises = batch.map(async (source) => {
462
468
  try {
463
469
  if (this.researchState.visitedUrls.has(source.link)) {
464
- return null;
470
+ // D2.10: return already-extracted content rather than null,
471
+ // so overlapping query batches can reuse it.
472
+ return this.researchState.extractedContent.get(source.link) || null;
465
473
  }
466
474
 
467
475
  this.researchState.visitedUrls.add(source.link);
@@ -530,6 +538,22 @@ export class ResearchOrchestrator extends EventEmitter {
530
538
  readabilityScore: this.calculateReadabilityScore(contentText)
531
539
  };
532
540
 
541
+ // D2.3: charge content length to token budget (rough 4 chars/token heuristic)
542
+ if (this.researchState.tokenBudgetUsed !== undefined) {
543
+ this.researchState.tokenBudgetUsed += contentText.length;
544
+ if (this.researchState.tokenBudgetUsed > this.researchState.tokenBudgetChars) {
545
+ if (!this.researchState.tokenBudgetExceeded) {
546
+ this.researchState.tokenBudgetExceeded = true;
547
+ this.logger.warn('Research token budget exceeded -- skipping remaining LLM calls', {
548
+ sessionId: this.researchState.sessionId,
549
+ budgetChars: this.researchState.tokenBudgetChars,
550
+ usedChars: this.researchState.tokenBudgetUsed
551
+ });
552
+ }
553
+ this.enableLLMFeatures = false; // disable for remainder of session
554
+ }
555
+ }
556
+
533
557
  // LLM-powered relevance analysis
534
558
  if (this.enableLLMFeatures && topic) {
535
559
  try {
@@ -1085,11 +1109,16 @@ export class ResearchOrchestrator extends EventEmitter {
1085
1109
  }
1086
1110
 
1087
1111
  deduplicateSources(sources) {
1088
- const seen = new Set();
1112
+ // D2.10: use per-session visitedUrls so URLs are deduped across all query batches,
1113
+ // not just within a single gatherInitialSources call.
1114
+ const sessionSeen = this.researchState && this.researchState.visitedUrls
1115
+ ? this.researchState.visitedUrls
1116
+ : new Set();
1117
+ const localSeen = new Set();
1089
1118
  return sources.filter(source => {
1090
1119
  const key = source.link;
1091
- if (seen.has(key)) return false;
1092
- seen.add(key);
1120
+ if (sessionSeen.has(key) || localSeen.has(key)) return false;
1121
+ localSeen.add(key);
1093
1122
  return true;
1094
1123
  });
1095
1124
  }
@@ -1226,6 +1255,12 @@ export class ResearchOrchestrator extends EventEmitter {
1226
1255
  timeLimit: this.timeLimit,
1227
1256
  completedWithinLimit: this.metrics.totalProcessingTime < this.timeLimit
1228
1257
  },
1258
+ // D2.3: cost transparency
1259
+ _cost: {
1260
+ tokenBudgetChars: this.researchState.tokenBudgetChars,
1261
+ tokenBudgetUsed: this.researchState.tokenBudgetUsed,
1262
+ tokenBudgetExceeded: this.researchState.tokenBudgetExceeded
1263
+ },
1229
1264
  metadata: {
1230
1265
  generatedAt: new Date().toISOString(),
1231
1266
  researchDepth: this.researchState.currentDepth,