crawlforge-mcp-server 3.0.17 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/CLAUDE.md +2 -0
  2. package/README.md +1 -0
  3. package/package.json +6 -2
  4. package/server.js +192 -1277
  5. package/src/constants/config.js +2 -1
  6. package/src/core/ActionExecutor.js +2 -43
  7. package/src/core/AuthManager.js +230 -32
  8. package/src/core/BrowserContextPool.js +187 -0
  9. package/src/core/JobManager.js +7 -5
  10. package/src/core/LocalizationManager.js +14 -125
  11. package/src/core/ResearchOrchestrator.js +86 -5
  12. package/src/core/StealthBrowserManager.js +26 -18
  13. package/src/core/cache/CacheManager.js +4 -1
  14. package/src/core/crawlers/BFSCrawler.js +19 -5
  15. package/src/core/endpointGuard.js +37 -0
  16. package/src/observability/metrics.js +137 -0
  17. package/src/observability/tracing.js +74 -0
  18. package/src/server/auth/oauth.js +388 -0
  19. package/src/server/registerTool.js +41 -0
  20. package/src/server/schemas/common.js +29 -0
  21. package/src/server/transports/http.js +22 -0
  22. package/src/server/transports/stdio.js +16 -0
  23. package/src/server/transports/streamableHttp.js +226 -0
  24. package/src/server/withAuth.js +121 -0
  25. package/src/tools/advanced/BatchScrapeTool.js +12 -1086
  26. package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
  27. package/src/tools/advanced/batchScrape/index.js +328 -0
  28. package/src/tools/advanced/batchScrape/queue.js +91 -0
  29. package/src/tools/advanced/batchScrape/reporter.js +26 -0
  30. package/src/tools/advanced/batchScrape/schema.js +37 -0
  31. package/src/tools/advanced/batchScrape/worker.js +179 -0
  32. package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
  33. package/src/tools/basic/_fetch.js +35 -0
  34. package/src/tools/basic/extractLinks.js +74 -0
  35. package/src/tools/basic/extractMetadata.js +74 -0
  36. package/src/tools/basic/extractText.js +46 -0
  37. package/src/tools/basic/fetchUrl.js +44 -0
  38. package/src/tools/basic/scrapeStructured.js +58 -0
  39. package/src/tools/crawl/_sessionContext.js +234 -0
  40. package/src/tools/crawl/crawlDeep.js +55 -5
  41. package/src/tools/crawl/mapSite.js +23 -2
  42. package/src/tools/extract/_fetchAndParse.js +57 -0
  43. package/src/tools/extract/extractStructured.js +3 -19
  44. package/src/tools/extract/extractWithLlm.js +295 -0
  45. package/src/tools/research/deepResearch.js +33 -8
  46. package/src/tools/search/providers/searxng.js +126 -0
  47. package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
  48. package/src/tools/search/ranking/ResultRanker.js +17 -10
  49. package/src/tools/search/ranking/SearchResultCache.js +52 -0
  50. package/src/tools/search/searchWeb.js +112 -6
  51. package/src/tools/tracking/trackChanges/differ.js +98 -0
  52. package/src/tools/tracking/trackChanges/index.js +432 -0
  53. package/src/tools/tracking/trackChanges/monitor.js +93 -0
  54. package/src/tools/tracking/trackChanges/notifier.js +105 -0
  55. package/src/tools/tracking/trackChanges/schema.js +127 -0
  56. package/src/tools/tracking/trackChanges.js +12 -1374
@@ -63,47 +63,11 @@ const LANGUAGE_MAPPINGS = {
63
63
  // RTL Languages Configuration
64
64
  const RTL_LANGUAGES = new Set(['ar', 'he', 'fa', 'ur', 'ku', 'dv']);
65
65
 
66
- // Proxy Provider Configuration
67
- const PROXY_PROVIDERS = {
68
- regions: {
69
- 'us-east': { endpoint: 'proxy-us-east.example.com', port: 8080 },
70
- 'us-west': { endpoint: 'proxy-us-west.example.com', port: 8080 },
71
- 'eu-west': { endpoint: 'proxy-eu-west.example.com', port: 8080 },
72
- 'eu-central': { endpoint: 'proxy-eu-central.example.com', port: 8080 },
73
- 'eu-north': { endpoint: 'proxy-eu-north.example.com', port: 8080 },
74
- 'eu-east': { endpoint: 'proxy-eu-east.example.com', port: 8080 },
75
- 'asia-pacific': { endpoint: 'proxy-asia-pacific.example.com', port: 8080 },
76
- 'middle-east': { endpoint: 'proxy-middle-east.example.com', port: 8080 },
77
- 'south-america': { endpoint: 'proxy-south-america.example.com', port: 8080 },
78
- 'north-america': { endpoint: 'proxy-north-america.example.com', port: 8080 },
79
- 'africa': { endpoint: 'proxy-africa.example.com', port: 8080 }
80
- },
81
- fallbackStrategies: {
82
- 'geo-blocked': ['rotate-proxy', 'change-user-agent', 'delay-request'],
83
- 'rate-limited': ['change-proxy', 'exponential-backoff'],
84
- 'detection': ['rotate-fingerprint', 'change-proxy', 'human-delay']
85
- }
86
- };
87
-
88
- // Translation Service Configuration
89
- const TRANSLATION_SERVICES = {
90
- google: {
91
- enabled: process.env.GOOGLE_TRANSLATE_API_KEY ? true : false,
92
- apiKey: process.env.GOOGLE_TRANSLATE_API_KEY,
93
- endpoint: 'https://translation.googleapis.com/language/translate/v2'
94
- },
95
- azure: {
96
- enabled: process.env.AZURE_TRANSLATE_KEY ? true : false,
97
- key: process.env.AZURE_TRANSLATE_KEY,
98
- region: process.env.AZURE_TRANSLATE_REGION || 'global',
99
- endpoint: 'https://api.cognitive.microsofttranslator.com/translate'
100
- },
101
- libre: {
102
- enabled: process.env.LIBRE_TRANSLATE_URL ? true : false,
103
- url: process.env.LIBRE_TRANSLATE_URL,
104
- apiKey: process.env.LIBRE_TRANSLATE_API_KEY
105
- }
106
- };
66
+ // NOTE (v3.0.19 cleanup): PROXY_PROVIDERS and TRANSLATION_SERVICES configs were
67
+ // removed. They pointed at `*.example.com` endpoints and translation services
68
+ // that were never wired up — pure dead code. If/when real proxy rotation or
69
+ // translation lands, configure providers explicitly rather than reviving these
70
+ // placeholders. See IMPROVEMENT_PLAN.md §A3.
107
71
 
108
72
  const LocalizationSchema = z.object({
109
73
  countryCode: z.string().length(2).optional(),
@@ -237,27 +201,21 @@ export class LocalizationManager extends EventEmitter {
237
201
  try {
238
202
  // Pre-populate timezone mappings
239
203
  await this.loadTimezoneData();
240
-
204
+
241
205
  // Initialize geo-location data
242
206
  await this.loadGeoLocationData();
243
-
244
- // Initialize proxy configurations
245
- await this.initializeProxySystem();
246
-
247
- // Initialize translation services
248
- await this.initializeTranslationServices();
249
-
207
+
250
208
  // Load cultural browsing patterns
251
209
  await this.loadCulturalPatterns();
252
-
210
+
253
211
  // Setup periodic health checks
254
212
  this.setupHealthChecks();
255
-
213
+
256
214
  this.emit('initialized');
257
215
  } catch (error) {
258
- this.emit('error', {
259
- type: 'initialization_failed',
260
- error: error.message
216
+ this.emit('error', {
217
+ type: 'initialization_failed',
218
+ error: error.message
261
219
  });
262
220
  throw error;
263
221
  }
@@ -958,76 +916,6 @@ export class LocalizationManager extends EventEmitter {
958
916
 
959
917
  return null;
960
918
  }
961
- /**
962
- * Initialize proxy system with regional configurations
963
- */
964
- async initializeProxySystem() {
965
- try {
966
- // Load proxy configurations from environment or config
967
- for (const [region, config] of Object.entries(PROXY_PROVIDERS.regions)) {
968
- if (process.env[`PROXY_${region.toUpperCase().replace('-', '_')}_ENABLED`] === 'true') {
969
- this.proxyManager.activeProxies.set(region, {
970
- ...config,
971
- username: process.env[`PROXY_${region.toUpperCase().replace('-', '_')}_USERNAME`],
972
- password: process.env[`PROXY_${region.toUpperCase().replace('-', '_')}_PASSWORD`],
973
- healthScore: 100,
974
- lastCheck: 0,
975
- failureCount: 0
976
- });
977
- }
978
- }
979
-
980
- // Setup proxy health monitoring
981
- if (this.proxyManager.activeProxies.size > 0) {
982
- await this.performProxyHealthChecks();
983
- }
984
-
985
- } catch (error) {
986
- console.warn('Failed to initialize proxy system:', error.message);
987
- }
988
- }
989
-
990
- /**
991
- * Initialize translation services
992
- */
993
- async initializeTranslationServices() {
994
- try {
995
- // Google Translate
996
- if (TRANSLATION_SERVICES.google.enabled) {
997
- this.translationProviders.set('google', {
998
- type: 'google',
999
- apiKey: TRANSLATION_SERVICES.google.apiKey,
1000
- endpoint: TRANSLATION_SERVICES.google.endpoint,
1001
- available: true
1002
- });
1003
- }
1004
-
1005
- // Azure Translator
1006
- if (TRANSLATION_SERVICES.azure.enabled) {
1007
- this.translationProviders.set('azure', {
1008
- type: 'azure',
1009
- key: TRANSLATION_SERVICES.azure.key,
1010
- region: TRANSLATION_SERVICES.azure.region,
1011
- endpoint: TRANSLATION_SERVICES.azure.endpoint,
1012
- available: true
1013
- });
1014
- }
1015
-
1016
- // LibreTranslate
1017
- if (TRANSLATION_SERVICES.libre.enabled) {
1018
- this.translationProviders.set('libre', {
1019
- type: 'libre',
1020
- url: TRANSLATION_SERVICES.libre.url,
1021
- apiKey: TRANSLATION_SERVICES.libre.apiKey,
1022
- available: true
1023
- });
1024
- }
1025
-
1026
- } catch (error) {
1027
- console.warn('Failed to initialize translation services:', error.message);
1028
- }
1029
- }
1030
-
1031
919
  /**
1032
920
  * Load cultural browsing patterns for different regions
1033
921
  */
@@ -1612,4 +1500,5 @@ export class LocalizationManager extends EventEmitter {
1612
1500
  export default LocalizationManager;
1613
1501
 
1614
1502
  // Export constants for external use
1615
- export { SUPPORTED_COUNTRIES, RTL_LANGUAGES, PROXY_PROVIDERS, TRANSLATION_SERVICES };
1503
+ // (PROXY_PROVIDERS / TRANSLATION_SERVICES removed in v3.0.19 — see §A3 of IMPROVEMENT_PLAN.md)
1504
+ export { SUPPORTED_COUNTRIES, RTL_LANGUAGES };
@@ -120,30 +120,35 @@ export class ResearchOrchestrator extends EventEmitter {
120
120
 
121
121
  // Stage 1: Initial topic exploration and query expansion
122
122
  const expandedQueries = await this.expandResearchTopic(topic);
123
+ this.researchState.currentDepth = 1;
123
124
  this.logActivity('topic_expansion', { originalTopic: topic, expandedQueries });
124
125
 
125
126
  // Stage 2: Broad information gathering
126
127
  const initialSources = await this.gatherInitialSources(expandedQueries, options);
128
+ this.researchState.currentDepth = 2;
127
129
  this.logActivity('initial_gathering', { sourcesFound: initialSources.length });
128
130
 
129
131
  // Stage 3: Deep exploration of promising sources
130
132
  const detailedFindings = await this.exploreSourcesInDepth(initialSources, options);
133
+ this.researchState.currentDepth = 3;
131
134
  this.logActivity('deep_exploration', { findingsCount: detailedFindings.length });
132
135
 
133
136
  // Stage 4: Source credibility assessment
134
- const verifiedSources = this.enableSourceVerification ?
137
+ const verifiedSources = this.enableSourceVerification ?
135
138
  await this.verifySourceCredibility(detailedFindings) : detailedFindings;
139
+ this.researchState.currentDepth = 4;
136
140
  this.logActivity('source_verification', { verifiedCount: verifiedSources.length });
137
141
 
138
142
  // Stage 5: Information synthesis and conflict detection
139
143
  const synthesizedResults = await this.synthesizeInformation(verifiedSources, topic);
144
+ this.researchState.currentDepth = 5;
140
145
  this.logActivity('information_synthesis', { conflictsFound: synthesizedResults.conflicts.length });
141
146
 
142
- // Stage 6: Final result compilation
143
- const finalResults = this.compileResearchResults(topic, synthesizedResults, options);
144
-
145
147
  const totalTime = Date.now() - startTime;
146
148
  this.metrics.totalProcessingTime = totalTime;
149
+
150
+ // Stage 6: Final result compilation
151
+ const finalResults = this.compileResearchResults(topic, synthesizedResults, options);
147
152
 
148
153
  this.logger.info('Research completed', {
149
154
  sessionId,
@@ -636,10 +641,22 @@ export class ResearchOrchestrator extends EventEmitter {
636
641
  consensus: [],
637
642
  gaps: [],
638
643
  recommendations: [],
639
- llmSynthesis: null
644
+ llmSynthesis: null,
645
+ rawEvidence: null,
646
+ synthesisMode: this.enableLLMFeatures ? 'llm' : 'raw_evidence'
640
647
  };
641
648
 
642
649
  try {
650
+ // Without an LLM the keyword/frequency-based synthesis produces
651
+ // unreadable output. Skip it and return raw evidence for the calling
652
+ // LLM (e.g. Claude Code) to synthesize.
653
+ if (!this.enableLLMFeatures) {
654
+ synthesis.rawEvidence = this.buildRawEvidence(sources);
655
+ synthesis.supportingEvidence = this.compileSupportingEvidence(sources);
656
+ this.metrics.synthesisTime += Date.now() - startTime;
657
+ return synthesis;
658
+ }
659
+
643
660
  // Extract key claims and facts from each source
644
661
  const extractedClaims = await this.extractKeyClaims(sources);
645
662
 
@@ -1110,6 +1127,36 @@ export class ResearchOrchestrator extends EventEmitter {
1110
1127
  .slice(0, 15);
1111
1128
  }
1112
1129
 
1130
+ buildRawEvidence(sources) {
1131
+ return sources
1132
+ .filter(s => s.extractedContent && s.extractedContent.length > 0)
1133
+ .map(s => ({
1134
+ title: s.title,
1135
+ url: s.link,
1136
+ credibility: s.overallCredibility ?? 0.5,
1137
+ contentSnippet: s.extractedContent.substring(0, 4000),
1138
+ topSentences: this.extractTopSentences(s.extractedContent, 5)
1139
+ }))
1140
+ .slice(0, 20);
1141
+ }
1142
+
1143
+ extractTopSentences(text, n = 5) {
1144
+ if (!text) return [];
1145
+ const sentences = text
1146
+ .split(/(?<=[.!?])\s+/)
1147
+ .map(s => s.trim())
1148
+ .filter(s => s.length >= 40 && s.length <= 500);
1149
+
1150
+ return sentences
1151
+ .map(s => ({
1152
+ text: s,
1153
+ score: s.length * 0.5 + (s.match(/[A-Z][a-z]+/g)?.length || 0) * 5
1154
+ }))
1155
+ .sort((a, b) => b.score - a.score)
1156
+ .slice(0, n)
1157
+ .map(item => item.text);
1158
+ }
1159
+
1113
1160
  identifyResearchGaps(claimGroups, topic) {
1114
1161
  const gaps = [];
1115
1162
 
@@ -1158,6 +1205,40 @@ export class ResearchOrchestrator extends EventEmitter {
1158
1205
  }
1159
1206
 
1160
1207
  compileResearchResults(topic, synthesis, options) {
1208
+ if (synthesis.synthesisMode === 'raw_evidence') {
1209
+ const sources = synthesis.rawEvidence || [];
1210
+ return {
1211
+ sessionId: this.researchState.sessionId,
1212
+ topic,
1213
+ synthesisMode: 'raw_evidence',
1214
+ note: "This response contains raw research evidence with no AI synthesis. The calling LLM (you) should synthesize these sources to answer the user's question. To enable internal LLM synthesis instead, set OPENAI_API_KEY or ANTHROPIC_API_KEY in the MCP server environment.",
1215
+ sources,
1216
+ findings: [],
1217
+ researchSummary: {
1218
+ totalSources: this.metrics.urlsProcessed,
1219
+ verifiedSources: this.metrics.sourcesVerified,
1220
+ sourcesReturned: sources.length,
1221
+ llmEnhanced: false
1222
+ },
1223
+ activityLog: this.researchState.activityLog,
1224
+ performance: {
1225
+ ...this.metrics,
1226
+ timeLimit: this.timeLimit,
1227
+ completedWithinLimit: this.metrics.totalProcessingTime < this.timeLimit
1228
+ },
1229
+ metadata: {
1230
+ generatedAt: new Date().toISOString(),
1231
+ researchDepth: this.researchState.currentDepth,
1232
+ configuration: {
1233
+ maxDepth: this.maxDepth,
1234
+ maxUrls: this.maxUrls,
1235
+ timeLimit: this.timeLimit,
1236
+ llmEnabled: false
1237
+ }
1238
+ }
1239
+ };
1240
+ }
1241
+
1161
1242
  const baseResults = {
1162
1243
  sessionId: this.researchState.sessionId,
1163
1244
  topic,
@@ -12,6 +12,7 @@ import { chromium } from 'playwright';
12
12
  import { z } from 'zod';
13
13
  import crypto from 'crypto';
14
14
  import HumanBehaviorSimulator from '../utils/HumanBehaviorSimulator.js';
15
+ import { BrowserContextPool } from './BrowserContextPool.js';
15
16
 
16
17
  const StealthConfigSchema = z.object({
17
18
  level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
@@ -59,7 +60,15 @@ const StealthConfigSchema = z.object({
59
60
  export class StealthBrowserManager {
60
61
  constructor(options = {}) {
61
62
  this.browser = null;
62
- this.contexts = new Map();
63
+ this.contexts = new BrowserContextPool({
64
+ maxContexts: parseInt(process.env.MAX_BROWSER_CONTEXTS || '10', 10),
65
+ periodicRefreshAfter: 200,
66
+ closeIdleAfterMs: 30 * 60 * 1000,
67
+ waitTimeoutMs: 10_000,
68
+ onContextExpired: (contextId) => {
69
+ this.fingerprints.delete(contextId);
70
+ }
71
+ });
63
72
  this.fingerprints = new Map();
64
73
 
65
74
  // Enhanced stealth components
@@ -367,7 +376,7 @@ export class StealthBrowserManager {
367
376
  // Apply stealth scripts and configurations
368
377
  await this.applyAdvancedStealthConfigurations(context, validatedConfig, fingerprint);
369
378
 
370
- this.contexts.set(contextId, { context, fingerprint, config: validatedConfig });
379
+ await this.contexts.set(contextId, { context, fingerprint, config: validatedConfig });
371
380
  this.fingerprints.set(contextId, fingerprint);
372
381
 
373
382
  return { context, contextId, fingerprint };
@@ -1493,11 +1502,20 @@ export class StealthBrowserManager {
1493
1502
  throw new Error('Context not found');
1494
1503
  }
1495
1504
 
1505
+ // Record use and check if context needs periodic refresh
1506
+ const needsRefresh = this.contexts.recordUse(contextId);
1507
+ if (needsRefresh) {
1508
+ // Dispose old context; caller should create a fresh one
1509
+ await this.contexts.dispose(contextId);
1510
+ this.fingerprints.delete(contextId);
1511
+ throw new Error(`StealthBrowserManager: context ${contextId} has reached its use limit and was recycled. Create a new context.`);
1512
+ }
1513
+
1496
1514
  const page = await contextData.context.newPage();
1497
-
1515
+
1498
1516
  // Apply additional page-level stealth measures
1499
1517
  await this.applyPageStealthMeasures(page, contextData.config, contextData.fingerprint);
1500
-
1518
+
1501
1519
  return page;
1502
1520
  }
1503
1521
 
@@ -1678,10 +1696,8 @@ export class StealthBrowserManager {
1678
1696
  * Close specific context
1679
1697
  */
1680
1698
  async closeContext(contextId) {
1681
- const contextData = this.contexts.get(contextId);
1682
- if (contextData) {
1683
- await contextData.context.close();
1684
- this.contexts.delete(contextId);
1699
+ if (this.contexts.has(contextId)) {
1700
+ await this.contexts.dispose(contextId);
1685
1701
  this.fingerprints.delete(contextId);
1686
1702
  }
1687
1703
  }
@@ -1690,16 +1706,8 @@ export class StealthBrowserManager {
1690
1706
  * Close all contexts and browser
1691
1707
  */
1692
1708
  async cleanup() {
1693
- // Close all contexts
1694
- for (const [contextId, contextData] of this.contexts.entries()) {
1695
- try {
1696
- await contextData.context.close();
1697
- } catch (error) {
1698
- console.warn(`Failed to close context ${contextId}:`, error.message);
1699
- }
1700
- }
1701
-
1702
- this.contexts.clear();
1709
+ // Close all contexts via pool (handles idle timer cleanup + wait queue drain)
1710
+ await this.contexts.destroy();
1703
1711
  this.fingerprints.clear();
1704
1712
 
1705
1713
  // Reset human behavior simulator
@@ -82,10 +82,12 @@ export class CacheManager extends EventEmitter {
82
82
  this.startMonitoring(monitoringInterval);
83
83
  }
84
84
 
85
- // Initialize auto cleanup
85
+ // Initialize auto cleanup. .unref() so the timer never blocks process exit
86
+ // — short-lived CLI invocations and tests don't need an explicit destroy().
86
87
  this.cleanupTimer = setInterval(() => {
87
88
  this.cleanupExpired();
88
89
  }, autoCleanupInterval);
90
+ if (typeof this.cleanupTimer.unref === 'function') this.cleanupTimer.unref();
89
91
 
90
92
  // Eviction tracking is handled in the LRU cache dispose callback above
91
93
  }
@@ -546,6 +548,7 @@ export class CacheManager extends EventEmitter {
546
548
  this.updateStats();
547
549
  this.emit('monitoring', this.getDetailedStats());
548
550
  }, interval);
551
+ if (typeof this.monitoringTimer.unref === 'function') this.monitoringTimer.unref();
549
552
  }
550
553
 
551
554
  /**
@@ -19,7 +19,8 @@ export class BFSCrawler {
19
19
  concurrency = 10,
20
20
  domainFilter = null,
21
21
  enableLinkAnalysis = true,
22
- linkAnalyzerOptions = {}
22
+ linkAnalyzerOptions = {},
23
+ sessionContext = null
23
24
  } = options;
24
25
 
25
26
  this.maxDepth = maxDepth;
@@ -28,6 +29,8 @@ export class BFSCrawler {
28
29
  this.respectRobots = respectRobots;
29
30
  this.userAgent = userAgent;
30
31
  this.timeout = timeout;
32
+ // Session context for cookie jar + persistent headers (null = stateless)
33
+ this.sessionContext = sessionContext;
31
34
 
32
35
  this.visited = new Set();
33
36
  this.results = [];
@@ -254,21 +257,32 @@ export class BFSCrawler {
254
257
  'Connection': 'keep-alive',
255
258
  'Upgrade-Insecure-Requests': '1'
256
259
  };
257
-
258
- const headers = { ...defaultHeaders, ...domainRules.customHeaders };
260
+
261
+ let headers = { ...defaultHeaders, ...domainRules.customHeaders };
262
+
263
+ // If a session is active, layer in session headers + cookie jar
264
+ if (this.sessionContext) {
265
+ headers = this.sessionContext.applyToHeaders(url, headers);
266
+ }
267
+
259
268
  const effectiveTimeout = domainRules.timeout || this.timeout;
260
-
269
+
261
270
  // Update timeout if different
262
271
  if (effectiveTimeout !== this.timeout) {
263
272
  clearTimeout(timeoutId);
264
273
  setTimeout(() => controller.abort(), effectiveTimeout);
265
274
  }
266
-
275
+
267
276
  const response = await fetch(url, {
268
277
  signal: controller.signal,
269
278
  headers
270
279
  });
271
280
 
281
+ // Capture any cookies the server sets during the crawl
282
+ if (this.sessionContext) {
283
+ this.sessionContext.recordCookies(response, url);
284
+ }
285
+
272
286
  clearTimeout(timeoutId);
273
287
 
274
288
  if (!response.ok) {
@@ -0,0 +1,37 @@
1
+ import { isCreatorModeVerified } from './creatorMode.js';
2
+
3
+ export const ALLOWED_HOSTS = ['www.crawlforge.dev', 'crawlforge.dev', 'api.crawlforge.dev'];
4
+
5
+ const LOCALHOST_HOSTS = new Set(['localhost', '127.0.0.1', '::1']);
6
+
7
/**
 * Validate a configured API endpoint and return it in normalized form.
 *
 * Rules, in order:
 *  - loopback hosts (`LOCALHOST_HOSTS`) are accepted with any protocol, but
 *    only when `isCreatorModeVerified()` returns true;
 *  - every other host must use `https:` and be listed in `ALLOWED_HOSTS`.
 *
 * @param {string} rawUrl - Endpoint URL as configured by the user/environment.
 * @returns {string} The serialized URL with trailing slashes removed from its path.
 * @throws {Error} When `rawUrl` cannot be parsed, or when the endpoint is not
 *   permitted by the rules above.
 */
export function resolveApiEndpoint(rawUrl) {
  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch {
    throw new Error(`Invalid API endpoint URL: "${rawUrl}"`);
  }

  // `URL.hostname` serializes IPv6 literals WITH brackets ("[::1]"), so the
  // bare "::1" entry in LOCALHOST_HOSTS could never match. Strip the brackets
  // before any host comparison.
  const hostname = parsed.hostname.replace(/^\[(.*)\]$/, '$1');

  const permitted = LOCALHOST_HOSTS.has(hostname)
    ? isCreatorModeVerified()
    : parsed.protocol === 'https:' && ALLOWED_HOSTS.includes(hostname);

  if (!permitted) {
    throw new Error(`Refusing to use API endpoint "${rawUrl}" — not in allow-list`);
  }

  // Strip trailing slash(es) from the path.
  // NOTE(review): for a bare origin the URL parser keeps the path as "/", so
  // the returned string still ends in "/" in that case — confirm callers
  // joining paths onto the result account for this.
  parsed.pathname = parsed.pathname.replace(/\/+$/, '');
  return parsed.toString();
}
@@ -0,0 +1,137 @@
1
/**
 * Prometheus metrics — dependency-free implementation.
 *
 * Why no prom-client? CrawlForge is shipped via npm and runs in stdio mode
 * by default. Pulling in prom-client (and its dependency tree) just to
 * expose a handful of counters, gauges and one histogram is overkill. This
 * small implementation emits the Prometheus text exposition format 0.0.4.
 *
 * Disabled by default. Enable via `CRAWLFORGE_METRICS=true` in HTTP mode.
 *
 * Metrics exposed:
 *   - crawlforge_tool_requests_total{tool,outcome}   (counter)
 *   - crawlforge_tool_errors_total{tool,error_class} (counter)
 *   - crawlforge_tool_duration_ms{tool}              (histogram)
 *   - crawlforge_credits_consumed_total{tool}        (counter)
 *   - crawlforge_browser_pool_in_use                 (gauge)
 *   - crawlforge_browser_pool_capacity               (gauge)
 */

const CONTENT_TYPE = 'text/plain; version=0.0.4; charset=utf-8';

/**
 * Create an isolated in-memory metrics registry.
 *
 * Every series is stored under its fully rendered identifier
 * (`name{label="value",...}` with labels sorted by name), so the same label
 * set always maps to the same series regardless of property order.
 *
 * @returns {{
 *   contentType: string,
 *   incCounter: (name: string, labels?: Object, by?: number) => void,
 *   setGauge: (name: string, labels: Object|undefined, value: number) => void,
 *   observeHistogram: (name: string, labels: Object|undefined, valueMs: number) => void,
 *   render: () => Promise<string>,
 *   _snapshot: () => {counters: Object, gauges: Object, histograms: Object}
 * }}
 */
export function createMetricsRegistry() {
  const counters = new Map();   // series id -> number
  const gauges = new Map();     // series id -> number
  const histograms = new Map(); // series id -> { count, sum, buckets: { le -> count } }

  const HISTOGRAM_BUCKETS_MS = [10, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000];

  /** Render the series identifier for a metric name plus optional labels. */
  function key(name, labels) {
    const labelStr = Object.entries(labels ?? {})
      .sort(([a], [b]) => a.localeCompare(b))
      .map(([k, v]) => `${k}="${escapeLabel(String(v))}"`)
      .join(',');
    return labelStr ? `${name}{${labelStr}}` : name;
  }

  /** Metric name of a series identifier: everything before the first `{`. */
  function metricNameOf(seriesId) {
    const brace = seriesId.indexOf('{');
    return brace === -1 ? seriesId : seriesId.slice(0, brace);
  }

  /**
   * Group a series map by metric name in a single pass. Names keep the order
   * of their first series and series keep insertion order, so the rendered
   * output is stable.
   */
  function groupByMetricName(seriesMap) {
    const families = new Map();
    for (const entry of seriesMap.entries()) {
      const name = metricNameOf(entry[0]);
      const family = families.get(name);
      if (family) {
        family.push(entry);
      } else {
        families.set(name, [entry]);
      }
    }
    return families;
  }

  /** Append HELP/TYPE headers and one sample line per series for counters/gauges. */
  function renderScalarFamilies(lines, seriesMap, type) {
    for (const [name, series] of groupByMetricName(seriesMap)) {
      lines.push(`# HELP ${name} ${describe(name)}`);
      lines.push(`# TYPE ${name} ${type}`);
      for (const [seriesId, value] of series) {
        lines.push(`${seriesId} ${value}`);
      }
    }
  }

  return {
    contentType: CONTENT_TYPE,

    /** Add `by` (default 1) to a counter series, creating it at 0 if needed. */
    incCounter(name, labels, by = 1) {
      const k = key(name, labels);
      counters.set(k, (counters.get(k) ?? 0) + by);
    },

    /** Set a gauge series to `value`, replacing any previous value. */
    setGauge(name, labels, value) {
      gauges.set(key(name, labels), value);
    },

    /** Record one observation (in milliseconds) into a histogram series. */
    observeHistogram(name, labels, valueMs) {
      const k = key(name, labels);
      let h = histograms.get(k);
      if (!h) {
        h = { count: 0, sum: 0, buckets: Object.fromEntries(HISTOGRAM_BUCKETS_MS.map(b => [b, 0])) };
        histograms.set(k, h);
      }
      h.count += 1;
      h.sum += valueMs;
      // Buckets are cumulative: an observation counts in every bucket whose
      // upper bound is >= the value.
      for (const b of HISTOGRAM_BUCKETS_MS) {
        if (valueMs <= b) h.buckets[b] += 1;
      }
    },

    /**
     * Serialize all metrics in the text exposition format.
     * Kept `async` so callers can keep awaiting the result.
     *
     * @returns {Promise<string>} Newline-terminated exposition text.
     */
    async render() {
      const lines = [];

      renderScalarFamilies(lines, counters, 'counter');
      renderScalarFamilies(lines, gauges, 'gauge');

      for (const [name, series] of groupByMetricName(histograms)) {
        lines.push(`# HELP ${name} ${describe(name)}`);
        lines.push(`# TYPE ${name} histogram`);
        for (const [seriesId, h] of series) {
          // Recover the label list (everything inside the braces) so `le`
          // can be appended to it.
          const baseLabels = seriesId.includes('{')
            ? seriesId.slice(seriesId.indexOf('{') + 1, -1)
            : '';
          const sep = baseLabels ? ',' : '';
          const labelSuffix = baseLabels ? `{${baseLabels}}` : '';
          for (const b of HISTOGRAM_BUCKETS_MS) {
            lines.push(`${name}_bucket{${baseLabels}${sep}le="${b}"} ${h.buckets[b]}`);
          }
          lines.push(`${name}_bucket{${baseLabels}${sep}le="+Inf"} ${h.count}`);
          lines.push(`${name}_sum${labelSuffix} ${h.sum}`);
          lines.push(`${name}_count${labelSuffix} ${h.count}`);
        }
      }

      return lines.join('\n') + '\n';
    },

    // Snapshot for tests
    _snapshot() {
      return {
        counters: Object.fromEntries(counters.entries()),
        gauges: Object.fromEntries(gauges.entries()),
        histograms: Object.fromEntries(histograms.entries())
      };
    }
  };
}

const HELP = {
  crawlforge_tool_requests_total: 'Total number of MCP tool invocations',
  crawlforge_tool_errors_total: 'Total number of MCP tool errors',
  crawlforge_tool_duration_ms: 'MCP tool invocation duration in milliseconds',
  crawlforge_credits_consumed_total: 'Total CrawlForge credits consumed',
  crawlforge_browser_pool_in_use: 'Number of browser contexts currently leased from the pool',
  crawlforge_browser_pool_capacity: 'Maximum browser context pool capacity'
};

/**
 * HELP text for a metric; falls back to the metric name itself.
 * Uses an own-property check so names that collide with Object.prototype
 * members (e.g. "constructor") do not resolve to inherited values.
 */
function describe(name) {
  return Object.prototype.hasOwnProperty.call(HELP, name) ? HELP[name] : name;
}

/** Escape a label value: backslash, newline and double quote. */
function escapeLabel(v) {
  return v.replace(/\\/g, '\\\\').replace(/\n/g, '\\n').replace(/"/g, '\\"');
}