crawlforge-mcp-server 3.0.17 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -0
- package/README.md +1 -0
- package/package.json +6 -2
- package/server.js +192 -1277
- package/src/constants/config.js +2 -1
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +230 -32
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/ResearchOrchestrator.js +86 -5
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/core/endpointGuard.js +37 -0
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +295 -0
- package/src/tools/research/deepResearch.js +33 -8
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
|
@@ -63,47 +63,11 @@ const LANGUAGE_MAPPINGS = {
|
|
|
63
63
|
// RTL Languages Configuration
|
|
64
64
|
const RTL_LANGUAGES = new Set(['ar', 'he', 'fa', 'ur', 'ku', 'dv']);
|
|
65
65
|
|
|
66
|
-
//
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
'eu-west': { endpoint: 'proxy-eu-west.example.com', port: 8080 },
|
|
72
|
-
'eu-central': { endpoint: 'proxy-eu-central.example.com', port: 8080 },
|
|
73
|
-
'eu-north': { endpoint: 'proxy-eu-north.example.com', port: 8080 },
|
|
74
|
-
'eu-east': { endpoint: 'proxy-eu-east.example.com', port: 8080 },
|
|
75
|
-
'asia-pacific': { endpoint: 'proxy-asia-pacific.example.com', port: 8080 },
|
|
76
|
-
'middle-east': { endpoint: 'proxy-middle-east.example.com', port: 8080 },
|
|
77
|
-
'south-america': { endpoint: 'proxy-south-america.example.com', port: 8080 },
|
|
78
|
-
'north-america': { endpoint: 'proxy-north-america.example.com', port: 8080 },
|
|
79
|
-
'africa': { endpoint: 'proxy-africa.example.com', port: 8080 }
|
|
80
|
-
},
|
|
81
|
-
fallbackStrategies: {
|
|
82
|
-
'geo-blocked': ['rotate-proxy', 'change-user-agent', 'delay-request'],
|
|
83
|
-
'rate-limited': ['change-proxy', 'exponential-backoff'],
|
|
84
|
-
'detection': ['rotate-fingerprint', 'change-proxy', 'human-delay']
|
|
85
|
-
}
|
|
86
|
-
};
|
|
87
|
-
|
|
88
|
-
// Translation Service Configuration
|
|
89
|
-
const TRANSLATION_SERVICES = {
|
|
90
|
-
google: {
|
|
91
|
-
enabled: process.env.GOOGLE_TRANSLATE_API_KEY ? true : false,
|
|
92
|
-
apiKey: process.env.GOOGLE_TRANSLATE_API_KEY,
|
|
93
|
-
endpoint: 'https://translation.googleapis.com/language/translate/v2'
|
|
94
|
-
},
|
|
95
|
-
azure: {
|
|
96
|
-
enabled: process.env.AZURE_TRANSLATE_KEY ? true : false,
|
|
97
|
-
key: process.env.AZURE_TRANSLATE_KEY,
|
|
98
|
-
region: process.env.AZURE_TRANSLATE_REGION || 'global',
|
|
99
|
-
endpoint: 'https://api.cognitive.microsofttranslator.com/translate'
|
|
100
|
-
},
|
|
101
|
-
libre: {
|
|
102
|
-
enabled: process.env.LIBRE_TRANSLATE_URL ? true : false,
|
|
103
|
-
url: process.env.LIBRE_TRANSLATE_URL,
|
|
104
|
-
apiKey: process.env.LIBRE_TRANSLATE_API_KEY
|
|
105
|
-
}
|
|
106
|
-
};
|
|
66
|
+
// NOTE (v3.0.19 cleanup): PROXY_PROVIDERS and TRANSLATION_SERVICES configs were
|
|
67
|
+
// removed. They pointed at `*.example.com` endpoints and translation services
|
|
68
|
+
// that were never wired up — pure dead code. If/when real proxy rotation or
|
|
69
|
+
// translation lands, configure providers explicitly rather than reviving these
|
|
70
|
+
// placeholders. See IMPROVEMENT_PLAN.md §A3.
|
|
107
71
|
|
|
108
72
|
const LocalizationSchema = z.object({
|
|
109
73
|
countryCode: z.string().length(2).optional(),
|
|
@@ -237,27 +201,21 @@ export class LocalizationManager extends EventEmitter {
|
|
|
237
201
|
try {
|
|
238
202
|
// Pre-populate timezone mappings
|
|
239
203
|
await this.loadTimezoneData();
|
|
240
|
-
|
|
204
|
+
|
|
241
205
|
// Initialize geo-location data
|
|
242
206
|
await this.loadGeoLocationData();
|
|
243
|
-
|
|
244
|
-
// Initialize proxy configurations
|
|
245
|
-
await this.initializeProxySystem();
|
|
246
|
-
|
|
247
|
-
// Initialize translation services
|
|
248
|
-
await this.initializeTranslationServices();
|
|
249
|
-
|
|
207
|
+
|
|
250
208
|
// Load cultural browsing patterns
|
|
251
209
|
await this.loadCulturalPatterns();
|
|
252
|
-
|
|
210
|
+
|
|
253
211
|
// Setup periodic health checks
|
|
254
212
|
this.setupHealthChecks();
|
|
255
|
-
|
|
213
|
+
|
|
256
214
|
this.emit('initialized');
|
|
257
215
|
} catch (error) {
|
|
258
|
-
this.emit('error', {
|
|
259
|
-
type: 'initialization_failed',
|
|
260
|
-
error: error.message
|
|
216
|
+
this.emit('error', {
|
|
217
|
+
type: 'initialization_failed',
|
|
218
|
+
error: error.message
|
|
261
219
|
});
|
|
262
220
|
throw error;
|
|
263
221
|
}
|
|
@@ -958,76 +916,6 @@ export class LocalizationManager extends EventEmitter {
|
|
|
958
916
|
|
|
959
917
|
return null;
|
|
960
918
|
}
|
|
961
|
-
/**
|
|
962
|
-
* Initialize proxy system with regional configurations
|
|
963
|
-
*/
|
|
964
|
-
async initializeProxySystem() {
|
|
965
|
-
try {
|
|
966
|
-
// Load proxy configurations from environment or config
|
|
967
|
-
for (const [region, config] of Object.entries(PROXY_PROVIDERS.regions)) {
|
|
968
|
-
if (process.env[`PROXY_${region.toUpperCase().replace('-', '_')}_ENABLED`] === 'true') {
|
|
969
|
-
this.proxyManager.activeProxies.set(region, {
|
|
970
|
-
...config,
|
|
971
|
-
username: process.env[`PROXY_${region.toUpperCase().replace('-', '_')}_USERNAME`],
|
|
972
|
-
password: process.env[`PROXY_${region.toUpperCase().replace('-', '_')}_PASSWORD`],
|
|
973
|
-
healthScore: 100,
|
|
974
|
-
lastCheck: 0,
|
|
975
|
-
failureCount: 0
|
|
976
|
-
});
|
|
977
|
-
}
|
|
978
|
-
}
|
|
979
|
-
|
|
980
|
-
// Setup proxy health monitoring
|
|
981
|
-
if (this.proxyManager.activeProxies.size > 0) {
|
|
982
|
-
await this.performProxyHealthChecks();
|
|
983
|
-
}
|
|
984
|
-
|
|
985
|
-
} catch (error) {
|
|
986
|
-
console.warn('Failed to initialize proxy system:', error.message);
|
|
987
|
-
}
|
|
988
|
-
}
|
|
989
|
-
|
|
990
|
-
/**
|
|
991
|
-
* Initialize translation services
|
|
992
|
-
*/
|
|
993
|
-
async initializeTranslationServices() {
|
|
994
|
-
try {
|
|
995
|
-
// Google Translate
|
|
996
|
-
if (TRANSLATION_SERVICES.google.enabled) {
|
|
997
|
-
this.translationProviders.set('google', {
|
|
998
|
-
type: 'google',
|
|
999
|
-
apiKey: TRANSLATION_SERVICES.google.apiKey,
|
|
1000
|
-
endpoint: TRANSLATION_SERVICES.google.endpoint,
|
|
1001
|
-
available: true
|
|
1002
|
-
});
|
|
1003
|
-
}
|
|
1004
|
-
|
|
1005
|
-
// Azure Translator
|
|
1006
|
-
if (TRANSLATION_SERVICES.azure.enabled) {
|
|
1007
|
-
this.translationProviders.set('azure', {
|
|
1008
|
-
type: 'azure',
|
|
1009
|
-
key: TRANSLATION_SERVICES.azure.key,
|
|
1010
|
-
region: TRANSLATION_SERVICES.azure.region,
|
|
1011
|
-
endpoint: TRANSLATION_SERVICES.azure.endpoint,
|
|
1012
|
-
available: true
|
|
1013
|
-
});
|
|
1014
|
-
}
|
|
1015
|
-
|
|
1016
|
-
// LibreTranslate
|
|
1017
|
-
if (TRANSLATION_SERVICES.libre.enabled) {
|
|
1018
|
-
this.translationProviders.set('libre', {
|
|
1019
|
-
type: 'libre',
|
|
1020
|
-
url: TRANSLATION_SERVICES.libre.url,
|
|
1021
|
-
apiKey: TRANSLATION_SERVICES.libre.apiKey,
|
|
1022
|
-
available: true
|
|
1023
|
-
});
|
|
1024
|
-
}
|
|
1025
|
-
|
|
1026
|
-
} catch (error) {
|
|
1027
|
-
console.warn('Failed to initialize translation services:', error.message);
|
|
1028
|
-
}
|
|
1029
|
-
}
|
|
1030
|
-
|
|
1031
919
|
/**
|
|
1032
920
|
* Load cultural browsing patterns for different regions
|
|
1033
921
|
*/
|
|
@@ -1612,4 +1500,5 @@ export class LocalizationManager extends EventEmitter {
|
|
|
1612
1500
|
export default LocalizationManager;
|
|
1613
1501
|
|
|
1614
1502
|
// Export constants for external use
|
|
1615
|
-
|
|
1503
|
+
// (PROXY_PROVIDERS / TRANSLATION_SERVICES removed in v3.0.19 — see §A3 of IMPROVEMENT_PLAN.md)
|
|
1504
|
+
export { SUPPORTED_COUNTRIES, RTL_LANGUAGES };
|
|
@@ -120,30 +120,35 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
120
120
|
|
|
121
121
|
// Stage 1: Initial topic exploration and query expansion
|
|
122
122
|
const expandedQueries = await this.expandResearchTopic(topic);
|
|
123
|
+
this.researchState.currentDepth = 1;
|
|
123
124
|
this.logActivity('topic_expansion', { originalTopic: topic, expandedQueries });
|
|
124
125
|
|
|
125
126
|
// Stage 2: Broad information gathering
|
|
126
127
|
const initialSources = await this.gatherInitialSources(expandedQueries, options);
|
|
128
|
+
this.researchState.currentDepth = 2;
|
|
127
129
|
this.logActivity('initial_gathering', { sourcesFound: initialSources.length });
|
|
128
130
|
|
|
129
131
|
// Stage 3: Deep exploration of promising sources
|
|
130
132
|
const detailedFindings = await this.exploreSourcesInDepth(initialSources, options);
|
|
133
|
+
this.researchState.currentDepth = 3;
|
|
131
134
|
this.logActivity('deep_exploration', { findingsCount: detailedFindings.length });
|
|
132
135
|
|
|
133
136
|
// Stage 4: Source credibility assessment
|
|
134
|
-
const verifiedSources = this.enableSourceVerification ?
|
|
137
|
+
const verifiedSources = this.enableSourceVerification ?
|
|
135
138
|
await this.verifySourceCredibility(detailedFindings) : detailedFindings;
|
|
139
|
+
this.researchState.currentDepth = 4;
|
|
136
140
|
this.logActivity('source_verification', { verifiedCount: verifiedSources.length });
|
|
137
141
|
|
|
138
142
|
// Stage 5: Information synthesis and conflict detection
|
|
139
143
|
const synthesizedResults = await this.synthesizeInformation(verifiedSources, topic);
|
|
144
|
+
this.researchState.currentDepth = 5;
|
|
140
145
|
this.logActivity('information_synthesis', { conflictsFound: synthesizedResults.conflicts.length });
|
|
141
146
|
|
|
142
|
-
// Stage 6: Final result compilation
|
|
143
|
-
const finalResults = this.compileResearchResults(topic, synthesizedResults, options);
|
|
144
|
-
|
|
145
147
|
const totalTime = Date.now() - startTime;
|
|
146
148
|
this.metrics.totalProcessingTime = totalTime;
|
|
149
|
+
|
|
150
|
+
// Stage 6: Final result compilation
|
|
151
|
+
const finalResults = this.compileResearchResults(topic, synthesizedResults, options);
|
|
147
152
|
|
|
148
153
|
this.logger.info('Research completed', {
|
|
149
154
|
sessionId,
|
|
@@ -636,10 +641,22 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
636
641
|
consensus: [],
|
|
637
642
|
gaps: [],
|
|
638
643
|
recommendations: [],
|
|
639
|
-
llmSynthesis: null
|
|
644
|
+
llmSynthesis: null,
|
|
645
|
+
rawEvidence: null,
|
|
646
|
+
synthesisMode: this.enableLLMFeatures ? 'llm' : 'raw_evidence'
|
|
640
647
|
};
|
|
641
648
|
|
|
642
649
|
try {
|
|
650
|
+
// Without an LLM the keyword/frequency-based synthesis produces
|
|
651
|
+
// unreadable output. Skip it and return raw evidence for the calling
|
|
652
|
+
// LLM (e.g. Claude Code) to synthesize.
|
|
653
|
+
if (!this.enableLLMFeatures) {
|
|
654
|
+
synthesis.rawEvidence = this.buildRawEvidence(sources);
|
|
655
|
+
synthesis.supportingEvidence = this.compileSupportingEvidence(sources);
|
|
656
|
+
this.metrics.synthesisTime += Date.now() - startTime;
|
|
657
|
+
return synthesis;
|
|
658
|
+
}
|
|
659
|
+
|
|
643
660
|
// Extract key claims and facts from each source
|
|
644
661
|
const extractedClaims = await this.extractKeyClaims(sources);
|
|
645
662
|
|
|
@@ -1110,6 +1127,36 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
1110
1127
|
.slice(0, 15);
|
|
1111
1128
|
}
|
|
1112
1129
|
|
|
1130
|
+
buildRawEvidence(sources) {
|
|
1131
|
+
return sources
|
|
1132
|
+
.filter(s => s.extractedContent && s.extractedContent.length > 0)
|
|
1133
|
+
.map(s => ({
|
|
1134
|
+
title: s.title,
|
|
1135
|
+
url: s.link,
|
|
1136
|
+
credibility: s.overallCredibility ?? 0.5,
|
|
1137
|
+
contentSnippet: s.extractedContent.substring(0, 4000),
|
|
1138
|
+
topSentences: this.extractTopSentences(s.extractedContent, 5)
|
|
1139
|
+
}))
|
|
1140
|
+
.slice(0, 20);
|
|
1141
|
+
}
|
|
1142
|
+
|
|
1143
|
+
extractTopSentences(text, n = 5) {
|
|
1144
|
+
if (!text) return [];
|
|
1145
|
+
const sentences = text
|
|
1146
|
+
.split(/(?<=[.!?])\s+/)
|
|
1147
|
+
.map(s => s.trim())
|
|
1148
|
+
.filter(s => s.length >= 40 && s.length <= 500);
|
|
1149
|
+
|
|
1150
|
+
return sentences
|
|
1151
|
+
.map(s => ({
|
|
1152
|
+
text: s,
|
|
1153
|
+
score: s.length * 0.5 + (s.match(/[A-Z][a-z]+/g)?.length || 0) * 5
|
|
1154
|
+
}))
|
|
1155
|
+
.sort((a, b) => b.score - a.score)
|
|
1156
|
+
.slice(0, n)
|
|
1157
|
+
.map(item => item.text);
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1113
1160
|
identifyResearchGaps(claimGroups, topic) {
|
|
1114
1161
|
const gaps = [];
|
|
1115
1162
|
|
|
@@ -1158,6 +1205,40 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
1158
1205
|
}
|
|
1159
1206
|
|
|
1160
1207
|
compileResearchResults(topic, synthesis, options) {
|
|
1208
|
+
if (synthesis.synthesisMode === 'raw_evidence') {
|
|
1209
|
+
const sources = synthesis.rawEvidence || [];
|
|
1210
|
+
return {
|
|
1211
|
+
sessionId: this.researchState.sessionId,
|
|
1212
|
+
topic,
|
|
1213
|
+
synthesisMode: 'raw_evidence',
|
|
1214
|
+
note: "This response contains raw research evidence with no AI synthesis. The calling LLM (you) should synthesize these sources to answer the user's question. To enable internal LLM synthesis instead, set OPENAI_API_KEY or ANTHROPIC_API_KEY in the MCP server environment.",
|
|
1215
|
+
sources,
|
|
1216
|
+
findings: [],
|
|
1217
|
+
researchSummary: {
|
|
1218
|
+
totalSources: this.metrics.urlsProcessed,
|
|
1219
|
+
verifiedSources: this.metrics.sourcesVerified,
|
|
1220
|
+
sourcesReturned: sources.length,
|
|
1221
|
+
llmEnhanced: false
|
|
1222
|
+
},
|
|
1223
|
+
activityLog: this.researchState.activityLog,
|
|
1224
|
+
performance: {
|
|
1225
|
+
...this.metrics,
|
|
1226
|
+
timeLimit: this.timeLimit,
|
|
1227
|
+
completedWithinLimit: this.metrics.totalProcessingTime < this.timeLimit
|
|
1228
|
+
},
|
|
1229
|
+
metadata: {
|
|
1230
|
+
generatedAt: new Date().toISOString(),
|
|
1231
|
+
researchDepth: this.researchState.currentDepth,
|
|
1232
|
+
configuration: {
|
|
1233
|
+
maxDepth: this.maxDepth,
|
|
1234
|
+
maxUrls: this.maxUrls,
|
|
1235
|
+
timeLimit: this.timeLimit,
|
|
1236
|
+
llmEnabled: false
|
|
1237
|
+
}
|
|
1238
|
+
}
|
|
1239
|
+
};
|
|
1240
|
+
}
|
|
1241
|
+
|
|
1161
1242
|
const baseResults = {
|
|
1162
1243
|
sessionId: this.researchState.sessionId,
|
|
1163
1244
|
topic,
|
|
@@ -12,6 +12,7 @@ import { chromium } from 'playwright';
|
|
|
12
12
|
import { z } from 'zod';
|
|
13
13
|
import crypto from 'crypto';
|
|
14
14
|
import HumanBehaviorSimulator from '../utils/HumanBehaviorSimulator.js';
|
|
15
|
+
import { BrowserContextPool } from './BrowserContextPool.js';
|
|
15
16
|
|
|
16
17
|
const StealthConfigSchema = z.object({
|
|
17
18
|
level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
|
|
@@ -59,7 +60,15 @@ const StealthConfigSchema = z.object({
|
|
|
59
60
|
export class StealthBrowserManager {
|
|
60
61
|
constructor(options = {}) {
|
|
61
62
|
this.browser = null;
|
|
62
|
-
this.contexts = new
|
|
63
|
+
this.contexts = new BrowserContextPool({
|
|
64
|
+
maxContexts: parseInt(process.env.MAX_BROWSER_CONTEXTS || '10', 10),
|
|
65
|
+
periodicRefreshAfter: 200,
|
|
66
|
+
closeIdleAfterMs: 30 * 60 * 1000,
|
|
67
|
+
waitTimeoutMs: 10_000,
|
|
68
|
+
onContextExpired: (contextId) => {
|
|
69
|
+
this.fingerprints.delete(contextId);
|
|
70
|
+
}
|
|
71
|
+
});
|
|
63
72
|
this.fingerprints = new Map();
|
|
64
73
|
|
|
65
74
|
// Enhanced stealth components
|
|
@@ -367,7 +376,7 @@ export class StealthBrowserManager {
|
|
|
367
376
|
// Apply stealth scripts and configurations
|
|
368
377
|
await this.applyAdvancedStealthConfigurations(context, validatedConfig, fingerprint);
|
|
369
378
|
|
|
370
|
-
this.contexts.set(contextId, { context, fingerprint, config: validatedConfig });
|
|
379
|
+
await this.contexts.set(contextId, { context, fingerprint, config: validatedConfig });
|
|
371
380
|
this.fingerprints.set(contextId, fingerprint);
|
|
372
381
|
|
|
373
382
|
return { context, contextId, fingerprint };
|
|
@@ -1493,11 +1502,20 @@ export class StealthBrowserManager {
|
|
|
1493
1502
|
throw new Error('Context not found');
|
|
1494
1503
|
}
|
|
1495
1504
|
|
|
1505
|
+
// Record use and check if context needs periodic refresh
|
|
1506
|
+
const needsRefresh = this.contexts.recordUse(contextId);
|
|
1507
|
+
if (needsRefresh) {
|
|
1508
|
+
// Dispose old context; caller should create a fresh one
|
|
1509
|
+
await this.contexts.dispose(contextId);
|
|
1510
|
+
this.fingerprints.delete(contextId);
|
|
1511
|
+
throw new Error(`StealthBrowserManager: context ${contextId} has reached its use limit and was recycled. Create a new context.`);
|
|
1512
|
+
}
|
|
1513
|
+
|
|
1496
1514
|
const page = await contextData.context.newPage();
|
|
1497
|
-
|
|
1515
|
+
|
|
1498
1516
|
// Apply additional page-level stealth measures
|
|
1499
1517
|
await this.applyPageStealthMeasures(page, contextData.config, contextData.fingerprint);
|
|
1500
|
-
|
|
1518
|
+
|
|
1501
1519
|
return page;
|
|
1502
1520
|
}
|
|
1503
1521
|
|
|
@@ -1678,10 +1696,8 @@ export class StealthBrowserManager {
|
|
|
1678
1696
|
* Close specific context
|
|
1679
1697
|
*/
|
|
1680
1698
|
async closeContext(contextId) {
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
await contextData.context.close();
|
|
1684
|
-
this.contexts.delete(contextId);
|
|
1699
|
+
if (this.contexts.has(contextId)) {
|
|
1700
|
+
await this.contexts.dispose(contextId);
|
|
1685
1701
|
this.fingerprints.delete(contextId);
|
|
1686
1702
|
}
|
|
1687
1703
|
}
|
|
@@ -1690,16 +1706,8 @@ export class StealthBrowserManager {
|
|
|
1690
1706
|
* Close all contexts and browser
|
|
1691
1707
|
*/
|
|
1692
1708
|
async cleanup() {
|
|
1693
|
-
// Close all contexts
|
|
1694
|
-
|
|
1695
|
-
try {
|
|
1696
|
-
await contextData.context.close();
|
|
1697
|
-
} catch (error) {
|
|
1698
|
-
console.warn(`Failed to close context ${contextId}:`, error.message);
|
|
1699
|
-
}
|
|
1700
|
-
}
|
|
1701
|
-
|
|
1702
|
-
this.contexts.clear();
|
|
1709
|
+
// Close all contexts via pool (handles idle timer cleanup + wait queue drain)
|
|
1710
|
+
await this.contexts.destroy();
|
|
1703
1711
|
this.fingerprints.clear();
|
|
1704
1712
|
|
|
1705
1713
|
// Reset human behavior simulator
|
|
@@ -82,10 +82,12 @@ export class CacheManager extends EventEmitter {
|
|
|
82
82
|
this.startMonitoring(monitoringInterval);
|
|
83
83
|
}
|
|
84
84
|
|
|
85
|
-
// Initialize auto cleanup
|
|
85
|
+
// Initialize auto cleanup. .unref() so the timer never blocks process exit
|
|
86
|
+
// — short-lived CLI invocations and tests don't need an explicit destroy().
|
|
86
87
|
this.cleanupTimer = setInterval(() => {
|
|
87
88
|
this.cleanupExpired();
|
|
88
89
|
}, autoCleanupInterval);
|
|
90
|
+
if (typeof this.cleanupTimer.unref === 'function') this.cleanupTimer.unref();
|
|
89
91
|
|
|
90
92
|
// Eviction tracking is handled in the LRU cache dispose callback above
|
|
91
93
|
}
|
|
@@ -546,6 +548,7 @@ export class CacheManager extends EventEmitter {
|
|
|
546
548
|
this.updateStats();
|
|
547
549
|
this.emit('monitoring', this.getDetailedStats());
|
|
548
550
|
}, interval);
|
|
551
|
+
if (typeof this.monitoringTimer.unref === 'function') this.monitoringTimer.unref();
|
|
549
552
|
}
|
|
550
553
|
|
|
551
554
|
/**
|
|
@@ -19,7 +19,8 @@ export class BFSCrawler {
|
|
|
19
19
|
concurrency = 10,
|
|
20
20
|
domainFilter = null,
|
|
21
21
|
enableLinkAnalysis = true,
|
|
22
|
-
linkAnalyzerOptions = {}
|
|
22
|
+
linkAnalyzerOptions = {},
|
|
23
|
+
sessionContext = null
|
|
23
24
|
} = options;
|
|
24
25
|
|
|
25
26
|
this.maxDepth = maxDepth;
|
|
@@ -28,6 +29,8 @@ export class BFSCrawler {
|
|
|
28
29
|
this.respectRobots = respectRobots;
|
|
29
30
|
this.userAgent = userAgent;
|
|
30
31
|
this.timeout = timeout;
|
|
32
|
+
// Session context for cookie jar + persistent headers (null = stateless)
|
|
33
|
+
this.sessionContext = sessionContext;
|
|
31
34
|
|
|
32
35
|
this.visited = new Set();
|
|
33
36
|
this.results = [];
|
|
@@ -254,21 +257,32 @@ export class BFSCrawler {
|
|
|
254
257
|
'Connection': 'keep-alive',
|
|
255
258
|
'Upgrade-Insecure-Requests': '1'
|
|
256
259
|
};
|
|
257
|
-
|
|
258
|
-
|
|
260
|
+
|
|
261
|
+
let headers = { ...defaultHeaders, ...domainRules.customHeaders };
|
|
262
|
+
|
|
263
|
+
// If a session is active, layer in session headers + cookie jar
|
|
264
|
+
if (this.sessionContext) {
|
|
265
|
+
headers = this.sessionContext.applyToHeaders(url, headers);
|
|
266
|
+
}
|
|
267
|
+
|
|
259
268
|
const effectiveTimeout = domainRules.timeout || this.timeout;
|
|
260
|
-
|
|
269
|
+
|
|
261
270
|
// Update timeout if different
|
|
262
271
|
if (effectiveTimeout !== this.timeout) {
|
|
263
272
|
clearTimeout(timeoutId);
|
|
264
273
|
setTimeout(() => controller.abort(), effectiveTimeout);
|
|
265
274
|
}
|
|
266
|
-
|
|
275
|
+
|
|
267
276
|
const response = await fetch(url, {
|
|
268
277
|
signal: controller.signal,
|
|
269
278
|
headers
|
|
270
279
|
});
|
|
271
280
|
|
|
281
|
+
// Capture any cookies the server sets during the crawl
|
|
282
|
+
if (this.sessionContext) {
|
|
283
|
+
this.sessionContext.recordCookies(response, url);
|
|
284
|
+
}
|
|
285
|
+
|
|
272
286
|
clearTimeout(timeoutId);
|
|
273
287
|
|
|
274
288
|
if (!response.ok) {
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { isCreatorModeVerified } from './creatorMode.js';
|
|
2
|
+
|
|
3
|
+
/** Production hostnames the client is allowed to talk to (HTTPS only). */
export const ALLOWED_HOSTS = ['www.crawlforge.dev', 'crawlforge.dev', 'api.crawlforge.dev'];

// Loopback hosts, accepted only when creator mode is verified.
// `URL#hostname` returns IPv6 literals wrapped in brackets, so the loopback
// address shows up as "[::1]"; the bare "::1" form is kept for completeness.
const LOCALHOST_HOSTS = new Set(['localhost', '127.0.0.1', '::1', '[::1]']);

/**
 * Serialize a parsed URL with trailing slashes stripped from its pathname.
 *
 * @param {URL} parsed - URL to normalize (its pathname is mutated).
 * @returns {string} The serialized URL.
 */
function withoutTrailingSlash(parsed) {
  parsed.pathname = parsed.pathname.replace(/\/+$/, '');
  return parsed.toString();
}

/**
 * Validate an API endpoint URL against the allow-list and normalize it.
 *
 * - Loopback hosts are accepted (any protocol) only when
 *   `isCreatorModeVerified()` returns true.
 * - Every other host must use `https:` and be listed in `ALLOWED_HOSTS`.
 *
 * @param {string} rawUrl - Endpoint URL to validate.
 * @returns {string} The serialized URL with trailing slashes removed from the path.
 * @throws {Error} If the URL cannot be parsed or is not allowed.
 */
export function resolveApiEndpoint(rawUrl) {
  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch {
    throw new Error(`Invalid API endpoint URL: "${rawUrl}"`);
  }

  const rejected = () => new Error(`Refusing to use API endpoint "${rawUrl}" — not in allow-list`);
  const { hostname } = parsed;

  if (LOCALHOST_HOSTS.has(hostname)) {
    if (!isCreatorModeVerified()) {
      throw rejected();
    }
    return withoutTrailingSlash(parsed);
  }

  if (parsed.protocol !== 'https:') {
    throw rejected();
  }

  if (!ALLOWED_HOSTS.includes(hostname)) {
    throw rejected();
  }

  return withoutTrailingSlash(parsed);
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Prometheus metrics — dependency-free implementation.
|
|
3
|
+
*
|
|
4
|
+
* Why no prom-client? CrawlForge is shipped via npm and runs in stdio mode
|
|
5
|
+
* by default. Pulling in prom-client (and its dependency tree) just to
|
|
6
|
+
* expose a handful of counters, gauges and one histogram is overkill. This small implementation
|
|
7
|
+
* conforms to the Prometheus exposition format 0.0.4.
|
|
8
|
+
*
|
|
9
|
+
* Disabled by default. Enable via `CRAWLFORGE_METRICS=true` in HTTP mode.
|
|
10
|
+
*
|
|
11
|
+
* Metrics exposed:
|
|
12
|
+
* - crawlforge_tool_requests_total{tool,outcome}
|
|
13
|
+
* - crawlforge_tool_errors_total{tool,error_class}
|
|
14
|
+
* - crawlforge_tool_duration_ms{tool} (histogram, summed)
|
|
15
|
+
* - crawlforge_credits_consumed_total{tool}
|
|
16
|
+
* - crawlforge_browser_pool_in_use (gauge)
|
|
17
|
+
* - crawlforge_browser_pool_capacity (gauge)
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
// Content type of the Prometheus text exposition format, version 0.0.4.
const CONTENT_TYPE = 'text/plain; version=0.0.4; charset=utf-8';

// Inclusive upper bounds (milliseconds) of the cumulative histogram buckets.
const HISTOGRAM_BUCKETS_MS = Object.freeze([10, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000]);

// Help text per metric name. A Map (rather than a plain object) so that
// metric names such as "constructor" or "toString" never resolve to
// inherited Object.prototype members.
const HELP = new Map([
  ['crawlforge_tool_requests_total', 'Total number of MCP tool invocations'],
  ['crawlforge_tool_errors_total', 'Total number of MCP tool errors'],
  ['crawlforge_tool_duration_ms', 'MCP tool invocation duration in milliseconds'],
  ['crawlforge_credits_consumed_total', 'Total CrawlForge credits consumed'],
  ['crawlforge_browser_pool_in_use', 'Number of browser contexts currently leased from the pool'],
  ['crawlforge_browser_pool_capacity', 'Maximum browser context pool capacity']
]);

/**
 * Help text for a metric; falls back to the metric name when none is registered.
 *
 * @param {string} name - Metric name.
 * @returns {string} Help text for the `# HELP` line.
 */
function describe(name) {
  return HELP.get(name) ?? name;
}

/**
 * Escape a label value for the exposition format (backslash, newline, double quote).
 *
 * @param {string} v - Raw label value.
 * @returns {string} Escaped label value.
 */
function escapeLabel(v) {
  return v.replace(/\\/g, '\\\\').replace(/\n/g, '\\n').replace(/"/g, '\\"');
}

/**
 * Group the entries of a series map by metric name (the part of the series
 * key before the first `{`), in a single pass. Names keep first-seen order
 * and series keep insertion order within each name.
 *
 * @param {Map<string, *>} seriesMap - Map of series key -> value.
 * @returns {Map<string, Array<[string, *]>>} Metric name -> its series entries.
 */
function groupByMetricName(seriesMap) {
  const grouped = new Map();
  for (const entry of seriesMap) {
    const seriesKey = entry[0];
    const brace = seriesKey.indexOf('{');
    const name = brace === -1 ? seriesKey : seriesKey.slice(0, brace);
    let series = grouped.get(name);
    if (!series) {
      series = [];
      grouped.set(name, series);
    }
    series.push(entry);
  }
  return grouped;
}

/**
 * Append the `# HELP` / `# TYPE` preamble for one metric.
 *
 * @param {string[]} lines - Output accumulator (mutated).
 * @param {string} name - Metric name.
 * @param {string} type - Prometheus metric type (`counter`, `gauge`, `histogram`).
 */
function pushHeader(lines, name, type) {
  lines.push(`# HELP ${name} ${describe(name)}`);
  lines.push(`# TYPE ${name} ${type}`);
}

/**
 * Create an in-memory metrics registry that renders the Prometheus text
 * exposition format.
 *
 * @returns {{
 *   contentType: string,
 *   incCounter: (name: string, labels?: object, by?: number) => void,
 *   setGauge: (name: string, labels: object|undefined, value: number) => void,
 *   observeHistogram: (name: string, labels: object|undefined, valueMs: number) => void,
 *   render: () => Promise<string>,
 *   _snapshot: () => {counters: object, gauges: object, histograms: object}
 * }} Registry object; all state is private to this instance.
 */
export function createMetricsRegistry() {
  const counters = new Map();   // name{labels} -> number
  const gauges = new Map();     // name{labels} -> number
  const histograms = new Map(); // name{labels} -> { count, sum, buckets:{le->count} }

  // Build the series key: metric name plus labels sorted by label name, so
  // the same label set always maps to the same series.
  function key(name, labels) {
    const labelStr = Object.entries(labels ?? {})
      .sort(([a], [b]) => a.localeCompare(b))
      .map(([k, v]) => `${k}="${escapeLabel(String(v))}"`)
      .join(',');
    return labelStr ? `${name}{${labelStr}}` : name;
  }

  return {
    contentType: CONTENT_TYPE,

    /** Add `by` (default 1) to a counter series. Non-finite increments are dropped. */
    incCounter(name, labels, by = 1) {
      // A NaN/Infinity/non-number increment would corrupt the series for good.
      if (!Number.isFinite(by)) return;
      const k = key(name, labels);
      counters.set(k, (counters.get(k) ?? 0) + by);
    },

    /** Set a gauge series to `value`. */
    setGauge(name, labels, value) {
      gauges.set(key(name, labels), value);
    },

    /** Record one observation (milliseconds). Non-finite observations are dropped. */
    observeHistogram(name, labels, valueMs) {
      if (!Number.isFinite(valueMs)) return;
      const k = key(name, labels);
      let h = histograms.get(k);
      if (!h) {
        h = { count: 0, sum: 0, buckets: Object.fromEntries(HISTOGRAM_BUCKETS_MS.map(b => [b, 0])) };
        histograms.set(k, h);
      }
      h.count += 1;
      h.sum += valueMs;
      // Buckets are cumulative: every bucket whose bound is >= the value counts it.
      for (const b of HISTOGRAM_BUCKETS_MS) {
        if (valueMs <= b) h.buckets[b] += 1;
      }
    },

    /** Render all metrics as exposition-format text (counters, gauges, then histograms). */
    async render() {
      const lines = [];

      for (const [name, series] of groupByMetricName(counters)) {
        pushHeader(lines, name, 'counter');
        for (const [k, v] of series) lines.push(`${k} ${v}`);
      }

      for (const [name, series] of groupByMetricName(gauges)) {
        pushHeader(lines, name, 'gauge');
        for (const [k, v] of series) lines.push(`${k} ${v}`);
      }

      for (const [name, series] of groupByMetricName(histograms)) {
        pushHeader(lines, name, 'histogram');
        for (const [k, h] of series) {
          // Recover the label list (everything inside { }) so `le` can be appended.
          const baseLabels = k.includes('{') ? k.slice(k.indexOf('{') + 1, -1) : '';
          const sep = baseLabels ? ',' : '';
          const suffix = baseLabels ? `{${baseLabels}}` : '';
          for (const b of HISTOGRAM_BUCKETS_MS) {
            lines.push(`${name}_bucket{${baseLabels}${sep}le="${b}"} ${h.buckets[b]}`);
          }
          lines.push(`${name}_bucket{${baseLabels}${sep}le="+Inf"} ${h.count}`);
          lines.push(`${name}_sum${suffix} ${h.sum}`);
          lines.push(`${name}_count${suffix} ${h.count}`);
        }
      }

      return lines.join('\n') + '\n';
    },

    // Snapshot for tests
    _snapshot() {
      return {
        counters: Object.fromEntries(counters.entries()),
        gauges: Object.fromEntries(gauges.entries()),
        histograms: Object.fromEntries(histograms.entries())
      };
    }
  };
}
|