crawlforge-mcp-server 3.0.7 → 3.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CLAUDE.md CHANGED
@@ -91,7 +91,7 @@ Tools are organized in subdirectories by category:
91
91
  - `crawl/` - crawlDeep, mapSite
92
92
  - `extract/` - analyzeContent, extractContent, processDocument, summarizeContent
93
93
  - `research/` - deepResearch
94
- - `search/` - searchWeb and provider adapters (Google, DuckDuckGo)
94
+ - `search/` - searchWeb (uses CrawlForge proxy for Google Search)
95
95
  - `tracking/` - trackChanges
96
96
  - `llmstxt/` - generateLLMsTxt
97
97
 
@@ -103,7 +103,7 @@ Tools are organized in subdirectories by category:
103
103
 
104
104
  **Advanced Tools:**
105
105
 
106
- - search_web (conditional - requires search provider), crawl_deep, map_site
106
+ - search_web, crawl_deep, map_site
107
107
  - extract_content, process_document, summarize_content, analyze_content
108
108
  - batch_scrape, scrape_with_actions, deep_research
109
109
  - track_changes, generate_llms_txt, stealth_mode, localization
@@ -154,12 +154,6 @@ CRAWLFORGE_API_KEY=your_api_key_here
154
154
  # CRAWLFORGE_CREATOR_SECRET=your-uuid-secret
155
155
  # Enables unlimited access for development/testing
156
156
 
157
- # Search Provider (auto, google, duckduckgo)
158
- SEARCH_PROVIDER=auto
159
-
160
- # Google API (optional, only if using Google)
161
- GOOGLE_API_KEY=your_key
162
- GOOGLE_SEARCH_ENGINE_ID=your_id
163
157
 
164
158
  # Performance Settings
165
159
  MAX_WORKERS=10
@@ -424,18 +418,16 @@ export class ToolName {
424
418
 
425
419
  ### Search Provider Architecture
426
420
 
427
- Search providers implement a factory pattern:
421
+ All search requests are proxied through the CrawlForge.dev API:
428
422
 
429
- - `searchProviderFactory.js` selects provider based on config
430
- - Providers implement common interface: `search(query, options)`
431
- - Auto-fallback: Google → DuckDuckGo if Google credentials missing
432
- - Each provider in `src/tools/search/adapters/`
423
+ - `crawlforgeSearch.js` - Proxies through CrawlForge.dev API (Google Search backend)
424
+ - No Google API credentials needed from users
425
+ - Users only need their CrawlForge API key
426
+ - Credit cost: 2 credits per search
433
427
 
434
- ### Browser Management
428
+ Factory in `src/tools/search/adapters/searchProviderFactory.js`
435
429
 
436
- - Playwright used for browser automation (ActionExecutor, ScrapeWithActionsTool)
437
- - Stealth features in StealthBrowserManager
438
- - Always cleanup browsers in error handlers
430
+ ### Browser Management
439
431
  - Context isolation per operation for security
440
432
 
441
433
  ### Memory Management
package/README.md CHANGED
@@ -85,7 +85,7 @@ Or use the MCP plugin in Cursor settings.
85
85
 
86
86
  ### Advanced Tools (2-3 credits)
87
87
  - `scrape_structured` - Extract structured data with CSS selectors
88
- - `search_web` - Search the web with Google/DuckDuckGo
88
+ - `search_web` - Search the web using Google Search API
89
89
  - `summarize_content` - Generate intelligent summaries
90
90
  - `analyze_content` - Comprehensive content analysis
91
91
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "crawlforge-mcp-server",
3
- "version": "3.0.7",
3
+ "version": "3.0.9",
4
4
  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 19 comprehensive web scraping, crawling, and content processing tools.",
5
5
  "main": "server.js",
6
6
  "bin": {
@@ -95,7 +95,6 @@
95
95
  "compromise": "^14.14.4",
96
96
  "diff": "^8.0.2",
97
97
  "dotenv": "^17.2.1",
98
- "duck-duck-scrape": "^2.2.7",
99
98
  "franc": "^6.2.0",
100
99
  "isomorphic-dompurify": "^2.26.0",
101
100
  "jsdom": "^26.1.0",
package/server.js CHANGED
@@ -49,7 +49,7 @@ import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
49
49
  import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
50
50
  import { LocalizationManager } from "./src/core/LocalizationManager.js";
51
51
  import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
52
- import { config, validateConfig, isSearchConfigured, getToolConfig, getActiveSearchProvider } from "./src/constants/config.js";
52
+ import { config, validateConfig, getToolConfig } from "./src/constants/config.js";
53
53
  // Authentication Manager
54
54
  import AuthManager from "./src/core/AuthManager.js";
55
55
 
@@ -160,11 +160,8 @@ function withAuth(toolName, handler) {
160
160
  };
161
161
  }
162
162
 
163
- // Initialize tools
164
- let searchWebTool = null;
165
- if (isSearchConfigured()) {
166
- searchWebTool = new SearchWebTool(getToolConfig('search_web'));
167
- }
163
+ // Initialize Search Web Tool - always available with CrawlForge API key
164
+ const searchWebTool = new SearchWebTool(getToolConfig("search_web"));
168
165
  const crawlDeepTool = new CrawlDeepTool(getToolConfig('crawl_deep'));
169
166
  const mapSiteTool = new MapSiteTool(getToolConfig('map_site'));
170
167
 
@@ -946,60 +943,48 @@ server.registerTool("scrape_structured", {
946
943
  }));
947
944
 
948
945
  // Tool: search_web - Web search with configurable providers
949
- if (searchWebTool) {
950
- const activeProvider = getActiveSearchProvider();
951
- const providerName = activeProvider === 'google' ? 'Google Custom Search API' :
952
- activeProvider === 'duckduckgo' ? 'DuckDuckGo' : 'Auto-selected provider';
953
-
954
- server.registerTool("search_web", {
955
- description: `Search the web using ${providerName}`,
956
- inputSchema: {
957
- query: z.string(),
958
- limit: z.number().min(1).max(100).optional(),
959
- offset: z.number().min(0).optional(),
960
- lang: z.string().optional(),
961
- safe_search: z.boolean().optional(),
962
- time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional(),
963
- site: z.string().optional(),
964
- file_type: z.string().optional()
965
- }
966
- }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
967
- try {
968
- if (!query) {
969
- return {
970
- content: [{
971
- type: "text",
972
- text: "Query parameter is required"
973
- }],
974
- isError: true
975
- };
976
- }
977
-
978
- const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type });
979
- return {
980
- content: [{
981
- type: "text",
982
- text: JSON.stringify(result, null, 2)
983
- }]
984
- };
985
- } catch (error) {
946
+ // Tool: search_web - Search the web using Google Search via CrawlForge proxy
947
+ server.registerTool("search_web", {
948
+ description: "Search the web using Google Search API (proxied through CrawlForge)",
949
+ inputSchema: {
950
+ query: z.string(),
951
+ limit: z.number().min(1).max(100).optional(),
952
+ offset: z.number().min(0).optional(),
953
+ lang: z.string().optional(),
954
+ safe_search: z.boolean().optional(),
955
+ time_range: z.enum(["day", "week", "month", "year", "all"]).optional(),
956
+ site: z.string().optional(),
957
+ file_type: z.string().optional()
958
+ }
959
+ }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
960
+ try {
961
+ if (!query) {
986
962
  return {
987
963
  content: [{
988
964
  type: "text",
989
- text: `Search failed: ${error.message}`
965
+ text: "Query parameter is required"
990
966
  }],
991
967
  isError: true
992
968
  };
993
969
  }
994
- }));
995
- } else {
996
- const activeProvider = getActiveSearchProvider();
997
- if (activeProvider === 'google') {
998
- console.error("Warning: search_web tool not configured. Set GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID to enable Google search.");
999
- } else {
1000
- console.error("Warning: search_web tool initialization failed. Check your SEARCH_PROVIDER configuration.");
970
+
971
+ const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type });
972
+ return {
973
+ content: [{
974
+ type: "text",
975
+ text: JSON.stringify(result, null, 2)
976
+ }]
977
+ };
978
+ } catch (error) {
979
+ return {
980
+ content: [{
981
+ type: "text",
982
+ text: `Search failed: ${error.message}`
983
+ }],
984
+ isError: true
985
+ };
1001
986
  }
1002
- }
987
+ }));
1003
988
 
1004
989
  // Tool: crawl_deep - Deep crawl websites with BFS algorithm
1005
990
  server.registerTool("crawl_deep", {
@@ -1859,34 +1844,19 @@ async function runServer() {
1859
1844
  console.error("CrawlForge MCP Server v3.0 running on stdio");
1860
1845
  console.error(`Environment: ${config.server.nodeEnv}`);
1861
1846
 
1862
- if (isSearchConfigured()) {
1863
- const activeProvider = getActiveSearchProvider();
1864
- console.error(`Search enabled: ${isSearchConfigured()} (provider: ${activeProvider})`);
1865
- } else {
1866
- console.error(`Search enabled: ${isSearchConfigured()}`);
1867
- }
1847
+ console.error("Search enabled: true (via CrawlForge proxy)");
1868
1848
 
1869
- const baseTools = 'fetch_url, extract_text, extract_links, extract_metadata, scrape_structured, crawl_deep, map_site';
1870
- const searchTool = isSearchConfigured() ? ', search_web' : '';
1871
- const phase3Tools = ', extract_content, process_document, summarize_content, analyze_content';
1872
- const wave2Tools = ', batch_scrape, scrape_with_actions';
1873
- const researchTools = ', deep_research';
1874
- const trackingTools = ', track_changes';
1875
- const llmsTxtTools = ', generate_llms_txt';
1876
- const wave3Tools = ', stealth_mode, localization';
1849
+ const baseTools = "fetch_url, extract_text, extract_links, extract_metadata, scrape_structured, crawl_deep, map_site";
1850
+ const searchTool = ", search_web";
1851
+ const phase3Tools = ", extract_content, process_document, summarize_content, analyze_content";
1852
+ const wave2Tools = ", batch_scrape, scrape_with_actions";
1853
+ const researchTools = ", deep_research";
1854
+ const trackingTools = ", track_changes";
1855
+ const llmsTxtTools = ", generate_llms_txt";
1856
+ const wave3Tools = ", stealth_mode, localization";
1877
1857
  console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}`);
1878
1858
 
1879
- // Start memory monitoring in development
1880
- if (config.server.nodeEnv === "development") {
1881
- memoryMonitor.start();
1882
- console.error("Memory monitoring started");
1883
- }
1884
- }
1885
1859
 
1886
- runServer().catch((error) => {
1887
- console.error("Server error:", error);
1888
- process.exit(1);
1889
- });
1890
1860
  // === MEMORY LEAK PREVENTION ===
1891
1861
  // Add graceful shutdown handling to prevent memory leaks
1892
1862
 
@@ -1978,3 +1948,15 @@ if (config.server.nodeEnv === 'development') {
1978
1948
  }
1979
1949
  }, 60000); // Check every minute
1980
1950
  }
1951
+
1952
+ // Start memory monitoring in development
1953
+ if (config.server.nodeEnv === "development") {
1954
+ memoryMonitor.start();
1955
+ console.error("Memory monitoring started");
1956
+ }
1957
+ }
1958
+
1959
+ runServer().catch((error) => {
1960
+ console.error("Server error:", error);
1961
+ process.exit(1);
1962
+ });
package/src/constants/config.js CHANGED
@@ -8,23 +8,10 @@ const __dirname = dirname(__filename);
8
8
  dotenv.config({ path: join(__dirname, '../../.env'), quiet: true });
9
9
 
10
10
  export const config = {
11
- // Search Provider Configuration
12
- search: {
13
- provider: process.env.SEARCH_PROVIDER || 'auto', // 'google', 'duckduckgo', or 'auto'
14
-
15
- // Google Search API
16
- google: {
17
- apiKey: process.env.GOOGLE_API_KEY || '',
18
- searchEngineId: process.env.GOOGLE_SEARCH_ENGINE_ID || ''
19
- },
20
-
21
- // DuckDuckGo Configuration
22
- duckduckgo: {
23
- timeout: parseInt(process.env.DUCKDUCKGO_TIMEOUT || '30000'),
24
- maxRetries: parseInt(process.env.DUCKDUCKGO_MAX_RETRIES || '3'),
25
- retryDelay: parseInt(process.env.DUCKDUCKGO_RETRY_DELAY || '1000'),
26
- userAgent: process.env.DUCKDUCKGO_USER_AGENT || process.env.USER_AGENT || 'CrawlForge/1.0'
27
- }
11
+ // CrawlForge API Configuration
12
+ crawlforge: {
13
+ apiKey: process.env.CRAWLFORGE_API_KEY || '',
14
+ apiBaseUrl: process.env.CRAWLFORGE_API_URL || 'https://api.crawlforge.dev'
28
15
  },
29
16
 
30
17
  // Performance
@@ -286,30 +273,6 @@ export const config = {
286
273
  export function validateConfig() {
287
274
  const errors = [];
288
275
 
289
- // Check search provider configuration
290
- const provider = getActiveSearchProvider();
291
-
292
- if (config.server.nodeEnv === 'production') {
293
- if (provider === 'google') {
294
- if (!config.search.google.apiKey) {
295
- errors.push('GOOGLE_API_KEY is required when using Google search provider in production');
296
- }
297
- if (!config.search.google.searchEngineId) {
298
- errors.push('GOOGLE_SEARCH_ENGINE_ID is required when using Google search provider in production');
299
- }
300
- }
301
-
302
- if (!isSearchConfigured()) {
303
- errors.push('Search provider is not properly configured');
304
- }
305
- }
306
-
307
- // Validate search provider setting
308
- const validProviders = ['google', 'duckduckgo', 'auto'];
309
- if (!validProviders.includes(config.search.provider.toLowerCase())) {
310
- errors.push(`Invalid SEARCH_PROVIDER value. Must be one of: ${validProviders.join(', ')}`);
311
- }
312
-
313
276
  // Validate numeric ranges
314
277
  if (config.crawling.maxDepth > 10) {
315
278
  errors.push('MAX_CRAWL_DEPTH should not exceed 10 for performance reasons');
@@ -330,60 +293,12 @@ export function validateConfig() {
330
293
  return errors;
331
294
  }
332
295
 
333
- // Check if search is properly configured
334
- export function isSearchConfigured() {
335
- const provider = getActiveSearchProvider();
336
-
337
- switch (provider) {
338
- case 'google':
339
- return !!(config.search.google.apiKey && config.search.google.searchEngineId);
340
- case 'duckduckgo':
341
- return true; // DuckDuckGo doesn't require API credentials
342
- default:
343
- return false;
344
- }
345
- }
346
-
347
- // Get the active search provider based on configuration and availability
348
- export function getActiveSearchProvider() {
349
- const configuredProvider = config.search.provider.toLowerCase();
350
-
351
- switch (configuredProvider) {
352
- case 'google':
353
- return 'google';
354
- case 'duckduckgo':
355
- return 'duckduckgo';
356
- case 'auto':
357
- default:
358
- // Auto mode: prefer Google if credentials available, otherwise use DuckDuckGo
359
- if (config.search.google.apiKey && config.search.google.searchEngineId) {
360
- return 'google';
361
- }
362
- return 'duckduckgo';
363
- }
364
- }
365
-
366
296
  // Get configuration for a specific tool
367
297
  export function getToolConfig(toolName) {
368
- const provider = getActiveSearchProvider();
369
-
370
298
  const toolConfigs = {
371
299
  search_web: {
372
- provider: provider,
373
-
374
- // Google-specific configuration
375
- google: {
376
- apiKey: config.search.google.apiKey,
377
- searchEngineId: config.search.google.searchEngineId
378
- },
379
-
380
- // DuckDuckGo-specific configuration
381
- duckduckgo: {
382
- timeout: config.search.duckduckgo.timeout,
383
- maxRetries: config.search.duckduckgo.maxRetries,
384
- retryDelay: config.search.duckduckgo.retryDelay,
385
- userAgent: config.search.duckduckgo.userAgent
386
- },
300
+ apiKey: config.crawlforge.apiKey,
301
+ apiBaseUrl: config.crawlforge.apiBaseUrl,
387
302
 
388
303
  // Common configuration
389
304
  cacheEnabled: config.performance.cacheEnableDisk,
@@ -593,7 +508,8 @@ export function validateLocalizationConfig() {
593
508
  if (localizationConfig.translation.enabled) {
594
509
  const validProviders = ['google', 'azure', 'libre'];
595
510
  if (!validProviders.includes(localizationConfig.translation.defaultProvider)) {
596
- errors.push(`TRANSLATION_PROVIDER must be one of: ${validProviders.join(', ')}`);
511
+ const providersString = validProviders.join(', ');
512
+ errors.push('TRANSLATION_PROVIDER must be one of: ' + providersString);
597
513
  }
598
514
  }
599
515
 
@@ -612,4 +528,4 @@ export function validateLocalizationConfig() {
612
528
  return errors;
613
529
  }
614
530
 
615
- export default config;
531
+ export default config;
@@ -0,0 +1,107 @@
1
+ /**
2
+ * CrawlForge Search Adapter
3
+ *
4
+ * Proxies search requests through CrawlForge.dev API which uses Google Search.
5
+ * Users only need their CrawlForge API key - no Google credentials required.
6
+ *
7
+ * Credit Cost: 2 credits per search
8
+ */
9
+
10
+ export class CrawlForgeSearchAdapter {
11
+ constructor(apiKey, apiBaseUrl = 'https://api.crawlforge.dev') {
12
+ if (!apiKey) {
13
+ throw new Error('CrawlForge API key is required for search functionality');
14
+ }
15
+
16
+ this.apiKey = apiKey;
17
+ this.apiBaseUrl = apiBaseUrl;
18
+ }
19
+
20
+ /**
21
+ * Perform a web search via CrawlForge API
22
+ * @param {Object} params - Search parameters
23
+ * @param {string} params.query - Search query
24
+ * @param {number} params.num - Number of results
25
+ * @param {number} params.start - Starting position
26
+ * @param {string} params.lr - Language restriction
27
+ * @param {string} params.safe - Safe search setting
28
+ * @param {string} params.dateRestrict - Date restriction
29
+ * @param {string} params.cr - Country restriction
30
+ * @param {string} params.uule - Location encoding
31
+ * @returns {Promise<Object>} Search results in Google Search API format
32
+ */
33
+ async search(params) {
34
+ try {
35
+ const response = await fetch(`${this.apiBaseUrl}/api/v1/search`, {
36
+ method: 'POST',
37
+ headers: {
38
+ 'Content-Type': 'application/json',
39
+ 'X-API-Key': this.apiKey
40
+ },
41
+ body: JSON.stringify({
42
+ query: params.query,
43
+ num: params.num || 10,
44
+ start: params.start || 1,
45
+ lr: params.lr,
46
+ safe: params.safe,
47
+ dateRestrict: params.dateRestrict,
48
+ cr: params.cr,
49
+ uule: params.uule,
50
+ // Forward any additional localization headers
51
+ headers: params.headers
52
+ })
53
+ });
54
+
55
+ if (!response.ok) {
56
+ let errorMessage = 'Search request failed';
57
+
58
+ try {
59
+ const errorData = await response.json();
60
+ errorMessage = errorData.message || errorData.error || errorMessage;
61
+
62
+ // Handle specific error cases
63
+ if (response.status === 401) {
64
+ errorMessage = 'Invalid API key. Please check your CrawlForge API key.';
65
+ } else if (response.status === 402) {
66
+ errorMessage = 'Insufficient credits. Please upgrade your plan at https://www.crawlforge.dev/pricing';
67
+ } else if (response.status === 429) {
68
+ errorMessage = 'Rate limit exceeded. Please try again later.';
69
+ }
70
+ } catch (parseError) {
71
+ // If we can't parse the error response, use the status text
72
+ errorMessage = `Search failed with status ${response.status}: ${response.statusText}`;
73
+ }
74
+
75
+ throw new Error(errorMessage);
76
+ }
77
+
78
+ const data = await response.json();
79
+
80
+ // Validate response format
81
+ if (!data || typeof data !== 'object') {
82
+ throw new Error('Invalid response format from search API');
83
+ }
84
+
85
+ // Return data in Google Search API compatible format
86
+ return {
87
+ items: data.items || [],
88
+ searchInformation: data.searchInformation || {
89
+ totalResults: '0',
90
+ searchTime: 0
91
+ },
92
+ queries: data.queries || {},
93
+ context: data.context || {}
94
+ };
95
+ } catch (error) {
96
+ // Network errors or fetch failures
97
+ if (error.name === 'TypeError' || error.message.includes('fetch')) {
98
+ throw new Error(`Network error connecting to CrawlForge API: ${error.message}`);
99
+ }
100
+
101
+ // Re-throw our formatted errors
102
+ throw error;
103
+ }
104
+ }
105
+ }
106
+
107
+ export default CrawlForgeSearchAdapter;