crawlforge-mcp-server 3.0.6 → 3.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -63,9 +63,8 @@ const SearchWebSchema = z.object({
63
63
  export class SearchWebTool {
64
64
  constructor(options = {}) {
65
65
  const {
66
- provider = 'auto',
67
- google = {},
68
- duckduckgo = {},
66
+ apiKey,
67
+ apiBaseUrl,
69
68
  cacheEnabled = true,
70
69
  cacheTTL = 3600000, // 1 hour
71
70
  expanderOptions = {},
@@ -73,17 +72,22 @@ export class SearchWebTool {
73
72
  deduplicationOptions = {}
74
73
  } = options;
75
74
 
76
- // Determine which provider to use
77
- this.provider = this.determineProvider(provider, { google, duckduckgo });
78
-
79
- // Create the search adapter
75
+ // Check for Creator Mode - allows search without API key for development/testing
76
+ const isCreatorMode = process.env.CRAWLFORGE_CREATOR_MODE === 'true';
77
+
78
+ if (!apiKey && !isCreatorMode) {
79
+ throw new Error('CrawlForge API key is required for search functionality');
80
+ }
81
+
82
+ // Create the search adapter (CrawlForge API or DuckDuckGo fallback for Creator Mode)
80
83
  try {
81
- this.searchAdapter = SearchProviderFactory.createAdapter(this.provider, {
82
- google,
83
- duckduckgo
84
+ this.searchAdapter = SearchProviderFactory.createAdapter(apiKey, {
85
+ apiBaseUrl,
86
+ creatorMode: isCreatorMode
84
87
  });
88
+ this.isCreatorModeFallback = !apiKey && isCreatorMode;
85
89
  } catch (error) {
86
- throw new Error(`Failed to initialize search provider '${this.provider}': ${error.message}`);
90
+ throw new Error(`Failed to initialize search adapter: ${error.message}`);
87
91
  }
88
92
 
89
93
  this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
@@ -102,27 +106,6 @@ export class SearchWebTool {
102
106
  });
103
107
  }
104
108
 
105
- determineProvider(configuredProvider, providerOptions) {
106
- switch (configuredProvider.toLowerCase()) {
107
- case 'google':
108
- if (!providerOptions.google?.apiKey || !providerOptions.google?.searchEngineId) {
109
- throw new Error('Google provider requires apiKey and searchEngineId');
110
- }
111
- return 'google';
112
-
113
- case 'duckduckgo':
114
- return 'duckduckgo';
115
-
116
- case 'auto':
117
- default:
118
- // Auto mode: prefer Google if credentials available, otherwise use DuckDuckGo
119
- if (providerOptions.google?.apiKey && providerOptions.google?.searchEngineId) {
120
- return 'google';
121
- }
122
- return 'duckduckgo';
123
- }
124
- }
125
-
126
109
  async execute(params) {
127
110
  try {
128
111
  const validated = SearchWebSchema.parse(params);
@@ -308,9 +291,15 @@ export class SearchWebTool {
308
291
  cached: false,
309
292
 
310
293
  // Add provider information
311
- provider: {
312
- name: this.provider,
313
- capabilities: SearchProviderFactory.getProviderCapabilities(this.provider)
294
+ provider: this.isCreatorModeFallback ? {
295
+ name: 'google',
296
+ backend: 'Google Custom Search API (Creator Mode)',
297
+ note: 'Using Google Search API directly. Production users use CrawlForge API.',
298
+ capabilities: SearchProviderFactory.getProviderCapabilities('google')
299
+ } : {
300
+ name: 'crawlforge',
301
+ backend: 'Google Search',
302
+ capabilities: SearchProviderFactory.getProviderCapabilities('crawlforge')
314
303
  },
315
304
 
316
305
  // Add localization information
@@ -458,10 +447,16 @@ export class SearchWebTool {
458
447
 
459
448
  getStats() {
460
449
  return {
461
- provider: {
462
- name: this.provider,
463
- capabilities: SearchProviderFactory.getProviderCapabilities(this.provider)
450
+ provider: this.isCreatorModeFallback ? {
451
+ name: 'google',
452
+ backend: 'Google Custom Search API (Creator Mode)',
453
+ note: 'Using Google Search API directly'
454
+ } : {
455
+ name: 'crawlforge',
456
+ backend: 'Google Search',
457
+ capabilities: SearchProviderFactory.getProviderCapabilities('crawlforge')
464
458
  },
459
+ creatorMode: this.isCreatorModeFallback || false,
465
460
  cacheStats: this.cache ? this.cache.getStats() : null,
466
461
  queryExpanderStats: this.queryExpander ? this.queryExpander.getStats() : null,
467
462
  rankingStats: this.resultRanker ? this.resultRanker.getStats() : null,
@@ -471,12 +466,17 @@ export class SearchWebTool {
471
466
 
472
467
  getProviderInfo() {
473
468
  return {
474
- activeProvider: this.provider,
475
- capabilities: SearchProviderFactory.getProviderCapabilities(this.provider),
469
+ activeProvider: this.isCreatorModeFallback ? 'google' : 'crawlforge',
470
+ backend: this.isCreatorModeFallback
471
+ ? 'Google Custom Search API (Creator Mode)'
472
+ : 'Google Search via CrawlForge API',
473
+ capabilities: SearchProviderFactory.getProviderCapabilities(
474
+ this.isCreatorModeFallback ? 'google' : 'crawlforge'
475
+ ),
476
476
  supportedProviders: SearchProviderFactory.getSupportedProviders(),
477
- allProviders: SearchProviderFactory.compareProviders()
477
+ isCreatorMode: this.isCreatorModeFallback || false
478
478
  };
479
479
  }
480
480
  }
481
481
 
482
- export default SearchWebTool;
482
+ export default SearchWebTool;
@@ -1,500 +0,0 @@
1
- import * as cheerio from 'cheerio';
2
- import { search as ddgSearch, SafeSearchType, SearchTimeType } from 'duck-duck-scrape';
3
-
4
- export class DuckDuckGoSearchAdapter {
5
- constructor(options = {}) {
6
- this.timeout = options.timeout || 30000;
7
- this.maxRetries = options.maxRetries || 3;
8
- this.userAgent = options.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
9
- this.retryDelay = options.retryDelay || 2000; // Increased base delay
10
- this.baseUrl = 'https://html.duckduckgo.com/html/';
11
- }
12
-
13
- async search(params) {
14
- const {
15
- query,
16
- num = 10,
17
- start = 1,
18
- lr,
19
- safe = 'moderate',
20
- dateRestrict
21
- } = params;
22
-
23
- // Try duck-duck-scrape library first (more reliable API access)
24
- try {
25
- const results = await this.searchWithLibrary(query, num, safe, dateRestrict);
26
- if (results.items && results.items.length > 0) {
27
- return results;
28
- }
29
- } catch (libraryError) {
30
- console.warn('DuckDuckGo library search failed:', libraryError.message);
31
- // Check if it's a CAPTCHA/anomaly error
32
- if (libraryError.message.includes('anomaly') || libraryError.message.includes('too quickly')) {
33
- throw new Error(
34
- 'DuckDuckGo is blocking automated requests. ' +
35
- 'To use web search reliably, please configure Google Custom Search API by setting ' +
36
- 'GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID environment variables. ' +
37
- 'See: https://developers.google.com/custom-search/v1/introduction'
38
- );
39
- }
40
- }
41
-
42
- // Fallback to HTML scraping (legacy method)
43
- const offset = (start - 1) * num;
44
-
45
- const formData = new URLSearchParams({
46
- q: query,
47
- b: offset.toString(),
48
- kl: 'us-en',
49
- df: '',
50
- safe: 'moderate'
51
- });
52
-
53
- if (safe === 'active') {
54
- formData.set('safe', 'strict');
55
- } else if (safe === 'off') {
56
- formData.set('safe', 'off');
57
- } else {
58
- formData.set('safe', 'moderate');
59
- }
60
-
61
- if (lr && lr.startsWith('lang_')) {
62
- const lang = lr.replace('lang_', '');
63
- formData.set('kl', this.mapLanguageCode(lang));
64
- }
65
-
66
- if (dateRestrict) {
67
- const timeFilter = this.mapDateRestrict(dateRestrict);
68
- if (timeFilter) {
69
- formData.set('df', timeFilter);
70
- }
71
- }
72
-
73
- let lastError;
74
- for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
75
- try {
76
- // Add delay between attempts to avoid rate limiting
77
- if (attempt > 1) {
78
- await new Promise(resolve =>
79
- setTimeout(resolve, this.retryDelay * Math.pow(2, attempt - 1))
80
- );
81
- }
82
-
83
- const htmlResponse = await this.makeRequest(formData);
84
- return this.parseHtmlResponse(htmlResponse, query, num, start);
85
- } catch (error) {
86
- lastError = error;
87
- // If it's a CAPTCHA error, don't retry - it won't help
88
- if (error.message.includes('CAPTCHA') || error.message.includes('automated requests')) {
89
- throw error;
90
- }
91
- }
92
- }
93
-
94
- throw new Error(`DuckDuckGo search failed after ${this.maxRetries} attempts: ${lastError.message}`);
95
- }
96
-
97
- async searchWithLibrary(query, num, safe, dateRestrict) {
98
- // Map safe search settings
99
- let safeSearch = SafeSearchType.MODERATE;
100
- if (safe === 'active' || safe === 'strict') {
101
- safeSearch = SafeSearchType.STRICT;
102
- } else if (safe === 'off') {
103
- safeSearch = SafeSearchType.OFF;
104
- }
105
-
106
- // Map time filter
107
- let time = undefined;
108
- if (dateRestrict) {
109
- const timeMap = {
110
- 'd1': SearchTimeType.DAY,
111
- 'w1': SearchTimeType.WEEK,
112
- 'm1': SearchTimeType.MONTH,
113
- 'y1': SearchTimeType.YEAR
114
- };
115
- time = timeMap[dateRestrict];
116
- }
117
-
118
- const searchResults = await ddgSearch(query, {
119
- safeSearch,
120
- time,
121
- locale: 'en-us'
122
- });
123
-
124
- // Transform results to match expected format
125
- const items = (searchResults.results || []).slice(0, num).map(result => ({
126
- title: result.title || '',
127
- link: result.url || '',
128
- snippet: result.description || '',
129
- displayLink: this.extractDomain(result.url),
130
- formattedUrl: result.url || '',
131
- htmlSnippet: result.description || '',
132
- pagemap: {
133
- metatags: {
134
- title: result.title || '',
135
- description: result.description || ''
136
- }
137
- },
138
- metadata: {
139
- source: 'duckduckgo_api',
140
- type: 'web_result',
141
- hostname: result.hostname || '',
142
- icon: result.icon || ''
143
- }
144
- }));
145
-
146
- return {
147
- kind: 'duckduckgo#search',
148
- searchInformation: {
149
- searchTime: 0.1,
150
- formattedSearchTime: '0.10',
151
- totalResults: items.length.toString(),
152
- formattedTotalResults: items.length.toLocaleString()
153
- },
154
- items: items
155
- };
156
- }
157
-
158
- async makeRequest(formData) {
159
- const controller = new AbortController();
160
- const timeoutId = setTimeout(() => controller.abort(), this.timeout);
161
-
162
- try {
163
- const response = await fetch(this.baseUrl, {
164
- method: 'POST',
165
- headers: {
166
- 'User-Agent': this.userAgent,
167
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
168
- 'Accept-Language': 'en-US,en;q=0.5',
169
- 'Accept-Encoding': 'gzip, deflate, br',
170
- 'Content-Type': 'application/x-www-form-urlencoded',
171
- 'Origin': 'https://duckduckgo.com',
172
- 'Referer': 'https://duckduckgo.com/',
173
- 'Upgrade-Insecure-Requests': '1',
174
- 'Sec-Fetch-Dest': 'document',
175
- 'Sec-Fetch-Mode': 'navigate',
176
- 'Sec-Fetch-Site': 'same-site'
177
- },
178
- body: formData.toString(),
179
- signal: controller.signal
180
- });
181
-
182
- clearTimeout(timeoutId);
183
-
184
- if (!response.ok) {
185
- throw new Error(`HTTP ${response.status}: ${response.statusText}`);
186
- }
187
-
188
- const html = await response.text();
189
- return html;
190
- } catch (error) {
191
- clearTimeout(timeoutId);
192
-
193
- if (error.name === 'AbortError') {
194
- throw new Error(`Request timeout after ${this.timeout}ms`);
195
- }
196
-
197
- throw error;
198
- }
199
- }
200
-
201
- parseHtmlResponse(html, query, num, start) {
202
- try {
203
- const $ = cheerio.load(html);
204
- const items = [];
205
-
206
- // Check for CAPTCHA challenge (DuckDuckGo bot protection)
207
- const captchaIndicators = [
208
- 'anomaly-modal',
209
- 'Unfortunately, bots use DuckDuckGo too',
210
- 'Select all squares containing a duck',
211
- 'confirm this search was made by a human',
212
- 'challenge-form'
213
- ];
214
-
215
- for (const indicator of captchaIndicators) {
216
- if (html.includes(indicator)) {
217
- throw new Error(
218
- 'DuckDuckGo CAPTCHA detected - automated requests are being blocked. ' +
219
- 'To use web search reliably, please configure Google Custom Search API by setting ' +
220
- 'GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID environment variables. ' +
221
- 'See: https://developers.google.com/custom-search/v1/introduction'
222
- );
223
- }
224
- }
225
-
226
- // Look for search result containers - DuckDuckGo uses various selectors
227
- const resultSelectors = [
228
- '.result', // Primary result class
229
- '.results_links', // Alternative result class
230
- '.web-result', // Another possible class
231
- '.result__body' // Result body container
232
- ];
233
-
234
- let results = $();
235
- for (const selector of resultSelectors) {
236
- results = $(selector);
237
- if (results.length > 0) break;
238
- }
239
-
240
- // If no results found with standard selectors, try more generic approach
241
- if (results.length === 0) {
242
- results = $('div[data-domain]'); // DuckDuckGo sometimes uses data-domain attribute
243
- }
244
-
245
- results.each((index, element) => {
246
- if (items.length >= num) return false; // Stop if we have enough results
247
-
248
- const $result = $(element);
249
-
250
- // Extract title - try multiple selectors
251
- let title = '';
252
- const titleSelectors = [
253
- 'a.result__a',
254
- '.result__title a',
255
- 'h2 a',
256
- '.result-title a',
257
- 'a[href^="http"]'
258
- ];
259
-
260
- for (const selector of titleSelectors) {
261
- const titleElement = $result.find(selector).first();
262
- if (titleElement.length > 0) {
263
- title = titleElement.text().trim();
264
- break;
265
- }
266
- }
267
-
268
- // Extract URL - try multiple selectors
269
- let url = '';
270
- const urlSelectors = [
271
- 'a.result__a',
272
- '.result__title a',
273
- 'h2 a',
274
- '.result-title a',
275
- 'a[href^="http"]'
276
- ];
277
-
278
- for (const selector of urlSelectors) {
279
- const urlElement = $result.find(selector).first();
280
- if (urlElement.length > 0) {
281
- url = urlElement.attr('href') || '';
282
- break;
283
- }
284
- }
285
-
286
- // Extract snippet - try multiple selectors
287
- let snippet = '';
288
- const snippetSelectors = [
289
- 'a.result__snippet',
290
- '.result__snippet',
291
- '.result-snippet',
292
- '.snippet',
293
- '.result__body',
294
- 'span.result__snippet'
295
- ];
296
-
297
- for (const selector of snippetSelectors) {
298
- const snippetElement = $result.find(selector).first();
299
- if (snippetElement.length > 0) {
300
- snippet = snippetElement.text().trim();
301
- break;
302
- }
303
- }
304
-
305
- // If no snippet found, try to get any text content
306
- if (!snippet) {
307
- const allText = $result.text().trim();
308
- // Remove title from text to get snippet
309
- snippet = allText.replace(title, '').trim().substring(0, 300);
310
- }
311
-
312
- // Clean and validate the extracted data
313
- if (title && url && this.isValidUrl(url)) {
314
- items.push({
315
- title: this.cleanText(title),
316
- link: url,
317
- snippet: this.cleanText(snippet),
318
- displayLink: this.extractDomain(url),
319
- formattedUrl: url,
320
- htmlSnippet: this.cleanText(snippet),
321
- pagemap: {
322
- metatags: {
323
- title: this.cleanText(title),
324
- description: this.cleanText(snippet)
325
- }
326
- },
327
- metadata: {
328
- source: 'duckduckgo_html',
329
- type: 'web_result'
330
- }
331
- });
332
- }
333
- });
334
-
335
- // If no results found, provide helpful feedback
336
- if (items.length === 0) {
337
- // Check if there's a "no results" message
338
- const noResultsIndicators = [
339
- 'No results found',
340
- 'no web results',
341
- 'Try searching for'
342
- ];
343
-
344
- let hasNoResults = false;
345
- for (const indicator of noResultsIndicators) {
346
- if (html.toLowerCase().includes(indicator.toLowerCase())) {
347
- hasNoResults = true;
348
- break;
349
- }
350
- }
351
-
352
- if (hasNoResults) {
353
- throw new Error(`No search results found for query: "${query}"`);
354
- } else {
355
- throw new Error('Could not parse search results from DuckDuckGo response');
356
- }
357
- }
358
-
359
- return {
360
- kind: 'duckduckgo#search',
361
- searchInformation: {
362
- searchTime: 0.1,
363
- formattedSearchTime: '0.10',
364
- totalResults: items.length.toString(),
365
- formattedTotalResults: items.length.toLocaleString()
366
- },
367
- items: items
368
- };
369
-
370
- } catch (error) {
371
- if (error.message.includes('No search results found') || error.message.includes('Could not parse')) {
372
- throw error;
373
- }
374
- throw new Error(`Failed to parse DuckDuckGo HTML response: ${error.message}`);
375
- }
376
- }
377
-
378
- isValidUrl(url) {
379
- if (!url) return false;
380
- try {
381
- const urlObj = new URL(url);
382
- return urlObj.protocol === 'http:' || urlObj.protocol === 'https:';
383
- } catch {
384
- return false;
385
- }
386
- }
387
-
388
- cleanText(text) {
389
- if (!text) return '';
390
- // Remove HTML tags, normalize whitespace, and trim
391
- return text
392
- .replace(/<[^>]*>/g, '')
393
- .replace(/\s+/g, ' ')
394
- .replace(/&nbsp;/g, ' ')
395
- .replace(/&amp;/g, '&')
396
- .replace(/&lt;/g, '<')
397
- .replace(/&gt;/g, '>')
398
- .replace(/&quot;/g, '"')
399
- .replace(/&#39;/g, "'")
400
- .trim();
401
- }
402
-
403
- extractDomain(url) {
404
- if (!url) return '';
405
- try {
406
- return new URL(url).hostname;
407
- } catch {
408
- return '';
409
- }
410
- }
411
-
412
- mapLanguageCode(code) {
413
- // Map common language codes to DuckDuckGo's format
414
- const languageMap = {
415
- 'en': 'us-en',
416
- 'es': 'es-es',
417
- 'fr': 'fr-fr',
418
- 'de': 'de-de',
419
- 'it': 'it-it',
420
- 'pt': 'pt-br',
421
- 'ru': 'ru-ru',
422
- 'ja': 'jp-jp',
423
- 'ko': 'kr-kr',
424
- 'zh': 'cn-zh'
425
- };
426
- return languageMap[code] || 'us-en';
427
- }
428
-
429
- mapDateRestrict(dateRestrict) {
430
- // Map Google's dateRestrict format to DuckDuckGo's time filters
431
- const dateMap = {
432
- 'd1': 'd', // past day
433
- 'w1': 'w', // past week
434
- 'm1': 'm', // past month
435
- 'y1': 'y' // past year
436
- };
437
- return dateMap[dateRestrict] || null;
438
- }
439
-
440
- async getSuggestions(query) {
441
- try {
442
- // DuckDuckGo's autocomplete endpoint
443
- const url = `https://duckduckgo.com/ac/?q=${encodeURIComponent(query)}&type=list`;
444
-
445
- const controller = new AbortController();
446
- const timeoutId = setTimeout(() => controller.abort(), 5000); // Shorter timeout for suggestions
447
-
448
- const response = await fetch(url, {
449
- headers: {
450
- 'User-Agent': this.userAgent,
451
- 'Accept': 'application/json',
452
- 'Referer': 'https://duckduckgo.com/'
453
- },
454
- signal: controller.signal
455
- });
456
-
457
- clearTimeout(timeoutId);
458
-
459
- if (!response.ok) {
460
- return [];
461
- }
462
-
463
- const data = await response.json();
464
- return Array.isArray(data) && data.length > 1 ? data[1] : [];
465
- } catch (error) {
466
- // Fail silently for suggestions
467
- return [];
468
- }
469
- }
470
-
471
- async getRelatedSearches(query) {
472
- // DuckDuckGo doesn't provide a direct related searches API
473
- // Return some common query variations
474
- const words = query.split(' ').filter(w => w.length > 2);
475
- const related = [];
476
-
477
- if (words.length > 0) {
478
- related.push(`${query} tutorial`);
479
- related.push(`${query} guide`);
480
- related.push(`${query} examples`);
481
- related.push(`how to ${query}`);
482
- related.push(`${query} best practices`);
483
- }
484
-
485
- return related.slice(0, 5);
486
- }
487
-
488
- async validateApiKey() {
489
- // DuckDuckGo doesn't require API keys, test HTML scraping functionality
490
- try {
491
- const result = await this.search({ query: 'test search', num: 1 });
492
- return result && result.items && result.items.length >= 0; // Even 0 results is valid
493
- } catch (error) {
494
- console.warn('DuckDuckGo validation failed:', error.message);
495
- return false;
496
- }
497
- }
498
- }
499
-
500
- export default DuckDuckGoSearchAdapter;