crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,482 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { SearchProviderFactory } from './adapters/searchProviderFactory.js';
|
|
3
|
+
import { CacheManager } from '../../core/cache/CacheManager.js';
|
|
4
|
+
import { QueryExpander } from './queryExpander.js';
|
|
5
|
+
import { ResultRanker } from './ranking/ResultRanker.js';
|
|
6
|
+
import { ResultDeduplicator } from './ranking/ResultDeduplicator.js';
|
|
7
|
+
import LocalizationManager from '../../core/LocalizationManager.js';
|
|
8
|
+
|
|
9
|
+
// Small field builders shared by the schema below. Each call returns a fresh
// zod schema instance, so the resulting validation rules are identical to
// spelling every field out inline.
const optionalFlag = () => z.boolean().optional();
const flagDefaultingTo = (value) => z.boolean().optional().default(value);
const optionalUnitInterval = () => z.number().min(0).max(1).optional();

/**
 * Input contract for SearchWebTool.execute().
 *
 * Only `query` is required. Fields declared with a default are always present
 * on the parsed result; the nested option objects are omitted unless supplied.
 */
const SearchWebSchema = z.object({
  // Core search parameters
  query: z.string().min(1),
  limit: z.number().min(1).max(100).optional().default(10),
  offset: z.number().min(0).optional().default(0),
  lang: z.string().optional().default('en'),
  safe_search: flagDefaultingTo(true),
  time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional().default('all'),
  site: z.string().optional(),
  file_type: z.string().optional(),

  // Query expansion options
  expand_query: flagDefaultingTo(true),
  expansion_options: z.object({
    enableSynonyms: optionalFlag(),
    enableSpellCheck: optionalFlag(),
    enableStemming: optionalFlag(),
    enablePhraseDetection: optionalFlag(),
    enableBooleanOperators: optionalFlag(),
    maxExpansions: z.number().min(1).max(10).optional()
  }).optional(),

  // Ranking options (each weight is a number in [0, 1])
  enable_ranking: flagDefaultingTo(true),
  ranking_weights: z.object({
    bm25: optionalUnitInterval(),
    semantic: optionalUnitInterval(),
    authority: optionalUnitInterval(),
    freshness: optionalUnitInterval()
  }).optional(),

  // Deduplication options (each threshold is a number in [0, 1])
  enable_deduplication: flagDefaultingTo(true),
  deduplication_thresholds: z.object({
    url: optionalUnitInterval(),
    title: optionalUnitInterval(),
    content: optionalUnitInterval(),
    combined: optionalUnitInterval()
  }).optional(),

  // Output options
  include_ranking_details: flagDefaultingTo(false),
  include_deduplication_details: flagDefaultingTo(false),

  // Localization options
  localization: z.object({
    countryCode: z.string().length(2).optional(),
    language: z.string().optional(),
    timezone: z.string().optional(),
    enableGeoTargeting: z.boolean().default(false),
    customLocation: z.object({
      latitude: z.number().min(-90).max(90),
      longitude: z.number().min(-180).max(180)
    }).optional()
  }).optional()
});
|
|
62
|
+
|
|
63
|
+
/**
 * Web search tool that validates a request, optionally localizes and expands
 * the query, runs it through the configured search adapter, and then
 * post-processes the hits (deduplication, ranking, detail stripping) before
 * returning and caching a single response object.
 */
export class SearchWebTool {
  /**
   * @param {Object} [options]
   * @param {string} [options.provider='auto'] - 'google', 'duckduckgo' or 'auto'.
   * @param {Object} [options.google={}] - Google adapter settings (apiKey, searchEngineId).
   * @param {Object} [options.duckduckgo={}] - DuckDuckGo adapter settings.
   * @param {boolean} [options.cacheEnabled=true] - When false, no response cache is created.
   * @param {number} [options.cacheTTL=3600000] - Cache TTL passed to the cache, ranker and deduplicator.
   * @param {Object} [options.expanderOptions={}] - Passed to QueryExpander.
   * @param {Object} [options.rankingOptions={}] - Merged over the cache settings for ResultRanker.
   * @param {Object} [options.deduplicationOptions={}] - Merged over the cache settings for ResultDeduplicator.
   * @param {boolean} [options.enableGeoBlockingBypass] - Forwarded to LocalizationManager; on unless exactly `false`.
   * @param {boolean} [options.dynamicFingerprinting] - Forwarded to LocalizationManager; on unless exactly `false`.
   * @throws {Error} If the provider is misconfigured or the adapter cannot be created.
   */
  constructor(options = {}) {
    const {
      provider = 'auto',
      google = {},
      duckduckgo = {},
      cacheEnabled = true,
      cacheTTL = 3600000, // 1 hour
      expanderOptions = {},
      rankingOptions = {},
      deduplicationOptions = {}
    } = options;

    // Determine which provider to use
    this.provider = this.determineProvider(provider, { google, duckduckgo });

    // Create the search adapter
    try {
      this.searchAdapter = SearchProviderFactory.createAdapter(this.provider, {
        google,
        duckduckgo
      });
    } catch (error) {
      // Keep the original error reachable through `cause` for debugging.
      throw new Error(
        `Failed to initialize search provider '${this.provider}': ${error.message}`,
        { cause: error }
      );
    }

    this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;

    // Initialize query expander
    this.queryExpander = new QueryExpander(expanderOptions);

    // Initialize ranking and deduplication systems
    this.resultRanker = new ResultRanker({ cacheEnabled, cacheTTL, ...rankingOptions });
    this.resultDeduplicator = new ResultDeduplicator({ cacheEnabled, cacheTTL, ...deduplicationOptions });

    // Initialize localization manager
    this.localizationManager = new LocalizationManager({
      enableGeoBlockingBypass: options.enableGeoBlockingBypass !== false,
      dynamicFingerprinting: options.dynamicFingerprinting !== false
    });
  }

  /**
   * Resolve the provider name to use.
   *
   * @param {string} configuredProvider - Requested provider (case-insensitive).
   *   Non-string values are treated as 'auto' instead of throwing a TypeError.
   * @param {{google?: Object, duckduckgo?: Object}} providerOptions
   * @returns {'google'|'duckduckgo'}
   * @throws {Error} If 'google' is requested without apiKey and searchEngineId.
   */
  determineProvider(configuredProvider, providerOptions) {
    const requested = typeof configuredProvider === 'string'
      ? configuredProvider.toLowerCase()
      : 'auto';
    const googleConfig = providerOptions?.google;
    const hasGoogleCredentials = Boolean(googleConfig?.apiKey && googleConfig?.searchEngineId);

    switch (requested) {
      case 'google':
        if (!hasGoogleCredentials) {
          throw new Error('Google provider requires apiKey and searchEngineId');
        }
        return 'google';

      case 'duckduckgo':
        return 'duckduckgo';

      case 'auto':
      default:
        // Auto mode: prefer Google if credentials available, otherwise use DuckDuckGo
        return hasGoogleCredentials ? 'google' : 'duckduckgo';
    }
  }

  /**
   * Run a search.
   *
   * @param {Object} params - Raw request; validated against SearchWebSchema.
   * @returns {Promise<Object>} Response with `results`, paging info, provider
   *   info, localization info and a `processing` summary. `cached` is true
   *   when the response was served from the cache.
   * @throws {Error} Always wrapped as `Search failed: <message>`; the original
   *   error is attached as `cause`.
   */
  async execute(params) {
    try {
      const validated = SearchWebSchema.parse(params);

      // Apply localization if specified
      let localizedParams = validated;
      // Tracks whether localization actually succeeded, so the response does
      // not claim it was applied after a fallback to the original parameters.
      let localizationApplied = false;
      if (validated.localization) {
        try {
          localizedParams = await this.localizationManager.localizeSearchQuery(
            validated,
            validated.localization.countryCode
          );
          localizationApplied = true;
        } catch (localizationError) {
          console.warn('Localization failed, using original parameters:', localizationError.message);
          // Continue with original parameters
        }
      }

      // Expand query if enabled
      let searchQueries = [localizedParams.query];
      let expandedQueries = [];

      if (localizedParams.expand_query) {
        try {
          const expansion = await this.queryExpander.expandQuery(
            localizedParams.query,
            localizedParams.expansion_options || {}
          );

          // Only accept an array; anything else (e.g. null) would break the
          // `.length` checks further down and fail the whole search.
          if (Array.isArray(expansion)) {
            expandedQueries = expansion;
          }

          // Use the best expanded query as primary, keep original as fallback
          if (expandedQueries.length > 1) {
            searchQueries = expandedQueries;
          }
        } catch (expansionError) {
          console.warn('Query expansion failed, using original query:', expansionError.message);
          // Continue with original query
        }
      }

      // Generate cache key (include expansion and localization info for accurate caching)
      const cacheKey = this.cache ? this.cache.generateKey('search', {
        ...localizedParams,
        expandedQueries: localizedParams.expand_query ? expandedQueries : undefined,
        localization: validated.localization
      }) : null;

      // Check cache
      if (this.cache) {
        const cached = await this.cache.get(cacheKey);
        if (cached) {
          return {
            ...cached,
            cached: true
          };
        }
      }

      // Try searches with expanded queries, starting with the best one
      let bestResults = null;
      let usedQuery = validated.query;
      let searchError = null;

      for (let i = 0; i < searchQueries.length; i++) {
        const candidateQuery = searchQueries[i];
        try {
          // Build search query with modifiers
          let searchQuery = candidateQuery;

          if (validated.site) {
            searchQuery = `site:${validated.site} ${searchQuery}`;
          }

          if (validated.file_type) {
            searchQuery = `filetype:${validated.file_type} ${searchQuery}`;
          }

          // Perform search with localized parameters
          const searchParams = {
            query: searchQuery,
            num: localizedParams.limit,
            start: localizedParams.offset + 1, // Google uses 1-based indexing
            lr: localizedParams.lr || `lang_${localizedParams.lang}`,
            safe: localizedParams.safe_search ? 'active' : 'off',
            dateRestrict: this.getDateRestrict(localizedParams.time_range),
            // Add localization-specific parameters
            ...(localizedParams.headers && { headers: localizedParams.headers }),
            cr: localizedParams.cr, // Country restrict
            uule: localizedParams.uule // Location encoding
          };

          const results = await this.searchAdapter.search(searchParams);

          // Check if we got good results
          if (results.items && results.items.length > 0) {
            bestResults = results;
            usedQuery = candidateQuery;
            break;
          } else if (i === 0) {
            // Save results from first query even if no items (might be the original query)
            bestResults = results;
            usedQuery = candidateQuery;
          }
        } catch (error) {
          searchError = error;
          console.warn(`Search failed for query "${candidateQuery}":`, error.message);

          // If this is the last query and we haven't found results, throw the error
          if (i === searchQueries.length - 1 && !bestResults) {
            throw error;
          }
        }
      }

      if (!bestResults) {
        throw searchError || new Error('All search queries failed');
      }

      // Process and enrich results
      let processedResults = await this.processResults(bestResults);

      // Apply deduplication if enabled
      let deduplicationInfo = null;
      if (validated.enable_deduplication && processedResults.length > 1) {
        const dedupeOptions = validated.deduplication_thresholds
          ? { thresholds: validated.deduplication_thresholds }
          : {};

        const originalCount = processedResults.length;
        processedResults = await this.resultDeduplicator.deduplicateResults(
          processedResults,
          dedupeOptions
        );

        const duplicatesRemoved = originalCount - processedResults.length;
        deduplicationInfo = {
          originalCount,
          finalCount: processedResults.length,
          duplicatesRemoved,
          deduplicationRate: (duplicatesRemoved / originalCount * 100).toFixed(1) + '%'
        };
      }

      // Apply ranking if enabled
      let rankingInfo = null;
      if (validated.enable_ranking && processedResults.length > 1) {
        const rankingOptions = validated.ranking_weights
          ? { weights: validated.ranking_weights }
          : {};

        processedResults = await this.resultRanker.rankResults(
          processedResults,
          validated.query,
          rankingOptions
        );

        rankingInfo = {
          algorithmsUsed: ['bm25', 'semantic', 'authority', 'freshness'],
          // NOTE(review): this reports the ranker instance's configured
          // weights; whether per-request `ranking_weights` are reflected here
          // depends on ResultRanker - confirm.
          weightsApplied: this.resultRanker.options.weights,
          totalResults: processedResults.length
        };
      }

      // Clean up results based on detail level requested
      if (!validated.include_ranking_details) {
        processedResults = processedResults.map(({ rankingDetails: _ranking, ...cleanResult }) => cleanResult);
      }

      if (!validated.include_deduplication_details) {
        processedResults = processedResults.map(({ deduplicationInfo: _dedupe, ...cleanResult }) => cleanResult);
      }

      const wasExpanded = localizedParams.expand_query && expandedQueries.length > 1;

      const response = {
        query: validated.query,
        effective_query: usedQuery !== validated.query ? usedQuery : undefined,
        expanded_queries: wasExpanded ? expandedQueries : undefined,
        results: processedResults,
        total_results: bestResults.searchInformation?.totalResults || 0,
        search_time: bestResults.searchInformation?.searchTime || 0,
        offset: localizedParams.offset,
        limit: localizedParams.limit,
        cached: false,

        // Add provider information
        provider: {
          name: this.provider,
          capabilities: SearchProviderFactory.getProviderCapabilities(this.provider)
        },

        // Add localization information
        localization: validated.localization ? {
          applied: localizationApplied,
          countryCode: validated.localization.countryCode,
          language: localizedParams.lang,
          searchDomain: localizedParams.searchDomain,
          geoTargeting: validated.localization.enableGeoTargeting
        } : null,

        // Add processing information
        processing: {
          ranking: rankingInfo,
          deduplication: deduplicationInfo,
          query_expansion: wasExpanded ? {
            original_query: validated.query,
            expanded_count: expandedQueries.length,
            used_query: usedQuery
          } : null,
          localization_applied: localizationApplied
        }
      };

      // Cache the results
      if (this.cache) {
        await this.cache.set(cacheKey, response);
      }

      return response;
    } catch (error) {
      // Same message as before; the original error stays available as `cause`.
      throw new Error(`Search failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Convert a raw adapter response into the flat result objects returned to
   * callers. Missing string fields become ''.
   *
   * @param {{items?: Array<Object>}} searchResults
   * @returns {Promise<Array<Object>>} Empty array when there are no items.
   */
  async processResults(searchResults) {
    if (!searchResults.items || searchResults.items.length === 0) {
      return [];
    }

    return searchResults.items.map(item => ({
      title: item.title || '',
      link: item.link || '',
      snippet: item.snippet || '',
      displayLink: item.displayLink || '',
      formattedUrl: item.formattedUrl || '',
      htmlSnippet: item.htmlSnippet || '',
      pagemap: this.extractPagemap(item.pagemap),
      metadata: {
        mime: item.mime,
        fileFormat: item.fileFormat,
        cacheId: item.cacheId
      }
    }));
  }

  /**
   * Pick the useful parts (first metatags entry, thumbnail, image) out of a
   * result's pagemap.
   *
   * @param {Object} [pagemap]
   * @returns {Object} Object with optional `metatags`, `thumbnail`, `image`;
   *   `{}` when no pagemap is given.
   */
  extractPagemap(pagemap) {
    if (!pagemap) return {};

    const extracted = {};

    // Extract metatags
    const meta = pagemap.metatags && pagemap.metatags[0];
    if (meta) {
      extracted.metatags = {
        title: meta['og:title'] || meta['twitter:title'] || '',
        description: meta['og:description'] || meta['twitter:description'] || meta.description || '',
        image: meta['og:image'] || meta['twitter:image'] || '',
        author: meta.author || '',
        publishedTime: meta['article:published_time'] || '',
        modifiedTime: meta['article:modified_time'] || ''
      };
    }

    // Extract CSE thumbnail
    const thumbnail = pagemap.cse_thumbnail && pagemap.cse_thumbnail[0];
    if (thumbnail) {
      extracted.thumbnail = {
        src: thumbnail.src,
        width: thumbnail.width,
        height: thumbnail.height
      };
    }

    // Extract CSE image
    if (pagemap.cse_image && pagemap.cse_image[0]) {
      extracted.image = pagemap.cse_image[0].src;
    }

    return extracted;
  }

  /**
   * Map a time range name to the adapter's `dateRestrict` value.
   *
   * @param {string} timeRange - 'day', 'week', 'month', 'year' or 'all'.
   * @returns {string} 'd1', 'w1', 'm1', 'y1', or '' for 'all' and any
   *   unrecognized value.
   */
  getDateRestrict(timeRange) {
    // A Map avoids matching inherited Object.prototype keys such as
    // 'constructor', which a plain-object lookup would return as a function.
    const ranges = new Map([
      ['day', 'd1'],
      ['week', 'w1'],
      ['month', 'm1'],
      ['year', 'y1'],
      ['all', '']
    ]);

    return ranges.get(timeRange) || '';
  }

  /**
   * Expand a query through the QueryExpander, falling back to a simple local
   * expansion if the expander fails.
   *
   * @param {string} query
   * @param {Object} [options={}] - Passed through to the QueryExpander.
   * @returns {Promise<Array<string>>} On fallback: the query itself, a quoted
   *   exact-match form (if it has no quotes yet) and an OR-joined form of its
   *   words longer than two characters (if there is more than one).
   */
  async expandQuery(query, options = {}) {
    // Enhanced query expansion using QueryExpander
    try {
      return await this.queryExpander.expandQuery(query, options);
    } catch (error) {
      console.warn('Advanced query expansion failed, falling back to simple expansion:', error.message);

      // Fallback to simple expansion for backward compatibility
      const expansions = [query];

      // Add quoted exact match
      if (!query.includes('"')) {
        expansions.push(`"${query}"`);
      }

      // Add OR variations for multi-word queries
      const words = query.split(' ').filter(w => w.length > 2);
      if (words.length > 1) {
        expansions.push(words.join(' OR '));
      }

      return expansions;
    }
  }

  /**
   * Generate query suggestions
   * @param {string} query
   * @returns {Promise<Array<string>>} Suggestions, or [] if generation fails.
   */
  async generateSuggestions(query) {
    try {
      // `await` is required so that an asynchronous rejection is handled by
      // the catch below instead of escaping to the caller.
      return await this.queryExpander.generateSuggestions(query);
    } catch (error) {
      console.warn('Suggestion generation failed:', error.message);
      return [];
    }
  }

  /**
   * Collect statistics from the tool's components.
   *
   * @returns {Object} Provider info plus the `getStats()` output of the cache,
   *   query expander, ranker and deduplicator (null for any that is absent).
   */
  getStats() {
    return {
      provider: {
        name: this.provider,
        capabilities: SearchProviderFactory.getProviderCapabilities(this.provider)
      },
      cacheStats: this.cache ? this.cache.getStats() : null,
      queryExpanderStats: this.queryExpander ? this.queryExpander.getStats() : null,
      rankingStats: this.resultRanker ? this.resultRanker.getStats() : null,
      deduplicationStats: this.resultDeduplicator ? this.resultDeduplicator.getStats() : null
    };
  }

  /**
   * Describe the active provider alongside what SearchProviderFactory reports
   * for supported providers.
   *
   * @returns {Object}
   */
  getProviderInfo() {
    return {
      activeProvider: this.provider,
      capabilities: SearchProviderFactory.getProviderCapabilities(this.provider),
      supportedProviders: SearchProviderFactory.getSupportedProviders(),
      allProviders: SearchProviderFactory.compareProviders()
    };
  }
}
|
|
481
|
+
|
|
482
|
+
export default SearchWebTool;
|