crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { GoogleSearchAdapter } from './googleSearch.js';
|
|
2
|
+
import { DuckDuckGoSearchAdapter } from './duckduckgoSearch.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Static factory and capability registry for the search backends this module
 * knows about ('google' and 'duckduckgo'). Provider names are matched
 * case-insensitively.
 */
export class SearchProviderFactory {
  /**
   * Instantiate the adapter for the named provider.
   *
   * @param {string} provider - Provider name, e.g. 'google' or 'duckduckgo'.
   * @param {Object} [options] - Per-provider settings, keyed by provider name:
   *   `options.google.apiKey` / `options.google.searchEngineId` (both required
   *   for Google) and `options.duckduckgo` (passed through to the adapter).
   * @returns {GoogleSearchAdapter|DuckDuckGoSearchAdapter} A new adapter instance.
   * @throws {Error} If the provider is unknown (or not a string), or if the
   *   Google credentials are missing.
   */
  static createAdapter(provider, options = {}) {
    // A missing or non-string provider is reported the same way as an unknown
    // one instead of surfacing a TypeError from `.toLowerCase()`.
    if (typeof provider !== 'string') {
      throw new Error(`Unsupported search provider: ${String(provider)}`);
    }

    // The default parameter only covers `undefined`; tolerate an explicit null.
    const settings = options ?? {};

    switch (provider.toLowerCase()) {
      case 'google': {
        const { apiKey, searchEngineId } = settings.google ?? {};
        if (!apiKey || !searchEngineId) {
          throw new Error('Google Search adapter requires apiKey and searchEngineId');
        }
        return new GoogleSearchAdapter(apiKey, searchEngineId);
      }

      case 'duckduckgo':
        return new DuckDuckGoSearchAdapter(settings.duckduckgo || {});

      default:
        throw new Error(`Unsupported search provider: ${provider}`);
    }
  }

  /**
   * List the provider names accepted by {@link SearchProviderFactory.createAdapter}.
   *
   * @returns {Array<string>} A fresh array of lowercase provider names.
   */
  static getSupportedProviders() {
    return ['google', 'duckduckgo'];
  }

  /**
   * Report whether an adapter can be constructed for the provider with the
   * given options. This works by actually attempting the construction, so any
   * error raised by the factory or by the adapter constructor yields `false`.
   *
   * @param {string} provider - Provider name.
   * @param {Object} [options] - Same shape as for `createAdapter`.
   * @returns {boolean} `true` if construction succeeded, otherwise `false`.
   */
  static isProviderAvailable(provider, options = {}) {
    try {
      SearchProviderFactory.createAdapter(provider, options);
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Look up the static capability description of a provider.
   *
   * @param {string} provider - Provider name (case-insensitive).
   * @returns {Object|null} The capability record, or `null` when the provider
   *   is unknown or not a string. A new object is built on every call.
   */
  static getProviderCapabilities(provider) {
    if (typeof provider !== 'string') {
      return null;
    }

    const capabilities = {
      google: {
        requiresApiKey: true,
        supportsPagination: true,
        supportsLanguageFilter: true,
        supportsDateFilter: true,
        supportsSiteFilter: true,
        supportsFileTypeFilter: true,
        supportsSafeSearch: true,
        maxResultsPerRequest: 100,
        rateLimit: '100 queries per day (free tier)',
        features: [
          'Web search',
          'Image search',
          'Exact phrase matching',
          'Boolean operators',
          'Site-specific search',
          'File type filtering',
          'Date range filtering',
          'Language filtering',
          'Safe search',
          'Related searches'
        ]
      },
      duckduckgo: {
        requiresApiKey: false,
        supportsPagination: false, // Limited by API
        supportsLanguageFilter: true,
        supportsDateFilter: true,
        supportsSiteFilter: false, // Not directly supported
        supportsFileTypeFilter: false, // Not directly supported
        supportsSafeSearch: true,
        maxResultsPerRequest: 10, // Limited by instant answer API
        rateLimit: 'No explicit limit (be respectful)',
        features: [
          'Privacy-focused search',
          'Instant answers',
          'No tracking',
          'Language filtering',
          'Date filtering',
          'Safe search',
          'Autocomplete suggestions'
        ]
      }
    };

    // Own-property check: a plain index would resolve inherited names such as
    // 'constructor' or '__proto__' to Object.prototype members instead of null.
    const key = provider.toLowerCase();
    return Object.prototype.hasOwnProperty.call(capabilities, key)
      ? capabilities[key]
      : null;
  }

  /**
   * Build a side-by-side listing of every supported provider.
   *
   * @returns {Array<Object>} One entry per supported provider: its capability
   *   record with an added `name` property.
   */
  static compareProviders() {
    return SearchProviderFactory.getSupportedProviders().map(provider => ({
      name: provider,
      ...SearchProviderFactory.getProviderCapabilities(provider)
    }));
  }
}
|
|
95
|
+
|
|
96
|
+
export default SearchProviderFactory;
|
|
@@ -0,0 +1,543 @@
|
|
|
1
|
+
import { createHash } from 'crypto';
|
|
2
|
+
|
|
3
|
+
/**
 * QueryExpander - rule-based query expansion and refinement.
 *
 * Produces alternative phrasings of a search query using small built-in
 * dictionaries and heuristics: synonym substitution, correction of known
 * misspellings, suffix-stripping "stemming", phrase quoting/splitting and
 * boolean-operator variants.
 */
export class QueryExpander {
  /**
   * @param {Object} [options]
   * @param {boolean} [options.enableSynonyms=true]
   * @param {boolean} [options.enableSpellCheck=true]
   * @param {boolean} [options.enableStemming=true]
   * @param {boolean} [options.enablePhraseDetection=true]
   * @param {boolean} [options.enableBooleanOperators=true]
   * @param {number} [options.maxExpansions=5] - Upper bound on the number of
   *   variations returned by `expandQuery`.
   */
  constructor(options = {}) {
    const supplied = options ?? {};

    // Spread the caller's options FIRST so the normalized values below win.
    // Spreading last would let a present-but-undefined key (for example
    // `{ enableSynonyms: undefined }`) overwrite the intended default.
    this.options = {
      ...supplied,
      enableSynonyms: supplied.enableSynonyms !== false,
      enableSpellCheck: supplied.enableSpellCheck !== false,
      enableStemming: supplied.enableStemming !== false,
      enablePhraseDetection: supplied.enablePhraseDetection !== false,
      enableBooleanOperators: supplied.enableBooleanOperators !== false,
      // `??` rather than `||` so an explicit 0 is respected.
      maxExpansions: supplied.maxExpansions ?? 5
    };

    // Initialize dictionaries
    this.synonymDict = this._initializeSynonyms();
    this.spellingDict = this._initializeSpellingCorrections();
    this.stopWords = this._initializeStopWords();
  }

  /**
   * Main expansion method - runs every enabled strategy and merges the results.
   *
   * The trimmed original query is always the first entry. Duplicates are
   * removed and the list is capped at `maxExpansions`. If a strategy throws,
   * the error is logged and only the trimmed original query is returned.
   *
   * @param {string} query - The original search query
   * @param {Object} [options] - Per-call overrides of the constructor options
   * @returns {Promise<Array<string>>} Expanded query variations; an empty array
   *   when the query contains only whitespace
   * @throws {Error} If `query` is empty or not a string
   */
  async expandQuery(query, options = {}) {
    if (!query || typeof query !== 'string') {
      throw new Error('Query must be a non-empty string');
    }

    // Every strategy works on the trimmed text so stray whitespace cannot end
    // up inside quoted phrases.
    const trimmedQuery = query.trim();
    if (trimmedQuery.length === 0) {
      return [];
    }

    const expansionOptions = { ...this.options, ...options };

    // [enabled flag, strategy] pairs, in the order their output is merged.
    const strategies = [
      [expansionOptions.enableSynonyms, () => this.expandWithSynonyms(trimmedQuery)],
      [expansionOptions.enableSpellCheck, () => this.correctSpelling(trimmedQuery)],
      [expansionOptions.enableStemming, () => this.expandWithStemming(trimmedQuery)],
      [expansionOptions.enablePhraseDetection, () => this.expandWithPhrases(trimmedQuery)],
      [expansionOptions.enableBooleanOperators, () => this.expandWithBooleanOperators(trimmedQuery)]
    ];

    try {
      // Always include the original query; the Set removes duplicates while
      // keeping insertion order.
      const expansions = new Set([trimmedQuery]);

      for (const [enabled, runStrategy] of strategies) {
        if (enabled) {
          for (const variation of runStrategy()) {
            expansions.add(variation);
          }
        }
      }

      return Array.from(expansions)
        .filter(variation => variation.length > 0)
        .slice(0, expansionOptions.maxExpansions);
    } catch (error) {
      console.error('Query expansion error:', error);
      return [trimmedQuery]; // Fallback to original query
    }
  }

  /**
   * Expand query by substituting dictionary synonyms, one word at a time.
   *
   * @param {string} query
   * @returns {Array<string>} Distinct lowercase variations (may be empty)
   */
  expandWithSynonyms(query) {
    const words = this._tokenize(query);
    const normalizedQuery = words.join(' ');
    const expansions = [];

    // Replace individual words with each of their synonyms
    words.forEach((word, index) => {
      const synonyms = this._ownEntry(this.synonymDict, word);
      if (synonyms && synonyms.length > 0) {
        synonyms.forEach(synonym => {
          const newWords = [...words];
          newWords[index] = synonym;
          expansions.push(newWords.join(' '));
        });
      }
    });

    // Per-word candidate groups: the word itself plus up to two synonyms
    const synonymGroups = words.map(word => {
      const synonyms = this._ownEntry(this.synonymDict, word);
      return synonyms && synonyms.length > 0 ? [word, ...synonyms.slice(0, 2)] : [word];
    });

    if (synonymGroups.some(group => group.length > 1)) {
      const combinations = this._generateCombinations(synonymGroups, 3);
      combinations.forEach(combo => {
        const candidate = combo.join(' ');
        // Compare with the NORMALIZED query: tokenizing lowercases and strips
        // punctuation, so comparing with the raw input would re-emit the
        // original query whenever it contained capitals or punctuation.
        if (candidate !== normalizedQuery) {
          expansions.push(candidate);
        }
      });
    }

    // Combinations overlap with the single substitutions above; de-duplicate.
    return [...new Set(expansions)];
  }

  /**
   * Correct known misspellings using the built-in correction table.
   *
   * @param {string} query
   * @returns {Array<string>} A single corrected query, or an empty array when
   *   no word needed correcting
   */
  correctSpelling(query) {
    const words = this._tokenize(query);
    let hasCorrections = false;

    const correctedWords = words.map(word => {
      const correction = this._ownEntry(this.spellingDict, word);
      if (correction && correction !== word) {
        hasCorrections = true;
        return correction;
      }
      return word;
    });

    return hasCorrections ? [correctedWords.join(' ')] : [];
  }

  /**
   * Apply suffix-stripping to the query words for broader matches.
   *
   * @param {string} query
   * @returns {Array<string>} The fully stemmed query (if it differs from the
   *   normalized query) followed by variations with exactly one word stemmed
   */
  expandWithStemming(query) {
    const expansions = [];
    const words = this._tokenize(query);

    // Apply simple stemming rules
    const stemmedWords = words.map(word => this._applyStemming(word));
    const stemmedQuery = stemmedWords.join(' ');

    // Compare against the normalized words rather than the raw query so that
    // case or punctuation differences alone do not count as a stem change.
    if (stemmedQuery !== words.join(' ')) {
      expansions.push(stemmedQuery);
    }

    // Create variations with stemmed and original words
    expansions.push(...this._createStemMixVariations(words, stemmedWords));

    return expansions;
  }

  /**
   * Build phrase-based variations: the whole query quoted, detected
   * capitalized phrases quoted, and 2-/3-word sub-phrases of longer queries.
   *
   * @param {string} query
   * @returns {Array<string>} Distinct variations (may be empty)
   */
  expandWithPhrases(query) {
    const expansions = [];
    const text = query.trim();

    // Add quoted phrases if not already present
    if (!text.includes('"') && text.includes(' ')) {
      expansions.push(`"${text}"`);
    }

    // Quote detected phrases, but only those covering more than half of the
    // query's characters
    this._detectPhrases(text).forEach(phrase => {
      if (phrase.length > text.length * 0.5) {
        expansions.push(`"${phrase}"`);
      }
    });

    // Break down complex queries (more than three words) into sub-phrases
    if (text.split(/\s+/).length > 3) {
      this._generateSubPhrases(text).forEach(phrase => {
        if (phrase !== text) {
          expansions.push(phrase);
        }
      });
    }

    return [...new Set(expansions)];
  }

  /**
   * Add boolean operator variations built from the non-stop-words of the query.
   *
   * @param {string} query
   * @returns {Array<string>} AND / OR / mixed / proximity variants; empty when
   *   fewer than two significant words remain
   */
  expandWithBooleanOperators(query) {
    const words = this._tokenize(query).filter(word => !this.stopWords.has(word));

    if (words.length <= 1) {
      return [];
    }

    const expansions = [
      words.join(' AND '), // AND variations (explicit)
      words.join(' OR ') // OR variations
    ];

    // Mixed AND/OR for longer queries: first two terms required together,
    // or any of the remaining terms
    if (words.length > 2) {
      const primaryTerms = words.slice(0, 2);
      const secondaryTerms = words.slice(2);
      expansions.push(`(${primaryTerms.join(' AND ')}) OR (${secondaryTerms.join(' OR ')})`);
    }

    // Proximity searches (if supported by search engine)
    if (words.length === 2) {
      expansions.push(`"${words.join(' ')}"~5`); // Within 5 words
    }

    return expansions;
  }

  /**
   * Generate query suggestions based on common patterns.
   *
   * Contextual suggestions (for queries that look product- or tech-related)
   * are listed before the generic ones so the 8-item cap does not cut them off.
   *
   * @param {string} query
   * @returns {Array<string>} Up to 8 distinct suggestions
   */
  generateSuggestions(query) {
    const contextual = [];

    if (this._isProductQuery(query)) {
      contextual.push(
        `${query} price`,
        `${query} review`,
        `buy ${query}`,
        `${query} comparison`
      );
    }

    if (this._isTechnicalQuery(query)) {
      contextual.push(
        `${query} API`,
        `${query} configuration`,
        `${query} troubleshooting`,
        `${query} installation`
      );
    }

    // Common search patterns
    const generic = [
      `${query} tutorial`,
      `${query} guide`,
      `${query} examples`,
      `${query} best practices`,
      `${query} documentation`,
      `how to ${query}`,
      `${query} vs alternatives`,
      `${query} tips`,
      `learn ${query}`,
      `${query} review`
    ];

    // De-duplicate ("review" appears in both lists), then limit suggestions
    return [...new Set([...contextual, ...generic])].slice(0, 8);
  }

  /**
   * Look up `key` in a dictionary object, ignoring inherited properties, so
   * that words like "constructor" never resolve to Object.prototype members.
   * @private
   * @param {Object} dictionary
   * @param {string} key
   * @returns {*} The stored value, or `undefined` when there is no own entry
   */
  _ownEntry(dictionary, key) {
    return Object.prototype.hasOwnProperty.call(dictionary, key)
      ? dictionary[key]
      : undefined;
  }

  /**
   * Initialize synonym dictionary (lowercase word -> list of synonyms)
   * @private
   */
  _initializeSynonyms() {
    return {
      // Technology synonyms
      'web': ['website', 'internet', 'online'],
      'website': ['web', 'site', 'portal'],
      'app': ['application', 'software', 'program'],
      'mobile': ['smartphone', 'phone', 'cellular'],
      'computer': ['pc', 'desktop', 'machine'],
      'software': ['program', 'app', 'application'],
      'development': ['programming', 'coding', 'dev'],
      'programming': ['coding', 'development', 'dev'],
      'database': ['db', 'data storage', 'repository'],
      'server': ['host', 'backend', 'service'],
      'api': ['interface', 'endpoint', 'service'],

      // Business synonyms
      'company': ['business', 'organization', 'firm', 'corporation'],
      'business': ['company', 'enterprise', 'firm'],
      'customer': ['client', 'user', 'consumer'],
      'product': ['item', 'good', 'service'],
      'service': ['offering', 'solution', 'product'],

      // General synonyms
      'big': ['large', 'huge', 'massive'],
      'small': ['little', 'tiny', 'mini'],
      'fast': ['quick', 'rapid', 'speedy'],
      'slow': ['sluggish', 'gradual', 'delayed'],
      'good': ['excellent', 'great', 'quality'],
      'bad': ['poor', 'terrible', 'awful'],
      'new': ['recent', 'latest', 'modern'],
      'old': ['vintage', 'classic', 'legacy'],
      'free': ['gratis', 'complimentary', 'no cost'],
      'cheap': ['affordable', 'budget', 'economical'],
      'expensive': ['costly', 'premium', 'high-end']
    };
  }

  /**
   * Initialize common misspelling corrections (misspelling -> correction)
   * @private
   */
  _initializeSpellingCorrections() {
    return {
      // Common technology misspellings
      'javascirpt': 'javascript',
      'javscript': 'javascript',
      'phyton': 'python',
      'pyhton': 'python',
      'databse': 'database',
      'developement': 'development',
      'progamming': 'programming',
      'algoritm': 'algorithm',
      'machien': 'machine',
      'compuer': 'computer',
      'webiste': 'website',
      'appliation': 'application',
      'sofware': 'software',
      'programing': 'programming',

      // Common word misspellings
      'recieve': 'receive',
      'seperate': 'separate',
      'definately': 'definitely',
      'occured': 'occurred',
      'begining': 'beginning',
      'achievment': 'achievement',
      'beleive': 'believe',
      'freind': 'friend',
      'wierd': 'weird',
      'speach': 'speech',
      'realy': 'really',
      'alot': 'a lot',
      'wich': 'which',
      'thier': 'their',
      'lenght': 'length'
    };
  }

  /**
   * Initialize stop words (ignored when building boolean variations)
   * @private
   */
  _initializeStopWords() {
    return new Set([
      'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
      'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
      'to', 'was', 'were', 'will', 'with', 'would', 'could', 'should',
      'this', 'these', 'they', 'them', 'their', 'there', 'then', 'than'
    ]);
  }

  /**
   * Tokenize query into lowercase words. Everything that is not a letter,
   * combining mark, digit, underscore or whitespace acts as a separator.
   * @private
   */
  _tokenize(query) {
    // Unicode-aware character class: ASCII-only `\w` would cut non-ASCII
    // words apart (e.g. "café" -> "caf").
    return query.toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
      .split(/\s+/)
      .filter(word => word.length > 0);
  }

  /**
   * Apply simple stemming rules: the first rule whose suffix matches (and
   * whose minimum word length is met) is applied; the rest are skipped.
   * @private
   */
  _applyStemming(word) {
    const stemRules = [
      { suffix: 'ies', replacement: 'y', minLength: 4 },
      { suffix: 'ied', replacement: 'y', minLength: 4 },
      { suffix: 'ing', replacement: '', minLength: 4 },
      { suffix: 'ed', replacement: '', minLength: 3 },
      { suffix: 's', replacement: '', minLength: 3 },
      { suffix: 'ly', replacement: '', minLength: 4 },
      { suffix: 'er', replacement: '', minLength: 3 },
      { suffix: 'est', replacement: '', minLength: 4 }
    ];

    const lowerWord = word.toLowerCase();
    const matchingRule = stemRules.find(
      rule => lowerWord.length >= rule.minLength && lowerWord.endsWith(rule.suffix)
    );

    return matchingRule
      ? lowerWord.slice(0, -matchingRule.suffix.length) + matchingRule.replacement
      : lowerWord;
  }

  /**
   * Generate combinations from synonym groups. Combination `i` uses the second
   * entry of group `i` (when it has one) and the first entry of every other
   * group. At most min(maxCombinations, groups.length, 3) are produced.
   * @private
   */
  _generateCombinations(groups, maxCombinations) {
    const combinations = [];
    const maxGroups = Math.min(groups.length, 3); // Limit complexity
    const count = Math.min(maxCombinations, maxGroups);

    for (let i = 0; i < count; i++) {
      combinations.push(
        groups.map((group, index) => (index === i && group.length > 1 ? group[1] : group[0]))
      );
    }

    return combinations;
  }

  /**
   * Create mixed variations of stemmed and original words: one variation per
   * word whose stem differs, with only that word replaced.
   * @private
   */
  _createStemMixVariations(originalWords, stemmedWords) {
    const variations = [];

    originalWords.forEach((original, i) => {
      if (original !== stemmedWords[i]) {
        const mixed = [...originalWords];
        mixed[i] = stemmedWords[i];
        variations.push(mixed.join(' '));
      }
    });

    return variations;
  }

  /**
   * Detect potential phrases in query: runs of two or more consecutive words
   * that each start with an uppercase letter and are longer than one character.
   * @private
   */
  _detectPhrases(query) {
    const phrases = [];
    let currentPhrase = [];

    const flush = () => {
      if (currentPhrase.length > 1) {
        phrases.push(currentPhrase.join(' '));
      }
      currentPhrase = [];
    };

    // Split on any whitespace run so repeated spaces do not break a phrase
    query.trim().split(/\s+/).forEach(word => {
      const first = word.charAt(0);
      // A real uppercase letter changes when lowercased; digits and symbols
      // do not, so they no longer count as "capitalized".
      const startsUppercase = first !== first.toLowerCase() && first === first.toUpperCase();

      if (startsUppercase && word.length > 1) {
        currentPhrase.push(word);
      } else {
        flush();
      }
    });

    flush();

    return phrases;
  }

  /**
   * Generate sub-phrases from complex queries: every run of 2 and 3
   * consecutive tokens, in order of starting position.
   * @private
   */
  _generateSubPhrases(query) {
    const words = this._tokenize(query);
    const phrases = [];

    for (let i = 0; i < words.length - 1; i++) {
      // 2-word phrases
      phrases.push(`${words[i]} ${words[i + 1]}`);

      // 3-word phrases
      if (i < words.length - 2) {
        phrases.push(`${words[i]} ${words[i + 1]} ${words[i + 2]}`);
      }
    }

    return phrases;
  }

  /**
   * Check if query is product-related. Keywords are matched as substrings of
   * the lowercased query, not as whole words.
   * @private
   */
  _isProductQuery(query) {
    const productKeywords = ['buy', 'purchase', 'price', 'cost', 'shop', 'store', 'product', 'item'];
    const lowered = query.toLowerCase();
    return productKeywords.some(keyword => lowered.includes(keyword));
  }

  /**
   * Check if query is technical. Keywords are matched as substrings of the
   * lowercased query, not as whole words.
   * @private
   */
  _isTechnicalQuery(query) {
    const techKeywords = ['api', 'code', 'programming', 'development', 'software', 'app', 'web', 'database', 'server'];
    const lowered = query.toLowerCase();
    return techKeywords.some(keyword => lowered.includes(keyword));
  }

  /**
   * Get statistics about expansion capabilities
   *
   * @returns {Object} Dictionary sizes and the enabled/disabled state of each
   *   expansion strategy as configured on this instance
   */
  getStats() {
    return {
      synonymCount: Object.keys(this.synonymDict).length,
      spellingCorrectionCount: Object.keys(this.spellingDict).length,
      stopWordCount: this.stopWords.size,
      capabilities: {
        synonyms: this.options.enableSynonyms,
        spellCheck: this.options.enableSpellCheck,
        stemming: this.options.enableStemming,
        phrases: this.options.enablePhraseDetection,
        booleanOperators: this.options.enableBooleanOperators
      }
    };
  }
}
|
|
542
|
+
|
|
543
|
+
export default QueryExpander;
|