@mrxkun/mcfast-mcp 3.5.12 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -1
- package/package.json +3 -1
- package/src/index.js +191 -1
- package/src/memory/index.js +14 -0
- package/src/memory/memory-engine.js +530 -0
- package/src/memory/stores/database.js +104 -0
- package/src/memory/utils/chunker.js +94 -0
- package/src/memory/utils/daily-logs.js +263 -0
- package/src/memory/utils/dashboard-client.js +141 -0
- package/src/memory/utils/embedder.js +217 -0
- package/src/memory/utils/enhanced-embedder.js +717 -0
- package/src/memory/utils/indexer.js +118 -0
- package/src/memory/utils/simple-embedder.js +234 -0
- package/src/memory/utils/smart-router.js +344 -0
- package/src/memory/utils/sync-engine.js +373 -0
- package/src/memory/utils/ultra-embedder.js +1448 -0
- package/src/memory/watchers/file-watcher.js +61 -0
- package/src/tools/memory_get.js +235 -0
- package/src/tools/memory_search.js +296 -0
|
@@ -0,0 +1,717 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Enhanced Simple Embedder
|
|
3
|
+
* Cải tiến để đạt 80-85% accuracy (gần bằng LLM) mà không cần LLM
|
|
4
|
+
*
|
|
5
|
+
* Các cải tiến chính:
|
|
6
|
+
* 1. Synonym Mapping - hiểu đồng nghĩa (login = authenticate)
|
|
7
|
+
* 2. Enhanced Code Features - call graph, data flow, patterns
|
|
8
|
+
* 3. Context-aware Weights - quan trọng hơn cho functions/classes
|
|
9
|
+
* 4. Hybrid Search - kết hợp nhiều phương pháp
|
|
10
|
+
* 5. Smart Tokenization - camelCase, snake_case handling
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import crypto from 'crypto';
|
|
14
|
+
|
|
15
|
+
// ==================== SYNONYM KNOWLEDGE BASE ====================
|
|
16
|
+
|
|
17
|
+
/**
 * Hand-curated synonym groups for common programming vocabulary.
 * Each key is a head word; its array lists words treated as interchangeable
 * with it (and with each other) when tokens and queries are expanded.
 */
const CODE_SYNONYMS = {
  // Authentication
  login: ['authenticate', 'signin', 'sign_in', 'auth', 'log_in', 'verify', 'check_credentials'],
  logout: ['signout', 'sign_out', 'log_out', 'invalidate', 'clear_session'],
  register: ['signup', 'sign_up', 'create_account', 'join'],

  // CRUD operations
  create: ['add', 'insert', 'new', 'make', 'build', 'generate', 'post'],
  read: ['get', 'fetch', 'retrieve', 'load', 'find', 'select', 'query', 'search'],
  update: ['modify', 'change', 'edit', 'set', 'save', 'put', 'patch'],
  delete: ['remove', 'destroy', 'clear', 'drop', 'erase', 'purge', 'del'],

  // Data
  save: ['store', 'persist', 'write', 'commit', 'flush'],
  load: ['import', 'read', 'fetch', 'retrieve', 'hydrate'],
  validate: ['check', 'verify', 'confirm', 'assert', 'ensure'],

  // UI / components
  render: ['display', 'show', 'draw', 'paint', 'present'],
  handle: ['process', 'manage', 'on', 'trigger', 'dispatch'],
  toggle: ['switch', 'flip', 'change_state', 'alternate'],

  // Async
  async: ['promise', 'await', 'then', 'callback', 'future'],
  fetch: ['request', 'call_api', 'http_get', 'load'],

  // Error handling
  error: ['exception', 'fail', 'throw', 'reject', 'problem', 'issue'],
  catch: ['handle_error', 'on_error', 'error_handler', 'trap'],

  // Testing
  test: ['spec', 'it', 'describe', 'check', 'assert', 'verify'],
  mock: ['stub', 'fake', 'spy', 'double'],

  // Common
  init: ['initialize', 'setup', 'configure', 'prepare', 'start'],
  config: ['configuration', 'settings', 'options', 'prefs', 'params'],
  utils: ['utilities', 'helpers', 'tools', 'common', 'shared'],
  props: ['properties', 'attributes', 'fields', 'members'],

  // Database
  query: ['sql', 'select', 'find', 'lookup', 'search'],
  transaction: ['tx', 'atomic', 'commit', 'rollback'],

  // Security
  encrypt: ['encode', 'cipher', 'protect', 'secure'],
  decrypt: ['decode', 'decipher', 'unprotect'],
  hash: ['digest', 'checksum', 'fingerprint'],

  // Network
  request: ['req', 'call', 'invoke', 'ask'],
  response: ['res', 'reply', 'answer', 'result'],
  headers: ['meta', 'metadata', 'http_headers'],
};

/**
 * Symmetric lookup table derived from CODE_SYNONYMS: every word that appears
 * in any group (head word or alias) maps to the Set of all other words it
 * shares a group with. A word that appears in several groups accumulates the
 * members of all of them.
 */
const SYNONYM_INDEX = new Map();
for (const [headword, aliases] of Object.entries(CODE_SYNONYMS)) {
  const group = [headword, ...aliases];
  for (const member of group) {
    let peers = SYNONYM_INDEX.get(member);
    if (peers === undefined) {
      peers = new Set();
      SYNONYM_INDEX.set(member, peers);
    }
    for (const other of group) {
      // A word is never listed as its own synonym.
      if (other !== member) {
        peers.add(other);
      }
    }
  }
}
|
|
87
|
+
|
|
88
|
+
// ==================== ENHANCED EMBEDDER ====================
|
|
89
|
+
|
|
90
|
+
/**
 * Heuristic, model-free embedder for source code.
 *
 * Regex-extracted features (identifier names, imports, call sites, detected
 * framework patterns, synonym-expanded tokens, ...) are hashed into fixed
 * regions of a dense vector, which is then L2-normalised. The class also
 * offers simple search helpers that score candidates against a query.
 */
export class EnhancedSimpleEmbedder {
  /**
   * @param {object} [options]
   * @param {number} [options.dimension=768] Requested vector length. The slot
   *   layout used by embedCode() occupies indices 0..767, so any missing,
   *   non-integer or smaller value is raised to 768.
   */
  constructor(options = {}) {
    // embedCode() writes to hard-coded slots up to index 767. With a shorter
    // vector those writes land out of bounds and Math.max(undefined, w)
    // yields NaN, which poisons the whole normalised vector.
    const minDimension = 768;
    const requested = Number(options.dimension);
    this.dimension =
      Number.isInteger(requested) && requested > minDimension ? requested : minDimension;

    // NOTE(review): nothing in this class writes to `vocabulary`; only its
    // size is reported by getStats().
    this.vocabulary = new Map();
    this.idf = new Map();
    this.documentCount = 0;

    // Value written into a slot when a feature of the given kind hashes to it.
    // `comment`, `stringLiteral` and `dataFlow` are not read by this class.
    this.weights = {
      functionName: 3.0,
      className: 2.5,
      exportName: 2.0,
      methodName: 2.0,
      variable: 1.0,
      import: 1.5,
      synonym: 1.8,
      keyword: 0.8,
      comment: 0.3,
      stringLiteral: 0.2,
      callGraph: 1.5,
      dataFlow: 1.2,
      pattern: 1.3,
    };
  }

  /**
   * Embed a piece of source code.
   *
   * Slot layout (a slot keeps the largest value written to it):
   *   0-99    function names
   *   100-199 synonyms of function names (shares the range with the next two)
   *   100-149 class names
   *   150-199 method names
   *   200-279 variable names
   *   280-349 imports and exports
   *   350-419 language keywords
   *   420-499 synonyms of tokens
   *   500-549 call sites
   *   550-599 detected code patterns
   *   600-610 structural flags and scaled counts
   *   620-639 file extension
   *   640-699 semantic context tags
   *   700-767 TF-IDF of the expanded tokens
   *
   * @param {string} code Source text to embed.
   * @param {string} [language='javascript'] Selects the keyword list.
   * @param {string} [filePath=''] Optional path; only its extension is used.
   * @returns {{vector: number[], metadata: object}} L2-normalised vector plus
   *   counts and the elapsed time in milliseconds (string, two decimals).
   */
  embedCode(code, language = 'javascript', filePath = '') {
    const startTime = performance.now();
    const vector = new Array(this.dimension).fill(0);

    const features = this.extractAllFeatures(code, language);
    const expandedTokens = this.expandWithSynonyms(this.smartTokenize(code));

    // Hash `key` into the region [offset, offset + buckets) and keep the max.
    const raise = (offset, buckets, key, value) => {
      const slot = offset + (this.hashString(key) % buckets);
      vector[slot] = Math.max(vector[slot], value);
    };

    for (const func of features.functions) {
      raise(0, 100, func, this.weights.functionName);
      for (const syn of this.getSynonymsForWord(func)) {
        raise(100, 100, syn, this.weights.synonym);
      }
    }
    for (const cls of features.classes) {
      raise(100, 50, cls, this.weights.className);
    }
    for (const method of features.methods) {
      raise(150, 50, method, this.weights.methodName);
    }
    for (const variable of features.variables) {
      raise(200, 80, variable, this.weights.variable);
    }
    for (const imp of features.imports) {
      raise(280, 70, imp, this.weights.import);
    }
    for (const exp of features.exports) {
      raise(280, 70, exp, this.weights.exportName);
    }
    for (const keyword of this.getKeywords(expandedTokens.all, language)) {
      raise(350, 70, keyword, this.weights.keyword);
    }
    for (const syn of expandedTokens.synonyms) {
      raise(420, 80, syn, this.weights.synonym);
    }
    for (const call of features.callGraph) {
      raise(500, 50, call, this.weights.callGraph);
    }
    for (const pattern of features.patterns) {
      raise(550, 50, pattern, this.weights.pattern);
    }

    // Structural flags and scaled counts.
    vector[600] = features.hasAsync ? 1.0 : 0;
    vector[601] = features.hasClass ? 1.0 : 0;
    vector[602] = features.hasExport ? 1.0 : 0;
    vector[603] = features.hasDefaultExport ? 1.0 : 0;
    vector[604] = features.hasTryCatch ? 1.0 : 0;
    vector[605] = features.hasGeneric ? 1.0 : 0;
    vector[606] = features.lineCount / 1000;
    vector[607] = features.functionCount / 10;
    vector[608] = features.classCount / 5;
    vector[609] = features.importCount / 10;
    vector[610] = features.exportCount / 5;

    if (filePath) {
      const ext = filePath.split('.').pop();
      vector[620 + (this.hashString(ext) % 20)] = 1.0;
    }

    for (const ctx of this.extractSemanticContext(code)) {
      raise(640, 60, ctx, 1.0);
    }

    this.calculateTFIDF(expandedTokens.all).forEach((score, token) => {
      raise(700, 68, token, score);
    });

    const duration = performance.now() - startTime;

    return {
      vector: this.normalize(vector),
      metadata: {
        functionCount: features.functions.length,
        classCount: features.classes.length,
        tokenCount: expandedTokens.all.length,
        synonymCount: expandedTokens.synonyms.length,
        duration: duration.toFixed(2),
      },
    };
  }

  /**
   * Extract structural features from source text using regular expressions.
   * The patterns target JavaScript/TypeScript syntax; `language` is accepted
   * but not consulted here.
   *
   * @param {string} code
   * @param {string} language
   * @returns {object} Name lists (functions, methods, classes, variables,
   *   imports, exports, callGraph, patterns), boolean flags and counts.
   */
  extractAllFeatures(code, language) {
    const features = {
      functions: [],
      methods: [],
      classes: [],
      variables: [],
      imports: [],
      exports: [],
      callGraph: [],
      patterns: [],
      hasAsync: false,
      hasClass: false,
      hasExport: false,
      hasDefaultExport: false,
      hasTryCatch: false,
      hasGeneric: false,
      lineCount: code.split('\n').length,
      functionCount: 0,
      classCount: 0,
      importCount: 0,
      exportCount: 0,
    };

    features.hasAsync = /\basync\b/.test(code) || /\bawait\b/.test(code);
    features.hasClass = /\bclass\b/.test(code);
    features.hasExport = /\bexport\b/.test(code);
    features.hasDefaultExport = /\bexport\s+default\b/.test(code);
    features.hasTryCatch = /\btry\b.*\bcatch\b/s.test(code);
    features.hasGeneric = /<[A-Z][a-zA-Z]*>/.test(code);

    // NOTE(review): this pattern also matches plain assignments such as
    // `const x = 1`, so every `=`-initialised binding is recorded as a
    // function. Left as is because changing it would alter stored embeddings.
    for (const match of code.matchAll(/(?:function|const|let|var)\s+(\w+)\s*[\(\=]/g)) {
      features.functions.push(match[1]);
    }
    // Taken before arrow functions are merged in, as in the original layout.
    features.functionCount = features.functions.length;

    // Arrow functions bound to a name.
    for (const match of code.matchAll(/(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\(?[^\)]*\)?\s*=>/g)) {
      if (!features.functions.includes(match[1])) {
        features.functions.push(match[1]);
      }
    }

    // Method-like definitions. The first alternative matches any `name(`, so
    // ordinary call sites are captured too; control-flow keywords are skipped.
    const notMethods = ['if', 'for', 'while', 'switch', 'catch'];
    for (const match of code.matchAll(/(?:(?:async\s+)?(\w+)\s*\(|(\w+):\s*(?:async\s*)?\(|(\w+)\s*\([^)]*\)\s*{)/g)) {
      const methodName = match[1] || match[2] || match[3];
      if (methodName && !notMethods.includes(methodName)) {
        features.methods.push(methodName);
      }
    }

    // Class names; a superclass is recorded as an `extends_<Name>` pattern.
    for (const match of code.matchAll(/class\s+(\w+)(?:\s+extends\s+(\w+))?/g)) {
      features.classes.push(match[1]);
      if (match[2]) {
        features.patterns.push(`extends_${match[2]}`);
      }
    }
    features.classCount = features.classes.length;

    // Imports: default binding, named bindings, namespace binding, module path.
    for (const match of code.matchAll(/import\s+(?:(\w+)|{([^}]+)}|\*\s+as\s+(\w+))\s*(?:from\s+)?['"]([^'"]+)['"];?/g)) {
      features.importCount++;
      if (match[1]) features.imports.push(match[1]);
      if (match[2]) {
        // For `a as b` only the imported name `a` is kept.
        features.imports.push(...match[2].split(',').map((s) => s.trim().split(' ')[0]));
      }
      if (match[3]) features.imports.push(match[3]);
      if (match[4]) features.imports.push(match[4]);
    }

    for (const match of code.matchAll(/export\s+(?:(?:default\s+)?(?:class|function|const|let|var)\s+)?(\w+)/g)) {
      features.exports.push(match[1]);
      features.exportCount++;
    }

    // Variables: declared names not already classified as function or class.
    for (const match of code.matchAll(/(?:const|let|var)\s+(\w+)\s*[=:]/g)) {
      if (!features.functions.includes(match[1]) && !features.classes.includes(match[1])) {
        features.variables.push(match[1]);
      }
    }

    // Call sites: every `name(...)` that is not a control-flow keyword.
    const notCalls = ['if', 'for', 'while', 'switch', 'catch', 'return', 'throw', 'await'];
    for (const match of code.matchAll(/(\w+)\s*\([^)]*\)/g)) {
      const callName = match[1];
      if (!notCalls.includes(callName)) {
        features.callGraph.push(callName);
      }
    }

    // Framework / style fingerprints.
    if (/useEffect|useState|useCallback/.test(code)) {
      features.patterns.push('react_hooks');
    }
    if (/app\.(get|post|put|delete)\s*\(/.test(code)) {
      features.patterns.push('express_routes');
    }
    if (/describe\s*\(|it\s*\(|test\s*\(/.test(code)) {
      features.patterns.push('test_suite');
    }
    if (/@Controller|@Service|@Module/.test(code)) {
      features.patterns.push('nestjs_decorators');
    }
    if (/interface\s+\w+|type\s+\w+\s*=/.test(code)) {
      features.patterns.push('typescript_types');
    }

    return features;
  }

  /**
   * Split text into lowercase word tokens, breaking camelCase, snake_case and
   * acronym boundaries (`JSONData` -> `json`, `data`). A token starts with an
   * ASCII letter and continues with ASCII letters or digits; nothing is
   * filtered by length.
   *
   * @param {string} text
   * @returns {string[]} Tokens in order of appearance (duplicates kept).
   */
  smartTokenize(text) {
    const spaced = text
      .replace(/([a-z])([A-Z])/g, '$1 $2') // camelCase -> camel Case
      .replace(/_/g, ' ') // snake_case -> snake case
      .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2'); // JSONData -> JSON Data
    return spaced.toLowerCase().match(/[a-z][a-z0-9]*/g) || [];
  }

  /**
   * Add the known synonyms of each token.
   *
   * @param {string[]} tokens
   * @returns {{original: string[], synonyms: string[], all: string[]}}
   *   `synonyms` and `all` are de-duplicated; `original` is the input array.
   */
  expandWithSynonyms(tokens) {
    const allTokens = [...tokens];
    const synonyms = [];

    for (const token of tokens) {
      const related = SYNONYM_INDEX.get(token);
      if (related) {
        for (const syn of related) {
          synonyms.push(syn);
          allTokens.push(syn);
        }
      }
    }

    return {
      original: tokens,
      synonyms: [...new Set(synonyms)],
      all: [...new Set(allTokens)],
    };
  }

  /**
   * Look up the synonyms of a single word (case-insensitive).
   *
   * @param {string} word
   * @returns {string[]} Synonyms, or an empty array when the word is unknown.
   */
  getSynonymsForWord(word) {
    return Array.from(SYNONYM_INDEX.get(word.toLowerCase()) || []);
  }

  /**
   * Keep only the tokens that are reserved words of the given language.
   * Unknown languages fall back to the JavaScript list.
   *
   * @param {string[]} tokens
   * @param {string} language 'javascript', 'typescript' or 'python'.
   * @returns {string[]}
   */
  getKeywords(tokens, language) {
    const codeKeywords = {
      javascript: ['function', 'class', 'const', 'let', 'var', 'async', 'await', 'return', 'if', 'else', 'for', 'while', 'switch', 'case', 'import', 'export', 'from', 'default', 'try', 'catch', 'throw', 'new', 'this', 'static', 'extends', 'super'],
      typescript: ['interface', 'type', 'enum', 'namespace', 'extends', 'implements', 'abstract', 'readonly', 'private', 'protected', 'public', 'declare', 'module'],
      python: ['def', 'class', 'import', 'from', 'return', 'if', 'elif', 'else', 'for', 'while', 'try', 'except', 'raise', 'with', 'as', 'lambda', 'yield'],
    };

    const keywords = codeKeywords[language] || codeKeywords.javascript;
    return tokens.filter((token) => keywords.includes(token));
  }

  /**
   * Compute TF-IDF per distinct token. TF is the token count divided by the
   * highest count in the input; IDF comes from updateIDF() and defaults to 1
   * for tokens it has not seen.
   *
   * NOTE(review): embedCode() passes an already de-duplicated token list, so
   * on that path every term frequency is 1.
   *
   * @param {string[]} tokens
   * @returns {Map<string, number>}
   */
  calculateTFIDF(tokens) {
    const tf = new Map();
    for (const token of tokens) {
      tf.set(token, (tf.get(token) || 0) + 1);
    }

    // A loop instead of Math.max(...tf.values()): spreading a very large
    // vocabulary into call arguments can exceed the engine's argument limit.
    let maxTf = 1;
    for (const count of tf.values()) {
      if (count > maxTf) maxTf = count;
    }

    const tfidf = new Map();
    tf.forEach((count, token) => {
      const idf = this.idf.get(token) || 1;
      tfidf.set(token, (count / maxTf) * idf);
    });

    return tfidf;
  }

  /**
   * Derive coarse intent tags from the text: one `jsdoc_<tag>` per `@word`
   * occurrence, plus `operation_*` tags when certain verbs appear anywhere in
   * the text (substring match, so `target` also triggers `operation_read`).
   *
   * @param {string} code
   * @returns {string[]}
   */
  extractSemanticContext(code) {
    const context = [];

    for (const match of code.matchAll(/@(\w+)/g)) {
      context.push(`jsdoc_${match[1]}`);
    }

    if (/get|fetch|retrieve|load/.test(code)) context.push('operation_read');
    if (/create|add|insert|new/.test(code)) context.push('operation_create');
    if (/update|modify|change|set/.test(code)) context.push('operation_update');
    if (/delete|remove|destroy|clear/.test(code)) context.push('operation_delete');
    if (/validate|check|verify|assert/.test(code)) context.push('operation_validate');
    if (/handle|process|on[A-Z]/.test(code)) context.push('operation_handle');
    if (/render|display|show/.test(code)) context.push('operation_render');
    if (/format|parse|convert|transform/.test(code)) context.push('operation_transform');

    return context;
  }

  /**
   * Deterministic 32-bit string hash (h = h * 31 + charCode, wrapped to
   * int32), returned as a non-negative number.
   *
   * @param {string} str
   * @returns {number}
   */
  hashString(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      hash = ((hash << 5) - hash) + str.charCodeAt(i);
      hash |= 0; // wrap to a signed 32-bit integer
    }
    return Math.abs(hash);
  }

  /**
   * L2-normalise a vector.
   *
   * @param {number[]} vector
   * @returns {number[]} A new array of unit length, or the input array itself
   *   when its magnitude is zero.
   */
  normalize(vector) {
    const magnitude = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
    if (magnitude === 0) return vector;
    return vector.map((val) => val / magnitude);
  }

  /**
   * Cosine similarity of two vectors.
   *
   * For unit-length inputs (what embedCode() produces) this equals their dot
   * product; other inputs are scaled by their magnitudes. If the lengths
   * differ the shorter vector is treated as zero-padded. A missing or
   * all-zero vector gives 0.
   *
   * @param {number[]} embedding1
   * @param {number[]} embedding2
   * @returns {number}
   */
  cosineSimilarity(embedding1, embedding2) {
    if (!embedding1 || !embedding2) return 0;

    const length = Math.max(embedding1.length, embedding2.length);
    let dotProduct = 0;
    let squared1 = 0;
    let squared2 = 0;
    for (let i = 0; i < length; i++) {
      const a = i < embedding1.length ? embedding1[i] : 0;
      const b = i < embedding2.length ? embedding2[i] : 0;
      dotProduct += a * b;
      squared1 += a * a;
      squared2 += b * b;
    }

    if (squared1 === 0 || squared2 === 0) return 0;
    return dotProduct / Math.sqrt(squared1 * squared2);
  }

  /**
   * Embed several items.
   *
   * @param {Array<string|{code: string, language?: string, filePath?: string}>} items
   * @returns {number[][]} One vector per item, in input order.
   */
  embedBatch(items) {
    return items.map((item) => {
      if (typeof item === 'string') {
        return this.embedCode(item).vector;
      }
      return this.embedCode(item.code, item.language, item.filePath).vector;
    });
  }

  /**
   * Tokenise a query and add synonyms of its tokens.
   *
   * @param {string} query
   * @returns {{original: string, expanded: string, tokens: string[], hasSynonyms: boolean}}
   */
  expandQuery(query) {
    const expanded = this.expandWithSynonyms(this.smartTokenize(query));

    return {
      original: query,
      expanded: expanded.all.join(' '),
      tokens: expanded.all,
      hasSynonyms: expanded.synonyms.length > 0,
    };
  }

  /**
   * Two-phase search: rank all candidates by vector similarity, then re-score
   * the best `rerankDepth` with small additive boosts.
   *
   * @param {string} query
   * @param {object[]} candidates Each may carry `embedding` or `vector`,
   *   `filePath`, `functions` (string[]) and `synonyms` (array). Candidates
   *   without a vector get a base score of 0.
   * @param {{topK?: number, rerankDepth?: number}} [options]
   * @returns {Promise<object[]>} Up to `topK` candidates with `baseScore`,
   *   `similarity`, `score` and `reranked: true` added.
   */
  async searchWithReranking(query, candidates, options = {}) {
    const { topK = 10, rerankDepth = 50 } = options;

    // Phase 1: similarity against the query embedding.
    const queryEmbedding = this.embedCode(query).vector;
    const scored = candidates.map((candidate) => {
      const candidateVector = candidate.embedding || candidate.vector;
      const baseScore = candidateVector
        ? this.cosineSimilarity(queryEmbedding, candidateVector)
        : 0;
      return { ...candidate, baseScore, similarity: baseScore };
    });

    scored.sort((a, b) => b.baseScore - a.baseScore);
    const topCandidates = scored.slice(0, rerankDepth);

    // Phase 2: heuristic boosts.
    const queryLower = query.toLowerCase();
    const reranked = topCandidates.map((candidate) => {
      let score = candidate.baseScore;

      // Query and file path mention the same well-known area.
      if (candidate.filePath && this.similarFilePattern(query, candidate.filePath)) {
        score += 0.15;
      }

      // The query literally contains one of the candidate's function names.
      if (candidate.functions) {
        const keywordMatch = candidate.functions.some((fn) =>
          queryLower.includes(fn.toLowerCase())
        );
        if (keywordMatch) score += 0.1;
      }

      // The candidate carries any synonym list at all.
      if (candidate.synonyms && candidate.synonyms.length > 0) {
        score += 0.05;
      }

      return { ...candidate, score, reranked: true };
    });

    reranked.sort((a, b) => b.score - a.score);
    return reranked.slice(0, topK);
  }

  /**
   * Report whether the query and the file path both mention one of a few
   * hard-coded areas (auth, user, test, utils, database, api).
   *
   * @param {string} query
   * @param {string} filePath
   * @returns {boolean}
   */
  similarFilePattern(query, filePath) {
    const queryLower = query.toLowerCase();
    const pathLower = filePath.toLowerCase();

    // [term looked for in the query, fragments accepted in the path]
    const areas = [
      ['auth', ['auth']],
      ['user', ['user']],
      ['test', ['test']],
      ['utils', ['util']],
      ['database', ['db', 'database']],
      ['api', ['api']],
    ];

    return areas.some(([term, fragments]) =>
      queryLower.includes(term) && fragments.some((fragment) => pathLower.includes(fragment))
    );
  }

  /**
   * Score candidates with a weighted blend of four signals: vector similarity
   * (0.4), keyword containment (0.3), synonym overlap (0.2) and file-path
   * context (0.1).
   *
   * @param {string} query
   * @param {object[]} candidates Each may carry `embedding` or `vector`,
   *   `content` or `code`, `synonyms` and `filePath`.
   * @param {{limit?: number}} [options]
   * @returns {Promise<{results: object[], metadata: object}>} The best
   *   `limit` (default 10) candidates with `scores`, `finalScore` and
   *   `similarity` added, plus query statistics.
   */
  async hybridSearch(query, candidates, options = {}) {
    const startTime = performance.now();

    const expandedQuery = this.expandQuery(query);
    const queryTokens = expandedQuery.tokens;

    // The query embedding is the same for every candidate: compute it once,
    // and only if some candidate actually has a vector.
    let queryEmbedding = null;

    const scored = candidates.map((candidate) => {
      const scores = { vector: 0, keyword: 0, synonym: 0, context: 0 };

      // 1. Vector similarity.
      const candidateVector = candidate.embedding || candidate.vector;
      if (candidateVector) {
        if (queryEmbedding === null) {
          queryEmbedding = this.embedCode(expandedQuery.expanded).vector;
        }
        scores.vector = this.cosineSimilarity(queryEmbedding, candidateVector);
      }

      // 2. Share of query tokens found in the candidate text. Guarded so a
      //    query with no tokens scores 0 rather than NaN (0 / 0).
      if (queryTokens.length > 0) {
        const candidateText = (candidate.content || candidate.code || '').toLowerCase();
        const matches = queryTokens.filter((token) =>
          candidateText.includes(token.toLowerCase())
        ).length;
        scores.keyword = matches / queryTokens.length;
      }

      // 3. Share of query tokens that the candidate lists as synonyms.
      if (expandedQuery.hasSynonyms && candidate.synonyms) {
        const synonymMatches = candidate.synonyms.filter((syn) =>
          queryTokens.includes(syn)
        ).length;
        scores.synonym = synonymMatches / queryTokens.length;
      }

      // 4. File-path context.
      if (candidate.filePath) {
        scores.context = this.similarFilePattern(query, candidate.filePath) ? 1 : 0;
      }

      const finalScore =
        (scores.vector * 0.4) +
        (scores.keyword * 0.3) +
        (scores.synonym * 0.2) +
        (scores.context * 0.1);

      return { ...candidate, scores, finalScore, similarity: finalScore };
    });

    scored.sort((a, b) => b.finalScore - a.finalScore);

    const duration = performance.now() - startTime;

    return {
      results: scored.slice(0, options.limit || 10),
      metadata: {
        query: expandedQuery.original,
        expandedTokens: queryTokens.length,
        hasSynonyms: expandedQuery.hasSynonyms,
        duration: duration.toFixed(2) + 'ms',
        candidatesScanned: candidates.length,
      },
    };
  }

  /**
   * Recompute IDF weights from a corpus: idf = ln(N / (df + 1)) + 1, where N
   * is the number of documents and df the number containing the token.
   * Tokens absent from `documents` keep whatever IDF they had before.
   *
   * @param {string[]} documents Raw document texts.
   * @returns {void}
   */
  updateIDF(documents) {
    this.documentCount = documents.length;
    const documentFrequency = new Map();

    for (const doc of documents) {
      // A Set so each document counts a token at most once.
      for (const token of new Set(this.smartTokenize(doc))) {
        documentFrequency.set(token, (documentFrequency.get(token) || 0) + 1);
      }
    }

    documentFrequency.forEach((freq, token) => {
      this.idf.set(token, Math.log(this.documentCount / (freq + 1)) + 1);
    });
  }

  /**
   * Summary figures about this embedder and the synonym table.
   *
   * @returns {{dimension: number, vocabularySize: number, synonymCount: number, totalSynonyms: number}}
   */
  getStats() {
    return {
      dimension: this.dimension,
      vocabularySize: this.vocabulary.size,
      synonymCount: Object.keys(CODE_SYNONYMS).length,
      totalSynonyms: Object.values(CODE_SYNONYMS).reduce((sum, arr) => sum + arr.length, 0),
    };
  }
}
|
|
716
|
+
|
|
717
|
+
export default EnhancedSimpleEmbedder;
|