@sylphx/flow 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -0
- package/package.json +12 -2
- package/src/commands/hook-command.ts +10 -230
- package/src/composables/index.ts +0 -1
- package/src/config/servers.ts +35 -78
- package/src/core/interfaces.ts +0 -33
- package/src/domains/index.ts +0 -2
- package/src/index.ts +0 -4
- package/src/services/mcp-service.ts +0 -16
- package/src/targets/claude-code.ts +3 -9
- package/src/targets/functional/claude-code-logic.ts +4 -22
- package/src/targets/opencode.ts +0 -6
- package/src/types/mcp.types.ts +29 -38
- package/src/types/target.types.ts +0 -2
- package/src/types.ts +0 -1
- package/src/commands/codebase-command.ts +0 -168
- package/src/commands/knowledge-command.ts +0 -161
- package/src/composables/useTargetConfig.ts +0 -45
- package/src/core/formatting/bytes.test.ts +0 -115
- package/src/core/validation/limit.test.ts +0 -155
- package/src/core/validation/query.test.ts +0 -44
- package/src/domains/codebase/index.ts +0 -5
- package/src/domains/codebase/tools.ts +0 -139
- package/src/domains/knowledge/index.ts +0 -10
- package/src/domains/knowledge/resources.ts +0 -537
- package/src/domains/knowledge/tools.ts +0 -174
- package/src/services/search/base-indexer.ts +0 -156
- package/src/services/search/codebase-indexer-types.ts +0 -38
- package/src/services/search/codebase-indexer.ts +0 -647
- package/src/services/search/embeddings-provider.ts +0 -455
- package/src/services/search/embeddings.ts +0 -316
- package/src/services/search/functional-indexer.ts +0 -323
- package/src/services/search/index.ts +0 -27
- package/src/services/search/indexer.ts +0 -380
- package/src/services/search/knowledge-indexer.ts +0 -422
- package/src/services/search/semantic-search.ts +0 -244
- package/src/services/search/tfidf.ts +0 -559
- package/src/services/search/unified-search-service.ts +0 -888
- package/src/services/storage/cache-storage.ts +0 -487
- package/src/services/storage/drizzle-storage.ts +0 -581
- package/src/services/storage/index.ts +0 -15
- package/src/services/storage/lancedb-vector-storage.ts +0 -494
- package/src/services/storage/memory-storage.ts +0 -268
- package/src/services/storage/separated-storage.ts +0 -467
- package/src/services/storage/vector-storage.ts +0 -13
|
@@ -1,559 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* TF-IDF (Term Frequency-Inverse Document Frequency) implementation
|
|
3
|
-
* Used for ranking document relevance in semantic search
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import { AdvancedCodeTokenizer } from '../../utils/advanced-tokenizer.js';
|
|
7
|
-
import type { SeparatedMemoryStorage } from './separated-storage.js';
|
|
8
|
-
|
|
9
|
-
/**
 * TF-IDF representation of a single indexed document.
 */
export interface DocumentVector {
  /** Identifier of the document this vector was built from. */
  uri: string;
  /** Maps each term to its TF-IDF score. */
  terms: Map<string, number>;
  /** Maps each term to its raw frequency. */
  rawTerms: Map<string, number>;
  /** Vector magnitude, used for cosine similarity. */
  magnitude: number;
}
|
|
15
|
-
|
|
16
|
-
/**
 * A complete TF-IDF search index: one vector per document plus the
 * corpus-wide IDF table.
 */
export interface SearchIndex {
  /** One TF-IDF vector per indexed document. */
  documents: DocumentVector[];
  /** Maps each term to its IDF score. */
  idf: Map<string, number>;
  /** Number of documents the index was built from. */
  totalDocuments: number;
  metadata: {
    /** ISO-8601 timestamp of when the index was generated. */
    generatedAt: string;
    /** Version tag written by the index builder. */
    version: string;
    /** Name of the tokenizer used to build the index; only set by some builders. */
    tokenizer?: string;
    /** Free-form feature descriptions; only set by some builders. */
    features?: string[];
  };
}
|
|
25
|
-
|
|
26
|
-
/**
 * Build a search index from TF-IDF data already stored in the database
 * (shared between CLI and MCP).
 *
 * Nothing is re-tokenized here: per-document TF-IDF terms, raw term
 * frequencies, magnitudes and the IDF table are all read back from
 * `memoryStorage`.
 *
 * @param memoryStorage - Storage that holds the codebase files and TF-IDF data.
 * @param filters - Optional path filters applied before documents are loaded:
 *   - `file_extensions`: keep paths ending with any of these suffixes.
 *   - `path_filter`: keep paths containing this substring.
 *   - `exclude_paths`: drop paths containing any of these substrings.
 * @returns The index, or `null` when no file survives the filters, when no
 *   file has stored TF-IDF data, or when reading from storage throws (the
 *   error is logged to `console.error` and not rethrown).
 */
export async function buildSearchIndexFromDB(
  memoryStorage: SeparatedMemoryStorage,
  filters?: {
    file_extensions?: string[];
    path_filter?: string;
    exclude_paths?: string[];
  }
): Promise<SearchIndex | null> {
  try {
    const extensions = filters?.file_extensions ?? [];
    const pathFilter = filters?.path_filter;
    const excludedPaths = filters?.exclude_paths ?? [];

    // Apply all filters in a single pass over the stored files.
    const allFiles = await memoryStorage.getAllCodebaseFiles();
    const files = allFiles.filter((file) => {
      if (extensions.length > 0 && !extensions.some((ext: string) => file.path.endsWith(ext))) {
        return false;
      }
      if (pathFilter && !file.path.includes(pathFilter)) {
        return false;
      }
      return !excludedPaths.some((excluded: string) => file.path.includes(excluded));
    });

    if (files.length === 0) {
      return null;
    }

    // Build one vector per file; files without a stored TF-IDF document are skipped.
    const documents: DocumentVector[] = [];
    for (const file of files) {
      const tfidfDoc = await memoryStorage.getTFIDFDocument(file.path);
      if (!tfidfDoc) {
        continue;
      }

      // TF-IDF scores are already calculated and stored; they drive search scoring.
      const tfidfTerms = await memoryStorage.getTFIDFTerms(file.path);
      const terms = new Map<string, number>();
      for (const [term, tfidfScore] of Object.entries(tfidfTerms)) {
        terms.set(term, tfidfScore as number);
      }

      // Raw frequencies are kept for reference (exact-match lookups).
      const rawTermsMap = new Map<string, number>();
      for (const [term, freq] of Object.entries(tfidfDoc.rawTerms || {})) {
        rawTermsMap.set(term, freq as number);
      }

      documents.push({
        uri: `file://${file.path}`,
        terms,
        rawTerms: rawTermsMap,
        magnitude: tfidfDoc.magnitude,
      });
    }

    if (documents.length === 0) {
      return null;
    }

    // Load the corpus-wide IDF table from the database.
    const idf = new Map<string, number>();
    for (const [term, value] of Object.entries(await memoryStorage.getIDFValues())) {
      idf.set(term, value as number);
    }

    return {
      documents,
      idf,
      totalDocuments: documents.length,
      metadata: {
        generatedAt: new Date().toISOString(),
        version: '1.0.0',
      },
    };
  } catch (error) {
    // Best-effort: callers treat `null` as "no index available".
    console.error('[ERROR] Failed to build search index from database:', error);
    return null;
  }
}
|
|
119
|
-
|
|
120
|
-
/**
 * Calculate Term Frequency (TF).
 * TF = (weight of term in document) / (total weight of all terms in document)
 *
 * @param termFrequency - Maps each term to its raw frequency/weight in one document.
 * @returns A new map from term to normalized frequency. When the weights sum
 *   to zero, every term gets a TF of 0 instead of `NaN`.
 */
function calculateTF(termFrequency: Map<string, number>): Map<string, number> {
  let totalTerms = 0;
  for (const freq of termFrequency.values()) {
    totalTerms += freq;
  }

  const tf = new Map<string, number>();
  for (const [term, freq] of termFrequency) {
    // Guard against 0/0: a NaN here would propagate into the document
    // magnitude and every similarity score computed from it.
    tf.set(term, totalTerms === 0 ? 0 : freq / totalTerms);
  }

  return tf;
}
|
|
134
|
-
|
|
135
|
-
/**
 * Calculate Inverse Document Frequency (IDF).
 * IDF = log(total documents / documents containing term)
 *
 * @param documents - One term map per document; only the keys are inspected.
 * @param totalDocuments - Corpus size used as the numerator.
 * @returns A map from every term seen in any document to its IDF score.
 */
function calculateIDF(
  documents: Map<string, number>[],
  totalDocuments: number
): Map<string, number> {
  // Number of documents each term occurs in.
  const docCounts = new Map<string, number>();
  documents.forEach((doc) => {
    doc.forEach((_weight, term) => {
      docCounts.set(term, (docCounts.get(term) ?? 0) + 1);
    });
  });

  const idf = new Map<string, number>();
  docCounts.forEach((count, term) => {
    idf.set(term, Math.log(totalDocuments / count));
  });
  return idf;
}
|
|
161
|
-
|
|
162
|
-
/**
 * Calculate TF-IDF scores for a document.
 *
 * @param tf - Term frequencies of the document.
 * @param idf - Corpus-wide IDF table; terms missing from it score 0.
 * @returns A map from each term in `tf` to `tf * idf`.
 */
function calculateTFIDF(tf: Map<string, number>, idf: Map<string, number>): Map<string, number> {
  return new Map<string, number>(
    Array.from(tf, ([term, weight]): [string, number] => [term, weight * (idf.get(term) || 0)])
  );
}
|
|
175
|
-
|
|
176
|
-
/**
 * Calculate the Euclidean magnitude of a sparse vector, as needed for
 * cosine similarity.
 *
 * @param vector - Maps each term to its weight.
 * @returns The square root of the sum of squared weights (0 for an empty vector).
 */
function calculateMagnitude(vector: Map<string, number>): number {
  const sumOfSquares = Array.from(vector.values()).reduce(
    (acc, weight) => acc + weight * weight,
    0
  );
  return Math.sqrt(sumOfSquares);
}
|
|
186
|
-
|
|
187
|
-
// Global tokenizer instance, created once and reused for performance.
let globalTokenizer: AdvancedCodeTokenizer | null = null;
let tokenizerInitialized = false;
// In-flight initialization shared by concurrent callers; null when idle.
let tokenizerInitPromise: Promise<void> | null = null;

/**
 * Run `tokenizer.initialize()` with `console.log` / `console.error`
 * temporarily replaced by no-ops, restoring both afterwards even on failure.
 */
async function initializeTokenizerSilently(tokenizer: AdvancedCodeTokenizer): Promise<void> {
  const originalLog = console.log;
  const originalError = console.error;
  console.log = () => {}; // Temporarily silence console.log
  console.error = () => {}; // Temporarily silence console.error
  try {
    await tokenizer.initialize();
    tokenizerInitialized = true;
  } finally {
    console.log = originalLog; // Restore console.log
    console.error = originalError; // Restore console.error
  }
}

/**
 * Get or create the global tokenizer, initializing it on first use.
 *
 * Concurrent callers share a single initialization. Without that, a second
 * caller arriving while the first is still awaiting `initialize()` would
 * capture the already-silenced console functions as its "originals" and
 * restore them last, leaving the console permanently muted (and would also
 * run `initialize()` twice).
 *
 * @returns The initialized tokenizer.
 * @throws Whatever `initialize()` rejects with; a later call retries.
 */
async function getTokenizer(): Promise<AdvancedCodeTokenizer> {
  if (!globalTokenizer) {
    globalTokenizer = new AdvancedCodeTokenizer({
      modelPath: './models/starcoder2',
    });
  }
  const tokenizer = globalTokenizer;

  if (!tokenizerInitialized) {
    if (!tokenizerInitPromise) {
      // Clear the shared promise once settled so a failed init can be retried.
      tokenizerInitPromise = initializeTokenizerSilently(tokenizer).finally(() => {
        tokenizerInitPromise = null;
      });
    }
    await tokenizerInitPromise;
  }

  return tokenizer;
}
|
|
218
|
-
|
|
219
|
-
/**
 * Extract weighted terms from document content using the shared tokenizer.
 *
 * Token text is lowercased, and the scores of tokens that collapse to the
 * same lowercase term are summed; the sums serve as TF weights.
 *
 * @param content - Raw document text.
 * @returns A map from lowercase term to accumulated token score.
 */
async function extractTerms(content: string): Promise<Map<string, number>> {
  const { tokens } = await (await getTokenizer()).tokenize(content);

  const weights = new Map<string, number>();
  for (const { text, score } of tokens) {
    const key = text.toLowerCase();
    weights.set(key, (weights.get(key) || 0) + score);
  }
  return weights;
}
|
|
236
|
-
|
|
237
|
-
/**
 * Extract the distinct tokens of a search query.
 *
 * Tokens are de-duplicated case-insensitively and returned in the order
 * their lowercase form was first seen (no sorting is applied). The returned
 * text is the first occurrence's original casing, unless a later occurrence
 * of the same lowercase token has a score above 0.8, in which case that
 * later casing replaces it.
 *
 * @param query - Raw query string.
 * @returns The distinct token texts, in original casing.
 */
async function extractQueryTokens(query: string): Promise<string[]> {
  const tokenizer = await getTokenizer();
  const { tokens } = await tokenizer.tokenize(query);

  const textByLowercase = new Map<string, string>();
  for (const { text, score } of tokens) {
    const key = text.toLowerCase();
    // Skip repeats unless the repeat scores high enough to take over.
    if (textByLowercase.has(key) && !(score > 0.8)) {
      continue;
    }
    textByLowercase.set(key, text);
  }

  return [...textByLowercase.values()];
}
|
|
255
|
-
|
|
256
|
-
/**
 * Progress event passed to the `onProgress` callback of `buildSearchIndex`.
 */
export interface BuildIndexProgress {
  /** 1-based position of the document this event refers to. */
  current: number;
  /** Total number of documents being indexed. */
  total: number;
  /** Last path segment of the document URI (or the whole URI if it has none). */
  fileName: string;
  /** `processing` before tokenizing, then `completed` on success or `skipped` on failure. */
  status: 'processing' | 'completed' | 'skipped';
}
|
|
262
|
-
|
|
263
|
-
/**
 * Build a TF-IDF search index from raw documents using the shared tokenizer.
 *
 * Documents are tokenized strictly one at a time (never in parallel) to
 * avoid hanging the tokenizer. A document whose tokenization throws is still
 * included in the index, with an empty term map, and is reported as
 * `skipped`; it therefore still counts towards `totalDocuments` and the IDF
 * denominator.
 *
 * @param documents - Documents to index, each with a `uri` and its text `content`.
 * @param onProgress - Optional callback invoked with `processing` before each
 *   document and with `completed` or `skipped` after it. Exceptions thrown by
 *   the callback propagate to the caller.
 * @returns The index over all input documents.
 */
export async function buildSearchIndex(
  documents: Array<{ uri: string; content: string }>,
  onProgress?: (progress: BuildIndexProgress) => void
): Promise<SearchIndex> {
  const total = documents.length;
  const documentTerms: Array<{ uri: string; terms: Map<string, number> }> = [];

  for (const [position, doc] of documents.entries()) {
    const fileName = doc.uri.split('/').pop() || doc.uri;
    const report = (status: BuildIndexProgress['status']): void => {
      onProgress?.({ current: position + 1, total, fileName, status });
    };

    report('processing');

    // Only the tokenization is guarded. Keeping the progress callback outside
    // the try block ensures a throwing callback cannot cause the same
    // document to be recorded twice.
    let terms: Map<string, number>;
    let status: BuildIndexProgress['status'];
    try {
      terms = await extractTerms(doc.content);
      status = 'completed';
    } catch {
      // Deliberate best-effort: keep the document with no terms.
      terms = new Map<string, number>();
      status = 'skipped';
    }

    documentTerms.push({ uri: doc.uri, terms });
    report(status);
  }

  // Calculate IDF scores across the whole corpus.
  const idf = calculateIDF(
    documentTerms.map((entry) => entry.terms),
    total
  );

  // Calculate the TF-IDF vector and its magnitude for each document.
  const documentVectors: DocumentVector[] = documentTerms.map(({ uri, terms }) => {
    const tfidf = calculateTFIDF(calculateTF(terms), idf);
    return {
      uri,
      terms: tfidf,
      rawTerms: terms,
      magnitude: calculateMagnitude(tfidf),
    };
  });

  // `tokenizer` and `features` are descriptive extras on top of
  // `generatedAt` / `version`; building the object separately keeps them in
  // the emitted index without tripping excess-property checks on the return.
  const metadata = {
    generatedAt: new Date().toISOString(),
    version: '5.0.0',
    tokenizer: 'AdvancedCodeTokenizer',
    features: [
      'Industry-leading code understanding',
      'Advanced technical term recognition',
      'Optimized for code search',
      'Simple and effective approach',
      'No unnecessary complexity',
    ],
  };

  return {
    documents: documentVectors,
    idf,
    totalDocuments: total,
    metadata,
  };
}
|
|
363
|
-
|
|
364
|
-
/**
 * Calculate the cosine similarity between a query vector and a document.
 *
 * @param queryVector - Maps each query term to its weight.
 * @param docVector - Document vector; its precomputed `magnitude` is used as-is.
 * @returns `dot(query, doc) / (|query| * |doc|)`, or 0 when either magnitude is 0.
 */
export function calculateCosineSimilarity(
  queryVector: Map<string, number>,
  docVector: DocumentVector
): number {
  // Only terms present in the query can contribute to the dot product.
  let dot = 0;
  queryVector.forEach((queryWeight, term) => {
    dot += queryWeight * (docVector.terms.get(term) || 0);
  });

  const queryNorm = calculateMagnitude(queryVector);
  const hasZeroNorm = queryNorm === 0 || docVector.magnitude === 0;

  return hasZeroNorm ? 0 : dot / (queryNorm * docVector.magnitude);
}
|
|
388
|
-
|
|
389
|
-
/**
 * Turn a query string into a weighted term vector using existing IDF values.
 *
 * @param query - Raw query string.
 * @param idf - Corpus-wide IDF table, keyed by lowercase term.
 * @returns A map from lowercase query term to its IDF value; terms whose IDF
 *   is missing or not positive are left out.
 */
export async function processQuery(
  query: string,
  idf: Map<string, number>
): Promise<Map<string, number>> {
  const queryVector = new Map<string, number>();

  // Use the IDF value for each query term (the query itself has no TF-IDF,
  // so IDF is used directly).
  for (const token of await extractQueryTokens(query)) {
    const key = token.toLowerCase();
    const weight = idf.get(key) || 0;

    // Use the plain IDF value, fully trusting StarCoder2's tokenization.
    if (weight <= 0) {
      continue;
    }
    queryVector.set(key, weight);
  }

  return queryVector;
}
|
|
412
|
-
|
|
413
|
-
/**
 * Search documents using TF-IDF cosine similarity, with multiplicative
 * boosts for query terms that literally occur in a document.
 *
 * Scoring per document:
 *  1. Start from the cosine similarity between the query vector and the document.
 *  2. For every query token present in the document's raw terms, multiply by
 *     that token's boost: `exactMatch`, raised to `technicalMatch` and/or
 *     `identifierMatch` when the token qualifies and that factor is larger.
 *  3. Multiply by `phraseMatch` when the query has more than one token and
 *     all of them occur in the document (occurrence only; adjacency is not checked).
 *  4. Multiply by 1.2 when the query has more than three tokens and at least
 *     70% of them occur in the document.
 *
 * @param query - Raw query string.
 * @param index - Index to search.
 * @param options - `limit` (default 10), `minScore` (default 0) and optional
 *   `boostFactors` overriding the defaults 1.5 / 2.0 / 1.8 / 1.3.
 * @returns Up to `limit` results with `score >= minScore`, best first, each
 *   listing the lowercase query tokens found in the document.
 */
export async function searchDocuments(
  query: string,
  index: SearchIndex,
  options: {
    limit?: number;
    minScore?: number;
    boostFactors?: {
      exactMatch?: number; // Boost for exact term matches
      phraseMatch?: number; // Boost for phrase matches
      technicalMatch?: number; // Boost for technical term matches
      identifierMatch?: number; // Boost for identifier matches
    };
  } = {}
): Promise<Array<{ uri: string; score: number; matchedTerms: string[] }>> {
  const { limit = 10, minScore = 0, boostFactors = {} } = options;
  const {
    exactMatch = 1.5,
    phraseMatch = 2.0,
    technicalMatch = 1.8,
    identifierMatch = 1.3,
  } = boostFactors;

  // Process query using Advanced Code Tokenizer
  const queryVector = await processQuery(query, index.idf);

  // A token's boost depends only on the token, so compute it once here
  // rather than once per document.
  const queryTokens = (await extractQueryTokens(query)).map((text) => {
    const key = text.toLowerCase();
    let boost = exactMatch;

    // Classify on the original casing: the acronym / PascalCase / camelCase
    // patterns can never match lowercased text. The lowercase form is also
    // checked so tokens that matched the lowercase keyword list still do.
    if (isTechnicalTerm(text) || isTechnicalTerm(key)) {
      boost = Math.max(boost, technicalMatch);
    }
    if (isIdentifier(key)) {
      boost = Math.max(boost, identifierMatch);
    }

    return { key, boost };
  });

  // Calculate similarity for each document
  const results = index.documents.map((doc) => {
    let score = calculateCosineSimilarity(queryVector, doc);

    const matchedTerms: string[] = [];
    for (const { key, boost } of queryTokens) {
      if (doc.rawTerms.has(key)) {
        score *= boost;
        matchedTerms.push(key);
      }
    }

    // All query terms appear in the document.
    if (queryTokens.length > 1 && matchedTerms.length === queryTokens.length) {
      score *= phraseMatch;
    }

    // Contextual relevance boost for partial matches on longer queries.
    if (queryTokens.length > 3 && matchedTerms.length >= queryTokens.length * 0.7) {
      score *= 1.2;
    }

    return { uri: doc.uri, score, matchedTerms };
  });

  // Filter and sort (filter returns a fresh array, so sorting it is safe).
  return results
    .filter((result) => result.score >= minScore)
    .sort((a, b) => b.score - a.score)
    .slice(0, limit);
}
|
|
491
|
-
|
|
492
|
-
// Patterns that mark a term as technical. Built once at module load instead
// of on every call; none uses the `g`/`y` flag, so sharing them across calls
// carries no `lastIndex` state.
const TECHNICAL_TERM_PATTERNS: readonly RegExp[] = [
  /\b[A-Z]{2,}\b/, // Acronyms like HTTP, API, JSON
  /\b[A-Z][a-z]+(?:[A-Z][a-z]+)+\b/, // PascalCase like ComponentName
  /\b[a-z]+[A-Z][a-z]*\b/, // camelCase like functionName
  /\b\w+(?:Dir|Config|File|Path|Data|Service|Manager|Handler)\b/, // Common suffixes
  /\b(?:get|set|is|has|can|should|will|do)[A-Z]\w*\b/, // Common prefixes
  /\b(?:http|https|json|xml|yaml|sql|api|url|uri)\b/, // Technical keywords
];

/**
 * Check if a term is likely a technical term.
 *
 * Most patterns are case-sensitive, so the term should be passed in its
 * original casing; lowercased input can only match the keyword list.
 *
 * @param term - Term to classify.
 * @returns `true` when any technical-term pattern matches.
 */
function isTechnicalTerm(term: string): boolean {
  return TECHNICAL_TERM_PATTERNS.some((pattern) => pattern.test(term));
}
|
|
507
|
-
|
|
508
|
-
/**
 * Check if a term is likely an identifier.
 *
 * @param term - Term to classify.
 * @returns `true` when the term is at least two characters long, starts with
 *   an ASCII letter and continues with ASCII letters, digits or underscores only.
 */
function isIdentifier(term: string): boolean {
  // One leading letter plus one-or-more trailing characters enforces the
  // minimum length of two inside the pattern itself.
  const identifierPattern = /^[a-zA-Z][a-zA-Z0-9_]+$/;
  return identifierPattern.test(term);
}
|
|
515
|
-
|
|
516
|
-
/**
 * Serialize a search index to pretty-printed JSON.
 *
 * Every `Map` is written as an array of `[key, value]` pairs, since maps do
 * not survive `JSON.stringify` directly.
 *
 * @param index - Index to serialize.
 * @returns JSON text indented with two spaces.
 */
export function serializeIndex(index: SearchIndex): string {
  const { documents, idf, totalDocuments, metadata } = index;

  return JSON.stringify(
    {
      documents: documents.map(({ uri, terms, rawTerms, magnitude }) => ({
        uri,
        terms: [...terms],
        rawTerms: [...rawTerms],
        magnitude,
      })),
      idf: [...idf],
      totalDocuments,
      metadata,
    },
    null,
    2
  );
}
|
|
534
|
-
|
|
535
|
-
/**
 * Deserialize a search index from JSON, rebuilding each `Map` from its
 * stored array of `[key, value]` pairs.
 *
 * Only the top-level shape is validated (a JSON object with a `documents`
 * array); the contents of individual entries are trusted.
 *
 * @param json - JSON text holding a serialized index.
 * @returns The reconstructed index.
 * @throws SyntaxError when `json` is not valid JSON.
 * @throws TypeError when the parsed value is not an object or has no
 *   `documents` array.
 */
export function deserializeIndex(json: string): SearchIndex {
  const data: unknown = JSON.parse(json);

  if (typeof data !== 'object' || data === null) {
    throw new TypeError('Invalid search index: expected a JSON object');
  }

  const { documents, idf, totalDocuments, metadata } = data as {
    documents?: unknown;
    idf?: unknown;
    totalDocuments?: unknown;
    metadata?: unknown;
  };

  if (!Array.isArray(documents)) {
    throw new TypeError('Invalid search index: "documents" must be an array');
  }

  return {
    documents: documents.map(
      (doc: {
        uri: string;
        terms: [string, number][];
        rawTerms: [string, number][];
        magnitude: number;
      }) => ({
        uri: doc.uri,
        terms: new Map(doc.terms),
        rawTerms: new Map(doc.rawTerms),
        magnitude: doc.magnitude,
      })
    ),
    // A missing `idf` yields an empty map, as `new Map(undefined)` does.
    idf: new Map(idf as [string, number][] | undefined),
    totalDocuments: totalDocuments as number,
    metadata: metadata as SearchIndex['metadata'],
  };
}
|