codecritique 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -114
- package/package.json +10 -9
- package/src/content-retrieval.test.js +775 -0
- package/src/custom-documents.test.js +440 -0
- package/src/feedback-loader.test.js +529 -0
- package/src/llm.test.js +256 -0
- package/src/project-analyzer.test.js +747 -0
- package/src/rag-analyzer.js +12 -0
- package/src/rag-analyzer.test.js +1109 -0
- package/src/rag-review.test.js +317 -0
- package/src/setupTests.js +131 -0
- package/src/zero-shot-classifier-open.test.js +278 -0
- package/src/embeddings/cache-manager.js +0 -364
- package/src/embeddings/constants.js +0 -40
- package/src/embeddings/database.js +0 -921
- package/src/embeddings/errors.js +0 -208
- package/src/embeddings/factory.js +0 -447
- package/src/embeddings/file-processor.js +0 -851
- package/src/embeddings/model-manager.js +0 -337
- package/src/embeddings/similarity-calculator.js +0 -97
- package/src/embeddings/types.js +0 -113
- package/src/pr-history/analyzer.js +0 -579
- package/src/pr-history/bot-detector.js +0 -123
- package/src/pr-history/cli-utils.js +0 -204
- package/src/pr-history/comment-processor.js +0 -549
- package/src/pr-history/database.js +0 -819
- package/src/pr-history/github-client.js +0 -629
- package/src/technology-keywords.json +0 -753
- package/src/utils/command.js +0 -48
- package/src/utils/constants.js +0 -263
- package/src/utils/context-inference.js +0 -364
- package/src/utils/document-detection.js +0 -105
- package/src/utils/file-validation.js +0 -271
- package/src/utils/git.js +0 -232
- package/src/utils/language-detection.js +0 -170
- package/src/utils/logging.js +0 -24
- package/src/utils/markdown.js +0 -132
- package/src/utils/mobilebert-tokenizer.js +0 -141
- package/src/utils/pr-chunking.js +0 -276
- package/src/utils/string-utils.js +0 -28
|
@@ -1,337 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Model Manager Module
|
|
3
|
-
*
|
|
4
|
-
* This module provides centralized embedding model management using FastEmbed.
|
|
5
|
-
* It handles model initialization, caching, and all embedding generation operations.
|
|
6
|
-
*
|
|
7
|
-
* Features:
|
|
8
|
-
* - Singleton model instance management
|
|
9
|
-
* - Concurrency-safe model initialization (concurrent callers share one in-flight initialization promise)
|
|
10
|
-
* - Embedding generation with caching
|
|
11
|
-
* - Batch embedding processing
|
|
12
|
-
* - Query-specific embedding generation
|
|
13
|
-
* - Comprehensive error handling
|
|
14
|
-
*/
|
|
15
|
-
/**
|
|
16
|
-
* @typedef {import('./types.js').EmbeddingVector} EmbeddingVector
|
|
17
|
-
* @typedef {import('./types.js').QueryEmbeddingOptions} QueryEmbeddingOptions
|
|
18
|
-
* @typedef {import('./types.js').BatchProcessingOptions} BatchProcessingOptions
|
|
19
|
-
*/
|
|
20
|
-
|
|
21
|
-
import { createHash } from 'node:crypto';
import fs from 'node:fs';
import chalk from 'chalk';
import dotenv from 'dotenv';
import { EmbeddingModel, FlagEmbedding } from 'fastembed';
import { debug } from '../utils/logging.js';
import { EMBEDDING_DIMENSIONS, MODEL_NAME_STRING, MAX_RETRIES } from './constants.js';
import { FASTEMBED_CACHE_DIR } from './constants.js';
import { createModelInitializationError, createEmbeddingGenerationError } from './errors.js';
|
|
29
|
-
|
|
30
|
-
// Load environment variables
|
|
31
|
-
dotenv.config();
|
|
32
|
-
|
|
33
|
-
// ============================================================================
|
|
34
|
-
// MODEL MANAGER CLASS
|
|
35
|
-
// ============================================================================
|
|
36
|
-
|
|
37
|
-
/**
 * Build the cache key under which an embedding is stored.
 *
 * The key is a SHA-256 digest of the whole trimmed text. Keying on a fixed-length
 * prefix of the text would make two different texts that share that prefix
 * resolve to the same cached embedding.
 *
 * @param {string} text - Text being embedded (surrounding whitespace is ignored for keying)
 * @param {string} [namespace] - Prefix that keeps different embedding kinds apart (e.g. 'query:')
 * @returns {string} Cache key of the form `<namespace>sha256:<hex digest>`
 */
function buildEmbeddingCacheKey(text, namespace = '') {
  const digest = createHash('sha256').update(text.trim(), 'utf8').digest('hex');
  return `${namespace}sha256:${digest}`;
}

/**
 * Owns a single FastEmbed model instance and exposes passage, batch and query
 * embedding generation on top of it, with optional result caching.
 */
export class ModelManager {
  /**
   * @param {Object} [options] - Overrides for the module-level defaults
   * @param {number} [options.embeddingDimensions] - Expected length of every generated vector
   * @param {string} [options.modelNameString] - Model name used in log output
   * @param {number} [options.maxRetries] - Maximum number of model initialization attempts
   * @param {string} [options.cacheDir] - Directory handed to FastEmbed as its cache directory
   * @param {Object} [options.cacheManager] - Optional cache exposing getEmbedding / setEmbedding / clearCache
   */
  constructor(options = {}) {
    // `||` is intentional: any falsy override falls back to the imported default.
    this.embeddingDimensions = options.embeddingDimensions || EMBEDDING_DIMENSIONS;
    this.modelNameString = options.modelNameString || MODEL_NAME_STRING;
    this.maxRetries = options.maxRetries || MAX_RETRIES;
    this.cacheDir = options.cacheDir || FASTEMBED_CACHE_DIR;
    this.cacheManager = options.cacheManager || null;

    // Model state
    this.embeddingModel = null;
    this.modelInitialized = false;
    this.modelInitializationPromise = null;
    this.cleaningUp = false;

    console.log(chalk.magenta(`[ModelManager] Using MODEL = ${this.modelNameString}, DIMENSIONS = ${this.embeddingDimensions}`));
  }

  // ============================================================================
  // MODEL INITIALIZATION
  // ============================================================================

  /**
   * Initialize the FastEmbed model instance.
   *
   * Returns the existing instance when there is one, and joins the in-flight
   * initialization when one is already running, so concurrent callers do not
   * start a second initialization.
   *
   * @returns {Promise<import('fastembed').FlagEmbedding>} Initialized model instance
   * @throws {Error} The error built by createModelInitializationError once every attempt has failed
   */
  async initialize() {
    // If model is already initialized, return it immediately
    if (this.embeddingModel) {
      return this.embeddingModel;
    }

    // If initialization is already in progress, wait for it
    if (this.modelInitializationPromise) {
      return await this.modelInitializationPromise;
    }

    // Start initialization and store the promise so later callers can join it
    this.modelInitializationPromise = (async () => {
      const modelIdentifier = EmbeddingModel.BGESmallENV15;

      // Only print logs if we haven't initialized before
      if (!this.modelInitialized) {
        console.log(chalk.blue(`Attempting to initialize fastembed model. Identifier: ${this.modelNameString}`));
        console.log(chalk.blue(`FastEmbed Cache Directory: ${this.cacheDir}`));
      }

      try {
        if (!fs.existsSync(this.cacheDir)) {
          console.log(chalk.yellow(`Creating fastembed cache directory: ${this.cacheDir}`));
          fs.mkdirSync(this.cacheDir, { recursive: true });
        }

        let retries = 0;
        while (retries < this.maxRetries) {
          try {
            this.embeddingModel = await FlagEmbedding.init({
              model: modelIdentifier,
              cacheDir: this.cacheDir,
            });

            // Only print success message if we haven't initialized before
            if (!this.modelInitialized) {
              console.log(chalk.green('FastEmbed model initialized successfully.'));
              this.modelInitialized = true;
            }
            break; // Exit loop on success
          } catch (initError) {
            retries++;
            console.error(chalk.yellow(`Model initialization attempt ${retries}/${this.maxRetries} failed: ${initError.message}`));
            if (retries >= this.maxRetries) {
              throw createModelInitializationError(
                `Failed to initialize model after ${this.maxRetries} attempts: ${initError.message}`,
                initError,
                { modelIdentifier, cacheDir: this.cacheDir }
              );
            }
            // Linear back-off: wait 2s after the first failure, 4s after the second, ...
            await new Promise((resolve) => setTimeout(resolve, retries * 2000));
          }
        }

        // Clear the initialization promise since we're done
        this.modelInitializationPromise = null;
        return this.embeddingModel;
      } catch (err) {
        // Clear the initialization promise on error so a later call can try again
        this.modelInitializationPromise = null;
        console.error(chalk.red(`Fatal: Failed to initialize fastembed model: ${err.message}`), err);
        throw err; // Re-throw critical error
      }
    })();

    return await this.modelInitializationPromise;
  }

  /**
   * Check if the model is initialized
   * @returns {boolean} True if model is initialized
   */
  isInitialized() {
    return this.modelInitialized && this.embeddingModel !== null;
  }

  // ============================================================================
  // EMBEDDING GENERATION
  // ============================================================================

  /**
   * Calculate the passage embedding for a single text.
   *
   * @param {string} text - The text to embed
   * @returns {Promise<number[]|null>} The embedding as a plain array, or null when the text is
   *   empty / not a string or the generated vector does not have the expected dimensions
   * @throws {Error} The error built by createEmbeddingGenerationError when the model fails
   */
  async calculateEmbedding(text) {
    // Ensure text is a non-empty string
    if (typeof text !== 'string' || text.trim().length === 0) {
      return null; // Return null for empty text to avoid errors downstream
    }

    // Check cache first; the key covers the full text so distinct texts never share an entry
    const cacheKey = buildEmbeddingCacheKey(text);
    if (this.cacheManager) {
      const cachedResult = this.cacheManager.getEmbedding(cacheKey);
      if (cachedResult) {
        return cachedResult;
      }
    }

    try {
      const model = await this.initialize();
      let embedding = null;

      // passageEmbed is consumed as an async generator of batches, even for a single input
      for await (const batch of model.passageEmbed([text])) {
        if (batch && batch.length > 0 && batch[0]) {
          embedding = Array.from(batch[0]); // Convert Float32Array to regular array
          break; // Got the embedding for the single input text
        }
      }

      // Validate the generated embedding
      if (!embedding || !Array.isArray(embedding) || embedding.length !== this.embeddingDimensions) {
        console.error(
          chalk.red(
            `Generated embedding dimension (${embedding?.length}) does not match expected (${this.embeddingDimensions}) or embedding is invalid.`
          )
        );
        return null; // Return null if dimensions mismatch or invalid
      }

      // Cache the result
      if (this.cacheManager) {
        this.cacheManager.setEmbedding(cacheKey, embedding);
      }

      return embedding;
    } catch (error) {
      console.error(chalk.red(`Error calculating embedding: ${error.message}`), error);
      throw createEmbeddingGenerationError(`Failed to calculate embedding: ${error.message}`, error, { text: text.substring(0, 100) });
    }
  }

  /**
   * Calculate passage embeddings for a batch of texts. Batch results are not cached.
   *
   * @param {string[]} texts - An array of texts to embed
   * @returns {Promise<Array<number[]|null>>} One entry per generated vector (null where a vector
   *   failed validation). Returns an empty array when `texts` is not an array, and an array of
   *   nulls (one per input) when `texts` is empty or contains any empty / non-string entry.
   * @throws {Error} The error built by createEmbeddingGenerationError when the model fails
   */
  async calculateEmbeddingBatch(texts) {
    // A non-array cannot be mapped to per-input nulls, so report "no results" instead of throwing
    if (!Array.isArray(texts)) {
      debug('Skipping batch embedding for empty or invalid texts array.');
      return [];
    }

    // A single unusable entry invalidates the whole batch
    if (texts.length === 0 || texts.some((text) => typeof text !== 'string' || text.trim().length === 0)) {
      debug('Skipping batch embedding for empty or invalid texts array.');
      return texts.map(() => null);
    }

    try {
      const model = await this.initialize();
      const embeddings = [];

      // passageEmbed is an async generator of batches
      for await (const batch of model.passageEmbed(texts)) {
        for (const vec of batch) {
          // Validate each generated embedding
          if (vec && typeof vec.length === 'number' && vec.length === this.embeddingDimensions) {
            embeddings.push(Array.from(vec)); // Convert Float32Array (or other array-like) to regular array
          } else {
            console.error(
              chalk.red(
                `Generated batch embedding dimension (${vec?.length}) does not match expected (${this.embeddingDimensions}) or embedding is invalid.`
              )
            );
            embeddings.push(null); // Add null for invalid embeddings in the batch
          }
        }
      }

      // A count mismatch is only reported; the result is returned as-is and may then be
      // misaligned with the input texts.
      if (embeddings.length !== texts.length) {
        console.error(
          chalk.red(`Number of generated embeddings (${embeddings.length}) does not match number of input texts (${texts.length}).`)
        );
      }

      debug(`Batch embeddings generated successfully, count: ${embeddings.filter((e) => e !== null).length}`);
      return embeddings;
    } catch (error) {
      console.error(chalk.red(`Error calculating batch embeddings: ${error.message}`), error);
      throw createEmbeddingGenerationError(`Failed to calculate batch embeddings: ${error.message}`, error, { textsCount: texts.length });
    }
  }

  /**
   * Calculate the query embedding for a single text.
   *
   * @param {string} text - The query text to embed
   * @returns {Promise<number[]|null>} The embedding as a plain array, or null when the text is
   *   empty / not a string or the generated vector does not have the expected dimensions
   * @throws {Error} The error built by createEmbeddingGenerationError when the model fails
   */
  async calculateQueryEmbedding(text) {
    if (typeof text !== 'string' || text.trim().length === 0) {
      return null;
    }

    // Check cache first ('query:' namespace keeps query and passage embeddings of the same text apart)
    const cacheKey = buildEmbeddingCacheKey(text, 'query:');
    if (this.cacheManager) {
      const cachedResult = this.cacheManager.getEmbedding(cacheKey);
      if (cachedResult) {
        return cachedResult;
      }
    }

    try {
      const model = await this.initialize();
      // queryEmbed is awaited directly: it resolves to the vector for the single query text
      const embeddingArray = await model.queryEmbed(text);

      // Validate the generated query embedding
      if (!embeddingArray || typeof embeddingArray.length !== 'number' || embeddingArray.length !== this.embeddingDimensions) {
        console.error(
          chalk.red(
            `Generated query embedding dimension (${embeddingArray?.length}) does not match expected (${this.embeddingDimensions}) or embedding is invalid.`
          )
        );
        return null;
      }

      // Array.from() yields a plain number[] whether the model returned a typed array or an array
      const embedding = Array.from(embeddingArray);

      // Cache the result
      if (this.cacheManager) {
        this.cacheManager.setEmbedding(cacheKey, embedding);
      }

      return embedding;
    } catch (error) {
      console.error(chalk.red(`Error calculating query embedding: ${error.message}`), error);
      throw createEmbeddingGenerationError(`Failed to calculate query embedding: ${error.message}`, error, {
        text: text.substring(0, 100),
      });
    }
  }

  // ============================================================================
  // CLEANUP
  // ============================================================================

  /**
   * Drop the model reference, reset the initialization state and clear the embedding cache.
   * A call made while another cleanup is still running returns immediately.
   *
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (this.cleaningUp) {
      return; // Already cleaning up, prevent duplicate calls
    }

    this.cleaningUp = true;

    try {
      // No model-side cleanup call is made here; only our own references are cleared
      this.embeddingModel = null;
      this.modelInitialized = false;
      this.modelInitializationPromise = null;

      if (this.cacheManager) {
        this.cacheManager.clearCache('embedding');
      }

      console.log(chalk.green('[ModelManager] Model resources cleaned up.'));
    } catch (error) {
      console.error(chalk.red(`[ModelManager] Error during cleanup: ${error.message}`));
    } finally {
      this.cleaningUp = false;
    }
  }
}
|
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Similarity Calculator
|
|
3
|
-
*
|
|
4
|
-
* This module contains pure mathematical functions for calculating similarity between vectors,
|
|
5
|
-
* paths, and other data structures. These functions have no external dependencies and are
|
|
6
|
-
* safe to extract for modular use.
|
|
7
|
-
*/
|
|
8
|
-
|
|
9
|
-
import path from 'node:path';
|
|
10
|
-
import { debug } from '../utils/logging.js';
|
|
11
|
-
|
|
12
|
-
/**
 * Calculate cosine similarity between two vectors.
 *
 * Returns 0 instead of a similarity when the inputs cannot be compared:
 * either argument is not a plain array, the lengths differ, the vectors are
 * empty, either vector has a (near-)zero norm, or the computation does not
 * produce a finite number (for example because a component is NaN or Infinity).
 *
 * @param {Array<number>} vecA - First vector
 * @param {Array<number>} vecB - Second vector
 * @returns {number} Cosine similarity score between -1 and 1 (never NaN)
 */
export function calculateCosineSimilarity(vecA, vecB) {
  if (!Array.isArray(vecA) || !Array.isArray(vecB) || vecA.length !== vecB.length || vecA.length === 0) {
    debug(`Invalid input for cosine similarity: vecA length=${vecA?.length}, vecB length=${vecB?.length}`);
    return 0;
  }

  let dotProduct = 0;
  let normA = 0; // squared norm of vecA
  let normB = 0; // squared norm of vecB

  const len = vecA.length;

  for (let i = 0; i < len; i++) {
    const a = vecA[i];
    const b = vecB[i];
    dotProduct += a * b;
    normA += a * a;
    normB += b * b;
  }

  // Treat (near-)zero vectors as having no direction
  if (normA <= 1e-9 || normB <= 1e-9) {
    return 0;
  }

  const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));

  // NaN passes the zero-norm guard above and would survive Math.max/Math.min,
  // so non-finite results are rejected explicitly.
  if (!Number.isFinite(similarity)) {
    debug('Invalid input for cosine similarity: vectors contain non-finite values');
    return 0;
  }

  // Clamp result to handle potential floating point errors leading to > 1 or < -1
  return Math.max(-1.0, Math.min(1.0, similarity));
}
|
|
48
|
-
|
|
49
|
-
/**
 * Score how close two file paths are, based on their containing directories.
 *
 * Each path is normalized and reduced to the list of its directory segments.
 * The score is the number of leading segments the two lists share, divided by
 * the average of the two list lengths, clamped to the range 0..1.
 *
 * @param {string} path1 - First file path
 * @param {string} path2 - Second file path
 * @returns {number} Similarity score between 0 and 1 (0 when either path is
 *   missing or the comparison throws; 1 when neither path has any directory segment)
 */
export function calculatePathSimilarity(path1, path2) {
  if (!path1 || !path2) {
    return 0;
  }

  // Non-empty segments of the normalized directory that contains the file
  const directorySegments = (filePath) => path.dirname(path.normalize(filePath)).split(path.sep).filter(Boolean);

  try {
    const segmentsA = directorySegments(path1);
    const segmentsB = directorySegments(path2);

    // Count matching segments from the start, stopping at the first difference
    const comparable = Math.min(segmentsA.length, segmentsB.length);
    let shared = 0;
    while (shared < comparable && segmentsA[shared] === segmentsB[shared]) {
      shared += 1;
    }

    const meanDepth = (segmentsA.length + segmentsB.length) / 2;
    if (meanDepth === 0) {
      // Neither path has a directory segment, so there is nothing to tell them apart
      return 1;
    }

    const ratio = shared / meanDepth;
    return Math.max(0, Math.min(1, ratio));
  } catch (error) {
    debug(`[calculatePathSimilarity] Error comparing paths '${path1}' and '${path2}': ${error.message}`);
    return 0;
  }
}
|
package/src/embeddings/types.js
DELETED
|
@@ -1,113 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Type Definitions for Embeddings System
|
|
3
|
-
*
|
|
4
|
-
* This module provides TypeScript-style interfaces and type definitions
|
|
5
|
-
* for the embeddings system. These help with documentation and development
|
|
6
|
-
* even in a JavaScript environment.
|
|
7
|
-
*/
|
|
8
|
-
|
|
9
|
-
/**
|
|
10
|
-
* @typedef {Object} EmbeddingVector
|
|
11
|
-
* @property {number[]} vector - The embedding vector array
|
|
12
|
-
* @property {number} dimensions - Number of dimensions in the vector
|
|
13
|
-
* @property {string} model - Model used to generate the embedding
|
|
14
|
-
* @property {string} [id] - Optional identifier for the embedding
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
/**
|
|
18
|
-
* @typedef {Object} SearchResult
|
|
19
|
-
* @property {string} content - The content that was found
|
|
20
|
-
* @property {string} path - File path of the content
|
|
21
|
-
* @property {number} similarity - Similarity score (0-1)
|
|
22
|
-
* @property {string} [language] - Programming language of the content
|
|
23
|
-
* @property {string} [context] - Additional context information
|
|
24
|
-
* @property {number} [line_start] - Starting line number
|
|
25
|
-
* @property {number} [line_end] - Ending line number
|
|
26
|
-
* @property {boolean} [reranked] - Whether the result has been reranked
|
|
27
|
-
* @property {Object} [metadata] - Additional metadata
|
|
28
|
-
*/
|
|
29
|
-
|
|
30
|
-
/**
|
|
31
|
-
* @typedef {Object} SearchOptions
|
|
32
|
-
* @property {number} [limit] - Maximum number of results to return
|
|
33
|
-
* @property {number} [threshold] - Minimum similarity threshold
|
|
34
|
-
* @property {string} [language] - Filter by programming language
|
|
35
|
-
* @property {string} [path] - Filter by file path pattern
|
|
36
|
-
* @property {boolean} [includeMetadata] - Include metadata in results
|
|
37
|
-
* @property {boolean} [rerank] - Whether to rerank results
|
|
38
|
-
* @property {string} [context] - Additional context for search
|
|
39
|
-
*/
|
|
40
|
-
|
|
41
|
-
/**
|
|
42
|
-
* @typedef {Object} DocumentChunk
|
|
43
|
-
* @property {string} content - The text content of the chunk
|
|
44
|
-
* @property {string} document_title - Title of the document
|
|
45
|
-
* @property {string} document_path - Path to the document
|
|
46
|
-
* @property {number} chunk_index - Index of the chunk within the document
|
|
47
|
-
* @property {number[]} embedding - Embedding vector for the chunk
|
|
48
|
-
* @property {string} [h1_title] - H1 title if applicable
|
|
49
|
-
* @property {number[]} [h1_embedding] - H1 embedding if applicable
|
|
50
|
-
* @property {string} [language] - Programming language
|
|
51
|
-
* @property {Object} [metadata] - Additional metadata
|
|
52
|
-
*/
|
|
53
|
-
|
|
54
|
-
/**
|
|
55
|
-
* @typedef {Object} EmbeddingConfig
|
|
56
|
-
* @property {string} modelName - Name of the embedding model
|
|
57
|
-
* @property {number} dimensions - Number of dimensions
|
|
58
|
-
* @property {string} lancedbPath - Path to LanceDB database
|
|
59
|
-
* @property {string} fastembedCacheDir - FastEmbed cache directory
|
|
60
|
-
* @property {number} maxRetries - Maximum number of retries
|
|
61
|
-
* @property {boolean} debug - Enable debug mode
|
|
62
|
-
* @property {number} maxConcurrency - Maximum concurrent operations
|
|
63
|
-
* @property {number} batchSize - Batch size for processing
|
|
64
|
-
*/
|
|
65
|
-
|
|
66
|
-
/**
|
|
67
|
-
* @typedef {Object} CacheMetrics
|
|
68
|
-
* @property {number} hits - Number of cache hits
|
|
69
|
-
* @property {number} misses - Number of cache misses
|
|
70
|
-
* @property {number} size - Current cache size
|
|
71
|
-
* @property {number} maxSize - Maximum cache size
|
|
72
|
-
* @property {number} evictions - Number of evictions
|
|
73
|
-
* @property {number} hitRate - Hit rate percentage
|
|
74
|
-
*/
|
|
75
|
-
|
|
76
|
-
/**
|
|
77
|
-
* @typedef {Object} ProcessingProgress
|
|
78
|
-
* @property {number} totalFiles - Total number of files to process
|
|
79
|
-
* @property {number} processedCount - Number of files processed
|
|
80
|
-
* @property {number} skippedCount - Number of files skipped
|
|
81
|
-
* @property {number} failedCount - Number of files failed
|
|
82
|
-
* @property {number} startTime - Processing start time
|
|
83
|
-
* @property {number} currentTime - Current time
|
|
84
|
-
* @property {number} estimatedTimeRemaining - Estimated time remaining
|
|
85
|
-
* @property {number} percentComplete - Percentage complete
|
|
86
|
-
*/
|
|
87
|
-
|
|
88
|
-
/**
|
|
89
|
-
* @typedef {Object} DatabaseSchema
|
|
90
|
-
* @property {string} tableName - Name of the table
|
|
91
|
-
* @property {Object} fields - Field definitions
|
|
92
|
-
* @property {string[]} indexes - Index definitions
|
|
93
|
-
* @property {string} primaryKey - Primary key field
|
|
94
|
-
*/
|
|
95
|
-
|
|
96
|
-
/**
|
|
97
|
-
* @typedef {Object} BatchProcessingOptions
|
|
98
|
-
* @property {number} batchSize - Size of each batch
|
|
99
|
-
* @property {number} maxConcurrency - Maximum concurrent batches
|
|
100
|
-
* @property {boolean} skipExisting - Skip files that already exist
|
|
101
|
-
* @property {Function} [progressCallback] - Progress callback function
|
|
102
|
-
* @property {Function} [errorCallback] - Error callback function
|
|
103
|
-
*/
|
|
104
|
-
|
|
105
|
-
/**
|
|
106
|
-
* @typedef {Object} QueryEmbeddingOptions
|
|
107
|
-
* @property {string} [context] - Additional context for the query
|
|
108
|
-
* @property {string} [language] - Programming language hint
|
|
109
|
-
* @property {boolean} [normalize] - Whether to normalize the embedding
|
|
110
|
-
* @property {Object} [metadata] - Additional metadata
|
|
111
|
-
*/
|
|
112
|
-
|
|
113
|
-
export {};
|