codecritique 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +82 -114
  2. package/package.json +10 -9
  3. package/src/content-retrieval.test.js +775 -0
  4. package/src/custom-documents.test.js +440 -0
  5. package/src/feedback-loader.test.js +529 -0
  6. package/src/llm.test.js +256 -0
  7. package/src/project-analyzer.test.js +747 -0
  8. package/src/rag-analyzer.js +12 -0
  9. package/src/rag-analyzer.test.js +1109 -0
  10. package/src/rag-review.test.js +317 -0
  11. package/src/setupTests.js +131 -0
  12. package/src/zero-shot-classifier-open.test.js +278 -0
  13. package/src/embeddings/cache-manager.js +0 -364
  14. package/src/embeddings/constants.js +0 -40
  15. package/src/embeddings/database.js +0 -921
  16. package/src/embeddings/errors.js +0 -208
  17. package/src/embeddings/factory.js +0 -447
  18. package/src/embeddings/file-processor.js +0 -851
  19. package/src/embeddings/model-manager.js +0 -337
  20. package/src/embeddings/similarity-calculator.js +0 -97
  21. package/src/embeddings/types.js +0 -113
  22. package/src/pr-history/analyzer.js +0 -579
  23. package/src/pr-history/bot-detector.js +0 -123
  24. package/src/pr-history/cli-utils.js +0 -204
  25. package/src/pr-history/comment-processor.js +0 -549
  26. package/src/pr-history/database.js +0 -819
  27. package/src/pr-history/github-client.js +0 -629
  28. package/src/technology-keywords.json +0 -753
  29. package/src/utils/command.js +0 -48
  30. package/src/utils/constants.js +0 -263
  31. package/src/utils/context-inference.js +0 -364
  32. package/src/utils/document-detection.js +0 -105
  33. package/src/utils/file-validation.js +0 -271
  34. package/src/utils/git.js +0 -232
  35. package/src/utils/language-detection.js +0 -170
  36. package/src/utils/logging.js +0 -24
  37. package/src/utils/markdown.js +0 -132
  38. package/src/utils/mobilebert-tokenizer.js +0 -141
  39. package/src/utils/pr-chunking.js +0 -276
  40. package/src/utils/string-utils.js +0 -28
@@ -1,337 +0,0 @@
1
- /**
2
- * Model Manager Module
3
- *
4
- * This module provides centralized embedding model management using FastEmbed.
5
- * It handles model initialization, caching, and all embedding generation operations.
6
- *
7
- * Features:
8
- * - Singleton model instance management
9
- * - Concurrency-safe model initialization (concurrent callers share one in-flight promise)
10
- * - Embedding generation with caching
11
- * - Batch embedding processing
12
- * - Query-specific embedding generation
13
- * - Comprehensive error handling
14
- */
15
- /**
16
- * @typedef {import('./types.js').EmbeddingVector} EmbeddingVector
17
- * @typedef {import('./types.js').QueryEmbeddingOptions} QueryEmbeddingOptions
18
- * @typedef {import('./types.js').BatchProcessingOptions} BatchProcessingOptions
19
- */
20
-
21
import { createHash } from 'node:crypto';
import fs from 'node:fs';
import chalk from 'chalk';
import dotenv from 'dotenv';
import { EmbeddingModel, FlagEmbedding } from 'fastembed';
import { debug } from '../utils/logging.js';
import { EMBEDDING_DIMENSIONS, MODEL_NAME_STRING, MAX_RETRIES } from './constants.js';
import { FASTEMBED_CACHE_DIR } from './constants.js';
import { createModelInitializationError, createEmbeddingGenerationError } from './errors.js';
29
-
30
- // Load environment variables
31
- dotenv.config();
32
-
33
- // ============================================================================
34
- // MODEL MANAGER CLASS
35
- // ============================================================================
36
-
37
export class ModelManager {
  /**
   * Create a model manager.
   *
   * Every option falls back to its module-level default when it is missing or falsy.
   *
   * @param {Object} [options] - Configuration overrides
   * @param {number} [options.embeddingDimensions] - Expected length of every generated vector
   * @param {string} [options.modelNameString] - Model name used in log output
   * @param {number} [options.maxRetries] - Maximum number of initialization attempts
   * @param {string} [options.cacheDir] - Directory handed to FastEmbed as its cache directory
   * @param {Object} [options.cacheManager] - Optional cache exposing getEmbedding/setEmbedding/clearCache
   */
  constructor(options = {}) {
    this.embeddingDimensions = options.embeddingDimensions || EMBEDDING_DIMENSIONS;
    this.modelNameString = options.modelNameString || MODEL_NAME_STRING;
    this.maxRetries = options.maxRetries || MAX_RETRIES;
    this.cacheDir = options.cacheDir || FASTEMBED_CACHE_DIR;
    this.cacheManager = options.cacheManager || null;

    // Model state
    this.embeddingModel = null;
    this.modelInitialized = false;
    this.modelInitializationPromise = null;
    this.cleaningUp = false;

    console.log(chalk.magenta(`[ModelManager] Using MODEL = ${this.modelNameString}, DIMENSIONS = ${this.embeddingDimensions}`));
  }

  // ============================================================================
  // MODEL INITIALIZATION
  // ============================================================================

  /**
   * Initialize the FastEmbed model instance.
   *
   * Returns the existing model when one is loaded, joins the in-flight
   * initialization when one is running, and otherwise starts a new one that
   * retries up to `maxRetries` times with a linearly growing delay.
   *
   * @returns {Promise<import('fastembed').FlagEmbedding>} Initialized model instance
   * @throws The error built by createModelInitializationError once every attempt has failed,
   *   or whatever the cache-directory setup throws
   */
  async initialize() {
    // If model is already initialized, return it immediately
    if (this.embeddingModel) {
      return this.embeddingModel;
    }

    // If initialization is already in progress, wait for it
    if (this.modelInitializationPromise) {
      return await this.modelInitializationPromise;
    }

    const pending = (async () => {
      const modelIdentifier = EmbeddingModel.BGESmallENV15;

      // Only print logs if we haven't initialized before
      if (!this.modelInitialized) {
        console.log(chalk.blue(`Attempting to initialize fastembed model. Identifier: ${this.modelNameString}`));
        console.log(chalk.blue(`FastEmbed Cache Directory: ${this.cacheDir}`));
      }

      try {
        if (!fs.existsSync(this.cacheDir)) {
          console.log(chalk.yellow(`Creating fastembed cache directory: ${this.cacheDir}`));
          fs.mkdirSync(this.cacheDir, { recursive: true });
        }

        let retries = 0;
        while (retries < this.maxRetries) {
          try {
            this.embeddingModel = await FlagEmbedding.init({
              model: modelIdentifier,
              cacheDir: this.cacheDir,
            });

            // Only print success message if we haven't initialized before
            if (!this.modelInitialized) {
              console.log(chalk.green('FastEmbed model initialized successfully.'));
              this.modelInitialized = true;
            }
            break; // Exit loop on success
          } catch (initError) {
            retries++;
            console.error(chalk.yellow(`Model initialization attempt ${retries}/${this.maxRetries} failed: ${initError.message}`));
            if (retries >= this.maxRetries) {
              throw createModelInitializationError(
                `Failed to initialize model after ${this.maxRetries} attempts: ${initError.message}`,
                initError,
                { modelIdentifier, cacheDir: this.cacheDir }
              );
            }
            // Linear back-off: 2s, 4s, 6s, ...
            await new Promise((resolve) => setTimeout(resolve, retries * 2000));
          }
        }

        return this.embeddingModel;
      } catch (err) {
        console.error(chalk.red(`Fatal: Failed to initialize fastembed model: ${err.message}`), err);
        throw err; // Re-throw critical error
      }
    })();

    // The in-flight promise is cleared here rather than inside the async body:
    // when the body fails before its first `await` (e.g. mkdirSync throws), an
    // inner reset would run before this assignment and leave a rejected promise
    // stored forever, so every later call would fail without retrying.
    this.modelInitializationPromise = pending;
    try {
      return await pending;
    } finally {
      // Identity check so a newer initialization (started after cleanup()) is not clobbered.
      if (this.modelInitializationPromise === pending) {
        this.modelInitializationPromise = null;
      }
    }
  }

  /**
   * Check if the model is initialized
   * @returns {boolean} True if model is initialized
   */
  isInitialized() {
    return this.modelInitialized && this.embeddingModel !== null;
  }

  // ============================================================================
  // EMBEDDING GENERATION
  // ============================================================================

  /**
   * Calculate the passage embedding for a text using fastembed.
   *
   * @param {string} text - The text to embed
   * @returns {Promise<number[]|null>} Plain array of `embeddingDimensions` numbers, or null when
   *   the text is empty/not a string or the generated vector fails validation
   * @throws The error built by createEmbeddingGenerationError when the model fails
   */
  async calculateEmbedding(text) {
    // Return null for empty text to avoid errors downstream
    if (typeof text !== 'string' || text.trim().length === 0) {
      return null;
    }

    // Key on a digest of the whole text: a truncated prefix would make different
    // texts with a shared beginning return each other's embeddings.
    const cacheKey = buildEmbeddingCacheKey(text);
    if (this.cacheManager) {
      const cachedResult = this.cacheManager.getEmbedding(cacheKey);
      if (cachedResult) {
        return cachedResult;
      }
    }

    try {
      const model = await this.initialize();
      let embedding = null;

      // passageEmbed is consumed as an async generator of batches, even for a single input
      for await (const batch of model.passageEmbed([text])) {
        if (batch && batch.length > 0 && batch[0]) {
          embedding = Array.from(batch[0]); // Convert Float32Array to regular array
          break; // Got the embedding for the single input text
        }
      }

      // Validate the generated embedding
      if (!embedding || !Array.isArray(embedding) || embedding.length !== this.embeddingDimensions) {
        console.error(
          chalk.red(
            `Generated embedding dimension (${embedding?.length}) does not match expected (${this.embeddingDimensions}) or embedding is invalid.`
          )
        );
        return null;
      }

      if (this.cacheManager) {
        this.cacheManager.setEmbedding(cacheKey, embedding);
      }

      return embedding;
    } catch (error) {
      console.error(chalk.red(`Error calculating embedding: ${error.message}`), error);
      throw createEmbeddingGenerationError(`Failed to calculate embedding: ${error.message}`, error, { text: text.substring(0, 100) });
    }
  }

  /**
   * Calculate passage embeddings for a batch of texts using fastembed.
   *
   * The batch is all-or-nothing on validation: if any entry is not a non-empty
   * string, no embedding is generated and every slot is null. Results are not cached.
   *
   * @param {string[]} texts - An array of texts to embed
   * @returns {Promise<Array<number[]|null>>} One entry per generated vector (null where the vector
   *   failed validation); an empty array when `texts` is not an array
   * @throws The error built by createEmbeddingGenerationError when the model fails
   */
  async calculateEmbeddingBatch(texts) {
    // A non-array has nothing to map over; return an empty result instead of
    // crashing with "texts.map is not a function".
    if (!Array.isArray(texts)) {
      debug('Skipping batch embedding for empty or invalid texts array.');
      return [];
    }

    if (texts.length === 0 || texts.some((text) => typeof text !== 'string' || text.trim().length === 0)) {
      debug('Skipping batch embedding for empty or invalid texts array.');
      return texts.map(() => null);
    }

    try {
      const model = await this.initialize();
      const embeddings = [];

      // passageEmbed is an async generator of batches
      for await (const batch of model.passageEmbed(texts)) {
        for (const vec of batch) {
          if (vec && typeof vec.length === 'number' && vec.length === this.embeddingDimensions) {
            embeddings.push(Array.from(vec)); // Convert Float32Array (or other array-like) to regular array
          } else {
            console.error(
              chalk.red(
                `Generated batch embedding dimension (${vec?.length}) does not match expected (${this.embeddingDimensions}) or embedding is invalid.`
              )
            );
            embeddings.push(null); // Keep positions aligned with the input
          }
        }
      }

      // A count mismatch is only reported; the partial result is still returned,
      // so callers must not assume positional alignment in that case.
      if (embeddings.length !== texts.length) {
        console.error(
          chalk.red(`Number of generated embeddings (${embeddings.length}) does not match number of input texts (${texts.length}).`)
        );
      }

      debug(`Batch embeddings generated successfully, count: ${embeddings.filter((e) => e !== null).length}`);
      return embeddings;
    } catch (error) {
      console.error(chalk.red(`Error calculating batch embeddings: ${error.message}`), error);
      throw createEmbeddingGenerationError(`Failed to calculate batch embeddings: ${error.message}`, error, { textsCount: texts.length });
    }
  }

  /**
   * Calculate the query embedding for a text using fastembed.
   *
   * @param {string} text - The query text to embed
   * @returns {Promise<number[]|null>} Plain array of `embeddingDimensions` numbers, or null when
   *   the text is empty/not a string or the generated vector fails validation
   * @throws The error built by createEmbeddingGenerationError when the model fails
   */
  async calculateQueryEmbedding(text) {
    if (typeof text !== 'string' || text.trim().length === 0) {
      return null;
    }

    // 'query:' namespace keeps query vectors apart from passage vectors of the same text
    const cacheKey = buildEmbeddingCacheKey(text, 'query:');
    if (this.cacheManager) {
      const cachedResult = this.cacheManager.getEmbedding(cacheKey);
      if (cachedResult) {
        return cachedResult;
      }
    }

    try {
      const model = await this.initialize();
      const embeddingArray = await model.queryEmbed(text);

      if (!embeddingArray || typeof embeddingArray.length !== 'number' || embeddingArray.length !== this.embeddingDimensions) {
        console.error(
          chalk.red(
            `Generated query embedding dimension (${embeddingArray?.length}) does not match expected (${this.embeddingDimensions}) or embedding is invalid.`
          )
        );
        return null;
      }

      // Array.from() yields a plain number[] whether the model returned a typed array or an array
      const embedding = Array.from(embeddingArray);

      if (this.cacheManager) {
        this.cacheManager.setEmbedding(cacheKey, embedding);
      }

      return embedding;
    } catch (error) {
      console.error(chalk.red(`Error calculating query embedding: ${error.message}`), error);
      throw createEmbeddingGenerationError(`Failed to calculate query embedding: ${error.message}`, error, {
        text: text.substring(0, 100),
      });
    }
  }

  // ============================================================================
  // CLEANUP
  // ============================================================================

  /**
   * Drop the model reference, reset initialization state and clear the
   * 'embedding' cache. Errors are logged, not re-thrown.
   *
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (this.cleaningUp) {
      return; // Already cleaning up, prevent duplicate calls
    }

    this.cleaningUp = true;

    try {
      // No explicit model teardown is invoked here; only our references are released
      this.embeddingModel = null;
      this.modelInitialized = false;
      this.modelInitializationPromise = null;

      if (this.cacheManager) {
        this.cacheManager.clearCache('embedding');
      }

      console.log(chalk.green('[ModelManager] Model resources cleaned up.'));
    } catch (error) {
      console.error(chalk.red(`[ModelManager] Error during cleanup: ${error.message}`));
    } finally {
      this.cleaningUp = false;
    }
  }
}

/**
 * Build the embedding-cache key for a text.
 *
 * The key is a SHA-256 digest of the trimmed text, so it has a fixed size and
 * covers the whole input rather than a prefix.
 *
 * @param {string} text - Text being embedded
 * @param {string} [namespace] - Prefix separating key spaces (e.g. 'query:')
 * @returns {string} Cache key
 */
function buildEmbeddingCacheKey(text, namespace = '') {
  const digest = createHash('sha256').update(text.trim()).digest('hex');
  return `${namespace}${digest}`;
}
@@ -1,97 +0,0 @@
1
- /**
2
- * Similarity Calculator
3
- *
4
- * This module contains stateless helper functions for calculating similarity between vectors
5
- * and between file paths. They depend only on node:path and the project's debug logger, so they
6
- * are safe to extract for modular use.
7
- */
8
-
9
- import path from 'node:path';
10
- import { debug } from '../utils/logging.js';
11
-
12
/**
 * Calculate cosine similarity between two vectors.
 *
 * Accepts plain arrays and numeric typed arrays (e.g. Float32Array), in any
 * combination. Returns 0 instead of failing when the inputs are unusable:
 * missing, not a vector, of different or zero length, (near-)zero magnitude,
 * or containing values that make the result non-finite.
 *
 * @param {ArrayLike<number>} vecA - First vector
 * @param {ArrayLike<number>} vecB - Second vector
 * @returns {number} Cosine similarity score between -1 and 1 (0 for unusable input)
 */
export function calculateCosineSimilarity(vecA, vecB) {
  // BigInt typed arrays are excluded because their elements cannot be mixed with Number arithmetic.
  const isNumericVector = (value) =>
    Array.isArray(value) ||
    (ArrayBuffer.isView(value) &&
      !(value instanceof DataView) &&
      !(value instanceof BigInt64Array) &&
      !(value instanceof BigUint64Array));

  if (!isNumericVector(vecA) || !isNumericVector(vecB) || vecA.length !== vecB.length || vecA.length === 0) {
    debug(`Invalid input for cosine similarity: vecA length=${vecA?.length}, vecB length=${vecB?.length}`);
    return 0;
  }

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;

  const len = vecA.length;
  for (let i = 0; i < len; i++) {
    const a = vecA[i];
    const b = vecB[i];
    dotProduct += a * b;
    normA += a * a;
    normB += b * b;
  }

  // Treat (near-)zero vectors as having no direction; the threshold applies to the squared norms.
  if (normA <= 1e-9 || normB <= 1e-9) {
    return 0;
  }

  const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));

  // NaN/Infinity components would otherwise leak NaN to callers that sort or threshold on the score.
  if (!Number.isFinite(similarity)) {
    return 0;
  }

  // Clamp to absorb floating point error that pushes the ratio slightly outside [-1, 1]
  return Math.max(-1.0, Math.min(1.0, similarity));
}
48
-
49
/**
 * Score how close two file paths are in the directory tree.
 *
 * Only the directory part of each path is compared. The score is the number of
 * leading directory segments the two paths share, divided by the average number
 * of directory segments, clamped to [0, 1].
 *
 * @param {string} path1 - First file path
 * @param {string} path2 - Second file path
 * @returns {number} Similarity score between 0 and 1 (0 when either path is empty or on error)
 */
export function calculatePathSimilarity(path1, path2) {
  if (!path1 || !path2) {
    return 0;
  }

  // Normalized directory of a file path, split into its non-empty segments.
  const directorySegments = (filePath) =>
    path
      .dirname(path.normalize(filePath))
      .split(path.sep)
      .filter((segment) => segment);

  try {
    const segmentsA = directorySegments(path1);
    const segmentsB = directorySegments(path2);

    // Walk both lists in step until the first differing segment.
    const shorter = Math.min(segmentsA.length, segmentsB.length);
    let shared = 0;
    while (shared < shorter && segmentsA[shared] === segmentsB[shared]) {
      shared += 1;
    }

    const meanDepth = (segmentsA.length + segmentsB.length) / 2;
    if (meanDepth === 0) {
      // Neither path has any directory segment, so there is nothing to tell them apart.
      return 1;
    }

    return Math.max(0, Math.min(1, shared / meanDepth));
  } catch (error) {
    debug(`[calculatePathSimilarity] Error comparing paths '${path1}' and '${path2}': ${error.message}`);
    return 0;
  }
}
@@ -1,113 +0,0 @@
1
- /**
2
- * Type Definitions for Embeddings System
3
- *
4
- * This module provides TypeScript-style interfaces and type definitions
5
- * for the embeddings system. These help with documentation and development
6
- * even in a JavaScript environment.
7
- */
8
-
9
- /**
10
- * @typedef {Object} EmbeddingVector
11
- * @property {number[]} vector - The embedding vector array
12
- * @property {number} dimensions - Number of dimensions in the vector
13
- * @property {string} model - Model used to generate the embedding
14
- * @property {string} [id] - Optional identifier for the embedding
15
- */
16
-
17
- /**
18
- * @typedef {Object} SearchResult
19
- * @property {string} content - The content that was found
20
- * @property {string} path - File path of the content
21
- * @property {number} similarity - Similarity score (0-1)
22
- * @property {string} [language] - Programming language of the content
23
- * @property {string} [context] - Additional context information
24
- * @property {number} [line_start] - Starting line number
25
- * @property {number} [line_end] - Ending line number
26
- * @property {boolean} [reranked] - Whether the result has been reranked
27
- * @property {Object} [metadata] - Additional metadata
28
- */
29
-
30
- /**
31
- * @typedef {Object} SearchOptions
32
- * @property {number} [limit] - Maximum number of results to return
33
- * @property {number} [threshold] - Minimum similarity threshold
34
- * @property {string} [language] - Filter by programming language
35
- * @property {string} [path] - Filter by file path pattern
36
- * @property {boolean} [includeMetadata] - Include metadata in results
37
- * @property {boolean} [rerank] - Whether to rerank results
38
- * @property {string} [context] - Additional context for search
39
- */
40
-
41
- /**
42
- * @typedef {Object} DocumentChunk
43
- * @property {string} content - The text content of the chunk
44
- * @property {string} document_title - Title of the document
45
- * @property {string} document_path - Path to the document
46
- * @property {number} chunk_index - Index of the chunk within the document
47
- * @property {number[]} embedding - Embedding vector for the chunk
48
- * @property {string} [h1_title] - H1 title if applicable
49
- * @property {number[]} [h1_embedding] - H1 embedding if applicable
50
- * @property {string} [language] - Programming language
51
- * @property {Object} [metadata] - Additional metadata
52
- */
53
-
54
- /**
55
- * @typedef {Object} EmbeddingConfig
56
- * @property {string} modelName - Name of the embedding model
57
- * @property {number} dimensions - Number of dimensions
58
- * @property {string} lancedbPath - Path to LanceDB database
59
- * @property {string} fastembedCacheDir - FastEmbed cache directory
60
- * @property {number} maxRetries - Maximum number of retries
61
- * @property {boolean} debug - Enable debug mode
62
- * @property {number} maxConcurrency - Maximum concurrent operations
63
- * @property {number} batchSize - Batch size for processing
64
- */
65
-
66
- /**
67
- * @typedef {Object} CacheMetrics
68
- * @property {number} hits - Number of cache hits
69
- * @property {number} misses - Number of cache misses
70
- * @property {number} size - Current cache size
71
- * @property {number} maxSize - Maximum cache size
72
- * @property {number} evictions - Number of evictions
73
- * @property {number} hitRate - Hit rate percentage
74
- */
75
-
76
- /**
77
- * @typedef {Object} ProcessingProgress
78
- * @property {number} totalFiles - Total number of files to process
79
- * @property {number} processedCount - Number of files processed
80
- * @property {number} skippedCount - Number of files skipped
81
- * @property {number} failedCount - Number of files failed
82
- * @property {number} startTime - Processing start time
83
- * @property {number} currentTime - Current time
84
- * @property {number} estimatedTimeRemaining - Estimated time remaining
85
- * @property {number} percentComplete - Percentage complete
86
- */
87
-
88
- /**
89
- * @typedef {Object} DatabaseSchema
90
- * @property {string} tableName - Name of the table
91
- * @property {Object} fields - Field definitions
92
- * @property {string[]} indexes - Index definitions
93
- * @property {string} primaryKey - Primary key field
94
- */
95
-
96
- /**
97
- * @typedef {Object} BatchProcessingOptions
98
- * @property {number} batchSize - Size of each batch
99
- * @property {number} maxConcurrency - Maximum concurrent batches
100
- * @property {boolean} skipExisting - Skip files that already exist
101
- * @property {Function} [progressCallback] - Progress callback function
102
- * @property {Function} [errorCallback] - Error callback function
103
- */
104
-
105
- /**
106
- * @typedef {Object} QueryEmbeddingOptions
107
- * @property {string} [context] - Additional context for the query
108
- * @property {string} [language] - Programming language hint
109
- * @property {boolean} [normalize] - Whether to normalize the embedding
110
- * @property {Object} [metadata] - Additional metadata
111
- */
112
-
113
- export {};