codecritique 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +82 -114
  2. package/package.json +10 -9
  3. package/src/content-retrieval.test.js +775 -0
  4. package/src/custom-documents.test.js +440 -0
  5. package/src/feedback-loader.test.js +529 -0
  6. package/src/llm.test.js +256 -0
  7. package/src/project-analyzer.test.js +747 -0
  8. package/src/rag-analyzer.js +12 -0
  9. package/src/rag-analyzer.test.js +1109 -0
  10. package/src/rag-review.test.js +317 -0
  11. package/src/setupTests.js +131 -0
  12. package/src/zero-shot-classifier-open.test.js +278 -0
  13. package/src/embeddings/cache-manager.js +0 -364
  14. package/src/embeddings/constants.js +0 -40
  15. package/src/embeddings/database.js +0 -921
  16. package/src/embeddings/errors.js +0 -208
  17. package/src/embeddings/factory.js +0 -447
  18. package/src/embeddings/file-processor.js +0 -851
  19. package/src/embeddings/model-manager.js +0 -337
  20. package/src/embeddings/similarity-calculator.js +0 -97
  21. package/src/embeddings/types.js +0 -113
  22. package/src/pr-history/analyzer.js +0 -579
  23. package/src/pr-history/bot-detector.js +0 -123
  24. package/src/pr-history/cli-utils.js +0 -204
  25. package/src/pr-history/comment-processor.js +0 -549
  26. package/src/pr-history/database.js +0 -819
  27. package/src/pr-history/github-client.js +0 -629
  28. package/src/technology-keywords.json +0 -753
  29. package/src/utils/command.js +0 -48
  30. package/src/utils/constants.js +0 -263
  31. package/src/utils/context-inference.js +0 -364
  32. package/src/utils/document-detection.js +0 -105
  33. package/src/utils/file-validation.js +0 -271
  34. package/src/utils/git.js +0 -232
  35. package/src/utils/language-detection.js +0 -170
  36. package/src/utils/logging.js +0 -24
  37. package/src/utils/markdown.js +0 -132
  38. package/src/utils/mobilebert-tokenizer.js +0 -141
  39. package/src/utils/pr-chunking.js +0 -276
  40. package/src/utils/string-utils.js +0 -28
@@ -1,921 +0,0 @@
1
- /**
2
- * Database Manager Module
3
- *
4
- * This module provides centralized database management for embeddings
5
- * using LanceDB and Apache Arrow.
6
- *
7
- * Features:
8
- * - Database connection management
9
- * - Table initialization and schema management
10
- * - Adaptive vector indexing
11
- * - Project-specific data cleanup
12
- * - Database maintenance operations
13
- */
14
-
15
- /**
16
- * @typedef {import('./types.js').DatabaseSchema} DatabaseSchema
17
- * @typedef {import('@lancedb/lancedb').Connection} LanceDBConnection
18
- * @typedef {import('@lancedb/lancedb').Table} LanceDBTable
19
- */
20
-
21
- import fs from 'node:fs';
22
- import path from 'node:path';
23
- import * as lancedb from '@lancedb/lancedb';
24
- import { Field, FixedSizeList, Float32, Int32, Schema, Utf8 } from 'apache-arrow';
25
- import chalk from 'chalk';
26
- import { debug } from '../utils/logging.js';
27
- import { EMBEDDING_DIMENSIONS, TABLE_NAMES } from './constants.js';
28
- import { LANCEDB_PATH } from './constants.js';
29
- import { createDatabaseError, ERROR_CODES } from './errors.js';
30
-
31
- // ============================================================================
32
- // DATABASE CONFIGURATION
33
- // ============================================================================
34
-
35
- // Database Constants
36
// Local aliases for the table names this module works with, taken from the
// shared TABLE_NAMES map in ./constants.js.
const {
  FILE_EMBEDDINGS: FILE_EMBEDDINGS_TABLE,
  DOCUMENT_CHUNK: DOCUMENT_CHUNK_TABLE,
  PR_COMMENTS: PR_COMMENTS_TABLE,
  PROJECT_SUMMARIES: PROJECT_SUMMARIES_TABLE,
} = TABLE_NAMES;
40
-
41
- // ============================================================================
42
- // DATABASE MANAGER CLASS
43
- // ============================================================================
44
-
45
- export class DatabaseManager {
46
- constructor(options = {}) {
47
- this.dbPath = options.dbPath || LANCEDB_PATH;
48
- this.embeddingDimensions = options.embeddingDimensions || EMBEDDING_DIMENSIONS;
49
-
50
- // Connection state
51
- this.dbConnection = null;
52
- this.tablesInitialized = false;
53
- this.tableInitializationPromise = null;
54
- this.cleaningUp = false;
55
-
56
- // Table names
57
- this.fileEmbeddingsTable = options.fileEmbeddingsTable || FILE_EMBEDDINGS_TABLE;
58
- this.documentChunkTable = options.documentChunkTable || DOCUMENT_CHUNK_TABLE;
59
- this.prCommentsTable = options.prCommentsTable || PR_COMMENTS_TABLE;
60
- }
61
-
62
- // ============================================================================
63
- // CONNECTION MANAGEMENT
64
- // ============================================================================
65
-
66
- /**
67
- * Get database connection, creating it if necessary
68
- * @returns {Promise<LanceDBConnection>} Database connection
69
- */
70
- async getDBConnection() {
71
- if (!this.dbConnection) {
72
- console.log(chalk.blue(`Initializing DB connection. Target Path: ${this.dbPath}`));
73
- if (!fs.existsSync(this.dbPath)) {
74
- fs.mkdirSync(this.dbPath, { recursive: true });
75
- }
76
- this.dbConnection = await lancedb.connect(this.dbPath);
77
- console.log(chalk.green('LanceDB connected.'));
78
- }
79
- return this.dbConnection;
80
- }
81
-
82
- /**
83
- * Get database connection with initialized tables
84
- * @returns {Promise<LanceDBConnection>} Database connection
85
- */
86
- async getDB() {
87
- const db = await this.getDBConnection();
88
- if (!this.tablesInitialized) {
89
- await this.initializeTables();
90
- }
91
- return db;
92
- }
93
-
94
- /**
95
- * Connect to database (compatibility method)
96
- * @returns {Promise<LanceDBConnection>} Database connection
97
- */
98
- async connect() {
99
- return this.getDB();
100
- }
101
-
102
- /**
103
- * Close database connection
104
- */
105
- async closeConnection() {
106
- if (this.dbConnection) {
107
- console.log('Closing LanceDB connection...');
108
- await this.dbConnection.close();
109
- this.dbConnection = null;
110
- this.tablesInitialized = false;
111
- this.tableInitializationPromise = null;
112
- console.log('LanceDB connection closed.');
113
- }
114
- }
115
-
116
- // ============================================================================
117
- // TABLE INITIALIZATION
118
- // ============================================================================
119
-
120
- /**
121
- * Initialize database tables
122
- * @returns {Promise<void>}
123
- */
124
- async initializeTables() {
125
- if (this.tablesInitialized) {
126
- return;
127
- }
128
-
129
- // If initialization is already in progress, wait for it to complete
130
- if (this.tableInitializationPromise) {
131
- await this.tableInitializationPromise;
132
- return;
133
- }
134
-
135
- // Start initialization and store the promise
136
- this.tableInitializationPromise = (async () => {
137
- try {
138
- console.log(chalk.blue('Initializing database tables and indices...'));
139
- const db = await this.getDBConnection();
140
- await this.ensureTablesExist(db);
141
- this.tablesInitialized = true;
142
- console.log(chalk.green('Database tables and indices initialized successfully.'));
143
- } catch (error) {
144
- this.tablesInitialized = false;
145
- console.error(chalk.red('Failed to initialize database tables:'), error);
146
- throw error; // Re-throw to propagate the error to callers
147
- } finally {
148
- // The initialization attempt is over, clear the promise
149
- this.tableInitializationPromise = null;
150
- }
151
- })();
152
-
153
- await this.tableInitializationPromise;
154
- }
155
-
156
- /**
157
- * Ensure all required tables exist with proper schemas
158
- * @param {LanceDBConnection} db - Database connection
159
- * @returns {Promise<void>}
160
- */
161
- async ensureTablesExist(db) {
162
- try {
163
- const tableNames = await db.tableNames();
164
- const vectorType = new FixedSizeList(this.embeddingDimensions, new Field('item', new Float32(), true));
165
-
166
- // File embeddings table schema
167
- const fileFields = [
168
- new Field('id', new Utf8(), false),
169
- new Field('content', new Utf8(), false),
170
- new Field('type', new Utf8(), false),
171
- new Field('name', new Utf8(), false),
172
- new Field('path', new Utf8(), false),
173
- new Field('project_path', new Utf8(), false),
174
- new Field('language', new Utf8(), true),
175
- new Field('content_hash', new Utf8(), false),
176
- new Field('last_modified', new Utf8(), false),
177
- new Field('vector', vectorType, false),
178
- ];
179
- const fileSchema = new Schema(fileFields);
180
-
181
- // Document chunk table schema
182
- const documentChunkFields = [
183
- new Field('id', new Utf8(), false),
184
- new Field('content', new Utf8(), false),
185
- new Field('original_document_path', new Utf8(), false),
186
- new Field('project_path', new Utf8(), false),
187
- new Field('heading_text', new Utf8(), true),
188
- new Field('document_title', new Utf8(), true),
189
- new Field('language', new Utf8(), true),
190
- new Field('vector', vectorType, false),
191
- new Field('content_hash', new Utf8(), false),
192
- new Field('last_modified', new Utf8(), false),
193
- ];
194
- const documentChunkSchema = new Schema(documentChunkFields);
195
-
196
- // PR comments table schema
197
- const prCommentsSchema = this.createPRCommentsSchema();
198
-
199
- // Create or open tables
200
- let fileTable, documentChunkTable, prCommentsTable;
201
-
202
- if (!tableNames.includes(this.fileEmbeddingsTable)) {
203
- console.log(chalk.yellow(`Creating ${this.fileEmbeddingsTable} table with optimized schema...`));
204
- fileTable = await db.createEmptyTable(this.fileEmbeddingsTable, fileSchema, { mode: 'create' });
205
- console.log(chalk.green(`Created ${this.fileEmbeddingsTable} table.`));
206
- } else {
207
- fileTable = await db.openTable(this.fileEmbeddingsTable);
208
- await this._checkSchemaCompatibility(fileTable, this.fileEmbeddingsTable, 'project_path');
209
- }
210
-
211
- if (!tableNames.includes(this.documentChunkTable)) {
212
- console.log(chalk.yellow(`Creating ${this.documentChunkTable} table with optimized schema...`));
213
- documentChunkTable = await db.createEmptyTable(this.documentChunkTable, documentChunkSchema, { mode: 'create' });
214
- console.log(chalk.green(`Created ${this.documentChunkTable} table.`));
215
- } else {
216
- documentChunkTable = await db.openTable(this.documentChunkTable);
217
- await this._checkSchemaCompatibility(documentChunkTable, this.documentChunkTable, 'project_path');
218
- }
219
-
220
- // Create PR comments table
221
- if (!tableNames.includes(this.prCommentsTable)) {
222
- console.log(chalk.yellow(`Creating ${this.prCommentsTable} table with optimized schema...`));
223
- prCommentsTable = await db.createEmptyTable(this.prCommentsTable, prCommentsSchema, { mode: 'create' });
224
- console.log(chalk.green(`Created ${this.prCommentsTable} table.`));
225
- } else {
226
- prCommentsTable = await db.openTable(this.prCommentsTable);
227
- }
228
-
229
- // Create FTS indexes
230
- await this._createFTSIndexes([
231
- [fileTable, this.fileEmbeddingsTable, 'content'],
232
- [documentChunkTable, this.documentChunkTable, 'content'],
233
- [prCommentsTable, this.prCommentsTable, 'comment_text'],
234
- ]);
235
-
236
- // Create adaptive vector indexes
237
- await this._createVectorIndexes([
238
- [fileTable, this.fileEmbeddingsTable, 'vector'],
239
- [documentChunkTable, this.documentChunkTable, 'vector'],
240
- [prCommentsTable, this.prCommentsTable, 'combined_embedding'],
241
- ]);
242
- } catch (error) {
243
- console.error(chalk.red(`Error ensuring tables exist: ${error.message}`), error.stack);
244
- throw error;
245
- }
246
- }
247
-
248
- // ============================================================================
249
- // TABLE OPERATIONS
250
- // ============================================================================
251
-
252
- /**
253
- * Get table by name
254
- * @param {string} tableName - Name of the table
255
- * @returns {Promise<LanceDBTable|null>} Table instance or null if not found
256
- */
257
- async getTable(tableName) {
258
- try {
259
- const db = await this.getDBConnection();
260
- const tableNames = await db.tableNames();
261
- if (tableNames.includes(tableName)) {
262
- return await db.openTable(tableName);
263
- }
264
- return null;
265
- } catch (error) {
266
- console.error(chalk.red(`Error opening table ${tableName}: ${error.message}`), error);
267
- return null;
268
- }
269
- }
270
-
271
- // ============================================================================
272
- // SCHEMA MANAGEMENT
273
- // ============================================================================
274
-
275
- /**
276
- * Create PR comments schema
277
- * @returns {import('apache-arrow').Schema} PR comments schema
278
- */
279
- createPRCommentsSchema() {
280
- const vectorType = new FixedSizeList(this.embeddingDimensions, new Field('item', new Float32(), true));
281
-
282
- const fields = [
283
- new Field('id', new Utf8(), false),
284
- new Field('pr_number', new Int32(), false),
285
- new Field('repository', new Utf8(), false),
286
- new Field('project_path', new Utf8(), false),
287
- new Field('comment_type', new Utf8(), false),
288
- new Field('comment_text', new Utf8(), false),
289
- new Field('comment_embedding', vectorType, false),
290
-
291
- // Code context fields
292
- new Field('file_path', new Utf8(), true),
293
- new Field('line_number', new Int32(), true),
294
- new Field('line_range_start', new Int32(), true),
295
- new Field('line_range_end', new Int32(), true),
296
- new Field('original_code', new Utf8(), true),
297
- new Field('suggested_code', new Utf8(), true),
298
- new Field('diff_hunk', new Utf8(), true),
299
-
300
- // Code embedding
301
- new Field('code_embedding', vectorType, true),
302
- new Field('combined_embedding', vectorType, false),
303
-
304
- // Metadata
305
- new Field('author', new Utf8(), false),
306
- new Field('created_at', new Utf8(), false),
307
- new Field('updated_at', new Utf8(), true),
308
- new Field('review_id', new Utf8(), true),
309
- new Field('review_state', new Utf8(), true),
310
-
311
- // Analysis metadata
312
- new Field('issue_category', new Utf8(), true),
313
- new Field('severity', new Utf8(), true),
314
- new Field('pattern_tags', new Utf8(), true),
315
- ];
316
-
317
- return new Schema(fields);
318
- }
319
-
320
- // ============================================================================
321
- // INDEXING
322
- // ============================================================================
323
-
324
- /**
325
- * Create adaptive vector indexes based on dataset size
326
- * @param {LanceDBTable} table - Table instance
327
- * @param {string} tableName - Table name
328
- * @param {string} vectorField - Vector field name
329
- * @returns {Promise<Object>} Index information
330
- */
331
- async createAdaptiveVectorIndexes(table, tableName, vectorField = 'vector') {
332
- try {
333
- const rowCount = await table.countRows();
334
- console.log(chalk.blue(`[${tableName}] Row count: ${rowCount}`));
335
-
336
- if (rowCount < 100) {
337
- console.log(chalk.blue(`[${tableName}] Skipping indexing for small dataset (${rowCount} rows). Using exact search.`));
338
- return { indexType: 'exact', rowCount };
339
- } else if (rowCount < 1000) {
340
- console.log(chalk.blue(`[${tableName}] Using exact search for small dataset (${rowCount} rows) - no index needed`));
341
- return { indexType: 'exact', rowCount };
342
- } else if (rowCount < 10000) {
343
- const numPartitions = Math.max(Math.floor(Math.sqrt(rowCount / 50)), 2);
344
- console.log(
345
- chalk.blue(`[${tableName}] Creating/updating IVF-Flat index for medium dataset (${rowCount} rows, ${numPartitions} partitions)`)
346
- );
347
- await table.createIndex(vectorField, {
348
- config: lancedb.Index.ivfFlat({ numPartitions }),
349
- replace: false,
350
- });
351
- return { indexType: 'ivf_flat', rowCount, numPartitions };
352
- } else {
353
- const numPartitions = Math.max(Math.floor(Math.sqrt(rowCount / 100)), 8);
354
- const numSubVectors = Math.floor(this.embeddingDimensions / 4);
355
- console.log(
356
- chalk.blue(`[${tableName}] Creating/updating IVF-PQ index for large dataset (${rowCount} rows, ${numPartitions} partitions)`)
357
- );
358
- await table.createIndex(vectorField, {
359
- config: lancedb.Index.ivfPq({
360
- numPartitions,
361
- numSubVectors,
362
- numBits: 8,
363
- }),
364
- replace: false,
365
- });
366
- return { indexType: 'ivf_pq', rowCount, numPartitions, numSubVectors };
367
- }
368
- } catch (error) {
369
- if (error.message.includes('already exists')) {
370
- console.log(chalk.green(`[${tableName}] Index already up-to-date.`));
371
- return { indexType: 'existing' };
372
- }
373
- console.warn(chalk.yellow(`[${tableName}] Index creation/update failed: ${error.message}. Falling back to exact search.`));
374
- return { indexType: 'exact_fallback', error: error.message };
375
- }
376
- }
377
-
378
- // ============================================================================
379
- // CLEANUP OPERATIONS
380
- // ============================================================================
381
-
382
- /**
383
- * Clean up database connection and resources
384
- */
385
- async cleanup() {
386
- if (this.cleaningUp) {
387
- return; // Already cleaning up, prevent duplicate calls
388
- }
389
-
390
- this.cleaningUp = true;
391
-
392
- try {
393
- await this.closeConnection();
394
- console.log(chalk.green('Database resources cleaned up.'));
395
- } catch (error) {
396
- console.error(`Error during database cleanup: ${error.message}`);
397
- } finally {
398
- this.cleaningUp = false;
399
- }
400
- }
401
-
402
- /**
403
- * Clear all embeddings by dropping tables
404
- * @returns {Promise<boolean>} Success status
405
- */
406
- async clearAllEmbeddings() {
407
- let db = null;
408
- try {
409
- console.log(chalk.cyan('Clearing ALL embeddings by dropping tables...'));
410
- console.log(chalk.red('WARNING: This will affect all projects on this machine!'));
411
-
412
- if (!fs.existsSync(this.dbPath)) {
413
- console.log(chalk.yellow('LanceDB directory does not exist, nothing to clear.'));
414
- return true;
415
- }
416
-
417
- db = await lancedb.connect(this.dbPath);
418
- const tableNames = await db.tableNames();
419
- let droppedCount = 0;
420
-
421
- for (const tableName of [this.fileEmbeddingsTable, this.documentChunkTable, this.prCommentsTable]) {
422
- if (tableNames.includes(tableName)) {
423
- console.log(chalk.yellow(`Dropping table ${tableName}...`));
424
- await db.dropTable(tableName);
425
- console.log(chalk.green(`Table ${tableName} dropped.`));
426
- droppedCount++;
427
- } else {
428
- console.log(chalk.yellow(`Table ${tableName} does not exist.`));
429
- }
430
- }
431
-
432
- if (droppedCount > 0) {
433
- console.log(chalk.green('All embedding tables have been dropped.'));
434
- console.log(chalk.yellow('Run the embedding generation process again to recreate tables.'));
435
- } else {
436
- console.log(chalk.green('No embedding tables found to drop.'));
437
- }
438
-
439
- // Reset connection state
440
- this.dbConnection = null;
441
- this.tablesInitialized = false;
442
- return true;
443
- } catch (error) {
444
- console.error(chalk.red(`Error clearing embeddings: ${error.message}`), error);
445
- this.dbConnection = null;
446
- this.tablesInitialized = false;
447
- throw error;
448
- }
449
- }
450
-
451
- /**
452
- * Clear embeddings for a specific project
453
- * @param {string} projectPath - Project path
454
- * @returns {Promise<boolean>} Success status
455
- */
456
- async clearProjectEmbeddings(projectPath = process.cwd()) {
457
- let db = null;
458
- try {
459
- const resolvedProjectPath = path.resolve(projectPath);
460
- const projectName = path.basename(resolvedProjectPath);
461
-
462
- // Safety check: ensure project path is valid and not root
463
- if (!resolvedProjectPath || resolvedProjectPath === '/' || resolvedProjectPath === path.resolve('/')) {
464
- throw new Error(`Invalid project path: ${resolvedProjectPath}. Cannot clear embeddings for root directory.`);
465
- }
466
-
467
- // Additional safety: ensure project path is not too generic
468
- const pathParts = resolvedProjectPath.split(path.sep);
469
- if (pathParts.length <= 2) {
470
- throw new Error(`Project path too generic: ${resolvedProjectPath}. For safety, project must be at least 3 levels deep.`);
471
- }
472
-
473
- console.log(chalk.cyan(`Clearing embeddings for project: ${resolvedProjectPath} (${projectName})`));
474
-
475
- if (!fs.existsSync(this.dbPath)) {
476
- console.log(chalk.yellow('LanceDB directory does not exist, nothing to clear.'));
477
- return true;
478
- }
479
-
480
- db = await lancedb.connect(this.dbPath);
481
- const tableNames = await db.tableNames();
482
- let deletedCount = 0;
483
-
484
- // Clear file embeddings for this project
485
- if (tableNames.includes(this.fileEmbeddingsTable)) {
486
- const fileTable = await db.openTable(this.fileEmbeddingsTable);
487
- await this._validateTableHasProjectPath(fileTable, this.fileEmbeddingsTable);
488
- deletedCount += await this._clearProjectTableRecords(
489
- db,
490
- this.fileEmbeddingsTable,
491
- resolvedProjectPath,
492
- projectName,
493
- 'project_path'
494
- );
495
- }
496
-
497
- // Clear document chunk embeddings for this project
498
- if (tableNames.includes(this.documentChunkTable)) {
499
- const docTable = await db.openTable(this.documentChunkTable);
500
- await this._validateTableHasProjectPath(docTable, this.documentChunkTable);
501
- deletedCount += await this._clearProjectTableRecords(db, this.documentChunkTable, resolvedProjectPath, projectName, 'project_path');
502
- }
503
-
504
- // Clear project summaries for this project
505
- if (tableNames.includes(PROJECT_SUMMARIES_TABLE)) {
506
- const summariesTable = await db.openTable(PROJECT_SUMMARIES_TABLE);
507
- await this._validateTableHasProjectPath(summariesTable, PROJECT_SUMMARIES_TABLE);
508
- deletedCount += await this._clearProjectTableRecords(db, PROJECT_SUMMARIES_TABLE, resolvedProjectPath, projectName, 'project_path');
509
- }
510
-
511
- // Note: PR comments are cleared via separate pr-history:clear command
512
- // This embeddings:clear command handles file embeddings, document embeddings, and project summaries
513
-
514
- if (deletedCount > 0) {
515
- console.log(chalk.green(`Successfully cleared ${deletedCount} embeddings for project: ${resolvedProjectPath}`));
516
-
517
- // Optimize tables after cleanup to maintain performance
518
- await this._optimizeTablesAfterCleanup(db, tableNames);
519
- } else {
520
- console.log(chalk.yellow(`No embeddings found for project: ${resolvedProjectPath}`));
521
- }
522
-
523
- return true;
524
- } catch (error) {
525
- console.error(chalk.red(`Error clearing project embeddings: ${error.message}`), error);
526
- throw error;
527
- }
528
- }
529
-
530
- // ============================================================================
531
- // PRIVATE METHODS
532
- // ============================================================================
533
-
534
- /**
535
- * Check schema compatibility for existing tables
536
- * @param {LanceDBTable} table - Table instance
537
- * @param {string} tableName - Table name
538
- * @param {string} requiredField - Required field name
539
- * @private
540
- */
541
- async _checkSchemaCompatibility(table, tableName, requiredField) {
542
- try {
543
- const currentSchema = await table.schema;
544
- if (currentSchema && currentSchema.fields) {
545
- const hasRequiredField = currentSchema.fields.some((field) => field.name === requiredField);
546
- if (!hasRequiredField) {
547
- console.log(chalk.yellow(`Table ${tableName} has old schema without ${requiredField}. Migration needed.`));
548
- console.log(chalk.yellow(`Please clear embeddings and regenerate them to use the new schema with project isolation.`));
549
- }
550
- }
551
- } catch (schemaError) {
552
- debug(`Could not check schema for ${tableName}: ${schemaError.message}`);
553
- }
554
- }
555
-
556
- /**
557
- * Validate that a table has the project_path field for proper project isolation
558
- * @param {LanceDBTable} table - Table instance
559
- * @param {string} tableName - Table name
560
- * @throws {Error} If table doesn't have project_path field
561
- * @private
562
- */
563
- async _validateTableHasProjectPath(table, tableName) {
564
- try {
565
- const currentSchema = await table.schema;
566
- if (currentSchema && currentSchema.fields) {
567
- const hasProjectPath = currentSchema.fields.some((field) => field.name === 'project_path');
568
- if (!hasProjectPath) {
569
- throw new Error(
570
- `Table ${tableName} does not have project_path field. Cannot perform project-specific cleanup. Please regenerate embeddings to use the new schema with project isolation.`
571
- );
572
- }
573
- console.log(chalk.green(`✓ Table ${tableName} has project_path field for proper isolation`));
574
- } else {
575
- console.log(chalk.yellow(`Table ${tableName} has no readable schema, skipping validation`));
576
- }
577
- } catch (schemaError) {
578
- // If we can't read the schema, it might be because the table is empty or doesn't exist
579
- // In this case, we should just warn and continue
580
- console.log(chalk.yellow(`Warning: Could not validate schema for ${tableName}: ${schemaError.message}`));
581
- }
582
- }
583
-
584
- /**
585
- * Create FTS indexes for tables
586
- * @param {Array} tableSpecs - Array of [table, tableName, contentField] tuples
587
- * @private
588
- */
589
- async _createFTSIndexes(tableSpecs) {
590
- console.log(chalk.blue('Creating native FTS indexes...'));
591
-
592
- for (const [table, tableName, contentField] of tableSpecs) {
593
- try {
594
- await table.createIndex(contentField, { config: lancedb.Index.fts(), replace: false });
595
- console.log(chalk.green(`FTS index created/updated for ${tableName}`));
596
- } catch (error) {
597
- if (error.message.toLowerCase().includes('already exists')) {
598
- console.log(chalk.green(`FTS index already exists for ${tableName}.`));
599
- } else {
600
- console.warn(chalk.yellow(`FTS index warning for ${tableName}: ${error.message}`));
601
- }
602
- }
603
- }
604
- }
605
-
606
- /**
607
- * Create vector indexes for tables
608
- * @param {Array} tableSpecs - Array of [table, tableName, vectorField] tuples
609
- * @private
610
- */
611
- async _createVectorIndexes(tableSpecs) {
612
- console.log(chalk.blue('Creating adaptive vector indexes...'));
613
-
614
- const indexResults = [];
615
- for (const [table, tableName, vectorField] of tableSpecs) {
616
- const indexInfo = await this.createAdaptiveVectorIndexes(table, tableName, vectorField);
617
- indexResults.push(indexInfo);
618
- }
619
-
620
- console.log(chalk.green(`Indexing complete - ${JSON.stringify(indexResults)}`));
621
- }
622
-
623
- /**
624
- * Optimize tables to sync indices with data and prevent TakeExec panics
625
- * @param {Array} tableSpecs - Array of [table, tableName] tuples
626
- * @private
627
- */
628
- async _optimizeTables(tableSpecs) {
629
- console.log(chalk.blue('Optimizing tables to sync indices with data...'));
630
-
631
- for (const [table, tableName] of tableSpecs) {
632
- try {
633
- console.log(chalk.blue(`Optimizing table: ${tableName}`));
634
- await table.optimize();
635
- console.log(chalk.green(`✓ Table ${tableName} optimized successfully`));
636
- } catch (error) {
637
- // Handle legacy FTS index upgrade issues in v0.22.2
638
- if (error.message && error.message.includes('legacy format')) {
639
- console.warn(
640
- chalk.yellow(
641
- `Skipping optimization for ${tableName} due to legacy index format - will be auto-upgraded during normal operations`
642
- )
643
- );
644
- } else {
645
- console.warn(chalk.yellow(`Warning: Failed to optimize table ${tableName}: ${error.message}`));
646
- }
647
- }
648
- }
649
-
650
- console.log(chalk.green('Table optimization complete'));
651
- }
652
-
653
- /**
654
- * Clear records from a specific table for a project
655
- * @param {LanceDBConnection} db - Database connection
656
- * @param {string} tableName - Table name
657
- * @param {string} resolvedProjectPath - Resolved project path
658
- * @param {string} projectName - Project name
659
- * @param {string} pathField - Path field name
660
- * @returns {Promise<number>} Number of deleted records
661
- * @private
662
- */
663
- async _clearProjectTableRecords(db, tableName, resolvedProjectPath, projectName, pathField) {
664
- const table = await db.openTable(tableName);
665
- const allRecords = await table.query().toArray();
666
-
667
- const projectRecords = allRecords.filter((record) => {
668
- if (!record[pathField]) return false;
669
-
670
- // Check for project-specific structure
671
- if (record.id === `__project_structure__${projectName}` || record.id === '__project_structure__') {
672
- return true;
673
- }
674
-
675
- // Check if this record belongs to the current project
676
- try {
677
- if (pathField === 'project_path') {
678
- // For project_path field, do direct equality check
679
- return record[pathField] === resolvedProjectPath;
680
- } else {
681
- // For other path fields (like 'path'), resolve relative to project path
682
- const absolutePath = path.resolve(resolvedProjectPath, record[pathField]);
683
- return absolutePath.startsWith(resolvedProjectPath);
684
- }
685
- } catch {
686
- return false;
687
- }
688
- });
689
-
690
- if (projectRecords.length > 0) {
691
- console.log(chalk.blue(`Found ${projectRecords.length} ${tableName} records for this project`));
692
-
693
- let deletedCount = 0;
694
- for (const record of projectRecords) {
695
- try {
696
- await table.delete(`id = '${record.id.replace(/'/g, "''")}'`);
697
- deletedCount++;
698
- } catch (deleteError) {
699
- console.warn(chalk.yellow(`Warning: Could not delete record ${record.id}: ${deleteError.message}`));
700
- }
701
- }
702
-
703
- console.log(chalk.green(`Deleted ${deletedCount} ${tableName} records for this project`));
704
- return deletedCount;
705
- } else {
706
- console.log(chalk.yellow(`No ${tableName} records found for this project`));
707
- return 0;
708
- }
709
- }
710
-
711
- /**
712
- * Update the vector index for the PR comments table
713
- * @returns {Promise<void>}
714
- */
715
- async updatePRCommentsIndex() {
716
- try {
717
- const table = await this.getTable(this.prCommentsTable);
718
- if (table) {
719
- console.log(chalk.blue(`Updating vector index for ${this.prCommentsTable}...`));
720
- await this.createAdaptiveVectorIndexes(table, this.prCommentsTable, 'combined_embedding');
721
-
722
- // Optimize table to sync indices with data (conditional due to legacy index issues)
723
- console.log(chalk.blue(`Optimizing ${this.prCommentsTable} table...`));
724
- try {
725
- await table.optimize();
726
- } catch (optimizeError) {
727
- if (optimizeError.message && optimizeError.message.includes('legacy format')) {
728
- console.warn(chalk.yellow(`Skipping optimization due to legacy index format - will be auto-upgraded during normal operations`));
729
- } else {
730
- throw optimizeError; // Re-throw non-legacy errors
731
- }
732
- }
733
-
734
- console.log(chalk.green(`Vector index for ${this.prCommentsTable} updated and optimized.`));
735
- }
736
- } catch (error) {
737
- console.error(chalk.red(`Error updating PR comments index: ${error.message}`));
738
- throw createDatabaseError(`Failed to update PR comments index: ${error.message}`, ERROR_CODES.INDEX_UPDATE_ERROR, error);
739
- }
740
- }
741
-
742
- /**
743
- * Store project summary in database
744
- * @param {string} projectPath - Project path
745
- * @param {Object} projectSummary - Project analysis summary
746
- * @returns {Promise<boolean>} Success status
747
- */
748
- async storeProjectSummary(projectPath, projectSummary) {
749
- try {
750
- const resolvedProjectPath = path.resolve(projectPath);
751
- const projectName = path.basename(resolvedProjectPath);
752
-
753
- // Get database connection
754
- const db = await this.getDBConnection();
755
- const tableNames = await db.tableNames();
756
-
757
- // Create project summaries table if it doesn't exist
758
- if (!tableNames.includes(PROJECT_SUMMARIES_TABLE)) {
759
- await this._createProjectSummariesTable(db);
760
- }
761
-
762
- const table = await db.openTable(PROJECT_SUMMARIES_TABLE);
763
-
764
- // Prepare the record
765
- const record = {
766
- id: `project_summary_${projectName}_${Date.now()}`,
767
- project_path: resolvedProjectPath,
768
- project_name: projectName,
769
- summary: JSON.stringify(projectSummary),
770
- created_at: new Date().toISOString(),
771
- last_updated: new Date().toISOString(),
772
- };
773
-
774
- // Remove any existing summary for this project first
775
- try {
776
- const existingRecords = await table
777
- .query()
778
- .where(`project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`)
779
- .toArray();
780
-
781
- for (const existing of existingRecords) {
782
- await table.delete(`id = '${existing.id.replace(/'/g, "''")}'`);
783
- }
784
- } catch {
785
- // Continue if no existing records found
786
- }
787
-
788
- // Add the new record
789
- await table.add([record]);
790
-
791
- // Optimize table to sync indices with data (conditional due to legacy index issues)
792
- try {
793
- await table.optimize();
794
- console.log(chalk.blue(`✓ Project summaries table optimized`));
795
- } catch (optimizeError) {
796
- if (optimizeError.message && optimizeError.message.includes('legacy format')) {
797
- console.warn(chalk.yellow(`Skipping optimization due to legacy index format - will be auto-upgraded during normal operations`));
798
- } else {
799
- console.warn(chalk.yellow(`Warning: Failed to optimize project summaries table: ${optimizeError.message}`));
800
- }
801
- }
802
-
803
- console.log(chalk.green(`✅ Project summary stored for: ${resolvedProjectPath}`));
804
- return true;
805
- } catch (error) {
806
- console.error(chalk.red(`Error storing project summary: ${error.message}`));
807
- throw createDatabaseError(`Failed to store project summary: ${error.message}`, ERROR_CODES.STORAGE_ERROR, error);
808
- }
809
- }
810
-
811
- /**
812
- * Get stored project summary from database
813
- * @param {string} projectPath - Project path
814
- * @returns {Promise<Object|null>} Project summary or null if not found
815
- */
816
- async getProjectSummary(projectPath) {
817
- try {
818
- const resolvedProjectPath = path.resolve(projectPath);
819
-
820
- // Get database connection
821
- const db = await this.getDBConnection();
822
- const tableNames = await db.tableNames();
823
-
824
- // Check if table exists
825
- if (!tableNames.includes(PROJECT_SUMMARIES_TABLE)) {
826
- return null;
827
- }
828
-
829
- const table = await db.openTable(PROJECT_SUMMARIES_TABLE);
830
-
831
- // Query for the project summary
832
- const records = await table
833
- .query()
834
- .where(`project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`)
835
- .toArray();
836
-
837
- if (records.length === 0) {
838
- return null;
839
- }
840
-
841
- // Get the most recent record (in case there are duplicates)
842
- const latestRecord = records.sort((a, b) => new Date(b.last_updated).getTime() - new Date(a.last_updated).getTime())[0];
843
-
844
- // Parse and return the summary
845
- const summary = JSON.parse(latestRecord.summary);
846
- summary._metadata = {
847
- created_at: latestRecord.created_at,
848
- last_updated: latestRecord.last_updated,
849
- project_name: latestRecord.project_name,
850
- };
851
-
852
- return summary;
853
- } catch (error) {
854
- console.error(chalk.red(`Error retrieving project summary: ${error.message}`));
855
- return null; // Return null instead of throwing to allow graceful fallback
856
- }
857
- }
858
-
859
- /**
860
- * Optimize tables after cleanup operations
861
- * @param {LanceDBConnection} db - Database connection
862
- * @param {Array<string>} availableTableNames - Available table names
863
- * @private
864
- */
865
- async _optimizeTablesAfterCleanup(db, availableTableNames) {
866
- console.log(chalk.blue('Optimizing tables after cleanup...'));
867
-
868
- const tablesToOptimize = [
869
- { name: this.fileEmbeddingsTable, displayName: 'File embeddings' },
870
- { name: this.documentChunkTable, displayName: 'Document chunks' },
871
- { name: this.prCommentsTable, displayName: 'PR comments' },
872
- { name: PROJECT_SUMMARIES_TABLE, displayName: 'Project summaries' },
873
- ];
874
-
875
- for (const { name, displayName } of tablesToOptimize) {
876
- if (availableTableNames.includes(name)) {
877
- try {
878
- const table = await db.openTable(name);
879
- console.log(chalk.blue(`Optimizing ${displayName} table...`));
880
- await table.optimize();
881
- console.log(chalk.green(`✓ ${displayName} table optimized`));
882
- } catch (error) {
883
- if (error.message && error.message.includes('legacy format')) {
884
- console.warn(
885
- chalk.yellow(
886
- `Skipping optimization for ${displayName} due to legacy index format - will be auto-upgraded during normal operations`
887
- )
888
- );
889
- } else {
890
- console.warn(chalk.yellow(`Warning: Failed to optimize ${displayName} table: ${error.message}`));
891
- }
892
- }
893
- }
894
- }
895
-
896
- console.log(chalk.green('Post-cleanup table optimization complete'));
897
- }
898
-
899
- /**
900
- * Create project summaries table
901
- * @param {LanceDBConnection} db - Database connection
902
- * @returns {Promise<void>}
903
- * @private
904
- */
905
- async _createProjectSummariesTable(db) {
906
- console.log(chalk.blue('Creating project summaries table...'));
907
-
908
- const schema = new Schema([
909
- new Field('id', new Utf8()),
910
- new Field('project_path', new Utf8()),
911
- new Field('project_name', new Utf8()),
912
- new Field('summary', new Utf8()),
913
- new Field('created_at', new Utf8()),
914
- new Field('last_updated', new Utf8()),
915
- ]);
916
-
917
- // Create table with empty initial data
918
- await db.createEmptyTable(PROJECT_SUMMARIES_TABLE, schema);
919
- console.log(chalk.green(`✅ Project summaries table created: ${PROJECT_SUMMARIES_TABLE}`));
920
- }
921
- }