codecritique 1.0.0

Files changed (40)
  1. package/LICENSE +21 -0
  2. package/README.md +1145 -0
  3. package/package.json +98 -0
  4. package/src/content-retrieval.js +747 -0
  5. package/src/custom-documents.js +597 -0
  6. package/src/embeddings/cache-manager.js +364 -0
  7. package/src/embeddings/constants.js +40 -0
  8. package/src/embeddings/database.js +921 -0
  9. package/src/embeddings/errors.js +208 -0
  10. package/src/embeddings/factory.js +447 -0
  11. package/src/embeddings/file-processor.js +851 -0
  12. package/src/embeddings/model-manager.js +337 -0
  13. package/src/embeddings/similarity-calculator.js +97 -0
  14. package/src/embeddings/types.js +113 -0
  15. package/src/feedback-loader.js +384 -0
  16. package/src/index.js +1418 -0
  17. package/src/llm.js +123 -0
  18. package/src/pr-history/analyzer.js +579 -0
  19. package/src/pr-history/bot-detector.js +123 -0
  20. package/src/pr-history/cli-utils.js +204 -0
  21. package/src/pr-history/comment-processor.js +549 -0
  22. package/src/pr-history/database.js +819 -0
  23. package/src/pr-history/github-client.js +629 -0
  24. package/src/project-analyzer.js +955 -0
  25. package/src/rag-analyzer.js +2764 -0
  26. package/src/rag-review.js +566 -0
  27. package/src/technology-keywords.json +753 -0
  28. package/src/utils/command.js +48 -0
  29. package/src/utils/constants.js +263 -0
  30. package/src/utils/context-inference.js +364 -0
  31. package/src/utils/document-detection.js +105 -0
  32. package/src/utils/file-validation.js +271 -0
  33. package/src/utils/git.js +232 -0
  34. package/src/utils/language-detection.js +170 -0
  35. package/src/utils/logging.js +24 -0
  36. package/src/utils/markdown.js +132 -0
  37. package/src/utils/mobilebert-tokenizer.js +141 -0
  38. package/src/utils/pr-chunking.js +276 -0
  39. package/src/utils/string-utils.js +28 -0
  40. package/src/zero-shot-classifier-open.js +392 -0
@@ -0,0 +1,819 @@
+ /**
+  * PR History Database Integration
+  *
+  * This module provides PR comment storage and retrieval functionality
+  * by reusing the shared database infrastructure from the embeddings modules.
+  * Database connections, table management, and indexing are all handled there.
+  */
+
+ import path from 'node:path';
+ import { pipeline } from '@huggingface/transformers';
+ import chalk from 'chalk';
+ import stopwords from 'stopwords-iso/stopwords-iso.json' with { type: 'json' };
+ import { EMBEDDING_DIMENSIONS, TABLE_NAMES } from '../embeddings/constants.js';
+ import { getDefaultEmbeddingsSystem } from '../embeddings/factory.js';
+ import { truncateToTokenLimit, cleanupTokenizer } from '../utils/mobilebert-tokenizer.js';
+
+ // Create embeddings system instance
+ const embeddingsSystem = getDefaultEmbeddingsSystem();
+
+ // Reuse table name constants from the embeddings layer to avoid duplication
+ const { PR_COMMENTS } = TABLE_NAMES;
+ const PR_COMMENTS_TABLE = PR_COMMENTS;
+
+ /**
+  * Store multiple PR comments in batch
+  * @param {Array<Object>} commentsData - Array of processed comment data
+  * @param {string} projectPath - Project path for isolation (optional, defaults to cwd)
+  * @returns {Promise<number>} Number of successfully stored comments
+  */
+ export async function storePRCommentsBatch(commentsData, projectPath = process.cwd()) {
+   if (!Array.isArray(commentsData) || commentsData.length === 0) {
+     return 0;
+   }
+
+   let successCount = 0;
+   const batchSize = 100;
+   const resolvedProjectPath = path.resolve(projectPath);
+
+   try {
+     const table = await embeddingsSystem.getPRCommentsTable();
+
+     if (!table) {
+       throw new Error(`Table ${PR_COMMENTS_TABLE} not found`);
+     }
+
+     for (let i = 0; i < commentsData.length; i += batchSize) {
+       const batch = commentsData.slice(i, i + batchSize);
+       const validRecords = [];
+
+       for (const commentData of batch) {
+         try {
+           // Validate and prepare record
+           if (!commentData.id || !commentData.comment_text || !commentData.comment_embedding) {
+             console.warn(chalk.yellow(`Skipping comment with missing required fields: ${commentData.id || 'unknown'}`));
+             continue;
+           }
+
+           if (commentData.comment_embedding.length !== EMBEDDING_DIMENSIONS) {
+             console.warn(chalk.yellow(`Skipping comment with invalid embedding dimensions: ${commentData.id}`));
+             continue;
+           }
+
+           const record = {
+             id: commentData.id,
+             pr_number: commentData.pr_number || 0,
+             repository: commentData.repository || '',
+             project_path: resolvedProjectPath,
+             comment_type: commentData.comment_type || 'issue',
+             comment_text: commentData.comment_text,
+             comment_embedding: commentData.comment_embedding,
+
+             file_path: commentData.file_path || null,
+             line_number: commentData.line_number || null,
+             line_range_start: commentData.line_range_start || null,
+             line_range_end: commentData.line_range_end || null,
+             original_code: commentData.original_code || null,
+             suggested_code: commentData.suggested_code || null,
+             diff_hunk: commentData.diff_hunk || null,
+
+             code_embedding: commentData.code_embedding || null,
+             combined_embedding: commentData.combined_embedding || commentData.comment_embedding,
+
+             author: commentData.author || 'unknown',
+             created_at: commentData.created_at || new Date().toISOString(),
+             updated_at: commentData.updated_at || null,
+             review_id: commentData.review_id || null,
+             review_state: commentData.review_state || null,
+
+             issue_category: commentData.issue_category || 'general',
+             severity: commentData.severity || 'minor',
+             pattern_tags: JSON.stringify(commentData.pattern_tags || []),
+           };
+
+           validRecords.push(record);
+         } catch (recordError) {
+           console.warn(chalk.yellow(`Error preparing record for ${commentData.id}: ${recordError.message}`));
+         }
+       }
+
+       if (validRecords.length > 0) {
+         try {
+           await table.add(validRecords);
+           successCount += validRecords.length;
+
+           // Optimize table to sync indices with data and prevent TakeExec panics
+           try {
+             await table.optimize();
+           } catch (optimizeError) {
+             if (optimizeError.message && optimizeError.message.includes('legacy format')) {
+               console.log(
+                 chalk.yellow(`Skipping optimization due to legacy index format - will be auto-upgraded during normal operations`)
+               );
+             } else {
+               console.warn(chalk.yellow(`Warning: Failed to optimize PR comments table after adding records: ${optimizeError.message}`));
+             }
+           }
+
+           console.log(chalk.green(`Stored batch of ${validRecords.length} PR comments`));
+         } catch (batchError) {
+           console.error(chalk.red(`Error storing batch: ${batchError.message}`));
+         }
+       }
+     }
+     if (successCount > 0) {
+       await embeddingsSystem.updatePRCommentsIndex();
+     }
+   } catch (error) {
+     console.error(chalk.red(`Error in batch storage: ${error.message}`));
+   }
+
+   return successCount;
+ }
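+
+ // Usage sketch (illustrative only): the id and field values below are
+ // hypothetical, and `embedding` stands for a vector of length
+ // EMBEDDING_DIMENSIONS produced by the embeddings system.
+ //
+ //   const stored = await storePRCommentsBatch(
+ //     [{
+ //       id: 'owner/repo#42:rc-1',
+ //       pr_number: 42,
+ //       repository: 'owner/repo',
+ //       comment_type: 'review',
+ //       comment_text: 'Consider handling the null case here.',
+ //       comment_embedding: embedding,
+ //     }],
+ //     process.cwd()
+ //   );
+ //   console.log(`${stored} comment(s) stored`);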
+
+ /**
+  * Get statistics about stored PR comments
+  * @param {string} repository - Repository to get stats for (optional)
+  * @param {string} projectPath - Project path for filtering (optional, defaults to cwd)
+  * @returns {Promise<Object>} Statistics object
+  */
+ export async function getPRCommentsStats(repository = null, projectPath = process.cwd()) {
+   try {
+     const table = await embeddingsSystem.getPRCommentsTable();
+
+     const defaultStats = {
+       total_comments: 0,
+       comment_types: {},
+       issue_categories: {},
+       severity_levels: {},
+       authors: {},
+       repositories: {},
+     };
+
+     if (!table) {
+       console.log(chalk.yellow('PR comments table not found, returning empty stats'));
+       return defaultStats;
+     }
+
+     const resolvedProjectPath = path.resolve(projectPath);
+
+     const filters = [`project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`];
+     if (repository) {
+       filters.push(`repository = '${repository.replace(/'/g, "''")}'`);
+     }
+
+     const whereClause = filters.join(' AND ');
+     console.log(chalk.blue(`Getting stats with filter: ${whereClause}`));
+
+     let totalCount = 0;
+     try {
+       totalCount = await table.countRows(whereClause);
+       console.log(chalk.blue(`Found ${totalCount} total comments matching filter`));
+     } catch (countError) {
+       console.warn(chalk.yellow(`Error counting rows: ${countError.message}, trying without filter`));
+       totalCount = await table.countRows();
+       console.log(chalk.blue(`Found ${totalCount} total comments in table`));
+     }
+
+     let results = [];
+     if (totalCount > 0) {
+       try {
+         // Use query() instead of search() for non-vector queries
+         results = await table.query().where(whereClause).limit(10000).toArray();
+         console.log(chalk.blue(`Retrieved ${results.length} comments for analysis`));
+       } catch (queryError) {
+         console.warn(chalk.yellow(`Error with filtered query: ${queryError.message}, trying without filter`));
+         try {
+           // Try getting all records and filter manually
+           results = await table.query().limit(10000).toArray();
+           // Filter results manually if database query failed
+           if (repository) {
+             results = results.filter((r) => r.repository === repository && r.project_path === resolvedProjectPath);
+           } else {
+             results = results.filter((r) => r.project_path === resolvedProjectPath);
+           }
+           console.log(chalk.blue(`Retrieved and filtered ${results.length} comments for analysis`));
+         } catch (fallbackError) {
+           console.error(chalk.red(`Fallback query also failed: ${fallbackError.message}`));
+           results = [];
+         }
+       }
+     }
+
+     const stats = {
+       total_comments: results.length,
+       totalComments: results.length, // Add field expected by index.js
+       comment_types: {},
+       issue_categories: {},
+       severity_levels: {},
+       authors: {},
+       repositories: {},
+     };
+
+     // Calculate additional fields expected by index.js
+     const uniquePRs = new Set();
+     let earliestDate = null;
+     let latestDate = null;
+
+     if (Array.isArray(results) && results.length > 0) {
+       for (const comment of results) {
+         // Safely handle potentially undefined fields
+         const commentType = comment.comment_type || 'unknown';
+         const issueCategory = comment.issue_category || 'general';
+         const severity = comment.severity || 'minor';
+         const author = comment.author || 'unknown';
+         const repo = comment.repository || 'unknown';
+
+         stats.comment_types[commentType] = (stats.comment_types[commentType] || 0) + 1;
+         stats.issue_categories[issueCategory] = (stats.issue_categories[issueCategory] || 0) + 1;
+         stats.severity_levels[severity] = (stats.severity_levels[severity] || 0) + 1;
+         stats.authors[author] = (stats.authors[author] || 0) + 1;
+         stats.repositories[repo] = (stats.repositories[repo] || 0) + 1;
+
+         // Track unique PRs
+         if (comment.pr_number) {
+           uniquePRs.add(comment.pr_number);
+         }
+
+         // Track date range
+         if (comment.created_at) {
+           const commentDate = new Date(comment.created_at);
+           if (!earliestDate || commentDate < earliestDate) {
+             earliestDate = commentDate;
+           }
+           if (!latestDate || commentDate > latestDate) {
+             latestDate = commentDate;
+           }
+         }
+       }
+     }
+
+     // Add fields expected by index.js clear command
+     stats.totalPRs = uniquePRs.size;
+     stats.uniqueAuthors = Object.keys(stats.authors).length;
+     stats.dateRange = {
+       earliest: earliestDate ? earliestDate.toISOString().split('T')[0] : 'N/A',
+       latest: latestDate ? latestDate.toISOString().split('T')[0] : 'N/A',
+     };
+
+     console.log(chalk.green(`Stats generated: ${stats.totalComments} comments, ${stats.totalPRs} PRs, ${stats.uniqueAuthors} authors`));
+     return stats;
+   } catch (error) {
+     console.error(chalk.red(`Error getting PR comments stats: ${error.message}`));
+     console.error(chalk.red(`Stack trace: ${error.stack}`));
+     return {
+       total_comments: 0,
+       comment_types: {},
+       issue_categories: {},
+       severity_levels: {},
+       authors: {},
+       repositories: {},
+     };
+   }
+ }
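+
+ // Shape of the resolved stats object (all counts illustrative):
+ //
+ //   {
+ //     total_comments: 128,
+ //     totalComments: 128,
+ //     comment_types: { review: 100, issue: 28 },
+ //     issue_categories: { general: 90, performance: 38 },
+ //     severity_levels: { minor: 110, major: 18 },
+ //     authors: { alice: 70, bob: 58 },
+ //     repositories: { 'owner/repo': 128 },
+ //     totalPRs: 23,
+ //     uniqueAuthors: 2,
+ //     dateRange: { earliest: '2024-01-05', latest: '2024-06-30' },
+ //   }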
+
+ /**
+  * Get the date range of processed PRs for a repository
+  * @param {string} repository - Repository in format "owner/repo"
+  * @param {string} projectPath - Project path for filtering (optional, defaults to cwd)
+  * @returns {Promise<{oldestPR: string|null, newestPR: string|null}>} Date range of processed PRs
+  */
+ export async function getProcessedPRDateRange(repository, projectPath = process.cwd()) {
+   try {
+     const table = await embeddingsSystem.getPRCommentsTable();
+
+     if (!table) {
+       return { oldestPR: null, newestPR: null };
+     }
+
+     const resolvedProjectPath = path.resolve(projectPath);
+     const whereClause = `repository = '${repository.replace(/'/g, "''")}' AND project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`;
+
+     // Get all unique PR numbers and their creation dates
+     const results = await table.query().where(whereClause).limit(10000).toArray();
+
+     if (results.length === 0) {
+       return { oldestPR: null, newestPR: null };
+     }
+
+     // Extract unique PRs with their dates
+     const prDates = new Map();
+     results.forEach((comment) => {
+       if (comment.pr_number && comment.created_at) {
+         const prNumber = comment.pr_number;
+         const commentDate = new Date(comment.created_at);
+
+         if (!prDates.has(prNumber) || commentDate < prDates.get(prNumber)) {
+           prDates.set(prNumber, commentDate);
+         }
+       }
+     });
+
+     if (prDates.size === 0) {
+       return { oldestPR: null, newestPR: null };
+     }
+
+     const dates = Array.from(prDates.values()).sort((a, b) => a - b);
+     const oldestPR = dates[0].toISOString();
+     const newestPR = dates[dates.length - 1].toISOString();
+
+     console.log(chalk.blue(`Processed PR date range: ${oldestPR} to ${newestPR} (${prDates.size} PRs)`));
+     return { oldestPR, newestPR };
+   } catch (error) {
+     console.error(chalk.red(`Error getting processed PR date range: ${error.message}`));
+     return { oldestPR: null, newestPR: null };
+   }
+ }
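+
+ // Resolves to ISO strings when history exists, e.g. (illustrative values):
+ //   { oldestPR: '2024-01-05T09:30:00.000Z', newestPR: '2024-06-30T17:12:00.000Z' }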
+
+ /**
+  * Check if a PR should be skipped based on processed date range
+  * @param {Object} pr - PR object with merged_at or created_at date
+  * @param {string} oldestPR - Oldest processed PR date (ISO string)
+  * @param {string} newestPR - Newest processed PR date (ISO string)
+  * @returns {boolean} True if PR should be skipped
+  */
+ export function shouldSkipPR(pr, oldestPR, newestPR) {
+   if (!oldestPR || !newestPR || !pr) {
+     return false;
+   }
+
+   const prDate = new Date(pr.merged_at || pr.created_at || pr.updated_at);
+   const oldestDate = new Date(oldestPR);
+   const newestDate = new Date(newestPR);
+
+   // Skip if PR date falls within the already processed range
+   return prDate >= oldestDate && prDate <= newestDate;
+ }
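+
+ // Worked example (hypothetical dates): with a processed range of 2024-01-01
+ // through 2024-03-01, a PR merged inside that range is skipped and one merged
+ // after it is not.
+ //
+ //   const oldest = '2024-01-01T00:00:00Z';
+ //   const newest = '2024-03-01T00:00:00Z';
+ //   shouldSkipPR({ merged_at: '2024-02-15T12:00:00Z' }, oldest, newest); // true
+ //   shouldSkipPR({ merged_at: '2024-03-02T12:00:00Z' }, oldest, newest); // false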
+
+ /**
+  * Clear all PR comments for a repository
+  * @param {string} repository - Repository in format "owner/repo"
+  * @param {string} projectPath - Project path for filtering (optional, defaults to cwd)
+  * @returns {Promise<number>} Number of deleted comments
+  */
+ export async function clearPRComments(repository, projectPath = process.cwd()) {
+   try {
+     const table = await embeddingsSystem.getPRCommentsTable();
+
+     if (!table) {
+       return 0;
+     }
+
+     const resolvedProjectPath = path.resolve(projectPath);
+     const deleteQuery = `repository = '${repository.replace(/'/g, "''")}' AND project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`;
+     const countBefore = await table.countRows(deleteQuery);
+
+     await table.delete(deleteQuery);
+
+     console.log(chalk.yellow(`Cleared ${countBefore} PR comments for repository ${repository}`));
+     return countBefore;
+   } catch (error) {
+     console.error(chalk.red(`Error clearing PR comments: ${error.message}`));
+     return 0;
+   }
+ }
+
+ /**
+  * Check if PR comments exist for a repository
+  * @param {string} repository - Repository in format "owner/repo"
+  * @param {string} projectPath - Project path for filtering (optional, if null checks all projects)
+  * @returns {Promise<boolean>} True if comments exist
+  */
+ export async function hasPRComments(repository, projectPath = process.cwd()) {
+   try {
+     const table = await embeddingsSystem.getPRCommentsTable();
+
+     if (!table) {
+       return false;
+     }
+
+     let whereClause = `repository = '${repository.replace(/'/g, "''")}'`;
+
+     if (projectPath !== null) {
+       const resolvedProjectPath = path.resolve(projectPath);
+       whereClause += ` AND project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`;
+     }
+
+     const count = await table.countRows(whereClause);
+     return count > 0;
+   } catch (error) {
+     console.error(chalk.red(`Error checking PR comments existence: ${error.message}`));
+     return false;
+   }
+ }
+
+ /**
+  * Get the timestamp of the last analysis for incremental updates
+  * @param {string} repository - Repository in format "owner/repo"
+  * @param {string} projectPath - Project path for filtering
+  * @returns {Promise<string|null>} ISO timestamp or null if no previous analysis
+  */
+ export async function getLastAnalysisTimestamp(repository, projectPath) {
+   try {
+     const table = await embeddingsSystem.getPRCommentsTable();
+
+     if (!table) {
+       return null;
+     }
+
+     const resolvedProjectPath = path.resolve(projectPath);
+
+     const filters = [`repository = '${repository.replace(/'/g, "''")}'`, `project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`];
+
+     const results = await table
+       .search()
+       .where(filters.join(' AND '))
+       .limit(1)
+       .select(['created_at'])
+       .orderBy([{ column: 'created_at', order: 'desc' }])
+       .toArray();
+
+     if (results.length > 0) {
+       return results[0].created_at;
+     }
+
+     return null;
+   } catch (error) {
+     console.error(chalk.red(`Error getting last analysis timestamp: ${error.message}`));
+     return null;
+   }
+ }
+
+ // ============================================================================
+ // HYBRID SEARCH IMPLEMENTATION BASED ON RESEARCH SAMPLE
+ // ============================================================================
+
+ // Configuration based on research sample
+ const HYBRID_SEARCH_CONFIG = {
+   CHUNK_SIZE: 20,
+   CHUNK_OVERLAP: 5,
+   SEARCH_LIMIT: 1, // We only need the single best chunk match for each historical comment
+   SIMILARITY_THRESHOLD: 0.4, // this is actually a distance, where 0 is an exact match
+   LLM_BATCH_SIZE: 10,
+ };
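+
+ // With CHUNK_SIZE = 20 and CHUNK_OVERLAP = 5 the chunking step is
+ // 20 - 5 = 15 lines, so a 100-line file yields chunks starting at lines
+ // 1, 16, 31, 46, 61, 76 and 91 (the last chunk is clipped to the end of the
+ // file). SIMILARITY_THRESHOLD is compared against the vector distance
+ // returned by the search, so a lower value means a stricter match.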
+
+ /**
+  * Creates overlapping chunks of code from a source file (based on research sample)
+  * @param {string} codeContent - The full string content of the code file
+  * @param {number} chunkSize - The number of lines per chunk
+  * @param {number} overlap - The number of lines to overlap between consecutive chunks
+  * @returns {Array<{code: string, startLine: number, endLine: number}>} An array of code chunks
+  */
+ function createCodeChunks(codeContent, chunkSize = HYBRID_SEARCH_CONFIG.CHUNK_SIZE, overlap = HYBRID_SEARCH_CONFIG.CHUNK_OVERLAP) {
+   const lines = codeContent.split(/\r?\n/);
+   const chunks = [];
+   const step = chunkSize - overlap;
+
+   for (let i = 0; i < lines.length; i += step) {
+     const end = Math.min(i + chunkSize, lines.length);
+     const chunkLines = lines.slice(i, end);
+
+     if (chunkLines.join('').trim() !== '') {
+       chunks.push({
+         code: chunkLines.join('\n'),
+         startLine: i + 1,
+         endLine: end,
+       });
+     }
+     if (end === lines.length) break;
+   }
+   return chunks;
+ }
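+
+ // Example (illustrative): a 25-line file with non-empty lines produces two
+ // overlapping chunks, lines 1-20 and lines 16-25:
+ //
+ //   const chunks = createCodeChunks(twentyFiveLineSource); // hypothetical input
+ //   // chunks[0] => { startLine: 1, endLine: 20, code: '...' }
+ //   // chunks[1] => { startLine: 16, endLine: 25, code: '...' }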
+
+ // Classifier is initialized lazily on first use to avoid heavy startup for non-PR tasks
+ let classifier = null;
+ let isInitializingClassifier = false;
+ let classifierInitializationPromise = null;
+
+ async function getClassifier() {
+   // If already initialized, return immediately
+   if (classifier) return classifier;
+
+   // If currently initializing, wait for the existing initialization
+   if (isInitializingClassifier && classifierInitializationPromise) {
+     return await classifierInitializationPromise;
+   }
+
+   // Start initialization
+   isInitializingClassifier = true;
+   classifierInitializationPromise = _initializeClassifier();
+
+   try {
+     classifier = await classifierInitializationPromise;
+     return classifier;
+   } finally {
+     isInitializingClassifier = false;
+     classifierInitializationPromise = null;
+   }
+ }
+
+ async function _initializeClassifier() {
+   try {
+     console.log(chalk.blue('Initializing MobileBERT classifier...'));
+     const cls = await pipeline('zero-shot-classification', 'Xenova/mobilebert-uncased-mnli', {
+       quantized: true,
+       dtype: 'fp32',
+       device: 'cpu',
+     });
+     console.log(chalk.green('✓ Local MobileBERT classifier initialized successfully'));
+     return cls;
+   } catch {
+     console.warn(chalk.yellow('⚠ Failed to initialize MobileBERT, trying fallback model...'));
+     try {
+       const cls = await pipeline('zero-shot-classification', 'Xenova/distilbert-base-uncased-mnli', {
+         quantized: true,
+         dtype: 'fp32',
+         device: 'cpu',
+       });
+       console.log(chalk.green('✓ Local DistilBERT classifier initialized successfully (fallback)'));
+       return cls;
+     } catch (fallbackError) {
+       console.warn(chalk.yellow('⚠ Failed to initialize any local classifier:'), fallbackError.message);
+       return null;
+     }
+   }
+ }
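+
+ // For reference, a zero-shot classification call returns an object per input
+ // with parallel `labels` and `scores` arrays (scores here are illustrative):
+ //
+ //   const out = await cls('Comment: ... Code: ...', ['relevant issue', 'irrelevant']);
+ //   // => { sequence: '...', labels: ['relevant issue', 'irrelevant'], scores: [0.82, 0.18] }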
+
+ /**
+  * Clean up the classifier and tokenizer resources to prevent hanging
+  */
+ export async function cleanupClassifier() {
+   if (classifier) {
+     try {
+       await classifier.dispose();
+       classifier = null;
+       console.log(chalk.green('✓ Local classifier resources cleaned up'));
+     } catch (error) {
+       console.warn(chalk.yellow('⚠ Error cleaning up classifier:'), error.message);
+       classifier = null;
+     }
+   }
+
+   // Clean up shared tokenizer
+   await cleanupTokenizer();
+
+   // Force garbage collection if available
+   if (global.gc) {
+     global.gc();
+   }
+ }
+
+ /**
+  * A faster, local alternative to the full LLM verification that processes candidates in batches.
+  * @param {Array<object>} candidates - An array of candidate objects to verify. Each object should have
+  * `comment_text` and a `matchedChunk` with `code`.
+  * @returns {Promise<Array<object>>} - An array of the candidates that were verified as relevant.
+  */
+ async function verifyLocally(candidates) {
+   if (!candidates || candidates.length === 0) {
+     return [];
+   }
+
+   // Lazily initialize the classifier if it is not available yet
+   if (!classifier) {
+     classifier = await getClassifier();
+   }
+   if (!classifier) {
+     console.warn(chalk.yellow('Local classifier not available, assuming all candidates relevant'));
+     return candidates;
+   }
+
+   // MobileBERT has a max sequence length of 512 tokens.
+   // Use exact token counting to stay well under the limit.
+   const maxTokensPerContext = 450; // Conservative limit to avoid ONNX dimension issues
+
+   // 1. Create an array of text contexts for the entire batch.
+   const contexts = await Promise.all(
+     candidates.map(async (candidate) => {
+       // Clean and normalize the text inputs
+       const commentText = (candidate.comment_text || '').trim().replace(/\s+/g, ' ');
+       const codeText = (candidate.matchedChunk.code || '').trim().replace(/\s+/g, ' ');
+
+       // Smart truncation: prioritize the beginning and key parts of the comment
+       let selectedCommentText = commentText;
+       if (commentText.length > 500) {
+         const firstPart = commentText.substring(0, 300);
+         const lastPart = commentText.substring(commentText.length - 100);
+         // Check if the last part contains important keywords
+         if (lastPart.match(/\b(fix|bug|issue|error|problem|solution|should|recommend)\b/i)) {
+           selectedCommentText = firstPart + '... ' + lastPart;
+         } else {
+           selectedCommentText = commentText.substring(0, 400);
+         }
+       }
+
+       // For code, prioritize the beginning as it usually contains the most context
+       let selectedCodeText = codeText;
+       if (codeText.length > 400) {
+         selectedCodeText = codeText.substring(0, 400) + '...';
+       }
+
+       // Create the context string
+       const problemContext = `Comment: ${selectedCommentText} Code: ${selectedCodeText}`;
+
+       // Use exact token counting to truncate properly
+       const finalContext = await truncateToTokenLimit(problemContext, maxTokensPerContext);
+       return finalContext;
+     })
+   );
+
+   const candidateLabels = ['relevant issue', 'irrelevant'];
+   const relevanceThreshold = 0.75; // Tune this value (75% confidence)
+   const verifiedCandidates = [];
+
+   try {
+     // 2. Make a SINGLE call to the classifier with the entire batch of contexts.
+     // The pipeline will return an array of results, one for each context.
+     const outputs = await classifier(contexts, candidateLabels);
+
+     // 3. Process the batch of results.
+     outputs.forEach((output, index) => {
+       const relevanceScore = output.scores[output.labels.indexOf('relevant issue')];
+
+       if (relevanceScore > relevanceThreshold) {
+         verifiedCandidates.push(candidates[index]);
+       }
+     });
+
+     return verifiedCandidates;
+   } catch (error) {
+     // Check if it's the specific ONNX broadcasting error or a token limit overrun
+     if (error.message && (error.message.includes('BroadcastIterator') || error.message.includes('Non-zero status code'))) {
+       console.warn(chalk.yellow(`Local batch verification skipped due to token/tensor dimension issues. Batch size: ${candidates.length}`));
+       console.warn(chalk.yellow(`Using exact token counting to prevent this issue in the future.`));
+     } else {
+       console.error(chalk.red('Local batch verification failed:'), error.message || error);
+     }
+
+     // Fail open: if the local model fails, assume the whole batch is relevant to avoid discarding good matches.
+     return candidates;
+   }
+ }
+
+ // A fast pre-filtering step to reduce candidates before hitting the LLM.
+ // Use English stopwords from stopwords-iso.
+ const stopWords = new Set(stopwords.en || []);
+ function preFilterWithKeywords(candidate) {
+   const commentText = (candidate.comment_text || '').toLowerCase();
+   const codeText = (candidate.matchedChunk.code || '').toLowerCase();
+
+   // Extract potential keywords from the comment, ignoring common words.
+   const keywords = commentText.split(/[^a-zA-Z0-9_]+/).filter((word) => word.length > 2 && !stopWords.has(word));
+
+   // If there are no good keywords, we can't pre-filter, so let it pass.
+   if (keywords.length === 0) {
+     return true;
+   }
+
+   // Check if at least one of the keywords from the comment appears in the code chunk.
+   return keywords.some((keyword) => codeText.includes(keyword));
+ }
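+
+ // Example (hypothetical candidate): after lowercasing and stopword removal the
+ // comment below yields keywords like 'rename', 'getuserdata' and 'clarity',
+ // and 'getuserdata' appears in the lowercased chunk, so the candidate passes.
+ //
+ //   preFilterWithKeywords({
+ //     comment_text: 'Rename getUserData for clarity',
+ //     matchedChunk: { code: 'const data = getUserData(id);' },
+ //   }); // => true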
+
+ /**
+  * Find relevant PR comments using hybrid search with a chunking strategy
+  * @param {string} reviewFileContent - Content of the review file
+  * @param {Object} options - Search options
+  * @returns {Promise<Array<Object>>} Relevant PR comments with verification
+  */
+ export async function findRelevantPRComments(reviewFileContent, options = {}) {
+   const { limit = 10, projectPath = process.cwd(), isTestFile = false } = options;
+
+   try {
+     console.log(chalk.cyan('🔍 Starting FORWARD Hybrid Search with LLM Verification'));
+
+     if (!reviewFileContent) {
+       console.warn(chalk.yellow('No review file content provided'));
+       return [];
+     }
+
+     // --- Step 1: Create chunks from the file under review ---
+     const codeChunks = createCodeChunks(reviewFileContent);
+     if (codeChunks.length === 0) {
+       console.warn(chalk.yellow('No valid chunks created from review file'));
+       return [];
+     }
+     console.log(chalk.blue(`📝 Created ${codeChunks.length} chunks from the review file.`));
+
+     const chunkEmbeddings = await Promise.all(
+       codeChunks.map(async (chunk) => ({
+         vector: await embeddingsSystem.calculateQueryEmbedding(chunk.code),
+         ...chunk,
+       }))
+     );
+
+     // --- Step 2: Search for relevant historical comments for each chunk ---
+     const mainTable = await embeddingsSystem.getPRCommentsTable();
+     if (!mainTable) throw new Error('Main PR comments table not found.');
+
+     const candidateMatches = new Map();
+
+     // Create project-specific WHERE clause for filtering
+     const resolvedProjectPath = path.resolve(projectPath);
+     const projectWhereClause = `project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`;
+
+     console.log(chalk.blue(`🔒 Project isolation: filtering by project_path = '${resolvedProjectPath}'`));
+
+     const searchPromises = chunkEmbeddings.map((chunk) => {
+       if (!chunk.vector) return Promise.resolve([]);
+       return (
+         mainTable
+           .search(chunk.vector)
+           .column('combined_embedding')
+           .where(projectWhereClause) // Add project-specific filtering
+           .limit(15) // Get 15 potential candidates for each chunk
+           .toArray()
+           // Attach the chunk that was used for the search to each result
+           .then((results) => results.map((res) => ({ ...res, matchedChunk: chunk })))
+       );
+     });
+
+     const allResults = await Promise.all(searchPromises);
+     const flattenedResults = allResults.flat();
+
+     // Deduplicate results, keeping the best match (lowest distance) for each comment
+     for (const historicalComment of flattenedResults) {
+       const commentId = historicalComment.id;
+       const distance = historicalComment._distance;
+
+       if (distance <= HYBRID_SEARCH_CONFIG.SIMILARITY_THRESHOLD) {
+         if (!candidateMatches.has(commentId) || distance < candidateMatches.get(commentId)._distance) {
+           candidateMatches.set(commentId, historicalComment);
+         }
+       }
+     }
+
+     console.log(chalk.blue(`🎯 Found ${candidateMatches.size} unique candidate comments for verification.`));
+
+     // --- Step 3: Keyword pre-filtering ---
+     const preFilteredCandidates = Array.from(candidateMatches.values()).filter(preFilterWithKeywords);
+     console.log(chalk.yellow(`⚡ After keyword pre-filtering, ${preFilteredCandidates.length} candidates remain for LLM verification.`));
+
+     // --- Step 4: LLM Verification ---
+     const candidatesArray = preFilteredCandidates;
+     const batchSize = HYBRID_SEARCH_CONFIG.LLM_BATCH_SIZE;
+     const verifiedComments = [];
+     console.log(chalk.cyan(`🤖 Starting LLM verification of ${candidatesArray.length} candidates...`));
+
+     for (let i = 0; i < candidatesArray.length; i += batchSize) {
+       const batch = candidatesArray.slice(i, i + batchSize);
+       const verifiedBatch = await verifyLocally(batch); // SINGLE batch call
+       verifiedComments.push(...verifiedBatch);
+     }
+     console.log(chalk.green(`✅ LLM verification complete: ${verifiedComments.length}/${candidatesArray.length} comments verified.`));
+
+     // --- Step 5: Filtering and formatting ---
+     let filteredComments = verifiedComments;
+     if (isTestFile) {
+       console.log(chalk.blue('🧪 Applying test file filtering - prioritizing test-related comments'));
+       filteredComments = filteredComments.filter((comment) => {
+         const filePath = comment.file_path || '';
+         const commentText = comment.comment_text || '';
+         return (
+           filePath.includes('.test.') ||
+           filePath.includes('.spec.') ||
+           commentText.toLowerCase().includes('test') ||
+           commentText.toLowerCase().includes('spec')
+         );
+       });
+     } else {
+       console.log(chalk.blue('📝 Applying non-test file filtering - excluding test-specific comments'));
+       filteredComments = filteredComments.filter((comment) => {
+         const filePath = comment.file_path || '';
+         const commentText = comment.comment_text || '';
+         // Only exclude if it's clearly a test file AND has test-specific content
+         return !(filePath.includes('.test.') && (commentText.includes('describe(') || commentText.includes('it(')));
+       });
+     }
+
+     const sortedResults = filteredComments.sort((a, b) => a._distance - b._distance).slice(0, limit);
+
+     const formattedResults = sortedResults.map((res) => ({
+       id: res.id,
+       comment_text: res.comment_text,
+       body: res.comment_text,
+       original_code: res.original_code,
+       suggested_code: res.suggested_code,
+       file_path: res.file_path,
+       line_number: res.line_number,
+       pr_number: res.pr_number,
+       author: res.author,
+       created_at: res.created_at,
+       issue_category: res.issue_category,
+       severity: res.severity,
+       pattern_tags: res.pattern_tags ? JSON.parse(res.pattern_tags) : [],
+       similarity_score: 1 - res._distance,
+       matchedChunk: res.matchedChunk,
+       contentVerified: true,
+     }));
+
+     console.log(chalk.green.bold(`\n🎉 Final results: ${formattedResults.length} relevant comments found.`));
+     return formattedResults;
+   } catch (error) {
+     console.error(chalk.red(`Error in forward hybrid search: ${error.message}`));
+     return [];
+   }
+ }
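+
+ // Usage sketch (hypothetical caller; the file path and options are illustrative):
+ //
+ //   import { readFile } from 'node:fs/promises';
+ //
+ //   const content = await readFile('src/app.js', 'utf8');
+ //   const matches = await findRelevantPRComments(content, {
+ //     limit: 5,
+ //     projectPath: process.cwd(),
+ //     isTestFile: false,
+ //   });
+ //   for (const m of matches) {
+ //     console.log(`PR #${m.pr_number} (${m.similarity_score.toFixed(2)}): ${m.comment_text}`);
+ //   }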