codecritique 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +1145 -0
- package/package.json +98 -0
- package/src/content-retrieval.js +747 -0
- package/src/custom-documents.js +597 -0
- package/src/embeddings/cache-manager.js +364 -0
- package/src/embeddings/constants.js +40 -0
- package/src/embeddings/database.js +921 -0
- package/src/embeddings/errors.js +208 -0
- package/src/embeddings/factory.js +447 -0
- package/src/embeddings/file-processor.js +851 -0
- package/src/embeddings/model-manager.js +337 -0
- package/src/embeddings/similarity-calculator.js +97 -0
- package/src/embeddings/types.js +113 -0
- package/src/feedback-loader.js +384 -0
- package/src/index.js +1418 -0
- package/src/llm.js +123 -0
- package/src/pr-history/analyzer.js +579 -0
- package/src/pr-history/bot-detector.js +123 -0
- package/src/pr-history/cli-utils.js +204 -0
- package/src/pr-history/comment-processor.js +549 -0
- package/src/pr-history/database.js +819 -0
- package/src/pr-history/github-client.js +629 -0
- package/src/project-analyzer.js +955 -0
- package/src/rag-analyzer.js +2764 -0
- package/src/rag-review.js +566 -0
- package/src/technology-keywords.json +753 -0
- package/src/utils/command.js +48 -0
- package/src/utils/constants.js +263 -0
- package/src/utils/context-inference.js +364 -0
- package/src/utils/document-detection.js +105 -0
- package/src/utils/file-validation.js +271 -0
- package/src/utils/git.js +232 -0
- package/src/utils/language-detection.js +170 -0
- package/src/utils/logging.js +24 -0
- package/src/utils/markdown.js +132 -0
- package/src/utils/mobilebert-tokenizer.js +141 -0
- package/src/utils/pr-chunking.js +276 -0
- package/src/utils/string-utils.js +28 -0
- package/src/zero-shot-classifier-open.js +392 -0
package/src/utils/pr-chunking.js
@@ -0,0 +1,276 @@
import chalk from 'chalk';

/**
 * Determines if a PR should be chunked based on estimated token usage
 * @param {Array} prFiles - Array of PR files with diffContent and content
 * @returns {Object} Decision object with shouldChunk flag and estimates
 */
export function shouldChunkPR(prFiles) {
  // IMPORTANT: The holistic PR prompt includes BOTH full file content AND diff content
  // for each file, plus context (code examples, guidelines, PR comments, custom docs)

  // Calculate tokens for diff content
  const diffTokens = prFiles.reduce((sum, file) => {
    return sum + Math.ceil((file.diffContent?.length || 0) / 3);
  }, 0);

  // Calculate tokens for full file content (included in prompt for context awareness)
  const fullContentTokens = prFiles.reduce((sum, file) => {
    return sum + Math.ceil((file.content?.length || 0) / 3);
  }, 0);

  // Total file-related tokens (both diff AND full content are sent)
  const fileTokens = diffTokens + fullContentTokens;

  // Estimate context overhead (code examples, guidelines, PR comments, custom docs, project summary)
  // This is typically 10-30k tokens depending on project size
  const CONTEXT_OVERHEAD_TOKENS = 25000;

  // Total estimated prompt tokens
  const totalEstimatedTokens = fileTokens + CONTEXT_OVERHEAD_TOKENS;

  // Claude's limit is 200k tokens. Leave buffer for response and safety margin.
  // Cap single-review prompts at 100k tokens to stay well below that limit.
  const MAX_SINGLE_REVIEW_TOKENS = 100000;

  const shouldChunk = totalEstimatedTokens > MAX_SINGLE_REVIEW_TOKENS || prFiles.length > 30;

  console.log(
    chalk.gray(
      ` Token breakdown: ${diffTokens} diff + ${fullContentTokens} full content + ${CONTEXT_OVERHEAD_TOKENS} context overhead = ${totalEstimatedTokens} total`
    )
  );

  return {
    shouldChunk,
    estimatedTokens: totalEstimatedTokens,
    diffTokens,
    fullContentTokens,
    contextOverhead: CONTEXT_OVERHEAD_TOKENS,
    recommendedChunks: Math.ceil(totalEstimatedTokens / 35000), // More aggressive chunking
  };
}

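// Illustrative usage (hypothetical input, not part of this module). Each entry
// follows the shape the JSDoc above assumes: filePath, diffContent, and full content.
// With the chars/3 heuristic, 60k chars of diffs (20k tokens) plus 240k chars of
// full content (80k tokens) plus the 25k context overhead estimates to 125k tokens,
// which exceeds MAX_SINGLE_REVIEW_TOKENS and flips shouldChunk to true.
//
// const prFiles = [
//   { filePath: 'src/app.js', diffContent: '+const x = 1;', content: 'const x = 1;' },
// ];
// const { shouldChunk, estimatedTokens, recommendedChunks } = shouldChunkPR(prFiles);
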
/**
 * Chunks PR files into manageable groups based on token limits and logical grouping
 * @param {Array} prFiles - Array of PR files with diffContent and content
 * @param {number} maxTokensPerChunk - Maximum tokens per chunk
 * @returns {Array} Array of chunks with files and metadata
 */
export function chunkPRFiles(prFiles, maxTokensPerChunk = 35000) {
  // Calculate change complexity for each file (works for any language)
  // IMPORTANT: Token estimate must include BOTH diff AND full content since both are sent
  const filesWithMetrics = prFiles.map((file) => ({
    ...file,
    changeSize: calculateChangeSize(file.diffContent),
    fileComplexity: calculateFileComplexity(file),
    // Estimate tokens for BOTH diff content AND full file content (both are included in prompt)
    estimatedTokens: Math.ceil((file.diffContent?.length || 0) / 3) + Math.ceil((file.content?.length || 0) / 3),
  }));

  // Sort by directory + change importance for logical grouping
  const sortedFiles = filesWithMetrics.sort((a, b) => {
    const dirA = getDirectoryDepth(a.filePath);
    const dirB = getDirectoryDepth(b.filePath);

    // Primary: Directory structure (keep related files together)
    if (dirA !== dirB) return dirA.localeCompare(dirB);

    // Secondary: Change importance (larger changes first)
    return b.changeSize - a.changeSize;
  });

  // Chunk files based on token budget
  const chunks = [];
  let currentChunk = [];
  let currentTokens = 0;

  for (const file of sortedFiles) {
    // Start new chunk if adding this file exceeds budget
    if (currentTokens + file.estimatedTokens > maxTokensPerChunk && currentChunk.length > 0) {
      chunks.push({
        files: [...currentChunk],
        totalTokens: currentTokens,
        chunkId: chunks.length + 1,
      });
      currentChunk = [];
      currentTokens = 0;
    }

    currentChunk.push(file);
    currentTokens += file.estimatedTokens;
  }

  // Add final chunk
  if (currentChunk.length > 0) {
    chunks.push({
      files: [...currentChunk],
      totalTokens: currentTokens,
      chunkId: chunks.length + 1,
    });
  }

  return chunks;
}

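// Note on the loop above: a single file whose estimatedTokens already exceeds
// maxTokensPerChunk is never split further; the "start new chunk" branch only
// fires when currentChunk is non-empty, so an oversized file simply becomes its
// own over-budget chunk. Illustrative output shape (values hypothetical):
//
// [
//   { files: [fileA, fileB], totalTokens: 31000, chunkId: 1 },
//   { files: [hugeFile], totalTokens: 52000, chunkId: 2 },
// ]
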
/**
 * Language-agnostic change size calculation
 * @param {string} diffContent - The diff content
 * @returns {number} Total number of additions and deletions
 */
function calculateChangeSize(diffContent) {
  if (!diffContent) return 0;
  const lines = diffContent.split('\n');
  // Skip the '+++' / '---' file-header lines so they are not miscounted as changes
  const additions = lines.filter((line) => line.startsWith('+') && !line.startsWith('+++')).length;
  const deletions = lines.filter((line) => line.startsWith('-') && !line.startsWith('---')).length;
  return additions + deletions;
}

/**
 * Language-agnostic file complexity scoring
 * @param {Object} file - File object with filePath and diffContent
 * @returns {number} Complexity score
 */
function calculateFileComplexity(file) {
  let complexity = 0;

  // File size factor
  complexity += Math.min(file.diffContent ? file.diffContent.length / 1000 : 0, 20);

  // Path-based heuristics (works for any language)
  const path = file.filePath.toLowerCase();
  if (path.includes('/src/') || path.includes('/lib/')) complexity += 10;
  if (path.includes('/test/') || path.includes('/spec/')) complexity += 5;
  if (path.includes('/config/') || path.includes('/settings/')) complexity += 8;
  if (path.includes('/main.') || path.includes('/index.')) complexity += 15;

  // Change type heuristics
  if (file.diffContent) {
    if (file.diffContent.includes('new file mode')) complexity += 12;
    if (file.diffContent.includes('deleted file mode')) complexity += 8;
  }

  return complexity;
}

/**
 * Gets directory path for grouping related files
 * @param {string} filePath - The file path
 * @returns {string} Directory path without filename
 */
function getDirectoryDepth(filePath) {
  return filePath.split('/').slice(0, -1).join('/'); // Directory path without filename
}

/**
 * Combines results from multiple chunk reviews into a single result
 * @param {Array} chunkResults - Array of chunk review results
 * @param {number} totalFiles - Total number of files in the PR
 * @returns {Object} Combined result object
 */
export function combineChunkResults(chunkResults, totalFiles) {
  const combinedResult = {
    success: true,
    results: [],
    prContext: {
      totalFiles: totalFiles,
      chunkedReview: true,
      chunks: chunkResults.length,
    },
  };

  // Combine file-specific results
  chunkResults.forEach((chunkResult, chunkIndex) => {
    if (chunkResult.success && chunkResult.results) {
      chunkResult.results.forEach((fileResult) => {
        // Add chunk context to each result
        const enhancedResult = {
          ...fileResult,
          chunkInfo: {
            chunkNumber: chunkIndex + 1,
            totalChunks: chunkResults.length,
          },
        };
        combinedResult.results.push(enhancedResult);
      });
    }
  });

  // Create combined summary
  combinedResult.combinedSummary = createCombinedSummary(chunkResults);

  // Detect and merge cross-chunk issues
  combinedResult.crossChunkIssues = detectCrossChunkIssues(chunkResults);

  console.log(chalk.green(`✅ Combined results from ${chunkResults.length} chunks: ${combinedResult.results.length} file reviews`));

  return combinedResult;
}

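// End-to-end sketch (reviewChunk is hypothetical; the actual per-chunk review
// call lives elsewhere in this package): review each chunk independently,
// then merge the results.
//
// const chunks = chunkPRFiles(prFiles);
// const chunkResults = await Promise.all(chunks.map((chunk) => reviewChunk(chunk)));
// const combined = combineChunkResults(chunkResults, prFiles.length);
// console.log(combined.combinedSummary);
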
/**
 * Creates a summary from combined chunk results
 * @param {Array} chunkResults - Array of chunk review results
 * @returns {string} Combined summary text
 */
function createCombinedSummary(chunkResults) {
  const totalIssues = chunkResults.reduce((sum, chunk) => {
    if (!chunk.results) return sum;
    return (
      sum +
      chunk.results.reduce((fileSum, file) => {
        return fileSum + (file.results?.issues?.length || 0);
      }, 0)
    );
  }, 0);

  const successfulChunks = chunkResults.filter((c) => c.success).length;

  return `Chunked PR review completed: ${successfulChunks}/${chunkResults.length} chunks processed successfully. Total issues found: ${totalIssues}. Review performed in parallel chunks to optimize token usage.`;
}

/**
 * Detects issues that span across multiple chunks
 * @param {Array} chunkResults - Array of chunk review results
 * @returns {Array} Array of cross-chunk issues
 */
function detectCrossChunkIssues(chunkResults) {
  const crossChunkIssues = [];

  // Simple heuristic: Look for similar issues across chunks that might indicate patterns
  const allIssues = chunkResults.flatMap(
    (chunk) =>
      chunk.results?.flatMap((file) =>
        (file.results?.issues || []).map((issue) => ({
          ...issue,
          chunkId: chunk.chunkId,
          filePath: file.filePath,
        }))
      ) || []
  );

  // Group by issue type and description similarity
  const issueGroups = new Map();
  allIssues.forEach((issue) => {
    const key = `${issue.type}-${issue.description ? issue.description.substring(0, 50) : ''}`;
    if (!issueGroups.has(key)) {
      issueGroups.set(key, []);
    }
    issueGroups.get(key).push(issue);
  });

  // Identify patterns that appear across multiple chunks
  issueGroups.forEach((issues) => {
    const uniqueChunks = new Set(issues.map((i) => i.chunkId));
    if (uniqueChunks.size > 1) {
      crossChunkIssues.push({
        type: 'pattern',
        severity: 'medium',
        description: `Similar issue pattern detected across ${uniqueChunks.size} chunks: ${issues[0].description || 'Pattern issue'}`,
        affectedFiles: issues.map((i) => i.filePath),
        suggestion: `This issue appears in multiple parts of the PR. Consider addressing it consistently across all affected files.`,
      });
    }
  });

  return crossChunkIssues;
}
package/src/utils/string-utils.js
@@ -0,0 +1,28 @@
/**
 * String Utilities Module
 *
 * This module provides utilities for string manipulation, formatting,
 * and text processing operations.
 */

/**
 * Slugify text for use in IDs and URLs
 *
 * @param {string} text - The text to slugify
 * @returns {string} A slugified string safe for use in IDs and URLs
 *
 * @example
 * slugify('Hello World!'); // 'hello-world'
 * slugify('My Component Name'); // 'my-component-name'
 * slugify(' Multiple Spaces '); // 'multiple-spaces'
 */
export function slugify(text) {
  if (!text) return '';
  return text
    .toString()
    .toLowerCase()
    .trim()
    .replace(/\s+/g, '-') // Replace spaces with -
    .replace(/[^\w-]+/g, '') // Remove all non-word chars
    .replace(/--+/g, '-'); // Replace multiple - with single -
}
package/src/zero-shot-classifier-open.js
@@ -0,0 +1,392 @@
/**
 * Open-ended Zero-Shot Classification Module
 *
 * This module provides zero-shot classification without predefined categories,
 * allowing it to detect any technology or framework mentioned in the text.
 */

import { env, pipeline } from '@huggingface/transformers';
import * as linguistLanguages from 'linguist-languages';
import { LRUCache } from 'lru-cache';
import stopwords from 'stopwords-iso/stopwords-iso.json' with { type: 'json' };
import techKeywords from './technology-keywords.json' with { type: 'json' };
import { truncateToTokenLimit } from './utils/mobilebert-tokenizer.js';

// Configure Transformers.js environment
env.allowLocalModels = false;
env.useBrowserCache = false;

/**
 * OpenZeroShotClassifier for unrestricted technology detection
 */
class OpenZeroShotClassifier {
  constructor() {
    this.classifier = null;
    this.initializationPromise = null;
    this.cache = new LRUCache({
      max: 100,
      ttl: 1000 * 60 * 60, // 1 hour TTL
    });
    this.isInitialized = false;

    // Common words to exclude from technology detection
    // Use English stopwords from stopwords-iso
    this.commonWords = new Set(stopwords.en || []);

    // Add additional technical context words that are too generic
    const additionalCommonWords = [
      'system',
      'modern',
      'architecture',
      'stack',
      'features',
      'data',
      'service',
      'tools',
      'runtime',
      'apps',
      'workloads',
      'pipeline',
      'builds',
      'team',
      'interfaces',
      'queries',
      'computing',
      'database',
      'processing',
      'stream',
      'analytics',
      'infrastructure',
      'runs',
      'orchestration',
      'mesh',
      'experimenting',
      'desktop',
      'entire',
      'reproducible',
      'migrating',
      'temporal',
      'distributed',
      'graph',
      'high-performance',
      'real-time',
      'reactive',
      'frontend',
      'instead',
      'legacy',
      'fast',
      'slow',
      'quick',
      'easy',
      'hard',
      'simple',
      'complex',
      'basic',
      'advanced',
      'beginner',
      'intermediate',
      'expert',
      'professional',
    ];

    // Add the additional words to the stopwords set
    additionalCommonWords.forEach((word) => this.commonWords.add(word));

    // Build technology patterns from loaded keywords
    this.techPatterns = this.buildTechPatterns();

    // Build a set of all known technologies for quick lookup
    this.knownTechnologies = this.buildKnownTechnologies();
  }

  /**
   * Initialize the zero-shot classification pipeline (singleton pattern)
   */
  async initialize() {
    // If already initialized, return immediately
    if (this.isInitialized) return;

    // If currently initializing, wait for the existing initialization
    if (this.initializationPromise) {
      return await this.initializationPromise;
    }

    // Start initialization
    this.initializationPromise = this._doInitialize();

    try {
      await this.initializationPromise;
    } finally {
      // Clean up the promise after initialization (success or failure)
      this.initializationPromise = null;
    }
  }

  async _doInitialize() {
    try {
      console.log('Initializing open-ended zero-shot classifier...');

      this.classifier = await pipeline('zero-shot-classification', 'Xenova/mobilebert-uncased-mnli', {
        quantized: true,
      });

      this.isInitialized = true;
      console.log('✓ Open-ended zero-shot classifier initialized successfully');
    } catch (error) {
      console.error('Error initializing classifier:', error);
      this.isInitialized = false;
      throw error;
    }
  }

  /**
   * Build technology patterns from keywords JSON
   */
  buildTechPatterns() {
    const patterns = [
      /\b(\w+\.js)\b/gi, // Matches *.js frameworks
      /\b(\w+\.py)\b/gi, // Matches *.py libraries
      /\b([A-Z](?:[a-z]*[A-Z])*[a-z]*)\b/g, // CamelCase (React, FastAPI)
      /\b([a-z]+(?:-[a-z]+)+)\b/gi, // kebab-case (scikit-learn, styled-components)
    ];

    // Add dynamic patterns from linguist languages
    for (const [, langData] of Object.entries(linguistLanguages)) {
      if (langData.aliases) {
        langData.aliases.forEach((alias) => {
          patterns.push(new RegExp(`\\b${this.escapeRegex(alias)}\\b`, 'gi'));
        });
      }
    }

    return patterns;
  }

  /**
   * Build a set of all known technologies
   */
  buildKnownTechnologies() {
    const techs = new Set();

    // Add all technologies from JSON file
    const addTechsFromObject = (obj) => {
      for (const value of Object.values(obj)) {
        if (Array.isArray(value)) {
          value.forEach((tech) => techs.add(tech.toLowerCase()));
        } else if (typeof value === 'object') {
          addTechsFromObject(value);
        }
      }
    };

    addTechsFromObject(techKeywords);

    // Add languages from linguist
    for (const [langName, langData] of Object.entries(linguistLanguages)) {
      techs.add(langName.toLowerCase());
      if (langData.aliases) {
        langData.aliases.forEach((alias) => techs.add(alias.toLowerCase()));
      }
    }

    return techs;
  }

  /**
   * Escape regex special characters
   */
  escapeRegex(str) {
    return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Extract potential technology candidates from text
   */
  extractTechnologyCandidates(text) {
    const candidates = new Set();

    // Look for known technologies
    for (const tech of this.knownTechnologies) {
      // Create regex for exact word boundary matching
      const regex = new RegExp(`\\b${this.escapeRegex(tech)}\\b`, 'i');
      if (regex.test(text)) {
        candidates.add(tech);
      }
    }

    // Extract using patterns
    for (const pattern of this.techPatterns) {
      const matches = text.matchAll(pattern);
      for (const match of matches) {
        const candidate = match[1] || match[0];
        if (candidate.length > 2 && candidate.length < 30 && !this.commonWords.has(candidate.toLowerCase())) {
          candidates.add(candidate);
        }
      }
    }

    // Extract capitalized words that might be technologies
    const sentences = text.split(/[.!?]+/);
    for (const sentence of sentences) {
      const words = sentence.trim().split(/\s+/);
      for (let i = 0; i < words.length; i++) {
        const word = words[i].replace(/[.,;:!?'"()[\]{}]/g, '');

        // Skip if it's a common word
        if (this.commonWords.has(word.toLowerCase())) continue;

        // Check if word is capitalized and not at sentence start
        if (i > 0 && /^[A-Z][a-zA-Z]+/.test(word) && word.length > 2 && word.length < 20) {
          candidates.add(word);
        }

        // Also check for acronyms
        if (/^[A-Z]{2,6}$/.test(word)) {
          candidates.add(word);
        }
      }
    }

    return Array.from(candidates);
  }

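  // Illustrative trace (hypothetical text; actual candidates depend on the keyword
  // lists loaded above): for "We migrated our React frontend to Next.js and Docker",
  // the known-technology scan and the CamelCase/acronym heuristics would surface
  // candidates such as 'react', 'docker', and 'Next.js', while stopwords and the
  // generic terms added in the constructor (e.g. 'frontend', 'migrating') are
  // filtered out.
  //
  // const candidates = openClassifier.extractTechnologyCandidates(text);
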
  /**
   * Classify if the text is about each candidate technology
   */
  async classifyTechnologies(text, minConfidence = 0.3) {
    if (!this.isInitialized) {
      await this.initialize();
    }

    const cacheKey = `tech:${text.substring(0, 100)}`;
    const cached = this.cache.get(cacheKey);
    if (cached) {
      return cached;
    }

    try {
      // Extract technology candidates
      const candidates = this.extractTechnologyCandidates(text);

      if (candidates.length === 0) {
        return [];
      }

      // Truncate text using exact token counting to avoid MobileBERT's 512 token limit
      const truncatedText = await truncateToTokenLimit(text, 450); // Conservative limit

      // Create hypotheses for each candidate
      const hypotheses = candidates.map((tech) => `This text is about ${tech}`);

      // Classify
      const result = await this.classifier(truncatedText, hypotheses, {
        multi_label: true,
      });

      // Process results
      const classifications = [];
      for (let i = 0; i < result.labels.length; i++) {
        if (result.scores[i] >= minConfidence) {
          // Extract technology name from hypothesis
          const tech = result.labels[i].replace('This text is about ', '');
          classifications.push({
            technology: tech,
            confidence: result.scores[i],
          });
        }
      }

      // Sort by confidence
      classifications.sort((a, b) => b.confidence - a.confidence);

      this.cache.set(cacheKey, classifications);
      return classifications;
    } catch (error) {
      console.error('Error in technology classification:', error);
      return [];
    }
  }

  /**
   * Classify the general area/domain of the documentation
   */
  async classifyDomain(text, minConfidence = 0.3) {
    if (!this.isInitialized) {
      await this.initialize();
    }

    const cacheKey = `domain:${text.substring(0, 100)}`;
    const cached = this.cache.get(cacheKey);
    if (cached) {
      return cached;
    }

    try {
      // Use exact token counting to avoid MobileBERT's 512 token limit
      const truncatedText = await truncateToTokenLimit(text, 450);

      // Open-ended domain hypotheses
      const domainHypotheses = [
        'This is frontend/UI documentation',
        'This is backend/server documentation',
        'This is database documentation',
        'This is DevOps/infrastructure documentation',
        'This is mobile app documentation',
        'This is data science/ML documentation',
        'This is API documentation',
        'This is security documentation',
        'This is testing documentation',
        'This is architecture documentation',
        'This is getting started/setup documentation',
        'This is configuration documentation',
        'This is deployment documentation',
        'This is troubleshooting documentation',
        'This is reference documentation',
        'This is tutorial documentation',
        'This is best practices documentation',
        'This is changelog/release notes',
      ];

      const result = await this.classifier(truncatedText, domainHypotheses, {
        multi_label: true,
      });

      // Process results
      const classifications = [];
      for (let i = 0; i < result.labels.length; i++) {
        if (result.scores[i] >= minConfidence) {
          classifications.push({
            domain: result.labels[i].replace('This is ', '').replace(' documentation', ''),
            confidence: result.scores[i],
          });
        }
      }

      // Sort by confidence
      classifications.sort((a, b) => b.confidence - a.confidence);

      this.cache.set(cacheKey, classifications);
      return classifications;
    } catch (error) {
      console.error('Error in domain classification:', error);
      return [];
    }
  }

  /**
   * Get a summary classification of the text
   */
  async classifyDocument(text) {
    const [technologies, domains] = await Promise.all([this.classifyTechnologies(text), this.classifyDomain(text)]);

    return {
      technologies,
      domains,
      primaryTechnology: technologies[0]?.technology || 'Unknown',
      primaryDomain: domains[0]?.domain || 'general',
    };
  }
}

// Export singleton instance
export const openClassifier = new OpenZeroShotClassifier();
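
// Usage sketch (hypothetical input and scores; the result shape mirrors
// classifyDocument above):
//
// const doc = await openClassifier.classifyDocument('Deploying a React app with Docker');
// // doc.technologies -> e.g. [{ technology: 'react', confidence: 0.8 }, ...]
// // doc.primaryDomain -> e.g. 'frontend/UI' (a domain hypothesis with
// // 'This is ' and ' documentation' stripped)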