codecritique 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/LICENSE +21 -0
  2. package/README.md +1145 -0
  3. package/package.json +98 -0
  4. package/src/content-retrieval.js +747 -0
  5. package/src/custom-documents.js +597 -0
  6. package/src/embeddings/cache-manager.js +364 -0
  7. package/src/embeddings/constants.js +40 -0
  8. package/src/embeddings/database.js +921 -0
  9. package/src/embeddings/errors.js +208 -0
  10. package/src/embeddings/factory.js +447 -0
  11. package/src/embeddings/file-processor.js +851 -0
  12. package/src/embeddings/model-manager.js +337 -0
  13. package/src/embeddings/similarity-calculator.js +97 -0
  14. package/src/embeddings/types.js +113 -0
  15. package/src/feedback-loader.js +384 -0
  16. package/src/index.js +1418 -0
  17. package/src/llm.js +123 -0
  18. package/src/pr-history/analyzer.js +579 -0
  19. package/src/pr-history/bot-detector.js +123 -0
  20. package/src/pr-history/cli-utils.js +204 -0
  21. package/src/pr-history/comment-processor.js +549 -0
  22. package/src/pr-history/database.js +819 -0
  23. package/src/pr-history/github-client.js +629 -0
  24. package/src/project-analyzer.js +955 -0
  25. package/src/rag-analyzer.js +2764 -0
  26. package/src/rag-review.js +566 -0
  27. package/src/technology-keywords.json +753 -0
  28. package/src/utils/command.js +48 -0
  29. package/src/utils/constants.js +263 -0
  30. package/src/utils/context-inference.js +364 -0
  31. package/src/utils/document-detection.js +105 -0
  32. package/src/utils/file-validation.js +271 -0
  33. package/src/utils/git.js +232 -0
  34. package/src/utils/language-detection.js +170 -0
  35. package/src/utils/logging.js +24 -0
  36. package/src/utils/markdown.js +132 -0
  37. package/src/utils/mobilebert-tokenizer.js +141 -0
  38. package/src/utils/pr-chunking.js +276 -0
  39. package/src/utils/string-utils.js +28 -0
  40. package/src/zero-shot-classifier-open.js +392 -0
package/src/utils/pr-chunking.js
@@ -0,0 +1,276 @@
+ import chalk from 'chalk';
+
+ /**
+  * Determines if a PR should be chunked based on estimated token usage
+  * @param {Array} prFiles - Array of PR files with diffContent and content
+  * @returns {Object} Decision object with shouldChunk flag and estimates
+  */
+ export function shouldChunkPR(prFiles) {
+   // IMPORTANT: The holistic PR prompt includes BOTH full file content AND diff content
+   // for each file, plus context (code examples, guidelines, PR comments, custom docs)
+
+   // Estimate tokens for diff content (rough heuristic: ~3 characters per token)
+   const diffTokens = prFiles.reduce((sum, file) => {
+     return sum + Math.ceil((file.diffContent?.length || 0) / 3);
+   }, 0);
+
+   // Estimate tokens for full file content (included in the prompt for context awareness)
+   const fullContentTokens = prFiles.reduce((sum, file) => {
+     return sum + Math.ceil((file.content?.length || 0) / 3);
+   }, 0);
+
+   // Total file-related tokens (both diff AND full content are sent)
+   const fileTokens = diffTokens + fullContentTokens;
+
+   // Estimate context overhead (code examples, guidelines, PR comments, custom docs, project summary)
+   // This is typically 10-30k tokens depending on project size
+   const CONTEXT_OVERHEAD_TOKENS = 25000;
+
+   // Total estimated prompt tokens
+   const totalEstimatedTokens = fileTokens + CONTEXT_OVERHEAD_TOKENS;
+
+   // Claude's context window is 200k tokens; cap single-pass reviews at 100k
+   // to leave a generous buffer for the response plus a safety margin.
+   const MAX_SINGLE_REVIEW_TOKENS = 100000;
+
+   const shouldChunk = totalEstimatedTokens > MAX_SINGLE_REVIEW_TOKENS || prFiles.length > 30;
+
+   console.log(
+     chalk.gray(
+       ` Token breakdown: ${diffTokens} diff + ${fullContentTokens} full content + ${CONTEXT_OVERHEAD_TOKENS} context overhead = ${totalEstimatedTokens} total`
+     )
+   );
+
+   return {
+     shouldChunk,
+     estimatedTokens: totalEstimatedTokens,
+     diffTokens,
+     fullContentTokens,
+     contextOverhead: CONTEXT_OVERHEAD_TOKENS,
+     recommendedChunks: Math.ceil(totalEstimatedTokens / 35000), // Matches the default per-chunk budget in chunkPRFiles
+   };
+ }
+
+ /**
+  * Chunks PR files into manageable groups based on token limits and logical grouping
+  * @param {Array} prFiles - Array of PR files with diffContent and content
+  * @param {number} maxTokensPerChunk - Maximum tokens per chunk
+  * @returns {Array} Array of chunks with files and metadata
+  */
+ export function chunkPRFiles(prFiles, maxTokensPerChunk = 35000) {
+   // Calculate change complexity for each file (works for any language)
+   // IMPORTANT: Token estimate must include BOTH diff AND full content since both are sent
+   const filesWithMetrics = prFiles.map((file) => ({
+     ...file,
+     changeSize: calculateChangeSize(file.diffContent),
+     fileComplexity: calculateFileComplexity(file),
+     // Estimate tokens for BOTH diff content AND full file content (both are included in the prompt)
+     estimatedTokens: Math.ceil((file.diffContent?.length || 0) / 3) + Math.ceil((file.content?.length || 0) / 3),
+   }));
+
+   // Sort by directory + change importance for logical grouping
+   const sortedFiles = filesWithMetrics.sort((a, b) => {
+     const dirA = getDirectoryPath(a.filePath);
+     const dirB = getDirectoryPath(b.filePath);
+
+     // Primary: Directory structure (keep related files together)
+     if (dirA !== dirB) return dirA.localeCompare(dirB);
+
+     // Secondary: Change importance (larger changes first)
+     return b.changeSize - a.changeSize;
+   });
+
+   // Chunk files based on token budget
+   const chunks = [];
+   let currentChunk = [];
+   let currentTokens = 0;
+
+   for (const file of sortedFiles) {
+     // Start a new chunk if adding this file would exceed the budget
+     if (currentTokens + file.estimatedTokens > maxTokensPerChunk && currentChunk.length > 0) {
+       chunks.push({
+         files: [...currentChunk],
+         totalTokens: currentTokens,
+         chunkId: chunks.length + 1,
+       });
+       currentChunk = [];
+       currentTokens = 0;
+     }
+
+     currentChunk.push(file);
+     currentTokens += file.estimatedTokens;
+   }
+
+   // Add the final chunk
+   if (currentChunk.length > 0) {
+     chunks.push({
+       files: [...currentChunk],
+       totalTokens: currentTokens,
+       chunkId: chunks.length + 1,
+     });
+   }
+
+   return chunks;
+ }
+
+ /**
+  * Language-agnostic change size calculation
+  * @param {string} diffContent - The diff content
+  * @returns {number} Total number of additions and deletions
+  */
+ function calculateChangeSize(diffContent) {
+   if (!diffContent) return 0;
+   const lines = diffContent.split('\n');
+   // Exclude the '+++'/'---' file header lines so only real changes are counted
+   const additions = lines.filter((line) => line.startsWith('+') && !line.startsWith('+++')).length;
+   const deletions = lines.filter((line) => line.startsWith('-') && !line.startsWith('---')).length;
+   return additions + deletions;
+ }
+
+ /**
+  * Language-agnostic file complexity scoring
+  * @param {Object} file - File object with filePath and diffContent
+  * @returns {number} Complexity score
+  */
+ function calculateFileComplexity(file) {
+   let complexity = 0;
+
+   // File size factor
+   complexity += Math.min(file.diffContent ? file.diffContent.length / 1000 : 0, 20);
+
+   // Path-based heuristics (works for any language)
+   const path = file.filePath.toLowerCase();
+   if (path.includes('/src/') || path.includes('/lib/')) complexity += 10;
+   if (path.includes('/test/') || path.includes('/spec/')) complexity += 5;
+   if (path.includes('/config/') || path.includes('/settings/')) complexity += 8;
+   if (path.includes('/main.') || path.includes('/index.')) complexity += 15;
+
+   // Change type heuristics
+   if (file.diffContent) {
+     if (file.diffContent.includes('new file mode')) complexity += 12;
+     if (file.diffContent.includes('deleted file mode')) complexity += 8;
+   }
+
+   return complexity;
+ }
+
+ /**
+  * Gets the directory path used to group related files
+  * @param {string} filePath - The file path
+  * @returns {string} Directory path without the filename
+  */
+ function getDirectoryPath(filePath) {
+   return filePath.split('/').slice(0, -1).join('/');
+ }
+
+ /**
+  * Combines results from multiple chunk reviews into a single result
+  * @param {Array} chunkResults - Array of chunk review results
+  * @param {number} totalFiles - Total number of files in the PR
+  * @returns {Object} Combined result object
+  */
+ export function combineChunkResults(chunkResults, totalFiles) {
+   const combinedResult = {
+     success: true,
+     results: [],
+     prContext: {
+       totalFiles: totalFiles,
+       chunkedReview: true,
+       chunks: chunkResults.length,
+     },
+   };
+
+   // Combine file-specific results
+   chunkResults.forEach((chunkResult, chunkIndex) => {
+     if (chunkResult.success && chunkResult.results) {
+       chunkResult.results.forEach((fileResult) => {
+         // Add chunk context to each result
+         const enhancedResult = {
+           ...fileResult,
+           chunkInfo: {
+             chunkNumber: chunkIndex + 1,
+             totalChunks: chunkResults.length,
+           },
+         };
+         combinedResult.results.push(enhancedResult);
+       });
+     }
+   });
+
+   // Create combined summary
+   combinedResult.combinedSummary = createCombinedSummary(chunkResults);
+
+   // Detect and merge cross-chunk issues
+   combinedResult.crossChunkIssues = detectCrossChunkIssues(chunkResults);
+
+   console.log(chalk.green(`✅ Combined results from ${chunkResults.length} chunks: ${combinedResult.results.length} file reviews`));
+
+   return combinedResult;
+ }
+
+ /**
+  * Creates a summary from combined chunk results
+  * @param {Array} chunkResults - Array of chunk review results
+  * @returns {string} Combined summary text
+  */
+ function createCombinedSummary(chunkResults) {
+   const totalIssues = chunkResults.reduce((sum, chunk) => {
+     if (!chunk.results) return sum;
+     return (
+       sum +
+       chunk.results.reduce((fileSum, file) => {
+         return fileSum + (file.results?.issues?.length || 0);
+       }, 0)
+     );
+   }, 0);
+
+   const successfulChunks = chunkResults.filter((c) => c.success).length;
+
+   return `Chunked PR review completed: ${successfulChunks}/${chunkResults.length} chunks processed successfully. Total issues found: ${totalIssues}. Review performed in parallel chunks to optimize token usage.`;
+ }
+
+ /**
+  * Detects issues that span across multiple chunks
+  * @param {Array} chunkResults - Array of chunk review results
+  * @returns {Array} Array of cross-chunk issues
+  */
+ function detectCrossChunkIssues(chunkResults) {
+   const crossChunkIssues = [];
+
+   // Simple heuristic: Look for similar issues across chunks that might indicate patterns
+   const allIssues = chunkResults.flatMap(
+     (chunk) =>
+       chunk.results?.flatMap((file) =>
+         (file.results?.issues || []).map((issue) => ({
+           ...issue,
+           chunkId: chunk.chunkId,
+           filePath: file.filePath,
+         }))
+       ) || []
+   );
+
+   // Group by issue type and description similarity
+   const issueGroups = new Map();
+   allIssues.forEach((issue) => {
+     const key = `${issue.type}-${issue.description ? issue.description.substring(0, 50) : ''}`;
+     if (!issueGroups.has(key)) {
+       issueGroups.set(key, []);
+     }
+     issueGroups.get(key).push(issue);
+   });
+
+   // Identify patterns that appear across multiple chunks
+   issueGroups.forEach((issues) => {
+     const uniqueChunks = new Set(issues.map((i) => i.chunkId));
+     if (uniqueChunks.size > 1) {
+       crossChunkIssues.push({
+         type: 'pattern',
+         severity: 'medium',
+         description: `Similar issue pattern detected across ${uniqueChunks.size} chunks: ${issues[0].description || 'Pattern issue'}`,
+         affectedFiles: issues.map((i) => i.filePath),
+         suggestion: `This issue appears in multiple parts of the PR. Consider addressing it consistently across all affected files.`,
+       });
+     }
+   });
+
+   return crossChunkIssues;
+ }
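
Taken together, the three exports form an estimate, split, review, merge pipeline. A minimal caller sketch (not part of the package; `reviewChunk` is a hypothetical stand-in for whatever performs the per-chunk LLM review and resolves to the `{ success, results }` shape that combineChunkResults expects):

async function reviewLargePR(prFiles) {
  // prFiles: [{ filePath, diffContent, content }, ...] as consumed by the functions above
  const decision = shouldChunkPR(prFiles);
  if (!decision.shouldChunk) {
    return reviewChunk(prFiles); // hypothetical single-pass review
  }
  const chunks = chunkPRFiles(prFiles, 35000);
  const chunkResults = await Promise.all(chunks.map((chunk) => reviewChunk(chunk.files)));
  return combineChunkResults(chunkResults, prFiles.length);
}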
package/src/utils/string-utils.js
@@ -0,0 +1,28 @@
+ /**
+  * String Utilities Module
+  *
+  * This module provides utilities for string manipulation, formatting,
+  * and text processing operations.
+  */
+
+ /**
+  * Slugify text for use in IDs and URLs
+  *
+  * @param {string} text - The text to slugify
+  * @returns {string} A slugified string safe for use in IDs and URLs
+  *
+  * @example
+  * slugify('Hello World!'); // 'hello-world'
+  * slugify('My Component Name'); // 'my-component-name'
+  * slugify('  Multiple   Spaces  '); // 'multiple-spaces'
+  */
+ export function slugify(text) {
+   if (!text) return '';
+   return text
+     .toString()
+     .toLowerCase()
+     .trim()
+     .replace(/\s+/g, '-') // Replace spaces with -
+     .replace(/[^\w-]+/g, '') // Remove all non-word chars
+     .replace(/--+/g, '-'); // Replace multiple - with a single -
+ }
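
A quick usage sketch for the export above (example inputs are illustrative):

import { slugify } from './utils/string-utils.js';

slugify('Retry & Backoff Strategy'); // 'retry-backoff-strategy'
slugify('PR #42: Fix race condition'); // 'pr-42-fix-race-condition'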
package/src/zero-shot-classifier-open.js
@@ -0,0 +1,392 @@
+ /**
+  * Open-ended Zero-Shot Classification Module
+  *
+  * This module provides zero-shot classification without predefined categories,
+  * allowing it to detect any technology or framework mentioned in the text.
+  */
+
+ import { env, pipeline } from '@huggingface/transformers';
+ import * as linguistLanguages from 'linguist-languages';
+ import { LRUCache } from 'lru-cache';
+ import stopwords from 'stopwords-iso/stopwords-iso.json' with { type: 'json' };
+ import techKeywords from './technology-keywords.json' with { type: 'json' };
+ import { truncateToTokenLimit } from './utils/mobilebert-tokenizer.js';
+
+ // Configure Transformers.js environment
+ env.allowLocalModels = false;
+ env.useBrowserCache = false;
+
+ /**
+  * OpenZeroShotClassifier for unrestricted technology detection
+  */
+ class OpenZeroShotClassifier {
+   constructor() {
+     this.classifier = null;
+     this.initializationPromise = null;
+     this.cache = new LRUCache({
+       max: 100,
+       ttl: 1000 * 60 * 60, // 1 hour TTL
+     });
+     this.isInitialized = false;
+
+     // Common words to exclude from technology detection
+     // Use English stopwords from stopwords-iso
+     this.commonWords = new Set(stopwords.en || []);
+
+     // Add additional technical context words that are too generic
+     const additionalCommonWords = [
+       'system',
+       'modern',
+       'architecture',
+       'stack',
+       'features',
+       'data',
+       'service',
+       'tools',
+       'runtime',
+       'apps',
+       'workloads',
+       'pipeline',
+       'builds',
+       'team',
+       'interfaces',
+       'queries',
+       'computing',
+       'database',
+       'processing',
+       'stream',
+       'analytics',
+       'infrastructure',
+       'runs',
+       'orchestration',
+       'mesh',
+       'experimenting',
+       'desktop',
+       'entire',
+       'reproducible',
+       'migrating',
+       'temporal',
+       'distributed',
+       'graph',
+       'high-performance',
+       'real-time',
+       'reactive',
+       'frontend',
+       'instead',
+       'legacy',
+       'fast',
+       'slow',
+       'quick',
+       'easy',
+       'hard',
+       'simple',
+       'complex',
+       'basic',
+       'advanced',
+       'beginner',
+       'intermediate',
+       'expert',
+       'professional',
+     ];
+
+     // Add the additional words to the stopwords set
+     additionalCommonWords.forEach((word) => this.commonWords.add(word));
+
+     // Build technology patterns from loaded keywords
+     this.techPatterns = this.buildTechPatterns();
+
+     // Build a set of all known technologies for quick lookup
+     this.knownTechnologies = this.buildKnownTechnologies();
+   }
+
+   /**
+    * Initialize the zero-shot classification pipeline (singleton pattern)
+    */
+   async initialize() {
+     // If already initialized, return immediately
+     if (this.isInitialized) return;
+
+     // If currently initializing, wait for the existing initialization
+     if (this.initializationPromise) {
+       return await this.initializationPromise;
+     }
+
+     // Start initialization
+     this.initializationPromise = this._doInitialize();
+
+     try {
+       await this.initializationPromise;
+     } finally {
+       // Clean up the promise after initialization (success or failure)
+       this.initializationPromise = null;
+     }
+   }
+
+   async _doInitialize() {
+     try {
+       console.log('Initializing open-ended zero-shot classifier...');
+
+       this.classifier = await pipeline('zero-shot-classification', 'Xenova/mobilebert-uncased-mnli', {
+         quantized: true,
+       });
+
+       this.isInitialized = true;
+       console.log('✓ Open-ended zero-shot classifier initialized successfully');
+     } catch (error) {
+       console.error('Error initializing classifier:', error);
+       this.isInitialized = false;
+       throw error;
+     }
+   }
+
+   /**
+    * Build technology patterns from keywords JSON
+    */
+   buildTechPatterns() {
+     const patterns = [
+       /\b(\w+\.js)\b/gi, // Matches *.js frameworks
+       /\b(\w+\.py)\b/gi, // Matches *.py libraries
+       /\b([A-Z](?:[a-z]*[A-Z])*[a-z]*)\b/g, // CamelCase (React, FastAPI)
+       /\b([a-z]+(?:-[a-z]+)+)\b/gi, // kebab-case (scikit-learn, styled-components)
+     ];
+
+     // Add dynamic patterns from linguist languages
+     for (const [, langData] of Object.entries(linguistLanguages)) {
+       if (langData.aliases) {
+         langData.aliases.forEach((alias) => {
+           patterns.push(new RegExp(`\\b${this.escapeRegex(alias)}\\b`, 'gi'));
+         });
+       }
+     }
+
+     return patterns;
+   }
+
+   /**
+    * Build a set of all known technologies
+    */
+   buildKnownTechnologies() {
+     const techs = new Set();
+
+     // Add all technologies from the JSON file
+     const addTechsFromObject = (obj) => {
+       for (const value of Object.values(obj)) {
+         if (Array.isArray(value)) {
+           value.forEach((tech) => techs.add(tech.toLowerCase()));
+         } else if (typeof value === 'object') {
+           addTechsFromObject(value);
+         }
+       }
+     };
+
+     addTechsFromObject(techKeywords);
+
+     // Add languages from linguist
+     for (const [langName, langData] of Object.entries(linguistLanguages)) {
+       techs.add(langName.toLowerCase());
+       if (langData.aliases) {
+         langData.aliases.forEach((alias) => techs.add(alias.toLowerCase()));
+       }
+     }
+
+     return techs;
+   }
+
+   /**
+    * Escape regex special characters
+    */
+   escapeRegex(str) {
+     return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+   }
+
+   /**
+    * Extract potential technology candidates from text
+    */
+   extractTechnologyCandidates(text) {
+     const candidates = new Set();
+
+     // Look for known technologies
+     for (const tech of this.knownTechnologies) {
+       // Create regex for exact word boundary matching
+       const regex = new RegExp(`\\b${this.escapeRegex(tech)}\\b`, 'i');
+       if (regex.test(text)) {
+         candidates.add(tech);
+       }
+     }
+
+     // Extract using patterns
+     for (const pattern of this.techPatterns) {
+       const matches = text.matchAll(pattern);
+       for (const match of matches) {
+         const candidate = match[1] || match[0];
+         if (candidate.length > 2 && candidate.length < 30 && !this.commonWords.has(candidate.toLowerCase())) {
+           candidates.add(candidate);
+         }
+       }
+     }
+
+     // Extract capitalized words that might be technologies
+     const sentences = text.split(/[.!?]+/);
+     for (const sentence of sentences) {
+       const words = sentence.trim().split(/\s+/);
+       for (let i = 0; i < words.length; i++) {
+         const word = words[i].replace(/[.,;:!?'"()[\]{}]/g, '');
+
+         // Skip if it's a common word
+         if (this.commonWords.has(word.toLowerCase())) continue;
+
+         // Check if word is capitalized and not at sentence start
+         if (i > 0 && /^[A-Z][a-zA-Z]+/.test(word) && word.length > 2 && word.length < 20) {
+           candidates.add(word);
+         }
+
+         // Also check for acronyms
+         if (/^[A-Z]{2,6}$/.test(word)) {
+           candidates.add(word);
+         }
+       }
+     }
+
+     return Array.from(candidates);
+   }
+
+   /**
+    * Classify if the text is about each candidate technology
+    */
+   async classifyTechnologies(text, minConfidence = 0.3) {
+     if (!this.isInitialized) {
+       await this.initialize();
+     }
+
+     const cacheKey = `tech:${text.substring(0, 100)}`;
+     const cached = this.cache.get(cacheKey);
+     if (cached) {
+       return cached;
+     }
+
+     try {
+       // Extract technology candidates
+       const candidates = this.extractTechnologyCandidates(text);
+
+       if (candidates.length === 0) {
+         return [];
+       }
+
+       // Truncate text using exact token counting to avoid MobileBERT's 512 token limit
+       const truncatedText = await truncateToTokenLimit(text, 450); // Conservative limit
+
+       // Create hypotheses for each candidate
+       const hypotheses = candidates.map((tech) => `This text is about ${tech}`);
+
+       // Classify
+       const result = await this.classifier(truncatedText, hypotheses, {
+         multi_label: true,
+       });
+
+       // Process results
+       const classifications = [];
+       for (let i = 0; i < result.labels.length; i++) {
+         if (result.scores[i] >= minConfidence) {
+           // Extract technology name from hypothesis
+           const tech = result.labels[i].replace('This text is about ', '');
+           classifications.push({
+             technology: tech,
+             confidence: result.scores[i],
+           });
+         }
+       }
+
+       // Sort by confidence
+       classifications.sort((a, b) => b.confidence - a.confidence);
+
+       this.cache.set(cacheKey, classifications);
+       return classifications;
+     } catch (error) {
+       console.error('Error in technology classification:', error);
+       return [];
+     }
+   }
+
+   /**
+    * Classify the general area/domain of the documentation
+    */
+   async classifyDomain(text, minConfidence = 0.3) {
+     if (!this.isInitialized) {
+       await this.initialize();
+     }
+
+     const cacheKey = `domain:${text.substring(0, 100)}`;
+     const cached = this.cache.get(cacheKey);
+     if (cached) {
+       return cached;
+     }
+
+     try {
+       // Use exact token counting to avoid MobileBERT's 512 token limit
+       const truncatedText = await truncateToTokenLimit(text, 450);
+
+       // Open-ended domain hypotheses
+       const domainHypotheses = [
+         'This is frontend/UI documentation',
+         'This is backend/server documentation',
+         'This is database documentation',
+         'This is DevOps/infrastructure documentation',
+         'This is mobile app documentation',
+         'This is data science/ML documentation',
+         'This is API documentation',
+         'This is security documentation',
+         'This is testing documentation',
+         'This is architecture documentation',
+         'This is getting started/setup documentation',
+         'This is configuration documentation',
+         'This is deployment documentation',
+         'This is troubleshooting documentation',
+         'This is reference documentation',
+         'This is tutorial documentation',
+         'This is best practices documentation',
+         'This is changelog/release notes',
+       ];
+
+       const result = await this.classifier(truncatedText, domainHypotheses, {
+         multi_label: true,
+       });
+
+       // Process results
+       const classifications = [];
+       for (let i = 0; i < result.labels.length; i++) {
+         if (result.scores[i] >= minConfidence) {
+           classifications.push({
+             domain: result.labels[i].replace('This is ', '').replace(' documentation', ''),
+             confidence: result.scores[i],
+           });
+         }
+       }
+
+       // Sort by confidence
+       classifications.sort((a, b) => b.confidence - a.confidence);
+
+       this.cache.set(cacheKey, classifications);
+       return classifications;
+     } catch (error) {
+       console.error('Error in domain classification:', error);
+       return [];
+     }
+   }
+
+   /**
+    * Get a summary classification of the text
+    */
+   async classifyDocument(text) {
+     const [technologies, domains] = await Promise.all([this.classifyTechnologies(text), this.classifyDomain(text)]);
+
+     return {
+       technologies,
+       domains,
+       primaryTechnology: technologies[0]?.technology || 'Unknown',
+       primaryDomain: domains[0]?.domain || 'general',
+     };
+   }
+ }
+
+ // Export singleton instance
+ export const openClassifier = new OpenZeroShotClassifier();
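
A hedged usage sketch for the exported singleton; the first call lazily downloads the Xenova/mobilebert-uncased-mnli model, so expect a one-time startup cost:

import { openClassifier } from './zero-shot-classifier-open.js';

const summary = await openClassifier.classifyDocument(
  'This guide covers deploying the React frontend to Kubernetes with Helm.'
);
console.log(summary.primaryTechnology, summary.primaryDomain);
console.log(summary.technologies); // e.g. [{ technology: 'react', confidence: 0.87 }, ...] (illustrative values)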