codecritique 1.0.0 → 1.1.0
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- package/README.md +82 -114
- package/package.json +10 -9
- package/src/content-retrieval.test.js +775 -0
- package/src/custom-documents.test.js +440 -0
- package/src/feedback-loader.test.js +529 -0
- package/src/llm.test.js +256 -0
- package/src/project-analyzer.test.js +747 -0
- package/src/rag-analyzer.js +12 -0
- package/src/rag-analyzer.test.js +1109 -0
- package/src/rag-review.test.js +317 -0
- package/src/setupTests.js +131 -0
- package/src/zero-shot-classifier-open.test.js +278 -0
- package/src/embeddings/cache-manager.js +0 -364
- package/src/embeddings/constants.js +0 -40
- package/src/embeddings/database.js +0 -921
- package/src/embeddings/errors.js +0 -208
- package/src/embeddings/factory.js +0 -447
- package/src/embeddings/file-processor.js +0 -851
- package/src/embeddings/model-manager.js +0 -337
- package/src/embeddings/similarity-calculator.js +0 -97
- package/src/embeddings/types.js +0 -113
- package/src/pr-history/analyzer.js +0 -579
- package/src/pr-history/bot-detector.js +0 -123
- package/src/pr-history/cli-utils.js +0 -204
- package/src/pr-history/comment-processor.js +0 -549
- package/src/pr-history/database.js +0 -819
- package/src/pr-history/github-client.js +0 -629
- package/src/technology-keywords.json +0 -753
- package/src/utils/command.js +0 -48
- package/src/utils/constants.js +0 -263
- package/src/utils/context-inference.js +0 -364
- package/src/utils/document-detection.js +0 -105
- package/src/utils/file-validation.js +0 -271
- package/src/utils/git.js +0 -232
- package/src/utils/language-detection.js +0 -170
- package/src/utils/logging.js +0 -24
- package/src/utils/markdown.js +0 -132
- package/src/utils/mobilebert-tokenizer.js +0 -141
- package/src/utils/pr-chunking.js +0 -276
- package/src/utils/string-utils.js +0 -28
package/src/utils/command.js
DELETED
@@ -1,48 +0,0 @@
- /**
-  * Command Execution Module
-  *
-  * This module provides utilities for safely executing shell commands,
-  * particularly focused on git operations with proper argument escaping
-  * to prevent command injection attacks.
-  */
-
- import { execSync } from 'child_process';
-
- /**
-  * Safely escape shell arguments to prevent command injection
-  *
-  * @param {string} arg - The argument to escape
-  * @returns {string} The safely escaped argument
-  *
-  * @example
-  * const safeArg = escapeShellArg("user's file.txt");
-  * // Returns: 'user'\''s file.txt'
-  */
- function escapeShellArg(arg) {
-   if (!arg || typeof arg !== 'string') {
-     return "''";
-   }
-
-   // For POSIX shells, single quotes preserve everything literally
-   // We escape single quotes by ending the quoted string, adding an escaped quote, and starting a new quoted string
-   return "'" + arg.replace(/'/g, "'\\''") + "'";
- }
-
- /**
-  * Safely execute git commands by escaping all arguments
-  *
-  * @param {string} baseCommand - The base git command (e.g., 'git show')
-  * @param {Array<string>} args - Array of arguments to escape and append
-  * @param {Object} options - Options to pass to execSync
-  * @returns {string} The command output
-  *
-  * @example
-  * const result = execGitSafe('git show', ['HEAD~1', 'src/file.js'], { cwd: '/path/to/repo' });
-  *
-  * @throws {Error} If the command execution fails
-  */
- export function execGitSafe(baseCommand, args = [], options = {}) {
-   const escapedArgs = args.map((arg) => escapeShellArg(arg)).join(' ');
-   const fullCommand = escapedArgs ? `${baseCommand} ${escapedArgs}` : baseCommand;
-   return execSync(fullCommand, options);
- }
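In 1.0.0 this helper was called as shown in its own JSDoc example. A minimal sketch of that usage follows; the import specifier, repository path, and file name are placeholders, not values taken from the package:

```js
import { execGitSafe } from './utils/command.js'; // placeholder import path

// Each argument is escaped individually, so an untrusted file name cannot
// inject additional shell commands into the git invocation.
const output = execGitSafe('git show', ['HEAD~1', 'src/file.js'], {
  cwd: '/path/to/repo', // placeholder path
  encoding: 'utf8',     // makes execSync return a string instead of a Buffer
});
console.log(output);
```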
package/src/utils/constants.js
DELETED
@@ -1,263 +0,0 @@
- /**
-  * Constants Module
-  *
-  * This module provides shared constants for file extensions, patterns,
-  * and other configuration values used throughout the utility modules.
-  */
-
- /**
-  * Extension to language mapping
-  * This is the single source of truth for supported file types and their languages
-  * @type {Object.<string, string>}
-  */
- export const EXTENSION_TO_LANGUAGE_MAP = {
-   // JavaScript and variants
-   '.js': 'javascript',
-   '.jsx': 'javascript',
-   '.mjs': 'javascript',
-   '.cjs': 'javascript',
-
-   // TypeScript and variants
-   '.ts': 'typescript',
-   '.tsx': 'typescript',
-   '.mts': 'typescript',
-   '.cts': 'typescript',
-   '.d.ts': 'typescript',
-
-   // Web technologies
-   '.html': 'html',
-   '.htm': 'html',
-   '.css': 'css',
-   '.scss': 'scss',
-   '.sass': 'sass',
-   '.less': 'less',
-   '.svg': 'svg',
-
-   // Configuration files
-   '.json': 'json',
-   '.yaml': 'yaml',
-   '.yml': 'yaml',
-   '.toml': 'toml',
-   '.xml': 'xml',
-
-   // Documentation
-   '.md': 'markdown',
-   '.mdx': 'markdown',
-   '.markdown': 'markdown',
-   '.rst': 'restructuredtext',
-   '.adoc': 'asciidoc',
-   '.txt': 'text',
-
-   // Python
-   '.py': 'python',
-   '.pyi': 'python',
-   '.ipynb': 'jupyter',
-
-   // Ruby
-   '.rb': 'ruby',
-   '.erb': 'ruby',
-   '.rake': 'ruby',
-
-   // PHP
-   '.php': 'php',
-   '.phtml': 'php',
-
-   // Java and JVM languages
-   '.java': 'java',
-   '.kt': 'kotlin',
-   '.kts': 'kotlin',
-   '.groovy': 'groovy',
-   '.scala': 'scala',
-
-   // C-family languages
-   '.c': 'c',
-   '.h': 'c',
-   '.cpp': 'cpp',
-   '.cc': 'cpp',
-   '.cxx': 'cpp',
-   '.hpp': 'cpp',
-   '.c++': 'cpp',
-   '.h++': 'cpp',
-   '.cs': 'csharp',
-
-   // Go
-   '.go': 'go',
-
-   // Rust
-   '.rs': 'rust',
-
-   // Swift
-   '.swift': 'swift',
-
-   // Shell scripts
-   '.sh': 'bash',
-   '.bash': 'bash',
-   '.zsh': 'zsh',
-   '.fish': 'fish',
-
-   // Other languages
-   '.pl': 'perl',
-   '.pm': 'perl',
-   '.lua': 'lua',
-   '.r': 'r',
-   '.dart': 'dart',
-   '.ex': 'elixir',
-   '.exs': 'elixir',
-   '.erl': 'erlang',
-   '.hrl': 'erlang',
-   '.clj': 'clojure',
-   '.cljs': 'clojure',
-   '.hs': 'haskell',
-   '.lhs': 'haskell',
-
-   // GraphQL
-   '.graphql': 'graphql',
-   '.gql': 'graphql',
-
-   // Frameworks
-   '.vue': 'vue',
-   '.svelte': 'svelte',
-   '.astro': 'astro',
-   '.prisma': 'prisma',
- };
-
- /**
-  * All supported file extensions derived from the language mapping
-  * @type {string[]}
-  */
- export const ALL_SUPPORTED_EXTENSIONS = Object.keys(EXTENSION_TO_LANGUAGE_MAP);
-
- /**
-  * Documentation file extensions
-  * @type {string[]}
-  */
- export const DOCUMENTATION_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS.filter((ext) => {
-   const lang = EXTENSION_TO_LANGUAGE_MAP[ext];
-   return ['markdown', 'restructuredtext', 'asciidoc', 'text'].includes(lang);
- });
-
- /**
-  * Code file extensions (excludes documentation types)
-  * @type {string[]}
-  */
- export const CODE_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS.filter((ext) => !DOCUMENTATION_EXTENSIONS.includes(ext));
-
- /**
-  * Binary file extensions that should be skipped during processing
-  * @type {string[]}
-  */
- export const BINARY_EXTENSIONS = [
-   '.jpg',
-   '.jpeg',
-   '.png',
-   '.gif',
-   '.bmp',
-   '.ico',
-   '.webp',
-   '.pdf',
-   '.doc',
-   '.docx',
-   '.ppt',
-   '.pptx',
-   '.xls',
-   '.xlsx',
-   '.zip',
-   '.tar',
-   '.gz',
-   '.7z',
-   '.rar',
-   '.exe',
-   '.dll',
-   '.so',
-   '.dylib',
-   '.ttf',
-   '.otf',
-   '.woff',
-   '.woff2',
-   '.mp3',
-   '.mp4',
-   '.avi',
-   '.mov',
-   '.wav',
- ];
-
- /**
-  * Directories to skip during file processing
-  * @type {string[]}
-  */
- export const SKIP_DIRECTORIES = ['node_modules', 'dist', 'build', '.git', 'coverage', 'vendor'];
-
- /**
-  * File names to skip during processing (lock files, config files not useful as code examples)
-  * @type {string[]}
-  */
- export const SKIP_FILENAMES = [
-   // Lock files
-   'package-lock.json',
-   'yarn.lock',
-   'pnpm-lock.yaml',
-   'composer.lock',
-   'Gemfile.lock',
-   // Package manifests (config, not source code)
-   'package.json',
-   'composer.json',
-   'Gemfile',
-   'Cargo.toml',
-   'go.mod',
-   'go.sum',
-   'requirements.txt',
-   'pyproject.toml',
-   'pom.xml',
-   'build.gradle',
-   // Common config files (not useful as code examples)
-   'tsconfig.json',
-   'jsconfig.json',
-   '.eslintrc',
-   '.eslintrc.json',
-   '.eslintrc.js',
-   '.prettierrc',
-   '.prettierrc.json',
-   'prettier.config.js',
-   '.babelrc',
-   'babel.config.js',
-   'jest.config.js',
-   'jest.config.ts',
-   'vitest.config.ts',
-   'vitest.config.js',
-   'webpack.config.js',
-   'vite.config.js',
-   'vite.config.ts',
-   'rollup.config.js',
-   'Makefile',
-   'Dockerfile',
-   '.dockerignore',
-   '.gitignore',
-   '.gitattributes',
-   '.editorconfig',
-   '.env.example',
-   '.nvmrc',
-   '.node-version',
- ];
-
- /**
-  * File patterns to skip during processing (likely generated files)
-  * @type {RegExp[]}
-  */
- export const SKIP_FILE_PATTERNS = [
-   /\.min\.(js|css)$/,
-   /\.bundle\.(js|css)$/,
-   /\.generated\./,
-   /\.d\.ts$/,
-   /\.snap$/,
-   // Config file patterns
-   /^\..*rc$/, // .eslintrc, .prettierrc, etc.
-   /^\..*rc\.json$/, // .eslintrc.json, etc.
-   /\.config\.(js|ts|mjs|cjs)$/, // *.config.js, *.config.ts files
- ];
-
- /**
-  * Regex pattern for detecting generic documentation files
-  * Shared between different modules for consistency
-  * @type {RegExp}
-  */
- export const GENERIC_DOC_REGEX = /(README|RUNBOOK|CONTRIBUTING|CHANGELOG|LICENSE|SETUP|INSTALL)(\.md|$)/i;
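These constants fed the file-filtering logic elsewhere in 1.0.0. The sketch below shows one plausible way they combine; `shouldProcessFile` is an illustrative helper for this diff summary, not an export of the removed module:

```js
import path from 'path';
import {
  EXTENSION_TO_LANGUAGE_MAP,
  BINARY_EXTENSIONS,
  SKIP_DIRECTORIES,
  SKIP_FILENAMES,
  SKIP_FILE_PATTERNS,
} from './utils/constants.js'; // placeholder import path

// Illustrative helper: returns the detected language, or null when the file
// should be skipped (ignored directory, lock/config file, generated file, or binary).
function shouldProcessFile(filePath) {
  const base = path.basename(filePath);
  const ext = path.extname(filePath).toLowerCase();

  if (filePath.split('/').some((segment) => SKIP_DIRECTORIES.includes(segment))) return null;
  if (SKIP_FILENAMES.includes(base)) return null;
  if (SKIP_FILE_PATTERNS.some((pattern) => pattern.test(base))) return null;
  if (BINARY_EXTENSIONS.includes(ext)) return null;

  return EXTENSION_TO_LANGUAGE_MAP[ext] || null;
}

console.log(shouldProcessFile('src/components/Button.tsx'));    // 'typescript'
console.log(shouldProcessFile('node_modules/lodash/index.js')); // null
```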
package/src/utils/context-inference.js
DELETED
@@ -1,364 +0,0 @@
- /**
-  * Context Inference Module
-  *
-  * This module provides utilities for inferring context from code and document content,
-  * including technology detection, area classification, and semantic analysis.
-  */
-
- import path from 'path';
- import { openClassifier } from '../zero-shot-classifier-open.js';
-
- /**
-  * Infer context from code content using heuristic analysis
-  *
-  * @param {string} codeContent - The code content to analyze
-  * @param {string} language - The detected programming language
-  * @returns {Object} Context information including area, keywords, and dominant technologies
-  *
-  * @example
-  * const context = inferContextFromCodeContent('import React from "react"', 'javascript');
-  * // Returns: { area: 'Frontend', keywords: [...], dominantTech: ['React'] }
-  */
- export function inferContextFromCodeContent(codeContent, language) {
-   const context = {
-     area: 'Unknown', // "Frontend" | "Backend" | "Tooling" | "GeneralJS_TS" | "Unknown"
-     keywords: [], // string[]
-     dominantTech: [], // string[]
-   };
-   const lowerCode = codeContent.toLowerCase();
-
-   // Area inference (very basic for now)
-   if (language === 'javascript' || language === 'typescript') {
-     if (
-       lowerCode.includes('react') ||
-       lowerCode.includes('usestate') ||
-       lowerCode.includes('useeffect') ||
-       lowerCode.includes('angular') ||
-       lowerCode.includes('vue') ||
-       lowerCode.includes('document.getelementbyid') ||
-       lowerCode.includes('jsx') ||
-       lowerCode.includes('.tsx')
-     ) {
-       context.area = 'Frontend';
-       if (lowerCode.includes('react')) context.dominantTech.push('React');
-       if (lowerCode.includes('angular')) context.dominantTech.push('Angular');
-       if (lowerCode.includes('vue')) context.dominantTech.push('Vue');
-     } else if (
-       lowerCode.includes("require('express')") ||
-       lowerCode.includes('http.createserver') ||
-       lowerCode.includes('fs.readfilesync') ||
-       lowerCode.includes('process.env')
-     ) {
-       context.area = 'Backend';
-       if (lowerCode.includes('express')) context.dominantTech.push('Node.js/Express');
-       else context.dominantTech.push('Node.js');
-     } else {
-       context.area = 'GeneralJS_TS';
-     }
-   } else if (language === 'python') {
-     if (lowerCode.includes('django') || lowerCode.includes('flask')) {
-       context.area = 'Backend';
-       if (lowerCode.includes('django')) context.dominantTech.push('Django');
-       if (lowerCode.includes('flask')) context.dominantTech.push('Flask');
-     } else {
-       context.area = 'GeneralPython'; // Or just "Backend"
-     }
-   }
-   // Add more language-specific heuristics here
-
-   const commonTechWords = ['api', 'component', 'module', 'function', 'class', 'hook', 'service', 'database', 'query', 'state', 'props'];
-   commonTechWords.forEach((word) => {
-     if (lowerCode.includes(word)) context.keywords.push(word);
-   });
-   context.keywords = [...new Set(context.keywords)];
-   context.dominantTech = [...new Set(context.dominantTech)];
-
-   return context;
- }
-
- /**
-  * Infer context from document content using advanced classification and analysis
-  *
-  * @param {string} docPath - Path to the document
-  * @param {string} h1Content - H1 heading content
-  * @param {Array} chunksSample - Sample chunks from the document for analysis
-  * @returns {Promise<Object>} Context information with area classification and technology detection
-  *
-  * @example
-  * const context = await inferContextFromDocumentContent('/docs/api.md', 'API Guide', chunks);
-  * // Returns: { area: 'Backend', dominantTech: ['API', 'REST'], keywords: [...], ... }
-  */
- export async function inferContextFromDocumentContent(docPath, h1Content, chunksSample = []) {
-   const context = {
-     area: 'Unknown',
-     keywords: [],
-     dominantTech: [],
-     isGeneralPurposeReadmeStyle: false,
-     docPath: docPath,
-   };
-
-   const lowerDocPath = docPath.toLowerCase();
-   const lowerH1 = (h1Content || '').toLowerCase();
-
-   // 1. Prepare and Prioritize Text for Analysis
-   let combinedChunkText = '';
-   let charCount = 0;
-   const MAX_CHARS_FROM_CHUNKS = 2000;
-
-   for (const chunk of chunksSample) {
-     // Iterate over potentially all sample chunks from findSimilarCode
-     if (charCount >= MAX_CHARS_FROM_CHUNKS) break;
-     const chunkContentLower = (chunk.content || '').toLowerCase();
-     const chunkHeadingLower = (chunk.heading_text || '').toLowerCase();
-     let textToAppend = '';
-     if (chunkHeadingLower && chunkHeadingLower !== lowerH1) {
-       textToAppend += chunkHeadingLower + ' ';
-     }
-     textToAppend += chunkContentLower;
-
-     combinedChunkText += ' ' + textToAppend.substring(0, MAX_CHARS_FROM_CHUNKS - charCount);
-     charCount += textToAppend.length;
-   }
-
-   const lowerDocPathFilename = path.basename(lowerDocPath).replace(/\.(md|rst|txt|mdx)$/i, '');
-   // Give H1 significant weight, also include filename (cleaned of hyphens)
-   let primaryTextForAnalysis = `${lowerH1} ${lowerH1} ${lowerDocPathFilename.replace(/-/g, ' ')}`;
-   let fullTextForAnalysis = `${primaryTextForAnalysis} ${combinedChunkText}`.replace(/\s+/g, ' ').trim();
-
-   if (!fullTextForAnalysis.trim()) {
-     // If absolutely no text content after H1, filename, and chunks
-     if (lowerDocPath)
-       fullTextForAnalysis = lowerDocPath; // Fallback to path for keyword extraction if all else fails
-     else {
-       context.area = 'UndeterminedByContent';
-       return context; // Early exit if no text to analyze at all
-     }
-   }
-
-   try {
-     // Initialize classifier if needed
-     await openClassifier.initialize();
-
-     // --- 2. Use Open-Ended Classification ---
-     const classification = await openClassifier.classifyDocument(fullTextForAnalysis);
-
-     // Extract technologies directly from the classification
-     context.dominantTech = classification.technologies.filter((t) => t.confidence >= 0.35).map((t) => t.technology);
-
-     // --- 3. Area Inference based on domains and technologies ---
-     let areaScore = {
-       Frontend: 0,
-       Backend: 0,
-       FullStack: 0,
-       Database: 0,
-       DevOps: 0,
-       Testing: 0,
-       Security: 0,
-       Architecture: 0,
-       ToolingInternal: 0,
-       GeneralProjectDoc: 0,
-       Unknown: 0,
-     };
-
-     // Score based on domains
-     classification.domains.forEach((domain) => {
-       const domainLower = domain.domain.toLowerCase();
-       const confidence = domain.confidence;
-
-       if (domainLower.includes('frontend') || domainLower.includes('ui/ux')) {
-         areaScore['Frontend'] += confidence;
-       }
-       if (domainLower.includes('backend') || domainLower.includes('api')) {
-         areaScore['Backend'] += confidence;
-       }
-       if (domainLower.includes('database') || domainLower.includes('data')) {
-         areaScore['Database'] += confidence;
-       }
-       if (domainLower.includes('devops') || domainLower.includes('infrastructure')) {
-         areaScore['DevOps'] += confidence;
-       }
-       if (domainLower.includes('testing') || domainLower.includes('qa')) {
-         areaScore['Testing'] += confidence;
-       }
-       if (domainLower.includes('security')) {
-         areaScore['Security'] += confidence;
-       }
-       if (domainLower.includes('architecture')) {
-         areaScore['Architecture'] += confidence;
-       }
-       if (domainLower.includes('tooling') || domainLower.includes('developer tools')) {
-         areaScore['ToolingInternal'] += confidence;
-       }
-       if (domainLower.includes('general')) {
-         areaScore['GeneralProjectDoc'] += confidence * 0.5;
-       }
-     });
-
-     // Score based on detected technologies
-     context.dominantTech.forEach((tech) => {
-       const techLower = tech.toLowerCase();
-       if (techLower.includes('react') || techLower.includes('vue') || techLower.includes('angular')) {
-         areaScore['Frontend'] += 0.3;
-       }
-       if (techLower.includes('node') || techLower.includes('express') || techLower.includes('django')) {
-         areaScore['Backend'] += 0.3;
-       }
-       if (techLower.includes('postgres') || techLower.includes('mysql') || techLower.includes('mongodb')) {
-         areaScore['Database'] += 0.3;
-       }
-       if (techLower.includes('docker') || techLower.includes('kubernetes') || techLower.includes('terraform')) {
-         areaScore['DevOps'] += 0.3;
-       }
-       if (techLower.includes('jest') || techLower.includes('pytest') || techLower.includes('testing')) {
-         areaScore['Testing'] += 0.3;
-       }
-     });
-
-     // Apply path-based hints as additional scoring
-     if (
-       lowerDocPath.includes('/tools/') ||
-       lowerDocPath.includes('/scripts/') ||
-       lowerDocPath.includes('/cli/') ||
-       lowerH1.includes(' cli') ||
-       lowerH1.includes(' tool')
-     ) {
-       areaScore['ToolingInternal'] += 0.5;
-     }
-     if (
-       lowerDocPath.includes('/api/') ||
-       lowerDocPath.includes('/server/') ||
-       lowerDocPath.includes('/db/') ||
-       lowerDocPath.includes('/backend/') ||
-       lowerH1.includes(' api') ||
-       lowerH1.includes(' server') ||
-       lowerH1.includes(' backend')
-     ) {
-       areaScore['Backend'] += 0.5;
-     }
-     if (
-       lowerDocPath.includes('/frontend/') ||
-       lowerDocPath.includes('/ui/') ||
-       lowerDocPath.includes('/components/') ||
-       lowerDocPath.includes('/views/') ||
-       lowerDocPath.includes('/pages/') ||
-       lowerH1.includes(' frontend') ||
-       lowerH1.includes(' user interface')
-     ) {
-       areaScore['Frontend'] += 0.5;
-     }
-     if (
-       lowerDocPath.endsWith('readme.md') ||
-       lowerDocPath.endsWith('runbook.md') ||
-       lowerDocPath.endsWith('contributing.md') ||
-       lowerDocPath.endsWith('changelog.md')
-     ) {
-       areaScore['GeneralProjectDoc'] += 0.5;
-     }
-
-     // Find the area with the highest score
-     let maxScore = 0;
-     let selectedArea = 'Unknown';
-     Object.entries(areaScore).forEach(([area, score]) => {
-       if (score > maxScore) {
-         maxScore = score;
-         selectedArea = area;
-       }
-     });
-
-     // Set threshold for area selection
-     if (maxScore >= 0.4) {
-       context.area = selectedArea;
-     } else {
-       context.area = 'Unknown';
-     }
-
-     // --- isGeneralPurposeReadmeStyle ---
-     let readmeStylePoints = 0;
-     const readmeKeywords = {
-       'getting started': 2,
-       installation: 2,
-       setup: 2,
-       'how to run': 2,
-       usage: 1,
-       configuration: 1,
-       deployment: 1,
-       troubleshooting: 1,
-       prerequisites: 1,
-       'table of contents': 1,
-       contributing: 0.5,
-       license: 0.5,
-       overview: 1,
-       introduction: 1,
-       purpose: 1,
-       'project structure': 0.5,
-     };
-     for (const keyword in readmeKeywords) {
-       if (fullTextForAnalysis.includes(keyword)) {
-         readmeStylePoints += readmeKeywords[keyword];
-       }
-     }
-     const isRootFile = !lowerDocPath.substring(0, lowerDocPath.lastIndexOf('/')).includes('/');
-     if ((isRootFile && lowerDocPath.startsWith('readme') && readmeStylePoints >= 3) || readmeStylePoints >= 5) {
-       context.isGeneralPurposeReadmeStyle = true;
-     }
-     // If classified as a general project doc, it usually has readme style.
-     if (context.area === 'GeneralProjectDoc') {
-       context.isGeneralPurposeReadmeStyle = true;
-     }
-     // Tooling READMEs are often general purpose style.
-     if (context.area === 'ToolingInternal' && lowerDocPath.includes('readme') && readmeStylePoints >= 2) {
-       context.isGeneralPurposeReadmeStyle = true;
-     }
-
-     // --- Extract Keywords ---
-     // Add technologies as keywords
-     context.keywords.push(...context.dominantTech.map((t) => t.toLowerCase()));
-
-     // Extract keywords from H1
-     if (lowerH1) {
-       lowerH1
-         .split(/[^a-z0-9-]+/g)
-         .filter(
-           (word) => word.length > 3 && !['the', 'for', 'and', 'with', 'into', 'about', 'using', 'docs', 'this', 'that'].includes(word)
-         )
-         .slice(0, 5)
-         .forEach((kw) => context.keywords.push(kw));
-     }
-
-     // Add domain-based keywords
-     classification.domains.slice(0, 3).forEach((domain) => {
-       const words = domain.domain.toLowerCase().split(/[\s\-/]+/);
-       words.forEach((word) => {
-         if (word.length > 3 && !context.keywords.includes(word)) {
-           context.keywords.push(word);
-         }
-       });
-     });
-
-     // Remove duplicates and limit
-     context.keywords = [...new Set(context.keywords)].slice(0, 15);
-   } catch (error) {
-     console.error('Error in automatic zero-shot classification:', error);
-
-     // Fallback to basic keyword extraction
-     context.area = 'Unknown';
-     context.dominantTech = [];
-
-     // Extract basic keywords from text
-     const words = fullTextForAnalysis.toLowerCase().split(/\s+/);
-     const wordFreq = {};
-     words.forEach((word) => {
-       if (word.length > 4 && !['the', 'and', 'for', 'with', 'this', 'that', 'from', 'into'].includes(word)) {
-         wordFreq[word] = (wordFreq[word] || 0) + 1;
-       }
-     });
-
-     // Sort by frequency and take top keywords
-     context.keywords = Object.entries(wordFreq)
-       .sort((a, b) => b[1] - a[1])
-       .slice(0, 15)
-       .map(([word]) => word);
-   }
-
-   return context;
- }
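For a sense of what the removed heuristics returned, here is a small sketch built on the module's own JSDoc example; the import specifier is a placeholder, and the commented output simply traces the keyword checks above:

```js
import { inferContextFromCodeContent } from './utils/context-inference.js'; // placeholder import path

const snippet = `
import React, { useState } from 'react';
export function Counter(props) {
  const [count, setCount] = useState(0);
  return <button onClick={() => setCount(count + 1)}>{count}</button>;
}
`;

// 'react' and 'usestate' trip the Frontend branch; the common-word scan
// then picks up 'function', 'state', and 'props' as keywords.
const context = inferContextFromCodeContent(snippet, 'javascript');
console.log(context);
// → { area: 'Frontend', keywords: ['function', 'state', 'props'], dominantTech: ['React'] }
```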