codecritique 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +82 -114
  2. package/package.json +10 -9
  3. package/src/content-retrieval.test.js +775 -0
  4. package/src/custom-documents.test.js +440 -0
  5. package/src/feedback-loader.test.js +529 -0
  6. package/src/llm.test.js +256 -0
  7. package/src/project-analyzer.test.js +747 -0
  8. package/src/rag-analyzer.js +12 -0
  9. package/src/rag-analyzer.test.js +1109 -0
  10. package/src/rag-review.test.js +317 -0
  11. package/src/setupTests.js +131 -0
  12. package/src/zero-shot-classifier-open.test.js +278 -0
  13. package/src/embeddings/cache-manager.js +0 -364
  14. package/src/embeddings/constants.js +0 -40
  15. package/src/embeddings/database.js +0 -921
  16. package/src/embeddings/errors.js +0 -208
  17. package/src/embeddings/factory.js +0 -447
  18. package/src/embeddings/file-processor.js +0 -851
  19. package/src/embeddings/model-manager.js +0 -337
  20. package/src/embeddings/similarity-calculator.js +0 -97
  21. package/src/embeddings/types.js +0 -113
  22. package/src/pr-history/analyzer.js +0 -579
  23. package/src/pr-history/bot-detector.js +0 -123
  24. package/src/pr-history/cli-utils.js +0 -204
  25. package/src/pr-history/comment-processor.js +0 -549
  26. package/src/pr-history/database.js +0 -819
  27. package/src/pr-history/github-client.js +0 -629
  28. package/src/technology-keywords.json +0 -753
  29. package/src/utils/command.js +0 -48
  30. package/src/utils/constants.js +0 -263
  31. package/src/utils/context-inference.js +0 -364
  32. package/src/utils/document-detection.js +0 -105
  33. package/src/utils/file-validation.js +0 -271
  34. package/src/utils/git.js +0 -232
  35. package/src/utils/language-detection.js +0 -170
  36. package/src/utils/logging.js +0 -24
  37. package/src/utils/markdown.js +0 -132
  38. package/src/utils/mobilebert-tokenizer.js +0 -141
  39. package/src/utils/pr-chunking.js +0 -276
  40. package/src/utils/string-utils.js +0 -28
@@ -1,48 +0,0 @@
1
- /**
2
- * Command Execution Module
3
- *
4
- * This module provides utilities for safely executing shell commands,
5
- * particularly focused on git operations with proper argument escaping
6
- * to prevent command injection attacks.
7
- */
8
-
9
- import { execSync } from 'child_process';
10
-
/**
 * Quote a single value so that a POSIX shell treats it as one literal argument.
 *
 * The value is wrapped in single quotes, which preserve everything literally. An embedded
 * single quote is written as `'\''`: close the quoted string, add an escaped quote, and
 * reopen the quoted string.
 *
 * @param {string|number|bigint} arg - The argument to escape. Finite numbers and bigints are
 *   converted with String() so numeric arguments are not silently dropped.
 * @returns {string} The quoted argument. Returns `''` (an empty quoted string) for the empty
 *   string, null, undefined, non-finite numbers and every other non-string value.
 *
 * @example
 * const safeArg = escapeShellArg("user's file.txt");
 * // Returns: 'user'\''s file.txt'
 */
function escapeShellArg(arg) {
  let text = '';
  if (typeof arg === 'string') {
    text = arg;
  } else if (typeof arg === 'bigint' || (typeof arg === 'number' && Number.isFinite(arg))) {
    text = String(arg);
  }

  // An empty `text` yields '' so the argument position is still occupied on the command line.
  return "'" + text.replace(/'/g, "'\\''") + "'";
}
30
-
/**
 * Run a git command through execSync with every argument shell-escaped.
 *
 * Only the entries of `args` are escaped (via escapeShellArg). `baseCommand` is placed on
 * the command line verbatim, so it must be a trusted literal.
 *
 * @param {string} baseCommand - The base git command (e.g., 'git show')
 * @param {Array<string>} args - Arguments to escape and append, separated by single spaces
 * @param {Object} options - Options passed straight through to execSync
 * @returns {Buffer|string} Whatever execSync returns; per the Node.js documentation this is a
 *   Buffer unless an `encoding` option is supplied.
 *
 * @example
 * const result = execGitSafe('git show', ['HEAD~1', 'src/file.js'], { cwd: '/path/to/repo' });
 *
 * @throws {Error} If the command execution fails
 */
export function execGitSafe(baseCommand, args = [], options = {}) {
  const argumentSuffix = args.map((value) => escapeShellArg(value)).join(' ');

  // With nothing to append, run the base command exactly as given (no trailing space).
  if (!argumentSuffix) {
    return execSync(baseCommand, options);
  }

  return execSync(`${baseCommand} ${argumentSuffix}`, options);
}
@@ -1,263 +0,0 @@
1
- /**
2
- * Constants Module
3
- *
4
- * This module provides shared constants for file extensions, patterns,
5
- * and other configuration values used throughout the utility modules.
6
- */
7
-
/**
 * Extension to language mapping
 * This is the single source of truth for supported file types and their languages.
 *
 * Written as ordered `[language, extensions]` groups and expanded into a flat
 * `{ extension: language }` object. Key order follows the order of the groups below,
 * which matters to anything derived from Object.keys() of this map.
 * @type {Object.<string, string>}
 */
export const EXTENSION_TO_LANGUAGE_MAP = Object.fromEntries(
  [
    // JavaScript and variants
    ['javascript', ['.js', '.jsx', '.mjs', '.cjs']],

    // TypeScript and variants
    ['typescript', ['.ts', '.tsx', '.mts', '.cts', '.d.ts']],

    // Web technologies
    ['html', ['.html', '.htm']],
    ['css', ['.css']],
    ['scss', ['.scss']],
    ['sass', ['.sass']],
    ['less', ['.less']],
    ['svg', ['.svg']],

    // Configuration files
    ['json', ['.json']],
    ['yaml', ['.yaml', '.yml']],
    ['toml', ['.toml']],
    ['xml', ['.xml']],

    // Documentation
    ['markdown', ['.md', '.mdx', '.markdown']],
    ['restructuredtext', ['.rst']],
    ['asciidoc', ['.adoc']],
    ['text', ['.txt']],

    // Python
    ['python', ['.py', '.pyi']],
    ['jupyter', ['.ipynb']],

    // Ruby
    ['ruby', ['.rb', '.erb', '.rake']],

    // PHP
    ['php', ['.php', '.phtml']],

    // Java and JVM languages
    ['java', ['.java']],
    ['kotlin', ['.kt', '.kts']],
    ['groovy', ['.groovy']],
    ['scala', ['.scala']],

    // C-family languages
    ['c', ['.c', '.h']],
    ['cpp', ['.cpp', '.cc', '.cxx', '.hpp', '.c++', '.h++']],
    ['csharp', ['.cs']],

    // Go
    ['go', ['.go']],

    // Rust
    ['rust', ['.rs']],

    // Swift
    ['swift', ['.swift']],

    // Shell scripts
    ['bash', ['.sh', '.bash']],
    ['zsh', ['.zsh']],
    ['fish', ['.fish']],

    // Other languages
    ['perl', ['.pl', '.pm']],
    ['lua', ['.lua']],
    ['r', ['.r']],
    ['dart', ['.dart']],
    ['elixir', ['.ex', '.exs']],
    ['erlang', ['.erl', '.hrl']],
    ['clojure', ['.clj', '.cljs']],
    ['haskell', ['.hs', '.lhs']],

    // GraphQL
    ['graphql', ['.graphql', '.gql']],

    // Frameworks
    ['vue', ['.vue']],
    ['svelte', ['.svelte']],
    ['astro', ['.astro']],
    ['prisma', ['.prisma']],
  ].flatMap(([language, extensions]) => extensions.map((extension) => [extension, language]))
);
123
-
/**
 * Every file extension that has an entry in EXTENSION_TO_LANGUAGE_MAP,
 * in the same order as the map's keys.
 * @type {string[]}
 */
export const ALL_SUPPORTED_EXTENSIONS = Object.entries(EXTENSION_TO_LANGUAGE_MAP).map(([extension]) => extension);
129
-
/**
 * Supported extensions whose mapped language is a documentation format
 * (markdown, restructuredtext, asciidoc or text).
 * @type {string[]}
 */
export const DOCUMENTATION_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS.filter((extension) => {
  switch (EXTENSION_TO_LANGUAGE_MAP[extension]) {
    case 'markdown':
    case 'restructuredtext':
    case 'asciidoc':
    case 'text':
      return true;
    default:
      return false;
  }
});
138
-
/**
 * Supported extensions that are not documentation extensions,
 * i.e. ALL_SUPPORTED_EXTENSIONS minus DOCUMENTATION_EXTENSIONS.
 * @type {string[]}
 */
export const CODE_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS.filter((extension) => {
  const isDocumentation = DOCUMENTATION_EXTENSIONS.indexOf(extension) !== -1;
  return !isDocumentation;
});
144
-
/**
 * Binary file extensions that should be skipped during processing.
 * Grouped by kind for readability; the exported value is one flat array.
 * @type {string[]}
 */
export const BINARY_EXTENSIONS = [
  // Images
  ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.ico', '.webp'],
  // Office and PDF documents
  ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx'],
  // Archives
  ['.zip', '.tar', '.gz', '.7z', '.rar'],
  // Executables and shared libraries
  ['.exe', '.dll', '.so', '.dylib'],
  // Fonts
  ['.ttf', '.otf', '.woff', '.woff2'],
  // Audio and video
  ['.mp3', '.mp4', '.avi', '.mov', '.wav'],
].flat();
183
-
/**
 * Directory names that file processing should skip.
 * @type {string[]}
 */
export const SKIP_DIRECTORIES = [
  'node_modules',
  'dist',
  'build',
  '.git',
  'coverage',
  'vendor',
];
189
-
/**
 * Exact file names to skip during processing: lock files, package manifests and
 * common config files, none of which are useful as code examples.
 * Grouped by kind for readability; the exported value is one flat array.
 * @type {string[]}
 */
export const SKIP_FILENAMES = [
  // Lock files
  ['package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', 'composer.lock', 'Gemfile.lock'],
  // Package manifests (config, not source code)
  [
    'package.json',
    'composer.json',
    'Gemfile',
    'Cargo.toml',
    'go.mod',
    'go.sum',
    'requirements.txt',
    'pyproject.toml',
    'pom.xml',
    'build.gradle',
  ],
  // Common config files (not useful as code examples)
  [
    'tsconfig.json',
    'jsconfig.json',
    '.eslintrc',
    '.eslintrc.json',
    '.eslintrc.js',
    '.prettierrc',
    '.prettierrc.json',
    'prettier.config.js',
    '.babelrc',
    'babel.config.js',
    'jest.config.js',
    'jest.config.ts',
    'vitest.config.ts',
    'vitest.config.js',
    'webpack.config.js',
    'vite.config.js',
    'vite.config.ts',
    'rollup.config.js',
    'Makefile',
    'Dockerfile',
    '.dockerignore',
    '.gitignore',
    '.gitattributes',
    '.editorconfig',
    '.env.example',
    '.nvmrc',
    '.node-version',
  ],
].flat();
241
-
/**
 * File name patterns to skip during processing (likely generated files and
 * config files). Grouped by kind; the exported value is one flat array.
 * @type {RegExp[]}
 */
export const SKIP_FILE_PATTERNS = [
  // Minified, bundled, generated, declaration and snapshot files
  [/\.min\.(js|css)$/, /\.bundle\.(js|css)$/, /\.generated\./, /\.d\.ts$/, /\.snap$/],
  // Config file patterns
  [
    /^\..*rc$/, // .eslintrc, .prettierrc, etc.
    /^\..*rc\.json$/, // .eslintrc.json, etc.
    /\.config\.(js|ts|mjs|cjs)$/, // *.config.js, *.config.ts files
  ],
].flat();
257
-
/**
 * Case-insensitive pattern for detecting generic documentation files.
 * It matches one of the listed document names when the name is followed by `.md`
 * or sits at the very end of the tested string.
 * Shared between different modules for consistency.
 * @type {RegExp}
 */
export const GENERIC_DOC_REGEX = new RegExp(
  `(${['README', 'RUNBOOK', 'CONTRIBUTING', 'CHANGELOG', 'LICENSE', 'SETUP', 'INSTALL'].join('|')})(\\.md|$)`,
  'i'
);
@@ -1,364 +0,0 @@
1
- /**
2
- * Context Inference Module
3
- *
4
- * This module provides utilities for inferring context from code and document content,
5
- * including technology detection, area classification, and semantic analysis.
6
- */
7
-
8
- import path from 'path';
9
- import { openClassifier } from '../zero-shot-classifier-open.js';
10
-
/**
 * Infer context from code content using heuristic (substring-based) analysis.
 *
 * The content is lowercased and searched for well-known markers. Only JavaScript,
 * TypeScript and Python have area heuristics; any other language leaves `area` as 'Unknown'.
 * Missing or non-string content is treated as empty instead of throwing.
 *
 * @param {string} codeContent - The code content to analyze
 * @param {string} language - The detected programming language
 * @returns {{area: string, keywords: string[], dominantTech: string[]}} Context information:
 *   - `area`: "Frontend" | "Backend" | "GeneralJS_TS" | "GeneralPython" | "Unknown"
 *   - `keywords`: the common technical words found in the content
 *   - `dominantTech`: the frameworks/runtimes detected
 *
 * @example
 * const context = inferContextFromCodeContent('import React from "react"', 'javascript');
 * // Returns: { area: 'Frontend', keywords: [], dominantTech: ['React'] }
 */
export function inferContextFromCodeContent(codeContent, language) {
  const context = {
    area: 'Unknown',
    keywords: [],
    dominantTech: [],
  };
  const lowerCode = typeof codeContent === 'string' ? codeContent.toLowerCase() : '';
  const mentions = (needle) => lowerCode.includes(needle);

  // Area inference (very basic for now)
  if (language === 'javascript' || language === 'typescript') {
    const frontendMarkers = ['react', 'usestate', 'useeffect', 'angular', 'vue', 'document.getelementbyid', 'jsx', '.tsx'];
    const backendMarkers = ['http.createserver', 'fs.readfilesync', 'process.env'];
    // Recognise CommonJS requires with any quote style as well as ESM `from 'express'` imports.
    const expressImport = /require\(\s*['"`]express['"`]\s*\)|from\s+['"`]express['"`]/;

    // Frontend markers are checked first, so they win when both kinds are present.
    if (frontendMarkers.some(mentions)) {
      context.area = 'Frontend';
      if (mentions('react')) context.dominantTech.push('React');
      if (mentions('angular')) context.dominantTech.push('Angular');
      if (mentions('vue')) context.dominantTech.push('Vue');
    } else if (expressImport.test(lowerCode) || backendMarkers.some(mentions)) {
      context.area = 'Backend';
      context.dominantTech.push(mentions('express') ? 'Node.js/Express' : 'Node.js');
    } else {
      context.area = 'GeneralJS_TS';
    }
  } else if (language === 'python') {
    if (mentions('django') || mentions('flask')) {
      context.area = 'Backend';
      if (mentions('django')) context.dominantTech.push('Django');
      if (mentions('flask')) context.dominantTech.push('Flask');
    } else {
      context.area = 'GeneralPython'; // Or just "Backend"
    }
  }
  // Add more language-specific heuristics here

  // The word list has no duplicates, so the filtered result needs no de-duplication.
  const commonTechWords = ['api', 'component', 'module', 'function', 'class', 'hook', 'service', 'database', 'query', 'state', 'props'];
  context.keywords = commonTechWords.filter(mentions);

  return context;
}
78
-
/**
 * Infer context from document content using zero-shot classification plus path/heading hints.
 *
 * Steps:
 *  1. Build a lowercase analysis text from the H1 (repeated for weight), the file name and up to
 *     2000 characters of the sample chunks.
 *  2. Classify that text with `openClassifier` and keep technologies with confidence >= 0.35.
 *  3. Score candidate areas from the classified domains, the detected technologies and
 *     path/H1 hints; the best area is used only if its score reaches 0.4.
 *  4. Decide whether the document looks like a general-purpose README and collect keywords.
 *
 * If classification throws, the error is logged and the function falls back to
 * frequency-based keywords with `area: 'Unknown'` and no technologies.
 * Missing or non-string `docPath`/`h1Content`, a non-array `chunksSample` and empty chunk
 * entries are treated as empty input instead of throwing.
 *
 * @param {string} docPath - Path to the document
 * @param {string} h1Content - H1 heading content
 * @param {Array} chunksSample - Sample chunks from the document; each chunk's optional
 *   `content` and `heading_text` strings are read
 * @returns {Promise<Object>} Context with `area`, `keywords`, `dominantTech`,
 *   `isGeneralPurposeReadmeStyle` and the original `docPath`. `area` is
 *   'UndeterminedByContent' when there is no text and no path to analyze.
 *
 * @example
 * const context = await inferContextFromDocumentContent('/docs/api.md', 'API Guide', chunks);
 * // context.area, context.dominantTech and context.keywords describe the document
 */
export async function inferContextFromDocumentContent(docPath, h1Content, chunksSample = []) {
  const context = {
    area: 'Unknown',
    keywords: [],
    dominantTech: [],
    isGeneralPurposeReadmeStyle: false,
    docPath: docPath,
  };

  // All matching below is done on lowercase text; anything that is not a string counts as empty.
  const toLower = (value) => (typeof value === 'string' ? value.toLowerCase() : '');
  const lowerDocPath = toLower(docPath);
  const lowerH1 = toLower(h1Content);

  // 1. Prepare and Prioritize Text for Analysis
  const MAX_CHARS_FROM_CHUNKS = 2000;
  let combinedChunkText = '';
  let charCount = 0;

  for (const chunk of Array.isArray(chunksSample) ? chunksSample : []) {
    if (charCount >= MAX_CHARS_FROM_CHUNKS) break;
    const chunkContentLower = toLower(chunk && chunk.content);
    const chunkHeadingLower = toLower(chunk && chunk.heading_text);
    let textToAppend = '';
    // Skip a chunk heading that merely repeats the H1, which is already weighted below.
    if (chunkHeadingLower && chunkHeadingLower !== lowerH1) {
      textToAppend += chunkHeadingLower + ' ';
    }
    textToAppend += chunkContentLower;

    combinedChunkText += ' ' + textToAppend.substring(0, MAX_CHARS_FROM_CHUNKS - charCount);
    charCount += textToAppend.length;
  }

  const lowerDocPathFilename = path.basename(lowerDocPath).replace(/\.(md|rst|txt|mdx)$/i, '');
  // Give H1 significant weight, also include filename (cleaned of hyphens)
  const primaryTextForAnalysis = `${lowerH1} ${lowerH1} ${lowerDocPathFilename.replace(/-/g, ' ')}`;
  let fullTextForAnalysis = `${primaryTextForAnalysis} ${combinedChunkText}`.replace(/\s+/g, ' ').trim();

  if (!fullTextForAnalysis) {
    // No text at all after H1, filename and chunks
    if (lowerDocPath) {
      fullTextForAnalysis = lowerDocPath; // Fallback to path for keyword extraction if all else fails
    } else {
      context.area = 'UndeterminedByContent';
      return context; // Early exit if no text to analyze at all
    }
  }

  try {
    // Initialize classifier if needed
    await openClassifier.initialize();

    // --- 2. Use Open-Ended Classification ---
    const classification = await openClassifier.classifyDocument(fullTextForAnalysis);

    // Extract technologies directly from the classification
    context.dominantTech = classification.technologies.filter((t) => t.confidence >= 0.35).map((t) => t.technology);

    // --- 3. Area Inference based on domains and technologies ---
    // Key order matters: on a tie the earlier area wins (strict `>` comparison below).
    const areaScore = {
      Frontend: 0,
      Backend: 0,
      FullStack: 0,
      Database: 0,
      DevOps: 0,
      Testing: 0,
      Security: 0,
      Architecture: 0,
      ToolingInternal: 0,
      GeneralProjectDoc: 0,
      Unknown: 0,
    };
    const includesAny = (text, needles) => needles.some((needle) => text.includes(needle));

    // Score based on domains: each matching area gains the domain's confidence.
    const domainRules = [
      ['Frontend', ['frontend', 'ui/ux']],
      ['Backend', ['backend', 'api']],
      ['Database', ['database', 'data']],
      ['DevOps', ['devops', 'infrastructure']],
      ['Testing', ['testing', 'qa']],
      ['Security', ['security']],
      ['Architecture', ['architecture']],
      ['ToolingInternal', ['tooling', 'developer tools']],
    ];
    classification.domains.forEach((domain) => {
      const domainLower = domain.domain.toLowerCase();
      const confidence = domain.confidence;

      for (const [area, needles] of domainRules) {
        if (includesAny(domainLower, needles)) {
          areaScore[area] += confidence;
        }
      }
      // "General" domains count for only half of their confidence.
      if (domainLower.includes('general')) {
        areaScore['GeneralProjectDoc'] += confidence * 0.5;
      }
    });

    // Score based on detected technologies: a fixed 0.3 per matching technology.
    const techRules = [
      ['Frontend', ['react', 'vue', 'angular']],
      ['Backend', ['node', 'express', 'django']],
      ['Database', ['postgres', 'mysql', 'mongodb']],
      ['DevOps', ['docker', 'kubernetes', 'terraform']],
      ['Testing', ['jest', 'pytest', 'testing']],
    ];
    context.dominantTech.forEach((tech) => {
      const techLower = tech.toLowerCase();
      for (const [area, needles] of techRules) {
        if (includesAny(techLower, needles)) {
          areaScore[area] += 0.3;
        }
      }
    });

    // Apply path-based hints as additional scoring
    if (includesAny(lowerDocPath, ['/tools/', '/scripts/', '/cli/']) || includesAny(lowerH1, [' cli', ' tool'])) {
      areaScore['ToolingInternal'] += 0.5;
    }
    if (
      includesAny(lowerDocPath, ['/api/', '/server/', '/db/', '/backend/']) ||
      includesAny(lowerH1, [' api', ' server', ' backend'])
    ) {
      areaScore['Backend'] += 0.5;
    }
    if (
      includesAny(lowerDocPath, ['/frontend/', '/ui/', '/components/', '/views/', '/pages/']) ||
      includesAny(lowerH1, [' frontend', ' user interface'])
    ) {
      areaScore['Frontend'] += 0.5;
    }
    if (['readme.md', 'runbook.md', 'contributing.md', 'changelog.md'].some((name) => lowerDocPath.endsWith(name))) {
      areaScore['GeneralProjectDoc'] += 0.5;
    }

    // Find the area with the highest score
    let maxScore = 0;
    let selectedArea = 'Unknown';
    Object.entries(areaScore).forEach(([area, score]) => {
      if (score > maxScore) {
        maxScore = score;
        selectedArea = area;
      }
    });

    // Set threshold for area selection
    context.area = maxScore >= 0.4 ? selectedArea : 'Unknown';

    // --- isGeneralPurposeReadmeStyle ---
    let readmeStylePoints = 0;
    const readmeKeywords = {
      'getting started': 2,
      installation: 2,
      setup: 2,
      'how to run': 2,
      usage: 1,
      configuration: 1,
      deployment: 1,
      troubleshooting: 1,
      prerequisites: 1,
      'table of contents': 1,
      contributing: 0.5,
      license: 0.5,
      overview: 1,
      introduction: 1,
      purpose: 1,
      'project structure': 0.5,
    };
    for (const [keyword, points] of Object.entries(readmeKeywords)) {
      if (fullTextForAnalysis.includes(keyword)) {
        readmeStylePoints += points;
      }
    }
    // NOTE(review): this is true whenever the directory part of the path contains no '/', and the
    // startsWith('readme') check only passes for a bare relative file name — confirm the path
    // format callers pass in.
    const isRootFile = !lowerDocPath.substring(0, lowerDocPath.lastIndexOf('/')).includes('/');
    if ((isRootFile && lowerDocPath.startsWith('readme') && readmeStylePoints >= 3) || readmeStylePoints >= 5) {
      context.isGeneralPurposeReadmeStyle = true;
    }
    // If classified as a general project doc, it usually has readme style.
    if (context.area === 'GeneralProjectDoc') {
      context.isGeneralPurposeReadmeStyle = true;
    }
    // Tooling READMEs are often general purpose style.
    if (context.area === 'ToolingInternal' && lowerDocPath.includes('readme') && readmeStylePoints >= 2) {
      context.isGeneralPurposeReadmeStyle = true;
    }

    // --- Extract Keywords ---
    // Add technologies as keywords
    context.keywords.push(...context.dominantTech.map((t) => t.toLowerCase()));

    // Extract keywords from H1
    if (lowerH1) {
      lowerH1
        .split(/[^a-z0-9-]+/g)
        .filter(
          (word) => word.length > 3 && !['the', 'for', 'and', 'with', 'into', 'about', 'using', 'docs', 'this', 'that'].includes(word)
        )
        .slice(0, 5)
        .forEach((kw) => context.keywords.push(kw));
    }

    // Add domain-based keywords
    classification.domains.slice(0, 3).forEach((domain) => {
      const words = domain.domain.toLowerCase().split(/[\s\-/]+/);
      words.forEach((word) => {
        if (word.length > 3 && !context.keywords.includes(word)) {
          context.keywords.push(word);
        }
      });
    });

    // Remove duplicates and limit
    context.keywords = [...new Set(context.keywords)].slice(0, 15);
  } catch (error) {
    console.error('Error in automatic zero-shot classification:', error);

    // Fallback to basic keyword extraction
    context.area = 'Unknown';
    context.dominantTech = [];

    // Count words in a Map rather than a plain object: a word such as "constructor" would
    // otherwise collide with an inherited Object.prototype property and corrupt the counts.
    const wordCounts = new Map();
    for (const word of fullTextForAnalysis.toLowerCase().split(/\s+/)) {
      if (word.length > 4 && !['the', 'and', 'for', 'with', 'this', 'that', 'from', 'into'].includes(word)) {
        wordCounts.set(word, (wordCounts.get(word) || 0) + 1);
      }
    }

    // Sort by frequency and take top keywords
    context.keywords = [...wordCounts.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, 15)
      .map(([word]) => word);
  }

  return context;
}