agent-security-scanner-mcp 4.3.0 → 4.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,301 @@
1
+ /**
2
+ * Semantic Analysis Integration
3
+ *
4
+ * Integrates semantic analyzer into existing scan pipeline
5
+ */
6
+
7
+ import { SemanticAnalyzer } from './semantic-analyzer.js';
8
+ import { readFileSync } from 'fs';
9
+ import { execFileSync } from 'child_process';
10
+ import path from 'path';
11
+ import { fileURLToPath } from 'url';
12
+ import { resolvePythonCommand, pythonArgs } from './python.js';
13
+
14
+ const __filename = fileURLToPath(import.meta.url);
15
+ const __dirname = path.dirname(__filename);
16
+
17
/**
 * Analyze a single file with the semantic engine.
 *
 * The AST comes from the Python parser. If no AST is returned, if the AST
 * carries an `error` field, or if anything throws along the way, the problem
 * is logged to stderr and an empty list is returned (this function does not
 * reject).
 *
 * @param {string} filePath - Path to file to analyze
 * @param {object} options - Analysis options (not read by this function)
 * @returns {Promise<Array>} Findings converted to the standard result shape
 */
export async function runSemanticAnalysis(filePath, options = {}) {
  try {
    const ast = await getASTFromPython(filePath);

    if (!ast || ast.error) {
      console.error(`[SEMANTIC] Failed to get AST: ${ast?.error || 'unknown error'}`);
      return [];
    }

    const language = detectLanguage(filePath);
    const rawFindings = new SemanticAnalyzer(ast, language, filePath).analyze();

    // Converts one analyzer finding into the standard result record.
    const toStandardShape = (raw) => {
      const line = extractLineNumber(raw, ast);
      const severity = mapSeverity(raw.severity);
      const confidenceText = raw.confidence ? String(raw.confidence) : 'MEDIUM';

      return {
        ruleId: raw.ruleId,
        message: raw.message,
        line,
        column: 0,
        length: 0,
        severity,
        confidence: confidenceText.toUpperCase(),
        // The raw finding is spread last, so its own keys take precedence
        // over the two defaults listed before it.
        metadata: {
          category: raw.category,
          engine: 'semantic',
          ...raw
        },
        engine: 'semantic'
      };
    };

    return rawFindings.map((raw) => toStandardShape(raw));
  } catch (failure) {
    console.error(`[SEMANTIC] Analysis error: ${failure.message}`);
    return [];
  }
}
63
+
64
/**
 * Obtain the AST for a file from the bundled Python parser.
 *
 * Runs `ast_parser.py --ast-only <filePath>` synchronously with the resolved
 * Python command and parses its stdout as JSON.
 *
 * @param {string} filePath - File whose AST is requested
 * @returns {Promise<*>} The parsed JSON value, or null (after logging) when
 *   the process fails, times out, or prints something that is not JSON
 */
async function getASTFromPython(filePath) {
  const execOptions = {
    encoding: 'utf8',
    maxBuffer: 10 * 1024 * 1024, // 10MB buffer
    timeout: 30000 // 30s timeout
  };

  try {
    const interpreter = resolvePythonCommand();
    const parserScript = path.join(__dirname, '..', 'ast_parser.py');
    const argv = [...pythonArgs(), parserScript, '--ast-only', filePath];

    const stdout = execFileSync(interpreter, argv, execOptions);
    return JSON.parse(stdout);
  } catch (failure) {
    console.error(`[SEMANTIC] AST extraction failed: ${failure.message}`);
    return null;
  }
}
90
+
91
/**
 * Detect the language of a file from its extension.
 *
 * The extension is compared case-insensitively. Besides the classic
 * extensions, the Node/TypeScript module variants (`.mjs`, `.cjs`, `.mts`,
 * `.cts`) are recognized so those files are not classified as 'generic'.
 *
 * @param {string} filePath - Path or file name
 * @returns {string} Language identifier, or 'generic' when the extension is
 *   missing or unknown
 */
function detectLanguage(filePath) {
  const languageByExtension = new Map([
    ['.js', 'javascript'],
    ['.jsx', 'javascript'],
    ['.mjs', 'javascript'],
    ['.cjs', 'javascript'],
    ['.ts', 'typescript'],
    ['.tsx', 'typescript'],
    ['.mts', 'typescript'],
    ['.cts', 'typescript'],
    ['.py', 'python'],
    ['.java', 'java'],
    ['.go', 'go'],
    ['.rb', 'ruby'],
    ['.php', 'php'],
    ['.cs', 'csharp'],
    ['.rs', 'rust'],
    ['.c', 'c'],
    ['.cpp', 'cpp'],
    ['.cc', 'cpp'],
    ['.h', 'c'],
    ['.hpp', 'cpp'],
  ]);

  const ext = path.extname(filePath).toLowerCase();
  return languageByExtension.get(ext) ?? 'generic';
}
117
+
118
/**
 * Resolve the 1-based source line for a finding.
 *
 * Sources are tried in this order:
 *   1. `finding.node.ast.line` (already 1-based),
 *   2. `finding.node.ast.start_point.row` (0-based, so 1 is added; row 0 is a
 *      valid value and maps to line 1),
 *   3. the second `_`-separated segment of `finding.nodeId`, when it consists
 *      only of digits.
 * Values that are not positive integers are ignored rather than returned, so
 * the result is never NaN, zero or negative.
 *
 * @param {object} finding - Finding produced by the semantic analyzer
 * @param {object} ast - AST of the analyzed file (not read by this function)
 * @returns {number} Line number, or 1 when none can be determined
 */
function extractLineNumber(finding, ast) {
  // Accepts an integer or a digit-only string; anything else yields null.
  const toInteger = (value) => {
    if (typeof value === 'number') return Number.isInteger(value) ? value : null;
    if (typeof value === 'string' && /^\d+$/.test(value)) return Number.parseInt(value, 10);
    return null;
  };

  const astNode = finding?.node?.ast;
  if (astNode) {
    const line = toInteger(astNode.line);
    if (line !== null && line > 0) return line;

    // Explicit null check: a truthiness test would drop row 0.
    const row = toInteger(astNode.start_point?.row);
    if (row !== null && row >= 0) return row + 1;
  }

  const nodeId = finding?.nodeId;
  if (typeof nodeId === 'string') {
    const lineFromId = toInteger(nodeId.split('_')[1]);
    if (lineFromId !== null && lineFromId > 0) return lineFromId;
  }

  return 1; // Default to line 1 if we can't determine
}
139
+
140
/**
 * Map an analyzer severity onto the scanner's three levels.
 *
 * The input is coerced to a string, trimmed and compared case-insensitively,
 * so a non-string severity no longer throws (a throw here would abort the
 * conversion of every finding in the caller's `.map`).
 *
 * @param {*} severity - Severity reported by the analyzer
 * @returns {'error'|'warning'|'info'} 'warning' for missing or unknown values
 */
function mapSeverity(severity) {
  if (!severity) return 'warning';

  switch (String(severity).trim().toLowerCase()) {
    case 'error':
    case 'critical':
      return 'error';
    case 'info':
    case 'note':
      return 'info';
    case 'warning':
    case 'warn':
    default:
      return 'warning';
  }
}
153
+
154
/**
 * Read and parse the bundled semantic rule file.
 *
 * The file is `rules/semantic-security.yaml`, one directory above this
 * module. It is parsed with the simplified in-file YAML parser.
 *
 * @returns {Promise<Array>} Parsed rules, or an empty list (after logging)
 *   when the file cannot be read or parsed
 */
export async function loadSemanticRules() {
  try {
    const rulesFile = path.join(__dirname, '..', 'rules', 'semantic-security.yaml');

    // Parse YAML (simplified - in production use a YAML library)
    return parseSimpleYAML(readFileSync(rulesFile, 'utf8'));
  } catch (failure) {
    console.error(`[SEMANTIC] Failed to load rules: ${failure.message}`);
    return [];
  }
}
171
+
172
/**
 * Minimal parser for semantic rule files.
 * (Not a general YAML parser - in production, use the js-yaml library.)
 *
 * The text is split at every line that starts a `- id:` list entry, at any
 * indentation. Whatever precedes the first entry (comments, a `rules:`
 * header) is ignored instead of being reported as a rule. For each entry the
 * following keys are read when present:
 *   id, languages (inline `[a, b]` form only), severity, message, and the
 *   metadata keys category, cwe, owasp and confidence.
 * Values may be bare, single-quoted or double-quoted. Surrounding quotes are
 * removed and a quoted value ends at its matching quote, so an apostrophe
 * inside a double-quoted message is kept. Block scalars (`|`, `>`) are not
 * supported and are treated as missing.
 *
 * @param {string} yamlContent - Raw contents of a rule file
 * @returns {Array<object>} Rules shaped as
 *   `{ id, languages?, severity?, message?, metadata }`; entries without an id
 *   are dropped
 */
function parseSimpleYAML(yamlContent) {
  if (typeof yamlContent !== 'string') return [];

  // Decodes the scalar at the start of `raw` (the text following `key:`).
  // With singleToken set, a bare value stops at whitespace, `,`, `}` or `]`
  // so inline maps such as `{category: x, cwe: y}` are handled.
  const parseScalar = (raw, singleToken = false) => {
    const match = raw.match(/^[ \t]*(?:"([^"\n]*)"|'([^'\n]*)'|([^\s"'#][^\n]*))/);
    if (!match) return undefined;

    const quotedText = match[1] ?? match[2];
    if (quotedText !== undefined) return quotedText;

    const bare = match[3].replace(/\s+#.*$/, '').trim(); // drop trailing comment
    if (/^[|>][+-]?\d*$/.test(bare)) return undefined; // block scalar indicator
    return singleToken ? bare.split(/[\s,}\]]/, 1)[0] : bare;
  };

  // Finds the first `key:` in the block that starts a line or follows
  // whitespace, `{` or `,`, and decodes the value on the same line.
  const readValue = (block, key, singleToken = false) => {
    const match = block.match(new RegExp(`(?:^|[\\s{,])${key}:([^\\n]*)`, 'm'));
    return match ? parseScalar(match[1], singleToken) : undefined;
  };

  const blocks = yamlContent.split(/^[ \t]*- id:/m);
  blocks.shift(); // text before the first `- id:` is preamble, not a rule

  const rules = [];
  for (const block of blocks) {
    // The id is whatever follows `- id:` on that same line.
    const lineEnd = block.indexOf('\n');
    const id = parseScalar(lineEnd === -1 ? block : block.slice(0, lineEnd));
    if (!id) continue;

    const rule = { id };

    const languageList = block.match(/(?:^|[\s{,])languages:[ \t]*\[([^\]]*)\]/m);
    if (languageList) {
      const languages = languageList[1]
        .split(',')
        .map((entry) => parseScalar(entry, true))
        .filter(Boolean);
      if (languages.length > 0) rule.languages = languages;
    }

    const severity = readValue(block, 'severity', true);
    if (severity) rule.severity = severity;

    const message = readValue(block, 'message');
    if (message) rule.message = message;

    // Extract metadata (simplified)
    rule.metadata = {};

    const category = readValue(block, 'category', true);
    if (category) rule.metadata.category = category;

    const cwe = readValue(block, 'cwe', true);
    if (cwe) rule.metadata.cwe = cwe;

    const owasp = readValue(block, 'owasp');
    if (owasp) rule.metadata.owasp = owasp;

    const confidence = readValue(block, 'confidence', true);
    if (confidence) rule.metadata.confidence = confidence;

    rules.push(rule);
  }

  return rules;
}
241
+
242
// Cached probe outcome; stays null until the first availability check runs.
let _semanticAvailable = null;

/**
 * Check if semantic analysis is available.
 *
 * On the first call, the resolved Python command is asked to import both
 * `tree_sitter` and `ast_parser`, with the directory above this module as the
 * working directory and a 10 second timeout. The outcome is cached, so later
 * calls return it without spawning a process.
 *
 * @returns {boolean} true when the import probe exits successfully
 */
export function isSemanticAnalysisAvailable() {
  if (_semanticAvailable === null) {
    let importable = true;
    try {
      const interpreter = resolvePythonCommand();
      // Import the real prerequisites instead of only asking for the Python
      // version: a bare "python --version" check was giving false positives
      // on systems without tree-sitter.
      const probeArgs = [...pythonArgs(), '-c', 'import tree_sitter; import ast_parser'];
      execFileSync(interpreter, probeArgs, {
        stdio: 'pipe',
        timeout: 10000,
        cwd: path.join(__dirname, '..')
      });
    } catch {
      importable = false;
    }
    _semanticAvailable = importable;
  }

  return _semanticAvailable;
}
269
+
270
/**
 * Summarize the loaded semantic rules.
 *
 * @returns {Promise<{total_rules: number, by_severity: object, by_category: object, by_language: object}>}
 *   Rule count plus per-severity, per-category and per-language tallies.
 *   Rules without a severity are counted under 'UNKNOWN'; rules without a
 *   metadata category are counted under 'unknown'.
 */
export async function getSemanticStats() {
  // Adds one to counter[key], starting from zero for a key not seen before.
  const bump = (counter, key) => {
    counter[key] = (counter[key] || 0) + 1;
  };

  const rules = await loadSemanticRules();

  const bySeverity = {};
  const byCategory = {};
  const byLanguage = {};

  for (const rule of rules) {
    bump(bySeverity, rule.severity || 'UNKNOWN');
    bump(byCategory, rule.metadata?.category || 'unknown');

    // A rule is counted once for every language it lists.
    for (const lang of rule.languages || []) {
      bump(byLanguage, lang);
    }
  }

  return {
    total_rules: rules.length,
    by_severity: bySeverity,
    by_category: byCategory,
    by_language: byLanguage
  };
}
@@ -24,6 +24,30 @@ const SCANNABLE_EXTENSIONS = new Set([
24
24
  '.tf', '.hcl', '.sql',
25
25
  ]);
26
26
 
27
// Directory names that the walk never descends into, grouped by reason.
// This is a denylist rather than a "skip everything hidden" rule: dot
// directories that are not listed here (for example .github/workflows, which
// is security-relevant) are still traversed.
const SKIP_DIRECTORIES = new Set(
  [
    // VCS metadata
    ['.git', '.svn', '.hg', '.bzr'],
    // Dependencies / package trees
    ['node_modules', 'vendor', 'bower_components'],
    // Build / output artifacts
    ['dist', 'build', 'out', 'target', 'coverage'],
    // Python environments and caches
    ['__pycache__', 'venv', 'env', '.venv'],
    ['.tox', '.nox', '.pytest_cache', '.mypy_cache', '.ruff_cache', '.hypothesis'],
    // JS/TS framework and tooling caches
    ['.next', '.nuxt', '.svelte-kit', '.turbo', '.parcel-cache', '.cache'],
    // Package-manager caches
    ['.yarn', '.pnpm-store', '.bundle', '.cargo', '.gradle'],
    // Editor / IDE state
    ['.idea', '.vscode', '.vs'],
    // IaC state
    ['.terraform'],
  ].flat(),
);
50
+
27
51
  // Parse .gitignore into patterns
28
52
  function parseGitignore(dirPath) {
29
53
  const gitignorePath = join(dirPath, '.gitignore');
@@ -64,9 +88,6 @@ function walkDirectory(dirPath, options = {}) {
64
88
  }
65
89
 
66
90
  for (const entry of entries) {
67
- // Skip hidden directories/files
68
- if (entry.startsWith('.')) continue;
69
-
70
91
  const fullPath = join(currentDir, entry);
71
92
  const relativePath = relative(dirPath, fullPath);
72
93
 
@@ -78,9 +99,10 @@ function walkDirectory(dirPath, options = {}) {
78
99
  }
79
100
 
80
101
  if (stat.isDirectory()) {
81
- // Skip common non-source directories
82
- if (['node_modules', 'vendor', 'dist', 'build', '__pycache__', '.git',
83
- 'venv', 'env', '.venv', 'target', 'coverage'].includes(entry)) continue;
102
+ // Prune known heavy/internal dirs (incl. hidden ones like .git), but
103
+ // do not blanket-skip every dotdir — security-relevant paths such as
104
+ // .github/workflows must still be walked.
105
+ if (SKIP_DIRECTORIES.has(entry)) continue;
84
106
 
85
107
  // Skip gitignored directories
86
108
  if (isGitignored(relativePath, gitignorePatterns)) continue;
@@ -0,0 +1,227 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * github-clone.js
5
+ *
6
+ * Safe GitHub repository cloning WITHOUT code execution
7
+ *
8
+ * Security Features:
9
+ * - Uses --no-checkout to prevent automatic code checkout
10
+ * - Uses git archive to extract files without triggering hooks
11
+ * - No postinstall scripts, no git hooks, no code execution
12
+ * - Size limits to prevent repository bombs
13
+ * - Timeout protection
14
+ */
15
+
16
import { exec, execFile } from 'child_process';
import { existsSync } from 'fs';
import fs from 'fs/promises';
import path from 'path';
import { promisify } from 'util';
21
+
22
+ const execAsync = promisify(exec);
23
+
24
+ const MAX_REPO_SIZE_MB = 500; // Size ceiling in MB; cloneGitHubRepo deletes and rejects clones that exceed it
25
+ const CLONE_TIMEOUT_MS = 120000; // Default timeout for the git clone step (2 minutes)
26
+ const ARCHIVE_TIMEOUT_MS = 60000; // Timeout for the archive extraction step in extractRepoFiles (1 minute)
27
+
28
/**
 * Extract GitHub owner/repo from various URL formats.
 *
 * Accepted inputs:
 *   - https://github.com/owner/repo (optionally with `.git`, a trailing
 *     slash, extra path segments, a query string or a fragment)
 *   - git@github.com:owner/repo.git and ssh://git@github.com/owner/repo.git
 *   - github.com/owner/repo
 *   - owner/repo shorthand
 *
 * The host must be github.com at the start of the input. Owner and repo are
 * validated against a conservative character set (letters, digits, `-`, `_`,
 * plus `.` for repositories), because callers place them into git command
 * lines and file-system paths. Repository names containing dots (for example
 * `next.js`) are returned in full; only a trailing `.git` is removed.
 *
 * @param {string} url - GitHub URL
 * @returns {{owner: string, repo: string} | null} null when the input is not
 *   a string, is empty, or does not name a valid owner/repo pair
 */
export function parseGitHubUrl(url) {
  if (typeof url !== 'string') return null;

  const input = url.trim();
  if (input === '') return null;

  const ownerPattern = /^[A-Za-z0-9][A-Za-z0-9_-]*$/;
  const repoPattern = /^[A-Za-z0-9._-]+$/;

  const match =
    // Full forms: optional scheme, optional user@, github.com, then "/" or ":".
    input.match(
      /^(?:(?:https?|ssh|git):\/\/)?(?:[^@\/\s]+@)?(?:www\.)?github\.com[\/:]([^\/\s?#]+)\/([^\/\s?#]+)/i,
    ) ||
    // Shorthand: exactly "owner/repo".
    input.match(/^([^\/\s]+)\/([^\/\s]+)$/);
  if (!match) return null;

  const owner = match[1];
  const repo = match[2].replace(/\.git$/, '');

  const isValid =
    ownerPattern.test(owner) && repoPattern.test(repo) && repo !== '.' && repo !== '..';
  return isValid ? { owner, repo } : null;
}
58
+
59
/**
 * Clone a GitHub repository safely without checking out files.
 *
 * git is started directly with an argument vector (no shell), so none of the
 * inputs can be interpreted as shell syntax. `--` separates options from the
 * URL and target directory. Credential prompts are disabled so that a private
 * or missing repository fails instead of waiting for input.
 *
 * After cloning, the on-disk size is measured with `du -sm`. A clone larger
 * than MAX_REPO_SIZE_MB, or one whose size cannot be determined, is deleted
 * and reported as a failure.
 *
 * @param {string} repoUrl - GitHub repository URL
 * @param {string} destDir - Destination directory; the clone is created in
 *   `<destDir>/<owner>-<repo>`
 * @param {object} options - Options
 * @param {number} [options.depth=1] - Shallow-clone depth (positive integer)
 * @param {string|null} [options.branch=null] - Branch or tag to clone
 * @param {number} [options.timeout=CLONE_TIMEOUT_MS] - Clone timeout in ms
 * @returns {Promise<{success: boolean, path?: string, size?: number, error?: string}>}
 *   Never rejects; failures are reported through `success: false`.
 */
export async function cloneGitHubRepo(repoUrl, destDir, options = {}) {
  const {
    depth = 1,
    branch = null,
    timeout = CLONE_TIMEOUT_MS,
  } = options;

  const parsed = parseGitHubUrl(repoUrl);
  if (!parsed) {
    return { success: false, error: 'Invalid GitHub URL' };
  }

  const { owner, repo } = parsed;

  // Both names end up in a URL and in a directory name, so re-check them here
  // instead of relying on what the parser allowed.
  const safeName = /^[A-Za-z0-9._-]+$/;
  const namesAreSafe = [owner, repo].every(
    (name) => typeof name === 'string' && safeName.test(name) && name !== '.' && name !== '..',
  );
  if (!namesAreSafe) {
    return { success: false, error: 'Invalid GitHub URL' };
  }

  const depthValue = Number(depth);
  if (!Number.isInteger(depthValue) || depthValue < 1) {
    return { success: false, error: `Invalid clone depth: ${depth}` };
  }

  if (branch && (typeof branch !== 'string' || branch.startsWith('-') || /[\s\x00-\x1f\x7f]/.test(branch))) {
    return { success: false, error: `Invalid branch name: ${branch}` };
  }

  const cloneDir = path.join(destDir, `${owner}-${repo}`);
  const run = promisify(execFile);

  try {
    // Create destination directory
    await fs.mkdir(destDir, { recursive: true });

    // Clone without checking out, so no working-tree files are written.
    const cloneArgs = ['clone', '--depth', String(depthValue), '--no-checkout'];
    if (branch) cloneArgs.push('--branch', branch);
    cloneArgs.push('--', `https://github.com/${owner}/${repo}.git`, cloneDir);

    await run('git', cloneArgs, {
      timeout,
      maxBuffer: 10 * 1024 * 1024, // 10MB buffer for git output
      env: { ...process.env, GIT_TERMINAL_PROMPT: '0' },
    });

    // Check repository size; du prints "<megabytes>\t<path>".
    const { stdout: sizeOutput } = await run('du', ['-sm', '--', cloneDir]);
    const sizeMB = Number.parseInt(sizeOutput, 10);

    // Fail closed: an unreadable size must not bypass the limit.
    if (!Number.isFinite(sizeMB)) {
      throw new Error('Unable to determine repository size');
    }

    if (sizeMB > MAX_REPO_SIZE_MB) {
      // Delete oversized repo
      await fs.rm(cloneDir, { recursive: true, force: true });
      return {
        success: false,
        error: `Repository too large: ${sizeMB}MB > ${MAX_REPO_SIZE_MB}MB limit`,
      };
    }

    return {
      success: true,
      path: cloneDir,
      size: sizeMB,
    };

  } catch (error) {
    // Clean up on failure
    if (existsSync(cloneDir)) {
      await fs.rm(cloneDir, { recursive: true, force: true }).catch(() => {});
    }

    return {
      success: false,
      error: error.message,
    };
  }
}
127
+
128
/**
 * Extract files from cloned repo using git archive (safe - no hooks).
 *
 * Two steps, each started without a shell:
 *   1. `git archive` writes HEAD as a tar file into a temporary directory
 *      created next to `extractDir`;
 *   2. `tar` unpacks that file into `extractDir`.
 * Running the steps separately means a failure of `git archive` is reported
 * as such instead of being hidden behind a pipe, and the paths are passed as
 * plain arguments rather than being spliced into a command string. Both
 * steps share the ARCHIVE_TIMEOUT_MS budget. The temporary directory is
 * removed afterwards in all cases.
 *
 * NOTE(review): archive entries (for example symlinks) are not inspected
 * before unpacking - confirm whether the scanner needs that.
 *
 * @param {string} repoPath - Path to cloned repository
 * @param {string} extractDir - Directory to extract files to
 * @returns {Promise<{success: boolean, path?: string, error?: string}>}
 *   Never rejects; failures are reported through `success: false`.
 */
export async function extractRepoFiles(repoPath, extractDir) {
  const run = promisify(execFile);
  const maxBuffer = 50 * 1024 * 1024; // 50MB buffer
  let scratchDir = null;

  try {
    // Create extraction directory
    await fs.mkdir(extractDir, { recursive: true });

    const targetDir = path.resolve(extractDir);
    scratchDir = await fs.mkdtemp(`${targetDir}-archive-`);
    const tarball = path.join(scratchDir, 'head.tar');
    const deadline = Date.now() + ARCHIVE_TIMEOUT_MS;

    // Use git archive to export HEAD without triggering any hooks
    await run(
      'git',
      ['-C', repoPath, 'archive', '--format=tar', `--output=${tarball}`, 'HEAD'],
      { timeout: ARCHIVE_TIMEOUT_MS, maxBuffer },
    );

    await run('tar', ['-x', '-f', tarball, '-C', targetDir], {
      timeout: Math.max(1, deadline - Date.now()), // remaining share of the budget
      maxBuffer,
    });

    return {
      success: true,
      path: extractDir,
    };

  } catch (error) {
    return {
      success: false,
      error: error.message,
    };
  } finally {
    if (scratchDir) {
      await fs.rm(scratchDir, { recursive: true, force: true }).catch(() => {});
    }
  }
}
159
+
160
/**
 * Full workflow: clone + extract + cleanup.
 *
 * The clone goes to `<workDir>/temp-clones` and the files to
 * `<workDir>/extracted-source`. Because those names are fixed, the extraction
 * directory is emptied before extracting so files left by an earlier run in
 * the same `workDir` are not mixed into the result. The clone directory is
 * removed once extraction has been attempted, and also when the clone itself
 * fails.
 *
 * @param {string} repoUrl - GitHub repository URL
 * @param {string} workDir - Working directory for clone operations
 * @returns {Promise<{success: boolean, sourcePath?: string, repoSize?: number, error?: string}>}
 *   Never rejects; failures are reported through `success: false`.
 */
export async function cloneAndExtract(repoUrl, workDir) {
  const tempCloneDir = path.join(workDir, 'temp-clones');
  const extractDir = path.join(workDir, 'extracted-source');

  try {
    // Step 1: Clone without checkout
    const cloneResult = await cloneGitHubRepo(repoUrl, tempCloneDir);

    if (!cloneResult.success) {
      // Best effort: do not leave the (possibly empty) clone area behind.
      await fs.rm(tempCloneDir, { recursive: true, force: true }).catch(() => {});
      return cloneResult;
    }

    // Step 2: Extract files safely, starting from an empty directory
    await fs.rm(extractDir, { recursive: true, force: true });
    const extractResult = await extractRepoFiles(cloneResult.path, extractDir);

    // Step 3: Clean up clone directory (keep extracted only)
    await fs.rm(tempCloneDir, { recursive: true, force: true });

    if (!extractResult.success) {
      // Clean up extracted files on failure
      await fs.rm(extractDir, { recursive: true, force: true });
      return extractResult;
    }

    return {
      success: true,
      sourcePath: extractDir,
      repoSize: cloneResult.size,
    };

  } catch (error) {
    // Clean up everything on error
    await fs.rm(tempCloneDir, { recursive: true, force: true }).catch(() => {});
    await fs.rm(extractDir, { recursive: true, force: true }).catch(() => {});

    return {
      success: false,
      error: error.message,
    };
  }
}
207
+
208
/**
 * Check if a URL is a valid GitHub repository (without cloning).
 *
 * Runs `git ls-remote <url> HEAD` with a 10 second timeout. git is started
 * directly with an argument vector (no shell), and credential prompts are
 * disabled so a private or missing repository fails instead of waiting for
 * input.
 *
 * @param {string} url - URL to check
 * @returns {Promise<boolean>} true when the URL parses as a GitHub repository
 *   and `git ls-remote` succeeds; false otherwise (including on timeout)
 */
export async function isValidGitHubRepo(url) {
  const parsed = parseGitHubUrl(url);
  if (!parsed) return false;

  const { owner, repo } = parsed;
  const remoteUrl = `https://github.com/${owner}/${repo}.git`;

  try {
    // Check if repo exists using git ls-remote (lightweight)
    await promisify(execFile)('git', ['ls-remote', remoteUrl, 'HEAD'], {
      timeout: 10000,
      env: { ...process.env, GIT_TERMINAL_PROMPT: '0' },
    });
    return true;
  } catch {
    return false;
  }
}