npm - agent-security-scanner-mcp - Versions diffs - 4.3.0 → 4.4.1 - Mend

agent-security-scanner-mcp 4.3.0 → 4.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/package.json +4 -1
package/src/semantic-analyzer.js +1293 -0
package/src/semantic-integration.js +301 -0
package/src/tools/scan-project.js +28 -6
package/src/utils/github-clone.js +227 -0
package/src/utils/npm-download.js +265 -0

package/src/semantic-integration.js ADDED Viewed

@@ -0,0 +1,301 @@
+/**
+ * Semantic Analysis Integration
+ *
+ * Integrates semantic analyzer into existing scan pipeline
+ */
+import { SemanticAnalyzer } from './semantic-analyzer.js';
+import { readFileSync } from 'fs';
+import { execFileSync } from 'child_process';
+import path from 'path';
+import { fileURLToPath } from 'url';
+import { resolvePythonCommand, pythonArgs } from './python.js';
+const __filename = fileURLToPath(import.meta.url); // filesystem path of this module, derived from its import.meta.url
+const __dirname = path.dirname(__filename); // directory of this module; base for the '..'-relative paths joined below
+/**
+ * Run semantic analysis on a file
+ *
+ * @param {string} filePath - Path to file to analyze
+ * @param {object} options - Analysis options
+ * @returns {Array} Semantic findings
+ */
+export async function runSemanticAnalysis(filePath, options = {}) {
+  try {
+    // Get AST from Python analyzer
+    const ast = await getASTFromPython(filePath);
+    if (!ast || ast.error) {
+      console.error(`[SEMANTIC] Failed to get AST: ${ast?.error || 'unknown error'}`);
+      return [];
+    }
+    // Determine language
+    const language = detectLanguage(filePath);
+    // Run semantic analyzer
+    const analyzer = new SemanticAnalyzer(ast, language, filePath);
+    const findings = analyzer.analyze();
+    // Convert findings to standard format
+    return findings.map(f => ({
+      ruleId: f.ruleId,
+      message: f.message,
+      line: extractLineNumber(f, ast),
+      column: 0,
+      length: 0,
+      severity: mapSeverity(f.severity),
+      confidence: (f.confidence ? String(f.confidence) : 'MEDIUM').toUpperCase(),
+      metadata: {
+        category: f.category,
+        engine: 'semantic',
+        ...f
+      },
+      engine: 'semantic'
+    }));
+  } catch (error) {
+    console.error(`[SEMANTIC] Analysis error: ${error.message}`);
+    return [];
+  }
+}
+/**
+ * Get AST from Python analyzer
+ */
+async function getASTFromPython(filePath) {
+  try {
+    const pyCmd = resolvePythonCommand();
+    const analyzerPath = path.join(__dirname, '..', 'ast_parser.py');
+    // Call Python AST parser via the analyzer wrapper which outputs JSON
+    const result = execFileSync(pyCmd, [
+      ...pythonArgs(),
+      analyzerPath,
+      '--ast-only',
+      filePath
+    ], {
+      encoding: 'utf8',
+      maxBuffer: 10 * 1024 * 1024, // 10MB buffer
+      timeout: 30000 // 30s timeout
+    });
+    return JSON.parse(result);
+  } catch (error) {
+    console.error(`[SEMANTIC] AST extraction failed: ${error.message}`);
+    return null;
+  }
+}
+/**
+ * Detect language from file extension
+ */
+function detectLanguage(filePath) {
+  const ext = path.extname(filePath).toLowerCase();
+  const extMap = {
+    '.js': 'javascript',
+    '.jsx': 'javascript',
+    '.ts': 'typescript',
+    '.tsx': 'typescript',
+    '.py': 'python',
+    '.java': 'java',
+    '.go': 'go',
+    '.rb': 'ruby',
+    '.php': 'php',
+    '.cs': 'csharp',
+    '.rs': 'rust',
+    '.c': 'c',
+    '.cpp': 'cpp',
+    '.cc': 'cpp',
+    '.h': 'c',
+    '.hpp': 'cpp'
+  };
+  return extMap[ext] || 'generic';
+}
+/**
+ * Extract line number from finding
+ */
+function extractLineNumber(finding, ast) {
+  // Try to get line from node metadata
+  if (finding.node && finding.node.ast) {
+    const astNode = finding.node.ast;
+    if (astNode.line) return astNode.line;
+    if (astNode.start_point && astNode.start_point.row) return astNode.start_point.row + 1;
+  }
+  if (finding.nodeId && finding.nodeId.includes('_')) {
+    // Extract from node ID if possible
+    const parts = finding.nodeId.split('_');
+    if (parts.length > 1 && !isNaN(parts[1])) {
+      return parseInt(parts[1]);
+    }
+  }
+  return 1; // Default to line 1 if we can't determine
+}
+/**
+ * Map severity levels
+ */
+function mapSeverity(severity) {
+  if (!severity) return 'warning';
+  const s = severity.toLowerCase();
+  if (s === 'error' || s === 'critical') return 'error';
+  if (s === 'warning' || s === 'warn') return 'warning';
+  if (s === 'info' || s === 'note') return 'info';
+  return 'warning';
+}
+/**
+ * Load semantic rules from YAML
+ */
+export async function loadSemanticRules() {
+  try {
+    const rulesPath = path.join(__dirname, '..', 'rules', 'semantic-security.yaml');
+    const yamlContent = readFileSync(rulesPath, 'utf8');
+    // Parse YAML (simplified - in production use a YAML library)
+    const rules = parseSimpleYAML(yamlContent);
+    return rules;
+  } catch (error) {
+    console.error(`[SEMANTIC] Failed to load rules: ${error.message}`);
+    return [];
+  }
+}
+/**
+ * Simple YAML parser for rule files
+ * (In production, use js-yaml library)
+ */
+function parseSimpleYAML(yamlContent) {
+  const rules = [];
+  const ruleBlocks = yamlContent.split(/^- id:/m).filter(b => b.trim());
+  for (const block of ruleBlocks) {
+    try {
+      const rule = {};
+      // Extract ID
+      const idMatch = block.match(/^\s*(.+?)$/m);
+      if (idMatch) {
+        rule.id = idMatch[1].trim();
+      }
+      // Extract languages
+      const langMatch = block.match(/languages:\s*\[([^\]]+)\]/);
+      if (langMatch) {
+        rule.languages = langMatch[1].split(',').map(l => l.trim());
+      }
+      // Extract severity
+      const severityMatch = block.match(/severity:\s*(\w+)/);
+      if (severityMatch) {
+        rule.severity = severityMatch[1].trim();
+      }
+      // Extract message
+      const messageMatch = block.match(/message:\s*["'](.+?)["']/);
+      if (messageMatch) {
+        rule.message = messageMatch[1];
+      }
+      // Extract metadata (simplified)
+      rule.metadata = {};
+      const categoryMatch = block.match(/category:\s*(\S+)/);
+      if (categoryMatch) {
+        rule.metadata.category = categoryMatch[1];
+      }
+      const cweMatch = block.match(/cwe:\s*["']([^"']+)["']/);
+      if (cweMatch) {
+        rule.metadata.cwe = cweMatch[1];
+      }
+      const owaspMatch = block.match(/owasp:\s*["']([^"']+)["']/);
+      if (owaspMatch) {
+        rule.metadata.owasp = owaspMatch[1];
+      }
+      const confidenceMatch = block.match(/confidence:\s*(\w+)/);
+      if (confidenceMatch) {
+        rule.metadata.confidence = confidenceMatch[1];
+      }
+      if (rule.id) {
+        rules.push(rule);
+      }
+    } catch (error) {
+      console.error(`[SEMANTIC] Failed to parse rule block: ${error.message}`);
+    }
+  }
+  return rules;
+}
+/**
+ * Check if semantic analysis is available
+ */
+let _semanticAvailable = null;
+export function isSemanticAnalysisAvailable() {
+  if (_semanticAvailable !== null) return _semanticAvailable;
+  try {
+    const pyCmd = resolvePythonCommand();
+    // Verify that Python 3 exists AND tree-sitter can be imported, which is
+    // the actual prerequisite for AST extraction.  A bare "python --version"
+    // check was giving false positives on systems without tree-sitter.
+    execFileSync(pyCmd, [
+      ...pythonArgs(),
+      '-c',
+      'import tree_sitter; import ast_parser'
+    ], {
+      stdio: 'pipe',
+      timeout: 10000,
+      cwd: path.join(__dirname, '..')
+    });
+    _semanticAvailable = true;
+  } catch {
+    _semanticAvailable = false;
+  }
+  return _semanticAvailable;
+}
+/**
+ * Get semantic analyzer statistics
+ */
+export async function getSemanticStats() {
+  const rules = await loadSemanticRules();
+  const stats = {
+    total_rules: rules.length,
+    by_severity: {},
+    by_category: {},
+    by_language: {}
+  };
+  for (const rule of rules) {
+    // Count by severity
+    const severity = rule.severity || 'UNKNOWN';
+    stats.by_severity[severity] = (stats.by_severity[severity] || 0) + 1;
+    // Count by category
+    const category = rule.metadata?.category || 'unknown';
+    stats.by_category[category] = (stats.by_category[category] || 0) + 1;
+    // Count by language
+    if (rule.languages) {
+      for (const lang of rule.languages) {
+        stats.by_language[lang] = (stats.by_language[lang] || 0) + 1;
+      }
+    }
+  }
+  return stats;
+}

package/src/tools/scan-project.js CHANGED Viewed

@@ -24,6 +24,30 @@ const SCANNABLE_EXTENSIONS = new Set([
   '.tf', '.hcl', '.sql',
 ]);
+// Directories pruned during the walk: VCS metadata, dependency trees, build
+// artifacts, language/tool caches, and editor state. Hidden entries are NOT
+// blanket-skipped — only names in this denylist are pruned — so that
+// security-relevant dotpaths (e.g. .github/workflows) are still traversed.
+const SKIP_DIRECTORIES = new Set([
+  // VCS metadata
+  '.git', '.svn', '.hg', '.bzr',
+  // Dependencies / package trees
+  'node_modules', 'vendor', 'bower_components',
+  // Build / output artifacts
+  'dist', 'build', 'out', 'target', 'coverage',
+  // Python environments and caches
+  '__pycache__', 'venv', 'env', '.venv',
+  '.tox', '.nox', '.pytest_cache', '.mypy_cache', '.ruff_cache', '.hypothesis',
+  // JS/TS framework and tooling caches
+  '.next', '.nuxt', '.svelte-kit', '.turbo', '.parcel-cache', '.cache',
+  // Package-manager caches
+  '.yarn', '.pnpm-store', '.bundle', '.cargo', '.gradle',
+  // Editor / IDE state
+  '.idea', '.vscode', '.vs',
+  // IaC state
+  '.terraform',
+]);
 // Parse .gitignore into patterns
 function parseGitignore(dirPath) {
   const gitignorePath = join(dirPath, '.gitignore');
@@ -64,9 +88,6 @@ function walkDirectory(dirPath, options = {}) {
     }
     for (const entry of entries) {
-      // Skip hidden directories/files
-      if (entry.startsWith('.')) continue;
       const fullPath = join(currentDir, entry);
       const relativePath = relative(dirPath, fullPath);
@@ -78,9 +99,10 @@ function walkDirectory(dirPath, options = {}) {
       }
       if (stat.isDirectory()) {
-        // Skip common non-source directories
-        if (['node_modules', 'vendor', 'dist', 'build', '__pycache__', '.git',
-             'venv', 'env', '.venv', 'target', 'coverage'].includes(entry)) continue;
+        // Prune known heavy/internal dirs (incl. hidden ones like .git), but
+        // do not blanket-skip every dotdir — security-relevant paths such as
+        // .github/workflows must still be walked.
+        if (SKIP_DIRECTORIES.has(entry)) continue;
         // Skip gitignored directories
         if (isGitignored(relativePath, gitignorePatterns)) continue;

package/src/utils/github-clone.js ADDED Viewed

@@ -0,0 +1,227 @@
+#!/usr/bin/env node
+/**
+ * github-clone.js
+ *
+ * Safe GitHub repository cloning WITHOUT code execution
+ *
+ * Security Features:
+ * - Uses --no-checkout to prevent automatic code checkout
+ * - Uses git archive to extract files without triggering hooks
+ * - No postinstall scripts, no git hooks, no code execution
+ * - Size limits to prevent repository bombs
+ * - Timeout protection
+ */
+import { exec, execFile } from 'child_process';
+import { promisify } from 'util';
+import fs from 'fs/promises';
+import path from 'path';
+import { existsSync } from 'fs';
+const execAsync = promisify(exec); // promise-returning exec; NOTE: exec runs its command string through a shell
+const MAX_REPO_SIZE_MB = 500; // 500MB limit; clones measured above this are deleted and rejected
+const CLONE_TIMEOUT_MS = 120000; // 2 minutes; default timeout for the git clone step
+const ARCHIVE_TIMEOUT_MS = 60000; // 1 minute; timeout for the git archive extraction step
+/**
+ * Extract GitHub owner/repo from various URL formats
+ * @param {string} url - GitHub URL
+ * @returns {{owner: string, repo: string} | null}
+ */
+export function parseGitHubUrl(url) {
+  if (!url) return null;
+  // Match patterns:
+  // - https://github.com/owner/repo
+  // - https://github.com/owner/repo.git
+  // - git@github.com:owner/repo.git
+  // - github.com/owner/repo
+  const patterns = [
+    /github\.com[\/:]([^\/]+)\/([^\/\.]+)/,
+    /^([^\/]+)\/([^\/]+)$/,  // owner/repo format
+  ];
+  for (const pattern of patterns) {
+    const match = url.match(pattern);
+    if (match) {
+      return {
+        owner: match[1],
+        repo: match[2].replace(/\.git$/, ''),
+      };
+    }
+  }
+  return null;
+}
+/**
+ * Clone a GitHub repository safely without checking out files
+ * @param {string} repoUrl - GitHub repository URL
+ * @param {string} destDir - Destination directory
+ * @param {object} options - Options
+ * @returns {Promise<{success: boolean, path?: string, error?: string}>}
+ */
+export async function cloneGitHubRepo(repoUrl, destDir, options = {}) {
+  const {
+    depth = 1,
+    branch = null,
+    timeout = CLONE_TIMEOUT_MS,
+  } = options;
+  const parsed = parseGitHubUrl(repoUrl);
+  if (!parsed) {
+    return { success: false, error: 'Invalid GitHub URL' };
+  }
+  const { owner, repo } = parsed;
+  const cloneDir = path.join(destDir, `${owner}-${repo}`);
+  try {
+    // Create destination directory
+    await fs.mkdir(destDir, { recursive: true });
+    // Build clone command with safety flags
+    const branchFlag = branch ? `--branch ${branch}` : '';
+    const cloneCmd = `git clone --depth ${depth} --no-checkout ${branchFlag} https://github.com/${owner}/${repo}.git "${cloneDir}"`;
+    // Clone without checking out (no hooks triggered)
+    const { stdout, stderr } = await execAsync(cloneCmd, {
+      timeout,
+      maxBuffer: 10 * 1024 * 1024, // 10MB buffer for git output
+    });
+    // Check repository size
+    const sizeCmd = `du -sm "${cloneDir}" | cut -f1`;
+    const { stdout: sizeOutput } = await execAsync(sizeCmd);
+    const sizeMB = parseInt(sizeOutput.trim(), 10);
+    if (sizeMB > MAX_REPO_SIZE_MB) {
+      // Delete oversized repo
+      await fs.rm(cloneDir, { recursive: true, force: true });
+      return {
+        success: false,
+        error: `Repository too large: ${sizeMB}MB > ${MAX_REPO_SIZE_MB}MB limit`,
+      };
+    }
+    return {
+      success: true,
+      path: cloneDir,
+      size: sizeMB,
+    };
+  } catch (error) {
+    // Clean up on failure
+    if (existsSync(cloneDir)) {
+      await fs.rm(cloneDir, { recursive: true, force: true }).catch(() => {});
+    }
+    return {
+      success: false,
+      error: error.message,
+    };
+  }
+}
+/**
+ * Extract files from cloned repo using git archive (safe - no hooks)
+ * @param {string} repoPath - Path to cloned repository
+ * @param {string} extractDir - Directory to extract files to
+ * @returns {Promise<{success: boolean, path?: string, error?: string}>}
+ */
+export async function extractRepoFiles(repoPath, extractDir) {
+  try {
+    // Create extraction directory
+    await fs.mkdir(extractDir, { recursive: true });
+    // Use git archive to extract HEAD without triggering any hooks
+    const archiveCmd = `cd "${repoPath}" && git archive HEAD | tar -x -C "${extractDir}"`;
+    await execAsync(archiveCmd, {
+      timeout: ARCHIVE_TIMEOUT_MS,
+      maxBuffer: 50 * 1024 * 1024, // 50MB buffer
+    });
+    return {
+      success: true,
+      path: extractDir,
+    };
+  } catch (error) {
+    return {
+      success: false,
+      error: error.message,
+    };
+  }
+}
+/**
+ * Full workflow: clone + extract + cleanup
+ * @param {string} repoUrl - GitHub repository URL
+ * @param {string} workDir - Working directory for clone operations
+ * @returns {Promise<{success: boolean, sourcePath?: string, error?: string}>}
+ */
+export async function cloneAndExtract(repoUrl, workDir) {
+  const tempCloneDir = path.join(workDir, 'temp-clones');
+  const extractDir = path.join(workDir, 'extracted-source');
+  try {
+    // Step 1: Clone without checkout
+    const cloneResult = await cloneGitHubRepo(repoUrl, tempCloneDir);
+    if (!cloneResult.success) {
+      return cloneResult;
+    }
+    // Step 2: Extract files safely
+    const extractResult = await extractRepoFiles(cloneResult.path, extractDir);
+    // Step 3: Clean up clone directory (keep extracted only)
+    await fs.rm(tempCloneDir, { recursive: true, force: true });
+    if (!extractResult.success) {
+      // Clean up extracted files on failure
+      await fs.rm(extractDir, { recursive: true, force: true });
+      return extractResult;
+    }
+    return {
+      success: true,
+      sourcePath: extractDir,
+      repoSize: cloneResult.size,
+    };
+  } catch (error) {
+    // Clean up everything on error
+    await fs.rm(tempCloneDir, { recursive: true, force: true }).catch(() => {});
+    await fs.rm(extractDir, { recursive: true, force: true }).catch(() => {});
+    return {
+      success: false,
+      error: error.message,
+    };
+  }
+}
+/**
+ * Check if a URL is a valid GitHub repository (without cloning)
+ * @param {string} url - URL to check
+ * @returns {Promise<boolean>}
+ */
+export async function isValidGitHubRepo(url) {
+  const parsed = parseGitHubUrl(url);
+  if (!parsed) return false;
+  const { owner, repo } = parsed;
+  try {
+    // Check if repo exists using git ls-remote (lightweight)
+    const checkCmd = `git ls-remote https://github.com/${owner}/${repo}.git HEAD`;
+    await execAsync(checkCmd, { timeout: 10000 });
+    return true;
+  } catch {
+    return false;
+  }
+}