agent-security-scanner-mcp 1.5.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/analyzer.py CHANGED
@@ -1,85 +1,87 @@
1
+ """
2
+ Security Analyzer - AST-Based with Regex Fallback
3
+
4
+ Uses tree-sitter AST analysis when available, falls back to regex
5
+ pattern matching when tree-sitter is not installed. This ensures
6
+ the analyzer works out-of-the-box with `npx` (regex mode) and
7
+ provides enhanced detection when dependencies are installed.
8
+ """
9
+
1
10
  import sys
2
11
  import json
3
- import re
4
12
  import os
13
+ import re
14
+ from typing import List, Dict, Any
5
15
 
6
16
  # Add the directory containing this script to the path
7
17
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
8
18
 
9
- from rules import get_rules, get_rules_for_language, get_rule_stats
19
+ # Try to import AST engine
20
+ try:
21
+ from ast_parser import ASTParser, HAS_TREE_SITTER
22
+ from generic_ast import convert_tree
23
+ from pattern_matcher import RuleEngine
24
+ from regex_fallback import apply_regex_fallback
25
+ HAS_AST_ENGINE = HAS_TREE_SITTER
26
+ except ImportError:
27
+ HAS_AST_ENGINE = False
28
+
29
+ # Try to import Semgrep loader and taint analyzer
30
+ try:
31
+ from semgrep_loader import load_rules, get_loader
32
+ HAS_SEMGREP_LOADER = True
33
+ except ImportError:
34
+ HAS_SEMGREP_LOADER = False
35
+
36
+ try:
37
+ from taint_analyzer import TaintAnalyzer
38
+ HAS_TAINT_ANALYZER = True
39
+ except ImportError:
40
+ HAS_TAINT_ANALYZER = False
41
+
42
+ # Import the original regex-based rules (always available)
43
+ from rules import get_rules_for_language
10
44
 
11
45
# File extension to language mapping
EXTENSION_MAP = {
    '.py': 'python', '.js': 'javascript', '.ts': 'typescript',
    '.tsx': 'typescript', '.jsx': 'javascript', '.java': 'java',
    '.go': 'go', '.rb': 'ruby', '.php': 'php', '.cs': 'csharp',
    '.rs': 'rust', '.c': 'c', '.cpp': 'cpp', '.h': 'c', '.hpp': 'cpp',
    '.sql': 'sql', '.dockerfile': 'dockerfile',
    '.yaml': 'yaml', '.yml': 'yaml', '.json': 'json',
    '.tf': 'terraform', '.hcl': 'terraform',
    '.txt': 'generic', '.md': 'generic', '.prompt': 'generic',
    '.jinja': 'generic', '.jinja2': 'generic', '.j2': 'generic',
}


def detect_language(file_path):
    """Map a file path to a language name.

    Dockerfiles are recognised by file name (``Dockerfile``,
    ``Dockerfile.dev``, ...); everything else is resolved through
    EXTENSION_MAP, defaulting to 'generic' for unknown extensions.
    """
    name = os.path.basename(file_path).lower()
    if name == 'dockerfile' or name.startswith('dockerfile.'):
        return 'dockerfile'
    extension = os.path.splitext(file_path.lower())[1]
    return EXTENSION_MAP.get(extension, 'generic')
53
66
 
54
- def analyze_file(file_path):
55
- """Analyze a single file for security vulnerabilities"""
67
+
68
+ def analyze_file_regex(file_path):
69
+ """Original regex-based analysis (fallback when tree-sitter unavailable)."""
56
70
  issues = []
57
-
58
71
  try:
59
72
  language = detect_language(file_path)
60
73
  rules = get_rules_for_language(language)
61
-
62
74
  with open(file_path, 'r', encoding='utf-8') as f:
63
75
  lines = f.readlines()
64
- content = ''.join(lines)
65
-
66
- for line_index, line in enumerate(lines):
67
- original_line = line
68
- line = line.strip()
69
- if not line:
70
- continue
71
-
72
- # Skip comment-only lines (basic detection)
73
- if line.startswith('#') or line.startswith('//') or line.startswith('*'):
76
+
77
+ for line_index, original_line in enumerate(lines):
78
+ line = original_line.strip()
79
+ if not line or line.startswith('#') or line.startswith('//') or line.startswith('*'):
74
80
  continue
75
-
76
81
  for rule_id, rule in rules.items():
77
82
  for pattern in rule['patterns']:
78
83
  try:
79
- # Use IGNORECASE for better detection (API_KEY vs api_key)
80
- matches = re.finditer(pattern, line, re.IGNORECASE)
81
- for match in matches:
82
- # Calculate column based on original line (preserve indentation)
84
+ for match in re.finditer(pattern, line, re.IGNORECASE):
83
85
  col_offset = len(original_line) - len(original_line.lstrip())
84
86
  issues.append({
85
87
  'ruleId': rule['id'],
@@ -91,36 +93,110 @@ def analyze_file(file_path):
91
93
  'metadata': rule.get('metadata', {})
92
94
  })
93
95
  except re.error:
94
- # Skip invalid regex patterns
95
96
  continue
96
-
97
97
  except Exception as e:
98
98
  return {'error': str(e)}
99
-
100
- # Deduplicate issues (same rule, same line)
99
+
101
100
  seen = set()
102
- unique_issues = []
101
+ unique = []
103
102
  for issue in issues:
104
103
  key = (issue['ruleId'], issue['line'], issue['column'])
105
104
  if key not in seen:
106
105
  seen.add(key)
107
- unique_issues.append(issue)
108
-
109
- return unique_issues
106
+ unique.append(issue)
107
+ return unique
108
+
109
+
110
def analyze_file_ast(file_path):
    """Analyze *file_path* with the tree-sitter AST engine.

    Degrades gracefully: if the file cannot be parsed, or any part of the
    AST pipeline raises, the plain regex analysis is used instead, so
    callers always receive a result in the same shape.
    """
    try:
        ast_parser = ASTParser()
        rule_engine = RuleEngine()

        # Load Semgrep-style rules (and taint rules) when the loader exists.
        semgrep_rules = []
        taint_rules = []
        if HAS_SEMGREP_LOADER:
            supported = ['python', 'javascript', 'typescript', 'java', 'go',
                         'ruby', 'php', 'c', 'rust', 'csharp', 'generic']
            semgrep_rules = load_rules(supported)
            taint_rules = get_loader().get_taint_rules()

        parsed = ast_parser.parse_file(file_path)
        if not parsed.success:
            # AST parse failed — regex analysis still works on raw text.
            return analyze_file_regex(file_path)

        tree = convert_tree(parsed.tree, parsed.language, parsed.source_bytes)

        def _applies(rule):
            # A rule applies if it targets this language or is generic.
            return parsed.language in rule.languages or 'generic' in rule.languages

        findings = rule_engine.apply_rules(
            [r for r in semgrep_rules if _applies(r)], tree
        )

        # Taint analysis (source→sink tracking) when available.
        if HAS_TAINT_ANALYZER and taint_rules:
            findings.extend(
                TaintAnalyzer().analyze(tree, [r for r in taint_rules if _applies(r)])
            )

        issues = []
        for finding in findings:
            if finding.line == finding.end_line:
                span = finding.end_column - finding.column
            else:
                # Multi-line finding: report the matched text's length.
                span = len(finding.text)
            issues.append({
                'ruleId': finding.rule_id,
                'message': f"[{finding.rule_name}] {finding.message}",
                'line': finding.line - 1,  # Convert to 0-indexed for compatibility
                'column': finding.column,
                'length': span,
                'severity': finding.severity,
                'metadata': finding.metadata,
            })

        # Regex fallback for coverage gaps the AST rules miss.
        text = parsed.source_bytes.decode('utf-8', errors='replace')
        issues.extend(apply_regex_fallback(text, parsed.language, file_path))

        # Keep only the first issue per (rule, line, column), preserving order.
        deduped = {}
        for issue in issues:
            deduped.setdefault(
                (issue['ruleId'], issue['line'], issue['column']), issue
            )
        return list(deduped.values())

    except Exception:
        # Any AST-engine failure falls back to regex mode rather than crashing.
        return analyze_file_regex(file_path)
178
+
179
+
180
def analyze_file(file_path):
    """Analyze one file, preferring the AST engine when it imported cleanly.

    HAS_AST_ENGINE is True only when tree-sitter and the helper modules
    are all importable; otherwise regex pattern matching is used.
    """
    if not HAS_AST_ENGINE:
        return analyze_file_regex(file_path)
    return analyze_file_ast(file_path)
185
+
110
186
 
111
187
def main():
    """CLI entry point: analyze the file named in argv[1], print JSON.

    Exits with status 1 (after printing a JSON error object) when no path
    is given or the path does not exist.
    """
    if len(sys.argv) < 2:
        print(json.dumps({'error': 'No file path provided'}))
        sys.exit(1)

    file_path = sys.argv[1]
    if not os.path.exists(file_path):
        print(json.dumps({'error': f'File not found: {file_path}'}))
        sys.exit(1)

    # analyze_file returns either a list of issues or an error dict;
    # both serialize directly to JSON.
    print(json.dumps(analyze_file(file_path)))


if __name__ == '__main__':
    main()
package/ast_parser.py ADDED
@@ -0,0 +1,296 @@
1
+ """
2
+ AST Parser Module - tree-sitter Integration
3
+
4
+ This module provides multi-language AST parsing using tree-sitter.
5
+ Supports: Python, JavaScript, TypeScript, Java, Go, Ruby, PHP, C, C++, Rust, C#
6
+ """
7
+
8
+ import os
9
+ from typing import Optional, Dict, Any, List
10
+ from dataclasses import dataclass
11
+
12
+ # tree-sitter imports
13
+ try:
14
+ import tree_sitter_python as tspython
15
+ import tree_sitter_javascript as tsjavascript
16
+ import tree_sitter_java as tsjava
17
+ import tree_sitter_go as tsgo
18
+ import tree_sitter_ruby as tsruby
19
+ import tree_sitter_php as tsphp
20
+ import tree_sitter_c as tsc
21
+ import tree_sitter_cpp as tscpp
22
+ import tree_sitter_rust as tsrust
23
+ import tree_sitter_c_sharp as tscsharp
24
+ import tree_sitter_typescript as tstypescript
25
+ from tree_sitter import Language, Parser
26
+ HAS_TREE_SITTER = True
27
+ except ImportError:
28
+ HAS_TREE_SITTER = False
29
+ # Define stub types for type hints when tree-sitter not installed
30
+ Parser = None
31
+ Language = None
32
+
33
+
34
+ # Language registry - maps file extensions to tree-sitter languages
35
+ LANGUAGE_REGISTRY: Dict[str, Any] = {}
36
+
37
+ if HAS_TREE_SITTER:
38
+ LANGUAGE_REGISTRY = {
39
+ 'python': Language(tspython.language()),
40
+ 'javascript': Language(tsjavascript.language()),
41
+ 'typescript': Language(tstypescript.language_typescript()),
42
+ 'tsx': Language(tstypescript.language_tsx()),
43
+ 'java': Language(tsjava.language()),
44
+ 'go': Language(tsgo.language()),
45
+ 'ruby': Language(tsruby.language()),
46
+ 'php': Language(tsphp.language_php()),
47
+ 'c': Language(tsc.language()),
48
+ 'cpp': Language(tscpp.language()),
49
+ 'rust': Language(tsrust.language()),
50
+ 'csharp': Language(tscsharp.language()),
51
+ }
52
+
53
+
54
# File extension to language mapping
EXTENSION_MAP = {
    # Languages with tree-sitter grammars in LANGUAGE_REGISTRY.
    '.py': 'python',
    '.js': 'javascript', '.jsx': 'javascript',
    '.ts': 'typescript',
    '.tsx': 'tsx',
    '.java': 'java',
    '.go': 'go',
    '.rb': 'ruby',
    '.php': 'php',
    '.c': 'c', '.h': 'c',
    '.cpp': 'cpp', '.cc': 'cpp', '.cxx': 'cpp', '.hpp': 'cpp',
    '.rs': 'rust',
    '.cs': 'csharp',
    # Additions for fallback support (no grammar — parsed via MockTree).
    '.yaml': 'kubernetes', '.yml': 'kubernetes',
    '.tf': 'terraform',
    '.txt': 'generic', '.md': 'generic',
    '.json': 'json',
}
81
+
82
+
83
@dataclass
class ParseResult:
    """Outcome of parsing one source file or string."""
    tree: Any                    # tree_sitter.Tree, MockTree, or None on failure
    language: str                # language key the parser resolved to
    source_bytes: bytes          # raw bytes that were parsed (b'' on failure)
    success: bool                # True when a tree was produced
    error: Optional[str] = None  # parse/IO error message, if any
91
+
92
+
93
class MockNode:
    """Minimal stand-in for a tree_sitter.Node spanning an entire file."""

    def __init__(self, type_name: str, source_bytes: bytes):
        self.type = type_name
        self.children = []
        self.child_count = 0
        # The node covers the whole byte range of the source.
        self.start_byte = 0
        self.end_byte = len(source_bytes)
        self.start_point = (0, 0)
        # End row is the newline count; end column is not tracked.
        self.end_point = (source_bytes.count(b'\n'), 0)


class MockTree:
    """Minimal stand-in for a tree_sitter.Tree: a single root MockNode."""

    def __init__(self, source_bytes: bytes, language: str):
        self.root_node = MockNode("source_file", source_bytes)
        self.language = language
110
+
111
+
112
class ASTParser:
    """
    Multi-language AST parser using tree-sitter.

    Example usage:
        parser = ASTParser()
        result = parser.parse_file("example.py")
        if result.success:
            root = result.tree.root_node
            print(root.sexp())
    """

    def __init__(self):
        # AST mode is only usable when the bindings imported successfully.
        if not HAS_TREE_SITTER:
            raise ImportError(
                "tree-sitter and language bindings are required. "
                "Install with: pip install -r requirements.txt"
            )
        # Parsers are created lazily and cached per language.
        self._parsers: Dict[str, Parser] = {}

    def _get_parser(self, language: str) -> Optional[Parser]:
        """Return a cached parser for *language*, creating it on first use."""
        if language not in LANGUAGE_REGISTRY:
            return None
        cached = self._parsers.get(language)
        if cached is None:
            cached = Parser()
            cached.language = LANGUAGE_REGISTRY[language]
            self._parsers[language] = cached
        return cached

    def detect_language(self, file_path: str) -> Optional[str]:
        """Detect programming language from the file extension, or None."""
        extension = os.path.splitext(file_path.lower())[1]
        return EXTENSION_MAP.get(extension)

    def parse_file(self, file_path: str) -> ParseResult:
        """Parse a source file and return the AST.

        Unknown or grammar-less languages get a MockTree over the raw
        bytes so downstream passes still have something to inspect.
        """
        language = self.detect_language(file_path) or 'generic'

        parser = self._get_parser(language)
        if parser is None:
            # No grammar available — fall back to a single-node mock tree.
            try:
                with open(file_path, 'rb') as handle:
                    raw = handle.read()
                return ParseResult(MockTree(raw, language), language, raw, True)
            except Exception as exc:
                return ParseResult(None, language, b'', False, error=str(exc))

        try:
            with open(file_path, 'rb') as handle:
                raw = handle.read()
            return ParseResult(parser.parse(raw), language, raw, True)
        except Exception as exc:
            return ParseResult(None, language, b'', False, error=str(exc))

    def parse_string(self, source: str, language: str) -> ParseResult:
        """Parse an in-memory source string and return the AST."""
        parser = self._get_parser(language)
        if parser is None:
            return ParseResult(
                None, language, b'', False,
                error=f"No parser available for language: {language}"
            )
        try:
            raw = source.encode('utf-8')
            return ParseResult(parser.parse(raw), language, raw, True)
        except Exception as exc:
            return ParseResult(None, language, b'', False, error=str(exc))

    def get_supported_languages(self) -> List[str]:
        """Return the list of languages with a loaded tree-sitter grammar."""
        return list(LANGUAGE_REGISTRY.keys())
233
+
234
+
235
def walk_tree(node, callback, depth=0):
    """
    Walk the AST in pre-order and call *callback* for each node.

    Args:
        node: tree_sitter.Node (or mock node) to walk
        callback: function(node, depth) -> bool; returning False skips
            this node's children (siblings are still visited)
        depth: current depth in tree
    """
    if callback(node, depth) is False:
        return
    for child in node.children:
        walk_tree(child, callback, depth + 1)


def get_node_text(node, source_bytes: bytes) -> str:
    """Extract the text of a node from source bytes.

    Files are read as raw bytes with no encoding validation (parse_file
    opens them in 'rb'), so a node can span invalid UTF-8. Decode with
    errors='replace' — matching analyzer.py's decode policy — instead of
    letting UnicodeDecodeError abort an otherwise-successful analysis.
    """
    return source_bytes[node.start_byte:node.end_byte].decode(
        'utf-8', errors='replace'
    )


def find_nodes_by_type(root_node, node_type: str) -> List[Any]:
    """Find all nodes of a specific type in the tree (pre-order)."""
    results: List[Any] = []

    def collect(node, depth):
        if node.type == node_type:
            results.append(node)
        return True  # keep descending; never prune

    walk_tree(root_node, collect)
    return results
267
+
268
+
269
+ # Convenience function for quick parsing
270
def parse(file_path: str) -> ParseResult:
    """Convenience one-shot parse: builds a fresh ASTParser per call."""
    return ASTParser().parse_file(file_path)
274
+
275
+
276
if __name__ == '__main__':
    import sys

    if len(sys.argv) < 2:
        print("Usage: python ast_parser.py <file_path>")
        sys.exit(1)

    result = parse(sys.argv[1])

    if not result.success:
        print(f"Error: {result.error}")
    else:
        print(f"Language: {result.language}")
        print(f"Root node type: {result.tree.root_node.type}")
        print(f"Child count: {result.tree.root_node.child_count}")

        # Demo: preview up to ten top-level nodes with a text snippet each.
        print("\nTop-level nodes:")
        for i, child in enumerate(result.tree.root_node.children[:10]):
            text_preview = get_node_text(child, result.source_bytes)[:50].replace('\n', '\\n')
            print(f"  [{i}] {child.type}: {text_preview}...")