agent-security-scanner-mcp 1.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,351 @@
1
+ """
2
+ Taint Analysis Engine
3
+
4
+ Tracks dataflow from user-controlled sources to dangerous sinks.
5
+ Detects vulnerabilities only when tainted data reaches a sink.
6
+
7
+ Algorithm:
8
+ 1. Find source patterns (user input) and mark matched variables as tainted
9
+ 2. Track taint through assignments (y = x means y inherits x's taint)
10
+ 3. Check if any tainted variable reaches a sink pattern
11
+ 4. Report vulnerability if tainted data flows to sink
12
+ """
13
+
14
+ from dataclasses import dataclass, field
15
+ from typing import List, Dict, Set, Optional, Tuple, Any
16
+ from generic_ast import GenericNode, NodeKind
17
+ from pattern_matcher import (
18
+ Pattern, TaintRule, Finding, PatternMatcher,
19
+ MatchResult, create_pattern
20
+ )
21
+
22
+
23
@dataclass
class TaintedVariable:
    """A variable that carries tainted data, together with its provenance."""

    # Name of the variable holding tainted data.
    name: str
    # The source pattern whose match introduced the taint.
    source_pattern: str
    # Line on which the taint originated.
    source_line: int
    # Ordered, human-readable steps describing how the taint flowed here.
    # A fresh list is created per instance, so paths are never shared.
    propagation_path: List[str] = field(default_factory=list)
30
+
31
+
32
@dataclass
class VariableAssignment:
    """One assignment statement found in the code being analyzed."""

    # Name of the variable being assigned to (left-hand side).
    target: str
    # Names of the variables used on the right-hand side.
    source_vars: Set[str]
    # Position of the assignment, copied from its AST node.
    line: int
    column: int
    # The AST node of the assignment itself.
    node: GenericNode
40
+
41
+
42
class TaintAnalyzer:
    """
    Performs taint analysis on an AST using TaintRule definitions.

    Tracks how data flows from sources (user input) to sinks (dangerous
    functions). Taint is tracked per variable *name* for the whole AST: the
    analysis does not model scopes or statement order, so a name tainted
    anywhere is treated as tainted everywhere.

    Per-rule state lives on the instance (``tainted`` and ``assignments``) and
    is reset at the start of every rule.
    """

    def __init__(self):
        self.matcher = PatternMatcher()
        # Variable name -> taint record, for the rule currently being analyzed.
        self.tainted: Dict[str, TaintedVariable] = {}
        # Every assignment found in the AST, in pre-order traversal order.
        self.assignments: List[VariableAssignment] = []

    def analyze(self, ast: GenericNode, rules: List[TaintRule]) -> List[Finding]:
        """
        Analyze AST for taint vulnerabilities using the provided rules.

        Each rule is analyzed independently; the findings are concatenated in
        rule order.

        Returns list of findings where tainted data reaches a sink.
        """
        all_findings = []
        for rule in rules:
            all_findings.extend(self._analyze_rule(ast, rule))
        return all_findings

    def _analyze_rule(self, ast: GenericNode, rule: TaintRule) -> List[Finding]:
        """Analyze a single taint rule against the AST"""
        # Reset state for each rule so taint from one rule never leaks into
        # the next one.
        self.tainted = {}
        self.assignments = []

        # Step 1: Collect all variable assignments
        self._collect_assignments(ast)

        # Step 2: Find sources and mark initial tainted variables
        self._find_sources(ast, rule)

        # Step 3: Propagate taint through assignments
        self._propagate_taint(rule)

        # Step 4: Check sinks for tainted input
        return self._check_sinks(ast, rule)

    def _collect_assignments(self, node: GenericNode):
        """Collect all variable assignments in the AST (pre-order)."""
        if node.kind == NodeKind.ASSIGNMENT:
            target = self._get_assignment_target(node)
            source_vars = self._get_referenced_variables(node)

            if target:
                # A self-referencing assignment (x = x + 1) must not list its
                # own target as a source.
                source_vars.discard(target)

                self.assignments.append(VariableAssignment(
                    target=target,
                    source_vars=source_vars,
                    line=node.line,
                    column=node.column,
                    node=node,
                ))

        # Recurse into children (nested assignments are collected too).
        for child in node.children:
            self._collect_assignments(child)

    def _get_assignment_target(self, node: GenericNode) -> Optional[str]:
        """
        Extract the target variable name from an assignment.

        Returns the text of the first direct child that is an identifier or
        an attribute access (obj.attr = ...), or None if there is none.
        """
        # NOTE(review): this scans *all* direct children, so if the left-hand
        # side is neither an identifier nor an attribute, a right-hand-side
        # identifier could be returned instead -- confirm against the AST shape.
        for child in node.children:
            if child.kind in (NodeKind.IDENTIFIER, NodeKind.ATTRIBUTE):
                return child.text
        return None

    def _get_referenced_variables(self, node: GenericNode) -> Set[str]:
        """
        Get all variable names referenced in a node.

        The walk starts with ``skip_first=True`` so that the identifier found
        by following first children from ``node`` (the assignment target) is
        not reported. Identifiers under a call node are always reported.
        """
        vars_found = set()

        def collect(n: GenericNode, skip_first: bool = False):
            if n.kind == NodeKind.IDENTIFIER:
                # Skip the first identifier in assignments (that's the target)
                if not skip_first:
                    vars_found.add(n.text)
            elif n.kind == NodeKind.CALL:
                # For function calls, every child counts as a reference.
                for child in n.children:
                    collect(child, False)
            else:
                # Only the first child inherits the "skip" flag.
                first_child = True
                for child in n.children:
                    collect(child, skip_first and first_child)
                    first_child = False

        collect(node, skip_first=True)
        return vars_found

    def _find_sources(self, ast: GenericNode, rule: TaintRule):
        """
        Find source patterns and mark matched variables as tainted.

        For each source match that is not itself a sanitizer match:
        - the variable assigned from the match is tainted, unless that very
          assignment contains a sanitizer (e.g. x = sanitize(source));
        - every captured metavariable text not already tainted is tainted,
          unless its node is a sanitizer match.
        """
        for source_pattern in rule.sources:
            for match in self.matcher.find_all(source_pattern, ast):
                # Check if this source match is sanitized directly
                if match.node and self._is_sanitized(match.node, rule):
                    continue

                # Use the assignment that actually receives this match. Looking
                # the assignment up by target name alone would pick the first
                # assignment to that name anywhere, which is the wrong one
                # whenever the variable is assigned more than once.
                assignment = self._find_receiving_assignment(match)

                if assignment is not None:
                    if self._is_assignment_sanitized(assignment, rule):
                        continue

                    self.tainted[assignment.target] = TaintedVariable(
                        name=assignment.target,
                        source_pattern=source_pattern.pattern_text,
                        source_line=match.line,
                        propagation_path=[f"Source: {source_pattern.pattern_text}"],
                    )

                # Check captured metavariables
                for meta_name, meta in match.metavariables.items():
                    if meta.text and meta.text not in self.tainted:
                        if meta.node and self._is_sanitized(meta.node, rule):
                            continue

                        self.tainted[meta.text] = TaintedVariable(
                            name=meta.text,
                            source_pattern=source_pattern.pattern_text,
                            source_line=match.line,
                            propagation_path=[f"Captured: {meta_name}={meta.text}"],
                        )

    def _get_assignment_by_target(self, target: str) -> Optional[VariableAssignment]:
        """Return the first collected assignment to ``target``, or None."""
        for assignment in self.assignments:
            if assignment.target == target:
                return assignment
        return None

    def _is_sanitized(self, node: GenericNode, rule: TaintRule) -> bool:
        """Check if a node itself matches one of the rule's sanitizers"""
        if not rule.sanitizers:
            return False
        return any(self.matcher.match(sanitizer, node) for sanitizer in rule.sanitizers)

    def _find_receiving_assignment(self, match: MatchResult) -> Optional[VariableAssignment]:
        """
        Find the assignment that receives the matched expression.

        Candidates are the assignments starting on the match's line. When the
        match has a node, the first candidate that lexically contains it wins
        (so ``a = 1; b = source()`` resolves to ``b``); otherwise the first
        candidate on the line is used.
        """
        same_line = [a for a in self.assignments if a.line == match.line]
        if not same_line:
            return None
        if match.node:
            for assignment in same_line:
                if self._contains_range(assignment.node, match.node):
                    return assignment
        return same_line[0]

    def _find_receiving_variable(self, match: MatchResult, ast: GenericNode) -> Optional[str]:
        """Find which variable receives the matched expression (None if none does)"""
        assignment = self._find_receiving_assignment(match)
        return assignment.target if assignment is not None else None

    def _propagate_taint(self, rule: TaintRule):
        """
        Propagate taint through variable assignments until a fixed point.

        An assignment taints its target when any of its source variables is
        tainted, unless the assignment contains a sanitizer. The new record
        keeps the original source and extends the propagation path.
        """
        if not self.tainted:
            return  # nothing to propagate

        # Whether an assignment is sanitized does not change between passes,
        # so it is evaluated once instead of on every pass.
        candidates = [
            assignment for assignment in self.assignments
            if not self._is_assignment_sanitized(assignment, rule)
        ]

        # Every pass that changes anything taints at least one new target, so
        # len(candidates) + 1 passes always reach the fixed point. A fixed cap
        # could stop early on long chains visited in reverse order.
        for _ in range(len(candidates) + 1):
            changed = False

            for assignment in candidates:
                if assignment.target in self.tainted:
                    continue

                for source_var in assignment.source_vars:
                    if source_var not in self.tainted:
                        continue

                    source_taint = self.tainted[source_var]
                    new_path = source_taint.propagation_path.copy()
                    new_path.append(
                        f"Line {assignment.line}: {assignment.target} = ... {source_var} ..."
                    )

                    self.tainted[assignment.target] = TaintedVariable(
                        name=assignment.target,
                        source_pattern=source_taint.source_pattern,
                        source_line=source_taint.source_line,
                        propagation_path=new_path,
                    )
                    changed = True
                    break  # one tainted source is enough for this target

            if not changed:
                break

    def _is_assignment_sanitized(self, assignment: VariableAssignment, rule: TaintRule) -> bool:
        """Check if a sanitizer matches anywhere inside the assignment node"""
        # NOTE(review): a sanitizer anywhere in the statement clears the whole
        # assignment, even if another operand is used unsanitized.
        if not rule.sanitizers:
            return False
        for sanitizer in rule.sanitizers:
            if self.matcher.find_all(sanitizer, assignment.node):
                return True
        return False

    def _check_sinks(self, ast: GenericNode, rule: TaintRule) -> List[Finding]:
        """
        Check if any tainted data reaches a sink.

        Emits one Finding per tainted, unsanitized variable usage inside each
        sink match. The message shows at most the last three propagation steps.
        """
        findings = []

        for sink_pattern in rule.sinks:
            for match in self.matcher.find_all(sink_pattern, ast):
                # Find all tainted variables used in this sink match
                for var_name, var_node in self._find_tainted_nodes_in_match(match):
                    # Skip usages wrapped by a sanitizer inside the sink.
                    if self._is_node_sanitized_in_context(var_node, match.node, rule):
                        continue

                    taint_info = self.tainted[var_name]
                    path_str = " -> ".join(taint_info.propagation_path[-3:])
                    message = f"{rule.message}\n\nTaint flow: {path_str}\n\nTainted variable '{var_name}' flows to sink."

                    findings.append(Finding(
                        rule_id=rule.id,
                        rule_name=rule.name,
                        message=message,
                        severity=rule.severity,
                        line=match.line,
                        column=match.column,
                        text=match.node.text if match.node else "",
                        end_line=match.node.end_line if match.node else match.line,
                        end_column=match.node.end_column if match.node else match.column,
                        metavariables={k: v.text for k, v in match.metavariables.items()},
                        metadata={
                            # Tolerate rules whose metadata is unset/None.
                            **(rule.metadata or {}),
                            'taint_source': taint_info.source_pattern,
                            'taint_source_line': taint_info.source_line,
                            'tainted_variable': var_name,
                        },
                    ))

        return findings

    def _find_tainted_nodes_in_match(self, match: MatchResult) -> List[Tuple[str, GenericNode]]:
        """
        Find tainted variables and their nodes within a match.

        Captured metavariables are checked first; only if none of them is
        tainted are the identifiers under the match node scanned.
        """
        results = [
            (meta.text, meta.node)
            for meta in match.metavariables.values()
            if meta.text in self.tainted and meta.node
        ]

        # Fallback: no tainted metavariable, so check all identifiers in the
        # matched node.
        if not results and match.node:
            results = [
                (var_name, var_node)
                for var_name, var_node in self._get_referenced_variables_with_nodes(match.node)
                if var_name in self.tainted
            ]

        return results

    def _get_referenced_variables_with_nodes(self, node: GenericNode) -> List[Tuple[str, GenericNode]]:
        """Get (name, node) for every identifier under ``node``, in pre-order"""
        vars_found = []

        def collect(n: GenericNode):
            if n.kind == NodeKind.IDENTIFIER:
                vars_found.append((n.text, n))
            for child in n.children:
                collect(child)

        collect(node)
        return vars_found

    def _is_node_sanitized_in_context(self, target: GenericNode, context: GenericNode, rule: TaintRule) -> bool:
        """Check if target node is inside a sanitizer match found within context"""
        if not rule.sanitizers:
            return False

        for sanitizer in rule.sanitizers:
            for s_match in self.matcher.find_all(sanitizer, context):
                if s_match.node and self._contains_range(s_match.node, target):
                    return True
        return False

    def _contains_range(self, outer: GenericNode, inner: GenericNode) -> bool:
        """
        Check if outer node lexically contains inner node.

        Compares (line, column) start and end positions only; identical
        ranges count as contained.
        """
        starts_before = (outer.line, outer.column) <= (inner.line, inner.column)
        ends_after = (outer.end_line, outer.end_column) >= (inner.end_line, inner.end_column)
        return starts_before and ends_after
338
+
339
+ # _find_tainted_in_match removed (replaced by _find_tainted_nodes_in_match)
340
+
341
+
342
def analyze_taint(ast: GenericNode, rules: List[TaintRule]) -> List[Finding]:
    """Run taint analysis on ``ast`` with ``rules`` using a fresh TaintAnalyzer."""
    return TaintAnalyzer().analyze(ast, rules)
346
+
347
+
348
if __name__ == '__main__':
    # Smoke check when the module is run directly: print a short usage hint.
    for banner in (
        "TaintAnalyzer module loaded successfully",
        "Use analyze_taint(ast, rules) to run taint analysis",
    ):
        print(banner)