agent-security-scanner-mcp 1.5.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +127 -105
- package/analyzer.py +140 -64
- package/ast_parser.py +296 -0
- package/generic_ast.py +572 -0
- package/index.js +422 -7
- package/package.json +11 -2
- package/pattern_matcher.py +550 -0
- package/regex_fallback.py +466 -0
- package/requirements.txt +13 -0
- package/semgrep_loader.py +570 -0
- package/taint_analyzer.py +351 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Taint Analysis Engine
|
|
3
|
+
|
|
4
|
+
Tracks dataflow from user-controlled sources to dangerous sinks.
|
|
5
|
+
Detects vulnerabilities only when tainted data reaches a sink.
|
|
6
|
+
|
|
7
|
+
Algorithm:
|
|
8
|
+
1. Find source patterns (user input) and mark matched variables as tainted
|
|
9
|
+
2. Track taint through assignments (y = x means y inherits x's taint)
|
|
10
|
+
3. Check if any tainted variable reaches a sink pattern
|
|
11
|
+
4. Report vulnerability if tainted data flows to sink
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from typing import List, Dict, Set, Optional, Tuple, Any
|
|
16
|
+
from generic_ast import GenericNode, NodeKind
|
|
17
|
+
from pattern_matcher import (
|
|
18
|
+
Pattern, TaintRule, Finding, PatternMatcher,
|
|
19
|
+
MatchResult, create_pattern
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class TaintedVariable:
    """Record of one tainted variable and where its taint came from.

    Instances are stored in ``TaintAnalyzer.tainted`` keyed by ``name``.
    """

    # Variable name (or captured metavariable text) considered tainted.
    name: str
    # Text of the source pattern whose match introduced the taint.
    source_pattern: str
    # Line of the source match where the taint originated.
    source_line: int
    # Human-readable steps describing how the taint reached this variable;
    # a fresh list per instance (default_factory avoids a shared default).
    propagation_path: List[str] = field(default_factory=list)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class VariableAssignment:
    """One assignment statement found in the AST.

    Built by ``TaintAnalyzer._collect_assignments`` and consumed when
    locating taint sources and propagating taint.
    """

    # Name (identifier or attribute text) being assigned to.
    target: str
    # Names referenced by the assignment, with the target itself removed.
    source_vars: Set[str]
    # Position of the assignment node (copied from node.line / node.column).
    line: int
    column: int
    # The assignment's AST node, kept so sanitizer patterns can be matched
    # against the whole statement later.
    node: GenericNode
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class TaintAnalyzer:
    """
    Performs taint analysis on an AST using TaintRule definitions.

    Tracks how data flows from sources (user input) to sinks (dangerous
    functions). Taint is tracked per variable *name*: assignments are
    collected from the whole tree and propagation runs to a fixpoint, so
    statement order and scoping are not taken into account.
    """

    def __init__(self):
        """Create an analyzer with its own pattern matcher and empty state."""
        self.matcher = PatternMatcher()
        # Variable name -> taint record; reset for every rule.
        self.tainted: Dict[str, TaintedVariable] = {}
        # Every assignment found in the AST; reset for every rule.
        self.assignments: List[VariableAssignment] = []

    def analyze(self, ast: GenericNode, rules: List[TaintRule]) -> List[Finding]:
        """
        Analyze AST for taint vulnerabilities using the provided rules.

        Each rule is evaluated independently (analyzer state is reset per
        rule). Returns the findings of all rules, in rule order, where
        tainted data reaches a sink.
        """
        all_findings = []
        for rule in rules:
            all_findings.extend(self._analyze_rule(ast, rule))
        return all_findings

    def _analyze_rule(self, ast: GenericNode, rule: TaintRule) -> List[Finding]:
        """Run the four analysis steps for a single taint rule."""
        # Reset state so taint found for a previous rule does not leak in.
        self.tainted = {}
        self.assignments = []

        # Step 1: collect all variable assignments.
        self._collect_assignments(ast)

        # Step 2: find sources and mark the initial tainted variables.
        self._find_sources(ast, rule)

        # Step 3: propagate taint through assignments.
        self._propagate_taint(rule)

        # Step 4: report sinks that receive tainted input.
        return self._check_sinks(ast, rule)

    def _collect_assignments(self, node: GenericNode):
        """Recursively record every assignment under ``node`` in ``self.assignments``."""
        if node.kind == NodeKind.ASSIGNMENT:
            target = self._get_assignment_target(node)
            source_vars = self._get_referenced_variables(node)

            if target:
                # The target is never its own source (x = x + 1 must not
                # list x among the names it depends on).
                source_vars.discard(target)

                self.assignments.append(VariableAssignment(
                    target=target,
                    source_vars=source_vars,
                    line=node.line,
                    column=node.column,
                    node=node
                ))

        # Assignments may be nested anywhere, so always descend.
        for child in node.children:
            self._collect_assignments(child)

    def _get_assignment_target(self, node: GenericNode) -> Optional[str]:
        """
        Return the text of the assignment's target, or None if not found.

        The target is taken to be the first direct child that is an
        identifier or an attribute access (``obj.attr = ...``).
        """
        for child in node.children:
            if child.kind in (NodeKind.IDENTIFIER, NodeKind.ATTRIBUTE):
                return child.text
        return None

    def _get_referenced_variables(self, node: GenericNode) -> Set[str]:
        """
        Return the set of identifier texts referenced under ``node``.

        The walk starts in "skip first" mode so that the leading identifier
        (the assignment target) is not reported as a referenced variable.
        """
        vars_found = set()

        def collect(n: GenericNode, skip_first: bool = False):
            if n.kind == NodeKind.IDENTIFIER:
                # In skip mode this identifier is the assignment target.
                if not skip_first:
                    vars_found.add(n.text)
            elif n.kind == NodeKind.CALL:
                # Everything under a call counts as referenced; skip mode
                # never applies inside it.
                for child in n.children:
                    collect(child, False)
            else:
                # Skip mode is only handed down to the first child.
                first_child = True
                for child in n.children:
                    collect(child, skip_first and first_child)
                    first_child = False

        collect(node, skip_first=True)
        return vars_found

    def _find_sources(self, ast: GenericNode, rule: TaintRule):
        """
        Find source patterns and mark matched variables as tainted.

        For every source match that is not itself a sanitizer match:
        * the variable assigned on the match's line becomes tainted, unless
          that very assignment contains a sanitizer;
        * every captured metavariable text becomes tainted, unless its node
          matches a sanitizer or the text is already tainted.
        """
        for source_pattern in rule.sources:
            matches = self.matcher.find_all(source_pattern, ast)

            for match in matches:
                # Source expression is itself a sanitizer match: ignore it.
                if match.node and self._is_sanitized(match.node, rule):
                    continue

                # Use the assignment on the matched line for the sanitizer
                # check. Looking the assignment up by target name instead
                # would pick the first assignment to that name anywhere in
                # the file, which is wrong when a variable is reassigned.
                assignment = self._find_receiving_assignment(match)

                if assignment is not None:
                    # e.g. x = sanitize(source): nothing from this match
                    # is tainted.
                    if self._is_assignment_sanitized(assignment, rule):
                        continue

                    tainted_var = assignment.target
                    self.tainted[tainted_var] = TaintedVariable(
                        name=tainted_var,
                        source_pattern=source_pattern.pattern_text,
                        source_line=match.line,
                        propagation_path=[f"Source: {source_pattern.pattern_text}"]
                    )

                # Captured metavariables are tainted as well.
                for meta_name, meta in match.metavariables.items():
                    if meta.text and meta.text not in self.tainted:
                        if meta.node and self._is_sanitized(meta.node, rule):
                            continue

                        self.tainted[meta.text] = TaintedVariable(
                            name=meta.text,
                            source_pattern=source_pattern.pattern_text,
                            source_line=match.line,
                            propagation_path=[f"Captured: {meta_name}={meta.text}"]
                        )

    def _get_assignment_by_target(self, target: str) -> Optional[VariableAssignment]:
        """Return the first collected assignment whose target is ``target``, or None."""
        for assignment in self.assignments:
            if assignment.target == target:
                return assignment
        return None

    def _is_sanitized(self, node: GenericNode, rule: TaintRule) -> bool:
        """Return True if ``node`` itself matches any of the rule's sanitizers."""
        if not rule.sanitizers:
            return False
        return any(self.matcher.match(sanitizer, node) for sanitizer in rule.sanitizers)

    def _find_receiving_assignment(self, match: MatchResult) -> Optional[VariableAssignment]:
        """
        Return the first collected assignment located on the match's line.

        Matching is by line number only, so with several assignments on one
        line the first collected one wins.
        """
        for assignment in self.assignments:
            if assignment.line == match.line:
                return assignment
        return None

    def _find_receiving_variable(self, match: MatchResult, ast: GenericNode) -> Optional[str]:
        """
        Find which variable receives the matched expression.

        Returns the target of the assignment on the match's line, or None.
        ``ast`` is not used; it is kept for interface compatibility.
        """
        assignment = self._find_receiving_assignment(match)
        return assignment.target if assignment is not None else None

    def _propagate_taint(self, rule: TaintRule):
        """
        Propagate taint through variable assignments until nothing changes.

        An untainted target becomes tainted when any of its source variables
        is tainted and the assignment contains no sanitizer. The new record
        inherits the source's origin and extends its propagation path.
        """
        changed = True
        iterations = 0
        max_iterations = 100  # safety cap on the number of fixpoint passes

        while changed and iterations < max_iterations:
            changed = False
            iterations += 1

            for assignment in self.assignments:
                if assignment.target in self.tainted:
                    continue

                # Cheap set lookups first: the sanitizer check invokes the
                # pattern matcher and is only needed when there is taint to
                # propagate.
                source_var = next(
                    (var for var in assignment.source_vars if var in self.tainted),
                    None,
                )
                if source_var is None:
                    continue

                if self._is_assignment_sanitized(assignment, rule):
                    continue

                source_taint = self.tainted[source_var]
                new_path = source_taint.propagation_path.copy()
                new_path.append(f"Line {assignment.line}: {assignment.target} = ... {source_var} ...")

                self.tainted[assignment.target] = TaintedVariable(
                    name=assignment.target,
                    source_pattern=source_taint.source_pattern,
                    source_line=source_taint.source_line,
                    propagation_path=new_path
                )
                changed = True

    def _is_assignment_sanitized(self, assignment: VariableAssignment, rule: TaintRule) -> bool:
        """Return True if any sanitizer pattern matches somewhere in the assignment node."""
        if not rule.sanitizers:
            return False
        return any(
            self.matcher.find_all(sanitizer, assignment.node)
            for sanitizer in rule.sanitizers
        )

    def _check_sinks(self, ast: GenericNode, rule: TaintRule) -> List[Finding]:
        """
        Check if any tainted data reaches a sink.

        Produces one Finding per tainted variable occurrence inside each
        sink match, skipping occurrences that sit inside a sanitizer.
        """
        findings = []

        for sink_pattern in rule.sinks:
            matches = self.matcher.find_all(sink_pattern, ast)

            for match in matches:
                tainted_nodes = self._find_tainted_nodes_in_match(match)

                for var_name, var_node in tainted_nodes:
                    # Usage wrapped in a sanitizer inside the sink: safe.
                    if self._is_node_sanitized_in_context(var_node, match.node, rule):
                        continue

                    taint_info = self.tainted[var_name]
                    # Only the last three steps are shown to keep messages short.
                    path_str = " -> ".join(taint_info.propagation_path[-3:])
                    message = f"{rule.message}\n\nTaint flow: {path_str}\n\nTainted variable '{var_name}' flows to sink."

                    findings.append(Finding(
                        rule_id=rule.id,
                        rule_name=rule.name,
                        message=message,
                        severity=rule.severity,
                        line=match.line,
                        column=match.column,
                        text=match.node.text if match.node else "",
                        end_line=match.node.end_line if match.node else match.line,
                        end_column=match.node.end_column if match.node else match.column,
                        metavariables={k: v.text for k, v in match.metavariables.items()},
                        metadata={
                            **rule.metadata,
                            'taint_source': taint_info.source_pattern,
                            'taint_source_line': taint_info.source_line,
                            'tainted_variable': var_name
                        }
                    ))

        return findings

    def _find_tainted_nodes_in_match(self, match: MatchResult) -> List[Tuple[str, GenericNode]]:
        """
        Find tainted variables and their nodes within a match.

        Captured metavariables are checked first; only when none of them is
        tainted are all identifiers under the match node examined.
        """
        results = []

        for meta in match.metavariables.values():
            if meta.text in self.tainted and meta.node:
                results.append((meta.text, meta.node))

        # Fallback: no tainted metavariable, so scan the matched subtree.
        if not results and match.node:
            results.extend(
                (var_name, var_node)
                for var_name, var_node in self._get_referenced_variables_with_nodes(match.node)
                if var_name in self.tainted
            )

        return results

    def _get_referenced_variables_with_nodes(self, node: GenericNode) -> List[Tuple[str, GenericNode]]:
        """Return (text, node) for every identifier under ``node``, in pre-order."""
        vars_found = []

        def collect(n: GenericNode):
            if n.kind == NodeKind.IDENTIFIER:
                vars_found.append((n.text, n))
            for child in n.children:
                collect(child)

        collect(node)
        return vars_found

    def _is_node_sanitized_in_context(self, target: GenericNode, context: GenericNode, rule: TaintRule) -> bool:
        """
        Check if target node is inside a sanitizer within context.

        Returns False when the rule has no sanitizers or when ``context`` is
        None (a sink match is not guaranteed to carry a node).
        """
        if not rule.sanitizers or context is None:
            return False

        for sanitizer in rule.sanitizers:
            for s_match in self.matcher.find_all(sanitizer, context):
                if s_match.node and self._contains_range(s_match.node, target):
                    return True
        return False

    def _contains_range(self, outer: GenericNode, inner: GenericNode) -> bool:
        """
        Check if outer node lexically contains inner node.

        Compares (line, column) start and end positions; identical ranges
        count as contained.
        """
        starts_before = outer.line < inner.line or (
            outer.line == inner.line and outer.column <= inner.column
        )
        if not starts_before:
            return False
        return outer.end_line > inner.end_line or (
            outer.end_line == inner.end_line and outer.end_column >= inner.end_column
        )
|
|
338
|
+
|
|
339
|
+
# _find_tainted_in_match removed (replaced by _find_tainted_nodes_in_match)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def analyze_taint(ast: GenericNode, rules: List[TaintRule]) -> List[Finding]:
    """Convenience function to run taint analysis.

    Creates a fresh ``TaintAnalyzer`` and returns the findings of
    ``TaintAnalyzer.analyze(ast, rules)``.
    """
    return TaintAnalyzer().analyze(ast, rules)
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
if __name__ == '__main__':
    # Smoke check when the module is run directly: print a confirmation and
    # a usage hint; no analysis is performed.
    print("TaintAnalyzer module loaded successfully")
    print("Use analyze_taint(ast, rules) to run taint analysis")
|