agent-security-scanner-mcp 1.4.9 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,550 @@
1
+ """
2
+ Pattern Matcher Module - Semgrep-style AST Pattern Matching
3
+
4
+ Implements pattern matching with metavariables ($VAR, $FUNC, etc.)
5
+ against the generic AST representation.
6
+ """
7
+
8
+ import re
9
+ from dataclasses import dataclass, field
10
+ from typing import List, Dict, Optional, Any, Tuple, Callable
11
+ from enum import Enum
12
+
13
+ from generic_ast import GenericNode, NodeKind
14
+
15
+
16
class PatternType(Enum):
    """Types of pattern operators (mirrors Semgrep's rule operator names)."""
    PATTERN = "pattern"                # Match this pattern
    PATTERN_NOT = "pattern-not"        # Exclude matches of this pattern
    PATTERN_INSIDE = "pattern-inside"  # Match only inside this context
    PATTERN_EITHER = "pattern-either"  # Match any of these patterns (OR)
    PATTERNS = "patterns"              # Match all of these patterns (AND)
23
+
24
+
25
@dataclass
class Metavariable:
    """A captured metavariable binding, e.g. $VAR or $FUNC.

    Records which AST node the metavariable was bound to and that node's
    source text, so rule messages can interpolate the capture later.
    """
    name: str          # The metavariable as written in the pattern, e.g. "$VAR"
    node: GenericNode  # The AST node this metavariable matched
    text: str          # Source text of the matched node
31
+
32
+
33
@dataclass
class MatchResult:
    """Result of matching one pattern against one node.

    Truthiness mirrors ``matched``, so a result can be used directly in a
    conditional: ``if matcher.match(p, node): ...``
    """
    matched: bool                       # True when the pattern matched
    node: Optional[GenericNode] = None  # The matched node, when available
    metavariables: Dict[str, Metavariable] = field(default_factory=dict)  # Captured bindings
    line: int = 0    # Line of the match (0 when unknown)
    column: int = 0  # Column of the match (0 when unknown)

    def __bool__(self) -> bool:
        # Shorthand: `if result:` is equivalent to `if result.matched:`
        return self.matched
44
+
45
+
46
@dataclass
class Pattern:
    """
    Represents a pattern to match against AST nodes.

    Patterns support:
    - Metavariables: $VAR matches any identifier, $FUNC matches function name
    - Ellipsis: ... matches any sequence of nodes
    - Literal text: matches exact text
    - Regex: matches node text against regex
    """
    pattern_text: str                                # Raw pattern source
    pattern_type: PatternType = PatternType.PATTERN  # How this pattern combines with others
    is_regex: bool = False                           # Treat pattern_text as a regular expression

    # Parsed components (populated by __post_init__)
    _tokens: List[str] = field(default_factory=list)  # Token stream for structural patterns
    _regex_obj: Optional[re.Pattern] = None           # Compiled regex for is_regex patterns
    _is_parsed: bool = False                          # Guards against re-tokenizing

    def __post_init__(self) -> None:
        """Compile (regex mode) or tokenize (structural mode) exactly once."""
        if self.is_regex:
            try:
                self._regex_obj = re.compile(self.pattern_text)
            except re.error:
                # Invalid regex: _regex_obj stays None, so the matcher treats
                # this pattern as never matching (a silent no-op).
                pass
        elif not self._is_parsed:
            self._parse()

    def _parse(self) -> None:
        """Parse the pattern into tokens"""
        # Tokenize the pattern
        # Match: metavariables ($VAR), ellipsis (...), identifiers with dots,
        # strings, individual punctuation marks
        # Important: Match ellipsis BEFORE individual dots, and parentheses individually
        token_pattern = r'(\$[A-Z_][A-Z0-9_]*|\.\.\.|\w+(?:\.\w+)*|"[^"]*"|\'[^\']*\'|[()\[\],=+\-*/<>!&|]|\s+)'
        # Whitespace runs are matched by the final alternative and then
        # discarded via the strip() filter below.
        self._tokens = [t for t in re.findall(token_pattern, self.pattern_text) if t.strip()]
        self._is_parsed = True
85
+
86
+
87
class PatternMatcher:
    """
    Matches Semgrep-style patterns against generic AST nodes.

    Example patterns:
    - "$FUNC($ARG)" - matches any function call
    - "$VAR = $EXPR" - matches any assignment
    - "cursor.execute($SQL)" - matches specific function call
    - "eval(...)" - matches eval with any arguments
    """

    # Metavariable pattern: '$' followed by an uppercase identifier
    METAVAR_RE = re.compile(r'^\$[A-Z_][A-Z0-9_]*$')

    def __init__(self):
        self.debug = False  # Reserved for verbose match tracing

    def match(self, pattern: Pattern, node: GenericNode) -> MatchResult:
        """
        Match a pattern against a node and all its descendants.
        Returns the first match found (pre-order traversal).
        """
        # Try matching at this node
        result = self._match_node(pattern, node)
        if result.matched:
            return result

        # Recursively try children
        for child in node.children:
            result = self.match(pattern, child)
            if result.matched:
                return result

        return MatchResult(matched=False)

    def find_all(self, pattern: Pattern, node: GenericNode) -> List[MatchResult]:
        """Find all matches of the pattern in the AST (pre-order)."""
        results = []

        # Try matching at this node
        result = self._match_node(pattern, node)
        if result.matched:
            results.append(result)

        # Recursively search children
        for child in node.children:
            results.extend(self.find_all(pattern, child))

        return results

    def _match_node(self, pattern: Pattern, node: GenericNode) -> MatchResult:
        """Try to match a pattern at a specific node (no recursion into children)."""
        # Handle regex patterns: they match against the node's raw text.
        if pattern.is_regex and pattern._regex_obj:
            match = pattern._regex_obj.search(node.text)
            if match:
                # Calculate correct line number based on match offset
                match_start = match.start()
                relative_line = node.text[:match_start].count('\n')
                actual_line = node.line + relative_line

                return MatchResult(
                    matched=True,
                    node=node,
                    line=actual_line,
                    column=node.column  # Approximate column
                )
            return MatchResult(matched=False)

        tokens = pattern._tokens

        if not tokens:
            return MatchResult(matched=False)

        # Skip patterns that don't have concrete anchors (would match too broadly)
        if not self._has_concrete_anchor(tokens):
            return MatchResult(matched=False)

        metavariables: Dict[str, Metavariable] = {}

        # Function call pattern: $FUNC(...) or func_name(...)
        if self._is_call_pattern(tokens):
            if node.kind == NodeKind.CALL:
                func_token = tokens[0]

                # Check function name
                if self.METAVAR_RE.match(func_token):
                    # Metavariable - matches any function
                    if node.name:
                        metavariables[func_token] = Metavariable(
                            name=func_token,
                            node=node,
                            text=node.name
                        )
                elif node.name and not self._text_matches(func_token, node.name):
                    return MatchResult(matched=False)

                # Match arguments
                arg_match = self._match_arguments(tokens, node, metavariables)
                if arg_match:
                    return MatchResult(
                        matched=True,
                        node=node,
                        metavariables=metavariables,
                        line=node.line,
                        column=node.column
                    )

        # Assignment pattern: $VAR = $EXPR
        elif self._is_assignment_pattern(tokens):
            if node.kind == NodeKind.ASSIGNMENT:
                target_token = tokens[0]
                value_start_idx = 2  # Token right after "="

                if node.target:
                    if self.METAVAR_RE.match(target_token):
                        metavariables[target_token] = Metavariable(
                            name=target_token,
                            node=node.target,
                            text=node.target.text
                        )
                    elif not self._text_matches(target_token, node.target.text):
                        return MatchResult(matched=False)

                if node.value and value_start_idx < len(tokens):
                    value_token = tokens[value_start_idx]
                    if self.METAVAR_RE.match(value_token):
                        metavariables[value_token] = Metavariable(
                            name=value_token,
                            node=node.value,
                            text=node.value.text
                        )
                    elif len(tokens) == value_start_idx + 1 and \
                            not self._text_matches(value_token, node.value.text):
                        # FIX: a single-token literal value must actually match
                        # the assigned value; previously literal values were
                        # never compared, so e.g. pattern 'x = secret' matched
                        # ANY assignment to x. Multi-token value expressions
                        # are still accepted loosely (token-by-token comparison
                        # would be too imprecise).
                        return MatchResult(matched=False)

                return MatchResult(
                    matched=True,
                    node=node,
                    metavariables=metavariables,
                    line=node.line,
                    column=node.column
                )

        # NOTE: Single-token metavariable patterns (like $VAR) are intentionally
        # NOT matched here - they would match every node in the AST.
        # Metavariables should only capture within larger patterns.

        # Literal text match (for identifiers, strings, etc.)
        elif len(tokens) == 1:
            if self._text_matches(tokens[0], node.text.strip()):
                return MatchResult(
                    matched=True,
                    node=node,
                    line=node.line,
                    column=node.column
                )

        return MatchResult(matched=False)

    def _has_concrete_anchor(self, tokens: List[str]) -> bool:
        """Check if pattern has at least one non-metavariable concrete token.

        Patterns without concrete anchors (like just '$VAR' or '$FUNC(...)')
        would match too broadly. We require at least one literal token.
        """
        for t in tokens:
            # Skip metavariables, punctuation, and ellipsis
            if self.METAVAR_RE.match(t):
                continue
            if t in ('(', ')', ',', '...', '=', '+', '-', '*', '/', '[', ']'):
                continue
            # This is a concrete token (function name, identifier, etc.)
            return True
        return False

    def _is_call_pattern(self, tokens: List[str]) -> bool:
        """Check if tokens represent a function call pattern"""
        if len(tokens) < 3:
            return False
        return '(' in tokens and ')' in tokens

    def _is_assignment_pattern(self, tokens: List[str]) -> bool:
        """Check if tokens represent an assignment pattern"""
        return '=' in tokens and tokens.index('=') > 0

    def _match_arguments(self, tokens: List[str], node: GenericNode,
                         metavariables: Dict[str, Metavariable]) -> bool:
        """Match function arguments against pattern.

        On success, returns True and adds any captured argument
        metavariables to `metavariables` (mutated in place).
        """
        # Find the argument section in tokens.
        try:
            paren_start = tokens.index('(')
            # FIX: use the LAST ')' so nested parentheses inside the argument
            # list (e.g. "foo(bar($X))") don't truncate the token slice; the
            # original used tokens.index(')'), i.e. the first one.
            paren_end = len(tokens) - 1 - tokens[::-1].index(')')
        except ValueError:
            return False

        arg_tokens = tokens[paren_start + 1:paren_end]

        # Ellipsis matches any arguments
        if arg_tokens == ['...'] or arg_tokens == []:
            return True

        # Match individual arguments (commas are separators, not arguments)
        token_args = [t for t in arg_tokens if t not in (',',)]
        node_args = node.args

        # Split tokens by ellipsis '...'
        segments = []
        current_segment = []
        for t in token_args:
            if t == '...':
                segments.append(current_segment)
                current_segment = []
            else:
                current_segment.append(t)
        segments.append(current_segment)

        # Helper to match a segment at a specific index
        def match_segment(segment, start_idx, out_metavars) -> bool:
            for i, token in enumerate(segment):
                if start_idx + i >= len(node_args):
                    return False
                node_arg = node_args[start_idx + i]

                if self.METAVAR_RE.match(token):
                    out_metavars[token] = Metavariable(
                        name=token, node=node_arg, text=node_arg.text
                    )
                elif not self._text_matches(token, node_arg.text):
                    return False
            return True

        # Case 1: No ellipsis - arity must match exactly
        if len(segments) == 1:
            if len(node_args) != len(segments[0]):
                return False
            return match_segment(segments[0], 0, metavariables)

        # Case 2: Using ellipsis
        node_idx = 0

        # Match first segment (anchored at start)
        first_seg = segments[0]
        if first_seg:
            if not match_segment(first_seg, 0, metavariables):
                return False
            node_idx += len(first_seg)

        # Match middle segments (search forward)
        for i in range(1, len(segments) - 1):
            seg = segments[i]
            if not seg:
                continue

            found_idx = -1
            # Search for segment in remaining args
            # Optimization: ensure enough space for remaining segments? ignoring for now
            for k in range(node_idx, len(node_args) - len(seg) + 1):
                temp_metavars = {}
                if match_segment(seg, k, temp_metavars):
                    found_idx = k
                    metavariables.update(temp_metavars)
                    break

            if found_idx == -1:
                return False
            node_idx = found_idx + len(seg)

        # Match last segment (anchored at end)
        last_seg = segments[-1]
        if last_seg:
            remaining_len = len(node_args) - node_idx
            if len(last_seg) > remaining_len:
                return False

            # Must match exactly at the end
            start_k = len(node_args) - len(last_seg)
            return match_segment(last_seg, start_k, metavariables)

        return True

    def _text_matches(self, pattern_text: str, node_text: str) -> bool:
        """Check if node text matches pattern text.

        Comparison is case-sensitive and exact, with surrounding quotes
        stripped from both sides; a prefix followed by '.' also matches so
        that e.g. pattern 'cursor' matches 'cursor.execute'.
        (The original docstring claimed case-insensitivity; the code never was.)
        """
        # Remove quotes if present
        pattern_text = pattern_text.strip('"\'')
        node_text = node_text.strip('"\'').strip()

        # Exact match or starts with (for method chains)
        return pattern_text == node_text or node_text.startswith(pattern_text + '.')
372
+
373
+
374
@dataclass
class Rule:
    """A security rule with patterns to match"""
    id: str        # Stable rule identifier
    name: str      # Human-readable rule name
    patterns: List[Pattern]  # Main patterns; a match on any of these fires the rule
    message: str   # Message attached to findings
    severity: str = "warning"  # Severity label copied onto findings
    languages: List[str] = field(default_factory=lambda: ["generic"])  # Languages the rule applies to
    metadata: Dict[str, Any] = field(default_factory=dict)  # Free-form rule metadata

    # Pattern operators
    pattern_not: List[Pattern] = field(default_factory=list)     # Exclusion patterns (suppress a match)
    pattern_inside: List[Pattern] = field(default_factory=list)  # Context patterns; NOTE: not consumed by RuleEngine.apply_rule in this file
388
+
389
+
390
@dataclass
class TaintRule:
    """A taint analysis rule with sources, sinks, and optional sanitizers.

    Taint analysis tracks data flow from sources (user input) to sinks
    (dangerous functions). A vulnerability is reported when tainted data
    reaches a sink without being sanitized.

    Example:
        sources: [request.args.get(...), input(...)]
        sinks: [subprocess.run($CMD), eval($CODE)]
        sanitizers: [shlex.quote(...)]
    """
    id: str    # Stable rule identifier
    name: str  # Human-readable rule name
    sources: List[Pattern]  # Patterns that introduce tainted data
    sinks: List[Pattern]    # Patterns where tainted data is dangerous
    message: str            # Message attached to findings
    severity: str = "error" # Severity label for findings
    languages: List[str] = field(default_factory=lambda: ["generic"])  # Languages the rule applies to
    metadata: Dict[str, Any] = field(default_factory=dict)             # Free-form rule metadata

    # Optional: patterns that clean/sanitize tainted data
    sanitizers: List[Pattern] = field(default_factory=list)

    # Optional: patterns that propagate taint (default: all assignments propagate)
    propagators: List[Pattern] = field(default_factory=list)
417
+
418
+
419
@dataclass
class Finding:
    """A single security issue produced by pattern matching.

    Plain value object: instances compare field-by-field (dataclass
    equality), which makes de-duplication in callers straightforward.
    """
    rule_id: str         # Identifier of the rule that fired
    rule_name: str       # Human-readable rule name
    message: str         # Explanation shown to the user
    severity: str        # Severity label, e.g. "warning" or "error"
    line: int            # Start line of the finding
    column: int          # Start column of the finding
    text: str            # Source text of the matched node
    end_line: int = 0    # End line (0 when unknown)
    end_column: int = 0  # End column (0 when unknown)
    metavariables: Dict[str, str] = field(default_factory=dict)  # Captured metavariable texts by name
    metadata: Dict[str, Any] = field(default_factory=dict)       # Metadata copied from the rule
433
+
434
+
435
class RuleEngine:
    """Applies security rules to an AST and collects findings."""

    def __init__(self):
        self.matcher = PatternMatcher()

    def apply_rule(self, rule: Rule, ast: GenericNode) -> List[Finding]:
        """Run one rule over the AST, honoring its pattern-not exclusions."""
        findings: List[Finding] = []

        for pattern in rule.patterns:
            for match in self.matcher.find_all(pattern, ast):
                # Suppress this match if any pattern-not also matches at the node.
                if match.node and any(
                    self.matcher.match(excl, match.node).matched
                    for excl in rule.pattern_not
                ):
                    continue

                # Prefer precise span/text from the matched node when present.
                if match.node:
                    end_line, end_column = match.node.end_line, match.node.end_column
                    snippet = match.node.text
                else:
                    end_line, end_column = match.line, match.column
                    snippet = ""

                findings.append(Finding(
                    rule_id=rule.id,
                    rule_name=rule.name,
                    message=rule.message,
                    severity=rule.severity,
                    line=match.line,
                    column=match.column,
                    end_line=end_line,
                    end_column=end_column,
                    text=snippet,
                    metavariables={name: mv.text for name, mv in match.metavariables.items()},
                    metadata=rule.metadata,
                ))

        return findings

    def apply_rules(self, rules: List[Rule], ast: GenericNode) -> List[Finding]:
        """Apply every rule in turn and concatenate all findings."""
        collected: List[Finding] = []
        for rule in rules:
            collected.extend(self.apply_rule(rule, ast))
        return collected
487
+
488
+
489
+ # Convenience functions
490
def create_pattern(pattern_text: str) -> Pattern:
    """Create a structural (non-regex) Pattern from text."""
    return Pattern(pattern_text=pattern_text)
493
+
494
+
495
def match_pattern(pattern_text: str, node: GenericNode) -> MatchResult:
    """Convenience wrapper: build a pattern and return its first match in *node*."""
    return PatternMatcher().match(create_pattern(pattern_text), node)
500
+
501
+
502
def find_all_matches(pattern_text: str, node: GenericNode) -> List[MatchResult]:
    """Convenience wrapper: build a pattern and return every match in *node*."""
    return PatternMatcher().find_all(create_pattern(pattern_text), node)
507
+
508
+
509
if __name__ == '__main__':
    # Ad-hoc smoke test: parse a small snippet with the project's parser
    # wrappers and run a few representative patterns over the resulting AST.
    import sys
    sys.path.insert(0, '.')

    from ast_parser import ASTParser
    from generic_ast import convert_tree

    # Test patterns
    test_code = '''
import os
password = "secret123"
cursor.execute("SELECT * FROM users WHERE id = " + user_id)
eval(user_input)
'''

    parser = ASTParser()
    result = parser.parse_string(test_code, 'python')

    if result.success:
        ast = convert_tree(result.tree, 'python', result.source_bytes)

        # Test patterns
        patterns_to_test = [
            "$FUNC($ARG)",
            "$VAR = $EXPR",
            "eval(...)",
            "cursor.execute($SQL)",
        ]

        print("Test code:")
        print(test_code)
        print("\nPattern matching results:")
        print("-" * 50)

        # Report each pattern's matches with line numbers and captured
        # metavariables so the output can be eyeballed against test_code.
        for pattern_text in patterns_to_test:
            matches = find_all_matches(pattern_text, ast)
            print(f"\nPattern: {pattern_text}")
            print(f"  Matches: {len(matches)}")
            for m in matches:
                print(f"    Line {m.line}: {m.node.text[:50] if m.node else 'N/A'}...")
                if m.metavariables:
                    print(f"      Captured: {', '.join(f'{k}={v.text}' for k, v in m.metavariables.items())}")