agent-security-scanner-mcp 1.4.9 → 2.0.0

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -0,0 +1,570 @@
+ """
+ Semgrep YAML Rule Loader
+ Converts Semgrep-style YAML rules to our AST pattern format
+ """
+
+ import os
+ import yaml
+ from typing import List, Dict, Any, Optional
+ from pathlib import Path
+
+ try:
+     from pattern_matcher import Rule, TaintRule, Pattern, create_pattern
+     HAS_PATTERN_MATCHER = True
+ except ImportError:
+     # Keep the module importable when pattern_matcher is unavailable: the type
+     # annotations below reference these names at class-definition time.
+     Rule = TaintRule = Pattern = create_pattern = None
+     HAS_PATTERN_MATCHER = False
+
+
+ class SemgrepRuleLoader:
+     """Load and convert Semgrep YAML rules to AST patterns"""
+
+     def __init__(self, rules_dir: str = None):
+         if rules_dir is None:
+             rules_dir = os.path.join(os.path.dirname(__file__), 'rules')
+         self.rules_dir = Path(rules_dir)
+         self.loaded_rules = []
+         self.taint_rules = []  # Separate list for taint analysis rules
+
+         # Loader diagnostics
+         self.stats = {
+             'rules_loaded': 0,
+             'rules_skipped_no_patterns': 0,
+             'rules_skipped_taint': 0,
+             'patterns_skipped_multiline': 0,
+             'patterns_skipped_no_anchor': 0,
+         }
+
+     def load_all_rules(self, languages: List[str] = None) -> List[Rule]:
+         """Load all YAML rules for specified languages"""
+         if languages is None:
+             languages = ['python', 'javascript', 'typescript', 'java', 'go', 'ruby', 'php', 'c', 'rust', 'csharp', 'generic']
+
+         all_rules = []
+
+         # Load from flat YAML files first (main branch format: python.security.yaml, etc.)
+         all_rules.extend(self._load_flat_rules(languages))
+
+         # Load from main language directories (feature branch format: python/lang/security/...)
+         for language in languages:
+             lang_dir = self.rules_dir / language
+             if lang_dir.exists() and lang_dir.is_dir():
+                 all_rules.extend(self._load_language_rules(lang_dir, language))
+
+         # Load from third-party directory (all languages)
+         third_party_dir = self.rules_dir / 'third-party'
+         if third_party_dir.exists():
+             all_rules.extend(self._load_third_party_rules(third_party_dir))
+
+         self.loaded_rules = all_rules
+         return all_rules
+
+     def _load_flat_rules(self, languages: List[str]) -> List[Rule]:
+         """Load rules from flat YAML files (main branch format).
+
+         Handles files like:
+         - python.security.yaml
+         - javascript.security.yaml
+         - generic.secrets.yaml
+         - agent-attacks.security.yaml
+         - prompt-injection.security.yaml
+         """
+         rules = []
+
+         # Language-specific security rules
+         for lang in languages:
+             yaml_file = self.rules_dir / f'{lang}.security.yaml'
+             if yaml_file.exists():
+                 rules.extend(self._load_flat_yaml_file(yaml_file, lang))
+
+         # Generic rules that apply to all languages
+         generic_files = [
+             'generic.secrets.yaml',
+             'agent-attacks.security.yaml',
+             'prompt-injection.security.yaml',
+         ]
+         for filename in generic_files:
+             yaml_file = self.rules_dir / filename
+             if yaml_file.exists():
+                 rules.extend(self._load_flat_yaml_file(yaml_file, 'generic'))
+
+         return rules
+
+     def _load_flat_yaml_file(self, yaml_file: Path, default_language: str) -> List[Rule]:
+         """Load rules from a flat YAML file (main branch format).
+
+         These files have a different format than Semgrep rules - they have
+         'rules' as a list with 'id', 'patterns' (as regex strings), etc.
+         """
+         rules = []
+         try:
+             with open(yaml_file, 'r', encoding='utf-8') as f:
+                 data = yaml.safe_load(f)
+
+             if not data:
+                 return rules
+
+             # Handle main branch format: list of rules under 'rules' key
+             rule_list = data.get('rules', data) if isinstance(data, dict) else data
+             if not isinstance(rule_list, list):
+                 rule_list = [rule_list]
+
+             for rule_data in rule_list:
+                 if not isinstance(rule_data, dict):
+                     continue
+
+                 # Try Semgrep format first
+                 if 'pattern' in rule_data or 'patterns' in rule_data or 'pattern-either' in rule_data:
+                     converted = self._convert_semgrep_rule(rule_data, default_language)
+                     if converted:
+                         rules.append(converted)
+                     continue
+
+                 # Handle main branch format (regex patterns as strings)
+                 rule_id = rule_data.get('id', 'unknown')
+                 message = rule_data.get('message', '')
+                 severity = rule_data.get('severity', 'WARNING').lower()
+                 languages = rule_data.get('languages', [default_language])
+                 metadata = rule_data.get('metadata', {})
+
+                 # Get patterns (list of regex strings in main format)
+                 pattern_list = rule_data.get('patterns', [])
+                 if not pattern_list:
+                     continue
+
+                 # Convert regex patterns to Pattern objects
+                 patterns = []
+                 for p in pattern_list:
+                     if isinstance(p, str):
+                         patterns.append(Pattern(pattern_text=p, is_regex=True))
+
+                 if not patterns:
+                     continue
+
+                 self.stats['rules_loaded'] += 1
+
+                 # Map severity
+                 severity_map = {'error': 'error', 'warning': 'warning', 'info': 'info'}
+                 severity = severity_map.get(severity.lower(), 'warning')
+
+                 rules.append(Rule(
+                     id=rule_id,
+                     name=rule_data.get('name', rule_id.split('.')[-1].replace('-', ' ').title()),
+                     patterns=patterns,
+                     pattern_not=[],
+                     message=message,
+                     severity=severity,
+                     languages=languages,
+                     metadata=metadata
+                 ))
+
+         except Exception:
+             pass
+
+         return rules
+
+     def _load_third_party_rules(self, third_party_dir: Path) -> List[Rule]:
+         """Load all rules from third-party sources"""
+         rules = []
+
+         # Find all .yaml and .yml files recursively
+         for yaml_file in third_party_dir.rglob('*.yaml'):
+             rules.extend(self._load_yaml_file(yaml_file))
+         for yml_file in third_party_dir.rglob('*.yml'):
+             rules.extend(self._load_yaml_file(yml_file))
+
+         return rules
+
+     def _load_yaml_file(self, yaml_file: Path) -> List[Rule]:
+         """Load rules from a single YAML file"""
+         rules = []
+         try:
+             with open(yaml_file, 'r', encoding='utf-8') as f:
+                 data = yaml.safe_load(f)
+             if data and 'rules' in data:
+                 for rule_data in data['rules']:
+                     # Detect language from file path or rule
+                     language = self._detect_language(yaml_file, rule_data)
+                     converted_rule = self._convert_semgrep_rule(rule_data, language)
+                     if converted_rule:
+                         rules.append(converted_rule)
+         except Exception:
+             pass
+         return rules
+
+     def _detect_language(self, yaml_file: Path, rule_data: Dict[str, Any]) -> str:
+         """Detect language from file path or rule data"""
+         # Check rule data first
+         if 'languages' in rule_data and rule_data['languages']:
+             return rule_data['languages'][0]
+
+         # Detect from path
+         path_str = str(yaml_file).lower()
+         lang_map = {
+             'python': 'python', 'javascript': 'javascript', 'typescript': 'typescript',
+             'java': 'java', 'go': 'go', 'ruby': 'ruby', 'php': 'php', 'c/': 'c',
+             'rust': 'rust', 'csharp': 'csharp'
+         }
+         for key, lang in lang_map.items():
+             if key in path_str:
+                 return lang
+         return 'generic'
+
+     def _load_language_rules(self, lang_dir: Path, language: str) -> List[Rule]:
+         """Load all YAML files from a language directory"""
+         rules = []
+
+         # Find all .yaml files recursively (load everything, no filtering)
+         for yaml_file in lang_dir.rglob('*.yaml'):
+             try:
+                 with open(yaml_file, 'r', encoding='utf-8') as f:
+                     data = yaml.safe_load(f)
+                 if data and 'rules' in data:
+                     for rule_data in data['rules']:
+                         converted_rule = self._convert_semgrep_rule(rule_data, language)
+                         if converted_rule:
+                             rules.append(converted_rule)
+             except Exception:
+                 # Silently skip files that fail to load
+                 pass
+
+         return rules
+
+     def _convert_semgrep_rule(self, rule_data: Dict[str, Any], default_language: str) -> Optional[Rule]:
+         """Convert a Semgrep rule to our Rule format"""
+         if not HAS_PATTERN_MATCHER:
+             return None
+
+         # Check if this is a taint rule (mode: taint)
+         if rule_data.get('mode') == 'taint':
+             taint_rule = self._convert_taint_rule(rule_data, default_language)
+             if taint_rule:
+                 self.taint_rules.append(taint_rule)
+             # Return None - taint rules are stored separately
+             return None
+
+         rule_id = rule_data.get('id', 'unknown')
+         message = rule_data.get('message', '')
+         severity = rule_data.get('severity', 'WARNING').lower()
+
+         # Parse languages and map 'regex' to 'generic'
+         languages = rule_data.get('languages', [default_language])
+         if 'regex' in languages and 'generic' not in languages:
+             languages.append('generic')
+
+         metadata = rule_data.get('metadata', {})
+
+         # Convert Semgrep patterns to our AST patterns
+         patterns = self._extract_patterns(rule_data)
+         pattern_nots = self._extract_pattern_nots(rule_data)
+
+         if not patterns:
+             self.stats['rules_skipped_no_patterns'] += 1
+             return None
+
+         self.stats['rules_loaded'] += 1
+
+         # Map severity
+         severity_map = {
+             'error': 'error',
+             'warning': 'warning',
+             'info': 'info'
+         }
+         severity = severity_map.get(severity, 'warning')
+
+         return Rule(
+             id=rule_id,
+             name=rule_id.split('.')[-1].replace('-', ' ').title(),
+             patterns=patterns,
+             pattern_not=pattern_nots,
+             message=message,
+             severity=severity,
+             languages=languages,
+             metadata=metadata
+         )
+
+     def _convert_taint_rule(self, rule_data: Dict[str, Any], default_language: str) -> Optional[TaintRule]:
+         """Convert a Semgrep taint rule to our TaintRule format"""
+         rule_id = rule_data.get('id', 'unknown')
+         message = rule_data.get('message', '')
+         severity = rule_data.get('severity', 'ERROR').lower()
+         languages = rule_data.get('languages', [default_language])
+         metadata = rule_data.get('metadata', {})
+
+         # Extract source patterns
+         sources = self._extract_taint_patterns(rule_data.get('pattern-sources', []))
+
+         # Extract sink patterns
+         sinks = self._extract_taint_patterns(rule_data.get('pattern-sinks', []))
+
+         # Extract optional sanitizer patterns
+         sanitizers = self._extract_taint_patterns(rule_data.get('pattern-sanitizers', []))
+
+         if not sources or not sinks:
+             return None
+
+         # Map severity
+         severity_map = {'error': 'error', 'warning': 'warning', 'info': 'info'}
+         severity = severity_map.get(severity, 'error')
+
+         return TaintRule(
+             id=rule_id,
+             name=rule_id.split('.')[-1].replace('-', ' ').title(),
+             sources=sources,
+             sinks=sinks,
+             message=message,
+             severity=severity,
+             languages=languages,
+             metadata=metadata,
+             sanitizers=sanitizers
+         )
+
+     def _extract_taint_patterns(self, pattern_list: List[Any]) -> List[Pattern]:
+         """Extract patterns from taint source/sink definitions"""
+         patterns = []
+
+         for item in pattern_list:
+             if isinstance(item, dict):
+                 # Handle nested patterns structure
+                 if 'pattern' in item:
+                     patterns.append(create_pattern(item['pattern']))
+                 elif 'patterns' in item:
+                     # Recursively extract from patterns list
+                     patterns.extend(self._extract_patterns_deep(item['patterns']))
+                 elif 'pattern-either' in item:
+                     # Multiple alternative patterns
+                     for either in item['pattern-either']:
+                         if isinstance(either, dict) and 'pattern' in either:
+                             patterns.append(create_pattern(either['pattern']))
+                         elif isinstance(either, str):
+                             patterns.append(create_pattern(either))
+             elif isinstance(item, str):
+                 patterns.append(create_pattern(item))
+
+         return patterns
+
+     def _extract_patterns_deep(self, patterns_list: List[Any]) -> List[Pattern]:
+         """Recursively extract patterns from nested structures"""
+         patterns = []
+
+         for item in patterns_list:
+             if isinstance(item, dict):
+                 if 'pattern' in item:
+                     patterns.append(create_pattern(item['pattern']))
+                 elif 'pattern-either' in item:
+                     for either in item['pattern-either']:
+                         if isinstance(either, dict) and 'pattern' in either:
+                             patterns.append(create_pattern(either['pattern']))
+                         elif isinstance(either, str):
+                             patterns.append(create_pattern(either))
+
+         return patterns
+
+     def _is_supported_pattern(self, pattern_text: str) -> bool:
+         """Check if pattern is supported by the AST matcher.
+
+         Filters out:
+         - Multi-line patterns (require statement sequence matching)
+         - Patterns without concrete function/method anchors
+         - Pure ellipsis patterns
+         """
+         if not pattern_text or not pattern_text.strip():
+             return False
+
+         # Multi-line patterns are not supported
+         if '\n' in pattern_text:
+             self.stats['patterns_skipped_multiline'] += 1
+             return False
+
+         # Pure ellipsis is not a useful pattern
+         if pattern_text.strip() == '...':
+             return False
+
+         # Must have at least one concrete identifier (not just metavariables)
+         import re
+         tokens = re.findall(r'[\w\.]+|\$[A-Z_][A-Z0-9_]*', pattern_text)
+         has_concrete = False
+         for token in tokens:
+             # Check if token is a metavariable
+             if re.match(r'^\$[A-Z_][A-Z0-9_]*$', token):
+                 continue
+             # Check if token contains a concrete identifier (like function name)
+             if re.match(r'^[a-z_][a-z0-9_\.]*$', token, re.IGNORECASE):
+                 has_concrete = True
+                 break
+
+         if not has_concrete:
+             self.stats['patterns_skipped_no_anchor'] += 1
+
+         return has_concrete
+
+     def get_stats(self) -> Dict[str, int]:
+         """Get loader statistics for diagnostics"""
+         return self.stats.copy()
+
+     def _extract_patterns(self, rule_data: Dict[str, Any]) -> List[Pattern]:
+         """Extract and convert Semgrep patterns to our format"""
+         patterns = []
+
+         # Handle simple pattern field
+         if 'pattern' in rule_data:
+             pattern_str = rule_data['pattern']
+             if self._is_supported_pattern(pattern_str):
+                 patterns.append(create_pattern(pattern_str))
+
+         # Handle regex pattern field
+         if 'pattern-regex' in rule_data:
+             pattern_str = rule_data['pattern-regex']
+             # Regex patterns are always supported (parsed by re module)
+             patterns.append(Pattern(pattern_text=pattern_str, is_regex=True))
+
+         # Handle patterns list
+         if 'patterns' in rule_data:
+             for pattern_item in rule_data['patterns']:
+                 if isinstance(pattern_item, dict):
+                     # Handle pattern-either
+                     if 'pattern-either' in pattern_item:
+                         for either_pattern in pattern_item['pattern-either']:
+                             if isinstance(either_pattern, dict) and 'pattern' in either_pattern:
+                                 p = either_pattern['pattern']
+                                 if self._is_supported_pattern(p):
+                                     patterns.append(create_pattern(p))
+                             elif isinstance(either_pattern, dict) and 'pattern-regex' in either_pattern:
+                                 p = either_pattern['pattern-regex']
+                                 patterns.append(Pattern(pattern_text=p, is_regex=True))
+                             elif isinstance(either_pattern, str):
+                                 if self._is_supported_pattern(either_pattern):
+                                     patterns.append(create_pattern(either_pattern))
+
+                     # Handle simple pattern in list
+                     elif 'pattern' in pattern_item:
+                         p = pattern_item['pattern']
+                         if self._is_supported_pattern(p):
+                             patterns.append(create_pattern(p))
+
+                     # Handle regex pattern in list
+                     elif 'pattern-regex' in pattern_item:
+                         p = pattern_item['pattern-regex']
+                         patterns.append(Pattern(pattern_text=p, is_regex=True))
+
+                 elif isinstance(pattern_item, str):
+                     if self._is_supported_pattern(pattern_item):
+                         patterns.append(create_pattern(pattern_item))
+
+         # Handle pattern-either at top level
+         if 'pattern-either' in rule_data:
+             for either_pattern in rule_data['pattern-either']:
+                 if isinstance(either_pattern, dict) and 'pattern' in either_pattern:
+                     p = either_pattern['pattern']
+                     if self._is_supported_pattern(p):
+                         patterns.append(create_pattern(p))
+                 elif isinstance(either_pattern, dict) and 'pattern-regex' in either_pattern:
+                     p = either_pattern['pattern-regex']
+                     patterns.append(Pattern(pattern_text=p, is_regex=True))
+                 elif isinstance(either_pattern, str):
+                     if self._is_supported_pattern(either_pattern):
+                         patterns.append(create_pattern(either_pattern))
+
+         return patterns
+
+     def _extract_pattern_nots(self, rule_data: Dict[str, Any]) -> List[Pattern]:
+         """Extract pattern-not negation patterns from a rule.
+
+         These are used to exclude false positives from matches.
+         """
+         pattern_nots = []
+
+         # Handle pattern-not at top level
+         if 'pattern-not' in rule_data:
+             p = rule_data['pattern-not']
+             if isinstance(p, str) and self._is_supported_pattern(p):
+                 pattern_nots.append(create_pattern(p))
+
+         # Handle pattern-not-regex at top level
+         if 'pattern-not-regex' in rule_data:
+             p = rule_data['pattern-not-regex']
+             pattern_nots.append(Pattern(pattern_text=p, is_regex=True))
+
+         # Handle pattern-not in patterns list
+         if 'patterns' in rule_data:
+             for pattern_item in rule_data['patterns']:
+                 if isinstance(pattern_item, dict):
+                     if 'pattern-not' in pattern_item:
+                         p = pattern_item['pattern-not']
+                         if isinstance(p, str) and self._is_supported_pattern(p):
+                             pattern_nots.append(create_pattern(p))
+                     if 'pattern-not-regex' in pattern_item:
+                         p = pattern_item['pattern-not-regex']
+                         pattern_nots.append(Pattern(pattern_text=p, is_regex=True))
+
+         return pattern_nots
+
+     def get_rules_by_language(self, language: str) -> List[Rule]:
+         """Get all rules for a specific language"""
+         return [rule for rule in self.loaded_rules if language in rule.languages]
+
+     def get_rules_by_severity(self, severity: str) -> List[Rule]:
+         """Get all rules of a specific severity"""
+         return [rule for rule in self.loaded_rules if rule.severity == severity]
+
+     def get_taint_rules(self) -> List[TaintRule]:
+         """Get all loaded taint analysis rules"""
+         return self.taint_rules
+
+     def get_taint_rules_by_language(self, language: str) -> List[TaintRule]:
+         """Get taint rules for a specific language"""
+         return [rule for rule in self.taint_rules if language in rule.languages]
+
+     def get_rule_stats(self) -> Dict[str, Any]:
+         """Get statistics about loaded rules"""
+         stats = {
+             'total': len(self.loaded_rules),
+             'taint_rules': len(self.taint_rules),
+             'by_language': {},
+             'by_severity': {'error': 0, 'warning': 0, 'info': 0}
+         }
+
+         for rule in self.loaded_rules:
+             # Count by language
+             for lang in rule.languages:
+                 stats['by_language'][lang] = stats['by_language'].get(lang, 0) + 1
+
+             # Count by severity
+             stats['by_severity'][rule.severity] = stats['by_severity'].get(rule.severity, 0) + 1
+
+         return stats
+
+
+ # Global loader instance
+ _loader = None
+
+
+ def get_loader() -> SemgrepRuleLoader:
+     """Get or create the global rule loader"""
+     global _loader
+     if _loader is None:
+         _loader = SemgrepRuleLoader()
+     return _loader
+
+
+ def load_rules(languages: List[str] = None) -> List[Rule]:
+     """Load all rules for specified languages"""
+     loader = get_loader()
+     return loader.load_all_rules(languages)
+
+
+ if __name__ == '__main__':
+     # Test the loader
+     loader = SemgrepRuleLoader()
+     rules = loader.load_all_rules(['python'])
+     stats = loader.get_rule_stats()
+
+     print(f"Loaded {stats['total']} rules")
+     print(f"By language: {stats['by_language']}")
+     print(f"By severity: {stats['by_severity']}")
+
+     # Show a sample rule
+     if rules:
+         sample = rules[0]
+         print(f"\nSample rule: {sample.id}")
+         print(f"Message: {sample.message[:100]}...")
+         print(f"Patterns: {len(sample.patterns)}")