caidongyun 6.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1086 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ v6.0.0 Scanner - 集成 Gitleaks + Semgrep AI + Bandit
4
+
5
+ 检测流程:
6
+ 1. PatternEngine (Layer 1) - 快速模式匹配 (+ Gitleaks 220 条)
7
+ 2. RuleEngine (Layer 2) - 深度规则匹配 (+ Semgrep AI 31 条 + Bandit 10 条)
8
+ 3. LLMEngine (Layer 3, 可选) - 语义分析 + 上下文理解
9
+
10
+ 设计原则:
11
+ - 串行执行,确保每层都能获取前层信息
12
+ - 准确性优先于性能
13
+ - 支持单文件和完整技能文件夹扫描
14
+ - LLM 可选,获取历史信息和完整上下文
15
+ - 自动加载外部规则(Gitleaks/Semgrep/Bandit)
16
+ """
17
+
18
+ import sys
19
+ import os
20
+ import re
21
+ import json
22
+ import time
23
+ from pathlib import Path
24
+ from typing import List, Dict, Optional, Tuple, Set
25
+ from dataclasses import dataclass, asdict
26
+ from concurrent.futures import ThreadPoolExecutor, as_completed
27
+
28
+ # 注意:PatternEngine/RuleEngine/LLMEngine 在下方内联定义
29
+
30
+
31
# ========== Version info ==========
# Version string printed during Scanner start-up and in the CLI description.
VERSION = "v6.0.0"
# Package / tool name.
SCANNER_NAME = "agent-security-skill-scanner"
34
+
35
+
36
+ # ========== 扫描结果 ==========
37
@dataclass
class ScanResult:
    """Outcome of scanning one file or one skill folder.

    Field order is part of the interface: instances may be constructed
    positionally, so fields must not be reordered.
    """

    # --- identity ---
    file_path: str
    file_type: str  # 'single_file' or 'skill_folder'

    # --- risk assessment ---
    is_malicious: bool
    risk_level: str  # SAFE/LOW/MEDIUM/HIGH/CRITICAL
    score: int  # 0-100
    confidence: float  # 0.0-1.0

    # --- attack information ---
    attack_types: List[str]
    threat_summary: str

    # --- raw per-layer results ---
    layer1_pattern: Optional[Dict]  # PatternEngine result
    layer2_rule: Optional[Dict]  # RuleEngine result
    layer3_llm: Optional[Dict]  # LLMEngine result

    # --- flattened match details ---
    matched_patterns: List[Dict]
    matched_rules: List[Dict]

    # --- performance ---
    scan_time_ms: float

    # --- context handed to the LLM layer (history, skill description, ...) ---
    context: Optional[Dict]

    def to_dict(self) -> Dict:
        """Return a recursively converted plain-dict copy of this result."""
        as_plain_dict = asdict(self)
        return as_plain_dict
71
+
72
+
73
+ # ========== Layer 1: Pattern Engine ==========
74
class PatternEngine:
    """Layer 1: fast regex screening for known attack signatures.

    Responsibilities:
    - match content against a table of pre-compiled regular expressions
    - report every matching pattern together with its weight
    - hand the set of candidate attack types to Layer 2
    """

    # Signature table: (attack_type, regex, weight). Order matters: when the
    # same regex string appears twice, only the first entry is evaluated.
    ATTACK_PATTERNS = [
        # High severity (weight 50-60)
        ("reverse_shell", r'bash\s+-i', 55),
        ("reverse_shell", r'/dev/tcp/', 60),
        ("reverse_shell", r'nc\s+-e', 60),
        ("supply_chain_attack", r'curl\s+.*\|\s*bash', 60),
        ("false_prone", r'/dev/tcp/', 60),

        # Medium severity (weight 35-49)
        ("credential_theft", r'\.ssh/', 40),
        ("credential_theft", r'\.aws/', 40),
        ("prompt_injection", r'prompt[_-]inject', 40),
        ("prompt_injection", r'ignore\s+previous', 45),
        ("data_exfiltration", r'exfiltrat', 40),
        ("evasion", r'marshal\s*\.\s*(dumps|loads)', 40),
        ("resource_exhaustion", r'os\s*\.\s*fork\s*\(', 45),
        ("false_prone", r'attacker[-_]?c2', 50),
        ("false_prone", r'tar.*\.ssh', 50),
        ("false_prone", r'curl.*\|.*bash', 50),

        # Low severity (weight 10-34)
        ("credential_theft", r'credentials', 35),
        ("data_exfiltration", r'fetch\s*\(', 25),
        ("obfuscation", r'base64', 30),
        ("obfuscation", r'base64\.b64decode', 50),
        ("obfuscation", r'base64\.b64encode', 45),
        ("obfuscation", r'zlib\.compress', 50),
        ("obfuscation", r'zlib\.decompress', 50),
        ("obfuscation", r'exec.*base64', 60),
        ("persistence", r'systemd', 35),

        # Credential Theft
        ("credential_theft", r'\.netrc', 50),
        ("credential_theft", r'/etc/shadow', 55),
        ("credential_theft", r'/etc/passwd', 50),

        # Resource Exhaustion
        ("resource_exhaustion", r'subprocess\.Popen', 50),
        ("resource_exhaustion", r'os\.fork', 50),

        # Privilege Escalation
        ("privilege_escalation", r'sudoers', 60),
        ("privilege_escalation", r'NOPASSWD', 60),
        ("privilege_escalation", r'chmod.*4755', 55),
    ]

    def __init__(self):
        """Pre-compile every signature (case-insensitive).

        ``self.compiled`` holds ``(attack_type, compiled_regex, regex_source,
        weight)`` tuples. Signatures that fail to compile are reported on
        stdout and skipped.
        """
        self.compiled = []
        for attack_type, pattern, weight in self.ATTACK_PATTERNS:
            try:
                regex = re.compile(pattern, re.IGNORECASE)
            except re.error as e:
                print(f"⚠️ Pattern 编译失败:{pattern} - {e}")
                continue
            self.compiled.append((attack_type, regex, pattern, weight))

        print(f"✅ PatternEngine: {len(self.compiled)} patterns")

    def scan(self, content: str, file_path: str = "") -> Dict:
        """Run every compiled signature against ``content``.

        Args:
            content: Text to inspect.
            file_path: Path of the scanned file; accepted for interface
                compatibility and not used by the matching itself.

        Returns:
            A dict with:
              'matches':      list of (attack_type, regex_source, weight)
              'max_weight':   highest weight among matches (0 if none)
              'attack_types': list of distinct attack types, in the order
                              they were first hit (deterministic)
              'hit_count':    number of matches
              'layer':        the literal 'PatternEngine'
        """
        matches = []
        seen_patterns = set()
        # A dict is used as an insertion-ordered set so the reported order is
        # stable across runs (a plain set of str is subject to hash
        # randomisation).
        attack_types = {}

        for attack_type, compiled, pattern, weight in self.compiled:
            # Each distinct regex source is evaluated at most once.
            if pattern in seen_patterns:
                continue

            if compiled.search(content):
                matches.append((attack_type, pattern, weight))
                seen_patterns.add(pattern)
                attack_types[attack_type] = None

        return {
            'matches': matches,
            'max_weight': max((w for _, _, w in matches), default=0),
            'attack_types': list(attack_types),
            'hit_count': len(matches),
            'layer': 'PatternEngine',
        }
187
+
188
+
189
+ # ========== Layer 2: Rule Engine ==========
190
class RuleEngine:
    """Layer 2: multi-pattern rule matching.

    Responsibilities:
    - evaluate rules made of several regexes with a minimum-match threshold
    - narrow evaluation to the categories Layer 1 flagged, when available
    - report a confidence score for each matching rule
    """

    # Keywords used to guess a category for rules whose own category is
    # 'unknown' or 'false_prone_generated'. They are compared as plain
    # substrings (case-insensitive), not as regular expressions.
    CATEGORY_KEYWORDS = {
        'credential_theft': ['shadow', 'passwd', 'netrc', '.aws/', '.ssh/', 'credential', 'password', 'secret'],
        'privilege_escalation': ['sudo', 'sudoers', 'NOPASSWD', 'chmod', '4755', 'SUID', 'setuid'],
        'resource_exhaustion': ['fork', 'bomb', 'exhaust', 'while.*true', 'subprocess'],
        'persistence': ['cron', 'systemd', '.bashrc', '.profile', 'startup'],
        'code_execution': ['exec', 'eval', 'compile', 'subprocess', 'os.system'],
    }

    def __init__(self, rules_file: Optional[Path] = None):
        """Load rules from ``rules_file`` if it exists, else the built-ins.

        Args:
            rules_file: Optional path to a JSON rule file.
        """
        self.rules_file = rules_file
        self.rules = []
        self.compiled = []

        if rules_file and rules_file.exists():
            self.load_rules(rules_file)
        else:
            self.load_builtin_rules()

        print(f"✅ RuleEngine: {len(self.compiled)} rules")

    def _infer_category(self, rule: Dict, content: str) -> str:
        """Return the rule's category, guessing one for uncategorised rules.

        Rules whose category is anything other than 'unknown' or
        'false_prone_generated' keep it unchanged. Otherwise the first
        CATEGORY_KEYWORDS entry with a keyword found in the rule's patterns,
        its id, or the scanned content wins.
        """
        category = rule.get('category', 'unknown')

        if category not in ['unknown', 'false_prone_generated']:
            return category

        patterns_str = str(rule.get('patterns', [])).lower()
        rule_id = rule.get('id', '').lower()
        content_lower = content.lower()

        for inferred_cat, keywords in self.CATEGORY_KEYWORDS.items():
            for kw in keywords:
                needle = kw.lower()
                if needle in patterns_str or needle in rule_id or needle in content_lower:
                    return inferred_cat

        return category

    def load_builtin_rules(self):
        """Install and compile the built-in high-confidence rule set."""
        builtin_rules = [
            {
                'id': 'CRED-001',
                'name': 'SSH 密钥窃取',
                'category': 'credential_theft',
                'patterns': [r'\.ssh/', r'id_rsa', r'id_ed25519'],
                'min_matches': 2,
                'confidence': 95
            },
            {
                'id': 'CRED-002',
                'name': 'AWS 凭证窃取',
                'category': 'credential_theft',
                'patterns': [r'\.aws/', r'AWS_SECRET', r'AWS_ACCESS'],
                'min_matches': 2,
                'confidence': 95
            },
            {
                'id': 'EXFIL-001',
                'name': '数据外传',
                'category': 'data_exfiltration',
                'patterns': [r'curl\s+.*\|.*bash', r'wget.*\|.*sh'],
                'min_matches': 1,
                'confidence': 95
            },
            {
                'id': 'EVASION-001',
                'name': '代码混淆执行',
                'category': 'evasion',
                'patterns': [r'base64', r'eval\s*\(', r'exec\s*\('],
                'min_matches': 2,
                'confidence': 90
            },
            {
                'id': 'PERSIST-001',
                'name': '持久化后门',
                'category': 'persistence',
                'patterns': [r'crontab', r'systemd', r'\.service'],
                'min_matches': 2,
                'confidence': 90
            },
        ]

        self.rules = builtin_rules
        self._compile_rules()

    def load_rules(self, rules_file: Path):
        """Load rules from a JSON file shaped like ``{"rules": [...]}``.

        Any failure (unreadable file, invalid JSON, unexpected structure) is
        reported on stdout and the built-in rules are used instead.
        """
        try:
            with open(rules_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self.rules = data.get('rules', [])
            self._compile_rules()
        except Exception as e:
            # Best-effort boundary: never let a bad rule file stop the scanner.
            print(f"⚠️ 规则加载失败:{e}")
            self.load_builtin_rules()

    def _compile_rules(self):
        """Compile each rule's regexes (case-insensitive) into ``_compiled``.

        The compiled list is stored on the rule dict itself under the
        '_compiled' key. Invalid regexes are reported and skipped, matching
        PatternEngine's handling.
        """
        self.compiled = []

        for rule in self.rules:
            compiled_patterns = []
            for pattern in rule.get('patterns', []):
                try:
                    compiled_patterns.append(re.compile(pattern, re.IGNORECASE))
                except re.error as e:
                    print(f"⚠️ Pattern 编译失败:{pattern} - {e}")

            rule['_compiled'] = compiled_patterns
            self.compiled.append(rule)

    def scan(self, content: str, layer1_result: Optional[Dict] = None) -> Dict:
        """Evaluate the rules against ``content``.

        Args:
            content: Text to inspect.
            layer1_result: Optional Layer 1 result. When it carries a
                non-empty 'attack_types', only rules whose (possibly
                inferred) category is in that list are evaluated; all other
                rules are skipped entirely.

        Returns:
            A dict with:
              'matches':        list of (rule_id, category, confidence, name)
              'max_confidence': highest confidence among matches (0 if none)
              'attack_types':   distinct categories in first-hit order
              'hit_count':      number of matching rules
              'score':          same value as 'max_confidence'
              'risk_level':     SAFE/LOW/MEDIUM/HIGH/CRITICAL
              'confidence':     max_confidence / 100.0
              'layer':          the literal 'RuleEngine'
        """
        matches = []
        # Insertion-ordered "set" so the output order is reproducible.
        attack_types = {}

        if layer1_result and layer1_result.get('attack_types'):
            priority_types = set(layer1_result['attack_types'])
        else:
            priority_types = None

        for rule in self.compiled:
            rule_category = self._infer_category(rule, content)

            # NOTE(review): this is a hard filter, not a prioritisation -
            # rules outside the Layer 1 categories are never evaluated.
            if priority_types and rule_category not in priority_types:
                continue

            match_count = sum(
                1 for compiled in rule.get('_compiled', [])
                if compiled.search(content)
            )

            if match_count >= rule.get('min_matches', 1):
                matches.append((
                    rule.get('id', 'UNKNOWN'),
                    rule_category,
                    rule.get('confidence', 50),
                    rule.get('name', ''),
                ))
                attack_types[rule_category] = None

        max_confidence = max((c for _, _, c, _ in matches), default=0)

        if max_confidence >= 80:
            risk_level = 'CRITICAL'
        elif max_confidence >= 60:
            risk_level = 'HIGH'
        elif max_confidence >= 40:
            risk_level = 'MEDIUM'
        elif max_confidence >= 20:
            risk_level = 'LOW'
        else:
            risk_level = 'SAFE'

        return {
            'matches': matches,
            'max_confidence': max_confidence,
            'attack_types': list(attack_types),
            'hit_count': len(matches),
            'score': max_confidence,
            'risk_level': risk_level,
            'confidence': max_confidence / 100.0,
            'layer': 'RuleEngine',
        }
391
+
392
+
393
+ # ========== Layer 3: LLM Engine (可选) ==========
394
class LLMEngine:
    """Layer 3 (optional): semantic analysis through an LLM.

    Intended to judge intent from code semantics plus context (skill
    description, history) and give a final confirmation. The API call is not
    implemented yet: an enabled engine returns a fixed "not malicious"
    placeholder. The layer is active only when an API config is supplied.
    """

    # Prompt skeleton filled in by _build_prompt via str.format; literal
    # braces of the JSON example are therefore doubled.
    _PROMPT_TEMPLATE = """你是一个 AI 安全专家。请分析以下代码是否存在恶意行为。

## 代码内容
```
{content}
```

## Pattern 检测结果
- 命中数:{layer1_hits}
- 攻击类型:{layer1_types}
- 最高权重:{layer1_weight}

## Rule 检测结果
- 命中数:{layer2_hits}
- 攻击类型:{layer2_types}
- 最高置信度:{layer2_confidence}

## 上下文信息
{context}

## 任务
1. 判断代码是否恶意
2. 说明判断理由
3. 给出置信度 (0.0-1.0)
4. 总结威胁类型

请按以下 JSON 格式回复:
{{
"is_malicious": true/false,
"confidence": 0.0-1.0,
"reasoning": "...",
"threat_summary": "..."
}}
"""

    def __init__(self, api_config: Optional[Dict] = None):
        """Remember the API config; the engine is enabled iff one is given."""
        self.api_config = api_config
        self.enabled = api_config is not None

        if not self.enabled:
            print("ℹ️ LLMEngine: 未启用(跳过 Layer 3)")
        else:
            print(f"✅ LLMEngine: 已启用 ({api_config.get('provider', 'unknown')})")

    def scan(self, content: str, layer1_result: Dict, layer2_result: Dict,
             context: Optional[Dict] = None) -> Dict:
        """Run the (currently stubbed) LLM analysis.

        Args:
            content: Text of the scanned file.
            layer1_result: Result dict from Layer 1.
            layer2_result: Result dict from Layer 2.
            context: Optional extra context (skill description, history, ...).

        Returns:
            ``{'enabled': False, 'reason': ...}`` when the engine is disabled;
            otherwise a placeholder verdict with 'enabled', 'is_malicious',
            'confidence', 'reasoning', 'threat_summary' and 'layer'.
        """
        if self.enabled:
            # TODO: call the configured LLM API here, roughly:
            #   prompt = self._build_prompt(content, layer1_result, layer2_result, context)
            #   response = call_llm_api(prompt, self.api_config)
            #   return self._parse_response(response)
            placeholder = {
                'enabled': True,
                'is_malicious': False,
                'confidence': 0.0,
                'reasoning': 'LLM analysis not implemented yet',
                'threat_summary': '',
                'layer': 'LLMEngine'
            }
            return placeholder

        return {
            'enabled': False,
            'reason': 'LLM not configured'
        }

    def _build_prompt(self, content: str, layer1: Dict, layer2: Dict,
                      context: Optional[Dict]) -> str:
        """Render the analysis prompt from the file content and layer results.

        Only the first 5000 characters of ``content`` are embedded. A falsy
        ``context`` is rendered as '无'; otherwise it is JSON-encoded.
        """
        if context:
            context_text = json.dumps(context, ensure_ascii=False)
        else:
            context_text = '无'

        fields = {
            'content': content[:5000],  # cap prompt size
            'layer1_hits': layer1.get('hit_count', 0),
            'layer1_types': ', '.join(layer1.get('attack_types', [])),
            'layer1_weight': layer1.get('max_weight', 0),
            'layer2_hits': layer2.get('hit_count', 0),
            'layer2_types': ', '.join(layer2.get('attack_types', [])),
            'layer2_confidence': layer2.get('max_confidence', 0),
            'context': context_text,
        }
        return self._PROMPT_TEMPLATE.format(**fields)
504
+
505
+
506
+ # ========== 主 Scanner ==========
507
class Scanner:
    """Top-level scanner that runs the three detection layers in sequence.

    Pipeline (each layer receives the output of the previous ones):
        1. PatternEngine (Layer 1) - fast regex matching
        2. RuleEngine (Layer 2)    - multi-pattern rule matching
        3. LLMEngine (Layer 3)     - optional semantic analysis

    Works on a single file or on every file found in a skill folder.
    """

    # File suffixes treated as binary; such files are never scanned.
    _BINARY_SUFFIXES = frozenset(
        {'.dll', '.so', '.exe', '.bin', '.dat', '.pyc', '.pyo'}
    )
    # Directory names that recursive collection does not descend into.
    _IGNORED_DIRS = frozenset(
        {'.git', '.svn', '__pycache__', 'node_modules',
         '.DS_Store', 'Thumbs.db', 'venv', '.venv', 'env', '.env'}
    )
    # Hard upper bound for the recursion depth of folder scans.
    _DEPTH_CAP = 20

    def __init__(self, rules_file: Optional[Path] = None,
                 llm_config: Optional[Dict] = None):
        """Create the three engines and load the optional external rules.

        Args:
            rules_file: Optional JSON rule file for the RuleEngine.
            llm_config: Optional LLM API config; Layer 3 exists only when
                this is truthy.
        """
        self.version = VERSION

        print(f"🔧 初始化 Scanner {VERSION}...")

        self.layer1 = PatternEngine()
        self.layer2 = RuleEngine(rules_file)
        self.layer3 = LLMEngine(llm_config) if llm_config else None

        self._load_external_rules()

        self.stats = {
            'files_scanned': 0,
            'threats_found': 0,
            'layer1_hits': 0,
            'layer2_hits': 0,
            'layer3_enabled': self.layer3 is not None
        }

        print(f"✅ Scanner 初始化完成")

    # ------------------------------------------------------------------
    # External rule loading
    # ------------------------------------------------------------------
    def _load_external_rules(self):
        """Load the optional Gitleaks, Semgrep AI and Bandit rule files.

        Files are looked up in ``<three levels above this file>/rules``.
        Missing files are skipped silently; a file that fails to load is
        reported on stdout and does not stop the scanner.
        """
        # NOTE(review): an earlier comment in this file spoke of
        # "parent.parent" while the code climbs three parents - confirm the
        # intended package layout.
        rules_dir = Path(__file__).parent.parent.parent / 'rules'

        self._load_gitleaks_patterns(rules_dir / 'gitleaks_patterns.json')
        self._load_layer2_file(
            rules_dir / 'semgrep_ai_rules.json', 'Semgrep AI', self._semgrep_to_rule
        )
        self._load_layer2_file(
            rules_dir / 'bandit_rules.json', 'Bandit', self._bandit_to_rule
        )

    @staticmethod
    def _read_json_entries(path: Path, key: str) -> List:
        """Read a JSON file that is either a list or ``{key: [...]}``."""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            return data.get(key, [])
        return []

    def _load_gitleaks_patterns(self, path: Path) -> None:
        """Append Gitleaks regexes to Layer 1; invalid entries are skipped."""
        if not path.exists():
            return
        try:
            loaded = 0
            for entry in self._read_json_entries(path, 'patterns'):
                if not isinstance(entry, dict):
                    continue
                regex_src = entry.get('pattern', '')
                if not regex_src:
                    continue
                try:
                    compiled = re.compile(regex_src, re.IGNORECASE)
                except (re.error, TypeError):
                    continue  # skip invalid regex
                self.layer1.compiled.append((
                    entry.get('attack_type', 'credential_theft'),
                    compiled,
                    regex_src,
                    entry.get('weight', 40),
                ))
                loaded += 1
            # Report what was actually installed, not the raw entry count.
            print(f"✅ PatternEngine: 加载 {loaded} 条 Gitleaks 规则")
        except Exception as e:
            # Best-effort boundary: optional rules must not break start-up.
            print(f"⚠️ 加载 Gitleaks 规则失败:{e}")

    def _load_layer2_file(self, path: Path, label: str, convert) -> None:
        """Load one external rule file into Layer 2.

        Args:
            path: JSON file to read; skipped when it does not exist.
            label: Human-readable source name used in the status messages.
            convert: Callable turning one raw JSON entry into a Layer 2 rule
                dict, or None when the entry has no usable pattern.
        """
        if not path.exists():
            return
        try:
            loaded = 0
            for raw in self._read_json_entries(path, 'rules'):
                if not isinstance(raw, dict):
                    continue
                rule = convert(raw)
                if rule is None:
                    continue
                self._register_layer2_rule(rule)
                loaded += 1
            print(f"✅ RuleEngine: 加载 {loaded} 条 {label} 规则")
        except Exception as e:
            # Best-effort boundary: optional rules must not break start-up.
            print(f"⚠️ 加载 {label} 规则失败:{e}")

    def _register_layer2_rule(self, rule: Dict) -> None:
        """Add ``rule`` to Layer 2, replacing an entry with the same id.

        ``layer2.compiled`` is a list of rule dicts, so it cannot be indexed
        by rule id; a rule with an already-present id overwrites the older
        entry in place, otherwise the rule is appended.
        """
        compiled_rules = self.layer2.compiled
        for index, existing in enumerate(compiled_rules):
            if existing.get('id') == rule['id']:
                compiled_rules[index] = rule
                return
        compiled_rules.append(rule)

    @staticmethod
    def _make_layer2_rule(raw: Dict, *, rule_id: str, category: str,
                          confidence, severity: str, description: str,
                          source: str) -> Optional[Dict]:
        """Build a rule dict in the shape RuleEngine.scan consumes.

        Only the first pattern that compiles is used. Returns None when the
        entry has no pattern that compiles.
        """
        for pattern in raw.get('patterns', []):
            try:
                compiled = re.compile(pattern, re.IGNORECASE)
            except (re.error, TypeError):
                continue
            return {
                'id': rule_id,
                'name': description,
                'category': category,
                'patterns': [pattern],
                'min_matches': 1,
                'confidence': confidence,
                'severity': severity,
                'description': description,
                'source': source,
                'rule': raw,
                '_compiled': [compiled],
            }
        return None

    @classmethod
    def _semgrep_to_rule(cls, raw: Dict) -> Optional[Dict]:
        """Convert one Semgrep AI JSON entry into a Layer 2 rule."""
        raw_id = raw.get('source', raw.get('id', 'SEMGREP-UNKNOWN'))
        return cls._make_layer2_rule(
            raw,
            rule_id=str(raw_id).replace('-', '_').upper(),
            category=raw.get('category', 'credential_theft'),
            confidence=raw.get('confidence', raw.get('weight', 60)),
            severity=raw.get('severity', 'medium'),
            description=f'Semgrep AI: {raw.get("source", "")}',
            source='semgrep',
        )

    @classmethod
    def _bandit_to_rule(cls, raw: Dict) -> Optional[Dict]:
        """Convert one Bandit JSON entry into a Layer 2 rule."""
        return cls._make_layer2_rule(
            raw,
            rule_id=raw.get('id', 'BANDIT-UNKNOWN'),
            category=raw.get('category', 'arbitrary_execution'),
            confidence=raw.get('confidence', 70),
            severity=raw.get('severity', 'MEDIUM'),
            description=raw.get('description', ''),
            source='bandit',
        )

    # ------------------------------------------------------------------
    # Scanning
    # ------------------------------------------------------------------
    def scan_file(self, file_path: Path, context: Optional[Dict] = None) -> "ScanResult":
        """Scan one file through all layers, in order.

        Args:
            file_path: File to scan; read as UTF-8 with undecodable bytes
                dropped.
            context: Optional context passed to Layer 3 and stored on the
                result.

        Returns:
            A ScanResult; a result with file_type 'error' if the file cannot
            be read.
        """
        start_time = time.time()

        try:
            content = file_path.read_text(encoding='utf-8', errors='ignore')
        except Exception as e:
            return self._create_error_result(str(file_path), str(e))

        # Layer 1: pattern scan
        layer1_result = self.layer1.scan(content, str(file_path))
        if layer1_result['hit_count'] > 0:
            self.stats['layer1_hits'] += 1

        # Layer 2: rule scan, guided by the Layer 1 result
        layer2_result = self.layer2.scan(content, layer1_result)
        if layer2_result['hit_count'] > 0:
            self.stats['layer2_hits'] += 1

        # Layer 3: LLM scan, only when configured
        layer3_result = None
        if self.layer3:
            layer3_result = self.layer3.scan(
                content, layer1_result, layer2_result, context
            )

        assessment = self._assess(layer1_result, layer2_result, layer3_result)

        scan_time = (time.time() - start_time) * 1000

        result = ScanResult(
            file_path=str(file_path),
            file_type='single_file',
            is_malicious=assessment['is_malicious'],
            risk_level=assessment['risk_level'],
            score=assessment['score'],
            confidence=assessment['confidence'],
            attack_types=assessment['attack_types'],
            threat_summary=assessment.get('threat_summary', ''),
            layer1_pattern=layer1_result,
            layer2_rule=layer2_result,
            layer3_llm=layer3_result,
            matched_patterns=[
                {'type': t, 'pattern': p, 'weight': w}
                for t, p, w in layer1_result.get('matches', [])
            ],
            matched_rules=[
                {'id': i, 'category': c, 'confidence': conf, 'name': n}
                for i, c, conf, n in layer2_result.get('matches', [])
            ],
            scan_time_ms=scan_time,
            context=context
        )

        self.stats['files_scanned'] += 1
        if result.is_malicious:
            self.stats['threats_found'] += 1

        return result

    def scan_skill_folder(self, skill_folder: Path,
                          context: Optional[Dict] = None) -> "ScanResult":
        """Scan every collected file of a skill folder and aggregate.

        The folder score is the highest file score plus 30% of the average
        score, capped at 100. The folder is malicious when that score is at
        least 70 or any counted file scored at least 90.

        Args:
            skill_folder: Root folder of the skill.
            context: Optional context passed to every file scan.

        Returns:
            One ScanResult for the whole folder; a result with file_type
            'error' when no scannable file is found.
        """
        start_time = time.time()

        key_files = self._find_key_files(skill_folder)

        if not key_files:
            return self._create_error_result(
                str(skill_folder),
                "No key files found"
            )

        file_results = []
        all_attack_types = set()
        max_score = 0
        total_score = 0

        for file_path in key_files:
            result = self.scan_file(file_path, context)
            file_results.append(result)

            # NOTE(review): only files judged malicious feed the score
            # aggregates; the average still divides by ALL files. Confirm
            # this is the intended weighting.
            if result.is_malicious:
                all_attack_types.update(result.attack_types)
                max_score = max(max_score, result.score)
                total_score += result.score

        file_count = len(file_results)
        avg_score = total_score / file_count if file_count > 0 else 0

        # Final skill score = highest score + bonus from the average score.
        final_score = min(max_score + int(avg_score * 0.3), 100)

        is_malicious = final_score >= 70 or max_score >= 90
        is_suspicious = 30 <= final_score < 70

        risk_level = (
            'CRITICAL' if final_score >= 90 else
            'HIGH' if final_score >= 70 else
            'MEDIUM' if final_score >= 30 else
            'LOW' if final_score >= 20 else
            'SAFE'
        )

        scan_time = (time.time() - start_time) * 1000

        all_patterns = []
        all_rules = []
        for r in file_results:
            all_patterns.extend(r.matched_patterns)
            all_rules.extend(r.matched_rules)

        return ScanResult(
            file_path=str(skill_folder),
            file_type='skill_folder',
            is_malicious=is_malicious,
            risk_level=risk_level,
            score=final_score,
            confidence=0.9 if is_malicious else 0.7 if is_suspicious else 0.5,
            # Sorted so repeated scans produce identical output.
            attack_types=sorted(all_attack_types),
            threat_summary=f"Scanned {file_count} files, found {len(all_patterns)} patterns and {len(all_rules)} rules",
            layer1_pattern={'file_results': [r.layer1_pattern for r in file_results]},
            layer2_rule={'file_results': [r.layer2_rule for r in file_results]},
            layer3_llm={'file_results': [r.layer3_llm for r in file_results]} if self.layer3 else None,
            matched_patterns=all_patterns[:20],  # keep at most 20
            matched_rules=all_rules[:20],
            scan_time_ms=scan_time,
            context=context
        )

    # ------------------------------------------------------------------
    # File discovery
    # ------------------------------------------------------------------
    def _find_key_files(self, skill_folder: Path, recursive: bool = True, max_depth: int = 20) -> List[Path]:
        """Collect the scannable files of a skill folder.

        Symlinks and files with a binary suffix are excluded.

        Args:
            skill_folder: Root folder.
            recursive: Descend into sub-directories when True (default).
            max_depth: Maximum recursion depth; values above 20 are clamped.

        Returns:
            Sorted list of file paths.
        """
        max_depth = min(max_depth, self._DEPTH_CAP)

        filtered_files = []

        if recursive:
            self._collect_files_recursive(
                skill_folder,
                filtered_files,
                current_depth=0,
                max_depth=max_depth
            )
        else:
            try:
                for f in skill_folder.iterdir():
                    if (f.is_file() and not f.is_symlink()
                            and f.suffix not in self._BINARY_SUFFIXES):
                        filtered_files.append(f)
            except OSError:
                # Unreadable folder: treated as containing no files.
                pass

        return sorted(filtered_files)

    def _collect_files_recursive(self, dir_path: Path, files_list: List[Path],
                                 current_depth: int, max_depth: int):
        """Append the scannable files below ``dir_path`` to ``files_list``.

        Args:
            dir_path: Directory to walk.
            files_list: Output list, extended in place.
            current_depth: Depth of ``dir_path`` relative to the scan root.
            max_depth: Directories at this depth or deeper are not read.
        """
        if current_depth >= max_depth:
            return

        try:
            for item in dir_path.iterdir():
                # Symlinks are skipped to avoid link cycles.
                if item.is_symlink():
                    continue

                if item.is_dir():
                    if item.name not in self._IGNORED_DIRS:
                        self._collect_files_recursive(
                            item, files_list,
                            current_depth + 1, max_depth
                        )
                elif item.is_file():
                    if item.suffix not in self._BINARY_SUFFIXES:
                        files_list.append(item)
        except OSError:
            # Covers PermissionError and other filesystem failures: the
            # unreadable directory is simply skipped.
            pass

    # ------------------------------------------------------------------
    # Assessment
    # ------------------------------------------------------------------
    def _assess(self, layer1: Dict, layer2: Dict, layer3: Optional[Dict]) -> Dict:
        """Combine the layer results into a final score and risk level.

        score = max(Layer 1 max_weight, Layer 2 max_confidence)
                + 3 per distinct attack type (at most 10)
                + 10 if an enabled Layer 3 judged the content malicious,
        capped at 100.

        Returns:
            Dict with 'is_malicious', 'risk_level', 'score', 'confidence',
            'attack_types' (sorted) and 'threat_summary'.
        """
        attack_types = set()
        attack_types.update(layer1.get('attack_types', []))
        attack_types.update(layer2.get('attack_types', []))

        pattern_score = layer1.get('max_weight', 0)
        rule_score = layer2.get('max_confidence', 0)
        base_score = max(pattern_score, rule_score)

        # The type bonus is computed before 'llm_confirmed' may be added.
        type_bonus = min(len(attack_types) * 3, 10)

        llm_adjustment = 0
        if layer3 and layer3.get('enabled') and layer3.get('is_malicious'):
            llm_adjustment = 10
            attack_types.add('llm_confirmed')

        final_score = min(base_score + type_bonus + llm_adjustment, 100)

        if final_score >= 90 or rule_score >= 95:
            risk_level = 'CRITICAL'
        elif final_score >= 70:
            risk_level = 'HIGH'
        elif final_score >= 30:
            risk_level = 'MEDIUM'
        elif final_score >= 20:
            risk_level = 'LOW'
        else:
            risk_level = 'SAFE'

        is_malicious = risk_level in ('MEDIUM', 'HIGH', 'CRITICAL')

        confidence = (
            0.95 if risk_level == 'CRITICAL' else
            0.85 if risk_level == 'HIGH' else
            0.70 if risk_level == 'MEDIUM' else
            0.50
        )

        ordered_types = sorted(attack_types)
        if ordered_types:
            threat_summary = f"Detected: {', '.join(ordered_types)}"
        else:
            threat_summary = "No threats detected"

        return {
            'is_malicious': is_malicious,
            'risk_level': risk_level,
            'score': final_score,
            'confidence': confidence,
            'attack_types': ordered_types,
            'threat_summary': threat_summary
        }

    def _create_error_result(self, file_path: str, error: str) -> "ScanResult":
        """Build a SAFE, zero-score result that records a scan failure."""
        return ScanResult(
            file_path=file_path,
            file_type='error',
            is_malicious=False,
            risk_level='SAFE',
            score=0,
            confidence=0.0,
            attack_types=[],
            threat_summary=f"Scan error: {error}",
            layer1_pattern=None,
            layer2_rule=None,
            layer3_llm=None,
            matched_patterns=[],
            matched_rules=[],
            scan_time_ms=0,
            context=None
        )
1007
+
1008
+
1009
+ # ========== 便捷函数 ==========
1010
def scan_file(file_path: str, rules_file: Optional[str] = None,
              llm_config: Optional[Dict] = None) -> ScanResult:
    """Convenience wrapper: build a Scanner and scan a single file.

    Args:
        file_path: Path of the file to scan.
        rules_file: Optional path to a JSON rule file.
        llm_config: Optional LLM API configuration.

    Returns:
        The ScanResult for the file.
    """
    rules_path = None
    if rules_file:
        rules_path = Path(rules_file)

    one_shot_scanner = Scanner(rules_file=rules_path, llm_config=llm_config)
    target = Path(file_path)
    return one_shot_scanner.scan_file(target)
1018
+
1019
+
1020
def scan_skill_folder(skill_folder: str, rules_file: Optional[str] = None,
                      llm_config: Optional[Dict] = None) -> ScanResult:
    """Convenience wrapper: build a Scanner and scan a whole skill folder.

    Args:
        skill_folder: Path of the skill folder to scan.
        rules_file: Optional path to a JSON rule file.
        llm_config: Optional LLM API configuration.

    Returns:
        The aggregated ScanResult for the folder.
    """
    rules_path = None
    if rules_file:
        rules_path = Path(rules_file)

    one_shot_scanner = Scanner(rules_file=rules_path, llm_config=llm_config)
    folder = Path(skill_folder)
    return one_shot_scanner.scan_skill_folder(folder)
1028
+
1029
+
1030
+ # ========== 命令行入口 ==========
1031
if __name__ == '__main__':
    # Command-line entry point: scan one file or one skill folder, optionally
    # print a summary (--verbose) and/or write the result as JSON (--output).
    import argparse

    parser = argparse.ArgumentParser(description=f"Scanner {VERSION}")
    parser.add_argument('path', help='扫描路径(文件或文件夹)')
    parser.add_argument('--rules', '-r', help='规则文件路径')
    parser.add_argument('--llm-config', '-l', help='LLM 配置文件路径')
    parser.add_argument('--output', '-o', help='输出 JSON 文件')
    parser.add_argument('--verbose', '-v', action='store_true', help='详细输出')

    args = parser.parse_args()

    path = Path(args.path)
    if not path.exists():
        print(f"❌ 路径不存在:{path}")
        sys.exit(1)

    # Load the LLM configuration. Read explicitly as UTF-8, like every other
    # JSON file in this module, instead of the platform default encoding.
    llm_config = None
    if args.llm_config:
        with open(args.llm_config, 'r', encoding='utf-8') as f:
            llm_config = json.load(f)

    scanner = Scanner(
        rules_file=Path(args.rules) if args.rules else None,
        llm_config=llm_config
    )

    # Anything that is not a regular file is scanned as a skill folder.
    if path.is_file():
        result = scanner.scan_file(path)
    else:
        result = scanner.scan_skill_folder(path)

    if args.verbose:
        print(f"\n{'='*60}")
        print(f"扫描结果")
        print(f"{'='*60}")
        print(f"路径:{result.file_path}")
        print(f"类型:{result.file_type}")
        print(f"恶意:{result.is_malicious}")
        print(f"风险:{result.risk_level}")
        print(f"分数:{result.score}")
        print(f"置信度:{result.confidence}")
        print(f"攻击类型:{', '.join(result.attack_types)}")
        print(f"威胁总结:{result.threat_summary}")
        print(f"Pattern 命中:{len(result.matched_patterns)}")
        print(f"Rule 命中:{len(result.matched_rules)}")
        print(f"耗时:{result.scan_time_ms:.2f}ms")

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(result.to_dict(), f, indent=2, ensure_ascii=False)
        print(f"\n✅ 结果已保存:{args.output}")