caidongyun 6.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +310 -0
- package/RELEASE_NOTES.md +200 -0
- package/SKILL.md +294 -0
- package/config_detector.py +134 -0
- package/index.d.ts +43 -0
- package/index.js +34 -0
- package/package.json +72 -0
- package/requirements.txt +11 -0
- package/rules/dist/all_rules.json +1 -0
- package/scan +17 -0
- package/scanner.py +322 -0
- package/src/encoding_utils.py +239 -0
- package/src/engines/__init__.py +1086 -0
- package/src/engines/aho_corasick_scanner.py +520 -0
- package/src/engines/ast_engine.py +290 -0
- package/src/engines/hybrid_scanner.py +284 -0
- package/src/engines/llm_engine.py +379 -0
- package/src/engines/pattern_engine.py +296 -0
- package/src/engines/rule_engine.py +282 -0
- package/whitelist_filter.py +394 -0
|
@@ -0,0 +1,1086 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
v6.0.0 Scanner - 集成 Gitleaks + Semgrep AI + Bandit
|
|
4
|
+
|
|
5
|
+
检测流程:
|
|
6
|
+
1. PatternEngine (Layer 1) - 快速模式匹配 (+ Gitleaks 220 条)
|
|
7
|
+
2. RuleEngine (Layer 2) - 深度规则匹配 (+ Semgrep AI 31 条 + Bandit 10 条)
|
|
8
|
+
3. LLMEngine (Layer 3, 可选) - 语义分析 + 上下文理解
|
|
9
|
+
|
|
10
|
+
设计原则:
|
|
11
|
+
- 串行执行,确保每层都能获取前层信息
|
|
12
|
+
- 准确性优先于性能
|
|
13
|
+
- 支持单文件和完整技能文件夹扫描
|
|
14
|
+
- LLM 可选,获取历史信息和完整上下文
|
|
15
|
+
- 自动加载外部规则(Gitleaks/Semgrep/Bandit)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import sys
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
import json
|
|
22
|
+
import time
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import List, Dict, Optional, Tuple, Set
|
|
25
|
+
from dataclasses import dataclass, asdict
|
|
26
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
27
|
+
|
|
28
|
+
# 注意:PatternEngine/RuleEngine/LLMEngine 在下方内联定义
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ========== 版本信息 ==========
|
|
32
|
+
VERSION = "v6.0.0"
|
|
33
|
+
SCANNER_NAME = "agent-security-skill-scanner"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ========== 扫描结果 ==========
|
|
37
|
+
@dataclass
class ScanResult:
    """Verdict produced for one scan target (a single file or a skill folder).

    The field order below is part of the interface, because instances may be
    constructed positionally.
    """

    # Target identification
    file_path: str
    file_type: str  # 'single_file' or 'skill_folder'

    # Risk assessment
    is_malicious: bool
    risk_level: str  # SAFE/LOW/MEDIUM/HIGH/CRITICAL
    score: int  # 0-100
    confidence: float  # 0.0-1.0

    # Attack information
    attack_types: List[str]
    threat_summary: str

    # Raw result of each detection layer
    layer1_pattern: Optional[Dict]  # PatternEngine result
    layer2_rule: Optional[Dict]  # RuleEngine result
    layer3_llm: Optional[Dict]  # LLMEngine result

    # Match details
    matched_patterns: List[Dict]
    matched_rules: List[Dict]

    # Performance
    scan_time_ms: float

    # Context used by the LLM layer (history, skill description, ...)
    context: Optional[Dict]

    def to_dict(self) -> Dict:
        """Return this result as a plain dict; nested containers are copied by ``asdict``."""
        as_mapping = asdict(self)
        return as_mapping
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ========== Layer 1: Pattern Engine ==========
|
|
74
|
+
class PatternEngine:
    """Layer 1: fast regular-expression screening.

    Responsibilities:
    - match the content against a table of known attack regexes
    - report which regexes hit, together with their weights
    - hand the set of candidate attack types to Layer 2
    """

    # (attack_type, regex, weight) triples. Order matters: it is the order in
    # which hits are reported. The section headings are nominal groupings;
    # several weights fall outside the range their heading suggests.
    ATTACK_PATTERNS = [
        # High severity
        ("reverse_shell", r'bash\s+-i', 55),
        ("reverse_shell", r'/dev/tcp/', 60),
        ("reverse_shell", r'nc\s+-e', 60),
        ("supply_chain_attack", r'curl\s+.*\|\s*bash', 60),
        ("false_prone", r'/dev/tcp/', 60),

        # Medium severity
        ("credential_theft", r'\.ssh/', 40),
        ("credential_theft", r'\.aws/', 40),
        ("prompt_injection", r'prompt[_-]inject', 40),
        ("prompt_injection", r'ignore\s+previous', 45),
        ("data_exfiltration", r'exfiltrat', 40),
        ("evasion", r'marshal\s*\.\s*(dumps|loads)', 40),
        ("resource_exhaustion", r'os\s*\.\s*fork\s*\(', 45),
        ("false_prone", r'attacker[-_]?c2', 50),
        ("false_prone", r'tar.*\.ssh', 50),
        ("false_prone", r'curl.*\|.*bash', 50),

        # Low severity
        ("credential_theft", r'credentials', 35),
        ("data_exfiltration", r'fetch\s*\(', 25),
        ("obfuscation", r'base64', 30),
        ("obfuscation", r'base64\.b64decode', 50),
        ("obfuscation", r'base64\.b64encode', 45),
        ("obfuscation", r'zlib\.compress', 50),
        ("obfuscation", r'zlib\.decompress', 50),
        ("obfuscation", r'exec.*base64', 60),
        ("persistence", r'systemd', 35),

        # Credential theft
        ("credential_theft", r'\.netrc', 50),
        ("credential_theft", r'/etc/shadow', 55),
        ("credential_theft", r'/etc/passwd', 50),

        # Resource exhaustion
        ("resource_exhaustion", r'subprocess\.Popen', 50),
        ("resource_exhaustion", r'os\.fork', 50),

        # Privilege escalation
        ("privilege_escalation", r'sudoers', 60),
        ("privilege_escalation", r'NOPASSWD', 60),
        ("privilege_escalation", r'chmod.*4755', 55),
    ]

    def __init__(self):
        """Pre-compile every table entry (case-insensitive) into ``self.compiled``.

        Each element of ``self.compiled`` is a 4-tuple
        ``(attack_type, compiled_regex, regex_source, weight)``. Entries whose
        regex fails to compile are reported on stdout and left out.
        """
        self.compiled = []
        for category, source, weight in self.ATTACK_PATTERNS:
            try:
                regex = re.compile(source, re.IGNORECASE)
            except re.error as e:
                print(f"⚠️ Pattern 编译失败:{source} - {e}")
            else:
                self.compiled.append((category, regex, source, weight))

        print(f"✅ PatternEngine: {len(self.compiled)} patterns")

    def scan(self, content: str, file_path: str = "") -> Dict:
        """Run every compiled regex against *content*.

        Args:
            content: text of the file being scanned.
            file_path: path of the file; accepted for the caller's
                convenience and not used by the matching itself.

        Returns:
            A dict with:
            - 'matches': list of ``(attack_type, regex_source, weight)`` hits
            - 'max_weight': highest weight among the hits (0 if none)
            - 'attack_types': list of the distinct attack types that hit
            - 'hit_count': number of hits
            - 'layer': the constant 'PatternEngine'
        """
        hits = []
        seen_sources = set()
        categories = set()

        for category, regex, source, weight in self.compiled:
            # A regex source is reported at most once, so a later entry that
            # repeats an already-hit source is skipped.
            if source in seen_sources or not regex.search(content):
                continue
            hits.append((category, source, weight))
            seen_sources.add(source)
            categories.add(category)

        heaviest = 0
        if hits:
            heaviest = max(weight for _, _, weight in hits)

        return {
            'matches': hits,
            'max_weight': heaviest,
            'attack_types': list(categories),
            'hit_count': len(hits),
            'layer': 'PatternEngine',
        }
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ========== Layer 2: Rule Engine ==========
|
|
190
|
+
class RuleEngine:
    """Layer 2: rule engine - multi-pattern rule matching.

    Responsibilities:
    - evaluate rules made of several regexes, each rule firing once at least
      ``min_matches`` of its regexes hit
    - narrow the evaluated rules using the attack types reported by Layer 1
    - provide a confidence score

    ``self.compiled`` is a list of rule dicts; every dict carries its
    compiled regexes under the ``'_compiled'`` key.
    """

    # Keyword table used to guess a category for rules whose category is
    # 'unknown' or 'false_prone_generated'. Keywords are compared as plain
    # lower-cased substrings.
    CATEGORY_KEYWORDS = {
        'credential_theft': ['shadow', 'passwd', 'netrc', '.aws/', '.ssh/', 'credential', 'password', 'secret'],
        'privilege_escalation': ['sudo', 'sudoers', 'NOPASSWD', 'chmod', '4755', 'SUID', 'setuid'],
        'resource_exhaustion': ['fork', 'bomb', 'exhaust', 'while.*true', 'subprocess'],
        'persistence': ['cron', 'systemd', '.bashrc', '.profile', 'startup'],
        'code_execution': ['exec', 'eval', 'compile', 'subprocess', 'os.system'],
    }

    def __init__(self, rules_file: Optional[Path] = None):
        """Load rules from *rules_file* when it exists, otherwise the built-in set.

        Args:
            rules_file: optional path to a JSON rules file.
        """
        self.rules_file = rules_file
        self.rules = []
        self.compiled = []

        if rules_file and rules_file.exists():
            self.load_rules(rules_file)
        else:
            self.load_builtin_rules()

        print(f"✅ RuleEngine: {len(self.compiled)} rules")

    def _infer_category(self, rule: Dict, content: str) -> str:
        """Return the rule's category, guessing one when it is not specific.

        Only the categories 'unknown' and 'false_prone_generated' are
        re-inferred. The guess is the first ``CATEGORY_KEYWORDS`` entry with a
        keyword found in the rule's patterns, the rule id, or *content*.
        The original category is returned when nothing matches.
        """
        category = rule.get('category', 'unknown')

        if category not in ['unknown', 'false_prone_generated']:
            return category

        patterns_str = str(rule.get('patterns', [])).lower()
        # str() guards against rule files that use a non-string id.
        rule_id = str(rule.get('id', '')).lower()
        content_lower = content.lower()

        for inferred_cat, keywords in self.CATEGORY_KEYWORDS.items():
            for kw in keywords:
                needle = kw.lower()
                if needle in patterns_str or needle in rule_id or needle in content_lower:
                    return inferred_cat

        return category

    def load_builtin_rules(self):
        """Install and compile the built-in high-confidence rules."""
        builtin_rules = [
            {
                'id': 'CRED-001',
                'name': 'SSH 密钥窃取',
                'category': 'credential_theft',
                'patterns': [r'\.ssh/', r'id_rsa', r'id_ed25519'],
                'min_matches': 2,
                'confidence': 95
            },
            {
                'id': 'CRED-002',
                'name': 'AWS 凭证窃取',
                'category': 'credential_theft',
                'patterns': [r'\.aws/', r'AWS_SECRET', r'AWS_ACCESS'],
                'min_matches': 2,
                'confidence': 95
            },
            {
                'id': 'EXFIL-001',
                'name': '数据外传',
                'category': 'data_exfiltration',
                'patterns': [r'curl\s+.*\|.*bash', r'wget.*\|.*sh'],
                'min_matches': 1,
                'confidence': 95
            },
            {
                'id': 'EVASION-001',
                'name': '代码混淆执行',
                'category': 'evasion',
                'patterns': [r'base64', r'eval\s*\(', r'exec\s*\('],
                'min_matches': 2,
                'confidence': 90
            },
            {
                'id': 'PERSIST-001',
                'name': '持久化后门',
                'category': 'persistence',
                'patterns': [r'crontab', r'systemd', r'\.service'],
                'min_matches': 2,
                'confidence': 90
            },
        ]

        self.rules = builtin_rules
        self._compile_rules()

    def load_rules(self, rules_file: Path):
        """Load rules from a JSON file, falling back to the built-in rules on error.

        The file may hold either ``{"rules": [...]}`` or a bare top-level
        list of rules.

        Args:
            rules_file: path of the JSON file to read.
        """
        try:
            with open(rules_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            rules = data if isinstance(data, list) else data.get('rules', [])
            if not isinstance(rules, list):
                raise ValueError("'rules' must be a list")
            self.rules = rules
            self._compile_rules()
        except Exception as e:
            print(f"⚠️ 规则加载失败:{e}")
            self.load_builtin_rules()

    def _compile_rules(self):
        """Compile the regexes of every rule in ``self.rules``.

        The compiled regexes are stored on the rule dict itself under
        ``'_compiled'``. Entries that are not dicts are skipped, and patterns
        that cannot be compiled are reported and dropped, so a single bad
        entry does not discard the rest of the rule set.
        """
        self.compiled = []

        for rule in self.rules:
            if not isinstance(rule, dict):
                print(f"⚠️ Skipping malformed rule entry: {rule!r}")
                continue

            compiled_patterns = []
            for pattern in rule.get('patterns') or []:
                try:
                    compiled_patterns.append(re.compile(pattern, re.IGNORECASE))
                except (re.error, TypeError) as e:
                    # TypeError covers patterns that are not strings.
                    print(f"⚠️ Pattern 编译失败:{pattern} - {e}")

            rule['_compiled'] = compiled_patterns
            self.compiled.append(rule)

    def scan(self, content: str, layer1_result: Optional[Dict] = None) -> Dict:
        """Evaluate the compiled rules against *content*.

        Args:
            content: text of the file being scanned.
            layer1_result: result dict of Layer 1; its 'attack_types' narrow
                the rules that are evaluated.

        Returns:
            A dict with:
            - 'matches': list of ``(rule_id, category, confidence, name)``
            - 'max_confidence': highest confidence among the matches (0 if none)
            - 'attack_types': list of the distinct matched categories
            - 'hit_count': number of matched rules
            - 'score': same value as 'max_confidence'
            - 'risk_level': SAFE/LOW/MEDIUM/HIGH/CRITICAL derived from the score
            - 'confidence': 'max_confidence' scaled to 0.0-1.0
            - 'layer': the constant 'RuleEngine'
        """
        matches = []
        attack_types = set()

        if layer1_result and layer1_result.get('attack_types'):
            priority_types = set(layer1_result['attack_types'])
        else:
            priority_types = None

        for rule in self.compiled:
            rule_category = self._infer_category(rule, content)

            # NOTE(review): when Layer 1 reported attack types, rules of any
            # other category are skipped outright rather than merely
            # de-prioritised - confirm this exclusion is intended.
            if priority_types and rule_category not in priority_types:
                continue

            match_count = sum(
                1 for compiled in rule.get('_compiled', []) if compiled.search(content)
            )

            if match_count >= rule.get('min_matches', 1):
                matches.append((
                    rule.get('id', 'UNKNOWN'),
                    rule_category,
                    rule.get('confidence', 50),
                    rule.get('name', '')
                ))
                attack_types.add(rule_category)

        max_confidence = max((c for _, _, c, _ in matches), default=0)

        # Score and risk level both derive from the strongest matched rule.
        score = max_confidence
        if max_confidence >= 80:
            risk_level = 'CRITICAL'
        elif max_confidence >= 60:
            risk_level = 'HIGH'
        elif max_confidence >= 40:
            risk_level = 'MEDIUM'
        elif max_confidence >= 20:
            risk_level = 'LOW'
        else:
            risk_level = 'SAFE'

        return {
            'matches': matches,
            'max_confidence': max_confidence,
            'attack_types': list(attack_types),
            'hit_count': len(matches),
            'score': score,
            'risk_level': risk_level,
            'confidence': max_confidence / 100.0,
            'layer': 'RuleEngine'
        }
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
# ========== Layer 3: LLM Engine (可选) ==========
|
|
394
|
+
class LLMEngine:
    """Layer 3: LLM engine - semantic analysis with context.

    Responsibilities:
    - analyse what the code means rather than which patterns it contains
    - use context (skill description, history) to judge intent
    - give the final confirmation

    This layer is optional and needs a user-supplied LLM API configuration.
    The API call itself is not implemented yet; ``scan`` returns a fixed
    placeholder verdict when the engine is enabled.
    """

    def __init__(self, api_config: Optional[Dict] = None):
        """Store the API configuration; the engine is enabled iff it is not None."""
        self.api_config = api_config
        self.enabled = api_config is not None

        if not self.enabled:
            print("ℹ️ LLMEngine: 未启用(跳过 Layer 3)")
        else:
            print(f"✅ LLMEngine: 已启用 ({api_config.get('provider', 'unknown')})")

    def scan(self, content: str, layer1_result: Dict, layer2_result: Dict,
             context: Optional[Dict] = None) -> Dict:
        """Produce the Layer 3 verdict.

        Args:
            content: text of the file being scanned.
            layer1_result: Layer 1 result dict.
            layer2_result: Layer 2 result dict.
            context: extra context (skill description, history, ...).

        Returns:
            ``{'enabled': False, 'reason': ...}`` when the engine is disabled.
            Otherwise a dict with 'enabled', 'is_malicious', 'confidence',
            'reasoning', 'threat_summary' and 'layer'. Until the API call is
            implemented this is a constant non-malicious placeholder.
        """
        if self.enabled:
            # TODO: call the LLM API for semantic analysis. Sketch:
            #   prompt = self._build_prompt(content, layer1_result, layer2_result, context)
            #   response = call_llm_api(prompt, self.api_config)
            #   return self._parse_response(response)
            placeholder = {
                'enabled': True,
                'is_malicious': False,
                'confidence': 0.0,
                'reasoning': 'LLM analysis not implemented yet',
                'threat_summary': '',
                'layer': 'LLMEngine'
            }
            return placeholder

        return {
            'enabled': False,
            'reason': 'LLM not configured'
        }

    def _build_prompt(self, content: str, layer1: Dict, layer2: Dict,
                      context: Optional[Dict]) -> str:
        """Render the analysis prompt sent to the LLM.

        The file content is cut to its first 5000 characters. *context* is
        embedded as JSON, or as a fixed "none" marker when it is empty.
        """
        template = """你是一个 AI 安全专家。请分析以下代码是否存在恶意行为。

## 代码内容
```
{content}
```

## Pattern 检测结果
- 命中数:{layer1_hits}
- 攻击类型:{layer1_types}
- 最高权重:{layer1_weight}

## Rule 检测结果
- 命中数:{layer2_hits}
- 攻击类型:{layer2_types}
- 最高置信度:{layer2_confidence}

## 上下文信息
{context}

## 任务
1. 判断代码是否恶意
2. 说明判断理由
3. 给出置信度 (0.0-1.0)
4. 总结威胁类型

请按以下 JSON 格式回复:
{{
"is_malicious": true/false,
"confidence": 0.0-1.0,
"reasoning": "...",
"threat_summary": "..."
}}
"""
        if context:
            context_text = json.dumps(context, ensure_ascii=False)
        else:
            context_text = '无'

        fields = {
            'content': content[:5000],  # keep the prompt bounded
            'layer1_hits': layer1.get('hit_count', 0),
            'layer1_types': ', '.join(layer1.get('attack_types', [])),
            'layer1_weight': layer1.get('max_weight', 0),
            'layer2_hits': layer2.get('hit_count', 0),
            'layer2_types': ', '.join(layer2.get('attack_types', [])),
            'layer2_confidence': layer2.get('max_confidence', 0),
            'context': context_text,
        }
        return template.format(**fields)
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
# ========== 主 Scanner ==========
|
|
507
|
+
class Scanner:
|
|
508
|
+
"""
|
|
509
|
+
主扫描器 - 串行执行三层检测
|
|
510
|
+
|
|
511
|
+
流程:
|
|
512
|
+
1. PatternEngine (Layer 1) - 快速模式匹配
|
|
513
|
+
2. RuleEngine (Layer 2) - 深度规则匹配
|
|
514
|
+
3. LLMEngine (Layer 3, 可选) - 语义分析
|
|
515
|
+
|
|
516
|
+
特点:
|
|
517
|
+
- 串行执行,每层都能获取前层信息
|
|
518
|
+
- 准确性优先
|
|
519
|
+
- 支持单文件和技能文件夹扫描
|
|
520
|
+
"""
|
|
521
|
+
|
|
522
|
+
def __init__(self, rules_file: Optional[Path] = None,
             llm_config: Optional[Dict] = None):
    """Create the detection layers, load external rules and reset statistics.

    Args:
        rules_file: optional JSON rules file passed on to the rule engine.
        llm_config: optional LLM API configuration; Layer 3 is only created
            when this is truthy.
    """
    self.version = VERSION

    print(f"🔧 初始化 Scanner {VERSION}...")

    # Layers are built in pipeline order.
    self.layer1 = PatternEngine()
    self.layer2 = RuleEngine(rules_file)
    if llm_config:
        self.layer3 = LLMEngine(llm_config)
    else:
        self.layer3 = None

    # Extend the engines with the optional external rule packs.
    self._load_external_rules()

    # Counters for the lifetime of this scanner.
    llm_available = self.layer3 is not None
    self.stats = dict(
        files_scanned=0,
        threats_found=0,
        layer1_hits=0,
        layer2_hits=0,
        layer3_enabled=llm_available,
    )

    print("✅ Scanner 初始化完成")
|
|
546
|
+
|
|
547
|
+
def _load_external_rules(self):
    """Load the optional external rule packs (Gitleaks, Semgrep AI, Bandit).

    Each pack is a JSON file in the ``rules`` directory three levels above
    this module. A missing file is skipped silently; a file that cannot be
    read or parsed is reported and skipped. Every file may hold either a
    top-level list of entries or a dict wrapping that list.

    - Gitleaks entries are appended to ``self.layer1.compiled`` as
      ``(attack_type, compiled_regex, regex_source, weight)`` tuples.
    - Semgrep AI and Bandit entries are appended to ``self.layer2.compiled``
      as rule dicts in the shape ``RuleEngine.scan`` reads ('id', 'name',
      'category', 'confidence', 'min_matches', '_compiled'). Only the first
      compilable pattern of each rule is used.
    """
    # This module is <package>/src/engines/__init__.py, so three ``parent``
    # hops reach <package>/.
    rules_dir = Path(__file__).parent.parent.parent / 'rules'

    def read_entries(rule_file, list_key):
        """Return the dict entries of *rule_file* (bare list or list under *list_key*)."""
        with open(rule_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        entries = data if isinstance(data, list) else data.get(list_key, [])
        return [entry for entry in entries if isinstance(entry, dict)]

    def first_compilable(patterns):
        """Return ``(source, compiled)`` for the first pattern that compiles, else None."""
        for pattern in patterns:
            try:
                return pattern, re.compile(pattern, re.IGNORECASE)
            except (re.error, TypeError):
                continue  # invalid or non-string pattern: try the next one
        return None

    def numeric_or(value, fallback):
        """Return *value* if it is a real number, otherwise *fallback*."""
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return fallback
        return value

    def load_rule_pack(file_name, label, source):
        """Append the rules of one Semgrep/Bandit style pack to Layer 2."""
        rule_file = rules_dir / file_name
        if not rule_file.exists():
            return
        try:
            loaded_count = 0
            for r in read_entries(rule_file, 'rules'):
                usable = first_compilable(r.get('patterns') or [])
                if usable is None:
                    continue
                pattern, compiled = usable

                if source == 'semgrep':
                    rule_id = str(
                        r.get('source', r.get('id', 'SEMGREP-UNKNOWN'))
                    ).replace('-', '_').upper()
                    category = r.get('category', 'credential_theft')
                    confidence = numeric_or(r.get('confidence', r.get('weight', 60)), 60)
                    severity = r.get('severity', 'medium')
                    description = f'Semgrep AI: {r.get("source", "")}'
                else:
                    rule_id = r.get('id', 'BANDIT-UNKNOWN')
                    category = r.get('category', 'arbitrary_execution')
                    confidence = numeric_or(r.get('confidence', 70), 70)
                    severity = r.get('severity', 'MEDIUM')
                    description = r.get('description', '')

                # The rule engine keeps its rules in a list and reads the
                # compiled regexes from '_compiled'.
                self.layer2.compiled.append({
                    'id': rule_id,
                    'name': r.get('name') or description,
                    'category': category,
                    'patterns': [pattern],
                    'min_matches': 1,
                    'confidence': confidence,
                    'severity': severity,
                    'description': description,
                    'source': source,
                    'rule': r,
                    '_compiled': [compiled],
                })
                loaded_count += 1

            print(f"✅ RuleEngine: 加载 {loaded_count} 条 {label} 规则")
        except Exception as e:
            print(f"⚠️ 加载 {label} 规则失败:{e}")

    # Gitleaks patterns extend Layer 1.
    gitleaks_file = rules_dir / 'gitleaks_patterns.json'
    if gitleaks_file.exists():
        try:
            loaded_count = 0
            for p in read_entries(gitleaks_file, 'patterns'):
                pattern_regex = p.get('pattern', '')
                if not pattern_regex:
                    continue
                try:
                    compiled = re.compile(pattern_regex, re.IGNORECASE)
                except (re.error, TypeError):
                    continue  # skip regexes Python's ``re`` cannot compile
                self.layer1.compiled.append((
                    p.get('attack_type', 'credential_theft'),
                    compiled,
                    pattern_regex,
                    p.get('weight', 40)
                ))
                loaded_count += 1

            # Report what was actually loaded, not what was attempted.
            print(f"✅ PatternEngine: 加载 {loaded_count} 条 Gitleaks 规则")
        except Exception as e:
            print(f"⚠️ 加载 Gitleaks 规则失败:{e}")

    # Semgrep AI and Bandit rules extend Layer 2.
    load_rule_pack('semgrep_ai_rules.json', 'Semgrep AI', 'semgrep')
    load_rule_pack('bandit_rules.json', 'Bandit', 'bandit')
|
|
679
|
+
|
|
680
|
+
def scan_file(self, file_path: Path, context: Optional[Dict] = None) -> ScanResult:
    """Scan one file by running the detection layers one after another.

    Args:
        file_path: file to scan; read as UTF-8 with undecodable bytes ignored.
        context: extra context (skill description, history, ...) passed to
            Layer 3 and stored on the result.

    Returns:
        A ``ScanResult``. If the file cannot be read, the result built by
        ``_create_error_result`` is returned and no statistics are updated.
    """
    started = time.time()

    try:
        content = file_path.read_text(encoding='utf-8', errors='ignore')
    except Exception as e:
        return self._create_error_result(str(file_path), str(e))

    # Layer 1: pattern screening
    layer1_result = self.layer1.scan(content, str(file_path))
    if layer1_result['hit_count'] > 0:
        self.stats['layer1_hits'] += 1

    # Layer 2: rule matching, informed by the Layer 1 result
    layer2_result = self.layer2.scan(content, layer1_result)
    if layer2_result['hit_count'] > 0:
        self.stats['layer2_hits'] += 1

    # Layer 3: LLM analysis, only when that layer exists
    if self.layer3:
        layer3_result = self.layer3.scan(
            content, layer1_result, layer2_result, context
        )
    else:
        layer3_result = None

    assessment = self._assess(layer1_result, layer2_result, layer3_result)

    elapsed_ms = (time.time() - started) * 1000

    # Turn the match tuples of both layers into dicts for the report.
    pattern_details = []
    for kind, regex_source, weight in layer1_result.get('matches', []):
        pattern_details.append({'type': kind, 'pattern': regex_source, 'weight': weight})

    rule_details = []
    for rule_id, category, rule_confidence, rule_name in layer2_result.get('matches', []):
        rule_details.append({
            'id': rule_id,
            'category': category,
            'confidence': rule_confidence,
            'name': rule_name,
        })

    result = ScanResult(
        file_path=str(file_path),
        file_type='single_file',
        is_malicious=assessment['is_malicious'],
        risk_level=assessment['risk_level'],
        score=assessment['score'],
        confidence=assessment['confidence'],
        attack_types=assessment['attack_types'],
        threat_summary=assessment.get('threat_summary', ''),
        layer1_pattern=layer1_result,
        layer2_rule=layer2_result,
        layer3_llm=layer3_result,
        matched_patterns=pattern_details,
        matched_rules=rule_details,
        scan_time_ms=elapsed_ms,
        context=context,
    )

    self.stats['files_scanned'] += 1
    if result.is_malicious:
        self.stats['threats_found'] += 1

    return result
|
|
752
|
+
|
|
753
|
+
def scan_skill_folder(self, skill_folder: Path,
                      context: Optional[Dict] = None) -> ScanResult:
    """Scan every key file of a skill folder and merge the verdicts.

    Args:
        skill_folder: root directory of the skill.
        context: extra context forwarded to each file scan.

    Returns:
        One ``ScanResult`` for the whole folder. If no key files are found,
        the result built by ``_create_error_result`` is returned.
    """
    started = time.time()

    key_files = self._find_key_files(skill_folder)
    if not key_files:
        return self._create_error_result(
            str(skill_folder),
            "No key files found"
        )

    file_results = [self.scan_file(path, context) for path in key_files]

    # Only files judged malicious feed the folder-level score.
    flagged = [r for r in file_results if r.is_malicious]
    all_attack_types = set()
    for r in flagged:
        all_attack_types.update(r.attack_types)
    max_score = max([0] + [r.score for r in flagged])
    total_score = sum(r.score for r in flagged)

    file_count = len(file_results)
    if file_count > 0:
        avg_score = total_score / file_count
    else:
        avg_score = 0

    # Folder score = highest file score plus 30% of the average, capped at 100.
    final_score = min(max_score + int(avg_score * 0.3), 100)

    is_malicious = final_score >= 70 or max_score >= 90
    is_suspicious = 30 <= final_score < 70

    risk_level = 'SAFE'
    for floor, label in ((90, 'CRITICAL'), (70, 'HIGH'), (30, 'MEDIUM'), (20, 'LOW')):
        if final_score >= floor:
            risk_level = label
            break

    if is_malicious:
        folder_confidence = 0.9
    elif is_suspicious:
        folder_confidence = 0.7
    else:
        folder_confidence = 0.5

    elapsed_ms = (time.time() - started) * 1000

    # Collect the matches of every file, malicious or not.
    all_patterns = []
    all_rules = []
    for r in file_results:
        all_patterns.extend(r.matched_patterns)
        all_rules.extend(r.matched_rules)

    if self.layer3:
        llm_section = {'file_results': [r.layer3_llm for r in file_results]}
    else:
        llm_section = None

    return ScanResult(
        file_path=str(skill_folder),
        file_type='skill_folder',
        is_malicious=is_malicious,
        risk_level=risk_level,
        score=final_score,
        confidence=folder_confidence,
        attack_types=list(all_attack_types),
        threat_summary=f"Scanned {file_count} files, found {len(all_patterns)} patterns and {len(all_rules)} rules",
        layer1_pattern={'file_results': [r.layer1_pattern for r in file_results]},
        layer2_rule={'file_results': [r.layer2_rule for r in file_results]},
        layer3_llm=llm_section,
        matched_patterns=all_patterns[:20],  # report at most 20
        matched_rules=all_rules[:20],
        scan_time_ms=elapsed_ms,
        context=context,
    )
|
|
837
|
+
|
|
838
|
+
def _find_key_files(self, skill_folder: Path, recursive: bool = True, max_depth: int = 20) -> List[Path]:
|
|
839
|
+
"""
|
|
840
|
+
找到技能文件夹中的所有文件(带深度限制和保护)
|
|
841
|
+
|
|
842
|
+
Args:
|
|
843
|
+
skill_folder: 技能文件夹路径
|
|
844
|
+
recursive: 是否递归扫描子目录(默认 True)
|
|
845
|
+
max_depth: 最大递归深度(默认 20 层,防止过深目录)
|
|
846
|
+
|
|
847
|
+
Returns:
|
|
848
|
+
文件路径列表
|
|
849
|
+
"""
|
|
850
|
+
# 安全限制:最大深度不超过 20 层
|
|
851
|
+
max_depth = min(max_depth, 20)
|
|
852
|
+
|
|
853
|
+
filtered_files = []
|
|
854
|
+
|
|
855
|
+
if recursive:
|
|
856
|
+
# 手动递归以控制深度
|
|
857
|
+
self._collect_files_recursive(
|
|
858
|
+
skill_folder,
|
|
859
|
+
filtered_files,
|
|
860
|
+
current_depth=0,
|
|
861
|
+
max_depth=max_depth
|
|
862
|
+
)
|
|
863
|
+
else:
|
|
864
|
+
# 仅扫描根目录
|
|
865
|
+
try:
|
|
866
|
+
for f in skill_folder.iterdir():
|
|
867
|
+
if f.is_file() and not f.is_symlink():
|
|
868
|
+
# 跳过二进制文件
|
|
869
|
+
if f.suffix not in {'.dll', '.so', '.exe', '.bin', '.dat', '.pyc', '.pyo'}:
|
|
870
|
+
filtered_files.append(f)
|
|
871
|
+
except:
|
|
872
|
+
pass
|
|
873
|
+
|
|
874
|
+
return sorted(filtered_files)
|
|
875
|
+
|
|
876
|
+
def _collect_files_recursive(self, dir_path: Path, files_list: List[Path],
|
|
877
|
+
current_depth: int, max_depth: int):
|
|
878
|
+
"""
|
|
879
|
+
递归收集文件(带深度限制和保护)
|
|
880
|
+
|
|
881
|
+
Args:
|
|
882
|
+
dir_path: 当前目录
|
|
883
|
+
files_list: 文件列表(累加)
|
|
884
|
+
current_depth: 当前深度
|
|
885
|
+
max_depth: 最大深度
|
|
886
|
+
"""
|
|
887
|
+
# 深度保护:超过最大深度停止
|
|
888
|
+
if current_depth >= max_depth:
|
|
889
|
+
return
|
|
890
|
+
|
|
891
|
+
try:
|
|
892
|
+
for item in dir_path.iterdir():
|
|
893
|
+
# 跳过符号链接(防止循环链接)
|
|
894
|
+
if item.is_symlink():
|
|
895
|
+
continue
|
|
896
|
+
|
|
897
|
+
# 跳过忽略的目录
|
|
898
|
+
ignored_dirs = {'.git', '.svn', '__pycache__', 'node_modules',
|
|
899
|
+
'.DS_Store', 'Thumbs.db', 'venv', '.venv', 'env', '.env'}
|
|
900
|
+
if item.is_dir() and item.name in ignored_dirs:
|
|
901
|
+
continue
|
|
902
|
+
|
|
903
|
+
if item.is_file():
|
|
904
|
+
# 跳过二进制文件
|
|
905
|
+
if item.suffix not in {'.dll', '.so', '.exe', '.bin', '.dat', '.pyc', '.pyo'}:
|
|
906
|
+
files_list.append(item)
|
|
907
|
+
|
|
908
|
+
elif item.is_dir():
|
|
909
|
+
# 递归子目录
|
|
910
|
+
self._collect_files_recursive(
|
|
911
|
+
item, files_list,
|
|
912
|
+
current_depth + 1, max_depth
|
|
913
|
+
)
|
|
914
|
+
except PermissionError:
|
|
915
|
+
# 跳过无权限访问的目录
|
|
916
|
+
pass
|
|
917
|
+
except Exception:
|
|
918
|
+
# 跳过其他错误
|
|
919
|
+
pass
|
|
920
|
+
|
|
921
|
+
def _assess(self, layer1: Dict, layer2: Dict, layer3: Optional[Dict]) -> Dict:
|
|
922
|
+
"""
|
|
923
|
+
综合评估
|
|
924
|
+
|
|
925
|
+
结合三层结果,计算最终分数和风险等级
|
|
926
|
+
"""
|
|
927
|
+
attack_types = set()
|
|
928
|
+
attack_types.update(layer1.get('attack_types', []))
|
|
929
|
+
attack_types.update(layer2.get('attack_types', []))
|
|
930
|
+
|
|
931
|
+
# 基础分数
|
|
932
|
+
pattern_score = layer1.get('max_weight', 0)
|
|
933
|
+
rule_score = layer2.get('max_confidence', 0)
|
|
934
|
+
|
|
935
|
+
# 取最高分
|
|
936
|
+
base_score = max(pattern_score, rule_score)
|
|
937
|
+
|
|
938
|
+
# 类型加成
|
|
939
|
+
type_bonus = min(len(attack_types) * 3, 10)
|
|
940
|
+
|
|
941
|
+
# LLM 调整(如果启用)
|
|
942
|
+
llm_adjustment = 0
|
|
943
|
+
if layer3 and layer3.get('enabled'):
|
|
944
|
+
if layer3.get('is_malicious'):
|
|
945
|
+
llm_adjustment = 10
|
|
946
|
+
attack_types.add('llm_confirmed')
|
|
947
|
+
|
|
948
|
+
# 最终分数
|
|
949
|
+
final_score = min(base_score + type_bonus + llm_adjustment, 100)
|
|
950
|
+
|
|
951
|
+
# 风险等级
|
|
952
|
+
if final_score >= 90 or rule_score >= 95:
|
|
953
|
+
risk_level = 'CRITICAL'
|
|
954
|
+
elif final_score >= 70:
|
|
955
|
+
risk_level = 'HIGH'
|
|
956
|
+
elif final_score >= 30:
|
|
957
|
+
risk_level = 'MEDIUM'
|
|
958
|
+
elif final_score >= 20:
|
|
959
|
+
risk_level = 'LOW'
|
|
960
|
+
else:
|
|
961
|
+
risk_level = 'SAFE'
|
|
962
|
+
|
|
963
|
+
is_malicious = risk_level in ('MEDIUM', 'HIGH', 'CRITICAL')
|
|
964
|
+
|
|
965
|
+
# 置信度
|
|
966
|
+
confidence = (
|
|
967
|
+
0.95 if risk_level == 'CRITICAL' else
|
|
968
|
+
0.85 if risk_level == 'HIGH' else
|
|
969
|
+
0.70 if risk_level == 'MEDIUM' else
|
|
970
|
+
0.50
|
|
971
|
+
)
|
|
972
|
+
|
|
973
|
+
# 威胁总结
|
|
974
|
+
if attack_types:
|
|
975
|
+
threat_summary = f"Detected: {', '.join(sorted(attack_types))}"
|
|
976
|
+
else:
|
|
977
|
+
threat_summary = "No threats detected"
|
|
978
|
+
|
|
979
|
+
return {
|
|
980
|
+
'is_malicious': is_malicious,
|
|
981
|
+
'risk_level': risk_level,
|
|
982
|
+
'score': final_score,
|
|
983
|
+
'confidence': confidence,
|
|
984
|
+
'attack_types': list(attack_types),
|
|
985
|
+
'threat_summary': threat_summary
|
|
986
|
+
}
|
|
987
|
+
|
|
988
|
+
def _create_error_result(self, file_path: str, error: str) -> ScanResult:
    """Build the placeholder ScanResult returned when a scan could not run.

    The result is marked as file type 'error', rated SAFE with zero score
    and confidence, carries no layer output or matches, and records the
    failure reason in ``threat_summary``.
    """
    # Verdict fields: neutral values, with the error text as the summary.
    verdict = dict(
        is_malicious=False,
        risk_level='SAFE',
        score=0,
        confidence=0.0,
        attack_types=[],
        threat_summary=f"Scan error: {error}",
    )
    # No detection layer produced output.
    layers = dict(
        layer1_pattern=None,
        layer2_rule=None,
        layer3_llm=None,
        matched_patterns=[],
        matched_rules=[],
    )
    return ScanResult(
        file_path=file_path,
        file_type='error',
        scan_time_ms=0,
        context=None,
        **verdict,
        **layers,
    )
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
# ========== 便捷函数 ==========
|
|
1010
|
+
def scan_file(file_path: str, rules_file: Optional[str] = None,
              llm_config: Optional[Dict] = None) -> ScanResult:
    """Convenience wrapper: scan a single file with a freshly built Scanner.

    Args:
        file_path: Path of the file to scan.
        rules_file: Optional path of a rules file; a falsy value means no
            rules file is passed to the Scanner.
        llm_config: Optional LLM configuration, forwarded unchanged.

    Returns:
        The ScanResult produced by ``Scanner.scan_file``.
    """
    rules_path = None
    if rules_file:
        rules_path = Path(rules_file)

    engine = Scanner(rules_file=rules_path, llm_config=llm_config)
    return engine.scan_file(Path(file_path))
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
def scan_skill_folder(skill_folder: str, rules_file: Optional[str] = None,
                      llm_config: Optional[Dict] = None) -> ScanResult:
    """Convenience wrapper: scan a skill folder with a freshly built Scanner.

    Args:
        skill_folder: Path of the folder to scan.
        rules_file: Optional path of a rules file; a falsy value means no
            rules file is passed to the Scanner.
        llm_config: Optional LLM configuration, forwarded unchanged.

    Returns:
        The ScanResult produced by ``Scanner.scan_skill_folder``.
    """
    rules_path = None
    if rules_file:
        rules_path = Path(rules_file)

    engine = Scanner(rules_file=rules_path, llm_config=llm_config)
    return engine.scan_skill_folder(Path(skill_folder))
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
# ========== 命令行入口 ==========
|
|
1031
|
+
if __name__ == '__main__':
    # Command-line entry point: scan one file or one folder and optionally
    # print a report (--verbose) and/or write the result as JSON (--output).
    import argparse

    parser = argparse.ArgumentParser(description=f"Scanner {VERSION}")
    parser.add_argument('path', help='扫描路径(文件或文件夹)')
    parser.add_argument('--rules', '-r', help='规则文件路径')
    parser.add_argument('--llm-config', '-l', help='LLM 配置文件路径')
    parser.add_argument('--output', '-o', help='输出 JSON 文件')
    parser.add_argument('--verbose', '-v', action='store_true', help='详细输出')

    args = parser.parse_args()

    path = Path(args.path)
    if not path.exists():
        print(f"❌ 路径不存在:{path}")
        sys.exit(1)

    # Load the optional LLM configuration. It is read as UTF-8 (the JSON
    # interchange encoding, and the encoding used for --output below) rather
    # than the platform default, and a missing or malformed file ends the
    # run with a message and exit code 1 instead of a traceback.
    llm_config = None
    if args.llm_config:
        try:
            with open(args.llm_config, encoding='utf-8') as f:
                llm_config = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"❌ 无法加载 LLM 配置:{exc}")
            sys.exit(1)

    scanner = Scanner(
        rules_file=Path(args.rules) if args.rules else None,
        llm_config=llm_config
    )

    # A regular file is scanned on its own; anything else is treated as a
    # skill folder.
    if path.is_file():
        result = scanner.scan_file(path)
    else:
        result = scanner.scan_skill_folder(path)

    if args.verbose:
        print(f"\n{'='*60}")
        print("扫描结果")
        print(f"{'='*60}")
        print(f"路径:{result.file_path}")
        print(f"类型:{result.file_type}")
        print(f"恶意:{result.is_malicious}")
        print(f"风险:{result.risk_level}")
        print(f"分数:{result.score}")
        print(f"置信度:{result.confidence}")
        print(f"攻击类型:{', '.join(result.attack_types)}")
        print(f"威胁总结:{result.threat_summary}")
        print(f"Pattern 命中:{len(result.matched_patterns)}")
        print(f"Rule 命中:{len(result.matched_rules)}")
        print(f"耗时:{result.scan_time_ms:.2f}ms")

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(result.to_dict(), f, indent=2, ensure_ascii=False)
        print(f"\n✅ 结果已保存:{args.output}")
|