caidongyun 6.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scan ADDED
@@ -0,0 +1,17 @@
1
#!/usr/bin/env python3
"""
Agent Security Scanner CLI

Command-line launcher for quickly starting the scanner.
"""

import sys
from pathlib import Path

# Put this script's directory on the import path so ``scanner`` resolves
sys.path.insert(0, str(Path(__file__).parent))

from scanner import main

if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status,
    # matching what scanner.py does when it is run directly.
    sys.exit(main())
package/scanner.py ADDED
@@ -0,0 +1,322 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Security Scanner CLI v6.1.0
4
+ 统一安全扫描器 - 支持三层架构和 LLM 可选集成
5
+
6
+ 检测流程:
7
+ 1. PatternEngine (Layer 1) - 快速模式匹配
8
+ 2. RuleEngine (Layer 2) - 深度规则匹配
9
+ 3. LLMEngine (Layer 3, 可选) - 语义分析 + 上下文理解
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ import os
16
+ from pathlib import Path
17
+ from datetime import datetime
18
+ from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ from tqdm import tqdm
20
+
21
# Make the bundled ``src`` directory and the package directory importable.
# After both inserts the package directory is first on sys.path and
# ``src`` is second.
_PACKAGE_DIR = Path(__file__).parent
for _import_root in (_PACKAGE_DIR / 'src', _PACKAGE_DIR):
    sys.path.insert(0, str(_import_root))

# Engines of the three-layer architecture
from engines import PatternEngine, RuleEngine, LLMEngine
from whitelist_filter import WhitelistFilter
from config_detector import ConfigFileDetector

# Module-level components shared by every scan_file() call
whitelist_filter = WhitelistFilter()
config_detector = ConfigFileDetector()
33
+
34
+
35
def create_scanner(args):
    """
    Build the engines of the three-layer scanner.

    Args:
        args: Parsed CLI namespace. Reads ``llm``, and when it is truthy
            also ``llm_model``, ``llm_api_key`` and ``llm_threshold``.

    Returns:
        dict with keys ``layer1`` (PatternEngine), ``layer2`` (RuleEngine)
        and ``layer3`` (LLMEngine, or None when ``args.llm`` is falsy).
    """
    # Layers 1 and 2 are always created
    rules_path = Path(__file__).parent / 'rules' / 'dist' / 'all_rules.json'
    scanner = {
        'layer1': PatternEngine(),
        'layer2': RuleEngine(rules_file=rules_path),
        'layer3': None,
    }

    # Layer 3 is opt-in
    if not args.llm:
        return scanner

    print(f"🤖 启用 LLM 深度分析 (模型:{args.llm_model})")
    # An explicit --llm-api-key wins over the LLM_API_KEY environment variable
    api_key = args.llm_api_key or os.environ.get('LLM_API_KEY', '')
    scanner['layer3'] = LLMEngine({
        'model': args.llm_model,
        'api_key': api_key,
        'threshold': args.llm_threshold
    })
    return scanner
62
+
63
+
64
def scan_file(file_path: Path, scanner, max_depth: int = -1) -> dict:
    """
    Scan a single file through the layered engines and the whitelist filter.

    Args:
        file_path: File to scan.
        scanner: Dict holding the ``layer1``/``layer2``/``layer3`` engines
            and, optionally, ``base_path`` (used only for the depth check).
        max_depth: Maximum number of path components below ``base_path``.
            Values <= 0 disable the check.

    Returns:
        A result dict. Normal results carry ``file``, ``detected``,
        ``score``, ``findings_count``, ``risk_level``, ``matched_rules``,
        ``whitelist_applied`` and ``is_config_file`` (plus the raw layer
        results for non-config files). Files beyond ``max_depth`` yield
        ``{'file', 'skipped'}``; any exception yields
        ``{'file', 'error', 'detected': False}``.
    """
    try:
        # Depth check; silently skipped when base_path is missing or the
        # file does not live under it.
        if max_depth > 0:
            try:
                depth = len(file_path.relative_to(Path(scanner['base_path'])).parts)
            except (ValueError, KeyError):
                depth = 0
            if depth > max_depth:
                return {'file': str(file_path), 'skipped': 'max_depth'}

        # Undecodable bytes are dropped rather than raising
        content = file_path.read_text(encoding='utf-8', errors='ignore')

        # Config files short-circuit the engines (added in v6.1.0)
        file_type, config_risk = config_detector.classify_file(str(file_path), content)
        if file_type == 'config':
            malicious = config_risk == 'malicious'
            return {
                'file': str(file_path),
                'detected': malicious,
                'score': 80 if malicious else 0,
                'findings_count': 1 if malicious else 0,
                'risk_level': 'HIGH' if malicious else 'SAFE',
                'matched_rules': ['CONFIG-MALICIOUS'] if malicious else [],
                'whitelist_applied': False,
                'is_config_file': True
            }

        # Layer 1: pattern engine
        layer1_result = scanner['layer1'].scan(content, str(file_path))

        # Layer 2: rule engine, fed with the layer-1 result
        layer2_result = scanner['layer2'].scan(content, layer1_result)

        # Layer 3: optional LLM pass for low-confidence layer-2 results.
        # NOTE(review): the 0.8 cutoff is hard-coded here; the
        # --llm-threshold value is only handed to LLMEngine's config.
        layer3_result = None
        if scanner['layer3'] and layer2_result.get('confidence', 1.0) < 0.8:
            layer3_result = scanner['layer3'].scan(content, layer1_result, layer2_result)

        # The verdict is taken from layer 2
        matches = layer2_result.get('matches', [])
        hit_count = layer2_result.get('hit_count', 0)
        score = layer2_result.get('score', 0)
        risk_level = layer2_result.get('risk_level', 'SAFE')

        # Whitelist filtering
        whitelist_applied = False
        if matches:
            matches = whitelist_filter.filter_results(matches, str(file_path), content)
            hit_count = len(matches)
            whitelist_applied = True
            if not matches:
                # Everything was whitelisted: do not keep reporting the
                # stale layer-2 score/risk for a file that is not detected.
                score = 0
                risk_level = 'SAFE'

        # Up to five rule ids, de-duplicated in first-seen order so the
        # output is deterministic.
        rule_ids = [
            m[0] if isinstance(m, tuple) else m.get('rule_id', m.get('pattern', ''))
            for m in matches[:5]
        ]

        return {
            'file': str(file_path),
            'detected': hit_count > 0,
            'score': score,
            'findings_count': hit_count,
            'risk_level': risk_level,
            'matched_rules': list(dict.fromkeys(rule_ids)),
            'whitelist_applied': whitelist_applied,
            'is_config_file': False,
            'layer1_result': layer1_result,
            'layer2_result': layer2_result,
            'layer3_llm': layer3_result
        }
    except Exception as e:
        # Best effort: one unreadable or broken file must not abort the scan
        return {
            'file': str(file_path),
            'error': str(e),
            'detected': False
        }
161
+
162
+
163
def scan_directory(target_path: Path, scanner, args) -> list:
    """
    Scan a target (directory tree or single file) concurrently.

    Args:
        target_path: Directory to walk recursively, or a single file.
        scanner: Scanner dict passed through to scan_file().
        args: Parsed CLI namespace; reads ``extensions`` (comma-separated
            suffixes), ``max_files``, ``max_depth`` and ``workers``.

    Returns:
        List of scan_file() result dicts, in completion order.
    """
    print(f"\n📂 扫描目标:{target_path}")

    if target_path.is_file():
        # An explicitly named file is scanned as-is; rglob() on a file
        # path would find nothing.
        files_to_scan = [target_path]
    else:
        found = set()
        for ext in args.extensions.split(','):
            ext = ext.strip()
            if not ext:
                # An empty entry (e.g. trailing comma) would glob '*'
                continue
            # Keep regular files only; a directory named like 'x.js'
            # also matches the pattern.
            found.update(p for p in target_path.rglob(f'*{ext}') if p.is_file())
        # Sorted so the --max-files cut-off is reproducible between runs
        files_to_scan = sorted(found)

    # Apply the file-count limit
    if args.max_files > 0 and len(files_to_scan) > args.max_files:
        print(f"⚠️ 文件数超过 {args.max_files},只扫描前 {args.max_files} 个")
        files_to_scan = files_to_scan[:args.max_files]

    print(f"✅ 找到 {len(files_to_scan)} 个文件")

    # Concurrent scan; ThreadPoolExecutor rejects max_workers <= 0
    results = []
    with ThreadPoolExecutor(max_workers=max(1, args.workers)) as executor:
        futures = [executor.submit(scan_file, f, scanner, args.max_depth) for f in files_to_scan]
        for future in tqdm(as_completed(futures), total=len(futures), desc="扫描进度"):
            results.append(future.result())

    return results
190
+
191
+
192
def generate_report(results, args):
    """
    Aggregate per-file results into a report dict.

    Args:
        results: List of result dicts as produced by scan_file().
        args: Parsed CLI namespace; reads ``llm``, ``llm_model``,
            ``extensions`` and ``max_files``.

    Returns:
        dict with ``summary``, ``config``, ``risk_distribution``,
        ``llm_stats`` (None unless ``args.llm``) and the raw ``results``.
        ``summary`` reports ``errors`` and ``skipped`` separately, so a
        file that failed to scan or was skipped is not counted as safe.
    """
    # Counts
    total = len(results)
    detected = sum(1 for r in results if r.get('detected'))
    errors = sum(1 for r in results if 'error' in r)
    skipped = sum(1 for r in results if 'skipped' in r)
    safe = total - detected - errors - skipped

    # Risk distribution: only results that actually carry a known risk
    # level are counted (error/skipped results have none).
    risk_dist = {'CRITICAL': 0, 'HIGH': 0, 'MEDIUM': 0, 'LOW': 0, 'SAFE': 0}
    for r in results:
        risk_level = r.get('risk_level')
        if risk_level in risk_dist:
            risk_dist[risk_level] += 1

    # LLM statistics
    llm_stats = None
    if args.llm:
        llm_stats = {
            'analyzed': sum(1 for r in results if r.get('layer3_llm')),
            'model': args.llm_model
        }

    return {
        'summary': {
            'total_files': total,
            'detected': detected,
            'safe': safe,
            'errors': errors,
            'skipped': skipped,
            'detection_rate': detected / total * 100 if total > 0 else 0,
            'scan_time': datetime.now().isoformat()
        },
        'config': {
            'version': '6.1.0',
            'rules_count': 609,
            'extensions': args.extensions,
            'max_files': args.max_files,
            'llm_enabled': args.llm,
            'llm_model': args.llm_model if args.llm else None
        },
        'risk_distribution': risk_dist,
        'llm_stats': llm_stats,
        'results': results
    }
238
+
239
+
240
def main():
    """
    CLI entry point: parse arguments, scan the target, emit the report.

    Returns:
        Process exit status: 0 after a completed scan, 1 when the scan
        target does not exist.
    """
    import time  # local import: only needed for the elapsed-time figure

    parser = argparse.ArgumentParser(description='Security Scanner CLI v6.1.0 - 支持三层架构和 LLM 可选集成')

    # Basic options
    parser.add_argument('target', type=str, help='扫描目标 (文件或目录)')
    parser.add_argument('--extensions', type=str, default='.py,.js,.sh,.ps1,.yaml,.json',
                        help='文件扩展名 (默认:.py,.js,.sh,.ps1,.yaml,.json)')
    parser.add_argument('--max-files', type=int, default=1000,
                        help='最大文件数 (默认:1000)')
    parser.add_argument('--max-depth', type=int, default=10,
                        help='最大目录深度 (默认:10)')
    parser.add_argument('--workers', type=int, default=4,
                        help='并发 workers (默认:4)')

    # Optional LLM options
    llm_group = parser.add_argument_group('LLM 选项 (可选)')
    llm_group.add_argument('--llm', action='store_true',
                           help='启用 LLM 深度分析 (仅对可疑样本)')
    llm_group.add_argument('--llm-model', type=str, default='minimax',
                           choices=['minimax', 'qwen', 'openai'],
                           help='LLM 模型选择 (默认:minimax)')
    llm_group.add_argument('--llm-threshold', type=float, default=0.5,
                           help='LLM 分析阈值 (confidence < 阈值时启用,默认:0.5)')
    llm_group.add_argument('--llm-api-key', type=str, default='',
                           help='LLM API Key (默认:从 LLM_API_KEY 环境变量读取)')

    # Output options
    parser.add_argument('--output', type=str, default='text',
                        choices=['text', 'json'],
                        help='输出格式 (默认:text)')
    parser.add_argument('--output-file', type=str, default='scan_report.json',
                        help='输出文件路径 (默认:scan_report.json)')

    args = parser.parse_args()

    # Banner
    print("=" * 60)
    print("🛡️ Security Scanner CLI v6.1.0")
    print("=" * 60)
    print(f"⏰ 开始时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    started = time.perf_counter()

    # Fail fast on a bad target instead of reporting an empty "successful" scan
    target_path = Path(args.target)
    if not target_path.exists():
        print(f"❌ 扫描目标不存在:{target_path}", file=sys.stderr)
        return 1

    # Build the three-layer scanner; base_path feeds scan_file()'s depth check
    scanner = create_scanner(args)
    scanner['base_path'] = args.target

    # Scan
    results = scan_directory(target_path, scanner, args)

    # Report
    report = generate_report(results, args)
    elapsed = time.perf_counter() - started

    # Output
    if args.output == 'json':
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        print(f"\n📂 报告已保存:{args.output_file}")
    else:
        print("\n" + "=" * 60)
        print("📊 扫描总结")
        print("=" * 60)
        print(f"⏱️ 总耗时:{elapsed:.2f}s")
        print(f"📁 文件数:{report['summary']['total_files']}")
        print(f"✅ 检出:{report['summary']['detected']}")
        # 'safe' counts files with no finding; the scanner cannot know
        # whether those are missed detections, so label them neutrally.
        print(f"🟢 未检出:{report['summary']['safe']}")
        print(f"📈 检测率:{report['summary']['detection_rate']:.2f}%")
        print(f"\n🚨 风险分布:")
        for level, count in report['risk_distribution'].items():
            if count > 0:
                print(f" {level}: {count} 个")
        if report['llm_stats']:
            print(f"\n🤖 LLM 分析:")
            print(f" 分析样本:{report['llm_stats']['analyzed']} 个")
            print(f" 模型:{report['llm_stats']['model']}")
        print("=" * 60)
        print("\n✅ 扫描完成!")

    return 0


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,239 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ 编码处理工具 - 确保扫描器正确读取各种编码的文件
4
+
5
+ 问题:
6
+ - 样本文件没有声明编码(metadata.json 在外部)
7
+ - 使用 errors='ignore' 会丢弃无法解码的字符
8
+ - 编码不匹配导致规则匹配失败
9
+
10
+ 支持的编码形式:
11
+ 1. UTF-8 (无 BOM) - Linux/macOS 标准
12
+ 2. UTF-8 BOM (EF BB BF) - Windows 常见
13
+ 3. UTF-16 LE/BE - Windows 程序/文档
14
+ 4. UTF-32 LE/BE - 罕见但存在
15
+ 5. GBK/GB2312 - 中文 Windows
16
+ 6. Big5 - 繁体中文
17
+ 7. Shift-JIS - 日文
18
+ 8. EUC-KR - 韩文
19
+ 9. Latin-1/ISO-8859 - 欧洲语言
20
+ 10. Windows-1252 - Windows 西欧
21
+
22
+ 解决方案:
23
+ 1. 检测 BOM 头(优先)
24
+ 2. 使用 chardet 自动检测编码
25
+ 3. 优先 UTF-8,失败则用检测的编码
26
+ 4. 使用 errors='replace' 不丢字符(替换为 U+FFFD '�')
27
+ """
28
+
29
+ import chardet
30
+ from pathlib import Path
31
+ from typing import Tuple, Optional
32
+
33
+ # BOM 头定义
34
+ BOMS = {
35
+ b'\xef\xbb\xbf': 'utf-8-sig', # UTF-8 BOM
36
+ b'\xff\xfe': 'utf-16-le', # UTF-16 LE
37
+ b'\xfe\xff': 'utf-16-be', # UTF-16 BE
38
+ b'\xff\xfe\x00\x00': 'utf-32-le', # UTF-32 LE
39
+ b'\x00\x00\xfe\xff': 'utf-32-be', # UTF-32 BE
40
+ }
41
+
42
+ # 常见编码优先级(根据平台)
43
+ COMMON_ENCODINGS = {
44
+ 'windows': ['utf-8-sig', 'utf-8', 'gbk', 'gb2312', 'big5', 'latin-1', 'windows-1252'],
45
+ 'linux': ['utf-8', 'utf-8-sig', 'latin-1', 'iso-8859-1'],
46
+ 'darwin': ['utf-8', 'utf-8-sig', 'latin-1'],
47
+ 'default': ['utf-8', 'utf-8-sig', 'gbk', 'latin-1'],
48
+ }
49
+
50
+
51
def detect_bom(file_path: str) -> Optional[str]:
    """
    Identify a file's encoding from its byte-order mark.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The codec name mapped to the BOM found at the start of the file,
        or None when there is no BOM, the file does not exist, or any
        error occurs while reading it.
    """
    try:
        target = Path(file_path)
        if not target.exists():
            return None

        # Four bytes cover the longest BOM
        with open(target, 'rb') as handle:
            head = handle.read(4)

        # Longest marks first (UTF-32 > UTF-8 > UTF-16) so a UTF-32 LE
        # file is not mistaken for UTF-16 LE.
        longest_first = sorted(BOMS, key=len, reverse=True)
        return next((BOMS[mark] for mark in longest_first if head.startswith(mark)), None)
    except Exception:
        return None
77
+
78
+
79
def get_platform() -> str:
    """Map ``sys.platform`` to 'windows', 'darwin', 'linux' or 'default'."""
    import sys

    known_prefixes = (
        ('win', 'windows'),
        ('darwin', 'darwin'),
        ('linux', 'linux'),
    )
    for prefix, label in known_prefixes:
        if sys.platform.startswith(prefix):
            return label
    return 'default'
89
+
90
+
91
def detect_encoding(file_path: str, read_bytes: int = 10000) -> Tuple[Optional[str], float]:
    """
    Detect a file's encoding from its BOM, falling back to chardet.

    Args:
        file_path: Path of the file to inspect.
        read_bytes: Number of leading bytes handed to chardet
            (default 10000).

    Returns:
        (encoding, confidence). A BOM hit reports confidence 1.0.
        A missing file, an empty file, or any error yields (None, 0.0).
    """
    undetected = (None, 0.0)
    try:
        target = Path(file_path)
        if not target.exists():
            return undetected

        # 1. A BOM, when present, is authoritative
        bom_codec = detect_bom(str(target))
        if bom_codec:
            return bom_codec, 1.0

        # 2. Otherwise let chardet guess from the leading bytes
        with open(target, 'rb') as handle:
            sample = handle.read(read_bytes)
        if not sample:
            return undetected

        guess = chardet.detect(sample)
        return guess['encoding'], guess['confidence']
    except Exception:
        return undetected
122
+
123
+
124
def read_file_safe(file_path: str) -> Tuple[str, str]:
    """
    Read a text file, choosing the encoding automatically.

    Strategies, in the order they are tried:
        1. Byte-order mark (detect_bom) - highest priority.
        2. chardet's guess (detect_encoding), when its confidence is
           above 0.7.
        3. The platform's candidates from COMMON_ENCODINGS, each tried
           with strict decoding so a mismatch moves on to the next one.
        4. UTF-8 with errors='replace' as the last resort, so
           undecodable bytes become U+FFFD instead of being dropped.

    Args:
        file_path: Path of the file to read.

    Returns:
        (content, actual_encoding) - the decoded text and the codec used.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    path = Path(file_path)

    def _read(encoding: str, errors: str) -> str:
        """Read the whole file in text mode with the given codec."""
        with open(path, 'r', encoding=encoding, errors=errors) as f:
            return f.read()

    # Strategy 1: BOM
    bom_encoding = detect_bom(str(path))
    if bom_encoding:
        try:
            content = _read(bom_encoding, 'replace')
        except (LookupError, ValueError):
            pass
        else:
            # The endian-specific UTF-16/32 codecs keep the BOM as a
            # leading U+FEFF; drop it so it cannot disturb matching at
            # the start of the file.
            if content.startswith('\ufeff'):
                content = content[1:]
            return content, bom_encoding

    # Strategy 2: chardet (preferred over the platform defaults)
    detected_encoding, confidence = detect_encoding(str(path))
    if detected_encoding and confidence > 0.7:
        try:
            return _read(detected_encoding, 'replace'), detected_encoding
        except (LookupError, ValueError):
            # chardet may name a codec Python does not provide
            pass

    # Strategy 3: platform candidates. Strict decoding is required here:
    # with errors='replace' the first candidate could never fail and the
    # remaining ones would be unreachable.
    candidates = COMMON_ENCODINGS.get(get_platform(), COMMON_ENCODINGS['default'])
    for encoding in candidates:
        try:
            return _read(encoding, 'strict'), encoding
        except (LookupError, ValueError):
            continue

    # Strategy 4: last resort, never fails on content
    return _read('utf-8', 'replace'), 'utf-8'
188
+
189
+
190
def read_file_with_fallback(file_path: str, preferred_encodings: list = None) -> Tuple[str, str, bool]:
    """
    Read a file by trying several encodings with strict decoding.

    Args:
        file_path: Path of the file to read.
        preferred_encodings: Codec names to try, in order. Defaults to
            ['utf-8', 'gbk', 'gb2312', 'big5', 'latin-1'].

    Returns:
        (content, encoding, success). ``success`` is True when one of the
        preferred encodings decoded the file without errors; otherwise
        the result of read_file_safe() is returned with ``success`` False.
    """
    if preferred_encodings is None:
        # latin-1 decodes every byte sequence, so it has to come last or
        # the codecs after it would never be tried.
        preferred_encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'latin-1']

    for encoding in preferred_encodings:
        try:
            with open(file_path, 'r', encoding=encoding, errors='strict') as f:
                content = f.read()
            return content, encoding, True
        except (UnicodeError, LookupError):
            # Wrong or unknown codec: move on to the next candidate
            continue
        except Exception:
            # Anything else (e.g. the file cannot be opened) will not be
            # cured by another codec
            break

    # Nothing decoded cleanly; fall back to the tolerant reader
    content, encoding = read_file_safe(file_path)
    return content, encoding, False
217
+
218
+
219
if __name__ == '__main__':
    import sys

    # Exactly one positional argument is expected: the file to inspect
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法:python encoding_utils.py <文件路径>")
        print("示例:python encoding_utils.py samples/malicious/payload.bash")
        sys.exit(1)

    target = cli_args[0]
    print(f"检测文件:{target}")

    # Report what detection alone concludes
    guessed, score = detect_encoding(target)
    print(f"检测编码:{guessed} (置信度:{score:.2f})")

    # Then read the file and show which codec was actually used
    text, used = read_file_safe(target)
    print(f"实际使用:{used}")
    print(f"文件大小:{len(text)} 字符")
    preview = text[:200]
    print(f"前 200 字符:\n{preview}")