caidongyun 6.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +310 -0
- package/RELEASE_NOTES.md +200 -0
- package/SKILL.md +294 -0
- package/config_detector.py +134 -0
- package/index.d.ts +43 -0
- package/index.js +34 -0
- package/package.json +72 -0
- package/requirements.txt +11 -0
- package/rules/dist/all_rules.json +1 -0
- package/scan +17 -0
- package/scanner.py +322 -0
- package/src/encoding_utils.py +239 -0
- package/src/engines/__init__.py +1086 -0
- package/src/engines/aho_corasick_scanner.py +520 -0
- package/src/engines/ast_engine.py +290 -0
- package/src/engines/hybrid_scanner.py +284 -0
- package/src/engines/llm_engine.py +379 -0
- package/src/engines/pattern_engine.py +296 -0
- package/src/engines/rule_engine.py +282 -0
- package/whitelist_filter.py +394 -0
package/scan
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Agent Security Scanner CLI

Command-line launcher that starts the scanner quickly.
"""

import sys
from pathlib import Path

# Put this script's directory on the import path so that `scanner` resolves
# regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))

from scanner import main

if __name__ == '__main__':
    # Forward main()'s return value as the process exit status, the same way
    # scanner.py does when it is run directly.
    sys.exit(main())
|
package/scanner.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Security Scanner CLI v6.1.0
|
|
4
|
+
统一安全扫描器 - 支持三层架构和 LLM 可选集成
|
|
5
|
+
|
|
6
|
+
检测流程:
|
|
7
|
+
1. PatternEngine (Layer 1) - 快速模式匹配
|
|
8
|
+
2. RuleEngine (Layer 2) - 深度规则匹配
|
|
9
|
+
3. LLMEngine (Layer 3, 可选) - 语义分析 + 上下文理解
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
import os
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from datetime import datetime
|
|
18
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
19
|
+
from tqdm import tqdm
|
|
20
|
+
|
|
21
|
+
# 添加 src 路径
|
|
22
|
+
sys.path.insert(0, str(Path(__file__).parent / 'src'))
|
|
23
|
+
sys.path.insert(0, str(Path(__file__).parent))
|
|
24
|
+
|
|
25
|
+
# 导入三层架构引擎
|
|
26
|
+
from engines import PatternEngine, RuleEngine, LLMEngine
|
|
27
|
+
from whitelist_filter import WhitelistFilter
|
|
28
|
+
from config_detector import ConfigFileDetector
|
|
29
|
+
|
|
30
|
+
# 全局组件
|
|
31
|
+
whitelist_filter = WhitelistFilter()
|
|
32
|
+
config_detector = ConfigFileDetector()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def create_scanner(args):
    """Build the engines used for a scan (three-layer architecture).

    Args:
        args: Parsed CLI namespace. Reads ``llm``; when that is truthy also
            reads ``llm_model``, ``llm_api_key`` and ``llm_threshold``.

    Returns:
        A dict with keys ``layer1`` (PatternEngine), ``layer2`` (RuleEngine
        loaded from ``rules/dist/all_rules.json`` next to this file) and
        ``layer3`` (an LLMEngine, or None when LLM analysis is disabled).
    """
    rules_path = Path(__file__).parent / 'rules' / 'dist' / 'all_rules.json'

    # Layers 1 and 2 are always present; layer 3 is opt-in.
    engines = {
        'layer1': PatternEngine(),
        'layer2': RuleEngine(rules_file=rules_path),
        'layer3': None,
    }
    if not args.llm:
        return engines

    print(f"🤖 启用 LLM 深度分析 (模型:{args.llm_model})")
    # An explicit --llm-api-key wins; otherwise fall back to the LLM_API_KEY
    # environment variable (empty string when neither is set).
    api_key = args.llm_api_key or os.environ.get('LLM_API_KEY', '')
    engines['layer3'] = LLMEngine({
        'model': args.llm_model,
        'api_key': api_key,
        'threshold': args.llm_threshold,
    })
    return engines
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def scan_file(file_path: Path, scanner, max_depth: int = -1) -> dict:
    """Scan a single file through the layered engines and the whitelist filter.

    Args:
        file_path: File to scan.
        scanner: Dict holding ``layer1``, ``layer2`` and ``layer3`` engines
            (``layer3`` may be None) and, for depth checks, ``base_path``.
        max_depth: When > 0, files whose path relative to
            ``scanner['base_path']`` has more than this many components are
            skipped. Values <= 0 disable the check.

    Returns:
        A result dict. Normal results carry ``file``, ``detected``, ``score``,
        ``findings_count``, ``risk_level``, ``matched_rules``,
        ``whitelist_applied`` and ``is_config_file`` (plus the raw layer
        results for non-config files). Depth-skipped files yield
        ``{'file', 'skipped'}``; any exception yields
        ``{'file', 'error', 'detected': False}`` instead of propagating.
    """
    try:
        # Depth limit. A missing base_path or a file outside base_path simply
        # disables the check for this file.
        if max_depth > 0:
            try:
                depth = len(file_path.relative_to(Path(scanner['base_path'])).parts)
                if depth > max_depth:
                    return {'file': str(file_path), 'skipped': 'max_depth'}
            except (ValueError, KeyError):
                pass

        # Undecodable bytes are dropped rather than failing the scan.
        content = file_path.read_text(encoding='utf-8', errors='ignore')

        # Config files short-circuit the engines: the config detector's
        # verdict is final for them.
        file_type, config_risk = config_detector.classify_file(str(file_path), content)
        if file_type == 'config':
            malicious = config_risk == 'malicious'
            return {
                'file': str(file_path),
                'detected': malicious,
                'score': 80 if malicious else 0,
                'findings_count': 1 if malicious else 0,
                'risk_level': 'HIGH' if malicious else 'SAFE',
                'matched_rules': ['CONFIG-MALICIOUS'] if malicious else [],
                'whitelist_applied': False,
                'is_config_file': True,
            }

        # Layer 1: pattern engine.
        layer1_result = scanner['layer1'].scan(content, str(file_path))

        # Layer 2: rule engine, fed with the layer-1 result.
        layer2_result = scanner['layer2'].scan(content, layer1_result)

        # Layer 3 (optional): only consulted when layer 2 reports a
        # confidence below 0.8; a missing confidence counts as 1.0.
        layer3_result = None
        if scanner['layer3'] and layer2_result.get('confidence', 1.0) < 0.8:
            layer3_result = scanner['layer3'].scan(content, layer1_result, layer2_result)

        # The verdict is taken from layer 2.
        matches = layer2_result.get('matches', [])
        hit_count = layer2_result.get('hit_count', 0)
        score = layer2_result.get('score', 0)
        risk_level = layer2_result.get('risk_level', 'SAFE')

        # Whitelist filtering.
        whitelist_applied = False
        if matches:
            matches = whitelist_filter.filter_results(matches, str(file_path), content)
            hit_count = len(matches)
            whitelist_applied = True
            if not matches:
                # Every hit was whitelisted: the file is reported as not
                # detected, so the score and risk level must agree instead of
                # keeping the pre-filter values.
                score = 0
                risk_level = 'SAFE'

        # Rule ids of the first five matches, de-duplicated in first-seen
        # order so the output is deterministic.
        rule_ids = [
            m[0] if isinstance(m, tuple) else m.get('rule_id', m.get('pattern', ''))
            for m in matches[:5]
        ]

        return {
            'file': str(file_path),
            'detected': hit_count > 0,
            'score': score,
            'findings_count': hit_count,
            'risk_level': risk_level,
            'matched_rules': list(dict.fromkeys(rule_ids)),
            'whitelist_applied': whitelist_applied,
            'is_config_file': False,
            'layer1_result': layer1_result,
            'layer2_result': layer2_result,
            'layer3_llm': layer3_result,
        }
    except Exception as e:
        # Best-effort per file: one bad file must not abort the whole scan.
        return {
            'file': str(file_path),
            'error': str(e),
            'detected': False,
        }
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def scan_directory(target_path: Path, scanner, args) -> list:
    """Collect the files under a target and scan them concurrently.

    Args:
        target_path: Directory to walk recursively, or a single file to scan
            directly (a file target is scanned whatever its extension).
        scanner: Engine dict passed through to ``scan_file``.
        args: Parsed CLI namespace. Reads ``extensions`` (comma-separated
            suffixes), ``max_files`` (<= 0 means unlimited), ``workers`` and
            ``max_depth``.

    Returns:
        A list with one ``scan_file`` result per scanned file, in completion
        order.
    """
    print(f"\n📂 扫描目标:{target_path}")

    # Collect files.
    if target_path.is_file():
        # rglob() on a file yields nothing, so handle a file target directly.
        files_to_scan = [target_path]
    else:
        found = set()  # a set de-duplicates files matched by several suffixes
        for ext in args.extensions.split(','):
            suffix = ext.strip()
            if not suffix:
                # An empty entry (e.g. a trailing comma) would glob '*' and
                # pull in every file.
                continue
            # Skip directories whose name happens to match the suffix.
            found.update(p for p in target_path.rglob(f'*{suffix}') if p.is_file())
        # Sort so that the --max-files cut-off picks a reproducible subset.
        files_to_scan = sorted(found)

    # Apply the file-count limit.
    if args.max_files > 0 and len(files_to_scan) > args.max_files:
        print(f"⚠️ 文件数超过 {args.max_files},只扫描前 {args.max_files} 个")
        files_to_scan = files_to_scan[:args.max_files]

    print(f"✅ 找到 {len(files_to_scan)} 个文件")

    # Concurrent scan. ThreadPoolExecutor rejects max_workers <= 0, so clamp.
    results = []
    with ThreadPoolExecutor(max_workers=max(1, args.workers)) as executor:
        futures = [executor.submit(scan_file, f, scanner, args.max_depth) for f in files_to_scan]
        for future in tqdm(as_completed(futures), total=len(futures), desc="扫描进度"):
            results.append(future.result())

    return results
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def generate_report(results, args):
    """Aggregate per-file scan results into a report dict.

    Files that failed to scan (result has an ``error`` key) or were skipped
    (result has a ``skipped`` key) are counted separately; they are neither
    "safe" nor part of the risk distribution, because they were never
    actually analysed.

    Args:
        results: List of result dicts as produced by ``scan_file``.
        args: Parsed CLI namespace. Reads ``llm``, ``llm_model``,
            ``extensions`` and ``max_files``.

    Returns:
        A dict with ``summary``, ``config``, ``risk_distribution``,
        ``llm_stats`` (None unless LLM analysis is enabled) and the original
        ``results`` list.
    """
    total = len(results)
    detected = 0
    errors = 0
    skipped = 0
    risk_dist = {'CRITICAL': 0, 'HIGH': 0, 'MEDIUM': 0, 'LOW': 0, 'SAFE': 0}

    for r in results:
        if 'error' in r:
            errors += 1
            continue
        if 'skipped' in r:
            skipped += 1
            continue
        if r.get('detected'):
            detected += 1
        # Unknown risk levels are ignored rather than added as new buckets.
        risk_level = r.get('risk_level', 'SAFE')
        if risk_level in risk_dist:
            risk_dist[risk_level] += 1

    safe = total - detected - errors - skipped

    # LLM statistics: how many files actually received a layer-3 result.
    llm_stats = None
    if args.llm:
        llm_stats = {
            'analyzed': sum(1 for r in results if r.get('layer3_llm')),
            'model': args.llm_model,
        }

    return {
        'summary': {
            'total_files': total,
            'detected': detected,
            'safe': safe,
            'errors': errors,
            'skipped': skipped,
            'detection_rate': detected / total * 100 if total > 0 else 0,
            'scan_time': datetime.now().isoformat(),
        },
        'config': {
            'version': '6.1.0',
            # NOTE(review): hard-coded; not derived from the loaded rule file.
            'rules_count': 609,
            'extensions': args.extensions,
            'max_files': args.max_files,
            'llm_enabled': args.llm,
            'llm_model': args.llm_model if args.llm else None,
        },
        'risk_distribution': risk_dist,
        'llm_stats': llm_stats,
        'results': results,
    }
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def main(argv=None):
    """CLI entry point: parse arguments, run the scan and emit the report.

    Args:
        argv: Optional argument list (without the program name). Defaults to
            ``sys.argv[1:]`` when None, so existing ``main()`` callers are
            unaffected.

    Returns:
        0 on completion. Invalid arguments, including a scan target that does
        not exist, terminate via ``parser.error`` (exit status 2).
    """
    parser = argparse.ArgumentParser(description='Security Scanner CLI v6.1.0 - 支持三层架构和 LLM 可选集成')

    # Basic options.
    parser.add_argument('target', type=str, help='扫描目标 (文件或目录)')
    parser.add_argument('--extensions', type=str, default='.py,.js,.sh,.ps1,.yaml,.json',
                        help='文件扩展名 (默认:.py,.js,.sh,.ps1,.yaml,.json)')
    parser.add_argument('--max-files', type=int, default=1000,
                        help='最大文件数 (默认:1000)')
    parser.add_argument('--max-depth', type=int, default=10,
                        help='最大目录深度 (默认:10)')
    parser.add_argument('--workers', type=int, default=4,
                        help='并发 workers (默认:4)')

    # Optional LLM options.
    llm_group = parser.add_argument_group('LLM 选项 (可选)')
    llm_group.add_argument('--llm', action='store_true',
                           help='启用 LLM 深度分析 (仅对可疑样本)')
    llm_group.add_argument('--llm-model', type=str, default='minimax',
                           choices=['minimax', 'qwen', 'openai'],
                           help='LLM 模型选择 (默认:minimax)')
    llm_group.add_argument('--llm-threshold', type=float, default=0.5,
                           help='LLM 分析阈值 (confidence < 阈值时启用,默认:0.5)')
    llm_group.add_argument('--llm-api-key', type=str, default='',
                           help='LLM API Key (默认:从 LLM_API_KEY 环境变量读取)')

    # Output options.
    parser.add_argument('--output', type=str, default='text',
                        choices=['text', 'json'],
                        help='输出格式 (默认:text)')
    parser.add_argument('--output-file', type=str, default='scan_report.json',
                        help='输出文件路径 (默认:scan_report.json)')

    args = parser.parse_args(argv)

    # Fail fast on a missing target instead of reporting an empty scan as a
    # success.
    target_path = Path(args.target)
    if not target_path.exists():
        parser.error(f"扫描目标不存在:{args.target}")

    # Banner.
    started = datetime.now()
    print("=" * 60)
    print("🛡️ Security Scanner CLI v6.1.0")
    print("=" * 60)
    print(f"⏰ 开始时间:{started.strftime('%Y-%m-%d %H:%M:%S')}")

    # Build the engines; base_path is what scan_file measures depth against.
    scanner = create_scanner(args)
    scanner['base_path'] = args.target

    # Scan.
    results = scan_directory(target_path, scanner, args)

    # Build the report.
    report = generate_report(results, args)
    elapsed = (datetime.now() - started).total_seconds()

    # Output.
    if args.output == 'json':
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        print(f"\n📂 报告已保存:{args.output_file}")
    else:
        print("\n" + "=" * 60)
        print("📊 扫描总结")
        print("=" * 60)
        # Report the measured wall-clock time rather than a placeholder.
        print(f"⏱️ 总耗时:{elapsed:.2f}s")
        print(f"📁 文件数:{report['summary']['total_files']}")
        print(f"✅ 检出:{report['summary']['detected']}")
        print(f"❌ 漏检:{report['summary']['safe']}")
        print(f"📈 检测率:{report['summary']['detection_rate']:.2f}%")
        print(f"\n🚨 风险分布:")
        for level, count in report['risk_distribution'].items():
            if count > 0:
                print(f" {level}: {count} 个")
        if report['llm_stats']:
            print(f"\n🤖 LLM 分析:")
            print(f" 分析样本:{report['llm_stats']['analyzed']} 个")
            print(f" 模型:{report['llm_stats']['model']}")
        print("=" * 60)
    print("\n✅ 扫描完成!")

    return 0
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
编码处理工具 - 确保扫描器正确读取各种编码的文件
|
|
4
|
+
|
|
5
|
+
问题:
|
|
6
|
+
- 样本文件没有声明编码(metadata.json 在外部)
|
|
7
|
+
- 使用 errors='ignore' 会丢弃无法解码的字符
|
|
8
|
+
- 编码不匹配导致规则匹配失败
|
|
9
|
+
|
|
10
|
+
支持的编码形式:
|
|
11
|
+
1. UTF-8 (无 BOM) - Linux/macOS 标准
|
|
12
|
+
2. UTF-8 BOM (EF BB BF) - Windows 常见
|
|
13
|
+
3. UTF-16 LE/BE - Windows 程序/文档
|
|
14
|
+
4. UTF-32 LE/BE - 罕见但存在
|
|
15
|
+
5. GBK/GB2312 - 中文 Windows
|
|
16
|
+
6. Big5 - 繁体中文
|
|
17
|
+
7. Shift-JIS - 日文
|
|
18
|
+
8. EUC-KR - 韩文
|
|
19
|
+
9. Latin-1/ISO-8859 - 欧洲语言
|
|
20
|
+
10. Windows-1252 - Windows 西欧
|
|
21
|
+
|
|
22
|
+
解决方案:
|
|
23
|
+
1. 检测 BOM 头(优先)
|
|
24
|
+
2. 使用 chardet 自动检测编码
|
|
25
|
+
3. 优先 UTF-8,失败则用检测的编码
|
|
26
|
+
4. 使用 errors='replace' 不丢字符(替换为 U+FFFD �)
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import chardet
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Tuple, Optional
|
|
32
|
+
|
|
33
|
+
# BOM (byte order mark) signatures, mapped to the codec name used to decode
# a file that starts with that signature.
# Note: the 2-byte UTF-16 LE signature is a prefix of the 4-byte UTF-32 LE
# signature, so lookups must test longer signatures first.
|
|
34
|
+
BOMS = {
|
|
35
|
+
b'\xef\xbb\xbf': 'utf-8-sig', # UTF-8 BOM
|
|
36
|
+
b'\xff\xfe': 'utf-16-le', # UTF-16 LE
|
|
37
|
+
b'\xfe\xff': 'utf-16-be', # UTF-16 BE
|
|
38
|
+
b'\xff\xfe\x00\x00': 'utf-32-le', # UTF-32 LE
|
|
39
|
+
b'\x00\x00\xfe\xff': 'utf-32-be', # UTF-32 BE
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
# Candidate encodings in priority order, keyed by platform name
|
|
43
|
+
COMMON_ENCODINGS = {
|
|
44
|
+
'windows': ['utf-8-sig', 'utf-8', 'gbk', 'gb2312', 'big5', 'latin-1', 'windows-1252'],
|
|
45
|
+
'linux': ['utf-8', 'utf-8-sig', 'latin-1', 'iso-8859-1'],
|
|
46
|
+
'darwin': ['utf-8', 'utf-8-sig', 'latin-1'],
|
|
47
|
+
'default': ['utf-8', 'utf-8-sig', 'gbk', 'latin-1'],
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def detect_bom(file_path: str) -> Optional[str]:
    """Return the codec implied by the file's byte order mark.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The codec name from ``BOMS`` whose signature the file starts with, or
        None when the file does not exist, carries no known BOM, or cannot
        be read.
    """
    try:
        target = Path(file_path)
        if target.exists():
            with open(target, 'rb') as handle:
                prefix = handle.read(4)  # the longest known BOM is 4 bytes

            # Longest signatures first, so a UTF-32 BOM is not mistaken for
            # the UTF-16 BOM it begins with.
            for signature in sorted(BOMS, key=len, reverse=True):
                if prefix.startswith(signature):
                    return BOMS[signature]
    except Exception:
        pass
    return None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_platform() -> str:
    """Classify the running platform.

    Returns:
        ``'windows'``, ``'darwin'`` or ``'linux'`` according to the prefix of
        ``sys.platform``; ``'default'`` for anything else.
    """
    import sys

    known_prefixes = (
        ('win', 'windows'),
        ('darwin', 'darwin'),
        ('linux', 'linux'),
    )
    for prefix, label in known_prefixes:
        if sys.platform.startswith(prefix):
            return label
    return 'default'
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def detect_encoding(file_path: str, read_bytes: int = 10000) -> Tuple[Optional[str], float]:
    """Detect a file's encoding from its BOM, falling back to chardet.

    Args:
        file_path: Path of the file to inspect.
        read_bytes: Number of leading bytes handed to chardet (default 10000).

    Returns:
        ``(encoding, confidence)``. A BOM hit is reported with confidence
        1.0; otherwise the values come from ``chardet.detect``. A missing
        file, an empty file or any error yields ``(None, 0.0)``.
    """
    undetected = (None, 0.0)
    try:
        target = Path(file_path)
        if not target.exists():
            return undetected

        # 1. A BOM, when present, is taken as authoritative.
        from_bom = detect_bom(str(target))
        if from_bom:
            return from_bom, 1.0

        # 2. Otherwise let chardet guess from the head of the file.
        with open(target, 'rb') as handle:
            sample = handle.read(read_bytes)
        if not sample:
            return undetected

        guess = chardet.detect(sample)
        return guess['encoding'], guess['confidence']
    except Exception:
        return undetected
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _read_text(path: Path, encoding: str, errors: str) -> str:
    """Read *path* in text mode with the given codec and error policy."""
    with open(path, 'r', encoding=encoding, errors=errors) as f:
        return f.read()


def read_file_safe(file_path: str) -> Tuple[str, str]:
    """Read a text file, choosing the encoding automatically.

    Strategies, in order:
        1. BOM detection (highest priority); a leading U+FEFF left behind by
           the endian-specific UTF-16/32 codecs is stripped.
        2. chardet detection, used when its confidence is above 0.7.
        3. The platform's common encodings, each tried with strict decoding
           so that a later candidate is used when an earlier one does not
           fit the bytes.
        4. Binary read decoded as UTF-8 with ``errors='replace'``.
        5. Last resort: UTF-8 with ``errors='ignore'`` (may drop characters).

    Args:
        file_path: Path of the file to read.

    Returns:
        ``(content, actual_encoding)`` - the decoded text and the codec name
        that was used.

    Raises:
        OSError: When the file cannot be opened at all (for example it does
            not exist); raised by the final read attempt.
    """
    path = Path(file_path)

    # Strategy 1: BOM.
    bom_encoding = detect_bom(str(path))
    if bom_encoding:
        try:
            content = _read_text(path, bom_encoding, 'replace')
        except Exception:
            pass
        else:
            # 'utf-16-le' and friends do not consume the BOM, so it would
            # otherwise surface as the first character of the content.
            if content.startswith('\ufeff'):
                content = content[1:]
            return content, bom_encoding

    # Strategy 2: chardet (preferred over the platform defaults).
    detected_encoding, confidence = detect_encoding(str(path))
    if detected_encoding and confidence > 0.7:
        try:
            return _read_text(path, detected_encoding, 'replace'), detected_encoding
        except Exception:
            pass

    # Strategy 3: platform candidates, decoded strictly. With
    # errors='replace' the first candidate could never fail, which made the
    # rest of the list unreachable.
    platform = get_platform()
    preferred_encodings = COMMON_ENCODINGS.get(platform, COMMON_ENCODINGS['default'])

    for encoding in preferred_encodings:
        try:
            return _read_text(path, encoding, 'strict'), encoding
        except (UnicodeError, LookupError):
            continue
        except OSError:
            # The file itself is unreadable; the remaining candidates cannot
            # help. Fall through so the final attempt raises.
            break

    # Strategy 4: binary read, forced UTF-8 with replacement characters.
    try:
        with open(path, 'rb') as f:
            raw = f.read()
        return raw.decode('utf-8', errors='replace'), 'utf-8'
    except Exception:
        pass

    # Strategy 5: last resort - ignore undecodable bytes. Errors opening the
    # file propagate from here.
    return _read_text(path, 'utf-8', 'ignore'), 'utf-8'
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def read_file_with_fallback(file_path: str, preferred_encodings: list = None) -> Tuple[str, str, bool]:
    """Read a file by trying several encodings with strict decoding.

    Args:
        file_path: Path of the file to read.
        preferred_encodings: Encodings to try, in order. Defaults to
            ``['utf-8', 'gbk', 'gb2312', 'big5', 'latin-1']``. ``latin-1``
            comes last because it decodes any byte sequence and would
            otherwise shadow every candidate after it.

    Returns:
        ``(content, encoding, success)``. ``success`` is True when one of the
        candidates decoded the file strictly; otherwise the content comes
        from ``read_file_safe`` and ``success`` is False.
    """
    if preferred_encodings is None:
        preferred_encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'latin-1']

    for encoding in preferred_encodings:
        try:
            with open(file_path, 'r', encoding=encoding, errors='strict') as f:
                content = f.read()
            return content, encoding, True
        except (UnicodeDecodeError, UnicodeError):
            continue
        except LookupError:
            # An unknown codec name only invalidates this candidate, not the
            # ones that follow it.
            continue
        except Exception:
            # Anything else (e.g. the file cannot be opened) will not be
            # fixed by another encoding.
            break

    # Every candidate failed: fall back to the tolerant reader.
    content, encoding = read_file_safe(file_path)
    return content, encoding, False
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# Command-line self-check: report the detected encoding of one file and
# preview its decoded content.
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法:python encoding_utils.py <文件路径>")
        print("示例:python encoding_utils.py samples/malicious/payload.bash")
        sys.exit(1)

    file_path = cli_args[0]
    print(f"检测文件:{file_path}")

    # Detection only (BOM first, then chardet).
    detected, confidence = detect_encoding(file_path)
    print(f"检测编码:{detected} (置信度:{confidence:.2f})")

    # Full tolerant read; the codec used may differ from the detected one.
    content, actual = read_file_safe(file_path)
    print(f"实际使用:{actual}")
    print(f"文件大小:{len(content)} 字符")
    print(f"前 200 字符:\n{content[:200]}")
|