jarvis-ai-assistant 0.3.30__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. jarvis/__init__.py +1 -1
  2. jarvis/jarvis_agent/__init__.py +289 -87
  3. jarvis/jarvis_agent/agent_manager.py +17 -8
  4. jarvis/jarvis_agent/edit_file_handler.py +374 -86
  5. jarvis/jarvis_agent/event_bus.py +1 -1
  6. jarvis/jarvis_agent/file_context_handler.py +79 -0
  7. jarvis/jarvis_agent/jarvis.py +601 -43
  8. jarvis/jarvis_agent/main.py +32 -2
  9. jarvis/jarvis_agent/rewrite_file_handler.py +141 -0
  10. jarvis/jarvis_agent/run_loop.py +38 -5
  11. jarvis/jarvis_agent/share_manager.py +8 -1
  12. jarvis/jarvis_agent/stdio_redirect.py +295 -0
  13. jarvis/jarvis_agent/task_analyzer.py +5 -2
  14. jarvis/jarvis_agent/task_planner.py +496 -0
  15. jarvis/jarvis_agent/utils.py +5 -1
  16. jarvis/jarvis_agent/web_bridge.py +189 -0
  17. jarvis/jarvis_agent/web_output_sink.py +53 -0
  18. jarvis/jarvis_agent/web_server.py +751 -0
  19. jarvis/jarvis_c2rust/__init__.py +26 -0
  20. jarvis/jarvis_c2rust/cli.py +613 -0
  21. jarvis/jarvis_c2rust/collector.py +258 -0
  22. jarvis/jarvis_c2rust/library_replacer.py +1122 -0
  23. jarvis/jarvis_c2rust/llm_module_agent.py +1300 -0
  24. jarvis/jarvis_c2rust/optimizer.py +960 -0
  25. jarvis/jarvis_c2rust/scanner.py +1681 -0
  26. jarvis/jarvis_c2rust/transpiler.py +2325 -0
  27. jarvis/jarvis_code_agent/build_validation_config.py +133 -0
  28. jarvis/jarvis_code_agent/code_agent.py +1171 -94
  29. jarvis/jarvis_code_agent/code_analyzer/__init__.py +62 -0
  30. jarvis/jarvis_code_agent/code_analyzer/base_language.py +74 -0
  31. jarvis/jarvis_code_agent/code_analyzer/build_validator/__init__.py +44 -0
  32. jarvis/jarvis_code_agent/code_analyzer/build_validator/base.py +102 -0
  33. jarvis/jarvis_code_agent/code_analyzer/build_validator/cmake.py +59 -0
  34. jarvis/jarvis_code_agent/code_analyzer/build_validator/detector.py +125 -0
  35. jarvis/jarvis_code_agent/code_analyzer/build_validator/fallback.py +69 -0
  36. jarvis/jarvis_code_agent/code_analyzer/build_validator/go.py +38 -0
  37. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_gradle.py +44 -0
  38. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_maven.py +38 -0
  39. jarvis/jarvis_code_agent/code_analyzer/build_validator/makefile.py +50 -0
  40. jarvis/jarvis_code_agent/code_analyzer/build_validator/nodejs.py +93 -0
  41. jarvis/jarvis_code_agent/code_analyzer/build_validator/python.py +129 -0
  42. jarvis/jarvis_code_agent/code_analyzer/build_validator/rust.py +54 -0
  43. jarvis/jarvis_code_agent/code_analyzer/build_validator/validator.py +154 -0
  44. jarvis/jarvis_code_agent/code_analyzer/build_validator.py +43 -0
  45. jarvis/jarvis_code_agent/code_analyzer/context_manager.py +363 -0
  46. jarvis/jarvis_code_agent/code_analyzer/context_recommender.py +18 -0
  47. jarvis/jarvis_code_agent/code_analyzer/dependency_analyzer.py +132 -0
  48. jarvis/jarvis_code_agent/code_analyzer/file_ignore.py +330 -0
  49. jarvis/jarvis_code_agent/code_analyzer/impact_analyzer.py +781 -0
  50. jarvis/jarvis_code_agent/code_analyzer/language_registry.py +185 -0
  51. jarvis/jarvis_code_agent/code_analyzer/language_support.py +89 -0
  52. jarvis/jarvis_code_agent/code_analyzer/languages/__init__.py +31 -0
  53. jarvis/jarvis_code_agent/code_analyzer/languages/c_cpp_language.py +231 -0
  54. jarvis/jarvis_code_agent/code_analyzer/languages/go_language.py +183 -0
  55. jarvis/jarvis_code_agent/code_analyzer/languages/python_language.py +219 -0
  56. jarvis/jarvis_code_agent/code_analyzer/languages/rust_language.py +209 -0
  57. jarvis/jarvis_code_agent/code_analyzer/llm_context_recommender.py +451 -0
  58. jarvis/jarvis_code_agent/code_analyzer/symbol_extractor.py +77 -0
  59. jarvis/jarvis_code_agent/code_analyzer/tree_sitter_extractor.py +48 -0
  60. jarvis/jarvis_code_agent/lint.py +270 -8
  61. jarvis/jarvis_code_agent/utils.py +142 -0
  62. jarvis/jarvis_code_analysis/code_review.py +483 -569
  63. jarvis/jarvis_data/config_schema.json +97 -8
  64. jarvis/jarvis_git_utils/git_commiter.py +38 -26
  65. jarvis/jarvis_mcp/sse_mcp_client.py +2 -2
  66. jarvis/jarvis_mcp/stdio_mcp_client.py +1 -1
  67. jarvis/jarvis_memory_organizer/memory_organizer.py +1 -1
  68. jarvis/jarvis_multi_agent/__init__.py +239 -25
  69. jarvis/jarvis_multi_agent/main.py +37 -1
  70. jarvis/jarvis_platform/base.py +103 -51
  71. jarvis/jarvis_platform/openai.py +26 -1
  72. jarvis/jarvis_platform/yuanbao.py +1 -1
  73. jarvis/jarvis_platform_manager/service.py +2 -2
  74. jarvis/jarvis_rag/cli.py +4 -4
  75. jarvis/jarvis_sec/__init__.py +3605 -0
  76. jarvis/jarvis_sec/checkers/__init__.py +32 -0
  77. jarvis/jarvis_sec/checkers/c_checker.py +2680 -0
  78. jarvis/jarvis_sec/checkers/rust_checker.py +1108 -0
  79. jarvis/jarvis_sec/cli.py +116 -0
  80. jarvis/jarvis_sec/report.py +257 -0
  81. jarvis/jarvis_sec/status.py +264 -0
  82. jarvis/jarvis_sec/types.py +20 -0
  83. jarvis/jarvis_sec/workflow.py +219 -0
  84. jarvis/jarvis_stats/cli.py +1 -1
  85. jarvis/jarvis_stats/stats.py +1 -1
  86. jarvis/jarvis_stats/visualizer.py +1 -1
  87. jarvis/jarvis_tools/cli/main.py +1 -0
  88. jarvis/jarvis_tools/execute_script.py +46 -9
  89. jarvis/jarvis_tools/generate_new_tool.py +3 -1
  90. jarvis/jarvis_tools/read_code.py +275 -12
  91. jarvis/jarvis_tools/read_symbols.py +141 -0
  92. jarvis/jarvis_tools/read_webpage.py +5 -3
  93. jarvis/jarvis_tools/registry.py +73 -35
  94. jarvis/jarvis_tools/search_web.py +15 -11
  95. jarvis/jarvis_tools/sub_agent.py +24 -42
  96. jarvis/jarvis_tools/sub_code_agent.py +14 -13
  97. jarvis/jarvis_tools/virtual_tty.py +1 -1
  98. jarvis/jarvis_utils/config.py +187 -35
  99. jarvis/jarvis_utils/embedding.py +3 -0
  100. jarvis/jarvis_utils/git_utils.py +181 -6
  101. jarvis/jarvis_utils/globals.py +3 -3
  102. jarvis/jarvis_utils/http.py +1 -1
  103. jarvis/jarvis_utils/input.py +78 -2
  104. jarvis/jarvis_utils/methodology.py +25 -19
  105. jarvis/jarvis_utils/utils.py +644 -359
  106. {jarvis_ai_assistant-0.3.30.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/METADATA +85 -1
  107. jarvis_ai_assistant-0.7.0.dist-info/RECORD +192 -0
  108. {jarvis_ai_assistant-0.3.30.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/entry_points.txt +4 -0
  109. jarvis/jarvis_agent/config.py +0 -92
  110. jarvis/jarvis_tools/edit_file.py +0 -179
  111. jarvis/jarvis_tools/rewrite_file.py +0 -191
  112. jarvis_ai_assistant-0.3.30.dist-info/RECORD +0 -137
  113. {jarvis_ai_assistant-0.3.30.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/WHEEL +0 -0
  114. {jarvis_ai_assistant-0.3.30.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/licenses/LICENSE +0 -0
  115. {jarvis_ai_assistant-0.3.30.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,2680 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Jarvis 安全分析套件 —— C/C++ 启发式安全检查器
4
+
5
+ 目标与范围:
6
+ - 聚焦内存管理、缓冲区操作、错误处理三类基础安全问题,提供可解释的启发式检测与置信度评估。
7
+ - 面向 C/C++ 与头文件(.c/.cpp/.h/.hpp)。
8
+
9
+ 输出约定:
10
+ - 返回 jarvis.jarvis_sec.workflow.Issue 列表(保持结构化,便于聚合评分与报告生成)。
11
+ - 置信度区间 [0,1],基于命中规则与上下文线索加权计算;严重性(severity)分为 high/medium/low。
12
+
13
+ 使用方式(示例):
14
+ - from jarvis.jarvis_sec.checkers.c_checker import analyze_files
15
+ - issues = analyze_files("./repo", ["src/a.c", "include/a.h"])
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import re
21
+ from pathlib import Path
22
+ from typing import Iterable, List, Optional, Sequence, Tuple
23
+
24
+ from jarvis.jarvis_sec.types import Issue
25
+
26
+
27
# ---------------------------
# Rule library (regular expressions)
# ---------------------------
# NOTE: almost every pattern below is compiled with re.IGNORECASE, so the
# identifiers are matched case-insensitively even though C/C++ identifiers
# are case-sensitive.

# Unbounded string APIs (captures the function name).
RE_UNSAFE_API = re.compile(
    r"\b(strcpy|strcat|gets|sprintf|vsprintf)\s*\(",
    re.IGNORECASE,
)
# Length-taking copy APIs whose size argument needs review (captures the name).
RE_BOUNDARY_FUNCS = re.compile(
    r"\b(memcpy|memmove|strncpy|strncat)\s*\(",
    re.IGNORECASE,
)
# Allocation / deallocation keywords and functions.
RE_MEM_MGMT = re.compile(
    r"\b(malloc|calloc|realloc|free|new\s+|delete\b)",
    re.IGNORECASE,
)
# stdio and POSIX-style I/O calls (captures the function name).
RE_IO_API = re.compile(
    r"\b(fopen|fclose|fread|fwrite|read|write|open|close)\s*\(",
    re.IGNORECASE,
)

# Added: format-string / insecure temp-file / command-execution API patterns
RE_PRINTF_LIKE = re.compile(r"\b(printf|sprintf|snprintf|vsprintf|vsnprintf)\s*\(", re.IGNORECASE)
RE_FPRINTF = re.compile(r"\bfprintf\s*\(", re.IGNORECASE)
RE_INSECURE_TMP = re.compile(r"\b(tmpnam|tempnam|mktemp)\s*\(", re.IGNORECASE)
RE_SYSTEM_LIKE = re.compile(r"\b(system|popen)\s*\(", re.IGNORECASE)
RE_EXEC_LIKE = re.compile(r"\b(execvp|execlp|execvpe|execl|execve|execv)\s*\(", re.IGNORECASE)
# scanf/fscanf/sscanf whose first argument is a string literal (captures its contents).
RE_SCANF_CALL = re.compile(r'\b(?:[fs]?scanf)\s*\(\s*"([^"]*)"', re.IGNORECASE)
# Thread / lock related (captures the mutex identifier passed as &name)
RE_PTHREAD_LOCK = re.compile(r"\bpthread_mutex_lock\s*\(\s*&\s*([A-Za-z_]\w*)\s*\)\s*;?", re.IGNORECASE)
RE_PTHREAD_UNLOCK = re.compile(r"\bpthread_mutex_unlock\s*\(\s*&\s*([A-Za-z_]\w*)\s*\)\s*;?", re.IGNORECASE)
# Other risky usages
RE_ATOI_FAMILY = re.compile(r"\b(atoi|atol|atoll|atof)\s*\(", re.IGNORECASE)
RE_RAND = re.compile(r"\b(rand|srand)\s*\(", re.IGNORECASE)
RE_STRTOK = re.compile(r"\bstrtok\s*\(", re.IGNORECASE)
# open(..., O_CREAT..., 0NNN): captures the octal mode literal.
RE_OPEN_PERMISSIVE = re.compile(r"\bopen\s*\(\s*[^,]+,\s*[^,]*O_CREAT[^,]*,\s*(0[0-7]{3,4})\s*\)", re.IGNORECASE)
# fopen(path, "mode"): captures the mode string.
RE_FOPEN_MODE = re.compile(r'\bfopen\s*\(\s*[^,]+,\s*"([^"]+)"\s*\)', re.IGNORECASE)
# "name =" (captures the identifier on the left of an '='); case-sensitive.
RE_GENERIC_ASSIGN = re.compile(r"\b([A-Za-z_]\w*)\s*=\s*")
# free(<anything up to the first ')'>): captures the argument text.
RE_FREE_CALL_ANY = re.compile(r"\bfree\s*\(\s*([^)]+?)\s*\)", re.IGNORECASE)
# Extension: more risky usages
RE_ALLOCA = re.compile(r"\balloca\s*\(\s*([^)]+)\s*\)", re.IGNORECASE)
# Array declaration "type name[expr];": captures the size expression.
RE_VLA_DECL = re.compile(
    r"\b(?:const\s+|volatile\s+|static\s+|register\s+|unsigned\s+|signed\s+)?[A-Za-z_]\w*(?:\s+\*|\s+)+[A-Za-z_]\w*\s*\[\s*([^\]]+)\s*\]\s*;",
    re.IGNORECASE,
)
# pthread calls whose return value is of interest (captures the function name).
RE_PTHREAD_RET = re.compile(
    r"\b(pthread_(?:mutex_(?:lock|trylock|timedlock)|create|cond_(?:wait|timedwait)|join|detach))\s*\(",
    re.IGNORECASE,
)
RE_PTHREAD_COND_WAIT = re.compile(r"\bpthread_cond_(?:timed)?wait\s*\(", re.IGNORECASE)
RE_PTHREAD_CREATE = re.compile(r"\bpthread_create\s*\(\s*&\s*([A-Za-z_]\w*)\s*,", re.IGNORECASE)
RE_PTHREAD_JOIN = re.compile(r"\bpthread_join\s*\(\s*([A-Za-z_]\w*)\s*,", re.IGNORECASE)
RE_PTHREAD_DETACH = re.compile(r"\bpthread_detach\s*\(\s*([A-Za-z_]\w*)\s*\)", re.IGNORECASE)
# C++ standard-library locking
RE_STD_MUTEX = re.compile(r"\b(?:std::)?mutex\s+([A-Za-z_]\w*)", re.IGNORECASE)
RE_MUTEX_LOCK = re.compile(r"\b([A-Za-z_]\w*)\s*\.lock\s*\(", re.IGNORECASE)
RE_MUTEX_UNLOCK = re.compile(r"\b([A-Za-z_]\w*)\s*\.unlock\s*\(", re.IGNORECASE)
RE_MUTEX_TRY_LOCK = re.compile(r"\b([A-Za-z_]\w*)\s*\.try_lock\s*\(", re.IGNORECASE)
RE_LOCK_GUARD = re.compile(r"\b(?:std::)?lock_guard\s*<[^>]+>\s*([A-Za-z_]\w*)", re.IGNORECASE)
RE_UNIQUE_LOCK = re.compile(r"\b(?:std::)?unique_lock\s*<[^>]+>\s*([A-Za-z_]\w*)", re.IGNORECASE)
RE_SHARED_LOCK = re.compile(r"\b(?:std::)?shared_lock\s*<[^>]+>\s*([A-Za-z_]\w*)", re.IGNORECASE)
RE_STD_LOCK = re.compile(r"\bstd::lock\s*\(", re.IGNORECASE)
RE_SCOPED_LOCK = re.compile(r"\b(?:std::)?scoped_lock\s*<", re.IGNORECASE)
# Data-race detection (each captures the declared/accessed identifier)
RE_STATIC_VAR = re.compile(r"\bstatic\s+(?:const\s+|volatile\s+)?[A-Za-z_]\w*(?:\s+\*|\s+)+([A-Za-z_]\w*)", re.IGNORECASE)
RE_EXTERN_VAR = re.compile(r"\bextern\s+[A-Za-z_]\w*(?:\s+\*|\s+)+([A-Za-z_]\w*)", re.IGNORECASE)
RE_STD_THREAD = re.compile(r"\b(?:std::)?thread\s+([A-Za-z_]\w*)", re.IGNORECASE)
RE_ATOMIC = re.compile(r"\b(?:std::)?atomic\s*<[^>]+>\s*([A-Za-z_]\w*)", re.IGNORECASE)
RE_VOLATILE = re.compile(r"\bvolatile\s+[A-Za-z_]\w*(?:\s+\*|\s+)+([A-Za-z_]\w*)", re.IGNORECASE)
RE_VAR_ACCESS = re.compile(r"\b([A-Za-z_]\w*)\s*(?:=|\[|->|\.)", re.IGNORECASE)
RE_VAR_ASSIGN = re.compile(r"\b([A-Za-z_]\w*)\s*=", re.IGNORECASE)
RE_INET_LEGACY = re.compile(r"\b(inet_addr|inet_aton)\s*\(", re.IGNORECASE)
RE_TIME_UNSAFE = re.compile(r"\b(asctime|ctime|localtime|gmtime)\s*\(", re.IGNORECASE)
# getenv("LITERAL") with a string-literal argument.
RE_GETENV = re.compile(r'\bgetenv\s*\(\s*"[^"]*"\s*\)', re.IGNORECASE)

# Helper patterns
# "p = realloc(p, ...": the backreference \1 requires the same identifier on both sides.
RE_REALLOC_ASSIGN_BACK = re.compile(
    r"\b([A-Za-z_]\w*)\s*=\s*realloc\s*\(\s*\1\s*,", re.IGNORECASE
)
RE_MALLOC_ASSIGN = re.compile(
    r"\b([A-Za-z_]\w*)\s*=\s*malloc\s*\(", re.IGNORECASE
)
RE_CALLOC_ASSIGN = re.compile(
    r"\b([A-Za-z_]\w*)\s*=\s*calloc\s*\(", re.IGNORECASE
)
RE_NEW_ASSIGN = re.compile(
    r"\b([A-Za-z_]\w*)\s*=\s*new\b", re.IGNORECASE
)
# "*name", "->name" or "name[".
RE_DEREF = re.compile(
    r"(\*|->)\s*[A-Za-z_]\w*|\b[A-Za-z_]\w*\s*\[", re.IGNORECASE
)
# "if (x == NULL)", "if (x != NULL)" (optionally negated) or a bare "if (x)".
RE_NULL_CHECK = re.compile(
    r"\bif\s*\(\s*(!\s*)?[A-Za-z_]\w*\s*(==|!=)\s*NULL\s*\)|\bif\s*\(\s*[A-Za-z_]\w*\s*\)", re.IGNORECASE
)
# "free(name);": note there is no leading \b, so any identifier ending in "free" also matches.
RE_FREE_VAR = re.compile(r"free\s*\(\s*([A-Za-z_]\w*)\s*\)\s*;", re.IGNORECASE)
RE_USE_VAR = re.compile(r"\b([A-Za-z_]\w*)\b")
RE_STRLEN_IN_SIZE = re.compile(r"\bstrlen\s*\(", re.IGNORECASE)
# "sizeof(*name)".
RE_SIZEOF_PTR = re.compile(r"\bsizeof\s*\(\s*\*\s*[A-Za-z_]\w*\s*\)", re.IGNORECASE)
RE_STRNCPY = re.compile(r"\bstrncpy\s*\(", re.IGNORECASE)
RE_STRNCAT = re.compile(r"\bstrncat\s*\(", re.IGNORECASE)

# C++-specific patterns
RE_SHARED_PTR = re.compile(r"\b(?:std::)?shared_ptr\s*<", re.IGNORECASE)
RE_UNIQUE_PTR = re.compile(r"\b(?:std::)?unique_ptr\s*<", re.IGNORECASE)
RE_WEAK_PTR = re.compile(r"\b(?:std::)?weak_ptr\s*<", re.IGNORECASE)
RE_SMART_PTR_ASSIGN = re.compile(r"\b([A-Za-z_]\w*)\s*=\s*(?:std::)?(?:shared_ptr|unique_ptr|weak_ptr)\s*<", re.IGNORECASE)
RE_NEW_ARRAY = re.compile(r"\bnew\s+[A-Za-z_]\w*\s*\[", re.IGNORECASE)
RE_DELETE_ARRAY = re.compile(r"\bdelete\s*\[\s*\]", re.IGNORECASE)
# "delete " not immediately followed by '[' (i.e. scalar delete).
RE_DELETE = re.compile(r"\bdelete\s+(?!\[)", re.IGNORECASE)
RE_STATIC_CAST = re.compile(r"\bstatic_cast\s*<", re.IGNORECASE)
RE_DYNAMIC_CAST = re.compile(r"\bdynamic_cast\s*<", re.IGNORECASE)
RE_REINTERPRET_CAST = re.compile(r"\breinterpret_cast\s*<", re.IGNORECASE)
RE_CONST_CAST = re.compile(r"\bconst_cast\s*<", re.IGNORECASE)
RE_VECTOR_ACCESS = re.compile(r"\b(?:std::)?vector\s*<[^>]+>\s*[A-Za-z_]\w*\s*\[", re.IGNORECASE)
RE_STRING_ACCESS = re.compile(r"\b(?:std::)?(?:string|wstring)\s*[A-Za-z_]\w*\s*\[", re.IGNORECASE)
RE_VECTOR_VAR = re.compile(r"\b(?:std::)?vector\s*<[^>]+>\s*([A-Za-z_]\w*)", re.IGNORECASE)
RE_STRING_VAR = re.compile(r"\b(?:std::)?(?:string|wstring)\s+([A-Za-z_]\w*)", re.IGNORECASE)
RE_AT_METHOD = re.compile(r"\.at\s*\(", re.IGNORECASE)
RE_VIRTUAL_DTOR = re.compile(r"\bvirtual\s+~[A-Za-z_]\w*\s*\(", re.IGNORECASE)
RE_CLASS_DECL = re.compile(r"\bclass\s+([A-Za-z_]\w*)", re.IGNORECASE)
RE_DTOR_DECL = re.compile(r"~\s*([A-Za-z_]\w*)\s*\(", re.IGNORECASE)
RE_MOVE = re.compile(r"\bstd::move\s*\(", re.IGNORECASE)
RE_MOVE_ASSIGN = re.compile(r"\b([A-Za-z_]\w*)\s*=\s*std::move\s*\(", re.IGNORECASE)
RE_THROW = re.compile(r"\bthrow\s+", re.IGNORECASE)
RE_TRY = re.compile(r"\btry\s*\{", re.IGNORECASE)
RE_CATCH = re.compile(r"\bcatch\s*\(", re.IGNORECASE)
RE_NOEXCEPT = re.compile(r"\bnoexcept\s*(?:\([^)]*\))?", re.IGNORECASE)
154
+
155
+
156
+ # ---------------------------
157
+ # 公共工具
158
+ # ---------------------------
159
+
160
+ def _safe_line(lines: Sequence[str], idx: int) -> str:
161
+ if 1 <= idx <= len(lines):
162
+ return lines[idx - 1]
163
+ return ""
164
+
165
+
166
+ def _strip_line(s: str, max_len: int = 200) -> str:
167
+ s = s.strip().replace("\t", " ")
168
+ return s if len(s) <= max_len else s[: max_len - 3] + "..."
169
+
170
+
171
+ def _window(lines: Sequence[str], center: int, before: int = 3, after: int = 3) -> List[Tuple[int, str]]:
172
+ start = max(1, center - before)
173
+ end = min(len(lines), center + after)
174
+ return [(i, _safe_line(lines, i)) for i in range(start, end + 1)]
175
+
176
+
177
+ def _remove_comments_preserve_strings(text: str) -> str:
178
+ """
179
+ 移除 C/C++ 源码中的注释(// 与 /* */),保留字符串与字符字面量内容;
180
+ 为了保持行号与窗口定位稳定,注释内容会被空格替换并保留换行符。
181
+ 说明:本函数为启发式实现,旨在降低“注释中的API命中”造成的误报。
182
+ """
183
+ res: list[str] = []
184
+ i = 0
185
+ n = len(text)
186
+ in_sl_comment = False # //
187
+ in_bl_comment = False # /* */
188
+ in_string = False # "
189
+ in_char = False # '
190
+ escape = False
191
+
192
+ while i < n:
193
+ ch = text[i]
194
+ nxt = text[i + 1] if i + 1 < n else ""
195
+
196
+ if in_sl_comment:
197
+ # 单行注释直到换行结束
198
+ if ch == "\n":
199
+ in_sl_comment = False
200
+ res.append(ch)
201
+ else:
202
+ # 用空格占位,保持列数
203
+ res.append(" ")
204
+ i += 1
205
+ continue
206
+
207
+ if in_bl_comment:
208
+ # 多行注释直到 */
209
+ if ch == "*" and nxt == "/":
210
+ in_bl_comment = False
211
+ res.append(" ")
212
+ res.append(" ")
213
+ i += 2
214
+ else:
215
+ # 注释体内保留换行,其余替换为空格
216
+ res.append("\n" if ch == "\n" else " ")
217
+ i += 1
218
+ continue
219
+
220
+ # 非注释态下,处理字符串与字符字面量
221
+ if in_string:
222
+ res.append(ch)
223
+ if escape:
224
+ escape = False
225
+ elif ch == "\\":
226
+ escape = True
227
+ elif ch == '"':
228
+ in_string = False
229
+ i += 1
230
+ continue
231
+
232
+ if in_char:
233
+ res.append(ch)
234
+ if escape:
235
+ escape = False
236
+ elif ch == "\\":
237
+ escape = True
238
+ elif ch == "'":
239
+ in_char = False
240
+ i += 1
241
+ continue
242
+
243
+ # 进入注释判定(需不在字符串/字符字面量中)
244
+ if ch == "/" and nxt == "/":
245
+ in_sl_comment = True
246
+ # 保留两个占位,避免拼接
247
+ res.append(" ")
248
+ res.append(" ")
249
+ i += 2
250
+ continue
251
+ if ch == "/" and nxt == "*":
252
+ in_bl_comment = True
253
+ res.append(" ")
254
+ res.append(" ")
255
+ i += 2
256
+ continue
257
+
258
+ # 进入字符串/字符字面量
259
+ if ch == '"':
260
+ in_string = True
261
+ res.append(ch)
262
+ i += 1
263
+ continue
264
+ if ch == "'":
265
+ in_char = True
266
+ res.append(ch)
267
+ i += 1
268
+ continue
269
+
270
+ # 普通字符
271
+ res.append(ch)
272
+ i += 1
273
+
274
+ return "".join(res)
275
+
276
+
277
+ def _mask_strings_preserve_len(text: str) -> str:
278
+ """
279
+ 将字符串与字符字面量内部内容替换为空格,保留引号与换行,保持长度与行号不变。
280
+ 用于在扫描通用 API 模式时避免误将字符串中的片段(如 "system(")当作代码。
281
+ 注意:此函数不移除注释,请在已移除注释的文本上调用。
282
+ """
283
+ res: list[str] = []
284
+ in_string = False
285
+ in_char = False
286
+ escape = False
287
+ for ch in text:
288
+ if in_string:
289
+ if escape:
290
+ # 保留转义反斜杠为两字符(反斜杠+空格),以不破坏列对齐过多
291
+ res.append(" ")
292
+ escape = False
293
+ elif ch == "\\":
294
+ res.append("\\")
295
+ escape = True
296
+ elif ch == '"':
297
+ res.append('"')
298
+ in_string = False
299
+ elif ch == "\n":
300
+ res.append("\n")
301
+ else:
302
+ res.append(" ")
303
+ continue
304
+ if in_char:
305
+ if escape:
306
+ res.append(" ")
307
+ escape = False
308
+ elif ch == "\\":
309
+ res.append("\\")
310
+ escape = True
311
+ elif ch == "'":
312
+ res.append("'")
313
+ in_char = False
314
+ elif ch == "\n":
315
+ res.append("\n")
316
+ else:
317
+ res.append(" ")
318
+ continue
319
+ if ch == '"':
320
+ in_string = True
321
+ res.append('"')
322
+ continue
323
+ if ch == "'":
324
+ in_char = True
325
+ res.append("'")
326
+ continue
327
+ res.append(ch)
328
+ return "".join(res)
329
+
330
+
331
+ def _strip_if0_blocks(text: str) -> str:
332
+ """
333
+ 预处理常见的 #if 0 … #else … #endif 结构:
334
+ - 跳过 #if 0 的主体;若存在 #else,则保留 #else 分支
335
+ - 保留行数与换行,确保行号稳定
336
+ 限制:仅识别常量 0 的条件,不对复杂表达式求值;#elif 未处理
337
+ """
338
+ lines = text.splitlines(keepends=True)
339
+ out: list[str] = []
340
+ stack: list[dict] = [] # 每帧:{"kind": "if0"|"if", "skipping": bool, "in_else": bool}
341
+
342
+ def any_skipping() -> bool:
343
+ return any(frame.get("skipping", False) for frame in stack)
344
+
345
+ for line in lines:
346
+ if re.match(r"^\s*#\s*if\s+0\b", line):
347
+ # 进入 #if 0:主体跳过
348
+ stack.append({"kind": "if0", "skipping": True, "in_else": False})
349
+ out.append("\n" if line.endswith("\n") else "")
350
+ continue
351
+ if re.match(r"^\s*#\s*if\b", line):
352
+ # 其他 #if:不求值,仅记录,继承外层 skipping
353
+ stack.append({"kind": "if", "skipping": any_skipping(), "in_else": False})
354
+ out.append(line if not any_skipping() else ("\n" if line.endswith("\n") else ""))
355
+ continue
356
+ if re.match(r"^\s*#\s*else\b", line):
357
+ if stack:
358
+ top = stack[-1]
359
+ if top["kind"] == "if0":
360
+ # #if 0 的 else:翻转 skipping,使 else 分支有效
361
+ top["skipping"] = not top["skipping"]
362
+ top["in_else"] = True
363
+ out.append(line if not any_skipping() else ("\n" if line.endswith("\n") else ""))
364
+ continue
365
+ if re.match(r"^\s*#\s*endif\b", line):
366
+ if stack:
367
+ stack.pop()
368
+ out.append(line if not any_skipping() else ("\n" if line.endswith("\n") else ""))
369
+ continue
370
+ # 常规代码
371
+ if any_skipping():
372
+ out.append("\n" if line.endswith("\n") else "")
373
+ else:
374
+ out.append(line)
375
+ return "".join(out)
376
+
377
+
378
def _has_null_check_around(var: str, lines: Sequence[str], line_no: int, radius: int = 5) -> bool:
    """Report whether ``var`` appears to be null-checked near ``line_no``.

    Lines within ``radius`` of ``line_no`` (on both sides) are searched for
    any of these forms, to reduce false positives:

    - ``if (ptr)`` / ``if (!ptr)``
    - ``if (ptr == NULL)`` / ``if (NULL != ptr)``, with ``NULL``, ``nullptr``
      or ``0`` as the null constant, in either order
    - assertion/check macros: ``assert``, ``BUG_ON``, ``WARN_ON``, ``CHECK``,
      ``ENSURE`` applied to ``ptr``, ``!ptr`` or a null comparison of ``ptr``

    Args:
        var: Identifier to look for (matched literally).
        lines: Source lines.
        line_no: 1-based line number at the centre of the search window.
        radius: Number of lines searched before and after ``line_no``.

    Returns:
        True if at least one line in the window matches one of the forms.
    """
    name = re.escape(var)
    # C++ code compares against nullptr as well as NULL/0.
    null = r"(NULL|nullptr|0)"
    patterns = (
        # Plain truthiness test, optionally negated.
        re.compile(rf"\bif\s*\(\s*(!\s*)?{name}\s*\)"),
        # Explicit comparison against a null constant, either order.
        re.compile(rf"\bif\s*\(\s*{name}\s*(==|!=)\s*{null}\s*\)"),
        re.compile(rf"\bif\s*\(\s*{null}\s*(==|!=)\s*{name}\s*\)"),
        # Common assertion / check macros.
        re.compile(
            rf"\b(assert|BUG_ON|WARN_ON|CHECK|ENSURE)\s*\(\s*(!\s*)?{name}(\s*(==|!=)\s*{null})?\s*\)"
        ),
    )
    for _, s in _window(lines, line_no, before=radius, after=radius):
        if any(p.search(s) for p in patterns):
            return True
    return False
403
+
404
+
405
def _has_len_bound_around(lines: Sequence[str], line_no: int, radius: int = 3) -> bool:
    """Report whether a length bound seems to be present near ``line_no``.

    This is a very rough substring test: it returns True as soon as any line
    within ``radius`` of ``line_no`` contains one of the marker tokens below.
    """
    markers = ("sizeof(", "BUFFER_SIZE", "MAX_", "min(", "clamp(", "snprintf", "strlcpy", "strlcat")
    for _, text in _window(lines, line_no, before=radius, after=radius):
        for marker in markers:
            if marker in text:
                return True
    return False
411
+
412
+
413
+ def _severity_from_confidence(conf: float, base: str) -> str:
414
+ # 基于基类目提供缺省严重度调整
415
+ if conf >= 0.8:
416
+ return "high"
417
+ if conf >= 0.6:
418
+ return "medium"
419
+ return "low"
420
+
421
+
422
+ # ---------------------------
423
+ # 具体验证规则
424
+ # ---------------------------
425
+
426
def _rule_unsafe_api(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Flag calls to the unbounded string APIs matched by ``RE_UNSAFE_API``.

    Preprocessor lines and lines containing ``typedef``/``extern`` are
    ignored. In ``.h``/``.hpp`` files a line that looks like a prototype (a
    type keyword before the name and a trailing ``);``) is ignored as well.
    The base confidence is 0.85, raised by 0.05 when no length-bound marker is
    found within two lines of the hit.
    """
    findings: List[Issue] = []
    header_file = str(relpath).lower().endswith((".h", ".hpp"))
    type_keyword = re.compile(r"\b(static|inline|const|volatile|unsigned|signed|long|short|int|char|void|size_t|ssize_t)\b")

    for line_no, text in enumerate(lines, start=1):
        # Skip preprocessor and declaration lines to avoid hits in prototypes/macros.
        if text.lstrip().startswith("#") or re.search(r"\b(typedef|extern)\b", text):
            continue
        hit = RE_UNSAFE_API.search(text)
        if hit is None:
            continue
        # Header prototype of the form "<type> name(...);" is not a call.
        if header_file and text.strip().endswith(");") and type_keyword.search(text[: hit.start()]):
            continue

        score = 0.85
        if not _has_len_bound_around(lines, line_no, radius=2):
            score += 0.05
        findings.append(
            Issue(
                language="c/cpp",
                category="unsafe_api",
                pattern=hit.group(1),
                file=relpath,
                line=line_no,
                evidence=_strip_line(text),
                description="使用不安全/高风险字符串API,可能导致缓冲区溢出或格式化风险。",
                suggestion="替换为带边界的安全API(如 snprintf/strlcpy 等)或加入显式长度检查。",
                confidence=min(score, 0.95),
                severity=_severity_from_confidence(score, "unsafe_api"),
            )
        )
    return findings
463
+
464
+
465
def _rule_boundary_funcs(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Flag ``memcpy``/``memmove``/``strncpy``/``strncat`` calls for length review.

    Preprocessor lines and lines containing ``typedef``/``extern`` are
    ignored. A ``memcpy``/``memmove`` whose arguments use ``sizeof`` (but not
    ``sizeof(*ptr)`` and not ``strlen``) is treated as the safer idiom and is
    not reported. The base confidence is 0.65; it is raised by 0.15 when the
    line uses ``strlen`` or ``sizeof(*ptr)``, and by 0.1 when no length-bound
    marker is found within two lines.
    """
    findings: List[Issue] = []
    for line_no, text in enumerate(lines, start=1):
        if text.lstrip().startswith("#") or re.search(r"\b(typedef|extern)\b", text):
            continue
        hit = RE_BOUNDARY_FUNCS.search(text)
        if hit is None:
            continue
        func = hit.group(1)

        # Heuristic argument text: from the "(" that ends the match up to the
        # last ")" on the line.
        open_paren = hit.end() - 1
        close_paren = text.rfind(")")
        call_args = text[open_paren + 1 : close_paren] if close_paren > open_paren else ""

        if (
            func.lower() in ("memcpy", "memmove")
            and call_args
            and "sizeof" in call_args
            and not RE_SIZEOF_PTR.search(call_args)
            and not RE_STRLEN_IN_SIZE.search(call_args)
        ):
            # Size derived from sizeof(object): do not report.
            continue

        score = 0.65
        # strlen / sizeof(*ptr) as a size source is less reliable.
        if RE_STRLEN_IN_SIZE.search(text) or RE_SIZEOF_PTR.search(text):
            score += 0.15
        # No bound check nearby raises the score further.
        if not _has_len_bound_around(lines, line_no, radius=2):
            score += 0.1
        findings.append(
            Issue(
                language="c/cpp",
                category="buffer_overflow",
                pattern=func,
                file=relpath,
                line=line_no,
                evidence=_strip_line(text),
                description="缓冲区操作涉及长度/边界,需确认长度来源是否可靠,避免越界。",
                suggestion="核对目标缓冲区大小与拷贝长度;对外部输入进行校验;优先使用安全封装。",
                confidence=min(score, 0.95),
                severity=_severity_from_confidence(score, "buffer_overflow"),
            )
        )
    return findings
518
+
519
+
520
def _rule_realloc_assign_back(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Flag ``p = realloc(p, ...)``, which loses the original block on failure.

    The base confidence is 0.8, raised by 0.1 when no null check of the
    pointer is found within three lines of the call.
    """
    findings: List[Issue] = []
    for line_no, text in enumerate(lines, start=1):
        hit = RE_REALLOC_ASSIGN_BACK.search(text)
        if hit is None:
            continue
        pointer = hit.group(1)
        checked = _has_null_check_around(pointer, lines, line_no, radius=3)
        score = 0.8
        if not checked:
            score += 0.1
        findings.append(
            Issue(
                language="c/cpp",
                category="memory_mgmt",
                pattern="realloc_overwrite",
                file=relpath,
                line=line_no,
                evidence=_strip_line(text),
                description=f"realloc 直接覆盖原指针 {pointer},若失败将导致原内存泄漏。",
                suggestion="使用临时指针接收 realloc 返回值,判空成功后再赋值回原指针。",
                confidence=min(score, 0.95),
                severity=_severity_from_confidence(score, "memory_mgmt"),
            )
        )
    return findings
546
+
547
+
548
def _rule_malloc_no_null_check(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Flag ``x = malloc(...)`` / ``calloc(...)`` / ``new`` assignments.

    Every matching assignment yields an issue. The base confidence is 0.55.
    When no null check of the variable is found within four lines, it is
    raised by 0.25 if the variable is used via ``->``, ``[`` or ``(`` in the
    following six lines, and by 0.1 otherwise.
    """
    findings: List[Issue] = []
    for line_no, text in enumerate(lines, start=1):
        for alloc_re in (RE_MALLOC_ASSIGN, RE_CALLOC_ASSIGN, RE_NEW_ASSIGN):
            hit = alloc_re.search(text)
            if hit is None:
                continue
            target = hit.group(1)
            checked = _has_null_check_around(target, lines, line_no, radius=4)

            # Rough "is it used soon" test over the next six lines only.
            use_re = re.compile(rf"\b{re.escape(target)}\b(\s*(->|\[|\())")
            used_later = any(use_re.search(later) for later in lines[line_no : line_no + 6])

            score = 0.55
            if not checked:
                score += 0.25 if used_later else 0.1
            findings.append(
                Issue(
                    language="c/cpp",
                    category="memory_mgmt",
                    pattern="alloc_no_null_check",
                    file=relpath,
                    line=line_no,
                    evidence=_strip_line(text),
                    description=f"内存/对象分配给 {target} 后可能未检查是否成功(NULL 检查缺失)。",
                    suggestion="在使用前检查分配结果是否为 NULL,并在错误路径上释放已获取的资源。",
                    confidence=min(score, 0.9),
                    severity=_severity_from_confidence(score, "memory_mgmt"),
                )
            )
    return findings
586
+
587
+
588
def _rule_uaf_suspect(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Heuristic use-after-free detection.

    For every ``free(var);`` the following 50 lines are scanned:

    - if ``var`` is set to ``NULL``/``nullptr``/``0`` on the ``free`` line or
      within the next three lines, the call is ignored;
    - if ``var`` is assigned again before any dereference, the call is
      ignored (the pointer's lifetime is considered reset);
    - otherwise the first obvious dereference (``var->``, ``*var``,
      ``var[``) produces one issue, reported on the ``free`` line.

    Comparisons such as ``var == other`` are not treated as assignments.
    The result is only a hint and must be confirmed against the surrounding
    code.
    """
    issues: List[Issue] = []

    # Collect every free(var) call as (variable, 1-based line number).
    free_calls: List[Tuple[str, int]] = []
    for idx, s in enumerate(lines, start=1):
        for m in re.finditer(r"free\s*\(\s*([A-Za-z_]\w*)\s*\)\s*;", s):
            free_calls.append((m.group(1), idx))

    total = len(lines)
    for var, free_ln in free_calls:
        name = re.escape(var)

        # Pointer nulled on the free line or right after it: nothing to report.
        null_assign = re.compile(rf"\b{name}\s*=\s*(NULL|nullptr|0)\s*;")
        if any(
            null_assign.search(_safe_line(lines, j))
            for j in range(free_ln, min(total, free_ln + 3) + 1)
        ):
            continue

        deref_arrow = re.compile(rf"\b{name}\s*->")
        deref_star = re.compile(rf"(?<!\w)\*\s*{name}\b")
        deref_index = re.compile(rf"\b{name}\s*\[")
        # "(?!=)" keeps "var == x" from being mistaken for a reassignment,
        # which would end the scan and hide a later dereference.
        assign_pat = re.compile(rf"\b{name}\s*=(?!=)")

        for j in range(free_ln + 1, min(total, free_ln + 50) + 1):
            sj = _safe_line(lines, j)
            # A reassignment (including nulling) resets the lifetime.
            if assign_pat.search(sj):
                break
            if deref_arrow.search(sj) or deref_star.search(sj) or deref_index.search(sj):
                # The free line is used as the evidence point.
                issues.append(
                    Issue(
                        language="c/cpp",
                        category="memory_mgmt",
                        pattern="use_after_free_suspect",
                        file=relpath,
                        line=free_ln,
                        evidence=_strip_line(_safe_line(lines, free_ln)),
                        description=f"变量 {var} 在 free 后的邻近窗口内出现了解引用使用(UAF 线索),且未检测到重新赋值/置空。",
                        suggestion="free 后应将指针置为 NULL,并避免在重新赋值前进行任何解引用;建议引入生命周期管理与动态/静态检测。",
                        confidence=0.65,
                        severity="high",
                    )
                )
                break
    return issues
655
+
656
+
657
def _rule_unchecked_io(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Flag I/O calls (``RE_IO_API``) whose result does not appear to be checked.

    A call is considered checked, and skipped, when:

    - the call line or one of the next two lines contains ``if``/``while``/
      ``for (`` or any comparison operator; or
    - the result is assigned to a variable and that variable appears in a
      condition or comparison within the next five lines.

    Preprocessor lines and lines containing ``typedef``/``extern`` are ignored.
    """
    findings: List[Issue] = []
    for line_no, text in enumerate(lines, start=1):
        # Exclude preprocessor lines and declarations.
        if text.lstrip().startswith("#") or re.search(r"\b(typedef|extern)\b", text):
            continue
        hit = RE_IO_API.search(text)
        if hit is None:
            continue

        # The call line plus the two lines after it, joined for a quick scan.
        following = " ".join(lines[line_no - 1 : line_no + 2])
        if re.search(r"\b(if|while|for)\s*\(", following) or re.search(r"(>=|<=|==|!=|<|>)", following):
            continue

        # Closest "name =" to the left of the call, if any.
        prior_assigns = RE_GENERIC_ASSIGN.findall(text[: hit.start()])
        target: Optional[str] = prior_assigns[-1] if prior_assigns else None

        if target:
            cond_re = re.compile(rf"\b(if|while|for)\s*\([^)]*\b{re.escape(target)}\b[^)]*\)")
            cmp_re = re.compile(rf"\b{re.escape(target)}\b\s*(>=|<=|==|!=|<|>)")
            if any(cond_re.search(later) or cmp_re.search(later) for later in lines[line_no : line_no + 5]):
                continue

        # No check found: report as a possibly unchecked call.
        score = 0.65
        findings.append(
            Issue(
                language="c/cpp",
                category="error_handling",
                pattern="io_call",
                file=relpath,
                line=line_no,
                evidence=_strip_line(text),
                description="I/O/系统调用可能未检查返回值,存在错误处理缺失风险。",
                suggestion="检查返回值/errno;在错误路径上释放资源(句柄/内存/锁)。",
                confidence=min(score, 0.75),
                severity=_severity_from_confidence(score, "error_handling"),
            )
        )
    return findings
714
+
715
+
716
def _rule_strncpy_no_nullterm(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report strncpy/strncat calls that may leave the destination unterminated.

    Every line matching ``RE_STRNCPY`` or ``RE_STRNCAT`` is reported.  The
    confidence is raised when the surrounding window (one line before, two
    after) shows no sign of manual termination: no ``\\0`` escape, no
    ``len - 1`` and no ``sizeof(x) - 1`` expression.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per matching line, in file order.
    """
    # A literal backslash-zero covers '\0', "\0" and bare \0 alike.  The
    # previous pattern also carried a raw-string alternative '\0', which the
    # regex engine reads as an octal escape for a NUL byte rather than the C
    # character literal, so it never matched C source text.
    termination_hint = re.compile(
        r"\\0|len\s*-\s*1|sizeof\s*\(\s*\w+\s*\)\s*-\s*1"
    )
    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        if not (RE_STRNCPY.search(s) or RE_STRNCAT.search(s)):
            continue
        conf = 0.55
        window_text = " ".join(t for _, t in _window(lines, idx, before=1, after=2))
        if not termination_hint.search(window_text):
            conf += 0.15
        issues.append(
            Issue(
                language="c/cpp",
                category="buffer_overflow",
                pattern="strncpy/strncat",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="使用 strncpy/strncat 可能未自动添加 NUL 终止,导致潜在字符串未终止风险。",
                suggestion="确保目标缓冲区以 '\\0' 终止(例如手动结尾或采用更安全 API)。",
                confidence=min(conf, 0.75),
                severity=_severity_from_confidence(conf, "buffer_overflow"),
            )
        )
    return issues
741
+
742
+
743
+ # ---------------------------
744
+ # 对外主入口
745
+ # ---------------------------
746
+
747
+ # ---------------------------
748
+ # 额外规则(新增)
749
+ # ---------------------------
750
+
751
def _rule_format_string(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report printf-family calls whose format argument is not a literal.

    The format argument position depends on the function: printf uses the
    1st argument, sprintf/vsprintf the 2nd, snprintf/vsnprintf the 3rd and
    fprintf the 2nd.  A call is considered safe when that argument

    * starts with a double quote (string literal),
    * is one of the wrappers in ``SAFE_WRAPPERS`` applied directly to a
      string literal, e.g. ``_("...")`` or ``gettext("...")``, or
    * is an identifier that was assigned a string literal within the five
      preceding lines.

    Argument positions are located on a single line only; when the wanted
    argument cannot be found on the line the call is reported.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per flagged line, in file order.
    """
    SAFE_WRAPPERS = ("_", "gettext", "dgettext", "ngettext", "tr", "QT_TR_NOOP", "QT_TRANSLATE_NOOP")
    # Position (1-based) of the format string for each printf-like function.
    fmt_arg_map = {
        "printf": 1,
        "sprintf": 2,
        "vsprintf": 2,
        "snprintf": 3,
        "vsnprintf": 3,
    }
    issues: List[Issue] = []

    def _skip_ws(s: str, pos: int) -> int:
        """Return the index of the first non-whitespace character at/after pos."""
        while pos < len(s) and s[pos].isspace():
            pos += 1
        return pos

    def _is_word_char(ch: str) -> bool:
        return ch.isalnum() or ch == "_"

    def _arg_is_literal(s: str, j: int) -> bool:
        """True when the text at j (after optional whitespace) opens a string literal."""
        j = _skip_ws(s, j)
        return j < len(s) and s[j] == '"'

    def _arg_is_wrapper_literal(s: str, j: int) -> bool:
        """True for ``WRAPPER("...`` where WRAPPER is listed in SAFE_WRAPPERS."""
        name_end = j
        while name_end < len(s) and _is_word_char(s[name_end]):
            name_end += 1
        paren = _skip_ws(s, name_end)
        if s[j:name_end] not in SAFE_WRAPPERS:
            return False
        if paren >= len(s) or s[paren] != "(":
            return False
        first = _skip_ws(s, paren + 1)
        return first < len(s) and s[first] == '"'

    def _leading_ident(s: str, j: int) -> Optional[str]:
        """Return the identifier starting exactly at j, or None."""
        if j >= len(s) or not (s[j].isalpha() or s[j] == "_"):
            return None
        end = j
        while end < len(s) and _is_word_char(s[end]):
            end += 1
        return s[j:end]

    def _var_assigned_literal(var: str, lines: Sequence[str], upto_idx: int, lookback: int = 5) -> bool:
        """True when ``var = "..."`` appears in the ``lookback`` lines before upto_idx."""
        assign_re = re.compile(rf"\b{re.escape(var)}\s*=\s*")
        for ln in range(max(1, upto_idx - lookback), upto_idx):
            earlier = _safe_line(lines, ln)
            hit = assign_re.search(earlier)
            if hit and _arg_is_literal(earlier, hit.end()):
                return True
        return False

    def _nth_arg_start(s: str, open_paren_idx: int, n: int) -> Optional[int]:
        """Return the start index of the n-th (1-based) argument, or None.

        Only the current line is inspected; nesting is tracked through
        parentheses and top-level commas separate arguments.
        """
        arg_starts: List[int] = []
        current: Optional[int] = None
        nesting = 0
        for pos in range(open_paren_idx + 1, len(s)):
            ch = s[pos]
            if ch == "(":
                nesting += 1
            elif ch == ")":
                if nesting == 0:
                    break  # closing parenthesis of the call itself
                nesting -= 1
            elif ch == "," and nesting == 0:
                # An empty argument is recorded as starting right after the comma.
                arg_starts.append(pos + 1 if current is None else current)
                current = None
            elif current is None and not ch.isspace():
                current = pos
        if current is not None:
            arg_starts.append(current)
        if not 1 <= n <= len(arg_starts):
            return None
        return _skip_ws(s, arg_starts[n - 1])

    def _format_arg_is_risky(s: str, call_start: int, arg_no: int, lineno: int) -> bool:
        """Classify the arg_no-th argument of the call starting at call_start."""
        j = _nth_arg_start(s, s.index("(", call_start), arg_no)
        if j is None:
            # The argument could not be located on this line: warn conservatively.
            return True
        if _arg_is_literal(s, j):
            return False
        if not (s[j].isalpha() or s[j] == "_"):
            return True
        if _arg_is_wrapper_literal(s, j):
            return False
        ident = _leading_ident(s, j)
        return not (ident and _var_assigned_literal(ident, lines, lineno, lookback=5))

    for idx, s in enumerate(lines, start=1):
        flagged = False

        # printf/sprintf/snprintf/vsprintf/vsnprintf: position depends on the name.
        printf_hit = RE_PRINTF_LIKE.search(s)
        if printf_hit:
            try:
                arg_no = fmt_arg_map.get(printf_hit.group(1).lower(), 1)
                flagged = _format_arg_is_risky(s, printf_hit.start(), arg_no, idx)
            except Exception:
                # Parsing problems leave the line unflagged.
                pass

        # fprintf: the format string is the second argument.
        fprintf_hit = RE_FPRINTF.search(s)
        if not flagged and fprintf_hit:
            try:
                flagged = _format_arg_is_risky(s, fprintf_hit.start(), 2, idx)
            except Exception:
                pass

        if not flagged:
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="unsafe_usage",
                pattern="format_string",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="格式化字符串参数不是字面量,可能导致格式化字符串漏洞。",
                suggestion="使用常量格式串并对外部输入进行参数化处理;避免将未验证的输入作为格式串。",
                confidence=0.8,
                severity="high",
            )
        )
    return issues
937
+
938
+
939
def _rule_insecure_tmpfile(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report every line that matches ``RE_INSECURE_TMP``.

    The issue text refers to the tmpnam/tempnam/mktemp family of temporary
    file APIs.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per matching line, in file order.
    """
    return [
        Issue(
            language="c/cpp",
            category="unsafe_usage",
            pattern="insecure_tmpfile",
            file=relpath,
            line=lineno,
            evidence=_strip_line(text),
            description="使用不安全的临时文件API(tmpnam/tempnam/mktemp)可能导致竞态条件与劫持风险。",
            suggestion="使用 mkstemp/mkdtemp 或安全封装,并设置合适的权限。",
            confidence=0.85,
            severity="high",
        )
        for lineno, text in enumerate(lines, start=1)
        if RE_INSECURE_TMP.search(text)
    ]
961
+
962
+
963
def _rule_command_execution(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report command-execution calls whose first argument is not a literal.

    Lines matching ``RE_SYSTEM_LIKE`` are examined first and, when that does
    not flag the line, lines matching ``RE_EXEC_LIKE``.  In both cases the
    call is treated as safe when the first argument

    * starts with a double quote (string literal), or
    * is an identifier that was assigned a string literal within the five
      preceding lines.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per flagged line, in file order.
    """
    issues: List[Issue] = []

    def _assigned_from_literal(var: str, upto_idx: int, lookback: int = 5) -> bool:
        """True when ``var = "..."`` appears in the ``lookback`` lines before upto_idx."""
        assign_re = re.compile(rf"\b{re.escape(var)}\s*=\s*")
        for ln in range(max(1, upto_idx - lookback), upto_idx):
            earlier = _safe_line(lines, ln)
            hit = assign_re.search(earlier)
            if hit is None:
                continue
            # The right-hand side must open a string literal.
            if earlier[hit.end():].lstrip().startswith('"'):
                return True
        return False

    def _first_arg_is_risky(s: str, paren_idx: int, lineno: int) -> bool:
        """Classify the first argument following the '(' at paren_idx."""
        pos = paren_idx + 1
        while pos < len(s) and s[pos].isspace():
            pos += 1
        if pos < len(s) and s[pos] == '"':
            return False
        if pos >= len(s) or not (s[pos].isalpha() or s[pos] == "_"):
            # Neither a literal nor a plain identifier (expression, cast, ...).
            return True
        stop = pos
        while stop < len(s) and (s[stop].isalnum() or s[stop] == "_"):
            stop += 1
        return not _assigned_from_literal(s[pos:stop], lineno, lookback=5)

    for idx, s in enumerate(lines, start=1):
        flagged = False

        system_hit = RE_SYSTEM_LIKE.search(s)
        if system_hit:
            try:
                flagged = _first_arg_is_risky(s, s.index("(", system_hit.start()), idx)
            except Exception:
                # Parsing failure leaves the line unflagged for this branch.
                pass

        if not flagged:
            exec_hit = RE_EXEC_LIKE.search(s)
            if exec_hit:
                try:
                    flagged = _first_arg_is_risky(s, s.index("(", exec_hit.start()), idx)
                except Exception:
                    # NOTE(review): unlike the branch above, a parsing failure
                    # here flags the line; confirm the asymmetry is intended.
                    flagged = True

        if not flagged:
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="unsafe_usage",
                pattern="command_exec",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="外部命令执行可能使用了非字面量参数,存在命令注入风险。",
                suggestion="避免拼接命令,使用参数化接口或受控白名单;严格校验/转义外部输入。",
                confidence=0.7,
                severity="high",
            )
        )
    return issues
1051
+
1052
+
1053
def _rule_scanf_no_width(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report scanf-family calls that use ``%s`` without a maximum field width.

    Only calls matched by ``RE_SCANF_CALL`` are inspected; its first capture
    group is taken to be the literal format string.  A call is reported when
    the format contains at least one bare ``%s`` conversion.

    Conversions that are not a bare ``%s`` never trigger the rule, so the
    following are ignored without special casing:

    * ``%<width>s`` - bounded read,
    * ``%*s`` - input is discarded, nothing is written,
    * ``%ms`` / ``%m[...]`` - GNU allocating conversions,
    * ``%%s`` - an escaped percent sign followed by the letter ``s``.

    Unlike a purely "any safe specifier present" test, one bounded or
    discarding conversion elsewhere in the same format no longer hides an
    unbounded ``%s`` (for example ``"%*s %s"`` or ``"%10s %s"``).

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per flagged line, in file order.
    """
    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        m = RE_SCANF_CALL.search(s)
        if not m:
            continue
        fmt = m.group(1) or ""
        # Drop escaped percent signs first so "%%s" is not mistaken for "%s".
        conversions = fmt.replace("%%", "")
        if "%s" not in conversions:
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="buffer_overflow",
                pattern="scanf_%s_no_width",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="scanf/sscanf/fscanf 使用 %s 但未限制最大宽度,存在缓冲区溢出风险。",
                suggestion="为 %s 指定最大宽度(如 \"%255s\"),或使用更安全的读取方式;若使用 GNU 扩展 %ms/%m[...] 请确保对返回内存进行释放。",
                confidence=0.75,
                severity="high",
            )
        )
    return issues
1093
+
1094
+
1095
def _rule_alloc_size_overflow(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report ``malloc`` calls whose size expression multiplies without sizeof.

    Only ``malloc(`` (matched case-insensitively) is inspected.  The argument
    text is taken up to the parenthesis that closes the call on the same
    line, so nested parentheses such as ``malloc((a + b) * c)`` or
    ``malloc(strlen(s) * 2)`` are seen in full.  When the call is not closed
    on the line, the text up to the first ``)`` is used instead; if there is
    none the line is skipped.

    The rule is heuristic: any ``*`` in the argument counts, including a
    pointer dereference, so findings need manual confirmation.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per flagged line, in file order.
    """
    malloc_call = re.compile(r"\bmalloc\s*\(", re.IGNORECASE)
    sizeof_call = re.compile(r"\bsizeof\s*\(")

    def _closing_paren(text: str, open_idx: int) -> int:
        """Return the index of the ')' matching text[open_idx], or -1."""
        depth = 0
        for pos in range(open_idx, len(text)):
            ch = text[pos]
            if ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
                if depth == 0:
                    return pos
        return -1

    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        m = malloc_call.search(s)
        if not m:
            continue
        start = m.end() - 1  # the pattern ends with the opening parenthesis
        end = _closing_paren(s, start)
        if end == -1:
            # Call continues on another line: fall back to the first ')'.
            end = s.find(")", start + 1)
            if end == -1:
                continue
        args = s[start + 1 : end]
        if "*" not in args or sizeof_call.search(args):
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="memory_mgmt",
                pattern="alloc_size_overflow",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="malloc 大小计算包含乘法且未显式使用 sizeof,存在整数溢出或尺寸计算错误的风险。",
                suggestion="使用 sizeof 计算元素大小并检查乘法是否可能溢出;引入范围/上界校验。",
                confidence=0.6,
                severity="medium",
            )
        )
    return issues
1128
+
1129
+
1130
+ # ---------------------------
1131
+ # 空指针/野指针/死锁 等新增规则
1132
+ # ---------------------------
1133
+
1134
def _rule_possible_null_deref(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Heuristically report pointer dereferences with no NULL check nearby.

    Two access forms are collected per line:

    * ``p->...`` - always treated as a dereference;
    * ``*p`` - only on lines without a type keyword, and only when the
      character before ``*`` (ignoring whitespace) is one of ``(*,=:{;[!&``
      or the star begins the line, which filters out multiplications such
      as ``a * p``.

    Each distinct variable on a line is reported once unless
    ``_has_null_check_around`` finds a check within three lines.  ``this``
    is never reported.  Variables are reported in order of first appearance
    on the line so that the output is stable between runs.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        The reported issues, in file order.
    """
    issues: List[Issue] = []
    re_arrow = re.compile(r"\b([A-Za-z_]\w*)\s*->")
    re_star = re.compile(r"(?<!\w)\*\s*([A-Za-z_]\w*)\b")
    type_kw = re.compile(r"\b(typedef|struct|union|enum|class|char|int|long|short|void|size_t|ssize_t|FILE)\b")

    def _is_deref_context(line: str, star_pos: int) -> bool:
        """True when the '*' at star_pos sits where a dereference is plausible."""
        before = line[:star_pos].rstrip()
        if not before:
            return True
        return before[-1] in "(*,=:{;[!&"

    for idx, s in enumerate(lines, start=1):
        vars_hit: List[str] = [m.group(1) for m in re_arrow.finditer(s)]
        # '*p': skip declaration-like lines and multiplication contexts.
        if "*" in s and not type_kw.search(s):
            vars_hit.extend(
                m.group(1)
                for m in re_star.finditer(s)
                if _is_deref_context(s, m.start(0))
            )
        # dict.fromkeys de-duplicates while keeping first-seen order; a plain
        # set would make the issue order depend on string hash randomisation.
        for v in dict.fromkeys(vars_hit):
            if v == "this":  # this-> in C++ member functions is not a NULL risk
                continue
            if _has_null_check_around(v, lines, idx, radius=3):
                continue
            issues.append(
                Issue(
                    language="c/cpp",
                    category="memory_mgmt",
                    pattern="possible_null_deref",
                    file=relpath,
                    line=idx,
                    evidence=_strip_line(s),
                    description=f"可能对指针 {v} 进行了解引用,但附近未见 NULL 检查,存在空指针解引用风险。",
                    suggestion="在使用指针前执行 NULL 判定;确保所有返回/赋值路径均进行了合法性检查。",
                    confidence=0.6,
                    severity="high",
                )
            )
    return issues
1188
+
1189
+
1190
def _rule_uninitialized_ptr_use(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report pointers that are dereferenced before any visible assignment.

    Pass 1 collects candidate declarations: lines that contain ``;`` and
    ``*`` and a type/qualifier keyword, but neither ``=`` nor ``(``.  Every
    ``*name`` on such a line becomes a candidate.

    Pass 2 scans up to 20 lines after each declaration.  The scan stops at
    the first line that assigns to the name; if a ``name->`` or ``*name``
    access is met first, that line is reported.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        The reported issues, ordered by declaration.
    """
    issues: List[Issue] = []
    type_prefix = re.compile(r"\b(typedef|struct|union|enum|class|const|volatile|static|register|signed|unsigned|char|int|long|short|void|float|double)\b")
    declarator = re.compile(r"\*\s*([A-Za-z_]\w*)\b")

    # Pass 1: (variable, declaration line) pairs.
    candidates = []
    for idx, s in enumerate(lines, start=1):
        looks_like_plain_decl = ";" in s and "(" not in s and "=" not in s
        if not looks_like_plain_decl or "*" not in s:
            continue
        if type_prefix.search(s) is None:
            continue
        candidates.extend((m.group(1), idx) for m in declarator.finditer(s))

    # Pass 2: look for a dereference that precedes any assignment.
    total = len(lines)
    for v, decl_line in candidates:
        name = re.escape(v)
        # NOTE(review): this also matches "v == x" and "*v = x"; both end the
        # scan as if the pointer had been initialised.
        assigned = re.compile(rf"\b{name}\s*=\s*")
        arrow_access = re.compile(rf"\b{name}\s*->")
        star_access = re.compile(rf"(?<!\w)\*\s*{name}\b")
        horizon = min(total, decl_line + 20)
        for j in range(decl_line + 1, horizon + 1):
            sj = _safe_line(lines, j)
            if assigned.search(sj):
                break
            if not (arrow_access.search(sj) or star_access.search(sj)):
                continue
            issues.append(
                Issue(
                    language="c/cpp",
                    category="memory_mgmt",
                    pattern="wild_pointer_deref",
                    file=relpath,
                    line=j,
                    evidence=_strip_line(_safe_line(lines, j)),
                    description=f"指针 {v} 声明后未见初始化即被解引用,可能为野指针使用。",
                    suggestion="在声明后立即将指针初始化为 NULL,并在使用前进行显式赋值与有效性校验。",
                    confidence=0.65,
                    severity="high",
                )
            )
            break
    return issues
1246
+
1247
+
1248
def _rule_deadlock_patterns(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report common deadlock risks around mutex lock/unlock calls.

    Three heuristics are applied, using ``RE_PTHREAD_LOCK`` and
    ``RE_PTHREAD_UNLOCK`` (first capture group = mutex expression):

    * double lock - a mutex is locked while it is still on the stack of
      currently held mutexes;
    * lock-order inversion - the file acquires ``A`` then ``B`` somewhere
      and ``B`` then ``A`` elsewhere.  Each unordered pair is reported once,
      at the line where the second ordering was first seen;
    * possibly missing unlock - no matching unlock within 50 lines after a
      lock.

    No scope tracking is performed: the stack of held mutexes persists
    across function boundaries, so false positives are possible.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        Double-lock issues first, then inversions, then missing unlocks.
    """
    issues: List[Issue] = []
    lock_stack: list[str] = []
    # (held mutex, newly locked mutex) -> line where the pair was first seen.
    order_pairs: dict[tuple[str, str], int] = {}

    # Pass 1: double locks and acquisition order.
    for idx, s in enumerate(lines, start=1):
        m_lock = RE_PTHREAD_LOCK.search(s)
        if m_lock:
            mtx = m_lock.group(1)
            if mtx in lock_stack:
                issues.append(
                    Issue(
                        language="c/cpp",
                        category="error_handling",
                        pattern="double_lock",
                        file=relpath,
                        line=idx,
                        evidence=_strip_line(s),
                        description=f"互斥量 {mtx} 在未解锁的情况下被再次加锁,存在死锁风险。",
                        suggestion="避免对同一互斥量重复加锁;检查代码路径确保加锁/解锁严格匹配。",
                        confidence=0.8,
                        severity="high",
                    )
                )
            if lock_stack and lock_stack[-1] != mtx:
                order_pairs.setdefault((lock_stack[-1], mtx), idx)
            lock_stack.append(mtx)
            continue

        m_unlock = RE_PTHREAD_UNLOCK.search(s)
        if m_unlock:
            mtx = m_unlock.group(1)
            # Drop the most recent acquisition of this mutex (approximation).
            for k in range(len(lock_stack) - 1, -1, -1):
                if lock_stack[k] == mtx:
                    del lock_stack[k]
                    break

    # Pass 2: lock-order inversions, one report per unordered pair.
    reported: set[frozenset[str]] = set()
    for (a, b), first_line in order_pairs.items():
        reverse_line = order_pairs.get((b, a))
        if reverse_line is None:
            continue
        pair_key = frozenset((a, b))
        if pair_key in reported:
            continue
        reported.add(pair_key)
        # Pairs are stored in file order, so the reverse ordering is the later one.
        where = max(first_line, reverse_line)
        issues.append(
            Issue(
                language="c/cpp",
                category="error_handling",
                pattern="lock_order_inversion",
                file=relpath,
                line=where,
                evidence=_strip_line(_safe_line(lines, where)),
                description=f"检测到互斥量加锁顺序反转:({a} -> {b}) 与 ({b} -> {a}),存在死锁风险。",
                suggestion="统一多锁的获取顺序,制定全局锁等级或严格的加锁顺序规范。",
                confidence=0.7,
                severity="high",
            )
        )

    # Pass 3: locks without a matching unlock in the following 50 lines.
    for idx, s in enumerate(lines, start=1):
        m_lock = RE_PTHREAD_LOCK.search(s)
        if not m_lock:
            continue
        mtx = m_lock.group(1)
        end = min(len(lines), idx + 50)
        unlocked = False
        for j in range(idx + 1, end + 1):
            m_un = RE_PTHREAD_UNLOCK.search(_safe_line(lines, j))
            if m_un and m_un.group(1) == mtx:
                unlocked = True
                break
        if unlocked:
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="error_handling",
                pattern="missing_unlock_suspect",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description=f"在加锁 {mtx} 之后的邻近窗口内未检测到匹配解锁,可能存在缺失解锁的风险。",
                suggestion="确保所有加锁路径都有配对的解锁;考虑使用 RAII/DEFER 风格避免遗漏。",
                confidence=0.55,
                severity="medium",
            )
        )
    return issues
1349
+
1350
+
1351
+ # ---------------------------
1352
+ # 其他危险用法规则(新增一批低误报)
1353
+ # ---------------------------
1354
+
1355
def _rule_double_free_and_free_non_heap(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report repeated frees and frees of obviously non-heap memory.

    For every call matched by ``RE_FREE_CALL_ANY`` (first capture group =
    argument text):

    * ``free(NULL)`` / ``free(0)`` / ``free((void*)0)`` is ignored;
    * an argument starting with ``&`` (optionally parenthesised) or with a
      double quote is reported as ``free_non_heap``;
    * a plain identifier that was freed earlier and has not been assigned
      (per ``RE_GENERIC_ASSIGN``) on or after the line of that earlier free
      is reported as ``double_free``.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        The reported issues, in file order.
    """
    issues: List[Issue] = []
    freed_at: dict[str, int] = {}
    assigned_at: dict[str, int] = {}
    null_arg = re.compile(r"\(?\s*(NULL|0|\(void\s*\*\)\s*0)\s*\)?", re.IGNORECASE)
    plain_ident = re.compile(r"[A-Za-z_]\w*")

    for idx, s in enumerate(lines, start=1):
        # Remember the latest assignment line for each variable.
        for assign in RE_GENERIC_ASSIGN.finditer(s):
            assigned_at[assign.group(1)] = idx

        for call in RE_FREE_CALL_ANY.finditer(s):
            arg = call.group(1).strip()

            if null_arg.fullmatch(arg):
                continue

            takes_address = re.match(r"^\(?\s*&", arg) is not None
            if takes_address or arg.lstrip().startswith('"'):
                issues.append(
                    Issue(
                        language="c/cpp",
                        category="memory_mgmt",
                        pattern="free_non_heap",
                        file=relpath,
                        line=idx,
                        evidence=_strip_line(s),
                        description="检测到对非堆内存的释放(如 &var 或字符串字面量),属于未定义行为。",
                        suggestion="仅释放由 malloc/calloc/realloc/new/new[] 获得的堆内存;避免对栈地址/字面量调用 free。",
                        confidence=0.85,
                        severity="high",
                    )
                )
                continue

            # Double-free tracking only applies to single identifiers.
            if not plain_ident.fullmatch(arg):
                continue
            var = arg
            previous_free = freed_at.get(var)
            if previous_free is not None and assigned_at.get(var, -1) < previous_free:
                # Freed again with no reassignment since the previous free.
                issues.append(
                    Issue(
                        language="c/cpp",
                        category="memory_mgmt",
                        pattern="double_free",
                        file=relpath,
                        line=idx,
                        evidence=_strip_line(s),
                        description=f"指针 {var} 可能在未重新赋值/置空情况下被重复释放(double free)。",
                        suggestion="free 后将指针置 NULL;确保每块内存仅释放一次;理清所有权与释放路径。",
                        confidence=0.8,
                        severity="high",
                    )
                )
            freed_at[var] = idx
    return issues
1422
+
1423
+
1424
def _rule_atoi_family(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report every line that matches ``RE_ATOI_FAMILY``.

    The issue text covers atoi/atol/atoll/atof, which offer no error or
    range reporting, and recommends the strtol/strtoul/strtod family.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per matching line, in file order.
    """
    return [
        Issue(
            language="c/cpp",
            category="input_validation",
            pattern="atoi_family",
            file=relpath,
            line=lineno,
            evidence=_strip_line(text),
            description="使用 atoi/atol/atoll/atof 缺乏错误与范围检查,容易产生解析错误或未定义行为。",
            suggestion="使用 strtol/strtoul/strtod 等并检查 errno 和 endptr;进行范围与格式校验。",
            confidence=0.65,
            severity="medium",
        )
        for lineno, text in enumerate(lines, start=1)
        if RE_ATOI_FAMILY.search(text)
    ]
1447
+
1448
+
1449
def _rule_rand_insecure(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report lines matching ``RE_RAND``; raise the risk near sensitive words.

    The base confidence is 0.55.  When the lower-cased text of the line and
    its immediate neighbours contains one of the security-related keywords
    (as a plain substring), the confidence is increased by 0.2, which also
    moves the severity from "medium" to "high".

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per matching line, in file order.
    """
    keywords = ("token", "nonce", "secret", "password", "passwd", "key", "auth", "salt", "session", "otp")
    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        if not RE_RAND.search(s):
            continue
        neighbourhood = " ".join(
            t for _, t in _window(lines, idx, before=1, after=1)
        ).lower()
        sensitive = any(word in neighbourhood for word in keywords)
        conf = 0.55 + 0.2 if sensitive else 0.55
        severity = "high" if conf >= 0.7 else "medium"
        issues.append(
            Issue(
                language="c/cpp",
                category="crypto",
                pattern="rand_insecure",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="检测到 rand/srand,用于安全敏感场景可能不安全,易被预测。",
                suggestion="使用系统级 CSPRNG(如 getrandom/arc4random/openssl RAND_bytes),避免用于密钥/令牌生成。",
                confidence=min(conf, 0.8),
                severity=severity,
            )
        )
    return issues
1476
+
1477
+
1478
def _rule_strtok_nonreentrant(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report every line that matches ``RE_STRTOK``.

    The issue text describes strtok as non-reentrant and recommends
    strtok_r or another reentrant tokenizer.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per matching line, in file order.
    """
    return [
        Issue(
            language="c/cpp",
            category="thread_safety",
            pattern="strtok_nonreentrant",
            file=relpath,
            line=lineno,
            evidence=_strip_line(text),
            description="使用 strtok 非重入且线程不安全,可能导致竞态或数据覆盖。",
            suggestion="使用 strtok_r(POSIX)或可重入/线程安全的分割方案。",
            confidence=0.6,
            severity="medium",
        )
        for lineno, text in enumerate(lines, start=1)
        if RE_STRTOK.search(text)
    ]
1500
+
1501
+
1502
def _rule_open_permissive_perms(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report overly permissive file creation.

    Two independent checks run on every line:

    * ``RE_OPEN_PERMISSIVE`` matches - reported as ``open_permissive_perms``
      with the captured mode quoted in the description;
    * ``RE_FOPEN_MODE`` matches, the captured mode contains ``w`` and the
      lower-cased text of the line and its immediate neighbours contains a
      sensitive keyword - reported as ``fopen_write_sensitive``.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        The reported issues, in file order.
    """
    sensitive_keys = ("key", "secret", "token", "passwd", "password", "cred", "config", "cert", "private", "id_rsa")
    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        open_hit = RE_OPEN_PERMISSIVE.search(s)
        if open_hit:
            mode = open_hit.group(1)
            issues.append(
                Issue(
                    language="c/cpp",
                    category="insecure_permissions",
                    pattern="open_permissive_perms",
                    file=relpath,
                    line=idx,
                    evidence=_strip_line(s),
                    description=f"open 使用 O_CREAT 且权限 {mode} 过宽,存在敏感信息泄露风险。",
                    suggestion="显式使用更严格的权限(如 0600/0640),或设置合适 umask 后再创建文件。",
                    confidence=0.8,
                    severity="high",
                )
            )

        # fopen in a write mode is only worth a hint in a sensitive context.
        fopen_hit = RE_FOPEN_MODE.search(s)
        if not fopen_hit or "w" not in fopen_hit.group(1):
            continue
        context = " ".join(
            t for _, t in _window(lines, idx, before=1, after=1)
        ).lower()
        if not any(word in context for word in sensitive_keys):
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="insecure_permissions",
                pattern="fopen_write_sensitive",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="fopen 以写入模式操作可能的敏感文件,需确认创建权限与 umask 设置是否足够严格。",
                suggestion="确认运行态 umask;必要时使用 open+fchmod/umask 控制权限,或以 0600 创建后再放宽。",
                confidence=0.55,
                severity="medium",
            )
        )
    return issues
1550
+
1551
+
1552
+ # ---------------------------
1553
+ # 更多危险用法规则(第二批)
1554
+ # ---------------------------
1555
+
1556
def _rule_alloca_unbounded(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report ``alloca`` calls whose size is not an obvious constant.

    The size argument is the first capture group of ``RE_ALLOCA``.  A call
    is skipped when the argument is a plain decimal number, mentions
    ``sizeof``, or is an ALL_CAPS identifier (assumed to be a macro
    constant).  The confidence rises from 0.6 by 0.1 when the argument
    contains a word ending in len/size/count/n.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per flagged line, in file order.
    """
    numeric_literal = re.compile(r"\d+\s*")
    macro_constant = re.compile(r"[A-Z_][A-Z0-9_]*")
    size_like = re.compile(r"(len|size|count|n)\b", re.IGNORECASE)

    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        hit = RE_ALLOCA.search(s)
        if not hit:
            continue
        arg = hit.group(1).strip()
        looks_constant = (
            numeric_literal.fullmatch(arg) is not None
            or "sizeof" in arg
            or macro_constant.fullmatch(arg) is not None
        )
        if looks_constant:
            continue
        conf = 0.6 + 0.1 if size_like.search(arg) else 0.6
        issues.append(
            Issue(
                language="c/cpp",
                category="memory_mgmt",
                pattern="alloca_unbounded",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="alloca 使用的大小不是编译期常量,可能导致未受控的栈分配与崩溃风险。",
                suggestion="避免使用 alloca;改用堆分配并对大小做上界检查与错误处理。",
                confidence=min(conf, 0.8),
                severity="high" if conf >= 0.7 else "medium",
            )
        )
    return issues
1591
+
1592
+
1593
def _rule_vla_usage(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report declarations that look like variable-length arrays.

    A line is inspected only when it contains ``;``, contains no ``=``, and
    carries a type/qualifier keyword.  The array length is the first capture
    group of ``RE_VLA_DECL``; plain decimal numbers and ALL_CAPS identifiers
    (assumed to be macro constants) are not reported.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per flagged line, in file order.
    """
    type_prefix = re.compile(r"\b(typedef|struct|union|enum|class|const|volatile|static|register|signed|unsigned|char|int|long|short|void|float|double|size_t|ssize_t)\b")
    constant_length = (
        re.compile(r"\d+\s*"),            # numeric literal
        re.compile(r"[A-Z_][A-Z0-9_]*"),  # macro-style constant
    )

    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        is_simple_statement = ";" in s and "=" not in s
        if not is_simple_statement or not type_prefix.search(s):
            continue
        decl = RE_VLA_DECL.search(s)
        if not decl:
            continue
        length_expr = decl.group(1).strip()
        if any(p.fullmatch(length_expr) for p in constant_length):
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="memory_mgmt",
                pattern="vla_usage",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="检测到可变长度数组(VLA),在栈上进行不定大小分配,可能导致栈溢出/不可控内存使用。",
                suggestion="避免 VLA;改用堆分配并进行上界校验,或使用固定上界的静态分配。",
                confidence=0.6,
                severity="medium",
            )
        )
    return issues
1629
+
1630
+
1631
def _rule_pthread_returns_unchecked(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report pthread calls whose return value does not appear to be checked.

    A line matching ``RE_PTHREAD_RET`` is reported unless the line itself or
    one of the two following lines contains an ``if (`` or any comparison
    operator.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per flagged line, in file order.
    """
    check_hint = re.compile(r"\bif\s*\(|>=|<=|==|!=|<|>")
    total = len(lines)
    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        if not RE_PTHREAD_RET.search(s):
            continue
        last = min(idx + 2, total)
        nearby = " ".join(_safe_line(lines, k) for k in range(idx, last + 1))
        if check_hint.search(nearby):
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="error_handling",
                pattern="pthread_ret_unchecked",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="pthread 接口返回值可能未检查,错误处理缺失可能导致死锁/资源泄漏。",
                suggestion="检查 pthread 接口返回码并进行错误路径处理;必要时记录日志与清理资源。",
                confidence=0.6,
                severity="medium",
            )
        )
    return issues
1656
+
1657
+
1658
def _rule_cond_wait_no_loop(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report ``pthread_cond_wait`` calls that are not guarded by a while loop.

    A call matched by ``RE_PTHREAD_COND_WAIT`` is accepted when a
    ``while (`` appears either in the two preceding lines or on the same
    line before the call; otherwise it is reported.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per flagged line, in file order.
    """
    while_head = re.compile(r"\bwhile\s*\(")
    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        call = RE_PTHREAD_COND_WAIT.search(s)
        if not call:
            continue
        # Up to two lines above the call.
        preceding = " ".join(
            _safe_line(lines, k) for k in range(max(1, idx - 2), idx)
        )
        # Part of the current line that comes before the call.
        same_line_prefix = s[: call.start()]
        guarded = any(
            while_head.search(chunk) is not None
            for chunk in (preceding, same_line_prefix)
        )
        if guarded:
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="thread_safety",
                pattern="cond_wait_no_loop",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description="pthread_cond_wait 建议置于条件谓词的 while 循环中,以防止虚假唤醒。",
                suggestion="使用 while(predicate_not_satisfied) 包裹 pthread_cond_wait 调用并在唤醒后重新检查条件。",
                confidence=0.6,
                severity="medium",
            )
        )
    return issues
1694
+
1695
+
1696
def _rule_thread_leak_no_join(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report threads that are created but never joined or detached nearby.

    For each line matching ``RE_PTHREAD_CREATE`` the first capture group is
    taken as the thread identifier.  The following 80 lines are searched for
    a ``RE_PTHREAD_JOIN`` or ``RE_PTHREAD_DETACH`` match whose first capture
    group equals that identifier; if none is found the creation line is
    reported.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per flagged line, in file order.
    """
    release_patterns = (RE_PTHREAD_JOIN, RE_PTHREAD_DETACH)

    def _releases(text: str, thread_id: str) -> bool:
        """True when text joins or detaches the given thread identifier."""
        for pattern in release_patterns:
            hit = pattern.search(text)
            if hit and hit.group(1) == thread_id:
                return True
        return False

    total = len(lines)
    issues: List[Issue] = []
    for idx, s in enumerate(lines, start=1):
        created = RE_PTHREAD_CREATE.search(s)
        if not created:
            continue
        tid = created.group(1)
        horizon = min(total, idx + 80)
        if any(
            _releases(_safe_line(lines, j), tid)
            for j in range(idx + 1, horizon + 1)
        ):
            continue
        issues.append(
            Issue(
                language="c/cpp",
                category="resource_leak",
                pattern="thread_leak_no_join",
                file=relpath,
                line=idx,
                evidence=_strip_line(s),
                description=f"pthread_create 创建线程 {tid} 后的邻近窗口内未检测到 join/detach,可能导致线程泄漏或资源占用。",
                suggestion="确保创建的线程被显式 join 或 detach;遵循统一的线程生命周期管理策略。",
                confidence=0.6,
                severity="medium",
            )
        )
    return issues
1734
+
1735
+
1736
def _rule_inet_legacy(lines: Sequence[str], relpath: str) -> List[Issue]:
    """Report every line that matches ``RE_INET_LEGACY``.

    The issue text refers to legacy address-conversion calls such as
    inet_addr/inet_aton and recommends inet_pton/inet_ntop.

    Args:
        lines: Source lines of the file being scanned.
        relpath: Path recorded in each reported issue.

    Returns:
        One ``Issue`` per matching line, in file order.
    """
    return [
        Issue(
            language="c/cpp",
            category="network_api",
            pattern="inet_legacy",
            file=relpath,
            line=lineno,
            evidence=_strip_line(text),
            description="使用 inet_addr/inet_aton 等旧接口,错误语义模糊/不一致。",
            suggestion="使用 inet_pton/inet_ntop 进行地址转换,错误处理更可靠且支持 IPv6。",
            confidence=0.6,
            severity="low",
        )
        for lineno, text in enumerate(lines, start=1)
        if RE_INET_LEGACY.search(text)
    ]
1758
+
1759
+
1760
def _rule_time_apis_not_threadsafe(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Report lines that use non-reentrant time conversion APIs.

    A line is reported when RE_TIME_UNSAFE matches it and the line contains
    no call whose name ends in ``_r`` (checked with ``_r\\s*\\(``). The
    exclusion is per line: any ``*_r(`` call on the line suppresses the report.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        One medium-severity "time_api_not_threadsafe" issue per reported line.
    """
    reentrant_call = re.compile(r"_r\s*\(")
    found: List[Issue] = []
    for lineno, text in enumerate(lines, start=1):
        if not RE_TIME_UNSAFE.search(text):
            continue
        if reentrant_call.search(text):
            # A *_r call is present somewhere on the line: skip it.
            continue
        found.append(
            Issue(
                language="c/cpp",
                category="thread_safety",
                pattern="time_api_not_threadsafe",
                file=relpath,
                line=lineno,
                evidence=_strip_line(text),
                description="使用 asctime/ctime/localtime/gmtime 等非重入接口,线程安全性不足。",
                suggestion="改用 *_r 线程安全版本(如 localtime_r/gmtime_r/ctime_r)。",
                confidence=0.6,
                severity="medium",
            )
        )
    return found
1783
+
1784
+
1785
def _rule_getenv_unchecked(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Report every line on which RE_GETENV matches.

    The rule does not look for validation code; it flags each match with a
    medium-severity "getenv_unchecked" issue so the value's later use (paths,
    commands, configuration) can be reviewed.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        One issue per matching line, in line order.
    """
    found: List[Issue] = []
    hits = (
        (lineno, text)
        for lineno, text in enumerate(lines, start=1)
        if RE_GETENV.search(text)
    )
    for lineno, text in hits:
        found.append(
            Issue(
                language="c/cpp",
                category="input_validation",
                pattern="getenv_unchecked",
                file=relpath,
                line=lineno,
                evidence=_strip_line(text),
                description="读取环境变量后未见显式校验,可能被用于构造路径/命令等引入安全风险。",
                suggestion="对白名单键进行读取;对取值执行格式/长度/字符集校验;避免直接拼接为命令/路径。",
                confidence=0.55,
                severity="medium",
            )
        )
    return found
1807
+
1808
+
1809
+ # ---------------------------
1810
+ # C++ 特定检查规则
1811
+ # ---------------------------
1812
+
1813
def _rule_new_delete_mismatch(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Detect mismatched new/delete forms.

    - Memory obtained with ``new[]`` must be released with ``delete[]``.
    - Memory obtained with ``new`` must be released with ``delete``.

    The first pass collects assignment targets: a name assigned from ``new``
    on a line where RE_NEW_ARRAY matches is recorded as array-allocated; a
    name assigned from ``new`` with no ``[`` later on the line is recorded as
    scalar-allocated. The second pass reports ``delete[] name`` for
    scalar-allocated names and ``delete name`` for array-allocated names.
    Names are matched file-wide with no scope tracking.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        High-severity "delete_array_mismatch" / "delete_mismatch" issues.
    """
    any_new_target = re.compile(r"\b([A-Za-z_]\w*)\s*=\s*new\s+", re.IGNORECASE)
    scalar_new_target = re.compile(r"\b([A-Za-z_]\w*)\s*=\s*new\s+(?!.*\[)", re.IGNORECASE)
    array_delete_operand = re.compile(r"delete\s*\[\s*\]\s*([A-Za-z_]\w*)", re.IGNORECASE)
    scalar_delete_operand = re.compile(r"delete\s+([A-Za-z_]\w*)", re.IGNORECASE)

    # Only membership is ever consulted, so plain sets are sufficient.
    array_owned: set[str] = set()
    scalar_owned: set[str] = set()
    for text in lines:
        if RE_NEW_ARRAY.search(text):
            target = any_new_target.search(text)
            if target:
                array_owned.add(target.group(1))
        target = scalar_new_target.search(text)
        if target:
            scalar_owned.add(target.group(1))

    # (gate, operand extractor, names allocated the *other* way, pattern,
    #  description, suggestion) — evaluated in this order for every line.
    checks = (
        (
            RE_DELETE_ARRAY,
            array_delete_operand,
            scalar_owned,
            "delete_array_mismatch",
            "使用 delete[] 释放由 new 分配的内存(非数组),存在未定义行为风险。",
            "new 分配的内存应使用 delete 释放;new[] 分配的内存应使用 delete[] 释放。",
        ),
        (
            RE_DELETE,
            scalar_delete_operand,
            array_owned,
            "delete_mismatch",
            "使用 delete 释放由 new[] 分配的数组内存,存在未定义行为风险。",
            "new[] 分配的内存应使用 delete[] 释放;new 分配的内存应使用 delete 释放。",
        ),
    )

    found: List[Issue] = []
    for lineno, text in enumerate(lines, start=1):
        for gate, operand_re, wrong_pool, pattern_name, description, suggestion in checks:
            if not gate.search(text):
                continue
            operand = operand_re.search(text)
            if operand is None or operand.group(1) not in wrong_pool:
                continue
            found.append(
                Issue(
                    language="c/cpp",
                    category="memory_mgmt",
                    pattern=pattern_name,
                    file=relpath,
                    line=lineno,
                    evidence=_strip_line(text),
                    description=description,
                    suggestion=suggestion,
                    confidence=0.85,
                    severity="high",
                )
            )
    return found
1888
+
1889
+
1890
def _rule_reinterpret_cast_unsafe(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Report every line on which RE_REINTERPRET_CAST matches.

    The base confidence is 0.7; it is raised by 0.1 when the line also
    contains ``->`` or ``*`` (treated as a hint that pointers are involved),
    and capped at 0.9.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        One high-severity "reinterpret_cast_unsafe" issue per matching line.
    """
    found: List[Issue] = []
    for lineno, text in enumerate(lines, start=1):
        if not RE_REINTERPRET_CAST.search(text):
            continue
        score = 0.7
        looks_like_pointer = "->" in text or "*" in text
        if looks_like_pointer:
            score += 0.1
        found.append(
            Issue(
                language="c/cpp",
                category="type_safety",
                pattern="reinterpret_cast_unsafe",
                file=relpath,
                line=lineno,
                evidence=_strip_line(text),
                description="使用 reinterpret_cast 进行类型转换,可能导致未定义行为或类型安全问题。",
                suggestion="优先使用 static_cast 或 dynamic_cast;若必须使用 reinterpret_cast,需确保类型布局兼容并添加详细注释说明。",
                confidence=min(score, 0.9),
                severity="high",
            )
        )
    return found
1916
+
1917
+
1918
def _rule_const_cast_unsafe(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Report every line on which RE_CONST_CAST matches.

    The base confidence is 0.65; it is raised by 0.1 when the line contains
    ``=`` and does not contain a ``const <name> *`` declaration, and capped
    at 0.8.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        One high-severity "const_cast_unsafe" issue per matching line.
    """
    const_pointer_decl = re.compile(r"const\s+[A-Za-z_]\w*\s*\*")
    found: List[Issue] = []
    for lineno, text in enumerate(lines, start=1):
        if not RE_CONST_CAST.search(text):
            continue
        score = 0.65
        if "=" in text and const_pointer_decl.search(text) is None:
            # The result is stored and the target is not declared pointer-to-const.
            score += 0.1
        found.append(
            Issue(
                language="c/cpp",
                category="type_safety",
                pattern="const_cast_unsafe",
                file=relpath,
                line=lineno,
                evidence=_strip_line(text),
                description="使用 const_cast 移除 const 修饰符,可能导致未定义行为(如修改常量对象)。",
                suggestion="避免使用 const_cast;若必须使用,确保仅用于移除非底层 const 且对象本身可变。",
                confidence=min(score, 0.8),
                severity="high",
            )
        )
    return found
1944
+
1945
+
1946
def _rule_vector_string_bounds_check(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Flag ``[]`` indexing of vector/string variables with no nearby bounds check.

    Variable names are collected from RE_VECTOR_VAR and RE_STRING_VAR matches.
    A line is reported for a variable when it indexes that variable with
    ``name[``, the line has no RE_AT_METHOD match, and the two lines before
    and after contain no ``name.size(`` / ``.length(`` / ``.empty(`` /
    ``.at(`` call. At most one vector issue and one string issue are reported
    per line.

    Candidate names are tried in sorted order, so the variable named in a
    report is the same on every run (plain set iteration depends on string
    hash randomisation). Per-variable regexes are compiled once rather than
    once per line.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        Medium-severity "vector_bounds_check" / "string_bounds_check" issues.
    """
    issues: List[Issue] = []
    vector_vars: set[str] = set()
    string_vars: set[str] = set()

    # Pass 1: collect the declared container names.
    for s in lines:
        m = RE_VECTOR_VAR.search(s)
        if m:
            vector_vars.add(m.group(1))
        m = RE_STRING_VAR.search(s)
        if m:
            string_vars.add(m.group(1))

    if not vector_vars and not string_vars:
        return issues

    def _compile(names: set[str]) -> list:
        # (name, "name[" matcher, nearby bounds-check matcher), sorted by name.
        return [
            (
                name,
                re.compile(rf"\b{re.escape(name)}\s*\["),
                re.compile(
                    rf"\b{re.escape(name)}\s*\.(size|length|empty|at)\s*\(",
                    re.IGNORECASE,
                ),
            )
            for name in sorted(names)
        ]

    # Vector findings precede string findings on the same line.
    kinds = (
        ("vector", "vector_bounds_check", _compile(vector_vars)),
        ("string", "string_bounds_check", _compile(string_vars)),
    )

    # Pass 2: look for unchecked indexing.
    for idx, s in enumerate(lines, start=1):
        if RE_AT_METHOD.search(s):
            # The line already uses .at(); nothing on it is reported.
            continue
        window_text = None  # built lazily, shared by every variable on this line
        for label, pattern_name, entries in kinds:
            for name, index_re, guard_re in entries:
                if not index_re.search(s):
                    continue
                if window_text is None:
                    window_text = " ".join(
                        t for _, t in _window(lines, idx, before=2, after=2)
                    )
                if guard_re.search(window_text):
                    continue
                issues.append(
                    Issue(
                        language="c/cpp",
                        category="buffer_overflow",
                        pattern=pattern_name,
                        file=relpath,
                        line=idx,
                        evidence=_strip_line(s),
                        description=f"{label} {name} 使用 [] 访问可能越界,建议使用 .at() 进行边界检查。",
                        suggestion="使用 .at() 方法进行安全访问,或在使用 [] 前显式检查索引范围。",
                        confidence=0.6,
                        severity="medium",
                    )
                )
                break  # report at most once per line for this container kind
    return issues
2011
+
2012
+
2013
def _rule_missing_virtual_dtor(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Flag classes that declare virtual functions but no virtual destructor.

    A class body is followed by counting ``{`` and ``}`` from the line where
    RE_CLASS_DECL matches; the declaration line itself is not inspected for
    members. Inside the body, ``virtual`` followed by something other than
    ``~`` marks the class as having virtual functions, and an RE_VIRTUAL_DTOR
    match marks it as having a virtual destructor. A later declaration with
    the same class name replaces the earlier record.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        One high-severity "missing_virtual_dtor" issue per offending class,
        anchored at the class declaration line.
    """
    virtual_member = re.compile(r"\bvirtual\s+[^~]", re.IGNORECASE)
    # class name -> {"line": int, "has_virtual": bool, "has_virtual_dtor": bool}
    tracked: dict[str, dict] = {}
    open_class: Optional[str] = None
    depth = 0

    for lineno, text in enumerate(lines, start=1):
        net_braces = text.count("{") - text.count("}")

        header = RE_CLASS_DECL.search(text)
        if header:
            open_class = header.group(1)
            tracked[open_class] = {
                "line": lineno,
                "has_virtual": False,
                "has_virtual_dtor": False,
            }
            depth = net_braces
            continue

        if not open_class:
            continue

        depth += net_braces
        if depth <= 0:
            # Brace balance is back to (or below) zero: the class body is over.
            open_class = None
            continue

        record = tracked[open_class]
        if virtual_member.search(text):
            record["has_virtual"] = True
        if RE_VIRTUAL_DTOR.search(text):
            record["has_virtual_dtor"] = True

    return [
        Issue(
            language="c/cpp",
            category="memory_mgmt",
            pattern="missing_virtual_dtor",
            file=relpath,
            line=record["line"],
            evidence=_strip_line(_safe_line(lines, record["line"])),
            description=f"类 {name} 包含虚函数但析构函数非虚,通过基类指针删除派生类对象可能导致未定义行为。",
            suggestion="为基类添加虚析构函数,确保通过基类指针删除派生类对象时正确调用派生类析构函数。",
            confidence=0.75,
            severity="high",
        )
        for name, record in tracked.items()
        if record["has_virtual"] and not record["has_virtual_dtor"]
    ]
2069
+
2070
+
2071
def _rule_move_after_use(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Flag variables that are used within 10 lines after being recorded as moved.

    A variable is recorded when RE_MOVE_ASSIGN matches a line (its first
    capture group is the variable name). On each of the next 10 lines:

    - an assignment to the variable whose right-hand side is not
      ``std::move`` clears the record (the object has a fresh value);
    - an assignment from ``std::move`` is ignored;
    - otherwise a use of the form ``name->``, ``name[``, ``name.``, ``name(``
      or ``name,`` is reported once and the record is cleared.

    Earlier behaviour nested the re-assignment test under a guard that already
    required "no ``name =`` on this line", so it could never fire and records
    were never cleared by re-assignment. It also let the ``std::move``
    lookahead be defeated by whitespace backtracking, and treated ``==`` as
    an assignment. All three are corrected here.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        High-severity "move_after_use" issues, in line order.
    """
    issues: List[Issue] = []
    moved_vars: dict[str, int] = {}  # variable name -> line of the recorded move

    for idx, s in enumerate(lines, start=1):
        m = RE_MOVE_ASSIGN.search(s)
        if m:
            moved_vars[m.group(1)] = idx

        # Collect removals and apply them after the loop; the dict must not
        # change size while it is being iterated.
        vars_to_remove: set[str] = set()
        for var, move_line in moved_vars.items():
            if idx <= move_line:
                continue  # the move line itself is never a "use after move"
            if idx > move_line + 10:
                # Outside the 10-line window: the record can never fire again.
                vars_to_remove.add(var)
                continue

            name = re.escape(var)
            if not re.search(rf"\b{name}\b", s):
                continue

            # "name =" but not "name ==": a real assignment to the variable.
            if re.search(rf"\b{name}\s*=(?!=)", s):
                if not re.search(rf"\b{name}\s*=\s*std::move\b", s):
                    vars_to_remove.add(var)
                continue

            if re.search(rf"\b{name}\s*(->|\[|\.|\(|,)", s):
                issues.append(
                    Issue(
                        language="c/cpp",
                        category="memory_mgmt",
                        pattern="move_after_use",
                        file=relpath,
                        line=idx,
                        evidence=_strip_line(s),
                        description=f"变量 {var} 在 std::move 后仍被使用,移动后的对象处于有效但未指定状态,可能导致未定义行为。",
                        suggestion="移动后的对象不应再使用,除非重新赋值;考虑使用移动语义后立即停止使用该对象。",
                        confidence=0.7,
                        severity="high",
                    )
                )
                # Drop the record so the same move is reported only once.
                vars_to_remove.add(var)

        for var in vars_to_remove:
            moved_vars.pop(var, None)

    return issues
2120
+
2121
+
2122
def _rule_uncaught_exception(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Flag ``throw`` statements with no try/catch pair in the surrounding lines.

    For each line matched by RE_THROW, the 10 lines before and after are
    joined and searched with RE_TRY and RE_CATCH. Unless both match, an
    "uncaught_exception" issue is reported. The confidence starts at 0.6 and
    is raised by 0.2 when RE_NOEXCEPT matches the line or the 5 lines before
    it; the reported confidence is capped at 0.85, and severity is "high"
    when the uncapped value reaches 0.8, otherwise "medium".

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        The issues found, in line order.
    """

    def _joined_window(center: int, before: int, after: int) -> str:
        return " ".join(t for _, t in _window(lines, center, before=before, after=after))

    found: List[Issue] = []
    for lineno, text in enumerate(lines, start=1):
        if not RE_THROW.search(text):
            continue

        surrounding = _joined_window(lineno, 10, 10)
        guarded = (
            RE_TRY.search(surrounding) is not None
            and RE_CATCH.search(surrounding) is not None
        )
        if guarded:
            continue

        score = 0.6
        # Throwing near a noexcept specifier is treated as more serious.
        if RE_NOEXCEPT.search(_joined_window(lineno, 5, 0)):
            score += 0.2

        found.append(
            Issue(
                language="c/cpp",
                category="error_handling",
                pattern="uncaught_exception",
                file=relpath,
                line=lineno,
                evidence=_strip_line(text),
                description="检测到 throw 语句,但附近未见 try-catch 块,可能导致未捕获异常。",
                suggestion="确保异常在适当的作用域内被捕获;考虑使用 RAII 确保资源在异常时正确释放。",
                confidence=min(score, 0.85),
                severity="high" if score >= 0.8 else "medium",
            )
        )
    return found
2156
+
2157
+
2158
def _rule_smart_ptr_cycle(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Emit a coarse warning about possible shared_ptr reference cycles.

    This is not a cycle analysis. It counts the distinct names captured by
    RE_SMART_PTR_ASSIGN on lines where RE_SHARED_PTR matches; when more than
    three such names exist and RE_WEAK_PTR matches nowhere in the input, a
    single "smart_ptr_cycle_risk" issue is reported on the first line where
    RE_SHARED_PTR matches.

    The input is scanned once. A nested loop that previously re-searched each
    line for every collected name and then did nothing has been removed; the
    reported result is unchanged.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in the issue's ``file`` field.

    Returns:
        A list holding zero or one medium-severity issue.
    """
    shared_ptr_vars: set[str] = set()
    first_shared = None  # (line number, text) of the first shared_ptr line
    has_weak_ptr = False

    for idx, s in enumerate(lines, start=1):
        if RE_SHARED_PTR.search(s):
            if first_shared is None:
                first_shared = (idx, s)
            m = RE_SMART_PTR_ASSIGN.search(s)
            if m:
                shared_ptr_vars.add(m.group(1))
        if not has_weak_ptr and RE_WEAK_PTR.search(s):
            has_weak_ptr = True

    # More than three names implies first_shared is set; the explicit None
    # test only keeps the unpacking below obviously safe.
    if len(shared_ptr_vars) <= 3 or has_weak_ptr or first_shared is None:
        return []

    line_no, text = first_shared
    return [
        Issue(
            language="c/cpp",
            category="memory_mgmt",
            pattern="smart_ptr_cycle_risk",
            file=relpath,
            line=line_no,
            evidence=_strip_line(text),
            description="检测到多个 shared_ptr 使用但未见 weak_ptr,可能存在循环引用导致内存泄漏的风险。",
            suggestion="检查对象间的引用关系,必要时使用 weak_ptr 打破循环引用;考虑使用 unique_ptr 替代 shared_ptr 以明确所有权。",
            confidence=0.5,
            severity="medium",
        )
    ]
2212
+
2213
+
2214
def _rule_cpp_deadlock_patterns(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Heuristic deadlock checks for C++ standard-library mutexes.

    Only names captured by RE_STD_MUTEX are treated as mutexes. Findings are
    produced in this order:

    1. While scanning lines top to bottom with a list of currently held
       mutexes:
       - "cpp_double_lock": a mutex is locked while already in the list;
       - "cpp_multiple_lock_unsafe": a lock call occurs while more than one
         mutex is held and neither the line nor the three lines before it
         match RE_STD_LOCK / RE_SCOPED_LOCK.
    2. "cpp_lock_order_inversion": both (A -> B) and (B -> A) acquisition
       orders were observed; one issue is emitted for each direction.
    3. "cpp_missing_unlock_suspect": a lock call with no matching unlock in
       the next 50 lines, unless an RAII lock type appears near the call or
       shortly before a closing brace inside that window.

    The checks are text-based and may produce false positives.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        The issues found, grouped in the order listed above.
    """
    total = len(lines)

    def _joined(first: int, last: int) -> str:
        # Text of lines first..last (inclusive, 1-based) joined by spaces.
        return " ".join(_safe_line(lines, n) for n in range(first, last + 1))

    def _finding(pattern, lineno, text, description, suggestion, confidence, severity):
        return Issue(
            language="c/cpp",
            category="error_handling",
            pattern=pattern,
            file=relpath,
            line=lineno,
            evidence=_strip_line(text),
            description=description,
            suggestion=suggestion,
            confidence=confidence,
            severity=severity,
        )

    found: List[Issue] = []
    mutex_names = {hit.group(1) for hit in map(RE_STD_MUTEX.search, lines) if hit}

    held: list[str] = []  # mutexes currently considered locked
    first_seen_order: dict[tuple[str, str], int] = {}  # (outer, inner) -> first line

    # --- Phase 1: walk lock()/unlock() calls in file order -----------------
    for lineno, text in enumerate(lines, start=1):
        acquire = RE_MUTEX_LOCK.search(text)
        if acquire and acquire.group(1) in mutex_names:
            name = acquire.group(1)
            if name in held:
                found.append(
                    _finding(
                        "cpp_double_lock",
                        lineno,
                        text,
                        f"mutex {name} 在未解锁的情况下被再次加锁,存在死锁风险。",
                        "避免对同一 mutex 重复加锁;考虑使用 std::recursive_mutex 或重构代码避免嵌套加锁。",
                        0.8,
                        "high",
                    )
                )
            if held and held[-1] != name:
                # Keep only the first line on which each ordering was seen.
                first_seen_order.setdefault((held[-1], name), lineno)
            held.append(name)

        release = RE_MUTEX_UNLOCK.search(text)
        if release:
            name = release.group(1)
            if name in mutex_names and name in held:
                # Remove the most recently acquired entry for this mutex.
                last_pos = len(held) - 1 - held[::-1].index(name)
                del held[last_pos]

        # Manual acquisition of several mutexes without std::lock/scoped_lock.
        if not acquire or len(held) <= 1:
            continue
        if RE_STD_LOCK.search(text) or RE_SCOPED_LOCK.search(text):
            continue
        if lineno <= 1:
            continue
        preceding = _joined(max(1, lineno - 3), lineno - 1)
        if RE_STD_LOCK.search(preceding) or RE_SCOPED_LOCK.search(preceding):
            continue
        found.append(
            _finding(
                "cpp_multiple_lock_unsafe",
                lineno,
                text,
                "检测到手动锁定多个 mutex 但未使用 std::lock 或 std::scoped_lock,存在死锁风险。",
                "使用 std::lock 或 std::scoped_lock 同时锁定多个 mutex,可避免死锁;或统一加锁顺序。",
                0.65,
                "high",
            )
        )

    # --- Phase 2: lock-order inversion --------------------------------------
    for outer, inner in first_seen_order:
        reverse_line = first_seen_order.get((inner, outer))
        if reverse_line is None:
            continue
        # Anchored at the line where the opposite ordering was first seen.
        found.append(
            _finding(
                "cpp_lock_order_inversion",
                reverse_line,
                _safe_line(lines, reverse_line),
                f"检测到 mutex 加锁顺序反转:({outer} -> {inner}) 与 ({inner} -> {outer}),存在死锁风险。",
                "统一多锁的获取顺序,制定全局锁等级;或使用 std::lock/scoped_lock 避免死锁。",
                0.7,
                "high",
            )
        )

    # --- Phase 3: lock() without a matching unlock() within 50 lines --------
    for lineno, text in enumerate(lines, start=1):
        acquire = RE_MUTEX_LOCK.search(text)
        if not acquire:
            continue
        name = acquire.group(1)
        if name not in mutex_names:
            continue

        # An RAII lock type on this line or the next three: skip the call.
        nearby = _joined(lineno, min(lineno + 3, total))
        if (
            RE_LOCK_GUARD.search(nearby)
            or RE_UNIQUE_LOCK.search(nearby)
            or RE_SHARED_LOCK.search(nearby)
        ):
            continue

        for follow in range(lineno + 1, min(total, lineno + 50) + 1):
            candidate = _safe_line(lines, follow)
            release = RE_MUTEX_UNLOCK.search(candidate)
            if release and release.group(1) == name:
                break
            if "}" in candidate:
                # A closing brace shortly after a lock_guard/unique_lock is
                # accepted as an implicit release.
                scope = _joined(max(1, follow - 5), follow - 1)
                if RE_LOCK_GUARD.search(scope) or RE_UNIQUE_LOCK.search(scope):
                    break
        else:
            found.append(
                _finding(
                    "cpp_missing_unlock_suspect",
                    lineno,
                    text,
                    f"在 mutex {name} 调用 lock() 之后的邻近窗口内未检测到匹配 unlock(),可能存在缺失解锁的风险。",
                    "确保所有 lock() 路径都有配对的 unlock();考虑使用 std::lock_guard 或 std::unique_lock(RAII)自动管理锁生命周期。",
                    0.55,
                    "medium",
                )
            )

    return found
2374
+
2375
+
2376
def _rule_data_race_suspect(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Heuristic detection of possible data races.

    Pass 1 collects:
      - shared variables: names captured by RE_STATIC_VAR / RE_EXTERN_VAR on
        lines without "const", plus simple ``type name =|;`` declarations on
        the first line or directly after a line ending in ``}`` (excluding
        lines containing "const" or "static");
      - thread-creation lines (RE_PTHREAD_CREATE or RE_STD_THREAD);
      - atomic names (RE_ATOMIC) and volatile names (RE_VOLATILE).
    If no thread creation is seen, nothing is reported.

    Pass 2 reports "data_race_suspect" for each non-atomic shared variable
    mentioned on a line when no lock call appears within 5 lines either side,
    or when the nearest lock call in the preceding 10 lines is followed by an
    unlock before the access.

    Pass 3 reports "volatile_not_threadsafe" for non-atomic volatile names
    used near thread creation (within 3 lines by text, or fewer than 20 lines
    from a recorded creation line) with no lock call within 3 lines.

    The lock search now walks backwards from the access, so it really finds
    the nearest lock; scanning forwards picked the earliest one, and an older
    lock/unlock pair could hide a later lock that was still held. Per-line
    lock context is computed once instead of once per variable, and variables
    are visited in sorted order so output order is stable between runs.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        All "data_race_suspect" issues followed by all
        "volatile_not_threadsafe" issues.
    """
    issues: List[Issue] = []
    shared_vars: set[str] = set()
    thread_creation_lines: list[int] = []
    atomic_vars: set[str] = set()
    volatile_vars: set[str] = set()

    def _any_match(patterns, text: str) -> bool:
        return any(p.search(text) is not None for p in patterns)

    global_decl = re.compile(r"^[A-Za-z_]\w*(?:\s+\*|\s+)+([A-Za-z_]\w*)\s*[=;]")

    # ---- Pass 1: collect shared / atomic / volatile names and thread sites --
    for idx, s in enumerate(lines, start=1):
        lowered = s.lower()
        not_const = "const" not in lowered  # const data is treated as read-only

        m_static = RE_STATIC_VAR.search(s)
        if m_static and not_const:
            shared_vars.add(m_static.group(1))

        m_extern = RE_EXTERN_VAR.search(s)
        if m_extern and not_const:
            shared_vars.add(m_extern.group(1))

        # First line, or directly after a line ending in "}": treated as a
        # likely file-scope position.
        if idx == 1 or _safe_line(lines, idx - 1).strip().endswith("}"):
            m_global = global_decl.search(s)
            if m_global and not_const and "static" not in lowered:
                shared_vars.add(m_global.group(1))

        if RE_PTHREAD_CREATE.search(s) or RE_STD_THREAD.search(s):
            thread_creation_lines.append(idx)

        m_atomic = RE_ATOMIC.search(s)
        if m_atomic:
            atomic_vars.add(m_atomic.group(1))

        m_volatile = RE_VOLATILE.search(s)
        if m_volatile:
            volatile_vars.add(m_volatile.group(1))

    # Without any thread creation the file is assumed to be single-threaded.
    if not thread_creation_lines:
        return issues

    window_lock_res = (
        RE_PTHREAD_LOCK,
        RE_MUTEX_LOCK,
        RE_LOCK_GUARD,
        RE_UNIQUE_LOCK,
        RE_SHARED_LOCK,
    )
    scope_lock_res = (RE_PTHREAD_LOCK, RE_MUTEX_LOCK, RE_LOCK_GUARD, RE_UNIQUE_LOCK)
    unlock_res = (RE_PTHREAD_UNLOCK, RE_MUTEX_UNLOCK)

    # ---- Pass 2: unprotected access to shared variables ---------------------
    shared_patterns = [
        (name, re.compile(rf"\b{re.escape(name)}\b"))
        for name in sorted(shared_vars - atomic_vars)  # atomics are skipped
    ]
    for idx, s in enumerate(lines, start=1):
        touched = [name for name, pat in shared_patterns if pat.search(s)]
        if not touched:
            continue

        # The lock context depends only on the line, not on the variable.
        window_text = " ".join(t for _, t in _window(lines, idx, before=5, after=5))
        has_lock = _any_match(window_lock_res, window_text)

        # Nearest lock call in the 10 lines before the access.
        lock_line = None
        for j in range(idx - 1, max(1, idx - 10) - 1, -1):
            if _any_match(scope_lock_res, _safe_line(lines, j)):
                lock_line = j
                break

        # Was that lock released again before the access?
        unlocked = lock_line is not None and any(
            _any_match(unlock_res, _safe_line(lines, j))
            for j in range(lock_line + 1, idx)
        )

        if has_lock and not unlocked:
            continue

        assign_hit = RE_VAR_ASSIGN.search(s)
        lhs = s[: s.find("=")]
        in_call = "(" in s and ")" in s

        for var in touched:
            conf = 0.6
            if assign_hit and var in lhs:
                conf += 0.15  # writes are riskier than reads
            if var in volatile_vars:
                conf += 0.1  # volatile is often mistaken for synchronisation
            if in_call:
                conf -= 0.1  # possibly just an argument or a local of the same name

            issues.append(
                Issue(
                    language="c/cpp",
                    category="concurrency",
                    pattern="data_race_suspect",
                    file=relpath,
                    line=idx,
                    evidence=_strip_line(s),
                    description=f"共享变量 {var} 在多线程环境下访问但未见明确的锁保护,可能存在数据竞争风险。",
                    suggestion="使用互斥锁保护共享变量访问;或使用原子操作(std::atomic)进行无锁编程;注意 volatile 不能保证线程安全。",
                    confidence=min(conf, 0.85),
                    severity="high" if conf >= 0.7 else "medium",
                )
            )

    # ---- Pass 3: volatile used as if it were a synchronisation primitive ----
    volatile_patterns = [
        (name, re.compile(rf"\b{re.escape(name)}\b"))
        for name in sorted(volatile_vars - atomic_vars)
    ]
    volatile_lock_res = (RE_PTHREAD_LOCK, RE_MUTEX_LOCK, RE_LOCK_GUARD)
    thread_res = (RE_PTHREAD_CREATE, RE_STD_THREAD)
    for idx, s in enumerate(lines, start=1):
        touched = [name for name, pat in volatile_patterns if pat.search(s)]
        if not touched:
            continue

        window_text = " ".join(t for _, t in _window(lines, idx, before=3, after=3))
        has_thread = _any_match(thread_res, window_text) or any(
            abs(j - idx) < 20 for j in thread_creation_lines
        )
        if not has_thread or _any_match(volatile_lock_res, window_text):
            continue

        for var in touched:
            issues.append(
                Issue(
                    language="c/cpp",
                    category="concurrency",
                    pattern="volatile_not_threadsafe",
                    file=relpath,
                    line=idx,
                    evidence=_strip_line(s),
                    description=f"volatile 变量 {var} 在多线程环境下使用,但 volatile 不能保证线程安全,可能存在数据竞争。",
                    suggestion="volatile 仅防止编译器优化,不能保证原子性或内存可见性;使用 std::atomic 或互斥锁保护共享变量。",
                    confidence=0.7,
                    severity="high",
                )
            )

    return issues
2548
+
2549
+
2550
def _rule_smart_ptr_get_unsafe(lines: Sequence[str], relpath: str) -> List[Issue]:
    """
    Flag ``.get()`` calls on variables believed to be smart pointers.

    Candidate names come from RE_SMART_PTR_ASSIGN matches and, on lines where
    RE_SHARED_PTR / RE_UNIQUE_PTR / RE_WEAK_PTR matches, from the first
    identifier directly followed by ``=`` or ``;``. A line calling
    ``name.get(`` for a candidate is reported once. Confidence is 0.65,
    raised by 0.1 when the line contains ``=`` or the ``.get(...)`` call is
    directly followed by ``=``, ``,`` or ``(``; the result is capped at 0.8.

    Args:
        lines: Source lines to scan; reported line numbers are 1-based.
        relpath: Path stored in each issue's ``file`` field.

    Returns:
        High-severity "smart_ptr_get_unsafe" issues, at most one per line.
    """
    declared_name = re.compile(r"\b([A-Za-z_]\w*)\s*(?:=|;)")
    result_forwarded = re.compile(r"\.get\s*\([^)]*\)\s*[=,\(]")

    # Pass 1: gather candidate smart-pointer names.
    smart_ptr_vars: set[str] = set()
    for text in lines:
        assigned = RE_SMART_PTR_ASSIGN.search(text)
        if assigned:
            smart_ptr_vars.add(assigned.group(1))
        mentions_smart_ptr = (
            RE_SHARED_PTR.search(text)
            or RE_UNIQUE_PTR.search(text)
            or RE_WEAK_PTR.search(text)
        )
        if mentions_smart_ptr:
            declared = declared_name.search(text)
            if declared:
                smart_ptr_vars.add(declared.group(1))

    # One ".get(" matcher per name, kept in the set's own iteration order.
    get_calls = [
        (name, re.compile(rf"\b{re.escape(name)}\s*\.get\s*\(", re.IGNORECASE))
        for name in smart_ptr_vars
    ]

    # Pass 2: report .get() calls.
    found: List[Issue] = []
    for lineno, text in enumerate(lines, start=1):
        caller = next((name for name, pat in get_calls if pat.search(text)), None)
        if caller is None:
            continue
        score = 0.65
        if "=" in text or result_forwarded.search(text):
            # The raw pointer looks stored or passed on.
            score += 0.1
        found.append(
            Issue(
                language="c/cpp",
                category="memory_mgmt",
                pattern="smart_ptr_get_unsafe",
                file=relpath,
                line=lineno,
                evidence=_strip_line(text),
                description=f"智能指针 {caller} 使用 .get() 方法获取原始指针,若智能指针生命周期结束,原始指针将悬空。",
                suggestion="避免存储 .get() 返回的原始指针;若必须使用,确保智能指针的生命周期覆盖原始指针的使用期。",
                confidence=min(score, 0.8),
                severity="high",
            )
        )
    return found
2593
+
2594
+
2595
def analyze_c_cpp_text(relpath: str, text: str) -> List[Issue]:
    """
    Run every C/C++ heuristic rule over the given source text.

    The text is preprocessed in three steps: ``_strip_if0_blocks``, then
    ``_remove_comments_preserve_strings``, then ``_mask_strings_preserve_len``.
    Two line views are derived:

    - literal lines (comments removed, string contents kept) for the rules
      that need to read string literals (strncpy, format string, scanf width);
    - masked lines (string contents masked as well) for all other rules, so
      that text inside string literals is not mistaken for code.

    Args:
        relpath: Path stored in each issue's ``file`` field.
        text: Full source text of one C/C++ file.

    Returns:
        The concatenated issues of all rules, in rule execution order.
    """
    without_comments = _remove_comments_preserve_strings(_strip_if0_blocks(text))
    literal_lines = without_comments.splitlines()
    masked_lines = _mask_strings_preserve_len(without_comments).splitlines()

    # Generic API / keyword rules (masked view).
    api_rules = (
        _rule_unsafe_api,
        _rule_boundary_funcs,
        _rule_realloc_assign_back,
        _rule_malloc_no_null_check,
        _rule_unchecked_io,
    )
    # Rules that must see string-literal contents (literal view).
    literal_rules = (
        _rule_strncpy_no_nullterm,
        _rule_format_string,
        _rule_scanf_no_width,
    )
    # Everything else (masked view): remaining C rules, the heavier semantic
    # rules, the C++-specific rules, then C++ deadlock and data-race checks.
    remaining_rules = (
        _rule_insecure_tmpfile,
        _rule_command_execution,
        _rule_alloc_size_overflow,
        _rule_double_free_and_free_non_heap,
        _rule_atoi_family,
        _rule_rand_insecure,
        _rule_strtok_nonreentrant,
        _rule_open_permissive_perms,
        _rule_alloca_unbounded,
        _rule_vla_usage,
        _rule_pthread_returns_unchecked,
        _rule_cond_wait_no_loop,
        _rule_thread_leak_no_join,
        _rule_inet_legacy,
        _rule_time_apis_not_threadsafe,
        _rule_getenv_unchecked,
        _rule_uaf_suspect,
        _rule_possible_null_deref,
        _rule_uninitialized_ptr_use,
        _rule_deadlock_patterns,
        _rule_new_delete_mismatch,
        _rule_reinterpret_cast_unsafe,
        _rule_const_cast_unsafe,
        _rule_vector_string_bounds_check,
        _rule_missing_virtual_dtor,
        _rule_move_after_use,
        _rule_uncaught_exception,
        _rule_smart_ptr_cycle,
        _rule_smart_ptr_get_unsafe,
        _rule_cpp_deadlock_patterns,
        _rule_data_race_suspect,
    )

    collected: List[Issue] = []
    schedule = (
        (api_rules, masked_lines),
        (literal_rules, literal_lines),
        (remaining_rules, masked_lines),
    )
    for rules, view in schedule:
        for rule in rules:
            collected.extend(rule(view, relpath))
    return collected
2658
+
2659
+
2660
def analyze_c_cpp_file(base: Path, relpath: Path) -> List[Issue]:
    """
    Read one file from disk and analyse it.

    The file is decoded as UTF-8 with undecodable bytes dropped, so the
    result does not depend on the process locale. Whatever the file's real
    encoding, ASCII bytes (braces, quotes, backslashes) pass through
    unchanged; a multibyte locale codec can instead absorb such a byte as
    part of a character.

    Args:
        base: Directory that ``relpath`` is resolved against.
        relpath: File path relative to ``base``; its string form becomes the
            ``file`` field of every issue.

    Returns:
        The issues reported by ``analyze_c_cpp_text``, or an empty list when
        the file cannot be read.
    """
    try:
        text = (base / relpath).read_text(encoding="utf-8", errors="ignore")
    except Exception:
        # Deliberate best effort: an unreadable file yields no findings
        # instead of aborting a batch run.
        return []
    return analyze_c_cpp_text(str(relpath), text)
2669
+
2670
+
2671
def analyze_files(base_path: str, files: Iterable[str]) -> List[Issue]:
    """
    Analyse several files and concatenate their issues.

    Args:
        base_path: Directory the entries of ``files`` are relative to; it is
            resolved to an absolute path once before any file is read.
        files: Relative file paths, analysed in iteration order.

    Returns:
        The issues of all files, in the order the files were given.
    """
    root = Path(base_path).resolve()
    return [
        issue
        for name in files
        for issue in analyze_c_cpp_file(root, Path(name))
    ]