jarvis-ai-assistant 0.1.222__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/__init__.py +1143 -245
- jarvis/jarvis_agent/agent_manager.py +97 -0
- jarvis/jarvis_agent/builtin_input_handler.py +12 -10
- jarvis/jarvis_agent/config_editor.py +57 -0
- jarvis/jarvis_agent/edit_file_handler.py +392 -99
- jarvis/jarvis_agent/event_bus.py +48 -0
- jarvis/jarvis_agent/events.py +157 -0
- jarvis/jarvis_agent/file_context_handler.py +79 -0
- jarvis/jarvis_agent/file_methodology_manager.py +117 -0
- jarvis/jarvis_agent/jarvis.py +1117 -147
- jarvis/jarvis_agent/main.py +78 -34
- jarvis/jarvis_agent/memory_manager.py +195 -0
- jarvis/jarvis_agent/methodology_share_manager.py +174 -0
- jarvis/jarvis_agent/prompt_manager.py +82 -0
- jarvis/jarvis_agent/prompts.py +46 -9
- jarvis/jarvis_agent/protocols.py +4 -1
- jarvis/jarvis_agent/rewrite_file_handler.py +141 -0
- jarvis/jarvis_agent/run_loop.py +146 -0
- jarvis/jarvis_agent/session_manager.py +9 -9
- jarvis/jarvis_agent/share_manager.py +228 -0
- jarvis/jarvis_agent/shell_input_handler.py +23 -3
- jarvis/jarvis_agent/stdio_redirect.py +295 -0
- jarvis/jarvis_agent/task_analyzer.py +212 -0
- jarvis/jarvis_agent/task_manager.py +154 -0
- jarvis/jarvis_agent/task_planner.py +496 -0
- jarvis/jarvis_agent/tool_executor.py +8 -4
- jarvis/jarvis_agent/tool_share_manager.py +139 -0
- jarvis/jarvis_agent/user_interaction.py +42 -0
- jarvis/jarvis_agent/utils.py +54 -0
- jarvis/jarvis_agent/web_bridge.py +189 -0
- jarvis/jarvis_agent/web_output_sink.py +53 -0
- jarvis/jarvis_agent/web_server.py +751 -0
- jarvis/jarvis_c2rust/__init__.py +26 -0
- jarvis/jarvis_c2rust/cli.py +613 -0
- jarvis/jarvis_c2rust/collector.py +258 -0
- jarvis/jarvis_c2rust/library_replacer.py +1122 -0
- jarvis/jarvis_c2rust/llm_module_agent.py +1300 -0
- jarvis/jarvis_c2rust/optimizer.py +960 -0
- jarvis/jarvis_c2rust/scanner.py +1681 -0
- jarvis/jarvis_c2rust/transpiler.py +2325 -0
- jarvis/jarvis_code_agent/build_validation_config.py +133 -0
- jarvis/jarvis_code_agent/code_agent.py +1605 -178
- jarvis/jarvis_code_agent/code_analyzer/__init__.py +62 -0
- jarvis/jarvis_code_agent/code_analyzer/base_language.py +74 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/__init__.py +44 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/base.py +102 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/cmake.py +59 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/detector.py +125 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/fallback.py +69 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/go.py +38 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/java_gradle.py +44 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/java_maven.py +38 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/makefile.py +50 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/nodejs.py +93 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/python.py +129 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/rust.py +54 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/validator.py +154 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator.py +43 -0
- jarvis/jarvis_code_agent/code_analyzer/context_manager.py +363 -0
- jarvis/jarvis_code_agent/code_analyzer/context_recommender.py +18 -0
- jarvis/jarvis_code_agent/code_analyzer/dependency_analyzer.py +132 -0
- jarvis/jarvis_code_agent/code_analyzer/file_ignore.py +330 -0
- jarvis/jarvis_code_agent/code_analyzer/impact_analyzer.py +781 -0
- jarvis/jarvis_code_agent/code_analyzer/language_registry.py +185 -0
- jarvis/jarvis_code_agent/code_analyzer/language_support.py +89 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/__init__.py +31 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/c_cpp_language.py +231 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/go_language.py +183 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/python_language.py +219 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/rust_language.py +209 -0
- jarvis/jarvis_code_agent/code_analyzer/llm_context_recommender.py +451 -0
- jarvis/jarvis_code_agent/code_analyzer/symbol_extractor.py +77 -0
- jarvis/jarvis_code_agent/code_analyzer/tree_sitter_extractor.py +48 -0
- jarvis/jarvis_code_agent/lint.py +275 -13
- jarvis/jarvis_code_agent/utils.py +142 -0
- jarvis/jarvis_code_analysis/checklists/loader.py +20 -6
- jarvis/jarvis_code_analysis/code_review.py +583 -548
- jarvis/jarvis_data/config_schema.json +339 -28
- jarvis/jarvis_git_squash/main.py +22 -13
- jarvis/jarvis_git_utils/git_commiter.py +171 -55
- jarvis/jarvis_mcp/sse_mcp_client.py +22 -15
- jarvis/jarvis_mcp/stdio_mcp_client.py +4 -4
- jarvis/jarvis_mcp/streamable_mcp_client.py +36 -16
- jarvis/jarvis_memory_organizer/memory_organizer.py +753 -0
- jarvis/jarvis_methodology/main.py +48 -63
- jarvis/jarvis_multi_agent/__init__.py +302 -43
- jarvis/jarvis_multi_agent/main.py +70 -24
- jarvis/jarvis_platform/ai8.py +40 -23
- jarvis/jarvis_platform/base.py +210 -49
- jarvis/jarvis_platform/human.py +11 -1
- jarvis/jarvis_platform/kimi.py +82 -76
- jarvis/jarvis_platform/openai.py +73 -1
- jarvis/jarvis_platform/registry.py +8 -15
- jarvis/jarvis_platform/tongyi.py +115 -101
- jarvis/jarvis_platform/yuanbao.py +89 -63
- jarvis/jarvis_platform_manager/main.py +194 -132
- jarvis/jarvis_platform_manager/service.py +122 -86
- jarvis/jarvis_rag/cli.py +156 -53
- jarvis/jarvis_rag/embedding_manager.py +155 -12
- jarvis/jarvis_rag/llm_interface.py +10 -13
- jarvis/jarvis_rag/query_rewriter.py +63 -12
- jarvis/jarvis_rag/rag_pipeline.py +222 -40
- jarvis/jarvis_rag/reranker.py +26 -3
- jarvis/jarvis_rag/retriever.py +270 -14
- jarvis/jarvis_sec/__init__.py +3605 -0
- jarvis/jarvis_sec/checkers/__init__.py +32 -0
- jarvis/jarvis_sec/checkers/c_checker.py +2680 -0
- jarvis/jarvis_sec/checkers/rust_checker.py +1108 -0
- jarvis/jarvis_sec/cli.py +116 -0
- jarvis/jarvis_sec/report.py +257 -0
- jarvis/jarvis_sec/status.py +264 -0
- jarvis/jarvis_sec/types.py +20 -0
- jarvis/jarvis_sec/workflow.py +219 -0
- jarvis/jarvis_smart_shell/main.py +405 -137
- jarvis/jarvis_stats/__init__.py +13 -0
- jarvis/jarvis_stats/cli.py +387 -0
- jarvis/jarvis_stats/stats.py +711 -0
- jarvis/jarvis_stats/storage.py +612 -0
- jarvis/jarvis_stats/visualizer.py +282 -0
- jarvis/jarvis_tools/ask_user.py +1 -0
- jarvis/jarvis_tools/base.py +18 -2
- jarvis/jarvis_tools/clear_memory.py +239 -0
- jarvis/jarvis_tools/cli/main.py +220 -144
- jarvis/jarvis_tools/execute_script.py +52 -12
- jarvis/jarvis_tools/file_analyzer.py +17 -12
- jarvis/jarvis_tools/generate_new_tool.py +46 -24
- jarvis/jarvis_tools/read_code.py +277 -18
- jarvis/jarvis_tools/read_symbols.py +141 -0
- jarvis/jarvis_tools/read_webpage.py +86 -13
- jarvis/jarvis_tools/registry.py +294 -90
- jarvis/jarvis_tools/retrieve_memory.py +227 -0
- jarvis/jarvis_tools/save_memory.py +194 -0
- jarvis/jarvis_tools/search_web.py +62 -28
- jarvis/jarvis_tools/sub_agent.py +205 -0
- jarvis/jarvis_tools/sub_code_agent.py +217 -0
- jarvis/jarvis_tools/virtual_tty.py +330 -62
- jarvis/jarvis_utils/builtin_replace_map.py +4 -5
- jarvis/jarvis_utils/clipboard.py +90 -0
- jarvis/jarvis_utils/config.py +607 -50
- jarvis/jarvis_utils/embedding.py +3 -0
- jarvis/jarvis_utils/fzf.py +57 -0
- jarvis/jarvis_utils/git_utils.py +251 -29
- jarvis/jarvis_utils/globals.py +174 -17
- jarvis/jarvis_utils/http.py +58 -79
- jarvis/jarvis_utils/input.py +899 -153
- jarvis/jarvis_utils/methodology.py +210 -83
- jarvis/jarvis_utils/output.py +220 -137
- jarvis/jarvis_utils/utils.py +1906 -135
- jarvis_ai_assistant-0.7.0.dist-info/METADATA +465 -0
- jarvis_ai_assistant-0.7.0.dist-info/RECORD +192 -0
- {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/entry_points.txt +8 -2
- jarvis/jarvis_git_details/main.py +0 -265
- jarvis/jarvis_platform/oyi.py +0 -357
- jarvis/jarvis_tools/edit_file.py +0 -255
- jarvis/jarvis_tools/rewrite_file.py +0 -195
- jarvis_ai_assistant-0.1.222.dist-info/METADATA +0 -767
- jarvis_ai_assistant-0.1.222.dist-info/RECORD +0 -110
- /jarvis/{jarvis_git_details → jarvis_memory_organizer}/__init__.py +0 -0
- {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,3605 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Jarvis 安全分析套件
|
|
4
|
+
|
|
5
|
+
当前版本概述:
|
|
6
|
+
- 关键路径:直扫(direct_scan)→ 单Agent逐条验证(只读工具:read_code/execute_script)→ 聚合输出(JSON + Markdown)
|
|
7
|
+
- 目标范围:内存管理、缓冲区操作、错误处理等基础安全问题识别
|
|
8
|
+
- 约束:不修改核心框架文件,保持最小侵入;严格只读分析
|
|
9
|
+
|
|
10
|
+
集成方式:
|
|
11
|
+
- 复用 jarvis.jarvis_agent.Agent 与工具注册系统(jarvis.jarvis_tools.registry.ToolRegistry)
|
|
12
|
+
- 提供入口:
|
|
13
|
+
- run_security_analysis(entry_path, ...):直扫 + 单Agent逐条验证 + 聚合
|
|
14
|
+
|
|
15
|
+
- workflow.direct_scan(entry_path, ...):仅启发式直扫
|
|
16
|
+
|
|
17
|
+
说明:
|
|
18
|
+
- 已移除 MultiAgent 编排与相关提示词;不存在“阶段一”等表述
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from typing import Dict, List, Optional
|
|
22
|
+
|
|
23
|
+
import typer
|
|
24
|
+
|
|
25
|
+
from jarvis.jarvis_agent import Agent
|
|
26
|
+
from jarvis.jarvis_sec.workflow import direct_scan, run_with_agent
|
|
27
|
+
from jarvis.jarvis_tools.registry import ToolRegistry
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _build_summary_prompt() -> str:
    """
    Build the summary prompt used for the single-issue verification sub-task.

    The returned text instructs the model to emit only a YAML array wrapped in
    <REPORT>...</REPORT>. As the original note states, the system prompt does
    not dictate the output format of the main dialogue; the structured result
    is requested in the summary only.

    Returns the prompt with leading/trailing whitespace stripped.
    """
    # NOTE(review): in the extracted SOURCE every line of this literal appears
    # flush-left; any nested indentation (e.g. YAML keys under "- gid:") was
    # not visible here -- confirm against the packaged file.
    return """
请将本轮"安全子任务(单点验证)"的结构化结果仅放入以下标记中,并使用 YAML 数组对象形式输出。
仅输出全局编号(gid)与详细理由(不含位置信息),gid 为全局唯一的数字编号。

示例1:有告警的情况(has_risk: true,单个gid)
<REPORT>
- gid: 1
has_risk: true
preconditions: "输入字符串 src 的长度大于等于 dst 的缓冲区大小"
trigger_path: "调用路径推导:main() -> handle_network_request() -> parse_packet() -> foobar() -> strcpy()。数据流:网络数据包通过 handle_network_request() 接收,传递给 parse_packet() 解析,parse_packet() 未对数据长度进行校验,直接将 src 传递给 foobar(),foobar() 调用 strcpy(dst, src) 时未检查 src 长度,可导致缓冲区溢出。关键调用点:parse_packet() 函数未对输入长度进行校验。"
consequences: "缓冲区溢出,可能引发程序崩溃或任意代码执行"
suggestions: "使用 strncpy_s 或其他安全的字符串复制函数"
</REPORT>

示例2:有告警的情况(has_risk: true,多个gid合并,路径和原因一致)
<REPORT>
- gids: [1, 2, 3]
has_risk: true
preconditions: "输入字符串 src 的长度大于等于 dst 的缓冲区大小"
trigger_path: "调用路径推导:main() -> handle_network_request() -> parse_packet() -> foobar() -> strcpy()。数据流:网络数据包通过 handle_network_request() 接收,传递给 parse_packet() 解析,parse_packet() 未对数据长度进行校验,直接将 src 传递给 foobar(),foobar() 调用 strcpy(dst, src) 时未检查 src 长度,可导致缓冲区溢出。关键调用点:parse_packet() 函数未对输入长度进行校验。"
consequences: "缓冲区溢出,可能引发程序崩溃或任意代码执行"
suggestions: "使用 strncpy_s 或其他安全的字符串复制函数"
</REPORT>

示例3:误报或无问题(返回空数组)
<REPORT>
[]
</REPORT>

要求:
- 只能在 <REPORT> 与 </REPORT> 中输出 YAML 数组,且不得出现其他文本。
- 若确认本批次全部为误报或无问题,请返回空数组 []。
- 数组元素为对象,包含字段:
- gid: 整数(全局唯一编号,单个告警时使用)
- gids: 整数数组(全局唯一编号数组,多个告警合并时使用)
- has_risk: 布尔值 (true/false),表示该项是否存在真实安全风险。
- preconditions: 字符串(触发漏洞的前置条件,仅当 has_risk 为 true 时必需)
- trigger_path: 字符串(漏洞的触发路径,必须包含完整的调用路径推导,包括:1) 可控输入的来源;2) 从输入源到缺陷代码的完整调用链(函数调用序列);3) 每个调用点的数据校验情况;4) 触发条件。格式示例:"调用路径推导:函数A() -> 函数B() -> 函数C() -> 缺陷代码。数据流:输入来源 -> 传递路径。关键调用点:函数B()未做校验。",仅当 has_risk 为 true 时必需)
- consequences: 字符串(漏洞被触发后可能导致的后果,仅当 has_risk 为 true 时必需)
- suggestions: 字符串(修复或缓解该漏洞的建议,仅当 has_risk 为 true 时必需)
- **合并格式优化**:如果多个告警(gid)的路径(trigger_path)、原因(preconditions/consequences/suggestions)完全一致,可以使用 gids 数组格式合并输出,减少重复内容。单个告警使用 gid,多个告警合并使用 gids。gid 和 gids 不能同时出现。
- 不要在数组元素中包含 file/line/pattern 等位置信息;写入 jsonl 时系统会结合原始候选信息。
- **关键**:仅当 `has_risk` 为 `true` 时,才会被记录为确认的问题。对于确认是误报的条目,请确保 `has_risk` 为 `false` 或不输出该条目。
- **输出格式**:有告警的条目必须包含所有字段(gid 或 gids, has_risk, preconditions, trigger_path, consequences, suggestions);无告警的条目只需包含 gid 和 has_risk。
- **调用路径推导要求**:trigger_path 字段必须包含完整的调用路径推导,不能省略或简化。必须明确说明从可控输入到缺陷代码的完整调用链,以及每个调用点的校验情况。如果无法推导出完整的调用路径,应该判定为误报(has_risk: false)。
""".strip()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _build_verification_summary_prompt() -> str:
    """
    Build the summary prompt for the verification agent.

    The returned text asks the model to judge whether the conclusions produced
    by the analysis agent are correct and to report, per gid (or merged gids),
    an ``is_valid`` flag plus ``verification_notes`` as a YAML array wrapped in
    <REPORT>...</REPORT>.

    Returns the prompt with leading/trailing whitespace stripped.
    """
    # NOTE(review): in the extracted SOURCE every line of this literal appears
    # flush-left; any nested indentation inside the YAML examples was not
    # visible here -- confirm against the packaged file.
    return """
请将本轮"验证分析结论"的结构化结果仅放入以下标记中,并使用 YAML 数组对象形式输出。
你需要验证分析 Agent 给出的结论是否正确,包括前置条件、触发路径、后果和建议是否合理。

示例1:验证通过(is_valid: true,单个gid)
<REPORT>
- gid: 1
is_valid: true
verification_notes: "分析结论正确,前置条件合理,触发路径清晰,后果评估准确"
</REPORT>

示例2:验证通过(is_valid: true,多个gid合并)
<REPORT>
- gids: [1, 2, 3]
is_valid: true
verification_notes: "分析结论正确,前置条件合理,触发路径清晰,后果评估准确"
</REPORT>

示例3:验证不通过(is_valid: false)
<REPORT>
- gid: 1
is_valid: false
verification_notes: "前置条件过于宽泛,实际代码中已有输入校验,触发路径不成立"
</REPORT>

要求:
- 只能在 <REPORT> 与 </REPORT> 中输出 YAML 数组,且不得出现其他文本。
- 数组元素为对象,包含字段:
- gid: 整数(全局唯一编号,对应分析 Agent 给出的 gid,单个告警时使用)
- gids: 整数数组(全局唯一编号数组,对应分析 Agent 给出的 gids,多个告警合并时使用)
- is_valid: 布尔值 (true/false),表示分析 Agent 的结论是否正确
- verification_notes: 字符串(验证说明,解释为什么结论正确或不正确)
- **合并格式优化**:如果多个告警(gid)的验证结果(is_valid)和验证说明(verification_notes)完全一致,可以使用 gids 数组格式合并输出,减少重复内容。单个告警使用 gid,多个告警合并使用 gids。gid 和 gids 不能同时出现。
- 必须对所有输入的 gid 进行验证,不能遗漏。
- 如果验证通过(is_valid: true),则保留该告警;如果验证不通过(is_valid: false),则视为误报,不记录为问题。
""".strip()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# 注:当前版本不使用 MultiAgent 编排,已移除默认多智能体配置与创建函数。
|
|
126
|
+
# 请使用 run_security_analysis(单Agent逐条验证)或 workflow.direct_scan + format_markdown_report(直扫基线)。
|
|
127
|
+
|
|
128
|
+
def _git_restore_if_dirty(repo_root: str) -> int:
|
|
129
|
+
"""
|
|
130
|
+
若 repo_root 为 git 仓库:检测工作区是否有变更;如有则使用 'git checkout -- .' 恢复。
|
|
131
|
+
返回估算的变更文件数(基于 git status --porcelain 的行数)。
|
|
132
|
+
"""
|
|
133
|
+
try:
|
|
134
|
+
from pathlib import Path as _Path
|
|
135
|
+
import subprocess as _sub
|
|
136
|
+
root = _Path(repo_root)
|
|
137
|
+
if not (root / ".git").exists():
|
|
138
|
+
return 0
|
|
139
|
+
proc = _sub.run(["git", "status", "--porcelain"], cwd=str(root), capture_output=True, text=True)
|
|
140
|
+
if proc.returncode != 0:
|
|
141
|
+
return 0
|
|
142
|
+
lines = [line for line in proc.stdout.splitlines() if line.strip()]
|
|
143
|
+
if lines:
|
|
144
|
+
_sub.run(["git", "checkout", "--", "."], cwd=str(root), capture_output=True, text=True)
|
|
145
|
+
return len(lines)
|
|
146
|
+
except Exception:
|
|
147
|
+
pass
|
|
148
|
+
return 0
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _get_sec_dir(base_path: str):
|
|
152
|
+
"""获取 .jarvis/sec 目录路径,支持 base_path 是项目根目录或已经是 .jarvis/sec 目录"""
|
|
153
|
+
from pathlib import Path as _Path
|
|
154
|
+
base = _Path(base_path)
|
|
155
|
+
# 检查 base_path 是否已经是 .jarvis/sec 目录
|
|
156
|
+
if base.name == "sec" and base.parent.name == ".jarvis":
|
|
157
|
+
return base
|
|
158
|
+
# 否则,假设 base_path 是项目根目录
|
|
159
|
+
return base / ".jarvis" / "sec"
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _initialize_analysis_context(
    entry_path: str,
    status_mgr,
) -> tuple:
    """
    Set up the per-run analysis context: sec directory, progress file, and
    the set of candidates already completed in a previous run.

    Args:
        entry_path: project root, or the ``.jarvis/sec`` directory itself
            (resolved through ``_get_sec_dir``).
        status_mgr: accepted for signature compatibility; not used here.

    Returns: (sec_dir, progress_path, _progress_append, done_sigs)
        ``_progress_append(rec)`` appends one JSON line to ``progress.jsonl``
        (adding a ``timestamp`` when missing) and never raises.
        ``done_sigs`` holds every ``candidate_signature`` found in a
        ``task_status`` event whose ``status`` is ``"done"``.
    """
    from datetime import datetime as _dt
    from datetime import timezone as _tz
    import json as _json

    sec_dir = _get_sec_dir(entry_path)
    progress_path = sec_dir / "progress.jsonl"

    def _progress_append(rec: Dict) -> None:
        """Append *rec* as one JSON line to the progress file (best-effort)."""
        try:
            progress_path.parent.mkdir(parents=True, exist_ok=True)
            rec = dict(rec)
            # Same "naive UTC ISO string + Z" format as before, without the
            # deprecated datetime.utcnow().
            rec.setdefault(
                "timestamp",
                _dt.now(_tz.utc).replace(tzinfo=None).isoformat() + "Z",
            )
            line = _json.dumps(rec, ensure_ascii=False)
            with progress_path.open("a", encoding="utf-8") as f:
                f.write(line + "\n")
        except Exception:
            # A failure to record progress must not interrupt the main flow.
            pass

    # Signatures of candidates that were already finished in an earlier run.
    done_sigs: set = set()
    if progress_path.exists():
        try:
            # Iterate the file object instead of str.splitlines(): the latter
            # also splits on characters such as U+2028, which
            # json.dumps(ensure_ascii=False) writes unescaped inside strings.
            with progress_path.open("r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = _json.loads(line)
                    except ValueError:
                        continue
                    # Skip valid JSON that is not an object so that one odd
                    # line cannot abort the scan of the remaining records.
                    if not isinstance(obj, dict):
                        continue
                    if obj.get("event") == "task_status" and obj.get("status") == "done":
                        sig = obj.get("candidate_signature")
                        if sig:
                            try:
                                done_sigs.add(sig)
                            except TypeError:
                                # Unhashable signature: ignore this record only.
                                continue
        except Exception:
            # Unreadable progress file: resume with whatever was collected.
            pass

    return sec_dir, progress_path, _progress_append, done_sigs
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _load_or_run_heuristic_scan(
    entry_path: str,
    langs: List[str],
    exclude_dirs: Optional[List[str]],
    sec_dir,
    status_mgr,
    _progress_append,
) -> tuple[List[Dict], Dict]:
    """
    Load the heuristic scan result from a previous run, or run the scan.

    When ``sec_dir / "heuristic_issues.jsonl"`` exists, its lines are parsed
    as the candidate list. If that fails, or yields no candidates, a full
    ``direct_scan`` is executed and its issues are written back to the same
    file so a later run can resume.

    Args:
        entry_path: path handed to ``direct_scan``.
        langs: languages handed to ``direct_scan``.
        exclude_dirs: directories handed to ``direct_scan`` for exclusion.
        sec_dir: directory holding ``heuristic_issues.jsonl``.
        status_mgr: object whose ``update_pre_scan`` is called with progress.
        _progress_append: callable that records one progress event dict.

    Returns: (candidates, summary)
        ``summary`` is the scan summary, or ``{}`` when candidates were
        restored from the file.
    """
    import json

    _heuristic_path = sec_dir / "heuristic_issues.jsonl"
    candidates: List[Dict] = []
    summary: Dict = {}

    if _heuristic_path.exists():
        try:
            typer.secho(f"[jarvis-sec] 从 {_heuristic_path} 恢复启发式扫描", fg=typer.colors.BLUE)
            with _heuristic_path.open("r", encoding="utf-8") as f:
                candidates = [json.loads(line) for line in f if line.strip()]
            _progress_append({
                "event": "pre_scan_resumed",
                "path": str(_heuristic_path),
                "issues_found": len(candidates)
            })
        except Exception as e:
            typer.secho(f"[jarvis-sec] 恢复启发式扫描失败,执行完整扫描: {e}", fg=typer.colors.YELLOW)
            candidates = []  # reset so the full scan below runs

    if not candidates:
        _progress_append({"event": "pre_scan_start", "entry_path": entry_path, "languages": langs})
        status_mgr.update_pre_scan(message="开始启发式扫描...")
        pre_scan = direct_scan(entry_path, languages=langs, exclude_dirs=exclude_dirs)
        candidates = pre_scan.get("issues", [])
        summary = pre_scan.get("summary", {})
        scanned_files = summary.get("scanned_files", 0)
        status_mgr.update_pre_scan(
            current_files=scanned_files,
            total_files=scanned_files,
            issues_found=len(candidates),
            message=f"启发式扫描完成,发现 {len(candidates)} 个候选问题"
        )
        _progress_append({
            "event": "pre_scan_done",
            "entry_path": entry_path,
            "languages": langs,
            "scanned_files": scanned_files,
            "issues_found": len(candidates)
        })
        # Persist the result so the next run can resume from it.
        try:
            _heuristic_path.parent.mkdir(parents=True, exist_ok=True)
            with _heuristic_path.open("w", encoding="utf-8") as f:
                for item in candidates:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
            _progress_append({
                "event": "heuristic_report_written",
                "path": str(_heuristic_path),
                "issues_count": len(candidates),
            })
            typer.secho(f"[jarvis-sec] 已将 {len(candidates)} 个启发式扫描问题写入 {_heuristic_path}", fg=typer.colors.GREEN)
        except Exception as e:
            # Still best-effort (the scan result is returned either way), but
            # tell the user: without this file the next run cannot resume.
            typer.secho(f"[jarvis-sec] 写入启发式扫描结果失败,下次运行将无法断点恢复: {e}", fg=typer.colors.YELLOW)
    else:
        # Candidates were restored from the checkpoint file.
        status_mgr.update_pre_scan(
            issues_found=len(candidates),
            message=f"从断点恢复,已发现 {len(candidates)} 个候选问题"
        )

    return candidates, summary
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _compact_candidate(it: Dict) -> Dict:
|
|
295
|
+
"""精简候选问题,只保留必要字段"""
|
|
296
|
+
return {
|
|
297
|
+
"language": it.get("language"),
|
|
298
|
+
"category": it.get("category"),
|
|
299
|
+
"pattern": it.get("pattern"),
|
|
300
|
+
"file": it.get("file"),
|
|
301
|
+
"line": it.get("line"),
|
|
302
|
+
"evidence": it.get("evidence"),
|
|
303
|
+
"confidence": it.get("confidence"),
|
|
304
|
+
"severity": it.get("severity", "medium"),
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def _prepare_candidates(candidates: List[Dict]) -> List[Dict]:
    """
    Turn raw candidates into the compact sub-task list and number them.

    Each candidate is reduced with ``_compact_candidate`` and then given a
    ``gid`` field: a 1-based sequence number following the input order, used
    as the single identifier for the candidate across batches and files.

    Returns: compact_candidates (the list with gids assigned)
    """
    compact_candidates = []
    for gid, raw in enumerate(candidates, start=1):
        entry = _compact_candidate(raw)
        entry["gid"] = gid
        compact_candidates.append(entry)
    return compact_candidates
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _load_existing_clusters(
|
|
326
|
+
sec_dir,
|
|
327
|
+
progress_path,
|
|
328
|
+
) -> tuple[Dict[tuple[str, int], List[Dict]], set]:
|
|
329
|
+
"""
|
|
330
|
+
读取已有聚类报告以支持断点恢复。
|
|
331
|
+
|
|
332
|
+
返回: (_existing_clusters, _completed_cluster_batches)
|
|
333
|
+
"""
|
|
334
|
+
_existing_clusters: Dict[tuple[str, int], List[Dict]] = {}
|
|
335
|
+
_completed_cluster_batches: set = set()
|
|
336
|
+
|
|
337
|
+
try:
|
|
338
|
+
from pathlib import Path as _Path2
|
|
339
|
+
import json as _json
|
|
340
|
+
_cluster_path = sec_dir / "cluster_report.jsonl"
|
|
341
|
+
|
|
342
|
+
# 从 progress.jsonl 中读取已完成的聚类批次(优先检查)
|
|
343
|
+
if progress_path.exists():
|
|
344
|
+
try:
|
|
345
|
+
for line in progress_path.read_text(encoding="utf-8", errors="ignore").splitlines():
|
|
346
|
+
line = line.strip()
|
|
347
|
+
if not line:
|
|
348
|
+
continue
|
|
349
|
+
try:
|
|
350
|
+
obj = _json.loads(line)
|
|
351
|
+
except Exception:
|
|
352
|
+
continue
|
|
353
|
+
# 检查 cluster_status 事件,status 为 "done" 表示已完成
|
|
354
|
+
if obj.get("event") == "cluster_status" and obj.get("status") == "done":
|
|
355
|
+
file_name = obj.get("file")
|
|
356
|
+
batch_idx = obj.get("batch_index")
|
|
357
|
+
if file_name and batch_idx:
|
|
358
|
+
_completed_cluster_batches.add((str(file_name), int(batch_idx)))
|
|
359
|
+
except Exception:
|
|
360
|
+
pass
|
|
361
|
+
|
|
362
|
+
# 读取 cluster_report.jsonl(由于使用追加模式,可能有重复,需要去重)
|
|
363
|
+
if _cluster_path.exists():
|
|
364
|
+
try:
|
|
365
|
+
# 使用字典去重:key 为 (file, batch_index, verification, gids 的字符串表示)
|
|
366
|
+
seen_records: Dict[tuple, Dict] = {}
|
|
367
|
+
with _cluster_path.open("r", encoding="utf-8", errors="ignore") as f:
|
|
368
|
+
for line in f:
|
|
369
|
+
line = line.strip()
|
|
370
|
+
if not line:
|
|
371
|
+
continue
|
|
372
|
+
rec = _json.loads(line)
|
|
373
|
+
if not isinstance(rec, dict):
|
|
374
|
+
continue
|
|
375
|
+
f_name = str(rec.get("file") or "")
|
|
376
|
+
bidx = int(rec.get("batch_index", 1) or 1)
|
|
377
|
+
# 使用 gids 的排序后元组作为去重键
|
|
378
|
+
gids_list = rec.get("gids", [])
|
|
379
|
+
gids_key = tuple(sorted(gids_list)) if isinstance(gids_list, list) else ()
|
|
380
|
+
key = (f_name, bidx, str(rec.get("verification", "")), gids_key)
|
|
381
|
+
# 保留最新的记录(后写入的覆盖先写入的)
|
|
382
|
+
seen_records[key] = rec
|
|
383
|
+
|
|
384
|
+
# 按 (file, batch_index) 分组
|
|
385
|
+
for rec in seen_records.values():
|
|
386
|
+
f_name = str(rec.get("file") or "")
|
|
387
|
+
bidx = int(rec.get("batch_index", 1) or 1)
|
|
388
|
+
_existing_clusters.setdefault((f_name, bidx), []).append(rec)
|
|
389
|
+
except Exception:
|
|
390
|
+
_existing_clusters = {}
|
|
391
|
+
except Exception:
|
|
392
|
+
_existing_clusters = {}
|
|
393
|
+
_completed_cluster_batches = set()
|
|
394
|
+
|
|
395
|
+
return _existing_clusters, _completed_cluster_batches
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _restore_clusters_from_checkpoint(
|
|
399
|
+
_existing_clusters: Dict[tuple[str, int], List[Dict]],
|
|
400
|
+
_file_groups: Dict[str, List[Dict]],
|
|
401
|
+
) -> tuple[List[List[Dict]], List[Dict], List[Dict], set]:
|
|
402
|
+
"""
|
|
403
|
+
从断点恢复聚类结果。
|
|
404
|
+
|
|
405
|
+
返回: (cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids)
|
|
406
|
+
"""
|
|
407
|
+
# 1. 收集所有候选的 gid
|
|
408
|
+
all_candidate_gids_in_clustering = set()
|
|
409
|
+
gid_to_candidate: Dict[int, Dict] = {}
|
|
410
|
+
for _file, _items in _file_groups.items():
|
|
411
|
+
for it in _items:
|
|
412
|
+
try:
|
|
413
|
+
_gid = int(it.get("gid", 0))
|
|
414
|
+
if _gid >= 1:
|
|
415
|
+
all_candidate_gids_in_clustering.add(_gid)
|
|
416
|
+
gid_to_candidate[_gid] = it
|
|
417
|
+
except Exception:
|
|
418
|
+
pass
|
|
419
|
+
|
|
420
|
+
# 2. 从 cluster_report.jsonl 恢复所有聚类结果
|
|
421
|
+
clustered_gids = set() # 已聚类的 gid(包括有效和无效的,因为无效的也需要进入复核阶段)
|
|
422
|
+
invalid_clusters_for_review: List[Dict] = [] # 无效聚类列表(从断点恢复)
|
|
423
|
+
cluster_batches: List[List[Dict]] = []
|
|
424
|
+
cluster_records: List[Dict] = []
|
|
425
|
+
|
|
426
|
+
for (_file_key, _batch_idx), cluster_recs in _existing_clusters.items():
|
|
427
|
+
for rec in cluster_recs:
|
|
428
|
+
gids_list = rec.get("gids", [])
|
|
429
|
+
if not gids_list:
|
|
430
|
+
continue
|
|
431
|
+
is_invalid = rec.get("is_invalid", False)
|
|
432
|
+
verification = str(rec.get("verification", "")).strip()
|
|
433
|
+
members: List[Dict] = []
|
|
434
|
+
for _gid in gids_list:
|
|
435
|
+
try:
|
|
436
|
+
_gid_int = int(_gid)
|
|
437
|
+
if _gid_int >= 1 and _gid_int in gid_to_candidate:
|
|
438
|
+
# 只有当 gid 在当前运行中存在时,才恢复该聚类
|
|
439
|
+
candidate = gid_to_candidate[_gid_int]
|
|
440
|
+
candidate["verify"] = verification
|
|
441
|
+
members.append(candidate)
|
|
442
|
+
# 无论有效还是无效,都计入已聚类的 gid(避免被重新聚类)
|
|
443
|
+
clustered_gids.add(_gid_int)
|
|
444
|
+
except Exception:
|
|
445
|
+
pass
|
|
446
|
+
|
|
447
|
+
if members:
|
|
448
|
+
if is_invalid:
|
|
449
|
+
# 无效聚类:收集到复核列表,不加入 cluster_batches
|
|
450
|
+
invalid_clusters_for_review.append({
|
|
451
|
+
"file": _file_key,
|
|
452
|
+
"batch_index": _batch_idx,
|
|
453
|
+
"gids": [m.get("gid") for m in members],
|
|
454
|
+
"verification": verification,
|
|
455
|
+
"invalid_reason": str(rec.get("invalid_reason", "")).strip(),
|
|
456
|
+
"members": members, # 保存候选信息,用于复核后可能重新加入验证
|
|
457
|
+
"count": len(members),
|
|
458
|
+
})
|
|
459
|
+
else:
|
|
460
|
+
# 有效聚类:恢复到 cluster_batches
|
|
461
|
+
cluster_batches.append(members)
|
|
462
|
+
cluster_records.append({
|
|
463
|
+
"file": _file_key,
|
|
464
|
+
"verification": verification,
|
|
465
|
+
"gids": [m.get("gid") for m in members],
|
|
466
|
+
"count": len(members),
|
|
467
|
+
"batch_index": _batch_idx,
|
|
468
|
+
"is_invalid": False,
|
|
469
|
+
})
|
|
470
|
+
|
|
471
|
+
return cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def _get_review_system_prompt() -> str:
    """
    Return the system prompt for the review agent.

    The prompt tells the agent to re-check whether the "invalid" verdicts
    given by the clustering agent are sufficiently justified, to work
    read-only, and to print only the completion marker when done.

    Returns the prompt with leading/trailing whitespace stripped.
    """
    # NOTE(review): in the extracted SOURCE every line of this literal appears
    # flush-left; whether the bullets under the bold headings were indented is
    # not visible here -- confirm against the packaged file.
    return """
# 复核Agent约束
- 你的核心任务是复核聚类Agent给出的无效结论是否充分和正确。
- 你需要仔细检查聚类Agent提供的invalid_reason是否充分,是否真的考虑了所有可能的路径。
- 工具优先:使用 read_code 读取目标文件附近源码(行号前后各 ~50 行),必要时用 execute_script 辅助检索。
- 必要时需向上追溯调用者,查看完整的调用路径,以确认聚类Agent的结论是否成立。
- 禁止修改任何文件或执行写操作命令;仅进行只读分析与读取。
- 每次仅执行一个操作;等待工具结果后再进行下一步。
- **记忆使用**:
- 在复核过程中,充分利用 retrieve_memory 工具检索已有的记忆,特别是与当前文件或函数相关的记忆。
- 这些记忆可能包含函数的分析要点、指针判空情况、输入校验情况、调用路径分析结果等。
- **复核原则**:
- 必须验证聚类Agent是否真的检查了所有可能的调用路径和调用者。
- 必须验证聚类Agent是否真的确认了所有路径都有保护措施。
- 如果发现聚类Agent遗漏了某些路径、调用者或边界情况,必须判定为理由不充分。
- 保守策略:有疑问时,一律判定为理由不充分,将候选重新加入验证流程。
- 完成复核后,主输出仅打印结束符 <!!!COMPLETE!!!> ,不需要汇总结果。
""".strip()
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _get_review_summary_prompt() -> str:
    """
    Return the summary prompt for the review agent.

    The prompt asks for a YAML array wrapped in <REPORT>...</REPORT> that
    reports, per gid (or merged gids), an ``is_reason_sufficient`` flag and
    ``review_notes`` explaining the judgement.

    Returns the prompt with leading/trailing whitespace stripped.
    """
    # NOTE(review): in the extracted SOURCE every line of this literal appears
    # flush-left; any nested indentation inside the YAML examples was not
    # visible here -- confirm against the packaged file.
    return """
请将本轮"复核结论"的结构化结果仅放入以下标记中,并使用 YAML 数组对象形式输出。
你需要复核聚类Agent给出的无效理由是否充分,是否真的考虑了所有可能的路径。

示例1:理由充分(is_reason_sufficient: true,单个gid)
<REPORT>
- gid: 1
is_reason_sufficient: true
review_notes: "聚类Agent已检查所有调用路径,确认所有调用者都有输入校验,理由充分"
</REPORT>

示例2:理由充分(is_reason_sufficient: true,多个gid合并)
<REPORT>
- gids: [1, 2, 3]
is_reason_sufficient: true
review_notes: "聚类Agent已检查所有调用路径,确认所有调用者都有输入校验,理由充分"
</REPORT>

示例3:理由不充分(is_reason_sufficient: false)
<REPORT>
- gid: 1
is_reason_sufficient: false
review_notes: "聚类Agent遗漏了函数X的调用路径,该路径可能未做校验,理由不充分,需要重新验证"
</REPORT>

要求:
- 只能在 <REPORT> 与 </REPORT> 中输出 YAML 数组,且不得出现其他文本。
- 数组元素为对象,包含字段:
- gid: 整数(全局唯一编号,对应无效聚类的gid,单个告警时使用)
- gids: 整数数组(全局唯一编号数组,对应无效聚类的gids,多个告警合并时使用)
- is_reason_sufficient: 布尔值 (true/false),表示无效理由是否充分
- review_notes: 字符串(复核说明,解释为什么理由充分或不充分)
- **合并格式优化**:如果多个告警(gid)的复核结果(is_reason_sufficient)和复核说明(review_notes)完全一致,可以使用 gids 数组格式合并输出,减少重复内容。单个告警使用 gid,多个告警合并使用 gids。gid 和 gids 不能同时出现。
- 必须对所有输入的gid进行复核,不能遗漏。
- 如果理由不充分(is_reason_sufficient: false),该候选将重新加入验证流程;如果理由充分(is_reason_sufficient: true),该候选将被确认为无效。
""".strip()
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def _build_review_task(review_batch: List[Dict], entry_path: str, langs: List[str]) -> str:
|
|
537
|
+
"""构建复核任务上下文"""
|
|
538
|
+
import json as _json_review
|
|
539
|
+
return f"""
|
|
540
|
+
# 复核无效聚类任务
|
|
541
|
+
上下文参数:
|
|
542
|
+
- entry_path: {entry_path}
|
|
543
|
+
- languages: {langs}
|
|
544
|
+
|
|
545
|
+
需要复核的无效聚类(JSON数组):
|
|
546
|
+
{_json_review.dumps(review_batch, ensure_ascii=False, indent=2)}
|
|
547
|
+
|
|
548
|
+
请仔细复核每个无效聚类的invalid_reason是否充分,是否真的考虑了所有可能的路径、调用者和边界情况。
|
|
549
|
+
对于每个gid,请判断无效理由是否充分(is_reason_sufficient: true/false),并给出复核说明。
|
|
550
|
+
""".strip()
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def _create_review_agent(
    current_review_num: int,
    llm_group: Optional[str],
) -> Agent:
    """
    Create the review agent for one review batch.

    The agent is named ``JARVIS-SEC-Review-Batch-<current_review_num>``, runs
    non-interactively with auto-completion and a summary, has file editing
    disabled, and is limited to the tools listed in ``use_tools``.
    ``model_group`` is passed only when *llm_group* is truthy.
    """
    agent_options: Dict = {
        "system_prompt": _get_review_system_prompt(),
        "name": f"JARVIS-SEC-Review-Batch-{current_review_num}",
        "auto_complete": True,
        "need_summary": True,
        "summary_prompt": _get_review_summary_prompt(),
        "non_interactive": True,
        "in_multi_agent": False,
        "use_methodology": False,
        "use_analysis": False,
        "plan": False,
        "output_handler": [ToolRegistry()],
        "disable_file_edit": True,
        "use_tools": ["read_code", "execute_script", "retrieve_memory", "save_memory"],
    }
    if llm_group:
        agent_options["model_group"] = llm_group
    return Agent(**agent_options)
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
def _process_review_batch_items(
    review_batch: List[Dict],
    review_results: Optional[List[Dict]],
    reviewed_clusters: List[Dict],
    reinstated_candidates: List[Dict],
) -> None:
    """Apply one batch of review results by delegating to ``_process_review_batch``.

    All four arguments are forwarded unchanged; the two accumulator lists are
    mutated in place by the delegate.
    """
    _process_review_batch(
        review_batch=review_batch,
        review_results=review_results,
        reviewed_clusters=reviewed_clusters,
        reinstated_candidates=reinstated_candidates,
    )
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
def _reinstated_candidates_to_cluster_batches(
    reinstated_candidates: List[Dict],
    cluster_batches: List[List[Dict]],
    _progress_append,
) -> None:
    """Queue reinstated candidates as new verification batches, one per file.

    Args:
        reinstated_candidates: Candidates sent back to verification. Nothing
            happens when this is empty.
        cluster_batches: Mutated in place; one list of candidates is appended
            per distinct ``file`` value (a missing/empty value groups under "").
        _progress_append: Callable invoked with a ``review_reinstated`` event
            dict for every appended batch.
    """
    if not reinstated_candidates:
        return

    typer.secho(f"[jarvis-sec] 复核完成:{len(reinstated_candidates)} 个候选重新加入验证流程", fg=typer.colors.GREEN)

    # Group by file, keeping first-seen file order.
    grouped: Dict[str, List[Dict]] = {}
    for candidate in reinstated_candidates:
        grouped.setdefault(str(candidate.get("file") or ""), []).append(candidate)

    # Each group becomes its own batch and emits one progress event.
    for path_key, members in grouped.items():
        cluster_batches.append(members)
        _progress_append({
            "event": "review_reinstated",
            "file": path_key,
            "gids": [member.get("gid") for member in members],
            "count": len(members),
        })
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
def _process_review_phase(
    invalid_clusters_for_review: List[Dict],
    entry_path: str,
    langs: List[str],
    llm_group: Optional[str],
    status_mgr,
    _progress_append,
    cluster_batches: List[List[Dict]],
) -> List[List[Dict]]:
    """Run the review stage over every cluster that was marked invalid.

    Clusters are reviewed in batches of ten. For each batch a review agent is
    created and run, and its results are applied to decide which clusters stay
    invalid and which candidates go back to verification.

    Args:
        invalid_clusters_for_review: Clusters previously judged invalid.
        entry_path: Forwarded to the task builder and the agent runner.
        langs: Forwarded to the task builder.
        llm_group: Optional model group forwarded to the review agent factory.
        status_mgr: Object whose ``update_review`` receives progress updates.
        _progress_append: Callable receiving progress event dicts.
        cluster_batches: Verification batches; reinstated candidates are
            appended to this list in place.

    Returns:
        The same ``cluster_batches`` list, including any reinstated candidates.
    """
    if not invalid_clusters_for_review:
        typer.secho("[jarvis-sec] 无无效聚类需要复核", fg=typer.colors.BLUE)
        return cluster_batches

    total_clusters = len(invalid_clusters_for_review)
    typer.secho(f"\n[jarvis-sec] 开始复核 {total_clusters} 个无效聚类...", fg=typer.colors.MAGENTA)
    status_mgr.update_review(
        current_review=0,
        total_reviews=total_clusters,
        message="开始复核无效聚类..."
    )

    # At most 10 invalid clusters per batch, to keep the agent context short.
    review_batch_size = 10
    total_review_batches = (total_clusters + review_batch_size - 1) // review_batch_size
    reviewed_clusters: List[Dict] = []
    reinstated_candidates: List[Dict] = []  # candidates sent back to verification

    review_summary_prompt = _get_review_summary_prompt()

    for review_idx in range(0, total_clusters, review_batch_size):
        review_batch = invalid_clusters_for_review[review_idx:review_idx + review_batch_size]
        current_review_num = review_idx // review_batch_size + 1

        typer.secho(f"[jarvis-sec] 复核批次 {current_review_num}/{total_review_batches}: {len(review_batch)} 个无效聚类", fg=typer.colors.CYAN)
        # NOTE(review): in-loop updates count batches, while the first and last
        # updates count clusters; kept as-is because consumers are not visible here.
        status_mgr.update_review(
            current_review=current_review_num,
            total_reviews=total_review_batches,
            message=f"正在复核批次 {current_review_num}/{total_review_batches}"
        )

        review_task = _build_review_task(review_batch, entry_path, langs)
        review_agent = _create_review_agent(current_review_num, llm_group)

        # Subscribe before running so the agent's summary text is captured.
        review_summary_container = _subscribe_summary_event(review_agent)

        # The parse-error half of the returned pair is not used here.
        review_results, _ = _run_review_agent_with_retry(
            review_agent,
            review_task,
            review_summary_prompt,
            entry_path,
            review_summary_container,
        )

        _process_review_batch_items(
            review_batch,
            review_results,
            reviewed_clusters,
            reinstated_candidates,
        )

    _reinstated_candidates_to_cluster_batches(
        reinstated_candidates,
        cluster_batches,
        _progress_append,
    )

    if not reinstated_candidates:
        typer.secho("[jarvis-sec] 复核完成:所有无效聚类理由充分,确认为无效", fg=typer.colors.GREEN)

    # Count confirmed clusters directly. `reinstated_candidates` holds cluster
    # members, so subtracting its length from the cluster count mixes units and
    # can even go negative when a reinstated cluster has several members.
    confirmed_invalid = sum(
        1 for cluster in reviewed_clusters
        if cluster.get("review_result") == "confirmed_invalid"
    )
    _progress_append({
        "event": "review_completed",
        "total_reviewed": total_clusters,
        "reinstated": len(reinstated_candidates),
        "confirmed_invalid": confirmed_invalid,
    })
    status_mgr.update_review(
        current_review=total_clusters,
        total_reviews=total_clusters,
        message=f"复核完成:{len(reinstated_candidates)} 个候选重新加入验证"
    )

    return cluster_batches
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def _build_gid_to_review_mapping(review_results: List[Dict]) -> Dict[int, Dict]:
    """Index review results by gid, accepting both ``gid`` and ``gids`` entries.

    Non-dict entries are skipped. A ``gids`` list takes precedence over ``gid``.
    Values that cannot be converted with ``int()`` or are below 1 are dropped
    silently. Later entries overwrite earlier ones for the same gid.

    Returns:
        ``{gid: {"is_reason_sufficient": ..., "review_notes": <stripped str>}}``
    """
    mapping: Dict[int, Dict] = {}
    for entry in review_results:
        if not isinstance(entry, dict):
            continue

        # Collect the raw gid values for whichever format the entry uses.
        merged = entry.get("gids")
        if isinstance(merged, list):
            raw_values = merged
        elif "gid" in entry:
            raw_values = [entry.get("gid", 0)]
        else:
            raw_values = []

        accepted: List[int] = []
        for raw in raw_values:
            try:
                number = int(raw)
            except Exception:
                continue
            if number >= 1:
                accepted.append(number)

        verdict = entry.get("is_reason_sufficient")
        notes = str(entry.get("review_notes", "")).strip()
        for number in accepted:
            mapping[number] = {
                "is_reason_sufficient": verdict,
                "review_notes": notes,
            }
    return mapping
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
def _process_review_batch(
    review_batch: List[Dict],
    review_results: Optional[List[Dict]],
    reviewed_clusters: List[Dict],
    reinstated_candidates: List[Dict],
) -> None:
    """Apply review results to one batch of invalid clusters.

    Args:
        review_batch: Clusters under review; ``gids`` and ``members`` are read.
        review_results: Parsed review output. A falsy value (None or empty) is
            treated as a parse failure and every cluster is reinstated.
        reviewed_clusters: Mutated in place; receives one record per cluster
            with ``review_result`` and ``review_notes`` added.
        reinstated_candidates: Mutated in place; receives the members of every
            reinstated cluster.
    """
    if not review_results:
        # No usable review output: conservatively send everything back.
        typer.secho("[jarvis-sec] 警告:复核结果解析失败,保守策略:将批次中的所有候选重新加入验证流程", fg=typer.colors.YELLOW)
        for cluster in review_batch:
            reinstated_candidates.extend(cluster.get("members", []))
            reviewed_clusters.append({
                **cluster,
                "review_result": "reinstated",
                "review_notes": "复核结果解析失败,保守策略重新加入验证",
            })
        return

    gid_to_review = _build_gid_to_review_mapping(review_results)

    for cluster in review_batch:
        gids = cluster.get("gids", [])
        members = cluster.get("members", [])

        # Find the first reviewed gid whose reason is not explicitly sufficient.
        first_insufficient = None
        for gid in gids:
            verdict = gid_to_review.get(gid)
            if verdict and verdict.get("is_reason_sufficient") is not True:
                first_insufficient = verdict
                break

        if first_insufficient is not None:
            typer.secho(f"[jarvis-sec] 复核结果:无效聚类(gids={gids})理由不充分,重新加入验证流程", fg=typer.colors.BLUE)
            reinstated_candidates.extend(members)
            reviewed_clusters.append({
                **cluster,
                "review_result": "reinstated",
                "review_notes": first_insufficient.get("review_notes", ""),
            })
            continue

        # NOTE(review): a cluster none of whose gids appear in the review output
        # also lands here and is confirmed invalid — confirm this is intended.
        notes = ""
        if gids and gid_to_review.get(gids[0]):
            notes = gid_to_review[gids[0]].get("review_notes", "")
        typer.secho(f"[jarvis-sec] 复核结果:无效聚类(gids={gids})理由充分,确认为无效", fg=typer.colors.GREEN)
        reviewed_clusters.append({
            **cluster,
            "review_result": "confirmed_invalid",
            "review_notes": notes,
        })
|
|
825
|
+
|
|
826
|
+
|
|
827
|
+
def _run_review_agent_with_retry(
    review_agent,
    review_task: str,
    review_summary_prompt: str,
    entry_path: str,
    review_summary_container: Dict[str, str],
) -> tuple[Optional[List[Dict]], Optional[str]]:
    """Run the review agent, retrying without limit until its output validates.

    The first attempt runs the full agent. After any failed attempt, later
    attempts call the agent's model directly with the task, the previous YAML
    parse error (if any) and the review summary prompt. If that direct call
    raises, the attempt falls back to ``review_agent.run``.

    Args:
        review_agent: Agent exposing ``run`` and ``model.chat_until_success``.
        review_task: Task text given to the agent.
        review_summary_prompt: Output-format instructions for the review
            stage, appended to the prompt on direct-model retries.
        entry_path: Path passed to the workspace-restore helper.
        review_summary_container: Dict whose ``"text"`` entry carries the
            agent's summary; reset to "" at the start of every attempt.

    Returns:
        ``(results, None)`` once a non-empty list of valid review items has
        been parsed. The function does not return on failure; it keeps looping.
    """
    use_direct_model_review = False
    prev_parse_error_review: Optional[str] = None
    review_attempt = 0

    while True:
        review_attempt += 1
        review_summary_container["text"] = ""

        if use_direct_model_review:
            error_guidance = ""
            if prev_parse_error_review:
                error_guidance = f"\n\n**格式错误详情(请根据以下错误修复输出格式):**\n- YAML解析失败: {prev_parse_error_review}\n\n请确保输出的YAML格式正确,包括正确的缩进、引号、冒号等。"

            # Use the review-stage summary prompt supplied by the caller. The
            # previous code built the verification-stage prompt here, which
            # presumably describes a different output schema than the one
            # _is_valid_review_item checks (is_reason_sufficient).
            full_review_prompt = f"{review_task}{error_guidance}\n\n{review_summary_prompt}"
            try:
                review_response = review_agent.model.chat_until_success(full_review_prompt)  # type: ignore
                review_summary_container["text"] = review_response
            except Exception as e:
                try:
                    typer.secho(f"[jarvis-sec] 复核阶段直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
                except Exception:
                    pass
                review_agent.run(review_task)
        else:
            review_agent.run(review_task)

        # Workspace protection: best-effort, failures here must not stop the loop.
        try:
            _changed_review = _git_restore_if_dirty(entry_path)
            if _changed_review:
                try:
                    typer.secho(f"[jarvis-sec] 复核 Agent 工作区已恢复 ({_changed_review} 个文件)", fg=typer.colors.BLUE)
                except Exception:
                    pass
        except Exception:
            pass

        # Parse the captured summary.
        review_summary_text = review_summary_container.get("text", "")
        parse_error_review = None
        if review_summary_text:
            review_parsed, parse_error_review = _try_parse_summary_report(review_summary_text)
            if parse_error_review:
                prev_parse_error_review = parse_error_review
                try:
                    typer.secho(f"[jarvis-sec] 复核结果YAML解析失败: {parse_error_review}", fg=typer.colors.YELLOW)
                except Exception:
                    pass
            else:
                prev_parse_error_review = None
                if isinstance(review_parsed, list):
                    # Accept only a non-empty list in which every item validates.
                    if review_parsed and all(_is_valid_review_item(item) for item in review_parsed):
                        return review_parsed, None

        # Validation failed: every later attempt uses the direct model call.
        use_direct_model_review = True
        if parse_error_review:
            try:
                typer.secho(f"[jarvis-sec] 复核结果YAML解析失败 -> 重试第 {review_attempt} 次 (使用直接模型调用,将反馈解析错误)", fg=typer.colors.YELLOW)
            except Exception:
                pass
        else:
            try:
                typer.secho(f"[jarvis-sec] 复核结果格式无效 -> 重试第 {review_attempt} 次 (使用直接模型调用)", fg=typer.colors.YELLOW)
            except Exception:
                pass
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
def _is_valid_review_item(item: Dict) -> bool:
    """Check that one review result item has the expected shape.

    An item is valid when it is a dict containing ``is_reason_sufficient`` and
    exactly one of ``gid`` / ``gids``. ``gid`` must convert to an int >= 1;
    ``gids`` must be a non-empty list whose elements all convert to ints >= 1.
    """
    if not isinstance(item, dict):
        return False
    if "is_reason_sufficient" not in item:
        return False

    single = "gid" in item
    merged = "gids" in item
    # Reject both-present and both-absent alike.
    if single == merged:
        return False

    try:
        if single:
            return int(item["gid"]) >= 1
        gid_list = item["gids"]
        if not isinstance(gid_list, list) or not gid_list:
            return False
        return all(int(value) >= 1 for value in gid_list)
    except Exception:
        return False
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
def _load_processed_gids_from_issues(sec_dir) -> set:
    """Collect the gids already recorded in ``agent_issues.jsonl``.

    Args:
        sec_dir: Path-like object supporting ``/`` that contains the file.

    Returns:
        The set of integer gids >= 1 found in the file. Blank or unparsable
        lines are skipped, and any other failure yields whatever was collected
        so far (possibly an empty set).
    """
    found = set()
    try:
        issues_file = sec_dir / "agent_issues.jsonl"
        if issues_file.exists():
            import json as _json

            with issues_file.open("r", encoding="utf-8", errors="ignore") as handle:
                for raw_line in handle:
                    text = raw_line.strip()
                    if not text:
                        continue
                    try:
                        number = int(_json.loads(text).get("gid", 0))
                    except Exception:
                        continue
                    if number >= 1:
                        found.add(number)

            if found:
                # The resume notice is informational; never let it fail the load.
                try:
                    typer.secho(f"[jarvis-sec] 断点恢复:从 agent_issues.jsonl 读取到 {len(found)} 个已处理的 gid", fg=typer.colors.BLUE)
                except Exception:
                    pass
    except Exception:
        pass
    return found
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
def _count_issues_from_file(sec_dir) -> int:
    """Count confirmed issues recorded in ``agent_issues.jsonl``.

    An entry is counted when its gid is >= 1, it has ``has_risk`` set to
    ``True`` and it contains a ``verification_notes`` key. Each gid is counted
    at most once.

    Args:
        sec_dir: Path-like object supporting ``/`` that contains the file.

    Returns:
        The number of distinct confirmed issues; 0 when the file is missing or
        cannot be read.
    """
    count = 0
    try:
        import json as _json

        issues_file = sec_dir / "agent_issues.jsonl"
        if issues_file.exists():
            counted_gids = set()
            with issues_file.open("r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        item = _json.loads(line)
                        # Coerce like _load_processed_gids_from_issues does, so a
                        # gid stored as "5" is counted instead of being dropped
                        # by a swallowed TypeError, and 5 / "5" deduplicate.
                        gid = int(item.get("gid", 0))
                    except (AttributeError, TypeError, ValueError, OverflowError):
                        continue
                    if gid < 1 or gid in counted_gids:
                        continue
                    # Only alerts that passed verification are counted.
                    if item.get("has_risk") is True and "verification_notes" in item:
                        count += 1
                        counted_gids.add(gid)
    except Exception:
        # Used for status display only: stay best-effort on I/O or path errors.
        pass
    return count
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
def _create_analysis_agent(task_id: str, llm_group: Optional[str]) -> Agent:
    """Create the analysis Agent for one batch of candidate issues.

    Args:
        task_id: Used as the agent's name.
        llm_group: When truthy, forwarded to the Agent as ``model_group``;
            otherwise the keyword is not passed.

    Returns:
        A new Agent configured as non-interactive, with file editing disabled,
        ``force_save_memory`` enabled and a fixed list of four tool names.
    """
    # System prompt (Chinese): single-agent security-analysis constraints —
    # derive call paths, stay read-only, use the memory tools, and finish by
    # printing only the completion marker.
    system_prompt = """
# 单Agent安全分析约束
- 你的核心任务是评估代码的安全问题,目标:针对本候选问题进行证据核实、风险评估与修复建议补充,查找漏洞触发路径,确认在某些条件下会触发;以此来判断是否是漏洞。
- **必须进行调用路径推导**:
- 对于每个候选问题,必须明确推导从可控输入到缺陷代码的完整调用路径。
- 调用路径推导必须包括:
1. 识别可控输入的来源(例如:用户输入、网络数据、文件读取、命令行参数等)
2. 追踪数据流:从输入源开始,逐步追踪数据如何传递到缺陷代码位置
3. 识别调用链:明确列出从入口函数到缺陷代码的所有函数调用序列(例如:main() -> parse_input() -> process_data() -> vulnerable_function())
4. 分析每个调用点的数据校验情况:检查每个函数是否对输入进行了校验、边界检查或安全检查
5. 确认触发条件:明确说明在什么条件下,未校验或恶意输入能够到达缺陷代码位置
- 如果无法推导出完整的调用路径,或者所有调用路径都有充分的保护措施,则应该判定为误报。
- 调用路径推导必须在分析过程中明确展示,不能省略或假设。
- 工具优先:使用 read_code 读取目标文件附近源码(行号前后各 ~50 行),必要时用 execute_script 辅助检索。
- **调用路径追溯要求**:
- 必须向上追溯所有可能的调用者,查看完整的调用路径,以确认风险是否真实存在。
- 使用 read_code 和 execute_script 工具查找函数的调用者(例如:使用 grep 搜索函数名,查找所有调用该函数的位置)。
- 对于每个调用者,必须检查其是否对输入进行了校验。
- 如果发现任何调用路径未做校验,必须明确记录该路径。
- 例如:一个函数存在空指针解引用风险,必须检查所有调用者。如果所有调用者均能确保传入的指针非空,则该风险在当前代码库中可能不会实际触发;但如果存在任何调用者未做校验,则风险真实存在。
- 若多条告警位于同一文件且行号相距不远,可一次性读取共享上下文,对这些相邻告警进行联合分析与判断;但仍需避免无关扩展与大范围遍历。
- 禁止修改任何文件或执行写操作命令(rm/mv/cp/echo >、sed -i、git、patch、chmod、chown 等);仅进行只读分析与读取。
- 每次仅执行一个操作;等待工具结果后再进行下一步。
- **记忆使用**:
- 在分析过程中,充分利用 retrieve_memory 工具检索已有的记忆,特别是与当前分析函数相关的记忆。
- 如果有必要,使用 save_memory 工具保存每个函数的分析要点,使用函数名作为 tag(例如:函数名、文件名等)。
- 记忆内容示例:某个函数的指针已经判空、某个函数已有输入校验、某个函数的调用路径分析结果等。
- 这样可以避免重复分析,提高效率,并保持分析的一致性。
- 完成对本批次候选问题的判断后,主输出仅打印结束符 <!!!COMPLETE!!!> ,不需要汇总结果。
""".strip()

    agent_kwargs: Dict = dict(
        system_prompt=system_prompt,
        name=task_id,
        auto_complete=True,
        need_summary=True,
        summary_prompt=_build_summary_prompt(),
        non_interactive=True,
        in_multi_agent=False,
        use_methodology=False,
        use_analysis=False,
        plan=False,
        output_handler=[ToolRegistry()],
        disable_file_edit=True,
        force_save_memory=True,
        use_tools=["read_code", "execute_script", "save_memory", "retrieve_memory"],
    )
    # Only pass model_group when the caller selected one.
    if llm_group:
        agent_kwargs["model_group"] = llm_group
    return Agent(**agent_kwargs)
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
def _build_analysis_task_context(batch: List[Dict], entry_path: str, langs: List[str]) -> str:
    """Render the task prompt for one analysis batch.

    Args:
        batch: Candidates in the batch; copied into a new list and embedded in
            the prompt as pretty-printed JSON.
        entry_path: Interpolated verbatim into the prompt.
        langs: Interpolated using the list's default string form.

    Returns:
        The prompt text, stripped. ``cluster_verification`` is the string form
        of the first candidate's ``verify`` value ("" for an empty batch) and
        ``cluster_gids`` lists each candidate's ``gid``.
    """
    import json as _json2

    candidates: List[Dict] = list(batch)
    first_verify = str(candidates[0].get("verify") if candidates else "")
    gid_list = [entry.get("gid") for entry in candidates]
    serialized = _json2.dumps(candidates, ensure_ascii=False, indent=2)

    prompt_lines = [
        "# 安全子任务批次",
        "上下文参数:",
        f"- entry_path: {entry_path}",
        f"- languages: {langs}",
        f"- cluster_verification: {first_verify}",
        "",
        f"- cluster_gids: {gid_list}",
        "- note: 每个候选含 gid/verify 字段,模型仅需输出 gid 统一给出验证/判断结论(全局编号);无需使用局部 id",
        "",
        "批次候选(JSON数组):",
        serialized,
    ]
    return "\n".join(prompt_lines).strip()
|
|
1063
|
+
|
|
1064
|
+
|
|
1065
|
+
def _subscribe_summary_event(agent: Agent) -> Dict[str, str]:
    """Attach a listener that stores the agent's summary text in a dict.

    Args:
        agent: Object whose ``event_bus.subscribe`` is called with the
            AFTER_SUMMARY event and the listener.

    Returns:
        A dict ``{"text": ""}``. The listener overwrites ``"text"`` with the
        string form of the ``summary`` keyword it receives. If the event
        constant cannot be imported or subscribing fails, the dict is returned
        unsubscribed.
    """
    holder: Dict[str, str] = {"text": ""}

    try:
        from jarvis.jarvis_agent.events import AFTER_SUMMARY as _AFTER_SUMMARY
    except Exception:
        return holder
    if not _AFTER_SUMMARY:
        return holder

    def _on_after_summary(**kwargs):
        try:
            holder["text"] = str(kwargs.get("summary", "") or "")
        except Exception:
            holder["text"] = ""

    try:
        agent.event_bus.subscribe(_AFTER_SUMMARY, _on_after_summary)
    except Exception:
        pass
    return holder
|
|
1084
|
+
|
|
1085
|
+
|
|
1086
|
+
def _build_validation_error_guidance(
    parse_error_analysis: Optional[str],
    prev_parsed_items: Optional[List],
) -> str:
    """Describe why the previous analysis output was rejected.

    Args:
        parse_error_analysis: YAML parse error from the previous attempt, if any.
        prev_parsed_items: Items parsed from the previous attempt, or None
            when nothing could be parsed.

    Returns:
        Guidance text to append to the retry prompt: the parse error, a
        "nothing parsed" notice, or the first field-level problem found in the
        items. Returns "" when no specific problem is identified.
    """
    header = "\n\n**格式错误详情(请根据以下错误修复输出格式):**\n"

    def _first_problem(idx, it) -> Optional[str]:
        """Return the first validation problem of one item, or None."""
        if not isinstance(it, dict):
            return f"元素{idx}不是字典"

        has_gid = "gid" in it
        has_gids = "gids" in it
        if not has_gid and not has_gids:
            return f"元素{idx}缺少必填字段 gid 或 gids"
        if has_gid and has_gids:
            return f"元素{idx}不能同时包含 gid 和 gids"

        if has_gid:
            try:
                if int(it.get("gid", 0)) < 1:
                    return f"元素{idx}的 gid 必须 >= 1"
            except Exception:
                return f"元素{idx}的 gid 格式错误(必须是整数)"
        else:
            if not isinstance(it.get("gids"), list) or len(it.get("gids", [])) == 0:
                return f"元素{idx}的 gids 必须是非空数组"
            try:
                for gid_idx, gid_val in enumerate(it.get("gids", [])):
                    if int(gid_val) < 1:
                        return f"元素{idx}的 gids[{gid_idx}] 必须 >= 1"
            except Exception:
                return f"元素{idx}的 gids 格式错误(必须是整数数组)"

        if "has_risk" not in it or not isinstance(it.get("has_risk"), bool):
            return f"元素{idx}缺少必填字段 has_risk(必须是布尔值)"

        if it.get("has_risk"):
            for key in ["preconditions", "trigger_path", "consequences", "suggestions"]:
                if key not in it:
                    return f"元素{idx}的 has_risk 为 true,但缺少必填字段 {key}"
                if not isinstance(it[key], str) or not it[key].strip():
                    return f"元素{idx}的 {key} 字段不能为空"
        return None

    if parse_error_analysis:
        return f"{header}- YAML解析失败: {parse_error_analysis}\n\n请确保输出的YAML格式正确,包括正确的缩进、引号、冒号等。"
    if prev_parsed_items is None:
        return f"{header}- 无法从摘要中解析出有效的 YAML 数组"
    if _valid_items(prev_parsed_items):
        return ""

    if not isinstance(prev_parsed_items, list):
        return f"{header}- 结果不是数组"

    # Only the first problem is reported; scanning stops at the first bad item.
    for idx, it in enumerate(prev_parsed_items):
        problem = _first_problem(idx, it)
        if problem is not None:
            return f"{header}- {problem}"
    return ""
|
|
1150
|
+
|
|
1151
|
+
|
|
1152
|
+
def _run_analysis_agent_with_retry(
    agent: Agent,
    per_task: str,
    summary_container: Dict[str, str],
    entry_path: str,
    task_id: str,
    bidx: int,
    meta_records: List[Dict],
) -> tuple[Optional[List[Dict]], Optional[Dict]]:
    """Run the analysis agent, retrying without limit until its summary validates.

    The first attempt runs the full agent. After any failed attempt, later
    attempts call the agent's model directly with the task, error guidance
    from the previous attempt and the summary prompt. If that direct call
    raises, the attempt falls back to ``agent.run``.

    Args:
        agent: Agent exposing ``run`` and ``model.chat_until_success``.
        per_task: Task text for this batch.
        summary_container: Dict whose ``"text"`` entry carries the agent's
            summary; reset to "" at the start of every attempt.
        entry_path: Path passed to the workspace-restore helper.
        task_id: Recorded in each meta record.
        bidx: Batch index, recorded in meta records and log messages.
        meta_records: Mutated in place; one workspace-restore record is
            appended per attempt (when the restore helper does not raise).

    Returns:
        ``(items, workspace_restore_info)``. ``items`` is the parsed list (an
        empty list is accepted as "no issues found"). ``workspace_restore_info``
        describes the last restore check, or is None if it never succeeded.
    """
    summary_items: Optional[List[Dict]] = None
    workspace_restore_info: Optional[Dict] = None
    use_direct_model_analysis = False
    prev_parsed_items: Optional[List] = None
    parse_error_analysis: Optional[str] = None
    attempt = 0

    while True:
        attempt += 1
        summary_container["text"] = ""

        if use_direct_model_analysis:
            summary_prompt_text = _build_summary_prompt()
            error_guidance = _build_validation_error_guidance(parse_error_analysis, prev_parsed_items)
            full_prompt = f"{per_task}{error_guidance}\n\n{summary_prompt_text}"
            try:
                response = agent.model.chat_until_success(full_prompt)  # type: ignore
                summary_container["text"] = response
            except Exception as e:
                try:
                    typer.secho(f"[jarvis-sec] 直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
                except Exception:
                    pass
                agent.run(per_task)
        else:
            agent.run(per_task)

        # Workspace protection: best-effort, failures here must not stop the loop.
        try:
            _changed = _git_restore_if_dirty(entry_path)
            workspace_restore_info = {
                "performed": bool(_changed),
                "changed_files_count": int(_changed or 0),
                "action": "git checkout -- .",
            }
            meta_records.append({
                "task_id": task_id,
                "batch_index": bidx,
                "workspace_restore": workspace_restore_info,
                # `attempt` is already 1-based (incremented at loop start);
                # adding 1 again recorded the first attempt as attempt 2.
                "attempt": attempt,
            })
            if _changed:
                try:
                    typer.secho(f"[jarvis-sec] 工作区已恢复 ({_changed} 个文件),操作: git checkout -- .", fg=typer.colors.BLUE)
                except Exception:
                    pass
        except Exception:
            pass

        # Parse the <REPORT> (YAML) section out of the summary.
        summary_text = summary_container.get("text", "")
        parsed_items: Optional[List] = None
        parse_error_analysis = None
        if summary_text:
            rep, parse_error_analysis = _try_parse_summary_report(summary_text)
            if parse_error_analysis:
                try:
                    typer.secho(f"[jarvis-sec] 分析结果YAML解析失败: {parse_error_analysis}", fg=typer.colors.YELLOW)
                except Exception:
                    pass
            elif isinstance(rep, list):
                parsed_items = rep
            elif isinstance(rep, dict):
                # A dict report may carry the list under "issues".
                items = rep.get("issues")
                if isinstance(items, list):
                    parsed_items = items

        # An empty list is valid ("no issues found"); a non-empty list must
        # pass field validation.
        if parsed_items is not None:
            if len(parsed_items) == 0 or _valid_items(parsed_items):
                summary_items = parsed_items
                break

        # Validation failed: every later attempt uses the direct model call.
        use_direct_model_analysis = True
        prev_parsed_items = parsed_items
        if parse_error_analysis:
            try:
                typer.secho(f"[jarvis-sec] 分析结果YAML解析失败 -> 重试第 {attempt} 次 (批次={bidx},使用直接模型调用,将反馈解析错误)", fg=typer.colors.YELLOW)
            except Exception:
                pass
        else:
            try:
                typer.secho(f"[jarvis-sec] 分析结果格式无效 -> 重试第 {attempt} 次 (批次={bidx},使用直接模型调用)", fg=typer.colors.YELLOW)
            except Exception:
                pass

    return summary_items, workspace_restore_info
|
|
1256
|
+
|
|
1257
|
+
|
|
1258
|
+
def _expand_and_filter_analysis_results(summary_items: List[Dict]) -> tuple[List[Dict], List[Dict]]:
    """Expand merged ``gids`` entries into per-gid items and split them by risk.

    Args:
        summary_items: Parsed analysis results. An entry with a ``gids`` list
            is expanded into one copy per gid (the ``gids`` key replaced by an
            integer ``gid``); list elements that are not integers >= 1 are
            dropped. An entry with only ``gid`` is passed through unchanged.
            Non-dict entries and entries with neither key are ignored.

    Returns:
        ``(items_with_risk, items_without_risk)``. An item counts as risky
        only when its ``has_risk`` is exactly ``True``.
    """
    items_with_risk: List[Dict] = []
    items_without_risk: List[Dict] = []

    for it in summary_items:
        # Skip malformed entries instead of failing on `.get`.
        if not isinstance(it, dict):
            continue

        bucket = items_with_risk if it.get("has_risk") is True else items_without_risk
        gids = it.get("gids")
        if isinstance(gids, list):
            shared_fields = {k: v for k, v in it.items() if k != "gids"}
            for gid_val in gids:
                try:
                    gid_int = int(gid_val)
                except (TypeError, ValueError, OverflowError):
                    continue
                if gid_int >= 1:
                    bucket.append({**shared_fields, "gid": gid_int})
        elif "gid" in it:
            bucket.append(it)

    return items_with_risk, items_without_risk
|
|
1290
|
+
|
|
1291
|
+
|
|
1292
|
+
def _build_gid_to_verification_mapping(verification_results: List[Dict]) -> Dict[int, Dict]:
    """Index verification results by gid, accepting ``gid`` and ``gids`` entries.

    Non-dict entries are skipped. A ``gids`` list takes precedence over
    ``gid``. Values that cannot be converted with ``int()`` are reported with
    a warning and dropped; values below 1 are dropped (with a warning only in
    the single-``gid`` form). Later entries overwrite earlier ones.

    Returns:
        ``{gid: {"is_valid": ..., "verification_notes": <stripped str>}}``
    """
    def _warn(message: str) -> None:
        # Warnings are informational; printing must never break the mapping.
        try:
            typer.secho(message, fg=typer.colors.YELLOW)
        except Exception:
            pass

    mapping: Dict[int, Dict] = {}
    for entry in verification_results:
        if not isinstance(entry, dict):
            continue

        accepted: List[int] = []
        merged = entry.get("gids")
        if isinstance(merged, list):
            for raw in merged:
                try:
                    number = int(raw)
                except Exception as e:
                    _warn(f"[jarvis-sec] 警告:验证结果中 gids 数组元素格式错误: {raw}, 错误: {e}")
                    continue
                if number >= 1:
                    accepted.append(number)
        elif "gid" in entry:
            raw = entry.get("gid", 0)
            try:
                number = int(raw)
            except Exception as e:
                _warn(f"[jarvis-sec] 警告:验证结果中 gid 格式错误: {entry.get('gid')}, 错误: {e}")
            else:
                if number >= 1:
                    accepted.append(number)
                else:
                    _warn(f"[jarvis-sec] 警告:验证结果中 gid 值无效: {raw} (必须 >= 1)")
        else:
            _warn(f"[jarvis-sec] 警告:验证结果项缺少 gid 或 gids 字段: {entry}")

        validity = entry.get("is_valid")
        notes = str(entry.get("verification_notes", "")).strip()
        for number in accepted:
            mapping[number] = {
                "is_valid": validity,
                "verification_notes": notes,
            }
    return mapping
|
|
1340
|
+
|
|
1341
|
+
|
|
1342
|
+
def _merge_verified_items(
    items_with_risk: List[Dict],
    batch: List[Dict],
    gid_to_verification: Dict[int, Dict],
) -> List[Dict]:
    """Keep the risky items that verification confirmed, merged with their candidates.

    Args:
        items_with_risk: Analysis results flagged as risky, one per gid.
        batch: Original candidates of the batch; looked up by integer gid.
        gid_to_verification: Verification verdicts keyed by integer gid.

    Returns:
        One merged dict per item whose verdict has ``is_valid`` exactly
        ``True``: candidate fields first, analysis fields on top, plus the
        stripped ``verification_notes``. Items judged false positives, items
        without a verdict, and items whose gid is not an integer are left out.
    """
    gid_to_candidate: Dict[int, Dict] = {}
    for c in batch:
        try:
            c_gid = int(c.get("gid", 0))
        except Exception:
            continue
        if c_gid >= 1:
            gid_to_candidate[c_gid] = c

    verified_items: List[Dict] = []
    for item in items_with_risk:
        # Guard the conversion like the candidate loop above, so one malformed
        # gid does not abort the merge for the whole batch.
        try:
            item_gid = int(item.get("gid", 0))
        except (AttributeError, TypeError, ValueError, OverflowError):
            try:
                typer.secho(f"[jarvis-sec] 警告:分析结果中 gid 格式错误: {item},视为验证不通过", fg=typer.colors.YELLOW)
            except Exception:
                pass
            continue

        verification = gid_to_verification.get(item_gid)
        if verification and verification.get("is_valid") is True:
            # Candidate fields (file, line, pattern, ...) form the base and
            # analysis fields override them on key collisions.
            candidate = gid_to_candidate.get(item_gid, {})
            verified_items.append({
                **candidate,
                **item,
                "verification_notes": str(verification.get("verification_notes", "")).strip(),
            })
        elif verification and verification.get("is_valid") is False:
            try:
                typer.secho(f"[jarvis-sec] 验证 Agent 判定 gid={item_gid} 为误报: {verification.get('verification_notes', '')}", fg=typer.colors.BLUE)
            except Exception:
                pass
        else:
            try:
                typer.secho(f"[jarvis-sec] 警告:验证结果中未找到 gid={item_gid},视为验证不通过", fg=typer.colors.YELLOW)
            except Exception:
                pass
    return verified_items
|
|
1381
|
+
|
|
1382
|
+
|
|
1383
|
+
def _process_verification_batch(
    batch: List[Dict],
    bidx: int,
    total_batches: int,
    entry_path: str,
    langs: List[str],
    llm_group: Optional[str],
    status_mgr,
    _progress_append,
    _append_report,
    meta_records: List[Dict],
    gid_counts: Dict[int, int],
    sec_dir,
) -> None:
    """
    Process a single verification batch.

    Runs the analysis agent on the batch and, only when it reported items with
    ``has_risk: true``, a second read-only verification agent. Items confirmed
    by the verifier are appended to the report; progress and status records
    are emitted for the batch and for every candidate in it.

    Args:
        batch: Candidates of the current batch.
        bidx: Batch index (used in task ids, log lines and progress records).
        total_batches: Total number of batches.
        entry_path: Forwarded to the analysis/verification helpers and embedded
            in the verification task text.
        langs: Languages, forwarded to the analysis context and the task text.
        llm_group: Optional model group; passed to both agents when set.
        status_mgr: Object whose ``update_verification`` receives progress.
        _progress_append: Callable receiving progress event dicts.
        _append_report: Callable used to persist the verified items.
        meta_records: Forwarded to ``_run_analysis_agent_with_retry``.
        gid_counts: Updated in place with the number of verified items per gid.
        sec_dir: Passed to ``_count_issues_from_file`` for the running total.
    """
    task_id = f"JARVIS-SEC-Batch-{bidx}"
    batch_file = batch[0].get("file") if batch else None

    # Progress: batch started.
    _progress_append(
        {
            "event": "batch_status",
            "status": "running",
            "batch_id": task_id,
            "batch_index": bidx,
            "total_batches": total_batches,
            "batch_size": len(batch),
            "file": batch_file,
        }
    )
    # Update the verification-stage status.
    status_mgr.update_verification(
        current_batch=bidx,
        total_batches=total_batches,
        batch_id=task_id,
        file_name=batch_file,
        message=f"正在验证批次 {bidx}/{total_batches}"
    )

    # Show progress on the console (best-effort).
    try:
        typer.secho(f"\n[jarvis-sec] 分析批次 {bidx}/{total_batches}: 大小={len(batch)} 文件='{batch_file}'", fg=typer.colors.CYAN)
    except Exception:
        pass

    # Create the analysis agent.
    agent = _create_analysis_agent(task_id, llm_group)

    # Build the task context.
    per_task = _build_analysis_task_context(batch, entry_path, langs)

    # Subscribe to the agent's summary event.
    summary_container = _subscribe_summary_event(agent)

    # Run the analysis agent (with retries).
    summary_items, workspace_restore_info = _run_analysis_agent_with_retry(
        agent, per_task, summary_container, entry_path, task_id, bidx, meta_records
    )

    # Handle the analysis result; ``None`` is treated as a parse failure.
    parse_fail = summary_items is None
    verified_items: List[Dict] = []

    if summary_items:
        # Expand and filter the analysis results.
        items_with_risk, items_without_risk = _expand_and_filter_analysis_results(summary_items)

        # Log the items judged risk-free.
        if items_without_risk:
            try:
                typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 分析 Agent 判定 {len(items_without_risk)} 个候选为无风险(has_risk: false),跳过验证", fg=typer.colors.BLUE)
            except Exception:
                pass

        # Run the verification agent only when the analysis agent found risky items.
        if items_with_risk:
            # Create a verification agent that checks the analysis agent's conclusions.
            verification_system_prompt = """
# 验证 Agent 约束
- 你的核心任务是验证分析 Agent 给出的安全结论是否正确。
- 你需要仔细检查分析 Agent 给出的前置条件、触发路径、后果和建议是否合理、准确。
- 工具优先:使用 read_code 读取目标文件附近源码(行号前后各 ~50 行),必要时用 execute_script 辅助检索。
- 必要时需向上追溯调用者,查看完整的调用路径,以确认分析 Agent 的结论是否成立。
- 禁止修改任何文件或执行写操作命令;仅进行只读分析与读取。
- 每次仅执行一个操作;等待工具结果后再进行下一步。
- **记忆使用**:
- 在验证过程中,充分利用 retrieve_memory 工具检索已有的记忆,特别是分析 Agent 保存的与当前验证函数相关的记忆。
- 这些记忆可能包含函数的分析要点、指针判空情况、输入校验情况、调用路径分析结果等,可以帮助你更准确地验证分析结论。
- 如果发现分析 Agent 的结论与记忆中的信息不一致,需要仔细核实。
- 完成验证后,主输出仅打印结束符 <!!!COMPLETE!!!> ,不需要汇总结果。
""".strip()

            verification_task_id = f"JARVIS-SEC-Verify-Batch-{bidx}"
            verification_agent_kwargs: Dict = dict(
                system_prompt=verification_system_prompt,
                name=verification_task_id,
                auto_complete=True,
                need_summary=True,
                summary_prompt=_build_verification_summary_prompt(),
                non_interactive=True,
                in_multi_agent=False,
                use_methodology=False,
                use_analysis=False,
                plan=False,
                output_handler=[ToolRegistry()],
                disable_file_edit=True,
                use_tools=["read_code", "execute_script", "retrieve_memory"],
            )
            if llm_group:
                verification_agent_kwargs["model_group"] = llm_group
            verification_agent = Agent(**verification_agent_kwargs)

            # Build the verification task context.
            import json as _json3
            verification_task = f"""
# 验证分析结论任务
上下文参数:
- entry_path: {entry_path}
- languages: {langs}

分析 Agent 给出的结论(需要验证,仅包含 has_risk: true 的项目):
{_json3.dumps(items_with_risk, ensure_ascii=False, indent=2)}

请验证上述分析结论是否正确,包括:
1. 前置条件(preconditions)是否合理
2. 触发路径(trigger_path)是否成立
3. 后果(consequences)评估是否准确
4. 建议(suggestions)是否合适

对于每个 gid,请判断分析结论是否正确(is_valid: true/false),并给出验证说明。
""".strip()

            # Subscribe to the verification agent's summary.
            verification_summary_container = _subscribe_summary_event(verification_agent)

            verification_results, verification_parse_error = _run_verification_agent_with_retry(
                verification_agent,
                verification_task,
                _build_verification_summary_prompt(),
                entry_path,
                verification_summary_container,
                bidx,
            )

            # Debug log: describe what the verification agent returned.
            if verification_results is None:
                try:
                    typer.secho(f"[jarvis-sec] 警告:验证 Agent 返回 None,可能解析失败", fg=typer.colors.YELLOW)
                except Exception:
                    pass
            elif not isinstance(verification_results, list):
                try:
                    typer.secho(f"[jarvis-sec] 警告:验证 Agent 返回类型错误,期望 list,实际: {type(verification_results)}", fg=typer.colors.YELLOW)
                except Exception:
                    pass
            elif len(verification_results) == 0:
                try:
                    typer.secho(f"[jarvis-sec] 警告:验证 Agent 返回空列表", fg=typer.colors.YELLOW)
                except Exception:
                    pass
            else:
                try:
                    typer.secho(f"[jarvis-sec] 验证 Agent 返回 {len(verification_results)} 个结果项", fg=typer.colors.BLUE)
                except Exception:
                    pass

            # Filter by verification result: keep only items with is_valid: true.
            if verification_results:
                gid_to_verification = _build_gid_to_verification_mapping(verification_results)

                # Debug log: show which gids were extracted from the verification result.
                if gid_to_verification:
                    try:
                        typer.secho(f"[jarvis-sec] 从验证结果中提取到 {len(gid_to_verification)} 个 gid: {sorted(gid_to_verification.keys())}", fg=typer.colors.BLUE)
                    except Exception:
                        pass
                else:
                    try:
                        typer.secho(f"[jarvis-sec] 警告:验证结果解析成功,但未提取到任何有效的 gid。验证结果: {verification_results}", fg=typer.colors.YELLOW)
                    except Exception:
                        pass

                # Merge the items that passed verification.
                verified_items = _merge_verified_items(items_with_risk, batch, gid_to_verification)
            else:
                # Conservative policy: without a usable verification result nothing is kept.
                typer.secho(f"[jarvis-sec] 警告:验证 Agent 结果解析失败,不保留任何告警(保守策略)", fg=typer.colors.YELLOW)

            # Only verified items are written to the report file.
            if verified_items:
                for item in verified_items:
                    gid = int(item.get("gid", 0))
                    if gid >= 1:
                        gid_counts[gid] = gid_counts.get(gid, 0) + 1
                typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 验证通过: 数量={len(verified_items)}/{len(items_with_risk)} -> 写入文件", fg=typer.colors.GREEN)
                _append_report(verified_items, "verified", task_id, {"batch": True, "candidates": batch})
                # Read the current total back from the file (for status display).
                current_count = _count_issues_from_file(sec_dir)
                status_mgr.update_verification(
                    current_batch=bidx,
                    total_batches=total_batches,
                    issues_found=current_count,
                    message=f"已验证 {bidx}/{total_batches} 批次,发现 {current_count} 个问题(验证通过)"
                )
            else:
                typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 验证后无有效告警: 分析 Agent 发现 {len(items_with_risk)} 个有风险的问题,验证后全部不通过", fg=typer.colors.BLUE)
                current_count = _count_issues_from_file(sec_dir)
                status_mgr.update_verification(
                    current_batch=bidx,
                    total_batches=total_batches,
                    issues_found=current_count,
                    message=f"已验证 {bidx}/{total_batches} 批次,验证后无有效告警"
                )
    elif parse_fail:
        typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 解析失败 (摘要中无 <REPORT> 或字段无效)", fg=typer.colors.YELLOW)
    else:
        typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 未发现问题", fg=typer.colors.BLUE)
        current_count = _count_issues_from_file(sec_dir)
        status_mgr.update_verification(
            current_batch=bidx,
            total_batches=total_batches,
            issues_found=current_count,
            message=f"已验证 {bidx}/{total_batches} 批次"
        )

    # Write a "done" record for every candidate of this batch.
    for c in batch:
        sig = _sig_of(c)
        try:
            c_gid = int(c.get("gid", 0))
        except Exception:
            c_gid = 0
        cnt = gid_counts.get(c_gid, 0)
        _progress_append({
            "event": "task_status",
            "status": "done",
            "task_id": task_id,
            "candidate_signature": sig,
            "candidate": c,
            "issues_count": int(cnt),
            "parse_fail": parse_fail,
            "workspace_restore": workspace_restore_info,
            "batch_index": bidx,
        })

    # Batch finished record.
    _progress_append({
        "event": "batch_status",
        "status": "done",
        "batch_id": task_id,
        "batch_index": bidx,
        "total_batches": total_batches,
        "issues_count": len(verified_items),
        "parse_fail": parse_fail,
    })
|
|
1645
|
+
|
|
1646
|
+
|
|
1647
|
+
def _valid_items(items: Optional[List]) -> bool:
|
|
1648
|
+
"""验证分析结果项的格式"""
|
|
1649
|
+
if not isinstance(items, list):
|
|
1650
|
+
return False
|
|
1651
|
+
for it in items:
|
|
1652
|
+
if not isinstance(it, dict):
|
|
1653
|
+
return False
|
|
1654
|
+
has_gid = "gid" in it
|
|
1655
|
+
has_gids = "gids" in it
|
|
1656
|
+
if not has_gid and not has_gids:
|
|
1657
|
+
return False
|
|
1658
|
+
if has_gid and has_gids:
|
|
1659
|
+
return False
|
|
1660
|
+
if has_gid:
|
|
1661
|
+
try:
|
|
1662
|
+
if int(it["gid"]) < 1:
|
|
1663
|
+
return False
|
|
1664
|
+
except Exception:
|
|
1665
|
+
return False
|
|
1666
|
+
elif has_gids:
|
|
1667
|
+
if not isinstance(it["gids"], list) or len(it["gids"]) == 0:
|
|
1668
|
+
return False
|
|
1669
|
+
for gid_val in it["gids"]:
|
|
1670
|
+
try:
|
|
1671
|
+
if int(gid_val) < 1:
|
|
1672
|
+
return False
|
|
1673
|
+
except Exception:
|
|
1674
|
+
return False
|
|
1675
|
+
if "has_risk" not in it or not isinstance(it["has_risk"], bool):
|
|
1676
|
+
return False
|
|
1677
|
+
if it.get("has_risk"):
|
|
1678
|
+
for key in ["preconditions", "trigger_path", "consequences", "suggestions"]:
|
|
1679
|
+
if key not in it:
|
|
1680
|
+
return False
|
|
1681
|
+
if not isinstance(it[key], str) or not it[key].strip():
|
|
1682
|
+
return False
|
|
1683
|
+
return True
|
|
1684
|
+
|
|
1685
|
+
|
|
1686
|
+
def _is_valid_verification_item(item: Dict) -> bool:
|
|
1687
|
+
"""验证验证结果项的格式"""
|
|
1688
|
+
if not isinstance(item, dict) or "is_valid" not in item:
|
|
1689
|
+
return False
|
|
1690
|
+
has_gid = "gid" in item
|
|
1691
|
+
has_gids = "gids" in item
|
|
1692
|
+
if not has_gid and not has_gids:
|
|
1693
|
+
return False
|
|
1694
|
+
if has_gid and has_gids:
|
|
1695
|
+
return False # gid 和 gids 不能同时出现
|
|
1696
|
+
if has_gid:
|
|
1697
|
+
try:
|
|
1698
|
+
return int(item["gid"]) >= 1
|
|
1699
|
+
except Exception:
|
|
1700
|
+
return False
|
|
1701
|
+
elif has_gids:
|
|
1702
|
+
if not isinstance(item["gids"], list) or len(item["gids"]) == 0:
|
|
1703
|
+
return False
|
|
1704
|
+
try:
|
|
1705
|
+
return all(int(gid_val) >= 1 for gid_val in item["gids"])
|
|
1706
|
+
except Exception:
|
|
1707
|
+
return False
|
|
1708
|
+
return False
|
|
1709
|
+
|
|
1710
|
+
|
|
1711
|
+
def _run_verification_agent_with_retry(
    verification_agent,
    verification_task: str,
    verification_summary_prompt: str,
    entry_path: str,
    verification_summary_container: Dict[str, str],
    bidx: int,
) -> tuple[Optional[List[Dict]], Optional[str]]:
    """Run the verification agent, retrying until its summary is well-formed.

    The first attempt calls ``verification_agent.run``. Every later attempt
    talks to the agent's model directly, sending the task, the previous parse
    error (if any) and the summary prompt. After each attempt the workspace is
    restored via ``_git_restore_if_dirty`` and the text found in
    ``verification_summary_container["text"]`` is parsed.

    Args:
        verification_agent: Agent exposing ``run`` and ``model.chat_until_success``.
        verification_task: Task text given to the agent.
        verification_summary_prompt: Summary prompt appended on direct model
            calls; when empty the prompt is rebuilt with
            ``_build_verification_summary_prompt``.
        entry_path: Path handed to ``_git_restore_if_dirty``.
        verification_summary_container: Dict whose ``"text"`` entry receives the
            summary; it is reset to ``""`` before every attempt.
        bidx: Batch index, used only in log messages.

    Returns:
        ``(results, None)`` where ``results`` is a non-empty list in which every
        entry passes ``_is_valid_verification_item``. The loop has no attempt
        limit, so the function only returns on success.
    """
    use_direct_model_verify = False
    prev_parse_error_verify: Optional[str] = None
    verify_attempt = 0

    while True:
        verify_attempt += 1
        verification_summary_container["text"] = ""

        if use_direct_model_verify:
            # Honour the prompt supplied by the caller; rebuild it only when
            # the caller passed nothing usable.
            verification_summary_prompt_text = (
                verification_summary_prompt or _build_verification_summary_prompt()
            )
            error_guidance = ""
            if prev_parse_error_verify:
                error_guidance = f"\n\n**格式错误详情(请根据以下错误修复输出格式):**\n- YAML解析失败: {prev_parse_error_verify}\n\n请确保输出的YAML格式正确,包括正确的缩进、引号、冒号等。"

            full_verify_prompt = f"{verification_task}{error_guidance}\n\n{verification_summary_prompt_text}"
            try:
                verify_response = verification_agent.model.chat_until_success(full_verify_prompt)  # type: ignore
                verification_summary_container["text"] = verify_response
            except Exception as e:
                try:
                    typer.secho(f"[jarvis-sec] 验证阶段直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
                except Exception:
                    pass
                # Fall back to a full agent run when the direct call fails.
                verification_agent.run(verification_task)
        else:
            verification_agent.run(verification_task)

        # Workspace protection: undo any file changes made during the attempt.
        try:
            _changed_verify = _git_restore_if_dirty(entry_path)
            if _changed_verify:
                try:
                    typer.secho(f"[jarvis-sec] 验证 Agent 工作区已恢复 ({_changed_verify} 个文件)", fg=typer.colors.BLUE)
                except Exception:
                    pass
        except Exception:
            pass

        # Parse the verification result.
        verification_summary_text = verification_summary_container.get("text", "")
        parse_error_verify = None
        if verification_summary_text:
            verification_parsed, parse_error_verify = _try_parse_summary_report(verification_summary_text)
            if parse_error_verify:
                prev_parse_error_verify = parse_error_verify
                try:
                    typer.secho(f"[jarvis-sec] 验证结果YAML解析失败: {parse_error_verify}", fg=typer.colors.YELLOW)
                except Exception:
                    pass
            else:
                prev_parse_error_verify = None
                if isinstance(verification_parsed, list):
                    if verification_parsed and all(_is_valid_verification_item(item) for item in verification_parsed):
                        return verification_parsed, None

        # Format check failed: later retries call the model directly.
        use_direct_model_verify = True
        if parse_error_verify:
            try:
                typer.secho(f"[jarvis-sec] 验证结果YAML解析失败 -> 重试第 {verify_attempt} 次 (批次={bidx},使用直接模型调用,将反馈解析错误)", fg=typer.colors.YELLOW)
            except Exception:
                pass
        else:
            try:
                typer.secho(f"[jarvis-sec] 验证结果格式无效 -> 重试第 {verify_attempt} 次 (批次={bidx},使用直接模型调用)", fg=typer.colors.YELLOW)
            except Exception:
                pass
|
|
1787
|
+
|
|
1788
|
+
|
|
1789
|
+
def run_security_analysis(
    entry_path: str,
    languages: Optional[List[str]] = None,
    llm_group: Optional[str] = None,
    report_file: Optional[str] = None,
    cluster_limit: int = 50,
    exclude_dirs: Optional[List[str]] = None,
) -> str:
    """
    Run the security analysis workflow (hybrid mode).

    A local heuristic scan produces candidates first; the candidates are then
    compacted, clustered into batches and handed to the agent-based
    verification phase, and finally aggregated into one report.

    Any exception raised after the status manager has been created marks the
    status as ``error`` (message plus exception type name) and is re-raised.

    Args:
        entry_path: Root directory to analyse.
        languages: Language extensions to scan; defaults to
            ``["c", "cpp", "h", "hpp", "rs"]`` when empty or ``None``.
        llm_group: Model group name forwarded to the clustering and
            verification phases.
        report_file: Path of the incremental JSONL report. When not given the
            report writer uses ``agent_issues.jsonl`` inside the analysis
            directory returned by ``_initialize_analysis_context``.
        cluster_limit: Forwarded to the clustering phase (documented upstream
            as the maximum number of alerts clustered per batch).
        exclude_dirs: Directories to exclude, forwarded to the heuristic scan.

    Returns:
        The final report produced by ``build_json_and_markdown``.
    """
    langs = languages or ["c", "cpp", "h", "hpp", "rs"]

    # Status manager (structured progress/status file).
    from jarvis.jarvis_sec.status import StatusManager
    status_mgr = StatusManager(entry_path)

    # Try to show the state recorded by a previous run (best-effort).
    try:
        current_status = status_mgr.get_status()
        if current_status:
            stage = current_status.get("stage", "unknown")
            progress = current_status.get("progress", 0)
            message = current_status.get("message", "")
            typer.secho(f"[jarvis-sec] 从状态文件恢复: 阶段={stage}, 进度={progress}%, {message}", fg=typer.colors.BLUE)
    except Exception:
        pass

    # The whole workflow is guarded so that a failure in any phase, not only
    # in the final aggregation, is reflected in the status file.
    try:
        # Initialise the analysis context.
        sec_dir, progress_path, _progress_append, done_sigs = _initialize_analysis_context(
            entry_path, status_mgr
        )

        # 1) Heuristic scan (resumable).
        candidates, summary = _load_or_run_heuristic_scan(
            entry_path, langs, exclude_dirs, sec_dir, status_mgr, _progress_append
        )

        # 2) Compact the candidates into a task list to bound the context size.
        compact_candidates = _prepare_candidates(candidates)

        # Record which file has the most candidates (optional, logging only).
        try:
            groups = _group_candidates_by_file(compact_candidates)
            if groups:
                selected_file, items = max(groups.items(), key=lambda kv: len(kv[1]))
                try:
                    typer.secho(f"[jarvis-sec] 批次选择: 文件={selected_file} 数量={len(items)}", fg=typer.colors.BLUE)
                except Exception:
                    pass
                _progress_append({
                    "event": "batch_selection",
                    "selected_file": selected_file,
                    "selected_count": len(items),
                    "total_in_file": len(items),
                })
        except Exception:
            pass

        # Create the report writer.
        _append_report = _create_report_writer(sec_dir, report_file)

        # 3) Clustering phase.
        cluster_batches, invalid_clusters_for_review = _process_clustering_phase(
            compact_candidates,
            entry_path,
            langs,
            cluster_limit,
            llm_group,
            sec_dir,
            progress_path,
            status_mgr,
            _progress_append,
        )

        # 4) Verification phase.
        # NOTE(review): meta_records is never filled in this function, so the
        # aggregator below always receives meta=None.
        meta_records: List[Dict] = []
        all_issues = _process_verification_phase(
            cluster_batches,
            entry_path,
            langs,
            llm_group,
            sec_dir,
            progress_path,
            status_mgr,
            _progress_append,
            _append_report,
        )

        # 5) Build the final report (JSON + Markdown) with the shared aggregator.
        from jarvis.jarvis_sec.report import build_json_and_markdown
        result = build_json_and_markdown(
            all_issues,
            scanned_root=summary.get("scanned_root"),
            scanned_files=summary.get("scanned_files"),
            meta=meta_records or None,
        )
        # Mark the analysis as completed.
        status_mgr.mark_completed(
            total_issues=len(all_issues),
            message=f"安全分析完成,共发现 {len(all_issues)} 个问题"
        )
        return result
    except Exception as e:
        # Update the status on failure, then let the caller see the error.
        error_msg = str(e)
        status_mgr.mark_error(
            error_message=error_msg,
            error_type=type(e).__name__
        )
        raise
|
|
1925
|
+
|
|
1926
|
+
|
|
1927
|
+
def _group_candidates_by_file(candidates: List[Dict]) -> Dict[str, List[Dict]]:
|
|
1928
|
+
"""按文件分组候选问题"""
|
|
1929
|
+
from collections import defaultdict
|
|
1930
|
+
groups: Dict[str, List[Dict]] = defaultdict(list)
|
|
1931
|
+
for it in candidates:
|
|
1932
|
+
groups[str(it.get("file") or "")].append(it)
|
|
1933
|
+
return groups
|
|
1934
|
+
|
|
1935
|
+
|
|
1936
|
+
def _create_report_writer(sec_dir, report_file):
|
|
1937
|
+
"""创建报告写入函数"""
|
|
1938
|
+
import json
|
|
1939
|
+
from pathlib import Path
|
|
1940
|
+
|
|
1941
|
+
def _append_report(items, source: str, task_id: str, cand: Dict):
|
|
1942
|
+
"""将当前子任务的检测结果追加写入 JSONL 报告文件(每行一个 issue)。仅当 items 非空时写入。"""
|
|
1943
|
+
if not items:
|
|
1944
|
+
return
|
|
1945
|
+
try:
|
|
1946
|
+
path = Path(report_file) if report_file else sec_dir / "agent_issues.jsonl"
|
|
1947
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
1948
|
+
with path.open("a", encoding="utf-8") as f:
|
|
1949
|
+
for item in items:
|
|
1950
|
+
line = json.dumps(item, ensure_ascii=False)
|
|
1951
|
+
f.write(line + "\n")
|
|
1952
|
+
try:
|
|
1953
|
+
typer.secho(f"[jarvis-sec] 已将 {len(items)} 个问题写入 {path}", fg=typer.colors.GREEN)
|
|
1954
|
+
except Exception:
|
|
1955
|
+
pass
|
|
1956
|
+
except Exception:
|
|
1957
|
+
# 报告写入失败不影响主流程
|
|
1958
|
+
pass
|
|
1959
|
+
|
|
1960
|
+
return _append_report
|
|
1961
|
+
|
|
1962
|
+
|
|
1963
|
+
def _sig_of(c: Dict) -> str:
|
|
1964
|
+
"""生成候选问题的签名"""
|
|
1965
|
+
return f"{c.get('language','')}|{c.get('file','')}|{c.get('line','')}|{c.get('pattern','')}"
|
|
1966
|
+
|
|
1967
|
+
|
|
1968
|
+
def _create_signature_function():
    """Return the candidate signature function (deprecated; use ``_sig_of`` directly)."""
    signature_function = _sig_of
    return signature_function
|
|
1971
|
+
|
|
1972
|
+
|
|
1973
|
+
def _parse_clusters_from_text(text: str) -> tuple[Optional[List], Optional[str]]:
|
|
1974
|
+
"""解析聚类文本,返回(解析结果, 错误信息)"""
|
|
1975
|
+
try:
|
|
1976
|
+
start = text.find("<CLUSTERS>")
|
|
1977
|
+
end = text.find("</CLUSTERS>")
|
|
1978
|
+
if start == -1 or end == -1 or end <= start:
|
|
1979
|
+
return None, "未找到 <CLUSTERS> 或 </CLUSTERS> 标签,或标签顺序错误"
|
|
1980
|
+
content = text[start + len("<CLUSTERS>"):end].strip()
|
|
1981
|
+
if not content:
|
|
1982
|
+
return None, "YAML 内容为空"
|
|
1983
|
+
import yaml as _yaml3 # type: ignore
|
|
1984
|
+
try:
|
|
1985
|
+
data = _yaml3.safe_load(content)
|
|
1986
|
+
except Exception as yaml_err:
|
|
1987
|
+
error_msg = f"YAML 解析失败: {str(yaml_err)}"
|
|
1988
|
+
return None, error_msg
|
|
1989
|
+
if isinstance(data, list):
|
|
1990
|
+
return data, None
|
|
1991
|
+
return None, f"YAML 解析结果不是数组,而是 {type(data).__name__}"
|
|
1992
|
+
except Exception as e:
|
|
1993
|
+
return None, f"解析过程发生异常: {str(e)}"
|
|
1994
|
+
|
|
1995
|
+
|
|
1996
|
+
def _create_cluster_snapshot_writer(sec_dir, cluster_records, compact_candidates, _progress_append):
|
|
1997
|
+
"""创建聚类快照写入函数"""
|
|
1998
|
+
def _write_cluster_batch_snapshot(batch_records: List[Dict]):
|
|
1999
|
+
"""写入单个批次的聚类结果,支持增量保存"""
|
|
2000
|
+
try:
|
|
2001
|
+
from pathlib import Path as _Path2
|
|
2002
|
+
import json as _json
|
|
2003
|
+
_cluster_path = sec_dir / "cluster_report.jsonl"
|
|
2004
|
+
_cluster_path.parent.mkdir(parents=True, exist_ok=True)
|
|
2005
|
+
|
|
2006
|
+
# 追加模式,每次只追加当前批次的记录
|
|
2007
|
+
with _cluster_path.open("a", encoding="utf-8") as f:
|
|
2008
|
+
for record in batch_records:
|
|
2009
|
+
f.write(_json.dumps(record, ensure_ascii=False) + "\n")
|
|
2010
|
+
except Exception:
|
|
2011
|
+
pass
|
|
2012
|
+
|
|
2013
|
+
def _write_cluster_report_snapshot():
|
|
2014
|
+
"""写入聚类报告快照"""
|
|
2015
|
+
try:
|
|
2016
|
+
from pathlib import Path as _Path2
|
|
2017
|
+
import json as _json
|
|
2018
|
+
_cluster_path = sec_dir / "cluster_report.jsonl"
|
|
2019
|
+
_cluster_path.parent.mkdir(parents=True, exist_ok=True)
|
|
2020
|
+
|
|
2021
|
+
# 使用追加模式,每次只追加当前批次的记录
|
|
2022
|
+
# 注意:这会导致重复记录,需要在读取时去重
|
|
2023
|
+
with _cluster_path.open("a", encoding="utf-8") as f:
|
|
2024
|
+
for record in cluster_records:
|
|
2025
|
+
f.write(_json.dumps(record, ensure_ascii=False) + "\n")
|
|
2026
|
+
|
|
2027
|
+
_progress_append(
|
|
2028
|
+
{
|
|
2029
|
+
"event": "cluster_report_snapshot",
|
|
2030
|
+
"path": str(_cluster_path),
|
|
2031
|
+
"clusters": len(cluster_records),
|
|
2032
|
+
"total_candidates": len(compact_candidates),
|
|
2033
|
+
}
|
|
2034
|
+
)
|
|
2035
|
+
except Exception:
|
|
2036
|
+
pass
|
|
2037
|
+
|
|
2038
|
+
return _write_cluster_batch_snapshot, _write_cluster_report_snapshot
|
|
2039
|
+
|
|
2040
|
+
|
|
2041
|
+
def _collect_candidate_gids(file_groups: Dict[str, List[Dict]]) -> set:
|
|
2042
|
+
"""收集所有候选的 gid"""
|
|
2043
|
+
all_gids = set()
|
|
2044
|
+
for _file, _items in file_groups.items():
|
|
2045
|
+
for it in _items:
|
|
2046
|
+
try:
|
|
2047
|
+
_gid = int(it.get("gid", 0))
|
|
2048
|
+
if _gid >= 1:
|
|
2049
|
+
all_gids.add(_gid)
|
|
2050
|
+
except Exception:
|
|
2051
|
+
pass
|
|
2052
|
+
return all_gids
|
|
2053
|
+
|
|
2054
|
+
|
|
2055
|
+
def _collect_clustered_gids(cluster_batches: List[List[Dict]], invalid_clusters_for_review: List[Dict]) -> set:
|
|
2056
|
+
"""收集所有已聚类的 gid"""
|
|
2057
|
+
all_clustered_gids = set()
|
|
2058
|
+
for batch in cluster_batches:
|
|
2059
|
+
for item in batch:
|
|
2060
|
+
try:
|
|
2061
|
+
_gid = int(item.get("gid", 0))
|
|
2062
|
+
if _gid >= 1:
|
|
2063
|
+
all_clustered_gids.add(_gid)
|
|
2064
|
+
except Exception:
|
|
2065
|
+
pass
|
|
2066
|
+
# 也收集无效聚类中的 gid(它们已经进入复核流程)
|
|
2067
|
+
for invalid_cluster in invalid_clusters_for_review:
|
|
2068
|
+
gids_list = invalid_cluster.get("gids", [])
|
|
2069
|
+
for _gid in gids_list:
|
|
2070
|
+
try:
|
|
2071
|
+
_gid_int = int(_gid)
|
|
2072
|
+
if _gid_int >= 1:
|
|
2073
|
+
all_clustered_gids.add(_gid_int)
|
|
2074
|
+
except Exception:
|
|
2075
|
+
pass
|
|
2076
|
+
return all_clustered_gids
|
|
2077
|
+
|
|
2078
|
+
|
|
2079
|
+
def _load_processed_gids_from_agent_issues(sec_dir) -> set:
|
|
2080
|
+
"""从 agent_issues.jsonl 读取已处理的 gid"""
|
|
2081
|
+
processed_gids = set()
|
|
2082
|
+
try:
|
|
2083
|
+
from pathlib import Path
|
|
2084
|
+
import json
|
|
2085
|
+
_agent_issues_path = sec_dir / "agent_issues.jsonl"
|
|
2086
|
+
if _agent_issues_path.exists():
|
|
2087
|
+
with _agent_issues_path.open("r", encoding="utf-8", errors="ignore") as f:
|
|
2088
|
+
for line in f:
|
|
2089
|
+
line = line.strip()
|
|
2090
|
+
if not line:
|
|
2091
|
+
continue
|
|
2092
|
+
try:
|
|
2093
|
+
issue_obj = json.loads(line)
|
|
2094
|
+
_gid = int(issue_obj.get("gid", 0))
|
|
2095
|
+
if _gid >= 1:
|
|
2096
|
+
processed_gids.add(_gid)
|
|
2097
|
+
except Exception:
|
|
2098
|
+
pass
|
|
2099
|
+
except Exception:
|
|
2100
|
+
pass
|
|
2101
|
+
return processed_gids
|
|
2102
|
+
|
|
2103
|
+
|
|
2104
|
+
def _load_completed_batch_ids(progress_path) -> set:
|
|
2105
|
+
"""从 progress.jsonl 读取已完成的批次ID"""
|
|
2106
|
+
completed_batch_ids = set()
|
|
2107
|
+
try:
|
|
2108
|
+
import json
|
|
2109
|
+
if progress_path.exists():
|
|
2110
|
+
with progress_path.open("r", encoding="utf-8", errors="ignore") as f:
|
|
2111
|
+
for line in f:
|
|
2112
|
+
line = line.strip()
|
|
2113
|
+
if not line:
|
|
2114
|
+
continue
|
|
2115
|
+
try:
|
|
2116
|
+
obj = json.loads(line)
|
|
2117
|
+
# 检查 batch_status 事件,status 为 "done" 表示批次已完成
|
|
2118
|
+
if obj.get("event") == "batch_status" and obj.get("status") == "done":
|
|
2119
|
+
batch_id = obj.get("batch_id")
|
|
2120
|
+
if batch_id:
|
|
2121
|
+
completed_batch_ids.add(batch_id)
|
|
2122
|
+
except Exception:
|
|
2123
|
+
pass
|
|
2124
|
+
except Exception:
|
|
2125
|
+
pass
|
|
2126
|
+
return completed_batch_ids
|
|
2127
|
+
|
|
2128
|
+
|
|
2129
|
+
def _load_all_issues_from_file(sec_dir) -> List[Dict]:
|
|
2130
|
+
"""从 agent_issues.jsonl 读取所有已保存的告警"""
|
|
2131
|
+
all_issues: List[Dict] = []
|
|
2132
|
+
try:
|
|
2133
|
+
from pathlib import Path
|
|
2134
|
+
import json
|
|
2135
|
+
_agent_issues_path = sec_dir / "agent_issues.jsonl"
|
|
2136
|
+
if _agent_issues_path.exists():
|
|
2137
|
+
saved_gids_from_file = set()
|
|
2138
|
+
with _agent_issues_path.open("r", encoding="utf-8", errors="ignore") as f:
|
|
2139
|
+
for line in f:
|
|
2140
|
+
line = line.strip()
|
|
2141
|
+
if not line:
|
|
2142
|
+
continue
|
|
2143
|
+
try:
|
|
2144
|
+
item = json.loads(line)
|
|
2145
|
+
gid = item.get("gid", 0)
|
|
2146
|
+
if gid >= 1 and gid not in saved_gids_from_file:
|
|
2147
|
+
# 只保留验证通过的告警(has_risk: true 且有 verification_notes)
|
|
2148
|
+
if item.get("has_risk") is True and "verification_notes" in item:
|
|
2149
|
+
all_issues.append(item)
|
|
2150
|
+
saved_gids_from_file.add(gid)
|
|
2151
|
+
except Exception:
|
|
2152
|
+
pass
|
|
2153
|
+
|
|
2154
|
+
if all_issues:
|
|
2155
|
+
try:
|
|
2156
|
+
typer.secho(f"[jarvis-sec] 从 agent_issues.jsonl 加载了 {len(all_issues)} 个已保存的告警", fg=typer.colors.BLUE)
|
|
2157
|
+
except Exception:
|
|
2158
|
+
pass
|
|
2159
|
+
else:
|
|
2160
|
+
try:
|
|
2161
|
+
typer.secho(f"[jarvis-sec] agent_issues.jsonl 不存在,当前运行未发现任何问题", fg=typer.colors.BLUE)
|
|
2162
|
+
except Exception:
|
|
2163
|
+
pass
|
|
2164
|
+
except Exception as e:
|
|
2165
|
+
# 加载失败不影响主流程
|
|
2166
|
+
try:
|
|
2167
|
+
typer.secho(f"[jarvis-sec] 警告:从 agent_issues.jsonl 加载告警失败: {e}", fg=typer.colors.YELLOW)
|
|
2168
|
+
except Exception:
|
|
2169
|
+
pass
|
|
2170
|
+
return all_issues
|
|
2171
|
+
|
|
2172
|
+
|
|
2173
|
+
def _supplement_missing_gids_for_clustering(
|
|
2174
|
+
missing_gids: set,
|
|
2175
|
+
gid_to_candidate: Dict[int, Dict],
|
|
2176
|
+
cluster_batches: List[List[Dict]],
|
|
2177
|
+
_progress_append,
|
|
2178
|
+
processed_gids_from_issues: set,
|
|
2179
|
+
) -> tuple[int, int]:
|
|
2180
|
+
"""为遗漏的 gid 补充聚类,返回(补充数量, 跳过数量)"""
|
|
2181
|
+
supplemented_count = 0
|
|
2182
|
+
skipped_count = 0
|
|
2183
|
+
|
|
2184
|
+
for missing_gid in sorted(missing_gids):
|
|
2185
|
+
# 如果该 gid 已经在 agent_issues.jsonl 中有结果,说明已经验证过了
|
|
2186
|
+
# 不需要重新聚类,但记录一下
|
|
2187
|
+
if missing_gid in processed_gids_from_issues:
|
|
2188
|
+
skipped_count += 1
|
|
2189
|
+
_progress_append({
|
|
2190
|
+
"event": "cluster_missing_gid_skipped",
|
|
2191
|
+
"gid": missing_gid,
|
|
2192
|
+
"note": "已在agent_issues.jsonl中有验证结果,跳过重新处理",
|
|
2193
|
+
"reason": "already_processed",
|
|
2194
|
+
})
|
|
2195
|
+
continue
|
|
2196
|
+
|
|
2197
|
+
# 找到对应的候选
|
|
2198
|
+
missing_item = gid_to_candidate.get(missing_gid)
|
|
2199
|
+
if missing_item:
|
|
2200
|
+
# 为遗漏的 gid 创建默认验证条件
|
|
2201
|
+
default_verification = f"验证候选 {missing_gid} 的安全风险"
|
|
2202
|
+
missing_item["verify"] = default_verification
|
|
2203
|
+
cluster_batches.append([missing_item])
|
|
2204
|
+
supplemented_count += 1
|
|
2205
|
+
_progress_append({
|
|
2206
|
+
"event": "cluster_missing_gid_supplement",
|
|
2207
|
+
"gid": missing_gid,
|
|
2208
|
+
"file": missing_item.get("file"),
|
|
2209
|
+
"note": "分析阶段开始前补充的遗漏gid",
|
|
2210
|
+
})
|
|
2211
|
+
|
|
2212
|
+
return supplemented_count, skipped_count
|
|
2213
|
+
|
|
2214
|
+
|
|
2215
|
+
def _handle_single_alert_file(
    file: str,
    single_item: Dict,
    single_gid: int,
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
    _progress_append,
    _write_cluster_batch_snapshot,
) -> None:
    """Register the only alert of a file without running the clustering agent.

    The alert receives a default ``verify`` text, becomes a one-element batch
    in ``cluster_batches`` and a record with ``batch_index`` 1 in
    ``cluster_records``. A "done/skipped" status is sent to
    ``_progress_append``, every record of this file with batch index 1 is
    handed to ``_write_cluster_batch_snapshot``, and a notice is printed.
    """
    verify_text = f"验证候选 {single_gid} 的安全风险"
    single_item["verify"] = verify_text
    cluster_batches.append([single_item])

    record = {
        "file": file,
        "verification": verify_text,
        "gids": [single_gid],
        "count": 1,
        "batch_index": 1,
        "note": "单告警跳过聚类",
    }
    cluster_records.append(record)

    status = {
        "event": "cluster_status",
        "status": "done",
        "file": file,
        "batch_index": 1,
        "skipped": True,
        "reason": "single_alert",
    }
    _progress_append(status)

    # The snapshot covers every record of this file's batch 1, not only the
    # record appended above.
    snapshot = []
    for existing in cluster_records:
        if existing.get("file") == file and existing.get("batch_index") == 1:
            snapshot.append(existing)
    if snapshot:
        _write_cluster_batch_snapshot(snapshot)

    typer.secho(f"[jarvis-sec] 文件 {file} 仅有一个告警(gid={single_gid}),跳过聚类直接写入", fg=typer.colors.BLUE)
|
|
2255
|
+
|
|
2256
|
+
|
|
2257
|
+
def _validate_cluster_format(cluster_items: List[Dict]) -> tuple[bool, List[str]]:
|
|
2258
|
+
"""验证聚类结果的格式,返回(是否有效, 错误详情列表)"""
|
|
2259
|
+
if not isinstance(cluster_items, list) or not cluster_items:
|
|
2260
|
+
return False, ["结果不是数组或数组为空"]
|
|
2261
|
+
|
|
2262
|
+
error_details = []
|
|
2263
|
+
for idx, it in enumerate(cluster_items):
|
|
2264
|
+
if not isinstance(it, dict):
|
|
2265
|
+
error_details.append(f"元素{idx}不是字典")
|
|
2266
|
+
return False, error_details
|
|
2267
|
+
|
|
2268
|
+
vals = it.get("gids", [])
|
|
2269
|
+
if not isinstance(it.get("verification", ""), str) or not isinstance(vals, list):
|
|
2270
|
+
error_details.append(f"元素{idx}的verification或gids格式错误")
|
|
2271
|
+
return False, error_details
|
|
2272
|
+
|
|
2273
|
+
# 校验 gids 列表中的每个元素是否都是有效的整数
|
|
2274
|
+
if isinstance(vals, list):
|
|
2275
|
+
for gid_idx, gid_val in enumerate(vals):
|
|
2276
|
+
try:
|
|
2277
|
+
gid_int = int(gid_val)
|
|
2278
|
+
if gid_int < 1:
|
|
2279
|
+
error_details.append(f"元素{idx}的gids[{gid_idx}]不是有效的正整数(值为{gid_val})")
|
|
2280
|
+
return False, error_details
|
|
2281
|
+
except (ValueError, TypeError):
|
|
2282
|
+
error_details.append(f"元素{idx}的gids[{gid_idx}]不是有效的整数(值为{gid_val},类型为{type(gid_val).__name__})")
|
|
2283
|
+
return False, error_details
|
|
2284
|
+
|
|
2285
|
+
# 校验 is_invalid 字段(必填)
|
|
2286
|
+
if "is_invalid" not in it:
|
|
2287
|
+
error_details.append(f"元素{idx}缺少is_invalid字段(必填)")
|
|
2288
|
+
return False, error_details
|
|
2289
|
+
|
|
2290
|
+
is_invalid_val = it.get("is_invalid")
|
|
2291
|
+
if not isinstance(is_invalid_val, bool):
|
|
2292
|
+
error_details.append(f"元素{idx}的is_invalid不是布尔值")
|
|
2293
|
+
return False, error_details
|
|
2294
|
+
|
|
2295
|
+
# 如果is_invalid为true,必须提供invalid_reason
|
|
2296
|
+
if is_invalid_val is True:
|
|
2297
|
+
invalid_reason = it.get("invalid_reason", "")
|
|
2298
|
+
if not isinstance(invalid_reason, str) or not invalid_reason.strip():
|
|
2299
|
+
error_details.append(f"元素{idx}的is_invalid为true但缺少invalid_reason字段或理由为空(必填)")
|
|
2300
|
+
return False, error_details
|
|
2301
|
+
|
|
2302
|
+
return True, []
|
|
2303
|
+
|
|
2304
|
+
|
|
2305
|
+
def _extract_classified_gids(cluster_items: List[Dict]) -> set:
|
|
2306
|
+
"""从聚类结果中提取所有已分类的gid
|
|
2307
|
+
|
|
2308
|
+
注意:此函数假设格式验证已经通过,所有gid都是有效的整数。
|
|
2309
|
+
如果遇到格式错误的gid,会记录警告但不会抛出异常(因为格式验证应该已经捕获了这些问题)。
|
|
2310
|
+
"""
|
|
2311
|
+
classified_gids = set()
|
|
2312
|
+
for cl in cluster_items:
|
|
2313
|
+
raw_gids = cl.get("gids", [])
|
|
2314
|
+
if isinstance(raw_gids, list):
|
|
2315
|
+
for x in raw_gids:
|
|
2316
|
+
try:
|
|
2317
|
+
xi = int(x)
|
|
2318
|
+
if xi >= 1:
|
|
2319
|
+
classified_gids.add(xi)
|
|
2320
|
+
except (ValueError, TypeError) as e:
|
|
2321
|
+
# 理论上不应该到达这里(格式验证应该已经捕获),但如果到达了,记录警告
|
|
2322
|
+
try:
|
|
2323
|
+
typer.secho(f"[jarvis-sec] 警告:在提取gid时遇到格式错误(值={x},类型={type(x).__name__}),这不应该发生(格式验证应该已捕获)", fg=typer.colors.YELLOW)
|
|
2324
|
+
except Exception:
|
|
2325
|
+
pass
|
|
2326
|
+
continue
|
|
2327
|
+
return classified_gids
|
|
2328
|
+
|
|
2329
|
+
|
|
2330
|
+
def _build_cluster_retry_task(
|
|
2331
|
+
file: str,
|
|
2332
|
+
missing_gids: set,
|
|
2333
|
+
error_details: List[str],
|
|
2334
|
+
) -> str:
|
|
2335
|
+
"""构建聚类重试任务"""
|
|
2336
|
+
retry_task = f"""
|
|
2337
|
+
# 聚类任务重试
|
|
2338
|
+
文件: {file}
|
|
2339
|
+
|
|
2340
|
+
**重要提示**:请重新输出聚类结果。
|
|
2341
|
+
""".strip()
|
|
2342
|
+
if missing_gids:
|
|
2343
|
+
missing_gids_list = sorted(list(missing_gids))
|
|
2344
|
+
missing_count = len(missing_gids)
|
|
2345
|
+
retry_task += f"\n\n**遗漏的gid(共{missing_count}个,必须被分类):**\n" + ", ".join(str(gid) for gid in missing_gids_list)
|
|
2346
|
+
if error_details:
|
|
2347
|
+
retry_task += f"\n\n**格式错误:**\n" + "\n".join(f"- {detail}" for detail in error_details)
|
|
2348
|
+
return retry_task
|
|
2349
|
+
|
|
2350
|
+
|
|
2351
|
+
def _build_cluster_error_guidance(
|
|
2352
|
+
error_details: List[str],
|
|
2353
|
+
missing_gids: set,
|
|
2354
|
+
) -> str:
|
|
2355
|
+
"""构建聚类错误指导信息"""
|
|
2356
|
+
error_guidance = ""
|
|
2357
|
+
if error_details:
|
|
2358
|
+
error_guidance = f"\n\n**格式错误详情(请根据以下错误修复输出格式):**\n" + "\n".join(f"- {detail}" for detail in error_details)
|
|
2359
|
+
if missing_gids:
|
|
2360
|
+
missing_gids_list = sorted(list(missing_gids))
|
|
2361
|
+
missing_count = len(missing_gids)
|
|
2362
|
+
error_guidance += f"\n\n**完整性错误:遗漏了 {missing_count} 个 gid,这些 gid 必须被分类:**\n" + ", ".join(str(gid) for gid in missing_gids_list)
|
|
2363
|
+
return error_guidance
|
|
2364
|
+
|
|
2365
|
+
|
|
2366
|
+
def _run_cluster_agent_direct_model(
    cluster_agent,
    cluster_task: str,
    cluster_summary_prompt: str,
    file: str,
    missing_gids: set,
    error_details: List[str],
    _cluster_summary: Dict[str, str],
) -> None:
    """Retry clustering by prompting the agent's model directly.

    The prompt is the retry task, the error guidance and the summary prompt
    joined together. The model's reply is stored in
    ``_cluster_summary["text"]``. If the direct call raises, a warning is
    printed (best effort) and ``cluster_agent.run(cluster_task)`` is executed
    instead.
    """
    task_part = _build_cluster_retry_task(file, missing_gids, error_details)
    guidance_part = _build_cluster_error_guidance(error_details, missing_gids)
    full_prompt = f"{task_part}{guidance_part}\n\n{cluster_summary_prompt}"

    try:
        _cluster_summary["text"] = cluster_agent.model.chat_until_success(full_prompt)  # type: ignore
    except Exception as exc:
        try:
            typer.secho(f"[jarvis-sec] 直接模型调用失败: {exc},回退到 run()", fg=typer.colors.YELLOW)
        except Exception:
            # Failing to print the warning must not block the fallback.
            pass
        cluster_agent.run(cluster_task)
|
|
2388
|
+
|
|
2389
|
+
|
|
2390
|
+
def _validate_cluster_result(
    cluster_items: Optional[List[Dict]],
    parse_error: Optional[str],
    attempt: int,
) -> tuple[bool, List[str]]:
    """Check one clustering attempt and report why it is unusable.

    A truthy ``parse_error`` short-circuits to a failure carrying that error.
    Otherwise the items go through ``_validate_cluster_format``. A warning is
    printed whenever the attempt is rejected.

    Returns:
        ``(valid, error_details)``.
    """
    if parse_error:
        problems = [f"YAML解析失败: {parse_error}"]
        typer.secho(f"[jarvis-sec] YAML解析失败: {parse_error}", fg=typer.colors.YELLOW)
        return False, problems

    is_ok, problems = _validate_cluster_format(cluster_items)
    if not is_ok:
        typer.secho(f"[jarvis-sec] 聚类结果格式无效({'; '.join(problems)}),重试第 {attempt} 次(使用直接模型调用)", fg=typer.colors.YELLOW)
    return is_ok, problems
|
|
2405
|
+
|
|
2406
|
+
|
|
2407
|
+
def _check_cluster_completeness(
    cluster_items: List[Dict],
    input_gids: set,
    attempt: int,
) -> tuple[bool, set]:
    """Verify that the clusters mention every input gid.

    Returns:
        ``(True, set())`` when nothing is missing, otherwise
        ``(False, missing_gids)``. A status line is printed in both cases.
    """
    missing_gids = input_gids - _extract_classified_gids(cluster_items)

    if missing_gids:
        ordered_missing = sorted(missing_gids)
        total_missing = len(missing_gids)
        typer.secho(f"[jarvis-sec] 聚类完整性校验失败:遗漏的gid: {ordered_missing}({total_missing}个),重试第 {attempt} 次(使用直接模型调用)", fg=typer.colors.YELLOW)
        return False, missing_gids

    typer.secho(f"[jarvis-sec] 聚类完整性校验通过,所有gid已分类(共尝试 {attempt} 次)", fg=typer.colors.GREEN)
    return True, set()
|
|
2423
|
+
|
|
2424
|
+
|
|
2425
|
+
def _run_cluster_agent_with_retry(
    cluster_agent,
    cluster_task: str,
    cluster_summary_prompt: str,
    input_gids: set,
    file: str,
    _cluster_summary: Dict[str, str],
) -> tuple[Optional[List[Dict]], Optional[str]]:
    """Run the clustering agent, retrying until every input gid is classified.

    The first attempt calls ``cluster_agent.run``. Every later attempt goes
    through ``_run_cluster_agent_direct_model`` and is given the format
    errors and missing gids of the previous attempt.

    The loop has no attempt limit: the only exit visible here is a result
    that passes both the format check and the completeness check, so the
    second element of the returned tuple is always ``None``.

    Returns:
        ``(cluster_items, None)`` for the first valid and complete result.
    """
    _attempt = 0
    use_direct_model = False
    error_details: List[str] = []
    missing_gids = set()

    while True:
        _attempt += 1
        # Clear the previous attempt's summary text before running again.
        # NOTE(review): presumably a summary subscriber set up by the caller
        # fills "text" during run() - confirm.
        _cluster_summary["text"] = ""

        if use_direct_model:
            _run_cluster_agent_direct_model(
                cluster_agent,
                cluster_task,
                cluster_summary_prompt,
                file,
                missing_gids,
                error_details,
                _cluster_summary,
            )
        else:
            # First attempt: let the agent run in full (it may use tools).
            cluster_agent.run(cluster_task)

        cluster_items, parse_error = _parse_clusters_from_text(_cluster_summary.get("text", ""))

        # Structural validation.
        valid, error_details = _validate_cluster_result(cluster_items, parse_error, _attempt)

        # Completeness validation: every input gid must be classified.
        missing_gids = set()
        if valid and cluster_items:
            is_complete, missing_gids = _check_cluster_completeness(cluster_items, input_gids, _attempt)
            if is_complete:
                return cluster_items, None
            else:
                use_direct_model = True
                valid = False

        if not valid:
            # Every retry after a failure uses the direct model call.
            use_direct_model = True
            cluster_items = None
|
|
2475
|
+
|
|
2476
|
+
|
|
2477
|
+
def _process_cluster_results(
|
|
2478
|
+
cluster_items: List[Dict],
|
|
2479
|
+
pending_in_file_with_ids: List[Dict],
|
|
2480
|
+
file: str,
|
|
2481
|
+
chunk_idx: int,
|
|
2482
|
+
cluster_batches: List[List[Dict]],
|
|
2483
|
+
cluster_records: List[Dict],
|
|
2484
|
+
invalid_clusters_for_review: List[Dict],
|
|
2485
|
+
_progress_append,
|
|
2486
|
+
) -> tuple[int, int]:
|
|
2487
|
+
"""处理聚类结果,返回(有效聚类数, 无效聚类数)"""
|
|
2488
|
+
gid_to_item: Dict[int, Dict] = {}
|
|
2489
|
+
try:
|
|
2490
|
+
for it in pending_in_file_with_ids:
|
|
2491
|
+
try:
|
|
2492
|
+
_gid = int(it.get("gid", 0))
|
|
2493
|
+
if _gid >= 1:
|
|
2494
|
+
gid_to_item[_gid] = it
|
|
2495
|
+
except Exception:
|
|
2496
|
+
pass
|
|
2497
|
+
except Exception:
|
|
2498
|
+
gid_to_item = {}
|
|
2499
|
+
|
|
2500
|
+
_merged_count = 0
|
|
2501
|
+
_invalid_count = 0
|
|
2502
|
+
classified_gids_final = set()
|
|
2503
|
+
|
|
2504
|
+
for cl in cluster_items:
|
|
2505
|
+
verification = str(cl.get("verification", "")).strip()
|
|
2506
|
+
raw_gids = cl.get("gids", [])
|
|
2507
|
+
is_invalid = cl["is_invalid"]
|
|
2508
|
+
norm_keys: List[int] = []
|
|
2509
|
+
if isinstance(raw_gids, list):
|
|
2510
|
+
for x in raw_gids:
|
|
2511
|
+
try:
|
|
2512
|
+
xi = int(x)
|
|
2513
|
+
if xi >= 1:
|
|
2514
|
+
norm_keys.append(xi)
|
|
2515
|
+
classified_gids_final.add(xi)
|
|
2516
|
+
except Exception:
|
|
2517
|
+
pass
|
|
2518
|
+
|
|
2519
|
+
members: List[Dict] = []
|
|
2520
|
+
for k in norm_keys:
|
|
2521
|
+
it = gid_to_item.get(k)
|
|
2522
|
+
if it:
|
|
2523
|
+
it["verify"] = verification
|
|
2524
|
+
members.append(it)
|
|
2525
|
+
|
|
2526
|
+
# 如果标记为无效,收集到复核列表
|
|
2527
|
+
if is_invalid:
|
|
2528
|
+
_invalid_count += 1
|
|
2529
|
+
invalid_gids = [m.get("gid") for m in members]
|
|
2530
|
+
invalid_reason = str(cl.get("invalid_reason", "")).strip()
|
|
2531
|
+
try:
|
|
2532
|
+
typer.secho(f"[jarvis-sec] 聚类阶段判定为无效(gids={invalid_gids}),将提交复核Agent验证", fg=typer.colors.BLUE)
|
|
2533
|
+
except Exception:
|
|
2534
|
+
pass
|
|
2535
|
+
invalid_clusters_for_review.append({
|
|
2536
|
+
"file": file,
|
|
2537
|
+
"batch_index": chunk_idx,
|
|
2538
|
+
"gids": invalid_gids,
|
|
2539
|
+
"verification": verification,
|
|
2540
|
+
"invalid_reason": invalid_reason,
|
|
2541
|
+
"members": members,
|
|
2542
|
+
"count": len(members),
|
|
2543
|
+
})
|
|
2544
|
+
_progress_append({
|
|
2545
|
+
"event": "cluster_invalid",
|
|
2546
|
+
"file": file,
|
|
2547
|
+
"batch_index": chunk_idx,
|
|
2548
|
+
"gids": invalid_gids,
|
|
2549
|
+
"verification": verification,
|
|
2550
|
+
"count": len(members),
|
|
2551
|
+
})
|
|
2552
|
+
cluster_records.append({
|
|
2553
|
+
"file": file,
|
|
2554
|
+
"verification": verification,
|
|
2555
|
+
"gids": invalid_gids,
|
|
2556
|
+
"count": len(members),
|
|
2557
|
+
"batch_index": chunk_idx,
|
|
2558
|
+
"is_invalid": True,
|
|
2559
|
+
"invalid_reason": invalid_reason,
|
|
2560
|
+
})
|
|
2561
|
+
elif members:
|
|
2562
|
+
_merged_count += 1
|
|
2563
|
+
cluster_batches.append(members)
|
|
2564
|
+
cluster_records.append({
|
|
2565
|
+
"file": file,
|
|
2566
|
+
"verification": verification,
|
|
2567
|
+
"gids": [m.get("gid") for m in members],
|
|
2568
|
+
"count": len(members),
|
|
2569
|
+
"batch_index": chunk_idx,
|
|
2570
|
+
"is_invalid": False,
|
|
2571
|
+
})
|
|
2572
|
+
|
|
2573
|
+
return _merged_count, _invalid_count
|
|
2574
|
+
|
|
2575
|
+
|
|
2576
|
+
def _supplement_missing_gids(
|
|
2577
|
+
missing_gids_final: set,
|
|
2578
|
+
gid_to_item: Dict[int, Dict],
|
|
2579
|
+
file: str,
|
|
2580
|
+
chunk_idx: int,
|
|
2581
|
+
cluster_batches: List[List[Dict]],
|
|
2582
|
+
cluster_records: List[Dict],
|
|
2583
|
+
) -> int:
|
|
2584
|
+
"""为遗漏的gid创建单独聚类,返回补充的聚类数"""
|
|
2585
|
+
supplemented_count = 0
|
|
2586
|
+
for missing_gid in sorted(missing_gids_final):
|
|
2587
|
+
missing_item = gid_to_item.get(missing_gid)
|
|
2588
|
+
if missing_item:
|
|
2589
|
+
default_verification = f"验证候选 {missing_gid} 的安全风险"
|
|
2590
|
+
missing_item["verify"] = default_verification
|
|
2591
|
+
cluster_batches.append([missing_item])
|
|
2592
|
+
cluster_records.append({
|
|
2593
|
+
"file": file,
|
|
2594
|
+
"verification": default_verification,
|
|
2595
|
+
"gids": [missing_gid],
|
|
2596
|
+
"count": 1,
|
|
2597
|
+
"batch_index": chunk_idx,
|
|
2598
|
+
"note": "完整性校验补充的遗漏gid",
|
|
2599
|
+
})
|
|
2600
|
+
supplemented_count += 1
|
|
2601
|
+
return supplemented_count
|
|
2602
|
+
|
|
2603
|
+
|
|
2604
|
+
def _get_cluster_system_prompt() -> str:
    """Return the system prompt for the clustering agent.

    The prompt (written in Chinese) tells the agent to group the heuristic
    candidates of one file into clusters that can be verified together, to
    classify every gid, to stay read-only, to mark a cluster invalid only
    when that is certain, and to use the memory tools.
    """
    # NOTE(review): any leading indentation of the prompt lines (for example
    # nested bullets) was not visible in the reviewed copy of this file -
    # confirm the literal below against the packaged source.
    return """
# 单Agent聚类约束
- 你的任务是对同一文件内的启发式候选进行聚类,将可以一起验证的问题归为一类。
- **聚类原则**:
- 可以一起验证的问题归为一类,不一定是验证条件完全一致才能归为一类。
- 如果多个候选问题可以通过同一个验证过程来确认,即使它们的验证条件略有不同,也可以归为一类。
- 例如:多个指针解引用问题可以归为一类(验证"指针在解引用前非空"),即使它们涉及不同的指针变量。
- 例如:多个缓冲区操作问题可以归为一类(验证"拷贝长度不超过目标缓冲区容量"),即使它们涉及不同的缓冲区。
- 验证条件:为了确认是否存在漏洞需要成立/验证的关键前置条件。例如:"指针p在解引用前非空""拷贝长度不超过目标缓冲区容量"等。
- **完整性要求**:每个gid都必须出现在某个类别中,不能遗漏任何一个gid。所有输入的gid都必须被分类。
- 工具优先:如需核对上下文,可使用 read_code 读取相邻代码;避免过度遍历。
- 禁止写操作;仅只读分析。
- **重要:关于无效判断的保守策略**:
- 在判断候选是否无效时,必须充分考虑所有可能的路径、调用链和边界情况。
- 必须考虑:所有可能的调用者、所有可能的输入来源、所有可能的执行路径、所有可能的边界条件。
- 只要存在任何可能性(即使很小)导致漏洞可被触发,就不应该标记为无效(is_invalid: false)。
- 只有在完全确定、没有任何可能性、所有路径都已验证安全的情况下,才能标记为无效(is_invalid: true)。
- 保守原则:有疑问时,一律标记为 false(需要进入后续验证阶段),让分析Agent和验证Agent进行更深入的分析。
- 不要因为看到局部有保护措施就认为无效,要考虑是否有其他调用路径绕过这些保护。
- 不要因为看到某些调用者已做校验就认为无效,要考虑是否有其他调用者未做校验。
- **记忆使用**:
- 在聚类过程中,充分利用 retrieve_memory 工具检索已有的记忆,特别是与当前文件或函数相关的记忆。
- 如果有必要,使用 save_memory 工具保存聚类过程中发现的函数或代码片段的要点,使用函数名或文件名作为 tag。
- 记忆内容示例:某个函数的指针已经判空、某个函数已有输入校验、某个代码片段的上下文信息等。
- 这些记忆可以帮助后续的分析Agent和验证Agent更高效地工作。
""".strip()
|
|
2632
|
+
|
|
2633
|
+
|
|
2634
|
+
def _get_cluster_summary_prompt() -> str:
    """Return the summary prompt that defines the clustering output format.

    The prompt (written in Chinese) asks for a YAML array inside
    ``<CLUSTERS>``/``</CLUSTERS>`` whose elements carry ``verification``,
    ``gids``, ``is_invalid`` and, when ``is_invalid`` is true,
    ``invalid_reason``. It also repeats the completeness and
    conservative-judgement rules.
    """
    # NOTE(review): any leading indentation of the prompt lines (nested
    # bullets, and the keys of the YAML example at the end) was not visible
    # in the reviewed copy of this file - confirm the literal below against
    # the packaged source.
    return """
请仅在 <CLUSTERS> 与 </CLUSTERS> 中输出 YAML 数组:
- 每个元素包含(所有字段均为必填):
- verification: 字符串(对该聚类的验证条件描述,简洁明确,可直接用于后续Agent验证)
- gids: 整数数组(候选的全局唯一编号;输入JSON每个元素含 gid,可直接对应填入)
- is_invalid: 布尔值(必填,true 或 false)。如果为 true,表示该聚类中的所有候选已被确认为无效/误报,将不会进入后续验证阶段;如果为 false,表示该聚类中的候选需要进入后续验证阶段。
- invalid_reason: 字符串(当 is_invalid 为 true 时必填,当 is_invalid 为 false 时可省略)。必须详细说明为什么这些候选是无效的,包括:
* 已检查的所有调用路径和调用者
* 已确认的保护措施和校验逻辑
* 为什么这些保护措施在所有路径上都有效
* 为什么不存在任何可能的触发路径
* 必须足够详细,以便复核Agent能够验证你的判断
- 要求:
- 严格要求:仅输出位于 <CLUSTERS> 与 </CLUSTERS> 间的 YAML 数组,其他位置不输出任何文本
- **完整性要求(最重要)**:输入JSON中的所有gid都必须被分类,不能遗漏任何一个gid。所有gid必须出现在某个聚类的gids数组中。这是强制要求,必须严格遵守。
- **聚类原则**:可以一起验证的问题归为一类,不一定是验证条件完全一致才能归为一类。如果多个候选问题可以通过同一个验证过程来确认,即使它们的验证条件略有不同,也可以归为一类。
- **必须要求**:每个聚类元素必须包含 is_invalid 字段,且值必须为 true 或 false,不能省略。
- **必须要求**:当 is_invalid 为 true 时,必须提供 invalid_reason 字段,且理由必须充分详细。
- 不需要解释与长文本,仅给出可执行的验证条件短句
- 若无法聚类,请将每个候选单独成组,verification 为该候选的最小确认条件
- **关于 is_invalid 的保守判断原则**:
- 必须充分考虑所有可能的路径、调用链、输入来源和边界情况。
- 只要存在任何可能性(即使很小)导致漏洞可被触发,必须设置 is_invalid: false。
- 只有在完全确定、没有任何可能性、所有路径都已验证安全的情况下,才能设置 is_invalid: true。
- 保守策略:有疑问时,一律设置为 false,让后续的分析Agent和验证Agent进行更深入的分析。
- 不要因为局部有保护措施就设置为 true,要考虑是否有其他路径绕过保护。
- 不要因为某些调用者已做校验就设置为 true,要考虑是否有其他调用者未做校验。
- 如果设置为 true,必须在 invalid_reason 中详细说明已检查的所有路径和原因。
<CLUSTERS>
- verification: ""
gids: []
is_invalid: false
</CLUSTERS>
""".strip()
|
|
2670
|
+
|
|
2671
|
+
|
|
2672
|
+
def _create_cluster_agent(
    file: str,
    chunk_idx: int,
    llm_group: Optional[str],
) -> Agent:
    """Build the agent that clusters one batch of candidates of ``file``.

    The agent is created with the clustering system and summary prompts, a
    name derived from ``file`` and ``chunk_idx``, file editing disabled and
    the tool list shown below. ``model_group`` is passed only when
    ``llm_group`` is truthy.
    """
    agent_options: Dict = {
        "system_prompt": _get_cluster_system_prompt(),
        "name": f"JARVIS-SEC-Cluster::{file}::batch{chunk_idx}",
        "auto_complete": True,
        "need_summary": True,
        "summary_prompt": _get_cluster_summary_prompt(),
        "non_interactive": True,
        "in_multi_agent": False,
        "use_methodology": False,
        "use_analysis": False,
        "plan": False,
        "output_handler": [ToolRegistry()],
        "disable_file_edit": True,
        "use_tools": ["read_code", "execute_script", "save_memory", "retrieve_memory"],
    }
    if llm_group:
        agent_options["model_group"] = llm_group
    return Agent(**agent_options)
|
|
2699
|
+
|
|
2700
|
+
|
|
2701
|
+
def _build_cluster_task(
|
|
2702
|
+
pending_in_file_with_ids: List[Dict],
|
|
2703
|
+
entry_path: str,
|
|
2704
|
+
file: str,
|
|
2705
|
+
langs: List[str],
|
|
2706
|
+
) -> str:
|
|
2707
|
+
"""构建聚类任务上下文"""
|
|
2708
|
+
import json as _json2
|
|
2709
|
+
return f"""
|
|
2710
|
+
# 聚类任务(分析输入)
|
|
2711
|
+
上下文:
|
|
2712
|
+
- entry_path: {entry_path}
|
|
2713
|
+
- file: {file}
|
|
2714
|
+
- languages: {langs}
|
|
2715
|
+
|
|
2716
|
+
候选(JSON数组,包含 gid/file/line/pattern/category/evidence):
|
|
2717
|
+
{_json2.dumps(pending_in_file_with_ids, ensure_ascii=False, indent=2)}
|
|
2718
|
+
""".strip()
|
|
2719
|
+
|
|
2720
|
+
|
|
2721
|
+
def _extract_input_gids(pending_in_file_with_ids: List[Dict]) -> set:
|
|
2722
|
+
"""从待聚类项中提取gid集合"""
|
|
2723
|
+
input_gids = set()
|
|
2724
|
+
for it in pending_in_file_with_ids:
|
|
2725
|
+
try:
|
|
2726
|
+
_gid = int(it.get("gid", 0))
|
|
2727
|
+
if _gid >= 1:
|
|
2728
|
+
input_gids.add(_gid)
|
|
2729
|
+
except Exception:
|
|
2730
|
+
pass
|
|
2731
|
+
return input_gids
|
|
2732
|
+
|
|
2733
|
+
|
|
2734
|
+
def _build_gid_to_item_mapping(pending_in_file_with_ids: List[Dict]) -> Dict[int, Dict]:
|
|
2735
|
+
"""构建gid到项的映射"""
|
|
2736
|
+
gid_to_item: Dict[int, Dict] = {}
|
|
2737
|
+
try:
|
|
2738
|
+
for it in pending_in_file_with_ids:
|
|
2739
|
+
try:
|
|
2740
|
+
_gid = int(it.get("gid", 0))
|
|
2741
|
+
if _gid >= 1:
|
|
2742
|
+
gid_to_item[_gid] = it
|
|
2743
|
+
except Exception:
|
|
2744
|
+
pass
|
|
2745
|
+
except Exception:
|
|
2746
|
+
pass
|
|
2747
|
+
return gid_to_item
|
|
2748
|
+
|
|
2749
|
+
|
|
2750
|
+
def _process_cluster_chunk(
    chunk: List[Dict],
    chunk_idx: int,
    file: str,
    entry_path: str,
    langs: List[str],
    llm_group: Optional[str],
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
    invalid_clusters_for_review: List[Dict],
    _progress_append,
    _write_cluster_batch_snapshot,
) -> None:
    """Cluster one batch of candidates of a file and persist the outcome.

    Steps, in order: report the batch as running, create the clustering
    agent, build its task, subscribe to its summary, run it with retries,
    convert the returned clusters into batches/records/review entries, give
    any still-unclassified gid a cluster of its own, report the batch as
    done, and hand this batch's records to ``_write_cluster_batch_snapshot``.

    An empty ``chunk`` returns immediately without reporting anything.
    """
    if not chunk:
        return

    pending_in_file_with_ids = list(chunk)

    # Record the start of this clustering batch.
    _progress_append({
        "event": "cluster_status",
        "status": "running",
        "file": file,
        "batch_index": chunk_idx,
        "total_in_batch": len(pending_in_file_with_ids),
    })

    # Create the clustering agent.
    cluster_agent = _create_cluster_agent(file, chunk_idx, llm_group)

    # Build the task context.
    cluster_task = _build_cluster_task(pending_in_file_with_ids, entry_path, file, langs)

    # Subscribe to the agent's summary event.
    # NOTE(review): presumably returns the dict whose "text" entry receives
    # the agent summary - the helper is not visible here, confirm.
    cluster_summary = _subscribe_summary_event(cluster_agent)

    # Extract the gids that must all be classified.
    input_gids = _extract_input_gids(pending_in_file_with_ids)

    # Run the clustering agent (retries until valid and complete).
    cluster_summary_prompt = _get_cluster_summary_prompt()
    cluster_items, parse_error = _run_cluster_agent_with_retry(
        cluster_agent,
        cluster_task,
        cluster_summary_prompt,
        input_gids,
        file,
        cluster_summary,
    )

    # Process the clustering result.
    _merged_count = 0
    _invalid_count = 0

    if isinstance(cluster_items, list) and cluster_items:
        gid_to_item = _build_gid_to_item_mapping(pending_in_file_with_ids)

        _merged_count, _invalid_count = _process_cluster_results(
            cluster_items,
            pending_in_file_with_ids,
            file,
            chunk_idx,
            cluster_batches,
            cluster_records,
            invalid_clusters_for_review,
            _progress_append,
        )

        # Safety net: any gid the clusters still do not mention gets a
        # single-item cluster of its own.
        classified_gids_final = _extract_classified_gids(cluster_items)
        missing_gids_final = input_gids - classified_gids_final
        if missing_gids_final:
            typer.secho(f"[jarvis-sec] 警告:仍有遗漏的gid {sorted(list(missing_gids_final))},将为每个遗漏的gid创建单独聚类", fg=typer.colors.YELLOW)
            supplemented_count = _supplement_missing_gids(
                missing_gids_final,
                gid_to_item,
                file,
                chunk_idx,
                cluster_batches,
                cluster_records,
            )
            _merged_count += supplemented_count
    else:
        # Empty or None clustering result: conservatively create a separate
        # cluster for every input gid.
        if pending_in_file_with_ids:
            typer.secho(f"[jarvis-sec] 警告:聚类结果为空或None(文件={file},批次={chunk_idx}),为所有gid创建单独聚类", fg=typer.colors.YELLOW)
            gid_to_item_fallback = _build_gid_to_item_mapping(pending_in_file_with_ids)

            _merged_count = _supplement_missing_gids(
                input_gids,
                gid_to_item_fallback,
                file,
                chunk_idx,
                cluster_batches,
                cluster_records,
            )
            _invalid_count = 0
        else:
            _merged_count = 0
            _invalid_count = 0

    # Mark the clustering batch as finished.
    _progress_append({
        "event": "cluster_status",
        "status": "done",
        "file": file,
        "batch_index": chunk_idx,
        "clusters_count": _merged_count,
        "invalid_clusters_count": _invalid_count,
    })
    if _invalid_count > 0:
        try:
            typer.secho(f"[jarvis-sec] 聚类批次完成: 有效聚类={_merged_count},无效聚类={_invalid_count}(已跳过)", fg=typer.colors.GREEN)
        except Exception:
            pass

    # Write out the records that belong to the current batch.
    current_batch_records = [
        rec for rec in cluster_records
        if rec.get("file") == file and rec.get("batch_index") == chunk_idx
    ]
    if current_batch_records:
        _write_cluster_batch_snapshot(current_batch_records)
|
|
2873
|
+
|
|
2874
|
+
|
|
2875
|
+
def _filter_pending_items(items: List[Dict], clustered_gids: set) -> List[Dict]:
|
|
2876
|
+
"""过滤出待聚类的项"""
|
|
2877
|
+
pending_in_file: List[Dict] = []
|
|
2878
|
+
for c in items:
|
|
2879
|
+
try:
|
|
2880
|
+
_gid = int(c.get("gid", 0))
|
|
2881
|
+
if _gid >= 1 and _gid not in clustered_gids:
|
|
2882
|
+
pending_in_file.append(c)
|
|
2883
|
+
except Exception:
|
|
2884
|
+
pass
|
|
2885
|
+
return pending_in_file
|
|
2886
|
+
|
|
2887
|
+
|
|
2888
|
+
def _process_file_clustering(
    file: str,
    items: List[Dict],
    clustered_gids: set,
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
    invalid_clusters_for_review: List[Dict],
    entry_path: str,
    langs: List[str],
    cluster_limit: int,
    llm_group: Optional[str],
    _progress_append,
    _write_cluster_batch_snapshot,
) -> None:
    """Cluster all not-yet-clustered alerts of a single file.

    Alerts whose gid is already in ``clustered_gids`` are ignored. A file
    with exactly one remaining alert bypasses the agent and is written
    directly. Otherwise the alerts are split into consecutive batches of at
    most ``cluster_limit`` items (50 when the limit is not a positive int)
    and each batch is processed in turn, numbered from 1.
    """
    pending = _filter_pending_items(items, clustered_gids)
    if not pending:
        return

    # Shortcut: a lone alert needs no clustering.
    if len(pending) == 1:
        only_item = pending[0]
        _handle_single_alert_file(
            file,
            only_item,
            only_item.get("gid", 0),
            cluster_batches,
            cluster_records,
            _progress_append,
            _write_cluster_batch_snapshot,
        )
        return

    if isinstance(cluster_limit, int) and cluster_limit > 0:
        batch_size = cluster_limit
    else:
        batch_size = 50

    batch_no = 0
    for start in range(0, len(pending), batch_size):
        batch_no += 1
        _process_cluster_chunk(
            pending[start:start + batch_size],
            batch_no,
            file,
            entry_path,
            langs,
            llm_group,
            cluster_batches,
            cluster_records,
            invalid_clusters_for_review,
            _progress_append,
            _write_cluster_batch_snapshot,
        )
|
|
2942
|
+
|
|
2943
|
+
|
|
2944
|
+
def _is_valid_review_item(item: Dict) -> bool:
|
|
2945
|
+
"""验证复核结果项的格式"""
|
|
2946
|
+
if not isinstance(item, dict) or "is_reason_sufficient" not in item:
|
|
2947
|
+
return False
|
|
2948
|
+
has_gid = "gid" in item
|
|
2949
|
+
has_gids = "gids" in item
|
|
2950
|
+
if not has_gid and not has_gids:
|
|
2951
|
+
return False
|
|
2952
|
+
if has_gid and has_gids:
|
|
2953
|
+
return False # gid 和 gids 不能同时出现
|
|
2954
|
+
if has_gid:
|
|
2955
|
+
try:
|
|
2956
|
+
return int(item["gid"]) >= 1
|
|
2957
|
+
except Exception:
|
|
2958
|
+
return False
|
|
2959
|
+
elif has_gids:
|
|
2960
|
+
if not isinstance(item["gids"], list) or len(item["gids"]) == 0:
|
|
2961
|
+
return False
|
|
2962
|
+
try:
|
|
2963
|
+
return all(int(gid_val) >= 1 for gid_val in item["gids"])
|
|
2964
|
+
except Exception:
|
|
2965
|
+
return False
|
|
2966
|
+
return False
|
|
2967
|
+
|
|
2968
|
+
|
|
2969
|
+
def _build_gid_to_review_mapping(review_results: List[Dict]) -> Dict[int, Dict]:
|
|
2970
|
+
"""构建gid到复核结果的映射(支持 gid 和 gids 两种格式)"""
|
|
2971
|
+
gid_to_review: Dict[int, Dict] = {}
|
|
2972
|
+
for rr in review_results:
|
|
2973
|
+
if not isinstance(rr, dict):
|
|
2974
|
+
continue
|
|
2975
|
+
|
|
2976
|
+
# 支持 gid 和 gids 两种格式
|
|
2977
|
+
gids_to_process: List[int] = []
|
|
2978
|
+
if "gids" in rr and isinstance(rr.get("gids"), list):
|
|
2979
|
+
# 合并格式:gids 数组
|
|
2980
|
+
for gid_val in rr.get("gids", []):
|
|
2981
|
+
try:
|
|
2982
|
+
gid_int = int(gid_val)
|
|
2983
|
+
if gid_int >= 1:
|
|
2984
|
+
gids_to_process.append(gid_int)
|
|
2985
|
+
except Exception:
|
|
2986
|
+
pass
|
|
2987
|
+
elif "gid" in rr:
|
|
2988
|
+
# 单个格式:gid
|
|
2989
|
+
try:
|
|
2990
|
+
gid_int = int(rr.get("gid", 0))
|
|
2991
|
+
if gid_int >= 1:
|
|
2992
|
+
gids_to_process.append(gid_int)
|
|
2993
|
+
except Exception:
|
|
2994
|
+
pass
|
|
2995
|
+
|
|
2996
|
+
# 为每个 gid 创建复核结果映射
|
|
2997
|
+
is_reason_sufficient = rr.get("is_reason_sufficient")
|
|
2998
|
+
review_notes = str(rr.get("review_notes", "")).strip()
|
|
2999
|
+
for gid in gids_to_process:
|
|
3000
|
+
gid_to_review[gid] = {
|
|
3001
|
+
"is_reason_sufficient": is_reason_sufficient,
|
|
3002
|
+
"review_notes": review_notes
|
|
3003
|
+
}
|
|
3004
|
+
return gid_to_review
|
|
3005
|
+
|
|
3006
|
+
|
|
3007
|
+
def _process_review_batch(
    review_batch: List[Dict],
    review_results: Optional[List[Dict]],
    reviewed_clusters: List[Dict],
    reinstated_candidates: List[Dict],
) -> None:
    """Apply the review agent's verdicts to one batch of invalid clusters.

    Without usable ``review_results`` every cluster of the batch is
    reinstated. Otherwise a cluster is reinstated as soon as one of its gids
    has a review entry whose ``is_reason_sufficient`` is not ``True``; its
    members are appended to ``reinstated_candidates``. Every other cluster is
    recorded as ``confirmed_invalid``. Each cluster is appended to
    ``reviewed_clusters`` together with its ``review_result`` and
    ``review_notes``.
    """
    if not review_results:
        # The review output could not be parsed: conservatively send every
        # candidate of the batch back into verification.
        typer.secho("[jarvis-sec] 警告:复核结果解析失败,保守策略:将批次中的所有候选重新加入验证流程", fg=typer.colors.YELLOW)
        for cluster in review_batch:
            reinstated_candidates.extend(cluster.get("members", []))
            reviewed_clusters.append({
                **cluster,
                "review_result": "reinstated",
                "review_notes": "复核结果解析失败,保守策略重新加入验证",
            })
        return

    verdict_by_gid = _build_gid_to_review_mapping(review_results)

    for cluster in review_batch:
        cluster_gids = cluster.get("gids", [])
        cluster_members = cluster.get("members", [])

        # First verdict in this cluster that does not confirm the reason.
        rejecting = next(
            (
                verdict
                for verdict in (verdict_by_gid.get(gid) for gid in cluster_gids)
                if verdict and verdict.get("is_reason_sufficient") is not True
            ),
            None,
        )

        if rejecting is not None:
            # Reason judged insufficient: the members return to verification.
            typer.secho(f"[jarvis-sec] 复核结果:无效聚类(gids={cluster_gids})理由不充分,重新加入验证流程", fg=typer.colors.BLUE)
            reinstated_candidates.extend(cluster_members)
            reviewed_clusters.append({
                **cluster,
                "review_result": "reinstated",
                "review_notes": rejecting.get("review_notes", ""),
            })
            continue

        # NOTE(review): a cluster for which no gid has any review entry also
        # reaches this point and is confirmed invalid - confirm that this is
        # intended given the conservative policy used elsewhere.
        notes = ""
        if cluster_gids and verdict_by_gid.get(cluster_gids[0]):
            notes = verdict_by_gid[cluster_gids[0]].get("review_notes", "")
        typer.secho(f"[jarvis-sec] 复核结果:无效聚类(gids={cluster_gids})理由充分,确认为无效", fg=typer.colors.GREEN)
        reviewed_clusters.append({
            **cluster,
            "review_result": "confirmed_invalid",
            "review_notes": notes,
        })
|
|
3070
|
+
|
|
3071
|
+
|
|
3072
|
+
def _run_review_agent_with_retry(
    review_agent,
    review_task: str,
    review_summary_prompt: str,
    entry_path: str,
    review_summary_container: Dict[str, str],
) -> tuple[Optional[List[Dict]], Optional[str]]:
    """Run the review agent and retry indefinitely until its report is well-formed.

    The first attempt calls ``review_agent.run(review_task)``. Once an attempt
    fails format validation, every later attempt calls
    ``review_agent.model.chat_until_success`` directly and, when a parse error
    is known, appends it to the prompt so the model can repair its output.

    Args:
        review_agent: Object exposing ``run()`` and ``model.chat_until_success()``.
        review_task: Task text given to the agent and prepended to direct prompts.
        review_summary_prompt: Output-format instructions for the review report,
            used for direct model retries. When empty, the function falls back
            to ``_build_verification_summary_prompt()``.
        entry_path: Path handed to ``_git_restore_if_dirty`` after each attempt.
        review_summary_container: Mutable dict whose ``"text"`` entry carries the
            raw summary. It is reset to ``""`` before each attempt; presumably
            the agent's summary hook fills it during ``run()`` (not visible
            here -- confirm at the call site).

    Returns:
        ``(review_items, None)`` once the report parses to a non-empty list whose
        items all pass ``_is_valid_review_item``. There is no other exit from
        the loop.
    """

    def _notify(message: str, color_name: str) -> None:
        """Print a status line; console failures must never break the retry loop."""
        try:
            typer.secho(message, fg=getattr(typer.colors, color_name))
        except Exception:
            pass

    use_direct_model_review = False
    prev_parse_error_review: Optional[str] = None
    review_attempt = 0

    while True:
        review_attempt += 1
        review_summary_container["text"] = ""

        if use_direct_model_review:
            # A previous attempt failed validation: bypass the agent loop and
            # ask the model directly. Use the review-specific format prompt
            # supplied by the caller; previously this parameter was ignored
            # and the verification prompt was always sent instead.
            review_summary_prompt_text = (
                review_summary_prompt or _build_verification_summary_prompt()
            )
            error_guidance = ""
            if prev_parse_error_review:
                error_guidance = f"\n\n**格式错误详情(请根据以下错误修复输出格式):**\n- YAML解析失败: {prev_parse_error_review}\n\n请确保输出的YAML格式正确,包括正确的缩进、引号、冒号等。"

            full_review_prompt = f"{review_task}{error_guidance}\n\n{review_summary_prompt_text}"
            try:
                review_response = review_agent.model.chat_until_success(full_review_prompt)  # type: ignore
                review_summary_container["text"] = review_response
            except Exception as e:
                _notify(f"[jarvis-sec] 复核阶段直接模型调用失败: {e},回退到 run()", "YELLOW")
                review_agent.run(review_task)
        else:
            # First attempt: let the agent run in full (it may use tools).
            review_agent.run(review_task)

        # Workspace protection: restore anything the agent modified. A failure
        # here is deliberately ignored so it cannot abort the review.
        try:
            _changed_review = _git_restore_if_dirty(entry_path)
        except Exception:
            _changed_review = None
        if _changed_review:
            _notify(f"[jarvis-sec] 复核Agent工作区已恢复 ({_changed_review} 个文件)", "BLUE")

        # Parse and validate the review report.
        review_summary_text = review_summary_container.get("text", "")
        parse_error_review = None
        if review_summary_text:
            review_parsed, parse_error_review = _try_parse_summary_report(review_summary_text)
            if parse_error_review:
                prev_parse_error_review = parse_error_review
                _notify(f"[jarvis-sec] 复核结果YAML解析失败: {parse_error_review}", "YELLOW")
            else:
                prev_parse_error_review = None
                if (
                    isinstance(review_parsed, list)
                    and review_parsed
                    and all(_is_valid_review_item(item) for item in review_parsed)
                ):
                    return review_parsed, None

        # Validation failed: all further retries use the direct model call.
        use_direct_model_review = True
        if parse_error_review:
            _notify(f"[jarvis-sec] 复核结果YAML解析失败 -> 重试第 {review_attempt} 次(使用直接模型调用,将反馈解析错误)", "YELLOW")
        else:
            _notify(f"[jarvis-sec] 复核结果格式无效 -> 重试第 {review_attempt} 次(使用直接模型调用)", "YELLOW")
|
|
3149
|
+
|
|
3150
|
+
|
|
3151
|
+
def _check_and_supplement_missing_gids(
    file_groups: Dict[str, List[Dict]],
    cluster_batches: List[List[Dict]],
    invalid_clusters_for_review: List[Dict],
    sec_dir,
    _progress_append,
) -> None:
    """Detect candidate gids that no cluster covers and supplement them.

    Compares the gids collected from ``file_groups`` with the gids collected
    from ``cluster_batches`` and ``invalid_clusters_for_review``. When some are
    missing, a warning is printed and the missing set is handed to
    ``_supplement_missing_gids_for_clustering`` together with the gids loaded
    from ``sec_dir`` by ``_load_processed_gids_from_agent_issues``.
    """
    candidate_gids = _collect_candidate_gids(file_groups)

    # Index candidates by gid. Entries whose gid is absent, not convertible to
    # int, or below 1 are ignored.
    candidate_by_gid: Dict[int, Dict] = {}
    for group_items in file_groups.values():
        for candidate in group_items:
            try:
                gid_value = int(candidate.get("gid", 0))
            except Exception:
                continue
            if gid_value >= 1:
                candidate_by_gid[gid_value] = candidate

    clustered_gids = _collect_clustered_gids(cluster_batches, invalid_clusters_for_review)
    already_processed_gids = _load_processed_gids_from_agent_issues(sec_dir)

    missing_gids = candidate_gids - clustered_gids
    if not missing_gids:
        return

    total_missing = len(missing_gids)
    ordered_missing = sorted(missing_gids)
    if total_missing > 50:
        # Too many to list in full: show only the first and last ten.
        preview = ordered_missing[:10] + ["..."] + ordered_missing[-10:]
        typer.secho(f"[jarvis-sec] 警告:分析阶段开始前发现遗漏的gid(共{total_missing}个):{preview},将检查是否需要补充聚类", fg=typer.colors.YELLOW)
    else:
        typer.secho(f"[jarvis-sec] 警告:分析阶段开始前发现遗漏的gid {ordered_missing},将检查是否需要补充聚类", fg=typer.colors.YELLOW)

    added_total, skipped_total = _supplement_missing_gids_for_clustering(
        missing_gids,
        candidate_by_gid,
        cluster_batches,
        _progress_append,
        already_processed_gids,
    )

    # Summary output is best-effort; printing errors are ignored.
    if skipped_total > 0:
        try:
            typer.secho(f"[jarvis-sec] 已跳过 {skipped_total} 个已在agent_issues.jsonl中处理的gid", fg=typer.colors.GREEN)
        except Exception:
            pass
    if added_total > 0:
        try:
            typer.secho(f"[jarvis-sec] 已为 {added_total} 个遗漏的gid创建单独聚类", fg=typer.colors.GREEN)
        except Exception:
            pass
|
|
3209
|
+
|
|
3210
|
+
|
|
3211
|
+
def _initialize_clustering_context(
    compact_candidates: List[Dict],
    sec_dir,
    progress_path,
    _progress_append,
) -> tuple[Dict[str, List[Dict]], Dict, tuple, List[List[Dict]], List[Dict], List[Dict], set]:
    """Assemble the state needed by the clustering phase.

    Returns a 7-tuple, in order: candidates grouped by file, the existing
    clusters loaded for checkpoint resume, the pair
    ``(batch snapshot writer, report snapshot writer)``, the restored cluster
    batches, the restored cluster records, the restored invalid clusters
    awaiting review, and the restored clustered gids.
    """
    grouped_by_file = _group_candidates_by_file(compact_candidates)

    # Only the cluster mapping is used here; the second value is discarded.
    existing_clusters, _ = _load_existing_clusters(sec_dir, progress_path)

    # NOTE(review): the writer factory is given this fresh empty list, not the
    # records restored below, so the two are distinct objects. Confirm the
    # snapshot writers do not expect to share the restored record list.
    writer_records: List[Dict] = []
    write_batch_snapshot, write_report_snapshot = _create_cluster_snapshot_writer(
        sec_dir, writer_records, compact_candidates, _progress_append
    )

    # Resume from the checkpoint.
    restored = _restore_clusters_from_checkpoint(existing_clusters, grouped_by_file)
    cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids = restored

    return (
        grouped_by_file,
        existing_clusters,
        (write_batch_snapshot, write_report_snapshot),
        cluster_batches,
        cluster_records,
        invalid_clusters_for_review,
        clustered_gids,
    )
|
|
3249
|
+
|
|
3250
|
+
|
|
3251
|
+
def _check_unclustered_gids(
|
|
3252
|
+
all_candidate_gids: set,
|
|
3253
|
+
clustered_gids: set,
|
|
3254
|
+
) -> set:
|
|
3255
|
+
"""检查未聚类的gid"""
|
|
3256
|
+
unclustered_gids = all_candidate_gids - clustered_gids
|
|
3257
|
+
if unclustered_gids:
|
|
3258
|
+
try:
|
|
3259
|
+
typer.secho(f"[jarvis-sec] 发现 {len(unclustered_gids)} 个未聚类的 gid,将进行聚类", fg=typer.colors.YELLOW)
|
|
3260
|
+
except Exception:
|
|
3261
|
+
pass
|
|
3262
|
+
else:
|
|
3263
|
+
try:
|
|
3264
|
+
typer.secho(f"[jarvis-sec] 所有 {len(all_candidate_gids)} 个候选已聚类,跳过聚类阶段", fg=typer.colors.GREEN)
|
|
3265
|
+
except Exception:
|
|
3266
|
+
pass
|
|
3267
|
+
return unclustered_gids
|
|
3268
|
+
|
|
3269
|
+
|
|
3270
|
+
def _execute_clustering_for_files(
|
|
3271
|
+
file_groups: Dict[str, List[Dict]],
|
|
3272
|
+
clustered_gids: set,
|
|
3273
|
+
cluster_batches: List[List[Dict]],
|
|
3274
|
+
cluster_records: List[Dict],
|
|
3275
|
+
invalid_clusters_for_review: List[Dict],
|
|
3276
|
+
entry_path: str,
|
|
3277
|
+
langs: List[str],
|
|
3278
|
+
cluster_limit: int,
|
|
3279
|
+
llm_group: Optional[str],
|
|
3280
|
+
status_mgr,
|
|
3281
|
+
_progress_append,
|
|
3282
|
+
_write_cluster_batch_snapshot,
|
|
3283
|
+
) -> None:
|
|
3284
|
+
"""执行文件聚类"""
|
|
3285
|
+
total_files_to_cluster = len(file_groups)
|
|
3286
|
+
# 更新聚类阶段状态
|
|
3287
|
+
if total_files_to_cluster > 0:
|
|
3288
|
+
status_mgr.update_clustering(
|
|
3289
|
+
current_file=0,
|
|
3290
|
+
total_files=total_files_to_cluster,
|
|
3291
|
+
message="开始聚类分析..."
|
|
3292
|
+
)
|
|
3293
|
+
for _file_idx, (_file, _items) in enumerate(file_groups.items(), start=1):
|
|
3294
|
+
typer.secho(f"\n[jarvis-sec] 聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}", fg=typer.colors.CYAN)
|
|
3295
|
+
# 更新当前文件进度
|
|
3296
|
+
status_mgr.update_clustering(
|
|
3297
|
+
current_file=_file_idx,
|
|
3298
|
+
total_files=total_files_to_cluster,
|
|
3299
|
+
file_name=_file,
|
|
3300
|
+
message=f"正在聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}"
|
|
3301
|
+
)
|
|
3302
|
+
# 使用子函数处理文件聚类
|
|
3303
|
+
_process_file_clustering(
|
|
3304
|
+
_file,
|
|
3305
|
+
_items,
|
|
3306
|
+
clustered_gids,
|
|
3307
|
+
cluster_batches,
|
|
3308
|
+
cluster_records,
|
|
3309
|
+
invalid_clusters_for_review,
|
|
3310
|
+
entry_path,
|
|
3311
|
+
langs,
|
|
3312
|
+
cluster_limit,
|
|
3313
|
+
llm_group,
|
|
3314
|
+
_progress_append,
|
|
3315
|
+
_write_cluster_batch_snapshot,
|
|
3316
|
+
)
|
|
3317
|
+
|
|
3318
|
+
|
|
3319
|
+
def _record_clustering_completion(
|
|
3320
|
+
sec_dir,
|
|
3321
|
+
cluster_records: List[Dict],
|
|
3322
|
+
compact_candidates: List[Dict],
|
|
3323
|
+
_progress_append,
|
|
3324
|
+
) -> None:
|
|
3325
|
+
"""记录聚类阶段完成"""
|
|
3326
|
+
try:
|
|
3327
|
+
from pathlib import Path
|
|
3328
|
+
import json
|
|
3329
|
+
_cluster_path = sec_dir / "cluster_report.jsonl"
|
|
3330
|
+
_progress_append({
|
|
3331
|
+
"event": "cluster_report_written",
|
|
3332
|
+
"path": str(_cluster_path),
|
|
3333
|
+
"clusters": len(cluster_records),
|
|
3334
|
+
"total_candidates": len(compact_candidates),
|
|
3335
|
+
"note": "每个批次已增量保存,无需重写整个文件",
|
|
3336
|
+
})
|
|
3337
|
+
except Exception:
|
|
3338
|
+
pass
|
|
3339
|
+
|
|
3340
|
+
|
|
3341
|
+
def _fallback_to_file_based_batches(
    file_groups: Dict[str, List[Dict]],
    existing_clusters: Dict,
) -> List[List[Dict]]:
    """Build per-file batches from candidates that no valid cluster covers.

    Used when clustering produced nothing: every candidate gid that does not
    appear in a non-invalid record of ``existing_clusters`` is grouped by its
    ``"file"`` value, and each group becomes one batch.

    Args:
        file_groups: Mapping of file name to candidate dicts (each with a ``gid``).
        existing_clusters: Mapping whose values are lists of cluster records;
            records flagged ``is_invalid`` do not count as clustered.

    Returns:
        A list of batches. Gids are processed in ascending order so the batch
        order and the item order inside each batch are reproducible across
        runs (batch position is later used as the batch identifier).
    """
    from collections import defaultdict

    all_gids_in_file_groups = _collect_candidate_gids(file_groups)
    if not all_gids_in_file_groups:
        return []

    # Index candidates by gid; malformed entries (gid not convertible to int,
    # or below 1) are ignored on purpose.
    gid_to_item: Dict[int, Dict] = {}
    for items in file_groups.values():
        for candidate in items:
            try:
                gid = int(candidate.get("gid", 0))
            except Exception:
                continue
            if gid >= 1:
                gid_to_item[gid] = candidate

    # Gids already covered by a valid cluster record.
    clustered_gids = set()
    for cluster_recs in existing_clusters.values():
        for rec in cluster_recs:
            if rec.get("is_invalid", False):
                continue
            for raw_gid in rec.get("gids", []):
                try:
                    gid = int(raw_gid)
                except Exception:
                    continue
                if gid >= 1:
                    clustered_gids.add(gid)

    unclustered_by_file: Dict[str, List[Dict]] = defaultdict(list)
    # sorted(): plain set iteration order is hash-dependent, which would make
    # the resulting batch numbering unstable.
    for gid in sorted(all_gids_in_file_groups - clustered_gids):
        item = gid_to_item.get(gid)
        if item:
            unclustered_by_file[str(item.get("file") or "")].append(item)

    return [items for items in unclustered_by_file.values() if items]
|
|
3394
|
+
|
|
3395
|
+
|
|
3396
|
+
def _process_clustering_phase(
    compact_candidates: List[Dict],
    entry_path: str,
    langs: List[str],
    cluster_limit: int,
    llm_group: Optional[str],
    sec_dir,
    progress_path,
    status_mgr,
    _progress_append,
) -> tuple[List[List[Dict]], List[Dict]]:
    """Drive the clustering phase and return ``(cluster_batches, invalid_clusters_for_review)``.

    Steps, in order: initialise the context (including checkpoint restore),
    cluster whatever gids are still unclustered, record completion, run the
    review phase over the invalid clusters, fall back to per-file batches when
    no batch exists, and finally supplement any gid that is still missing.
    """
    context = _initialize_clustering_context(
        compact_candidates, sec_dir, progress_path, _progress_append
    )
    (
        grouped_by_file,
        existing_clusters,
        (write_batch_snapshot, _report_snapshot_writer),
        cluster_batches,
        cluster_records,
        invalid_clusters_for_review,
        clustered_gids,
    ) = context

    # Work out which candidate gids still need clustering.
    pending_gids = _check_unclustered_gids(
        _collect_candidate_gids(grouped_by_file), clustered_gids
    )
    if pending_gids:
        _execute_clustering_for_files(
            file_groups=grouped_by_file,
            clustered_gids=clustered_gids,
            cluster_batches=cluster_batches,
            cluster_records=cluster_records,
            invalid_clusters_for_review=invalid_clusters_for_review,
            entry_path=entry_path,
            langs=langs,
            cluster_limit=cluster_limit,
            llm_group=llm_group,
            status_mgr=status_mgr,
            _progress_append=_progress_append,
            _write_cluster_batch_snapshot=write_batch_snapshot,
        )

    # Log that the clustering phase finished.
    _record_clustering_completion(sec_dir, cluster_records, compact_candidates, _progress_append)

    # Review agent: re-check every cluster that was marked invalid.
    cluster_batches = _process_review_phase(
        invalid_clusters_for_review,
        entry_path,
        langs,
        llm_group,
        status_mgr,
        _progress_append,
        cluster_batches,
    )

    # Clustering failed or produced nothing: process file by file instead.
    if not cluster_batches:
        cluster_batches.extend(
            _fallback_to_file_based_batches(grouped_by_file, existing_clusters)
        )

    # Integrity check: every candidate gid must end up in some cluster.
    _check_and_supplement_missing_gids(
        grouped_by_file,
        cluster_batches,
        invalid_clusters_for_review,
        sec_dir,
        _progress_append,
    )

    return cluster_batches, invalid_clusters_for_review
|
|
3471
|
+
|
|
3472
|
+
|
|
3473
|
+
def _process_verification_phase(
    cluster_batches: List[List[Dict]],
    entry_path: str,
    langs: List[str],
    llm_group: Optional[str],
    sec_dir,
    progress_path,
    status_mgr,
    _progress_append,
    _append_report,
) -> List[Dict]:
    """Verify every cluster batch and return the issues loaded from ``sec_dir``.

    A batch is skipped when its id (``JARVIS-SEC-Batch-<n>``) is among the
    completed batch ids read from ``progress_path``, or when all of its gids
    are among the processed gids read from ``sec_dir``. Every other batch is
    handed to ``_process_verification_batch``.
    """
    batch_total = len(cluster_batches)

    # Checkpoint data: gids already processed and batches already completed.
    processed_gids_from_issues = _load_processed_gids_from_issues(sec_dir)
    completed_batch_ids = _load_completed_batch_ids(progress_path)

    if completed_batch_ids:
        try:
            typer.secho(f"[jarvis-sec] 断点恢复:从 progress.jsonl 读取到 {len(completed_batch_ids)} 个已完成的批次", fg=typer.colors.BLUE)
        except Exception:
            pass

    # Announce the start of the verification phase.
    if batch_total > 0:
        status_mgr.update_verification(
            current_batch=0,
            total_batches=batch_total,
            message="开始安全验证...",
        )

    meta_records: List[Dict] = []
    gid_counts: Dict[int, int] = {}

    for batch_no, batch in enumerate(cluster_batches, start=1):
        task_id = f"JARVIS-SEC-Batch-{batch_no}"
        batch_file = batch[0].get("file") if batch else None

        # Primary check: the batch id is recorded as completed.
        already_done = task_id in completed_batch_ids
        if not already_done:
            # Secondary check: every gid of the batch was already processed.
            gids_in_batch = set()
            for entry in batch:
                try:
                    gid_value = int(entry.get("gid", 0))
                except Exception:
                    continue
                if gid_value >= 1:
                    gids_in_batch.add(gid_value)
            already_done = bool(
                gids_in_batch
                and processed_gids_from_issues
                and gids_in_batch.issubset(processed_gids_from_issues)
            )

        if not already_done:
            _process_verification_batch(
                batch,
                batch_no,
                batch_total,
                entry_path,
                langs,
                llm_group,
                status_mgr,
                _progress_append,
                _append_report,
                meta_records,
                gid_counts,
                sec_dir,
            )
            continue

        try:
            typer.secho(f"[jarvis-sec] 跳过批次 {batch_no}/{batch_total}:已在之前的运行中完成", fg=typer.colors.GREEN)
        except Exception:
            pass
        # Advance the progress display without doing any work.
        status_mgr.update_verification(
            current_batch=batch_no,
            total_batches=batch_total,
            batch_id=task_id,
            file_name=batch_file,
            message=f"跳过已完成的批次 {batch_no}/{batch_total}",
        )

    # The saved issues file is the single source of truth for the result.
    return _load_all_issues_from_file(sec_dir)
|
|
3569
|
+
|
|
3570
|
+
|
|
3571
|
+
def _try_parse_summary_report(text: str) -> tuple[Optional[object], Optional[str]]:
|
|
3572
|
+
"""
|
|
3573
|
+
从摘要文本中提取 <REPORT>...</REPORT> 内容,并解析为对象(dict 或 list,仅支持 YAML)。
|
|
3574
|
+
返回(解析结果, 错误信息)
|
|
3575
|
+
如果解析成功,返回(data, None)
|
|
3576
|
+
如果解析失败,返回(None, 错误信息)
|
|
3577
|
+
"""
|
|
3578
|
+
start = text.find("<REPORT>")
|
|
3579
|
+
end = text.find("</REPORT>")
|
|
3580
|
+
if start == -1 or end == -1 or end <= start:
|
|
3581
|
+
return None, "未找到 <REPORT> 或 </REPORT> 标签,或标签顺序错误"
|
|
3582
|
+
content = text[start + len("<REPORT>"):end].strip()
|
|
3583
|
+
if not content:
|
|
3584
|
+
return None, "YAML 内容为空"
|
|
3585
|
+
try:
|
|
3586
|
+
import yaml as _yaml # type: ignore
|
|
3587
|
+
try:
|
|
3588
|
+
data = _yaml.safe_load(content)
|
|
3589
|
+
except Exception as yaml_err:
|
|
3590
|
+
error_msg = f"YAML 解析失败: {str(yaml_err)}"
|
|
3591
|
+
return None, error_msg
|
|
3592
|
+
if isinstance(data, (dict, list)):
|
|
3593
|
+
return data, None
|
|
3594
|
+
return None, f"YAML 解析结果不是字典或数组,而是 {type(data).__name__}"
|
|
3595
|
+
except Exception as e:
|
|
3596
|
+
return None, f"解析过程发生异常: {str(e)}"
|
|
3597
|
+
|
|
3598
|
+
|
|
3599
|
+
# Public API of this module: the names exported by a star-import.
__all__ = [

    "run_security_analysis",

    "direct_scan",
    "run_with_agent",
]
|