jarvis-ai-assistant 0.7.0__py3-none-any.whl → 0.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. jarvis/__init__.py +1 -1
  2. jarvis/jarvis_agent/__init__.py +243 -139
  3. jarvis/jarvis_agent/agent_manager.py +5 -10
  4. jarvis/jarvis_agent/builtin_input_handler.py +2 -6
  5. jarvis/jarvis_agent/config_editor.py +2 -7
  6. jarvis/jarvis_agent/event_bus.py +82 -12
  7. jarvis/jarvis_agent/file_context_handler.py +265 -15
  8. jarvis/jarvis_agent/file_methodology_manager.py +3 -4
  9. jarvis/jarvis_agent/jarvis.py +113 -98
  10. jarvis/jarvis_agent/language_extractors/__init__.py +57 -0
  11. jarvis/jarvis_agent/language_extractors/c_extractor.py +21 -0
  12. jarvis/jarvis_agent/language_extractors/cpp_extractor.py +21 -0
  13. jarvis/jarvis_agent/language_extractors/go_extractor.py +21 -0
  14. jarvis/jarvis_agent/language_extractors/java_extractor.py +84 -0
  15. jarvis/jarvis_agent/language_extractors/javascript_extractor.py +79 -0
  16. jarvis/jarvis_agent/language_extractors/python_extractor.py +21 -0
  17. jarvis/jarvis_agent/language_extractors/rust_extractor.py +21 -0
  18. jarvis/jarvis_agent/language_extractors/typescript_extractor.py +84 -0
  19. jarvis/jarvis_agent/language_support_info.py +486 -0
  20. jarvis/jarvis_agent/main.py +6 -12
  21. jarvis/jarvis_agent/memory_manager.py +7 -16
  22. jarvis/jarvis_agent/methodology_share_manager.py +10 -16
  23. jarvis/jarvis_agent/prompt_manager.py +1 -1
  24. jarvis/jarvis_agent/prompts.py +193 -171
  25. jarvis/jarvis_agent/protocols.py +8 -12
  26. jarvis/jarvis_agent/run_loop.py +77 -14
  27. jarvis/jarvis_agent/session_manager.py +2 -3
  28. jarvis/jarvis_agent/share_manager.py +12 -21
  29. jarvis/jarvis_agent/shell_input_handler.py +1 -2
  30. jarvis/jarvis_agent/task_analyzer.py +26 -4
  31. jarvis/jarvis_agent/task_manager.py +11 -27
  32. jarvis/jarvis_agent/tool_executor.py +2 -3
  33. jarvis/jarvis_agent/tool_share_manager.py +12 -24
  34. jarvis/jarvis_agent/web_server.py +55 -20
  35. jarvis/jarvis_c2rust/__init__.py +5 -5
  36. jarvis/jarvis_c2rust/cli.py +461 -499
  37. jarvis/jarvis_c2rust/collector.py +45 -53
  38. jarvis/jarvis_c2rust/constants.py +26 -0
  39. jarvis/jarvis_c2rust/library_replacer.py +264 -132
  40. jarvis/jarvis_c2rust/llm_module_agent.py +162 -190
  41. jarvis/jarvis_c2rust/loaders.py +207 -0
  42. jarvis/jarvis_c2rust/models.py +28 -0
  43. jarvis/jarvis_c2rust/optimizer.py +1592 -395
  44. jarvis/jarvis_c2rust/transpiler.py +1722 -1064
  45. jarvis/jarvis_c2rust/utils.py +385 -0
  46. jarvis/jarvis_code_agent/build_validation_config.py +2 -3
  47. jarvis/jarvis_code_agent/code_agent.py +394 -320
  48. jarvis/jarvis_code_agent/code_analyzer/__init__.py +3 -0
  49. jarvis/jarvis_code_agent/code_analyzer/build_validator/base.py +4 -0
  50. jarvis/jarvis_code_agent/code_analyzer/build_validator/cmake.py +17 -2
  51. jarvis/jarvis_code_agent/code_analyzer/build_validator/fallback.py +3 -0
  52. jarvis/jarvis_code_agent/code_analyzer/build_validator/go.py +36 -4
  53. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_gradle.py +9 -0
  54. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_maven.py +9 -0
  55. jarvis/jarvis_code_agent/code_analyzer/build_validator/makefile.py +12 -1
  56. jarvis/jarvis_code_agent/code_analyzer/build_validator/nodejs.py +22 -5
  57. jarvis/jarvis_code_agent/code_analyzer/build_validator/python.py +57 -32
  58. jarvis/jarvis_code_agent/code_analyzer/build_validator/rust.py +62 -6
  59. jarvis/jarvis_code_agent/code_analyzer/build_validator/validator.py +8 -9
  60. jarvis/jarvis_code_agent/code_analyzer/context_manager.py +290 -5
  61. jarvis/jarvis_code_agent/code_analyzer/language_support.py +21 -0
  62. jarvis/jarvis_code_agent/code_analyzer/languages/__init__.py +21 -3
  63. jarvis/jarvis_code_agent/code_analyzer/languages/c_cpp_language.py +72 -4
  64. jarvis/jarvis_code_agent/code_analyzer/languages/go_language.py +35 -3
  65. jarvis/jarvis_code_agent/code_analyzer/languages/java_language.py +212 -0
  66. jarvis/jarvis_code_agent/code_analyzer/languages/javascript_language.py +254 -0
  67. jarvis/jarvis_code_agent/code_analyzer/languages/python_language.py +52 -2
  68. jarvis/jarvis_code_agent/code_analyzer/languages/rust_language.py +73 -1
  69. jarvis/jarvis_code_agent/code_analyzer/languages/typescript_language.py +280 -0
  70. jarvis/jarvis_code_agent/code_analyzer/llm_context_recommender.py +306 -152
  71. jarvis/jarvis_code_agent/code_analyzer/structured_code.py +556 -0
  72. jarvis/jarvis_code_agent/code_analyzer/symbol_extractor.py +193 -18
  73. jarvis/jarvis_code_agent/code_analyzer/tree_sitter_extractor.py +18 -8
  74. jarvis/jarvis_code_agent/lint.py +258 -27
  75. jarvis/jarvis_code_agent/utils.py +0 -1
  76. jarvis/jarvis_code_analysis/code_review.py +19 -24
  77. jarvis/jarvis_data/config_schema.json +53 -26
  78. jarvis/jarvis_git_squash/main.py +4 -5
  79. jarvis/jarvis_git_utils/git_commiter.py +44 -49
  80. jarvis/jarvis_mcp/sse_mcp_client.py +20 -27
  81. jarvis/jarvis_mcp/stdio_mcp_client.py +11 -12
  82. jarvis/jarvis_mcp/streamable_mcp_client.py +15 -14
  83. jarvis/jarvis_memory_organizer/memory_organizer.py +55 -74
  84. jarvis/jarvis_methodology/main.py +32 -48
  85. jarvis/jarvis_multi_agent/__init__.py +79 -61
  86. jarvis/jarvis_multi_agent/main.py +3 -7
  87. jarvis/jarvis_platform/base.py +469 -199
  88. jarvis/jarvis_platform/human.py +7 -8
  89. jarvis/jarvis_platform/kimi.py +30 -36
  90. jarvis/jarvis_platform/openai.py +65 -27
  91. jarvis/jarvis_platform/registry.py +26 -10
  92. jarvis/jarvis_platform/tongyi.py +24 -25
  93. jarvis/jarvis_platform/yuanbao.py +31 -42
  94. jarvis/jarvis_platform_manager/main.py +66 -77
  95. jarvis/jarvis_platform_manager/service.py +8 -13
  96. jarvis/jarvis_rag/cli.py +49 -51
  97. jarvis/jarvis_rag/embedding_manager.py +13 -18
  98. jarvis/jarvis_rag/llm_interface.py +8 -9
  99. jarvis/jarvis_rag/query_rewriter.py +10 -21
  100. jarvis/jarvis_rag/rag_pipeline.py +24 -27
  101. jarvis/jarvis_rag/reranker.py +4 -5
  102. jarvis/jarvis_rag/retriever.py +28 -30
  103. jarvis/jarvis_sec/__init__.py +220 -3520
  104. jarvis/jarvis_sec/agents.py +143 -0
  105. jarvis/jarvis_sec/analysis.py +276 -0
  106. jarvis/jarvis_sec/cli.py +29 -6
  107. jarvis/jarvis_sec/clustering.py +1439 -0
  108. jarvis/jarvis_sec/file_manager.py +427 -0
  109. jarvis/jarvis_sec/parsers.py +73 -0
  110. jarvis/jarvis_sec/prompts.py +268 -0
  111. jarvis/jarvis_sec/report.py +83 -4
  112. jarvis/jarvis_sec/review.py +453 -0
  113. jarvis/jarvis_sec/utils.py +499 -0
  114. jarvis/jarvis_sec/verification.py +848 -0
  115. jarvis/jarvis_sec/workflow.py +7 -0
  116. jarvis/jarvis_smart_shell/main.py +38 -87
  117. jarvis/jarvis_stats/cli.py +1 -1
  118. jarvis/jarvis_stats/stats.py +7 -7
  119. jarvis/jarvis_stats/storage.py +15 -21
  120. jarvis/jarvis_tools/clear_memory.py +3 -20
  121. jarvis/jarvis_tools/cli/main.py +20 -23
  122. jarvis/jarvis_tools/edit_file.py +1066 -0
  123. jarvis/jarvis_tools/execute_script.py +42 -21
  124. jarvis/jarvis_tools/file_analyzer.py +6 -9
  125. jarvis/jarvis_tools/generate_new_tool.py +11 -20
  126. jarvis/jarvis_tools/lsp_client.py +1552 -0
  127. jarvis/jarvis_tools/methodology.py +2 -3
  128. jarvis/jarvis_tools/read_code.py +1525 -87
  129. jarvis/jarvis_tools/read_symbols.py +2 -3
  130. jarvis/jarvis_tools/read_webpage.py +7 -10
  131. jarvis/jarvis_tools/registry.py +370 -181
  132. jarvis/jarvis_tools/retrieve_memory.py +20 -19
  133. jarvis/jarvis_tools/rewrite_file.py +105 -0
  134. jarvis/jarvis_tools/save_memory.py +3 -15
  135. jarvis/jarvis_tools/search_web.py +3 -7
  136. jarvis/jarvis_tools/sub_agent.py +17 -6
  137. jarvis/jarvis_tools/sub_code_agent.py +14 -16
  138. jarvis/jarvis_tools/virtual_tty.py +54 -32
  139. jarvis/jarvis_utils/clipboard.py +7 -10
  140. jarvis/jarvis_utils/config.py +98 -63
  141. jarvis/jarvis_utils/embedding.py +5 -5
  142. jarvis/jarvis_utils/fzf.py +8 -8
  143. jarvis/jarvis_utils/git_utils.py +81 -67
  144. jarvis/jarvis_utils/input.py +24 -49
  145. jarvis/jarvis_utils/jsonnet_compat.py +465 -0
  146. jarvis/jarvis_utils/methodology.py +33 -35
  147. jarvis/jarvis_utils/utils.py +245 -202
  148. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.6.dist-info}/METADATA +205 -70
  149. jarvis_ai_assistant-0.7.6.dist-info/RECORD +218 -0
  150. jarvis/jarvis_agent/edit_file_handler.py +0 -584
  151. jarvis/jarvis_agent/rewrite_file_handler.py +0 -141
  152. jarvis/jarvis_agent/task_planner.py +0 -496
  153. jarvis/jarvis_platform/ai8.py +0 -332
  154. jarvis/jarvis_tools/ask_user.py +0 -54
  155. jarvis_ai_assistant-0.7.0.dist-info/RECORD +0 -192
  156. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.6.dist-info}/WHEEL +0 -0
  157. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.6.dist-info}/entry_points.txt +0 -0
  158. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.6.dist-info}/licenses/LICENSE +0 -0
  159. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.6.dist-info}/top_level.txt +0 -0
@@ -15,3585 +15,285 @@ Jarvis 安全分析套件
15
15
  - workflow.direct_scan(entry_path, ...):仅启发式直扫
16
16
 
17
17
  说明:
18
- - 已移除 MultiAgent 编排与相关提示词;不存在“阶段一”等表述
18
+ - 已移除 MultiAgent 编排与相关提示词;不存在"阶段一"等表述
19
+ - 模块化重构:将功能拆分为多个模块(prompts, parsers, utils, agents, clustering, analysis, verification, review)
19
20
  """
20
21
 
21
22
  from typing import Dict, List, Optional
22
23
 
23
24
  import typer
24
25
 
25
- from jarvis.jarvis_agent import Agent
26
+ from jarvis.jarvis_agent import Agent # noqa: F401
26
27
  from jarvis.jarvis_sec.workflow import direct_scan, run_with_agent
27
- from jarvis.jarvis_tools.registry import ToolRegistry
28
28
 
29
+ # 导入模块化后的函数(用于触发模块加载)
30
+ from jarvis.jarvis_sec.prompts import ( # noqa: F401
31
+ build_summary_prompt as _build_summary_prompt,
32
+ build_verification_summary_prompt as _build_verification_summary_prompt,
33
+ get_review_system_prompt as _get_review_system_prompt,
34
+ get_review_summary_prompt as _get_review_summary_prompt,
35
+ get_cluster_system_prompt as _get_cluster_system_prompt,
36
+ get_cluster_summary_prompt as _get_cluster_summary_prompt,
37
+ )
38
+ from jarvis.jarvis_sec.parsers import ( # noqa: F401
39
+ parse_clusters_from_text as _parse_clusters_from_text,
40
+ try_parse_summary_report as _try_parse_summary_report,
41
+ )
42
+ from jarvis.jarvis_sec.utils import ( # noqa: F401
43
+ git_restore_if_dirty as _git_restore_if_dirty,
44
+ get_sec_dir as _get_sec_dir,
45
+ initialize_analysis_context as _initialize_analysis_context,
46
+ load_or_run_heuristic_scan as _load_or_run_heuristic_scan,
47
+ compact_candidate as _compact_candidate,
48
+ prepare_candidates as _prepare_candidates,
49
+ group_candidates_by_file as _group_candidates_by_file,
50
+ create_report_writer as _create_report_writer,
51
+ sig_of as _sig_of,
52
+ load_processed_gids_from_issues as _load_processed_gids_from_issues,
53
+ count_issues_from_file as _count_issues_from_file,
54
+ load_all_issues_from_file as _load_all_issues_from_file,
55
+ load_processed_gids_from_agent_issues as _load_processed_gids_from_agent_issues,
56
+ )
57
+ from jarvis.jarvis_sec.agents import ( # noqa: F401
58
+ subscribe_summary_event as _subscribe_summary_event,
59
+ create_analysis_agent as _create_analysis_agent,
60
+ create_review_agent as _create_review_agent,
61
+ create_cluster_agent as _create_cluster_agent,
62
+ )
63
+ from jarvis.jarvis_sec.clustering import ( # noqa: F401
64
+ load_existing_clusters as _load_existing_clusters,
65
+ restore_clusters_from_checkpoint as _restore_clusters_from_checkpoint,
66
+ create_cluster_snapshot_writer as _create_cluster_snapshot_writer,
67
+ collect_candidate_gids as _collect_candidate_gids,
68
+ collect_clustered_gids as _collect_clustered_gids,
69
+ # supplement_missing_gids_for_clustering已移除,不再需要
70
+ handle_single_alert_file as _handle_single_alert_file,
71
+ validate_cluster_format as _validate_cluster_format,
72
+ extract_classified_gids as _extract_classified_gids,
73
+ build_cluster_retry_task as _build_cluster_retry_task,
74
+ build_cluster_error_guidance as _build_cluster_error_guidance,
75
+ run_cluster_agent_direct_model as _run_cluster_agent_direct_model,
76
+ validate_cluster_result as _validate_cluster_result,
77
+ check_cluster_completeness as _check_cluster_completeness,
78
+ run_cluster_agent_with_retry as _run_cluster_agent_with_retry,
79
+ process_cluster_results as _process_cluster_results,
80
+ supplement_missing_gids as _supplement_missing_gids,
81
+ build_cluster_task as _build_cluster_task,
82
+ extract_input_gids as _extract_input_gids,
83
+ build_gid_to_item_mapping as _build_gid_to_item_mapping,
84
+ process_cluster_chunk as _process_cluster_chunk,
85
+ filter_pending_items as _filter_pending_items,
86
+ process_file_clustering as _process_file_clustering,
87
+ # check_and_supplement_missing_gids已移除,完整性检查已移至process_clustering_phase中
88
+ initialize_clustering_context as _initialize_clustering_context,
89
+ check_unclustered_gids as _check_unclustered_gids,
90
+ execute_clustering_for_files as _execute_clustering_for_files,
91
+ record_clustering_completion as _record_clustering_completion,
92
+ fallback_to_file_based_batches as _fallback_to_file_based_batches,
93
+ process_clustering_phase as _process_clustering_phase,
94
+ )
95
+ from jarvis.jarvis_sec.review import ( # noqa: F401
96
+ build_review_task as _build_review_task,
97
+ process_review_batch_items as _process_review_batch_items,
98
+ reinstated_candidates_to_cluster_batches as _reinstated_candidates_to_cluster_batches,
99
+ process_review_phase as _process_review_phase,
100
+ build_gid_to_review_mapping as _build_gid_to_review_mapping,
101
+ process_review_batch as _process_review_batch,
102
+ run_review_agent_with_retry as _run_review_agent_with_retry,
103
+ is_valid_review_item as _is_valid_review_item,
104
+ )
105
+ from jarvis.jarvis_sec.analysis import ( # noqa: F401
106
+ build_analysis_task_context as _build_analysis_task_context,
107
+ build_validation_error_guidance as _build_validation_error_guidance,
108
+ run_analysis_agent_with_retry as _run_analysis_agent_with_retry,
109
+ expand_and_filter_analysis_results as _expand_and_filter_analysis_results,
110
+ valid_items as _valid_items,
111
+ )
112
+ from jarvis.jarvis_sec.verification import ( # noqa: F401
113
+ build_gid_to_verification_mapping as _build_gid_to_verification_mapping,
114
+ merge_verified_items as _merge_verified_items,
115
+ merge_verified_items_without_verification as _merge_verified_items_without_verification,
116
+ process_verification_batch as _process_verification_batch,
117
+ is_valid_verification_item as _is_valid_verification_item,
118
+ run_verification_agent_with_retry as _run_verification_agent_with_retry,
119
+ process_verification_phase as _process_verification_phase,
120
+ )
29
121
 
30
- def _build_summary_prompt() -> str:
31
- """
32
- 构建摘要提示词:要求以 <REPORT>...</REPORT> 包裹的 YAML 输出(仅YAML)。
33
- 系统提示词不强制规定主对话输出格式,仅在摘要中给出结构化结果。
34
- """
35
- return """
36
- 请将本轮"安全子任务(单点验证)"的结构化结果仅放入以下标记中,并使用 YAML 数组对象形式输出。
37
- 仅输出全局编号(gid)与详细理由(不含位置信息),gid 为全局唯一的数字编号。
38
-
39
- 示例1:有告警的情况(has_risk: true,单个gid)
40
- <REPORT>
41
- - gid: 1
42
- has_risk: true
43
- preconditions: "输入字符串 src 的长度大于等于 dst 的缓冲区大小"
44
- trigger_path: "调用路径推导:main() -> handle_network_request() -> parse_packet() -> foobar() -> strcpy()。数据流:网络数据包通过 handle_network_request() 接收,传递给 parse_packet() 解析,parse_packet() 未对数据长度进行校验,直接将 src 传递给 foobar(),foobar() 调用 strcpy(dst, src) 时未检查 src 长度,可导致缓冲区溢出。关键调用点:parse_packet() 函数未对输入长度进行校验。"
45
- consequences: "缓冲区溢出,可能引发程序崩溃或任意代码执行"
46
- suggestions: "使用 strncpy_s 或其他安全的字符串复制函数"
47
- </REPORT>
48
-
49
- 示例2:有告警的情况(has_risk: true,多个gid合并,路径和原因一致)
50
- <REPORT>
51
- - gids: [1, 2, 3]
52
- has_risk: true
53
- preconditions: "输入字符串 src 的长度大于等于 dst 的缓冲区大小"
54
- trigger_path: "调用路径推导:main() -> handle_network_request() -> parse_packet() -> foobar() -> strcpy()。数据流:网络数据包通过 handle_network_request() 接收,传递给 parse_packet() 解析,parse_packet() 未对数据长度进行校验,直接将 src 传递给 foobar(),foobar() 调用 strcpy(dst, src) 时未检查 src 长度,可导致缓冲区溢出。关键调用点:parse_packet() 函数未对输入长度进行校验。"
55
- consequences: "缓冲区溢出,可能引发程序崩溃或任意代码执行"
56
- suggestions: "使用 strncpy_s 或其他安全的字符串复制函数"
57
- </REPORT>
58
-
59
- 示例3:误报或无问题(返回空数组)
60
- <REPORT>
61
- []
62
- </REPORT>
63
-
64
- 要求:
65
- - 只能在 <REPORT> 与 </REPORT> 中输出 YAML 数组,且不得出现其他文本。
66
- - 若确认本批次全部为误报或无问题,请返回空数组 []。
67
- - 数组元素为对象,包含字段:
68
- - gid: 整数(全局唯一编号,单个告警时使用)
69
- - gids: 整数数组(全局唯一编号数组,多个告警合并时使用)
70
- - has_risk: 布尔值 (true/false),表示该项是否存在真实安全风险。
71
- - preconditions: 字符串(触发漏洞的前置条件,仅当 has_risk 为 true 时必需)
72
- - trigger_path: 字符串(漏洞的触发路径,必须包含完整的调用路径推导,包括:1) 可控输入的来源;2) 从输入源到缺陷代码的完整调用链(函数调用序列);3) 每个调用点的数据校验情况;4) 触发条件。格式示例:"调用路径推导:函数A() -> 函数B() -> 函数C() -> 缺陷代码。数据流:输入来源 -> 传递路径。关键调用点:函数B()未做校验。",仅当 has_risk 为 true 时必需)
73
- - consequences: 字符串(漏洞被触发后可能导致的后果,仅当 has_risk 为 true 时必需)
74
- - suggestions: 字符串(修复或缓解该漏洞的建议,仅当 has_risk 为 true 时必需)
75
- - **合并格式优化**:如果多个告警(gid)的路径(trigger_path)、原因(preconditions/consequences/suggestions)完全一致,可以使用 gids 数组格式合并输出,减少重复内容。单个告警使用 gid,多个告警合并使用 gids。gid 和 gids 不能同时出现。
76
- - 不要在数组元素中包含 file/line/pattern 等位置信息;写入 jsonl 时系统会结合原始候选信息。
77
- - **关键**:仅当 `has_risk` 为 `true` 时,才会被记录为确认的问题。对于确认是误报的条目,请确保 `has_risk` 为 `false` 或不输出该条目。
78
- - **输出格式**:有告警的条目必须包含所有字段(gid 或 gids, has_risk, preconditions, trigger_path, consequences, suggestions);无告警的条目只需包含 gid 和 has_risk。
79
- - **调用路径推导要求**:trigger_path 字段必须包含完整的调用路径推导,不能省略或简化。必须明确说明从可控输入到缺陷代码的完整调用链,以及每个调用点的校验情况。如果无法推导出完整的调用路径,应该判定为误报(has_risk: false)。
80
- """.strip()
81
-
82
-
83
- def _build_verification_summary_prompt() -> str:
84
- """
85
- 构建验证 Agent 的摘要提示词:验证分析 Agent 给出的结论是否正确。
86
- """
87
- return """
88
- 请将本轮"验证分析结论"的结构化结果仅放入以下标记中,并使用 YAML 数组对象形式输出。
89
- 你需要验证分析 Agent 给出的结论是否正确,包括前置条件、触发路径、后果和建议是否合理。
90
-
91
- 示例1:验证通过(is_valid: true,单个gid)
92
- <REPORT>
93
- - gid: 1
94
- is_valid: true
95
- verification_notes: "分析结论正确,前置条件合理,触发路径清晰,后果评估准确"
96
- </REPORT>
97
-
98
- 示例2:验证通过(is_valid: true,多个gid合并)
99
- <REPORT>
100
- - gids: [1, 2, 3]
101
- is_valid: true
102
- verification_notes: "分析结论正确,前置条件合理,触发路径清晰,后果评估准确"
103
- </REPORT>
104
-
105
- 示例3:验证不通过(is_valid: false)
106
- <REPORT>
107
- - gid: 1
108
- is_valid: false
109
- verification_notes: "前置条件过于宽泛,实际代码中已有输入校验,触发路径不成立"
110
- </REPORT>
111
-
112
- 要求:
113
- - 只能在 <REPORT> 与 </REPORT> 中输出 YAML 数组,且不得出现其他文本。
114
- - 数组元素为对象,包含字段:
115
- - gid: 整数(全局唯一编号,对应分析 Agent 给出的 gid,单个告警时使用)
116
- - gids: 整数数组(全局唯一编号数组,对应分析 Agent 给出的 gids,多个告警合并时使用)
117
- - is_valid: 布尔值 (true/false),表示分析 Agent 的结论是否正确
118
- - verification_notes: 字符串(验证说明,解释为什么结论正确或不正确)
119
- - **合并格式优化**:如果多个告警(gid)的验证结果(is_valid)和验证说明(verification_notes)完全一致,可以使用 gids 数组格式合并输出,减少重复内容。单个告警使用 gid,多个告警合并使用 gids。gid 和 gids 不能同时出现。
120
- - 必须对所有输入的 gid 进行验证,不能遗漏。
121
- - 如果验证通过(is_valid: true),则保留该告警;如果验证不通过(is_valid: false),则视为误报,不记录为问题。
122
- """.strip()
123
-
124
-
125
- # 注:当前版本不使用 MultiAgent 编排,已移除默认多智能体配置与创建函数。
126
- # 请使用 run_security_analysis(单Agent逐条验证)或 workflow.direct_scan + format_markdown_report(直扫基线)。
127
-
128
- def _git_restore_if_dirty(repo_root: str) -> int:
129
- """
130
- 若 repo_root 为 git 仓库:检测工作区是否有变更;如有则使用 'git checkout -- .' 恢复。
131
- 返回估算的变更文件数(基于 git status --porcelain 的行数)。
132
- """
133
- try:
134
- from pathlib import Path as _Path
135
- import subprocess as _sub
136
- root = _Path(repo_root)
137
- if not (root / ".git").exists():
138
- return 0
139
- proc = _sub.run(["git", "status", "--porcelain"], cwd=str(root), capture_output=True, text=True)
140
- if proc.returncode != 0:
141
- return 0
142
- lines = [line for line in proc.stdout.splitlines() if line.strip()]
143
- if lines:
144
- _sub.run(["git", "checkout", "--", "."], cwd=str(root), capture_output=True, text=True)
145
- return len(lines)
146
- except Exception:
147
- pass
148
- return 0
149
-
150
-
151
- def _get_sec_dir(base_path: str):
152
- """获取 .jarvis/sec 目录路径,支持 base_path 是项目根目录或已经是 .jarvis/sec 目录"""
153
- from pathlib import Path as _Path
154
- base = _Path(base_path)
155
- # 检查 base_path 是否已经是 .jarvis/sec 目录
156
- if base.name == "sec" and base.parent.name == ".jarvis":
157
- return base
158
- # 否则,假设 base_path 是项目根目录
159
- return base / ".jarvis" / "sec"
160
-
161
-
162
- def _initialize_analysis_context(
163
- entry_path: str,
164
- status_mgr,
165
- ) -> tuple:
166
- """
167
- 初始化分析上下文,包括状态管理、进度文件、目录等。
168
-
169
- 返回: (sec_dir, progress_path, _progress_append, done_sigs)
170
- """
171
- from pathlib import Path as _Path
172
- from datetime import datetime as _dt
173
- import json as _json
174
-
175
- # 获取 .jarvis/sec 目录
176
- sec_dir = _get_sec_dir(entry_path)
177
- progress_path = sec_dir / "progress.jsonl"
178
-
179
- # 进度追加函数
180
- def _progress_append(rec: Dict) -> None:
181
- try:
182
- progress_path.parent.mkdir(parents=True, exist_ok=True)
183
- rec = dict(rec)
184
- rec.setdefault("timestamp", _dt.utcnow().isoformat() + "Z")
185
- line = _json.dumps(rec, ensure_ascii=False)
186
- with progress_path.open("a", encoding="utf-8") as f:
187
- f.write(line + "\n")
188
- except Exception:
189
- # 进度文件失败不影响主流程
190
- pass
191
-
192
- # 已完成集合(按候选签名)
193
- done_sigs: set = set()
194
- if progress_path.exists():
195
- try:
196
- for line in progress_path.read_text(encoding="utf-8", errors="ignore").splitlines():
197
- line = line.strip()
198
- if not line:
199
- continue
200
- try:
201
- obj = _json.loads(line)
202
- except Exception:
203
- continue
204
- if obj.get("event") == "task_status" and obj.get("status") == "done":
205
- sig = obj.get("candidate_signature")
206
- if sig:
207
- done_sigs.add(sig)
208
- except Exception:
209
- pass
210
-
211
- return sec_dir, progress_path, _progress_append, done_sigs
212
-
213
-
214
- def _load_or_run_heuristic_scan(
215
- entry_path: str,
216
- langs: List[str],
217
- exclude_dirs: Optional[List[str]],
218
- sec_dir,
219
- status_mgr,
220
- _progress_append,
221
- ) -> tuple[List[Dict], Dict]:
222
- """
223
- 加载或运行启发式扫描。
224
-
225
- 返回: (candidates, summary)
226
- """
227
- import json
228
- from pathlib import Path as _Path
229
-
230
- _heuristic_path = sec_dir / "heuristic_issues.jsonl"
231
- candidates: List[Dict] = []
232
- summary: Dict = {}
233
-
234
- if _heuristic_path.exists():
235
- try:
236
- typer.secho(f"[jarvis-sec] 从 {_heuristic_path} 恢复启发式扫描", fg=typer.colors.BLUE)
237
- with _heuristic_path.open("r", encoding="utf-8") as f:
238
- for line in f:
239
- if line.strip():
240
- candidates.append(json.loads(line))
241
- _progress_append({
242
- "event": "pre_scan_resumed",
243
- "path": str(_heuristic_path),
244
- "issues_found": len(candidates)
245
- })
246
- except Exception as e:
247
- typer.secho(f"[jarvis-sec] 恢复启发式扫描失败,执行完整扫描: {e}", fg=typer.colors.YELLOW)
248
- candidates = [] # 重置以便执行完整扫描
249
-
250
- if not candidates:
251
- _progress_append({"event": "pre_scan_start", "entry_path": entry_path, "languages": langs})
252
- status_mgr.update_pre_scan(message="开始启发式扫描...")
253
- pre_scan = direct_scan(entry_path, languages=langs, exclude_dirs=exclude_dirs)
254
- candidates = pre_scan.get("issues", [])
255
- summary = pre_scan.get("summary", {})
256
- scanned_files = summary.get("scanned_files", 0)
257
- status_mgr.update_pre_scan(
258
- current_files=scanned_files,
259
- total_files=scanned_files,
260
- issues_found=len(candidates),
261
- message=f"启发式扫描完成,发现 {len(candidates)} 个候选问题"
262
- )
263
- _progress_append({
264
- "event": "pre_scan_done",
265
- "entry_path": entry_path,
266
- "languages": langs,
267
- "scanned_files": scanned_files,
268
- "issues_found": len(candidates)
269
- })
270
- # 持久化
271
- try:
272
- _heuristic_path.parent.mkdir(parents=True, exist_ok=True)
273
- with _heuristic_path.open("w", encoding="utf-8") as f:
274
- for item in candidates:
275
- f.write(json.dumps(item, ensure_ascii=False) + "\n")
276
- _progress_append({
277
- "event": "heuristic_report_written",
278
- "path": str(_heuristic_path),
279
- "issues_count": len(candidates),
280
- })
281
- typer.secho(f"[jarvis-sec] 已将 {len(candidates)} 个启发式扫描问题写入 {_heuristic_path}", fg=typer.colors.GREEN)
282
- except Exception:
283
- pass
284
- else:
285
- # 从断点恢复启发式扫描结果
286
- status_mgr.update_pre_scan(
287
- issues_found=len(candidates),
288
- message=f"从断点恢复,已发现 {len(candidates)} 个候选问题"
289
- )
290
-
291
- return candidates, summary
292
-
293
-
294
- def _compact_candidate(it: Dict) -> Dict:
295
- """精简候选问题,只保留必要字段"""
296
- return {
297
- "language": it.get("language"),
298
- "category": it.get("category"),
299
- "pattern": it.get("pattern"),
300
- "file": it.get("file"),
301
- "line": it.get("line"),
302
- "evidence": it.get("evidence"),
303
- "confidence": it.get("confidence"),
304
- "severity": it.get("severity", "medium"),
305
- }
306
-
307
-
308
- def _prepare_candidates(candidates: List[Dict]) -> List[Dict]:
309
- """
310
- 将候选问题精简为子任务清单,控制上下文长度,并分配全局唯一ID。
311
-
312
- 返回: compact_candidates (已分配gid的候选列表)
313
- """
314
- compact_candidates = [_compact_candidate(it) for it in candidates]
315
- # 为所有候选分配全局唯一数字ID(gid: 1..N),用于跨批次/跨文件统一编号与跟踪
316
- for i, it in enumerate(compact_candidates, start=1):
317
- try:
318
- it["gid"] = i
319
- except Exception:
320
- pass
321
-
322
- return compact_candidates
323
-
324
-
325
- def _load_existing_clusters(
326
- sec_dir,
327
- progress_path,
328
- ) -> tuple[Dict[tuple[str, int], List[Dict]], set]:
329
- """
330
- 读取已有聚类报告以支持断点恢复。
331
-
332
- 返回: (_existing_clusters, _completed_cluster_batches)
333
- """
334
- _existing_clusters: Dict[tuple[str, int], List[Dict]] = {}
335
- _completed_cluster_batches: set = set()
336
-
337
- try:
338
- from pathlib import Path as _Path2
339
- import json as _json
340
- _cluster_path = sec_dir / "cluster_report.jsonl"
341
-
342
- # 从 progress.jsonl 中读取已完成的聚类批次(优先检查)
343
- if progress_path.exists():
344
- try:
345
- for line in progress_path.read_text(encoding="utf-8", errors="ignore").splitlines():
346
- line = line.strip()
347
- if not line:
348
- continue
349
- try:
350
- obj = _json.loads(line)
351
- except Exception:
352
- continue
353
- # 检查 cluster_status 事件,status 为 "done" 表示已完成
354
- if obj.get("event") == "cluster_status" and obj.get("status") == "done":
355
- file_name = obj.get("file")
356
- batch_idx = obj.get("batch_index")
357
- if file_name and batch_idx:
358
- _completed_cluster_batches.add((str(file_name), int(batch_idx)))
359
- except Exception:
360
- pass
361
-
362
- # 读取 cluster_report.jsonl(由于使用追加模式,可能有重复,需要去重)
363
- if _cluster_path.exists():
364
- try:
365
- # 使用字典去重:key 为 (file, batch_index, verification, gids 的字符串表示)
366
- seen_records: Dict[tuple, Dict] = {}
367
- with _cluster_path.open("r", encoding="utf-8", errors="ignore") as f:
368
- for line in f:
369
- line = line.strip()
370
- if not line:
371
- continue
372
- rec = _json.loads(line)
373
- if not isinstance(rec, dict):
374
- continue
375
- f_name = str(rec.get("file") or "")
376
- bidx = int(rec.get("batch_index", 1) or 1)
377
- # 使用 gids 的排序后元组作为去重键
378
- gids_list = rec.get("gids", [])
379
- gids_key = tuple(sorted(gids_list)) if isinstance(gids_list, list) else ()
380
- key = (f_name, bidx, str(rec.get("verification", "")), gids_key)
381
- # 保留最新的记录(后写入的覆盖先写入的)
382
- seen_records[key] = rec
383
-
384
- # 按 (file, batch_index) 分组
385
- for rec in seen_records.values():
386
- f_name = str(rec.get("file") or "")
387
- bidx = int(rec.get("batch_index", 1) or 1)
388
- _existing_clusters.setdefault((f_name, bidx), []).append(rec)
389
- except Exception:
390
- _existing_clusters = {}
391
- except Exception:
392
- _existing_clusters = {}
393
- _completed_cluster_batches = set()
394
-
395
- return _existing_clusters, _completed_cluster_batches
396
-
397
-
398
- def _restore_clusters_from_checkpoint(
399
- _existing_clusters: Dict[tuple[str, int], List[Dict]],
400
- _file_groups: Dict[str, List[Dict]],
401
- ) -> tuple[List[List[Dict]], List[Dict], List[Dict], set]:
402
- """
403
- 从断点恢复聚类结果。
404
-
405
- 返回: (cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids)
406
- """
407
- # 1. 收集所有候选的 gid
408
- all_candidate_gids_in_clustering = set()
409
- gid_to_candidate: Dict[int, Dict] = {}
410
- for _file, _items in _file_groups.items():
411
- for it in _items:
412
- try:
413
- _gid = int(it.get("gid", 0))
414
- if _gid >= 1:
415
- all_candidate_gids_in_clustering.add(_gid)
416
- gid_to_candidate[_gid] = it
417
- except Exception:
418
- pass
419
-
420
- # 2. 从 cluster_report.jsonl 恢复所有聚类结果
421
- clustered_gids = set() # 已聚类的 gid(包括有效和无效的,因为无效的也需要进入复核阶段)
422
- invalid_clusters_for_review: List[Dict] = [] # 无效聚类列表(从断点恢复)
423
- cluster_batches: List[List[Dict]] = []
424
- cluster_records: List[Dict] = []
425
-
426
- for (_file_key, _batch_idx), cluster_recs in _existing_clusters.items():
427
- for rec in cluster_recs:
428
- gids_list = rec.get("gids", [])
429
- if not gids_list:
430
- continue
431
- is_invalid = rec.get("is_invalid", False)
432
- verification = str(rec.get("verification", "")).strip()
433
- members: List[Dict] = []
434
- for _gid in gids_list:
435
- try:
436
- _gid_int = int(_gid)
437
- if _gid_int >= 1 and _gid_int in gid_to_candidate:
438
- # 只有当 gid 在当前运行中存在时,才恢复该聚类
439
- candidate = gid_to_candidate[_gid_int]
440
- candidate["verify"] = verification
441
- members.append(candidate)
442
- # 无论有效还是无效,都计入已聚类的 gid(避免被重新聚类)
443
- clustered_gids.add(_gid_int)
444
- except Exception:
445
- pass
446
-
447
- if members:
448
- if is_invalid:
449
- # 无效聚类:收集到复核列表,不加入 cluster_batches
450
- invalid_clusters_for_review.append({
451
- "file": _file_key,
452
- "batch_index": _batch_idx,
453
- "gids": [m.get("gid") for m in members],
454
- "verification": verification,
455
- "invalid_reason": str(rec.get("invalid_reason", "")).strip(),
456
- "members": members, # 保存候选信息,用于复核后可能重新加入验证
457
- "count": len(members),
458
- })
459
- else:
460
- # 有效聚类:恢复到 cluster_batches
461
- cluster_batches.append(members)
462
- cluster_records.append({
463
- "file": _file_key,
464
- "verification": verification,
465
- "gids": [m.get("gid") for m in members],
466
- "count": len(members),
467
- "batch_index": _batch_idx,
468
- "is_invalid": False,
469
- })
470
-
471
- return cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids
472
-
473
-
474
- def _get_review_system_prompt() -> str:
475
- """获取复核Agent的系统提示词"""
476
- return """
477
- # 复核Agent约束
478
- - 你的核心任务是复核聚类Agent给出的无效结论是否充分和正确。
479
- - 你需要仔细检查聚类Agent提供的invalid_reason是否充分,是否真的考虑了所有可能的路径。
480
- - 工具优先:使用 read_code 读取目标文件附近源码(行号前后各 ~50 行),必要时用 execute_script 辅助检索。
481
- - 必要时需向上追溯调用者,查看完整的调用路径,以确认聚类Agent的结论是否成立。
482
- - 禁止修改任何文件或执行写操作命令;仅进行只读分析与读取。
483
- - 每次仅执行一个操作;等待工具结果后再进行下一步。
484
- - **记忆使用**:
485
- - 在复核过程中,充分利用 retrieve_memory 工具检索已有的记忆,特别是与当前文件或函数相关的记忆。
486
- - 这些记忆可能包含函数的分析要点、指针判空情况、输入校验情况、调用路径分析结果等。
487
- - **复核原则**:
488
- - 必须验证聚类Agent是否真的检查了所有可能的调用路径和调用者。
489
- - 必须验证聚类Agent是否真的确认了所有路径都有保护措施。
490
- - 如果发现聚类Agent遗漏了某些路径、调用者或边界情况,必须判定为理由不充分。
491
- - 保守策略:有疑问时,一律判定为理由不充分,将候选重新加入验证流程。
492
- - 完成复核后,主输出仅打印结束符 <!!!COMPLETE!!!> ,不需要汇总结果。
493
- """.strip()
494
-
495
-
496
def _get_review_summary_prompt() -> str:
    """Return the summary prompt for the review agent.

    The prompt (written in Chinese) asks the agent to emit its review
    conclusions as a YAML array between ``<REPORT>`` and ``</REPORT>``.
    Each element carries either ``gid`` or ``gids`` (never both), a boolean
    ``is_reason_sufficient`` and a ``review_notes`` string.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    # NOTE(review): this literal was recovered from a diff view that strips
    # leading whitespace; the YAML examples below were presumably indented
    # under each "- gid:" / "- gids:" entry in the packaged source — confirm.
    return """
请将本轮"复核结论"的结构化结果仅放入以下标记中,并使用 YAML 数组对象形式输出。
你需要复核聚类Agent给出的无效理由是否充分,是否真的考虑了所有可能的路径。

示例1:理由充分(is_reason_sufficient: true,单个gid)
<REPORT>
- gid: 1
is_reason_sufficient: true
review_notes: "聚类Agent已检查所有调用路径,确认所有调用者都有输入校验,理由充分"
</REPORT>

示例2:理由充分(is_reason_sufficient: true,多个gid合并)
<REPORT>
- gids: [1, 2, 3]
is_reason_sufficient: true
review_notes: "聚类Agent已检查所有调用路径,确认所有调用者都有输入校验,理由充分"
</REPORT>

示例3:理由不充分(is_reason_sufficient: false)
<REPORT>
- gid: 1
is_reason_sufficient: false
review_notes: "聚类Agent遗漏了函数X的调用路径,该路径可能未做校验,理由不充分,需要重新验证"
</REPORT>

要求:
- 只能在 <REPORT> 与 </REPORT> 中输出 YAML 数组,且不得出现其他文本。
- 数组元素为对象,包含字段:
- gid: 整数(全局唯一编号,对应无效聚类的gid,单个告警时使用)
- gids: 整数数组(全局唯一编号数组,对应无效聚类的gids,多个告警合并时使用)
- is_reason_sufficient: 布尔值 (true/false),表示无效理由是否充分
- review_notes: 字符串(复核说明,解释为什么理由充分或不充分)
- **合并格式优化**:如果多个告警(gid)的复核结果(is_reason_sufficient)和复核说明(review_notes)完全一致,可以使用 gids 数组格式合并输出,减少重复内容。单个告警使用 gid,多个告警合并使用 gids。gid 和 gids 不能同时出现。
- 必须对所有输入的gid进行复核,不能遗漏。
- 如果理由不充分(is_reason_sufficient: false),该候选将重新加入验证流程;如果理由充分(is_reason_sufficient: true),该候选将被确认为无效。
""".strip()
534
-
535
-
536
def _build_review_task(review_batch: List[Dict], entry_path: str, langs: List[str]) -> str:
    """Render the task text handed to the review agent for one batch.

    Args:
        review_batch: Invalid-cluster records to review; serialized as JSON.
        entry_path: Project path echoed into the context section.
        langs: Language list echoed into the context section.

    Returns:
        The task description with surrounding whitespace stripped.
    """
    import json

    # Serialize first so the template below stays a plain substitution.
    clusters_json = json.dumps(review_batch, ensure_ascii=False, indent=2)
    task_text = f"""
# 复核无效聚类任务
上下文参数:
- entry_path: {entry_path}
- languages: {langs}

需要复核的无效聚类(JSON数组):
{clusters_json}

请仔细复核每个无效聚类的invalid_reason是否充分,是否真的考虑了所有可能的路径、调用者和边界情况。
对于每个gid,请判断无效理由是否充分(is_reason_sufficient: true/false),并给出复核说明。
"""
    return task_text.strip()
551
-
552
-
553
def _create_review_agent(
    current_review_num: int,
    llm_group: Optional[str],
) -> Agent:
    """Build the agent that reviews one batch of invalid clusters.

    Args:
        current_review_num: Batch number, used in the agent name.
        llm_group: Optional model group; passed as ``model_group`` only when
            truthy.

    Returns:
        A configured ``Agent`` instance.
    """
    agent_name = f"JARVIS-SEC-Review-Batch-{current_review_num}"
    options: Dict = {
        "system_prompt": _get_review_system_prompt(),
        "name": agent_name,
        "auto_complete": True,
        "need_summary": True,
        "summary_prompt": _get_review_summary_prompt(),
        "non_interactive": True,
        "in_multi_agent": False,
        "use_methodology": False,
        "use_analysis": False,
        "plan": False,
        "output_handler": [ToolRegistry()],
        "disable_file_edit": True,
        "use_tools": ["read_code", "execute_script", "retrieve_memory", "save_memory"],
    }
    if llm_group:
        options["model_group"] = llm_group
    return Agent(**options)
580
-
581
-
582
def _process_review_batch_items(
    review_batch: List[Dict],
    review_results: Optional[List[Dict]],
    reviewed_clusters: List[Dict],
    reinstated_candidates: List[Dict],
) -> None:
    """Forward one review batch and its results to ``_process_review_batch``.

    All four arguments are passed through unchanged; the two output lists are
    the ones handed in by the caller.
    """
    _process_review_batch(
        review_batch=review_batch,
        review_results=review_results,
        reviewed_clusters=reviewed_clusters,
        reinstated_candidates=reinstated_candidates,
    )
595
-
596
-
597
def _reinstated_candidates_to_cluster_batches(
    reinstated_candidates: List[Dict],
    cluster_batches: List[List[Dict]],
    _progress_append,
) -> None:
    """Append reinstated candidates to ``cluster_batches``, one batch per file.

    Args:
        reinstated_candidates: Candidates sent back to verification.
        cluster_batches: Batch list, extended in place.
        _progress_append: Callable receiving one ``review_reinstated`` event
            dict per appended batch.
    """
    if not reinstated_candidates:
        return

    typer.secho(f"[jarvis-sec] 复核完成:{len(reinstated_candidates)} 个候选重新加入验证流程", fg=typer.colors.GREEN)

    # Group by file, keeping first-seen order of the files.
    grouped: Dict[str, List[Dict]] = {}
    for candidate in reinstated_candidates:
        path_key = str(candidate.get("file") or "")
        grouped.setdefault(path_key, []).append(candidate)

    for path_key, group in grouped.items():
        cluster_batches.append(group)
        _progress_append({
            "event": "review_reinstated",
            "file": path_key,
            "gids": [member.get("gid") for member in group],
            "count": len(group),
        })
625
-
626
-
627
def _process_review_phase(
    invalid_clusters_for_review: List[Dict],
    entry_path: str,
    langs: List[str],
    llm_group: Optional[str],
    status_mgr,
    _progress_append,
    cluster_batches: List[List[Dict]],
) -> List[List[Dict]]:
    """Run the review phase over every cluster that was marked invalid.

    Clusters are reviewed in batches of at most 10. Each batch gets its own
    review agent; candidates whose invalid reason is judged insufficient are
    appended back to ``cluster_batches``.

    Args:
        invalid_clusters_for_review: Invalid-cluster records to review.
        entry_path: Project path passed to the review task and retry helper.
        langs: Language list passed to the review task.
        llm_group: Optional model group for the review agents.
        status_mgr: Object exposing ``update_review(...)``.
        _progress_append: Callable receiving progress event dicts.
        cluster_batches: Batch list, extended in place.

    Returns:
        ``cluster_batches`` (the same list object that was passed in).
    """
    if not invalid_clusters_for_review:
        typer.secho("[jarvis-sec] 无无效聚类需要复核", fg=typer.colors.BLUE)
        return cluster_batches

    total_clusters = len(invalid_clusters_for_review)
    typer.secho(f"\n[jarvis-sec] 开始复核 {total_clusters} 个无效聚类...", fg=typer.colors.MAGENTA)
    status_mgr.update_review(
        current_review=0,
        total_reviews=total_clusters,
        message="开始复核无效聚类..."
    )

    # At most 10 invalid clusters per batch, to keep the agent context short.
    review_batch_size = 10
    total_review_batches = (total_clusters + review_batch_size - 1) // review_batch_size
    reviewed_clusters: List[Dict] = []
    reinstated_candidates: List[Dict] = []  # candidates sent back to verification

    review_summary_prompt = _get_review_summary_prompt()

    for review_idx in range(0, total_clusters, review_batch_size):
        review_batch = invalid_clusters_for_review[review_idx:review_idx + review_batch_size]
        current_review_num = review_idx // review_batch_size + 1

        typer.secho(f"[jarvis-sec] 复核批次 {current_review_num}/{total_review_batches}: {len(review_batch)} 个无效聚类", fg=typer.colors.CYAN)
        status_mgr.update_review(
            current_review=current_review_num,
            total_reviews=total_review_batches,
            message=f"正在复核批次 {current_review_num}/{total_review_batches}"
        )

        review_task = _build_review_task(review_batch, entry_path, langs)
        review_agent = _create_review_agent(current_review_num, llm_group)
        review_summary_container = _subscribe_summary_event(review_agent)

        # The helper retries until the output format is valid; the returned
        # parse error is not needed here.
        review_results, _ = _run_review_agent_with_retry(
            review_agent,
            review_task,
            review_summary_prompt,
            entry_path,
            review_summary_container,
        )

        _process_review_batch_items(
            review_batch,
            review_results,
            reviewed_clusters,
            reinstated_candidates,
        )

    _reinstated_candidates_to_cluster_batches(
        reinstated_candidates,
        cluster_batches,
        _progress_append,
    )

    if not reinstated_candidates:
        typer.secho("[jarvis-sec] 复核完成:所有无效聚类理由充分,确认为无效", fg=typer.colors.GREEN)

    # "reinstated" counts candidates while "total_reviewed" counts clusters,
    # so the confirmed figure is taken from the per-cluster review records
    # instead of subtracting one from the other.
    confirmed_invalid = sum(
        1 for cluster in reviewed_clusters
        if cluster.get("review_result") == "confirmed_invalid"
    )
    _progress_append({
        "event": "review_completed",
        "total_reviewed": total_clusters,
        "reinstated": len(reinstated_candidates),
        "confirmed_invalid": confirmed_invalid,
    })
    status_mgr.update_review(
        current_review=total_clusters,
        total_reviews=total_clusters,
        message=f"复核完成:{len(reinstated_candidates)} 个候选重新加入验证"
    )

    return cluster_batches
722
-
723
-
724
def _build_gid_to_review_mapping(review_results: List[Dict]) -> Dict[int, Dict]:
    """Index review results by gid, accepting both ``gid`` and ``gids`` items.

    Non-dict entries, gids that cannot be converted to ``int`` and gids below
    1 are ignored. When a gid appears more than once the later entry wins.

    Args:
        review_results: Parsed review items.

    Returns:
        Mapping ``gid -> {"is_reason_sufficient": ..., "review_notes": str}``.
    """
    mapping: Dict[int, Dict] = {}
    for entry in review_results:
        if not isinstance(entry, dict):
            continue

        # A list-valued "gids" takes precedence; otherwise fall back to "gid".
        if "gids" in entry and isinstance(entry.get("gids"), list):
            raw_gids = list(entry.get("gids", []))
        elif "gid" in entry:
            raw_gids = [entry.get("gid", 0)]
        else:
            raw_gids = []

        sufficient = entry.get("is_reason_sufficient")
        notes = str(entry.get("review_notes", "")).strip()

        for raw in raw_gids:
            try:
                gid_int = int(raw)
            except Exception:
                continue
            if gid_int >= 1:
                mapping[gid_int] = {
                    "is_reason_sufficient": sufficient,
                    "review_notes": notes,
                }
    return mapping
760
-
761
-
762
def _process_review_batch(
    review_batch: List[Dict],
    review_results: Optional[List[Dict]],
    reviewed_clusters: List[Dict],
    reinstated_candidates: List[Dict],
) -> None:
    """Apply the review results of one batch to its invalid clusters.

    A cluster is confirmed invalid only when every one of its gids has a
    review result with ``is_reason_sufficient`` exactly ``True``. In every
    other case (an insufficient verdict, a gid the reviewer did not cover, or
    no usable results at all) the cluster's members are reinstated for
    verification. This follows the conservative policy the review prompt
    states and the parse-failure branch already applies.

    Args:
        review_batch: Invalid-cluster records (``gids``, ``members``, ...).
        review_results: Parsed review items, or ``None``/empty on failure.
        reviewed_clusters: Output list; receives one record per cluster with
            ``review_result`` and ``review_notes`` added.
        reinstated_candidates: Output list; receives reinstated members.
    """
    if not review_results:
        # No usable review output: reinstate the whole batch.
        typer.secho("[jarvis-sec] 警告:复核结果解析失败,保守策略:将批次中的所有候选重新加入验证流程", fg=typer.colors.YELLOW)
        for invalid_cluster in review_batch:
            reinstated_candidates.extend(invalid_cluster.get("members", []))
            reviewed_clusters.append({
                **invalid_cluster,
                "review_result": "reinstated",
                "review_notes": "复核结果解析失败,保守策略重新加入验证",
            })
        return

    gid_to_review = _build_gid_to_review_mapping(review_results)

    def _lookup(gid) -> Optional[Dict]:
        # The mapping is keyed by int; tolerate gids stored as strings.
        try:
            return gid_to_review.get(int(gid))
        except (TypeError, ValueError):
            return None

    for invalid_cluster in review_batch:
        cluster_gids = invalid_cluster.get("gids", [])
        cluster_members = invalid_cluster.get("members", [])
        verdicts = [_lookup(gid) for gid in cluster_gids]

        first_insufficient = next(
            (v for v in verdicts if v is not None and v.get("is_reason_sufficient") is not True),
            None,
        )
        has_unreviewed_gid = any(v is None for v in verdicts)

        if first_insufficient is not None:
            typer.secho(f"[jarvis-sec] 复核结果:无效聚类(gids={cluster_gids})理由不充分,重新加入验证流程", fg=typer.colors.BLUE)
            reinstated_candidates.extend(cluster_members)
            reviewed_clusters.append({
                **invalid_cluster,
                "review_result": "reinstated",
                "review_notes": first_insufficient.get("review_notes", ""),
            })
        elif has_unreviewed_gid:
            # The reviewer skipped at least one gid; do not drop it silently.
            typer.secho(f"[jarvis-sec] 复核结果:无效聚类(gids={cluster_gids})未被完整复核,保守策略:重新加入验证流程", fg=typer.colors.YELLOW)
            reinstated_candidates.extend(cluster_members)
            reviewed_clusters.append({
                **invalid_cluster,
                "review_result": "reinstated",
                "review_notes": "复核结果未覆盖该聚类的全部 gid,保守策略重新加入验证",
            })
        else:
            review_notes = verdicts[0].get("review_notes", "") if verdicts else ""
            typer.secho(f"[jarvis-sec] 复核结果:无效聚类(gids={cluster_gids})理由充分,确认为无效", fg=typer.colors.GREEN)
            reviewed_clusters.append({
                **invalid_cluster,
                "review_result": "confirmed_invalid",
                "review_notes": review_notes,
            })
825
-
826
-
827
def _run_review_agent_with_retry(
    review_agent,
    review_task: str,
    review_summary_prompt: str,
    entry_path: str,
    review_summary_container: Dict[str, str],
) -> tuple[Optional[List[Dict]], Optional[str]]:
    """Run the review agent, retrying until its summary has a valid format.

    The first attempt uses ``review_agent.run``. After any failed attempt,
    retries call the model directly with the task, the previous YAML parse
    error (if any) and ``review_summary_prompt``. The loop has no attempt
    limit and only returns once every parsed item passes
    ``_is_valid_review_item``.

    Args:
        review_agent: Agent exposing ``run`` and ``model.chat_until_success``.
        review_task: Task text for the agent.
        review_summary_prompt: Format instructions appended on direct-model
            retries.
        entry_path: Path handed to ``_git_restore_if_dirty`` after each run.
        review_summary_container: Dict whose ``"text"`` entry receives the
            summary; reset at the start of each attempt.

    Returns:
        ``(review_items, None)`` once a valid, non-empty list was parsed.
    """
    use_direct_model_review = False
    prev_parse_error_review: Optional[str] = None
    review_attempt = 0

    while True:
        review_attempt += 1
        review_summary_container["text"] = ""

        if use_direct_model_review:
            error_guidance = ""
            if prev_parse_error_review:
                error_guidance = f"\n\n**格式错误详情(请根据以下错误修复输出格式):**\n- YAML解析失败: {prev_parse_error_review}\n\n请确保输出的YAML格式正确,包括正确的缩进、引号、冒号等。"

            # Use the review-specific format prompt passed by the caller. The
            # verification prompt describes a different schema that
            # _is_valid_review_item would never accept.
            full_review_prompt = f"{review_task}{error_guidance}\n\n{review_summary_prompt}"
            try:
                review_response = review_agent.model.chat_until_success(full_review_prompt)  # type: ignore
                review_summary_container["text"] = review_response
            except Exception as e:
                try:
                    typer.secho(f"[jarvis-sec] 复核阶段直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
                except Exception:
                    pass
                review_agent.run(review_task)
        else:
            review_agent.run(review_task)

        # Workspace protection: best-effort restore of files the agent changed.
        try:
            _changed_review = _git_restore_if_dirty(entry_path)
            if _changed_review:
                try:
                    typer.secho(f"[jarvis-sec] 复核 Agent 工作区已恢复 ({_changed_review} 个文件)", fg=typer.colors.BLUE)
                except Exception:
                    pass
        except Exception:
            pass

        # Parse the review summary.
        review_summary_text = review_summary_container.get("text", "")
        parse_error_review = None
        if review_summary_text:
            review_parsed, parse_error_review = _try_parse_summary_report(review_summary_text)
            if parse_error_review:
                prev_parse_error_review = parse_error_review
                try:
                    typer.secho(f"[jarvis-sec] 复核结果YAML解析失败: {parse_error_review}", fg=typer.colors.YELLOW)
                except Exception:
                    pass
            else:
                prev_parse_error_review = None
                if isinstance(review_parsed, list):
                    if review_parsed and all(_is_valid_review_item(item) for item in review_parsed):
                        return review_parsed, None

        # Format check failed: later attempts call the model directly.
        use_direct_model_review = True
        if parse_error_review:
            try:
                typer.secho(f"[jarvis-sec] 复核结果YAML解析失败 -> 重试第 {review_attempt} 次 (使用直接模型调用,将反馈解析错误)", fg=typer.colors.YELLOW)
            except Exception:
                pass
        else:
            try:
                typer.secho(f"[jarvis-sec] 复核结果格式无效 -> 重试第 {review_attempt} 次 (使用直接模型调用)", fg=typer.colors.YELLOW)
            except Exception:
                pass
904
-
905
-
906
def _is_valid_review_item(item: Dict) -> bool:
    """Check the shape of one review result item.

    A valid item is a dict that has ``is_reason_sufficient`` and exactly one
    of ``gid`` / ``gids``. ``gid`` must convert to an int >= 1; ``gids`` must
    be a non-empty list whose elements all convert to ints >= 1.
    """
    if not isinstance(item, dict):
        return False
    if "is_reason_sufficient" not in item:
        return False
    # Exactly one of the two keys may be present.
    if ("gid" in item) == ("gids" in item):
        return False

    if "gid" in item:
        values = [item["gid"]]
    else:
        values = item["gids"]
        if not isinstance(values, list) or not values:
            return False

    try:
        return all(int(value) >= 1 for value in values)
    except Exception:
        return False
929
-
930
-
931
def _load_processed_gids_from_issues(sec_dir) -> set:
    """Collect the gids already recorded in ``agent_issues.jsonl``.

    Args:
        sec_dir: Directory object supporting ``/`` (the file is looked up as
            ``sec_dir / "agent_issues.jsonl"``).

    Returns:
        Set of int gids >= 1 found in the file. Unreadable lines are skipped
        and any other failure yields whatever was collected so far.
    """
    processed_gids = set()
    try:
        issues_path = sec_dir / "agent_issues.jsonl"
        if issues_path.exists():
            import json

            with issues_path.open("r", encoding="utf-8", errors="ignore") as handle:
                for raw_line in handle:
                    text = raw_line.strip()
                    if not text:
                        continue
                    try:
                        gid_value = int(json.loads(text).get("gid", 0))
                    except Exception:
                        continue
                    if gid_value >= 1:
                        processed_gids.add(gid_value)
            if processed_gids:
                # Console output is best-effort only.
                try:
                    typer.secho(f"[jarvis-sec] 断点恢复:从 agent_issues.jsonl 读取到 {len(processed_gids)} 个已处理的 gid", fg=typer.colors.BLUE)
                except Exception:
                    pass
    except Exception:
        pass
    return processed_gids
959
-
960
-
961
def _count_issues_from_file(sec_dir) -> int:
    """Count verified issues in ``agent_issues.jsonl`` (for status display).

    A record is counted when its gid is >= 1, has not been counted before,
    ``has_risk`` is exactly ``True`` and a ``verification_notes`` key exists.
    The gid is coerced with ``int()``, matching
    ``_load_processed_gids_from_issues``, so a gid stored as a numeric string
    is counted instead of being dropped by a comparison error.

    Args:
        sec_dir: Directory object supporting ``/`` (the file is looked up as
            ``sec_dir / "agent_issues.jsonl"``).

    Returns:
        Number of distinct counted gids; 0 when the file is missing. Any
        unexpected failure yields the count reached so far.
    """
    count = 0
    try:
        import json as _json

        _agent_issues_path = sec_dir / "agent_issues.jsonl"
        if not _agent_issues_path.exists():
            return count

        saved_gids = set()
        with _agent_issues_path.open("r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    item = _json.loads(line)
                    gid = int(item.get("gid", 0))
                except Exception:
                    # Malformed line or non-object record: skip it, keep going.
                    continue
                if gid < 1 or gid in saved_gids:
                    continue
                # Only alerts that passed verification are counted.
                if item.get("has_risk") is True and "verification_notes" in item:
                    count += 1
                    saved_gids.add(gid)
    except Exception:
        # Status display is best-effort; never propagate I/O problems.
        pass
    return count
988
-
989
-
990
def _create_analysis_agent(task_id: str, llm_group: Optional[str]) -> Agent:
    """Create the analysis agent for one verification batch.

    The system prompt (written in Chinese) requires the agent to derive the
    call path from controllable input to the flagged code, to trace callers,
    to work read-only, to use the memory tools, and to finish by printing the
    ``<!!!COMPLETE!!!>`` marker.

    Args:
        task_id: Used as the agent name.
        llm_group: Optional model group; passed as ``model_group`` only when
            truthy.

    Returns:
        A configured ``Agent`` instance.
    """
    # NOTE(review): this literal was recovered from a diff view that strips
    # leading whitespace; sub-bullets and the numbered list were presumably
    # indented in the packaged source — confirm before relying on the text.
    system_prompt = """
# 单Agent安全分析约束
- 你的核心任务是评估代码的安全问题,目标:针对本候选问题进行证据核实、风险评估与修复建议补充,查找漏洞触发路径,确认在某些条件下会触发;以此来判断是否是漏洞。
- **必须进行调用路径推导**:
- 对于每个候选问题,必须明确推导从可控输入到缺陷代码的完整调用路径。
- 调用路径推导必须包括:
1. 识别可控输入的来源(例如:用户输入、网络数据、文件读取、命令行参数等)
2. 追踪数据流:从输入源开始,逐步追踪数据如何传递到缺陷代码位置
3. 识别调用链:明确列出从入口函数到缺陷代码的所有函数调用序列(例如:main() -> parse_input() -> process_data() -> vulnerable_function())
4. 分析每个调用点的数据校验情况:检查每个函数是否对输入进行了校验、边界检查或安全检查
5. 确认触发条件:明确说明在什么条件下,未校验或恶意输入能够到达缺陷代码位置
- 如果无法推导出完整的调用路径,或者所有调用路径都有充分的保护措施,则应该判定为误报。
- 调用路径推导必须在分析过程中明确展示,不能省略或假设。
- 工具优先:使用 read_code 读取目标文件附近源码(行号前后各 ~50 行),必要时用 execute_script 辅助检索。
- **调用路径追溯要求**:
- 必须向上追溯所有可能的调用者,查看完整的调用路径,以确认风险是否真实存在。
- 使用 read_code 和 execute_script 工具查找函数的调用者(例如:使用 grep 搜索函数名,查找所有调用该函数的位置)。
- 对于每个调用者,必须检查其是否对输入进行了校验。
- 如果发现任何调用路径未做校验,必须明确记录该路径。
- 例如:一个函数存在空指针解引用风险,必须检查所有调用者。如果所有调用者均能确保传入的指针非空,则该风险在当前代码库中可能不会实际触发;但如果存在任何调用者未做校验,则风险真实存在。
- 若多条告警位于同一文件且行号相距不远,可一次性读取共享上下文,对这些相邻告警进行联合分析与判断;但仍需避免无关扩展与大范围遍历。
- 禁止修改任何文件或执行写操作命令(rm/mv/cp/echo >、sed -i、git、patch、chmod、chown 等);仅进行只读分析与读取。
- 每次仅执行一个操作;等待工具结果后再进行下一步。
- **记忆使用**:
- 在分析过程中,充分利用 retrieve_memory 工具检索已有的记忆,特别是与当前分析函数相关的记忆。
- 如果有必要,使用 save_memory 工具保存每个函数的分析要点,使用函数名作为 tag(例如:函数名、文件名等)。
- 记忆内容示例:某个函数的指针已经判空、某个函数已有输入校验、某个函数的调用路径分析结果等。
- 这样可以避免重复分析,提高效率,并保持分析的一致性。
- 完成对本批次候选问题的判断后,主输出仅打印结束符 <!!!COMPLETE!!!> ,不需要汇总结果。
""".strip()

    agent_kwargs: Dict = dict(
        system_prompt=system_prompt,
        name=task_id,
        auto_complete=True,
        need_summary=True,
        summary_prompt=_build_summary_prompt(),
        non_interactive=True,
        in_multi_agent=False,
        use_methodology=False,
        use_analysis=False,
        plan=False,
        output_handler=[ToolRegistry()],
        disable_file_edit=True,
        force_save_memory=True,
        use_tools=["read_code", "execute_script", "save_memory", "retrieve_memory"],
    )
    # Only override the model group when one was requested.
    if llm_group:
        agent_kwargs["model_group"] = llm_group
    return Agent(**agent_kwargs)
1042
-
1043
-
1044
def _build_analysis_task_context(batch: List[Dict], entry_path: str, langs: List[str]) -> str:
    """Render the task text handed to the analysis agent for one batch.

    Args:
        batch: Candidate dicts of the batch; serialized as JSON. The
            ``verify`` value of the first candidate is shown as the cluster
            verification hint.
        entry_path: Project path echoed into the context section.
        langs: Language list echoed into the context section.

    Returns:
        The task description with surrounding whitespace stripped.
    """
    import json

    candidates: List[Dict] = list(batch)
    if candidates:
        verify_hint = str(candidates[0].get("verify"))
    else:
        verify_hint = str("")
    gid_list = [candidate.get("gid") for candidate in candidates]
    candidates_json = json.dumps(candidates, ensure_ascii=False, indent=2)

    task_text = f"""
# 安全子任务批次
上下文参数:
- entry_path: {entry_path}
- languages: {langs}
- cluster_verification: {verify_hint}

- cluster_gids: {gid_list}
- note: 每个候选含 gid/verify 字段,模型仅需输出 gid 统一给出验证/判断结论(全局编号);无需使用局部 id

批次候选(JSON数组):
{candidates_json}
"""
    return task_text.strip()
1063
-
1064
-
1065
def _subscribe_summary_event(agent: Agent) -> Dict[str, str]:
    """Subscribe to the agent's after-summary event and capture its text.

    Args:
        agent: Agent whose ``event_bus`` is subscribed to.

    Returns:
        A dict with a single ``"text"`` entry, initially empty, that the
        event handler overwrites with the ``summary`` keyword it receives.
        When the event constant cannot be imported or subscribing fails, the
        dict is returned without a working subscription.
    """
    holder: Dict[str, str] = {"text": ""}

    try:
        from jarvis.jarvis_agent.events import AFTER_SUMMARY as after_summary_event
    except Exception:
        after_summary_event = None

    if not after_summary_event:
        return holder

    def _capture_summary(**kwargs):
        try:
            holder["text"] = str(kwargs.get("summary", "") or "")
        except Exception:
            holder["text"] = ""

    try:
        agent.event_bus.subscribe(after_summary_event, _capture_summary)
    except Exception:
        pass
    return holder
1084
-
1085
-
1086
def _build_validation_error_guidance(
    parse_error_analysis: Optional[str],
    prev_parsed_items: Optional[List],
) -> str:
    """Build the feedback text appended to a retry prompt.

    Args:
        parse_error_analysis: YAML parse error of the previous attempt, if any.
        prev_parsed_items: Items parsed from the previous attempt, or ``None``
            when nothing could be parsed.

    Returns:
        A message describing the parse error, the missing array, or the first
        field-level problem found; an empty string when the items are valid
        or no specific problem could be identified.
    """
    header = "\n\n**格式错误详情(请根据以下错误修复输出格式):**\n"

    if parse_error_analysis:
        return f"{header}- YAML解析失败: {parse_error_analysis}\n\n请确保输出的YAML格式正确,包括正确的缩进、引号、冒号等。"
    if prev_parsed_items is None:
        return header + "- 无法从摘要中解析出有效的 YAML 数组"
    if _valid_items(prev_parsed_items):
        return ""

    def _first_problem(idx, it) -> Optional[str]:
        """Return the first problem found in one item, or ``None``."""
        if not isinstance(it, dict):
            return f"元素{idx}不是字典"
        has_gid = "gid" in it
        has_gids = "gids" in it
        if not has_gid and not has_gids:
            return f"元素{idx}缺少必填字段 gid 或 gids"
        if has_gid and has_gids:
            return f"元素{idx}不能同时包含 gid 和 gids"
        if has_gid:
            try:
                if int(it.get("gid", 0)) < 1:
                    return f"元素{idx}的 gid 必须 >= 1"
            except Exception:
                return f"元素{idx}的 gid 格式错误(必须是整数)"
        else:
            if not isinstance(it.get("gids"), list) or len(it.get("gids", [])) == 0:
                return f"元素{idx}的 gids 必须是非空数组"
            try:
                for gid_idx, gid_val in enumerate(it.get("gids", [])):
                    if int(gid_val) < 1:
                        return f"元素{idx}的 gids[{gid_idx}] 必须 >= 1"
            except Exception:
                return f"元素{idx}的 gids 格式错误(必须是整数数组)"
        if "has_risk" not in it or not isinstance(it.get("has_risk"), bool):
            return f"元素{idx}缺少必填字段 has_risk(必须是布尔值)"
        if it.get("has_risk"):
            for key in ["preconditions", "trigger_path", "consequences", "suggestions"]:
                if key not in it:
                    return f"元素{idx}的 has_risk 为 true,但缺少必填字段 {key}"
                if not isinstance(it[key], str) or not it[key].strip():
                    return f"元素{idx}的 {key} 字段不能为空"
        return None

    # Only the first problem is reported; scanning stops as soon as one is found.
    problem: Optional[str] = None
    if not isinstance(prev_parsed_items, list):
        problem = "结果不是数组"
    else:
        for position, element in enumerate(prev_parsed_items):
            problem = _first_problem(position, element)
            if problem:
                break

    if problem:
        return header + f"- {problem}"
    return ""
1150
-
1151
-
1152
def _run_analysis_agent_with_retry(
    agent: Agent,
    per_task: str,
    summary_container: Dict[str, str],
    entry_path: str,
    task_id: str,
    bidx: int,
    meta_records: List[Dict],
) -> tuple[Optional[List[Dict]], Optional[Dict]]:
    """Run the analysis agent, retrying until its summary has a valid format.

    The first attempt uses ``agent.run``. After a failed attempt, retries call
    the model directly with the task, error guidance and the summary prompt.
    The loop has no attempt limit. An empty array is accepted as "no issues".

    Args:
        agent: Agent exposing ``run`` and ``model.chat_until_success``.
        per_task: Task text for the agent.
        summary_container: Dict whose ``"text"`` entry receives the summary;
            reset at the start of each attempt.
        entry_path: Path handed to ``_git_restore_if_dirty`` after each run.
        task_id: Batch identifier stored in the meta record.
        bidx: Batch index stored in the meta record and shown in logs.
        meta_records: Output list; receives one workspace-restore record per
            attempt whose restore step did not raise.

    Returns:
        ``(summary_items, workspace_restore_info)`` for the last attempt.
    """
    summary_items: Optional[List[Dict]] = None
    workspace_restore_info: Optional[Dict] = None
    use_direct_model_analysis = False
    prev_parsed_items: Optional[List] = None
    parse_error_analysis: Optional[str] = None
    attempt = 0

    while True:
        attempt += 1
        summary_container["text"] = ""

        if use_direct_model_analysis:
            summary_prompt_text = _build_summary_prompt()
            error_guidance = _build_validation_error_guidance(parse_error_analysis, prev_parsed_items)
            full_prompt = f"{per_task}{error_guidance}\n\n{summary_prompt_text}"
            try:
                response = agent.model.chat_until_success(full_prompt)  # type: ignore
                summary_container["text"] = response
            except Exception as e:
                try:
                    typer.secho(f"[jarvis-sec] 直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
                except Exception:
                    pass
                agent.run(per_task)
        else:
            agent.run(per_task)

        # Workspace protection: best-effort restore of files the agent changed.
        try:
            _changed = _git_restore_if_dirty(entry_path)
            workspace_restore_info = {
                "performed": bool(_changed),
                "changed_files_count": int(_changed or 0),
                "action": "git checkout -- .",
            }
            meta_records.append({
                "task_id": task_id,
                "batch_index": bidx,
                "workspace_restore": workspace_restore_info,
                # attempt is already 1-based (incremented at loop start), so
                # it is recorded as-is to match the numbers used in the logs.
                "attempt": attempt,
            })
            if _changed:
                try:
                    typer.secho(f"[jarvis-sec] 工作区已恢复 ({_changed} 个文件),操作: git checkout -- .", fg=typer.colors.BLUE)
                except Exception:
                    pass
        except Exception:
            pass

        # Parse the <REPORT> (YAML) section of the summary.
        summary_text = summary_container.get("text", "")
        parsed_items: Optional[List] = None
        parse_error_analysis = None
        if summary_text:
            rep, parse_error_analysis = _try_parse_summary_report(summary_text)
            if parse_error_analysis:
                try:
                    typer.secho(f"[jarvis-sec] 分析结果YAML解析失败: {parse_error_analysis}", fg=typer.colors.YELLOW)
                except Exception:
                    pass
            elif isinstance(rep, list):
                parsed_items = rep
            elif isinstance(rep, dict):
                items = rep.get("issues")
                if isinstance(items, list):
                    parsed_items = items

        # Field validation. An empty array is valid: it means no issue found.
        if parsed_items is not None:
            if len(parsed_items) == 0 or _valid_items(parsed_items):
                summary_items = parsed_items
                break

        # Format check failed: later attempts call the model directly.
        use_direct_model_analysis = True
        prev_parsed_items = parsed_items
        if parse_error_analysis:
            try:
                typer.secho(f"[jarvis-sec] 分析结果YAML解析失败 -> 重试第 {attempt} 次 (批次={bidx},使用直接模型调用,将反馈解析错误)", fg=typer.colors.YELLOW)
            except Exception:
                pass
        else:
            try:
                typer.secho(f"[jarvis-sec] 分析结果格式无效 -> 重试第 {attempt} 次 (批次={bidx},使用直接模型调用)", fg=typer.colors.YELLOW)
            except Exception:
                pass

    return summary_items, workspace_restore_info
1256
-
1257
-
1258
def _expand_and_filter_analysis_results(summary_items: List[Dict]) -> tuple[List[Dict], List[Dict]]:
    """Expand ``gids`` items into per-gid items and split them by risk.

    An item with a list-valued ``gids`` is copied once per gid that converts
    to an int >= 1, with ``gids`` replaced by ``gid``. An item with ``gid`` is
    passed through unchanged. Items with neither key are dropped.

    Args:
        summary_items: Parsed analysis items.

    Returns:
        ``(items_with_risk, items_without_risk)``, where "with risk" means
        ``has_risk`` is exactly ``True``.
    """
    items_with_risk: List[Dict] = []
    items_without_risk: List[Dict] = []

    for it in summary_items:
        target = items_with_risk if it.get("has_risk") is True else items_without_risk
        if "gids" in it and isinstance(it.get("gids"), list):
            shared_fields = {k: v for k, v in it.items() if k != "gids"}
            for gid_val in it.get("gids", []):
                try:
                    gid_int = int(gid_val)
                except Exception:
                    # Unconvertible gid: skip this one, keep the rest.
                    continue
                if gid_int >= 1:
                    target.append({**shared_fields, "gid": gid_int})
        elif "gid" in it:
            target.append(it)

    return items_with_risk, items_without_risk
1290
-
1291
-
1292
def _build_gid_to_verification_mapping(verification_results: List[Dict]) -> Dict[int, Dict]:
    """Index verification results by gid, accepting ``gid`` and ``gids`` items.

    Non-dict entries are ignored. Malformed gids, a single ``gid`` below 1 and
    entries with neither key produce a console warning (best-effort) and are
    skipped. When a gid appears more than once the later entry wins.

    Args:
        verification_results: Parsed verification items.

    Returns:
        Mapping ``gid -> {"is_valid": ..., "verification_notes": str}``.
    """

    def _warn(message: str) -> None:
        # Console output must never break result processing.
        try:
            typer.secho(message, fg=typer.colors.YELLOW)
        except Exception:
            pass

    mapping: Dict[int, Dict] = {}
    for entry in verification_results:
        if not isinstance(entry, dict):
            continue

        accepted: List[int] = []
        if "gids" in entry and isinstance(entry.get("gids"), list):
            for raw in entry.get("gids", []):
                try:
                    number = int(raw)
                except Exception as e:
                    _warn(f"[jarvis-sec] 警告:验证结果中 gids 数组元素格式错误: {raw}, 错误: {e}")
                    continue
                if number >= 1:
                    accepted.append(number)
        elif "gid" in entry:
            raw = entry.get("gid", 0)
            try:
                number = int(raw)
            except Exception as e:
                _warn(f"[jarvis-sec] 警告:验证结果中 gid 格式错误: {entry.get('gid')}, 错误: {e}")
            else:
                if number >= 1:
                    accepted.append(number)
                else:
                    _warn(f"[jarvis-sec] 警告:验证结果中 gid 值无效: {raw} (必须 >= 1)")
        else:
            _warn(f"[jarvis-sec] 警告:验证结果项缺少 gid 或 gids 字段: {entry}")

        verdict = entry.get("is_valid")
        notes = str(entry.get("verification_notes", "")).strip()
        for number in accepted:
            mapping[number] = {
                "is_valid": verdict,
                "verification_notes": notes,
            }
    return mapping
1340
-
1341
-
1342
def _merge_verified_items(
    items_with_risk: List[Dict],
    batch: List[Dict],
    gid_to_verification: Dict[int, Dict],
) -> List[Dict]:
    """Keep the risky items that verification confirmed, merged with their candidate.

    Args:
        items_with_risk: Analysis items flagged as risky (each with ``gid``).
        batch: Original candidates of the batch; matched to items by gid.
        gid_to_verification: Verification verdicts keyed by int gid.

    Returns:
        One dict per item whose verdict has ``is_valid`` exactly ``True``:
        candidate fields, overridden by analysis fields, plus the stripped
        ``verification_notes``. Rejected or unverified items are only logged.
    """
    # Index the original candidates; entries with an unusable gid are skipped.
    candidates_by_gid: Dict[int, Dict] = {}
    for candidate in batch:
        try:
            key = int(candidate.get("gid", 0))
        except Exception:
            continue
        if key >= 1:
            candidates_by_gid[key] = candidate

    confirmed: List[Dict] = []
    for analysis in items_with_risk:
        gid = int(analysis.get("gid", 0))
        verdict = gid_to_verification.get(gid)

        if verdict and verdict.get("is_valid") is True:
            merged = dict(candidates_by_gid.get(gid, {}))
            merged.update(analysis)
            merged["verification_notes"] = str(verdict.get("verification_notes", "")).strip()
            confirmed.append(merged)
            continue

        # Logging is best-effort in both remaining cases.
        if verdict and verdict.get("is_valid") is False:
            try:
                typer.secho(f"[jarvis-sec] 验证 Agent 判定 gid={gid} 为误报: {verdict.get('verification_notes', '')}", fg=typer.colors.BLUE)
            except Exception:
                pass
        else:
            try:
                typer.secho(f"[jarvis-sec] 警告:验证结果中未找到 gid={gid},视为验证不通过", fg=typer.colors.YELLOW)
            except Exception:
                pass
    return confirmed
1381
-
1382
-
1383
- def _process_verification_batch(
1384
- batch: List[Dict],
1385
- bidx: int,
1386
- total_batches: int,
1387
- entry_path: str,
1388
- langs: List[str],
1389
- llm_group: Optional[str],
1390
- status_mgr,
1391
- _progress_append,
1392
- _append_report,
1393
- meta_records: List[Dict],
1394
- gid_counts: Dict[int, int],
1395
- sec_dir,
1396
- ) -> None:
1397
- """
1398
- 处理单个验证批次。
1399
-
1400
- 参数:
1401
- - batch: 当前批次的候选列表
1402
- - bidx: 批次索引
1403
- - total_batches: 总批次数
1404
- - 其他参数用于状态管理和结果收集
1405
- """
1406
- task_id = f"JARVIS-SEC-Batch-{bidx}"
1407
- batch_file = batch[0].get("file") if batch else None
1408
-
1409
- # 进度:批次开始
1410
- _progress_append(
1411
- {
1412
- "event": "batch_status",
1413
- "status": "running",
1414
- "batch_id": task_id,
1415
- "batch_index": bidx,
1416
- "total_batches": total_batches,
1417
- "batch_size": len(batch),
1418
- "file": batch_file,
1419
- }
1420
- )
1421
- # 更新验证阶段进度
1422
- status_mgr.update_verification(
1423
- current_batch=bidx,
1424
- total_batches=total_batches,
1425
- batch_id=task_id,
1426
- file_name=batch_file,
1427
- message=f"正在验证批次 {bidx}/{total_batches}"
1428
- )
1429
-
1430
- # 显示进度
1431
- try:
1432
- typer.secho(f"\n[jarvis-sec] 分析批次 {bidx}/{total_batches}: 大小={len(batch)} 文件='{batch_file}'", fg=typer.colors.CYAN)
1433
- except Exception:
1434
- pass
1435
-
1436
- # 创建分析Agent
1437
- agent = _create_analysis_agent(task_id, llm_group)
1438
-
1439
- # 构建任务上下文
1440
- per_task = _build_analysis_task_context(batch, entry_path, langs)
1441
-
1442
- # 订阅摘要事件
1443
- summary_container = _subscribe_summary_event(agent)
1444
-
1445
- # 运行分析Agent并重试
1446
- summary_items, workspace_restore_info = _run_analysis_agent_with_retry(
1447
- agent, per_task, summary_container, entry_path, task_id, bidx, meta_records
1448
- )
1449
-
1450
- # 处理分析结果
1451
- parse_fail = summary_items is None
1452
- verified_items: List[Dict] = []
1453
-
1454
- if summary_items:
1455
- # 展开并过滤分析结果
1456
- items_with_risk, items_without_risk = _expand_and_filter_analysis_results(summary_items)
1457
-
1458
- # 记录无风险项目的日志
1459
- if items_without_risk:
1460
- try:
1461
- typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 分析 Agent 判定 {len(items_without_risk)} 个候选为无风险(has_risk: false),跳过验证", fg=typer.colors.BLUE)
1462
- except Exception:
1463
- pass
1464
-
1465
- # 运行验证Agent(仅当分析Agent发现有风险的问题时)
1466
- if items_with_risk:
1467
- # 创建验证 Agent 来验证分析 Agent 的结论
1468
- verification_system_prompt = """
1469
- # 验证 Agent 约束
1470
- - 你的核心任务是验证分析 Agent 给出的安全结论是否正确。
1471
- - 你需要仔细检查分析 Agent 给出的前置条件、触发路径、后果和建议是否合理、准确。
1472
- - 工具优先:使用 read_code 读取目标文件附近源码(行号前后各 ~50 行),必要时用 execute_script 辅助检索。
1473
- - 必要时需向上追溯调用者,查看完整的调用路径,以确认分析 Agent 的结论是否成立。
1474
- - 禁止修改任何文件或执行写操作命令;仅进行只读分析与读取。
1475
- - 每次仅执行一个操作;等待工具结果后再进行下一步。
1476
- - **记忆使用**:
1477
- - 在验证过程中,充分利用 retrieve_memory 工具检索已有的记忆,特别是分析 Agent 保存的与当前验证函数相关的记忆。
1478
- - 这些记忆可能包含函数的分析要点、指针判空情况、输入校验情况、调用路径分析结果等,可以帮助你更准确地验证分析结论。
1479
- - 如果发现分析 Agent 的结论与记忆中的信息不一致,需要仔细核实。
1480
- - 完成验证后,主输出仅打印结束符 <!!!COMPLETE!!!> ,不需要汇总结果。
1481
- """.strip()
1482
-
1483
- verification_task_id = f"JARVIS-SEC-Verify-Batch-{bidx}"
1484
- verification_agent_kwargs: Dict = dict(
1485
- system_prompt=verification_system_prompt,
1486
- name=verification_task_id,
1487
- auto_complete=True,
1488
- need_summary=True,
1489
- summary_prompt=_build_verification_summary_prompt(),
1490
- non_interactive=True,
1491
- in_multi_agent=False,
1492
- use_methodology=False,
1493
- use_analysis=False,
1494
- plan=False,
1495
- output_handler=[ToolRegistry()],
1496
- disable_file_edit=True,
1497
- use_tools=["read_code", "execute_script", "retrieve_memory"],
1498
- )
1499
- if llm_group:
1500
- verification_agent_kwargs["model_group"] = llm_group
1501
- verification_agent = Agent(**verification_agent_kwargs)
1502
-
1503
- # 构造验证任务上下文
1504
- import json as _json3
1505
- verification_task = f"""
1506
- # 验证分析结论任务
1507
- 上下文参数:
1508
- - entry_path: {entry_path}
1509
- - languages: {langs}
1510
-
1511
- 分析 Agent 给出的结论(需要验证,仅包含 has_risk: true 的项目):
1512
- {_json3.dumps(items_with_risk, ensure_ascii=False, indent=2)}
1513
-
1514
- 请验证上述分析结论是否正确,包括:
1515
- 1. 前置条件(preconditions)是否合理
1516
- 2. 触发路径(trigger_path)是否成立
1517
- 3. 后果(consequences)评估是否准确
1518
- 4. 建议(suggestions)是否合适
1519
-
1520
- 对于每个 gid,请判断分析结论是否正确(is_valid: true/false),并给出验证说明。
1521
- """.strip()
1522
-
1523
- # 订阅验证 Agent 的摘要
1524
- verification_summary_container = _subscribe_summary_event(verification_agent)
1525
-
1526
- verification_results, verification_parse_error = _run_verification_agent_with_retry(
1527
- verification_agent,
1528
- verification_task,
1529
- _build_verification_summary_prompt(),
1530
- entry_path,
1531
- verification_summary_container,
1532
- bidx,
1533
- )
1534
-
1535
- # 调试日志:显示验证结果
1536
- if verification_results is None:
1537
- try:
1538
- typer.secho(f"[jarvis-sec] 警告:验证 Agent 返回 None,可能解析失败", fg=typer.colors.YELLOW)
1539
- except Exception:
1540
- pass
1541
- elif not isinstance(verification_results, list):
1542
- try:
1543
- typer.secho(f"[jarvis-sec] 警告:验证 Agent 返回类型错误,期望 list,实际: {type(verification_results)}", fg=typer.colors.YELLOW)
1544
- except Exception:
1545
- pass
1546
- elif len(verification_results) == 0:
1547
- try:
1548
- typer.secho(f"[jarvis-sec] 警告:验证 Agent 返回空列表", fg=typer.colors.YELLOW)
1549
- except Exception:
1550
- pass
1551
- else:
1552
- try:
1553
- typer.secho(f"[jarvis-sec] 验证 Agent 返回 {len(verification_results)} 个结果项", fg=typer.colors.BLUE)
1554
- except Exception:
1555
- pass
1556
-
1557
- # 根据验证结果筛选:只保留验证通过(is_valid: true)的告警
1558
- if verification_results:
1559
- gid_to_verification = _build_gid_to_verification_mapping(verification_results)
1560
-
1561
- # 调试日志:显示提取到的验证结果
1562
- if gid_to_verification:
1563
- try:
1564
- typer.secho(f"[jarvis-sec] 从验证结果中提取到 {len(gid_to_verification)} 个 gid: {sorted(gid_to_verification.keys())}", fg=typer.colors.BLUE)
1565
- except Exception:
1566
- pass
1567
- else:
1568
- try:
1569
- typer.secho(f"[jarvis-sec] 警告:验证结果解析成功,但未提取到任何有效的 gid。验证结果: {verification_results}", fg=typer.colors.YELLOW)
1570
- except Exception:
1571
- pass
1572
-
1573
- # 合并验证通过的告警
1574
- verified_items = _merge_verified_items(items_with_risk, batch, gid_to_verification)
1575
- else:
1576
- typer.secho(f"[jarvis-sec] 警告:验证 Agent 结果解析失败,不保留任何告警(保守策略)", fg=typer.colors.YELLOW)
1577
-
1578
- # 只有验证通过的告警才写入文件
1579
- if verified_items:
1580
- for item in verified_items:
1581
- gid = int(item.get("gid", 0))
1582
- if gid >= 1:
1583
- gid_counts[gid] = gid_counts.get(gid, 0) + 1
1584
- typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 验证通过: 数量={len(verified_items)}/{len(items_with_risk)} -> 写入文件", fg=typer.colors.GREEN)
1585
- _append_report(verified_items, "verified", task_id, {"batch": True, "candidates": batch})
1586
- # 从文件读取当前总数(用于状态显示)
1587
- current_count = _count_issues_from_file(sec_dir)
1588
- status_mgr.update_verification(
1589
- current_batch=bidx,
1590
- total_batches=total_batches,
1591
- issues_found=current_count,
1592
- message=f"已验证 {bidx}/{total_batches} 批次,发现 {current_count} 个问题(验证通过)"
1593
- )
1594
- else:
1595
- typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 验证后无有效告警: 分析 Agent 发现 {len(items_with_risk)} 个有风险的问题,验证后全部不通过", fg=typer.colors.BLUE)
1596
- current_count = _count_issues_from_file(sec_dir)
1597
- status_mgr.update_verification(
1598
- current_batch=bidx,
1599
- total_batches=total_batches,
1600
- issues_found=current_count,
1601
- message=f"已验证 {bidx}/{total_batches} 批次,验证后无有效告警"
1602
- )
1603
- elif parse_fail:
1604
- typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 解析失败 (摘要中无 <REPORT> 或字段无效)", fg=typer.colors.YELLOW)
1605
- else:
1606
- typer.secho(f"[jarvis-sec] 批次 {bidx}/{total_batches} 未发现问题", fg=typer.colors.BLUE)
1607
- current_count = _count_issues_from_file(sec_dir)
1608
- status_mgr.update_verification(
1609
- current_batch=bidx,
1610
- total_batches=total_batches,
1611
- issues_found=current_count,
1612
- message=f"已验证 {bidx}/{total_batches} 批次"
1613
- )
1614
-
1615
- # 为本批次所有候选写入 done 记录
1616
- for c in batch:
1617
- sig = _sig_of(c)
1618
- try:
1619
- c_gid = int(c.get("gid", 0))
1620
- except Exception:
1621
- c_gid = 0
1622
- cnt = gid_counts.get(c_gid, 0)
1623
- _progress_append({
1624
- "event": "task_status",
1625
- "status": "done",
1626
- "task_id": task_id,
1627
- "candidate_signature": sig,
1628
- "candidate": c,
1629
- "issues_count": int(cnt),
1630
- "parse_fail": parse_fail,
1631
- "workspace_restore": workspace_restore_info,
1632
- "batch_index": bidx,
1633
- })
1634
-
1635
- # 批次结束记录
1636
- _progress_append({
1637
- "event": "batch_status",
1638
- "status": "done",
1639
- "batch_id": task_id,
1640
- "batch_index": bidx,
1641
- "total_batches": total_batches,
1642
- "issues_count": len(verified_items),
1643
- "parse_fail": parse_fail,
1644
- })
1645
-
1646
-
1647
- def _valid_items(items: Optional[List]) -> bool:
1648
- """验证分析结果项的格式"""
1649
- if not isinstance(items, list):
1650
- return False
1651
- for it in items:
1652
- if not isinstance(it, dict):
1653
- return False
1654
- has_gid = "gid" in it
1655
- has_gids = "gids" in it
1656
- if not has_gid and not has_gids:
1657
- return False
1658
- if has_gid and has_gids:
1659
- return False
1660
- if has_gid:
1661
- try:
1662
- if int(it["gid"]) < 1:
1663
- return False
1664
- except Exception:
1665
- return False
1666
- elif has_gids:
1667
- if not isinstance(it["gids"], list) or len(it["gids"]) == 0:
1668
- return False
1669
- for gid_val in it["gids"]:
1670
- try:
1671
- if int(gid_val) < 1:
1672
- return False
1673
- except Exception:
1674
- return False
1675
- if "has_risk" not in it or not isinstance(it["has_risk"], bool):
1676
- return False
1677
- if it.get("has_risk"):
1678
- for key in ["preconditions", "trigger_path", "consequences", "suggestions"]:
1679
- if key not in it:
1680
- return False
1681
- if not isinstance(it[key], str) or not it[key].strip():
1682
- return False
1683
- return True
1684
-
1685
-
1686
- def _is_valid_verification_item(item: Dict) -> bool:
1687
- """验证验证结果项的格式"""
1688
- if not isinstance(item, dict) or "is_valid" not in item:
1689
- return False
1690
- has_gid = "gid" in item
1691
- has_gids = "gids" in item
1692
- if not has_gid and not has_gids:
1693
- return False
1694
- if has_gid and has_gids:
1695
- return False # gid 和 gids 不能同时出现
1696
- if has_gid:
1697
- try:
1698
- return int(item["gid"]) >= 1
1699
- except Exception:
1700
- return False
1701
- elif has_gids:
1702
- if not isinstance(item["gids"], list) or len(item["gids"]) == 0:
1703
- return False
1704
- try:
1705
- return all(int(gid_val) >= 1 for gid_val in item["gids"])
1706
- except Exception:
1707
- return False
1708
- return False
1709
-
1710
-
1711
def _run_verification_agent_with_retry(
    verification_agent,
    verification_task: str,
    verification_summary_prompt: str,
    entry_path: str,
    verification_summary_container: Dict[str, str],
    bidx: int,
) -> tuple[Optional[List[Dict]], Optional[str]]:
    """Run the verification agent, retrying until its summary is well-formed.

    The first attempt calls ``verification_agent.run``.  Once an attempt fails
    to produce a valid summary, later attempts call the agent's model directly
    with the task, the previous parse error (if any) and the summary prompt.

    Args:
        verification_agent: Agent exposing ``run(task)`` and
            ``model.chat_until_success(prompt)``.
        verification_task: Task text handed to the agent / model.
        verification_summary_prompt: NOTE(review): not referenced in the body;
            the prompt is rebuilt via ``_build_verification_summary_prompt()``.
        entry_path: Passed to ``_git_restore_if_dirty`` after every attempt.
        verification_summary_container: Dict whose ``"text"`` entry holds the
            summary; it is reset to ``""`` before every attempt.
            # presumably filled by a summary-event subscriber — confirm at the caller
        bidx: Batch index, used only in log messages.

    Returns:
        ``(items, None)`` as soon as the parsed summary is a non-empty list
        whose entries all pass ``_is_valid_verification_item``.  There is no
        other exit from the loop: it has no attempt limit, and the second
        tuple element is always ``None``.
    """
    use_direct_model_verify = False
    prev_parse_error_verify: Optional[str] = None
    verify_attempt = 0

    while True:
        verify_attempt += 1
        # Clear the previous attempt's summary so stale text is never re-parsed.
        verification_summary_container["text"] = ""

        if use_direct_model_verify:
            verification_summary_prompt_text = _build_verification_summary_prompt()
            error_guidance = ""
            if prev_parse_error_verify:
                error_guidance = f"\n\n**格式错误详情(请根据以下错误修复输出格式):**\n- YAML解析失败: {prev_parse_error_verify}\n\n请确保输出的YAML格式正确,包括正确的缩进、引号、冒号等。"

            full_verify_prompt = f"{verification_task}{error_guidance}\n\n{verification_summary_prompt_text}"
            try:
                verify_response = verification_agent.model.chat_until_success(full_verify_prompt)  # type: ignore
                verification_summary_container["text"] = verify_response
            except Exception as e:
                try:
                    typer.secho(f"[jarvis-sec] 验证阶段直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
                except Exception:
                    pass
                # Direct model call failed: fall back to a full agent run.
                verification_agent.run(verification_task)
        else:
            verification_agent.run(verification_task)

        # Workspace protection: restore the working tree if the agent changed it.
        try:
            _changed_verify = _git_restore_if_dirty(entry_path)
            if _changed_verify:
                try:
                    typer.secho(f"[jarvis-sec] 验证 Agent 工作区已恢复 ({_changed_verify} 个文件)", fg=typer.colors.BLUE)
                except Exception:
                    pass
        except Exception:
            pass

        # Parse the verification summary.
        verification_summary_text = verification_summary_container.get("text", "")
        parse_error_verify = None
        if verification_summary_text:
            verification_parsed, parse_error_verify = _try_parse_summary_report(verification_summary_text)
            if parse_error_verify:
                # Remember the error so the next direct-model prompt can quote it.
                prev_parse_error_verify = parse_error_verify
                try:
                    typer.secho(f"[jarvis-sec] 验证结果YAML解析失败: {parse_error_verify}", fg=typer.colors.YELLOW)
                except Exception:
                    pass
            else:
                prev_parse_error_verify = None
                if isinstance(verification_parsed, list):
                    if verification_parsed and all(_is_valid_verification_item(item) for item in verification_parsed):
                        return verification_parsed, None

        # Validation failed: every later retry uses the direct model call.
        use_direct_model_verify = True
        if parse_error_verify:
            try:
                typer.secho(f"[jarvis-sec] 验证结果YAML解析失败 -> 重试第 {verify_attempt} 次 (批次={bidx},使用直接模型调用,将反馈解析错误)", fg=typer.colors.YELLOW)
            except Exception:
                pass
        else:
            try:
                typer.secho(f"[jarvis-sec] 验证结果格式无效 -> 重试第 {verify_attempt} 次 (批次={bidx},使用直接模型调用)", fg=typer.colors.YELLOW)
            except Exception:
                pass
1787
-
1788
-
1789
def run_security_analysis(
    entry_path: str,
    languages: Optional[List[str]] = None,
    llm_group: Optional[str] = None,
    report_file: Optional[str] = None,
    cluster_limit: int = 50,
    exclude_dirs: Optional[List[str]] = None,
) -> str:
    """Run the security analysis workflow (hybrid mode).

    The workflow first runs the heuristic scan to produce candidate issues,
    then clusters the candidates, runs the verification phase on the resulting
    batches, and finally aggregates the verified issues into the final report.

    Args:
        entry_path: Root directory to analyze.
        languages: File extensions to scan; defaults to
            ``["c", "cpp", "h", "hpp", "rs"]`` when empty or ``None``.
        llm_group: Model group name forwarded to the clustering and
            verification phases.
        report_file: Path of the incremental JSONL report, forwarded to
            ``_create_report_writer``.
        cluster_limit: Forwarded to the clustering phase (default 50).
            # presumably the max number of alerts clustered per batch — confirm
        exclude_dirs: Directories to exclude, forwarded to the heuristic scan.

    Returns:
        The value returned by ``build_json_and_markdown``.
        # presumably a combined JSON + Markdown report string — confirm

    Raises:
        Any exception raised while building the final report is re-raised
        after ``status_mgr.mark_error`` has been called.
        NOTE(review): only the report-building step is wrapped this way;
        exceptions from earlier phases propagate without ``mark_error`` being
        called in this function.
    """
    import json

    langs = languages or ["c", "cpp", "h", "hpp", "rs"]

    # Status manager (structured progress status file).
    from jarvis.jarvis_sec.status import StatusManager
    status_mgr = StatusManager(entry_path)

    # Best effort: display the state recorded by a previous run, if any.
    try:
        current_status = status_mgr.get_status()
        if current_status:
            stage = current_status.get("stage", "unknown")
            progress = current_status.get("progress", 0)
            message = current_status.get("message", "")
            typer.secho(f"[jarvis-sec] 从状态文件恢复: 阶段={stage}, 进度={progress}%, {message}", fg=typer.colors.BLUE)
    except Exception:
        pass

    # Initialize the analysis context.
    # NOTE(review): done_sigs is not used again in this function.
    sec_dir, progress_path, _progress_append, done_sigs = _initialize_analysis_context(
        entry_path, status_mgr
    )

    # 1) Heuristic scan.
    candidates, summary = _load_or_run_heuristic_scan(
        entry_path, langs, exclude_dirs, sec_dir, status_mgr, _progress_append
    )

    # 2) Reduce the candidates to a compact sub-task list.
    compact_candidates = _prepare_candidates(candidates)

    # Record which file holds the most candidates (informational only; any
    # failure here is ignored).
    try:
        groups = _group_candidates_by_file(compact_candidates)
        if groups:
            selected_file, items = max(groups.items(), key=lambda kv: len(kv[1]))
            try:
                typer.secho(f"[jarvis-sec] 批次选择: 文件={selected_file} 数量={len(items)}", fg=typer.colors.BLUE)
            except Exception:
                pass
            _progress_append({
                "event": "batch_selection",
                "selected_file": selected_file,
                "selected_count": len(items),
                "total_in_file": len(items),
            })
    except Exception:
        pass

    # Create the report writer callback.
    _append_report = _create_report_writer(sec_dir, report_file)

    # 3) Clustering phase.
    # NOTE(review): invalid_clusters_for_review is not used again in this function.
    cluster_batches, invalid_clusters_for_review = _process_clustering_phase(
        compact_candidates,
        entry_path,
        langs,
        cluster_limit,
        llm_group,
        sec_dir,
        progress_path,
        status_mgr,
        _progress_append,
    )

    # 4) Verification phase.
    # NOTE(review): meta_records is only read below (it stays empty here) and
    # gid_counts is never read in this function.
    meta_records: List[Dict] = []
    gid_counts: Dict[int, int] = {}
    all_issues = _process_verification_phase(
        cluster_batches,
        entry_path,
        langs,
        llm_group,
        sec_dir,
        progress_path,
        status_mgr,
        _progress_append,
        _append_report,
    )

    # 5) Build the final report with the shared aggregator.
    try:
        from jarvis.jarvis_sec.report import build_json_and_markdown
        result = build_json_and_markdown(
            all_issues,
            scanned_root=summary.get("scanned_root"),
            scanned_files=summary.get("scanned_files"),
            meta=meta_records or None,
        )
        # Mark the analysis as completed.
        status_mgr.mark_completed(
            total_issues=len(all_issues),
            message=f"安全分析完成,共发现 {len(all_issues)} 个问题"
        )
        return result
    except Exception as e:
        # Record the failure in the status file, then propagate it.
        error_msg = str(e)
        status_mgr.mark_error(
            error_message=error_msg,
            error_type=type(e).__name__
        )
        raise
1925
-
1926
-
1927
- def _group_candidates_by_file(candidates: List[Dict]) -> Dict[str, List[Dict]]:
1928
- """按文件分组候选问题"""
1929
- from collections import defaultdict
1930
- groups: Dict[str, List[Dict]] = defaultdict(list)
1931
- for it in candidates:
1932
- groups[str(it.get("file") or "")].append(it)
1933
- return groups
1934
-
1935
-
1936
- def _create_report_writer(sec_dir, report_file):
1937
- """创建报告写入函数"""
1938
- import json
1939
- from pathlib import Path
1940
-
1941
- def _append_report(items, source: str, task_id: str, cand: Dict):
1942
- """将当前子任务的检测结果追加写入 JSONL 报告文件(每行一个 issue)。仅当 items 非空时写入。"""
1943
- if not items:
1944
- return
1945
- try:
1946
- path = Path(report_file) if report_file else sec_dir / "agent_issues.jsonl"
1947
- path.parent.mkdir(parents=True, exist_ok=True)
1948
- with path.open("a", encoding="utf-8") as f:
1949
- for item in items:
1950
- line = json.dumps(item, ensure_ascii=False)
1951
- f.write(line + "\n")
1952
- try:
1953
- typer.secho(f"[jarvis-sec] 已将 {len(items)} 个问题写入 {path}", fg=typer.colors.GREEN)
1954
- except Exception:
1955
- pass
1956
- except Exception:
1957
- # 报告写入失败不影响主流程
1958
- pass
1959
-
1960
- return _append_report
1961
-
1962
-
1963
- def _sig_of(c: Dict) -> str:
1964
- """生成候选问题的签名"""
1965
- return f"{c.get('language','')}|{c.get('file','')}|{c.get('line','')}|{c.get('pattern','')}"
1966
-
1967
-
1968
def _create_signature_function():
    """Return the candidate-signature callable.

    Deprecated: this accessor only hands back ``_sig_of``; call ``_sig_of``
    directly instead.
    """
    signature_fn = _sig_of
    return signature_fn
1971
-
1972
-
1973
- def _parse_clusters_from_text(text: str) -> tuple[Optional[List], Optional[str]]:
1974
- """解析聚类文本,返回(解析结果, 错误信息)"""
1975
- try:
1976
- start = text.find("<CLUSTERS>")
1977
- end = text.find("</CLUSTERS>")
1978
- if start == -1 or end == -1 or end <= start:
1979
- return None, "未找到 <CLUSTERS> 或 </CLUSTERS> 标签,或标签顺序错误"
1980
- content = text[start + len("<CLUSTERS>"):end].strip()
1981
- if not content:
1982
- return None, "YAML 内容为空"
1983
- import yaml as _yaml3 # type: ignore
1984
- try:
1985
- data = _yaml3.safe_load(content)
1986
- except Exception as yaml_err:
1987
- error_msg = f"YAML 解析失败: {str(yaml_err)}"
1988
- return None, error_msg
1989
- if isinstance(data, list):
1990
- return data, None
1991
- return None, f"YAML 解析结果不是数组,而是 {type(data).__name__}"
1992
- except Exception as e:
1993
- return None, f"解析过程发生异常: {str(e)}"
1994
-
1995
-
1996
- def _create_cluster_snapshot_writer(sec_dir, cluster_records, compact_candidates, _progress_append):
1997
- """创建聚类快照写入函数"""
1998
- def _write_cluster_batch_snapshot(batch_records: List[Dict]):
1999
- """写入单个批次的聚类结果,支持增量保存"""
2000
- try:
2001
- from pathlib import Path as _Path2
2002
- import json as _json
2003
- _cluster_path = sec_dir / "cluster_report.jsonl"
2004
- _cluster_path.parent.mkdir(parents=True, exist_ok=True)
2005
-
2006
- # 追加模式,每次只追加当前批次的记录
2007
- with _cluster_path.open("a", encoding="utf-8") as f:
2008
- for record in batch_records:
2009
- f.write(_json.dumps(record, ensure_ascii=False) + "\n")
2010
- except Exception:
2011
- pass
2012
-
2013
- def _write_cluster_report_snapshot():
2014
- """写入聚类报告快照"""
2015
- try:
2016
- from pathlib import Path as _Path2
2017
- import json as _json
2018
- _cluster_path = sec_dir / "cluster_report.jsonl"
2019
- _cluster_path.parent.mkdir(parents=True, exist_ok=True)
2020
-
2021
- # 使用追加模式,每次只追加当前批次的记录
2022
- # 注意:这会导致重复记录,需要在读取时去重
2023
- with _cluster_path.open("a", encoding="utf-8") as f:
2024
- for record in cluster_records:
2025
- f.write(_json.dumps(record, ensure_ascii=False) + "\n")
2026
-
2027
- _progress_append(
2028
- {
2029
- "event": "cluster_report_snapshot",
2030
- "path": str(_cluster_path),
2031
- "clusters": len(cluster_records),
2032
- "total_candidates": len(compact_candidates),
2033
- }
2034
- )
2035
- except Exception:
2036
- pass
2037
-
2038
- return _write_cluster_batch_snapshot, _write_cluster_report_snapshot
2039
-
2040
-
2041
- def _collect_candidate_gids(file_groups: Dict[str, List[Dict]]) -> set:
2042
- """收集所有候选的 gid"""
2043
- all_gids = set()
2044
- for _file, _items in file_groups.items():
2045
- for it in _items:
2046
- try:
2047
- _gid = int(it.get("gid", 0))
2048
- if _gid >= 1:
2049
- all_gids.add(_gid)
2050
- except Exception:
2051
- pass
2052
- return all_gids
2053
-
2054
-
2055
- def _collect_clustered_gids(cluster_batches: List[List[Dict]], invalid_clusters_for_review: List[Dict]) -> set:
2056
- """收集所有已聚类的 gid"""
2057
- all_clustered_gids = set()
2058
- for batch in cluster_batches:
2059
- for item in batch:
2060
- try:
2061
- _gid = int(item.get("gid", 0))
2062
- if _gid >= 1:
2063
- all_clustered_gids.add(_gid)
2064
- except Exception:
2065
- pass
2066
- # 也收集无效聚类中的 gid(它们已经进入复核流程)
2067
- for invalid_cluster in invalid_clusters_for_review:
2068
- gids_list = invalid_cluster.get("gids", [])
2069
- for _gid in gids_list:
2070
- try:
2071
- _gid_int = int(_gid)
2072
- if _gid_int >= 1:
2073
- all_clustered_gids.add(_gid_int)
2074
- except Exception:
2075
- pass
2076
- return all_clustered_gids
2077
-
2078
-
2079
- def _load_processed_gids_from_agent_issues(sec_dir) -> set:
2080
- """从 agent_issues.jsonl 读取已处理的 gid"""
2081
- processed_gids = set()
2082
- try:
2083
- from pathlib import Path
2084
- import json
2085
- _agent_issues_path = sec_dir / "agent_issues.jsonl"
2086
- if _agent_issues_path.exists():
2087
- with _agent_issues_path.open("r", encoding="utf-8", errors="ignore") as f:
2088
- for line in f:
2089
- line = line.strip()
2090
- if not line:
2091
- continue
2092
- try:
2093
- issue_obj = json.loads(line)
2094
- _gid = int(issue_obj.get("gid", 0))
2095
- if _gid >= 1:
2096
- processed_gids.add(_gid)
2097
- except Exception:
2098
- pass
2099
- except Exception:
2100
- pass
2101
- return processed_gids
2102
-
2103
-
2104
- def _load_completed_batch_ids(progress_path) -> set:
2105
- """从 progress.jsonl 读取已完成的批次ID"""
2106
- completed_batch_ids = set()
2107
- try:
2108
- import json
2109
- if progress_path.exists():
2110
- with progress_path.open("r", encoding="utf-8", errors="ignore") as f:
2111
- for line in f:
2112
- line = line.strip()
2113
- if not line:
2114
- continue
2115
- try:
2116
- obj = json.loads(line)
2117
- # 检查 batch_status 事件,status 为 "done" 表示批次已完成
2118
- if obj.get("event") == "batch_status" and obj.get("status") == "done":
2119
- batch_id = obj.get("batch_id")
2120
- if batch_id:
2121
- completed_batch_ids.add(batch_id)
2122
- except Exception:
2123
- pass
2124
- except Exception:
2125
- pass
2126
- return completed_batch_ids
2127
-
2128
-
2129
- def _load_all_issues_from_file(sec_dir) -> List[Dict]:
2130
- """从 agent_issues.jsonl 读取所有已保存的告警"""
2131
- all_issues: List[Dict] = []
2132
- try:
2133
- from pathlib import Path
2134
- import json
2135
- _agent_issues_path = sec_dir / "agent_issues.jsonl"
2136
- if _agent_issues_path.exists():
2137
- saved_gids_from_file = set()
2138
- with _agent_issues_path.open("r", encoding="utf-8", errors="ignore") as f:
2139
- for line in f:
2140
- line = line.strip()
2141
- if not line:
2142
- continue
2143
- try:
2144
- item = json.loads(line)
2145
- gid = item.get("gid", 0)
2146
- if gid >= 1 and gid not in saved_gids_from_file:
2147
- # 只保留验证通过的告警(has_risk: true 且有 verification_notes)
2148
- if item.get("has_risk") is True and "verification_notes" in item:
2149
- all_issues.append(item)
2150
- saved_gids_from_file.add(gid)
2151
- except Exception:
2152
- pass
2153
-
2154
- if all_issues:
2155
- try:
2156
- typer.secho(f"[jarvis-sec] 从 agent_issues.jsonl 加载了 {len(all_issues)} 个已保存的告警", fg=typer.colors.BLUE)
2157
- except Exception:
2158
- pass
2159
- else:
2160
- try:
2161
- typer.secho(f"[jarvis-sec] agent_issues.jsonl 不存在,当前运行未发现任何问题", fg=typer.colors.BLUE)
2162
- except Exception:
2163
- pass
2164
- except Exception as e:
2165
- # 加载失败不影响主流程
2166
- try:
2167
- typer.secho(f"[jarvis-sec] 警告:从 agent_issues.jsonl 加载告警失败: {e}", fg=typer.colors.YELLOW)
2168
- except Exception:
2169
- pass
2170
- return all_issues
2171
-
2172
-
2173
- def _supplement_missing_gids_for_clustering(
2174
- missing_gids: set,
2175
- gid_to_candidate: Dict[int, Dict],
2176
- cluster_batches: List[List[Dict]],
2177
- _progress_append,
2178
- processed_gids_from_issues: set,
2179
- ) -> tuple[int, int]:
2180
- """为遗漏的 gid 补充聚类,返回(补充数量, 跳过数量)"""
2181
- supplemented_count = 0
2182
- skipped_count = 0
2183
-
2184
- for missing_gid in sorted(missing_gids):
2185
- # 如果该 gid 已经在 agent_issues.jsonl 中有结果,说明已经验证过了
2186
- # 不需要重新聚类,但记录一下
2187
- if missing_gid in processed_gids_from_issues:
2188
- skipped_count += 1
2189
- _progress_append({
2190
- "event": "cluster_missing_gid_skipped",
2191
- "gid": missing_gid,
2192
- "note": "已在agent_issues.jsonl中有验证结果,跳过重新处理",
2193
- "reason": "already_processed",
2194
- })
2195
- continue
2196
-
2197
- # 找到对应的候选
2198
- missing_item = gid_to_candidate.get(missing_gid)
2199
- if missing_item:
2200
- # 为遗漏的 gid 创建默认验证条件
2201
- default_verification = f"验证候选 {missing_gid} 的安全风险"
2202
- missing_item["verify"] = default_verification
2203
- cluster_batches.append([missing_item])
2204
- supplemented_count += 1
2205
- _progress_append({
2206
- "event": "cluster_missing_gid_supplement",
2207
- "gid": missing_gid,
2208
- "file": missing_item.get("file"),
2209
- "note": "分析阶段开始前补充的遗漏gid",
2210
- })
2211
-
2212
- return supplemented_count, skipped_count
2213
-
2214
-
2215
def _handle_single_alert_file(
    file: str,
    single_item: Dict,
    single_gid: int,
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
    _progress_append,
    _write_cluster_batch_snapshot,
) -> None:
    """Register a file that has a single alert without running clustering.

    The alert gets a default ``verify`` text and becomes its own batch; a
    cluster record and a ``cluster_status`` progress event are emitted, and
    every record of this file with ``batch_index == 1`` is handed to the
    snapshot writer.
    """
    verification_note = f"验证候选 {single_gid} 的安全风险"
    single_item["verify"] = verification_note
    cluster_batches.append([single_item])

    record = {
        "file": file,
        "verification": verification_note,
        "gids": [single_gid],
        "count": 1,
        "batch_index": 1,
        "note": "单告警跳过聚类",
    }
    cluster_records.append(record)

    status_event = {
        "event": "cluster_status",
        "status": "done",
        "file": file,
        "batch_index": 1,
        "skipped": True,
        "reason": "single_alert",
    }
    _progress_append(status_event)

    # Persist all first-batch records that belong to this file.
    snapshot = []
    for existing in cluster_records:
        if existing.get("file") == file and existing.get("batch_index") == 1:
            snapshot.append(existing)
    if snapshot:
        _write_cluster_batch_snapshot(snapshot)

    typer.secho(f"[jarvis-sec] 文件 {file} 仅有一个告警(gid={single_gid}),跳过聚类直接写入", fg=typer.colors.BLUE)
2255
-
2256
-
2257
- def _validate_cluster_format(cluster_items: List[Dict]) -> tuple[bool, List[str]]:
2258
- """验证聚类结果的格式,返回(是否有效, 错误详情列表)"""
2259
- if not isinstance(cluster_items, list) or not cluster_items:
2260
- return False, ["结果不是数组或数组为空"]
2261
-
2262
- error_details = []
2263
- for idx, it in enumerate(cluster_items):
2264
- if not isinstance(it, dict):
2265
- error_details.append(f"元素{idx}不是字典")
2266
- return False, error_details
2267
-
2268
- vals = it.get("gids", [])
2269
- if not isinstance(it.get("verification", ""), str) or not isinstance(vals, list):
2270
- error_details.append(f"元素{idx}的verification或gids格式错误")
2271
- return False, error_details
2272
-
2273
- # 校验 gids 列表中的每个元素是否都是有效的整数
2274
- if isinstance(vals, list):
2275
- for gid_idx, gid_val in enumerate(vals):
2276
- try:
2277
- gid_int = int(gid_val)
2278
- if gid_int < 1:
2279
- error_details.append(f"元素{idx}的gids[{gid_idx}]不是有效的正整数(值为{gid_val})")
2280
- return False, error_details
2281
- except (ValueError, TypeError):
2282
- error_details.append(f"元素{idx}的gids[{gid_idx}]不是有效的整数(值为{gid_val},类型为{type(gid_val).__name__})")
2283
- return False, error_details
2284
-
2285
- # 校验 is_invalid 字段(必填)
2286
- if "is_invalid" not in it:
2287
- error_details.append(f"元素{idx}缺少is_invalid字段(必填)")
2288
- return False, error_details
2289
-
2290
- is_invalid_val = it.get("is_invalid")
2291
- if not isinstance(is_invalid_val, bool):
2292
- error_details.append(f"元素{idx}的is_invalid不是布尔值")
2293
- return False, error_details
2294
-
2295
- # 如果is_invalid为true,必须提供invalid_reason
2296
- if is_invalid_val is True:
2297
- invalid_reason = it.get("invalid_reason", "")
2298
- if not isinstance(invalid_reason, str) or not invalid_reason.strip():
2299
- error_details.append(f"元素{idx}的is_invalid为true但缺少invalid_reason字段或理由为空(必填)")
2300
- return False, error_details
2301
-
2302
- return True, []
2303
-
2304
-
2305
- def _extract_classified_gids(cluster_items: List[Dict]) -> set:
2306
- """从聚类结果中提取所有已分类的gid
2307
-
2308
- 注意:此函数假设格式验证已经通过,所有gid都是有效的整数。
2309
- 如果遇到格式错误的gid,会记录警告但不会抛出异常(因为格式验证应该已经捕获了这些问题)。
2310
- """
2311
- classified_gids = set()
2312
- for cl in cluster_items:
2313
- raw_gids = cl.get("gids", [])
2314
- if isinstance(raw_gids, list):
2315
- for x in raw_gids:
2316
- try:
2317
- xi = int(x)
2318
- if xi >= 1:
2319
- classified_gids.add(xi)
2320
- except (ValueError, TypeError) as e:
2321
- # 理论上不应该到达这里(格式验证应该已经捕获),但如果到达了,记录警告
2322
- try:
2323
- typer.secho(f"[jarvis-sec] 警告:在提取gid时遇到格式错误(值={x},类型={type(x).__name__}),这不应该发生(格式验证应该已捕获)", fg=typer.colors.YELLOW)
2324
- except Exception:
2325
- pass
2326
- continue
2327
- return classified_gids
2328
-
2329
-
2330
- def _build_cluster_retry_task(
2331
- file: str,
2332
- missing_gids: set,
2333
- error_details: List[str],
2334
- ) -> str:
2335
- """构建聚类重试任务"""
2336
- retry_task = f"""
2337
- # 聚类任务重试
2338
- 文件: {file}
2339
-
2340
- **重要提示**:请重新输出聚类结果。
2341
- """.strip()
2342
- if missing_gids:
2343
- missing_gids_list = sorted(list(missing_gids))
2344
- missing_count = len(missing_gids)
2345
- retry_task += f"\n\n**遗漏的gid(共{missing_count}个,必须被分类):**\n" + ", ".join(str(gid) for gid in missing_gids_list)
2346
- if error_details:
2347
- retry_task += f"\n\n**格式错误:**\n" + "\n".join(f"- {detail}" for detail in error_details)
2348
- return retry_task
2349
-
2350
-
2351
- def _build_cluster_error_guidance(
2352
- error_details: List[str],
2353
- missing_gids: set,
2354
- ) -> str:
2355
- """构建聚类错误指导信息"""
2356
- error_guidance = ""
2357
- if error_details:
2358
- error_guidance = f"\n\n**格式错误详情(请根据以下错误修复输出格式):**\n" + "\n".join(f"- {detail}" for detail in error_details)
2359
- if missing_gids:
2360
- missing_gids_list = sorted(list(missing_gids))
2361
- missing_count = len(missing_gids)
2362
- error_guidance += f"\n\n**完整性错误:遗漏了 {missing_count} 个 gid,这些 gid 必须被分类:**\n" + ", ".join(str(gid) for gid in missing_gids_list)
2363
- return error_guidance
2364
-
2365
-
2366
def _run_cluster_agent_direct_model(
    cluster_agent,
    cluster_task: str,
    cluster_summary_prompt: str,
    file: str,
    missing_gids: set,
    error_details: List[str],
    _cluster_summary: Dict[str, str],
) -> None:
    """Retry clustering by prompting the agent's model directly.

    The prompt is the retry task, followed by the error guidance and the
    summary prompt.  The model's answer is stored in
    ``_cluster_summary["text"]``.  If the direct call raises, a warning is
    printed (best effort) and ``cluster_agent.run(cluster_task)`` is used
    instead.
    """
    task_text = _build_cluster_retry_task(file, missing_gids, error_details)
    guidance_text = _build_cluster_error_guidance(error_details, missing_gids)
    prompt = f"{task_text}{guidance_text}\n\n{cluster_summary_prompt}"

    try:
        answer = cluster_agent.model.chat_until_success(prompt)  # type: ignore
        _cluster_summary["text"] = answer
        return
    except Exception as e:
        try:
            typer.secho(f"[jarvis-sec] 直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
        except Exception:
            pass

    # Direct call failed: fall back to a full agent run.
    cluster_agent.run(cluster_task)
2388
-
2389
-
2390
def _validate_cluster_result(
    cluster_items: Optional[List[Dict]],
    parse_error: Optional[str],
    attempt: int,
) -> tuple[bool, List[str]]:
    """Validate one clustering attempt and log why it failed, if it did.

    A parse error short-circuits validation; otherwise the parsed items are
    checked with ``_validate_cluster_format``.

    Returns:
        ``(is_valid, error_details)``.
    """
    if parse_error:
        problems = [f"YAML解析失败: {parse_error}"]
        typer.secho(f"[jarvis-sec] YAML解析失败: {parse_error}", fg=typer.colors.YELLOW)
        return False, problems

    is_valid, problems = _validate_cluster_format(cluster_items)
    if not is_valid:
        typer.secho(f"[jarvis-sec] 聚类结果格式无效({'; '.join(problems)}),重试第 {attempt} 次(使用直接模型调用)", fg=typer.colors.YELLOW)
    return is_valid, problems
2405
-
2406
-
2407
def _check_cluster_completeness(
    cluster_items: List[Dict],
    input_gids: set,
    attempt: int,
) -> tuple[bool, set]:
    """Check that every input gid was assigned to some cluster.

    Returns:
        ``(True, set())`` when nothing is missing, otherwise
        ``(False, missing_gids)``.  The outcome is logged either way.
    """
    missing = input_gids - _extract_classified_gids(cluster_items)

    if missing:
        ordered = sorted(missing)
        typer.secho(f"[jarvis-sec] 聚类完整性校验失败:遗漏的gid: {ordered}({len(missing)}个),重试第 {attempt} 次(使用直接模型调用)", fg=typer.colors.YELLOW)
        return False, missing

    typer.secho(f"[jarvis-sec] 聚类完整性校验通过,所有gid已分类(共尝试 {attempt} 次)", fg=typer.colors.GREEN)
    return True, set()
2423
-
2424
-
2425
- def _run_cluster_agent_with_retry(
2426
- cluster_agent,
2427
- cluster_task: str,
2428
- cluster_summary_prompt: str,
2429
- input_gids: set,
2430
- file: str,
2431
- _cluster_summary: Dict[str, str],
2432
- ) -> tuple[Optional[List[Dict]], Optional[str]]:
2433
- """运行聚类Agent并永久重试直到所有gid都被分类,返回(聚类结果, 解析错误)"""
2434
- _attempt = 0
2435
- use_direct_model = False
2436
- error_details: List[str] = []
2437
- missing_gids = set()
2438
-
2439
- while True:
2440
- _attempt += 1
2441
- _cluster_summary["text"] = ""
2442
-
2443
- if use_direct_model:
2444
- _run_cluster_agent_direct_model(
2445
- cluster_agent,
2446
- cluster_task,
2447
- cluster_summary_prompt,
2448
- file,
2449
- missing_gids,
2450
- error_details,
2451
- _cluster_summary,
2452
- )
2453
- else:
2454
- # 第一次使用 run(),让 Agent 完整运行(可能使用工具)
2455
- cluster_agent.run(cluster_task)
2456
-
2457
- cluster_items, parse_error = _parse_clusters_from_text(_cluster_summary.get("text", ""))
2458
-
2459
- # 校验结构
2460
- valid, error_details = _validate_cluster_result(cluster_items, parse_error, _attempt)
2461
-
2462
- # 完整性校验:检查所有输入的gid是否都被分类
2463
- missing_gids = set()
2464
- if valid and cluster_items:
2465
- is_complete, missing_gids = _check_cluster_completeness(cluster_items, input_gids, _attempt)
2466
- if is_complete:
2467
- return cluster_items, None
2468
- else:
2469
- use_direct_model = True
2470
- valid = False
2471
-
2472
- if not valid:
2473
- use_direct_model = True
2474
- cluster_items = None
2475
-
2476
-
2477
- def _process_cluster_results(
2478
- cluster_items: List[Dict],
2479
- pending_in_file_with_ids: List[Dict],
2480
- file: str,
2481
- chunk_idx: int,
2482
- cluster_batches: List[List[Dict]],
2483
- cluster_records: List[Dict],
2484
- invalid_clusters_for_review: List[Dict],
2485
- _progress_append,
2486
- ) -> tuple[int, int]:
2487
- """处理聚类结果,返回(有效聚类数, 无效聚类数)"""
2488
- gid_to_item: Dict[int, Dict] = {}
2489
- try:
2490
- for it in pending_in_file_with_ids:
2491
- try:
2492
- _gid = int(it.get("gid", 0))
2493
- if _gid >= 1:
2494
- gid_to_item[_gid] = it
2495
- except Exception:
2496
- pass
2497
- except Exception:
2498
- gid_to_item = {}
2499
-
2500
- _merged_count = 0
2501
- _invalid_count = 0
2502
- classified_gids_final = set()
2503
-
2504
- for cl in cluster_items:
2505
- verification = str(cl.get("verification", "")).strip()
2506
- raw_gids = cl.get("gids", [])
2507
- is_invalid = cl["is_invalid"]
2508
- norm_keys: List[int] = []
2509
- if isinstance(raw_gids, list):
2510
- for x in raw_gids:
2511
- try:
2512
- xi = int(x)
2513
- if xi >= 1:
2514
- norm_keys.append(xi)
2515
- classified_gids_final.add(xi)
2516
- except Exception:
2517
- pass
2518
-
2519
- members: List[Dict] = []
2520
- for k in norm_keys:
2521
- it = gid_to_item.get(k)
2522
- if it:
2523
- it["verify"] = verification
2524
- members.append(it)
2525
-
2526
- # 如果标记为无效,收集到复核列表
2527
- if is_invalid:
2528
- _invalid_count += 1
2529
- invalid_gids = [m.get("gid") for m in members]
2530
- invalid_reason = str(cl.get("invalid_reason", "")).strip()
2531
- try:
2532
- typer.secho(f"[jarvis-sec] 聚类阶段判定为无效(gids={invalid_gids}),将提交复核Agent验证", fg=typer.colors.BLUE)
2533
- except Exception:
2534
- pass
2535
- invalid_clusters_for_review.append({
2536
- "file": file,
2537
- "batch_index": chunk_idx,
2538
- "gids": invalid_gids,
2539
- "verification": verification,
2540
- "invalid_reason": invalid_reason,
2541
- "members": members,
2542
- "count": len(members),
2543
- })
2544
- _progress_append({
2545
- "event": "cluster_invalid",
2546
- "file": file,
2547
- "batch_index": chunk_idx,
2548
- "gids": invalid_gids,
2549
- "verification": verification,
2550
- "count": len(members),
2551
- })
2552
- cluster_records.append({
2553
- "file": file,
2554
- "verification": verification,
2555
- "gids": invalid_gids,
2556
- "count": len(members),
2557
- "batch_index": chunk_idx,
2558
- "is_invalid": True,
2559
- "invalid_reason": invalid_reason,
2560
- })
2561
- elif members:
2562
- _merged_count += 1
2563
- cluster_batches.append(members)
2564
- cluster_records.append({
2565
- "file": file,
2566
- "verification": verification,
2567
- "gids": [m.get("gid") for m in members],
2568
- "count": len(members),
2569
- "batch_index": chunk_idx,
2570
- "is_invalid": False,
2571
- })
2572
-
2573
- return _merged_count, _invalid_count
2574
122
 
123
+ # 注:当前版本不使用 MultiAgent 编排,已移除默认多智能体配置与创建函数。
124
+ # 请使用 run_security_analysis(单Agent逐条验证)或 workflow.direct_scan + format_markdown_report(直扫基线)。
125
+ # 注意:部分函数已迁移到模块化文件中(prompts.py, parsers.py, utils.py, agents.py, clustering.py, analysis.py, verification.py, review.py),
126
+ # 本文件中保留了这些函数的别名导入,以便向后兼容。
2575
127
 
2576
- def _supplement_missing_gids(
2577
- missing_gids_final: set,
2578
- gid_to_item: Dict[int, Dict],
2579
- file: str,
2580
- chunk_idx: int,
2581
- cluster_batches: List[List[Dict]],
2582
- cluster_records: List[Dict],
2583
- ) -> int:
2584
- """为遗漏的gid创建单独聚类,返回补充的聚类数"""
2585
- supplemented_count = 0
2586
- for missing_gid in sorted(missing_gids_final):
2587
- missing_item = gid_to_item.get(missing_gid)
2588
- if missing_item:
2589
- default_verification = f"验证候选 {missing_gid} 的安全风险"
2590
- missing_item["verify"] = default_verification
2591
- cluster_batches.append([missing_item])
2592
- cluster_records.append({
2593
- "file": file,
2594
- "verification": default_verification,
2595
- "gids": [missing_gid],
2596
- "count": 1,
2597
- "batch_index": chunk_idx,
2598
- "note": "完整性校验补充的遗漏gid",
2599
- })
2600
- supplemented_count += 1
2601
- return supplemented_count
2602
128
 
2603
129
 
2604
- def _get_cluster_system_prompt() -> str:
2605
- """获取聚类Agent的系统提示词"""
2606
- return """
2607
- # 单Agent聚类约束
2608
- - 你的任务是对同一文件内的启发式候选进行聚类,将可以一起验证的问题归为一类。
2609
- - **聚类原则**:
2610
- - 可以一起验证的问题归为一类,不一定是验证条件完全一致才能归为一类。
2611
- - 如果多个候选问题可以通过同一个验证过程来确认,即使它们的验证条件略有不同,也可以归为一类。
2612
- - 例如:多个指针解引用问题可以归为一类(验证"指针在解引用前非空"),即使它们涉及不同的指针变量。
2613
- - 例如:多个缓冲区操作问题可以归为一类(验证"拷贝长度不超过目标缓冲区容量"),即使它们涉及不同的缓冲区。
2614
- - 验证条件:为了确认是否存在漏洞需要成立/验证的关键前置条件。例如:"指针p在解引用前非空""拷贝长度不超过目标缓冲区容量"等。
2615
- - **完整性要求**:每个gid都必须出现在某个类别中,不能遗漏任何一个gid。所有输入的gid都必须被分类。
2616
- - 工具优先:如需核对上下文,可使用 read_code 读取相邻代码;避免过度遍历。
2617
- - 禁止写操作;仅只读分析。
2618
- - **重要:关于无效判断的保守策略**:
2619
- - 在判断候选是否无效时,必须充分考虑所有可能的路径、调用链和边界情况。
2620
- - 必须考虑:所有可能的调用者、所有可能的输入来源、所有可能的执行路径、所有可能的边界条件。
2621
- - 只要存在任何可能性(即使很小)导致漏洞可被触发,就不应该标记为无效(is_invalid: false)。
2622
- - 只有在完全确定、没有任何可能性、所有路径都已验证安全的情况下,才能标记为无效(is_invalid: true)。
2623
- - 保守原则:有疑问时,一律标记为 false(需要进入后续验证阶段),让分析Agent和验证Agent进行更深入的分析。
2624
- - 不要因为看到局部有保护措施就认为无效,要考虑是否有其他调用路径绕过这些保护。
2625
- - 不要因为看到某些调用者已做校验就认为无效,要考虑是否有其他调用者未做校验。
2626
- - **记忆使用**:
2627
- - 在聚类过程中,充分利用 retrieve_memory 工具检索已有的记忆,特别是与当前文件或函数相关的记忆。
2628
- - 如果有必要,使用 save_memory 工具保存聚类过程中发现的函数或代码片段的要点,使用函数名或文件名作为 tag。
2629
- - 记忆内容示例:某个函数的指针已经判空、某个函数已有输入校验、某个代码片段的上下文信息等。
2630
- - 这些记忆可以帮助后续的分析Agent和验证Agent更高效地工作。
2631
- """.strip()
2632
130
 
2633
131
 
2634
- def _get_cluster_summary_prompt() -> str:
2635
- """获取聚类Agent的摘要提示词"""
2636
- return """
2637
- 请仅在 <CLUSTERS> 与 </CLUSTERS> 中输出 YAML 数组:
2638
- - 每个元素包含(所有字段均为必填):
2639
- - verification: 字符串(对该聚类的验证条件描述,简洁明确,可直接用于后续Agent验证)
2640
- - gids: 整数数组(候选的全局唯一编号;输入JSON每个元素含 gid,可直接对应填入)
2641
- - is_invalid: 布尔值(必填,true 或 false)。如果为 true,表示该聚类中的所有候选已被确认为无效/误报,将不会进入后续验证阶段;如果为 false,表示该聚类中的候选需要进入后续验证阶段。
2642
- - invalid_reason: 字符串(当 is_invalid 为 true 时必填,当 is_invalid 为 false 时可省略)。必须详细说明为什么这些候选是无效的,包括:
2643
- * 已检查的所有调用路径和调用者
2644
- * 已确认的保护措施和校验逻辑
2645
- * 为什么这些保护措施在所有路径上都有效
2646
- * 为什么不存在任何可能的触发路径
2647
- * 必须足够详细,以便复核Agent能够验证你的判断
2648
- - 要求:
2649
- - 严格要求:仅输出位于 <CLUSTERS> 与 </CLUSTERS> 间的 YAML 数组,其他位置不输出任何文本
2650
- - **完整性要求(最重要)**:输入JSON中的所有gid都必须被分类,不能遗漏任何一个gid。所有gid必须出现在某个聚类的gids数组中。这是强制要求,必须严格遵守。
2651
- - **聚类原则**:可以一起验证的问题归为一类,不一定是验证条件完全一致才能归为一类。如果多个候选问题可以通过同一个验证过程来确认,即使它们的验证条件略有不同,也可以归为一类。
2652
- - **必须要求**:每个聚类元素必须包含 is_invalid 字段,且值必须为 true 或 false,不能省略。
2653
- - **必须要求**:当 is_invalid 为 true 时,必须提供 invalid_reason 字段,且理由必须充分详细。
2654
- - 不需要解释与长文本,仅给出可执行的验证条件短句
2655
- - 若无法聚类,请将每个候选单独成组,verification 为该候选的最小确认条件
2656
- - **关于 is_invalid 的保守判断原则**:
2657
- - 必须充分考虑所有可能的路径、调用链、输入来源和边界情况。
2658
- - 只要存在任何可能性(即使很小)导致漏洞可被触发,必须设置 is_invalid: false。
2659
- - 只有在完全确定、没有任何可能性、所有路径都已验证安全的情况下,才能设置 is_invalid: true。
2660
- - 保守策略:有疑问时,一律设置为 false,让后续的分析Agent和验证Agent进行更深入的分析。
2661
- - 不要因为局部有保护措施就设置为 true,要考虑是否有其他路径绕过保护。
2662
- - 不要因为某些调用者已做校验就设置为 true,要考虑是否有其他调用者未做校验。
2663
- - 如果设置为 true,必须在 invalid_reason 中详细说明已检查的所有路径和原因。
2664
- <CLUSTERS>
2665
- - verification: ""
2666
- gids: []
2667
- is_invalid: false
2668
- </CLUSTERS>
2669
- """.strip()
2670
132
 
2671
133
 
2672
- def _create_cluster_agent(
2673
- file: str,
2674
- chunk_idx: int,
2675
- llm_group: Optional[str],
2676
- ) -> Agent:
2677
- """创建聚类Agent"""
2678
- cluster_system_prompt = _get_cluster_system_prompt()
2679
- cluster_summary_prompt = _get_cluster_summary_prompt()
2680
-
2681
- agent_kwargs_cluster: Dict = dict(
2682
- system_prompt=cluster_system_prompt,
2683
- name=f"JARVIS-SEC-Cluster::{file}::batch{chunk_idx}",
2684
- auto_complete=True,
2685
- need_summary=True,
2686
- summary_prompt=cluster_summary_prompt,
2687
- non_interactive=True,
2688
- in_multi_agent=False,
2689
- use_methodology=False,
2690
- use_analysis=False,
2691
- plan=False,
2692
- output_handler=[ToolRegistry()],
2693
- disable_file_edit=True,
2694
- use_tools=["read_code", "execute_script", "save_memory", "retrieve_memory"],
2695
- )
2696
- if llm_group:
2697
- agent_kwargs_cluster["model_group"] = llm_group
2698
- return Agent(**agent_kwargs_cluster)
2699
134
 
2700
135
 
2701
- def _build_cluster_task(
2702
- pending_in_file_with_ids: List[Dict],
136
+ def run_security_analysis(
2703
137
  entry_path: str,
2704
- file: str,
2705
- langs: List[str],
138
+ languages: Optional[List[str]] = None,
139
+ llm_group: Optional[str] = None,
140
+ report_file: Optional[str] = None,
141
+ cluster_limit: int = 50,
142
+ exclude_dirs: Optional[List[str]] = None,
143
+ enable_verification: bool = True,
144
+ force_save_memory: bool = False,
145
+ output_file: Optional[str] = None,
2706
146
  ) -> str:
2707
- """构建聚类任务上下文"""
2708
- import json as _json2
2709
- return f"""
2710
- # 聚类任务(分析输入)
2711
- 上下文:
2712
- - entry_path: {entry_path}
2713
- - file: {file}
2714
- - languages: {langs}
2715
-
2716
- 候选(JSON数组,包含 gid/file/line/pattern/category/evidence):
2717
- {_json2.dumps(pending_in_file_with_ids, ensure_ascii=False, indent=2)}
2718
- """.strip()
2719
-
2720
-
2721
- def _extract_input_gids(pending_in_file_with_ids: List[Dict]) -> set:
2722
- """从待聚类项中提取gid集合"""
2723
- input_gids = set()
2724
- for it in pending_in_file_with_ids:
2725
- try:
2726
- _gid = int(it.get("gid", 0))
2727
- if _gid >= 1:
2728
- input_gids.add(_gid)
2729
- except Exception:
2730
- pass
2731
- return input_gids
2732
-
2733
-
2734
- def _build_gid_to_item_mapping(pending_in_file_with_ids: List[Dict]) -> Dict[int, Dict]:
2735
- """构建gid到项的映射"""
2736
- gid_to_item: Dict[int, Dict] = {}
2737
- try:
2738
- for it in pending_in_file_with_ids:
2739
- try:
2740
- _gid = int(it.get("gid", 0))
2741
- if _gid >= 1:
2742
- gid_to_item[_gid] = it
2743
- except Exception:
2744
- pass
2745
- except Exception:
2746
- pass
2747
- return gid_to_item
2748
-
2749
-
2750
- def _process_cluster_chunk(
2751
- chunk: List[Dict],
2752
- chunk_idx: int,
2753
- file: str,
2754
- entry_path: str,
2755
- langs: List[str],
2756
- llm_group: Optional[str],
2757
- cluster_batches: List[List[Dict]],
2758
- cluster_records: List[Dict],
2759
- invalid_clusters_for_review: List[Dict],
2760
- _progress_append,
2761
- _write_cluster_batch_snapshot,
2762
- ) -> None:
2763
- """处理单个聚类批次"""
2764
- if not chunk:
2765
- return
2766
-
2767
- pending_in_file_with_ids = list(chunk)
2768
-
2769
- # 记录聚类批次开始
2770
- _progress_append({
2771
- "event": "cluster_status",
2772
- "status": "running",
2773
- "file": file,
2774
- "batch_index": chunk_idx,
2775
- "total_in_batch": len(pending_in_file_with_ids),
2776
- })
2777
-
2778
- # 创建聚类Agent
2779
- cluster_agent = _create_cluster_agent(file, chunk_idx, llm_group)
2780
-
2781
- # 构建任务上下文
2782
- cluster_task = _build_cluster_task(pending_in_file_with_ids, entry_path, file, langs)
2783
-
2784
- # 订阅摘要事件
2785
- cluster_summary = _subscribe_summary_event(cluster_agent)
2786
-
2787
- # 提取输入gid
2788
- input_gids = _extract_input_gids(pending_in_file_with_ids)
2789
-
2790
- # 运行聚类Agent
2791
- cluster_summary_prompt = _get_cluster_summary_prompt()
2792
- cluster_items, parse_error = _run_cluster_agent_with_retry(
2793
- cluster_agent,
2794
- cluster_task,
2795
- cluster_summary_prompt,
2796
- input_gids,
2797
- file,
2798
- cluster_summary,
2799
- )
2800
-
2801
- # 处理聚类结果
2802
- _merged_count = 0
2803
- _invalid_count = 0
2804
-
2805
- if isinstance(cluster_items, list) and cluster_items:
2806
- gid_to_item = _build_gid_to_item_mapping(pending_in_file_with_ids)
2807
-
2808
- _merged_count, _invalid_count = _process_cluster_results(
2809
- cluster_items,
2810
- pending_in_file_with_ids,
2811
- file,
2812
- chunk_idx,
2813
- cluster_batches,
2814
- cluster_records,
2815
- invalid_clusters_for_review,
2816
- _progress_append,
2817
- )
2818
-
2819
- classified_gids_final = _extract_classified_gids(cluster_items)
2820
- missing_gids_final = input_gids - classified_gids_final
2821
- if missing_gids_final:
2822
- typer.secho(f"[jarvis-sec] 警告:仍有遗漏的gid {sorted(list(missing_gids_final))},将为每个遗漏的gid创建单独聚类", fg=typer.colors.YELLOW)
2823
- supplemented_count = _supplement_missing_gids(
2824
- missing_gids_final,
2825
- gid_to_item,
2826
- file,
2827
- chunk_idx,
2828
- cluster_batches,
2829
- cluster_records,
2830
- )
2831
- _merged_count += supplemented_count
2832
- else:
2833
- # 聚类结果为空或None:为所有输入的gid创建单独聚类(保守策略)
2834
- if pending_in_file_with_ids:
2835
- typer.secho(f"[jarvis-sec] 警告:聚类结果为空或None(文件={file},批次={chunk_idx}),为所有gid创建单独聚类", fg=typer.colors.YELLOW)
2836
- gid_to_item_fallback = _build_gid_to_item_mapping(pending_in_file_with_ids)
2837
-
2838
- _merged_count = _supplement_missing_gids(
2839
- input_gids,
2840
- gid_to_item_fallback,
2841
- file,
2842
- chunk_idx,
2843
- cluster_batches,
2844
- cluster_records,
2845
- )
2846
- _invalid_count = 0
2847
- else:
2848
- _merged_count = 0
2849
- _invalid_count = 0
2850
-
2851
- # 标记聚类批次完成
2852
- _progress_append({
2853
- "event": "cluster_status",
2854
- "status": "done",
2855
- "file": file,
2856
- "batch_index": chunk_idx,
2857
- "clusters_count": _merged_count,
2858
- "invalid_clusters_count": _invalid_count,
2859
- })
2860
- if _invalid_count > 0:
2861
- try:
2862
- typer.secho(f"[jarvis-sec] 聚类批次完成: 有效聚类={_merged_count},无效聚类={_invalid_count}(已跳过)", fg=typer.colors.GREEN)
2863
- except Exception:
2864
- pass
2865
-
2866
- # 写入当前批次的聚类结果
2867
- current_batch_records = [
2868
- rec for rec in cluster_records
2869
- if rec.get("file") == file and rec.get("batch_index") == chunk_idx
2870
- ]
2871
- if current_batch_records:
2872
- _write_cluster_batch_snapshot(current_batch_records)
2873
-
2874
-
2875
- def _filter_pending_items(items: List[Dict], clustered_gids: set) -> List[Dict]:
2876
- """过滤出待聚类的项"""
2877
- pending_in_file: List[Dict] = []
2878
- for c in items:
2879
- try:
2880
- _gid = int(c.get("gid", 0))
2881
- if _gid >= 1 and _gid not in clustered_gids:
2882
- pending_in_file.append(c)
2883
- except Exception:
2884
- pass
2885
- return pending_in_file
2886
-
147
+ """
148
+ 运行安全分析工作流(混合模式)。
2887
149
 
2888
- def _process_file_clustering(
2889
- file: str,
2890
- items: List[Dict],
2891
- clustered_gids: set,
2892
- cluster_batches: List[List[Dict]],
2893
- cluster_records: List[Dict],
2894
- invalid_clusters_for_review: List[Dict],
2895
- entry_path: str,
2896
- langs: List[str],
2897
- cluster_limit: int,
2898
- llm_group: Optional[str],
2899
- _progress_append,
2900
- _write_cluster_batch_snapshot,
2901
- ) -> None:
2902
- """处理单个文件的聚类任务"""
2903
- # 过滤掉已聚类的 gid
2904
- pending_in_file = _filter_pending_items(items, clustered_gids)
2905
- if not pending_in_file:
2906
- return
2907
-
2908
- # 优化:如果文件只有一个告警,跳过聚类,直接写入
2909
- if len(pending_in_file) == 1:
2910
- single_item = pending_in_file[0]
2911
- single_gid = single_item.get("gid", 0)
2912
- _handle_single_alert_file(
2913
- file,
2914
- single_item,
2915
- single_gid,
2916
- cluster_batches,
2917
- cluster_records,
2918
- _progress_append,
2919
- _write_cluster_batch_snapshot,
2920
- )
2921
- return
2922
-
2923
- # 将该文件的告警按 cluster_limit 分批
2924
- _limit = cluster_limit if isinstance(cluster_limit, int) and cluster_limit > 0 else 50
2925
- _chunks: List[List[Dict]] = [pending_in_file[i:i + _limit] for i in range(0, len(pending_in_file), _limit)]
150
+ 改进:
151
+ - 即使在 agent 模式下,也先进行本地正则/启发式直扫,生成候选问题;
152
+ 然后将候选问题拆分为子任务,交由多Agent进行深入分析与聚合。
2926
153
 
2927
- # 处理每个批次
2928
- for _chunk_idx, _chunk in enumerate(_chunks, start=1):
2929
- _process_cluster_chunk(
2930
- _chunk,
2931
- _chunk_idx,
2932
- file,
2933
- entry_path,
2934
- langs,
2935
- llm_group,
2936
- cluster_batches,
2937
- cluster_records,
2938
- invalid_clusters_for_review,
2939
- _progress_append,
2940
- _write_cluster_batch_snapshot,
2941
- )
2942
-
2943
-
2944
- def _is_valid_review_item(item: Dict) -> bool:
2945
- """验证复核结果项的格式"""
2946
- if not isinstance(item, dict) or "is_reason_sufficient" not in item:
2947
- return False
2948
- has_gid = "gid" in item
2949
- has_gids = "gids" in item
2950
- if not has_gid and not has_gids:
2951
- return False
2952
- if has_gid and has_gids:
2953
- return False # gid 和 gids 不能同时出现
2954
- if has_gid:
2955
- try:
2956
- return int(item["gid"]) >= 1
2957
- except Exception:
2958
- return False
2959
- elif has_gids:
2960
- if not isinstance(item["gids"], list) or len(item["gids"]) == 0:
2961
- return False
2962
- try:
2963
- return all(int(gid_val) >= 1 for gid_val in item["gids"])
2964
- except Exception:
2965
- return False
2966
- return False
2967
-
2968
-
2969
- def _build_gid_to_review_mapping(review_results: List[Dict]) -> Dict[int, Dict]:
2970
- """构建gid到复核结果的映射(支持 gid 和 gids 两种格式)"""
2971
- gid_to_review: Dict[int, Dict] = {}
2972
- for rr in review_results:
2973
- if not isinstance(rr, dict):
2974
- continue
2975
-
2976
- # 支持 gid 和 gids 两种格式
2977
- gids_to_process: List[int] = []
2978
- if "gids" in rr and isinstance(rr.get("gids"), list):
2979
- # 合并格式:gids 数组
2980
- for gid_val in rr.get("gids", []):
2981
- try:
2982
- gid_int = int(gid_val)
2983
- if gid_int >= 1:
2984
- gids_to_process.append(gid_int)
2985
- except Exception:
2986
- pass
2987
- elif "gid" in rr:
2988
- # 单个格式:gid
2989
- try:
2990
- gid_int = int(rr.get("gid", 0))
2991
- if gid_int >= 1:
2992
- gids_to_process.append(gid_int)
2993
- except Exception:
2994
- pass
2995
-
2996
- # 为每个 gid 创建复核结果映射
2997
- is_reason_sufficient = rr.get("is_reason_sufficient")
2998
- review_notes = str(rr.get("review_notes", "")).strip()
2999
- for gid in gids_to_process:
3000
- gid_to_review[gid] = {
3001
- "is_reason_sufficient": is_reason_sufficient,
3002
- "review_notes": review_notes
3003
- }
3004
- return gid_to_review
3005
-
154
+ 注意:此函数会在发生异常时更新状态文件为 error 状态。
3006
155
 
3007
- def _process_review_batch(
3008
- review_batch: List[Dict],
3009
- review_results: Optional[List[Dict]],
3010
- reviewed_clusters: List[Dict],
3011
- reinstated_candidates: List[Dict],
3012
- ) -> None:
3013
- """处理单个复核批次的结果"""
3014
- if review_results:
3015
- # 构建gid到复核结果的映射
3016
- gid_to_review = _build_gid_to_review_mapping(review_results)
3017
-
3018
- # 处理每个无效聚类
3019
- for invalid_cluster in review_batch:
3020
- cluster_gids = invalid_cluster.get("gids", [])
3021
- cluster_members = invalid_cluster.get("members", [])
3022
-
3023
- # 检查该聚类中的所有gid的复核结果
3024
- all_sufficient = True
3025
- any_reviewed = False
3026
- insufficient_review_result = None
3027
- for gid in cluster_gids:
3028
- review_result = gid_to_review.get(gid)
3029
- if review_result:
3030
- any_reviewed = True
3031
- if review_result.get("is_reason_sufficient") is not True:
3032
- all_sufficient = False
3033
- if not insufficient_review_result:
3034
- insufficient_review_result = review_result
3035
- break
3036
-
3037
- if any_reviewed and not all_sufficient:
3038
- # 理由不充分,重新加入验证流程
3039
- typer.secho(f"[jarvis-sec] 复核结果:无效聚类(gids={cluster_gids})理由不充分,重新加入验证流程", fg=typer.colors.BLUE)
3040
- for member in cluster_members:
3041
- reinstated_candidates.append(member)
3042
- reviewed_clusters.append({
3043
- **invalid_cluster,
3044
- "review_result": "reinstated",
3045
- "review_notes": insufficient_review_result.get("review_notes", "") if insufficient_review_result else "",
3046
- })
3047
- else:
3048
- # 理由充分,确认无效
3049
- review_notes = ""
3050
- if cluster_gids and gid_to_review.get(cluster_gids[0]):
3051
- review_notes = gid_to_review[cluster_gids[0]].get("review_notes", "")
3052
- typer.secho(f"[jarvis-sec] 复核结果:无效聚类(gids={cluster_gids})理由充分,确认为无效", fg=typer.colors.GREEN)
3053
- reviewed_clusters.append({
3054
- **invalid_cluster,
3055
- "review_result": "confirmed_invalid",
3056
- "review_notes": review_notes,
3057
- })
3058
- else:
3059
- # 复核结果解析失败,保守策略:重新加入验证流程
3060
- typer.secho(f"[jarvis-sec] 警告:复核结果解析失败,保守策略:将批次中的所有候选重新加入验证流程", fg=typer.colors.YELLOW)
3061
- for invalid_cluster in review_batch:
3062
- cluster_members = invalid_cluster.get("members", [])
3063
- for member in cluster_members:
3064
- reinstated_candidates.append(member)
3065
- reviewed_clusters.append({
3066
- **invalid_cluster,
3067
- "review_result": "reinstated",
3068
- "review_notes": "复核结果解析失败,保守策略重新加入验证",
3069
- })
156
+ 参数:
157
+ - entry_path: 待分析的根目录路径
158
+ - languages: 限定扫描的语言扩展(例如 ["c", "cpp", "h", "hpp", "rs"]),为空则使用默认
3070
159
 
160
+ 返回:
161
+ - 最终报告(字符串),由 Aggregator 生成(JSON + Markdown)
3071
162
 
3072
- def _run_review_agent_with_retry(
3073
- review_agent,
3074
- review_task: str,
3075
- review_summary_prompt: str,
3076
- entry_path: str,
3077
- review_summary_container: Dict[str, str],
3078
- ) -> tuple[Optional[List[Dict]], Optional[str]]:
3079
- """运行复核Agent并永久重试直到格式正确,返回(复核结果, 解析错误)"""
3080
- use_direct_model_review = False
3081
- prev_parse_error_review: Optional[str] = None
3082
- review_attempt = 0
3083
-
3084
- while True:
3085
- review_attempt += 1
3086
- review_summary_container["text"] = ""
3087
-
3088
- if use_direct_model_review:
3089
- # 格式校验失败后,直接调用模型接口
3090
- review_summary_prompt_text = _build_verification_summary_prompt()
3091
- error_guidance = ""
3092
- if prev_parse_error_review:
3093
- error_guidance = f"\n\n**格式错误详情(请根据以下错误修复输出格式):**\n- YAML解析失败: {prev_parse_error_review}\n\n请确保输出的YAML格式正确,包括正确的缩进、引号、冒号等。"
3094
-
3095
- full_review_prompt = f"{review_task}{error_guidance}\n\n{review_summary_prompt_text}"
3096
- try:
3097
- review_response = review_agent.model.chat_until_success(full_review_prompt) # type: ignore
3098
- review_summary_container["text"] = review_response
3099
- except Exception as e:
3100
- try:
3101
- typer.secho(f"[jarvis-sec] 复核阶段直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
3102
- except Exception:
3103
- pass
3104
- review_agent.run(review_task)
3105
- else:
3106
- # 第一次使用 run(),让 Agent 完整运行(可能使用工具)
3107
- review_agent.run(review_task)
3108
-
3109
- # 工作区保护
3110
- try:
3111
- _changed_review = _git_restore_if_dirty(entry_path)
3112
- if _changed_review:
3113
- try:
3114
- typer.secho(f"[jarvis-sec] 复核Agent工作区已恢复 ({_changed_review} 个文件)", fg=typer.colors.BLUE)
3115
- except Exception:
3116
- pass
3117
- except Exception:
3118
- pass
3119
-
3120
- # 解析复核结果
3121
- review_summary_text = review_summary_container.get("text", "")
3122
- parse_error_review = None
3123
- if review_summary_text:
3124
- review_parsed, parse_error_review = _try_parse_summary_report(review_summary_text)
3125
- if parse_error_review:
3126
- prev_parse_error_review = parse_error_review
3127
- try:
3128
- typer.secho(f"[jarvis-sec] 复核结果YAML解析失败: {parse_error_review}", fg=typer.colors.YELLOW)
3129
- except Exception:
3130
- pass
3131
- else:
3132
- prev_parse_error_review = None
3133
- if isinstance(review_parsed, list):
3134
- if review_parsed and all(_is_valid_review_item(item) for item in review_parsed):
3135
- return review_parsed, None
3136
-
3137
- # 格式校验失败,后续重试使用直接模型调用
3138
- use_direct_model_review = True
3139
- if parse_error_review:
3140
- try:
3141
- typer.secho(f"[jarvis-sec] 复核结果YAML解析失败 -> 重试第 {review_attempt} 次(使用直接模型调用,将反馈解析错误)", fg=typer.colors.YELLOW)
3142
- except Exception:
3143
- pass
3144
- else:
3145
- try:
3146
- typer.secho(f"[jarvis-sec] 复核结果格式无效 -> 重试第 {review_attempt} 次(使用直接模型调用)", fg=typer.colors.YELLOW)
3147
- except Exception:
3148
- pass
163
+ 其他:
164
+ - llm_group: 模型组名称(仅在当前调用链内生效,不覆盖全局配置),将直接传入 Agent 用于选择模型
165
+ - report_file: 增量报告文件路径(JSONL)。当每个子任务检测到 issues 时,立即将一条记录追加到该文件;
166
+ 若未指定,则默认写入 entry_path/.jarvis/sec/agent_issues.jsonl
167
+ - cluster_limit: 聚类时每批次最多处理的告警数(默认 50),当单个文件告警过多时按批次进行聚类
168
+ - exclude_dirs: 要排除的目录列表(可选),默认已包含测试目录(test, tests, __tests__, spec, testsuite, testdata)
169
+ - enable_verification: 是否启用二次验证(默认 True),关闭后分析Agent确认的问题将直接写入报告
170
+ - 断点续扫: 默认开启。会基于 .jarvis/sec/candidates.jsonl、clusters.jsonl 和 analysis.jsonl 文件进行状态恢复。
171
+ """
3149
172
 
173
+ langs = languages or ["c", "cpp", "h", "hpp", "rs"]
3150
174
 
3151
- def _check_and_supplement_missing_gids(
3152
- file_groups: Dict[str, List[Dict]],
3153
- cluster_batches: List[List[Dict]],
3154
- invalid_clusters_for_review: List[Dict],
3155
- sec_dir,
3156
- _progress_append,
3157
- ) -> None:
3158
- """检查并补充遗漏的 gid"""
3159
- # 1. 收集所有候选的 gid
3160
- all_candidate_gids = _collect_candidate_gids(file_groups)
3161
- gid_to_candidate_for_check: Dict[int, Dict] = {}
3162
- for _file, _items in file_groups.items():
3163
- for it in _items:
3164
- try:
3165
- _gid = int(it.get("gid", 0))
3166
- if _gid >= 1:
3167
- gid_to_candidate_for_check[_gid] = it
3168
- except Exception:
3169
- pass
3170
-
3171
- # 2. 收集所有已聚类的 gid
3172
- all_clustered_gids = _collect_clustered_gids(cluster_batches, invalid_clusters_for_review)
175
+ # 状态管理器(不再使用 status.json,使用空对象)
176
+ class DummyStatusManager:
177
+ def update_pre_scan(self, **kwargs): pass
178
+ def update_clustering(self, **kwargs): pass
179
+ def update_review(self, **kwargs): pass
180
+ def update_verification(self, **kwargs): pass
181
+ def mark_completed(self, **kwargs): pass
182
+ def mark_error(self, **kwargs): pass
3173
183
 
3174
- # 3. 读取已处理的 gid(从 agent_issues.jsonl)
3175
- processed_gids_from_issues_for_check = _load_processed_gids_from_agent_issues(sec_dir)
3176
-
3177
- # 4. 检查是否有遗漏的 gid(未聚类)
3178
- missing_gids_before_analysis = all_candidate_gids - all_clustered_gids
3179
- if missing_gids_before_analysis:
3180
- missing_count = len(missing_gids_before_analysis)
3181
- missing_list = sorted(list(missing_gids_before_analysis))
3182
- if missing_count > 50:
3183
- # 如果遗漏的gid太多,只显示前10个和后10个
3184
- display_list = missing_list[:10] + ["..."] + missing_list[-10:]
3185
- typer.secho(f"[jarvis-sec] 警告:分析阶段开始前发现遗漏的gid(共{missing_count}个):{display_list},将检查是否需要补充聚类", fg=typer.colors.YELLOW)
3186
- else:
3187
- typer.secho(f"[jarvis-sec] 警告:分析阶段开始前发现遗漏的gid {missing_list},将检查是否需要补充聚类", fg=typer.colors.YELLOW)
3188
-
3189
- # 为每个遗漏的 gid 创建单独的聚类
3190
- supplemented_count, skipped_count = _supplement_missing_gids_for_clustering(
3191
- missing_gids_before_analysis,
3192
- gid_to_candidate_for_check,
3193
- cluster_batches,
3194
- _progress_append,
3195
- processed_gids_from_issues_for_check,
3196
- )
3197
-
3198
- # 输出统计信息
3199
- if skipped_count > 0:
3200
- try:
3201
- typer.secho(f"[jarvis-sec] 已跳过 {skipped_count} 个已在agent_issues.jsonl中处理的gid", fg=typer.colors.GREEN)
3202
- except Exception:
3203
- pass
3204
- if supplemented_count > 0:
3205
- try:
3206
- typer.secho(f"[jarvis-sec] 已为 {supplemented_count} 个遗漏的gid创建单独聚类", fg=typer.colors.GREEN)
3207
- except Exception:
3208
- pass
184
+ status_mgr = DummyStatusManager()
3209
185
 
3210
-
3211
- def _initialize_clustering_context(
3212
- compact_candidates: List[Dict],
3213
- sec_dir,
3214
- progress_path,
3215
- _progress_append,
3216
- ) -> tuple[Dict[str, List[Dict]], Dict, tuple, List[List[Dict]], List[Dict], List[Dict], set]:
3217
- """初始化聚类上下文,返回(文件分组, 已有聚类, 快照写入函数, 聚类批次, 聚类记录, 无效聚类, 已聚类gid)"""
3218
- # 按文件分组构建待聚类集合
3219
- _file_groups = _group_candidates_by_file(compact_candidates)
3220
-
3221
- cluster_batches: List[List[Dict]] = []
3222
- cluster_records: List[Dict] = []
3223
- invalid_clusters_for_review: List[Dict] = []
3224
-
3225
- # 读取已有聚类报告以支持断点
3226
- _existing_clusters, _completed_cluster_batches = _load_existing_clusters(
3227
- sec_dir, progress_path
3228
- )
3229
-
3230
- # 创建快照写入函数
3231
- _write_cluster_batch_snapshot, _write_cluster_report_snapshot = _create_cluster_snapshot_writer(
3232
- sec_dir, cluster_records, compact_candidates, _progress_append
3233
- )
3234
-
3235
- # 从断点恢复聚类结果
3236
- cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids = _restore_clusters_from_checkpoint(
3237
- _existing_clusters, _file_groups
3238
- )
3239
-
3240
- return (
3241
- _file_groups,
3242
- _existing_clusters,
3243
- (_write_cluster_batch_snapshot, _write_cluster_report_snapshot),
3244
- cluster_batches,
3245
- cluster_records,
3246
- invalid_clusters_for_review,
3247
- clustered_gids,
186
+ # 初始化分析上下文
187
+ sec_dir, progress_path, _progress_append = _initialize_analysis_context(
188
+ entry_path, status_mgr
3248
189
  )
3249
190
 
191
+ # 1) 启发式扫描(支持断点续扫)
192
+ candidates, summary = _load_or_run_heuristic_scan(
193
+ entry_path, langs, exclude_dirs, sec_dir, status_mgr, _progress_append
194
+ )
3250
195
 
3251
- def _check_unclustered_gids(
3252
- all_candidate_gids: set,
3253
- clustered_gids: set,
3254
- ) -> set:
3255
- """检查未聚类的gid"""
3256
- unclustered_gids = all_candidate_gids - clustered_gids
3257
- if unclustered_gids:
3258
- try:
3259
- typer.secho(f"[jarvis-sec] 发现 {len(unclustered_gids)} 个未聚类的 gid,将进行聚类", fg=typer.colors.YELLOW)
3260
- except Exception:
3261
- pass
3262
- else:
3263
- try:
3264
- typer.secho(f"[jarvis-sec] 所有 {len(all_candidate_gids)} 个候选已聚类,跳过聚类阶段", fg=typer.colors.GREEN)
3265
- except Exception:
3266
- pass
3267
- return unclustered_gids
3268
-
3269
-
3270
- def _execute_clustering_for_files(
3271
- file_groups: Dict[str, List[Dict]],
3272
- clustered_gids: set,
3273
- cluster_batches: List[List[Dict]],
3274
- cluster_records: List[Dict],
3275
- invalid_clusters_for_review: List[Dict],
3276
- entry_path: str,
3277
- langs: List[str],
3278
- cluster_limit: int,
3279
- llm_group: Optional[str],
3280
- status_mgr,
3281
- _progress_append,
3282
- _write_cluster_batch_snapshot,
3283
- ) -> None:
3284
- """执行文件聚类"""
3285
- total_files_to_cluster = len(file_groups)
3286
- # 更新聚类阶段状态
3287
- if total_files_to_cluster > 0:
3288
- status_mgr.update_clustering(
3289
- current_file=0,
3290
- total_files=total_files_to_cluster,
3291
- message="开始聚类分析..."
3292
- )
3293
- for _file_idx, (_file, _items) in enumerate(file_groups.items(), start=1):
3294
- typer.secho(f"\n[jarvis-sec] 聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}", fg=typer.colors.CYAN)
3295
- # 更新当前文件进度
3296
- status_mgr.update_clustering(
3297
- current_file=_file_idx,
3298
- total_files=total_files_to_cluster,
3299
- file_name=_file,
3300
- message=f"正在聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}"
3301
- )
3302
- # 使用子函数处理文件聚类
3303
- _process_file_clustering(
3304
- _file,
3305
- _items,
3306
- clustered_gids,
3307
- cluster_batches,
3308
- cluster_records,
3309
- invalid_clusters_for_review,
3310
- entry_path,
3311
- langs,
3312
- cluster_limit,
3313
- llm_group,
3314
- _progress_append,
3315
- _write_cluster_batch_snapshot,
3316
- )
3317
-
3318
-
3319
- def _record_clustering_completion(
3320
- sec_dir,
3321
- cluster_records: List[Dict],
3322
- compact_candidates: List[Dict],
3323
- _progress_append,
3324
- ) -> None:
3325
- """记录聚类阶段完成"""
196
+ # 2) 将候选问题精简为子任务清单,控制上下文长度
197
+ compact_candidates = _prepare_candidates(candidates)
198
+
199
+ # 3) 保存候选到新的 candidates.jsonl 文件(包含gid)
200
+ from jarvis.jarvis_sec.file_manager import save_candidates, get_candidates_file
3326
201
  try:
3327
- from pathlib import Path
3328
- import json
3329
- _cluster_path = sec_dir / "cluster_report.jsonl"
202
+ save_candidates(sec_dir, compact_candidates)
3330
203
  _progress_append({
3331
- "event": "cluster_report_written",
3332
- "path": str(_cluster_path),
3333
- "clusters": len(cluster_records),
3334
- "total_candidates": len(compact_candidates),
3335
- "note": "每个批次已增量保存,无需重写整个文件",
204
+ "event": "candidates_saved",
205
+ "path": str(get_candidates_file(sec_dir)),
206
+ "issues_count": len(compact_candidates),
3336
207
  })
3337
208
  except Exception:
3338
209
  pass
3339
-
3340
-
3341
- def _fallback_to_file_based_batches(
3342
- file_groups: Dict[str, List[Dict]],
3343
- existing_clusters: Dict,
3344
- ) -> List[List[Dict]]:
3345
- """若聚类失败或空,则回退为按文件一次处理"""
3346
- fallback_batches: List[List[Dict]] = []
3347
210
 
3348
- # 收集所有未聚类的 gid(从所有候选 gid 中排除已聚类的)
3349
- all_gids_in_file_groups = _collect_candidate_gids(file_groups)
3350
- gid_to_item_fallback: Dict[int, Dict] = {}
3351
- for _file, _items in file_groups.items():
3352
- for c in _items:
211
+ # 记录批次选择信息(可选,用于日志)
212
+ try:
213
+ groups = _group_candidates_by_file(compact_candidates)
214
+ if groups:
215
+ selected_file, items = max(groups.items(), key=lambda kv: len(kv[1]))
3353
216
  try:
3354
- _gid = int(c.get("gid", 0))
3355
- if _gid >= 1:
3356
- gid_to_item_fallback[_gid] = c
217
+ typer.secho(f"[jarvis-sec] 批次选择: 文件={selected_file} 数量={len(items)}", fg=typer.colors.BLUE)
3357
218
  except Exception:
3358
219
  pass
220
+ _progress_append({
221
+ "event": "batch_selection",
222
+ "selected_file": selected_file,
223
+ "selected_count": len(items),
224
+ "total_in_file": len(items),
225
+ })
226
+ except Exception:
227
+ pass
3359
228
 
3360
- # 如果还有未聚类的 gid,按文件分组创建批次
3361
- if all_gids_in_file_groups:
3362
- # 收集已聚类的 gid(从 cluster_report.jsonl)
3363
- clustered_gids_fallback = set()
3364
- for (_file_key, _batch_idx), cluster_recs in existing_clusters.items():
3365
- for rec in cluster_recs:
3366
- if rec.get("is_invalid", False):
3367
- continue
3368
- gids_list = rec.get("gids", [])
3369
- for _gid in gids_list:
3370
- try:
3371
- _gid_int = int(_gid)
3372
- if _gid_int >= 1:
3373
- clustered_gids_fallback.add(_gid_int)
3374
- except Exception:
3375
- pass
3376
-
3377
- unclustered_gids_fallback = all_gids_in_file_groups - clustered_gids_fallback
3378
- if unclustered_gids_fallback:
3379
- # 按文件分组未聚类的 gid
3380
- from collections import defaultdict
3381
- unclustered_by_file: Dict[str, List[Dict]] = defaultdict(list)
3382
- for _gid in unclustered_gids_fallback:
3383
- item = gid_to_item_fallback.get(_gid)
3384
- if item:
3385
- file_key = str(item.get("file") or "")
3386
- unclustered_by_file[file_key].append(item)
3387
-
3388
- # 为每个文件创建批次
3389
- for _file, _items in unclustered_by_file.items():
3390
- if _items:
3391
- fallback_batches.append(_items)
3392
-
3393
- return fallback_batches
3394
-
229
+ # 创建报告写入函数
230
+ _append_report = _create_report_writer(sec_dir, report_file)
3395
231
 
3396
- def _process_clustering_phase(
3397
- compact_candidates: List[Dict],
3398
- entry_path: str,
3399
- langs: List[str],
3400
- cluster_limit: int,
3401
- llm_group: Optional[str],
3402
- sec_dir,
3403
- progress_path,
3404
- status_mgr,
3405
- _progress_append,
3406
- ) -> tuple[List[List[Dict]], List[Dict]]:
3407
- """处理聚类阶段,返回(cluster_batches, invalid_clusters_for_review)"""
3408
- # 初始化聚类上下文
3409
- (
3410
- _file_groups,
3411
- _existing_clusters,
3412
- (_write_cluster_batch_snapshot, _write_cluster_report_snapshot),
3413
- cluster_batches,
3414
- cluster_records,
3415
- invalid_clusters_for_review,
3416
- clustered_gids,
3417
- ) = _initialize_clustering_context(compact_candidates, sec_dir, progress_path, _progress_append)
3418
-
3419
- # 收集所有候选的 gid(用于检查未聚类的 gid)
3420
- all_candidate_gids_in_clustering = _collect_candidate_gids(_file_groups)
3421
-
3422
- # 检查是否有未聚类的 gid
3423
- unclustered_gids = _check_unclustered_gids(all_candidate_gids_in_clustering, clustered_gids)
3424
-
3425
- # 如果有未聚类的 gid,继续执行聚类
3426
- if unclustered_gids:
3427
- _execute_clustering_for_files(
3428
- _file_groups,
3429
- clustered_gids,
3430
- cluster_batches,
3431
- cluster_records,
3432
- invalid_clusters_for_review,
3433
- entry_path,
3434
- langs,
3435
- cluster_limit,
3436
- llm_group,
3437
- status_mgr,
3438
- _progress_append,
3439
- _write_cluster_batch_snapshot,
3440
- )
3441
-
3442
- # 记录聚类阶段完成
3443
- _record_clustering_completion(sec_dir, cluster_records, compact_candidates, _progress_append)
3444
-
3445
- # 复核Agent:验证所有标记为无效的聚类
3446
- cluster_batches = _process_review_phase(
3447
- invalid_clusters_for_review,
232
+ # 3) 处理聚类阶段
233
+ cluster_batches, invalid_clusters_for_review = _process_clustering_phase(
234
+ compact_candidates,
3448
235
  entry_path,
3449
236
  langs,
237
+ cluster_limit,
3450
238
  llm_group,
239
+ sec_dir,
3451
240
  status_mgr,
3452
241
  _progress_append,
3453
- cluster_batches,
242
+ force_save_memory=force_save_memory,
3454
243
  )
3455
-
3456
- # 若聚类失败或空,则回退为"按文件一次处理"
3457
- if not cluster_batches:
3458
- fallback_batches = _fallback_to_file_based_batches(_file_groups, _existing_clusters)
3459
- cluster_batches.extend(fallback_batches)
3460
-
3461
- # 完整性检查:确保所有候选的 gid 都已被聚类
3462
- _check_and_supplement_missing_gids(
3463
- _file_groups,
244
+
245
+ # 4) 处理验证阶段
246
+ meta_records: List[Dict] = []
247
+ all_issues = _process_verification_phase(
3464
248
  cluster_batches,
3465
- invalid_clusters_for_review,
249
+ entry_path,
250
+ langs,
251
+ llm_group,
3466
252
  sec_dir,
253
+ status_mgr,
3467
254
  _progress_append,
255
+ _append_report,
256
+ enable_verification=enable_verification,
257
+ force_save_memory=force_save_memory,
3468
258
  )
3469
259
 
3470
- return cluster_batches, invalid_clusters_for_review
3471
-
3472
-
3473
- def _process_verification_phase(
3474
- cluster_batches: List[List[Dict]],
3475
- entry_path: str,
3476
- langs: List[str],
3477
- llm_group: Optional[str],
3478
- sec_dir,
3479
- progress_path,
3480
- status_mgr,
3481
- _progress_append,
3482
- _append_report,
3483
- ) -> List[Dict]:
3484
- """处理验证阶段,返回所有已保存的告警"""
3485
- batches: List[List[Dict]] = cluster_batches
3486
- total_batches = len(batches)
3487
-
3488
- # 从 agent_issues.jsonl 中读取已处理的 gid
3489
- processed_gids_from_issues = _load_processed_gids_from_issues(sec_dir)
3490
-
3491
- # 从 progress.jsonl 中读取已完成的批次
3492
- completed_batch_ids = _load_completed_batch_ids(progress_path)
3493
-
3494
- if completed_batch_ids:
3495
- try:
3496
- typer.secho(f"[jarvis-sec] 断点恢复:从 progress.jsonl 读取到 {len(completed_batch_ids)} 个已完成的批次", fg=typer.colors.BLUE)
3497
- except Exception:
3498
- pass
3499
-
3500
- # 更新验证阶段状态
3501
- if total_batches > 0:
3502
- status_mgr.update_verification(
3503
- current_batch=0,
3504
- total_batches=total_batches,
3505
- message="开始安全验证..."
260
+ # 5) 使用统一聚合器生成最终报告(JSON + Markdown)
261
+ try:
262
+ from jarvis.jarvis_sec.report import build_json_and_markdown
263
+ result = build_json_and_markdown(
264
+ all_issues,
265
+ scanned_root=summary.get("scanned_root"),
266
+ scanned_files=summary.get("scanned_files"),
267
+ meta=meta_records or None,
268
+ output_file=output_file,
3506
269
  )
3507
-
3508
- meta_records: List[Dict] = []
3509
- gid_counts: Dict[int, int] = {}
3510
-
3511
- for bidx, batch in enumerate(batches, start=1):
3512
- task_id = f"JARVIS-SEC-Batch-{bidx}"
3513
- batch_file = batch[0].get("file") if batch else None
3514
-
3515
- # 检查批次是否已完成:优先检查 progress.jsonl 中的批次状态
3516
- is_batch_completed = False
3517
-
3518
- # 方法1:检查 progress.jsonl 中是否有该批次的完成记录
3519
- if task_id in completed_batch_ids:
3520
- is_batch_completed = True
3521
- else:
3522
- # 方法2:检查批次中的所有 gid 是否都在 agent_issues.jsonl 中
3523
- batch_gids = set()
3524
- for item in batch:
3525
- try:
3526
- _gid = int(item.get("gid", 0))
3527
- if _gid >= 1:
3528
- batch_gids.add(_gid)
3529
- except Exception:
3530
- pass
3531
-
3532
- # 如果批次中的所有 gid 都已处理,则认为该批次已完成
3533
- if batch_gids and processed_gids_from_issues and batch_gids.issubset(processed_gids_from_issues):
3534
- is_batch_completed = True
3535
-
3536
- if is_batch_completed:
3537
- try:
3538
- typer.secho(f"[jarvis-sec] 跳过批次 {bidx}/{total_batches}:已在之前的运行中完成", fg=typer.colors.GREEN)
3539
- except Exception:
3540
- pass
3541
- # 更新进度但不实际处理
3542
- status_mgr.update_verification(
3543
- current_batch=bidx,
3544
- total_batches=total_batches,
3545
- batch_id=task_id,
3546
- file_name=batch_file,
3547
- message=f"跳过已完成的批次 {bidx}/{total_batches}"
3548
- )
3549
- continue
3550
-
3551
- # 处理验证批次
3552
- _process_verification_batch(
3553
- batch,
3554
- bidx,
3555
- total_batches,
3556
- entry_path,
3557
- langs,
3558
- llm_group,
3559
- status_mgr,
3560
- _progress_append,
3561
- _append_report,
3562
- meta_records,
3563
- gid_counts,
3564
- sec_dir,
270
+ # 标记分析完成
271
+ status_mgr.mark_completed(
272
+ total_issues=len(all_issues),
273
+ message=f"安全分析完成,共发现 {len(all_issues)} 个问题"
3565
274
  )
3566
-
3567
- # agent_issues.jsonl 读取所有已保存的告警
3568
- return _load_all_issues_from_file(sec_dir)
275
+ return result
276
+ except Exception as e:
277
+ # 发生错误时更新状态
278
+ error_msg = str(e)
279
+ status_mgr.mark_error(
280
+ error_message=error_msg,
281
+ error_type=type(e).__name__
282
+ )
283
+ raise
284
+ finally:
285
+ # 清理LSP客户端资源,防止文件句柄泄露
286
+ try:
287
+ from jarvis.jarvis_tools.lsp_client import LSPClientTool
288
+ LSPClientTool.cleanup_all_clients()
289
+ except Exception:
290
+ pass # 清理失败不影响主流程
291
+
292
+
293
+
294
+
3569
295
 
3570
296
 
3571
- def _try_parse_summary_report(text: str) -> tuple[Optional[object], Optional[str]]:
3572
- """
3573
- 从摘要文本中提取 <REPORT>...</REPORT> 内容,并解析为对象(dict 或 list,仅支持 YAML)。
3574
- 返回(解析结果, 错误信息)
3575
- 如果解析成功,返回(data, None)
3576
- 如果解析失败,返回(None, 错误信息)
3577
- """
3578
- start = text.find("<REPORT>")
3579
- end = text.find("</REPORT>")
3580
- if start == -1 or end == -1 or end <= start:
3581
- return None, "未找到 <REPORT> 或 </REPORT> 标签,或标签顺序错误"
3582
- content = text[start + len("<REPORT>"):end].strip()
3583
- if not content:
3584
- return None, "YAML 内容为空"
3585
- try:
3586
- import yaml as _yaml # type: ignore
3587
- try:
3588
- data = _yaml.safe_load(content)
3589
- except Exception as yaml_err:
3590
- error_msg = f"YAML 解析失败: {str(yaml_err)}"
3591
- return None, error_msg
3592
- if isinstance(data, (dict, list)):
3593
- return data, None
3594
- return None, f"YAML 解析结果不是字典或数组,而是 {type(data).__name__}"
3595
- except Exception as e:
3596
- return None, f"解析过程发生异常: {str(e)}"
3597
297
 
3598
298
 
3599
299
  __all__ = [