jarvis-ai-assistant 0.7.16__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (279) hide show
  1. jarvis/__init__.py +1 -1
  2. jarvis/jarvis_agent/__init__.py +567 -222
  3. jarvis/jarvis_agent/agent_manager.py +19 -12
  4. jarvis/jarvis_agent/builtin_input_handler.py +79 -11
  5. jarvis/jarvis_agent/config_editor.py +7 -2
  6. jarvis/jarvis_agent/event_bus.py +24 -13
  7. jarvis/jarvis_agent/events.py +19 -1
  8. jarvis/jarvis_agent/file_context_handler.py +67 -64
  9. jarvis/jarvis_agent/file_methodology_manager.py +38 -24
  10. jarvis/jarvis_agent/jarvis.py +186 -114
  11. jarvis/jarvis_agent/language_extractors/__init__.py +8 -1
  12. jarvis/jarvis_agent/language_extractors/c_extractor.py +7 -4
  13. jarvis/jarvis_agent/language_extractors/cpp_extractor.py +9 -4
  14. jarvis/jarvis_agent/language_extractors/go_extractor.py +7 -4
  15. jarvis/jarvis_agent/language_extractors/java_extractor.py +27 -20
  16. jarvis/jarvis_agent/language_extractors/javascript_extractor.py +22 -17
  17. jarvis/jarvis_agent/language_extractors/python_extractor.py +7 -4
  18. jarvis/jarvis_agent/language_extractors/rust_extractor.py +7 -4
  19. jarvis/jarvis_agent/language_extractors/typescript_extractor.py +22 -17
  20. jarvis/jarvis_agent/language_support_info.py +250 -219
  21. jarvis/jarvis_agent/main.py +19 -23
  22. jarvis/jarvis_agent/memory_manager.py +9 -6
  23. jarvis/jarvis_agent/methodology_share_manager.py +21 -15
  24. jarvis/jarvis_agent/output_handler.py +4 -2
  25. jarvis/jarvis_agent/prompt_builder.py +7 -6
  26. jarvis/jarvis_agent/prompt_manager.py +113 -8
  27. jarvis/jarvis_agent/prompts.py +317 -85
  28. jarvis/jarvis_agent/protocols.py +5 -2
  29. jarvis/jarvis_agent/run_loop.py +192 -32
  30. jarvis/jarvis_agent/session_manager.py +7 -3
  31. jarvis/jarvis_agent/share_manager.py +23 -13
  32. jarvis/jarvis_agent/shell_input_handler.py +12 -8
  33. jarvis/jarvis_agent/stdio_redirect.py +25 -26
  34. jarvis/jarvis_agent/task_analyzer.py +29 -23
  35. jarvis/jarvis_agent/task_list.py +869 -0
  36. jarvis/jarvis_agent/task_manager.py +26 -23
  37. jarvis/jarvis_agent/tool_executor.py +6 -5
  38. jarvis/jarvis_agent/tool_share_manager.py +24 -14
  39. jarvis/jarvis_agent/user_interaction.py +3 -3
  40. jarvis/jarvis_agent/utils.py +9 -1
  41. jarvis/jarvis_agent/web_bridge.py +37 -17
  42. jarvis/jarvis_agent/web_output_sink.py +5 -2
  43. jarvis/jarvis_agent/web_server.py +165 -36
  44. jarvis/jarvis_c2rust/__init__.py +1 -1
  45. jarvis/jarvis_c2rust/cli.py +260 -141
  46. jarvis/jarvis_c2rust/collector.py +37 -18
  47. jarvis/jarvis_c2rust/constants.py +60 -0
  48. jarvis/jarvis_c2rust/library_replacer.py +242 -1010
  49. jarvis/jarvis_c2rust/library_replacer_checkpoint.py +133 -0
  50. jarvis/jarvis_c2rust/library_replacer_llm.py +287 -0
  51. jarvis/jarvis_c2rust/library_replacer_loader.py +191 -0
  52. jarvis/jarvis_c2rust/library_replacer_output.py +134 -0
  53. jarvis/jarvis_c2rust/library_replacer_prompts.py +124 -0
  54. jarvis/jarvis_c2rust/library_replacer_utils.py +188 -0
  55. jarvis/jarvis_c2rust/llm_module_agent.py +98 -1044
  56. jarvis/jarvis_c2rust/llm_module_agent_apply.py +170 -0
  57. jarvis/jarvis_c2rust/llm_module_agent_executor.py +288 -0
  58. jarvis/jarvis_c2rust/llm_module_agent_loader.py +170 -0
  59. jarvis/jarvis_c2rust/llm_module_agent_prompts.py +268 -0
  60. jarvis/jarvis_c2rust/llm_module_agent_types.py +57 -0
  61. jarvis/jarvis_c2rust/llm_module_agent_utils.py +150 -0
  62. jarvis/jarvis_c2rust/llm_module_agent_validator.py +119 -0
  63. jarvis/jarvis_c2rust/loaders.py +28 -10
  64. jarvis/jarvis_c2rust/models.py +5 -2
  65. jarvis/jarvis_c2rust/optimizer.py +192 -1974
  66. jarvis/jarvis_c2rust/optimizer_build_fix.py +286 -0
  67. jarvis/jarvis_c2rust/optimizer_clippy.py +766 -0
  68. jarvis/jarvis_c2rust/optimizer_config.py +49 -0
  69. jarvis/jarvis_c2rust/optimizer_docs.py +183 -0
  70. jarvis/jarvis_c2rust/optimizer_options.py +48 -0
  71. jarvis/jarvis_c2rust/optimizer_progress.py +469 -0
  72. jarvis/jarvis_c2rust/optimizer_report.py +52 -0
  73. jarvis/jarvis_c2rust/optimizer_unsafe.py +309 -0
  74. jarvis/jarvis_c2rust/optimizer_utils.py +469 -0
  75. jarvis/jarvis_c2rust/optimizer_visibility.py +185 -0
  76. jarvis/jarvis_c2rust/scanner.py +229 -166
  77. jarvis/jarvis_c2rust/transpiler.py +531 -2732
  78. jarvis/jarvis_c2rust/transpiler_agents.py +503 -0
  79. jarvis/jarvis_c2rust/transpiler_build.py +1294 -0
  80. jarvis/jarvis_c2rust/transpiler_codegen.py +204 -0
  81. jarvis/jarvis_c2rust/transpiler_compile.py +146 -0
  82. jarvis/jarvis_c2rust/transpiler_config.py +178 -0
  83. jarvis/jarvis_c2rust/transpiler_context.py +122 -0
  84. jarvis/jarvis_c2rust/transpiler_executor.py +516 -0
  85. jarvis/jarvis_c2rust/transpiler_generation.py +278 -0
  86. jarvis/jarvis_c2rust/transpiler_git.py +163 -0
  87. jarvis/jarvis_c2rust/transpiler_mod_utils.py +225 -0
  88. jarvis/jarvis_c2rust/transpiler_modules.py +336 -0
  89. jarvis/jarvis_c2rust/transpiler_planning.py +394 -0
  90. jarvis/jarvis_c2rust/transpiler_review.py +1196 -0
  91. jarvis/jarvis_c2rust/transpiler_symbols.py +176 -0
  92. jarvis/jarvis_c2rust/utils.py +269 -79
  93. jarvis/jarvis_code_agent/after_change.py +233 -0
  94. jarvis/jarvis_code_agent/build_validation_config.py +37 -30
  95. jarvis/jarvis_code_agent/builtin_rules.py +68 -0
  96. jarvis/jarvis_code_agent/code_agent.py +976 -1517
  97. jarvis/jarvis_code_agent/code_agent_build.py +227 -0
  98. jarvis/jarvis_code_agent/code_agent_diff.py +246 -0
  99. jarvis/jarvis_code_agent/code_agent_git.py +525 -0
  100. jarvis/jarvis_code_agent/code_agent_impact.py +177 -0
  101. jarvis/jarvis_code_agent/code_agent_lint.py +283 -0
  102. jarvis/jarvis_code_agent/code_agent_llm.py +159 -0
  103. jarvis/jarvis_code_agent/code_agent_postprocess.py +105 -0
  104. jarvis/jarvis_code_agent/code_agent_prompts.py +46 -0
  105. jarvis/jarvis_code_agent/code_agent_rules.py +305 -0
  106. jarvis/jarvis_code_agent/code_analyzer/__init__.py +52 -48
  107. jarvis/jarvis_code_agent/code_analyzer/base_language.py +12 -10
  108. jarvis/jarvis_code_agent/code_analyzer/build_validator/__init__.py +12 -11
  109. jarvis/jarvis_code_agent/code_analyzer/build_validator/base.py +16 -12
  110. jarvis/jarvis_code_agent/code_analyzer/build_validator/cmake.py +26 -17
  111. jarvis/jarvis_code_agent/code_analyzer/build_validator/detector.py +558 -104
  112. jarvis/jarvis_code_agent/code_analyzer/build_validator/fallback.py +27 -16
  113. jarvis/jarvis_code_agent/code_analyzer/build_validator/go.py +22 -18
  114. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_gradle.py +21 -16
  115. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_maven.py +20 -16
  116. jarvis/jarvis_code_agent/code_analyzer/build_validator/makefile.py +27 -16
  117. jarvis/jarvis_code_agent/code_analyzer/build_validator/nodejs.py +47 -23
  118. jarvis/jarvis_code_agent/code_analyzer/build_validator/python.py +71 -37
  119. jarvis/jarvis_code_agent/code_analyzer/build_validator/rust.py +162 -35
  120. jarvis/jarvis_code_agent/code_analyzer/build_validator/validator.py +111 -57
  121. jarvis/jarvis_code_agent/code_analyzer/build_validator.py +18 -12
  122. jarvis/jarvis_code_agent/code_analyzer/context_manager.py +185 -183
  123. jarvis/jarvis_code_agent/code_analyzer/context_recommender.py +2 -1
  124. jarvis/jarvis_code_agent/code_analyzer/dependency_analyzer.py +24 -15
  125. jarvis/jarvis_code_agent/code_analyzer/file_ignore.py +227 -141
  126. jarvis/jarvis_code_agent/code_analyzer/impact_analyzer.py +321 -247
  127. jarvis/jarvis_code_agent/code_analyzer/language_registry.py +37 -29
  128. jarvis/jarvis_code_agent/code_analyzer/language_support.py +21 -13
  129. jarvis/jarvis_code_agent/code_analyzer/languages/__init__.py +15 -9
  130. jarvis/jarvis_code_agent/code_analyzer/languages/c_cpp_language.py +75 -45
  131. jarvis/jarvis_code_agent/code_analyzer/languages/go_language.py +87 -52
  132. jarvis/jarvis_code_agent/code_analyzer/languages/java_language.py +84 -51
  133. jarvis/jarvis_code_agent/code_analyzer/languages/javascript_language.py +94 -64
  134. jarvis/jarvis_code_agent/code_analyzer/languages/python_language.py +109 -71
  135. jarvis/jarvis_code_agent/code_analyzer/languages/rust_language.py +97 -63
  136. jarvis/jarvis_code_agent/code_analyzer/languages/typescript_language.py +103 -69
  137. jarvis/jarvis_code_agent/code_analyzer/llm_context_recommender.py +271 -268
  138. jarvis/jarvis_code_agent/code_analyzer/symbol_extractor.py +76 -64
  139. jarvis/jarvis_code_agent/code_analyzer/tree_sitter_extractor.py +92 -19
  140. jarvis/jarvis_code_agent/diff_visualizer.py +998 -0
  141. jarvis/jarvis_code_agent/lint.py +223 -524
  142. jarvis/jarvis_code_agent/rule_share_manager.py +158 -0
  143. jarvis/jarvis_code_agent/rules/clean_code.md +144 -0
  144. jarvis/jarvis_code_agent/rules/code_review.md +115 -0
  145. jarvis/jarvis_code_agent/rules/documentation.md +165 -0
  146. jarvis/jarvis_code_agent/rules/generate_rules.md +52 -0
  147. jarvis/jarvis_code_agent/rules/performance.md +158 -0
  148. jarvis/jarvis_code_agent/rules/refactoring.md +139 -0
  149. jarvis/jarvis_code_agent/rules/security.md +160 -0
  150. jarvis/jarvis_code_agent/rules/tdd.md +78 -0
  151. jarvis/jarvis_code_agent/test_rules/cpp_test.md +118 -0
  152. jarvis/jarvis_code_agent/test_rules/go_test.md +98 -0
  153. jarvis/jarvis_code_agent/test_rules/java_test.md +99 -0
  154. jarvis/jarvis_code_agent/test_rules/javascript_test.md +113 -0
  155. jarvis/jarvis_code_agent/test_rules/php_test.md +117 -0
  156. jarvis/jarvis_code_agent/test_rules/python_test.md +91 -0
  157. jarvis/jarvis_code_agent/test_rules/ruby_test.md +102 -0
  158. jarvis/jarvis_code_agent/test_rules/rust_test.md +86 -0
  159. jarvis/jarvis_code_agent/utils.py +36 -26
  160. jarvis/jarvis_code_analysis/checklists/loader.py +21 -21
  161. jarvis/jarvis_code_analysis/code_review.py +64 -33
  162. jarvis/jarvis_data/config_schema.json +285 -192
  163. jarvis/jarvis_git_squash/main.py +8 -6
  164. jarvis/jarvis_git_utils/git_commiter.py +53 -76
  165. jarvis/jarvis_mcp/__init__.py +5 -2
  166. jarvis/jarvis_mcp/sse_mcp_client.py +40 -30
  167. jarvis/jarvis_mcp/stdio_mcp_client.py +27 -19
  168. jarvis/jarvis_mcp/streamable_mcp_client.py +35 -26
  169. jarvis/jarvis_memory_organizer/memory_organizer.py +78 -55
  170. jarvis/jarvis_methodology/main.py +48 -39
  171. jarvis/jarvis_multi_agent/__init__.py +56 -23
  172. jarvis/jarvis_multi_agent/main.py +15 -18
  173. jarvis/jarvis_platform/base.py +179 -111
  174. jarvis/jarvis_platform/human.py +27 -16
  175. jarvis/jarvis_platform/kimi.py +52 -45
  176. jarvis/jarvis_platform/openai.py +101 -40
  177. jarvis/jarvis_platform/registry.py +51 -33
  178. jarvis/jarvis_platform/tongyi.py +68 -38
  179. jarvis/jarvis_platform/yuanbao.py +59 -43
  180. jarvis/jarvis_platform_manager/main.py +68 -76
  181. jarvis/jarvis_platform_manager/service.py +24 -14
  182. jarvis/jarvis_rag/README_CONFIG.md +314 -0
  183. jarvis/jarvis_rag/README_DYNAMIC_LOADING.md +311 -0
  184. jarvis/jarvis_rag/README_ONLINE_MODELS.md +230 -0
  185. jarvis/jarvis_rag/__init__.py +57 -4
  186. jarvis/jarvis_rag/cache.py +3 -1
  187. jarvis/jarvis_rag/cli.py +48 -68
  188. jarvis/jarvis_rag/embedding_interface.py +39 -0
  189. jarvis/jarvis_rag/embedding_manager.py +7 -230
  190. jarvis/jarvis_rag/embeddings/__init__.py +41 -0
  191. jarvis/jarvis_rag/embeddings/base.py +114 -0
  192. jarvis/jarvis_rag/embeddings/cohere.py +66 -0
  193. jarvis/jarvis_rag/embeddings/edgefn.py +117 -0
  194. jarvis/jarvis_rag/embeddings/local.py +260 -0
  195. jarvis/jarvis_rag/embeddings/openai.py +62 -0
  196. jarvis/jarvis_rag/embeddings/registry.py +293 -0
  197. jarvis/jarvis_rag/llm_interface.py +8 -6
  198. jarvis/jarvis_rag/query_rewriter.py +8 -9
  199. jarvis/jarvis_rag/rag_pipeline.py +61 -52
  200. jarvis/jarvis_rag/reranker.py +7 -75
  201. jarvis/jarvis_rag/reranker_interface.py +32 -0
  202. jarvis/jarvis_rag/rerankers/__init__.py +41 -0
  203. jarvis/jarvis_rag/rerankers/base.py +109 -0
  204. jarvis/jarvis_rag/rerankers/cohere.py +67 -0
  205. jarvis/jarvis_rag/rerankers/edgefn.py +140 -0
  206. jarvis/jarvis_rag/rerankers/jina.py +79 -0
  207. jarvis/jarvis_rag/rerankers/local.py +89 -0
  208. jarvis/jarvis_rag/rerankers/registry.py +293 -0
  209. jarvis/jarvis_rag/retriever.py +58 -43
  210. jarvis/jarvis_sec/__init__.py +66 -141
  211. jarvis/jarvis_sec/agents.py +21 -17
  212. jarvis/jarvis_sec/analysis.py +80 -33
  213. jarvis/jarvis_sec/checkers/__init__.py +7 -13
  214. jarvis/jarvis_sec/checkers/c_checker.py +356 -164
  215. jarvis/jarvis_sec/checkers/rust_checker.py +47 -29
  216. jarvis/jarvis_sec/cli.py +43 -21
  217. jarvis/jarvis_sec/clustering.py +430 -272
  218. jarvis/jarvis_sec/file_manager.py +99 -55
  219. jarvis/jarvis_sec/parsers.py +9 -6
  220. jarvis/jarvis_sec/prompts.py +4 -3
  221. jarvis/jarvis_sec/report.py +44 -22
  222. jarvis/jarvis_sec/review.py +180 -107
  223. jarvis/jarvis_sec/status.py +50 -41
  224. jarvis/jarvis_sec/types.py +3 -0
  225. jarvis/jarvis_sec/utils.py +160 -83
  226. jarvis/jarvis_sec/verification.py +411 -181
  227. jarvis/jarvis_sec/workflow.py +132 -21
  228. jarvis/jarvis_smart_shell/main.py +28 -41
  229. jarvis/jarvis_stats/cli.py +14 -12
  230. jarvis/jarvis_stats/stats.py +28 -19
  231. jarvis/jarvis_stats/storage.py +14 -8
  232. jarvis/jarvis_stats/visualizer.py +12 -7
  233. jarvis/jarvis_tools/base.py +5 -2
  234. jarvis/jarvis_tools/clear_memory.py +13 -9
  235. jarvis/jarvis_tools/cli/main.py +23 -18
  236. jarvis/jarvis_tools/edit_file.py +572 -873
  237. jarvis/jarvis_tools/execute_script.py +10 -7
  238. jarvis/jarvis_tools/file_analyzer.py +7 -8
  239. jarvis/jarvis_tools/meta_agent.py +287 -0
  240. jarvis/jarvis_tools/methodology.py +5 -3
  241. jarvis/jarvis_tools/read_code.py +305 -1438
  242. jarvis/jarvis_tools/read_symbols.py +50 -17
  243. jarvis/jarvis_tools/read_webpage.py +19 -18
  244. jarvis/jarvis_tools/registry.py +435 -156
  245. jarvis/jarvis_tools/retrieve_memory.py +16 -11
  246. jarvis/jarvis_tools/save_memory.py +8 -6
  247. jarvis/jarvis_tools/search_web.py +31 -31
  248. jarvis/jarvis_tools/sub_agent.py +32 -28
  249. jarvis/jarvis_tools/sub_code_agent.py +44 -60
  250. jarvis/jarvis_tools/task_list_manager.py +1811 -0
  251. jarvis/jarvis_tools/virtual_tty.py +29 -19
  252. jarvis/jarvis_utils/__init__.py +4 -0
  253. jarvis/jarvis_utils/builtin_replace_map.py +2 -1
  254. jarvis/jarvis_utils/clipboard.py +9 -8
  255. jarvis/jarvis_utils/collections.py +331 -0
  256. jarvis/jarvis_utils/config.py +699 -194
  257. jarvis/jarvis_utils/dialogue_recorder.py +294 -0
  258. jarvis/jarvis_utils/embedding.py +6 -3
  259. jarvis/jarvis_utils/file_processors.py +7 -1
  260. jarvis/jarvis_utils/fzf.py +9 -3
  261. jarvis/jarvis_utils/git_utils.py +71 -42
  262. jarvis/jarvis_utils/globals.py +116 -32
  263. jarvis/jarvis_utils/http.py +6 -2
  264. jarvis/jarvis_utils/input.py +318 -83
  265. jarvis/jarvis_utils/jsonnet_compat.py +119 -104
  266. jarvis/jarvis_utils/methodology.py +37 -28
  267. jarvis/jarvis_utils/output.py +201 -44
  268. jarvis/jarvis_utils/utils.py +986 -628
  269. {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/METADATA +49 -33
  270. jarvis_ai_assistant-1.0.2.dist-info/RECORD +304 -0
  271. jarvis/jarvis_code_agent/code_analyzer/structured_code.py +0 -556
  272. jarvis/jarvis_tools/generate_new_tool.py +0 -205
  273. jarvis/jarvis_tools/lsp_client.py +0 -1552
  274. jarvis/jarvis_tools/rewrite_file.py +0 -105
  275. jarvis_ai_assistant-0.7.16.dist-info/RECORD +0 -218
  276. {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/WHEEL +0 -0
  277. {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/entry_points.txt +0 -0
  278. {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/licenses/LICENSE +0 -0
  279. {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/top_level.txt +0 -0
@@ -1,24 +1,24 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  """聚类相关模块"""
3
3
 
4
- from typing import Dict, List, Optional
5
- from pathlib import Path
6
4
  import json
7
- import typer
8
-
9
- from jarvis.jarvis_sec.prompts import get_cluster_summary_prompt
5
+ from pathlib import Path
6
+ from typing import Dict
7
+ from typing import List
8
+ from typing import Optional
9
+
10
+
11
+ from jarvis.jarvis_sec.agents import create_cluster_agent
12
+ from jarvis.jarvis_sec.agents import subscribe_summary_event
13
+ from jarvis.jarvis_sec.file_manager import get_all_clustered_gids
14
+ from jarvis.jarvis_sec.file_manager import get_clusters_file
15
+ from jarvis.jarvis_sec.file_manager import load_clusters
16
+ from jarvis.jarvis_sec.file_manager import save_cluster
17
+ from jarvis.jarvis_sec.file_manager import validate_clustering_completeness
10
18
  from jarvis.jarvis_sec.parsers import parse_clusters_from_text
11
- from jarvis.jarvis_sec.agents import create_cluster_agent, subscribe_summary_event
12
- from jarvis.jarvis_sec.utils import (
13
- group_candidates_by_file,
14
- )
15
- from jarvis.jarvis_sec.file_manager import (
16
- load_clusters,
17
- save_cluster,
18
- get_all_clustered_gids,
19
- validate_clustering_completeness,
20
- get_clusters_file,
21
- )
19
+ from jarvis.jarvis_sec.prompts import get_cluster_summary_prompt
20
+ from jarvis.jarvis_sec.utils import group_candidates_by_file
21
+ from jarvis.jarvis_utils.output import PrettyOutput
22
22
 
23
23
 
24
24
  def load_existing_clusters(
@@ -26,33 +26,36 @@ def load_existing_clusters(
26
26
  ) -> tuple[Dict[tuple[str, int], List[Dict]], set, set]:
27
27
  """
28
28
  读取已有聚类报告以支持断点恢复。
29
-
29
+
30
30
  优先使用新的 clusters.jsonl 文件,如果不存在则回退到旧的 cluster_report.jsonl。
31
-
31
+
32
32
  返回: (_existing_clusters, _completed_cluster_batches, _reviewed_invalid_gids)
33
33
  """
34
34
  _existing_clusters: Dict[tuple[str, int], List[Dict]] = {}
35
35
  _completed_cluster_batches: set = set()
36
36
  _reviewed_invalid_gids: set = set() # 已复核的无效聚类的 gids
37
-
37
+
38
38
  try:
39
39
  # 优先使用新的 clusters.jsonl 文件
40
40
  clusters = load_clusters(sec_dir)
41
-
41
+
42
42
  if clusters:
43
43
  # 从新的 clusters.jsonl 加载
44
44
  for cluster in clusters:
45
45
  f_name = str(cluster.get("file") or "")
46
46
  bidx = int(cluster.get("batch_index", 1) or 1)
47
47
  _existing_clusters.setdefault((f_name, bidx), []).append(cluster)
48
-
48
+
49
49
  # 从分析结果文件中读取已复核的无效聚类
50
50
  # 如果聚类是无效的,且其gids都在分析结果中被标记为误报,则认为已复核
51
51
  if cluster.get("is_invalid", False):
52
52
  gids_list = cluster.get("gids", [])
53
53
  if isinstance(gids_list, list):
54
54
  # 检查这些gid是否都在分析结果中被标记为误报
55
- from jarvis.jarvis_sec.file_manager import get_false_positive_gids
55
+ from jarvis.jarvis_sec.file_manager import (
56
+ get_false_positive_gids,
57
+ )
58
+
56
59
  false_positive_gids = get_false_positive_gids(sec_dir)
57
60
  all_false_positive = all(
58
61
  int(gid_val) in false_positive_gids
@@ -72,7 +75,7 @@ def load_existing_clusters(
72
75
  _existing_clusters = {}
73
76
  _completed_cluster_batches = set()
74
77
  _reviewed_invalid_gids = set()
75
-
78
+
76
79
  return _existing_clusters, _completed_cluster_batches, _reviewed_invalid_gids
77
80
 
78
81
 
@@ -83,7 +86,7 @@ def restore_clusters_from_checkpoint(
83
86
  ) -> tuple[List[List[Dict]], List[Dict], List[Dict], set]:
84
87
  """
85
88
  从断点恢复聚类结果。
86
-
89
+
87
90
  返回: (cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids)
88
91
  """
89
92
  # 1. 收集所有候选的 gid
@@ -98,15 +101,17 @@ def restore_clusters_from_checkpoint(
98
101
  gid_to_candidate[_gid] = it
99
102
  except Exception:
100
103
  pass
101
-
104
+
102
105
  # 2. 从 cluster_report.jsonl 恢复所有聚类结果
103
- clustered_gids = set() # 已聚类的 gid(包括有效和无效的,因为无效的也需要进入复核阶段)
106
+ clustered_gids = (
107
+ set()
108
+ ) # 已聚类的 gid(包括有效和无效的,因为无效的也需要进入复核阶段)
104
109
  invalid_clusters_for_review: List[Dict] = [] # 无效聚类列表(从断点恢复)
105
110
  cluster_batches: List[List[Dict]] = []
106
111
  cluster_records: List[Dict] = []
107
112
  skipped_reviewed_count = 0 # 已复核的无效聚类数量(跳过)
108
113
  missing_gids_in_restore = set() # 记录恢复时无法匹配的gid(用于诊断)
109
-
114
+
110
115
  # 首先,从所有聚类记录中收集所有已聚类的 gid(无论是否在当前候选集中)
111
116
  # 这样可以确保即使匹配失败,只要 gid 在 clusters.jsonl 中且在当前候选集中,就会被计入 clustered_gids
112
117
  all_clustered_gids_from_file = set()
@@ -121,13 +126,13 @@ def restore_clusters_from_checkpoint(
121
126
  all_clustered_gids_from_file.add(_gid_int)
122
127
  except Exception:
123
128
  pass
124
-
129
+
125
130
  # 对于所有在 clusters.jsonl 中记录的 gid,如果它们也在当前候选集中,就计入 clustered_gids
126
131
  # 这样可以避免因为匹配失败而导致的遗漏
127
132
  for _gid_int in all_clustered_gids_from_file:
128
133
  if _gid_int in all_candidate_gids_in_clustering:
129
134
  clustered_gids.add(_gid_int)
130
-
135
+
131
136
  # 然后,尝试恢复具体的聚类信息(用于恢复 cluster_batches 和 invalid_clusters_for_review)
132
137
  for (_file_key, _batch_idx), cluster_recs in _existing_clusters.items():
133
138
  for rec in cluster_recs:
@@ -168,7 +173,7 @@ def restore_clusters_from_checkpoint(
168
173
  pass
169
174
  if found_candidate:
170
175
  break
171
-
176
+
172
177
  if found_candidate:
173
178
  # 找到了对应的候选,添加到members中
174
179
  found_candidate["verify"] = verification
@@ -186,7 +191,7 @@ def restore_clusters_from_checkpoint(
186
191
  pass
187
192
  except Exception:
188
193
  pass
189
-
194
+
190
195
  # 只有当至少有一个gid在当前候选集中时,才恢复这个聚类
191
196
  # 如果所有gid都不在当前候选集中,说明这些gid对应的候选在当前运行中不存在
192
197
  # 这种情况下,不应该恢复这个聚类,因为这些gid不在当前运行中
@@ -198,49 +203,63 @@ def restore_clusters_from_checkpoint(
198
203
  cluster_gids_int = set()
199
204
  for gid_val in cluster_gids:
200
205
  try:
206
+ if gid_val is None:
207
+ continue
201
208
  gid_int = int(gid_val)
202
209
  if gid_int >= 1:
203
210
  cluster_gids_int.add(gid_int)
204
211
  except Exception:
205
212
  pass
206
213
  # 检查所有 gid 是否都已被复核过
207
- all_reviewed = cluster_gids_int and cluster_gids_int.issubset(_reviewed_invalid_gids)
208
-
214
+ all_reviewed = cluster_gids_int and cluster_gids_int.issubset(
215
+ _reviewed_invalid_gids
216
+ )
217
+
209
218
  if not all_reviewed:
210
219
  # 如果还有未复核的 gid,收集到复核列表
211
- invalid_clusters_for_review.append({
212
- "file": _file_key,
213
- "batch_index": _batch_idx,
214
- "gids": cluster_gids,
215
- "verification": verification,
216
- "invalid_reason": str(rec.get("invalid_reason", "")).strip(),
217
- "members": members, # 保存候选信息,用于复核后可能重新加入验证
218
- "count": len(members),
219
- })
220
+ invalid_clusters_for_review.append(
221
+ {
222
+ "file": _file_key,
223
+ "batch_index": _batch_idx,
224
+ "gids": cluster_gids,
225
+ "verification": verification,
226
+ "invalid_reason": str(
227
+ rec.get("invalid_reason", "")
228
+ ).strip(),
229
+ "members": members, # 保存候选信息,用于复核后可能重新加入验证
230
+ "count": len(members),
231
+ }
232
+ )
220
233
  else:
221
234
  # 如果所有 gid 都已被复核过,则跳过(不加入复核列表)
222
235
  skipped_reviewed_count += 1
223
236
  else:
224
237
  # 有效聚类:恢复到 cluster_batches
225
238
  cluster_batches.append(members)
226
- cluster_records.append({
227
- "file": _file_key,
228
- "verification": verification,
229
- "gids": [m.get("gid") for m in members],
230
- "count": len(members),
231
- "batch_index": _batch_idx,
232
- "is_invalid": False,
233
- })
234
-
239
+ cluster_records.append(
240
+ {
241
+ "file": _file_key,
242
+ "verification": verification,
243
+ "gids": [m.get("gid") for m in members],
244
+ "count": len(members),
245
+ "batch_index": _batch_idx,
246
+ "is_invalid": False,
247
+ }
248
+ )
249
+
235
250
  # 输出统计信息
236
251
  if _reviewed_invalid_gids:
237
252
  try:
238
- typer.secho(f"[jarvis-sec] 断点恢复:发现 {len(_reviewed_invalid_gids)} 个已复核的无效聚类 gids", fg=typer.colors.BLUE)
253
+ PrettyOutput.auto_print(
254
+ f"[jarvis-sec] 断点恢复:发现 {len(_reviewed_invalid_gids)} 个已复核的无效聚类 gids",
255
+ )
239
256
  except Exception:
240
257
  pass
241
258
  if skipped_reviewed_count > 0:
242
259
  try:
243
- typer.secho(f"[jarvis-sec] 断点恢复:跳过 {skipped_reviewed_count} 个已复核的无效聚类", fg=typer.colors.BLUE)
260
+ PrettyOutput.auto_print(
261
+ f"[jarvis-sec] 断点恢复:跳过 {skipped_reviewed_count} 个已复核的无效聚类",
262
+ )
244
263
  except Exception:
245
264
  pass
246
265
  if missing_gids_in_restore:
@@ -251,31 +270,42 @@ def restore_clusters_from_checkpoint(
251
270
  try:
252
271
  if missing_count <= 20:
253
272
  missing_list = sorted(list(missing_gids_in_restore))
254
- typer.secho(f"[jarvis-sec] 断点恢复诊断:发现 {missing_count} 个gid在当前候选集中但无法匹配(可能存在数据不一致): {missing_list}", fg=typer.colors.YELLOW)
273
+ PrettyOutput.auto_print(
274
+ f"[jarvis-sec] 断点恢复诊断:发现 {missing_count} 个gid在当前候选集中但无法匹配(可能存在数据不一致): {missing_list}",
275
+ )
255
276
  else:
256
277
  missing_list = sorted(list(missing_gids_in_restore))
257
278
  display_list = missing_list[:10] + ["..."] + missing_list[-10:]
258
- typer.secho(f"[jarvis-sec] 断点恢复诊断:发现 {missing_count} 个gid在当前候选集中但无法匹配(可能存在数据不一致): {display_list}", fg=typer.colors.YELLOW)
279
+ PrettyOutput.auto_print(
280
+ f"[jarvis-sec] 断点恢复诊断:发现 {missing_count} 个gid在当前候选集中但无法匹配(可能存在数据不一致): {display_list}",
281
+ )
259
282
  except Exception:
260
283
  pass
261
-
284
+
262
285
  return cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids
263
286
 
264
287
 
265
- def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], compact_candidates: List[Dict], _progress_append):
288
+ def create_cluster_snapshot_writer(
289
+ sec_dir: Path,
290
+ cluster_records: List[Dict],
291
+ compact_candidates: List[Dict],
292
+ _progress_append,
293
+ ):
266
294
  """创建聚类快照写入函数"""
295
+
267
296
  def _write_cluster_batch_snapshot(batch_records: List[Dict]):
268
297
  """写入单个批次的聚类结果,支持增量保存"""
269
298
  try:
270
299
  # 按 (file, batch_index) 分组,为每个分组内的记录生成唯一的 cluster_index
271
300
  from collections import defaultdict
301
+
272
302
  records_by_key = defaultdict(list)
273
303
  for record in batch_records:
274
304
  file_name = str(record.get("file", ""))
275
305
  batch_index = int(record.get("batch_index", 0))
276
306
  key = (file_name, batch_index)
277
307
  records_by_key[key].append(record)
278
-
308
+
279
309
  # 为每个分组内的记录生成 cluster_index
280
310
  for (file_name, batch_index), records in records_by_key.items():
281
311
  for local_idx, record in enumerate(records):
@@ -285,9 +315,9 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
285
315
  cluster_index = local_idx
286
316
  else:
287
317
  cluster_index = int(cluster_index)
288
-
318
+
289
319
  cluster_id = f"{file_name}|{batch_index}|{cluster_index}"
290
-
320
+
291
321
  # 转换为新的格式
292
322
  cluster = {
293
323
  "cluster_id": cluster_id,
@@ -299,12 +329,12 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
299
329
  "is_invalid": record.get("is_invalid", False),
300
330
  "invalid_reason": str(record.get("invalid_reason", "")).strip(),
301
331
  }
302
-
332
+
303
333
  # 使用新的文件管理器保存
304
334
  save_cluster(sec_dir, cluster)
305
335
  except Exception:
306
336
  pass
307
-
337
+
308
338
  def _write_cluster_report_snapshot():
309
339
  """写入聚类报告快照"""
310
340
  try:
@@ -314,7 +344,7 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
314
344
  batch_index = int(record.get("batch_index", 0))
315
345
  cluster_index = idx # 使用索引作为 cluster_index
316
346
  cluster_id = f"{file_name}|{batch_index}|{cluster_index}"
317
-
347
+
318
348
  # 转换为新的格式
319
349
  cluster = {
320
350
  "cluster_id": cluster_id,
@@ -326,7 +356,7 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
326
356
  "is_invalid": record.get("is_invalid", False),
327
357
  "invalid_reason": str(record.get("invalid_reason", "")).strip(),
328
358
  }
329
-
359
+
330
360
  # 使用新的文件管理器保存
331
361
  save_cluster(sec_dir, cluster)
332
362
 
@@ -340,7 +370,7 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
340
370
  )
341
371
  except Exception:
342
372
  pass
343
-
373
+
344
374
  return _write_cluster_batch_snapshot, _write_cluster_report_snapshot
345
375
 
346
376
 
@@ -358,7 +388,9 @@ def collect_candidate_gids(file_groups: Dict[str, List[Dict]]) -> set:
358
388
  return all_gids
359
389
 
360
390
 
361
- def collect_clustered_gids(cluster_batches: List[List[Dict]], invalid_clusters_for_review: List[Dict]) -> set:
391
+ def collect_clustered_gids(
392
+ cluster_batches: List[List[Dict]], invalid_clusters_for_review: List[Dict]
393
+ ) -> set:
362
394
  """收集所有已聚类的 gid"""
363
395
  all_clustered_gids = set()
364
396
  for batch in cluster_batches:
@@ -394,20 +426,21 @@ def filter_single_gid_clusters(
394
426
  ) -> List[List[Dict]]:
395
427
  """
396
428
  过滤掉单独聚类的批次(只包含1个gid的批次),避免分析工作量激增。
397
-
429
+
398
430
  这些单独聚类通常是之前为遗漏的gid自动创建的,现在不再需要。
399
431
  """
400
432
  filtered_batches = []
401
433
  removed_count = 0
402
434
  removed_gids = set()
403
-
435
+
404
436
  # 读取已分析的gid(从analysis.jsonl)
405
437
  from jarvis.jarvis_sec.file_manager import get_all_analyzed_gids
438
+
406
439
  processed_gids = get_all_analyzed_gids(sec_dir)
407
-
440
+
408
441
  # 读取clusters.jsonl中的所有gid
409
442
  cluster_report_gids = get_all_clustered_gids(sec_dir)
410
-
443
+
411
444
  for batch in cluster_batches:
412
445
  # 检查批次大小
413
446
  if len(batch) == 1:
@@ -420,17 +453,21 @@ def filter_single_gid_clusters(
420
453
  if gid in processed_gids:
421
454
  removed_count += 1
422
455
  removed_gids.add(gid)
423
- _progress_append({
424
- "event": "single_cluster_removed",
425
- "gid": gid,
426
- "reason": "already_analyzed",
427
- })
456
+ _progress_append(
457
+ {
458
+ "event": "single_cluster_removed",
459
+ "gid": gid,
460
+ "reason": "already_analyzed",
461
+ }
462
+ )
428
463
  continue
429
-
464
+
430
465
  # 检查verification字段,如果是默认的"验证候选 X 的安全风险",说明是自动创建的单独聚类
431
466
  verification = str(single_item.get("verify", "")).strip()
432
- is_auto_created = verification.startswith("验证候选 ") and verification.endswith(" 的安全风险")
433
-
467
+ is_auto_created = verification.startswith(
468
+ "验证候选 "
469
+ ) and verification.endswith(" 的安全风险")
470
+
434
471
  if is_auto_created:
435
472
  # 这是自动创建的单独聚类
436
473
  # 如果gid在clusters.jsonl中有记录,说明已经聚类过了,可以安全移除
@@ -438,18 +475,22 @@ def filter_single_gid_clusters(
438
475
  if gid in cluster_report_gids:
439
476
  removed_count += 1
440
477
  removed_gids.add(gid)
441
- _progress_append({
442
- "event": "single_cluster_removed",
443
- "gid": gid,
444
- "reason": "auto_created_and_in_clusters",
445
- })
478
+ _progress_append(
479
+ {
480
+ "event": "single_cluster_removed",
481
+ "gid": gid,
482
+ "reason": "auto_created_and_in_clusters",
483
+ }
484
+ )
446
485
  continue
447
486
  else:
448
487
  # 自动创建的单独聚类,但不在clusters.jsonl中,也不在analysis.jsonl中
449
488
  # 说明需要分析,保留它(避免遗漏告警)
450
489
  # 但给出警告,因为这种情况不应该发生
451
490
  try:
452
- typer.secho(f"[jarvis-sec] 警告:gid={gid}是自动创建的单独聚类,但不在clusters.jsonl中,保留以避免遗漏告警", fg=typer.colors.YELLOW)
491
+ PrettyOutput.auto_print(
492
+ f"[jarvis-sec] 警告:gid={gid}是自动创建的单独聚类,但不在clusters.jsonl中,保留以避免遗漏告警",
493
+ )
453
494
  except Exception:
454
495
  pass
455
496
  else:
@@ -458,23 +499,33 @@ def filter_single_gid_clusters(
458
499
  pass
459
500
  except Exception:
460
501
  pass
461
-
502
+
462
503
  # 保留这个批次(不是单独聚类,或者单独聚类但需要保留)
463
504
  filtered_batches.append(batch)
464
-
505
+
465
506
  if removed_count > 0:
466
507
  try:
467
508
  if len(removed_gids) <= 20:
468
- typer.secho(f"[jarvis-sec] 已移除 {removed_count} 个单独聚类批次(共{len(removed_gids)}个gid),避免分析工作量激增", fg=typer.colors.GREEN)
469
- typer.secho(f"[jarvis-sec] 移除的gid: {sorted(list(removed_gids))}", fg=typer.colors.GREEN)
509
+ PrettyOutput.auto_print(
510
+ f"[jarvis-sec] 已移除 {removed_count} 个单独聚类批次(共{len(removed_gids)}个gid),避免分析工作量激增",
511
+ )
512
+ PrettyOutput.auto_print(
513
+ f"[jarvis-sec] 移除的gid: {sorted(list(removed_gids))}",
514
+ )
470
515
  else:
471
516
  removed_gids_list = sorted(list(removed_gids))
472
- display_list = removed_gids_list[:10] + ["..."] + removed_gids_list[-10:]
473
- typer.secho(f"[jarvis-sec] 已移除 {removed_count} 个单独聚类批次(共{len(removed_gids)}个gid),避免分析工作量激增", fg=typer.colors.GREEN)
474
- typer.secho(f"[jarvis-sec] 移除的gid(示例): {display_list}", fg=typer.colors.GREEN)
517
+ display_list = (
518
+ removed_gids_list[:10] + ["..."] + removed_gids_list[-10:]
519
+ )
520
+ PrettyOutput.auto_print(
521
+ f"[jarvis-sec] 已移除 {removed_count} 个单独聚类批次(共{len(removed_gids)}个gid),避免分析工作量激增",
522
+ )
523
+ PrettyOutput.auto_print(
524
+ f"[jarvis-sec] 移除的gid(示例): {display_list}",
525
+ )
475
526
  except Exception:
476
527
  pass
477
-
528
+
478
529
  return filtered_batches
479
530
 
480
531
 
@@ -512,65 +563,76 @@ def handle_single_alert_file(
512
563
  }
513
564
  )
514
565
  current_batch_records = [
515
- rec for rec in cluster_records
566
+ rec
567
+ for rec in cluster_records
516
568
  if rec.get("file") == file and rec.get("batch_index") == 1
517
569
  ]
518
570
  if current_batch_records:
519
571
  _write_cluster_batch_snapshot(current_batch_records)
520
- typer.secho(f"[jarvis-sec] 文件 {file} 仅有一个告警(gid={single_gid}),跳过聚类直接写入", fg=typer.colors.BLUE)
572
+ PrettyOutput.auto_print(
573
+ f"[jarvis-sec] 文件 {file} 仅有一个告警(gid={single_gid}),跳过聚类直接写入",
574
+ )
521
575
 
522
576
 
523
577
  def validate_cluster_format(cluster_items: List[Dict]) -> tuple[bool, List[str]]:
524
578
  """验证聚类结果的格式,返回(是否有效, 错误详情列表)"""
525
579
  if not isinstance(cluster_items, list) or not cluster_items:
526
580
  return False, ["结果不是数组或数组为空"]
527
-
581
+
528
582
  error_details = []
529
583
  for idx, it in enumerate(cluster_items):
530
584
  if not isinstance(it, dict):
531
585
  error_details.append(f"元素{idx}不是字典")
532
586
  return False, error_details
533
-
587
+
534
588
  vals = it.get("gids", [])
535
- if not isinstance(it.get("verification", ""), str) or not isinstance(vals, list):
589
+ if not isinstance(it.get("verification", ""), str) or not isinstance(
590
+ vals, list
591
+ ):
536
592
  error_details.append(f"元素{idx}的verification或gids格式错误")
537
593
  return False, error_details
538
-
594
+
539
595
  # 校验 gids 列表中的每个元素是否都是有效的整数
540
596
  if isinstance(vals, list):
541
597
  for gid_idx, gid_val in enumerate(vals):
542
598
  try:
543
599
  gid_int = int(gid_val)
544
600
  if gid_int < 1:
545
- error_details.append(f"元素{idx}的gids[{gid_idx}]不是有效的正整数(值为{gid_val})")
601
+ error_details.append(
602
+ f"元素{idx}的gids[{gid_idx}]不是有效的正整数(值为{gid_val})"
603
+ )
546
604
  return False, error_details
547
605
  except (ValueError, TypeError):
548
- error_details.append(f"元素{idx}的gids[{gid_idx}]不是有效的整数(值为{gid_val},类型为{type(gid_val).__name__})")
606
+ error_details.append(
607
+ f"元素{idx}的gids[{gid_idx}]不是有效的整数(值为{gid_val},类型为{type(gid_val).__name__})"
608
+ )
549
609
  return False, error_details
550
-
610
+
551
611
  # 校验 is_invalid 字段(必填)
552
612
  if "is_invalid" not in it:
553
613
  error_details.append(f"元素{idx}缺少is_invalid字段(必填)")
554
614
  return False, error_details
555
-
615
+
556
616
  is_invalid_val = it.get("is_invalid")
557
617
  if not isinstance(is_invalid_val, bool):
558
618
  error_details.append(f"元素{idx}的is_invalid不是布尔值")
559
619
  return False, error_details
560
-
620
+
561
621
  # 如果is_invalid为true,必须提供invalid_reason
562
622
  if is_invalid_val is True:
563
623
  invalid_reason = it.get("invalid_reason", "")
564
624
  if not isinstance(invalid_reason, str) or not invalid_reason.strip():
565
- error_details.append(f"元素{idx}的is_invalid为true但缺少invalid_reason字段或理由为空(必填)")
625
+ error_details.append(
626
+ f"元素{idx}的is_invalid为true但缺少invalid_reason字段或理由为空(必填)"
627
+ )
566
628
  return False, error_details
567
-
629
+
568
630
  return True, []
569
631
 
570
632
 
571
633
  def extract_classified_gids(cluster_items: List[Dict]) -> set:
572
634
  """从聚类结果中提取所有已分类的gid
573
-
635
+
574
636
  注意:此函数假设格式验证已经通过,所有gid都是有效的整数。
575
637
  如果遇到格式错误的gid,会记录警告但不会抛出异常(因为格式验证应该已经捕获了这些问题)。
576
638
  """
@@ -586,7 +648,9 @@ def extract_classified_gids(cluster_items: List[Dict]) -> set:
586
648
  except (ValueError, TypeError):
587
649
  # 理论上不应该到达这里(格式验证应该已经捕获),但如果到达了,记录警告
588
650
  try:
589
- typer.secho(f"[jarvis-sec] 警告:在提取gid时遇到格式错误(值={x},类型={type(x).__name__}),这不应该发生(格式验证应该已捕获)", fg=typer.colors.YELLOW)
651
+ PrettyOutput.auto_print(
652
+ f"[jarvis-sec] 警告:在提取gid时遇到格式错误(值={x},类型={type(x).__name__}),这不应该发生(格式验证应该已捕获)",
653
+ )
590
654
  except Exception:
591
655
  pass
592
656
  continue
@@ -608,9 +672,14 @@ def build_cluster_retry_task(
608
672
  if missing_gids:
609
673
  missing_gids_list = sorted(list(missing_gids))
610
674
  missing_count = len(missing_gids)
611
- retry_task += f"\n\n**遗漏的gid(共{missing_count}个,必须被分类):**\n" + ", ".join(str(gid) for gid in missing_gids_list)
675
+ retry_task += (
676
+ f"\n\n**遗漏的gid(共{missing_count}个,必须被分类):**\n"
677
+ + ", ".join(str(gid) for gid in missing_gids_list)
678
+ )
612
679
  if error_details:
613
- retry_task += "\n\n**格式错误:**\n" + "\n".join(f"- {detail}" for detail in error_details)
680
+ retry_task += "\n\n**格式错误:**\n" + "\n".join(
681
+ f"- {detail}" for detail in error_details
682
+ )
614
683
  return retry_task
615
684
 
616
685
 
@@ -621,11 +690,17 @@ def build_cluster_error_guidance(
621
690
  """构建聚类错误指导信息"""
622
691
  error_guidance = ""
623
692
  if error_details:
624
- error_guidance = "\n\n**格式错误详情(请根据以下错误修复输出格式):**\n" + "\n".join(f"- {detail}" for detail in error_details)
693
+ error_guidance = (
694
+ "\n\n**格式错误详情(请根据以下错误修复输出格式):**\n"
695
+ + "\n".join(f"- {detail}" for detail in error_details)
696
+ )
625
697
  if missing_gids:
626
698
  missing_gids_list = sorted(list(missing_gids))
627
699
  missing_count = len(missing_gids)
628
- error_guidance += f"\n\n**完整性错误:遗漏了 {missing_count} 个 gid,这些 gid 必须被分类:**\n" + ", ".join(str(gid) for gid in missing_gids_list)
700
+ error_guidance += (
701
+ f"\n\n**完整性错误:遗漏了 {missing_count} 个 gid,这些 gid 必须被分类:**\n"
702
+ + ", ".join(str(gid) for gid in missing_gids_list)
703
+ )
629
704
  return error_guidance
630
705
 
631
706
 
@@ -643,11 +718,13 @@ def run_cluster_agent_direct_model(
643
718
  error_guidance = build_cluster_error_guidance(error_details, missing_gids)
644
719
  full_prompt = f"{retry_task}{error_guidance}\n\n{cluster_summary_prompt}"
645
720
  try:
646
- response = cluster_agent.model.chat_until_success(full_prompt) # type: ignore
721
+ response = cluster_agent.model.chat_until_success(full_prompt)
647
722
  _cluster_summary["text"] = response
648
723
  except Exception as e:
649
724
  try:
650
- typer.secho(f"[jarvis-sec] 直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
725
+ PrettyOutput.auto_print(
726
+ f"[jarvis-sec] 直接模型调用失败: {e},回退到 run()",
727
+ )
651
728
  except Exception:
652
729
  pass
653
730
  cluster_agent.run(cluster_task)
@@ -661,12 +738,14 @@ def validate_cluster_result(
661
738
  """验证聚类结果格式"""
662
739
  if parse_error:
663
740
  error_details = [f"JSON解析失败: {parse_error}"]
664
- typer.secho(f"[jarvis-sec] JSON解析失败: {parse_error}", fg=typer.colors.YELLOW)
741
+ PrettyOutput.auto_print(f"[jarvis-sec] JSON解析失败: {parse_error}")
665
742
  return False, error_details
666
743
  else:
667
- valid, error_details = validate_cluster_format(cluster_items)
744
+ valid, error_details = validate_cluster_format(cluster_items or [])
668
745
  if not valid:
669
- typer.secho(f"[jarvis-sec] 聚类结果格式无效({'; '.join(error_details)}),重试第 {attempt} 次(使用直接模型调用)", fg=typer.colors.YELLOW)
746
+ PrettyOutput.auto_print(
747
+ f"[jarvis-sec] 聚类结果格式无效({'; '.join(error_details)}),重试第 {attempt} 次(使用直接模型调用)",
748
+ )
670
749
  return valid, error_details
671
750
 
672
751
 
@@ -679,12 +758,16 @@ def check_cluster_completeness(
679
758
  classified_gids = extract_classified_gids(cluster_items)
680
759
  missing_gids = input_gids - classified_gids
681
760
  if not missing_gids:
682
- typer.secho(f"[jarvis-sec] 聚类完整性校验通过,所有gid已分类(共尝试 {attempt} 次)", fg=typer.colors.GREEN)
761
+ PrettyOutput.auto_print(
762
+ f"[jarvis-sec] 聚类完整性校验通过,所有gid已分类(共尝试 {attempt} 次)",
763
+ )
683
764
  return True, set()
684
765
  else:
685
766
  missing_gids_list = sorted(list(missing_gids))
686
767
  missing_count = len(missing_gids)
687
- typer.secho(f"[jarvis-sec] 聚类完整性校验失败:遗漏的gid: {missing_gids_list}({missing_count}个),重试第 {attempt} 次(使用直接模型调用)", fg=typer.colors.YELLOW)
768
+ PrettyOutput.auto_print(
769
+ f"[jarvis-sec] 聚类完整性校验失败:遗漏的gid: {missing_gids_list}({missing_count}个),重试第 {attempt} 次(使用直接模型调用)",
770
+ )
688
771
  return False, missing_gids
689
772
 
690
773
 
@@ -704,13 +787,13 @@ def run_cluster_agent_with_retry(
704
787
  _attempt = 0
705
788
  use_direct_model = False
706
789
  error_details: List[str] = []
707
- missing_gids = set()
790
+ missing_gids: set[str] = set()
708
791
  consecutive_failures = 0 # 连续失败次数
709
-
792
+
710
793
  while True:
711
794
  _attempt += 1
712
795
  _cluster_summary["text"] = ""
713
-
796
+
714
797
  if use_direct_model:
715
798
  run_cluster_agent_direct_model(
716
799
  cluster_agent,
@@ -724,26 +807,32 @@ def run_cluster_agent_with_retry(
724
807
  else:
725
808
  # 第一次使用 run(),让 Agent 完整运行(可能使用工具)
726
809
  cluster_agent.run(cluster_task)
727
-
810
+
728
811
  cluster_summary_text = _cluster_summary.get("text", "")
729
812
  # 调试:如果解析失败,输出摘要文本的前500个字符用于调试
730
813
  cluster_items, parse_error = parse_clusters_from_text(cluster_summary_text)
731
-
814
+
732
815
  # 如果解析失败且是第一次尝试,输出调试信息
733
816
  if parse_error and _attempt == 1:
734
817
  preview = cluster_summary_text[:500] if cluster_summary_text else "(空)"
735
818
  try:
736
- typer.secho(f"[jarvis-sec] 调试:摘要文本预览(前500字符): {preview}", fg=typer.colors.CYAN, err=True)
819
+ PrettyOutput.auto_print(
820
+ f"[jarvis-sec] 调试:摘要文本预览(前500字符): {preview}",
821
+ )
737
822
  except Exception:
738
823
  pass
739
-
824
+
740
825
  # 校验结构
741
- valid, error_details = validate_cluster_result(cluster_items, parse_error, _attempt)
742
-
826
+ valid, error_details = validate_cluster_result(
827
+ cluster_items, parse_error, _attempt
828
+ )
829
+
743
830
  # 完整性校验:检查所有输入的gid是否都被分类
744
831
  missing_gids = set()
745
832
  if valid and cluster_items:
746
- is_complete, missing_gids = check_cluster_completeness(cluster_items, input_gids, _attempt)
833
+ is_complete, missing_gids = check_cluster_completeness(
834
+ cluster_items, input_gids, _attempt
835
+ )
747
836
  if is_complete:
748
837
  return cluster_items, None, False
749
838
  else:
@@ -752,15 +841,17 @@ def run_cluster_agent_with_retry(
752
841
  consecutive_failures += 1
753
842
  else:
754
843
  consecutive_failures += 1
755
-
844
+
756
845
  # 如果连续失败5次,且提供了创建agent的函数,则返回需要重新创建agent的标志
757
846
  if not valid and consecutive_failures >= 5 and create_agent_func is not None:
758
847
  try:
759
- typer.secho(f"[jarvis-sec] 连续失败 {consecutive_failures} 次,需要重新创建agent", fg=typer.colors.YELLOW)
848
+ PrettyOutput.auto_print(
849
+ f"[jarvis-sec] 连续失败 {consecutive_failures} 次,需要重新创建agent",
850
+ )
760
851
  except Exception:
761
852
  pass
762
853
  return None, parse_error or "连续失败5次", True
763
-
854
+
764
855
  if not valid:
765
856
  use_direct_model = True
766
857
  cluster_items = None
@@ -788,11 +879,11 @@ def process_cluster_results(
788
879
  pass
789
880
  except Exception:
790
881
  gid_to_item = {}
791
-
882
+
792
883
  _merged_count = 0
793
884
  _invalid_count = 0
794
885
  classified_gids_final = set()
795
-
886
+
796
887
  for cl in cluster_items:
797
888
  verification = str(cl.get("verification", "")).strip()
798
889
  raw_gids = cl.get("gids", [])
@@ -807,61 +898,71 @@ def process_cluster_results(
807
898
  classified_gids_final.add(xi)
808
899
  except Exception:
809
900
  pass
810
-
901
+
811
902
  members: List[Dict] = []
812
903
  for k in norm_keys:
813
- it = gid_to_item.get(k)
814
- if it:
815
- it["verify"] = verification
816
- members.append(it)
817
-
904
+ item = gid_to_item.get(k)
905
+ if item is not None:
906
+ item["verify"] = verification
907
+ members.append(item)
908
+
818
909
  # 如果标记为无效,收集到复核列表
819
910
  if is_invalid:
820
911
  _invalid_count += 1
821
912
  invalid_gids = [m.get("gid") for m in members]
822
913
  invalid_reason = str(cl.get("invalid_reason", "")).strip()
823
914
  try:
824
- typer.secho(f"[jarvis-sec] 聚类阶段判定为无效(gids={invalid_gids}),将提交复核Agent验证", fg=typer.colors.BLUE)
915
+ PrettyOutput.auto_print(
916
+ f"[jarvis-sec] 聚类阶段判定为无效(gids={invalid_gids}),将提交复核Agent验证",
917
+ )
825
918
  except Exception:
826
919
  pass
827
- invalid_clusters_for_review.append({
828
- "file": file,
829
- "batch_index": chunk_idx,
830
- "gids": invalid_gids,
831
- "verification": verification,
832
- "invalid_reason": invalid_reason,
833
- "members": members,
834
- "count": len(members),
835
- })
836
- _progress_append({
837
- "event": "cluster_invalid",
838
- "file": file,
839
- "batch_index": chunk_idx,
840
- "gids": invalid_gids,
841
- "verification": verification,
842
- "count": len(members),
843
- })
844
- cluster_records.append({
845
- "file": file,
846
- "verification": verification,
847
- "gids": invalid_gids,
848
- "count": len(members),
849
- "batch_index": chunk_idx,
850
- "is_invalid": True,
851
- "invalid_reason": invalid_reason,
852
- })
920
+ invalid_clusters_for_review.append(
921
+ {
922
+ "file": file,
923
+ "batch_index": chunk_idx,
924
+ "gids": invalid_gids,
925
+ "verification": verification,
926
+ "invalid_reason": invalid_reason,
927
+ "members": members,
928
+ "count": len(members),
929
+ }
930
+ )
931
+ _progress_append(
932
+ {
933
+ "event": "cluster_invalid",
934
+ "file": file,
935
+ "batch_index": chunk_idx,
936
+ "gids": invalid_gids,
937
+ "verification": verification,
938
+ "count": len(members),
939
+ }
940
+ )
941
+ cluster_records.append(
942
+ {
943
+ "file": file,
944
+ "verification": verification,
945
+ "gids": invalid_gids,
946
+ "count": len(members),
947
+ "batch_index": chunk_idx,
948
+ "is_invalid": True,
949
+ "invalid_reason": invalid_reason,
950
+ }
951
+ )
853
952
  elif members:
854
953
  _merged_count += 1
855
954
  cluster_batches.append(members)
856
- cluster_records.append({
857
- "file": file,
858
- "verification": verification,
859
- "gids": [m.get("gid") for m in members],
860
- "count": len(members),
861
- "batch_index": chunk_idx,
862
- "is_invalid": False,
863
- })
864
-
955
+ cluster_records.append(
956
+ {
957
+ "file": file,
958
+ "verification": verification,
959
+ "gids": [m.get("gid") for m in members],
960
+ "count": len(members),
961
+ "batch_index": chunk_idx,
962
+ "is_invalid": False,
963
+ }
964
+ )
965
+
865
966
  return _merged_count, _invalid_count
866
967
 
867
968
 
@@ -881,14 +982,16 @@ def supplement_missing_gids(
881
982
  default_verification = f"验证候选 {missing_gid} 的安全风险"
882
983
  missing_item["verify"] = default_verification
883
984
  cluster_batches.append([missing_item])
884
- cluster_records.append({
885
- "file": file,
886
- "verification": default_verification,
887
- "gids": [missing_gid],
888
- "count": 1,
889
- "batch_index": chunk_idx,
890
- "note": "完整性校验补充的遗漏gid",
891
- })
985
+ cluster_records.append(
986
+ {
987
+ "file": file,
988
+ "verification": default_verification,
989
+ "gids": [missing_gid],
990
+ "count": 1,
991
+ "batch_index": chunk_idx,
992
+ "note": "完整性校验补充的遗漏gid",
993
+ }
994
+ )
892
995
  supplemented_count += 1
893
996
  return supplemented_count
894
997
 
@@ -958,35 +1061,39 @@ def process_cluster_chunk(
958
1061
  """处理单个聚类批次"""
959
1062
  if not chunk:
960
1063
  return
961
-
1064
+
962
1065
  pending_in_file_with_ids = list(chunk)
963
-
1066
+
964
1067
  # 记录聚类批次开始
965
- _progress_append({
966
- "event": "cluster_status",
967
- "status": "running",
968
- "file": file,
969
- "batch_index": chunk_idx,
970
- "total_in_batch": len(pending_in_file_with_ids),
971
- })
972
-
1068
+ _progress_append(
1069
+ {
1070
+ "event": "cluster_status",
1071
+ "status": "running",
1072
+ "file": file,
1073
+ "batch_index": chunk_idx,
1074
+ "total_in_batch": len(pending_in_file_with_ids),
1075
+ }
1076
+ )
1077
+
973
1078
  # 创建聚类Agent
974
- cluster_agent = create_cluster_agent(file, chunk_idx, llm_group, force_save_memory=force_save_memory)
975
-
1079
+ cluster_agent = create_cluster_agent(
1080
+ file, chunk_idx, llm_group, force_save_memory=force_save_memory
1081
+ )
1082
+
976
1083
  # 构建任务上下文
977
1084
  cluster_task = build_cluster_task(pending_in_file_with_ids, entry_path, file, langs)
978
-
1085
+
979
1086
  # 提取输入gid
980
1087
  input_gids = extract_input_gids(pending_in_file_with_ids)
981
-
1088
+
982
1089
  # 运行聚类Agent(支持重新创建agent,不限次数)
983
1090
  cluster_summary_prompt = get_cluster_summary_prompt()
984
1091
  recreate_count = 0
985
-
1092
+
986
1093
  while True:
987
1094
  # 订阅摘要事件(每次重新创建agent后需要重新订阅)
988
1095
  cluster_summary = subscribe_summary_event(cluster_agent)
989
-
1096
+
990
1097
  cluster_items, parse_error, need_recreate = run_cluster_agent_with_retry(
991
1098
  cluster_agent,
992
1099
  cluster_task,
@@ -994,28 +1101,34 @@ def process_cluster_chunk(
994
1101
  input_gids,
995
1102
  file,
996
1103
  cluster_summary,
997
- create_agent_func=lambda: create_cluster_agent(file, chunk_idx, llm_group, force_save_memory=force_save_memory),
1104
+ create_agent_func=lambda: create_cluster_agent(
1105
+ file, chunk_idx, llm_group, force_save_memory=force_save_memory
1106
+ ),
998
1107
  )
999
-
1108
+
1000
1109
  # 如果不需要重新创建agent,退出循环
1001
1110
  if not need_recreate:
1002
1111
  break
1003
-
1112
+
1004
1113
  # 需要重新创建agent(不限次数)
1005
1114
  recreate_count += 1
1006
1115
  try:
1007
- typer.secho(f"[jarvis-sec] 重新创建聚类Agent(第 {recreate_count} 次)", fg=typer.colors.MAGENTA)
1116
+ PrettyOutput.auto_print(
1117
+ f"[jarvis-sec] 重新创建聚类Agent(第 {recreate_count} 次)",
1118
+ )
1008
1119
  except Exception:
1009
1120
  pass
1010
- cluster_agent = create_cluster_agent(file, chunk_idx, llm_group, force_save_memory=force_save_memory)
1011
-
1121
+ cluster_agent = create_cluster_agent(
1122
+ file, chunk_idx, llm_group, force_save_memory=force_save_memory
1123
+ )
1124
+
1012
1125
  # 处理聚类结果
1013
1126
  _merged_count = 0
1014
1127
  _invalid_count = 0
1015
-
1128
+
1016
1129
  if isinstance(cluster_items, list) and cluster_items:
1017
1130
  gid_to_item = build_gid_to_item_mapping(pending_in_file_with_ids)
1018
-
1131
+
1019
1132
  _merged_count, _invalid_count = process_cluster_results(
1020
1133
  cluster_items,
1021
1134
  pending_in_file_with_ids,
@@ -1026,11 +1139,13 @@ def process_cluster_chunk(
1026
1139
  invalid_clusters_for_review,
1027
1140
  _progress_append,
1028
1141
  )
1029
-
1142
+
1030
1143
  classified_gids_final = extract_classified_gids(cluster_items)
1031
1144
  missing_gids_final = input_gids - classified_gids_final
1032
1145
  if missing_gids_final:
1033
- typer.secho(f"[jarvis-sec] 警告:仍有遗漏的gid {sorted(list(missing_gids_final))},将为每个遗漏的gid创建单独聚类", fg=typer.colors.YELLOW)
1146
+ PrettyOutput.auto_print(
1147
+ f"[jarvis-sec] 警告:仍有遗漏的gid {sorted(list(missing_gids_final))},将为每个遗漏的gid创建单独聚类",
1148
+ )
1034
1149
  supplemented_count = supplement_missing_gids(
1035
1150
  missing_gids_final,
1036
1151
  gid_to_item,
@@ -1043,9 +1158,11 @@ def process_cluster_chunk(
1043
1158
  else:
1044
1159
  # 聚类结果为空或None:为所有输入的gid创建单独聚类(保守策略)
1045
1160
  if pending_in_file_with_ids:
1046
- typer.secho(f"[jarvis-sec] 警告:聚类结果为空或None(文件={file},批次={chunk_idx}),为所有gid创建单独聚类", fg=typer.colors.YELLOW)
1161
+ PrettyOutput.auto_print(
1162
+ f"[jarvis-sec] 警告:聚类结果为空或None(文件={file},批次={chunk_idx}),为所有gid创建单独聚类",
1163
+ )
1047
1164
  gid_to_item_fallback = build_gid_to_item_mapping(pending_in_file_with_ids)
1048
-
1165
+
1049
1166
  _merged_count = supplement_missing_gids(
1050
1167
  input_gids,
1051
1168
  gid_to_item_fallback,
@@ -1058,25 +1175,30 @@ def process_cluster_chunk(
1058
1175
  else:
1059
1176
  _merged_count = 0
1060
1177
  _invalid_count = 0
1061
-
1178
+
1062
1179
  # 标记聚类批次完成
1063
- _progress_append({
1064
- "event": "cluster_status",
1065
- "status": "done",
1066
- "file": file,
1067
- "batch_index": chunk_idx,
1068
- "clusters_count": _merged_count,
1069
- "invalid_clusters_count": _invalid_count,
1070
- })
1180
+ _progress_append(
1181
+ {
1182
+ "event": "cluster_status",
1183
+ "status": "done",
1184
+ "file": file,
1185
+ "batch_index": chunk_idx,
1186
+ "clusters_count": _merged_count,
1187
+ "invalid_clusters_count": _invalid_count,
1188
+ }
1189
+ )
1071
1190
  if _invalid_count > 0:
1072
1191
  try:
1073
- typer.secho(f"[jarvis-sec] 聚类批次完成: 有效聚类={_merged_count},无效聚类={_invalid_count}(已跳过)", fg=typer.colors.GREEN)
1192
+ PrettyOutput.auto_print(
1193
+ f"[jarvis-sec] 聚类批次完成: 有效聚类={_merged_count},无效聚类={_invalid_count}(已跳过)",
1194
+ )
1074
1195
  except Exception:
1075
1196
  pass
1076
-
1197
+
1077
1198
  # 写入当前批次的聚类结果
1078
1199
  current_batch_records = [
1079
- rec for rec in cluster_records
1200
+ rec
1201
+ for rec in cluster_records
1080
1202
  if rec.get("file") == file and rec.get("batch_index") == chunk_idx
1081
1203
  ]
1082
1204
  if current_batch_records:
@@ -1116,7 +1238,7 @@ def process_file_clustering(
1116
1238
  pending_in_file = filter_pending_items(items, clustered_gids)
1117
1239
  if not pending_in_file:
1118
1240
  return
1119
-
1241
+
1120
1242
  # 优化:如果文件只有一个告警,跳过聚类,直接写入
1121
1243
  if len(pending_in_file) == 1:
1122
1244
  single_item = pending_in_file[0]
@@ -1131,11 +1253,15 @@ def process_file_clustering(
1131
1253
  _write_cluster_batch_snapshot,
1132
1254
  )
1133
1255
  return
1134
-
1256
+
1135
1257
  # 将该文件的告警按 cluster_limit 分批
1136
- _limit = cluster_limit if isinstance(cluster_limit, int) and cluster_limit > 0 else 50
1137
- _chunks: List[List[Dict]] = [pending_in_file[i:i + _limit] for i in range(0, len(pending_in_file), _limit)]
1138
-
1258
+ _limit = (
1259
+ cluster_limit if isinstance(cluster_limit, int) and cluster_limit > 0 else 50
1260
+ )
1261
+ _chunks: List[List[Dict]] = [
1262
+ pending_in_file[i : i + _limit] for i in range(0, len(pending_in_file), _limit)
1263
+ ]
1264
+
1139
1265
  # 处理每个批次
1140
1266
  for _chunk_idx, _chunk in enumerate(_chunks, start=1):
1141
1267
  process_cluster_chunk(
@@ -1163,30 +1289,42 @@ def initialize_clustering_context(
1163
1289
  compact_candidates: List[Dict],
1164
1290
  sec_dir: Path,
1165
1291
  _progress_append,
1166
- ) -> tuple[Dict[str, List[Dict]], Dict, tuple, List[List[Dict]], List[Dict], List[Dict], set]:
1292
+ ) -> tuple[
1293
+ Dict[str, List[Dict]], Dict, tuple, List[List[Dict]], List[Dict], List[Dict], set
1294
+ ]:
1167
1295
  """初始化聚类上下文,返回(文件分组, 已有聚类, 快照写入函数, 聚类批次, 聚类记录, 无效聚类, 已聚类gid)"""
1168
1296
  # 按文件分组构建待聚类集合
1169
1297
  _file_groups = group_candidates_by_file(compact_candidates)
1170
-
1298
+
1171
1299
  cluster_batches: List[List[Dict]] = []
1172
1300
  cluster_records: List[Dict] = []
1173
1301
  invalid_clusters_for_review: List[Dict] = []
1174
-
1302
+
1175
1303
  # 读取已有聚类报告以支持断点
1176
- _existing_clusters, _completed_cluster_batches, _reviewed_invalid_gids = load_existing_clusters(
1177
- sec_dir
1178
- )
1179
-
1304
+ (
1305
+ _existing_clusters,
1306
+ _completed_cluster_batches,
1307
+ _reviewed_invalid_gids,
1308
+ ) = load_existing_clusters(sec_dir)
1309
+
1180
1310
  # 创建快照写入函数
1181
- _write_cluster_batch_snapshot, _write_cluster_report_snapshot = create_cluster_snapshot_writer(
1311
+ (
1312
+ _write_cluster_batch_snapshot,
1313
+ _write_cluster_report_snapshot,
1314
+ ) = create_cluster_snapshot_writer(
1182
1315
  sec_dir, cluster_records, compact_candidates, _progress_append
1183
1316
  )
1184
-
1317
+
1185
1318
  # 从断点恢复聚类结果
1186
- cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids = restore_clusters_from_checkpoint(
1319
+ (
1320
+ cluster_batches,
1321
+ cluster_records,
1322
+ invalid_clusters_for_review,
1323
+ clustered_gids,
1324
+ ) = restore_clusters_from_checkpoint(
1187
1325
  _existing_clusters, _file_groups, _reviewed_invalid_gids
1188
1326
  )
1189
-
1327
+
1190
1328
  return (
1191
1329
  _file_groups,
1192
1330
  _existing_clusters,
@@ -1206,12 +1344,16 @@ def check_unclustered_gids(
1206
1344
  unclustered_gids = all_candidate_gids - clustered_gids
1207
1345
  if unclustered_gids:
1208
1346
  try:
1209
- typer.secho(f"[jarvis-sec] 发现 {len(unclustered_gids)} 个未聚类的 gid,将进行聚类", fg=typer.colors.YELLOW)
1347
+ PrettyOutput.auto_print(
1348
+ f"[jarvis-sec] 发现 {len(unclustered_gids)} 个未聚类的 gid,将进行聚类",
1349
+ )
1210
1350
  except Exception:
1211
1351
  pass
1212
1352
  else:
1213
1353
  try:
1214
- typer.secho(f"[jarvis-sec] 所有 {len(all_candidate_gids)} 个候选已聚类,跳过聚类阶段", fg=typer.colors.GREEN)
1354
+ PrettyOutput.auto_print(
1355
+ f"[jarvis-sec] 所有 {len(all_candidate_gids)} 个候选已聚类,跳过聚类阶段",
1356
+ )
1215
1357
  except Exception:
1216
1358
  pass
1217
1359
  return unclustered_gids
@@ -1239,16 +1381,18 @@ def execute_clustering_for_files(
1239
1381
  status_mgr.update_clustering(
1240
1382
  current_file=0,
1241
1383
  total_files=total_files_to_cluster,
1242
- message="开始聚类分析..."
1384
+ message="开始聚类分析...",
1243
1385
  )
1244
1386
  for _file_idx, (_file, _items) in enumerate(file_groups.items(), start=1):
1245
- typer.secho(f"\n[jarvis-sec] 聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}", fg=typer.colors.CYAN)
1387
+ PrettyOutput.auto_print(
1388
+ f"\n[jarvis-sec] 聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}",
1389
+ )
1246
1390
  # 更新当前文件进度
1247
1391
  status_mgr.update_clustering(
1248
1392
  current_file=_file_idx,
1249
1393
  total_files=total_files_to_cluster,
1250
1394
  file_name=_file,
1251
- message=f"正在聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}"
1395
+ message=f"正在聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}",
1252
1396
  )
1253
1397
  # 使用子函数处理文件聚类
1254
1398
  process_file_clustering(
@@ -1277,13 +1421,15 @@ def record_clustering_completion(
1277
1421
  """记录聚类阶段完成"""
1278
1422
  try:
1279
1423
  _cluster_path = sec_dir / "cluster_report.jsonl"
1280
- _progress_append({
1281
- "event": "cluster_report_written",
1282
- "path": str(_cluster_path),
1283
- "clusters": len(cluster_records),
1284
- "total_candidates": len(compact_candidates),
1285
- "note": "每个批次已增量保存,无需重写整个文件",
1286
- })
1424
+ _progress_append(
1425
+ {
1426
+ "event": "cluster_report_written",
1427
+ "path": str(_cluster_path),
1428
+ "clusters": len(cluster_records),
1429
+ "total_candidates": len(compact_candidates),
1430
+ "note": "每个批次已增量保存,无需重写整个文件",
1431
+ }
1432
+ )
1287
1433
  except Exception:
1288
1434
  pass
1289
1435
 
@@ -1294,7 +1440,7 @@ def fallback_to_file_based_batches(
1294
1440
  ) -> List[List[Dict]]:
1295
1441
  """若聚类失败或空,则回退为按文件一次处理"""
1296
1442
  fallback_batches: List[List[Dict]] = []
1297
-
1443
+
1298
1444
  # 收集所有未聚类的 gid(从所有候选 gid 中排除已聚类的)
1299
1445
  all_gids_in_file_groups = collect_candidate_gids(file_groups)
1300
1446
  gid_to_item_fallback: Dict[int, Dict] = {}
@@ -1306,7 +1452,7 @@ def fallback_to_file_based_batches(
1306
1452
  gid_to_item_fallback[_gid] = c
1307
1453
  except Exception:
1308
1454
  pass
1309
-
1455
+
1310
1456
  # 如果还有未聚类的 gid,按文件分组创建批次
1311
1457
  if all_gids_in_file_groups:
1312
1458
  # 收集已聚类的 gid(从 cluster_report.jsonl)
@@ -1323,23 +1469,24 @@ def fallback_to_file_based_batches(
1323
1469
  clustered_gids_fallback.add(_gid_int)
1324
1470
  except Exception:
1325
1471
  pass
1326
-
1472
+
1327
1473
  unclustered_gids_fallback = all_gids_in_file_groups - clustered_gids_fallback
1328
1474
  if unclustered_gids_fallback:
1329
1475
  # 按文件分组未聚类的 gid
1330
1476
  from collections import defaultdict
1477
+
1331
1478
  unclustered_by_file: Dict[str, List[Dict]] = defaultdict(list)
1332
1479
  for _gid in unclustered_gids_fallback:
1333
1480
  item = gid_to_item_fallback.get(_gid)
1334
1481
  if item:
1335
1482
  file_key = str(item.get("file") or "")
1336
1483
  unclustered_by_file[file_key].append(item)
1337
-
1484
+
1338
1485
  # 为每个文件创建批次
1339
1486
  for _file, _items in unclustered_by_file.items():
1340
1487
  if _items:
1341
1488
  fallback_batches.append(_items)
1342
-
1489
+
1343
1490
  return fallback_batches
1344
1491
 
1345
1492
 
@@ -1365,13 +1512,15 @@ def process_clustering_phase(
1365
1512
  invalid_clusters_for_review,
1366
1513
  clustered_gids,
1367
1514
  ) = initialize_clustering_context(compact_candidates, sec_dir, _progress_append)
1368
-
1515
+
1369
1516
  # 收集所有候选的 gid(用于检查未聚类的 gid)
1370
1517
  all_candidate_gids_in_clustering = collect_candidate_gids(_file_groups)
1371
-
1518
+
1372
1519
  # 检查是否有未聚类的 gid
1373
- unclustered_gids = check_unclustered_gids(all_candidate_gids_in_clustering, clustered_gids)
1374
-
1520
+ unclustered_gids = check_unclustered_gids(
1521
+ all_candidate_gids_in_clustering, clustered_gids
1522
+ )
1523
+
1375
1524
  # 如果有未聚类的 gid,继续执行聚类
1376
1525
  if unclustered_gids:
1377
1526
  execute_clustering_for_files(
@@ -1389,12 +1538,15 @@ def process_clustering_phase(
1389
1538
  _write_cluster_batch_snapshot,
1390
1539
  force_save_memory=force_save_memory,
1391
1540
  )
1392
-
1541
+
1393
1542
  # 记录聚类阶段完成
1394
- record_clustering_completion(sec_dir, cluster_records, compact_candidates, _progress_append)
1395
-
1543
+ record_clustering_completion(
1544
+ sec_dir, cluster_records, compact_candidates, _progress_append
1545
+ )
1546
+
1396
1547
  # 复核Agent:验证所有标记为无效的聚类(需要从review模块导入)
1397
1548
  from jarvis.jarvis_sec.review import process_review_phase
1549
+
1398
1550
  cluster_batches = process_review_phase(
1399
1551
  invalid_clusters_for_review,
1400
1552
  entry_path,
@@ -1405,35 +1557,41 @@ def process_clustering_phase(
1405
1557
  cluster_batches,
1406
1558
  sec_dir,
1407
1559
  )
1408
-
1560
+
1409
1561
  # 若聚类失败或空,则回退为"按文件一次处理"
1410
1562
  if not cluster_batches:
1411
- fallback_batches = fallback_to_file_based_batches(_file_groups, _existing_clusters)
1563
+ fallback_batches = fallback_to_file_based_batches(
1564
+ _file_groups, _existing_clusters
1565
+ )
1412
1566
  cluster_batches.extend(fallback_batches)
1413
-
1567
+
1414
1568
  # 完整性检查:确保所有候选的 gid 都已被聚类
1415
1569
  # 使用新的文件管理器进行校验
1416
1570
  is_complete, missing_gids_final = validate_clustering_completeness(sec_dir)
1417
-
1571
+
1418
1572
  if missing_gids_final:
1419
1573
  # 如果还有遗漏的gid,说明恢复逻辑有问题,需要重新聚类
1420
1574
  try:
1421
1575
  missing_count = len(missing_gids_final)
1422
1576
  if missing_count <= 20:
1423
- typer.secho(f"[jarvis-sec] 警告:发现 {missing_count} 个遗漏的gid(恢复逻辑可能有问题): {sorted(list(missing_gids_final))}", fg=typer.colors.RED)
1577
+ PrettyOutput.auto_print(
1578
+ f"[jarvis-sec] 警告:发现 {missing_count} 个遗漏的gid(恢复逻辑可能有问题): {sorted(list(missing_gids_final))}",
1579
+ )
1424
1580
  else:
1425
1581
  missing_list = sorted(list(missing_gids_final))
1426
1582
  display_list = missing_list[:10] + ["..."] + missing_list[-10:]
1427
- typer.secho(f"[jarvis-sec] 警告:发现 {missing_count} 个遗漏的gid(恢复逻辑可能有问题): {display_list}", fg=typer.colors.RED)
1428
-
1583
+ PrettyOutput.auto_print(
1584
+ f"[jarvis-sec] 警告:发现 {missing_count} 个遗漏的gid(恢复逻辑可能有问题): {display_list}",
1585
+ )
1586
+
1429
1587
  except Exception:
1430
1588
  pass
1431
-
1589
+
1432
1590
  # 清理之前创建的单独聚类(避免分析工作量激增)
1433
1591
  cluster_batches = filter_single_gid_clusters(
1434
1592
  cluster_batches,
1435
1593
  sec_dir,
1436
1594
  _progress_append,
1437
1595
  )
1438
-
1596
+
1439
1597
  return cluster_batches, invalid_clusters_for_review