jarvis-ai-assistant 0.7.0__py3-none-any.whl → 0.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. jarvis/__init__.py +1 -1
  2. jarvis/jarvis_agent/__init__.py +243 -139
  3. jarvis/jarvis_agent/agent_manager.py +5 -10
  4. jarvis/jarvis_agent/builtin_input_handler.py +2 -6
  5. jarvis/jarvis_agent/config_editor.py +2 -7
  6. jarvis/jarvis_agent/event_bus.py +82 -12
  7. jarvis/jarvis_agent/file_context_handler.py +265 -15
  8. jarvis/jarvis_agent/file_methodology_manager.py +3 -4
  9. jarvis/jarvis_agent/jarvis.py +113 -98
  10. jarvis/jarvis_agent/language_extractors/__init__.py +57 -0
  11. jarvis/jarvis_agent/language_extractors/c_extractor.py +21 -0
  12. jarvis/jarvis_agent/language_extractors/cpp_extractor.py +21 -0
  13. jarvis/jarvis_agent/language_extractors/go_extractor.py +21 -0
  14. jarvis/jarvis_agent/language_extractors/java_extractor.py +84 -0
  15. jarvis/jarvis_agent/language_extractors/javascript_extractor.py +79 -0
  16. jarvis/jarvis_agent/language_extractors/python_extractor.py +21 -0
  17. jarvis/jarvis_agent/language_extractors/rust_extractor.py +21 -0
  18. jarvis/jarvis_agent/language_extractors/typescript_extractor.py +84 -0
  19. jarvis/jarvis_agent/language_support_info.py +486 -0
  20. jarvis/jarvis_agent/main.py +6 -12
  21. jarvis/jarvis_agent/memory_manager.py +7 -16
  22. jarvis/jarvis_agent/methodology_share_manager.py +10 -16
  23. jarvis/jarvis_agent/prompt_manager.py +1 -1
  24. jarvis/jarvis_agent/prompts.py +193 -171
  25. jarvis/jarvis_agent/protocols.py +8 -12
  26. jarvis/jarvis_agent/run_loop.py +77 -14
  27. jarvis/jarvis_agent/session_manager.py +2 -3
  28. jarvis/jarvis_agent/share_manager.py +12 -21
  29. jarvis/jarvis_agent/shell_input_handler.py +1 -2
  30. jarvis/jarvis_agent/task_analyzer.py +26 -4
  31. jarvis/jarvis_agent/task_manager.py +11 -27
  32. jarvis/jarvis_agent/tool_executor.py +2 -3
  33. jarvis/jarvis_agent/tool_share_manager.py +12 -24
  34. jarvis/jarvis_agent/web_server.py +55 -20
  35. jarvis/jarvis_c2rust/__init__.py +5 -5
  36. jarvis/jarvis_c2rust/cli.py +461 -499
  37. jarvis/jarvis_c2rust/collector.py +45 -53
  38. jarvis/jarvis_c2rust/constants.py +26 -0
  39. jarvis/jarvis_c2rust/library_replacer.py +264 -132
  40. jarvis/jarvis_c2rust/llm_module_agent.py +162 -190
  41. jarvis/jarvis_c2rust/loaders.py +207 -0
  42. jarvis/jarvis_c2rust/models.py +28 -0
  43. jarvis/jarvis_c2rust/optimizer.py +1592 -395
  44. jarvis/jarvis_c2rust/transpiler.py +1722 -1064
  45. jarvis/jarvis_c2rust/utils.py +385 -0
  46. jarvis/jarvis_code_agent/build_validation_config.py +2 -3
  47. jarvis/jarvis_code_agent/code_agent.py +394 -320
  48. jarvis/jarvis_code_agent/code_analyzer/__init__.py +3 -0
  49. jarvis/jarvis_code_agent/code_analyzer/build_validator/base.py +4 -0
  50. jarvis/jarvis_code_agent/code_analyzer/build_validator/cmake.py +17 -2
  51. jarvis/jarvis_code_agent/code_analyzer/build_validator/fallback.py +3 -0
  52. jarvis/jarvis_code_agent/code_analyzer/build_validator/go.py +36 -4
  53. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_gradle.py +9 -0
  54. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_maven.py +9 -0
  55. jarvis/jarvis_code_agent/code_analyzer/build_validator/makefile.py +12 -1
  56. jarvis/jarvis_code_agent/code_analyzer/build_validator/nodejs.py +22 -5
  57. jarvis/jarvis_code_agent/code_analyzer/build_validator/python.py +57 -32
  58. jarvis/jarvis_code_agent/code_analyzer/build_validator/rust.py +62 -6
  59. jarvis/jarvis_code_agent/code_analyzer/build_validator/validator.py +8 -9
  60. jarvis/jarvis_code_agent/code_analyzer/context_manager.py +290 -5
  61. jarvis/jarvis_code_agent/code_analyzer/language_support.py +21 -0
  62. jarvis/jarvis_code_agent/code_analyzer/languages/__init__.py +21 -3
  63. jarvis/jarvis_code_agent/code_analyzer/languages/c_cpp_language.py +72 -4
  64. jarvis/jarvis_code_agent/code_analyzer/languages/go_language.py +35 -3
  65. jarvis/jarvis_code_agent/code_analyzer/languages/java_language.py +212 -0
  66. jarvis/jarvis_code_agent/code_analyzer/languages/javascript_language.py +254 -0
  67. jarvis/jarvis_code_agent/code_analyzer/languages/python_language.py +52 -2
  68. jarvis/jarvis_code_agent/code_analyzer/languages/rust_language.py +73 -1
  69. jarvis/jarvis_code_agent/code_analyzer/languages/typescript_language.py +280 -0
  70. jarvis/jarvis_code_agent/code_analyzer/llm_context_recommender.py +306 -152
  71. jarvis/jarvis_code_agent/code_analyzer/structured_code.py +556 -0
  72. jarvis/jarvis_code_agent/code_analyzer/symbol_extractor.py +193 -18
  73. jarvis/jarvis_code_agent/code_analyzer/tree_sitter_extractor.py +18 -8
  74. jarvis/jarvis_code_agent/lint.py +258 -27
  75. jarvis/jarvis_code_agent/utils.py +0 -1
  76. jarvis/jarvis_code_analysis/code_review.py +19 -24
  77. jarvis/jarvis_data/config_schema.json +53 -26
  78. jarvis/jarvis_git_squash/main.py +4 -5
  79. jarvis/jarvis_git_utils/git_commiter.py +44 -49
  80. jarvis/jarvis_mcp/sse_mcp_client.py +20 -27
  81. jarvis/jarvis_mcp/stdio_mcp_client.py +11 -12
  82. jarvis/jarvis_mcp/streamable_mcp_client.py +15 -14
  83. jarvis/jarvis_memory_organizer/memory_organizer.py +55 -74
  84. jarvis/jarvis_methodology/main.py +32 -48
  85. jarvis/jarvis_multi_agent/__init__.py +79 -61
  86. jarvis/jarvis_multi_agent/main.py +3 -7
  87. jarvis/jarvis_platform/base.py +469 -199
  88. jarvis/jarvis_platform/human.py +7 -8
  89. jarvis/jarvis_platform/kimi.py +30 -36
  90. jarvis/jarvis_platform/openai.py +65 -27
  91. jarvis/jarvis_platform/registry.py +26 -10
  92. jarvis/jarvis_platform/tongyi.py +24 -25
  93. jarvis/jarvis_platform/yuanbao.py +31 -42
  94. jarvis/jarvis_platform_manager/main.py +66 -77
  95. jarvis/jarvis_platform_manager/service.py +8 -13
  96. jarvis/jarvis_rag/cli.py +49 -51
  97. jarvis/jarvis_rag/embedding_manager.py +13 -18
  98. jarvis/jarvis_rag/llm_interface.py +8 -9
  99. jarvis/jarvis_rag/query_rewriter.py +10 -21
  100. jarvis/jarvis_rag/rag_pipeline.py +24 -27
  101. jarvis/jarvis_rag/reranker.py +4 -5
  102. jarvis/jarvis_rag/retriever.py +28 -30
  103. jarvis/jarvis_sec/__init__.py +220 -3520
  104. jarvis/jarvis_sec/agents.py +143 -0
  105. jarvis/jarvis_sec/analysis.py +276 -0
  106. jarvis/jarvis_sec/cli.py +29 -6
  107. jarvis/jarvis_sec/clustering.py +1439 -0
  108. jarvis/jarvis_sec/file_manager.py +427 -0
  109. jarvis/jarvis_sec/parsers.py +73 -0
  110. jarvis/jarvis_sec/prompts.py +268 -0
  111. jarvis/jarvis_sec/report.py +83 -4
  112. jarvis/jarvis_sec/review.py +453 -0
  113. jarvis/jarvis_sec/utils.py +499 -0
  114. jarvis/jarvis_sec/verification.py +848 -0
  115. jarvis/jarvis_sec/workflow.py +7 -0
  116. jarvis/jarvis_smart_shell/main.py +38 -87
  117. jarvis/jarvis_stats/cli.py +1 -1
  118. jarvis/jarvis_stats/stats.py +7 -7
  119. jarvis/jarvis_stats/storage.py +15 -21
  120. jarvis/jarvis_tools/clear_memory.py +3 -20
  121. jarvis/jarvis_tools/cli/main.py +20 -23
  122. jarvis/jarvis_tools/edit_file.py +1066 -0
  123. jarvis/jarvis_tools/execute_script.py +42 -21
  124. jarvis/jarvis_tools/file_analyzer.py +6 -9
  125. jarvis/jarvis_tools/generate_new_tool.py +11 -20
  126. jarvis/jarvis_tools/lsp_client.py +1552 -0
  127. jarvis/jarvis_tools/methodology.py +2 -3
  128. jarvis/jarvis_tools/read_code.py +1525 -87
  129. jarvis/jarvis_tools/read_symbols.py +2 -3
  130. jarvis/jarvis_tools/read_webpage.py +7 -10
  131. jarvis/jarvis_tools/registry.py +370 -181
  132. jarvis/jarvis_tools/retrieve_memory.py +20 -19
  133. jarvis/jarvis_tools/rewrite_file.py +105 -0
  134. jarvis/jarvis_tools/save_memory.py +3 -15
  135. jarvis/jarvis_tools/search_web.py +3 -7
  136. jarvis/jarvis_tools/sub_agent.py +17 -6
  137. jarvis/jarvis_tools/sub_code_agent.py +14 -16
  138. jarvis/jarvis_tools/virtual_tty.py +54 -32
  139. jarvis/jarvis_utils/clipboard.py +7 -10
  140. jarvis/jarvis_utils/config.py +98 -63
  141. jarvis/jarvis_utils/embedding.py +5 -5
  142. jarvis/jarvis_utils/fzf.py +8 -8
  143. jarvis/jarvis_utils/git_utils.py +81 -67
  144. jarvis/jarvis_utils/input.py +24 -49
  145. jarvis/jarvis_utils/jsonnet_compat.py +465 -0
  146. jarvis/jarvis_utils/methodology.py +33 -35
  147. jarvis/jarvis_utils/utils.py +245 -202
  148. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.8.dist-info}/METADATA +205 -70
  149. jarvis_ai_assistant-0.7.8.dist-info/RECORD +218 -0
  150. jarvis/jarvis_agent/edit_file_handler.py +0 -584
  151. jarvis/jarvis_agent/rewrite_file_handler.py +0 -141
  152. jarvis/jarvis_agent/task_planner.py +0 -496
  153. jarvis/jarvis_platform/ai8.py +0 -332
  154. jarvis/jarvis_tools/ask_user.py +0 -54
  155. jarvis_ai_assistant-0.7.0.dist-info/RECORD +0 -192
  156. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.8.dist-info}/WHEEL +0 -0
  157. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.8.dist-info}/entry_points.txt +0 -0
  158. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.8.dist-info}/licenses/LICENSE +0 -0
  159. {jarvis_ai_assistant-0.7.0.dist-info → jarvis_ai_assistant-0.7.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1439 @@
1
+ # -*- coding: utf-8 -*-
2
+ """聚类相关模块"""
3
+
4
+ from typing import Dict, List, Optional
5
+ from pathlib import Path
6
+ import json
7
+ import typer
8
+
9
+ from jarvis.jarvis_sec.prompts import get_cluster_summary_prompt
10
+ from jarvis.jarvis_sec.parsers import parse_clusters_from_text
11
+ from jarvis.jarvis_sec.agents import create_cluster_agent, subscribe_summary_event
12
+ from jarvis.jarvis_sec.utils import (
13
+ group_candidates_by_file,
14
+ )
15
+ from jarvis.jarvis_sec.file_manager import (
16
+ load_clusters,
17
+ save_cluster,
18
+ get_all_clustered_gids,
19
+ validate_clustering_completeness,
20
+ get_clusters_file,
21
+ )
22
+
23
+
24
def load_existing_clusters(
    sec_dir: Path,
) -> tuple[Dict[tuple[str, int], List[Dict]], set, set]:
    """Load previously saved clusters so an interrupted run can be resumed.

    Clusters are read through ``load_clusters(sec_dir)`` and grouped by
    ``(file, batch_index)``. For every cluster flagged ``is_invalid`` whose
    gids are all reported as false positives by ``get_false_positive_gids``,
    the positive gids are recorded as "already reviewed".

    Args:
        sec_dir: Directory handed to the file-manager helpers.

    Returns:
        A tuple ``(_existing_clusters, _completed_cluster_batches,
        _reviewed_invalid_gids)``. ``_completed_cluster_batches`` is always an
        empty set here; it is kept for interface compatibility. If loading
        fails, all three containers are returned empty.
    """
    _existing_clusters: Dict[tuple[str, int], List[Dict]] = {}
    _completed_cluster_batches: set = set()
    _reviewed_invalid_gids: set = set()  # gids of invalid clusters already reviewed

    def _as_int(raw) -> Optional[int]:
        """Convert ``raw`` to int, returning None when it is not convertible."""
        try:
            return int(raw)
        except (TypeError, ValueError, OverflowError):
            return None

    # Loaded lazily and at most once per call: the original re-read the
    # analysis results for every single invalid cluster.
    false_positive_gids = None

    try:
        for cluster in load_clusters(sec_dir) or []:
            f_name = str(cluster.get("file") or "")
            bidx = int(cluster.get("batch_index", 1) or 1)
            _existing_clusters.setdefault((f_name, bidx), []).append(cluster)

            if not cluster.get("is_invalid", False):
                continue
            gids_list = cluster.get("gids", [])
            if not isinstance(gids_list, list):
                continue

            if false_positive_gids is None:
                from jarvis.jarvis_sec.file_manager import get_false_positive_gids

                false_positive_gids = get_false_positive_gids(sec_dir)

            # A gid that cannot be converted is treated as "not a false
            # positive" (the cluster stays un-reviewed) instead of raising and
            # throwing away the entire checkpoint.
            all_false_positive = all(
                _as_int(gid_val) in false_positive_gids
                for gid_val in gids_list
                if isinstance(gid_val, (int, str))
            )
            if not all_false_positive:
                continue

            for gid_val in gids_list:
                gid_int = _as_int(gid_val)
                if gid_int is not None and gid_int >= 1:
                    _reviewed_invalid_gids.add(gid_int)
        # No fallback to the legacy cluster_report.jsonl: backward
        # compatibility is intentionally not supported.
    except Exception:
        # Best-effort resume: on any loading problem start from scratch.
        _existing_clusters = {}
        _completed_cluster_batches = set()
        _reviewed_invalid_gids = set()

    return _existing_clusters, _completed_cluster_batches, _reviewed_invalid_gids
77
+
78
+
79
def restore_clusters_from_checkpoint(
    _existing_clusters: Dict[tuple[str, int], List[Dict]],
    _file_groups: Dict[str, List[Dict]],
    _reviewed_invalid_gids: set,
) -> tuple[List[List[Dict]], List[Dict], List[Dict], set]:
    """Rebuild in-memory clustering state from saved cluster records.

    Only gids that also exist in the current candidate set (``_file_groups``)
    are restored. Each matched candidate dict is mutated in place: its
    ``"verify"`` key is set to the cluster's verification text.

    Args:
        _existing_clusters: Saved cluster records keyed by ``(file, batch_index)``.
        _file_groups: Current candidates grouped by file; each candidate is a
            dict carrying a ``"gid"``.
        _reviewed_invalid_gids: gids of invalid clusters that were already
            reviewed; invalid clusters fully covered by this set are skipped.

    Returns:
        ``(cluster_batches, cluster_records, invalid_clusters_for_review,
        clustered_gids)`` where ``clustered_gids`` holds every current gid that
        appears in any saved cluster (valid, invalid, or already reviewed).
    """

    def _positive_gid(raw) -> Optional[int]:
        """Return ``raw`` as an int >= 1, or None when it is not a usable gid."""
        try:
            value = int(raw)
        except Exception:
            return None
        return value if value >= 1 else None

    # 1. Index the current candidates by gid. A later candidate with the same
    #    gid replaces an earlier one.
    gid_to_candidate: Dict[int, Dict] = {}
    for _items in _file_groups.values():
        for it in _items:
            try:
                raw_gid = it.get("gid", 0)
            except Exception:
                continue
            _gid = _positive_gid(raw_gid)
            if _gid is not None:
                gid_to_candidate[_gid] = it

    # 2. Restore clusters. The original did this in two passes and, for gids
    #    missing from ``gid_to_candidate``, rescanned ``_file_groups`` as a
    #    fallback. That fallback could never match because the index above is
    #    built from the very same data, so it has been removed.
    clustered_gids: set = set()
    invalid_clusters_for_review: List[Dict] = []
    cluster_batches: List[List[Dict]] = []
    cluster_records: List[Dict] = []
    skipped_reviewed_count = 0

    for (_file_key, _batch_idx), cluster_recs in _existing_clusters.items():
        for rec in cluster_recs:
            gids_list = rec.get("gids", [])
            # Ignore records whose gids is not a non-empty list. Previously a
            # truthy non-list value either raised TypeError (e.g. an int) or
            # was iterated character by character (a string).
            if not isinstance(gids_list, list) or not gids_list:
                continue

            is_invalid = rec.get("is_invalid", False)
            verification = str(rec.get("verification", "")).strip()

            members: List[Dict] = []
            member_gids: set = set()
            for raw in gids_list:
                _gid = _positive_gid(raw)
                if _gid is None or _gid not in gid_to_candidate:
                    # Not part of the current run (candidate list changed).
                    continue
                candidate = gid_to_candidate[_gid]
                candidate["verify"] = verification
                members.append(candidate)
                member_gids.add(_gid)

            # A cluster is only restored when at least one gid is still current.
            if not members:
                continue
            clustered_gids.update(member_gids)

            if not is_invalid:
                cluster_batches.append(members)
                cluster_records.append({
                    "file": _file_key,
                    "verification": verification,
                    "gids": [m.get("gid") for m in members],
                    "count": len(members),
                    "batch_index": _batch_idx,
                    "is_invalid": False,
                })
                continue

            if member_gids.issubset(_reviewed_invalid_gids):
                # Every gid of this invalid cluster was already reviewed.
                skipped_reviewed_count += 1
                continue

            invalid_clusters_for_review.append({
                "file": _file_key,
                "batch_index": _batch_idx,
                "gids": [m.get("gid") for m in members],
                "verification": verification,
                "invalid_reason": str(rec.get("invalid_reason", "")).strip(),
                # Keep the candidates so they can re-enter verification after review.
                "members": members,
                "count": len(members),
            })

    # 3. Report statistics (logging is best-effort and must never abort the restore).
    if _reviewed_invalid_gids:
        try:
            typer.secho(f"[jarvis-sec] 断点恢复:发现 {len(_reviewed_invalid_gids)} 个已复核的无效聚类 gids", fg=typer.colors.BLUE)
        except Exception:
            pass
    if skipped_reviewed_count > 0:
        try:
            typer.secho(f"[jarvis-sec] 断点恢复:跳过 {skipped_reviewed_count} 个已复核的无效聚类", fg=typer.colors.BLUE)
        except Exception:
            pass

    return cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids
263
+
264
+
265
def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], compact_candidates: List[Dict], _progress_append):
    """Build the two closures used to persist clustering results.

    Returns:
        ``(_write_cluster_batch_snapshot, _write_cluster_report_snapshot)``.
        Both closures swallow every exception: persistence is best-effort.
    """

    def _persist(entry: Dict, file_name: str, batch_index: int, cluster_index: int) -> None:
        """Convert one record to the on-disk cluster layout and save it."""
        save_cluster(
            sec_dir,
            {
                "cluster_id": f"{file_name}|{batch_index}|{cluster_index}",
                "file": file_name,
                "batch_index": batch_index,
                "cluster_index": cluster_index,
                "gids": entry.get("gids", []),
                "verification": str(entry.get("verification", "")).strip(),
                "is_invalid": entry.get("is_invalid", False),
                "invalid_reason": str(entry.get("invalid_reason", "")).strip(),
            },
        )

    def _write_cluster_batch_snapshot(batch_records: List[Dict]):
        """Save the records of one batch (incremental persistence)."""
        try:
            # Group by (file, batch_index) first so that cluster_index values
            # are local to each group.
            grouped: Dict[tuple, List[Dict]] = {}
            for entry in batch_records:
                group_key = (str(entry.get("file", "")), int(entry.get("batch_index", 0)))
                grouped.setdefault(group_key, []).append(entry)

            for (file_name, batch_index), entries in grouped.items():
                for position, entry in enumerate(entries):
                    # An explicit cluster_index on the record wins; otherwise
                    # fall back to the position inside the group.
                    explicit = entry.get("cluster_index")
                    chosen = position if explicit is None else int(explicit)
                    _persist(entry, file_name, batch_index, chosen)
        except Exception:
            pass

    def _write_cluster_report_snapshot():
        """Save every record in ``cluster_records`` and emit a progress event."""
        try:
            # Here the position in the full record list is the cluster_index.
            for position, entry in enumerate(cluster_records):
                _persist(
                    entry,
                    str(entry.get("file", "")),
                    int(entry.get("batch_index", 0)),
                    position,
                )

            _progress_append(
                {
                    "event": "cluster_report_snapshot",
                    "path": str(get_clusters_file(sec_dir)),
                    "clusters": len(cluster_records),
                    "total_candidates": len(compact_candidates),
                }
            )
        except Exception:
            pass

    return _write_cluster_batch_snapshot, _write_cluster_report_snapshot
345
+
346
+
347
def collect_candidate_gids(file_groups: Dict[str, List[Dict]]) -> set:
    """Return every positive integer gid found among the grouped candidates.

    Entries whose gid is missing, below 1, or not convertible to int are
    silently ignored.
    """
    gids = set()
    for entries in file_groups.values():
        for entry in entries:
            try:
                number = int(entry.get("gid", 0))
            except Exception:
                continue
            if number >= 1:
                gids.add(number)
    return gids
359
+
360
+
361
def collect_clustered_gids(cluster_batches: List[List[Dict]], invalid_clusters_for_review: List[Dict]) -> set:
    """Return all positive gids that already belong to some cluster.

    gids are taken from the members of ``cluster_batches`` and from the
    ``"gids"`` list of every invalid cluster awaiting review (those have
    entered the review flow as well). Unusable values are skipped.
    """
    seen = set()

    for member in (m for batch in cluster_batches for m in batch):
        try:
            number = int(member.get("gid", 0))
        except Exception:
            continue
        if number >= 1:
            seen.add(number)

    for pending in invalid_clusters_for_review:
        for raw in pending.get("gids", []):
            try:
                number = int(raw)
            except Exception:
                continue
            if number >= 1:
                seen.add(number)

    return seen
383
+
384
+
385
+ # 注意:supplement_missing_gids_for_clustering函数已移除
386
+ # 由于gid现在保存在heuristic_issues.jsonl中,恢复逻辑已经能够正确匹配所有gid
387
+ # 理论上不应该再出现遗漏的gid,不需要补充处理
388
+
389
+
390
def filter_single_gid_clusters(
    cluster_batches: List[List[Dict]],
    sec_dir: Path,
    _progress_append,
) -> List[List[Dict]]:
    """Drop redundant single-gid batches to keep the analysis workload down.

    A batch holding exactly one candidate is removed when either
    * its gid is already reported by ``get_all_analyzed_gids``, or
    * its ``"verify"`` text matches the auto-generated default wording and the
      gid is already reported by ``get_all_clustered_gids``.
    Every other batch is kept, including single-gid batches that still need
    analysis. Each removal is reported through ``_progress_append``.
    """
    from jarvis.jarvis_sec.file_manager import get_all_analyzed_gids

    analyzed = get_all_analyzed_gids(sec_dir)
    clustered_on_disk = get_all_clustered_gids(sec_dir)

    kept: List[List[Dict]] = []
    dropped_total = 0
    dropped_gids = set()

    for batch in cluster_batches:
        if len(batch) != 1:
            kept.append(batch)
            continue

        only = batch[0]
        try:
            gid = int(only.get("gid", 0))
            reason = None
            if gid >= 1:
                if gid in analyzed:
                    # Already analysed: removing it cannot lose an alert.
                    reason = "already_analyzed"
                else:
                    note = str(only.get("verify", "")).strip()
                    # The default wording marks an automatically created cluster.
                    if note.startswith("验证候选 ") and note.endswith(" 的安全风险"):
                        if gid in clustered_on_disk:
                            reason = "auto_created_and_in_clusters"
                        else:
                            # Not clustered and not analysed: keep it, but warn
                            # because this state is unexpected.
                            try:
                                typer.secho(f"[jarvis-sec] 警告:gid={gid}是自动创建的单独聚类,但不在clusters.jsonl中,保留以避免遗漏告警", fg=typer.colors.YELLOW)
                            except Exception:
                                pass
            if reason is not None:
                dropped_total += 1
                dropped_gids.add(gid)
                _progress_append({
                    "event": "single_cluster_removed",
                    "gid": gid,
                    "reason": reason,
                })
                continue
        except Exception:
            # Any failure while deciding means the batch is kept.
            pass

        kept.append(batch)

    if dropped_total > 0:
        try:
            ordered = sorted(list(dropped_gids))
            typer.secho(f"[jarvis-sec] 已移除 {dropped_total} 个单独聚类批次(共{len(dropped_gids)}个gid),避免分析工作量激增", fg=typer.colors.GREEN)
            if len(dropped_gids) <= 20:
                typer.secho(f"[jarvis-sec] 移除的gid: {ordered}", fg=typer.colors.GREEN)
            else:
                # Long lists are abbreviated to the first and last ten gids.
                preview = ordered[:10] + ["..."] + ordered[-10:]
                typer.secho(f"[jarvis-sec] 移除的gid(示例): {preview}", fg=typer.colors.GREEN)
        except Exception:
            pass

    return kept
479
+
480
+
481
def handle_single_alert_file(
    file: str,
    single_item: Dict,
    single_gid: int,
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
    _progress_append,
    _write_cluster_batch_snapshot,
) -> None:
    """Register a file that has a single alert without running clustering.

    The candidate receives a default ``"verify"`` text, becomes its own batch,
    gets a matching record in ``cluster_records`` (batch index 1), and the
    records of that file/batch are written through
    ``_write_cluster_batch_snapshot``. All list arguments are mutated in place.
    """
    verification_text = f"验证候选 {single_gid} 的安全风险"
    single_item["verify"] = verification_text
    cluster_batches.append([single_item])

    record = {
        "file": file,
        "verification": verification_text,
        "gids": [single_gid],
        "count": 1,
        "batch_index": 1,
        "note": "单告警跳过聚类",
    }
    cluster_records.append(record)

    status_event = {
        "event": "cluster_status",
        "status": "done",
        "file": file,
        "batch_index": 1,
        "skipped": True,
        "reason": "single_alert",
    }
    _progress_append(status_event)

    # Persist every record known for this file's first batch, not only the new one.
    same_batch = []
    for existing in cluster_records:
        if existing.get("file") == file and existing.get("batch_index") == 1:
            same_batch.append(existing)
    if same_batch:
        _write_cluster_batch_snapshot(same_batch)

    typer.secho(f"[jarvis-sec] 文件 {file} 仅有一个告警(gid={single_gid}),跳过聚类直接写入", fg=typer.colors.BLUE)
521
+
522
+
523
def validate_cluster_format(cluster_items: List[Dict]) -> tuple[bool, List[str]]:
    """Validate the structure of a parsed clustering result.

    Validation stops at the first problem found.

    Rules checked for every element:
    * it is a dict whose ``verification`` (if present) is a string and whose
      ``gids`` (if present) is a list;
    * every gid is an integer value >= 1. Integer-like strings and integral
      floats are accepted; booleans and fractional/non-finite floats are
      rejected (plain ``int()`` coercion would silently turn ``True`` into 1
      and truncate ``1.5`` to 1);
    * ``is_invalid`` is present and is a bool;
    * when ``is_invalid`` is true, ``invalid_reason`` is a non-blank string.

    Returns:
        ``(True, [])`` when valid, otherwise ``(False, [message])``.
    """
    if not isinstance(cluster_items, list) or not cluster_items:
        return False, ["结果不是数组或数组为空"]

    def _not_an_integer(idx: int, gid_idx: int, gid_val) -> tuple[bool, List[str]]:
        """Build the failure result for a gid that is not an integer value."""
        return False, [
            f"元素{idx}的gids[{gid_idx}]不是有效的整数(值为{gid_val},类型为{type(gid_val).__name__})"
        ]

    for idx, it in enumerate(cluster_items):
        if not isinstance(it, dict):
            return False, [f"元素{idx}不是字典"]

        vals = it.get("gids", [])
        if not isinstance(it.get("verification", ""), str) or not isinstance(vals, list):
            return False, [f"元素{idx}的verification或gids格式错误"]

        for gid_idx, gid_val in enumerate(vals):
            # bool is a subclass of int and floats truncate under int(), so
            # both must be screened before the conversion.
            if isinstance(gid_val, bool):
                return _not_an_integer(idx, gid_idx, gid_val)
            if isinstance(gid_val, float) and not gid_val.is_integer():
                return _not_an_integer(idx, gid_idx, gid_val)
            try:
                gid_int = int(gid_val)
            except (ValueError, TypeError, OverflowError):
                return _not_an_integer(idx, gid_idx, gid_val)
            if gid_int < 1:
                return False, [f"元素{idx}的gids[{gid_idx}]不是有效的正整数(值为{gid_val})"]

        # is_invalid is mandatory and must be a real boolean.
        if "is_invalid" not in it:
            return False, [f"元素{idx}缺少is_invalid字段(必填)"]
        is_invalid_val = it.get("is_invalid")
        if not isinstance(is_invalid_val, bool):
            return False, [f"元素{idx}的is_invalid不是布尔值"]

        # An invalid cluster must explain why it is invalid.
        if is_invalid_val:
            invalid_reason = it.get("invalid_reason", "")
            if not isinstance(invalid_reason, str) or not invalid_reason.strip():
                return False, [f"元素{idx}的is_invalid为true但缺少invalid_reason字段或理由为空(必填)"]

    return True, []
569
+
570
+
571
def extract_classified_gids(cluster_items: List[Dict]) -> set:
    """Collect every positive gid mentioned in the clustering result.

    Format validation is expected to have run already, so all gids should be
    integer-like. A value that still fails conversion only produces a
    best-effort warning and is skipped; no exception is raised for it.
    """
    found = set()
    for cluster in cluster_items:
        listed = cluster.get("gids", [])
        if not isinstance(listed, list):
            continue
        for raw in listed:
            try:
                number = int(raw)
            except (ValueError, TypeError):
                # Should be unreachable after validation; report and move on.
                try:
                    typer.secho(f"[jarvis-sec] 警告:在提取gid时遇到格式错误(值={raw},类型={type(raw).__name__}),这不应该发生(格式验证应该已捕获)", fg=typer.colors.YELLOW)
                except Exception:
                    pass
                continue
            if number >= 1:
                found.add(number)
    return found
594
+
595
+
596
def build_cluster_retry_task(
    file: str,
    missing_gids: set,
    error_details: List[str],
) -> str:
    """Compose the retry prompt sent when a clustering attempt was rejected.

    The prompt always starts with a fixed header naming ``file``. A list of
    the gids that were left unclassified (sorted) and a bullet list of format
    errors are appended when the respective argument is non-empty.
    """
    sections = [f"# 聚类任务重试\n文件: {file}\n\n**重要提示**:请重新输出聚类结果。"]

    if missing_gids:
        ordered = sorted(missing_gids)
        total = len(missing_gids)
        sections.append(
            f"**遗漏的gid(共{total}个,必须被分类):**\n" + ", ".join(str(gid) for gid in ordered)
        )

    if error_details:
        bullets = "\n".join(f"- {detail}" for detail in error_details)
        sections.append("**格式错误:**\n" + bullets)

    # Sections are separated by one blank line.
    return "\n\n".join(sections)
615
+
616
+
617
def build_cluster_error_guidance(
    error_details: List[str],
    missing_gids: set,
) -> str:
    """Compose the corrective guidance appended to a retry prompt.

    Returns an empty string when there is nothing to report. Otherwise each
    present section (format errors, then missing gids in sorted order) is
    preceded by a blank line.
    """
    pieces: List[str] = []

    if error_details:
        bullets = "\n".join(f"- {detail}" for detail in error_details)
        pieces.append("\n\n**格式错误详情(请根据以下错误修复输出格式):**\n" + bullets)

    if missing_gids:
        ordered = sorted(missing_gids)
        total = len(missing_gids)
        pieces.append(
            f"\n\n**完整性错误:遗漏了 {total} 个 gid,这些 gid 必须被分类:**\n"
            + ", ".join(str(gid) for gid in ordered)
        )

    return "".join(pieces)
630
+
631
+
632
def run_cluster_agent_direct_model(
    cluster_agent,
    cluster_task: str,
    cluster_summary_prompt: str,
    file: str,
    missing_gids: set,
    error_details: List[str],
    _cluster_summary: Dict[str, str],
) -> None:
    """Retry clustering by calling the agent's model directly.

    The prompt is the retry task, followed by the error guidance, a blank
    line and ``cluster_summary_prompt``. The model reply is stored in
    ``_cluster_summary["text"]``. If the direct call fails, a warning is
    printed (best-effort) and ``cluster_agent.run(cluster_task)`` is used.
    """
    prompt = "{}{}\n\n{}".format(
        build_cluster_retry_task(file, missing_gids, error_details),
        build_cluster_error_guidance(error_details, missing_gids),
        cluster_summary_prompt,
    )

    try:
        reply = cluster_agent.model.chat_until_success(prompt)  # type: ignore
        _cluster_summary["text"] = reply
        return
    except Exception as e:
        try:
            typer.secho(f"[jarvis-sec] 直接模型调用失败: {e},回退到 run()", fg=typer.colors.YELLOW)
        except Exception:
            pass

    # Fallback path: let the agent run the full task.
    cluster_agent.run(cluster_task)
654
+
655
+
656
def validate_cluster_result(
    cluster_items: Optional[List[Dict]],
    parse_error: Optional[str],
    attempt: int,
) -> tuple[bool, List[str]]:
    """Check a parsed clustering result and report why it is unusable.

    A truthy ``parse_error`` short-circuits to a failure carrying the JSON
    parse message. Otherwise the decision is delegated to
    ``validate_cluster_format``. A warning is printed for every failure.

    Returns:
        ``(is_valid, error_details)``.
    """
    if not parse_error:
        is_valid, problems = validate_cluster_format(cluster_items)
        if not is_valid:
            typer.secho(f"[jarvis-sec] 聚类结果格式无效({'; '.join(problems)}),重试第 {attempt} 次(使用直接模型调用)", fg=typer.colors.YELLOW)
        return is_valid, problems

    problems = [f"JSON解析失败: {parse_error}"]
    typer.secho(f"[jarvis-sec] JSON解析失败: {parse_error}", fg=typer.colors.YELLOW)
    return False, problems
671
+
672
+
673
def check_cluster_completeness(
    cluster_items: List[Dict],
    input_gids: set,
    attempt: int,
) -> tuple[bool, set]:
    """Verify that every input gid was assigned to some cluster.

    Returns:
        ``(True, set())`` when nothing is missing, otherwise
        ``(False, missing_gids)``. The outcome is printed in both cases.
    """
    unassigned = input_gids - extract_classified_gids(cluster_items)

    if unassigned:
        ordered = sorted(unassigned)
        total = len(unassigned)
        typer.secho(f"[jarvis-sec] 聚类完整性校验失败:遗漏的gid: {ordered}({total}个),重试第 {attempt} 次(使用直接模型调用)", fg=typer.colors.YELLOW)
        return False, unassigned

    typer.secho(f"[jarvis-sec] 聚类完整性校验通过,所有gid已分类(共尝试 {attempt} 次)", fg=typer.colors.GREEN)
    return True, set()
689
+
690
+
691
def run_cluster_agent_with_retry(
    cluster_agent,
    cluster_task: str,
    cluster_summary_prompt: str,
    input_gids: set,
    file: str,
    _cluster_summary: Dict[str, str],
    create_agent_func=None,
) -> tuple[Optional[List[Dict]], Optional[str], bool]:
    """
    Run the clustering agent and keep retrying until every gid is classified.

    Returns ``(cluster_items, parse_error, need_recreate)``:

    - ``(cluster_items, None, False)`` once the result is valid and complete.
    - ``(None, error, True)`` after 5 accumulated failures, but only when
      ``create_agent_func`` is not None. The function is never called here;
      it only enables this exit so the caller can rebuild the agent.

    The first attempt uses ``cluster_agent.run``; every attempt after a
    failure goes through ``run_cluster_agent_direct_model``.
    """
    _attempt = 0
    use_direct_model = False
    error_details: List[str] = []
    missing_gids = set()
    consecutive_failures = 0  # Failures so far; never reset inside this loop.

    while True:
        _attempt += 1
        # Clear the shared summary slot so stale text is never re-parsed.
        _cluster_summary["text"] = ""

        if use_direct_model:
            run_cluster_agent_direct_model(
                cluster_agent,
                cluster_task,
                cluster_summary_prompt,
                file,
                missing_gids,
                error_details,
                _cluster_summary,
            )
        else:
            # First attempt: let the agent run in full (it may use tools).
            cluster_agent.run(cluster_task)

        cluster_summary_text = _cluster_summary.get("text", "")
        cluster_items, parse_error = parse_clusters_from_text(cluster_summary_text)

        # On a parse failure in the first attempt, print the first 500
        # characters of the summary text to help debugging.
        if parse_error and _attempt == 1:
            preview = cluster_summary_text[:500] if cluster_summary_text else "(空)"
            try:
                typer.secho(f"[jarvis-sec] 调试:摘要文本预览(前500字符): {preview}", fg=typer.colors.CYAN, err=True)
            except Exception:
                pass

        # Structural validation.
        valid, error_details = validate_cluster_result(cluster_items, parse_error, _attempt)

        # Completeness validation: every input gid must be classified.
        missing_gids = set()
        if valid and cluster_items:
            is_complete, missing_gids = check_cluster_completeness(cluster_items, input_gids, _attempt)
            if is_complete:
                return cluster_items, None, False
            else:
                use_direct_model = True
                valid = False
                consecutive_failures += 1
        else:
            # NOTE(review): if validation reports valid=True for an empty
            # result, this branch counts a failure but `valid` stays True, so
            # neither exit below fires and the next pass calls run() again.
            # Confirm validate_cluster_format rejects empty results.
            consecutive_failures += 1

        # After 5 failures, and only if an agent factory was supplied, tell
        # the caller that the agent should be recreated.
        if not valid and consecutive_failures >= 5 and create_agent_func is not None:
            try:
                typer.secho(f"[jarvis-sec] 连续失败 {consecutive_failures} 次,需要重新创建agent", fg=typer.colors.YELLOW)
            except Exception:
                pass
            return None, parse_error or "连续失败5次", True

        if not valid:
            use_direct_model = True
            cluster_items = None
767
+
768
+
769
def process_cluster_results(
    cluster_items: List[Dict],
    pending_in_file_with_ids: List[Dict],
    file: str,
    chunk_idx: int,
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
    invalid_clusters_for_review: List[Dict],
    _progress_append,
) -> tuple[int, int]:
    """Distribute parsed clusters into the shared output collections.

    For each cluster, the gids are normalised to positive ints and resolved
    against ``pending_in_file_with_ids``; every resolved candidate gets its
    ``"verify"`` field set to the cluster's verification text.

    - Clusters flagged ``is_invalid`` are appended to
      ``invalid_clusters_for_review`` and ``cluster_records`` and reported
      through ``_progress_append`` (even when no member resolved).
    - Other clusters with at least one resolved member are appended to
      ``cluster_batches`` and ``cluster_records``.

    A cluster without an ``is_invalid`` key is treated as not invalid, so its
    candidates are still verified instead of aborting the whole batch with a
    ``KeyError`` after the output lists were partially filled.

    Returns ``(valid_cluster_count, invalid_cluster_count)``.
    """
    # Map gid -> candidate. A non-iterable input yields an empty mapping.
    gid_to_item: Dict[int, Dict] = {}
    try:
        for it in pending_in_file_with_ids:
            try:
                _gid = int(it.get("gid", 0))
                if _gid >= 1:
                    gid_to_item[_gid] = it
            except Exception:
                pass
    except Exception:
        gid_to_item = {}

    _merged_count = 0
    _invalid_count = 0

    for cl in cluster_items:
        verification = str(cl.get("verification", "")).strip()
        raw_gids = cl.get("gids", [])
        is_invalid = cl.get("is_invalid", False)

        # Keep only gids that convert to a positive int; skip the rest.
        norm_keys: List[int] = []
        if isinstance(raw_gids, list):
            for x in raw_gids:
                try:
                    xi = int(x)
                except Exception:
                    continue
                if xi >= 1:
                    norm_keys.append(xi)

        # Resolve gids to candidates; gids unknown to this batch are dropped.
        members: List[Dict] = []
        for k in norm_keys:
            it = gid_to_item.get(k)
            if it:
                it["verify"] = verification
                members.append(it)

        if is_invalid:
            # Invalid clusters are collected for the review agent.
            _invalid_count += 1
            invalid_gids = [m.get("gid") for m in members]
            invalid_reason = str(cl.get("invalid_reason", "")).strip()
            try:
                typer.secho(f"[jarvis-sec] 聚类阶段判定为无效(gids={invalid_gids}),将提交复核Agent验证", fg=typer.colors.BLUE)
            except Exception:
                pass
            invalid_clusters_for_review.append({
                "file": file,
                "batch_index": chunk_idx,
                "gids": invalid_gids,
                "verification": verification,
                "invalid_reason": invalid_reason,
                "members": members,
                "count": len(members),
            })
            _progress_append({
                "event": "cluster_invalid",
                "file": file,
                "batch_index": chunk_idx,
                "gids": invalid_gids,
                "verification": verification,
                "count": len(members),
            })
            cluster_records.append({
                "file": file,
                "verification": verification,
                "gids": invalid_gids,
                "count": len(members),
                "batch_index": chunk_idx,
                "is_invalid": True,
                "invalid_reason": invalid_reason,
            })
        elif members:
            _merged_count += 1
            cluster_batches.append(members)
            cluster_records.append({
                "file": file,
                "verification": verification,
                "gids": [m.get("gid") for m in members],
                "count": len(members),
                "batch_index": chunk_idx,
                "is_invalid": False,
            })

    return _merged_count, _invalid_count
866
+
867
+
868
def supplement_missing_gids(
    missing_gids_final: set,
    gid_to_item: Dict[int, Dict],
    file: str,
    chunk_idx: int,
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
) -> int:
    """Create a one-item cluster for each gid the clustering left out.

    Gids are handled in ascending order. A gid with no (or an empty) entry in
    ``gid_to_item`` is ignored. Each supplemented candidate gets a default
    ``"verify"`` text and is appended to both ``cluster_batches`` and
    ``cluster_records``. Returns the number of clusters added.
    """
    added = 0
    for gid in sorted(missing_gids_final):
        candidate = gid_to_item.get(gid)
        if not candidate:
            continue
        verification_text = f"验证候选 {gid} 的安全风险"
        candidate["verify"] = verification_text
        cluster_batches.append([candidate])
        cluster_records.append(
            dict(
                file=file,
                verification=verification_text,
                gids=[gid],
                count=1,
                batch_index=chunk_idx,
                note="完整性校验补充的遗漏gid",
            )
        )
        added += 1
    return added
894
+
895
+
896
def build_cluster_task(
    pending_in_file_with_ids: List[Dict],
    entry_path: str,
    file: str,
    langs: List[str],
) -> str:
    """Build the task context text for the clustering agent.

    The text lists ``entry_path``, ``file`` and ``langs``, followed by the
    candidates serialised as indented JSON (non-ASCII characters are kept
    as-is). Leading and trailing whitespace is stripped from the result.
    """
    return f"""
# 聚类任务(分析输入)
上下文:
- entry_path: {entry_path}
- file: {file}
- languages: {langs}

候选(JSON数组,包含 gid/file/line/pattern/category/evidence):
{json.dumps(pending_in_file_with_ids, ensure_ascii=False, indent=2)}
    """.strip()
913
+
914
+
915
def extract_input_gids(pending_in_file_with_ids: List[Dict]) -> set:
    """Collect the positive integer gids of the items awaiting clustering.

    Items whose gid is missing, not convertible to ``int`` or below 1 are
    silently ignored.
    """
    collected = set()
    for entry in pending_in_file_with_ids:
        try:
            gid_value = int(entry.get("gid", 0))
        except Exception:
            continue
        if gid_value >= 1:
            collected.add(gid_value)
    return collected
926
+
927
+
928
def build_gid_to_item_mapping(pending_in_file_with_ids: List[Dict]) -> Dict[int, Dict]:
    """Index the pending items by their positive integer gid.

    Items with an unusable gid are skipped; a later item with the same gid
    replaces an earlier one. If iterating the input itself fails, whatever
    was collected up to that point is returned.
    """
    mapping: Dict[int, Dict] = {}
    try:
        for entry in pending_in_file_with_ids:
            try:
                key = int(entry.get("gid", 0))
            except Exception:
                continue
            if key >= 1:
                mapping[key] = entry
    except Exception:
        pass
    return mapping
942
+
943
+
944
def process_cluster_chunk(
    chunk: List[Dict],
    chunk_idx: int,
    file: str,
    entry_path: str,
    langs: List[str],
    llm_group: Optional[str],
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
    invalid_clusters_for_review: List[Dict],
    _progress_append,
    _write_cluster_batch_snapshot,
    force_save_memory: bool = False,
) -> None:
    """Cluster one batch of candidates from a single file.

    Emits "running" and "done" ``cluster_status`` progress events, runs the
    clustering agent (recreating it whenever the retry helper asks for it),
    distributes the result into ``cluster_batches``, ``cluster_records`` and
    ``invalid_clusters_for_review``, and finally passes this batch's records
    to ``_write_cluster_batch_snapshot``. An empty ``chunk`` is a no-op.
    """
    if not chunk:
        return

    pending_in_file_with_ids = list(chunk)

    # Record that this clustering batch has started.
    _progress_append({
        "event": "cluster_status",
        "status": "running",
        "file": file,
        "batch_index": chunk_idx,
        "total_in_batch": len(pending_in_file_with_ids),
    })

    # Create the clustering agent.
    cluster_agent = create_cluster_agent(file, chunk_idx, llm_group, force_save_memory=force_save_memory)

    # Build the task context.
    cluster_task = build_cluster_task(pending_in_file_with_ids, entry_path, file, langs)

    # Extract the input gids.
    input_gids = extract_input_gids(pending_in_file_with_ids)

    # Run the clustering agent; the agent may be recreated without limit.
    cluster_summary_prompt = get_cluster_summary_prompt()
    recreate_count = 0

    while True:
        # Subscribe to the summary event; this has to be repeated for every
        # newly created agent.
        cluster_summary = subscribe_summary_event(cluster_agent)

        cluster_items, parse_error, need_recreate = run_cluster_agent_with_retry(
            cluster_agent,
            cluster_task,
            cluster_summary_prompt,
            input_gids,
            file,
            cluster_summary,
            create_agent_func=lambda: create_cluster_agent(file, chunk_idx, llm_group, force_save_memory=force_save_memory),
        )

        # Leave the loop once no agent recreation is requested.
        if not need_recreate:
            break

        # Recreate the agent (no upper bound on the number of recreations).
        recreate_count += 1
        try:
            typer.secho(f"[jarvis-sec] 重新创建聚类Agent(第 {recreate_count} 次)", fg=typer.colors.MAGENTA)
        except Exception:
            pass
        cluster_agent = create_cluster_agent(file, chunk_idx, llm_group, force_save_memory=force_save_memory)

    # Process the clustering result.
    _merged_count = 0
    _invalid_count = 0

    if isinstance(cluster_items, list) and cluster_items:
        gid_to_item = build_gid_to_item_mapping(pending_in_file_with_ids)

        _merged_count, _invalid_count = process_cluster_results(
            cluster_items,
            pending_in_file_with_ids,
            file,
            chunk_idx,
            cluster_batches,
            cluster_records,
            invalid_clusters_for_review,
            _progress_append,
        )

        # Safety net: any gid still unclassified gets its own cluster.
        classified_gids_final = extract_classified_gids(cluster_items)
        missing_gids_final = input_gids - classified_gids_final
        if missing_gids_final:
            typer.secho(f"[jarvis-sec] 警告:仍有遗漏的gid {sorted(list(missing_gids_final))},将为每个遗漏的gid创建单独聚类", fg=typer.colors.YELLOW)
            supplemented_count = supplement_missing_gids(
                missing_gids_final,
                gid_to_item,
                file,
                chunk_idx,
                cluster_batches,
                cluster_records,
            )
            _merged_count += supplemented_count
    else:
        # Empty or None result: conservatively create one cluster per input
        # gid.
        if pending_in_file_with_ids:
            typer.secho(f"[jarvis-sec] 警告:聚类结果为空或None(文件={file},批次={chunk_idx}),为所有gid创建单独聚类", fg=typer.colors.YELLOW)
            gid_to_item_fallback = build_gid_to_item_mapping(pending_in_file_with_ids)

            _merged_count = supplement_missing_gids(
                input_gids,
                gid_to_item_fallback,
                file,
                chunk_idx,
                cluster_batches,
                cluster_records,
            )
            _invalid_count = 0
        else:
            _merged_count = 0
            _invalid_count = 0

    # Mark this clustering batch as done.
    _progress_append({
        "event": "cluster_status",
        "status": "done",
        "file": file,
        "batch_index": chunk_idx,
        "clusters_count": _merged_count,
        "invalid_clusters_count": _invalid_count,
    })
    # The completion summary is only printed when invalid clusters exist.
    if _invalid_count > 0:
        try:
            typer.secho(f"[jarvis-sec] 聚类批次完成: 有效聚类={_merged_count},无效聚类={_invalid_count}(已跳过)", fg=typer.colors.GREEN)
        except Exception:
            pass

    # Write the records that belong to the current file and batch.
    current_batch_records = [
        rec for rec in cluster_records
        if rec.get("file") == file and rec.get("batch_index") == chunk_idx
    ]
    if current_batch_records:
        _write_cluster_batch_snapshot(current_batch_records)
1084
+
1085
+
1086
def filter_pending_items(items: List[Dict], clustered_gids: set) -> List[Dict]:
    """Return the items that still need clustering, in their original order.

    An item is kept when its gid converts to an int of at least 1 that is not
    in ``clustered_gids``. Items that raise during this check are dropped.
    """
    still_pending: List[Dict] = []
    for candidate in items:
        try:
            gid_value = int(candidate.get("gid", 0))
            wanted = gid_value >= 1 and gid_value not in clustered_gids
        except Exception:
            continue
        if wanted:
            still_pending.append(candidate)
    return still_pending
1097
+
1098
+
1099
def process_file_clustering(
    file: str,
    items: List[Dict],
    clustered_gids: set,
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
    invalid_clusters_for_review: List[Dict],
    entry_path: str,
    langs: List[str],
    cluster_limit: int,
    llm_group: Optional[str],
    _progress_append,
    _write_cluster_batch_snapshot,
    force_save_memory: bool = False,
) -> None:
    """Run the clustering work for one file.

    Candidates whose gid is already clustered are dropped first. Nothing is
    done when none remain; a single remaining alert is handed to
    ``handle_single_alert_file`` without clustering. Otherwise the candidates
    are split into batches of ``cluster_limit`` (50 when that is not a
    positive int) and each batch, numbered from 1, goes to
    ``process_cluster_chunk``.
    """
    remaining = filter_pending_items(items, clustered_gids)
    if not remaining:
        return

    # Shortcut: a file with exactly one alert needs no clustering.
    if len(remaining) == 1:
        only_item = remaining[0]
        handle_single_alert_file(
            file,
            only_item,
            only_item.get("gid", 0),
            cluster_batches,
            cluster_records,
            _progress_append,
            _write_cluster_batch_snapshot,
        )
        return

    batch_size = 50
    if isinstance(cluster_limit, int) and cluster_limit > 0:
        batch_size = cluster_limit

    batch_no = 0
    for start in range(0, len(remaining), batch_size):
        batch_no += 1
        process_cluster_chunk(
            remaining[start:start + batch_size],
            batch_no,
            file,
            entry_path,
            langs,
            llm_group,
            cluster_batches,
            cluster_records,
            invalid_clusters_for_review,
            _progress_append,
            _write_cluster_batch_snapshot,
            force_save_memory=force_save_memory,
        )
1155
+
1156
+
1157
+ # 注意:check_and_supplement_missing_gids函数已移除
1158
+ # 由于gid现在保存在heuristic_issues.jsonl中,恢复逻辑已经能够正确匹配所有gid
1159
+ # 理论上不应该再出现遗漏的gid,完整性检查已移至process_clustering_phase中
1160
+
1161
+
1162
def initialize_clustering_context(
    compact_candidates: List[Dict],
    sec_dir: Path,
    _progress_append,
) -> tuple[Dict[str, List[Dict]], Dict, tuple, List[List[Dict]], List[Dict], List[Dict], set]:
    """Set up the state needed by the clustering phase.

    Returns a 7-tuple: ``(file groups, existing clusters,
    (batch snapshot writer, report snapshot writer), cluster batches,
    cluster records, invalid clusters for review, clustered gids)``.
    """
    # Group the candidates by file to form the sets to be clustered.
    _file_groups = group_candidates_by_file(compact_candidates)

    cluster_batches: List[List[Dict]] = []
    cluster_records: List[Dict] = []
    invalid_clusters_for_review: List[Dict] = []

    # Load any existing cluster report so an interrupted run can resume.
    _existing_clusters, _completed_cluster_batches, _reviewed_invalid_gids = load_existing_clusters(
        sec_dir
    )

    # Create the snapshot writer functions.
    # NOTE(review): the writers receive the empty `cluster_records` list
    # created above, but that name is rebound to a different list by the
    # restore call below. If the writers keep a reference to the list they
    # were given, they will not see restored or later records - confirm
    # against create_cluster_snapshot_writer.
    _write_cluster_batch_snapshot, _write_cluster_report_snapshot = create_cluster_snapshot_writer(
        sec_dir, cluster_records, compact_candidates, _progress_append
    )

    # Restore clustering results from the checkpoint.
    cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids = restore_clusters_from_checkpoint(
        _existing_clusters, _file_groups, _reviewed_invalid_gids
    )

    return (
        _file_groups,
        _existing_clusters,
        (_write_cluster_batch_snapshot, _write_cluster_report_snapshot),
        cluster_batches,
        cluster_records,
        invalid_clusters_for_review,
        clustered_gids,
    )
1199
+
1200
+
1201
def check_unclustered_gids(
    all_candidate_gids: set,
    clustered_gids: set,
) -> set:
    """Return the candidate gids that have not been clustered yet.

    Also prints (best effort) whether clustering still has work to do or can
    be skipped.
    """
    unclustered_gids = all_candidate_gids - clustered_gids
    try:
        if unclustered_gids:
            typer.secho(f"[jarvis-sec] 发现 {len(unclustered_gids)} 个未聚类的 gid,将进行聚类", fg=typer.colors.YELLOW)
        else:
            typer.secho(f"[jarvis-sec] 所有 {len(all_candidate_gids)} 个候选已聚类,跳过聚类阶段", fg=typer.colors.GREEN)
    except Exception:
        # Console output is informational only.
        pass
    return unclustered_gids
1218
+
1219
+
1220
def execute_clustering_for_files(
    file_groups: Dict[str, List[Dict]],
    clustered_gids: set,
    cluster_batches: List[List[Dict]],
    cluster_records: List[Dict],
    invalid_clusters_for_review: List[Dict],
    entry_path: str,
    langs: List[str],
    cluster_limit: int,
    llm_group: Optional[str],
    status_mgr,
    _progress_append,
    _write_cluster_batch_snapshot,
    force_save_memory: bool = False,
) -> None:
    """Cluster every file group in turn, reporting progress as it goes.

    ``status_mgr.update_clustering`` is called once before the first file
    (only when there is at least one file) and once per file; each file is
    then delegated to ``process_file_clustering``.
    """
    file_total = len(file_groups)

    # Announce the start of the clustering stage.
    if file_total > 0:
        status_mgr.update_clustering(
            current_file=0,
            total_files=file_total,
            message="开始聚类分析..."
        )

    position = 0
    for path, candidates in file_groups.items():
        position += 1
        typer.secho(f"\n[jarvis-sec] 聚类文件 {position}/{file_total}: {path}", fg=typer.colors.CYAN)
        # Publish progress for the file about to be processed.
        status_mgr.update_clustering(
            current_file=position,
            total_files=file_total,
            file_name=path,
            message=f"正在聚类文件 {position}/{file_total}: {path}"
        )
        process_file_clustering(
            path,
            candidates,
            clustered_gids,
            cluster_batches,
            cluster_records,
            invalid_clusters_for_review,
            entry_path,
            langs,
            cluster_limit,
            llm_group,
            _progress_append,
            _write_cluster_batch_snapshot,
            force_save_memory=force_save_memory,
        )
1269
+
1270
+
1271
def record_clustering_completion(
    sec_dir: Path,
    cluster_records: List[Dict],
    compact_candidates: List[Dict],
    _progress_append,
) -> None:
    """Emit the progress event that marks the end of the clustering stage.

    The event carries the cluster report path and the number of cluster
    records and candidates. Any error while building or emitting the event
    is ignored.
    """
    try:
        completion_event = {
            "event": "cluster_report_written",
            "path": str(sec_dir / "cluster_report.jsonl"),
            "clusters": len(cluster_records),
            "total_candidates": len(compact_candidates),
            "note": "每个批次已增量保存,无需重写整个文件",
        }
        _progress_append(completion_event)
    except Exception:
        pass
1289
+
1290
+
1291
def fallback_to_file_based_batches(
    file_groups: Dict[str, List[Dict]],
    existing_clusters: Dict,
) -> List[List[Dict]]:
    """Build per-file batches for candidates that no valid cluster covers.

    Used when clustering failed or produced nothing. Gids listed in the
    non-invalid records of ``existing_clusters`` count as clustered; every
    other candidate is grouped by its ``"file"`` value, one batch per file.
    """
    candidate_gids = collect_candidate_gids(file_groups)

    # Index every candidate by gid, ignoring unusable gids.
    item_by_gid: Dict[int, Dict] = {}
    for group in file_groups.values():
        for candidate in group:
            try:
                gid_value = int(candidate.get("gid", 0))
            except Exception:
                continue
            if gid_value >= 1:
                item_by_gid[gid_value] = candidate

    if not candidate_gids:
        return []

    # Gather the gids already covered by non-invalid cluster records.
    covered = set()
    for (_file_key, _batch_idx), recs in existing_clusters.items():
        for rec in recs:
            if rec.get("is_invalid", False):
                continue
            for raw_gid in rec.get("gids", []):
                try:
                    as_int = int(raw_gid)
                except Exception:
                    continue
                if as_int >= 1:
                    covered.add(as_int)

    leftover = candidate_gids - covered
    if not leftover:
        return []

    from collections import defaultdict

    # Group the uncovered candidates by file.
    per_file: Dict[str, List[Dict]] = defaultdict(list)
    for gid_value in leftover:
        candidate = item_by_gid.get(gid_value)
        if candidate:
            per_file[str(candidate.get("file") or "")].append(candidate)

    return [members for members in per_file.values() if members]
1344
+
1345
+
1346
def process_clustering_phase(
    compact_candidates: List[Dict],
    entry_path: str,
    langs: List[str],
    cluster_limit: int,
    llm_group: Optional[str],
    sec_dir: Path,
    status_mgr,
    _progress_append,
    force_save_memory: bool = False,
) -> tuple[List[List[Dict]], List[Dict]]:
    """Run the whole clustering phase.

    Steps: initialise the context (including checkpoint restore), cluster the
    files if any gid is still unclustered, record completion, run the review
    phase over the invalid clusters, fall back to per-file batches when no
    batch exists, report gids that are still missing, and finally filter the
    single-gid clusters.

    Returns ``(cluster_batches, invalid_clusters_for_review)``.
    """
    # Initialise the clustering context.
    (
        _file_groups,
        _existing_clusters,
        (_write_cluster_batch_snapshot, _write_cluster_report_snapshot),
        cluster_batches,
        cluster_records,
        invalid_clusters_for_review,
        clustered_gids,
    ) = initialize_clustering_context(compact_candidates, sec_dir, _progress_append)

    # Collect the gids of all candidates (used to find unclustered gids).
    all_candidate_gids_in_clustering = collect_candidate_gids(_file_groups)

    # Check whether any gid is still unclustered.
    unclustered_gids = check_unclustered_gids(all_candidate_gids_in_clustering, clustered_gids)

    # Continue with clustering only if something is left to cluster.
    if unclustered_gids:
        execute_clustering_for_files(
            _file_groups,
            clustered_gids,
            cluster_batches,
            cluster_records,
            invalid_clusters_for_review,
            entry_path,
            langs,
            cluster_limit,
            llm_group,
            status_mgr,
            _progress_append,
            _write_cluster_batch_snapshot,
            force_save_memory=force_save_memory,
        )

    # Record that the clustering stage has finished.
    record_clustering_completion(sec_dir, cluster_records, compact_candidates, _progress_append)

    # Review agent: verify every cluster that was marked invalid. The import
    # is done here, inside the function, rather than at module level.
    from jarvis.jarvis_sec.review import process_review_phase
    cluster_batches = process_review_phase(
        invalid_clusters_for_review,
        entry_path,
        langs,
        llm_group,
        status_mgr,
        _progress_append,
        cluster_batches,
        sec_dir,
    )

    # If clustering failed or produced nothing, fall back to processing each
    # file as one batch.
    if not cluster_batches:
        fallback_batches = fallback_to_file_based_batches(_file_groups, _existing_clusters)
        cluster_batches.extend(fallback_batches)

    # Completeness check: make sure every candidate gid has been clustered.
    # `is_complete` is not used below; only the missing gids are reported.
    is_complete, missing_gids_final = validate_clustering_completeness(sec_dir)

    if missing_gids_final:
        # Missing gids at this point suggest a problem in the restore logic.
        # This branch only prints a warning; nothing is re-clustered here.
        try:
            missing_count = len(missing_gids_final)
            if missing_count <= 20:
                typer.secho(f"[jarvis-sec] 警告:发现 {missing_count} 个遗漏的gid(恢复逻辑可能有问题): {sorted(list(missing_gids_final))}", fg=typer.colors.RED)
            else:
                # Long lists are shortened to the first and last 10 gids.
                missing_list = sorted(list(missing_gids_final))
                display_list = missing_list[:10] + ["..."] + missing_list[-10:]
                typer.secho(f"[jarvis-sec] 警告:发现 {missing_count} 个遗漏的gid(恢复逻辑可能有问题): {display_list}", fg=typer.colors.RED)

        except Exception:
            pass

    # Clean up previously created single-gid clusters (to avoid a surge in
    # analysis workload).
    cluster_batches = filter_single_gid_clusters(
        cluster_batches,
        sec_dir,
        _progress_append,
    )

    return cluster_batches, invalid_clusters_for_review