jarvis-ai-assistant 0.7.16__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/__init__.py +567 -222
- jarvis/jarvis_agent/agent_manager.py +19 -12
- jarvis/jarvis_agent/builtin_input_handler.py +79 -11
- jarvis/jarvis_agent/config_editor.py +7 -2
- jarvis/jarvis_agent/event_bus.py +24 -13
- jarvis/jarvis_agent/events.py +19 -1
- jarvis/jarvis_agent/file_context_handler.py +67 -64
- jarvis/jarvis_agent/file_methodology_manager.py +38 -24
- jarvis/jarvis_agent/jarvis.py +186 -114
- jarvis/jarvis_agent/language_extractors/__init__.py +8 -1
- jarvis/jarvis_agent/language_extractors/c_extractor.py +7 -4
- jarvis/jarvis_agent/language_extractors/cpp_extractor.py +9 -4
- jarvis/jarvis_agent/language_extractors/go_extractor.py +7 -4
- jarvis/jarvis_agent/language_extractors/java_extractor.py +27 -20
- jarvis/jarvis_agent/language_extractors/javascript_extractor.py +22 -17
- jarvis/jarvis_agent/language_extractors/python_extractor.py +7 -4
- jarvis/jarvis_agent/language_extractors/rust_extractor.py +7 -4
- jarvis/jarvis_agent/language_extractors/typescript_extractor.py +22 -17
- jarvis/jarvis_agent/language_support_info.py +250 -219
- jarvis/jarvis_agent/main.py +19 -23
- jarvis/jarvis_agent/memory_manager.py +9 -6
- jarvis/jarvis_agent/methodology_share_manager.py +21 -15
- jarvis/jarvis_agent/output_handler.py +4 -2
- jarvis/jarvis_agent/prompt_builder.py +7 -6
- jarvis/jarvis_agent/prompt_manager.py +113 -8
- jarvis/jarvis_agent/prompts.py +317 -85
- jarvis/jarvis_agent/protocols.py +5 -2
- jarvis/jarvis_agent/run_loop.py +192 -32
- jarvis/jarvis_agent/session_manager.py +7 -3
- jarvis/jarvis_agent/share_manager.py +23 -13
- jarvis/jarvis_agent/shell_input_handler.py +12 -8
- jarvis/jarvis_agent/stdio_redirect.py +25 -26
- jarvis/jarvis_agent/task_analyzer.py +29 -23
- jarvis/jarvis_agent/task_list.py +869 -0
- jarvis/jarvis_agent/task_manager.py +26 -23
- jarvis/jarvis_agent/tool_executor.py +6 -5
- jarvis/jarvis_agent/tool_share_manager.py +24 -14
- jarvis/jarvis_agent/user_interaction.py +3 -3
- jarvis/jarvis_agent/utils.py +9 -1
- jarvis/jarvis_agent/web_bridge.py +37 -17
- jarvis/jarvis_agent/web_output_sink.py +5 -2
- jarvis/jarvis_agent/web_server.py +165 -36
- jarvis/jarvis_c2rust/__init__.py +1 -1
- jarvis/jarvis_c2rust/cli.py +260 -141
- jarvis/jarvis_c2rust/collector.py +37 -18
- jarvis/jarvis_c2rust/constants.py +60 -0
- jarvis/jarvis_c2rust/library_replacer.py +242 -1010
- jarvis/jarvis_c2rust/library_replacer_checkpoint.py +133 -0
- jarvis/jarvis_c2rust/library_replacer_llm.py +287 -0
- jarvis/jarvis_c2rust/library_replacer_loader.py +191 -0
- jarvis/jarvis_c2rust/library_replacer_output.py +134 -0
- jarvis/jarvis_c2rust/library_replacer_prompts.py +124 -0
- jarvis/jarvis_c2rust/library_replacer_utils.py +188 -0
- jarvis/jarvis_c2rust/llm_module_agent.py +98 -1044
- jarvis/jarvis_c2rust/llm_module_agent_apply.py +170 -0
- jarvis/jarvis_c2rust/llm_module_agent_executor.py +288 -0
- jarvis/jarvis_c2rust/llm_module_agent_loader.py +170 -0
- jarvis/jarvis_c2rust/llm_module_agent_prompts.py +268 -0
- jarvis/jarvis_c2rust/llm_module_agent_types.py +57 -0
- jarvis/jarvis_c2rust/llm_module_agent_utils.py +150 -0
- jarvis/jarvis_c2rust/llm_module_agent_validator.py +119 -0
- jarvis/jarvis_c2rust/loaders.py +28 -10
- jarvis/jarvis_c2rust/models.py +5 -2
- jarvis/jarvis_c2rust/optimizer.py +192 -1974
- jarvis/jarvis_c2rust/optimizer_build_fix.py +286 -0
- jarvis/jarvis_c2rust/optimizer_clippy.py +766 -0
- jarvis/jarvis_c2rust/optimizer_config.py +49 -0
- jarvis/jarvis_c2rust/optimizer_docs.py +183 -0
- jarvis/jarvis_c2rust/optimizer_options.py +48 -0
- jarvis/jarvis_c2rust/optimizer_progress.py +469 -0
- jarvis/jarvis_c2rust/optimizer_report.py +52 -0
- jarvis/jarvis_c2rust/optimizer_unsafe.py +309 -0
- jarvis/jarvis_c2rust/optimizer_utils.py +469 -0
- jarvis/jarvis_c2rust/optimizer_visibility.py +185 -0
- jarvis/jarvis_c2rust/scanner.py +229 -166
- jarvis/jarvis_c2rust/transpiler.py +531 -2732
- jarvis/jarvis_c2rust/transpiler_agents.py +503 -0
- jarvis/jarvis_c2rust/transpiler_build.py +1294 -0
- jarvis/jarvis_c2rust/transpiler_codegen.py +204 -0
- jarvis/jarvis_c2rust/transpiler_compile.py +146 -0
- jarvis/jarvis_c2rust/transpiler_config.py +178 -0
- jarvis/jarvis_c2rust/transpiler_context.py +122 -0
- jarvis/jarvis_c2rust/transpiler_executor.py +516 -0
- jarvis/jarvis_c2rust/transpiler_generation.py +278 -0
- jarvis/jarvis_c2rust/transpiler_git.py +163 -0
- jarvis/jarvis_c2rust/transpiler_mod_utils.py +225 -0
- jarvis/jarvis_c2rust/transpiler_modules.py +336 -0
- jarvis/jarvis_c2rust/transpiler_planning.py +394 -0
- jarvis/jarvis_c2rust/transpiler_review.py +1196 -0
- jarvis/jarvis_c2rust/transpiler_symbols.py +176 -0
- jarvis/jarvis_c2rust/utils.py +269 -79
- jarvis/jarvis_code_agent/after_change.py +233 -0
- jarvis/jarvis_code_agent/build_validation_config.py +37 -30
- jarvis/jarvis_code_agent/builtin_rules.py +68 -0
- jarvis/jarvis_code_agent/code_agent.py +976 -1517
- jarvis/jarvis_code_agent/code_agent_build.py +227 -0
- jarvis/jarvis_code_agent/code_agent_diff.py +246 -0
- jarvis/jarvis_code_agent/code_agent_git.py +525 -0
- jarvis/jarvis_code_agent/code_agent_impact.py +177 -0
- jarvis/jarvis_code_agent/code_agent_lint.py +283 -0
- jarvis/jarvis_code_agent/code_agent_llm.py +159 -0
- jarvis/jarvis_code_agent/code_agent_postprocess.py +105 -0
- jarvis/jarvis_code_agent/code_agent_prompts.py +46 -0
- jarvis/jarvis_code_agent/code_agent_rules.py +305 -0
- jarvis/jarvis_code_agent/code_analyzer/__init__.py +52 -48
- jarvis/jarvis_code_agent/code_analyzer/base_language.py +12 -10
- jarvis/jarvis_code_agent/code_analyzer/build_validator/__init__.py +12 -11
- jarvis/jarvis_code_agent/code_analyzer/build_validator/base.py +16 -12
- jarvis/jarvis_code_agent/code_analyzer/build_validator/cmake.py +26 -17
- jarvis/jarvis_code_agent/code_analyzer/build_validator/detector.py +558 -104
- jarvis/jarvis_code_agent/code_analyzer/build_validator/fallback.py +27 -16
- jarvis/jarvis_code_agent/code_analyzer/build_validator/go.py +22 -18
- jarvis/jarvis_code_agent/code_analyzer/build_validator/java_gradle.py +21 -16
- jarvis/jarvis_code_agent/code_analyzer/build_validator/java_maven.py +20 -16
- jarvis/jarvis_code_agent/code_analyzer/build_validator/makefile.py +27 -16
- jarvis/jarvis_code_agent/code_analyzer/build_validator/nodejs.py +47 -23
- jarvis/jarvis_code_agent/code_analyzer/build_validator/python.py +71 -37
- jarvis/jarvis_code_agent/code_analyzer/build_validator/rust.py +162 -35
- jarvis/jarvis_code_agent/code_analyzer/build_validator/validator.py +111 -57
- jarvis/jarvis_code_agent/code_analyzer/build_validator.py +18 -12
- jarvis/jarvis_code_agent/code_analyzer/context_manager.py +185 -183
- jarvis/jarvis_code_agent/code_analyzer/context_recommender.py +2 -1
- jarvis/jarvis_code_agent/code_analyzer/dependency_analyzer.py +24 -15
- jarvis/jarvis_code_agent/code_analyzer/file_ignore.py +227 -141
- jarvis/jarvis_code_agent/code_analyzer/impact_analyzer.py +321 -247
- jarvis/jarvis_code_agent/code_analyzer/language_registry.py +37 -29
- jarvis/jarvis_code_agent/code_analyzer/language_support.py +21 -13
- jarvis/jarvis_code_agent/code_analyzer/languages/__init__.py +15 -9
- jarvis/jarvis_code_agent/code_analyzer/languages/c_cpp_language.py +75 -45
- jarvis/jarvis_code_agent/code_analyzer/languages/go_language.py +87 -52
- jarvis/jarvis_code_agent/code_analyzer/languages/java_language.py +84 -51
- jarvis/jarvis_code_agent/code_analyzer/languages/javascript_language.py +94 -64
- jarvis/jarvis_code_agent/code_analyzer/languages/python_language.py +109 -71
- jarvis/jarvis_code_agent/code_analyzer/languages/rust_language.py +97 -63
- jarvis/jarvis_code_agent/code_analyzer/languages/typescript_language.py +103 -69
- jarvis/jarvis_code_agent/code_analyzer/llm_context_recommender.py +271 -268
- jarvis/jarvis_code_agent/code_analyzer/symbol_extractor.py +76 -64
- jarvis/jarvis_code_agent/code_analyzer/tree_sitter_extractor.py +92 -19
- jarvis/jarvis_code_agent/diff_visualizer.py +998 -0
- jarvis/jarvis_code_agent/lint.py +223 -524
- jarvis/jarvis_code_agent/rule_share_manager.py +158 -0
- jarvis/jarvis_code_agent/rules/clean_code.md +144 -0
- jarvis/jarvis_code_agent/rules/code_review.md +115 -0
- jarvis/jarvis_code_agent/rules/documentation.md +165 -0
- jarvis/jarvis_code_agent/rules/generate_rules.md +52 -0
- jarvis/jarvis_code_agent/rules/performance.md +158 -0
- jarvis/jarvis_code_agent/rules/refactoring.md +139 -0
- jarvis/jarvis_code_agent/rules/security.md +160 -0
- jarvis/jarvis_code_agent/rules/tdd.md +78 -0
- jarvis/jarvis_code_agent/test_rules/cpp_test.md +118 -0
- jarvis/jarvis_code_agent/test_rules/go_test.md +98 -0
- jarvis/jarvis_code_agent/test_rules/java_test.md +99 -0
- jarvis/jarvis_code_agent/test_rules/javascript_test.md +113 -0
- jarvis/jarvis_code_agent/test_rules/php_test.md +117 -0
- jarvis/jarvis_code_agent/test_rules/python_test.md +91 -0
- jarvis/jarvis_code_agent/test_rules/ruby_test.md +102 -0
- jarvis/jarvis_code_agent/test_rules/rust_test.md +86 -0
- jarvis/jarvis_code_agent/utils.py +36 -26
- jarvis/jarvis_code_analysis/checklists/loader.py +21 -21
- jarvis/jarvis_code_analysis/code_review.py +64 -33
- jarvis/jarvis_data/config_schema.json +285 -192
- jarvis/jarvis_git_squash/main.py +8 -6
- jarvis/jarvis_git_utils/git_commiter.py +53 -76
- jarvis/jarvis_mcp/__init__.py +5 -2
- jarvis/jarvis_mcp/sse_mcp_client.py +40 -30
- jarvis/jarvis_mcp/stdio_mcp_client.py +27 -19
- jarvis/jarvis_mcp/streamable_mcp_client.py +35 -26
- jarvis/jarvis_memory_organizer/memory_organizer.py +78 -55
- jarvis/jarvis_methodology/main.py +48 -39
- jarvis/jarvis_multi_agent/__init__.py +56 -23
- jarvis/jarvis_multi_agent/main.py +15 -18
- jarvis/jarvis_platform/base.py +179 -111
- jarvis/jarvis_platform/human.py +27 -16
- jarvis/jarvis_platform/kimi.py +52 -45
- jarvis/jarvis_platform/openai.py +101 -40
- jarvis/jarvis_platform/registry.py +51 -33
- jarvis/jarvis_platform/tongyi.py +68 -38
- jarvis/jarvis_platform/yuanbao.py +59 -43
- jarvis/jarvis_platform_manager/main.py +68 -76
- jarvis/jarvis_platform_manager/service.py +24 -14
- jarvis/jarvis_rag/README_CONFIG.md +314 -0
- jarvis/jarvis_rag/README_DYNAMIC_LOADING.md +311 -0
- jarvis/jarvis_rag/README_ONLINE_MODELS.md +230 -0
- jarvis/jarvis_rag/__init__.py +57 -4
- jarvis/jarvis_rag/cache.py +3 -1
- jarvis/jarvis_rag/cli.py +48 -68
- jarvis/jarvis_rag/embedding_interface.py +39 -0
- jarvis/jarvis_rag/embedding_manager.py +7 -230
- jarvis/jarvis_rag/embeddings/__init__.py +41 -0
- jarvis/jarvis_rag/embeddings/base.py +114 -0
- jarvis/jarvis_rag/embeddings/cohere.py +66 -0
- jarvis/jarvis_rag/embeddings/edgefn.py +117 -0
- jarvis/jarvis_rag/embeddings/local.py +260 -0
- jarvis/jarvis_rag/embeddings/openai.py +62 -0
- jarvis/jarvis_rag/embeddings/registry.py +293 -0
- jarvis/jarvis_rag/llm_interface.py +8 -6
- jarvis/jarvis_rag/query_rewriter.py +8 -9
- jarvis/jarvis_rag/rag_pipeline.py +61 -52
- jarvis/jarvis_rag/reranker.py +7 -75
- jarvis/jarvis_rag/reranker_interface.py +32 -0
- jarvis/jarvis_rag/rerankers/__init__.py +41 -0
- jarvis/jarvis_rag/rerankers/base.py +109 -0
- jarvis/jarvis_rag/rerankers/cohere.py +67 -0
- jarvis/jarvis_rag/rerankers/edgefn.py +140 -0
- jarvis/jarvis_rag/rerankers/jina.py +79 -0
- jarvis/jarvis_rag/rerankers/local.py +89 -0
- jarvis/jarvis_rag/rerankers/registry.py +293 -0
- jarvis/jarvis_rag/retriever.py +58 -43
- jarvis/jarvis_sec/__init__.py +66 -141
- jarvis/jarvis_sec/agents.py +21 -17
- jarvis/jarvis_sec/analysis.py +80 -33
- jarvis/jarvis_sec/checkers/__init__.py +7 -13
- jarvis/jarvis_sec/checkers/c_checker.py +356 -164
- jarvis/jarvis_sec/checkers/rust_checker.py +47 -29
- jarvis/jarvis_sec/cli.py +43 -21
- jarvis/jarvis_sec/clustering.py +430 -272
- jarvis/jarvis_sec/file_manager.py +99 -55
- jarvis/jarvis_sec/parsers.py +9 -6
- jarvis/jarvis_sec/prompts.py +4 -3
- jarvis/jarvis_sec/report.py +44 -22
- jarvis/jarvis_sec/review.py +180 -107
- jarvis/jarvis_sec/status.py +50 -41
- jarvis/jarvis_sec/types.py +3 -0
- jarvis/jarvis_sec/utils.py +160 -83
- jarvis/jarvis_sec/verification.py +411 -181
- jarvis/jarvis_sec/workflow.py +132 -21
- jarvis/jarvis_smart_shell/main.py +28 -41
- jarvis/jarvis_stats/cli.py +14 -12
- jarvis/jarvis_stats/stats.py +28 -19
- jarvis/jarvis_stats/storage.py +14 -8
- jarvis/jarvis_stats/visualizer.py +12 -7
- jarvis/jarvis_tools/base.py +5 -2
- jarvis/jarvis_tools/clear_memory.py +13 -9
- jarvis/jarvis_tools/cli/main.py +23 -18
- jarvis/jarvis_tools/edit_file.py +572 -873
- jarvis/jarvis_tools/execute_script.py +10 -7
- jarvis/jarvis_tools/file_analyzer.py +7 -8
- jarvis/jarvis_tools/meta_agent.py +287 -0
- jarvis/jarvis_tools/methodology.py +5 -3
- jarvis/jarvis_tools/read_code.py +305 -1438
- jarvis/jarvis_tools/read_symbols.py +50 -17
- jarvis/jarvis_tools/read_webpage.py +19 -18
- jarvis/jarvis_tools/registry.py +435 -156
- jarvis/jarvis_tools/retrieve_memory.py +16 -11
- jarvis/jarvis_tools/save_memory.py +8 -6
- jarvis/jarvis_tools/search_web.py +31 -31
- jarvis/jarvis_tools/sub_agent.py +32 -28
- jarvis/jarvis_tools/sub_code_agent.py +44 -60
- jarvis/jarvis_tools/task_list_manager.py +1811 -0
- jarvis/jarvis_tools/virtual_tty.py +29 -19
- jarvis/jarvis_utils/__init__.py +4 -0
- jarvis/jarvis_utils/builtin_replace_map.py +2 -1
- jarvis/jarvis_utils/clipboard.py +9 -8
- jarvis/jarvis_utils/collections.py +331 -0
- jarvis/jarvis_utils/config.py +699 -194
- jarvis/jarvis_utils/dialogue_recorder.py +294 -0
- jarvis/jarvis_utils/embedding.py +6 -3
- jarvis/jarvis_utils/file_processors.py +7 -1
- jarvis/jarvis_utils/fzf.py +9 -3
- jarvis/jarvis_utils/git_utils.py +71 -42
- jarvis/jarvis_utils/globals.py +116 -32
- jarvis/jarvis_utils/http.py +6 -2
- jarvis/jarvis_utils/input.py +318 -83
- jarvis/jarvis_utils/jsonnet_compat.py +119 -104
- jarvis/jarvis_utils/methodology.py +37 -28
- jarvis/jarvis_utils/output.py +201 -44
- jarvis/jarvis_utils/utils.py +986 -628
- {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/METADATA +49 -33
- jarvis_ai_assistant-1.0.2.dist-info/RECORD +304 -0
- jarvis/jarvis_code_agent/code_analyzer/structured_code.py +0 -556
- jarvis/jarvis_tools/generate_new_tool.py +0 -205
- jarvis/jarvis_tools/lsp_client.py +0 -1552
- jarvis/jarvis_tools/rewrite_file.py +0 -105
- jarvis_ai_assistant-0.7.16.dist-info/RECORD +0 -218
- {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/entry_points.txt +0 -0
- {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/licenses/LICENSE +0 -0
- {jarvis_ai_assistant-0.7.16.dist-info → jarvis_ai_assistant-1.0.2.dist-info}/top_level.txt +0 -0
jarvis/jarvis_sec/clustering.py
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
"""聚类相关模块"""
|
|
3
3
|
|
|
4
|
-
from typing import Dict, List, Optional
|
|
5
|
-
from pathlib import Path
|
|
6
4
|
import json
|
|
7
|
-
import
|
|
8
|
-
|
|
9
|
-
from
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict
|
|
7
|
+
from typing import List
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from jarvis.jarvis_sec.agents import create_cluster_agent
|
|
12
|
+
from jarvis.jarvis_sec.agents import subscribe_summary_event
|
|
13
|
+
from jarvis.jarvis_sec.file_manager import get_all_clustered_gids
|
|
14
|
+
from jarvis.jarvis_sec.file_manager import get_clusters_file
|
|
15
|
+
from jarvis.jarvis_sec.file_manager import load_clusters
|
|
16
|
+
from jarvis.jarvis_sec.file_manager import save_cluster
|
|
17
|
+
from jarvis.jarvis_sec.file_manager import validate_clustering_completeness
|
|
10
18
|
from jarvis.jarvis_sec.parsers import parse_clusters_from_text
|
|
11
|
-
from jarvis.jarvis_sec.
|
|
12
|
-
from jarvis.jarvis_sec.utils import
|
|
13
|
-
|
|
14
|
-
)
|
|
15
|
-
from jarvis.jarvis_sec.file_manager import (
|
|
16
|
-
load_clusters,
|
|
17
|
-
save_cluster,
|
|
18
|
-
get_all_clustered_gids,
|
|
19
|
-
validate_clustering_completeness,
|
|
20
|
-
get_clusters_file,
|
|
21
|
-
)
|
|
19
|
+
from jarvis.jarvis_sec.prompts import get_cluster_summary_prompt
|
|
20
|
+
from jarvis.jarvis_sec.utils import group_candidates_by_file
|
|
21
|
+
from jarvis.jarvis_utils.output import PrettyOutput
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def load_existing_clusters(
|
|
@@ -26,33 +26,36 @@ def load_existing_clusters(
|
|
|
26
26
|
) -> tuple[Dict[tuple[str, int], List[Dict]], set, set]:
|
|
27
27
|
"""
|
|
28
28
|
读取已有聚类报告以支持断点恢复。
|
|
29
|
-
|
|
29
|
+
|
|
30
30
|
优先使用新的 clusters.jsonl 文件,如果不存在则回退到旧的 cluster_report.jsonl。
|
|
31
|
-
|
|
31
|
+
|
|
32
32
|
返回: (_existing_clusters, _completed_cluster_batches, _reviewed_invalid_gids)
|
|
33
33
|
"""
|
|
34
34
|
_existing_clusters: Dict[tuple[str, int], List[Dict]] = {}
|
|
35
35
|
_completed_cluster_batches: set = set()
|
|
36
36
|
_reviewed_invalid_gids: set = set() # 已复核的无效聚类的 gids
|
|
37
|
-
|
|
37
|
+
|
|
38
38
|
try:
|
|
39
39
|
# 优先使用新的 clusters.jsonl 文件
|
|
40
40
|
clusters = load_clusters(sec_dir)
|
|
41
|
-
|
|
41
|
+
|
|
42
42
|
if clusters:
|
|
43
43
|
# 从新的 clusters.jsonl 加载
|
|
44
44
|
for cluster in clusters:
|
|
45
45
|
f_name = str(cluster.get("file") or "")
|
|
46
46
|
bidx = int(cluster.get("batch_index", 1) or 1)
|
|
47
47
|
_existing_clusters.setdefault((f_name, bidx), []).append(cluster)
|
|
48
|
-
|
|
48
|
+
|
|
49
49
|
# 从分析结果文件中读取已复核的无效聚类
|
|
50
50
|
# 如果聚类是无效的,且其gids都在分析结果中被标记为误报,则认为已复核
|
|
51
51
|
if cluster.get("is_invalid", False):
|
|
52
52
|
gids_list = cluster.get("gids", [])
|
|
53
53
|
if isinstance(gids_list, list):
|
|
54
54
|
# 检查这些gid是否都在分析结果中被标记为误报
|
|
55
|
-
from jarvis.jarvis_sec.file_manager import
|
|
55
|
+
from jarvis.jarvis_sec.file_manager import (
|
|
56
|
+
get_false_positive_gids,
|
|
57
|
+
)
|
|
58
|
+
|
|
56
59
|
false_positive_gids = get_false_positive_gids(sec_dir)
|
|
57
60
|
all_false_positive = all(
|
|
58
61
|
int(gid_val) in false_positive_gids
|
|
@@ -72,7 +75,7 @@ def load_existing_clusters(
|
|
|
72
75
|
_existing_clusters = {}
|
|
73
76
|
_completed_cluster_batches = set()
|
|
74
77
|
_reviewed_invalid_gids = set()
|
|
75
|
-
|
|
78
|
+
|
|
76
79
|
return _existing_clusters, _completed_cluster_batches, _reviewed_invalid_gids
|
|
77
80
|
|
|
78
81
|
|
|
@@ -83,7 +86,7 @@ def restore_clusters_from_checkpoint(
|
|
|
83
86
|
) -> tuple[List[List[Dict]], List[Dict], List[Dict], set]:
|
|
84
87
|
"""
|
|
85
88
|
从断点恢复聚类结果。
|
|
86
|
-
|
|
89
|
+
|
|
87
90
|
返回: (cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids)
|
|
88
91
|
"""
|
|
89
92
|
# 1. 收集所有候选的 gid
|
|
@@ -98,15 +101,17 @@ def restore_clusters_from_checkpoint(
|
|
|
98
101
|
gid_to_candidate[_gid] = it
|
|
99
102
|
except Exception:
|
|
100
103
|
pass
|
|
101
|
-
|
|
104
|
+
|
|
102
105
|
# 2. 从 cluster_report.jsonl 恢复所有聚类结果
|
|
103
|
-
clustered_gids =
|
|
106
|
+
clustered_gids = (
|
|
107
|
+
set()
|
|
108
|
+
) # 已聚类的 gid(包括有效和无效的,因为无效的也需要进入复核阶段)
|
|
104
109
|
invalid_clusters_for_review: List[Dict] = [] # 无效聚类列表(从断点恢复)
|
|
105
110
|
cluster_batches: List[List[Dict]] = []
|
|
106
111
|
cluster_records: List[Dict] = []
|
|
107
112
|
skipped_reviewed_count = 0 # 已复核的无效聚类数量(跳过)
|
|
108
113
|
missing_gids_in_restore = set() # 记录恢复时无法匹配的gid(用于诊断)
|
|
109
|
-
|
|
114
|
+
|
|
110
115
|
# 首先,从所有聚类记录中收集所有已聚类的 gid(无论是否在当前候选集中)
|
|
111
116
|
# 这样可以确保即使匹配失败,只要 gid 在 clusters.jsonl 中且在当前候选集中,就会被计入 clustered_gids
|
|
112
117
|
all_clustered_gids_from_file = set()
|
|
@@ -121,13 +126,13 @@ def restore_clusters_from_checkpoint(
|
|
|
121
126
|
all_clustered_gids_from_file.add(_gid_int)
|
|
122
127
|
except Exception:
|
|
123
128
|
pass
|
|
124
|
-
|
|
129
|
+
|
|
125
130
|
# 对于所有在 clusters.jsonl 中记录的 gid,如果它们也在当前候选集中,就计入 clustered_gids
|
|
126
131
|
# 这样可以避免因为匹配失败而导致的遗漏
|
|
127
132
|
for _gid_int in all_clustered_gids_from_file:
|
|
128
133
|
if _gid_int in all_candidate_gids_in_clustering:
|
|
129
134
|
clustered_gids.add(_gid_int)
|
|
130
|
-
|
|
135
|
+
|
|
131
136
|
# 然后,尝试恢复具体的聚类信息(用于恢复 cluster_batches 和 invalid_clusters_for_review)
|
|
132
137
|
for (_file_key, _batch_idx), cluster_recs in _existing_clusters.items():
|
|
133
138
|
for rec in cluster_recs:
|
|
@@ -168,7 +173,7 @@ def restore_clusters_from_checkpoint(
|
|
|
168
173
|
pass
|
|
169
174
|
if found_candidate:
|
|
170
175
|
break
|
|
171
|
-
|
|
176
|
+
|
|
172
177
|
if found_candidate:
|
|
173
178
|
# 找到了对应的候选,添加到members中
|
|
174
179
|
found_candidate["verify"] = verification
|
|
@@ -186,7 +191,7 @@ def restore_clusters_from_checkpoint(
|
|
|
186
191
|
pass
|
|
187
192
|
except Exception:
|
|
188
193
|
pass
|
|
189
|
-
|
|
194
|
+
|
|
190
195
|
# 只有当至少有一个gid在当前候选集中时,才恢复这个聚类
|
|
191
196
|
# 如果所有gid都不在当前候选集中,说明这些gid对应的候选在当前运行中不存在
|
|
192
197
|
# 这种情况下,不应该恢复这个聚类,因为这些gid不在当前运行中
|
|
@@ -198,49 +203,63 @@ def restore_clusters_from_checkpoint(
|
|
|
198
203
|
cluster_gids_int = set()
|
|
199
204
|
for gid_val in cluster_gids:
|
|
200
205
|
try:
|
|
206
|
+
if gid_val is None:
|
|
207
|
+
continue
|
|
201
208
|
gid_int = int(gid_val)
|
|
202
209
|
if gid_int >= 1:
|
|
203
210
|
cluster_gids_int.add(gid_int)
|
|
204
211
|
except Exception:
|
|
205
212
|
pass
|
|
206
213
|
# 检查所有 gid 是否都已被复核过
|
|
207
|
-
all_reviewed = cluster_gids_int and cluster_gids_int.issubset(
|
|
208
|
-
|
|
214
|
+
all_reviewed = cluster_gids_int and cluster_gids_int.issubset(
|
|
215
|
+
_reviewed_invalid_gids
|
|
216
|
+
)
|
|
217
|
+
|
|
209
218
|
if not all_reviewed:
|
|
210
219
|
# 如果还有未复核的 gid,收集到复核列表
|
|
211
|
-
invalid_clusters_for_review.append(
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
+
invalid_clusters_for_review.append(
|
|
221
|
+
{
|
|
222
|
+
"file": _file_key,
|
|
223
|
+
"batch_index": _batch_idx,
|
|
224
|
+
"gids": cluster_gids,
|
|
225
|
+
"verification": verification,
|
|
226
|
+
"invalid_reason": str(
|
|
227
|
+
rec.get("invalid_reason", "")
|
|
228
|
+
).strip(),
|
|
229
|
+
"members": members, # 保存候选信息,用于复核后可能重新加入验证
|
|
230
|
+
"count": len(members),
|
|
231
|
+
}
|
|
232
|
+
)
|
|
220
233
|
else:
|
|
221
234
|
# 如果所有 gid 都已被复核过,则跳过(不加入复核列表)
|
|
222
235
|
skipped_reviewed_count += 1
|
|
223
236
|
else:
|
|
224
237
|
# 有效聚类:恢复到 cluster_batches
|
|
225
238
|
cluster_batches.append(members)
|
|
226
|
-
cluster_records.append(
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
239
|
+
cluster_records.append(
|
|
240
|
+
{
|
|
241
|
+
"file": _file_key,
|
|
242
|
+
"verification": verification,
|
|
243
|
+
"gids": [m.get("gid") for m in members],
|
|
244
|
+
"count": len(members),
|
|
245
|
+
"batch_index": _batch_idx,
|
|
246
|
+
"is_invalid": False,
|
|
247
|
+
}
|
|
248
|
+
)
|
|
249
|
+
|
|
235
250
|
# 输出统计信息
|
|
236
251
|
if _reviewed_invalid_gids:
|
|
237
252
|
try:
|
|
238
|
-
|
|
253
|
+
PrettyOutput.auto_print(
|
|
254
|
+
f"[jarvis-sec] 断点恢复:发现 {len(_reviewed_invalid_gids)} 个已复核的无效聚类 gids",
|
|
255
|
+
)
|
|
239
256
|
except Exception:
|
|
240
257
|
pass
|
|
241
258
|
if skipped_reviewed_count > 0:
|
|
242
259
|
try:
|
|
243
|
-
|
|
260
|
+
PrettyOutput.auto_print(
|
|
261
|
+
f"[jarvis-sec] 断点恢复:跳过 {skipped_reviewed_count} 个已复核的无效聚类",
|
|
262
|
+
)
|
|
244
263
|
except Exception:
|
|
245
264
|
pass
|
|
246
265
|
if missing_gids_in_restore:
|
|
@@ -251,31 +270,42 @@ def restore_clusters_from_checkpoint(
|
|
|
251
270
|
try:
|
|
252
271
|
if missing_count <= 20:
|
|
253
272
|
missing_list = sorted(list(missing_gids_in_restore))
|
|
254
|
-
|
|
273
|
+
PrettyOutput.auto_print(
|
|
274
|
+
f"[jarvis-sec] 断点恢复诊断:发现 {missing_count} 个gid在当前候选集中但无法匹配(可能存在数据不一致): {missing_list}",
|
|
275
|
+
)
|
|
255
276
|
else:
|
|
256
277
|
missing_list = sorted(list(missing_gids_in_restore))
|
|
257
278
|
display_list = missing_list[:10] + ["..."] + missing_list[-10:]
|
|
258
|
-
|
|
279
|
+
PrettyOutput.auto_print(
|
|
280
|
+
f"[jarvis-sec] 断点恢复诊断:发现 {missing_count} 个gid在当前候选集中但无法匹配(可能存在数据不一致): {display_list}",
|
|
281
|
+
)
|
|
259
282
|
except Exception:
|
|
260
283
|
pass
|
|
261
|
-
|
|
284
|
+
|
|
262
285
|
return cluster_batches, cluster_records, invalid_clusters_for_review, clustered_gids
|
|
263
286
|
|
|
264
287
|
|
|
265
|
-
def create_cluster_snapshot_writer(
|
|
288
|
+
def create_cluster_snapshot_writer(
|
|
289
|
+
sec_dir: Path,
|
|
290
|
+
cluster_records: List[Dict],
|
|
291
|
+
compact_candidates: List[Dict],
|
|
292
|
+
_progress_append,
|
|
293
|
+
):
|
|
266
294
|
"""创建聚类快照写入函数"""
|
|
295
|
+
|
|
267
296
|
def _write_cluster_batch_snapshot(batch_records: List[Dict]):
|
|
268
297
|
"""写入单个批次的聚类结果,支持增量保存"""
|
|
269
298
|
try:
|
|
270
299
|
# 按 (file, batch_index) 分组,为每个分组内的记录生成唯一的 cluster_index
|
|
271
300
|
from collections import defaultdict
|
|
301
|
+
|
|
272
302
|
records_by_key = defaultdict(list)
|
|
273
303
|
for record in batch_records:
|
|
274
304
|
file_name = str(record.get("file", ""))
|
|
275
305
|
batch_index = int(record.get("batch_index", 0))
|
|
276
306
|
key = (file_name, batch_index)
|
|
277
307
|
records_by_key[key].append(record)
|
|
278
|
-
|
|
308
|
+
|
|
279
309
|
# 为每个分组内的记录生成 cluster_index
|
|
280
310
|
for (file_name, batch_index), records in records_by_key.items():
|
|
281
311
|
for local_idx, record in enumerate(records):
|
|
@@ -285,9 +315,9 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
|
|
|
285
315
|
cluster_index = local_idx
|
|
286
316
|
else:
|
|
287
317
|
cluster_index = int(cluster_index)
|
|
288
|
-
|
|
318
|
+
|
|
289
319
|
cluster_id = f"{file_name}|{batch_index}|{cluster_index}"
|
|
290
|
-
|
|
320
|
+
|
|
291
321
|
# 转换为新的格式
|
|
292
322
|
cluster = {
|
|
293
323
|
"cluster_id": cluster_id,
|
|
@@ -299,12 +329,12 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
|
|
|
299
329
|
"is_invalid": record.get("is_invalid", False),
|
|
300
330
|
"invalid_reason": str(record.get("invalid_reason", "")).strip(),
|
|
301
331
|
}
|
|
302
|
-
|
|
332
|
+
|
|
303
333
|
# 使用新的文件管理器保存
|
|
304
334
|
save_cluster(sec_dir, cluster)
|
|
305
335
|
except Exception:
|
|
306
336
|
pass
|
|
307
|
-
|
|
337
|
+
|
|
308
338
|
def _write_cluster_report_snapshot():
|
|
309
339
|
"""写入聚类报告快照"""
|
|
310
340
|
try:
|
|
@@ -314,7 +344,7 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
|
|
|
314
344
|
batch_index = int(record.get("batch_index", 0))
|
|
315
345
|
cluster_index = idx # 使用索引作为 cluster_index
|
|
316
346
|
cluster_id = f"{file_name}|{batch_index}|{cluster_index}"
|
|
317
|
-
|
|
347
|
+
|
|
318
348
|
# 转换为新的格式
|
|
319
349
|
cluster = {
|
|
320
350
|
"cluster_id": cluster_id,
|
|
@@ -326,7 +356,7 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
|
|
|
326
356
|
"is_invalid": record.get("is_invalid", False),
|
|
327
357
|
"invalid_reason": str(record.get("invalid_reason", "")).strip(),
|
|
328
358
|
}
|
|
329
|
-
|
|
359
|
+
|
|
330
360
|
# 使用新的文件管理器保存
|
|
331
361
|
save_cluster(sec_dir, cluster)
|
|
332
362
|
|
|
@@ -340,7 +370,7 @@ def create_cluster_snapshot_writer(sec_dir: Path, cluster_records: List[Dict], c
|
|
|
340
370
|
)
|
|
341
371
|
except Exception:
|
|
342
372
|
pass
|
|
343
|
-
|
|
373
|
+
|
|
344
374
|
return _write_cluster_batch_snapshot, _write_cluster_report_snapshot
|
|
345
375
|
|
|
346
376
|
|
|
@@ -358,7 +388,9 @@ def collect_candidate_gids(file_groups: Dict[str, List[Dict]]) -> set:
|
|
|
358
388
|
return all_gids
|
|
359
389
|
|
|
360
390
|
|
|
361
|
-
def collect_clustered_gids(
|
|
391
|
+
def collect_clustered_gids(
|
|
392
|
+
cluster_batches: List[List[Dict]], invalid_clusters_for_review: List[Dict]
|
|
393
|
+
) -> set:
|
|
362
394
|
"""收集所有已聚类的 gid"""
|
|
363
395
|
all_clustered_gids = set()
|
|
364
396
|
for batch in cluster_batches:
|
|
@@ -394,20 +426,21 @@ def filter_single_gid_clusters(
|
|
|
394
426
|
) -> List[List[Dict]]:
|
|
395
427
|
"""
|
|
396
428
|
过滤掉单独聚类的批次(只包含1个gid的批次),避免分析工作量激增。
|
|
397
|
-
|
|
429
|
+
|
|
398
430
|
这些单独聚类通常是之前为遗漏的gid自动创建的,现在不再需要。
|
|
399
431
|
"""
|
|
400
432
|
filtered_batches = []
|
|
401
433
|
removed_count = 0
|
|
402
434
|
removed_gids = set()
|
|
403
|
-
|
|
435
|
+
|
|
404
436
|
# 读取已分析的gid(从analysis.jsonl)
|
|
405
437
|
from jarvis.jarvis_sec.file_manager import get_all_analyzed_gids
|
|
438
|
+
|
|
406
439
|
processed_gids = get_all_analyzed_gids(sec_dir)
|
|
407
|
-
|
|
440
|
+
|
|
408
441
|
# 读取clusters.jsonl中的所有gid
|
|
409
442
|
cluster_report_gids = get_all_clustered_gids(sec_dir)
|
|
410
|
-
|
|
443
|
+
|
|
411
444
|
for batch in cluster_batches:
|
|
412
445
|
# 检查批次大小
|
|
413
446
|
if len(batch) == 1:
|
|
@@ -420,17 +453,21 @@ def filter_single_gid_clusters(
|
|
|
420
453
|
if gid in processed_gids:
|
|
421
454
|
removed_count += 1
|
|
422
455
|
removed_gids.add(gid)
|
|
423
|
-
_progress_append(
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
456
|
+
_progress_append(
|
|
457
|
+
{
|
|
458
|
+
"event": "single_cluster_removed",
|
|
459
|
+
"gid": gid,
|
|
460
|
+
"reason": "already_analyzed",
|
|
461
|
+
}
|
|
462
|
+
)
|
|
428
463
|
continue
|
|
429
|
-
|
|
464
|
+
|
|
430
465
|
# 检查verification字段,如果是默认的"验证候选 X 的安全风险",说明是自动创建的单独聚类
|
|
431
466
|
verification = str(single_item.get("verify", "")).strip()
|
|
432
|
-
is_auto_created = verification.startswith(
|
|
433
|
-
|
|
467
|
+
is_auto_created = verification.startswith(
|
|
468
|
+
"验证候选 "
|
|
469
|
+
) and verification.endswith(" 的安全风险")
|
|
470
|
+
|
|
434
471
|
if is_auto_created:
|
|
435
472
|
# 这是自动创建的单独聚类
|
|
436
473
|
# 如果gid在clusters.jsonl中有记录,说明已经聚类过了,可以安全移除
|
|
@@ -438,18 +475,22 @@ def filter_single_gid_clusters(
|
|
|
438
475
|
if gid in cluster_report_gids:
|
|
439
476
|
removed_count += 1
|
|
440
477
|
removed_gids.add(gid)
|
|
441
|
-
_progress_append(
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
478
|
+
_progress_append(
|
|
479
|
+
{
|
|
480
|
+
"event": "single_cluster_removed",
|
|
481
|
+
"gid": gid,
|
|
482
|
+
"reason": "auto_created_and_in_clusters",
|
|
483
|
+
}
|
|
484
|
+
)
|
|
446
485
|
continue
|
|
447
486
|
else:
|
|
448
487
|
# 自动创建的单独聚类,但不在clusters.jsonl中,也不在analysis.jsonl中
|
|
449
488
|
# 说明需要分析,保留它(避免遗漏告警)
|
|
450
489
|
# 但给出警告,因为这种情况不应该发生
|
|
451
490
|
try:
|
|
452
|
-
|
|
491
|
+
PrettyOutput.auto_print(
|
|
492
|
+
f"[jarvis-sec] 警告:gid={gid}是自动创建的单独聚类,但不在clusters.jsonl中,保留以避免遗漏告警",
|
|
493
|
+
)
|
|
453
494
|
except Exception:
|
|
454
495
|
pass
|
|
455
496
|
else:
|
|
@@ -458,23 +499,33 @@ def filter_single_gid_clusters(
|
|
|
458
499
|
pass
|
|
459
500
|
except Exception:
|
|
460
501
|
pass
|
|
461
|
-
|
|
502
|
+
|
|
462
503
|
# 保留这个批次(不是单独聚类,或者单独聚类但需要保留)
|
|
463
504
|
filtered_batches.append(batch)
|
|
464
|
-
|
|
505
|
+
|
|
465
506
|
if removed_count > 0:
|
|
466
507
|
try:
|
|
467
508
|
if len(removed_gids) <= 20:
|
|
468
|
-
|
|
469
|
-
|
|
509
|
+
PrettyOutput.auto_print(
|
|
510
|
+
f"[jarvis-sec] 已移除 {removed_count} 个单独聚类批次(共{len(removed_gids)}个gid),避免分析工作量激增",
|
|
511
|
+
)
|
|
512
|
+
PrettyOutput.auto_print(
|
|
513
|
+
f"[jarvis-sec] 移除的gid: {sorted(list(removed_gids))}",
|
|
514
|
+
)
|
|
470
515
|
else:
|
|
471
516
|
removed_gids_list = sorted(list(removed_gids))
|
|
472
|
-
display_list =
|
|
473
|
-
|
|
474
|
-
|
|
517
|
+
display_list = (
|
|
518
|
+
removed_gids_list[:10] + ["..."] + removed_gids_list[-10:]
|
|
519
|
+
)
|
|
520
|
+
PrettyOutput.auto_print(
|
|
521
|
+
f"[jarvis-sec] 已移除 {removed_count} 个单独聚类批次(共{len(removed_gids)}个gid),避免分析工作量激增",
|
|
522
|
+
)
|
|
523
|
+
PrettyOutput.auto_print(
|
|
524
|
+
f"[jarvis-sec] 移除的gid(示例): {display_list}",
|
|
525
|
+
)
|
|
475
526
|
except Exception:
|
|
476
527
|
pass
|
|
477
|
-
|
|
528
|
+
|
|
478
529
|
return filtered_batches
|
|
479
530
|
|
|
480
531
|
|
|
@@ -512,65 +563,76 @@ def handle_single_alert_file(
|
|
|
512
563
|
}
|
|
513
564
|
)
|
|
514
565
|
current_batch_records = [
|
|
515
|
-
rec
|
|
566
|
+
rec
|
|
567
|
+
for rec in cluster_records
|
|
516
568
|
if rec.get("file") == file and rec.get("batch_index") == 1
|
|
517
569
|
]
|
|
518
570
|
if current_batch_records:
|
|
519
571
|
_write_cluster_batch_snapshot(current_batch_records)
|
|
520
|
-
|
|
572
|
+
PrettyOutput.auto_print(
|
|
573
|
+
f"[jarvis-sec] 文件 {file} 仅有一个告警(gid={single_gid}),跳过聚类直接写入",
|
|
574
|
+
)
|
|
521
575
|
|
|
522
576
|
|
|
523
577
|
def validate_cluster_format(cluster_items: List[Dict]) -> tuple[bool, List[str]]:
|
|
524
578
|
"""验证聚类结果的格式,返回(是否有效, 错误详情列表)"""
|
|
525
579
|
if not isinstance(cluster_items, list) or not cluster_items:
|
|
526
580
|
return False, ["结果不是数组或数组为空"]
|
|
527
|
-
|
|
581
|
+
|
|
528
582
|
error_details = []
|
|
529
583
|
for idx, it in enumerate(cluster_items):
|
|
530
584
|
if not isinstance(it, dict):
|
|
531
585
|
error_details.append(f"元素{idx}不是字典")
|
|
532
586
|
return False, error_details
|
|
533
|
-
|
|
587
|
+
|
|
534
588
|
vals = it.get("gids", [])
|
|
535
|
-
if not isinstance(it.get("verification", ""), str) or not isinstance(
|
|
589
|
+
if not isinstance(it.get("verification", ""), str) or not isinstance(
|
|
590
|
+
vals, list
|
|
591
|
+
):
|
|
536
592
|
error_details.append(f"元素{idx}的verification或gids格式错误")
|
|
537
593
|
return False, error_details
|
|
538
|
-
|
|
594
|
+
|
|
539
595
|
# 校验 gids 列表中的每个元素是否都是有效的整数
|
|
540
596
|
if isinstance(vals, list):
|
|
541
597
|
for gid_idx, gid_val in enumerate(vals):
|
|
542
598
|
try:
|
|
543
599
|
gid_int = int(gid_val)
|
|
544
600
|
if gid_int < 1:
|
|
545
|
-
error_details.append(
|
|
601
|
+
error_details.append(
|
|
602
|
+
f"元素{idx}的gids[{gid_idx}]不是有效的正整数(值为{gid_val})"
|
|
603
|
+
)
|
|
546
604
|
return False, error_details
|
|
547
605
|
except (ValueError, TypeError):
|
|
548
|
-
error_details.append(
|
|
606
|
+
error_details.append(
|
|
607
|
+
f"元素{idx}的gids[{gid_idx}]不是有效的整数(值为{gid_val},类型为{type(gid_val).__name__})"
|
|
608
|
+
)
|
|
549
609
|
return False, error_details
|
|
550
|
-
|
|
610
|
+
|
|
551
611
|
# 校验 is_invalid 字段(必填)
|
|
552
612
|
if "is_invalid" not in it:
|
|
553
613
|
error_details.append(f"元素{idx}缺少is_invalid字段(必填)")
|
|
554
614
|
return False, error_details
|
|
555
|
-
|
|
615
|
+
|
|
556
616
|
is_invalid_val = it.get("is_invalid")
|
|
557
617
|
if not isinstance(is_invalid_val, bool):
|
|
558
618
|
error_details.append(f"元素{idx}的is_invalid不是布尔值")
|
|
559
619
|
return False, error_details
|
|
560
|
-
|
|
620
|
+
|
|
561
621
|
# 如果is_invalid为true,必须提供invalid_reason
|
|
562
622
|
if is_invalid_val is True:
|
|
563
623
|
invalid_reason = it.get("invalid_reason", "")
|
|
564
624
|
if not isinstance(invalid_reason, str) or not invalid_reason.strip():
|
|
565
|
-
error_details.append(
|
|
625
|
+
error_details.append(
|
|
626
|
+
f"元素{idx}的is_invalid为true但缺少invalid_reason字段或理由为空(必填)"
|
|
627
|
+
)
|
|
566
628
|
return False, error_details
|
|
567
|
-
|
|
629
|
+
|
|
568
630
|
return True, []
|
|
569
631
|
|
|
570
632
|
|
|
571
633
|
def extract_classified_gids(cluster_items: List[Dict]) -> set:
|
|
572
634
|
"""从聚类结果中提取所有已分类的gid
|
|
573
|
-
|
|
635
|
+
|
|
574
636
|
注意:此函数假设格式验证已经通过,所有gid都是有效的整数。
|
|
575
637
|
如果遇到格式错误的gid,会记录警告但不会抛出异常(因为格式验证应该已经捕获了这些问题)。
|
|
576
638
|
"""
|
|
@@ -586,7 +648,9 @@ def extract_classified_gids(cluster_items: List[Dict]) -> set:
|
|
|
586
648
|
except (ValueError, TypeError):
|
|
587
649
|
# 理论上不应该到达这里(格式验证应该已经捕获),但如果到达了,记录警告
|
|
588
650
|
try:
|
|
589
|
-
|
|
651
|
+
PrettyOutput.auto_print(
|
|
652
|
+
f"[jarvis-sec] 警告:在提取gid时遇到格式错误(值={x},类型={type(x).__name__}),这不应该发生(格式验证应该已捕获)",
|
|
653
|
+
)
|
|
590
654
|
except Exception:
|
|
591
655
|
pass
|
|
592
656
|
continue
|
|
@@ -608,9 +672,14 @@ def build_cluster_retry_task(
|
|
|
608
672
|
if missing_gids:
|
|
609
673
|
missing_gids_list = sorted(list(missing_gids))
|
|
610
674
|
missing_count = len(missing_gids)
|
|
611
|
-
retry_task +=
|
|
675
|
+
retry_task += (
|
|
676
|
+
f"\n\n**遗漏的gid(共{missing_count}个,必须被分类):**\n"
|
|
677
|
+
+ ", ".join(str(gid) for gid in missing_gids_list)
|
|
678
|
+
)
|
|
612
679
|
if error_details:
|
|
613
|
-
retry_task += "\n\n**格式错误:**\n" + "\n".join(
|
|
680
|
+
retry_task += "\n\n**格式错误:**\n" + "\n".join(
|
|
681
|
+
f"- {detail}" for detail in error_details
|
|
682
|
+
)
|
|
614
683
|
return retry_task
|
|
615
684
|
|
|
616
685
|
|
|
@@ -621,11 +690,17 @@ def build_cluster_error_guidance(
|
|
|
621
690
|
"""构建聚类错误指导信息"""
|
|
622
691
|
error_guidance = ""
|
|
623
692
|
if error_details:
|
|
624
|
-
error_guidance =
|
|
693
|
+
error_guidance = (
|
|
694
|
+
"\n\n**格式错误详情(请根据以下错误修复输出格式):**\n"
|
|
695
|
+
+ "\n".join(f"- {detail}" for detail in error_details)
|
|
696
|
+
)
|
|
625
697
|
if missing_gids:
|
|
626
698
|
missing_gids_list = sorted(list(missing_gids))
|
|
627
699
|
missing_count = len(missing_gids)
|
|
628
|
-
error_guidance +=
|
|
700
|
+
error_guidance += (
|
|
701
|
+
f"\n\n**完整性错误:遗漏了 {missing_count} 个 gid,这些 gid 必须被分类:**\n"
|
|
702
|
+
+ ", ".join(str(gid) for gid in missing_gids_list)
|
|
703
|
+
)
|
|
629
704
|
return error_guidance
|
|
630
705
|
|
|
631
706
|
|
|
@@ -643,11 +718,13 @@ def run_cluster_agent_direct_model(
|
|
|
643
718
|
error_guidance = build_cluster_error_guidance(error_details, missing_gids)
|
|
644
719
|
full_prompt = f"{retry_task}{error_guidance}\n\n{cluster_summary_prompt}"
|
|
645
720
|
try:
|
|
646
|
-
response = cluster_agent.model.chat_until_success(full_prompt)
|
|
721
|
+
response = cluster_agent.model.chat_until_success(full_prompt)
|
|
647
722
|
_cluster_summary["text"] = response
|
|
648
723
|
except Exception as e:
|
|
649
724
|
try:
|
|
650
|
-
|
|
725
|
+
PrettyOutput.auto_print(
|
|
726
|
+
f"[jarvis-sec] 直接模型调用失败: {e},回退到 run()",
|
|
727
|
+
)
|
|
651
728
|
except Exception:
|
|
652
729
|
pass
|
|
653
730
|
cluster_agent.run(cluster_task)
|
|
@@ -661,12 +738,14 @@ def validate_cluster_result(
|
|
|
661
738
|
"""验证聚类结果格式"""
|
|
662
739
|
if parse_error:
|
|
663
740
|
error_details = [f"JSON解析失败: {parse_error}"]
|
|
664
|
-
|
|
741
|
+
PrettyOutput.auto_print(f"[jarvis-sec] JSON解析失败: {parse_error}")
|
|
665
742
|
return False, error_details
|
|
666
743
|
else:
|
|
667
|
-
valid, error_details = validate_cluster_format(cluster_items)
|
|
744
|
+
valid, error_details = validate_cluster_format(cluster_items or [])
|
|
668
745
|
if not valid:
|
|
669
|
-
|
|
746
|
+
PrettyOutput.auto_print(
|
|
747
|
+
f"[jarvis-sec] 聚类结果格式无效({'; '.join(error_details)}),重试第 {attempt} 次(使用直接模型调用)",
|
|
748
|
+
)
|
|
670
749
|
return valid, error_details
|
|
671
750
|
|
|
672
751
|
|
|
@@ -679,12 +758,16 @@ def check_cluster_completeness(
|
|
|
679
758
|
classified_gids = extract_classified_gids(cluster_items)
|
|
680
759
|
missing_gids = input_gids - classified_gids
|
|
681
760
|
if not missing_gids:
|
|
682
|
-
|
|
761
|
+
PrettyOutput.auto_print(
|
|
762
|
+
f"[jarvis-sec] 聚类完整性校验通过,所有gid已分类(共尝试 {attempt} 次)",
|
|
763
|
+
)
|
|
683
764
|
return True, set()
|
|
684
765
|
else:
|
|
685
766
|
missing_gids_list = sorted(list(missing_gids))
|
|
686
767
|
missing_count = len(missing_gids)
|
|
687
|
-
|
|
768
|
+
PrettyOutput.auto_print(
|
|
769
|
+
f"[jarvis-sec] 聚类完整性校验失败:遗漏的gid: {missing_gids_list}({missing_count}个),重试第 {attempt} 次(使用直接模型调用)",
|
|
770
|
+
)
|
|
688
771
|
return False, missing_gids
|
|
689
772
|
|
|
690
773
|
|
|
@@ -704,13 +787,13 @@ def run_cluster_agent_with_retry(
|
|
|
704
787
|
_attempt = 0
|
|
705
788
|
use_direct_model = False
|
|
706
789
|
error_details: List[str] = []
|
|
707
|
-
missing_gids = set()
|
|
790
|
+
missing_gids: set[str] = set()
|
|
708
791
|
consecutive_failures = 0 # 连续失败次数
|
|
709
|
-
|
|
792
|
+
|
|
710
793
|
while True:
|
|
711
794
|
_attempt += 1
|
|
712
795
|
_cluster_summary["text"] = ""
|
|
713
|
-
|
|
796
|
+
|
|
714
797
|
if use_direct_model:
|
|
715
798
|
run_cluster_agent_direct_model(
|
|
716
799
|
cluster_agent,
|
|
@@ -724,26 +807,32 @@ def run_cluster_agent_with_retry(
|
|
|
724
807
|
else:
|
|
725
808
|
# 第一次使用 run(),让 Agent 完整运行(可能使用工具)
|
|
726
809
|
cluster_agent.run(cluster_task)
|
|
727
|
-
|
|
810
|
+
|
|
728
811
|
cluster_summary_text = _cluster_summary.get("text", "")
|
|
729
812
|
# 调试:如果解析失败,输出摘要文本的前500个字符用于调试
|
|
730
813
|
cluster_items, parse_error = parse_clusters_from_text(cluster_summary_text)
|
|
731
|
-
|
|
814
|
+
|
|
732
815
|
# 如果解析失败且是第一次尝试,输出调试信息
|
|
733
816
|
if parse_error and _attempt == 1:
|
|
734
817
|
preview = cluster_summary_text[:500] if cluster_summary_text else "(空)"
|
|
735
818
|
try:
|
|
736
|
-
|
|
819
|
+
PrettyOutput.auto_print(
|
|
820
|
+
f"[jarvis-sec] 调试:摘要文本预览(前500字符): {preview}",
|
|
821
|
+
)
|
|
737
822
|
except Exception:
|
|
738
823
|
pass
|
|
739
|
-
|
|
824
|
+
|
|
740
825
|
# 校验结构
|
|
741
|
-
valid, error_details = validate_cluster_result(
|
|
742
|
-
|
|
826
|
+
valid, error_details = validate_cluster_result(
|
|
827
|
+
cluster_items, parse_error, _attempt
|
|
828
|
+
)
|
|
829
|
+
|
|
743
830
|
# 完整性校验:检查所有输入的gid是否都被分类
|
|
744
831
|
missing_gids = set()
|
|
745
832
|
if valid and cluster_items:
|
|
746
|
-
is_complete, missing_gids = check_cluster_completeness(
|
|
833
|
+
is_complete, missing_gids = check_cluster_completeness(
|
|
834
|
+
cluster_items, input_gids, _attempt
|
|
835
|
+
)
|
|
747
836
|
if is_complete:
|
|
748
837
|
return cluster_items, None, False
|
|
749
838
|
else:
|
|
@@ -752,15 +841,17 @@ def run_cluster_agent_with_retry(
|
|
|
752
841
|
consecutive_failures += 1
|
|
753
842
|
else:
|
|
754
843
|
consecutive_failures += 1
|
|
755
|
-
|
|
844
|
+
|
|
756
845
|
# 如果连续失败5次,且提供了创建agent的函数,则返回需要重新创建agent的标志
|
|
757
846
|
if not valid and consecutive_failures >= 5 and create_agent_func is not None:
|
|
758
847
|
try:
|
|
759
|
-
|
|
848
|
+
PrettyOutput.auto_print(
|
|
849
|
+
f"[jarvis-sec] 连续失败 {consecutive_failures} 次,需要重新创建agent",
|
|
850
|
+
)
|
|
760
851
|
except Exception:
|
|
761
852
|
pass
|
|
762
853
|
return None, parse_error or "连续失败5次", True
|
|
763
|
-
|
|
854
|
+
|
|
764
855
|
if not valid:
|
|
765
856
|
use_direct_model = True
|
|
766
857
|
cluster_items = None
|
|
@@ -788,11 +879,11 @@ def process_cluster_results(
|
|
|
788
879
|
pass
|
|
789
880
|
except Exception:
|
|
790
881
|
gid_to_item = {}
|
|
791
|
-
|
|
882
|
+
|
|
792
883
|
_merged_count = 0
|
|
793
884
|
_invalid_count = 0
|
|
794
885
|
classified_gids_final = set()
|
|
795
|
-
|
|
886
|
+
|
|
796
887
|
for cl in cluster_items:
|
|
797
888
|
verification = str(cl.get("verification", "")).strip()
|
|
798
889
|
raw_gids = cl.get("gids", [])
|
|
@@ -807,61 +898,71 @@ def process_cluster_results(
|
|
|
807
898
|
classified_gids_final.add(xi)
|
|
808
899
|
except Exception:
|
|
809
900
|
pass
|
|
810
|
-
|
|
901
|
+
|
|
811
902
|
members: List[Dict] = []
|
|
812
903
|
for k in norm_keys:
|
|
813
|
-
|
|
814
|
-
if
|
|
815
|
-
|
|
816
|
-
members.append(
|
|
817
|
-
|
|
904
|
+
item = gid_to_item.get(k)
|
|
905
|
+
if item is not None:
|
|
906
|
+
item["verify"] = verification
|
|
907
|
+
members.append(item)
|
|
908
|
+
|
|
818
909
|
# 如果标记为无效,收集到复核列表
|
|
819
910
|
if is_invalid:
|
|
820
911
|
_invalid_count += 1
|
|
821
912
|
invalid_gids = [m.get("gid") for m in members]
|
|
822
913
|
invalid_reason = str(cl.get("invalid_reason", "")).strip()
|
|
823
914
|
try:
|
|
824
|
-
|
|
915
|
+
PrettyOutput.auto_print(
|
|
916
|
+
f"[jarvis-sec] 聚类阶段判定为无效(gids={invalid_gids}),将提交复核Agent验证",
|
|
917
|
+
)
|
|
825
918
|
except Exception:
|
|
826
919
|
pass
|
|
827
|
-
invalid_clusters_for_review.append(
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
920
|
+
invalid_clusters_for_review.append(
|
|
921
|
+
{
|
|
922
|
+
"file": file,
|
|
923
|
+
"batch_index": chunk_idx,
|
|
924
|
+
"gids": invalid_gids,
|
|
925
|
+
"verification": verification,
|
|
926
|
+
"invalid_reason": invalid_reason,
|
|
927
|
+
"members": members,
|
|
928
|
+
"count": len(members),
|
|
929
|
+
}
|
|
930
|
+
)
|
|
931
|
+
_progress_append(
|
|
932
|
+
{
|
|
933
|
+
"event": "cluster_invalid",
|
|
934
|
+
"file": file,
|
|
935
|
+
"batch_index": chunk_idx,
|
|
936
|
+
"gids": invalid_gids,
|
|
937
|
+
"verification": verification,
|
|
938
|
+
"count": len(members),
|
|
939
|
+
}
|
|
940
|
+
)
|
|
941
|
+
cluster_records.append(
|
|
942
|
+
{
|
|
943
|
+
"file": file,
|
|
944
|
+
"verification": verification,
|
|
945
|
+
"gids": invalid_gids,
|
|
946
|
+
"count": len(members),
|
|
947
|
+
"batch_index": chunk_idx,
|
|
948
|
+
"is_invalid": True,
|
|
949
|
+
"invalid_reason": invalid_reason,
|
|
950
|
+
}
|
|
951
|
+
)
|
|
853
952
|
elif members:
|
|
854
953
|
_merged_count += 1
|
|
855
954
|
cluster_batches.append(members)
|
|
856
|
-
cluster_records.append(
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
955
|
+
cluster_records.append(
|
|
956
|
+
{
|
|
957
|
+
"file": file,
|
|
958
|
+
"verification": verification,
|
|
959
|
+
"gids": [m.get("gid") for m in members],
|
|
960
|
+
"count": len(members),
|
|
961
|
+
"batch_index": chunk_idx,
|
|
962
|
+
"is_invalid": False,
|
|
963
|
+
}
|
|
964
|
+
)
|
|
965
|
+
|
|
865
966
|
return _merged_count, _invalid_count
|
|
866
967
|
|
|
867
968
|
|
|
@@ -881,14 +982,16 @@ def supplement_missing_gids(
|
|
|
881
982
|
default_verification = f"验证候选 {missing_gid} 的安全风险"
|
|
882
983
|
missing_item["verify"] = default_verification
|
|
883
984
|
cluster_batches.append([missing_item])
|
|
884
|
-
cluster_records.append(
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
985
|
+
cluster_records.append(
|
|
986
|
+
{
|
|
987
|
+
"file": file,
|
|
988
|
+
"verification": default_verification,
|
|
989
|
+
"gids": [missing_gid],
|
|
990
|
+
"count": 1,
|
|
991
|
+
"batch_index": chunk_idx,
|
|
992
|
+
"note": "完整性校验补充的遗漏gid",
|
|
993
|
+
}
|
|
994
|
+
)
|
|
892
995
|
supplemented_count += 1
|
|
893
996
|
return supplemented_count
|
|
894
997
|
|
|
@@ -958,35 +1061,39 @@ def process_cluster_chunk(
|
|
|
958
1061
|
"""处理单个聚类批次"""
|
|
959
1062
|
if not chunk:
|
|
960
1063
|
return
|
|
961
|
-
|
|
1064
|
+
|
|
962
1065
|
pending_in_file_with_ids = list(chunk)
|
|
963
|
-
|
|
1066
|
+
|
|
964
1067
|
# 记录聚类批次开始
|
|
965
|
-
_progress_append(
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
1068
|
+
_progress_append(
|
|
1069
|
+
{
|
|
1070
|
+
"event": "cluster_status",
|
|
1071
|
+
"status": "running",
|
|
1072
|
+
"file": file,
|
|
1073
|
+
"batch_index": chunk_idx,
|
|
1074
|
+
"total_in_batch": len(pending_in_file_with_ids),
|
|
1075
|
+
}
|
|
1076
|
+
)
|
|
1077
|
+
|
|
973
1078
|
# 创建聚类Agent
|
|
974
|
-
cluster_agent = create_cluster_agent(
|
|
975
|
-
|
|
1079
|
+
cluster_agent = create_cluster_agent(
|
|
1080
|
+
file, chunk_idx, llm_group, force_save_memory=force_save_memory
|
|
1081
|
+
)
|
|
1082
|
+
|
|
976
1083
|
# 构建任务上下文
|
|
977
1084
|
cluster_task = build_cluster_task(pending_in_file_with_ids, entry_path, file, langs)
|
|
978
|
-
|
|
1085
|
+
|
|
979
1086
|
# 提取输入gid
|
|
980
1087
|
input_gids = extract_input_gids(pending_in_file_with_ids)
|
|
981
|
-
|
|
1088
|
+
|
|
982
1089
|
# 运行聚类Agent(支持重新创建agent,不限次数)
|
|
983
1090
|
cluster_summary_prompt = get_cluster_summary_prompt()
|
|
984
1091
|
recreate_count = 0
|
|
985
|
-
|
|
1092
|
+
|
|
986
1093
|
while True:
|
|
987
1094
|
# 订阅摘要事件(每次重新创建agent后需要重新订阅)
|
|
988
1095
|
cluster_summary = subscribe_summary_event(cluster_agent)
|
|
989
|
-
|
|
1096
|
+
|
|
990
1097
|
cluster_items, parse_error, need_recreate = run_cluster_agent_with_retry(
|
|
991
1098
|
cluster_agent,
|
|
992
1099
|
cluster_task,
|
|
@@ -994,28 +1101,34 @@ def process_cluster_chunk(
|
|
|
994
1101
|
input_gids,
|
|
995
1102
|
file,
|
|
996
1103
|
cluster_summary,
|
|
997
|
-
create_agent_func=lambda: create_cluster_agent(
|
|
1104
|
+
create_agent_func=lambda: create_cluster_agent(
|
|
1105
|
+
file, chunk_idx, llm_group, force_save_memory=force_save_memory
|
|
1106
|
+
),
|
|
998
1107
|
)
|
|
999
|
-
|
|
1108
|
+
|
|
1000
1109
|
# 如果不需要重新创建agent,退出循环
|
|
1001
1110
|
if not need_recreate:
|
|
1002
1111
|
break
|
|
1003
|
-
|
|
1112
|
+
|
|
1004
1113
|
# 需要重新创建agent(不限次数)
|
|
1005
1114
|
recreate_count += 1
|
|
1006
1115
|
try:
|
|
1007
|
-
|
|
1116
|
+
PrettyOutput.auto_print(
|
|
1117
|
+
f"[jarvis-sec] 重新创建聚类Agent(第 {recreate_count} 次)",
|
|
1118
|
+
)
|
|
1008
1119
|
except Exception:
|
|
1009
1120
|
pass
|
|
1010
|
-
cluster_agent = create_cluster_agent(
|
|
1011
|
-
|
|
1121
|
+
cluster_agent = create_cluster_agent(
|
|
1122
|
+
file, chunk_idx, llm_group, force_save_memory=force_save_memory
|
|
1123
|
+
)
|
|
1124
|
+
|
|
1012
1125
|
# 处理聚类结果
|
|
1013
1126
|
_merged_count = 0
|
|
1014
1127
|
_invalid_count = 0
|
|
1015
|
-
|
|
1128
|
+
|
|
1016
1129
|
if isinstance(cluster_items, list) and cluster_items:
|
|
1017
1130
|
gid_to_item = build_gid_to_item_mapping(pending_in_file_with_ids)
|
|
1018
|
-
|
|
1131
|
+
|
|
1019
1132
|
_merged_count, _invalid_count = process_cluster_results(
|
|
1020
1133
|
cluster_items,
|
|
1021
1134
|
pending_in_file_with_ids,
|
|
@@ -1026,11 +1139,13 @@ def process_cluster_chunk(
|
|
|
1026
1139
|
invalid_clusters_for_review,
|
|
1027
1140
|
_progress_append,
|
|
1028
1141
|
)
|
|
1029
|
-
|
|
1142
|
+
|
|
1030
1143
|
classified_gids_final = extract_classified_gids(cluster_items)
|
|
1031
1144
|
missing_gids_final = input_gids - classified_gids_final
|
|
1032
1145
|
if missing_gids_final:
|
|
1033
|
-
|
|
1146
|
+
PrettyOutput.auto_print(
|
|
1147
|
+
f"[jarvis-sec] 警告:仍有遗漏的gid {sorted(list(missing_gids_final))},将为每个遗漏的gid创建单独聚类",
|
|
1148
|
+
)
|
|
1034
1149
|
supplemented_count = supplement_missing_gids(
|
|
1035
1150
|
missing_gids_final,
|
|
1036
1151
|
gid_to_item,
|
|
@@ -1043,9 +1158,11 @@ def process_cluster_chunk(
|
|
|
1043
1158
|
else:
|
|
1044
1159
|
# 聚类结果为空或None:为所有输入的gid创建单独聚类(保守策略)
|
|
1045
1160
|
if pending_in_file_with_ids:
|
|
1046
|
-
|
|
1161
|
+
PrettyOutput.auto_print(
|
|
1162
|
+
f"[jarvis-sec] 警告:聚类结果为空或None(文件={file},批次={chunk_idx}),为所有gid创建单独聚类",
|
|
1163
|
+
)
|
|
1047
1164
|
gid_to_item_fallback = build_gid_to_item_mapping(pending_in_file_with_ids)
|
|
1048
|
-
|
|
1165
|
+
|
|
1049
1166
|
_merged_count = supplement_missing_gids(
|
|
1050
1167
|
input_gids,
|
|
1051
1168
|
gid_to_item_fallback,
|
|
@@ -1058,25 +1175,30 @@ def process_cluster_chunk(
|
|
|
1058
1175
|
else:
|
|
1059
1176
|
_merged_count = 0
|
|
1060
1177
|
_invalid_count = 0
|
|
1061
|
-
|
|
1178
|
+
|
|
1062
1179
|
# 标记聚类批次完成
|
|
1063
|
-
_progress_append(
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1180
|
+
_progress_append(
|
|
1181
|
+
{
|
|
1182
|
+
"event": "cluster_status",
|
|
1183
|
+
"status": "done",
|
|
1184
|
+
"file": file,
|
|
1185
|
+
"batch_index": chunk_idx,
|
|
1186
|
+
"clusters_count": _merged_count,
|
|
1187
|
+
"invalid_clusters_count": _invalid_count,
|
|
1188
|
+
}
|
|
1189
|
+
)
|
|
1071
1190
|
if _invalid_count > 0:
|
|
1072
1191
|
try:
|
|
1073
|
-
|
|
1192
|
+
PrettyOutput.auto_print(
|
|
1193
|
+
f"[jarvis-sec] 聚类批次完成: 有效聚类={_merged_count},无效聚类={_invalid_count}(已跳过)",
|
|
1194
|
+
)
|
|
1074
1195
|
except Exception:
|
|
1075
1196
|
pass
|
|
1076
|
-
|
|
1197
|
+
|
|
1077
1198
|
# 写入当前批次的聚类结果
|
|
1078
1199
|
current_batch_records = [
|
|
1079
|
-
rec
|
|
1200
|
+
rec
|
|
1201
|
+
for rec in cluster_records
|
|
1080
1202
|
if rec.get("file") == file and rec.get("batch_index") == chunk_idx
|
|
1081
1203
|
]
|
|
1082
1204
|
if current_batch_records:
|
|
@@ -1116,7 +1238,7 @@ def process_file_clustering(
|
|
|
1116
1238
|
pending_in_file = filter_pending_items(items, clustered_gids)
|
|
1117
1239
|
if not pending_in_file:
|
|
1118
1240
|
return
|
|
1119
|
-
|
|
1241
|
+
|
|
1120
1242
|
# 优化:如果文件只有一个告警,跳过聚类,直接写入
|
|
1121
1243
|
if len(pending_in_file) == 1:
|
|
1122
1244
|
single_item = pending_in_file[0]
|
|
@@ -1131,11 +1253,15 @@ def process_file_clustering(
|
|
|
1131
1253
|
_write_cluster_batch_snapshot,
|
|
1132
1254
|
)
|
|
1133
1255
|
return
|
|
1134
|
-
|
|
1256
|
+
|
|
1135
1257
|
# 将该文件的告警按 cluster_limit 分批
|
|
1136
|
-
_limit =
|
|
1137
|
-
|
|
1138
|
-
|
|
1258
|
+
_limit = (
|
|
1259
|
+
cluster_limit if isinstance(cluster_limit, int) and cluster_limit > 0 else 50
|
|
1260
|
+
)
|
|
1261
|
+
_chunks: List[List[Dict]] = [
|
|
1262
|
+
pending_in_file[i : i + _limit] for i in range(0, len(pending_in_file), _limit)
|
|
1263
|
+
]
|
|
1264
|
+
|
|
1139
1265
|
# 处理每个批次
|
|
1140
1266
|
for _chunk_idx, _chunk in enumerate(_chunks, start=1):
|
|
1141
1267
|
process_cluster_chunk(
|
|
@@ -1163,30 +1289,42 @@ def initialize_clustering_context(
|
|
|
1163
1289
|
compact_candidates: List[Dict],
|
|
1164
1290
|
sec_dir: Path,
|
|
1165
1291
|
_progress_append,
|
|
1166
|
-
) -> tuple[
|
|
1292
|
+
) -> tuple[
|
|
1293
|
+
Dict[str, List[Dict]], Dict, tuple, List[List[Dict]], List[Dict], List[Dict], set
|
|
1294
|
+
]:
|
|
1167
1295
|
"""初始化聚类上下文,返回(文件分组, 已有聚类, 快照写入函数, 聚类批次, 聚类记录, 无效聚类, 已聚类gid)"""
|
|
1168
1296
|
# 按文件分组构建待聚类集合
|
|
1169
1297
|
_file_groups = group_candidates_by_file(compact_candidates)
|
|
1170
|
-
|
|
1298
|
+
|
|
1171
1299
|
cluster_batches: List[List[Dict]] = []
|
|
1172
1300
|
cluster_records: List[Dict] = []
|
|
1173
1301
|
invalid_clusters_for_review: List[Dict] = []
|
|
1174
|
-
|
|
1302
|
+
|
|
1175
1303
|
# 读取已有聚类报告以支持断点
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1304
|
+
(
|
|
1305
|
+
_existing_clusters,
|
|
1306
|
+
_completed_cluster_batches,
|
|
1307
|
+
_reviewed_invalid_gids,
|
|
1308
|
+
) = load_existing_clusters(sec_dir)
|
|
1309
|
+
|
|
1180
1310
|
# 创建快照写入函数
|
|
1181
|
-
|
|
1311
|
+
(
|
|
1312
|
+
_write_cluster_batch_snapshot,
|
|
1313
|
+
_write_cluster_report_snapshot,
|
|
1314
|
+
) = create_cluster_snapshot_writer(
|
|
1182
1315
|
sec_dir, cluster_records, compact_candidates, _progress_append
|
|
1183
1316
|
)
|
|
1184
|
-
|
|
1317
|
+
|
|
1185
1318
|
# 从断点恢复聚类结果
|
|
1186
|
-
|
|
1319
|
+
(
|
|
1320
|
+
cluster_batches,
|
|
1321
|
+
cluster_records,
|
|
1322
|
+
invalid_clusters_for_review,
|
|
1323
|
+
clustered_gids,
|
|
1324
|
+
) = restore_clusters_from_checkpoint(
|
|
1187
1325
|
_existing_clusters, _file_groups, _reviewed_invalid_gids
|
|
1188
1326
|
)
|
|
1189
|
-
|
|
1327
|
+
|
|
1190
1328
|
return (
|
|
1191
1329
|
_file_groups,
|
|
1192
1330
|
_existing_clusters,
|
|
@@ -1206,12 +1344,16 @@ def check_unclustered_gids(
|
|
|
1206
1344
|
unclustered_gids = all_candidate_gids - clustered_gids
|
|
1207
1345
|
if unclustered_gids:
|
|
1208
1346
|
try:
|
|
1209
|
-
|
|
1347
|
+
PrettyOutput.auto_print(
|
|
1348
|
+
f"[jarvis-sec] 发现 {len(unclustered_gids)} 个未聚类的 gid,将进行聚类",
|
|
1349
|
+
)
|
|
1210
1350
|
except Exception:
|
|
1211
1351
|
pass
|
|
1212
1352
|
else:
|
|
1213
1353
|
try:
|
|
1214
|
-
|
|
1354
|
+
PrettyOutput.auto_print(
|
|
1355
|
+
f"[jarvis-sec] 所有 {len(all_candidate_gids)} 个候选已聚类,跳过聚类阶段",
|
|
1356
|
+
)
|
|
1215
1357
|
except Exception:
|
|
1216
1358
|
pass
|
|
1217
1359
|
return unclustered_gids
|
|
@@ -1239,16 +1381,18 @@ def execute_clustering_for_files(
|
|
|
1239
1381
|
status_mgr.update_clustering(
|
|
1240
1382
|
current_file=0,
|
|
1241
1383
|
total_files=total_files_to_cluster,
|
|
1242
|
-
message="开始聚类分析..."
|
|
1384
|
+
message="开始聚类分析...",
|
|
1243
1385
|
)
|
|
1244
1386
|
for _file_idx, (_file, _items) in enumerate(file_groups.items(), start=1):
|
|
1245
|
-
|
|
1387
|
+
PrettyOutput.auto_print(
|
|
1388
|
+
f"\n[jarvis-sec] 聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}",
|
|
1389
|
+
)
|
|
1246
1390
|
# 更新当前文件进度
|
|
1247
1391
|
status_mgr.update_clustering(
|
|
1248
1392
|
current_file=_file_idx,
|
|
1249
1393
|
total_files=total_files_to_cluster,
|
|
1250
1394
|
file_name=_file,
|
|
1251
|
-
message=f"正在聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}"
|
|
1395
|
+
message=f"正在聚类文件 {_file_idx}/{total_files_to_cluster}: {_file}",
|
|
1252
1396
|
)
|
|
1253
1397
|
# 使用子函数处理文件聚类
|
|
1254
1398
|
process_file_clustering(
|
|
@@ -1277,13 +1421,15 @@ def record_clustering_completion(
|
|
|
1277
1421
|
"""记录聚类阶段完成"""
|
|
1278
1422
|
try:
|
|
1279
1423
|
_cluster_path = sec_dir / "cluster_report.jsonl"
|
|
1280
|
-
_progress_append(
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1424
|
+
_progress_append(
|
|
1425
|
+
{
|
|
1426
|
+
"event": "cluster_report_written",
|
|
1427
|
+
"path": str(_cluster_path),
|
|
1428
|
+
"clusters": len(cluster_records),
|
|
1429
|
+
"total_candidates": len(compact_candidates),
|
|
1430
|
+
"note": "每个批次已增量保存,无需重写整个文件",
|
|
1431
|
+
}
|
|
1432
|
+
)
|
|
1287
1433
|
except Exception:
|
|
1288
1434
|
pass
|
|
1289
1435
|
|
|
@@ -1294,7 +1440,7 @@ def fallback_to_file_based_batches(
|
|
|
1294
1440
|
) -> List[List[Dict]]:
|
|
1295
1441
|
"""若聚类失败或空,则回退为按文件一次处理"""
|
|
1296
1442
|
fallback_batches: List[List[Dict]] = []
|
|
1297
|
-
|
|
1443
|
+
|
|
1298
1444
|
# 收集所有未聚类的 gid(从所有候选 gid 中排除已聚类的)
|
|
1299
1445
|
all_gids_in_file_groups = collect_candidate_gids(file_groups)
|
|
1300
1446
|
gid_to_item_fallback: Dict[int, Dict] = {}
|
|
@@ -1306,7 +1452,7 @@ def fallback_to_file_based_batches(
|
|
|
1306
1452
|
gid_to_item_fallback[_gid] = c
|
|
1307
1453
|
except Exception:
|
|
1308
1454
|
pass
|
|
1309
|
-
|
|
1455
|
+
|
|
1310
1456
|
# 如果还有未聚类的 gid,按文件分组创建批次
|
|
1311
1457
|
if all_gids_in_file_groups:
|
|
1312
1458
|
# 收集已聚类的 gid(从 cluster_report.jsonl)
|
|
@@ -1323,23 +1469,24 @@ def fallback_to_file_based_batches(
|
|
|
1323
1469
|
clustered_gids_fallback.add(_gid_int)
|
|
1324
1470
|
except Exception:
|
|
1325
1471
|
pass
|
|
1326
|
-
|
|
1472
|
+
|
|
1327
1473
|
unclustered_gids_fallback = all_gids_in_file_groups - clustered_gids_fallback
|
|
1328
1474
|
if unclustered_gids_fallback:
|
|
1329
1475
|
# 按文件分组未聚类的 gid
|
|
1330
1476
|
from collections import defaultdict
|
|
1477
|
+
|
|
1331
1478
|
unclustered_by_file: Dict[str, List[Dict]] = defaultdict(list)
|
|
1332
1479
|
for _gid in unclustered_gids_fallback:
|
|
1333
1480
|
item = gid_to_item_fallback.get(_gid)
|
|
1334
1481
|
if item:
|
|
1335
1482
|
file_key = str(item.get("file") or "")
|
|
1336
1483
|
unclustered_by_file[file_key].append(item)
|
|
1337
|
-
|
|
1484
|
+
|
|
1338
1485
|
# 为每个文件创建批次
|
|
1339
1486
|
for _file, _items in unclustered_by_file.items():
|
|
1340
1487
|
if _items:
|
|
1341
1488
|
fallback_batches.append(_items)
|
|
1342
|
-
|
|
1489
|
+
|
|
1343
1490
|
return fallback_batches
|
|
1344
1491
|
|
|
1345
1492
|
|
|
@@ -1365,13 +1512,15 @@ def process_clustering_phase(
|
|
|
1365
1512
|
invalid_clusters_for_review,
|
|
1366
1513
|
clustered_gids,
|
|
1367
1514
|
) = initialize_clustering_context(compact_candidates, sec_dir, _progress_append)
|
|
1368
|
-
|
|
1515
|
+
|
|
1369
1516
|
# 收集所有候选的 gid(用于检查未聚类的 gid)
|
|
1370
1517
|
all_candidate_gids_in_clustering = collect_candidate_gids(_file_groups)
|
|
1371
|
-
|
|
1518
|
+
|
|
1372
1519
|
# 检查是否有未聚类的 gid
|
|
1373
|
-
unclustered_gids = check_unclustered_gids(
|
|
1374
|
-
|
|
1520
|
+
unclustered_gids = check_unclustered_gids(
|
|
1521
|
+
all_candidate_gids_in_clustering, clustered_gids
|
|
1522
|
+
)
|
|
1523
|
+
|
|
1375
1524
|
# 如果有未聚类的 gid,继续执行聚类
|
|
1376
1525
|
if unclustered_gids:
|
|
1377
1526
|
execute_clustering_for_files(
|
|
@@ -1389,12 +1538,15 @@ def process_clustering_phase(
|
|
|
1389
1538
|
_write_cluster_batch_snapshot,
|
|
1390
1539
|
force_save_memory=force_save_memory,
|
|
1391
1540
|
)
|
|
1392
|
-
|
|
1541
|
+
|
|
1393
1542
|
# 记录聚类阶段完成
|
|
1394
|
-
record_clustering_completion(
|
|
1395
|
-
|
|
1543
|
+
record_clustering_completion(
|
|
1544
|
+
sec_dir, cluster_records, compact_candidates, _progress_append
|
|
1545
|
+
)
|
|
1546
|
+
|
|
1396
1547
|
# 复核Agent:验证所有标记为无效的聚类(需要从review模块导入)
|
|
1397
1548
|
from jarvis.jarvis_sec.review import process_review_phase
|
|
1549
|
+
|
|
1398
1550
|
cluster_batches = process_review_phase(
|
|
1399
1551
|
invalid_clusters_for_review,
|
|
1400
1552
|
entry_path,
|
|
@@ -1405,35 +1557,41 @@ def process_clustering_phase(
|
|
|
1405
1557
|
cluster_batches,
|
|
1406
1558
|
sec_dir,
|
|
1407
1559
|
)
|
|
1408
|
-
|
|
1560
|
+
|
|
1409
1561
|
# 若聚类失败或空,则回退为"按文件一次处理"
|
|
1410
1562
|
if not cluster_batches:
|
|
1411
|
-
fallback_batches = fallback_to_file_based_batches(
|
|
1563
|
+
fallback_batches = fallback_to_file_based_batches(
|
|
1564
|
+
_file_groups, _existing_clusters
|
|
1565
|
+
)
|
|
1412
1566
|
cluster_batches.extend(fallback_batches)
|
|
1413
|
-
|
|
1567
|
+
|
|
1414
1568
|
# 完整性检查:确保所有候选的 gid 都已被聚类
|
|
1415
1569
|
# 使用新的文件管理器进行校验
|
|
1416
1570
|
is_complete, missing_gids_final = validate_clustering_completeness(sec_dir)
|
|
1417
|
-
|
|
1571
|
+
|
|
1418
1572
|
if missing_gids_final:
|
|
1419
1573
|
# 如果还有遗漏的gid,说明恢复逻辑有问题,需要重新聚类
|
|
1420
1574
|
try:
|
|
1421
1575
|
missing_count = len(missing_gids_final)
|
|
1422
1576
|
if missing_count <= 20:
|
|
1423
|
-
|
|
1577
|
+
PrettyOutput.auto_print(
|
|
1578
|
+
f"[jarvis-sec] 警告:发现 {missing_count} 个遗漏的gid(恢复逻辑可能有问题): {sorted(list(missing_gids_final))}",
|
|
1579
|
+
)
|
|
1424
1580
|
else:
|
|
1425
1581
|
missing_list = sorted(list(missing_gids_final))
|
|
1426
1582
|
display_list = missing_list[:10] + ["..."] + missing_list[-10:]
|
|
1427
|
-
|
|
1428
|
-
|
|
1583
|
+
PrettyOutput.auto_print(
|
|
1584
|
+
f"[jarvis-sec] 警告:发现 {missing_count} 个遗漏的gid(恢复逻辑可能有问题): {display_list}",
|
|
1585
|
+
)
|
|
1586
|
+
|
|
1429
1587
|
except Exception:
|
|
1430
1588
|
pass
|
|
1431
|
-
|
|
1589
|
+
|
|
1432
1590
|
# 清理之前创建的单独聚类(避免分析工作量激增)
|
|
1433
1591
|
cluster_batches = filter_single_gid_clusters(
|
|
1434
1592
|
cluster_batches,
|
|
1435
1593
|
sec_dir,
|
|
1436
1594
|
_progress_append,
|
|
1437
1595
|
)
|
|
1438
|
-
|
|
1596
|
+
|
|
1439
1597
|
return cluster_batches, invalid_clusters_for_review
|