codeboarding 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/__init__.py +0 -0
- agents/abstraction_agent.py +150 -0
- agents/agent.py +467 -0
- agents/agent_responses.py +363 -0
- agents/cluster_methods_mixin.py +281 -0
- agents/constants.py +13 -0
- agents/dependency_discovery.py +159 -0
- agents/details_agent.py +174 -0
- agents/llm_config.py +309 -0
- agents/meta_agent.py +105 -0
- agents/planner_agent.py +105 -0
- agents/prompts/__init__.py +85 -0
- agents/prompts/abstract_prompt_factory.py +63 -0
- agents/prompts/claude_prompts.py +381 -0
- agents/prompts/deepseek_prompts.py +389 -0
- agents/prompts/gemini_flash_prompts.py +362 -0
- agents/prompts/glm_prompts.py +407 -0
- agents/prompts/gpt_prompts.py +470 -0
- agents/prompts/kimi_prompts.py +400 -0
- agents/prompts/prompt_factory.py +179 -0
- agents/tools/__init__.py +8 -0
- agents/tools/base.py +96 -0
- agents/tools/get_external_deps.py +47 -0
- agents/tools/get_method_invocations.py +47 -0
- agents/tools/read_cfg.py +60 -0
- agents/tools/read_docs.py +132 -0
- agents/tools/read_file.py +90 -0
- agents/tools/read_file_structure.py +156 -0
- agents/tools/read_git_diff.py +131 -0
- agents/tools/read_packages.py +60 -0
- agents/tools/read_source.py +105 -0
- agents/tools/read_structure.py +49 -0
- agents/tools/toolkit.py +119 -0
- agents/validation.py +383 -0
- caching/__init__.py +4 -0
- caching/cache.py +29 -0
- caching/meta_cache.py +227 -0
- codeboarding-0.9.0.dist-info/METADATA +223 -0
- codeboarding-0.9.0.dist-info/RECORD +126 -0
- codeboarding-0.9.0.dist-info/WHEEL +5 -0
- codeboarding-0.9.0.dist-info/entry_points.txt +3 -0
- codeboarding-0.9.0.dist-info/licenses/LICENSE +21 -0
- codeboarding-0.9.0.dist-info/top_level.txt +18 -0
- core/__init__.py +101 -0
- core/plugin_loader.py +46 -0
- core/protocols.py +27 -0
- core/registry.py +46 -0
- diagram_analysis/__init__.py +4 -0
- diagram_analysis/analysis_json.py +346 -0
- diagram_analysis/diagram_generator.py +486 -0
- diagram_analysis/file_coverage.py +212 -0
- diagram_analysis/incremental/__init__.py +63 -0
- diagram_analysis/incremental/component_checker.py +236 -0
- diagram_analysis/incremental/file_manager.py +217 -0
- diagram_analysis/incremental/impact_analyzer.py +238 -0
- diagram_analysis/incremental/io_utils.py +281 -0
- diagram_analysis/incremental/models.py +72 -0
- diagram_analysis/incremental/path_patching.py +164 -0
- diagram_analysis/incremental/reexpansion.py +166 -0
- diagram_analysis/incremental/scoped_analysis.py +227 -0
- diagram_analysis/incremental/updater.py +464 -0
- diagram_analysis/incremental/validation.py +48 -0
- diagram_analysis/manifest.py +152 -0
- diagram_analysis/version.py +6 -0
- duckdb_crud.py +125 -0
- github_action.py +172 -0
- health/__init__.py +3 -0
- health/checks/__init__.py +11 -0
- health/checks/circular_deps.py +48 -0
- health/checks/cohesion.py +93 -0
- health/checks/coupling.py +140 -0
- health/checks/function_size.py +85 -0
- health/checks/god_class.py +167 -0
- health/checks/inheritance.py +104 -0
- health/checks/instability.py +77 -0
- health/checks/unused_code_diagnostics.py +338 -0
- health/config.py +172 -0
- health/constants.py +19 -0
- health/models.py +186 -0
- health/runner.py +236 -0
- install.py +518 -0
- logging_config.py +105 -0
- main.py +529 -0
- monitoring/__init__.py +12 -0
- monitoring/callbacks.py +163 -0
- monitoring/context.py +158 -0
- monitoring/mixin.py +16 -0
- monitoring/paths.py +47 -0
- monitoring/stats.py +50 -0
- monitoring/writers.py +172 -0
- output_generators/__init__.py +0 -0
- output_generators/html.py +163 -0
- output_generators/html_template.py +382 -0
- output_generators/markdown.py +140 -0
- output_generators/mdx.py +171 -0
- output_generators/sphinx.py +175 -0
- repo_utils/__init__.py +277 -0
- repo_utils/change_detector.py +289 -0
- repo_utils/errors.py +6 -0
- repo_utils/git_diff.py +74 -0
- repo_utils/ignore.py +341 -0
- static_analyzer/__init__.py +335 -0
- static_analyzer/analysis_cache.py +699 -0
- static_analyzer/analysis_result.py +269 -0
- static_analyzer/cluster_change_analyzer.py +391 -0
- static_analyzer/cluster_helpers.py +79 -0
- static_analyzer/constants.py +166 -0
- static_analyzer/git_diff_analyzer.py +224 -0
- static_analyzer/graph.py +746 -0
- static_analyzer/incremental_orchestrator.py +671 -0
- static_analyzer/java_config_scanner.py +232 -0
- static_analyzer/java_utils.py +227 -0
- static_analyzer/lsp_client/__init__.py +12 -0
- static_analyzer/lsp_client/client.py +1642 -0
- static_analyzer/lsp_client/diagnostics.py +62 -0
- static_analyzer/lsp_client/java_client.py +517 -0
- static_analyzer/lsp_client/language_settings.py +97 -0
- static_analyzer/lsp_client/typescript_client.py +235 -0
- static_analyzer/programming_language.py +152 -0
- static_analyzer/reference_resolve_mixin.py +166 -0
- static_analyzer/scanner.py +95 -0
- static_analyzer/typescript_config_scanner.py +54 -0
- tool_registry.py +433 -0
- user_config.py +134 -0
- utils.py +56 -0
- vscode_constants.py +124 -0
agents/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from langchain_core.prompts import PromptTemplate
|
|
5
|
+
from langchain_core.language_models import BaseChatModel
|
|
6
|
+
|
|
7
|
+
from agents.agent import CodeBoardingAgent
|
|
8
|
+
from agents.agent_responses import (
|
|
9
|
+
AnalysisInsights,
|
|
10
|
+
ClusterAnalysis,
|
|
11
|
+
MetaAnalysisInsights,
|
|
12
|
+
assign_component_ids,
|
|
13
|
+
)
|
|
14
|
+
from agents.prompts import (
|
|
15
|
+
get_system_message,
|
|
16
|
+
get_cluster_grouping_message,
|
|
17
|
+
get_final_analysis_message,
|
|
18
|
+
)
|
|
19
|
+
from agents.cluster_methods_mixin import ClusterMethodsMixin
|
|
20
|
+
from agents.validation import (
|
|
21
|
+
ValidationContext,
|
|
22
|
+
validate_cluster_coverage,
|
|
23
|
+
validate_component_relationships,
|
|
24
|
+
validate_key_entities,
|
|
25
|
+
validate_cluster_ids_populated,
|
|
26
|
+
validate_relation_component_names,
|
|
27
|
+
)
|
|
28
|
+
from monitoring import trace
|
|
29
|
+
from static_analyzer.analysis_result import StaticAnalysisResults
|
|
30
|
+
from static_analyzer.graph import ClusterResult
|
|
31
|
+
from static_analyzer.cluster_helpers import build_all_cluster_results, get_all_cluster_ids
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class AbstractionAgent(ClusterMethodsMixin, CodeBoardingAgent):
    """Agent that abstracts pre-computed CFG clusters into high-level components.

    Pipeline (see :meth:`run`):
      1. Group related clusters into logical components (``step_clusters_grouping``).
      2. Generate the final architectural analysis (``step_final_analysis``).
      3. Post-process: sanitize cluster IDs, assign files, fix source references,
         deduplicate key entities / file assignments, and assign component IDs.

    Cluster/file helper methods (``_build_cluster_string``, ``classify_files``,
    ``_sanitize_component_cluster_ids`` etc.) come from ``ClusterMethodsMixin``
    and the ``CodeBoardingAgent`` base class.
    """

    def __init__(
        self,
        repo_dir: Path,
        static_analysis: StaticAnalysisResults,
        project_name: str,
        meta_context: MetaAnalysisInsights,
        agent_llm: BaseChatModel,
        parsing_llm: BaseChatModel,
    ):
        """Set up the agent with the abstraction system prompt and its templates.

        Args:
            repo_dir: Root of the repository being analyzed.
            static_analysis: Pre-computed static-analysis results (CFGs, clusters).
            project_name: Human-readable project name, interpolated into prompts.
            meta_context: Project-level insights from the meta agent; may be falsy,
                in which case placeholder text is used in prompts.
            agent_llm: Chat model used for the tool-using agent loop.
            parsing_llm: Chat model used to parse/extract structured output.
        """
        super().__init__(repo_dir, static_analysis, get_system_message(), agent_llm, parsing_llm)

        self.project_name = project_name
        self.meta_context = meta_context

        # Prompt templates keyed by pipeline step; formatted per-call with
        # project-specific values in the step_* methods below.
        self.prompts = {
            "group_clusters": PromptTemplate(
                template=get_cluster_grouping_message(),
                input_variables=["project_name", "cfg_clusters", "meta_context", "project_type"],
            ),
            "final_analysis": PromptTemplate(
                template=get_final_analysis_message(),
                input_variables=["project_name", "cluster_analysis", "meta_context", "project_type"],
            ),
        }

    @trace
    def step_clusters_grouping(self, cluster_results: dict[str, ClusterResult]) -> ClusterAnalysis:
        """Group CFG clusters into logical components via the LLM.

        Args:
            cluster_results: Mapping of language -> ClusterResult to group.

        Returns:
            ClusterAnalysis validated for full cluster coverage
            (``validate_cluster_coverage`` runs inside ``_validation_invoke``).
        """
        logger.info(f"[AbstractionAgent] Grouping CFG clusters for: {self.project_name}")

        meta_context_str = self.meta_context.llm_str() if self.meta_context else "No project context available."
        project_type = self.meta_context.project_type if self.meta_context else "unknown"

        programming_langs = self.static_analysis.get_languages()

        # Build cluster string using the pre-computed cluster results
        cluster_str = self._build_cluster_string(programming_langs, cluster_results)

        prompt = self.prompts["group_clusters"].format(
            project_name=self.project_name,
            cfg_clusters=cluster_str,
            meta_context=meta_context_str,
            project_type=project_type,
        )

        cluster_analysis = self._validation_invoke(
            prompt,
            ClusterAnalysis,
            validators=[validate_cluster_coverage],
            context=ValidationContext(
                cluster_results=cluster_results,
                expected_cluster_ids=get_all_cluster_ids(cluster_results),
            ),
        )
        return cluster_analysis

    @trace
    def step_final_analysis(
        self, cluster_analysis: ClusterAnalysis, cluster_results: dict[str, ClusterResult]
    ) -> AnalysisInsights:
        """Produce the final component-level analysis from grouped clusters.

        Args:
            cluster_analysis: Output of :meth:`step_clusters_grouping`; may be
                falsy, in which case placeholder text is used in the prompt.
            cluster_results: Mapping of language -> ClusterResult, used by the
                validators to check component relationships against CFG edges.

        Returns:
            AnalysisInsights validated for relation names, relationships,
            key entities, and populated cluster IDs.
        """
        logger.info(f"[AbstractionAgent] Generating final analysis for: {self.project_name}")

        meta_context_str = self.meta_context.llm_str() if self.meta_context else "No project context available."
        project_type = self.meta_context.project_type if self.meta_context else "unknown"

        cluster_str = cluster_analysis.llm_str() if cluster_analysis else "No cluster analysis available."

        prompt = self.prompts["final_analysis"].format(
            project_name=self.project_name,
            cluster_analysis=cluster_str,
            meta_context=meta_context_str,
            project_type=project_type,
        )

        # Build validation context with CFG graphs for edge checking
        context = ValidationContext(
            cluster_results=cluster_results,
            cfg_graphs={lang: self.static_analysis.get_cfg(lang) for lang in self.static_analysis.get_languages()},
        )

        return self._validation_invoke(
            prompt,
            AnalysisInsights,
            validators=[
                validate_relation_component_names,
                validate_component_relationships,
                validate_key_entities,
                validate_cluster_ids_populated,
            ],
            context=context,
        )

    def run(self):
        """Run the full abstraction pipeline.

        Returns:
            Tuple of (AnalysisInsights, dict[str, ClusterResult]): the finished
            analysis and the cluster results it was computed from.
        """
        # Build full cluster results dict for all languages ONCE
        cluster_results = build_all_cluster_results(self.static_analysis)

        # Step 1: Group related clusters together into logical components
        cluster_analysis = self.step_clusters_grouping(cluster_results)

        # Step 2: Generate abstract components from grouped clusters
        analysis = self.step_final_analysis(cluster_analysis, cluster_results)
        # Step 3: Sanitize cluster IDs (remove invalid ones)
        self._sanitize_component_cluster_ids(analysis, cluster_results=cluster_results)
        # Step 4: Assign files to components (deterministic + LLM-based with validation)
        self.classify_files(analysis, cluster_results, self.static_analysis.get_all_source_files())
        # Step 5: Fix source code reference lines (resolves reference_file paths for key_entities)
        analysis = self.fix_source_code_reference_lines(analysis)
        # Step 6: Ensure unique key entities across components
        self._ensure_unique_key_entities(analysis)
        # Step 7: Ensure unique file assignments across components
        self._ensure_unique_file_assignments(analysis)
        # Step 8: Assign deterministic component IDs
        assign_component_ids(analysis)

        return analysis, cluster_results
|
agents/agent.py
ADDED
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from google.api_core.exceptions import ResourceExhausted
|
|
8
|
+
from langchain_core.exceptions import OutputParserException
|
|
9
|
+
from langchain_core.language_models import BaseChatModel
|
|
10
|
+
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
|
|
11
|
+
from langchain_core.output_parsers import PydanticOutputParser
|
|
12
|
+
from langchain_core.prompts import PromptTemplate
|
|
13
|
+
from langchain.agents import create_agent
|
|
14
|
+
from langgraph.graph.state import CompiledStateGraph
|
|
15
|
+
from pydantic import ValidationError
|
|
16
|
+
from trustcall import create_extractor
|
|
17
|
+
|
|
18
|
+
from agents.agent_responses import AnalysisInsights, ComponentFiles, FileClassification
|
|
19
|
+
from agents.prompts import (
|
|
20
|
+
get_unassigned_files_classification_message,
|
|
21
|
+
get_validation_feedback_message,
|
|
22
|
+
)
|
|
23
|
+
from agents.tools.base import RepoContext
|
|
24
|
+
from agents.tools.toolkit import CodeBoardingToolkit
|
|
25
|
+
from agents.validation import ValidationContext, validate_file_classifications
|
|
26
|
+
from monitoring.mixin import MonitoringMixin
|
|
27
|
+
from repo_utils.ignore import RepoIgnoreManager
|
|
28
|
+
from agents.llm_config import MONITORING_CALLBACK
|
|
29
|
+
from static_analyzer.analysis_result import StaticAnalysisResults
|
|
30
|
+
from static_analyzer.reference_resolve_mixin import ReferenceResolverMixin
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class CodeBoardingAgent(ReferenceResolverMixin, MonitoringMixin):
    """Base class for tool-using LLM agents operating on a repository.

    Wires a langgraph agent (``create_agent``) to the CodeBoarding toolkit and
    provides resilient invocation (`_invoke`: timeout + exponential backoff),
    structured-output parsing (`_parse_response` via trustcall with Pydantic
    fallbacks), a validate-and-retry loop (`_validation_invoke`), and the
    two-pass file-to-component classification used by subclasses.
    """

    def __init__(
        self,
        repo_dir: Path,
        static_analysis: StaticAnalysisResults,
        system_message: str,
        agent_llm: BaseChatModel,
        parsing_llm: BaseChatModel,
    ):
        """Initialize mixins, toolkit, and the compiled agent graph.

        Args:
            repo_dir: Root of the repository being analyzed.
            static_analysis: Pre-computed static-analysis results.
            system_message: System prompt content for the agent loop.
            agent_llm: Chat model driving the tool-using agent.
            parsing_llm: Chat model used for structured-output extraction.
        """
        ReferenceResolverMixin.__init__(self, repo_dir, static_analysis)
        MonitoringMixin.__init__(self)
        self.parsing_llm = parsing_llm
        self.repo_dir = repo_dir
        self.ignore_manager = RepoIgnoreManager(repo_dir)

        # Initialize the professional toolkit
        context = RepoContext(repo_dir=repo_dir, ignore_manager=self.ignore_manager, static_analysis=static_analysis)
        self.toolkit = CodeBoardingToolkit(context=context)

        self.agent: CompiledStateGraph = create_agent(
            model=agent_llm,
            tools=self.toolkit.get_agent_tools(),
        )
        self.static_analysis = static_analysis
        self.system_message = SystemMessage(content=system_message)

    # --- Thin property facades over the toolkit, kept for backward-compatible
    # --- access from subclasses and tests.

    @property
    def read_source_reference(self):
        return self.toolkit.read_source_reference

    @property
    def read_packages_tool(self):
        return self.toolkit.read_packages

    @property
    def read_structure_tool(self):
        return self.toolkit.read_structure

    @property
    def read_file_structure(self):
        return self.toolkit.read_file_structure

    @property
    def read_cfg_tool(self):
        return self.toolkit.read_cfg

    @property
    def read_method_invocations_tool(self):
        return self.toolkit.read_method_invocations

    @property
    def read_file_tool(self):
        return self.toolkit.read_file

    @property
    def read_docs(self):
        return self.toolkit.read_docs

    @property
    def external_deps_tool(self):
        return self.toolkit.external_deps

    def _invoke(self, prompt, callbacks: list | None = None) -> str:
        """Unified agent invocation method with timeout and exponential backoff.

        Uses exponential backoff based on total attempts, with different multipliers
        for different error types. This ensures backoff increases appropriately even
        when errors alternate between types.

        Args:
            prompt: Human-message content for the agent.
            callbacks: Optional extra langchain callbacks; the caller's list is
                never mutated.

        Returns:
            The final AIMessage content as a string, or a fixed error string
            after all retries are exhausted.
        """
        max_retries = 5

        for attempt in range(max_retries):
            timeout_seconds = 300 if attempt == 0 else 600
            try:
                # BUGFIX: copy the caller's list. The previous `callbacks or []`
                # aliased the caller's list, mutating it and accumulating
                # duplicate monitoring callbacks on every retry attempt.
                callback_list = list(callbacks) if callbacks else []
                # Always append monitoring callback - logging config controls output
                callback_list.append(MONITORING_CALLBACK)
                callback_list.append(self.agent_monitoring_callback)

                logger.info(
                    f"Starting agent.invoke() [attempt {attempt + 1}/{max_retries}] with prompt length: {len(prompt)}, timeout: {timeout_seconds}s"
                )

                response = self._invoke_with_timeout(
                    timeout_seconds=timeout_seconds, callback_list=callback_list, prompt=prompt
                )

                logger.info(
                    f"Completed agent.invoke() - message count: {len(response['messages'])}, last message type: {type(response['messages'][-1])}"
                )

                agent_response = response["messages"][-1]
                assert isinstance(agent_response, AIMessage), f"Expected AIMessage, but got {type(agent_response)}"
                if isinstance(agent_response.content, str):
                    return agent_response.content
                if isinstance(agent_response.content, list):
                    # Multi-part content (e.g. tool traces): flatten to one string.
                    return "".join(
                        [
                            str(message) if not isinstance(message, str) else message
                            for message in agent_response.content
                        ]
                    )
                # Any other content shape falls through and is retried.

            except TimeoutError:
                if attempt < max_retries - 1:
                    # Exponential backoff: 10s * 2^attempt (10s, 20s, 40s, 80s)
                    delay = min(10 * (2**attempt), 120)
                    logger.warning(
                        f"Agent invocation timed out after {timeout_seconds}s, retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
                    )
                    time.sleep(delay)
                else:
                    logger.error(f"Agent invocation timed out after {timeout_seconds}s on final attempt")
                    raise

            except ResourceExhausted as e:
                if attempt < max_retries - 1:
                    # Longer backoff for rate limits: 30s * 2^attempt (30s, 60s, 120s, 240s)
                    delay = min(30 * (2**attempt), 300)
                    logger.warning(
                        f"ResourceExhausted (rate limit): {e}\n"
                        f"Retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
                    )
                    time.sleep(delay)
                else:
                    logger.error(f"Max retries ({max_retries}) reached. ResourceExhausted: {e}")
                    raise

            except Exception as e:
                # Other errors (network, parsing, etc.) get standard exponential backoff
                if attempt < max_retries - 1:
                    delay = min(10 * (2**attempt), 120)
                    logger.warning(
                        f"Agent error: {type(e).__name__}: {e}, retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
                    )
                    time.sleep(delay)
                # On final attempt, fall through to return error message below

        logger.error("Max retries reached. Failed to get response from the agent.")
        return "Could not get response from the agent."

    def _invoke_with_timeout(self, timeout_seconds: int, callback_list: list, prompt: str):
        """Invoke agent with a timeout using threading.

        Runs ``self.agent.invoke`` on a daemon thread and joins with a timeout.
        Raises TimeoutError if the thread is still alive after the deadline;
        re-raises any exception the invocation produced.
        """
        import threading
        from queue import Queue, Empty

        result_queue: Queue = Queue()
        exception_queue: Queue = Queue()

        def invoke_target():
            try:
                response = self.agent.invoke(
                    {"messages": [self.system_message, HumanMessage(content=prompt)]},
                    config={"callbacks": callback_list, "recursion_limit": 40},
                )
                result_queue.put(response)
            except Exception as e:
                exception_queue.put(e)

        thread = threading.Thread(target=invoke_target, daemon=True)
        thread.start()
        thread.join(timeout=timeout_seconds)

        if thread.is_alive():
            # Thread is still running - timeout occurred
            logger.error(f"Agent invoke thread still running after {timeout_seconds}s timeout")
            raise TimeoutError(f"Agent invocation exceeded {timeout_seconds}s timeout")

        # Check for exceptions
        try:
            exception = exception_queue.get_nowait()
            raise exception
        except Empty:
            pass

        # Get result
        try:
            return result_queue.get_nowait()
        except Empty:
            raise RuntimeError("Agent invocation completed but no result was returned")

    def _parse_invoke(self, prompt, return_type):
        """Invoke the agent and parse the text response into `return_type`.

        Note: the second parameter was renamed from `type` (which shadowed the
        builtin); all in-class callers pass it positionally.
        """
        response = self._invoke(prompt)
        assert isinstance(response, str), f"Expected a string as response type got {response}"
        return self._parse_response(prompt, response, return_type)

    def _validation_invoke(
        self, prompt: str, return_type: type, validators: list, context, max_validation_retries: int = 1
    ):
        """
        Invoke LLM with validation and feedback loop.

        Args:
            prompt: The original prompt
            return_type: Pydantic type to parse into
            validators: List of validation functions to run
            context: ValidationContext with data needed for validation
            max_validation_retries: Maximum retry attempts with feedback (default: 1)

        Returns:
            Validated result of return_type
        """
        result = self._parse_invoke(prompt, return_type)

        for attempt in range(max_validation_retries):
            # Run all validators
            all_feedback = []
            for validator in validators:
                validation_result = validator(result, context)
                if not validation_result.is_valid:
                    all_feedback.extend(validation_result.feedback_messages)

            if not all_feedback:
                logger.info(f"[Validation] All validations passed on attempt {attempt + 1}")
                return result  # All validations passed

            # Build feedback prompt using the prompt factory
            feedback_template = get_validation_feedback_message()
            feedback_prompt = feedback_template.format(
                original_output=result.llm_str(),
                feedback_list="\n".join(f"- {msg}" for msg in all_feedback),
                original_prompt=prompt,
            )

            logger.info(
                f"[Validation] Retry {attempt + 1}/{max_validation_retries} with {len(all_feedback)} feedback items"
            )
            result = self._parse_invoke(feedback_prompt, return_type)

        # NOTE: the final retry's result is returned unvalidated (best effort).
        return result

    def _parse_response(self, prompt, response, return_type, max_retries=5, attempt=0):
        """Parse a raw LLM response into `return_type` via trustcall, with
        Pydantic-parser fallbacks and retry/backoff for transient errors.

        Raises:
            Exception: when `max_retries` parse attempts are exhausted.
        """
        if attempt >= max_retries:
            logger.error(f"Max retries ({max_retries}) reached for parsing response: {response}")
            raise Exception(f"Max retries reached for parsing response: {response}")

        extractor = create_extractor(self.parsing_llm, tools=[return_type], tool_choice=return_type.__name__)
        if response is None or response.strip() == "":
            logger.error(f"Empty response for prompt: {prompt}")
            # BUGFIX: coerce to "" — the concatenation below raised an uncaught
            # TypeError when response was None.
            response = ""
        try:
            result = extractor.invoke(
                return_type.extractor_str() + response,
                config={"callbacks": [MONITORING_CALLBACK, self.agent_monitoring_callback]},
            )
            if "responses" in result and len(result["responses"]) != 0:
                return return_type.model_validate(result["responses"][0])
            if "messages" in result and len(result["messages"]) != 0:
                message = result["messages"][0].content
                parser = PydanticOutputParser(pydantic_object=return_type)
                return self._try_parse(message, parser)
            parser = PydanticOutputParser(pydantic_object=return_type)
            return self._try_parse(response, parser)
        except AttributeError as e:
            # Workaround for trustcall bug: https://github.com/hinthornw/trustcall/issues/47
            # 'ExtractionState' object has no attribute 'tool_call_id' occurs during validation retry
            if "tool_call_id" in str(e):
                logger.warning(f"Trustcall bug encountered, falling back to Pydantic parser: {e}")
                parser = PydanticOutputParser(pydantic_object=return_type)
                return self._try_parse(response, parser)
            raise
        except IndexError as e:
            # try to parse with the json parser if possible
            logger.warning(f"IndexError while parsing response (attempt {attempt + 1}/{max_retries}): {e}")
            return self._parse_response(prompt, response, return_type, max_retries, attempt + 1)
        except ResourceExhausted as e:
            # Parsing uses exponential backoff for rate limits
            if attempt < max_retries - 1:
                # Exponential backoff: 30s * 2^attempt, capped at 300s
                delay = min(30 * (2**attempt), 300)
                logger.warning(
                    f"ResourceExhausted during parsing (rate limit): {e}\n"
                    f"Retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
                )
                time.sleep(delay)
                return self._parse_response(prompt, response, return_type, max_retries, attempt + 1)
            else:
                logger.error(f"Resource exhausted on final parsing attempt: {e}")
                raise

    def _try_parse(self, message_content, parser):
        """Ask the parsing LLM to reshape `message_content` into the parser's
        JSON schema; on failure, recursively try each top-level JSON value.

        Raises:
            ValueError: when no sub-value of the content parses either.
        """
        try:
            prompt_template = """You are an JSON expert. Here you need to extract information in the following json format: {format_instructions}

Here is the content to parse and fix: {adjective}

Please provide only the JSON output without any additional text."""
            prompt = PromptTemplate(
                template=prompt_template,
                input_variables=["adjective"],
                partial_variables={"format_instructions": parser.get_format_instructions()},
            )
            chain = prompt | self.parsing_llm | parser
            return chain.invoke(
                {"adjective": message_content},
                config={"callbacks": [MONITORING_CALLBACK, self.agent_monitoring_callback]},
            )
        except (ValidationError, OutputParserException):
            # The target object may be nested one level down; try each value.
            for _, v in json.loads(message_content).items():
                try:
                    return self._try_parse(json.dumps(v), parser)
                except Exception:
                    # narrowed from a bare `except:` which also swallowed
                    # KeyboardInterrupt/SystemExit
                    pass
            raise ValueError(f"Couldn't parse {message_content}")

    def classify_files(self, analysis: AnalysisInsights, cluster_results: dict, scope_files: list[str]) -> None:
        """
        Two-pass file assignment for AnalysisInsights:
        1. Deterministic: assign files from cluster_ids and key_entities
        2. LLM-based: classify remaining unassigned files

        Args:
            analysis: AnalysisInsights object to classify files for
            cluster_results: Dict mapping language -> ClusterResult (for the relevant scope)
            scope_files: List of file paths to limit classification scope.

        Requires self to be a mixin with ClusterMethodsMixin for helper methods.
        """
        for comp in analysis.components:
            # Deterministic assignment (uses mixin methods)
            self._assign_files_to_component(comp, cluster_results)  # type: ignore[attr-defined]
        self._classify_unassigned_files_llm(analysis, scope_files)
        self._log_unclassified_files_count(analysis, scope_files)

    def _classify_unassigned_files_llm(self, analysis: AnalysisInsights, scope_files: list[str]) -> None:
        """
        Classify files from the scope files that weren't assigned to any component.
        Uses a single LLM call to classify all unassigned files.

        Args:
            analysis: AnalysisInsights object
            scope_files: List of file paths to limit classification scope.
        """
        # Get unassigned files using the helper method
        unassigned_files = self._get_unassigned_files(analysis, scope_files)

        if not unassigned_files:
            logger.info("[Agent] All files already assigned, skipping LLM classification")
            return

        logger.info(f"[Agent] Found {len(unassigned_files)} unassigned files, using LLM classification")

        # Build component summary for LLM using llm_str(); the synthetic
        # "Unclassified" component is never offered as a target.
        valid_components = [comp for comp in analysis.components if comp.name != "Unclassified"]
        components_summary = "\n\n".join(comp.llm_str() for comp in valid_components)
        component_map = {comp.name: comp for comp in valid_components}

        # Classify all unassigned files with LLM
        classifications: list[FileClassification] = self._classify_unassigned_files_with_llm(
            unassigned_files, components_summary, analysis
        )

        # Append successfully classified files to components
        for fc in classifications:
            if fc.component_name in component_map:
                comp = component_map[fc.component_name]
                if fc.file_path not in comp.assigned_files:
                    comp.assigned_files.append(fc.file_path)
                    logger.debug(f"[Agent] Assigned {fc.file_path} to {fc.component_name}")
            else:
                logger.warning(
                    f"[Agent] Invalid component name '{fc.component_name}' for file {fc.file_path}, skipping"
                )

        logger.info(f"[Agent] File classification complete: {len(classifications)} files classified")

    def _get_unassigned_files(self, analysis: AnalysisInsights, scope_files: list[str]) -> list[str]:
        """
        Check which files remain unassigned after classification.

        Args:
            analysis: AnalysisInsights object with classified components
            scope_files: List of file paths to limit the scope.
        Returns:
            List of file paths that are still unassigned (repo-relative, sorted)
        """
        # 1. Gather all assigned files, normalized to repo-relative paths
        assigned_files = set()
        for comp in analysis.components:
            for f in comp.assigned_files:
                abs_path = os.path.join(self.repo_dir, f) if not os.path.isabs(f) else f
                assigned_files.add(os.path.relpath(abs_path, self.repo_dir))

        # 2. Get files to consider for classification
        # If scope_files is provided (e.g., DetailsAgent), use those
        # Otherwise use all source files from static_analysis (e.g., AbstractionAgent)
        all_files = set()
        for file_path in scope_files:
            file_path_str = str(file_path)
            rel_path = os.path.relpath(file_path_str, self.repo_dir) if os.path.isabs(file_path_str) else file_path_str
            all_files.add(rel_path)

        # 3. Return unassigned files
        return sorted(all_files - assigned_files)

    def _log_unclassified_files_count(self, analysis: AnalysisInsights, scope_files: list[str]) -> None:
        """
        Log how many files remain unclassified within the analysis.

        Args:
            analysis: AnalysisInsights object with classified components
            scope_files: List of file paths which are expected to be within the analysis.
        """
        unassigned = self._get_unassigned_files(analysis, scope_files)
        if unassigned:
            logger.warning(f"[Agent] {len(unassigned)} files have not been classified successfully: {unassigned}")
        else:
            logger.info("[Agent] All files have been classified successfully")

    def _classify_unassigned_files_with_llm(
        self, unassigned_files: list[str], components_summary: str, analysis: AnalysisInsights
    ) -> list[FileClassification]:
        """
        Classify unassigned files using LLM with validation.
        Returns list of FileClassification objects.
        """

        prompt = PromptTemplate(
            template=get_unassigned_files_classification_message(), input_variables=["unassigned_files", "components"]
        ).format(unassigned_files="\n".join(unassigned_files), components=components_summary)

        # Valid component names are taken from the full analysis (not just the
        # summary) so the validator accepts any real component.
        valid_component_names = {comp.name for comp in analysis.components}

        # Build validation context
        context = ValidationContext(
            expected_files=set(unassigned_files),
            valid_component_names=valid_component_names,
            repo_dir=str(self.repo_dir),
        )

        file_classifications = self._validation_invoke(
            prompt, ComponentFiles, validators=[validate_file_classifications], context=context
        )
        return file_classifications.file_paths
|