codeboarding 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. agents/__init__.py +0 -0
  2. agents/abstraction_agent.py +150 -0
  3. agents/agent.py +467 -0
  4. agents/agent_responses.py +363 -0
  5. agents/cluster_methods_mixin.py +281 -0
  6. agents/constants.py +13 -0
  7. agents/dependency_discovery.py +159 -0
  8. agents/details_agent.py +174 -0
  9. agents/llm_config.py +309 -0
  10. agents/meta_agent.py +105 -0
  11. agents/planner_agent.py +105 -0
  12. agents/prompts/__init__.py +85 -0
  13. agents/prompts/abstract_prompt_factory.py +63 -0
  14. agents/prompts/claude_prompts.py +381 -0
  15. agents/prompts/deepseek_prompts.py +389 -0
  16. agents/prompts/gemini_flash_prompts.py +362 -0
  17. agents/prompts/glm_prompts.py +407 -0
  18. agents/prompts/gpt_prompts.py +470 -0
  19. agents/prompts/kimi_prompts.py +400 -0
  20. agents/prompts/prompt_factory.py +179 -0
  21. agents/tools/__init__.py +8 -0
  22. agents/tools/base.py +96 -0
  23. agents/tools/get_external_deps.py +47 -0
  24. agents/tools/get_method_invocations.py +47 -0
  25. agents/tools/read_cfg.py +60 -0
  26. agents/tools/read_docs.py +132 -0
  27. agents/tools/read_file.py +90 -0
  28. agents/tools/read_file_structure.py +156 -0
  29. agents/tools/read_git_diff.py +131 -0
  30. agents/tools/read_packages.py +60 -0
  31. agents/tools/read_source.py +105 -0
  32. agents/tools/read_structure.py +49 -0
  33. agents/tools/toolkit.py +119 -0
  34. agents/validation.py +383 -0
  35. caching/__init__.py +4 -0
  36. caching/cache.py +29 -0
  37. caching/meta_cache.py +227 -0
  38. codeboarding-0.9.0.dist-info/METADATA +223 -0
  39. codeboarding-0.9.0.dist-info/RECORD +126 -0
  40. codeboarding-0.9.0.dist-info/WHEEL +5 -0
  41. codeboarding-0.9.0.dist-info/entry_points.txt +3 -0
  42. codeboarding-0.9.0.dist-info/licenses/LICENSE +21 -0
  43. codeboarding-0.9.0.dist-info/top_level.txt +18 -0
  44. core/__init__.py +101 -0
  45. core/plugin_loader.py +46 -0
  46. core/protocols.py +27 -0
  47. core/registry.py +46 -0
  48. diagram_analysis/__init__.py +4 -0
  49. diagram_analysis/analysis_json.py +346 -0
  50. diagram_analysis/diagram_generator.py +486 -0
  51. diagram_analysis/file_coverage.py +212 -0
  52. diagram_analysis/incremental/__init__.py +63 -0
  53. diagram_analysis/incremental/component_checker.py +236 -0
  54. diagram_analysis/incremental/file_manager.py +217 -0
  55. diagram_analysis/incremental/impact_analyzer.py +238 -0
  56. diagram_analysis/incremental/io_utils.py +281 -0
  57. diagram_analysis/incremental/models.py +72 -0
  58. diagram_analysis/incremental/path_patching.py +164 -0
  59. diagram_analysis/incremental/reexpansion.py +166 -0
  60. diagram_analysis/incremental/scoped_analysis.py +227 -0
  61. diagram_analysis/incremental/updater.py +464 -0
  62. diagram_analysis/incremental/validation.py +48 -0
  63. diagram_analysis/manifest.py +152 -0
  64. diagram_analysis/version.py +6 -0
  65. duckdb_crud.py +125 -0
  66. github_action.py +172 -0
  67. health/__init__.py +3 -0
  68. health/checks/__init__.py +11 -0
  69. health/checks/circular_deps.py +48 -0
  70. health/checks/cohesion.py +93 -0
  71. health/checks/coupling.py +140 -0
  72. health/checks/function_size.py +85 -0
  73. health/checks/god_class.py +167 -0
  74. health/checks/inheritance.py +104 -0
  75. health/checks/instability.py +77 -0
  76. health/checks/unused_code_diagnostics.py +338 -0
  77. health/config.py +172 -0
  78. health/constants.py +19 -0
  79. health/models.py +186 -0
  80. health/runner.py +236 -0
  81. install.py +518 -0
  82. logging_config.py +105 -0
  83. main.py +529 -0
  84. monitoring/__init__.py +12 -0
  85. monitoring/callbacks.py +163 -0
  86. monitoring/context.py +158 -0
  87. monitoring/mixin.py +16 -0
  88. monitoring/paths.py +47 -0
  89. monitoring/stats.py +50 -0
  90. monitoring/writers.py +172 -0
  91. output_generators/__init__.py +0 -0
  92. output_generators/html.py +163 -0
  93. output_generators/html_template.py +382 -0
  94. output_generators/markdown.py +140 -0
  95. output_generators/mdx.py +171 -0
  96. output_generators/sphinx.py +175 -0
  97. repo_utils/__init__.py +277 -0
  98. repo_utils/change_detector.py +289 -0
  99. repo_utils/errors.py +6 -0
  100. repo_utils/git_diff.py +74 -0
  101. repo_utils/ignore.py +341 -0
  102. static_analyzer/__init__.py +335 -0
  103. static_analyzer/analysis_cache.py +699 -0
  104. static_analyzer/analysis_result.py +269 -0
  105. static_analyzer/cluster_change_analyzer.py +391 -0
  106. static_analyzer/cluster_helpers.py +79 -0
  107. static_analyzer/constants.py +166 -0
  108. static_analyzer/git_diff_analyzer.py +224 -0
  109. static_analyzer/graph.py +746 -0
  110. static_analyzer/incremental_orchestrator.py +671 -0
  111. static_analyzer/java_config_scanner.py +232 -0
  112. static_analyzer/java_utils.py +227 -0
  113. static_analyzer/lsp_client/__init__.py +12 -0
  114. static_analyzer/lsp_client/client.py +1642 -0
  115. static_analyzer/lsp_client/diagnostics.py +62 -0
  116. static_analyzer/lsp_client/java_client.py +517 -0
  117. static_analyzer/lsp_client/language_settings.py +97 -0
  118. static_analyzer/lsp_client/typescript_client.py +235 -0
  119. static_analyzer/programming_language.py +152 -0
  120. static_analyzer/reference_resolve_mixin.py +166 -0
  121. static_analyzer/scanner.py +95 -0
  122. static_analyzer/typescript_config_scanner.py +54 -0
  123. tool_registry.py +433 -0
  124. user_config.py +134 -0
  125. utils.py +56 -0
  126. vscode_constants.py +124 -0
agents/abstraction_agent.py ADDED
@@ -0,0 +1,150 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from langchain_core.prompts import PromptTemplate
5
+ from langchain_core.language_models import BaseChatModel
6
+
7
+ from agents.agent import CodeBoardingAgent
8
+ from agents.agent_responses import (
9
+ AnalysisInsights,
10
+ ClusterAnalysis,
11
+ MetaAnalysisInsights,
12
+ assign_component_ids,
13
+ )
14
+ from agents.prompts import (
15
+ get_system_message,
16
+ get_cluster_grouping_message,
17
+ get_final_analysis_message,
18
+ )
19
+ from agents.cluster_methods_mixin import ClusterMethodsMixin
20
+ from agents.validation import (
21
+ ValidationContext,
22
+ validate_cluster_coverage,
23
+ validate_component_relationships,
24
+ validate_key_entities,
25
+ validate_cluster_ids_populated,
26
+ validate_relation_component_names,
27
+ )
28
+ from monitoring import trace
29
+ from static_analyzer.analysis_result import StaticAnalysisResults
30
+ from static_analyzer.graph import ClusterResult
31
+ from static_analyzer.cluster_helpers import build_all_cluster_results, get_all_cluster_ids
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
class AbstractionAgent(ClusterMethodsMixin, CodeBoardingAgent):
    """High-level analysis agent.

    Groups CFG clusters into logical components and then derives the final
    abstract architecture analysis for the project, applying validation and
    post-processing (file assignment, uniqueness invariants, component ids).
    """

    def __init__(
        self,
        repo_dir: Path,
        static_analysis: StaticAnalysisResults,
        project_name: str,
        meta_context: MetaAnalysisInsights,
        agent_llm: BaseChatModel,
        parsing_llm: BaseChatModel,
    ):
        super().__init__(repo_dir, static_analysis, get_system_message(), agent_llm, parsing_llm)

        self.project_name = project_name
        self.meta_context = meta_context

        # Prompt templates for the two LLM-driven analysis steps.
        grouping_template = PromptTemplate(
            template=get_cluster_grouping_message(),
            input_variables=["project_name", "cfg_clusters", "meta_context", "project_type"],
        )
        final_template = PromptTemplate(
            template=get_final_analysis_message(),
            input_variables=["project_name", "cluster_analysis", "meta_context", "project_type"],
        )
        self.prompts = {
            "group_clusters": grouping_template,
            "final_analysis": final_template,
        }

    def _context_fields(self) -> tuple[str, str]:
        """Return (meta-context summary, project type), with fallbacks when
        no meta analysis is available."""
        if self.meta_context:
            return self.meta_context.llm_str(), self.meta_context.project_type
        return "No project context available.", "unknown"

    @trace
    def step_clusters_grouping(self, cluster_results: dict[str, ClusterResult]) -> ClusterAnalysis:
        """Ask the LLM to group related CFG clusters into logical components.

        Args:
            cluster_results: Pre-computed mapping of language -> ClusterResult.

        Returns:
            ClusterAnalysis validated for full cluster coverage.
        """
        logger.info(f"[AbstractionAgent] Grouping CFG clusters for: {self.project_name}")

        context_text, proj_type = self._context_fields()
        languages = self.static_analysis.get_languages()

        # Render the pre-computed cluster results into the prompt's format.
        clusters_text = self._build_cluster_string(languages, cluster_results)

        rendered = self.prompts["group_clusters"].format(
            project_name=self.project_name,
            cfg_clusters=clusters_text,
            meta_context=context_text,
            project_type=proj_type,
        )

        return self._validation_invoke(
            rendered,
            ClusterAnalysis,
            validators=[validate_cluster_coverage],
            context=ValidationContext(
                cluster_results=cluster_results,
                expected_cluster_ids=get_all_cluster_ids(cluster_results),
            ),
        )

    @trace
    def step_final_analysis(
        self, cluster_analysis: ClusterAnalysis, cluster_results: dict[str, ClusterResult]
    ) -> AnalysisInsights:
        """Produce the abstract component analysis from the grouped clusters.

        Args:
            cluster_analysis: Output of step_clusters_grouping.
            cluster_results: Pre-computed mapping of language -> ClusterResult.

        Returns:
            AnalysisInsights validated for relationships, key entities and
            populated cluster ids.
        """
        logger.info(f"[AbstractionAgent] Generating final analysis for: {self.project_name}")

        context_text, proj_type = self._context_fields()
        clusters_text = cluster_analysis.llm_str() if cluster_analysis else "No cluster analysis available."

        rendered = self.prompts["final_analysis"].format(
            project_name=self.project_name,
            cluster_analysis=clusters_text,
            meta_context=context_text,
            project_type=proj_type,
        )

        # Validation needs the CFG graphs of every language to check edges.
        validation_ctx = ValidationContext(
            cluster_results=cluster_results,
            cfg_graphs={lang: self.static_analysis.get_cfg(lang) for lang in self.static_analysis.get_languages()},
        )

        return self._validation_invoke(
            rendered,
            AnalysisInsights,
            validators=[
                validate_relation_component_names,
                validate_component_relationships,
                validate_key_entities,
                validate_cluster_ids_populated,
            ],
            context=validation_ctx,
        )

    def run(self):
        """Execute the full abstraction pipeline.

        Returns:
            Tuple of (AnalysisInsights, cluster_results dict).
        """
        # Compute cluster results for every language exactly once.
        cluster_results = build_all_cluster_results(self.static_analysis)

        # Step 1 + 2: group clusters, then derive abstract components.
        grouped = self.step_clusters_grouping(cluster_results)
        analysis = self.step_final_analysis(grouped, cluster_results)

        # Post-processing: prune invalid cluster ids, assign files to
        # components, resolve source references, enforce uniqueness, and
        # finally stamp deterministic component ids.
        self._sanitize_component_cluster_ids(analysis, cluster_results=cluster_results)
        self.classify_files(analysis, cluster_results, self.static_analysis.get_all_source_files())
        analysis = self.fix_source_code_reference_lines(analysis)
        self._ensure_unique_key_entities(analysis)
        self._ensure_unique_file_assignments(analysis)
        assign_component_ids(analysis)

        return analysis, cluster_results
agents/agent.py ADDED
@@ -0,0 +1,467 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import time
5
+ from pathlib import Path
6
+
7
+ from google.api_core.exceptions import ResourceExhausted
8
+ from langchain_core.exceptions import OutputParserException
9
+ from langchain_core.language_models import BaseChatModel
10
+ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
11
+ from langchain_core.output_parsers import PydanticOutputParser
12
+ from langchain_core.prompts import PromptTemplate
13
+ from langchain.agents import create_agent
14
+ from langgraph.graph.state import CompiledStateGraph
15
+ from pydantic import ValidationError
16
+ from trustcall import create_extractor
17
+
18
+ from agents.agent_responses import AnalysisInsights, ComponentFiles, FileClassification
19
+ from agents.prompts import (
20
+ get_unassigned_files_classification_message,
21
+ get_validation_feedback_message,
22
+ )
23
+ from agents.tools.base import RepoContext
24
+ from agents.tools.toolkit import CodeBoardingToolkit
25
+ from agents.validation import ValidationContext, validate_file_classifications
26
+ from monitoring.mixin import MonitoringMixin
27
+ from repo_utils.ignore import RepoIgnoreManager
28
+ from agents.llm_config import MONITORING_CALLBACK
29
+ from static_analyzer.analysis_result import StaticAnalysisResults
30
+ from static_analyzer.reference_resolve_mixin import ReferenceResolverMixin
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
class CodeBoardingAgent(ReferenceResolverMixin, MonitoringMixin):
    """Base agent wrapping a tool-calling LLM with retry, timeout, structured
    output parsing, and validation-feedback loops.

    Subclasses provide prompts and orchestrate analysis steps; this class owns
    the LLM plumbing (agent graph, toolkit, parsing and validation helpers).
    """

    def __init__(
        self,
        repo_dir: Path,
        static_analysis: StaticAnalysisResults,
        system_message: str,
        agent_llm: BaseChatModel,
        parsing_llm: BaseChatModel,
    ):
        ReferenceResolverMixin.__init__(self, repo_dir, static_analysis)
        MonitoringMixin.__init__(self)
        self.parsing_llm = parsing_llm
        self.repo_dir = repo_dir
        self.ignore_manager = RepoIgnoreManager(repo_dir)

        # Initialize the toolkit shared by all agent tools.
        context = RepoContext(repo_dir=repo_dir, ignore_manager=self.ignore_manager, static_analysis=static_analysis)
        self.toolkit = CodeBoardingToolkit(context=context)

        self.agent: CompiledStateGraph = create_agent(
            model=agent_llm,
            tools=self.toolkit.get_agent_tools(),
        )
        self.static_analysis = static_analysis
        self.system_message = SystemMessage(content=system_message)

    # --- Tool accessors (thin pass-throughs to the toolkit) -----------------

    @property
    def read_source_reference(self):
        return self.toolkit.read_source_reference

    @property
    def read_packages_tool(self):
        return self.toolkit.read_packages

    @property
    def read_structure_tool(self):
        return self.toolkit.read_structure

    @property
    def read_file_structure(self):
        return self.toolkit.read_file_structure

    @property
    def read_cfg_tool(self):
        return self.toolkit.read_cfg

    @property
    def read_method_invocations_tool(self):
        return self.toolkit.read_method_invocations

    @property
    def read_file_tool(self):
        return self.toolkit.read_file

    @property
    def read_docs(self):
        return self.toolkit.read_docs

    @property
    def external_deps_tool(self):
        return self.toolkit.external_deps

    def _invoke(self, prompt, callbacks: list | None = None) -> str:
        """Unified agent invocation method with timeout and exponential backoff.

        Uses exponential backoff based on total attempts, with different multipliers
        for different error types. This ensures backoff increases appropriately even
        when errors alternate between types.

        Args:
            prompt: The human prompt to send to the agent.
            callbacks: Optional extra LangChain callbacks (never mutated).

        Returns:
            The agent's final text response, or a fixed error string after
            all retries are exhausted on generic errors.
        """
        max_retries = 5

        for attempt in range(max_retries):
            timeout_seconds = 300 if attempt == 0 else 600
            try:
                # Copy the caller's list: the previous implementation appended
                # in place, so retries accumulated duplicate monitoring
                # callbacks into the caller-owned list.
                callback_list = list(callbacks) if callbacks else []
                # Always include monitoring callbacks - logging config controls output.
                callback_list.append(MONITORING_CALLBACK)
                callback_list.append(self.agent_monitoring_callback)

                logger.info(
                    f"Starting agent.invoke() [attempt {attempt + 1}/{max_retries}] with prompt length: {len(prompt)}, timeout: {timeout_seconds}s"
                )

                response = self._invoke_with_timeout(
                    timeout_seconds=timeout_seconds, callback_list=callback_list, prompt=prompt
                )

                logger.info(
                    f"Completed agent.invoke() - message count: {len(response['messages'])}, last message type: {type(response['messages'][-1])}"
                )

                agent_response = response["messages"][-1]
                # Raise (not assert) so the check survives python -O; the
                # generic handler below retries it just like the old assert.
                if not isinstance(agent_response, AIMessage):
                    raise TypeError(f"Expected AIMessage, but got {type(agent_response)}")
                content = agent_response.content
                if isinstance(content, str):
                    return content
                if isinstance(content, list):
                    return "".join(part if isinstance(part, str) else str(part) for part in content)
                # Unexpected content type: coerce instead of silently falling
                # through to a wasteful full retry (previous behavior).
                logger.warning(f"Unexpected agent content type {type(content)}, coercing to str")
                return str(content)

            except TimeoutError:
                if attempt < max_retries - 1:
                    # Exponential backoff: 10s * 2^attempt (10s, 20s, 40s, 80s), capped at 120s
                    delay = min(10 * (2**attempt), 120)
                    logger.warning(
                        f"Agent invocation timed out after {timeout_seconds}s, retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
                    )
                    time.sleep(delay)
                else:
                    logger.error(f"Agent invocation timed out after {timeout_seconds}s on final attempt")
                    raise

            except ResourceExhausted as e:
                if attempt < max_retries - 1:
                    # Longer backoff for rate limits: 30s * 2^attempt (30s, 60s, 120s, 240s), capped at 300s
                    delay = min(30 * (2**attempt), 300)
                    logger.warning(
                        f"ResourceExhausted (rate limit): {e}\n"
                        f"Retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
                    )
                    time.sleep(delay)
                else:
                    logger.error(f"Max retries ({max_retries}) reached. ResourceExhausted: {e}")
                    raise

            except Exception as e:
                # Other errors (network, parsing, etc.) get standard exponential backoff
                if attempt < max_retries - 1:
                    delay = min(10 * (2**attempt), 120)
                    logger.warning(
                        f"Agent error: {type(e).__name__}: {e}, retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
                    )
                    time.sleep(delay)
                # On final attempt, fall through to return error message below

        logger.error("Max retries reached. Failed to get response from the agent.")
        return "Could not get response from the agent."

    def _invoke_with_timeout(self, timeout_seconds: int, callback_list: list, prompt: str):
        """Invoke the agent with a hard timeout using a daemon worker thread.

        Raises:
            TimeoutError: if the invocation exceeds timeout_seconds.
            RuntimeError: if the thread finished but produced no result.
            Exception: any exception raised inside the agent invocation.
        """
        import threading
        from queue import Queue, Empty

        result_queue: Queue = Queue()
        exception_queue: Queue = Queue()

        def invoke_target():
            try:
                response = self.agent.invoke(
                    {"messages": [self.system_message, HumanMessage(content=prompt)]},
                    config={"callbacks": callback_list, "recursion_limit": 40},
                )
                result_queue.put(response)
            except Exception as e:
                exception_queue.put(e)

        # Daemon thread: if it hangs past the timeout it won't block shutdown.
        thread = threading.Thread(target=invoke_target, daemon=True)
        thread.start()
        thread.join(timeout=timeout_seconds)

        if thread.is_alive():
            # Thread is still running - timeout occurred
            logger.error(f"Agent invoke thread still running after {timeout_seconds}s timeout")
            raise TimeoutError(f"Agent invocation exceeded {timeout_seconds}s timeout")

        # Re-raise any exception captured inside the worker thread.
        try:
            exception = exception_queue.get_nowait()
            raise exception
        except Empty:
            pass

        try:
            return result_queue.get_nowait()
        except Empty:
            raise RuntimeError("Agent invocation completed but no result was returned")

    def _parse_invoke(self, prompt, type):  # noqa: A002 - param name kept for caller compatibility
        """Invoke the agent and parse its text response into `type`."""
        response = self._invoke(prompt)
        if not isinstance(response, str):
            raise TypeError(f"Expected a string as response type got {response}")
        return self._parse_response(prompt, response, type)

    def _validation_invoke(
        self, prompt: str, return_type: type, validators: list, context, max_validation_retries: int = 1
    ):
        """
        Invoke LLM with validation and feedback loop.

        Args:
            prompt: The original prompt
            return_type: Pydantic type to parse into
            validators: List of validation functions to run
            context: ValidationContext with data needed for validation
            max_validation_retries: Maximum retry attempts with feedback (default: 1)

        Returns:
            Validated result of return_type (the last attempt's result if
            validation never fully passes).
        """
        result = self._parse_invoke(prompt, return_type)

        for attempt in range(max_validation_retries):
            # Run all validators and collect every piece of feedback.
            all_feedback = []
            for validator in validators:
                validation_result = validator(result, context)
                if not validation_result.is_valid:
                    all_feedback.extend(validation_result.feedback_messages)

            if not all_feedback:
                logger.info(f"[Validation] All validations passed on attempt {attempt + 1}")
                return result  # All validations passed

            # Build feedback prompt using the prompt factory
            feedback_template = get_validation_feedback_message()
            feedback_prompt = feedback_template.format(
                original_output=result.llm_str(),
                feedback_list="\n".join(f"- {msg}" for msg in all_feedback),
                original_prompt=prompt,
            )

            logger.info(
                f"[Validation] Retry {attempt + 1}/{max_validation_retries} with {len(all_feedback)} feedback items"
            )
            result = self._parse_invoke(feedback_prompt, return_type)

        return result

    def _parse_response(self, prompt, response, return_type, max_retries=5, attempt=0):
        """Parse a raw LLM response into `return_type` via trustcall, with
        Pydantic-parser fallbacks and retries for transient errors.

        Raises:
            RuntimeError: when max_retries is exhausted.
        """
        if attempt >= max_retries:
            logger.error(f"Max retries ({max_retries}) reached for parsing response: {response}")
            raise RuntimeError(f"Max retries reached for parsing response: {response}")

        extractor = create_extractor(self.parsing_llm, tools=[return_type], tool_choice=return_type.__name__)
        if response is None or response.strip() == "":
            logger.error(f"Empty response for prompt: {prompt}")
            # Normalize None to "": the concatenation below would otherwise
            # raise TypeError (previous code only logged and carried on).
            response = response or ""
        try:
            result = extractor.invoke(
                return_type.extractor_str() + response,
                config={"callbacks": [MONITORING_CALLBACK, self.agent_monitoring_callback]},
            )
            if "responses" in result and len(result["responses"]) != 0:
                return return_type.model_validate(result["responses"][0])
            if "messages" in result and len(result["messages"]) != 0:
                message = result["messages"][0].content
                parser = PydanticOutputParser(pydantic_object=return_type)
                return self._try_parse(message, parser)
            parser = PydanticOutputParser(pydantic_object=return_type)
            return self._try_parse(response, parser)
        except AttributeError as e:
            # Workaround for trustcall bug: https://github.com/hinthornw/trustcall/issues/47
            # 'ExtractionState' object has no attribute 'tool_call_id' occurs during validation retry
            if "tool_call_id" in str(e):
                logger.warning(f"Trustcall bug encountered, falling back to Pydantic parser: {e}")
                parser = PydanticOutputParser(pydantic_object=return_type)
                return self._try_parse(response, parser)
            raise
        except IndexError as e:
            # try to parse with the json parser if possible
            logger.warning(f"IndexError while parsing response (attempt {attempt + 1}/{max_retries}): {e}")
            return self._parse_response(prompt, response, return_type, max_retries, attempt + 1)
        except ResourceExhausted as e:
            # Parsing uses exponential backoff for rate limits
            if attempt < max_retries - 1:
                # Exponential backoff: 30s * 2^attempt, capped at 300s
                delay = min(30 * (2**attempt), 300)
                logger.warning(
                    f"ResourceExhausted during parsing (rate limit): {e}\n"
                    f"Retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
                )
                time.sleep(delay)
                return self._parse_response(prompt, response, return_type, max_retries, attempt + 1)
            else:
                logger.error(f"Resource exhausted on final parsing attempt: {e}")
                raise

    def _try_parse(self, message_content, parser):
        """Coerce arbitrary LLM output into the parser's Pydantic model.

        Falls back to recursively trying each top-level JSON value when the
        whole payload doesn't validate.

        Raises:
            ValueError: when no part of the content can be parsed.
        """
        try:
            prompt_template = """You are an JSON expert. Here you need to extract information in the following json format: {format_instructions}

Here is the content to parse and fix: {adjective}

Please provide only the JSON output without any additional text."""
            prompt = PromptTemplate(
                template=prompt_template,
                input_variables=["adjective"],
                partial_variables={"format_instructions": parser.get_format_instructions()},
            )
            chain = prompt | self.parsing_llm | parser
            return chain.invoke(
                {"adjective": message_content},
                config={"callbacks": [MONITORING_CALLBACK, self.agent_monitoring_callback]},
            )
        except (ValidationError, OutputParserException):
            # Try each top-level value of the payload individually (the model
            # sometimes wraps the real object one level deep).
            parsed = json.loads(message_content)
            if isinstance(parsed, dict):
                for v in parsed.values():
                    try:
                        return self._try_parse(json.dumps(v), parser)
                    except Exception:  # narrow from bare except: keep Ctrl-C working
                        pass
            raise ValueError(f"Couldn't parse {message_content}")

    def classify_files(self, analysis: AnalysisInsights, cluster_results: dict, scope_files: list[str]) -> None:
        """
        Two-pass file assignment for AnalysisInsights:
        1. Deterministic: assign files from cluster_ids and key_entities
        2. LLM-based: classify remaining unassigned files

        Args:
            analysis: AnalysisInsights object to classify files for
            cluster_results: Dict mapping language -> ClusterResult (for the relevant scope)
            scope_files: List of file paths to limit classification scope.

        Requires self to be a mixin with ClusterMethodsMixin for helper methods.
        """
        for comp in analysis.components:
            # Deterministic assignment (uses mixin methods)
            self._assign_files_to_component(comp, cluster_results)  # type: ignore[attr-defined]
        self._classify_unassigned_files_llm(analysis, scope_files)
        self._log_unclassified_files_count(analysis, scope_files)

    def _classify_unassigned_files_llm(self, analysis: AnalysisInsights, scope_files: list[str]) -> None:
        """
        Classify files from the scope files that weren't assigned to any component.
        Uses a single LLM call to classify all unassigned files.

        Args:
            analysis: AnalysisInsights object
            scope_files: List of file paths to limit classification scope.
        """
        unassigned_files = self._get_unassigned_files(analysis, scope_files)

        if not unassigned_files:
            logger.info("[Agent] All files already assigned, skipping LLM classification")
            return

        logger.info(f"[Agent] Found {len(unassigned_files)} unassigned files, using LLM classification")

        # 1. Build component summary for the LLM; "Unclassified" is excluded
        # so files can only be assigned to real components.
        valid_components = [comp for comp in analysis.components if comp.name != "Unclassified"]
        components_summary = "\n\n".join(comp.llm_str() for comp in valid_components)
        component_map = {comp.name: comp for comp in valid_components}

        # 2. Classify all unassigned files with a single LLM call.
        classifications: list[FileClassification] = self._classify_unassigned_files_with_llm(
            unassigned_files, components_summary, analysis
        )

        # 3. Append successfully classified files to their components.
        for fc in classifications:
            if fc.component_name in component_map:
                comp = component_map[fc.component_name]
                if fc.file_path not in comp.assigned_files:
                    comp.assigned_files.append(fc.file_path)
                    logger.debug(f"[Agent] Assigned {fc.file_path} to {fc.component_name}")
            else:
                logger.warning(
                    f"[Agent] Invalid component name '{fc.component_name}' for file {fc.file_path}, skipping"
                )

        logger.info(f"[Agent] File classification complete: {len(classifications)} files classified")

    def _get_unassigned_files(self, analysis: AnalysisInsights, scope_files: list[str]) -> list[str]:
        """
        Check which files remain unassigned after classification.

        Args:
            analysis: AnalysisInsights object with classified components
            scope_files: List of file paths to limit the scope.
        Returns:
            Sorted list of repo-relative file paths that are still unassigned.
        """
        # 1. Gather all assigned files, normalized to repo-relative paths.
        assigned_files = set()
        for comp in analysis.components:
            for f in comp.assigned_files:
                abs_path = os.path.join(self.repo_dir, f) if not os.path.isabs(f) else f
                assigned_files.add(os.path.relpath(abs_path, self.repo_dir))

        # 2. Normalize the scope files the same way so set arithmetic is valid.
        all_files = set()
        for file_path in scope_files:
            file_path_str = str(file_path)
            rel_path = os.path.relpath(file_path_str, self.repo_dir) if os.path.isabs(file_path_str) else file_path_str
            all_files.add(rel_path)

        # 3. Whatever is in scope but not assigned is still unclassified.
        return sorted(all_files - assigned_files)

    def _log_unclassified_files_count(self, analysis: AnalysisInsights, scope_files: list[str]) -> None:
        """
        Log how many files remain unclassified within the analysis.

        Args:
            analysis: AnalysisInsights object with classified components
            scope_files: List of file paths which are expected to be within the analysis.
        """
        unassigned = self._get_unassigned_files(analysis, scope_files)
        if unassigned:
            logger.warning(f"[Agent] {len(unassigned)} files have not been classified successfully: {unassigned}")
        else:
            logger.info("[Agent] All files have been classified successfully")

    def _classify_unassigned_files_with_llm(
        self, unassigned_files: list[str], components_summary: str, analysis: AnalysisInsights
    ) -> list[FileClassification]:
        """
        Classify unassigned files using LLM with validation.
        Returns list of FileClassification objects.
        """
        prompt = PromptTemplate(
            template=get_unassigned_files_classification_message(), input_variables=["unassigned_files", "components"]
        ).format(unassigned_files="\n".join(unassigned_files), components=components_summary)

        # All component names (including "Unclassified") are accepted by the
        # validator; the caller then skips non-real components with a warning.
        valid_component_names = {comp.name for comp in analysis.components}

        context = ValidationContext(
            expected_files=set(unassigned_files),
            valid_component_names=valid_component_names,
            repo_dir=str(self.repo_dir),
        )

        file_classifications = self._validation_invoke(
            prompt, ComponentFiles, validators=[validate_file_classifications], context=context
        )
        return file_classifications.file_paths