codeboarding 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/__init__.py +0 -0
- agents/abstraction_agent.py +150 -0
- agents/agent.py +467 -0
- agents/agent_responses.py +363 -0
- agents/cluster_methods_mixin.py +281 -0
- agents/constants.py +13 -0
- agents/dependency_discovery.py +159 -0
- agents/details_agent.py +174 -0
- agents/llm_config.py +309 -0
- agents/meta_agent.py +105 -0
- agents/planner_agent.py +105 -0
- agents/prompts/__init__.py +85 -0
- agents/prompts/abstract_prompt_factory.py +63 -0
- agents/prompts/claude_prompts.py +381 -0
- agents/prompts/deepseek_prompts.py +389 -0
- agents/prompts/gemini_flash_prompts.py +362 -0
- agents/prompts/glm_prompts.py +407 -0
- agents/prompts/gpt_prompts.py +470 -0
- agents/prompts/kimi_prompts.py +400 -0
- agents/prompts/prompt_factory.py +179 -0
- agents/tools/__init__.py +8 -0
- agents/tools/base.py +96 -0
- agents/tools/get_external_deps.py +47 -0
- agents/tools/get_method_invocations.py +47 -0
- agents/tools/read_cfg.py +60 -0
- agents/tools/read_docs.py +132 -0
- agents/tools/read_file.py +90 -0
- agents/tools/read_file_structure.py +156 -0
- agents/tools/read_git_diff.py +131 -0
- agents/tools/read_packages.py +60 -0
- agents/tools/read_source.py +105 -0
- agents/tools/read_structure.py +49 -0
- agents/tools/toolkit.py +119 -0
- agents/validation.py +383 -0
- caching/__init__.py +4 -0
- caching/cache.py +29 -0
- caching/meta_cache.py +227 -0
- codeboarding-0.9.0.dist-info/METADATA +223 -0
- codeboarding-0.9.0.dist-info/RECORD +126 -0
- codeboarding-0.9.0.dist-info/WHEEL +5 -0
- codeboarding-0.9.0.dist-info/entry_points.txt +3 -0
- codeboarding-0.9.0.dist-info/licenses/LICENSE +21 -0
- codeboarding-0.9.0.dist-info/top_level.txt +18 -0
- core/__init__.py +101 -0
- core/plugin_loader.py +46 -0
- core/protocols.py +27 -0
- core/registry.py +46 -0
- diagram_analysis/__init__.py +4 -0
- diagram_analysis/analysis_json.py +346 -0
- diagram_analysis/diagram_generator.py +486 -0
- diagram_analysis/file_coverage.py +212 -0
- diagram_analysis/incremental/__init__.py +63 -0
- diagram_analysis/incremental/component_checker.py +236 -0
- diagram_analysis/incremental/file_manager.py +217 -0
- diagram_analysis/incremental/impact_analyzer.py +238 -0
- diagram_analysis/incremental/io_utils.py +281 -0
- diagram_analysis/incremental/models.py +72 -0
- diagram_analysis/incremental/path_patching.py +164 -0
- diagram_analysis/incremental/reexpansion.py +166 -0
- diagram_analysis/incremental/scoped_analysis.py +227 -0
- diagram_analysis/incremental/updater.py +464 -0
- diagram_analysis/incremental/validation.py +48 -0
- diagram_analysis/manifest.py +152 -0
- diagram_analysis/version.py +6 -0
- duckdb_crud.py +125 -0
- github_action.py +172 -0
- health/__init__.py +3 -0
- health/checks/__init__.py +11 -0
- health/checks/circular_deps.py +48 -0
- health/checks/cohesion.py +93 -0
- health/checks/coupling.py +140 -0
- health/checks/function_size.py +85 -0
- health/checks/god_class.py +167 -0
- health/checks/inheritance.py +104 -0
- health/checks/instability.py +77 -0
- health/checks/unused_code_diagnostics.py +338 -0
- health/config.py +172 -0
- health/constants.py +19 -0
- health/models.py +186 -0
- health/runner.py +236 -0
- install.py +518 -0
- logging_config.py +105 -0
- main.py +529 -0
- monitoring/__init__.py +12 -0
- monitoring/callbacks.py +163 -0
- monitoring/context.py +158 -0
- monitoring/mixin.py +16 -0
- monitoring/paths.py +47 -0
- monitoring/stats.py +50 -0
- monitoring/writers.py +172 -0
- output_generators/__init__.py +0 -0
- output_generators/html.py +163 -0
- output_generators/html_template.py +382 -0
- output_generators/markdown.py +140 -0
- output_generators/mdx.py +171 -0
- output_generators/sphinx.py +175 -0
- repo_utils/__init__.py +277 -0
- repo_utils/change_detector.py +289 -0
- repo_utils/errors.py +6 -0
- repo_utils/git_diff.py +74 -0
- repo_utils/ignore.py +341 -0
- static_analyzer/__init__.py +335 -0
- static_analyzer/analysis_cache.py +699 -0
- static_analyzer/analysis_result.py +269 -0
- static_analyzer/cluster_change_analyzer.py +391 -0
- static_analyzer/cluster_helpers.py +79 -0
- static_analyzer/constants.py +166 -0
- static_analyzer/git_diff_analyzer.py +224 -0
- static_analyzer/graph.py +746 -0
- static_analyzer/incremental_orchestrator.py +671 -0
- static_analyzer/java_config_scanner.py +232 -0
- static_analyzer/java_utils.py +227 -0
- static_analyzer/lsp_client/__init__.py +12 -0
- static_analyzer/lsp_client/client.py +1642 -0
- static_analyzer/lsp_client/diagnostics.py +62 -0
- static_analyzer/lsp_client/java_client.py +517 -0
- static_analyzer/lsp_client/language_settings.py +97 -0
- static_analyzer/lsp_client/typescript_client.py +235 -0
- static_analyzer/programming_language.py +152 -0
- static_analyzer/reference_resolve_mixin.py +166 -0
- static_analyzer/scanner.py +95 -0
- static_analyzer/typescript_config_scanner.py +54 -0
- tool_registry.py +433 -0
- user_config.py +134 -0
- utils.py +56 -0
- vscode_constants.py +124 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
import hashlib
|
|
3
|
+
import logging
|
|
4
|
+
from abc import abstractmethod
|
|
5
|
+
from typing import get_origin, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
ROOT_PARENT_ID = "ROOT"
|
|
12
|
+
COMPONENT_ID_BYTES = 8
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LLMBaseModel(BaseModel, abc.ABC):
    """Base model for LLM-parseable response types.

    Subclasses implement ``llm_str`` (human/LLM-readable rendering) and
    inherit ``extractor_str`` (an extraction instruction built from field
    descriptions, used to prompt an LLM for structured output).
    """

    @abstractmethod
    def llm_str(self):
        """Return an LLM-friendly string rendering of this instance."""
        raise NotImplementedError("LLM String has to be implemented.")

    @classmethod
    def extractor_str(cls):
        """Build an extraction instruction string from the model's field descriptions.

        Iterates ``model_fields`` and describes each field; list fields are
        wrapped in "<name> which is a list ( ... )", and nested LLMBaseModel
        fields recurse into their own ``extractor_str``.
        """
        result_str = "please extract the following: "
        for fname, fvalue in cls.model_fields.items():
            ftype = fvalue.annotation
            # Remember whether the *field itself* is a list BEFORE unwrapping
            # the element type below.
            is_list_field = get_origin(ftype) is list
            if is_list_field:
                # Unwrap list[X] -> X so nested models can recurse.
                if ftype is not None and hasattr(ftype, "__args__"):
                    ftype = ftype.__args__[0]
                result_str += f"{fname} which is a list ("
            if ftype is Optional:
                result_str += f"{fname} ({fvalue.description}), "
            elif ftype is not None and isinstance(ftype, type) and issubclass(ftype, LLMBaseModel):
                # Nested model: splice in its own extraction instructions.
                result_str += ftype.extractor_str()
            else:
                result_str += f"{fname} ({fvalue.description}), "
            # BUG FIX: the original re-checked get_origin on the already
            # unwrapped element type, so the "(" opened for list fields was
            # never closed. Close it based on the flag captured above.
            if is_list_field:
                result_str += "), "
        return result_str
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class SourceCodeReference(LLMBaseModel):
    """Reference to source code including qualified name and file location."""

    qualified_name: str = Field(
        description="Qualified name of the source code, e.g., `langchain.tools.tool` or `langchain_core.output_parsers.JsonOutputParser` or `langchain_core.output_parsers.JsonOutputParser:parse`."
    )

    reference_file: str | None = Field(
        default=None,
        description="File path where the source code is located, e.g., `langchain/tools/tool.py` or `langchain_core/output_parsers/json_output_parser.py`.",
    )

    reference_start_line: int | None = Field(
        default=None,
        description="The line number in the source code where the reference starts. Only if you are absolutely sure add this, otherwise None.",
    )
    reference_end_line: int | None = Field(
        default=None,
        description="The line number in the source code where the reference ends. Only if you are absolutely sure add this, otherwise None.",
    )

    def _has_line_range(self) -> bool:
        """Return True when a meaningful start/end line range is present.

        CONSISTENCY FIX: this check was duplicated verbatim in ``llm_str``
        and ``__str__``; it now lives in one place.
        """
        if self.reference_start_line is None or self.reference_end_line is None:
            return False
        # A fully non-positive range carries no information.
        if self.reference_start_line <= self.reference_end_line <= 0:
            return False
        # A zero-width range is treated as "no range" (original behavior).
        if self.reference_start_line == self.reference_end_line:
            return False
        return True

    def llm_str(self):
        """Render for LLM consumption, including the line range when valid."""
        if not self._has_line_range():
            return f"QName:`{self.qualified_name}` FileRef: `{self.reference_file}`"
        return f"QName:`{self.qualified_name}` FileRef: `{self.reference_file}`, Lines:({self.reference_start_line}:{self.reference_end_line})"

    def __str__(self):
        if not self._has_line_range():
            return f"`{self.qualified_name}`"
        return f"`{self.qualified_name}`:{self.reference_start_line}-{self.reference_end_line}"
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class Relation(LLMBaseModel):
    """A relationship between two components."""

    relation: str = Field(description="Single phrase used for the relationship of two components.")
    src_name: str = Field(description="Source component name")
    dst_name: str = Field(description="Target component name")
    src_id: str = Field(default="", description="Component ID of the source.", exclude=True)
    dst_id: str = Field(default="", description="Component ID of the destination.", exclude=True)

    def llm_str(self):
        """Render the relation as a (source, relation, target) triple."""
        return "({}, {}, {})".format(self.src_name, self.relation, self.dst_name)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class ClustersComponent(LLMBaseModel):
    """A grouped component from cluster analysis - may contain multiple clusters."""

    cluster_ids: list[int] = Field(
        description="List of cluster IDs from the CFG analysis that are grouped together (e.g., [1, 3, 5])"
    )
    description: str = Field(
        description="Explanation of what this component does, its main flow, and WHY these clusters are grouped together"
    )

    def llm_str(self):
        """Render the grouped cluster IDs followed by their explanation."""
        joined_ids = ", ".join(map(str, self.cluster_ids))
        return f"**Clusters [{joined_ids}]**\n {self.description}"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class ClusterAnalysis(LLMBaseModel):
    """Analysis results containing grouped cluster components."""

    cluster_components: list[ClustersComponent] = Field(
        description="Grouped clusters into logical components. Multiple cluster IDs can be grouped together if they work as a cohesive unit."
    )

    def llm_str(self):
        """Render a markdown section listing every grouped component."""
        if not self.cluster_components:
            return "No clusters analyzed."
        lines = ["# Grouped Cluster Components"]
        lines.extend(cc.llm_str() for cc in self.cluster_components)
        return "\n".join(lines)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class Component(LLMBaseModel):
    """A software component with name, description, and key entities."""

    name: str = Field(description="Name of the component")
    description: str = Field(description="A short description of the component.")

    # LLM picks these: The MOST IMPORTANT/critical methods and classes
    key_entities: list[SourceCodeReference] = Field(
        description="The most important/critical classes and methods that represent this component's core functionality. Pick 2-5 key entities."
    )

    # Deterministic from static analysis: ALL files belonging to this component
    assigned_files: list[str] = Field(
        description="All source files assigned to this component (populated deterministically).",
        default_factory=list,
        exclude=True,
    )

    source_cluster_ids: list[int] = Field(
        description="List of cluster IDs from CFG analysis that this component encompasses.",
        default_factory=list,
    )

    component_id: str = Field(
        default="",
        description="Deterministic unique identifier for this component.",
        exclude=True,
    )

    def llm_str(self):
        """Render name, description and (when present) key entities as markdown."""
        lines = [
            f"**Component:** `{self.name}`",
            f" - *Description*: {self.description}",
        ]
        entities_line = ""
        if self.key_entities:
            entities_line = " - *Key Entities*: " + ", ".join(
                f"`{q.llm_str()}`" for q in self.key_entities
            )
        lines.append(entities_line)
        # strip() drops the trailing newline left by an empty entities line.
        return "\n".join(lines).strip()
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class AnalysisInsights(LLMBaseModel):
    """Project analysis insights including components and their relations."""

    description: str = Field(
        description="One paragraph explaining the functionality which is represented by this graph. What the main flow is and what is its purpose."
    )
    components: list[Component] = Field(description="List of the components identified in the project.")
    components_relations: list[Relation] = Field(description="List of relations among the components.")

    def llm_str(self):
        """Render a markdown overview of all components followed by their relations."""
        if not self.components:
            return "No abstract components found."
        title = "# 📦 Abstract Components Overview\n"
        body = "\n".join(ac.llm_str() for ac in self.components)
        relations = "\n".join(cr.llm_str() for cr in self.components_relations)
        if not relations:
            return title + body
        # BUG FIX: separate components from relations with a newline; the
        # original concatenated them, fusing the last component line with
        # the first relation triple.
        return title + body + "\n" + relations
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def hash_component_id(parent_id: str, name: str, sibling_index: int = 0) -> str:
    """Hash a deterministic component ID from parent ID, name, and sibling index.

    Note:
        The ID is a compact, 64-bit prefix of SHA-256 (8 bytes -> 16 hex chars).
        Truncation happens at the byte level to keep the representation explicit.
    """
    key = ":".join((parent_id, name, str(sibling_index))).encode("utf-8")
    digest = hashlib.sha256(key).digest()
    return digest[:COMPONENT_ID_BYTES].hex()
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def assign_component_ids(analysis: AnalysisInsights, parent_id: str = ROOT_PARENT_ID) -> None:
    """Assign deterministic component IDs to all components in an analysis.

    Handles same-named siblings by using a sibling index tiebreaker.
    """
    # Per-name occurrence counter keeps IDs unique and stable for duplicates.
    name_counts: dict[str, int] = {}
    for component in analysis.components:
        index = name_counts.get(component.name, 0)
        component.component_id = hash_component_id(parent_id, component.name, index)
        name_counts[component.name] = index + 1

    # Assign relation IDs by looking up component names (first occurrence wins for duplicates)
    name_to_id: dict[str, str] = {}
    for comp in analysis.components:
        if comp.name not in name_to_id:
            name_to_id[comp.name] = comp.component_id
        else:
            logger.warning(
                f"Duplicate component name '{comp.name}' found during ID assignment; "
                f"relation lookup will use the first occurrence (ID: {name_to_id[comp.name]})"
            )

    # Unknown endpoint names resolve to "" rather than raising.
    for relation in analysis.components_relations:
        relation.src_id = name_to_id.get(relation.src_name, "")
        relation.dst_id = name_to_id.get(relation.dst_name, "")
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class CFGComponent(LLMBaseModel):
    """A component derived from control flow graph analysis."""

    name: str = Field(description="Name of the abstract component")
    description: str = Field(description="One paragraph explaining the component.")
    referenced_source: list[str] = Field(
        description="List of the qualified names of the methods and classes that are within this component."
    )

    def llm_str(self):
        """Render name, description and related classes/methods as markdown."""
        lines = [
            f"**Component:** `{self.name}`",
            f" - *Description*: {self.description}",
        ]
        sources_line = ""
        if self.referenced_source:
            sources_line = " - *Related Classes/Methods*: " + ", ".join(
                f"`{q}`" for q in self.referenced_source
            )
        lines.append(sources_line)
        # strip() drops the trailing newline left by an empty sources line.
        return "\n".join(lines).strip()
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class CFGAnalysisInsights(LLMBaseModel):
    """Insights from control flow graph analysis including components and relations."""

    components: list[CFGComponent] = Field(description="List of components identified in the CFG.")
    components_relations: list[Relation] = Field(description="List of relations among the components in the CFG.")

    def llm_str(self):
        """Render a markdown overview of CFG components followed by their relations."""
        if not self.components:
            return "No abstract components found in the CFG."
        title = "# 📦 Abstract Components Overview from CFG\n"
        body = "\n".join(ac.llm_str() for ac in self.components)
        relations = "\n".join(cr.llm_str() for cr in self.components_relations)
        if not relations:
            return title + body
        # BUG FIX: separate components from relations with a newline; the
        # original concatenated them without a separator.
        return title + body + "\n" + relations
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class ExpandComponent(LLMBaseModel):
    """Decision on whether to expand a component with reasoning."""

    should_expand: bool = Field(description="Whether the component should be expanded in detail or not.")
    reason: str = Field(description="Reasoning behind the decision to expand or not.")

    def llm_str(self):
        """Render the decision and its justification as a two-line bullet list."""
        return "- *Should Expand:* {}\n- *Reason:* {}".format(self.should_expand, self.reason)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
class ValidationInsights(LLMBaseModel):
    """Validation results with status and additional information."""

    is_valid: bool = Field(description="Indicates whether the validation results in valid or not.")
    additional_info: str | None = Field(
        default=None,
        description="Any additional information or context related to the validation.",
    )

    def llm_str(self):
        """Render the feedback block; empty when no additional info was given.

        BUG FIX: the original interpolated ``additional_info`` directly, so a
        None value (the default) rendered as the literal string "None".
        """
        info = self.additional_info or ""
        return f"**Feedback Information:**\n{info}"
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
class UpdateAnalysis(LLMBaseModel):
    """Feedback on how much a diagram needs updating."""

    update_degree: int = Field(
        description="Degree to which the diagram needs update. 0 means no update, 10 means complete update."
    )
    feedback: str = Field(description="Feedback provided on the analysis.")

    def llm_str(self):
        """Render only the textual feedback (the degree is consumed programmatically)."""
        return "**Feedback:**\n" + self.feedback
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
class MetaAnalysisInsights(LLMBaseModel):
    """Insights from analyzing project metadata including type, domain, and architecture."""

    project_type: str = Field(
        description="Type/category of the project (e.g., web framework, data processing, ML library, etc.)"
    )
    domain: str = Field(
        description="Domain or field the project belongs to (e.g., web development, data science, DevOps, etc.)"
    )
    architectural_patterns: list[str] = Field(description="Main architectural patterns typically used in such projects")
    expected_components: list[str] = Field(description="Expected high-level components/modules based on project type")
    technology_stack: list[str] = Field(description="Main technologies, frameworks, and libraries used")
    architectural_bias: str = Field(
        description="Guidance on how to interpret and organize components for this project type"
    )

    def llm_str(self):
        """Render the metadata analysis as a titled markdown summary."""
        header = "# 🎯 Project Metadata Analysis\n"
        body = f"""
**Project Type:** {self.project_type}
**Domain:** {self.domain}
**Technology Stack:** {", ".join(self.technology_stack)}
**Architectural Patterns:** {", ".join(self.architectural_patterns)}
**Expected Components:** {", ".join(self.expected_components)}
**Architectural Bias:** {self.architectural_bias}
"""
        return header + body
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
class FileClassification(LLMBaseModel):
    """Classification of a file to a component."""

    component_name: str = Field(description="Name of the component or module")
    file_path: str = Field(description="Path to the file")

    def llm_str(self):
        """Render as "file -> component" mapping line."""
        return "`{}` -> Component: `{}`".format(self.file_path, self.component_name)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
class ComponentFiles(LLMBaseModel):
    """Collection of file classifications for components."""

    file_paths: list[FileClassification] = Field(
        description="All files with their classifications for each of the files assigned to a component."
    )

    def llm_str(self):
        """Render all classifications as a titled markdown bullet list."""
        if not self.file_paths:
            return "No files classified."
        rows = [
            f"- `{fc.file_path}` -> Component: `{fc.component_name}`"
            for fc in self.file_paths
        ]
        return "# 📄 Component File Classifications\n" + "\n".join(rows)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
class FilePath(LLMBaseModel):
    """File path with optional line range reference."""

    file_path: str = Field(description="Full file path for the reference")
    start_line: int | None = Field(
        default=None,
        description="Starting line number in the file for the reference (if applicable).",
    )
    end_line: int | None = Field(
        default=None,
        description="Ending line number in the file for the reference (if applicable).",
    )

    def llm_str(self):
        """Render the path, appending the line range only when it is known.

        BUG FIX: the original always interpolated the range, producing
        "(None:None)" when the optional lines were absent; omit it instead,
        consistent with SourceCodeReference.
        """
        if self.start_line is None or self.end_line is None:
            return f"`{self.file_path}`"
        return f"`{self.file_path}`: ({self.start_line}:{self.end_line})"
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from agents.agent_responses import Component, AnalysisInsights
|
|
6
|
+
from static_analyzer.analysis_result import StaticAnalysisResults
|
|
7
|
+
from static_analyzer.graph import ClusterResult
|
|
8
|
+
from static_analyzer.cluster_helpers import get_files_for_cluster_ids, get_all_cluster_ids
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ClusterMethodsMixin:
    """
    Mixin providing shared cluster-related functionality for agents.

    This mixin provides methods for:
    - Building cluster strings from CFG analysis (using CallGraph.cluster())
    - Assigning files to components based on clusters and key_entities
    - Ensuring unique key entities across components

    All clustering logic is delegated to CallGraph.cluster() which provides:
    - Deterministic cluster IDs (seed=42)
    - Cached results
    - File <-> cluster bidirectional mappings

    IMPORTANT: All methods are stateless with respect to ClusterResult.
    Cluster results must be passed explicitly as parameters.
    """

    # These attributes must be provided by the class using this mixin
    repo_dir: Path
    static_analysis: StaticAnalysisResults

    def _get_files_for_clusters(self, cluster_ids: list[int], cluster_results: dict[str, ClusterResult]) -> set[str]:
        """
        Get all files that belong to the given cluster IDs.

        Args:
            cluster_ids: List of cluster IDs to get files for
            cluster_results: dict mapping language -> ClusterResult

        Returns:
            Set of file paths
        """
        # Thin delegation to the shared helper; kept as a method so users of
        # the mixin have a single call surface for cluster lookups.
        return get_files_for_cluster_ids(cluster_ids, cluster_results)

    def _build_cluster_string(
        self,
        programming_langs: list[str],
        cluster_results: dict[str, ClusterResult],
        cluster_ids: set[int] | None = None,
    ) -> str:
        """
        Build a cluster string for LLM consumption using pre-computed cluster results.

        Args:
            programming_langs: List of languages to include
            cluster_results: Pre-computed cluster results mapping language -> ClusterResult
            cluster_ids: Optional set of cluster IDs to filter by

        Returns:
            Formatted cluster string with headers per language
        """
        cluster_lines = []

        for lang in programming_langs:
            cfg = self.static_analysis.get_cfg(lang)
            # Get cluster result for this language
            cluster_result = cluster_results.get(lang)
            cluster_str = cfg.to_cluster_string(cluster_ids, cluster_result)

            # Skip empty/sentinel renderings so no bare header is emitted.
            # NOTE(review): the sentinel strings here presumably mirror what
            # to_cluster_string can return — confirm against CallGraph.
            if cluster_str.strip() and cluster_str not in ("empty", "none", "No clusters found."):
                header = "Component CFG" if cluster_ids else "Clusters"
                cluster_lines.append(f"\n## {lang.capitalize()} - {header}\n")
                cluster_lines.append(cluster_str)
                cluster_lines.append("\n")

        return "".join(cluster_lines)

    def _assign_files_to_component(self, component: Component, cluster_results: dict[str, ClusterResult]) -> None:
        """
        Assign files to a component.
        1. Get all files from component's clusters (instant lookup)
        2. Add resolved key_entity files
        3. Convert to relative paths

        Args:
            component: Component to assign files to
            cluster_results: dict mapping language -> ClusterResult
        """
        assigned: set[str] = set()

        # Step 1: Files from clusters
        if component.source_cluster_ids:
            cluster_files = self._get_files_for_clusters(component.source_cluster_ids, cluster_results)
            assigned.update(cluster_files)

        # Step 2: Files from key_entities (already resolved by ReferenceResolverMixin)
        for entity in component.key_entities:
            if entity.reference_file:
                # Handle both absolute and relative paths
                if os.path.isabs(entity.reference_file):
                    assigned.add(entity.reference_file)
                else:
                    abs_path = os.path.join(self.repo_dir, entity.reference_file)
                    if os.path.exists(abs_path):
                        assigned.add(abs_path)
                    else:
                        # Path does not resolve under repo_dir: keep the
                        # original relative string rather than dropping it.
                        assigned.add(entity.reference_file)

        # Convert to relative paths (set iteration => assigned_files order is
        # unspecified; downstream code must not rely on ordering).
        component.assigned_files = [os.path.relpath(f, self.repo_dir) if os.path.isabs(f) else f for f in assigned]

    def _ensure_unique_key_entities(self, analysis: AnalysisInsights):
        """
        Ensure that key_entities are unique across components.

        If a key_entity (identified by qualified_name) appears in multiple components,
        keep it only in the component where it's most relevant:
        1. If it's in the component's assigned_files -> keep it there (highest priority)
        2. Otherwise, keep it in the first component that references it

        This prevents confusion in documentation where the same class/method
        is listed as a "key entity" for multiple components.
        """
        logger.info("[ClusterMethodsMixin] Ensuring key_entities are unique across components")

        # Maps qualified_name -> component currently "owning" that entity.
        seen_entities: dict[str, Component] = {}

        for component in analysis.components:
            # The catch-all bucket never competes for ownership.
            if component.name == "Unclassified":
                continue

            entities_to_remove = []

            for key_entity in component.key_entities:
                qname = key_entity.qualified_name

                if qname in seen_entities:
                    original_component = seen_entities[qname]
                    ref_file = key_entity.reference_file

                    # Substring containment check: ref_file may be a suffix of
                    # an assigned (relative) path rather than an exact match.
                    current_has_file = ref_file and any(
                        ref_file in assigned_file for assigned_file in component.assigned_files
                    )
                    original_has_file = ref_file and any(
                        ref_file in assigned_file for assigned_file in original_component.assigned_files
                    )

                    if current_has_file and not original_has_file:
                        # Move to current component
                        original_component.key_entities = [
                            e for e in original_component.key_entities if e.qualified_name != qname
                        ]
                        seen_entities[qname] = component
                        logger.debug(
                            f"[ClusterMethodsMixin] Moved key_entity '{qname}' from {original_component.name} to {component.name}"
                        )
                    else:
                        # Keep in original component
                        entities_to_remove.append(key_entity)
                        logger.debug(
                            f"[ClusterMethodsMixin] Removed duplicate key_entity '{qname}' from {component.name} (kept in {original_component.name})"
                        )
                else:
                    seen_entities[qname] = component

            # NOTE(review): `e not in entities_to_remove` compares by model
            # equality, so two distinct-but-equal entities would both be
            # dropped — presumably acceptable here; confirm if that matters.
            component.key_entities = [e for e in component.key_entities if e not in entities_to_remove]

    def _ensure_unique_file_assignments(self, analysis: AnalysisInsights) -> None:
        """
        Deduplicate assigned_files within each component.

        A file may legitimately appear in multiple components, but should not
        appear more than once within the same component's assigned_files list.
        """
        logger.info("[ClusterMethodsMixin] Deduplicating file assignments within components")

        total_removed = 0

        for component in analysis.components:
            # Order-preserving dedup: first occurrence of each path wins.
            seen: set[str] = set()
            unique_files: list[str] = []
            for file_path in component.assigned_files:
                if file_path in seen:
                    logger.debug(
                        f"[ClusterMethodsMixin] Removed duplicate file '{file_path}' within '{component.name}'"
                    )
                    total_removed += 1
                else:
                    seen.add(file_path)
                    unique_files.append(file_path)

            component.assigned_files = unique_files

        if total_removed > 0:
            logger.info(f"[ClusterMethodsMixin] Removed {total_removed} duplicate file assignment(s)")

    def _sanitize_component_cluster_ids(
        self,
        analysis: AnalysisInsights,
        valid_cluster_ids: set[int] | None = None,
        cluster_results: dict[str, ClusterResult] | None = None,
    ) -> None:
        """
        Sanitize cluster IDs in the analysis by removing invalid ones.
        Removes cluster IDs that don't exist in the static analysis.

        Args:
            analysis: The analysis to sanitize
            valid_cluster_ids: Optional set of valid IDs. If None, derives from cluster_results.
            cluster_results: dict mapping language -> ClusterResult. Required if valid_cluster_ids is None.
        """
        if valid_cluster_ids is None:
            if cluster_results is None:
                # Best-effort: log and bail rather than raising mid-pipeline.
                logger.error("Must provide either valid_cluster_ids or cluster_results")
                return
            valid_cluster_ids = get_all_cluster_ids(cluster_results)

        for component in analysis.components:
            if component.source_cluster_ids:
                original_ids = component.source_cluster_ids.copy()
                component.source_cluster_ids = [cid for cid in component.source_cluster_ids if cid in valid_cluster_ids]
                removed_ids = set(original_ids) - set(component.source_cluster_ids)
                if removed_ids:
                    logger.warning(
                        f"[ClusterMethodsMixin] Removed invalid cluster IDs {removed_ids} from component '{component.name}'"
                    )

    def _create_strict_component_subgraph(self, component: Component) -> tuple[str, dict]:
        """
        Create a strict subgraph containing ONLY nodes from the component's assigned files.
        This ensures the analysis is strictly scoped to the component's boundaries.

        Args:
            component: Component with assigned_files to filter by

        Returns:
            Tuple of (formatted cluster string, cluster_results dict)
            where cluster_results maps language -> ClusterResult for the subgraph
        """
        if not component.assigned_files:
            logger.warning(f"[ClusterMethodsMixin] Component {component.name} has no assigned_files")
            return "No assigned files found for this component.", {}

        # Convert assigned files to absolute paths for comparison
        assigned_file_set = set()
        for f in component.assigned_files:
            abs_path = os.path.join(self.repo_dir, f) if not os.path.isabs(f) else f
            assigned_file_set.add(abs_path)

        result_parts = []
        cluster_results = {}

        for lang in self.static_analysis.get_languages():
            cfg = self.static_analysis.get_cfg(lang)

            # Use strict filtering logic
            sub_cfg = cfg.filter_by_files(assigned_file_set)

            if sub_cfg.nodes:
                # Calculate clusters for the subgraph
                sub_cluster_result = sub_cfg.cluster()
                cluster_results[lang] = sub_cluster_result

                cluster_str = sub_cfg.to_cluster_string(cluster_result=sub_cluster_result)
                # Same sentinel filtering as _build_cluster_string.
                if cluster_str.strip() and cluster_str not in ("empty", "none", "No clusters found."):
                    result_parts.append(f"\n## {lang.capitalize()} - Component CFG\n")
                    result_parts.append(cluster_str)
                    result_parts.append("\n")

        result = "".join(result_parts)

        if not result.strip():
            logger.warning(
                f"[ClusterMethodsMixin] No CFG found for component {component.name} with {len(component.assigned_files)} assigned files"
            )
            # Return the (possibly non-empty) cluster_results even when no
            # printable cluster string was produced.
            return "No relevant CFG clusters found for this component.", cluster_results

        return result, cluster_results
|
agents/constants.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Constants for the agents module."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LLMDefaults:
    """Default LLM sampling/limit settings shared by the agents."""

    # Slightly non-zero temperature for agent reasoning calls.
    DEFAULT_AGENT_TEMPERATURE = 0.1
    # Deterministic temperature for structured-output parsing calls.
    DEFAULT_PARSING_TEMPERATURE = 0
    # Max completion tokens — presumably for AWS-hosted (Bedrock) models; confirm at call site.
    AWS_MAX_TOKENS = 4096
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FileStructureConfig:
    """Size limits applied when rendering file structures for prompts."""

    # Default cap on lines included per structure listing.
    MAX_LINES = 500
    # Default directory-tree traversal depth.
    DEFAULT_MAX_DEPTH = 10
    # Hard upper bound used when the normal limit is lifted — TODO confirm usage.
    FALLBACK_MAX_LINES = 50000
|