codeboarding 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. agents/__init__.py +0 -0
  2. agents/abstraction_agent.py +150 -0
  3. agents/agent.py +467 -0
  4. agents/agent_responses.py +363 -0
  5. agents/cluster_methods_mixin.py +281 -0
  6. agents/constants.py +13 -0
  7. agents/dependency_discovery.py +159 -0
  8. agents/details_agent.py +174 -0
  9. agents/llm_config.py +309 -0
  10. agents/meta_agent.py +105 -0
  11. agents/planner_agent.py +105 -0
  12. agents/prompts/__init__.py +85 -0
  13. agents/prompts/abstract_prompt_factory.py +63 -0
  14. agents/prompts/claude_prompts.py +381 -0
  15. agents/prompts/deepseek_prompts.py +389 -0
  16. agents/prompts/gemini_flash_prompts.py +362 -0
  17. agents/prompts/glm_prompts.py +407 -0
  18. agents/prompts/gpt_prompts.py +470 -0
  19. agents/prompts/kimi_prompts.py +400 -0
  20. agents/prompts/prompt_factory.py +179 -0
  21. agents/tools/__init__.py +8 -0
  22. agents/tools/base.py +96 -0
  23. agents/tools/get_external_deps.py +47 -0
  24. agents/tools/get_method_invocations.py +47 -0
  25. agents/tools/read_cfg.py +60 -0
  26. agents/tools/read_docs.py +132 -0
  27. agents/tools/read_file.py +90 -0
  28. agents/tools/read_file_structure.py +156 -0
  29. agents/tools/read_git_diff.py +131 -0
  30. agents/tools/read_packages.py +60 -0
  31. agents/tools/read_source.py +105 -0
  32. agents/tools/read_structure.py +49 -0
  33. agents/tools/toolkit.py +119 -0
  34. agents/validation.py +383 -0
  35. caching/__init__.py +4 -0
  36. caching/cache.py +29 -0
  37. caching/meta_cache.py +227 -0
  38. codeboarding-0.9.0.dist-info/METADATA +223 -0
  39. codeboarding-0.9.0.dist-info/RECORD +126 -0
  40. codeboarding-0.9.0.dist-info/WHEEL +5 -0
  41. codeboarding-0.9.0.dist-info/entry_points.txt +3 -0
  42. codeboarding-0.9.0.dist-info/licenses/LICENSE +21 -0
  43. codeboarding-0.9.0.dist-info/top_level.txt +18 -0
  44. core/__init__.py +101 -0
  45. core/plugin_loader.py +46 -0
  46. core/protocols.py +27 -0
  47. core/registry.py +46 -0
  48. diagram_analysis/__init__.py +4 -0
  49. diagram_analysis/analysis_json.py +346 -0
  50. diagram_analysis/diagram_generator.py +486 -0
  51. diagram_analysis/file_coverage.py +212 -0
  52. diagram_analysis/incremental/__init__.py +63 -0
  53. diagram_analysis/incremental/component_checker.py +236 -0
  54. diagram_analysis/incremental/file_manager.py +217 -0
  55. diagram_analysis/incremental/impact_analyzer.py +238 -0
  56. diagram_analysis/incremental/io_utils.py +281 -0
  57. diagram_analysis/incremental/models.py +72 -0
  58. diagram_analysis/incremental/path_patching.py +164 -0
  59. diagram_analysis/incremental/reexpansion.py +166 -0
  60. diagram_analysis/incremental/scoped_analysis.py +227 -0
  61. diagram_analysis/incremental/updater.py +464 -0
  62. diagram_analysis/incremental/validation.py +48 -0
  63. diagram_analysis/manifest.py +152 -0
  64. diagram_analysis/version.py +6 -0
  65. duckdb_crud.py +125 -0
  66. github_action.py +172 -0
  67. health/__init__.py +3 -0
  68. health/checks/__init__.py +11 -0
  69. health/checks/circular_deps.py +48 -0
  70. health/checks/cohesion.py +93 -0
  71. health/checks/coupling.py +140 -0
  72. health/checks/function_size.py +85 -0
  73. health/checks/god_class.py +167 -0
  74. health/checks/inheritance.py +104 -0
  75. health/checks/instability.py +77 -0
  76. health/checks/unused_code_diagnostics.py +338 -0
  77. health/config.py +172 -0
  78. health/constants.py +19 -0
  79. health/models.py +186 -0
  80. health/runner.py +236 -0
  81. install.py +518 -0
  82. logging_config.py +105 -0
  83. main.py +529 -0
  84. monitoring/__init__.py +12 -0
  85. monitoring/callbacks.py +163 -0
  86. monitoring/context.py +158 -0
  87. monitoring/mixin.py +16 -0
  88. monitoring/paths.py +47 -0
  89. monitoring/stats.py +50 -0
  90. monitoring/writers.py +172 -0
  91. output_generators/__init__.py +0 -0
  92. output_generators/html.py +163 -0
  93. output_generators/html_template.py +382 -0
  94. output_generators/markdown.py +140 -0
  95. output_generators/mdx.py +171 -0
  96. output_generators/sphinx.py +175 -0
  97. repo_utils/__init__.py +277 -0
  98. repo_utils/change_detector.py +289 -0
  99. repo_utils/errors.py +6 -0
  100. repo_utils/git_diff.py +74 -0
  101. repo_utils/ignore.py +341 -0
  102. static_analyzer/__init__.py +335 -0
  103. static_analyzer/analysis_cache.py +699 -0
  104. static_analyzer/analysis_result.py +269 -0
  105. static_analyzer/cluster_change_analyzer.py +391 -0
  106. static_analyzer/cluster_helpers.py +79 -0
  107. static_analyzer/constants.py +166 -0
  108. static_analyzer/git_diff_analyzer.py +224 -0
  109. static_analyzer/graph.py +746 -0
  110. static_analyzer/incremental_orchestrator.py +671 -0
  111. static_analyzer/java_config_scanner.py +232 -0
  112. static_analyzer/java_utils.py +227 -0
  113. static_analyzer/lsp_client/__init__.py +12 -0
  114. static_analyzer/lsp_client/client.py +1642 -0
  115. static_analyzer/lsp_client/diagnostics.py +62 -0
  116. static_analyzer/lsp_client/java_client.py +517 -0
  117. static_analyzer/lsp_client/language_settings.py +97 -0
  118. static_analyzer/lsp_client/typescript_client.py +235 -0
  119. static_analyzer/programming_language.py +152 -0
  120. static_analyzer/reference_resolve_mixin.py +166 -0
  121. static_analyzer/scanner.py +95 -0
  122. static_analyzer/typescript_config_scanner.py +54 -0
  123. tool_registry.py +433 -0
  124. user_config.py +134 -0
  125. utils.py +56 -0
  126. vscode_constants.py +124 -0
@@ -0,0 +1,363 @@
1
+ import abc
2
+ import hashlib
3
+ import logging
4
+ from abc import abstractmethod
5
+ from typing import get_origin, Optional
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ ROOT_PARENT_ID = "ROOT"
12
+ COMPONENT_ID_BYTES = 8
13
+
14
+
15
class LLMBaseModel(BaseModel, abc.ABC):
    """Base model for LLM-parseable response types.

    Subclasses implement ``llm_str`` to render themselves for prompt
    consumption; ``extractor_str`` builds an extraction instruction from the
    pydantic field descriptions.
    """

    @abstractmethod
    def llm_str(self):
        """Return a human/LLM-readable rendering of this model."""
        raise NotImplementedError("LLM String has to be implemented.")

    @classmethod
    def extractor_str(cls):
        """Build an extraction instruction string from this model's fields.

        For each field, emits ``name (description), ``. Nested LLMBaseModel
        fields recurse into their own ``extractor_str``. List fields are
        wrapped as ``name which is a list (... ), ``.

        Bug fix: the list-ness of the field is remembered *before* the
        annotation is unwrapped to its element type; previously the closing
        `"), "` re-checked `get_origin(ftype)` on the already-unwrapped
        element type and therefore never fired, leaving the parenthesis
        unbalanced. The dead `ftype is Optional` branch (bare `Optional` is
        never a concrete field annotation, and its output was identical to
        the fallback branch) has been removed.
        """
        result_str = "please extract the following: "
        for fname, fvalue in cls.model_fields.items():
            ftype = fvalue.annotation
            # Remember list-ness before unwrapping so the closer matches.
            is_list = get_origin(ftype) is list
            if is_list:
                # Unwrap list[X] -> X so nested models can recurse.
                if hasattr(ftype, "__args__"):
                    ftype = ftype.__args__[0]
                result_str += f"{fname} which is a list ("
            if ftype is not None and isinstance(ftype, type) and issubclass(ftype, LLMBaseModel):
                # Nested LLM model: inline its own extraction instruction.
                result_str += ftype.extractor_str()
            else:
                result_str += f"{fname} ({fvalue.description}), "
            if is_list:
                result_str += "), "
        return result_str
45
+
46
+
47
class SourceCodeReference(LLMBaseModel):
    """Reference to source code including qualified name and file location."""

    qualified_name: str = Field(
        description="Qualified name of the source code, e.g., `langchain.tools.tool` or `langchain_core.output_parsers.JsonOutputParser` or `langchain_core.output_parsers.JsonOutputParser:parse`."
    )

    reference_file: str | None = Field(
        default=None,
        description="File path where the source code is located, e.g., `langchain/tools/tool.py` or `langchain_core/output_parsers/json_output_parser.py`.",
    )

    reference_start_line: int | None = Field(
        default=None,
        description="The line number in the source code where the reference starts. Only if you are absolutely sure add this, otherwise None.",
    )
    reference_end_line: int | None = Field(
        default=None,
        description="The line number in the source code where the reference ends. Only if you are absolutely sure add this, otherwise None.",
    )

    def _has_printable_range(self) -> bool:
        """True when both line numbers exist and denote a real, positive span.

        A range is suppressed when either bound is missing, when both bounds
        are non-positive, or when start equals end.
        """
        start, end = self.reference_start_line, self.reference_end_line
        if start is None or end is None:
            return False
        return not (start <= end <= 0 or start == end)

    def llm_str(self):
        """Render for prompts: qualified name, file, and line span if usable."""
        base = f"QName:`{self.qualified_name}` FileRef: `{self.reference_file}`"
        if self._has_printable_range():
            return f"{base}, Lines:({self.reference_start_line}:{self.reference_end_line})"
        return base

    def __str__(self):
        """Compact rendering: backticked name, plus line span if usable."""
        if self._has_printable_range():
            return f"`{self.qualified_name}`:{self.reference_start_line}-{self.reference_end_line}"
        return f"`{self.qualified_name}`"
87
+
88
+
89
class Relation(LLMBaseModel):
    """A relationship between two components."""

    relation: str = Field(description="Single phrase used for the relationship of two components.")
    src_name: str = Field(description="Source component name")
    dst_name: str = Field(description="Target component name")
    src_id: str = Field(default="", description="Component ID of the source.", exclude=True)
    dst_id: str = Field(default="", description="Component ID of the destination.", exclude=True)

    def llm_str(self):
        """Render as a ``(source, relation, target)`` triple."""
        return "({}, {}, {})".format(self.src_name, self.relation, self.dst_name)
100
+
101
+
102
class ClustersComponent(LLMBaseModel):
    """A grouped component from cluster analysis - may contain multiple clusters."""

    cluster_ids: list[int] = Field(
        description="List of cluster IDs from the CFG analysis that are grouped together (e.g., [1, 3, 5])"
    )
    description: str = Field(
        description="Explanation of what this component does, its main flow, and WHY these clusters are grouped together"
    )

    def llm_str(self):
        """Render as a bold cluster-ID header followed by the grouping rationale."""
        joined_ids = ", ".join(map(str, self.cluster_ids))
        return f"**Clusters [{joined_ids}]**\n {self.description}"
115
+
116
+
117
class ClusterAnalysis(LLMBaseModel):
    """Analysis results containing grouped cluster components."""

    cluster_components: list[ClustersComponent] = Field(
        description="Grouped clusters into logical components. Multiple cluster IDs can be grouped together if they work as a cohesive unit."
    )

    def llm_str(self):
        """Render all grouped components under a markdown title, or a placeholder."""
        if not self.cluster_components:
            return "No clusters analyzed."
        sections = [cc.llm_str() for cc in self.cluster_components]
        return "# Grouped Cluster Components\n" + "\n".join(sections)
130
+
131
+
132
class Component(LLMBaseModel):
    """A software component with name, description, and key entities."""

    name: str = Field(description="Name of the component")
    description: str = Field(description="A short description of the component.")

    # LLM picks these: The MOST IMPORTANT/critical methods and classes
    key_entities: list[SourceCodeReference] = Field(
        description="The most important/critical classes and methods that represent this component's core functionality. Pick 2-5 key entities."
    )

    # Deterministic from static analysis: ALL files belonging to this component
    assigned_files: list[str] = Field(
        description="All source files assigned to this component (populated deterministically).",
        default_factory=list,
        exclude=True,
    )

    source_cluster_ids: list[int] = Field(
        description="List of cluster IDs from CFG analysis that this component encompasses.",
        default_factory=list,
    )

    component_id: str = Field(
        default="",
        description="Deterministic unique identifier for this component.",
        exclude=True,
    )

    def llm_str(self):
        """Render name, description, and (when present) key entities as markdown."""
        lines = [
            f"**Component:** `{self.name}`",
            f" - *Description*: {self.description}",
        ]
        entity_line = ""
        if self.key_entities:
            rendered = ", ".join(f"`{q.llm_str()}`" for q in self.key_entities)
            entity_line = " - *Key Entities*: " + rendered
        lines.append(entity_line)
        # strip() drops the trailing newline left by an empty entity line.
        return "\n".join(lines).strip()
169
+
170
+
171
class AnalysisInsights(LLMBaseModel):
    """Project analysis insights including components and their relations."""

    description: str = Field(
        description="One paragraph explaining the functionality which is represented by this graph. What the main flow is and what is its purpose."
    )
    components: list[Component] = Field(description="List of the components identified in the project.")
    components_relations: list[Relation] = Field(description="List of relations among the components.")

    def llm_str(self):
        """Render a markdown overview: component list followed by relation triples.

        Returns a placeholder string when no components were identified.
        """
        if not self.components:
            return "No abstract components found."
        title = "# 📦 Abstract Components Overview\n"
        body = "\n".join(ac.llm_str() for ac in self.components)
        relations = "\n".join(cr.llm_str() for cr in self.components_relations)
        # NOTE(review): `body` and `relations` are concatenated with no
        # separating newline, so the first relation is appended directly to
        # the last component line. CFGAnalysisInsights.llm_str does the same,
        # so this may be intentional — confirm before changing.
        return title + body + relations
187
+
188
+
189
+ def hash_component_id(parent_id: str, name: str, sibling_index: int = 0) -> str:
190
+ """Hash a deterministic component ID from parent ID, name, and sibling index.
191
+
192
+ Note:
193
+ The ID is a compact, 64-bit prefix of SHA-256 (8 bytes -> 16 hex chars).
194
+ Truncation happens at the byte level to keep the representation explicit.
195
+ """
196
+ raw = f"{parent_id}:{name}:{sibling_index}".encode("utf-8")
197
+ return hashlib.sha256(raw).digest()[:COMPONENT_ID_BYTES].hex()
198
+
199
+
200
def assign_component_ids(analysis: AnalysisInsights, parent_id: str = ROOT_PARENT_ID) -> None:
    """Assign deterministic component IDs to all components in an analysis.

    Handles same-named siblings by using a sibling index tiebreaker, then
    resolves relation src/dst IDs by component name (first occurrence wins
    for duplicate names).
    """
    # Pass 1: deterministic IDs, disambiguating same-named siblings by index.
    sibling_counter: dict[str, int] = {}
    for comp in analysis.components:
        idx = sibling_counter.get(comp.name, 0)
        comp.component_id = hash_component_id(parent_id, comp.name, idx)
        sibling_counter[comp.name] = idx + 1

    # Pass 2: name -> ID lookup table; duplicates keep the first ID and warn.
    id_by_name: dict[str, str] = {}
    for comp in analysis.components:
        if comp.name not in id_by_name:
            id_by_name[comp.name] = comp.component_id
        else:
            logger.warning(
                f"Duplicate component name '{comp.name}' found during ID assignment; "
                f"relation lookup will use the first occurrence (ID: {id_by_name[comp.name]})"
            )

    # Pass 3: resolve relation endpoints; unknown names get an empty ID.
    for rel in analysis.components_relations:
        rel.src_id = id_by_name.get(rel.src_name, "")
        rel.dst_id = id_by_name.get(rel.dst_name, "")
224
+
225
+
226
class CFGComponent(LLMBaseModel):
    """A component derived from control flow graph analysis."""

    name: str = Field(description="Name of the abstract component")
    description: str = Field(description="One paragraph explaining the component.")
    referenced_source: list[str] = Field(
        description="List of the qualified names of the methods and classes that are within this component."
    )

    def llm_str(self):
        """Render name, description, and (when present) referenced sources."""
        lines = [
            f"**Component:** `{self.name}`",
            f" - *Description*: {self.description}",
        ]
        refs_line = ""
        if self.referenced_source:
            refs_line = " - *Related Classes/Methods*: " + ", ".join(
                f"`{q}`" for q in self.referenced_source
            )
        lines.append(refs_line)
        # strip() drops the trailing newline left by an empty refs line.
        return "\n".join(lines).strip()
243
+
244
+
245
class CFGAnalysisInsights(LLMBaseModel):
    """Insights from control flow graph analysis including components and relations."""

    components: list[CFGComponent] = Field(description="List of components identified in the CFG.")
    components_relations: list[Relation] = Field(description="List of relations among the components in the CFG.")

    def llm_str(self):
        """Render a markdown overview of CFG components followed by relations.

        Returns a placeholder string when no components were identified.
        """
        if not self.components:
            return "No abstract components found in the CFG."
        title = "# 📦 Abstract Components Overview from CFG\n"
        body = "\n".join(ac.llm_str() for ac in self.components)
        relations = "\n".join(cr.llm_str() for cr in self.components_relations)
        # NOTE(review): no separator between body and relations — the first
        # relation is appended to the last component line. Same pattern as
        # AnalysisInsights.llm_str; confirm whether this is intentional.
        return title + body + relations
258
+
259
+
260
class ExpandComponent(LLMBaseModel):
    """Decision on whether to expand a component with reasoning."""

    should_expand: bool = Field(description="Whether the component should be expanded in detail or not.")
    reason: str = Field(description="Reasoning behind the decision to expand or not.")

    def llm_str(self):
        """Render the decision and its rationale as two markdown bullets."""
        parts = [
            f"- *Should Expand:* {self.should_expand}",
            f"- *Reason:* {self.reason}",
        ]
        return "\n".join(parts)
268
+
269
+
270
class ValidationInsights(LLMBaseModel):
    """Validation results with status and additional information."""

    is_valid: bool = Field(description="Indicates whether the validation results in valid or not.")
    additional_info: str | None = Field(
        default=None,
        description="Any additional information or context related to the validation.",
    )

    def llm_str(self):
        """Render the feedback text; a missing additional_info renders as 'None'."""
        return "**Feedback Information:**\n" + str(self.additional_info)
281
+
282
+
283
class UpdateAnalysis(LLMBaseModel):
    """Feedback on how much a diagram needs updating."""

    update_degree: int = Field(
        description="Degree to which the diagram needs update. 0 means no update, 10 means complete update."
    )
    feedback: str = Field(description="Feedback provided on the analysis.")

    def llm_str(self):
        """Render only the textual feedback (the numeric degree is omitted)."""
        return "**Feedback:**\n" + self.feedback
293
+
294
+
295
class MetaAnalysisInsights(LLMBaseModel):
    """Insights from analyzing project metadata including type, domain, and architecture."""

    project_type: str = Field(
        description="Type/category of the project (e.g., web framework, data processing, ML library, etc.)"
    )
    domain: str = Field(
        description="Domain or field the project belongs to (e.g., web development, data science, DevOps, etc.)"
    )
    architectural_patterns: list[str] = Field(description="Main architectural patterns typically used in such projects")
    expected_components: list[str] = Field(description="Expected high-level components/modules based on project type")
    technology_stack: list[str] = Field(description="Main technologies, frameworks, and libraries used")
    architectural_bias: str = Field(
        description="Guidance on how to interpret and organize components for this project type"
    )

    def llm_str(self):
        """Render the project metadata as a markdown block for prompts."""
        title = "# 🎯 Project Metadata Analysis\n"
        # NOTE(review): string body kept flush-left so the emitted markdown
        # carries no leading indentation — confirm against original formatting.
        content = f"""
**Project Type:** {self.project_type}
**Domain:** {self.domain}
**Technology Stack:** {", ".join(self.technology_stack)}
**Architectural Patterns:** {", ".join(self.architectural_patterns)}
**Expected Components:** {", ".join(self.expected_components)}
**Architectural Bias:** {self.architectural_bias}
"""
        return title + content
322
+
323
+
324
class FileClassification(LLMBaseModel):
    """Classification of a file to a component."""

    component_name: str = Field(description="Name of the component or module")
    file_path: str = Field(description="Path to the file")

    def llm_str(self):
        """Render as ``path -> Component: name`` with backticked values."""
        return "`{}` -> Component: `{}`".format(self.file_path, self.component_name)
332
+
333
+
334
class ComponentFiles(LLMBaseModel):
    """Collection of file classifications for components."""

    file_paths: list[FileClassification] = Field(
        description="All files with their classifications for each of the files assigned to a component."
    )

    def llm_str(self):
        """Render each classification as a markdown bullet under a title."""
        if not self.file_paths:
            return "No files classified."
        # Delegate the per-entry formatting to FileClassification.llm_str.
        bullets = ["- " + fc.llm_str() for fc in self.file_paths]
        return "# 📄 Component File Classifications\n" + "\n".join(bullets)
347
+
348
+
349
class FilePath(LLMBaseModel):
    """File path with optional line range reference."""

    file_path: str = Field(description="Full file path for the reference")
    start_line: int | None = Field(
        default=None,
        description="Starting line number in the file for the reference (if applicable).",
    )
    end_line: int | None = Field(
        default=None,
        description="Ending line number in the file for the reference (if applicable).",
    )

    def llm_str(self):
        """Render as ``path: (start:end)``; missing bounds print as 'None'."""
        span = f"({self.start_line}:{self.end_line})"
        return f"`{self.file_path}`: " + span
@@ -0,0 +1,281 @@
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from agents.agent_responses import Component, AnalysisInsights
6
+ from static_analyzer.analysis_result import StaticAnalysisResults
7
+ from static_analyzer.graph import ClusterResult
8
+ from static_analyzer.cluster_helpers import get_files_for_cluster_ids, get_all_cluster_ids
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class ClusterMethodsMixin:
    """
    Mixin providing shared cluster-related functionality for agents.

    This mixin provides methods for:
    - Building cluster strings from CFG analysis (using CallGraph.cluster())
    - Assigning files to components based on clusters and key_entities
    - Ensuring unique key entities across components

    All clustering logic is delegated to CallGraph.cluster() which provides:
    - Deterministic cluster IDs (seed=42)
    - Cached results
    - File <-> cluster bidirectional mappings

    IMPORTANT: All methods are stateless with respect to ClusterResult.
    Cluster results must be passed explicitly as parameters.
    """

    # These attributes must be provided by the class using this mixin
    repo_dir: Path
    static_analysis: StaticAnalysisResults

    def _get_files_for_clusters(self, cluster_ids: list[int], cluster_results: dict[str, ClusterResult]) -> set[str]:
        """
        Get all files that belong to the given cluster IDs.

        Args:
            cluster_ids: List of cluster IDs to get files for
            cluster_results: dict mapping language -> ClusterResult

        Returns:
            Set of file paths
        """
        # Thin delegation to the shared helper; kept as a method so agents
        # can call it without importing cluster_helpers themselves.
        return get_files_for_cluster_ids(cluster_ids, cluster_results)

    def _build_cluster_string(
        self,
        programming_langs: list[str],
        cluster_results: dict[str, ClusterResult],
        cluster_ids: set[int] | None = None,
    ) -> str:
        """
        Build a cluster string for LLM consumption using pre-computed cluster results.

        Args:
            programming_langs: List of languages to include
            cluster_results: Pre-computed cluster results mapping language -> ClusterResult
            cluster_ids: Optional set of cluster IDs to filter by

        Returns:
            Formatted cluster string with headers per language
        """
        cluster_lines = []

        for lang in programming_langs:
            cfg = self.static_analysis.get_cfg(lang)
            # Get cluster result for this language; may be None if absent.
            cluster_result = cluster_results.get(lang)
            cluster_str = cfg.to_cluster_string(cluster_ids, cluster_result)

            # Skip languages whose rendering is blank or a known placeholder.
            if cluster_str.strip() and cluster_str not in ("empty", "none", "No clusters found."):
                header = "Component CFG" if cluster_ids else "Clusters"
                cluster_lines.append(f"\n## {lang.capitalize()} - {header}\n")
                cluster_lines.append(cluster_str)
                cluster_lines.append("\n")

        return "".join(cluster_lines)

    def _assign_files_to_component(self, component: Component, cluster_results: dict[str, ClusterResult]) -> None:
        """
        Assign files to a component.
        1. Get all files from component's clusters (instant lookup)
        2. Add resolved key_entity files
        3. Convert to relative paths

        Args:
            component: Component to assign files to
            cluster_results: dict mapping language -> ClusterResult
        """
        assigned: set[str] = set()

        # Step 1: Files from clusters
        if component.source_cluster_ids:
            cluster_files = self._get_files_for_clusters(component.source_cluster_ids, cluster_results)
            assigned.update(cluster_files)

        # Step 2: Files from key_entities (already resolved by ReferenceResolverMixin)
        for entity in component.key_entities:
            if entity.reference_file:
                # Handle both absolute and relative paths
                if os.path.isabs(entity.reference_file):
                    assigned.add(entity.reference_file)
                else:
                    abs_path = os.path.join(self.repo_dir, entity.reference_file)
                    if os.path.exists(abs_path):
                        assigned.add(abs_path)
                    else:
                        # Keep the relative path as-is when it does not
                        # resolve under repo_dir (e.g. stale reference).
                        assigned.add(entity.reference_file)

        # Convert to relative paths (relative entries pass through unchanged).
        component.assigned_files = [os.path.relpath(f, self.repo_dir) if os.path.isabs(f) else f for f in assigned]

    def _ensure_unique_key_entities(self, analysis: AnalysisInsights):
        """
        Ensure that key_entities are unique across components.

        If a key_entity (identified by qualified_name) appears in multiple components,
        keep it only in the component where it's most relevant:
        1. If it's in the component's assigned_files -> keep it there (highest priority)
        2. Otherwise, keep it in the first component that references it

        This prevents confusion in documentation where the same class/method
        is listed as a "key entity" for multiple components.
        """
        logger.info("[ClusterMethodsMixin] Ensuring key_entities are unique across components")

        # qualified_name -> component that currently "owns" the entity.
        seen_entities: dict[str, Component] = {}

        for component in analysis.components:
            if component.name == "Unclassified":
                # Catch-all bucket never claims ownership of an entity.
                continue

            entities_to_remove = []

            for key_entity in component.key_entities:
                qname = key_entity.qualified_name

                if qname in seen_entities:
                    original_component = seen_entities[qname]
                    ref_file = key_entity.reference_file

                    # File-membership test uses substring containment, so a
                    # relative ref_file matches an absolute assigned path too.
                    current_has_file = ref_file and any(
                        ref_file in assigned_file for assigned_file in component.assigned_files
                    )
                    original_has_file = ref_file and any(
                        ref_file in assigned_file for assigned_file in original_component.assigned_files
                    )

                    if current_has_file and not original_has_file:
                        # Move to current component
                        original_component.key_entities = [
                            e for e in original_component.key_entities if e.qualified_name != qname
                        ]
                        seen_entities[qname] = component
                        logger.debug(
                            f"[ClusterMethodsMixin] Moved key_entity '{qname}' from {original_component.name} to {component.name}"
                        )
                    else:
                        # Keep in original component
                        entities_to_remove.append(key_entity)
                        logger.debug(
                            f"[ClusterMethodsMixin] Removed duplicate key_entity '{qname}' from {component.name} (kept in {original_component.name})"
                        )
                else:
                    seen_entities[qname] = component

            # NOTE(review): `e not in entities_to_remove` relies on model
            # equality; two field-equal entities would both be dropped.
            # Presumably acceptable here since they are duplicates — confirm.
            component.key_entities = [e for e in component.key_entities if e not in entities_to_remove]

    def _ensure_unique_file_assignments(self, analysis: AnalysisInsights) -> None:
        """
        Deduplicate assigned_files within each component.

        A file may legitimately appear in multiple components, but should not
        appear more than once within the same component's assigned_files list.
        """
        logger.info("[ClusterMethodsMixin] Deduplicating file assignments within components")

        total_removed = 0

        for component in analysis.components:
            # Order-preserving dedup: first occurrence of each path wins.
            seen: set[str] = set()
            unique_files: list[str] = []
            for file_path in component.assigned_files:
                if file_path in seen:
                    logger.debug(
                        f"[ClusterMethodsMixin] Removed duplicate file '{file_path}' within '{component.name}'"
                    )
                    total_removed += 1
                else:
                    seen.add(file_path)
                    unique_files.append(file_path)

            component.assigned_files = unique_files

        if total_removed > 0:
            logger.info(f"[ClusterMethodsMixin] Removed {total_removed} duplicate file assignment(s)")

    def _sanitize_component_cluster_ids(
        self,
        analysis: AnalysisInsights,
        valid_cluster_ids: set[int] | None = None,
        cluster_results: dict[str, ClusterResult] | None = None,
    ) -> None:
        """
        Sanitize cluster IDs in the analysis by removing invalid ones.
        Removes cluster IDs that don't exist in the static analysis.

        Args:
            analysis: The analysis to sanitize
            valid_cluster_ids: Optional set of valid IDs. If None, derives from cluster_results.
            cluster_results: dict mapping language -> ClusterResult. Required if valid_cluster_ids is None.
        """
        if valid_cluster_ids is None:
            if cluster_results is None:
                # Misuse by the caller: nothing to validate against, so bail
                # out without mutating the analysis.
                logger.error("Must provide either valid_cluster_ids or cluster_results")
                return
            valid_cluster_ids = get_all_cluster_ids(cluster_results)

        for component in analysis.components:
            if component.source_cluster_ids:
                original_ids = component.source_cluster_ids.copy()
                component.source_cluster_ids = [cid for cid in component.source_cluster_ids if cid in valid_cluster_ids]
                removed_ids = set(original_ids) - set(component.source_cluster_ids)
                if removed_ids:
                    logger.warning(
                        f"[ClusterMethodsMixin] Removed invalid cluster IDs {removed_ids} from component '{component.name}'"
                    )

    def _create_strict_component_subgraph(self, component: Component) -> tuple[str, dict]:
        """
        Create a strict subgraph containing ONLY nodes from the component's assigned files.
        This ensures the analysis is strictly scoped to the component's boundaries.

        Args:
            component: Component with assigned_files to filter by

        Returns:
            Tuple of (formatted cluster string, cluster_results dict)
            where cluster_results maps language -> ClusterResult for the subgraph
        """
        if not component.assigned_files:
            logger.warning(f"[ClusterMethodsMixin] Component {component.name} has no assigned_files")
            return "No assigned files found for this component.", {}

        # Convert assigned files to absolute paths for comparison
        assigned_file_set = set()
        for f in component.assigned_files:
            abs_path = os.path.join(self.repo_dir, f) if not os.path.isabs(f) else f
            assigned_file_set.add(abs_path)

        result_parts = []
        cluster_results = {}

        for lang in self.static_analysis.get_languages():
            cfg = self.static_analysis.get_cfg(lang)

            # Use strict filtering logic
            sub_cfg = cfg.filter_by_files(assigned_file_set)

            if sub_cfg.nodes:
                # Calculate clusters for the subgraph
                sub_cluster_result = sub_cfg.cluster()
                cluster_results[lang] = sub_cluster_result

                cluster_str = sub_cfg.to_cluster_string(cluster_result=sub_cluster_result)
                # Same placeholder filtering as _build_cluster_string.
                if cluster_str.strip() and cluster_str not in ("empty", "none", "No clusters found."):
                    result_parts.append(f"\n## {lang.capitalize()} - Component CFG\n")
                    result_parts.append(cluster_str)
                    result_parts.append("\n")

        result = "".join(result_parts)

        if not result.strip():
            logger.warning(
                f"[ClusterMethodsMixin] No CFG found for component {component.name} with {len(component.assigned_files)} assigned files"
            )
            # Note: cluster_results may still be non-empty here (clusters
            # existed but rendered as placeholders) and is returned as-is.
            return "No relevant CFG clusters found for this component.", cluster_results

        return result, cluster_results
agents/constants.py ADDED
@@ -0,0 +1,13 @@
1
+ """Constants for the agents module."""
2
+
3
+
4
class LLMDefaults:
    """Default temperature and token-limit constants for LLM calls."""

    # Near-deterministic sampling for agent calls — consumers not visible here.
    DEFAULT_AGENT_TEMPERATURE = 0.1
    # Fully deterministic sampling for parsing/extraction calls.
    DEFAULT_PARSING_TEMPERATURE = 0
    # Max output tokens for AWS-hosted models (presumably Bedrock — confirm at call sites).
    AWS_MAX_TOKENS = 4096
8
+
9
+
10
class FileStructureConfig:
    """Numeric limits for file-structure handling (semantics defined at the call sites)."""

    # Default line cap — TODO confirm what is truncated against consumers.
    MAX_LINES = 500
    # Default traversal depth limit.
    DEFAULT_MAX_DEPTH = 10
    # Larger fallback cap used when the default limit is insufficient — confirm usage.
    FALLBACK_MAX_LINES = 50000