codeboarding-0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/__init__.py +0 -0
- agents/abstraction_agent.py +150 -0
- agents/agent.py +467 -0
- agents/agent_responses.py +363 -0
- agents/cluster_methods_mixin.py +281 -0
- agents/constants.py +13 -0
- agents/dependency_discovery.py +159 -0
- agents/details_agent.py +174 -0
- agents/llm_config.py +309 -0
- agents/meta_agent.py +105 -0
- agents/planner_agent.py +105 -0
- agents/prompts/__init__.py +85 -0
- agents/prompts/abstract_prompt_factory.py +63 -0
- agents/prompts/claude_prompts.py +381 -0
- agents/prompts/deepseek_prompts.py +389 -0
- agents/prompts/gemini_flash_prompts.py +362 -0
- agents/prompts/glm_prompts.py +407 -0
- agents/prompts/gpt_prompts.py +470 -0
- agents/prompts/kimi_prompts.py +400 -0
- agents/prompts/prompt_factory.py +179 -0
- agents/tools/__init__.py +8 -0
- agents/tools/base.py +96 -0
- agents/tools/get_external_deps.py +47 -0
- agents/tools/get_method_invocations.py +47 -0
- agents/tools/read_cfg.py +60 -0
- agents/tools/read_docs.py +132 -0
- agents/tools/read_file.py +90 -0
- agents/tools/read_file_structure.py +156 -0
- agents/tools/read_git_diff.py +131 -0
- agents/tools/read_packages.py +60 -0
- agents/tools/read_source.py +105 -0
- agents/tools/read_structure.py +49 -0
- agents/tools/toolkit.py +119 -0
- agents/validation.py +383 -0
- caching/__init__.py +4 -0
- caching/cache.py +29 -0
- caching/meta_cache.py +227 -0
- codeboarding-0.9.0.dist-info/METADATA +223 -0
- codeboarding-0.9.0.dist-info/RECORD +126 -0
- codeboarding-0.9.0.dist-info/WHEEL +5 -0
- codeboarding-0.9.0.dist-info/entry_points.txt +3 -0
- codeboarding-0.9.0.dist-info/licenses/LICENSE +21 -0
- codeboarding-0.9.0.dist-info/top_level.txt +18 -0
- core/__init__.py +101 -0
- core/plugin_loader.py +46 -0
- core/protocols.py +27 -0
- core/registry.py +46 -0
- diagram_analysis/__init__.py +4 -0
- diagram_analysis/analysis_json.py +346 -0
- diagram_analysis/diagram_generator.py +486 -0
- diagram_analysis/file_coverage.py +212 -0
- diagram_analysis/incremental/__init__.py +63 -0
- diagram_analysis/incremental/component_checker.py +236 -0
- diagram_analysis/incremental/file_manager.py +217 -0
- diagram_analysis/incremental/impact_analyzer.py +238 -0
- diagram_analysis/incremental/io_utils.py +281 -0
- diagram_analysis/incremental/models.py +72 -0
- diagram_analysis/incremental/path_patching.py +164 -0
- diagram_analysis/incremental/reexpansion.py +166 -0
- diagram_analysis/incremental/scoped_analysis.py +227 -0
- diagram_analysis/incremental/updater.py +464 -0
- diagram_analysis/incremental/validation.py +48 -0
- diagram_analysis/manifest.py +152 -0
- diagram_analysis/version.py +6 -0
- duckdb_crud.py +125 -0
- github_action.py +172 -0
- health/__init__.py +3 -0
- health/checks/__init__.py +11 -0
- health/checks/circular_deps.py +48 -0
- health/checks/cohesion.py +93 -0
- health/checks/coupling.py +140 -0
- health/checks/function_size.py +85 -0
- health/checks/god_class.py +167 -0
- health/checks/inheritance.py +104 -0
- health/checks/instability.py +77 -0
- health/checks/unused_code_diagnostics.py +338 -0
- health/config.py +172 -0
- health/constants.py +19 -0
- health/models.py +186 -0
- health/runner.py +236 -0
- install.py +518 -0
- logging_config.py +105 -0
- main.py +529 -0
- monitoring/__init__.py +12 -0
- monitoring/callbacks.py +163 -0
- monitoring/context.py +158 -0
- monitoring/mixin.py +16 -0
- monitoring/paths.py +47 -0
- monitoring/stats.py +50 -0
- monitoring/writers.py +172 -0
- output_generators/__init__.py +0 -0
- output_generators/html.py +163 -0
- output_generators/html_template.py +382 -0
- output_generators/markdown.py +140 -0
- output_generators/mdx.py +171 -0
- output_generators/sphinx.py +175 -0
- repo_utils/__init__.py +277 -0
- repo_utils/change_detector.py +289 -0
- repo_utils/errors.py +6 -0
- repo_utils/git_diff.py +74 -0
- repo_utils/ignore.py +341 -0
- static_analyzer/__init__.py +335 -0
- static_analyzer/analysis_cache.py +699 -0
- static_analyzer/analysis_result.py +269 -0
- static_analyzer/cluster_change_analyzer.py +391 -0
- static_analyzer/cluster_helpers.py +79 -0
- static_analyzer/constants.py +166 -0
- static_analyzer/git_diff_analyzer.py +224 -0
- static_analyzer/graph.py +746 -0
- static_analyzer/incremental_orchestrator.py +671 -0
- static_analyzer/java_config_scanner.py +232 -0
- static_analyzer/java_utils.py +227 -0
- static_analyzer/lsp_client/__init__.py +12 -0
- static_analyzer/lsp_client/client.py +1642 -0
- static_analyzer/lsp_client/diagnostics.py +62 -0
- static_analyzer/lsp_client/java_client.py +517 -0
- static_analyzer/lsp_client/language_settings.py +97 -0
- static_analyzer/lsp_client/typescript_client.py +235 -0
- static_analyzer/programming_language.py +152 -0
- static_analyzer/reference_resolve_mixin.py +166 -0
- static_analyzer/scanner.py +95 -0
- static_analyzer/typescript_config_scanner.py +54 -0
- tool_registry.py +433 -0
- user_config.py +134 -0
- utils.py +56 -0
- vscode_constants.py +124 -0
agents/validation.py
ADDED
@@ -0,0 +1,383 @@
+"""Validation utilities for LLM agent outputs."""
+
+import logging
+import os
+from dataclasses import dataclass, field
+
+from agents.agent_responses import AnalysisInsights, ClusterAnalysis, ComponentFiles
+from repo_utils import normalize_path
+from static_analyzer.graph import CallGraph, ClusterResult
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ValidationContext:
+    """
+    This class is used to provide the necessary context for validating different LLM steps.
+    It encapsulates all relevant information required by validation routines to ensure that each step in the LLM pipeline
+    is checked against the expected criteria.
+    """
+
+    cluster_results: dict[str, ClusterResult] = field(default_factory=dict)
+    cfg_graphs: dict[str, CallGraph] = field(default_factory=dict)  # For edge checking
+    expected_cluster_ids: set[int] = field(default_factory=set)
+    expected_files: set[str] = field(default_factory=set)
+    valid_component_names: set[str] = field(default_factory=set)  # For file classification validation
+    repo_dir: str | None = None  # For path normalization
+
+
+@dataclass
+class ValidationResult:
+    """Result of a validation check."""
+
+    is_valid: bool
+    feedback_messages: list[str] = field(default_factory=list)
+
+
+def validate_cluster_coverage(result: ClusterAnalysis, context: ValidationContext) -> ValidationResult:
+    """
+    Validate that all expected clusters are represented in the ClusterAnalysis.
+
+    Args:
+        result: ClusterAnalysis containing cluster_components
+        context: ValidationContext with expected_cluster_ids
+
+    Returns:
+        ValidationResult with feedback for missing clusters
+    """
+    if not context.expected_cluster_ids:
+        logger.warning("[Validation] No expected cluster IDs provided for coverage validation")
+        return ValidationResult(is_valid=True)
+
+    # Extract all cluster IDs from the result
+    result_cluster_ids = set()
+    for component in result.cluster_components:
+        result_cluster_ids.update(component.cluster_ids)
+
+    # Find missing clusters
+    missing_clusters = context.expected_cluster_ids - result_cluster_ids
+
+    if not missing_clusters:
+        logger.info("[Validation] All clusters are represented in the ClusterAnalysis")
+        return ValidationResult(is_valid=True)
+
+    # Build feedback message
+    missing_str = ", ".join(str(cid) for cid in sorted(missing_clusters))
+    feedback = (
+        f"The following cluster IDs are missing from the analysis: {missing_str}. "
+        f"Please ensure all clusters are assigned to a component or create new components for them."
+    )
+
+    logger.warning(f"[Validation] Missing clusters: {missing_str}")
+    return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+def validate_component_relationships(result: AnalysisInsights, context: ValidationContext) -> ValidationResult:
+    """
+    Validate that component relationships have corresponding edges in the cluster graph.
+
+    Args:
+        result: AnalysisInsights containing components and components_relations
+        context: ValidationContext with cluster_results and cfg_graphs
+
+    Returns:
+        ValidationResult with feedback for invalid relationships
+    """
+    if not context.cfg_graphs or not result.components_relations:
+        logger.warning("[Validation] No CFG graphs or component relationships provided for relationship validation")
+        return ValidationResult(is_valid=True)
+
+    # Build component name -> source_cluster_ids mapping
+    component_clusters: dict[str, list[int]] = {}
+    for component in result.components:
+        component_clusters[component.name] = component.source_cluster_ids
+
+    cluster_edge_lookup = _build_cluster_edge_lookup(context.cluster_results, context.cfg_graphs)
+
+    invalid_relations: list[str] = []
+
+    for relation in result.components_relations:
+        src_clusters = component_clusters.get(relation.src_name, [])
+        dst_clusters = component_clusters.get(relation.dst_name, [])
+
+        if not src_clusters or not dst_clusters:
+            continue
+
+        # Check if any cluster pair has an edge
+        has_edge = _check_edge_between_cluster_sets(
+            src_clusters,
+            dst_clusters,
+            context.cluster_results,
+            context.cfg_graphs,
+            cluster_edge_lookup,
+        )
+
+        if not has_edge:
+            invalid_relations.append(f"({relation.src_name} -> {relation.dst_name})")
+
+    if not invalid_relations:
+        logger.info("[Validation] All component relationships have backing edges")
+        return ValidationResult(is_valid=True)
+
+    # Build feedback message
+    invalid_str = ", ".join(invalid_relations)
+    feedback = (
+        f"The following component relationships lack backing edges in the cluster graph: {invalid_str}. "
+        f"Please double-check if these components are actually related. If there is no direct edge between "
+        f"their clusters, the relationship may be indirect or incorrect."
+    )
+
+    logger.warning(f"[Validation] Invalid relationships: {invalid_str}")
+    return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+def validate_key_entities(result: AnalysisInsights, context: ValidationContext) -> ValidationResult:
+    """
+    Validate that every component in AnalysisInsights has at least one key_entity assigned.
+
+    Args:
+        result: AnalysisInsights containing components
+        context: ValidationContext (not used but kept for interface consistency)
+
+    Returns:
+        ValidationResult with feedback for components missing key entities
+    """
+    components_without_key_entities: list[str] = []
+
+    for component in result.components:
+        if not component.key_entities or len(component.key_entities) == 0:
+            components_without_key_entities.append(component.name)
+
+    if not components_without_key_entities:
+        logger.info("[Validation] All components have key entities assigned")
+        return ValidationResult(is_valid=True)
+
+    # Build feedback message
+    missing_str = ", ".join(components_without_key_entities)
+    feedback = (
+        f"The following components are missing key entities: {missing_str}. "
+        f"Every component must have at least one key entity (critical class or method) "
+        f"that represents its core functionality. Please identify and add 2-5 key entities "
+        f"for each component."
+    )
+
+    logger.warning(f"[Validation] Components without key entities: {missing_str}")
+    return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+def validate_cluster_ids_populated(result: AnalysisInsights, context: ValidationContext) -> ValidationResult:
+    """
+    Validate that every cluster is assigned to at least one component.
+
+    Args:
+        result: AnalysisInsights containing components
+        context: ValidationContext with cluster_results to get available cluster IDs
+
+    Returns:
+        ValidationResult with feedback for unassigned clusters
+    """
+    if not context.cluster_results:
+        logger.warning("[Validation] No cluster results provided for cluster ID validation")
+        return ValidationResult(is_valid=True)
+
+    all_cluster_ids: set[int] = set()
+    for lang_result in context.cluster_results.values():
+        all_cluster_ids.update(lang_result.get_cluster_ids())
+
+    if not all_cluster_ids:
+        logger.warning("[Validation] No cluster IDs available for cluster ID validation")
+        return ValidationResult(is_valid=True)
+
+    assigned_cluster_ids: set[int] = set()
+    for component in result.components:
+        assigned_cluster_ids.update(component.source_cluster_ids or [])
+
+    unassigned_clusters = all_cluster_ids - assigned_cluster_ids
+
+    if not unassigned_clusters:
+        logger.info("[Validation] All clusters are assigned to components")
+        return ValidationResult(is_valid=True)
+
+    missing_str = ", ".join(str(cid) for cid in sorted(unassigned_clusters))
+    feedback = (
+        f"The following cluster IDs are not assigned to any component: {missing_str}. "
+        f"Please assign every cluster to a component based on which code clusters belong to it."
+    )
+
+    logger.warning(f"[Validation] Unassigned clusters: {missing_str}")
+    return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+def validate_file_classifications(result: ComponentFiles, context: ValidationContext) -> ValidationResult:
+    """
+    Validate that all unassigned files were classified to valid component names.
+
+    This validator is used for _classify_unassigned_files_with_llm to ensure:
+    1. All input files are present in the result
+    2. All component names are valid (exist in valid_component_names)
+
+    Args:
+        result: ComponentFiles with file_paths containing FileClassification objects
+        context: ValidationContext with expected_files (unassigned files) and valid_component_names
+
+    Returns:
+        ValidationResult with feedback for missing files or invalid component names
+    """
+    if not context.expected_files:
+        logger.warning("[Validation] No expected files provided for file classification validation")
+        return ValidationResult(is_valid=True)
+
+    feedback_messages = []
+
+    # Get classified file paths from result
+    classified_files = {normalize_path(fc.file_path, context.repo_dir) for fc in result.file_paths}
+
+    # Normalize paths for comparison
+    expected_files_normalized = {normalize_path(file_path, context.repo_dir) for file_path in context.expected_files}
+
+    # Check 1: Are all unassigned files classified?
+    missing_files = expected_files_normalized - classified_files
+    if missing_files:
+        missing_list = sorted(str(f) for f in missing_files)[:10]
+        missing_str = ", ".join(missing_list)
+        more_msg = f" and {len(missing_files) - 10} more" if len(missing_files) > 10 else ""
+        feedback_messages.append(
+            f"The following files were not classified: {missing_str}{more_msg}. "
+            f"Please ensure all files are assigned to a component."
+        )
+
+    # Check 2: Are all component names valid?
+    if context.valid_component_names:
+        invalid_classifications = []
+        for fc in result.file_paths:
+            if fc.component_name not in context.valid_component_names:
+                invalid_classifications.append(f"{fc.file_path} -> {fc.component_name}")
+
+        if invalid_classifications:
+            invalid_str = ", ".join(invalid_classifications[:10])
+            more_msg = f" and {len(invalid_classifications) - 10} more" if len(invalid_classifications) > 10 else ""
+            valid_names = ", ".join(sorted(context.valid_component_names))
+            feedback_messages.append(
+                f"Invalid component names found: {invalid_str}{more_msg}. "
+                f"Valid component names are: {valid_names}. "
+                f"Please use only these component names."
+            )
+
+    if not feedback_messages:
+        logger.info("[Validation] All unassigned files correctly classified")
+        return ValidationResult(is_valid=True)
+
+    logger.warning(f"[Validation] File classification issues: {len(feedback_messages)} problems found")
+    return ValidationResult(is_valid=False, feedback_messages=feedback_messages)
+
+
+def validate_relation_component_names(result: AnalysisInsights, _context: ValidationContext) -> ValidationResult:
+    """
+    Validate that every src_name and dst_name in components_relations refers to an existing component.
+
+    When a relation references a component name that does not exist, assign_component_ids will
+    leave src_id or dst_id as an empty string, producing broken references in the output JSON.
+
+    Args:
+        result: AnalysisInsights containing components and components_relations
+        context: ValidationContext (not used but kept for interface consistency)
+
+    Returns:
+        ValidationResult with feedback listing every relation whose src_name or dst_name is unknown
+    """
+    known_names = {component.name for component in result.components}
+
+    invalid_relations: list[str] = []
+    for relation in result.components_relations:
+        unknown: list[str] = []
+        if relation.src_name not in known_names:
+            unknown.append(f"src_name='{relation.src_name}'")
+        if relation.dst_name not in known_names:
+            unknown.append(f"dst_name='{relation.dst_name}'")
+        if unknown:
+            invalid_relations.append(
+                f"({relation.src_name} -{relation.relation}-> {relation.dst_name}): {', '.join(unknown)}"
+            )
+
+    if not invalid_relations:
+        logger.info("[Validation] All relation component names refer to existing components")
+        return ValidationResult(is_valid=True)
+
+    invalid_str = "; ".join(invalid_relations)
+    known_str = ", ".join(sorted(known_names)) if known_names else "<none>"
+    feedback = (
+        f"The following relations reference component names that do not exist: {invalid_str}. "
+        f"Known component names are: {known_str}. "
+        f"Please ensure that src_name and dst_name in every relation match an existing component name exactly."
+    )
+
+    logger.warning(f"[Validation] Relations with unknown component names: {invalid_str}")
+    return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+def _build_cluster_edge_lookup(
+    cluster_results: dict[str, ClusterResult],
+    cfg_graphs: dict[str, CallGraph],
+) -> dict[str, set[tuple[int, int]]]:
+    """Build a lookup of (src_cluster_id, dst_cluster_id) edges per language."""
+    cluster_edge_lookup: dict[str, set[tuple[int, int]]] = {}
+
+    for lang, cfg in cfg_graphs.items():
+        cluster_result = cluster_results.get(lang)
+        if not cluster_result:
+            continue
+
+        node_to_cluster: dict[str, int] = {}
+        for cluster_id, nodes in cluster_result.clusters.items():
+            for node in nodes:
+                node_to_cluster[node] = cluster_id
+
+        cluster_edges: set[tuple[int, int]] = set()
+        for edge in cfg.edges:
+            src_cluster = node_to_cluster.get(edge.get_source())
+            dst_cluster = node_to_cluster.get(edge.get_destination())
+            if src_cluster is None or dst_cluster is None:
+                continue
+            cluster_edges.add((src_cluster, dst_cluster))
+
+        cluster_edge_lookup[lang] = cluster_edges
+
+    return cluster_edge_lookup
+
+
+def _check_edge_between_cluster_sets(
+    src_cluster_ids: list[int],
+    dst_cluster_ids: list[int],
+    cluster_results: dict[str, ClusterResult],
+    cfg_graphs: dict[str, CallGraph],
+    cluster_edge_lookup: dict[str, set[tuple[int, int]]] | None = None,
+) -> bool:
+    """
+    Check if there's an edge between any pair of clusters from two sets.
+
+    Args:
+        src_cluster_ids: Source cluster IDs
+        dst_cluster_ids: Destination cluster IDs
+        cluster_results: dict mapping language -> ClusterResult
+        cfg_graphs: dict mapping language -> CallGraph
+        cluster_edge_lookup: Optional precomputed (src_cluster, dst_cluster) edges per language
+
+    Returns:
+        True if any edge exists between the cluster sets
+    """
+    if not src_cluster_ids or not dst_cluster_ids:
+        return False
+
+    if cluster_edge_lookup is None:
+        cluster_edge_lookup = _build_cluster_edge_lookup(cluster_results, cfg_graphs)
+
+    src_set = set(src_cluster_ids)
+    dst_set = set(dst_cluster_ids)
+
+    for cluster_edges in cluster_edge_lookup.values():
+        for src_cluster, dst_cluster in cluster_edges:
+            if src_cluster in src_set and dst_cluster in dst_set:
+                return True
+
+    return False
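
Editor's note: these validators share a (result, context) -> ValidationResult signature, which makes them easy to chain into an LLM retry loop: run each check, collect the feedback_messages of any failures, and feed them back to the model. The sketch below is illustrative only and not part of the package; the Stub* dataclasses are hypothetical stand-ins for the pydantic models in agents/agent_responses.py (not shown in this diff) and mimic only the attributes the validators actually read.

from dataclasses import dataclass, field

from agents.validation import (
    ValidationContext,
    validate_key_entities,
    validate_relation_component_names,
)

@dataclass
class StubComponent:  # hypothetical stand-in for the real Component model
    name: str
    key_entities: list[str] = field(default_factory=list)
    source_cluster_ids: list[int] = field(default_factory=list)

@dataclass
class StubInsights:  # hypothetical stand-in for AnalysisInsights
    components: list[StubComponent] = field(default_factory=list)
    components_relations: list = field(default_factory=list)

def collect_feedback(result, context, validators) -> list[str]:
    """Run each validator and gather feedback messages for a retry prompt."""
    feedback: list[str] = []
    for validator in validators:
        outcome = validator(result, context)
        if not outcome.is_valid:
            feedback.extend(outcome.feedback_messages)
    return feedback

insights = StubInsights(components=[StubComponent(name="Parser")])  # no key entities yet
messages = collect_feedback(
    insights,
    ValidationContext(),
    [validate_key_entities, validate_relation_component_names],
)
# messages now holds the "components are missing key entities: Parser" feedback,
# ready to be appended to the next LLM request.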
caching/__init__.py
ADDED
caching/cache.py
ADDED
@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import TypeVar, Generic
+
+T = TypeVar("T")
+
+
+class BaseCache(ABC, Generic[T]):
+
+    def __init__(self, filename: str, cache_dir: Path):
+        self.cache_dir = cache_dir
+        self.file_path = self.cache_dir / filename
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+    @abstractmethod
+    def load(self) -> T | None:
+        pass
+
+    @abstractmethod
+    def store(self, data: T) -> None:
+        pass
+
+    @abstractmethod
+    def signature(self) -> str:
+        pass
+
+    @abstractmethod
+    def is_stale(self, data: T) -> bool:
+        pass
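
Editor's note: BaseCache fixes the contract every cache in the package implements (load, store, signature, is_stale), with the base constructor resolving the on-disk path and creating the cache directory. As a minimal sketch of a concrete subclass, hypothetical and not shipped in the package, a JSON-backed cache could look like this:

import json
from pathlib import Path

from caching.cache import BaseCache

class JsonCache(BaseCache[dict]):
    """Hypothetical example: persist a dict as JSON on disk."""

    def __init__(self, cache_dir: Path, version: str = "1"):
        super().__init__("example.json", cache_dir=cache_dir)
        self._version = version

    def load(self) -> dict | None:
        try:
            return json.loads(self.file_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            return None  # a missing or corrupt file is treated as a cache miss

    def store(self, data: dict) -> None:
        self.file_path.write_text(json.dumps(data, indent=2), encoding="utf-8")

    def signature(self) -> str:
        return f"example-cache-v{self._version}"

    def is_stale(self, data: dict) -> bool:
        # Invalidate when the stored signature no longer matches this configuration.
        return data.get("signature") != self.signature()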
caching/meta_cache.py
ADDED
@@ -0,0 +1,227 @@
+import hashlib
+import json
+import logging
+import sqlite3
+from collections.abc import Sequence
+from pathlib import Path
+
+from langchain_community.cache import SQLiteCache
+from langchain_core.language_models import BaseChatModel
+from langchain_core.outputs import Generation
+from pydantic import BaseModel
+
+from agents.agent_responses import MetaAnalysisInsights
+from agents.dependency_discovery import FileRole, discover_dependency_files
+from caching.cache import BaseCache
+from repo_utils import Repo, require_git_import
+from repo_utils.ignore import RepoIgnoreManager
+from utils import get_cache_dir
+
+logger = logging.getLogger(__name__)
+
+type JsonScalar = str | int | float | bool | None
+
+_README_PATTERNS: tuple[str, ...] = (
+    "README.md",
+    "README.rst",
+    "README.txt",
+    "README",
+    "readme.md",
+)
+
+_CACHE_WATCH_ROLES: frozenset[FileRole] = frozenset({FileRole.MANIFEST, FileRole.CONFIG})
+
+
+class MetaCacheRecord(BaseModel):
+    meta: MetaAnalysisInsights
+    base_commit: str
+    watch_files: list[str]
+    watch_state_hash: str | None = None
+
+
+class MetaCache(BaseCache[MetaCacheRecord]):
+    """SQLite-backed cache for MetaAgent analysis results.
+
+    Watches dependency manifests, config files, and root-level READMEs.
+    Keyed by a composite of project name, prompt version hash, and LLM
+    configuration so that any change to prompts or models automatically
+    produces a cache miss.
+    """
+
+    def __init__(
+        self,
+        repo_dir: Path,
+        ignore_manager: RepoIgnoreManager,
+        project_name: str,
+        agent_llm: BaseChatModel,
+        parsing_llm: BaseChatModel,
+        prompt_material: str,
+    ):
+        super().__init__("meta_agent_llm.sqlite", cache_dir=get_cache_dir(repo_dir))
+        self._repo_dir = repo_dir
+        self._ignore_manager = ignore_manager
+        self._prompt_key = self._build_prompt_key(project_name, prompt_material)
+        self._llm_key = self._build_llm_key(agent_llm, parsing_llm)
+
+    @staticmethod
+    def _llm_signature(llm: BaseChatModel) -> str:
+        model_id = None
+        for attr in ("model_name", "model", "model_id"):
+            value = getattr(llm, attr, None)
+            if isinstance(value, str) and value:
+                model_id = value
+                break
+
+        config: dict[str, JsonScalar] = {}
+        for attr in ("temperature", "max_tokens", "top_p", "timeout", "max_retries"):
+            value = getattr(llm, attr, None)
+            if isinstance(value, (str, int, float, bool)) or value is None:
+                config[attr] = value
+
+        payload = {
+            "provider": f"{type(llm).__module__}.{type(llm).__name__}",
+            "model_id": model_id or type(llm).__name__,
+            "config": config,
+        }
+        return json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
+
+    def _build_prompt_key(self, project_name: str, prompt_material: str) -> str:
+        prompt_hash = hashlib.sha256(prompt_material.encode("utf-8")).hexdigest()
+        payload = {
+            "kind": "meta_agent_cache",
+            "project_name": project_name,
+            "prompt_version": prompt_hash,
+        }
+        return json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
+
+    def _build_llm_key(self, agent_llm: BaseChatModel, parsing_llm: BaseChatModel) -> str:
+        payload = {
+            "kind": "meta_agent_llm_cache",
+            "agent": self._llm_signature(agent_llm),
+            "parser": self._llm_signature(parsing_llm),
+        }
+        return json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
+
+    def signature(self) -> str:
+        """Return the composite cache key identifying this configuration."""
+        return self._prompt_key + "|" + self._llm_key
+
+    def _open_sqlite(self) -> SQLiteCache | None:
+        try:
+            self.cache_dir.mkdir(parents=True, exist_ok=True)
+            return SQLiteCache(database_path=str(self.file_path))
+        except (OSError, sqlite3.Error) as e:
+            logger.warning("Meta cache disabled: %s", e)
+            return None
+
+    def load(self) -> MetaCacheRecord | None:
+        cache = self._open_sqlite()
+        if cache is None:
+            return None
+        raw: Sequence[Generation] | None = cache.lookup(self._prompt_key, self._llm_key)
+        if raw is None:
+            return None
+        if len(raw) > 1:
+            logger.warning("Meta cache returned %d generations; using first", len(raw))
+        try:
+            return MetaCacheRecord.model_validate_json(raw[0].text)
+        except Exception:
+            return None
+
+    def store(self, data: MetaCacheRecord) -> None:
+        cache = self._open_sqlite()
+        if cache is None:
+            return
+        cache.clear()
+        cache.update(self._prompt_key, self._llm_key, [Generation(text=data.model_dump_json())])
+
+    def clear(self) -> None:
+        cache = self._open_sqlite()
+        if cache is not None:
+            cache.clear()
+
+    @require_git_import(default=[])
+    def discover_watch_files(self) -> list[str]:
+        """Return git-known files whose changes should invalidate this cache.
+
+        Includes dependency manifests and configs (not locks) and root-level
+        README files that the meta agent reads for project context.
+        """
+        try:
+            repo = Repo(self._repo_dir)
+            tracked_files = set(repo.git.ls_files().splitlines())
+            untracked_files = {
+                Path(path).as_posix()
+                for path in repo.untracked_files
+                if not self._ignore_manager.should_ignore(Path(path))
+            }
+            git_known_files = tracked_files | untracked_files
+        except Exception as e:
+            logger.warning("Unable to discover git file set for meta cache watch list: %s", e)
+            return []
+
+        watch: set[str] = set()
+
+        for discovered in discover_dependency_files(self._repo_dir, self._ignore_manager, roles=_CACHE_WATCH_ROLES):
+            relative_path = discovered.path.relative_to(self._repo_dir).as_posix()
+            if relative_path in git_known_files:
+                watch.add(relative_path)
+
+        for pattern in _README_PATTERNS:
+            if (self._repo_dir / pattern).is_file() and pattern in git_known_files:
+                watch.add(pattern)
+
+        return sorted(watch)
+
+    @staticmethod
+    def _fingerprint_file(path: Path) -> bytes | None:
+        try:
+            digest = hashlib.sha256()
+            with path.open("rb") as handle:
+                for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+                    digest.update(chunk)
+            return digest.digest()
+        except OSError:
+            return None
+
+    def compute_watch_state_hash(self, watch_files: Sequence[str]) -> str | None:
+        """Return a deterministic fingerprint for watched file contents."""
+        if not watch_files:
+            return None
+
+        digest = hashlib.sha256()
+        for relative_path in sorted(set(watch_files)):
+            file_digest = self._fingerprint_file(self._repo_dir / relative_path)
+            if file_digest is None:
+                logger.warning("Unable to fingerprint meta cache watch file: %s", relative_path)
+                return None
+            digest.update(relative_path.encode("utf-8"))
+            digest.update(b"\0")
+            digest.update(file_digest)
+            digest.update(b"\n")
+
+        return digest.hexdigest()
+
+    def is_stale(self, record: MetaCacheRecord) -> bool:
+        """Return True if watched file fingerprints differ from the cached record."""
+        if not record.watch_files:
+            return False
+
+        if not record.watch_state_hash:
+            logger.info("Meta cache record is missing watch-state fingerprint; recomputing once for migration")
+            return True
+
+        expected_watch_files = sorted(set(record.watch_files))
+        discovered_watch_files = self.discover_watch_files()
+        if discovered_watch_files:
+            normalized_discovered = sorted(set(discovered_watch_files))
+            if normalized_discovered != expected_watch_files:
+                logger.info("Meta cache watch-file set changed; recomputing metadata analysis")
+                return True
+            expected_watch_files = normalized_discovered
+
+        current_watch_hash = self.compute_watch_state_hash(expected_watch_files)
+        if current_watch_hash is None:
+            return True
+
+        return current_watch_hash != record.watch_state_hash
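
Editor's note: the intended lifecycle, look up a record, verify its watch-file fingerprint, and recompute on any miss, follows directly from the API above. The sketch below is a hypothetical caller, not the package's actual call site; `compute` stands in for whatever runs the MetaAgent and returns a fresh MetaCacheRecord.

from caching.meta_cache import MetaCache, MetaCacheRecord

def get_meta_analysis(cache: MetaCache, compute):
    """Return cached meta analysis if still fresh, else recompute and store.

    `compute` is a hypothetical zero-argument callable producing a new
    MetaCacheRecord (including base_commit and the meta insights).
    """
    record = cache.load()
    if record is not None and not cache.is_stale(record):
        # Cache hit: prompts, models, and watched files are all unchanged.
        return record.meta

    record = compute()
    # Refresh the watch list and fingerprint so the next is_stale() check
    # compares against the state the record was computed from.
    record.watch_files = cache.discover_watch_files()
    record.watch_state_hash = cache.compute_watch_state_hash(record.watch_files)
    cache.store(record)
    return record.meta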