codeboarding-0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. agents/__init__.py +0 -0
  2. agents/abstraction_agent.py +150 -0
  3. agents/agent.py +467 -0
  4. agents/agent_responses.py +363 -0
  5. agents/cluster_methods_mixin.py +281 -0
  6. agents/constants.py +13 -0
  7. agents/dependency_discovery.py +159 -0
  8. agents/details_agent.py +174 -0
  9. agents/llm_config.py +309 -0
  10. agents/meta_agent.py +105 -0
  11. agents/planner_agent.py +105 -0
  12. agents/prompts/__init__.py +85 -0
  13. agents/prompts/abstract_prompt_factory.py +63 -0
  14. agents/prompts/claude_prompts.py +381 -0
  15. agents/prompts/deepseek_prompts.py +389 -0
  16. agents/prompts/gemini_flash_prompts.py +362 -0
  17. agents/prompts/glm_prompts.py +407 -0
  18. agents/prompts/gpt_prompts.py +470 -0
  19. agents/prompts/kimi_prompts.py +400 -0
  20. agents/prompts/prompt_factory.py +179 -0
  21. agents/tools/__init__.py +8 -0
  22. agents/tools/base.py +96 -0
  23. agents/tools/get_external_deps.py +47 -0
  24. agents/tools/get_method_invocations.py +47 -0
  25. agents/tools/read_cfg.py +60 -0
  26. agents/tools/read_docs.py +132 -0
  27. agents/tools/read_file.py +90 -0
  28. agents/tools/read_file_structure.py +156 -0
  29. agents/tools/read_git_diff.py +131 -0
  30. agents/tools/read_packages.py +60 -0
  31. agents/tools/read_source.py +105 -0
  32. agents/tools/read_structure.py +49 -0
  33. agents/tools/toolkit.py +119 -0
  34. agents/validation.py +383 -0
  35. caching/__init__.py +4 -0
  36. caching/cache.py +29 -0
  37. caching/meta_cache.py +227 -0
  38. codeboarding-0.9.0.dist-info/METADATA +223 -0
  39. codeboarding-0.9.0.dist-info/RECORD +126 -0
  40. codeboarding-0.9.0.dist-info/WHEEL +5 -0
  41. codeboarding-0.9.0.dist-info/entry_points.txt +3 -0
  42. codeboarding-0.9.0.dist-info/licenses/LICENSE +21 -0
  43. codeboarding-0.9.0.dist-info/top_level.txt +18 -0
  44. core/__init__.py +101 -0
  45. core/plugin_loader.py +46 -0
  46. core/protocols.py +27 -0
  47. core/registry.py +46 -0
  48. diagram_analysis/__init__.py +4 -0
  49. diagram_analysis/analysis_json.py +346 -0
  50. diagram_analysis/diagram_generator.py +486 -0
  51. diagram_analysis/file_coverage.py +212 -0
  52. diagram_analysis/incremental/__init__.py +63 -0
  53. diagram_analysis/incremental/component_checker.py +236 -0
  54. diagram_analysis/incremental/file_manager.py +217 -0
  55. diagram_analysis/incremental/impact_analyzer.py +238 -0
  56. diagram_analysis/incremental/io_utils.py +281 -0
  57. diagram_analysis/incremental/models.py +72 -0
  58. diagram_analysis/incremental/path_patching.py +164 -0
  59. diagram_analysis/incremental/reexpansion.py +166 -0
  60. diagram_analysis/incremental/scoped_analysis.py +227 -0
  61. diagram_analysis/incremental/updater.py +464 -0
  62. diagram_analysis/incremental/validation.py +48 -0
  63. diagram_analysis/manifest.py +152 -0
  64. diagram_analysis/version.py +6 -0
  65. duckdb_crud.py +125 -0
  66. github_action.py +172 -0
  67. health/__init__.py +3 -0
  68. health/checks/__init__.py +11 -0
  69. health/checks/circular_deps.py +48 -0
  70. health/checks/cohesion.py +93 -0
  71. health/checks/coupling.py +140 -0
  72. health/checks/function_size.py +85 -0
  73. health/checks/god_class.py +167 -0
  74. health/checks/inheritance.py +104 -0
  75. health/checks/instability.py +77 -0
  76. health/checks/unused_code_diagnostics.py +338 -0
  77. health/config.py +172 -0
  78. health/constants.py +19 -0
  79. health/models.py +186 -0
  80. health/runner.py +236 -0
  81. install.py +518 -0
  82. logging_config.py +105 -0
  83. main.py +529 -0
  84. monitoring/__init__.py +12 -0
  85. monitoring/callbacks.py +163 -0
  86. monitoring/context.py +158 -0
  87. monitoring/mixin.py +16 -0
  88. monitoring/paths.py +47 -0
  89. monitoring/stats.py +50 -0
  90. monitoring/writers.py +172 -0
  91. output_generators/__init__.py +0 -0
  92. output_generators/html.py +163 -0
  93. output_generators/html_template.py +382 -0
  94. output_generators/markdown.py +140 -0
  95. output_generators/mdx.py +171 -0
  96. output_generators/sphinx.py +175 -0
  97. repo_utils/__init__.py +277 -0
  98. repo_utils/change_detector.py +289 -0
  99. repo_utils/errors.py +6 -0
  100. repo_utils/git_diff.py +74 -0
  101. repo_utils/ignore.py +341 -0
  102. static_analyzer/__init__.py +335 -0
  103. static_analyzer/analysis_cache.py +699 -0
  104. static_analyzer/analysis_result.py +269 -0
  105. static_analyzer/cluster_change_analyzer.py +391 -0
  106. static_analyzer/cluster_helpers.py +79 -0
  107. static_analyzer/constants.py +166 -0
  108. static_analyzer/git_diff_analyzer.py +224 -0
  109. static_analyzer/graph.py +746 -0
  110. static_analyzer/incremental_orchestrator.py +671 -0
  111. static_analyzer/java_config_scanner.py +232 -0
  112. static_analyzer/java_utils.py +227 -0
  113. static_analyzer/lsp_client/__init__.py +12 -0
  114. static_analyzer/lsp_client/client.py +1642 -0
  115. static_analyzer/lsp_client/diagnostics.py +62 -0
  116. static_analyzer/lsp_client/java_client.py +517 -0
  117. static_analyzer/lsp_client/language_settings.py +97 -0
  118. static_analyzer/lsp_client/typescript_client.py +235 -0
  119. static_analyzer/programming_language.py +152 -0
  120. static_analyzer/reference_resolve_mixin.py +166 -0
  121. static_analyzer/scanner.py +95 -0
  122. static_analyzer/typescript_config_scanner.py +54 -0
  123. tool_registry.py +433 -0
  124. user_config.py +134 -0
  125. utils.py +56 -0
  126. vscode_constants.py +124 -0
agents/validation.py ADDED
@@ -0,0 +1,383 @@
+ """Validation utilities for LLM agent outputs."""
+
+ import logging
+ import os
+ from dataclasses import dataclass, field
+
+ from agents.agent_responses import AnalysisInsights, ClusterAnalysis, ComponentFiles
+ from repo_utils import normalize_path
+ from static_analyzer.graph import CallGraph, ClusterResult
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class ValidationContext:
+     """
+     This class is used to provide the necessary context for validating different LLM steps.
+     It encapsulates all relevant information required by validation routines to ensure that each step in the LLM pipeline
+     is checked against the expected criteria.
+     """
+
+     cluster_results: dict[str, ClusterResult] = field(default_factory=dict)
+     cfg_graphs: dict[str, CallGraph] = field(default_factory=dict)  # For edge checking
+     expected_cluster_ids: set[int] = field(default_factory=set)
+     expected_files: set[str] = field(default_factory=set)
+     valid_component_names: set[str] = field(default_factory=set)  # For file classification validation
+     repo_dir: str | None = None  # For path normalization
+
+
+ @dataclass
+ class ValidationResult:
+     """Result of a validation check."""
+
+     is_valid: bool
+     feedback_messages: list[str] = field(default_factory=list)
+
+
+ def validate_cluster_coverage(result: ClusterAnalysis, context: ValidationContext) -> ValidationResult:
+     """
+     Validate that all expected clusters are represented in the ClusterAnalysis.
+
+     Args:
+         result: ClusterAnalysis containing cluster_components
+         context: ValidationContext with expected_cluster_ids
+
+     Returns:
+         ValidationResult with feedback for missing clusters
+     """
+     if not context.expected_cluster_ids:
+         logger.warning("[Validation] No expected cluster IDs provided for coverage validation")
+         return ValidationResult(is_valid=True)
+
+     # Extract all cluster IDs from the result
+     result_cluster_ids = set()
+     for component in result.cluster_components:
+         result_cluster_ids.update(component.cluster_ids)
+
+     # Find missing clusters
+     missing_clusters = context.expected_cluster_ids - result_cluster_ids
+
+     if not missing_clusters:
+         logger.info("[Validation] All clusters are represented in the ClusterAnalysis")
+         return ValidationResult(is_valid=True)
+
+     # Build feedback message
+     missing_str = ", ".join(str(cid) for cid in sorted(missing_clusters))
+     feedback = (
+         f"The following cluster IDs are missing from the analysis: {missing_str}. "
+         f"Please ensure all clusters are assigned to a component or create new components for them."
+     )
+
+     logger.warning(f"[Validation] Missing clusters: {missing_str}")
+     return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+ def validate_component_relationships(result: AnalysisInsights, context: ValidationContext) -> ValidationResult:
+     """
+     Validate that component relationships have corresponding edges in the cluster graph.
+
+     Args:
+         result: AnalysisInsights containing components and components_relations
+         context: ValidationContext with cluster_results and cfg_graphs
+
+     Returns:
+         ValidationResult with feedback for invalid relationships
+     """
+     if not context.cfg_graphs or not result.components_relations:
+         logger.warning("[Validation] No CFG graphs or component relationships provided for relationship validation")
+         return ValidationResult(is_valid=True)
+
+     # Build component name -> source_cluster_ids mapping
+     component_clusters: dict[str, list[int]] = {}
+     for component in result.components:
+         component_clusters[component.name] = component.source_cluster_ids
+
+     cluster_edge_lookup = _build_cluster_edge_lookup(context.cluster_results, context.cfg_graphs)
+
+     invalid_relations: list[str] = []
+
+     for relation in result.components_relations:
+         src_clusters = component_clusters.get(relation.src_name, [])
+         dst_clusters = component_clusters.get(relation.dst_name, [])
+
+         if not src_clusters or not dst_clusters:
+             continue
+
+         # Check if any cluster pair has an edge
+         has_edge = _check_edge_between_cluster_sets(
+             src_clusters,
+             dst_clusters,
+             context.cluster_results,
+             context.cfg_graphs,
+             cluster_edge_lookup,
+         )
+
+         if not has_edge:
+             invalid_relations.append(f"({relation.src_name} -> {relation.dst_name})")
+
+     if not invalid_relations:
+         logger.info("[Validation] All component relationships have backing edges")
+         return ValidationResult(is_valid=True)
+
+     # Build feedback message
+     invalid_str = ", ".join(invalid_relations)
+     feedback = (
+         f"The following component relationships lack backing edges in the cluster graph: {invalid_str}. "
+         f"Please double-check if these components are actually related. If there is no direct edge between "
+         f"their clusters, the relationship may be indirect or incorrect."
+     )
+
+     logger.warning(f"[Validation] Invalid relationships: {invalid_str}")
+     return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+ def validate_key_entities(result: AnalysisInsights, context: ValidationContext) -> ValidationResult:
+     """
+     Validate that every component in AnalysisInsights has at least one key_entity assigned.
+
+     Args:
+         result: AnalysisInsights containing components
+         context: ValidationContext (not used but kept for interface consistency)
+
+     Returns:
+         ValidationResult with feedback for components missing key entities
+     """
+     components_without_key_entities: list[str] = []
+
+     for component in result.components:
+         if not component.key_entities or len(component.key_entities) == 0:
+             components_without_key_entities.append(component.name)
+
+     if not components_without_key_entities:
+         logger.info("[Validation] All components have key entities assigned")
+         return ValidationResult(is_valid=True)
+
+     # Build feedback message
+     missing_str = ", ".join(components_without_key_entities)
+     feedback = (
+         f"The following components are missing key entities: {missing_str}. "
+         f"Every component must have at least one key entity (critical class or method) "
+         f"that represents its core functionality. Please identify and add 2-5 key entities "
+         f"for each component."
+     )
+
+     logger.warning(f"[Validation] Components without key entities: {missing_str}")
+     return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+ def validate_cluster_ids_populated(result: AnalysisInsights, context: ValidationContext) -> ValidationResult:
+     """
+     Validate that every cluster is assigned to at least one component.
+
+     Args:
+         result: AnalysisInsights containing components
+         context: ValidationContext with cluster_results to get available cluster IDs
+
+     Returns:
+         ValidationResult with feedback for unassigned clusters
+     """
+     if not context.cluster_results:
+         logger.warning("[Validation] No cluster results provided for cluster ID validation")
+         return ValidationResult(is_valid=True)
+
+     all_cluster_ids: set[int] = set()
+     for lang_result in context.cluster_results.values():
+         all_cluster_ids.update(lang_result.get_cluster_ids())
+
+     if not all_cluster_ids:
+         logger.warning("[Validation] No cluster IDs available for cluster ID validation")
+         return ValidationResult(is_valid=True)
+
+     assigned_cluster_ids: set[int] = set()
+     for component in result.components:
+         assigned_cluster_ids.update(component.source_cluster_ids or [])
+
+     unassigned_clusters = all_cluster_ids - assigned_cluster_ids
+
+     if not unassigned_clusters:
+         logger.info("[Validation] All clusters are assigned to components")
+         return ValidationResult(is_valid=True)
+
+     missing_str = ", ".join(str(cid) for cid in sorted(unassigned_clusters))
+     feedback = (
+         f"The following cluster IDs are not assigned to any component: {missing_str}. "
+         f"Please assign every cluster to a component based on which code clusters belong to it."
+     )
+
+     logger.warning(f"[Validation] Unassigned clusters: {missing_str}")
+     return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+ def validate_file_classifications(result: ComponentFiles, context: ValidationContext) -> ValidationResult:
+     """
+     Validate that all unassigned files were classified to valid component names.
+
+     This validator is used for _classify_unassigned_files_with_llm to ensure:
+     1. All input files are present in the result
+     2. All component names are valid (exist in valid_component_names)
+
+     Args:
+         result: ComponentFiles with file_paths containing FileClassification objects
+         context: ValidationContext with expected_files (unassigned files) and valid_component_names
+
+     Returns:
+         ValidationResult with feedback for missing files or invalid component names
+     """
+     if not context.expected_files:
+         logger.warning("[Validation] No expected files provided for file classification validation")
+         return ValidationResult(is_valid=True)
+
+     feedback_messages = []
+
+     # Get classified file paths from result
+     classified_files = {normalize_path(fc.file_path, context.repo_dir) for fc in result.file_paths}
+
+     # Normalize paths for comparison
+     expected_files_normalized = {normalize_path(file_path, context.repo_dir) for file_path in context.expected_files}
+
+     # Check 1: Are all unassigned files classified?
+     missing_files = expected_files_normalized - classified_files
+     if missing_files:
+         missing_list = sorted(str(f) for f in missing_files)[:10]
+         missing_str = ", ".join(missing_list)
+         more_msg = f" and {len(missing_files) - 10} more" if len(missing_files) > 10 else ""
+         feedback_messages.append(
+             f"The following files were not classified: {missing_str}{more_msg}. "
+             f"Please ensure all files are assigned to a component."
+         )
+
+     # Check 2: Are all component names valid?
+     if context.valid_component_names:
+         invalid_classifications = []
+         for fc in result.file_paths:
+             if fc.component_name not in context.valid_component_names:
+                 invalid_classifications.append(f"{fc.file_path} -> {fc.component_name}")
+
+         if invalid_classifications:
+             invalid_str = ", ".join(invalid_classifications[:10])
+             more_msg = f" and {len(invalid_classifications) - 10} more" if len(invalid_classifications) > 10 else ""
+             valid_names = ", ".join(sorted(context.valid_component_names))
+             feedback_messages.append(
+                 f"Invalid component names found: {invalid_str}{more_msg}. "
+                 f"Valid component names are: {valid_names}. "
+                 f"Please use only these component names."
+             )
+
+     if not feedback_messages:
+         logger.info("[Validation] All unassigned files correctly classified")
+         return ValidationResult(is_valid=True)
+
+     logger.warning(f"[Validation] File classification issues: {len(feedback_messages)} problems found")
+     return ValidationResult(is_valid=False, feedback_messages=feedback_messages)
+
+
+ def validate_relation_component_names(result: AnalysisInsights, _context: ValidationContext) -> ValidationResult:
+     """
+     Validate that every src_name and dst_name in components_relations refers to an existing component.
+
+     When a relation references a component name that does not exist, assign_component_ids will
+     leave src_id or dst_id as an empty string, producing broken references in the output JSON.
+
+     Args:
+         result: AnalysisInsights containing components and components_relations
+         context: ValidationContext (not used but kept for interface consistency)
+
+     Returns:
+         ValidationResult with feedback listing every relation whose src_name or dst_name is unknown
+     """
+     known_names = {component.name for component in result.components}
+
+     invalid_relations: list[str] = []
+     for relation in result.components_relations:
+         unknown: list[str] = []
+         if relation.src_name not in known_names:
+             unknown.append(f"src_name='{relation.src_name}'")
+         if relation.dst_name not in known_names:
+             unknown.append(f"dst_name='{relation.dst_name}'")
+         if unknown:
+             invalid_relations.append(
+                 f"({relation.src_name} -{relation.relation}-> {relation.dst_name}): {', '.join(unknown)}"
+             )
+
+     if not invalid_relations:
+         logger.info("[Validation] All relation component names refer to existing components")
+         return ValidationResult(is_valid=True)
+
+     invalid_str = "; ".join(invalid_relations)
+     known_str = ", ".join(sorted(known_names)) if known_names else "<none>"
+     feedback = (
+         f"The following relations reference component names that do not exist: {invalid_str}. "
+         f"Known component names are: {known_str}. "
+         f"Please ensure that src_name and dst_name in every relation match an existing component name exactly."
+     )
+
+     logger.warning(f"[Validation] Relations with unknown component names: {invalid_str}")
+     return ValidationResult(is_valid=False, feedback_messages=[feedback])
+
+
+ def _build_cluster_edge_lookup(
+     cluster_results: dict[str, ClusterResult],
+     cfg_graphs: dict[str, CallGraph],
+ ) -> dict[str, set[tuple[int, int]]]:
+     """Build a lookup of (src_cluster_id, dst_cluster_id) edges per language."""
+     cluster_edge_lookup: dict[str, set[tuple[int, int]]] = {}
+
+     for lang, cfg in cfg_graphs.items():
+         cluster_result = cluster_results.get(lang)
+         if not cluster_result:
+             continue
+
+         node_to_cluster: dict[str, int] = {}
+         for cluster_id, nodes in cluster_result.clusters.items():
+             for node in nodes:
+                 node_to_cluster[node] = cluster_id
+
+         cluster_edges: set[tuple[int, int]] = set()
+         for edge in cfg.edges:
+             src_cluster = node_to_cluster.get(edge.get_source())
+             dst_cluster = node_to_cluster.get(edge.get_destination())
+             if src_cluster is None or dst_cluster is None:
+                 continue
+             cluster_edges.add((src_cluster, dst_cluster))
+
+         cluster_edge_lookup[lang] = cluster_edges
+
+     return cluster_edge_lookup
+
+
+ def _check_edge_between_cluster_sets(
+     src_cluster_ids: list[int],
+     dst_cluster_ids: list[int],
+     cluster_results: dict[str, ClusterResult],
+     cfg_graphs: dict[str, CallGraph],
+     cluster_edge_lookup: dict[str, set[tuple[int, int]]] | None = None,
+ ) -> bool:
+     """
+     Check if there's an edge between any pair of clusters from two sets.
+
+     Args:
+         src_cluster_ids: Source cluster IDs
+         dst_cluster_ids: Destination cluster IDs
+         cluster_results: dict mapping language -> ClusterResult
+         cfg_graphs: dict mapping language -> CallGraph
+         cluster_edge_lookup: Optional precomputed (src_cluster, dst_cluster) edges per language
+
+     Returns:
+         True if any edge exists between the cluster sets
+     """
+     if not src_cluster_ids or not dst_cluster_ids:
+         return False
+
+     if cluster_edge_lookup is None:
+         cluster_edge_lookup = _build_cluster_edge_lookup(cluster_results, cfg_graphs)
+
+     src_set = set(src_cluster_ids)
+     dst_set = set(dst_cluster_ids)
+
+     for cluster_edges in cluster_edge_lookup.values():
+         for src_cluster, dst_cluster in cluster_edges:
+             if src_cluster in src_set and dst_cluster in dst_set:
+                 return True
+
+     return False
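
All of these validators share one shape: a pure function over a result object and a ValidationContext, returning a ValidationResult whose feedback_messages can be sent back to the LLM on retry. A minimal sketch of how a caller might compose them (the run_validators helper is illustrative wiring, not an API shipped in this package):

# Illustrative only -- run_validators is not a codeboarding API; it just shows
# how the (result, context) -> ValidationResult convention composes.
from agents.agent_responses import AnalysisInsights
from agents.validation import (
    ValidationContext,
    ValidationResult,
    validate_key_entities,
    validate_relation_component_names,
)


def run_validators(result: AnalysisInsights, context: ValidationContext) -> ValidationResult:
    """Run several validators and merge their feedback into one retry prompt."""
    feedback: list[str] = []
    for validator in (validate_key_entities, validate_relation_component_names):
        outcome = validator(result, context)
        if not outcome.is_valid:
            feedback.extend(outcome.feedback_messages)
    return ValidationResult(is_valid=not feedback, feedback_messages=feedback)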
caching/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from caching.cache import BaseCache
+ from caching.meta_cache import MetaCache, MetaCacheRecord
+
+ __all__ = ["BaseCache", "MetaCache", "MetaCacheRecord"]
caching/cache.py ADDED
@@ -0,0 +1,29 @@
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import TypeVar, Generic
+
+ T = TypeVar("T")
+
+
+ class BaseCache(ABC, Generic[T]):
+
+     def __init__(self, filename: str, cache_dir: Path):
+         self.cache_dir = cache_dir
+         self.file_path = self.cache_dir / filename
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+     @abstractmethod
+     def load(self) -> T | None:
+         pass
+
+     @abstractmethod
+     def store(self, data: T) -> None:
+         pass
+
+     @abstractmethod
+     def signature(self) -> str:
+         pass
+
+     @abstractmethod
+     def is_stale(self, data: T) -> bool:
+         pass
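
BaseCache pins down the contract that concrete caches implement: the constructor guarantees the cache directory exists, and subclasses supply load, store, signature, and is_stale. A toy JSON-backed subclass, hypothetical and for illustration only (the package's real implementation of this contract is MetaCache, below):

import json
from pathlib import Path

from caching.cache import BaseCache


# Hypothetical subclass for illustration; it is not shipped in the package.
class JsonCache(BaseCache[dict]):
    def load(self) -> dict | None:
        try:
            return json.loads(self.file_path.read_text())
        except (OSError, json.JSONDecodeError):
            return None

    def store(self, data: dict) -> None:
        self.file_path.write_text(json.dumps(data))

    def signature(self) -> str:
        return "json-cache-v1"  # toy key; real caches derive this from config

    def is_stale(self, data: dict) -> bool:
        return False  # toy cache: never considered stale


cache = JsonCache("state.json", Path(".cache"))
cache.store({"answer": 42})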
caching/meta_cache.py ADDED
@@ -0,0 +1,227 @@
+ import hashlib
+ import json
+ import logging
+ import sqlite3
+ from collections.abc import Sequence
+ from pathlib import Path
+
+ from langchain_community.cache import SQLiteCache
+ from langchain_core.language_models import BaseChatModel
+ from langchain_core.outputs import Generation
+ from pydantic import BaseModel
+
+ from agents.agent_responses import MetaAnalysisInsights
+ from agents.dependency_discovery import FileRole, discover_dependency_files
+ from caching.cache import BaseCache
+ from repo_utils import Repo, require_git_import
+ from repo_utils.ignore import RepoIgnoreManager
+ from utils import get_cache_dir
+
+ logger = logging.getLogger(__name__)
+
+ type JsonScalar = str | int | float | bool | None
+
+ _README_PATTERNS: tuple[str, ...] = (
+     "README.md",
+     "README.rst",
+     "README.txt",
+     "README",
+     "readme.md",
+ )
+
+ _CACHE_WATCH_ROLES: frozenset[FileRole] = frozenset({FileRole.MANIFEST, FileRole.CONFIG})
+
+
+ class MetaCacheRecord(BaseModel):
+     meta: MetaAnalysisInsights
+     base_commit: str
+     watch_files: list[str]
+     watch_state_hash: str | None = None
+
+
+ class MetaCache(BaseCache[MetaCacheRecord]):
+     """SQLite-backed cache for MetaAgent analysis results.
+
+     Watches dependency manifests, config files, and root-level READMEs.
+     Keyed by a composite of project name, prompt version hash, and LLM
+     configuration so that any change to prompts or models automatically
+     produces a cache miss.
+     """
+
+     def __init__(
+         self,
+         repo_dir: Path,
+         ignore_manager: RepoIgnoreManager,
+         project_name: str,
+         agent_llm: BaseChatModel,
+         parsing_llm: BaseChatModel,
+         prompt_material: str,
+     ):
+         super().__init__("meta_agent_llm.sqlite", cache_dir=get_cache_dir(repo_dir))
+         self._repo_dir = repo_dir
+         self._ignore_manager = ignore_manager
+         self._prompt_key = self._build_prompt_key(project_name, prompt_material)
+         self._llm_key = self._build_llm_key(agent_llm, parsing_llm)
+
+     @staticmethod
+     def _llm_signature(llm: BaseChatModel) -> str:
+         model_id = None
+         for attr in ("model_name", "model", "model_id"):
+             value = getattr(llm, attr, None)
+             if isinstance(value, str) and value:
+                 model_id = value
+                 break
+
+         config: dict[str, JsonScalar] = {}
+         for attr in ("temperature", "max_tokens", "top_p", "timeout", "max_retries"):
+             value = getattr(llm, attr, None)
+             if isinstance(value, (str, int, float, bool)) or value is None:
+                 config[attr] = value
+
+         payload = {
+             "provider": f"{type(llm).__module__}.{type(llm).__name__}",
+             "model_id": model_id or type(llm).__name__,
+             "config": config,
+         }
+         return json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
+
+     def _build_prompt_key(self, project_name: str, prompt_material: str) -> str:
+         prompt_hash = hashlib.sha256(prompt_material.encode("utf-8")).hexdigest()
+         payload = {
+             "kind": "meta_agent_cache",
+             "project_name": project_name,
+             "prompt_version": prompt_hash,
+         }
+         return json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
+
+     def _build_llm_key(self, agent_llm: BaseChatModel, parsing_llm: BaseChatModel) -> str:
+         payload = {
+             "kind": "meta_agent_llm_cache",
+             "agent": self._llm_signature(agent_llm),
+             "parser": self._llm_signature(parsing_llm),
+         }
+         return json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
+
+     def signature(self) -> str:
+         """Return the composite cache key identifying this configuration."""
+         return self._prompt_key + "|" + self._llm_key
+
+     def _open_sqlite(self) -> SQLiteCache | None:
+         try:
+             self.cache_dir.mkdir(parents=True, exist_ok=True)
+             return SQLiteCache(database_path=str(self.file_path))
+         except (OSError, sqlite3.Error) as e:
+             logger.warning("Meta cache disabled: %s", e)
+             return None
+
+     def load(self) -> MetaCacheRecord | None:
+         cache = self._open_sqlite()
+         if cache is None:
+             return None
+         raw: Sequence[Generation] | None = cache.lookup(self._prompt_key, self._llm_key)
+         if raw is None:
+             return None
+         if len(raw) > 1:
+             logger.warning("Meta cache returned %d generations; using first", len(raw))
+         try:
+             return MetaCacheRecord.model_validate_json(raw[0].text)
+         except Exception:
+             return None
+
+     def store(self, data: MetaCacheRecord) -> None:
+         cache = self._open_sqlite()
+         if cache is None:
+             return
+         cache.clear()
+         cache.update(self._prompt_key, self._llm_key, [Generation(text=data.model_dump_json())])
+
+     def clear(self) -> None:
+         cache = self._open_sqlite()
+         if cache is not None:
+             cache.clear()
+
+     @require_git_import(default=[])
+     def discover_watch_files(self) -> list[str]:
+         """Return git-known files whose changes should invalidate this cache.
+
+         Includes dependency manifests and configs (not locks) and root-level
+         README files that the meta agent reads for project context.
+         """
+         try:
+             repo = Repo(self._repo_dir)
+             tracked_files = set(repo.git.ls_files().splitlines())
+             untracked_files = {
+                 Path(path).as_posix()
+                 for path in repo.untracked_files
+                 if not self._ignore_manager.should_ignore(Path(path))
+             }
+             git_known_files = tracked_files | untracked_files
+         except Exception as e:
+             logger.warning("Unable to discover git file set for meta cache watch list: %s", e)
+             return []
+
+         watch: set[str] = set()
+
+         for discovered in discover_dependency_files(self._repo_dir, self._ignore_manager, roles=_CACHE_WATCH_ROLES):
+             relative_path = discovered.path.relative_to(self._repo_dir).as_posix()
+             if relative_path in git_known_files:
+                 watch.add(relative_path)
+
+         for pattern in _README_PATTERNS:
+             if (self._repo_dir / pattern).is_file() and pattern in git_known_files:
+                 watch.add(pattern)
+
+         return sorted(watch)
+
+     @staticmethod
+     def _fingerprint_file(path: Path) -> bytes | None:
+         try:
+             digest = hashlib.sha256()
+             with path.open("rb") as handle:
+                 for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+                     digest.update(chunk)
+             return digest.digest()
+         except OSError:
+             return None
+
+     def compute_watch_state_hash(self, watch_files: Sequence[str]) -> str | None:
+         """Return a deterministic fingerprint for watched file contents."""
+         if not watch_files:
+             return None
+
+         digest = hashlib.sha256()
+         for relative_path in sorted(set(watch_files)):
+             file_digest = self._fingerprint_file(self._repo_dir / relative_path)
+             if file_digest is None:
+                 logger.warning("Unable to fingerprint meta cache watch file: %s", relative_path)
+                 return None
+             digest.update(relative_path.encode("utf-8"))
+             digest.update(b"\0")
+             digest.update(file_digest)
+             digest.update(b"\n")
+
+         return digest.hexdigest()
+
+     def is_stale(self, record: MetaCacheRecord) -> bool:
+         """Return True if watched file fingerprints differ from the cached record."""
+         if not record.watch_files:
+             return False
+
+         if not record.watch_state_hash:
+             logger.info("Meta cache record is missing watch-state fingerprint; recomputing once for migration")
+             return True
+
+         expected_watch_files = sorted(set(record.watch_files))
+         discovered_watch_files = self.discover_watch_files()
+         if discovered_watch_files:
+             normalized_discovered = sorted(set(discovered_watch_files))
+             if normalized_discovered != expected_watch_files:
+                 logger.info("Meta cache watch-file set changed; recomputing metadata analysis")
+                 return True
+             expected_watch_files = normalized_discovered
+
+         current_watch_hash = self.compute_watch_state_hash(expected_watch_files)
+         if current_watch_hash is None:
+             return True
+
+         return current_watch_hash != record.watch_state_hash
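
Read end to end, MetaCache implements a simple lifecycle: load a record, accept it only if is_stale confirms the watched manifests and READMEs are unchanged, otherwise recompute and store a fresh record with a new fingerprint. A sketch of that flow under stated assumptions (cached_meta_analysis and the compute callback are illustrative wiring, not package APIs):

from collections.abc import Callable

from agents.agent_responses import MetaAnalysisInsights
from caching.meta_cache import MetaCache, MetaCacheRecord


# Illustrative wiring only; the package's own callers may differ.
def cached_meta_analysis(
    cache: MetaCache,
    head_commit: str,
    compute: Callable[[], MetaAnalysisInsights],
) -> MetaCacheRecord:
    record = cache.load()
    if record is not None and not cache.is_stale(record):
        return record  # hit: watched-file fingerprint still matches

    meta = compute()  # the expensive LLM meta-analysis
    watch_files = cache.discover_watch_files()
    record = MetaCacheRecord(
        meta=meta,
        base_commit=head_commit,
        watch_files=watch_files,
        watch_state_hash=cache.compute_watch_state_hash(watch_files),
    )
    cache.store(record)
    return record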