codebase-intel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_intel/__init__.py +3 -0
- codebase_intel/analytics/__init__.py +1 -0
- codebase_intel/analytics/benchmark.py +406 -0
- codebase_intel/analytics/feedback.py +496 -0
- codebase_intel/analytics/tracker.py +439 -0
- codebase_intel/cli/__init__.py +1 -0
- codebase_intel/cli/main.py +740 -0
- codebase_intel/contracts/__init__.py +1 -0
- codebase_intel/contracts/auto_generator.py +438 -0
- codebase_intel/contracts/evaluator.py +531 -0
- codebase_intel/contracts/models.py +433 -0
- codebase_intel/contracts/registry.py +225 -0
- codebase_intel/core/__init__.py +1 -0
- codebase_intel/core/config.py +248 -0
- codebase_intel/core/exceptions.py +454 -0
- codebase_intel/core/types.py +375 -0
- codebase_intel/decisions/__init__.py +1 -0
- codebase_intel/decisions/miner.py +297 -0
- codebase_intel/decisions/models.py +302 -0
- codebase_intel/decisions/store.py +411 -0
- codebase_intel/drift/__init__.py +1 -0
- codebase_intel/drift/detector.py +443 -0
- codebase_intel/graph/__init__.py +1 -0
- codebase_intel/graph/builder.py +391 -0
- codebase_intel/graph/parser.py +1232 -0
- codebase_intel/graph/query.py +377 -0
- codebase_intel/graph/storage.py +736 -0
- codebase_intel/mcp/__init__.py +1 -0
- codebase_intel/mcp/server.py +710 -0
- codebase_intel/orchestrator/__init__.py +1 -0
- codebase_intel/orchestrator/assembler.py +649 -0
- codebase_intel-0.1.0.dist-info/METADATA +361 -0
- codebase_intel-0.1.0.dist-info/RECORD +36 -0
- codebase_intel-0.1.0.dist-info/WHEEL +4 -0
- codebase_intel-0.1.0.dist-info/entry_points.txt +2 -0
- codebase_intel-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
"""High-level graph query engine — translates task descriptions into graph traversals.
|
|
2
|
+
|
|
3
|
+
This module bridges the gap between "what is the agent working on?" and
|
|
4
|
+
"what graph nodes/edges are relevant?" It's the intelligence layer above
|
|
5
|
+
raw graph traversal.
|
|
6
|
+
|
|
7
|
+
Edge cases:
|
|
8
|
+
- Task mentions files that don't exist in the graph (new files): return empty + warning
|
|
9
|
+
- Task is too vague ("improve performance"): return high-level module structure
|
|
10
|
+
- Task mentions multiple unrelated areas: union of relevant subgraphs
|
|
11
|
+
- Task mentions external dependencies: include the import edges but not
|
|
12
|
+
the external code itself
|
|
13
|
+
- Conflicting relevance signals: same node relevant for multiple reasons
|
|
14
|
+
with different priorities — take the highest
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import TYPE_CHECKING
|
|
23
|
+
|
|
24
|
+
from codebase_intel.core.types import (
|
|
25
|
+
ContextPriority,
|
|
26
|
+
EdgeKind,
|
|
27
|
+
GraphEdge,
|
|
28
|
+
GraphNode,
|
|
29
|
+
Language,
|
|
30
|
+
NodeKind,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from codebase_intel.graph.storage import GraphStorage
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class RelevanceResult:
    """Result of a relevance query — nodes with priorities and explanations."""

    nodes: list[GraphNode] = field(default_factory=list)
    edges: list[GraphEdge] = field(default_factory=list)
    priorities: dict[str, ContextPriority] = field(default_factory=dict)
    explanations: dict[str, str] = field(default_factory=dict)
    warnings: list[str] = field(default_factory=list)
    truncated: bool = False

    def nodes_by_priority(self) -> dict[ContextPriority, list[GraphNode]]:
        """Group nodes by their assigned priority.

        Every priority level appears as a key, even when its bucket is
        empty; nodes with no recorded priority fall into LOW.
        """
        buckets: dict[ContextPriority, list[GraphNode]] = {}
        for level in ContextPriority:
            buckets[level] = []
        for item in self.nodes:
            level = self.priorities.get(item.node_id, ContextPriority.LOW)
            buckets[level].append(item)
        return buckets

    @property
    def unique_files(self) -> set[Path]:
        """All unique file paths referenced by result nodes."""
        paths: set[Path] = set()
        for item in self.nodes:
            paths.add(item.file_path)
        return paths
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class GraphQueryEngine:
    """Translates high-level queries into graph traversals.

    The engine supports several query modes:
    1. File-based: "what's relevant to these files?"
    2. Symbol-based: "what's relevant to this function/class?"
    3. Impact-based: "what's affected by changes to X?"
    4. Scope-based: "show me everything in this directory/module"
    """

    def __init__(self, storage: GraphStorage, max_result_nodes: int = 200) -> None:
        """Initialize the engine.

        Args:
            storage: Backing graph store used for all node/edge lookups.
            max_result_nodes: Hard cap on nodes any query may return;
                hitting it sets ``truncated`` on the result.
        """
        self._storage = storage
        self._max_result_nodes = max_result_nodes

    async def query_by_files(
        self,
        file_paths: list[Path],
        include_depth: int = 2,
    ) -> RelevanceResult:
        """Find relevant context for a set of files.

        Priority assignment:
        - CRITICAL: the files themselves
        - HIGH: direct imports/dependencies of those files
        - MEDIUM: transitive dependencies (depth 2)
        - LOW: test files that test these files

        Edge cases:
        - File not in graph: might be new. Add warning, no nodes.
        - File is a barrel/index: has many dependents. Cap and prioritize
          by which dependents are most coupled (most edges).
        - File is in node_modules: skip deep traversal (it's external)
        """
        result = RelevanceResult()
        seen_ids: set[str] = set()

        for fp in file_paths:
            file_nodes = await self._storage.get_nodes_by_file(fp)
            if not file_nodes:
                result.warnings.append(
                    f"File {fp} not found in graph (new file or not yet indexed)"
                )
                continue

            # The referenced files themselves are always CRITICAL.
            for node in file_nodes:
                if node.node_id not in seen_ids:
                    seen_ids.add(node.node_id)
                    result.nodes.append(node)
                    result.priorities[node.node_id] = ContextPriority.CRITICAL
                    result.explanations[node.node_id] = "Directly referenced file"

            # Gather dependencies (what these files import)
            for node in file_nodes:
                if node.kind != NodeKind.MODULE:
                    continue
                deps = await self._storage.get_dependencies(
                    node.node_id,
                    max_depth=include_depth,
                )
                # FIX: fetch the depth-1 dependency set ONCE per module node.
                # It depends only on `node`, not on the individual dep, yet
                # the previous code re-issued this storage query inside the
                # dep loop — O(len(deps)) redundant round-trips per module.
                direct_deps = await self._storage.get_dependencies(
                    node.node_id, max_depth=1
                )
                direct_ids = {d.node_id for d in direct_deps}

                for dep in deps:
                    if dep.node_id in seen_ids:
                        continue
                    if dep.is_external:
                        continue  # Don't include external dependency internals
                    if len(result.nodes) >= self._max_result_nodes:
                        result.truncated = True
                        break

                    seen_ids.add(dep.node_id)
                    result.nodes.append(dep)

                    # Depth-based priority: direct deps outrank transitive ones.
                    if dep.node_id in direct_ids:
                        priority = ContextPriority.HIGH
                        explanation = f"Direct dependency of {fp.name}"
                    else:
                        priority = ContextPriority.MEDIUM
                        explanation = f"Transitive dependency of {fp.name}"

                    result.priorities[dep.node_id] = priority
                    result.explanations[dep.node_id] = explanation

            # Find test files that test these files
            for node in file_nodes:
                if node.kind == NodeKind.MODULE:
                    test_files = await self._find_test_files(node)
                    for test_node in test_files:
                        if test_node.node_id not in seen_ids:
                            seen_ids.add(test_node.node_id)
                            result.nodes.append(test_node)
                            result.priorities[test_node.node_id] = ContextPriority.LOW
                            result.explanations[test_node.node_id] = (
                                f"Test file for {fp.name}"
                            )

        return result

    async def query_by_symbol(
        self,
        symbol_name: str,
        include_depth: int = 2,
    ) -> RelevanceResult:
        """Find relevant context for a specific symbol (function, class, etc.).

        Edge cases:
        - Symbol name is ambiguous (exists in multiple files): return all
          matches with file path in the explanation
        - Symbol is a common name ("get", "create", "handle"): may match
          many nodes. Prioritize by: same directory > same package > global
        - Symbol doesn't exist: might be a typo, suggest similar names
        """
        result = RelevanceResult()
        candidates = await self._search_symbol(symbol_name)

        if not candidates:
            result.warnings.append(
                f"Symbol '{symbol_name}' not found in graph. "
                f"It may be new, in an unparsed file, or misspelled."
            )
            return result

        seen_ids: set[str] = set()

        for node in candidates:
            # Every name match is CRITICAL; ambiguity is surfaced by
            # returning all matches rather than guessing one.
            seen_ids.add(node.node_id)
            result.nodes.append(node)
            result.priorities[node.node_id] = ContextPriority.CRITICAL
            result.explanations[node.node_id] = f"Matched symbol '{symbol_name}'"

            # Get dependencies and dependents
            deps = await self._storage.get_dependencies(
                node.node_id, max_depth=include_depth
            )
            dependents = await self._storage.get_dependents(
                node.node_id, max_depth=1
            )

            for dep in deps:
                if dep.node_id not in seen_ids and not dep.is_external:
                    seen_ids.add(dep.node_id)
                    result.nodes.append(dep)
                    result.priorities[dep.node_id] = ContextPriority.HIGH
                    result.explanations[dep.node_id] = (
                        f"Dependency of {symbol_name}"
                    )

            for dep in dependents:
                if dep.node_id not in seen_ids and not dep.is_external:
                    if len(result.nodes) >= self._max_result_nodes:
                        result.truncated = True
                        break
                    seen_ids.add(dep.node_id)
                    result.nodes.append(dep)
                    result.priorities[dep.node_id] = ContextPriority.MEDIUM
                    result.explanations[dep.node_id] = (
                        f"Depends on {symbol_name} (may be affected by changes)"
                    )

        return result

    async def query_impact(
        self,
        changed_files: list[Path],
        max_depth: int = 3,
    ) -> RelevanceResult:
        """Analyze the impact of file changes — "what else could break?"

        This is a REVERSE traversal: given changes, find what depends on them.

        Edge cases:
        - Changed file is __init__.py: potentially affects all importers of the package
        - Changed file is a config file: affects everything that reads it
        - Changed file has no dependents: isolated change (rare but valid)
        - Cascade explosion: changed a core utility → 500 dependents.
          Cap at max_result_nodes and prioritize by coupling strength
          (number of edges to the changed file).
        """
        result = RelevanceResult()
        impact_map = await self._storage.impact_analysis(changed_files, max_depth=max_depth)

        # Add changed files as CRITICAL
        seen_ids: set[str] = set()
        for fp in changed_files:
            for node in await self._storage.get_nodes_by_file(fp):
                if node.node_id not in seen_ids:
                    seen_ids.add(node.node_id)
                    result.nodes.append(node)
                    result.priorities[node.node_id] = ContextPriority.CRITICAL
                    result.explanations[node.node_id] = "Changed file"

        # Add impacted nodes with distance-based priority
        for file_key, affected_nodes in impact_map.items():
            for node in affected_nodes:
                if node.node_id in seen_ids:
                    continue
                if len(result.nodes) >= self._max_result_nodes:
                    result.truncated = True
                    result.warnings.append(
                        f"Impact analysis truncated at {self._max_result_nodes} nodes. "
                        f"Consider narrowing the scope."
                    )
                    break

                seen_ids.add(node.node_id)
                result.nodes.append(node)
                result.priorities[node.node_id] = ContextPriority.HIGH
                result.explanations[node.node_id] = (
                    f"Depends on changed file {Path(file_key).name}"
                )

        return result

    async def query_scope(
        self,
        directory: Path,
        max_depth: int = 1,
    ) -> RelevanceResult:
        """Get all nodes within a directory scope.

        Useful for "show me the structure of this module/package."

        Edge case: directory has 1000+ files (monorepo root). We return
        MODULE-level nodes only and mark as truncated.
        """
        result = RelevanceResult()

        # This requires a storage method to search by path prefix
        # For now, we do it via the stats + file listing approach
        # In production, we'd add a dedicated index
        # NOTE(review): currently a stub — returns only the warning below;
        # `directory` and `max_depth` are not yet consulted.

        result.warnings.append(
            "Scope queries are limited to indexed files. "
            "Run `codebase-intel analyze` to ensure the graph is current."
        )
        return result

    # -------------------------------------------------------------------
    # Private helpers
    # -------------------------------------------------------------------

    async def _search_symbol(self, name: str) -> list[GraphNode]:
        """Search for a symbol by name across the graph.

        Edge case: name might be:
        - Exact: "UserService" → match name field
        - Qualified: "auth.UserService" → match qualified_name field
        - Partial: "user_serv" → fuzzy match (future)
        """
        # Exact name match (the LIKE clause also matches a trailing
        # ".<name>" suffix of a qualified name). Parameterized — no
        # injection risk from `name`.
        cursor = await self._storage._db.execute(
            """
            SELECT node_id, kind, name, qualified_name, file_path,
                   line_start, line_end, language, content_hash, docstring,
                   is_generated, is_external, is_test, is_entry_point,
                   metadata_json
            FROM nodes
            WHERE name = ? OR qualified_name = ? OR qualified_name LIKE ?
            LIMIT 50
            """,
            (name, name, f"%.{name}"),
        )
        rows = await cursor.fetchall()
        return [self._storage._row_to_node(row) for row in rows]

    async def _find_test_files(self, source_node: GraphNode) -> list[GraphNode]:
        """Find test files that test a given source module.

        Detection heuristics:
        - Explicit TESTS edge in graph
        - File naming convention: foo.py → test_foo.py, foo_test.py
        - Import-based: test file that imports the source module
        """
        results: list[GraphNode] = []

        # Check for explicit TESTS edges
        cursor = await self._storage._db.execute(
            """
            SELECT source_id FROM edges
            WHERE target_id = ? AND kind = ?
            """,
            (source_node.node_id, EdgeKind.TESTS.value),
        )
        for row in await cursor.fetchall():
            node = await self._storage.get_node(row[0])
            if node:
                results.append(node)

        # Convention-based: look for test_<name> or <name>_test
        if source_node.kind == NodeKind.MODULE:
            source_name = source_node.name
            test_patterns = [f"test_{source_name}", f"{source_name}_test", f"{source_name}_spec"]
            # Only the "name = ?" placeholders are interpolated into the SQL;
            # pattern values are bound parameters.
            cursor = await self._storage._db.execute(
                f"""
                SELECT node_id, kind, name, qualified_name, file_path,
                       line_start, line_end, language, content_hash, docstring,
                       is_generated, is_external, is_test, is_entry_point,
                       metadata_json
                FROM nodes
                WHERE kind = 'module' AND is_test = 1
                  AND ({' OR '.join(f"name = ?" for _ in test_patterns)})
                """,
                test_patterns,
            )
            # FIX: maintain the dedup set incrementally instead of rebuilding
            # {r.node_id for r in results} on every row (was O(rows * results)).
            seen = {r.node_id for r in results}
            for row in await cursor.fetchall():
                node = self._storage._row_to_node(row)
                if node.node_id not in seen:
                    seen.add(node.node_id)
                    results.append(node)

        return results
|