codevira 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codevira-1.6.0.dist-info/LICENSE +21 -0
- codevira-1.6.0.dist-info/METADATA +477 -0
- codevira-1.6.0.dist-info/RECORD +58 -0
- codevira-1.6.0.dist-info/WHEEL +5 -0
- codevira-1.6.0.dist-info/entry_points.txt +2 -0
- codevira-1.6.0.dist-info/top_level.txt +2 -0
- indexer/__init__.py +1 -0
- indexer/chunker.py +428 -0
- indexer/global_db.py +197 -0
- indexer/graph_generator.py +380 -0
- indexer/index_codebase.py +588 -0
- indexer/outcome_tracker.py +172 -0
- indexer/rule_learner.py +186 -0
- indexer/sqlite_graph.py +640 -0
- indexer/treesitter_parser.py +423 -0
- mcp_server/__init__.py +1 -0
- mcp_server/__main__.py +20 -0
- mcp_server/auto_init.py +257 -0
- mcp_server/cli.py +622 -0
- mcp_server/crash_logger.py +236 -0
- mcp_server/data/__init__.py +1 -0
- mcp_server/data/agents/builder.md +84 -0
- mcp_server/data/agents/developer.md +111 -0
- mcp_server/data/agents/documenter.md +138 -0
- mcp_server/data/agents/orchestrator.md +96 -0
- mcp_server/data/agents/planner.md +106 -0
- mcp_server/data/agents/reviewer.md +82 -0
- mcp_server/data/agents/tester.md +83 -0
- mcp_server/data/config.example.yaml +33 -0
- mcp_server/data/rules/coding-standards.md +48 -0
- mcp_server/data/rules/engineering-excellence.md +28 -0
- mcp_server/data/rules/git-cicd-governance.md +32 -0
- mcp_server/data/rules/git_commits.md +130 -0
- mcp_server/data/rules/incremental-updates.md +5 -0
- mcp_server/data/rules/master_rule.md +187 -0
- mcp_server/data/rules/multi-language.md +19 -0
- mcp_server/data/rules/persistence.md +21 -0
- mcp_server/data/rules/resilience-observability.md +17 -0
- mcp_server/data/rules/smoke-testing.md +48 -0
- mcp_server/data/rules/testing-standards.md +23 -0
- mcp_server/detect.py +284 -0
- mcp_server/gitignore.py +284 -0
- mcp_server/global_sync.py +187 -0
- mcp_server/http_server.py +341 -0
- mcp_server/ide_inject.py +444 -0
- mcp_server/launchd.py +156 -0
- mcp_server/migrate.py +215 -0
- mcp_server/paths.py +256 -0
- mcp_server/prompts.py +136 -0
- mcp_server/server.py +1049 -0
- mcp_server/tools/__init__.py +0 -0
- mcp_server/tools/changesets.py +223 -0
- mcp_server/tools/code_reader.py +335 -0
- mcp_server/tools/graph.py +637 -0
- mcp_server/tools/learning.py +238 -0
- mcp_server/tools/playbook.py +89 -0
- mcp_server/tools/roadmap.py +599 -0
- mcp_server/tools/search.py +145 -0
indexer/chunker.py
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Multi-language source chunker for codebase indexing.
|
|
3
|
+
Splits source files into function/class/module chunks for semantic search.
|
|
4
|
+
|
|
5
|
+
Language support:
|
|
6
|
+
- Python: stdlib ast module (full support)
|
|
7
|
+
- TypeScript, Go, Rust: tree-sitter grammars via treesitter_parser
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import ast
|
|
12
|
+
import functools
|
|
13
|
+
import os
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Iterator
|
|
17
|
+
|
|
18
|
+
from indexer.treesitter_parser import (
|
|
19
|
+
parse_file as ts_parse_file,
|
|
20
|
+
get_language as ts_get_language,
|
|
21
|
+
EXTENSION_MAP as TS_EXTENSION_MAP,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _load_config() -> dict:
    """Load ``config.yaml`` from the data directory as a dict.

    This is a best-effort loader: it returns an empty dict when the file is
    missing, cannot be read or parsed, or when its top-level YAML value is
    not a mapping, so callers can always use ``.get()`` on the result.

    Returns:
        The parsed configuration mapping, or ``{}``.
    """
    # Imported lazily, as in the original, so importing this module does not
    # require the data directory to be resolvable yet.
    from mcp_server.paths import get_data_dir

    config_path = get_data_dir() / "config.yaml"
    if not config_path.exists():
        return {}

    try:
        import yaml

        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(config_path, encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
    except Exception:
        # Deliberate best-effort: a missing PyYAML install, an unreadable
        # file, or malformed YAML all fall back to the built-in defaults.
        return {}

    # An empty document (None) or a non-mapping document (list, scalar) would
    # break callers that do ``cfg.get(...)``; treat both as "no config".
    return loaded if isinstance(loaded, dict) else {}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Directory names pruned from the os.walk traversal in iter_source_files.
SKIP_DIRS = {"__pycache__", ".venv", "venv", ".git", "node_modules", "migrations"}
# File names that iter_source_files never yields, even if the extension matches.
SKIP_FILES = {"__init__.py"}

# All tree-sitter supported extensions for dispatch
_TS_SUPPORTED_EXTENSIONS = set(TS_EXTENSION_MAP.keys())
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@functools.lru_cache(maxsize=None)
def _get_project_config() -> tuple[frozenset[str], tuple[str, ...]]:
    """Lazily load config.yaml and return ``(target_dirs, file_extensions)``.

    Settings are read from the ``project`` section when it is present and is
    a mapping, otherwise from the top level of the config. Missing or null
    settings fall back to ``["src"]`` and ``[".py"]``.

    The result is cached with ``lru_cache``, so the config file is read at
    most once per process.

    Returns:
        A frozenset of watched directory names and a tuple of file
        extensions (a tuple so it can be passed to ``str.endswith``).
    """

    def _as_sequence(value: object, default: tuple[str, ...]) -> tuple:
        """Normalise a YAML value to a tuple, tolerating null and scalars."""
        if value is None:
            return default
        if isinstance(value, str):
            # ``watched_dirs: src`` must mean ("src",), not ("s", "r", "c").
            return (value,)
        try:
            return tuple(value)
        except TypeError:
            return default

    cfg = _load_config()
    project_cfg = cfg.get("project", cfg)
    if not isinstance(project_cfg, dict):
        # e.g. ``project:`` with no body parses to None; use the top level.
        project_cfg = cfg

    target_dirs: frozenset[str] = frozenset(
        _as_sequence(project_cfg.get("watched_dirs"), ("src",))
    )
    file_extensions: tuple[str, ...] = _as_sequence(
        project_cfg.get("file_extensions"), (".py",)
    )
    return target_dirs, file_extensions
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class CodeChunk:
    """One chunk of a source file (module docstring, function, or class-like symbol)."""

    file_path: str    # relative to project root
    chunk_type: str   # "function" | "class" | "module"
    name: str         # function/class name or filename for module chunks
    source_text: str  # the actual source code
    start_line: int
    end_line: int
    docstring: str    # first docstring if present, else ""
    layer: str        # inferred from file path
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _infer_layer(file_path: str) -> str:
|
|
72
|
+
parts = Path(file_path).parts
|
|
73
|
+
for i, part in enumerate(parts):
|
|
74
|
+
if part in {"generator", "assembler", "indexer", "scanner", "drift", "graph", "context"}:
|
|
75
|
+
return part
|
|
76
|
+
if part in {"api", "routes"}:
|
|
77
|
+
return "api"
|
|
78
|
+
if part in {"core", "datastore", "schemas"}:
|
|
79
|
+
return part
|
|
80
|
+
if part in {"contexts", "application", "providers", "control", "services", "handlers"}:
|
|
81
|
+
return part
|
|
82
|
+
return "unknown"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _get_docstring(node: ast.AST) -> str:
|
|
86
|
+
try:
|
|
87
|
+
return ast.get_docstring(node) or ""
|
|
88
|
+
except Exception:
|
|
89
|
+
return ""
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _extract_source_lines(source_lines: list[str], start: int, end: int) -> str:
|
|
93
|
+
return "".join(source_lines[start - 1:end])
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def extract_imports(file_path: str, project_root: str) -> list[str]:
    """
    Return the project-local imports of a source file as relative paths.

    The extractor is chosen from the lower-cased file extension: extensions
    known to the tree-sitter parser go to the tree-sitter extractor, and
    everything else is handled by the Python ``ast`` extractor.

    Returns a list of paths relative to the project root
    (e.g. 'src/services/provider.py').
    """
    suffix = Path(file_path).suffix.lower()
    extractor = (
        _extract_imports_treesitter
        if suffix in _TS_SUPPORTED_EXTENSIONS
        else _extract_imports_python
    )
    return extractor(file_path, project_root)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _extract_imports_treesitter(file_path: str, project_root: str) -> list[str]:
    """
    Extract import paths from a non-Python file using tree-sitter.

    Each import reported by the parser is resolved to a project file path
    via ``_resolve_ts_import``. Imports that cannot be resolved are omitted;
    there is no fallback to the raw module string.

    Returns:
        De-duplicated relative paths, in first-seen order. An empty list is
        returned when the parser raises FileNotFoundError or ValueError.
    """
    try:
        parsed = ts_parse_file(file_path)
    except (FileNotFoundError, ValueError):
        return []

    project_root_path = Path(project_root)
    file_dir = Path(file_path).parent

    # A dict keeps insertion order and gives O(1) duplicate checks, replacing
    # the original ``x not in list`` scan.
    resolved_paths: dict[str, None] = {}
    for imp in parsed.imports:
        resolved = _resolve_ts_import(imp.module, file_dir, project_root_path)
        if resolved:
            resolved_paths.setdefault(resolved, None)

    return list(resolved_paths)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _resolve_ts_import(raw_module: str, file_dir: Path, project_root: Path) -> str | None:
|
|
140
|
+
"""
|
|
141
|
+
Try to resolve a tree-sitter import string to a relative file path.
|
|
142
|
+
Handles TypeScript/JS relative imports, Go package imports, and Rust use paths.
|
|
143
|
+
"""
|
|
144
|
+
# TypeScript/JS: relative imports like './foo' or '../bar'
|
|
145
|
+
if raw_module.startswith('.'):
|
|
146
|
+
# Resolve relative to the importing file's directory
|
|
147
|
+
candidates = [
|
|
148
|
+
file_dir / f"{raw_module}.ts",
|
|
149
|
+
file_dir / f"{raw_module}.tsx",
|
|
150
|
+
file_dir / f"{raw_module}.js",
|
|
151
|
+
file_dir / f"{raw_module}.jsx",
|
|
152
|
+
file_dir / raw_module / "index.ts",
|
|
153
|
+
file_dir / raw_module / "index.tsx",
|
|
154
|
+
file_dir / raw_module / "index.js",
|
|
155
|
+
]
|
|
156
|
+
for c in candidates:
|
|
157
|
+
resolved = c.resolve()
|
|
158
|
+
if resolved.exists():
|
|
159
|
+
try:
|
|
160
|
+
return str(resolved.relative_to(project_root))
|
|
161
|
+
except ValueError:
|
|
162
|
+
continue
|
|
163
|
+
return None
|
|
164
|
+
|
|
165
|
+
# Non-relative: try as a project-local path (e.g. 'src/utils/foo')
|
|
166
|
+
# Check common extensions
|
|
167
|
+
for ext in ['.ts', '.tsx', '.js', '.go', '.rs']:
|
|
168
|
+
candidate = project_root / f"{raw_module}{ext}"
|
|
169
|
+
if candidate.exists():
|
|
170
|
+
return str(candidate.relative_to(project_root))
|
|
171
|
+
|
|
172
|
+
# Try as directory with index file
|
|
173
|
+
for index in ['index.ts', 'index.tsx', 'index.js', 'mod.rs']:
|
|
174
|
+
candidate = project_root / raw_module / index
|
|
175
|
+
if candidate.exists():
|
|
176
|
+
return str(candidate.relative_to(project_root))
|
|
177
|
+
|
|
178
|
+
# Go: package paths like 'project/internal/services'
|
|
179
|
+
# Try mapping to directory with .go files
|
|
180
|
+
candidate_dir = project_root / raw_module
|
|
181
|
+
if candidate_dir.is_dir():
|
|
182
|
+
go_files = list(candidate_dir.glob('*.go'))
|
|
183
|
+
if go_files:
|
|
184
|
+
return str(go_files[0].relative_to(project_root))
|
|
185
|
+
|
|
186
|
+
return None
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _extract_imports_python(file_path: str, project_root: str) -> list[str]:
    """
    Parse a Python file's import statements and return relative paths of
    project-local imports only (skips stdlib and third-party packages).

    A module counts as project-local when its first dotted component is one
    of the configured watched directories. Relative imports are converted
    to absolute module names from the file's position under
    ``project_root`` before being looked up.

    Returns:
        De-duplicated relative paths in first-seen order. An empty list is
        returned when the file cannot be read, decoded, or parsed.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            source = f.read()
    except (OSError, UnicodeDecodeError):
        return []

    try:
        tree = ast.parse(source, filename=file_path)
    except (SyntaxError, ValueError):
        # ValueError: on older Python versions, source containing NUL bytes
        # raises ValueError instead of SyntaxError.
        return []

    project_root_path = Path(project_root)
    target_dirs, _ = _get_project_config()
    project_packages = set(target_dirs)

    def _module_to_path(module: str) -> str | None:
        """Convert a dotted module name to a relative file path if project-local."""
        parts = module.split(".")
        if not parts or parts[0] not in project_packages:
            return None
        parent_dir = project_root_path / Path(*parts[:-1])
        module_file = parent_dir / f"{parts[-1]}.py"
        # Prefer the package's __init__.py, then the plain module file.
        for candidate in (project_root_path / Path(*parts) / "__init__.py", module_file):
            if candidate.exists():
                return str(candidate.relative_to(project_root_path))
        # NOTE(review): as in the original, when only the parent directory
        # exists the module file's path is returned even though the file
        # itself was not found - confirm that callers rely on this.
        if parent_dir.exists():
            return str(module_file.relative_to(project_root_path))
        return None

    results: list[str] = []
    seen: set[str] = set()  # O(1) duplicate check alongside the ordered list

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            modules = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            if node.level and node.level > 0:
                # Relative import: drop ``level`` trailing components from
                # the file's relative path (the file name counts as one).
                file_rel = os.path.relpath(file_path, project_root)
                file_parts = Path(file_rel).parts
                base_parts = list(file_parts[:-node.level]) if node.level < len(file_parts) else []
                if node.module:
                    module_parts = base_parts + str(node.module).split(".")
                else:
                    module_parts = base_parts
                modules = [".".join(module_parts)]
            elif node.module:
                modules = [str(node.module)]
            else:
                continue
        else:
            continue

        for module in modules:
            path = _module_to_path(module)
            if path and path not in seen:
                seen.add(path)
                results.append(path)

    return results
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def chunk_file(file_path: str, project_root: str) -> list[CodeChunk]:
    """
    Split one source file into code chunks.

    Files whose lower-cased extension is known to the tree-sitter parser are
    chunked with tree-sitter; all other files go through the Python ``ast``
    chunker.
    """
    suffix = Path(file_path).suffix.lower()
    chunker = (
        _chunk_file_treesitter
        if suffix in _TS_SUPPORTED_EXTENSIONS
        else _chunk_file_python
    )
    return chunker(file_path, project_root)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _chunk_file_treesitter(file_path: str, project_root: str) -> list[CodeChunk]:
    """Build chunks for a non-Python file from tree-sitter symbols.

    Emits an optional "module" chunk for the file-level docstring, then one
    chunk per parsed symbol. Symbols spanning fewer than three lines beyond
    their first are skipped, and container-like symbols (class, struct,
    impl, interface, trait, enum) keep only their leading lines as source
    text. Returns an empty list if the file cannot be parsed or read.
    """
    rel_path = os.path.relpath(file_path, project_root)
    layer = _infer_layer(rel_path)

    try:
        parsed = ts_parse_file(file_path)
    except (FileNotFoundError, ValueError):
        return []

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            source_lines = f.read().splitlines(keepends=True)
    except (OSError, UnicodeDecodeError):
        return []

    container_kinds = ("class", "struct", "impl", "interface", "trait", "enum")
    chunks: list[CodeChunk] = []

    # File-level docstring becomes its own "module" chunk.
    module_doc = parsed.module_docstring
    if module_doc:
        chunks.append(CodeChunk(
            file_path=rel_path,
            chunk_type="module",
            name=Path(file_path).stem,
            source_text=module_doc,
            start_line=1,
            end_line=1,
            docstring=module_doc,
            layer=layer,
        ))

    for sym in parsed.symbols:
        first, last = sym.start_line, sym.end_line
        # Very short symbols are not worth indexing (same rule as the Python chunker).
        if last - first < 3:
            continue

        kind = sym.kind
        # Containers contribute only their header region; the recorded
        # end_line still covers the whole symbol.
        snippet_end = min(first + 15, last) if kind in container_kinds else last

        chunks.append(CodeChunk(
            file_path=rel_path,
            chunk_type=kind,
            name=sym.name,
            source_text=_extract_source_lines(source_lines, first, snippet_end),
            start_line=first,
            end_line=last,
            docstring=sym.docstring or "",
            layer=layer,
        ))

    return chunks
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def _chunk_file_python(file_path: str, project_root: str) -> list[CodeChunk]:
    """Parse a Python file and return all meaningful code chunks.

    Produces:
      * one "module" chunk holding the module docstring, if there is one;
      * one "function" chunk per (async) function or method found anywhere
        in the tree, skipping dunder names and functions spanning fewer
        than three lines beyond the ``def`` line;
      * one "class" chunk per class, whose source text is limited to the
        class's leading lines while ``end_line`` covers the whole class.

    Returns an empty list when the file cannot be read, decoded, or parsed.
    """
    rel_path = os.path.relpath(file_path, project_root)
    layer = _infer_layer(rel_path)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            source = f.read()
        source_lines = source.splitlines(keepends=True)
    except (OSError, UnicodeDecodeError):
        return []

    try:
        tree = ast.parse(source, filename=file_path)
    except (SyntaxError, ValueError):
        # ValueError: on older Python versions, source containing NUL bytes
        # raises ValueError instead of SyntaxError.
        return []

    chunks: list[CodeChunk] = []

    # Module-level docstring chunk
    module_doc = _get_docstring(tree)
    if module_doc:
        chunks.append(CodeChunk(
            file_path=rel_path,
            chunk_type="module",
            name=Path(file_path).stem,
            source_text=module_doc,
            start_line=1,
            end_line=1,
            docstring=module_doc,
            layer=layer,
        ))

    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            if node.name.startswith("__") and node.name.endswith("__"):
                continue
            # end_lineno can be missing or None; fall back to the def line.
            end_line = getattr(node, "end_lineno", None) or node.lineno
            # Decide whether to skip before paying for the source slice.
            if end_line - node.lineno < 3:
                continue
            chunks.append(CodeChunk(
                file_path=rel_path,
                chunk_type="function",
                name=node.name,
                source_text=_extract_source_lines(source_lines, node.lineno, end_line),
                start_line=node.lineno,
                end_line=end_line,
                docstring=_get_docstring(node),
                layer=layer,
            ))

        elif isinstance(node, ast.ClassDef):
            end_line = getattr(node, "end_lineno", None) or node.lineno
            # Only the class header region is stored; methods get their own chunks.
            sig_end = min(node.lineno + 15, end_line)
            chunks.append(CodeChunk(
                file_path=rel_path,
                chunk_type="class",
                name=node.name,
                source_text=_extract_source_lines(source_lines, node.lineno, sig_end),
                start_line=node.lineno,
                end_line=end_line,
                docstring=_get_docstring(node),
                layer=layer,
            ))

    return chunks
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def iter_source_files(project_root: str) -> Iterator[str]:
    """Yield absolute paths of source files under the configured watched dirs.

    A file is yielded when its name ends with one of the configured file
    extensions and is not listed in SKIP_FILES. Directories named in
    SKIP_DIRS are pruned from the walk. Each path is yielded at most once,
    even if watched directories overlap.

    Watched directories, sub-directories and file names are visited in
    sorted order so that the output is reproducible between runs.
    """
    target_dirs, file_extensions = _get_project_config()
    seen_files: set[str] = set()

    # target_dirs is a frozenset; sort it so the order does not depend on
    # string hashing.
    for target_dir in sorted(target_dirs):
        target_path = os.path.join(project_root, target_dir)
        if not os.path.exists(target_path):
            continue

        for root, dirs, files in os.walk(target_path):
            # Prune skipped dirs in place so os.walk does not descend into them.
            dirs[:] = sorted(d for d in dirs if d not in SKIP_DIRS)

            for fname in sorted(files):
                if fname in SKIP_FILES or not fname.endswith(file_extensions):
                    continue
                full_path = os.path.abspath(os.path.join(root, fname))
                if full_path not in seen_files:
                    seen_files.add(full_path)
                    yield full_path
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def chunk_project(project_root: str) -> list[CodeChunk]:
    """Chunk every source file yielded for the project into one flat list."""
    return [
        chunk
        for source_path in iter_source_files(project_root)
        for chunk in chunk_file(source_path, project_root)
    ]
|
indexer/global_db.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""
|
|
2
|
+
global_db.py — Global SQLite database for cross-project intelligence.
|
|
3
|
+
|
|
4
|
+
Stores aggregated preferences, learned rules, and project registry in
|
|
5
|
+
~/.codevira/global.db. Enables new projects to inherit intelligence from
|
|
6
|
+
all past projects on day 1.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import sqlite3
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GlobalDB:
    """Lightweight SQLite wrapper for the global cross-project database.

    Holds three tables: ``projects`` (registry keyed by path),
    ``global_preferences`` (unique per category/signal, with an aggregated
    frequency) and ``global_rules`` (unique per rule text, with a blended
    confidence).

    Instances can be used as context managers; the connection is closed on
    exit.
    """

    def __init__(self, db_path: str | Path):
        """Open (creating if needed) the database at ``db_path`` and ensure the schema."""
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(str(self.db_path), timeout=5)
        try:
            self.conn.row_factory = sqlite3.Row
            self.conn.execute("PRAGMA journal_mode=WAL")
            self.conn.execute("PRAGMA foreign_keys=ON")
            self._init_schema()
        except Exception:
            # Do not leak the open connection if setup fails.
            self.conn.close()
            raise

    def __enter__(self) -> "GlobalDB":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _init_schema(self) -> None:
        """Create missing tables and apply in-place column migrations."""
        self.conn.executescript("""
            CREATE TABLE IF NOT EXISTS projects (
                path TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                language TEXT,
                git_remote TEXT,
                last_synced_at DATETIME DEFAULT CURRENT_TIMESTAMP
            );

            CREATE TABLE IF NOT EXISTS global_preferences (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                category TEXT NOT NULL,
                signal TEXT NOT NULL,
                example TEXT,
                frequency INTEGER DEFAULT 1,
                source_projects TEXT DEFAULT '[]',
                updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(category, signal)
            );

            CREATE TABLE IF NOT EXISTS global_rules (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                rule_text TEXT NOT NULL UNIQUE,
                confidence REAL DEFAULT 0.5,
                source_projects TEXT DEFAULT '[]',
                category TEXT,
                language TEXT,
                updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
            );
        """)
        self._ensure_git_remote_column()
        self.conn.commit()

    def _ensure_git_remote_column(self) -> None:
        """Add ``projects.git_remote`` to databases created before v1.6.

        Run at open time so reads such as ``find_project_by_remote`` work on
        an old database even before any project is registered again.
        Best-effort: a failure is logged rather than raised.
        """
        try:
            cols = {row[1] for row in self.conn.execute("PRAGMA table_info(projects)")}
            if "git_remote" not in cols:
                self.conn.execute("ALTER TABLE projects ADD COLUMN git_remote TEXT")
        except sqlite3.Error as err:
            logging.getLogger(__name__).warning(
                "Could not add git_remote column to %s: %s", self.db_path, err
            )

    @staticmethod
    def _merge_source_projects(raw: str | None, source_project: str) -> str:
        """Return the JSON list in ``raw`` with ``source_project`` appended if absent.

        A missing, malformed, or non-list value is treated as an empty list
        so one corrupt row cannot block further updates.
        """
        try:
            projects = json.loads(raw or "[]")
        except (TypeError, ValueError):
            projects = []
        if not isinstance(projects, list):
            projects = []
        if source_project not in projects:
            projects.append(source_project)
        return json.dumps(projects)

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()

    # ------------------------------------------------------------------
    # Project registry
    # ------------------------------------------------------------------

    def register_project(self, path: str, name: str, language: str,
                         git_remote: str | None = None) -> None:
        """Insert or replace the registry row for ``path`` and refresh its sync time."""
        # ``with self.conn`` commits on success and rolls back on error.
        with self.conn:
            self.conn.execute(
                "INSERT OR REPLACE INTO projects (path, name, language, git_remote, last_synced_at) "
                "VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)",
                (path, name, language, git_remote),
            )

    def find_project_by_remote(self, remote_url: str) -> str | None:
        """Return the registered path for a project matching the given git remote URL, or None.

        Database errors are swallowed and reported as None.
        """
        try:
            row = self.conn.execute(
                "SELECT path FROM projects WHERE git_remote = ? LIMIT 1",
                (remote_url,),
            ).fetchone()
        except sqlite3.Error:
            return None
        return row["path"] if row else None

    def get_project_count(self) -> int:
        """Return the number of registered projects."""
        row = self.conn.execute("SELECT COUNT(*) FROM projects").fetchone()
        return row[0] if row else 0

    # ------------------------------------------------------------------
    # Preferences
    # ------------------------------------------------------------------

    def upsert_preference(self, category: str, signal: str, example: str | None,
                          source_project: str, frequency: int = 1) -> None:
        """Insert or update a global preference. Aggregates frequency across projects.

        On update, ``frequency`` is added to the stored value,
        ``source_project`` is appended to the stored project list if new,
        and ``example`` replaces the stored example only when it is not None.
        """
        with self.conn:
            existing = self.conn.execute(
                "SELECT id, frequency, source_projects FROM global_preferences WHERE category = ? AND signal = ?",
                (category, signal),
            ).fetchone()

            if existing:
                new_freq = existing["frequency"] + frequency
                projects_json = self._merge_source_projects(
                    existing["source_projects"], source_project
                )
                self.conn.execute(
                    "UPDATE global_preferences SET frequency = ?, source_projects = ?, example = COALESCE(?, example), "
                    "updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                    (new_freq, projects_json, example, existing["id"]),
                )
            else:
                self.conn.execute(
                    "INSERT INTO global_preferences (category, signal, example, frequency, source_projects) "
                    "VALUES (?, ?, ?, ?, ?)",
                    (category, signal, example, frequency, json.dumps([source_project])),
                )

    def get_preferences(self, min_frequency: int = 3, language: str | None = None) -> list[dict]:
        """Get global preferences above the frequency threshold.

        ``language`` is accepted but not used by the query; the
        ``global_preferences`` table has no language column.
        ``source_projects`` is returned as the stored JSON string.
        """
        rows = self.conn.execute(
            "SELECT category, signal, example, frequency, source_projects FROM global_preferences "
            "WHERE frequency >= ? ORDER BY frequency DESC",
            (min_frequency,),
        ).fetchall()
        return [dict(r) for r in rows]

    # ------------------------------------------------------------------
    # Rules
    # ------------------------------------------------------------------

    def upsert_rule(self, rule_text: str, confidence: float, source_project: str,
                    category: str | None = None, language: str | None = None) -> None:
        """Insert or update a global rule. Merges confidence via weighted average.

        On update the new confidence is ``0.6 * stored + 0.4 * incoming``;
        ``category`` and ``language`` are only written when the rule is
        first inserted.
        """
        with self.conn:
            existing = self.conn.execute(
                "SELECT id, confidence, source_projects FROM global_rules WHERE rule_text = ?",
                (rule_text,),
            ).fetchone()

            if existing:
                new_conf = existing["confidence"] * 0.6 + confidence * 0.4
                projects_json = self._merge_source_projects(
                    existing["source_projects"], source_project
                )
                self.conn.execute(
                    "UPDATE global_rules SET confidence = ?, source_projects = ?, "
                    "updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                    (new_conf, projects_json, existing["id"]),
                )
            else:
                self.conn.execute(
                    "INSERT INTO global_rules (rule_text, confidence, source_projects, category, language) "
                    "VALUES (?, ?, ?, ?, ?)",
                    (rule_text, confidence, json.dumps([source_project]), category, language),
                )

    def get_rules(self, min_confidence: float = 0.6, language: str | None = None) -> list[dict]:
        """Get global rules above confidence threshold, optionally filtered by language.

        When ``language`` is given, rules stored with that language or with
        no language at all are returned.
        """
        if language:
            rows = self.conn.execute(
                "SELECT rule_text, confidence, source_projects, category, language FROM global_rules "
                "WHERE confidence >= ? AND (language = ? OR language IS NULL) ORDER BY confidence DESC",
                (min_confidence, language),
            ).fetchall()
        else:
            rows = self.conn.execute(
                "SELECT rule_text, confidence, source_projects, category, language FROM global_rules "
                "WHERE confidence >= ? ORDER BY confidence DESC",
                (min_confidence,),
            ).fetchall()
        return [dict(r) for r in rows]

    # ------------------------------------------------------------------
    # Stats
    # ------------------------------------------------------------------

    def get_stats(self) -> dict:
        """Return summary stats for the global database."""
        return {
            "project_count": self.get_project_count(),
            "total_preferences": self.conn.execute("SELECT COUNT(*) FROM global_preferences").fetchone()[0],
            "total_rules": self.conn.execute("SELECT COUNT(*) FROM global_rules").fetchone()[0],
        }
|