codebase-index 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_index/__init__.py +7 -0
- codebase_index/__main__.py +3 -0
- codebase_index/cli.py +916 -0
- codebase_index/config.py +110 -0
- codebase_index/discovery/__init__.py +10 -0
- codebase_index/discovery/classify.py +151 -0
- codebase_index/discovery/ignore.py +58 -0
- codebase_index/discovery/walker.py +75 -0
- codebase_index/doctor.py +138 -0
- codebase_index/embeddings/__init__.py +2 -0
- codebase_index/embeddings/backend.py +67 -0
- codebase_index/embeddings/external.py +56 -0
- codebase_index/embeddings/local.py +41 -0
- codebase_index/embeddings/noop.py +15 -0
- codebase_index/graph/__init__.py +8 -0
- codebase_index/graph/analysis.py +468 -0
- codebase_index/graph/builder.py +160 -0
- codebase_index/graph/expand.py +136 -0
- codebase_index/graph/export.py +381 -0
- codebase_index/graph/navigate.py +201 -0
- codebase_index/indexer/__init__.py +8 -0
- codebase_index/indexer/doc_chunks.py +202 -0
- codebase_index/indexer/freshness.py +109 -0
- codebase_index/indexer/pipeline.py +423 -0
- codebase_index/mcp/__init__.py +2 -0
- codebase_index/mcp/server.py +354 -0
- codebase_index/models.py +145 -0
- codebase_index/output/__init__.py +6 -0
- codebase_index/output/json.py +13 -0
- codebase_index/output/markdown.py +316 -0
- codebase_index/output/redact.py +31 -0
- codebase_index/parsers/__init__.py +9 -0
- codebase_index/parsers/base.py +47 -0
- codebase_index/parsers/languages.py +290 -0
- codebase_index/parsers/line_chunker.py +39 -0
- codebase_index/parsers/symbol_chunks.py +62 -0
- codebase_index/parsers/treesitter.py +439 -0
- codebase_index/retrieval/__init__.py +9 -0
- codebase_index/retrieval/budget.py +82 -0
- codebase_index/retrieval/fusion.py +62 -0
- codebase_index/retrieval/intent.py +56 -0
- codebase_index/retrieval/pipeline.py +207 -0
- codebase_index/retrieval/rerank.py +69 -0
- codebase_index/retrieval/searchers.py +291 -0
- codebase_index/retrieval/skeleton.py +251 -0
- codebase_index/retrieval/types.py +79 -0
- codebase_index/scaffold.py +399 -0
- codebase_index/service.py +158 -0
- codebase_index/skill_template/SKILL.md +198 -0
- codebase_index/skill_template/examples/hooks/settings.json +16 -0
- codebase_index/skill_template/scripts/cbx +25 -0
- codebase_index/skill_template/scripts/cbx.ps1 +25 -0
- codebase_index/skill_update.py +150 -0
- codebase_index/storage/__init__.py +8 -0
- codebase_index/storage/db.py +116 -0
- codebase_index/storage/repo.py +701 -0
- codebase_index/storage/schema.sql +125 -0
- codebase_index/watch/__init__.py +5 -0
- codebase_index/watch/watcher.py +93 -0
- codebase_index-1.6.0.dist-info/METADATA +748 -0
- codebase_index-1.6.0.dist-info/RECORD +64 -0
- codebase_index-1.6.0.dist-info/WHEEL +4 -0
- codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
- codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
codebase_index/config.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Configuration loading, merging, and validation.
|
|
2
|
+
|
|
3
|
+
Resolution order (later wins): built-in defaults -> .claude/cache/codebase-index/config.json ->
|
|
4
|
+
environment overrides (CBX_*) -> CLI flags. A stable `config_hash` is computed over indexing-
|
|
5
|
+
relevant fields; when it changes, the indexer knows to rebuild affected rows (see SCHEMA.md).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Literal, Optional, Union
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ChunkConfig(BaseModel):
    """Chunk sizing settings.

    Every field of this model is folded into ``Config.config_hash`` through
    ``model_dump()``, so changing either value changes the hash.
    """

    # Presumably the number of source lines per chunk window -- confirm with the chunker.
    window_lines: int = 80
    # Presumably the number of lines shared by consecutive windows -- confirm with the chunker.
    overlap_lines: int = 10
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class RetrievalConfig(BaseModel):
    """Query-time retrieval defaults.

    None of these fields is part of ``Config.config_hash``.
    """

    # One of the four search modes accepted by validation.
    default_mode: Literal["hybrid", "fts", "symbol", "vector"] = "hybrid"
    # NOTE(review): looks like the reciprocal-rank-fusion constant -- confirm in retrieval/fusion.py.
    rrf_k: int = 60
    # NOTE(review): presumably the token budget for returned context -- confirm in retrieval/budget.py.
    token_budget: int = 1500
    # Default number of results.
    limit: int = 10
    # NOTE(review): snippet compaction switch and its minimum reduction ratio -- semantics
    # live in the retrieval package, not in this file.
    compact_snippets: bool = True
    compact_min_reduction: float = 0.25
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class EmbeddingsConfig(BaseModel):
    """Embedding backend selection.

    ``enabled``, ``backend`` and ``model`` are folded into ``Config.config_hash``;
    ``allow_external`` and ``endpoint`` are not.
    """

    # Which backend implementation to use; validation rejects any other value.
    backend: Literal["noop", "local", "external"] = "noop"
    # Master switch for embeddings.
    enabled: bool = False
    # Model identifier handed to the selected backend.
    model: str = "all-MiniLM-L6-v2"
    # The external backend is refused unless this is True AND an API key is present.
    allow_external: bool = False
    # URL of the external embedding service; unset by default.
    endpoint: Optional[str] = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class GraphConfig(BaseModel):
    """Limits for dependency-graph queries.

    Neither field is part of ``Config.config_hash``.
    """

    # NOTE(review): presumably the maximum traversal depth -- confirm in the graph package.
    max_depth: int = 2
    # NOTE(review): presumably the maximum number of nodes returned -- confirm in the graph package.
    node_cap: int = 40
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Config(BaseModel):
    """Top-level, validated settings for one project."""

    # Project root; ``load()`` overwrites this with the resolved absolute path.
    root: str = "."
    # Either the literal "auto" or an explicit list of language names.
    languages: Union[Literal["auto"], list[str]] = "auto"
    # Size limit in bytes (default 1 MiB).
    max_file_bytes: int = 1_048_576
    # Ignore files looked up at the project root.
    ignore_files: list[str] = [".gitignore", ".cursorignore", ".claudeignore", ".codeindexignore"]
    # Additional ignore patterns supplied through configuration.
    extra_ignore: list[str] = []
    chunk: ChunkConfig = ChunkConfig()
    retrieval: RetrievalConfig = RetrievalConfig()
    embeddings: EmbeddingsConfig = EmbeddingsConfig()
    graph: GraphConfig = GraphConfig()
    redaction: dict = {"enabled": True}

    def config_hash(self) -> str:
        """Return a stable SHA-256 hex digest of the indexing-relevant settings.

        Covered: ``root``, ``languages``, ``max_file_bytes``, ``ignore_files``,
        ``extra_ignore``, ``redaction``, the full ``chunk`` model, and the
        ``enabled``/``backend``/``model`` fields of ``embeddings``. The
        ``retrieval`` and ``graph`` sections do not affect the result.
        """
        emb = self.embeddings
        fingerprint = {
            field: getattr(self, field)
            for field in (
                "root",
                "languages",
                "max_file_bytes",
                "ignore_files",
                "extra_ignore",
                "redaction",
            )
        }
        fingerprint["chunk"] = self.chunk.model_dump()
        fingerprint["embeddings"] = {
            "enabled": emb.enabled,
            "backend": emb.backend,
            "model": emb.model,
        }
        # Sorted keys + compact separators give one canonical serialisation,
        # so the digest does not depend on dict insertion order.
        canonical = json.dumps(fingerprint, sort_keys=True, separators=(",", ":"))
        digest = hashlib.sha256()
        digest.update(canonical.encode("utf-8"))
        return digest.hexdigest()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
_ROOT_MARKERS = (".git", ".claude")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def find_root(start: Optional[Path] = None) -> Path:
    """Return the nearest ancestor of ``start`` that carries a project marker.

    The search begins at ``start`` (the current working directory when
    omitted) and moves upwards one directory at a time. A directory
    containing ``.git`` always qualifies; a directory containing ``.claude``
    qualifies unless it is the user's home directory. If no ancestor
    qualifies, the resolved start directory itself is returned.
    """
    origin = (start if start else Path.cwd()).resolve()
    home_dir = Path.home().resolve()

    for directory in [origin, *origin.parents]:
        if (directory / ".git").exists():
            return directory
        if directory == home_dir:
            # A ``.claude`` directory in the home directory is not a project marker.
            continue
        if (directory / ".claude").exists():
            return directory

    return origin
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _config_path(root: Path) -> Path:
|
|
94
|
+
return root / ".claude" / "cache" / "codebase-index" / "config.json"
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def load(root: Optional[Path] = None) -> Config:
    """Discover the project root and return the resolved, validated Config.

    Args:
        root: Project root to use. When ``None`` the root is discovered with
            ``find_root()``.

    Returns:
        A ``Config`` built from the project's ``config.json`` (when that file
        exists), with ``max_file_bytes`` overridden by the
        ``CBX_MAX_FILE_BYTES`` environment variable when set, and ``root``
        replaced by the resolved absolute root path.

    Raises:
        TypeError: The config file does not contain a JSON object.
        ValueError: The config file is not valid JSON, or
            ``CBX_MAX_FILE_BYTES`` is not an integer.
    """
    resolved_root = Path(root).resolve() if root is not None else find_root()
    data: dict = {}
    cfg_file = _config_path(resolved_root)
    if cfg_file.is_file():
        data = json.loads(cfg_file.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            # Without this check the failure surfaces later as an opaque
            # "argument after ** must be a mapping" TypeError.
            raise TypeError(
                f"{cfg_file}: expected a JSON object at the top level, "
                f"got {type(data).__name__}"
            )

    raw_limit = os.environ.get("CBX_MAX_FILE_BYTES")
    if raw_limit is not None:
        try:
            data["max_file_bytes"] = int(raw_limit)
        except ValueError as exc:
            # Name the offending variable; int()'s own message does not.
            raise ValueError(
                f"CBX_MAX_FILE_BYTES must be an integer, got {raw_limit!r}"
            ) from exc

    cfg = Config(**data)
    # The resolved root always wins over any "root" value from the file.
    cfg.root = str(resolved_root)
    return cfg
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""File discovery + ignore rules + classification.
|
|
2
|
+
|
|
3
|
+
walker.py : walk the project root, yield candidate paths.
|
|
4
|
+
ignore.py : merge .gitignore/.cursorignore/.claudeignore/.codeindexignore + built-in denylist
|
|
5
|
+
via pathspec; expose `is_ignored(path) -> bool`.
|
|
6
|
+
classify.py : language detection, binary/size/secret gates, generated-file detection.
|
|
7
|
+
|
|
8
|
+
Hard guarantee: secrets, binaries, build/dependency dirs, and oversized files never leave this
|
|
9
|
+
layer as indexable candidates. See docs/SECURITY.md §2.
|
|
10
|
+
"""
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Pure file classification helpers for discovery gates."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import PurePosixPath
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
_LANG_BY_SUFFIX = {
|
|
9
|
+
".py": "python",
|
|
10
|
+
".ts": "typescript",
|
|
11
|
+
".tsx": "typescript",
|
|
12
|
+
".js": "javascript",
|
|
13
|
+
".jsx": "javascript",
|
|
14
|
+
".mjs": "javascript",
|
|
15
|
+
".cjs": "javascript",
|
|
16
|
+
".go": "go",
|
|
17
|
+
".java": "java",
|
|
18
|
+
".rs": "rust",
|
|
19
|
+
".c": "c",
|
|
20
|
+
".h": "c",
|
|
21
|
+
".cpp": "cpp",
|
|
22
|
+
".cc": "cpp",
|
|
23
|
+
".cxx": "cpp",
|
|
24
|
+
".hpp": "cpp",
|
|
25
|
+
".hh": "cpp",
|
|
26
|
+
".hxx": "cpp",
|
|
27
|
+
".cs": "csharp",
|
|
28
|
+
".rb": "ruby",
|
|
29
|
+
".php": "php",
|
|
30
|
+
".kt": "kotlin",
|
|
31
|
+
".kts": "kotlin",
|
|
32
|
+
".lua": "lua",
|
|
33
|
+
".md": "markdown",
|
|
34
|
+
".json": "json",
|
|
35
|
+
".yml": "yaml",
|
|
36
|
+
".yaml": "yaml",
|
|
37
|
+
".toml": "toml",
|
|
38
|
+
".sql": "sql",
|
|
39
|
+
# Config / IaC (Tier C: line-chunk + FTS, no tree-sitter spec). These were already
|
|
40
|
+
# indexed as unknown-language text; labeling them surfaces infra files in `stats`
|
|
41
|
+
# and lets agents scope searches to config without a tree-sitter grammar.
|
|
42
|
+
".tf": "terraform",
|
|
43
|
+
".tfvars": "terraform",
|
|
44
|
+
".hcl": "hcl",
|
|
45
|
+
".ini": "ini",
|
|
46
|
+
".cfg": "ini",
|
|
47
|
+
".conf": "ini",
|
|
48
|
+
".properties": "ini",
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
# Extension-less or specially-named config/IaC files, matched on the lowercased
|
|
52
|
+
# filename (and a `name.suffix` form, e.g. `web.Dockerfile`). Kept separate from
|
|
53
|
+
# the suffix table because these carry their identity in the name, not the suffix.
|
|
54
|
+
_LANG_BY_NAME = {
|
|
55
|
+
"dockerfile": "dockerfile",
|
|
56
|
+
"containerfile": "dockerfile",
|
|
57
|
+
"makefile": "make",
|
|
58
|
+
"gnumakefile": "make",
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
# Authoritative set of *code* languages routed to tree-sitter (Guardrail 1). Every entry MUST
|
|
62
|
+
# have a working extraction path — a Tier-A LangSpec or the Tier-B generic walker. This is
|
|
63
|
+
# enforced by tests/test_multilang_symbols.py (registry consistency), so the two registries
|
|
64
|
+
# cannot silently drift. Note: yaml/json/markdown/toml/sql have grammars too but are *data/prose*
|
|
65
|
+
# (Tier C) and deliberately stay on the line-chunk + FTS floor.
|
|
66
|
+
#
|
|
67
|
+
# `lua` here has no Tier-A spec on purpose: it exercises the Tier-B generic path end-to-end.
|
|
68
|
+
_TREE_SITTER_LANGS = {
|
|
69
|
+
"python",
|
|
70
|
+
"typescript",
|
|
71
|
+
"javascript",
|
|
72
|
+
"go",
|
|
73
|
+
"java",
|
|
74
|
+
"rust",
|
|
75
|
+
"c",
|
|
76
|
+
"cpp",
|
|
77
|
+
"csharp",
|
|
78
|
+
"ruby",
|
|
79
|
+
"php",
|
|
80
|
+
"kotlin",
|
|
81
|
+
"lua",
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
_SECRET_NAMES = {
|
|
85
|
+
".env",
|
|
86
|
+
"id_rsa",
|
|
87
|
+
"id_ed25519",
|
|
88
|
+
"credentials.json",
|
|
89
|
+
"service-account.json",
|
|
90
|
+
"secrets.json",
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
_SECRET_SUFFIXES = (".pem", ".key", ".p12", ".pfx")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def detect_language(path: str) -> Optional[str]:
    """Map a file path to a language label, or ``None`` when unrecognised.

    Lookup order:
      1. the lowercased suffix in ``_LANG_BY_SUFFIX``;
      2. the lowercased file name in ``_LANG_BY_NAME`` (e.g. ``Dockerfile``);
      3. the suffix without its dot in ``_LANG_BY_NAME`` (e.g. ``web.Dockerfile``).

    Backslashes are treated as path separators, matching ``is_test_path``, so
    a Windows-style path such as ``docker\\Dockerfile`` is classified by its
    final component.
    """
    pure = PurePosixPath(path.replace("\\", "/"))
    suffix = pure.suffix.lower()
    if suffix:
        lang = _LANG_BY_SUFFIX.get(suffix)
        if lang is not None:
            return lang

    name = pure.name.lower()
    if name in _LANG_BY_NAME:
        return _LANG_BY_NAME[name]

    # `web.Dockerfile`, `base.dockerfile`, etc.: identity is the suffix-as-name.
    if suffix and suffix[1:] in _LANG_BY_NAME:
        return _LANG_BY_NAME[suffix[1:]]
    return None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def parser_for(lang: Optional[str]) -> str:
    """Return ``"treesitter"`` for languages in ``_TREE_SITTER_LANGS``, else ``"line"``."""
    if lang in _TREE_SITTER_LANGS:
        return "treesitter"
    return "line"
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def is_secret_filename(path: str) -> bool:
    """Return True when the file name marks the file as a likely secret.

    A file matches when its lowercased name is listed in ``_SECRET_NAMES``,
    starts with ``.env.`` (for example ``.env.local``), or ends with one of
    ``_SECRET_SUFFIXES``.

    Backslashes are treated as path separators, matching ``is_test_path``.
    Without that, ``PurePosixPath("cfg\\.env").name`` is the whole string and
    the exact-name and ``.env.`` checks would miss the file.
    """
    name = PurePosixPath(path.replace("\\", "/")).name.lower()
    if name in _SECRET_NAMES or name.startswith(".env."):
        return True
    return name.endswith(_SECRET_SUFFIXES)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def looks_binary(data: bytes) -> bool:
    """Return True when ``data`` contains at least one NUL byte."""
    nul = b"\x00"
    return nul in data
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def is_generated(path: str) -> bool:
    """Return True when the file name follows a generated/minified convention.

    Matches a lowercased file name that contains ``.generated.`` or ends with
    ``.generated``, ``.min.js`` or ``.min.css``.

    Backslashes are treated as path separators, matching ``is_test_path``, so
    only the final component is inspected. Otherwise a directory such as
    ``pkg.generated.out\\`` would mark every file inside it as generated.
    """
    name = PurePosixPath(path.replace("\\", "/")).name.lower()
    return ".generated." in name or name.endswith((".generated", ".min.js", ".min.css"))
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# Directory names that mark a test tree, and filename patterns for test modules.
|
|
138
|
+
# Matched on whole path segments / filename stems — NOT a bare substring — so
|
|
139
|
+
# `contest/`, `latest.py`, or `testimonials.ts` are never mistaken for tests.
|
|
140
|
+
_TEST_DIRS = {"test", "tests", "__tests__", "__test__", "testing", "spec", "specs", "e2e"}


def is_test_path(path: str) -> bool:
    """Return True when ``path`` lies in a test directory or names a test module.

    A path qualifies when any directory segment (compared case-insensitively)
    is in ``_TEST_DIRS``, when the part of the file name before its first dot
    is ``test``, starts with ``test_`` or ends with ``_test``, or when the
    file name contains ``.test.`` or ``.spec.``. Backslashes are treated as
    path separators.
    """
    normalized = PurePosixPath(path.replace("\\", "/"))

    # Only whole directory segments count; the final component is the file name.
    for directory in normalized.parts[:-1]:
        if directory.lower() in _TEST_DIRS:
            return True

    filename = normalized.name.lower()
    stem, _, _rest = filename.partition(".")
    if stem == "test":
        return True
    if stem.startswith("test_") or stem.endswith("_test"):
        return True
    return ".test." in filename or ".spec." in filename
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Layer built-in deny rules with root-level ignore files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pathspec
|
|
8
|
+
|
|
9
|
+
DEFAULT_IGNORE_FILES = [".gitignore", ".cursorignore", ".claudeignore", ".codeindexignore"]
|
|
10
|
+
|
|
11
|
+
BUILTIN_DENYLIST = [
|
|
12
|
+
".git/",
|
|
13
|
+
".hg/",
|
|
14
|
+
".svn/",
|
|
15
|
+
".claude/cache/codebase-index/",
|
|
16
|
+
"node_modules/",
|
|
17
|
+
"__pycache__/",
|
|
18
|
+
".pytest_cache/",
|
|
19
|
+
".mypy_cache/",
|
|
20
|
+
".ruff_cache/",
|
|
21
|
+
".venv/",
|
|
22
|
+
"venv/",
|
|
23
|
+
"build/",
|
|
24
|
+
"dist/",
|
|
25
|
+
"target/",
|
|
26
|
+
".next/",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
BUILTIN_DENY_DIRS = {p.rstrip("/") for p in BUILTIN_DENYLIST if p.endswith("/")}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class IgnoreMatcher:
    """Gitignore-style matcher for root-level ignore files and built-in denylist."""

    def __init__(self, patterns: list[str]) -> None:
        """Compile ``patterns`` (gitignore syntax) into a single path spec."""
        self._spec = pathspec.PathSpec.from_lines("gitignore", patterns)

    @classmethod
    def from_root(
        cls,
        root: Path,
        *,
        ignore_files: list[str] | None = None,
        extra_ignore: list[str] | None = None,
    ) -> "IgnoreMatcher":
        """Build a matcher from the built-in denylist plus the project's ignore files.

        Patterns are layered in this order: ``BUILTIN_DENYLIST``, then the
        lines of every existing ignore file directly under ``root``, then
        ``extra_ignore``.

        Args:
            root: Directory in which the ignore files are looked up.
            ignore_files: Ignore-file names to read. ``None`` selects
                ``DEFAULT_IGNORE_FILES``; an explicit empty list reads none.
            extra_ignore: Additional patterns appended last.
        """
        patterns = list(BUILTIN_DENYLIST)
        # Distinguish "not provided" from "explicitly empty": `ignore_files or
        # DEFAULT_IGNORE_FILES` silently re-enabled the defaults for `[]`.
        names = DEFAULT_IGNORE_FILES if ignore_files is None else ignore_files
        for ignore_file in names:
            path = root / ignore_file
            if path.is_file():
                patterns.extend(path.read_text(encoding="utf-8").splitlines())
        patterns.extend(extra_ignore or [])
        return cls(patterns)

    def is_ignored(self, rel_path: str) -> bool:
        """Return True when ``rel_path`` matches any pattern (backslashes normalised to ``/``)."""
        return self._spec.match_file(rel_path.replace("\\", "/"))

    def is_ignored_dir(self, dirname: str) -> bool:
        """Return True when the bare directory name is one of the built-in denied directories."""
        return dirname in BUILTIN_DENY_DIRS
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Walk the project root and yield indexable candidates."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Iterator, Optional
|
|
9
|
+
|
|
10
|
+
from ..config import Config
|
|
11
|
+
from . import classify
|
|
12
|
+
from .ignore import IgnoreMatcher
|
|
13
|
+
|
|
14
|
+
_BINARY_SNIFF_BYTES = 4096
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class Candidate:
    """One file that passed the discovery gates applied by ``walk``."""

    # Absolute path of the file on disk.
    path: Path
    # POSIX-style path relative to the walked root.
    rel_path: str
    # File size in bytes, taken from ``stat``.
    size_bytes: int
    # Detected language label, or None when unrecognised.
    lang: Optional[str]
    # Parser selected for the language (value of ``classify.parser_for``).
    parser: str
    # True when the file name follows a generated/minified convention.
    is_generated: bool
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def walk(root: Path, config: Config) -> Iterator[Candidate]:
    """Yield every indexable file beneath ``root``.

    Directories are pruned in place when they are on the built-in deny list
    or match an ignore pattern. A file is skipped when it matches an ignore
    pattern, has a secret-looking name, cannot be stat'ed or opened, is
    larger than ``config.max_file_bytes``, or contains a NUL byte within its
    first ``_BINARY_SNIFF_BYTES`` bytes.

    Entries whose resolved location is outside ``root`` (for example a
    symlink pointing elsewhere) are skipped as well. Previously such an entry
    made ``_rel`` raise ``ValueError`` and aborted the whole walk.

    Args:
        root: Directory to walk; resolved before use.
        config: Supplies ``ignore_files``, ``extra_ignore`` and ``max_file_bytes``.

    Yields:
        A ``Candidate`` for each file that passes every gate.
    """
    root = Path(root).resolve()
    matcher = IgnoreMatcher.from_root(
        root,
        ignore_files=config.ignore_files,
        extra_ignore=config.extra_ignore,
    )

    def rel_or_none(path: Path) -> Optional[str]:
        # ValueError: the resolved path is not under root (relative_to).
        # RuntimeError/OSError: resolve() could not resolve the path, e.g. a
        # symlink loop on Python versions before 3.13.
        try:
            return _rel(root, path)
        except (ValueError, RuntimeError, OSError):
            return None

    for dirpath, dirnames, filenames in os.walk(root):
        base = Path(dirpath)

        # Assigning to dirnames[:] prunes the directories os.walk descends into.
        kept = []
        for d in dirnames:
            if matcher.is_ignored_dir(d):
                continue
            rel_dir = rel_or_none(base / d)
            if rel_dir is None or matcher.is_ignored(rel_dir + "/"):
                continue
            kept.append(d)
        dirnames[:] = kept

        for fname in filenames:
            abs_path = base / fname
            rel = rel_or_none(abs_path)
            if rel is None:
                continue

            if matcher.is_ignored(rel) or classify.is_secret_filename(rel):
                continue
            try:
                size = abs_path.stat().st_size
            except OSError:
                continue
            if size > config.max_file_bytes:
                continue
            try:
                with abs_path.open("rb") as fh:
                    head = fh.read(_BINARY_SNIFF_BYTES)
            except OSError:
                continue
            if classify.looks_binary(head):
                continue

            lang = classify.detect_language(rel)
            yield Candidate(
                path=abs_path,
                rel_path=rel,
                size_bytes=size,
                lang=lang,
                parser=classify.parser_for(lang),
                is_generated=classify.is_generated(rel),
            )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _rel(root: Path, path: Path) -> str:
|
|
75
|
+
return path.resolve().relative_to(root).as_posix()
|
codebase_index/doctor.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Safety / health self-check (docs/SECURITY.md §6).
|
|
2
|
+
|
|
3
|
+
M8 scope: report enabled `codebase-index` hooks, whether the cache is gitignored, and
|
|
4
|
+
index freshness. The fuller checklist (indexed-secret leak scan, oversized/binary audit,
|
|
5
|
+
permissions, allowed-tools diff) is M9.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Literal
|
|
13
|
+
|
|
14
|
+
from . import scaffold
|
|
15
|
+
from .config import Config
|
|
16
|
+
|
|
17
|
+
Severity = Literal["high", "medium", "info"]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class Finding:
    """Outcome of a single doctor check."""

    # Stable identifier of the check, e.g. "cache_gitignored".
    id: str
    # True when the check passed.
    ok: bool
    # One of "high", "medium" or "info".
    severity: Severity
    # Human-readable status or remediation hint.
    detail: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def run_doctor(root: Path, config: Config) -> list[Finding]:
    """Run the health checks for ``root`` and return one ``Finding`` per check.

    The findings are emitted in this order: ``cache_gitignored``,
    ``hooks_enabled``, ``index_fresh``, ``symbol_extraction`` and
    ``graph_coverage``.

    The index database is opened only when it exists. When there is no index,
    ``index_fresh`` fails and the two coverage checks are evaluated against
    an empty coverage list, so the doctor never opens a missing database file.

    Args:
        root: Project root directory.
        config: Resolved configuration, passed to the freshness computation.
    """
    root = Path(root)
    findings: list[Finding] = []

    # 1. Is the cache gitignored? (committing the index can leak code/secrets.)
    #    This is a plain substring test against the .gitignore text.
    gitignore = root / ".gitignore"
    covered = (
        gitignore.is_file()
        and scaffold._CACHE_IGNORE_LINE in gitignore.read_text(encoding="utf-8")
    )
    findings.append(
        Finding(
            id="cache_gitignored",
            ok=covered,
            severity="high",
            detail=(
                "cache is gitignored"
                if covered
                else f"add '{scaffold._CACHE_IGNORE_LINE}' to .gitignore (run `init`)"
            ),
        )
    )

    # 2. Which auto-update hooks are enabled? (informational; hooks run on every edit.)
    hooks = scaffold.enabled_hooks(root)
    findings.append(
        Finding(
            id="hooks_enabled",
            ok=bool(hooks),
            severity="info",
            detail="; ".join(hooks) if hooks else "no auto-update hook (run `init --with-hooks`)",
        )
    )

    # 3. Index freshness.
    db_path = root / scaffold.CACHE_REL / "index.sqlite"
    has_index = db_path.exists()
    if not has_index:
        findings.append(
            Finding("index_fresh", ok=False, severity="medium", detail="no index (run `index`)")
        )
    else:
        from .indexer.freshness import compute_freshness
        from .storage.db import Database

        with Database(db_path) as db:
            fr = compute_freshness(db.conn, root, config)
        findings.append(
            Finding(
                id="index_fresh",
                ok=not fr.stale,
                severity="medium",
                detail=(
                    "index is fresh"
                    if not fr.stale
                    else f"{fr.files_changed_since_build} file(s) changed — run `update`"
                ),
            )
        )

    # 4. Symbol-extraction coverage (Guardrail 2): a tree-sitter language with many files
    #    but ~0 symbols means extraction silently failed (the original Java bug).
    coverage: list = []
    dead: list = []
    if has_index:
        from .storage import repo
        from .storage.db import Database

        with Database(db_path) as db:
            coverage = repo.treesitter_coverage(db.conn)
            dead = [
                r["lang"]
                for r in coverage
                if r["files"] >= _ZERO_SYMBOL_FILE_THRESHOLD and (r["symbols"] or 0) == 0
            ]
    findings.append(
        Finding(
            id="symbol_extraction",
            ok=not dead,
            severity="medium",
            detail=(
                "tree-sitter languages extract symbols"
                if not dead
                else f"no symbols extracted for tree-sitter language(s): {', '.join(dead)} "
                "— extraction path likely broken"
            ),
        )
    )

    # 5. Dependency-graph coverage: Tier-B languages (grammar but no hand-tuned spec)
    #    yield symbols but no import/inheritance edges, so refs/impact undercount.
    from .parsers.languages import has_full_graph

    tier_b = sorted({r["lang"] for r in coverage if not has_full_graph(r["lang"])})
    findings.append(
        Finding(
            id="graph_coverage",
            ok=True,
            severity="info",
            detail=(
                "all indexed languages have full dependency-graph support"
                if not tier_b
                else f"partial dependency graph for Tier-B language(s): {', '.join(tier_b)} "
                "— refs/impact may undercount (confirm with Grep)"
            ),
        )
    )

    return findings
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# Threshold above which a tree-sitter language with zero symbols is treated as broken rather
|
|
133
|
+
# than just a tiny/empty repo.
|
|
134
|
+
_ZERO_SYMBOL_FILE_THRESHOLD = 3
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def has_high_severity_failure(findings: list[Finding]) -> bool:
    """Return True when at least one ``high`` severity finding did not pass."""
    for finding in findings:
        if finding.severity == "high" and not finding.ok:
            return True
    return False
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# src/codebase_index/embeddings/backend.py
|
|
2
|
+
"""Embedding backend protocol + the single gating factory.
|
|
3
|
+
|
|
4
|
+
`resolve_backend` is the ONLY place a backend is constructed. It enforces
|
|
5
|
+
SECURITY.md §4: external backends are refused unless `allow_external = true`,
|
|
6
|
+
an API key is present in the environment, AND a warning naming the endpoint is
|
|
7
|
+
emitted. When embeddings are disabled the factory returns a NoopBackend and
|
|
8
|
+
imports no optional dependency.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
from typing import Callable, Protocol, runtime_checkable
|
|
15
|
+
|
|
16
|
+
API_KEY_ENV = "CBX_EMBEDDINGS_API_KEY"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EmbeddingError(RuntimeError):
    """Signals that embeddings are misconfigured or refused, or that a backend cannot be used."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@runtime_checkable
class EmbeddingBackend(Protocol):
    """Structural interface that every embedding backend provides.

    The protocol is runtime-checkable, so ``isinstance`` can be used to test
    for the presence of these members.
    """

    # Whether the backend is enabled.
    enabled: bool
    # Backend name.
    name: str
    # Length of each vector returned by ``embed``.
    dim: int

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one vector (length == `dim`) per input text."""
        ...
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def resolve_backend(cfg, warn: Callable[[str], None] = lambda _m: None) -> "EmbeddingBackend":
    """Build the backend selected by ``cfg.embeddings``, enforcing every gate.

    Args:
        cfg: Object exposing an ``embeddings`` section with ``enabled``,
            ``backend``, ``model``, ``allow_external`` and ``endpoint``.
        warn: Callback that receives the external-embeddings warning text.
            The default discards it.

    Returns:
        A ``NoopBackend`` when embeddings are disabled or the backend is
        ``"noop"``, a ``LocalBackend`` for ``"local"``, or an
        ``ExternalBackend`` for ``"external"`` once all gates have passed.

    Raises:
        EmbeddingError: The external backend is not allowed, the API key
            environment variable is missing or empty, no endpoint is
            configured, or the backend name is unknown.
    """
    settings = cfg.embeddings

    # Backend modules are imported lazily so that only the selected one is loaded.
    if not settings.enabled or settings.backend == "noop":
        from .noop import NoopBackend

        return NoopBackend()

    kind = settings.backend
    if kind == "local":
        from .local import LocalBackend

        return LocalBackend(model_name=settings.model)

    if kind != "external":
        raise EmbeddingError(f"Unknown embeddings.backend: {kind!r}")

    # External backend: every gate must pass before anything is constructed.
    if not settings.allow_external:
        raise EmbeddingError(
            "External embeddings require embeddings.allow_external = true (SECURITY.md §4)."
        )

    api_key = os.environ.get(API_KEY_ENV)
    if not api_key:
        raise EmbeddingError(
            f"External embeddings require an API key in ${API_KEY_ENV} (SECURITY.md §4)."
        )

    if not settings.endpoint:
        raise EmbeddingError("External embeddings require embeddings.endpoint to be set.")

    # The warning names the endpoint that will receive chunk text.
    warn(
        f"[codebase-index] EXTERNAL EMBEDDINGS ENABLED — chunk text will be sent to "
        f"{settings.endpoint}. Disable with embeddings.backend=local|noop."
    )
    from .external import ExternalBackend

    return ExternalBackend(endpoint=settings.endpoint, api_key=api_key, model_name=settings.model)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# src/codebase_index/embeddings/external.py
|
|
2
|
+
"""External embedding API backend. Constructed ONLY via resolve_backend after the
|
|
3
|
+
SECURITY.md §4 gates pass. The network call is isolated in a transport callable so
|
|
4
|
+
it can be tested without hitting the network and swapped per provider.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
from urllib.request import Request, urlopen
|
|
12
|
+
|
|
13
|
+
from .backend import EmbeddingError
|
|
14
|
+
|
|
15
|
+
Transport = Callable[[str, str, str, list[str]], list[list[float]]]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _http_transport(endpoint: str, api_key: str, model: str, texts: list[str]) -> list[list[float]]:
    """POST ``texts`` to ``endpoint`` and return the embeddings from the reply.

    The request body is JSON with ``model`` and ``input`` keys and carries a
    bearer ``Authorization`` header. The reply must be a JSON object whose
    ``data`` list holds items with an ``embedding`` entry; those entries are
    returned in the order received. The request times out after 30 seconds.
    """
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    encoded = json.dumps({"model": model, "input": texts}).encode("utf-8")
    req = Request(endpoint, data=encoded, headers=request_headers, method="POST")

    with urlopen(req, timeout=30) as resp:
        raw = resp.read()

    payload = json.loads(raw.decode("utf-8"))
    vectors = []
    for item in payload["data"]:
        vectors.append(item["embedding"])
    return vectors
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ExternalBackend:
    """Embedding backend that delegates to an external API through a transport callable.

    ``dim`` starts at 0 and is set to the vector length on each successful
    ``embed`` call.
    """

    enabled = True
    dim: int = 0

    def __init__(
        self,
        *,
        endpoint: str,
        api_key: str,
        model_name: str,
        transport: Optional[Transport] = None,
    ) -> None:
        """Store the connection settings.

        Args:
            endpoint: URL passed to the transport.
            api_key: Credential passed to the transport.
            model_name: Model identifier; also used to build ``name``.
            transport: Callable performing the request. Defaults to
                ``_http_transport``; a fake can be supplied instead.
        """
        self.name = f"external:{model_name}"
        self.model_name = model_name
        self._endpoint = endpoint
        self._api_key = api_key
        self._transport = transport or _http_transport

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one float vector per entry of ``texts``.

        An empty ``texts`` returns ``[]`` without calling the transport.

        Raises:
            EmbeddingError: The transport returned no vectors, a different
                number of vectors than inputs, or vectors of unequal length.
        """
        if not texts:
            return []
        batch = list(texts)
        vecs = self._transport(self._endpoint, self._api_key, self.model_name, batch)
        if not vecs or not vecs[0]:
            raise EmbeddingError("External embedding endpoint returned no vectors.")
        # A short or long reply would silently pair vectors with the wrong texts.
        if len(vecs) != len(batch):
            raise EmbeddingError(
                f"External embedding endpoint returned {len(vecs)} vector(s) "
                f"for {len(batch)} input(s)."
            )
        dim = len(vecs[0])
        if any(len(v) != dim for v in vecs):
            raise EmbeddingError(
                "External embedding endpoint returned vectors of inconsistent length."
            )
        self.dim = dim
        return [[float(x) for x in v] for v in vecs]
|