codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. codebase_index/__init__.py +7 -0
  2. codebase_index/__main__.py +3 -0
  3. codebase_index/cli.py +916 -0
  4. codebase_index/config.py +110 -0
  5. codebase_index/discovery/__init__.py +10 -0
  6. codebase_index/discovery/classify.py +151 -0
  7. codebase_index/discovery/ignore.py +58 -0
  8. codebase_index/discovery/walker.py +75 -0
  9. codebase_index/doctor.py +138 -0
  10. codebase_index/embeddings/__init__.py +2 -0
  11. codebase_index/embeddings/backend.py +67 -0
  12. codebase_index/embeddings/external.py +56 -0
  13. codebase_index/embeddings/local.py +41 -0
  14. codebase_index/embeddings/noop.py +15 -0
  15. codebase_index/graph/__init__.py +8 -0
  16. codebase_index/graph/analysis.py +468 -0
  17. codebase_index/graph/builder.py +160 -0
  18. codebase_index/graph/expand.py +136 -0
  19. codebase_index/graph/export.py +381 -0
  20. codebase_index/graph/navigate.py +201 -0
  21. codebase_index/indexer/__init__.py +8 -0
  22. codebase_index/indexer/doc_chunks.py +202 -0
  23. codebase_index/indexer/freshness.py +109 -0
  24. codebase_index/indexer/pipeline.py +423 -0
  25. codebase_index/mcp/__init__.py +2 -0
  26. codebase_index/mcp/server.py +354 -0
  27. codebase_index/models.py +145 -0
  28. codebase_index/output/__init__.py +6 -0
  29. codebase_index/output/json.py +13 -0
  30. codebase_index/output/markdown.py +316 -0
  31. codebase_index/output/redact.py +31 -0
  32. codebase_index/parsers/__init__.py +9 -0
  33. codebase_index/parsers/base.py +47 -0
  34. codebase_index/parsers/languages.py +290 -0
  35. codebase_index/parsers/line_chunker.py +39 -0
  36. codebase_index/parsers/symbol_chunks.py +62 -0
  37. codebase_index/parsers/treesitter.py +439 -0
  38. codebase_index/retrieval/__init__.py +9 -0
  39. codebase_index/retrieval/budget.py +82 -0
  40. codebase_index/retrieval/fusion.py +62 -0
  41. codebase_index/retrieval/intent.py +56 -0
  42. codebase_index/retrieval/pipeline.py +207 -0
  43. codebase_index/retrieval/rerank.py +69 -0
  44. codebase_index/retrieval/searchers.py +291 -0
  45. codebase_index/retrieval/skeleton.py +251 -0
  46. codebase_index/retrieval/types.py +79 -0
  47. codebase_index/scaffold.py +399 -0
  48. codebase_index/service.py +158 -0
  49. codebase_index/skill_template/SKILL.md +198 -0
  50. codebase_index/skill_template/examples/hooks/settings.json +16 -0
  51. codebase_index/skill_template/scripts/cbx +25 -0
  52. codebase_index/skill_template/scripts/cbx.ps1 +25 -0
  53. codebase_index/skill_update.py +150 -0
  54. codebase_index/storage/__init__.py +8 -0
  55. codebase_index/storage/db.py +116 -0
  56. codebase_index/storage/repo.py +701 -0
  57. codebase_index/storage/schema.sql +125 -0
  58. codebase_index/watch/__init__.py +5 -0
  59. codebase_index/watch/watcher.py +93 -0
  60. codebase_index-1.6.0.dist-info/METADATA +748 -0
  61. codebase_index-1.6.0.dist-info/RECORD +64 -0
  62. codebase_index-1.6.0.dist-info/WHEEL +4 -0
  63. codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
  64. codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,110 @@
1
+ """Configuration loading, merging, and validation.
2
+
3
+ Resolution order (later wins): built-in defaults -> .claude/cache/codebase-index/config.json ->
4
+ environment overrides (CBX_*) -> CLI flags. A stable `config_hash` is computed over indexing-
5
+ relevant fields; when it changes, the indexer knows to rebuild affected rows (see SCHEMA.md).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import json
12
+ import os
13
+ from pathlib import Path
14
+ from typing import Literal, Optional, Union
15
+
16
+ from pydantic import BaseModel
17
+
18
+
19
class ChunkConfig(BaseModel):
    """Chunk-window settings; dumped wholesale into `Config.config_hash`."""

    # Both values are line counts (per the field names); presumably consumed by
    # the line chunker -- confirm against parsers/line_chunker.py.
    window_lines: int = 80
    overlap_lines: int = 10
22
+
23
+
24
class RetrievalConfig(BaseModel):
    """Query-time retrieval defaults.

    None of these fields take part in `Config.config_hash`, so changing them
    does not change the hash that drives rebuild decisions.
    """

    # Search strategy used when the caller does not choose one.
    default_mode: Literal["hybrid", "fts", "symbol", "vector"] = "hybrid"
    # NOTE(review): presumably the `k` constant of reciprocal-rank fusion -- confirm
    # against retrieval/fusion.py.
    rrf_k: int = 60
    token_budget: int = 1500
    limit: int = 10
    compact_snippets: bool = True
    compact_min_reduction: float = 0.25
31
+
32
+
33
class EmbeddingsConfig(BaseModel):
    """Embedding backend selection; everything is off by default."""

    backend: Literal["noop", "local", "external"] = "noop"
    enabled: bool = False
    model: str = "all-MiniLM-L6-v2"
    # external backend refused unless this is True AND a key is present
    allow_external: bool = False
    # URL of the external embedding service; only consulted for backend="external".
    endpoint: Optional[str] = None
39
+
40
+
41
class GraphConfig(BaseModel):
    """Limits for graph expansion (not part of `Config.config_hash`)."""

    # NOTE(review): units inferred from the names (hops / node count) -- confirm
    # against graph/expand.py.
    max_depth: int = 2
    node_cap: int = 40
44
+
45
+
46
class Config(BaseModel):
    """Top-level configuration model with one sub-model per subsystem."""

    root: str = "."
    languages: Union[Literal["auto"], list[str]] = "auto"
    max_file_bytes: int = 1_048_576
    ignore_files: list[str] = [".gitignore", ".cursorignore", ".claudeignore", ".codeindexignore"]
    extra_ignore: list[str] = []
    chunk: ChunkConfig = ChunkConfig()
    retrieval: RetrievalConfig = RetrievalConfig()
    embeddings: EmbeddingsConfig = EmbeddingsConfig()
    graph: GraphConfig = GraphConfig()
    redaction: dict = {"enabled": True}

    def config_hash(self) -> str:
        """Return a SHA-256 hex digest over the indexing-relevant settings.

        Covered: root, languages, max_file_bytes, ignore_files, extra_ignore,
        redaction, the whole `chunk` section, and the `enabled` / `backend` /
        `model` fields of `embeddings`. Retrieval and graph settings are left out.
        """
        payload = {
            field: getattr(self, field)
            for field in (
                "root",
                "languages",
                "max_file_bytes",
                "ignore_files",
                "extra_ignore",
                "redaction",
            )
        }
        payload["chunk"] = self.chunk.model_dump()
        emb = self.embeddings
        payload["embeddings"] = {
            "enabled": emb.enabled,
            "backend": emb.backend,
            "model": emb.model,
        }
        # sort_keys + compact separators give a canonical serialization, so the
        # digest does not depend on the order in which the payload was assembled.
        canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
76
+
77
+
78
+ _ROOT_MARKERS = (".git", ".claude")
79
+
80
+
81
+ def find_root(start: Optional[Path] = None) -> Path:
82
+ """Find the nearest project root marker, or fall back to the start directory."""
83
+ start = (start or Path.cwd()).resolve()
84
+ home = Path.home().resolve()
85
+ for candidate in (start, *start.parents):
86
+ if (candidate / ".git").exists():
87
+ return candidate
88
+ if candidate != home and (candidate / ".claude").exists():
89
+ return candidate
90
+ return start
91
+
92
+
93
+ def _config_path(root: Path) -> Path:
94
+ return root / ".claude" / "cache" / "codebase-index" / "config.json"
95
+
96
+
97
def load(root: Optional[Path] = None) -> Config:
    """Discover the project root and return the resolved, validated Config.

    Settings are layered: model defaults, then the project's config.json (when
    present), then the `CBX_MAX_FILE_BYTES` environment variable. `root` on the
    returned config is always the resolved project root, whatever the file says.

    Args:
        root: Project root to use. When None, the root is discovered with
            `find_root()` starting from the current working directory.

    Raises:
        TypeError: config.json does not contain a JSON object at the top level.
        ValueError: config.json is not valid JSON, or `CBX_MAX_FILE_BYTES` is
            not an integer.
    """
    resolved_root = Path(root).resolve() if root is not None else find_root()
    data: dict = {}
    cfg_file = _config_path(resolved_root)
    if cfg_file.is_file():
        loaded = json.loads(cfg_file.read_text(encoding="utf-8"))
        # `Config(**loaded)` would also fail on a list/str/null, but with a message
        # that never mentions the file; fail early and name the culprit instead.
        if not isinstance(loaded, dict):
            raise TypeError(
                f"{cfg_file}: expected a JSON object at the top level, "
                f"got {type(loaded).__name__}"
            )
        data = loaded

    raw_max = os.environ.get("CBX_MAX_FILE_BYTES")
    if raw_max is not None:
        try:
            data["max_file_bytes"] = int(raw_max)
        except ValueError as exc:
            raise ValueError(
                f"CBX_MAX_FILE_BYTES must be an integer, got {raw_max!r}"
            ) from exc

    cfg = Config(**data)
    cfg.root = str(resolved_root)
    return cfg
@@ -0,0 +1,10 @@
1
+ """File discovery + ignore rules + classification.
2
+
3
+ walker.py : walk the project root, yield candidate paths.
4
+ ignore.py : merge .gitignore/.cursorignore/.claudeignore/.codeindexignore + built-in denylist
5
+ via pathspec; expose `is_ignored(path) -> bool`.
6
+ classify.py : language detection, binary/size/secret gates, generated-file detection.
7
+
8
+ Hard guarantee: secrets, binaries, build/dependency dirs, and oversized files never leave this
9
+ layer as indexable candidates. See docs/SECURITY.md §2.
10
+ """
@@ -0,0 +1,151 @@
1
+ """Pure file classification helpers for discovery gates."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import PurePosixPath
6
+ from typing import Optional
7
+
8
+ _LANG_BY_SUFFIX = {
9
+ ".py": "python",
10
+ ".ts": "typescript",
11
+ ".tsx": "typescript",
12
+ ".js": "javascript",
13
+ ".jsx": "javascript",
14
+ ".mjs": "javascript",
15
+ ".cjs": "javascript",
16
+ ".go": "go",
17
+ ".java": "java",
18
+ ".rs": "rust",
19
+ ".c": "c",
20
+ ".h": "c",
21
+ ".cpp": "cpp",
22
+ ".cc": "cpp",
23
+ ".cxx": "cpp",
24
+ ".hpp": "cpp",
25
+ ".hh": "cpp",
26
+ ".hxx": "cpp",
27
+ ".cs": "csharp",
28
+ ".rb": "ruby",
29
+ ".php": "php",
30
+ ".kt": "kotlin",
31
+ ".kts": "kotlin",
32
+ ".lua": "lua",
33
+ ".md": "markdown",
34
+ ".json": "json",
35
+ ".yml": "yaml",
36
+ ".yaml": "yaml",
37
+ ".toml": "toml",
38
+ ".sql": "sql",
39
+ # Config / IaC (Tier C: line-chunk + FTS, no tree-sitter spec). These were already
40
+ # indexed as unknown-language text; labeling them surfaces infra files in `stats`
41
+ # and lets agents scope searches to config without a tree-sitter grammar.
42
+ ".tf": "terraform",
43
+ ".tfvars": "terraform",
44
+ ".hcl": "hcl",
45
+ ".ini": "ini",
46
+ ".cfg": "ini",
47
+ ".conf": "ini",
48
+ ".properties": "ini",
49
+ }
50
+
51
+ # Extension-less or specially-named config/IaC files, matched on the lowercased
52
+ # filename (and a `name.suffix` form, e.g. `web.Dockerfile`). Kept separate from
53
+ # the suffix table because these carry their identity in the name, not the suffix.
54
+ _LANG_BY_NAME = {
55
+ "dockerfile": "dockerfile",
56
+ "containerfile": "dockerfile",
57
+ "makefile": "make",
58
+ "gnumakefile": "make",
59
+ }
60
+
61
+ # Authoritative set of *code* languages routed to tree-sitter (Guardrail 1). Every entry MUST
62
+ # have a working extraction path — a Tier-A LangSpec or the Tier-B generic walker. This is
63
+ # enforced by tests/test_multilang_symbols.py (registry consistency), so the two registries
64
+ # cannot silently drift. Note: yaml/json/markdown/toml/sql have grammars too but are *data/prose*
65
+ # (Tier C) and deliberately stay on the line-chunk + FTS floor.
66
+ #
67
+ # `lua` here has no Tier-A spec on purpose: it exercises the Tier-B generic path end-to-end.
68
+ _TREE_SITTER_LANGS = {
69
+ "python",
70
+ "typescript",
71
+ "javascript",
72
+ "go",
73
+ "java",
74
+ "rust",
75
+ "c",
76
+ "cpp",
77
+ "csharp",
78
+ "ruby",
79
+ "php",
80
+ "kotlin",
81
+ "lua",
82
+ }
83
+
84
+ _SECRET_NAMES = {
85
+ ".env",
86
+ "id_rsa",
87
+ "id_ed25519",
88
+ "credentials.json",
89
+ "service-account.json",
90
+ "secrets.json",
91
+ }
92
+
93
+ _SECRET_SUFFIXES = (".pem", ".key", ".p12", ".pfx")
94
+
95
+
96
def detect_language(path: str) -> Optional[str]:
    """Return the language label for `path`, or None when it is not recognized.

    Lookup order: the lowercased suffix in `_LANG_BY_SUFFIX`, then the lowercased
    filename in `_LANG_BY_NAME`, then the suffix without its dot in
    `_LANG_BY_NAME` (so `web.Dockerfile` resolves like `Dockerfile`).
    """
    target = PurePosixPath(path)
    ext = target.suffix.lower()

    by_suffix = _LANG_BY_SUFFIX.get(ext) if ext else None
    if by_suffix is not None:
        return by_suffix

    by_name = _LANG_BY_NAME.get(target.name.lower())
    if by_name is not None:
        return by_name

    # `web.Dockerfile`, `base.dockerfile`, etc.: identity is the suffix-as-name.
    return _LANG_BY_NAME.get(ext[1:]) if ext else None
110
+
111
+
112
def parser_for(lang: Optional[str]) -> str:
    """Return "treesitter" for languages in `_TREE_SITTER_LANGS`, else "line"."""
    if lang in _TREE_SITTER_LANGS:
        return "treesitter"
    return "line"
114
+
115
+
116
def is_secret_filename(path: str) -> bool:
    """Return True when the filename of `path` marks it as a secret.

    Matching is case-insensitive and looks only at the final path component:
    an exact name from `_SECRET_NAMES`, any `.env.<something>` variant, or a name
    ending in one of `_SECRET_SUFFIXES`.
    """
    # Normalize Windows separators first, as `is_test_path` does. Without this,
    # `PurePosixPath("cfg\\.env").name` is the whole string and the exact-name and
    # `.env.` checks below would let the file through.
    name = PurePosixPath(path.replace("\\", "/")).name.lower()
    if name in _SECRET_NAMES or name.startswith(".env."):
        return True
    return name.endswith(_SECRET_SUFFIXES)
121
+
122
+
123
def looks_binary(data: bytes) -> bool:
    """Return True when `data` contains at least one NUL byte."""
    return data.find(b"\x00") != -1
125
+
126
+
127
def is_generated(path: str) -> bool:
    """Return True when the filename of `path` follows a generated-file pattern.

    Recognized (case-insensitively): `.generated.` anywhere in the name, or a name
    ending in `.generated`, `.min.js` or `.min.css`.
    """
    filename = PurePosixPath(path).name.lower()
    if ".generated." in filename:
        return True
    return filename.endswith((".generated", ".min.js", ".min.css"))
135
+
136
+
137
+ # Directory names that mark a test tree, and filename patterns for test modules.
138
+ # Matched on whole path segments / filename stems — NOT a bare substring — so
139
+ # `contest/`, `latest.py`, or `testimonials.ts` are never mistaken for tests.
140
+ _TEST_DIRS = {"test", "tests", "__tests__", "__test__", "testing", "spec", "specs", "e2e"}
141
+
142
+
143
+ def is_test_path(path: str) -> bool:
144
+ pure = PurePosixPath(path.replace("\\", "/"))
145
+ if any(part.lower() in _TEST_DIRS for part in pure.parts[:-1]):
146
+ return True
147
+ name = pure.name.lower()
148
+ stem = name.split(".", 1)[0]
149
+ if stem == "test" or stem.startswith("test_") or stem.endswith("_test"):
150
+ return True
151
+ return ".test." in name or ".spec." in name
@@ -0,0 +1,58 @@
1
+ """Layer built-in deny rules with root-level ignore files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import pathspec
8
+
9
# Ignore files read from the project root when the caller names none.
DEFAULT_IGNORE_FILES = [".gitignore", ".cursorignore", ".claudeignore", ".codeindexignore"]

# Patterns that are always in effect, placed ahead of anything read from ignore files.
BUILTIN_DENYLIST = [
    # version-control metadata
    ".git/",
    ".hg/",
    ".svn/",
    # this tool's own cache
    ".claude/cache/codebase-index/",
    # dependency trees and tool caches
    "node_modules/",
    "__pycache__/",
    ".pytest_cache/",
    ".mypy_cache/",
    ".ruff_cache/",
    ".venv/",
    "venv/",
    # build output
    "build/",
    "dist/",
    "target/",
    ".next/",
]

# The directory entries of the denylist with the trailing slash removed.
BUILTIN_DENY_DIRS = {
    entry.rstrip("/") for entry in BUILTIN_DENYLIST if entry.endswith("/")
}
30
+
31
+
32
class IgnoreMatcher:
    """Gitignore-style matcher for root-level ignore files and built-in denylist."""

    def __init__(self, patterns: list[str]) -> None:
        """Compile `patterns` (gitignore syntax, one pattern per entry)."""
        self._spec = pathspec.PathSpec.from_lines("gitignore", patterns)

    @classmethod
    def from_root(
        cls,
        root: Path,
        *,
        ignore_files: list[str] | None = None,
        extra_ignore: list[str] | None = None,
    ) -> "IgnoreMatcher":
        """Build a matcher from the built-in denylist plus the ignore files in `root`.

        Args:
            root: Directory in which the ignore files are looked up.
            ignore_files: Names of the ignore files to read. None selects
                `DEFAULT_IGNORE_FILES`; an empty list reads no ignore file.
            extra_ignore: Additional patterns appended after the file contents.

        Pattern order is: built-in denylist, each existing ignore file in the
        order given, then `extra_ignore`.
        """
        patterns = list(BUILTIN_DENYLIST)
        # `ignore_files or DEFAULT_IGNORE_FILES` would turn an explicit empty list
        # back into the defaults, making it impossible to switch ignore files off.
        names = DEFAULT_IGNORE_FILES if ignore_files is None else ignore_files
        for name in names:
            candidate = root / name
            if candidate.is_file():
                # A stray non-UTF-8 byte in one ignore file should not abort
                # matcher construction; decode leniently instead.
                text = candidate.read_text(encoding="utf-8", errors="replace")
                patterns.extend(text.splitlines())
        patterns.extend(extra_ignore or [])
        return cls(patterns)

    def is_ignored(self, rel_path: str) -> bool:
        """Return True when `rel_path` matches the compiled patterns.

        Backslashes are converted to forward slashes before matching.
        """
        return self._spec.match_file(rel_path.replace("\\", "/"))

    def is_ignored_dir(self, dirname: str) -> bool:
        """Return True when `dirname` is exactly one of `BUILTIN_DENY_DIRS`.

        This is a plain membership test; patterns from ignore files are not
        consulted here.
        """
        return dirname in BUILTIN_DENY_DIRS
@@ -0,0 +1,75 @@
1
+ """Walk the project root and yield indexable candidates."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Iterator, Optional
9
+
10
+ from ..config import Config
11
+ from . import classify
12
+ from .ignore import IgnoreMatcher
13
+
14
+ _BINARY_SNIFF_BYTES = 4096
15
+
16
+
17
+ @dataclass
18
+ class Candidate:
19
+ path: Path
20
+ rel_path: str
21
+ size_bytes: int
22
+ lang: Optional[str]
23
+ parser: str
24
+ is_generated: bool
25
+
26
+
27
def _rel_or_none(root: Path, path: Path) -> Optional[str]:
    """Return `path` relative to `root`, or None when it does not resolve under it."""
    try:
        return _rel(root, path)
    except (ValueError, OSError, RuntimeError):
        # ValueError: the resolved path is outside `root` (e.g. a symlink that
        # points elsewhere). OSError / RuntimeError: the path could not be
        # resolved at all (broken link chain, symlink loop).
        return None


def walk(root: Path, config: Config) -> Iterator[Candidate]:
    """Yield a `Candidate` for every indexable file below `root`.

    A file is dropped when it is ignored (built-in denylist, ignore files,
    `config.extra_ignore`), has a secret filename, is larger than
    `config.max_file_bytes`, cannot be stat'ed or opened, contains a NUL byte in
    its first `_BINARY_SNIFF_BYTES` bytes, or resolves to a location outside
    `root`. Ignored directories are pruned so they are never descended into.

    Args:
        root: Directory to walk; resolved before use.
        config: Supplies `ignore_files`, `extra_ignore` and `max_file_bytes`.
    """
    root = Path(root).resolve()
    matcher = IgnoreMatcher.from_root(
        root,
        ignore_files=config.ignore_files,
        extra_ignore=config.extra_ignore,
    )

    for dirpath, dirnames, filenames in os.walk(root):
        here = Path(dirpath)

        # Prune in place so os.walk skips the removed directories entirely.
        kept = []
        for d in dirnames:
            if matcher.is_ignored_dir(d):
                continue
            rel_dir = _rel_or_none(root, here / d)
            if rel_dir is None or matcher.is_ignored(rel_dir + "/"):
                continue
            kept.append(d)
        dirnames[:] = kept

        for fname in filenames:
            abs_path = here / fname
            rel = _rel_or_none(root, abs_path)
            if rel is None:
                # Resolves outside the project root: skip it instead of letting
                # `relative_to` abort the whole walk.
                continue

            if matcher.is_ignored(rel) or classify.is_secret_filename(rel):
                continue
            try:
                size = abs_path.stat().st_size
            except OSError:
                continue
            if size > config.max_file_bytes:
                continue
            try:
                with abs_path.open("rb") as fh:
                    head = fh.read(_BINARY_SNIFF_BYTES)
            except OSError:
                continue
            if classify.looks_binary(head):
                continue

            lang = classify.detect_language(rel)
            yield Candidate(
                path=abs_path,
                rel_path=rel,
                size_bytes=size,
                lang=lang,
                parser=classify.parser_for(lang),
                is_generated=classify.is_generated(rel),
            )
72
+
73
+
74
+ def _rel(root: Path, path: Path) -> str:
75
+ return path.resolve().relative_to(root).as_posix()
@@ -0,0 +1,138 @@
1
+ """Safety / health self-check (docs/SECURITY.md §6).
2
+
3
+ M8 scope: report enabled `codebase-index` hooks, whether the cache is gitignored, and
4
+ index freshness. The fuller checklist (indexed-secret leak scan, oversized/binary audit,
5
+ permissions, allowed-tools diff) is M9.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ from typing import Literal
13
+
14
+ from . import scaffold
15
+ from .config import Config
16
+
17
# The severity levels a doctor finding may carry.
Severity = Literal["high", "medium", "info"]


@dataclass
class Finding:
    """Outcome of one doctor check."""

    # Stable identifier of the check, e.g. "cache_gitignored".
    id: str
    # True when the check passed.
    ok: bool
    # How much a failure of this check matters.
    severity: Severity
    # Human-readable status, or the remediation hint when the check failed.
    detail: str
26
+
27
+
28
def run_doctor(root: Path, config: Config) -> list[Finding]:
    """Run the health checks for the project at `root` and return the findings.

    Findings are always returned in this order:

    1. `cache_gitignored` (high): the cache ignore line appears in `.gitignore`.
    2. `hooks_enabled` (info): which auto-update hooks are enabled.
    3. `index_fresh` (medium): an index exists and is not stale.
    4. `symbol_extraction` (medium): no tree-sitter language has at least
       `_ZERO_SYMBOL_FILE_THRESHOLD` files but zero symbols.
    5. `graph_coverage` (info): which indexed languages lack full graph support.

    The index database is opened only when it already exists. Without an index,
    checks 4 and 5 run over an empty coverage list and report no problems.

    Args:
        root: Project root directory.
        config: Resolved configuration, passed on to the freshness computation.
    """
    root = Path(root)
    findings: list[Finding] = []

    # 1. Is the cache gitignored? (committing the index can leak code/secrets.)
    gitignore = root / ".gitignore"
    covered = (
        gitignore.exists()
        and scaffold._CACHE_IGNORE_LINE in gitignore.read_text(encoding="utf-8")
    )
    findings.append(
        Finding(
            id="cache_gitignored",
            ok=covered,
            severity="high",
            detail=(
                "cache is gitignored"
                if covered
                else f"add '{scaffold._CACHE_IGNORE_LINE}' to .gitignore (run `init`)"
            ),
        )
    )

    # 2. Which auto-update hooks are enabled? (informational; hooks run on every edit.)
    hooks = scaffold.enabled_hooks(root)
    findings.append(
        Finding(
            id="hooks_enabled",
            ok=bool(hooks),
            severity="info",
            detail="; ".join(hooks) if hooks else "no auto-update hook (run `init --with-hooks`)",
        )
    )

    # 3. Index freshness. The coverage rows needed by checks 4 and 5 are read in
    #    the same session, so a missing index file is never opened (or created).
    db_path = root / scaffold.CACHE_REL / "index.sqlite"
    coverage = []
    if not db_path.exists():
        findings.append(
            Finding("index_fresh", ok=False, severity="medium", detail="no index (run `index`)")
        )
    else:
        from .indexer.freshness import compute_freshness
        from .storage import repo
        from .storage.db import Database

        with Database(db_path) as db:
            fr = compute_freshness(db.conn, root, config)
            coverage = repo.treesitter_coverage(db.conn)
        findings.append(
            Finding(
                id="index_fresh",
                ok=not fr.stale,
                severity="medium",
                detail=(
                    "index is fresh"
                    if not fr.stale
                    else f"{fr.files_changed_since_build} file(s) changed — run `update`"
                ),
            )
        )

    # 4. Symbol-extraction coverage (Guardrail 2): a tree-sitter language with many files
    #    but ~0 symbols means extraction silently failed (the original Java bug).
    dead = [
        r["lang"]
        for r in coverage
        if r["files"] >= _ZERO_SYMBOL_FILE_THRESHOLD and (r["symbols"] or 0) == 0
    ]
    findings.append(
        Finding(
            id="symbol_extraction",
            ok=not dead,
            severity="medium",
            detail=(
                "tree-sitter languages extract symbols"
                if not dead
                else f"no symbols extracted for tree-sitter language(s): {', '.join(dead)} "
                "— extraction path likely broken"
            ),
        )
    )

    # 5. Dependency-graph coverage: Tier-B languages (grammar but no hand-tuned spec)
    #    yield symbols but no import/inheritance edges, so refs/impact undercount.
    from .parsers.languages import has_full_graph

    tier_b = sorted({r["lang"] for r in coverage if not has_full_graph(r["lang"])})
    findings.append(
        Finding(
            id="graph_coverage",
            ok=True,
            severity="info",
            detail=(
                "all indexed languages have full dependency-graph support"
                if not tier_b
                else f"partial dependency graph for Tier-B language(s): {', '.join(tier_b)} "
                "— refs/impact may undercount (confirm with Grep)"
            ),
        )
    )

    return findings
130
+
131
+
132
+ # Threshold above which a tree-sitter language with zero symbols is treated as broken rather
133
+ # than just a tiny/empty repo.
134
+ _ZERO_SYMBOL_FILE_THRESHOLD = 3
135
+
136
+
137
+ def has_high_severity_failure(findings: list[Finding]) -> bool:
138
+ return any(f.severity == "high" and not f.ok for f in findings)
@@ -0,0 +1,2 @@
1
+ """Opt-in, local-first embedding backends. Nothing here is imported by the base
2
+ install unless embeddings are explicitly enabled (see SECURITY.md §4)."""
@@ -0,0 +1,67 @@
1
+ # src/codebase_index/embeddings/backend.py
2
+ """Embedding backend protocol + the single gating factory.
3
+
4
+ `resolve_backend` is the ONLY place a backend is constructed. It enforces
5
+ SECURITY.md §4: external backends are refused unless `allow_external = true`,
6
+ an API key is present in the environment, AND a warning naming the endpoint is
7
+ emitted. When embeddings are disabled the factory returns a NoopBackend and
8
+ imports no optional dependency.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ from typing import Callable, Protocol, runtime_checkable
15
+
16
# Name of the environment variable that holds the external embedding API key.
API_KEY_ENV = "CBX_EMBEDDINGS_API_KEY"


class EmbeddingError(RuntimeError):
    """Signals that embeddings are misconfigured, were refused, or that a backend cannot be used."""
21
+
22
+
23
@runtime_checkable
class EmbeddingBackend(Protocol):
    """Structural interface shared by all embedding backends."""

    # Whether this backend produces vectors.
    enabled: bool
    # Label identifying the backend.
    name: str
    # Length of each vector returned by `embed`.
    dim: int

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed `texts`, returning one vector of length `dim` per input."""
        ...
32
+
33
+
34
def resolve_backend(cfg, warn: Callable[[str], None] = lambda _m: None) -> "EmbeddingBackend":
    """Build the embedding backend selected by `cfg.embeddings`.

    Disabled embeddings (or backend "noop") give a NoopBackend and "local" gives a
    LocalBackend; each backend module is imported only on the path that uses it.
    The external backend is built only after all of the following hold, checked
    in this order: `allow_external` is true, an API key is present in the
    `API_KEY_ENV` environment variable, and `endpoint` is set. `warn` is then
    called with a message naming the endpoint before the backend is created.

    Raises:
        EmbeddingError: an external gate failed, or the backend name is unknown.
    """
    settings = cfg.embeddings

    if not settings.enabled or settings.backend == "noop":
        from .noop import NoopBackend

        return NoopBackend()

    if settings.backend == "local":
        from .local import LocalBackend

        return LocalBackend(model_name=settings.model)

    if settings.backend != "external":
        raise EmbeddingError(f"Unknown embeddings.backend: {settings.backend!r}")

    # External backend: every gate must pass before anything is constructed.
    if not settings.allow_external:
        raise EmbeddingError(
            "External embeddings require embeddings.allow_external = true (SECURITY.md §4)."
        )
    key = os.environ.get(API_KEY_ENV)
    if not key:
        raise EmbeddingError(
            f"External embeddings require an API key in ${API_KEY_ENV} (SECURITY.md §4)."
        )
    if not settings.endpoint:
        raise EmbeddingError("External embeddings require embeddings.endpoint to be set.")

    warn(
        f"[codebase-index] EXTERNAL EMBEDDINGS ENABLED — chunk text will be sent to "
        f"{settings.endpoint}. Disable with embeddings.backend=local|noop."
    )
    from .external import ExternalBackend

    return ExternalBackend(endpoint=settings.endpoint, api_key=key, model_name=settings.model)
@@ -0,0 +1,56 @@
1
+ # src/codebase_index/embeddings/external.py
2
+ """External embedding API backend. Constructed ONLY via resolve_backend after the
3
+ SECURITY.md §4 gates pass. The network call is isolated in a transport callable so
4
+ it can be tested without hitting the network and swapped per provider.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from typing import Callable, Optional
11
+ from urllib.request import Request, urlopen
12
+
13
+ from .backend import EmbeddingError
14
+
15
# Signature of a transport: (endpoint, api_key, model, texts) -> one vector per text.
Transport = Callable[[str, str, str, list[str]], list[list[float]]]


def _http_transport(endpoint: str, api_key: str, model: str, texts: list[str]) -> list[list[float]]:
    """POST `texts` to `endpoint` as JSON and return the embeddings from the reply.

    The request body is `{"model": ..., "input": [...]}` and carries the API key
    as a bearer token. The reply must be a JSON object whose `data` list holds
    items with an `embedding` entry.

    Raises:
        EmbeddingError: `endpoint` is not an http(s) URL.
    """
    from urllib.parse import urlsplit

    # `urlopen` also understands non-HTTP schemes such as `file:`; the endpoint
    # comes from configuration, so only plain HTTP(S) is accepted here.
    scheme = urlsplit(endpoint).scheme.lower()
    if scheme not in ("http", "https"):
        raise EmbeddingError(
            f"External embeddings endpoint must be an http(s) URL, got scheme {scheme!r}."
        )

    body = json.dumps({"model": model, "input": texts}).encode("utf-8")
    req = Request(
        endpoint,
        data=body,
        headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
        method="POST",
    )
    with urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read().decode("utf-8"))
    return [item["embedding"] for item in payload["data"]]
29
+
30
+
31
class ExternalBackend:
    """Embedding backend that delegates to an external API through a transport."""

    enabled = True
    # 0 until the first successful `embed` call reveals the vector length.
    dim: int = 0

    def __init__(
        self,
        *,
        endpoint: str,
        api_key: str,
        model_name: str,
        transport: Optional[Transport] = None,
    ) -> None:
        """Store the connection settings.

        Args:
            endpoint: URL handed to the transport.
            api_key: Credential handed to the transport.
            model_name: Model identifier; also used to build `name`.
            transport: Callable performing the request. Defaults to
                `_http_transport`; tests can inject a fake.
        """
        self.name = f"external:{model_name}"
        self.model_name = model_name
        self._endpoint = endpoint
        self._api_key = api_key
        self._transport = transport or _http_transport

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one float vector per entry of `texts`, in the same order.

        An empty input returns an empty list without calling the transport.
        `dim` is updated to the vector length of the response.

        Raises:
            EmbeddingError: the transport returned no vectors, a different number
                of vectors than inputs, or vectors of differing lengths.
        """
        if not texts:
            return []
        batch = list(texts)
        vecs = self._transport(self._endpoint, self._api_key, self.model_name, batch)
        if not vecs or not vecs[0]:
            raise EmbeddingError("External embedding endpoint returned no vectors.")
        # Callers pair vectors with their inputs by position, so a short or long
        # response must fail loudly rather than misalign silently.
        if len(vecs) != len(batch):
            raise EmbeddingError(
                f"External embedding endpoint returned {len(vecs)} vector(s) "
                f"for {len(batch)} input(s)."
            )
        width = len(vecs[0])
        if any(len(v) != width for v in vecs):
            raise EmbeddingError(
                "External embedding endpoint returned vectors of inconsistent length."
            )
        self.dim = width
        return [[float(x) for x in v] for v in vecs]