codebase-cortex 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. codebase_cortex/__init__.py +3 -0
  2. codebase_cortex/agents/__init__.py +0 -0
  3. codebase_cortex/agents/base.py +69 -0
  4. codebase_cortex/agents/code_analyzer.py +122 -0
  5. codebase_cortex/agents/doc_writer.py +356 -0
  6. codebase_cortex/agents/semantic_finder.py +64 -0
  7. codebase_cortex/agents/sprint_reporter.py +152 -0
  8. codebase_cortex/agents/task_creator.py +138 -0
  9. codebase_cortex/auth/__init__.py +0 -0
  10. codebase_cortex/auth/callback_server.py +80 -0
  11. codebase_cortex/auth/oauth.py +173 -0
  12. codebase_cortex/auth/token_store.py +90 -0
  13. codebase_cortex/cli.py +855 -0
  14. codebase_cortex/config.py +150 -0
  15. codebase_cortex/embeddings/__init__.py +0 -0
  16. codebase_cortex/embeddings/clustering.py +140 -0
  17. codebase_cortex/embeddings/indexer.py +208 -0
  18. codebase_cortex/embeddings/store.py +126 -0
  19. codebase_cortex/git/__init__.py +0 -0
  20. codebase_cortex/git/diff_parser.py +185 -0
  21. codebase_cortex/git/github_client.py +46 -0
  22. codebase_cortex/graph.py +111 -0
  23. codebase_cortex/mcp_client.py +94 -0
  24. codebase_cortex/notion/__init__.py +0 -0
  25. codebase_cortex/notion/bootstrap.py +298 -0
  26. codebase_cortex/notion/page_cache.py +107 -0
  27. codebase_cortex/state.py +77 -0
  28. codebase_cortex/utils/__init__.py +0 -0
  29. codebase_cortex/utils/json_parsing.py +59 -0
  30. codebase_cortex/utils/logging.py +62 -0
  31. codebase_cortex/utils/rate_limiter.py +56 -0
  32. codebase_cortex/utils/section_parser.py +139 -0
  33. codebase_cortex-0.1.0.dist-info/METADATA +209 -0
  34. codebase_cortex-0.1.0.dist-info/RECORD +37 -0
  35. codebase_cortex-0.1.0.dist-info/WHEEL +4 -0
  36. codebase_cortex-0.1.0.dist-info/entry_points.txt +3 -0
  37. codebase_cortex-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,150 @@
1
+ """Configuration and LLM factory for Codebase Cortex."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass, field
7
+ from pathlib import Path
8
+
9
+ from dotenv import load_dotenv
10
+ from langchain_core.language_models import BaseChatModel
11
+
12
# Name of the per-repo directory where Cortex keeps all of its state
# (.env config, Notion tokens, FAISS index, page cache — see Settings).
CORTEX_DIR_NAME = ".cortex"
13
+
14
+
15
def find_cortex_dir(start: Path | None = None) -> Path:
    """Locate the per-repo ``.cortex`` directory.

    Args:
        start: Directory to anchor the lookup; defaults to the current
            working directory.

    Returns:
        The ``.cortex`` path under *start*. The directory may not exist
        yet — callers such as ``cortex init`` create it on demand.
    """
    base = Path.cwd() if start is None else start
    return base.resolve() / CORTEX_DIR_NAME
22
+
23
+
24
# Model used per provider when LLM_MODEL is not set in .cortex/.env
# (see Settings.from_env and get_llm).
DEFAULT_MODELS: dict[str, str] = {
    "google": "gemini-2.5-flash-lite",
    "anthropic": "claude-sonnet-4-20250514",
    "openrouter": "",  # no sensible default — user must choose
}

# Curated model choices per provider; presumably surfaced as suggestions
# during interactive setup — confirm against cli.py.
RECOMMENDED_MODELS: dict[str, list[str]] = {
    "google": [
        "gemini-2.5-flash-lite",
        "gemini-3-flash-preview",
        "gemini-2.5-pro",
    ],
    "anthropic": [
        "claude-sonnet-4-20250514",
        "claude-haiku-4-5-20251001",
    ],
    "openrouter": [
        "anthropic/claude-sonnet-4",
        "google/gemini-2.5-flash-lite",
        "google/gemini-3-flash-preview",
    ],
}
46
+
47
+
48
@dataclass
class Settings:
    """Application settings loaded from .cortex/.env in the target repo."""

    llm_provider: str = "google"
    llm_model: str = ""
    google_api_key: str = ""
    anthropic_api_key: str = ""
    openrouter_api_key: str = ""
    github_token: str = ""
    repo_path: Path = field(default_factory=lambda: Path.cwd())
    cortex_dir: Path = field(default_factory=lambda: find_cortex_dir())
    oauth_callback_port: int = 9876

    # --- Derived filesystem locations, all rooted at cortex_dir ---------

    @property
    def data_dir(self) -> Path:
        """Root directory for all Cortex-managed data (alias of cortex_dir)."""
        return self.cortex_dir

    @property
    def notion_token_path(self) -> Path:
        """File where Notion OAuth tokens are persisted."""
        return self.cortex_dir / "notion_tokens.json"

    @property
    def faiss_index_dir(self) -> Path:
        """Directory holding the FAISS vector index."""
        return self.cortex_dir / "faiss_index"

    @property
    def page_cache_path(self) -> Path:
        """JSON cache of Notion page metadata."""
        return self.cortex_dir / "page_cache.json"

    @property
    def env_path(self) -> Path:
        """The .env file that from_env() reads configuration from."""
        return self.cortex_dir / ".env"

    @property
    def is_initialized(self) -> bool:
        """True once the .cortex/.env file exists."""
        return self.env_path.exists()

    @classmethod
    def from_env(cls, repo_path: Path | None = None) -> Settings:
        """Load settings from .cortex/.env in the given or current directory."""
        repo = (repo_path or Path.cwd()).resolve()
        cortex = repo / CORTEX_DIR_NAME
        dotenv_file = cortex / ".env"

        if dotenv_file.exists():
            load_dotenv(dotenv_file)

        provider = os.getenv("LLM_PROVIDER", "google")
        return cls(
            llm_provider=provider,
            llm_model=os.getenv("LLM_MODEL", DEFAULT_MODELS.get(provider, "")),
            google_api_key=os.getenv("GOOGLE_API_KEY", ""),
            anthropic_api_key=os.getenv("ANTHROPIC_API_KEY", ""),
            openrouter_api_key=os.getenv("OPENROUTER_API_KEY", ""),
            github_token=os.getenv("GITHUB_TOKEN", ""),
            repo_path=repo,
            cortex_dir=cortex,
        )
107
+
108
+
109
def get_llm(settings: Settings | None = None, model: str | None = None) -> BaseChatModel:
    """Create a chat-model instance for the configured provider.

    Args:
        settings: Application settings; loaded via Settings.from_env()
            when omitted.
        model: Explicit model override. Falls back to settings.llm_model,
            then to the provider's entry in DEFAULT_MODELS.

    Raises:
        ValueError: When the provider is "openrouter" and no model name
            was supplied (OpenRouter has no default in DEFAULT_MODELS).
    """
    cfg = settings if settings is not None else Settings.from_env()
    chosen = model or cfg.llm_model

    # Provider-specific langchain packages are imported lazily so that
    # only the configured backend needs to be installed/loaded.
    if cfg.llm_provider == "anthropic":
        from langchain_anthropic import ChatAnthropic

        return ChatAnthropic(
            model=chosen or DEFAULT_MODELS["anthropic"],
            api_key=cfg.anthropic_api_key,
        )

    if cfg.llm_provider == "openrouter":
        from langchain_openai import ChatOpenAI

        if not chosen:
            raise ValueError(
                "LLM_MODEL is required for OpenRouter. "
                "Set it in .cortex/.env (e.g. LLM_MODEL=anthropic/claude-sonnet-4)"
            )

        return ChatOpenAI(
            model=chosen,
            api_key=cfg.openrouter_api_key,
            base_url="https://openrouter.ai/api/v1",
        )

    # Any other provider value falls through to the Google Gemini default.
    from langchain_google_genai import ChatGoogleGenerativeAI

    return ChatGoogleGenerativeAI(
        model=chosen or DEFAULT_MODELS["google"],
        google_api_key=cfg.google_api_key,
    )
File without changes
@@ -0,0 +1,140 @@
1
+ """HDBSCAN topic clustering for Knowledge Map generation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from collections import defaultdict
7
+
8
+ import hdbscan
9
+ import numpy as np
10
+
11
+ from codebase_cortex.embeddings.indexer import CodeChunk
12
+
13
+
14
@dataclass
class TopicCluster:
    """A cluster of related code chunks representing a topic."""

    cluster_id: int
    label: str
    chunks: list[CodeChunk] = field(default_factory=list)
    centroid: np.ndarray | None = field(default=None, repr=False)

    @property
    def file_paths(self) -> list[str]:
        """Unique file paths in this cluster, sorted alphabetically."""
        return sorted({chunk.file_path for chunk in self.chunks})

    @property
    def size(self) -> int:
        """Number of code chunks assigned to this topic."""
        return len(self.chunks)
31
+
32
+
33
@dataclass
class TopicClusterer:
    """Clusters code embeddings into topics using HDBSCAN.

    HDBSCAN is density-based — it automatically determines the number
    of clusters and marks sparse points as noise (cluster_id = -1).
    """

    min_cluster_size: int = 3
    min_samples: int = 2

    def cluster(
        self,
        embeddings: np.ndarray,
        chunks: list[CodeChunk],
    ) -> list[TopicCluster]:
        """Cluster embeddings and return topic groups.

        Args:
            embeddings: Array of shape (n, dimension).
            chunks: Corresponding code chunks.

        Returns:
            List of TopicCluster, excluding the HDBSCAN noise cluster.
        """
        if len(embeddings) < self.min_cluster_size:
            # Too few points for density clustering: one catch-all topic.
            if not chunks:
                return []
            return [
                TopicCluster(
                    cluster_id=0,
                    label=self._generate_label(chunks),
                    chunks=list(chunks),
                )
            ]

        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=self.min_cluster_size,
            min_samples=self.min_samples,
            metric="euclidean",
        )
        labels = clusterer.fit_predict(embeddings.astype(np.float64))

        # Bucket (chunk, embedding) pairs by their assigned cluster id.
        grouped: dict[int, list[tuple[CodeChunk, np.ndarray]]] = defaultdict(list)
        for assigned, chunk, vector in zip(labels, chunks, embeddings):
            grouped[int(assigned)].append((chunk, vector))

        topics: list[TopicCluster] = []
        for cid in sorted(grouped):
            if cid == -1:
                continue  # -1 is HDBSCAN's noise label — not a topic

            members = grouped[cid]
            member_chunks = [chunk for chunk, _ in members]
            vectors = np.array([vector for _, vector in members])

            topics.append(TopicCluster(
                cluster_id=cid,
                label=self._generate_label(member_chunks),
                chunks=member_chunks,
                centroid=vectors.mean(axis=0),
            ))

        return topics

    @staticmethod
    def _generate_label(chunks: list[CodeChunk]) -> str:
        """Generate a descriptive label from chunk metadata.

        Combines the most common top-level directory with the two most
        common chunk names into a human-readable topic label.
        """
        if not chunks:
            return "Unknown"

        # Tally top-level directories and chunk names.
        dirs: dict[str, int] = defaultdict(int)
        names: dict[str, int] = defaultdict(int)
        for chunk in chunks:
            parts = chunk.file_path.split("/")
            if len(parts) > 1:
                dirs[parts[0]] += 1
            names[chunk.name] += 1

        top_dir = max(dirs, key=dirs.get) if dirs else ""
        # Two most frequent names; ties keep first-seen order (stable sort).
        top_names = sorted(names, key=names.get, reverse=True)[:2]

        if top_dir:
            return f"{top_dir}: {', '.join(top_names)}"
        return ", ".join(top_names)

    def to_markdown(self, topics: list[TopicCluster]) -> str:
        """Render topic clusters as a Markdown Knowledge Map."""
        if not topics:
            return "No topics identified yet.\n"

        header = [
            "# Knowledge Map\n",
            f"*{sum(t.size for t in topics)} code chunks across {len(topics)} topics*\n",
        ]

        body: list[str] = []
        # Largest topics first.
        for topic in sorted(topics, key=lambda t: t.size, reverse=True):
            paths = topic.file_paths
            body.append(f"## {topic.label}")
            body.append(f"*{topic.size} chunks across {len(paths)} files*\n")
            body.extend(f"- `{fp}`" for fp in paths[:10])
            if len(paths) > 10:
                body.append(f"- ... and {len(paths) - 10} more files")
            body.append("")

        return "\n".join(header + body)
@@ -0,0 +1,208 @@
1
+ """Sentence-transformers embedding pipeline for code chunks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from pathlib import Path
7
+ from dataclasses import dataclass, field
8
+
9
+ import numpy as np
10
+
11
# The sentence-transformers model is lazy-loaded (see _get_model) to avoid
# paying its slow import cost when the module is merely imported.
_model = None  # module-level cache for the loaded SentenceTransformer
MODEL_NAME = "all-MiniLM-L6-v2"

# File extensions considered indexable: code, config, and documentation.
CODE_EXTENSIONS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
    ".rb", ".php", ".c", ".cpp", ".h", ".hpp", ".cs", ".swift",
    ".kt", ".scala", ".sh", ".bash", ".yml", ".yaml", ".toml",
    ".json", ".md", ".rst", ".txt",
}

# Directory names excluded from the walk: VCS, virtualenvs, caches, builds.
SKIP_DIRS = {
    ".git", ".venv", "venv", "node_modules", "__pycache__",
    ".pytest_cache", "dist", "build", ".eggs", ".tox",
    ".mypy_cache", ".ruff_cache",
}

# Files larger than this many bytes are skipped entirely (100KB).
MAX_FILE_SIZE = 100_000
32
+
33
+
34
def _get_model():
    """Return the shared SentenceTransformer, loading it on first use.

    The import happens inside the function because loading
    sentence-transformers is slow; the instance is cached in the
    module-level `_model` so subsequent calls are cheap.
    """
    global _model
    if _model is not None:
        return _model
    from sentence_transformers import SentenceTransformer
    _model = SentenceTransformer(MODEL_NAME)
    return _model
41
+
42
+
43
@dataclass
class CodeChunk:
    """A chunk of code with metadata for embedding."""

    # Source file path, relative to the indexed repository root.
    file_path: str
    # One of: "function" | "class" | "module" | "section".
    chunk_type: str
    # Definition name; for module-level chunks this is the file path itself.
    name: str
    # Raw chunk text (the chunker truncates large module chunks).
    content: str
    # 1-based line range of the chunk within its file.
    start_line: int
    end_line: int
53
+
54
+
55
@dataclass
class EmbeddingIndexer:
    """Indexes code chunks using sentence-transformers.

    Walks a repository, extracts meaningful code chunks,
    and generates embeddings for similarity search.
    """

    # Repository root to index.
    repo_path: Path
    # Chunks collected by the most recent collect_chunks() call.
    chunks: list[CodeChunk] = field(default_factory=list)

    def collect_chunks(self) -> list[CodeChunk]:
        """Walk the repo and extract code chunks from all indexable files.

        Best-effort: files that cannot be read are skipped rather than
        aborting the walk.

        Returns:
            The collected chunks (also stored on self.chunks).
        """
        self.chunks = []
        for file_path in self._iter_files():
            try:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
                if not content.strip():
                    continue
                rel_path = str(file_path.relative_to(self.repo_path))
                self.chunks.extend(self._chunk_file(rel_path, content))
            except (OSError, UnicodeDecodeError):
                continue
        return self.chunks

    def embed_chunks(self, chunks: list[CodeChunk] | None = None) -> np.ndarray:
        """Generate embeddings for code chunks.

        Args:
            chunks: Chunks to embed. Uses self.chunks if not provided.

        Returns:
            numpy array of shape (n_chunks, embedding_dim); an empty
            array when there is nothing to embed.
        """
        # BUGFIX: use an `is None` check, not truthiness. The old
        # `chunks or self.chunks` silently fell back to self.chunks when
        # an explicitly-empty list was passed, contradicting the docstring.
        if chunks is None:
            chunks = self.chunks
        if not chunks:
            return np.array([])

        model = _get_model()
        texts = [self._chunk_to_text(c) for c in chunks]
        return model.encode(texts, show_progress_bar=False, convert_to_numpy=True)

    def embed_texts(self, texts: list[str]) -> np.ndarray:
        """Embed arbitrary text strings (for query embedding)."""
        if not texts:
            return np.array([])
        model = _get_model()
        return model.encode(texts, show_progress_bar=False, convert_to_numpy=True)

    def _iter_files(self):
        """Yield indexable files from the repo.

        Filters by SKIP_DIRS, CODE_EXTENSIONS, and MAX_FILE_SIZE.
        """
        for path in self.repo_path.rglob("*"):
            if any(skip in path.parts for skip in SKIP_DIRS):
                continue
            try:
                # BUGFIX: is_file()/stat() can raise OSError (broken
                # symlink, permission race). This generator is consumed
                # outside collect_chunks' try block, so an unguarded error
                # here used to abort the entire walk — skip instead.
                if not path.is_file():
                    continue
                if path.suffix not in CODE_EXTENSIONS:
                    continue
                if path.stat().st_size > MAX_FILE_SIZE:
                    continue
            except OSError:
                continue
            yield path

    def _chunk_file(self, rel_path: str, content: str) -> list[CodeChunk]:
        """Split a file into meaningful chunks (dispatch by file type)."""
        if rel_path.endswith(".py"):
            return self._chunk_python(rel_path, content)
        # For non-Python files, chunk as a whole (truncated) module.
        return self._chunk_by_sections(rel_path, content)

    def _chunk_python(self, rel_path: str, content: str) -> list[CodeChunk]:
        """Extract top-level Python functions and classes as chunks.

        A lightweight regex scan (no AST): each chunk runs from one
        top-level `def`/`class` line to the line before the next one.
        Files with no top-level definitions become a single truncated
        "module" chunk.
        """
        chunks: list[CodeChunk] = []
        lines = content.split("\n")

        # Anchored at column 0, so nested definitions are not split out.
        func_pattern = re.compile(r"^(async\s+)?def\s+(\w+)")
        class_pattern = re.compile(r"^class\s+(\w+)")

        current_def = None
        current_start = 0
        current_name = ""
        current_type = ""

        def _flush(end: int) -> None:
            """Append the in-progress definition ending before line `end`."""
            chunk_content = "\n".join(lines[current_start:end])
            if chunk_content.strip():
                chunks.append(CodeChunk(
                    file_path=rel_path,
                    chunk_type=current_type,
                    name=current_name,
                    content=chunk_content,
                    start_line=current_start + 1,
                    end_line=end,
                ))

        for i, line in enumerate(lines):
            func_match = func_pattern.match(line)
            class_match = class_pattern.match(line)

            if func_match or class_match:
                # A new top-level definition closes the previous one.
                if current_def is not None:
                    _flush(i)

                if func_match:
                    current_type = "function"
                    current_name = func_match.group(2)
                else:
                    current_type = "class"
                    current_name = class_match.group(1)
                current_def = True
                current_start = i

        # The final definition runs to end-of-file.
        if current_def is not None:
            _flush(len(lines))

        # No definitions found: index the whole file as one module chunk.
        if not chunks and content.strip():
            chunks.append(CodeChunk(
                file_path=rel_path,
                chunk_type="module",
                name=rel_path,
                content=content[:3000],
                start_line=1,
                end_line=len(lines),
            ))

        return chunks

    def _chunk_by_sections(self, rel_path: str, content: str) -> list[CodeChunk]:
        """Chunk non-Python files as whole modules (truncated if large)."""
        lines = content.split("\n")
        return [CodeChunk(
            file_path=rel_path,
            chunk_type="module",
            name=rel_path,
            content=content[:3000],
            start_line=1,
            end_line=len(lines),
        )]

    @staticmethod
    def _chunk_to_text(chunk: CodeChunk) -> str:
        """Convert a chunk to a text string suitable for embedding.

        Prepends path/type/name context and truncates the body so the
        text stays within the embedding model's useful input size.
        """
        return f"{chunk.file_path} ({chunk.chunk_type}: {chunk.name})\n{chunk.content[:2000]}"
@@ -0,0 +1,126 @@
1
+ """FAISS index management for code embeddings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from dataclasses import dataclass, field
8
+
9
+ import faiss
10
+ import numpy as np
11
+
12
+ from codebase_cortex.embeddings.indexer import CodeChunk
13
+
14
+
15
@dataclass
class SearchResult:
    """A single search result from the FAISS index."""

    # The matched code chunk (metadata may be truncated if loaded from disk).
    chunk: CodeChunk
    # Raw L2 distance reported by FAISS; lower means more similar.
    distance: float
    score: float  # 1 / (1 + distance), higher = more similar
22
+
23
+
24
@dataclass
class FAISSStore:
    """Manages a FAISS vector index for code embeddings.

    Stores embeddings in a flat L2 index with chunk metadata
    persisted alongside in a JSON sidecar file.
    """

    index_dir: Path
    index: faiss.IndexFlatL2 | None = field(default=None, repr=False)
    chunks: list[CodeChunk] = field(default_factory=list)
    _dimension: int = 384  # all-MiniLM-L6-v2 output dimension

    @property
    def size(self) -> int:
        """Number of vectors currently in the index (0 when unbuilt)."""
        return 0 if self.index is None else self.index.ntotal

    def build(self, embeddings: np.ndarray, chunks: list[CodeChunk]) -> None:
        """Build a new index from embeddings and chunks.

        Args:
            embeddings: Array of shape (n, dimension).
            chunks: Corresponding code chunks (must match embeddings length).
        """
        if len(embeddings) == 0:
            # Empty corpus: keep an empty index at the default dimension.
            self.index = faiss.IndexFlatL2(self._dimension)
            self.chunks = []
            return

        self._dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(self._dimension)
        self.index.add(embeddings.astype(np.float32))
        self.chunks = list(chunks)

    def search(self, query_embedding: np.ndarray, k: int = 5) -> list[SearchResult]:
        """Search the index for the k most similar chunks.

        Args:
            query_embedding: Array of shape (1, dimension) or (dimension,).
            k: Number of results to return.

        Returns:
            List of SearchResult sorted by similarity (highest first);
            empty when no index has been built or loaded.
        """
        if self.index is None or self.index.ntotal == 0:
            return []

        query = query_embedding.reshape(1, -1).astype(np.float32)
        top_k = min(k, self.index.ntotal)
        distances, indices = self.index.search(query, top_k)

        hits: list[SearchResult] = []
        for raw_dist, position in zip(distances[0], indices[0]):
            # FAISS pads missing results with -1; also guard stale indices.
            if not 0 <= position < len(self.chunks):
                continue
            dist = float(raw_dist)
            hits.append(SearchResult(
                chunk=self.chunks[position],
                distance=dist,
                score=1.0 / (1.0 + dist),
            ))
        return hits

    def save(self) -> None:
        """Persist the index and chunk metadata to disk."""
        self.index_dir.mkdir(parents=True, exist_ok=True)

        if self.index is not None:
            faiss.write_index(self.index, str(self.index_dir / "index.faiss"))

        metadata = [
            {
                "file_path": c.file_path,
                "chunk_type": c.chunk_type,
                "name": c.name,
                "content": c.content[:500],  # Truncate for storage
                "start_line": c.start_line,
                "end_line": c.end_line,
            }
            for c in self.chunks
        ]
        (self.index_dir / "chunks.json").write_text(json.dumps(metadata, indent=2))

    def load(self) -> bool:
        """Load an existing index from disk.

        Returns:
            True if loaded successfully, False if no index exists.
        """
        index_path = self.index_dir / "index.faiss"
        chunks_path = self.index_dir / "chunks.json"

        if not (index_path.exists() and chunks_path.exists()):
            return False

        self.index = faiss.read_index(str(index_path))
        self._dimension = self.index.d

        records = json.loads(chunks_path.read_text())
        self.chunks = [CodeChunk(**record) for record in records]
        return True
File without changes