codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
  2. codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
  3. codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
  4. codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
  5. codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
  6. corbell/__init__.py +6 -0
  7. corbell/cli/__init__.py +1 -0
  8. corbell/cli/commands/__init__.py +1 -0
  9. corbell/cli/commands/index.py +86 -0
  10. corbell/cli/commands/query.py +71 -0
  11. corbell/cli/main.py +57 -0
  12. corbell/core/__init__.py +1 -0
  13. corbell/core/constants.py +52 -0
  14. corbell/core/embeddings/__init__.py +6 -0
  15. corbell/core/embeddings/base.py +68 -0
  16. corbell/core/embeddings/extractor.py +201 -0
  17. corbell/core/embeddings/factory.py +48 -0
  18. corbell/core/embeddings/model.py +401 -0
  19. corbell/core/embeddings/search_cache.py +95 -0
  20. corbell/core/embeddings/sqlite_store.py +271 -0
  21. corbell/core/gitignore.py +76 -0
  22. corbell/core/graph/__init__.py +1 -0
  23. corbell/core/graph/builder.py +696 -0
  24. corbell/core/graph/method_graph.py +1077 -0
  25. corbell/core/graph/providers/__init__.py +6 -0
  26. corbell/core/graph/providers/aws_patterns.py +62 -0
  27. corbell/core/graph/providers/azure_patterns.py +64 -0
  28. corbell/core/graph/providers/gcp_patterns.py +59 -0
  29. corbell/core/graph/schema.py +175 -0
  30. corbell/core/graph/sqlite_store.py +500 -0
  31. corbell/core/indexing/__init__.py +1 -0
  32. corbell/core/indexing/builder.py +608 -0
  33. corbell/core/indexing/lock.py +150 -0
  34. corbell/core/indexing/tracker.py +245 -0
  35. corbell/core/llm_client.py +677 -0
  36. corbell/core/mcp/__init__.py +1 -0
  37. corbell/core/mcp/server.py +214 -0
  38. corbell/core/query/__init__.py +1 -0
  39. corbell/core/query/diagnostics.py +38 -0
  40. corbell/core/query/engine.py +321 -0
  41. corbell/core/query/enhancer.py +102 -0
  42. corbell/core/query/formatter.py +98 -0
  43. corbell/core/query/graph_expander.py +284 -0
  44. corbell/core/query/merger.py +171 -0
  45. corbell/core/query/reranker.py +131 -0
  46. corbell/core/workspace.py +408 -0
@@ -0,0 +1,95 @@
1
+ """In-memory numpy matrix cache for fast vectorized cosine similarity search."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, List, Optional, Tuple
6
+
7
+ import numpy as np
8
+
9
+ if TYPE_CHECKING:
10
+ from corbell.core.embeddings.sqlite_store import SQLiteEmbeddingStore
11
+
12
+
13
class EmbeddingSearchCache:
    """Process-resident cache of all embedding vectors as a numpy matrix.

    Every vector is loaded from the embedding store once, L2-normalized, and
    stacked into a single ``(n_chunks, dim)`` float32 matrix. A query is then
    answered with one matrix-vector product. The work is still O(n) in the
    number of chunks, but it runs as one vectorized operation instead of a
    Python-level row-by-row loop.

    Memory use is ``n_chunks * dim * 4`` bytes, e.g. about 46 MB for 30K
    chunks at 384 dimensions. (The original author's notes quote roughly a
    150x speed-up over the row-by-row path and about 2 ms per query at that
    size; those figures are not verified here.)
    """

    def __init__(self) -> None:
        # Chunk IDs, index-aligned with the rows of ``_matrix``.
        self._ids: List[str] = []
        # Row-normalized embeddings, shape (n_chunks, dim); None until loaded.
        self._matrix: Optional[np.ndarray] = None

    @property
    def is_loaded(self) -> bool:
        """True if the cache holds at least one vector."""
        return self._matrix is not None and len(self._ids) > 0

    def load(self, store: "SQLiteEmbeddingStore") -> None:
        """Load all embedding vectors from the store into a numpy matrix.

        Any previously cached content is replaced. If the store returns no
        rows, the cache is reset to the "not loaded" state.

        Args:
            store: The embedding store to load vectors from. Only its
                ``get_all_vectors()`` method is used; each returned blob is
                decoded as a flat float32 array.

        Raises:
            ValueError: Propagated from ``np.stack`` if the stored vectors do
                not all have the same length.
        """
        rows = store.get_all_vectors()
        if not rows:
            self._ids = []
            self._matrix = None
            return

        ids: List[str] = []
        vecs: List[np.ndarray] = []
        for chunk_id, blob in rows:
            vec = np.frombuffer(blob, dtype=np.float32)
            nrm = np.linalg.norm(vec)
            # Pre-normalize so a plain dot product equals cosine similarity.
            # All-zero vectors are kept as-is and will always score 0.
            vecs.append(vec / nrm if nrm > 0 else vec)
            ids.append(chunk_id)

        self._ids = ids
        self._matrix = np.stack(vecs, axis=0)  # shape (n_chunks, dim)

    def search(self, query_vec: np.ndarray, top_k: int = 50) -> List[Tuple[str, float]]:
        """Search for the top-K most similar chunks.

        Args:
            query_vec: Query embedding vector (will be normalized internally).
            top_k: Maximum number of results to return. Values ``<= 0`` yield
                an empty list.

        Returns:
            List of ``(chunk_id, score)`` tuples ordered by descending cosine
            similarity. Returns an empty list if the cache is not loaded, if
            ``top_k`` is not positive, or if the query vector has zero norm.
        """
        # Without this guard, top_k == 0 would reach ``[-0:]`` below, which
        # slices the *whole* array and returns every chunk.
        if top_k <= 0 or not self.is_loaded:
            return []

        qvec = np.asarray(query_vec, dtype=np.float32)
        qnorm = float(np.linalg.norm(qvec))
        if qnorm == 0:
            return []
        qvec = qvec / qnorm

        # Matrix rows are already unit-length, so dot product == cosine.
        scores: np.ndarray = self._matrix @ qvec  # shape (n_chunks,)

        n = len(self._ids)
        actual_k = min(top_k, n)

        if actual_k < n:
            # argpartition selects the k largest in O(n) (unordered) ...
            top_indices = np.argpartition(scores, -actual_k)[-actual_k:]
            # ... then only those k are sorted, descending by score.
            top_indices = top_indices[np.argsort(scores[top_indices])[::-1]]
        else:
            top_indices = np.argsort(scores)[::-1]

        return [(self._ids[int(i)], float(scores[int(i)])) for i in top_indices]
@@ -0,0 +1,271 @@
1
+ """SQLite backing store for code embeddings.
2
+
3
+ Stores embedding vectors as binary blobs and provides cosine-similarity search.
4
+ Implements :class:`~corbell.core.embeddings.base.EmbeddingStore`.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import sqlite3
10
+ from pathlib import Path
11
+ from typing import List, Optional, Tuple
12
+
13
+ import numpy as np
14
+
15
+ from corbell.core.embeddings.base import EmbeddingStore
16
+ from corbell.core.embeddings.extractor import EmbeddingRecord
17
+
18
# DDL for the single table that holds one row per code chunk. ``id`` is the
# primary key, so INSERT OR REPLACE on the same id overwrites the row.
# ``embedding`` is a nullable BLOB; the store writes it as raw float32 bytes
# and writes NULL when a record carries no embedding.
_CREATE_CHUNKS = """
CREATE TABLE IF NOT EXISTS embedding_chunks (
id TEXT PRIMARY KEY,
service_id TEXT NOT NULL,
repo TEXT NOT NULL,
file_path TEXT NOT NULL,
start_line INTEGER,
end_line INTEGER,
content TEXT NOT NULL,
language TEXT NOT NULL,
chunk_type TEXT NOT NULL,
symbol TEXT,
embedding BLOB
);
"""
# Secondary index on service_id, the column the store filters on when a
# search, count, or delete is restricted to specific services.
_CREATE_IDX = "CREATE INDEX IF NOT EXISTS idx_chunks_service ON embedding_chunks(service_id);"
34
+
35
+
36
class SQLiteEmbeddingStore(EmbeddingStore):
    """SQLite-backed embedding store with cosine-similarity search.

    Each chunk is one row of ``embedding_chunks``; the embedding vector is
    stored as a raw float32 blob for compactness. A fresh connection is opened
    for every operation and closed again when the operation finishes.
    Implements :class:`~corbell.core.embeddings.base.EmbeddingStore`.
    """

    # Single upsert statement shared by ``upsert`` and ``upsert_batch``; the
    # column order must match the tuple built by ``_record_params``.
    _UPSERT_SQL = (
        "INSERT OR REPLACE INTO embedding_chunks "
        "(id, service_id, repo, file_path, start_line, end_line, "
        "content, language, chunk_type, symbol, embedding) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    )

    # Upper bound on bound parameters per statement. SQLite builds older than
    # 3.32 default to a limit of 999 host parameters, so stay safely below it.
    _MAX_SQL_VARS = 900

    def __init__(self, db_path: Path | str):
        """Open (creating if necessary) the database at ``db_path``.

        Args:
            db_path: Location of the SQLite file. Missing parent directories
                are created.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _conn(self) -> sqlite3.Connection:
        """Open a new connection whose rows support access by column name."""
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row
        return conn

    def _session(self):
        """Return a context manager that yields a connection and closes it on exit.

        ``sqlite3.Connection`` used directly as a context manager only commits
        or rolls back; it never closes the connection. Wrapping it in
        ``closing`` releases the file handle after every operation.
        """
        from contextlib import closing  # local import keeps this class self-contained

        return closing(self._conn())

    def _init_db(self) -> None:
        """Create the chunk table and its service index if they do not exist."""
        with self._session() as conn:
            with conn:  # commit on success, roll back on error
                conn.execute(_CREATE_CHUNKS)
                conn.execute(_CREATE_IDX)

    # ------------------------------------------------------------------ #
    # Write                                                              #
    # ------------------------------------------------------------------ #

    def upsert(self, record: EmbeddingRecord) -> None:
        """Insert or replace a single embedding record."""
        self.upsert_batch([record])

    def upsert_batch(self, records: List[EmbeddingRecord]) -> None:
        """Bulk-upsert a list of records in a single transaction.

        Args:
            records: Records to write. A record whose ``id`` already exists
                replaces the stored row. An empty list is a no-op.
        """
        if not records:
            return
        params = [self._record_params(record) for record in records]
        with self._session() as conn:
            with conn:  # all rows are committed together or not at all
                conn.executemany(self._UPSERT_SQL, params)

    # ------------------------------------------------------------------ #
    # Read / Search                                                      #
    # ------------------------------------------------------------------ #

    def query(
        self,
        query_embedding: List[float],
        service_ids: Optional[List[str]] = None,
        top_k: int = 10,
    ) -> List[EmbeddingRecord]:
        """Return top-K most similar records by cosine similarity.

        Args:
            query_embedding: Query vector.
            service_ids: Restrict search to these service IDs (None or empty
                = all services).
            top_k: Maximum number of results to return. Values ``<= 0`` yield
                an empty list.

        Returns:
            List of :class:`EmbeddingRecord` ordered by descending similarity.
            Empty if the query vector has zero norm or nothing is stored.
        """
        # A negative top_k would otherwise slice ``scored[:top_k]`` and drop
        # results from the tail instead of limiting the head.
        if top_k <= 0:
            return []

        qvec = np.asarray(query_embedding, dtype=np.float32)
        qnorm = np.linalg.norm(qvec)
        if qnorm == 0:
            return []

        if service_ids:
            placeholders = ",".join("?" * len(service_ids))
            sql = (
                f"SELECT * FROM embedding_chunks WHERE service_id IN ({placeholders}) "
                f"AND embedding IS NOT NULL"
            )
            params = list(service_ids)
        else:
            sql = "SELECT * FROM embedding_chunks WHERE embedding IS NOT NULL"
            params = []

        with self._session() as conn:
            rows = conn.execute(sql, params).fetchall()

        if not rows:
            return []

        # Score every candidate row against the query.
        scored: List[Tuple[float, sqlite3.Row]] = []
        for row in rows:
            vec = self._blob_to_vec(row["embedding"])
            if vec is None:
                continue
            # The small epsilon avoids division by zero for all-zero vectors.
            sim = float(np.dot(qvec, vec) / (qnorm * np.linalg.norm(vec) + 1e-10))
            scored.append((sim, row))

        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [self._row_to_record(row) for _, row in scored[:top_k]]

    def count(self, service_id: Optional[str] = None) -> int:
        """Return number of stored chunks, optionally for a single service."""
        with self._session() as conn:
            if service_id:
                cursor = conn.execute(
                    "SELECT COUNT(*) FROM embedding_chunks WHERE service_id = ?",
                    (service_id,),
                )
            else:
                cursor = conn.execute("SELECT COUNT(*) FROM embedding_chunks")
            return cursor.fetchone()[0]

    def clear(self, service_id: Optional[str] = None) -> None:
        """Delete all chunks, or only those for a service."""
        with self._session() as conn:
            with conn:
                if service_id:
                    conn.execute(
                        "DELETE FROM embedding_chunks WHERE service_id = ?",
                        (service_id,),
                    )
                else:
                    conn.execute("DELETE FROM embedding_chunks")

    def delete_by_file(self, file_path: str, repo_id: str) -> int:
        """Delete all chunks for a specific file in a repo.

        Args:
            file_path: Relative file path within the repo.
            repo_id: The service/repo ID (matched against ``service_id``).

        Returns:
            Number of rows deleted.
        """
        with self._session() as conn:
            with conn:
                cursor = conn.execute(
                    "DELETE FROM embedding_chunks WHERE file_path = ? AND service_id = ?",
                    (file_path, repo_id),
                )
            return cursor.rowcount

    def get_all_vectors(self) -> List[Tuple[str, bytes]]:
        """Return all chunk IDs and their raw embedding blobs.

        Used by :class:`~corbell.core.embeddings.search_cache.EmbeddingSearchCache`
        to load all vectors into memory at once.

        Returns:
            List of ``(chunk_id, raw_blob)`` tuples for rows that have an embedding.
        """
        with self._session() as conn:
            rows = conn.execute(
                "SELECT id, embedding FROM embedding_chunks WHERE embedding IS NOT NULL"
            ).fetchall()
        return [(row["id"], row["embedding"]) for row in rows]

    def get_chunks_by_ids(self, chunk_ids: List[str]) -> List[EmbeddingRecord]:
        """Fetch full EmbeddingRecord objects for the given IDs.

        IDs are de-duplicated and looked up in batches so that an arbitrarily
        long list never exceeds SQLite's bound-parameter limit.

        Args:
            chunk_ids: List of chunk IDs to retrieve.

        Returns:
            List of :class:`EmbeddingRecord` objects (order not guaranteed).
            IDs that are not present in the store are silently omitted.
        """
        if not chunk_ids:
            return []

        # De-duplicate (keeping first-seen order) so an id split across two
        # batches cannot be returned twice.
        unique_ids = list(dict.fromkeys(chunk_ids))
        records: List[EmbeddingRecord] = []
        with self._session() as conn:
            for start in range(0, len(unique_ids), self._MAX_SQL_VARS):
                batch = unique_ids[start:start + self._MAX_SQL_VARS]
                placeholders = ",".join("?" * len(batch))
                rows = conn.execute(
                    f"SELECT * FROM embedding_chunks WHERE id IN ({placeholders})",
                    batch,
                ).fetchall()
                records.extend(self._row_to_record(row) for row in rows)
        return records

    # ------------------------------------------------------------------ #
    # Serialization helpers                                              #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _record_params(record: EmbeddingRecord) -> tuple:
        """Build the parameter tuple for ``_UPSERT_SQL`` from a record."""
        embedding = record.embedding
        # Explicit None/length test: plain truthiness raises for numpy arrays.
        has_embedding = embedding is not None and len(embedding) > 0
        emb_blob = SQLiteEmbeddingStore._vec_to_blob(embedding) if has_embedding else None
        return (
            record.id,
            record.service_id,
            record.repo,
            record.file_path,
            record.start_line,
            record.end_line,
            record.content,
            record.language,
            record.chunk_type,
            record.symbol,
            emb_blob,
        )

    @staticmethod
    def _vec_to_blob(vec: List[float]) -> bytes:
        """Serialize a vector to raw float32 bytes."""
        arr = np.array(vec, dtype=np.float32)
        return arr.tobytes()

    @staticmethod
    def _blob_to_vec(blob: bytes) -> Optional[np.ndarray]:
        """Decode raw float32 bytes into an array; None for a NULL/empty blob."""
        if not blob:
            return None
        return np.frombuffer(blob, dtype=np.float32)

    @staticmethod
    def _row_to_record(row: sqlite3.Row) -> EmbeddingRecord:
        """Convert a full ``embedding_chunks`` row into an EmbeddingRecord."""
        vec = SQLiteEmbeddingStore._blob_to_vec(row["embedding"])
        return EmbeddingRecord(
            id=row["id"],
            service_id=row["service_id"],
            repo=row["repo"],
            file_path=row["file_path"],
            # NULL line numbers are surfaced as 0.
            start_line=row["start_line"] or 0,
            end_line=row["end_line"] or 0,
            content=row["content"],
            language=row["language"],
            chunk_type=row["chunk_type"],
            symbol=row["symbol"],
            embedding=vec.tolist() if vec is not None else None,
        )
@@ -0,0 +1,76 @@
1
+ """Gitignore-aware path matching for file discovery."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import List
7
+
8
+ import pathspec
9
+
10
+ from corbell.core.constants import SKIP_DIRS
11
+
12
+
13
def load_gitignore(repo_path: Path) -> pathspec.PathSpec:
    """Build one matcher from every ignore file found under ``repo_path``.

    Patterns are gathered, in this order, from:
    - .git/info/exclude
    - the root .gitignore
    - nested .gitignore files, rewritten so they are anchored to the
      directory that contains them

    Nested files located beneath a directory listed in ``SKIP_DIRS`` are not
    read.

    Returns a PathSpec that matches repo-root-relative paths.
    If no ignore files exist, the returned matcher is empty (matches nothing).
    """
    top_level_ignore = repo_path / ".gitignore"
    patterns: List[str] = []

    # Repo-wide sources first: their patterns are taken as written.
    for candidate in (repo_path / ".git" / "info" / "exclude", top_level_ignore):
        if candidate.is_file():
            patterns += _read_patterns(candidate, rel_dir="")

    # Then every .gitignore below the root, each scoped to its own directory.
    for nested in repo_path.rglob(".gitignore"):
        if nested == top_level_ignore:
            continue
        parent_rel = nested.parent.relative_to(repo_path)
        inside_skipped_dir = any(segment in SKIP_DIRS for segment in parent_rel.parts)
        if not inside_skipped_dir:
            # Normalize separators so the prefix is always forward-slashed.
            scope = str(parent_rel).replace("\\", "/")
            patterns += _read_patterns(nested, rel_dir=scope)

    return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
46
+
47
+
48
+ def _read_patterns(gi_path: Path, rel_dir: str) -> List[str]:
49
+ """Read a .gitignore file and transform patterns to be repo-root-relative."""
50
+ result: List[str] = []
51
+ try:
52
+ content = gi_path.read_text(encoding="utf-8", errors="ignore")
53
+ except OSError:
54
+ return result
55
+
56
+ for raw in content.splitlines():
57
+ line = raw.strip()
58
+ if not line or line.startswith("#"):
59
+ continue
60
+
61
+ negate = ""
62
+ if line.startswith("!"):
63
+ negate = "!"
64
+ line = line[1:]
65
+
66
+ if not rel_dir:
67
+ result.append(negate + line)
68
+ else:
69
+ if line.startswith("/"):
70
+ result.append(negate + rel_dir + "/" + line[1:])
71
+ elif "/" in line.rstrip("/"):
72
+ result.append(negate + rel_dir + "/" + line)
73
+ else:
74
+ result.append(negate + rel_dir + "/**/" + line)
75
+
76
+ return result
@@ -0,0 +1 @@
1
+ """Corbell graph module."""