PyPI - codebase-retrieval-context-engine - Versions diffs - 2.0.0__py3-none-any.whl - Mend

codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
corbell/__init__.py +6 -0
corbell/cli/__init__.py +1 -0
corbell/cli/commands/__init__.py +1 -0
corbell/cli/commands/index.py +86 -0
corbell/cli/commands/query.py +71 -0
corbell/cli/main.py +57 -0
corbell/core/__init__.py +1 -0
corbell/core/constants.py +52 -0
corbell/core/embeddings/__init__.py +6 -0
corbell/core/embeddings/base.py +68 -0
corbell/core/embeddings/extractor.py +201 -0
corbell/core/embeddings/factory.py +48 -0
corbell/core/embeddings/model.py +401 -0
corbell/core/embeddings/search_cache.py +95 -0
corbell/core/embeddings/sqlite_store.py +271 -0
corbell/core/gitignore.py +76 -0
corbell/core/graph/__init__.py +1 -0
corbell/core/graph/builder.py +696 -0
corbell/core/graph/method_graph.py +1077 -0
corbell/core/graph/providers/__init__.py +6 -0
corbell/core/graph/providers/aws_patterns.py +62 -0
corbell/core/graph/providers/azure_patterns.py +64 -0
corbell/core/graph/providers/gcp_patterns.py +59 -0
corbell/core/graph/schema.py +175 -0
corbell/core/graph/sqlite_store.py +500 -0
corbell/core/indexing/__init__.py +1 -0
corbell/core/indexing/builder.py +608 -0
corbell/core/indexing/lock.py +150 -0
corbell/core/indexing/tracker.py +245 -0
corbell/core/llm_client.py +677 -0
corbell/core/mcp/__init__.py +1 -0
corbell/core/mcp/server.py +214 -0
corbell/core/query/__init__.py +1 -0
corbell/core/query/diagnostics.py +38 -0
corbell/core/query/engine.py +321 -0
corbell/core/query/enhancer.py +102 -0
corbell/core/query/formatter.py +98 -0
corbell/core/query/graph_expander.py +284 -0
corbell/core/query/merger.py +171 -0
corbell/core/query/reranker.py +131 -0
corbell/core/workspace.py +408 -0

corbell/core/embeddings/search_cache.py ADDED Viewed

@@ -0,0 +1,95 @@
+"""In-memory numpy matrix cache for fast vectorized cosine similarity search."""
+from __future__ import annotations
+from typing import TYPE_CHECKING, List, Optional, Tuple
+import numpy as np
+if TYPE_CHECKING:
+    from corbell.core.embeddings.sqlite_store import SQLiteEmbeddingStore
+class EmbeddingSearchCache:
+    """Process-resident cache of all embedding vectors as a numpy matrix.
+    Loads all vectors from the embedding store once and performs vectorized
+    cosine similarity using matrix multiplication (O(n) instead of O(n) with
+    constant factors ~150x faster than row-by-row).
+    For 30K chunks @ 384 dims: ~46 MB memory, ~2ms per query.
+    """
+    def __init__(self) -> None:
+        self._ids: List[str] = []
+        self._matrix: Optional[np.ndarray] = None  # shape (n_chunks, dim)
+    @property
+    def is_loaded(self) -> bool:
+        """True if the cache has been loaded from the store."""
+        return self._matrix is not None and len(self._ids) > 0
+    def load(self, store: "SQLiteEmbeddingStore") -> None:
+        """Load all embedding vectors from the store into a numpy matrix.
+        Args:
+            store: The embedding store to load vectors from.
+        """
+        rows = store.get_all_vectors()
+        if not rows:
+            self._ids = []
+            self._matrix = None
+            return
+        ids: List[str] = []
+        vecs: List[np.ndarray] = []
+        for chunk_id, blob in rows:
+            vec = np.frombuffer(blob, dtype=np.float32)
+            nrm = np.linalg.norm(vec)
+            if nrm > 0:
+                vecs.append(vec / nrm)  # pre-normalize for cosine via dot product
+            else:
+                vecs.append(vec)
+            ids.append(chunk_id)
+        self._ids = ids
+        self._matrix = np.stack(vecs, axis=0)  # shape (n_chunks, dim)
+    def search(self, query_vec: np.ndarray, top_k: int = 50) -> List[Tuple[str, float]]:
+        """Search for the top-K most similar chunks.
+        Args:
+            query_vec: Query embedding vector (will be normalized internally).
+            top_k: Number of results to return.
+        Returns:
+            List of ``(chunk_id, score)`` tuples ordered by descending similarity.
+            Returns empty list if cache is not loaded.
+        """
+        if not self.is_loaded or self._matrix is None:
+            return []
+        qvec = np.array(query_vec, dtype=np.float32)
+        qnorm = float(np.linalg.norm(qvec))
+        if qnorm == 0:
+            return []
+        qvec = qvec / qnorm  # normalize query
+        # Vectorized cosine similarity: matrix @ query_vec
+        # (matrix rows are already normalized, so dot product = cosine similarity)
+        scores: np.ndarray = self._matrix @ qvec  # shape (n_chunks,)
+        n = len(self._ids)
+        actual_k = min(top_k, n)
+        # Use argpartition for O(n) top-K selection instead of full sort
+        if actual_k < n:
+            # argpartition gives the indices of the top-k (unsorted)
+            top_indices = np.argpartition(scores, -actual_k)[-actual_k:]
+            # Sort only the top-k indices by score (descending)
+            top_indices = top_indices[np.argsort(scores[top_indices])[::-1]]
+        else:
+            top_indices = np.argsort(scores)[::-1]
+        return [(self._ids[int(i)], float(scores[int(i)])) for i in top_indices]

corbell/core/embeddings/sqlite_store.py ADDED Viewed

@@ -0,0 +1,271 @@
+"""SQLite backing store for code embeddings.
+Stores embedding vectors as binary blobs and provides cosine-similarity search.
+Implements :class:`~corbell.core.embeddings.base.EmbeddingStore`.
+"""
+from __future__ import annotations
+import sqlite3
+from contextlib import closing
+from pathlib import Path
+from typing import List, Optional, Tuple
+import numpy as np
+from corbell.core.embeddings.base import EmbeddingStore
+from corbell.core.embeddings.extractor import EmbeddingRecord
+# DDL for the single table that holds one row per code chunk, keyed by ``id``.
+# ``embedding`` carries no NOT NULL constraint; the search queries in this
+# module filter on ``embedding IS NOT NULL`` to skip rows without a vector.
+_CREATE_CHUNKS = """
+CREATE TABLE IF NOT EXISTS embedding_chunks (
+    id TEXT PRIMARY KEY,
+    service_id TEXT NOT NULL,
+    repo TEXT NOT NULL,
+    file_path TEXT NOT NULL,
+    start_line INTEGER,
+    end_line INTEGER,
+    content TEXT NOT NULL,
+    language TEXT NOT NULL,
+    chunk_type TEXT NOT NULL,
+    symbol TEXT,
+    embedding BLOB
+);
+"""
+# Secondary index on ``service_id``, the column used by this module's
+# per-service filters (query / count / clear / delete_by_file).
+_CREATE_IDX = "CREATE INDEX IF NOT EXISTS idx_chunks_service ON embedding_chunks(service_id);"
+class SQLiteEmbeddingStore(EmbeddingStore):
+    """SQLite-backed embedding store with cosine-similarity search.
+    The embedding vector is stored as a raw float32 blob for compactness.
+    Implements :class:`~corbell.core.embeddings.base.EmbeddingStore`.
+    """
+    def __init__(self, db_path: Path | str):
+        self.db_path = Path(db_path)
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+        self._init_db()
+    def _conn(self) -> sqlite3.Connection:
+        conn = sqlite3.connect(str(self.db_path))
+        conn.row_factory = sqlite3.Row
+        return conn
+    def _init_db(self) -> None:
+        with self._conn() as conn:
+            conn.execute(_CREATE_CHUNKS)
+            conn.execute(_CREATE_IDX)
+            conn.commit()
+    # ------------------------------------------------------------------ #
+    # Write                                                                #
+    # ------------------------------------------------------------------ #
+    def upsert(self, record: EmbeddingRecord) -> None:
+        """Insert or replace a single embedding record."""
+        emb_blob = self._vec_to_blob(record.embedding) if record.embedding else None
+        with self._conn() as conn:
+            conn.execute(
+                """INSERT OR REPLACE INTO embedding_chunks
+                   (id, service_id, repo, file_path, start_line, end_line,
+                    content, language, chunk_type, symbol, embedding)
+                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                (
+                    record.id,
+                    record.service_id,
+                    record.repo,
+                    record.file_path,
+                    record.start_line,
+                    record.end_line,
+                    record.content,
+                    record.language,
+                    record.chunk_type,
+                    record.symbol,
+                    emb_blob,
+                ),
+            )
+            conn.commit()
+    def upsert_batch(self, records: List[EmbeddingRecord]) -> None:
+        """Bulk-upsert a list of records."""
+        with self._conn() as conn:
+            for record in records:
+                emb_blob = self._vec_to_blob(record.embedding) if record.embedding else None
+                conn.execute(
+                    """INSERT OR REPLACE INTO embedding_chunks
+                       (id, service_id, repo, file_path, start_line, end_line,
+                        content, language, chunk_type, symbol, embedding)
+                       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                    (
+                        record.id,
+                        record.service_id,
+                        record.repo,
+                        record.file_path,
+                        record.start_line,
+                        record.end_line,
+                        record.content,
+                        record.language,
+                        record.chunk_type,
+                        record.symbol,
+                        emb_blob,
+                    ),
+                )
+            conn.commit()
+    # ------------------------------------------------------------------ #
+    # Read / Search                                                        #
+    # ------------------------------------------------------------------ #
+    def query(
+        self,
+        query_embedding: List[float],
+        service_ids: Optional[List[str]] = None,
+        top_k: int = 10,
+    ) -> List[EmbeddingRecord]:
+        """Return top-K most similar records by cosine similarity.
+        Args:
+            query_embedding: Query vector.
+            service_ids: Restrict search to these service IDs (None = all).
+            top_k: Number of results to return.
+        Returns:
+            List of :class:`EmbeddingRecord` ordered by descending similarity.
+        """
+        qvec = np.array(query_embedding, dtype=np.float32)
+        qnorm = np.linalg.norm(qvec)
+        if qnorm == 0:
+            return []
+        with self._conn() as conn:
+            if service_ids:
+                placeholders = ",".join("?" * len(service_ids))
+                rows = conn.execute(
+                    f"SELECT * FROM embedding_chunks WHERE service_id IN ({placeholders}) "
+                    f"AND embedding IS NOT NULL",
+                    service_ids,
+                ).fetchall()
+            else:
+                rows = conn.execute(
+                    "SELECT * FROM embedding_chunks WHERE embedding IS NOT NULL"
+                ).fetchall()
+        if not rows:
+            return []
+        # Compute cosine similarities
+        scored: List[Tuple[float, sqlite3.Row]] = []
+        for row in rows:
+            vec = self._blob_to_vec(row["embedding"])
+            if vec is None:
+                continue
+            sim = float(np.dot(qvec, vec) / (qnorm * np.linalg.norm(vec) + 1e-10))
+            scored.append((sim, row))
+        scored.sort(key=lambda x: x[0], reverse=True)
+        top = scored[:top_k]
+        return [self._row_to_record(row) for _, row in top]
+    def count(self, service_id: Optional[str] = None) -> int:
+        """Return number of stored chunks."""
+        with self._conn() as conn:
+            if service_id:
+                return conn.execute(
+                    "SELECT COUNT(*) FROM embedding_chunks WHERE service_id = ?", (service_id,)
+                ).fetchone()[0]
+            return conn.execute("SELECT COUNT(*) FROM embedding_chunks").fetchone()[0]
+    def clear(self, service_id: Optional[str] = None) -> None:
+        """Delete all chunks, or only those for a service."""
+        with self._conn() as conn:
+            if service_id:
+                conn.execute(
+                    "DELETE FROM embedding_chunks WHERE service_id = ?", (service_id,)
+                )
+            else:
+                conn.execute("DELETE FROM embedding_chunks")
+            conn.commit()
+    def delete_by_file(self, file_path: str, repo_id: str) -> int:
+        """Delete all chunks for a specific file in a repo.
+        Args:
+            file_path: Relative file path within the repo.
+            repo_id: The service/repo ID.
+        Returns:
+            Number of rows deleted.
+        """
+        with self._conn() as conn:
+            cursor = conn.execute(
+                "DELETE FROM embedding_chunks WHERE file_path = ? AND service_id = ?",
+                (file_path, repo_id),
+            )
+            conn.commit()
+            return cursor.rowcount
+    def get_all_vectors(self) -> List[Tuple[str, bytes]]:
+        """Return all chunk IDs and their raw embedding blobs.
+        Used by :class:`~corbell.core.embeddings.search_cache.EmbeddingSearchCache`
+        to load all vectors into memory at once.
+        Returns:
+            List of ``(chunk_id, raw_blob)`` tuples for rows that have an embedding.
+        """
+        with self._conn() as conn:
+            rows = conn.execute(
+                "SELECT id, embedding FROM embedding_chunks WHERE embedding IS NOT NULL"
+            ).fetchall()
+        return [(row["id"], row["embedding"]) for row in rows]
+    def get_chunks_by_ids(self, chunk_ids: List[str]) -> List[EmbeddingRecord]:
+        """Fetch full EmbeddingRecord objects for the given IDs.
+        Args:
+            chunk_ids: List of chunk IDs to retrieve.
+        Returns:
+            List of :class:`EmbeddingRecord` objects (order not guaranteed).
+        """
+        if not chunk_ids:
+            return []
+        with self._conn() as conn:
+            placeholders = ",".join("?" * len(chunk_ids))
+            rows = conn.execute(
+                f"SELECT * FROM embedding_chunks WHERE id IN ({placeholders})",
+                chunk_ids,
+            ).fetchall()
+        return [self._row_to_record(row) for row in rows]
+    # ------------------------------------------------------------------ #
+    # Serialization helpers                                                #
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def _vec_to_blob(vec: List[float]) -> bytes:
+        arr = np.array(vec, dtype=np.float32)
+        return arr.tobytes()
+    @staticmethod
+    def _blob_to_vec(blob: bytes) -> Optional[np.ndarray]:
+        if not blob:
+            return None
+        return np.frombuffer(blob, dtype=np.float32)
+    @staticmethod
+    def _row_to_record(row: sqlite3.Row) -> EmbeddingRecord:
+        vec = SQLiteEmbeddingStore._blob_to_vec(row["embedding"])
+        return EmbeddingRecord(
+            id=row["id"],
+            service_id=row["service_id"],
+            repo=row["repo"],
+            file_path=row["file_path"],
+            start_line=row["start_line"] or 0,
+            end_line=row["end_line"] or 0,
+            content=row["content"],
+            language=row["language"],
+            chunk_type=row["chunk_type"],
+            symbol=row["symbol"],
+            embedding=vec.tolist() if vec is not None else None,
+        )

corbell/core/gitignore.py ADDED Viewed

@@ -0,0 +1,76 @@
+"""Gitignore-aware path matching for file discovery."""
+from __future__ import annotations
+from pathlib import Path
+from typing import List
+import pathspec
+from corbell.core.constants import SKIP_DIRS
+def load_gitignore(repo_path: Path) -> pathspec.PathSpec:
+    """Load all .gitignore rules for a repo and return a combined matcher.
+    Collects patterns from:
+    - .git/info/exclude
+    - Root .gitignore
+    - Nested .gitignore files (with proper path anchoring)
+    Returns a PathSpec that matches repo-root-relative paths.
+    If no gitignore files exist, returns an empty matcher (matches nothing).
+    """
+    lines: List[str] = []
+    # .git/info/exclude
+    exclude = repo_path / ".git" / "info" / "exclude"
+    if exclude.is_file():
+        lines.extend(_read_patterns(exclude, rel_dir=""))
+    # Root .gitignore
+    root_gi = repo_path / ".gitignore"
+    if root_gi.is_file():
+        lines.extend(_read_patterns(root_gi, rel_dir=""))
+    # Nested .gitignore files
+    for gi in repo_path.rglob(".gitignore"):
+        if gi == root_gi:
+            continue
+        rel = gi.parent.relative_to(repo_path)
+        if any(part in SKIP_DIRS for part in rel.parts):
+            continue
+        lines.extend(_read_patterns(gi, rel_dir=str(rel).replace("\\", "/")))
+    return pathspec.PathSpec.from_lines("gitwildmatch", lines)
+def _read_patterns(gi_path: Path, rel_dir: str) -> List[str]:
+    """Read a .gitignore file and transform patterns to be repo-root-relative."""
+    result: List[str] = []
+    try:
+        content = gi_path.read_text(encoding="utf-8", errors="ignore")
+    except OSError:
+        return result
+    for raw in content.splitlines():
+        line = raw.strip()
+        if not line or line.startswith("#"):
+            continue
+        negate = ""
+        if line.startswith("!"):
+            negate = "!"
+            line = line[1:]
+        if not rel_dir:
+            result.append(negate + line)
+        else:
+            if line.startswith("/"):
+                result.append(negate + rel_dir + "/" + line[1:])
+            elif "/" in line.rstrip("/"):
+                result.append(negate + rel_dir + "/" + line)
+            else:
+                result.append(negate + rel_dir + "/**/" + line)
+    return result

corbell/core/graph/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Corbell graph module."""