PyPI - code-context-engine - Versions diffs - 0.4.0__py3-none-any.whl - Mend

code-context-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

code_context_engine-0.4.0.dist-info/METADATA +389 -0
code_context_engine-0.4.0.dist-info/RECORD +63 -0
code_context_engine-0.4.0.dist-info/WHEEL +5 -0
code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
context_engine/__init__.py +3 -0
context_engine/cli.py +2848 -0
context_engine/cli_style.py +66 -0
context_engine/compression/__init__.py +0 -0
context_engine/compression/compressor.py +144 -0
context_engine/compression/ollama_client.py +33 -0
context_engine/compression/output_rules.py +77 -0
context_engine/compression/prompts.py +9 -0
context_engine/compression/quality.py +37 -0
context_engine/config.py +198 -0
context_engine/dashboard/__init__.py +0 -0
context_engine/dashboard/_page.py +1548 -0
context_engine/dashboard/server.py +429 -0
context_engine/editors.py +265 -0
context_engine/event_bus.py +24 -0
context_engine/indexer/__init__.py +0 -0
context_engine/indexer/chunker.py +147 -0
context_engine/indexer/embedder.py +154 -0
context_engine/indexer/embedding_cache.py +168 -0
context_engine/indexer/git_hooks.py +73 -0
context_engine/indexer/git_indexer.py +136 -0
context_engine/indexer/ignorefile.py +96 -0
context_engine/indexer/manifest.py +78 -0
context_engine/indexer/pipeline.py +624 -0
context_engine/indexer/secrets.py +332 -0
context_engine/indexer/watcher.py +109 -0
context_engine/integration/__init__.py +0 -0
context_engine/integration/bootstrap.py +76 -0
context_engine/integration/git_context.py +132 -0
context_engine/integration/mcp_server.py +1825 -0
context_engine/integration/session_capture.py +306 -0
context_engine/memory/__init__.py +6 -0
context_engine/memory/compressor.py +344 -0
context_engine/memory/db.py +922 -0
context_engine/memory/extractive.py +106 -0
context_engine/memory/grammar.py +419 -0
context_engine/memory/hook_installer.py +258 -0
context_engine/memory/hook_server.py +83 -0
context_engine/memory/hooks.py +327 -0
context_engine/memory/migrate.py +268 -0
context_engine/models.py +96 -0
context_engine/pricing.py +104 -0
context_engine/project_commands.py +296 -0
context_engine/retrieval/__init__.py +0 -0
context_engine/retrieval/confidence.py +47 -0
context_engine/retrieval/query_parser.py +105 -0
context_engine/retrieval/retriever.py +199 -0
context_engine/serve_http.py +208 -0
context_engine/services.py +252 -0
context_engine/storage/__init__.py +0 -0
context_engine/storage/backend.py +39 -0
context_engine/storage/fts_store.py +112 -0
context_engine/storage/graph_store.py +219 -0
context_engine/storage/local_backend.py +109 -0
context_engine/storage/remote_backend.py +117 -0
context_engine/storage/vector_store.py +357 -0
context_engine/utils.py +72 -0

context_engine/storage/vector_store.py ADDED Viewed

@@ -0,0 +1,357 @@
+"""SQLite-vec backed vector store for chunk embeddings.
+Replaces LanceDB (217MB) with sqlite-vec (~2MB). Same API, same search
+quality. Uses cosine distance for similarity ranking.
+Schema:
+  chunks — regular table storing chunk metadata + content
+  chunks_vec — vec0 virtual table storing embeddings for vector search
+"""
+import logging
+import os
+import sqlite3
+import struct
+from threading import RLock
+from context_engine.models import Chunk, ChunkType
+log = logging.getLogger(__name__)
+_MAX_CONTENT_CHARS = 5_000
+def _to_list(embedding) -> list[float]:
+    """Return *embedding* as a plain list.
+    A value that is already a ``list`` is handed back as-is (no copy); any
+    other iterable is materialised with ``list()``.
+    """
+    return embedding if isinstance(embedding, list) else list(embedding)
+def _serialize_vec(vec) -> bytes:
+    """Encode a float vector as the packed byte blob handed to sqlite-vec.
+    Each element is written with the ``struct`` ``f`` format (native byte
+    order), so the result is ``4 * len(vec)`` bytes on common platforms.
+    """
+    values = _to_list(vec)
+    layout = f"{len(values)}f"
+    return struct.pack(layout, *values)
+class VectorStore:
+    def __init__(self, db_path: str) -> None:
+        """Open the store rooted at the directory *db_path*.
+        The directory is created when missing; the SQLite database itself
+        lives at ``<db_path>/vectors.db``. Opening the store also creates the
+        metadata tables if they do not exist yet.
+        """
+        os.makedirs(db_path, exist_ok=True)
+        self._db_path = db_path
+        self._db_file = os.path.join(db_path, "vectors.db")
+        # Serialises access to the single shared connection.
+        self._lock = RLock()
+        # Embedding width; stays None until a vector table is found or created.
+        self._dim: int | None = None
+        self._conn = self._connect()
+        self._ensure_tables()
+    def _connect(self) -> sqlite3.Connection:
+        """Open ``self._db_file`` and load the sqlite-vec extension into it.
+        Returns:
+            A connection usable from any thread (``check_same_thread=False``)
+            with WAL journaling and ``synchronous=NORMAL`` configured.
+        Raises:
+            Whatever ``sqlite3`` / ``sqlite_vec`` raise while opening or
+            configuring the database. The half-initialised connection is
+            closed first so the file handle is not leaked.
+        """
+        import sqlite_vec
+        conn = sqlite3.connect(self._db_file, check_same_thread=False)
+        try:
+            conn.enable_load_extension(True)
+            try:
+                sqlite_vec.load(conn)
+            finally:
+                # Never leave extension loading enabled, even if the load failed.
+                conn.enable_load_extension(False)
+            conn.execute("PRAGMA journal_mode=WAL")
+            conn.execute("PRAGMA synchronous=NORMAL")
+        except BaseException:
+            conn.close()
+            raise
+        return conn
+    def _ensure_tables(self) -> None:
+        """Create the metadata tables and recover the stored embedding dimension.
+        ``chunks`` (plus its ``file_path`` index) and ``chunk_compressions``
+        are created when missing. If a ``chunks_vec`` table already exists,
+        ``self._dim`` is read from its first row; when the table is empty the
+        dimension is taken from the ``float[N]`` declaration recorded in
+        ``sqlite_master`` instead, so a dimension change is still detected
+        by ``_ensure_vec_table`` rather than hidden behind
+        ``CREATE ... IF NOT EXISTS``.
+        """
+        import re
+        with self._lock:
+            self._conn.execute("""
+                CREATE TABLE IF NOT EXISTS chunks (
+                    id TEXT PRIMARY KEY,
+                    content TEXT NOT NULL,
+                    chunk_type TEXT NOT NULL,
+                    file_path TEXT NOT NULL,
+                    start_line INTEGER NOT NULL,
+                    end_line INTEGER NOT NULL,
+                    language TEXT NOT NULL
+                )
+            """)
+            self._conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_chunks_file_path
+                ON chunks(file_path)
+            """)
+            # Cache of LLM-summarised / truncated chunk text. Keyed by
+            # (chunk_id, level) because different compression levels produce
+            # different output. Cleared when the chunk's file is deleted
+            # (see the explicit DELETE in delete_by_files).
+            self._conn.execute("""
+                CREATE TABLE IF NOT EXISTS chunk_compressions (
+                    chunk_id TEXT NOT NULL,
+                    level TEXT NOT NULL,
+                    compressed TEXT NOT NULL,
+                    PRIMARY KEY (chunk_id, level)
+                )
+            """)
+            # Detect vector dimension from an existing vector table.
+            schema = self._conn.execute(
+                "SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks_vec'"
+            ).fetchone()
+            if schema:
+                has_rows = self._conn.execute(
+                    "SELECT rowid FROM chunks_vec LIMIT 1"
+                ).fetchone()
+                if has_rows:
+                    self._dim = self._conn.execute(
+                        "SELECT vec_length(embedding) FROM chunks_vec LIMIT 1"
+                    ).fetchone()[0]
+                else:
+                    # Empty table: fall back to the width declared at creation.
+                    declared = re.search(r"float\[(\d+)\]", schema[0] or "")
+                    if declared:
+                        self._dim = int(declared.group(1))
+            self._conn.commit()
+    def _ensure_vec_table(self, dim: int) -> None:
+        """Guarantee that ``chunks_vec`` exists and stores *dim*-wide embeddings.
+        Returns immediately when the store already uses *dim*. If a different
+        dimension was in use, the whole index (vectors, chunk rows and cached
+        compressions) is wiped before the vector table is recreated.
+        """
+        if dim == self._dim:
+            return
+        with self._lock:
+            previous = self._dim
+            dimension_changed = previous is not None and previous != dim
+            if dimension_changed:
+                log.warning(
+                    "Embedding dimension changed (%d -> %d), rebuilding vector table",
+                    previous, dim,
+                )
+                # Both halves of the index go together: keeping `chunks`
+                # without `chunks_vec` would leave rows that are still counted
+                # but can never be returned by a vector search.
+                # chunk_compressions is keyed by chunk_id, so it is flushed too.
+                for statement in (
+                    "DROP TABLE IF EXISTS chunks_vec",
+                    "DELETE FROM chunks",
+                    "DELETE FROM chunk_compressions",
+                ):
+                    self._conn.execute(statement)
+            create_sql = f"""
+                CREATE VIRTUAL TABLE IF NOT EXISTS chunks_vec
+                USING vec0(embedding float[{dim}])
+            """
+            self._conn.execute(create_sql)
+            self._dim = dim
+            self._conn.commit()
+    def _chunk_to_row(self, chunk: Chunk) -> tuple:
+        """Flatten *chunk* into a tuple in the column order of the ``chunks`` table.
+        Content longer than ``_MAX_CONTENT_CHARS`` is cut to that length and
+        suffixed with a truncation marker.
+        """
+        text = chunk.content
+        if len(text) > _MAX_CONTENT_CHARS:
+            text = f"{text[:_MAX_CONTENT_CHARS]}\n...[truncated]"
+        kind = chunk.chunk_type.value
+        return (
+            chunk.id,
+            text,
+            kind,
+            chunk.file_path,
+            chunk.start_line,
+            chunk.end_line,
+            chunk.language,
+        )
+    def _row_to_chunk(self, row, distance: float | None = None) -> Chunk:
+        """Rebuild a ``Chunk`` from a ``chunks`` row.
+        *row* must hold, in order: id, content, chunk_type, file_path,
+        start_line, end_line, language. When *distance* is given it is stored
+        under ``metadata["_distance"]`` on the returned chunk.
+        """
+        kind = ChunkType(row[2])
+        result = Chunk(
+            id=row[0],
+            content=row[1],
+            chunk_type=kind,
+            file_path=row[3],
+            start_line=row[4],
+            end_line=row[5],
+            language=row[6],
+        )
+        if distance is None:
+            return result
+        result.metadata["_distance"] = distance
+        return result
+    async def ingest(self, chunks: list[Chunk]) -> None:
+        """Upsert *chunks* and their embeddings into the store.
+        Chunks without an embedding are skipped. The dimension of the first
+        embedded chunk decides the vector-table width (see
+        ``_ensure_vec_table``). All rows are written in one transaction.
+        Raises:
+            Any database error from the writes; the partial batch is rolled
+            back before the error propagates.
+        """
+        if not chunks:
+            return
+        # Test length explicitly: array-like embeddings (e.g. numpy arrays)
+        # raise on plain truthiness checks.
+        valid = [
+            c for c in chunks
+            if c.embedding is not None and len(c.embedding) > 0
+        ]
+        if not valid:
+            log.warning("ingest called but no chunks have embeddings")
+            return
+        dim = len(valid[0].embedding)
+        self._ensure_vec_table(dim)
+        with self._lock:
+            cursor = self._conn.cursor()
+            try:
+                for chunk in valid:
+                    row = self._chunk_to_row(chunk)
+                    # Upsert the metadata row; its rowid links it to the vector row.
+                    rowid = cursor.execute(
+                        "INSERT INTO chunks "
+                        "(id, content, chunk_type, file_path, start_line, end_line, language) "
+                        "VALUES (?, ?, ?, ?, ?, ?, ?) "
+                        "ON CONFLICT(id) DO UPDATE SET "
+                        "content = excluded.content, "
+                        "chunk_type = excluded.chunk_type, "
+                        "file_path = excluded.file_path, "
+                        "start_line = excluded.start_line, "
+                        "end_line = excluded.end_line, "
+                        "language = excluded.language "
+                        "RETURNING rowid",
+                        row,
+                    ).fetchone()[0]
+                    # Replace any previous vector for this rowid.
+                    cursor.execute("DELETE FROM chunks_vec WHERE rowid = ?", (rowid,))
+                    cursor.execute(
+                        "INSERT INTO chunks_vec(rowid, embedding) VALUES (?, ?)",
+                        (rowid, _serialize_vec(chunk.embedding)),
+                    )
+                self._conn.commit()
+            except Exception:
+                # Drop the half-written batch so a later commit from another
+                # method cannot persist chunks that have no vector.
+                self._conn.rollback()
+                raise
+    async def search(
+        self,
+        query_embedding,
+        top_k: int = 10,
+        filters: dict | None = None,
+    ) -> list[Chunk]:
+        """Return up to *top_k* chunks nearest to *query_embedding*.
+        Args:
+            query_embedding: Query vector (list or any iterable of floats).
+            top_k: Maximum number of results; non-positive values yield ``[]``.
+            filters: Optional filter dict. Only ``"file_path"`` is honoured;
+                other keys are logged and ignored.
+        Returns:
+            Chunks ordered by ascending distance, each carrying its distance
+            in ``metadata["_distance"]``. An empty list is returned when the
+            store has no vector table yet or when the query fails.
+        """
+        if top_k <= 0:
+            # Nothing to fetch; avoid a pointless (and possibly failing) query.
+            return []
+        embedding_list = _to_list(query_embedding)
+        with self._lock:
+            if self._dim is None:
+                return []
+            try:
+                query_bytes = _serialize_vec(embedding_list)
+                unsupported = set(filters or {}) - {"file_path"}
+                if unsupported:
+                    log.warning("Unsupported filter keys ignored: %s", unsupported)
+                params = [query_bytes, top_k]
+                path_clause = ""
+                if filters and "file_path" in filters:
+                    # The path filter is applied to the KNN candidates, so
+                    # over-fetch 3x and trim to top_k afterwards.
+                    params = [query_bytes, top_k * 3, filters["file_path"]]
+                    path_clause = "AND c.file_path = ?"
+                # sqlite-vec requires k=? in WHERE, not LIMIT
+                rows = self._conn.execute(
+                    f"""
+                    SELECT c.id, c.content, c.chunk_type, c.file_path,
+                           c.start_line, c.end_line, c.language, v.distance
+                    FROM chunks_vec v
+                    JOIN chunks c ON c.rowid = v.rowid
+                    WHERE v.embedding MATCH ? AND k = ?
+                      {path_clause}
+                    ORDER BY v.distance
+                    """,
+                    params,
+                ).fetchall()[:top_k]
+            except Exception as exc:
+                log.warning(
+                    "vector_store.search failed (returning no results — "
+                    "this may indicate index corruption): %s",
+                    exc,
+                )
+                return []
+        return [self._row_to_chunk(row[:7], distance=row[7]) for row in rows]
+    async def delete_by_file(self, file_path: str) -> None:
+        """Delete everything stored for a single file (wraps ``delete_by_files``)."""
+        single = [file_path]
+        await self.delete_by_files(single)
+    async def delete_by_files(self, file_paths: list[str]) -> None:
+        """Remove all chunks, vectors and cached compressions for *file_paths*.
+        Paths are processed in batches produced by ``batched_params`` and the
+        whole operation is committed once at the end, so a re-index batch
+        costs one call instead of one awaited delete per file. An empty list
+        is a no-op.
+        """
+        if not file_paths:
+            return
+        from context_engine.utils import batched_params
+        with self._lock:
+            has_vec_table = self._dim is not None
+            for paths in batched_params(file_paths):
+                placeholders = ",".join("?" * len(paths))
+                # Dependent rows first: both subqueries read `chunks`, so the
+                # `chunks` rows themselves must be deleted last.
+                if has_vec_table:
+                    self._conn.execute(
+                        f"DELETE FROM chunks_vec "
+                        f"WHERE rowid IN (SELECT rowid FROM chunks WHERE file_path IN ({placeholders}))",
+                        paths,
+                    )
+                self._conn.execute(
+                    f"DELETE FROM chunk_compressions "
+                    f"WHERE chunk_id IN (SELECT id FROM chunks WHERE file_path IN ({placeholders}))",
+                    paths,
+                )
+                self._conn.execute(
+                    f"DELETE FROM chunks WHERE file_path IN ({placeholders})",
+                    paths,
+                )
+            self._conn.commit()
+    def get_cached_compression(self, chunk_id: str, level: str) -> str | None:
+        """Look up the cached compressed text for ``(chunk_id, level)``.
+        Returns ``None`` on a cache miss and also when the lookup itself
+        fails (the failure is logged at debug level).
+        """
+        query = (
+            "SELECT compressed FROM chunk_compressions "
+            "WHERE chunk_id = ? AND level = ?"
+        )
+        with self._lock:
+            try:
+                hit = self._conn.execute(query, (chunk_id, level)).fetchone()
+            except Exception as exc:
+                log.debug("get_cached_compression failed for %s/%s: %s", chunk_id, level, exc)
+                return None
+        if hit is None:
+            return None
+        return hit[0]
+    def put_cached_compression(self, chunk_id: str, level: str, compressed: str) -> None:
+        """Store (or overwrite) the compressed text for ``(chunk_id, level)``.
+        Caching is best effort: the caller already holds the value, so write
+        errors are only logged at debug level and never raised.
+        """
+        statement = (
+            "INSERT OR REPLACE INTO chunk_compressions "
+            "(chunk_id, level, compressed) VALUES (?, ?, ?)"
+        )
+        values = (chunk_id, level, compressed)
+        with self._lock:
+            try:
+                self._conn.execute(statement, values)
+                self._conn.commit()
+            except Exception as exc:
+                log.debug("put_cached_compression failed for %s/%s: %s", chunk_id, level, exc)
+    def count(self) -> int:
+        """Return the number of rows in ``chunks``; 0 if the query fails.
+        Failures are logged as warnings so a broken schema is not mistaken
+        for a merely empty index.
+        """
+        with self._lock:
+            try:
+                result = self._conn.execute("SELECT COUNT(*) FROM chunks").fetchone()
+            except Exception as exc:
+                log.warning("vector_store.count failed: %s", exc)
+                return 0
+            if not result:
+                return 0
+            return result[0]
+    def file_chunk_counts(self) -> dict[str, int]:
+        """Map each indexed ``file_path`` to its number of chunks.
+        Returns an empty dict (and logs a warning) if the query fails.
+        """
+        query = "SELECT file_path, COUNT(*) FROM chunks GROUP BY file_path"
+        with self._lock:
+            try:
+                per_file = self._conn.execute(query).fetchall()
+                return dict(per_file)
+            except Exception as exc:
+                log.warning("vector_store.file_chunk_counts failed: %s", exc)
+                return {}
+    def clear(self) -> None:
+        """Empty the whole store: chunk rows, cached compressions and vectors.
+        The vector table is dropped unconditionally (``IF EXISTS``), so an
+        empty table left over with an old dimension cannot survive a clear.
+        ``self._dim`` is reset only after the commit succeeds. Failures are
+        logged, the partial transaction is rolled back, and nothing is raised.
+        """
+        with self._lock:
+            try:
+                self._conn.execute("DELETE FROM chunks")
+                self._conn.execute("DELETE FROM chunk_compressions")
+                self._conn.execute("DROP TABLE IF EXISTS chunks_vec")
+                self._conn.commit()
+                self._dim = None
+            except Exception as exc:
+                log.warning("vector_store.clear failed: %s", exc)
+                try:
+                    # Undo the half-applied wipe so the index stays consistent.
+                    self._conn.rollback()
+                except Exception as rollback_exc:
+                    log.debug("vector_store.clear rollback failed: %s", rollback_exc)
+    async def get_by_id(self, chunk_id: str) -> Chunk | None:
+        """Fetch one chunk by its id.
+        Returns ``None`` when no such chunk exists or when the lookup fails
+        (the failure is logged as an error).
+        """
+        query = (
+            "SELECT id, content, chunk_type, file_path, start_line, end_line, language "
+            "FROM chunks WHERE id = ?"
+        )
+        with self._lock:
+            try:
+                found = self._conn.execute(query, (chunk_id,)).fetchone()
+            except Exception as exc:
+                log.error("get_by_id failed for %s: %s", chunk_id, exc)
+                return None
+        return self._row_to_chunk(found) if found else None
+    async def get_chunks_by_ids(self, chunk_ids: list[str]) -> list[Chunk]:
+        """Fetch the chunks whose ids appear in *chunk_ids*.
+        Ids are looked up in batches (via ``batched_params``, as in
+        ``delete_by_files``) so a long id list cannot exceed SQLite's
+        bound-parameter limit. Unknown ids are skipped; result order follows
+        the database, not *chunk_ids*.
+        Returns:
+            The matching chunks, or ``[]`` for empty input or when a query
+            fails (the failure is logged as an error).
+        """
+        if not chunk_ids:
+            return []
+        from context_engine.utils import batched_params
+        rows: list = []
+        with self._lock:
+            try:
+                for batch in batched_params(list(chunk_ids)):
+                    placeholders = ",".join("?" * len(batch))
+                    rows.extend(
+                        self._conn.execute(
+                            f"SELECT id, content, chunk_type, file_path, start_line, end_line, language "
+                            f"FROM chunks WHERE id IN ({placeholders})",
+                            batch,
+                        ).fetchall()
+                    )
+            except Exception as exc:
+                log.error("get_chunks_by_ids failed: %s", exc)
+                return []
+        return [self._row_to_chunk(r) for r in rows]

context_engine/utils.py ADDED Viewed

@@ -0,0 +1,72 @@
+"""Shared utilities for CCE."""
+import os
+import shutil
+import sys
+import tempfile
+from pathlib import Path
+from typing import Iterator, Sequence
+# SQLite's SQLITE_MAX_VARIABLE_NUMBER defaulted to 999 before 3.32.0 (32766
+# since); stay safely under the smaller limit.
+_SQL_PARAM_BATCH = 500
+def batched_params(items: Sequence, size: int = _SQL_PARAM_BATCH) -> Iterator[list]:
+    """Yield successive chunks of *items* for safe SQLite IN-clause usage.
+    Args:
+        items: Sequence to split; must support ``len()`` and slicing.
+        size: Maximum length of each yielded list; must be at least 1.
+    Yields:
+        Lists of at most *size* items, in the original order. Nothing is
+        yielded for an empty sequence.
+    Raises:
+        ValueError: If *size* is smaller than 1 (raised on first iteration).
+            Previously a negative size silently yielded nothing, which made
+            callers skip all of their work.
+    """
+    if size < 1:
+        raise ValueError(f"size must be >= 1, got {size}")
+    for start in range(0, len(items), size):
+        yield list(items[start : start + size])
+def atomic_write_text(path: Path, data: str) -> None:
+    """Write `data` to `path` via a tempfile + os.replace.
+    A plain `path.write_text(data)` truncates the target before writing, so a
+    crash mid-write leaves a zero-byte or partial file. The tempfile-then-
+    rename pattern keeps the existing file intact until the new one is fully
+    on disk; the rename is atomic on POSIX.
+    Creates the parent directory if it doesn't exist (or was deleted by a
+    concurrent process between an earlier mkdir and this call).
+    Args:
+        path: Destination file; a ``Path`` or anything ``Path()`` accepts.
+        data: Text to write, encoded as UTF-8.
+    Raises:
+        OSError: If the temp file cannot be created, written or renamed. The
+            temp file is removed before the error propagates.
+    """
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # The temp file lives next to the target so os.replace stays on one filesystem.
+    fd, tmp_name = tempfile.mkstemp(
+        prefix=f".{path.name}.", suffix=".tmp", dir=str(path.parent)
+    )
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as fh:
+            fh.write(data)
+            fh.flush()
+            # Force the bytes to disk before the rename makes them visible.
+            os.fsync(fh.fileno())
+        os.replace(tmp_name, path)
+    except BaseException:
+        # Best-effort cleanup if anything went wrong before the rename;
+        # BaseException so Ctrl-C / SystemExit don't leave temp files behind.
+        try:
+            os.unlink(tmp_name)
+        except OSError:
+            pass
+        raise
+def resolve_cce_binary() -> str:
+    """Locate the installed ``cce`` executable.
+    Lookup order: well-known install locations (user-local, Homebrew on
+    Apple Silicon and Intel, MacPorts), then ``PATH``, then ``sys.argv[0]``
+    when it is named like the CLI, and finally the bare name ``"cce"``.
+    """
+    well_known = (
+        Path.home() / ".local" / "bin" / "cce",   # pipx / uv tool default (Linux + macOS)
+        Path("/opt/homebrew/bin/cce"),            # macOS Homebrew on Apple Silicon
+        Path("/usr/local/bin/cce"),               # macOS Homebrew on Intel + Linux /usr/local
+        Path("/opt/local/bin/cce"),               # MacPorts
+    )
+    installed = next(
+        (str(p) for p in well_known if p.is_file() and os.access(p, os.X_OK)),
+        None,
+    )
+    if installed is not None:
+        return installed
+    on_path = shutil.which("cce")
+    if on_path:
+        return on_path
+    invoked = Path(sys.argv[0]).resolve()
+    return str(invoked) if invoked.name in ("cce", "code-context-engine") else "cce"