code-context-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. code_context_engine-0.4.0.dist-info/METADATA +389 -0
  2. code_context_engine-0.4.0.dist-info/RECORD +63 -0
  3. code_context_engine-0.4.0.dist-info/WHEEL +5 -0
  4. code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
  5. code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
  7. context_engine/__init__.py +3 -0
  8. context_engine/cli.py +2848 -0
  9. context_engine/cli_style.py +66 -0
  10. context_engine/compression/__init__.py +0 -0
  11. context_engine/compression/compressor.py +144 -0
  12. context_engine/compression/ollama_client.py +33 -0
  13. context_engine/compression/output_rules.py +77 -0
  14. context_engine/compression/prompts.py +9 -0
  15. context_engine/compression/quality.py +37 -0
  16. context_engine/config.py +198 -0
  17. context_engine/dashboard/__init__.py +0 -0
  18. context_engine/dashboard/_page.py +1548 -0
  19. context_engine/dashboard/server.py +429 -0
  20. context_engine/editors.py +265 -0
  21. context_engine/event_bus.py +24 -0
  22. context_engine/indexer/__init__.py +0 -0
  23. context_engine/indexer/chunker.py +147 -0
  24. context_engine/indexer/embedder.py +154 -0
  25. context_engine/indexer/embedding_cache.py +168 -0
  26. context_engine/indexer/git_hooks.py +73 -0
  27. context_engine/indexer/git_indexer.py +136 -0
  28. context_engine/indexer/ignorefile.py +96 -0
  29. context_engine/indexer/manifest.py +78 -0
  30. context_engine/indexer/pipeline.py +624 -0
  31. context_engine/indexer/secrets.py +332 -0
  32. context_engine/indexer/watcher.py +109 -0
  33. context_engine/integration/__init__.py +0 -0
  34. context_engine/integration/bootstrap.py +76 -0
  35. context_engine/integration/git_context.py +132 -0
  36. context_engine/integration/mcp_server.py +1825 -0
  37. context_engine/integration/session_capture.py +306 -0
  38. context_engine/memory/__init__.py +6 -0
  39. context_engine/memory/compressor.py +344 -0
  40. context_engine/memory/db.py +922 -0
  41. context_engine/memory/extractive.py +106 -0
  42. context_engine/memory/grammar.py +419 -0
  43. context_engine/memory/hook_installer.py +258 -0
  44. context_engine/memory/hook_server.py +83 -0
  45. context_engine/memory/hooks.py +327 -0
  46. context_engine/memory/migrate.py +268 -0
  47. context_engine/models.py +96 -0
  48. context_engine/pricing.py +104 -0
  49. context_engine/project_commands.py +296 -0
  50. context_engine/retrieval/__init__.py +0 -0
  51. context_engine/retrieval/confidence.py +47 -0
  52. context_engine/retrieval/query_parser.py +105 -0
  53. context_engine/retrieval/retriever.py +199 -0
  54. context_engine/serve_http.py +208 -0
  55. context_engine/services.py +252 -0
  56. context_engine/storage/__init__.py +0 -0
  57. context_engine/storage/backend.py +39 -0
  58. context_engine/storage/fts_store.py +112 -0
  59. context_engine/storage/graph_store.py +219 -0
  60. context_engine/storage/local_backend.py +109 -0
  61. context_engine/storage/remote_backend.py +117 -0
  62. context_engine/storage/vector_store.py +357 -0
  63. context_engine/utils.py +72 -0
@@ -0,0 +1,357 @@
1
+ """SQLite-vec backed vector store for chunk embeddings.
2
+
3
+ Replaces LanceDB (217MB) with sqlite-vec (~2MB). Same API, same search
4
+ quality. Uses cosine distance for similarity ranking.
5
+
6
+ Schema:
7
+ chunks — regular table storing chunk metadata + content
8
+ chunks_vec — vec0 virtual table storing embeddings for vector search
9
+ """
10
+ import logging
11
+ import os
12
+ import sqlite3
13
+ import struct
14
+ from threading import RLock
15
+
16
+ from context_engine.models import Chunk, ChunkType
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+ _MAX_CONTENT_CHARS = 5_000
21
+
22
+
23
+ def _to_list(embedding) -> list[float]:
24
+ """Ensure embedding is a plain list."""
25
+ if isinstance(embedding, list):
26
+ return embedding
27
+ return list(embedding)
28
+
29
+
30
+ def _serialize_vec(vec) -> bytes:
31
+ """Pack a float vector into bytes for sqlite-vec."""
32
+ v = _to_list(vec)
33
+ return struct.pack(f"{len(v)}f", *v)
34
+
35
+
36
+ class VectorStore:
37
+ def __init__(self, db_path: str) -> None:
38
+ self._db_path = db_path
39
+ self._lock = RLock()
40
+ self._dim: int | None = None
41
+ os.makedirs(db_path, exist_ok=True)
42
+ self._db_file = os.path.join(db_path, "vectors.db")
43
+ self._conn = self._connect()
44
+ self._ensure_tables()
45
+
46
+ def _connect(self) -> sqlite3.Connection:
47
+ import sqlite_vec
48
+ conn = sqlite3.connect(self._db_file, check_same_thread=False)
49
+ conn.enable_load_extension(True)
50
+ sqlite_vec.load(conn)
51
+ conn.enable_load_extension(False)
52
+ conn.execute("PRAGMA journal_mode=WAL")
53
+ conn.execute("PRAGMA synchronous=NORMAL")
54
+ return conn
55
+
56
+ def _ensure_tables(self) -> None:
57
+ with self._lock:
58
+ self._conn.execute("""
59
+ CREATE TABLE IF NOT EXISTS chunks (
60
+ id TEXT PRIMARY KEY,
61
+ content TEXT NOT NULL,
62
+ chunk_type TEXT NOT NULL,
63
+ file_path TEXT NOT NULL,
64
+ start_line INTEGER NOT NULL,
65
+ end_line INTEGER NOT NULL,
66
+ language TEXT NOT NULL
67
+ )
68
+ """)
69
+ self._conn.execute("""
70
+ CREATE INDEX IF NOT EXISTS idx_chunks_file_path
71
+ ON chunks(file_path)
72
+ """)
73
+ # Cache of LLM-summarised / truncated chunk text. Keyed by
74
+ # (chunk_id, level) because different compression levels produce
75
+ # different output. Cleared automatically when the chunk is
76
+ # re-ingested (delete-by-file via FK-like DELETE in delete_by_file).
77
+ self._conn.execute("""
78
+ CREATE TABLE IF NOT EXISTS chunk_compressions (
79
+ chunk_id TEXT NOT NULL,
80
+ level TEXT NOT NULL,
81
+ compressed TEXT NOT NULL,
82
+ PRIMARY KEY (chunk_id, level)
83
+ )
84
+ """)
85
+ # Detect vector dimension from existing data
86
+ row = self._conn.execute(
87
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_vec'"
88
+ ).fetchone()
89
+ if row:
90
+ # Table exists — read dim from first row
91
+ r = self._conn.execute("SELECT rowid FROM chunks_vec LIMIT 1").fetchone()
92
+ if r:
93
+ self._dim = self._conn.execute(
94
+ "SELECT vec_length(embedding) FROM chunks_vec LIMIT 1"
95
+ ).fetchone()[0]
96
+ self._conn.commit()
97
+
98
+ def _ensure_vec_table(self, dim: int) -> None:
99
+ if self._dim == dim:
100
+ return
101
+ with self._lock:
102
+ if self._dim is not None and self._dim != dim:
103
+ log.warning(
104
+ "Embedding dimension changed (%d -> %d), rebuilding vector table",
105
+ self._dim, dim,
106
+ )
107
+ # Wipe both halves of the index. Keeping `chunks` while
108
+ # dropping `chunks_vec` would leave previously-indexed rows
109
+ # counted by count_chunks() / file_chunk_counts() but with no
110
+ # vector to retrieve, so search would silently miss them.
111
+ # chunk_compressions is keyed by chunk_id, so flush it too.
112
+ self._conn.execute("DROP TABLE IF EXISTS chunks_vec")
113
+ self._conn.execute("DELETE FROM chunks")
114
+ self._conn.execute("DELETE FROM chunk_compressions")
115
+ self._conn.execute(f"""
116
+ CREATE VIRTUAL TABLE IF NOT EXISTS chunks_vec
117
+ USING vec0(embedding float[{dim}])
118
+ """)
119
+ self._dim = dim
120
+ self._conn.commit()
121
+
122
+ def _chunk_to_row(self, chunk: Chunk) -> tuple:
123
+ content = chunk.content
124
+ if len(content) > _MAX_CONTENT_CHARS:
125
+ content = content[:_MAX_CONTENT_CHARS] + "\n...[truncated]"
126
+ return (
127
+ chunk.id, content, chunk.chunk_type.value,
128
+ chunk.file_path, chunk.start_line, chunk.end_line,
129
+ chunk.language,
130
+ )
131
+
132
+ def _row_to_chunk(self, row, distance: float | None = None) -> Chunk:
133
+ chunk = Chunk(
134
+ id=row[0],
135
+ content=row[1],
136
+ chunk_type=ChunkType(row[2]),
137
+ file_path=row[3],
138
+ start_line=row[4],
139
+ end_line=row[5],
140
+ language=row[6],
141
+ )
142
+ if distance is not None:
143
+ chunk.metadata["_distance"] = distance
144
+ return chunk
145
+
146
+ async def ingest(self, chunks: list[Chunk]) -> None:
147
+ if not chunks:
148
+ return
149
+ valid = [c for c in chunks if c.embedding]
150
+ if not valid:
151
+ log.warning("ingest called but no chunks have embeddings")
152
+ return
153
+ dim = len(valid[0].embedding)
154
+ self._ensure_vec_table(dim)
155
+ with self._lock:
156
+ cursor = self._conn.cursor()
157
+ for chunk in valid:
158
+ row = self._chunk_to_row(chunk)
159
+ rowid = cursor.execute(
160
+ "INSERT INTO chunks "
161
+ "(id, content, chunk_type, file_path, start_line, end_line, language) "
162
+ "VALUES (?, ?, ?, ?, ?, ?, ?) "
163
+ "ON CONFLICT(id) DO UPDATE SET "
164
+ "content = excluded.content, "
165
+ "chunk_type = excluded.chunk_type, "
166
+ "file_path = excluded.file_path, "
167
+ "start_line = excluded.start_line, "
168
+ "end_line = excluded.end_line, "
169
+ "language = excluded.language "
170
+ "RETURNING rowid",
171
+ row,
172
+ ).fetchone()[0]
173
+ cursor.execute("DELETE FROM chunks_vec WHERE rowid = ?", (rowid,))
174
+ cursor.execute(
175
+ "INSERT INTO chunks_vec(rowid, embedding) VALUES (?, ?)",
176
+ (rowid, _serialize_vec(chunk.embedding)),
177
+ )
178
+ self._conn.commit()
179
+
180
+ async def search(
181
+ self,
182
+ query_embedding,
183
+ top_k: int = 10,
184
+ filters: dict | None = None,
185
+ ) -> list[Chunk]:
186
+ embedding_list = _to_list(query_embedding)
187
+ with self._lock:
188
+ if self._dim is None:
189
+ return []
190
+ try:
191
+ query_bytes = _serialize_vec(embedding_list)
192
+ # Vector search via sqlite-vec
193
+ # sqlite-vec requires k=? in WHERE, not LIMIT
194
+ unsupported = set(filters or {}) - {"file_path"}
195
+ if unsupported:
196
+ log.warning("Unsupported filter keys ignored: %s", unsupported)
197
+ if filters and "file_path" in filters:
198
+ fp = filters["file_path"]
199
+ # First get matching rowids from vec search, then filter
200
+ rows = self._conn.execute(
201
+ """
202
+ SELECT c.id, c.content, c.chunk_type, c.file_path,
203
+ c.start_line, c.end_line, c.language, v.distance
204
+ FROM chunks_vec v
205
+ JOIN chunks c ON c.rowid = v.rowid
206
+ WHERE v.embedding MATCH ? AND k = ?
207
+ AND c.file_path = ?
208
+ ORDER BY v.distance
209
+ """,
210
+ (query_bytes, top_k * 3, fp),
211
+ ).fetchall()[:top_k]
212
+ else:
213
+ rows = self._conn.execute(
214
+ """
215
+ SELECT c.id, c.content, c.chunk_type, c.file_path,
216
+ c.start_line, c.end_line, c.language, v.distance
217
+ FROM chunks_vec v
218
+ JOIN chunks c ON c.rowid = v.rowid
219
+ WHERE v.embedding MATCH ? AND k = ?
220
+ ORDER BY v.distance
221
+ """,
222
+ (query_bytes, top_k),
223
+ ).fetchall()
224
+ except Exception as exc:
225
+ log.warning(
226
+ "vector_store.search failed (returning no results — "
227
+ "this may indicate index corruption): %s",
228
+ exc,
229
+ )
230
+ return []
231
+ return [self._row_to_chunk(row[:7], distance=row[7]) for row in rows]
232
+
233
+ async def delete_by_file(self, file_path: str) -> None:
234
+ await self.delete_by_files([file_path])
235
+
236
+ async def delete_by_files(self, file_paths: list[str]) -> None:
237
+ """Batched delete. Pipeline calls this once per re-index batch instead
238
+ of awaiting per-file deletes serially, which previously bottlenecked
239
+ the indexing loop on small SQLite roundtrips."""
240
+ if not file_paths:
241
+ return
242
+ from context_engine.utils import batched_params
243
+
244
+ with self._lock:
245
+ for batch in batched_params(file_paths):
246
+ placeholders = ",".join("?" * len(batch))
247
+ if self._dim is not None:
248
+ self._conn.execute(
249
+ f"DELETE FROM chunks_vec "
250
+ f"WHERE rowid IN (SELECT rowid FROM chunks WHERE file_path IN ({placeholders}))",
251
+ batch,
252
+ )
253
+ self._conn.execute(
254
+ f"DELETE FROM chunk_compressions "
255
+ f"WHERE chunk_id IN (SELECT id FROM chunks WHERE file_path IN ({placeholders}))",
256
+ batch,
257
+ )
258
+ self._conn.execute(
259
+ f"DELETE FROM chunks WHERE file_path IN ({placeholders})",
260
+ batch,
261
+ )
262
+ self._conn.commit()
263
+
264
+ def get_cached_compression(self, chunk_id: str, level: str) -> str | None:
265
+ """Return the cached compressed text for (chunk_id, level), or None."""
266
+ with self._lock:
267
+ try:
268
+ row = self._conn.execute(
269
+ "SELECT compressed FROM chunk_compressions "
270
+ "WHERE chunk_id = ? AND level = ?",
271
+ (chunk_id, level),
272
+ ).fetchone()
273
+ except Exception as exc:
274
+ log.debug("get_cached_compression failed for %s/%s: %s", chunk_id, level, exc)
275
+ return None
276
+ return row[0] if row else None
277
+
278
+ def put_cached_compression(self, chunk_id: str, level: str, compressed: str) -> None:
279
+ """Persist a compression result so the same chunk isn't recompressed
280
+ on every retrieval. Silently ignores write errors — caching is best
281
+ effort, the caller already has the value to return to the user."""
282
+ with self._lock:
283
+ try:
284
+ self._conn.execute(
285
+ "INSERT OR REPLACE INTO chunk_compressions "
286
+ "(chunk_id, level, compressed) VALUES (?, ?, ?)",
287
+ (chunk_id, level, compressed),
288
+ )
289
+ self._conn.commit()
290
+ except Exception as exc:
291
+ log.debug("put_cached_compression failed for %s/%s: %s", chunk_id, level, exc)
292
+
293
+ def count(self) -> int:
294
+ with self._lock:
295
+ try:
296
+ row = self._conn.execute("SELECT COUNT(*) FROM chunks").fetchone()
297
+ return row[0] if row else 0
298
+ except Exception as exc:
299
+ # Log so users see "the index is broken" instead of "search
300
+ # returns nothing"; bare-except-and-zero made schema corruption
301
+ # indistinguishable from an empty index.
302
+ log.warning("vector_store.count failed: %s", exc)
303
+ return 0
304
+
305
+ def file_chunk_counts(self) -> dict[str, int]:
306
+ with self._lock:
307
+ try:
308
+ rows = self._conn.execute(
309
+ "SELECT file_path, COUNT(*) FROM chunks GROUP BY file_path"
310
+ ).fetchall()
311
+ return {fp: count for fp, count in rows}
312
+ except Exception as exc:
313
+ log.warning("vector_store.file_chunk_counts failed: %s", exc)
314
+ return {}
315
+
316
+ def clear(self) -> None:
317
+ with self._lock:
318
+ try:
319
+ self._conn.execute("DELETE FROM chunks")
320
+ self._conn.execute("DELETE FROM chunk_compressions")
321
+ if self._dim is not None:
322
+ self._conn.execute("DROP TABLE IF EXISTS chunks_vec")
323
+ self._dim = None
324
+ self._conn.commit()
325
+ except Exception as exc:
326
+ log.warning("vector_store.clear failed: %s", exc)
327
+
328
+ async def get_by_id(self, chunk_id: str) -> Chunk | None:
329
+ with self._lock:
330
+ try:
331
+ row = self._conn.execute(
332
+ "SELECT id, content, chunk_type, file_path, start_line, end_line, language "
333
+ "FROM chunks WHERE id = ?",
334
+ (chunk_id,),
335
+ ).fetchone()
336
+ except Exception as exc:
337
+ log.error("get_by_id failed for %s: %s", chunk_id, exc)
338
+ return None
339
+ if not row:
340
+ return None
341
+ return self._row_to_chunk(row)
342
+
343
+ async def get_chunks_by_ids(self, chunk_ids: list[str]) -> list[Chunk]:
344
+ if not chunk_ids:
345
+ return []
346
+ with self._lock:
347
+ try:
348
+ placeholders = ",".join("?" for _ in chunk_ids)
349
+ rows = self._conn.execute(
350
+ f"SELECT id, content, chunk_type, file_path, start_line, end_line, language "
351
+ f"FROM chunks WHERE id IN ({placeholders})",
352
+ chunk_ids,
353
+ ).fetchall()
354
+ except Exception as exc:
355
+ log.error("get_chunks_by_ids failed: %s", exc)
356
+ return []
357
+ return [self._row_to_chunk(r) for r in rows]
@@ -0,0 +1,72 @@
1
+ """Shared utilities for CCE."""
2
+ import os
3
+ import shutil
4
+ import sys
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Iterator, Sequence
8
+
9
# SQLite's SQLITE_MAX_VARIABLE_NUMBER defaults to 999 before 3.32.0 (and to
# 32766 from 3.32.0 on); stay safely under the older limit.
_SQL_PARAM_BATCH = 500


def batched_params(items: Sequence, size: int = _SQL_PARAM_BATCH) -> Iterator[list]:
    """Yield successive chunks of *items* for safe SQLite IN-clause usage.

    Each chunk is a new ``list`` of at most *size* elements, in the original
    order; an empty *items* yields nothing. *items* must support ``len()``
    and slicing.

    Raises:
        ValueError: if *size* is not positive. A negative size used to yield
            nothing at all, which would make callers silently skip their SQL
            work. As this is a generator, the error surfaces when iteration
            starts.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    for start in range(0, len(items), size):
        yield list(items[start : start + size])
17
+
18
+
19
def atomic_write_text(path: Path, data: str) -> None:
    """Write `data` to `path` (UTF-8) via a tempfile + os.replace.

    A plain `path.write_text(data)` truncates the target before writing, so a
    crash mid-write leaves a zero-byte or partial file. The tempfile-then-
    rename pattern keeps the existing file intact until the new one is fully
    on disk (flushed and fsynced); the rename is atomic on POSIX.

    Creates the parent directory if it doesn't exist (or was deleted by a
    concurrent process between an earlier mkdir and this call).

    Note: the target is replaced by the temp file, so it ends up with the
    permissions `tempfile.mkstemp` gave that file rather than those of the
    file it replaces.

    Raises:
        Whatever the write or rename raised; the temp file is removed first
        and the original target is left untouched.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so os.replace never crosses filesystems.
    fd, tmp_name = tempfile.mkstemp(
        prefix=f".{path.name}.", suffix=".tmp", dir=str(path.parent)
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            fh.write(data)
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp_name, path)
    except BaseException:
        # BaseException, not Exception: an interrupt (KeyboardInterrupt /
        # SystemExit) mid-write must not leave a stray temp file either.
        try:
            os.unlink(tmp_name)
        except OSError:
            pass
        raise
48
+
49
+
50
def resolve_cce_binary() -> str:
    """Return the path of the cce executable to invoke.

    Lookup order, first hit wins:

    1. well-known install locations (user-local, Homebrew on Apple Silicon,
       /usr/local, MacPorts) that hold an executable regular file;
    2. whatever ``cce`` resolves to on PATH;
    3. the resolved ``sys.argv[0]`` when it is named ``cce`` or
       ``code-context-engine``;
    4. the bare command name ``"cce"``.
    """
    well_known = [
        Path.home() / ".local" / "bin" / "cce",  # pipx / uv tool default (Linux + macOS)
        Path("/opt/homebrew/bin/cce"),           # macOS Homebrew on Apple Silicon
        Path("/usr/local/bin/cce"),              # macOS Homebrew on Intel + Linux /usr/local
        Path("/opt/local/bin/cce"),              # MacPorts
    ]
    installed = next(
        (
            str(location)
            for location in well_known
            if location.is_file() and os.access(location, os.X_OK)
        ),
        None,
    )
    if installed is not None:
        return installed

    if on_path := shutil.which("cce"):
        return on_path

    invoked_as = Path(sys.argv[0]).resolve()
    return str(invoked_as) if invoked_as.name in ("cce", "code-context-engine") else "cce"