code-context-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. code_context_engine-0.4.0.dist-info/METADATA +389 -0
  2. code_context_engine-0.4.0.dist-info/RECORD +63 -0
  3. code_context_engine-0.4.0.dist-info/WHEEL +5 -0
  4. code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
  5. code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
  7. context_engine/__init__.py +3 -0
  8. context_engine/cli.py +2848 -0
  9. context_engine/cli_style.py +66 -0
  10. context_engine/compression/__init__.py +0 -0
  11. context_engine/compression/compressor.py +144 -0
  12. context_engine/compression/ollama_client.py +33 -0
  13. context_engine/compression/output_rules.py +77 -0
  14. context_engine/compression/prompts.py +9 -0
  15. context_engine/compression/quality.py +37 -0
  16. context_engine/config.py +198 -0
  17. context_engine/dashboard/__init__.py +0 -0
  18. context_engine/dashboard/_page.py +1548 -0
  19. context_engine/dashboard/server.py +429 -0
  20. context_engine/editors.py +265 -0
  21. context_engine/event_bus.py +24 -0
  22. context_engine/indexer/__init__.py +0 -0
  23. context_engine/indexer/chunker.py +147 -0
  24. context_engine/indexer/embedder.py +154 -0
  25. context_engine/indexer/embedding_cache.py +168 -0
  26. context_engine/indexer/git_hooks.py +73 -0
  27. context_engine/indexer/git_indexer.py +136 -0
  28. context_engine/indexer/ignorefile.py +96 -0
  29. context_engine/indexer/manifest.py +78 -0
  30. context_engine/indexer/pipeline.py +624 -0
  31. context_engine/indexer/secrets.py +332 -0
  32. context_engine/indexer/watcher.py +109 -0
  33. context_engine/integration/__init__.py +0 -0
  34. context_engine/integration/bootstrap.py +76 -0
  35. context_engine/integration/git_context.py +132 -0
  36. context_engine/integration/mcp_server.py +1825 -0
  37. context_engine/integration/session_capture.py +306 -0
  38. context_engine/memory/__init__.py +6 -0
  39. context_engine/memory/compressor.py +344 -0
  40. context_engine/memory/db.py +922 -0
  41. context_engine/memory/extractive.py +106 -0
  42. context_engine/memory/grammar.py +419 -0
  43. context_engine/memory/hook_installer.py +258 -0
  44. context_engine/memory/hook_server.py +83 -0
  45. context_engine/memory/hooks.py +327 -0
  46. context_engine/memory/migrate.py +268 -0
  47. context_engine/models.py +96 -0
  48. context_engine/pricing.py +104 -0
  49. context_engine/project_commands.py +296 -0
  50. context_engine/retrieval/__init__.py +0 -0
  51. context_engine/retrieval/confidence.py +47 -0
  52. context_engine/retrieval/query_parser.py +105 -0
  53. context_engine/retrieval/retriever.py +199 -0
  54. context_engine/serve_http.py +208 -0
  55. context_engine/services.py +252 -0
  56. context_engine/storage/__init__.py +0 -0
  57. context_engine/storage/backend.py +39 -0
  58. context_engine/storage/fts_store.py +112 -0
  59. context_engine/storage/graph_store.py +219 -0
  60. context_engine/storage/local_backend.py +109 -0
  61. context_engine/storage/remote_backend.py +117 -0
  62. context_engine/storage/vector_store.py +357 -0
  63. context_engine/utils.py +72 -0
@@ -0,0 +1,24 @@
1
+ """Simple async event bus for inter-module communication."""
2
+ import asyncio
3
+ from collections import defaultdict
4
+ from typing import Any, Callable, Coroutine
5
+
6
+
7
# Shape every subscriber must have: an async callable that receives the event
# payload and returns nothing.
Handler = Callable[[Any], Coroutine[Any, Any, None]]


class EventBus:
    """In-process publish/subscribe bus for async handlers.

    Handlers are stored per event name and awaited one after another, in
    subscription order, when that event is emitted. An exception raised by a
    handler propagates out of ``emit`` and the remaining handlers are not run.
    """

    def __init__(self) -> None:
        # defaultdict lets subscribe() append without checking for the key.
        self._handlers: dict[str, list[Handler]] = defaultdict(list)

    def subscribe(self, event: str, handler: Handler) -> None:
        """Register *handler* to be awaited whenever *event* is emitted."""
        self._handlers[event].append(handler)

    def unsubscribe(self, event: str, handler: Handler) -> None:
        """Remove one registration of *handler* for *event*; no-op if absent."""
        # .get() avoids creating an empty list for events nobody subscribed to.
        handlers = self._handlers.get(event, [])
        if handler in handlers:
            handlers.remove(handler)

    async def emit(self, event: str, data: Any = None) -> None:
        """Await every handler subscribed to *event*, passing it *data*.

        Dispatch runs over a snapshot of the handler list, so a handler may
        subscribe or unsubscribe (itself or others) while the event is being
        delivered without causing other handlers to be skipped. Handlers added
        during dispatch are first called on the next emit.
        """
        for handler in list(self._handlers.get(event, ())):
            await handler(data)
File without changes
@@ -0,0 +1,147 @@
1
+ """AST-aware code chunking using tree-sitter."""
2
+ import hashlib
3
+
4
+ import tree_sitter_python as tspython
5
+ import tree_sitter_javascript as tsjavascript
6
+ import tree_sitter_typescript as tstypescript
7
+ import tree_sitter_php as tsphp
8
+ import tree_sitter_go as tsgo
9
+ import tree_sitter_rust as tsrust
10
+ import tree_sitter_java as tsjava
11
+ from tree_sitter import Language, Parser
12
+
13
+ from context_engine.models import Chunk, ChunkType
14
+
15
# tree-sitter node types that are emitted as FUNCTION chunks.
_FUNCTION_TYPES = {
    # Python, PHP, JS
    "function_definition",
    "function_declaration",
    # JS/TS, PHP/Go/Java
    "method_definition",
    "method_declaration",
    # JS/TS
    "arrow_function",
    # Rust
    "function_item",
}

# tree-sitter node types that are emitted as CLASS chunks.
_CLASS_TYPES = {
    # Python, JS/TS, PHP, Java
    "class_definition",
    "class_declaration",
    # Go (struct/interface)
    "type_declaration",
    # Rust
    "struct_item",
    "impl_item",
    "enum_item",
}

# tree-sitter node types inspected when collecting imported modules.
_IMPORT_TYPES = {
    # Python
    "import_statement",
    "import_from_statement",
    # TypeScript, Go, Java
    "import_declaration",
    # PHP, Rust
    "use_declaration",
}

# Language name -> loaded tree-sitter grammar, built once at import time.
_LANGUAGES = {
    name: Language(grammar)
    for name, grammar in (
        ("python", tspython.language()),
        ("javascript", tsjavascript.language()),
        ("typescript", tstypescript.language_typescript()),
        ("tsx", tstypescript.language_tsx()),
        ("php", tsphp.language_php()),
        ("go", tsgo.language()),
        ("rust", tsrust.language()),
        ("java", tsjava.language()),
    )
}
42
+
43
+
44
class Chunker:
    """Split source text into function/class chunks using tree-sitter.

    tree-sitter is fed the UTF-8 encoded source and reports node positions as
    byte offsets into that buffer. Node text is therefore always sliced from
    the encoded bytes and decoded; slicing the ``str`` with those offsets
    returns shifted text as soon as the file contains a multi-byte character.
    """

    def __init__(self) -> None:
        # One parser per language, created on first use and then reused.
        self._parsers: dict[str, Parser] = {}

    def _get_parser(self, language: str) -> Parser | None:
        """Return the cached parser for *language*, or None if unsupported."""
        if language not in _LANGUAGES:
            return None
        if language not in self._parsers:
            self._parsers[language] = Parser(_LANGUAGES[language])
        return self._parsers[language]

    @staticmethod
    def _node_text(node, source) -> str:
        """Return the text covered by *node*.

        *source* may be the already-encoded UTF-8 bytes (the internal fast
        path) or the original ``str`` (kept for callers that still pass text),
        in which case it is encoded first so byte offsets line up.
        """
        data = source if isinstance(source, bytes) else source.encode("utf-8")
        return data[node.start_byte:node.end_byte].decode("utf-8", errors="replace")

    @staticmethod
    def _package_from_specifier(raw: str) -> str:
        """Reduce a JS/TS module specifier to its first path segment.

        Specifiers starting with ``@`` keep their first two segments
        (``@scope/name``); everything else keeps only the first one.
        """
        parts = raw.split("/")
        if raw.startswith("@"):
            return "/".join(parts[:2])
        return parts[0]

    def chunk(self, source: str, file_path: str, language: str) -> list[Chunk]:
        """Return the function/class chunks found in *source*.

        Falls back to a single MODULE chunk covering the whole file when the
        language has no parser or no function/class node is found.
        """
        parser = self._get_parser(language)
        if parser is None:
            return [self._fallback_chunk(source, file_path, language)]
        # Encode once; the same buffer is parsed and later sliced per node.
        encoded = source.encode("utf-8")
        tree = parser.parse(encoded)
        chunks: list[Chunk] = []
        self._walk(tree.root_node, encoded, file_path, language, chunks)
        if not chunks:
            return [self._fallback_chunk(source, file_path, language)]
        return chunks

    def _walk(self, node, source, file_path, language, chunks):
        """Depth-first walk appending a chunk for each function/class node.

        Recursion continues into matched nodes, so nested definitions (e.g.
        methods inside a class) produce their own chunks as well.
        """
        if node.type in _FUNCTION_TYPES:
            chunks.append(self._node_to_chunk(node, source, file_path, language, ChunkType.FUNCTION))
        elif node.type in _CLASS_TYPES:
            chunks.append(self._node_to_chunk(node, source, file_path, language, ChunkType.CLASS))
        for child in node.children:
            self._walk(child, source, file_path, language, chunks)

    def _node_to_chunk(self, node, source, file_path, language, chunk_type):
        """Build a Chunk for *node* with 1-based line numbers and a stable id."""
        content = self._node_text(node, source)
        # tree-sitter rows are 0-based.
        start_line = node.start_point.row + 1
        end_line = node.end_point.row + 1
        # Id is derived from location plus the first 100 characters of content.
        chunk_id = hashlib.sha256(
            f"{file_path}:{start_line}:{end_line}:{content[:100]}".encode()
        ).hexdigest()[:16]
        return Chunk(
            id=chunk_id, content=content, chunk_type=chunk_type,
            file_path=file_path, start_line=start_line, end_line=end_line, language=language,
        )

    def chunk_with_imports(
        self, source: str, file_path: str, language: str
    ) -> tuple[list[Chunk], list[str]]:
        """Return ``(chunks, imported_module_names)`` for *source*."""
        chunks = self.chunk(source, file_path, language)
        imports = self._extract_imports(source, language)
        return chunks, imports

    def _extract_imports(self, source: str, language: str) -> list[str]:
        """Return the de-duplicated module names imported by *source*."""
        parser = self._get_parser(language)
        if parser is None:
            return []
        encoded = source.encode("utf-8")
        tree = parser.parse(encoded)
        imports: list[str] = []
        self._walk_imports(tree.root_node, encoded, language, imports)
        return list(dict.fromkeys(imports))  # deduplicate while preserving order

    def _walk_imports(self, node, source, language, imports):
        """Depth-first walk collecting a module name from every import node."""
        if node.type in _IMPORT_TYPES:
            module = self._parse_import_module(node, source, language)
            if module:
                imports.append(module)
        for child in node.children:
            self._walk_imports(child, source, language, imports)

    def _parse_import_module(self, node, source, language) -> str | None:
        """Extract the top-level module/package name from an import node.

        Returns None for import node types that are not handled here.
        """
        if node.type == "import_statement":
            # Python: "import os" or "import os.path"
            # Also handles JS/TS: "import React from 'react'" (string child present)
            for child in node.children:
                if child.type == "string":
                    # JavaScript/TypeScript import with string module specifier
                    raw = self._node_text(child, source).strip("'\"")
                    return self._package_from_specifier(raw)
                if child.type in ("dotted_name", "aliased_import"):
                    # Python bare import; drop any "as alias" suffix.
                    name = self._node_text(child, source)
                    name = name.split(" as ")[0].strip()
                    return name.split(".")[0]
        elif node.type == "import_from_statement":
            # Python: "from pathlib import Path"
            for child in node.children:
                if child.type in ("dotted_name", "relative_import"):
                    name = self._node_text(child, source).strip()
                    # Leading dots mark a relative import; a bare "." leaves
                    # nothing, so the next child is tried.
                    name = name.lstrip(".")
                    if name:
                        return name.split(".")[0]
        elif node.type == "import_declaration":
            # TypeScript (tree-sitter-typescript): "import React from 'react'"
            for child in node.children:
                if child.type == "string":
                    raw = self._node_text(child, source).strip("'\"")
                    return self._package_from_specifier(raw)
        return None

    def _fallback_chunk(self, source, file_path, language):
        """Return one MODULE chunk spanning the whole file."""
        chunk_id = hashlib.sha256(f"{file_path}:module".encode()).hexdigest()[:16]
        # Line count ignores leading/trailing blank space; content keeps it.
        lines = source.strip().split("\n")
        return Chunk(
            id=chunk_id, content=source, chunk_type=ChunkType.MODULE,
            file_path=file_path, start_line=1, end_line=len(lines), language=language,
        )
@@ -0,0 +1,154 @@
1
+ """Embedding generation using fastembed (lightweight ONNX-based embeddings).
2
+
3
+ Uses BAAI/bge-small-en-v1.5 by default — 33% smaller and better quality
4
+ than all-MiniLM-L6-v2. Parallel embedding for 3-4x faster indexing.
5
+
6
+ Supports an optional EmbeddingCache so unchanged code chunks are never
7
+ re-embedded across index runs (inspired by Cursor's content-hash cache).
8
+ """
9
+ import logging
10
+ import os
11
+ import sys
12
+ from functools import lru_cache
13
+
14
+ from fastembed import TextEmbedding
15
+
16
+ from context_engine.indexer.embedding_cache import EmbeddingCache
17
+ from context_engine.models import Chunk
18
+
19
+ log = logging.getLogger(__name__)
20
+
21
+ _DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
22
+
23
+ # Passed straight to fastembed's `parallel` argument:
24
+ # None → no data-parallel mp; use onnxruntime's own threading
25
+ # N>0 → spawn N forkserver workers around onnxruntime
26
+ #
27
+ # Even parallel=1 takes the multiprocessing path — and that path deadlocks on
28
+ # macOS (workers idle on SimpleQueue.get while the main thread sits in
29
+ # asyncio.poll, leaving `cce init` stuck after the file-scan progress bar
30
+ # hits 100%). Default to None on darwin; allow override via CCE_EMBED_PARALLEL.
31
+ def _resolve_parallel() -> int | None:
32
+ override = os.environ.get("CCE_EMBED_PARALLEL")
33
+ if override:
34
+ try:
35
+ return max(1, int(override))
36
+ except ValueError:
37
+ pass
38
+ if sys.platform == "darwin":
39
+ return None
40
+ return min(os.cpu_count() or 2, 4)
41
+
42
+
43
+ _PARALLEL: int | None = _resolve_parallel()
44
+
45
+
46
class Embedder:
    """Generate embeddings for chunks and queries with a fastembed model.

    An optional EmbeddingCache lets ``embed`` skip the model for chunks whose
    content hash is already stored.
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-small-en-v1.5",
        cache: EmbeddingCache | None = None,
    ) -> None:
        """Load *model_name* and remember the optional *cache*.

        Raises:
            RuntimeError: if fastembed cannot load the model.
        """
        self._cache = cache
        # Resolve short names: "all-MiniLM-L6-v2" → "sentence-transformers/all-MiniLM-L6-v2"
        # but leave fully qualified names like "BAAI/bge-small-en-v1.5" alone.
        if "/" not in model_name:
            resolved = f"sentence-transformers/{model_name}"
        else:
            resolved = model_name
        try:
            self._model = TextEmbedding(resolved)
        except Exception as exc:
            raise RuntimeError(
                f"Failed to load embedding model '{model_name}'. "
                f"Ensure fastembed is installed and the model name is valid. "
                f"Supported models: TextEmbedding.list_supported_models(). "
                f"Original error: {exc}"
            ) from exc

    def embed(
        self,
        chunks: list[Chunk],
        batch_size: int = 64,
        progress_fn=None,
    ) -> None:
        """Embed chunks in-place. With a cache attached, only chunks whose
        content hash is not already in the cache go through the model.

        `progress_fn(current, total)` is called as embedding proceeds, where
        `total` is the count of chunks that actually needed embedding (cache
        misses). Cache hits return instantly and don't trigger callbacks.
        """
        if not chunks:
            return

        if self._cache is None:
            self._embed_all(chunks, batch_size, progress_fn=progress_fn)
            return

        # Hash + batched lookup: one SQL roundtrip for the whole batch
        # instead of N roundtrips through the per-chunk get() path.
        hashes = [self._cache.content_hash(c.content) for c in chunks]
        cached = self._cache.get_batch(hashes)

        miss_indices: list[int] = []
        for i, h in enumerate(hashes):
            if h in cached:
                chunks[i].embedding = cached[h]
            else:
                miss_indices.append(i)

        if miss_indices:
            miss_chunks = [chunks[i] for i in miss_indices]
            self._embed_all(miss_chunks, batch_size, progress_fn=progress_fn)
            # Persist newly-computed embeddings back to the cache.
            new_entries = [
                (hashes[i], chunks[i].embedding)
                for i in miss_indices
                if chunks[i].embedding is not None
            ]
            if new_entries:
                self._cache.put_batch(new_entries)

        cache_total = len(chunks)
        cache_hits = cache_total - len(miss_indices)
        if cache_hits > 0:
            log.info(
                "Embedding cache: %d/%d hits (%.0f%% reused)",
                cache_hits, cache_total, cache_hits / cache_total * 100,
            )

    def _embed_all(
        self,
        chunks: list[Chunk],
        batch_size: int = 64,
        progress_fn=None,
    ) -> None:
        """Embed all chunks via the model (no cache).

        Iterates fastembed's generator one item at a time so we can tick a
        progress callback. The model still embeds in batches internally; we
        just observe one yielded vector at a time.
        """
        texts = [c.content for c in chunks]
        total = len(texts)
        if progress_fn:
            progress_fn(0, total)
        for i, emb in enumerate(self._model.embed(
            texts,
            batch_size=batch_size,
            parallel=_PARALLEL,
        )):
            chunks[i].embedding = emb.tolist()
            # Report once per full batch and once more at the very end.
            if progress_fn and ((i + 1) % batch_size == 0 or i + 1 == total):
                progress_fn(i + 1, total)

    def embed_query(self, query: str) -> tuple:
        """Embed a single query string, memoised per instance (256 entries).

        Returns a tuple so the result is hashable and safe to share between
        callers. Callers that need a list (e.g. LanceDB) should use
        list(result) or the _to_list() helper in vector_store.

        The memo lives on the instance rather than on the class: a
        class-level ``lru_cache`` on a method keys on ``self`` and therefore
        keeps every Embedder (and its loaded model) alive for as long as the
        entry stays in the cache.
        """
        lookup = self.__dict__.get("_query_cache")
        if lookup is None:
            # Created lazily so instances built without __init__ still work.
            lookup = lru_cache(maxsize=256)(self._embed_query_uncached)
            self._query_cache = lookup
        return lookup(query)

    def _embed_query_uncached(self, query: str) -> tuple:
        """Run the model on *query* and return the vector as a tuple."""
        results = list(self._model.query_embed(query))
        return tuple(results[0].tolist())
@@ -0,0 +1,168 @@
1
+ """SQLite-backed embedding cache keyed by content hash.
2
+
3
+ Avoids recomputing embeddings for unchanged code chunks across re-index runs.
4
+ Inspired by Cursor's approach of caching embeddings by chunk content hash so
5
+ identical code is never re-embedded.
6
+
7
+ Vectors are stored via `struct.pack` (binary float32) rather than JSON — same
8
+ encoding the sqlite-vec store uses elsewhere in the codebase. JSON would be
9
+ ~4× larger on disk for typical 384-dim embeddings.
10
+ """
11
+ import hashlib
12
+ import logging
13
+ import sqlite3
14
+ import struct
15
+ from pathlib import Path
16
+ from threading import RLock
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+ _SCHEMA = """
21
+ CREATE TABLE IF NOT EXISTS embedding_cache (
22
+ content_hash TEXT PRIMARY KEY,
23
+ dim INTEGER NOT NULL,
24
+ embedding BLOB NOT NULL
25
+ );
26
+ """
27
+
28
+
29
+ class EmbeddingCache:
30
+ """Maps content SHA-256 → embedding vector, persisted in SQLite.
31
+
32
+ When *model_name* is provided the content hash is salted with the model
33
+ identifier so that switching embedding models automatically invalidates
34
+ stale cache entries rather than silently returning vectors with the wrong
35
+ dimensionality or semantics.
36
+
37
+ All SQLite access is serialised with an RLock (same pattern as VectorStore
38
+ and FTSStore). ``check_same_thread=False`` only disables Python's ownership
39
+ check; concurrent calls still need explicit locking.
40
+ """
41
+
42
+ def __init__(self, cache_path: Path, *, model_name: str = "") -> None:
43
+ self._path = cache_path
44
+ self._model_name = model_name
45
+ self._path.parent.mkdir(parents=True, exist_ok=True)
46
+ self._lock = RLock()
47
+ self._conn = sqlite3.connect(str(self._path), check_same_thread=False)
48
+ self._conn.execute("PRAGMA journal_mode=WAL")
49
+ self._conn.execute("PRAGMA synchronous=NORMAL")
50
+ self._conn.execute(_SCHEMA)
51
+ self._conn.commit()
52
+ self._hits = 0
53
+ self._misses = 0
54
+
55
+ def content_hash(self, text: str) -> str:
56
+ """SHA-256 of *text*, salted with model name when set."""
57
+ key = f"{self._model_name}:{text}" if self._model_name else text
58
+ return hashlib.sha256(key.encode("utf-8")).hexdigest()
59
+
60
+ @staticmethod
61
+ def _pack(vec) -> bytes:
62
+ v = list(vec) if not isinstance(vec, list) else vec
63
+ return struct.pack(f"{len(v)}f", *v)
64
+
65
+ @staticmethod
66
+ def _unpack(blob: bytes, dim: int) -> list[float]:
67
+ return list(struct.unpack(f"{dim}f", blob))
68
+
69
+ def get(self, content_hash: str) -> list[float] | None:
70
+ with self._lock:
71
+ row = self._conn.execute(
72
+ "SELECT dim, embedding FROM embedding_cache WHERE content_hash = ?",
73
+ (content_hash,),
74
+ ).fetchone()
75
+ if row is None:
76
+ self._misses += 1
77
+ return None
78
+ self._hits += 1
79
+ return self._unpack(row[1], row[0])
80
+
81
+ def put(self, content_hash: str, embedding) -> None:
82
+ v = list(embedding) if not isinstance(embedding, list) else embedding
83
+ with self._lock:
84
+ self._conn.execute(
85
+ "INSERT OR REPLACE INTO embedding_cache (content_hash, dim, embedding) VALUES (?, ?, ?)",
86
+ (content_hash, len(v), self._pack(v)),
87
+ )
88
+ self._conn.commit()
89
+
90
+ def put_batch(self, items: list[tuple[str, list[float]]]) -> None:
91
+ rows = [(h, len(e), self._pack(e)) for h, e in items]
92
+ with self._lock:
93
+ self._conn.executemany(
94
+ "INSERT OR REPLACE INTO embedding_cache (content_hash, dim, embedding) VALUES (?, ?, ?)",
95
+ rows,
96
+ )
97
+ self._conn.commit()
98
+
99
+ def get_batch(self, content_hashes: list[str]) -> dict[str, list[float]]:
100
+ """Retrieve multiple embeddings at once. Returns hash → embedding for hits."""
101
+ if not content_hashes:
102
+ return {}
103
+ results: dict[str, list[float]] = {}
104
+ with self._lock:
105
+ for i in range(0, len(content_hashes), 500):
106
+ batch = content_hashes[i : i + 500]
107
+ placeholders = ",".join("?" * len(batch))
108
+ rows = self._conn.execute(
109
+ f"SELECT content_hash, dim, embedding FROM embedding_cache "
110
+ f"WHERE content_hash IN ({placeholders})",
111
+ batch,
112
+ ).fetchall()
113
+ for h, dim, blob in rows:
114
+ results[h] = self._unpack(blob, dim)
115
+ self._hits += len(results)
116
+ self._misses += len(content_hashes) - len(results)
117
+ return results
118
+
119
+ def prune_orphans(self, known_hashes: set[str]) -> int:
120
+ """Drop cached entries whose content_hash is not in `known_hashes`.
121
+
122
+ Cache grows monotonically without this — every chunk content variant
123
+ ever seen accumulates forever even after the source files change or
124
+ get deleted. Call this after a `cce index --full` with the set of
125
+ hashes still present in the live index. Returns the count removed.
126
+ """
127
+ if not known_hashes:
128
+ return 0
129
+ with self._lock:
130
+ cur = self._conn.execute("SELECT content_hash FROM embedding_cache")
131
+ current = {row[0] for row in cur.fetchall()}
132
+ orphans = current - known_hashes
133
+ if not orphans:
134
+ return 0
135
+ removed = 0
136
+ orphan_list = list(orphans)
137
+ for i in range(0, len(orphan_list), 500):
138
+ batch = orphan_list[i : i + 500]
139
+ placeholders = ",".join("?" * len(batch))
140
+ self._conn.execute(
141
+ f"DELETE FROM embedding_cache WHERE content_hash IN ({placeholders})",
142
+ batch,
143
+ )
144
+ removed += len(batch)
145
+ self._conn.commit()
146
+ return removed
147
+
148
+ @property
149
+ def hits(self) -> int:
150
+ return self._hits
151
+
152
+ @property
153
+ def misses(self) -> int:
154
+ return self._misses
155
+
156
+ @property
157
+ def hit_rate(self) -> float:
158
+ total = self._hits + self._misses
159
+ return self._hits / total if total > 0 else 0.0
160
+
161
+ def size(self) -> int:
162
+ with self._lock:
163
+ row = self._conn.execute("SELECT COUNT(*) FROM embedding_cache").fetchone()
164
+ return row[0] if row else 0
165
+
166
+ def close(self) -> None:
167
+ with self._lock:
168
+ self._conn.close()
@@ -0,0 +1,73 @@
1
+ """Git hook installer and handler for triggering re-indexing."""
2
+ import os
3
+ import shutil
4
+ import stat
5
+ import sys
6
+ from pathlib import Path
7
+
8
# Comment line written at the top of the CCE snippet; its presence in an
# existing hook file means the snippet is already installed.
HOOK_MARKER = "# cce hook"

# Git hooks that receive the CCE snippet.
HOOK_NAMES = [
    "post-commit",
    "post-checkout",
    "post-merge",
]
10
+
11
+
12
+ def _resolve_cce_binary() -> str:
13
+ """Find an absolute path to the `cce` launcher.
14
+
15
+ Preferring an absolute path means the git hook keeps working when the user
16
+ runs `git commit` from a shell that doesn't pick up the same PATH as the one
17
+ used to install the engine (e.g. different login shell, GUI git client).
18
+ """
19
+ candidate = Path(sys.executable).parent / "cce"
20
+ if candidate.exists():
21
+ return str(candidate)
22
+ which = shutil.which("cce") or shutil.which("code-context-engine")
23
+ if which:
24
+ return which
25
+ # Last-resort: rely on PATH at hook-run time.
26
+ return "cce"
27
+
28
+
29
def _hook_script() -> str:
    """Return the shell snippet appended to each git hook.

    The snippet starts with HOOK_MARKER and runs `cce index --changed-only`
    in the background with all output discarded.

    The launcher path is shell-quoted because it is written into an `sh`
    script: an install location containing spaces or other shell
    metacharacters would otherwise be split into several words and the hook
    would fail. Paths without such characters are emitted unchanged.
    """
    import shlex

    bin_path = shlex.quote(_resolve_cce_binary())
    return f"{HOOK_MARKER}\n{bin_path} index --changed-only >/dev/null 2>&1 &\n"
34
+
35
+
36
def install_hooks(project_dir: str) -> list[str]:
    """Install the CCE snippet into every hook listed in HOOK_NAMES.

    Returns the paths of the hook files that were processed, or an empty list
    when `<project_dir>/.git/hooks` does not exist (e.g. not a git repo).
    """
    hooks_dir = Path(project_dir, ".git", "hooks")
    if not hooks_dir.exists():
        return []

    processed: list[str] = []
    for hook_path in (hooks_dir / name for name in HOOK_NAMES):
        _install_single_hook(hook_path)
        processed.append(str(hook_path))
    return processed
47
+
48
+
49
def _install_single_hook(hook_path: Path) -> None:
    """Create *hook_path* or append the CCE snippet to it, then mark it executable.

    A hook file that already contains HOOK_MARKER is left untouched.
    """
    script = _hook_script()

    if not hook_path.exists():
        # Fresh hook: shebang followed by the snippet.
        new_content = "#!/bin/sh\n\n" + script
    else:
        existing = hook_path.read_text()
        if HOOK_MARKER in existing:
            return
        # Keep the existing hook body and add the snippet after a blank line.
        new_content = existing.rstrip() + "\n\n" + script

    hook_path.write_text(new_content)
    current_mode = hook_path.stat().st_mode
    hook_path.chmod(current_mode | stat.S_IEXEC)
60
+
61
+
62
def get_changed_files_from_hook() -> list[str]:
    """Return the paths changed by the most recent commit.

    Runs git in the current working directory. Returns an empty list when git
    is missing, times out, or the directory is not a usable repository.

    Two details matter for correctness:

    * ``-z`` makes git emit NUL-separated, unquoted paths. Without it git
      wraps paths containing unusual or non-ASCII characters in quotes with
      escape sequences, so the returned names would not match real files.
    * ``HEAD~1`` does not exist on the first commit of a repository, so the
      plain diff fails there; ``git diff-tree --root`` is used as a fallback
      to list the files of that root commit.
    """
    import subprocess

    commands = (
        ["git", "diff", "--name-only", "-z", "HEAD~1", "HEAD"],
        ["git", "diff-tree", "--root", "--no-commit-id", "--name-only", "-r", "-z", "HEAD"],
    )
    for command in commands:
        try:
            result = subprocess.run(
                command,
                capture_output=True,
                # surrogateescape keeps undecodable path bytes instead of raising.
                encoding="utf-8", errors="surrogateescape",
                timeout=10,
            )
        except (subprocess.TimeoutExpired, FileNotFoundError):
            return []
        if result.returncode == 0:
            return [name for name in result.stdout.split("\0") if name]
    return []