code-context-engine 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context_engine-0.4.0.dist-info/METADATA +389 -0
- code_context_engine-0.4.0.dist-info/RECORD +63 -0
- code_context_engine-0.4.0.dist-info/WHEEL +5 -0
- code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
- code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
- code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
- context_engine/__init__.py +3 -0
- context_engine/cli.py +2848 -0
- context_engine/cli_style.py +66 -0
- context_engine/compression/__init__.py +0 -0
- context_engine/compression/compressor.py +144 -0
- context_engine/compression/ollama_client.py +33 -0
- context_engine/compression/output_rules.py +77 -0
- context_engine/compression/prompts.py +9 -0
- context_engine/compression/quality.py +37 -0
- context_engine/config.py +198 -0
- context_engine/dashboard/__init__.py +0 -0
- context_engine/dashboard/_page.py +1548 -0
- context_engine/dashboard/server.py +429 -0
- context_engine/editors.py +265 -0
- context_engine/event_bus.py +24 -0
- context_engine/indexer/__init__.py +0 -0
- context_engine/indexer/chunker.py +147 -0
- context_engine/indexer/embedder.py +154 -0
- context_engine/indexer/embedding_cache.py +168 -0
- context_engine/indexer/git_hooks.py +73 -0
- context_engine/indexer/git_indexer.py +136 -0
- context_engine/indexer/ignorefile.py +96 -0
- context_engine/indexer/manifest.py +78 -0
- context_engine/indexer/pipeline.py +624 -0
- context_engine/indexer/secrets.py +332 -0
- context_engine/indexer/watcher.py +109 -0
- context_engine/integration/__init__.py +0 -0
- context_engine/integration/bootstrap.py +76 -0
- context_engine/integration/git_context.py +132 -0
- context_engine/integration/mcp_server.py +1825 -0
- context_engine/integration/session_capture.py +306 -0
- context_engine/memory/__init__.py +6 -0
- context_engine/memory/compressor.py +344 -0
- context_engine/memory/db.py +922 -0
- context_engine/memory/extractive.py +106 -0
- context_engine/memory/grammar.py +419 -0
- context_engine/memory/hook_installer.py +258 -0
- context_engine/memory/hook_server.py +83 -0
- context_engine/memory/hooks.py +327 -0
- context_engine/memory/migrate.py +268 -0
- context_engine/models.py +96 -0
- context_engine/pricing.py +104 -0
- context_engine/project_commands.py +296 -0
- context_engine/retrieval/__init__.py +0 -0
- context_engine/retrieval/confidence.py +47 -0
- context_engine/retrieval/query_parser.py +105 -0
- context_engine/retrieval/retriever.py +199 -0
- context_engine/serve_http.py +208 -0
- context_engine/services.py +252 -0
- context_engine/storage/__init__.py +0 -0
- context_engine/storage/backend.py +39 -0
- context_engine/storage/fts_store.py +112 -0
- context_engine/storage/graph_store.py +219 -0
- context_engine/storage/local_backend.py +109 -0
- context_engine/storage/remote_backend.py +117 -0
- context_engine/storage/vector_store.py +357 -0
- context_engine/utils.py +72 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Simple async event bus for inter-module communication."""
|
|
2
|
+
import asyncio
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import Any, Callable, Coroutine
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Shape every subscriber must have: an async callable that takes the event
# payload and returns nothing.
Handler = Callable[[Any], Coroutine[Any, Any, None]]


class EventBus:
    """In-process publish/subscribe hub keyed by event name.

    Handlers are coroutine functions. ``emit`` awaits them one after another
    in subscription order; an exception raised by a handler propagates to the
    caller of ``emit`` and stops delivery to the remaining handlers.
    """

    def __init__(self) -> None:
        # event name -> handlers in subscription order
        self._handlers: dict[str, list[Handler]] = defaultdict(list)

    def subscribe(self, event: str, handler: Handler) -> None:
        """Register *handler* for *event*.

        Registering the same handler twice means it is called twice per emit.
        """
        self._handlers[event].append(handler)

    def unsubscribe(self, event: str, handler: Handler) -> None:
        """Remove one registration of *handler* for *event*; no-op if absent."""
        # .get() instead of [] so an unknown event does not create an empty
        # entry in the defaultdict.
        handlers = self._handlers.get(event, [])
        if handler in handlers:
            handlers.remove(handler)

    async def emit(self, event: str, data: Any = None) -> None:
        """Await every handler subscribed to *event*, passing *data* to each.

        Delivery uses a snapshot of the subscriber list taken when ``emit``
        starts. Handlers may therefore subscribe or unsubscribe (including
        themselves) while the event is being delivered without causing other
        handlers to be skipped; such changes take effect from the next emit.
        """
        # Iterating the live list while a handler removes itself would shift
        # the remaining items and silently skip the next handler.
        for handler in list(self._handlers.get(event, ())):
            await handler(data)
|
|
File without changes
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""AST-aware code chunking using tree-sitter."""
|
|
2
|
+
import hashlib
|
|
3
|
+
|
|
4
|
+
import tree_sitter_python as tspython
|
|
5
|
+
import tree_sitter_javascript as tsjavascript
|
|
6
|
+
import tree_sitter_typescript as tstypescript
|
|
7
|
+
import tree_sitter_php as tsphp
|
|
8
|
+
import tree_sitter_go as tsgo
|
|
9
|
+
import tree_sitter_rust as tsrust
|
|
10
|
+
import tree_sitter_java as tsjava
|
|
11
|
+
from tree_sitter import Language, Parser
|
|
12
|
+
|
|
13
|
+
from context_engine.models import Chunk, ChunkType
|
|
14
|
+
|
|
15
|
+
# tree-sitter node types that are turned into FUNCTION chunks.
_FUNCTION_TYPES = {
    # Python, PHP, JS
    "function_definition",
    "function_declaration",
    # JS/TS, PHP/Go/Java
    "method_definition",
    "method_declaration",
    # JS/TS
    "arrow_function",
    # Rust
    "function_item",
}

# tree-sitter node types that are turned into CLASS chunks.
_CLASS_TYPES = {
    # Python, JS/TS, PHP, Java
    "class_definition",
    "class_declaration",
    # Go (struct/interface)
    "type_declaration",
    # Rust
    "struct_item",
    "impl_item",
    "enum_item",
}

# tree-sitter node types that are inspected for imported module names.
_IMPORT_TYPES = {
    # Python
    "import_statement",
    "import_from_statement",
    # TypeScript, Go, Java
    "import_declaration",
    # PHP, Rust
    "use_declaration",
}
|
|
31
|
+
|
|
32
|
+
# Language name -> compiled tree-sitter grammar. Built once at import time;
# the keys are the language names the chunker accepts.
_LANGUAGES = {
    name: Language(grammar())
    for name, grammar in (
        ("python", tspython.language),
        ("javascript", tsjavascript.language),
        ("typescript", tstypescript.language_typescript),
        ("tsx", tstypescript.language_tsx),
        ("php", tsphp.language_php),
        ("go", tsgo.language),
        ("rust", tsrust.language),
        ("java", tsjava.language),
    )
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class Chunker:
    """Split source code into function- and class-level chunks with tree-sitter.

    Languages present in ``_LANGUAGES`` are parsed. Any other language, or a
    file in which no function/class node is found, is returned as a single
    MODULE chunk that holds the whole file.
    """

    def __init__(self) -> None:
        # Parsers are created lazily, one per language, and then reused.
        self._parsers: dict[str, Parser] = {}

    def _get_parser(self, language: str) -> Parser | None:
        """Return the cached parser for *language*, or None if unsupported."""
        if language not in _LANGUAGES:
            return None
        if language not in self._parsers:
            self._parsers[language] = Parser(_LANGUAGES[language])
        return self._parsers[language]

    @staticmethod
    def _node_text(node, source) -> str:
        """Return the text covered by *node*.

        ``node.start_byte`` / ``node.end_byte`` are offsets into the UTF-8
        bytes handed to the parser, not character indices. Slicing the ``str``
        with them returns the wrong span as soon as a multi-byte character
        precedes the node, so the slice is always taken on bytes.

        *source* may be the already-encoded bytes (preferred: encode once per
        file) or the original ``str`` (encoded here on every call).
        """
        data = source.encode("utf-8") if isinstance(source, str) else source
        return data[node.start_byte:node.end_byte].decode("utf-8", errors="replace")

    @staticmethod
    def _package_from_specifier(raw: str) -> str:
        """Reduce a JS/TS module specifier to its package name.

        ``"lodash/fp"`` -> ``"lodash"``; scoped ``"@scope/pkg/sub"`` ->
        ``"@scope/pkg"``.
        """
        parts = raw.split("/")
        return "/".join(parts[:2]) if raw.startswith("@") else parts[0]

    def chunk(self, source: str, file_path: str, language: str) -> list[Chunk]:
        """Return the chunks found in *source*.

        Falls back to one MODULE chunk when *language* has no grammar or when
        the parse tree contains no function/class node.
        """
        parser = self._get_parser(language)
        if parser is None:
            return [self._fallback_chunk(source, file_path, language)]
        encoded = source.encode("utf-8")
        tree = parser.parse(encoded)
        chunks: list[Chunk] = []
        # Pass the encoded bytes down so node offsets are applied to the same
        # buffer the parser saw, and so the file is encoded only once.
        self._walk(tree.root_node, encoded, file_path, language, chunks)
        if not chunks:
            return [self._fallback_chunk(source, file_path, language)]
        return chunks

    def _walk(self, node, source, file_path, language, chunks):
        """Append a chunk for every function/class node at or below *node*.

        Nodes are visited in pre-order (document order). Traversal continues
        into matched nodes, so a nested definition produces its own chunk in
        addition to the chunk of the definition that contains it.
        """
        # Explicit stack instead of recursion: deeply nested syntax trees
        # would otherwise hit Python's recursion limit.
        pending = [node]
        while pending:
            current = pending.pop()
            if current.type in _FUNCTION_TYPES:
                chunks.append(
                    self._node_to_chunk(current, source, file_path, language, ChunkType.FUNCTION)
                )
            elif current.type in _CLASS_TYPES:
                chunks.append(
                    self._node_to_chunk(current, source, file_path, language, ChunkType.CLASS)
                )
            # Reversed so the first child is popped (visited) first.
            pending.extend(reversed(current.children))

    def _node_to_chunk(self, node, source, file_path, language, chunk_type):
        """Build a Chunk for *node* with 1-based, inclusive line numbers."""
        content = self._node_text(node, source)
        start_line = node.start_point.row + 1
        end_line = node.end_point.row + 1
        # Id: first 16 hex chars of a SHA-256 over location + content prefix.
        chunk_id = hashlib.sha256(
            f"{file_path}:{start_line}:{end_line}:{content[:100]}".encode()
        ).hexdigest()[:16]
        return Chunk(
            id=chunk_id, content=content, chunk_type=chunk_type,
            file_path=file_path, start_line=start_line, end_line=end_line, language=language,
        )

    def chunk_with_imports(
        self, source: str, file_path: str, language: str
    ) -> tuple[list[Chunk], list[str]]:
        """Return ``(chunks, imported_module_names)`` for *source*."""
        chunks = self.chunk(source, file_path, language)
        imports = self._extract_imports(source, language)
        return chunks, imports

    def _extract_imports(self, source: str, language: str) -> list[str]:
        """Return the distinct module names imported by *source*, in order.

        Returns an empty list when *language* has no grammar.
        """
        parser = self._get_parser(language)
        if parser is None:
            return []
        encoded = source.encode("utf-8")
        tree = parser.parse(encoded)
        imports: list[str] = []
        self._walk_imports(tree.root_node, encoded, language, imports)
        return list(dict.fromkeys(imports))  # deduplicate while preserving order

    def _walk_imports(self, node, source, language, imports):
        """Collect a module name from every import node at or below *node*."""
        pending = [node]
        while pending:
            current = pending.pop()
            if current.type in _IMPORT_TYPES:
                module = self._parse_import_module(current, source, language)
                if module:
                    imports.append(module)
            pending.extend(reversed(current.children))

    def _parse_import_module(self, node, source, language) -> str | None:
        """Return the top-level module/package named by an import node.

        Only the first matching child is used, so an import node yields at
        most one name. Node types without a branch here (for example
        ``use_declaration``) yield None.
        """
        if node.type == "import_statement":
            # Python: "import os" or "import os.path"
            # Also handles JS/TS: "import React from 'react'" (string child present)
            for child in node.children:
                if child.type == "string":
                    # JavaScript/TypeScript import with string module specifier
                    raw = self._node_text(child, source).strip("'\"")
                    return self._package_from_specifier(raw)
                if child.type in ("dotted_name", "aliased_import"):
                    # Python bare import; drop any "as alias" suffix.
                    name = self._node_text(child, source)
                    name = name.split(" as ")[0].strip()
                    return name.split(".")[0]
        elif node.type == "import_from_statement":
            # Python: "from pathlib import Path"
            for child in node.children:
                if child.type in ("dotted_name", "relative_import"):
                    name = self._node_text(child, source).strip()
                    # Relative imports: strip leading dots; a bare "." leaves
                    # nothing, in which case the next child is tried.
                    name = name.lstrip(".")
                    if name:
                        return name.split(".")[0]
        elif node.type == "import_declaration":
            # TypeScript (tree-sitter-typescript): "import React from 'react'"
            for child in node.children:
                if child.type == "string":
                    raw = self._node_text(child, source).strip("'\"")
                    return self._package_from_specifier(raw)
        return None

    def _fallback_chunk(self, source, file_path, language):
        """Return one MODULE chunk covering the whole of *source*."""
        chunk_id = hashlib.sha256(f"{file_path}:module".encode()).hexdigest()[:16]
        # end_line is the line count of the whitespace-stripped source.
        lines = source.strip().split("\n")
        return Chunk(
            id=chunk_id, content=source, chunk_type=ChunkType.MODULE,
            file_path=file_path, start_line=1, end_line=len(lines), language=language,
        )
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""Embedding generation using fastembed (lightweight ONNX-based embeddings).
|
|
2
|
+
|
|
3
|
+
Uses BAAI/bge-small-en-v1.5 by default — 33% smaller and better quality
|
|
4
|
+
than all-MiniLM-L6-v2. Parallel embedding for 3-4x faster indexing.
|
|
5
|
+
|
|
6
|
+
Supports an optional EmbeddingCache so unchanged code chunks are never
|
|
7
|
+
re-embedded across index runs (inspired by Cursor's content-hash cache).
|
|
8
|
+
"""
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
from functools import lru_cache
|
|
13
|
+
|
|
14
|
+
from fastembed import TextEmbedding
|
|
15
|
+
|
|
16
|
+
from context_engine.indexer.embedding_cache import EmbeddingCache
|
|
17
|
+
from context_engine.models import Chunk
|
|
18
|
+
|
|
19
|
+
log = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
_DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
|
|
22
|
+
|
|
23
|
+
# Passed straight to fastembed's `parallel` argument:
|
|
24
|
+
# None → no data-parallel mp; use onnxruntime's own threading
|
|
25
|
+
# N>0 → spawn N forkserver workers around onnxruntime
|
|
26
|
+
#
|
|
27
|
+
# Even parallel=1 takes the multiprocessing path — and that path deadlocks on
|
|
28
|
+
# macOS (workers idle on SimpleQueue.get while the main thread sits in
|
|
29
|
+
# asyncio.poll, leaving `cce init` stuck after the file-scan progress bar
|
|
30
|
+
# hits 100%). Default to None on darwin; allow override via CCE_EMBED_PARALLEL.
|
|
31
|
+
def _resolve_parallel() -> int | None:
|
|
32
|
+
override = os.environ.get("CCE_EMBED_PARALLEL")
|
|
33
|
+
if override:
|
|
34
|
+
try:
|
|
35
|
+
return max(1, int(override))
|
|
36
|
+
except ValueError:
|
|
37
|
+
pass
|
|
38
|
+
if sys.platform == "darwin":
|
|
39
|
+
return None
|
|
40
|
+
return min(os.cpu_count() or 2, 4)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
_PARALLEL: int | None = _resolve_parallel()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Embedder:
    """Generate embeddings for chunks and queries with a fastembed model.

    When an ``EmbeddingCache`` is attached, chunk embeddings are looked up by
    content hash first and only cache misses go through the model. Query
    embeddings are memoised per instance (up to 256 distinct queries).
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-small-en-v1.5",
        cache: EmbeddingCache | None = None,
    ) -> None:
        """Load the embedding model.

        Args:
            model_name: fastembed model name. A name without ``/`` is prefixed
                with ``sentence-transformers/``.
            cache: optional persistent cache of chunk embeddings.

        Raises:
            RuntimeError: if the model cannot be loaded.
        """
        self._cache = cache
        # Resolve short names: "all-MiniLM-L6-v2" → "sentence-transformers/all-MiniLM-L6-v2"
        # but leave fully qualified names like "BAAI/bge-small-en-v1.5" alone.
        if "/" not in model_name:
            resolved = f"sentence-transformers/{model_name}"
        else:
            resolved = model_name
        try:
            self._model = TextEmbedding(resolved)
        except Exception as exc:
            raise RuntimeError(
                f"Failed to load embedding model '{model_name}'. "
                f"Ensure fastembed is installed and the model name is valid. "
                f"Supported models: TextEmbedding.list_supported_models(). "
                f"Original error: {exc}"
            ) from exc
        # Per-instance LRU. Decorating the method itself with @lru_cache keeps
        # one class-level cache keyed on (self, query), which pins every
        # Embedder (and its loaded model) in memory for the life of the
        # process and makes all instances share the same 256 slots.
        self._query_cache = lru_cache(maxsize=256)(self._embed_query_uncached)

    def embed(
        self,
        chunks: list[Chunk],
        batch_size: int = 64,
        progress_fn=None,
    ) -> None:
        """Embed chunks in-place. With a cache attached, only chunks whose
        content hash is not already in the cache go through the model.

        `progress_fn(current, total)` is called as embedding proceeds, where
        `total` is the count of chunks that actually needed embedding (cache
        misses). Cache hits return instantly and don't trigger callbacks.
        """
        if not chunks:
            return

        if self._cache is None:
            self._embed_all(chunks, batch_size, progress_fn=progress_fn)
            return

        # Hash + batched lookup: one SQL roundtrip for the whole batch
        # instead of N roundtrips through the per-chunk get() path.
        hashes = [self._cache.content_hash(c.content) for c in chunks]
        cached = self._cache.get_batch(hashes)

        miss_indices: list[int] = []
        for i, h in enumerate(hashes):
            if h in cached:
                chunks[i].embedding = cached[h]
            else:
                miss_indices.append(i)

        if miss_indices:
            miss_chunks = [chunks[i] for i in miss_indices]
            self._embed_all(miss_chunks, batch_size, progress_fn=progress_fn)
            # Persist newly-computed embeddings back to the cache.
            new_entries = [
                (hashes[i], chunks[i].embedding)
                for i in miss_indices
                if chunks[i].embedding is not None
            ]
            if new_entries:
                self._cache.put_batch(new_entries)

        cache_total = len(chunks)
        cache_hits = cache_total - len(miss_indices)
        if cache_hits > 0:
            log.info(
                "Embedding cache: %d/%d hits (%.0f%% reused)",
                cache_hits, cache_total, cache_hits / cache_total * 100,
            )

    def _embed_all(
        self,
        chunks: list[Chunk],
        batch_size: int = 64,
        progress_fn=None,
    ) -> None:
        """Embed all chunks via the model (no cache).

        Iterates fastembed's generator one item at a time so we can tick a
        progress callback. The model still embeds in batches internally; we
        just observe one yielded vector at a time.
        """
        texts = [c.content for c in chunks]
        total = len(texts)
        if progress_fn:
            progress_fn(0, total)
        for i, emb in enumerate(self._model.embed(
            texts,
            batch_size=batch_size,
            parallel=_PARALLEL,
        )):
            chunks[i].embedding = emb.tolist()
            # Report once per completed batch and once at the very end.
            if progress_fn and ((i + 1) % batch_size == 0 or i + 1 == total):
                progress_fn(i + 1, total)

    def embed_query(self, query: str) -> tuple:
        """Embed a single query string, memoised per instance.

        Returns a tuple (immutable, so the cached value cannot be modified by
        a caller). Callers that need a list should use ``list(result)``.
        """
        return self._query_cache(query)

    def _embed_query_uncached(self, query: str) -> tuple:
        """Run *query* through the model and return its vector as a tuple."""
        results = list(self._model.query_embed(query))
        return tuple(results[0].tolist())
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""SQLite-backed embedding cache keyed by content hash.
|
|
2
|
+
|
|
3
|
+
Avoids recomputing embeddings for unchanged code chunks across re-index runs.
|
|
4
|
+
Inspired by Cursor's approach of caching embeddings by chunk content hash so
|
|
5
|
+
identical code is never re-embedded.
|
|
6
|
+
|
|
7
|
+
Vectors are stored via `struct.pack` (binary float32) rather than JSON — same
|
|
8
|
+
encoding the sqlite-vec store uses elsewhere in the codebase. JSON would be
|
|
9
|
+
~4× larger on disk for typical 384-dim embeddings.
|
|
10
|
+
"""
|
|
11
|
+
import hashlib
|
|
12
|
+
import logging
|
|
13
|
+
import sqlite3
|
|
14
|
+
import struct
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from threading import RLock
|
|
17
|
+
|
|
18
|
+
log = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
_SCHEMA = """
|
|
21
|
+
CREATE TABLE IF NOT EXISTS embedding_cache (
|
|
22
|
+
content_hash TEXT PRIMARY KEY,
|
|
23
|
+
dim INTEGER NOT NULL,
|
|
24
|
+
embedding BLOB NOT NULL
|
|
25
|
+
);
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class EmbeddingCache:
|
|
30
|
+
"""Maps content SHA-256 → embedding vector, persisted in SQLite.
|
|
31
|
+
|
|
32
|
+
When *model_name* is provided the content hash is salted with the model
|
|
33
|
+
identifier so that switching embedding models automatically invalidates
|
|
34
|
+
stale cache entries rather than silently returning vectors with the wrong
|
|
35
|
+
dimensionality or semantics.
|
|
36
|
+
|
|
37
|
+
All SQLite access is serialised with an RLock (same pattern as VectorStore
|
|
38
|
+
and FTSStore). ``check_same_thread=False`` only disables Python's ownership
|
|
39
|
+
check; concurrent calls still need explicit locking.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
def __init__(self, cache_path: Path, *, model_name: str = "") -> None:
|
|
43
|
+
self._path = cache_path
|
|
44
|
+
self._model_name = model_name
|
|
45
|
+
self._path.parent.mkdir(parents=True, exist_ok=True)
|
|
46
|
+
self._lock = RLock()
|
|
47
|
+
self._conn = sqlite3.connect(str(self._path), check_same_thread=False)
|
|
48
|
+
self._conn.execute("PRAGMA journal_mode=WAL")
|
|
49
|
+
self._conn.execute("PRAGMA synchronous=NORMAL")
|
|
50
|
+
self._conn.execute(_SCHEMA)
|
|
51
|
+
self._conn.commit()
|
|
52
|
+
self._hits = 0
|
|
53
|
+
self._misses = 0
|
|
54
|
+
|
|
55
|
+
def content_hash(self, text: str) -> str:
|
|
56
|
+
"""SHA-256 of *text*, salted with model name when set."""
|
|
57
|
+
key = f"{self._model_name}:{text}" if self._model_name else text
|
|
58
|
+
return hashlib.sha256(key.encode("utf-8")).hexdigest()
|
|
59
|
+
|
|
60
|
+
@staticmethod
|
|
61
|
+
def _pack(vec) -> bytes:
|
|
62
|
+
v = list(vec) if not isinstance(vec, list) else vec
|
|
63
|
+
return struct.pack(f"{len(v)}f", *v)
|
|
64
|
+
|
|
65
|
+
@staticmethod
|
|
66
|
+
def _unpack(blob: bytes, dim: int) -> list[float]:
|
|
67
|
+
return list(struct.unpack(f"{dim}f", blob))
|
|
68
|
+
|
|
69
|
+
def get(self, content_hash: str) -> list[float] | None:
|
|
70
|
+
with self._lock:
|
|
71
|
+
row = self._conn.execute(
|
|
72
|
+
"SELECT dim, embedding FROM embedding_cache WHERE content_hash = ?",
|
|
73
|
+
(content_hash,),
|
|
74
|
+
).fetchone()
|
|
75
|
+
if row is None:
|
|
76
|
+
self._misses += 1
|
|
77
|
+
return None
|
|
78
|
+
self._hits += 1
|
|
79
|
+
return self._unpack(row[1], row[0])
|
|
80
|
+
|
|
81
|
+
def put(self, content_hash: str, embedding) -> None:
|
|
82
|
+
v = list(embedding) if not isinstance(embedding, list) else embedding
|
|
83
|
+
with self._lock:
|
|
84
|
+
self._conn.execute(
|
|
85
|
+
"INSERT OR REPLACE INTO embedding_cache (content_hash, dim, embedding) VALUES (?, ?, ?)",
|
|
86
|
+
(content_hash, len(v), self._pack(v)),
|
|
87
|
+
)
|
|
88
|
+
self._conn.commit()
|
|
89
|
+
|
|
90
|
+
def put_batch(self, items: list[tuple[str, list[float]]]) -> None:
|
|
91
|
+
rows = [(h, len(e), self._pack(e)) for h, e in items]
|
|
92
|
+
with self._lock:
|
|
93
|
+
self._conn.executemany(
|
|
94
|
+
"INSERT OR REPLACE INTO embedding_cache (content_hash, dim, embedding) VALUES (?, ?, ?)",
|
|
95
|
+
rows,
|
|
96
|
+
)
|
|
97
|
+
self._conn.commit()
|
|
98
|
+
|
|
99
|
+
def get_batch(self, content_hashes: list[str]) -> dict[str, list[float]]:
|
|
100
|
+
"""Retrieve multiple embeddings at once. Returns hash → embedding for hits."""
|
|
101
|
+
if not content_hashes:
|
|
102
|
+
return {}
|
|
103
|
+
results: dict[str, list[float]] = {}
|
|
104
|
+
with self._lock:
|
|
105
|
+
for i in range(0, len(content_hashes), 500):
|
|
106
|
+
batch = content_hashes[i : i + 500]
|
|
107
|
+
placeholders = ",".join("?" * len(batch))
|
|
108
|
+
rows = self._conn.execute(
|
|
109
|
+
f"SELECT content_hash, dim, embedding FROM embedding_cache "
|
|
110
|
+
f"WHERE content_hash IN ({placeholders})",
|
|
111
|
+
batch,
|
|
112
|
+
).fetchall()
|
|
113
|
+
for h, dim, blob in rows:
|
|
114
|
+
results[h] = self._unpack(blob, dim)
|
|
115
|
+
self._hits += len(results)
|
|
116
|
+
self._misses += len(content_hashes) - len(results)
|
|
117
|
+
return results
|
|
118
|
+
|
|
119
|
+
def prune_orphans(self, known_hashes: set[str]) -> int:
|
|
120
|
+
"""Drop cached entries whose content_hash is not in `known_hashes`.
|
|
121
|
+
|
|
122
|
+
Cache grows monotonically without this — every chunk content variant
|
|
123
|
+
ever seen accumulates forever even after the source files change or
|
|
124
|
+
get deleted. Call this after a `cce index --full` with the set of
|
|
125
|
+
hashes still present in the live index. Returns the count removed.
|
|
126
|
+
"""
|
|
127
|
+
if not known_hashes:
|
|
128
|
+
return 0
|
|
129
|
+
with self._lock:
|
|
130
|
+
cur = self._conn.execute("SELECT content_hash FROM embedding_cache")
|
|
131
|
+
current = {row[0] for row in cur.fetchall()}
|
|
132
|
+
orphans = current - known_hashes
|
|
133
|
+
if not orphans:
|
|
134
|
+
return 0
|
|
135
|
+
removed = 0
|
|
136
|
+
orphan_list = list(orphans)
|
|
137
|
+
for i in range(0, len(orphan_list), 500):
|
|
138
|
+
batch = orphan_list[i : i + 500]
|
|
139
|
+
placeholders = ",".join("?" * len(batch))
|
|
140
|
+
self._conn.execute(
|
|
141
|
+
f"DELETE FROM embedding_cache WHERE content_hash IN ({placeholders})",
|
|
142
|
+
batch,
|
|
143
|
+
)
|
|
144
|
+
removed += len(batch)
|
|
145
|
+
self._conn.commit()
|
|
146
|
+
return removed
|
|
147
|
+
|
|
148
|
+
@property
|
|
149
|
+
def hits(self) -> int:
|
|
150
|
+
return self._hits
|
|
151
|
+
|
|
152
|
+
@property
|
|
153
|
+
def misses(self) -> int:
|
|
154
|
+
return self._misses
|
|
155
|
+
|
|
156
|
+
@property
|
|
157
|
+
def hit_rate(self) -> float:
|
|
158
|
+
total = self._hits + self._misses
|
|
159
|
+
return self._hits / total if total > 0 else 0.0
|
|
160
|
+
|
|
161
|
+
def size(self) -> int:
|
|
162
|
+
with self._lock:
|
|
163
|
+
row = self._conn.execute("SELECT COUNT(*) FROM embedding_cache").fetchone()
|
|
164
|
+
return row[0] if row else 0
|
|
165
|
+
|
|
166
|
+
def close(self) -> None:
|
|
167
|
+
with self._lock:
|
|
168
|
+
self._conn.close()
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Git hook installer and handler for triggering re-indexing."""
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
import stat
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Comment line written at the top of the CCE snippet; its presence in an
# existing hook file means the snippet is already installed.
HOOK_MARKER = "# cce hook"

# Git hooks that receive the CCE snippet.
HOOK_NAMES = [
    "post-commit",
    "post-checkout",
    "post-merge",
]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _resolve_cce_binary() -> str:
    """Return the command used to launch `cce`, as an absolute path if possible.

    Lookup order: a `cce` file next to the running Python interpreter, then
    `cce` on PATH, then `code-context-engine` on PATH. If none is found the
    bare name "cce" is returned and resolution is left to PATH at hook-run
    time.

    An absolute path keeps the git hook working when `git commit` runs from an
    environment whose PATH differs from the one used to install the engine
    (e.g. different login shell, GUI git client).
    """
    beside_interpreter = Path(sys.executable).parent / "cce"
    if beside_interpreter.exists():
        return str(beside_interpreter)

    for command in ("cce", "code-context-engine"):
        located = shutil.which(command)
        if located:
            return located

    # Last-resort: rely on PATH at hook-run time.
    return "cce"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _hook_script() -> str:
    """Return the shell snippet that is written into each git hook.

    The snippet is the marker line followed by one command that runs
    `cce index --changed-only` in the background with all output discarded,
    so the git operation is not blocked by re-indexing.
    """
    import shlex

    # The launcher path is interpolated into a /bin/sh script. Quote it so a
    # path containing spaces or shell metacharacters (e.g. a virtualenv under
    # "/Users/Jane Doe/...") is still executed as one word. Paths made only of
    # shell-safe characters are emitted unchanged.
    bin_path = shlex.quote(_resolve_cce_binary())
    return f"{HOOK_MARKER}\n{bin_path} index --changed-only >/dev/null 2>&1 &\n"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def install_hooks(project_dir: str) -> list[str]:
    """Install the CCE snippet into every hook listed in HOOK_NAMES.

    Returns the paths of the hook files that were processed, in HOOK_NAMES
    order. Returns an empty list, without raising, when
    `<project_dir>/.git/hooks` does not exist (e.g. not a git repository).
    """
    hooks_root = Path(project_dir, ".git", "hooks")
    if not hooks_root.exists():
        return []

    processed: list[str] = []
    for name in HOOK_NAMES:
        target = hooks_root / name
        _install_single_hook(target)
        processed.append(str(target))
    return processed
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _install_single_hook(hook_path: Path) -> None:
    """Write the CCE snippet into the hook file at *hook_path*.

    - No file yet: create it with a `#!/bin/sh` shebang followed by the snippet.
    - File exists and already contains HOOK_MARKER: leave it untouched.
    - File exists without the marker: append the snippet after the existing
      content (trailing whitespace trimmed, one blank line in between).

    Whenever the file is written, the owner-execute bit is added to its mode.
    """
    snippet = _hook_script()

    if not hook_path.exists():
        updated = f"#!/bin/sh\n\n{snippet}"
    else:
        current = hook_path.read_text()
        if HOOK_MARKER in current:
            # Already installed; nothing to write.
            return
        updated = f"{current.rstrip()}\n\n{snippet}"

    hook_path.write_text(updated)
    mode = hook_path.stat().st_mode
    hook_path.chmod(mode | stat.S_IEXEC)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_changed_files_from_hook() -> list[str]:
    """Return the paths that differ between `HEAD~1` and `HEAD`.

    Runs `git diff --name-only -z HEAD~1 HEAD` in the current working
    directory. Returns an empty list when git is not installed, the command
    takes longer than 10 seconds, or git exits with a non-zero status.
    """
    import subprocess

    try:
        result = subprocess.run(
            # -z: NUL-separated, unquoted paths. Without it git wraps paths
            # containing non-ASCII or special characters in double quotes with
            # octal escapes, producing names that do not exist on disk.
            ["git", "diff", "--name-only", "-z", "HEAD~1", "HEAD"],
            capture_output=True,
            text=True,
            # Decode explicitly instead of relying on the locale, and never
            # raise on an undecodable path byte.
            encoding="utf-8",
            errors="replace",
            timeout=10,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return []
    if result.returncode != 0:
        return []
    return [path for path in result.stdout.split("\0") if path]
|