code-context-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. code_context_engine-0.4.0.dist-info/METADATA +389 -0
  2. code_context_engine-0.4.0.dist-info/RECORD +63 -0
  3. code_context_engine-0.4.0.dist-info/WHEEL +5 -0
  4. code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
  5. code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
  7. context_engine/__init__.py +3 -0
  8. context_engine/cli.py +2848 -0
  9. context_engine/cli_style.py +66 -0
  10. context_engine/compression/__init__.py +0 -0
  11. context_engine/compression/compressor.py +144 -0
  12. context_engine/compression/ollama_client.py +33 -0
  13. context_engine/compression/output_rules.py +77 -0
  14. context_engine/compression/prompts.py +9 -0
  15. context_engine/compression/quality.py +37 -0
  16. context_engine/config.py +198 -0
  17. context_engine/dashboard/__init__.py +0 -0
  18. context_engine/dashboard/_page.py +1548 -0
  19. context_engine/dashboard/server.py +429 -0
  20. context_engine/editors.py +265 -0
  21. context_engine/event_bus.py +24 -0
  22. context_engine/indexer/__init__.py +0 -0
  23. context_engine/indexer/chunker.py +147 -0
  24. context_engine/indexer/embedder.py +154 -0
  25. context_engine/indexer/embedding_cache.py +168 -0
  26. context_engine/indexer/git_hooks.py +73 -0
  27. context_engine/indexer/git_indexer.py +136 -0
  28. context_engine/indexer/ignorefile.py +96 -0
  29. context_engine/indexer/manifest.py +78 -0
  30. context_engine/indexer/pipeline.py +624 -0
  31. context_engine/indexer/secrets.py +332 -0
  32. context_engine/indexer/watcher.py +109 -0
  33. context_engine/integration/__init__.py +0 -0
  34. context_engine/integration/bootstrap.py +76 -0
  35. context_engine/integration/git_context.py +132 -0
  36. context_engine/integration/mcp_server.py +1825 -0
  37. context_engine/integration/session_capture.py +306 -0
  38. context_engine/memory/__init__.py +6 -0
  39. context_engine/memory/compressor.py +344 -0
  40. context_engine/memory/db.py +922 -0
  41. context_engine/memory/extractive.py +106 -0
  42. context_engine/memory/grammar.py +419 -0
  43. context_engine/memory/hook_installer.py +258 -0
  44. context_engine/memory/hook_server.py +83 -0
  45. context_engine/memory/hooks.py +327 -0
  46. context_engine/memory/migrate.py +268 -0
  47. context_engine/models.py +96 -0
  48. context_engine/pricing.py +104 -0
  49. context_engine/project_commands.py +296 -0
  50. context_engine/retrieval/__init__.py +0 -0
  51. context_engine/retrieval/confidence.py +47 -0
  52. context_engine/retrieval/query_parser.py +105 -0
  53. context_engine/retrieval/retriever.py +199 -0
  54. context_engine/serve_http.py +208 -0
  55. context_engine/services.py +252 -0
  56. context_engine/storage/__init__.py +0 -0
  57. context_engine/storage/backend.py +39 -0
  58. context_engine/storage/fts_store.py +112 -0
  59. context_engine/storage/graph_store.py +219 -0
  60. context_engine/storage/local_backend.py +109 -0
  61. context_engine/storage/remote_backend.py +117 -0
  62. context_engine/storage/vector_store.py +357 -0
  63. context_engine/utils.py +72 -0
@@ -0,0 +1,208 @@
1
+ """HTTP API server for remote context engine — exposes storage + compression endpoints.
2
+
3
+ Security model:
4
+ - Default bind is 127.0.0.1. Use --host 0.0.0.0 explicitly to expose on LAN.
5
+ - When bound to a non-loopback host, a bearer token is required. Set via the
6
+ CCE_API_TOKEN env var; requests without a matching `Authorization: Bearer <token>`
7
+ header get 401. Loopback requests skip auth for local development.
8
+ """
9
+ import hmac
10
+ import os
11
+ from pathlib import Path
12
+
13
+ from context_engine.config import load_config, PROJECT_CONFIG_NAME
14
+ from context_engine.storage.local_backend import LocalBackend
15
+ from context_engine.indexer.embedder import Embedder
16
+ from context_engine.compression.compressor import Compressor
17
+ from context_engine.models import Chunk, ChunkType, GraphNode, GraphEdge, NodeType, EdgeType
18
+
19
+ try:
20
+ from aiohttp import web
21
+ except ImportError as e:
22
+ raise ImportError(
23
+ "aiohttp is required for HTTP serve mode. "
24
+ "Install with: pip install 'code-context-engine[http]'"
25
+ ) from e
26
+
27
+
28
+ _MAX_REQUEST_BYTES = 10 * 1024 * 1024 # 10 MB — generous for bulk ingest, not unbounded
29
+ _LOOPBACK_HOSTS = {"127.0.0.1", "::1", "localhost"}
30
+
31
+
32
class ContextEngineHTTP:
    """aiohttp request handlers that expose a storage backend over HTTP.

    Every ``handle_*`` coroutine parses its input, delegates to ``self.backend``
    and wraps the outcome in a JSON response. The embedder and compressor are
    stored on the instance but are not used by any handler in this class.
    """

    def __init__(self, backend: LocalBackend, embedder: Embedder, compressor: Compressor):
        self.backend = backend
        self.embedder = embedder
        self.compressor = compressor

    async def handle_vector_search(self, request: web.Request) -> web.Response:
        """POST body: ``{"embedding": [...], "top_k": int}`` (``top_k`` defaults to 10)."""
        payload = await _read_json(request)
        query_embedding = payload["embedding"]
        limit = payload.get("top_k", 10)
        hits = await self.backend.vector_search(query_embedding, top_k=limit)
        serialised = [self._chunk_to_dict(hit) for hit in hits]
        return web.json_response({"results": serialised})

    async def handle_fts_search(self, request: web.Request) -> web.Response:
        """POST body: ``{"query": str, "top_k": int}`` (``top_k`` defaults to 30)."""
        payload = await _read_json(request)
        text = payload["query"]
        limit = payload.get("top_k", 30)
        scored = await self.backend.fts_search(text, top_k=limit)
        serialised = [{"id": chunk_id, "score": score} for chunk_id, score in scored]
        return web.json_response({"results": serialised})

    async def handle_chunks_by_ids(self, request: web.Request) -> web.Response:
        """POST body: ``{"ids": [...]}``; answers 400 when ``ids`` is not a list."""
        payload = await _read_json(request)
        requested = payload.get("ids", [])
        if not isinstance(requested, list):
            return web.json_response({"error": "ids must be a list"}, status=400)
        found = await self.backend.get_chunks_by_ids(requested)
        serialised = [self._chunk_to_dict(chunk) for chunk in found]
        return web.json_response({"results": serialised})

    async def handle_graph_neighbors(self, request: web.Request) -> web.Response:
        """POST body: ``{"node_id": str, "edge_type": str}``; ``edge_type`` is optional."""
        payload = await _read_json(request)
        node_id = payload["node_id"]
        raw_edge_type = payload.get("edge_type")
        # A missing or falsy edge_type means "no filter"; otherwise it must be a
        # valid EdgeType value (an invalid one raises ValueError).
        edge_filter = EdgeType(raw_edge_type) if raw_edge_type else None
        neighbours = await self.backend.graph_neighbors(node_id, edge_type=edge_filter)
        serialised = [self._node_to_dict(node) for node in neighbours]
        return web.json_response({"results": serialised})

    async def handle_ingest(self, request: web.Request) -> web.Response:
        """POST body with optional ``chunks``, ``nodes`` and ``edges`` lists to store."""
        payload = await _read_json(request)
        chunks = [self._dict_to_chunk(item) for item in payload.get("chunks", [])]
        nodes = [self._dict_to_node(item) for item in payload.get("nodes", [])]
        edges = [self._dict_to_edge(item) for item in payload.get("edges", [])]
        await self.backend.ingest(chunks, nodes, edges)
        return web.json_response({"ok": True})

    async def handle_get_chunk(self, request: web.Request) -> web.Response:
        """GET one chunk by the ``chunk_id`` path segment; 404 when it is unknown."""
        wanted = request.match_info["chunk_id"]
        chunk = await self.backend.get_chunk_by_id(wanted)
        if chunk is not None:
            return web.json_response(self._chunk_to_dict(chunk))
        return web.json_response({"error": "not found"}, status=404)

    async def handle_delete_file(self, request: web.Request) -> web.Response:
        """DELETE everything stored for the ``file_path`` path segment."""
        file_path = request.match_info["file_path"]
        # Absolute paths and ".." components are refused: the backend call is
        # SQL-only today, but a relative project path is the safer contract.
        is_absolute = file_path.startswith("/")
        if is_absolute or ".." in Path(file_path).parts:
            return web.json_response({"error": "invalid file_path"}, status=400)
        await self.backend.delete_by_file(file_path)
        return web.json_response({"ok": True})

    async def handle_health(self, request: web.Request) -> web.Response:
        """Liveness endpoint; always answers ``{"status": "ok"}``."""
        return web.json_response({"status": "ok"})

    def _chunk_to_dict(self, chunk):
        """Flatten a chunk object into a JSON-serialisable dict."""
        return {
            "id": chunk.id,
            "content": chunk.content,
            "chunk_type": chunk.chunk_type.value,
            "file_path": chunk.file_path,
            "start_line": chunk.start_line,
            "end_line": chunk.end_line,
            "language": chunk.language,
            "embedding": chunk.embedding,
            "metadata": chunk.metadata,
        }

    def _dict_to_chunk(self, d):
        """Build a ``Chunk`` from a request dict; ``embedding`` and ``metadata`` are optional."""
        return Chunk(
            id=d["id"],
            content=d["content"],
            chunk_type=ChunkType(d["chunk_type"]),
            file_path=d["file_path"],
            start_line=d["start_line"],
            end_line=d["end_line"],
            language=d["language"],
            embedding=d.get("embedding"),
            metadata=d.get("metadata", {}),
        )

    def _node_to_dict(self, node):
        """Flatten a graph node into a JSON-serialisable dict."""
        return {
            "id": node.id,
            "node_type": node.node_type.value,
            "name": node.name,
            "file_path": node.file_path,
        }

    def _dict_to_node(self, d):
        """Build a ``GraphNode`` from a request dict."""
        return GraphNode(
            id=d["id"],
            node_type=NodeType(d["node_type"]),
            name=d["name"],
            file_path=d["file_path"],
        )

    def _dict_to_edge(self, d):
        """Build a ``GraphEdge`` from a request dict."""
        return GraphEdge(
            source_id=d["source_id"],
            target_id=d["target_id"],
            edge_type=EdgeType(d["edge_type"]),
        )
112
+
113
+
114
async def _read_json(request: web.Request) -> dict:
    """Parse the request body as a JSON object.

    Returns:
        The decoded body, guaranteed to be a ``dict``.

    Raises:
        web.HTTPBadRequest: the body is not valid JSON, or it is valid JSON
            whose top-level value is not an object.
        web.HTTPException: any HTTP error raised by aiohttp while reading the
            body (for example the 413 produced when the payload exceeds the
            application's ``client_max_size``) is passed through unchanged.
    """
    try:
        payload = await request.json()
    except web.HTTPException:
        # Keep aiohttp's own status code rather than relabelling it as
        # "invalid JSON" with a 400.
        raise
    except Exception as e:
        raise web.HTTPBadRequest(
            text=f'{{"error": "invalid JSON: {type(e).__name__}"}}',
            content_type="application/json",
        ) from e
    if not isinstance(payload, dict):
        # Handlers index the payload by key; a list or scalar body would
        # otherwise surface as an unhandled TypeError/AttributeError (HTTP 500).
        raise web.HTTPBadRequest(
            text='{"error": "request body must be a JSON object"}',
            content_type="application/json",
        )
    return payload
122
+
123
+
124
@web.middleware
async def _error_middleware(request, handler):
    """Translate common handler failures into JSON 400 responses.

    aiohttp HTTP exceptions pass through untouched. A ``KeyError`` is reported
    as a missing request field and a ``ValueError`` is reported with its
    message. Any other exception propagates.
    """
    try:
        response = await handler(request)
    except web.HTTPException:
        raise
    except KeyError as missing:
        field_name = missing.args[0]
        return web.json_response({"error": f"missing field: {field_name}"}, status=400)
    except ValueError as invalid:
        return web.json_response({"error": str(invalid)}, status=400)
    return response
134
+
135
+
136
def _make_auth_middleware(expected_token: str | None):
    """Build the bearer-token authentication middleware.

    Args:
        expected_token: the shared secret clients must present, or ``None`` /
            empty when no token is configured.

    Returns:
        An aiohttp middleware applying these rules in order:

        * ``/health`` is always served without authentication.
        * Requests whose peer address is in ``_LOOPBACK_HOSTS`` skip auth.
        * With no token configured, every other request gets 503.
        * Otherwise the request needs ``Authorization: Bearer <token>`` with a
          matching token, or it gets 401.
    """
    # Compare as bytes: hmac.compare_digest() raises TypeError for str
    # arguments containing non-ASCII characters, which would turn a crafted
    # header into a 500 instead of a 401. surrogateescape keeps undecodable
    # header/env bytes encodable.
    expected_bytes = (
        expected_token.encode("utf-8", "surrogateescape") if expected_token else b""
    )

    @web.middleware
    async def _auth(request, handler):
        # Health check is always open — used by liveness probes.
        if request.path == "/health":
            return await handler(request)

        remote = request.remote or ""
        # Loopback requests skip auth regardless of token setting — local dev UX.
        # NOTE(review): behind a local reverse proxy the peer address is
        # presumably the proxy's loopback address — confirm deployment guidance.
        if remote in _LOOPBACK_HOSTS:
            return await handler(request)

        if not expected_token:
            # Bound to non-loopback but no token configured: refuse. Prevents
            # accidentally exposing an unauthenticated server to a network.
            return web.json_response(
                {"error": "server is not configured for non-loopback access; set CCE_API_TOKEN"},
                status=503,
            )

        auth_header = request.headers.get("Authorization", "")
        if not auth_header.startswith("Bearer "):
            return web.json_response({"error": "missing bearer token"}, status=401)
        presented = auth_header[len("Bearer "):].encode("utf-8", "surrogateescape")
        # Constant-time comparison to avoid leaking the token through timing.
        if not hmac.compare_digest(presented, expected_bytes):
            return web.json_response({"error": "invalid token"}, status=401)
        return await handler(request)

    return _auth
165
+
166
+
167
def create_app(backend, embedder, compressor, *, api_token: str | None = None) -> web.Application:
    """Assemble the aiohttp application with middlewares and routes.

    The auth middleware runs before the error middleware, and request bodies
    are capped at ``_MAX_REQUEST_BYTES``.
    """
    api = ContextEngineHTTP(backend, embedder, compressor)
    middleware_chain = [_make_auth_middleware(api_token), _error_middleware]
    app = web.Application(
        client_max_size=_MAX_REQUEST_BYTES,
        middlewares=middleware_chain,
    )

    post_routes = (
        ("/vector_search", api.handle_vector_search),
        ("/fts_search", api.handle_fts_search),
        ("/chunks_by_ids", api.handle_chunks_by_ids),
        ("/graph_neighbors", api.handle_graph_neighbors),
        ("/ingest", api.handle_ingest),
    )

    router = app.router
    router.add_get("/health", api.handle_health)
    for route_path, route_handler in post_routes:
        router.add_post(route_path, route_handler)
    router.add_get("/chunk/{chunk_id}", api.handle_get_chunk)
    # ".*" lets file_path contain slashes.
    router.add_delete("/file/{file_path:.*}", api.handle_delete_file)
    return app
182
+
183
+
184
def run_http_server(config=None, host: str = "127.0.0.1", port: int = 8765) -> None:
    """Build the backend, embedder and compressor, then serve the HTTP API.

    Args:
        config: a loaded configuration object; when ``None`` it is loaded,
            using the project config file in the current directory if present.
        host: interface to bind.
        port: TCP port to bind.

    Raises:
        SystemExit: ``host`` is not a loopback host and ``CCE_API_TOKEN`` is
            unset or empty.
    """
    if config is None:
        candidate = Path.cwd() / PROJECT_CONFIG_NAME
        project_path = candidate if candidate.exists() else None
        config = load_config(project_path=project_path)

    # Storage is namespaced by the name of the current working directory.
    project_dir_name = Path.cwd().name
    storage_base = Path(config.storage_path) / project_dir_name
    storage_base.mkdir(parents=True, exist_ok=True)

    backend = LocalBackend(base_path=str(storage_base))
    embedder = Embedder(model_name=config.embedding_model)
    compressor = Compressor(model=config.compression_model, cache=backend)

    # An empty environment value counts as "no token".
    api_token = os.environ.get("CCE_API_TOKEN") or None
    exposed_to_network = host not in _LOOPBACK_HOSTS
    if exposed_to_network and not api_token:
        raise SystemExit(
            f"Refusing to bind {host}:{port} without CCE_API_TOKEN set. "
            "Either bind --host 127.0.0.1 or export CCE_API_TOKEN=<secret>."
        )

    app = create_app(backend, embedder, compressor, api_token=api_token)
    print(f"Context engine HTTP server starting on {host}:{port}")
    if api_token:
        print("Auth: bearer token required for non-loopback requests")
    web.run_app(app, host=host, port=port, print=None)
@@ -0,0 +1,252 @@
1
+ """Service management for CCE — Ollama and Dashboard start/stop/status.
2
+
3
+ PID files live in <storage_base>/pids/ where storage_base is resolved
4
+ from config.yaml (defaults to ~/.cce):
5
+ ollama.pid PID of the ollama process CCE started
6
+ dashboard.pid PID of the dashboard process CCE started
7
+ dashboard.port Port the dashboard is running on
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ import signal
14
+ import socket
15
+ import subprocess
16
+ from pathlib import Path
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+ _DASHBOARD_DEFAULT_PORT = 8080
21
+
22
+
23
def _storage_base() -> Path:
    """Return the parent directory of the configured storage path.

    If the configuration cannot be loaded for any reason, the failure is
    logged at debug level and ``_CCE_HOME`` from the config module is returned.
    """
    try:
        from context_engine.config import load_config
        storage_path = load_config().storage_path
        return Path(storage_path).parent
    except Exception as exc:
        log.debug("Could not load config for storage base, using default: %s", exc)
        from context_engine.config import _CCE_HOME
        return _CCE_HOME
33
+
34
+
35
def _pid_dir() -> Path:
    """Return the ``pids`` directory under the storage base, creating it if needed."""
    pid_directory = _storage_base() / "pids"
    pid_directory.mkdir(parents=True, exist_ok=True)
    return pid_directory
39
+
40
+
41
def _read_pid(name: str) -> int | None:
    """Read ``<name>.pid`` from the PID directory.

    Returns the stored integer, or ``None`` when the file is missing or its
    content is not a valid integer.
    """
    pid_file = _pid_dir() / f"{name}.pid"
    try:
        recorded = pid_file.read_text()
        pid = int(recorded.strip())
    except (FileNotFoundError, ValueError):
        return None
    return pid
47
+
48
+
49
def _write_pid(name: str, pid: int) -> None:
    """Record ``pid`` in ``<name>.pid`` inside the PID directory, overwriting any old value."""
    pid_file = _pid_dir() / f"{name}.pid"
    pid_file.write_text(str(pid))
51
+
52
+
53
def _remove_pid(name: str) -> None:
    """Delete ``<name>.pid`` from the PID directory; a missing file is not an error."""
    (_pid_dir() / f"{name}.pid").unlink(missing_ok=True)
56
+
57
+
58
+ def _process_alive(pid: int) -> bool:
59
+ try:
60
+ os.kill(pid, 0)
61
+ return True
62
+ except ProcessLookupError:
63
+ return False
64
+ except PermissionError:
65
+ # Process exists but owned by another user
66
+ return True
67
+
68
+
69
+ def _check_port_open(port: int, host: str = "127.0.0.1") -> bool:
70
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
71
+ s.settimeout(0.5)
72
+ return s.connect_ex((host, port)) == 0
73
+
74
+
75
+ def _ollama_running() -> bool:
76
+ """Check if Ollama is responding on its default port."""
77
+ try:
78
+ import httpx
79
+ resp = httpx.get("http://localhost:11434/api/tags", timeout=2.0)
80
+ return resp.status_code == 200
81
+ except Exception:
82
+ return False
83
+
84
+
85
+ def _mcp_running() -> bool:
86
+ """Check if a cce serve process is running via pgrep (or ps fallback)."""
87
+ try:
88
+ result = subprocess.run(
89
+ ["pgrep", "-f", "cce serve"],
90
+ capture_output=True, text=True, timeout=3,
91
+ )
92
+ if result.returncode == 0:
93
+ return True
94
+ # returncode 1 = no matches (normal). Any other code or stderr
95
+ # suggests pgrep itself failed — fall through to ps fallback.
96
+ if result.returncode == 1 and not result.stderr.strip():
97
+ return False
98
+ except FileNotFoundError:
99
+ pass
100
+ except Exception:
101
+ pass
102
+ # Fallback: ps with grep exclusion
103
+ try:
104
+ result = subprocess.run(
105
+ ["ps", "aux"], capture_output=True, text=True, timeout=3,
106
+ )
107
+ for line in result.stdout.splitlines():
108
+ if "cce serve" in line and "grep" not in line:
109
+ return True
110
+ except Exception:
111
+ pass
112
+ return False
113
+
114
+
115
+ # ── Public status API ─────────────────────────────────────────────────────────
116
+
117
def get_ollama_status() -> dict:
    """Describe the Ollama service.

    Returns a dict with ``name``, ``running`` (the HTTP endpoint answers),
    ``managed`` (a PID recorded by CCE is still alive) and ``detail`` (the
    address, suffixed with ``(external)`` when running but not managed).
    """
    is_running = _ollama_running()
    recorded_pid = _read_pid("ollama")
    is_managed = recorded_pid is not None and _process_alive(recorded_pid)

    if is_running:
        detail = "localhost:11434" + ("" if is_managed else " (external)")
    else:
        detail = ""

    return {
        "name": "ollama",
        "running": is_running,
        "managed": is_managed,
        "detail": detail,
    }
134
+
135
+
136
def get_dashboard_status() -> dict:
    """Describe the dashboard service.

    Returns a dict with ``name``, ``running``, ``managed``, ``port`` (from the
    ``dashboard.port`` file, or ``None``) and ``detail`` (the URL, or
    ``starting...`` when the PID is alive but the port is not answering yet).
    """
    port_file = _pid_dir() / "dashboard.port"
    try:
        port = int(port_file.read_text().strip())
    except (FileNotFoundError, ValueError):
        port = None

    recorded_pid = _read_pid("dashboard")
    is_managed = recorded_pid is not None and _process_alive(recorded_pid)

    if port and _check_port_open(port):
        is_running, detail = True, f"http://localhost:{port}"
    elif is_managed:
        # PID alive but port not answering yet (starting up)
        is_running, detail = True, "starting..."
    else:
        is_running, detail = False, ""

    return {
        "name": "dashboard",
        "running": is_running,
        "managed": is_managed,
        "port": port,
        "detail": detail,
    }
163
+
164
+
165
def get_mcp_status() -> dict:
    """Describe the MCP server process.

    ``managed`` is always ``False`` here: the detail string attributes the
    process's lifecycle to Claude Code rather than to CCE.
    """
    is_running = _mcp_running()
    detail = "managed by Claude Code" if is_running else ""
    return {
        "name": "mcp",
        "running": is_running,
        "managed": False,  # never CCE-managed; Claude Code owns this process
        "detail": detail,
    }
173
+
174
+
175
+ # ── Start/stop ────────────────────────────────────────────────────────────────
176
+
177
def start_ollama() -> tuple[bool, str]:
    """Launch ``ollama serve`` in the background and record its PID.

    Returns:
        ``(success, message)``. Fails without spawning anything when Ollama
        already answers on its port, when the ``ollama`` executable is missing,
        or when launching raises.
    """
    if _ollama_running():
        return False, "Ollama is already running."

    spawn_options = {
        "stdout": subprocess.DEVNULL,
        "stderr": subprocess.DEVNULL,
        # Detach from CCE's process group so SIGINT to the CLI doesn't
        # kill the background daemon. Works on both Linux and macOS.
        "start_new_session": True,
    }
    try:
        daemon = subprocess.Popen(["ollama", "serve"], **spawn_options)
        _write_pid("ollama", daemon.pid)
        return True, f"Ollama started (PID {daemon.pid})"
    except FileNotFoundError:
        return False, "ollama not found. Install: https://ollama.com (or `brew install ollama` on macOS)"
    except Exception as exc:
        return False, f"Failed to start Ollama: {exc}"
196
+
197
+
198
def stop_ollama() -> tuple[bool, str]:
    """Send SIGTERM to the Ollama process that CCE recorded a PID for.

    Returns:
        ``(success, message)``. Only a PID on record is ever signalled; an
        Ollama instance started elsewhere is reported as external. A stale
        PID file is removed.
    """
    recorded_pid = _read_pid("ollama")
    if recorded_pid is None:
        if _ollama_running():
            message = "Ollama is running but was not started by CCE (external process)."
        else:
            message = "Ollama is not running."
        return False, message

    if not _process_alive(recorded_pid):
        _remove_pid("ollama")
        return False, "Ollama process already stopped."

    try:
        os.kill(recorded_pid, signal.SIGTERM)
        _remove_pid("ollama")
        return True, f"Ollama stopped (PID {recorded_pid})"
    except Exception as exc:
        return False, f"Failed to stop Ollama: {exc}"
214
+
215
+
216
def start_dashboard(port: int = _DASHBOARD_DEFAULT_PORT) -> tuple[bool, str]:
    """Launch ``cce dashboard`` in the background and record its PID and port.

    Args:
        port: port passed to the dashboard via ``--port``.

    Returns:
        ``(success, message)``. Fails without spawning anything when the
        dashboard is already reported as running.
    """
    current = get_dashboard_status()
    if current["running"]:
        return False, f"Dashboard is already running at {current['detail']}"

    try:
        from context_engine.utils import resolve_cce_binary
        command = [resolve_cce_binary(), "dashboard", "--no-browser", "--port", str(port)]
        child = subprocess.Popen(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )
        _write_pid("dashboard", child.pid)
        port_file = _pid_dir() / "dashboard.port"
        port_file.write_text(str(port))
        return True, f"Dashboard started at http://localhost:{port} (PID {child.pid})"
    except Exception as exc:
        return False, f"Failed to start dashboard: {exc}"
235
+
236
+
237
def stop_dashboard() -> tuple[bool, str]:
    """Send SIGTERM to the recorded dashboard process and clear its PID/port files.

    Returns:
        ``(success, message)``. With no PID on record nothing is touched; a
        stale record is cleaned up and reported as already stopped.
    """
    recorded_pid = _read_pid("dashboard")
    if recorded_pid is None:
        return False, "Dashboard is not running (no PID on record)."

    def _forget_dashboard() -> None:
        # Drop both bookkeeping files: the PID and the port.
        _remove_pid("dashboard")
        (_pid_dir() / "dashboard.port").unlink(missing_ok=True)

    if not _process_alive(recorded_pid):
        _forget_dashboard()
        return False, "Dashboard process already stopped."

    try:
        os.kill(recorded_pid, signal.SIGTERM)
        _forget_dashboard()
        return True, f"Dashboard stopped (PID {recorded_pid})"
    except Exception as exc:
        return False, f"Failed to stop dashboard: {exc}"
File without changes
@@ -0,0 +1,39 @@
1
+ """Storage backend protocol — implemented by local and remote backends."""
2
+ from typing import Protocol, runtime_checkable
3
+
4
+ from context_engine.models import Chunk, GraphNode, GraphEdge, NodeType, EdgeType
5
+
6
+
7
@runtime_checkable
class StorageBackend(Protocol):
    """Structural interface that storage backends implement.

    Because the protocol is ``runtime_checkable``, ``isinstance`` checks only
    verify that these methods exist, not their signatures.
    """

    async def ingest(
        self, chunks: list[Chunk], nodes: list[GraphNode], edges: list[GraphEdge]
    ) -> None:
        """Store the given chunks, graph nodes and graph edges."""
        ...

    async def vector_search(
        self, query_embedding: list[float], top_k: int = 10, filters: dict | None = None
    ) -> list[Chunk]:
        """Return up to ``top_k`` chunks for ``query_embedding``, optionally filtered."""
        ...

    async def graph_neighbors(
        self, node_id: str, edge_type: EdgeType | None = None
    ) -> list[GraphNode]:
        """Return nodes adjacent to ``node_id``, optionally limited to one edge type."""
        ...

    async def get_chunk_by_id(self, chunk_id: str) -> Chunk | None:
        """Return the chunk with this id, or ``None``."""
        ...

    async def delete_by_file(self, file_path: str) -> None:
        """Remove stored data associated with ``file_path``."""
        ...

    async def fts_search(self, query: str, top_k: int = 30) -> list[tuple[str, float]]:
        """Full-text search returning up to ``top_k`` ``(chunk_id, score)`` pairs."""
        ...

    async def get_chunks_by_ids(self, chunk_ids: list[str]) -> list[Chunk]:
        """Return the chunks for the given ids."""
        ...
@@ -0,0 +1,112 @@
1
+ """SQLite FTS5 full-text search store."""
2
+ import asyncio
3
+ import logging
4
+ import os
5
+ import sqlite3
6
+ from threading import RLock
7
+
8
+ from context_engine.models import Chunk
9
+
10
+ log = logging.getLogger(__name__)
11
+
12
+ _MAX_CONTENT_CHARS = 5_000
13
+
14
+
15
+ def _escape_fts5(query: str) -> str:
16
+ """Wrap user input as an FTS5 phrase to avoid operator injection."""
17
+ return '"' + query.replace('"', '""') + '"'
18
+
19
+
20
class FTSStore:
    """Single-connection SQLite FTS store, serialised with an RLock.

    `check_same_thread=False` only disables thread ownership checks; concurrent
    operations on one sqlite3 connection are still unsafe. Mirrors VectorStore's
    locking pattern so dashboard/MCP/reindex calls running through asyncio
    .to_thread don't interleave on the connection.

    Every write runs inside ``with self._conn:`` so it is committed on success
    and rolled back on failure, instead of leaving a half-applied transaction
    for the next commit to persist.
    """

    def __init__(self, db_path: str) -> None:
        """Open (or create) ``<db_path>/fts.db`` and ensure the FTS5 table exists.

        Args:
            db_path: directory that holds the database file; created if missing.
        """
        os.makedirs(db_path, exist_ok=True)
        self._lock = RLock()
        self._conn = sqlite3.connect(
            os.path.join(db_path, "fts.db"), check_same_thread=False
        )
        try:
            with self._lock, self._conn:
                self._conn.execute(
                    "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts "
                    "USING fts5(id UNINDEXED, content, file_path, language, chunk_type)"
                )
        except Exception:
            # Don't leak the connection when the schema cannot be created.
            self._conn.close()
            raise

    def _ingest_sync(self, chunks: list[Chunk]) -> None:
        """Insert ``chunks`` in one batch, truncating content to ``_MAX_CONTENT_CHARS``."""
        # Slicing already leaves shorter strings untouched.
        rows = [
            (
                chunk.id,
                chunk.content[:_MAX_CONTENT_CHARS],
                chunk.file_path,
                chunk.language,
                chunk.chunk_type.value,
            )
            for chunk in chunks
        ]
        # NOTE(review): the table declares no unique constraint on `id`, so
        # OR REPLACE presumably only resolves rowid conflicts — confirm that
        # callers delete a file's old rows before re-ingesting it.
        with self._lock, self._conn:
            # executemany packs all rows into one prepared-statement batch.
            self._conn.executemany(
                "INSERT OR REPLACE INTO chunks_fts(id, content, file_path, language, chunk_type) "
                "VALUES (?, ?, ?, ?, ?)",
                rows,
            )

    def _search_sync(self, escaped_query: str, top_k: int) -> list[tuple[str, float]]:
        """Run an FTS5 MATCH and return up to ``top_k`` ``(id, rank)`` pairs, ordered by rank."""
        with self._lock:
            cursor = self._conn.execute(
                "SELECT id, rank FROM chunks_fts WHERE chunks_fts MATCH ? "
                "ORDER BY rank LIMIT ?",
                (escaped_query, top_k),
            )
            return [(row[0], float(row[1])) for row in cursor.fetchall()]

    def _delete_sync(self, file_path: str) -> None:
        """Delete every row whose ``file_path`` equals the argument."""
        with self._lock, self._conn:
            self._conn.execute(
                "DELETE FROM chunks_fts WHERE file_path = ?", (file_path,)
            )

    def _delete_files_sync(self, file_paths: list[str]) -> None:
        """Delete rows for many files; all batches succeed or none are applied."""
        if not file_paths:
            return
        from context_engine.utils import batched_params

        with self._lock, self._conn:
            for batch in batched_params(file_paths):
                # Only "?" placeholders are interpolated; values stay bound.
                placeholders = ",".join("?" * len(batch))
                self._conn.execute(
                    f"DELETE FROM chunks_fts WHERE file_path IN ({placeholders})",
                    batch,
                )

    async def ingest(self, chunks: list[Chunk]) -> None:
        """Index ``chunks`` in a worker thread; an empty list is a no-op."""
        if not chunks:
            return
        await asyncio.to_thread(self._ingest_sync, chunks)

    async def search(self, query: str, top_k: int = 30) -> list[tuple[str, float]]:
        """Search for ``query`` as a literal phrase; a blank query returns ``[]``."""
        if not query.strip():
            return []
        return await asyncio.to_thread(self._search_sync, _escape_fts5(query), top_k)

    def clear(self) -> None:
        """Synchronously remove every row from the index."""
        with self._lock, self._conn:
            self._conn.execute("DELETE FROM chunks_fts")

    async def delete_by_file(self, file_path: str) -> None:
        """Remove the rows of one file in a worker thread."""
        await asyncio.to_thread(self._delete_sync, file_path)

    async def delete_by_files(self, file_paths: list[str]) -> None:
        """Remove the rows of several files in a worker thread."""
        await asyncio.to_thread(self._delete_files_sync, file_paths)

    def close(self) -> None:
        """Close the underlying connection; the store must not be used afterwards."""
        with self._lock:
            self._conn.close()