code-context-engine 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context_engine-0.4.0.dist-info/METADATA +389 -0
- code_context_engine-0.4.0.dist-info/RECORD +63 -0
- code_context_engine-0.4.0.dist-info/WHEEL +5 -0
- code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
- code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
- code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
- context_engine/__init__.py +3 -0
- context_engine/cli.py +2848 -0
- context_engine/cli_style.py +66 -0
- context_engine/compression/__init__.py +0 -0
- context_engine/compression/compressor.py +144 -0
- context_engine/compression/ollama_client.py +33 -0
- context_engine/compression/output_rules.py +77 -0
- context_engine/compression/prompts.py +9 -0
- context_engine/compression/quality.py +37 -0
- context_engine/config.py +198 -0
- context_engine/dashboard/__init__.py +0 -0
- context_engine/dashboard/_page.py +1548 -0
- context_engine/dashboard/server.py +429 -0
- context_engine/editors.py +265 -0
- context_engine/event_bus.py +24 -0
- context_engine/indexer/__init__.py +0 -0
- context_engine/indexer/chunker.py +147 -0
- context_engine/indexer/embedder.py +154 -0
- context_engine/indexer/embedding_cache.py +168 -0
- context_engine/indexer/git_hooks.py +73 -0
- context_engine/indexer/git_indexer.py +136 -0
- context_engine/indexer/ignorefile.py +96 -0
- context_engine/indexer/manifest.py +78 -0
- context_engine/indexer/pipeline.py +624 -0
- context_engine/indexer/secrets.py +332 -0
- context_engine/indexer/watcher.py +109 -0
- context_engine/integration/__init__.py +0 -0
- context_engine/integration/bootstrap.py +76 -0
- context_engine/integration/git_context.py +132 -0
- context_engine/integration/mcp_server.py +1825 -0
- context_engine/integration/session_capture.py +306 -0
- context_engine/memory/__init__.py +6 -0
- context_engine/memory/compressor.py +344 -0
- context_engine/memory/db.py +922 -0
- context_engine/memory/extractive.py +106 -0
- context_engine/memory/grammar.py +419 -0
- context_engine/memory/hook_installer.py +258 -0
- context_engine/memory/hook_server.py +83 -0
- context_engine/memory/hooks.py +327 -0
- context_engine/memory/migrate.py +268 -0
- context_engine/models.py +96 -0
- context_engine/pricing.py +104 -0
- context_engine/project_commands.py +296 -0
- context_engine/retrieval/__init__.py +0 -0
- context_engine/retrieval/confidence.py +47 -0
- context_engine/retrieval/query_parser.py +105 -0
- context_engine/retrieval/retriever.py +199 -0
- context_engine/serve_http.py +208 -0
- context_engine/services.py +252 -0
- context_engine/storage/__init__.py +0 -0
- context_engine/storage/backend.py +39 -0
- context_engine/storage/fts_store.py +112 -0
- context_engine/storage/graph_store.py +219 -0
- context_engine/storage/local_backend.py +109 -0
- context_engine/storage/remote_backend.py +117 -0
- context_engine/storage/vector_store.py +357 -0
- context_engine/utils.py +72 -0
|
@@ -0,0 +1,624 @@
|
|
|
1
|
+
"""Reusable indexing pipeline — shared by the CLI (`cce index`) and MCP (`reindex`).
|
|
2
|
+
|
|
3
|
+
This module owns the full index-a-project flow so the CLI and MCP server don't
|
|
4
|
+
duplicate logic and can't drift. The pipeline hands a structured `IndexResult` back to callers so
|
|
5
|
+
they can format their own output (click.echo, MCP text response, logs).
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import hashlib
|
|
11
|
+
import logging
|
|
12
|
+
import time
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Iterable
|
|
16
|
+
|
|
17
|
+
import subprocess
|
|
18
|
+
|
|
19
|
+
from context_engine.indexer.chunker import Chunker
|
|
20
|
+
from context_engine.indexer.embedder import Embedder
|
|
21
|
+
from context_engine.indexer.embedding_cache import EmbeddingCache
|
|
22
|
+
from context_engine.indexer.git_indexer import index_commits
|
|
23
|
+
from context_engine.indexer.manifest import Manifest
|
|
24
|
+
from context_engine.models import ChunkType, GraphNode, GraphEdge, NodeType, EdgeType
|
|
25
|
+
from context_engine.storage.local_backend import LocalBackend
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Translation table: semantic chunk type -> graph node type.
#
# Before this table existed every non-function chunk was stored as
# NodeType.CLASS, so markdown / yaml / json / module-level fallback chunks all
# looked like classes in the graph and degraded related_context expansion.
# Chunk types missing from the table are handled by the lookup site's default.
_CHUNK_TO_NODE_TYPE = {
    # Code constructs
    ChunkType.FUNCTION: NodeType.FUNCTION,
    ChunkType.CLASS: NodeType.CLASS,
    ChunkType.MODULE: NodeType.MODULE,
    # Prose: comments are filed as documentation nodes
    ChunkType.DOC: NodeType.DOC,
    ChunkType.COMMENT: NodeType.DOC,
    # History / memory artefacts
    ChunkType.COMMIT: NodeType.COMMIT,
    ChunkType.SESSION: NodeType.SESSION,
    ChunkType.DECISION: NodeType.DECISION,
}
|
|
42
|
+
|
|
43
|
+
log = logging.getLogger(__name__)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class PathOutsideProjectError(ValueError):
    """Signal that a requested target path lies outside the project root.

    Derives from ``ValueError``, so handlers written as ``except ValueError``
    also catch it.
    """
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _resolve_within(project_dir: Path, target: str | Path) -> Path:
    """Return `target` fully resolved, refusing anything outside `project_dir`.

    This is the path-traversal guard for callers that forward user input
    (for example `target_path="../../etc/passwd"`) into `run_indexing`. Call
    it before reading or walking `target` on the filesystem.

    Args:
        project_dir: Project root that the result must stay inside.
        target: Absolute path, or a path relative to `project_dir`.

    Returns:
        The resolved form of `target`.

    Raises:
        PathOutsideProjectError: If the resolved target is not located under
            the resolved project root.
    """
    candidate = Path(target)
    # Relative targets are anchored at the project root, not the process CWD.
    anchored = candidate if candidate.is_absolute() else project_dir / candidate
    resolved = anchored.resolve()
    root = project_dir.resolve()
    try:
        # relative_to() raises ValueError when `resolved` is not under `root`.
        resolved.relative_to(root)
    except ValueError as exc:
        raise PathOutsideProjectError(
            f"target path escapes project directory: {target}"
        ) from exc
    return resolved
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# Registry of one asyncio.Lock per storage directory. Indexing runs take the
# lock for their storage so a watcher-triggered re-index can't race a manual
# `cce index` or MCP `reindex` tool call on the same LanceDB table.
_PIPELINE_LOCKS: dict[str, asyncio.Lock] = {}


def _pipeline_lock(storage_key: str) -> asyncio.Lock:
    """Return the lock registered for `storage_key`, creating it on first use.

    Repeated calls with the same key return the same lock object; entries are
    never removed from the registry.
    """
    try:
        return _PIPELINE_LOCKS[storage_key]
    except KeyError:
        created = asyncio.Lock()
        _PIPELINE_LOCKS[storage_key] = created
        return created
|
|
82
|
+
|
|
83
|
+
# File suffixes that are never indexed: binary or otherwise non-text content
# (images, compiled artefacts, archives, ...) plus a couple of text formats
# that are too large or too noisy to be useful as context.
_SKIP_EXTENSIONS = {
    # -- images --
    ".png", ".jpg", ".jpeg", ".gif", ".bmp",
    ".ico", ".webp", ".tiff", ".svg",
    # -- compiled output / bytecode --
    ".pyc", ".pyo", ".class",
    ".o", ".so", ".dylib", ".dll", ".exe", ".wasm",
    # -- archives --
    ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
    ".jar", ".war",
    # -- databases and binary data --
    ".db", ".sqlite", ".sqlite3",
    ".bin", ".dat", ".pkl", ".pickle",
    ".parquet", ".arrow", ".lance",
    # -- audio / video --
    ".mp3", ".mp4", ".wav", ".avi", ".mov", ".flv", ".ogg", ".webm",
    # -- fonts --
    ".ttf", ".otf", ".woff", ".woff2", ".eot",
    # -- office / document formats (non-text) --
    ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
    # -- package lock files (huge, not useful for context) --
    ".lock",
    # -- source maps --
    ".map",
}
|
|
105
|
+
|
|
106
|
+
# Known extension → language mapping for tree-sitter and chunk metadata.
|
|
107
|
+
# Files with unlisted extensions are still indexed as "plaintext".
|
|
108
|
+
_LANGUAGE_MAP = {
|
|
109
|
+
".py": "python",
|
|
110
|
+
".js": "javascript",
|
|
111
|
+
".ts": "typescript",
|
|
112
|
+
".jsx": "javascript",
|
|
113
|
+
".tsx": "tsx",
|
|
114
|
+
".md": "markdown",
|
|
115
|
+
".php": "php",
|
|
116
|
+
".html": "html",
|
|
117
|
+
".htm": "html",
|
|
118
|
+
".css": "css",
|
|
119
|
+
".scss": "css",
|
|
120
|
+
".less": "css",
|
|
121
|
+
".json": "json",
|
|
122
|
+
".yaml": "yaml",
|
|
123
|
+
".yml": "yaml",
|
|
124
|
+
".toml": "toml",
|
|
125
|
+
".sh": "bash",
|
|
126
|
+
".bash": "bash",
|
|
127
|
+
".zsh": "bash",
|
|
128
|
+
".rb": "ruby",
|
|
129
|
+
".go": "go",
|
|
130
|
+
".rs": "rust",
|
|
131
|
+
".java": "java",
|
|
132
|
+
".c": "c",
|
|
133
|
+
".cpp": "cpp",
|
|
134
|
+
".h": "c",
|
|
135
|
+
".hpp": "cpp",
|
|
136
|
+
".swift": "swift",
|
|
137
|
+
".kt": "kotlin",
|
|
138
|
+
".kts": "kotlin",
|
|
139
|
+
".sql": "sql",
|
|
140
|
+
".graphql": "graphql",
|
|
141
|
+
".gql": "graphql",
|
|
142
|
+
".proto": "protobuf",
|
|
143
|
+
".xml": "xml",
|
|
144
|
+
".r": "r",
|
|
145
|
+
".R": "r",
|
|
146
|
+
".lua": "lua",
|
|
147
|
+
".ex": "elixir",
|
|
148
|
+
".exs": "elixir",
|
|
149
|
+
".erl": "erlang",
|
|
150
|
+
".hs": "haskell",
|
|
151
|
+
".scala": "scala",
|
|
152
|
+
".clj": "clojure",
|
|
153
|
+
".dart": "dart",
|
|
154
|
+
".vue": "vue",
|
|
155
|
+
".svelte": "svelte",
|
|
156
|
+
".pl": "perl",
|
|
157
|
+
".pm": "perl",
|
|
158
|
+
".cs": "csharp",
|
|
159
|
+
".fs": "fsharp",
|
|
160
|
+
".zig": "zig",
|
|
161
|
+
".nim": "nim",
|
|
162
|
+
".v": "vlang",
|
|
163
|
+
".tf": "terraform",
|
|
164
|
+
".hcl": "hcl",
|
|
165
|
+
".dockerfile": "dockerfile",
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@dataclass
class IndexResult:
    """Structured outcome of one indexing run.

    Returned by the pipeline so each caller can render its own output.
    """

    # Paths of files that were (re)chunked during this run.
    indexed_files: list[str] = field(default_factory=list)
    # Paths of files that could not be read as text and were skipped.
    skipped_files: list[str] = field(default_factory=list)
    # Paths that were pruned because the file is no longer on disk.
    deleted_files: list[str] = field(default_factory=list)
    # Number of chunks handed to the storage backend.
    total_chunks: int = 0
    # Human-readable messages for failures encountered along the way.
    errors: list[str] = field(default_factory=list)
    # Embedding-cache hit/miss counters taken from the most recent embedder
    # run; `cce index` prints them so users can see how much the cache saved.
    cache_hits: int = 0
    cache_misses: int = 0
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _iter_project_files(
    root: Path,
    ignore_set: set[str],
    skip_extensions: set[str],
    *,
    redact_secrets: bool = True,
    cceignore_patterns: list[str] | None = None,
) -> Iterable[Path]:
    """Yield indexable files under `root`, depth-first in sorted order.

    Filtering, in the order it is applied to every directory entry:

    1. Entries whose bare name is in `ignore_set` are dropped (files and
       directories alike).
    2. Symlinks are skipped outright to avoid loops; callers who need symlink
       following can resolve them before calling the pipeline.
    3. Entries whose resolved path was already visited are dropped.
    4. Entries matching `cceignore_patterns` (typically loaded from
       `.cceignore`) are dropped. The gitignore-style globs are evaluated
       against the entry's path relative to `root`, using `/` separators.
    5. Files whose suffix is in `skip_extensions` are dropped. The comparison
       is case-insensitive for lowercase entries, so `LOGO.PNG` is skipped by
       a `".png"` entry instead of being read and rejected later as binary.
    6. When `redact_secrets` is True (default), filenames matching well-known
       credential patterns (.env*, *.pem, secrets.yml, etc.) are skipped at
       the filesystem walk so they're never read or embedded. See
       `indexer/secrets.py` for the full pattern list.

    Directories that cannot be listed are silently skipped.

    Args:
        root: Directory to walk.
        ignore_set: Entry names to exclude at any depth.
        skip_extensions: File suffixes (with leading dot) to exclude.
        redact_secrets: Skip files whose name looks like a credential file.
        cceignore_patterns: Extra gitignore-style patterns, or None.

    Yields:
        Paths of the files that passed every filter.
    """
    patterns = cceignore_patterns or []
    # Project helpers are imported only for the features that are switched on.
    if redact_secrets:
        from context_engine.indexer.secrets import is_secret_file as _is_secret_file
    if patterns:
        from context_engine.indexer.ignorefile import matches_any as _ignore_matches
    seen: set[Path] = set()

    def _rel(entry: Path) -> str:
        try:
            return str(entry.relative_to(root)).replace("\\", "/")
        except ValueError:
            return entry.name

    def _has_skipped_suffix(entry: Path) -> bool:
        suffix = entry.suffix
        # The exact match keeps caller-supplied mixed-case entries working;
        # the lowercase match catches `.PNG`, `.Exe`, ... for the usual
        # all-lowercase skip lists.
        return suffix in skip_extensions or suffix.lower() in skip_extensions

    def walk(directory: Path) -> Iterable[Path]:
        try:
            entries = sorted(directory.iterdir())
        except OSError:  # includes PermissionError
            return
        for entry in entries:
            if entry.name in ignore_set:
                continue
            if entry.is_symlink():
                continue
            try:
                resolved = entry.resolve()
            except (OSError, RuntimeError):
                continue
            if resolved in seen:
                continue
            seen.add(resolved)
            is_dir = entry.is_dir()
            # Evaluate .cceignore against the path relative to `root`. Done
            # after the symlink/seen checks so we don't pay the cost on
            # entries we'd skip anyway.
            if patterns and _ignore_matches(_rel(entry), is_dir, patterns):
                continue
            if is_dir:
                yield from walk(entry)
            elif entry.is_file() and not _has_skipped_suffix(entry):
                if redact_secrets and _is_secret_file(entry):
                    log.info("indexer: skipping secret file %s", entry)
                    continue
                yield entry

    yield from walk(root)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# Skip any single file larger than this — protects the indexer from OOM on
# accidentally-committed log dumps, generated fixtures, vendored bundles, etc.
# 2 MB easily covers normal source files (the largest module in CPython's
# stdlib is ~250 KB) while ruling out the kind of file you'd never want in
# a semantic index anyway.
_MAX_FILE_BYTES = 2 * 1024 * 1024


def _safe_read(file_path: Path) -> str | None:
    """Read file as UTF-8 text; return None for binary, oversized, or unreadable files.

    The size limit is enforced by the read itself rather than by a prior
    `stat()`: at most `_MAX_FILE_BYTES + 1` bytes are ever pulled into memory,
    so a file that grows after being listed, or one whose reported size is not
    meaningful, cannot defeat the cap.

    Line endings are normalised the same way text-mode reading does
    (`\\r\\n` and lone `\\r` become `\\n`), so content hashes stay stable.

    Args:
        file_path: File to read.

    Returns:
        The decoded text, or None if the file cannot be opened or read, is
        larger than `_MAX_FILE_BYTES`, or is not valid UTF-8.
    """
    try:
        with file_path.open("rb") as handle:
            # One byte past the cap is enough to tell "too big" from "fits".
            raw = handle.read(_MAX_FILE_BYTES + 1)
    except OSError:
        return None
    if len(raw) > _MAX_FILE_BYTES:
        return None
    try:
        text = raw.decode("utf-8", errors="strict")
    except UnicodeDecodeError:
        return None
    # Universal-newline translation, matching Path.read_text().
    return text.replace("\r\n", "\n").replace("\r", "\n")
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
async def run_indexing(
    config,
    project_dir: str | Path,
    *,
    full: bool = False,
    target_path: str | None = None,
    log_fn=None,
    progress_fn=None,
    embed_progress_fn=None,
    phase_fn=None,
) -> IndexResult:
    """Index a project and report what happened as an `IndexResult`.

    The index lives in `<config.storage_path>/<project directory name>`; the
    directory is created if needed. Runs against the same storage directory
    are serialised through a per-storage lock.

    Options:
        full: When True, ignore the manifest and re-index everything visible.
        target_path: Restrict indexing to a single file or subtree.

    Optional callbacks:
        log_fn(msg): Verbose progress output.
        progress_fn(current, total): Called after each batch with file counts.
        embed_progress_fn(current, total): Called as embedding proceeds with
            chunk counts (only for cache misses; cache hits return instantly).
        phase_fn(msg): Called between major phases — "Embedding 32k chunks…",
            "Writing to index…" — so non-verbose callers can announce *what*
            is starting.

    `phase_fn` and `embed_progress_fn` are complementary and serve the same
    goal (don't look hung on large repos): the former fires once per phase,
    the latter per batch *within* the embed phase.
    """
    root = Path(project_dir)
    index_home = Path(config.storage_path) / root.name
    index_home.mkdir(parents=True, exist_ok=True)

    callbacks = {
        "log_fn": log_fn,
        "progress_fn": progress_fn,
        "embed_progress_fn": embed_progress_fn,
        "phase_fn": phase_fn,
    }
    # The lock is keyed on the storage directory, not the project path.
    async with _pipeline_lock(str(index_home)):
        outcome = await _run_indexing_locked(
            config,
            root,
            index_home,
            full=full,
            target_path=target_path,
            **callbacks,
        )
    return outcome
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
async def _run_indexing_locked(
    config,
    project_dir: Path,
    storage_base: Path,
    *,
    full: bool,
    target_path: str | None,
    log_fn,
    progress_fn=None,
    embed_progress_fn=None,
    phase_fn=None,
) -> IndexResult:
    """Index `project_dir` into the stores under `storage_base`.

    This is the body of `run_indexing`, which calls it while holding the
    per-storage pipeline lock. Phases, in order:

    1. Pick the files to scan (whole project, or just `target_path`).
    2. Read files in batches, redact inline secrets, hash the content and
       drop files the manifest reports as unchanged (unless `full`).
    3. Chunk the remaining files and build file/chunk graph nodes and edges.
    4. On full project-wide runs of a git checkout, add commit history.
    5. Embed all chunks. Old rows are deleted only after embedding succeeded,
       then the new data is ingested.
    6. On project-wide runs, prune files that vanished from disk.
    7. Save the manifest.

    Failures in embedding, deleting or ingesting are recorded in
    `IndexResult.errors` and end the run early *without* saving the manifest.

    Args:
        config: Engine configuration; reads `indexer_ignore`,
            `embedding_model` and the optional `indexer_redact_secrets`.
        project_dir: Project root.
        storage_base: Directory holding this project's index data.
        full: Ignore the manifest and re-index every visible file.
        target_path: Optional file or directory to restrict the run to.
        log_fn: Optional `log_fn(msg)` for verbose output.
        progress_fn: Optional `progress_fn(current, total)` per file batch.
        embed_progress_fn: Optional progress callback passed to the embedder.
        phase_fn: Optional `phase_fn(msg)` announced before long phases.

    Returns:
        The populated `IndexResult`.

    Raises:
        PathOutsideProjectError: If `target_path` resolves outside
            `project_dir`.
    """
    backend = LocalBackend(base_path=str(storage_base))
    chunker = Chunker()
    manifest = Manifest(manifest_path=storage_base / "manifest.json")
    ignore_set = set(config.indexer_ignore)
    redact_on = getattr(config, "indexer_redact_secrets", True)
    if redact_on:
        from context_engine.indexer.secrets import is_secret_file, redact_secrets
    # Load .cceignore once per indexing run. Patterns are evaluated against
    # paths relative to project_dir; see indexer/ignorefile.py.
    from context_engine.indexer.ignorefile import load_ignore_patterns
    cceignore_patterns = load_ignore_patterns(project_dir)
    if cceignore_patterns and log_fn:
        log_fn(f" [.cceignore] {len(cceignore_patterns)} pattern(s) loaded")
    result = IndexResult()

    # `_resolve_within` hands back a fully resolved path, so files found via
    # `target_path` may not share a prefix with `project_dir` as given (for
    # example when it is relative or reached through a symlink). Keep the
    # resolved root around as a fallback base for relative paths.
    project_root_resolved = project_dir.resolve()

    def _project_rel(fp: Path) -> str:
        """Return `fp` relative to the project root, as stored in the manifest."""
        try:
            return str(fp.relative_to(project_dir))
        except ValueError:
            return str(fp.relative_to(project_root_resolved))

    # Determine the set of files to scan.
    if target_path:
        target = _resolve_within(project_dir, target_path)
        if target.is_file():
            file_iter = [target] if target.suffix not in _SKIP_EXTENSIONS else []
            # An explicitly targeted file must not bypass the filename-level
            # secret filter that the directory walk applies.
            if file_iter and redact_on and is_secret_file(target):
                log.info("indexer: skipping secret file %s", target)
                if log_fn:
                    log_fn(f" [skip] {target_path} (secret file)")
                file_iter = []
        elif target.is_dir():
            file_iter = list(_iter_project_files(
                target, ignore_set, _SKIP_EXTENSIONS,
                redact_secrets=redact_on,
                cceignore_patterns=cceignore_patterns,
            ))
        else:
            result.errors.append(f"Target path not found: {target_path}")
            return result
    else:
        file_iter = list(_iter_project_files(
            project_dir, ignore_set, _SKIP_EXTENSIONS,
            redact_secrets=redact_on,
            cceignore_patterns=cceignore_patterns,
        ))

    current_rel_paths: set[str] = set()
    all_chunks: list = []
    all_nodes: list[GraphNode] = []
    all_edges: list[GraphEdge] = []
    files_to_replace: list[str] = []

    # Read + chunk asynchronously — both are wrapped in asyncio.to_thread so
    # the I/O reads (kernel) and the chunker work (CPU-bound tree-sitter)
    # both overlap across files in a batch instead of executing serially.
    async def _read_file(fp: Path) -> tuple[Path, str | None]:
        return fp, await asyncio.to_thread(_safe_read, fp)

    async def _chunk_file(rel_path: str, content: str, language: str):
        """Run the chunker off the event loop and return its result.

        Exceptions are not handled here; the caller collects them through
        `asyncio.gather(..., return_exceptions=True)`.
        """
        return await asyncio.to_thread(
            chunker.chunk_with_imports, content, rel_path, language
        )

    # Process files in batches to pipeline I/O with chunking.
    _BATCH = 50
    for batch_start in range(0, len(file_iter), _BATCH):
        batch_paths = file_iter[batch_start:batch_start + _BATCH]

        # Async read all files in this batch concurrently
        read_tasks = [_read_file(fp) for fp in batch_paths]
        read_results = await asyncio.gather(*read_tasks)

        # First pass: hash + manifest check, decide which files actually need
        # re-chunking. This is cheap and synchronous; doing it upfront lets us
        # skip the chunker for unchanged files.
        to_chunk: list[tuple[Path, str, str, str, str]] = []  # (file_path, rel_path, content, content_hash, language)
        for file_path, content in read_results:
            rel_path = _project_rel(file_path)
            # Recorded even for skipped files so the prune step below does not
            # mistake them for deleted ones.
            current_rel_paths.add(rel_path)

            if content is None:
                result.skipped_files.append(rel_path)
                if log_fn:
                    log_fn(f" [skip] {rel_path} (binary or unreadable)")
                continue

            # Content-level secret redaction. Filename-level skipping
            # already happened when the file list was built, so a file
            # reaching this point is "indexable" — but the file might
            # still contain inline credentials (an AWS key in a config
            # comment, a JWT in a fixture). Redact those before they
            # reach the chunker, embedder, or vector store.
            if redact_on:
                content, fired = redact_secrets(content)
                if fired:
                    log.info(
                        "indexer: redacted %d secret(s) in %s (kinds: %s)",
                        len(fired), rel_path, ",".join(sorted(set(fired))),
                    )
                    if log_fn:
                        log_fn(f" [redact] {rel_path} ({len(fired)} secret(s))")

            # The hash is taken over the redacted text, i.e. over what is
            # actually indexed.
            content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
            if not full and not manifest.has_changed(rel_path, content_hash):
                if log_fn:
                    log_fn(f" [skip] {rel_path} (unchanged)")
                continue

            language = _LANGUAGE_MAP.get(file_path.suffix, "plaintext")
            to_chunk.append((file_path, rel_path, content, content_hash, language))

        # Chunk all changed files in this batch in parallel. tree-sitter is
        # a C extension that releases the GIL during parsing, so threads do
        # give real concurrency for chunking.
        if to_chunk:
            chunk_tasks = [
                _chunk_file(rel_path, content, language)
                for (_, rel_path, content, _, language) in to_chunk
            ]
            chunk_results = await asyncio.gather(*chunk_tasks, return_exceptions=True)

            for (file_path, rel_path, content, content_hash, language), chunk_outcome in zip(
                to_chunk, chunk_results
            ):
                if isinstance(chunk_outcome, Exception):
                    result.errors.append(f"Chunking failed for {rel_path}: {chunk_outcome}")
                    log.warning("Chunking failed for %s", rel_path, exc_info=chunk_outcome)
                    continue
                chunks, imported_modules = chunk_outcome

                # Defer the actual store delete to a single batched call below.
                files_to_replace.append(rel_path)

                file_node = GraphNode(
                    id=f"file_{rel_path}",
                    node_type=NodeType.FILE,
                    name=file_path.name,
                    file_path=rel_path,
                )
                all_nodes.append(file_node)

                for module in imported_modules:
                    all_edges.append(
                        GraphEdge(
                            source_id=file_node.id,
                            target_id=f"module_{module}",
                            edge_type=EdgeType.IMPORTS,
                        )
                    )

                for chunk in chunks:
                    node_type = _CHUNK_TO_NODE_TYPE.get(
                        chunk.chunk_type, NodeType.MODULE
                    )
                    # Heuristic display name: the text before the first "(",
                    # cut after its last ":"; chunks without "(" use their id.
                    node_name = (
                        chunk.content.split("(")[0].split(":")[-1].strip()
                        if "(" in chunk.content
                        else chunk.id
                    )
                    all_nodes.append(
                        GraphNode(
                            id=chunk.id,
                            node_type=node_type,
                            name=node_name,
                            file_path=rel_path,
                        )
                    )
                    all_edges.append(
                        GraphEdge(
                            source_id=file_node.id,
                            target_id=chunk.id,
                            edge_type=EdgeType.DEFINES,
                        )
                    )
                all_chunks.extend(chunks)
                manifest.update(rel_path, content_hash)
                result.indexed_files.append(rel_path)

        if progress_fn:
            progress_fn(min(batch_start + len(batch_paths), len(file_iter)), len(file_iter))

    # NOTE: replacement deletes for `files_to_replace` are deferred until
    # after embedding succeeds — see below. Deleting up front made the index
    # vulnerable to a transient embed/ingest failure wiping previously-good
    # data. The single batched delete still happens, just on the durable side
    # of the embedder call.

    # Index git history on full runs (skip for non-git projects)
    _is_git = (Path(project_dir) / ".git").is_dir()
    if full and not target_path and _is_git:
        try:
            git_chunks, git_nodes, git_edges = await index_commits(
                project_dir, since_sha=manifest.last_git_sha
            )
            all_chunks.extend(git_chunks)
            all_nodes.extend(git_nodes)
            all_edges.extend(git_edges)
            if git_chunks:
                head_result = await asyncio.to_thread(
                    subprocess.run,
                    ["git", "rev-parse", "HEAD"],
                    cwd=project_dir, capture_output=True, text=True, check=False,
                )
                if head_result.returncode == 0:
                    manifest.last_git_sha = head_result.stdout.strip()
                if log_fn:
                    log_fn(f" [git] {len(git_chunks)} commit(s) indexed")
        except Exception as exc:
            # Git history is best-effort: file indexing continues without it.
            log.warning("Git history indexing failed: %s", exc)

    if all_chunks:
        # Embedding is where first-run model downloads happen; isolate failures
        # here so we don't write an index with empty vectors. Crucially, the
        # replacement deletes (files_to_replace) have NOT happened yet, so a
        # download or model failure leaves the previous index intact.
        cache = EmbeddingCache(
            storage_base / "embedding_cache.db",
            model_name=config.embedding_model,
        )
        try:
            embedder = Embedder(model_name=config.embedding_model, cache=cache)
            if phase_fn:
                phase_fn(
                    f"Embedding {len(all_chunks):,} chunks "
                    f"(CPU-bound, can take several minutes on large repos)…"
                )
            try:
                embedder.embed(all_chunks, progress_fn=embed_progress_fn)
            except Exception as exc:
                msg = f"Embedding failed: {exc}"
                result.errors.append(msg)
                log.warning(msg, exc_info=exc)
                # Manifest was updated in-memory in the loop but never reaches
                # disk because we return before manifest.save(); the previous
                # on-disk manifest + index data are still valid.
                return result
            result.cache_hits = cache.hits
            result.cache_misses = cache.misses

            # On a full re-index we know the complete set of live chunk
            # hashes — opportunistically drop any cached embeddings whose
            # source content is no longer present anywhere in the index.
            # Without this the cache grows monotonically forever.
            if full and not target_path:
                try:
                    live_hashes = {
                        cache.content_hash(c.content) for c in all_chunks
                    }
                    pruned = cache.prune_orphans(live_hashes)
                    if pruned and log_fn:
                        log_fn(f" [cache] pruned {pruned} orphan embedding(s)")
                except Exception as exc:
                    log.debug("Embedding cache prune skipped: %s", exc)
        finally:
            # Runs on every exit from the try, including the early return.
            cache.close()

        # Embedding succeeded — now it's safe to drop the rows we're about to
        # replace. Still ordered before ingest so the new chunk IDs don't
        # collide with the old ones across the three stores.
        if files_to_replace:
            try:
                await backend.delete_by_files(files_to_replace)
            except Exception as exc:
                msg = f"Pre-ingest delete failed: {exc}"
                result.errors.append(msg)
                log.warning(msg, exc_info=exc)
                return result

        if phase_fn:
            phase_fn(f"Writing {len(all_chunks):,} chunks to vector + FTS + graph index…")
        try:
            await backend.ingest(all_chunks, all_nodes, all_edges)
        except Exception as exc:
            msg = f"Backend ingest failed: {exc}"
            result.errors.append(msg)
            log.warning(msg, exc_info=exc)
            return result

        result.total_chunks = len(all_chunks)
    elif files_to_replace:
        # No new chunks (e.g. all changed files chunked to nothing) but we
        # still need to drop their old rows.
        try:
            await backend.delete_by_files(files_to_replace)
        except Exception as exc:
            msg = f"Replacement delete failed: {exc}"
            result.errors.append(msg)
            log.warning(msg, exc_info=exc)
            return result

    # Prune chunks for files that were in the manifest but no longer on disk.
    # Only meaningful for project-wide runs; skip when a single path was targeted.
    if not target_path:
        previous_rel_paths = set(manifest._entries.keys())  # noqa: SLF001
        # Sorted so the delete call, the log lines and `deleted_files` come
        # out in a stable order.
        removed = sorted(previous_rel_paths - current_rel_paths)
        if removed:
            try:
                await backend.delete_by_files(removed)
            except Exception as exc:  # pragma: no cover - defensive
                result.errors.append(f"Failed to prune deleted files: {exc}")
                # Keep the manifest entries so the next run retries the prune.
                removed = []
        for deleted in removed:
            try:
                manifest.remove(deleted)
                result.deleted_files.append(deleted)
                if log_fn:
                    log_fn(f" [delete] {deleted} (no longer on disk)")
            except Exception as exc:  # pragma: no cover - defensive
                result.errors.append(f"Failed to prune {deleted}: {exc}")

    manifest.save()
    return result
|