code-context-engine 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context_engine-0.4.0.dist-info/METADATA +389 -0
- code_context_engine-0.4.0.dist-info/RECORD +63 -0
- code_context_engine-0.4.0.dist-info/WHEEL +5 -0
- code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
- code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
- code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
- context_engine/__init__.py +3 -0
- context_engine/cli.py +2848 -0
- context_engine/cli_style.py +66 -0
- context_engine/compression/__init__.py +0 -0
- context_engine/compression/compressor.py +144 -0
- context_engine/compression/ollama_client.py +33 -0
- context_engine/compression/output_rules.py +77 -0
- context_engine/compression/prompts.py +9 -0
- context_engine/compression/quality.py +37 -0
- context_engine/config.py +198 -0
- context_engine/dashboard/__init__.py +0 -0
- context_engine/dashboard/_page.py +1548 -0
- context_engine/dashboard/server.py +429 -0
- context_engine/editors.py +265 -0
- context_engine/event_bus.py +24 -0
- context_engine/indexer/__init__.py +0 -0
- context_engine/indexer/chunker.py +147 -0
- context_engine/indexer/embedder.py +154 -0
- context_engine/indexer/embedding_cache.py +168 -0
- context_engine/indexer/git_hooks.py +73 -0
- context_engine/indexer/git_indexer.py +136 -0
- context_engine/indexer/ignorefile.py +96 -0
- context_engine/indexer/manifest.py +78 -0
- context_engine/indexer/pipeline.py +624 -0
- context_engine/indexer/secrets.py +332 -0
- context_engine/indexer/watcher.py +109 -0
- context_engine/integration/__init__.py +0 -0
- context_engine/integration/bootstrap.py +76 -0
- context_engine/integration/git_context.py +132 -0
- context_engine/integration/mcp_server.py +1825 -0
- context_engine/integration/session_capture.py +306 -0
- context_engine/memory/__init__.py +6 -0
- context_engine/memory/compressor.py +344 -0
- context_engine/memory/db.py +922 -0
- context_engine/memory/extractive.py +106 -0
- context_engine/memory/grammar.py +419 -0
- context_engine/memory/hook_installer.py +258 -0
- context_engine/memory/hook_server.py +83 -0
- context_engine/memory/hooks.py +327 -0
- context_engine/memory/migrate.py +268 -0
- context_engine/models.py +96 -0
- context_engine/pricing.py +104 -0
- context_engine/project_commands.py +296 -0
- context_engine/retrieval/__init__.py +0 -0
- context_engine/retrieval/confidence.py +47 -0
- context_engine/retrieval/query_parser.py +105 -0
- context_engine/retrieval/retriever.py +199 -0
- context_engine/serve_http.py +208 -0
- context_engine/services.py +252 -0
- context_engine/storage/__init__.py +0 -0
- context_engine/storage/backend.py +39 -0
- context_engine/storage/fts_store.py +112 -0
- context_engine/storage/graph_store.py +219 -0
- context_engine/storage/local_backend.py +109 -0
- context_engine/storage/remote_backend.py +117 -0
- context_engine/storage/vector_store.py +357 -0
- context_engine/utils.py +72 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Parse git log into searchable chunks."""
|
|
2
|
+
import asyncio
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
import subprocess
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from context_engine.models import (
|
|
9
|
+
Chunk, ChunkType, GraphNode, GraphEdge, NodeType, EdgeType,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
log = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
_SHA_RE = re.compile(r"^[0-9a-f]{40}$")
|
|
15
|
+
|
|
16
|
+
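# NOTE(review): start-of-record delimiter; nothing in this module references it — the parsers split commit records on the NUL byte emitted by `%x00` instead. Confirm it is still needed.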
|
|
17
|
+
_RECORD_START = "---CCE_START---"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
async def index_commits(
    project_dir: Path,
    since_sha: str | None = None,
    max_commits: int = 200,
) -> tuple[list[Chunk], list[GraphNode], list[GraphEdge]]:
    """Parse recent git history into searchable chunks.

    Two `git log` invocations are made over the same revision range:

      1. ``--format=%H%n%an%n%ai%n%s%n%b%x00`` for commit metadata
         (NUL-terminated records, in log order);
      2. ``--name-only --format=%H`` for the files changed by each commit.

    Args:
        project_dir: Directory in which git is run.
        since_sha: When given, the range is ``<since_sha>..HEAD`` and
            ``max_commits`` is not applied.
        max_commits: Number of most recent commits to read when
            ``since_sha`` is not given.

    Returns:
        ``(chunks, nodes, edges)`` as built by ``_parse_meta``. All three
        lists are empty when git cannot be launched or the metadata call
        exits non-zero (for example, outside a repository). If only the
        ``--name-only`` call fails, commits are still returned, without
        file edges.
    """
    range_arg = f"{since_sha}..HEAD" if since_sha else f"-{max_commits}"

    async def _git_log(*extra_args: str) -> subprocess.CompletedProcess | None:
        """Run `git log` in a worker thread; None if git cannot be started."""
        try:
            return await asyncio.to_thread(
                subprocess.run,
                # core.quotepath=off keeps non-ASCII file names verbatim
                # instead of octal-escaped and wrapped in double quotes.
                ["git", "-c", "core.quotepath=off", "log", range_arg, *extra_args],
                cwd=project_dir,
                capture_output=True,
                check=False,
                text=True,
                # Decode explicitly: the locale default may not be UTF-8,
                # and a stray byte in a commit message must not abort
                # indexing.
                encoding="utf-8",
                errors="replace",
            )
        except OSError as exc:
            # git is not installed, or project_dir is missing/inaccessible.
            log.debug("git log skipped: %s", exc)
            return None

    meta_result = await _git_log("--format=%H%n%an%n%ai%n%s%n%b%x00")
    if meta_result is None:
        return [], [], []
    if meta_result.returncode != 0:
        log.debug("git log skipped: %s", meta_result.stderr.strip())
        return [], [], []

    files_result = await _git_log("--name-only", "--format=%H")

    changed_files_by_hash: dict[str, list[str]] = {}
    if files_result is not None and files_result.returncode == 0:
        changed_files_by_hash = _parse_name_only(files_result.stdout)

    return _parse_meta(meta_result.stdout, changed_files_by_hash)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _parse_name_only(output: str) -> dict[str, list[str]]:
    """Group `git log --name-only --format=%H` output by commit.

    Each line that looks like a full 40-character SHA opens a new commit;
    every following non-blank line is recorded as a file of that commit.
    Non-blank lines seen before the first SHA are dropped.

    Returns:
        Mapping of commit hash to the list of file names listed under it
        (an empty list for commits with no files).
    """
    files_by_commit: dict[str, list[str]] = {}
    active_files: list[str] | None = None

    for entry in map(str.strip, output.splitlines()):
        if not entry:
            continue
        if _SHA_RE.match(entry):
            # setdefault keeps the existing list if the same hash reappears.
            active_files = files_by_commit.setdefault(entry, [])
        elif active_files is not None:
            active_files.append(entry)

    return files_by_commit
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _parse_meta(
    output: str,
    changed_files_by_hash: dict[str, list[str]],
) -> tuple[list[Chunk], list[GraphNode], list[GraphEdge]]:
    """Turn NUL-separated commit records into chunks, nodes and edges.

    Each record is expected to hold, one per line: full hash, author,
    date, subject, then the (possibly multi-line) body. Records with
    fewer than four lines, or whose first line is not a 40-character
    SHA, are skipped.

    Args:
        output: Raw metadata text, records separated by ``\\x00``.
        changed_files_by_hash: Full commit hash -> file names; one
            MODIFIES edge is emitted per listed file.

    Returns:
        ``(chunks, nodes, edges)`` in the order the records appear.
    """
    commit_chunks: list[Chunk] = []
    commit_nodes: list[GraphNode] = []
    modify_edges: list[GraphEdge] = []

    for raw_record in output.split("\x00"):
        # An empty/whitespace-only record yields no lines and is rejected
        # by the same length check as a truncated one.
        fields = raw_record.strip().splitlines()
        if len(fields) < 4:
            continue

        commit_hash = fields[0].strip()
        if _SHA_RE.match(commit_hash) is None:
            continue

        author, date, subject = (part.strip() for part in fields[1:4])
        body = "\n".join(fields[4:]).strip()

        # Chunk and node share one id built from the 7-character short hash.
        short_hash = commit_hash[:7]
        node_id = f"commit_{short_hash}"
        location = f"git:{short_hash}"

        commit_chunks.append(
            Chunk(
                id=node_id,
                content=f"{subject}\n\n{body}".strip(),
                chunk_type=ChunkType.COMMIT,
                file_path=location,
                start_line=0,
                end_line=0,
                language="git",
                metadata={
                    "author": author,
                    "date": date,
                    "hash": commit_hash,
                    "chunk_kind": "commit",
                },
            )
        )

        commit_nodes.append(
            GraphNode(
                id=node_id,
                node_type=NodeType.COMMIT,
                name=subject,
                file_path=location,
            )
        )

        modify_edges.extend(
            GraphEdge(
                source_id=node_id,
                target_id=f"file_{fname}",
                edge_type=EdgeType.MODIFIES,
            )
            for fname in changed_files_by_hash.get(commit_hash, [])
        )

    return commit_chunks, commit_nodes, modify_edges
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""`.cceignore` parser — gitignore-style patterns for the indexer.
|
|
2
|
+
|
|
3
|
+
Supports the practical subset of `.gitignore` syntax that covers ~95% of
|
|
4
|
+
real-world use:
|
|
5
|
+
|
|
6
|
+
· Glob patterns: `*.log`, `temp/*`, `**/build/`
|
|
7
|
+
· Directory matches: `node_modules/` (trailing slash)
|
|
8
|
+
· Comments: lines starting with `#`
|
|
9
|
+
· Blank lines: ignored
|
|
10
|
+
|
|
11
|
+
Deliberate deviation from strict gitignore: `*` here matches across
|
|
12
|
+
path separators (fnmatch behaviour), so `temp/*` excludes everything
|
|
13
|
+
under `temp/`, not just direct children. In our experience that's what
|
|
14
|
+
users actually want from an indexer ignore file.
|
|
15
|
+
|
|
16
|
+
NOT supported (intentionally — adds dependency and complexity for
|
|
17
|
+
diminishing returns):
|
|
18
|
+
|
|
19
|
+
· Negation patterns (`!keep.log`)
|
|
20
|
+
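· Explicit anchoring with a leading `/` — the slash is simply stripped: patterns containing a `/` are matched from the project root, slash-free patterns match a basename anywhere in the tree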
|
|
21
|
+
· Character classes beyond what `fnmatch` provides
|
|
22
|
+
|
|
23
|
+
Users who need full gitignore semantics can add `pathspec` to their
|
|
24
|
+
project and wire a custom matcher; this module covers the common case
|
|
25
|
+
without a third-party dependency.
|
|
26
|
+
"""
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import fnmatch
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
|
|
32
|
+
CCEIGNORE_FILENAME = ".cceignore"


def load_ignore_patterns(project_dir: Path) -> list[str]:
    """Read `.cceignore` from `project_dir` and return its patterns.

    Blank lines and lines starting with `#` are dropped; every other line
    is returned with surrounding whitespace stripped. Matching is
    delegated to `matches_any`.

    Returns an empty list if the file doesn't exist, cannot be read, or
    is not valid UTF-8 (a warning is logged in the latter two cases).
    """
    import logging  # local import: only needed on the failure path below

    path = project_dir / CCEIGNORE_FILENAME
    if not path.is_file():
        return []
    try:
        # "utf-8-sig" drops a leading byte-order mark, which would
        # otherwise stay glued to the first line and turn a leading
        # comment into a pattern (or corrupt the first pattern).
        raw = path.read_text(encoding="utf-8-sig", errors="strict")
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError; it must be
        # caught explicitly or a mis-encoded file aborts the caller.
        logging.getLogger(__name__).warning(
            "Could not read %s (%s); no ignore patterns loaded.", path, exc
        )
        return []

    stripped_lines = (line.strip() for line in raw.splitlines())
    return [s for s in stripped_lines if s and not s.startswith("#")]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def matches_any(rel_path: str, is_dir: bool, patterns: list[str]) -> bool:
    """True if `rel_path` matches any of the given patterns.

    `rel_path` is the path relative to the project root; backslashes are
    normalised to forward slashes and leading `./` or `/` prefixes are
    removed. `is_dir` distinguishes directories so trailing-slash
    patterns (e.g. `build/`) only match directories.

    Matching rules (all via case-sensitive `fnmatch`, where `*` also
    crosses `/`):

      · pattern without `/` — compared with the basename, so it matches
        at any depth;
      · pattern with `/` — compared with the whole relative path; a
        leading `/` on the pattern is ignored;
      · pattern starting with `**/` — additionally tried with that prefix
        removed and with `*/` in its place.
    """
    if not patterns:
        return False

    rel = rel_path.replace("\\", "/")
    # Remove leading "./" and "/" as *prefixes*. str.lstrip("./") strips a
    # character set and would also eat the dot of ".env" or ".github/...",
    # making dotfile patterns miss and dot-less patterns match falsely.
    while rel.startswith(("./", "/")):
        rel = rel[2:] if rel.startswith("./") else rel[1:]
    name = rel.rsplit("/", 1)[-1]

    for pat in patterns:
        # Trailing slash → directory-only pattern.
        is_dir_pat = pat.endswith("/")
        if is_dir_pat and not is_dir:
            continue
        p = pat[:-1] if is_dir_pat else pat

        if "/" not in p:
            # Basename match, anywhere in the tree.
            if fnmatch.fnmatchcase(name, p):
                return True
            continue

        # Path match from the project root; a user-written leading "/"
        # adds nothing because the comparison is already rooted there.
        anchored = p.lstrip("/")
        if fnmatch.fnmatchcase(rel, anchored):
            return True

        # fnmatch treats `**` like `*`, so `**/x` alone would require at
        # least one parent directory. Also try the bare tail (root level)
        # and `*/tail` (any depth).
        if anchored.startswith("**/"):
            tail = anchored[3:]
            if fnmatch.fnmatchcase(rel, tail):
                return True
            if fnmatch.fnmatchcase(rel, f"*/{tail}"):
                return True
    return False
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Content hash manifest for incremental indexing."""
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from context_engine.utils import atomic_write_text
|
|
7
|
+
|
|
8
|
+
log = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
CURRENT_SCHEMA_VERSION = 2
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Manifest:
|
|
14
|
+
def __init__(self, manifest_path: Path) -> None:
|
|
15
|
+
self._path = manifest_path
|
|
16
|
+
self._entries: dict[str, str] = {}
|
|
17
|
+
self._schema_version: int = CURRENT_SCHEMA_VERSION
|
|
18
|
+
self._last_git_sha: str | None = None
|
|
19
|
+
|
|
20
|
+
if self._path.exists():
|
|
21
|
+
try:
|
|
22
|
+
with open(self._path) as f:
|
|
23
|
+
loaded = json.load(f)
|
|
24
|
+
if isinstance(loaded, dict):
|
|
25
|
+
if "__schema_version" in loaded:
|
|
26
|
+
# New versioned format
|
|
27
|
+
self._schema_version = loaded["__schema_version"]
|
|
28
|
+
self._entries = loaded.get("files", {})
|
|
29
|
+
self._last_git_sha = loaded.get("last_git_sha")
|
|
30
|
+
else:
|
|
31
|
+
# Old plain-dict format (pre-v0.2) — treat as version 1
|
|
32
|
+
self._schema_version = 1
|
|
33
|
+
self._entries = loaded
|
|
34
|
+
else:
|
|
35
|
+
log.warning(
|
|
36
|
+
"Manifest at %s was not a dict (got %s); starting empty.",
|
|
37
|
+
self._path,
|
|
38
|
+
type(loaded).__name__,
|
|
39
|
+
)
|
|
40
|
+
except (json.JSONDecodeError, OSError) as exc:
|
|
41
|
+
log.warning("Manifest at %s unreadable (%s); starting empty.", self._path, exc)
|
|
42
|
+
self._entries = {}
|
|
43
|
+
|
|
44
|
+
@property
|
|
45
|
+
def schema_version(self) -> int:
|
|
46
|
+
return self._schema_version
|
|
47
|
+
|
|
48
|
+
@property
|
|
49
|
+
def needs_reindex(self) -> bool:
|
|
50
|
+
return self._schema_version != CURRENT_SCHEMA_VERSION
|
|
51
|
+
|
|
52
|
+
@property
|
|
53
|
+
def last_git_sha(self) -> str | None:
|
|
54
|
+
return self._last_git_sha
|
|
55
|
+
|
|
56
|
+
@last_git_sha.setter
|
|
57
|
+
def last_git_sha(self, value: str | None) -> None:
|
|
58
|
+
self._last_git_sha = value
|
|
59
|
+
|
|
60
|
+
def get_hash(self, file_path: str) -> str | None:
|
|
61
|
+
return self._entries.get(file_path)
|
|
62
|
+
|
|
63
|
+
def update(self, file_path: str, content_hash: str) -> None:
|
|
64
|
+
self._entries[file_path] = content_hash
|
|
65
|
+
|
|
66
|
+
def remove(self, file_path: str) -> None:
|
|
67
|
+
self._entries.pop(file_path, None)
|
|
68
|
+
|
|
69
|
+
def has_changed(self, file_path: str, content_hash: str) -> bool:
|
|
70
|
+
return self._entries.get(file_path) != content_hash
|
|
71
|
+
|
|
72
|
+
def save(self) -> None:
|
|
73
|
+
payload = {
|
|
74
|
+
"__schema_version": CURRENT_SCHEMA_VERSION,
|
|
75
|
+
"files": self._entries,
|
|
76
|
+
"last_git_sha": self._last_git_sha,
|
|
77
|
+
}
|
|
78
|
+
atomic_write_text(self._path, json.dumps(payload))
|