code-context-engine 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context_engine-0.4.0.dist-info/METADATA +389 -0
- code_context_engine-0.4.0.dist-info/RECORD +63 -0
- code_context_engine-0.4.0.dist-info/WHEEL +5 -0
- code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
- code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
- code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
- context_engine/__init__.py +3 -0
- context_engine/cli.py +2848 -0
- context_engine/cli_style.py +66 -0
- context_engine/compression/__init__.py +0 -0
- context_engine/compression/compressor.py +144 -0
- context_engine/compression/ollama_client.py +33 -0
- context_engine/compression/output_rules.py +77 -0
- context_engine/compression/prompts.py +9 -0
- context_engine/compression/quality.py +37 -0
- context_engine/config.py +198 -0
- context_engine/dashboard/__init__.py +0 -0
- context_engine/dashboard/_page.py +1548 -0
- context_engine/dashboard/server.py +429 -0
- context_engine/editors.py +265 -0
- context_engine/event_bus.py +24 -0
- context_engine/indexer/__init__.py +0 -0
- context_engine/indexer/chunker.py +147 -0
- context_engine/indexer/embedder.py +154 -0
- context_engine/indexer/embedding_cache.py +168 -0
- context_engine/indexer/git_hooks.py +73 -0
- context_engine/indexer/git_indexer.py +136 -0
- context_engine/indexer/ignorefile.py +96 -0
- context_engine/indexer/manifest.py +78 -0
- context_engine/indexer/pipeline.py +624 -0
- context_engine/indexer/secrets.py +332 -0
- context_engine/indexer/watcher.py +109 -0
- context_engine/integration/__init__.py +0 -0
- context_engine/integration/bootstrap.py +76 -0
- context_engine/integration/git_context.py +132 -0
- context_engine/integration/mcp_server.py +1825 -0
- context_engine/integration/session_capture.py +306 -0
- context_engine/memory/__init__.py +6 -0
- context_engine/memory/compressor.py +344 -0
- context_engine/memory/db.py +922 -0
- context_engine/memory/extractive.py +106 -0
- context_engine/memory/grammar.py +419 -0
- context_engine/memory/hook_installer.py +258 -0
- context_engine/memory/hook_server.py +83 -0
- context_engine/memory/hooks.py +327 -0
- context_engine/memory/migrate.py +268 -0
- context_engine/models.py +96 -0
- context_engine/pricing.py +104 -0
- context_engine/project_commands.py +296 -0
- context_engine/retrieval/__init__.py +0 -0
- context_engine/retrieval/confidence.py +47 -0
- context_engine/retrieval/query_parser.py +105 -0
- context_engine/retrieval/retriever.py +199 -0
- context_engine/serve_http.py +208 -0
- context_engine/services.py +252 -0
- context_engine/storage/__init__.py +0 -0
- context_engine/storage/backend.py +39 -0
- context_engine/storage/fts_store.py +112 -0
- context_engine/storage/graph_store.py +219 -0
- context_engine/storage/local_backend.py +109 -0
- context_engine/storage/remote_backend.py +117 -0
- context_engine/storage/vector_store.py +357 -0
- context_engine/utils.py +72 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Shared CLI styling — consistent colorful output across all cce commands."""
|
|
2
|
+
import sys
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# ── Colors ─────────────────────────────────────────────────────────────
|
|
9
|
+
# Cyan for headers/labels, green for success, yellow for warnings,
|
|
10
|
+
# dim white for secondary info, bold for emphasis.
|
|
11
|
+
|
|
12
|
+
def header(text: str) -> str:
    """Return *text* styled as a header (bold cyan)."""
    styled = click.style(text, bold=True, fg="cyan")
    return styled
|
|
14
|
+
|
|
15
|
+
def label(text: str) -> str:
    """Return *text* styled as a label (cyan)."""
    styled = click.style(text, fg="cyan")
    return styled
|
|
17
|
+
|
|
18
|
+
def success(text: str) -> str:
    """Return *text* styled as a success message (green)."""
    styled = click.style(text, fg="green")
    return styled
|
|
20
|
+
|
|
21
|
+
def warn(text: str) -> str:
    """Return *text* styled as a warning (yellow)."""
    styled = click.style(text, fg="yellow")
    return styled
|
|
23
|
+
|
|
24
|
+
def error(text: str) -> str:
    """Return *text* styled as an error (red)."""
    styled = click.style(text, fg="red")
    return styled
|
|
26
|
+
|
|
27
|
+
def dim(text: str) -> str:
    """Return *text* dimmed, for secondary information."""
    styled = click.style(text, dim=True)
    return styled
|
|
29
|
+
|
|
30
|
+
def bold(text: str) -> str:
    """Return *text* in bold, for emphasis."""
    styled = click.style(text, bold=True)
    return styled
|
|
32
|
+
|
|
33
|
+
def value(text: str) -> str:
    """Return *text* styled as a value (bold white)."""
    styled = click.style(text, bold=True, fg="white")
    return styled
|
|
35
|
+
|
|
36
|
+
def magenta(text: str) -> str:
    """Return *text* coloured magenta."""
    styled = click.style(text, fg="magenta")
    return styled
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ── Prefixes ───────────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
CHECK = click.style("✓", fg="green")
|
|
43
|
+
CROSS = click.style("✗", fg="red")
|
|
44
|
+
DOT = click.style("·", fg="yellow")
|
|
45
|
+
ARROW = click.style("→", fg="cyan")
|
|
46
|
+
BULLET = click.style("●", fg="green")
|
|
47
|
+
BULLET_OFF = click.style("○", fg="yellow")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ── Section headers ────────────────────────────────────────────────────
|
|
51
|
+
|
|
52
|
+
def section(title: str, width: int = 50) -> str:
    """Build a styled section divider of the form ``── Title ──────``.

    The trailing rule is sized so that the title plus its four characters
    of framing fill roughly ``width`` columns; it is never shorter than
    one character, even when the title is longer than ``width``.
    """
    fill = width - len(title) - 4
    if fill < 1:
        fill = 1
    rule = "─" * fill
    lead = dim("──")
    caption = header(title)
    tail = dim(rule)
    return " " + lead + " " + caption + " " + tail
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ── Animation ──────────────────────────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
def animate(lines: list[str], delay: float = 0.025) -> None:
    """Echo each line, pausing briefly between the first few on a terminal.

    The pause only happens when stdout is a TTY and ``delay`` is positive,
    and only after the first 12 lines; everything else is printed without
    waiting.
    """
    # Both conditions are loop-invariant, so decide once up front.
    pausing = sys.stdout.isatty() and delay > 0
    for index, text in enumerate(lines):
        click.echo(text)
        if pausing and index < 12:
            time.sleep(delay)
|
|
File without changes
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Compression pipeline — groups chunks, summarizes via LLM, falls back to truncation."""
|
|
2
|
+
import asyncio
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from context_engine.models import Chunk, ChunkType
|
|
6
|
+
from context_engine.compression.ollama_client import OllamaClient
|
|
7
|
+
from context_engine.compression.prompts import CODE_PROMPT, DECISION_PROMPT, ARCHITECTURE_PROMPT, DOC_PROMPT
|
|
8
|
+
from context_engine.compression.quality import QualityChecker
|
|
9
|
+
|
|
10
|
+
log = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
# Ollama liveness probe is ~5ms over loopback but fires once per compress() call,
|
|
13
|
+
# adding up across a session. Refresh at most every _OLLAMA_PROBE_TTL seconds.
|
|
14
|
+
_OLLAMA_PROBE_TTL = 30.0
|
|
15
|
+
|
|
16
|
+
_PROMPT_MAP = {
|
|
17
|
+
ChunkType.FUNCTION: CODE_PROMPT, ChunkType.CLASS: CODE_PROMPT,
|
|
18
|
+
ChunkType.MODULE: ARCHITECTURE_PROMPT, ChunkType.DOC: DOC_PROMPT,
|
|
19
|
+
ChunkType.DECISION: DECISION_PROMPT, ChunkType.SESSION: DOC_PROMPT,
|
|
20
|
+
ChunkType.COMMIT: DOC_PROMPT, ChunkType.COMMENT: DOC_PROMPT,
|
|
21
|
+
}
|
|
22
|
+
_TRUNCATION_LIMITS: dict[str, int] = {"minimal": 100, "standard": 300, "full": 800}
|
|
23
|
+
# In "full" mode a chunk is passed through uncompressed when its retrieval
|
|
24
|
+
# confidence_score exceeds this threshold. Chunks ingested by other paths
|
|
25
|
+
# keep the default score of 0.0 and would otherwise be falsely classified as
|
|
26
|
+
# low-confidence, so a chunk with no embedding (never retrieved) also passes.
|
|
27
|
+
_FULL_PASSTHROUGH_THRESHOLD = 0.8
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Compressor:
    """Fill in ``compressed_content`` on chunks via LLM summary or truncation.

    Each chunk is handled in this order: full-mode pass-through, cache hit,
    Ollama summary (when the server is reachable and the level is not
    "minimal"), and finally truncation / signature extraction as fallback.
    """

    def __init__(
        self,
        ollama_url: str = "http://localhost:11434",
        model: str = "phi3:mini",
        cache=None,
    ) -> None:
        """`cache` is any object exposing get_cached_compression(chunk_id, level)
        and put_cached_compression(chunk_id, level, text). LocalBackend satisfies
        this; RemoteBackend currently doesn't, so we duck-type check and disable
        caching gracefully if either method is missing."""
        self._client = OllamaClient(base_url=ollama_url, model=model)
        self._quality = QualityChecker()
        if cache is not None and hasattr(cache, "get_cached_compression") and hasattr(cache, "put_cached_compression"):
            self._cache = cache
        else:
            self._cache = None
        # None means "never probed"; otherwise the last probe result.
        self._ollama_available: bool | None = None
        self._ollama_probed_at: float = 0.0
        # Single-flight the probe so concurrent compress() calls don't all
        # fire it at once when the cached value is stale.
        self._probe_lock = asyncio.Lock()

    def _probe_is_fresh(self) -> bool:
        """Return True when a probe result exists and is younger than the TTL."""
        if self._ollama_available is None:
            return False
        return time.monotonic() - self._ollama_probed_at < _OLLAMA_PROBE_TTL

    async def _is_ollama_available(self) -> bool:
        """Return the cached Ollama liveness, re-probing once the TTL expires.

        A probe that raises is treated as "unavailable" so that compress()
        degrades to truncation instead of failing.
        """
        if self._probe_is_fresh():
            return self._ollama_available
        async with self._probe_lock:
            # Re-check inside the lock — another coroutine may have refreshed
            # the cache while we were waiting.
            if self._probe_is_fresh():
                return self._ollama_available
            try:
                available = await self._client.is_available()
            except Exception as exc:
                log.info(
                    "Ollama availability probe failed; falling back to truncation (%s)",
                    exc,
                )
                available = False
            self._ollama_available = available
            # Stamp after the probe returns so the TTL is measured from the
            # moment the answer was obtained, not from before the round-trip.
            self._ollama_probed_at = time.monotonic()
            return self._ollama_available

    async def compress(self, chunks: list[Chunk], level: str = "standard") -> list[Chunk]:
        """Set ``compressed_content`` on every chunk and return the same list.

        Chunks are mutated in place. ``level`` selects the truncation limit
        and, for "minimal", skips the LLM entirely.
        """
        ollama_available = await self._is_ollama_available()
        for chunk in chunks:
            if level == "full" and self._is_full_passthrough(chunk):
                chunk.compressed_content = chunk.content
                continue
            # Cache hit: skip the LLM round-trip and the truncation work.
            if self._cache is not None:
                cached = self._cache.get_cached_compression(chunk.id, level)
                if cached is not None:
                    chunk.compressed_content = cached
                    continue
            if ollama_available and level != "minimal":
                chunk.compressed_content = await self._llm_compress(chunk, level)
            else:
                chunk.compressed_content = self._fallback_compress(chunk, level)
            # Persist for next time. Truncation is cheap, but caching it still
            # saves the recompute and keeps behaviour symmetric with LLM mode.
            if self._cache is not None and chunk.compressed_content:
                self._cache.put_cached_compression(
                    chunk.id, level, chunk.compressed_content
                )
        return chunks

    def _is_full_passthrough(self, chunk: Chunk) -> bool:
        """Accept either a high retrieval-confidence chunk *or* a chunk that
        was never routed through retrieval (no embedding) — otherwise a direct
        lookup via `expand_chunk` would get silently truncated in full mode.
        """
        if chunk.confidence_score > _FULL_PASSTHROUGH_THRESHOLD:
            return True
        return chunk.embedding is None

    async def _llm_compress(self, chunk: Chunk, level: str) -> str:
        """Summarize ``chunk`` with the LLM, falling back to truncation.

        The fallback is used when the summarize call raises or when the
        summary fails the quality check.
        """
        prompt = _PROMPT_MAP.get(chunk.chunk_type, CODE_PROMPT)
        try:
            summary = await self._client.summarize(chunk.content, prompt)
        except Exception as exc:
            log.info(
                "Ollama summarize failed for chunk %s; falling back to truncation (%s)",
                chunk.id,
                exc,
            )
            return self._fallback_compress(chunk, level)
        if self._quality.check(chunk.content, summary):
            return summary
        log.info(
            "Quality check failed for chunk %s (identifier retention < 40%%); "
            "falling back to truncation.",
            chunk.id,
        )
        return self._fallback_compress(chunk, level)

    def _fallback_compress(self, chunk: Chunk, level: str) -> str:
        """Compress without the LLM.

        Function and class chunks are reduced to their signature; anything
        else is cut to the level's character limit with a trailing "...".
        Unknown levels use a limit of 300.
        """
        limit = _TRUNCATION_LIMITS.get(level, 300)
        if chunk.chunk_type in (ChunkType.FUNCTION, ChunkType.CLASS):
            return self._extract_signature(chunk.content, limit)
        if len(chunk.content) <= limit:
            return chunk.content
        return chunk.content[:limit] + "..."

    def _extract_signature(self, content: str, limit: int) -> str:
        """Return the leading signature (and docstring, if any) of ``content``.

        Lines are copied until one of these happens: the character budget
        ``limit`` would be exceeded (the first line is always kept), a
        docstring closes, or — outside a docstring and past the first line —
        a line ends with ``:``.
        """
        result_lines: list[str] = []
        in_docstring = False
        char_count = 0
        for line in content.split("\n"):
            if char_count + len(line) > limit and result_lines:
                break
            result_lines.append(line)
            char_count += len(line) + 1
            delimiters = line.count('"""') + line.count("'''")
            if delimiters:
                # Two delimiters on one line means the docstring opens and
                # closes right here (e.g. `"""Doc."""`); stop instead of
                # treating it as an opener and copying the body after it.
                if in_docstring or delimiters >= 2:
                    break
                in_docstring = True
            if not in_docstring and line.strip().endswith(":") and len(result_lines) > 1:
                break
        return "\n".join(result_lines)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Ollama API client for local LLM summarization."""
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class OllamaClient:
    """Small async client for an Ollama server's HTTP API.

    A fresh ``httpx.AsyncClient`` is opened for every call.
    """

    def __init__(self, base_url="http://localhost:11434", model="phi3:mini", timeout=30.0):
        # Endpoints are appended as "/api/..."; dropping a trailing slash here
        # keeps the request URL from containing "//api/...".
        self.base_url = base_url.rstrip("/")
        self.model = model
        self._timeout = timeout

    @staticmethod
    def _render_prompt(prompt: str, content: str) -> str:
        """Insert ``content`` into ``prompt``.

        A ``{content}`` placeholder is substituted literally; a prompt without
        one gets the content appended after a blank line. Plain substitution
        is used instead of ``str.format`` so that any other braces in the
        template are left as-is rather than raising.
        """
        if "{content}" in prompt:
            return prompt.replace("{content}", content)
        return f"{prompt}\n\n{content}"

    async def is_available(self) -> bool:
        """Return True when ``GET /api/tags`` answers with HTTP 200.

        Any httpx request failure (connection, timeout, protocol, bad URL)
        is reported as False rather than raised.
        """
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                resp = await client.get(f"{self.base_url}/api/tags")
                return resp.status_code == 200
        except (httpx.HTTPError, httpx.InvalidURL):
            return False

    async def summarize(self, content: str, prompt: str) -> str:
        """Return the stripped ``response`` text from ``POST /api/generate``.

        Raises:
            httpx.HTTPError: on transport failure or a non-success status
                (via ``raise_for_status``).
            KeyError: if the JSON reply has no ``"response"`` field.
        """
        full_prompt = self._render_prompt(prompt, content)
        payload = {
            "model": self.model,
            "prompt": full_prompt,
            "stream": False,
            "options": {"temperature": 0.1, "num_predict": 256},
        }
        async with httpx.AsyncClient(timeout=self._timeout) as client:
            resp = await client.post(f"{self.base_url}/api/generate", json=payload)
            resp.raise_for_status()
            return resp.json()["response"].strip()
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Output compression rules — reduces Claude's output token usage via style directives."""
|
|
2
|
+
|
|
3
|
+
LEVELS = ("off", "lite", "standard", "max")
|
|
4
|
+
|
|
5
|
+
# Estimated baseline reply size (tokens) when no compression is active.
|
|
6
|
+
# Used to estimate output_compression savings: the MCP server can't see
|
|
7
|
+
# Claude's actual reply length, so we assume an average per affected
|
|
8
|
+
# response and apply the advertised reduction. Tunable — the renderer
|
|
9
|
+
# footnotes the value so users can interpret the estimate.
|
|
10
|
+
ESTIMATED_AVG_REPLY_TOKENS = 500
|
|
11
|
+
|
|
12
|
+
# Advertised output-token reduction per level. Sourced from the level
|
|
13
|
+
# descriptions ("~65% savings", "~75% savings"). `lite` has no advertised
|
|
14
|
+
# number; we use a conservative 20% based on how much filler/hedging
|
|
15
|
+
# typically lives in default-mode replies.
|
|
16
|
+
ADVERTISED_PCT = {
|
|
17
|
+
"off": 0.0,
|
|
18
|
+
"lite": 0.20,
|
|
19
|
+
"standard": 0.65,
|
|
20
|
+
"max": 0.75,
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
_RULES = {
|
|
24
|
+
"lite": (
|
|
25
|
+
"## Output Compression: Lite\n"
|
|
26
|
+
"Respond concisely. Rules:\n"
|
|
27
|
+
"- Remove filler words (just, really, basically, actually, simply)\n"
|
|
28
|
+
"- Remove hedging (I think, it seems, perhaps, might want to)\n"
|
|
29
|
+
"- No pleasantries (Sure!, Happy to help, Great question)\n"
|
|
30
|
+
"- No trailing summaries — the diff/output speaks for itself\n"
|
|
31
|
+
"- Keep full grammar and articles\n"
|
|
32
|
+
"- Code blocks, paths, commands, URLs: NEVER compress"
|
|
33
|
+
),
|
|
34
|
+
"standard": (
|
|
35
|
+
"## Output Compression: Standard\n"
|
|
36
|
+
"Respond in compressed style. Rules:\n"
|
|
37
|
+
"- Drop articles (a, an, the) in prose\n"
|
|
38
|
+
"- Use sentence fragments over full sentences\n"
|
|
39
|
+
"- Use short synonyms (fix > resolve, check > investigate, big > large)\n"
|
|
40
|
+
"- Pattern: [thing] [action] [reason]. [next step].\n"
|
|
41
|
+
"- No filler, hedging, pleasantries, or trailing summaries\n"
|
|
42
|
+
"- No restating what the user said\n"
|
|
43
|
+
"- One-line explanations unless detail is asked for\n"
|
|
44
|
+
"- Code blocks, paths, commands, URLs, errors: NEVER compress\n"
|
|
45
|
+
"- Security warnings and destructive action confirmations: use full clarity"
|
|
46
|
+
),
|
|
47
|
+
"max": (
|
|
48
|
+
"## Output Compression: Max\n"
|
|
49
|
+
"Respond in telegraphic style. Rules:\n"
|
|
50
|
+
"- Drop articles, pronouns, conjunctions where meaning survives\n"
|
|
51
|
+
"- Abbreviate common terms: DB, auth, config, fn, dep, impl, req, resp, init\n"
|
|
52
|
+
"- Use arrows for causality: → (leads to), ← (caused by)\n"
|
|
53
|
+
"- Use symbols: + (add), - (remove), ~ (change), ! (warning), ? (unclear)\n"
|
|
54
|
+
"- Max 1-2 sentences per explanation\n"
|
|
55
|
+
"- Pattern: [thing] → [action]. [reason].\n"
|
|
56
|
+
"- Code blocks, paths, commands, URLs, errors: NEVER compress\n"
|
|
57
|
+
"- Security warnings and destructive action confirmations: use full clarity"
|
|
58
|
+
),
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_output_rules(level: str) -> str | None:
    """Look up the style directives for ``level``.

    Returns None for "off" and for any level that has no rules defined.
    """
    return None if level == "off" else _RULES.get(level)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def get_level_description(level: str) -> str:
|
|
70
|
+
"""Return a human-readable description of the compression level."""
|
|
71
|
+
descriptions = {
|
|
72
|
+
"off": "No output compression — Claude responds normally",
|
|
73
|
+
"lite": "Removes filler, hedging, and pleasantries. Keeps full grammar.",
|
|
74
|
+
"standard": "Drops articles, uses fragments, short synonyms. ~65% output token savings.",
|
|
75
|
+
"max": "Telegraphic style with abbreviations and symbols. ~75% output token savings.",
|
|
76
|
+
}
|
|
77
|
+
return descriptions.get(level, "Unknown level")
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Summarization prompt templates for different chunk types."""
|
|
2
|
+
|
|
3
|
+
CODE_PROMPT: str = "Summarize this code. Include: function/class name, purpose, inputs/outputs, key side effects. Be concise (2-3 sentences max).\n\nCode:\n{content}"
|
|
4
|
+
|
|
5
|
+
DECISION_PROMPT: str = "Summarize this decision. Include: what was decided, why, and what the outcome/action was. One paragraph max.\n\nDecision:\n{content}"
|
|
6
|
+
|
|
7
|
+
ARCHITECTURE_PROMPT: str = "Summarize this component. Include: what it does, its role in the system, and its key dependencies. Be concise (2-3 sentences).\n\nComponent:\n{content}"
|
|
8
|
+
|
|
9
|
+
DOC_PROMPT: str = "Summarize this documentation. Keep the key information, remove boilerplate. Be concise.\n\nDocumentation:\n{content}"
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Lossy detection — verify compressed summaries preserve key identifiers."""
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
# A summary passes when it still mentions at least this share of the
# identifiers found in the original text.
_MIN_IDENTIFIER_RATIO = 0.4
# Names shorter than this are ignored when collecting identifiers.
_MIN_IDENTIFIER_LEN = 3


class QualityChecker:
    """Checks that a summary still mentions enough of the original's identifiers."""

    def check(self, original: str, summary: str) -> bool:
        """Return True when ``summary`` retains enough identifiers.

        Matching is a case-insensitive substring test. Text with no
        extractable identifiers always passes.
        """
        identifiers = self.extract_identifiers(original)
        if not identifiers:
            return True
        summary_lower = summary.lower()
        preserved = sum(1 for ident in identifiers if ident.lower() in summary_lower)
        ratio = preserved / len(identifiers)
        return ratio >= _MIN_IDENTIFIER_RATIO

    @staticmethod
    def _split_params(params: str) -> list[str]:
        """Split a parameter list on commas that are not nested in brackets."""
        parts: list[str] = []
        current: list[str] = []
        depth = 0
        for ch in params:
            if ch in "([{":
                depth += 1
            elif ch in ")]}":
                depth = max(0, depth - 1)
            if ch == "," and depth == 0:
                parts.append("".join(current))
                current = []
            else:
                current.append(ch)
        parts.append("".join(current))
        return parts

    def extract_identifiers(self, code: str) -> list[str]:
        """Return the sorted, de-duplicated identifiers found in ``code``.

        Collected names: def/class/function names, names followed by ``=``,
        ``:`` or ``(``, ``self.`` attributes, function parameter names, and
        capitalised words.
        """
        patterns = [
            r"(?:def|class|function)\s+([a-zA-Z_][a-zA-Z0-9_]*)",
            r"([a-zA-Z_][a-zA-Z0-9_]*)\s*[=:(]",
            r"self\.([a-zA-Z_][a-zA-Z0-9_]*)",
        ]
        identifiers = set()
        for pattern in patterns:
            for match in re.finditer(pattern, code):
                name = match.group(1)
                if len(name) >= _MIN_IDENTIFIER_LEN and name not in {"self", "None", "True", "False"}:
                    identifiers.add(name)
        # Extract function/method parameters
        for param_match in re.finditer(r"(?:def|function)\s+\w+\s*\(([^)]*)\)", code):
            for raw in self._split_params(param_match.group(1)):
                # Keep only the name: drop the annotation/default and any
                # leading `*`, `**` or `...` marker. Splitting on whitespace
                # here would record annotation types and fragments such as
                # "Dict[str" as identifiers.
                param = re.split(r"[:=]", raw, maxsplit=1)[0].strip().lstrip("*.")
                if not param.isidentifier():
                    continue
                if len(param) >= _MIN_IDENTIFIER_LEN and param not in {"self", "cls", "None", "True", "False"}:
                    identifiers.add(param)
        for match in re.finditer(r"\b([A-Z][a-zA-Z0-9]+)\b", code):
            identifiers.add(match.group(1))
        return sorted(identifiers)
|
context_engine/config.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""Configuration loading — global + per-project with defaults."""
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import yaml
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
_CCE_HOME = Path.home() / ".cce"
|
|
9
|
+
|
|
10
|
+
DEFAULT_GLOBAL_PATH = _CCE_HOME / "config.yaml"
|
|
11
|
+
PROJECT_CONFIG_NAME = ".context-engine.yaml"
|
|
12
|
+
|
|
13
|
+
DEFAULT_IGNORE = [
|
|
14
|
+
# Version control
|
|
15
|
+
".git", ".svn", ".hg",
|
|
16
|
+
# Dependencies (JS, PHP, Python, Ruby, Go, Rust, Java, .NET)
|
|
17
|
+
"node_modules", "vendor", "bower_components",
|
|
18
|
+
".pnpm-store", ".pnpm", ".yarn",
|
|
19
|
+
".venv", "venv", "env", ".env",
|
|
20
|
+
".tox", ".nox", ".mypy_cache", ".pytest_cache",
|
|
21
|
+
".ruff_cache", ".cache",
|
|
22
|
+
"Pods", # iOS CocoaPods
|
|
23
|
+
# Build output
|
|
24
|
+
"dist", "build", "_build", "out", "target",
|
|
25
|
+
"bin", "obj", # .NET
|
|
26
|
+
".next", ".nuxt", ".output", ".vercel",
|
|
27
|
+
".turbo", ".parcel-cache",
|
|
28
|
+
# IDE / editor
|
|
29
|
+
".idea", ".vscode", ".vs",
|
|
30
|
+
# Coverage / test artifacts
|
|
31
|
+
"coverage", ".coverage", "htmlcov", ".nyc_output",
|
|
32
|
+
# OS files
|
|
33
|
+
".DS_Store",
|
|
34
|
+
# Compiled / generated
|
|
35
|
+
"__pycache__", ".sass-cache", ".gradle",
|
|
36
|
+
# Infra
|
|
37
|
+
".terraform", ".vagrant",
|
|
38
|
+
# Package locks (huge, not useful)
|
|
39
|
+
"package-lock.json", "yarn.lock", "pnpm-lock.yaml",
|
|
40
|
+
"composer.lock", "poetry.lock",
|
|
41
|
+
# Storage / logs
|
|
42
|
+
"storage", "logs", "tmp", "temp",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class Config:
    """All tunable settings, each with a default."""

    # -- Compression --
    compression_level: str = "standard"
    compression_model: str = "phi3:mini"

    # -- Output compression: off | lite | standard | max --
    output_compression: str = "standard"

    # -- Embedding --
    embedding_model: str = "BAAI/bge-small-en-v1.5"

    # -- Retrieval --
    retrieval_confidence_threshold: float = 0.2
    retrieval_top_k: int = 20
    bootstrap_max_tokens: int = 10000

    # -- Indexer --
    indexer_watch: bool = True
    indexer_debounce_ms: int = 500
    # Each instance gets its own copy of the default ignore list.
    indexer_ignore: list[str] = field(default_factory=lambda: list(DEFAULT_IGNORE))
    # Secret handling (default on; opt out for non-sensitive corpora):
    # well-known credential filenames (.env*, *.pem, secrets.yml,
    # credentials.json, …) are skipped, and AWS/GitHub/JWT/etc. patterns are
    # redacted from the content of files that are indexed. The full pattern
    # list lives in indexer/secrets.py.
    indexer_redact_secrets: bool = True
    # PII scrubbing for memory.db writes (decisions, code_areas,
    # turn_summaries, session rollups): emails, IPs, credit cards
    # (Luhn-validated), SSNs and phone numbers are removed before storage.
    # Free-form session text often captures user data, which matters for
    # regulated industries.
    memory_redact_pii: bool = True
    # Audit trail: when on, every context_search call appends one JSON line
    # to {storage_base}/audit.log with timestamp, query length, top_k, served
    # chunks (file:start-end), score range and output compression level.
    # The query text is stored only as a 12-char sha256 prefix — the log
    # answers "what did Claude see when?", not "what did the user ask?".
    audit_log_enabled: bool = False

    # -- Pricing (for savings estimates) --
    pricing_model: str = "opus"

    # -- Storage --
    storage_path: str = str(_CCE_HOME / "projects")

    def detect_resource_profile(self) -> str:
        """Classify the machine as "full", "standard" or "light" by total RAM.

        The cut-offs are 32 GB and 12 GB. When psutil cannot be imported,
        16 GB is assumed.
        """
        try:
            import psutil
            total_gb = psutil.virtual_memory().total / (1024 ** 3)
        except ImportError:
            total_gb = 16
        for minimum_gb, profile in ((32, "full"), (12, "standard")):
            if total_gb >= minimum_gb:
                return profile
        return "light"
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _deep_merge(base: dict, override: dict) -> dict:
|
|
106
|
+
result = base.copy()
|
|
107
|
+
for key, value in override.items():
|
|
108
|
+
if key in result and isinstance(result[key], dict) and isinstance(value, dict):
|
|
109
|
+
result[key] = _deep_merge(result[key], value)
|
|
110
|
+
else:
|
|
111
|
+
result[key] = value
|
|
112
|
+
return result
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
_EXPECTED_TYPES: dict[str, type | tuple[type, ...]] = {
|
|
116
|
+
"compression_level": str,
|
|
117
|
+
"compression_model": str,
|
|
118
|
+
"output_compression": str,
|
|
119
|
+
"embedding_model": str,
|
|
120
|
+
"retrieval_confidence_threshold": (int, float),
|
|
121
|
+
"retrieval_top_k": int,
|
|
122
|
+
"bootstrap_max_tokens": int,
|
|
123
|
+
"indexer_watch": bool,
|
|
124
|
+
"indexer_debounce_ms": int,
|
|
125
|
+
"indexer_ignore": list,
|
|
126
|
+
"indexer_redact_secrets": bool,
|
|
127
|
+
"memory_redact_pii": bool,
|
|
128
|
+
"audit_log_enabled": bool,
|
|
129
|
+
"storage_path": str,
|
|
130
|
+
"pricing_model": str,
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _apply_dict_to_config(config: Config, data: dict) -> None:
|
|
135
|
+
mapping = {
|
|
136
|
+
("compression", "level"): "compression_level",
|
|
137
|
+
("compression", "model"): "compression_model",
|
|
138
|
+
("compression", "output"): "output_compression",
|
|
139
|
+
("embedding", "model"): "embedding_model",
|
|
140
|
+
("retrieval", "confidence_threshold"): "retrieval_confidence_threshold",
|
|
141
|
+
("retrieval", "top_k"): "retrieval_top_k",
|
|
142
|
+
("retrieval", "bootstrap_max_tokens"): "bootstrap_max_tokens",
|
|
143
|
+
("indexer", "watch"): "indexer_watch",
|
|
144
|
+
("indexer", "debounce_ms"): "indexer_debounce_ms",
|
|
145
|
+
("indexer", "ignore"): "indexer_ignore",
|
|
146
|
+
("indexer", "redact_secrets"): "indexer_redact_secrets",
|
|
147
|
+
("memory", "redact_pii"): "memory_redact_pii",
|
|
148
|
+
("audit", "enabled"): "audit_log_enabled",
|
|
149
|
+
("storage", "path"): "storage_path",
|
|
150
|
+
("pricing", "model"): "pricing_model",
|
|
151
|
+
}
|
|
152
|
+
for (section, key), attr in mapping.items():
|
|
153
|
+
if section in data and isinstance(data[section], dict) and key in data[section]:
|
|
154
|
+
value = data[section][key]
|
|
155
|
+
expected = _EXPECTED_TYPES.get(attr)
|
|
156
|
+
if expected is not None and not isinstance(value, expected):
|
|
157
|
+
# `bool` is a subclass of `int`, so guard against that edge case.
|
|
158
|
+
if expected is int and isinstance(value, bool):
|
|
159
|
+
raise ValueError(
|
|
160
|
+
f"Config {section}.{key} must be int, got bool ({value!r})"
|
|
161
|
+
)
|
|
162
|
+
raise ValueError(
|
|
163
|
+
f"Config {section}.{key} must be "
|
|
164
|
+
f"{getattr(expected, '__name__', expected)}, "
|
|
165
|
+
f"got {type(value).__name__} ({value!r})"
|
|
166
|
+
)
|
|
167
|
+
# For ignore lists, merge with defaults instead of replacing.
|
|
168
|
+
# This way user config adds to the defaults, not overrides them.
|
|
169
|
+
if attr == "indexer_ignore" and isinstance(value, list):
|
|
170
|
+
merged = list(DEFAULT_IGNORE)
|
|
171
|
+
for item in value:
|
|
172
|
+
if item not in merged:
|
|
173
|
+
merged.append(item)
|
|
174
|
+
setattr(config, attr, merged)
|
|
175
|
+
else:
|
|
176
|
+
setattr(config, attr, value)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _read_yaml_mapping(path: Path | None) -> dict:
    """Load ``path`` as a YAML mapping; a missing or empty file yields ``{}``."""
    if path is None or not path.exists():
        return {}
    # Read as UTF-8 explicitly rather than with the platform's default
    # encoding, so the same file loads identically on every OS.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file {path} must contain a mapping at the top level, "
            f"got {type(data).__name__}"
        )
    return data


def load_config(
    global_path: Path | None = None,
    project_path: Path | None = None,
) -> Config:
    """Build a Config from defaults, the global file, then the project file.

    Args:
        global_path: Global YAML file; defaults to ``DEFAULT_GLOBAL_PATH``.
        project_path: Optional per-project YAML file whose values override
            the global ones.

    Files that do not exist are skipped.

    Raises:
        ValueError: if a file's top level is not a mapping, or a value has
            the wrong type.
    """
    global_path = global_path or DEFAULT_GLOBAL_PATH
    config = Config()

    global_data = _read_yaml_mapping(global_path)
    project_data = _read_yaml_mapping(project_path) if project_path else {}

    merged = _deep_merge(global_data, project_data)
    _apply_dict_to_config(config, merged)
    return config
|
|
File without changes
|