code-context-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. code_context_engine-0.4.0.dist-info/METADATA +389 -0
  2. code_context_engine-0.4.0.dist-info/RECORD +63 -0
  3. code_context_engine-0.4.0.dist-info/WHEEL +5 -0
  4. code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
  5. code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
  7. context_engine/__init__.py +3 -0
  8. context_engine/cli.py +2848 -0
  9. context_engine/cli_style.py +66 -0
  10. context_engine/compression/__init__.py +0 -0
  11. context_engine/compression/compressor.py +144 -0
  12. context_engine/compression/ollama_client.py +33 -0
  13. context_engine/compression/output_rules.py +77 -0
  14. context_engine/compression/prompts.py +9 -0
  15. context_engine/compression/quality.py +37 -0
  16. context_engine/config.py +198 -0
  17. context_engine/dashboard/__init__.py +0 -0
  18. context_engine/dashboard/_page.py +1548 -0
  19. context_engine/dashboard/server.py +429 -0
  20. context_engine/editors.py +265 -0
  21. context_engine/event_bus.py +24 -0
  22. context_engine/indexer/__init__.py +0 -0
  23. context_engine/indexer/chunker.py +147 -0
  24. context_engine/indexer/embedder.py +154 -0
  25. context_engine/indexer/embedding_cache.py +168 -0
  26. context_engine/indexer/git_hooks.py +73 -0
  27. context_engine/indexer/git_indexer.py +136 -0
  28. context_engine/indexer/ignorefile.py +96 -0
  29. context_engine/indexer/manifest.py +78 -0
  30. context_engine/indexer/pipeline.py +624 -0
  31. context_engine/indexer/secrets.py +332 -0
  32. context_engine/indexer/watcher.py +109 -0
  33. context_engine/integration/__init__.py +0 -0
  34. context_engine/integration/bootstrap.py +76 -0
  35. context_engine/integration/git_context.py +132 -0
  36. context_engine/integration/mcp_server.py +1825 -0
  37. context_engine/integration/session_capture.py +306 -0
  38. context_engine/memory/__init__.py +6 -0
  39. context_engine/memory/compressor.py +344 -0
  40. context_engine/memory/db.py +922 -0
  41. context_engine/memory/extractive.py +106 -0
  42. context_engine/memory/grammar.py +419 -0
  43. context_engine/memory/hook_installer.py +258 -0
  44. context_engine/memory/hook_server.py +83 -0
  45. context_engine/memory/hooks.py +327 -0
  46. context_engine/memory/migrate.py +268 -0
  47. context_engine/models.py +96 -0
  48. context_engine/pricing.py +104 -0
  49. context_engine/project_commands.py +296 -0
  50. context_engine/retrieval/__init__.py +0 -0
  51. context_engine/retrieval/confidence.py +47 -0
  52. context_engine/retrieval/query_parser.py +105 -0
  53. context_engine/retrieval/retriever.py +199 -0
  54. context_engine/serve_http.py +208 -0
  55. context_engine/services.py +252 -0
  56. context_engine/storage/__init__.py +0 -0
  57. context_engine/storage/backend.py +39 -0
  58. context_engine/storage/fts_store.py +112 -0
  59. context_engine/storage/graph_store.py +219 -0
  60. context_engine/storage/local_backend.py +109 -0
  61. context_engine/storage/remote_backend.py +117 -0
  62. context_engine/storage/vector_store.py +357 -0
  63. context_engine/utils.py +72 -0
@@ -0,0 +1,332 @@
1
+ """Secret detection at index time.
2
+
3
+ Two layers:
4
+
5
+ 1. **Filename-based skipping** — files whose names match well-known
6
+ credential patterns (.env*, *.pem, secrets.yml, …) are never read,
7
+ never embedded, never served. This is the cheap first line of
8
+ defence and catches the most common leak.
9
+
10
+ 2. **Content-based redaction** — for files that DO get indexed, lines
11
+ containing what look like AWS keys, GitHub tokens, JWTs, etc. get
12
+ replaced with `[REDACTED:<reason>]` before chunking. The chunker
13
+ and embedder never see the secret value.
14
+
15
+ Both layers are conservative on purpose — false positives (over-redaction
16
+ of innocent code) are recoverable; false negatives (a real secret leaked
17
+ into the vector DB) are not. When in doubt, redact.
18
+
19
+ Tunable via config:
20
+ · indexer.redact_secrets (default: True) — master switch.
21
+ · indexer.secret_extra_patterns (default: []) — user-added regexes
22
+ for content scanning, merged with the built-in set.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import re
27
+ from pathlib import Path
28
+
29
+ # ── Filename-level skip list ────────────────────────────────────────────────
30
+ # Glob-style suffixes / exact names that almost always mean "credentials".
31
+ # Match is case-insensitive against the full filename (not full path).
32
+
33
+ # Exact filenames (case-insensitive). Entire file is skipped.
34
+ _SECRET_FILENAMES = frozenset({
35
+ # Dotenv family (covers .env, .env.local, .env.production, etc. — see _SECRET_PREFIXES)
36
+ ".npmrc", ".pypirc", ".netrc",
37
+ # Cloud / infra
38
+ "credentials.json", "credentials.yaml", "credentials.yml",
39
+ "secrets.json", "secrets.yaml", "secrets.yml",
40
+ "service-account.json", "gcp-key.json",
41
+ "kube-config", "kubeconfig",
42
+ # CI / app config that frequently holds tokens
43
+ "auth.json",
44
+ # Git config can carry remote tokens
45
+ ".git-credentials",
46
+ # SSH private keys frequently sit extension-less in ~/.ssh.
47
+ "id_rsa", "id_dsa", "id_ecdsa", "id_ed25519", "id_xmss",
48
+ })
49
+
50
+ # Filename starts with any of these → skip (handles .env, .env.local, etc.).
51
+ _SECRET_PREFIXES = (".env",)
52
+
53
+ # File extensions whose presence is a strong signal of a key/cert. Skip outright.
54
+ _SECRET_EXTENSIONS = frozenset({
55
+ ".pem", ".key", ".crt", ".cer", ".der",
56
+ ".p12", ".pfx", # PKCS#12 cert bundles
57
+ ".jks", ".keystore", # Java keystores
58
+ ".pgp", ".asc", ".gpg", # PGP keys
59
+ ".kdbx", # KeePass
60
+ ".ppk", # PuTTY private keys
61
+ })
62
+
63
+
64
def is_secret_file(path: Path) -> bool:
    """Decide from the basename alone whether *path* holds credentials.

    Only ``path.name`` and ``path.suffix`` are inspected, so callers can
    pass any path form. All comparisons are done on lower-cased text, which
    makes the check case-insensitive.

    Returns True when the basename is in ``_SECRET_FILENAMES``, when the
    suffix is in ``_SECRET_EXTENSIONS``, or when the basename starts with
    one of ``_SECRET_PREFIXES``.
    """
    basename = path.name.lower()
    return (
        basename in _SECRET_FILENAMES
        or path.suffix.lower() in _SECRET_EXTENSIONS
        # str.startswith accepts the whole tuple of prefixes at once.
        or basename.startswith(_SECRET_PREFIXES)
    )
79
+
80
+
81
+ # ── Content-level redaction ─────────────────────────────────────────────────
82
+ # Patterns are tuples of (regex, label). Label appears in the redaction
83
+ # placeholder so users can tell *what kind* of secret was scrubbed without
84
+ # leaking the value itself.
85
+ #
86
+ # Conservative ordering: patterns earlier in the list win — specific
87
+ # vendor formats before generic high-entropy heuristics.
88
+
89
+ _CONTENT_PATTERNS: list[tuple[re.Pattern, str]] = [
90
+ # AWS access keys — fixed prefix + 16 base32 chars. Non-capturing
91
+ # group on the prefix so the whole match is the credential value
92
+ # (otherwise we'd only replace AKIA and leak the 16-char suffix).
93
+ (re.compile(r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b"), "AWS_ACCESS_KEY"),
94
+ # AWS secret keys — 40 base64 chars after "aws_secret_access_key"-ish context.
95
+ (re.compile(
96
+ r"(?i)aws_secret_access_key\s*[:=]\s*['\"]?([A-Za-z0-9/+=]{40})['\"]?"
97
+ ), "AWS_SECRET_KEY"),
98
+ # GitHub tokens (classic + fine-grained + app + OAuth).
99
+ (re.compile(r"\bghp_[A-Za-z0-9]{36}\b"), "GITHUB_PAT"),
100
+ (re.compile(r"\bgithub_pat_[A-Za-z0-9_]{82}\b"), "GITHUB_FINE_GRAINED_PAT"),
101
+ (re.compile(r"\b(ghs|gho|ghu|ghr)_[A-Za-z0-9]{36}\b"), "GITHUB_OAUTH"),
102
+ # Slack tokens.
103
+ (re.compile(r"\bxox[abprs]-[A-Za-z0-9-]{10,}\b"), "SLACK_TOKEN"),
104
+ # Stripe live keys (test keys are deliberately not matched — they're
105
+ # safe to commit and matching them would over-redact every Stripe
106
+ # quickstart in the wild).
107
+ (re.compile(r"\b(sk|rk)_live_[A-Za-z0-9]{24,}\b"), "STRIPE_LIVE_KEY"),
108
+ # OpenAI / Anthropic API keys.
109
+ (re.compile(r"\bsk-[A-Za-z0-9]{20}T3BlbkFJ[A-Za-z0-9]{20}\b"), "OPENAI_KEY"),
110
+ (re.compile(r"\bsk-ant-(api03|admin01)-[A-Za-z0-9_\-]{93,}\b"), "ANTHROPIC_KEY"),
111
+ # Google API keys.
112
+ (re.compile(r"\bAIza[0-9A-Za-z_\-]{35}\b"), "GOOGLE_API_KEY"),
113
+ # Generic JWT — three base64url segments separated by dots.
114
+ (re.compile(r"\beyJ[A-Za-z0-9_\-]+\.eyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\b"), "JWT"),
115
+ # Private key blocks (catch even if filename slipped past the skip list).
116
+ (re.compile(
117
+ r"-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]+?-----END [A-Z ]*PRIVATE KEY-----"
118
+ ), "PRIVATE_KEY_BLOCK"),
119
+ # Generic high-signal "looks like a secret" heuristic — variable
120
+ # named after a credential, assigned to a long opaque string. Skips
121
+ # placeholders ("xxx", "your-key-here", "<insert>") so the typical
122
+ # README example doesn't trigger a redaction.
123
+ #
124
+ # The keyword is matched anywhere on the line (no `^` anchor) so
125
+ # patterns like `config["password"] = "..."` and dict literals fire.
126
+ # Word boundary on the left edge keeps "ssh_password_dialog" from
127
+ # matching as a fake password assignment.
128
+ (re.compile(
129
+ r"(?i)\b(?:password|passwd|secret|api[_-]?key|access[_-]?token|"
130
+ r"private[_-]?key|auth[_-]?token|client[_-]?secret)\b"
131
+ r"['\"\]\s]*[:=]\s*['\"]([^'\"\s]{16,})['\"]"
132
+ ), "GENERIC_CREDENTIAL"),
133
+ ]
134
+
135
+
136
+ # Common placeholder values that should NOT be redacted even if they
137
+ # match the generic pattern. Reduces noise in templates / README files.
138
+ _PLACEHOLDER_VALUES = frozenset({
139
+ "your-api-key", "your_api_key", "your-key-here", "your_key_here",
140
+ "<your-key>", "<api-key>", "<your_key>", "<api_key>",
141
+ "xxxxxxxxxxxxxxxx", "0000000000000000",
142
+ "changeme", "change-me", "change_me",
143
+ "placeholder", "example", "sample",
144
+ })
145
+
146
+
147
+ _PLACEHOLDER_SUBSTRINGS = (
148
+ "placeholder", "example", "fake", "dummy", "sample",
149
+ "changeme", "change-me", "change_me", "not_real",
150
+ "not-real", "redacted", "<your", "<api",
151
+ # README phrasing variants
152
+ "your_key", "your-key", "your_secret", "your-secret",
153
+ "your_token", "your-token", "your_api", "your-api",
154
+ "your_password", "your-password",
155
+ "replace_with", "replace-with", "insert_your", "insert-your",
156
+ )
157
+
158
+
159
+ def _starts_with_placeholder_prefix(value: str) -> bool:
160
+ # Any string that opens with "your-" / "your_" / "my-" / "my_" is a
161
+ # tutorial-style placeholder, not a credential. README examples like
162
+ # "your-api-key-here" or "my_secret_value" all match.
163
+ for prefix in ("your-", "your_", "my-", "my_"):
164
+ if value.startswith(prefix):
165
+ return True
166
+ return False
167
+
168
+
169
def _is_placeholder(value: str) -> bool:
    """Return True when *value* looks like filler rather than a real secret.

    The candidate is first stripped of surrounding quotes and angle
    brackets and lower-cased. It counts as a placeholder when any of the
    following holds:

    * it is one of ``_PLACEHOLDER_VALUES``;
    * it is made of at most two distinct characters (covers "xxxxxxxx",
      "0000000000", and also the empty string);
    * it contains one of ``_PLACEHOLDER_SUBSTRINGS``;
    * it starts with a "your"/"my" prefix.
    """
    candidate = value.strip("'\"<>").lower()
    if candidate in _PLACEHOLDER_VALUES or len(set(candidate)) <= 2:
        return True
    # Docs frequently embed credential-shaped strings with telltale words;
    # skipping them is preferred over redacting innocent documentation.
    if any(marker in candidate for marker in _PLACEHOLDER_SUBSTRINGS):
        return True
    return _starts_with_placeholder_prefix(candidate)
186
+
187
+
188
def redact_secrets(
    text: str,
    *,
    extra_patterns: list[tuple[re.Pattern, str]] | None = None,
) -> tuple[str, list[str]]:
    """Swap credential-shaped substrings for ``[REDACTED:LABEL]`` markers.

    Args:
        text: Content to scan. Falsy input is returned untouched.
        extra_patterns: Optional (compiled regex, label) pairs applied
            after the built-in ``_CONTENT_PATTERNS``.

    Returns:
        ``(redacted_text, labels)``. ``labels`` holds one entry per
        redaction performed, in the order they happened, so callers can
        record telemetry without seeing the secret. An empty list means
        nothing was redacted.

    When a match has capture groups, the last matched group is taken as
    the secret value: only that text is replaced inside the match, so the
    variable name and assignment syntax are preserved. Without capture
    groups the whole match is replaced. Values that `_is_placeholder`
    recognises are left as they are.
    """
    if not text:
        return text, []

    active = list(_CONTENT_PATTERNS)
    if extra_patterns:
        active += extra_patterns
    hits: list[str] = []

    def _replacer_for(label: str):
        """Build the substitution callback bound to one pattern label."""
        token = f"[REDACTED:{label}]"

        def _replace(match: re.Match) -> str:
            whole = match.group(0)
            group_no = match.lastindex
            secret = match.group(group_no) if group_no else whole
            if _is_placeholder(secret):
                return whole
            hits.append(label)
            if group_no:
                # Keep the surrounding syntax, scrub the value text.
                return whole.replace(secret, token)
            return token

        return _replace

    redacted = text
    for pattern, label in active:
        redacted = pattern.sub(_replacer_for(label), redacted)

    return redacted, hits
230
+
231
+
232
+ # ── PII patterns ────────────────────────────────────────────────────────────
233
+ # Used by memory.db writes (decisions / turn summaries / code areas) so
234
+ # personal data captured during a session doesn't end up persisted in
235
+ # searchable form. Lighter touch than secret detection — only the most
236
+ # unambiguous patterns. Free-form text shouldn't be aggressively scrubbed.
237
+
238
+ _PII_PATTERNS: list[tuple[re.Pattern, str]] = [
239
+ # Email addresses — simple but effective. Matches RFC-mostly-compliant
240
+ # addresses; over-matches a tiny bit (won't reject quoted local-parts)
241
+ # but that's fine for redaction.
242
+ (re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"), "EMAIL"),
243
+ # IPv4 — four 1-3 digit groups. Won't match localhost or 127.0.0.1
244
+ # specifically because those are useful in dev notes.
245
+ (re.compile(
246
+ r"\b(?!127\.0\.0\.1\b|0\.0\.0\.0\b|10\.0\.0\.1\b|192\.168\.\d+\.\d+\b)"
247
+ r"(?:[1-9]\d?|1\d{2}|2[0-4]\d|25[0-5])"
248
+ r"(?:\.(?:\d{1,3})){3}\b"
249
+ ), "IPV4"),
250
+ # Credit-card-shaped 13-19 digit runs (with optional spaces/dashes).
251
+ # Filtered through Luhn check to avoid false positives on order
252
+ # numbers, hashes, etc.
253
+ (re.compile(r"\b(?:\d[ -]?){13,19}\b"), "CREDIT_CARD"),
254
+ # US-style SSN.
255
+ (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "SSN"),
256
+ # E.164 phone numbers (with leading +).
257
+ (re.compile(r"\+\d{1,3}[ -]?\(?\d{1,4}\)?[ -]?\d{3,4}[ -]?\d{3,4}\b"), "PHONE_E164"),
258
+ ]
259
+
260
+
261
+ def _passes_luhn(digits: str) -> bool:
262
+ """Luhn check for credit-card validation. Skips invalid candidates so
263
+ we don't redact every long number. Strips non-digits first.
264
+ """
265
+ digits = re.sub(r"\D", "", digits)
266
+ if not (13 <= len(digits) <= 19):
267
+ return False
268
+ total = 0
269
+ parity = len(digits) % 2
270
+ for i, ch in enumerate(digits):
271
+ n = int(ch)
272
+ if i % 2 == parity:
273
+ n *= 2
274
+ if n > 9:
275
+ n -= 9
276
+ total += n
277
+ return total % 10 == 0
278
+
279
+
280
def redact_pii(
    text: str,
    *,
    extra_patterns: list[tuple[re.Pattern, str]] | None = None,
) -> tuple[str, list[str]]:
    """Swap common PII for ``[REDACTED:LABEL]`` markers.

    Args:
        text: Content to scan. Falsy input is returned untouched.
        extra_patterns: Optional (compiled regex, label) pairs applied
            after the built-in ``_PII_PATTERNS``.

    Returns:
        ``(redacted_text, labels)`` with one label per redaction, in the
        order performed; same shape as `redact_secrets`.

    Matches labelled ``CREDIT_CARD`` are only redacted when they pass the
    Luhn check, so arbitrary long numbers are left alone. Every other
    pattern's matches are replaced unconditionally.
    """
    if not text:
        return text, []

    active = list(_PII_PATTERNS)
    if extra_patterns:
        active += extra_patterns
    hits: list[str] = []

    def _replacer_for(label: str):
        """Build the substitution callback bound to one pattern label."""
        token = f"[REDACTED:{label}]"
        needs_luhn = label == "CREDIT_CARD"

        def _replace(match: re.Match) -> str:
            candidate = match.group(0)
            if needs_luhn and not _passes_luhn(candidate):
                return candidate
            hits.append(label)
            return token

        return _replace

    redacted = text
    for pattern, label in active:
        redacted = pattern.sub(_replacer_for(label), redacted)

    return redacted, hits
313
+
314
+
315
+ # ── Convenience: combined check for the indexer ────────────────────────────
316
+
317
def scan_and_redact(
    file_path: Path,
    content: str,
    *,
    extra_patterns: list[tuple[re.Pattern, str]] | None = None,
) -> tuple[str | None, list[str]]:
    """Run the filename check, then content redaction, for one file.

    Args:
        file_path: Path of the file; only used for the filename check.
        content: The file's text.
        extra_patterns: Forwarded to `redact_secrets`.

    Returns:
        ``(None, ["SECRET_FILENAME"])`` when `is_secret_file` flags the
        path, meaning the file should be skipped entirely. Otherwise the
        ``(redacted_text, labels)`` pair produced by `redact_secrets`.
    """
    if not is_secret_file(file_path):
        return redact_secrets(content, extra_patterns=extra_patterns)
    return None, ["SECRET_FILENAME"]
@@ -0,0 +1,109 @@
1
+ """File watcher with debouncing using watchdog.
2
+
3
+ Watches a directory for file changes and triggers an async callback
4
+ after a debounce period. Used by `cce serve` to keep the index
5
+ up-to-date as files are saved.
6
+ """
7
+ import asyncio
8
+ import logging
9
+ import threading
10
+ import time
11
+ from pathlib import Path
12
+
13
+ from watchdog.observers import Observer
14
+ from watchdog.events import FileSystemEventHandler
15
+
16
+ log = logging.getLogger(__name__)
17
+
18
+
19
class _DebouncedHandler(FileSystemEventHandler):
    """Collect file events and hand the affected paths to an async callback.

    Each accepted event (re)starts a single timer. When the timer fires
    after ``debounce_ms`` without further events, every pending path is
    submitted to ``on_change`` on the supplied asyncio loop.

    Args:
        on_change: Callable taking a path and returning a coroutine.
        debounce_ms: Quiet period in milliseconds before pending paths flush.
        ignore_patterns: Names compared against each path component.
        watch_dir: Root directory; paths are made relative to it for the
            ignore check.
        loop: asyncio event loop the callback coroutines are scheduled on.
    """

    # Event types that only signal read access. Reacting to them would
    # re-run the callback every time a watched file is merely opened.
    _READ_ONLY_EVENT_TYPES = frozenset({"opened", "closed_no_write"})

    def __init__(self, on_change, debounce_ms, ignore_patterns, watch_dir, loop):
        self._on_change = on_change
        self._debounce_s = debounce_ms / 1000.0
        self._ignore_set = set(ignore_patterns)
        self._watch_dir = Path(watch_dir)
        self._loop = loop
        self._pending: dict[str, float] = {}
        # Guards _pending and _timer; events arrive on the observer thread
        # while the flush runs on a timer thread.
        self._lock = threading.Lock()
        self._timer: threading.Timer | None = None

    def _should_ignore(self, path: str) -> bool:
        """Return True if any component of *path* under the watch dir is ignored.

        Paths that are not located under the watch directory are never
        ignored. The ".cce" component is always ignored so the engine's own
        storage files do not trigger callbacks.
        """
        try:
            rel = Path(path).relative_to(self._watch_dir)
        except ValueError:
            return False
        return any(
            part == ".cce" or part in self._ignore_set for part in rel.parts
        )

    def on_any_event(self, event):
        """Record the paths touched by *event* and restart the debounce timer."""
        if event.is_directory:
            return
        if getattr(event, "event_type", None) in self._READ_ONLY_EVENT_TYPES:
            return
        # A move carries two paths: the source (now gone) and the
        # destination (the file that now exists). Editors that save
        # atomically write a temp file and rename it over the target, so
        # the destination is the path that actually changed.
        candidates = [event.src_path]
        dest_path = getattr(event, "dest_path", "")
        if dest_path:
            candidates.append(dest_path)
        changed = [p for p in candidates if not self._should_ignore(p)]
        if not changed:
            return
        with self._lock:
            now = time.time()
            for path in changed:
                self._pending[path] = now
            if self._timer:
                self._timer.cancel()
            self._timer = threading.Timer(self._debounce_s, self._flush)
            self._timer.start()

    def _flush(self):
        """Submit every pending path to the callback on the event loop."""
        with self._lock:
            paths = list(self._pending)
            self._pending.clear()
        for path in paths:
            coro = self._on_change(path)
            try:
                future = asyncio.run_coroutine_threadsafe(coro, self._loop)
            except RuntimeError:
                # Loop closed: we are shutting down. Close the coroutine
                # explicitly so it is not reported as "never awaited".
                coro.close()
                continue
            future.add_done_callback(self._log_failure)

    @staticmethod
    def _log_failure(future):
        """Log an exception raised by the callback instead of dropping it."""
        if future.cancelled():
            return
        exc = future.exception()
        if exc is not None:
            log.error("File-change callback failed: %s", exc, exc_info=exc)
67
+
68
+
69
class FileWatcher:
    """Recursively watch a directory and invoke an async callback on changes.

    Args:
        watch_dir: Directory to observe.
        on_change: Callable taking a path and returning a coroutine.
        debounce_ms: Quiet period in milliseconds before callbacks fire.
        ignore_patterns: Path component names to ignore; defaults to none.
    """

    def __init__(self, watch_dir, on_change, debounce_ms=500, ignore_patterns=None):
        self._observer = None
        self._handler = None
        self._watch_dir = watch_dir
        self._on_change = on_change
        self._debounce_ms = debounce_ms
        self._ignore_patterns = ignore_patterns or []

    @staticmethod
    def _current_loop():
        """Return the running loop, or the thread's event loop if none runs."""
        try:
            return asyncio.get_running_loop()
        except RuntimeError:
            return asyncio.get_event_loop()

    def start(self, loop=None):
        """Create the handler and observer and begin watching.

        Pass the running asyncio loop explicitly when calling from outside
        it; when *loop* is None the current loop is looked up instead.
        """
        target_loop = self._current_loop() if loop is None else loop
        self._handler = _DebouncedHandler(
            on_change=self._on_change,
            debounce_ms=self._debounce_ms,
            ignore_patterns=self._ignore_patterns,
            watch_dir=self._watch_dir,
            loop=target_loop,
        )
        self._observer = Observer()
        self._observer.schedule(self._handler, self._watch_dir, recursive=True)
        # Daemon thread: the observer must not keep the process alive.
        self._observer.daemon = True
        self._observer.start()
        log.debug("Watcher started for %s", self._watch_dir)

    def stop(self):
        """Cancel any pending debounce timer and stop the observer thread."""
        handler = self._handler
        if handler:
            with handler._lock:
                pending_timer = handler._timer
                if pending_timer:
                    pending_timer.cancel()
                    handler._timer = None
        observer = self._observer
        if observer:
            observer.stop()
            # Bounded wait so shutdown cannot hang on the observer thread.
            observer.join(timeout=2)
        log.debug("Watcher stopped")
File without changes
@@ -0,0 +1,76 @@
1
+ """Bootstrap context builder — generates compressed project context for session start."""
2
+ from importlib.metadata import version as pkg_version
3
+
4
+ from context_engine.models import Chunk, ConfidenceLevel
5
+
6
+ _CHARS_PER_TOKEN = 4
7
+
8
+
9
+ def _get_version() -> str:
10
+ try:
11
+ return pkg_version("code-context-engine")
12
+ except Exception:
13
+ return "unknown"
14
+
15
+
16
class BootstrapBuilder:
    """Assemble the markdown context payload emitted at session start.

    Args:
        max_tokens: Budget for the payload. It is converted to a character
            budget by multiplying with ``_CHARS_PER_TOKEN``.
    """

    # Appended when the payload had to be cut to fit the budget.
    _TRUNCATION_NOTICE = "\n\n[Context truncated to fit token limit]"

    def __init__(self, max_tokens: int = 10000) -> None:
        self._max_chars = max_tokens * _CHARS_PER_TOKEN

    def build(self, project_name, chunks=None, recent_commits=None,
              active_decisions=None, working_state=None, chunk_count=0,
              project_commands_text=None):
        """Build the full payload as one markdown string.

        Sections appear in this order: project header with status line,
        architecture, recent activity, then (each only when provided)
        working state, project commands, active decisions, and additional
        medium-confidence code context. The result is cut to the character
        budget when it is too long.

        Args:
            project_name: Name shown in the header.
            chunks: Indexed chunks used for the architecture and
                additional-context sections.
            recent_commits: Commit summary lines.
            active_decisions: Items listed under "Active Context".
            working_state: Lines listed under "Working State".
            chunk_count: Number shown in the status line; 0 renders
                "no chunks indexed yet".
            project_commands_text: Pre-rendered section inserted verbatim.

        Returns:
            The payload string.
        """
        ver = _get_version()
        if chunk_count:
            status_line = f"CCE v{ver} · {chunk_count} chunks indexed"
        else:
            status_line = f"CCE v{ver} · no chunks indexed yet"
        sections = [
            f"## Project: {project_name}\n`{status_line}`",
            self._build_architecture(chunks or []),
            self._build_activity(recent_commits or []),
        ]
        if working_state:
            state_text = "\n".join(f" {line}" for line in working_state)
            sections.append(f"### Working State\n{state_text}")
        if project_commands_text:
            sections.append(project_commands_text)
        if active_decisions:
            decisions_text = "\n".join(f"- {d}" for d in active_decisions)
            sections.append(f"### Active Context\n{decisions_text}")
        code_section = self._build_code_context(chunks or [])
        if code_section:
            sections.append(code_section)
        return self._truncate("\n\n".join(sections))

    def _truncate(self, payload):
        """Cut *payload* so that it, including the notice, fits the budget.

        Room for the truncation notice is reserved before cutting, so the
        returned text does not exceed ``_max_chars`` whenever the budget is
        at least as long as the notice. For smaller (or negative) budgets
        only the notice is returned.
        """
        if len(payload) <= self._max_chars:
            return payload
        keep = max(0, self._max_chars - len(self._TRUNCATION_NOTICE))
        return payload[:keep] + self._TRUNCATION_NOTICE

    def _build_architecture(self, chunks):
        """Render high-confidence chunks grouped by file, files sorted by path."""
        high_conf = [
            c for c in chunks
            if ConfidenceLevel.from_score(c.confidence_score) == ConfidenceLevel.HIGH
        ]
        if not high_conf:
            return "### Architecture\nNo indexed context available yet."
        by_file = {}
        for chunk in high_conf:
            by_file.setdefault(chunk.file_path, []).append(chunk)
        lines = ["### Architecture"]
        for file_path, file_chunks in sorted(by_file.items()):
            lines.append(f"\n**{file_path}:**")
            for chunk in file_chunks:
                # Prefer the compressed form; otherwise a 200-char excerpt.
                text = chunk.compressed_content or chunk.content[:200]
                lines.append(f"- {text}")
        return "\n".join(lines)

    def _build_activity(self, commits):
        """Render up to ten commit lines as a bullet list."""
        if not commits:
            return "### Recent Activity\nNo recent commits."
        lines = ["### Recent Activity"]
        lines.extend(f"- {commit}" for commit in commits[:10])
        return "\n".join(lines)

    def _build_code_context(self, chunks):
        """Render up to twenty medium-confidence chunks, or "" when none."""
        medium_conf = [
            c for c in chunks
            if ConfidenceLevel.from_score(c.confidence_score) == ConfidenceLevel.MEDIUM
        ]
        if not medium_conf:
            return ""
        lines = ["### Additional Context (may need drill-down)"]
        for chunk in medium_conf[:20]:
            # Prefer the compressed form; otherwise a 150-char excerpt.
            text = chunk.compressed_content or chunk.content[:150]
            lines.append(f"- [{chunk.file_path}] {text}")
        return "\n".join(lines)
@@ -0,0 +1,132 @@
1
+ """Git helpers for session-start context — recent commits, working state, modified files.
2
+
3
+ All functions gracefully return empty results when the project is not a git
4
+ repository, so CCE works for non-git projects too.
5
+ """
6
+ import subprocess
7
+ from pathlib import Path
8
+
9
+
10
+ def _is_git_repo(project_dir: str) -> bool:
11
+ """Return True if project_dir is inside a git work tree."""
12
+ try:
13
+ result = subprocess.run(
14
+ ["git", "rev-parse", "--is-inside-work-tree"],
15
+ cwd=project_dir,
16
+ capture_output=True,
17
+ text=True,
18
+ timeout=5,
19
+ )
20
+ return result.returncode == 0
21
+ except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
22
+ return False
23
+
24
+
25
+ def _run_git(args: list[str], cwd: str) -> str:
26
+ """Run a git command and return stdout, or empty string on failure."""
27
+ try:
28
+ result = subprocess.run(
29
+ ["git", *args],
30
+ cwd=cwd,
31
+ capture_output=True,
32
+ text=True,
33
+ timeout=5,
34
+ )
35
+ return result.stdout.strip() if result.returncode == 0 else ""
36
+ except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
37
+ return ""
38
+
39
+
40
def get_recent_commits(project_dir: str, count: int = 10) -> list[str]:
    """Return up to *count* most recent commits as ``git log --oneline`` lines.

    Yields an empty list when *project_dir* is not a git work tree or the
    log command produces no output.
    """
    if not _is_git_repo(project_dir):
        return []
    log_output = _run_git(["log", "--oneline", f"-{count}"], cwd=project_dir)
    if not log_output:
        return []
    return log_output.splitlines()
49
+
50
+
51
def get_working_state(project_dir: str) -> list[str]:
    """Summarise branch, upstream divergence and uncommitted changes.

    Returns a list of display lines, empty when *project_dir* is not a git
    work tree. Sections are emitted only when git reports something:
    current branch, ahead/behind counts against the upstream, up to ten
    staged entries, up to ten unstaged entries, and the untracked count.
    """
    if not _is_git_repo(project_dir):
        return []

    summary: list[str] = []

    current = _run_git(["branch", "--show-current"], cwd=project_dir)
    if current:
        summary.append(f"Branch: {current}")
        # `--left-right --count A...B` prints "<only in A> <only in B>",
        # i.e. commits we are behind followed by commits we are ahead.
        counts = _run_git(
            ["rev-list", "--left-right", "--count", f"{current}@{{upstream}}...HEAD"],
            cwd=project_dir,
        ).split()
        if len(counts) == 2:
            try:
                behind, ahead = int(counts[0]), int(counts[1])
            except ValueError:
                pass
            else:
                if ahead > 0:
                    summary.append(f"Ahead of remote by {ahead} commit(s)")
                if behind > 0:
                    summary.append(f"Behind remote by {behind} commit(s)")

    # Staged first, then unstaged; each listing is capped at ten entries.
    for diff_args, heading in (
        (["diff", "--cached", "--name-status"], "Staged:"),
        (["diff", "--name-status"], "Modified (unstaged):"),
    ):
        listing = _run_git(diff_args, cwd=project_dir)
        if listing:
            summary.append(heading)
            summary.extend(f" {entry}" for entry in listing.splitlines()[:10])

    # Untracked files are only counted, not listed.
    untracked = _run_git(["ls-files", "--others", "--exclude-standard"], cwd=project_dir)
    if untracked:
        n = len(untracked.splitlines())
        summary.append(f"Untracked files: {n}")

    return summary
102
+
103
+
104
def get_recently_modified_files(project_dir: str, log_depth: int = 5) -> list[str]:
    """Return paths recently modified in git, relative to *project_dir*.

    Combines files changed in the working tree with files touched by the
    last *log_depth* commits. Duplicates are dropped (first occurrence
    wins), paths that no longer exist under *project_dir* are filtered
    out, and at most 15 entries are returned. Returns an empty list when
    *project_dir* is not a git work tree.
    """
    if not _is_git_repo(project_dir):
        return []

    # `-c core.quotePath=false` stops git from C-quoting non-ASCII names.
    # `--relative` makes git print paths relative to the working directory
    # (and restricts output to it) instead of relative to the repository
    # root. Both are needed for the existence check below to resolve the
    # reported names when project_dir is a subdirectory of the repository
    # or contains non-ASCII file names.
    git_prefix = ["-c", "core.quotePath=false"]

    candidates: list[str] = []

    # Files changed in the working tree
    wt_files = _run_git(
        [*git_prefix, "diff", "--relative", "--name-only"],
        cwd=project_dir,
    )
    if wt_files:
        candidates.extend(wt_files.splitlines())

    # Files changed in recent commits; the empty pretty format leaves only
    # the file names, separated by blank lines.
    commit_files = _run_git(
        [*git_prefix, "log", f"-{log_depth}", "--relative",
         "--pretty=format:", "--name-only"],
        cwd=project_dir,
    )
    if commit_files:
        candidates.extend(f for f in commit_files.splitlines() if f.strip())

    # dict.fromkeys de-duplicates while preserving first-seen order.
    root = Path(project_dir)
    existing = [f for f in dict.fromkeys(candidates) if (root / f).exists()]
    return existing[:15]