code-context-engine 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context_engine-0.4.0.dist-info/METADATA +389 -0
- code_context_engine-0.4.0.dist-info/RECORD +63 -0
- code_context_engine-0.4.0.dist-info/WHEEL +5 -0
- code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
- code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
- code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
- context_engine/__init__.py +3 -0
- context_engine/cli.py +2848 -0
- context_engine/cli_style.py +66 -0
- context_engine/compression/__init__.py +0 -0
- context_engine/compression/compressor.py +144 -0
- context_engine/compression/ollama_client.py +33 -0
- context_engine/compression/output_rules.py +77 -0
- context_engine/compression/prompts.py +9 -0
- context_engine/compression/quality.py +37 -0
- context_engine/config.py +198 -0
- context_engine/dashboard/__init__.py +0 -0
- context_engine/dashboard/_page.py +1548 -0
- context_engine/dashboard/server.py +429 -0
- context_engine/editors.py +265 -0
- context_engine/event_bus.py +24 -0
- context_engine/indexer/__init__.py +0 -0
- context_engine/indexer/chunker.py +147 -0
- context_engine/indexer/embedder.py +154 -0
- context_engine/indexer/embedding_cache.py +168 -0
- context_engine/indexer/git_hooks.py +73 -0
- context_engine/indexer/git_indexer.py +136 -0
- context_engine/indexer/ignorefile.py +96 -0
- context_engine/indexer/manifest.py +78 -0
- context_engine/indexer/pipeline.py +624 -0
- context_engine/indexer/secrets.py +332 -0
- context_engine/indexer/watcher.py +109 -0
- context_engine/integration/__init__.py +0 -0
- context_engine/integration/bootstrap.py +76 -0
- context_engine/integration/git_context.py +132 -0
- context_engine/integration/mcp_server.py +1825 -0
- context_engine/integration/session_capture.py +306 -0
- context_engine/memory/__init__.py +6 -0
- context_engine/memory/compressor.py +344 -0
- context_engine/memory/db.py +922 -0
- context_engine/memory/extractive.py +106 -0
- context_engine/memory/grammar.py +419 -0
- context_engine/memory/hook_installer.py +258 -0
- context_engine/memory/hook_server.py +83 -0
- context_engine/memory/hooks.py +327 -0
- context_engine/memory/migrate.py +268 -0
- context_engine/models.py +96 -0
- context_engine/pricing.py +104 -0
- context_engine/project_commands.py +296 -0
- context_engine/retrieval/__init__.py +0 -0
- context_engine/retrieval/confidence.py +47 -0
- context_engine/retrieval/query_parser.py +105 -0
- context_engine/retrieval/retriever.py +199 -0
- context_engine/serve_http.py +208 -0
- context_engine/services.py +252 -0
- context_engine/storage/__init__.py +0 -0
- context_engine/storage/backend.py +39 -0
- context_engine/storage/fts_store.py +112 -0
- context_engine/storage/graph_store.py +219 -0
- context_engine/storage/local_backend.py +109 -0
- context_engine/storage/remote_backend.py +117 -0
- context_engine/storage/vector_store.py +357 -0
- context_engine/utils.py +72 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""Secret detection at index time.
|
|
2
|
+
|
|
3
|
+
Two layers:
|
|
4
|
+
|
|
5
|
+
1. **Filename-based skipping** — files whose names match well-known
|
|
6
|
+
credential patterns (.env*, *.pem, secrets.yml, …) are never read,
|
|
7
|
+
never embedded, never served. This is the cheap first line of
|
|
8
|
+
defence and catches the most common leak.
|
|
9
|
+
|
|
10
|
+
2. **Content-based redaction** — for files that DO get indexed, lines
|
|
11
|
+
containing what look like AWS keys, GitHub tokens, JWTs, etc. get
|
|
12
|
+
replaced with `[REDACTED:<reason>]` before chunking. The chunker
|
|
13
|
+
and embedder never see the secret value.
|
|
14
|
+
|
|
15
|
+
Both layers are conservative on purpose — false positives (over-redaction
|
|
16
|
+
of innocent code) are recoverable; false negatives (a real secret leaked
|
|
17
|
+
into the vector DB) are not. When in doubt, redact.
|
|
18
|
+
|
|
19
|
+
Tunable via config:
|
|
20
|
+
· indexer.redact_secrets (default: True) — master switch.
|
|
21
|
+
· indexer.secret_extra_patterns (default: []) — user-added regexes
|
|
22
|
+
for content scanning, merged with the built-in set.
|
|
23
|
+
"""
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import re
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
# ── Filename-level skip list ────────────────────────────────────────────────
|
|
30
|
+
# Glob-style suffixes / exact names that almost always mean "credentials".
|
|
31
|
+
# Match is case-insensitive against the full filename (not full path).
|
|
32
|
+
|
|
33
|
+
# Exact filenames (case-insensitive). Entire file is skipped.
|
|
34
|
+
_SECRET_FILENAMES = frozenset({
|
|
35
|
+
# Dotenv family (covers .env, .env.local, .env.production, etc. — see _SECRET_PREFIXES)
|
|
36
|
+
".npmrc", ".pypirc", ".netrc",
|
|
37
|
+
# Cloud / infra
|
|
38
|
+
"credentials.json", "credentials.yaml", "credentials.yml",
|
|
39
|
+
"secrets.json", "secrets.yaml", "secrets.yml",
|
|
40
|
+
"service-account.json", "gcp-key.json",
|
|
41
|
+
"kube-config", "kubeconfig",
|
|
42
|
+
# CI / app config that frequently holds tokens
|
|
43
|
+
"auth.json",
|
|
44
|
+
# Git config can carry remote tokens
|
|
45
|
+
".git-credentials",
|
|
46
|
+
# SSH private keys frequently sit extension-less in ~/.ssh.
|
|
47
|
+
"id_rsa", "id_dsa", "id_ecdsa", "id_ed25519", "id_xmss",
|
|
48
|
+
})
|
|
49
|
+
|
|
50
|
+
# Filename starts with any of these → skip (handles .env, .env.local, etc.).
|
|
51
|
+
_SECRET_PREFIXES = (".env",)
|
|
52
|
+
|
|
53
|
+
# File extensions whose presence is a strong signal of a key/cert. Skip outright.
|
|
54
|
+
_SECRET_EXTENSIONS = frozenset({
|
|
55
|
+
".pem", ".key", ".crt", ".cer", ".der",
|
|
56
|
+
".p12", ".pfx", # PKCS#12 cert bundles
|
|
57
|
+
".jks", ".keystore", # Java keystores
|
|
58
|
+
".pgp", ".asc", ".gpg", # PGP keys
|
|
59
|
+
".kdbx", # KeePass
|
|
60
|
+
".ppk", # PuTTY private keys
|
|
61
|
+
})
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def is_secret_file(path: Path) -> bool:
    """Decide from the filename alone whether a file should be treated as credentials.

    Only ``path.name`` and ``path.suffix`` are consulted, both lower-cased,
    so the directory part of the path is irrelevant and matching is
    case-insensitive.

    Returns:
        True when the basename is one of ``_SECRET_FILENAMES``, the suffix
        is one of ``_SECRET_EXTENSIONS``, or the basename starts with one
        of ``_SECRET_PREFIXES``; False otherwise.
    """
    basename = path.name.lower()
    extension = path.suffix.lower()
    return (
        basename in _SECRET_FILENAMES
        or extension in _SECRET_EXTENSIONS
        or any(basename.startswith(lead) for lead in _SECRET_PREFIXES)
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ── Content-level redaction ─────────────────────────────────────────────────
|
|
82
|
+
# Patterns are tuples of (regex, label). Label appears in the redaction
|
|
83
|
+
# placeholder so users can tell *what kind* of secret was scrubbed without
|
|
84
|
+
# leaking the value itself.
|
|
85
|
+
#
|
|
86
|
+
# Conservative ordering: patterns earlier in the list win — specific
|
|
87
|
+
# vendor formats before generic high-entropy heuristics.
|
|
88
|
+
|
|
89
|
+
_CONTENT_PATTERNS: list[tuple[re.Pattern, str]] = [
|
|
90
|
+
# AWS access keys — fixed prefix + 16 base32 chars. Non-capturing
|
|
91
|
+
# group on the prefix so the whole match is the credential value
|
|
92
|
+
# (otherwise we'd only replace AKIA and leak the 16-char suffix).
|
|
93
|
+
(re.compile(r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b"), "AWS_ACCESS_KEY"),
|
|
94
|
+
# AWS secret keys — 40 base64 chars after "aws_secret_access_key"-ish context.
|
|
95
|
+
(re.compile(
|
|
96
|
+
r"(?i)aws_secret_access_key\s*[:=]\s*['\"]?([A-Za-z0-9/+=]{40})['\"]?"
|
|
97
|
+
), "AWS_SECRET_KEY"),
|
|
98
|
+
# GitHub tokens (classic + fine-grained + app + OAuth).
|
|
99
|
+
(re.compile(r"\bghp_[A-Za-z0-9]{36}\b"), "GITHUB_PAT"),
|
|
100
|
+
(re.compile(r"\bgithub_pat_[A-Za-z0-9_]{82}\b"), "GITHUB_FINE_GRAINED_PAT"),
|
|
101
|
+
(re.compile(r"\b(ghs|gho|ghu|ghr)_[A-Za-z0-9]{36}\b"), "GITHUB_OAUTH"),
|
|
102
|
+
# Slack tokens.
|
|
103
|
+
(re.compile(r"\bxox[abprs]-[A-Za-z0-9-]{10,}\b"), "SLACK_TOKEN"),
|
|
104
|
+
# Stripe live keys (test keys are deliberately not matched — they're
|
|
105
|
+
# safe to commit and matching them would over-redact every Stripe
|
|
106
|
+
# quickstart in the wild).
|
|
107
|
+
(re.compile(r"\b(sk|rk)_live_[A-Za-z0-9]{24,}\b"), "STRIPE_LIVE_KEY"),
|
|
108
|
+
# OpenAI / Anthropic API keys.
|
|
109
|
+
(re.compile(r"\bsk-[A-Za-z0-9]{20}T3BlbkFJ[A-Za-z0-9]{20}\b"), "OPENAI_KEY"),
|
|
110
|
+
(re.compile(r"\bsk-ant-(api03|admin01)-[A-Za-z0-9_\-]{93,}\b"), "ANTHROPIC_KEY"),
|
|
111
|
+
# Google API keys.
|
|
112
|
+
(re.compile(r"\bAIza[0-9A-Za-z_\-]{35}\b"), "GOOGLE_API_KEY"),
|
|
113
|
+
# Generic JWT — three base64url segments separated by dots.
|
|
114
|
+
(re.compile(r"\beyJ[A-Za-z0-9_\-]+\.eyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\b"), "JWT"),
|
|
115
|
+
# Private key blocks (catch even if filename slipped past the skip list).
|
|
116
|
+
(re.compile(
|
|
117
|
+
r"-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]+?-----END [A-Z ]*PRIVATE KEY-----"
|
|
118
|
+
), "PRIVATE_KEY_BLOCK"),
|
|
119
|
+
# Generic high-signal "looks like a secret" heuristic — variable
|
|
120
|
+
# named after a credential, assigned to a long opaque string. Skips
|
|
121
|
+
# placeholders ("xxx", "your-key-here", "<insert>") so the typical
|
|
122
|
+
# README example doesn't trigger a redaction.
|
|
123
|
+
#
|
|
124
|
+
# The keyword is matched anywhere on the line (no `^` anchor) so
|
|
125
|
+
# patterns like `config["password"] = "..."` and dict literals fire.
|
|
126
|
+
# Word boundary on the left edge keeps "ssh_password_dialog" from
|
|
127
|
+
# matching as a fake password assignment.
|
|
128
|
+
(re.compile(
|
|
129
|
+
r"(?i)\b(?:password|passwd|secret|api[_-]?key|access[_-]?token|"
|
|
130
|
+
r"private[_-]?key|auth[_-]?token|client[_-]?secret)\b"
|
|
131
|
+
r"['\"\]\s]*[:=]\s*['\"]([^'\"\s]{16,})['\"]"
|
|
132
|
+
), "GENERIC_CREDENTIAL"),
|
|
133
|
+
]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# Common placeholder values that should NOT be redacted even if they
|
|
137
|
+
# match the generic pattern. Reduces noise in templates / README files.
|
|
138
|
+
_PLACEHOLDER_VALUES = frozenset({
|
|
139
|
+
"your-api-key", "your_api_key", "your-key-here", "your_key_here",
|
|
140
|
+
"<your-key>", "<api-key>", "<your_key>", "<api_key>",
|
|
141
|
+
"xxxxxxxxxxxxxxxx", "0000000000000000",
|
|
142
|
+
"changeme", "change-me", "change_me",
|
|
143
|
+
"placeholder", "example", "sample",
|
|
144
|
+
})
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
_PLACEHOLDER_SUBSTRINGS = (
|
|
148
|
+
"placeholder", "example", "fake", "dummy", "sample",
|
|
149
|
+
"changeme", "change-me", "change_me", "not_real",
|
|
150
|
+
"not-real", "redacted", "<your", "<api",
|
|
151
|
+
# README phrasing variants
|
|
152
|
+
"your_key", "your-key", "your_secret", "your-secret",
|
|
153
|
+
"your_token", "your-token", "your_api", "your-api",
|
|
154
|
+
"your_password", "your-password",
|
|
155
|
+
"replace_with", "replace-with", "insert_your", "insert-your",
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _starts_with_placeholder_prefix(value: str) -> bool:
|
|
160
|
+
# Any string that opens with "your-" / "your_" / "my-" / "my_" is a
|
|
161
|
+
# tutorial-style placeholder, not a credential. README examples like
|
|
162
|
+
# "your-api-key-here" or "my_secret_value" all match.
|
|
163
|
+
for prefix in ("your-", "your_", "my-", "my_"):
|
|
164
|
+
if value.startswith(prefix):
|
|
165
|
+
return True
|
|
166
|
+
return False
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _is_placeholder(value: str) -> bool:
    """Return True when ``value`` looks like a documentation placeholder.

    The value is first stripped of surrounding quotes and angle brackets
    and lower-cased. It is then considered a placeholder if any of the
    following holds:

    * it is listed verbatim in ``_PLACEHOLDER_VALUES``;
    * it is built from at most two distinct characters ("xxxxxxxx",
      "00000000" — note the empty string also qualifies);
    * it contains one of the telltale ``_PLACEHOLDER_SUBSTRINGS``;
    * it starts with a "your-" / "my-" style prefix.
    """
    candidate = value.strip("'\"<>").lower()

    if candidate in _PLACEHOLDER_VALUES:
        return True

    # Real keys have far more character variety than filler strings do.
    distinct_chars = set(candidate)
    if len(distinct_chars) <= 2:
        return True

    # Docs and READMEs embed credential-shaped strings with giveaway words;
    # skipping those is preferred over redacting harmless documentation.
    if any(marker in candidate for marker in _PLACEHOLDER_SUBSTRINGS):
        return True

    return _starts_with_placeholder_prefix(candidate)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def redact_secrets(
    text: str,
    *,
    extra_patterns: list[tuple[re.Pattern, str]] | None = None,
) -> tuple[str, list[str]]:
    """Replace credential-shaped substrings with `[REDACTED:LABEL]`.

    Args:
        text: Content to scan. Empty / falsy input is returned unchanged.
        extra_patterns: Optional additional ``(compiled_regex, label)``
            pairs, applied after the built-in ``_CONTENT_PATTERNS``.

    Returns:
        ``(redacted_text, labels)``. ``labels`` has one entry per
        redaction performed (so a label repeats if its pattern fired more
        than once), letting callers record telemetry without leaking the
        secret value. An empty list means nothing was redacted.

    Capture-group contract: when a match has capture groups, the last
    matched group is taken to be the credential value — only that span is
    replaced, and the surrounding text (variable name, assignment syntax,
    quotes) is preserved. Patterns without capture groups have the whole
    match replaced. Matches whose value looks like a placeholder are left
    untouched.
    """
    if not text:
        return text, []
    patterns = list(_CONTENT_PATTERNS)
    if extra_patterns:
        patterns.extend(extra_patterns)
    out = text
    fired: list[str] = []

    for pattern, label in patterns:
        # `label` is bound as a default argument so each closure keeps its
        # own label rather than the loop variable's final value.
        def _sub(match: re.Match, _label: str = label) -> str:
            whole = match.group(0)
            marker = f"[REDACTED:{_label}]"
            group_idx = match.lastindex
            if group_idx:
                value = match.group(group_idx)
                if _is_placeholder(value):
                    return whole
                fired.append(_label)
                # Splice by the group's span instead of str.replace():
                # replace() would also rewrite any other occurrence of
                # the value inside the match (e.g. in the preserved key
                # name), corrupting the context we promise to keep.
                offset = match.start()
                head = whole[: match.start(group_idx) - offset]
                tail = whole[match.end(group_idx) - offset:]
                return head + marker + tail
            if _is_placeholder(whole):
                return whole
            fired.append(_label)
            return marker

        out = pattern.sub(_sub, out)

    return out, fired
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# ── PII patterns ────────────────────────────────────────────────────────────
|
|
233
|
+
# Used by memory.db writes (decisions / turn summaries / code areas) so
|
|
234
|
+
# personal data captured during a session doesn't end up persisted in
|
|
235
|
+
# searchable form. Lighter touch than secret detection — only the most
|
|
236
|
+
# unambiguous patterns. Free-form text shouldn't be aggressively scrubbed.
|
|
237
|
+
|
|
238
|
+
_PII_PATTERNS: list[tuple[re.Pattern, str]] = [
|
|
239
|
+
# Email addresses — simple but effective. Matches RFC-mostly-compliant
|
|
240
|
+
# addresses; over-matches a tiny bit (won't reject quoted local-parts)
|
|
241
|
+
# but that's fine for redaction.
|
|
242
|
+
(re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"), "EMAIL"),
|
|
243
|
+
# IPv4 — four 1-3 digit groups. Won't match localhost or 127.0.0.1
|
|
244
|
+
# specifically because those are useful in dev notes.
|
|
245
|
+
(re.compile(
|
|
246
|
+
r"\b(?!127\.0\.0\.1\b|0\.0\.0\.0\b|10\.0\.0\.1\b|192\.168\.\d+\.\d+\b)"
|
|
247
|
+
r"(?:[1-9]\d?|1\d{2}|2[0-4]\d|25[0-5])"
|
|
248
|
+
r"(?:\.(?:\d{1,3})){3}\b"
|
|
249
|
+
), "IPV4"),
|
|
250
|
+
# Credit-card-shaped 13-19 digit runs (with optional spaces/dashes).
|
|
251
|
+
# Filtered through Luhn check to avoid false positives on order
|
|
252
|
+
# numbers, hashes, etc.
|
|
253
|
+
(re.compile(r"\b(?:\d[ -]?){13,19}\b"), "CREDIT_CARD"),
|
|
254
|
+
# US-style SSN.
|
|
255
|
+
(re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "SSN"),
|
|
256
|
+
# E.164 phone numbers (with leading +).
|
|
257
|
+
(re.compile(r"\+\d{1,3}[ -]?\(?\d{1,4}\)?[ -]?\d{3,4}[ -]?\d{3,4}\b"), "PHONE_E164"),
|
|
258
|
+
]
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _passes_luhn(digits: str) -> bool:
|
|
262
|
+
"""Luhn check for credit-card validation. Skips invalid candidates so
|
|
263
|
+
we don't redact every long number. Strips non-digits first.
|
|
264
|
+
"""
|
|
265
|
+
digits = re.sub(r"\D", "", digits)
|
|
266
|
+
if not (13 <= len(digits) <= 19):
|
|
267
|
+
return False
|
|
268
|
+
total = 0
|
|
269
|
+
parity = len(digits) % 2
|
|
270
|
+
for i, ch in enumerate(digits):
|
|
271
|
+
n = int(ch)
|
|
272
|
+
if i % 2 == parity:
|
|
273
|
+
n *= 2
|
|
274
|
+
if n > 9:
|
|
275
|
+
n -= 9
|
|
276
|
+
total += n
|
|
277
|
+
return total % 10 == 0
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def redact_pii(
    text: str,
    *,
    extra_patterns: list[tuple[re.Pattern, str]] | None = None,
) -> tuple[str, list[str]]:
    """Scrub common PII from ``text``, substituting `[REDACTED:LABEL]`.

    The built-in ``_PII_PATTERNS`` are applied in order, followed by any
    caller-supplied ``extra_patterns``. Matches labelled ``CREDIT_CARD``
    are only redacted when they pass the Luhn check; every other match is
    redacted unconditionally.

    Returns:
        ``(redacted_text, labels)`` — the same shape as ``redact_secrets``.
        ``labels`` gets one entry per redaction; falsy input is returned
        unchanged with an empty list.
    """
    if not text:
        return text, []

    active = [*_PII_PATTERNS, *(extra_patterns or [])]
    labels: list[str] = []
    redacted = text

    for regex, tag in active:
        # Per-pattern values are frozen as defaults so the closure does
        # not see later loop iterations' values.
        def _replace(
            hit: re.Match,
            _tag: str = tag,
            _luhn_gated: bool = (tag == "CREDIT_CARD"),
        ) -> str:
            found = hit.group(0)
            # Digit runs that fail Luhn are left as they were.
            if _luhn_gated and not _passes_luhn(found):
                return found
            labels.append(_tag)
            return f"[REDACTED:{_tag}]"

        redacted = regex.sub(_replace, redacted)

    return redacted, labels
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
# ── Convenience: combined check for the indexer ────────────────────────────
|
|
316
|
+
|
|
317
|
+
def scan_and_redact(
    file_path: Path,
    content: str,
    *,
    extra_patterns: list[tuple[re.Pattern, str]] | None = None,
) -> tuple[str | None, list[str]]:
    """Single entry point the indexer uses for secret handling.

    Returns:
        ``(None, ["SECRET_FILENAME"])`` when the filename alone marks the
        file as credentials and it must be skipped entirely. Otherwise the
        result of ``redact_secrets(content, ...)``: the (possibly
        redacted) content plus the labels of the patterns that fired.
    """
    if not is_secret_file(file_path):
        return redact_secrets(content, extra_patterns=extra_patterns)
    return None, ["SECRET_FILENAME"]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""File watcher with debouncing using watchdog.
|
|
2
|
+
|
|
3
|
+
Watches a directory for file changes and triggers an async callback
|
|
4
|
+
after a debounce period. Used by `cce serve` to keep the index
|
|
5
|
+
up-to-date as files are saved.
|
|
6
|
+
"""
|
|
7
|
+
import asyncio
|
|
8
|
+
import logging
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from watchdog.observers import Observer
|
|
14
|
+
from watchdog.events import FileSystemEventHandler
|
|
15
|
+
|
|
16
|
+
log = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class _DebouncedHandler(FileSystemEventHandler):
    """Watchdog handler that batches file events and reports them after a quiet period.

    Every relevant event records its path and (re)starts a timer; when no
    further event arrives for ``debounce_ms`` the collected paths are
    handed, one call per path, to the async ``on_change`` callback on the
    supplied event loop.
    """

    # Event types that only signal a file being read (opened, or closed
    # without a write). They are not changes, and reacting to them would
    # re-trigger indexing every time a file is merely opened.
    _NON_MUTATING_EVENTS = frozenset({"opened", "closed_no_write"})

    def __init__(self, on_change, debounce_ms, ignore_patterns, watch_dir, loop):
        """Store configuration.

        Args:
            on_change: Coroutine function called as ``on_change(path)``.
            debounce_ms: Quiet period, in milliseconds, before flushing.
            ignore_patterns: Path component names to ignore (exact match).
            watch_dir: Root directory; event paths are made relative to it.
            loop: asyncio event loop the callback coroutines are submitted to.
        """
        self._on_change = on_change
        self._debounce_s = debounce_ms / 1000.0
        self._ignore_set = set(ignore_patterns)
        self._watch_dir = Path(watch_dir)
        self._loop = loop
        # path -> time of the latest event; only the keys are consumed.
        self._pending: dict[str, float] = {}
        # Guards _pending and _timer, which are touched from the observer
        # thread, the timer thread and FileWatcher.stop().
        self._lock = threading.Lock()
        self._timer: threading.Timer | None = None

    def _should_ignore(self, path: str) -> bool:
        """Check if any path component matches an ignore pattern."""
        try:
            rel = Path(path).relative_to(self._watch_dir)
        except ValueError:
            # Not under the watch dir — nothing to match against.
            return False
        # ".cce" (CCE's own storage/index files) is always skipped.
        return any(
            part == ".cce" or part in self._ignore_set for part in rel.parts
        )

    def on_any_event(self, event):
        """Queue the event's path(s) and restart the debounce timer."""
        if event.is_directory:
            return
        if getattr(event, "event_type", None) in self._NON_MUTATING_EVENTS:
            return

        # A move touches two paths: the source (now gone) and the
        # destination (new content). Other events carry no destination.
        candidates = [event.src_path]
        dest_path = getattr(event, "dest_path", None)
        if dest_path:
            candidates.append(dest_path)
        changed = [p for p in candidates if not self._should_ignore(p)]
        if not changed:
            return

        with self._lock:
            now = time.time()
            for path in changed:
                self._pending[path] = now
            if self._timer:
                self._timer.cancel()
            self._timer = threading.Timer(self._debounce_s, self._flush)
            # Daemon thread: a pending flush must not delay interpreter exit.
            self._timer.daemon = True
            self._timer.start()

    def _flush(self):
        """Submit one ``on_change(path)`` coroutine per pending path."""
        with self._lock:
            paths = list(self._pending)
            self._pending.clear()
        for path in paths:
            coro = self._on_change(path)
            try:
                asyncio.run_coroutine_threadsafe(coro, self._loop)
            except RuntimeError:
                # Loop closed — shutting down. The coroutine was never
                # scheduled; close it to avoid a "never awaited" warning.
                coro.close()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class FileWatcher:
    """Watch a directory for file changes with debounced async callbacks."""

    def __init__(self, watch_dir, on_change, debounce_ms=500, ignore_patterns=None):
        """Store configuration; nothing is watched until ``start()``.

        Args:
            watch_dir: Directory to watch (recursively).
            on_change: Coroutine function called as ``on_change(path)``.
            debounce_ms: Quiet period in milliseconds before changes are reported.
            ignore_patterns: Path component names to ignore; defaults to none.
        """
        self._watch_dir = watch_dir
        self._on_change = on_change
        self._debounce_ms = debounce_ms
        self._ignore_patterns = ignore_patterns or []
        self._observer = None
        self._handler = None

    def start(self, loop=None):
        """Start watching. Pass the running asyncio loop explicitly.

        When ``loop`` is omitted the currently running loop is used, falling
        back to ``asyncio.get_event_loop()`` outside a running loop. Calling
        ``start()`` on an already started watcher stops the previous
        observer first so it is not leaked.
        """
        if self._observer is not None:
            self.stop()
        if loop is None:
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                loop = asyncio.get_event_loop()
        self._handler = _DebouncedHandler(
            on_change=self._on_change,
            debounce_ms=self._debounce_ms,
            ignore_patterns=self._ignore_patterns,
            watch_dir=self._watch_dir,
            loop=loop,
        )
        self._observer = Observer()
        self._observer.schedule(self._handler, self._watch_dir, recursive=True)
        self._observer.daemon = True
        self._observer.start()
        log.debug("Watcher started for %s", self._watch_dir)

    def stop(self):
        """Stop watching and drop any not-yet-flushed changes.

        Safe to call more than once, and before ``start()``.
        """
        observer, handler = self._observer, self._handler
        self._observer = None
        self._handler = None
        # Stop the event source first: cancelling the timer while the
        # observer is still dispatching would let a late event arm a new
        # timer that fires after shutdown.
        if observer:
            observer.stop()
            observer.join(timeout=2)
        if handler:
            with handler._lock:
                if handler._timer:
                    handler._timer.cancel()
                    handler._timer = None
        if observer:
            log.debug("Watcher stopped")
|
|
File without changes
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Bootstrap context builder — generates compressed project context for session start."""
|
|
2
|
+
from importlib.metadata import version as pkg_version
|
|
3
|
+
|
|
4
|
+
from context_engine.models import Chunk, ConfidenceLevel
|
|
5
|
+
|
|
6
|
+
_CHARS_PER_TOKEN = 4
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _get_version() -> str:
|
|
10
|
+
try:
|
|
11
|
+
return pkg_version("code-context-engine")
|
|
12
|
+
except Exception:
|
|
13
|
+
return "unknown"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BootstrapBuilder:
    """Assemble the markdown context payload shown at session start.

    The payload is budgeted in characters, derived from ``max_tokens``
    via the module-level ``_CHARS_PER_TOKEN`` estimate.
    """

    # Appended whenever the payload had to be cut to fit the budget.
    _TRUNCATION_NOTICE = "\n\n[Context truncated to fit token limit]"

    def __init__(self, max_tokens: int = 10000) -> None:
        """Convert the token budget into a character budget."""
        self._max_chars = max_tokens * _CHARS_PER_TOKEN

    def build(self, project_name, chunks=None, recent_commits=None,
              active_decisions=None, working_state=None, chunk_count=0,
              project_commands_text=None):
        """Build the full payload.

        Sections are emitted in this order: project header with a status
        line, architecture (high-confidence chunks), recent activity,
        working state, project commands, active decisions, additional
        context (medium-confidence chunks). The optional sections are
        omitted when their input is empty.

        Args:
            project_name: Name shown in the header.
            chunks: Indexed chunks; objects exposing ``confidence_score``,
                ``file_path``, ``content`` and ``compressed_content``.
            recent_commits: Commit summary strings (first 10 are shown).
            active_decisions: Strings rendered as a bullet list.
            working_state: Lines rendered under "Working State".
            chunk_count: Number of indexed chunks for the status line.
            project_commands_text: Pre-rendered section, inserted verbatim.

        Returns:
            The payload string, truncated to the character budget.
        """
        chunk_list = chunks or []
        ver = _get_version()
        if chunk_count:
            status_line = f"CCE v{ver} · {chunk_count} chunks indexed"
        else:
            status_line = f"CCE v{ver} · no chunks indexed yet"

        sections = [
            f"## Project: {project_name}\n`{status_line}`",
            self._build_architecture(chunk_list),
            self._build_activity(recent_commits or []),
        ]
        if working_state:
            state_text = "\n".join(f" {line}" for line in working_state)
            sections.append(f"### Working State\n{state_text}")
        if project_commands_text:
            sections.append(project_commands_text)
        if active_decisions:
            decisions_text = "\n".join(f"- {d}" for d in active_decisions)
            sections.append(f"### Active Context\n{decisions_text}")
        code_section = self._build_code_context(chunk_list)
        if code_section:
            sections.append(code_section)

        return self._truncate("\n\n".join(sections))

    def _truncate(self, payload):
        """Cut ``payload`` so that it, including the notice, fits the budget.

        Room for the truncation notice is reserved up front; appending it
        after cutting at the limit would push the result over the very
        budget being enforced. If the budget is smaller than the notice
        itself, the notice alone is returned.
        """
        if len(payload) <= self._max_chars:
            return payload
        notice = self._TRUNCATION_NOTICE
        keep = max(self._max_chars - len(notice), 0)
        return payload[:keep] + notice

    def _build_architecture(self, chunks):
        """Render high-confidence chunks grouped by file, files in sorted order."""
        high_conf = [
            c for c in chunks
            if ConfidenceLevel.from_score(c.confidence_score) == ConfidenceLevel.HIGH
        ]
        if not high_conf:
            return "### Architecture\nNo indexed context available yet."
        by_file = {}
        for chunk in high_conf:
            by_file.setdefault(chunk.file_path, []).append(chunk)
        lines = ["### Architecture"]
        for file_path, file_chunks in sorted(by_file.items()):
            lines.append(f"\n**{file_path}:**")
            for chunk in file_chunks:
                # Prefer the compressed form; otherwise a 200-char excerpt.
                text = chunk.compressed_content or chunk.content[:200]
                lines.append(f"- {text}")
        return "\n".join(lines)

    def _build_activity(self, commits):
        """Render up to the first 10 commit strings as a bullet list."""
        if not commits:
            return "### Recent Activity\nNo recent commits."
        lines = ["### Recent Activity"]
        lines.extend(f"- {commit}" for commit in commits[:10])
        return "\n".join(lines)

    def _build_code_context(self, chunks):
        """Render up to 20 medium-confidence chunks; empty string if there are none."""
        medium_conf = [
            c for c in chunks
            if ConfidenceLevel.from_score(c.confidence_score) == ConfidenceLevel.MEDIUM
        ]
        if not medium_conf:
            return ""
        lines = ["### Additional Context (may need drill-down)"]
        for chunk in medium_conf[:20]:
            # Prefer the compressed form; otherwise a 150-char excerpt.
            text = chunk.compressed_content or chunk.content[:150]
            lines.append(f"- [{chunk.file_path}] {text}")
        return "\n".join(lines)
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Git helpers for session-start context — recent commits, working state, modified files.
|
|
2
|
+
|
|
3
|
+
All functions gracefully return empty results when the project is not a git
|
|
4
|
+
repository, so CCE works for non-git projects too.
|
|
5
|
+
"""
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _is_git_repo(project_dir: str) -> bool:
|
|
11
|
+
"""Return True if project_dir is inside a git work tree."""
|
|
12
|
+
try:
|
|
13
|
+
result = subprocess.run(
|
|
14
|
+
["git", "rev-parse", "--is-inside-work-tree"],
|
|
15
|
+
cwd=project_dir,
|
|
16
|
+
capture_output=True,
|
|
17
|
+
text=True,
|
|
18
|
+
timeout=5,
|
|
19
|
+
)
|
|
20
|
+
return result.returncode == 0
|
|
21
|
+
except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
|
|
22
|
+
return False
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _run_git(args: list[str], cwd: str) -> str:
|
|
26
|
+
"""Run a git command and return stdout, or empty string on failure."""
|
|
27
|
+
try:
|
|
28
|
+
result = subprocess.run(
|
|
29
|
+
["git", *args],
|
|
30
|
+
cwd=cwd,
|
|
31
|
+
capture_output=True,
|
|
32
|
+
text=True,
|
|
33
|
+
timeout=5,
|
|
34
|
+
)
|
|
35
|
+
return result.stdout.strip() if result.returncode == 0 else ""
|
|
36
|
+
except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
|
|
37
|
+
return ""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_recent_commits(project_dir: str, count: int = 10) -> list[str]:
    """List the most recent commits, one ``git log --oneline`` line each.

    At most ``count`` commits are requested. Returns an empty list when
    ``project_dir`` is not a git repository or git produced no output.
    """
    if _is_git_repo(project_dir):
        log_text = _run_git(
            ["log", "--oneline", f"-{count}"],
            cwd=project_dir,
        )
        if log_text:
            return log_text.splitlines()
    return []
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def get_working_state(project_dir: str) -> list[str]:
    """Describe the branch and uncommitted changes as display lines.

    The result may contain, in this order: the current branch, how far it
    is ahead of / behind its upstream, up to 10 staged entries, up to 10
    unstaged entries, and a count of untracked files. Parts with nothing
    to report are left out. Returns an empty list outside a git repo.
    """
    if not _is_git_repo(project_dir):
        return []

    summary: list[str] = []

    current = _run_git(["branch", "--show-current"], cwd=project_dir)
    if current:
        summary.append(f"Branch: {current}")

        # `--left-right --count A...B` prints "<only-in-A> <only-in-B>";
        # with A = upstream and B = HEAD that is "<behind> <ahead>".
        counts = _run_git(
            ["rev-list", "--left-right", "--count", f"{current}@{{upstream}}...HEAD"],
            cwd=project_dir,
        ).split()
        if len(counts) == 2:
            try:
                behind, ahead = int(counts[0]), int(counts[1])
            except ValueError:
                # Unparseable output: report nothing about tracking.
                behind = ahead = 0
            if ahead > 0:
                summary.append(f"Ahead of remote by {ahead} commit(s)")
            if behind > 0:
                summary.append(f"Behind remote by {behind} commit(s)")

    # Staged first, then unstaged; each listing is capped at 10 entries.
    change_listings = (
        ("Staged:", ["diff", "--cached", "--name-status"]),
        ("Modified (unstaged):", ["diff", "--name-status"]),
    )
    for heading, diff_args in change_listings:
        listing = _run_git(diff_args, cwd=project_dir)
        if listing:
            summary.append(heading)
            summary.extend(f" {entry}" for entry in listing.splitlines()[:10])

    # Untracked files are only counted, not listed.
    untracked = _run_git(["ls-files", "--others", "--exclude-standard"], cwd=project_dir)
    if untracked:
        summary.append(f"Untracked files: {len(untracked.splitlines())}")

    return summary
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def get_recently_modified_files(project_dir: str, log_depth: int = 5) -> list[str]:
    """Collect paths touched in the working tree and the last ``log_depth`` commits.

    Working-tree changes come first, followed by files named in recent
    commits. Duplicates are dropped (first occurrence wins), paths that no
    longer exist under ``project_dir`` are filtered out, and at most 15
    entries are returned. Returns an empty list outside a git repo.
    """
    if not _is_git_repo(project_dir):
        return []

    candidates: list[str] = []

    # Unstaged modifications in the working tree.
    working_tree = _run_git(["diff", "--name-only"], cwd=project_dir)
    candidates += working_tree.splitlines()

    # Files named by recent commits; the empty pretty-format leaves blank
    # separator lines between commits, which are skipped.
    history = _run_git(
        ["log", f"-{log_depth}", "--pretty=format:", "--name-only"],
        cwd=project_dir,
    )
    candidates += [name for name in history.splitlines() if name.strip()]

    root = Path(project_dir)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    existing = [name for name in dict.fromkeys(candidates) if (root / name).exists()]
    return existing[:15]
|