@ictechgy/context-guard 0.4.8 → 0.4.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/CHANGELOG.md +29 -0
  2. package/README.ko.md +92 -37
  3. package/README.md +111 -37
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  8. package/docs/distribution.md +10 -7
  9. package/docs/experimental-benchmark-fixtures.md +8 -1
  10. package/package.json +3 -6
  11. package/packaging/homebrew/context-guard.rb.template +1 -1
  12. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  13. package/plugins/context-guard/README.ko.md +9 -6
  14. package/plugins/context-guard/README.md +27 -12
  15. package/plugins/context-guard/bin/context-guard +113 -26
  16. package/plugins/context-guard/bin/context-guard-artifact +542 -46
  17. package/plugins/context-guard/bin/context-guard-cache-score +380 -0
  18. package/plugins/context-guard/bin/context-guard-compress +146 -1
  19. package/plugins/context-guard/bin/context-guard-cost +783 -4
  20. package/plugins/context-guard/bin/context-guard-experiments +2211 -121
  21. package/plugins/context-guard/bin/context-guard-failed-nudge +3 -0
  22. package/plugins/context-guard/bin/context-guard-filter +163 -7
  23. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  24. package/plugins/context-guard/bin/context-guard-pack +602 -43
  25. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  26. package/plugins/context-guard/bin/context-guard-setup +165 -31
  27. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  28. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  29. package/plugins/context-guard/bin/context-guard-tool-prune +241 -1
  30. package/plugins/context-guard/lib/context_guard_commands.py +206 -0
  31. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  32. package/context-guard-kit/README.md +0 -91
  33. package/context-guard-kit/benchmark_runner.py +0 -2401
  34. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  35. package/context-guard-kit/context_compress.py +0 -695
  36. package/context-guard-kit/context_escrow.py +0 -935
  37. package/context-guard-kit/context_filter.py +0 -637
  38. package/context-guard-kit/context_guard_cli.py +0 -325
  39. package/context-guard-kit/context_guard_diet.py +0 -1711
  40. package/context-guard-kit/context_pack.py +0 -2713
  41. package/context-guard-kit/cost_guard.py +0 -2349
  42. package/context-guard-kit/experimental_registry.py +0 -2339
  43. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  44. package/context-guard-kit/guard_large_read.py +0 -690
  45. package/context-guard-kit/hook_secret_patterns.py +0 -43
  46. package/context-guard-kit/read_symbol.py +0 -483
  47. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  48. package/context-guard-kit/sanitize_output.py +0 -725
  49. package/context-guard-kit/settings.example.json +0 -67
  50. package/context-guard-kit/setup_wizard.py +0 -2515
  51. package/context-guard-kit/statusline.sh +0 -362
  52. package/context-guard-kit/statusline_merged.sh +0 -157
  53. package/context-guard-kit/tool_schema_pruner.py +0 -837
  54. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -1,935 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Store large sanitized command output outside Claude context and query slices later."""
3
- from __future__ import annotations
4
-
5
- import argparse
6
- import hashlib
7
- import importlib.machinery
8
- import importlib.util
9
- import json
10
- import os
11
- from pathlib import Path
12
- import re
13
- import stat
14
- import sys
15
- import time
16
- from typing import Iterable
17
-
18
# --- Storage locations -------------------------------------------------------
# New artifacts are written under DEFAULT_ARTIFACT_DIR. LEGACY_ARTIFACT_DIR is
# the pre-rebrand default that read paths still fall back to.
DEFAULT_ARTIFACT_DIR = ".context-guard/artifacts"
LEGACY_ARTIFACT_DIR = ".claude-token-optimizer/artifacts"

# --- Size caps ---------------------------------------------------------------
# Default and hard ceiling for the number of stdin bytes a store may read.
DEFAULT_MAX_BYTES = 10_000_000
MAX_MAX_BYTES = 100_000_000
# Largest metadata (.json) file that will be read back or written.
MAX_METADATA_BYTES = 64_000
# Default number of lines returned by a query / advertised by the head hint.
DEFAULT_MAX_LINES = 80
# NOTE(review): not referenced in the visible part of this file; presumably
# the default character cap for query output -- confirm against the CLI parser.
DEFAULT_MAX_CHARS = 20_000
# Largest line range a single suggested query advertises.
MAX_QUERY_LINES = 5_000
# Per-line character cap applied by cap_line().
MAX_LINE_CHARS = 2_000
# Character / UTF-8 byte caps for text embedded in the digest.
MAX_DIGEST_TEXT_CHARS = 360
MAX_DIGEST_TEXT_BYTES = 512
# UTF-8 byte cap for the sanitized one-line command preview.
MAX_COMMAND_PREVIEW_BYTES = 2_048
# Upper bounds on list-valued digest / receipt fields.
MAX_TOP_ERROR_RECEIPTS = 12
MAX_DUPLICATE_GROUPS = 12
MAX_SUGGESTED_QUERIES = 12

# Artifact ids are 16-64 lowercase hexadecimal characters.
ARTIFACT_ID_RE = re.compile(r"^[a-f0-9]{16,64}$")

# First path components that are tolerated as symlinks when they point at the
# matching directory under /private.
# NOTE(review): looks like the macOS /tmp and /var layout -- confirm.
ALLOWED_FIRST_ABSOLUTE_SYMLINKS = {
    name: Path("/private") / name for name in ("tmp", "var")
}

# Case-insensitive markers that flag a line as an error line. Alternatives are
# tried left to right, so e.g. "FAIL" wins over "FAILED" at the same position.
ERROR_RE = re.compile(
    "("
    + "|".join(
        (
            r"FAIL",
            r"FAILED",
            r"ERROR",
            r"Error:",
            r"Exception",
            r"Traceback",
            r"AssertionError",
            r"panic:",
            r"fatal:",
            r"segmentation fault",
            r"not ok",
            r"\bE\s+assert",
            r"\[ERROR\]",
            "✗",
            "✖",
        )
    )
    + ")",
    re.IGNORECASE,
)

# Secret-looking values. Group 1 is the whole match; group 2 (only set by the
# last alternative) captures the "key =" / "key:" prefix so it can be kept
# while the value after it is redacted.
SECRET_VALUE_RE = re.compile(
    r"(?i)("
    r"Bearer\s+\S+|"
    r"Basic\s+\S+|"
    r"gh[pousr]_[A-Za-z0-9_]{20,}|"
    r"github_pat_[A-Za-z0-9_]{20,}|"
    r"xox[abprs]-[A-Za-z0-9-]{10,}|"
    r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|"
    r"sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
    r"AIza[0-9A-Za-z_\-]{20,}|"
    r"([A-Za-z0-9_.-]*(?:api[_-]?key|token|secret|password|passwd|pwd)[A-Za-z0-9_.-]*\s*[:=]\s*)\S+"
    r")"
)
50
-
51
-
52
def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
    """Coerce *value* to an int clamped into ``[minimum, maximum]``.

    Returns *default* unchanged (not clamped) when ``int(value)`` raises
    ``TypeError``, ``ValueError`` or ``OverflowError``.
    """
    try:
        parsed = int(value)  # type: ignore[call-overload]
    except (TypeError, ValueError, OverflowError):
        return default
    # Raise to the floor first, then lower to the ceiling, so the ceiling wins
    # if the two bounds are ever inverted.
    floored = minimum if minimum > parsed else parsed
    return maximum if maximum < floored else floored
58
-
59
-
60
def cap_line(line: str, limit: int = MAX_LINE_CHARS) -> str:
    """Return *line* trimmed to about *limit* characters with a trim marker.

    Lines that already fit are returned unchanged. Otherwise the tail is
    replaced by ``...[line trimmed: N chars]`` where N is the original length.
    When the marker alone is longer than *limit*, the whole marker is still
    returned, so the result can exceed *limit* for very small limits.
    """
    original_length = len(line)
    if original_length > limit:
        suffix = f"...[line trimmed: {original_length} chars]"
        room = limit - len(suffix)
        kept = line[:room] if room > 0 else ""
        return kept + suffix
    return line
65
-
66
-
67
def cap_utf8_bytes(text: str, limit: int) -> str:
    """Trim *text* so its UTF-8 encoding fits in *limit* bytes.

    Text that already fits is returned unchanged. Otherwise whole characters
    are kept from the front (never splitting a multi-byte character) and a
    ``...[line trimmed: N chars/M bytes]`` marker is appended. If the marker
    itself does not fit, a byte-truncated prefix of the marker is returned.
    """
    raw = text.encode("utf-8", errors="replace")
    if len(raw) <= limit:
        return text

    suffix = f"...[line trimmed: {len(text)} chars/{len(raw)} bytes]"
    suffix_raw = suffix.encode("utf-8")
    if len(suffix_raw) >= limit:
        # No room for any payload: return as much of the marker as fits.
        return suffix_raw[:limit].decode("utf-8", errors="ignore")

    budget = limit - len(suffix_raw)
    consumed = 0
    kept_chars = 0
    for char in text:
        width = len(char.encode("utf-8", errors="replace"))
        if consumed + width > budget:
            break
        consumed += width
        kept_chars += 1
    return text[:kept_chars] + suffix
85
-
86
-
87
def cap_digest_text(text: str) -> str:
    """Apply the digest character cap, then the digest UTF-8 byte cap, to *text*."""
    char_capped = cap_line(text, limit=MAX_DIGEST_TEXT_CHARS)
    return cap_utf8_bytes(char_capped, MAX_DIGEST_TEXT_BYTES)
89
-
90
-
91
def normalized_link_target(parent: Path, raw_target: str) -> Path:
    """Return the lexically normalized path a symlink target refers to.

    A relative *raw_target* is interpreted relative to *parent*. The result
    is normalized with ``os.path.normpath`` only; nothing is resolved on disk.
    """
    candidate = Path(raw_target)
    anchored = candidate if candidate.is_absolute() else parent / candidate
    return Path(os.path.normpath(str(anchored)))
96
-
97
-
98
def normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite an allow-listed top-level symlink prefix to its real directory.

    If *path* is absolute, its first component is a key of
    ``ALLOWED_FIRST_ABSOLUTE_SYMLINKS``, and that component is a symlink on
    disk whose normalized target equals the allow-listed directory, the path
    is returned with that component replaced by the target. In every other
    case, including any ``OSError`` while inspecting the link, *path* is
    returned unchanged.
    """
    parts = path.parts
    if len(parts) < 2 or not path.is_absolute():
        return path

    top_name = parts[1]
    canonical = ALLOWED_FIRST_ABSOLUTE_SYMLINKS.get(top_name)
    if canonical is None:
        return path

    root = Path(path.anchor)
    top_link = root / top_name
    try:
        points_at_canonical = (
            stat.S_ISLNK(os.lstat(top_link).st_mode)
            and normalized_link_target(root, os.readlink(top_link)) == canonical
        )
    except OSError:
        return path
    if not points_at_canonical:
        return path
    return canonical.joinpath(*parts[2:])
114
-
115
-
116
def compact_items(lines: Iterable[str], *, limit: int, max_chars: int = MAX_LINE_CHARS, max_bytes: int | None = None) -> list[str]:
    """Collect distinct, capped, non-empty lines in first-seen order.

    Each line is stripped, capped to *max_chars* characters and, when
    *max_bytes* is given, to *max_bytes* UTF-8 bytes. Empty results and
    repeats of an already kept item are skipped. Collection stops as soon as
    *limit* items are kept; the check runs after an item is added.
    """
    kept: dict[str, None] = {}  # insertion-ordered set
    for raw in lines:
        item = cap_line(raw.strip(), limit=max_chars)
        if max_bytes is not None:
            item = cap_utf8_bytes(item, max_bytes)
        if item and item not in kept:
            kept[item] = None
            if len(kept) >= limit:
                break
    return list(kept)
130
-
131
-
132
class FallbackLineSanitizer:
    """Regex-only sanitizer used when no external sanitizer script is found.

    ``redactions`` counts the lines (not individual matches) in which at
    least one secret-looking value was replaced.
    """

    def __init__(self, *, show_paths: bool = False) -> None:
        # Stored for interface parity; this class never reads it.
        self.show_paths = show_paths
        self.redactions = 0

    @staticmethod
    def _mask(match: re.Match[str]) -> str:
        """Return the replacement text for one SECRET_VALUE_RE match."""
        captured = match.groups()
        # The second capture group holds a "key =" style prefix, which is
        # kept so only the value after it is hidden.
        key_prefix = captured[1] if len(captured) >= 2 else None
        if key_prefix:
            return key_prefix + "[REDACTED]"
        return "[REDACTED]"

    def sanitize(self, raw_line: str) -> tuple[str, bool]:
        """Redact secrets in *raw_line*; return ``(line, whether_redacted)``."""
        cleaned, hits = SECRET_VALUE_RE.subn(self._mask, raw_line)
        was_redacted = hits > 0
        if was_redacted:
            self.redactions += 1
        return cleaned, was_redacted
148
-
149
-
150
def load_line_sanitizer(show_paths: bool) -> object:
    """Return a line sanitizer, preferring a sanitizer script next to this file.

    The directory containing this file is searched, in order, for
    ``sanitize_output.py``, ``context-guard-sanitize-output`` and
    ``claude-sanitize-output``. The first one that exists is loaded as a
    module and its ``LineSanitizer(show_paths=...)`` is returned. When none
    exists, a ``FallbackLineSanitizer`` is returned instead.

    Raises:
        RuntimeError: if the first existing candidate cannot be loaded; later
            candidates are not tried.
    """
    here = Path(__file__).resolve().parent
    names = ("sanitize_output.py", "context-guard-sanitize-output", "claude-sanitize-output")
    found = next((here / name for name in names if (here / name).exists()), None)
    if found is None:
        return FallbackLineSanitizer(show_paths=show_paths)

    try:
        # SourceFileLoader is used directly because two of the candidate
        # names have no ``.py`` suffix.
        module_name = f"_claude_token_sanitize_{os.getpid()}"
        loader = importlib.machinery.SourceFileLoader(module_name, str(found))
        spec = importlib.util.spec_from_loader(loader.name, loader)
        if spec is None:
            raise RuntimeError("import spec unavailable")
        module = importlib.util.module_from_spec(spec)
        loader.exec_module(module)
        return module.LineSanitizer(show_paths=show_paths)
    except Exception as exc:
        raise RuntimeError(f"could not load sanitizer {found}: {exc}") from exc
167
-
168
-
169
def sanitize_text(text: str, *, show_paths: bool = False) -> tuple[str, int]:
    """Sanitize *text* line by line.

    Returns the sanitized text, with line endings preserved, and the number
    of lines the sanitizer reported as redacted.
    """
    sanitizer = load_line_sanitizer(show_paths)
    pieces: list[str] = []
    redacted_lines = 0
    for raw in text.splitlines(keepends=True):
        clean, changed = sanitizer.sanitize(raw)  # type: ignore[attr-defined]
        pieces.append(clean)
        redacted_lines += 1 if changed else 0
    return "".join(pieces), redacted_lines
179
-
180
-
181
def sanitize_one_line(text: str, *, show_paths: bool = False) -> str:
    """Sanitize *text* and squeeze it into one capped, single-spaced line.

    All whitespace runs collapse to a single space. The result is capped by
    ``cap_line`` and then to ``MAX_COMMAND_PREVIEW_BYTES`` UTF-8 bytes.
    """
    sanitized, _redacted = sanitize_text(text + "\n", show_paths=show_paths)
    # split() with no separator also discards leading and trailing whitespace.
    single_line = " ".join(sanitized.split())
    return cap_utf8_bytes(cap_line(single_line), MAX_COMMAND_PREVIEW_BYTES)
184
-
185
-
186
def ensure_private_dir(path: Path) -> None:
    """Create *path* (and parents) and try to restrict it to mode 0o700.

    Symlink components are rejected both before and after the directory is
    created. A failing ``chmod`` is ignored.

    Raises:
        RuntimeError: from ``reject_symlink_components`` when a component is
            a symlink or a non-directory.
    """
    target = normalize_allowed_first_absolute_symlink(path)
    reject_symlink_components(target)
    target.mkdir(parents=True, exist_ok=True)
    # Check again in case a component changed while the tree was created.
    reject_symlink_components(target)
    try:
        os.chmod(target, 0o700)
    except OSError:
        # Best effort: tightening permissions is not required to proceed.
        pass
195
-
196
-
197
def reject_symlink_components(path: Path) -> None:
    """Walk *path* from its root and refuse symlinks or non-directory parents.

    Each existing component is examined with ``lstat``. The walk stops
    silently at the first component that does not exist.

    Raises:
        RuntimeError: if a component is a symlink, or if a component other
            than the final one is not a directory.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    absolute = path.is_absolute()
    anchor = path.anchor
    current = Path(anchor) if absolute else Path()
    # The anchor ("/" on POSIX) is the starting point, not a component to test.
    components = [part for part in path.parts if not (absolute and part == anchor)]
    for part in components:
        current = current / part
        try:
            mode = os.lstat(current).st_mode
        except FileNotFoundError:
            return
        if stat.S_ISLNK(mode):
            raise RuntimeError(f"refusing artifact path with symlink component: {current}")
        if current != path and not stat.S_ISDIR(mode):
            raise RuntimeError(f"refusing artifact path through non-directory component: {current}")
212
-
213
-
214
def regular_private_file_size(path: Path) -> int:
    """Return the size in bytes of *path*, insisting it is a regular file.

    The parent directory chain is checked for symlink components first.

    Raises:
        ValueError: if *path* is a symlink or not a regular file.
        RuntimeError: from ``reject_symlink_components`` for an unsafe parent.
        OSError: from ``os.lstat`` (for example ``FileNotFoundError``).
    """
    target = normalize_allowed_first_absolute_symlink(path)
    reject_symlink_components(target.parent)
    info = os.lstat(target)
    mode = info.st_mode
    if stat.S_ISLNK(mode):
        raise ValueError(f"artifact file must not be a symlink: {target.name}")
    if stat.S_ISREG(mode):
        return int(info.st_size)
    raise ValueError(f"artifact file must be a regular file: {target.name}")
223
-
224
-
225
def read_bounded_private_text(path: Path, max_bytes: int) -> str:
    """Read a regular artifact file as text, refusing anything over *max_bytes*.

    The size is checked three times: via ``lstat`` before opening, via
    ``fstat`` on the opened descriptor, and on the bytes actually read, so a
    file that grows between checks is still rejected. The file is opened with
    ``O_NOFOLLOW`` where the platform provides it.

    Args:
        path: File to read.
        max_bytes: Largest acceptable file size in bytes.

    Returns:
        The file content decoded as UTF-8 with undecodable bytes replaced.

    Raises:
        ValueError: if the file is a symlink, not a regular file, or larger
            than *max_bytes*.
        RuntimeError: from the symlink-component check on the parent chain.
        OSError: from ``lstat``/``open``/``read`` (for example
            ``FileNotFoundError``).
    """
    path = normalize_allowed_first_absolute_symlink(path)
    size = regular_private_file_size(path)
    if size > max_bytes:
        raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: {size} > {max_bytes}")
    flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
    fd = os.open(str(path), flags)
    try:
        st = os.fstat(fd)
        if not stat.S_ISREG(st.st_mode):
            raise ValueError(f"artifact file must be a regular file: {path.name}")
        if st.st_size > max_bytes:
            raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: {st.st_size} > {max_bytes}")
        # A single os.read() may return fewer bytes than requested, so keep
        # reading until EOF or until one byte past the cap has been seen.
        # Stopping after one call could silently truncate the content and
        # skip the over-cap check below.
        chunks: list[bytes] = []
        remaining = max_bytes + 1
        while remaining > 0:
            chunk = os.read(fd, remaining)
            if not chunk:
                break
            chunks.append(chunk)
            remaining -= len(chunk)
        data = b"".join(chunks)
        if len(data) > max_bytes:
            raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: > {max_bytes}")
        return data.decode("utf-8", errors="replace")
    finally:
        os.close(fd)
244
-
245
-
246
def write_private_text(path: Path, text: str) -> None:
    """Write *text* to *path* via a private temp file and ``os.replace``.

    The parent directory is created and checked by ``ensure_private_dir``.
    The temp file is created exclusively with mode 0o600 (and ``O_NOFOLLOW``
    where available) next to the destination, written as UTF-8 without
    newline translation, then moved over *path*. If writing or replacing
    fails, the temp file is removed and the error re-raised. A failing final
    ``chmod`` is ignored.
    """
    target = normalize_allowed_first_absolute_symlink(path)
    ensure_private_dir(target.parent)
    scratch = target.with_name(target.name + f".tmp-{os.getpid()}-{time.time_ns()}")
    open_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0)
    fd = os.open(str(scratch), open_flags, 0o600)
    try:
        with os.fdopen(fd, "w", encoding="utf-8", newline="") as handle:
            handle.write(text)
        os.replace(scratch, target)
    except Exception:
        # Do not leave a partial temp file behind on either failure.
        try:
            scratch.unlink()
        except FileNotFoundError:
            pass
        raise
    try:
        os.chmod(target, 0o600)
    except OSError:
        # Best effort: the temp file was already created with mode 0o600.
        pass
273
-
274
-
275
def read_bounded_stdin(max_bytes: int) -> tuple[str, bool, int]:
    """Read at most *max_bytes* bytes of stdin.

    One extra byte is requested so that overflow can be detected.

    Returns:
        ``(text, truncated, byte_count)``: the kept bytes decoded as UTF-8
        with replacement, whether input exceeded the cap, and the number of
        bytes kept after truncation.
    """
    payload = sys.stdin.buffer.read(max_bytes + 1)
    truncated = len(payload) > max_bytes
    kept = payload[:max_bytes] if truncated else payload
    return kept.decode("utf-8", errors="replace"), truncated, len(kept)
281
-
282
-
283
def artifact_paths(directory: Path, artifact_id: str) -> tuple[Path, Path]:
    """Return the ``(content .txt, metadata .json)`` paths for an artifact.

    Raises:
        ValueError: if *artifact_id* is not 16-64 lowercase hex characters.
    """
    if ARTIFACT_ID_RE.fullmatch(artifact_id) is None:
        raise ValueError("artifact id must be 16-64 lowercase hex chars")
    base = normalize_allowed_first_absolute_symlink(directory)
    content_file, metadata_file = (base / f"{artifact_id}{suffix}" for suffix in (".txt", ".json"))
    return content_file, metadata_file
288
-
289
-
290
def artifact_read_directories(raw_dir: str) -> list[Path]:
    """Return the directories to search when reading artifacts.

    The requested directory always comes first. When it is the default
    ``.context-guard/artifacts`` location, the legacy
    ``.claude-token-optimizer/artifacts`` default is appended as a read-only
    fallback so receipts created before the rebrand keep resolving. Stores
    use only the new path.
    """
    requested = Path(raw_dir).expanduser()
    primary = normalize_allowed_first_absolute_symlink(requested)
    if requested != Path(DEFAULT_ARTIFACT_DIR):
        return [primary]
    legacy = normalize_allowed_first_absolute_symlink(Path(LEGACY_ARTIFACT_DIR).expanduser())
    return [primary] if legacy == primary else [primary, legacy]
305
-
306
-
307
# Every label classify_content_type() can return.
CONTENT_TYPE_VALUES = ("json", "diff", "log", "search", "code", "prose", "text")

# Recommended retrieval strategy per content type. Pattern-oriented payloads
# (logs, search hits, diffs) are best sliced by `--pattern`; structured or
# narrative payloads (json, code, prose) read best by `--lines`. Unknown or
# empty content falls back to a bounded `head` read.
STRATEGY_BY_CONTENT_TYPE = {
    **dict.fromkeys(("json", "code", "prose"), "lines"),
    **dict.fromkeys(("diff", "log", "search"), "pattern"),
    "text": "head",
}

# `path:line:` prefix, the shape of grep-style search hits.
_SEARCH_HIT_RE = re.compile(r"^[^\s:]+:\d+:")

# Line starting with a `YYYY-MM-DD HH:MM` timestamp or a log level, either
# bracketed or bare (case-insensitive).
_LOG_LINE_RE = re.compile(
    r"^(\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}|"
    r"\[(?:DEBUG|INFO|WARN|WARNING|ERROR|FATAL|TRACE)\]|"
    r"(?:DEBUG|INFO|WARN|WARNING|ERROR|FATAL|TRACE)\b)",
    re.IGNORECASE,
)

# Optionally indented line starting with a common keyword from several
# programming languages (case-sensitive).
_CODE_LINE_RE = re.compile(
    r"^\s*(def |class |import |from \S+ import |function |const |let |var |"
    r"public |private |protected |#include|package |func |fn |impl |"
    r"return\b|if\s*\(|for\s*\(|while\s*\()"
)
333
-
334
-
335
def classify_content_type(text: str) -> str:
    """Guess which CONTENT_TYPE_VALUES label best describes *text* (advisory).

    The guess is deterministic and uses only the standard library. Checks run
    in this order and the first that applies wins:

    1. blank input -> ``"text"``
    2. starts with ``{`` or ``[`` and parses as JSON -> ``"json"``
    3. has diff marker lines, with either a diff-like first line or at least
       two markers -> ``"diff"``
    4. at least half of the lines look like log lines -> ``"log"``
    5. at least half of the lines look like ``path:line:`` hits -> ``"search"``
    6. two or more code-keyword lines, or one plus enough lines ending in
       ``{``, ``}``, ``;`` or ``):`` -> ``"code"``
    7. otherwise -> ``"prose"``

    The label only drives retrieval hints, so a wrong guess costs ergonomics
    rather than correctness.
    """
    body = text.strip()
    if not body:
        return "text"

    if body.startswith(("{", "[")):
        try:
            json.loads(body)
        except (ValueError, RecursionError):
            pass  # Not valid JSON after all; try the line heuristics.
        else:
            return "json"

    rows = body.splitlines()
    total = len(rows)
    half = max(1, total // 2)

    def tally(predicate) -> int:
        """Count the rows for which *predicate* is truthy."""
        return sum(1 for row in rows if predicate(row))

    diff_markers = ("diff --git ", "@@ ", "+++ ", "--- ", "index ")
    diff_count = tally(lambda row: row.startswith(diff_markers))
    if diff_count:
        if diff_count >= 2 or rows[0].startswith(("diff --git ", "--- ", "@@ ")):
            return "diff"

    # Log is checked before search because timestamps (HH:MM:SS) and bracketed
    # levels can superficially resemble the `path:line:` search shape.
    if tally(_LOG_LINE_RE.match) >= half:
        return "log"
    if tally(_SEARCH_HIT_RE.match) >= half:
        return "search"

    code_count = tally(_CODE_LINE_RE.match)
    if code_count >= 2:
        return "code"
    if code_count >= 1:
        terminated = tally(lambda row: row.rstrip().endswith(("{", "}", ";", "):")))
        if terminated >= max(2, total // 3):
            return "code"
    return "prose"
369
-
370
-
371
def recommended_strategy(content_type: str) -> str:
    """Return the retrieval strategy hint for *content_type*; ``"head"`` if unknown."""
    if content_type in STRATEGY_BY_CONTENT_TYPE:
        return STRATEGY_BY_CONTENT_TYPE[content_type]
    return "head"
374
-
375
-
376
def first_error_anchor(text: str) -> str | None:
    """Return the first error marker found in *text*, or ``None``.

    The token is the text ``ERROR_RE`` matched (with surrounding whitespace
    stripped), copied verbatim from the content rather than from the pattern.
    It is therefore a literal substring of *text* and can be fed back as a
    ``--pattern`` query.
    """
    for row in text.splitlines():
        hit = ERROR_RE.search(row)
        if hit is None:
            continue
        token = hit.group(0).strip()
        if token:
            return token
    return None
390
-
391
-
392
def build_retrieval_hints(
    artifact_id: str,
    sanitized_text: str,
    *,
    content_type: str,
    strategy: str,
    total_lines: int,
) -> list[dict[str, object]]:
    """Build the machine-readable retrieval hints stored with an artifact.

    Hints are emitted in a fixed order: ``lines`` (only when the artifact has
    at least one line), ``pattern`` (only when an error marker is found) and
    ``head`` (always). Each hint pairs a ``selector`` with the matching
    ``context-guard-artifact get`` command line.

    The ``lines`` hint covers the whole artifact when it fits within
    ``MAX_QUERY_LINES`` (``exact`` is True). Otherwise it covers only the
    first ``MAX_QUERY_LINES`` lines and records ``total_lines``.

    *content_type* and *strategy* are accepted but not read here; callers
    select the hint whose ``type`` matches their strategy.
    """
    hints: list[dict[str, object]] = []

    if total_lines >= 1:
        fits_query_cap = total_lines <= MAX_QUERY_LINES
        last_line = total_lines if fits_query_cap else MAX_QUERY_LINES
        range_hint: dict[str, object] = {
            "type": "lines",
            "selector": {"start": 1, "end": last_line},
            "cli": line_query_cli(artifact_id, 1, last_line),
            "exact": fits_query_cap,
        }
        if last_line > DEFAULT_MAX_LINES:
            # The range exceeds the default returned-line cap, so the command
            # has to raise `--max-lines` explicitly.
            range_hint.update(
                max_lines=last_line,
                max_lines_required=True,
                note=(
                    "`--max-lines` in this suggested query is only the returned-line cap for the selected "
                    "`--lines` range; the explicit line range remains the selector."
                ),
            )
        if not fits_query_cap:
            range_hint["note"] = (
                f"first {MAX_QUERY_LINES} lines only; request later ranges for the full artifact. "
                "`--max-lines` is only the returned-line cap for the selected range."
            )
            range_hint["total_lines"] = total_lines
        hints.append(range_hint)

    anchor = first_error_anchor(sanitized_text)
    if anchor is not None:
        pattern_hint: dict[str, object] = {
            "type": "pattern",
            "selector": {"pattern": anchor},
            "cli": f"context-guard-artifact get {artifact_id} --pattern '{anchor}'",
        }
        hints.append(pattern_hint)

    head_hint: dict[str, object] = {
        "type": "head",
        "selector": {"max_lines": DEFAULT_MAX_LINES},
        "cli": f"context-guard-artifact get {artifact_id} --max-lines {DEFAULT_MAX_LINES}",
    }
    hints.append(head_hint)
    return hints
450
-
451
-
452
def line_query_cli(artifact_id: str, start: int, end: int) -> str:
    """Return the ``get --lines START:END`` command for an inclusive range.

    ``--max-lines`` is appended only when the range is longer than
    ``DEFAULT_MAX_LINES``; its value is the range length capped at
    ``MAX_QUERY_LINES``.
    """
    span = end - start + 1
    base = f"context-guard-artifact get {artifact_id} --lines {start}:{end}"
    if span <= DEFAULT_MAX_LINES:
        return base
    return f"{base} --max-lines {min(span, MAX_QUERY_LINES)}"
458
-
459
-
460
def line_receipt(artifact_id: str, line_number: int, text: str) -> dict[str, object]:
    """Return a digest receipt pointing at one line of an artifact.

    The receipt carries the 1-based line number, the stripped and
    digest-capped text, a single-line ``lines`` selector and the command that
    retrieves exactly that line.
    """
    preview = cap_digest_text(text.strip())
    single_line = {"type": "lines", "start": line_number, "end": line_number}
    command = line_query_cli(artifact_id, line_number, line_number)
    return {
        "line": line_number,
        "text": preview,
        "selector": single_line,
        "cli": command,
    }
467
-
468
-
469
def build_top_error_receipts(artifact_id: str, lines: list[str]) -> list[dict[str, object]]:
    """Return receipts for the first distinct error lines in *lines*.

    A line qualifies when ``ERROR_RE`` matches it. Lines are de-duplicated by
    their stripped, digest-capped text, and the first occurrence supplies the
    line number. Collection stops once ``MAX_TOP_ERROR_RECEIPTS`` receipts
    exist; the check runs after a receipt is added.
    """
    receipts: list[dict[str, object]] = []
    already_reported: set[str] = set()
    for number, raw in enumerate(lines, start=1):
        if ERROR_RE.search(raw) is None:
            continue
        preview = cap_digest_text(raw.strip())
        if not preview or preview in already_reported:
            continue
        already_reported.add(preview)
        receipts.append(line_receipt(artifact_id, number, preview))
        if len(receipts) >= MAX_TOP_ERROR_RECEIPTS:
            break
    return receipts
484
-
485
-
486
def build_duplicate_line_groups(artifact_id: str, lines: list[str], *, limit: int = MAX_DUPLICATE_GROUPS) -> list[dict[str, object]]:
    """Summarize lines that occur more than once.

    Lines are compared by their stripped, digest-capped text; blank lines are
    ignored. Groups are ordered by descending count, then by first line
    number, then by text, and at most *limit* groups are returned. Each group
    records its count, the first line number, the text, and a selector and
    command for that first line.
    """
    occurrences: dict[str, int] = {}
    first_seen: dict[str, int] = {}
    for number, raw in enumerate(lines, start=1):
        preview = cap_digest_text(raw.strip())
        if not preview:
            continue
        first_seen.setdefault(preview, number)
        occurrences[preview] = occurrences.get(preview, 0) + 1

    repeated = [(preview, count) for preview, count in occurrences.items() if count > 1]
    repeated.sort(key=lambda pair: (-pair[1], first_seen[pair[0]], pair[0]))

    groups: list[dict[str, object]] = []
    for preview, count in repeated[:limit]:
        number = first_seen[preview]
        groups.append(
            {
                "count": count,
                "first_line": number,
                "text": preview,
                "selector": {"type": "lines", "start": number, "end": number},
                "cli": line_query_cli(artifact_id, number, number),
            }
        )
    return groups
513
-
514
-
515
def build_digest(sanitized_text: str, *, artifact_id: str, redacted_lines: int) -> dict[str, object]:
    """Build the advisory digest stored alongside an artifact.

    The digest holds a status (``"has_errors"`` when any error line was
    found, otherwise ``"stored"``), redaction counters, up to 12 distinct
    error lines, error receipts, duplicate-line groups, and up to 8 distinct
    lines each from the head and from the last 8 lines of the content. All
    embedded text is capped to the digest character and byte limits.
    """
    rows = sanitized_text.splitlines()

    def digest_sample(source: Iterable[str], limit: int) -> list[str]:
        """Collect distinct lines from *source* using the digest text caps."""
        return compact_items(
            source,
            limit=limit,
            max_chars=MAX_DIGEST_TEXT_CHARS,
            max_bytes=MAX_DIGEST_TEXT_BYTES,
        )

    error_lines = digest_sample(filter(ERROR_RE.search, rows), 12)
    status = "has_errors" if error_lines else "stored"
    redaction_counts = {
        "lines": redacted_lines,
        "markers": sanitized_text.count("[REDACTED]"),
    }
    return {
        "status": status,
        "redacted_lines": redacted_lines,
        "redaction_counts": redaction_counts,
        "top_error_lines": error_lines,
        "top_error_receipts": build_top_error_receipts(artifact_id, rows),
        "duplicate_line_groups": build_duplicate_line_groups(artifact_id, rows),
        "representative_head": digest_sample(rows, 8),
        "representative_tail": digest_sample(rows[-8:], 8),
    }
546
-
547
-
548
def suggested_queries_for(metadata: dict[str, object]) -> list[str]:
    """Collect the distinct ``cli`` commands advertised in *metadata*.

    Commands are taken, in order, from the digest's ``top_error_receipts``
    and ``duplicate_line_groups`` and then from the retrieval ``hints``.
    Missing or wrongly typed sections are skipped silently. At most
    ``MAX_SUGGESTED_QUERIES`` commands are returned.
    """
    collected: list[str] = []

    def remember(entries: object) -> None:
        """Append each new, non-empty string ``cli`` value found in *entries*."""
        if not isinstance(entries, list):
            return
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            command = entry.get("cli")
            if isinstance(command, str) and command and command not in collected:
                collected.append(command)

    digest = metadata.get("digest")
    if isinstance(digest, dict):
        remember(digest.get("top_error_receipts"))
        remember(digest.get("duplicate_line_groups"))

    retrieval = metadata.get("retrieval")
    if isinstance(retrieval, dict):
        remember(retrieval.get("hints"))

    return collected[:MAX_SUGGESTED_QUERIES]
573
-
574
-
575
def receipt_for(metadata: dict[str, object]) -> dict[str, object]:
    """Build the receipt reported to the caller after a store.

    The receipt copies selected metadata fields (``None`` when absent), adds
    three generic example queries and the artifact-specific suggested
    queries.

    Raises:
        KeyError: if *metadata* has no ``artifact_id``.
    """
    artifact_id = str(metadata["artifact_id"])
    receipt: dict[str, object] = {"artifact_id": artifact_id, "stored": True}
    passthrough = (
        "created_at",
        "command_preview",
        "content_type",
        "input",
        "stored_output",
        "digest",
        "retrieval",
    )
    for key in passthrough:
        receipt[key] = metadata.get(key)

    get_command = f"context-guard-artifact get {artifact_id}"
    receipt["available_queries"] = [
        f"{get_command} --lines 1:80",
        f"{get_command} --pattern ERROR --max-lines 40",
        f"{get_command} --json --lines 1:20",
    ]
    receipt["suggested_queries"] = suggested_queries_for(metadata)
    return receipt
594
-
595
-
596
def metadata_json_text(metadata: dict[str, object]) -> str:
    """Serialize *metadata* exactly as it is written to disk.

    The output is key-sorted, two-space-indented JSON with non-ASCII
    characters left unescaped, followed by a single newline.
    """
    serialized = json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True)
    return f"{serialized}\n"
598
-
599
-
600
def metadata_size_bytes(metadata: dict[str, object]) -> int:
    """Return the UTF-8 byte length of *metadata* in its on-disk JSON form."""
    encoded = metadata_json_text(metadata).encode("utf-8", errors="replace")
    return len(encoded)
602
-
603
-
604
def metadata_cap_diagnostic(metadata: dict[str, object], *, stage: str) -> str:
    """Format the error message used when metadata cannot fit its size cap.

    The message reports the current serialized size, the cap, the *stage* at
    which shrinking gave up, and how many items remain in each list-valued
    digest field (``none`` when there is no such field).
    """
    tracked_fields = (
        "representative_tail",
        "representative_head",
        "duplicate_line_groups",
        "top_error_lines",
        "top_error_receipts",
    )
    digest = metadata.get("digest")
    remaining: dict[str, int] = {}
    if isinstance(digest, dict):
        remaining = {
            field: len(digest[field])
            for field in tracked_fields
            if isinstance(digest.get(field), list)
        }
    summary = ",".join(f"{field}={count}" for field, count in remaining.items())
    if not summary:
        summary = "none"
    size = metadata_size_bytes(metadata)
    return (
        f"artifact metadata exceeds trusted size cap before write: metadata_bytes={size} "
        f"metadata_cap_bytes={MAX_METADATA_BYTES} stage={stage} remaining_digest_items={summary}; "
        "authoritative artifact content was not written because the receipt would be unreadable"
    )
627
-
628
-
629
def shrink_digest_for_metadata_cap(metadata: dict[str, object]) -> None:
    """Trim digest examples in place until *metadata* fits ``MAX_METADATA_BYTES``.

    Digest fields are advisory receipts over the authoritative ``.txt``
    artifact, so dropping examples is preferable to writing a metadata file
    that would later be rejected as oversized. Metadata that already fits is
    left untouched. Otherwise the digest is flagged with
    ``capped_for_metadata`` and ``metadata_cap_bytes``, and one trailing item
    at a time is removed from the first non-empty list in this priority
    order: representative tail, representative head, duplicate groups, top
    error lines, top error receipts.

    Raises:
        ValueError: if the metadata is over the cap and has no digest dict,
            or is still over the cap once every digest list is empty.
    """
    digest = metadata.get("digest")
    over_cap = metadata_size_bytes(metadata) > MAX_METADATA_BYTES
    if not isinstance(digest, dict):
        if over_cap:
            raise ValueError(metadata_cap_diagnostic(metadata, stage="no_digest"))
        return
    if not over_cap:
        return

    digest["capped_for_metadata"] = True
    digest["metadata_cap_bytes"] = MAX_METADATA_BYTES
    priority = (
        "representative_tail",
        "representative_head",
        "duplicate_line_groups",
        "top_error_lines",
        "top_error_receipts",
    )
    while metadata_size_bytes(metadata) > MAX_METADATA_BYTES:
        shrinkable = next(
            (digest[field] for field in priority if isinstance(digest.get(field), list) and digest[field]),
            None,
        )
        if shrinkable is None:
            raise ValueError(metadata_cap_diagnostic(metadata, stage="digest_shrink_exhausted"))
        shrinkable.pop()
662
-
663
-
664
def store_command(args: argparse.Namespace) -> int:
    """Handle the ``store`` subcommand: sanitize stdin and persist it.

    Reads at most ``args.max_bytes`` bytes of stdin (clamped to
    ``[1, MAX_MAX_BYTES]``), sanitizes them, derives a 20-hex-character
    artifact id from the content hash, command preview and truncation flag,
    and writes the content and metadata files under ``args.dir``. The
    receipt is then printed as JSON when ``args.json`` is set, otherwise as a
    short text summary.

    Reads ``args.dir``, ``args.max_bytes``, ``args.show_paths``,
    ``args.command`` and ``args.json``.

    Returns:
        ``0`` on success.

    Raises:
        ValueError: if the metadata cannot be shrunk under its size cap;
            nothing is written in that case.
        RuntimeError: if the sanitizer cannot be loaded or the artifact
            directory has an unsafe path component.
        OSError: from file creation or writing.
    """
    directory = normalize_allowed_first_absolute_symlink(Path(args.dir).expanduser())
    max_bytes = bounded_int(args.max_bytes, DEFAULT_MAX_BYTES, 1, MAX_MAX_BYTES)
    raw_text, input_truncated, input_bytes = read_bounded_stdin(max_bytes)
    sanitized_text, redacted_lines = sanitize_text(raw_text, show_paths=args.show_paths)

    # Encode once: the payload can be large, and both the size and the hash
    # are taken over the same bytes.
    encoded = sanitized_text.encode("utf-8", errors="replace")
    content_bytes = len(encoded)
    content_sha = hashlib.sha256(encoded).hexdigest()
    del encoded

    command_preview = sanitize_one_line(args.command, show_paths=args.show_paths) if args.command else None
    id_basis = json.dumps(
        {
            "content_sha256": content_sha,
            "command_preview": command_preview,
            "input_truncated": input_truncated,
        },
        sort_keys=True,
    )
    artifact_id = hashlib.sha256(id_basis.encode("utf-8")).hexdigest()[:20]
    content_path, meta_path = artifact_paths(directory, artifact_id)

    # Count lines the same way the digest receipts and the `get` query number
    # them (str.splitlines). Counting only "\n" under-reports content that
    # uses other separators (a lone "\r", form feed, U+2028, ...), which made
    # the full-span `--lines 1:N` hint stop short of the last line.
    total_lines = len(sanitized_text.splitlines())

    content_type = classify_content_type(sanitized_text)
    strategy = recommended_strategy(content_type)
    metadata: dict[str, object] = {
        "artifact_id": artifact_id,
        "created_at": int(time.time()),
        "command_preview": command_preview,
        "content_type": content_type,
        "input": {
            "bytes_read": input_bytes,
            "truncated": input_truncated,
            "max_bytes": max_bytes,
        },
        "stored_output": {
            "bytes": content_bytes,
            "lines": total_lines,
            "sha256": content_sha,
            "content_file": content_path.name,
            "metadata_file": meta_path.name,
        },
        "digest": build_digest(sanitized_text, artifact_id=artifact_id, redacted_lines=redacted_lines),
        "retrieval": {
            "strategy": strategy,
            "deterministic": True,
            "hints": build_retrieval_hints(
                artifact_id,
                sanitized_text,
                content_type=content_type,
                strategy=strategy,
                total_lines=total_lines,
            ),
        },
    }
    # Shrink (or fail) before any file is written, so an artifact is never
    # stored with metadata that would be rejected on read.
    shrink_digest_for_metadata_cap(metadata)
    write_private_text(content_path, sanitized_text)
    write_private_text(meta_path, metadata_json_text(metadata))

    receipt = receipt_for(metadata)
    if args.json:
        print(json.dumps(receipt, ensure_ascii=False, indent=2, sort_keys=True))
        return 0

    print(f"artifact_id={artifact_id}")
    stored = receipt["stored_output"]
    if isinstance(stored, dict):
        print(f"stored_output={stored.get('lines')} lines/{stored.get('bytes')} bytes")
    digest = receipt.get("digest")
    if isinstance(digest, dict) and digest.get("top_error_lines"):
        print("top_error_lines:")
        for line in digest["top_error_lines"]:  # type: ignore[index]
            print(f"- {line}")
    print(f"query=context-guard-artifact get {artifact_id} --lines 1:80")
    return 0
733
-
734
-
735
def load_metadata(directory: Path, artifact_id: str) -> dict[str, object]:
    """Load and validate the metadata for *artifact_id* stored in *directory*.

    The content file must exist as a regular file, and the metadata file is
    read with the ``MAX_METADATA_BYTES`` cap.

    Raises:
        FileNotFoundError: if the content or metadata file is missing. The
            underlying error is chained as ``__cause__``.
        ValueError: if the id is malformed, a file is not a regular file or
            is over the cap, the metadata is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), or it is not a
            dict whose ``artifact_id`` equals *artifact_id*.
    """
    content_path, meta_path = artifact_paths(directory, artifact_id)
    try:
        # Only checks that the content file exists and is a regular file;
        # the returned size is not needed here.
        regular_private_file_size(content_path)
        meta_text = read_bounded_private_text(meta_path, MAX_METADATA_BYTES)
    except FileNotFoundError as exc:
        # Chain explicitly so the original path and errno stay available.
        raise FileNotFoundError(f"artifact not found: {artifact_id}") from exc
    data = json.loads(meta_text)
    if not isinstance(data, dict) or data.get("artifact_id") != artifact_id:
        raise ValueError(f"artifact metadata mismatch: {artifact_id}")
    return data
746
-
747
-
748
- def parse_line_range(value: str | None) -> tuple[int, int] | None:
749
- if not value:
750
- return None
751
- match = re.fullmatch(r"(\d+)(?::(\d+))?", value.strip())
752
- if not match:
753
- raise ValueError("--lines must be START or START:END using 1-based inclusive line numbers")
754
- start = int(match.group(1))
755
- end = int(match.group(2) or match.group(1))
756
- if start < 1 or end < start:
757
- raise ValueError("--lines must satisfy 1 <= START <= END")
758
- return start, end
759
-
760
-
761
def cap_text(text: str, max_chars: int) -> tuple[str, bool]:
    """Limit *text* to roughly *max_chars* characters, appending a cap notice.

    Returns:
        ``(text, False)`` when the text already fits; otherwise a pair of the
        shortened text (right-stripped prefix plus a notice stating the
        original length) and ``True``.

    Note:
        The notice is always emitted whole, so when *max_chars* is smaller
        than the notice itself the result is just the notice and may be
        longer than *max_chars*.
    """
    total = len(text)
    if total > max_chars:
        notice = f"\n[context-guard-kit] artifact query capped: {total} chars total\n"
        # Reserve room for the notice; no prefix is kept if it does not fit.
        budget = max_chars - len(notice)
        head = text[:budget] if budget > 0 else ""
        return head.rstrip() + notice, True
    return text, False
767
-
768
-
769
- def query_content(
770
- content: str,
771
- *,
772
- line_range: tuple[int, int] | None,
773
- pattern: str | None,
774
- max_lines: int,
775
- full: bool = False,
776
- ) -> tuple[str, dict[str, object]]:
777
- lines = content.splitlines(True)
778
- selected: list[tuple[int, str]] = []
779
- if full:
780
- selected = list(enumerate(lines, start=1))
781
- selector = {"type": "full"}
782
- elif line_range is not None:
783
- start, end = line_range
784
- selected = list(enumerate(lines[start - 1 : end], start=start))
785
- selector = {"type": "lines", "start": start, "end": end}
786
- elif pattern:
787
- selected = [(idx, line) for idx, line in enumerate(lines, start=1) if pattern in line]
788
- selector = {"type": "pattern", "pattern": pattern}
789
- else:
790
- selected = list(enumerate(lines[:max_lines], start=1))
791
- selector = {"type": "head", "max_lines": max_lines}
792
- total_matches = len(selected)
793
- if not full:
794
- selected = selected[:max_lines]
795
- text = "".join(line for _idx, line in selected)
796
- return text, {"selector": selector, "returned_lines": len(selected), "matched_lines": total_matches, "total_lines": len(lines)}
797
-
798
-
799
def get_command(args: argparse.Namespace) -> int:
    """Handle the ``get`` subcommand: verify a stored artifact and print a slice.

    The artifact is looked up in each directory yielded by
    ``artifact_read_directories(args.dir)``; the first one that has it wins.
    Before any content is returned, the stored size and SHA-256 recorded in
    the metadata are checked against the content file.

    Returns:
        ``0`` on success. ``1`` after printing a message to stderr when the
        artifact is missing, fails validation, or the arguments are invalid.
    """
    artifact_id = args.artifact_id
    full = bool(getattr(args, "full", False))
    try:
        if full and (args.lines or args.pattern or args.max_lines is not None):
            raise ValueError("--full cannot be combined with --lines, --pattern, or --max-lines")

        # Probe the read directories in order; remember the most recent
        # "not found" so it can be re-raised if none of them has the artifact.
        missing_error: FileNotFoundError | None = None
        for candidate_dir in artifact_read_directories(args.dir):
            try:
                metadata = load_metadata(candidate_dir, artifact_id)
                content_path, _unused_meta_path = artifact_paths(candidate_dir, artifact_id)
            except FileNotFoundError as exc:
                missing_error = exc
                continue
            break
        else:
            if missing_error is None:
                raise FileNotFoundError(f"artifact not found: {artifact_id}")
            raise missing_error

        # Validate the integrity fields recorded at store time.
        stored_output = metadata.get("stored_output")
        stored_fields = stored_output if isinstance(stored_output, dict) else {}
        expected_sha = stored_fields.get("sha256")
        if not isinstance(expected_sha, str) or not re.fullmatch(r"[a-f0-9]{64}", expected_sha):
            raise ValueError(f"artifact metadata missing stored_output sha256: {artifact_id}")
        expected_bytes = stored_fields.get("bytes")
        if not isinstance(expected_bytes, int) or expected_bytes < 0 or expected_bytes > MAX_MAX_BYTES:
            raise ValueError(f"artifact metadata has invalid stored_output bytes: {artifact_id}")

        # Compare the size first, then the digest of the content.
        if regular_private_file_size(content_path) != expected_bytes:
            raise ValueError(f"artifact content checksum mismatch: {artifact_id}")
        content = read_bounded_private_text(content_path, expected_bytes)
        content_digest = hashlib.sha256(content.encode("utf-8", errors="replace")).hexdigest()
        if content_digest != expected_sha:
            raise ValueError(f"artifact content checksum mismatch: {artifact_id}")

        # A full read defaults to a character budget large enough for the whole file.
        if full:
            default_max_chars = max(DEFAULT_MAX_CHARS, expected_bytes)
        else:
            default_max_chars = DEFAULT_MAX_CHARS
        max_chars = bounded_int(args.max_chars, default_max_chars, 1, MAX_MAX_BYTES)

        line_range = parse_line_range(args.lines)
        if line_range is None or args.max_lines is not None:
            max_lines = bounded_int(args.max_lines, DEFAULT_MAX_LINES, 1, MAX_QUERY_LINES)
        else:
            # An explicit range with no --max-lines returns the whole range,
            # up to the global query limit.
            max_lines = min(line_range[1] - line_range[0] + 1, MAX_QUERY_LINES)

        selected, query = query_content(
            content,
            line_range=line_range,
            pattern=args.pattern,
            max_lines=max_lines,
            full=full,
        )
        selected, capped = cap_text(selected, max_chars)
    except (FileNotFoundError, ValueError, OSError, json.JSONDecodeError) as exc:
        print(f"context-guard-artifact: {exc}", file=sys.stderr)
        return 1

    if not args.json:
        sys.stdout.write(selected)
        return 0

    payload = {
        "artifact_id": artifact_id,
        "content_type": metadata.get("content_type"),
        "query": query,
        "capped": capped,
        "content": selected,
        "stored_output": metadata.get("stored_output"),
        "retrieval": metadata.get("retrieval"),
    }
    print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True))
    return 0
857
-
858
-
859
def list_command(args: argparse.Namespace) -> int:
    """Handle the ``list`` subcommand: print a receipt for every stored artifact.

    Each directory from ``artifact_read_directories(args.dir)`` is scanned for
    ``*.json`` metadata files. Directories that are not real directories, are
    symlinks, or are rejected by ``reject_symlink_components`` are skipped, as
    are metadata files that cannot be read or parsed. When the same artifact
    id appears more than once, only the first occurrence is listed.

    Returns:
        Always ``0``.
    """
    receipts: list[dict[str, object]] = []
    listed_ids: set[str] = set()

    for directory in artifact_read_directories(args.dir):
        try:
            reject_symlink_components(directory)
            usable = directory.is_dir() and not directory.is_symlink()
        except RuntimeError:
            continue
        if not usable:
            continue

        for meta_path in sorted(directory.glob("*.json")):
            try:
                data = json.loads(read_bounded_private_text(meta_path, MAX_METADATA_BYTES))
            except (OSError, ValueError, RuntimeError, json.JSONDecodeError):
                # Listing is best-effort: unreadable or malformed entries are ignored.
                continue
            if not isinstance(data, dict):
                continue
            artifact_id = str(data.get("artifact_id", ""))
            if artifact_id in listed_ids or not ARTIFACT_ID_RE.fullmatch(artifact_id):
                continue
            receipts.append(receipt_for(data))
            listed_ids.add(artifact_id)

    receipts.sort(key=lambda receipt: str(receipt.get("artifact_id", "")))

    if args.json:
        print(json.dumps({"artifacts": receipts}, ensure_ascii=False, indent=2, sort_keys=True))
        return 0

    for receipt in receipts:
        stored = receipt.get("stored_output")
        if isinstance(stored, dict):
            print(f"{receipt['artifact_id']}\t{stored.get('lines')} lines\t{stored.get('bytes')} bytes")
        else:
            print(receipt["artifact_id"])
    return 0
890
-
891
-
892
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with the ``store``, ``get`` and ``list`` subcommands.

    Each subcommand sets ``func`` to its handler so the caller can dispatch
    with ``args.func(args)``.
    """
    root = argparse.ArgumentParser(description="Store sanitized large outputs as queryable local artifacts.")
    root.add_argument(
        "--dir",
        default=DEFAULT_ARTIFACT_DIR,
        help=f"artifact directory (default: {DEFAULT_ARTIFACT_DIR})",
    )
    commands = root.add_subparsers(dest="command_name", required=True)

    # store: read stdin, persist it, print a receipt.
    store_parser = commands.add_parser(
        "store",
        help="store stdin as a sanitized artifact and print a compact receipt",
    )
    store_parser.set_defaults(func=store_command)
    store_parser.add_argument("--command", help="optional command label to sanitize into the receipt")
    store_parser.add_argument(
        "--max-bytes",
        type=int,
        default=DEFAULT_MAX_BYTES,
        help="maximum stdin bytes to read before truncating",
    )
    store_parser.add_argument(
        "--show-paths",
        action="store_true",
        help="show raw absolute paths instead of path hashes; local debugging only because private paths may be exposed",
    )
    store_parser.add_argument("--json", action="store_true", help="emit receipt JSON")

    # get: query a previously stored artifact.
    get_parser = commands.add_parser("get", help="query a stored artifact")
    get_parser.set_defaults(func=get_command)
    get_parser.add_argument("artifact_id")
    get_parser.add_argument("--lines", help="1-based inclusive line range, e.g. 10:40")
    get_parser.add_argument("--pattern", help="literal substring filter")
    get_parser.add_argument("--max-lines", type=int, default=None)
    get_parser.add_argument(
        "--full",
        action="store_true",
        help="return full stored artifact content; cannot be combined with selectors",
    )
    get_parser.add_argument("--max-chars", type=int, default=None)
    get_parser.add_argument("--json", action="store_true", help="emit query JSON with content")

    # list: enumerate stored artifacts.
    list_parser = commands.add_parser("list", help="list stored artifacts")
    list_parser.set_defaults(func=list_command)
    list_parser.add_argument("--json", action="store_true", help="emit list JSON")

    return root
922
-
923
-
924
def main() -> int:
    """Parse command-line arguments and run the selected subcommand.

    Returns:
        The subcommand's integer result, or ``1`` after printing the error to
        stderr when the subcommand raises ``RuntimeError`` or ``ValueError``.
    """
    args = build_parser().parse_args()
    try:
        exit_code = int(args.func(args))
    except (RuntimeError, ValueError) as exc:
        print(f"context-guard-artifact: {exc}", file=sys.stderr)
        return 1
    return exit_code
932
-
933
-
934
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())