@ictechgy/context-guard 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +49 -0
- package/LICENSE +201 -0
- package/NOTICE +4 -0
- package/README.ko.md +353 -0
- package/README.md +353 -0
- package/context-guard-kit/README.md +76 -0
- package/context-guard-kit/benchmark_runner.py +1898 -0
- package/context-guard-kit/claude_transcript_cost_audit.py +1591 -0
- package/context-guard-kit/context_compress.py +543 -0
- package/context-guard-kit/context_escrow.py +919 -0
- package/context-guard-kit/context_guard_cli.py +149 -0
- package/context-guard-kit/context_guard_diet.py +1036 -0
- package/context-guard-kit/context_pack.py +929 -0
- package/context-guard-kit/failed_attempt_nudge.py +567 -0
- package/context-guard-kit/guard_large_read.py +690 -0
- package/context-guard-kit/hook_secret_patterns.py +43 -0
- package/context-guard-kit/read_symbol.py +483 -0
- package/context-guard-kit/rewrite_bash_for_token_budget.py +501 -0
- package/context-guard-kit/sanitize_output.py +725 -0
- package/context-guard-kit/settings.example.json +67 -0
- package/context-guard-kit/setup_wizard.py +1724 -0
- package/context-guard-kit/statusline.sh +362 -0
- package/context-guard-kit/statusline_merged.sh +157 -0
- package/context-guard-kit/tool_schema_pruner.py +837 -0
- package/context-guard-kit/trim_command_output.py +1098 -0
- package/docs/distribution.md +55 -0
- package/package.json +70 -0
- package/packaging/homebrew/context-guard.rb.template +34 -0
- package/plugins/context-guard/.claude-plugin/plugin.json +41 -0
- package/plugins/context-guard/LICENSE +201 -0
- package/plugins/context-guard/NOTICE +4 -0
- package/plugins/context-guard/README.ko.md +135 -0
- package/plugins/context-guard/README.md +135 -0
- package/plugins/context-guard/bin/claude-read-symbol +6 -0
- package/plugins/context-guard/bin/claude-sanitize-output +6 -0
- package/plugins/context-guard/bin/claude-token-artifact +6 -0
- package/plugins/context-guard/bin/claude-token-audit +6 -0
- package/plugins/context-guard/bin/claude-token-bench +6 -0
- package/plugins/context-guard/bin/claude-token-diet +6 -0
- package/plugins/context-guard/bin/claude-token-failed-nudge +6 -0
- package/plugins/context-guard/bin/claude-token-guard-read +6 -0
- package/plugins/context-guard/bin/claude-token-rewrite-bash +6 -0
- package/plugins/context-guard/bin/claude-token-setup +6 -0
- package/plugins/context-guard/bin/claude-token-statusline +6 -0
- package/plugins/context-guard/bin/claude-token-statusline-merged +6 -0
- package/plugins/context-guard/bin/claude-trim-output +6 -0
- package/plugins/context-guard/bin/context-guard +149 -0
- package/plugins/context-guard/bin/context-guard-artifact +919 -0
- package/plugins/context-guard/bin/context-guard-audit +1591 -0
- package/plugins/context-guard/bin/context-guard-bench +1898 -0
- package/plugins/context-guard/bin/context-guard-compress +543 -0
- package/plugins/context-guard/bin/context-guard-diet +1036 -0
- package/plugins/context-guard/bin/context-guard-failed-nudge +567 -0
- package/plugins/context-guard/bin/context-guard-guard-read +690 -0
- package/plugins/context-guard/bin/context-guard-pack +929 -0
- package/plugins/context-guard/bin/context-guard-read-symbol +483 -0
- package/plugins/context-guard/bin/context-guard-rewrite-bash +501 -0
- package/plugins/context-guard/bin/context-guard-sanitize-output +725 -0
- package/plugins/context-guard/bin/context-guard-setup +1724 -0
- package/plugins/context-guard/bin/context-guard-statusline +362 -0
- package/plugins/context-guard/bin/context-guard-statusline-merged +157 -0
- package/plugins/context-guard/bin/context-guard-tool-prune +837 -0
- package/plugins/context-guard/bin/context-guard-trim-output +1098 -0
- package/plugins/context-guard/brief/README.md +65 -0
- package/plugins/context-guard/brief/brief-mode.lite.md +29 -0
- package/plugins/context-guard/brief/brief-mode.standard.md +31 -0
- package/plugins/context-guard/brief/brief-mode.ultra.md +32 -0
- package/plugins/context-guard/lib/hook_secret_patterns.py +43 -0
- package/plugins/context-guard/skills/audit/SKILL.md +39 -0
- package/plugins/context-guard/skills/optimize/SKILL.md +48 -0
- package/plugins/context-guard/skills/setup/SKILL.md +40 -0
|
@@ -0,0 +1,919 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Store large sanitized command output outside Claude context and query slices later."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import hashlib
|
|
7
|
+
import importlib.machinery
|
|
8
|
+
import importlib.util
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import re
|
|
13
|
+
import stat
|
|
14
|
+
import sys
|
|
15
|
+
import time
|
|
16
|
+
from typing import Iterable
|
|
17
|
+
|
|
18
|
+
# Relative directory where newly stored artifacts are written by default.
DEFAULT_ARTIFACT_DIR = ".context-guard/artifacts"
# Pre-rebrand default directory; artifact_read_directories() adds it as a
# read fallback when the default directory is requested.
LEGACY_ARTIFACT_DIR = ".claude-token-optimizer/artifacts"
# Default and hard upper bound for the number of stdin bytes `store` reads.
DEFAULT_MAX_BYTES = 10_000_000
MAX_MAX_BYTES = 100_000_000
# Serialized metadata JSON must not exceed this many UTF-8 bytes
# (enforced by shrink_digest_for_metadata_cap before writing).
MAX_METADATA_BYTES = 64_000
# Line count used by the "head" retrieval hint; line-range CLI suggestions
# longer than this add an explicit `--max-lines`.
DEFAULT_MAX_LINES = 80
# NOTE(review): not referenced in the visible part of this file.
DEFAULT_MAX_CHARS = 20_000
# Largest line span a single suggested line-range query advertises.
MAX_QUERY_LINES = 5_000
# Default character cap used by cap_line().
MAX_LINE_CHARS = 2_000
# Character cap and UTF-8 byte cap applied to digest text (cap_digest_text()).
MAX_DIGEST_TEXT_CHARS = 360
MAX_DIGEST_TEXT_BYTES = 512
# UTF-8 byte cap applied to the one-line command preview (sanitize_one_line()).
MAX_COMMAND_PREVIEW_BYTES = 2_048
# Upper bounds on list-valued receipt fields.
MAX_TOP_ERROR_RECEIPTS = 12
MAX_DUPLICATE_GROUPS = 12
MAX_SUGGESTED_QUERIES = 12
# Artifact ids are 16-64 lowercase hexadecimal characters.
ARTIFACT_ID_RE = re.compile(r"^[a-f0-9]{16,64}$")
# First component below the filesystem root -> the only symlink target for
# which normalize_allowed_first_absolute_symlink() rewrites the path.
# NOTE(review): presumably the macOS /tmp and /var layout — confirm.
ALLOWED_FIRST_ABSOLUTE_SYMLINKS = {
    "tmp": Path("/private/tmp"),
    "var": Path("/private/var"),
}
# Case-insensitive markers that flag a line as an error line.
ERROR_RE = re.compile(
    r"(FAIL|FAILED|ERROR|Error:|Exception|Traceback|AssertionError|panic:|fatal:|"
    r"segmentation fault|not ok|\bE\s+assert|\[ERROR\]|✗|✖)",
    re.IGNORECASE,
)
# Secret detector used by FallbackLineSanitizer. Group 1 is the whole match;
# group 2, when present, is the `name=` / `name:` prefix that the sanitizer
# keeps in front of the "[REDACTED]" marker.
SECRET_VALUE_RE = re.compile(
    r"(?i)(Bearer\s+\S+|Basic\s+\S+|gh[pousr]_[A-Za-z0-9_]{20,}|"
    r"github_pat_[A-Za-z0-9_]{20,}|xox[abprs]-[A-Za-z0-9-]{10,}|"
    r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
    r"AIza[0-9A-Za-z_\-]{20,}|"
    r"([A-Za-z0-9_.-]*(?:api[_-]?key|token|secret|password|passwd|pwd)[A-Za-z0-9_.-]*\s*[:=]\s*)\S+)"
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
    """Coerce *value* to an int clamped to the range ``minimum..maximum``.

    Returns *default* unchanged (not clamped) when ``int(value)`` raises
    ``TypeError``, ``ValueError`` or ``OverflowError``.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        return default
    # Apply the lower bound first, then the upper bound.
    floored = parsed if parsed >= minimum else minimum
    return floored if floored <= maximum else maximum
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def cap_line(line: str, limit: int = MAX_LINE_CHARS) -> str:
    """Trim *line* to at most *limit* characters, ending with a trim marker.

    Lines that already fit are returned unchanged. Longer lines keep a prefix
    followed by ``...[line trimmed: N chars]`` where N is the original length.
    When *limit* is too small to hold the marker, the marker itself is cut to
    *limit* characters so the result never exceeds the cap (the same rule
    ``cap_utf8_bytes`` applies to its byte budget).
    """
    if len(line) <= limit:
        return line
    marker = f"...[line trimmed: {len(line)} chars]"
    if len(marker) >= limit:
        # No room for any of the original text: return as much marker as fits.
        return marker[: max(0, limit)]
    return line[: limit - len(marker)] + marker
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def cap_utf8_bytes(text: str, limit: int) -> str:
    """Trim *text* so its UTF-8 encoding fits in *limit* bytes.

    Text that already fits is returned unchanged. Otherwise the result is the
    longest whole-character prefix that leaves room for a
    ``...[line trimmed: N chars/M bytes]`` marker, followed by that marker.
    If the marker alone does not fit, a byte-truncated marker is returned.
    """
    raw = text.encode("utf-8", errors="replace")
    if len(raw) <= limit:
        return text

    suffix = f"...[line trimmed: {len(text)} chars/{len(raw)} bytes]"
    suffix_raw = suffix.encode("utf-8")
    if len(suffix_raw) >= limit:
        return suffix_raw[:limit].decode("utf-8", errors="ignore")

    budget = limit - len(suffix_raw)
    spent = 0
    cut = len(text)
    # Walk characters (never split one) until the next would overflow the budget.
    for index, char in enumerate(text):
        spent += len(char.encode("utf-8", errors="replace"))
        if spent > budget:
            cut = index
            break
    return text[:cut] + suffix
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def cap_digest_text(text: str) -> str:
    """Apply the digest character cap, then the digest UTF-8 byte cap, to *text*."""
    char_capped = cap_line(text, limit=MAX_DIGEST_TEXT_CHARS)
    return cap_utf8_bytes(char_capped, MAX_DIGEST_TEXT_BYTES)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def normalized_link_target(parent: Path, raw_target: str) -> Path:
    """Return the lexically normalized location a symlink target refers to.

    A relative *raw_target* is interpreted relative to *parent*; the result is
    passed through ``os.path.normpath`` (no filesystem access).
    """
    candidate = Path(raw_target)
    anchored = candidate if candidate.is_absolute() else parent / candidate
    return Path(os.path.normpath(str(anchored)))
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite an allow-listed top-level symlink prefix to its real location.

    If *path* is absolute and its first component is a key of
    ``ALLOWED_FIRST_ABSOLUTE_SYMLINKS``, and that component is a symlink on
    disk whose normalized target equals the allow-listed value, the component
    is replaced by that value. In every other case, including any ``OSError``
    while inspecting the link, *path* is returned unchanged.
    """
    parts = path.parts
    if len(parts) < 2 or not path.is_absolute():
        return path

    top_name = parts[1]
    canonical = ALLOWED_FIRST_ABSOLUTE_SYMLINKS.get(top_name)
    if canonical is None:
        return path

    root = Path(path.anchor)
    top_link = root / top_name
    try:
        points_to_canonical = (
            stat.S_ISLNK(os.lstat(top_link).st_mode)
            and normalized_link_target(root, os.readlink(top_link)) == canonical
        )
    except OSError:
        return path
    if not points_to_canonical:
        return path
    return canonical.joinpath(*parts[2:])
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def compact_items(lines: Iterable[str], *, limit: int, max_chars: int = MAX_LINE_CHARS, max_bytes: int | None = None) -> list[str]:
    """Collect up to *limit* distinct, non-empty, size-capped lines in order.

    Each line is stripped and capped to *max_chars* characters, and to
    *max_bytes* UTF-8 bytes when given. Empty results and repeats of an
    already collected item are skipped. Iteration stops once *limit* items
    have been collected; the check runs after an item is added.
    """
    # A dict keeps insertion order and gives O(1) duplicate checks.
    kept: dict[str, None] = {}
    for raw in lines:
        entry = cap_line(raw.strip(), limit=max_chars)
        if max_bytes is not None:
            entry = cap_utf8_bytes(entry, max_bytes)
        if entry and entry not in kept:
            kept[entry] = None
            if len(kept) >= limit:
                break
    return list(kept)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class FallbackLineSanitizer:
    """Minimal regex-based line redactor built on ``SECRET_VALUE_RE``.

    ``load_line_sanitizer`` returns an instance of this class when it finds no
    sanitizer script next to this file.
    """

    def __init__(self, *, show_paths: bool = False) -> None:
        # Stored but not consulted by sanitize() in this class.
        self.show_paths = show_paths
        # Number of lines in which at least one replacement was made.
        self.redactions = 0

    @staticmethod
    def _mask(match: re.Match[str]) -> str:
        """Return the replacement for one secret match, keeping a `name=` prefix."""
        captured = match.groups()
        key_prefix = captured[1] if len(captured) >= 2 else None
        if key_prefix:
            return key_prefix + "[REDACTED]"
        return "[REDACTED]"

    def sanitize(self, raw_line: str) -> tuple[str, bool]:
        """Redact secrets in *raw_line*; return ``(line, whether_anything_changed)``."""
        cleaned, hits = SECRET_VALUE_RE.subn(self._mask, raw_line)
        changed = bool(hits)
        if changed:
            self.redactions += 1
        return cleaned, changed
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def load_line_sanitizer(show_paths: bool) -> object:
    """Return a line sanitizer, preferring a sanitizer script beside this file.

    The first existing file among ``sanitize_output.py``,
    ``context-guard-sanitize-output`` and ``claude-sanitize-output`` in this
    script's directory is executed as a module and its ``LineSanitizer`` is
    instantiated. Any failure while loading that file raises ``RuntimeError``
    (there is no fallback to a later candidate). When none of the files
    exists, a ``FallbackLineSanitizer`` is returned.
    """
    script_dir = Path(__file__).resolve().parent
    names = ("sanitize_output.py", "context-guard-sanitize-output", "claude-sanitize-output")
    candidate = next((script_dir / name for name in names if (script_dir / name).exists()), None)
    if candidate is None:
        return FallbackLineSanitizer(show_paths=show_paths)

    try:
        # SourceFileLoader is used directly because candidates may lack a .py suffix.
        module_name = f"_claude_token_sanitize_{os.getpid()}"
        loader = importlib.machinery.SourceFileLoader(module_name, str(candidate))
        spec = importlib.util.spec_from_loader(loader.name, loader)
        if spec is None:
            raise RuntimeError("import spec unavailable")
        module = importlib.util.module_from_spec(spec)
        loader.exec_module(module)
        return module.LineSanitizer(show_paths=show_paths)
    except Exception as exc:
        raise RuntimeError(f"could not load sanitizer {candidate}: {exc}") from exc
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def sanitize_text(text: str, *, show_paths: bool = False) -> tuple[str, int]:
    """Sanitize *text* line by line.

    Returns the re-joined sanitized text (line endings are kept) and the
    number of lines the sanitizer reported as redacted.
    """
    sanitizer = load_line_sanitizer(show_paths)
    pieces: list[str] = []
    redacted_count = 0
    for raw_line in text.splitlines(True):
        clean, changed = sanitizer.sanitize(raw_line)  # type: ignore[attr-defined]
        pieces.append(clean)
        if changed:
            redacted_count += 1
    return "".join(pieces), redacted_count
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def sanitize_one_line(text: str, *, show_paths: bool = False) -> str:
    """Sanitize *text*, collapse all whitespace runs to single spaces, and cap it.

    The result is capped by ``cap_line`` (default character limit) and then to
    ``MAX_COMMAND_PREVIEW_BYTES`` UTF-8 bytes.
    """
    sanitized, _ = sanitize_text(text + "\n", show_paths=show_paths)
    collapsed = " ".join(sanitized.strip().split())
    return cap_utf8_bytes(cap_line(collapsed), MAX_COMMAND_PREVIEW_BYTES)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def ensure_private_dir(path: Path) -> None:
    """Create *path* (and parents) as a directory and restrict it to mode 0o700.

    Symlink components are rejected both before and after creation. The leaf
    directory is created with mode 0o700 directly, so a newly created
    directory is never left with umask-default permissions before the
    follow-up ``chmod``. Parents created by ``parents=True`` still get default
    permissions. The ``chmod`` is kept for directories that already existed
    and is best-effort: an ``OSError`` from it is ignored.

    Raises:
        RuntimeError: if a component of *path* is a symlink or a non-directory
            (raised by ``reject_symlink_components``).
    """
    path = normalize_allowed_first_absolute_symlink(path)
    reject_symlink_components(path)
    path.mkdir(mode=0o700, parents=True, exist_ok=True)
    # Re-check after creation in case a component was swapped in the meantime.
    reject_symlink_components(path)
    try:
        os.chmod(path, 0o700)
    except OSError:
        # Deliberate best-effort: some filesystems do not support chmod.
        pass
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def reject_symlink_components(path: Path) -> None:
    """Raise if an existing component of *path* is a symlink or blocks traversal.

    Components are checked from the root (or from the current directory for a
    relative path) downwards. The walk stops silently at the first component
    that does not exist.

    Raises:
        RuntimeError: if a component is a symlink, or if a component other
            than *path* itself is not a directory.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    is_abs = path.is_absolute()
    cursor = Path(path.anchor) if is_abs else Path()
    # For absolute paths the anchor is parts[0] and is already in `cursor`.
    names = path.parts[1:] if is_abs else path.parts
    for name in names:
        cursor = cursor / name
        try:
            mode = os.lstat(cursor).st_mode
        except FileNotFoundError:
            return
        if stat.S_ISLNK(mode):
            raise RuntimeError(f"refusing artifact path with symlink component: {cursor}")
        if cursor != path and not stat.S_ISDIR(mode):
            raise RuntimeError(f"refusing artifact path through non-directory component: {cursor}")
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def regular_private_file_size(path: Path) -> int:
    """Return the size in bytes of *path*, which must be a regular non-symlink file.

    Raises:
        RuntimeError: if the parent directory contains a symlink component.
        ValueError: if *path* is a symlink or not a regular file.
        OSError: if *path* cannot be ``lstat``-ed.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    reject_symlink_components(path.parent)
    info = os.lstat(path)
    mode = info.st_mode
    if stat.S_ISLNK(mode):
        raise ValueError(f"artifact file must not be a symlink: {path.name}")
    if not stat.S_ISREG(mode):
        raise ValueError(f"artifact file must be a regular file: {path.name}")
    return int(info.st_size)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def read_bounded_private_text(path: Path, max_bytes: int) -> str:
    """Read a regular, non-symlink file of at most *max_bytes* bytes as text.

    The size is checked via ``lstat`` before opening, via ``fstat`` on the
    open descriptor, and again against the number of bytes actually read.
    The file is opened with ``O_NOFOLLOW`` where the platform provides it.
    Bytes are decoded as UTF-8 with invalid sequences replaced.

    ``os.read`` may return fewer bytes than requested, so the content is read
    in a loop until EOF or until one byte more than the cap has been seen.

    Raises:
        ValueError: if the file is a symlink, not a regular file, or larger
            than *max_bytes*.
        RuntimeError: if the parent path contains a symlink component.
        OSError: if the file cannot be inspected or opened.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    size = regular_private_file_size(path)
    if size > max_bytes:
        raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: {size} > {max_bytes}")
    flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
    fd = os.open(str(path), flags)
    try:
        st = os.fstat(fd)
        if not stat.S_ISREG(st.st_mode):
            raise ValueError(f"artifact file must be a regular file: {path.name}")
        if st.st_size > max_bytes:
            raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: {st.st_size} > {max_bytes}")
        chunks: list[bytes] = []
        # Ask for one byte beyond the cap so growth after fstat is detected.
        remaining = max_bytes + 1
        while remaining > 0:
            chunk = os.read(fd, remaining)
            if not chunk:
                break
            chunks.append(chunk)
            remaining -= len(chunk)
        data = b"".join(chunks)
        if len(data) > max_bytes:
            raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: > {max_bytes}")
        return data.decode("utf-8", errors="replace")
    finally:
        os.close(fd)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def write_private_text(path: Path, text: str) -> None:
    """Write *text* to *path* via a private temp file and ``os.replace``.

    The parent directory is prepared with ``ensure_private_dir``. The temp
    file sits next to *path*, is created exclusively with mode 0o600 (and
    ``O_NOFOLLOW`` where available), and is removed if writing or renaming
    raises. A final ``chmod`` to 0o600 on *path* is best-effort.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    ensure_private_dir(path.parent)
    tmp = path.with_name(path.name + f".tmp-{os.getpid()}-{time.time_ns()}")
    open_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0)
    fd = os.open(str(tmp), open_flags, 0o600)
    try:
        with os.fdopen(fd, "w", encoding="utf-8", newline="") as handle:
            handle.write(text)
        os.replace(tmp, path)
    except Exception:
        # Do not leave the temp file behind; it may already be gone.
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass
        raise
    try:
        os.chmod(path, 0o600)
    except OSError:
        pass
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def read_bounded_stdin(max_bytes: int) -> tuple[str, bool, int]:
    """Read at most *max_bytes* bytes from binary stdin.

    Returns ``(text, truncated, kept_bytes)``: the kept bytes decoded as UTF-8
    with replacement, whether more than *max_bytes* bytes were available, and
    the number of bytes kept (after truncation).
    """
    # One extra byte is requested so overflow can be detected.
    payload = sys.stdin.buffer.read(max_bytes + 1)
    overflowed = len(payload) > max_bytes
    kept = payload[:max_bytes] if overflowed else payload
    return kept.decode("utf-8", errors="replace"), overflowed, len(kept)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def artifact_paths(directory: Path, artifact_id: str) -> tuple[Path, Path]:
    """Return the ``(content .txt, metadata .json)`` paths for *artifact_id*.

    Raises:
        ValueError: if *artifact_id* is not 16-64 lowercase hex characters.
    """
    if ARTIFACT_ID_RE.fullmatch(artifact_id) is None:
        raise ValueError("artifact id must be 16-64 lowercase hex chars")
    base = normalize_allowed_first_absolute_symlink(directory)
    content_path = base / f"{artifact_id}.txt"
    metadata_path = base / f"{artifact_id}.json"
    return content_path, metadata_path
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def artifact_read_directories(raw_dir: str) -> list[Path]:
    """List the directories to search when reading artifacts.

    The first entry is always the requested directory (user-expanded and
    normalized). When the request is exactly the default artifact directory,
    the legacy `.claude-token-optimizer` default is appended as a fallback so
    receipts written before the rebrand still resolve. Stores use only the
    new path.
    """
    requested = Path(raw_dir).expanduser()
    primary = normalize_allowed_first_absolute_symlink(requested)
    if requested != Path(DEFAULT_ARTIFACT_DIR):
        return [primary]
    legacy = normalize_allowed_first_absolute_symlink(Path(LEGACY_ARTIFACT_DIR).expanduser())
    if legacy == primary:
        return [primary]
    return [primary, legacy]
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
# Labels that classify_content_type() can return.
CONTENT_TYPE_VALUES = ("json", "diff", "log", "search", "code", "prose", "text")
# Recommended retrieval strategy per content type. Pattern-oriented payloads
# (logs, search hits, diffs) are best sliced by `--pattern`; structured or
# narrative payloads (json, code, prose) read best by `--lines`. Unknown/empty
# content falls back to a bounded `head` read.
STRATEGY_BY_CONTENT_TYPE = {
    "json": "lines",
    "code": "lines",
    "prose": "lines",
    "diff": "pattern",
    "log": "pattern",
    "search": "pattern",
    "text": "head",
}
# A search hit line starts with `<non-space, non-colon text>:<digits>:`.
_SEARCH_HIT_RE = re.compile(r"^[^\s:]+:\d+:")
# A log line starts with a `YYYY-MM-DD HH:MM` style timestamp, a bracketed
# level such as `[INFO]`, or a bare level word (case-insensitive).
_LOG_LINE_RE = re.compile(
    r"^(\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}|"
    r"\[(?:DEBUG|INFO|WARN|WARNING|ERROR|FATAL|TRACE)\]|"
    r"(?:DEBUG|INFO|WARN|WARNING|ERROR|FATAL|TRACE)\b)",
    re.IGNORECASE,
)
# A code line starts (after optional indentation) with one of a fixed set of
# definition, declaration or control-flow keywords (case-sensitive).
_CODE_LINE_RE = re.compile(
    r"^\s*(def |class |import |from \S+ import |function |const |let |var |"
    r"public |private |protected |#include|package |func |fn |impl |"
    r"return\b|if\s*\(|for\s*\(|while\s*\()"
)
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def classify_content_type(text: str) -> str:
    """Classify stored content into one of CONTENT_TYPE_VALUES (advisory only).

    The classification is dependency-free and deterministic: identical input
    always yields the same label. It never influences redaction or storage; it
    only drives retrieval-strategy hints, so a wrong guess degrades to a less
    ergonomic (but still correct) retrieval suggestion. Empty input is "text".

    Checks run in this order: JSON (must parse), diff markers, log lines,
    search hits, code heuristics; anything else is "prose". The log and search
    checks require at least half of the lines to match, rounded up, so one
    matching line out of three no longer decides the label.
    """
    stripped = text.strip()
    if not stripped:
        return "text"
    if stripped[0] in "{[":
        try:
            json.loads(stripped)
            return "json"
        except (ValueError, RecursionError):
            pass
    lines = stripped.splitlines()
    line_count = len(lines)
    # Ceiling of half the line count (always >= 1 because lines is non-empty).
    majority = max(1, (line_count + 1) // 2)
    diff_hits = sum(1 for line in lines if line.startswith(("diff --git ", "@@ ", "+++ ", "--- ", "index ")))
    if diff_hits and (lines[0].startswith(("diff --git ", "--- ", "@@ ")) or diff_hits >= 2):
        return "diff"
    # Log is checked before search because timestamps (HH:MM:SS) and bracketed
    # levels can superficially resemble the `path:line:` search shape.
    if sum(1 for line in lines if _LOG_LINE_RE.match(line)) >= majority:
        return "log"
    if sum(1 for line in lines if _SEARCH_HIT_RE.match(line)) >= majority:
        return "search"
    code_hits = sum(1 for line in lines if _CODE_LINE_RE.match(line))
    brace_lines = sum(1 for line in lines if line.rstrip().endswith(("{", "}", ";", "):")))
    if code_hits >= 2 or (code_hits >= 1 and brace_lines >= max(2, line_count // 3)):
        return "code"
    return "prose"
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def recommended_strategy(content_type: str) -> str:
    """Return the advisory retrieval strategy for *content_type* ("head" if unknown)."""
    try:
        return STRATEGY_BY_CONTENT_TYPE[content_type]
    except KeyError:
        return "head"
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def first_error_anchor(text: str) -> str | None:
    """Return the first error token found in *text*, or None when there is none.

    Lines are scanned in order with ``ERROR_RE``. The matched text, with
    surrounding whitespace stripped, is returned as it appears in the content,
    so it is a substring of *text* and can seed a `--pattern` retrieval hint.
    """
    for found in map(ERROR_RE.search, text.splitlines()):
        if found is None:
            continue
        anchor = found.group(0).strip()
        if anchor:
            return anchor
    return None
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def build_retrieval_hints(
    artifact_id: str,
    sanitized_text: str,
    *,
    content_type: str,
    strategy: str,
    total_lines: int,
) -> list[dict[str, object]]:
    """Build the ordered list of machine-readable retrieval hints for an artifact.

    Every hint carries a `type`, a `selector` and the matching `cli` command.
    Hints are emitted in a fixed order:

    * ``lines`` - only when *total_lines* >= 1. Covers lines 1 to
      ``min(total_lines, MAX_QUERY_LINES)``. `exact` is true when that range
      is the whole artifact. Extra `max_lines`, `note` and `total_lines` keys
      are added when the range is longer than the default line cap or the
      artifact is longer than the query cap.
    * ``pattern`` - only when ``first_error_anchor`` finds a token in
      *sanitized_text*.
    * ``head`` - always present, bounded by ``DEFAULT_MAX_LINES``.

    *content_type* and *strategy* are accepted but not used by this function.
    """
    hints: list[dict[str, object]] = []

    if total_lines >= 1:
        last = min(total_lines, MAX_QUERY_LINES)
        overflow = total_lines > MAX_QUERY_LINES
        entry: dict[str, object] = {
            "type": "lines",
            "selector": {"start": 1, "end": last},
            "cli": line_query_cli(artifact_id, 1, last),
            "exact": not overflow,
        }
        if last > DEFAULT_MAX_LINES:
            entry.update(
                max_lines=last,
                max_lines_required=True,
                note=(
                    "`--max-lines` in this suggested query is only the returned-line cap for the selected "
                    "`--lines` range; the explicit line range remains the selector."
                ),
            )
        if overflow:
            # Replaces any note set above with the partial-coverage warning.
            entry.update(
                note=(
                    f"first {MAX_QUERY_LINES} lines only; request later ranges for the full artifact. "
                    "`--max-lines` is only the returned-line cap for the selected range."
                ),
                total_lines=total_lines,
            )
        hints.append(entry)

    anchor = first_error_anchor(sanitized_text)
    if anchor is not None:
        pattern_hint: dict[str, object] = {
            "type": "pattern",
            "selector": {"pattern": anchor},
            "cli": f"context-guard-artifact get {artifact_id} --pattern '{anchor}'",
        }
        hints.append(pattern_hint)

    head_hint: dict[str, object] = {
        "type": "head",
        "selector": {"max_lines": DEFAULT_MAX_LINES},
        "cli": f"context-guard-artifact get {artifact_id} --max-lines {DEFAULT_MAX_LINES}",
    }
    hints.append(head_hint)
    return hints
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def line_query_cli(artifact_id: str, start: int, end: int) -> str:
    """Return the `get --lines start:end` command for *artifact_id*.

    When the range spans more than ``DEFAULT_MAX_LINES`` lines, an explicit
    `--max-lines` (capped at ``MAX_QUERY_LINES``) is appended.
    """
    span = end - start + 1
    base = f"context-guard-artifact get {artifact_id} --lines {start}:{end}"
    if span <= DEFAULT_MAX_LINES:
        return base
    return base + f" --max-lines {min(span, MAX_QUERY_LINES)}"
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def line_receipt(artifact_id: str, line_number: int, text: str) -> dict[str, object]:
    """Return a receipt that pins one line: capped text, selector and CLI command."""
    preview = cap_digest_text(text.strip())
    selector = {"type": "lines", "start": line_number, "end": line_number}
    command = line_query_cli(artifact_id, line_number, line_number)
    return {
        "line": line_number,
        "text": preview,
        "selector": selector,
        "cli": command,
    }
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def build_top_error_receipts(artifact_id: str, lines: list[str]) -> list[dict[str, object]]:
    """Return line receipts for the first distinct error lines in *lines*.

    A line qualifies when ``ERROR_RE`` matches it. Lines whose capped digest
    text is empty or already seen are skipped. Collection stops after
    ``MAX_TOP_ERROR_RECEIPTS`` receipts. Line numbers are 1-based.
    """
    # Keyed by capped text: keeps first-seen order and deduplicates.
    by_text: dict[str, dict[str, object]] = {}
    for number, raw in enumerate(lines, start=1):
        if ERROR_RE.search(raw) is None:
            continue
        digest_text = cap_digest_text(raw.strip())
        if not digest_text or digest_text in by_text:
            continue
        by_text[digest_text] = line_receipt(artifact_id, number, digest_text)
        if len(by_text) >= MAX_TOP_ERROR_RECEIPTS:
            break
    return list(by_text.values())
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def build_duplicate_line_groups(artifact_id: str, lines: list[str], *, limit: int = MAX_DUPLICATE_GROUPS) -> list[dict[str, object]]:
    """Summarize lines that occur more than once, most frequent first.

    Lines are compared by their stripped, digest-capped text; empty text is
    ignored. Groups are ordered by descending count, then first occurrence,
    then text, and at most *limit* groups are returned. Each group records the
    count, the 1-based first line, the text and a selector/CLI for that line.
    """
    # text -> [first line number, occurrence count]
    stats: dict[str, list[int]] = {}
    for number, raw in enumerate(lines, start=1):
        text = cap_digest_text(raw.strip())
        if not text:
            continue
        stats.setdefault(text, [number, 0])[1] += 1

    repeated = [(text, first, count) for text, (first, count) in stats.items() if count > 1]
    repeated.sort(key=lambda row: (-row[2], row[1], row[0]))

    groups: list[dict[str, object]] = []
    for text, first, count in repeated[:limit]:
        groups.append(
            {
                "count": count,
                "first_line": first,
                "text": text,
                "selector": {"type": "lines", "start": first, "end": first},
                "cli": line_query_cli(artifact_id, first, first),
            }
        )
    return groups
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
def build_digest(sanitized_text: str, *, artifact_id: str, redacted_lines: int) -> dict[str, object]:
    """Build the advisory digest stored alongside an artifact.

    The digest holds a status ("has_errors" when any error line was found,
    otherwise "stored"), redaction counts, up to 12 distinct error lines,
    error-line receipts, duplicate-line groups, and up to 8 distinct lines
    each from the head and from the last 8 lines of the content.
    """
    lines = sanitized_text.splitlines()

    def digest_items(source: Iterable[str], cap: int) -> list[str]:
        # All digest text lists share the same per-item size caps.
        return compact_items(
            source,
            limit=cap,
            max_chars=MAX_DIGEST_TEXT_CHARS,
            max_bytes=MAX_DIGEST_TEXT_BYTES,
        )

    error_lines = digest_items((line for line in lines if ERROR_RE.search(line)), 12)
    digest: dict[str, object] = {}
    digest["status"] = "has_errors" if error_lines else "stored"
    digest["redacted_lines"] = redacted_lines
    digest["redaction_counts"] = {
        "lines": redacted_lines,
        "markers": sanitized_text.count("[REDACTED]"),
    }
    digest["top_error_lines"] = error_lines
    digest["top_error_receipts"] = build_top_error_receipts(artifact_id, lines)
    digest["duplicate_line_groups"] = build_duplicate_line_groups(artifact_id, lines)
    digest["representative_head"] = digest_items(lines, 8)
    digest["representative_tail"] = digest_items(lines[-8:], 8)
    return digest
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def suggested_queries_for(metadata: dict[str, object]) -> list[str]:
    """Collect the distinct `cli` commands advertised inside *metadata*.

    Commands come, in order, from the digest's `top_error_receipts` and
    `duplicate_line_groups`, then from the retrieval `hints`. Entries of an
    unexpected type are ignored. At most ``MAX_SUGGESTED_QUERIES`` commands
    are returned.
    """
    sources: list[object] = []
    digest = metadata.get("digest")
    if isinstance(digest, dict):
        sources.extend(digest.get(key) for key in ("top_error_receipts", "duplicate_line_groups"))
    retrieval = metadata.get("retrieval")
    if isinstance(retrieval, dict):
        sources.append(retrieval.get("hints"))

    # A dict gives ordered, duplicate-free collection.
    ordered: dict[str, None] = {}
    for entries in sources:
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            command = entry.get("cli")
            if isinstance(command, str) and command:
                ordered.setdefault(command, None)
    return list(ordered)[:MAX_SUGGESTED_QUERIES]
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def receipt_for(metadata: dict[str, object]) -> dict[str, object]:
    """Build the receipt dictionary for stored artifact *metadata*.

    Selected metadata fields are copied through (missing ones become None),
    followed by three generic example queries and the artifact-specific
    suggested queries.

    Raises:
        KeyError: if *metadata* has no "artifact_id".
    """
    artifact_id = str(metadata["artifact_id"])
    receipt: dict[str, object] = {"artifact_id": artifact_id, "stored": True}
    passthrough = (
        "created_at",
        "command_preview",
        "content_type",
        "input",
        "stored_output",
        "digest",
        "retrieval",
    )
    for key in passthrough:
        receipt[key] = metadata.get(key)
    receipt["available_queries"] = [
        f"context-guard-artifact get {artifact_id} --lines 1:80",
        f"context-guard-artifact get {artifact_id} --pattern ERROR --max-lines 40",
        f"context-guard-artifact get {artifact_id} --json --lines 1:20",
    ]
    receipt["suggested_queries"] = suggested_queries_for(metadata)
    return receipt
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
def metadata_json_text(metadata: dict[str, object]) -> str:
    """Serialize *metadata* as sorted, 2-space-indented JSON ending in a newline.

    Non-ASCII characters are written as-is rather than escaped.
    """
    body = json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True)
    return f"{body}\n"
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def metadata_size_bytes(metadata: dict[str, object]) -> int:
    """Return the UTF-8 byte length of *metadata* as serialized by metadata_json_text."""
    serialized = metadata_json_text(metadata)
    encoded = serialized.encode("utf-8", errors="replace")
    return len(encoded)
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def metadata_cap_diagnostic(metadata: dict[str, object], *, stage: str) -> str:
    """Format the error message used when metadata cannot fit the size cap.

    The message reports the serialized size, the cap, the *stage* label and
    the current length of each list-valued digest field ("none" when the
    digest is missing or has no such lists).
    """
    tracked_keys = (
        "representative_tail",
        "representative_head",
        "duplicate_line_groups",
        "top_error_lines",
        "top_error_receipts",
    )
    digest = metadata.get("digest")
    remaining: list[str] = []
    if isinstance(digest, dict):
        for key in tracked_keys:
            entries = digest.get(key)
            if isinstance(entries, list):
                remaining.append(f"{key}={len(entries)}")
    counts_text = ",".join(remaining) or "none"
    segments = [
        "artifact metadata exceeds trusted size cap before write: ",
        f"metadata_bytes={metadata_size_bytes(metadata)} ",
        f"metadata_cap_bytes={MAX_METADATA_BYTES} ",
        f"stage={stage} ",
        f"remaining_digest_items={counts_text}; ",
        "authoritative artifact content was not written because the receipt would be unreadable",
    ]
    return "".join(segments)
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
def shrink_digest_for_metadata_cap(metadata: dict[str, object]) -> None:
    """Trim digest examples in place until the metadata fits the read cap.

    The digest only carries advisory receipts about the authoritative `.txt`
    artifact, so when the serialized metadata is larger than
    ``MAX_METADATA_BYTES`` its example lists are shortened one entry at a
    time, lowest-priority list first, rather than writing a metadata file
    that later reads would refuse.

    Raises:
        ValueError: If the metadata is over the cap and either has no digest
            mapping or every shrinkable digest list is already empty.
    """

    def exceeds_cap() -> bool:
        return metadata_size_bytes(metadata) > MAX_METADATA_BYTES

    digest = metadata.get("digest")
    if not isinstance(digest, dict):
        # Nothing can be trimmed without a digest mapping.
        if exceeds_cap():
            raise ValueError(metadata_cap_diagnostic(metadata, stage="no_digest"))
        return
    if not exceeds_cap():
        return

    # Record that the digest was reduced so readers know it is partial.
    digest["capped_for_metadata"] = True
    digest["metadata_cap_bytes"] = MAX_METADATA_BYTES
    drop_priority = (
        "representative_tail",
        "representative_head",
        "duplicate_line_groups",
        "top_error_lines",
        "top_error_receipts",
    )
    while exceeds_cap():
        # First non-empty list in priority order loses its last entry.
        shrinkable = next(
            (
                digest[name]
                for name in drop_priority
                if isinstance(digest.get(name), list) and digest[name]
            ),
            None,
        )
        if shrinkable is None:
            raise ValueError(metadata_cap_diagnostic(metadata, stage="digest_shrink_exhausted"))
        shrinkable.pop()
|
|
662
|
+
|
|
663
|
+
|
|
664
|
+
def store_command(args: argparse.Namespace) -> int:
    """Handle ``store``: save sanitized stdin as an artifact and print a receipt.

    Reads at most ``--max-bytes`` from stdin, sanitizes the text, derives a
    20-hex-character artifact id from the content hash, the sanitized command
    label and the truncation flag, then writes the content file followed by
    the metadata file. The metadata is checked against the metadata size cap
    before anything is written.

    Args:
        args: Parsed CLI namespace; reads ``dir``, ``max_bytes``,
            ``show_paths``, ``command`` and ``json``.

    Returns:
        ``0`` after the receipt has been printed.

    Raises:
        ValueError: Propagated from ``shrink_digest_for_metadata_cap`` when
            the metadata cannot be made to fit under the cap.
    """
    directory = normalize_allowed_first_absolute_symlink(Path(args.dir).expanduser())
    byte_limit = bounded_int(args.max_bytes, DEFAULT_MAX_BYTES, 1, MAX_MAX_BYTES)
    raw_text, input_truncated, input_bytes = read_bounded_stdin(byte_limit)
    sanitized_text, redacted_lines = sanitize_text(raw_text, show_paths=args.show_paths)

    # One encoded copy feeds both the byte count and the checksum.
    encoded_content = sanitized_text.encode("utf-8", errors="replace")
    content_bytes = len(encoded_content)
    content_sha = hashlib.sha256(encoded_content).hexdigest()

    command_preview = None
    if args.command:
        command_preview = sanitize_one_line(args.command, show_paths=args.show_paths)

    id_fields = {
        "content_sha256": content_sha,
        "command_preview": command_preview,
        "input_truncated": input_truncated,
    }
    id_basis = json.dumps(id_fields, sort_keys=True)
    artifact_id = hashlib.sha256(id_basis.encode("utf-8")).hexdigest()[:20]
    content_path, meta_path = artifact_paths(directory, artifact_id)

    # A final line without a trailing newline still counts as a line.
    total_lines = sanitized_text.count("\n")
    if sanitized_text and not sanitized_text.endswith("\n"):
        total_lines += 1

    content_type = classify_content_type(sanitized_text)
    strategy = recommended_strategy(content_type)
    input_info = {
        "bytes_read": input_bytes,
        "truncated": input_truncated,
        "max_bytes": byte_limit,
    }
    stored_info = {
        "bytes": content_bytes,
        "lines": total_lines,
        "sha256": content_sha,
        "content_file": content_path.name,
        "metadata_file": meta_path.name,
    }
    metadata: dict[str, object] = {
        "artifact_id": artifact_id,
        "created_at": int(time.time()),
        "command_preview": command_preview,
        "content_type": content_type,
        "input": input_info,
        "stored_output": stored_info,
        "digest": build_digest(sanitized_text, artifact_id=artifact_id, redacted_lines=redacted_lines),
        "retrieval": {
            "strategy": strategy,
            "deterministic": True,
            "hints": build_retrieval_hints(
                artifact_id,
                sanitized_text,
                content_type=content_type,
                strategy=strategy,
                total_lines=total_lines,
            ),
        },
    }

    # Size check happens first so an oversized receipt aborts before any write.
    shrink_digest_for_metadata_cap(metadata)
    write_private_text(content_path, sanitized_text)
    write_private_text(meta_path, metadata_json_text(metadata))

    receipt = receipt_for(metadata)
    if args.json:
        print(json.dumps(receipt, ensure_ascii=False, indent=2, sort_keys=True))
        return 0

    print(f"artifact_id={artifact_id}")
    stored = receipt["stored_output"]
    if isinstance(stored, dict):
        print(f"stored_output={stored.get('lines')} lines/{stored.get('bytes')} bytes")
    digest = receipt.get("digest")
    if isinstance(digest, dict) and digest.get("top_error_lines"):
        print("top_error_lines:")
        for error_line in digest["top_error_lines"]:  # type: ignore[index]
            print(f"- {error_line}")
    print(f"query=context-guard-artifact get {artifact_id} --lines 1:80")
    return 0
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def load_metadata(directory: Path, artifact_id: str) -> dict[str, object]:
    """Load and validate the metadata JSON of one stored artifact.

    Args:
        directory: Artifact directory to look in.
        artifact_id: Identifier of the artifact whose metadata is wanted.

    Returns:
        The decoded metadata mapping.

    Raises:
        FileNotFoundError: When either helper reports a missing file; the
            message names the artifact id and the original error is chained
            as ``__cause__`` so the underlying path stays visible in
            tracebacks.
        ValueError: When the metadata is not a JSON object, its
            ``artifact_id`` differs from the requested one, or the text is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    content_path, meta_path = artifact_paths(directory, artifact_id)
    try:
        # Called for its checks only; the returned size is not needed here.
        regular_private_file_size(content_path)
        meta_text = read_bounded_private_text(meta_path, MAX_METADATA_BYTES)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"artifact not found: {artifact_id}") from exc
    data = json.loads(meta_text)
    if not isinstance(data, dict) or data.get("artifact_id") != artifact_id:
        raise ValueError(f"artifact metadata mismatch: {artifact_id}")
    return data
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
def parse_line_range(value: str | None) -> tuple[int, int] | None:
|
|
749
|
+
if not value:
|
|
750
|
+
return None
|
|
751
|
+
match = re.fullmatch(r"(\d+)(?::(\d+))?", value.strip())
|
|
752
|
+
if not match:
|
|
753
|
+
raise ValueError("--lines must be START or START:END using 1-based inclusive line numbers")
|
|
754
|
+
start = int(match.group(1))
|
|
755
|
+
end = int(match.group(2) or match.group(1))
|
|
756
|
+
if start < 1 or end < start:
|
|
757
|
+
raise ValueError("--lines must satisfy 1 <= START <= END")
|
|
758
|
+
return start, end
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
def cap_text(text: str, max_chars: int) -> tuple[str, bool]:
    """Limit query output to roughly ``max_chars`` characters.

    Returns ``(text, False)`` unchanged when it already fits. Otherwise the
    text is cut so that prefix plus marker fit in ``max_chars``, trailing
    whitespace is stripped from the prefix, and a marker line stating the
    original length is appended; the flag is then ``True``.

    NOTE: the marker is always emitted in full, so when ``max_chars`` is
    smaller than the marker the result is the marker alone and is longer
    than ``max_chars``.
    """
    total_chars = len(text)
    if total_chars <= max_chars:
        return text, False
    marker = f"\n[context-guard-kit] artifact query capped: {total_chars} chars total\n"
    budget = max_chars - len(marker)
    prefix = text[:budget].rstrip() if budget > 0 else ""
    return prefix + marker, True
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
def query_content(content: str, *, line_range: tuple[int, int] | None, pattern: str | None, max_lines: int) -> tuple[str, dict[str, object]]:
|
|
770
|
+
lines = content.splitlines(True)
|
|
771
|
+
selected: list[tuple[int, str]] = []
|
|
772
|
+
if line_range is not None:
|
|
773
|
+
start, end = line_range
|
|
774
|
+
selected = list(enumerate(lines[start - 1 : end], start=start))
|
|
775
|
+
selector = {"type": "lines", "start": start, "end": end}
|
|
776
|
+
elif pattern:
|
|
777
|
+
selected = [(idx, line) for idx, line in enumerate(lines, start=1) if pattern in line]
|
|
778
|
+
selector = {"type": "pattern", "pattern": pattern}
|
|
779
|
+
else:
|
|
780
|
+
selected = list(enumerate(lines[:max_lines], start=1))
|
|
781
|
+
selector = {"type": "head", "max_lines": max_lines}
|
|
782
|
+
total_matches = len(selected)
|
|
783
|
+
selected = selected[:max_lines]
|
|
784
|
+
text = "".join(line for _idx, line in selected)
|
|
785
|
+
return text, {"selector": selector, "returned_lines": len(selected), "matched_lines": total_matches, "total_lines": len(lines)}
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
def get_command(args: argparse.Namespace) -> int:
    """Handle ``get``: verify a stored artifact and print the requested slice.

    The artifact is looked up in each readable artifact directory in turn and
    the first one whose metadata loads is used. Before any content is
    returned, the content file's size and SHA-256 are checked against the
    values recorded in the metadata. The selection is then made by
    ``--lines``, ``--pattern`` or head, limited by ``--max-lines`` and
    ``--max-chars``.

    Args:
        args: Parsed CLI namespace; reads ``artifact_id``, ``dir``, ``lines``,
            ``pattern``, ``max_lines``, ``max_chars`` and ``json``.

    Returns:
        ``0`` on success; ``1`` after printing a one-line error to stderr when
        the artifact is missing, fails verification, or the query is invalid.
    """
    artifact_id = args.artifact_id
    char_limit = bounded_int(args.max_chars, DEFAULT_MAX_CHARS, 1, 1_000_000)
    try:
        found = False
        missing: FileNotFoundError | None = None
        for candidate_dir in artifact_read_directories(args.dir):
            try:
                metadata = load_metadata(candidate_dir, artifact_id)
                content_path, _meta_path = artifact_paths(candidate_dir, artifact_id)
            except FileNotFoundError as exc:
                # Remember the miss and try the next directory.
                missing = exc
            else:
                found = True
                break
        if not found:
            if missing is not None:
                raise missing
            raise FileNotFoundError(f"artifact not found: {artifact_id}")

        stored_output = metadata.get("stored_output")
        recorded = stored_output if isinstance(stored_output, dict) else {}

        expected_sha = recorded.get("sha256")
        if not (isinstance(expected_sha, str) and re.fullmatch(r"[a-f0-9]{64}", expected_sha)):
            raise ValueError(f"artifact metadata missing stored_output sha256: {artifact_id}")

        expected_bytes = recorded.get("bytes")
        if not isinstance(expected_bytes, int) or not 0 <= expected_bytes <= MAX_MAX_BYTES:
            raise ValueError(f"artifact metadata has invalid stored_output bytes: {artifact_id}")

        # Cheap size comparison first, then the full checksum over the text.
        if regular_private_file_size(content_path) != expected_bytes:
            raise ValueError(f"artifact content checksum mismatch: {artifact_id}")
        content = read_bounded_private_text(content_path, expected_bytes)
        content_digest = hashlib.sha256(content.encode("utf-8", errors="replace")).hexdigest()
        if content_digest != expected_sha:
            raise ValueError(f"artifact content checksum mismatch: {artifact_id}")

        line_range = parse_line_range(args.lines)
        if line_range is not None and args.max_lines is None:
            # Without an explicit --max-lines the range length is the limit.
            range_length = line_range[1] - line_range[0] + 1
            max_lines = min(range_length, MAX_QUERY_LINES)
        else:
            max_lines = bounded_int(args.max_lines, DEFAULT_MAX_LINES, 1, MAX_QUERY_LINES)

        selected, query = query_content(content, line_range=line_range, pattern=args.pattern, max_lines=max_lines)
        selected, capped = cap_text(selected, char_limit)
    except (OSError, ValueError) as exc:
        # FileNotFoundError is an OSError and json.JSONDecodeError is a
        # ValueError, so the two base classes cover both.
        print(f"context-guard-artifact: {exc}", file=sys.stderr)
        return 1

    if not args.json:
        sys.stdout.write(selected)
        return 0

    payload = {
        "artifact_id": artifact_id,
        "content_type": metadata.get("content_type"),
        "query": query,
        "capped": capped,
        "content": selected,
        "stored_output": metadata.get("stored_output"),
        "retrieval": metadata.get("retrieval"),
    }
    print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True))
    return 0
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
def list_command(args: argparse.Namespace) -> int:
    """Handle ``list``: print a receipt line (or JSON) for each stored artifact.

    Every readable artifact directory is scanned for ``*.json`` metadata
    files. Directories that fail the symlink checks, and metadata files that
    cannot be read or decoded, are skipped silently. An artifact id seen in
    an earlier directory is not listed again. Results are sorted by id.

    Args:
        args: Parsed CLI namespace; reads ``dir`` and ``json``.

    Returns:
        Always ``0``.
    """
    receipts: list[dict[str, object]] = []
    seen_ids: set[str] = set()

    for directory in artifact_read_directories(args.dir):
        try:
            reject_symlink_components(directory)
            usable = directory.is_dir() and not directory.is_symlink()
        except RuntimeError:
            usable = False
        if not usable:
            continue

        for meta_path in sorted(directory.glob("*.json")):
            try:
                data = json.loads(read_bounded_private_text(meta_path, MAX_METADATA_BYTES))
            except (OSError, ValueError, RuntimeError):
                # json.JSONDecodeError is a ValueError; unreadable entries are skipped.
                continue
            if not isinstance(data, dict):
                continue
            artifact_id = str(data.get("artifact_id", ""))
            if artifact_id in seen_ids or not ARTIFACT_ID_RE.fullmatch(artifact_id):
                continue
            receipts.append(receipt_for(data))
            seen_ids.add(artifact_id)

    def sort_key(receipt: dict[str, object]) -> str:
        return str(receipt.get("artifact_id", ""))

    receipts.sort(key=sort_key)

    if args.json:
        print(json.dumps({"artifacts": receipts}, ensure_ascii=False, indent=2, sort_keys=True))
        return 0

    for receipt in receipts:
        stored = receipt.get("stored_output")
        if not isinstance(stored, dict):
            print(receipt["artifact_id"])
            continue
        print(f"{receipt['artifact_id']}\t{stored.get('lines')} lines\t{stored.get('bytes')} bytes")
    return 0
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the CLI parser with its ``store``, ``get`` and ``list`` commands.

    A subcommand is required. Each subparser sets ``func`` to its handler so
    the caller can dispatch with ``args.func(args)``.
    """
    root = argparse.ArgumentParser(description="Store sanitized large outputs as queryable local artifacts.")
    root.add_argument(
        "--dir",
        default=DEFAULT_ARTIFACT_DIR,
        help=f"artifact directory (default: {DEFAULT_ARTIFACT_DIR})",
    )
    commands = root.add_subparsers(dest="command_name", required=True)

    # store: read stdin, sanitize, persist, print receipt.
    store_parser = commands.add_parser("store", help="store stdin as a sanitized artifact and print a compact receipt")
    store_parser.add_argument("--command", help="optional command label to sanitize into the receipt")
    store_parser.add_argument(
        "--max-bytes",
        type=int,
        default=DEFAULT_MAX_BYTES,
        help="maximum stdin bytes to read before truncating",
    )
    store_parser.add_argument(
        "--show-paths",
        action="store_true",
        help="show raw absolute paths instead of path hashes; local debugging only because private paths may be exposed",
    )
    store_parser.add_argument("--json", action="store_true", help="emit receipt JSON")
    store_parser.set_defaults(func=store_command)

    # get: query one stored artifact by id.
    get_parser = commands.add_parser("get", help="query a stored artifact")
    get_parser.add_argument("artifact_id")
    get_parser.add_argument("--lines", help="1-based inclusive line range, e.g. 10:40")
    get_parser.add_argument("--pattern", help="literal substring filter")
    get_parser.add_argument("--max-lines", type=int, default=None)
    get_parser.add_argument("--max-chars", type=int, default=DEFAULT_MAX_CHARS)
    get_parser.add_argument("--json", action="store_true", help="emit query JSON with content")
    get_parser.set_defaults(func=get_command)

    # list: enumerate stored artifacts.
    list_parser = commands.add_parser("list", help="list stored artifacts")
    list_parser.add_argument("--json", action="store_true", help="emit list JSON")
    list_parser.set_defaults(func=list_command)

    return root
|
|
906
|
+
|
|
907
|
+
|
|
908
|
+
def main() -> int:
    """Parse the command line, run the selected subcommand, return its status.

    Returns:
        The handler's integer exit status, or ``1`` after printing a one-line
        ``context-guard-artifact: ...`` message to stderr when the handler
        raises ``OSError``, ``RuntimeError`` or ``ValueError``.
    """
    args = build_parser().parse_args()
    try:
        return int(args.func(args))
    except (OSError, RuntimeError, ValueError) as exc:
        # OSError is included because store_command and list_command do not
        # catch filesystem failures themselves; without it those errors
        # surface as a raw traceback instead of the one-line message that
        # get_command already produces for the same class of error.
        print(f"context-guard-artifact: {exc}", file=sys.stderr)
        return 1
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
if __name__ == "__main__":
    # Run the CLI only when executed as a script; main()'s return value
    # becomes the process exit status.
    sys.exit(main())
|