@ictechgy/context-guard 0.4.9 → 0.4.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/CHANGELOG.md +16 -0
  2. package/README.ko.md +41 -24
  3. package/README.md +66 -26
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  8. package/docs/distribution.md +10 -7
  9. package/docs/experimental-benchmark-fixtures.md +8 -1
  10. package/package.json +3 -6
  11. package/packaging/homebrew/context-guard.rb.template +1 -1
  12. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  13. package/plugins/context-guard/README.ko.md +9 -6
  14. package/plugins/context-guard/README.md +21 -13
  15. package/plugins/context-guard/bin/context-guard +113 -26
  16. package/plugins/context-guard/bin/context-guard-artifact +542 -46
  17. package/plugins/context-guard/bin/context-guard-cache-score +380 -0
  18. package/plugins/context-guard/bin/context-guard-compress +146 -1
  19. package/plugins/context-guard/bin/context-guard-cost +783 -4
  20. package/plugins/context-guard/bin/context-guard-experiments +99 -18
  21. package/plugins/context-guard/bin/context-guard-failed-nudge +3 -0
  22. package/plugins/context-guard/bin/context-guard-filter +163 -7
  23. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  24. package/plugins/context-guard/bin/context-guard-pack +602 -43
  25. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  26. package/plugins/context-guard/bin/context-guard-setup +165 -31
  27. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  28. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  29. package/plugins/context-guard/bin/context-guard-tool-prune +241 -1
  30. package/plugins/context-guard/lib/context_guard_commands.py +206 -0
  31. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  32. package/context-guard-kit/README.md +0 -91
  33. package/context-guard-kit/benchmark_runner.py +0 -2401
  34. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  35. package/context-guard-kit/context_compress.py +0 -695
  36. package/context-guard-kit/context_escrow.py +0 -935
  37. package/context-guard-kit/context_filter.py +0 -637
  38. package/context-guard-kit/context_guard_cli.py +0 -325
  39. package/context-guard-kit/context_guard_diet.py +0 -1711
  40. package/context-guard-kit/context_pack.py +0 -2713
  41. package/context-guard-kit/cost_guard.py +0 -2349
  42. package/context-guard-kit/experimental_registry.py +0 -4348
  43. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  44. package/context-guard-kit/guard_large_read.py +0 -690
  45. package/context-guard-kit/hook_secret_patterns.py +0 -43
  46. package/context-guard-kit/read_symbol.py +0 -483
  47. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  48. package/context-guard-kit/sanitize_output.py +0 -725
  49. package/context-guard-kit/settings.example.json +0 -67
  50. package/context-guard-kit/setup_wizard.py +0 -2515
  51. package/context-guard-kit/statusline.sh +0 -362
  52. package/context-guard-kit/statusline_merged.sh +0 -157
  53. package/context-guard-kit/tool_schema_pruner.py +0 -837
  54. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -1,837 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Select a bounded top-k subset from a local tool/MCP schema catalog.
3
-
4
- The helper is advisory only: it never edits MCP config or an agent's tool
5
- registry. It writes a compact receipt plus a separate sanitized payload so an
6
- agent can inject a small selection report first and recover the full sanitized
7
- schema later when needed.
8
- """
9
- from __future__ import annotations
10
-
11
- import argparse
12
- import hashlib
13
- import json
14
- import os
15
- import shlex
16
- from pathlib import Path
17
- import re
18
- import stat
19
- import sys
20
- import time
21
- from dataclasses import dataclass
22
- from typing import Any, NoReturn
23
-
24
# Identity strings stamped into every receipt, payload and report.
TOOL_NAME = "context-guard-tool-prune"
SCHEMA_VERSION = "contextguard.tool-prune.v1"

# Default values for the command-line knobs; byte limits are UTF-8 byte counts.
DEFAULT_STORE_DIR = ".context-guard/tool-prune"
DEFAULT_TOP = 5
DEFAULT_BUDGET_BYTES = 12_000
DEFAULT_MAX_CATALOG_BYTES = 1_000_000
DEFAULT_MAX_OUTPUT_BYTES = 65_536
DEFAULT_MAX_PAYLOAD_BYTES = 1_048_576
DEFAULT_MAX_RECEIPT_BYTES = 16_384

# Hard ceilings applied to labels and lists that end up in the report.
MAX_TOP = 200
MAX_LABEL_CHARS = 160
MAX_DESCRIPTION_CHARS = 360
MAX_OMITTED_TOOLS = 30
TOKEN_PROXY_CHARS_PER_TOKEN = 4

# First-level absolute symlinks that are rewritten to their target instead of
# being rejected (presumably the macOS /tmp and /var layout -- confirm).
ALLOWED_FIRST_ABSOLUTE_SYMLINKS = {
    "tmp": Path("/private/tmp"),
    "var": Path("/private/var"),
}

# Receipt ids are lowercase hex, 16 to 64 characters.
RECEIPT_ID_RE = re.compile(r"^[a-f0-9]{16,64}$")
# Word-like tokens used for lexical scoring.
TERM_RE = re.compile(r"[A-Za-z0-9_]+")
# Secret-looking substrings: private-key blocks, well-known token prefixes,
# Authorization headers, sensitive URL query parameters and key/value pairs.
SECRET_RE = re.compile(
    r"(?is)("
    r"-----BEGIN (?:[A-Z0-9 ]*PRIVATE KEY|PGP PRIVATE KEY BLOCK)-----.*?-----END (?:[A-Z0-9 ]*PRIVATE KEY|PGP PRIVATE KEY BLOCK)-----|"
    r"AKIA[0-9A-Z]{16}|"
    r"gh[pousr]_[A-Za-z0-9_]{20,}|"
    r"github_pat_[A-Za-z0-9_]{20,}|"
    r"glpat-[A-Za-z0-9_-]{12,}|"
    r"xox[abprs]-[A-Za-z0-9-]{10,}|"
    r"sk-(?:ant|proj)-[A-Za-z0-9_-]{8,}|"
    r"sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
    r"AIza[0-9A-Za-z_\-]{20,}|"
    r"(?i:Authorization)\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+|"
    r"[?&](?:X-Amz-Signature|X-Amz-Credential|X-Amz-Security-Token|AWSAccessKeyId|Signature|sig|access_token|refresh_token|id_token|auth|authorization|api[_-]?key|apikey|token|secret|password|client[_-]?secret|private[_-]?key|privatekey|pgp[_-]?private[_-]?key|pgpprivatekey|ssh[_-]?key|sshkey|(?:aws[_-]?)?access[_-]?key(?:[_-]?id)?|awsaccesskeyid)=[^&#\s,}\]]+|"
    r"(?<![A-Za-z0-9])(?:api[_-]?key|apikey|token|secret|password|client[_-]?secret|authorization|credential|signature|sig|private[_-]?key|privatekey|pgp[_-]?private[_-]?key|pgpprivatekey|ssh[_-]?key|sshkey|(?:aws[_-]?)?access[_-]?key(?:[_-]?id)?|awsaccesskeyid)\s*[:=]\s*[^\s,}\]]+"
    r")"
)
# Key names whose values are treated as secret regardless of their content.
SENSITIVE_KEY_RE = re.compile(
    r"(?i)(authorization|api[_-]?key|apikey|token|secret|password|passwd|pwd|client[_-]?secret|credential|signature|sig|x-amz-signature|x-amz-credential|awsaccesskeyid|(?:aws[_-]?)?access[_-]?key(?:[_-]?id)?|private[_-]?key|privatekey|pgp[_-]?private[_-]?key|pgpprivatekey|ssh[_-]?key|sshkey)"
)
# Schema keys that can carry a literal value (redacted under a sensitive property).
VALUE_BEARING_KEY_RE = re.compile(r"(?i)^(default|const|enum|example|examples|value|values)$")
64
-
65
-
66
class ToolPruneError(ValueError):
    """Error carrying a user-facing message; raised whenever the tool fails closed."""
68
-
69
-
70
@dataclass(frozen=True)
class Candidate:
    """Immutable description of one tool taken from the catalog.

    ``score`` and ``rank`` default to zero so a candidate can be created
    before any ranking has happened.
    """

    # Display label of the tool.
    name: str
    # Label of the owning server, or None when the catalog names none.
    server: str | None
    # Short description text.
    description: str
    # The tool's schema mapping.
    schema: dict[str, Any]
    # Position at which the tool was discovered in the catalog.
    index: int
    # Relevance score for the current query.
    score: float = 0.0
    # 1-based position after sorting; 0 means "not ranked yet".
    rank: int = 0
79
-
80
-
81
def fail(message: str) -> NoReturn:
    """Stop the current operation by raising ``ToolPruneError`` with *message*."""
    error = ToolPruneError(message)
    raise error
83
-
84
-
85
def byte_len_text(text: str) -> int:
    """Return how many bytes *text* occupies as UTF-8.

    Code points that cannot be encoded are replaced rather than raising.
    """
    encoded = text.encode("utf-8", errors="replace")
    return len(encoded)
87
-
88
-
89
def json_bytes(data: Any, *, indent: int | None = None) -> str:
    """Serialize *data* to deterministic JSON text (sorted keys, non-ASCII kept).

    Without *indent* the most compact separators are used; with *indent* the
    output is pretty-printed using the ``json`` module's default separators.
    Despite the name, the return value is a ``str``.
    """
    if indent is None:
        return json.dumps(data, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    return json.dumps(data, ensure_ascii=False, sort_keys=True, indent=indent)
91
-
92
-
93
def byte_len_json(data: Any) -> int:
    """Return the UTF-8 byte size of *data* in its compact JSON form."""
    compact = json_bytes(data)
    return byte_len_text(compact)
95
-
96
-
97
def sha256_text(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded as UTF-8 (with replacement)."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8", errors="replace"))
    return digest.hexdigest()
99
-
100
-
101
def bounded_int(value: object, *, default: int, minimum: int, maximum: int, name: str) -> int:
    """Coerce *value* to an int and check it against an inclusive range.

    Args:
        value: Raw option value; ``None`` means "not supplied".
        default: Value used when *value* is ``None``.
        minimum: Smallest accepted value (inclusive).
        maximum: Largest accepted value (inclusive).
        name: Option name used in error messages.

    Returns:
        The validated integer.

    Raises:
        ToolPruneError: If the value is not an integer or is out of range.
    """
    # ``default`` used to be accepted but ignored, so an unset option failed
    # with "must be an integer" instead of falling back.
    if value is None:
        value = default
    try:
        number = int(value)
    except (TypeError, ValueError, OverflowError):
        fail(f"{name} must be an integer")
    if number < minimum:
        fail(f"{name} must be >= {minimum}")
    if number > maximum:
        fail(f"{name} must be <= {maximum}")
    return number
111
-
112
-
113
def cap_text(value: object, limit: int = MAX_LABEL_CHARS) -> str:
    """Collapse whitespace in *value* and cap the result at *limit* characters.

    Falsy values become the empty string. Text that is too long is cut and
    suffixed with a ``…[trimmed:N chars]`` marker, where N is the length of
    the whitespace-collapsed text.

    Args:
        value: Anything convertible with ``str``.
        limit: Maximum number of characters in the returned string.

    Returns:
        The normalized text, never longer than *limit*.
    """
    text = " ".join(str(value or "").split())
    if len(text) <= limit:
        return text
    marker = f"…[trimmed:{len(text)} chars]"
    room = limit - len(marker)
    if room < 0:
        # The marker alone would overshoot the cap, so honor the cap with a
        # plain cut instead of returning something longer than ``limit``.
        return text[: max(0, limit)]
    return text[:room] + marker
119
-
120
-
121
def redact_string(value: str) -> tuple[str, int]:
    """Mask every ``SECRET_RE`` match found in *value*.

    When a match begins with a sensitive ``key=`` or ``key:`` prefix, the key
    is kept and only the value is replaced (``key=[REDACTED]`` or
    ``key: [REDACTED]``). Any other match is replaced by ``[REDACTED]``.

    Returns:
        A ``(redacted_text, substitution_count)`` tuple.
    """

    def repl(match: re.Match[str]) -> str:
        text = match.group(0)
        # Look for the key only at the very start of the match and stop at the
        # FIRST separator. Splitting on the first "=" anywhere in the match
        # treated secrets containing "=" as part of the key and echoed them
        # back, e.g. "Authorization: Basic <base64>==", "password: a=b", or a
        # private-key body with "=" padding.
        prefix = re.match(r"([?&]?[A-Za-z0-9_-]+\s*)([:=])", text)
        if prefix is not None and SENSITIVE_KEY_RE.search(prefix.group(1)):
            key, separator = prefix.groups()
            if separator == "=":
                return key + "=[REDACTED]"
            return key + ": [REDACTED]"
        return "[REDACTED]"

    return SECRET_RE.subn(repl, value)
135
-
136
-
137
def redact_whole_value(value: Any) -> tuple[Any, int]:
    """Replace every leaf of *value* with ``"[REDACTED]"``, keeping its shape.

    Dicts and lists are walked recursively; dict keys are stringified and run
    through ``redact_string``. Every other value counts as one redacted leaf.

    Returns:
        ``(redacted_structure, redaction_count)``.
    """
    if isinstance(value, dict):
        masked_map: dict[str, Any] = {}
        total = 0
        for raw_key, raw_item in value.items():
            masked_key, key_hits = redact_string(str(raw_key))
            masked_item, item_hits = redact_whole_value(raw_item)
            masked_map[masked_key] = masked_item
            total += key_hits + item_hits
        return masked_map, total
    if isinstance(value, list):
        pairs = [redact_whole_value(element) for element in value]
        masked_list = [masked for masked, _hits in pairs]
        return masked_list, sum(hits for _masked, hits in pairs)
    return "[REDACTED]", 1
156
-
157
-
158
def sanitize_value(value: Any, *, sensitive_context: bool = False, sensitive_schema_context: bool = False) -> tuple[Any, int]:
    """Return a redacted copy of *value* together with the redaction count.

    Args:
        value: Parsed JSON data.
        sensitive_context: When true, every leaf is redacted outright.
        sensitive_schema_context: True while walking a dict stored under a
            sensitive key; value-bearing keys inside it (``default``,
            ``enum``, ...) are then fully redacted.

    Returns:
        ``(sanitized_value, redaction_count)``.
    """
    if sensitive_context:
        return redact_whole_value(value)
    if isinstance(value, str):
        return redact_string(value)
    if isinstance(value, list):
        cleaned_items: list[Any] = []
        hits = 0
        for element in value:
            cleaned, found = sanitize_value(element, sensitive_schema_context=sensitive_schema_context)
            cleaned_items.append(cleaned)
            hits += found
        return cleaned_items, hits
    if not isinstance(value, dict):
        # Numbers, booleans and None pass through untouched.
        return value, 0

    cleaned_map: dict[str, Any] = {}
    hits = 0
    for key, item in value.items():
        raw_key = str(key)
        safe_key, key_hits = redact_string(raw_key)
        if SENSITIVE_KEY_RE.search(raw_key):
            if isinstance(item, dict):
                # Probably a schema describing a sensitive property: keep its
                # structure but remember the context for value-bearing keys.
                cleaned, item_hits = sanitize_value(item, sensitive_schema_context=True)
            else:
                cleaned, item_hits = sanitize_value(item, sensitive_context=True)
        elif sensitive_schema_context and VALUE_BEARING_KEY_RE.search(raw_key):
            cleaned, item_hits = sanitize_value(item, sensitive_context=True)
        else:
            cleaned, item_hits = sanitize_value(item, sensitive_schema_context=sensitive_schema_context)
        cleaned_map[safe_key] = cleaned
        hits += key_hits + item_hits
    return cleaned_map, hits
191
-
192
-
193
def read_limited_path(path: Path, max_bytes: int) -> str:
    """Read the catalog file at *path*, refusing symlinks and oversized files.

    The file is opened with ``O_NOFOLLOW`` where the platform offers it, must
    be a regular file, and may hold at most *max_bytes* bytes.

    Returns:
        The file content decoded as UTF-8, with undecodable bytes replaced.

    Raises:
        ToolPruneError: On any open failure, wrong file type or size overrun.
    """
    reject_symlink_components(path)
    open_flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
    try:
        descriptor = os.open(str(path), open_flags)
    except OSError as exc:
        fail(f"catalog read failed: {exc}")
    try:
        info = os.fstat(descriptor)
        if not stat.S_ISREG(info.st_mode):
            fail("catalog must be a regular file")
        if info.st_size > max_bytes:
            fail(f"catalog exceeds --max-catalog-bytes: {info.st_size} > {max_bytes}")
        # Ask for one extra byte so growth after the fstat is still detected.
        raw = os.read(descriptor, max_bytes + 1)
    finally:
        os.close(descriptor)
    if len(raw) > max_bytes:
        fail(f"catalog exceeds --max-catalog-bytes: > {max_bytes}")
    return raw.decode("utf-8", errors="replace")
212
-
213
-
214
def read_limited_stdin(max_bytes: int) -> str:
    """Read the catalog from standard input, failing above *max_bytes* bytes.

    Returns:
        The input decoded as UTF-8, with undecodable bytes replaced.
    """
    # One byte beyond the cap is enough to tell "exactly at" from "over".
    raw = sys.stdin.buffer.read(max_bytes + 1)
    if len(raw) <= max_bytes:
        return raw.decode("utf-8", errors="replace")
    fail(f"catalog exceeds --max-catalog-bytes: > {max_bytes}")
219
-
220
-
221
def parse_catalog_text(text: str) -> tuple[Any, int]:
    """Parse catalog JSON and return its sanitized form with a redaction count.

    Args:
        text: Raw catalog text (untrusted input).

    Returns:
        ``(sanitized_catalog, redaction_count)`` as produced by ``sanitize_value``.

    Raises:
        ToolPruneError: If the text is not valid JSON or is nested too deeply
            to be parsed or sanitized.
    """
    try:
        raw = json.loads(text)
    except json.JSONDecodeError as exc:
        fail(f"catalog must be valid JSON: {exc.msg}")
    except RecursionError:
        # Deeply nested input exhausts the interpreter's recursion limit;
        # fail closed instead of leaking a traceback.
        fail("catalog is nested too deeply to parse")
    try:
        return sanitize_value(raw)
    except RecursionError:
        # The sanitizer is recursive as well and can hit the same limit.
        fail("catalog is nested too deeply to sanitize")
227
-
228
-
229
def first_str(mapping: dict[str, Any], keys: tuple[str, ...]) -> str:
    """Return the first non-blank string stored in *mapping* under one of *keys*.

    Keys are tried in order. The value is returned unstripped, and ``""`` is
    returned when no key holds a non-blank string.
    """
    looked_up = (mapping.get(key) for key in keys)
    usable = (item for item in looked_up if isinstance(item, str) and item.strip())
    return next(usable, "")
235
-
236
-
237
def tool_schema_from_dict(raw: dict[str, Any], *, fallback_name: str | None = None, server: str | None = None, index: int = 0) -> Candidate | None:
    """Build a ``Candidate`` from one raw tool mapping.

    The name comes from the first usable of ``name``/``tool``/``id``/``title``
    or from *fallback_name*; the description from
    ``description``/``summary``/``doc``/``docs``. Both are whitespace-collapsed
    and capped. The stored schema is a shallow copy of *raw* with missing
    ``name``, ``description`` and ``server`` entries filled in.

    Returns:
        The candidate, or ``None`` when no name can be determined.
    """
    label = first_str(raw, ("name", "tool", "id", "title"))
    if not label:
        label = fallback_name or ""
    label = cap_text(label, MAX_LABEL_CHARS)
    if not label:
        return None

    summary = cap_text(first_str(raw, ("description", "summary", "doc", "docs")), MAX_DESCRIPTION_CHARS)

    schema = dict(raw)
    schema.setdefault("name", label)
    if summary:
        schema.setdefault("description", summary)
    if server:
        schema.setdefault("server", server)

    server_label = cap_text(server, MAX_LABEL_CHARS) if server else None
    return Candidate(name=label, server=server_label, description=summary, schema=schema, index=index)
250
-
251
-
252
def normalize_catalog(raw: Any) -> list[Candidate]:
    """Flatten a catalog into a list of candidates in discovery order.

    Recognized shapes: a list of tools; a dict with ``tools``; a dict with a
    ``servers`` list whose entries have ``tools``; a dict with an
    ``mcpServers`` mapping whose entries have ``tools``; and, when none of
    those produced anything, a plain name-to-schema mapping.

    Raises:
        ToolPruneError: If no tool is found.
    """
    found: list[Candidate] = []

    def add_tool(tool: Any, *, server: str | None = None, fallback_name: str | None = None) -> None:
        # A bare string is shorthand for a tool that only has a name.
        entry = {"name": tool} if isinstance(tool, str) else tool
        if not isinstance(entry, dict):
            return
        built = tool_schema_from_dict(entry, fallback_name=fallback_name, server=server, index=len(found))
        if built is not None:
            found.append(built)

    def add_tools(tools: Any, *, server: str | None = None) -> None:
        if isinstance(tools, list):
            for tool in tools:
                add_tool(tool, server=server)
            return
        if not isinstance(tools, dict):
            return
        for tool_name, schema in tools.items():
            if isinstance(schema, dict):
                add_tool(schema, server=server, fallback_name=str(tool_name))
            else:
                add_tool({"name": str(tool_name), "schema": schema}, server=server)

    if isinstance(raw, list):
        add_tools(raw)
    elif isinstance(raw, dict):
        if "tools" in raw:
            add_tools(raw.get("tools"), server=first_str(raw, ("server", "name")) or None)

        servers = raw.get("servers")
        if isinstance(servers, list):
            for server_obj in servers:
                if isinstance(server_obj, dict):
                    add_tools(server_obj.get("tools"), server=first_str(server_obj, ("name", "id", "server")) or None)

        mcp_servers = raw.get("mcpServers")
        if isinstance(mcp_servers, dict):
            for server_name, server_obj in mcp_servers.items():
                if isinstance(server_obj, dict):
                    add_tools(server_obj.get("tools"), server=str(server_name))

        if not found:
            # Simple name-to-schema map.
            for tool_name, schema in raw.items():
                if tool_name in {"tools", "servers", "mcpServers"}:
                    continue
                if isinstance(schema, dict):
                    add_tool(schema, fallback_name=str(tool_name))
                elif isinstance(schema, (str, list)):
                    add_tool({"name": str(tool_name), "schema": schema})

    if not found:
        fail("catalog contains no tools")
    return found
300
-
301
-
302
def terms(text: str) -> set[str]:
    """Return the set of lowercased ``TERM_RE`` tokens in *text* (empty for falsy input)."""
    tokens = TERM_RE.findall(text or "")
    return {token.lower() for token in tokens if token}
304
-
305
-
306
def collect_parameter_text(value: Any, *, depth: int = 0, max_items: int = 500) -> list[str]:
    """Gather key names and descriptive strings from a schema for scoring.

    For each dict entry the key is collected when it is one of the structural
    names below or when its value is a scalar; string values stored under
    ``description``/``title``/``name`` are collected too. Nested dicts and
    lists are walked recursively.

    Args:
        value: Schema fragment to scan.
        depth: Current recursion depth; nothing is collected beyond depth 8.
        max_items: Upper bound on the number of strings returned.
    """
    if depth > 8 or max_items <= 0:
        return []

    structural_keys = {"properties", "parameters", "inputschema", "input_schema", "schema", "description", "title", "name"}
    text_keys = {"description", "title", "name"}
    gathered: list[str] = []

    if isinstance(value, dict):
        for key, item in value.items():
            if len(gathered) >= max_items:
                break
            label = str(key)
            lowered = label.lower()
            if lowered in structural_keys or isinstance(item, (str, int, float, bool)):
                gathered.append(label)
            if lowered in text_keys and isinstance(item, str):
                gathered.append(item)
            remaining = max_items - len(gathered)
            gathered.extend(collect_parameter_text(item, depth=depth + 1, max_items=remaining))
    elif isinstance(value, list):
        for item in value[:max_items]:
            if len(gathered) >= max_items:
                break
            remaining = max_items - len(gathered)
            gathered.extend(collect_parameter_text(item, depth=depth + 1, max_items=remaining))

    return gathered[:max_items]
328
-
329
-
330
def score_candidate(candidate: Candidate, query_terms: set[str]) -> float:
    """Score *candidate* by lexical overlap with *query_terms*.

    A shared term is worth 4.0 in the name, 1.5 in the description and 1.0 in
    the collected parameter text. A query term that appears inside the
    lowercased name as a substring, without being one of the name's own
    terms, adds another 1.0. An empty query scores 0.0.
    """
    if not query_terms:
        return 0.0

    name_terms = terms(candidate.name)
    weighted_bags = (
        (4.0, name_terms),
        (1.5, terms(candidate.description)),
        (1.0, terms(" ".join(collect_parameter_text(candidate.schema)))),
    )
    score = 0.0
    for weight, bag in weighted_bags:
        score += weight * len(query_terms & bag)

    # Light substring bonus for names such as git_status when the query says status.
    lowered_name = candidate.name.lower()
    for term in query_terms:
        if not term or term in name_terms:
            continue
        if term in lowered_name:
            score += 1.0
    return score
346
-
347
-
348
def rank_candidates(candidates: list[Candidate], query: str) -> list[Candidate]:
    """Return scored copies of *candidates*, best first, with 1-based ranks.

    Ordering is by descending score; ties keep catalog order (``index``).
    """
    wanted = terms(query)
    scored = [
        Candidate(
            name=item.name,
            server=item.server,
            description=item.description,
            schema=item.schema,
            index=item.index,
            score=score_candidate(item, wanted),
            rank=0,
        )
        for item in candidates
    ]
    ordered = sorted(scored, key=lambda item: (-item.score, item.index))
    return [
        Candidate(
            name=item.name,
            server=item.server,
            description=item.description,
            schema=item.schema,
            index=item.index,
            score=item.score,
            rank=position,
        )
        for position, item in enumerate(ordered, start=1)
    ]
358
-
359
-
360
def normalized_link_target(parent: Path, raw_target: str) -> Path:
    """Return a symlink target as a lexically normalized path.

    A relative *raw_target* is taken relative to *parent*. Normalization uses
    ``os.path.normpath`` only; the filesystem is not consulted.
    """
    target = Path(raw_target)
    anchored = target if target.is_absolute() else parent / target
    return Path(os.path.normpath(str(anchored)))
365
-
366
-
367
def normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite an allow-listed first-level symlink to its known target.

    If the first component of an absolute *path* is listed in
    ``ALLOWED_FIRST_ABSOLUTE_SYMLINKS``, really is a symlink on disk, and
    points at the expected target, the path is returned with that component
    replaced by the target. In every other case, including any ``OSError``
    while inspecting the link, *path* is returned unchanged.
    """
    parts = path.parts
    if not path.is_absolute() or len(parts) < 2:
        return path

    first = parts[1]
    expected = ALLOWED_FIRST_ABSOLUTE_SYMLINKS.get(first)
    if expected is None:
        return path

    root = Path(path.anchor)
    link = root / first
    try:
        is_link = stat.S_ISLNK(os.lstat(link).st_mode)
        if not is_link:
            return path
        actual = normalized_link_target(root, os.readlink(link))
        if actual != expected:
            return path
    except OSError:
        return path
    return expected.joinpath(*parts[2:])
383
-
384
-
385
def reject_symlink_components(path: Path) -> None:
    """Fail if any existing component of *path* is a symlink.

    The path is walked component by component with ``lstat``. The walk stops
    silently at the first component that does not exist. An intermediate
    component that exists but is not a directory is rejected as well.

    Raises:
        ToolPruneError: For a symlink or a non-directory intermediate component.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    absolute = path.is_absolute()
    walked = Path(path.anchor) if absolute else Path()
    for part in path.parts:
        if absolute and part == path.anchor:
            # The anchor is already the starting point of the walk.
            continue
        walked = walked / part
        try:
            info = os.lstat(walked)
        except FileNotFoundError:
            return
        mode = info.st_mode
        if stat.S_ISLNK(mode):
            fail(f"refusing path with symlink component: {walked}")
        if walked != path and not stat.S_ISDIR(mode):
            fail(f"refusing path through non-directory component: {walked}")
400
-
401
-
402
def ensure_private_dir(path: Path) -> None:
    """Create *path* (and any missing parents) and set its mode to ``0o700``.

    Symlink components are rejected both before and after the directory is
    created.

    Raises:
        ToolPruneError: If a symlink is found or the directory cannot be
            created or chmod-ed.
    """
    target = normalize_allowed_first_absolute_symlink(path)
    reject_symlink_components(target)
    try:
        target.mkdir(parents=True, exist_ok=True)
        # Check again in case a component was swapped while creating it.
        reject_symlink_components(target)
        os.chmod(target, 0o700)
    except OSError as exc:
        fail(f"store directory unavailable: {exc}")
411
-
412
-
413
def write_private_json_atomic(path: Path, data: dict[str, Any], *, max_bytes: int, label: str) -> int:
    """Write *data* as indented JSON to *path* via a temp file and ``os.replace``.

    The temp file is created exclusively with mode ``0o600`` next to *path*,
    fsync-ed on a best-effort basis, then renamed over *path*. It is removed
    if anything fails along the way.

    Args:
        path: Destination file.
        data: JSON-serializable mapping.
        max_bytes: Size cap for the serialized text; checked before writing.
        label: Noun used in error messages.

    Returns:
        The number of UTF-8 bytes written.

    Raises:
        ToolPruneError: If the text exceeds *max_bytes* or the temp file
            cannot be created.
    """
    rendered = json_bytes(data, indent=2) + "\n"
    size = byte_len_text(rendered)
    if size > max_bytes:
        fail(f"{label} exceeds size cap: {size} > {max_bytes}")

    ensure_private_dir(path.parent)
    tmp = path.with_name(path.name + f".tmp-{os.getpid()}-{time.time_ns()}")
    create_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0)
    try:
        descriptor = os.open(str(tmp), create_flags, 0o600)
    except OSError as exc:
        fail(f"{label} write failed: {exc}")

    try:
        with os.fdopen(descriptor, "w", encoding="utf-8", newline="") as stream:
            stream.write(rendered)
            stream.flush()
            try:
                os.fsync(stream.fileno())
            except OSError:
                # Durability is best effort.
                pass
        os.replace(tmp, path)
        try:
            os.chmod(path, 0o600)
        except OSError:
            # The temp file was already created with 0o600.
            pass
    except Exception:
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
    return size
445
-
446
-
447
def read_private_text(path: Path, *, max_bytes: int, label: str) -> tuple[str, int]:
    """Read a stored file as text, refusing symlinks and oversized files.

    Args:
        path: File to read.
        max_bytes: Largest accepted size in bytes.
        label: Noun used in error messages.

    Returns:
        ``(text, byte_count)``; the text is decoded as UTF-8 with replacement.

    Raises:
        ToolPruneError: If the file is a symlink, cannot be opened, is not a
            regular file, or exceeds *max_bytes*.
    """
    if path.is_symlink():
        fail(f"{label} must not be a symlink")
    open_flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
    try:
        descriptor = os.open(str(path), open_flags)
    except OSError as exc:
        fail(f"{label} read failed: {exc}")
    try:
        info = os.fstat(descriptor)
        if not stat.S_ISREG(info.st_mode):
            fail(f"{label} must be a regular file")
        if info.st_size > max_bytes:
            fail(f"{label} exceeds trusted size cap: {info.st_size} > {max_bytes}")
        # Ask for one extra byte so growth after the fstat is still detected.
        raw = os.read(descriptor, max_bytes + 1)
    finally:
        os.close(descriptor)
    byte_count = len(raw)
    if byte_count > max_bytes:
        fail(f"{label} exceeds trusted size cap: > {max_bytes}")
    return raw.decode("utf-8", errors="replace"), byte_count
467
-
468
-
469
def read_private_json(path: Path, *, max_bytes: int, label: str) -> dict[str, Any]:
    """Read a stored JSON object from *path* under the same guards as ``read_private_text``.

    Args:
        path: File to read.
        max_bytes: Largest accepted size in bytes.
        label: Noun used in error messages.

    Returns:
        The parsed JSON object.

    Raises:
        ToolPruneError: If the file fails any ``read_private_text`` check, is
            not valid JSON, is nested too deeply, or is not a JSON object.
    """
    # The symlink/type/size guards live in read_private_text; reuse them
    # instead of keeping a second copy that could drift.
    text, _byte_count = read_private_text(path, max_bytes=max_bytes, label=label)
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as exc:
        fail(f"{label} is malformed JSON: {exc.msg}")
    except RecursionError:
        # A tampered file with extreme nesting must still fail closed.
        fail(f"{label} is malformed JSON: nesting too deep")
    if not isinstance(parsed, dict):
        fail(f"{label} must be a JSON object")
    return parsed
495
-
496
-
497
def display_path(path: Path) -> str:
    """Return *path* for display: relative to the cwd, ``/``-separated, redacted.

    Falls back to the bare file name when ``os.path.relpath`` raises
    ``ValueError``.
    """
    try:
        shown = os.path.relpath(path, Path.cwd())
    except ValueError:
        shown = path.name
    redacted, _hits = redact_string(shown.replace(os.sep, "/"))
    return redacted
505
-
506
-
507
def store_paths(store_dir: str, receipt_id: str) -> tuple[Path, Path, Path]:
    """Return ``(store_root, receipt_path, payload_path)`` for *receipt_id*.

    Raises:
        ToolPruneError: If *receipt_id* is not 16-64 lowercase hex characters.
    """
    if RECEIPT_ID_RE.fullmatch(receipt_id) is None:
        fail("receipt_id must be 16-64 lowercase hex chars")
    expanded = Path(store_dir).expanduser()
    root = normalize_allowed_first_absolute_symlink(expanded)
    receipt_file = root / f"{receipt_id}.receipt.json"
    payload_file = root / f"{receipt_id}.payload.json"
    return root, receipt_file, payload_file
512
-
513
-
514
def build_receipt_id(payload_without_id: dict[str, Any]) -> str:
    """Derive a 20-character hex receipt id.

    The id is the truncated SHA-256 of the compact payload JSON followed by
    the current nanosecond clock and the process id, so repeated runs on the
    same payload produce different ids.
    """
    serialized = json_bytes(payload_without_id)
    basis = serialized + f"\n{time.time_ns()}:{os.getpid()}"
    digest = hashlib.sha256()
    digest.update(basis.encode("utf-8", errors="replace"))
    return digest.hexdigest()[:20]
517
-
518
-
519
def build_payload(receipt_id: str, ranked: list[Candidate], query: str, redactions: int) -> dict[str, Any]:
    """Assemble the full payload document: every ranked tool with its schema.

    Args:
        receipt_id: Id recorded in the payload.
        ranked: Candidates in rank order.
        query: Query text to record.
        redactions: Number of redactions to record.
    """
    tool_entries = []
    for cand in ranked:
        tool_entries.append(
            {
                "name": cand.name,
                "server": cand.server,
                "description": cand.description,
                "score": cand.score,
                "rank": cand.rank,
                "schema_bytes": byte_len_json(cand.schema),
                "schema": cand.schema,
            }
        )

    payload: dict[str, Any] = {
        "tool": TOOL_NAME,
        "schema_version": SCHEMA_VERSION,
        "receipt_id": receipt_id,
        "created_at_unix": int(time.time()),
        "query": query,
        "candidate_count": len(ranked),
        "redaction": {"redacted_values": redactions},
        "tools": tool_entries,
    }
    return payload
541
-
542
-
543
def compact_omitted(candidates: list[Candidate], limit: int) -> tuple[list[dict[str, Any]], int]:
    """Summarize the first *limit* omitted candidates without their schemas.

    Returns:
        ``(summaries, not_listed)`` where *not_listed* is how many candidates
        did not fit in the summary list.
    """
    summaries: list[dict[str, Any]] = [
        {
            "name": cap_text(cand.name, MAX_LABEL_CHARS),
            "server": cap_text(cand.server, MAX_LABEL_CHARS) if cand.server else None,
            "reason": "below_top_k",
            "score": cand.score,
            "rank": cand.rank,
        }
        for cand in candidates[:limit]
    ]
    not_listed = max(0, len(candidates) - len(summaries))
    return summaries, not_listed
554
-
555
-
556
def retrieval_command(receipt_id: str, *, store_dir: str, tool_name: str | None = None) -> str:
    """Build the shell command line that retrieves a stored schema.

    ``--store-dir`` is added only when *store_dir* differs from the default,
    and ``--tool`` only when *tool_name* is given; both values are
    shell-quoted.
    """
    argv = ["context-guard-tool-prune", "get", receipt_id]
    if store_dir != DEFAULT_STORE_DIR:
        argv += ["--store-dir", shlex.quote(store_dir)]
    if tool_name is not None:
        argv += ["--tool", shlex.quote(tool_name)]
    argv.append("--json")
    return " ".join(argv)
564
-
565
-
566
def selected_tool_record(cand: Candidate, receipt_id: str, budget_left: int, *, store_dir: str) -> tuple[dict[str, Any], int]:
    """Build the report entry for one selected tool.

    The schema is inlined only when its compact JSON size fits within
    *budget_left*; otherwise the entry says it was omitted for budget reasons.

    Returns:
        ``(record, schema_bytes_consumed)``; the second item is 0 when the
        schema was left out.
    """
    schema_size = byte_len_json(cand.schema)
    fits = schema_size <= budget_left
    record: dict[str, Any] = {
        "name": cand.name,
        "server": cand.server,
        "score": cand.score,
        "rank": cand.rank,
        "description": cand.description,
        "schema_bytes": schema_size,
        "retrieval": retrieval_command(receipt_id, store_dir=store_dir, tool_name=cand.name),
        "schema_included": fits,
    }
    if fits:
        record["schema"] = cand.schema
        return record, schema_size
    record["schema_omitted_reason"] = "budget"
    return record, 0
584
-
585
-
586
def shrink_result_for_output(result: dict[str, Any], max_output_bytes: int) -> str:
    """Render *result* as indented JSON that fits within *max_output_bytes*.

    Shrinking works on a deep copy, in stages: halve the ``omitted_tools``
    list repeatedly, then empty it and drop the ``description`` of every
    selected tool. The caller's dict is not modified.

    Raises:
        ToolPruneError: If the report still does not fit.
    """

    def render(document: dict[str, Any]) -> str:
        return json_bytes(document, indent=2) + "\n"

    def fits(text: str) -> bool:
        return byte_len_text(text) <= max_output_bytes

    rendered = render(result)
    if fits(rendered):
        return rendered

    # A JSON round-trip gives an independent copy that is safe to mutate.
    result = json.loads(json_bytes(result))
    omitted = result.get("omitted_tools")
    while isinstance(omitted, list) and omitted:
        result["omitted_tools"] = omitted[: len(omitted) // 2]
        result["omitted_tools_truncated"] = True
        result["omitted_tools_summary"] = f"{result.get('omitted_count', 0)} tools omitted; list capped to fit --max-output-bytes"
        rendered = render(result)
        if fits(rendered):
            return rendered
        omitted = result.get("omitted_tools")

    result["omitted_tools"] = []
    result["omitted_tools_truncated"] = True
    for entry in result.get("selected_tools", []):
        if isinstance(entry, dict):
            entry.pop("description", None)
    rendered = render(result)
    if fits(rendered):
        return rendered
    fail(f"select report exceeds --max-output-bytes: {byte_len_text(rendered)} > {max_output_bytes}")
612
-
613
-
614
def select_catalog(args: argparse.Namespace) -> str:
    """Run ``select``: rank the catalog, store payload and receipt, return the report.

    Steps: validate the numeric options, read and sanitize the catalog (file
    or stdin), rank the tools against the query, build the payload and
    receipt, render the size-bounded report, and only then write the payload
    and receipt files.

    Returns:
        The rendered JSON report.

    Raises:
        ToolPruneError: On invalid options, unreadable or invalid catalog, or
            any exceeded size cap.
    """
    # Option validation, in the same order as the options are reported on error.
    max_catalog_bytes = bounded_int(args.max_catalog_bytes, default=DEFAULT_MAX_CATALOG_BYTES, minimum=1, maximum=100_000_000, name="--max-catalog-bytes")
    max_output_bytes = bounded_int(args.max_output_bytes, default=DEFAULT_MAX_OUTPUT_BYTES, minimum=1, maximum=10_000_000, name="--max-output-bytes")
    max_payload_bytes = bounded_int(args.max_payload_bytes, default=DEFAULT_MAX_PAYLOAD_BYTES, minimum=1, maximum=100_000_000, name="--max-payload-bytes")
    max_receipt_bytes = bounded_int(args.max_receipt_bytes, default=DEFAULT_MAX_RECEIPT_BYTES, minimum=1, maximum=10_000_000, name="--max-receipt-bytes")
    top = bounded_int(args.top, default=DEFAULT_TOP, minimum=1, maximum=MAX_TOP, name="--top")
    budget_bytes = bounded_int(args.budget_bytes, default=DEFAULT_BUDGET_BYTES, minimum=0, maximum=100_000_000, name="--budget-bytes")

    # Load and sanitize the catalog, then rank it.
    if args.catalog:
        text = read_limited_path(Path(args.catalog), max_catalog_bytes)
    else:
        text = read_limited_stdin(max_catalog_bytes)
    raw, redactions = parse_catalog_text(text)
    raw_query = args.query or ""
    safe_query, query_redactions = redact_string(raw_query)
    total_redactions = redactions + query_redactions
    # Ranking sees the raw query; only the redacted query is ever stored.
    ranked = rank_candidates(normalize_catalog(raw), raw_query)

    # Build the payload twice: once with a placeholder id to derive the real id.
    payload_without_id = build_payload("pending", ranked, safe_query, total_redactions)
    receipt_id = build_receipt_id(payload_without_id)
    payload = build_payload(receipt_id, ranked, safe_query, total_redactions)
    payload_text = json_bytes(payload, indent=2) + "\n"
    payload_bytes = byte_len_text(payload_text)
    if payload_bytes > max_payload_bytes:
        fail(f"payload exceeds --max-payload-bytes: {payload_bytes} > {max_payload_bytes}")
    # The digest covers the text without its trailing newline.
    payload_sha = sha256_text(payload_text.rstrip("\n"))

    store_dir, receipt_path, payload_path = store_paths(args.store_dir, receipt_id)
    receipt = {
        "tool": TOOL_NAME,
        "schema_version": SCHEMA_VERSION,
        "receipt_id": receipt_id,
        "created_at_unix": int(time.time()),
        "path": display_path(receipt_path),
        "payload_path": display_path(payload_path),
        "payload_sha256": payload_sha,
        "payload_bytes": payload_bytes,
        "contains": "compact_metadata_plus_sanitized_payload",
        "tool_count": len(ranked),
        "tools": [cand.name for cand in ranked[:50]],
        "tools_truncated": len(ranked) > 50,
        "retrieval_hint": retrieval_command(receipt_id, store_dir=args.store_dir, tool_name="<name>"),
    }
    receipt_size = byte_len_text(json_bytes(receipt, indent=2) + "\n")
    if receipt_size > max_receipt_bytes:
        fail(f"receipt exceeds --max-receipt-bytes: {receipt_size} > {max_receipt_bytes}")

    # Inline schemas for the top-k tools while the byte budget lasts.
    selected: list[dict[str, Any]] = []
    selected_schema_bytes = 0
    for cand in ranked[:top]:
        remaining_budget = budget_bytes - selected_schema_bytes
        record, used = selected_tool_record(cand, receipt_id, remaining_budget, store_dir=args.store_dir)
        selected_schema_bytes += used
        selected.append(record)

    below_top = ranked[top:]
    omitted_tools, omitted_truncated = compact_omitted(below_top, MAX_OMITTED_TOOLS)
    result = {
        "tool": TOOL_NAME,
        "schema_version": SCHEMA_VERSION,
        "mode": "select",
        "query": safe_query,
        "top": top,
        "budget_bytes": budget_bytes,
        "selected_schema_bytes": selected_schema_bytes,
        "candidate_count": len(ranked),
        "selected_tools": selected,
        "omitted_tools": omitted_tools,
        "omitted_count": len(below_top),
        "omitted_tools_truncated_count": omitted_truncated,
        "receipt": {
            **receipt,
            "bytes": receipt_size,
        },
        "token_proxy": {"measurement": "estimated", "chars_per_token": TOKEN_PROXY_CHARS_PER_TOKEN},
        "caveats": [
            "Ranking is heuristic lexical overlap, not a correctness proof.",
            "Token counts are estimated proxies; byte counts and schema budgets are observed UTF-8 bytes.",
            "Use the receipt get command to retrieve full sanitized schemas before relying on omitted details.",
        ],
        "redaction": {"redacted_values": total_redactions},
    }
    rendered = shrink_result_for_output(result, max_output_bytes)

    # Only write after every size gate has passed, so failures leave no success receipt.
    ensure_private_dir(store_dir)
    written_payload_bytes = write_private_json_atomic(payload_path, payload, max_bytes=max_payload_bytes, label="payload")
    if written_payload_bytes != payload_bytes:
        fail("payload byte size changed during write")
    written_receipt_bytes = write_private_json_atomic(receipt_path, receipt, max_bytes=max_receipt_bytes, label="receipt")
    if written_receipt_bytes != receipt_size:
        fail("receipt byte size changed during write")
    return rendered
700
-
701
-
702
def payload_path_from_receipt(store_dir: Path, receipt_id: str, receipt: dict[str, Any]) -> Path:
    """Return the payload file location for ``receipt_id`` inside ``store_dir``.

    The ``payload_path`` recorded in the receipt is only validated, never
    followed: the returned path is always ``store_dir / "<receipt_id>.payload.json"``.

    Args:
        store_dir: Directory that holds receipts and payloads.
        receipt_id: Identifier the payload file name is derived from.
        receipt: Parsed receipt; its optional ``payload_path`` entry is checked.

    Returns:
        ``store_dir`` joined with the canonical payload file name.

    A recorded path that is absolute, or whose final component differs from the
    canonical file name, is reported through ``fail``.
    """
    canonical_name = f"{receipt_id}.payload.json"
    recorded = str(receipt.get("payload_path") or "")
    if not recorded:
        # Nothing recorded (missing, None or empty): nothing to cross-check.
        return store_dir / canonical_name

    recorded_path = Path(recorded)
    if recorded_path.is_absolute():
        fail("receipt payload_path must be relative")
    if recorded_path.name != canonical_name:
        fail("receipt payload_path does not match receipt_id")
    return store_dir / canonical_name
712
-
713
-
714
def get_schema(args: argparse.Namespace) -> str:
    """Render the JSON report for the ``get`` subcommand.

    Reads the receipt for ``args.receipt_id`` from ``args.store_dir``, checks
    the payload file against the byte size and SHA-256 recorded in that
    receipt, and then reports either the names of all stored tools (when
    ``args.tool`` is empty) or the stored server and schema of the named tool.

    Args:
        args: Parsed ``get`` arguments; reads ``receipt_id``, ``tool``,
            ``store_dir``, ``max_payload_bytes``, ``max_receipt_bytes`` and
            ``max_output_bytes``.

    Returns:
        Indented JSON text terminated by a newline.

    Every validation failure is reported through ``fail`` (presumably raising
    ``ToolPruneError``, which ``main`` catches -- confirm against its definition).
    """
    payload_cap = bounded_int(args.max_payload_bytes, default=DEFAULT_MAX_PAYLOAD_BYTES, minimum=1, maximum=100_000_000, name="--max-payload-bytes")
    receipt_cap = bounded_int(args.max_receipt_bytes, default=DEFAULT_MAX_RECEIPT_BYTES, minimum=1, maximum=10_000_000, name="--max-receipt-bytes")
    output_cap = bounded_int(args.max_output_bytes, default=10_000_000, minimum=1, maximum=100_000_000, name="--max-output-bytes")

    receipt_id = args.receipt_id
    if not RECEIPT_ID_RE.fullmatch(receipt_id):
        fail("receipt_id must be 16-64 lowercase hex chars")

    # The payload path from store_paths is ignored; it is re-derived from the
    # receipt contents below.
    store_dir, receipt_path, _unused_payload_path = store_paths(args.store_dir, receipt_id)
    reject_symlink_components(receipt_path)
    receipt = read_private_json(receipt_path, max_bytes=receipt_cap, label="receipt")
    if receipt.get("receipt_id") != receipt_id:
        fail("receipt id mismatch")

    payload_path = payload_path_from_receipt(store_dir, receipt_id, receipt)
    reject_symlink_components(payload_path)

    # Validate the integrity metadata before touching the payload file.
    recorded_size = receipt.get("payload_bytes")
    recorded_digest = receipt.get("payload_sha256")
    if not (isinstance(recorded_size, int) and recorded_size >= 0):
        fail("receipt missing payload byte size")
    if recorded_size > payload_cap:
        fail(f"payload exceeds trusted size cap: {recorded_size} > {payload_cap}")
    if not (isinstance(recorded_digest, str) and re.fullmatch(r"[a-f0-9]{64}", recorded_digest)):
        fail("receipt missing payload sha256")

    payload_text, observed_size = read_private_text(payload_path, max_bytes=payload_cap, label="payload")
    if observed_size != recorded_size:
        fail(f"payload size mismatch: {observed_size} != {recorded_size}")
    # The digest is taken over the text with trailing newlines stripped.
    if sha256_text(payload_text.rstrip("\n")) != recorded_digest:
        fail("payload sha256 mismatch")

    try:
        payload = json.loads(payload_text)
    except json.JSONDecodeError as exc:
        fail(f"payload is malformed JSON: {exc.msg}")
    if not isinstance(payload, dict):
        fail("payload must be a JSON object")
    if payload.get("receipt_id") != receipt_id:
        fail("payload receipt id mismatch")
    tools = payload.get("tools")
    if not isinstance(tools, list):
        fail("payload tools missing")

    # Both report shapes share this header; key order is part of the output.
    report = {
        "tool": TOOL_NAME,
        "schema_version": SCHEMA_VERSION,
        "mode": "get",
        "receipt_id": receipt_id,
    }
    if args.tool:
        # First dict entry whose name matches wins; non-dict entries are skipped.
        match = next(
            (entry for entry in tools if isinstance(entry, dict) and entry.get("name") == args.tool),
            None,
        )
        if match is None:
            # The requested name is redacted before it is echoed in the error.
            safe_tool, _tool_redactions = redact_string(args.tool)
            fail(f"tool not found in receipt: {safe_tool}")
        report["tool_name"] = args.tool
        report["server"] = match.get("server")
        report["schema"] = match.get("schema")
    else:
        report["tools"] = [entry.get("name") for entry in tools if isinstance(entry, dict)]

    sanitized, _redaction_count = sanitize_value(report)
    if not isinstance(sanitized, dict):
        fail("get result sanitation failed")
    rendered = json_bytes(sanitized, indent=2) + "\n"
    rendered_size = byte_len_text(rendered)
    if rendered_size > output_cap:
        fail(f"get report exceeds --max-output-bytes: {rendered_size} > {output_cap}")
    return rendered
788
-
789
-
790
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with its ``select`` and ``get`` subcommands.

    Returns:
        A parser that requires a subcommand and stores its name in ``command``.
        Numeric limits are registered without a ``type``, so values given on
        the command line arrive as strings while the defaults keep their
        original type.
    """
    parser = argparse.ArgumentParser(description="Select bounded top-k tool/MCP schemas with local full-schema fallback receipts.")
    commands = parser.add_subparsers(dest="command", required=True)

    # Help texts shared verbatim by both subcommands.
    store_dir_help = f"receipt/payload directory (default: {DEFAULT_STORE_DIR})"
    json_help = "emit JSON (default and only stable output contract)"

    select_cmd = commands.add_parser("select", help="rank a local catalog and emit a bounded selection report")
    select_cmd.add_argument("--catalog", help="catalog JSON path; stdin is used when omitted")
    select_cmd.add_argument("--query", default="", help="task query used for lexical ranking")
    # Options whose help text is "<summary> (default: <value>)", in display order.
    select_limits = (
        ("--top", DEFAULT_TOP, "number of tools to select"),
        ("--budget-bytes", DEFAULT_BUDGET_BYTES, "inline selected schema byte budget"),
        ("--max-catalog-bytes", DEFAULT_MAX_CATALOG_BYTES, "maximum catalog JSON bytes"),
        ("--max-output-bytes", DEFAULT_MAX_OUTPUT_BYTES, "maximum rendered select JSON bytes"),
        ("--max-payload-bytes", DEFAULT_MAX_PAYLOAD_BYTES, "maximum sanitized payload bytes"),
        ("--max-receipt-bytes", DEFAULT_MAX_RECEIPT_BYTES, "maximum compact receipt bytes"),
    )
    for flag, default_value, summary in select_limits:
        select_cmd.add_argument(flag, default=default_value, help=f"{summary} (default: {default_value})")
    select_cmd.add_argument("--store-dir", default=DEFAULT_STORE_DIR, help=store_dir_help)
    select_cmd.add_argument("--json", action="store_true", help=json_help)

    get_cmd = commands.add_parser("get", help="retrieve a full sanitized schema from a receipt payload")
    get_cmd.add_argument("receipt_id", help="receipt id returned by select")
    get_cmd.add_argument("--tool", help="tool name to retrieve; omit to list available names")
    get_cmd.add_argument("--store-dir", default=DEFAULT_STORE_DIR, help=store_dir_help)
    get_cmd.add_argument("--max-output-bytes", default=10_000_000, help="maximum rendered get JSON bytes")
    get_limits = (
        ("--max-payload-bytes", DEFAULT_MAX_PAYLOAD_BYTES, "maximum trusted payload bytes"),
        ("--max-receipt-bytes", DEFAULT_MAX_RECEIPT_BYTES, "maximum trusted receipt bytes"),
    )
    for flag, default_value, summary in get_limits:
        get_cmd.add_argument(flag, default=default_value, help=f"{summary} (default: {default_value})")
    get_cmd.add_argument("--json", action="store_true", help=json_help)
    return parser
815
-
816
-
817
def main(argv: list[str] | None = None) -> int:
    """Run the command-line interface and return a process exit status.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` when the report was written, ``1`` on a ``ToolPruneError`` or a
        closed output pipe, ``2`` when no known subcommand was selected.
    """
    import os  # local import: only needed on the broken-pipe path

    parser = build_parser()
    args = parser.parse_args(argv)
    try:
        if args.command == "select":
            sys.stdout.write(select_catalog(args))
        elif args.command == "get":
            sys.stdout.write(get_schema(args))
        else:
            parser.print_help(sys.stderr)
            return 2
        # stdout is block-buffered when piped, so write() alone rarely raises.
        # Flushing here surfaces a closed pipe inside this try block instead of
        # at interpreter shutdown, where it cannot be handled.
        sys.stdout.flush()
        return 0
    except ToolPruneError as exc:
        print(f"{TOOL_NAME}: {exc}", file=sys.stderr)
        return 1
    except BrokenPipeError:
        # Python flushes the standard streams again on exit; point stdout at
        # devnull so that flush cannot raise a second BrokenPipeError.
        try:
            devnull_fd = os.open(os.devnull, os.O_WRONLY)
            try:
                os.dup2(devnull_fd, sys.stdout.fileno())
            finally:
                os.close(devnull_fd)
        except (OSError, ValueError):
            # stdout has no usable file descriptor (e.g. replaced by an
            # in-memory stream); there is nothing left to redirect.
            pass
        return 1
834
-
835
-
836
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())