@ictechgy/context-guard 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +49 -0
- package/LICENSE +201 -0
- package/NOTICE +4 -0
- package/README.ko.md +353 -0
- package/README.md +353 -0
- package/context-guard-kit/README.md +76 -0
- package/context-guard-kit/benchmark_runner.py +1898 -0
- package/context-guard-kit/claude_transcript_cost_audit.py +1591 -0
- package/context-guard-kit/context_compress.py +543 -0
- package/context-guard-kit/context_escrow.py +919 -0
- package/context-guard-kit/context_guard_cli.py +149 -0
- package/context-guard-kit/context_guard_diet.py +1036 -0
- package/context-guard-kit/context_pack.py +929 -0
- package/context-guard-kit/failed_attempt_nudge.py +567 -0
- package/context-guard-kit/guard_large_read.py +690 -0
- package/context-guard-kit/hook_secret_patterns.py +43 -0
- package/context-guard-kit/read_symbol.py +483 -0
- package/context-guard-kit/rewrite_bash_for_token_budget.py +501 -0
- package/context-guard-kit/sanitize_output.py +725 -0
- package/context-guard-kit/settings.example.json +67 -0
- package/context-guard-kit/setup_wizard.py +1724 -0
- package/context-guard-kit/statusline.sh +362 -0
- package/context-guard-kit/statusline_merged.sh +157 -0
- package/context-guard-kit/tool_schema_pruner.py +837 -0
- package/context-guard-kit/trim_command_output.py +1098 -0
- package/docs/distribution.md +55 -0
- package/package.json +70 -0
- package/packaging/homebrew/context-guard.rb.template +34 -0
- package/plugins/context-guard/.claude-plugin/plugin.json +41 -0
- package/plugins/context-guard/LICENSE +201 -0
- package/plugins/context-guard/NOTICE +4 -0
- package/plugins/context-guard/README.ko.md +135 -0
- package/plugins/context-guard/README.md +135 -0
- package/plugins/context-guard/bin/claude-read-symbol +6 -0
- package/plugins/context-guard/bin/claude-sanitize-output +6 -0
- package/plugins/context-guard/bin/claude-token-artifact +6 -0
- package/plugins/context-guard/bin/claude-token-audit +6 -0
- package/plugins/context-guard/bin/claude-token-bench +6 -0
- package/plugins/context-guard/bin/claude-token-diet +6 -0
- package/plugins/context-guard/bin/claude-token-failed-nudge +6 -0
- package/plugins/context-guard/bin/claude-token-guard-read +6 -0
- package/plugins/context-guard/bin/claude-token-rewrite-bash +6 -0
- package/plugins/context-guard/bin/claude-token-setup +6 -0
- package/plugins/context-guard/bin/claude-token-statusline +6 -0
- package/plugins/context-guard/bin/claude-token-statusline-merged +6 -0
- package/plugins/context-guard/bin/claude-trim-output +6 -0
- package/plugins/context-guard/bin/context-guard +149 -0
- package/plugins/context-guard/bin/context-guard-artifact +919 -0
- package/plugins/context-guard/bin/context-guard-audit +1591 -0
- package/plugins/context-guard/bin/context-guard-bench +1898 -0
- package/plugins/context-guard/bin/context-guard-compress +543 -0
- package/plugins/context-guard/bin/context-guard-diet +1036 -0
- package/plugins/context-guard/bin/context-guard-failed-nudge +567 -0
- package/plugins/context-guard/bin/context-guard-guard-read +690 -0
- package/plugins/context-guard/bin/context-guard-pack +929 -0
- package/plugins/context-guard/bin/context-guard-read-symbol +483 -0
- package/plugins/context-guard/bin/context-guard-rewrite-bash +501 -0
- package/plugins/context-guard/bin/context-guard-sanitize-output +725 -0
- package/plugins/context-guard/bin/context-guard-setup +1724 -0
- package/plugins/context-guard/bin/context-guard-statusline +362 -0
- package/plugins/context-guard/bin/context-guard-statusline-merged +157 -0
- package/plugins/context-guard/bin/context-guard-tool-prune +837 -0
- package/plugins/context-guard/bin/context-guard-trim-output +1098 -0
- package/plugins/context-guard/brief/README.md +65 -0
- package/plugins/context-guard/brief/brief-mode.lite.md +29 -0
- package/plugins/context-guard/brief/brief-mode.standard.md +31 -0
- package/plugins/context-guard/brief/brief-mode.ultra.md +32 -0
- package/plugins/context-guard/lib/hook_secret_patterns.py +43 -0
- package/plugins/context-guard/skills/audit/SKILL.md +39 -0
- package/plugins/context-guard/skills/optimize/SKILL.md +48 -0
- package/plugins/context-guard/skills/setup/SKILL.md +40 -0
|
@@ -0,0 +1,837 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Select a bounded top-k subset from a local tool/MCP schema catalog.
|
|
3
|
+
|
|
4
|
+
The helper is advisory only: it never edits MCP config or an agent's tool
|
|
5
|
+
registry. It writes a compact receipt plus a separate sanitized payload so an
|
|
6
|
+
agent can inject a small selection report first and recover the full sanitized
|
|
7
|
+
schema later when needed.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import hashlib
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import shlex
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
import re
|
|
18
|
+
import stat
|
|
19
|
+
import sys
|
|
20
|
+
import time
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from typing import Any, NoReturn
|
|
23
|
+
|
|
24
|
+
# Identity strings written into every stored payload (see build_payload).
TOOL_NAME = "context-guard-tool-prune"
SCHEMA_VERSION = "contextguard.tool-prune.v1"
# Store directory assumed by retrieval_command when no --store-dir is printed.
DEFAULT_STORE_DIR = ".context-guard/tool-prune"
# Defaults handed to bounded_int for the matching command-line options.
DEFAULT_TOP = 5
DEFAULT_BUDGET_BYTES = 12_000
DEFAULT_MAX_CATALOG_BYTES = 1_000_000
DEFAULT_MAX_OUTPUT_BYTES = 65_536
DEFAULT_MAX_PAYLOAD_BYTES = 1_048_576
DEFAULT_MAX_RECEIPT_BYTES = 16_384
# Largest value accepted for --top.
MAX_TOP = 200
# Character caps applied through cap_text to names/servers and descriptions.
MAX_LABEL_CHARS = 160
MAX_DESCRIPTION_CHARS = 360
# NOTE(review): the two constants below are not referenced in the part of the
# file visible here; presumably a cap on listed omitted tools and a
# chars-per-token estimate -- confirm against their users.
MAX_OMITTED_TOOLS = 30
TOKEN_PROXY_CHARS_PER_TOKEN = 4
# First component of an absolute path -> the only symlink target that
# normalize_allowed_first_absolute_symlink will rewrite it to.
ALLOWED_FIRST_ABSOLUTE_SYMLINKS = {
    "tmp": Path("/private/tmp"),
    "var": Path("/private/var"),
}
# Receipt ids must be 16-64 lowercase hex characters (checked in store_paths).
RECEIPT_ID_RE = re.compile(r"^[a-f0-9]{16,64}$")
# Tokenizer used by terms(); underscores stay inside a term.
TERM_RE = re.compile(r"[A-Za-z0-9_]+")
# Secret-looking substrings replaced by redact_string. The alternatives are,
# in order: PEM/PGP private key blocks, several fixed-prefix token formats,
# Authorization headers, sensitive URL query parameters, and
# ``key: value`` / ``key=value`` pairs with a sensitive key name.
# The leading (?is) makes the whole pattern case-insensitive and lets ``.``
# span newlines.
SECRET_RE = re.compile(
    r"(?is)("
    r"-----BEGIN (?:[A-Z0-9 ]*PRIVATE KEY|PGP PRIVATE KEY BLOCK)-----.*?-----END (?:[A-Z0-9 ]*PRIVATE KEY|PGP PRIVATE KEY BLOCK)-----|"
    r"AKIA[0-9A-Z]{16}|"
    r"gh[pousr]_[A-Za-z0-9_]{20,}|"
    r"github_pat_[A-Za-z0-9_]{20,}|"
    r"glpat-[A-Za-z0-9_-]{12,}|"
    r"xox[abprs]-[A-Za-z0-9-]{10,}|"
    r"sk-(?:ant|proj)-[A-Za-z0-9_-]{8,}|"
    r"sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
    r"AIza[0-9A-Za-z_\-]{20,}|"
    r"(?i:Authorization)\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+|"
    r"[?&](?:X-Amz-Signature|X-Amz-Credential|X-Amz-Security-Token|AWSAccessKeyId|Signature|sig|access_token|refresh_token|id_token|auth|authorization|api[_-]?key|apikey|token|secret|password|client[_-]?secret|private[_-]?key|privatekey|pgp[_-]?private[_-]?key|pgpprivatekey|ssh[_-]?key|sshkey|(?:aws[_-]?)?access[_-]?key(?:[_-]?id)?|awsaccesskeyid)=[^&#\s,}\]]+|"
    r"(?<![A-Za-z0-9])(?:api[_-]?key|apikey|token|secret|password|client[_-]?secret|authorization|credential|signature|sig|private[_-]?key|privatekey|pgp[_-]?private[_-]?key|pgpprivatekey|ssh[_-]?key|sshkey|(?:aws[_-]?)?access[_-]?key(?:[_-]?id)?|awsaccesskeyid)\s*[:=]\s*[^\s,}\]]+"
    r")"
)
# Key names treated as sensitive. The pattern is unanchored and callers use
# ``search``, so it also matches when one of these words appears inside a
# longer key.
SENSITIVE_KEY_RE = re.compile(
    r"(?i)(authorization|api[_-]?key|apikey|token|secret|password|passwd|pwd|client[_-]?secret|credential|signature|sig|x-amz-signature|x-amz-credential|awsaccesskeyid|(?:aws[_-]?)?access[_-]?key(?:[_-]?id)?|private[_-]?key|privatekey|pgp[_-]?private[_-]?key|pgpprivatekey|ssh[_-]?key|sshkey)"
)
# Schema keys whose values sanitize_value redacts entirely when they sit under
# a sensitive key that maps to a dict.
VALUE_BEARING_KEY_RE = re.compile(r"(?i)^(default|const|enum|example|examples|value|values)$")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ToolPruneError(ValueError):
    """User-facing fail-closed error.

    Raised by ``fail`` with a message meant to be shown to the user.
    """
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass(frozen=True)
class Candidate:
    """Immutable record for one tool found in a catalog.

    ``tool_schema_from_dict`` creates instances with the default ``score`` and
    ``rank``; ``rank_candidates`` builds new instances with both filled in.
    """

    # Tool label, whitespace-collapsed and length-capped by cap_text.
    name: str
    # Owning server label, or None when the catalog supplied none.
    server: str | None
    # Length-capped description text; may be empty.
    description: str
    # Shallow copy of the catalog entry with name/description/server defaults added.
    schema: dict[str, Any]
    # Position in discovery order; used as the tie-breaker when sorting by score.
    index: int
    # Relevance score computed by score_candidate.
    score: float = 0.0
    # 1-based position after sorting; stays 0 until rank_candidates assigns it.
    rank: int = 0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def fail(message: str) -> NoReturn:
    """Abort the current operation with a user-facing error.

    Args:
        message: Explanation carried by the raised exception.

    Raises:
        ToolPruneError: Always.
    """
    error = ToolPruneError(message)
    raise error
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def byte_len_text(text: str) -> int:
    """Return how many bytes ``text`` occupies when encoded as UTF-8.

    Characters that cannot be encoded (such as lone surrogates) are counted
    as their ``errors="replace"`` substitute instead of raising.
    """
    encoded = text.encode("utf-8", errors="replace")
    return len(encoded)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def json_bytes(data: Any, *, indent: int | None = None) -> str:
    """Serialize ``data`` to deterministic JSON text.

    Keys are sorted and non-ASCII characters are kept as-is. Despite the
    name, the result is a ``str``.

    Args:
        data: JSON-serializable value.
        indent: ``None`` for the compact form (``,`` / ``:`` separators), or
            an indent width for pretty output using json's default separators.

    Returns:
        The JSON document as text.
    """
    separators = (",", ":") if indent is None else None
    return json.dumps(
        data,
        ensure_ascii=False,
        sort_keys=True,
        separators=separators,
        indent=indent,
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def byte_len_json(data: Any) -> int:
    """Return the UTF-8 byte size of ``data`` in compact ``json_bytes`` form."""
    compact = json_bytes(data)
    return byte_len_text(compact)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def sha256_text(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8.

    Unencodable characters are substituted (``errors="replace"``) rather than
    raising.
    """
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8", errors="replace"))
    return digest.hexdigest()
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def bounded_int(value: object, *, default: int, minimum: int, maximum: int, name: str) -> int:
    """Convert ``value`` to an int and enforce an inclusive range.

    Args:
        value: Raw option value. ``None`` means "not supplied" and selects
            ``default``.
        default: Value used when ``value`` is ``None``; it is range-checked
            like any other input.
        minimum: Smallest accepted value (inclusive).
        maximum: Largest accepted value (inclusive).
        name: Option name used in error messages.

    Returns:
        The validated integer.

    Raises:
        ToolPruneError: If the value is not integer-like or is out of range.
    """
    # ``default`` used to be accepted but ignored, so a missing value failed
    # with "must be an integer" instead of falling back.
    if value is None:
        value = default
    try:
        number = int(value)
    except (TypeError, ValueError, OverflowError):
        fail(f"{name} must be an integer")
    if number < minimum:
        fail(f"{name} must be >= {minimum}")
    if number > maximum:
        fail(f"{name} must be <= {maximum}")
    return number
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def cap_text(value: object, limit: int = MAX_LABEL_CHARS) -> str:
    """Collapse whitespace in ``value`` and trim it to about ``limit`` chars.

    Falsy values become the empty string. When the collapsed text is longer
    than ``limit``, the tail is replaced by a ``…[trimmed:N chars]`` marker,
    where N is the collapsed length. If ``limit`` is smaller than the marker
    itself, the marker alone is returned.
    """
    collapsed = " ".join(str(value or "").split())
    total = len(collapsed)
    if total > limit:
        suffix = f"…[trimmed:{total} chars]"
        room = limit - len(suffix)
        head = collapsed[:room] if room > 0 else ""
        return head + suffix
    return collapsed
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def redact_string(value: str) -> tuple[str, int]:
    """Replace every secret-looking span in ``value`` with a redaction marker.

    A match that starts with a sensitive key followed by ``=`` or ``:`` keeps
    the key and becomes ``key=[REDACTED]`` or ``key: [REDACTED]``. Any other
    match becomes ``[REDACTED]``.

    Args:
        value: Text to scan with ``SECRET_RE``.

    Returns:
        ``(redacted_text, number_of_replacements)``.
    """

    def repl(match: re.Match[str]) -> str:
        text = match.group(0)
        # Only a *leading* ``key<sep>`` prefix may be kept. Splitting on the
        # first "=" anywhere in the match leaked secret material: base64
        # padding in "Authorization: Basic dXNl...==" or in a PEM body put the
        # secret on the "key" side of the split.
        keyed = re.match(r"[?&]?[A-Za-z0-9_-]+\s*(?=([:=]))", text)
        if keyed is not None and SENSITIVE_KEY_RE.search(keyed.group(0)):
            key = keyed.group(0)
            if keyed.group(1) == "=":
                return key + "=[REDACTED]"
            return key + ": [REDACTED]"
        return "[REDACTED]"

    return SECRET_RE.subn(repl, value)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def redact_whole_value(value: Any) -> tuple[Any, int]:
    """Redact every leaf of ``value`` while keeping its container shape.

    Dicts and lists are rebuilt recursively; dict keys are passed through
    ``redact_string``. Every other value, including ``None`` and numbers,
    is replaced by ``"[REDACTED]"``.

    Returns:
        ``(redacted_value, redaction_count)``.
    """
    if isinstance(value, dict):
        redacted_map: dict[str, Any] = {}
        total = 0
        for raw_key, child in value.items():
            clean_key, key_hits = redact_string(str(raw_key))
            clean_child, child_hits = redact_whole_value(child)
            redacted_map[clean_key] = clean_child
            total += key_hits + child_hits
        return redacted_map, total
    if isinstance(value, list):
        pairs = [redact_whole_value(child) for child in value]
        return [clean for clean, _ in pairs], sum(hits for _, hits in pairs)
    return "[REDACTED]", 1
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def sanitize_value(value: Any, *, sensitive_context: bool = False, sensitive_schema_context: bool = False) -> tuple[Any, int]:
    """Return a copy of ``value`` with secrets redacted, plus a redaction count.

    Args:
        value: Parsed JSON value (dict, list, str, or scalar).
        sensitive_context: When true, everything under ``value`` is redacted
            wholesale via ``redact_whole_value``.
        sensitive_schema_context: When true, ``value`` sits under a sensitive
            key that maps to a dict; value-bearing keys inside it (matched by
            ``VALUE_BEARING_KEY_RE``) are redacted wholesale.

    Returns:
        ``(sanitized_value, redaction_count)``. Scalars other than ``str``
        are returned unchanged with a count of 0.
    """
    if sensitive_context:
        return redact_whole_value(value)
    if isinstance(value, str):
        return redact_string(value)
    if isinstance(value, list):
        cleaned_items: list[Any] = []
        hits = 0
        for element in value:
            cleaned, element_hits = sanitize_value(element, sensitive_schema_context=sensitive_schema_context)
            cleaned_items.append(cleaned)
            hits += element_hits
        return cleaned_items, hits
    if not isinstance(value, dict):
        return value, 0

    cleaned_map: dict[str, Any] = {}
    hits = 0
    for key, child in value.items():
        key_text = str(key)
        clean_key, key_hits = redact_string(key_text)
        if SENSITIVE_KEY_RE.search(key_text):
            if isinstance(child, dict):
                # A dict under a sensitive key is descended into; only its
                # value-bearing keys get blanked.
                cleaned, child_hits = sanitize_value(child, sensitive_schema_context=True)
            else:
                cleaned, child_hits = sanitize_value(child, sensitive_context=True)
        elif sensitive_schema_context and VALUE_BEARING_KEY_RE.search(key_text):
            cleaned, child_hits = sanitize_value(child, sensitive_context=True)
        else:
            cleaned, child_hits = sanitize_value(child, sensitive_schema_context=sensitive_schema_context)
        cleaned_map[clean_key] = cleaned
        hits += key_hits + child_hits
    return cleaned_map, hits
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def read_limited_path(path: Path, max_bytes: int) -> str:
    """Read a catalog file as text, refusing symlinks and oversized files.

    Args:
        path: Catalog location; no component may be a symlink.
        max_bytes: Largest accepted file size in bytes.

    Returns:
        The file content decoded as UTF-8 with undecodable bytes replaced.

    Raises:
        ToolPruneError: If the file cannot be opened, is not a regular file,
            or is larger than ``max_bytes``.
    """
    reject_symlink_components(path)
    flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
    try:
        fd = os.open(str(path), flags)
    except OSError as exc:
        fail(f"catalog read failed: {exc}")
    try:
        st = os.fstat(fd)
        if not stat.S_ISREG(st.st_mode):
            fail("catalog must be a regular file")
        if st.st_size > max_bytes:
            fail(f"catalog exceeds --max-catalog-bytes: {st.st_size} > {max_bytes}")
        # os.read may return fewer bytes than requested, so keep reading
        # until EOF or until one byte past the cap has been collected.
        chunks: list[bytes] = []
        remaining = max_bytes + 1
        while remaining > 0:
            chunk = os.read(fd, remaining)
            if not chunk:
                break
            chunks.append(chunk)
            remaining -= len(chunk)
    finally:
        os.close(fd)
    data = b"".join(chunks)
    # The file may have grown after fstat; the extra byte reveals that.
    if len(data) > max_bytes:
        fail(f"catalog exceeds --max-catalog-bytes: > {max_bytes}")
    return data.decode("utf-8", errors="replace")
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def read_limited_stdin(max_bytes: int) -> str:
    """Read a catalog from standard input, enforcing a byte cap.

    At most ``max_bytes + 1`` bytes are read; the extra byte shows that the
    input is too large.

    Raises:
        ToolPruneError: If more than ``max_bytes`` bytes are available.
    """
    raw = sys.stdin.buffer.read(max_bytes + 1)
    if len(raw) <= max_bytes:
        return raw.decode("utf-8", errors="replace")
    fail(f"catalog exceeds --max-catalog-bytes: > {max_bytes}")
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def parse_catalog_text(text: str) -> tuple[Any, int]:
    """Parse catalog JSON and return it sanitized.

    Args:
        text: Raw catalog document.

    Returns:
        ``(sanitized_catalog, redaction_count)`` as produced by
        ``sanitize_value``.

    Raises:
        ToolPruneError: If ``text`` is not valid JSON or is nested too deeply
            to parse or sanitize.
    """
    try:
        raw = json.loads(text)
    except json.JSONDecodeError as exc:
        fail(f"catalog must be valid JSON: {exc.msg}")
    except RecursionError:
        # Extremely nested input must fail closed, not crash with a traceback.
        fail("catalog nesting is too deep to process")
    try:
        return sanitize_value(raw)
    except RecursionError:
        # The parser can accept deeper nesting than the recursive sanitizer.
        fail("catalog nesting is too deep to process")
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def first_str(mapping: dict[str, Any], keys: tuple[str, ...]) -> str:
    """Return the first non-blank string found under ``keys``.

    Keys are tried in order. A value counts only if it is a ``str`` with at
    least one non-whitespace character; it is returned unstripped.

    Returns:
        The matching value, or ``""`` when no key qualifies.
    """
    values = (mapping.get(key) for key in keys)
    return next((item for item in values if isinstance(item, str) and item.strip()), "")
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def tool_schema_from_dict(raw: dict[str, Any], *, fallback_name: str | None = None, server: str | None = None, index: int = 0) -> Candidate | None:
    """Build a ``Candidate`` from one catalog entry.

    Args:
        raw: Tool entry from the catalog.
        fallback_name: Name used when ``raw`` has none of ``name``, ``tool``,
            ``id`` or ``title``.
        server: Label of the server the tool belongs to, if known.
        index: Discovery position stored on the candidate.

    Returns:
        The candidate, or ``None`` when no usable name can be determined.
        The stored schema is a shallow copy of ``raw`` with ``name``,
        ``description`` and ``server`` filled in where they were missing.
    """
    chosen_name = first_str(raw, ("name", "tool", "id", "title"))
    if not chosen_name:
        chosen_name = fallback_name or ""
    label = cap_text(chosen_name, MAX_LABEL_CHARS)
    if not label:
        return None

    summary = cap_text(first_str(raw, ("description", "summary", "doc", "docs")), MAX_DESCRIPTION_CHARS)

    enriched = dict(raw)
    if "name" not in enriched:
        enriched["name"] = label
    if summary and "description" not in enriched:
        enriched["description"] = summary
    if server and "server" not in enriched:
        enriched["server"] = server

    server_label = cap_text(server, MAX_LABEL_CHARS) if server else None
    return Candidate(
        name=label,
        server=server_label,
        description=summary,
        schema=enriched,
        index=index,
    )
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def normalize_catalog(raw: Any) -> list[Candidate]:
    """Flatten a parsed catalog into a list of candidates.

    Accepted shapes: a list of tools; a dict with ``tools``; a dict with a
    ``servers`` list; a dict with an ``mcpServers`` map; or, when none of
    those yields anything, a plain name-to-schema map.

    Returns:
        Candidates in discovery order; each ``index`` is its list position.

    Raises:
        ToolPruneError: If no tool could be extracted.
    """
    found: list[Candidate] = []

    def register(entry: Any, *, server: str | None = None, fallback_name: str | None = None) -> None:
        # A bare string is treated as a tool that only has a name.
        tool = {"name": entry} if isinstance(entry, str) else entry
        if not isinstance(tool, dict):
            return
        built = tool_schema_from_dict(tool, fallback_name=fallback_name, server=server, index=len(found))
        if built is not None:
            found.append(built)

    def register_group(group: Any, *, server: str | None = None) -> None:
        if isinstance(group, list):
            for entry in group:
                register(entry, server=server)
            return
        if not isinstance(group, dict):
            return
        for tool_name, body in group.items():
            if isinstance(body, dict):
                register(body, server=server, fallback_name=str(tool_name))
            else:
                register({"name": str(tool_name), "schema": body}, server=server)

    if isinstance(raw, list):
        register_group(raw)
    elif isinstance(raw, dict):
        if "tools" in raw:
            register_group(raw.get("tools"), server=first_str(raw, ("server", "name")) or None)

        servers = raw.get("servers")
        if isinstance(servers, list):
            for server_entry in servers:
                if isinstance(server_entry, dict):
                    register_group(
                        server_entry.get("tools"),
                        server=first_str(server_entry, ("name", "id", "server")) or None,
                    )

        mcp_servers = raw.get("mcpServers")
        if isinstance(mcp_servers, dict):
            for server_name, server_entry in mcp_servers.items():
                if isinstance(server_entry, dict):
                    register_group(server_entry.get("tools"), server=str(server_name))

        if not found:
            # Simple name-to-schema map.
            for key, body in raw.items():
                if key in {"tools", "servers", "mcpServers"}:
                    continue
                if isinstance(body, dict):
                    register(body, fallback_name=str(key))
                elif isinstance(body, (str, list)):
                    register({"name": str(key), "schema": body})

    if not found:
        fail("catalog contains no tools")
    return found
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def terms(text: str) -> set[str]:
    """Return the set of lowercased ``TERM_RE`` tokens in ``text``.

    ``None`` or empty input yields an empty set.
    """
    lowered: set[str] = set()
    for token in TERM_RE.findall(text or ""):
        if token:
            lowered.add(token.lower())
    return lowered
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def collect_parameter_text(value: Any, *, depth: int = 0, max_items: int = 500) -> list[str]:
    """Gather key names and short labels from a schema for term matching.

    For dicts, a key is collected when it is one of the structural names
    (``properties``, ``parameters``, ``inputSchema``, ...) or when its value
    is a scalar. String values under ``description``, ``title`` or ``name``
    are collected too. Nested dicts and lists are walked recursively.

    Args:
        value: Schema fragment to walk.
        depth: Current recursion depth; walking stops beyond depth 8.
        max_items: Upper bound on the number of strings returned.

    Returns:
        The collected strings in traversal order, at most ``max_items``.
    """
    if depth > 8 or max_items <= 0:
        return []

    structural = {"properties", "parameters", "inputschema", "input_schema", "schema", "description", "title", "name"}
    labelled = {"description", "title", "name"}
    gathered: list[str] = []

    if isinstance(value, dict):
        for key, child in value.items():
            if len(gathered) >= max_items:
                break
            label = str(key)
            folded = label.lower()
            if folded in structural or isinstance(child, (str, int, float, bool)):
                gathered.append(label)
            if folded in labelled and isinstance(child, str):
                gathered.append(child)
            gathered += collect_parameter_text(child, depth=depth + 1, max_items=max_items - len(gathered))
    elif isinstance(value, list):
        for child in value[:max_items]:
            if len(gathered) >= max_items:
                break
            gathered += collect_parameter_text(child, depth=depth + 1, max_items=max_items - len(gathered))

    return gathered[:max_items]
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def score_candidate(candidate: Candidate, query_terms: set[str]) -> float:
    """Score how well a candidate matches the query terms.

    Each query term found in the name counts 4.0, in the description 1.5,
    and in the collected parameter text 1.0. A term that is only a substring
    of the lowercased name (not a whole name term) adds 1.0.

    Returns:
        The score; ``0.0`` when ``query_terms`` is empty.
    """
    if not query_terms:
        return 0.0

    name_terms = terms(candidate.name)
    weighted_fields = (
        (4.0, name_terms),
        (1.5, terms(candidate.description)),
        (1.0, terms(" ".join(collect_parameter_text(candidate.schema)))),
    )
    total = 0.0
    for weight, field_terms in weighted_fields:
        total += weight * len(query_terms & field_terms)

    # Light substring bonus for names such as git_status when the query says status.
    name_lower = candidate.name.lower()
    total += sum(
        1.0
        for term in query_terms
        if term and term in name_lower and term not in name_terms
    )
    return total
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def rank_candidates(candidates: list[Candidate], query: str) -> list[Candidate]:
    """Score candidates against ``query`` and return them best-first.

    Ties keep discovery order (lower ``index`` first). The returned list
    holds new ``Candidate`` objects with ``score`` set and a 1-based ``rank``.
    """
    wanted = terms(query)
    scored = [(score_candidate(cand, wanted), cand) for cand in candidates]
    scored.sort(key=lambda pair: (-pair[0], pair[1].index))
    return [
        Candidate(cand.name, cand.server, cand.description, cand.schema, cand.index, score, position)
        for position, (score, cand) in enumerate(scored, start=1)
    ]
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def normalized_link_target(parent: Path, raw_target: str) -> Path:
    """Resolve a symlink target string lexically.

    A relative target is joined onto ``parent``; the result is normalized
    with ``os.path.normpath`` without touching the filesystem.
    """
    candidate = Path(raw_target)
    joined = candidate if candidate.is_absolute() else parent / candidate
    return Path(os.path.normpath(str(joined)))
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite a path whose first component is an allow-listed symlink.

    If ``path`` is absolute, its first component is a key of
    ``ALLOWED_FIRST_ABSOLUTE_SYMLINKS``, and that component is a symlink on
    disk pointing at exactly the mapped target, the prefix is replaced by
    the target. Otherwise, including on any ``OSError`` while inspecting the
    link, ``path`` is returned unchanged.
    """
    if not path.is_absolute():
        return path
    parts = path.parts
    if len(parts) < 2:
        return path

    head = parts[1]
    canonical = ALLOWED_FIRST_ABSOLUTE_SYMLINKS.get(head)
    if canonical is None:
        return path

    root = Path(path.anchor)
    link = root / head
    try:
        is_link = stat.S_ISLNK(os.lstat(link).st_mode)
        points_to_canonical = is_link and normalized_link_target(root, os.readlink(link)) == canonical
    except OSError:
        return path
    if not points_to_canonical:
        return path
    return canonical.joinpath(*parts[2:])
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def reject_symlink_components(path: Path) -> None:
    """Refuse ``path`` if any existing component is a symlink.

    The path is first passed through
    ``normalize_allowed_first_absolute_symlink``. Each component is then
    checked with ``lstat``. Checking stops silently at the first component
    that does not exist.

    Raises:
        ToolPruneError: If a component is a symlink, if a non-final component
            is not a directory, or if a component cannot be inspected.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    current = Path(path.anchor) if path.is_absolute() else Path()
    for part in path.parts:
        if path.is_absolute() and part == path.anchor:
            continue
        current = current / part
        try:
            st = os.lstat(current)
        except FileNotFoundError:
            return
        except OSError as exc:
            # Permission problems, over-long names, etc. previously escaped
            # as a raw OSError; report them as a fail-closed error instead.
            fail(f"refusing path with uninspectable component: {current}: {exc}")
        if stat.S_ISLNK(st.st_mode):
            fail(f"refusing path with symlink component: {current}")
        if not stat.S_ISDIR(st.st_mode) and current != path:
            fail(f"refusing path through non-directory component: {current}")
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def ensure_private_dir(path: Path) -> None:
    """Create ``path`` if needed and restrict it to mode ``0o700``.

    Symlink components are rejected both before creation and again after it,
    before the mode is changed.

    Raises:
        ToolPruneError: If the path contains a symlink, or if creating the
            directory or changing its mode fails.
    """
    target = normalize_allowed_first_absolute_symlink(path)
    reject_symlink_components(target)
    try:
        target.mkdir(parents=True, exist_ok=True)
        # Re-check after mkdir in case a component changed in between.
        reject_symlink_components(target)
        os.chmod(target, 0o700)
    except OSError as exc:
        fail(f"store directory unavailable: {exc}")
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def write_private_json_atomic(path: Path, data: dict[str, Any], *, max_bytes: int, label: str) -> int:
    """Write ``data`` as pretty JSON to ``path`` via a private temp file.

    The document is written to a sibling temp file opened with
    ``O_CREAT | O_EXCL`` and mode ``0o600``, then moved into place with
    ``os.replace``.

    Args:
        path: Final file location; its parent is prepared with
            ``ensure_private_dir``.
        data: JSON-serializable mapping.
        max_bytes: Largest accepted encoded size.
        label: Name used for this file in error messages.

    Returns:
        The UTF-8 size of the written document in bytes.

    Raises:
        ToolPruneError: If the document exceeds ``max_bytes`` or the temp
            file cannot be created.
    """
    text = json_bytes(data, indent=2) + "\n"
    size = byte_len_text(text)
    if size > max_bytes:
        fail(f"{label} exceeds size cap: {size} > {max_bytes}")
    ensure_private_dir(path.parent)
    tmp = path.with_name(path.name + f".tmp-{os.getpid()}-{time.time_ns()}")
    flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0)
    try:
        fd = os.open(str(tmp), flags, 0o600)
    except OSError as exc:
        fail(f"{label} write failed: {exc}")
    try:
        # errors="replace" matches byte_len_text/sha256_text. With strict
        # encoding, a lone surrogate (e.g. from a "\ud800" JSON escape)
        # raised UnicodeEncodeError here and the bytes on disk could not
        # match the size computed above.
        with os.fdopen(fd, "w", encoding="utf-8", errors="replace", newline="") as handle:
            handle.write(text)
            handle.flush()
            try:
                os.fsync(handle.fileno())
            except OSError:
                # Best effort: some filesystems do not support fsync.
                pass
        os.replace(tmp, path)
        try:
            os.chmod(path, 0o600)
        except OSError:
            # Best effort: the temp file was already created with 0o600.
            pass
    except Exception:
        # Remove the temp file on any failure, then re-raise unchanged.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
    return size
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def read_private_text(path: Path, *, max_bytes: int, label: str) -> tuple[str, int]:
    """Read a stored file as text with symlink and size checks.

    Args:
        path: File to read; must not be a symlink.
        max_bytes: Largest accepted size in bytes.
        label: Name used for this file in error messages.

    Returns:
        ``(text, byte_count)``, where ``text`` is decoded as UTF-8 with
        undecodable bytes replaced and ``byte_count`` is the raw size read.

    Raises:
        ToolPruneError: If the file is a symlink, cannot be opened, is not a
            regular file, or exceeds ``max_bytes``.
    """
    if path.is_symlink():
        fail(f"{label} must not be a symlink")
    flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
    try:
        fd = os.open(str(path), flags)
    except OSError as exc:
        fail(f"{label} read failed: {exc}")
    try:
        st = os.fstat(fd)
        if not stat.S_ISREG(st.st_mode):
            fail(f"{label} must be a regular file")
        if st.st_size > max_bytes:
            fail(f"{label} exceeds trusted size cap: {st.st_size} > {max_bytes}")
        # os.read may return fewer bytes than requested, so keep reading
        # until EOF or until one byte past the cap has been collected.
        chunks: list[bytes] = []
        remaining = max_bytes + 1
        while remaining > 0:
            chunk = os.read(fd, remaining)
            if not chunk:
                break
            chunks.append(chunk)
            remaining -= len(chunk)
    finally:
        os.close(fd)
    data = b"".join(chunks)
    # The file may have grown after fstat; the extra byte reveals that.
    if len(data) > max_bytes:
        fail(f"{label} exceeds trusted size cap: > {max_bytes}")
    return data.decode("utf-8", errors="replace"), len(data)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def read_private_json(path: Path, *, max_bytes: int, label: str) -> dict[str, Any]:
    """Read a stored JSON object with symlink and size checks.

    The guarded read (symlink refusal, regular-file check, size cap) is
    delegated to ``read_private_text`` instead of being repeated here; the
    checks and error messages are the same.

    Args:
        path: File to read; must not be a symlink.
        max_bytes: Largest accepted size in bytes.
        label: Name used for this file in error messages.

    Returns:
        The parsed JSON object.

    Raises:
        ToolPruneError: If the file fails the read checks, is not valid JSON,
            or its top-level value is not an object.
    """
    text, _size = read_private_text(path, max_bytes=max_bytes, label=label)
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as exc:
        fail(f"{label} is malformed JSON: {exc.msg}")
    if not isinstance(parsed, dict):
        fail(f"{label} must be a JSON object")
    return parsed
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def display_path(path: Path) -> str:
    """Return a redacted, slash-separated form of ``path`` for display.

    The path is made relative to the current working directory. If
    ``os.path.relpath`` raises ``ValueError``, only the file name is used.
    The result is passed through ``redact_string``.
    """
    try:
        relative = os.path.relpath(path, Path.cwd())
    except ValueError:
        relative = path.name
    redacted, _ = redact_string(relative.replace(os.sep, "/"))
    return redacted
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def store_paths(store_dir: str, receipt_id: str) -> tuple[Path, Path, Path]:
    """Compute the store root and the receipt/payload file paths.

    Args:
        store_dir: Store directory; ``~`` is expanded.
        receipt_id: Identifier that must match ``RECEIPT_ID_RE``.

    Returns:
        ``(root, receipt_path, payload_path)``.

    Raises:
        ToolPruneError: If ``receipt_id`` is not 16-64 lowercase hex chars.
    """
    if RECEIPT_ID_RE.fullmatch(receipt_id) is None:
        fail("receipt_id must be 16-64 lowercase hex chars")
    base = normalize_allowed_first_absolute_symlink(Path(store_dir).expanduser())
    receipt_file = base / f"{receipt_id}.receipt.json"
    payload_file = base / f"{receipt_id}.payload.json"
    return base, receipt_file, payload_file
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def build_receipt_id(payload_without_id: dict[str, Any]) -> str:
    """Derive a 20-hex-character receipt id for a payload.

    The id is the first 20 hex characters of a SHA-256 over the compact JSON
    of the payload followed by the current nanosecond time and process id.
    """
    serialized = json_bytes(payload_without_id)
    nonce = f"\n{time.time_ns()}:{os.getpid()}"
    material = (serialized + nonce).encode("utf-8", errors="replace")
    return hashlib.sha256(material).hexdigest()[:20]
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def build_payload(receipt_id: str, ranked: list[Candidate], query: str, redactions: int) -> dict[str, Any]:
    """Assemble the full payload document for a ranked catalog.

    Args:
        receipt_id: Identifier stored in the payload.
        ranked: Candidates in rank order; every one is included with its
            full schema.
        query: Query text to record.
        redactions: Redaction count to record.

    Returns:
        The payload mapping, including a ``created_at_unix`` timestamp.
    """
    created_at = int(time.time())
    tool_entries: list[dict[str, Any]] = []
    for cand in ranked:
        tool_entries.append(
            {
                "name": cand.name,
                "server": cand.server,
                "description": cand.description,
                "score": cand.score,
                "rank": cand.rank,
                "schema_bytes": byte_len_json(cand.schema),
                "schema": cand.schema,
            }
        )
    return {
        "tool": TOOL_NAME,
        "schema_version": SCHEMA_VERSION,
        "receipt_id": receipt_id,
        "created_at_unix": created_at,
        "query": query,
        "candidate_count": len(ranked),
        "redaction": {"redacted_values": redactions},
        "tools": tool_entries,
    }
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def compact_omitted(candidates: list[Candidate], limit: int) -> tuple[list[dict[str, Any]], int]:
    """Summarize tools that were not selected.

    Args:
        candidates: Omitted candidates, in rank order.
        limit: Slice bound for how many candidates get an entry.

    Returns:
        ``(entries, remaining)``: compact name/server/score/rank records for
        ``candidates[:limit]``, and how many candidates have no entry.
    """
    listed = [
        {
            "name": cap_text(cand.name, MAX_LABEL_CHARS),
            "server": cap_text(cand.server, MAX_LABEL_CHARS) if cand.server else None,
            "reason": "below_top_k",
            "score": cand.score,
            "rank": cand.rank,
        }
        for cand in candidates[:limit]
    ]
    unlisted = max(0, len(candidates) - len(listed))
    return listed, unlisted
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def retrieval_command(receipt_id: str, *, store_dir: str, tool_name: str | None = None) -> str:
    """Build the command line that fetches a stored payload.

    Args:
        receipt_id: Receipt to fetch; inserted as-is.
        store_dir: Store directory; added (shell-quoted) only when it differs
            from ``DEFAULT_STORE_DIR``.
        tool_name: Optional tool to narrow the fetch to; shell-quoted.

    Returns:
        The command as one space-joined string ending in ``--json``.
    """
    argv = ["context-guard-tool-prune", "get", receipt_id]
    if store_dir != DEFAULT_STORE_DIR:
        argv += ["--store-dir", shlex.quote(store_dir)]
    if tool_name is not None:
        argv += ["--tool", shlex.quote(tool_name)]
    return " ".join(argv + ["--json"])
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def selected_tool_record(cand: Candidate, receipt_id: str, budget_left: int, *, store_dir: str) -> tuple[dict[str, Any], int]:
    """Build the report entry for one selected tool.

    The full schema is embedded only when its compact JSON size fits in
    ``budget_left``. Otherwise the entry is marked as omitted for budget
    reasons and carries only the retrieval command.

    Returns:
        ``(record, bytes_spent)``, where ``bytes_spent`` is the schema size
        when the schema was included and ``0`` otherwise.
    """
    size = byte_len_json(cand.schema)
    entry: dict[str, Any] = {
        "name": cand.name,
        "server": cand.server,
        "score": cand.score,
        "rank": cand.rank,
        "description": cand.description,
        "schema_bytes": size,
        "retrieval": retrieval_command(receipt_id, store_dir=store_dir, tool_name=cand.name),
    }
    if size > budget_left:
        entry["schema_included"] = False
        entry["schema_omitted_reason"] = "budget"
        return entry, 0
    entry["schema_included"] = True
    entry["schema"] = cand.schema
    return entry, size
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def shrink_result_for_output(result: dict[str, Any], max_output_bytes: int) -> str:
    """Render ``result`` as JSON, trimming it until it fits the byte cap.

    Steps, stopping at the first rendering that fits:
      1. Render unchanged.
      2. Repeatedly halve ``omitted_tools``, flagging the truncation and
         adding a summary line.
      3. Empty ``omitted_tools`` and drop ``description`` from every
         selected tool.

    The caller's ``result`` is not modified; trimming works on a copy made
    through a JSON round-trip.

    Raises:
        ToolPruneError: If even the fully trimmed report is too large.
    """

    def render(document: dict[str, Any]) -> str:
        return json_bytes(document, indent=2) + "\n"

    rendered = render(result)
    if byte_len_text(rendered) <= max_output_bytes:
        return rendered

    trimmed = json.loads(json_bytes(result))
    remaining = trimmed.get("omitted_tools")
    while isinstance(remaining, list) and remaining:
        remaining = remaining[: len(remaining) // 2]
        trimmed["omitted_tools"] = remaining
        trimmed["omitted_tools_truncated"] = True
        trimmed["omitted_tools_summary"] = f"{trimmed.get('omitted_count', 0)} tools omitted; list capped to fit --max-output-bytes"
        rendered = render(trimmed)
        if byte_len_text(rendered) <= max_output_bytes:
            return rendered

    trimmed["omitted_tools"] = []
    trimmed["omitted_tools_truncated"] = True
    for entry in trimmed.get("selected_tools", []):
        if isinstance(entry, dict):
            entry.pop("description", None)
    rendered = render(trimmed)
    if byte_len_text(rendered) > max_output_bytes:
        fail(f"select report exceeds --max-output-bytes: {byte_len_text(rendered)} > {max_output_bytes}")
    return rendered
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def select_catalog(args: argparse.Namespace) -> str:
    """Run the ``select`` command and return the rendered JSON report.

    Steps, in order:

    * validate every numeric limit with ``bounded_int``;
    * read the catalog from ``args.catalog`` or stdin, parse it, and rank the
      normalized candidates against the raw query;
    * build the payload and the compact receipt and check both size caps;
    * build records for the top candidates, inlining schemas while the byte
      budget allows, and a compact list of the rest;
    * shrink the report to ``--max-output-bytes``;
    * only then write the payload and receipt files to the store directory.

    Size or validation problems are reported through ``fail`` (defined
    elsewhere in this module; presumably it raises).
    """
    max_catalog_bytes = bounded_int(
        args.max_catalog_bytes, default=DEFAULT_MAX_CATALOG_BYTES, minimum=1, maximum=100_000_000, name="--max-catalog-bytes"
    )
    max_output_bytes = bounded_int(
        args.max_output_bytes, default=DEFAULT_MAX_OUTPUT_BYTES, minimum=1, maximum=10_000_000, name="--max-output-bytes"
    )
    max_payload_bytes = bounded_int(
        args.max_payload_bytes, default=DEFAULT_MAX_PAYLOAD_BYTES, minimum=1, maximum=100_000_000, name="--max-payload-bytes"
    )
    max_receipt_bytes = bounded_int(
        args.max_receipt_bytes, default=DEFAULT_MAX_RECEIPT_BYTES, minimum=1, maximum=10_000_000, name="--max-receipt-bytes"
    )
    top = bounded_int(args.top, default=DEFAULT_TOP, minimum=1, maximum=MAX_TOP, name="--top")
    budget_bytes = bounded_int(
        args.budget_bytes, default=DEFAULT_BUDGET_BYTES, minimum=0, maximum=100_000_000, name="--budget-bytes"
    )

    if args.catalog:
        catalog_text = read_limited_path(Path(args.catalog), max_catalog_bytes)
    else:
        catalog_text = read_limited_stdin(max_catalog_bytes)
    raw_catalog, catalog_redactions = parse_catalog_text(catalog_text)
    raw_query = args.query or ""
    safe_query, query_redactions = redact_string(raw_query)
    total_redactions = catalog_redactions + query_redactions
    # Ranking sees the raw query; only the redacted form is stored or echoed.
    ranked = rank_candidates(normalize_catalog(raw_catalog), raw_query)

    # The receipt id is derived from a payload built with a placeholder id.
    receipt_id = build_receipt_id(build_payload("pending", ranked, safe_query, total_redactions))
    payload = build_payload(receipt_id, ranked, safe_query, total_redactions)
    payload_text = json_bytes(payload, indent=2) + "\n"
    payload_bytes = byte_len_text(payload_text)
    if payload_bytes > max_payload_bytes:
        fail(f"payload exceeds --max-payload-bytes: {payload_bytes} > {max_payload_bytes}")
    payload_sha = sha256_text(payload_text.rstrip("\n"))

    store_dir, receipt_path, payload_path = store_paths(args.store_dir, receipt_id)
    receipt = {
        "tool": TOOL_NAME,
        "schema_version": SCHEMA_VERSION,
        "receipt_id": receipt_id,
        "created_at_unix": int(time.time()),
        "path": display_path(receipt_path),
        "payload_path": display_path(payload_path),
        "payload_sha256": payload_sha,
        "payload_bytes": payload_bytes,
        "contains": "compact_metadata_plus_sanitized_payload",
        "tool_count": len(ranked),
        "tools": [cand.name for cand in ranked[:50]],
        "tools_truncated": len(ranked) > 50,
        "retrieval_hint": retrieval_command(receipt_id, store_dir=args.store_dir, tool_name="<name>"),
    }
    receipt_size = byte_len_text(json_bytes(receipt, indent=2) + "\n")
    if receipt_size > max_receipt_bytes:
        fail(f"receipt exceeds --max-receipt-bytes: {receipt_size} > {max_receipt_bytes}")

    # Each record is offered whatever is left of the inline schema budget.
    selected_records: list[dict[str, Any]] = []
    selected_schema_bytes = 0
    for cand in ranked[:top]:
        entry, spent = selected_tool_record(
            cand, receipt_id, budget_bytes - selected_schema_bytes, store_dir=args.store_dir
        )
        selected_schema_bytes += spent
        selected_records.append(entry)

    leftover = ranked[top:]
    omitted_tools, omitted_truncated = compact_omitted(leftover, MAX_OMITTED_TOOLS)
    report = {
        "tool": TOOL_NAME,
        "schema_version": SCHEMA_VERSION,
        "mode": "select",
        "query": safe_query,
        "top": top,
        "budget_bytes": budget_bytes,
        "selected_schema_bytes": selected_schema_bytes,
        "candidate_count": len(ranked),
        "selected_tools": selected_records,
        "omitted_tools": omitted_tools,
        "omitted_count": len(leftover),
        "omitted_tools_truncated_count": omitted_truncated,
        "receipt": {
            **receipt,
            "bytes": receipt_size,
        },
        "token_proxy": {"measurement": "estimated", "chars_per_token": TOKEN_PROXY_CHARS_PER_TOKEN},
        "caveats": [
            "Ranking is heuristic lexical overlap, not a correctness proof.",
            "Token counts are estimated proxies; byte counts and schema budgets are observed UTF-8 bytes.",
            "Use the receipt get command to retrieve full sanitized schemas before relying on omitted details.",
        ],
        "redaction": {"redacted_values": total_redactions},
    }
    report_text = shrink_result_for_output(report, max_output_bytes)

    # Only write after every size gate has passed, so failures leave no success receipt.
    ensure_private_dir(store_dir)
    payload_written = write_private_json_atomic(payload_path, payload, max_bytes=max_payload_bytes, label="payload")
    if payload_written != payload_bytes:
        fail("payload byte size changed during write")
    receipt_written = write_private_json_atomic(receipt_path, receipt, max_bytes=max_receipt_bytes, label="receipt")
    if receipt_written != receipt_size:
        fail("receipt byte size changed during write")
    return report_text
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def payload_path_from_receipt(store_dir: Path, receipt_id: str, receipt: dict[str, Any]) -> Path:
    """Return the payload file path for ``receipt_id`` inside ``store_dir``.

    The returned path is always ``store_dir / "<receipt_id>.payload.json"``;
    the ``payload_path`` recorded in the receipt is only cross-checked and
    never used to build the result. A recorded value that is absolute, or
    whose final component differs from the expected file name, is reported
    through ``fail``. A missing or empty recorded value is accepted.
    """
    canonical_name = f"{receipt_id}.payload.json"
    recorded = str(receipt.get("payload_path") or "")
    if not recorded:
        return store_dir / canonical_name

    recorded_path = Path(recorded)
    if recorded_path.is_absolute():
        fail("receipt payload_path must be relative")
    if recorded_path.name != canonical_name:
        fail("receipt payload_path does not match receipt_id")
    return store_dir / canonical_name
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
def get_schema(args: argparse.Namespace) -> str:
    """Run the ``get`` command and return the rendered JSON report.

    The receipt named by ``args.receipt_id`` is loaded from the store
    directory and its recorded payload size and SHA-256 are checked against
    the payload file before the payload is parsed. Without ``args.tool`` the
    report lists the tool names found in the payload; with it, the report
    carries that tool's ``server`` and ``schema``. The report is passed
    through ``sanitize_value`` and must fit ``--max-output-bytes``.

    Every validation problem is reported through ``fail`` (defined elsewhere
    in this module; presumably it raises).
    """
    max_payload_bytes = bounded_int(
        args.max_payload_bytes, default=DEFAULT_MAX_PAYLOAD_BYTES, minimum=1, maximum=100_000_000, name="--max-payload-bytes"
    )
    max_receipt_bytes = bounded_int(
        args.max_receipt_bytes, default=DEFAULT_MAX_RECEIPT_BYTES, minimum=1, maximum=10_000_000, name="--max-receipt-bytes"
    )
    max_output_bytes = bounded_int(
        args.max_output_bytes, default=10_000_000, minimum=1, maximum=100_000_000, name="--max-output-bytes"
    )

    receipt_id = args.receipt_id
    if not RECEIPT_ID_RE.fullmatch(receipt_id):
        fail("receipt_id must be 16-64 lowercase hex chars")

    # Load the receipt first; everything else is checked against what it records.
    store_dir, receipt_path, _unused_payload_path = store_paths(args.store_dir, receipt_id)
    reject_symlink_components(receipt_path)
    receipt = read_private_json(receipt_path, max_bytes=max_receipt_bytes, label="receipt")
    if receipt.get("receipt_id") != receipt_id:
        fail("receipt id mismatch")

    payload_path = payload_path_from_receipt(store_dir, receipt_id, receipt)
    reject_symlink_components(payload_path)

    recorded_bytes = receipt.get("payload_bytes")
    recorded_sha = receipt.get("payload_sha256")
    if not isinstance(recorded_bytes, int) or recorded_bytes < 0:
        fail("receipt missing payload byte size")
    if recorded_bytes > max_payload_bytes:
        fail(f"payload exceeds trusted size cap: {recorded_bytes} > {max_payload_bytes}")
    if not isinstance(recorded_sha, str) or not re.fullmatch(r"[a-f0-9]{64}", recorded_sha):
        fail("receipt missing payload sha256")

    payload_text, actual_size = read_private_text(payload_path, max_bytes=max_payload_bytes, label="payload")
    if actual_size != recorded_bytes:
        fail(f"payload size mismatch: {actual_size} != {recorded_bytes}")
    # The digest covers the text without its trailing newline, as in select.
    if sha256_text(payload_text.rstrip("\n")) != recorded_sha:
        fail("payload sha256 mismatch")

    try:
        payload = json.loads(payload_text)
    except json.JSONDecodeError as exc:
        fail(f"payload is malformed JSON: {exc.msg}")
    if not isinstance(payload, dict):
        fail("payload must be a JSON object")
    if payload.get("receipt_id") != receipt_id:
        fail("payload receipt id mismatch")
    tools = payload.get("tools")
    if not isinstance(tools, list):
        fail("payload tools missing")

    if args.tool:
        # First dict entry whose name matches wins.
        match = next(
            (entry for entry in tools if isinstance(entry, dict) and entry.get("name") == args.tool),
            None,
        )
        if match is None:
            safe_tool, _tool_redactions = redact_string(args.tool)
            fail(f"tool not found in receipt: {safe_tool}")
        report = {
            "tool": TOOL_NAME,
            "schema_version": SCHEMA_VERSION,
            "mode": "get",
            "receipt_id": receipt_id,
            "tool_name": args.tool,
            "server": match.get("server"),
            "schema": match.get("schema"),
        }
    else:
        report = {
            "tool": TOOL_NAME,
            "schema_version": SCHEMA_VERSION,
            "mode": "get",
            "receipt_id": receipt_id,
            "tools": [entry.get("name") for entry in tools if isinstance(entry, dict)],
        }

    sanitized_report, _redactions = sanitize_value(report)
    if not isinstance(sanitized_report, dict):
        fail("get result sanitation failed")
    rendered = json_bytes(sanitized_report, indent=2) + "\n"
    if byte_len_text(rendered) > max_output_bytes:
        fail(f"get report exceeds --max-output-bytes: {byte_len_text(rendered)} > {max_output_bytes}")
    return rendered
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with its ``select`` and ``get`` subcommands.

    Numeric options are declared without ``type=``, so argparse leaves values
    given on the command line as strings.
    """
    parser = argparse.ArgumentParser(description="Select bounded top-k tool/MCP schemas with local full-schema fallback receipts.")
    subparsers = parser.add_subparsers(dest="command", required=True)
    json_help = "emit JSON (default and only stable output contract)"

    select_parser = subparsers.add_parser("select", help="rank a local catalog and emit a bounded selection report")
    select_parser.add_argument("--catalog", help="catalog JSON path; stdin is used when omitted")
    select_parser.add_argument("--query", default="", help="task query used for lexical ranking")
    # (flag, default, help label) for every option whose help ends in "(default: N)".
    select_options = (
        ("--top", DEFAULT_TOP, "number of tools to select"),
        ("--budget-bytes", DEFAULT_BUDGET_BYTES, "inline selected schema byte budget"),
        ("--max-catalog-bytes", DEFAULT_MAX_CATALOG_BYTES, "maximum catalog JSON bytes"),
        ("--max-output-bytes", DEFAULT_MAX_OUTPUT_BYTES, "maximum rendered select JSON bytes"),
        ("--max-payload-bytes", DEFAULT_MAX_PAYLOAD_BYTES, "maximum sanitized payload bytes"),
        ("--max-receipt-bytes", DEFAULT_MAX_RECEIPT_BYTES, "maximum compact receipt bytes"),
        ("--store-dir", DEFAULT_STORE_DIR, "receipt/payload directory"),
    )
    for flag, default, label in select_options:
        select_parser.add_argument(flag, default=default, help=f"{label} (default: {default})")
    select_parser.add_argument("--json", action="store_true", help=json_help)

    get_parser = subparsers.add_parser("get", help="retrieve a full sanitized schema from a receipt payload")
    get_parser.add_argument("receipt_id", help="receipt id returned by select")
    get_parser.add_argument("--tool", help="tool name to retrieve; omit to list available names")
    get_parser.add_argument("--store-dir", default=DEFAULT_STORE_DIR, help=f"receipt/payload directory (default: {DEFAULT_STORE_DIR})")
    get_parser.add_argument("--max-output-bytes", default=10_000_000, help="maximum rendered get JSON bytes")
    get_options = (
        ("--max-payload-bytes", DEFAULT_MAX_PAYLOAD_BYTES, "maximum trusted payload bytes"),
        ("--max-receipt-bytes", DEFAULT_MAX_RECEIPT_BYTES, "maximum trusted receipt bytes"),
    )
    for flag, default, label in get_options:
        get_parser.add_argument(flag, default=default, help=f"{label} (default: {default})")
    get_parser.add_argument("--json", action="store_true", help=json_help)
    return parser
|
|
815
|
+
|
|
816
|
+
|
|
817
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv``, run the requested subcommand, and return an exit status.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` after the report has been written to stdout, ``1`` when a
        ``ToolPruneError`` is reported on stderr or stdout's reader has gone
        away, and ``2`` when no known subcommand was selected.
    """
    import os  # local import: only needed on the broken-pipe path

    parser = build_parser()
    args = parser.parse_args(argv)
    handlers = {"select": select_catalog, "get": get_schema}
    try:
        handler = handlers.get(args.command)
        if handler is None:
            parser.print_help(sys.stderr)
            return 2
        sys.stdout.write(handler(args))
        # Flush inside the try block: stdout is buffered, so without this a
        # closed pipe only surfaces at interpreter shutdown, outside the
        # BrokenPipeError handler below.
        sys.stdout.flush()
        return 0
    except ToolPruneError as exc:
        print(f"{TOOL_NAME}: {exc}", file=sys.stderr)
        return 1
    except BrokenPipeError:
        # Point stdout at devnull so the implicit flush at exit cannot raise
        # again (the approach recommended in the Python ``signal`` docs).
        try:
            devnull = os.open(os.devnull, os.O_WRONLY)
            try:
                os.dup2(devnull, sys.stdout.fileno())
            finally:
                os.close(devnull)
        except (OSError, ValueError):
            # Best effort only: stdout may have no real file descriptor
            # (for example when it has been replaced by an in-memory stream).
            pass
        return 1
|
|
834
|
+
|
|
835
|
+
|
|
836
|
+
# Script entry point: the status returned by main() becomes the exit code.
if __name__ == "__main__":
    sys.exit(main())
|