@ictechgy/context-guard 0.4.9 → 0.4.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/README.ko.md +59 -31
- package/README.md +85 -36
- package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
- package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
- package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
- package/docs/benchmark-workflow-examples.md +3 -0
- package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
- package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
- package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
- package/docs/distribution.md +10 -7
- package/docs/experimental-benchmark-fixtures.md +30 -6
- package/package.json +4 -6
- package/packaging/homebrew/context-guard.rb.template +1 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +20 -14
- package/plugins/context-guard/README.md +26 -17
- package/plugins/context-guard/bin/context-guard +147 -25
- package/plugins/context-guard/bin/context-guard-artifact +884 -79
- package/plugins/context-guard/bin/context-guard-audit +33 -2
- package/plugins/context-guard/bin/context-guard-bench +1542 -31
- package/plugins/context-guard/bin/context-guard-cache-score +665 -0
- package/plugins/context-guard/bin/context-guard-compress +146 -1
- package/plugins/context-guard/bin/context-guard-cost +790 -6
- package/plugins/context-guard/bin/context-guard-experiments +463 -26
- package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
- package/plugins/context-guard/bin/context-guard-filter +163 -7
- package/plugins/context-guard/bin/context-guard-guard-read +3 -0
- package/plugins/context-guard/bin/context-guard-pack +892 -49
- package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
- package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
- package/plugins/context-guard/bin/context-guard-setup +165 -31
- package/plugins/context-guard/bin/context-guard-statusline +490 -283
- package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
- package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
- package/plugins/context-guard/bin/context-guard-trim-output +288 -41
- package/plugins/context-guard/brief/README.md +5 -5
- package/plugins/context-guard/lib/context_guard_commands.py +230 -0
- package/plugins/context-guard/skills/setup/SKILL.md +1 -0
- package/context-guard-kit/README.md +0 -91
- package/context-guard-kit/benchmark_runner.py +0 -2401
- package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
- package/context-guard-kit/context_compress.py +0 -695
- package/context-guard-kit/context_escrow.py +0 -935
- package/context-guard-kit/context_filter.py +0 -637
- package/context-guard-kit/context_guard_cli.py +0 -325
- package/context-guard-kit/context_guard_diet.py +0 -1711
- package/context-guard-kit/context_pack.py +0 -2713
- package/context-guard-kit/cost_guard.py +0 -2349
- package/context-guard-kit/experimental_registry.py +0 -4348
- package/context-guard-kit/failed_attempt_nudge.py +0 -567
- package/context-guard-kit/guard_large_read.py +0 -690
- package/context-guard-kit/hook_secret_patterns.py +0 -43
- package/context-guard-kit/read_symbol.py +0 -483
- package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
- package/context-guard-kit/sanitize_output.py +0 -725
- package/context-guard-kit/settings.example.json +0 -67
- package/context-guard-kit/setup_wizard.py +0 -2515
- package/context-guard-kit/statusline.sh +0 -362
- package/context-guard-kit/statusline_merged.sh +0 -157
- package/context-guard-kit/tool_schema_pruner.py +0 -837
- package/context-guard-kit/trim_command_output.py +0 -1449
|
@@ -1,1711 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Scan a project for Claude Code token-diet configuration gaps.
|
|
3
|
-
|
|
4
|
-
The scanner is intentionally local, read-only, and heuristic. It looks for
|
|
5
|
-
large always-in-context instruction files, missing read deny rules for bulky or
|
|
6
|
-
sensitive paths, and missing helper hooks/statusline settings that reduce token
|
|
7
|
-
burn during noisy command runs.
|
|
8
|
-
"""
|
|
9
|
-
from __future__ import annotations
|
|
10
|
-
|
|
11
|
-
import argparse
|
|
12
|
-
import ast
|
|
13
|
-
from collections import Counter, defaultdict
|
|
14
|
-
import errno
|
|
15
|
-
import hashlib
|
|
16
|
-
import json
|
|
17
|
-
import os
|
|
18
|
-
import re
|
|
19
|
-
import stat
|
|
20
|
-
import sys
|
|
21
|
-
from dataclasses import dataclass, field
|
|
22
|
-
from pathlib import Path
|
|
23
|
-
from typing import Any, Iterable
|
|
24
|
-
|
|
25
|
-
CONTEXT_FILE_NAMES = {"CLAUDE.md", "AGENTS.md", "GEMINI.md"}
|
|
26
|
-
CONTEXT_EXACT_REL_FILES = {
|
|
27
|
-
".clinerules",
|
|
28
|
-
".cursorrules",
|
|
29
|
-
".github/copilot-instructions.md",
|
|
30
|
-
".windsurfrules",
|
|
31
|
-
}
|
|
32
|
-
CONTEXT_MD_DIRS = {
|
|
33
|
-
".claude/agents",
|
|
34
|
-
".claude/commands",
|
|
35
|
-
".claude/skills",
|
|
36
|
-
".clinerules",
|
|
37
|
-
".cursor/rules",
|
|
38
|
-
".windsurf/rules",
|
|
39
|
-
}
|
|
40
|
-
CONTEXT_SURFACE_LABELS = {
|
|
41
|
-
"claude": "Claude Code instructions",
|
|
42
|
-
"codex": "OpenAI Codex AGENTS.md",
|
|
43
|
-
"gemini": "Gemini CLI instructions",
|
|
44
|
-
"cursor": "Cursor rules",
|
|
45
|
-
"windsurf": "Windsurf rules",
|
|
46
|
-
"cline": "Cline rules",
|
|
47
|
-
"copilot": "GitHub Copilot instructions",
|
|
48
|
-
}
|
|
49
|
-
EXCLUDED_DIR_NAMES = {
|
|
50
|
-
".cache",
|
|
51
|
-
".git",
|
|
52
|
-
".hg",
|
|
53
|
-
".mypy_cache",
|
|
54
|
-
".next",
|
|
55
|
-
".omx",
|
|
56
|
-
".pytest_cache",
|
|
57
|
-
".ruff_cache",
|
|
58
|
-
".serena",
|
|
59
|
-
".tox",
|
|
60
|
-
".venv",
|
|
61
|
-
".vscode",
|
|
62
|
-
"__pycache__",
|
|
63
|
-
"build",
|
|
64
|
-
"coverage",
|
|
65
|
-
"dist",
|
|
66
|
-
"node_modules",
|
|
67
|
-
"target",
|
|
68
|
-
"vendor",
|
|
69
|
-
}
|
|
70
|
-
MAX_CONTEXT_READ_BYTES = 512_000
|
|
71
|
-
MAX_SECRET_SCAN_BYTES = 5_000_000
|
|
72
|
-
MAX_SETTINGS_READ_BYTES = 256_000
|
|
73
|
-
DEFAULT_LARGE_CONTEXT_BYTES = 16_000
|
|
74
|
-
DEFAULT_HUGE_CONTEXT_BYTES = 64_000
|
|
75
|
-
DEFAULT_LONG_CONTEXT_LINES = 300
|
|
76
|
-
STRUCTURAL_WASTE_SCHEMA_VERSION = "contextguard.structural-waste.v1"
|
|
77
|
-
DEFAULT_STRUCTURAL_WASTE_TOP = 20
|
|
78
|
-
DEFAULT_DUPLICATE_RULE_MIN_CHARS = 48
|
|
79
|
-
DEFAULT_DUPLICATE_CALL_THRESHOLD = 3
|
|
80
|
-
DEFAULT_MCP_SERVER_THRESHOLD = 6
|
|
81
|
-
DEFAULT_TOOL_COUNT_THRESHOLD = 40
|
|
82
|
-
DEFAULT_LARGE_SCHEMA_BYTES = 12_000
|
|
83
|
-
DEFAULT_MAX_TOOL_CATALOG_BYTES = 1_000_000
|
|
84
|
-
DEFAULT_MAX_LOG_BYTES = 5_000_000
|
|
85
|
-
DEFAULT_MAX_LOG_LINE_BYTES = 1_000_000
|
|
86
|
-
DEFAULT_MAX_STRUCTURAL_FILES = 2_000
|
|
87
|
-
MAX_REPORT_LABEL_CHARS = 160
|
|
88
|
-
TEXT_REFERENCE_SUFFIXES = {".md", ".txt", ".json", ".toml", ".yaml", ".yml", ".py", ".js", ".ts", ".tsx", ".jsx", ".sh"}
|
|
89
|
-
TOOL_CALL_NAME_KEYS = ("tool_name", "toolName", "tool")
|
|
90
|
-
TOOL_CALL_INPUT_KEYS = ("tool_input", "input", "arguments", "args", "parameters")
|
|
91
|
-
READ_TOOL_NAMES = {"read", "read_file", "fileread", "view_file", "open_file", "get_file", "functions.get_file"}
|
|
92
|
-
FILE_PATH_KEYS = {"file_path", "filepath", "path", "absolute_path", "relative_path", "file"}
|
|
93
|
-
|
|
94
|
-
HEAVY_PROJECT_DENIES: tuple[tuple[str, str, str], ...] = (
|
|
95
|
-
("node_modules", "node_modules", "Read(./node_modules/**)"),
|
|
96
|
-
("dist", "dist", "Read(./dist/**)"),
|
|
97
|
-
("build", "build", "Read(./build/**)"),
|
|
98
|
-
("coverage", "coverage", "Read(./coverage/**)"),
|
|
99
|
-
("logs", "logs", "Read(./logs/**)"),
|
|
100
|
-
("tmp", "tmp", "Read(./tmp/**)"),
|
|
101
|
-
("target", "target", "Read(./target/**)"),
|
|
102
|
-
(".next", ".next", "Read(./.next/**)"),
|
|
103
|
-
(".venv", ".venv", "Read(./.venv/**)"),
|
|
104
|
-
("vendor", "vendor", "Read(./vendor/**)"),
|
|
105
|
-
(".context-guard", ".context-guard", "Read(./.context-guard/**)"),
|
|
106
|
-
(".claude-token-optimizer", ".claude-token-optimizer", "Read(./.claude-token-optimizer/**)"),
|
|
107
|
-
)
|
|
108
|
-
SENSITIVE_PROJECT_DENIES: tuple[tuple[str, str, str], ...] = (
|
|
109
|
-
(".env", ".env", "Read(./.env)"),
|
|
110
|
-
(".env.*", ".env.*", "Read(./.env.*)"),
|
|
111
|
-
(".npmrc", ".npmrc", "Read(./.npmrc)"),
|
|
112
|
-
(".pypirc", ".pypirc", "Read(./.pypirc)"),
|
|
113
|
-
(".netrc", ".netrc", "Read(./.netrc)"),
|
|
114
|
-
)
|
|
115
|
-
SENSITIVE_HOME_DENIES: tuple[tuple[str, str], ...] = (
|
|
116
|
-
("~/.ssh", "Read(~/.ssh/**)"),
|
|
117
|
-
("~/.aws", "Read(~/.aws/**)"),
|
|
118
|
-
("~/.gnupg", "Read(~/.gnupg/**)"),
|
|
119
|
-
("~/.kube", "Read(~/.kube/**)"),
|
|
120
|
-
("~/.docker", "Read(~/.docker/**)"),
|
|
121
|
-
)
|
|
122
|
-
SECRET_CONTENT_RE = re.compile(
|
|
123
|
-
r"(?is)("
|
|
124
|
-
r"-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----|"
|
|
125
|
-
r"AKIA[0-9A-Z]{16}|"
|
|
126
|
-
r"gh[pousr]_[A-Za-z0-9_]{20,}|"
|
|
127
|
-
r"xox[abprs]-[A-Za-z0-9-]{10,}|"
|
|
128
|
-
r"AIza[0-9A-Za-z_\-]{20,}|"
|
|
129
|
-
r"(?i:Authorization)\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+|"
|
|
130
|
-
r"(?<![A-Za-z0-9])(?:api[_-]?key|token|secret|password|client[_-]?secret)\s*[:=]\s*[^\s]+"
|
|
131
|
-
r")"
|
|
132
|
-
)
|
|
133
|
-
REDACTED_PATH_COMPONENT = "[REDACTED-PATH-COMPONENT]"
|
|
134
|
-
BASH_TRIM_COMMAND_MARKERS = (
|
|
135
|
-
"context-guard-rewrite-bash",
|
|
136
|
-
"claude-token-rewrite-bash",
|
|
137
|
-
"rewrite_bash_for_token_budget.py",
|
|
138
|
-
)
|
|
139
|
-
LARGE_READ_GUARD_COMMAND_MARKERS = (
|
|
140
|
-
"context-guard-guard-read",
|
|
141
|
-
"claude-token-guard-read",
|
|
142
|
-
"guard_large_read.py",
|
|
143
|
-
)
|
|
144
|
-
STATUSLINE_COMMAND_MARKERS = (
|
|
145
|
-
"context-guard-statusline",
|
|
146
|
-
"claude-token-statusline",
|
|
147
|
-
"statusline.sh",
|
|
148
|
-
"statusline_merged.sh",
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
@dataclass
class Finding:
    """A single scanner result.

    ``rule_id`` and ``instance_id`` are optional; when they are unset (or
    empty) ``as_dict`` reports ``id`` in their place.
    """

    id: str
    severity: str
    path: str
    message: str
    action: str
    evidence: dict[str, Any] = field(default_factory=dict)
    rule_id: str | None = None
    instance_id: str | None = None

    def as_dict(self) -> dict[str, Any]:
        """Return the finding as a plain dict with a fixed key order."""
        fallback = self.id
        payload: dict[str, Any] = {"id": fallback}
        payload["rule_id"] = self.rule_id if self.rule_id else fallback
        payload["instance_id"] = self.instance_id if self.instance_id else fallback
        for name in ("severity", "path", "message", "action", "evidence"):
            payload[name] = getattr(self, name)
        return payload
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
def path_hash(path: Path) -> str:
    """Return the first 12 hex digits of the SHA-256 of ``str(path)``."""
    encoded = str(path).encode("utf-8", "replace")
    digest = hashlib.sha256(encoded).hexdigest()
    return digest[:12]
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
def text_hash(text: str) -> str:
    """Return the first 12 hex digits of the SHA-256 of *text* (UTF-8 encoded)."""
    encoded = text.encode("utf-8", "replace")
    digest = hashlib.sha256(encoded).hexdigest()
    return digest[:12]
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
def safe_id_part(text: str) -> str:
    """Turn label text into a lowercase, dash-separated id fragment.

    ``*`` is spelled out as ``star``; every other run of characters outside
    ``a-z0-9`` becomes a single dash, and edge dashes are trimmed.
    """
    spelled_out = text.lower().replace("*", " star ")
    dashed = re.sub(r"[^a-z0-9]+", "-", spelled_out)
    return dashed.strip("-")
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
def safe_resolve(path: Path) -> Path:
    """Resolve *path*; if resolution raises, return ``path.absolute()`` instead."""
    try:
        resolved = path.resolve()
    except (OSError, RuntimeError):
        # Resolution failed; settle for an absolute but unresolved path.
        resolved = path.absolute()
    return resolved
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
def path_component_contains_secret(component: str) -> bool:
    """Report whether a single path component matches ``SECRET_CONTENT_RE``.

    Empty components and the ``.`` / ``..`` markers are never considered secret.
    """
    if not component or component in {".", ".."}:
        return False
    return SECRET_CONTENT_RE.search(component) is not None
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def sanitize_path_component(component: str) -> str:
    """Return *component* unchanged unless it looks secret, else the redaction marker."""
    keep_verbatim = (
        not component
        or component in {".", ".."}
        or not path_component_contains_secret(component)
    )
    return component if keep_verbatim else REDACTED_PATH_COMPONENT
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
def sanitize_rel_path(path: str) -> str:
    """Sanitize each ``/``-separated component of *path* and rejoin with ``/``."""
    cleaned = [sanitize_path_component(piece) for piece in path.split("/")]
    return "/".join(cleaned)
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
def sanitize_path_text(path: str) -> str:
    """Convert ``os.sep`` to ``/`` and sanitize every component of *path*."""
    posix_style = path.replace(os.sep, "/")
    return "/".join(map(sanitize_path_component, posix_style.split("/")))
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
def display_path_hash(path: Path) -> str:
    """Hash the sanitized text of the resolved *path*."""
    resolved_text = str(safe_resolve(path))
    return text_hash(sanitize_path_text(resolved_text))
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
def path_label(path: Path, show_paths: bool) -> str:
    """Build the label used for *path* in reports.

    With ``show_paths`` the sanitized path text is returned; otherwise only the
    sanitized final name plus a ``#path:<hash>`` suffix.
    """
    if not show_paths:
        short_name = sanitize_path_component(path.name or "path")
        return f"{short_name}#path:{display_path_hash(path)}"
    return sanitize_path_text(str(path))
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
def context_finding(
    rule_id: str,
    severity: str,
    path: str,
    message: str,
    action: str,
    evidence: dict[str, Any] | None = None,
) -> Finding:
    """Create a ``Finding`` whose id is ``<rule_id>-<hash of path>``.

    The generated id is stored both as ``id`` and ``instance_id``; ``rule_id``
    is kept separately.
    """
    instance_id = f"{rule_id}-{text_hash(path)}"
    return Finding(
        instance_id,
        severity,
        path,
        message,
        action,
        evidence or {},
        rule_id=rule_id,
        instance_id=instance_id,
    )
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
def root_label(root: Path, show_paths: bool) -> str:
    """Build the label used for the project root in reports.

    With ``show_paths`` the sanitized path text is returned; otherwise the
    sanitized directory name (or ``project``) plus a ``#path:<hash>`` suffix.
    """
    if not show_paths:
        short_name = sanitize_path_component(root.name or "project")
        return f"{short_name}#path:{display_path_hash(root)}"
    return sanitize_path_text(str(root))
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
def rel_path(path: Path, root: Path) -> str:
    """Return *path* relative to *root* as sanitized POSIX text.

    If the path cannot be resolved or is not under *root*, fall back to the
    sanitized file name plus a ``#path:<hash>`` suffix.
    """
    try:
        relative = path.resolve().relative_to(root.resolve())
    except (OSError, RuntimeError, ValueError):
        fallback_name = sanitize_path_component(path.name or "path")
        return f"{fallback_name}#path:{display_path_hash(path)}"
    return sanitize_rel_path(relative.as_posix())
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
def raw_rel_path(path: Path, root: Path) -> str | None:
    """Return *path* relative to *root* as unsanitized POSIX text, or ``None``.

    ``None`` is returned when resolution fails or *path* is not under *root*.
    """
    try:
        relative = path.resolve().relative_to(root.resolve())
    except (OSError, RuntimeError, ValueError):
        return None
    return relative.as_posix()
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
def context_surface_for_rel(raw_rel: str, name: str) -> dict[str, str] | None:
    """Classify a context file by the agent surface it belongs to.

    Returns a dict with ``surface``, ``surface_label`` and ``surface_kind``, or
    ``None`` when neither the relative path nor the file name is recognised.
    """
    # Ordered checks: the first surface whose condition holds wins.
    checks = (
        ("claude", name == "CLAUDE.md" or raw_rel.startswith(".claude/")),
        ("codex", name == "AGENTS.md"),
        ("gemini", name == "GEMINI.md"),
        ("cursor", raw_rel == ".cursorrules" or raw_rel.startswith(".cursor/rules/")),
        ("windsurf", raw_rel == ".windsurfrules" or raw_rel.startswith(".windsurf/rules/")),
        ("cline", raw_rel == ".clinerules" or raw_rel.startswith(".clinerules/")),
        ("copilot", raw_rel == ".github/copilot-instructions.md"),
    )
    surface = next((key for key, matched in checks if matched), None)
    if surface is None:
        return None
    return {
        "surface": surface,
        "surface_label": CONTEXT_SURFACE_LABELS.get(surface, surface),
        "surface_kind": "agent_rule",
    }
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
class SettingsFileTooLargeError(ValueError):
    """Signals that a settings file is larger than the permitted read size."""
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
def load_json(path: Path, root: Path) -> tuple[dict[str, Any] | None, str | None]:
    """Load a settings file as a JSON object.

    Returns ``(data, None)`` on success or ``(None, reason)`` on failure, where
    *reason* is a short human-readable string (missing file, invalid JSON,
    oversized file, invalid UTF-8, unreadable file, or a non-object root).
    """
    try:
        raw = read_settings_json_bytes_no_follow(path, root)
        parsed = json.loads(raw.decode("utf-8"))
    except FileNotFoundError:
        # Must precede the generic OSError handler below.
        return None, "missing"
    except json.JSONDecodeError as exc:
        return None, f"invalid JSON at line {exc.lineno}: {exc.msg}"
    except SettingsFileTooLargeError as exc:
        return None, str(exc)
    except UnicodeDecodeError as exc:
        return None, f"invalid UTF-8 near byte {exc.start}"
    except OSError as exc:
        return None, f"unreadable: {format_os_error(exc)}"
    if isinstance(parsed, dict):
        return parsed, None
    return None, "settings root must be a JSON object"
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
def _open_regular_under_root_no_follow(root: Path, path: Path, *, path_kind: str = "settings"):
    """Open a regular file located under *root* without following symlinks.

    The path is walked one component at a time using directory file
    descriptors, and every ``os.open`` call carries ``O_NOFOLLOW``.

    Args:
        root: Project root; it is resolved before being opened.
        path: File to open; must be relative to the resolved or unresolved root.
        path_kind: Word used in error messages (default ``"settings"``).

    Returns:
        A binary file object (``"rb"``) that owns the opened descriptor.

    Raises:
        OSError: If ``O_NOFOLLOW`` or ``dir_fd`` support is missing, the path is
            outside the root, a component is invalid, a parent is not a
            directory, or the final target is not a regular file. Errors from
            ``os.open`` / ``os.fstat`` propagate as well.
    """
    root_resolved = root.resolve()
    # Refuse to run where the platform cannot provide the no-follow guarantees.
    nofollow = getattr(os, "O_NOFOLLOW", 0)
    if not nofollow:
        raise OSError(errno.ENOTSUP, "safe no-follow open is unavailable")
    if os.open not in getattr(os, "supports_dir_fd", set()):
        raise OSError(errno.ENOTSUP, "safe directory-relative open is unavailable")
    # Accept the path relative to either the resolved root or the root as given.
    try:
        relative = path.relative_to(root_resolved)
    except ValueError:
        try:
            relative = path.relative_to(root)
        except ValueError as exc:
            raise OSError(f"{path_kind} path is outside project root") from exc
    parts = relative.parts
    if not parts:
        raise OSError(errno.EINVAL, f"{path_kind} path is missing a file name")
    for component in parts:
        if component in {"", "."} or component == "..":
            raise OSError(errno.EINVAL, f"invalid {path_kind} path component")
    dir_flags = os.O_RDONLY | getattr(os, "O_DIRECTORY", 0) | nofollow
    if hasattr(os, "O_CLOEXEC"):
        dir_flags |= os.O_CLOEXEC
    dir_fd = os.open(root_resolved, dir_flags)
    try:
        if not stat.S_ISDIR(os.fstat(dir_fd).st_mode):
            raise OSError(errno.ENOTDIR, f"{path_kind} root is not a directory")
        # Descend through the parent directories, each opened relative to the previous fd.
        for component in parts[:-1]:
            try:
                next_fd = os.open(component, dir_flags, dir_fd=dir_fd)
            except OSError as exc:
                # ENOTDIR/ELOOP are re-raised with a clearer message, keeping the errno.
                if exc.errno in {errno.ENOTDIR, errno.ELOOP}:
                    raise OSError(exc.errno, f"{path_kind} parent is not a directory") from exc
                raise
            try:
                if not stat.S_ISDIR(os.fstat(next_fd).st_mode):
                    raise OSError(errno.ENOTDIR, f"{path_kind} parent is not a directory")
            except Exception:
                # next_fd is not yet tracked by dir_fd, so close it here before propagating.
                os.close(next_fd)
                raise
            # Hand over: dir_fd now refers to the child; the parent fd is released.
            old_fd = dir_fd
            dir_fd = next_fd
            os.close(old_fd)
        file_flags = os.O_RDONLY
        if hasattr(os, "O_CLOEXEC"):
            file_flags |= os.O_CLOEXEC
        # NOTE(review): O_NONBLOCK presumably keeps the open from hanging on a
        # FIFO/device before the S_ISREG check below — confirm intent.
        if hasattr(os, "O_NONBLOCK"):
            file_flags |= os.O_NONBLOCK
        if nofollow:
            file_flags |= nofollow
        try:
            fd = os.open(parts[-1], file_flags, dir_fd=dir_fd)
        except OSError as exc:
            # With O_NOFOLLOW, ELOOP is reported here as "not a regular file".
            if exc.errno == errno.ELOOP:
                raise OSError(errno.ELOOP, "not a regular file") from exc
            raise
        try:
            opened = os.fstat(fd)
            if not stat.S_ISREG(opened.st_mode):
                raise OSError(errno.EINVAL, "not a regular file")
            handle = os.fdopen(fd, "rb")
            # Ownership moved to the file object; mark fd so the handler below does not close it.
            fd = -1
            return handle
        except Exception:
            if fd != -1:
                os.close(fd)
            raise
    finally:
        # The directory fd is only needed during the walk; always release it.
        if dir_fd != -1:
            os.close(dir_fd)
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
def read_settings_json_bytes_no_follow(path: Path, root: Path) -> bytes:
    """Read a settings file's raw bytes, enforcing ``MAX_SETTINGS_READ_BYTES``.

    The size is checked twice: once from ``fstat`` before reading, and once on
    the bytes actually read (one byte more than the limit is requested).

    Raises:
        SettingsFileTooLargeError: If either check exceeds the limit.
        OSError: Propagated from opening the file.
    """
    with _open_regular_under_root_no_follow(root, path) as handle:
        reported_size = os.fstat(handle.fileno()).st_size
        if reported_size > MAX_SETTINGS_READ_BYTES:
            raise SettingsFileTooLargeError(
                f"settings file is too large ({reported_size} bytes > {MAX_SETTINGS_READ_BYTES})"
            )
        payload = handle.read(MAX_SETTINGS_READ_BYTES + 1)
    if len(payload) > MAX_SETTINGS_READ_BYTES:
        raise SettingsFileTooLargeError(f"settings file is too large (> {MAX_SETTINGS_READ_BYTES} bytes)")
    return payload
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
def iter_values(value: Any) -> Iterable[Any]:
    """Yield every leaf value of a nested dict/list structure, depth-first.

    Dict keys are not yielded; anything that is neither a dict nor a list is
    treated as a leaf.
    """
    if not isinstance(value, (dict, list)):
        yield value
        return
    children = value.values() if isinstance(value, dict) else value
    for child in children:
        yield from iter_values(child)
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
def string_values(value: Any) -> list[str]:
    """Return all string leaves found by ``iter_values`` in *value*, in order."""
    strings: list[str] = []
    for leaf in iter_values(value):
        if isinstance(leaf, str):
            strings.append(leaf)
    return strings
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
def collect_settings(root: Path) -> tuple[list[dict[str, Any]], list[Finding]]:
    """Load the project's Claude settings files.

    Looks at ``.claude/settings.json`` and ``.claude/settings.local.json``.
    Returns the successfully loaded files (as ``{"path", "data"}`` dicts) and
    the findings for unusable or missing settings.
    """
    claude_dir = root / ".claude"
    shared_path = claude_dir / "settings.json"
    candidates = [shared_path, claude_dir / "settings.local.json"]
    # A dangling symlink still counts as "present" so it is reported, not ignored.
    has_project_settings = shared_path.exists() or shared_path.is_symlink()

    loaded: list[dict[str, Any]] = []
    problems: list[Finding] = []
    for candidate in candidates:
        if not (candidate.exists() or candidate.is_symlink()):
            continue
        label = rel_path(candidate, root)
        data, error = load_json(candidate, root)
        if not error:
            assert data is not None
            loaded.append({"path": label, "data": data})
            continue
        severe = "outside project" in error or "invalid JSON" in error
        problems.append(Finding(
            "settings-unreadable",
            "high" if severe else "medium",
            label,
            f"Claude settings could not be used: {error}.",
            "Fix or remove the settings file so token-budget hooks and deny rules are predictable.",
        ))

    if not (loaded and has_project_settings):
        problems.append(Finding(
            "missing-project-settings",
            "medium",
            ".claude/settings.json",
            "No shared project Claude settings file was found.",
            "Add an opt-in project .claude/settings.json with read deny rules, statusline, and Bash output trimming hook.",
        ))
    return loaded, problems
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
def merged_settings(settings: list[dict[str, Any]]) -> dict[str, Any]:
    """Fold the loaded settings files into one effective settings dict.

    Permission lists and list-valued hook events are concatenated in file
    order; ``statusLine``, ``model`` and ``effortLevel`` take the last value
    seen; ``mcpServers`` entries are merged by name with later files winning.
    """
    combined: dict[str, Any] = {"permissions": {"deny": [], "allow": []}, "hooks": {}, "mcpServers": {}}
    merged_hooks = combined["hooks"]
    for record in settings:
        data = record["data"]

        permissions = data.get("permissions")
        if not isinstance(permissions, dict):
            permissions = {}
        for bucket in ("deny", "allow"):
            entries = permissions.get(bucket)
            if isinstance(entries, list):
                # Non-string entries are dropped.
                combined["permissions"][bucket].extend(str(e) for e in entries if isinstance(e, str))

        hooks_section = data.get("hooks")
        if isinstance(hooks_section, dict):
            for event, hooks in hooks_section.items():
                # Lists accumulate; any non-list on either side means "replace".
                if isinstance(hooks, list) and isinstance(merged_hooks.setdefault(event, []), list):
                    merged_hooks[event].extend(hooks)
                else:
                    merged_hooks[event] = hooks

        status_line = data.get("statusLine")
        if isinstance(status_line, dict):
            combined["statusLine"] = status_line
        for passthrough in ("model", "effortLevel"):
            if passthrough in data:
                combined[passthrough] = data[passthrough]

        servers = data.get("mcpServers")
        if isinstance(servers, dict):
            combined["mcpServers"].update(servers)
    return combined
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
READ_TARGET_RE = re.compile(r"(?i)^\s*Read\((?P<target>.*)\)\s*$")
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
def normalize_read_target(value: str) -> str:
    """Normalize the target inside a ``Read(...)`` permission entry for comparison.

    Surrounding whitespace and quotes are removed, backslashes become ``/``,
    repeated slashes are collapsed, leading ``./`` prefixes are dropped and a
    trailing slash is trimmed. An empty result is reported as ``"."``.
    """
    target = value.strip().strip('"').strip("'").replace("\\", "/")
    # Collapse slashes BEFORE stripping "./": otherwise ".//dist/**" would lose
    # its "./" and be left as the absolute-looking "/dist/**".
    target = re.sub(r"/+", "/", target)
    while target.startswith("./"):
        target = target[2:]
    return target.rstrip("/") or "."
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
def parse_read_targets(deny_entries: list[str]) -> list[str]:
    """Extract and normalize the targets of all ``Read(...)`` entries.

    Entries that do not match ``READ_TARGET_RE`` are skipped.
    """
    matches = (READ_TARGET_RE.match(entry) for entry in deny_entries)
    return [normalize_read_target(found.group("target")) for found in matches if found]
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
def path_target_denied(deny_entries: list[str], recommended: str) -> bool:
    """Tell whether *recommended* is already covered by the configured denies.

    A deny entry covers the recommendation when it is a catch-all, is the same
    normalized target, or is a ``<base>/**`` pattern whose base equals or
    contains the recommended target. A recommendation that is itself ``*`` or
    ``**``, or is not a ``Read(...)`` rule, is never considered covered.
    """
    required = parse_read_targets([recommended])
    if not required:
        return False
    wanted = required[0]
    if wanted in {"**", "*"}:
        return False

    catch_all = {"**", "*", "./**", "."}

    def covers(target: str) -> bool:
        if target in catch_all or target == wanted:
            return True
        if target.endswith("/**"):
            base = target[:-3].rstrip("/")
            if wanted == base or wanted.startswith(base + "/"):
                return True
        return target == "~/**" and wanted.startswith("~/")

    return any(covers(target) for target in parse_read_targets(deny_entries))
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
def project_path_exists(root: Path, rel: str) -> bool:
    """Report whether the project contains the path described by *rel*.

    ``".env.*"`` is treated as a pattern: it is true when any existing direct
    child of *root* has a name starting with ``.env.``. Every other value is
    checked as a literal path under *root*.

    A missing or unlistable *root* yields ``False`` for the pattern case, in
    line with the literal case, instead of raising.
    """
    if rel == ".env.*":
        try:
            # Filter by name first so only candidate entries are stat'ed.
            return any(
                entry.name.startswith(".env.") and entry.exists()
                for entry in root.iterdir()
            )
        except OSError:
            return False
    return (root / rel).exists()
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
def generic_context_pattern(rel: str) -> str:
    """Return an ignore-style pattern for *rel*.

    Known dotfiles and ``*.*``-suffixed patterns are returned unchanged, other
    wildcard patterns have ``./`` removed, and plain paths get ``/**`` appended.
    """
    literal_files = {".env", ".npmrc", ".pypirc", ".netrc"}
    if rel in literal_files or rel.endswith(".*"):
        return rel
    if "*" not in rel:
        return f"{rel.rstrip('/')}/**"
    return rel.replace("./", "")
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
def context_exclusion_recommendation(
    *,
    label: str,
    rel: str,
    recommended: str,
    category: str,
    severity: str,
    deny_entries: list[str],
) -> dict[str, Any]:
    """Build one exclusion recommendation record for a project path.

    ``status`` reflects whether *recommended* is already covered by
    *deny_entries*; ``reason`` depends on whether *category* is ``"sensitive"``.
    """
    if path_target_denied(deny_entries, recommended):
        status = "already_denied"
    else:
        status = "missing"

    if category == "sensitive":
        reason = "Sensitive local file should not be read into AI-agent context."
    else:
        reason = "Bulky generated/cache path should stay out of AI-agent context."

    return {
        "id": f"context-exclude-{safe_id_part(label)}",
        "severity": severity,
        "path": rel,
        "category": category,
        "status": status,
        "reason": reason,
        "recommended_deny": recommended,
        "generic_pattern": generic_context_pattern(rel),
        "applies_to": ["claude-permissions.deny", "agent-ignore-advisory"],
        "surfaces": ["Claude Code permissions.deny", "generic agent ignore/exclude rules"],
    }
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
def build_context_exclusion_recommendations(root: Path, deny_entries: list[str]) -> list[dict[str, Any]]:
    """List exclusion recommendations for heavy and sensitive paths present in *root*.

    The result is sorted by severity (via ``SEVERITY_ORDER``) and then by id.
    """
    # (deny table, category, severity) — heavy paths first, then sensitive ones.
    tables = (
        (HEAVY_PROJECT_DENIES, "generated_cache", "medium"),
        (SENSITIVE_PROJECT_DENIES, "sensitive", "high"),
    )
    recommendations: list[dict[str, Any]] = [
        context_exclusion_recommendation(
            label=label,
            rel=rel,
            recommended=recommended,
            category=category,
            severity=severity,
            deny_entries=deny_entries,
        )
        for table, category, severity in tables
        for label, rel, recommended in table
        if project_path_exists(root, rel)
    ]
    recommendations.sort(key=lambda item: (SEVERITY_ORDER.get(str(item["severity"]), 99), item["id"]))
    return recommendations
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
def scan_settings(root: Path, settings: list[dict[str, Any]]) -> tuple[dict[str, Any], list[Finding]]:
    """Audit the merged Claude settings and return ``(summary, findings)``.

    Checks performed, in order: missing Read denies for heavy project paths,
    sensitive project paths and home credential directories; missing Bash trim
    hook, large-read guard and statusline; a broad Read allow rule; an Opus
    default model; a high default effort level; and a large number of MCP
    servers.

    Args:
        root: Project root used to test which recommended paths exist.
        settings: Loaded settings files as ``{"path", "data"}`` dicts.

    Returns:
        A summary dict of the effective settings and the list of findings.
    """
    findings: list[Finding] = []
    merged = merged_settings(settings)
    deny_entries = merged["permissions"]["deny"]
    allow_entries = merged["permissions"]["allow"]

    for label, rel, recommended in HEAVY_PROJECT_DENIES:
        if project_path_exists(root, rel) and not path_target_denied(deny_entries, recommended):
            findings.append(Finding(
                f"missing-deny-{safe_id_part(label)}",
                "medium",
                rel,
                f"Bulky generated/cache path `{rel}` exists but is not denied from Read.",
                f"Add `{recommended}` to permissions.deny to avoid accidental large reads.",
                {"recommended_deny": recommended},
            ))

    for label, rel, recommended in SENSITIVE_PROJECT_DENIES:
        if project_path_exists(root, rel) and not path_target_denied(deny_entries, recommended):
            findings.append(Finding(
                f"missing-sensitive-deny-{safe_id_part(label)}",
                "high",
                rel,
                f"Sensitive project path `{rel}` exists but is not denied from Read.",
                f"Add `{recommended}` to permissions.deny; do not send secrets to Claude context.",
                {"recommended_deny": recommended},
            ))

    # Home credential directories are flagged whether or not they exist locally.
    for label, recommended in SENSITIVE_HOME_DENIES:
        if not path_target_denied(deny_entries, recommended):
            findings.append(Finding(
                f"missing-home-deny-{safe_id_part(label)}",
                "low",
                label,
                f"Home credential path `{label}` is not explicitly denied.",
                f"Add `{recommended}` to permissions.deny as a guardrail against accidental credential reads.",
                {"recommended_deny": recommended},
            ))

    # Evaluate each detector once; the results feed both findings and summary.
    bash_trim_hook = has_bash_trim_hook(merged)
    large_read_guard = has_large_read_guard(merged)
    statusline = has_statusline(merged)

    if not bash_trim_hook:
        findings.append(Finding(
            "missing-bash-trim-hook",
            "medium",
            ".claude/settings.json",
            "No PreToolUse Bash hook for trimming noisy test/build/lint output was detected.",
            "Install the example hook using context-guard-rewrite-bash or rewrite_bash_for_token_budget.py.",
        ))

    if not large_read_guard:
        findings.append(Finding(
            "missing-large-read-guard",
            "medium",
            ".claude/settings.json",
            "No PreToolUse Read hook for blocking large whole-file reads was detected.",
            "Install context-guard-guard-read so Claude is nudged toward context-guard-read-symbol or line-range reads before large files enter context.",
        ))

    if not statusline:
        findings.append(Finding(
            "missing-token-statusline",
            "low",
            ".claude/settings.json",
            "No token/cost/context statusline command was detected.",
            "Add context-guard-statusline so context and cost pressure stay visible during a session.",
        ))

    # Only the first broad allow rule is reported.
    for entry in allow_entries:
        if any(target in {"**", "*", "."} for target in parse_read_targets([entry])):
            findings.append(Finding(
                "broad-read-allow",
                "medium",
                ".claude/settings.json",
                "A broad Read allow rule can make accidental large reads more likely.",
                "Prefer narrow allow rules plus explicit deny entries for generated and secret paths.",
                {"allow_entry": entry},
            ))
            break

    model = str(merged.get("model", "")).lower()
    if "opus" in model:
        findings.append(Finding(
            "opus-default-model",
            "medium",
            ".claude/settings.json",
            "Default model appears to be Opus, which can burn scarce premium tokens on routine work.",
            "Use Sonnet as the default and reserve Opus/opusplan for planning or high-risk reasoning.",
            {"model": merged.get("model")},
        ))

    effort = str(merged.get("effortLevel", "")).lower()
    if effort in {"high", "max", "maximum"}:
        findings.append(Finding(
            "high-default-effort",
            "low",
            ".claude/settings.json",
            "Default effort is high, which can increase token burn on routine edits.",
            "Use medium/low by default and raise effort only for hard design/debugging work.",
            {"effortLevel": merged.get("effortLevel")},
        ))

    mcp_servers = merged.get("mcpServers") if isinstance(merged.get("mcpServers"), dict) else {}
    # Use the shared module constant rather than a duplicated literal.
    if len(mcp_servers) >= DEFAULT_MCP_SERVER_THRESHOLD:
        findings.append(Finding(
            "many-mcp-servers",
            "low",
            ".claude/settings.json",
            "Many MCP servers are configured; tool schemas and discovery can add startup/context overhead.",
            "Disable unused MCP servers for Claude sessions that do not need them.",
            {"mcp_server_count": len(mcp_servers), "mcp_servers": sorted(mcp_servers)[:20]},
        ))

    settings_summary = {
        "files": [item["path"] for item in settings],
        "deny_count": len(deny_entries),
        "allow_count": len(allow_entries),
        "has_bash_trim_hook": bash_trim_hook,
        "has_large_read_guard": large_read_guard,
        "has_statusline": statusline,
        "mcp_server_count": len(mcp_servers),
        "model": merged.get("model"),
        "effortLevel": merged.get("effortLevel"),
    }
    return settings_summary, findings
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
def has_bash_trim_hook(settings: dict[str, Any]) -> bool:
    """Return True when a Bash-applicable PreToolUse hook runs a trim command.

    Walks ``settings["hooks"]["PreToolUse"]``, skips entries whose string
    ``matcher`` does not apply to Bash, and searches the strings gathered from
    the entry's ``hooks``, ``command`` and ``commands`` fields for any
    ``BASH_TRIM_COMMAND_MARKERS`` substring.  Settings with a non-dict
    ``hooks`` value or a non-list ``PreToolUse`` value yield False.
    """
    hook_config = settings.get("hooks")
    if not isinstance(hook_config, dict):
        return False
    pre_tool_entries = hook_config.get("PreToolUse")
    if not isinstance(pre_tool_entries, list):
        return False
    for hook_entry in pre_tool_entries:
        if not isinstance(hook_entry, dict):
            continue
        matcher = hook_entry.get("matcher")
        # Only a string matcher can rule an entry out; a missing or
        # non-string matcher leaves the entry in play.
        if isinstance(matcher, str) and not matcher_applies_to_bash(matcher):
            continue
        command_groups = [
            string_values(hook_entry.get(field))
            for field in ("hooks", "command", "commands")
        ]
        for group in command_groups:
            for command in group:
                if any(marker in command for marker in BASH_TRIM_COMMAND_MARKERS):
                    return True
    return False
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
def matcher_applies_to_bash(matcher: str) -> bool:
    """Return True when any ``|``-separated alternative selects Bash.

    An alternative matches when, after trimming and lower-casing, it is the
    empty string, ``*`` or ``bash``.
    """
    accepted = {"", "*", "bash"}
    for alternative in matcher.split("|"):
        if alternative.strip().lower() in accepted:
            return True
    return False
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
def has_large_read_guard(settings: dict[str, Any]) -> bool:
    """Return True when a Read-applicable PreToolUse hook runs a read guard.

    Inspects ``settings["hooks"]["PreToolUse"]``; entries whose string
    ``matcher`` does not apply to Read are ignored.  The strings collected
    from an entry's ``hooks``, ``command`` and ``commands`` fields are
    searched for any ``LARGE_READ_GUARD_COMMAND_MARKERS`` substring.
    """
    hook_config = settings.get("hooks")
    pre_tool_entries = hook_config.get("PreToolUse") if isinstance(hook_config, dict) else None
    if not isinstance(pre_tool_entries, list):
        return False

    def mentions_guard(command: Any) -> bool:
        return any(marker in command for marker in LARGE_READ_GUARD_COMMAND_MARKERS)

    for hook_entry in pre_tool_entries:
        if not isinstance(hook_entry, dict):
            continue
        matcher = hook_entry.get("matcher")
        # A matcher that is absent or not a string never excludes the entry.
        if isinstance(matcher, str) and not matcher_applies_to_read(matcher):
            continue
        from_hooks = string_values(hook_entry.get("hooks"))
        from_command = string_values(hook_entry.get("command"))
        from_commands = string_values(hook_entry.get("commands"))
        for group in (from_hooks, from_command, from_commands):
            for command in group:
                if mentions_guard(command):
                    return True
    return False
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
def matcher_applies_to_read(matcher: str) -> bool:
    """Return True when any ``|``-separated alternative selects Read.

    An alternative matches when, after trimming and lower-casing, it is the
    empty string, ``*`` or ``read``.
    """
    accepted = {"", "*", "read"}
    normalized_alternatives = (piece.strip().lower() for piece in matcher.split("|"))
    return not accepted.isdisjoint(normalized_alternatives)
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
def has_statusline(settings: dict[str, Any]) -> bool:
    """Return True when ``settings["statusLine"]["command"]`` names a known statusline.

    The command must be a string containing at least one
    ``STATUSLINE_COMMAND_MARKERS`` substring; any other shape yields False.
    """
    status_config = settings.get("statusLine")
    if isinstance(status_config, dict):
        command = status_config.get("command")
        if isinstance(command, str):
            return any(marker in command for marker in STATUSLINE_COMMAND_MARKERS)
    return False
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
def should_scan_context_file(path: Path, root: Path) -> bool:
    """Decide whether ``path`` counts as a context-like file under ``root``.

    A file qualifies when its name is in ``CONTEXT_FILE_NAMES``, when its raw
    root-relative path is in ``CONTEXT_EXACT_REL_FILES``, or when it is a
    ``.md`` file whose sanitized relative path sits below one of the
    ``CONTEXT_MD_DIRS`` prefixes.  Paths for which ``raw_rel_path`` returns
    None are rejected.
    """
    if path.name in CONTEXT_FILE_NAMES:
        return True
    relative = raw_rel_path(path, root)
    if relative is None:
        return False
    if relative in CONTEXT_EXACT_REL_FILES:
        return True
    sanitized = sanitize_rel_path(relative)
    for prefix in CONTEXT_MD_DIRS:
        if sanitized.startswith(prefix + "/") and path.suffix.lower() == ".md":
            return True
    return False
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
def iter_context_files(root: Path) -> Iterable[Path]:
    """Yield context-like files below ``root`` without following symlinks.

    Directories named in ``EXCLUDED_DIR_NAMES`` and symlinked directories are
    pruned from the walk; symlinked files are skipped.  Remaining files are
    yielded when ``should_scan_context_file`` accepts them.
    """
    for dirpath, subdirs, files in os.walk(root, followlinks=False):
        here = Path(dirpath)
        kept: list[str] = []
        for subdir in subdirs:
            if subdir in EXCLUDED_DIR_NAMES:
                continue
            if (here / subdir).is_symlink():
                continue
            kept.append(subdir)
        # In-place assignment is what tells os.walk not to descend further.
        subdirs[:] = kept
        for filename in files:
            candidate = here / filename
            if candidate.is_symlink():
                continue
            if should_scan_context_file(candidate, root):
                yield candidate
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
def read_text_prefix(path: Path, limit: int = MAX_CONTEXT_READ_BYTES, *, root: Path | None = None) -> tuple[str, bool]:
    """Read at most ``limit`` bytes of ``path`` and decode them as UTF-8.

    When ``root`` is given the file is opened through
    ``_open_regular_under_root_no_follow``; otherwise through
    ``open_regular_no_follow``.  Undecodable bytes are replaced.

    Returns ``(text, truncated)`` where ``truncated`` is True when the file
    held more than ``limit`` bytes.
    """
    if root is None:
        source = open_regular_no_follow(path)
    else:
        source = _open_regular_under_root_no_follow(root, path, path_kind="context")
    with source as stream:
        # One extra byte is requested purely to detect truncation.
        raw = stream.read(limit + 1)
    if len(raw) > limit:
        return raw[:limit].decode("utf-8", "replace"), True
    return raw.decode("utf-8", "replace"), False
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
def file_contains_secret(
    path: Path,
    chunk_bytes: int = 64_000,
    *,
    root: Path | None = None,
    max_total_bytes: int = MAX_SECRET_SCAN_BYTES,
) -> bool:
    """Scan ``path`` in chunks for text matching ``SECRET_CONTENT_RE``.

    At most ``max_total_bytes`` bytes are read, ``chunk_bytes`` at a time.
    The last 512 characters of each decoded window are carried into the next
    one so a match straddling a chunk boundary can still be found.

    Returns True on the first match, False when the file ends or the byte
    budget is exhausted without one.
    """
    if root is None:
        source = open_regular_no_follow(path)
    else:
        source = _open_regular_under_root_no_follow(root, path, path_kind="context")
    budget = max_total_bytes
    tail = ""
    with source as stream:
        while budget > 0:
            chunk = stream.read(min(chunk_bytes, budget))
            if not chunk:
                break
            budget -= len(chunk)
            window = tail + chunk.decode("utf-8", "replace")
            if SECRET_CONTENT_RE.search(window):
                return True
            tail = window[-512:]
    return False
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
def open_regular_no_follow(path: Path):
    """Open ``path`` for binary reading only if it is, and stays, a regular file.

    The path is lstat'ed before and after opening and the descriptor is
    fstat'ed; all three must describe the same regular file.  ``O_NOFOLLOW``
    and ``O_NONBLOCK`` are added to the open flags when the platform defines
    them.

    Raises:
        OSError: when the path is not a regular file, changes identity around
            the open, or cannot be opened.
    """
    pre_open = os.lstat(path)
    if not stat.S_ISREG(pre_open.st_mode):
        raise OSError("not a regular file")
    open_flags = os.O_RDONLY | getattr(os, "O_NONBLOCK", 0) | getattr(os, "O_NOFOLLOW", 0)
    descriptor = os.open(path, open_flags)
    try:
        via_descriptor = os.fstat(descriptor)
        post_open = os.lstat(path)
        unchanged = (
            stat.S_ISREG(via_descriptor.st_mode)
            and stat.S_ISREG(post_open.st_mode)
            and os.path.samestat(pre_open, via_descriptor)
            and os.path.samestat(post_open, via_descriptor)
        )
        if not unchanged:
            raise OSError("not a regular file")
        return os.fdopen(descriptor, "rb")
    except Exception:
        # The descriptor is still ours until fdopen succeeds; release it.
        os.close(descriptor)
        raise
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
def format_os_error(exc: OSError) -> str:
    """Render ``exc`` as its strerror (or class name) plus the errno if set."""
    description = exc.strerror if exc.strerror else exc.__class__.__name__
    code = exc.errno
    if code is None:
        return description
    return f"{description} (errno {code})"
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
def scan_context(root: Path, large_bytes: int, huge_bytes: int, long_lines: int) -> tuple[list[dict[str, Any]], list[Finding]]:
    """Summarise context-like files under ``root`` and flag costly ones.

    Files come from ``iter_context_files`` in relative-path order.  Each
    readable regular file contributes a summary dict (path, byte size, sampled
    line count, truncation flag, code-fence count, plus any surface metadata
    from ``context_surface_for_rel``).  Findings are raised for non-regular or
    unreadable files, for files at or above ``huge_bytes``, for files at or
    above ``large_bytes`` or ``long_lines`` sampled lines, for 12 or more code
    fences, and for credential-shaped content.

    Returns:
        ``(context_files, findings)``.
    """
    summaries: list[dict[str, Any]] = []
    findings: list[Finding] = []

    def merged_with_surface(payload: dict[str, Any], surface: Any) -> dict[str, Any]:
        # Surface metadata is appended after the base keys so key order in
        # the report is unchanged.
        if surface is not None:
            payload.update(surface)
        return payload

    for path in sorted(iter_context_files(root), key=lambda p: rel_path(p, root)):
        rel = rel_path(path, root)
        surface = context_surface_for_rel(raw_rel_path(path, root) or rel, path.name)
        try:
            file_stat = path.lstat()
            if not stat.S_ISREG(file_stat.st_mode):
                findings.append(context_finding(
                    "context-not-regular",
                    "medium",
                    rel,
                    "Context-like path is not a regular file.",
                    "Replace it with a regular markdown file or remove it from always-loaded context.",
                ))
                continue
            size = file_stat.st_size
            text, sample_truncated = read_text_prefix(path, root=root)
            contains_secret = file_contains_secret(path, root=root)
        except OSError as exc:
            findings.append(context_finding(
                "context-unreadable",
                "low",
                rel,
                f"Context-like file could not be read: {format_os_error(exc)}.",
                "Check file permissions or remove stale symlinks.",
            ))
            continue

        sampled_lines = text.count("\n") + (1 if text else 0)
        fence_count = text.count("```")
        summaries.append(merged_with_surface(
            {
                "path": rel,
                "bytes": size,
                "sampled_lines": sampled_lines,
                "sample_truncated": sample_truncated,
                "code_fences": fence_count,
            },
            surface,
        ))

        if size >= huge_bytes:
            evidence = merged_with_surface({"bytes": size, "threshold_bytes": huge_bytes}, surface)
            findings.append(context_finding(
                "huge-context-file",
                "high",
                rel,
                f"Context-like file is very large ({size} bytes).",
                "Move long procedures/logs/examples into opt-in skills or commands and keep only a short index in always-loaded context.",
                evidence,
            ))
        elif size >= large_bytes or sampled_lines >= long_lines:
            evidence = merged_with_surface(
                {"bytes": size, "large_bytes": large_bytes, "sampled_lines": sampled_lines, "long_lines": long_lines},
                surface,
            )
            findings.append(context_finding(
                "large-context-file",
                "medium",
                rel,
                f"Context-like file is large ({size} bytes, sampled {sampled_lines} lines).",
                "Trim stable instructions, move volatile or lengthy material to skills/custom commands, and keep examples short.",
                evidence,
            ))
        if fence_count >= 12:
            findings.append(context_finding(
                "context-heavy-code-fences",
                "low",
                rel,
                "Context-like file contains many code fences, which can inflate startup context.",
                "Replace long embedded examples with links or opt-in command/skill files.",
                {"code_fences": fence_count},
            ))
        if contains_secret:
            findings.append(context_finding(
                "secret-like-context-content",
                "high",
                rel,
                "Context-like file contains credential-shaped text.",
                "Remove secrets from prompt context and rotate exposed credentials if this file was shared.",
            ))
    return summaries, findings
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
def bounded_top(value: int) -> int:
    """Coerce ``value`` to int and clamp it into the range 1..200."""
    requested = int(value)
    if requested > 200:
        return 200
    if requested < 1:
        return 1
    return requested
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
def path_text_label(path_text: str, show_paths: bool) -> str:
    """Return a report label for ``path_text``.

    With ``show_paths`` the sanitized path text is returned as-is.  Otherwise
    the label is the sanitized final path component (or ``path`` when there is
    none) followed by ``#path:`` and a hash of the sanitized text.
    """
    cleaned = sanitize_path_text(str(path_text))
    if not show_paths:
        basename = sanitize_path_component(Path(cleaned).name or "path")
        return f"{basename}#path:{text_hash(cleaned)}"
    return cleaned
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
def safe_report_label(value: Any, limit: int = MAX_REPORT_LABEL_CHARS) -> str:
    """Turn ``value`` into a single-line, redacted, length-capped label.

    Whitespace runs are collapsed to single spaces, the text goes through
    ``sanitize_path_text``, and ``SECRET_CONTENT_RE`` matches are replaced by
    ``[REDACTED]``.  Text longer than ``limit`` is cut and suffixed with a
    marker that records the untrimmed length.
    """
    collapsed = " ".join(str(value or "").split())
    redacted = SECRET_CONTENT_RE.sub("[REDACTED]", sanitize_path_text(collapsed))
    total = len(redacted)
    if total > limit:
        suffix = f"…[trimmed:{total} chars]"
        keep = max(0, limit - len(suffix))
        return redacted[:keep] + suffix
    return redacted
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
def json_byte_len(value: Any) -> int:
    """Return the UTF-8 byte length of ``value`` as compact, key-sorted JSON."""
    compact = json.dumps(
        value,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    encoded = compact.encode("utf-8", "replace")
    return len(encoded)
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
def iter_project_files(root: Path, suffixes: set[str], max_files: int) -> Iterable[Path]:
    """Yield non-symlink files under ``root`` whose lower-cased suffix is in ``suffixes``.

    Directories listed in ``EXCLUDED_DIR_NAMES`` and symlinked directories are
    pruned.  Iteration stops once ``max_files`` files have been yielded; the
    limit is checked after each yield.
    """
    emitted = 0
    for dirpath, subdirs, files in os.walk(root, followlinks=False):
        here = Path(dirpath)
        kept: list[str] = []
        for subdir in subdirs:
            if subdir not in EXCLUDED_DIR_NAMES and not (here / subdir).is_symlink():
                kept.append(subdir)
        # Mutating the list in place prunes the walk.
        subdirs[:] = kept
        for filename in files:
            candidate = here / filename
            if candidate.is_symlink():
                continue
            if candidate.suffix.lower() not in suffixes:
                continue
            yield candidate
            emitted += 1
            if emitted >= max_files:
                return
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
def walk_json(value: Any) -> Iterable[dict[str, Any]]:
    """Yield every dict reachable from ``value`` through dict values and lists.

    Traversal is iterative with an explicit stack, so dicts are yielded in
    last-in-first-out order; scalars are ignored.
    """
    pending = [value]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
        elif isinstance(node, dict):
            yield node
            pending.extend(node.values())
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
def normalize_rule_unit(line: str, min_chars: int) -> str | None:
    """Normalise one line of context text into a comparable rule unit.

    A leading bullet/quote marker and a leading ``N.``/``N)`` counter are
    removed, whitespace is collapsed, and the result is lower-cased.

    Returns None for blank lines, bare code-fence or ``---`` lines, and for
    results shorter than ``min_chars`` characters or six words.
    """
    candidate = line.strip()
    if candidate in {"", "```", "---"}:
        return None
    for leading_marker in (r"^[-*+>]\s+", r"^\d+[.)]\s+"):
        candidate = re.sub(leading_marker, "", candidate)
    candidate = re.sub(r"\s+", " ", candidate).strip().lower()
    too_short = len(candidate) < min_chars or len(candidate.split()) < 6
    return None if too_short else candidate
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
def scan_duplicate_rules(root: Path, *, min_chars: int, top: int) -> tuple[list[dict[str, Any]], list[Finding]]:
    """Find normalised rule lines that recur across different context files.

    Every line of every readable context file is normalised with
    ``normalize_rule_unit``; a unit seen at least twice and in at least two
    distinct files becomes a group and a ``duplicate-context-rule`` finding
    (severity ``medium`` from four occurrences, ``low`` below).  Only a hash of
    the rule text is reported, never the text itself.

    Returns:
        ``(groups, findings)``, each sorted and truncated to ``top`` entries.
    """
    seen: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for path in sorted(iter_context_files(root), key=lambda p: rel_path(p, root)):
        rel = rel_path(path, root)
        try:
            text, truncated = read_text_prefix(path, root=root)
        except OSError:
            continue
        for line_no, line in enumerate(text.splitlines(), 1):
            unit = normalize_rule_unit(line, min_chars)
            if unit is not None:
                seen[unit].append({"path": rel, "line": line_no, "sample_truncated": truncated})

    groups: list[dict[str, Any]] = []
    findings: list[Finding] = []
    for unit, hits in seen.items():
        distinct_paths = sorted({hit["path"] for hit in hits})
        if len(hits) < 2 or len(distinct_paths) < 2:
            continue
        fingerprint = text_hash(unit)
        severity = "low" if len(hits) < 4 else "medium"
        group = {
            "fingerprint": fingerprint,
            "occurrence_count": len(hits),
            "path_count": len(distinct_paths),
            "paths": distinct_paths[:top],
            "sample_chars": len(unit),
            "confidence": "observed",
        }
        groups.append(group)
        findings.append(Finding(
            f"duplicate-context-rule-{fingerprint}",
            severity,
            "context-rules",
            "A normalized instruction/rule unit appears in multiple context-like files.",
            "Keep one canonical copy and replace duplicates with a short pointer if the rule is still needed.",
            group,
            rule_id="duplicate-context-rule",
            instance_id=f"duplicate-context-rule-{fingerprint}",
        ))
    groups.sort(key=lambda entry: (-entry["occurrence_count"], entry["fingerprint"]))
    findings.sort(key=lambda finding: (SEVERITY_ORDER.get(finding.severity, 99), finding.id))
    return groups[:top], findings[:top]
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
def assigned_all_names(tree: ast.AST) -> set[str]:
    """Collect string names listed in literal ``__all__`` definitions.

    Recognises plain assignment (``__all__ = [...]``), annotated assignment
    (``__all__: list[str] = [...]``) and in-place extension
    (``__all__ += [...]``), as long as the right-hand side is a list or tuple
    literal.  Non-string elements are ignored.

    Args:
        tree: Parsed module (or any AST node) to search.

    Returns:
        The set of exported names found; empty when there are none.
    """
    names: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            targets = node.targets
        elif isinstance(node, ast.AnnAssign):
            targets = [node.target]
        elif isinstance(node, ast.AugAssign) and isinstance(node.op, ast.Add):
            targets = [node.target]
        else:
            continue
        literal = node.value  # None for a bare annotation such as ``__all__: list``
        if not isinstance(literal, (ast.List, ast.Tuple)):
            continue
        if not any(isinstance(target, ast.Name) and target.id == "__all__" for target in targets):
            continue
        names.update(
            element.value
            for element in literal.elts
            if isinstance(element, ast.Constant) and isinstance(element.value, str)
        )
    return names
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
def scan_python_imports(root: Path, *, top: int, max_files: int) -> tuple[dict[str, Any], list[Finding]]:
    """Flag Python imports whose bound name is never referenced in the file.

    Each ``.py`` file from ``iter_project_files`` is read (bounded by
    ``MAX_CONTEXT_READ_BYTES``) and parsed.  Imported names that start with an
    underscore, ``__future__`` imports and star imports are ignored.  A name
    counts as used when it appears as an ``ast.Name`` anywhere in the module or
    in a literal ``__all__``.  Scanning stops once ``top`` findings exist.

    Returns:
        ``(summary, findings)`` where ``summary`` holds ``files_scanned``,
        ``parse_errors`` and the serialised ``unused_imports``.
    """
    findings: list[Finding] = []
    scanned = 0
    failures = 0
    for path in iter_project_files(root, {".py"}, max_files):
        scanned += 1
        rel = rel_path(path, root)
        try:
            source, _ = read_text_prefix(path, limit=MAX_CONTEXT_READ_BYTES, root=root)
            tree = ast.parse(source, filename=rel)
        except (OSError, SyntaxError, ValueError):
            failures += 1
            continue

        # (bound name, line number, dotted module path) per import binding.
        bindings: list[tuple[str, int, str]] = []
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    bound = alias.asname or alias.name.split(".", 1)[0]
                    if not bound.startswith("_"):
                        bindings.append((bound, node.lineno, alias.name))
            elif isinstance(node, ast.ImportFrom):
                if node.module == "__future__":
                    continue
                for alias in node.names:
                    if alias.name == "*":
                        continue
                    bound = alias.asname or alias.name
                    if bound.startswith("_"):
                        continue
                    dotted = f"{node.module or ''}.{alias.name}".strip(".")
                    bindings.append((bound, node.lineno, dotted))
        if not bindings:
            continue

        referenced = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}
        referenced = referenced | assigned_all_names(tree)
        for name, line, module in bindings:
            if name in referenced:
                continue
            instance = f"stale-python-import-{text_hash(f'{rel}:{line}:{name}')}"
            findings.append(Finding(
                instance,
                "low",
                rel,
                f"Python import `{name}` appears unused in static AST analysis.",
                "Review before removing; dynamic imports, re-exports, and type-checking paths can make this a false positive.",
                {"imported_name": name, "module": module, "line": line, "confidence": "advisory-static-ast"},
                rule_id="stale-python-import",
                instance_id=instance,
            ))
            if len(findings) >= top:
                break
        if len(findings) >= top:
            break
    summary = {
        "files_scanned": scanned,
        "parse_errors": failures,
        "unused_imports": [finding.as_dict() for finding in findings],
    }
    return summary, findings
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
def iter_skill_files(root: Path, max_files: int) -> Iterable[Path]:
    """Yield ``SKILL.md`` files that live somewhere under a ``skills`` directory.

    Candidates come from ``iter_project_files`` restricted to ``.md`` files and
    capped at ``max_files``; at most ``max_files`` skill files are yielded.
    """
    emitted = 0
    for candidate in iter_project_files(root, {".md"}, max_files):
        if candidate.name != "SKILL.md":
            continue
        if "skills" not in candidate.parts:
            continue
        yield candidate
        emitted += 1
        if emitted >= max_files:
            return
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
def safe_read_reference_text(path: Path, root: Path) -> str:
    """Return up to 128,000 bytes of ``path`` as lower-cased text, or "" on OSError."""
    try:
        text, _ = read_text_prefix(path, limit=128_000, root=root)
    except OSError:
        return ""
    return text.lower()
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
def scan_unused_skills(root: Path, *, top: int, max_files: int) -> tuple[dict[str, Any], list[Finding]]:
    """List skills whose directory name is not mentioned in other project text.

    The skill name is the parent directory of each ``SKILL.md``.  A skill
    counts as referenced when its lower-cased name (bare, ``/name`` or
    ``context-guard:name``) occurs in the lower-cased text of any other
    reference file (``TEXT_REFERENCE_SUFFIXES``, excluding ``SKILL.md`` files).
    Unreferenced skills become low-severity advisory findings; scanning stops
    after ``top`` findings.

    Returns:
        ``(summary, findings)`` with scan counts and the candidate list.
    """
    skill_files = list(iter_skill_files(root, max_files))
    reference_files = [
        candidate
        for candidate in iter_project_files(root, TEXT_REFERENCE_SUFFIXES, max_files)
        if candidate.name != "SKILL.md"
    ]
    reference_text = {ref: safe_read_reference_text(ref, root) for ref in reference_files}

    findings: list[Finding] = []
    candidates: list[dict[str, Any]] = []
    for skill in skill_files:
        skill_name = skill.parent.name
        lowered = skill_name.lower()
        needles = {lowered, f"/{lowered}", f"context-guard:{lowered}"}
        mentioned = any(
            ref != skill and any(needle in text for needle in needles)
            for ref, text in reference_text.items()
        )
        if mentioned:
            continue
        rel = rel_path(skill, root)
        candidate = {"path": rel, "skill": safe_report_label(skill_name), "reference_count": 0, "confidence": "low-advisory"}
        candidates.append(candidate)
        instance = f"unused-skill-candidate-{text_hash(rel)}"
        findings.append(Finding(
            instance,
            "low",
            rel,
            "Skill file has no obvious project-local references outside its own SKILL.md.",
            "Confirm real usage through plugin manifests, user docs, or runtime telemetry before deleting or renaming it.",
            candidate,
            rule_id="unused-skill-candidate",
            instance_id=instance,
        ))
        if len(findings) >= top:
            break
    summary = {
        "skills_scanned": len(skill_files),
        "reference_files_scanned": len(reference_files),
        "unused_candidates": candidates[:top],
    }
    return summary, findings
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
def read_json_file_limited(path: Path, max_bytes: int) -> tuple[Any | None, str | None, int]:
    """Load JSON from ``path`` if it is a regular file of at most ``max_bytes``.

    The file is opened through ``open_regular_no_follow``.  Its size is checked
    via fstat first and again after reading, so a file that grows between the
    two steps is still rejected.

    Args:
        path: File to read.
        max_bytes: Largest accepted file size in bytes.

    Returns:
        ``(value, error, size)``.  On success ``error`` is None and ``size`` is
        the number of bytes read.  On failure ``value`` is None and ``error``
        describes the reason (oversized, invalid JSON, excessive nesting, or
        unreadable); ``size`` is the observed size for oversized files and 0
        otherwise.
    """
    try:
        with open_regular_no_follow(path) as handle:
            size = os.fstat(handle.fileno()).st_size
            if size > max_bytes:
                return None, f"skipped oversized file ({size} bytes > {max_bytes})", size
            data = handle.read(max_bytes + 1)
        if len(data) > max_bytes:
            return None, f"skipped oversized file (> {max_bytes} bytes)", len(data)
        return json.loads(data.decode("utf-8", "replace")), None, len(data)
    except json.JSONDecodeError as exc:
        return None, f"invalid JSON at line {exc.lineno}: {exc.msg}", 0
    except RecursionError:
        # Deeply nested arrays/objects exhaust the decoder's recursion budget;
        # report the file as skipped instead of aborting the whole scan.
        return None, "invalid JSON: nesting too deep", 0
    except (OSError, UnicodeDecodeError) as exc:
        return None, f"unreadable: {format_os_error(exc) if isinstance(exc, OSError) else exc.__class__.__name__}", 0
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
def tool_name_from_schema(d: dict[str, Any]) -> str | None:
    """Return a report-safe tool name taken from the first usable naming key.

    Keys are tried in the order ``name``, ``tool``, ``id``, ``title``; the
    first non-blank string value is passed through ``safe_report_label``.
    Returns None when no key qualifies.
    """
    for field in ("name", "tool", "id", "title"):
        candidate = d.get(field)
        if not isinstance(candidate, str):
            continue
        if candidate.strip():
            return safe_report_label(candidate)
    return None
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
def collect_tool_schemas(raw: Any) -> list[dict[str, Any]]:
    """Extract one ``{name, schema_bytes, server}`` record per tool from ``raw``.

    Every dict reachable through ``walk_json`` that has a usable tool name and
    at least one of ``inputSchema``, ``input_schema``, ``schema``,
    ``parameters`` or ``description`` is treated as a tool schema.  Records
    sharing a ``(name, server)`` pair are collapsed to the one with the largest
    ``schema_bytes``; the first-seen record wins ties.
    """
    schema_markers = ("inputSchema", "input_schema", "schema", "parameters", "description")
    largest: dict[tuple[str, str | None], dict[str, Any]] = {}
    for node in walk_json(raw):
        name = tool_name_from_schema(node)
        if not name:
            continue
        if not any(marker in node for marker in schema_markers):
            continue
        declared_server = node.get("server")
        server = safe_report_label(declared_server) if isinstance(declared_server, str) else None
        record = {"name": name, "schema_bytes": json_byte_len(node), "server": server}
        identity = (record["name"], record.get("server"))
        previous = largest.get(identity)
        if previous is None or int(record["schema_bytes"]) > int(previous["schema_bytes"]):
            largest[identity] = record
    return list(largest.values())
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
def scan_tool_catalogs(root: Path, args: argparse.Namespace, settings: list[dict[str, Any]], *, top: int) -> tuple[dict[str, Any], list[Finding]]:
    """Assess MCP server count and any user-supplied tool catalogs.

    Raises an ``excessive-mcp-servers`` finding when the merged settings list
    at least ``args.mcp_server_threshold`` servers.  Each path in
    ``args.tool_catalog`` is loaded with a size cap; unreadable catalogs are
    recorded as skipped.  Scanned catalogs yield an ``excessive-tool-catalog``
    finding at ``args.tool_count_threshold`` tools and one
    ``large-tool-schema`` finding per tool at or above
    ``args.large_schema_bytes`` (largest first, at most ``top`` per catalog).

    Returns:
        ``(summary, findings)`` with catalogs capped at ``top`` and findings
        capped at ``max(top, 1) * 2``.
    """
    findings: list[Finding] = []
    catalogs: list[dict[str, Any]] = []

    merged = merged_settings(settings)
    configured_servers = merged.get("mcpServers")
    mcp_servers = configured_servers if isinstance(configured_servers, dict) else {}
    server_count = len(mcp_servers)
    if server_count >= args.mcp_server_threshold:
        evidence = {"mcp_server_count": server_count, "threshold": args.mcp_server_threshold, "confidence": "observed-settings"}
        findings.append(Finding(
            "excessive-mcp-servers",
            "low",
            ".claude/settings.json",
            "Project Claude settings configure many MCP servers, which can increase tool discovery/schema overhead.",
            "Disable unused MCP servers for sessions that do not need them; keep this advisory until task-specific need is known.",
            evidence,
            rule_id="excessive-mcp-servers",
            instance_id="excessive-mcp-servers",
        ))

    def largest_first(tool: dict[str, Any]) -> tuple[int, str]:
        return (-int(tool["schema_bytes"]), tool["name"])

    for raw_path in getattr(args, "tool_catalog", []) or []:
        catalog_path = safe_resolve(Path(raw_path).expanduser())
        label = path_text_label(str(catalog_path), args.show_paths)
        raw, error, size = read_json_file_limited(catalog_path, args.max_tool_catalog_bytes)
        if error:
            catalogs.append({"path": label, "status": "skipped", "reason": error, "bytes": size})
            continue

        tools = collect_tool_schemas(raw)
        tool_count = len(tools)
        total_schema_bytes = sum(int(tool["schema_bytes"]) for tool in tools)
        oversized = [tool for tool in tools if int(tool["schema_bytes"]) >= args.large_schema_bytes]
        large_tools = sorted(oversized, key=largest_first)[:top]
        catalogs.append({
            "path": label,
            "status": "scanned",
            "tool_count": tool_count,
            "schema_bytes": total_schema_bytes,
            "large_schema_tools": large_tools,
        })

        if tool_count >= args.tool_count_threshold:
            instance = f"excessive-tool-catalog-{text_hash(label)}"
            findings.append(Finding(
                instance,
                "medium",
                label,
                "Local tool catalog contains many tools for one task context.",
                "Use context-guard-tool-prune or a task-specific tool allowlist before injecting full schemas.",
                {"tool_count": tool_count, "threshold": args.tool_count_threshold, "schema_bytes": total_schema_bytes, "confidence": "observed-catalog"},
                rule_id="excessive-tool-catalog",
                instance_id=instance,
            ))
        for tool in large_tools:
            instance = f"large-tool-schema-{text_hash(label + ':' + tool['name'])}"
            findings.append(Finding(
                instance,
                "low",
                label,
                "A local tool schema is large enough to dominate narrow task context.",
                "Prefer a bounded top-k schema report and retrieve the full sanitized schema only when needed.",
                {"tool_name": tool["name"], "schema_bytes": tool["schema_bytes"], "threshold": args.large_schema_bytes, "confidence": "observed-catalog"},
                rule_id="large-tool-schema",
                instance_id=instance,
            ))
    summary = {"mcp_server_count": server_count, "catalogs": catalogs[:top]}
    return summary, findings[: max(top, 1) * 2]
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
def iter_log_candidates(root: Path, log_paths: list[str], max_files: int) -> Iterable[Path]:
    """Yield log-like files from explicit paths and the project's default log roots.

    Starting points are the user-expanded ``log_paths`` followed by
    ``root/.claude`` and ``root/.codex``.  Each is passed through
    ``safe_resolve`` (falling back to the unresolved path on OSError) and kept
    only if it exists and is not a symlink.  File starting points are yielded
    when their suffix is ``.json``, ``.jsonl``, ``.ndjson`` or ``.log``;
    directory starting points are walked without following symlinks, pruning
    ``EXCLUDED_DIR_NAMES``.  Iteration stops once ``max_files`` files have been
    yielded.
    """
    log_suffixes = {".json", ".jsonl", ".ndjson", ".log"}
    starting_points = [Path(item).expanduser() for item in log_paths]
    starting_points = starting_points + [root / ".claude", root / ".codex"]

    candidates: list[Path] = []
    for location in starting_points:
        try:
            resolved = safe_resolve(location)
        except OSError:
            resolved = location
        if resolved.exists() and not resolved.is_symlink():
            candidates.append(resolved)

    emitted = 0
    for candidate in candidates:
        if candidate.is_file() and candidate.suffix.lower() in log_suffixes:
            yield candidate
            emitted += 1
        elif candidate.is_dir():
            for dirpath, subdirs, files in os.walk(candidate, followlinks=False):
                here = Path(dirpath)
                subdirs[:] = [
                    subdir
                    for subdir in subdirs
                    if subdir not in EXCLUDED_DIR_NAMES and not (here / subdir).is_symlink()
                ]
                for filename in files:
                    entry = here / filename
                    if entry.is_symlink():
                        continue
                    if entry.suffix.lower() not in log_suffixes:
                        continue
                    yield entry
                    emitted += 1
                    if emitted >= max_files:
                        return
        if emitted >= max_files:
            return
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
def parse_possible_json(value: Any) -> Any:
    """Decode ``value`` when it is a string that looks like a JSON object or array.

    Args:
        value: Any value; only strings whose first non-blank character is
            ``{`` or ``[`` are handed to the JSON decoder.

    Returns:
        The decoded object on success.  The original ``value`` is returned
        unchanged for non-strings, for strings that do not look like JSON, and
        for strings the decoder rejects, including input nested so deeply that
        decoding raises RecursionError.
    """
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    if not stripped or stripped[0] not in "[{":
        return value
    try:
        return json.loads(stripped)
    except (json.JSONDecodeError, RecursionError):
        # Log payloads are untrusted; a pathological string must not abort
        # the scan, so it is treated as plain text.
        return value
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
def call_name(d: dict[str, Any]) -> str | None:
    """Extract a tool-call name (trimmed, at most 120 characters) from a record.

    The first non-blank string under any ``TOOL_CALL_NAME_KEYS`` key wins.
    Otherwise the ``name`` field is used, but only when the record's ``type``
    is ``tool_use``, ``tool_call`` or ``function_call``, or when the record
    carries one of the ``TOOL_CALL_INPUT_KEYS``.  Returns None when nothing
    qualifies.
    """
    for field in TOOL_CALL_NAME_KEYS:
        candidate = d.get(field)
        if isinstance(candidate, str) and candidate.strip():
            return candidate.strip()[:120]

    record_type = str(d.get("type") or "").lower()
    fallback = d.get("name")
    if not isinstance(fallback, str) or not fallback.strip():
        return None
    looks_like_call = record_type in {"tool_use", "tool_call", "function_call"} or any(
        field in d for field in TOOL_CALL_INPUT_KEYS
    )
    if looks_like_call:
        return fallback.strip()[:120]
    return None
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
def call_input(d: dict[str, Any]) -> Any:
    """Return the value under the first present ``TOOL_CALL_INPUT_KEYS`` key.

    The value is passed through ``parse_possible_json``; an empty dict is
    returned when none of the keys is present.
    """
    for field in TOOL_CALL_INPUT_KEYS:
        if field not in d:
            continue
        return parse_possible_json(d[field])
    return {}
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
def sanitized_fingerprint_value(value: Any) -> Any:
    """Return a sanitized copy of ``value`` for fingerprinting.

    Dicts are rebuilt with keys visited in ``str(key)`` order and passed
    through ``sanitize_path_component``; lists are cut to their first 20
    items; strings go through ``sanitize_path_text``, have
    ``SECRET_CONTENT_RE`` matches replaced by ``[REDACTED]`` and are cut to
    500 characters.  Any other value is returned unchanged.
    """
    if isinstance(value, str):
        return SECRET_CONTENT_RE.sub("[REDACTED]", sanitize_path_text(value))[:500]
    if isinstance(value, list):
        return [sanitized_fingerprint_value(element) for element in value[:20]]
    if isinstance(value, dict):
        cleaned: dict[str, Any] = {}
        ordered_pairs = sorted(value.items(), key=lambda pair: str(pair[0]))
        for raw_key, element in ordered_pairs:
            safe_key = sanitize_path_component(str(raw_key))
            # Keys that sanitize to the same text overwrite one another.
            cleaned[safe_key] = sanitized_fingerprint_value(element)
        return cleaned
    return value
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
def find_path_argument(value: Any) -> str | None:
    """Find the first file-path-like argument inside a tool-call input.

    ``value`` is first passed through ``parse_possible_json``.  The structure
    is then searched with an explicit stack; the first dict entry whose
    ``str(key)`` is in ``FILE_PATH_KEYS`` and whose value is a non-blank
    string is returned, trimmed.  Returns None when there is no such entry.
    """
    pending = [parse_possible_json(value)]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        for key, item in node.items():
            if str(key) in FILE_PATH_KEYS and isinstance(item, str) and item.strip():
                return item.strip()
            pending.append(item)
    return None
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
def is_read_tool(name: str) -> bool:
    """Return True when ``name`` identifies a file-read tool.

    The name is lower-cased with ``-`` replaced by ``_``.  It matches when the
    whole name or the part after the last ``.`` is in ``READ_TOOL_NAMES``, or
    when it contains ``read_file``.
    """
    normalized = name.lower().replace("-", "_")
    if normalized in READ_TOOL_NAMES:
        return True
    final_segment = normalized.rsplit(".", 1)[-1]
    if final_segment in READ_TOOL_NAMES:
        return True
    return "read_file" in normalized
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
def scan_logs(root: Path, args: argparse.Namespace, *, top: int) -> tuple[dict[str, Any], list[Finding]]:
    """Scan local JSON/JSONL logs for repeated file reads and duplicate tool calls.

    Candidate files come from ``iter_log_candidates`` (using ``args.log_path``
    and ``args.max_structural_files``). Each file is opened with
    ``open_regular_no_follow`` and skipped when it exceeds
    ``args.max_log_bytes`` or cannot be read. Files with a ``.json`` suffix are
    parsed as one document; every other file is parsed line by line, skipping
    lines larger than ``args.max_log_line_bytes``.

    Every dict yielded by ``walk_json`` for which ``call_name`` returns a name
    is counted under ``(name, fingerprint)``, where the fingerprint is the
    ``text_hash`` of the sanitized call input. For names accepted by
    ``is_read_tool`` the path found by ``find_path_argument`` is counted too.

    Args:
        root: Scan root passed through to ``iter_log_candidates``.
        args: Parsed CLI namespace supplying limits, thresholds and
            ``show_paths``.
        top: Maximum number of entries reported per category.

    Returns:
        A ``(category, findings)`` pair: the ``local_logs`` category dict and
        at most ``top * 2`` findings (repeated reads first, then duplicate
        calls).
    """
    tool_counts: Counter[tuple[str, str]] = Counter()
    tool_files: dict[tuple[str, str], set[str]] = defaultdict(set)
    read_counts: Counter[str] = Counter()
    read_labels: dict[str, str] = {}
    read_tools: dict[str, set[str]] = defaultdict(set)
    files_scanned = 0
    records_scanned = 0
    skipped_files: list[dict[str, Any]] = []
    skipped_records = 0
    for path in iter_log_candidates(root, getattr(args, "log_path", []) or [], args.max_structural_files):
        label = path_text_label(str(path), args.show_paths)
        try:
            with open_regular_no_follow(path) as handle:
                size = os.fstat(handle.fileno()).st_size
                if size > args.max_log_bytes:
                    skipped_files.append({"path": label, "reason": f"oversized:{size}>{args.max_log_bytes}"})
                    continue
                # Read one byte past the limit so a file that grew after the
                # fstat() call is still detected as oversized.
                data = handle.read(args.max_log_bytes + 1)
                if len(data) > args.max_log_bytes:
                    skipped_files.append({"path": label, "reason": f"oversized:>{args.max_log_bytes}"})
                    continue
        except OSError as exc:
            skipped_files.append({"path": label, "reason": format_os_error(exc)})
            continue
        files_scanned += 1
        text = data.decode("utf-8", "replace")
        raw_records: list[Any] = []
        if path.suffix.lower() == ".json":
            try:
                parsed = json.loads(text)
                raw_records = parsed if isinstance(parsed, list) else [parsed]
            except (json.JSONDecodeError, RecursionError):
                # RecursionError: json.loads raises it for extremely deep
                # nesting; treat that as an unparseable record, not a crash.
                skipped_records += 1
                continue
        else:
            # Split on \n, \r\n and \r only. str.splitlines() would also break
            # on U+2028, U+2029 and U+0085, which may legally appear unescaped
            # inside JSON strings and would cut a valid record in half.
            lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
            for raw_line in lines:
                if len(raw_line.encode("utf-8", "replace")) > args.max_log_line_bytes:
                    skipped_records += 1
                    continue
                if not raw_line.strip():
                    continue
                try:
                    raw_records.append(json.loads(raw_line))
                except (json.JSONDecodeError, RecursionError):
                    skipped_records += 1
        for record in raw_records:
            records_scanned += 1
            for d in walk_json(record):
                name = call_name(d)
                if not name:
                    continue
                value = call_input(d)
                # Only a hash of the sanitized input is kept, never the raw input.
                fp = text_hash(json.dumps(sanitized_fingerprint_value(value), ensure_ascii=False, sort_keys=True, default=str))
                key = (name, fp)
                tool_counts[key] += 1
                tool_files[key].add(label)
                if is_read_tool(name):
                    path_arg = find_path_argument(value)
                    if path_arg:
                        read_fp = text_hash(sanitize_path_text(path_arg))
                        read_counts[read_fp] += 1
                        read_labels[read_fp] = path_text_label(path_arg, args.show_paths)
                        read_tools[read_fp].add(name)
    findings: list[Finding] = []
    repeated_reads: list[dict[str, Any]] = []
    for fp, count in read_counts.most_common(top):
        if count < args.duplicate_call_threshold:
            continue
        item = {
            "path": read_labels[fp],
            "path_fingerprint": fp,
            "read_count": count,
            "tools": sorted(safe_report_label(name) for name in read_tools[fp]),
            "confidence": "observed-log",
        }
        repeated_reads.append(item)
        instance = f"repeated-file-read-{fp}"
        findings.append(Finding(
            instance,
            "medium",
            "local-logs",
            "The same file path appears to be read repeatedly in local tool-call logs.",
            "Use search/symbol/slice reads or a local artifact receipt instead of repeating whole-file reads.",
            item,
            rule_id="repeated-file-read",
            instance_id=instance,
        ))
    duplicate_calls: list[dict[str, Any]] = []
    # Look at twice as many candidates as will be reported; the loop stops as
    # soon as `top` entries have been collected.
    for (name, fp), count in tool_counts.most_common(top * 2):
        if count < args.duplicate_call_threshold:
            continue
        item = {
            "tool_name": safe_report_label(name),
            "input_fingerprint": fp,
            "call_count": count,
            "log_files": sorted(tool_files[(name, fp)])[:top],
            "confidence": "observed-log",
        }
        duplicate_calls.append(item)
        instance = f"duplicate-tool-call-{text_hash(name + ':' + fp)}"
        findings.append(Finding(
            instance,
            "low" if count < args.duplicate_call_threshold * 2 else "medium",
            "local-logs",
            "A tool call with the same sanitized input fingerprint repeats in local logs.",
            "Avoid replaying identical calls; keep one receipt or summarize the result before retrying.",
            item,
            rule_id="duplicate-tool-call",
            instance_id=instance,
        ))
        if len(duplicate_calls) >= top:
            break
    return {
        "files_scanned": files_scanned,
        "records_scanned": records_scanned,
        "skipped_files": skipped_files[:top],
        "skipped_records": skipped_records,
        "repeated_file_reads": repeated_reads[:top],
        "duplicate_tool_calls": duplicate_calls[:top],
    }, findings[: top * 2]
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
def structural_summary(findings: list[Finding]) -> dict[str, Any]:
    """Summarize findings as a total plus per-rule and per-severity counts.

    A finding is counted under its ``rule_id`` when that is truthy, otherwise
    under its ``id``. Both count mappings are emitted with their keys sorted.
    """
    rule_tally: Counter[str] = Counter()
    severity_tally: Counter[str] = Counter()
    for finding in findings:
        rule_tally[finding.rule_id or finding.id] += 1
        severity_tally[finding.severity] += 1
    return {
        "finding_count": len(findings),
        "by_rule": {rule: rule_tally[rule] for rule in sorted(rule_tally)},
        "by_severity": {level: severity_tally[level] for level in sorted(severity_tally)},
    }
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
def build_structural_waste_report(args: argparse.Namespace) -> dict[str, Any]:
    """Assemble the report dict for the ``structural-waste`` subcommand.

    Runs the context, duplicate-rule, Python-import, skill, tool-catalog and
    log scanners against the resolved ``args.path``. The combined findings are
    sorted by severity rank, then rule id (falling back to id), then path.
    At most ``top * 10`` findings are included in the report.

    Raises:
        SystemExit: when the resolved path is not an existing directory, or
            checking it raises ``OSError``.
    """
    root = safe_resolve(Path(args.path).expanduser())
    try:
        root_is_directory = root.exists() and root.is_dir()
    except OSError:
        root_is_directory = False
    if not root_is_directory:
        raise SystemExit(f"context-guard-diet: structural-waste path is not a directory: {path_label(root, args.show_paths)}")

    top = bounded_top(args.top)
    # Findings returned by collect_settings are not used in this report.
    settings, _ignored_settings_findings = collect_settings(root)
    context_files, context_findings = scan_context(
        root,
        args.large_context_bytes,
        args.huge_context_bytes,
        args.long_context_lines,
    )
    heavy_rule_ids = {"large-context-file", "huge-context-file", "context-heavy-code-fences"}
    heavy_findings = [
        finding for finding in context_findings
        if (finding.rule_id or finding.id) in heavy_rule_ids
    ]
    duplicate_rule_groups, duplicate_rule_findings = scan_duplicate_rules(root, min_chars=args.duplicate_rule_min_chars, top=top)
    imports_category, import_findings = scan_python_imports(root, top=top, max_files=args.max_structural_files)
    skills_category, skill_findings = scan_unused_skills(root, top=top, max_files=args.max_structural_files)
    tools_category, tool_findings = scan_tool_catalogs(root, args, settings, top=top)
    logs_category, log_findings = scan_logs(root, args, top=top)

    def ranking(finding: Any) -> tuple[Any, Any, Any]:
        # Unknown severities get rank 99 and therefore sort last.
        return (SEVERITY_ORDER.get(finding.severity, 99), finding.rule_id or finding.id, finding.path)

    findings = sorted(
        [
            *heavy_findings,
            *duplicate_rule_findings,
            *import_findings,
            *skill_findings,
            *tool_findings,
            *log_findings,
        ],
        key=ranking,
    )

    # Echo the effective limits so a reader of the report can see them.
    limit_names = (
        "max_structural_files",
        "large_context_bytes",
        "huge_context_bytes",
        "long_context_lines",
        "duplicate_rule_min_chars",
        "duplicate_call_threshold",
        "mcp_server_threshold",
        "tool_count_threshold",
        "large_schema_bytes",
        "max_tool_catalog_bytes",
        "max_log_bytes",
        "max_log_line_bytes",
    )
    limits: dict[str, Any] = {"top": top}
    limits.update((name, getattr(args, name)) for name in limit_names)

    rule_files_category = {
        "context_files_scanned": len(context_files),
        "oversized_or_heavy": [finding.as_dict() for finding in heavy_findings[:top]],
        "duplicate_rule_groups": duplicate_rule_groups,
    }
    caveats = [
        "Structural-waste diagnostics are advisory heuristics; verify before deleting rules, imports, skills, or tools.",
        "No network calls or destructive actions are performed by this command.",
        "Local log diagnostics use sanitized input fingerprints and do not print raw prompt, command, or tool-input text.",
        "Unused-skill and stale-import candidates can be false positives when usage is dynamic or outside the scanned project.",
    ]
    return {
        "tool": "context-guard-diet",
        "mode": "structural-waste",
        "schema_version": STRUCTURAL_WASTE_SCHEMA_VERSION,
        "root": root_label(root, args.show_paths),
        "read_only": True,
        "network": "not-used",
        "destructive_actions": [],
        "limits": limits,
        "summary": structural_summary(findings),
        "categories": {
            "rule_files": rule_files_category,
            "python_imports": imports_category,
            "skills": skills_category,
            "tool_schemas": tools_category,
            "local_logs": logs_category,
        },
        "finding_count": len(findings),
        "findings": [finding.as_dict() for finding in findings[: top * 10]],
        "caveats": caveats,
    }
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
def print_structural_waste_text(report: dict[str, Any]) -> None:
    """Print a structural-waste report as plain text on stdout.

    Emits a fixed header, the root label, the finding total with the per-rule
    counts as sorted JSON, and then one three-line entry per finding (or
    ``- none`` when the report has no findings).
    """
    print("ContextGuard structural-waste diagnostics")
    print(f"root: {report['root']}")
    print("read_only: yes network: not-used destructive_actions: none")
    summary = report["summary"]
    total = summary["finding_count"]
    rule_counts = json.dumps(summary["by_rule"], sort_keys=True)
    print(f"findings: {total} by_rule={rule_counts}")

    entries = report["findings"]
    if entries:
        print("\nFindings:")
        for entry in entries:
            level = entry["severity"].upper()
            print(f"- [{level}] {entry['rule_id']} @ {entry['path']}")
            print(f" why: {entry['message']}")
            print(f" fix: {entry['action']}")
    else:
        print("\nFindings:\n- none")
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
# Sort rank per severity label; a smaller rank sorts earlier in reports
# (high, then medium, then low).
SEVERITY_ORDER = {label: rank for rank, label in enumerate(("high", "medium", "low"))}
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
def build_report(args: argparse.Namespace) -> dict[str, Any]:
    """Assemble the report dict for the ``scan`` subcommand.

    Collects settings, settings-derived findings, context-file findings and
    context-exclusion recommendations for the resolved ``args.path``. Findings
    are sorted by severity rank, then id, then path, and are all included.
    The context-file list (largest first) and the recommendation list are
    each limited to ``args.top`` entries.

    Raises:
        SystemExit: when the resolved path is not an existing directory, or
            checking it raises ``OSError``.
    """
    root = safe_resolve(Path(args.path).expanduser())
    try:
        is_scan_root = root.exists() and root.is_dir()
    except OSError:
        is_scan_root = False
    if not is_scan_root:
        raise SystemExit(f"context-guard-diet: scan path is not a directory: {path_label(root, args.show_paths)}")
    # --top is an unvalidated int. A negative value used directly as a slice
    # bound would drop entries from the END of each list instead of limiting
    # it, so clamp it to zero ("list nothing").
    limit = max(args.top, 0)
    settings, settings_findings = collect_settings(root)
    settings_summary, config_findings = scan_settings(root, settings)
    context_files, context_findings = scan_context(root, args.large_context_bytes, args.huge_context_bytes, args.long_context_lines)
    deny_entries = merged_settings(settings)["permissions"]["deny"]
    exclusion_recommendations = build_context_exclusion_recommendations(root, deny_entries)
    findings = settings_findings + config_findings + context_findings
    # Unknown severities get rank 99 and therefore sort last.
    findings.sort(key=lambda item: (SEVERITY_ORDER.get(item.severity, 99), item.id, item.path))
    return {
        "tool": "context-guard-diet",
        "root": root_label(root, args.show_paths),
        "settings": settings_summary,
        "context_files": sorted(context_files, key=lambda item: item["bytes"], reverse=True)[:limit],
        "context_exclusion_recommendations": exclusion_recommendations[:limit],
        "finding_count": len(findings),
        "findings": [item.as_dict() for item in findings],
    }
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
def print_text(report: dict[str, Any]) -> None:
    """Print a ``scan`` report as plain text on stdout.

    Sections, in order: header and root label, a one-line settings summary,
    the context-like files (only when present), the context exclusion
    recommendations (only when present), and the findings (``- none`` when
    the list is empty).
    """

    def yes_no(flag: Any) -> str:
        return "yes" if flag else "no"

    print("Claude token diet scan")
    print(f"root: {report['root']}")

    settings = report["settings"]
    summary_fields = [
        f"files={len(settings['files'])}",
        f"deny={settings['deny_count']}",
        f"trim_hook={yes_no(settings['has_bash_trim_hook'])}",
        f"read_guard={yes_no(settings['has_large_read_guard'])}",
        f"statusline={yes_no(settings['has_statusline'])}",
        f"mcp={settings['mcp_server_count']}",
    ]
    print("settings: " + " ".join(summary_fields))

    context_files = report["context_files"]
    if context_files:
        print("\nTop context-like files:")
        for entry in context_files:
            surface_note = ""
            if entry.get("surface"):
                surface_note = f", surface={entry['surface']}"
            print(f"- {entry['path']} ({entry['bytes']} bytes, sampled_lines={entry['sampled_lines']}{surface_note})")

    recommendations = report.get("context_exclusion_recommendations")
    if recommendations:
        print("\nContext exclusion recommendations:")
        for rec in recommendations:
            # A recommendation without an explicit status is shown as "missing".
            state = rec.get("status", "missing")
            print(f"- [{rec['severity'].upper()}] {rec['id']} @ {rec['path']} ({state})")
            print(f" claude: {rec['recommended_deny']}")
            print(f" generic: {rec['generic_pattern']}")

    print("\nFindings:")
    findings = report["findings"]
    if not findings:
        print("- none")
        return
    for entry in findings:
        print(f"- [{entry['severity'].upper()}] {entry['id']} @ {entry['path']}")
        print(f" why: {entry['message']}")
        print(f" fix: {entry['action']}")
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
def main() -> int:
    """Parse the command line, run the chosen subcommand and print its report.

    Two subcommands are defined: ``scan`` and ``structural-waste``. Each builds
    a report dict and prints it as indented, key-sorted JSON when ``--json`` is
    given, or through the matching text printer otherwise.

    Returns:
        ``0`` after a report has been printed.
    """
    parser = argparse.ArgumentParser(prog="context-guard-diet")
    commands = parser.add_subparsers(dest="command", required=True)

    # Integer options shared by both subcommands, in declaration order.
    context_size_options = (
        ("--large-context-bytes", DEFAULT_LARGE_CONTEXT_BYTES),
        ("--huge-context-bytes", DEFAULT_HUGE_CONTEXT_BYTES),
        ("--long-context-lines", DEFAULT_LONG_CONTEXT_LINES),
    )

    scan_cmd = commands.add_parser("scan", help="scan project settings and context files for token-diet gaps")
    scan_cmd.add_argument("path", nargs="?", default=".")
    scan_cmd.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    scan_cmd.add_argument("--show-paths", action="store_true", help="show raw absolute root path instead of a stable anonymized root label; local debugging only because private paths may be exposed")
    scan_cmd.add_argument("--top", type=int, default=20, help="maximum context-like files and context-exclusion recommendations to list")
    for flag, default in context_size_options:
        scan_cmd.add_argument(flag, type=int, default=default)

    structural_cmd = commands.add_parser("structural-waste", help="run local read-only structural waste diagnostics")
    structural_cmd.add_argument("path", nargs="?", default=".")
    structural_cmd.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    structural_cmd.add_argument("--show-paths", action="store_true", help="show raw local paths for debugging; secret-shaped path components remain redacted")
    structural_cmd.add_argument("--top", type=int, default=DEFAULT_STRUCTURAL_WASTE_TOP, help="maximum findings per structural-waste category to list")
    structural_cmd.add_argument("--log-path", action="append", default=[], help="local JSON/JSONL log or directory to inspect for repeated reads/tool calls; may be repeated")
    structural_cmd.add_argument("--tool-catalog", action="append", default=[], help="local tool/MCP catalog JSON to inspect; may be repeated")
    structural_only_options = (
        ("--duplicate-rule-min-chars", DEFAULT_DUPLICATE_RULE_MIN_CHARS),
        ("--duplicate-call-threshold", DEFAULT_DUPLICATE_CALL_THRESHOLD),
        ("--mcp-server-threshold", DEFAULT_MCP_SERVER_THRESHOLD),
        ("--tool-count-threshold", DEFAULT_TOOL_COUNT_THRESHOLD),
        ("--large-schema-bytes", DEFAULT_LARGE_SCHEMA_BYTES),
        ("--max-tool-catalog-bytes", DEFAULT_MAX_TOOL_CATALOG_BYTES),
        ("--max-log-bytes", DEFAULT_MAX_LOG_BYTES),
        ("--max-log-line-bytes", DEFAULT_MAX_LOG_LINE_BYTES),
        ("--max-structural-files", DEFAULT_MAX_STRUCTURAL_FILES),
    )
    for flag, default in context_size_options + structural_only_options:
        structural_cmd.add_argument(flag, type=int, default=default)

    args = parser.parse_args()

    # Each command maps to (report builder, plain-text printer).
    handlers = {
        "scan": (build_report, print_text),
        "structural-waste": (build_structural_waste_report, print_structural_waste_text),
    }
    handler = handlers.get(args.command)
    if handler is None:
        parser.error("unknown command")
        return 2

    build, render_text = handler
    report = build(args)
    if args.json:
        print(json.dumps(report, indent=2, sort_keys=True, ensure_ascii=False))
    else:
        render_text(report)
    return 0
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|