@ictechgy/context-guard 0.4.8 → 0.4.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/CHANGELOG.md +29 -0
  2. package/README.ko.md +92 -37
  3. package/README.md +111 -37
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  8. package/docs/distribution.md +10 -7
  9. package/docs/experimental-benchmark-fixtures.md +8 -1
  10. package/package.json +3 -6
  11. package/packaging/homebrew/context-guard.rb.template +1 -1
  12. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  13. package/plugins/context-guard/README.ko.md +9 -6
  14. package/plugins/context-guard/README.md +27 -12
  15. package/plugins/context-guard/bin/context-guard +113 -26
  16. package/plugins/context-guard/bin/context-guard-artifact +542 -46
  17. package/plugins/context-guard/bin/context-guard-cache-score +380 -0
  18. package/plugins/context-guard/bin/context-guard-compress +146 -1
  19. package/plugins/context-guard/bin/context-guard-cost +783 -4
  20. package/plugins/context-guard/bin/context-guard-experiments +2211 -121
  21. package/plugins/context-guard/bin/context-guard-failed-nudge +3 -0
  22. package/plugins/context-guard/bin/context-guard-filter +163 -7
  23. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  24. package/plugins/context-guard/bin/context-guard-pack +602 -43
  25. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  26. package/plugins/context-guard/bin/context-guard-setup +165 -31
  27. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  28. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  29. package/plugins/context-guard/bin/context-guard-tool-prune +241 -1
  30. package/plugins/context-guard/lib/context_guard_commands.py +206 -0
  31. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  32. package/context-guard-kit/README.md +0 -91
  33. package/context-guard-kit/benchmark_runner.py +0 -2401
  34. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  35. package/context-guard-kit/context_compress.py +0 -695
  36. package/context-guard-kit/context_escrow.py +0 -935
  37. package/context-guard-kit/context_filter.py +0 -637
  38. package/context-guard-kit/context_guard_cli.py +0 -325
  39. package/context-guard-kit/context_guard_diet.py +0 -1711
  40. package/context-guard-kit/context_pack.py +0 -2713
  41. package/context-guard-kit/cost_guard.py +0 -2349
  42. package/context-guard-kit/experimental_registry.py +0 -2339
  43. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  44. package/context-guard-kit/guard_large_read.py +0 -690
  45. package/context-guard-kit/hook_secret_patterns.py +0 -43
  46. package/context-guard-kit/read_symbol.py +0 -483
  47. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  48. package/context-guard-kit/sanitize_output.py +0 -725
  49. package/context-guard-kit/settings.example.json +0 -67
  50. package/context-guard-kit/setup_wizard.py +0 -2515
  51. package/context-guard-kit/statusline.sh +0 -362
  52. package/context-guard-kit/statusline_merged.sh +0 -157
  53. package/context-guard-kit/tool_schema_pruner.py +0 -837
  54. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -1,1711 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Scan a project for Claude Code token-diet configuration gaps.
3
-
4
- The scanner is intentionally local, read-only, and heuristic. It looks for
5
- large always-in-context instruction files, missing read deny rules for bulky or
6
- sensitive paths, and missing helper hooks/statusline settings that reduce token
7
- burn during noisy command runs.
8
- """
9
- from __future__ import annotations
10
-
11
- import argparse
12
- import ast
13
- from collections import Counter, defaultdict
14
- import errno
15
- import hashlib
16
- import json
17
- import os
18
- import re
19
- import stat
20
- import sys
21
- from dataclasses import dataclass, field
22
- from pathlib import Path
23
- from typing import Any, Iterable
24
-
25
# --- Agent instruction surfaces -------------------------------------------
# File names treated as context files wherever they appear in the tree.
CONTEXT_FILE_NAMES = {"AGENTS.md", "CLAUDE.md", "GEMINI.md"}
# Root-relative paths treated as context files only on an exact match.
CONTEXT_EXACT_REL_FILES = {".clinerules", ".cursorrules", ".github/copilot-instructions.md", ".windsurfrules"}
# Root-relative directories whose ``.md`` files are treated as context files.
CONTEXT_MD_DIRS = {
    ".claude/agents", ".claude/commands", ".claude/skills",
    ".clinerules", ".cursor/rules", ".windsurf/rules",
}
# Human-readable label for each surface key.
CONTEXT_SURFACE_LABELS = {
    "claude": "Claude Code instructions",
    "codex": "OpenAI Codex AGENTS.md",
    "gemini": "Gemini CLI instructions",
    "cursor": "Cursor rules",
    "windsurf": "Windsurf rules",
    "cline": "Cline rules",
    "copilot": "GitHub Copilot instructions",
}
# Directory names pruned while walking the project tree.
EXCLUDED_DIR_NAMES = {
    ".cache", ".git", ".hg", ".mypy_cache", ".next", ".omx", ".pytest_cache",
    ".ruff_cache", ".serena", ".tox", ".venv", ".vscode", "__pycache__",
    "build", "coverage", "dist", "node_modules", "target", "vendor",
}

# --- Size limits and thresholds -------------------------------------------
MAX_CONTEXT_READ_BYTES = 512_000
MAX_SECRET_SCAN_BYTES = 5_000_000
MAX_SETTINGS_READ_BYTES = 256_000
DEFAULT_LARGE_CONTEXT_BYTES = 16_000
DEFAULT_HUGE_CONTEXT_BYTES = 64_000
DEFAULT_LONG_CONTEXT_LINES = 300
STRUCTURAL_WASTE_SCHEMA_VERSION = "contextguard.structural-waste.v1"
DEFAULT_STRUCTURAL_WASTE_TOP = 20
DEFAULT_DUPLICATE_RULE_MIN_CHARS = 48
DEFAULT_DUPLICATE_CALL_THRESHOLD = 3
DEFAULT_MCP_SERVER_THRESHOLD = 6
DEFAULT_TOOL_COUNT_THRESHOLD = 40
DEFAULT_LARGE_SCHEMA_BYTES = 12_000
DEFAULT_MAX_TOOL_CATALOG_BYTES = 1_000_000
DEFAULT_MAX_LOG_BYTES = 5_000_000
DEFAULT_MAX_LOG_LINE_BYTES = 1_000_000
DEFAULT_MAX_STRUCTURAL_FILES = 2_000
MAX_REPORT_LABEL_CHARS = 160

# --- Tool-call / reference parsing vocab ----------------------------------
TEXT_REFERENCE_SUFFIXES = {
    ".md", ".txt", ".json", ".toml", ".yaml", ".yml",
    ".py", ".js", ".ts", ".tsx", ".jsx", ".sh",
}
TOOL_CALL_NAME_KEYS = ("tool_name", "toolName", "tool")
TOOL_CALL_INPUT_KEYS = ("tool_input", "input", "arguments", "args", "parameters")
READ_TOOL_NAMES = {
    "read", "read_file", "fileread", "view_file", "open_file", "get_file", "functions.get_file",
}
FILE_PATH_KEYS = {"file_path", "filepath", "path", "absolute_path", "relative_path", "file"}

# --- Recommended Read deny rules ------------------------------------------
# Each project entry is (label, root-relative path, recommended permissions.deny rule).
HEAVY_PROJECT_DENIES: tuple[tuple[str, str, str], ...] = (
    ("node_modules", "node_modules", "Read(./node_modules/**)"),
    ("dist", "dist", "Read(./dist/**)"),
    ("build", "build", "Read(./build/**)"),
    ("coverage", "coverage", "Read(./coverage/**)"),
    ("logs", "logs", "Read(./logs/**)"),
    ("tmp", "tmp", "Read(./tmp/**)"),
    ("target", "target", "Read(./target/**)"),
    (".next", ".next", "Read(./.next/**)"),
    (".venv", ".venv", "Read(./.venv/**)"),
    ("vendor", "vendor", "Read(./vendor/**)"),
    (".context-guard", ".context-guard", "Read(./.context-guard/**)"),
    (".claude-token-optimizer", ".claude-token-optimizer", "Read(./.claude-token-optimizer/**)"),
)
SENSITIVE_PROJECT_DENIES: tuple[tuple[str, str, str], ...] = (
    (".env", ".env", "Read(./.env)"),
    (".env.*", ".env.*", "Read(./.env.*)"),
    (".npmrc", ".npmrc", "Read(./.npmrc)"),
    (".pypirc", ".pypirc", "Read(./.pypirc)"),
    (".netrc", ".netrc", "Read(./.netrc)"),
)
# Each home entry is (label, recommended permissions.deny rule).
SENSITIVE_HOME_DENIES: tuple[tuple[str, str], ...] = (
    ("~/.ssh", "Read(~/.ssh/**)"),
    ("~/.aws", "Read(~/.aws/**)"),
    ("~/.gnupg", "Read(~/.gnupg/**)"),
    ("~/.kube", "Read(~/.kube/**)"),
    ("~/.docker", "Read(~/.docker/**)"),
)

# --- Secret detection -----------------------------------------------------
# One alternation covering private-key headers, well-known token prefixes,
# Authorization headers, and generic ``key = value`` style assignments.
SECRET_CONTENT_RE = re.compile(
    r"(?is)("
    r"-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----|"
    r"AKIA[0-9A-Z]{16}|"
    r"gh[pousr]_[A-Za-z0-9_]{20,}|"
    r"xox[abprs]-[A-Za-z0-9-]{10,}|"
    r"AIza[0-9A-Za-z_\-]{20,}|"
    r"(?i:Authorization)\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+|"
    r"(?<![A-Za-z0-9])(?:api[_-]?key|token|secret|password|client[_-]?secret)\s*[:=]\s*[^\s]+"
    r")"
)
REDACTED_PATH_COMPONENT = "[REDACTED-PATH-COMPONENT]"

# --- Substrings that identify helper commands in settings ------------------
BASH_TRIM_COMMAND_MARKERS = (
    "context-guard-rewrite-bash",
    "claude-token-rewrite-bash",
    "rewrite_bash_for_token_budget.py",
)
LARGE_READ_GUARD_COMMAND_MARKERS = (
    "context-guard-guard-read",
    "claude-token-guard-read",
    "guard_large_read.py",
)
STATUSLINE_COMMAND_MARKERS = (
    "context-guard-statusline",
    "claude-token-statusline",
    "statusline.sh",
    "statusline_merged.sh",
)
150
-
151
-
152
@dataclass
class Finding:
    """A single scanner result: what was detected, where, and what to do about it."""

    id: str
    severity: str
    path: str
    message: str
    action: str
    # Optional structured details attached to the finding.
    evidence: dict[str, Any] = field(default_factory=dict)
    # When unset, both of these fall back to ``id`` in ``as_dict``.
    rule_id: str | None = None
    instance_id: str | None = None

    def as_dict(self) -> dict[str, Any]:
        """Return the finding as a plain dict with a fixed key order.

        ``rule_id`` and ``instance_id`` are replaced by ``id`` when they are
        falsy. ``evidence`` is returned by reference, not copied.
        """
        return dict(
            id=self.id,
            rule_id=self.rule_id or self.id,
            instance_id=self.instance_id or self.id,
            severity=self.severity,
            path=self.path,
            message=self.message,
            action=self.action,
            evidence=self.evidence,
        )
174
-
175
-
176
def path_hash(path: Path) -> str:
    """Return the first 12 hex characters of the SHA-256 of ``str(path)``."""
    encoded = str(path).encode("utf-8", "replace")
    digest = hashlib.sha256(encoded).hexdigest()
    return digest[:12]
178
-
179
-
180
def text_hash(text: str) -> str:
    """Return the first 12 hex characters of the SHA-256 of ``text`` (UTF-8 encoded)."""
    encoded = text.encode("utf-8", "replace")
    digest = hashlib.sha256(encoded).hexdigest()
    return digest[:12]
182
-
183
-
184
def safe_id_part(text: str) -> str:
    """Turn ``text`` into a lowercase, dash-separated slug usable inside a finding id.

    ``*`` is spelled out as ``star`` so glob labels such as ``.env.*`` stay
    distinguishable from their non-glob counterparts.
    """
    spelled_out = text.lower().replace("*", " star ")
    slug = re.sub(r"[^a-z0-9]+", "-", spelled_out)
    return slug.strip("-")
187
-
188
-
189
def safe_resolve(path: Path) -> Path:
    """Resolve ``path``, falling back to ``path.absolute()`` if resolution fails.

    The fallback covers ``OSError`` and ``RuntimeError`` raised by ``Path.resolve``.
    """
    try:
        resolved = path.resolve()
    except (OSError, RuntimeError):
        resolved = path.absolute()
    return resolved
194
-
195
-
196
def path_component_contains_secret(component: str) -> bool:
    """Report whether a single path component matches ``SECRET_CONTENT_RE``.

    Empty components and the ``.`` / ``..`` navigation entries never count.
    """
    if not component or component in {".", ".."}:
        return False
    return SECRET_CONTENT_RE.search(component) is not None
198
-
199
-
200
def sanitize_path_component(component: str) -> str:
    """Return ``component`` unchanged, or the redaction marker if it looks like a secret.

    Empty components and ``.`` / ``..`` pass through untouched because
    ``path_component_contains_secret`` never flags them.
    """
    if path_component_contains_secret(component):
        return REDACTED_PATH_COMPONENT
    return component
206
-
207
-
208
def sanitize_rel_path(path: str) -> str:
    """Redact secret-looking components of a ``/``-separated relative path."""
    cleaned = [sanitize_path_component(piece) for piece in path.split("/")]
    return "/".join(cleaned)
210
-
211
-
212
def sanitize_path_text(path: str) -> str:
    """Redact secret-looking components of a path string.

    The platform separator is converted to ``/`` first, so the result always
    uses forward slashes.
    """
    posix_style = path.replace(os.sep, "/")
    cleaned = [sanitize_path_component(piece) for piece in posix_style.split("/")]
    return "/".join(cleaned)
214
-
215
-
216
def display_path_hash(path: Path) -> str:
    """Hash the resolved, sanitized form of ``path`` for use in display labels."""
    resolved_text = str(safe_resolve(path))
    sanitized = sanitize_path_text(resolved_text)
    return text_hash(sanitized)
218
-
219
-
220
def path_label(path: Path, show_paths: bool) -> str:
    """Build a report label for ``path``.

    With ``show_paths`` the sanitized full path is returned; otherwise only the
    sanitized final component plus a short hash of the path.
    """
    if not show_paths:
        short_name = sanitize_path_component(path.name or "path")
        return f"{short_name}#path:{display_path_hash(path)}"
    return sanitize_path_text(str(path))
225
-
226
-
227
def context_finding(
    rule_id: str,
    severity: str,
    path: str,
    message: str,
    action: str,
    evidence: dict[str, Any] | None = None,
) -> Finding:
    """Create a ``Finding`` whose id is the rule id suffixed with a hash of ``path``.

    The same derived value is used for both ``id`` and ``instance_id``; a
    missing or empty ``evidence`` becomes a fresh empty dict.
    """
    derived_id = f"{rule_id}-{text_hash(path)}"
    return Finding(
        id=derived_id,
        severity=severity,
        path=path,
        message=message,
        action=action,
        evidence=evidence or {},
        rule_id=rule_id,
        instance_id=derived_id,
    )
237
-
238
-
239
def root_label(root: Path, show_paths: bool) -> str:
    """Build a report label for the project root.

    With ``show_paths`` the sanitized full path is returned; otherwise the
    sanitized directory name (``project`` if empty) plus a short path hash.
    """
    if not show_paths:
        short_name = sanitize_path_component(root.name or "project")
        return f"{short_name}#path:{display_path_hash(root)}"
    return sanitize_path_text(str(root))
244
-
245
-
246
def rel_path(path: Path, root: Path) -> str:
    """Return the sanitized POSIX path of ``path`` relative to ``root``.

    If either path cannot be resolved, or ``path`` is not under ``root``, a
    ``name#path:hash`` label is returned instead.
    """
    try:
        relative = path.resolve().relative_to(root.resolve())
    except (OSError, RuntimeError, ValueError):
        fallback_name = sanitize_path_component(path.name or "path")
        return f"{fallback_name}#path:{display_path_hash(path)}"
    return sanitize_rel_path(relative.as_posix())
252
-
253
-
254
def raw_rel_path(path: Path, root: Path) -> str | None:
    """Return the unsanitized POSIX path of ``path`` relative to ``root``.

    Returns ``None`` when resolution fails or ``path`` is not under ``root``.
    """
    try:
        relative = path.resolve().relative_to(root.resolve())
    except (OSError, RuntimeError, ValueError):
        return None
    return relative.as_posix()
259
-
260
-
261
def context_surface_for_rel(raw_rel: str, name: str) -> dict[str, str] | None:
    """Classify a context file by the agent surface it belongs to.

    ``raw_rel`` is the root-relative POSIX path and ``name`` the file name.
    Returns a dict with ``surface``, ``surface_label`` and ``surface_kind``,
    or ``None`` when no surface matches. The first matching rule wins.
    """
    # Ordered (matched, surface key) pairs.
    rules = (
        (name == "CLAUDE.md" or raw_rel.startswith(".claude/"), "claude"),
        (name == "AGENTS.md", "codex"),
        (name == "GEMINI.md", "gemini"),
        (raw_rel == ".cursorrules" or raw_rel.startswith(".cursor/rules/"), "cursor"),
        (raw_rel == ".windsurfrules" or raw_rel.startswith(".windsurf/rules/"), "windsurf"),
        (raw_rel == ".clinerules" or raw_rel.startswith(".clinerules/"), "cline"),
        (raw_rel == ".github/copilot-instructions.md", "copilot"),
    )
    key = next((surface for matched, surface in rules if matched), None)
    if key is None:
        return None
    return {
        "surface": key,
        "surface_label": CONTEXT_SURFACE_LABELS.get(key, key),
        "surface_kind": "agent_rule",
    }
283
-
284
-
285
class SettingsFileTooLargeError(ValueError):
    """Raised when a settings file is larger than the scanner will read."""
287
-
288
-
289
def load_json(path: Path, root: Path) -> tuple[dict[str, Any] | None, str | None]:
    """Load a settings file as a JSON object.

    Returns ``(data, None)`` on success and ``(None, reason)`` on any failure:
    a missing file, an oversized file, invalid UTF-8, invalid JSON, any other
    ``OSError``, or a JSON document whose root is not an object.
    """
    try:
        raw = read_settings_json_bytes_no_follow(path, root)
        parsed = json.loads(raw.decode("utf-8"))
    except FileNotFoundError:
        # Must precede the generic OSError handler below.
        return None, "missing"
    except SettingsFileTooLargeError as exc:
        return None, str(exc)
    except UnicodeDecodeError as exc:
        return None, f"invalid UTF-8 near byte {exc.start}"
    except json.JSONDecodeError as exc:
        return None, f"invalid JSON at line {exc.lineno}: {exc.msg}"
    except OSError as exc:
        return None, f"unreadable: {format_os_error(exc)}"
    if isinstance(parsed, dict):
        return parsed, None
    return None, "settings root must be a JSON object"
305
-
306
-
307
def _open_regular_under_root_no_follow(root: Path, path: Path, *, path_kind: str = "settings"):
    """Open ``path`` for binary reading by descending from ``root`` one component at a time.

    Every directory component and the final file are opened relative to the
    previous directory descriptor with ``O_NOFOLLOW``, so a symlink anywhere
    below ``root`` makes the open fail instead of being followed.

    Args:
        root: Project root; it is resolved before being opened.
        path: File to open; must be located under ``root`` (resolved or as given).
        path_kind: Word used in error messages (for example ``"settings"``).

    Returns:
        A binary file object for the opened regular file.

    Raises:
        OSError: If no-follow or directory-relative opens are unsupported, the
            path is outside the root or has invalid components, a parent is not
            a directory, or the target is not a regular file.
    """
    root_resolved = root.resolve()
    nofollow = getattr(os, "O_NOFOLLOW", 0)
    if not nofollow:
        raise OSError(errno.ENOTSUP, "safe no-follow open is unavailable")
    dir_fd_capable = getattr(os, "supports_dir_fd", set())
    if os.open not in dir_fd_capable:
        raise OSError(errno.ENOTSUP, "safe directory-relative open is unavailable")

    # Accept paths expressed against either the resolved or the given root.
    try:
        relative = path.relative_to(root_resolved)
    except ValueError:
        try:
            relative = path.relative_to(root)
        except ValueError as exc:
            raise OSError(f"{path_kind} path is outside project root") from exc

    parts = relative.parts
    if not parts:
        raise OSError(errno.EINVAL, f"{path_kind} path is missing a file name")
    if any(piece in {"", ".", ".."} for piece in parts):
        raise OSError(errno.EINVAL, f"invalid {path_kind} path component")

    cloexec = getattr(os, "O_CLOEXEC", 0)
    dir_flags = os.O_RDONLY | getattr(os, "O_DIRECTORY", 0) | nofollow | cloexec
    # O_NONBLOCK is requested where available; presumably so opening a FIFO
    # cannot hang before the regular-file check below runs.
    file_flags = os.O_RDONLY | cloexec | getattr(os, "O_NONBLOCK", 0) | nofollow

    current_fd = os.open(root_resolved, dir_flags)
    try:
        if not stat.S_ISDIR(os.fstat(current_fd).st_mode):
            raise OSError(errno.ENOTDIR, f"{path_kind} root is not a directory")

        for piece in parts[:-1]:
            try:
                child_fd = os.open(piece, dir_flags, dir_fd=current_fd)
            except OSError as exc:
                if exc.errno in {errno.ENOTDIR, errno.ELOOP}:
                    raise OSError(exc.errno, f"{path_kind} parent is not a directory") from exc
                raise
            try:
                if not stat.S_ISDIR(os.fstat(child_fd).st_mode):
                    raise OSError(errno.ENOTDIR, f"{path_kind} parent is not a directory")
            except Exception:
                os.close(child_fd)
                raise
            # Step down: keep the child, release the parent.
            previous_fd, current_fd = current_fd, child_fd
            os.close(previous_fd)

        try:
            file_fd = os.open(parts[-1], file_flags, dir_fd=current_fd)
        except OSError as exc:
            if exc.errno == errno.ELOOP:
                raise OSError(errno.ELOOP, "not a regular file") from exc
            raise
        try:
            if not stat.S_ISREG(os.fstat(file_fd).st_mode):
                raise OSError(errno.EINVAL, "not a regular file")
            # On success the returned file object owns file_fd.
            return os.fdopen(file_fd, "rb")
        except Exception:
            os.close(file_fd)
            raise
    finally:
        os.close(current_fd)
377
-
378
-
379
def read_settings_json_bytes_no_follow(path: Path, root: Path) -> bytes:
    """Read a settings file's bytes through the no-follow opener, enforcing a size cap.

    The size is checked twice: once from ``fstat`` before reading, and once
    from the number of bytes actually read.

    Raises:
        SettingsFileTooLargeError: If the file exceeds ``MAX_SETTINGS_READ_BYTES``.
        OSError: Propagated from the no-follow opener.
    """
    with _open_regular_under_root_no_follow(root, path) as handle:
        reported_size = os.fstat(handle.fileno()).st_size
        if reported_size > MAX_SETTINGS_READ_BYTES:
            raise SettingsFileTooLargeError(
                f"settings file is too large ({reported_size} bytes > {MAX_SETTINGS_READ_BYTES})"
            )
        # Read one byte past the cap so an oversized read is detectable.
        data = handle.read(MAX_SETTINGS_READ_BYTES + 1)
    if len(data) > MAX_SETTINGS_READ_BYTES:
        raise SettingsFileTooLargeError(f"settings file is too large (> {MAX_SETTINGS_READ_BYTES} bytes)")
    return data
390
-
391
-
392
def iter_values(value: Any) -> Iterable[Any]:
    """Yield every leaf of a nested dict/list structure, depth-first and in order.

    Dicts contribute their values (keys are ignored); anything that is neither
    a dict nor a list is yielded as a leaf.
    """
    if isinstance(value, dict):
        children = value.values()
    elif isinstance(value, list):
        children = value
    else:
        yield value
        return
    for child in children:
        yield from iter_values(child)
401
-
402
-
403
def string_values(value: Any) -> list[str]:
    """Collect every string leaf found anywhere inside ``value``."""
    found: list[str] = []
    for leaf in iter_values(value):
        if isinstance(leaf, str):
            found.append(leaf)
    return found
405
-
406
-
407
def collect_settings(root: Path) -> tuple[list[dict[str, Any]], list[Finding]]:
    """Load the project's Claude settings files and report any that cannot be used.

    Looks at ``.claude/settings.json`` and ``.claude/settings.local.json``.

    Returns:
        ``(settings, findings)`` where each settings item is
        ``{"path": <relative label>, "data": <parsed JSON object>}``.
        A ``missing-project-settings`` finding is added when nothing loaded or
        the shared ``settings.json`` does not exist.
    """
    claude_dir = root / ".claude"
    shared_path = claude_dir / "settings.json"
    local_path = claude_dir / "settings.local.json"
    # A dangling symlink still counts as "present" so it gets reported below.
    has_project_settings = shared_path.exists() or shared_path.is_symlink()

    loaded: list[dict[str, Any]] = []
    problems: list[Finding] = []
    for candidate in (shared_path, local_path):
        if not (candidate.exists() or candidate.is_symlink()):
            continue
        rel = rel_path(candidate, root)
        data, error = load_json(candidate, root)
        if error:
            is_serious = "outside project" in error or "invalid JSON" in error
            problems.append(Finding(
                "settings-unreadable",
                "high" if is_serious else "medium",
                rel,
                f"Claude settings could not be used: {error}.",
                "Fix or remove the settings file so token-budget hooks and deny rules are predictable.",
            ))
            continue
        assert data is not None
        loaded.append({"path": rel, "data": data})

    if not (loaded and has_project_settings):
        problems.append(Finding(
            "missing-project-settings",
            "medium",
            ".claude/settings.json",
            "No shared project Claude settings file was found.",
            "Add an opt-in project .claude/settings.json with read deny rules, statusline, and Bash output trimming hook.",
        ))
    return loaded, problems
437
-
438
-
439
def merged_settings(settings: list[dict[str, Any]]) -> dict[str, Any]:
    """Combine loaded settings files into one view, later files taking precedence.

    Merge rules:
      * ``permissions.deny`` / ``permissions.allow``: string entries are
        concatenated in file order; non-string entries are dropped.
      * ``hooks``: list values for the same event are concatenated; a non-list
        value replaces whatever was there, and a list replaces a non-list.
      * ``statusLine`` (dict only), ``model``, ``effortLevel``: last one wins.
      * ``mcpServers``: dicts are merged key by key, later keys overriding.

    Hook lists are always copied into the result, so merging never mutates the
    ``data`` dicts passed in.

    Args:
        settings: Items shaped like ``{"path": ..., "data": <dict>}``.

    Returns:
        A dict that always has ``permissions``, ``hooks`` and ``mcpServers``.
    """
    merged: dict[str, Any] = {"permissions": {"deny": [], "allow": []}, "hooks": {}, "mcpServers": {}}
    for item in settings:
        data = item["data"]

        permissions = data.get("permissions")
        if isinstance(permissions, dict):
            for key in ("deny", "allow"):
                values = permissions.get(key)
                if isinstance(values, list):
                    merged["permissions"][key].extend(v for v in values if isinstance(v, str))

        hooks_by_event = data.get("hooks")
        if isinstance(hooks_by_event, dict):
            for event, hooks in hooks_by_event.items():
                if not isinstance(hooks, list):
                    merged["hooks"][event] = hooks
                    continue
                existing = merged["hooks"].get(event)
                if isinstance(existing, list):
                    existing.extend(hooks)
                else:
                    # Copy instead of aliasing the input list; otherwise a
                    # later extend would modify the caller's settings data.
                    merged["hooks"][event] = list(hooks)

        if isinstance(data.get("statusLine"), dict):
            merged["statusLine"] = data["statusLine"]
        for scalar_key in ("model", "effortLevel"):
            if scalar_key in data:
                merged[scalar_key] = data[scalar_key]
        if isinstance(data.get("mcpServers"), dict):
            merged["mcpServers"].update(data["mcpServers"])
    return merged
467
-
468
-
469
READ_TARGET_RE = re.compile(r"(?i)^\s*Read\((?P<target>.*)\)\s*$")


def normalize_read_target(value: str) -> str:
    """Normalize the target of a ``Read(...)`` rule so equivalent spellings compare equal.

    Strips surrounding whitespace and quotes, converts backslashes to ``/``,
    collapses repeated slashes, removes leading ``./`` segments and a trailing
    slash. An empty result becomes ``"."``.

    Repeated slashes are collapsed *before* ``./`` is stripped; doing it the
    other way round turned ``.//dir`` into the absolute-looking ``/dir``.
    """
    target = value.strip().strip('"').strip("'").replace("\\", "/")
    target = re.sub(r"/+", "/", target)
    while target.startswith("./"):
        target = target[2:]
    return target.rstrip("/") or "."
478
-
479
-
480
def parse_read_targets(deny_entries: list[str]) -> list[str]:
    """Extract and normalize the targets of all ``Read(...)`` entries, in order.

    Entries that are not ``Read(...)`` rules are skipped.
    """
    matches = (READ_TARGET_RE.match(entry) for entry in deny_entries)
    return [normalize_read_target(found.group("target")) for found in matches if found]
488
-
489
-
490
def path_target_denied(deny_entries: list[str], recommended: str) -> bool:
    """Return True only for exact/equivalent or intentionally broader Read denies.

    ``recommended`` is a ``Read(...)`` rule. It counts as covered when some
    deny entry normalizes to the same target, is a catch-all, or is a
    ``<base>/**`` rule whose base equals or is a parent of the target. A
    recommendation that is itself a bare catch-all is never considered covered.
    """
    wanted = parse_read_targets([recommended])
    if not wanted:
        return False
    required_target = wanted[0]
    if required_target in {"**", "*"}:
        return False

    catch_all = {"**", "*", "./**", "."}

    def covers(target: str) -> bool:
        if target in catch_all or target == required_target:
            return True
        if target.endswith("/**"):
            base = target[:-3].rstrip("/")
            if required_target == base or required_target.startswith(base + "/"):
                return True
        return target == "~/**" and required_target.startswith("~/")

    return any(covers(target) for target in parse_read_targets(deny_entries))
512
-
513
-
514
def project_path_exists(root: Path, rel: str) -> bool:
    """Report whether the project contains the path described by ``rel``.

    ``rel`` is a root-relative path, or the special pattern ``.env.*`` which
    matches any existing direct child of ``root`` whose name starts with
    ``.env.``.

    A root that cannot be listed (missing, not a directory, permission denied)
    is treated as having no ``.env.*`` files instead of raising.
    """
    if rel == ".env.*":
        try:
            return any(
                entry.name.startswith(".env.") and entry.exists()
                for entry in root.iterdir()
            )
        except OSError:
            return False
    return (root / rel).exists()
520
-
521
-
522
def generic_context_pattern(rel: str) -> str:
    """Turn a root-relative path into a tool-agnostic ignore pattern.

    Known single files and ``*.*``-suffixed globs are returned unchanged, other
    globs lose any ``./`` occurrences, and everything else is treated as a
    directory and given a ``/**`` suffix.
    """
    literal_files = (".env", ".npmrc", ".pypirc", ".netrc")
    if rel in literal_files or rel.endswith(".*"):
        return rel
    if "*" in rel:
        return rel.replace("./", "")
    return rel.rstrip("/") + "/**"
530
-
531
-
532
def context_exclusion_recommendation(
    *,
    label: str,
    rel: str,
    recommended: str,
    category: str,
    severity: str,
    deny_entries: list[str],
) -> dict[str, Any]:
    """Describe one path that should be excluded from agent context.

    The ``status`` field is ``already_denied`` when ``deny_entries`` already
    cover ``recommended`` and ``missing`` otherwise; the ``reason`` text depends
    on whether ``category`` is ``"sensitive"``.
    """
    if path_target_denied(deny_entries, recommended):
        status = "already_denied"
    else:
        status = "missing"

    if category == "sensitive":
        reason = "Sensitive local file should not be read into AI-agent context."
    else:
        reason = "Bulky generated/cache path should stay out of AI-agent context."

    return {
        "id": f"context-exclude-{safe_id_part(label)}",
        "severity": severity,
        "path": rel,
        "category": category,
        "status": status,
        "reason": reason,
        "recommended_deny": recommended,
        "generic_pattern": generic_context_pattern(rel),
        "applies_to": ["claude-permissions.deny", "agent-ignore-advisory"],
        "surfaces": ["Claude Code permissions.deny", "generic agent ignore/exclude rules"],
    }
558
-
559
-
560
def build_context_exclusion_recommendations(root: Path, deny_entries: list[str]) -> list[dict[str, Any]]:
    """List exclusion recommendations for every known bulky or sensitive path that exists.

    Results are sorted by severity rank (via ``SEVERITY_ORDER``) and then by id.
    """
    # (deny table, category, severity) — bulky paths first, then sensitive ones.
    groups = (
        (HEAVY_PROJECT_DENIES, "generated_cache", "medium"),
        (SENSITIVE_PROJECT_DENIES, "sensitive", "high"),
    )
    recommendations: list[dict[str, Any]] = []
    for denies, category, severity in groups:
        for label, rel, recommended in denies:
            if not project_path_exists(root, rel):
                continue
            recommendations.append(context_exclusion_recommendation(
                label=label,
                rel=rel,
                recommended=recommended,
                category=category,
                severity=severity,
                deny_entries=deny_entries,
            ))
    recommendations.sort(key=lambda item: (SEVERITY_ORDER.get(str(item["severity"]), 99), item["id"]))
    return recommendations
584
-
585
-
586
def scan_settings(root: Path, settings: list[dict[str, Any]]) -> tuple[dict[str, Any], list[Finding]]:
    """Check the merged Claude settings for token-budget and safety gaps.

    Findings are produced, in this order, for: existing bulky paths without a
    Read deny, existing sensitive project paths without a Read deny, home
    credential directories without a Read deny, a missing Bash trim hook, a
    missing large-read guard, a missing statusline, a broad Read allow rule,
    an Opus default model, a high default effort level, and a large number of
    MCP servers.

    Args:
        root: Project root used to test which paths exist.
        settings: Loaded settings items (``{"path": ..., "data": ...}``).

    Returns:
        ``(settings_summary, findings)``.
    """
    findings: list[Finding] = []
    merged = merged_settings(settings)
    deny_entries = merged["permissions"]["deny"]
    allow_entries = merged["permissions"]["allow"]

    # Each detector depends only on ``merged``; compute once and reuse for both
    # the findings and the summary.
    bash_trim_hook = has_bash_trim_hook(merged)
    large_read_guard = has_large_read_guard(merged)
    statusline = has_statusline(merged)

    for label, rel, recommended in HEAVY_PROJECT_DENIES:
        if project_path_exists(root, rel) and not path_target_denied(deny_entries, recommended):
            findings.append(Finding(
                f"missing-deny-{safe_id_part(label)}",
                "medium",
                rel,
                f"Bulky generated/cache path `{rel}` exists but is not denied from Read.",
                f"Add `{recommended}` to permissions.deny to avoid accidental large reads.",
                {"recommended_deny": recommended},
            ))

    for label, rel, recommended in SENSITIVE_PROJECT_DENIES:
        if project_path_exists(root, rel) and not path_target_denied(deny_entries, recommended):
            findings.append(Finding(
                f"missing-sensitive-deny-{safe_id_part(label)}",
                "high",
                rel,
                f"Sensitive project path `{rel}` exists but is not denied from Read.",
                f"Add `{recommended}` to permissions.deny; do not send secrets to Claude context.",
                {"recommended_deny": recommended},
            ))

    # Home credential paths are flagged whether or not they exist locally.
    for label, recommended in SENSITIVE_HOME_DENIES:
        if not path_target_denied(deny_entries, recommended):
            findings.append(Finding(
                f"missing-home-deny-{safe_id_part(label)}",
                "low",
                label,
                f"Home credential path `{label}` is not explicitly denied.",
                f"Add `{recommended}` to permissions.deny as a guardrail against accidental credential reads.",
                {"recommended_deny": recommended},
            ))

    if not bash_trim_hook:
        findings.append(Finding(
            "missing-bash-trim-hook",
            "medium",
            ".claude/settings.json",
            "No PreToolUse Bash hook for trimming noisy test/build/lint output was detected.",
            "Install the example hook using context-guard-rewrite-bash or rewrite_bash_for_token_budget.py.",
        ))

    if not large_read_guard:
        findings.append(Finding(
            "missing-large-read-guard",
            "medium",
            ".claude/settings.json",
            "No PreToolUse Read hook for blocking large whole-file reads was detected.",
            "Install context-guard-guard-read so Claude is nudged toward context-guard-read-symbol or line-range reads before large files enter context.",
        ))

    if not statusline:
        findings.append(Finding(
            "missing-token-statusline",
            "low",
            ".claude/settings.json",
            "No token/cost/context statusline command was detected.",
            "Add context-guard-statusline so context and cost pressure stay visible during a session.",
        ))

    # Only the first broad allow rule is reported.
    for entry in allow_entries:
        if any(target in {"**", "*", "."} for target in parse_read_targets([entry])):
            findings.append(Finding(
                "broad-read-allow",
                "medium",
                ".claude/settings.json",
                "A broad Read allow rule can make accidental large reads more likely.",
                "Prefer narrow allow rules plus explicit deny entries for generated and secret paths.",
                {"allow_entry": entry},
            ))
            break

    model = str(merged.get("model", "")).lower()
    if "opus" in model:
        findings.append(Finding(
            "opus-default-model",
            "medium",
            ".claude/settings.json",
            "Default model appears to be Opus, which can burn scarce premium tokens on routine work.",
            "Use Sonnet as the default and reserve Opus/opusplan for planning or high-risk reasoning.",
            {"model": merged.get("model")},
        ))

    effort = str(merged.get("effortLevel", "")).lower()
    if effort in {"high", "max", "maximum"}:
        findings.append(Finding(
            "high-default-effort",
            "low",
            ".claude/settings.json",
            "Default effort is high, which can increase token burn on routine edits.",
            "Use medium/low by default and raise effort only for hard design/debugging work.",
            {"effortLevel": merged.get("effortLevel")},
        ))

    mcp_servers = merged.get("mcpServers") if isinstance(merged.get("mcpServers"), dict) else {}
    # Use the shared module-level default instead of a second hard-coded copy.
    if len(mcp_servers) >= DEFAULT_MCP_SERVER_THRESHOLD:
        findings.append(Finding(
            "many-mcp-servers",
            "low",
            ".claude/settings.json",
            "Many MCP servers are configured; tool schemas and discovery can add startup/context overhead.",
            "Disable unused MCP servers for Claude sessions that do not need them.",
            {"mcp_server_count": len(mcp_servers), "mcp_servers": sorted(mcp_servers)[:20]},
        ))

    settings_summary = {
        "files": [item["path"] for item in settings],
        "deny_count": len(deny_entries),
        "allow_count": len(allow_entries),
        "has_bash_trim_hook": bash_trim_hook,
        "has_large_read_guard": large_read_guard,
        "has_statusline": statusline,
        "mcp_server_count": len(mcp_servers),
        "model": merged.get("model"),
        "effortLevel": merged.get("effortLevel"),
    }
    return settings_summary, findings
709
-
710
-
711
def has_bash_trim_hook(settings: dict[str, Any]) -> bool:
    """Detect a ``PreToolUse`` hook for Bash that runs one of the known trim commands.

    An entry qualifies when its ``matcher`` is absent/non-string or applies to
    Bash, and any string under its ``hooks``, ``command`` or ``commands`` keys
    contains one of ``BASH_TRIM_COMMAND_MARKERS``.
    """
    hooks = settings.get("hooks")
    pre_tool = hooks.get("PreToolUse") if isinstance(hooks, dict) else None
    if not isinstance(pre_tool, list):
        return False
    for entry in pre_tool:
        if not isinstance(entry, dict):
            continue
        matcher = entry.get("matcher")
        if isinstance(matcher, str) and not matcher_applies_to_bash(matcher):
            continue
        commands: list[str] = []
        for key in ("hooks", "command", "commands"):
            commands.extend(string_values(entry.get(key)))
        for cmd in commands:
            if any(marker in cmd for marker in BASH_TRIM_COMMAND_MARKERS):
                return True
    return False
732
-
733
-
734
def matcher_applies_to_bash(matcher: str) -> bool:
    """Report whether a ``|``-separated hook matcher covers the Bash tool.

    An alternative matches when, after trimming and lowercasing, it is empty,
    ``*`` or ``bash``.
    """
    accepted = {"", "*", "bash"}
    alternatives = (piece.strip().lower() for piece in matcher.split("|"))
    return not accepted.isdisjoint(alternatives)
737
-
738
-
739
def has_large_read_guard(settings: dict[str, Any]) -> bool:
    """Detect a ``PreToolUse`` hook for Read that runs one of the known guard commands.

    An entry qualifies when its ``matcher`` is absent/non-string or applies to
    Read, and any string under its ``hooks``, ``command`` or ``commands`` keys
    contains one of ``LARGE_READ_GUARD_COMMAND_MARKERS``.
    """
    hooks = settings.get("hooks")
    pre_tool = hooks.get("PreToolUse") if isinstance(hooks, dict) else None
    if not isinstance(pre_tool, list):
        return False
    for entry in pre_tool:
        if not isinstance(entry, dict):
            continue
        matcher = entry.get("matcher")
        if isinstance(matcher, str) and not matcher_applies_to_read(matcher):
            continue
        commands: list[str] = []
        for key in ("hooks", "command", "commands"):
            commands.extend(string_values(entry.get(key)))
        for cmd in commands:
            if any(marker in cmd for marker in LARGE_READ_GUARD_COMMAND_MARKERS):
                return True
    return False
760
-
761
-
762
def matcher_applies_to_read(matcher: str) -> bool:
    """Report whether a ``|``-separated hook matcher covers the Read tool.

    An alternative matches when, after trimming and lowercasing, it is empty,
    ``*`` or ``read``.
    """
    accepted = {"", "*", "read"}
    alternatives = (piece.strip().lower() for piece in matcher.split("|"))
    return not accepted.isdisjoint(alternatives)
765
-
766
-
767
def has_statusline(settings: dict[str, Any]) -> bool:
    """Detect a ``statusLine`` whose command contains one of the known statusline markers."""
    status = settings.get("statusLine")
    command = status.get("command") if isinstance(status, dict) else None
    if not isinstance(command, str):
        return False
    return any(marker in command for marker in STATUSLINE_COMMAND_MARKERS)
773
-
774
-
775
def should_scan_context_file(path: Path, root: Path) -> bool:
    """Decide whether *path* counts as a context-like file under *root*.

    Accepted when the file name is in ``CONTEXT_FILE_NAMES``, when its raw
    root-relative path is in ``CONTEXT_EXACT_REL_FILES``, or when it is a
    ``.md`` file whose sanitized relative path sits below one of
    ``CONTEXT_MD_DIRS``. A path with no raw relative form is rejected.
    """
    if path.name in CONTEXT_FILE_NAMES:
        return True
    relative_raw = raw_rel_path(path, root)
    if relative_raw is None:
        return False
    if relative_raw in CONTEXT_EXACT_REL_FILES:
        return True
    relative = sanitize_rel_path(relative_raw)
    for directory in CONTEXT_MD_DIRS:
        if relative.startswith(directory + "/") and path.suffix.lower() == ".md":
            return True
    return False
785
-
786
-
787
def iter_context_files(root: Path) -> Iterable[Path]:
    """Yield non-symlink files under *root* accepted by ``should_scan_context_file``.

    Directories named in ``EXCLUDED_DIR_NAMES`` and symlinked directories are
    pruned from the walk, and ``os.walk`` itself is told not to follow links.
    """
    for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
        here = Path(dirpath)
        kept_dirs = []
        for dirname in dirnames:
            if dirname in EXCLUDED_DIR_NAMES:
                continue
            if (here / dirname).is_symlink():
                continue
            kept_dirs.append(dirname)
        # Slice assignment mutates the list os.walk holds, which is what prunes the descent.
        dirnames[:] = kept_dirs
        for filename in filenames:
            candidate = here / filename
            if candidate.is_symlink():
                continue
            if should_scan_context_file(candidate, root):
                yield candidate
801
-
802
-
803
def read_text_prefix(path: Path, limit: int = MAX_CONTEXT_READ_BYTES, *, root: Path | None = None) -> tuple[str, bool]:
    """Read at most *limit* bytes of *path* and decode them as UTF-8.

    Returns ``(text, truncated)``; *truncated* is True when the file held more
    than *limit* bytes. Undecodable bytes are replaced rather than raised.
    With *root* given the file is opened via
    ``_open_regular_under_root_no_follow``; otherwise via
    ``open_regular_no_follow``.
    """
    if root is None:
        source = open_regular_no_follow(path)
    else:
        source = _open_regular_under_root_no_follow(root, path, path_kind="context")
    with source as stream:
        # One extra byte is requested only to detect that more data exists.
        raw = stream.read(limit + 1)
    was_truncated = len(raw) > limit
    payload = raw[:limit] if was_truncated else raw
    return payload.decode("utf-8", "replace"), was_truncated
815
-
816
-
817
def file_contains_secret(
    path: Path,
    chunk_bytes: int = 64_000,
    *,
    root: Path | None = None,
    max_total_bytes: int = MAX_SECRET_SCAN_BYTES,
) -> bool:
    """Stream *path* in chunks and report whether ``SECRET_CONTENT_RE`` matches.

    At most *max_total_bytes* are read, *chunk_bytes* at a time. The last 512
    characters of each decoded window are carried into the next one so a match
    straddling a chunk boundary can still be found. Returns False when the
    file ends or the byte budget is exhausted without a match.
    """
    if root is None:
        source = open_regular_no_follow(path)
    else:
        source = _open_regular_under_root_no_follow(root, path, path_kind="context")
    overlap = ""
    consumed = 0
    with source as stream:
        while consumed < max_total_bytes:
            chunk = stream.read(min(chunk_bytes, max_total_bytes - consumed))
            if not chunk:
                break
            consumed += len(chunk)
            window = overlap + chunk.decode("utf-8", "replace")
            if SECRET_CONTENT_RE.search(window):
                return True
            overlap = window[-512:]
    return False
844
-
845
-
846
def open_regular_no_follow(path: Path):
    """Open *path* for binary reading only if it is, and stays, a regular file.

    The path is ``lstat``-ed before and after the ``os.open`` call and the
    opened descriptor is ``fstat``-ed; all three must describe the same
    regular file. ``O_NOFOLLOW`` and ``O_NONBLOCK`` are added to the open
    flags when the platform defines them.

    Raises:
        OSError: if the path is not a regular file, if the identity checks
            disagree, or if any underlying OS call fails.
    """
    stat_before = os.lstat(path)
    if not stat.S_ISREG(stat_before.st_mode):
        raise OSError("not a regular file")
    # Missing flags fall back to 0, which leaves the flag word unchanged.
    open_flags = os.O_RDONLY | getattr(os, "O_NONBLOCK", 0) | getattr(os, "O_NOFOLLOW", 0)
    fd = os.open(path, open_flags)
    try:
        stat_opened = os.fstat(fd)
        stat_after = os.lstat(path)
        same_regular_file = (
            stat.S_ISREG(stat_opened.st_mode)
            and stat.S_ISREG(stat_after.st_mode)
            and os.path.samestat(stat_before, stat_opened)
            and os.path.samestat(stat_after, stat_opened)
        )
        if not same_regular_file:
            raise OSError("not a regular file")
        return os.fdopen(fd, "rb")
    except Exception:
        # The descriptor is still owned here whenever an exception escapes.
        os.close(fd)
        raise
872
-
873
-
874
def format_os_error(exc: OSError) -> str:
    """Render *exc* as a short reason, appending ``(errno N)`` when an errno is set.

    The reason is ``exc.strerror`` when present, else the exception class name.
    """
    reason = exc.strerror or exc.__class__.__name__
    return reason if exc.errno is None else f"{reason} (errno {exc.errno})"
879
-
880
-
881
def scan_context(root: Path, large_bytes: int, huge_bytes: int, long_lines: int) -> tuple[list[dict[str, Any]], list[Finding]]:
    """Inventory context-like files under *root* and flag oversized or risky ones.

    Args:
        root: Directory walked through ``iter_context_files``.
        large_bytes: Size at or above which a file is reported as large.
        huge_bytes: Size at or above which a file is reported as very large.
        long_lines: Sampled line count at or above which a file is reported
            as large even if it is below *large_bytes*.

    Returns:
        ``(context_files, findings)``: one summary dict per readable file, and
        the findings raised for non-regular, unreadable, large/huge,
        fence-heavy, or secret-bearing files.
    """
    context_files: list[dict[str, Any]] = []
    findings: list[Finding] = []
    for path in sorted(iter_context_files(root), key=lambda p: rel_path(p, root)):
        rel = rel_path(path, root)
        surface = context_surface_for_rel(raw_rel_path(path, root) or rel, path.name)
        try:
            st = path.lstat()
            if not stat.S_ISREG(st.st_mode):
                findings.append(context_finding(
                    "context-not-regular",
                    "medium",
                    rel,
                    "Context-like path is not a regular file.",
                    "Replace it with a regular markdown file or remove it from always-loaded context.",
                ))
                continue
            size = st.st_size
            text, sample_truncated = read_text_prefix(path, root=root)
            contains_secret = file_contains_secret(path, root=root)
        except OSError as exc:
            findings.append(context_finding(
                "context-unreadable",
                "low",
                rel,
                f"Context-like file could not be read: {format_os_error(exc)}.",
                "Check file permissions or remove stale symlinks.",
            ))
            continue
        # Count newline-terminated lines, plus one only for a trailing partial
        # line. Adding one unconditionally over-reports by a line whenever the
        # sample ends in "\n", which is the normal case for text files.
        lines = text.count("\n") + (1 if text and not text.endswith("\n") else 0)
        code_fences = text.count("```")
        item = {
            "path": rel,
            "bytes": size,
            "sampled_lines": lines,
            "sample_truncated": sample_truncated,
            "code_fences": code_fences,
        }
        if surface is not None:
            item.update(surface)
        context_files.append(item)

        # "huge" takes precedence; a file is never reported as both huge and large.
        if size >= huge_bytes:
            evidence = {"bytes": size, "threshold_bytes": huge_bytes}
            if surface is not None:
                evidence.update(surface)
            findings.append(context_finding(
                "huge-context-file",
                "high",
                rel,
                f"Context-like file is very large ({size} bytes).",
                "Move long procedures/logs/examples into opt-in skills or commands and keep only a short index in always-loaded context.",
                evidence,
            ))
        elif size >= large_bytes or lines >= long_lines:
            evidence = {"bytes": size, "large_bytes": large_bytes, "sampled_lines": lines, "long_lines": long_lines}
            if surface is not None:
                evidence.update(surface)
            findings.append(context_finding(
                "large-context-file",
                "medium",
                rel,
                f"Context-like file is large ({size} bytes, sampled {lines} lines).",
                "Trim stable instructions, move volatile or lengthy material to skills/custom commands, and keep examples short.",
                evidence,
            ))
        if code_fences >= 12:
            findings.append(context_finding(
                "context-heavy-code-fences",
                "low",
                rel,
                "Context-like file contains many code fences, which can inflate startup context.",
                "Replace long embedded examples with links or opt-in command/skill files.",
                {"code_fences": code_fences},
            ))
        if contains_secret:
            findings.append(context_finding(
                "secret-like-context-content",
                "high",
                rel,
                "Context-like file contains credential-shaped text.",
                "Remove secrets from prompt context and rotate exposed credentials if this file was shared.",
            ))
    return context_files, findings
965
-
966
-
967
def bounded_top(value: int) -> int:
    """Coerce *value* to ``int`` and clamp it to the inclusive range 1..200."""
    requested = int(value)
    if requested < 1:
        return 1
    if requested > 200:
        return 200
    return requested
969
-
970
-
971
def path_text_label(path_text: str, show_paths: bool) -> str:
    """Return a report label for *path_text*.

    With *show_paths* the sanitized path text is returned as is. Otherwise the
    label is the sanitized final path component (``"path"`` when there is
    none) followed by ``#path:`` and a hash of the sanitized text.
    """
    cleaned = sanitize_path_text(str(path_text))
    if not show_paths:
        base = sanitize_path_component(Path(cleaned).name or "path")
        return f"{base}#path:{text_hash(cleaned)}"
    return cleaned
977
-
978
-
979
def safe_report_label(value: Any, limit: int = MAX_REPORT_LABEL_CHARS) -> str:
    """Turn *value* into a single-line, redacted label.

    Whitespace runs collapse to single spaces, the text goes through
    ``sanitize_path_text``, and ``SECRET_CONTENT_RE`` matches become
    ``[REDACTED]``. Text longer than *limit* is cut and suffixed with a
    marker stating the untrimmed length.
    """
    collapsed = " ".join(str(value or "").split())
    redacted = SECRET_CONTENT_RE.sub("[REDACTED]", sanitize_path_text(collapsed))
    if len(redacted) > limit:
        suffix = f"…[trimmed:{len(redacted)} chars]"
        keep = max(0, limit - len(suffix))
        return redacted[:keep] + suffix
    return redacted
986
-
987
-
988
def json_byte_len(value: Any) -> int:
    """Return the UTF-8 byte length of *value* as compact, key-sorted JSON."""
    compact = json.dumps(value, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    encoded = compact.encode("utf-8", "replace")
    return len(encoded)
990
-
991
-
992
def iter_project_files(root: Path, suffixes: set[str], max_files: int) -> Iterable[Path]:
    """Yield non-symlink files under *root* whose lower-cased suffix is in *suffixes*.

    Directories named in ``EXCLUDED_DIR_NAMES`` and symlinked directories are
    pruned. Iteration stops once *max_files* paths have been yielded; the
    limit is checked after each yield.
    """
    emitted = 0
    for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
        here = Path(dirpath)
        kept_dirs = []
        for dirname in dirnames:
            if dirname not in EXCLUDED_DIR_NAMES and not (here / dirname).is_symlink():
                kept_dirs.append(dirname)
        # Slice assignment mutates the list os.walk holds, which is what prunes the descent.
        dirnames[:] = kept_dirs
        for filename in filenames:
            candidate = here / filename
            if candidate.is_symlink():
                continue
            if candidate.suffix.lower() not in suffixes:
                continue
            yield candidate
            emitted += 1
            if emitted >= max_files:
                return
1009
-
1010
-
1011
def walk_json(value: Any) -> Iterable[dict[str, Any]]:
    """Yield every dict reachable from *value* through nested dicts and lists.

    Traversal uses an explicit stack instead of recursion, so nodes come out
    in last-in-first-out order rather than document order.
    """
    pending = [value]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            yield node
            pending += node.values()
            continue
        if isinstance(node, list):
            pending += node
1020
-
1021
-
1022
def normalize_rule_unit(line: str, min_chars: int) -> str | None:
    """Normalize one text line into a comparable rule unit, or return None.

    A leading bullet/quote marker and then a leading ``N.``/``N)`` number are
    removed, whitespace is collapsed, and the result is lower-cased. Blank
    lines, bare code-fence or ``---`` lines, results shorter than *min_chars*,
    and results with fewer than six words yield None.
    """
    candidate = line.strip()
    if candidate in ("", "```", "---"):
        return None
    for prefix_pattern in (r"^[-*+>]\s+", r"^\d+[.)]\s+"):
        candidate = re.sub(prefix_pattern, "", candidate)
    candidate = re.sub(r"\s+", " ", candidate).strip().lower()
    if len(candidate) < min_chars or len(candidate.split()) < 6:
        return None
    return candidate
1034
-
1035
-
1036
def scan_duplicate_rules(root: Path, *, min_chars: int, top: int) -> tuple[list[dict[str, Any]], list[Finding]]:
    """Find normalized rule lines that occur in more than one context-like file.

    Each context file's sampled text is split into lines and normalized with
    ``normalize_rule_unit``. A unit is reported when it occurs at least twice
    across at least two distinct paths. Unreadable files are skipped.

    Returns:
        ``(groups, findings)``, each capped at *top*. Groups are ordered by
        descending occurrence count then fingerprint; findings by severity
        rank then id.
    """
    seen_units: dict[str, list[dict[str, Any]]] = defaultdict(list)
    ordered_files = sorted(iter_context_files(root), key=lambda p: rel_path(p, root))
    for context_path in ordered_files:
        rel = rel_path(context_path, root)
        try:
            sample, was_truncated = read_text_prefix(context_path, root=root)
        except OSError:
            continue
        for line_number, raw_line in enumerate(sample.splitlines(), 1):
            unit = normalize_rule_unit(raw_line, min_chars)
            if unit is not None:
                seen_units[unit].append({"path": rel, "line": line_number, "sample_truncated": was_truncated})

    groups: list[dict[str, Any]] = []
    findings: list[Finding] = []
    for unit, hits in seen_units.items():
        distinct_paths = sorted({hit["path"] for hit in hits})
        if not (len(hits) >= 2 and len(distinct_paths) >= 2):
            continue
        fingerprint = text_hash(unit)
        group = {
            "fingerprint": fingerprint,
            "occurrence_count": len(hits),
            "path_count": len(distinct_paths),
            "paths": distinct_paths[:top],
            "sample_chars": len(unit),
            "confidence": "observed",
        }
        groups.append(group)
        severity = "medium" if len(hits) >= 4 else "low"
        findings.append(Finding(
            f"duplicate-context-rule-{fingerprint}",
            severity,
            "context-rules",
            "A normalized instruction/rule unit appears in multiple context-like files.",
            "Keep one canonical copy and replace duplicates with a short pointer if the rule is still needed.",
            group,
            rule_id="duplicate-context-rule",
            instance_id=f"duplicate-context-rule-{fingerprint}",
        ))
    groups.sort(key=lambda entry: (-entry["occurrence_count"], entry["fingerprint"]))
    findings.sort(key=lambda entry: (SEVERITY_ORDER.get(entry.severity, 99), entry.id))
    return groups[:top], findings[:top]
1078
-
1079
-
1080
def assigned_all_names(tree: ast.AST) -> set[str]:
    """Collect string entries assigned to ``__all__`` anywhere in *tree*.

    Recognises plain assignment (``__all__ = [...]``), augmented assignment
    (``__all__ += [...]``) and annotated assignment
    (``__all__: list[str] = [...]``) whose value is a list or tuple literal.
    Non-string elements and non-literal values are ignored.

    Returns:
        The set of string constants found in those literals.
    """
    names: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            targets = node.targets
        elif isinstance(node, (ast.AugAssign, ast.AnnAssign)):
            targets = [node.target]
        else:
            continue
        # A bare annotation (`__all__: list[str]`) has value None and is skipped here.
        value = node.value
        if not isinstance(value, (ast.List, ast.Tuple)):
            continue
        if not any(isinstance(target, ast.Name) and target.id == "__all__" for target in targets):
            continue
        names.update(
            item.value
            for item in value.elts
            if isinstance(item, ast.Constant) and isinstance(item.value, str)
        )
    return names
1090
-
1091
-
1092
def scan_python_imports(root: Path, *, top: int, max_files: int) -> tuple[dict[str, Any], list[Finding]]:
    """Flag imports whose bound name is never referenced, using static AST analysis.

    Scans up to *max_files* ``.py`` files under *root*. A name counts as used
    when it appears as an ``ast.Name`` anywhere in the module or is listed in
    ``__all__``. ``__future__`` imports, star imports and names starting with
    ``_`` are never reported. Scanning stops once *top* findings exist.

    Files that cannot be read or parsed are counted under ``parse_errors``.
    So are files whose source was truncated by the read limit: a clipped
    module can still parse, and names used only in the missing tail would
    then be reported as unused.

    Returns:
        ``(category, findings)`` where *category* holds ``files_scanned``,
        ``parse_errors`` and the serialized ``unused_imports``.
    """
    findings: list[Finding] = []
    files_scanned = 0
    parse_errors = 0
    for path in iter_project_files(root, {".py"}, max_files):
        files_scanned += 1
        rel = rel_path(path, root)
        try:
            text, truncated = read_text_prefix(path, limit=MAX_CONTEXT_READ_BYTES, root=root)
            if truncated:
                parse_errors += 1
                continue
            tree = ast.parse(text, filename=rel)
        except (OSError, SyntaxError, ValueError, RecursionError):
            # RecursionError: ast.parse can hit the recursion limit on deeply nested source.
            parse_errors += 1
            continue
        imports: list[tuple[str, int, str]] = []
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    # `import a.b` binds only the top-level name `a`.
                    name = alias.asname or alias.name.split(".", 1)[0]
                    if not name.startswith("_"):
                        imports.append((name, node.lineno, alias.name))
            elif isinstance(node, ast.ImportFrom):
                if node.module == "__future__":
                    continue
                for alias in node.names:
                    if alias.name == "*":
                        continue
                    name = alias.asname or alias.name
                    if not name.startswith("_"):
                        imports.append((name, node.lineno, f"{node.module or ''}.{alias.name}".strip(".")))
        if not imports:
            continue
        used = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)} | assigned_all_names(tree)
        for name, line, module in imports:
            if name in used:
                continue
            instance = f"stale-python-import-{text_hash(f'{rel}:{line}:{name}')}"
            findings.append(Finding(
                instance,
                "low",
                rel,
                f"Python import `{name}` appears unused in static AST analysis.",
                "Review before removing; dynamic imports, re-exports, and type-checking paths can make this a false positive.",
                {"imported_name": name, "module": module, "line": line, "confidence": "advisory-static-ast"},
                rule_id="stale-python-import",
                instance_id=instance,
            ))
            if len(findings) >= top:
                break
        if len(findings) >= top:
            break
    return {"files_scanned": files_scanned, "parse_errors": parse_errors, "unused_imports": [f.as_dict() for f in findings]}, findings
1143
-
1144
-
1145
def iter_skill_files(root: Path, max_files: int) -> Iterable[Path]:
    """Yield ``SKILL.md`` files that have a ``skills`` component in their path.

    Candidates come from ``iter_project_files`` restricted to ``.md`` files,
    so *max_files* bounds both the markdown files examined and the skill
    files yielded.
    """
    emitted = 0
    for candidate in iter_project_files(root, {".md"}, max_files):
        if candidate.name != "SKILL.md" or "skills" not in candidate.parts:
            continue
        yield candidate
        emitted += 1
        if emitted >= max_files:
            break
1153
-
1154
-
1155
def safe_read_reference_text(path: Path, root: Path) -> str:
    """Return up to 128,000 bytes of *path* as lower-cased text, or ``""`` on OSError."""
    try:
        sample, _truncated = read_text_prefix(path, limit=128_000, root=root)
    except OSError:
        return ""
    else:
        return sample.lower()
1161
-
1162
-
1163
def scan_unused_skills(root: Path, *, top: int, max_files: int) -> tuple[dict[str, Any], list[Finding]]:
    """List skills whose directory name is not mentioned in other project text files.

    Reference files are those with a suffix in ``TEXT_REFERENCE_SUFFIXES``,
    excluding any file named ``SKILL.md``. Their lower-cased text is searched
    for the skill's directory name in three forms (bare, ``/name`` and
    ``context-guard:name``). Skills with no hit become low-severity advisory
    findings; scanning stops once *top* findings exist.

    Returns:
        ``(category, findings)`` where *category* holds ``skills_scanned``,
        ``reference_files_scanned`` and ``unused_candidates``.
    """
    skill_files = list(iter_skill_files(root, max_files))
    reference_files = [
        candidate
        for candidate in iter_project_files(root, TEXT_REFERENCE_SUFFIXES, max_files)
        if candidate.name != "SKILL.md"
    ]
    reference_text = {ref: safe_read_reference_text(ref, root) for ref in reference_files}
    findings: list[Finding] = []
    candidates: list[dict[str, Any]] = []
    for skill in skill_files:
        skill_name = skill.parent.name
        lowered = skill_name.lower()
        needle_forms = {lowered, f"/{lowered}", f"context-guard:{lowered}"}
        # Only presence matters, so the search stops at the first referencing file.
        is_referenced = any(
            ref_path != skill and any(needle in text for needle in needle_forms)
            for ref_path, text in reference_text.items()
        )
        if is_referenced:
            continue
        rel = rel_path(skill, root)
        candidate = {"path": rel, "skill": safe_report_label(skill_name), "reference_count": 0, "confidence": "low-advisory"}
        candidates.append(candidate)
        instance = f"unused-skill-candidate-{text_hash(rel)}"
        findings.append(Finding(
            instance,
            "low",
            rel,
            "Skill file has no obvious project-local references outside its own SKILL.md.",
            "Confirm real usage through plugin manifests, user docs, or runtime telemetry before deleting or renaming it.",
            candidate,
            rule_id="unused-skill-candidate",
            instance_id=instance,
        ))
        if len(findings) >= top:
            break
    summary = {
        "skills_scanned": len(skill_files),
        "reference_files_scanned": len(reference_files),
        "unused_candidates": candidates[:top],
    }
    return summary, findings
1197
-
1198
-
1199
def read_json_file_limited(path: Path, max_bytes: int) -> tuple[Any | None, str | None, int]:
    """Load JSON from *path*, refusing files larger than *max_bytes*.

    Returns:
        ``(value, error, size)``. On success *error* is None and *size* is the
        number of bytes read. On failure *value* is None and *error* explains
        why: oversized, invalid JSON, nesting too deep, or unreadable. *size*
        is the observed size for oversized files and 0 for the other failures.
    """
    try:
        with open_regular_no_follow(path) as handle:
            size = os.fstat(handle.fileno()).st_size
            if size > max_bytes:
                return None, f"skipped oversized file ({size} bytes > {max_bytes})", size
            # Read one byte past the limit in case the file grew after fstat.
            data = handle.read(max_bytes + 1)
        if len(data) > max_bytes:
            return None, f"skipped oversized file (> {max_bytes} bytes)", len(data)
        return json.loads(data.decode("utf-8", "replace")), None, len(data)
    except json.JSONDecodeError as exc:
        return None, f"invalid JSON at line {exc.lineno}: {exc.msg}", 0
    except RecursionError:
        # Deeply nested input exhausts the decoder's recursion budget; report
        # it as a skipped file instead of letting it abort the whole command.
        return None, "invalid JSON: nesting too deep", 0
    except (OSError, UnicodeDecodeError) as exc:
        return None, f"unreadable: {format_os_error(exc) if isinstance(exc, OSError) else exc.__class__.__name__}", 0
1213
-
1214
-
1215
def tool_name_from_schema(d: dict[str, Any]) -> str | None:
    """Return a report-safe tool name from the first usable naming key of *d*.

    Keys are tried in the order ``name``, ``tool``, ``id``, ``title``; the
    first non-blank string value is passed through ``safe_report_label``.
    Returns None when no such value exists.
    """
    candidates = (d.get(key) for key in ("name", "tool", "id", "title"))
    for candidate in candidates:
        if isinstance(candidate, str) and candidate.strip():
            return safe_report_label(candidate)
    return None
1221
-
1222
-
1223
def collect_tool_schemas(raw: Any) -> list[dict[str, Any]]:
    """Extract tool-schema summaries from an arbitrary JSON document.

    A dict counts as a tool when ``tool_name_from_schema`` finds a name and it
    carries at least one of ``inputSchema``, ``input_schema``, ``schema``,
    ``parameters`` or ``description``. Entries sharing the same
    ``(name, server)`` pair are collapsed to the one with the largest
    serialized size.
    """
    schema_markers = ("inputSchema", "input_schema", "schema", "parameters", "description")
    largest_by_identity: dict[tuple[str, str | None], dict[str, Any]] = {}
    for node in walk_json(raw):
        tool_name = tool_name_from_schema(node)
        if not tool_name:
            continue
        if all(marker not in node for marker in schema_markers):
            continue
        raw_server = node.get("server")
        server_label = safe_report_label(raw_server) if isinstance(raw_server, str) else None
        entry = {"name": tool_name, "schema_bytes": json_byte_len(node), "server": server_label}
        identity = (tool_name, server_label)
        current = largest_by_identity.get(identity)
        # Strictly-greater keeps the first-seen entry on ties.
        if current is None or int(entry["schema_bytes"]) > int(current["schema_bytes"]):
            largest_by_identity[identity] = entry
    return list(largest_by_identity.values())
1240
-
1241
-
1242
def scan_tool_catalogs(root: Path, args: argparse.Namespace, settings: list[dict[str, Any]], *, top: int) -> tuple[dict[str, Any], list[Finding]]:
    """Report MCP-server and tool-catalog overhead from settings and catalog files.

    Emits one finding when the merged settings configure at least
    ``args.mcp_server_threshold`` MCP servers. Each path in
    ``args.tool_catalog`` is then loaded with a size limit; a catalog yields a
    finding when it holds at least ``args.tool_count_threshold`` tools, plus
    one finding per tool whose schema reaches ``args.large_schema_bytes``
    (at most *top* per catalog).

    Returns:
        ``(category, findings)`` with catalogs capped at *top* and findings
        capped at ``max(top, 1) * 2``.
    """
    findings: list[Finding] = []
    catalogs: list[dict[str, Any]] = []
    configured_servers = merged_settings(settings).get("mcpServers")
    mcp_servers = configured_servers if isinstance(configured_servers, dict) else {}
    server_count = len(mcp_servers)
    if server_count >= args.mcp_server_threshold:
        evidence = {"mcp_server_count": server_count, "threshold": args.mcp_server_threshold, "confidence": "observed-settings"}
        findings.append(Finding(
            "excessive-mcp-servers",
            "low",
            ".claude/settings.json",
            "Project Claude settings configure many MCP servers, which can increase tool discovery/schema overhead.",
            "Disable unused MCP servers for sessions that do not need them; keep this advisory until task-specific need is known.",
            evidence,
            rule_id="excessive-mcp-servers",
            instance_id="excessive-mcp-servers",
        ))

    catalog_paths = getattr(args, "tool_catalog", []) or []
    for raw_path in catalog_paths:
        catalog_path = safe_resolve(Path(raw_path).expanduser())
        label = path_text_label(str(catalog_path), args.show_paths)
        raw, error, size = read_json_file_limited(catalog_path, args.max_tool_catalog_bytes)
        if error:
            catalogs.append({"path": label, "status": "skipped", "reason": error, "bytes": size})
            continue
        tools = collect_tool_schemas(raw)
        tool_count = len(tools)
        total_schema_bytes = sum(int(tool["schema_bytes"]) for tool in tools)
        oversized = [tool for tool in tools if int(tool["schema_bytes"]) >= args.large_schema_bytes]
        oversized.sort(key=lambda entry: (-int(entry["schema_bytes"]), entry["name"]))
        large_tools = oversized[:top]
        catalogs.append({
            "path": label,
            "status": "scanned",
            "tool_count": tool_count,
            "schema_bytes": total_schema_bytes,
            "large_schema_tools": large_tools,
        })
        if tool_count >= args.tool_count_threshold:
            instance = f"excessive-tool-catalog-{text_hash(label)}"
            findings.append(Finding(
                instance,
                "medium",
                label,
                "Local tool catalog contains many tools for one task context.",
                "Use context-guard-tool-prune or a task-specific tool allowlist before injecting full schemas.",
                {"tool_count": tool_count, "threshold": args.tool_count_threshold, "schema_bytes": total_schema_bytes, "confidence": "observed-catalog"},
                rule_id="excessive-tool-catalog",
                instance_id=instance,
            ))
        for tool in large_tools:
            instance = f"large-tool-schema-{text_hash(label + ':' + tool['name'])}"
            findings.append(Finding(
                instance,
                "low",
                label,
                "A local tool schema is large enough to dominate narrow task context.",
                "Prefer a bounded top-k schema report and retrieve the full sanitized schema only when needed.",
                {"tool_name": tool["name"], "schema_bytes": tool["schema_bytes"], "threshold": args.large_schema_bytes, "confidence": "observed-catalog"},
                rule_id="large-tool-schema",
                instance_id=instance,
            ))
    category = {"mcp_server_count": server_count, "catalogs": catalogs[:top]}
    return category, findings[: max(top, 1) * 2]
1296
-
1297
-
1298
def iter_log_candidates(root: Path, log_paths: list[str], max_files: int) -> Iterable[Path]:
    """Yield log files from explicit *log_paths* and the default log roots.

    The default roots are ``root/.claude`` and ``root/.codex``. A candidate
    that is a file is yielded when its suffix is ``.json``, ``.jsonl``,
    ``.ndjson`` or ``.log``; a candidate directory is walked for such files,
    skipping symlinks and ``EXCLUDED_DIR_NAMES``. Each path is yielded at most
    once, so an explicit log path that also sits under a default root is not
    scanned twice. Iteration stops once *max_files* paths have been yielded.
    """
    log_suffixes = {".json", ".jsonl", ".ndjson", ".log"}
    candidates: list[Path] = []
    explicit = [Path(item).expanduser() for item in log_paths]
    default_roots = [root / ".claude", root / ".codex"]
    for path in explicit + default_roots:
        try:
            resolved = safe_resolve(path)
        except OSError:
            resolved = path
        if resolved.exists() and not resolved.is_symlink():
            candidates.append(resolved)
    # Paths already handed out. NOTE(review): this relies on safe_resolve
    # returning the same canonical form that os.walk builds beneath a resolved
    # directory — confirm against safe_resolve.
    seen: set[Path] = set()
    yielded = 0
    for candidate in candidates:
        if candidate.is_file() and candidate.suffix.lower() in log_suffixes:
            if candidate not in seen:
                seen.add(candidate)
                yield candidate
                yielded += 1
        elif candidate.is_dir():
            for dirpath, dirnames, filenames in os.walk(candidate, followlinks=False):
                current = Path(dirpath)
                # Slice assignment mutates the list os.walk holds, which is what prunes the descent.
                dirnames[:] = [name for name in dirnames if name not in EXCLUDED_DIR_NAMES and not (current / name).is_symlink()]
                for name in filenames:
                    path = current / name
                    if path.is_symlink() or path.suffix.lower() not in log_suffixes:
                        continue
                    if path in seen:
                        continue
                    seen.add(path)
                    yield path
                    yielded += 1
                    if yielded >= max_files:
                        return
        if yielded >= max_files:
            return
1328
-
1329
-
1330
def parse_possible_json(value: Any) -> Any:
    """Decode *value* when it is a string that looks like a JSON object or array.

    Non-strings, strings that do not start with ``{`` or ``[`` after
    stripping, and strings that fail to decode are returned unchanged. That
    includes input nested so deeply that the decoder raises RecursionError.
    """
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    if not stripped or stripped[0] not in "[{":
        return value
    try:
        return json.loads(stripped)
    except (json.JSONDecodeError, RecursionError):
        # Best effort: anything that is not cleanly decodable stays a plain string.
        return value
1339
-
1340
-
1341
def call_name(d: dict[str, Any]) -> str | None:
    """Return the tool-call name carried by *d*, or None when *d* is not a call.

    The first non-blank string under any of ``TOOL_CALL_NAME_KEYS`` wins.
    Failing that, ``d["name"]`` is used when *d* has a tool-call ``type`` or
    contains one of ``TOOL_CALL_INPUT_KEYS``. Names are stripped and cut to
    120 characters.
    """
    for name_key in TOOL_CALL_NAME_KEYS:
        explicit = d.get(name_key)
        if isinstance(explicit, str) and explicit.strip():
            return explicit.strip()[:120]
    record_type = str(d.get("type") or "").lower()
    generic_name = d.get("name")
    if not isinstance(generic_name, str) or not generic_name.strip():
        return None
    looks_like_call = record_type in {"tool_use", "tool_call", "function_call"} or any(
        input_key in d for input_key in TOOL_CALL_INPUT_KEYS
    )
    if looks_like_call:
        return generic_name.strip()[:120]
    return None
1351
-
1352
-
1353
def call_input(d: dict[str, Any]) -> Any:
    """Return the call's input payload, decoding JSON-looking strings.

    The first of ``TOOL_CALL_INPUT_KEYS`` present in *d* is used; an empty
    dict is returned when none is present.
    """
    for input_key in TOOL_CALL_INPUT_KEYS:
        if input_key not in d:
            continue
        return parse_possible_json(d[input_key])
    return {}
1358
-
1359
-
1360
def sanitized_fingerprint_value(value: Any) -> Any:
    """Return a sanitized copy of *value* suitable for hashing into a fingerprint.

    Dict keys are sanitized with ``sanitize_path_component`` and visited in
    order of their string form; lists are cut to their first 20 items;
    strings are path-sanitized, secret-redacted and cut to 500 characters.
    Other values pass through unchanged.
    """
    if isinstance(value, str):
        return SECRET_CONTENT_RE.sub("[REDACTED]", sanitize_path_text(value))[:500]
    if isinstance(value, list):
        return [sanitized_fingerprint_value(element) for element in value[:20]]
    if isinstance(value, dict):
        ordered_items = sorted(value.items(), key=lambda pair: str(pair[0]))
        # Keys that sanitize to the same text collapse; the later one wins.
        return {
            sanitize_path_component(str(raw_key)): sanitized_fingerprint_value(element)
            for raw_key, element in ordered_items
        }
    return value
1372
-
1373
-
1374
def find_path_argument(value: Any) -> str | None:
    """Return the first non-blank string stored under a ``FILE_PATH_KEYS`` key.

    *value* is first passed through ``parse_possible_json`` and then searched
    through nested dicts and lists with an explicit stack. Returns the
    stripped string, or None when nothing matches.
    """
    pending = [parse_possible_json(value)]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        for raw_key, element in node.items():
            is_path_key = str(raw_key) in FILE_PATH_KEYS
            if is_path_key and isinstance(element, str) and element.strip():
                return element.strip()
            pending.append(element)
    return None
1386
-
1387
-
1388
def is_read_tool(name: str) -> bool:
    """Return True when *name* denotes a file-read tool.

    The name is lower-cased with ``-`` replaced by ``_``. It matches when the
    whole name or the segment after its last ``.`` is in ``READ_TOOL_NAMES``,
    or when it contains ``read_file``.
    """
    normalized = name.lower().replace("-", "_")
    if normalized in READ_TOOL_NAMES:
        return True
    _, _, last_segment = normalized.rpartition(".")
    return last_segment in READ_TOOL_NAMES or "read_file" in normalized
1392
-
1393
-
1394
def scan_logs(root: Path, args: argparse.Namespace, *, top: int) -> tuple[dict[str, Any], list[Finding]]:
    """Detect repeated file reads and duplicate tool calls in local log files.

    Log files come from ``iter_log_candidates``. ``.json`` files are parsed as
    one document (a list is treated as a list of records); every other file is
    parsed line by line as JSON. Files over ``args.max_log_bytes`` and lines
    over ``args.max_log_line_bytes`` are skipped. Each dict with a call name
    is fingerprinted from its sanitized input; calls and read paths seen at
    least ``args.duplicate_call_threshold`` times become findings.

    Records that cannot be decoded or fingerprinted, including ones nested
    deeply enough to raise RecursionError, are counted in ``skipped_records``
    instead of aborting the scan.

    Returns:
        ``(category, findings)`` with findings capped at ``top * 2``.
    """
    tool_counts: Counter[tuple[str, str]] = Counter()
    tool_files: dict[tuple[str, str], set[str]] = defaultdict(set)
    read_counts: Counter[str] = Counter()
    read_labels: dict[str, str] = {}
    read_tools: dict[str, set[str]] = defaultdict(set)
    files_scanned = 0
    records_scanned = 0
    skipped_files: list[dict[str, Any]] = []
    skipped_records = 0
    for path in iter_log_candidates(root, getattr(args, "log_path", []) or [], args.max_structural_files):
        label = path_text_label(str(path), args.show_paths)
        try:
            with open_regular_no_follow(path) as handle:
                size = os.fstat(handle.fileno()).st_size
                if size > args.max_log_bytes:
                    skipped_files.append({"path": label, "reason": f"oversized:{size}>{args.max_log_bytes}"})
                    continue
                # Read one byte past the limit in case the file grew after fstat.
                data = handle.read(args.max_log_bytes + 1)
            if len(data) > args.max_log_bytes:
                skipped_files.append({"path": label, "reason": f"oversized:>{args.max_log_bytes}"})
                continue
        except OSError as exc:
            skipped_files.append({"path": label, "reason": format_os_error(exc)})
            continue
        files_scanned += 1
        text = data.decode("utf-8", "replace")
        raw_records: list[Any] = []
        if path.suffix.lower() == ".json":
            try:
                parsed = json.loads(text)
                raw_records = parsed if isinstance(parsed, list) else [parsed]
            except (json.JSONDecodeError, RecursionError):
                skipped_records += 1
                continue
        else:
            for raw_line in text.splitlines():
                if len(raw_line.encode("utf-8", "replace")) > args.max_log_line_bytes:
                    skipped_records += 1
                    continue
                if not raw_line.strip():
                    continue
                try:
                    raw_records.append(json.loads(raw_line))
                except (json.JSONDecodeError, RecursionError):
                    skipped_records += 1
        for record in raw_records:
            records_scanned += 1
            for d in walk_json(record):
                name = call_name(d)
                if not name:
                    continue
                value = call_input(d)
                try:
                    fp = text_hash(json.dumps(sanitized_fingerprint_value(value), ensure_ascii=False, sort_keys=True, default=str))
                except RecursionError:
                    # Sanitizing and serializing recurse per nesting level; a
                    # pathologically deep input is skipped rather than fatal.
                    skipped_records += 1
                    continue
                key = (name, fp)
                tool_counts[key] += 1
                tool_files[key].add(label)
                if is_read_tool(name):
                    path_arg = find_path_argument(value)
                    if path_arg:
                        read_fp = text_hash(sanitize_path_text(path_arg))
                        read_counts[read_fp] += 1
                        read_labels[read_fp] = path_text_label(path_arg, args.show_paths)
                        read_tools[read_fp].add(name)
    findings: list[Finding] = []
    repeated_reads: list[dict[str, Any]] = []
    for fp, count in read_counts.most_common(top):
        if count < args.duplicate_call_threshold:
            continue
        item = {"path": read_labels[fp], "path_fingerprint": fp, "read_count": count, "tools": sorted(safe_report_label(name) for name in read_tools[fp]), "confidence": "observed-log"}
        repeated_reads.append(item)
        instance = f"repeated-file-read-{fp}"
        findings.append(Finding(
            instance,
            "medium",
            "local-logs",
            "The same file path appears to be read repeatedly in local tool-call logs.",
            "Use search/symbol/slice reads or a local artifact receipt instead of repeating whole-file reads.",
            item,
            rule_id="repeated-file-read",
            instance_id=instance,
        ))
    duplicate_calls: list[dict[str, Any]] = []
    for (name, fp), count in tool_counts.most_common(top * 2):
        if count < args.duplicate_call_threshold:
            continue
        item = {"tool_name": safe_report_label(name), "input_fingerprint": fp, "call_count": count, "log_files": sorted(tool_files[(name, fp)])[:top], "confidence": "observed-log"}
        duplicate_calls.append(item)
        instance = f"duplicate-tool-call-{text_hash(name + ':' + fp)}"
        findings.append(Finding(
            instance,
            "low" if count < args.duplicate_call_threshold * 2 else "medium",
            "local-logs",
            "A tool call with the same sanitized input fingerprint repeats in local logs.",
            "Avoid replaying identical calls; keep one receipt or summarize the result before retrying.",
            item,
            rule_id="duplicate-tool-call",
            instance_id=instance,
        ))
        if len(duplicate_calls) >= top:
            break
    return {
        "files_scanned": files_scanned,
        "records_scanned": records_scanned,
        "skipped_files": skipped_files[:top],
        "skipped_records": skipped_records,
        "repeated_file_reads": repeated_reads[:top],
        "duplicate_tool_calls": duplicate_calls[:top],
    }, findings[: top * 2]
1503
-
1504
-
1505
def structural_summary(findings: list[Finding]) -> dict[str, Any]:
    """Summarize *findings* as a total plus per-rule and per-severity counts.

    A finding is tallied under its ``rule_id`` when that is truthy, else
    under its ``id``. Both count maps are emitted with keys in sorted order.
    """
    rule_tally: Counter[str] = Counter()
    severity_tally: Counter[str] = Counter()
    for finding in findings:
        rule_tally[finding.rule_id or finding.id] += 1
        severity_tally[finding.severity] += 1
    return {
        "finding_count": len(findings),
        "by_rule": {rule: rule_tally[rule] for rule in sorted(rule_tally)},
        "by_severity": {level: severity_tally[level] for level in sorted(severity_tally)},
    }
1513
-
1514
-
1515
def build_structural_waste_report(args: argparse.Namespace) -> dict[str, Any]:
    """Run every structural-waste scanner on ``args.path`` and assemble the report.

    Exits via ``SystemExit`` when the resolved path is not an existing
    directory. Otherwise collects rule-file, Python-import, skill,
    tool-schema and local-log findings, sorts them by severity rank, rule and
    path, and returns a report dict that also echoes the limits in effect.
    At most ``top * 10`` findings are serialized into ``findings``.
    """
    root = safe_resolve(Path(args.path).expanduser())
    try:
        is_scan_root = root.exists() and root.is_dir()
    except OSError:
        is_scan_root = False
    if not is_scan_root:
        raise SystemExit(f"context-guard-diet: structural-waste path is not a directory: {path_label(root, args.show_paths)}")

    top = bounded_top(args.top)
    settings, _settings_findings = collect_settings(root)
    context_files, context_findings = scan_context(root, args.large_context_bytes, args.huge_context_bytes, args.long_context_lines)
    # Only size and fence findings from the context scan belong in this report.
    rule_file_rules = {"large-context-file", "huge-context-file", "context-heavy-code-fences"}
    oversized_rule_findings = [
        finding for finding in context_findings if (finding.rule_id or finding.id) in rule_file_rules
    ]
    duplicate_rule_groups, duplicate_rule_findings = scan_duplicate_rules(root, min_chars=args.duplicate_rule_min_chars, top=top)
    imports_category, import_findings = scan_python_imports(root, top=top, max_files=args.max_structural_files)
    skills_category, skill_findings = scan_unused_skills(root, top=top, max_files=args.max_structural_files)
    tools_category, tool_findings = scan_tool_catalogs(root, args, settings, top=top)
    logs_category, log_findings = scan_logs(root, args, top=top)

    findings = [
        *oversized_rule_findings,
        *duplicate_rule_findings,
        *import_findings,
        *skill_findings,
        *tool_findings,
        *log_findings,
    ]
    findings.sort(key=lambda finding: (SEVERITY_ORDER.get(finding.severity, 99), finding.rule_id or finding.id, finding.path))

    report: dict[str, Any] = {
        "tool": "context-guard-diet",
        "mode": "structural-waste",
        "schema_version": STRUCTURAL_WASTE_SCHEMA_VERSION,
        "root": root_label(root, args.show_paths),
        "read_only": True,
        "network": "not-used",
        "destructive_actions": [],
    }
    # Every limit except "top" is echoed straight from the parsed arguments.
    echoed_limits = (
        "max_structural_files",
        "large_context_bytes",
        "huge_context_bytes",
        "long_context_lines",
        "duplicate_rule_min_chars",
        "duplicate_call_threshold",
        "mcp_server_threshold",
        "tool_count_threshold",
        "large_schema_bytes",
        "max_tool_catalog_bytes",
        "max_log_bytes",
        "max_log_line_bytes",
    )
    limits: dict[str, Any] = {"top": top}
    for limit_name in echoed_limits:
        limits[limit_name] = getattr(args, limit_name)
    report["limits"] = limits
    report["summary"] = structural_summary(findings)
    report["categories"] = {
        "rule_files": {
            "context_files_scanned": len(context_files),
            "oversized_or_heavy": [finding.as_dict() for finding in oversized_rule_findings[:top]],
            "duplicate_rule_groups": duplicate_rule_groups,
        },
        "python_imports": imports_category,
        "skills": skills_category,
        "tool_schemas": tools_category,
        "local_logs": logs_category,
    }
    report["finding_count"] = len(findings)
    report["findings"] = [finding.as_dict() for finding in findings[: top * 10]]
    report["caveats"] = [
        "Structural-waste diagnostics are advisory heuristics; verify before deleting rules, imports, skills, or tools.",
        "No network calls or destructive actions are performed by this command.",
        "Local log diagnostics use sanitized input fingerprints and do not print raw prompt, command, or tool-input text.",
        "Unused-skill and stale-import candidates can be false positives when usage is dynamic or outside the scanned project.",
    ]
    return report
1578
-
1579
-
1580
def print_structural_waste_text(report: dict[str, Any]) -> None:
    """Write a structural-waste report to stdout as plain text.

    Reads the ``root``, ``summary`` (``finding_count`` and ``by_rule``) and
    ``findings`` keys of *report*. Each finding must provide ``severity``,
    ``rule_id``, ``path``, ``message`` and ``action``.
    """
    print("ContextGuard structural-waste diagnostics")
    print("root: {}".format(report["root"]))
    print("read_only: yes network: not-used destructive_actions: none")

    totals = report["summary"]
    # by_rule is dumped with sorted keys so the line is stable between runs.
    print(
        "findings: {} by_rule={}".format(
            totals["finding_count"],
            json.dumps(totals["by_rule"], sort_keys=True),
        )
    )

    entries = report["findings"]
    if entries:
        print("\nFindings:")
        for entry in entries:
            level = entry["severity"].upper()
            print(f"- [{level}] {entry['rule_id']} @ {entry['path']}")
            print(f" why: {entry['message']}")
            print(f" fix: {entry['action']}")
    else:
        print("\nFindings:\n- none")
1594
-
1595
-
1596
# Sort rank for each severity label (lower sorts first). The report builders
# look labels up with ``.get(severity, 99)``, so unknown labels sort last.
SEVERITY_ORDER = dict(high=0, medium=1, low=2)
1597
-
1598
-
1599
def build_report(args: argparse.Namespace) -> dict[str, Any]:
    """Build the ``scan`` report for the directory named by ``args.path``.

    Args:
        args: Parsed ``scan`` arguments. Reads ``path``, ``show_paths``,
            ``top``, ``large_context_bytes``, ``huge_context_bytes`` and
            ``long_context_lines``.

    Returns:
        A dict with the tool name, root label, settings summary, the largest
        context-like files, context-exclusion recommendations, and all
        findings sorted by severity, id and path.

    Raises:
        SystemExit: If the resolved path is not a directory, or cannot be
            inspected because of an ``OSError``.
    """
    root = safe_resolve(Path(args.path).expanduser())
    try:
        # is_dir() is already False for a missing path, so no exists() check is needed.
        is_scan_root = root.is_dir()
    except OSError:
        is_scan_root = False
    if not is_scan_root:
        raise SystemExit(f"context-guard-diet: scan path is not a directory: {path_label(root, args.show_paths)}")

    # A negative --top used as a slice bound would drop the last N entries
    # instead of limiting the list, so clamp it to zero.
    top = max(0, args.top)

    settings, settings_findings = collect_settings(root)
    settings_summary, config_findings = scan_settings(root, settings)
    context_files, context_findings = scan_context(root, args.large_context_bytes, args.huge_context_bytes, args.long_context_lines)
    deny_entries = merged_settings(settings)["permissions"]["deny"]
    exclusion_recommendations = build_context_exclusion_recommendations(root, deny_entries)

    findings = settings_findings + config_findings + context_findings
    # Severity labels missing from SEVERITY_ORDER get rank 99 and sort last.
    findings.sort(key=lambda item: (SEVERITY_ORDER.get(item.severity, 99), item.id, item.path))
    return {
        "tool": "context-guard-diet",
        "root": root_label(root, args.show_paths),
        "settings": settings_summary,
        "context_files": sorted(context_files, key=lambda item: item["bytes"], reverse=True)[:top],
        "context_exclusion_recommendations": exclusion_recommendations[:top],
        "finding_count": len(findings),
        "findings": [item.as_dict() for item in findings],
    }
1623
-
1624
-
1625
def print_text(report: dict[str, Any]) -> None:
    """Write a ``scan`` report to stdout as plain text.

    Prints the root, a one-line settings summary, the context-like files
    (when any), the context-exclusion recommendations (when present and
    non-empty), and the findings, or ``- none`` when there are no findings.
    """

    def yes_no(flag: Any) -> str:
        return "yes" if flag else "no"

    print("Claude token diet scan")
    print("root: {}".format(report["root"]))

    settings = report["settings"]
    summary_parts = [
        f"files={len(settings['files'])}",
        f"deny={settings['deny_count']}",
        f"trim_hook={yes_no(settings['has_bash_trim_hook'])}",
        f"read_guard={yes_no(settings['has_large_read_guard'])}",
        f"statusline={yes_no(settings['has_statusline'])}",
        f"mcp={settings['mcp_server_count']}",
    ]
    print("settings: " + " ".join(summary_parts))

    context_files = report["context_files"]
    if context_files:
        print("\nTop context-like files:")
        for entry in context_files:
            # The surface suffix is shown only when the entry has a truthy value for it.
            suffix = ""
            if entry.get("surface"):
                suffix = f", surface={entry['surface']}"
            print(f"- {entry['path']} ({entry['bytes']} bytes, sampled_lines={entry['sampled_lines']}{suffix})")

    recommendations = report.get("context_exclusion_recommendations")
    if recommendations:
        print("\nContext exclusion recommendations:")
        for rec in recommendations:
            state = rec.get("status", "missing")
            level = rec["severity"].upper()
            print(f"- [{level}] {rec['id']} @ {rec['path']} ({state})")
            print(f" claude: {rec['recommended_deny']}")
            print(f" generic: {rec['generic_pattern']}")

    print("\nFindings:")
    findings = report["findings"]
    if not findings:
        print("- none")
        return
    for entry in findings:
        level = entry["severity"].upper()
        print(f"- [{level}] {entry['id']} @ {entry['path']}")
        print(f" why: {entry['message']}")
        print(f" fix: {entry['action']}")
1657
-
1658
-
1659
def main() -> int:
    """Parse the command line, run the chosen sub-command and return an exit status.

    Two sub-commands are registered: ``scan`` and ``structural-waste``. Each
    builds a report and prints it either as JSON (``--json``) or as plain
    text. Returns 0 once the report has been printed.
    """
    parser = argparse.ArgumentParser(prog="context-guard-diet")
    commands = parser.add_subparsers(dest="command", required=True)

    # Integer thresholds shared by both sub-commands, in registration order.
    context_size_options = (
        ("--large-context-bytes", DEFAULT_LARGE_CONTEXT_BYTES),
        ("--huge-context-bytes", DEFAULT_HUGE_CONTEXT_BYTES),
        ("--long-context-lines", DEFAULT_LONG_CONTEXT_LINES),
    )
    # Integer thresholds that only the structural-waste sub-command accepts.
    structural_only_options = (
        ("--duplicate-rule-min-chars", DEFAULT_DUPLICATE_RULE_MIN_CHARS),
        ("--duplicate-call-threshold", DEFAULT_DUPLICATE_CALL_THRESHOLD),
        ("--mcp-server-threshold", DEFAULT_MCP_SERVER_THRESHOLD),
        ("--tool-count-threshold", DEFAULT_TOOL_COUNT_THRESHOLD),
        ("--large-schema-bytes", DEFAULT_LARGE_SCHEMA_BYTES),
        ("--max-tool-catalog-bytes", DEFAULT_MAX_TOOL_CATALOG_BYTES),
        ("--max-log-bytes", DEFAULT_MAX_LOG_BYTES),
        ("--max-log-line-bytes", DEFAULT_MAX_LOG_LINE_BYTES),
        ("--max-structural-files", DEFAULT_MAX_STRUCTURAL_FILES),
    )

    scan_cmd = commands.add_parser("scan", help="scan project settings and context files for token-diet gaps")
    scan_cmd.add_argument("path", nargs="?", default=".")
    scan_cmd.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    scan_cmd.add_argument("--show-paths", action="store_true", help="show raw absolute root path instead of a stable anonymized root label; local debugging only because private paths may be exposed")
    scan_cmd.add_argument("--top", type=int, default=20, help="maximum context-like files and context-exclusion recommendations to list")
    for flag, default in context_size_options:
        scan_cmd.add_argument(flag, type=int, default=default)

    waste_cmd = commands.add_parser("structural-waste", help="run local read-only structural waste diagnostics")
    waste_cmd.add_argument("path", nargs="?", default=".")
    waste_cmd.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    waste_cmd.add_argument("--show-paths", action="store_true", help="show raw local paths for debugging; secret-shaped path components remain redacted")
    waste_cmd.add_argument("--top", type=int, default=DEFAULT_STRUCTURAL_WASTE_TOP, help="maximum findings per structural-waste category to list")
    waste_cmd.add_argument("--log-path", action="append", default=[], help="local JSON/JSONL log or directory to inspect for repeated reads/tool calls; may be repeated")
    waste_cmd.add_argument("--tool-catalog", action="append", default=[], help="local tool/MCP catalog JSON to inspect; may be repeated")
    for flag, default in context_size_options + structural_only_options:
        waste_cmd.add_argument(flag, type=int, default=default)

    args = parser.parse_args()

    # Each sub-command maps to (report builder, plain-text renderer).
    handlers = {
        "scan": (build_report, print_text),
        "structural-waste": (build_structural_waste_report, print_structural_waste_text),
    }
    handler = handlers.get(args.command)
    if handler is None:
        parser.error("unknown command")
        return 2

    build, render = handler
    report = build(args)
    if args.json:
        print(json.dumps(report, indent=2, sort_keys=True, ensure_ascii=False))
    else:
        render(report)
    return 0


# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())