@ictechgy/context-guard 0.4.1 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/CHANGELOG.md +15 -0
  2. package/README.ko.md +62 -33
  3. package/README.md +91 -23
  4. package/context-guard-kit/README.md +39 -26
  5. package/context-guard-kit/benchmark_runner.py +273 -8
  6. package/context-guard-kit/claude_transcript_cost_audit.py +597 -12
  7. package/context-guard-kit/context_compress.py +153 -1
  8. package/context-guard-kit/context_filter.py +446 -0
  9. package/context-guard-kit/context_guard_cli.py +3 -0
  10. package/context-guard-kit/context_guard_diet.py +677 -2
  11. package/context-guard-kit/context_pack.py +1694 -2
  12. package/context-guard-kit/cost_guard.py +1870 -0
  13. package/context-guard-kit/setup_wizard.py +820 -29
  14. package/context-guard-kit/trim_command_output.py +396 -45
  15. package/docs/benchmark-fixtures/learned-compression.tasks.example.json +24 -0
  16. package/docs/benchmark-fixtures/learned-compression.variants.example.json +10 -0
  17. package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +24 -0
  18. package/docs/benchmark-fixtures/visual-ocr.variants.example.json +10 -0
  19. package/docs/benchmark-workflow-examples.md +40 -0
  20. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +169 -0
  21. package/docs/benchmark-workflows/measured-token-workflow.example.json +170 -0
  22. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +170 -0
  23. package/docs/cache-diagnostics-schema.md +96 -0
  24. package/docs/cache-diagnostics.example.json +116 -0
  25. package/docs/cache-diagnostics.schema.json +460 -0
  26. package/docs/distribution.md +4 -2
  27. package/docs/experimental-benchmark-fixtures.md +36 -0
  28. package/package.json +11 -2
  29. package/packaging/homebrew/context-guard.rb.template +3 -2
  30. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  31. package/plugins/context-guard/README.ko.md +22 -14
  32. package/plugins/context-guard/README.md +24 -10
  33. package/plugins/context-guard/bin/context-guard +3 -0
  34. package/plugins/context-guard/bin/context-guard-audit +597 -12
  35. package/plugins/context-guard/bin/context-guard-bench +273 -8
  36. package/plugins/context-guard/bin/context-guard-compress +153 -1
  37. package/plugins/context-guard/bin/context-guard-cost +1870 -0
  38. package/plugins/context-guard/bin/context-guard-diet +677 -2
  39. package/plugins/context-guard/bin/context-guard-filter +446 -0
  40. package/plugins/context-guard/bin/context-guard-pack +1694 -2
  41. package/plugins/context-guard/bin/context-guard-setup +820 -29
  42. package/plugins/context-guard/bin/context-guard-trim-output +396 -45
  43. package/plugins/context-guard/brief/README.md +10 -3
  44. package/plugins/context-guard/skills/optimize/SKILL.md +5 -2
  45. package/plugins/context-guard/skills/setup/SKILL.md +3 -1
@@ -9,6 +9,8 @@ burn during noisy command runs.
9
9
  from __future__ import annotations
10
10
 
11
11
  import argparse
12
+ import ast
13
+ from collections import Counter, defaultdict
12
14
  import errno
13
15
  import hashlib
14
16
  import json
@@ -71,6 +73,23 @@ MAX_SETTINGS_READ_BYTES = 256_000
71
73
# Context-file size thresholds (bytes / lines).
DEFAULT_LARGE_CONTEXT_BYTES = 16_000
DEFAULT_HUGE_CONTEXT_BYTES = 64_000
DEFAULT_LONG_CONTEXT_LINES = 300

# Structural-waste report: schema identifier and default CLI thresholds.
STRUCTURAL_WASTE_SCHEMA_VERSION = "contextguard.structural-waste.v1"
DEFAULT_STRUCTURAL_WASTE_TOP = 20
DEFAULT_DUPLICATE_RULE_MIN_CHARS = 48
DEFAULT_DUPLICATE_CALL_THRESHOLD = 3
DEFAULT_MCP_SERVER_THRESHOLD = 6
DEFAULT_TOOL_COUNT_THRESHOLD = 40
DEFAULT_LARGE_SCHEMA_BYTES = 12_000

# Read caps for the structural-waste scanners.
DEFAULT_MAX_TOOL_CATALOG_BYTES = 1_000_000
DEFAULT_MAX_LOG_BYTES = 5_000_000
DEFAULT_MAX_LOG_LINE_BYTES = 1_000_000
DEFAULT_MAX_STRUCTURAL_FILES = 2_000
MAX_REPORT_LABEL_CHARS = 160

# File suffixes searched for textual references to skills.
TEXT_REFERENCE_SUFFIXES = {
    ".md",
    ".txt",
    ".json",
    ".toml",
    ".yaml",
    ".yml",
    ".py",
    ".js",
    ".ts",
    ".tsx",
    ".jsx",
    ".sh",
}

# Keys probed, in order, when extracting a tool call's name and input from a log record.
TOOL_CALL_NAME_KEYS = ("tool_name", "toolName", "tool")
TOOL_CALL_INPUT_KEYS = ("tool_input", "input", "arguments", "args", "parameters")

# Lower-cased tool names treated as file reads, and the argument keys that may carry the path.
READ_TOOL_NAMES = {
    "read",
    "read_file",
    "fileread",
    "view_file",
    "open_file",
    "get_file",
    "functions.get_file",
}
FILE_PATH_KEYS = {
    "file_path",
    "filepath",
    "path",
    "absolute_path",
    "relative_path",
    "file",
}
74
93
 
75
94
  HEAVY_PROJECT_DENIES: tuple[tuple[str, str, str], ...] = (
76
95
  ("node_modules", "node_modules", "Read(./node_modules/**)"),
@@ -200,7 +219,7 @@ def display_path_hash(path: Path) -> str:
200
219
 
201
220
def path_label(path: Path, show_paths: bool) -> str:
    """Return the label used for *path* in reports.

    With ``show_paths`` set, the whole path string is returned after being
    passed through ``sanitize_path_text``. Otherwise only the sanitized final
    component (or ``"path"`` when the name is empty) is shown, followed by
    ``#path:`` and the value of ``display_path_hash(path)``.
    """
    if not show_paths:
        component = sanitize_path_component(path.name or "path")
        return f"{component}#path:{display_path_hash(path)}"
    return sanitize_path_text(str(path))
206
225
 
@@ -219,7 +238,7 @@ def context_finding(
219
238
 
220
239
def root_label(root: Path, show_paths: bool) -> str:
    """Return the label used for the scan root in reports.

    With ``show_paths`` set, the whole root path string is returned after
    being passed through ``sanitize_path_text``. Otherwise the sanitized
    directory name (or ``"project"`` when the name is empty) is shown,
    followed by ``#path:`` and the value of ``display_path_hash(root)``.
    """
    if not show_paths:
        component = sanitize_path_component(root.name or "project")
        return f"{component}#path:{display_path_hash(root)}"
    return sanitize_path_text(str(root))
225
244
 
@@ -945,6 +964,635 @@ def scan_context(root: Path, large_bytes: int, huge_bytes: int, long_lines: int)
945
964
  return context_files, findings
946
965
 
947
966
 
967
def bounded_top(value: int) -> int:
    """Coerce *value* with ``int`` and clamp it into the inclusive range 1..200."""
    requested = int(value)
    if requested < 1:
        return 1
    if requested > 200:
        return 200
    return requested
969
+
970
+
971
def path_text_label(path_text: str, show_paths: bool) -> str:
    """Return a report label for a path that is only available as text.

    The text is first passed through ``sanitize_path_text``. With
    ``show_paths`` that sanitized text is returned unchanged; otherwise the
    label is the sanitized final component (``"path"`` when empty) followed by
    ``#path:`` and ``text_hash`` of the sanitized text.
    """
    cleaned = sanitize_path_text(str(path_text))
    if not show_paths:
        leaf = sanitize_path_component(Path(cleaned).name or "path")
        return f"{leaf}#path:{text_hash(cleaned)}"
    return cleaned
977
+
978
+
979
def safe_report_label(value: Any, limit: int = MAX_REPORT_LABEL_CHARS) -> str:
    """Return a single-line, redacted, length-bounded label for *value*.

    Falsy values become the empty string. Runs of whitespace are collapsed to
    single spaces, the text is passed through ``sanitize_path_text``, and
    matches of ``SECRET_CONTENT_RE`` are replaced with ``[REDACTED]``. Text
    longer than *limit* is cut and suffixed with a marker that records the
    untrimmed length.
    """
    collapsed = " ".join(str(value or "").split())
    redacted = SECRET_CONTENT_RE.sub("[REDACTED]", sanitize_path_text(collapsed))
    if len(redacted) > limit:
        suffix = f"…[trimmed:{len(redacted)} chars]"
        # Reserve room for the marker; never slice with a negative bound.
        keep = max(0, limit - len(suffix))
        return redacted[:keep] + suffix
    return redacted
986
+
987
+
988
def json_byte_len(value: Any) -> int:
    """Return the UTF-8 byte length of *value* serialized as compact JSON.

    Serialization uses sorted keys, no whitespace after separators, and keeps
    non-ASCII characters unescaped.
    """
    compact = json.dumps(
        value,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    encoded = compact.encode("utf-8", "replace")
    return len(encoded)
990
+
991
+
992
def iter_project_files(root: Path, suffixes: set[str], max_files: int) -> Iterable[Path]:
    """Yield at most *max_files* project files whose suffix is in *suffixes*.

    The tree under *root* is walked without following symlinks. Directories
    named in ``EXCLUDED_DIR_NAMES`` and symlinked directories are pruned;
    symlinked files are skipped. Suffix comparison is case-insensitive on the
    file side (``path.suffix.lower()``), so *suffixes* should be lower-case.

    Directory and file names are visited in sorted order so that the set of
    files surviving the *max_files* cap does not depend on the order in which
    the filesystem happens to list entries. A non-positive *max_files* yields
    nothing.
    """
    if max_files <= 0:
        return
    seen = 0
    for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
        current = Path(dirpath)
        # Assign in place: os.walk only honours pruning (and ordering) when the
        # very same list object is modified.
        dirnames[:] = sorted(
            name
            for name in dirnames
            if name not in EXCLUDED_DIR_NAMES and not (current / name).is_symlink()
        )
        for name in sorted(filenames):
            path = current / name
            if path.is_symlink() or path.suffix.lower() not in suffixes:
                continue
            yield path
            seen += 1
            if seen >= max_files:
                return
1009
+
1010
+
1011
def walk_json(value: Any) -> Iterable[dict[str, Any]]:
    """Yield every dict found anywhere inside a decoded JSON value.

    Traversal is iterative (explicit stack, last-in first-out), so nesting
    depth does not consume Python call frames. Lists are descended into but
    not yielded; scalars are ignored.
    """
    pending = [value]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        yield node
        pending.extend(node.values())
1020
+
1021
+
1022
def normalize_rule_unit(line: str, min_chars: int) -> str | None:
    """Normalize one line of context text for duplicate detection.

    Blank lines and bare ```` ``` ```` / ``---`` separators are rejected. A
    leading bullet/quote marker and then a leading ``1.`` / ``1)`` style
    number are stripped, whitespace runs collapse to one space, and the text
    is lower-cased. Returns ``None`` when the result is shorter than
    *min_chars* or has fewer than six whitespace-separated words.
    """
    candidate = line.strip()
    if candidate in {"", "```", "---"}:
        return None
    # Order matters: the bullet marker is removed before the list number.
    for prefix_pattern in (r"^[-*+>]\s+", r"^\d+[.)]\s+"):
        candidate = re.sub(prefix_pattern, "", candidate)
    candidate = re.sub(r"\s+", " ", candidate).strip().lower()
    too_short = len(candidate) < min_chars or len(candidate.split()) < 6
    return None if too_short else candidate
1034
+
1035
+
1036
def scan_duplicate_rules(root: Path, *, min_chars: int, top: int) -> tuple[list[dict[str, Any]], list[Finding]]:
    """Find normalized rule lines that occur in more than one context file.

    Every file from ``iter_context_files(root)`` is read via
    ``read_text_prefix`` (unreadable files are skipped) and each line is
    normalized with ``normalize_rule_unit``. A normalized line counts as a
    duplicate when it occurs at least twice across at least two distinct
    relative paths.

    Returns ``(groups, findings)``, each truncated to *top* entries. Groups
    are ordered by descending occurrence count then fingerprint; findings by
    ``SEVERITY_ORDER`` rank then id. Only a hash of the normalized text is
    reported, never the text itself.
    """
    seen: dict[str, list[dict[str, Any]]] = defaultdict(list)
    context_paths = sorted(iter_context_files(root), key=lambda p: rel_path(p, root))
    for context_path in context_paths:
        relative = rel_path(context_path, root)
        try:
            content, was_truncated = read_text_prefix(context_path, root=root)
        except OSError:
            continue
        for number, raw_line in enumerate(content.splitlines(), 1):
            unit = normalize_rule_unit(raw_line, min_chars)
            if unit is not None:
                seen[unit].append({"path": relative, "line": number, "sample_truncated": was_truncated})

    groups: list[dict[str, Any]] = []
    findings: list[Finding] = []
    for unit, hits in seen.items():
        distinct_paths = sorted({hit["path"] for hit in hits})
        repeated_across_files = len(hits) >= 2 and len(distinct_paths) >= 2
        if not repeated_across_files:
            continue
        digest = text_hash(unit)
        instance = f"duplicate-context-rule-{digest}"
        severity = "medium" if len(hits) >= 4 else "low"
        group = {
            "fingerprint": digest,
            "occurrence_count": len(hits),
            "path_count": len(distinct_paths),
            "paths": distinct_paths[:top],
            "sample_chars": len(unit),
            "confidence": "observed",
        }
        groups.append(group)
        findings.append(Finding(
            instance,
            severity,
            "context-rules",
            "A normalized instruction/rule unit appears in multiple context-like files.",
            "Keep one canonical copy and replace duplicates with a short pointer if the rule is still needed.",
            group,
            rule_id="duplicate-context-rule",
            instance_id=instance,
        ))

    groups.sort(key=lambda entry: (-entry["occurrence_count"], entry["fingerprint"]))
    findings.sort(key=lambda entry: (SEVERITY_ORDER.get(entry.severity, 99), entry.id))
    return groups[:top], findings[:top]
1078
+
1079
+
1080
def assigned_all_names(tree: ast.AST) -> set[str]:
    """Collect string entries of every ``__all__ = [...]`` / ``(...)`` assignment.

    Only plain ``ast.Assign`` nodes whose value is a list or tuple literal are
    considered; augmented or annotated assignments and non-string elements
    are ignored.
    """
    exported: set[str] = set()
    for node in ast.walk(tree):
        if not isinstance(node, ast.Assign):
            continue
        if not isinstance(node.value, (ast.List, ast.Tuple)):
            continue
        binds_all = any(
            isinstance(target, ast.Name) and target.id == "__all__"
            for target in node.targets
        )
        if not binds_all:
            continue
        exported.update(
            element.value
            for element in node.value.elts
            if isinstance(element, ast.Constant) and isinstance(element.value, str)
        )
    return exported
1090
+
1091
+
1092
def scan_python_imports(root: Path, *, top: int, max_files: int) -> tuple[dict[str, Any], list[Finding]]:
    """Report imports whose bound name is never referenced in the same file.

    Each ``.py`` file from ``iter_project_files`` is read with
    ``read_text_prefix`` and parsed with ``ast``. An import is reported when
    the name it binds is neither an ``ast.Name`` anywhere in the module nor
    listed in a literal ``__all__``. ``__future__`` imports, star imports and
    names starting with ``_`` are ignored. Scanning stops once *top* findings
    have been collected.

    Files that cannot be read or parsed are counted in ``parse_errors``. A
    file whose text was truncated by the read limit is counted there too and
    is not analysed: a prefix that happens to parse would otherwise report
    imports that are only used in the unread remainder of the file.

    Returns ``(category, findings)`` where *category* holds ``files_scanned``,
    ``parse_errors`` and the findings as dicts under ``unused_imports``.
    """
    findings: list[Finding] = []
    files_scanned = 0
    parse_errors = 0
    for path in iter_project_files(root, {".py"}, max_files):
        files_scanned += 1
        rel = rel_path(path, root)
        try:
            text, truncated = read_text_prefix(path, limit=MAX_CONTEXT_READ_BYTES, root=root)
        except OSError:
            parse_errors += 1
            continue
        if truncated:
            # Usage may live past the read limit; an incomplete module cannot
            # be judged, so treat it like an unparseable file.
            parse_errors += 1
            continue
        try:
            tree = ast.parse(text, filename=rel)
        except (SyntaxError, ValueError):
            parse_errors += 1
            continue

        imports: list[tuple[str, int, str]] = []
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    # `import a.b` binds `a`; `import a.b as c` binds `c`.
                    name = alias.asname or alias.name.split(".", 1)[0]
                    if not name.startswith("_"):
                        imports.append((name, node.lineno, alias.name))
            elif isinstance(node, ast.ImportFrom):
                if node.module == "__future__":
                    continue
                for alias in node.names:
                    if alias.name == "*":
                        continue
                    name = alias.asname or alias.name
                    if not name.startswith("_"):
                        imports.append((name, node.lineno, f"{node.module or ''}.{alias.name}".strip(".")))
        if not imports:
            continue

        used = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)} | assigned_all_names(tree)
        for name, line, module in imports:
            if name in used:
                continue
            instance = f"stale-python-import-{text_hash(f'{rel}:{line}:{name}')}"
            findings.append(Finding(
                instance,
                "low",
                rel,
                f"Python import `{name}` appears unused in static AST analysis.",
                "Review before removing; dynamic imports, re-exports, and type-checking paths can make this a false positive.",
                {"imported_name": name, "module": module, "line": line, "confidence": "advisory-static-ast"},
                rule_id="stale-python-import",
                instance_id=instance,
            ))
            if len(findings) >= top:
                break
        if len(findings) >= top:
            break
    return {"files_scanned": files_scanned, "parse_errors": parse_errors, "unused_imports": [f.as_dict() for f in findings]}, findings
1143
+
1144
+
1145
def iter_skill_files(root: Path, max_files: int) -> Iterable[Path]:
    """Yield ``SKILL.md`` files that sit under a ``skills`` directory.

    Candidates come from ``iter_project_files(root, {".md"}, max_files)``, so
    that call's cap on visited ``.md`` files also bounds the output.

    The ``skills`` component is looked for in the scan root's own name and in
    the path below the root. Directories above the root are deliberately not
    considered, so a project that merely lives under some ``.../skills/...``
    ancestor does not have every ``SKILL.md`` classified as a skill.
    """
    for path in iter_project_files(root, {".md"}, max_files):
        if path.name != "SKILL.md":
            continue
        try:
            scoped_parts = (root.name, *path.relative_to(root).parts)
        except ValueError:
            # Path is not expressed relative to root; fall back to all parts.
            scoped_parts = path.parts
        if "skills" in scoped_parts:
            yield path
1153
+
1154
+
1155
def safe_read_reference_text(path: Path, root: Path) -> str:
    """Return the lower-cased text prefix of *path*, or ``""`` on ``OSError``.

    At most 128_000 is passed as the read limit to ``read_text_prefix``.
    """
    try:
        content, _truncated = read_text_prefix(path, limit=128_000, root=root)
    except OSError:
        return ""
    return content.lower()
1161
+
1162
+
1163
def scan_unused_skills(root: Path, *, top: int, max_files: int) -> tuple[dict[str, Any], list[Finding]]:
    """Flag skills whose directory name is not mentioned in other project text.

    The skill name is the name of the directory holding ``SKILL.md``. It is
    searched for case-insensitively (bare, ``/name`` and
    ``context-guard:name`` forms) as a substring of every reference file with
    a suffix in ``TEXT_REFERENCE_SUFFIXES`` that is not itself named
    ``SKILL.md``. Skills with no hit become low-severity advisory findings;
    scanning stops after *top* findings.

    Returns ``(category, findings)`` with scan counts and the candidate list.
    """
    skills = list(iter_skill_files(root, max_files))
    reference_files = [
        candidate
        for candidate in iter_project_files(root, TEXT_REFERENCE_SUFFIXES, max_files)
        if candidate.name != "SKILL.md"
    ]
    # Read every reference file once up front; texts are already lower-cased.
    lowered_texts = {ref: safe_read_reference_text(ref, root) for ref in reference_files}

    findings: list[Finding] = []
    candidates: list[dict[str, Any]] = []
    for skill_file in skills:
        skill_name = skill_file.parent.name
        folded = skill_name.lower()
        needles = {folded, f"/{folded}", f"context-guard:{folded}"}
        referenced = any(
            ref != skill_file and any(needle in body for needle in needles)
            for ref, body in lowered_texts.items()
        )
        if referenced:
            continue
        relative = rel_path(skill_file, root)
        candidate = {
            "path": relative,
            "skill": safe_report_label(skill_name),
            "reference_count": 0,
            "confidence": "low-advisory",
        }
        candidates.append(candidate)
        instance = f"unused-skill-candidate-{text_hash(relative)}"
        findings.append(Finding(
            instance,
            "low",
            relative,
            "Skill file has no obvious project-local references outside its own SKILL.md.",
            "Confirm real usage through plugin manifests, user docs, or runtime telemetry before deleting or renaming it.",
            candidate,
            rule_id="unused-skill-candidate",
            instance_id=instance,
        ))
        if len(findings) >= top:
            break

    category = {
        "skills_scanned": len(skills),
        "reference_files_scanned": len(reference_files),
        "unused_candidates": candidates[:top],
    }
    return category, findings
1197
+
1198
+
1199
def read_json_file_limited(path: Path, max_bytes: int) -> tuple[Any | None, str | None, int]:
    """Read and decode a JSON file of at most *max_bytes* bytes.

    Returns ``(value, error, size)``. On success *error* is ``None`` and
    *size* is the number of bytes read. When the file is skipped or cannot be
    decoded, *value* is ``None`` and *error* describes why; *size* is the
    observed size for oversized files and ``0`` for unreadable or invalid
    ones. Note that a file containing JSON ``null`` also yields a ``None``
    value, so callers must test *error* rather than *value*.

    The file is opened with ``open_regular_no_follow``. Besides malformed
    JSON, pathologically deep nesting (``RecursionError`` from the decoder)
    is reported as an error instead of propagating, because catalog files are
    untrusted input and one bad file should not abort the whole scan.
    """
    try:
        with open_regular_no_follow(path) as handle:
            size = os.fstat(handle.fileno()).st_size
            if size > max_bytes:
                return None, f"skipped oversized file ({size} bytes > {max_bytes})", size
            # Read one byte past the cap so growth after fstat is detected.
            data = handle.read(max_bytes + 1)
        if len(data) > max_bytes:
            return None, f"skipped oversized file (> {max_bytes} bytes)", len(data)
        return json.loads(data.decode("utf-8", "replace")), None, len(data)
    except json.JSONDecodeError as exc:
        return None, f"invalid JSON at line {exc.lineno}: {exc.msg}", 0
    except RecursionError:
        return None, "invalid JSON: nesting too deep", 0
    except (OSError, UnicodeDecodeError) as exc:
        return None, f"unreadable: {format_os_error(exc) if isinstance(exc, OSError) else exc.__class__.__name__}", 0
1213
+
1214
+
1215
def tool_name_from_schema(d: dict[str, Any]) -> str | None:
    """Return a report-safe tool name taken from *d*, or ``None``.

    The keys ``name``, ``tool``, ``id`` and ``title`` are tried in that order;
    the first non-blank string wins and is passed through
    ``safe_report_label``.
    """
    candidates = (d.get(key) for key in ("name", "tool", "id", "title"))
    for candidate in candidates:
        if isinstance(candidate, str) and candidate.strip():
            return safe_report_label(candidate)
    return None
1221
+
1222
+
1223
def collect_tool_schemas(raw: Any) -> list[dict[str, Any]]:
    """Extract tool-like entries from a decoded catalog document.

    Any dict (at any depth) that has a usable name and at least one of the
    keys ``inputSchema``, ``input_schema``, ``schema``, ``parameters`` or
    ``description`` is treated as a tool. Entries are deduplicated by
    ``(name, server)``, keeping the one with the largest serialized size.
    Each result has ``name``, ``schema_bytes`` and ``server`` keys.
    """
    schema_markers = ("inputSchema", "input_schema", "schema", "parameters", "description")
    largest: dict[tuple[str, str | None], dict[str, Any]] = {}
    for node in walk_json(raw):
        name = tool_name_from_schema(node)
        if not name:
            continue
        if not any(marker in node for marker in schema_markers):
            continue
        raw_server = node.get("server")
        server = safe_report_label(raw_server) if isinstance(raw_server, str) else None
        entry = {"name": name, "schema_bytes": json_byte_len(node), "server": server}
        identity = (name, server)
        current = largest.get(identity)
        # Strictly larger replaces; ties keep the earlier entry.
        if current is None or int(entry["schema_bytes"]) > int(current["schema_bytes"]):
            largest[identity] = entry
    return list(largest.values())
1240
+
1241
+
1242
def scan_tool_catalogs(root: Path, args: argparse.Namespace, settings: list[dict[str, Any]], *, top: int) -> tuple[dict[str, Any], list[Finding]]:
    """Inspect MCP server settings and any ``--tool-catalog`` JSON files.

    Emits a finding when the merged settings configure at least
    ``args.mcp_server_threshold`` MCP servers, when a catalog holds at least
    ``args.tool_count_threshold`` tools, and for each of a catalog's largest
    schemas at or above ``args.large_schema_bytes`` (at most *top* per
    catalog). Catalogs that cannot be read are recorded with status
    ``skipped``.

    Returns ``(category, findings)``; catalogs are truncated to *top* entries
    and findings to ``max(top, 1) * 2``. *root* is accepted but not used here.
    """
    results: list[Finding] = []
    catalog_reports: list[dict[str, Any]] = []

    combined = merged_settings(settings)
    configured = combined.get("mcpServers")
    mcp_servers = configured if isinstance(configured, dict) else {}
    server_total = len(mcp_servers)
    if server_total >= args.mcp_server_threshold:
        results.append(Finding(
            "excessive-mcp-servers",
            "low",
            ".claude/settings.json",
            "Project Claude settings configure many MCP servers, which can increase tool discovery/schema overhead.",
            "Disable unused MCP servers for sessions that do not need them; keep this advisory until task-specific need is known.",
            {"mcp_server_count": server_total, "threshold": args.mcp_server_threshold, "confidence": "observed-settings"},
            rule_id="excessive-mcp-servers",
            instance_id="excessive-mcp-servers",
        ))

    for raw_path in getattr(args, "tool_catalog", []) or []:
        catalog_path = safe_resolve(Path(raw_path).expanduser())
        label = path_text_label(str(catalog_path), args.show_paths)
        document, error, size = read_json_file_limited(catalog_path, args.max_tool_catalog_bytes)
        if error:
            catalog_reports.append({"path": label, "status": "skipped", "reason": error, "bytes": size})
            continue

        tools = collect_tool_schemas(document)
        tool_total = len(tools)
        schema_total = sum(int(tool["schema_bytes"]) for tool in tools)
        oversized = [tool for tool in tools if int(tool["schema_bytes"]) >= args.large_schema_bytes]
        # Largest first, ties broken by name, then capped.
        oversized.sort(key=lambda entry: (-int(entry["schema_bytes"]), entry["name"]))
        large_tools = oversized[:top]
        catalog_reports.append({
            "path": label,
            "status": "scanned",
            "tool_count": tool_total,
            "schema_bytes": schema_total,
            "large_schema_tools": large_tools,
        })

        if tool_total >= args.tool_count_threshold:
            instance = f"excessive-tool-catalog-{text_hash(label)}"
            results.append(Finding(
                instance,
                "medium",
                label,
                "Local tool catalog contains many tools for one task context.",
                "Use context-guard-tool-prune or a task-specific tool allowlist before injecting full schemas.",
                {"tool_count": tool_total, "threshold": args.tool_count_threshold, "schema_bytes": schema_total, "confidence": "observed-catalog"},
                rule_id="excessive-tool-catalog",
                instance_id=instance,
            ))
        for tool in large_tools:
            instance = f"large-tool-schema-{text_hash(label + ':' + tool['name'])}"
            results.append(Finding(
                instance,
                "low",
                label,
                "A local tool schema is large enough to dominate narrow task context.",
                "Prefer a bounded top-k schema report and retrieve the full sanitized schema only when needed.",
                {"tool_name": tool["name"], "schema_bytes": tool["schema_bytes"], "threshold": args.large_schema_bytes, "confidence": "observed-catalog"},
                rule_id="large-tool-schema",
                instance_id=instance,
            ))

    category = {"mcp_server_count": server_total, "catalogs": catalog_reports[:top]}
    return category, results[: max(top, 1) * 2]
1296
+
1297
+
1298
def _walk_log_files(directory: Path, suffixes: set[str]) -> Iterable[Path]:
    """Yield non-symlink files under *directory* whose suffix is in *suffixes*."""
    for dirpath, dirnames, filenames in os.walk(directory, followlinks=False):
        current = Path(dirpath)
        # In-place assignment prunes the walk; sorting keeps the order stable.
        dirnames[:] = sorted(
            name
            for name in dirnames
            if name not in EXCLUDED_DIR_NAMES and not (current / name).is_symlink()
        )
        for name in sorted(filenames):
            path = current / name
            if path.is_symlink() or path.suffix.lower() not in suffixes:
                continue
            yield path


def iter_log_candidates(root: Path, log_paths: list[str], max_files: int) -> Iterable[Path]:
    """Yield up to *max_files* distinct log files to inspect.

    Sources are the explicit *log_paths* (files or directories, ``~``
    expanded) followed by ``root/.claude`` and ``root/.codex``. Only
    ``.json``, ``.jsonl``, ``.ndjson`` and ``.log`` files are yielded;
    symlinks and directories in ``EXCLUDED_DIR_NAMES`` are skipped.

    Each file is yielded once even when sources overlap (for example an
    explicit path inside ``.claude``); yielding it twice would double every
    call count read from it and manufacture duplicate-call findings.
    Directory contents are visited in sorted order so the *max_files* cap
    selects the same files regardless of filesystem listing order.
    """
    log_suffixes = {".json", ".jsonl", ".ndjson", ".log"}
    candidates: list[Path] = []
    explicit = [Path(item).expanduser() for item in log_paths]
    default_roots = [root / ".claude", root / ".codex"]
    for path in explicit + default_roots:
        try:
            resolved = safe_resolve(path)
        except OSError:
            resolved = path
        if resolved.exists() and not resolved.is_symlink():
            candidates.append(resolved)

    already_yielded: set[Path] = set()
    yielded = 0
    for candidate in candidates:
        if candidate.is_file():
            found: Iterable[Path] = [candidate] if candidate.suffix.lower() in log_suffixes else []
        elif candidate.is_dir():
            found = _walk_log_files(candidate, log_suffixes)
        else:
            found = []
        for path in found:
            if path in already_yielded:
                continue
            already_yielded.add(path)
            yield path
            yielded += 1
            if yielded >= max_files:
                return
1328
+
1329
+
1330
def parse_possible_json(value: Any) -> Any:
    """Decode *value* when it is a string that looks like a JSON object or array.

    Non-strings, strings that do not start (after stripping) with ``[`` or
    ``{``, and strings that fail to decode are returned unchanged. Decoding
    failures include both malformed JSON and nesting deep enough to make the
    decoder raise ``RecursionError``; the input comes from local logs and must
    never abort the scan.
    """
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    if not stripped or stripped[0] not in "[{":
        return value
    try:
        return json.loads(stripped)
    except (json.JSONDecodeError, RecursionError):
        return value
1339
+
1340
+
1341
def call_name(d: dict[str, Any]) -> str | None:
    """Return the tool name recorded in log dict *d*, or ``None``.

    Keys in ``TOOL_CALL_NAME_KEYS`` are tried first. A plain ``name`` key is
    only accepted when the dict also looks like a tool call: its ``type`` is
    ``tool_use``/``tool_call``/``function_call`` or it carries one of
    ``TOOL_CALL_INPUT_KEYS``. The returned name is stripped and cut to 120
    characters.
    """
    for candidate in (d.get(key) for key in TOOL_CALL_NAME_KEYS):
        if isinstance(candidate, str) and candidate.strip():
            return candidate.strip()[:120]

    fallback = d.get("name")
    if not (isinstance(fallback, str) and fallback.strip()):
        return None
    record_type = str(d.get("type") or "").lower()
    looks_like_call = record_type in {"tool_use", "tool_call", "function_call"} or any(
        key in d for key in TOOL_CALL_INPUT_KEYS
    )
    return fallback.strip()[:120] if looks_like_call else None
1351
+
1352
+
1353
def call_input(d: dict[str, Any]) -> Any:
    """Return the tool-call input stored in *d*, or ``{}`` when none is present.

    The first key from ``TOOL_CALL_INPUT_KEYS`` that exists in *d* is used and
    its value is passed through ``parse_possible_json``.
    """
    present = next((key for key in TOOL_CALL_INPUT_KEYS if key in d), None)
    if present is None:
        return {}
    return parse_possible_json(d[present])
1358
+
1359
+
1360
def sanitized_fingerprint_value(value: Any) -> Any:
    """Return a redacted, size-bounded copy of *value* for fingerprinting.

    Strings go through ``sanitize_path_text``, have ``SECRET_CONTENT_RE``
    matches replaced by ``[REDACTED]`` and are cut to 500 characters. Lists
    keep their first 20 items. Dict keys are sanitized with
    ``sanitize_path_component`` and processed in order of ``str(key)``.
    Anything else is returned unchanged. Recurses into nested containers.
    """
    if isinstance(value, str):
        return SECRET_CONTENT_RE.sub("[REDACTED]", sanitize_path_text(value))[:500]
    if isinstance(value, list):
        return [sanitized_fingerprint_value(element) for element in value[:20]]
    if isinstance(value, dict):
        ordered = sorted(value.items(), key=lambda pair: str(pair[0]))
        return {
            sanitize_path_component(str(key)): sanitized_fingerprint_value(element)
            for key, element in ordered
        }
    return value
1372
+
1373
+
1374
def find_path_argument(value: Any) -> str | None:
    """Return the first file-path-like argument found inside *value*.

    *value* is first passed through ``parse_possible_json``. Dicts and lists
    are searched iteratively; the first non-blank string stored under a key in
    ``FILE_PATH_KEYS`` is returned stripped. Returns ``None`` when nothing
    matches.
    """
    pending = [parse_possible_json(value)]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
        elif isinstance(node, dict):
            for key, child in node.items():
                is_path = (
                    str(key) in FILE_PATH_KEYS
                    and isinstance(child, str)
                    and bool(child.strip())
                )
                if is_path:
                    return child.strip()
                pending.append(child)
    return None
1386
+
1387
+
1388
def is_read_tool(name: str) -> bool:
    """Return whether tool *name* is treated as a file-read tool.

    The name is lower-cased with ``-`` mapped to ``_``. It matches when the
    whole name or its last dot-separated segment is in ``READ_TOOL_NAMES``,
    or when it contains ``read_file``.
    """
    normalized = name.lower().replace("-", "_")
    if normalized in READ_TOOL_NAMES or "read_file" in normalized:
        return True
    last_segment = normalized.rsplit(".", 1)[-1]
    return last_segment in READ_TOOL_NAMES
1392
+
1393
+
1394
def _load_log_records(text: str, suffix: str, max_line_bytes: int) -> tuple[list[Any], int]:
    """Decode log *text* into records; return ``(records, skipped_count)``.

    ``.json`` files are decoded as one document (a top-level list supplies
    the records). Every other suffix is decoded line by line, skipping blank
    lines and counting over-long or undecodable lines as skipped.
    """
    if suffix == ".json":
        try:
            document = json.loads(text)
        except json.JSONDecodeError:
            return [], 1
        return (document if isinstance(document, list) else [document]), 0

    records: list[Any] = []
    skipped = 0
    for line in text.splitlines():
        if len(line.encode("utf-8", "replace")) > max_line_bytes:
            skipped += 1
        elif line.strip():
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                skipped += 1
    return records, skipped


def scan_logs(root: Path, args: argparse.Namespace, *, top: int) -> tuple[dict[str, Any], list[Finding]]:
    """Scan local logs for repeated file reads and duplicate tool calls.

    Files come from ``iter_log_candidates``. Files larger than
    ``args.max_log_bytes`` or unreadable are listed under ``skipped_files``.
    Every dict inside every record is checked with ``call_name``; calls are
    counted per ``(tool name, hash of sanitized input)`` and, for read tools,
    per hash of the sanitized path argument. Counts at or above
    ``args.duplicate_call_threshold`` become findings.

    Returns ``(category, findings)``; list fields are truncated to *top* and
    findings to ``top * 2``.
    """
    call_totals: Counter[tuple[str, str]] = Counter()
    call_sources: dict[tuple[str, str], set[str]] = defaultdict(set)
    read_totals: Counter[str] = Counter()
    read_labels: dict[str, str] = {}
    read_tool_names: dict[str, set[str]] = defaultdict(set)
    files_scanned = 0
    records_scanned = 0
    skipped_files: list[dict[str, Any]] = []
    skipped_records = 0

    for log_file in iter_log_candidates(root, getattr(args, "log_path", []) or [], args.max_structural_files):
        label = path_text_label(str(log_file), args.show_paths)
        try:
            with open_regular_no_follow(log_file) as handle:
                size = os.fstat(handle.fileno()).st_size
                # Read one byte past the cap so growth after fstat is noticed.
                payload = None if size > args.max_log_bytes else handle.read(args.max_log_bytes + 1)
        except OSError as exc:
            skipped_files.append({"path": label, "reason": format_os_error(exc)})
            continue
        if payload is None:
            skipped_files.append({"path": label, "reason": f"oversized:{size}>{args.max_log_bytes}"})
            continue
        if len(payload) > args.max_log_bytes:
            skipped_files.append({"path": label, "reason": f"oversized:>{args.max_log_bytes}"})
            continue

        files_scanned += 1
        records, skipped = _load_log_records(
            payload.decode("utf-8", "replace"),
            log_file.suffix.lower(),
            args.max_log_line_bytes,
        )
        skipped_records += skipped

        for record in records:
            records_scanned += 1
            for node in walk_json(record):
                tool = call_name(node)
                if not tool:
                    continue
                tool_input = call_input(node)
                serialized = json.dumps(
                    sanitized_fingerprint_value(tool_input),
                    ensure_ascii=False,
                    sort_keys=True,
                    default=str,
                )
                call_key = (tool, text_hash(serialized))
                call_totals[call_key] += 1
                call_sources[call_key].add(label)
                if not is_read_tool(tool):
                    continue
                path_arg = find_path_argument(tool_input)
                if path_arg:
                    read_key = text_hash(sanitize_path_text(path_arg))
                    read_totals[read_key] += 1
                    read_labels[read_key] = path_text_label(path_arg, args.show_paths)
                    read_tool_names[read_key].add(tool)

    findings: list[Finding] = []

    repeated_reads: list[dict[str, Any]] = []
    for read_key, total in read_totals.most_common(top):
        if total < args.duplicate_call_threshold:
            continue
        entry = {
            "path": read_labels[read_key],
            "path_fingerprint": read_key,
            "read_count": total,
            "tools": sorted(safe_report_label(tool) for tool in read_tool_names[read_key]),
            "confidence": "observed-log",
        }
        repeated_reads.append(entry)
        instance = f"repeated-file-read-{read_key}"
        findings.append(Finding(
            instance,
            "medium",
            "local-logs",
            "The same file path appears to be read repeatedly in local tool-call logs.",
            "Use search/symbol/slice reads or a local artifact receipt instead of repeating whole-file reads.",
            entry,
            rule_id="repeated-file-read",
            instance_id=instance,
        ))

    duplicate_calls: list[dict[str, Any]] = []
    for (tool, digest), total in call_totals.most_common(top * 2):
        if total < args.duplicate_call_threshold:
            continue
        entry = {
            "tool_name": safe_report_label(tool),
            "input_fingerprint": digest,
            "call_count": total,
            "log_files": sorted(call_sources[(tool, digest)])[:top],
            "confidence": "observed-log",
        }
        duplicate_calls.append(entry)
        instance = f"duplicate-tool-call-{text_hash(tool + ':' + digest)}"
        severity = "medium" if total >= args.duplicate_call_threshold * 2 else "low"
        findings.append(Finding(
            instance,
            severity,
            "local-logs",
            "A tool call with the same sanitized input fingerprint repeats in local logs.",
            "Avoid replaying identical calls; keep one receipt or summarize the result before retrying.",
            entry,
            rule_id="duplicate-tool-call",
            instance_id=instance,
        ))
        if len(duplicate_calls) >= top:
            break

    category = {
        "files_scanned": files_scanned,
        "records_scanned": records_scanned,
        "skipped_files": skipped_files[:top],
        "skipped_records": skipped_records,
        "repeated_file_reads": repeated_reads[:top],
        "duplicate_tool_calls": duplicate_calls[:top],
    }
    return category, findings[: top * 2]
1503
+
1504
+
1505
def structural_summary(findings: list[Finding]) -> dict[str, Any]:
    """Summarize *findings* as a total plus counts per rule and per severity.

    A finding is counted under its ``rule_id`` when that is truthy, otherwise
    under its ``id``. Both count mappings are ordered by key.
    """
    rule_counts: Counter[str] = Counter()
    severity_counts: Counter[str] = Counter()
    for finding in findings:
        rule_counts[finding.rule_id or finding.id] += 1
        severity_counts[finding.severity] += 1
    return {
        "finding_count": len(findings),
        "by_rule": {rule: rule_counts[rule] for rule in sorted(rule_counts)},
        "by_severity": {level: severity_counts[level] for level in sorted(severity_counts)},
    }
1513
+
1514
+
1515
def build_structural_waste_report(args: argparse.Namespace) -> dict[str, Any]:
    """Run every structural-waste scanner and assemble the report dict.

    ``args.path`` is expanded and resolved and must be an existing directory;
    otherwise ``SystemExit`` is raised with a labelled message. Findings from
    all scanners are merged and ordered by ``SEVERITY_ORDER`` rank, then rule
    id (or id), then path. The ``findings`` list is capped at ``top * 10``
    entries while ``finding_count`` and ``summary`` reflect all findings.
    """
    root = safe_resolve(Path(args.path).expanduser())
    try:
        root_is_directory = root.exists() and root.is_dir()
    except OSError:
        root_is_directory = False
    if not root_is_directory:
        raise SystemExit(f"context-guard-diet: structural-waste path is not a directory: {path_label(root, args.show_paths)}")

    top = bounded_top(args.top)
    settings, _settings_findings = collect_settings(root)
    context_files, context_findings = scan_context(root, args.large_context_bytes, args.huge_context_bytes, args.long_context_lines)

    # Only the size/code-fence context rules belong in this report.
    heavy_rules = {"large-context-file", "huge-context-file", "context-heavy-code-fences"}
    oversized_rule_findings = [
        finding for finding in context_findings if (finding.rule_id or finding.id) in heavy_rules
    ]
    duplicate_rule_groups, duplicate_rule_findings = scan_duplicate_rules(root, min_chars=args.duplicate_rule_min_chars, top=top)
    imports_category, import_findings = scan_python_imports(root, top=top, max_files=args.max_structural_files)
    skills_category, skill_findings = scan_unused_skills(root, top=top, max_files=args.max_structural_files)
    tools_category, tool_findings = scan_tool_catalogs(root, args, settings, top=top)
    logs_category, log_findings = scan_logs(root, args, top=top)

    findings = [
        *oversized_rule_findings,
        *duplicate_rule_findings,
        *import_findings,
        *skill_findings,
        *tool_findings,
        *log_findings,
    ]
    findings.sort(key=lambda finding: (SEVERITY_ORDER.get(finding.severity, 99), finding.rule_id or finding.id, finding.path))

    limits = {
        "top": top,
        "max_structural_files": args.max_structural_files,
        "large_context_bytes": args.large_context_bytes,
        "huge_context_bytes": args.huge_context_bytes,
        "long_context_lines": args.long_context_lines,
        "duplicate_rule_min_chars": args.duplicate_rule_min_chars,
        "duplicate_call_threshold": args.duplicate_call_threshold,
        "mcp_server_threshold": args.mcp_server_threshold,
        "tool_count_threshold": args.tool_count_threshold,
        "large_schema_bytes": args.large_schema_bytes,
        "max_tool_catalog_bytes": args.max_tool_catalog_bytes,
        "max_log_bytes": args.max_log_bytes,
        "max_log_line_bytes": args.max_log_line_bytes,
    }
    categories = {
        "rule_files": {
            "context_files_scanned": len(context_files),
            "oversized_or_heavy": [finding.as_dict() for finding in oversized_rule_findings[:top]],
            "duplicate_rule_groups": duplicate_rule_groups,
        },
        "python_imports": imports_category,
        "skills": skills_category,
        "tool_schemas": tools_category,
        "local_logs": logs_category,
    }
    caveats = [
        "Structural-waste diagnostics are advisory heuristics; verify before deleting rules, imports, skills, or tools.",
        "No network calls or destructive actions are performed by this command.",
        "Local log diagnostics use sanitized input fingerprints and do not print raw prompt, command, or tool-input text.",
        "Unused-skill and stale-import candidates can be false positives when usage is dynamic or outside the scanned project.",
    ]
    return {
        "tool": "context-guard-diet",
        "mode": "structural-waste",
        "schema_version": STRUCTURAL_WASTE_SCHEMA_VERSION,
        "root": root_label(root, args.show_paths),
        "read_only": True,
        "network": "not-used",
        "destructive_actions": [],
        "limits": limits,
        "summary": structural_summary(findings),
        "categories": categories,
        "finding_count": len(findings),
        "findings": [finding.as_dict() for finding in findings[: top * 10]],
        "caveats": caveats,
    }
1578
+
1579
+
1580
def print_structural_waste_text(report: dict[str, Any]) -> None:
    """Print the human-readable form of a structural-waste *report* to stdout.

    Output is a fixed header, the root label, a one-line summary with the
    per-rule counts as JSON, and then either ``- none`` or one three-line
    entry (severity/rule/path, why, fix) per listed finding.
    """
    summary = report["summary"]
    rule_counts = json.dumps(summary["by_rule"], sort_keys=True)
    print("ContextGuard structural-waste diagnostics")
    print(f"root: {report['root']}")
    print("read_only: yes network: not-used destructive_actions: none")
    print(f"findings: {summary['finding_count']} by_rule={rule_counts}")

    listed = report["findings"]
    if listed:
        print("\nFindings:")
        for entry in listed:
            severity = entry["severity"].upper()
            print(f"- [{severity}] {entry['rule_id']} @ {entry['path']}")
            print(f" why: {entry['message']}")
            print(f" fix: {entry['action']}")
    else:
        print("\nFindings:\n- none")
1594
+
1595
+
948
1596
# Sort rank per severity label; lower ranks sort first.
SEVERITY_ORDER = {
    "high": 0,
    "medium": 1,
    "low": 2,
}
949
1597
 
950
1598
 
@@ -1019,6 +1667,26 @@ def main() -> int:
1019
1667
  scan.add_argument("--large-context-bytes", type=int, default=DEFAULT_LARGE_CONTEXT_BYTES)
1020
1668
  scan.add_argument("--huge-context-bytes", type=int, default=DEFAULT_HUGE_CONTEXT_BYTES)
1021
1669
  scan.add_argument("--long-context-lines", type=int, default=DEFAULT_LONG_CONTEXT_LINES)
1670
+
1671
+ structural = sub.add_parser("structural-waste", help="run local read-only structural waste diagnostics")
1672
+ structural.add_argument("path", nargs="?", default=".")
1673
+ structural.add_argument("--json", action="store_true", help="emit machine-readable JSON")
1674
+ structural.add_argument("--show-paths", action="store_true", help="show raw local paths for debugging; secret-shaped path components remain redacted")
1675
+ structural.add_argument("--top", type=int, default=DEFAULT_STRUCTURAL_WASTE_TOP, help="maximum findings per structural-waste category to list")
1676
+ structural.add_argument("--log-path", action="append", default=[], help="local JSON/JSONL log or directory to inspect for repeated reads/tool calls; may be repeated")
1677
+ structural.add_argument("--tool-catalog", action="append", default=[], help="local tool/MCP catalog JSON to inspect; may be repeated")
1678
+ structural.add_argument("--large-context-bytes", type=int, default=DEFAULT_LARGE_CONTEXT_BYTES)
1679
+ structural.add_argument("--huge-context-bytes", type=int, default=DEFAULT_HUGE_CONTEXT_BYTES)
1680
+ structural.add_argument("--long-context-lines", type=int, default=DEFAULT_LONG_CONTEXT_LINES)
1681
+ structural.add_argument("--duplicate-rule-min-chars", type=int, default=DEFAULT_DUPLICATE_RULE_MIN_CHARS)
1682
+ structural.add_argument("--duplicate-call-threshold", type=int, default=DEFAULT_DUPLICATE_CALL_THRESHOLD)
1683
+ structural.add_argument("--mcp-server-threshold", type=int, default=DEFAULT_MCP_SERVER_THRESHOLD)
1684
+ structural.add_argument("--tool-count-threshold", type=int, default=DEFAULT_TOOL_COUNT_THRESHOLD)
1685
+ structural.add_argument("--large-schema-bytes", type=int, default=DEFAULT_LARGE_SCHEMA_BYTES)
1686
+ structural.add_argument("--max-tool-catalog-bytes", type=int, default=DEFAULT_MAX_TOOL_CATALOG_BYTES)
1687
+ structural.add_argument("--max-log-bytes", type=int, default=DEFAULT_MAX_LOG_BYTES)
1688
+ structural.add_argument("--max-log-line-bytes", type=int, default=DEFAULT_MAX_LOG_LINE_BYTES)
1689
+ structural.add_argument("--max-structural-files", type=int, default=DEFAULT_MAX_STRUCTURAL_FILES)
1022
1690
  args = parser.parse_args()
1023
1691
 
1024
1692
  if args.command == "scan":
@@ -1028,6 +1696,13 @@ def main() -> int:
1028
1696
  else:
1029
1697
  print_text(report)
1030
1698
  return 0
1699
+ if args.command == "structural-waste":
1700
+ report = build_structural_waste_report(args)
1701
+ if args.json:
1702
+ print(json.dumps(report, indent=2, sort_keys=True, ensure_ascii=False))
1703
+ else:
1704
+ print_structural_waste_text(report)
1705
+ return 0
1031
1706
  parser.error("unknown command")
1032
1707
  return 2
1033
1708