@ictechgy/context-guard 0.4.1 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +15 -0
  2. package/README.ko.md +62 -33
  3. package/README.md +91 -23
  4. package/context-guard-kit/README.md +39 -26
  5. package/context-guard-kit/benchmark_runner.py +273 -8
  6. package/context-guard-kit/claude_transcript_cost_audit.py +597 -12
  7. package/context-guard-kit/context_compress.py +153 -1
  8. package/context-guard-kit/context_filter.py +446 -0
  9. package/context-guard-kit/context_guard_cli.py +3 -0
  10. package/context-guard-kit/context_guard_diet.py +677 -2
  11. package/context-guard-kit/context_pack.py +1694 -2
  12. package/context-guard-kit/cost_guard.py +1870 -0
  13. package/context-guard-kit/setup_wizard.py +820 -29
  14. package/context-guard-kit/trim_command_output.py +396 -45
  15. package/docs/benchmark-fixtures/learned-compression.tasks.example.json +24 -0
  16. package/docs/benchmark-fixtures/learned-compression.variants.example.json +10 -0
  17. package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +24 -0
  18. package/docs/benchmark-fixtures/visual-ocr.variants.example.json +10 -0
  19. package/docs/benchmark-workflow-examples.md +40 -0
  20. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +169 -0
  21. package/docs/benchmark-workflows/measured-token-workflow.example.json +170 -0
  22. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +170 -0
  23. package/docs/cache-diagnostics-schema.md +96 -0
  24. package/docs/cache-diagnostics.example.json +116 -0
  25. package/docs/cache-diagnostics.schema.json +460 -0
  26. package/docs/distribution.md +4 -2
  27. package/docs/experimental-benchmark-fixtures.md +36 -0
  28. package/package.json +11 -2
  29. package/packaging/homebrew/context-guard.rb.template +3 -2
  30. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  31. package/plugins/context-guard/README.ko.md +22 -14
  32. package/plugins/context-guard/README.md +24 -10
  33. package/plugins/context-guard/bin/context-guard +3 -0
  34. package/plugins/context-guard/bin/context-guard-audit +597 -12
  35. package/plugins/context-guard/bin/context-guard-bench +273 -8
  36. package/plugins/context-guard/bin/context-guard-compress +153 -1
  37. package/plugins/context-guard/bin/context-guard-cost +1870 -0
  38. package/plugins/context-guard/bin/context-guard-diet +677 -2
  39. package/plugins/context-guard/bin/context-guard-filter +446 -0
  40. package/plugins/context-guard/bin/context-guard-pack +1694 -2
  41. package/plugins/context-guard/bin/context-guard-setup +820 -29
  42. package/plugins/context-guard/bin/context-guard-trim-output +396 -45
  43. package/plugins/context-guard/brief/README.md +10 -3
  44. package/plugins/context-guard/skills/optimize/SKILL.md +5 -2
  45. package/plugins/context-guard/skills/setup/SKILL.md +3 -1
@@ -44,6 +44,55 @@ CODE_SIGNAL_RE = re.compile(
44
44
  r"(^\s*(def |class |function |func |import |from \S+ import |public |private |const |let |var |#include|package )"
45
45
  r"|[{};]\s*$|=>|::)"
46
46
  )
47
+ CODE_FENCE_RE = re.compile(r"(?m)^\s*```")
48
+ JSON_KEY_RE = re.compile(r'"(?:[^"\\]|\\.)*"\s*:')
49
+ QUOTED_STRING_RE = re.compile(r"""(?x)
50
+ "(?:[^"\\]|\\.)*" |
51
+ '(?:[^'\\]|\\.)*'
52
+ """)
53
+ HASH_RE = re.compile(r"\b(?:[0-9a-fA-F]{32,}|sha256:[0-9a-fA-F]{32,})\b")
54
+ PATH_RE = re.compile(
55
+ r"(?x)(?:"
56
+ r"(?<![\w.-])/(?:[A-Za-z0-9._@%+=:-]+/)*[A-Za-z0-9._@%+=:-]+"
57
+ r"|"
58
+ r"\b[A-Za-z]:\\(?:[^\\\s:\"'<>|]+\\)*[^\\\s:\"'<>|]+"
59
+ r"|"
60
+ r"\b[A-Za-z0-9._-]+\#path:[0-9a-f]{12}\b"
61
+ r")"
62
+ )
63
+ STACK_FRAME_RE = re.compile(
64
+ r"(?m)^\s*(?:File\s+\"[^\"]+\",\s+line\s+\d+,\s+in\s+\S+|at\s+\S+.*\([^)]*:\d+(?::\d+)?\))"
65
+ )
66
+ IDENTIFIER_RE = re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*(?:[A-Z][A-Za-z0-9_]*)?\b")
67
+ NUMERIC_CONSTANT_RE = re.compile(r"(?<![\w.])[-+]?(?:0x[0-9A-Fa-f]+|\d+(?:\.\d+)?)(?![\w.])")
68
+ PROTECTED_ZONE_KEYS = (
69
+ "code_fence",
70
+ "diff",
71
+ "identifier",
72
+ "numeric_constant",
73
+ "hash",
74
+ "path",
75
+ "stack_frame",
76
+ "quoted_string",
77
+ "json_key",
78
+ )
79
+ PROTECTED_ALLOWED_TRANSFORMS = (
80
+ "exact_dedupe",
81
+ "structural_window",
82
+ "line_truncate",
83
+ "whitespace_normalize",
84
+ "json_compact",
85
+ "artifact_retrieval",
86
+ )
87
+ PROTECTED_DENIED_TRANSFORMS = (
88
+ "semantic_compress",
89
+ "paraphrase",
90
+ "identifier_rewrite",
91
+ "numeric_rewrite",
92
+ "hash_rewrite",
93
+ "path_rewrite",
94
+ "quoted_literal_rewrite",
95
+ )
47
96
 
48
97
 
49
98
  def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
@@ -173,6 +222,85 @@ def classify_content(text: str) -> str:
173
222
  return "prose"
174
223
 
175
224
 
225
def protected_zone_counts(text: str) -> dict[str, int]:
    """Conservatively count semantic-sensitive zones without storing raw spans.

    The counts intentionally over-approximate. They are policy signals for later
    transform gates, not a parser. Metadata must never include the matched path,
    identifier, hash, or string contents because receipts are safe to share.

    Returns a mapping from zone key to a positive count, in
    ``PROTECTED_ZONE_KEYS`` order; zones without any match are omitted.
    """
    fence_markers = len(CODE_FENCE_RE.findall(text))
    diff_lines = sum(
        1
        for line in text.splitlines()
        if DIFF_FILE_HEADER_RE.match(line)
        or DIFF_HUNK_RE.match(line)
        # startswith() instead of `line[:1] in "+-"`: the empty string is a
        # substring of every string, so the slice test also matched blank lines
        # and reported ordinary paragraph breaks as diff content.
        or (line.startswith(("+", "-")) and not line.startswith(("+++", "---")))
    )
    counts = {
        # Fences come in open/close pairs; an unclosed fence still counts as one.
        "code_fence": (fence_markers + 1) // 2,
        "diff": diff_lines,
        "identifier": len(IDENTIFIER_RE.findall(text)),
        "numeric_constant": len(NUMERIC_CONSTANT_RE.findall(text)),
        "hash": len(HASH_RE.findall(text)),
        "path": len(PATH_RE.findall(text)),
        "stack_frame": len(STACK_FRAME_RE.findall(text)),
        "quoted_string": len(QUOTED_STRING_RE.findall(text)),
        "json_key": len(JSON_KEY_RE.findall(text)),
    }
    return {key: counts[key] for key in PROTECTED_ZONE_KEYS if counts.get(key, 0) > 0}
253
+
254
+
255
def build_protected_policy(
    *,
    text: str,
    content_type: str,
    strategy_detail: dict[str, object],
    lossy: bool,
) -> dict[str, object]:
    """Assemble the opt-in protected-zone policy section of a receipt.

    The policy describes which transforms are eligible and whether exact
    retrieval is expected. It makes no statement about provider-cache
    stability; cache ordering is handled by `context-guard-cost compile`.

    Only zone counts are recorded, never the matched text.
    """
    zones = protected_zone_counts(text)
    has_zones = bool(zones)
    # Exact retrieval only matters when something protected was seen AND the
    # transform dropped content.
    needs_retrieval = bool(has_zones and lossy)
    if needs_retrieval:
        scope = "sanitized_full_input"
    else:
        scope = "compressed_output"
    strategy_name = str(strategy_detail.get("strategy") or "unknown")
    strategy_section = {
        "name": strategy_name,
        "structural_only": True,
    }
    return {
        "enabled": True,
        "detected": has_zones,
        "content_type": content_type,
        "zone_counts": zones,
        "semantic_compress": False,
        "allowed_transforms": list(PROTECTED_ALLOWED_TRANSFORMS),
        "denied_transforms": list(PROTECTED_DENIED_TRANSFORMS),
        "retrieval_required": needs_retrieval,
        "retrieval_scope": scope,
        "raw_spans_stored": False,
        "policy_note": "Protected zones permit structural transforms only; no semantic/paraphrase rewrites.",
        "strategy": strategy_section,
    }
289
+
290
+
291
def build_transform_policy(protected_policy: dict[str, object]) -> dict[str, object]:
    """Condense a protected-zone policy into a transform-eligibility summary.

    The summary carries only flags and transform names, never protected content.
    """
    mode = "structural_default"
    if protected_policy.get("detected"):
        mode = "protected"
    exact_retrieval = bool(protected_policy.get("retrieval_required"))
    return {
        "mode": mode,
        "semantic_transforms_allowed": False,
        "semantic_compress": False,
        "allowed": list(PROTECTED_ALLOWED_TRANSFORMS),
        "denied": list(PROTECTED_DENIED_TRANSFORMS),
        "exact_retrieval_required": exact_retrieval,
        "raw_spans_stored": False,
    }
302
+
303
+
176
304
  def _looks_like_json(stripped: str) -> bool:
177
305
  if stripped[0] not in "{[":
178
306
  return False
@@ -353,6 +481,7 @@ def build_metadata(
353
481
  input_truncated: bool,
354
482
  input_bytes: int,
355
483
  max_bytes: int,
484
+ protected_policy_enabled: bool = False,
356
485
  ) -> dict[str, object]:
357
486
  """Assemble the compress receipt: observed byte/line counts plus an estimated token proxy.
358
487
 
@@ -370,7 +499,7 @@ def build_metadata(
370
499
  if lossy
371
500
  else "Data-preserving: compact form is semantically equivalent to the sanitized input."
372
501
  )
373
- return {
502
+ metadata: dict[str, object] = {
374
503
  "tool": "context-guard-kit.context_compress",
375
504
  "metadata_version": 1,
376
505
  "content_type": content_type,
@@ -407,6 +536,21 @@ def build_metadata(
407
536
  },
408
537
  "retrieval_hint": retrieval_hint,
409
538
  }
539
+ if protected_policy_enabled:
540
+ protected_policy = build_protected_policy(
541
+ text=original_text,
542
+ content_type=content_type,
543
+ strategy_detail=strategy_detail,
544
+ lossy=lossy,
545
+ )
546
+ metadata["protected_zone_policy"] = protected_policy
547
+ metadata["transform_policy"] = build_transform_policy(protected_policy)
548
+ if protected_policy.get("retrieval_required"):
549
+ metadata["retrieval_hint"] = (
550
+ "Protected lossy structural transform: store the full sanitized text with "
551
+ "`context-guard-artifact store` and retrieve exact slices before relying on omitted content."
552
+ )
553
+ return metadata
410
554
 
411
555
 
412
556
  def compress_text(
@@ -417,6 +561,7 @@ def compress_text(
417
561
  input_truncated: bool,
418
562
  input_bytes: int,
419
563
  max_bytes: int,
564
+ protected_policy_enabled: bool = False,
420
565
  ) -> tuple[str, dict[str, object]]:
421
566
  """Sanitize first, then classify and compress, then build the receipt.
422
567
 
@@ -446,6 +591,7 @@ def compress_text(
446
591
  input_truncated=input_truncated,
447
592
  input_bytes=input_bytes,
448
593
  max_bytes=max_bytes,
594
+ protected_policy_enabled=protected_policy_enabled,
449
595
  )
450
596
  return compressed, metadata
451
597
 
@@ -489,6 +635,7 @@ def run_compress(args: argparse.Namespace) -> int:
489
635
  input_truncated=input_truncated,
490
636
  input_bytes=input_bytes,
491
637
  max_bytes=max_bytes,
638
+ protected_policy_enabled=bool(args.protected_policy),
492
639
  )
493
640
  if args.json:
494
641
  payload = {"metadata": metadata, "content": compressed}
@@ -513,6 +660,11 @@ def build_parser() -> argparse.ArgumentParser:
513
660
  help="force a content type instead of auto-detecting (json/diff/log/search/code/prose)",
514
661
  )
515
662
  parser.add_argument("--json", action="store_true", help="emit JSON with metadata and compressed content")
663
+ parser.add_argument(
664
+ "--protected-policy",
665
+ action="store_true",
666
+ help="add opt-in protected-zone transform policy metadata to --json/--metadata-only receipts; default content is unchanged",
667
+ )
516
668
  parser.add_argument(
517
669
  "--metadata-only",
518
670
  action="store_true",
@@ -0,0 +1,446 @@
1
+ #!/usr/bin/env python3
2
+ """Validate and apply bounded declarative command-output filters.
3
+
4
+ This helper is intentionally opt-in. User filter configs live outside package
5
+ code and invalid/no-match/failure cases pass command output through rather than
6
+ risk hiding evidence.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ from dataclasses import dataclass
12
+ import json
13
+ from pathlib import Path
14
+ import re
15
+ import shlex
16
+ import subprocess
17
+ import sys
18
+ from typing import Any, Iterable
19
+
20
# Identifier every filter config must declare under "schema_version".
SCHEMA_VERSION = "contextguard.filter-dsl.v1"
# Name used in reports and in diagnostics written to stderr.
TOOL_NAME = "context-guard-filter"
# Config files larger than this are rejected before being read.
MAX_CONFIG_BYTES = 1_000_000
# Maximum number of filter entries accepted in one config.
MAX_FILTERS = 100
# Limit on include_regex + exclude_regex patterns per filter (also the default
# item limit for validated string lists).
MAX_REGEXES_PER_FILTER = 20
# Maximum length of a single pattern string.
MAX_REGEX_CHARS = 500
# Limits on match.argv_prefix: number of parts and length of each part.
MAX_ARG_PARTS = 64
MAX_ARG_CHARS = 200
# Default and upper clamp for --max-capture-bytes.
DEFAULT_MAX_CAPTURE_BYTES = 5_000_000
MAX_CAPTURE_BYTES_LIMIT = 50_000_000
# Default and upper clamp for --max-line-chars.
DEFAULT_MAX_LINE_CHARS = 100_000
MAX_LINE_CHARS_LIMIT = 1_000_000
# Upper bound for head_lines/tail_lines/max_lines and hard cap on emitted lines.
MAX_EMIT_LINES = 5_000
# Default and upper clamp for --timeout-seconds.
DEFAULT_TIMEOUT_SECONDS = 600
MAX_TIMEOUT_SECONDS = 86_400
# Exit status reported when the wrapped command times out.
TIMEOUT_EXIT_CODE = 124
# Keys allowed on a filter object and on its "match" object.
FILTER_KEYS = {"id", "match", "passthrough_on_exit", "include_regex", "exclude_regex", "head_lines", "tail_lines", "max_lines"}
MATCH_KEYS = {"argv_prefix", "argv_regex"}
# Executable basenames that are always treated as protected commands.
PROTECTED_BASENAMES = {
    "git",
    "gh",
    "pytest",
    "ruff",
    "mypy",
    "eslint",
    "vitest",
    "jest",
}
# npm/pnpm/yarn subcommands treated as protected.
PROTECTED_NPM_TASKS = {"test", "lint"}
# Modules treated as protected when launched as `python -m <module>`.
PROTECTED_PYTHON_MODULES = {"pytest", "ruff", "mypy"}
# Tokens that mark an argv as a test/lint invocation for wrapper commands.
PROTECTED_DIRECT_NAMES = {"pytest", "ruff", "mypy", "eslint", "vitest", "jest", "tox"}
PROTECTED_INTENT_TOKENS = {"test", "tests", "lint", "clippy"}
52
+
53
+
54
@dataclass(frozen=True)
class CompiledFilter:
    """One filter entry from the config with its regexes already compiled."""

    # Identifier from the config (or a synthesized "invalid-<index>" placeholder).
    id: str
    # Exact leading argv parts to match, or None when not configured.
    argv_prefix: tuple[str, ...] | None
    # Pattern searched against the shell-quoted command line, or None.
    argv_regex: re.Pattern[str] | None
    # When True, a nonzero exit status bypasses filtering.
    passthrough_on_exit: bool
    # A line is kept only if at least one include pattern matches (when any are set).
    include_regex: tuple[re.Pattern[str], ...]
    # A line is dropped if any exclude pattern matches.
    exclude_regex: tuple[re.Pattern[str], ...]
    # Number of leading / trailing lines to keep; None means not configured.
    head_lines: int | None
    tail_lines: int | None
    # Final cap on the number of emitted lines; None means not configured.
    max_lines: int | None
65
+
66
+
67
def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
    """Convert *value* to an int and clamp it between *minimum* and *maximum*.

    When the conversion fails, *default* is returned as-is (it is not clamped).
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        return default
    raised = parsed if parsed >= minimum else minimum
    return raised if raised <= maximum else maximum
73
+
74
+
75
def compact(text: str, limit: int = 160) -> str:
    """Collapse all whitespace runs to single spaces and shorten long text.

    Text longer than *limit* is cut to ``limit - 20`` characters (never below
    zero) and suffixed with a marker that records the collapsed length.
    """
    collapsed = " ".join(str(text).split())
    total = len(collapsed)
    if total > limit:
        keep = max(0, limit - 20)
        return f"{collapsed[:keep]}…[trimmed:{total}]"
    return collapsed
80
+
81
+
82
+ def read_json_limited(path: Path) -> tuple[Any | None, list[str]]:
83
+ try:
84
+ size = path.stat().st_size
85
+ if size > MAX_CONFIG_BYTES:
86
+ return None, [f"config file too large: {size}>{MAX_CONFIG_BYTES} bytes"]
87
+ raw = path.read_text(encoding="utf-8")
88
+ except OSError as exc:
89
+ return None, [f"could not read config: {exc.strerror or exc.__class__.__name__}"]
90
+ try:
91
+ return json.loads(raw), []
92
+ except json.JSONDecodeError as exc:
93
+ return None, [f"invalid JSON at line {exc.lineno}: {exc.msg}"]
94
+
95
+
96
def validate_str_list(value: Any, *, field: str, errors: list[str], max_items: int = MAX_REGEXES_PER_FILTER) -> list[str]:
    """Return the acceptable strings from an optional JSON list.

    ``None`` yields an empty list without an error. Problems are appended to
    *errors* (non-list value, too many items, blank or non-string items, items
    longer than ``MAX_REGEX_CHARS``); offending items are left out of the
    result and only the first *max_items* items are inspected.
    """
    if value is None:
        return []
    if not isinstance(value, list):
        errors.append(f"{field} must be a list")
        return []
    if len(value) > max_items:
        errors.append(f"{field} has too many items: {len(value)}>{max_items}")
    accepted: list[str] = []
    for position, entry in enumerate(value[:max_items]):
        if not isinstance(entry, str) or not entry.strip():
            errors.append(f"{field}[{position}] must be a non-empty string")
        elif len(entry) > MAX_REGEX_CHARS:
            errors.append(f"{field}[{position}] exceeds {MAX_REGEX_CHARS} chars")
        else:
            accepted.append(entry)
    return accepted
114
+
115
+
116
def compile_regexes(patterns: Iterable[str], *, field: str, errors: list[str]) -> tuple[re.Pattern[str], ...]:
    """Compile *patterns*, collecting failures in *errors* instead of raising.

    Returns the successfully compiled patterns in input order. Each pattern
    that cannot be compiled adds one ``"<field>[<index>] invalid regex: ..."``
    message to *errors* and is skipped.
    """
    compiled: list[re.Pattern[str]] = []
    for idx, pattern in enumerate(patterns):
        try:
            compiled.append(re.compile(pattern))
        except re.error as exc:
            errors.append(f"{field}[{idx}] invalid regex: {compact(str(exc), 120)}")
        except (OverflowError, RecursionError) as exc:
            # The regex parser signals oversized repetition counts (e.g.
            # "a{99999999999}") with OverflowError and pathological nesting with
            # RecursionError, neither of which is re.error. Treat them as
            # invalid patterns so a bad config is reported, not a crash.
            errors.append(f"{field}[{idx}] invalid regex: {type(exc).__name__}")
    return tuple(compiled)
124
+
125
+
126
+ def bounded_optional_int(raw: Any, *, field: str, errors: list[str], minimum: int = 0) -> int | None:
127
+ if raw is None:
128
+ return None
129
+ if not isinstance(raw, int) or isinstance(raw, bool):
130
+ errors.append(f"{field} must be an integer")
131
+ return None
132
+ if raw < minimum or raw > MAX_EMIT_LINES:
133
+ errors.append(f"{field} out of bounds: {minimum}..{MAX_EMIT_LINES}")
134
+ return None
135
+ return raw
136
+
137
+
138
def validate_config(raw: Any) -> tuple[list[CompiledFilter], list[str]]:
    """Validate a parsed filter config and compile its filters.

    Returns ``(compiled_filters, errors)``. Validation continues past most
    problems so that one pass reports as many errors as possible; the config is
    valid only when ``errors`` is empty. Filters are still compiled and returned
    when errors were found, so callers must check ``errors`` first.
    """
    errors: list[str] = []
    if not isinstance(raw, dict):
        return [], ["config root must be a JSON object"]
    unknown_root = sorted(set(raw) - {"schema_version", "filters"})
    if unknown_root:
        errors.append(f"unknown root keys: {', '.join(unknown_root)}")
    if raw.get("schema_version") != SCHEMA_VERSION:
        errors.append(f"schema_version must be {SCHEMA_VERSION}")
    filters_raw = raw.get("filters")
    if not isinstance(filters_raw, list) or not filters_raw:
        errors.append("filters must be a non-empty list")
        return [], errors
    if len(filters_raw) > MAX_FILTERS:
        errors.append(f"filters has too many items: {len(filters_raw)}>{MAX_FILTERS}")
    seen_ids: set[str] = set()
    compiled: list[CompiledFilter] = []
    # Only the first MAX_FILTERS entries are inspected; the overflow was reported above.
    for idx, item in enumerate(filters_raw[:MAX_FILTERS]):
        prefix = f"filters[{idx}]"
        if not isinstance(item, dict):
            errors.append(f"{prefix} must be an object")
            continue
        unknown = sorted(set(item) - FILTER_KEYS)
        if unknown:
            errors.append(f"{prefix} unknown keys: {', '.join(unknown)}")
        fid = item.get("id")
        if not isinstance(fid, str) or not re.fullmatch(r"[A-Za-z0-9._-]{1,80}", fid):
            errors.append(f"{prefix}.id must match [A-Za-z0-9._-] and be <=80 chars")
            # Placeholder id so the entry can still be compiled and reported on.
            fid = f"invalid-{idx}"
        elif fid in seen_ids:
            errors.append(f"{prefix}.id duplicates {fid}")
        seen_ids.add(str(fid))
        match = item.get("match")
        argv_prefix: tuple[str, ...] | None = None
        argv_regex: re.Pattern[str] | None = None
        if not isinstance(match, dict):
            errors.append(f"{prefix}.match must be an object")
        else:
            unknown_match = sorted(set(match) - MATCH_KEYS)
            if unknown_match:
                errors.append(f"{prefix}.match unknown keys: {', '.join(unknown_match)}")
            if "argv_prefix" in match:
                parts = validate_str_list(match.get("argv_prefix"), field=f"{prefix}.match.argv_prefix", errors=errors, max_items=MAX_ARG_PARTS)
                for part_idx, part in enumerate(parts):
                    if len(part) > MAX_ARG_CHARS:
                        errors.append(f"{prefix}.match.argv_prefix[{part_idx}] exceeds {MAX_ARG_CHARS} chars")
                # An empty list leaves argv_prefix as None (no prefix matcher).
                if parts:
                    argv_prefix = tuple(parts)
            if "argv_regex" in match:
                pattern = match.get("argv_regex")
                if not isinstance(pattern, str) or not pattern.strip():
                    errors.append(f"{prefix}.match.argv_regex must be a non-empty string")
                elif len(pattern) > MAX_REGEX_CHARS:
                    errors.append(f"{prefix}.match.argv_regex exceeds {MAX_REGEX_CHARS} chars")
                else:
                    compiled_argv_regex = compile_regexes([pattern], field=f"{prefix}.match.argv_regex", errors=errors)
                    argv_regex = compiled_argv_regex[0] if compiled_argv_regex else None
            # A filter needs at least one usable matcher.
            if not argv_prefix and argv_regex is None:
                errors.append(f"{prefix}.match requires argv_prefix or argv_regex")
        passthrough = item.get("passthrough_on_exit", True)
        if not isinstance(passthrough, bool):
            errors.append(f"{prefix}.passthrough_on_exit must be boolean")
            # Fall back to the default (pass output through on nonzero exit).
            passthrough = True
        include = validate_str_list(item.get("include_regex"), field=f"{prefix}.include_regex", errors=errors)
        exclude = validate_str_list(item.get("exclude_regex"), field=f"{prefix}.exclude_regex", errors=errors)
        # The per-filter regex budget applies to include and exclude combined.
        if len(include) + len(exclude) > MAX_REGEXES_PER_FILTER:
            errors.append(f"{prefix} has too many regexes: {len(include) + len(exclude)}>{MAX_REGEXES_PER_FILTER}")
        head = bounded_optional_int(item.get("head_lines"), field=f"{prefix}.head_lines", errors=errors)
        tail = bounded_optional_int(item.get("tail_lines"), field=f"{prefix}.tail_lines", errors=errors)
        max_lines = bounded_optional_int(item.get("max_lines"), field=f"{prefix}.max_lines", errors=errors, minimum=1)
        compiled.append(CompiledFilter(
            id=str(fid),
            argv_prefix=argv_prefix,
            argv_regex=argv_regex,
            passthrough_on_exit=passthrough,
            include_regex=compile_regexes(include, field=f"{prefix}.include_regex", errors=errors),
            exclude_regex=compile_regexes(exclude, field=f"{prefix}.exclude_regex", errors=errors),
            head_lines=head,
            tail_lines=tail,
            max_lines=max_lines,
        ))
    return compiled, errors
220
+
221
+
222
def load_filters(path: Path) -> tuple[list[CompiledFilter], list[str]]:
    """Load the config at *path* and return ``(compiled_filters, errors)``.

    Read or parse failures short-circuit with no filters; otherwise the parsed
    document is handed to ``validate_config``.
    """
    document, read_errors = read_json_limited(path)
    return ([], read_errors) if read_errors else validate_config(document)
227
+
228
+
229
def command_text(argv: list[str]) -> str:
    """Render *argv* as a single shell-quoted command line.

    Falls back to a plain space-joined string if quoting fails.
    """
    try:
        rendered = shlex.join(argv)
    except Exception:
        rendered = " ".join(argv)
    return rendered
234
+
235
+
236
def filter_matches(flt: CompiledFilter, argv: list[str]) -> bool:
    """Tell whether *flt* applies to the command *argv*.

    A filter matches when its argv prefix equals the leading arguments, or when
    its argv regex is found in the shell-quoted command line.
    """
    prefix = flt.argv_prefix
    if prefix is not None and tuple(argv[: len(prefix)]) == prefix:
        return True
    pattern = flt.argv_regex
    return pattern is not None and bool(pattern.search(command_text(argv)))
242
+
243
+
244
def basename(arg: str) -> str:
    """Return the lower-cased final path component of *arg*."""
    leaf = Path(arg).name
    return leaf.lower()
246
+
247
+
248
def argv_signal_tokens(argv: list[str]) -> set[str]:
    """Collect the lower-cased tokens used to recognise test/lint commands.

    For every argument this adds its basename plus each alphanumeric fragment
    of that basename (split on anything outside ``a-z0-9``).
    """
    found: set[str] = set()
    for raw in argv:
        name = basename(raw)
        if not name:
            continue
        found.add(name)
        found |= {piece for piece in re.split(r"[^a-z0-9]+", name) if piece}
    return found
256
+
257
+
258
def has_test_lint_signal(argv: list[str]) -> bool:
    """Report whether any argument token names a test or lint tool or intent."""
    tokens = argv_signal_tokens(argv)
    names_tool = not tokens.isdisjoint(PROTECTED_DIRECT_NAMES)
    return names_tool or not tokens.isdisjoint(PROTECTED_INTENT_TOKENS)
261
+
262
+
263
def is_protected_command(argv: list[str]) -> bool:
    """Decide whether *argv* is a protected (VCS, test or lint) command.

    Recognised forms: a protected executable basename, ``python -m`` with a
    protected module, npm/pnpm/yarn test or lint tasks and scripts, wrapper
    tools whose arguments carry a test/lint token, ``go test``, and
    ``cargo test`` / ``cargo clippy``. An empty argv is never protected.
    """
    if not argv:
        return False
    exe = basename(argv[0])
    argc = len(argv)
    if exe in PROTECTED_BASENAMES:
        return True
    if exe in {"python", "python3"} and argc >= 3 and argv[1] == "-m" and basename(argv[2]) in PROTECTED_PYTHON_MODULES:
        return True
    if exe in {"npm", "pnpm", "yarn"} and argc >= 2:
        subcommand = argv[1]
        if subcommand in PROTECTED_NPM_TASKS:
            return True
        # `run <script>` and `exec`/`x`/`dlx <tool>` are judged by what they launch.
        if argc >= 3 and subcommand in {"run", "exec", "x", "dlx"} and has_test_lint_signal(argv[2:]):
            return True
    if exe in {"npx", "bun", "make", "gradle", "gradlew", "mvn", "poetry", "uv", "pipenv", "hatch", "tox"} and has_test_lint_signal(argv):
        return True
    if exe == "go":
        return argc >= 2 and argv[1] == "test"
    if exe == "cargo":
        return argc >= 2 and argv[1] in {"test", "clippy"}
    return False
285
+
286
+
287
def cap_line(line: str, max_chars: int) -> str:
    """Shorten *line* to roughly *max_chars*, marking how long it was.

    Lines within the limit come back unchanged. Longer lines are cut, suffixed
    with a marker carrying the original length, and keep a trailing newline
    when the original had one.
    """
    length = len(line)
    if length <= max_chars:
        return line
    newline = "\n" if line.endswith("\n") else ""
    marker = f"...[line capped:{length} chars]"
    keep = max(0, max_chars - len(marker) - len(newline))
    return f"{line[:keep]}{marker}{newline}"
293
+
294
+
295
def select_lines(lines: list[str], flt: CompiledFilter, max_line_chars: int) -> list[str]:
    """Apply the line rules of *flt* to *lines* and return what should be emitted.

    Order of operations: cap each line's length, keep lines matching any
    include pattern, drop lines matching any exclude pattern, reduce to the
    configured head/tail window, then truncate to ``max_lines`` and finally to
    ``MAX_EMIT_LINES``.
    """
    kept = [cap_line(raw, max_line_chars) for raw in lines]
    includes = flt.include_regex
    if includes:
        kept = [text for text in kept if any(rx.search(text) for rx in includes)]
    excludes = flt.exclude_regex
    if excludes:
        kept = [text for text in kept if not any(rx.search(text) for rx in excludes)]
    if not (flt.head_lines is None and flt.tail_lines is None):
        head_n = 0 if flt.head_lines is None else flt.head_lines
        tail_n = 0 if flt.tail_lines is None else flt.tail_lines
        head = kept[:head_n] if head_n else []
        tail = kept[-tail_n:] if tail_n else []
        if head and tail:
            # Drop the part of the tail that the head already covers.
            overlap = len(head) + len(tail) - len(kept)
            tail = tail[max(0, overlap):]
        kept = head + tail
    if flt.max_lines is not None and len(kept) > flt.max_lines:
        kept = kept[: flt.max_lines]
    if len(kept) > MAX_EMIT_LINES:
        kept = kept[:MAX_EMIT_LINES]
    return kept
315
+
316
+
317
def validation_payload(valid: bool, errors: list[str], count: int = 0) -> dict[str, Any]:
    """Build the JSON-serialisable result of a ``validate`` run."""
    payload: dict[str, Any] = {
        "tool": TOOL_NAME,
        "schema_version": SCHEMA_VERSION,
        "mode": "validate",
        "valid": valid,
        "filter_count": count,
        "errors": errors,
    }
    return payload
319
+
320
+
321
def print_validation(valid: bool, errors: list[str], count: int, as_json: bool) -> None:
    """Print a validation result.

    JSON mode writes one sorted-key JSON object to stdout. Text mode writes a
    success line to stdout, or a failure header plus one line per error to
    stderr.
    """
    if as_json:
        print(json.dumps(validation_payload(valid, errors, count), ensure_ascii=False, sort_keys=True))
        return
    if valid:
        print(f"{TOOL_NAME}: valid filter config ({count} filter(s))")
        return
    print(f"{TOOL_NAME}: invalid filter config", file=sys.stderr)
    for message in errors:
        print(f"- {message}", file=sys.stderr)
330
+
331
+
332
+ def timeout_text(value: str | bytes | None) -> str:
333
+ if isinstance(value, str):
334
+ return value
335
+ return (value or b"").decode("utf-8", "replace")
336
+
337
+
338
def run_command(argv: list[str], timeout_seconds: int) -> tuple[int, str, str, bool]:
    """Run *argv*, capturing its output, and never raise for launch or timeout.

    Returns ``(exit_code, stdout, stderr, timed_out)``. A timeout yields
    ``TIMEOUT_EXIT_CODE`` with whatever output was captured plus a notice
    appended to stderr; a command that cannot be started yields exit code 127
    with an explanatory stderr message.
    """
    try:
        completed = subprocess.run(
            argv,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired as expired:
        partial_out = timeout_text(expired.stdout)
        partial_err = timeout_text(expired.stderr) + f"\n[{TOOL_NAME}] command timed out after {timeout_seconds}s\n"
        return TIMEOUT_EXIT_CODE, partial_out, partial_err, True
    except OSError as exc:
        return 127, "", f"{TOOL_NAME}: command failed to start: {exc.strerror or exc.__class__.__name__}\n", False
    return completed.returncode, completed.stdout or "", completed.stderr or "", False
348
+
349
+
350
def emit_run_report(args: argparse.Namespace, payload: dict[str, Any]) -> None:
    """Write the decision report for a run to stderr, when one is warranted.

    Nothing is written for a protected command that exited nonzero. With
    ``--json-report`` the whole payload is emitted as JSON. Otherwise only a
    short notice is printed, and only for passthrough decisions whose reason is
    neither "no-match" nor "nonzero-passthrough".
    """
    if payload.get("protected_nonzero"):
        return
    if args.json_report:
        print(json.dumps(payload, ensure_ascii=False, sort_keys=True), file=sys.stderr)
        return
    if payload.get("decision") != "passthrough":
        return
    if payload.get("reason") in {"no-match", "nonzero-passthrough"}:
        return
    print(f"{TOOL_NAME}: passthrough: {payload.get('reason')}", file=sys.stderr)
357
+
358
+
359
def cmd_validate(args: argparse.Namespace) -> int:
    """Handle the ``validate`` subcommand; exit status 0 if valid, 2 otherwise."""
    config_path = Path(args.config).expanduser()
    filters, errors = load_filters(config_path)
    is_valid = not errors
    print_validation(is_valid, errors, len(filters), args.json)
    return 0 if is_valid else 2
363
+
364
+
365
def cmd_run(args: argparse.Namespace) -> int:
    """Handle the ``run`` subcommand: run the command and filter or pass through its output.

    Returns the wrapped command's exit code (or 2 when no command was given).
    Output is filtered only when a filter matches and nothing argues for
    passthrough; in every other case stdout and stderr are written back to
    their own streams unchanged.
    """
    command = list(args.command)
    # argparse.REMAINDER keeps a leading "--" separator; drop it.
    if command and command[0] == "--":
        command = command[1:]
    if not command:
        print(f"{TOOL_NAME}: missing command", file=sys.stderr)
        return 2
    max_capture = bounded_int(args.max_capture_bytes, DEFAULT_MAX_CAPTURE_BYTES, 1, MAX_CAPTURE_BYTES_LIMIT)
    max_line_chars = bounded_int(args.max_line_chars, DEFAULT_MAX_LINE_CHARS, 1, MAX_LINE_CHARS_LIMIT)
    timeout_seconds = bounded_int(args.timeout_seconds, DEFAULT_TIMEOUT_SECONDS, 1, MAX_TIMEOUT_SECONDS)
    filters, errors = load_filters(Path(args.config).expanduser())
    # The command runs even when the config failed to load; its output is then passed through.
    rc, stdout_text, stderr_text, timed_out = run_command(command, timeout_seconds)
    # Filtering works on stdout followed by stderr as one text.
    output = stdout_text + stderr_text
    output_bytes = len(output.encode("utf-8", "replace"))
    protected_nonzero = rc != 0 and is_protected_command(command)
    report: dict[str, Any] = {"tool": TOOL_NAME, "schema_version": SCHEMA_VERSION, "mode": "run", "command_exit_code": rc, "decision": "passthrough", "reason": "unclassified", "protected_nonzero": protected_nonzero}
    if timed_out:
        report["reason"] = "timeout"
    elif errors:
        report["reason"] = "invalid-config"
        report["errors"] = errors[:10]
    elif output_bytes > max_capture:
        report["reason"] = "capture-limit"
        report["output_bytes"] = output_bytes
        report["max_capture_bytes"] = max_capture
    else:
        # First matching filter wins.
        matched = next((flt for flt in filters if filter_matches(flt, command)), None)
        if matched is None:
            report["reason"] = "no-match"
        elif protected_nonzero:
            report["reason"] = "protected-nonzero"
            report["filter_id"] = matched.id
        elif rc != 0 and matched.passthrough_on_exit:
            report["reason"] = "nonzero-passthrough"
            report["filter_id"] = matched.id
        else:
            try:
                lines = output.splitlines(keepends=True)
                filtered = select_lines(lines, matched, max_line_chars)
            except re.error as exc:
                report["reason"] = f"filter-error:{compact(str(exc), 80)}"
                report["filter_id"] = matched.id
            else:
                # A filter that removes everything falls back to the raw output.
                if output and not filtered:
                    report["reason"] = "empty-output-fallback"
                    report["filter_id"] = matched.id
                else:
                    # Filtered path: the combined, filtered text goes to stdout only.
                    sys.stdout.write("".join(filtered))
                    report.update({"decision": "filtered", "reason": "matched", "filter_id": matched.id, "input_lines": len(lines), "output_lines": len(filtered)})
                    emit_run_report(args, report)
                    return rc
    # Passthrough path: every non-filtered outcome ends up here.
    sys.stdout.write(stdout_text)
    sys.stderr.write(stderr_text)
    emit_run_report(args, report)
    return rc
420
+
421
+
422
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with its ``validate`` and ``run`` subcommands."""
    root = argparse.ArgumentParser(
        prog=TOOL_NAME,
        description="Validate and apply bounded declarative command-output filters. Filtered mode applies line rules to combined stdout+stderr and writes the filtered result to stdout; passthrough mode preserves stdout/stderr streams.",
    )
    subcommands = root.add_subparsers(dest="command_name", required=True)

    validate_cmd = subcommands.add_parser("validate", help="validate a filter DSL JSON file")
    validate_cmd.add_argument("--config", required=True, help="path to user-owned filter JSON")
    validate_cmd.add_argument("--json", action="store_true", help="emit validation result as JSON")
    validate_cmd.set_defaults(func=cmd_validate)

    run_cmd = subcommands.add_parser("run", help="run a command and apply the first matching safe filter")
    run_cmd.add_argument("--config", required=True, help="path to user-owned filter JSON")
    run_cmd.add_argument(
        "--json-report",
        action="store_true",
        help="emit filter decision JSON to stderr; protected nonzero passthrough suppresses reports to preserve raw stderr",
    )
    run_cmd.add_argument("--max-capture-bytes", type=int, default=DEFAULT_MAX_CAPTURE_BYTES)
    run_cmd.add_argument("--max-line-chars", type=int, default=DEFAULT_MAX_LINE_CHARS)
    run_cmd.add_argument("--timeout-seconds", type=int, default=DEFAULT_TIMEOUT_SECONDS)
    # Everything after the options is the command to run.
    run_cmd.add_argument("command", nargs=argparse.REMAINDER)
    run_cmd.set_defaults(func=cmd_run)
    return root
438
+
439
+
440
def main() -> int:
    """Parse the command line, run the selected subcommand, and return its exit status."""
    parsed = build_parser().parse_args()
    handler = parsed.func
    return int(handler(parsed))
443
+
444
+
445
+ if __name__ == "__main__":
446
+ raise SystemExit(main())
@@ -19,6 +19,7 @@ PACKAGE_NAME = "@ictechgy/context-guard"
19
19
 
20
20
  HELPER_SUBCOMMANDS: dict[str, tuple[str, ...]] = {
21
21
  "setup": ("context-guard-setup",),
22
+ "doctor": ("context-guard-setup", "--verify"),
22
23
  "audit": ("context-guard-audit",),
23
24
  "diet": ("context-guard-diet",),
24
25
  "scan": ("context-guard-diet", "scan"),
@@ -26,10 +27,12 @@ HELPER_SUBCOMMANDS: dict[str, tuple[str, ...]] = {
26
27
  "trim": ("context-guard-trim-output",),
27
28
  "sanitize-output": ("context-guard-sanitize-output",),
28
29
  "sanitize": ("context-guard-sanitize-output",),
30
+ "filter": ("context-guard-filter",),
29
31
  "artifact": ("context-guard-artifact",),
30
32
  "pack": ("context-guard-pack",),
31
33
  "tool-prune": ("context-guard-tool-prune",),
32
34
  "compress": ("context-guard-compress",),
35
+ "cost": ("context-guard-cost",),
33
36
  "bench": ("context-guard-bench",),
34
37
  "read-symbol": ("context-guard-read-symbol",),
35
38
  "rewrite-bash": ("context-guard-rewrite-bash",),