@ictechgy/context-guard 0.4.9 → 0.4.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/CHANGELOG.md +16 -0
  2. package/README.ko.md +41 -24
  3. package/README.md +66 -26
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  8. package/docs/distribution.md +10 -7
  9. package/docs/experimental-benchmark-fixtures.md +8 -1
  10. package/package.json +3 -6
  11. package/packaging/homebrew/context-guard.rb.template +1 -1
  12. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  13. package/plugins/context-guard/README.ko.md +9 -6
  14. package/plugins/context-guard/README.md +21 -13
  15. package/plugins/context-guard/bin/context-guard +113 -26
  16. package/plugins/context-guard/bin/context-guard-artifact +542 -46
  17. package/plugins/context-guard/bin/context-guard-cache-score +380 -0
  18. package/plugins/context-guard/bin/context-guard-compress +146 -1
  19. package/plugins/context-guard/bin/context-guard-cost +783 -4
  20. package/plugins/context-guard/bin/context-guard-experiments +99 -18
  21. package/plugins/context-guard/bin/context-guard-failed-nudge +3 -0
  22. package/plugins/context-guard/bin/context-guard-filter +163 -7
  23. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  24. package/plugins/context-guard/bin/context-guard-pack +602 -43
  25. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  26. package/plugins/context-guard/bin/context-guard-setup +165 -31
  27. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  28. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  29. package/plugins/context-guard/bin/context-guard-tool-prune +241 -1
  30. package/plugins/context-guard/lib/context_guard_commands.py +206 -0
  31. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  32. package/context-guard-kit/README.md +0 -91
  33. package/context-guard-kit/benchmark_runner.py +0 -2401
  34. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  35. package/context-guard-kit/context_compress.py +0 -695
  36. package/context-guard-kit/context_escrow.py +0 -935
  37. package/context-guard-kit/context_filter.py +0 -637
  38. package/context-guard-kit/context_guard_cli.py +0 -325
  39. package/context-guard-kit/context_guard_diet.py +0 -1711
  40. package/context-guard-kit/context_pack.py +0 -2713
  41. package/context-guard-kit/cost_guard.py +0 -2349
  42. package/context-guard-kit/experimental_registry.py +0 -4348
  43. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  44. package/context-guard-kit/guard_large_read.py +0 -690
  45. package/context-guard-kit/hook_secret_patterns.py +0 -43
  46. package/context-guard-kit/read_symbol.py +0 -483
  47. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  48. package/context-guard-kit/sanitize_output.py +0 -725
  49. package/context-guard-kit/settings.example.json +0 -67
  50. package/context-guard-kit/setup_wizard.py +0 -2515
  51. package/context-guard-kit/statusline.sh +0 -362
  52. package/context-guard-kit/statusline_merged.sh +0 -157
  53. package/context-guard-kit/tool_schema_pruner.py +0 -837
  54. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -1,695 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Classify stdin content and emit a sanitized, token-budget-friendly compression.
3
-
4
- The CLI never promises lossless *semantic* compression. It performs conservative,
5
- deterministic, content-type-aware shrinking (compact JSON, diff change-only views,
6
- log/search de-duplication, whitespace normalization) so large local output costs
7
- fewer tokens to keep in context. Secrets are redacted *before* the receipt is built,
8
- so no secret ever reaches the compressed body or the metadata.
9
-
10
- For exact byte-for-byte recovery the receipt points at `context-guard-artifact store`,
11
- which keeps the full sanitized content as a queryable local artifact.
12
- """
13
- from __future__ import annotations
14
-
15
- import argparse
16
- import importlib.machinery
17
- import importlib.util
18
- import json
19
- import os
20
- from pathlib import Path
21
- import re
22
- import sys
23
- from typing import Callable
24
-
25
# Default and hard upper bound for the number of stdin bytes the CLI will read.
DEFAULT_MAX_BYTES = 10_000_000
MAX_MAX_BYTES = 100_000_000
# The token estimate is only a conservative proxy (not an observed value). It uses an
# average ~4 chars/token heuristic and is labeled measurement="estimated" in the
# metadata so it cannot be confused with an observed token count.
TOKEN_PROXY_CHARS_PER_TOKEN = 4
CONTENT_TYPES = ("json", "diff", "log", "search", "code", "prose")

# Identifies diff structure lines (file header / hunk / change). The remaining
# context lines are folded away to shrink the output.
DIFF_FILE_HEADER_RE = re.compile(r"^(diff --git |index [0-9a-f]|--- |\+\+\+ |rename |similarity |new file|deleted file)")
DIFF_HUNK_RE = re.compile(r"^@@ .* @@")
# search (grep/ripgrep) lines: `path:line:content` or `path:content`.
# Whitespace is disallowed in the path token before the colon so that timestamped
# logs ("2026-01-01 00:00:00 ...") are not misclassified as search output
# (a log timestamp contains whitespace before its first colon).
SEARCH_LINE_RE = re.compile(r"^[^\s:][^:\n\s]*:(?:\d+:)?.")
# log signals: a leading timestamp or a log-level token.
LOG_LEVEL_RE = re.compile(r"\b(TRACE|DEBUG|INFO|NOTICE|WARN|WARNING|ERROR|FATAL|CRITICAL)\b")
LOG_TIMESTAMP_RE = re.compile(r"^\s*(?:\[)?\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}|^\s*\d{2}:\d{2}:\d{2}\b")
# code signals: common source keywords/punctuation. Diff detection runs first so
# the two classifications do not overlap.
CODE_SIGNAL_RE = re.compile(
    r"(^\s*(def |class |function |func |import |from \S+ import |public |private |const |let |var |#include|package )"
    r"|[{};]\s*$|=>|::)"
)
# The patterns below are used by protected_zone_counts() to count occurrences only;
# they deliberately over-match and are not parsers.
CODE_FENCE_RE = re.compile(r"(?m)^\s*```")
JSON_KEY_RE = re.compile(r'"(?:[^"\\]|\\.)*"\s*:')
QUOTED_STRING_RE = re.compile(r"""(?x)
    "(?:[^"\\]|\\.)*" |
    '(?:[^'\\]|\\.)*'
""")
HASH_RE = re.compile(r"\b(?:[0-9a-fA-F]{32,}|sha256:[0-9a-fA-F]{32,})\b")
# Three alternatives: POSIX-style absolute paths, Windows drive paths, and
# `name#path:<12 hex>` tokens (`#` is escaped because the pattern is verbose-mode).
PATH_RE = re.compile(
    r"(?x)(?:"
    r"(?<![\w.-])/(?:[A-Za-z0-9._@%+=:-]+/)*[A-Za-z0-9._@%+=:-]+"
    r"|"
    r"\b[A-Za-z]:\\(?:[^\\\s:\"'<>|]+\\)*[^\\\s:\"'<>|]+"
    r"|"
    r"\b[A-Za-z0-9._-]+\#path:[0-9a-f]{12}\b"
    r")"
)
# Matches `File "...", line N, in name` frames and `at name (...:N[:N])` frames.
STACK_FRAME_RE = re.compile(
    r"(?m)^\s*(?:File\s+\"[^\"]+\",\s+line\s+\d+,\s+in\s+\S+|at\s+\S+.*\([^)]*:\d+(?::\d+)?\))"
)
IDENTIFIER_RE = re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*(?:[A-Z][A-Za-z0-9_]*)?\b")
NUMERIC_CONSTANT_RE = re.compile(r"(?<![\w.])[-+]?(?:0x[0-9A-Fa-f]+|\d+(?:\.\d+)?)(?![\w.])")
# Zone names reported by protected_zone_counts(), in output order.
PROTECTED_ZONE_KEYS = (
    "code_fence",
    "diff",
    "identifier",
    "numeric_constant",
    "hash",
    "path",
    "stack_frame",
    "quoted_string",
    "json_key",
)
# Transform names listed as allowed in the protected-zone policy metadata.
PROTECTED_ALLOWED_TRANSFORMS = (
    "exact_dedupe",
    "structural_window",
    "line_truncate",
    "whitespace_normalize",
    "json_compact",
    "artifact_retrieval",
)
# Transform names listed as denied in the protected-zone policy metadata.
PROTECTED_DENIED_TRANSFORMS = (
    "semantic_compress",
    "paraphrase",
    "identifier_rewrite",
    "numeric_rewrite",
    "hash_rewrite",
    "path_rewrite",
    "quoted_literal_rewrite",
)
96
-
97
-
98
def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
    """Coerce *value* to an int and clamp it to the range [minimum, maximum].

    Returns *default* unchanged (not clamped) when *value* cannot be converted.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        return default
    # Raise to the floor first, then cap at the ceiling (same order as min(max(...))).
    floored = parsed if parsed >= minimum else minimum
    return floored if floored <= maximum else maximum
105
-
106
-
107
class FallbackLineSanitizer:
    """Small built-in secret redactor used when no shared sanitizer script is found."""

    # Group 1 wraps the whole alternation; group 2 captures the `key = ` / `key: `
    # prefix of the last alternative so it can be kept while the value is masked.
    SECRET_VALUE_RE = re.compile(
        r"(?i)(Bearer\s+\S+|Basic\s+\S+|gh[pousr]_[A-Za-z0-9_]{20,}|"
        r"github_pat_[A-Za-z0-9_]{20,}|xox[abprs]-[A-Za-z0-9-]{10,}|"
        r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
        r"AIza[0-9A-Za-z_\-]{20,}|"
        r"([A-Za-z0-9_.-]*(?:api[_-]?key|token|secret|password|passwd|pwd)[A-Za-z0-9_.-]*\s*[:=]\s*)\S+)"
    )

    def __init__(self, *, show_paths: bool = False) -> None:
        # show_paths is stored but not consulted by this fallback implementation.
        self.show_paths = show_paths
        # Number of lines (not individual matches) that had at least one redaction.
        self.redactions = 0

    @staticmethod
    def _mask(match: re.Match[str]) -> str:
        """Return the replacement for one match, keeping a captured key prefix if any."""
        key_prefix = match.group(2)
        if key_prefix:
            return key_prefix + "[REDACTED]"
        return "[REDACTED]"

    def sanitize(self, raw_line: str) -> tuple[str, bool]:
        """Mask secrets in *raw_line*; return the scrubbed line and whether it changed."""
        scrubbed, hits = self.SECRET_VALUE_RE.subn(self._mask, raw_line)
        if hits:
            self.redactions += 1
        return scrubbed, hits > 0
133
-
134
-
135
def load_line_sanitizer(show_paths: bool) -> object:
    """Return a line sanitizer, preferring a shipped sanitizer script beside this file.

    The first of the known script names that exists in this script's directory is
    loaded as a module and its `LineSanitizer` is instantiated. If none exists the
    local `FallbackLineSanitizer` is used. A script that exists but cannot be loaded
    raises `RuntimeError` instead of silently falling back.
    """
    here = Path(__file__).resolve().parent
    shipped_names = ("sanitize_output.py", "context-guard-sanitize-output", "claude-sanitize-output")
    found = next((here / name for name in shipped_names if (here / name).exists()), None)
    if found is None:
        return FallbackLineSanitizer(show_paths=show_paths)
    try:
        # SourceFileLoader is used directly because some candidates have no .py suffix.
        module_name = f"_context_guard_compress_sanitize_{os.getpid()}"
        loader = importlib.machinery.SourceFileLoader(module_name, str(found))
        spec = importlib.util.spec_from_loader(loader.name, loader)
        if spec is None:
            raise RuntimeError("import spec unavailable")
        module = importlib.util.module_from_spec(spec)
        loader.exec_module(module)
        return module.LineSanitizer(show_paths=show_paths)
    except Exception as exc:
        raise RuntimeError(f"could not load sanitizer {found}: {exc}") from exc
157
-
158
-
159
def sanitize_text(text: str, *, show_paths: bool = False) -> tuple[str, int]:
    """Run every line of *text* through the sanitizer.

    Returns the re-joined sanitized text and the number of lines the sanitizer
    reported as redacted. Line endings are kept on each line while sanitizing.
    """
    scrubber = load_line_sanitizer(show_paths)
    pieces: list[str] = []
    flagged = 0
    for raw in text.splitlines(keepends=True):
        clean, changed = scrubber.sanitize(raw)  # type: ignore[attr-defined]
        pieces.append(clean)
        flagged += 1 if changed else 0
    return "".join(pieces), flagged
170
-
171
-
172
def read_bounded_stdin(max_bytes: int) -> tuple[str, bool, int]:
    """Read at most *max_bytes* bytes from stdin and decode them as UTF-8.

    Returns:
        A `(text, truncated, bytes_read)` tuple. `truncated` is True when stdin
        held more than *max_bytes* bytes; `bytes_read` is the number of bytes
        kept after truncation.

    Invalid byte sequences are decoded to U+FFFD. When the input is truncated,
    a multi-byte character cut in half by the byte limit is dropped instead of
    being turned into a spurious U+FFFD at the end of the text.
    """
    import codecs  # local import keeps this fix independent of the file's import block

    # Read one extra byte so "exactly max_bytes" can be told apart from "more".
    data = sys.stdin.buffer.read(max_bytes + 1)
    truncated = len(data) > max_bytes
    if truncated:
        data = data[:max_bytes]
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    # final=False makes the decoder hold back an incomplete trailing sequence,
    # which only happens legitimately when the byte limit cut a character.
    text = decoder.decode(data, final=not truncated)
    return text, truncated, len(data)
179
-
180
-
181
def line_count(text: str) -> int:
    """Return the number of `\\n`-delimited lines; a final newline adds no extra line."""
    if text == "":
        return 0
    newlines = text.count("\n")
    # A last line without a terminating newline still counts as a line.
    return newlines if text[-1] == "\n" else newlines + 1
186
-
187
-
188
def byte_length(text: str) -> int:
    """Return the size of *text* in UTF-8 bytes, substituting unencodable characters."""
    encoded = text.encode("utf-8", "replace")
    return len(encoded)
191
-
192
-
193
def token_proxy(text: str) -> int:
    """Estimate tokens as characters / TOKEN_PROXY_CHARS_PER_TOKEN (never below 1 for non-empty text).

    This is a heuristic estimate, reported as 'estimated' in the metadata.
    """
    if text == "":
        return 0
    estimate = round(len(text) / TOKEN_PROXY_CHARS_PER_TOKEN)
    return estimate if estimate > 1 else 1
198
-
199
-
200
def classify_content(text: str) -> str:
    """Guess which of CONTENT_TYPES best describes *text*.

    JSON is tested on the whole stripped text. The remaining detectors look at the
    first 200 lines only and are tried in a fixed priority order (diff, search, log,
    code). Anything unrecognized, including blank input, is reported as "prose".
    """
    body = text.strip()
    if not body:
        return "prose"
    if _looks_like_json(body):
        return "json"
    head = body.splitlines()[:200]
    # Order matters: the first detector that accepts the sample wins.
    ordered_detectors = (
        ("diff", _looks_like_diff),
        ("search", _looks_like_search),
        ("log", _looks_like_log),
        ("code", _looks_like_code),
    )
    for label, detector in ordered_detectors:
        if detector(head):
            return label
    return "prose"
223
-
224
-
225
def protected_zone_counts(text: str) -> dict[str, int]:
    """Count semantic-sensitive zones in *text* without recording what matched.

    The counts intentionally over-approximate: they are policy signals for later
    transform gates, not the output of a parser. Only counts are returned, never
    the matched path, identifier, hash, or string contents, so the result can be
    placed in shareable receipts.

    Returns:
        A dict keyed by the entries of PROTECTED_ZONE_KEYS (in that order),
        containing only the zones whose count is greater than zero.
    """
    fence_markers = len(CODE_FENCE_RE.findall(text))
    diff_lines = 0
    for line in text.splitlines():
        is_structural = DIFF_FILE_HEADER_RE.match(line) or DIFF_HUNK_RE.match(line)
        # startswith() with a tuple is False for an empty line. The previous
        # `line[:1] in "+-"` test was True for "" (the empty string is a substring
        # of every string), so every blank line was counted as a diff change.
        is_change = line.startswith(("+", "-")) and not line.startswith(("+++", "---"))
        if is_structural or is_change:
            diff_lines += 1
    counts = {
        # Fences come in open/close pairs; an unpaired marker still counts as one zone.
        "code_fence": (fence_markers + 1) // 2,
        "diff": diff_lines,
        "identifier": len(IDENTIFIER_RE.findall(text)),
        "numeric_constant": len(NUMERIC_CONSTANT_RE.findall(text)),
        "hash": len(HASH_RE.findall(text)),
        "path": len(PATH_RE.findall(text)),
        "stack_frame": len(STACK_FRAME_RE.findall(text)),
        "quoted_string": len(QUOTED_STRING_RE.findall(text)),
        "json_key": len(JSON_KEY_RE.findall(text)),
    }
    return {key: counts[key] for key in PROTECTED_ZONE_KEYS if counts.get(key, 0) > 0}
253
-
254
-
255
def build_protected_policy(
    *,
    text: str,
    content_type: str,
    strategy_detail: dict[str, object],
    lossy: bool,
) -> dict[str, object]:
    """Describe which transforms are permitted for the protected zones found in *text*.

    The policy covers transform eligibility and whether exact retrieval of the
    sanitized input is expected. It makes no statement about provider-cache
    stability; cache ordering is handled by `context-guard-cost compile`.
    """
    zone_counts = protected_zone_counts(text)
    has_zones = len(zone_counts) > 0
    # Exact retrieval only matters when something protected exists and content was dropped.
    needs_retrieval = has_zones and bool(lossy)
    strategy_name = str(strategy_detail.get("strategy") or "unknown")
    if needs_retrieval:
        scope = "sanitized_full_input"
    else:
        scope = "compressed_output"
    strategy_section = {
        "name": strategy_name,
        "structural_only": True,
    }
    return {
        "enabled": True,
        "detected": has_zones,
        "content_type": content_type,
        "zone_counts": zone_counts,
        "semantic_compress": False,
        "allowed_transforms": list(PROTECTED_ALLOWED_TRANSFORMS),
        "denied_transforms": list(PROTECTED_DENIED_TRANSFORMS),
        "retrieval_required": needs_retrieval,
        "retrieval_scope": scope,
        "raw_spans_stored": False,
        "policy_note": "Protected zones permit structural transforms only; no semantic/paraphrase rewrites.",
        "strategy": strategy_section,
    }
289
-
290
-
291
def build_transform_policy(protected_policy: dict[str, object]) -> dict[str, object]:
    """Condense a protected-zone policy into a transform-eligibility summary.

    Only flags and transform names are emitted; no protected content is included.
    """
    mode = "structural_default"
    if protected_policy.get("detected"):
        mode = "protected"
    must_retrieve = bool(protected_policy.get("retrieval_required"))
    return {
        "mode": mode,
        "semantic_transforms_allowed": False,
        "semantic_compress": False,
        "allowed": list(PROTECTED_ALLOWED_TRANSFORMS),
        "denied": list(PROTECTED_DENIED_TRANSFORMS),
        "exact_retrieval_required": must_retrieve,
        "raw_spans_stored": False,
    }
302
-
303
-
304
- def _looks_like_json(stripped: str) -> bool:
305
- if stripped[0] not in "{[":
306
- return False
307
- try:
308
- json.loads(stripped)
309
- except (ValueError, RecursionError):
310
- return False
311
- return True
312
-
313
-
314
- def _ratio(matches: int, total: int, threshold: float) -> bool:
315
- return bool(total) and (matches / total) >= threshold
316
-
317
-
318
def _looks_like_diff(sample: list[str]) -> bool:
    """Return True when *sample* has diff structure.

    A sample qualifies when it contains at least one file/hunk header line and
    either at least one `+`/`-` change line or a second header line.
    """
    headers = sum(1 for line in sample if DIFF_FILE_HEADER_RE.match(line) or DIFF_HUNK_RE.match(line))
    # startswith() with a tuple is False for "". The previous `line[:1] in "+-"`
    # test was True for empty lines, so a single header-like line (for example
    # prose starting with "rename ") plus any blank line was classified as a diff.
    changes = sum(
        1
        for line in sample
        if line.startswith(("+", "-")) and not line.startswith(("+++", "---"))
    )
    return headers >= 1 and (changes >= 1 or headers >= 2)
322
-
323
-
324
def _looks_like_search(sample: list[str]) -> bool:
    """Return True when at least 60% of two or more lines look like `path:...` matches."""
    hits = len([line for line in sample if SEARCH_LINE_RE.match(line)])
    if not _ratio(hits, len(sample), 0.6):
        return False
    # A single line is too little evidence to call the input search output.
    return len(sample) >= 2
327
-
328
-
329
def _looks_like_log(sample: list[str]) -> bool:
    """Return True when at least 40% of lines start with a timestamp or contain a log level."""
    hits = 0
    for line in sample:
        if LOG_TIMESTAMP_RE.match(line) or LOG_LEVEL_RE.search(line):
            hits += 1
    return _ratio(hits, len(sample), 0.4)
332
-
333
-
334
def _looks_like_code(sample: list[str]) -> bool:
    """Return True when at least 25% of lines carry a source-code signal."""
    hits = 0
    for line in sample:
        if CODE_SIGNAL_RE.search(line):
            hits += 1
    return _ratio(hits, len(sample), 0.25)
337
-
338
-
339
def compress_json(text: str) -> tuple[str, dict[str, object]]:
    """Parse *text* as JSON and re-emit it with no insignificant whitespace.

    A trailing newline is kept only when the input ended with one. Input that does
    not parse is handed to the prose strategy, with `fallback_from` recorded.
    """
    try:
        document = json.loads(text)
    except (ValueError, RecursionError):
        # If parsing fails, fall back safely to the prose strategy so the
        # data-preserving promise of this strategy is not broken.
        fallback_body, fallback_detail = compress_prose(text)
        fallback_detail["fallback_from"] = "json"
        return fallback_body, fallback_detail
    minified = json.dumps(document, ensure_ascii=False, separators=(",", ":"))
    if text.endswith("\n"):
        minified += "\n"
    return minified, {"strategy": "json-compact", "lossy": False, "json_parse_ok": True}
354
-
355
-
356
def compress_diff(text: str) -> tuple[str, dict[str, object]]:
    """Keep diff headers, hunk headers, and `+`/`-` changes; fold context runs.

    Each run of consecutive context lines (lines starting with a space, or empty
    lines) is replaced by a single marker line stating how many were omitted.
    Any other line is kept verbatim.

    Returns:
        The compressed text and a detail dict with the strategy name, `lossy`
        (always True for this strategy), and the total number of context lines
        omitted.
    """
    out: list[str] = []
    context_run = 0
    collapsed = 0

    def flush() -> None:
        """Emit the omission marker for the pending context run, if any."""
        nonlocal context_run, collapsed
        if context_run:
            out.append(f"[context-guard-kit] {context_run} unchanged context line(s) omitted")
            collapsed += context_run
            context_run = 0

    for line in text.splitlines():
        is_structural = bool(DIFF_FILE_HEADER_RE.match(line) or DIFF_HUNK_RE.match(line))
        # startswith() with a tuple is False for "". The previous `line[:1] in "+-"`
        # test was True for empty lines, which made the empty-line context branch
        # below unreachable and let blank lines split context runs.
        is_change = line.startswith(("+", "-")) and not line.startswith(("+++", "---"))
        if is_structural or is_change:
            flush()
            out.append(line)
        elif line == "" or line.startswith(" "):
            # Context line (an empty line is a context line whose space was stripped).
            context_run += 1
        else:
            # Unrecognized line (e.g. "\ No newline at end of file"): keep it.
            flush()
            out.append(line)
    flush()
    return _join_lines(out, text), {"strategy": "diff-keep-changes", "lossy": True, "context_lines_omitted": collapsed}
382
-
383
-
384
def compress_log(text: str) -> tuple[str, dict[str, object]]:
    """Fold each run of identical consecutive lines into one `line (xN)` entry.

    Lines that are not repeated are emitted unchanged. The detail dict reports how
    many lines were removed by folding.
    """
    lines = text.splitlines()
    emitted: list[str] = []
    folded = 0
    start = 0
    while start < len(lines):
        # Advance `end` past every line equal to the one at `start`.
        end = start + 1
        while end < len(lines) and lines[end] == lines[start]:
            end += 1
        repeat = end - start
        if repeat > 1:
            emitted.append(f"{lines[start]} (x{repeat})")
            folded += repeat - 1
        else:
            emitted.append(lines[start])
        start = end
    return _join_lines(emitted, text), {"strategy": "log-collapse-repeats", "lossy": True, "lines_collapsed": folded}
410
-
411
-
412
def compress_search(text: str) -> tuple[str, dict[str, object]]:
    """Remove repeated match lines, keeping the first occurrence of each in order.

    Lines are compared with trailing whitespace ignored; the kept line itself is
    emitted as it appeared. `lossy` is True only when something was dropped.
    """
    source_lines = text.splitlines()
    kept: list[str] = []
    seen_keys: set[str] = set()
    for line in source_lines:
        key = line.rstrip()
        if key not in seen_keys:
            seen_keys.add(key)
            kept.append(line)
    dropped = len(source_lines) - len(kept)
    return _join_lines(kept, text), {"strategy": "search-dedupe", "lossy": dropped > 0, "duplicate_lines_dropped": dropped}
425
-
426
-
427
def compress_code(text: str) -> tuple[str, dict[str, object]]:
    """Trim trailing whitespace and collapse runs of 2+ blank lines to a single blank.

    Delegates to `_whitespace_normalize` with at most one consecutive blank line kept.
    """
    return _whitespace_normalize(text, strategy="code-whitespace", max_consecutive_blank=1)
430
-
431
-
432
def compress_prose(text: str) -> tuple[str, dict[str, object]]:
    """Strip trailing whitespace and reduce runs of blank lines to one blank line."""
    options = {"strategy": "prose-whitespace", "max_consecutive_blank": 1}
    return _whitespace_normalize(text, **options)
435
-
436
-
437
def _whitespace_normalize(text: str, *, strategy: str, max_consecutive_blank: int) -> tuple[str, dict[str, object]]:
    """Right-strip every line and cap runs of blank lines at *max_consecutive_blank*.

    A line that is empty after stripping counts as blank. `lossy` is True when any
    blank line was dropped or any line lost trailing whitespace.
    """
    kept: list[str] = []
    consecutive_blank = 0
    dropped_blank = 0
    saw_trailing_whitespace = False
    for raw in text.splitlines():
        stripped = raw.rstrip()
        if stripped != raw:
            saw_trailing_whitespace = True
        if stripped:
            consecutive_blank = 0
            kept.append(stripped)
            continue
        consecutive_blank += 1
        if consecutive_blank > max_consecutive_blank:
            dropped_blank += 1
        else:
            kept.append(stripped)
    lossy = dropped_blank > 0 or saw_trailing_whitespace
    return _join_lines(kept, text), {"strategy": strategy, "lossy": lossy, "blank_lines_collapsed": dropped_blank}
453
-
454
-
455
- def _join_lines(lines: list[str], original: str) -> str:
456
- """Join compressed lines, restoring a trailing newline only if the input had one."""
457
- body = "\n".join(lines)
458
- if original.endswith("\n") and body and not body.endswith("\n"):
459
- body += "\n"
460
- return body
461
-
462
-
463
# Dispatch table from content type to its compression function. The keys are the
# same six names listed in CONTENT_TYPES. Each function takes the sanitized text
# and returns (compressed_text, strategy_detail).
STRATEGIES: dict[str, Callable[[str], tuple[str, dict[str, object]]]] = {
    "json": compress_json,
    "diff": compress_diff,
    "log": compress_log,
    "search": compress_search,
    "code": compress_code,
    "prose": compress_prose,
}
471
-
472
-
473
def build_metadata(
    *,
    content_type: str,
    type_source: str,
    strategy_detail: dict[str, object],
    original_text: str,
    compressed_text: str,
    redacted_lines: int,
    input_truncated: bool,
    input_bytes: int,
    max_bytes: int,
    protected_policy_enabled: bool = False,
) -> dict[str, object]:
    """Build the compression receipt for one run.

    Byte and line figures are measured from the two texts; the token figures are
    an estimate and labeled as such. `redacted_lines` is supplied by the caller, so
    redaction has already happened by the time the receipt is assembled. When
    `strategy_detail` reports a lossy strategy (or omits the flag), the retrieval
    hint points at `context-guard-artifact store` for exact recovery. With
    *protected_policy_enabled*, protected-zone and transform policy sections are
    added and may replace the retrieval hint.
    """
    size_before = byte_length(original_text)
    size_after = byte_length(compressed_text)
    if size_before:
        ratio = round(size_after / size_before, 4)
    else:
        # Avoid dividing by zero for empty input; report "no change".
        ratio = 1.0
    # A missing flag is treated as lossy so the receipt never over-promises.
    lossy = bool(strategy_detail.get("lossy", True))
    if lossy:
        retrieval_hint = (
            "Lossy: store the full sanitized text for exact recovery via "
            "`context-guard-artifact store` and query slices later."
        )
    else:
        retrieval_hint = "Data-preserving: compact form is semantically equivalent to the sanitized input."

    input_section = {
        "bytes_read": input_bytes,
        "truncated": input_truncated,
        "max_bytes": max_bytes,
    }
    redaction_section = {
        "redacted_lines": redacted_lines,
        "redacted_before_receipt": True,
    }
    bytes_section = {
        "measurement": "observed",
        "original": size_before,
        "compressed": size_after,
        "saved": size_before - size_after,
        "compression_ratio": ratio,
    }
    lines_section = {
        "measurement": "observed",
        "original": line_count(original_text),
        "compressed": line_count(compressed_text),
    }
    token_section = {
        "measurement": "estimated",
        "method": f"chars_div_{TOKEN_PROXY_CHARS_PER_TOKEN}",
        "original": token_proxy(original_text),
        "compressed": token_proxy(compressed_text),
    }
    metadata: dict[str, object] = {
        "tool": "context-guard-kit.context_compress",
        "metadata_version": 1,
        "content_type": content_type,
        "type_source": type_source,
        "strategy": strategy_detail.get("strategy"),
        "strategy_detail": strategy_detail,
        "lossy": lossy,
        "input": input_section,
        "redaction": redaction_section,
        "bytes": bytes_section,
        "lines": lines_section,
        "token_proxy": token_section,
        "retrieval_hint": retrieval_hint,
    }
    if not protected_policy_enabled:
        return metadata

    policy = build_protected_policy(
        text=original_text,
        content_type=content_type,
        strategy_detail=strategy_detail,
        lossy=lossy,
    )
    metadata["protected_zone_policy"] = policy
    metadata["transform_policy"] = build_transform_policy(policy)
    if policy.get("retrieval_required"):
        metadata["retrieval_hint"] = (
            "Protected lossy structural transform: store the full sanitized text with "
            "`context-guard-artifact store` and retrieve exact slices before relying on omitted content."
        )
    return metadata
554
-
555
-
556
def compress_text(
    text: str,
    *,
    forced_type: str | None,
    show_paths: bool,
    input_truncated: bool,
    input_bytes: int,
    max_bytes: int,
    protected_policy_enabled: bool = False,
) -> tuple[str, dict[str, object]]:
    """Sanitize *text*, pick a content type, compress, and return (output, receipt).

    Sanitization is the first step, so classification, compression, and the
    receipt only ever see the sanitized text. *forced_type* skips detection.
    """
    sanitized, redacted_lines = sanitize_text(text, show_paths=show_paths)
    if forced_type is None:
        content_type, type_source = classify_content(sanitized), "detected"
    else:
        content_type, type_source = forced_type, "override"
    strategy = STRATEGIES[content_type]
    compressed, strategy_detail = strategy(sanitized)
    # Conservativeness guarantee: no strategy may emit more than it was given. On
    # small inputs a fold marker can be longer than what it replaces; in that case
    # (or when nothing was saved) the sanitized original is kept as-is.
    shrank = byte_length(compressed) < byte_length(sanitized)
    if not shrank:
        compressed = sanitized
    strategy_detail["reduced"] = shrank
    metadata = build_metadata(
        content_type=content_type,
        type_source=type_source,
        strategy_detail=strategy_detail,
        original_text=sanitized,
        compressed_text=compressed,
        redacted_lines=redacted_lines,
        input_truncated=input_truncated,
        input_bytes=input_bytes,
        max_bytes=max_bytes,
        protected_policy_enabled=protected_policy_enabled,
    )
    return compressed, metadata
597
-
598
-
599
def render_text_receipt(metadata: dict[str, object]) -> str:
    """Format the receipt as a short multi-line summary for humans.

    The bytes and token lines are included whenever the corresponding section is a
    dict (missing values print as None). The redaction line appears only when the
    redacted-line count is truthy.
    """
    report = [
        "[context-guard-kit] compress",
        f"- content_type: {metadata.get('content_type')} ({metadata.get('type_source')})",
        f"- strategy: {metadata.get('strategy')} (lossy={str(metadata.get('lossy')).lower()})",
    ]
    sizes = metadata.get("bytes", {})
    if isinstance(sizes, dict):
        report.append(
            f"- bytes: {sizes.get('original')} -> {sizes.get('compressed')} "
            f"(ratio={sizes.get('compression_ratio')})"
        )
    tokens = metadata.get("token_proxy", {})
    if isinstance(tokens, dict):
        report.append(
            f"- token_proxy(estimated): {tokens.get('original')} -> {tokens.get('compressed')}"
        )
    redaction = metadata.get("redaction", {})
    redacted = redaction.get("redacted_lines") if isinstance(redaction, dict) else None
    if redacted:
        report.append(f"- redacted_lines: {redacted}")
    return "".join(f"{entry}\n" for entry in report)
621
-
622
-
623
def run_compress(args: argparse.Namespace) -> int:
    """Run one compression pass over stdin and write the result.

    Output modes, in priority order: `--json` prints metadata plus content as
    JSON; `--metadata-only` prints the metadata JSON; otherwise the compressed
    text goes to stdout and, unless `--quiet`, the text receipt goes to stderr.
    Returns 0 on success and 2 for an unknown `--type`.
    """
    max_bytes = bounded_int(args.max_bytes, DEFAULT_MAX_BYTES, 1, MAX_MAX_BYTES)
    raw_text, input_truncated, input_bytes = read_bounded_stdin(max_bytes)
    forced_type = args.type
    if forced_type is not None and forced_type not in STRATEGIES:
        print(f"context-guard-compress: unknown --type: {forced_type}", file=sys.stderr)
        return 2
    compressed, metadata = compress_text(
        raw_text,
        forced_type=forced_type,
        show_paths=args.show_paths,
        input_truncated=input_truncated,
        input_bytes=input_bytes,
        max_bytes=max_bytes,
        protected_policy_enabled=bool(args.protected_policy),
    )
    if args.json:
        envelope = {"metadata": metadata, "content": compressed}
        print(json.dumps(envelope, ensure_ascii=False, indent=2, sort_keys=True))
        return 0
    if args.metadata_only:
        print(json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True))
        return 0
    sys.stdout.write(compressed)
    if not args.quiet:
        sys.stderr.write(render_text_receipt(metadata))
    return 0
650
-
651
-
652
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser; `func` defaults to `run_compress`."""
    parser = argparse.ArgumentParser(
        description="Classify and conservatively compress stdin (sanitized) for token-budget reuse.",
    )
    parser.add_argument(
        "--type",
        choices=CONTENT_TYPES,
        default=None,
        help="force a content type instead of auto-detecting (json/diff/log/search/code/prose)",
    )
    # Boolean switches, registered in the order they appear in --help.
    switches = (
        ("--json", "emit JSON with metadata and compressed content"),
        (
            "--protected-policy",
            "add opt-in protected-zone transform policy metadata to --json/--metadata-only receipts; default content is unchanged",
        ),
        ("--metadata-only", "emit only the JSON metadata receipt (no compressed body)"),
        ("--quiet", "suppress the text receipt on stderr in text mode"),
        (
            "--show-paths",
            "show raw absolute paths instead of path hashes; local debugging only because private paths may be exposed",
        ),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)
    parser.add_argument("--max-bytes", type=int, default=DEFAULT_MAX_BYTES, help="maximum stdin bytes to read before truncating")
    parser.set_defaults(func=run_compress)
    return parser
682
-
683
-
684
def main() -> int:
    """Parse the command line, run the selected command, and return its exit code.

    A `RuntimeError` from the command is reported on stderr and mapped to exit code 1.
    """
    args = build_parser().parse_args()
    try:
        exit_code = int(args.func(args))
    except RuntimeError as exc:
        print(f"context-guard-compress: {exc}", file=sys.stderr)
        return 1
    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())