@ictechgy/context-guard 0.4.9 → 0.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/README.ko.md +41 -24
- package/README.md +66 -26
- package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
- package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
- package/docs/distribution.md +10 -7
- package/docs/experimental-benchmark-fixtures.md +8 -1
- package/package.json +3 -6
- package/packaging/homebrew/context-guard.rb.template +1 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +9 -6
- package/plugins/context-guard/README.md +21 -13
- package/plugins/context-guard/bin/context-guard +113 -26
- package/plugins/context-guard/bin/context-guard-artifact +542 -46
- package/plugins/context-guard/bin/context-guard-cache-score +380 -0
- package/plugins/context-guard/bin/context-guard-compress +146 -1
- package/plugins/context-guard/bin/context-guard-cost +783 -4
- package/plugins/context-guard/bin/context-guard-experiments +99 -18
- package/plugins/context-guard/bin/context-guard-failed-nudge +3 -0
- package/plugins/context-guard/bin/context-guard-filter +163 -7
- package/plugins/context-guard/bin/context-guard-guard-read +3 -0
- package/plugins/context-guard/bin/context-guard-pack +602 -43
- package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
- package/plugins/context-guard/bin/context-guard-setup +165 -31
- package/plugins/context-guard/bin/context-guard-statusline +490 -283
- package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
- package/plugins/context-guard/bin/context-guard-tool-prune +241 -1
- package/plugins/context-guard/lib/context_guard_commands.py +206 -0
- package/plugins/context-guard/skills/setup/SKILL.md +1 -0
- package/context-guard-kit/README.md +0 -91
- package/context-guard-kit/benchmark_runner.py +0 -2401
- package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
- package/context-guard-kit/context_compress.py +0 -695
- package/context-guard-kit/context_escrow.py +0 -935
- package/context-guard-kit/context_filter.py +0 -637
- package/context-guard-kit/context_guard_cli.py +0 -325
- package/context-guard-kit/context_guard_diet.py +0 -1711
- package/context-guard-kit/context_pack.py +0 -2713
- package/context-guard-kit/cost_guard.py +0 -2349
- package/context-guard-kit/experimental_registry.py +0 -4348
- package/context-guard-kit/failed_attempt_nudge.py +0 -567
- package/context-guard-kit/guard_large_read.py +0 -690
- package/context-guard-kit/hook_secret_patterns.py +0 -43
- package/context-guard-kit/read_symbol.py +0 -483
- package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
- package/context-guard-kit/sanitize_output.py +0 -725
- package/context-guard-kit/settings.example.json +0 -67
- package/context-guard-kit/setup_wizard.py +0 -2515
- package/context-guard-kit/statusline.sh +0 -362
- package/context-guard-kit/statusline_merged.sh +0 -157
- package/context-guard-kit/tool_schema_pruner.py +0 -837
- package/context-guard-kit/trim_command_output.py +0 -1449
|
@@ -1,695 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Classify stdin content and emit a sanitized, token-budget-friendly compression.
|
|
3
|
-
|
|
4
|
-
The CLI never promises lossless *semantic* compression. It performs conservative,
|
|
5
|
-
deterministic, content-type-aware shrinking (compact JSON, diff change-only views,
|
|
6
|
-
log/search de-duplication, whitespace normalization) so large local output costs
|
|
7
|
-
fewer tokens to keep in context. Secrets are redacted *before* the receipt is built,
|
|
8
|
-
so no secret ever reaches the compressed body or the metadata.
|
|
9
|
-
|
|
10
|
-
For exact byte-for-byte recovery the receipt points at `context-guard-artifact store`,
|
|
11
|
-
which keeps the full sanitized content as a queryable local artifact.
|
|
12
|
-
"""
|
|
13
|
-
from __future__ import annotations
|
|
14
|
-
|
|
15
|
-
import argparse
|
|
16
|
-
import importlib.machinery
|
|
17
|
-
import importlib.util
|
|
18
|
-
import json
|
|
19
|
-
import os
|
|
20
|
-
from pathlib import Path
|
|
21
|
-
import re
|
|
22
|
-
import sys
|
|
23
|
-
from typing import Callable
|
|
24
|
-
|
|
25
|
-
# Default and hard upper bound for the number of stdin bytes that may be read.
DEFAULT_MAX_BYTES = 10_000_000
MAX_MAX_BYTES = 100_000_000
# The token estimate is only a conservative proxy (not an observed value). It uses an
# average ~4 chars/token heuristic, and the metadata labels it measurement="estimated"
# so it is not confused with an observed token count.
TOKEN_PROXY_CHARS_PER_TOKEN = 4
CONTENT_TYPES = ("json", "diff", "log", "search", "code", "prose")

# Identifies diff structural lines (file header / hunk / change). The remaining
# context lines are folded to shrink the output.
DIFF_FILE_HEADER_RE = re.compile(r"^(diff --git |index [0-9a-f]|--- |\+\+\+ |rename |similarity |new file|deleted file)")
DIFF_HUNK_RE = re.compile(r"^@@ .* @@")
# search (grep/ripgrep) lines: `path:line:content` or `path:content`.
# Whitespace is disallowed in the path token before the colon so that timestamped
# logs ("2026-01-01 00:00:00 ...") are not misclassified as search output (a log
# timestamp contains a space before its first colon).
SEARCH_LINE_RE = re.compile(r"^[^\s:][^:\n\s]*:(?:\d+:)?.")
# log signals: a leading timestamp or a log-level token.
LOG_LEVEL_RE = re.compile(r"\b(TRACE|DEBUG|INFO|NOTICE|WARN|WARNING|ERROR|FATAL|CRITICAL)\b")
LOG_TIMESTAMP_RE = re.compile(r"^\s*(?:\[)?\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}|^\s*\d{2}:\d{2}:\d{2}\b")
# code signals: common source keywords/punctuation. The diff check runs first so the
# two classifications do not overlap.
CODE_SIGNAL_RE = re.compile(
    r"(^\s*(def |class |function |func |import |from \S+ import |public |private |const |let |var |#include|package )"
    r"|[{};]\s*$|=>|::)"
)
# Markdown-style code fence marker at the start of a line (multiline mode).
CODE_FENCE_RE = re.compile(r"(?m)^\s*```")
# A double-quoted string immediately followed by a colon, i.e. a JSON-style key.
JSON_KEY_RE = re.compile(r'"(?:[^"\\]|\\.)*"\s*:')
# Single- or double-quoted string literal with backslash escapes (verbose mode).
QUOTED_STRING_RE = re.compile(r"""(?x)
"(?:[^"\\]|\\.)*" |
'(?:[^'\\]|\\.)*'
""")
# 32+ hex digits, optionally prefixed with "sha256:".
HASH_RE = re.compile(r"\b(?:[0-9a-fA-F]{32,}|sha256:[0-9a-fA-F]{32,})\b")
# Three alternatives: a slash-rooted path, a drive-letter backslash path, or a
# `name#path:<12 hex>` token.
PATH_RE = re.compile(
    r"(?x)(?:"
    r"(?<![\w.-])/(?:[A-Za-z0-9._@%+=:-]+/)*[A-Za-z0-9._@%+=:-]+"
    r"|"
    r"\b[A-Za-z]:\\(?:[^\\\s:\"'<>|]+\\)*[^\\\s:\"'<>|]+"
    r"|"
    r"\b[A-Za-z0-9._-]+\#path:[0-9a-f]{12}\b"
    r")"
)
# Stack-frame lines of the form `File "...", line N, in name` or `at name (...:N[:M])`.
STACK_FRAME_RE = re.compile(
    r"(?m)^\s*(?:File\s+\"[^\"]+\",\s+line\s+\d+,\s+in\s+\S+|at\s+\S+.*\([^)]*:\d+(?::\d+)?\))"
)
IDENTIFIER_RE = re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*(?:[A-Z][A-Za-z0-9_]*)?\b")
# Standalone decimal or 0x-prefixed hex number, not adjacent to word characters or dots.
NUMERIC_CONSTANT_RE = re.compile(r"(?<![\w.])[-+]?(?:0x[0-9A-Fa-f]+|\d+(?:\.\d+)?)(?![\w.])")
# Keys reported in the protected-zone count mapping, in output order.
PROTECTED_ZONE_KEYS = (
    "code_fence",
    "diff",
    "identifier",
    "numeric_constant",
    "hash",
    "path",
    "stack_frame",
    "quoted_string",
    "json_key",
)
# Transform names listed as allowed in the protected-zone policy metadata.
PROTECTED_ALLOWED_TRANSFORMS = (
    "exact_dedupe",
    "structural_window",
    "line_truncate",
    "whitespace_normalize",
    "json_compact",
    "artifact_retrieval",
)
# Transform names listed as denied in the protected-zone policy metadata.
PROTECTED_DENIED_TRANSFORMS = (
    "semantic_compress",
    "paraphrase",
    "identifier_rewrite",
    "numeric_rewrite",
    "hash_rewrite",
    "path_rewrite",
    "quoted_literal_rewrite",
)
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
    """Coerce ``value`` to an int and clamp it into ``[minimum, maximum]``.

    ``default`` is returned unchanged (it is not clamped) whenever ``int(value)``
    raises TypeError, ValueError, or OverflowError.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        return default
    # Apply the lower bound first, then the upper bound.
    floored = parsed if parsed >= minimum else minimum
    return floored if floored <= maximum else maximum
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
class FallbackLineSanitizer:
    """Bare-bones secret redactor used when no shared sanitizer script is available.

    ``redactions`` counts how many sanitized lines had at least one match.
    """

    SECRET_VALUE_RE = re.compile(
        r"(?i)(Bearer\s+\S+|Basic\s+\S+|gh[pousr]_[A-Za-z0-9_]{20,}|"
        r"github_pat_[A-Za-z0-9_]{20,}|xox[abprs]-[A-Za-z0-9-]{10,}|"
        r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
        r"AIza[0-9A-Za-z_\-]{20,}|"
        r"([A-Za-z0-9_.-]*(?:api[_-]?key|token|secret|password|passwd|pwd)[A-Za-z0-9_.-]*\s*[:=]\s*)\S+)"
    )

    def __init__(self, *, show_paths: bool = False) -> None:
        self.show_paths = show_paths
        self.redactions = 0

    @staticmethod
    def _mask(match: "re.Match[str]") -> str:
        """Replace one match, keeping the `key=` / `key:` prefix when it was captured."""
        captured = match.groups()
        key_prefix = captured[1] if len(captured) >= 2 else None
        if key_prefix:
            return key_prefix + "[REDACTED]"
        return "[REDACTED]"

    def sanitize(self, raw_line: str) -> tuple[str, bool]:
        """Return ``(redacted_line, was_redacted)`` for a single line."""
        cleaned, hits = self.SECRET_VALUE_RE.subn(self._mask, raw_line)
        was_redacted = bool(hits)
        if was_redacted:
            # Counted once per line, regardless of how many secrets it held.
            self.redactions += 1
        return cleaned, was_redacted
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
def load_line_sanitizer(show_paths: bool) -> object:
    """Return a line sanitizer, preferring a sanitizer script that sits beside this file.

    The directory containing this script is probed, in order, for
    ``sanitize_output.py``, ``context-guard-sanitize-output`` and
    ``claude-sanitize-output``. The first one that exists is loaded as a module
    and its ``LineSanitizer`` is instantiated. If none exists, a
    ``FallbackLineSanitizer`` is returned instead.

    Raises:
        RuntimeError: the first existing candidate could not be loaded or
            instantiated (later candidates are not tried).
    """
    here = Path(__file__).resolve().parent
    names = ("sanitize_output.py", "context-guard-sanitize-output", "claude-sanitize-output")
    candidate = next((path for path in (here / name for name in names) if path.exists()), None)
    if candidate is None:
        return FallbackLineSanitizer(show_paths=show_paths)
    try:
        # The module name embeds the PID; the module is executed directly and not
        # registered by this function.
        module_name = f"_context_guard_compress_sanitize_{os.getpid()}"
        loader = importlib.machinery.SourceFileLoader(module_name, str(candidate))
        spec = importlib.util.spec_from_loader(loader.name, loader)
        if spec is None:
            raise RuntimeError("import spec unavailable")
        module = importlib.util.module_from_spec(spec)
        loader.exec_module(module)
        return module.LineSanitizer(show_paths=show_paths)
    except Exception as exc:
        raise RuntimeError(f"could not load sanitizer {candidate}: {exc}") from exc
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def sanitize_text(text: str, *, show_paths: bool = False) -> tuple[str, int]:
    """Run the line sanitizer over ``text`` and report how many lines it changed.

    Lines are split with their terminators kept, so re-joining the sanitized
    pieces preserves the original line endings.

    Returns:
        ``(sanitized_text, redacted_line_count)``.
    """
    sanitizer = load_line_sanitizer(show_paths)
    pieces: list[str] = []
    flagged = 0
    for chunk in text.splitlines(keepends=True):
        clean, hit = sanitizer.sanitize(chunk)  # type: ignore[attr-defined]
        pieces.append(clean)
        flagged += 1 if hit else 0
    return "".join(pieces), flagged
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
def read_bounded_stdin(max_bytes: int) -> tuple[str, bool, int]:
    """Read stdin as bytes, keeping at most ``max_bytes`` of it.

    One extra byte is requested so an over-long input can be told apart from
    one that is exactly ``max_bytes`` long.

    Returns:
        ``(decoded_text, truncated, kept_byte_count)``; decoding is UTF-8 with
        undecodable sequences replaced.
    """
    raw = sys.stdin.buffer.read(max_bytes + 1)
    overflowed = len(raw) > max_bytes
    kept = raw[:max_bytes] if overflowed else raw
    return kept.decode("utf-8", errors="replace"), overflowed, len(kept)
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
def line_count(text: str) -> int:
    """Return the number of ``\\n``-delimited lines in ``text``.

    A final line without a terminating newline still counts as a line, and
    empty text has zero lines.
    """
    if not text:
        return 0
    newlines = text.count("\n")
    if text.endswith("\n"):
        return newlines
    return newlines + 1
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
def byte_length(text: str) -> int:
    """Return the size of ``text`` in bytes once encoded as UTF-8.

    Code points that cannot be encoded are replaced instead of raising.
    """
    encoded = text.encode("utf-8", errors="replace")
    return len(encoded)
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
def token_proxy(text: str) -> int:
    """Estimate a token count as ``len(text) / TOKEN_PROXY_CHARS_PER_TOKEN``, rounded.

    Empty text yields 0; any non-empty text yields at least 1. This is a
    heuristic estimate, not a measured token count.
    """
    if not text:
        return 0
    estimate = round(len(text) / TOKEN_PROXY_CHARS_PER_TOKEN)
    return estimate if estimate > 1 else 1
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def classify_content(text: str) -> str:
    """Guess which of CONTENT_TYPES best describes ``text``.

    The whole stripped text is tried as JSON first. Otherwise the first 200
    lines are run through the diff, search, log and code detectors in that
    order and the first match wins. Blank or unrecognized text is "prose".
    """
    body = text.strip()
    if not body:
        return "prose"
    if _looks_like_json(body):
        return "json"
    head = body.splitlines()[:200]
    # Detector order is significant: earlier entries take precedence.
    detectors = (
        ("diff", _looks_like_diff),
        ("search", _looks_like_search),
        ("log", _looks_like_log),
        ("code", _looks_like_code),
    )
    for label, detect in detectors:
        if detect(head):
            return label
    return "prose"
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
def protected_zone_counts(text: str) -> dict[str, int]:
    """Count protected-zone signals in ``text`` without keeping any matched content.

    The counts are regex-based approximations used as policy signals, not the
    output of a parser. Only integer counts are returned; the matched paths,
    identifiers, hashes and string contents are never stored.

    Returns:
        Mapping of zone key to match count, limited to keys with a non-zero
        count and ordered as in PROTECTED_ZONE_KEYS.
    """

    def is_diff_line(line: str) -> bool:
        """Return True for diff file headers, hunk headers and +/- change lines."""
        if DIFF_FILE_HEADER_RE.match(line) or DIFF_HUNK_RE.match(line):
            return True
        # Use startswith() rather than a substring test such as `line[:1] in "+-"`:
        # the empty string is a substring of every string, so a substring test
        # would count every blank line as a diff change.
        return line.startswith(("+", "-")) and not line.startswith(("+++", "---"))

    fence_markers = len(CODE_FENCE_RE.findall(text))
    counts = {
        # Fence markers come in open/close pairs; an unpaired marker rounds up.
        "code_fence": (fence_markers + 1) // 2,
        "diff": sum(1 for line in text.splitlines() if is_diff_line(line)),
        "identifier": len(IDENTIFIER_RE.findall(text)),
        "numeric_constant": len(NUMERIC_CONSTANT_RE.findall(text)),
        "hash": len(HASH_RE.findall(text)),
        "path": len(PATH_RE.findall(text)),
        "stack_frame": len(STACK_FRAME_RE.findall(text)),
        "quoted_string": len(QUOTED_STRING_RE.findall(text)),
        "json_key": len(JSON_KEY_RE.findall(text)),
    }
    return {key: counts[key] for key in PROTECTED_ZONE_KEYS if counts.get(key, 0) > 0}
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
def build_protected_policy(
    *,
    text: str,
    content_type: str,
    strategy_detail: dict[str, object],
    lossy: bool,
) -> dict[str, object]:
    """Assemble the opt-in protected-zone policy section of the receipt.

    Exact retrieval is flagged as required only when at least one protected
    zone was detected in ``text`` and the applied strategy was lossy. The
    policy describes transform eligibility; it carries counts only, never the
    matched spans.
    """
    zones = protected_zone_counts(text)
    has_zones = bool(zones)
    needs_retrieval = has_zones and bool(lossy)
    strategy_name = str(strategy_detail.get("strategy") or "unknown")
    scope = "sanitized_full_input" if needs_retrieval else "compressed_output"
    policy: dict[str, object] = {
        "enabled": True,
        "detected": has_zones,
        "content_type": content_type,
        "zone_counts": zones,
        "semantic_compress": False,
        "allowed_transforms": list(PROTECTED_ALLOWED_TRANSFORMS),
        "denied_transforms": list(PROTECTED_DENIED_TRANSFORMS),
        "retrieval_required": needs_retrieval,
        "retrieval_scope": scope,
        "raw_spans_stored": False,
        "policy_note": "Protected zones permit structural transforms only; no semantic/paraphrase rewrites.",
        "strategy": {
            "name": strategy_name,
            "structural_only": True,
        },
    }
    return policy
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
def build_transform_policy(protected_policy: dict[str, object]) -> dict[str, object]:
    """Condense a protected-zone policy into a flat transform-eligibility summary.

    The mode is "protected" when the policy reports detected zones and
    "structural_default" otherwise. No protected content is embedded.
    """
    zones_detected = protected_policy.get("detected")
    mode = "protected" if zones_detected else "structural_default"
    retrieval_needed = bool(protected_policy.get("retrieval_required"))
    summary: dict[str, object] = {
        "mode": mode,
        "semantic_transforms_allowed": False,
        "semantic_compress": False,
        "allowed": list(PROTECTED_ALLOWED_TRANSFORMS),
        "denied": list(PROTECTED_DENIED_TRANSFORMS),
        "exact_retrieval_required": retrieval_needed,
        "raw_spans_stored": False,
    }
    return summary
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
def _looks_like_json(stripped: str) -> bool:
|
|
305
|
-
if stripped[0] not in "{[":
|
|
306
|
-
return False
|
|
307
|
-
try:
|
|
308
|
-
json.loads(stripped)
|
|
309
|
-
except (ValueError, RecursionError):
|
|
310
|
-
return False
|
|
311
|
-
return True
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
def _ratio(matches: int, total: int, threshold: float) -> bool:
|
|
315
|
-
return bool(total) and (matches / total) >= threshold
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
def _looks_like_diff(sample: list[str]) -> bool:
    """Return True when the sampled lines resemble a unified diff.

    A diff needs at least one file/hunk header plus either one ``+``/``-``
    change line or a second header.
    """
    headers = sum(1 for line in sample if DIFF_FILE_HEADER_RE.match(line) or DIFF_HUNK_RE.match(line))
    # startswith() instead of a substring test like `line[:1] in "+-"`: the empty
    # string is a substring of every string, so blank lines would otherwise be
    # counted as change lines and one header plus one blank line would look like
    # a diff.
    changes = sum(
        1
        for line in sample
        if line.startswith(("+", "-")) and not line.startswith(("+++", "---"))
    )
    return headers >= 1 and (changes >= 1 or headers >= 2)
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
def _looks_like_search(sample: list[str]) -> bool:
    """Return True when at least two lines are sampled and 60% or more match SEARCH_LINE_RE."""
    hits = len([line for line in sample if SEARCH_LINE_RE.match(line)])
    if not _ratio(hits, len(sample), 0.6):
        return False
    return len(sample) >= 2
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
def _looks_like_log(sample: list[str]) -> bool:
    """Return True when 40% or more of the sampled lines carry a timestamp or log level."""
    hits = 0
    for line in sample:
        if LOG_TIMESTAMP_RE.match(line) or LOG_LEVEL_RE.search(line):
            hits += 1
    return _ratio(hits, len(sample), 0.4)
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
def _looks_like_code(sample: list[str]) -> bool:
    """Return True when 25% or more of the sampled lines match CODE_SIGNAL_RE."""
    hits = len([line for line in sample if CODE_SIGNAL_RE.search(line)])
    return _ratio(hits, len(sample), 0.25)
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
def compress_json(text: str) -> tuple[str, dict[str, object]]:
    """Minify JSON by parsing it and re-serializing without optional whitespace.

    A trailing newline is kept only if the input ended with one. Text that does
    not parse is handed to the prose strategy, and the returned detail records
    ``fallback_from="json"``.
    """
    try:
        document = json.loads(text)
    except (ValueError, RecursionError):
        # When parsing fails, fall back safely to the prose strategy so the
        # lossless guarantee is not broken.
        fallback_body, fallback_detail = compress_prose(text)
        fallback_detail["fallback_from"] = "json"
        return fallback_body, fallback_detail
    minified = json.dumps(document, ensure_ascii=False, separators=(",", ":"))
    suffix = "\n" if text.endswith("\n") else ""
    detail: dict[str, object] = {"strategy": "json-compact", "lossy": False, "json_parse_ok": True}
    return minified + suffix, detail
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
def compress_diff(text: str) -> tuple[str, dict[str, object]]:
    """Keep diff headers and +/- changes, replacing each context run with a marker.

    Context lines are blank lines and lines starting with a space. Each
    consecutive run of them is replaced by a single
    ``[context-guard-kit] N unchanged context line(s) omitted`` marker. Every
    other line (file headers, hunk headers, changes, anything unrecognized) is
    kept verbatim.

    Returns:
        ``(compressed_text, detail)`` where ``detail["context_lines_omitted"]``
        is the total number of context lines folded away.
    """
    kept: list[str] = []
    pending_context = 0
    omitted_total = 0

    def flush() -> None:
        """Emit the omission marker for the context run collected so far, if any."""
        nonlocal pending_context, omitted_total
        if pending_context:
            kept.append(f"[context-guard-kit] {pending_context} unchanged context line(s) omitted")
            omitted_total += pending_context
            pending_context = 0

    for line in text.splitlines():
        structural = bool(DIFF_FILE_HEADER_RE.match(line) or DIFF_HUNK_RE.match(line))
        # startswith() instead of a substring test like `line[:1] in "+-"`: the
        # empty string is a substring of every string, so blank lines would be
        # treated as changes and never reach the context branch below.
        changed = line.startswith(("+", "-")) and not line.startswith(("+++", "---"))
        if not structural and not changed and (line == "" or line.startswith(" ")):
            pending_context += 1
            continue
        flush()
        kept.append(line)
    flush()
    detail: dict[str, object] = {
        "strategy": "diff-keep-changes",
        "lossy": True,
        "context_lines_omitted": omitted_total,
    }
    return _join_lines(kept, text), detail
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
def compress_log(text: str) -> tuple[str, dict[str, object]]:
    """Fold each run of consecutive identical lines into one ``line (xN)`` entry.

    Lines that are not repeated are emitted unchanged.
    ``detail["lines_collapsed"]`` is the number of lines removed by folding.
    """
    # Each entry is [line_text, repeat_count] for one run of identical lines.
    runs: list[list] = []
    for line in text.splitlines():
        if runs and runs[-1][0] == line:
            runs[-1][1] += 1
        else:
            runs.append([line, 1])
    rendered = [entry if repeats == 1 else f"{entry} (x{repeats})" for entry, repeats in runs]
    folded = sum(repeats - 1 for _, repeats in runs)
    detail: dict[str, object] = {"strategy": "log-collapse-repeats", "lossy": True, "lines_collapsed": folded}
    return _join_lines(rendered, text), detail
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
def compress_search(text: str) -> tuple[str, dict[str, object]]:
    """Remove repeated match lines, keeping the first occurrence in its original position.

    Two lines count as duplicates when they are equal after trailing whitespace
    is stripped. The strategy is reported as lossy only if something was dropped.
    """
    kept: list[str] = []
    fingerprints: set[str] = set()
    duplicates = 0
    for line in text.splitlines():
        fingerprint = line.rstrip()
        if fingerprint not in fingerprints:
            fingerprints.add(fingerprint)
            kept.append(line)
        else:
            duplicates += 1
    detail: dict[str, object] = {
        "strategy": "search-dedupe",
        "lossy": duplicates > 0,
        "duplicate_lines_dropped": duplicates,
    }
    return _join_lines(kept, text), detail
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
def compress_code(text: str) -> tuple[str, dict[str, object]]:
    """Strip trailing whitespace and squeeze each run of blank lines down to one."""
    normalized = _whitespace_normalize(text, strategy="code-whitespace", max_consecutive_blank=1)
    return normalized
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
def compress_prose(text: str) -> tuple[str, dict[str, object]]:
    """Strip trailing whitespace and squeeze each run of blank lines down to one."""
    normalized = _whitespace_normalize(text, strategy="prose-whitespace", max_consecutive_blank=1)
    return normalized
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
def _whitespace_normalize(text: str, *, strategy: str, max_consecutive_blank: int) -> tuple[str, dict[str, object]]:
    """Strip trailing whitespace per line and cap runs of blank lines.

    Args:
        text: Input to normalize.
        strategy: Strategy name recorded in the returned detail.
        max_consecutive_blank: Maximum number of blank lines kept in a row;
            additional blank lines in the same run are dropped.

    Returns:
        ``(normalized_text, detail)``. ``detail["lossy"]`` is True whenever the
        normalized text differs from the input in any way, and
        ``detail["blank_lines_collapsed"]`` is the number of blank lines dropped.
    """
    kept: list[str] = []
    blank_streak = 0
    squeezed = 0
    for line in text.splitlines():
        stripped = line.rstrip()
        if stripped:
            blank_streak = 0
        else:
            blank_streak += 1
            if blank_streak > max_consecutive_blank:
                squeezed += 1
                continue
        kept.append(stripped)
    normalized = _join_lines(kept, text)
    # Compare against the input instead of only checking for stripped whitespace:
    # splitlines() also splits on "\r\n", "\r", form feeds and similar boundaries,
    # which are re-joined with "\n", so the text can change even when no trailing
    # whitespace was stripped and no blank line was dropped.
    lossy = normalized != text
    return normalized, {"strategy": strategy, "lossy": lossy, "blank_lines_collapsed": squeezed}
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
def _join_lines(lines: list[str], original: str) -> str:
|
|
456
|
-
"""Join compressed lines, restoring a trailing newline only if the input had one."""
|
|
457
|
-
body = "\n".join(lines)
|
|
458
|
-
if original.endswith("\n") and body and not body.endswith("\n"):
|
|
459
|
-
body += "\n"
|
|
460
|
-
return body
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
# Dispatch table from content-type name to its compression function. Each callable
# takes the text and returns (compressed_text, strategy_detail).
STRATEGIES: dict[str, Callable[[str], tuple[str, dict[str, object]]]] = {
    "json": compress_json,
    "diff": compress_diff,
    "log": compress_log,
    "search": compress_search,
    "code": compress_code,
    "prose": compress_prose,
}
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
def build_metadata(
    *,
    content_type: str,
    type_source: str,
    strategy_detail: dict[str, object],
    original_text: str,
    compressed_text: str,
    redacted_lines: int,
    input_truncated: bool,
    input_bytes: int,
    max_bytes: int,
    protected_policy_enabled: bool = False,
) -> dict[str, object]:
    """Build the compression receipt describing what was done to the input.

    Byte and line figures are measured from the two texts; the token figures
    are estimates from ``token_proxy``. The strategy is treated as lossy unless
    ``strategy_detail["lossy"]`` says otherwise, and the retrieval hint is
    worded accordingly. When ``protected_policy_enabled`` is set, the
    protected-zone and transform policies are attached, and the retrieval hint
    is replaced if that policy requires exact retrieval.
    """
    original_bytes = byte_length(original_text)
    compressed_bytes = byte_length(compressed_text)
    if original_bytes:
        ratio = round(compressed_bytes / original_bytes, 4)
    else:
        ratio = 1.0
    lossy = bool(strategy_detail.get("lossy", True))
    if lossy:
        retrieval_hint = (
            "Lossy: store the full sanitized text for exact recovery via "
            "`context-guard-artifact store` and query slices later."
        )
    else:
        retrieval_hint = "Data-preserving: compact form is semantically equivalent to the sanitized input."

    input_section = {
        "bytes_read": input_bytes,
        "truncated": input_truncated,
        "max_bytes": max_bytes,
    }
    redaction_section = {
        "redacted_lines": redacted_lines,
        "redacted_before_receipt": True,
    }
    bytes_section = {
        "measurement": "observed",
        "original": original_bytes,
        "compressed": compressed_bytes,
        "saved": original_bytes - compressed_bytes,
        "compression_ratio": ratio,
    }
    lines_section = {
        "measurement": "observed",
        "original": line_count(original_text),
        "compressed": line_count(compressed_text),
    }
    token_section = {
        "measurement": "estimated",
        "method": f"chars_div_{TOKEN_PROXY_CHARS_PER_TOKEN}",
        "original": token_proxy(original_text),
        "compressed": token_proxy(compressed_text),
    }
    metadata: dict[str, object] = {
        "tool": "context-guard-kit.context_compress",
        "metadata_version": 1,
        "content_type": content_type,
        "type_source": type_source,
        "strategy": strategy_detail.get("strategy"),
        "strategy_detail": strategy_detail,
        "lossy": lossy,
        "input": input_section,
        "redaction": redaction_section,
        "bytes": bytes_section,
        "lines": lines_section,
        "token_proxy": token_section,
        "retrieval_hint": retrieval_hint,
    }
    if not protected_policy_enabled:
        return metadata

    protected_policy = build_protected_policy(
        text=original_text,
        content_type=content_type,
        strategy_detail=strategy_detail,
        lossy=lossy,
    )
    metadata["protected_zone_policy"] = protected_policy
    metadata["transform_policy"] = build_transform_policy(protected_policy)
    if protected_policy.get("retrieval_required"):
        metadata["retrieval_hint"] = (
            "Protected lossy structural transform: store the full sanitized text with "
            "`context-guard-artifact store` and retrieve exact slices before relying on omitted content."
        )
    return metadata
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
def compress_text(
    text: str,
    *,
    forced_type: str | None,
    show_paths: bool,
    input_truncated: bool,
    input_bytes: int,
    max_bytes: int,
    protected_policy_enabled: bool = False,
) -> tuple[str, dict[str, object]]:
    """Sanitize ``text``, pick a strategy, compress, and return the body with its receipt.

    Sanitizing happens before anything else, so classification, the compressed
    body and the receipt are all derived from the redacted text. ``forced_type``
    skips auto-detection when it is not None.
    """
    sanitized, redacted_lines = sanitize_text(text, show_paths=show_paths)
    if forced_type is None:
        content_type = classify_content(sanitized)
        type_source = "detected"
    else:
        content_type = forced_type
        type_source = "override"
    candidate, strategy_detail = STRATEGIES[content_type](sanitized)
    # Conservative guarantee: no strategy may emit more than it was given. On small
    # inputs a fold marker can be longer than the text it replaces; in that case the
    # sanitized original is kept as-is.
    shrank = byte_length(candidate) < byte_length(sanitized)
    strategy_detail["reduced"] = shrank
    compressed = candidate if shrank else sanitized
    metadata = build_metadata(
        content_type=content_type,
        type_source=type_source,
        strategy_detail=strategy_detail,
        original_text=sanitized,
        compressed_text=compressed,
        redacted_lines=redacted_lines,
        input_truncated=input_truncated,
        input_bytes=input_bytes,
        max_bytes=max_bytes,
        protected_policy_enabled=protected_policy_enabled,
    )
    return compressed, metadata
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
def render_text_receipt(metadata: dict[str, object]) -> str:
    """Format the receipt as a short bulleted summary for stderr in text mode.

    The bytes and token lines are emitted whenever those sections are dicts
    (missing sections default to an empty dict). The redaction line appears
    only when the redacted-line count is truthy.
    """
    lossy_label = str(metadata.get("lossy")).lower()
    summary = [
        "[context-guard-kit] compress",
        f"- content_type: {metadata.get('content_type')} ({metadata.get('type_source')})",
        f"- strategy: {metadata.get('strategy')} (lossy={lossy_label})",
    ]

    byte_stats = metadata.get("bytes", {})
    if isinstance(byte_stats, dict):
        before, after = byte_stats.get("original"), byte_stats.get("compressed")
        summary.append(f"- bytes: {before} -> {after} (ratio={byte_stats.get('compression_ratio')})")

    token_stats = metadata.get("token_proxy", {})
    if isinstance(token_stats, dict):
        before, after = token_stats.get("original"), token_stats.get("compressed")
        summary.append(f"- token_proxy(estimated): {before} -> {after}")

    redaction = metadata.get("redaction", {})
    if isinstance(redaction, dict):
        redacted = redaction.get("redacted_lines")
        if redacted:
            summary.append(f"- redacted_lines: {redacted}")

    return "".join(f"{entry}\n" for entry in summary)
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
def run_compress(args: argparse.Namespace) -> int:
    """Run the compress command: read stdin, compress it, and write the result.

    With ``--json`` the metadata and content are printed as one JSON document;
    with ``--metadata-only`` only the metadata is printed. Otherwise the
    compressed text goes to stdout and, unless ``--quiet`` is set, the text
    receipt goes to stderr.

    Returns:
        0 on success, 2 when ``args.type`` names an unknown strategy.
    """
    max_bytes = bounded_int(args.max_bytes, DEFAULT_MAX_BYTES, 1, MAX_MAX_BYTES)
    raw_text, input_truncated, input_bytes = read_bounded_stdin(max_bytes)
    forced_type = args.type
    if not (forced_type is None or forced_type in STRATEGIES):
        print(f"context-guard-compress: unknown --type: {forced_type}", file=sys.stderr)
        return 2
    compressed, metadata = compress_text(
        raw_text,
        forced_type=forced_type,
        show_paths=args.show_paths,
        input_truncated=input_truncated,
        input_bytes=input_bytes,
        max_bytes=max_bytes,
        protected_policy_enabled=bool(args.protected_policy),
    )
    if args.json or args.metadata_only:
        # --json takes precedence when both flags are given.
        document = {"metadata": metadata, "content": compressed} if args.json else metadata
        print(json.dumps(document, ensure_ascii=False, indent=2, sort_keys=True))
        return 0
    sys.stdout.write(compressed)
    if not args.quiet:
        sys.stderr.write(render_text_receipt(metadata))
    return 0
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser and bind ``run_compress`` as its handler."""
    parser = argparse.ArgumentParser(
        description="Classify and conservatively compress stdin (sanitized) for token-budget reuse.",
    )
    add = parser.add_argument
    add(
        "--type",
        choices=CONTENT_TYPES,
        default=None,
        help="force a content type instead of auto-detecting (json/diff/log/search/code/prose)",
    )
    add(
        "--json",
        action="store_true",
        help="emit JSON with metadata and compressed content",
    )
    add(
        "--protected-policy",
        action="store_true",
        help="add opt-in protected-zone transform policy metadata to --json/--metadata-only receipts; default content is unchanged",
    )
    add(
        "--metadata-only",
        action="store_true",
        help="emit only the JSON metadata receipt (no compressed body)",
    )
    add(
        "--quiet",
        action="store_true",
        help="suppress the text receipt on stderr in text mode",
    )
    add(
        "--show-paths",
        action="store_true",
        help="show raw absolute paths instead of path hashes; local debugging only because private paths may be exposed",
    )
    add(
        "--max-bytes",
        type=int,
        default=DEFAULT_MAX_BYTES,
        help="maximum stdin bytes to read before truncating",
    )
    parser.set_defaults(func=run_compress)
    return parser
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
def main() -> int:
    """Parse the command line, run the selected handler, and return its exit status.

    A RuntimeError raised by the handler is reported on stderr and mapped to
    exit status 1.
    """
    args = build_parser().parse_args()
    try:
        status = args.func(args)
        return int(status)
    except RuntimeError as exc:
        print(f"context-guard-compress: {exc}", file=sys.stderr)
    return 1
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|