@ictechgy/context-guard 0.4.11 → 0.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -63,6 +63,7 @@ MAX_JSON_PATH_SEGMENT_CHARS = 64
63
63
  MAX_JSON_WALK_NODES = 10_000
64
64
  MAX_JSON_WALK_DEPTH = 64
65
65
  MAX_JSON_SHAPE_WARNINGS = 200
66
+ MAX_JSON_CANONICAL_COMPARE_BYTES = 200_000
66
67
  SAFE_JSON_PATH_SEGMENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_-]{0,63}$")
67
68
  DYNAMIC_JSON_KEY_RE = re.compile(r"(?i)(request|trace|nonce|random|timestamp|created[_-]?at|updated[_-]?at|date)")
68
69
  SENSITIVE_JSON_KEY_RE = re.compile(
@@ -93,6 +94,22 @@ def json_bytes(data: Any, *, indent: int | None = None) -> str:
93
94
  return json.dumps(data, ensure_ascii=False, sort_keys=True, separators=(",", ":") if indent is None else None, indent=indent)
94
95
 
95
96
 
97
def bounded_canonical_json(data: Any, *, max_bytes: int) -> str | None:
    """Render ``data`` as canonical JSON text, giving up once it grows too large.

    The canonical form is key-sorted, 2-space-indented, non-ASCII-preserving
    JSON followed by a single trailing newline. Output is produced chunk by
    chunk so that an oversized document is abandoned as soon as the running
    size (as measured by ``byte_len_text``) passes ``max_bytes``.

    Returns:
        The canonical text, or ``None`` when the text including its trailing
        newline would exceed ``max_bytes``.
    """
    canonical_encoder = json.JSONEncoder(ensure_ascii=False, sort_keys=True, indent=2)
    pieces: list[str] = []
    used = 0
    for piece in canonical_encoder.iterencode(data):
        used += byte_len_text(piece)
        if used > max_bytes:
            return None
        pieces.append(piece)
    # The trailing newline counts as one more byte against the budget.
    used += 1
    if used > max_bytes:
        return None
    pieces.append("\n")
    return "".join(pieces)
111
+
112
+
96
113
  def json_path_child(path: str, key: object) -> str:
97
114
  """Return a JSON warning path segment without echoing sensitive/dynamic keys."""
98
115
  text = str(key)
@@ -335,8 +352,18 @@ def json_shape_warnings(text: str) -> tuple[str, list[dict[str, Any]]]:
335
352
  if not isinstance(data, (dict, list)):
336
353
  return "json-scalar", []
337
354
  warnings = _walk_json(data)
338
- canonical = json_bytes(data, indent=2) + "\n"
339
- if canonical != text:
355
+ input_bytes = byte_len_text(text)
356
+ canonical = bounded_canonical_json(data, max_bytes=MAX_JSON_CANONICAL_COMPARE_BYTES)
357
+ if canonical is None:
358
+ warnings.append({
359
+ "code": "json_canonical_check_skipped",
360
+ "path": "$",
361
+ "severity": "info",
362
+ "message": "JSON input is parseable but canonical formatting would exceed the comparison byte cap.",
363
+ "input_bytes": input_bytes,
364
+ "max_bytes": MAX_JSON_CANONICAL_COMPARE_BYTES,
365
+ })
366
+ elif canonical != text:
340
367
  warnings.append({
341
368
  "code": "json_not_canonical",
342
369
  "path": "$",
@@ -20,10 +20,12 @@ import os
20
20
  from pathlib import Path
21
21
  import re
22
22
  import sys
23
- from typing import Callable
23
+ from typing import Callable, Iterable
24
24
 
25
25
  DEFAULT_MAX_BYTES = 10_000_000
26
26
  MAX_MAX_BYTES = 100_000_000
27
+ MAX_SEARCH_DEDUPE_KEYS = 50_000
28
+ JSON_PARSE_FAILED = object()
27
29
  # 토큰 추정은 보수적 proxy 일 뿐이다(관측값 아님). 평균 ~4 chars/token 휴리스틱을 쓰되
28
30
  # 메타데이터에 measurement="estimated" 로 명시해 관측 토큰 수와 혼동되지 않게 한다.
29
31
  TOKEN_PROXY_CHARS_PER_TOKEN = 4
@@ -214,20 +216,57 @@ def token_proxy(text: str) -> int:
214
216
  return max(1, round(len(text) / TOKEN_PROXY_CHARS_PER_TOKEN))
215
217
 
216
218
 
219
+ LINE_BOUNDARY_CHARS = {"\n", "\r", "\v", "\f", "\x1c", "\x1d", "\x1e", "\x85", "\u2028", "\u2029"}
220
+
221
+
222
# Every boundary recognised by str.splitlines(). "\r\n" is listed first so a
# CRLF pair is consumed as one break instead of two.
_TEXT_LINE_BREAK_RE = re.compile(r"\r\n|[\n\r\v\f\x1c\x1d\x1e\x85\u2028\u2029]")


def iter_text_lines(text: str) -> Iterable[str]:
    """Yield the lines of ``text`` lazily, without their line terminators.

    Produces the same sequence as ``text.splitlines()`` but one line at a
    time, so a caller that stops early never pays for a full list of lines.
    Boundaries are located with a compiled regular expression rather than a
    per-character Python loop.

    Args:
        text: The text to split.

    Yields:
        Each line without its terminator. A trailing terminator does not
        produce a final empty line, matching ``str.splitlines()``.
    """
    start = 0
    for boundary in _TEXT_LINE_BREAK_RE.finditer(text):
        yield text[start:boundary.start()]
        start = boundary.end()
    # Text after the last terminator (or the whole text when there is none).
    if start < len(text):
        yield text[start:]
242
+
243
+
244
def sample_text_lines(text: str, limit: int) -> list[str]:
    """Return at most ``limit`` leading lines of ``text``.

    Lines are taken from ``iter_text_lines`` and iteration stops as soon as
    ``limit`` lines have been collected, so the rest of ``text`` is never
    scanned.

    Args:
        text: The text to sample.
        limit: Maximum number of lines to return. Zero or a negative value
            yields an empty list.

    Returns:
        The first ``limit`` lines, without line terminators.
    """
    # Imported locally so this function does not rely on a file-level
    # itertools import.
    from itertools import islice

    # Clamp so a non-positive limit returns nothing instead of one line
    # (and so islice never sees a negative stop).
    return list(islice(iter_text_lines(text), max(0, limit)))
251
+
252
+
217
253
def classify_content(text: str) -> str:
    """Classify ``text`` as one of CONTENT_TYPES on a best-effort basis.

    Surrounding whitespace is ignored. Blank input is reported as "prose".
    Input that parses as JSON is reported as "json". Everything else is
    delegated to ``classify_non_json_content``, which receives the stripped
    text.
    """
    body = text.strip()
    if not body:
        return "prose"
    return "json" if _looks_like_json(body) else classify_non_json_content(body)
266
+
267
+
268
+ def classify_non_json_content(stripped: str) -> str:
269
+ sample = sample_text_lines(stripped, 200)
231
270
  if _looks_like_diff(sample):
232
271
  return "diff"
233
272
  if _looks_like_search(sample):
@@ -355,14 +394,17 @@ def build_readable_compression_metadata(
355
394
  }
356
395
 
357
396
 
358
- def _looks_like_json(stripped: str) -> bool:
359
- if stripped[0] not in "{[":
360
- return False
397
def parse_json_candidate(stripped: str) -> object:
    """Parse ``stripped`` as a JSON object or array.

    Only text whose first character is ``{`` or ``[`` is handed to the JSON
    parser; anything else (including the empty string) is rejected up front.

    Returns:
        The parsed value, or the ``JSON_PARSE_FAILED`` sentinel when the text
        is rejected or ``json.loads`` raises ``ValueError``/``RecursionError``.
        Callers must compare against the sentinel with ``is``.
    """
    if stripped[:1] not in ("{", "["):
        return JSON_PARSE_FAILED
    try:
        parsed = json.loads(stripped)
    except (ValueError, RecursionError):
        return JSON_PARSE_FAILED
    return parsed
404
+
405
+
406
def _looks_like_json(stripped: str) -> bool:
    """Report whether ``parse_json_candidate`` accepts ``stripped``."""
    candidate = parse_json_candidate(stripped)
    return candidate is not JSON_PARSE_FAILED
366
408
 
367
409
 
368
410
  def _ratio(matches: int, total: int, threshold: float) -> bool:
@@ -390,15 +432,7 @@ def _looks_like_code(sample: list[str]) -> bool:
390
432
  return _ratio(matches, len(sample), 0.25)
391
433
 
392
434
 
393
- def compress_json(text: str) -> tuple[str, dict[str, object]]:
394
- """Re-serialize JSON without insignificant whitespace (data-preserving)."""
395
- try:
396
- parsed = json.loads(text)
397
- except (ValueError, RecursionError):
398
- # 파싱 불가 시 무손실을 깨지 않도록 prose 전략으로 안전하게 폴백한다.
399
- compressed, detail = compress_prose(text)
400
- detail["fallback_from"] = "json"
401
- return compressed, detail
435
+ def compress_parsed_json(text: str, parsed: object) -> tuple[str, dict[str, object]]:
402
436
  compact = json.dumps(parsed, ensure_ascii=False, separators=(",", ":"))
403
437
  if not text.endswith("\n"):
404
438
  trailing = ""
@@ -407,6 +441,17 @@ def compress_json(text: str) -> tuple[str, dict[str, object]]:
407
441
  return compact + trailing, {"strategy": "json-compact", "lossy": False, "json_parse_ok": True}
408
442
 
409
443
 
444
def compress_json(text: str) -> tuple[str, dict[str, object]]:
    """Compress JSON text, falling back to the prose strategy if it does not parse.

    The whitespace-stripped text is parsed with ``parse_json_candidate``. On
    success the parsed value and the original ``text`` are passed to
    ``compress_parsed_json``. On failure the text is compressed with
    ``compress_prose`` and the returned detail dict gains
    ``"fallback_from": "json"``.
    """
    parsed = parse_json_candidate(text.strip())
    if parsed is not JSON_PARSE_FAILED:
        return compress_parsed_json(text, parsed)
    # Unparseable input: fall back to the prose strategy so the lossless
    # guarantee is not broken.
    compressed, detail = compress_prose(text)
    detail["fallback_from"] = "json"
    return compressed, detail
453
+
454
+
410
455
  def compress_diff(text: str) -> tuple[str, dict[str, object]]:
411
456
  """Keep file headers, hunk headers, and +/- changes; collapse context runs."""
412
457
  out: list[str] = []
@@ -464,18 +509,28 @@ def compress_log(text: str) -> tuple[str, dict[str, object]]:
464
509
 
465
510
 
466
511
def compress_search(text: str) -> tuple[str, dict[str, object]]:
    """Drop repeated match lines, keeping first occurrences in their original order.

    Two lines are duplicates when they are equal after stripping trailing
    whitespace. At most ``MAX_SEARCH_DEDUPE_KEYS`` distinct keys are
    remembered. Once that many are stored, new lines are still emitted but no
    longer recorded, so later repeats of those lines are not dropped.

    Returns:
        The surviving lines joined by ``_join_lines`` and a detail dict with
        the drop count and whether the key limit was reached.
    """
    kept: list[str] = []
    known_keys: set[str] = set()
    duplicates = 0
    limit_hit = False
    for line in iter_text_lines(text):
        key = line.rstrip()
        if key in known_keys:
            duplicates += 1
            continue
        kept.append(line)
        if len(known_keys) >= MAX_SEARCH_DEDUPE_KEYS:
            # Key budget exhausted: keep emitting, stop remembering.
            limit_hit = True
        else:
            known_keys.add(key)
    detail: dict[str, object] = {
        "strategy": "search-dedupe",
        "lossy": duplicates > 0,
        "duplicate_lines_dropped": duplicates,
        "dedupe_key_limit": MAX_SEARCH_DEDUPE_KEYS,
        "dedupe_key_limit_reached": limit_hit,
    }
    return _join_lines(kept, text), detail
479
534
 
480
535
 
481
536
  def compress_code(text: str) -> tuple[str, dict[str, object]]:
@@ -689,14 +744,21 @@ def compress_text(
689
744
  the compressed body, or the metadata that follows.
690
745
  """
691
746
  sanitized, redacted_lines = sanitize_text(text, show_paths=show_paths)
747
+ parsed_json: object = JSON_PARSE_FAILED
692
748
  if forced_type is not None:
693
749
  content_type, type_source = forced_type, "override"
694
750
  else:
695
- content_type, type_source = classify_content(sanitized), "detected"
751
+ stripped = sanitized.strip()
752
+ parsed_json = parse_json_candidate(stripped)
753
+ content_type = "json" if parsed_json is not JSON_PARSE_FAILED else classify_non_json_content(stripped)
754
+ type_source = "detected"
696
755
  if compression_mode == "readable" and content_type == "prose":
697
756
  compressed, strategy_detail = compress_prose_readable(sanitized)
698
757
  else:
699
- compressed, strategy_detail = STRATEGIES[content_type](sanitized)
758
+ if content_type == "json" and parsed_json is not JSON_PARSE_FAILED:
759
+ compressed, strategy_detail = compress_parsed_json(sanitized, parsed_json)
760
+ else:
761
+ compressed, strategy_detail = STRATEGIES[content_type](sanitized)
700
762
  if compression_mode == "readable":
701
763
  strategy_detail["readable_mode"] = True
702
764
  strategy_detail["readable_strategy"] = "sentence-window-preview"
@@ -9,6 +9,8 @@ from __future__ import annotations
9
9
 
10
10
  import argparse
11
11
  import codecs
12
+ import collections
13
+ import itertools
12
14
  from dataclasses import dataclass
13
15
  import json
14
16
  import os
@@ -455,26 +457,94 @@ def cap_line(line: str, max_chars: int) -> str:
455
457
  return line[: max(0, max_chars - len(marker) - len(suffix))] + marker + suffix
456
458
 
457
459
 
458
- def select_lines(lines: list[str], flt: CompiledFilter, max_line_chars: int) -> list[str]:
459
- selected = [cap_line(line, max_line_chars) for line in lines]
460
- if flt.include_regex:
461
- selected = [line for line in selected if any(pattern.search(line) for pattern in flt.include_regex)]
462
- if flt.exclude_regex:
463
- selected = [line for line in selected if not any(pattern.search(line) for pattern in flt.exclude_regex)]
460
+ LINE_BOUNDARY_CHARS = {"\n", "\r", "\v", "\f", "\x1c", "\x1d", "\x1e", "\x85", "\u2028", "\u2029"}
461
+
462
+
463
@dataclass
class LineSelection:
    """Result of one line-filtering pass, with counts describing the pass."""

    # Lines kept after capping, filtering and head/tail/max trimming.
    lines: list[str]
    # Number of source lines consumed from the input before the pass stopped.
    input_lines: int
    # False when the pass stopped before the input iterator was exhausted.
    input_complete: bool
468
+
469
+
470
# Every boundary recognised by str.splitlines(). "\r\n" is listed first so a
# CRLF pair stays attached to its line as one terminator.
_KEEPENDS_LINE_BREAK_RE = re.compile(r"\r\n|[\n\r\v\f\x1c\x1d\x1e\x85\u2028\u2029]")


def iter_text_lines_keepends(text: str) -> Iterable[str]:
    """Yield the lines of ``text`` lazily, each with its terminator attached.

    Produces the same sequence as ``text.splitlines(keepends=True)`` but one
    line at a time, so a consumer that stops early never builds the full
    list. Boundaries are located with a compiled regular expression rather
    than a per-character Python loop.

    Args:
        text: The text to split.

    Yields:
        Each line including its terminator. The final line has none when
        ``text`` does not end with a terminator.
    """
    start = 0
    for boundary in _KEEPENDS_LINE_BREAK_RE.finditer(text):
        end = boundary.end()
        yield text[start:end]
        start = end
    # Unterminated remainder after the last boundary.
    if start < len(text):
        yield text[start:]
490
+
491
+
492
def line_matches_filter(line: str, flt: CompiledFilter) -> bool:
    """Decide whether ``line`` passes the include/exclude patterns of ``flt``.

    A line passes when it is found by at least one include pattern (or there
    are no include patterns) and by none of the exclude patterns. Patterns
    are applied with ``search``, so they may match anywhere in the line.
    """
    includes = flt.include_regex
    if includes and not any(pattern.search(line) for pattern in includes):
        return False
    excludes = flt.exclude_regex
    return not (excludes and any(pattern.search(line) for pattern in excludes))
498
+
499
+
500
def select_lines_with_stats(lines: Iterable[str], flt: CompiledFilter, max_line_chars: int) -> LineSelection:
    """Filter ``lines`` in a single streaming pass and report how much input was read.

    Every source line is first capped with ``cap_line`` and then tested with
    ``line_matches_filter``. What happens to matching lines depends on
    ``flt``:

    * If ``head_lines`` or ``tail_lines`` is set, the first ``head_lines``
      matches and the last ``tail_lines`` matches are kept. Matches that fall
      in both windows are emitted once. With no tail requested, reading stops
      as soon as the head window is full.
    * Otherwise matches are collected until ``max_lines`` (bounded by
      ``MAX_EMIT_LINES``) is reached, then reading stops.

    The result is finally trimmed to ``max_lines`` and ``MAX_EMIT_LINES``.

    Returns:
        A ``LineSelection`` holding the kept lines, the number of source
        lines consumed, and whether the loop ran to the end of ``lines``
        without stopping early.
    """
    consumed = 0
    matched = 0
    saw_everything = True
    windowed = flt.head_lines is not None or flt.tail_lines is not None
    if windowed:
        head_n = 0 if flt.head_lines is None else flt.head_lines
        tail_n = 0 if flt.tail_lines is None else flt.tail_lines
        head_part: list[str] = []
        # A bounded deque keeps only the most recent tail_n matches.
        tail_part: collections.deque[str] = collections.deque(maxlen=tail_n)
        for raw_line in lines:
            consumed += 1
            capped = cap_line(raw_line, max_line_chars)
            if not line_matches_filter(capped, flt):
                continue
            matched += 1
            if head_n and len(head_part) < head_n:
                head_part.append(capped)
            if tail_n:
                tail_part.append(capped)
            elif head_n and len(head_part) >= head_n:
                # Head-only request is satisfied; nothing later can matter.
                saw_everything = False
                break
        tail_kept = list(tail_part)
        if head_part and tail_kept:
            # Drop tail entries that are already present in the head window.
            overlap = len(head_part) + len(tail_kept) - matched
            tail_kept = tail_kept[max(0, overlap):]
        chosen = head_part + tail_kept
    else:
        if flt.max_lines is None:
            limit = MAX_EMIT_LINES
        else:
            limit = min(flt.max_lines, MAX_EMIT_LINES)
        chosen = []
        for raw_line in lines:
            consumed += 1
            capped = cap_line(raw_line, max_line_chars)
            if not line_matches_filter(capped, flt):
                continue
            matched += 1
            chosen.append(capped)
            if len(chosen) >= limit:
                saw_everything = False
                break
    line_cap = flt.max_lines
    if line_cap is not None and len(chosen) > line_cap:
        chosen = chosen[:line_cap]
    if len(chosen) > MAX_EMIT_LINES:
        chosen = chosen[:MAX_EMIT_LINES]
    return LineSelection(chosen, consumed, saw_everything)
544
+
545
+
546
def select_lines(lines: Iterable[str], flt: CompiledFilter, max_line_chars: int) -> list[str]:
    """Return only the kept lines from ``select_lines_with_stats``, discarding its counters."""
    selection = select_lines_with_stats(lines, flt, max_line_chars)
    return selection.lines
478
548
 
479
549
 
480
550
  def validation_payload(valid: bool, errors: list[str], count: int = 0) -> dict[str, Any]:
@@ -720,7 +790,6 @@ def cmd_run(args: argparse.Namespace) -> int:
720
790
  filters, errors = load_filters(Path(args.config).expanduser())
721
791
  result = run_command(command, timeout_seconds, max_capture)
722
792
  rc = result.returncode
723
- output = result.stdout_text + result.stderr_text
724
793
  protected_nonzero = rc != 0 and is_protected_command(command)
725
794
  report: dict[str, Any] = {"tool": TOOL_NAME, "schema_version": SCHEMA_VERSION, "mode": "run", "command_exit_code": rc, "decision": "passthrough", "reason": "unclassified", "protected_nonzero": protected_nonzero}
726
795
  if result.timed_out:
@@ -746,18 +815,19 @@ def cmd_run(args: argparse.Namespace) -> int:
746
815
  report["filter_id"] = matched.id
747
816
  else:
748
817
  try:
749
- lines = output.splitlines(keepends=True)
750
- filtered = select_lines(lines, matched, max_line_chars)
818
+ source_lines = itertools.chain(iter_text_lines_keepends(result.stdout_text), iter_text_lines_keepends(result.stderr_text))
819
+ selection = select_lines_with_stats(source_lines, matched, max_line_chars)
820
+ filtered = selection.lines
751
821
  except re.error as exc:
752
822
  report["reason"] = f"filter-error:{compact(str(exc), 80)}"
753
823
  report["filter_id"] = matched.id
754
824
  else:
755
- if output and not filtered:
825
+ if (result.stdout_text or result.stderr_text) and not filtered:
756
826
  report["reason"] = "empty-output-fallback"
757
827
  report["filter_id"] = matched.id
758
828
  else:
759
829
  sys.stdout.write("".join(filtered))
760
- report.update({"decision": "filtered", "reason": "matched", "filter_id": matched.id, "input_lines": len(lines), "output_lines": len(filtered)})
830
+ report.update({"decision": "filtered", "reason": "matched", "filter_id": matched.id, "input_lines": selection.input_lines, "input_lines_complete": selection.input_complete, "output_lines": len(filtered)})
761
831
  emit_run_report(args, report)
762
832
  return rc
763
833
  if not result.passthrough_emitted:
@@ -957,6 +957,29 @@ def metadata_size(data: dict[str, Any]) -> int:
957
957
  return len(json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True).encode("utf-8", errors="replace")) + 1
958
958
 
959
959
 
960
def receipt_working_copy(data: dict[str, Any]) -> tuple[dict[str, Any], bool]:
    """Copy receipt metadata without deep-copying or serializing an oversized pack body.

    The pack body is already an immutable string in normal builds and stdout
    remains authoritative for it. When it cannot possibly fit under the
    receipt cap by itself, it is omitted before the first receipt-size probe
    so that capping work only touches metadata previews.

    Args:
        data: The full receipt mapping. It is not modified.

    Returns:
        A ``(receipt, pack_omitted)`` pair. ``receipt`` holds a deep copy of
        every value except a string ``"pack"``, which is shared by reference
        when it fits and left out when it does not. When the pack is left
        out, ``receipt["pack_omitted_from_receipt"]`` is set to ``True``.
    """
    receipt: dict[str, Any] = {}
    pack_omitted = False
    for key, value in data.items():
        if key == "pack" and isinstance(value, str):
            # UTF-8 never uses fewer bytes than there are code points (with
            # errors="replace" a lone surrogate becomes a one-byte "?"), so a
            # character count above the cap settles the question without
            # encoding the whole body. Only strings short enough to
            # possibly fit are actually encoded.
            too_large = (
                len(value) > MAX_RECEIPT_BYTES
                or len(value.encode("utf-8", errors="replace")) > MAX_RECEIPT_BYTES
            )
            if too_large:
                pack_omitted = True
            else:
                # Strings are immutable, so sharing the reference is safe.
                receipt[key] = value
            continue
        receipt[key] = copy.deepcopy(value)
    if pack_omitted:
        receipt["pack_omitted_from_receipt"] = True
    return receipt, pack_omitted
981
+
982
+
960
983
  def artifact_failure(error: str, *, bytes_count: int = 0, capped: bool = False) -> dict[str, Any]:
961
984
  return {
962
985
  "stored": False,
@@ -1113,8 +1136,11 @@ def finalize_receipt_size(receipt: dict[str, Any]) -> int:
1113
1136
 
1114
1137
 
1115
1138
  def shrink_receipt_for_write(data: dict[str, Any]) -> tuple[dict[str, Any], bool]:
1116
- receipt = copy.deepcopy(data)
1117
- capped = False
1139
+ receipt, pack_omitted = receipt_working_copy(data)
1140
+ capped = pack_omitted
1141
+ if pack_omitted:
1142
+ receipt.setdefault("artifact", {})["capped"] = True
1143
+ receipt.setdefault("artifact", {})["cap_bytes"] = MAX_RECEIPT_BYTES
1118
1144
  if metadata_size(receipt) <= MAX_RECEIPT_BYTES:
1119
1145
  return receipt, capped
1120
1146
  capped = True
@@ -11,6 +11,7 @@ import argparse
11
11
  import ast
12
12
  import errno
13
13
  import hashlib
14
+ import importlib.machinery
14
15
  import importlib.util
15
16
  import json
16
17
  import os
@@ -39,8 +40,27 @@ def _load_hook_secret_patterns():
39
40
  raise ImportError("hook_secret_patterns.py not found in " + ", ".join(searched))
40
41
 
41
42
 
43
def _load_sanitize_output():
    """Load the sanitize-output helper as a module from a file next to this script.

    Two file names under ``SCRIPT_DIR`` are tried in order:
    ``sanitize_output.py`` and ``context-guard-sanitize-output``. The first
    one that exists is executed with a ``SourceFileLoader`` under the module
    name ``_claude_token_sanitize_output`` and returned.

    Raises:
        ImportError: If no candidate could be loaded. The message lists
            every path that was tried.
    """
    candidates = (
        SCRIPT_DIR / "sanitize_output.py",
        SCRIPT_DIR / "context-guard-sanitize-output",
    )
    searched = []
    for candidate in candidates:
        searched.append(str(candidate))
        if candidate.is_file():
            # SourceFileLoader is used explicitly, so the file is loaded as
            # Python source whatever its name.
            loader = importlib.machinery.SourceFileLoader("_claude_token_sanitize_output", str(candidate))
            spec = importlib.util.spec_from_loader(loader.name, loader)
            if spec is not None:
                module = importlib.util.module_from_spec(spec)
                loader.exec_module(module)
                return module
    raise ImportError("sanitize_output helper not found in " + ", ".join(searched))
57
+
58
+
42
59
  _hook_secret_patterns = _load_hook_secret_patterns()
60
+ _sanitize_output = _load_sanitize_output()
43
61
  hook_label_has_sensitive_evidence = _hook_secret_patterns.hook_label_has_sensitive_evidence
62
+ redact_sensitive_hook_text = _hook_secret_patterns.redact_sensitive_hook_text
63
+ LineSanitizer = _sanitize_output.LineSanitizer
44
64
 
45
65
  DEFAULT_CONTEXT_LINES = 3
46
66
  DEFAULT_MAX_CHARS = 16_000
@@ -391,6 +411,11 @@ def strip_line_for_brace_count(line: str, in_block_comment: bool = False) -> tup
391
411
  return "".join(output), in_block_comment
392
412
 
393
413
 
414
def redact_symbol_content(content: str) -> str:
    """Run ``content`` through a ``LineSanitizer`` line by line and rejoin it.

    One sanitizer, created with ``show_paths=True``, is reused for every
    line. Lines keep their terminators. Only the first element of each
    ``sanitize`` result is used as the replacement text.
    """
    sanitizer = LineSanitizer(show_paths=True)
    redacted_lines = []
    for raw_line in content.splitlines(keepends=True):
        sanitized_line = sanitizer.sanitize(raw_line)[0]
        redacted_lines.append(sanitized_line)
    return "".join(redacted_lines)
417
+
418
+
394
419
  def find_symbol_slice(path: Path, symbol: str, context: int, max_chars: int, show_paths: bool) -> SymbolSlice | None:
395
420
  text, scan_truncated = read_text_bounded(path)
396
421
  lines = text.splitlines(keepends=True)
@@ -409,6 +434,8 @@ def find_symbol_slice(path: Path, symbol: str, context: int, max_chars: int, sho
409
434
  start_with_context = max(0, start - max(0, context))
410
435
  end_with_context = min(len(lines), end + max(0, context))
411
436
  content = "".join(lines[start_with_context:end_with_context])
437
+ content = redact_symbol_content(content)
438
+ content = redact_sensitive_hook_text(content, "[REDACTED]")
412
439
  capped = False
413
440
  if max_chars > 0 and len(content) > max_chars:
414
441
  marker = f"\n[context-guard-kit] symbol slice capped: {len(content)} chars total\n"