@ictechgy/context-guard 0.4.10 → 0.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -1
- package/README.ko.md +46 -28
- package/README.md +42 -33
- package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
- package/docs/benchmark-workflow-examples.md +3 -0
- package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
- package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
- package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
- package/docs/experimental-benchmark-fixtures.md +24 -7
- package/package.json +2 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +14 -11
- package/plugins/context-guard/README.md +15 -14
- package/plugins/context-guard/bin/context-guard +48 -17
- package/plugins/context-guard/bin/context-guard-artifact +342 -33
- package/plugins/context-guard/bin/context-guard-audit +36 -5
- package/plugins/context-guard/bin/context-guard-bench +1675 -44
- package/plugins/context-guard/bin/context-guard-cache-score +347 -35
- package/plugins/context-guard/bin/context-guard-compress +89 -27
- package/plugins/context-guard/bin/context-guard-cost +7 -2
- package/plugins/context-guard/bin/context-guard-experiments +364 -8
- package/plugins/context-guard/bin/context-guard-failed-nudge +6 -2
- package/plugins/context-guard/bin/context-guard-filter +88 -18
- package/plugins/context-guard/bin/context-guard-pack +329 -19
- package/plugins/context-guard/bin/context-guard-read-symbol +27 -0
- package/plugins/context-guard/bin/context-guard-sanitize-output +245 -18
- package/plugins/context-guard/bin/context-guard-setup +21 -5
- package/plugins/context-guard/bin/context-guard-tool-prune +287 -62
- package/plugins/context-guard/bin/context-guard-trim-output +394 -90
- package/plugins/context-guard/brief/README.md +5 -5
- package/plugins/context-guard/lib/context_guard_command_manifest_loader.py +123 -0
- package/plugins/context-guard/lib/context_guard_commands.py +217 -190
|
@@ -33,6 +33,7 @@ MAX_TOP_ERROR_RECEIPTS = 12
|
|
|
33
33
|
MAX_DUPLICATE_GROUPS = 12
|
|
34
34
|
MAX_SUGGESTED_QUERIES = 12
|
|
35
35
|
SEARCH_SCHEMA_VERSION = "contextguard.artifact.search.v1"
|
|
36
|
+
OUTPUT_SANDBOX_SCHEMA_VERSION = "contextguard.artifact.output-sandbox.v1"
|
|
36
37
|
DEFAULT_SEARCH_MAX_ARTIFACTS = 100
|
|
37
38
|
MAX_SEARCH_MAX_ARTIFACTS = 1_000
|
|
38
39
|
DEFAULT_SEARCH_MAX_MATCHES = 40
|
|
@@ -261,22 +262,38 @@ def reject_symlink_components(path: Path) -> None:
|
|
|
261
262
|
|
|
262
263
|
def regular_private_file_size(path: Path) -> int:
    """Return the size in bytes of a regular artifact file, refusing symlinks.

    The parent directory is opened via ``open_private_directory_no_follow`` and
    the final path component is inspected relative to that descriptor with
    ``follow_symlinks=False``, so the leaf itself is never dereferenced.

    Raises:
        ValueError: if the final component is empty, ``.`` or ``..``, or if the
            leaf is a symlink or anything other than a regular file.
        RuntimeError: if ``DIR_FD_STAT_SUPPORTED`` is false.

    Errors raised by ``os.stat`` are not caught here and propagate to the caller.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    directory_fd = open_private_directory_no_follow(path.parent, label="artifact directory", create=False)
    try:
        if path.name in {"", ".", ".."}:
            raise ValueError("artifact file must name a regular file")
        if not DIR_FD_STAT_SUPPORTED:
            raise RuntimeError("artifact reads require dir_fd stat support")
        # Stat relative to the already-opened parent descriptor.
        info = os.stat(path.name, dir_fd=directory_fd, follow_symlinks=False)
        mode = info.st_mode
        if stat.S_ISLNK(mode):
            raise ValueError(f"artifact file must not be a symlink: {path.name}")
        if not stat.S_ISREG(mode):
            raise ValueError(f"artifact file must be a regular file: {path.name}")
        return int(info.st_size)
    finally:
        # The directory descriptor is released on every exit path.
        os.close(directory_fd)
|
|
271
280
|
|
|
272
281
|
|
|
273
282
|
def read_bounded_private_text(path: Path, max_bytes: int) -> str:
|
|
274
283
|
path = normalize_allowed_first_absolute_symlink(path)
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
284
|
+
parent_fd = open_private_directory_no_follow(path.parent, label="artifact directory", create=False)
|
|
285
|
+
flags = os.O_RDONLY | os.O_NOFOLLOW
|
|
286
|
+
if hasattr(os, "O_CLOEXEC"):
|
|
287
|
+
flags |= os.O_CLOEXEC
|
|
288
|
+
leaf = path.name
|
|
289
|
+
if leaf in {"", ".", ".."}:
|
|
290
|
+
os.close(parent_fd)
|
|
291
|
+
raise ValueError("artifact file must name a regular file")
|
|
292
|
+
try:
|
|
293
|
+
fd = os.open(leaf, flags, dir_fd=parent_fd)
|
|
294
|
+
except OSError:
|
|
295
|
+
os.close(parent_fd)
|
|
296
|
+
raise
|
|
280
297
|
try:
|
|
281
298
|
st = os.fstat(fd)
|
|
282
299
|
if not stat.S_ISREG(st.st_mode):
|
|
@@ -289,6 +306,7 @@ def read_bounded_private_text(path: Path, max_bytes: int) -> str:
|
|
|
289
306
|
return data.decode("utf-8", errors="replace")
|
|
290
307
|
finally:
|
|
291
308
|
os.close(fd)
|
|
309
|
+
os.close(parent_fd)
|
|
292
310
|
|
|
293
311
|
|
|
294
312
|
def no_follow_dir_flags() -> int:
|
|
@@ -351,6 +369,8 @@ def open_private_directory_no_follow(path: Path, *, label: str, create: bool) ->
|
|
|
351
369
|
owned_fd = current_fd
|
|
352
370
|
current_fd = -1
|
|
353
371
|
return owned_fd
|
|
372
|
+
except FileNotFoundError:
|
|
373
|
+
raise
|
|
354
374
|
except OSError as exc:
|
|
355
375
|
raise RuntimeError(f"could not inspect {label}: {os_error_detail(exc)}") from exc
|
|
356
376
|
finally:
|
|
@@ -574,6 +594,8 @@ def build_retrieval_hints(
|
|
|
574
594
|
content_type: str,
|
|
575
595
|
strategy: str,
|
|
576
596
|
total_lines: int,
|
|
597
|
+
raw_dir: str | None = None,
|
|
598
|
+
show_paths: bool = False,
|
|
577
599
|
) -> list[dict[str, object]]:
|
|
578
600
|
"""Build deterministic, machine-readable retrieval hints for bounded round-trip.
|
|
579
601
|
|
|
@@ -591,8 +613,8 @@ def build_retrieval_hints(
|
|
|
591
613
|
lines_hint: dict[str, object] = {
|
|
592
614
|
"type": "lines",
|
|
593
615
|
"selector": {"start": 1, "end": end_line},
|
|
594
|
-
"cli": line_query_cli(artifact_id, 1, end_line),
|
|
595
|
-
"exact": total_lines <= MAX_QUERY_LINES,
|
|
616
|
+
"cli": line_query_cli(artifact_id, 1, end_line, raw_dir=raw_dir, show_paths=show_paths),
|
|
617
|
+
"exact": total_lines <= MAX_QUERY_LINES and artifact_dir_cli_is_exact(raw_dir, show_paths=show_paths),
|
|
596
618
|
}
|
|
597
619
|
if end_line > DEFAULT_MAX_LINES:
|
|
598
620
|
lines_hint["max_lines"] = end_line
|
|
@@ -614,14 +636,14 @@ def build_retrieval_hints(
|
|
|
614
636
|
{
|
|
615
637
|
"type": "pattern",
|
|
616
638
|
"selector": {"pattern": anchor},
|
|
617
|
-
"cli": f"
|
|
639
|
+
"cli": f"{artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)} get {artifact_id} --pattern {shlex.quote(anchor)}",
|
|
618
640
|
}
|
|
619
641
|
)
|
|
620
642
|
hints.append(
|
|
621
643
|
{
|
|
622
644
|
"type": "head",
|
|
623
645
|
"selector": {"max_lines": DEFAULT_MAX_LINES},
|
|
624
|
-
"cli": f"
|
|
646
|
+
"cli": f"{artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)} get {artifact_id} --max-lines {DEFAULT_MAX_LINES}",
|
|
625
647
|
}
|
|
626
648
|
)
|
|
627
649
|
return hints
|
|
@@ -654,16 +676,29 @@ def line_query_cli(
|
|
|
654
676
|
return cli
|
|
655
677
|
|
|
656
678
|
|
|
657
|
-
def line_receipt(
|
|
679
|
+
def line_receipt(
    artifact_id: str,
    line_number: int,
    text: str,
    *,
    raw_dir: str | None = None,
    show_paths: bool = False,
) -> dict[str, object]:
    """Build a receipt record pointing at one line of a stored artifact.

    The record holds the line number, the stripped text passed through
    ``cap_digest_text``, a single-line ``lines`` selector, and the CLI string
    produced by ``line_query_cli`` for that same line. ``raw_dir`` and
    ``show_paths`` are forwarded unchanged to ``line_query_cli``.
    """
    capped_text = cap_digest_text(text.strip())
    single_line_selector = {"type": "lines", "start": line_number, "end": line_number}
    query = line_query_cli(
        artifact_id,
        line_number,
        line_number,
        raw_dir=raw_dir,
        show_paths=show_paths,
    )
    return {
        "line": line_number,
        "text": capped_text,
        "selector": single_line_selector,
        "cli": query,
    }
|
|
664
693
|
|
|
665
694
|
|
|
666
|
-
def build_top_error_receipts(
|
|
695
|
+
def build_top_error_receipts(
|
|
696
|
+
artifact_id: str,
|
|
697
|
+
lines: list[str],
|
|
698
|
+
*,
|
|
699
|
+
raw_dir: str | None = None,
|
|
700
|
+
show_paths: bool = False,
|
|
701
|
+
) -> list[dict[str, object]]:
|
|
667
702
|
receipts: list[dict[str, object]] = []
|
|
668
703
|
seen: set[str] = set()
|
|
669
704
|
for line_number, line in enumerate(lines, start=1):
|
|
@@ -672,7 +707,7 @@ def build_top_error_receipts(artifact_id: str, lines: list[str]) -> list[dict[st
|
|
|
672
707
|
text = cap_digest_text(line.strip())
|
|
673
708
|
if not text or text in seen:
|
|
674
709
|
continue
|
|
675
|
-
receipt = line_receipt(artifact_id, line_number, text)
|
|
710
|
+
receipt = line_receipt(artifact_id, line_number, text, raw_dir=raw_dir, show_paths=show_paths)
|
|
676
711
|
receipts.append(receipt)
|
|
677
712
|
seen.add(text)
|
|
678
713
|
if len(receipts) >= MAX_TOP_ERROR_RECEIPTS:
|
|
@@ -680,7 +715,14 @@ def build_top_error_receipts(artifact_id: str, lines: list[str]) -> list[dict[st
|
|
|
680
715
|
return receipts
|
|
681
716
|
|
|
682
717
|
|
|
683
|
-
def build_duplicate_line_groups(
|
|
718
|
+
def build_duplicate_line_groups(
|
|
719
|
+
artifact_id: str,
|
|
720
|
+
lines: list[str],
|
|
721
|
+
*,
|
|
722
|
+
limit: int = MAX_DUPLICATE_GROUPS,
|
|
723
|
+
raw_dir: str | None = None,
|
|
724
|
+
show_paths: bool = False,
|
|
725
|
+
) -> list[dict[str, object]]:
|
|
684
726
|
counts: dict[str, int] = {}
|
|
685
727
|
first_line: dict[str, int] = {}
|
|
686
728
|
for line_number, line in enumerate(lines, start=1):
|
|
@@ -703,13 +745,20 @@ def build_duplicate_line_groups(artifact_id: str, lines: list[str], *, limit: in
|
|
|
703
745
|
"first_line": line_number,
|
|
704
746
|
"text": text,
|
|
705
747
|
"selector": {"type": "lines", "start": line_number, "end": line_number},
|
|
706
|
-
"cli": line_query_cli(artifact_id, line_number, line_number),
|
|
748
|
+
"cli": line_query_cli(artifact_id, line_number, line_number, raw_dir=raw_dir, show_paths=show_paths),
|
|
707
749
|
}
|
|
708
750
|
)
|
|
709
751
|
return groups
|
|
710
752
|
|
|
711
753
|
|
|
712
|
-
def build_digest(
|
|
754
|
+
def build_digest(
|
|
755
|
+
sanitized_text: str,
|
|
756
|
+
*,
|
|
757
|
+
artifact_id: str,
|
|
758
|
+
redacted_lines: int,
|
|
759
|
+
raw_dir: str | None = None,
|
|
760
|
+
show_paths: bool = False,
|
|
761
|
+
) -> dict[str, object]:
|
|
713
762
|
lines = sanitized_text.splitlines()
|
|
714
763
|
top_errors = compact_items(
|
|
715
764
|
(line for line in lines if ERROR_RE.search(line)),
|
|
@@ -725,8 +774,8 @@ def build_digest(sanitized_text: str, *, artifact_id: str, redacted_lines: int)
|
|
|
725
774
|
"markers": sanitized_text.count("[REDACTED]"),
|
|
726
775
|
},
|
|
727
776
|
"top_error_lines": top_errors,
|
|
728
|
-
"top_error_receipts": build_top_error_receipts(artifact_id, lines),
|
|
729
|
-
"duplicate_line_groups": build_duplicate_line_groups(artifact_id, lines),
|
|
777
|
+
"top_error_receipts": build_top_error_receipts(artifact_id, lines, raw_dir=raw_dir, show_paths=show_paths),
|
|
778
|
+
"duplicate_line_groups": build_duplicate_line_groups(artifact_id, lines, raw_dir=raw_dir, show_paths=show_paths),
|
|
730
779
|
"representative_head": compact_items(
|
|
731
780
|
lines,
|
|
732
781
|
limit=8,
|
|
@@ -769,7 +818,198 @@ def suggested_queries_for(metadata: dict[str, object]) -> list[str]:
|
|
|
769
818
|
return queries[:MAX_SUGGESTED_QUERIES]
|
|
770
819
|
|
|
771
820
|
|
|
772
|
-
def
|
|
821
|
+
def artifact_handle(artifact_id: str) -> str:
    """Return the ``contextguard-artifact:`` handle string for ``artifact_id``."""
    return "contextguard-artifact:{}".format(artifact_id)
|
|
823
|
+
|
|
824
|
+
|
|
825
|
+
def compact_stored_output(metadata: dict[str, object]) -> dict[str, object]:
    """Return a trimmed copy of ``metadata["stored_output"]``.

    Only the keys ``scope``, ``bytes``, ``lines``, ``sha256``, ``content_file``
    and ``metadata_file`` are copied, and only when present. The top-level
    ``content_type`` is added when it is a string. An empty dict is returned
    when ``stored_output`` is missing or not a dict.
    """
    stored_output = metadata.get("stored_output")
    if not isinstance(stored_output, dict):
        return {}
    kept_fields = ("scope", "bytes", "lines", "sha256", "content_file", "metadata_file")
    trimmed: dict[str, object] = {
        name: stored_output[name] for name in kept_fields if name in stored_output
    }
    kind = metadata.get("content_type")
    if isinstance(kind, str):
        trimmed["content_type"] = kind
    return trimmed
|
|
837
|
+
|
|
838
|
+
|
|
839
|
+
def digest_count(digest: dict[str, object], key: str) -> int:
    """Return the length of ``digest[key]`` when it is a list, otherwise 0."""
    entries = digest.get(key)
    if isinstance(entries, list):
        return len(entries)
    return 0
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
def build_output_sandbox_summary(metadata: dict[str, object]) -> dict[str, object]:
    """Summarise ``metadata["digest"]`` as counts instead of full content.

    Returns ``{"status": "stored"}`` when no digest dict is present. Otherwise
    the summary carries the digest status (falling back to ``"stored"``), the
    list lengths of the digest sections, the redaction information, and the
    ``capped_for_metadata`` flag when it is a bool.
    """
    digest = metadata.get("digest")
    if not isinstance(digest, dict):
        return {"status": "stored"}

    # (summary key, digest key) pairs, in the order they appear in the summary.
    counted_sections = (
        ("top_error_count", "top_error_lines"),
        ("top_error_receipt_count", "top_error_receipts"),
        ("duplicate_line_group_count", "duplicate_line_groups"),
        ("representative_head_count", "representative_head"),
        ("representative_tail_count", "representative_tail"),
    )
    summary: dict[str, object] = {"status": digest.get("status") or "stored"}
    for summary_key, digest_key in counted_sections:
        summary[summary_key] = digest_count(digest, digest_key)

    redactions = digest.get("redaction_counts")
    if isinstance(redactions, dict):
        # Keep only scalar values; keys are normalised to strings.
        scalar_counts = {}
        for name, amount in redactions.items():
            if amount is None or isinstance(amount, (int, float, str, bool)):
                scalar_counts[str(name)] = amount
        summary["redaction_counts"] = scalar_counts
    elif "redacted_lines" in digest:
        summary["redacted_lines"] = digest.get("redacted_lines")

    was_capped = digest.get("capped_for_metadata")
    if isinstance(was_capped, bool):
        summary["capped_for_metadata"] = was_capped
    return summary
|
|
869
|
+
|
|
870
|
+
|
|
871
|
+
def rehydration_command_record(
|
|
872
|
+
*,
|
|
873
|
+
kind: str,
|
|
874
|
+
cli: str,
|
|
875
|
+
selector: dict[str, object],
|
|
876
|
+
exact: bool,
|
|
877
|
+
note: str | None = None,
|
|
878
|
+
) -> dict[str, object]:
|
|
879
|
+
record: dict[str, object] = {
|
|
880
|
+
"type": kind,
|
|
881
|
+
"selector": selector,
|
|
882
|
+
"cli": cli,
|
|
883
|
+
"exact": exact,
|
|
884
|
+
}
|
|
885
|
+
if note:
|
|
886
|
+
record["note"] = note
|
|
887
|
+
return record
|
|
888
|
+
|
|
889
|
+
|
|
890
|
+
def _is_plain_int(value: object) -> bool:
    """Return True for real integers; ``bool`` is rejected although it subclasses ``int``."""
    return isinstance(value, int) and not isinstance(value, bool)


def build_output_sandbox_rehydration(
    metadata: dict[str, object],
    *,
    raw_dir: str | None = None,
    show_paths: bool = False,
    max_commands: int = 5,
) -> dict[str, object]:
    """Build the list of CLI commands that re-read slices of a stored artifact.

    The list always starts with a ``receipt`` (metadata) command. Commands are
    then derived from ``metadata["retrieval"]["hints"]``, and a literal
    ``search`` command is added when an error anchor is found in
    ``metadata["digest"]["top_error_lines"]`` and the cap has not been reached.

    Args:
        metadata: Artifact metadata; must contain ``artifact_id``.
        raw_dir: Artifact directory argument, forwarded to the CLI helpers.
        show_paths: Forwarded to the CLI helpers together with ``raw_dir``.
        max_commands: Upper bound on the number of commands returned. The
            metadata command is always included, so the result holds at least
            one command.

    Returns:
        A dict with ``commands``, ``dir_argument`` (``"default"``,
        ``"included"`` or ``"redacted"``), ``exact_commands`` and ``note``.

    Raises:
        KeyError: if ``metadata`` has no ``artifact_id``.
    """
    artifact_id = str(metadata["artifact_id"])
    cli_exact = artifact_dir_cli_is_exact(raw_dir, show_paths=show_paths)
    prefix = artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)
    note = (
        None
        if cli_exact
        else "custom artifact directory is redacted; rerun with the same --dir value or pass --show-paths for a directly executable local command"
    )
    commands: list[dict[str, object]] = [
        rehydration_command_record(
            kind="metadata",
            selector={"type": "receipt"},
            cli=f"{prefix} receipt {artifact_id} --json",
            exact=cli_exact,
            note=note,
        )
    ]

    retrieval = metadata.get("retrieval")
    hints = retrieval.get("hints") if isinstance(retrieval, dict) else None
    if isinstance(hints, list):
        for hint in hints:
            if len(commands) >= max_commands:
                break
            if not isinstance(hint, dict):
                continue
            hint_type = hint.get("type")
            selector = hint.get("selector")
            if not isinstance(selector, dict):
                selector = {}
            cli: str | None = None
            exact = bool(hint.get("exact", True)) and cli_exact
            if hint_type == "lines":
                start = selector.get("start")
                end = selector.get("end")
                # Booleans pass isinstance(..., int); reject them so a stored
                # JSON true/false never becomes a line number.
                if _is_plain_int(start) and _is_plain_int(end):
                    cli = line_query_cli(artifact_id, start, end, raw_dir=raw_dir, show_paths=show_paths)
            elif hint_type == "pattern":
                pattern = selector.get("pattern")
                if isinstance(pattern, str) and pattern:
                    cli = f"{prefix} get {artifact_id} --pattern {shlex.quote(pattern)}"
            elif hint_type == "head":
                max_lines = selector.get("max_lines")
                # Same bool guard: avoids emitting "--max-lines True".
                if _is_plain_int(max_lines) and max_lines > 0:
                    cli = f"{prefix} get {artifact_id} --max-lines {max_lines}"
            if cli is None:
                # Fall back to the CLI string stored with the hint, if any.
                raw_cli = hint.get("cli")
                cli = raw_cli if isinstance(raw_cli, str) and raw_cli else None
            if cli:
                commands.append(
                    rehydration_command_record(
                        kind=str(hint_type or "query"),
                        selector=selector,
                        cli=cli,
                        exact=exact,
                        # The redaction note wins; otherwise reuse the hint's own note.
                        note=note if not cli_exact else str(hint.get("note") or "") or None,
                    )
                )

    digest = metadata.get("digest")
    top_error_lines = digest.get("top_error_lines") if isinstance(digest, dict) else None
    if isinstance(top_error_lines, list):
        anchor = first_error_anchor("\n".join(str(line) for line in top_error_lines))
        if anchor and len(commands) < max_commands:
            commands.append(
                rehydration_command_record(
                    kind="search",
                    selector={"type": "literal", "pattern": anchor},
                    cli=f"{prefix} search {shlex.quote(anchor)} --json",
                    exact=cli_exact,
                    note=note,
                )
            )

    if default_artifact_dir_requested(raw_dir or DEFAULT_ARTIFACT_DIR):
        dir_argument = "default"
    else:
        dir_argument = "included" if show_paths else "redacted"
    return {
        "commands": commands,
        "dir_argument": dir_argument,
        "exact_commands": cli_exact,
        "note": note,
    }
|
|
976
|
+
|
|
977
|
+
|
|
978
|
+
def build_output_sandbox_envelope(
    metadata: dict[str, object],
    *,
    raw_dir: str | None = None,
    show_paths: bool = False,
) -> dict[str, object]:
    """Assemble the ``output_sandbox`` envelope for a stored artifact.

    The envelope combines the schema version, the artifact handle and id, the
    compact stored-output record, the digest summary, the rehydration
    commands, fixed agent guidance strings, and a fixed ``claim_boundary``
    block. ``raw_dir`` and ``show_paths`` are forwarded to
    ``build_output_sandbox_rehydration``.

    Raises:
        KeyError: if ``metadata`` has no ``artifact_id``.
    """
    artifact_id = str(metadata["artifact_id"])
    handle = artifact_handle(artifact_id)
    stored_output = compact_stored_output(metadata)
    summary = build_output_sandbox_summary(metadata)
    rehydration = build_output_sandbox_rehydration(metadata, raw_dir=raw_dir, show_paths=show_paths)
    guidance = [
        "Keep this compact receipt in agent context instead of pasting the full output.",
        "Before relying on omitted details, rehydrate the exact sanitized slice with one of rehydration.commands[].cli.",
        "For repeated diagnostics, query narrower lines or literal matches instead of rerunning broad commands unchanged.",
    ]
    claim_boundary = {
        "local_only": True,
        "stored_content_is_sanitized_copy": True,
        "hosted_api_token_or_cost_savings_claim_allowed": False,
        "exact_rehydration_required_before_relying_on_omitted_detail": True,
    }
    return {
        "schema_version": OUTPUT_SANDBOX_SCHEMA_VERSION,
        "mode": "local_artifact_receipt",
        "handle": handle,
        "artifact_id": artifact_id,
        "stored_output": stored_output,
        "summary": summary,
        "rehydration": rehydration,
        "agent_guidance": guidance,
        "claim_boundary": claim_boundary,
    }
|
|
1005
|
+
|
|
1006
|
+
|
|
1007
|
+
def receipt_for(
|
|
1008
|
+
metadata: dict[str, object],
|
|
1009
|
+
*,
|
|
1010
|
+
raw_dir: str | None = None,
|
|
1011
|
+
show_paths: bool = False,
|
|
1012
|
+
) -> dict[str, object]:
|
|
773
1013
|
artifact_id = str(metadata["artifact_id"])
|
|
774
1014
|
return {
|
|
775
1015
|
"artifact_id": artifact_id,
|
|
@@ -782,11 +1022,12 @@ def receipt_for(metadata: dict[str, object]) -> dict[str, object]:
|
|
|
782
1022
|
"digest": metadata.get("digest"),
|
|
783
1023
|
"retrieval": metadata.get("retrieval"),
|
|
784
1024
|
"available_queries": [
|
|
785
|
-
|
|
786
|
-
f"
|
|
787
|
-
f"
|
|
1025
|
+
line_query_cli(artifact_id, 1, 80, raw_dir=raw_dir, show_paths=show_paths),
|
|
1026
|
+
f"{artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)} get {artifact_id} --pattern ERROR --max-lines 40",
|
|
1027
|
+
f"{artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)} get {artifact_id} --json --lines 1:20",
|
|
788
1028
|
],
|
|
789
1029
|
"suggested_queries": suggested_queries_for(metadata),
|
|
1030
|
+
"output_sandbox": build_output_sandbox_envelope(metadata, raw_dir=raw_dir, show_paths=show_paths),
|
|
790
1031
|
}
|
|
791
1032
|
|
|
792
1033
|
|
|
@@ -896,7 +1137,13 @@ def store_command(args: argparse.Namespace) -> int:
|
|
|
896
1137
|
"content_file": content_path.name,
|
|
897
1138
|
"metadata_file": meta_path.name,
|
|
898
1139
|
},
|
|
899
|
-
"digest": build_digest(
|
|
1140
|
+
"digest": build_digest(
|
|
1141
|
+
sanitized_text,
|
|
1142
|
+
artifact_id=artifact_id,
|
|
1143
|
+
redacted_lines=redacted_lines,
|
|
1144
|
+
raw_dir=args.dir,
|
|
1145
|
+
show_paths=args.show_paths,
|
|
1146
|
+
),
|
|
900
1147
|
"retrieval": {
|
|
901
1148
|
"strategy": strategy,
|
|
902
1149
|
"deterministic": True,
|
|
@@ -906,17 +1153,22 @@ def store_command(args: argparse.Namespace) -> int:
|
|
|
906
1153
|
content_type=content_type,
|
|
907
1154
|
strategy=strategy,
|
|
908
1155
|
total_lines=total_lines,
|
|
1156
|
+
raw_dir=args.dir,
|
|
1157
|
+
show_paths=args.show_paths,
|
|
909
1158
|
),
|
|
910
1159
|
},
|
|
911
1160
|
}
|
|
912
1161
|
shrink_digest_for_metadata_cap(metadata)
|
|
913
1162
|
write_private_text(content_path, sanitized_text)
|
|
914
1163
|
write_private_text(meta_path, metadata_json_text(metadata))
|
|
915
|
-
receipt = receipt_for(metadata)
|
|
1164
|
+
receipt = receipt_for(metadata, raw_dir=args.dir, show_paths=args.show_paths)
|
|
916
1165
|
if args.json:
|
|
917
1166
|
print(json.dumps(receipt, ensure_ascii=False, indent=2, sort_keys=True))
|
|
918
1167
|
else:
|
|
919
1168
|
print(f"artifact_id={artifact_id}")
|
|
1169
|
+
sandbox = receipt.get("output_sandbox")
|
|
1170
|
+
handle = sandbox.get("handle") if isinstance(sandbox, dict) else artifact_handle(artifact_id)
|
|
1171
|
+
print(f"handle={handle}")
|
|
920
1172
|
stored = receipt["stored_output"]
|
|
921
1173
|
if isinstance(stored, dict):
|
|
922
1174
|
print(f"stored_output={stored.get('lines')} lines/{stored.get('bytes')} bytes")
|
|
@@ -925,7 +1177,16 @@ def store_command(args: argparse.Namespace) -> int:
|
|
|
925
1177
|
print("top_error_lines:")
|
|
926
1178
|
for line in digest["top_error_lines"]: # type: ignore[index]
|
|
927
1179
|
print(f"- {line}")
|
|
928
|
-
|
|
1180
|
+
available_queries = receipt.get("available_queries")
|
|
1181
|
+
if isinstance(available_queries, list) and available_queries:
|
|
1182
|
+
print(f"query={available_queries[0]}")
|
|
1183
|
+
rehydration = sandbox.get("rehydration") if isinstance(sandbox, dict) else None
|
|
1184
|
+
commands = rehydration.get("commands") if isinstance(rehydration, dict) else None
|
|
1185
|
+
if isinstance(commands, list):
|
|
1186
|
+
for command in commands:
|
|
1187
|
+
if isinstance(command, dict) and command.get("type") != "metadata" and isinstance(command.get("cli"), str):
|
|
1188
|
+
print(f"rehydrate={command['cli']}")
|
|
1189
|
+
break
|
|
929
1190
|
return 0
|
|
930
1191
|
|
|
931
1192
|
|
|
@@ -1205,6 +1466,44 @@ def get_command(args: argparse.Namespace) -> int:
|
|
|
1205
1466
|
return 0
|
|
1206
1467
|
|
|
1207
1468
|
|
|
1469
|
+
def receipt_command(args: argparse.Namespace) -> int:
    """Handle the ``receipt`` subcommand: print a receipt without artifact content.

    Each directory yielded by ``artifact_read_directories(args.dir)`` is tried
    in turn; the first one where ``load_verified_artifact`` succeeds supplies
    the metadata. If every directory raises ``FileNotFoundError`` the last such
    error is reported; if no directory was tried at all, a generic
    "artifact not found" error is reported.

    Returns:
        0 on success, 1 after printing an error to stderr.

    NOTE(review): only ``OSError`` and ``ValueError`` families are caught here;
    confirm whether helper-raised ``RuntimeError`` is handled by the caller.
    """
    artifact_id = args.artifact_id
    try:
        missing_error: FileNotFoundError | None = None
        for directory in artifact_read_directories(args.dir):
            try:
                metadata, _content_path, _content = load_verified_artifact(directory, artifact_id)
            except FileNotFoundError as exc:
                # Remember the miss and keep trying the remaining directories.
                missing_error = exc
                continue
            break
        else:
            if missing_error is None:
                raise FileNotFoundError(f"artifact not found: {artifact_id}")
            raise missing_error
        reveal_paths = bool(getattr(args, "show_paths", False))
        receipt = receipt_for(metadata, raw_dir=args.dir, show_paths=reveal_paths)
    except (FileNotFoundError, ValueError, OSError, json.JSONDecodeError) as exc:
        print(f"context-guard-artifact: {exc}", file=sys.stderr)
        return 1

    if args.json:
        print(json.dumps(receipt, ensure_ascii=False, indent=2, sort_keys=True))
        return 0

    # Plain-text rendering.
    sandbox = receipt.get("output_sandbox")
    sandbox_is_dict = isinstance(sandbox, dict)
    handle = sandbox.get("handle") if sandbox_is_dict else artifact_handle(artifact_id)
    print(f"artifact_id={artifact_id}")
    print(f"handle={handle}")
    stored = receipt.get("stored_output")
    if isinstance(stored, dict):
        print(f"stored_output={stored.get('lines')} lines/{stored.get('bytes')} bytes")
    rehydration = sandbox.get("rehydration") if sandbox_is_dict else None
    commands = rehydration.get("commands") if isinstance(rehydration, dict) else None
    if isinstance(commands, list):
        # Show at most the first four rehydration commands.
        for command in commands[:4]:
            if not isinstance(command, dict):
                continue
            if command.get("cli"):
                print(f"rehydrate={command.get('cli')}")
    print("claim_boundary=local sanitized artifact; no hosted token/cost savings claim")
    return 0
|
|
1505
|
+
|
|
1506
|
+
|
|
1208
1507
|
def search_command(args: argparse.Namespace) -> int:
|
|
1209
1508
|
try:
|
|
1210
1509
|
literal = search_literal(args.pattern)
|
|
@@ -1355,7 +1654,7 @@ def list_command(args: argparse.Namespace) -> int:
|
|
|
1355
1654
|
continue
|
|
1356
1655
|
artifact_id = str(data.get("artifact_id", "")) if isinstance(data, dict) else ""
|
|
1357
1656
|
if isinstance(data, dict) and ARTIFACT_ID_RE.fullmatch(artifact_id) and artifact_id not in seen:
|
|
1358
|
-
items.append(receipt_for(data))
|
|
1657
|
+
items.append(receipt_for(data, raw_dir=args.dir, show_paths=False))
|
|
1359
1658
|
seen.add(artifact_id)
|
|
1360
1659
|
items.sort(key=lambda item: str(item.get("artifact_id", "")))
|
|
1361
1660
|
if args.json:
|
|
@@ -1396,6 +1695,16 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
1396
1695
|
get.add_argument("--json", action="store_true", help="emit query JSON with content")
|
|
1397
1696
|
get.set_defaults(func=get_command)
|
|
1398
1697
|
|
|
1698
|
+
receipt = subparsers.add_parser("receipt", help="print metadata-only receipt and rehydration handle for a stored artifact")
|
|
1699
|
+
receipt.add_argument("artifact_id")
|
|
1700
|
+
receipt.add_argument(
|
|
1701
|
+
"--show-paths",
|
|
1702
|
+
action="store_true",
|
|
1703
|
+
help="show raw custom --dir values in rehydration commands; local debugging only because private paths may be exposed",
|
|
1704
|
+
)
|
|
1705
|
+
receipt.add_argument("--json", action="store_true", help="emit receipt JSON without artifact content")
|
|
1706
|
+
receipt.set_defaults(func=receipt_command)
|
|
1707
|
+
|
|
1399
1708
|
list_parser = subparsers.add_parser("list", help="list stored artifacts")
|
|
1400
1709
|
list_parser.add_argument("--json", action="store_true", help="emit list JSON")
|
|
1401
1710
|
list_parser.set_defaults(func=list_command)
|
|
@@ -56,8 +56,10 @@ JSON_PARSE_RECURSION_LIMIT = 10_000
|
|
|
56
56
|
READ_CHUNK_BYTES = 64 * 1024
|
|
57
57
|
DEFAULT_MAX_FILE_BYTES = 50 * 1024 * 1024
|
|
58
58
|
DEFAULT_MAX_LINE_BYTES = 2 * 1024 * 1024
|
|
59
|
+
DEFAULT_MAX_SCAN_FILES = 100_000
|
|
59
60
|
MAX_FILE_BYTES_LIMIT = 2 * 1024 * 1024 * 1024
|
|
60
61
|
MAX_LINE_BYTES_LIMIT = 128 * 1024 * 1024
|
|
62
|
+
MAX_SCAN_FILES_LIMIT = 1_000_000
|
|
61
63
|
SECRET_VALUE_RE = re.compile(
|
|
62
64
|
r"(?i)(gh[pousr]_[A-Za-z0-9_]{8,}|github_pat_[A-Za-z0-9_]{20,}|"
|
|
63
65
|
r"xox[abprs]-[A-Za-z0-9-]{8,}|(?:AKIA|ASIA)[0-9A-Z]{8,}|"
|
|
@@ -143,14 +145,14 @@ class PromptCacheAudit:
|
|
|
143
145
|
|
|
144
146
|
def observe(self, root: Any) -> None:
|
|
145
147
|
self.sampled_records += 1
|
|
148
|
+
if len(self.samples) >= PROMPT_AUDIT_MAX_RECORDS:
|
|
149
|
+
self.capped_records += 1
|
|
150
|
+
return
|
|
146
151
|
segments, bytes_sampled, redactions, collection_capped = prompt_segments_for_record(root)
|
|
147
152
|
if collection_capped:
|
|
148
153
|
self.prompt_collection_capped_records += 1
|
|
149
154
|
if not segments:
|
|
150
155
|
return
|
|
151
|
-
if len(self.samples) >= PROMPT_AUDIT_MAX_RECORDS:
|
|
152
|
-
self.capped_records += 1
|
|
153
|
-
return
|
|
154
156
|
self.analyzed_prompt_records += 1
|
|
155
157
|
self.total_segments += len(segments)
|
|
156
158
|
self.total_bytes_sampled += bytes_sampled
|
|
@@ -169,6 +171,8 @@ class UsageSummary:
|
|
|
169
171
|
files: int = 0
|
|
170
172
|
records: int = 0
|
|
171
173
|
skipped_files: int = 0
|
|
174
|
+
unscanned_files_lower_bound: int = 0
|
|
175
|
+
scan_truncated: bool = False
|
|
172
176
|
skipped_records: int = 0
|
|
173
177
|
parse_errors: list[str] = field(default_factory=list)
|
|
174
178
|
tokens: Counter[str] = field(default_factory=Counter)
|
|
@@ -618,6 +622,7 @@ def os_error_summary(exc: OSError) -> str:
|
|
|
618
622
|
class ScanLimits:
|
|
619
623
|
max_file_bytes: int = DEFAULT_MAX_FILE_BYTES
|
|
620
624
|
max_line_bytes: int = DEFAULT_MAX_LINE_BYTES
|
|
625
|
+
max_files: int = DEFAULT_MAX_SCAN_FILES
|
|
621
626
|
|
|
622
627
|
|
|
623
628
|
def open_regular_no_symlink(file: Path):
|
|
@@ -809,6 +814,15 @@ def scan(
|
|
|
809
814
|
limits = limits or ScanLimits()
|
|
810
815
|
summary = UsageSummary()
|
|
811
816
|
for file in iter_jsonl_files(paths):
|
|
817
|
+
if summary.files >= limits.max_files:
|
|
818
|
+
summary.skipped_files += 1
|
|
819
|
+
summary.unscanned_files_lower_bound += 1
|
|
820
|
+
summary.scan_truncated = True
|
|
821
|
+
summary.note_error(
|
|
822
|
+
f"transcript scan file limit reached ({limits.max_files}); "
|
|
823
|
+
"rerun with narrower paths or --max-files if more evidence is required"
|
|
824
|
+
)
|
|
825
|
+
break
|
|
812
826
|
summary.files += 1
|
|
813
827
|
try:
|
|
814
828
|
with open_regular_no_symlink(file) as handle:
|
|
@@ -925,6 +939,8 @@ def scan_integrity(summary: UsageSummary) -> dict[str, Any]:
|
|
|
925
939
|
"files_scanned": summary.files,
|
|
926
940
|
"records_scanned": summary.records,
|
|
927
941
|
"skipped_files": summary.skipped_files,
|
|
942
|
+
"unscanned_files_lower_bound": summary.unscanned_files_lower_bound,
|
|
943
|
+
"scan_truncated": summary.scan_truncated,
|
|
928
944
|
"skipped_records": summary.skipped_records,
|
|
929
945
|
"parse_error_count": len(summary.parse_errors),
|
|
930
946
|
"complete": complete,
|
|
@@ -2151,11 +2167,14 @@ def summary_json(
|
|
|
2151
2167
|
"files": summary.files,
|
|
2152
2168
|
"records": summary.records,
|
|
2153
2169
|
"skipped_files": summary.skipped_files,
|
|
2170
|
+
"unscanned_files_lower_bound": summary.unscanned_files_lower_bound,
|
|
2171
|
+
"scan_truncated": summary.scan_truncated,
|
|
2154
2172
|
"skipped_records": summary.skipped_records,
|
|
2155
2173
|
"parse_errors": summary.parse_errors,
|
|
2156
2174
|
"scan_limits": {
|
|
2157
2175
|
"max_file_bytes": limits.max_file_bytes,
|
|
2158
2176
|
"max_line_bytes": limits.max_line_bytes,
|
|
2177
|
+
"max_files": limits.max_files,
|
|
2159
2178
|
},
|
|
2160
2179
|
"total_tokens": summary.total_tokens,
|
|
2161
2180
|
"tokens": dict(summary.tokens),
|
|
@@ -2221,10 +2240,17 @@ def main() -> int:
|
|
|
2221
2240
|
default=DEFAULT_MAX_LINE_BYTES,
|
|
2222
2241
|
help="skip individual JSONL records larger than this many bytes (default: 2 MiB)",
|
|
2223
2242
|
)
|
|
2243
|
+
parser.add_argument(
|
|
2244
|
+
"--max-files",
|
|
2245
|
+
type=int,
|
|
2246
|
+
default=DEFAULT_MAX_SCAN_FILES,
|
|
2247
|
+
help=f"stop after this many transcript files (default: {DEFAULT_MAX_SCAN_FILES})",
|
|
2248
|
+
)
|
|
2224
2249
|
args = parser.parse_args()
|
|
2225
2250
|
limits = ScanLimits(
|
|
2226
2251
|
max_file_bytes=require_scan_limit(parser, "--max-file-bytes", args.max_file_bytes, MAX_FILE_BYTES_LIMIT),
|
|
2227
2252
|
max_line_bytes=require_scan_limit(parser, "--max-line-bytes", args.max_line_bytes, MAX_LINE_BYTES_LIMIT),
|
|
2253
|
+
max_files=require_scan_limit(parser, "--max-files", args.max_files, MAX_SCAN_FILES_LIMIT),
|
|
2228
2254
|
)
|
|
2229
2255
|
|
|
2230
2256
|
summary = scan(args.paths, show_paths=args.show_paths, show_commands=args.show_commands, limits=limits)
|
|
@@ -2248,9 +2274,14 @@ def main() -> int:
|
|
|
2248
2274
|
print("Claude Code transcript usage audit")
|
|
2249
2275
|
print(
|
|
2250
2276
|
f"files_scanned={summary.files} records={summary.records} "
|
|
2251
|
-
f"skipped_files={summary.skipped_files} skipped_records={summary.skipped_records}"
|
|
2277
|
+
f"skipped_files={summary.skipped_files} skipped_records={summary.skipped_records} "
|
|
2278
|
+
f"scan_truncated={str(summary.scan_truncated).lower()} "
|
|
2279
|
+
f"unscanned_files_lower_bound={summary.unscanned_files_lower_bound}"
|
|
2280
|
+
)
|
|
2281
|
+
print(
|
|
2282
|
+
f"scan_limits=max_file_bytes:{limits.max_file_bytes} "
|
|
2283
|
+
f"max_line_bytes:{limits.max_line_bytes} max_files:{limits.max_files}"
|
|
2252
2284
|
)
|
|
2253
|
-
print(f"scan_limits=max_file_bytes:{limits.max_file_bytes} max_line_bytes:{limits.max_line_bytes}")
|
|
2254
2285
|
print(f"observed_total_tokens={summary.total_tokens}")
|
|
2255
2286
|
if summary.cost_usd:
|
|
2256
2287
|
print(f"observed_cost_usd={summary.cost_usd:.4f}")
|