npm - @ictechgy/context-guard - Versions diffs - 0.4.10 → 0.4.12 - Mend

@ictechgy/context-guard 0.4.10 → 0.4.12

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (32)

package/CHANGELOG.md +17 -1
package/README.ko.md +46 -28
package/README.md +42 -33
package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
package/docs/benchmark-workflow-examples.md +3 -0
package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
package/docs/experimental-benchmark-fixtures.md +24 -7
package/package.json +2 -1
package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
package/plugins/context-guard/README.ko.md +14 -11
package/plugins/context-guard/README.md +15 -14
package/plugins/context-guard/bin/context-guard +48 -17
package/plugins/context-guard/bin/context-guard-artifact +342 -33
package/plugins/context-guard/bin/context-guard-audit +36 -5
package/plugins/context-guard/bin/context-guard-bench +1675 -44
package/plugins/context-guard/bin/context-guard-cache-score +347 -35
package/plugins/context-guard/bin/context-guard-compress +89 -27
package/plugins/context-guard/bin/context-guard-cost +7 -2
package/plugins/context-guard/bin/context-guard-experiments +364 -8
package/plugins/context-guard/bin/context-guard-failed-nudge +6 -2
package/plugins/context-guard/bin/context-guard-filter +88 -18
package/plugins/context-guard/bin/context-guard-pack +329 -19
package/plugins/context-guard/bin/context-guard-read-symbol +27 -0
package/plugins/context-guard/bin/context-guard-sanitize-output +245 -18
package/plugins/context-guard/bin/context-guard-setup +21 -5
package/plugins/context-guard/bin/context-guard-tool-prune +287 -62
package/plugins/context-guard/bin/context-guard-trim-output +394 -90
package/plugins/context-guard/brief/README.md +5 -5
package/plugins/context-guard/lib/context_guard_command_manifest_loader.py +123 -0
package/plugins/context-guard/lib/context_guard_commands.py +217 -190

package/plugins/context-guard/bin/context-guard-artifact CHANGED Viewed

@@ -33,6 +33,7 @@ MAX_TOP_ERROR_RECEIPTS = 12
 MAX_DUPLICATE_GROUPS = 12
 MAX_SUGGESTED_QUERIES = 12
 SEARCH_SCHEMA_VERSION = "contextguard.artifact.search.v1"
+OUTPUT_SANDBOX_SCHEMA_VERSION = "contextguard.artifact.output-sandbox.v1"
 DEFAULT_SEARCH_MAX_ARTIFACTS = 100
 MAX_SEARCH_MAX_ARTIFACTS = 1_000
 DEFAULT_SEARCH_MAX_MATCHES = 40
@@ -261,22 +262,38 @@ def reject_symlink_components(path: Path) -> None:
 def regular_private_file_size(path: Path) -> int:
     path = normalize_allowed_first_absolute_symlink(path)
-    reject_symlink_components(path.parent)
-    st = os.lstat(path)
-    if stat.S_ISLNK(st.st_mode):
-        raise ValueError(f"artifact file must not be a symlink: {path.name}")
-    if not stat.S_ISREG(st.st_mode):
-        raise ValueError(f"artifact file must be a regular file: {path.name}")
-    return int(st.st_size)
+    parent_fd = open_private_directory_no_follow(path.parent, label="artifact directory", create=False)
+    try:
+        leaf = path.name
+        if leaf in {"", ".", ".."}:
+            raise ValueError("artifact file must name a regular file")
+        if not DIR_FD_STAT_SUPPORTED:
+            raise RuntimeError("artifact reads require dir_fd stat support")
+        st = os.stat(leaf, dir_fd=parent_fd, follow_symlinks=False)
+        if stat.S_ISLNK(st.st_mode):
+            raise ValueError(f"artifact file must not be a symlink: {path.name}")
+        if not stat.S_ISREG(st.st_mode):
+            raise ValueError(f"artifact file must be a regular file: {path.name}")
+        return int(st.st_size)
+    finally:
+        os.close(parent_fd)
 def read_bounded_private_text(path: Path, max_bytes: int) -> str:
     path = normalize_allowed_first_absolute_symlink(path)
-    size = regular_private_file_size(path)
-    if size > max_bytes:
-        raise ValueError(f"artifact file exceeds trusted size cap: {path.name}: {size} > {max_bytes}")
-    flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
-    fd = os.open(str(path), flags)
+    parent_fd = open_private_directory_no_follow(path.parent, label="artifact directory", create=False)
+    flags = os.O_RDONLY | os.O_NOFOLLOW
+    if hasattr(os, "O_CLOEXEC"):
+        flags |= os.O_CLOEXEC
+    leaf = path.name
+    if leaf in {"", ".", ".."}:
+        os.close(parent_fd)
+        raise ValueError("artifact file must name a regular file")
+    try:
+        fd = os.open(leaf, flags, dir_fd=parent_fd)
+    except OSError:
+        os.close(parent_fd)
+        raise
     try:
         st = os.fstat(fd)
         if not stat.S_ISREG(st.st_mode):
@@ -289,6 +306,7 @@ def read_bounded_private_text(path: Path, max_bytes: int) -> str:
         return data.decode("utf-8", errors="replace")
     finally:
         os.close(fd)
+        os.close(parent_fd)
 def no_follow_dir_flags() -> int:
@@ -351,6 +369,8 @@ def open_private_directory_no_follow(path: Path, *, label: str, create: bool) ->
         owned_fd = current_fd
         current_fd = -1
         return owned_fd
+    except FileNotFoundError:
+        raise
     except OSError as exc:
         raise RuntimeError(f"could not inspect {label}: {os_error_detail(exc)}") from exc
     finally:
@@ -574,6 +594,8 @@ def build_retrieval_hints(
     content_type: str,
     strategy: str,
     total_lines: int,
+    raw_dir: str | None = None,
+    show_paths: bool = False,
 ) -> list[dict[str, object]]:
     """Build deterministic, machine-readable retrieval hints for bounded round-trip.
@@ -591,8 +613,8 @@ def build_retrieval_hints(
         lines_hint: dict[str, object] = {
             "type": "lines",
             "selector": {"start": 1, "end": end_line},
-            "cli": line_query_cli(artifact_id, 1, end_line),
-            "exact": total_lines <= MAX_QUERY_LINES,
+            "cli": line_query_cli(artifact_id, 1, end_line, raw_dir=raw_dir, show_paths=show_paths),
+            "exact": total_lines <= MAX_QUERY_LINES and artifact_dir_cli_is_exact(raw_dir, show_paths=show_paths),
         }
         if end_line > DEFAULT_MAX_LINES:
             lines_hint["max_lines"] = end_line
@@ -614,14 +636,14 @@ def build_retrieval_hints(
             {
                 "type": "pattern",
                 "selector": {"pattern": anchor},
-                "cli": f"context-guard-artifact get {artifact_id} --pattern '{anchor}'",
+                "cli": f"{artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)} get {artifact_id} --pattern {shlex.quote(anchor)}",
             }
         )
     hints.append(
         {
             "type": "head",
             "selector": {"max_lines": DEFAULT_MAX_LINES},
-            "cli": f"context-guard-artifact get {artifact_id} --max-lines {DEFAULT_MAX_LINES}",
+            "cli": f"{artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)} get {artifact_id} --max-lines {DEFAULT_MAX_LINES}",
         }
     )
     return hints
@@ -654,16 +676,29 @@ def line_query_cli(
     return cli
-def line_receipt(artifact_id: str, line_number: int, text: str) -> dict[str, object]:
+def line_receipt(
+    artifact_id: str,
+    line_number: int,
+    text: str,
+    *,
+    raw_dir: str | None = None,
+    show_paths: bool = False,
+) -> dict[str, object]:
     return {
         "line": line_number,
         "text": cap_digest_text(text.strip()),
         "selector": {"type": "lines", "start": line_number, "end": line_number},
-        "cli": line_query_cli(artifact_id, line_number, line_number),
+        "cli": line_query_cli(artifact_id, line_number, line_number, raw_dir=raw_dir, show_paths=show_paths),
     }
-def build_top_error_receipts(artifact_id: str, lines: list[str]) -> list[dict[str, object]]:
+def build_top_error_receipts(
+    artifact_id: str,
+    lines: list[str],
+    *,
+    raw_dir: str | None = None,
+    show_paths: bool = False,
+) -> list[dict[str, object]]:
     receipts: list[dict[str, object]] = []
     seen: set[str] = set()
     for line_number, line in enumerate(lines, start=1):
@@ -672,7 +707,7 @@ def build_top_error_receipts(artifact_id: str, lines: list[str]) -> list[dict[st
         text = cap_digest_text(line.strip())
         if not text or text in seen:
             continue
-        receipt = line_receipt(artifact_id, line_number, text)
+        receipt = line_receipt(artifact_id, line_number, text, raw_dir=raw_dir, show_paths=show_paths)
         receipts.append(receipt)
         seen.add(text)
         if len(receipts) >= MAX_TOP_ERROR_RECEIPTS:
@@ -680,7 +715,14 @@ def build_top_error_receipts(artifact_id: str, lines: list[str]) -> list[dict[st
     return receipts
-def build_duplicate_line_groups(artifact_id: str, lines: list[str], *, limit: int = MAX_DUPLICATE_GROUPS) -> list[dict[str, object]]:
+def build_duplicate_line_groups(
+    artifact_id: str,
+    lines: list[str],
+    *,
+    limit: int = MAX_DUPLICATE_GROUPS,
+    raw_dir: str | None = None,
+    show_paths: bool = False,
+) -> list[dict[str, object]]:
     counts: dict[str, int] = {}
     first_line: dict[str, int] = {}
     for line_number, line in enumerate(lines, start=1):
@@ -703,13 +745,20 @@ def build_duplicate_line_groups(artifact_id: str, lines: list[str], *, limit: in
                 "first_line": line_number,
                 "text": text,
                 "selector": {"type": "lines", "start": line_number, "end": line_number},
-                "cli": line_query_cli(artifact_id, line_number, line_number),
+                "cli": line_query_cli(artifact_id, line_number, line_number, raw_dir=raw_dir, show_paths=show_paths),
             }
         )
     return groups
-def build_digest(sanitized_text: str, *, artifact_id: str, redacted_lines: int) -> dict[str, object]:
+def build_digest(
+    sanitized_text: str,
+    *,
+    artifact_id: str,
+    redacted_lines: int,
+    raw_dir: str | None = None,
+    show_paths: bool = False,
+) -> dict[str, object]:
     lines = sanitized_text.splitlines()
     top_errors = compact_items(
         (line for line in lines if ERROR_RE.search(line)),
@@ -725,8 +774,8 @@ def build_digest(sanitized_text: str, *, artifact_id: str, redacted_lines: int)
             "markers": sanitized_text.count("[REDACTED]"),
         },
         "top_error_lines": top_errors,
-        "top_error_receipts": build_top_error_receipts(artifact_id, lines),
-        "duplicate_line_groups": build_duplicate_line_groups(artifact_id, lines),
+        "top_error_receipts": build_top_error_receipts(artifact_id, lines, raw_dir=raw_dir, show_paths=show_paths),
+        "duplicate_line_groups": build_duplicate_line_groups(artifact_id, lines, raw_dir=raw_dir, show_paths=show_paths),
         "representative_head": compact_items(
             lines,
             limit=8,
@@ -769,7 +818,198 @@ def suggested_queries_for(metadata: dict[str, object]) -> list[str]:
     return queries[:MAX_SUGGESTED_QUERIES]
-def receipt_for(metadata: dict[str, object]) -> dict[str, object]:
+def artifact_handle(artifact_id: str) -> str:
+    return f"contextguard-artifact:{artifact_id}"
+def compact_stored_output(metadata: dict[str, object]) -> dict[str, object]:
+    stored = metadata.get("stored_output")
+    if not isinstance(stored, dict):
+        return {}
+    compact: dict[str, object] = {}
+    for key in ("scope", "bytes", "lines", "sha256", "content_file", "metadata_file"):
+        if key in stored:
+            compact[key] = stored[key]
+    content_type = metadata.get("content_type")
+    if isinstance(content_type, str):
+        compact["content_type"] = content_type
+    return compact
+def digest_count(digest: dict[str, object], key: str) -> int:
+    value = digest.get(key)
+    return len(value) if isinstance(value, list) else 0
+def build_output_sandbox_summary(metadata: dict[str, object]) -> dict[str, object]:
+    digest = metadata.get("digest")
+    if not isinstance(digest, dict):
+        return {"status": "stored"}
+    summary: dict[str, object] = {
+        "status": digest.get("status") or "stored",
+        "top_error_count": digest_count(digest, "top_error_lines"),
+        "top_error_receipt_count": digest_count(digest, "top_error_receipts"),
+        "duplicate_line_group_count": digest_count(digest, "duplicate_line_groups"),
+        "representative_head_count": digest_count(digest, "representative_head"),
+        "representative_tail_count": digest_count(digest, "representative_tail"),
+    }
+    redaction_counts = digest.get("redaction_counts")
+    if isinstance(redaction_counts, dict):
+        summary["redaction_counts"] = {
+            str(key): value
+            for key, value in redaction_counts.items()
+            if isinstance(value, (int, float, str, bool)) or value is None
+        }
+    elif "redacted_lines" in digest:
+        summary["redacted_lines"] = digest.get("redacted_lines")
+    capped = digest.get("capped_for_metadata")
+    if isinstance(capped, bool):
+        summary["capped_for_metadata"] = capped
+    return summary
+def rehydration_command_record(
+    *,
+    kind: str,
+    cli: str,
+    selector: dict[str, object],
+    exact: bool,
+    note: str | None = None,
+) -> dict[str, object]:
+    record: dict[str, object] = {
+        "type": kind,
+        "selector": selector,
+        "cli": cli,
+        "exact": exact,
+    }
+    if note:
+        record["note"] = note
+    return record
+def build_output_sandbox_rehydration(
+    metadata: dict[str, object],
+    *,
+    raw_dir: str | None = None,
+    show_paths: bool = False,
+) -> dict[str, object]:
+    artifact_id = str(metadata["artifact_id"])
+    cli_exact = artifact_dir_cli_is_exact(raw_dir, show_paths=show_paths)
+    prefix = artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)
+    note = (
+        None
+        if cli_exact
+        else "custom artifact directory is redacted; rerun with the same --dir value or pass --show-paths for a directly executable local command"
+    )
+    commands: list[dict[str, object]] = [
+        rehydration_command_record(
+            kind="metadata",
+            selector={"type": "receipt"},
+            cli=f"{prefix} receipt {artifact_id} --json",
+            exact=cli_exact,
+            note=note,
+        )
+    ]
+    retrieval = metadata.get("retrieval")
+    hints = retrieval.get("hints") if isinstance(retrieval, dict) else None
+    if isinstance(hints, list):
+        for hint in hints:
+            if not isinstance(hint, dict):
+                continue
+            hint_type = hint.get("type")
+            selector = hint.get("selector")
+            if not isinstance(selector, dict):
+                selector = {}
+            cli: str | None = None
+            exact = bool(hint.get("exact", True)) and cli_exact
+            if hint_type == "lines":
+                start = selector.get("start")
+                end = selector.get("end")
+                if isinstance(start, int) and isinstance(end, int):
+                    cli = line_query_cli(artifact_id, start, end, raw_dir=raw_dir, show_paths=show_paths)
+            elif hint_type == "pattern":
+                pattern = selector.get("pattern")
+                if isinstance(pattern, str) and pattern:
+                    cli = f"{prefix} get {artifact_id} --pattern {shlex.quote(pattern)}"
+            elif hint_type == "head":
+                max_lines = selector.get("max_lines")
+                if isinstance(max_lines, int) and max_lines > 0:
+                    cli = f"{prefix} get {artifact_id} --max-lines {max_lines}"
+            if cli is None:
+                raw_cli = hint.get("cli")
+                cli = raw_cli if isinstance(raw_cli, str) and raw_cli else None
+            if cli:
+                commands.append(
+                    rehydration_command_record(
+                        kind=str(hint_type or "query"),
+                        selector=selector,
+                        cli=cli,
+                        exact=exact,
+                        note=note if not cli_exact else str(hint.get("note") or "") or None,
+                    )
+                )
+            if len(commands) >= 5:
+                break
+    digest = metadata.get("digest")
+    top_error_lines = digest.get("top_error_lines") if isinstance(digest, dict) else None
+    if isinstance(top_error_lines, list):
+        anchor = first_error_anchor("\n".join(str(line) for line in top_error_lines))
+        if anchor and len(commands) < 5:
+            commands.append(
+                rehydration_command_record(
+                    kind="search",
+                    selector={"type": "literal", "pattern": anchor},
+                    cli=f"{prefix} search {shlex.quote(anchor)} --json",
+                    exact=cli_exact,
+                    note=note,
+                )
+            )
+    return {
+        "commands": commands,
+        "dir_argument": "default" if default_artifact_dir_requested(raw_dir or DEFAULT_ARTIFACT_DIR) else ("included" if show_paths else "redacted"),
+        "exact_commands": cli_exact,
+        "note": note,
+    }
+def build_output_sandbox_envelope(
+    metadata: dict[str, object],
+    *,
+    raw_dir: str | None = None,
+    show_paths: bool = False,
+) -> dict[str, object]:
+    artifact_id = str(metadata["artifact_id"])
+    return {
+        "schema_version": OUTPUT_SANDBOX_SCHEMA_VERSION,
+        "mode": "local_artifact_receipt",
+        "handle": artifact_handle(artifact_id),
+        "artifact_id": artifact_id,
+        "stored_output": compact_stored_output(metadata),
+        "summary": build_output_sandbox_summary(metadata),
+        "rehydration": build_output_sandbox_rehydration(metadata, raw_dir=raw_dir, show_paths=show_paths),
+        "agent_guidance": [
+            "Keep this compact receipt in agent context instead of pasting the full output.",
+            "Before relying on omitted details, rehydrate the exact sanitized slice with one of rehydration.commands[].cli.",
+            "For repeated diagnostics, query narrower lines or literal matches instead of rerunning broad commands unchanged.",
+        ],
+        "claim_boundary": {
+            "local_only": True,
+            "stored_content_is_sanitized_copy": True,
+            "hosted_api_token_or_cost_savings_claim_allowed": False,
+            "exact_rehydration_required_before_relying_on_omitted_detail": True,
+        },
+    }
+def receipt_for(
+    metadata: dict[str, object],
+    *,
+    raw_dir: str | None = None,
+    show_paths: bool = False,
+) -> dict[str, object]:
     artifact_id = str(metadata["artifact_id"])
     return {
         "artifact_id": artifact_id,
@@ -782,11 +1022,12 @@ def receipt_for(metadata: dict[str, object]) -> dict[str, object]:
         "digest": metadata.get("digest"),
         "retrieval": metadata.get("retrieval"),
         "available_queries": [
-            f"context-guard-artifact get {artifact_id} --lines 1:80",
-            f"context-guard-artifact get {artifact_id} --pattern ERROR --max-lines 40",
-            f"context-guard-artifact get {artifact_id} --json --lines 1:20",
+            line_query_cli(artifact_id, 1, 80, raw_dir=raw_dir, show_paths=show_paths),
+            f"{artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)} get {artifact_id} --pattern ERROR --max-lines 40",
+            f"{artifact_dir_cli_prefix(raw_dir, show_paths=show_paths)} get {artifact_id} --json --lines 1:20",
         ],
         "suggested_queries": suggested_queries_for(metadata),
+        "output_sandbox": build_output_sandbox_envelope(metadata, raw_dir=raw_dir, show_paths=show_paths),
     }
@@ -896,7 +1137,13 @@ def store_command(args: argparse.Namespace) -> int:
             "content_file": content_path.name,
             "metadata_file": meta_path.name,
         },
-        "digest": build_digest(sanitized_text, artifact_id=artifact_id, redacted_lines=redacted_lines),
+        "digest": build_digest(
+            sanitized_text,
+            artifact_id=artifact_id,
+            redacted_lines=redacted_lines,
+            raw_dir=args.dir,
+            show_paths=args.show_paths,
+        ),
         "retrieval": {
             "strategy": strategy,
             "deterministic": True,
@@ -906,17 +1153,22 @@ def store_command(args: argparse.Namespace) -> int:
                 content_type=content_type,
                 strategy=strategy,
                 total_lines=total_lines,
+                raw_dir=args.dir,
+                show_paths=args.show_paths,
             ),
         },
     }
     shrink_digest_for_metadata_cap(metadata)
     write_private_text(content_path, sanitized_text)
     write_private_text(meta_path, metadata_json_text(metadata))
-    receipt = receipt_for(metadata)
+    receipt = receipt_for(metadata, raw_dir=args.dir, show_paths=args.show_paths)
     if args.json:
         print(json.dumps(receipt, ensure_ascii=False, indent=2, sort_keys=True))
     else:
         print(f"artifact_id={artifact_id}")
+        sandbox = receipt.get("output_sandbox")
+        handle = sandbox.get("handle") if isinstance(sandbox, dict) else artifact_handle(artifact_id)
+        print(f"handle={handle}")
         stored = receipt["stored_output"]
         if isinstance(stored, dict):
             print(f"stored_output={stored.get('lines')} lines/{stored.get('bytes')} bytes")
@@ -925,7 +1177,16 @@ def store_command(args: argparse.Namespace) -> int:
             print("top_error_lines:")
             for line in digest["top_error_lines"]:  # type: ignore[index]
                 print(f"- {line}")
-        print(f"query=context-guard-artifact get {artifact_id} --lines 1:80")
+        available_queries = receipt.get("available_queries")
+        if isinstance(available_queries, list) and available_queries:
+            print(f"query={available_queries[0]}")
+        rehydration = sandbox.get("rehydration") if isinstance(sandbox, dict) else None
+        commands = rehydration.get("commands") if isinstance(rehydration, dict) else None
+        if isinstance(commands, list):
+            for command in commands:
+                if isinstance(command, dict) and command.get("type") != "metadata" and isinstance(command.get("cli"), str):
+                    print(f"rehydrate={command['cli']}")
+                    break
     return 0
@@ -1205,6 +1466,44 @@ def get_command(args: argparse.Namespace) -> int:
     return 0
+def receipt_command(args: argparse.Namespace) -> int:
+    artifact_id = args.artifact_id
+    try:
+        last_missing: FileNotFoundError | None = None
+        for directory in artifact_read_directories(args.dir):
+            try:
+                metadata, _content_path, _content = load_verified_artifact(directory, artifact_id)
+                break
+            except FileNotFoundError as exc:
+                last_missing = exc
+        else:
+            if last_missing is not None:
+                raise last_missing
+            raise FileNotFoundError(f"artifact not found: {artifact_id}")
+        receipt = receipt_for(metadata, raw_dir=args.dir, show_paths=bool(getattr(args, "show_paths", False)))
+    except (FileNotFoundError, ValueError, OSError, json.JSONDecodeError) as exc:
+        print(f"context-guard-artifact: {exc}", file=sys.stderr)
+        return 1
+    if args.json:
+        print(json.dumps(receipt, ensure_ascii=False, indent=2, sort_keys=True))
+    else:
+        sandbox = receipt.get("output_sandbox")
+        handle = sandbox.get("handle") if isinstance(sandbox, dict) else artifact_handle(artifact_id)
+        print(f"artifact_id={artifact_id}")
+        print(f"handle={handle}")
+        stored = receipt.get("stored_output")
+        if isinstance(stored, dict):
+            print(f"stored_output={stored.get('lines')} lines/{stored.get('bytes')} bytes")
+        rehydration = sandbox.get("rehydration") if isinstance(sandbox, dict) else None
+        commands = rehydration.get("commands") if isinstance(rehydration, dict) else None
+        if isinstance(commands, list):
+            for command in commands[:4]:
+                if isinstance(command, dict) and command.get("cli"):
+                    print(f"rehydrate={command.get('cli')}")
+        print("claim_boundary=local sanitized artifact; no hosted token/cost savings claim")
+    return 0
 def search_command(args: argparse.Namespace) -> int:
     try:
         literal = search_literal(args.pattern)
@@ -1355,7 +1654,7 @@ def list_command(args: argparse.Namespace) -> int:
                 continue
             artifact_id = str(data.get("artifact_id", "")) if isinstance(data, dict) else ""
             if isinstance(data, dict) and ARTIFACT_ID_RE.fullmatch(artifact_id) and artifact_id not in seen:
-                items.append(receipt_for(data))
+                items.append(receipt_for(data, raw_dir=args.dir, show_paths=False))
                 seen.add(artifact_id)
     items.sort(key=lambda item: str(item.get("artifact_id", "")))
     if args.json:
@@ -1396,6 +1695,16 @@ def build_parser() -> argparse.ArgumentParser:
     get.add_argument("--json", action="store_true", help="emit query JSON with content")
     get.set_defaults(func=get_command)
+    receipt = subparsers.add_parser("receipt", help="print metadata-only receipt and rehydration handle for a stored artifact")
+    receipt.add_argument("artifact_id")
+    receipt.add_argument(
+        "--show-paths",
+        action="store_true",
+        help="show raw custom --dir values in rehydration commands; local debugging only because private paths may be exposed",
+    )
+    receipt.add_argument("--json", action="store_true", help="emit receipt JSON without artifact content")
+    receipt.set_defaults(func=receipt_command)
     list_parser = subparsers.add_parser("list", help="list stored artifacts")
     list_parser.add_argument("--json", action="store_true", help="emit list JSON")
     list_parser.set_defaults(func=list_command)

package/plugins/context-guard/bin/context-guard-audit CHANGED Viewed

@@ -56,8 +56,10 @@ JSON_PARSE_RECURSION_LIMIT = 10_000
 READ_CHUNK_BYTES = 64 * 1024
 DEFAULT_MAX_FILE_BYTES = 50 * 1024 * 1024
 DEFAULT_MAX_LINE_BYTES = 2 * 1024 * 1024
+DEFAULT_MAX_SCAN_FILES = 100_000
 MAX_FILE_BYTES_LIMIT = 2 * 1024 * 1024 * 1024
 MAX_LINE_BYTES_LIMIT = 128 * 1024 * 1024
+MAX_SCAN_FILES_LIMIT = 1_000_000
 SECRET_VALUE_RE = re.compile(
     r"(?i)(gh[pousr]_[A-Za-z0-9_]{8,}|github_pat_[A-Za-z0-9_]{20,}|"
     r"xox[abprs]-[A-Za-z0-9-]{8,}|(?:AKIA|ASIA)[0-9A-Z]{8,}|"
@@ -143,14 +145,14 @@ class PromptCacheAudit:
     def observe(self, root: Any) -> None:
         self.sampled_records += 1
+        if len(self.samples) >= PROMPT_AUDIT_MAX_RECORDS:
+            self.capped_records += 1
+            return
         segments, bytes_sampled, redactions, collection_capped = prompt_segments_for_record(root)
         if collection_capped:
             self.prompt_collection_capped_records += 1
         if not segments:
             return
-        if len(self.samples) >= PROMPT_AUDIT_MAX_RECORDS:
-            self.capped_records += 1
-            return
         self.analyzed_prompt_records += 1
         self.total_segments += len(segments)
         self.total_bytes_sampled += bytes_sampled
@@ -169,6 +171,8 @@ class UsageSummary:
     files: int = 0
     records: int = 0
     skipped_files: int = 0
+    unscanned_files_lower_bound: int = 0
+    scan_truncated: bool = False
     skipped_records: int = 0
     parse_errors: list[str] = field(default_factory=list)
     tokens: Counter[str] = field(default_factory=Counter)
@@ -618,6 +622,7 @@ def os_error_summary(exc: OSError) -> str:
 class ScanLimits:
     max_file_bytes: int = DEFAULT_MAX_FILE_BYTES
     max_line_bytes: int = DEFAULT_MAX_LINE_BYTES
+    max_files: int = DEFAULT_MAX_SCAN_FILES
 def open_regular_no_symlink(file: Path):
@@ -809,6 +814,15 @@ def scan(
     limits = limits or ScanLimits()
     summary = UsageSummary()
     for file in iter_jsonl_files(paths):
+        if summary.files >= limits.max_files:
+            summary.skipped_files += 1
+            summary.unscanned_files_lower_bound += 1
+            summary.scan_truncated = True
+            summary.note_error(
+                f"transcript scan file limit reached ({limits.max_files}); "
+                "rerun with narrower paths or --max-files if more evidence is required"
+            )
+            break
         summary.files += 1
         try:
             with open_regular_no_symlink(file) as handle:
@@ -925,6 +939,8 @@ def scan_integrity(summary: UsageSummary) -> dict[str, Any]:
         "files_scanned": summary.files,
         "records_scanned": summary.records,
         "skipped_files": summary.skipped_files,
+        "unscanned_files_lower_bound": summary.unscanned_files_lower_bound,
+        "scan_truncated": summary.scan_truncated,
         "skipped_records": summary.skipped_records,
         "parse_error_count": len(summary.parse_errors),
         "complete": complete,
@@ -2151,11 +2167,14 @@ def summary_json(
         "files": summary.files,
         "records": summary.records,
         "skipped_files": summary.skipped_files,
+        "unscanned_files_lower_bound": summary.unscanned_files_lower_bound,
+        "scan_truncated": summary.scan_truncated,
         "skipped_records": summary.skipped_records,
         "parse_errors": summary.parse_errors,
         "scan_limits": {
             "max_file_bytes": limits.max_file_bytes,
             "max_line_bytes": limits.max_line_bytes,
+            "max_files": limits.max_files,
         },
         "total_tokens": summary.total_tokens,
         "tokens": dict(summary.tokens),
@@ -2221,10 +2240,17 @@ def main() -> int:
         default=DEFAULT_MAX_LINE_BYTES,
         help="skip individual JSONL records larger than this many bytes (default: 2 MiB)",
     )
+    parser.add_argument(
+        "--max-files",
+        type=int,
+        default=DEFAULT_MAX_SCAN_FILES,
+        help=f"stop after this many transcript files (default: {DEFAULT_MAX_SCAN_FILES})",
+    )
     args = parser.parse_args()
     limits = ScanLimits(
         max_file_bytes=require_scan_limit(parser, "--max-file-bytes", args.max_file_bytes, MAX_FILE_BYTES_LIMIT),
         max_line_bytes=require_scan_limit(parser, "--max-line-bytes", args.max_line_bytes, MAX_LINE_BYTES_LIMIT),
+        max_files=require_scan_limit(parser, "--max-files", args.max_files, MAX_SCAN_FILES_LIMIT),
     )
     summary = scan(args.paths, show_paths=args.show_paths, show_commands=args.show_commands, limits=limits)
@@ -2248,9 +2274,14 @@ def main() -> int:
     print("Claude Code transcript usage audit")
     print(
         f"files_scanned={summary.files} records={summary.records} "
-        f"skipped_files={summary.skipped_files} skipped_records={summary.skipped_records}"
+        f"skipped_files={summary.skipped_files} skipped_records={summary.skipped_records} "
+        f"scan_truncated={str(summary.scan_truncated).lower()} "
+        f"unscanned_files_lower_bound={summary.unscanned_files_lower_bound}"
+    )
+    print(
+        f"scan_limits=max_file_bytes:{limits.max_file_bytes} "
+        f"max_line_bytes:{limits.max_line_bytes} max_files:{limits.max_files}"
     )
-    print(f"scan_limits=max_file_bytes:{limits.max_file_bytes} max_line_bytes:{limits.max_line_bytes}")
     print(f"observed_total_tokens={summary.total_tokens}")
     if summary.cost_usd:
         print(f"observed_cost_usd={summary.cost_usd:.4f}")