@ictechgy/context-guard 0.4.3 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/CHANGELOG.md +13 -0
  2. package/README.ko.md +16 -3
  3. package/README.md +13 -3
  4. package/context-guard-kit/README.md +2 -2
  5. package/context-guard-kit/benchmark_runner.py +244 -6
  6. package/context-guard-kit/claude_transcript_cost_audit.py +443 -1
  7. package/docs/benchmark-fixtures/learned-compression-baseline-context-pack.prompt.example.md +19 -0
  8. package/docs/benchmark-fixtures/learned-compression-candidate-digest.prompt.example.md +21 -0
  9. package/docs/benchmark-fixtures/learned-compression.tasks.example.json +5 -1
  10. package/docs/benchmark-fixtures/output-transform-baseline-raw-output.prompt.example.md +20 -0
  11. package/docs/benchmark-fixtures/output-transform-digest-receipt.prompt.example.md +23 -0
  12. package/docs/benchmark-fixtures/output-transform.tasks.example.json +28 -0
  13. package/docs/benchmark-fixtures/output-transform.variants.example.json +10 -0
  14. package/docs/benchmark-fixtures/visual-ocr-cropped-ocr.prompt.example.md +22 -0
  15. package/docs/benchmark-fixtures/visual-ocr-full-visual.prompt.example.md +19 -0
  16. package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +5 -1
  17. package/docs/benchmark-workflow-examples.md +6 -2
  18. package/docs/benchmark-workflows/self-hosted-metrics-ledger.example.jsonl +1 -0
  19. package/docs/cache-diagnostics-schema.md +25 -4
  20. package/docs/experimental-benchmark-fixtures.md +17 -6
  21. package/docs/mac-visibility-feasibility-schema.md +62 -0
  22. package/docs/mac-visibility-feasibility.example.json +130 -0
  23. package/package.json +5 -1
  24. package/packaging/homebrew/context-guard.rb.template +1 -1
  25. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  26. package/plugins/context-guard/README.ko.md +3 -3
  27. package/plugins/context-guard/README.md +3 -3
  28. package/plugins/context-guard/bin/context-guard-audit +443 -1
  29. package/plugins/context-guard/bin/context-guard-bench +244 -6
@@ -27,6 +27,7 @@ Task fixture (`tasks.json`): 각 task 는 다음 필드를 가진다.
27
27
  "max_turns": 3,
28
28
  "max_budget_usd": 1.0,
29
29
  "allowed_tools": ["Read", "Edit", "Bash(npm test*)"],
30
+ "variant_prompt_files": {"context_hygiene": "t01.context_hygiene.prompt.md"},
30
31
  "success_command": "npm test -- auth/session",
31
32
  "success_cwd": "."
32
33
  }
@@ -183,6 +184,13 @@ MAX_USAGE_COST_USD = 10**9
183
184
  TOKEN_PROXY_BYTES_PER_TOKEN = 4
184
185
  BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
185
186
  MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
187
+ SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
188
+ SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
189
+ SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
190
+ MAX_SELF_HOSTED_LABEL_CHARS = 120
191
+ MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000
192
+ MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
193
+ MAX_VARIANT_PROMPT_FILE_BYTES = 128_000
186
194
  CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
187
195
  SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
188
196
  VERSION_OUTPUT_MAX_BYTES = 16_000
@@ -354,6 +362,8 @@ class TaskFixture:
354
362
  allowed_tools: list[str] = field(default_factory=list)
355
363
  success_command: str | None = None
356
364
  success_cwd: str = "."
365
+ variant_prompt_files: dict[str, str] = field(default_factory=dict)
366
+ variant_prompt_texts: dict[str, str] = field(default_factory=dict)
357
367
 
358
368
 
359
369
  @dataclass
@@ -387,6 +397,7 @@ class RunResult:
387
397
  provider_cached_tokens: int = 0
388
398
  provider_cached_tokens_measured: bool = False
389
399
  primary_tokens_measured: bool = False
400
+ self_hosted_metrics: dict[str, Any] | None = None
390
401
 
391
402
 
392
403
  @dataclass
@@ -433,6 +444,22 @@ def parse_string_list(value: Any, *, field: str, owner: str) -> list[str]:
433
444
  return items
434
445
 
435
446
 
447
def parse_string_map(value: Any, *, field: str, owner: str) -> dict[str, str]:
    """Coerce an optional JSON fixture field into a dict of non-blank strings.

    A missing field (``None``) yields an empty mapping. Anything that is not a
    dict, any key that is not a non-blank string, or any value that is not a
    non-blank string aborts with ``SystemExit`` naming *owner* and *field*.
    Keys and values are stored exactly as supplied; blankness is only tested
    on a stripped copy.
    """
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise SystemExit(f"{owner} {field} must be a JSON object of strings")

    def _blank_or_not_str(candidate: Any) -> bool:
        # True for non-strings and for strings that are empty after stripping.
        return not (isinstance(candidate, str) and candidate.strip())

    parsed: dict[str, str] = {}
    for key, entry in value.items():
        if _blank_or_not_str(key):
            raise SystemExit(f"{owner} {field} keys must be non-empty strings")
        if _blank_or_not_str(entry):
            raise SystemExit(f"{owner} {field}.{key} must be a non-empty string")
        parsed[key] = entry
    return parsed
461
+
462
+
436
463
  def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[str]:
437
464
  for index, arg in enumerate(extra_args):
438
465
  flag = arg.split("=", 1)[0]
@@ -443,6 +470,101 @@ def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[st
443
470
  return extra_args
444
471
 
445
472
 
473
def validate_variant_prompt_file_path(raw_path: str, *, owner: str) -> Path:
    """Return a safe relative prompt-file path, or fail before any file read.

    Args:
        raw_path: Path string taken from a task's ``variant_prompt_files``.
        owner: Human-readable prefix used in error messages.

    Returns:
        ``Path(raw_path)`` when it is a plain relative path naming a file.

    Raises:
        SystemExit: if the path contains a NUL character, is absolute or
            anchored (has a root or drive), names no file, or contains
            ``.``/``..``/empty components.
    """
    # An embedded NUL makes the later OS-level open raise ValueError rather
    # than OSError; reject it here so the failure is a clean fixture error.
    # repr() keeps the control character out of the terminal.
    if "\x00" in raw_path:
        raise SystemExit(
            f"{owner} variant_prompt_files path must not contain NUL characters: {raw_path!r}"
        )
    rel_path = Path(raw_path)
    # On Windows a rooted ("\\foo") or drive-relative ("C:foo") path is not
    # absolute, yet joining it onto a base directory still escapes that
    # directory. A non-empty anchor covers both; on POSIX it is equivalent
    # to is_absolute().
    if rel_path.is_absolute() or rel_path.anchor:
        raise SystemExit(f"{owner} variant_prompt_files path must be relative: {raw_path}")
    if not rel_path.parts or rel_path == Path("."):
        raise SystemExit(f"{owner} variant_prompt_files path must name a file")
    # pathlib already collapses "." and empty components, so in practice this
    # guards against ".." traversal; the other members are kept defensively.
    if any(part in ("", ".", "..") for part in rel_path.parts):
        raise SystemExit(f"{owner} variant_prompt_files path must not contain '.', '..', or empty components: {raw_path}")
    return rel_path
483
+
484
+
485
def validate_variant_prompt_file_references(
    tasks: list[TaskFixture],
    variants: list["Variant"],
) -> None:
    """Check every task's prompt-file map against the known variants.

    For each task, any ``variant_prompt_files`` key that is not the name of a
    supplied variant aborts with ``SystemExit`` (unknown names listed in
    sorted order). Each referenced path is then passed through
    ``validate_variant_prompt_file_path``. Nothing is opened here: whether
    the files exist is deliberately left unchecked, so that a run narrowed by
    --task-id/--variant is not blocked by prompt files it never selects.
    """
    declared_names = {entry.name for entry in variants}
    for fixture in tasks:
        stray = sorted(
            name for name in fixture.variant_prompt_files if name not in declared_names
        )
        if stray:
            raise SystemExit(
                f"task {fixture.id} variant_prompt_files references unknown variant(s): {', '.join(stray)}"
            )
        for name, configured_path in fixture.variant_prompt_files.items():
            validate_variant_prompt_file_path(
                configured_path,
                owner=f"task {fixture.id} variant {name}",
            )
507
+
508
+
509
def read_variant_prompt_file(path: Path, *, owner: str, display_path: str | None = None) -> str:
    """Read one selected prompt file with no-follow IO and an argv-safe size cap.

    Args:
        path: Location of the prompt file to open.
        owner: Human-readable prefix used in error messages.
        display_path: Label shown in errors instead of ``path.name``.

    Returns:
        The decoded prompt text.

    Raises:
        SystemExit: if the file cannot be opened or read, is larger than
            ``MAX_VARIANT_PROMPT_FILE_BYTES``, or is not valid UTF-8.
    """
    label = display_path or path.name
    try:
        # NOTE(review): helper is defined elsewhere; by its name it presumably
        # refuses symlinks and non-regular files — confirm against the helper.
        fd = _open_regular_no_symlink(path)
    except OSError as exc:
        detail = exc.strerror or exc.__class__.__name__
        raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
    try:
        size = os.fstat(fd).st_size
        if size > MAX_VARIANT_PROMPT_FILE_BYTES:
            raise SystemExit(
                f"{owner} variant_prompt_files prompt file exceeds "
                f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes: {label}"
            )
        try:
            with os.fdopen(fd, "r", encoding="utf-8") as handle:
                # The file object owns the descriptor from here on; mark it so
                # the finally block does not close it a second time.
                fd = -1
                # Bounded read: the fstat check above can go stale if the file
                # grows afterwards. Every UTF-8 character occupies at least one
                # byte, so a file within the cap never has more than
                # MAX_VARIANT_PROMPT_FILE_BYTES characters and is still read in
                # full, while an oversized one is cut off one character past
                # the cap and rejected by the size check below.
                text = handle.read(MAX_VARIANT_PROMPT_FILE_BYTES + 1)
        except UnicodeDecodeError as exc:
            raise SystemExit(
                f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
                f"{label}: {exc.reason}"
            ) from None
    except OSError as exc:
        detail = exc.strerror or exc.__class__.__name__
        raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
    finally:
        if fd != -1:
            os.close(fd)
    # Re-check what was actually read, not just what fstat reported.
    if len(text.encode("utf-8", errors="replace")) > MAX_VARIANT_PROMPT_FILE_BYTES:
        raise SystemExit(
            f"{owner} variant_prompt_files prompt text exceeds "
            f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes after decoding: {label}"
        )
    return text
545
+
546
+
547
def load_variant_prompt_files_for_targets(
    targets: list[tuple[TaskFixture, "Variant"]],
    *,
    task_file_dir: Path,
) -> None:
    """Populate ``variant_prompt_texts`` for the selected (task, variant) pairs.

    Pairs whose task has no ``variant_prompt_files`` entry for the variant are
    left untouched. For the rest, the configured path is re-validated, joined
    onto *task_file_dir*, read, and the text stored on the task under the
    variant's name. Validation and read failures surface as ``SystemExit``
    from the helpers.
    """
    for task, variant in targets:
        configured = task.variant_prompt_files.get(variant.name)
        if configured is not None:
            who = f"task {task.id} variant {variant.name}"
            safe_rel = validate_variant_prompt_file_path(configured, owner=who)
            prompt_text = read_variant_prompt_file(
                task_file_dir / safe_rel,
                owner=who,
                display_path=str(safe_rel),
            )
            task.variant_prompt_texts[variant.name] = prompt_text
566
+
567
+
446
568
  def normalize_usage_token(value: Any) -> int | None:
447
569
  """Return a safe non-negative token count, or None for invalid metrics."""
448
570
  if isinstance(value, bool) or not isinstance(value, (int, float)):
@@ -469,7 +591,7 @@ def normalize_usage_cost(value: Any) -> float | None:
469
591
  return numeric
470
592
 
471
593
 
472
- def parse_tasks(path: Path) -> list[TaskFixture]:
594
+ def parse_tasks(path: Path, variants: list["Variant"] | None = None) -> list[TaskFixture]:
473
595
  raw = json.loads(_read_text_no_follow(path))
474
596
  if not isinstance(raw, list):
475
597
  raise SystemExit(f"tasks file must be a JSON list: {path}")
@@ -488,21 +610,33 @@ def parse_tasks(path: Path) -> list[TaskFixture]:
488
610
  raise SystemExit(f"task {item.get('id')} max_budget_usd must be finite and > 0 (use null for unlimited)")
489
611
  else:
490
612
  budget = None
613
+ task_id = str(item["id"])
614
+ if "variant_prompts" in item:
615
+ raise SystemExit(
616
+ f"task {task_id} variant_prompts is not supported; use file-backed variant_prompt_files"
617
+ )
491
618
  fixtures.append(TaskFixture(
492
- id=str(item["id"]),
619
+ id=task_id,
493
620
  prompt=str(item["prompt"]),
494
621
  model=str(item.get("model", "sonnet")),
495
622
  effort=str(effort_raw) if effort_raw is not None else None,
496
- max_turns=parse_positive_int(item.get("max_turns", 3), field="max_turns", owner=f"task {item.get('id')}"),
623
+ max_turns=parse_positive_int(item.get("max_turns", 3), field="max_turns", owner=f"task {task_id}"),
497
624
  max_budget_usd=budget,
498
625
  allowed_tools=parse_string_list(
499
626
  item.get("allowed_tools", []),
500
627
  field="allowed_tools",
501
- owner=f"task {item.get('id')}",
628
+ owner=f"task {task_id}",
502
629
  ),
503
630
  success_command=item.get("success_command"),
504
631
  success_cwd=str(item.get("success_cwd", ".")),
632
+ variant_prompt_files=parse_string_map(
633
+ item.get("variant_prompt_files"),
634
+ field="variant_prompt_files",
635
+ owner=f"task {task_id}",
636
+ ),
505
637
  ))
638
+ if variants is not None:
639
+ validate_variant_prompt_file_references(fixtures, variants)
506
640
  return fixtures
507
641
 
508
642
 
@@ -717,6 +851,102 @@ def collect_shift_metrics(payload: Any) -> dict[str, int | float | bool]:
717
851
  return metrics
718
852
 
719
853
 
854
+ def normalize_self_hosted_metric(value: Any, *, maximum: float) -> float | None:
855
+ if isinstance(value, bool) or not isinstance(value, (int, float)):
856
+ return None
857
+ number = float(value)
858
+ if not math.isfinite(number) or number < 0 or number > maximum:
859
+ return None
860
+ return number
861
+
862
+
863
def sanitize_self_hosted_label(value: Any) -> str | None:
    """Return a sanitized, length-capped label, or None when unusable.

    Non-string input and text that is empty after ``sanitize_note_text`` give
    ``None``. Text longer than ``MAX_SELF_HOSTED_LABEL_CHARS`` is cut so that,
    with the 12-character "…[truncated]" marker appended, it stays within
    that limit.
    """
    if not isinstance(value, str):
        return None
    cleaned = sanitize_note_text(value)
    if not cleaned:
        return None
    if len(cleaned) <= MAX_SELF_HOSTED_LABEL_CHARS:
        return cleaned
    # Reserve room for the marker appended below.
    keep = MAX_SELF_HOSTED_LABEL_CHARS - 12
    return cleaned[:keep].rstrip() + "…[truncated]"
872
+
873
+
874
def normalize_self_hosted_metrics(raw: Any, *, source: str) -> dict[str, Any] | None:
    """Build the self-hosted metrics record from a raw envelope.

    Returns ``None`` when *raw* is not a dict or when none of ``latency_ms``,
    ``peak_memory_mb`` or ``quality_score`` holds a valid reading. Otherwise
    returns a dict carrying the schema version, *source*, the accepted
    metrics, any sanitized labels, a per-metric availability map, and the
    claim boundary stating that hosted API token/cost savings claims are not
    allowed on the basis of these numbers.
    """
    if not isinstance(raw, dict):
        return None

    # (metric key, inclusive upper bound); order fixes the output key order.
    limits = (
        ("latency_ms", MAX_SELF_HOSTED_LATENCY_MS),
        ("peak_memory_mb", MAX_SELF_HOSTED_MEMORY_MB),
        ("quality_score", 1.0),
    )
    metrics: dict[str, float] = {}
    availability: dict[str, bool] = {}
    for name, ceiling in limits:
        reading = normalize_self_hosted_metric(raw.get(name), maximum=ceiling)
        availability[name] = reading is not None
        if reading is not None:
            metrics[name] = reading

    labels: dict[str, str] = {
        key: label
        for key in ("model_server", "optimization", "quality_metric")
        if (label := sanitize_self_hosted_label(raw.get(key))) is not None
    }

    # Labels alone are not evidence; at least one numeric metric is required.
    if not metrics:
        return None

    claim_boundary = {
        "id": SELF_HOSTED_METRICS_CLAIM_BOUNDARY,
        "hosted_api_token_savings_claim_allowed": False,
        "hosted_api_cost_savings_claim_allowed": False,
        "requires_provider_measured_matched_tasks_for_hosted_claims": True,
        "reason": (
            "Self-hosted local/model-server latency, memory, and quality metrics "
            "are not hosted API token or cost telemetry."
        ),
    }
    return {
        "schema_version": SELF_HOSTED_METRICS_SCHEMA_VERSION,
        "source": source,
        "metrics": metrics,
        "labels": labels,
        "measurement_availability": availability,
        "claim_boundary": claim_boundary,
    }
919
+
920
+
921
def collect_self_hosted_metrics(payload: Any) -> dict[str, Any] | None:
    """Pick up an explicit self-hosted metrics sidecar from *payload*.

    Two locations are consulted, in order: the top-level
    ``SELF_HOSTED_METRICS_KEY`` entry, then the same key inside a top-level
    ``"metrics"`` dict. The first one that normalizes successfully is
    returned; otherwise ``None``. Nothing is inferred from incidental keys
    such as ``self_hosted_latency_ms`` or from nested message content, since
    that would make local/model-server telemetry too easy to mix into hosted
    API claim surfaces.
    """
    if not isinstance(payload, dict):
        return None

    top_level = normalize_self_hosted_metrics(
        payload.get(SELF_HOSTED_METRICS_KEY),
        source=f"explicit_provider_payload.{SELF_HOSTED_METRICS_KEY}",
    )
    if top_level is not None:
        return top_level

    envelope = payload.get("metrics")
    if not isinstance(envelope, dict):
        return None
    return normalize_self_hosted_metrics(
        envelope.get(SELF_HOSTED_METRICS_KEY),
        source=f"explicit_provider_payload.metrics.{SELF_HOSTED_METRICS_KEY}",
    )
948
+
949
+
720
950
  def claude_version(claude_bin: str) -> str:
721
951
  try:
722
952
  proc = run_bounded_command(
@@ -747,7 +977,7 @@ def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> l
747
977
  argv.extend(["--allowedTools", ",".join(task.allowed_tools)])
748
978
  argv.extend(variant.extra_args)
749
979
  argv.append("--")
750
- argv.append(task.prompt)
980
+ argv.append(task.variant_prompt_texts.get(variant.name, task.prompt))
751
981
  return argv
752
982
 
753
983
 
@@ -1003,6 +1233,7 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
1003
1233
  tokens, cost, cost_measured, primary_tokens_measured = collect_usage(payload)
1004
1234
  provider_cached_tokens, provider_cached_tokens_measured = collect_provider_cache_telemetry(payload)
1005
1235
  shift_metrics = collect_shift_metrics(payload)
1236
+ self_hosted_metrics = collect_self_hosted_metrics(payload)
1006
1237
  success, success_note = run_success_command(task, project_root)
1007
1238
  return RunResult(
1008
1239
  task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
@@ -1021,6 +1252,7 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
1021
1252
  external_cost_measured=bool(shift_metrics["external_cost_measured"]),
1022
1253
  provider_cached_tokens=provider_cached_tokens,
1023
1254
  provider_cached_tokens_measured=provider_cached_tokens_measured,
1255
+ self_hosted_metrics=self_hosted_metrics,
1024
1256
  )
1025
1257
 
1026
1258
 
@@ -1169,6 +1401,7 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
1169
1401
  "provider_cache": result.provider_cached_tokens_measured,
1170
1402
  "byte_metrics": byte_metrics_observed,
1171
1403
  "wall_time": result.wall_time_seconds >= 0,
1404
+ "self_hosted_metrics": result.self_hosted_metrics is not None,
1172
1405
  },
1173
1406
  "proxy_metrics": {
1174
1407
  "byte_metrics_observed": byte_metrics_observed,
@@ -1177,6 +1410,8 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
1177
1410
  "claim_boundary": "proxy_only_not_hosted_token_savings",
1178
1411
  },
1179
1412
  }
1413
+ if result.self_hosted_metrics is not None:
1414
+ payload["self_hosted_metrics"] = result.self_hosted_metrics
1180
1415
  with csv_file_lock(path, create_parent=True):
1181
1416
  fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
1182
1417
  try:
@@ -2090,8 +2325,8 @@ def main() -> int:
2090
2325
  require_no_follow_file_ops_supported()
2091
2326
  validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
2092
2327
 
2093
- tasks = parse_tasks(args.tasks)
2094
2328
  variants = parse_variants(args.variants)
2329
+ tasks = parse_tasks(args.tasks, variants=variants)
2095
2330
  targets = filter_targets(tasks, variants, args.task_id, args.variant)
2096
2331
  if not targets:
2097
2332
  print("no (task, variant) targets matched the filters", file=sys.stderr)
@@ -2122,6 +2357,9 @@ def main() -> int:
2122
2357
  print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
2123
2358
  return 2
2124
2359
 
2360
+ if runnable_targets:
2361
+ load_variant_prompt_files_for_targets(runnable_targets, task_file_dir=args.tasks.parent)
2362
+
2125
2363
  project_root = args.project_root.resolve()
2126
2364
  claude_ver = "dry-run" if args.dry_run else (claude_version(args.claude_bin) if runnable_targets else "skipped")
2127
2365