@ictechgy/context-guard 0.4.0 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/CHANGELOG.md +13 -0
  2. package/README.ko.md +61 -32
  3. package/README.md +90 -22
  4. package/context-guard-kit/README.md +39 -26
  5. package/context-guard-kit/benchmark_runner.py +273 -8
  6. package/context-guard-kit/claude_transcript_cost_audit.py +325 -12
  7. package/context-guard-kit/context_compress.py +153 -1
  8. package/context-guard-kit/context_filter.py +446 -0
  9. package/context-guard-kit/context_guard_cli.py +3 -0
  10. package/context-guard-kit/context_guard_diet.py +677 -2
  11. package/context-guard-kit/context_pack.py +1694 -2
  12. package/context-guard-kit/cost_guard.py +1870 -0
  13. package/context-guard-kit/setup_wizard.py +820 -29
  14. package/context-guard-kit/trim_command_output.py +396 -45
  15. package/docs/benchmark-fixtures/learned-compression.tasks.example.json +24 -0
  16. package/docs/benchmark-fixtures/learned-compression.variants.example.json +10 -0
  17. package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +24 -0
  18. package/docs/benchmark-fixtures/visual-ocr.variants.example.json +10 -0
  19. package/docs/benchmark-workflow-examples.md +40 -0
  20. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +169 -0
  21. package/docs/benchmark-workflows/measured-token-workflow.example.json +170 -0
  22. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +170 -0
  23. package/docs/cache-diagnostics-schema.md +75 -0
  24. package/docs/cache-diagnostics.example.json +116 -0
  25. package/docs/cache-diagnostics.schema.json +460 -0
  26. package/docs/distribution.md +4 -2
  27. package/docs/experimental-benchmark-fixtures.md +36 -0
  28. package/package.json +11 -2
  29. package/packaging/homebrew/context-guard.rb.template +3 -2
  30. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  31. package/plugins/context-guard/README.ko.md +21 -13
  32. package/plugins/context-guard/README.md +24 -10
  33. package/plugins/context-guard/bin/context-guard +3 -0
  34. package/plugins/context-guard/bin/context-guard-audit +325 -12
  35. package/plugins/context-guard/bin/context-guard-bench +273 -8
  36. package/plugins/context-guard/bin/context-guard-compress +153 -1
  37. package/plugins/context-guard/bin/context-guard-cost +1870 -0
  38. package/plugins/context-guard/bin/context-guard-diet +677 -2
  39. package/plugins/context-guard/bin/context-guard-filter +446 -0
  40. package/plugins/context-guard/bin/context-guard-pack +1694 -2
  41. package/plugins/context-guard/bin/context-guard-setup +820 -29
  42. package/plugins/context-guard/bin/context-guard-trim-output +396 -45
  43. package/plugins/context-guard/brief/README.md +10 -3
  44. package/plugins/context-guard/skills/optimize/SKILL.md +5 -2
  45. package/plugins/context-guard/skills/setup/SKILL.md +3 -1
@@ -108,6 +108,7 @@ CSV_COLUMNS = [
108
108
  MAX_CSV_NOTE_CHARS = 500
109
109
  MAX_CSV_ROWS = 100_000
110
110
  CSV_FORMULA_PREFIXES = ("=", "+", "-", "@")
111
+ PLACEHOLDER_SUCCESS_COMMAND_MARKER = "fixture-only placeholder: replace success_command before real benchmark runs"
111
112
  PROTECTED_VARIANT_FLAGS = frozenset({
112
113
  "--",
113
114
  "-p",
@@ -180,6 +181,8 @@ MAX_USAGE_COST_USD = 10**9
180
181
  # 추정치이며, report에서 evidence="inferred"로 분명히 라벨링한다. 영어 텍스트 기준
181
182
  # ~4 bytes/token의 통용 근사값을 사용한다.
182
183
  TOKEN_PROXY_BYTES_PER_TOKEN = 4
184
+ BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
185
+ MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
183
186
  CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
184
187
  SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
185
188
  VERSION_OUTPUT_MAX_BYTES = 16_000
@@ -395,6 +398,10 @@ class BoundedProcessResult:
395
398
  output_truncated: bool = False
396
399
 
397
400
 
401
+ def is_placeholder_success_command(command: str | None) -> bool:
402
+ return bool(command and PLACEHOLDER_SUCCESS_COMMAND_MARKER in command)
403
+
404
+
398
405
  def parse_positive_int(value: Any, *, field: str, owner: str) -> int:
399
406
  """Parse a JSON fixture field that must be a positive integer."""
400
407
  if isinstance(value, bool):
@@ -940,6 +947,14 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
940
947
  success=True, notes=f"dry-run: {shlex.join(argv)}",
941
948
  wall_time_seconds=0.0,
942
949
  )
950
+ if is_placeholder_success_command(task.success_command):
951
+ return RunResult(
952
+ task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
953
+ tokens={k: 0 for k, _ in USAGE_KEY_GROUPS}, cost_usd=0.0,
954
+ success=False,
955
+ notes=f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing to invoke provider",
956
+ wall_time_seconds=elapsed_seconds_since(started_at),
957
+ )
943
958
  argv[0] = executable_argv0(argv[0])
944
959
  try:
945
960
  proc = run_bounded_command(
@@ -1116,11 +1131,14 @@ def write_text_no_follow(path: Path, text: str) -> None:
1116
1131
 
1117
1132
  def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
1118
1133
  shifted_cost_known = cost_shift_measured(result)
1134
+ byte_metrics_observed = bool(result.bytes_before or result.bytes_after)
1119
1135
  payload = {
1136
+ "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
1120
1137
  "date": _dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
1121
1138
  "claude_version": claude_ver,
1122
1139
  "task_id": result.task_id,
1123
1140
  "variant": result.variant,
1141
+ "transform_id": result.variant,
1124
1142
  "success": result.success,
1125
1143
  "primary_cost_measured": result.cost_measured,
1126
1144
  "primary_cost_usd": round(result.cost_usd, 6),
@@ -1142,6 +1160,22 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
1142
1160
  "hook_triggers": result.hook_triggers,
1143
1161
  "turns": result.turns,
1144
1162
  "notes": sanitize_csv_note(result.notes),
1163
+ "measurement_availability": {
1164
+ "primary_tokens": result.primary_tokens_measured,
1165
+ "primary_cost": result.cost_measured,
1166
+ "external_tokens": result.external_tokens_measured,
1167
+ "external_cost": result.external_cost_measured,
1168
+ "shifted_cost": shifted_cost_known,
1169
+ "provider_cache": result.provider_cached_tokens_measured,
1170
+ "byte_metrics": byte_metrics_observed,
1171
+ "wall_time": result.wall_time_seconds >= 0,
1172
+ },
1173
+ "proxy_metrics": {
1174
+ "byte_metrics_observed": byte_metrics_observed,
1175
+ "token_proxy": "chars_div_4",
1176
+ "bytes_per_token": TOKEN_PROXY_BYTES_PER_TOKEN,
1177
+ "claim_boundary": "proxy_only_not_hosted_token_savings",
1178
+ },
1145
1179
  }
1146
1180
  with csv_file_lock(path, create_parent=True):
1147
1181
  fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
@@ -1283,7 +1317,9 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
1283
1317
  seen_tasks_by_variant: dict[str, set[str]] = {}
1284
1318
  successful_tasks_by_variant: dict[str, set[str]] = {}
1285
1319
 
1286
- for row in rows:
1320
+ for row_index, raw_row in enumerate(rows, start=1):
1321
+ row = dict(raw_row)
1322
+ row["_row_index"] = str(row_index)
1287
1323
  variant = row.get("variant") or "unknown"
1288
1324
  task_id = row.get("task_id") or "unknown"
1289
1325
  seen_tasks_by_variant.setdefault(variant, set()).add(task_id)
@@ -1566,7 +1602,215 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
1566
1602
  len(baseline_values),
1567
1603
  )
1568
1604
 
1605
+ def row_indices_for(rows_for_task: list[dict[str, str]]) -> list[int]:
1606
+ out: list[int] = []
1607
+ for row in rows_for_task:
1608
+ index = row_optional_nonnegative_int(row, "_row_index")
1609
+ if index is not None:
1610
+ out.append(index)
1611
+ return out
1612
+
1613
+ def all_rows_bool(rows_for_task: list[dict[str, str]], key: str) -> bool:
1614
+ return bool(rows_for_task) and all(row_bool(row, key) for row in rows_for_task)
1615
+
1616
+ def all_rows_optional_int(rows_for_task: list[dict[str, str]], key: str) -> list[int] | None:
1617
+ values = [row_optional_nonnegative_int(row, key) for row in rows_for_task]
1618
+ if not values or any(value is None for value in values):
1619
+ return None
1620
+ return [value for value in values if value is not None]
1621
+
1622
+ def all_rows_optional_float(rows_for_task: list[dict[str, str]], key: str) -> list[float] | None:
1623
+ values = [row_optional_float(row, key) for row in rows_for_task]
1624
+ if not values or any(value is None for value in values):
1625
+ return None
1626
+ return [value for value in values if value is not None]
1627
+
1628
+ def average_optional_int(rows_for_task: list[dict[str, str]], key: str) -> float | None:
1629
+ values = all_rows_optional_int(rows_for_task, key)
1630
+ return (sum(values) / len(values)) if values else None
1631
+
1632
+ def average_optional_float(rows_for_task: list[dict[str, str]], key: str) -> float | None:
1633
+ values = all_rows_optional_float(rows_for_task, key)
1634
+ return (sum(values) / len(values)) if values else None
1635
+
1636
+ def total_optional_int(rows_for_task: list[dict[str, str]], key: str) -> int | None:
1637
+ values = all_rows_optional_int(rows_for_task, key)
1638
+ return sum(values) if values is not None else None
1639
+
1640
+ def all_rows_shifted_cost_measured(rows_for_task: list[dict[str, str]]) -> bool:
1641
+ return bool(rows_for_task) and all(
1642
+ row_cost_shift_measured(row) and row_optional_float(row, "total_cost_with_shift_usd") is not None
1643
+ for row in rows_for_task
1644
+ )
1645
+
1646
+ def matched_side_evidence(variant: str, task_id: str, rows_for_task: list[dict[str, str]]) -> dict[str, Any]:
1647
+ primary_tokens_measured = all_rows_bool(rows_for_task, "primary_tokens_measured")
1648
+ primary_cost_measured = all_rows_bool(rows_for_task, "cost_measured")
1649
+ shifted_cost_measured = all_rows_shifted_cost_measured(rows_for_task)
1650
+ provider_cache_measured = all_rows_bool(rows_for_task, "provider_cached_tokens_measured")
1651
+ external_tokens_measured = all_rows_bool(rows_for_task, "external_tokens_measured")
1652
+ external_cost_measured = all_rows_bool(rows_for_task, "external_cost_measured")
1653
+ corrections_values = all_rows_optional_int(rows_for_task, "corrections")
1654
+ bytes_before_values = [row_optional_nonnegative_int(row, "bytes_before") for row in rows_for_task]
1655
+ bytes_after_values = [row_optional_nonnegative_int(row, "bytes_after") for row in rows_for_task]
1656
+ byte_metrics_observed = bool(rows_for_task) and not any(
1657
+ value is None for value in [*bytes_before_values, *bytes_after_values]
1658
+ )
1659
+ bytes_before_total = sum(value for value in bytes_before_values if value is not None)
1660
+ bytes_after_total = sum(value for value in bytes_after_values if value is not None)
1661
+ byte_delta = bytes_after_total - bytes_before_total if byte_metrics_observed else None
1662
+ token_proxy_delta = (
1663
+ int(byte_delta / TOKEN_PROXY_BYTES_PER_TOKEN) if byte_delta is not None else None
1664
+ )
1665
+ return {
1666
+ "variant": variant,
1667
+ "task_id": task_id,
1668
+ "run_count": len(rows_for_task),
1669
+ "row_indices": row_indices_for(rows_for_task),
1670
+ "primary_tokens": {
1671
+ "measured": primary_tokens_measured,
1672
+ "average": average_optional_int(rows_for_task, "total_tokens") if primary_tokens_measured else None,
1673
+ "total": total_optional_int(rows_for_task, "total_tokens") if primary_tokens_measured else None,
1674
+ },
1675
+ "primary_cost_usd": {
1676
+ "measured": primary_cost_measured,
1677
+ "average": average_optional_float(rows_for_task, "cost_usd") if primary_cost_measured else None,
1678
+ },
1679
+ "total_cost_with_shift_usd": {
1680
+ "measured": shifted_cost_measured,
1681
+ "average": (
1682
+ average_optional_float(rows_for_task, "total_cost_with_shift_usd")
1683
+ if shifted_cost_measured else None
1684
+ ),
1685
+ },
1686
+ "external_tokens": {
1687
+ "measured": external_tokens_measured,
1688
+ "total": total_optional_int(rows_for_task, "external_tokens") if external_tokens_measured else None,
1689
+ },
1690
+ "external_cost_usd": {
1691
+ "measured": external_cost_measured,
1692
+ "total": (
1693
+ sum(row_float(row, "external_cost_usd") for row in rows_for_task)
1694
+ if external_cost_measured else None
1695
+ ),
1696
+ },
1697
+ "bytes": {
1698
+ "measurement": "observed" if byte_metrics_observed else "unavailable",
1699
+ "before_total": bytes_before_total if byte_metrics_observed else None,
1700
+ "after_total": bytes_after_total if byte_metrics_observed else None,
1701
+ "delta_total": byte_delta,
1702
+ "token_proxy_delta": token_proxy_delta,
1703
+ "token_proxy": "chars_div_4_proxy_only" if byte_metrics_observed else "unavailable",
1704
+ },
1705
+ "wall_time_seconds": {
1706
+ "measured": all_rows_optional_float(rows_for_task, "wall_time_seconds") is not None,
1707
+ "average": average_optional_float(rows_for_task, "wall_time_seconds"),
1708
+ },
1709
+ "provider_cached_tokens": {
1710
+ "measured": provider_cache_measured,
1711
+ "average": (
1712
+ average_optional_int(rows_for_task, "provider_cached_tokens")
1713
+ if provider_cache_measured else None
1714
+ ),
1715
+ },
1716
+ "corrections": {
1717
+ "measured": corrections_values is not None,
1718
+ "average": (sum(corrections_values) / len(corrections_values)) if corrections_values else None,
1719
+ },
1720
+ }
1721
+
1722
+ def matched_pair_evidence_entry(
1723
+ variant: str,
1724
+ task_id: str,
1725
+ quality_gate: str,
1726
+ ) -> dict[str, Any]:
1727
+ baseline_rows = successful_rows_by_variant_task[baseline_variant][task_id]
1728
+ variant_rows = successful_rows_by_variant_task[variant][task_id]
1729
+ baseline_evidence = matched_side_evidence(baseline_variant, task_id, baseline_rows)
1730
+ variant_evidence = matched_side_evidence(variant, task_id, variant_rows)
1731
+ baseline_token_avg = baseline_evidence["primary_tokens"]["average"]
1732
+ variant_token_avg = variant_evidence["primary_tokens"]["average"]
1733
+ token_claim_allowed = (
1734
+ quality_gate == "pass"
1735
+ and bool(baseline_evidence["primary_tokens"]["measured"])
1736
+ and bool(variant_evidence["primary_tokens"]["measured"])
1737
+ and isinstance(baseline_token_avg, (int, float))
1738
+ and baseline_token_avg > 0
1739
+ and isinstance(variant_token_avg, (int, float))
1740
+ )
1741
+ baseline_cost_avg = baseline_evidence["total_cost_with_shift_usd"]["average"]
1742
+ variant_cost_avg = variant_evidence["total_cost_with_shift_usd"]["average"]
1743
+ shifted_cost_claim_allowed = (
1744
+ quality_gate == "pass"
1745
+ and bool(baseline_evidence["total_cost_with_shift_usd"]["measured"])
1746
+ and bool(variant_evidence["total_cost_with_shift_usd"]["measured"])
1747
+ and isinstance(baseline_cost_avg, (int, float))
1748
+ and baseline_cost_avg > 0
1749
+ and isinstance(variant_cost_avg, (int, float))
1750
+ )
1751
+ token_delta = (
1752
+ variant_token_avg - baseline_token_avg
1753
+ if token_claim_allowed
1754
+ else None
1755
+ )
1756
+ token_savings_pct = (
1757
+ (baseline_token_avg - variant_token_avg) / baseline_token_avg * 100.0
1758
+ if token_delta is not None
1759
+ else None
1760
+ )
1761
+ cost_delta = (
1762
+ variant_cost_avg - baseline_cost_avg
1763
+ if shifted_cost_claim_allowed
1764
+ else None
1765
+ )
1766
+ cost_savings_pct = (
1767
+ (baseline_cost_avg - variant_cost_avg) / baseline_cost_avg * 100.0
1768
+ if cost_delta is not None
1769
+ else None
1770
+ )
1771
+ base_after = baseline_evidence["bytes"]["after_total"]
1772
+ variant_after = variant_evidence["bytes"]["after_total"]
1773
+ byte_after_delta = (
1774
+ variant_after - base_after
1775
+ if isinstance(base_after, int) and isinstance(variant_after, int)
1776
+ else None
1777
+ )
1778
+ return {
1779
+ "schema_version": MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION,
1780
+ "task_id": task_id,
1781
+ "baseline_variant": baseline_variant,
1782
+ "variant": variant,
1783
+ "transform_id": variant,
1784
+ "quality_gate": quality_gate,
1785
+ "evidence_kind": "matched_successful_task_bucket",
1786
+ "measurements": {
1787
+ "baseline": baseline_evidence,
1788
+ "variant": variant_evidence,
1789
+ },
1790
+ "delta": {
1791
+ "primary_tokens_average": token_delta,
1792
+ "token_savings_pct": token_savings_pct,
1793
+ "total_cost_with_shift_usd_average": cost_delta,
1794
+ "cost_savings_pct_with_shift": cost_savings_pct,
1795
+ "bytes_after_total": byte_after_delta,
1796
+ "token_proxy_after_total": (
1797
+ int(byte_after_delta / TOKEN_PROXY_BYTES_PER_TOKEN)
1798
+ if byte_after_delta is not None else None
1799
+ ),
1800
+ "proxy_measurement": "chars_div_4_proxy_only",
1801
+ },
1802
+ "claim_boundary": {
1803
+ "quality_gate": quality_gate,
1804
+ "token_savings_claim_allowed": token_claim_allowed,
1805
+ "shifted_cost_claim_allowed": shifted_cost_claim_allowed,
1806
+ "byte_proxy_only": True,
1807
+ "requires_matched_successful_tasks": True,
1808
+ "raw_estimate_only_claim_allowed": False,
1809
+ },
1810
+ }
1811
+
1569
1812
  comparisons: list[dict[str, Any]] = []
1813
+ matched_pair_evidence: list[dict[str, Any]] = []
1570
1814
  baseline = by_variant.get(baseline_variant)
1571
1815
  baseline_successful_tasks = successful_tasks_by_variant.get(baseline_variant, set())
1572
1816
  baseline_failure_rate = baseline.get("failure_rate") if baseline else None
@@ -1680,6 +1924,8 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
1680
1924
  else:
1681
1925
  comparison["cost_savings_pct_with_shift"] = None
1682
1926
  comparison["paired_cost_task_count"] = cost_task_count
1927
+ for task_id in sorted(matched_tasks):
1928
+ matched_pair_evidence.append(matched_pair_evidence_entry(variant, task_id, quality_gate))
1683
1929
  comparisons.append(comparison)
1684
1930
 
1685
1931
  claim_status = "insufficient_baseline"
@@ -1712,6 +1958,7 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
1712
1958
  "row_count": len(rows),
1713
1959
  "summary_by_variant": by_variant,
1714
1960
  "comparisons": comparisons,
1961
+ "matched_pair_evidence": matched_pair_evidence,
1715
1962
  "claim_status": claim_status,
1716
1963
  "caveat": (
1717
1964
  "Proxy byte reductions are reported separately from matched-task token/cost metrics; "
@@ -1843,12 +2090,6 @@ def main() -> int:
1843
2090
  require_no_follow_file_ops_supported()
1844
2091
  validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
1845
2092
 
1846
- if not args.dry_run and shutil.which(args.claude_bin) is None:
1847
- # claude_bin 이 절대경로면 shutil.which 가 None 일 수 있으므로 추가 검사.
1848
- if not Path(args.claude_bin).exists():
1849
- print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
1850
- return 2
1851
-
1852
2093
  tasks = parse_tasks(args.tasks)
1853
2094
  variants = parse_variants(args.variants)
1854
2095
  targets = filter_targets(tasks, variants, args.task_id, args.variant)
@@ -1857,8 +2098,32 @@ def main() -> int:
1857
2098
  return 1
1858
2099
 
1859
2100
  skip_keys = existing_keys(args.csv) if args.resume else set()
2101
+ runnable_targets = [
2102
+ (task, variant)
2103
+ for task, variant in targets
2104
+ if (task.id, variant.name) not in skip_keys
2105
+ ]
2106
+ placeholder_targets = [
2107
+ f"{task.id}/{variant.name}"
2108
+ for task, variant in runnable_targets
2109
+ if is_placeholder_success_command(task.success_command)
2110
+ ]
2111
+ if placeholder_targets and not args.dry_run:
2112
+ print(
2113
+ f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing non-dry-run provider invocation for: "
2114
+ f"{', '.join(placeholder_targets)}",
2115
+ file=sys.stderr,
2116
+ )
2117
+ return 2
2118
+
2119
+ if runnable_targets and not args.dry_run and shutil.which(args.claude_bin) is None:
2120
+ # claude_bin 이 절대경로면 shutil.which 가 None 일 수 있으므로 추가 검사.
2121
+ if not Path(args.claude_bin).exists():
2122
+ print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
2123
+ return 2
2124
+
1860
2125
  project_root = args.project_root.resolve()
1861
- claude_ver = "dry-run" if args.dry_run else claude_version(args.claude_bin)
2126
+ claude_ver = "dry-run" if args.dry_run else (claude_version(args.claude_bin) if runnable_targets else "skipped")
1862
2127
 
1863
2128
  completed = 0
1864
2129
  for task, variant in targets: