@ictechgy/context-guard 0.4.3 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/CHANGELOG.md +13 -0
  2. package/README.ko.md +16 -3
  3. package/README.md +13 -3
  4. package/context-guard-kit/README.md +2 -2
  5. package/context-guard-kit/benchmark_runner.py +244 -6
  6. package/context-guard-kit/claude_transcript_cost_audit.py +443 -1
  7. package/docs/benchmark-fixtures/learned-compression-baseline-context-pack.prompt.example.md +19 -0
  8. package/docs/benchmark-fixtures/learned-compression-candidate-digest.prompt.example.md +21 -0
  9. package/docs/benchmark-fixtures/learned-compression.tasks.example.json +5 -1
  10. package/docs/benchmark-fixtures/output-transform-baseline-raw-output.prompt.example.md +20 -0
  11. package/docs/benchmark-fixtures/output-transform-digest-receipt.prompt.example.md +23 -0
  12. package/docs/benchmark-fixtures/output-transform.tasks.example.json +28 -0
  13. package/docs/benchmark-fixtures/output-transform.variants.example.json +10 -0
  14. package/docs/benchmark-fixtures/visual-ocr-cropped-ocr.prompt.example.md +22 -0
  15. package/docs/benchmark-fixtures/visual-ocr-full-visual.prompt.example.md +19 -0
  16. package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +5 -1
  17. package/docs/benchmark-workflow-examples.md +6 -2
  18. package/docs/benchmark-workflows/self-hosted-metrics-ledger.example.jsonl +1 -0
  19. package/docs/cache-diagnostics-schema.md +25 -4
  20. package/docs/experimental-benchmark-fixtures.md +17 -6
  21. package/docs/mac-visibility-feasibility-schema.md +62 -0
  22. package/docs/mac-visibility-feasibility.example.json +130 -0
  23. package/package.json +5 -1
  24. package/packaging/homebrew/context-guard.rb.template +1 -1
  25. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  26. package/plugins/context-guard/README.ko.md +3 -3
  27. package/plugins/context-guard/README.md +3 -3
  28. package/plugins/context-guard/bin/context-guard-audit +443 -1
  29. package/plugins/context-guard/bin/context-guard-bench +244 -6
@@ -46,9 +46,11 @@ COST_KEYS = ("total_cost_usd", "cost_usd", "costUSD")
46
46
  MODEL_KEYS = ("model", "model_id", "modelId")
47
47
  QUERY_SOURCE_KEYS = ("query_source", "querySource")
48
48
  TIMESTAMP_KEYS = ("timestamp", "created_at", "createdAt", "time", "ts")
49
- FEASIBILITY_SCHEMA_VERSION = "contextguard.metric-feasibility.v1.2"
49
+ FEASIBILITY_SCHEMA_VERSION = "contextguard.metric-feasibility.v1.3"
50
+ MAC_VISIBILITY_SCHEMA_VERSION = "contextguard.mac-visibility.v1"
50
51
  FEASIBILITY_PRODUCER = "context-guard-audit"
51
52
  CACHE_DIAGNOSTICS_SCHEMA_VERSION = "contextguard.cache-diagnostics.v1"
53
+ CACHE_LAYOUT_ADVICE_SCHEMA_VERSION = "contextguard.cache-layout-advice.v1"
52
54
  MAX_ERROR_EXAMPLES = 20
53
55
  JSON_PARSE_RECURSION_LIMIT = 10_000
54
56
  READ_CHUNK_BYTES = 64 * 1024
@@ -184,6 +186,7 @@ class UsageSummary:
184
186
  prompt_cache_audit: PromptCacheAudit = field(default_factory=PromptCacheAudit)
185
187
  cache_friendliness_cache: dict[str, Any] | None = field(default=None, init=False, repr=False)
186
188
  cache_diagnostics_cache: dict[str, Any] | None = field(default=None, init=False, repr=False)
189
+ cache_layout_advice_cache: dict[str, Any] | None = field(default=None, init=False, repr=False)
187
190
 
188
191
  @property
189
192
  def total_tokens(self) -> int:
@@ -1398,6 +1401,222 @@ def cache_diagnostics_for_summary(summary: UsageSummary) -> dict[str, Any]:
1398
1401
  return build_cache_diagnostics(summary)
1399
1402
 
1400
1403
 
1404
def _dominant_transcript(
    summary: UsageSummary,
    *,
    min_share: float = 0.20,
    min_tokens: int = 1_000,
) -> dict[str, Any] | None:
    """Describe the single largest transcript by observed token count.

    Args:
        summary: Aggregated usage; ``total_tokens`` and the ``by_file`` counter
            (anything exposing ``most_common``) are the only fields read.
        min_share: Minimum fraction of ``total_tokens`` the top transcript must
            hold to be flagged as dominating.
        min_tokens: Minimum absolute token count required for the same flag.

    Returns:
        ``None`` when there are no tokens or no per-file counts; otherwise a
        dict with ``tokens``, ``share`` (rounded to 4 places) and ``dominates``.
        The transcript label is deliberately left out of the result.
    """
    if summary.total_tokens <= 0 or not summary.by_file:
        return None
    _label, tokens = summary.by_file.most_common(1)[0]
    # total_tokens is strictly positive here (guarded above), so the division
    # needs no zero fallback.
    share = tokens / summary.total_tokens
    return {
        "tokens": tokens,
        "share": round(share, 4),
        # The flag uses the unrounded share so rounding cannot flip it.
        "dominates": share >= min_share and tokens >= min_tokens,
    }
1414
+
1415
+
1416
def _first_dynamic_breaker(cache_diagnostics: dict[str, Any]) -> dict[str, Any] | None:
    """Return the first ``dynamic_prefix_breakers`` entry, if it is a dict.

    Args:
        cache_diagnostics: Diagnostics mapping; only the
            ``dynamic_prefix_breakers`` key is read.

    Returns:
        The first entry when the value is a non-empty list/tuple whose first
        element is a dict; ``None`` in every other case.
    """
    breakers = cache_diagnostics.get("dynamic_prefix_breakers")
    # Index only real sequences: a truthy mapping or set under this key would
    # otherwise raise KeyError/TypeError on ``[0]`` instead of meaning "none".
    if not isinstance(breakers, (list, tuple)) or not breakers:
        return None
    first = breakers[0]
    return first if isinstance(first, dict) else None
1422
+
1423
+
1424
def build_cache_layout_advice(summary: UsageSummary) -> dict[str, Any]:
    """Build (and memoize on ``summary``) heuristic cache-layout advice.

    The advice combines the cache-friendliness signals, the cache diagnostics,
    the raw cache token totals and the dominant-transcript share into one
    observed issue, a priority, hypothesized causes, next checks and ordered
    experiments. Everything emitted is a heuristic; ``corroborated_causes`` is
    always returned empty by this function.

    Args:
        summary: Aggregated usage. The result is stored in
            ``summary.cache_layout_advice_cache`` and returned as-is on later
            calls.

    Returns:
        The advice dict, tagged with ``CACHE_LAYOUT_ADVICE_SCHEMA_VERSION``.
    """
    if summary.cache_layout_advice_cache is not None:
        return summary.cache_layout_advice_cache

    cache_friendliness = cache_friendliness_for_summary(summary)
    cache_diagnostics = cache_diagnostics_for_summary(summary)
    raw_signals = cache_friendliness.get("signals")
    signals = raw_signals if isinstance(raw_signals, dict) else {}
    dynamic_breaker = _first_dynamic_breaker(cache_diagnostics)
    dominant = _dominant_transcript(summary)
    cache_creation = summary.tokens.get("cache_creation", 0)
    cache_read = summary.tokens.get("cache_read", 0)
    observations = cache_diagnostics.get("observations")
    cache_fields = observations.get("cache_fields", {}) if isinstance(observations, dict) else {}
    cache_status = cache_fields.get("status") if isinstance(cache_fields, dict) else None
    stable_prefix_share = signals.get("stable_prefix_share")
    volatile_prefix_share = signals.get("volatile_prefix_share")
    volatile_tail_share = signals.get("volatile_tail_share")
    max_prefix_position = dynamic_breaker.get("position") if dynamic_breaker else None
    # Prefer the breaker's own volatile share; fall back to the aggregate signal.
    max_prefix_position_volatile_share = (
        dynamic_breaker.get("volatile_share")
        if dynamic_breaker
        else signals.get("max_prefix_position_volatile_share")
    )

    # Defaults describe the "nothing observed" case.
    status = "missing"
    confidence = "unavailable"
    observed_issue = "unknown"
    priority = "P2"
    hypothesized_causes: list[dict[str, Any]] = []
    corroborated_causes: list[dict[str, Any]] = []
    next_checks: list[dict[str, Any]] = []
    recommended_experiments: list[dict[str, Any]] = []

    has_cache_any = bool(
        summary.token_field_presence.get("cache_read", 0)
        or summary.token_field_presence.get("cache_creation", 0)
    )
    has_prompt_samples = bool(summary.prompt_cache_audit.samples)
    if has_cache_any or has_prompt_samples:
        # Any gap in the inputs (no prompt samples, partial upstream reports,
        # skipped files/records, parse errors) downgrades the advice to partial.
        is_partial = bool(
            not has_prompt_samples
            or cache_friendliness.get("status") == "partial"
            or cache_diagnostics.get("status") == "partial"
            or summary.skipped_files
            or summary.skipped_records
            or summary.parse_errors
        )
        status = "partial" if is_partial else "available"
        confidence = "partial" if is_partial else "hypothesis"

    volatile_prefix_breaker = bool(
        dynamic_breaker
        and cache_creation > 0
        and (
            max_prefix_position in {0, 1}
            or (max_prefix_position_volatile_share or 0) >= PROMPT_PREFIX_VOLATILE_THRESHOLD
        )
    )
    long_session_dominates = bool(dominant and dominant.get("dominates"))

    # Exactly one observed issue is reported; branches are ordered by precedence.
    if volatile_prefix_breaker:
        observed_issue = "volatile_prefix_breaker"
        priority = "P0" if cache_creation >= 50_000 and max_prefix_position in {0, 1} else "P1"
        hypothesized_causes.append({
            "id": "prefix-position-churn",
            "confidence": confidence,
            "evidence": EVIDENCE_INFERRED,
            "reason": (
                "A highly volatile redacted prompt segment appears in the early prefix window; "
                "this identifies a layout issue, not a confirmed source."
            ),
            "next_check": "Check whether startup context, generated evidence, or tool/MCP catalog changes are moving before stable policy.",
        })
        if cache_diagnostics.get("stable_prefix_candidates"):
            hypothesized_causes.append({
                "id": "evidence-before-policy",
                "confidence": confidence,
                "evidence": EVIDENCE_INFERRED,
                "reason": (
                    "Stable reusable segments appear elsewhere while the early prefix churns; "
                    "check whether logs, diffs, timestamps, or file evidence precede stable instructions."
                ),
                "next_check": "Keep stable policy/instructions first and move generated run evidence later.",
            })
        # NOTE(review): emitted for every volatile-prefix finding (not only when
        # stable candidates exist), since the diet experiment below refers to
        # these command templates — confirm nesting against the upstream file.
        next_checks.append({
            "id": "inspect-startup-context-size",
            "confidence": "hypothesis",
            "command_templates": [
                "context-guard-diet scan <repo>",
                "context-guard-diet structural-waste <repo>",
            ],
            "evidence_required_for_corroboration": (
                "Large or duplicate CLAUDE.md/AGENTS.md/GEMINI.md findings from diet output."
            ),
        })
    elif long_session_dominates:
        observed_issue = "long_session_accumulation"
        priority = "P1"
    elif (
        cache_creation >= 10_000
        and cache_read > 0
        # Only trust the ratio when the summary says it is defined, matching
        # the TTL branch and the observed_summary field below.
        and summary.cache_amortization_defined
        and summary.cache_amortization < 0.5
    ):
        observed_issue = "low_cache_reuse"
        priority = "P1"
    elif cache_status == "missing" or not has_cache_any:
        observed_issue = "missing_cache_fields"
        priority = "P2"

    # Experiments are independent of the single observed issue above; "order"
    # records the position at which each one was appended (1-based).
    if long_session_dominates:
        recommended_experiments.append({
            "id": "split-long-sessions",
            "order": len(recommended_experiments) + 1,
            "priority": "P1",
            "effort": "low",
            "action": "Use /clear between unrelated tasks and /compact focus on changed files, failing tests, and remaining TODO during long work.",
            "expected_signal": "Cache creation per comparable task decreases and one transcript no longer dominates observed tokens.",
            "verification": "Re-run context-guard-audit on a comparable window and compare cache_creation, cache_amortization, and top transcript share.",
            "evidence": dominant or {},
        })
    if volatile_prefix_breaker:
        recommended_experiments.append({
            "id": "stabilize-cache-prefix",
            "order": len(recommended_experiments) + 1,
            "priority": priority,
            "effort": "medium",
            "action": "Keep stable reusable instructions/policy before volatile logs, diffs, timestamps, and generated file evidence.",
            "expected_signal": "Stable prefix share rises and volatile prefix share falls on matched audit windows.",
            "verification": "Re-run context-guard-audit --json --recommend and compare cache_layout_advice plus cache_friendliness signals.",
            "evidence": {
                "dynamic_prefix_breaker_position": max_prefix_position,
                "dynamic_prefix_breaker_volatile_share": max_prefix_position_volatile_share,
            },
        })
        recommended_experiments.append({
            "id": "run-context-diet-checks",
            "order": len(recommended_experiments) + 1,
            "priority": "P1",
            "effort": "low",
            "action": "Run the generated diet command templates and treat any large/duplicate context-file findings as corroborating evidence before editing instructions.",
            "expected_signal": "Diet output identifies or rules out oversized/duplicated startup context as a contributor.",
            "verification": "Record diet JSON separately; do not convert prefix-position evidence alone into a confirmed startup-context cause.",
            "command_templates": [
                "context-guard-diet scan <repo> --json > diet.json",
                "context-guard-diet structural-waste <repo> --json > structural-waste.json",
            ],
        })
    if cache_creation >= 50_000 and summary.cache_amortization_defined and 1.0 <= summary.cache_amortization < 5.0:
        recommended_experiments.append({
            "id": "defer-longer-ttl-until-prefix-stable" if volatile_prefix_breaker else "evaluate-longer-ttl-after-stability-check",
            "order": len(recommended_experiments) + 1,
            "priority": "P2",
            "effort": "medium",
            "action": "Treat longer TTL as secondary; first corroborate stable prefix reuse and current provider TTL/pricing behavior.",
            "expected_signal": "TTL evaluation happens only after prefix volatility is reduced or ruled out.",
            "verification": "Use timestamped cache telemetry and provider-measured billing/cost evidence; historical token totals alone are insufficient.",
        })
    # With no experiment to suggest, point at how to gather better evidence.
    if not recommended_experiments and status == "partial":
        next_checks.append({
            "id": "rerun-narrower-audit",
            "confidence": "partial",
            "command_templates": ["context-guard-audit <transcript-or-project-dir> --json --recommend"],
            "evidence_required_for_corroboration": "Enough uncapped prompt/cache records to classify prefix layout.",
        })
    if not recommended_experiments and observed_issue == "missing_cache_fields":
        next_checks.append({
            "id": "collect-cache-telemetry",
            "confidence": "unavailable",
            "command_templates": ["context-guard-audit ~/.claude/projects --json --recommend"],
            "evidence_required_for_corroboration": "Transcript records with cache_read/cache_creation fields.",
        })

    advice = {
        "schema_version": CACHE_LAYOUT_ADVICE_SCHEMA_VERSION,
        "status": status,
        "confidence": confidence,
        "heuristic": True,
        "observed_issue": observed_issue,
        "priority": priority,
        "observed_summary": {
            "cache_creation_tokens": cache_creation,
            "cache_read_tokens": cache_read,
            "cache_amortization": round(summary.cache_amortization, 4) if summary.cache_amortization_defined else None,
            "stable_prefix_share": stable_prefix_share,
            "volatile_prefix_share": volatile_prefix_share,
            "volatile_tail_share": volatile_tail_share,
            "max_prefix_position": max_prefix_position,
            "max_prefix_position_volatile_share": max_prefix_position_volatile_share,
            "dominant_transcript_share": dominant.get("share") if dominant else None,
        },
        "hypothesized_causes": hypothesized_causes,
        "corroborated_causes": corroborated_causes,
        "next_checks": next_checks,
        "recommended_experiments": recommended_experiments,
        "caveats": [
            "Cache layout advice is a local transcript heuristic, not billing authority or provider-cache proof.",
            "Observed issues come from cache fields and redacted segment statistics; causes remain hypotheses until corroborated by diet/structural evidence.",
            "Generated command templates use placeholders and must not be treated as observed user commands or paths.",
            "Use matched before/after audits before making token or cost savings claims.",
        ],
    }
    summary.cache_layout_advice_cache = advice
    return advice
1614
+
1615
+
1616
def cache_layout_advice_for_summary(summary: UsageSummary) -> dict[str, Any]:
    """Return the cache-layout advice for ``summary``.

    Delegates entirely to ``build_cache_layout_advice`` and hands back its
    result unchanged.
    """
    advice = build_cache_layout_advice(summary)
    return advice
1618
+
1619
+
1401
1620
  def build_metric_caveats(summary: UsageSummary) -> list[str]:
1402
1621
  caveats = [
1403
1622
  "Values are observed from local Claude Code transcript JSON/JSONL fields and are not official billing records.",
@@ -1417,6 +1636,168 @@ def build_metric_caveats(summary: UsageSummary) -> list[str]:
1417
1636
  return caveats
1418
1637
 
1419
1638
 
1639
def _mac_card(
    card_id: str,
    title: str,
    status: str,
    binding_paths: list[str],
    *,
    required_observation: str | None = None,
) -> dict[str, Any]:
    """Assemble one card entry for the macOS visibility contract.

    Args:
        card_id: Value stored under ``id``.
        title: Value stored under ``title``.
        status: Value stored under ``status``.
        binding_paths: Field paths stored under ``binding_paths`` (the same
            list object is kept, not a copy).
        required_observation: When truthy, stored under
            ``required_observation``; otherwise that key is left out.

    Returns:
        The card dict.
    """
    base: dict[str, Any] = dict(
        id=card_id,
        title=title,
        status=status,
        binding_paths=binding_paths,
    )
    if not required_observation:
        return base
    return {**base, "required_observation": required_observation}
1656
+
1657
+
1658
def build_mac_visibility_contract(
    *,
    availability: dict[str, Any],
    integrity: dict[str, Any],
    cache_layout_advice: dict[str, Any],
) -> dict[str, Any]:
    """Assemble the pre-GUI macOS visibility binding contract.

    The contract is only an index over fields the feasibility output already
    carries: it reads the ``status`` of each availability entry, the scan
    integrity status and the cache-layout advice status, and maps them onto
    cards. Nothing is recomputed, no diagnostic summary data is read, and no
    live context/headroom is inferred from historical transcript totals.

    Args:
        availability: Per-metric availability entries; the ``status`` of the
            ``tokens``, ``context``, ``headroom``, ``cache`` and ``cost``
            entries is read (defaulting to ``"missing"``).
        integrity: Scan-integrity mapping; ``status`` defaults to ``"partial"``.
        cache_layout_advice: Advice mapping; ``status`` defaults to
            ``"missing"``.

    Returns:
        The contract dict, tagged with ``MAC_VISIBILITY_SCHEMA_VERSION``.
    """

    def metric_status(metric: str) -> str:
        # A missing or falsy entry is treated like an empty mapping.
        entry = availability.get(metric) or {}
        return str(entry.get("status", "missing"))

    def live_requirement(observed: str) -> str | None:
        # Only metrics that are missing need a live statusline snapshot.
        return "live_statusline_snapshot" if observed == "missing" else None

    token_status = metric_status("tokens")
    context_status = metric_status("context")
    headroom_status = metric_status("headroom")
    cache_status = metric_status("cache")
    cost_status = metric_status("cost")
    scan_status = str(integrity.get("status", "partial"))
    advice_status = str(cache_layout_advice.get("status", "missing"))

    # Readiness: ready only with available tokens and a complete scan.
    if token_status == "available" and scan_status == "complete":
        readiness = {
            "status": "ready",
            "reason": "Transcript token totals are available and the scan completed within configured limits.",
        }
    elif token_status in ("available", "partial"):
        readiness = {
            "status": "partial",
            "reason": "Some stable fields can be shown, but scan integrity or metric availability is partial.",
        }
    else:
        readiness = {
            "status": "missing",
            "reason": "Token totals are missing from the transcript scan; show setup or unavailable state.",
        }

    # (observed status, id, affected fields, reason) for each live-only metric.
    live_gaps = (
        (
            context_status,
            "live_context_window",
            ["context_availability", "metric_availability.context"],
            "Historical transcript scans do not include live Claude Code context_window data.",
        ),
        (
            headroom_status,
            "live_headroom",
            ["headroom_availability", "cache_diagnostics.headroom_diagnostics"],
            "Historical transcript totals are not remaining-token or live headroom observations.",
        ),
    )
    missing_live_observations: list[dict[str, Any]] = [
        {
            "id": gap_id,
            "required_observation": "live_statusline_snapshot",
            "affects": affected,
            "reason": why,
        }
        for observed, gap_id, affected, why in live_gaps
        if observed == "missing"
    ]

    primary_cards = [
        _mac_card(
            "source_freshness",
            "Source freshness",
            "available",
            ["source_kind", "source_freshness.status", "source_freshness.generated_at"],
        ),
        _mac_card(
            "scan_integrity",
            "Scan integrity",
            scan_status,
            [
                "scan_integrity.status",
                "scan_integrity.files_scanned",
                "scan_integrity.records_scanned",
                "scan_integrity.skipped_files",
                "scan_integrity.skipped_records",
            ],
        ),
        _mac_card(
            "token_totals",
            "Token totals",
            token_status,
            [
                "totals.total_tokens",
                "totals.tokens.input",
                "totals.tokens.output",
                "totals.tokens.cache_read",
                "totals.tokens.cache_creation",
            ],
        ),
        _mac_card(
            "cache_reuse",
            "Cache-read share and reuse ratio",
            cache_status,
            ["totals.cache_read_share", "totals.cache_reuse_ratio", "metric_availability.cache"],
        ),
        _mac_card(
            "observed_cost",
            "Observed transcript cost",
            cost_status,
            ["totals.cost_usd_observed", "metric_availability.cost"],
        ),
        _mac_card(
            "context_availability",
            "Context availability",
            context_status,
            ["context_availability", "metric_availability.context"],
            required_observation=live_requirement(context_status),
        ),
        _mac_card(
            "headroom_availability",
            "Headroom availability",
            headroom_status,
            ["headroom_availability", "cache_diagnostics.headroom_diagnostics"],
            required_observation=live_requirement(headroom_status),
        ),
        _mac_card(
            "cache_layout_advice",
            "Cache layout advice",
            advice_status,
            ["cache_layout_advice", "cache_friendliness", "cache_diagnostics.dynamic_prefix_breakers"],
        ),
    ]

    contract: dict[str, Any] = {
        "schema_version": MAC_VISIBILITY_SCHEMA_VERSION,
        "surface_kind": "local_macos_visibility_contract",
        "readiness": readiness,
        "bind_to_top_level_fields": [
            "source_kind",
            "source_freshness",
            "scan_integrity",
            "metric_availability",
            "metric_caveats",
            "redaction_mode",
            "context_availability",
            "headroom_availability",
            "cache_friendliness",
            "cache_diagnostics",
            "cache_layout_advice",
            "totals",
        ],
        "diagnostic_only_fields": ["summary"],
        "primary_cards": primary_cards,
        "missing_live_observations": missing_live_observations,
        "claim_boundaries": [
            "Local transcript observations are not invoice-grade billing records.",
            "Provider cache fields are telemetry, not ContextGuard-caused token reduction and do not prove provider cache hits.",
            "Historical transcript totals do not infer live context headroom or remaining tokens.",
            "This contract does not guarantee token or cost savings.",
        ],
        "redaction_required": True,
    }
    return contract
1799
+
1800
+
1420
1801
  def feasibility_json(
1421
1802
  summary: UsageSummary,
1422
1803
  top: int = 15,
@@ -1433,6 +1814,12 @@ def feasibility_json(
1433
1814
  stable_total_tokens = sum(stable_tokens.values())
1434
1815
  cache_friendliness = cache_friendliness_for_summary(summary)
1435
1816
  cache_diagnostics = cache_diagnostics_for_summary(summary)
1817
+ cache_layout_advice = cache_layout_advice_for_summary(summary)
1818
+ mac_visibility = build_mac_visibility_contract(
1819
+ availability=availability,
1820
+ integrity=integrity,
1821
+ cache_layout_advice=cache_layout_advice,
1822
+ )
1436
1823
  return {
1437
1824
  "schema_version": FEASIBILITY_SCHEMA_VERSION,
1438
1825
  "producer": FEASIBILITY_PRODUCER,
@@ -1452,6 +1839,8 @@ def feasibility_json(
1452
1839
  "headroom_availability",
1453
1840
  "cache_friendliness",
1454
1841
  "cache_diagnostics",
1842
+ "cache_layout_advice",
1843
+ "mac_visibility",
1455
1844
  "totals",
1456
1845
  ],
1457
1846
  "diagnostic_fields": ["summary"],
@@ -1480,6 +1869,8 @@ def feasibility_json(
1480
1869
  "headroom_availability": availability["headroom"],
1481
1870
  "cache_friendliness": cache_friendliness,
1482
1871
  "cache_diagnostics": cache_diagnostics,
1872
+ "cache_layout_advice": cache_layout_advice,
1873
+ "mac_visibility": mac_visibility,
1483
1874
  "totals": {
1484
1875
  "total_tokens": stable_total_tokens,
1485
1876
  "tokens": stable_tokens,
@@ -1531,6 +1922,36 @@ def build_recommendations(summary: UsageSummary, top: int) -> list[dict[str, Any
1531
1922
  input_ratio = input_tokens / total
1532
1923
  cache_friendliness = cache_friendliness_for_summary(summary)
1533
1924
  cache_diagnostics = cache_diagnostics_for_summary(summary)
1925
+ cache_layout_advice = cache_layout_advice_for_summary(summary)
1926
+ if cache_layout_advice.get("observed_issue") == "volatile_prefix_breaker":
1927
+ evidence = {
1928
+ "observed_issue": cache_layout_advice.get("observed_issue"),
1929
+ "priority": cache_layout_advice.get("priority"),
1930
+ "confidence": cache_layout_advice.get("confidence"),
1931
+ "cache_creation_tokens": cache_creation,
1932
+ "cache_read_tokens": cache_read,
1933
+ }
1934
+ observed_summary = cache_layout_advice.get("observed_summary")
1935
+ if isinstance(observed_summary, dict):
1936
+ for key in ("max_prefix_position", "max_prefix_position_volatile_share", "stable_prefix_share", "volatile_prefix_share"):
1937
+ evidence[key] = observed_summary.get(key)
1938
+ rec = recommendation(
1939
+ "prioritize-cache-prefix-stabilization",
1940
+ "Prioritize cache-prefix stabilization before TTL or output trimming",
1941
+ (
1942
+ "Cache creation remains material and redacted segment statistics show a volatile early prefix; "
1943
+ "this is an experiment-prioritization signal, not a confirmed root cause."
1944
+ ),
1945
+ (
1946
+ "If one transcript dominates, split unrelated work into shorter sessions; then check startup/context "
1947
+ "size and keep stable policy before volatile logs, diffs, timestamps, and generated evidence."
1948
+ ),
1949
+ str(cache_layout_advice.get("priority") or "P1"),
1950
+ evidence,
1951
+ )
1952
+ rec["heuristic"] = True
1953
+ rec["confidence"] = cache_layout_advice.get("confidence")
1954
+ recs.append(rec)
1534
1955
  for finding in cache_friendliness.get("findings", []):
1535
1956
  if isinstance(finding, dict) and finding.get("id") == "volatile-content-near-prefix":
1536
1957
  evidence = dict(finding.get("evidence") or {})
@@ -1754,6 +2175,7 @@ def summary_json(
1754
2175
  "top_tools": counter_json(summary.by_tool, top),
1755
2176
  "cache_friendliness": cache_friendliness_for_summary(summary),
1756
2177
  "cache_diagnostics": cache_diagnostics_for_summary(summary),
2178
+ "cache_layout_advice": cache_layout_advice_for_summary(summary),
1757
2179
  }
1758
2180
  if include_recommendations:
1759
2181
  data["recommendations"] = build_recommendations(summary, top)
@@ -1887,6 +2309,26 @@ def main() -> int:
1887
2309
  headroom = cache_diagnostics.get("headroom_diagnostics") or {}
1888
2310
  print(f" headroom_status {headroom.get('status')} ({headroom.get('evidence')})")
1889
2311
 
2312
+ cache_layout_advice = cache_layout_advice_for_summary(summary)
2313
+ if cache_layout_advice.get("status") != "missing" or cache_layout_advice.get("observed_issue") != "unknown":
2314
+ print("\nCache layout advice")
2315
+ print(f" status {cache_layout_advice.get('status')}")
2316
+ print(f" confidence {cache_layout_advice.get('confidence')}")
2317
+ print(f" observed_issue {cache_layout_advice.get('observed_issue')}")
2318
+ print(f" priority {cache_layout_advice.get('priority')}")
2319
+ experiments = cache_layout_advice.get("recommended_experiments") or []
2320
+ if experiments:
2321
+ first = experiments[0]
2322
+ print(f" first_experiment {first.get('id')} ({first.get('priority')})")
2323
+ print(f" experiment_action {first.get('action')}")
2324
+ checks = cache_layout_advice.get("next_checks") or []
2325
+ if checks:
2326
+ first = checks[0]
2327
+ print(f" next_check {first.get('id')}")
2328
+ templates = first.get("command_templates") or []
2329
+ if templates:
2330
+ print(f" command_template {templates[0]}")
2331
+
1890
2332
  model_totals = Counter({model: sum(tokens.values()) for model, tokens in summary.by_model.items()})
1891
2333
  print_counter("By model", model_totals, args.top)
1892
2334
 
@@ -0,0 +1,19 @@
1
+ Fixture-only baseline prompt for learned-compression experiment setup.
2
+
3
+ You are reviewing an already-sanitized context pack. This is synthetic benchmark input only. No learned compressor, latent helper, embedding model, reranker, or provider call is shipped or invoked by this fixture.
4
+
5
+ Sanitized evidence only: private paths, endpoints, screenshots, secrets, raw credentials, and unsanitized logs do not belong in this fixture. Protected evidence no semantic rewrite: protected identifiers, constants, hashes, paths, quoted strings, stack frames, JSON keys, code fences, and diff zones must remain exact or receipt-retrievable.
6
+
7
+ Sanitized context pack:
8
+ - pack id: fixture-pack-alpha
9
+ - source summary: sample_module.py lines 10:42 contain the decision branch
10
+ - protected evidence kept exact: identifier `sample_status`, numeric constant `3`, quoted string `retry`, JSON key `status`, and stack frame label `sample_module:31`
11
+ - omitted source: sample_helper.py lines 1:80
12
+ - exact retrieval fallback: context-guard-pack slice --path sample_helper.py --lines 1:80
13
+
14
+ Task:
15
+ 1. Identify which source should be inspected next.
16
+ 2. Explain which protected evidence must remain exact and not semantically rewritten.
17
+ 3. State that real comparisons require provider-measured primary token/cost fields on matched successful tasks, plus a failure-rate guardrail, human corrections, and shifted-cost accounting.
18
+
19
+ This prompt is dry-run-only fixture scaffolding and does not claim hosted API savings.
@@ -0,0 +1,21 @@
1
+ Fixture-only candidate prompt for learned-compression experiment setup.
2
+
3
+ You are reviewing an already-sanitized compressed digest candidate. This is synthetic benchmark input only. No learned compressor, latent helper, embedding model, reranker, or provider call is shipped or invoked by this fixture.
4
+
5
+ Sanitized evidence only: private paths, endpoints, screenshots, secrets, raw credentials, and unsanitized logs do not belong in this fixture. Protected evidence no semantic rewrite: protected identifiers, constants, hashes, paths, quoted strings, stack frames, JSON keys, code fences, and diff zones must remain exact or receipt-retrievable.
6
+
7
+ Compressed digest candidate:
8
+ - candidate id: fixture-compression-alpha
9
+ - digest summary: sample_module.py branch returns quoted string `retry` after numeric constant `3` attempts
10
+ - protected evidence preserved exactly: identifier `sample_status`, numeric constant `3`, quoted string `retry`, JSON key `status`, and stack frame label `sample_module:31`
11
+ - omitted protected context: sample_helper.py lines 1:80
12
+ - receipt fallback: fixture-receipt-alpha
13
+ - exact retrieval fallback: context-guard-pack slice --path sample_helper.py --lines 1:80
14
+
15
+ Task:
16
+ 1. Decide whether required evidence is exact or receipt-retrievable.
17
+ 2. Identify any protected evidence that would make semantic rewrite unsafe.
18
+ 3. State that digest size, byte ratios, and receipt availability are proxy or retrieval evidence only, not hosted API token or cost savings evidence.
19
+ 4. State that real comparisons require provider-measured primary token/cost fields on matched successful tasks, plus a failure-rate guardrail, human corrections, and shifted-cost accounting.
20
+
21
+ This prompt is dry-run-only fixture scaffolding and does not claim hosted API savings.
@@ -8,7 +8,11 @@
8
8
  "max_budget_usd": 1.0,
9
9
  "allowed_tools": [],
10
10
  "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
11
- "success_cwd": "."
11
+ "success_cwd": ".",
12
+ "variant_prompt_files": {
13
+ "baseline_uncompressed_fixture": "learned-compression-baseline-context-pack.prompt.example.md",
14
+ "fixture_only_learned_compression_candidate": "learned-compression-candidate-digest.prompt.example.md"
15
+ }
12
16
  },
13
17
  {
14
18
  "id": "learned_compression_artifact_digest_fixture",
@@ -0,0 +1,20 @@
1
+ Fixture-only raw-output prompt for reversible output-transform A/B setup.
2
+
3
+ You are reviewing an already-sanitized command transcript. Treat this as synthetic benchmark input only.
4
+
5
+ Raw sanitized command output:
6
+ - command: python3 -m unittest sample_suite
7
+ - status: failed
8
+ - summary: one assertion failed in sample_test_alpha
9
+ - excerpt line 01: expected status ok
10
+ - excerpt line 02: actual status retry
11
+ - excerpt line 03: sanitized stack frame in sample_module
12
+ - excerpt line 04: sanitized assertion message
13
+ - excerpt line 05: sanitized context marker
14
+
15
+ Task:
16
+ 1. Identify the failing command and failing check.
17
+ 2. Explain whether the visible raw output is enough to diagnose the synthetic failure.
18
+ 3. State that real token or cost comparisons require provider-measured telemetry on matched successful tasks, a failure-rate guardrail, human corrections, and shifted-cost accounting.
19
+
20
+ This prompt is not shipped benchmark evidence and does not claim hosted API savings.
@@ -0,0 +1,23 @@
1
+ Fixture-only digest plus artifact receipt prompt for reversible output-transform A/B setup.
2
+
3
+ You are reviewing an already-sanitized digest and receipt. Treat this as synthetic benchmark input only.
4
+
5
+ Digest of sanitized command output:
6
+ - command: python3 -m unittest sample_suite
7
+ - status: failed
8
+ - failure summary: sample_test_alpha expected ok but saw retry
9
+ - omitted sanitized lines: 5
10
+
11
+ Artifact receipt:
12
+ - artifact id: fixture-artifact-alpha
13
+ - digest id: fixture-digest-alpha
14
+ - exact re-expand command: context-guard-artifact show fixture-artifact-alpha
15
+ - re-expand expectation: retrieves the omitted sanitized lines exactly from a user-supplied local artifact store
16
+
17
+ Task:
18
+ 1. Identify the failing command and failing check.
19
+ 2. Describe which exact re-expand step would retrieve the omitted sanitized lines.
20
+ 3. State that artifact receipt metadata and byte counts are retrieval or proxy evidence only, not token or cost savings evidence.
21
+ 4. State that real comparisons require provider-measured telemetry on matched successful tasks, a failure-rate guardrail, human corrections, and shifted-cost accounting.
22
+
23
+ This prompt is dry-run-only fixture scaffolding and does not claim hosted API savings.
@@ -0,0 +1,28 @@
1
+ [
2
+ {
3
+ "id": "output_transform_trim_digest_fixture",
4
+ "prompt": "Fixture-only synthetic reversible output-transform task. Compare a placeholder raw command log with a digest plus artifact receipt and answer whether omitted sanitized lines can be exactly re-expanded. This fixture does not run a provider, trim command output, or fetch artifacts; future real runs must supply sanitized raw and digest evidence, artifact receipt metadata, provider-measured token/cost telemetry, matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
5
+ "model": "sonnet",
6
+ "effort": "medium",
7
+ "max_turns": 3,
8
+ "max_budget_usd": 1.0,
9
+ "allowed_tools": [],
10
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
11
+ "success_cwd": ".",
12
+ "variant_prompt_files": {
13
+ "baseline_raw_output_fixture": "output-transform-baseline-raw-output.prompt.example.md",
14
+ "fixture_only_digest_artifact_receipt": "output-transform-digest-receipt.prompt.example.md"
15
+ }
16
+ },
17
+ {
18
+ "id": "output_transform_failure_summary_fixture",
19
+ "prompt": "Fixture-only synthetic reversible output-transform task. Given a placeholder failure summary and a receipt-backed sanitized output handle, identify the failing command and describe which exact re-expand step would retrieve the omitted context. This fixture is dry-run-only until prompts, success checks, provider-measured primary token/cost fields, human corrections, and shifted-cost accounting are supplied for matched successful tasks.",
20
+ "model": "sonnet",
21
+ "effort": "medium",
22
+ "max_turns": 3,
23
+ "max_budget_usd": 1.0,
24
+ "allowed_tools": [],
25
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
26
+ "success_cwd": "."
27
+ }
28
+ ]
@@ -0,0 +1,10 @@
1
+ [
2
+ {
3
+ "name": "baseline_raw_output_fixture",
4
+ "extra_args": []
5
+ },
6
+ {
7
+ "name": "fixture_only_digest_artifact_receipt",
8
+ "extra_args": []
9
+ }
10
+ ]