@ictechgy/context-guard 0.4.3 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/README.ko.md +16 -3
- package/README.md +13 -3
- package/context-guard-kit/README.md +2 -2
- package/context-guard-kit/benchmark_runner.py +244 -6
- package/context-guard-kit/claude_transcript_cost_audit.py +443 -1
- package/docs/benchmark-fixtures/learned-compression-baseline-context-pack.prompt.example.md +19 -0
- package/docs/benchmark-fixtures/learned-compression-candidate-digest.prompt.example.md +21 -0
- package/docs/benchmark-fixtures/learned-compression.tasks.example.json +5 -1
- package/docs/benchmark-fixtures/output-transform-baseline-raw-output.prompt.example.md +20 -0
- package/docs/benchmark-fixtures/output-transform-digest-receipt.prompt.example.md +23 -0
- package/docs/benchmark-fixtures/output-transform.tasks.example.json +28 -0
- package/docs/benchmark-fixtures/output-transform.variants.example.json +10 -0
- package/docs/benchmark-fixtures/visual-ocr-cropped-ocr.prompt.example.md +22 -0
- package/docs/benchmark-fixtures/visual-ocr-full-visual.prompt.example.md +19 -0
- package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +5 -1
- package/docs/benchmark-workflow-examples.md +6 -2
- package/docs/benchmark-workflows/self-hosted-metrics-ledger.example.jsonl +1 -0
- package/docs/cache-diagnostics-schema.md +25 -4
- package/docs/experimental-benchmark-fixtures.md +17 -6
- package/docs/mac-visibility-feasibility-schema.md +62 -0
- package/docs/mac-visibility-feasibility.example.json +130 -0
- package/package.json +5 -1
- package/packaging/homebrew/context-guard.rb.template +1 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +3 -3
- package/plugins/context-guard/README.md +3 -3
- package/plugins/context-guard/bin/context-guard-audit +443 -1
- package/plugins/context-guard/bin/context-guard-bench +244 -6
|
@@ -46,9 +46,11 @@ COST_KEYS = ("total_cost_usd", "cost_usd", "costUSD")
|
|
|
46
46
|
MODEL_KEYS = ("model", "model_id", "modelId")
|
|
47
47
|
QUERY_SOURCE_KEYS = ("query_source", "querySource")
|
|
48
48
|
TIMESTAMP_KEYS = ("timestamp", "created_at", "createdAt", "time", "ts")
|
|
49
|
-
FEASIBILITY_SCHEMA_VERSION = "contextguard.metric-feasibility.v1.
|
|
49
|
+
FEASIBILITY_SCHEMA_VERSION = "contextguard.metric-feasibility.v1.3"
|
|
50
|
+
MAC_VISIBILITY_SCHEMA_VERSION = "contextguard.mac-visibility.v1"
|
|
50
51
|
FEASIBILITY_PRODUCER = "context-guard-audit"
|
|
51
52
|
CACHE_DIAGNOSTICS_SCHEMA_VERSION = "contextguard.cache-diagnostics.v1"
|
|
53
|
+
CACHE_LAYOUT_ADVICE_SCHEMA_VERSION = "contextguard.cache-layout-advice.v1"
|
|
52
54
|
MAX_ERROR_EXAMPLES = 20
|
|
53
55
|
JSON_PARSE_RECURSION_LIMIT = 10_000
|
|
54
56
|
READ_CHUNK_BYTES = 64 * 1024
|
|
@@ -184,6 +186,7 @@ class UsageSummary:
|
|
|
184
186
|
prompt_cache_audit: PromptCacheAudit = field(default_factory=PromptCacheAudit)
|
|
185
187
|
cache_friendliness_cache: dict[str, Any] | None = field(default=None, init=False, repr=False)
|
|
186
188
|
cache_diagnostics_cache: dict[str, Any] | None = field(default=None, init=False, repr=False)
|
|
189
|
+
cache_layout_advice_cache: dict[str, Any] | None = field(default=None, init=False, repr=False)
|
|
187
190
|
|
|
188
191
|
@property
|
|
189
192
|
def total_tokens(self) -> int:
|
|
@@ -1398,6 +1401,222 @@ def cache_diagnostics_for_summary(summary: UsageSummary) -> dict[str, Any]:
|
|
|
1398
1401
|
return build_cache_diagnostics(summary)
|
|
1399
1402
|
|
|
1400
1403
|
|
|
1404
|
+
def _dominant_transcript(summary: UsageSummary) -> dict[str, Any] | None:
|
|
1405
|
+
if summary.total_tokens <= 0 or not summary.by_file:
|
|
1406
|
+
return None
|
|
1407
|
+
_label, tokens = summary.by_file.most_common(1)[0]
|
|
1408
|
+
share = tokens / summary.total_tokens if summary.total_tokens else 0.0
|
|
1409
|
+
return {
|
|
1410
|
+
"tokens": tokens,
|
|
1411
|
+
"share": round(share, 4),
|
|
1412
|
+
"dominates": share >= 0.20 and tokens >= 1_000,
|
|
1413
|
+
}
|
|
1414
|
+
|
|
1415
|
+
|
|
1416
|
+
def _first_dynamic_breaker(cache_diagnostics: dict[str, Any]) -> dict[str, Any] | None:
|
|
1417
|
+
breakers = cache_diagnostics.get("dynamic_prefix_breakers") or []
|
|
1418
|
+
if not breakers:
|
|
1419
|
+
return None
|
|
1420
|
+
first = breakers[0]
|
|
1421
|
+
return first if isinstance(first, dict) else None
|
|
1422
|
+
|
|
1423
|
+
|
|
1424
|
+
def build_cache_layout_advice(summary: UsageSummary) -> dict[str, Any]:
    """Build heuristic cache-layout advice for *summary* and memoize it.

    The advice dict is stored on ``summary.cache_layout_advice_cache``; later
    calls return that same object without recomputing.

    Args:
        summary: Usage summary. This function reads its token counters,
            ``token_field_presence``, ``prompt_cache_audit.samples``, the
            skipped/parse-error fields, and the ``cache_amortization`` /
            ``cache_amortization_defined`` attributes.

    Returns:
        A dict with ``schema_version``, ``status``, ``confidence``,
        ``heuristic``, ``observed_issue``, ``priority``, ``observed_summary``,
        ``hypothesized_causes``, ``corroborated_causes``, ``next_checks``,
        ``recommended_experiments`` and ``caveats``.
    """
    if summary.cache_layout_advice_cache is not None:
        return summary.cache_layout_advice_cache

    cache_friendliness = cache_friendliness_for_summary(summary)
    cache_diagnostics = cache_diagnostics_for_summary(summary)
    signals = cache_friendliness.get("signals") if isinstance(cache_friendliness.get("signals"), dict) else {}
    dynamic_breaker = _first_dynamic_breaker(cache_diagnostics)
    dominant = _dominant_transcript(summary)
    cache_creation = summary.tokens.get("cache_creation", 0)
    cache_read = summary.tokens.get("cache_read", 0)
    cache_fields = cache_diagnostics.get("observations", {}).get("cache_fields", {}) if isinstance(cache_diagnostics.get("observations"), dict) else {}
    cache_status = cache_fields.get("status") if isinstance(cache_fields, dict) else None
    stable_prefix_share = signals.get("stable_prefix_share")
    volatile_prefix_share = signals.get("volatile_prefix_share")
    volatile_tail_share = signals.get("volatile_tail_share")
    # Prefer the first dynamic breaker's own numbers; fall back to the
    # cache-friendliness signal only for the volatile share.
    max_prefix_position = dynamic_breaker.get("position") if dynamic_breaker else None
    max_prefix_position_volatile_share = dynamic_breaker.get("volatile_share") if dynamic_breaker else signals.get("max_prefix_position_volatile_share")

    # Defaults describe the "nothing observed" case.
    status = "missing"
    confidence = "unavailable"
    observed_issue = "unknown"
    priority = "P2"
    hypothesized_causes: list[dict[str, Any]] = []
    # Never appended to in this function; it is emitted as an empty list.
    corroborated_causes: list[dict[str, Any]] = []
    next_checks: list[dict[str, Any]] = []
    recommended_experiments: list[dict[str, Any]] = []

    has_cache_any = bool(
        summary.token_field_presence.get("cache_read", 0)
        or summary.token_field_presence.get("cache_creation", 0)
    )
    has_prompt_samples = bool(summary.prompt_cache_audit.samples)
    if has_cache_any or has_prompt_samples:
        # Any gap (no prompt samples, partial upstream reports, skipped or
        # unparsable input) downgrades the advice to "partial".
        status = "partial" if (
            not has_prompt_samples
            or cache_friendliness.get("status") == "partial"
            or cache_diagnostics.get("status") == "partial"
            or summary.skipped_files
            or summary.skipped_records
            or summary.parse_errors
        ) else "available"
        confidence = "partial" if status == "partial" else "hypothesis"

    # A breaker counts only when cache creation was observed and the breaker is
    # at position 0/1 or its volatile share reaches the threshold.
    volatile_prefix_breaker = bool(
        dynamic_breaker
        and cache_creation > 0
        and (max_prefix_position in {0, 1} or (max_prefix_position_volatile_share or 0) >= PROMPT_PREFIX_VOLATILE_THRESHOLD)
    )
    long_session_dominates = bool(dominant and dominant.get("dominates"))

    # Exactly one observed issue is chosen, in this order of precedence.
    if volatile_prefix_breaker:
        observed_issue = "volatile_prefix_breaker"
        priority = "P0" if cache_creation >= 50_000 and max_prefix_position in {0, 1} else "P1"
        hypothesized_causes.append({
            "id": "prefix-position-churn",
            "confidence": confidence,
            "evidence": EVIDENCE_INFERRED,
            "reason": (
                "A highly volatile redacted prompt segment appears in the early prefix window; "
                "this identifies a layout issue, not a confirmed source."
            ),
            "next_check": "Check whether startup context, generated evidence, or tool/MCP catalog changes are moving before stable policy.",
        })
        if cache_diagnostics.get("stable_prefix_candidates"):
            hypothesized_causes.append({
                "id": "evidence-before-policy",
                "confidence": confidence,
                "evidence": EVIDENCE_INFERRED,
                "reason": (
                    "Stable reusable segments appear elsewhere while the early prefix churns; "
                    "check whether logs, diffs, timestamps, or file evidence precede stable instructions."
                ),
                "next_check": "Keep stable policy/instructions first and move generated run evidence later.",
            })
        # NOTE(review): presumably added for every volatile-prefix breaker, not
        # only when stable candidates exist — confirm nesting against upstream.
        next_checks.append({
            "id": "inspect-startup-context-size",
            "confidence": "hypothesis",
            "command_templates": [
                "context-guard-diet scan <repo>",
                "context-guard-diet structural-waste <repo>",
            ],
            "evidence_required_for_corroboration": (
                "Large or duplicate CLAUDE.md/AGENTS.md/GEMINI.md findings from diet output."
            ),
        })
    elif long_session_dominates:
        observed_issue = "long_session_accumulation"
        priority = "P1"
    # NOTE(review): unlike the TTL check further down, this branch does not test
    # ``cache_amortization_defined`` — confirm that is intended.
    elif cache_creation >= 10_000 and cache_read > 0 and summary.cache_amortization < 0.5:
        observed_issue = "low_cache_reuse"
        priority = "P1"
    elif cache_status == "missing" or not has_cache_any:
        observed_issue = "missing_cache_fields"
        priority = "P2"

    # Experiments are collected independently of the chain above, so a dominant
    # transcript still yields its experiment when a prefix breaker was chosen as
    # the observed issue. "order" is the 1-based position in the list.
    if long_session_dominates:
        recommended_experiments.append({
            "id": "split-long-sessions",
            "order": len(recommended_experiments) + 1,
            "priority": "P1",
            "effort": "low",
            "action": "Use /clear between unrelated tasks and /compact focus on changed files, failing tests, and remaining TODO during long work.",
            "expected_signal": "Cache creation per comparable task decreases and one transcript no longer dominates observed tokens.",
            "verification": "Re-run context-guard-audit on a comparable window and compare cache_creation, cache_amortization, and top transcript share.",
            "evidence": dominant or {},
        })
    if volatile_prefix_breaker:
        recommended_experiments.append({
            "id": "stabilize-cache-prefix",
            "order": len(recommended_experiments) + 1,
            "priority": priority,
            "effort": "medium",
            "action": "Keep stable reusable instructions/policy before volatile logs, diffs, timestamps, and generated file evidence.",
            "expected_signal": "Stable prefix share rises and volatile prefix share falls on matched audit windows.",
            "verification": "Re-run context-guard-audit --json --recommend and compare cache_layout_advice plus cache_friendliness signals.",
            "evidence": {
                "dynamic_prefix_breaker_position": max_prefix_position,
                "dynamic_prefix_breaker_volatile_share": max_prefix_position_volatile_share,
            },
        })
        recommended_experiments.append({
            "id": "run-context-diet-checks",
            "order": len(recommended_experiments) + 1,
            "priority": "P1",
            "effort": "low",
            "action": "Run the generated diet command templates and treat any large/duplicate context-file findings as corroborating evidence before editing instructions.",
            "expected_signal": "Diet output identifies or rules out oversized/duplicated startup context as a contributor.",
            "verification": "Record diet JSON separately; do not convert prefix-position evidence alone into a confirmed startup-context cause.",
            "command_templates": [
                "context-guard-diet scan <repo> --json > diet.json",
                "context-guard-diet structural-waste <repo> --json > structural-waste.json",
            ],
        })
    if cache_creation >= 50_000 and summary.cache_amortization_defined and 1.0 <= summary.cache_amortization < 5.0:
        recommended_experiments.append({
            "id": "defer-longer-ttl-until-prefix-stable" if volatile_prefix_breaker else "evaluate-longer-ttl-after-stability-check",
            "order": len(recommended_experiments) + 1,
            "priority": "P2",
            "effort": "medium",
            "action": "Treat longer TTL as secondary; first corroborate stable prefix reuse and current provider TTL/pricing behavior.",
            "expected_signal": "TTL evaluation happens only after prefix volatility is reduced or ruled out.",
            "verification": "Use timestamped cache telemetry and provider-measured billing/cost evidence; historical token totals alone are insufficient.",
        })
    # Fallback next checks apply only when no experiment could be recommended.
    if not recommended_experiments and status == "partial":
        next_checks.append({
            "id": "rerun-narrower-audit",
            "confidence": "partial",
            "command_templates": ["context-guard-audit <transcript-or-project-dir> --json --recommend"],
            "evidence_required_for_corroboration": "Enough uncapped prompt/cache records to classify prefix layout.",
        })
    if not recommended_experiments and observed_issue == "missing_cache_fields":
        next_checks.append({
            "id": "collect-cache-telemetry",
            "confidence": "unavailable",
            "command_templates": ["context-guard-audit ~/.claude/projects --json --recommend"],
            "evidence_required_for_corroboration": "Transcript records with cache_read/cache_creation fields.",
        })

    advice = {
        "schema_version": CACHE_LAYOUT_ADVICE_SCHEMA_VERSION,
        "status": status,
        "confidence": confidence,
        "heuristic": True,
        "observed_issue": observed_issue,
        "priority": priority,
        "observed_summary": {
            "cache_creation_tokens": cache_creation,
            "cache_read_tokens": cache_read,
            "cache_amortization": round(summary.cache_amortization, 4) if summary.cache_amortization_defined else None,
            "stable_prefix_share": stable_prefix_share,
            "volatile_prefix_share": volatile_prefix_share,
            "volatile_tail_share": volatile_tail_share,
            "max_prefix_position": max_prefix_position,
            "max_prefix_position_volatile_share": max_prefix_position_volatile_share,
            "dominant_transcript_share": dominant.get("share") if dominant else None,
        },
        "hypothesized_causes": hypothesized_causes,
        "corroborated_causes": corroborated_causes,
        "next_checks": next_checks,
        "recommended_experiments": recommended_experiments,
        "caveats": [
            "Cache layout advice is a local transcript heuristic, not billing authority or provider-cache proof.",
            "Observed issues come from cache fields and redacted segment statistics; causes remain hypotheses until corroborated by diet/structural evidence.",
            "Generated command templates use placeholders and must not be treated as observed user commands or paths.",
            "Use matched before/after audits before making token or cost savings claims.",
        ],
    }
    # Memoize so repeated callers share one advice object.
    summary.cache_layout_advice_cache = advice
    return advice
|
|
1614
|
+
|
|
1615
|
+
|
|
1616
|
+
def cache_layout_advice_for_summary(summary: UsageSummary) -> dict[str, Any]:
    """Return cache-layout advice for *summary* by delegating to ``build_cache_layout_advice``."""
    return build_cache_layout_advice(summary)
|
|
1618
|
+
|
|
1619
|
+
|
|
1401
1620
|
def build_metric_caveats(summary: UsageSummary) -> list[str]:
|
|
1402
1621
|
caveats = [
|
|
1403
1622
|
"Values are observed from local Claude Code transcript JSON/JSONL fields and are not official billing records.",
|
|
@@ -1417,6 +1636,168 @@ def build_metric_caveats(summary: UsageSummary) -> list[str]:
|
|
|
1417
1636
|
return caveats
|
|
1418
1637
|
|
|
1419
1638
|
|
|
1639
|
+
def _mac_card(
|
|
1640
|
+
card_id: str,
|
|
1641
|
+
title: str,
|
|
1642
|
+
status: str,
|
|
1643
|
+
binding_paths: list[str],
|
|
1644
|
+
*,
|
|
1645
|
+
required_observation: str | None = None,
|
|
1646
|
+
) -> dict[str, Any]:
|
|
1647
|
+
card: dict[str, Any] = {
|
|
1648
|
+
"id": card_id,
|
|
1649
|
+
"title": title,
|
|
1650
|
+
"status": status,
|
|
1651
|
+
"binding_paths": binding_paths,
|
|
1652
|
+
}
|
|
1653
|
+
if required_observation:
|
|
1654
|
+
card["required_observation"] = required_observation
|
|
1655
|
+
return card
|
|
1656
|
+
|
|
1657
|
+
|
|
1658
|
+
def _mac_section_status(section: Any, default: str) -> str:
    """Return ``section["status"]`` as a string, or *default* when it is unusable.

    *default* is returned when *section* is not a dict, or when its status is
    absent or ``None``, so a null status never becomes the literal ``"None"``.
    """
    if not isinstance(section, dict):
        return default
    value = section.get("status")
    return default if value is None else str(value)


def build_mac_visibility_contract(
    *,
    availability: dict[str, Any],
    integrity: dict[str, Any],
    cache_layout_advice: dict[str, Any],
) -> dict[str, Any]:
    """Build the pre-GUI macOS visibility binding contract.

    This is intentionally a thin index over already-emitted stable feasibility
    fields. It does not recompute metrics, read diagnostic summary data, or infer
    live context/headroom from historical transcript totals.

    Args:
        availability: Mapping whose ``tokens``, ``context``, ``headroom``,
            ``cache`` and ``cost`` entries each carry a ``status``; a missing
            or null status is treated as ``"missing"``.
        integrity: Scan-integrity mapping; a missing or null ``status`` is
            treated as ``"partial"``.
        cache_layout_advice: Advice mapping; a missing or null ``status`` is
            treated as ``"missing"``.

    Returns:
        The contract dict: schema version, readiness, the top-level fields to
        bind to, the primary cards, any missing live observations, and the
        claim boundaries.
    """
    token_status = _mac_section_status(availability.get("tokens"), "missing")
    scan_status = _mac_section_status(integrity, "partial")
    if token_status == "available" and scan_status == "complete":
        readiness_status = "ready"
        readiness_reason = "Transcript token totals are available and the scan completed within configured limits."
    elif token_status in {"available", "partial"}:
        readiness_status = "partial"
        readiness_reason = "Some stable fields can be shown, but scan integrity or metric availability is partial."
    else:
        readiness_status = "missing"
        readiness_reason = "Token totals are missing from the transcript scan; show setup or unavailable state."

    context_status = _mac_section_status(availability.get("context"), "missing")
    headroom_status = _mac_section_status(availability.get("headroom"), "missing")
    cache_status = _mac_section_status(availability.get("cache"), "missing")
    cost_status = _mac_section_status(availability.get("cost"), "missing")
    advice_status = _mac_section_status(cache_layout_advice, "missing")

    # Context and headroom need a live snapshot; record each one that is
    # missing so a consumer can name the observation it still requires.
    missing_live_observations: list[dict[str, Any]] = []
    if context_status == "missing":
        missing_live_observations.append({
            "id": "live_context_window",
            "required_observation": "live_statusline_snapshot",
            "affects": ["context_availability", "metric_availability.context"],
            "reason": "Historical transcript scans do not include live Claude Code context_window data.",
        })
    if headroom_status == "missing":
        missing_live_observations.append({
            "id": "live_headroom",
            "required_observation": "live_statusline_snapshot",
            "affects": ["headroom_availability", "cache_diagnostics.headroom_diagnostics"],
            "reason": "Historical transcript totals are not remaining-token or live headroom observations.",
        })

    return {
        "schema_version": MAC_VISIBILITY_SCHEMA_VERSION,
        "surface_kind": "local_macos_visibility_contract",
        "readiness": {
            "status": readiness_status,
            "reason": readiness_reason,
        },
        "bind_to_top_level_fields": [
            "source_kind",
            "source_freshness",
            "scan_integrity",
            "metric_availability",
            "metric_caveats",
            "redaction_mode",
            "context_availability",
            "headroom_availability",
            "cache_friendliness",
            "cache_diagnostics",
            "cache_layout_advice",
            "totals",
        ],
        "diagnostic_only_fields": ["summary"],
        "primary_cards": [
            _mac_card(
                "source_freshness",
                "Source freshness",
                "available",
                ["source_kind", "source_freshness.status", "source_freshness.generated_at"],
            ),
            _mac_card(
                "scan_integrity",
                "Scan integrity",
                scan_status,
                [
                    "scan_integrity.status",
                    "scan_integrity.files_scanned",
                    "scan_integrity.records_scanned",
                    "scan_integrity.skipped_files",
                    "scan_integrity.skipped_records",
                ],
            ),
            _mac_card(
                "token_totals",
                "Token totals",
                token_status,
                [
                    "totals.total_tokens",
                    "totals.tokens.input",
                    "totals.tokens.output",
                    "totals.tokens.cache_read",
                    "totals.tokens.cache_creation",
                ],
            ),
            _mac_card(
                "cache_reuse",
                "Cache-read share and reuse ratio",
                cache_status,
                ["totals.cache_read_share", "totals.cache_reuse_ratio", "metric_availability.cache"],
            ),
            _mac_card(
                "observed_cost",
                "Observed transcript cost",
                cost_status,
                ["totals.cost_usd_observed", "metric_availability.cost"],
            ),
            _mac_card(
                "context_availability",
                "Context availability",
                context_status,
                ["context_availability", "metric_availability.context"],
                required_observation="live_statusline_snapshot" if context_status == "missing" else None,
            ),
            _mac_card(
                "headroom_availability",
                "Headroom availability",
                headroom_status,
                ["headroom_availability", "cache_diagnostics.headroom_diagnostics"],
                required_observation="live_statusline_snapshot" if headroom_status == "missing" else None,
            ),
            _mac_card(
                "cache_layout_advice",
                "Cache layout advice",
                advice_status,
                ["cache_layout_advice", "cache_friendliness", "cache_diagnostics.dynamic_prefix_breakers"],
            ),
        ],
        "missing_live_observations": missing_live_observations,
        "claim_boundaries": [
            "Local transcript observations are not invoice-grade billing records.",
            "Provider cache fields are telemetry, not ContextGuard-caused token reduction and do not prove provider cache hits.",
            "Historical transcript totals do not infer live context headroom or remaining tokens.",
            "This contract does not guarantee token or cost savings.",
        ],
        "redaction_required": True,
    }
|
|
1799
|
+
|
|
1800
|
+
|
|
1420
1801
|
def feasibility_json(
|
|
1421
1802
|
summary: UsageSummary,
|
|
1422
1803
|
top: int = 15,
|
|
@@ -1433,6 +1814,12 @@ def feasibility_json(
|
|
|
1433
1814
|
stable_total_tokens = sum(stable_tokens.values())
|
|
1434
1815
|
cache_friendliness = cache_friendliness_for_summary(summary)
|
|
1435
1816
|
cache_diagnostics = cache_diagnostics_for_summary(summary)
|
|
1817
|
+
cache_layout_advice = cache_layout_advice_for_summary(summary)
|
|
1818
|
+
mac_visibility = build_mac_visibility_contract(
|
|
1819
|
+
availability=availability,
|
|
1820
|
+
integrity=integrity,
|
|
1821
|
+
cache_layout_advice=cache_layout_advice,
|
|
1822
|
+
)
|
|
1436
1823
|
return {
|
|
1437
1824
|
"schema_version": FEASIBILITY_SCHEMA_VERSION,
|
|
1438
1825
|
"producer": FEASIBILITY_PRODUCER,
|
|
@@ -1452,6 +1839,8 @@ def feasibility_json(
|
|
|
1452
1839
|
"headroom_availability",
|
|
1453
1840
|
"cache_friendliness",
|
|
1454
1841
|
"cache_diagnostics",
|
|
1842
|
+
"cache_layout_advice",
|
|
1843
|
+
"mac_visibility",
|
|
1455
1844
|
"totals",
|
|
1456
1845
|
],
|
|
1457
1846
|
"diagnostic_fields": ["summary"],
|
|
@@ -1480,6 +1869,8 @@ def feasibility_json(
|
|
|
1480
1869
|
"headroom_availability": availability["headroom"],
|
|
1481
1870
|
"cache_friendliness": cache_friendliness,
|
|
1482
1871
|
"cache_diagnostics": cache_diagnostics,
|
|
1872
|
+
"cache_layout_advice": cache_layout_advice,
|
|
1873
|
+
"mac_visibility": mac_visibility,
|
|
1483
1874
|
"totals": {
|
|
1484
1875
|
"total_tokens": stable_total_tokens,
|
|
1485
1876
|
"tokens": stable_tokens,
|
|
@@ -1531,6 +1922,36 @@ def build_recommendations(summary: UsageSummary, top: int) -> list[dict[str, Any
|
|
|
1531
1922
|
input_ratio = input_tokens / total
|
|
1532
1923
|
cache_friendliness = cache_friendliness_for_summary(summary)
|
|
1533
1924
|
cache_diagnostics = cache_diagnostics_for_summary(summary)
|
|
1925
|
+
cache_layout_advice = cache_layout_advice_for_summary(summary)
|
|
1926
|
+
if cache_layout_advice.get("observed_issue") == "volatile_prefix_breaker":
|
|
1927
|
+
evidence = {
|
|
1928
|
+
"observed_issue": cache_layout_advice.get("observed_issue"),
|
|
1929
|
+
"priority": cache_layout_advice.get("priority"),
|
|
1930
|
+
"confidence": cache_layout_advice.get("confidence"),
|
|
1931
|
+
"cache_creation_tokens": cache_creation,
|
|
1932
|
+
"cache_read_tokens": cache_read,
|
|
1933
|
+
}
|
|
1934
|
+
observed_summary = cache_layout_advice.get("observed_summary")
|
|
1935
|
+
if isinstance(observed_summary, dict):
|
|
1936
|
+
for key in ("max_prefix_position", "max_prefix_position_volatile_share", "stable_prefix_share", "volatile_prefix_share"):
|
|
1937
|
+
evidence[key] = observed_summary.get(key)
|
|
1938
|
+
rec = recommendation(
|
|
1939
|
+
"prioritize-cache-prefix-stabilization",
|
|
1940
|
+
"Prioritize cache-prefix stabilization before TTL or output trimming",
|
|
1941
|
+
(
|
|
1942
|
+
"Cache creation remains material and redacted segment statistics show a volatile early prefix; "
|
|
1943
|
+
"this is an experiment-prioritization signal, not a confirmed root cause."
|
|
1944
|
+
),
|
|
1945
|
+
(
|
|
1946
|
+
"If one transcript dominates, split unrelated work into shorter sessions; then check startup/context "
|
|
1947
|
+
"size and keep stable policy before volatile logs, diffs, timestamps, and generated evidence."
|
|
1948
|
+
),
|
|
1949
|
+
str(cache_layout_advice.get("priority") or "P1"),
|
|
1950
|
+
evidence,
|
|
1951
|
+
)
|
|
1952
|
+
rec["heuristic"] = True
|
|
1953
|
+
rec["confidence"] = cache_layout_advice.get("confidence")
|
|
1954
|
+
recs.append(rec)
|
|
1534
1955
|
for finding in cache_friendliness.get("findings", []):
|
|
1535
1956
|
if isinstance(finding, dict) and finding.get("id") == "volatile-content-near-prefix":
|
|
1536
1957
|
evidence = dict(finding.get("evidence") or {})
|
|
@@ -1754,6 +2175,7 @@ def summary_json(
|
|
|
1754
2175
|
"top_tools": counter_json(summary.by_tool, top),
|
|
1755
2176
|
"cache_friendliness": cache_friendliness_for_summary(summary),
|
|
1756
2177
|
"cache_diagnostics": cache_diagnostics_for_summary(summary),
|
|
2178
|
+
"cache_layout_advice": cache_layout_advice_for_summary(summary),
|
|
1757
2179
|
}
|
|
1758
2180
|
if include_recommendations:
|
|
1759
2181
|
data["recommendations"] = build_recommendations(summary, top)
|
|
@@ -1887,6 +2309,26 @@ def main() -> int:
|
|
|
1887
2309
|
headroom = cache_diagnostics.get("headroom_diagnostics") or {}
|
|
1888
2310
|
print(f" headroom_status {headroom.get('status')} ({headroom.get('evidence')})")
|
|
1889
2311
|
|
|
2312
|
+
cache_layout_advice = cache_layout_advice_for_summary(summary)
|
|
2313
|
+
if cache_layout_advice.get("status") != "missing" or cache_layout_advice.get("observed_issue") != "unknown":
|
|
2314
|
+
print("\nCache layout advice")
|
|
2315
|
+
print(f" status {cache_layout_advice.get('status')}")
|
|
2316
|
+
print(f" confidence {cache_layout_advice.get('confidence')}")
|
|
2317
|
+
print(f" observed_issue {cache_layout_advice.get('observed_issue')}")
|
|
2318
|
+
print(f" priority {cache_layout_advice.get('priority')}")
|
|
2319
|
+
experiments = cache_layout_advice.get("recommended_experiments") or []
|
|
2320
|
+
if experiments:
|
|
2321
|
+
first = experiments[0]
|
|
2322
|
+
print(f" first_experiment {first.get('id')} ({first.get('priority')})")
|
|
2323
|
+
print(f" experiment_action {first.get('action')}")
|
|
2324
|
+
checks = cache_layout_advice.get("next_checks") or []
|
|
2325
|
+
if checks:
|
|
2326
|
+
first = checks[0]
|
|
2327
|
+
print(f" next_check {first.get('id')}")
|
|
2328
|
+
templates = first.get("command_templates") or []
|
|
2329
|
+
if templates:
|
|
2330
|
+
print(f" command_template {templates[0]}")
|
|
2331
|
+
|
|
1890
2332
|
model_totals = Counter({model: sum(tokens.values()) for model, tokens in summary.by_model.items()})
|
|
1891
2333
|
print_counter("By model", model_totals, args.top)
|
|
1892
2334
|
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Fixture-only baseline prompt for learned-compression experiment setup.
|
|
2
|
+
|
|
3
|
+
You are reviewing an already-sanitized context pack. This is synthetic benchmark input only. No learned compressor, latent helper, embedding model, reranker, or provider call is shipped or invoked by this fixture.
|
|
4
|
+
|
|
5
|
+
Sanitized evidence only: private paths, endpoints, screenshots, secrets, raw credentials, and unsanitized logs do not belong in this fixture. Protected evidence no semantic rewrite: protected identifiers, constants, hashes, paths, quoted strings, stack frames, JSON keys, code fences, and diff zones must remain exact or receipt-retrievable.
|
|
6
|
+
|
|
7
|
+
Sanitized context pack:
|
|
8
|
+
- pack id: fixture-pack-alpha
|
|
9
|
+
- source summary: sample_module.py lines 10:42 contain the decision branch
|
|
10
|
+
- protected evidence kept exact: identifier `sample_status`, numeric constant `3`, quoted string `retry`, JSON key `status`, and stack frame label `sample_module:31`
|
|
11
|
+
- omitted source: sample_helper.py lines 1:80
|
|
12
|
+
- exact retrieval fallback: context-guard-pack slice --path sample_helper.py --lines 1:80
|
|
13
|
+
|
|
14
|
+
Task:
|
|
15
|
+
1. Identify which source should be inspected next.
|
|
16
|
+
2. Explain which protected evidence must remain exact and not semantically rewritten.
|
|
17
|
+
3. State that real comparisons require provider-measured primary token/cost fields on matched successful tasks, plus a failure-rate guardrail, human corrections, and shifted-cost accounting.
|
|
18
|
+
|
|
19
|
+
This prompt is dry-run-only fixture scaffolding and does not claim hosted API savings.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Fixture-only candidate prompt for learned-compression experiment setup.
|
|
2
|
+
|
|
3
|
+
You are reviewing an already-sanitized compressed digest candidate. This is synthetic benchmark input only. No learned compressor, latent helper, embedding model, reranker, or provider call is shipped or invoked by this fixture.
|
|
4
|
+
|
|
5
|
+
Sanitized evidence only: private paths, endpoints, screenshots, secrets, raw credentials, and unsanitized logs do not belong in this fixture. Protected evidence must not be semantically rewritten: protected identifiers, constants, hashes, paths, quoted strings, stack frames, JSON keys, code fences, and diff zones must remain exact or receipt-retrievable.
|
|
6
|
+
|
|
7
|
+
Compressed digest candidate:
|
|
8
|
+
- candidate id: fixture-compression-alpha
|
|
9
|
+
- digest summary: sample_module.py branch returns quoted string `retry` after numeric constant `3` attempts
|
|
10
|
+
- protected evidence preserved exactly: identifier `sample_status`, numeric constant `3`, quoted string `retry`, JSON key `status`, and stack frame label `sample_module:31`
|
|
11
|
+
- omitted protected context: sample_helper.py lines 1:80
|
|
12
|
+
- receipt fallback: fixture-receipt-alpha
|
|
13
|
+
- exact retrieval fallback: context-guard-pack slice --path sample_helper.py --lines 1:80
|
|
14
|
+
|
|
15
|
+
Task:
|
|
16
|
+
1. Decide whether required evidence is exact or receipt-retrievable.
|
|
17
|
+
2. Identify any protected evidence that would make semantic rewrite unsafe.
|
|
18
|
+
3. State that digest size, byte ratios, and receipt availability are proxy or retrieval evidence only, not hosted API token or cost savings evidence.
|
|
19
|
+
4. State that real comparisons require provider-measured primary token/cost fields on matched successful tasks, plus a failure-rate guardrail, human corrections, and shifted-cost accounting.
|
|
20
|
+
|
|
21
|
+
This prompt is dry-run-only fixture scaffolding and does not claim hosted API savings.
|
|
@@ -8,7 +8,11 @@
|
|
|
8
8
|
"max_budget_usd": 1.0,
|
|
9
9
|
"allowed_tools": [],
|
|
10
10
|
"success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
|
|
11
|
-
"success_cwd": "."
|
|
11
|
+
"success_cwd": ".",
|
|
12
|
+
"variant_prompt_files": {
|
|
13
|
+
"baseline_uncompressed_fixture": "learned-compression-baseline-context-pack.prompt.example.md",
|
|
14
|
+
"fixture_only_learned_compression_candidate": "learned-compression-candidate-digest.prompt.example.md"
|
|
15
|
+
}
|
|
12
16
|
},
|
|
13
17
|
{
|
|
14
18
|
"id": "learned_compression_artifact_digest_fixture",
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Fixture-only raw-output prompt for reversible output-transform A/B setup.
|
|
2
|
+
|
|
3
|
+
You are reviewing an already-sanitized command transcript. Treat this as synthetic benchmark input only.
|
|
4
|
+
|
|
5
|
+
Raw sanitized command output:
|
|
6
|
+
- command: python3 -m unittest sample_suite
|
|
7
|
+
- status: failed
|
|
8
|
+
- summary: one assertion failed in sample_test_alpha
|
|
9
|
+
- excerpt line 01: expected status ok
|
|
10
|
+
- excerpt line 02: actual status retry
|
|
11
|
+
- excerpt line 03: sanitized stack frame in sample_module
|
|
12
|
+
- excerpt line 04: sanitized assertion message
|
|
13
|
+
- excerpt line 05: sanitized context marker
|
|
14
|
+
|
|
15
|
+
Task:
|
|
16
|
+
1. Identify the failing command and failing check.
|
|
17
|
+
2. Explain whether the visible raw output is enough to diagnose the synthetic failure.
|
|
18
|
+
3. State that real token or cost comparisons require provider-measured telemetry on matched successful tasks, a failure-rate guardrail, human corrections, and shifted-cost accounting.
|
|
19
|
+
|
|
20
|
+
This prompt is not shipped benchmark evidence and does not claim hosted API savings.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Fixture-only digest plus artifact receipt prompt for reversible output-transform A/B setup.
|
|
2
|
+
|
|
3
|
+
You are reviewing an already-sanitized digest and receipt. Treat this as synthetic benchmark input only.
|
|
4
|
+
|
|
5
|
+
Digest of sanitized command output:
|
|
6
|
+
- command: python3 -m unittest sample_suite
|
|
7
|
+
- status: failed
|
|
8
|
+
- failure summary: sample_test_alpha expected ok but saw retry
|
|
9
|
+
- omitted sanitized lines: 5
|
|
10
|
+
|
|
11
|
+
Artifact receipt:
|
|
12
|
+
- artifact id: fixture-artifact-alpha
|
|
13
|
+
- digest id: fixture-digest-alpha
|
|
14
|
+
- exact re-expand command: context-guard-artifact show fixture-artifact-alpha
|
|
15
|
+
- re-expand expectation: retrieves the omitted sanitized lines exactly from a user-supplied local artifact store
|
|
16
|
+
|
|
17
|
+
Task:
|
|
18
|
+
1. Identify the failing command and failing check.
|
|
19
|
+
2. Describe which exact re-expand step would retrieve the omitted sanitized lines.
|
|
20
|
+
3. State that artifact receipt metadata and byte counts are retrieval or proxy evidence only, not token or cost savings evidence.
|
|
21
|
+
4. State that real comparisons require provider-measured telemetry on matched successful tasks, a failure-rate guardrail, human corrections, and shifted-cost accounting.
|
|
22
|
+
|
|
23
|
+
This prompt is dry-run-only fixture scaffolding and does not claim hosted API savings.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"id": "output_transform_trim_digest_fixture",
|
|
4
|
+
"prompt": "Fixture-only synthetic reversible output-transform task. Compare a placeholder raw command log with a digest plus artifact receipt and answer whether omitted sanitized lines can be exactly re-expanded. This fixture does not run a provider, trim command output, or fetch artifacts; future real runs must supply sanitized raw and digest evidence, artifact receipt metadata, provider-measured token/cost telemetry, matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
|
|
5
|
+
"model": "sonnet",
|
|
6
|
+
"effort": "medium",
|
|
7
|
+
"max_turns": 3,
|
|
8
|
+
"max_budget_usd": 1.0,
|
|
9
|
+
"allowed_tools": [],
|
|
10
|
+
"success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
|
|
11
|
+
"success_cwd": ".",
|
|
12
|
+
"variant_prompt_files": {
|
|
13
|
+
"baseline_raw_output_fixture": "output-transform-baseline-raw-output.prompt.example.md",
|
|
14
|
+
"fixture_only_digest_artifact_receipt": "output-transform-digest-receipt.prompt.example.md"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "output_transform_failure_summary_fixture",
|
|
19
|
+
"prompt": "Fixture-only synthetic reversible output-transform task. Given a placeholder failure summary and a receipt-backed sanitized output handle, identify the failing command and describe which exact re-expand step would retrieve the omitted context. This fixture is dry-run-only until prompts, success checks, provider-measured primary token/cost fields, human corrections, and shifted-cost accounting are supplied for matched successful tasks.",
|
|
20
|
+
"model": "sonnet",
|
|
21
|
+
"effort": "medium",
|
|
22
|
+
"max_turns": 3,
|
|
23
|
+
"max_budget_usd": 1.0,
|
|
24
|
+
"allowed_tools": [],
|
|
25
|
+
"success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
|
|
26
|
+
"success_cwd": "."
|
|
27
|
+
}
|
|
28
|
+
]
|