akernel-runtime 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. akernel_runtime-0.1.0.dist-info/METADATA +270 -0
  2. akernel_runtime-0.1.0.dist-info/RECORD +40 -0
  3. akernel_runtime-0.1.0.dist-info/WHEEL +5 -0
  4. akernel_runtime-0.1.0.dist-info/entry_points.txt +2 -0
  5. akernel_runtime-0.1.0.dist-info/licenses/LICENSE +201 -0
  6. akernel_runtime-0.1.0.dist-info/licenses/NOTICE +4 -0
  7. akernel_runtime-0.1.0.dist-info/top_level.txt +1 -0
  8. context_kernel/__init__.py +4 -0
  9. context_kernel/__main__.py +5 -0
  10. context_kernel/agent_reports.py +188 -0
  11. context_kernel/benchmarks.py +493 -0
  12. context_kernel/budget.py +72 -0
  13. context_kernel/cli.py +2953 -0
  14. context_kernel/context.py +161 -0
  15. context_kernel/evals.py +347 -0
  16. context_kernel/global_memory.py +126 -0
  17. context_kernel/loop.py +1617 -0
  18. context_kernel/marketplace.py +194 -0
  19. context_kernel/marketplace_data/skills/context_budget.json +27 -0
  20. context_kernel/marketplace_data/skills/context_compaction.json +27 -0
  21. context_kernel/marketplace_data/skills/edit_file.json +27 -0
  22. context_kernel/marketplace_data/skills/index.json +66 -0
  23. context_kernel/marketplace_data/skills/long_task_planning.json +27 -0
  24. context_kernel/marketplace_data/skills/multi_file_bugfix.json +28 -0
  25. context_kernel/memory.py +515 -0
  26. context_kernel/models.py +144 -0
  27. context_kernel/planner.py +155 -0
  28. context_kernel/policy.py +271 -0
  29. context_kernel/project.py +317 -0
  30. context_kernel/providers.py +1264 -0
  31. context_kernel/report_costs.py +375 -0
  32. context_kernel/runner.py +78 -0
  33. context_kernel/skills.py +318 -0
  34. context_kernel/state_writer.py +108 -0
  35. context_kernel/storage.py +171 -0
  36. context_kernel/tasks.py +549 -0
  37. context_kernel/text.py +42 -0
  38. context_kernel/tokenizer.py +22 -0
  39. context_kernel/tools.py +544 -0
  40. context_kernel/verifier.py +77 -0
@@ -0,0 +1,188 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from .storage import Workspace
6
+
7
+
8
def load_agent_report(workspace: Workspace, run_id: str) -> dict[str, Any]:
    """Load the stored JSON report of a single agent run.

    The report is looked up as ``<run_id>.json`` inside
    ``workspace.agent_runs_dir`` and parsed with ``Workspace.read_json``.

    Raises:
        KeyError: If no report file exists for ``run_id``.
    """
    report_file = workspace.agent_runs_dir / f"{run_id}.json"
    if report_file.exists():
        return Workspace.read_json(report_file)
    raise KeyError(f"Unknown agent run: {run_id}")
13
+
14
+
15
def build_agent_cost_report(report: dict[str, Any]) -> dict[str, Any]:
    """Condense a raw agent-run report into a token cost report.

    Each entry of ``report["steps"]`` is flattened into a per-step summary and
    its token counts are accumulated into a per-action bucket. Run-level token
    totals are read from ``report["totals"]``; they are not re-summed from the
    steps.

    Nested sections (``action``, ``tokens``, ``plan``, ``plan["task"]``,
    ``plan["budget"]``, ``tool``, ``totals``) that are missing, ``None`` or not
    a dict are treated as empty, and a missing or ``None`` ``steps`` value is
    treated as no steps, so JSON nulls yield zeros instead of raising.

    Args:
        report: Agent-run report dict.

    Returns:
        A dict holding the run identity (``run_id``, ``task_id``, ``status``,
        ``request``, ``model_routing``), a ``summary`` section, the per-step
        summaries under ``steps``, the three steps with the highest
        ``total_tokens`` under ``hotspots`` and the report's ``storage`` value.
    """

    def _section(value: Any) -> dict[str, Any]:
        # ``dict.get(key, {})`` only falls back when the key is absent; an
        # explicit null in the data still comes back as ``None``.
        return value if isinstance(value, dict) else {}

    request = str(report.get("request") or "")
    step_summaries: list[dict[str, Any]] = []
    action_breakdown: dict[str, dict[str, int]] = {}
    task_brief_series: list[int] = []
    planned_context_series: list[int] = []

    for step in report.get("steps") or []:
        plan = _section(step.get("plan"))
        action = str(_section(step.get("action")).get("action") or "unknown")
        tokens = normalize_tokens(_section(step.get("tokens")))
        task_brief_tokens = read_int(_section(plan.get("task")).get("estimated_tokens"))
        planned_context_tokens = read_int(_section(plan.get("budget")).get("estimated_used"))
        task_brief_series.append(task_brief_tokens)
        planned_context_series.append(planned_context_tokens)
        tool = _section(step.get("tool"))

        step_summaries.append(
            {
                "index": read_int(step.get("index")),
                "action": action,
                "status": str(step.get("status") or ""),
                "input_tokens": tokens["input_tokens"],
                "output_tokens": tokens["output_tokens"],
                "total_tokens": tokens["total_tokens"],
                "task_brief_tokens": task_brief_tokens,
                "planned_context_tokens": planned_context_tokens,
                "trace_id": step.get("trace_id"),
                "tool_trace_id": step.get("tool_trace_id"),
                "tool": tool.get("name"),
                "model_role": step.get("model_role"),
                "model": step.get("model"),
                "aux_review": summarize_aux_review(step.get("aux_review", {})),
            }
        )

        # One bucket per action name, created on first sight of that action.
        bucket = action_breakdown.setdefault(
            action,
            {"count": 0, "input_tokens": 0, "output_tokens": 0, "total_tokens": 0},
        )
        bucket["count"] += 1
        bucket["input_tokens"] += tokens["input_tokens"]
        bucket["output_tokens"] += tokens["output_tokens"]
        bucket["total_tokens"] += tokens["total_tokens"]

    totals = normalize_tokens(_section(report.get("totals")))
    # Most expensive steps first; the sort is stable, so ties keep step order.
    hotspots = sorted(step_summaries, key=lambda item: item["total_tokens"], reverse=True)[:3]
    step_count = len(step_summaries)
    return {
        "run_id": report.get("id"),
        "task_id": report.get("task_id"),
        "status": report.get("status"),
        "request": request,
        "model_routing": report.get("model_routing", {}),
        "summary": {
            "step_count": step_count,
            "input_tokens": totals["input_tokens"],
            "output_tokens": totals["output_tokens"],
            "total_tokens": totals["total_tokens"],
            "average_tokens_per_step": round(totals["total_tokens"] / step_count, 2) if step_count else 0.0,
            "max_step_tokens": hotspots[0]["total_tokens"] if hotspots else 0,
            "max_step_index": hotspots[0]["index"] if hotspots else 0,
            "request_chars": len(request),
            "final_response_chars": len(str(report.get("final_response") or "")),
            "action_breakdown": action_breakdown,
            "task_brief": summarize_series(task_brief_series),
            "planned_context": summarize_series(planned_context_series),
        },
        "steps": step_summaries,
        "hotspots": hotspots,
        "storage": report.get("storage", {}),
    }
83
+
84
+
85
def render_agent_cost_report(cost: dict[str, Any]) -> str:
    """Render a cost report dict as plain text, one fact per line.

    The output starts with run-level lines (run id, task, status, step count,
    token totals, task-brief and planned-context series, hotspot), followed by
    an optional ``model_routing`` line and an optional ``actions`` line, a
    blank line, the ``Step Breakdown`` heading and one line per step.

    Args:
        cost: Dict with at least ``run_id``, ``summary`` and ``steps``.

    Returns:
        The rendered lines joined with newlines (no trailing newline).
    """

    def _series_line(label: str, series: dict[str, Any]) -> str:
        return (
            f"{label}: first={series['first_tokens']} "
            f"last={series['last_tokens']} "
            f"peak={series['peak_tokens']} "
            f"growth={series['growth_tokens']}"
        )

    def _step_line(step: dict[str, Any]) -> str:
        role = step.get("model_role") or "unknown"
        model = step.get("model") or "default"
        review = render_aux_review_inline(step.get("aux_review", {}))
        return (
            f"- step {step['index']}: action={step['action']} status={step['status']} "
            f"tokens={step['total_tokens']} input={step['input_tokens']} output={step['output_tokens']} "
            f"brief={step['task_brief_tokens']} context={step['planned_context_tokens']} "
            f"model={role}:{model}{review}"
        )

    summary = cost["summary"]
    out = [f"agent_run: {cost['run_id']}"]
    out.append(f"task: {cost.get('task_id')}")
    out.append(f"status: {cost.get('status')}")
    out.append(f"steps: {summary['step_count']}")
    out.append(
        f"tokens: total={summary['total_tokens']} "
        f"input={summary['input_tokens']} output={summary['output_tokens']} "
        f"avg_per_step={summary['average_tokens_per_step']}"
    )
    out.append(_series_line("task_brief_tokens", summary["task_brief"]))
    out.append(_series_line("planned_context_tokens", summary["planned_context"]))
    out.append(f"hotspot: step={summary['max_step_index']} tokens={summary['max_step_tokens']}")

    routing = cost.get("model_routing") or {}
    if routing:
        out.append(
            "model_routing: "
            f"mode={routing.get('mode')} "
            f"primary={routing.get('primary_model')} "
            f"auxiliary={routing.get('auxiliary_model')} "
            f"review={routing.get('aux_review')}"
        )

    breakdown = summary["action_breakdown"]
    if breakdown:
        rendered = ", ".join(
            f"{action}={bucket['count']}x/{bucket['total_tokens']}t"
            for action, bucket in breakdown.items()
        )
        out.append("actions: " + rendered)

    out.extend(["", "Step Breakdown"])
    out.extend(_step_line(step) for step in cost["steps"])
    return "\n".join(out)
138
+
139
+
140
def summarize_aux_review(review: dict[str, Any]) -> dict[str, Any]:
    """Reduce an auxiliary-review record to the fields used in cost reports.

    Returns ``{"enabled": False}`` when ``review`` is not a dict or its
    ``enabled`` value is falsy. Otherwise returns the review's ``trace_id``,
    ``model``, ``risk`` and ``recommendation`` together with the normalized
    ``total_tokens`` of its ``tokens`` section.
    """
    is_enabled = isinstance(review, dict) and review.get("enabled")
    if not is_enabled:
        return {"enabled": False}

    total_tokens = normalize_tokens(review.get("tokens", {}))["total_tokens"]
    summary: dict[str, Any] = {"enabled": True}
    for field in ("trace_id", "model", "risk", "recommendation"):
        summary[field] = review.get(field)
    summary["total_tokens"] = total_tokens
    return summary
152
+
153
+
154
def render_aux_review_inline(review: dict[str, Any]) -> str:
    """Format an auxiliary-review summary as a suffix for a step line.

    Returns an empty string when ``review`` is not a dict or is not enabled;
    otherwise `` review=<risk>:<recommendation>:<total_tokens>t`` with a
    leading space, where a missing ``total_tokens`` is shown as ``0``.
    """
    if isinstance(review, dict) and review.get("enabled"):
        risk = review.get("risk")
        recommendation = review.get("recommendation")
        total_tokens = review.get("total_tokens", 0)
        return f" review={risk}:{recommendation}:{total_tokens}t"
    return ""
158
+
159
+
160
def summarize_series(values: list[int]) -> dict[str, int]:
    """Summarize a per-step token series.

    Returns the first value, the last value, the maximum and the growth
    (last minus first). An empty series yields zero for every field.
    """
    if values:
        first, last, peak = values[0], values[-1], max(values)
    else:
        first = last = peak = 0
    return {
        "first_tokens": first,
        "last_tokens": last,
        "peak_tokens": peak,
        "growth_tokens": last - first,
    }
174
+
175
+
176
def normalize_tokens(tokens: dict[str, Any]) -> dict[str, int]:
    """Coerce a token-usage section into three integer counters.

    Args:
        tokens: Mapping that may carry ``input_tokens``, ``output_tokens`` and
            ``total_tokens``. Anything that is not a dict (for example
            ``None``) is treated as an empty mapping.

    Returns:
        A dict with exactly those three keys, each passed through
        ``read_int``.
    """
    if not isinstance(tokens, dict):
        # ``something.get("tokens", {})`` still yields ``None`` when the key
        # is present with a null value, so guard here instead of raising
        # AttributeError on ``None.get``.
        tokens = {}
    return {
        field: read_int(tokens.get(field))
        for field in ("input_tokens", "output_tokens", "total_tokens")
    }
182
+
183
+
184
def read_int(value: Any) -> int:
    """Convert ``value`` to ``int``, falling back to ``0``.

    Falsy values (``None``, ``""``, ``0``) map to ``0``. Values that ``int()``
    rejects also map to ``0`` instead of raising.
    """
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. a list); ValueError: non-numeric
        # text or NaN; OverflowError: an infinite float.
        return 0
@@ -0,0 +1,493 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+ from uuid import uuid4
6
+
7
+ from .budget import DEFAULT_PROFILE
8
+ from .evals import diff_reports
9
+ from .evals import EvalRunner
10
+ from .models import utc_now
11
+ from .report_costs import build_benchmark_cost_report, diff_cost_reports, render_cost_markdown
12
+ from .storage import Workspace
13
+
14
+
15
class BenchmarkRunner:
    """Runs benchmark fixture directories and manages the saved reports.

    Reports are stored as ``<id>.json`` files in
    ``workspace.benchmarks_dir``; Markdown exports default to the same
    directory.
    """

    def __init__(self, workspace: Workspace):
        self.workspace = workspace
        self.evals = EvalRunner(workspace)

    def run_directory(
        self,
        path: Path,
        default_budget: int | None = None,
        default_profile: str = DEFAULT_PROFILE,
        save: bool = True,
        execute_provider: str | None = None,
        execute_model: str | None = None,
        execute_base_url: str | None = None,
    ) -> dict[str, Any]:
        """Run every ``*.json`` fixture directly inside ``path``.

        Each fixture goes through ``self.evals.run_fixture`` with
        ``save=False``; the fixture reports are bundled into one benchmark
        report, which is written to disk when ``save`` is true.
        ``execute_base_url`` is forwarded to the fixture runs but is not
        recorded in the report's ``execution`` section.

        Raises:
            ValueError: If ``path`` contains no JSON fixtures.
        """
        fixture_files = sorted(path.glob("*.json"))
        if not fixture_files:
            raise ValueError(f"Benchmark directory has no JSON fixtures: {path}")

        resolved_path = path.resolve()
        fixture_options = {
            "default_budget": default_budget,
            "default_profile": default_profile,
            "save": False,
            "execute_provider": execute_provider,
            "execute_model": execute_model,
            "execute_base_url": execute_base_url,
        }
        fixture_reports = [
            self.evals.run_fixture(fixture_file, **fixture_options)
            for fixture_file in fixture_files
        ]
        benchmark_report = {
            "id": uuid4().hex[:12],
            "created_at": utc_now(),
            "benchmark": str(path),
            "benchmark_path": str(resolved_path),
            "name": path.name,
            "execution": {
                "enabled": bool(execute_provider),
                "provider": execute_provider,
                "model": execute_model,
            },
            "fixtures": fixture_reports,
            "summary": summarize_benchmark(fixture_reports),
        }
        if save:
            self.save_report(benchmark_report)
        return benchmark_report

    def save_report(self, report: dict[str, Any]) -> Path:
        """Write ``report`` to ``<benchmarks_dir>/<id>.json`` and return the path."""
        reports_dir = self.workspace.benchmarks_dir
        reports_dir.mkdir(parents=True, exist_ok=True)
        destination = reports_dir / f"{report['id']}.json"
        Workspace.write_json(destination, report)
        return destination

    def _stored_reports(self) -> list[tuple[Path, dict[str, Any]]]:
        """Read every ``*.json`` file in the benchmarks directory.

        The directory is created first if needed. Entries are
        ``(file path, parsed report)`` pairs in sorted path order.
        """
        reports_dir = self.workspace.benchmarks_dir
        reports_dir.mkdir(parents=True, exist_ok=True)
        return [
            (report_path, Workspace.read_json(report_path))
            for report_path in sorted(reports_dir.glob("*.json"))
        ]

    def list_reports(self) -> list[dict[str, Any]]:
        """Return a short listing row per stored report, newest first.

        A report without an ``id`` is listed under its file stem.
        """
        rows: list[dict[str, Any]] = []
        for report_path, report in self._stored_reports():
            summary = report.get("summary", {})
            passed = summary.get("passed_checks", 0)
            total = summary.get("total_checks", 0)
            rows.append(
                {
                    "id": report.get("id", report_path.stem),
                    "created_at": report.get("created_at", ""),
                    "name": report.get("name", ""),
                    "fixture_count": summary.get("fixture_count", 0),
                    "task_count": summary.get("task_count", 0),
                    "average_savings_percent": summary.get("average_savings_percent", 0),
                    "checks": f"{passed}/{total}",
                    "ok": summary.get("ok", False),
                }
            )
        rows.sort(key=lambda row: row["created_at"], reverse=True)
        return rows

    def get_report(self, report_id: str) -> dict[str, Any]:
        """Load the stored report ``report_id``.

        Raises:
            KeyError: If no ``<report_id>.json`` exists in the benchmarks
                directory.
        """
        report_file = self.workspace.benchmarks_dir / f"{report_id}.json"
        if report_file.exists():
            return Workspace.read_json(report_file)
        raise KeyError(f"Unknown benchmark report: {report_id}")

    def find_baseline(
        self,
        path: Path,
        *,
        baseline_id: str | None = None,
        exclude_id: str | None = None,
    ) -> dict[str, Any] | None:
        """Pick the stored report to compare a run of ``path`` against.

        An explicit ``baseline_id`` wins. Otherwise the newest report whose
        recorded benchmark path matches ``path`` is chosen, falling back to
        the newest report with the same directory name. The report with id
        ``exclude_id`` is never considered.

        Returns:
            ``{"match": "explicit" | "path" | "name", "report": ...}``, or
            ``None`` when nothing matches.
        """
        if baseline_id:
            return {"match": "explicit", "report": self.get_report(baseline_id)}

        by_path: list[dict[str, Any]] = []
        by_name: list[dict[str, Any]] = []
        for _, report in self._stored_reports():
            if exclude_id and report.get("id") == exclude_id:
                continue
            if benchmark_path_matches(report, path):
                by_path.append(report)
            elif report.get("name") == path.name:
                by_name.append(report)

        if by_path:
            match_kind, candidates = "path", by_path
        elif by_name:
            match_kind, candidates = "name", by_name
        else:
            return None
        # ``max`` returns the first of equally-new candidates, which is the
        # same element a stable descending sort would place first.
        newest = max(candidates, key=lambda item: item.get("created_at", ""))
        return {"match": match_kind, "report": newest}

    def diff_reports(self, before_id: str, after_id: str) -> dict[str, Any]:
        """Load two stored reports by id and diff them with ``diff_benchmarks``."""
        return diff_benchmarks(self.get_report(before_id), self.get_report(after_id))

    def export_markdown(self, report_id: str, output: Path | None = None) -> Path:
        """Write the Markdown rendering of a stored report and return its path.

        Without ``output`` the file goes to ``<benchmarks_dir>/<report_id>.md``.
        """
        report = self.get_report(report_id)
        destination = output or self.workspace.benchmarks_dir / f"{report_id}.md"
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(render_benchmark_markdown(report), encoding="utf-8")
        return destination

    def evidence(
        self,
        report_ids: list[str] | None = None,
        *,
        limit: int | None = None,
    ) -> dict[str, Any]:
        """Build an evidence summary over the given or the most recent reports.

        ``limit`` only applies when ``report_ids`` is empty or ``None``.
        """
        if report_ids:
            selected = [self.get_report(report_id) for report_id in report_ids]
        else:
            selected = self._recent_reports(limit=limit)
        return build_benchmark_evidence(selected)

    def export_evidence_markdown(
        self,
        report_ids: list[str] | None = None,
        *,
        limit: int | None = None,
        output: Path | None = None,
    ) -> Path:
        """Write the evidence summary as Markdown and return its path.

        Without ``output`` the file goes to ``<benchmarks_dir>/evidence.md``.
        """
        evidence = self.evidence(report_ids, limit=limit)
        destination = output or self.workspace.benchmarks_dir / "evidence.md"
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(render_benchmark_evidence_markdown(evidence), encoding="utf-8")
        return destination

    def _recent_reports(self, *, limit: int | None = None) -> list[dict[str, Any]]:
        """Return stored reports newest first, truncated to ``limit`` if given.

        A negative ``limit`` is treated as zero.
        """
        reports = [report for _, report in self._stored_reports()]
        reports.sort(key=lambda item: item.get("created_at", ""), reverse=True)
        if limit is None:
            return reports
        return reports[: max(0, limit)]
170
+
171
+
172
def summarize_benchmark(reports: list[dict[str, Any]]) -> dict[str, Any]:
    """Roll the ``summary`` sections of fixture reports into one summary.

    Task, token and check counters are summed; the execution counters
    (``total_execution_tokens``, ``executed_tasks``, ``blocked_tasks``) are
    optional per fixture and default to ``0``. ``average_savings_percent`` is
    the unweighted mean of the fixtures' averages, while
    ``total_savings_percent`` is computed from the summed token counts.
    ``ok`` is true when every check passed.
    """
    summaries = [report["summary"] for report in reports]

    def required(key: str) -> Any:
        return sum(summary[key] for summary in summaries)

    def optional(key: str) -> Any:
        return sum(summary.get(key, 0) for summary in summaries)

    task_count = required("task_count")
    kernel_tokens = required("total_kernel_tokens")
    baseline_tokens = required("total_baseline_tokens")
    # Savings are floored at zero when the kernel used more than the baseline.
    saved_tokens = max(0, baseline_tokens - kernel_tokens)
    total_checks = required("total_checks")
    passed_checks = required("passed_checks")
    execution_tokens = optional("total_execution_tokens")
    executed_tasks = optional("executed_tasks")
    blocked_tasks = optional("blocked_tasks")

    if summaries:
        mean_savings = required("average_savings_percent") / len(summaries)
    else:
        mean_savings = 0.0
    if baseline_tokens:
        saved_percent = round((saved_tokens / baseline_tokens) * 100, 2)
    else:
        saved_percent = 0.0

    return {
        "fixture_count": len(reports),
        "task_count": task_count,
        "total_kernel_tokens": kernel_tokens,
        "total_baseline_tokens": baseline_tokens,
        "total_savings_tokens": saved_tokens,
        "total_savings_percent": saved_percent,
        "average_savings_percent": round(mean_savings, 2),
        "passed_checks": passed_checks,
        "total_checks": total_checks,
        "executed_tasks": executed_tasks,
        "blocked_tasks": blocked_tasks,
        "total_execution_tokens": execution_tokens,
        "ok": passed_checks == total_checks,
    }
203
+
204
+
205
def build_benchmark_evidence(reports: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate several benchmark reports into one evidence summary.

    Reports are ordered newest first by ``created_at``. Token and check
    counters are summed over the reports' ``summary`` sections (missing or
    falsy values count as ``0``). Per-task snapshots provide the five
    strongest tasks by saved tokens and the five weakest by savings percent.
    ``ok`` requires at least one report and every check passing.
    """
    ordered = sorted(reports, key=lambda item: item.get("created_at", ""), reverse=True)
    summaries = [report.get("summary", {}) for report in ordered]

    def int_total(key: str) -> int:
        return sum(int(summary.get(key, 0) or 0) for summary in summaries)

    kernel_tokens = int_total("total_kernel_tokens")
    baseline_tokens = int_total("total_baseline_tokens")
    saved_tokens = max(0, baseline_tokens - kernel_tokens)
    total_checks = int_total("total_checks")
    passed_checks = int_total("passed_checks")

    tasks = benchmark_task_snapshots(ordered)
    weakest = sorted(tasks, key=lambda item: item["savings_percent"])[:5]
    strongest = sorted(tasks, key=lambda item: item["savings_tokens"], reverse=True)[:5]

    if summaries:
        percent_sum = sum(float(summary.get("total_savings_percent", 0) or 0) for summary in summaries)
        mean_report_savings = round(percent_sum / len(summaries), 2)
    else:
        mean_report_savings = 0.0
    saved_percent = round((saved_tokens / baseline_tokens) * 100, 2) if baseline_tokens else 0.0
    pass_rate = round((passed_checks / total_checks) * 100, 2) if total_checks else 0.0
    benchmark_names = {report.get("name", "") for report in ordered if report.get("name")}

    return {
        "id": uuid4().hex[:12],
        "created_at": utc_now(),
        "report_count": len(ordered),
        "benchmark_count": len(benchmark_names),
        "fixture_count": int_total("fixture_count"),
        "task_count": int_total("task_count"),
        "total_kernel_tokens": kernel_tokens,
        "total_baseline_tokens": baseline_tokens,
        "total_savings_tokens": saved_tokens,
        "total_savings_percent": saved_percent,
        "average_report_savings_percent": mean_report_savings,
        "passed_checks": passed_checks,
        "total_checks": total_checks,
        "pass_rate_percent": pass_rate,
        "reports": [
            {**benchmark_ref(report), "summary": report.get("summary", {})}
            for report in ordered
        ],
        "strongest_savings": strongest,
        "weakest_savings": weakest,
        "ok": bool(ordered) and passed_checks == total_checks,
    }
241
+
242
+
243
def benchmark_task_snapshots(reports: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Flatten reports into one row per task.

    Rows follow report, fixture and task order. Each row carries the report
    id, benchmark name, fixture file name, task id and profile, the kernel
    and baseline token estimates, the saved tokens (never negative), the
    task's savings percent and its checks as ``"passed/total"``.
    """
    rows: list[dict[str, Any]] = []
    for report in reports:
        for fixture in report.get("fixtures", []):
            fixture_name = Path(str(fixture.get("fixture", ""))).name
            for task in fixture.get("tasks", []):
                kernel = int(task.get("kernel", {}).get("estimated_tokens", 0) or 0)
                baseline = int(task.get("baseline", {}).get("estimated_tokens", 0) or 0)
                checks = task.get("checks", {})
                passed = int(checks.get("passed", 0) or 0)
                total = int(checks.get("total", 0) or 0)
                row = {
                    "report_id": report.get("id"),
                    "benchmark": report.get("name"),
                    "fixture": fixture_name,
                    "task": task.get("id"),
                    "profile": task.get("profile"),
                    "kernel_tokens": kernel,
                    "baseline_tokens": baseline,
                    "savings_tokens": max(0, baseline - kernel),
                    "savings_percent": float(task.get("savings", {}).get("percent", 0) or 0),
                    "checks": f"{passed}/{total}",
                }
                rows.append(row)
    return rows
267
+
268
+
269
def diff_benchmarks(before: dict[str, Any], after: dict[str, Any]) -> dict[str, Any]:
    """Compare two benchmark reports.

    The result holds references to both reports, the deltas of the summary
    counters (``after`` minus ``before``), the cost diff, the per-fixture
    diffs and the fixtures counted as regressions. A fixture present in both
    reports is a regression when its kernel tokens grew by more than 10, its
    savings percent or passed checks dropped, or its own diff lists
    regressions. ``ok`` requires no fixture regressions and an ok cost diff.
    """
    before_summary = before.get("summary", {})
    after_summary = after.get("summary", {})
    fixture_diffs = diff_fixtures(before.get("fixtures", []), after.get("fixtures", []))
    cost_diff = diff_cost_reports(
        build_benchmark_cost_report(before),
        build_benchmark_cost_report(after),
    )

    def is_regression(entry: dict[str, Any]) -> bool:
        # Added/removed fixtures carry no ``summary_delta``; skip them first.
        if entry.get("status") != "changed":
            return False
        change = entry["summary_delta"]
        return bool(
            change["kernel_tokens"] > 10
            or change["savings_percent"] < 0
            or change["passed_checks"] < 0
            or entry["regressions"]
        )

    regressions = [entry for entry in fixture_diffs if is_regression(entry)]

    summary_fields = (
        ("fixtures", "fixture_count"),
        ("tasks", "task_count"),
        ("kernel_tokens", "total_kernel_tokens"),
        ("baseline_tokens", "total_baseline_tokens"),
        ("savings_tokens", "total_savings_tokens"),
        ("savings_percent", "total_savings_percent"),
        ("passed_checks", "passed_checks"),
        ("total_checks", "total_checks"),
        ("execution_tokens", "total_execution_tokens"),
    )
    summary_delta = {
        label: delta(before_summary, after_summary, key)
        for label, key in summary_fields
    }
    summary_delta["savings_percent"] = round(summary_delta["savings_percent"], 2)

    return {
        "before": benchmark_ref(before),
        "after": benchmark_ref(after),
        "summary_delta": summary_delta,
        "cost_diff": cost_diff,
        "cost_regressions": cost_diff["regressions"],
        "fixtures": fixture_diffs,
        "regressions": regressions,
        "ok": not regressions and cost_diff["ok"],
    }
305
+
306
+
307
def diff_fixtures(before_fixtures: list[dict[str, Any]], after_fixtures: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Diff two lists of fixture reports, matched by fixture file name.

    Returns one entry per file name in sorted order. A fixture found on only
    one side is marked ``"added"`` or ``"removed"``. A fixture found on both
    sides is marked ``"changed"`` and carries the ``summary_delta``,
    ``regressions`` and ``ok`` values produced by ``diff_reports``.
    """

    def index_by_name(fixtures: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
        return {Path(entry["fixture"]).name: entry for entry in fixtures}

    previous = index_by_name(before_fixtures)
    current = index_by_name(after_fixtures)

    results: list[dict[str, Any]] = []
    for name in sorted(previous.keys() | current.keys()):
        if name not in previous:
            results.append({"fixture": name, "status": "added"})
        elif name not in current:
            results.append({"fixture": name, "status": "removed"})
        else:
            fixture_diff = diff_reports(previous[name], current[name])
            results.append(
                {
                    "fixture": name,
                    "status": "changed",
                    "summary_delta": fixture_diff["summary_delta"],
                    "regressions": fixture_diff["regressions"],
                    "ok": fixture_diff["ok"],
                }
            )
    return results
332
+
333
+
334
def render_benchmark_markdown(report: dict[str, Any]) -> str:
    """Render a benchmark report as a Markdown document.

    The document has a header list with the report's identity and summary
    counters, a "Cost View" section produced by ``render_cost_markdown``, a
    table with one row per fixture and, per fixture, a table with one row per
    task.

    The execution counters (``executed_tasks``, ``total_execution_tokens``)
    are read with a default of ``0``. ``summarize_benchmark`` already treats
    them as optional in fixture summaries, so a report that lacks them
    renders instead of raising ``KeyError``.

    Args:
        report: Benchmark report with ``name``, ``id``, ``created_at``,
            ``benchmark``, ``summary`` and ``fixtures``.

    Returns:
        The Markdown text, ending with exactly one newline.
    """

    def table_row(cells: list[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    summary = report["summary"]
    cost = build_benchmark_cost_report(report)
    lines = [
        f"# Benchmark Report: {report['name']}",
        "",
        f"- Report id: `{report['id']}`",
        f"- Created at: `{report['created_at']}`",
        f"- Benchmark: `{report['benchmark']}`",
        f"- Fixtures: `{summary['fixture_count']}`",
        f"- Tasks: `{summary['task_count']}`",
        f"- Average savings: `{summary['average_savings_percent']}%`",
        f"- Total savings: `{summary['total_savings_tokens']}` tokens (`{summary['total_savings_percent']}%`)",
        f"- Checks: `{summary['passed_checks']}/{summary['total_checks']}`",
        f"- Executed tasks: `{summary.get('executed_tasks', 0)}`",
        f"- Execution tokens: `{summary.get('total_execution_tokens', 0)}`",
        "",
        "## Cost View",
        "",
        render_cost_markdown(cost),
        "",
        "## Fixtures",
        "",
        "| Fixture | Tasks | Avg Savings | Checks | Execution Tokens |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]
    for fixture in report["fixtures"]:
        fixture_summary = fixture["summary"]
        lines.append(
            table_row(
                [
                    Path(fixture["fixture"]).name,
                    str(fixture_summary["task_count"]),
                    f"{fixture_summary['average_savings_percent']}%",
                    f"{fixture_summary['passed_checks']}/{fixture_summary['total_checks']}",
                    # Optional in fixture summaries, hence the default.
                    str(fixture_summary.get("total_execution_tokens", 0)),
                ]
            )
        )

    lines.extend(["", "## Tasks", ""])
    for fixture in report["fixtures"]:
        lines.append(f"### {Path(fixture['fixture']).name}")
        lines.append("")
        lines.append("| Task | Profile | Kernel Tokens | Baseline Tokens | Savings | Checks |")
        lines.append("| --- | --- | ---: | ---: | ---: | ---: |")
        for task in fixture["tasks"]:
            lines.append(
                table_row(
                    [
                        task["id"],
                        task["profile"],
                        str(task["kernel"]["estimated_tokens"]),
                        str(task["baseline"]["estimated_tokens"]),
                        f"{task['savings']['percent']}%",
                        f"{task['checks']['passed']}/{task['checks']['total']}",
                    ]
                )
            )
        lines.append("")
    # Drop the blank line left after the last table, then end with one newline.
    return "\n".join(lines).rstrip() + "\n"
398
+
399
+
400
def render_benchmark_evidence_markdown(evidence: dict[str, Any]) -> str:
    """Render an evidence summary as a Markdown document.

    The document has a header list with the aggregate counters, a table with
    one row per report and two task tables ("Strongest Savings" and
    "Weakest Savings"). The result ends with exactly one newline.
    """

    def table_row(cells: list[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    task_table_header = [
        "| Scope | Kernel | Baseline | Savings | Checks |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]

    out = ["# Benchmark Evidence", ""]
    out.append(f"- Evidence id: `{evidence['id']}`")
    out.append(f"- Created at: `{evidence['created_at']}`")
    out.append(f"- Reports: `{evidence['report_count']}`")
    out.append(f"- Benchmarks: `{evidence['benchmark_count']}`")
    out.append(f"- Fixtures: `{evidence['fixture_count']}`")
    out.append(f"- Tasks: `{evidence['task_count']}`")
    out.append(f"- Kernel tokens: `{evidence['total_kernel_tokens']}`")
    out.append(f"- Baseline tokens: `{evidence['total_baseline_tokens']}`")
    out.append(f"- Token savings: `{evidence['total_savings_tokens']}` (`{evidence['total_savings_percent']}%`)")
    out.append(
        f"- Checks: `{evidence['passed_checks']}/{evidence['total_checks']}` (`{evidence['pass_rate_percent']}%`)"
    )
    out.extend(
        [
            "",
            "## Reports",
            "",
            "| Report | Benchmark | Tasks | Savings | Checks |",
            "| --- | --- | ---: | ---: | ---: |",
        ]
    )

    for report in evidence.get("reports", []):
        summary = report.get("summary", {})
        out.append(
            table_row(
                [
                    str(report.get("id", "")),
                    str(report.get("name", "")),
                    str(summary.get("task_count", 0)),
                    f"{summary.get('total_savings_tokens', 0)} ({summary.get('total_savings_percent', 0)}%)",
                    f"{summary.get('passed_checks', 0)}/{summary.get('total_checks', 0)}",
                ]
            )
        )

    sections = (
        ("## Strongest Savings", "strongest_savings"),
        ("## Weakest Savings", "weakest_savings"),
    )
    for heading, key in sections:
        out.extend(["", heading, "", *task_table_header])
        out.extend(benchmark_evidence_task_line(item) for item in evidence.get(key, []))

    return "\n".join(out).rstrip() + "\n"
442
+
443
+
444
def benchmark_evidence_task_line(item: dict[str, Any]) -> str:
    """Format one task snapshot as a Markdown table row.

    The first cell is the scope ``<benchmark>/<fixture>/<task>``, followed by
    kernel tokens, baseline tokens, saved tokens with percent, and checks.
    Missing fields fall back to an empty string or ``0``.
    """
    scope_parts = (
        item.get("benchmark", ""),
        item.get("fixture", ""),
        item.get("task", ""),
    )
    scope = f"{scope_parts[0]}/{scope_parts[1]}/{scope_parts[2]}"
    savings = f"{item.get('savings_tokens', 0)} ({item.get('savings_percent', 0)}%)"
    cells = [
        scope,
        str(item.get("kernel_tokens", 0)),
        str(item.get("baseline_tokens", 0)),
        savings,
        str(item.get("checks", "")),
    ]
    return "| " + " | ".join(cells) + " |"
459
+
460
+
461
def benchmark_ref(report: dict[str, Any]) -> dict[str, Any]:
    """Return the identifying fields of a benchmark report.

    The result always has the keys ``id``, ``created_at``, ``benchmark``,
    ``benchmark_path`` and ``name``; fields absent from ``report`` are
    ``None``.
    """
    identity_fields = ("id", "created_at", "benchmark", "benchmark_path", "name")
    return {field: report.get(field) for field in identity_fields}
469
+
470
+
471
def delta(before: dict[str, Any], after: dict[str, Any], key: str) -> int | float:
    """Return ``after[key] - before[key]``, counting absent values as ``0``.

    A key that is missing or holds a falsy value such as ``None`` counts as
    ``0`` on either side, so a null counter does not raise ``TypeError``.
    """
    return (after.get(key) or 0) - (before.get(key) or 0)
473
+
474
+
475
def benchmark_path_matches(report: dict[str, Any], path: Path) -> bool:
    """Tell whether ``report`` was recorded for the benchmark at ``path``.

    True when any key derived from ``path`` equals any key derived from the
    report's ``benchmark_path`` / ``benchmark`` fields.
    """
    wanted = benchmark_path_keys(path)
    recorded = report_benchmark_keys(report)
    return not wanted.isdisjoint(recorded)
477
+
478
+
479
def benchmark_path_keys(path: Path) -> set[str]:
    """Return the string forms of ``path`` used for baseline matching.

    The set holds the path as given and its resolved form; empty strings are
    left out.
    """
    candidates = (str(path), str(path.resolve()))
    return {candidate for candidate in candidates if candidate}
481
+
482
+
483
def report_benchmark_keys(report: dict[str, Any]) -> set[str]:
    """Collect the path keys under which a stored report can be matched.

    Both ``benchmark_path`` and ``benchmark`` are considered. Values that are
    not strings or are blank after stripping are skipped. Each remaining
    value contributes its stripped text, its ``Path`` string form and its
    resolved form.
    """
    collected: set[str] = set()
    for field in ("benchmark_path", "benchmark"):
        raw = report.get(field)
        text = raw.strip() if isinstance(raw, str) else ""
        if text:
            as_path = Path(text)
            collected |= {text, str(as_path), str(as_path.resolve())}
    return collected - {""}