akernel-runtime 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- akernel_runtime-0.1.0.dist-info/METADATA +270 -0
- akernel_runtime-0.1.0.dist-info/RECORD +40 -0
- akernel_runtime-0.1.0.dist-info/WHEEL +5 -0
- akernel_runtime-0.1.0.dist-info/entry_points.txt +2 -0
- akernel_runtime-0.1.0.dist-info/licenses/LICENSE +201 -0
- akernel_runtime-0.1.0.dist-info/licenses/NOTICE +4 -0
- akernel_runtime-0.1.0.dist-info/top_level.txt +1 -0
- context_kernel/__init__.py +4 -0
- context_kernel/__main__.py +5 -0
- context_kernel/agent_reports.py +188 -0
- context_kernel/benchmarks.py +493 -0
- context_kernel/budget.py +72 -0
- context_kernel/cli.py +2953 -0
- context_kernel/context.py +161 -0
- context_kernel/evals.py +347 -0
- context_kernel/global_memory.py +126 -0
- context_kernel/loop.py +1617 -0
- context_kernel/marketplace.py +194 -0
- context_kernel/marketplace_data/skills/context_budget.json +27 -0
- context_kernel/marketplace_data/skills/context_compaction.json +27 -0
- context_kernel/marketplace_data/skills/edit_file.json +27 -0
- context_kernel/marketplace_data/skills/index.json +66 -0
- context_kernel/marketplace_data/skills/long_task_planning.json +27 -0
- context_kernel/marketplace_data/skills/multi_file_bugfix.json +28 -0
- context_kernel/memory.py +515 -0
- context_kernel/models.py +144 -0
- context_kernel/planner.py +155 -0
- context_kernel/policy.py +271 -0
- context_kernel/project.py +317 -0
- context_kernel/providers.py +1264 -0
- context_kernel/report_costs.py +375 -0
- context_kernel/runner.py +78 -0
- context_kernel/skills.py +318 -0
- context_kernel/state_writer.py +108 -0
- context_kernel/storage.py +171 -0
- context_kernel/tasks.py +549 -0
- context_kernel/text.py +42 -0
- context_kernel/tokenizer.py +22 -0
- context_kernel/tools.py +544 -0
- context_kernel/verifier.py +77 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from .storage import Workspace
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def load_agent_report(workspace: Workspace, run_id: str) -> dict[str, Any]:
    """Read the stored JSON report for a single agent run.

    The report is looked up as ``<run_id>.json`` inside
    ``workspace.agent_runs_dir`` and parsed with ``Workspace.read_json``.

    Raises:
        KeyError: if no report file exists for ``run_id``.
    """
    report_file = workspace.agent_runs_dir / f"{run_id}.json"
    if report_file.exists():
        return Workspace.read_json(report_file)
    raise KeyError(f"Unknown agent run: {run_id}")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _dig(data: Any, *keys: str) -> Any:
    """Follow ``keys`` through nested dicts; return None if any level is missing or not a dict."""
    current = data
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def build_agent_cost_report(report: dict[str, Any]) -> dict[str, Any]:
    """Build a token-cost summary from a stored agent-run report.

    Args:
        report: the agent-run report dict. The keys read here are ``steps``,
            ``totals``, ``id``, ``task_id``, ``status``, ``request``,
            ``final_response``, ``model_routing`` and ``storage``.

    Returns:
        A dict with the run identity fields, a ``summary`` section (token
        totals, per-action breakdown, task-brief and planned-context series
        statistics), the per-step summaries under ``steps``, the three steps
        with the highest ``total_tokens`` under ``hotspots``, and the
        report's ``storage`` entry.

    Keys whose stored value is ``null`` (``steps``, ``tokens``, ``plan``,
    ``totals`` and the nested plan sections) are treated as absent instead of
    raising, matching how ``action`` and ``tool`` are already handled.
    """
    steps = report.get("steps") or []
    step_summaries: list[dict[str, Any]] = []
    action_breakdown: dict[str, dict[str, int]] = {}
    task_brief_series: list[int] = []
    planned_context_series: list[int] = []

    for step in steps:
        action = str((step.get("action") or {}).get("action") or "unknown")
        tokens = normalize_tokens(step.get("tokens") or {})
        # _dig tolerates a null/non-dict "plan", "task" or "budget" entry.
        task_brief_tokens = read_int(_dig(step, "plan", "task", "estimated_tokens"))
        planned_context_tokens = read_int(_dig(step, "plan", "budget", "estimated_used"))
        task_brief_series.append(task_brief_tokens)
        planned_context_series.append(planned_context_tokens)

        tool = step.get("tool")
        if not isinstance(tool, dict):
            tool = {}

        step_summaries.append(
            {
                "index": read_int(step.get("index")),
                "action": action,
                "status": str(step.get("status") or ""),
                "input_tokens": tokens["input_tokens"],
                "output_tokens": tokens["output_tokens"],
                "total_tokens": tokens["total_tokens"],
                "task_brief_tokens": task_brief_tokens,
                "planned_context_tokens": planned_context_tokens,
                "trace_id": step.get("trace_id"),
                "tool_trace_id": step.get("tool_trace_id"),
                "tool": tool.get("name"),
                "model_role": step.get("model_role"),
                "model": step.get("model"),
                "aux_review": summarize_aux_review(step.get("aux_review", {})),
            }
        )

        bucket = action_breakdown.setdefault(
            action,
            {"count": 0, "input_tokens": 0, "output_tokens": 0, "total_tokens": 0},
        )
        bucket["count"] += 1
        bucket["input_tokens"] += tokens["input_tokens"]
        bucket["output_tokens"] += tokens["output_tokens"]
        bucket["total_tokens"] += tokens["total_tokens"]

    # Run-level totals come from the report's own "totals" entry, not from
    # summing the steps.
    totals = normalize_tokens(report.get("totals") or {})
    hotspots = sorted(step_summaries, key=lambda item: item["total_tokens"], reverse=True)[:3]
    step_count = len(step_summaries)
    return {
        "run_id": report.get("id"),
        "task_id": report.get("task_id"),
        "status": report.get("status"),
        "request": str(report.get("request") or ""),
        "model_routing": report.get("model_routing", {}),
        "summary": {
            "step_count": step_count,
            "input_tokens": totals["input_tokens"],
            "output_tokens": totals["output_tokens"],
            "total_tokens": totals["total_tokens"],
            "average_tokens_per_step": round(totals["total_tokens"] / step_count, 2) if step_count else 0.0,
            "max_step_tokens": hotspots[0]["total_tokens"] if hotspots else 0,
            "max_step_index": hotspots[0]["index"] if hotspots else 0,
            "request_chars": len(str(report.get("request") or "")),
            "final_response_chars": len(str(report.get("final_response") or "")),
            "action_breakdown": action_breakdown,
            "task_brief": summarize_series(task_brief_series),
            "planned_context": summarize_series(planned_context_series),
        },
        "steps": step_summaries,
        "hotspots": hotspots,
        "storage": report.get("storage", {}),
    }
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def render_agent_cost_report(cost: dict[str, Any]) -> str:
    """Render a cost report dict as plain text, one fact per line.

    The output starts with run-level lines (identity, token totals, task-brief
    and planned-context series, hotspot), optionally followed by model-routing
    and per-action lines, then a "Step Breakdown" section with one line per
    entry in ``cost["steps"]``. Lines are joined with newlines and there is no
    trailing newline.
    """
    summary = cost["summary"]

    out = [
        f"agent_run: {cost['run_id']}",
        f"task: {cost.get('task_id')}",
        f"status: {cost.get('status')}",
        f"steps: {summary['step_count']}",
    ]
    out.append(
        f"tokens: total={summary['total_tokens']} "
        f"input={summary['input_tokens']} output={summary['output_tokens']} "
        f"avg_per_step={summary['average_tokens_per_step']}"
    )
    out.append(
        f"task_brief_tokens: first={summary['task_brief']['first_tokens']} "
        f"last={summary['task_brief']['last_tokens']} "
        f"peak={summary['task_brief']['peak_tokens']} "
        f"growth={summary['task_brief']['growth_tokens']}"
    )
    out.append(
        f"planned_context_tokens: first={summary['planned_context']['first_tokens']} "
        f"last={summary['planned_context']['last_tokens']} "
        f"peak={summary['planned_context']['peak_tokens']} "
        f"growth={summary['planned_context']['growth_tokens']}"
    )
    out.append(f"hotspot: step={summary['max_step_index']} tokens={summary['max_step_tokens']}")

    # A missing, null or empty routing entry produces no line at all.
    routing = cost.get("model_routing") or {}
    if routing:
        routing_fields = " ".join(
            [
                f"mode={routing.get('mode')}",
                f"primary={routing.get('primary_model')}",
                f"auxiliary={routing.get('auxiliary_model')}",
                f"review={routing.get('aux_review')}",
            ]
        )
        out.append("model_routing: " + routing_fields)

    breakdown = summary["action_breakdown"]
    if breakdown:
        rendered_actions = [
            f"{action}={bucket['count']}x/{bucket['total_tokens']}t"
            for action, bucket in breakdown.items()
        ]
        out.append("actions: " + ", ".join(rendered_actions))

    out.extend(["", "Step Breakdown"])
    for step in cost["steps"]:
        fields = [
            f"- step {step['index']}:",
            f"action={step['action']}",
            f"status={step['status']}",
            f"tokens={step['total_tokens']}",
            f"input={step['input_tokens']}",
            f"output={step['output_tokens']}",
            f"brief={step['task_brief_tokens']}",
            f"context={step['planned_context_tokens']}",
            f"model={step.get('model_role') or 'unknown'}:{step.get('model') or 'default'}",
        ]
        # The review suffix carries its own leading space (or is empty).
        out.append(" ".join(fields) + render_aux_review_inline(step.get("aux_review", {})))
    return "\n".join(out)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def summarize_aux_review(review: dict[str, Any]) -> dict[str, Any]:
    """Reduce a step's auxiliary-review record to the fields used in cost reports.

    Anything that is not a dict, or a dict without a truthy ``enabled`` flag,
    collapses to ``{"enabled": False}``. Otherwise the trace id, model, risk
    and recommendation are copied over and the review's normalized
    ``total_tokens`` count is included.
    """
    is_enabled = isinstance(review, dict) and bool(review.get("enabled"))
    if not is_enabled:
        return {"enabled": False}

    condensed: dict[str, Any] = {"enabled": True}
    for field in ("trace_id", "model", "risk", "recommendation"):
        condensed[field] = review.get(field)
    condensed["total_tokens"] = normalize_tokens(review.get("tokens", {}))["total_tokens"]
    return condensed
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def render_aux_review_inline(review: dict[str, Any]) -> str:
    """Format an auxiliary-review summary as a suffix for a step line.

    Returns an empty string when ``review`` is not a dict or is not enabled;
    otherwise a string with a leading space of the form
    `` review=<risk>:<recommendation>:<total_tokens>t``.
    """
    if isinstance(review, dict) and review.get("enabled"):
        risk = review.get("risk")
        recommendation = review.get("recommendation")
        token_count = review.get("total_tokens", 0)
        return f" review={risk}:{recommendation}:{token_count}t"
    return ""
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def summarize_series(values: list[int]) -> dict[str, int]:
    """Describe a per-step token series by its first, last and peak value and its growth.

    ``growth_tokens`` is the last value minus the first. An empty series
    yields zeros for every field.
    """
    if values:
        first, last = values[0], values[-1]
        peak = max(values)
    else:
        first = last = peak = 0
    return {
        "first_tokens": first,
        "last_tokens": last,
        "peak_tokens": peak,
        "growth_tokens": last - first,
    }
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def normalize_tokens(tokens: dict[str, Any]) -> dict[str, int]:
    """Return the ``input_tokens``, ``output_tokens`` and ``total_tokens`` counts as ints.

    Each value is passed through ``read_int``, so missing or unparsable
    entries become 0. A ``tokens`` argument that is not a mapping (for
    example ``None`` from a stored ``"tokens": null``) is treated as empty
    rather than raising AttributeError.
    """
    from collections.abc import Mapping

    source = tokens if isinstance(tokens, Mapping) else {}
    return {
        "input_tokens": read_int(source.get("input_tokens")),
        "output_tokens": read_int(source.get("output_tokens")),
        "total_tokens": read_int(source.get("total_tokens")),
    }
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def read_int(value: Any) -> int:
    """Coerce ``value`` to an int, returning 0 when that is not possible.

    Falsy values (``None``, ``""``, ``0``) map to 0. Values that ``int()``
    rejects map to 0 as well: TypeError (unsupported type), ValueError
    (unparsable string, NaN) and OverflowError (float infinity).
    """
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")) — not covered by ValueError.
        return 0
|
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
from uuid import uuid4
|
|
6
|
+
|
|
7
|
+
from .budget import DEFAULT_PROFILE
|
|
8
|
+
from .evals import diff_reports
|
|
9
|
+
from .evals import EvalRunner
|
|
10
|
+
from .models import utc_now
|
|
11
|
+
from .report_costs import build_benchmark_cost_report, diff_cost_reports, render_cost_markdown
|
|
12
|
+
from .storage import Workspace
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BenchmarkRunner:
    """Run benchmark fixture directories and manage the reports saved from them.

    Reports are stored as ``<id>.json`` files under
    ``workspace.benchmarks_dir``.
    """

    def __init__(self, workspace: Workspace):
        self.workspace = workspace
        self.evals = EvalRunner(workspace)

    def run_directory(
        self,
        path: Path,
        default_budget: int | None = None,
        default_profile: str = DEFAULT_PROFILE,
        save: bool = True,
        execute_provider: str | None = None,
        execute_model: str | None = None,
        execute_base_url: str | None = None,
    ) -> dict[str, Any]:
        """Run every ``*.json`` fixture directly inside ``path`` and build one combined report.

        Fixtures are run in sorted order through ``EvalRunner.run_fixture``
        with ``save=False``; only the combined report is saved, and only when
        ``save`` is true.

        Raises:
            ValueError: if ``path`` contains no ``*.json`` files.
        """
        fixtures = sorted(path.glob("*.json"))
        if not fixtures:
            raise ValueError(f"Benchmark directory has no JSON fixtures: {path}")

        resolved_path = path.resolve()
        reports = [
            self.evals.run_fixture(
                fixture,
                default_budget=default_budget,
                default_profile=default_profile,
                save=False,
                execute_provider=execute_provider,
                execute_model=execute_model,
                execute_base_url=execute_base_url,
            )
            for fixture in fixtures
        ]
        report = {
            "id": uuid4().hex[:12],
            "created_at": utc_now(),
            "benchmark": str(path),
            "benchmark_path": str(resolved_path),
            "name": path.name,
            "execution": {
                "enabled": bool(execute_provider),
                "provider": execute_provider,
                "model": execute_model,
            },
            "fixtures": reports,
            "summary": summarize_benchmark(reports),
        }
        if save:
            self.save_report(report)
        return report

    def save_report(self, report: dict[str, Any]) -> Path:
        """Write ``report`` to ``<benchmarks_dir>/<report id>.json`` and return that path."""
        self.workspace.benchmarks_dir.mkdir(parents=True, exist_ok=True)
        path = self.workspace.benchmarks_dir / f"{report['id']}.json"
        Workspace.write_json(path, report)
        return path

    def list_reports(self) -> list[dict[str, Any]]:
        """Return one overview row per saved report, newest ``created_at`` first."""
        rows: list[dict[str, Any]] = []
        for path, report in self._load_reports():
            summary = report.get("summary", {})
            rows.append(
                {
                    "id": report.get("id", path.stem),
                    "created_at": report.get("created_at", ""),
                    "name": report.get("name", ""),
                    "fixture_count": summary.get("fixture_count", 0),
                    "task_count": summary.get("task_count", 0),
                    "average_savings_percent": summary.get("average_savings_percent", 0),
                    "checks": f"{summary.get('passed_checks', 0)}/{summary.get('total_checks', 0)}",
                    "ok": summary.get("ok", False),
                }
            )
        return sorted(rows, key=self._created_at_key, reverse=True)

    def get_report(self, report_id: str) -> dict[str, Any]:
        """Load the saved report with the given id.

        Raises:
            KeyError: if ``<report_id>.json`` does not exist.
        """
        path = self.workspace.benchmarks_dir / f"{report_id}.json"
        if not path.exists():
            raise KeyError(f"Unknown benchmark report: {report_id}")
        return Workspace.read_json(path)

    def find_baseline(
        self,
        path: Path,
        *,
        baseline_id: str | None = None,
        exclude_id: str | None = None,
    ) -> dict[str, Any] | None:
        """Pick the saved report to compare a run of ``path`` against.

        With ``baseline_id`` the named report is returned (match
        ``"explicit"``). Otherwise the newest report whose recorded benchmark
        path matches ``path`` is used (match ``"path"``), falling back to the
        newest report with the same directory name (match ``"name"``). The
        report whose id equals ``exclude_id`` is never considered.

        Returns:
            ``{"match": ..., "report": ...}``, or ``None`` when nothing matches.
        """
        if baseline_id:
            return {"match": "explicit", "report": self.get_report(baseline_id)}

        path_matches: list[dict[str, Any]] = []
        name_matches: list[dict[str, Any]] = []
        for _, report in self._load_reports():
            if exclude_id and report.get("id") == exclude_id:
                continue
            if benchmark_path_matches(report, path):
                path_matches.append(report)
                continue
            if report.get("name") == path.name:
                name_matches.append(report)

        # Path matches take priority; name matches are only a fallback.
        matches = path_matches or name_matches
        if not matches:
            return None
        matches.sort(key=self._created_at_key, reverse=True)
        return {
            "match": "path" if path_matches else "name",
            "report": matches[0],
        }

    def diff_reports(self, before_id: str, after_id: str) -> dict[str, Any]:
        """Load two saved reports by id and return ``diff_benchmarks(before, after)``."""
        before = self.get_report(before_id)
        after = self.get_report(after_id)
        return diff_benchmarks(before, after)

    def export_markdown(self, report_id: str, output: Path | None = None) -> Path:
        """Render a saved report as Markdown and write it to ``output``.

        ``output`` defaults to ``<benchmarks_dir>/<report_id>.md``. Returns the
        path written.
        """
        report = self.get_report(report_id)
        output = output or self.workspace.benchmarks_dir / f"{report_id}.md"
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(render_benchmark_markdown(report), encoding="utf-8")
        return output

    def evidence(
        self,
        report_ids: list[str] | None = None,
        *,
        limit: int | None = None,
    ) -> dict[str, Any]:
        """Aggregate saved reports into an evidence dict.

        Uses the reports named in ``report_ids`` when given (``limit`` is then
        ignored); otherwise the most recent reports, capped at ``limit``.
        """
        if report_ids:
            reports = [self.get_report(report_id) for report_id in report_ids]
        else:
            reports = self._recent_reports(limit=limit)
        return build_benchmark_evidence(reports)

    def export_evidence_markdown(
        self,
        report_ids: list[str] | None = None,
        *,
        limit: int | None = None,
        output: Path | None = None,
    ) -> Path:
        """Write the evidence summary as Markdown and return the path written.

        ``output`` defaults to ``<benchmarks_dir>/evidence.md``.
        """
        evidence = self.evidence(report_ids, limit=limit)
        output = output or self.workspace.benchmarks_dir / "evidence.md"
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(render_benchmark_evidence_markdown(evidence), encoding="utf-8")
        return output

    def _recent_reports(self, *, limit: int | None = None) -> list[dict[str, Any]]:
        """Return saved reports newest first; a non-None ``limit`` caps the count (negative means 0)."""
        reports = [report for _, report in self._load_reports()]
        reports.sort(key=self._created_at_key, reverse=True)
        if limit is not None:
            reports = reports[: max(0, limit)]
        return reports

    def _load_reports(self) -> list[tuple[Path, dict[str, Any]]]:
        """Return ``(path, report)`` for every ``*.json`` file in the benchmarks directory.

        Files are read in sorted path order. The directory is created first so
        a workspace without any reports yields an empty list.
        """
        directory = self.workspace.benchmarks_dir
        directory.mkdir(parents=True, exist_ok=True)
        return [(path, Workspace.read_json(path)) for path in sorted(directory.glob("*.json"))]

    @staticmethod
    def _created_at_key(report: dict[str, Any]) -> str:
        """Sort key for ``created_at``; a missing or null value sorts as the empty string.

        Coercing to ``str`` keeps sorting from raising TypeError when a stored
        report carries ``"created_at": null``.
        """
        return str(report.get("created_at") or "")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def summarize_benchmark(reports: list[dict[str, Any]]) -> dict[str, Any]:
    """Combine per-fixture summaries into one benchmark-level summary.

    Task, token and check counts are summed across ``report["summary"]`` of
    every fixture report. Execution-related counters are optional per fixture
    and default to 0. ``average_savings_percent`` is the unweighted mean of
    the fixtures' averages, while ``total_savings_percent`` is computed from
    the summed token counts. ``ok`` is true when all checks passed.
    """
    fixture_summaries = [report["summary"] for report in reports]

    def required_total(field: str) -> Any:
        return sum(entry[field] for entry in fixture_summaries)

    def optional_total(field: str) -> Any:
        return sum(entry.get(field, 0) for entry in fixture_summaries)

    task_total = required_total("task_count")
    kernel_total = required_total("total_kernel_tokens")
    baseline_total = required_total("total_baseline_tokens")
    # Savings are floored at zero when the kernel used more than the baseline.
    savings_total = max(0, baseline_total - kernel_total)
    checks_total = required_total("total_checks")
    checks_passed = required_total("passed_checks")
    execution_token_total = optional_total("total_execution_tokens")
    executed_total = optional_total("executed_tasks")
    blocked_total = optional_total("blocked_tasks")

    if fixture_summaries:
        mean_savings = required_total("average_savings_percent") / len(fixture_summaries)
    else:
        mean_savings = 0.0

    if baseline_total:
        savings_percent = round((savings_total / baseline_total) * 100, 2)
    else:
        savings_percent = 0.0

    return {
        "fixture_count": len(reports),
        "task_count": task_total,
        "total_kernel_tokens": kernel_total,
        "total_baseline_tokens": baseline_total,
        "total_savings_tokens": savings_total,
        "total_savings_percent": savings_percent,
        "average_savings_percent": round(mean_savings, 2),
        "passed_checks": checks_passed,
        "total_checks": checks_total,
        "executed_tasks": executed_total,
        "blocked_tasks": blocked_total,
        "total_execution_tokens": execution_token_total,
        "ok": checks_passed == checks_total,
    }
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def build_benchmark_evidence(reports: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate several benchmark reports into a single evidence record.

    Reports are ordered newest ``created_at`` first. Counts and token totals
    are summed from each report's ``summary``; the five tasks with the
    largest token savings and the five with the lowest savings percentage are
    listed separately. ``ok`` requires at least one report and every check
    passing. The record gets a fresh id and creation timestamp.
    """
    ordered = sorted(reports, key=lambda item: item.get("created_at", ""), reverse=True)
    summaries = [report.get("summary", {}) for report in ordered]

    def int_total(field: str) -> int:
        # Missing or falsy entries count as 0.
        return sum(int(entry.get(field, 0) or 0) for entry in summaries)

    kernel_total = int_total("total_kernel_tokens")
    baseline_total = int_total("total_baseline_tokens")
    savings_total = max(0, baseline_total - kernel_total)
    checks_total = int_total("total_checks")
    checks_passed = int_total("passed_checks")

    task_rows = benchmark_task_snapshots(ordered)
    weakest = sorted(task_rows, key=lambda item: item["savings_percent"])[:5]
    strongest = sorted(task_rows, key=lambda item: item["savings_tokens"], reverse=True)[:5]

    if summaries:
        percent_sum = sum(float(entry.get("total_savings_percent", 0) or 0) for entry in summaries)
        mean_report_savings = round(percent_sum / len(summaries), 2)
    else:
        mean_report_savings = 0.0

    distinct_names = {report.get("name", "") for report in ordered if report.get("name")}
    report_refs = [benchmark_ref(report) | {"summary": report.get("summary", {})} for report in ordered]

    return {
        "id": uuid4().hex[:12],
        "created_at": utc_now(),
        "report_count": len(ordered),
        "benchmark_count": len(distinct_names),
        "fixture_count": int_total("fixture_count"),
        "task_count": int_total("task_count"),
        "total_kernel_tokens": kernel_total,
        "total_baseline_tokens": baseline_total,
        "total_savings_tokens": savings_total,
        "total_savings_percent": round((savings_total / baseline_total) * 100, 2) if baseline_total else 0.0,
        "average_report_savings_percent": mean_report_savings,
        "passed_checks": checks_passed,
        "total_checks": checks_total,
        "pass_rate_percent": round((checks_passed / checks_total) * 100, 2) if checks_total else 0.0,
        "reports": report_refs,
        "strongest_savings": strongest,
        "weakest_savings": weakest,
        "ok": bool(ordered) and checks_passed == checks_total,
    }
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def benchmark_task_snapshots(reports: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Flatten reports into one row per task, in report/fixture/task order.

    Each row records the owning report id and benchmark name, the fixture
    file name, the task id and profile, the kernel and baseline token
    estimates, the savings (tokens floored at 0, percentage as a float) and
    the checks as a ``"passed/total"`` string. Missing values default to 0.
    """
    rows: list[dict[str, Any]] = []
    for report in reports:
        for fixture in report.get("fixtures", []):
            # Only the file name of the fixture path is kept.
            fixture_label = Path(str(fixture.get("fixture", ""))).name
            for task in fixture.get("tasks", []):
                kernel = int(task.get("kernel", {}).get("estimated_tokens", 0) or 0)
                baseline = int(task.get("baseline", {}).get("estimated_tokens", 0) or 0)
                checks = task.get("checks", {})
                passed = int(checks.get("passed", 0) or 0)
                total = int(checks.get("total", 0) or 0)
                percent = float(task.get("savings", {}).get("percent", 0) or 0)
                rows.append(
                    {
                        "report_id": report.get("id"),
                        "benchmark": report.get("name"),
                        "fixture": fixture_label,
                        "task": task.get("id"),
                        "profile": task.get("profile"),
                        "kernel_tokens": kernel,
                        "baseline_tokens": baseline,
                        "savings_tokens": max(0, baseline - kernel),
                        "savings_percent": percent,
                        "checks": f"{passed}/{total}",
                    }
                )
    return rows
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def diff_benchmarks(
    before: dict[str, Any],
    after: dict[str, Any],
    *,
    kernel_token_tolerance: int = 10,
) -> dict[str, Any]:
    """Compare two benchmark reports and flag regressions.

    Args:
        before: the earlier benchmark report.
        after: the later benchmark report.
        kernel_token_tolerance: how many additional kernel tokens a fixture
            may use before the increase alone counts as a regression. The
            default of 10 is the threshold that was previously hard-coded.

    Returns:
        A dict with references to both reports, the summary deltas
        (``after`` minus ``before``), the cost diff and its regressions, the
        per-fixture diffs, the list of regressed fixtures, and ``ok`` which
        is true only when there are no fixture regressions and the cost diff
        is itself ok.

    Only fixtures present in both reports (status ``"changed"``) can be
    reported as regressions; added and removed fixtures are listed under
    ``fixtures`` but never flagged.
    """
    before_summary = before.get("summary", {})
    after_summary = after.get("summary", {})
    fixture_diffs = diff_fixtures(before.get("fixtures", []), after.get("fixtures", []))
    cost_diff = diff_cost_reports(build_benchmark_cost_report(before), build_benchmark_cost_report(after))
    regressions = [
        fixture
        for fixture in fixture_diffs
        if fixture.get("status") == "changed"
        and (
            fixture["summary_delta"]["kernel_tokens"] > kernel_token_tolerance
            or fixture["summary_delta"]["savings_percent"] < 0
            or fixture["summary_delta"]["passed_checks"] < 0
            or fixture["regressions"]
        )
    ]
    return {
        "before": benchmark_ref(before),
        "after": benchmark_ref(after),
        "summary_delta": {
            "fixtures": delta(before_summary, after_summary, "fixture_count"),
            "tasks": delta(before_summary, after_summary, "task_count"),
            "kernel_tokens": delta(before_summary, after_summary, "total_kernel_tokens"),
            "baseline_tokens": delta(before_summary, after_summary, "total_baseline_tokens"),
            "savings_tokens": delta(before_summary, after_summary, "total_savings_tokens"),
            "savings_percent": round(delta(before_summary, after_summary, "total_savings_percent"), 2),
            "passed_checks": delta(before_summary, after_summary, "passed_checks"),
            "total_checks": delta(before_summary, after_summary, "total_checks"),
            "execution_tokens": delta(before_summary, after_summary, "total_execution_tokens"),
        },
        "cost_diff": cost_diff,
        "cost_regressions": cost_diff["regressions"],
        "fixtures": fixture_diffs,
        "regressions": regressions,
        "ok": not regressions and cost_diff["ok"],
    }
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def diff_fixtures(before_fixtures: list[dict[str, Any]], after_fixtures: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Pair up fixture reports by file name and diff each pair.

    Fixtures are matched on the file name of their ``"fixture"`` path and the
    result is ordered by that name. A fixture present on only one side yields
    a ``"added"`` or ``"removed"`` entry; one present on both sides is diffed
    with ``diff_reports`` and yields a ``"changed"`` entry carrying that
    diff's summary delta, regressions and ok flag.
    """

    def index_by_name(fixtures: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
        return {Path(entry["fixture"]).name: entry for entry in fixtures}

    previous = index_by_name(before_fixtures)
    current = index_by_name(after_fixtures)

    results: list[dict[str, Any]] = []
    for name in sorted(previous.keys() | current.keys()):
        old = previous.get(name)
        new = current.get(name)
        if old is None:
            entry: dict[str, Any] = {"fixture": name, "status": "added"}
        elif new is None:
            entry = {"fixture": name, "status": "removed"}
        else:
            compared = diff_reports(old, new)
            entry = {
                "fixture": name,
                "status": "changed",
                "summary_delta": compared["summary_delta"],
                "regressions": compared["regressions"],
                "ok": compared["ok"],
            }
        results.append(entry)
    return results
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def render_benchmark_markdown(report: dict[str, Any]) -> str:
    """Render a benchmark report as a Markdown document.

    The document has a header list with the report's summary figures, a
    "Cost View" section produced by ``render_cost_markdown``, a table with one
    row per fixture, and one task table per fixture. The result ends with
    exactly one trailing newline.

    Table cells are coerced with ``str`` so non-string task ids or profiles
    do not break ``str.join``. A fixture summary without
    ``total_execution_tokens`` renders as 0, which is how
    ``summarize_benchmark`` treats that key.
    """

    def table_row(cells: list[Any]) -> str:
        return "| " + " | ".join(str(cell) for cell in cells) + " |"

    summary = report["summary"]
    cost = build_benchmark_cost_report(report)
    lines = [
        f"# Benchmark Report: {report['name']}",
        "",
        f"- Report id: `{report['id']}`",
        f"- Created at: `{report['created_at']}`",
        f"- Benchmark: `{report['benchmark']}`",
        f"- Fixtures: `{summary['fixture_count']}`",
        f"- Tasks: `{summary['task_count']}`",
        f"- Average savings: `{summary['average_savings_percent']}%`",
        f"- Total savings: `{summary['total_savings_tokens']}` tokens (`{summary['total_savings_percent']}%`)",
        f"- Checks: `{summary['passed_checks']}/{summary['total_checks']}`",
        f"- Executed tasks: `{summary['executed_tasks']}`",
        f"- Execution tokens: `{summary['total_execution_tokens']}`",
        "",
        "## Cost View",
        "",
        render_cost_markdown(cost),
        "",
        "## Fixtures",
        "",
        "| Fixture | Tasks | Avg Savings | Checks | Execution Tokens |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]
    for fixture in report["fixtures"]:
        fixture_summary = fixture["summary"]
        lines.append(
            table_row(
                [
                    Path(fixture["fixture"]).name,
                    fixture_summary["task_count"],
                    f"{fixture_summary['average_savings_percent']}%",
                    f"{fixture_summary['passed_checks']}/{fixture_summary['total_checks']}",
                    # Optional per fixture; summarize_benchmark defaults it to 0 too.
                    fixture_summary.get("total_execution_tokens", 0),
                ]
            )
        )
    lines.extend(["", "## Tasks", ""])
    for fixture in report["fixtures"]:
        lines.append(f"### {Path(fixture['fixture']).name}")
        lines.append("")
        lines.append("| Task | Profile | Kernel Tokens | Baseline Tokens | Savings | Checks |")
        lines.append("| --- | --- | ---: | ---: | ---: | ---: |")
        for task in fixture["tasks"]:
            lines.append(
                table_row(
                    [
                        task["id"],
                        task["profile"],
                        task["kernel"]["estimated_tokens"],
                        task["baseline"]["estimated_tokens"],
                        f"{task['savings']['percent']}%",
                        f"{task['checks']['passed']}/{task['checks']['total']}",
                    ]
                )
            )
        lines.append("")
    return "\n".join(lines).rstrip() + "\n"
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def render_benchmark_evidence_markdown(evidence: dict[str, Any]) -> str:
    """Render an evidence record as a Markdown document.

    Output consists of a header list with the aggregate figures, a "Reports"
    table with one row per entry in ``evidence["reports"]``, and "Strongest
    Savings" / "Weakest Savings" tables with one row per listed task. The
    result ends with exactly one trailing newline.
    """
    savings_table_header = [
        "| Scope | Kernel | Baseline | Savings | Checks |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]

    out = ["# Benchmark Evidence", ""]
    out.append(f"- Evidence id: `{evidence['id']}`")
    out.append(f"- Created at: `{evidence['created_at']}`")
    out.append(f"- Reports: `{evidence['report_count']}`")
    out.append(f"- Benchmarks: `{evidence['benchmark_count']}`")
    out.append(f"- Fixtures: `{evidence['fixture_count']}`")
    out.append(f"- Tasks: `{evidence['task_count']}`")
    out.append(f"- Kernel tokens: `{evidence['total_kernel_tokens']}`")
    out.append(f"- Baseline tokens: `{evidence['total_baseline_tokens']}`")
    out.append(f"- Token savings: `{evidence['total_savings_tokens']}` (`{evidence['total_savings_percent']}%`)")
    out.append(
        f"- Checks: `{evidence['passed_checks']}/{evidence['total_checks']}` (`{evidence['pass_rate_percent']}%`)"
    )
    out.extend(
        [
            "",
            "## Reports",
            "",
            "| Report | Benchmark | Tasks | Savings | Checks |",
            "| --- | --- | ---: | ---: | ---: |",
        ]
    )

    for report in evidence.get("reports", []):
        summary = report.get("summary", {})
        cells = [
            str(report.get("id", "")),
            str(report.get("name", "")),
            str(summary.get("task_count", 0)),
            f"{summary.get('total_savings_tokens', 0)} ({summary.get('total_savings_percent', 0)}%)",
            f"{summary.get('passed_checks', 0)}/{summary.get('total_checks', 0)}",
        ]
        out.append("| " + " | ".join(cells) + " |")

    # Both savings sections share the same table layout.
    sections = (
        ("## Strongest Savings", "strongest_savings"),
        ("## Weakest Savings", "weakest_savings"),
    )
    for heading, field in sections:
        out.extend(["", heading, "", *savings_table_header])
        for item in evidence.get(field, []):
            out.append(benchmark_evidence_task_line(item))

    return "\n".join(out).rstrip() + "\n"
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def benchmark_evidence_task_line(item: dict[str, Any]) -> str:
    """Format one task snapshot as a Markdown table row.

    The first cell is the scope ``<benchmark>/<fixture>/<task>``, followed by
    kernel tokens, baseline tokens, ``<savings tokens> (<savings percent>%)``
    and the checks string. Missing values fall back to ``""`` or ``0``.
    """
    scope_parts = (item.get("benchmark", ""), item.get("fixture", ""), item.get("task", ""))
    scope = f"{scope_parts[0]}/{scope_parts[1]}/{scope_parts[2]}"
    kernel_cell = str(item.get("kernel_tokens", 0))
    baseline_cell = str(item.get("baseline_tokens", 0))
    savings_cell = f"{item.get('savings_tokens', 0)} ({item.get('savings_percent', 0)}%)"
    checks_cell = str(item.get("checks", ""))
    cells = [scope, kernel_cell, baseline_cell, savings_cell, checks_cell]
    return f"| {' | '.join(cells)} |"
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def benchmark_ref(report: dict[str, Any]) -> dict[str, Any]:
    """Return the identifying fields of a benchmark report.

    Copies ``id``, ``created_at``, ``benchmark``, ``benchmark_path`` and
    ``name`` in that order; fields absent from ``report`` come back as ``None``.
    """
    identity_fields = ("id", "created_at", "benchmark", "benchmark_path", "name")
    return {field: report.get(field) for field in identity_fields}
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def delta(before: dict[str, Any], after: dict[str, Any], key: str) -> int | float:
    """Return ``after[key] - before[key]``, counting a missing or ``None`` value as 0.

    Treating ``None`` as 0 keeps the subtraction from raising TypeError when a
    stored summary holds an explicit null for ``key``.
    """
    before_value = before.get(key, 0)
    after_value = after.get(key, 0)
    if before_value is None:
        before_value = 0
    if after_value is None:
        after_value = 0
    return after_value - before_value
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def benchmark_path_matches(report: dict[str, Any], path: Path) -> bool:
    """Tell whether ``report`` was recorded for the benchmark directory ``path``.

    True when any string form of ``path`` (see ``benchmark_path_keys``) equals
    any string form of the paths stored in the report (see
    ``report_benchmark_keys``).
    """
    wanted = benchmark_path_keys(path)
    recorded = report_benchmark_keys(report)
    return not wanted.isdisjoint(recorded)
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def benchmark_path_keys(path: Path) -> set[str]:
    """Return the non-empty string forms of ``path``: as given and resolved."""
    candidates = {str(path), str(path.resolve())}
    candidates.discard("")
    return candidates
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def report_benchmark_keys(report: dict[str, Any]) -> set[str]:
    """Collect the string forms of the benchmark paths stored in ``report``.

    Reads ``benchmark_path`` and ``benchmark``. Values that are not strings
    or are blank after stripping are ignored. Each remaining value
    contributes the stripped text, its ``Path`` string form and its resolved
    form.
    """
    found: set[str] = set()
    for field in ("benchmark_path", "benchmark"):
        raw = report.get(field)
        stripped = raw.strip() if isinstance(raw, str) else ""
        if stripped:
            as_path = Path(stripped)
            found |= {stripped, str(as_path), str(as_path.resolve())}
    found.discard("")
    return found
|