@event4u/agent-config 2.18.0 → 2.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/agent-status.md +29 -0
- package/.agent-src/commands/onboard.md +221 -81
- package/.agent-src/commands/refine-ticket.md +3 -0
- package/.agent-src/packs/README.md +49 -0
- package/.agent-src/packs/agency-delivery.yml +63 -0
- package/.agent-src/packs/content-engine.yml +53 -0
- package/.agent-src/packs/founder-mvp.yml +51 -0
- package/.agent-src/personas/README.md +8 -0
- package/.agent-src/presets/README.md +26 -0
- package/.agent-src/presets/balanced.yml +34 -0
- package/.agent-src/presets/fast.yml +31 -0
- package/.agent-src/presets/strict.yml +38 -0
- package/.agent-src/profiles/README.md +29 -0
- package/.agent-src/profiles/agency.yml +27 -0
- package/.agent-src/profiles/content_creator.yml +25 -0
- package/.agent-src/profiles/developer.yml +26 -0
- package/.agent-src/profiles/finance.yml +24 -0
- package/.agent-src/profiles/founder.yml +25 -0
- package/.agent-src/profiles/ops.yml +25 -0
- package/.agent-src/rules/no-cheap-questions.md +25 -17
- package/.agent-src/skills/adr-create/SKILL.md +78 -68
- package/.agent-src/skills/refine-ticket/SKILL.md +3 -0
- package/.agent-src/skills/subagent-orchestration/SKILL.md +33 -0
- package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
- package/.agent-src/templates/skill-archive-note.md +101 -0
- package/.agent-src/user-types/README.md +124 -0
- package/.agent-src/user-types/_template/user-type.md +95 -0
- package/.agent-src/user-types/galabau-field-crew.md +100 -0
- package/.agent-src/user-types/metalworking-shop.md +105 -0
- package/.agent-src/user-types/truck-driver.md +113 -0
- package/.claude-plugin/marketplace.json +1 -1
- package/CHANGELOG.md +91 -30
- package/README.md +68 -72
- package/config/agent-settings.template.yml +22 -0
- package/docs/adrs/caveman/0001-default-off-until-bench.md +93 -0
- package/docs/adrs/caveman/README.md +9 -0
- package/docs/adrs/cost/0001-hard-stop-hook.md +114 -0
- package/docs/adrs/cost/README.md +9 -0
- package/docs/adrs/memory/0001-consumer-side-snapshot.md +111 -0
- package/docs/adrs/memory/README.md +9 -0
- package/docs/adrs/router/0001-three-tier-routing.md +119 -0
- package/docs/adrs/router/README.md +9 -0
- package/docs/adrs/schema/0001-json-schema-frontmatter.md +102 -0
- package/docs/adrs/schema/README.md +9 -0
- package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +99 -0
- package/docs/adrs/smoke/README.md +9 -0
- package/docs/architecture/current-onboard-baseline.md +126 -0
- package/docs/architecture/current-safety-behavior.md +137 -0
- package/docs/archive/CHANGELOG-pre-2.16.0.md +48 -0
- package/docs/contracts/adr-layout.md +108 -0
- package/docs/contracts/adr-mcp-runtime.md +128 -0
- package/docs/contracts/adr-user-types-axis.md +127 -0
- package/docs/contracts/benchmark-corpus-spec.md +97 -0
- package/docs/contracts/benchmark-report-schema.md +111 -0
- package/docs/contracts/command-clusters.md +1 -0
- package/docs/contracts/command-taxonomy.md +137 -0
- package/docs/contracts/compression-default-kill-criterion.md +69 -0
- package/docs/contracts/config-presets.md +144 -0
- package/docs/contracts/cost-dashboard.md +143 -0
- package/docs/contracts/cost-enforcement.md +134 -0
- package/docs/contracts/file-ownership-matrix.json +0 -7
- package/docs/contracts/mcp-tool-inventory.md +53 -0
- package/docs/contracts/measurement-baseline.md +102 -0
- package/docs/contracts/namespace.md +125 -0
- package/docs/contracts/profile-system.md +142 -0
- package/docs/contracts/safety-model.md +129 -0
- package/docs/contracts/smoke-contracts.md +144 -0
- package/docs/contracts/user-type-schema.md +146 -0
- package/docs/contracts/workflow-packs.md +121 -0
- package/docs/decisions/ADR-010-profile-pack-preset-boundary.md +132 -0
- package/docs/decisions/INDEX.md +1 -0
- package/docs/featured-commands.md +27 -0
- package/docs/parity/bench-ruflo.json +58 -0
- package/docs/parity/bench.json +41 -0
- package/docs/parity/ruflo.md +46 -0
- package/docs/profiles.md +91 -0
- package/docs/recruits/_template.md +81 -0
- package/package.json +1 -1
- package/scripts/_cli/cmd_explain.py +250 -0
- package/scripts/_lib/bench_cost.py +138 -0
- package/scripts/_lib/bench_quality.py +118 -0
- package/scripts/_lib/bench_report.py +150 -0
- package/scripts/agent-config +13 -0
- package/scripts/audit_adr_coverage.py +175 -0
- package/scripts/audit_mcp_tools.py +146 -0
- package/scripts/bench_baseline_ready.py +108 -0
- package/scripts/bench_drift_check.py +151 -0
- package/scripts/bench_per_tool.py +216 -0
- package/scripts/bench_run.py +155 -0
- package/scripts/compress.py +48 -2
- package/scripts/config/__init__.py +9 -0
- package/scripts/config/presets.py +206 -0
- package/scripts/config/profiles.py +173 -0
- package/scripts/cost/budget.mjs +73 -12
- package/scripts/cost/preflight.mjs +89 -0
- package/scripts/lint_archived_skills.py +143 -0
- package/scripts/lint_bench_corpus.py +161 -0
- package/scripts/lint_namespace.py +135 -0
- package/scripts/schemas/user-type.schema.json +35 -0
- package/scripts/skill_linter.py +139 -4
- package/scripts/skill_overlap.py +204 -0
- package/scripts/skill_tools/audit_user_type_coverage.py +148 -0
- package/scripts/skill_usage_collect.py +191 -0
- package/scripts/skill_usage_report.py +162 -0
- package/scripts/smoke/kernel.sh +101 -0
- package/scripts/smoke/router.sh +129 -0
- package/scripts/smoke/schema.sh +71 -0
- package/scripts/smoke/skills.sh +101 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"""``agent-config explain`` — print the decision chain behind an outcome.
|
|
2
|
+
|
|
3
|
+
Step-15 Phase 1 item 3. Answers the silent "why did the agent do that?"
|
|
4
|
+
question by showing which inputs the loader / router consulted, in what
|
|
5
|
+
order, and which one won. Read-only; never edits state, never dispatches
|
|
6
|
+
network calls. Three subjects in the v1 surface:
|
|
7
|
+
|
|
8
|
+
* ``config`` — full resolution chain for the active profile +
|
|
9
|
+
preset. Uses :mod:`scripts.config.profiles` and
|
|
10
|
+
:mod:`scripts.config.presets`; surfaces source
|
|
11
|
+
(pack / profile / user / env / runtime / default)
|
|
12
|
+
and per-knob overrides.
|
|
13
|
+
* ``rule <name>`` — kernel vs tier-1 vs tier-2 placement plus the
|
|
14
|
+
declared trigger list from ``router.json``.
|
|
15
|
+
* ``route <text>`` — given prompt text, returns every tier-1 rule
|
|
16
|
+
whose trigger list matches plus kernel rules
|
|
17
|
+
(always active).
|
|
18
|
+
|
|
19
|
+
Exit codes: ``0`` clean, ``1`` not found / no match, ``2`` invocation
|
|
20
|
+
error (bad project root, malformed ``router.json``).
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import argparse
|
|
25
|
+
import json
|
|
26
|
+
import os
|
|
27
|
+
import sys
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import Any
|
|
30
|
+
|
|
31
|
+
from scripts._lib.agent_settings import (
|
|
32
|
+
DEFAULT_PROJECT_FILE,
|
|
33
|
+
ProjectRootError,
|
|
34
|
+
load_agent_settings,
|
|
35
|
+
resolve_project_root,
|
|
36
|
+
)
|
|
37
|
+
from scripts.config import presets, profiles
|
|
38
|
+
|
|
39
|
+
ROUTER_FILENAME = "router.json"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _resolve_root(arg: str | None) -> tuple[Path, str]:
    """Resolve the project root or terminate with exit status 2.

    Args:
        arg: Value of ``--project`` (``None`` when the flag was omitted).

    Returns:
        Whatever ``resolve_project_root`` returns for ``arg`` and the
        current working directory.

    Raises:
        SystemExit: With code ``2`` when ``resolve_project_root`` raises
            ``ProjectRootError``; the error text is printed to stderr first.
    """
    try:
        resolved = resolve_project_root(arg, cwd=Path.cwd())
    except ProjectRootError as err:
        print(f"❌ explain: {err}", file=sys.stderr)
        raise SystemExit(2) from err
    else:
        return resolved
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _load_user_settings(project_root: Path) -> dict[str, Any]:
    """Load the project's agent settings, or ``{}`` when none are present.

    The settings file is looked up at ``project_root / DEFAULT_PROJECT_FILE``.
    A missing file, or a falsy result from ``load_agent_settings``, both
    yield an empty dict.
    """
    settings_file = project_root / DEFAULT_PROJECT_FILE
    if settings_file.exists():
        loaded = load_agent_settings(project_path=settings_file)
        return loaded or {}
    return {}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _load_router(project_root: Path) -> dict[str, Any]:
    """Read ``router.json`` from the project root.

    Args:
        project_root: Directory expected to contain ``ROUTER_FILENAME``.

    Returns:
        The parsed router object, or ``{}`` when the file does not exist.

    Raises:
        SystemExit: With code ``2`` when the file cannot be read, is not
            valid UTF-8 / JSON, or its top-level value is not a JSON object.
    """
    path = project_root / ROUTER_FILENAME
    if not path.exists():
        return {}
    try:
        router = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a file with invalid UTF-8 is reported instead of tracebacking.
        print(f"❌ explain: cannot read {path}: {exc}", file=sys.stderr)
        raise SystemExit(2) from exc
    if not isinstance(router, dict):
        # Every caller uses ``router.get(...)``; reject lists/scalars up front
        # so a malformed file yields the documented exit code.
        print(
            f"❌ explain: cannot read {path}: top-level JSON value must be an object",
            file=sys.stderr,
        )
        raise SystemExit(2)
    return router
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _explain_config(project_root: Path, *, as_json: bool) -> int:
    """Print the resolved profile and preset together with their sources.

    Args:
        project_root: Project directory whose settings are consulted.
        as_json: Emit a sorted, indented JSON document instead of text.

    Returns:
        Always ``0``.
    """
    user_settings = _load_user_settings(project_root)
    profile = profiles.resolve_profile(
        project_root=project_root,
        user_settings=user_settings,
    )
    preset = presets.resolve_preset(
        project_root=project_root,
        user_settings=user_settings,
        profile_preset_id=profile.preset_id,
    )

    profile_view = {
        "id": profile.id,
        "source": profile.source,
        "preset_id": profile.preset_id,
        "warning": profile.warning,
    }
    preset_view = {
        "id": preset.id,
        "source": preset.source,
        "overrides": list(preset.overrides),
        "knobs": preset.knobs,
    }
    env_view = {
        profiles.PROFILE_ID_ENV: os.environ.get(profiles.PROFILE_ID_ENV),
        presets.PRESET_ID_ENV: os.environ.get(presets.PRESET_ID_ENV),
    }
    payload = {
        "project_root": str(project_root),
        "profile": profile_view,
        "preset": preset_view,
        "env": env_view,
    }

    if as_json:
        json.dump(payload, sys.stdout, indent=2, sort_keys=True)
        sys.stdout.write("\n")
        return 0

    # Human-readable rendering: one line per resolved value.
    print(f" 📍 project_root: {project_root}")
    print()
    print(f" profile.id: {profile.id} (source: {profile.source})")
    if profile.warning:
        print(f" ⚠️ {profile.warning}")
    print(f" preset.id: {preset.id} (source: {preset.source})")
    if preset.overrides:
        print(f" overrides: {', '.join(preset.overrides)}")

    cost_knobs = preset.knobs.get("cost", {})
    if cost_knobs:
        daily = cost_knobs.get("daily_max_usd")
        weekly = cost_knobs.get("weekly_max_usd")
        monthly = cost_knobs.get("monthly_max_usd")
        print(f" cost caps: daily ${daily} · weekly ${weekly} · monthly ${monthly}")

    autonomy_knobs = preset.knobs.get("autonomy", {})
    if autonomy_knobs:
        print(f" autonomy: default={autonomy_knobs.get('default')}")
    return 0
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _find_rule(router: dict[str, Any], name: str) -> tuple[str, dict[str, Any]] | None:
|
|
124
|
+
if name in router.get("kernel", []):
|
|
125
|
+
return "kernel", {"id": name, "triggers": [{"always": True}]}
|
|
126
|
+
for tier in ("tier_1", "tier_2"):
|
|
127
|
+
for entry in router.get(tier, []):
|
|
128
|
+
if entry.get("id") == name:
|
|
129
|
+
return tier, entry
|
|
130
|
+
return None
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _explain_rule(project_root: Path, name: str, *, as_json: bool) -> int:
    """Show which tier declares ``name`` and list its triggers.

    Args:
        project_root: Project directory holding the router file.
        name: Rule identifier to explain.
        as_json: Emit a sorted, indented JSON document instead of text.

    Returns:
        ``0`` when the rule was found, ``1`` when it is not in the router.
    """
    located = _find_rule(_load_router(project_root), name)
    if located is None:
        print(f"❌ explain: rule {name!r} not found in router", file=sys.stderr)
        return 1

    tier, entry = located
    document = {"rule": name, "tier": tier, "entry": entry}
    if as_json:
        json.dump(document, sys.stdout, indent=2, sort_keys=True)
        sys.stdout.write("\n")
        return 0

    print(f" rule: {name}")
    print(f" tier: {tier}")
    trigger_list = entry.get("triggers") or []
    print(f" triggers ({len(trigger_list)}):")
    for trigger in trigger_list:
        print(f" · {trigger}")
    destinations = entry.get("routes_to") or []
    if destinations:
        print(f" routes_to: {', '.join(destinations)}")
    return 0
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _matches_trigger(trigger: dict[str, Any], text: str, lowered: str) -> str | None:
|
|
158
|
+
"""Return a human-readable match reason, or ``None`` for no match."""
|
|
159
|
+
if "keyword" in trigger:
|
|
160
|
+
kw = str(trigger["keyword"]).lower()
|
|
161
|
+
if kw and kw in lowered:
|
|
162
|
+
return f"keyword: {kw}"
|
|
163
|
+
if "phrase" in trigger:
|
|
164
|
+
ph = str(trigger["phrase"]).lower()
|
|
165
|
+
if ph and ph in lowered:
|
|
166
|
+
return f"phrase: {ph}"
|
|
167
|
+
if "path_prefix" in trigger:
|
|
168
|
+
prefix = str(trigger["path_prefix"])
|
|
169
|
+
if prefix and prefix in text:
|
|
170
|
+
return f"path_prefix: {prefix}"
|
|
171
|
+
return None
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _explain_route(project_root: Path, text: str, *, as_json: bool) -> int:
    """Show which rules would be active for the prompt ``text``.

    Every ``tier_1`` entry is tested against ``text``; an entry is reported
    once, with the reason of its first matching trigger. Kernel rules are
    listed unconditionally.

    Args:
        project_root: Project directory holding the router file.
        text: Prompt text to route.
        as_json: Emit a sorted, indented JSON document instead of text.

    Returns:
        ``0`` when at least one tier-1 rule matched, ``1`` when none did.
        The exit code is the same in JSON and text mode.
    """
    router = _load_router(project_root)
    lowered = text.lower()
    matches: list[dict[str, Any]] = []
    # ``or []`` tolerates an explicit JSON ``null`` for the tier list.
    for entry in router.get("tier_1") or []:
        for trig in entry.get("triggers") or []:
            reason = _matches_trigger(trig, text, lowered)
            if reason is not None:
                matches.append({
                    "id": entry["id"], "tier": "tier_1", "reason": reason,
                })
                break  # one reason per rule is enough
    payload = {
        "input": text,
        "kernel_always": list(router.get("kernel") or []),
        "tier_1_matches": matches,
    }
    # "No match" is exit 1 regardless of output format, so scripts using
    # --json can rely on the status the same way text-mode callers do.
    exit_code = 0 if matches else 1
    if as_json:
        json.dump(payload, sys.stdout, indent=2, sort_keys=True)
        sys.stdout.write("\n")
        return exit_code
    print(f" input: {text!r}")
    print()
    print(f" kernel (always active, {len(payload['kernel_always'])}):")
    for kid in payload["kernel_always"]:
        print(f" · {kid}")
    print()
    print(f" tier-1 matches ({len(matches)}):")
    if not matches:
        print(" · (no trigger matched — only kernel rules active)")
        return exit_code
    for match in matches:
        print(f" · {match['id']} ({match['reason']})")
    return exit_code
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def main(argv: list[str] | None = None) -> int:
    """Entry point for ``agent-config explain``.

    Args:
        argv: Argument vector without the program name; ``None`` lets
            argparse read ``sys.argv``.

    Returns:
        The exit code of the selected subject handler, or ``2`` when a
        ``rule`` / ``route`` subject is given without a target.
    """
    summary = (
        "Print the decision chain behind a configuration or routing "
        "outcome. Read-only; no network calls."
    )
    parser = argparse.ArgumentParser(prog="agent-config explain", description=summary)
    parser.add_argument(
        "subject",
        choices=("config", "rule", "route"),
        help="what to explain",
    )
    parser.add_argument(
        "target",
        nargs="?",
        default=None,
        help="rule name (for 'rule') or prompt text (for 'route')",
    )
    parser.add_argument(
        "--project",
        default=None,
        help="project root (defaults to anchor walk from cwd)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="as_json",
        help="emit JSON instead of human-readable text",
    )
    opts = parser.parse_args(argv)

    # The root is resolved before the target is validated, so a bad
    # project root is reported first.
    project_root, _origin = _resolve_root(opts.project)

    if opts.subject == "config":
        return _explain_config(project_root, as_json=opts.as_json)

    if opts.target is None:
        print(
            f"❌ explain: '{opts.subject}' requires a target argument",
            file=sys.stderr,
        )
        return 2

    handler = _explain_rule if opts.subject == "rule" else _explain_route
    return handler(project_root, opts.target, as_json=opts.as_json)


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# Cost capture for `scripts/bench_run.py` — step-4 Phase 2 Step 2.
|
|
2
|
+
#
|
|
3
|
+
# Reads Claude Code session jsonl summaries (one summary line per session)
|
|
4
|
+
# from agents/cost-tracking/sessions.jsonl — produced by scripts/cost/track.mjs
|
|
5
|
+
# — and aggregates totals using model rates from bench/pricing.yaml.
|
|
6
|
+
#
|
|
7
|
+
# Returns the dict shape declared in docs/contracts/benchmark-report-schema.md
|
|
8
|
+
# § JSON schema (v1) `cost`. When the source jsonl is missing, returns the
|
|
9
|
+
# `unavailable` sentinel block (NEVER silently drops, per schema invariant).
|
|
10
|
+
"""Cost capture helper for the bench runner."""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
import yaml
|
|
19
|
+
except ImportError: # pragma: no cover — bench_run handles the same import
|
|
20
|
+
yaml = None # type: ignore[assignment]
|
|
21
|
+
|
|
22
|
+
UNKNOWN_TIER = "unknown"
|
|
23
|
+
TIER_KEYS = ("haiku", "sonnet", "opus", UNKNOWN_TIER)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_pricing(pricing_path: Path) -> tuple[dict[str, dict[str, float]], str | None]:
    """Return ``({tier: rates}, oldest_sourced_on)`` from bench/pricing.yaml.

    Args:
        pricing_path: Location of the pricing YAML file.

    Returns:
        A pair of (a) a mapping from tier name to its ``input`` / ``output``
        / ``cache_write`` / ``cache_read`` rates as floats, and (b) the
        lexicographically smallest ``sourced_on`` value seen, as a string.
        ``({}, None)`` when PyYAML is unavailable or the file is missing.
    """
    if yaml is None or not pricing_path.is_file():
        return {}, None
    data = yaml.safe_load(pricing_path.read_text(encoding="utf-8")) or {}
    rates: dict[str, dict[str, float]] = {}
    oldest: str | None = None
    # Tolerate a non-mapping document and an explicit ``models: null``.
    rows = data.get("models") if isinstance(data, dict) else None
    for row in rows or []:
        if not isinstance(row, dict):
            continue
        tier = row.get("tier")
        if not tier:
            continue
        # A missing or null rate counts as 0.0 rather than raising.
        rates[tier] = {
            key: float(row.get(key) or 0.0)
            for key in ("input", "output", "cache_write", "cache_read")
        }
        sourced = row.get("sourced_on")
        # YAML 1.1 parses ISO dates to datetime.date; coerce to ISO string.
        if sourced is not None and not isinstance(sourced, str):
            sourced = sourced.isoformat() if hasattr(sourced, "isoformat") else str(sourced)
        if isinstance(sourced, str) and (oldest is None or sourced < oldest):
            oldest = sourced
    return rates, oldest
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _empty_totals() -> dict[str, int | float]:
|
|
53
|
+
return {
|
|
54
|
+
"input_tokens": 0,
|
|
55
|
+
"output_tokens": 0,
|
|
56
|
+
"cache_read_input_tokens": 0,
|
|
57
|
+
"cache_creation_input_tokens": 0,
|
|
58
|
+
"total_cost_usd": 0.0,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _empty_per_tier() -> dict[str, dict[str, int | float]]:
    """Return a fresh zeroed ``per_tier`` mapping, one slot per ``TIER_KEYS`` entry."""
    slots: dict[str, dict[str, int | float]] = {}
    for tier in TIER_KEYS:
        slots[tier] = {"messages": 0, "cost_usd": 0.0}
    return slots
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def unavailable_block(reason: str, source: str, pricing_sourced_on: str | None) -> dict[str, Any]:
    """Build the ``cost`` block used when no session jsonl could be read.

    Args:
        reason: Short machine-readable reason stored under ``reason``.
        source: Path that was looked at, stored under ``scanned_path``.
        pricing_sourced_on: Value reported by ``load_pricing``.

    Returns:
        A cost block with ``source`` set to ``"unavailable"`` and all
        totals and per-tier counters zeroed.
    """
    block: dict[str, Any] = {"source": "unavailable"}
    block["reason"] = reason
    block["scanned_path"] = source
    block["sessions_scanned"] = 0
    block["totals"] = _empty_totals()
    block["per_tier"] = _empty_per_tier()
    block["pricing_sourced_on"] = pricing_sourced_on
    return block
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def aggregate_sessions(
    sessions_jsonl: Path,
    pricing_path: Path,
) -> dict[str, Any]:
    """Read agents/cost-tracking/sessions.jsonl and aggregate per-tier totals.

    Each non-blank line is expected to be one JSON session summary whose
    ``byModel`` mapping holds per-model slots (token counters, ``messages``,
    ``cost_usd``, ``tier``). Lines that are not valid JSON, or that decode
    to something other than an object, are skipped and not counted.

    Args:
        sessions_jsonl: Path of the session summary file.
        pricing_path: Path of the pricing YAML passed to ``load_pricing``.

    Returns:
        The ``cost`` block: ``source``, ``sessions_scanned``, ``totals``,
        ``per_tier`` and ``pricing_sourced_on``. When ``sessions_jsonl`` is
        not a file, the block from ``unavailable_block`` is returned instead.
    """
    rates, pricing_sourced_on = load_pricing(pricing_path)
    if not sessions_jsonl.is_file():
        return unavailable_block(
            reason="sessions_jsonl_missing",
            source=str(sessions_jsonl),
            pricing_sourced_on=pricing_sourced_on,
        )

    totals = _empty_totals()
    per_tier = _empty_per_tier()
    sessions_scanned = 0
    token_fields = (
        "input_tokens",
        "output_tokens",
        "cache_read_input_tokens",
        "cache_creation_input_tokens",
    )

    for line in sessions_jsonl.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        try:
            summary = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(summary, dict):
            # Valid JSON but not a summary object (e.g. a bare number);
            # treat it like any other malformed line.
            continue
        sessions_scanned += 1
        by_model = summary.get("byModel")
        if not isinstance(by_model, dict):
            continue
        for slot in by_model.values():
            if not isinstance(slot, dict):
                continue
            tier = slot.get("tier", UNKNOWN_TIER)
            # Non-string tiers (possibly unhashable) fold into "unknown" too.
            if not isinstance(tier, str) or tier not in per_tier:
                tier = UNKNOWN_TIER
            # Parse each counter once; ``or 0`` tolerates explicit nulls.
            tokens = {field: int(slot.get(field) or 0) for field in token_fields}
            for field, count in tokens.items():
                totals[field] += count
            cost = float(slot.get("cost_usd") or 0.0)
            # Recompute from rates if upstream cost is zero AND we have rates;
            # otherwise trust the upstream attribution (it priced at capture time).
            if cost == 0.0 and tier in rates:
                r = rates[tier]
                cost = (
                    tokens["input_tokens"] / 1e6 * r["input"]
                    + tokens["output_tokens"] / 1e6 * r["output"]
                    + tokens["cache_creation_input_tokens"] / 1e6 * r["cache_write"]
                    + tokens["cache_read_input_tokens"] / 1e6 * r["cache_read"]
                )
            per_tier[tier]["messages"] += int(slot.get("messages") or 0)
            per_tier[tier]["cost_usd"] += cost
            totals["total_cost_usd"] += cost

    # Round currency to 6 decimals for stable diffs.
    totals["total_cost_usd"] = round(float(totals["total_cost_usd"]), 6)
    for t in per_tier.values():
        t["cost_usd"] = round(float(t["cost_usd"]), 6)

    return {
        "source": str(sessions_jsonl),
        "sessions_scanned": sessions_scanned,
        "totals": totals,
        "per_tier": per_tier,
        "pricing_sourced_on": pricing_sourced_on,
    }
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Quality probe for `scripts/bench_run.py` — step-4 Phase 2 Step 3.
|
|
2
|
+
#
|
|
3
|
+
# Each prompt declares `rubric.must_include` / `must_not_include` or a
|
|
4
|
+
# `quality_assertion` regex (per docs/contracts/benchmark-corpus-spec.md).
|
|
5
|
+
# When an agent-output file is passed via --agent-output, we score the
|
|
6
|
+
# assertions against actual output. Without it, we emit `not_collected`
|
|
7
|
+
# per docs/contracts/benchmark-report-schema.md § quality invariants.
|
|
8
|
+
"""Quality probe helper for the bench runner."""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import re
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _eval_rubric(rubric: dict[str, Any], output: str) -> tuple[bool, str]:
|
|
18
|
+
"""Apply rubric.must_include / must_not_include / length_words to output."""
|
|
19
|
+
for phrase in rubric.get("must_include") or []:
|
|
20
|
+
if phrase not in output:
|
|
21
|
+
return False, f"missing: {phrase!r}"
|
|
22
|
+
for phrase in rubric.get("must_not_include") or []:
|
|
23
|
+
if phrase in output:
|
|
24
|
+
return False, f"forbidden: {phrase!r}"
|
|
25
|
+
bounds = rubric.get("length_words") or {}
|
|
26
|
+
if bounds:
|
|
27
|
+
words = len(output.split())
|
|
28
|
+
lo, hi = bounds.get("min", 0), bounds.get("max", 0)
|
|
29
|
+
if lo and words < lo:
|
|
30
|
+
return False, f"length<{lo}: {words}"
|
|
31
|
+
if hi and words > hi:
|
|
32
|
+
return False, f"length>{hi}: {words}"
|
|
33
|
+
return True, "ok"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _eval_regex(pattern: str, output: str) -> tuple[bool, str]:
|
|
37
|
+
try:
|
|
38
|
+
compiled = re.compile(pattern, re.MULTILINE)
|
|
39
|
+
except re.error as exc:
|
|
40
|
+
return False, f"bad_regex: {exc}"
|
|
41
|
+
return (bool(compiled.search(output)), "ok" if compiled.search(output) else "no_match")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _format_rubric(rubric: dict[str, Any]) -> str:
|
|
45
|
+
parts = []
|
|
46
|
+
if rubric.get("must_include"):
|
|
47
|
+
parts.append(f"must_include={rubric['must_include']}")
|
|
48
|
+
if rubric.get("must_not_include"):
|
|
49
|
+
parts.append(f"must_not_include={rubric['must_not_include']}")
|
|
50
|
+
if rubric.get("length_words"):
|
|
51
|
+
parts.append(f"length_words={rubric['length_words']}")
|
|
52
|
+
return " ".join(parts) or "<empty>"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def score_corpus(
    prompts: list[dict[str, Any]],
    agent_output_path: Path | None,
) -> dict[str, Any]:
    """Return the `quality` block per benchmark-report-schema § quality.

    Only prompts that declare an assertion are considered: a truthy
    ``quality_assertion`` regex, or a rubric with a truthy ``must_include``,
    ``must_not_include`` or ``length_words``. A regex takes precedence over
    the rubric when both are present.

    Args:
        prompts: Corpus prompt records.
        agent_output_path: JSON file mapping prompt id to output text, or
            ``None``.

    Returns:
        When the output file is absent, a block with ``source`` set to
        ``"not_collected"`` and every prompt marked ``"not_collected"``.
        Otherwise the scored block, where a prompt missing from the output
        file is scored against the empty string.
    """
    declared: list[dict[str, Any]] = []
    for prompt in prompts:
        declared_rubric = prompt.get("rubric") or {}
        has_assertion = (
            declared_rubric.get("must_include")
            or declared_rubric.get("must_not_include")
            or declared_rubric.get("length_words")
            or prompt.get("quality_assertion")
        )
        if has_assertion:
            declared.append(prompt)
    total_declared = len(declared)

    if agent_output_path is None or not agent_output_path.is_file():
        placeholders: list[dict[str, Any]] = []
        for prompt in declared:
            pattern = prompt.get("quality_assertion")
            placeholders.append({
                "id": prompt["id"],
                "assertion": pattern or _format_rubric(prompt.get("rubric") or {}),
                "assertion_kind": "quality_assertion" if pattern else "rubric",
                "passed": "not_collected",
            })
        return {
            "source": "not_collected",
            "prompts_with_assertion": total_declared,
            "prompts_passing": 0,
            "quality_score": 0.0,
            "per_prompt": placeholders,
        }

    outputs = json.loads(agent_output_path.read_text(encoding="utf-8"))
    per_prompt: list[dict[str, Any]] = []
    for prompt in declared:
        pid = prompt["id"]
        output_text = str(outputs.get(pid, ""))
        rubric = prompt.get("rubric") or {}
        pattern = prompt.get("quality_assertion")
        if pattern:
            passed, _reason = _eval_regex(pattern, output_text)
            row = {"id": pid, "assertion": pattern, "assertion_kind": "quality_assertion"}
        else:
            passed, _reason = _eval_rubric(rubric, output_text)
            row = {"id": pid, "assertion": _format_rubric(rubric), "assertion_kind": "rubric"}
        row["passed"] = passed
        per_prompt.append(row)

    passing = sum(1 for row in per_prompt if row["passed"])
    if total_declared:
        score = round(passing / total_declared, 4)
    else:
        score = 0.0
    return {
        "source": str(agent_output_path),
        "prompts_with_assertion": total_declared,
        "prompts_passing": passing,
        "quality_score": score,
        "per_prompt": per_prompt,
    }
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# Report emitter for `scripts/bench_run.py` — step-4 Phase 2 Step 4.
|
|
2
|
+
#
|
|
3
|
+
# Serializes the unified report dict to JSON + Markdown per
|
|
4
|
+
# docs/contracts/benchmark-report-schema.md. Filename format:
|
|
5
|
+
# `bench/reports/<UTC ISO-8601 stamp with ':' replaced by '-'>-<corpus_id>.{json,md}`.
|
|
6
|
+
"""Report emitter for the bench runner."""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def utc_now_filename_stamp() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH-MM-SSZ``.

    The stamp sorts lexicographically and uses ``-`` instead of ``:`` in
    the time part so it stays portable as a filename.
    """
    moment = datetime.now(tz=timezone.utc)
    return f"{moment:%Y-%m-%dT%H-%M-%SZ}"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    moment = datetime.now(tz=timezone.utc)
    return f"{moment:%Y-%m-%dT%H:%M:%SZ}"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def report_paths(reports_dir: Path, corpus_id: str, stamp: str) -> tuple[Path, Path]:
    """Return the ``(json_path, markdown_path)`` pair for one report.

    Both files live in ``reports_dir`` and share the stem
    ``<stamp>-<corpus_id>``; only the extension differs.
    """
    stem = "{}-{}".format(stamp, corpus_id)
    json_path = reports_dir / (stem + ".json")
    markdown_path = reports_dir / (stem + ".md")
    return json_path, markdown_path
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def write_json(path: Path, report: dict[str, Any]) -> None:
    """Write ``report`` to ``path`` as 2-space-indented JSON plus a newline.

    Missing parent directories are created first.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(report, indent=2)
    path.write_text(f"{serialized}\n", encoding="utf-8")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _selection_section(selection: dict[str, Any]) -> str:
|
|
35
|
+
lines = [
|
|
36
|
+
"## Selection accuracy",
|
|
37
|
+
"",
|
|
38
|
+
f"- top-K = **{selection['top_k']}** · "
|
|
39
|
+
f"hit **{selection['prompts_hit']} / {selection['prompts_total']}** · "
|
|
40
|
+
f"accuracy **{selection['selection_accuracy']:.2%}** · "
|
|
41
|
+
f"target **{selection['target']:.2%}** · "
|
|
42
|
+
f"verdict **{'PASS' if selection['passed'] else 'FAIL'}**",
|
|
43
|
+
"",
|
|
44
|
+
"| id | hit | expected | top-K ranked |",
|
|
45
|
+
"|---|---|---|---|",
|
|
46
|
+
]
|
|
47
|
+
for r in selection.get("per_prompt", []):
|
|
48
|
+
mark = "✅" if r["hit"] else "❌"
|
|
49
|
+
expected = ", ".join(r.get("expected_skills") or []) or "—"
|
|
50
|
+
ranked = ", ".join(r.get("top_k_ranked") or []) or "—"
|
|
51
|
+
lines.append(f"| `{r['id']}` | {mark} | {expected} | {ranked} |")
|
|
52
|
+
return "\n".join(lines)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _cost_section(cost: dict[str, Any]) -> str:
|
|
56
|
+
if cost.get("source") == "unavailable":
|
|
57
|
+
return (
|
|
58
|
+
"## Cost capture\n\n"
|
|
59
|
+
f"- **source:** `unavailable` ({cost.get('reason', 'unknown')})\n"
|
|
60
|
+
f"- **scanned:** `{cost.get('scanned_path', '—')}`\n"
|
|
61
|
+
f"- **pricing sourced on:** {cost.get('pricing_sourced_on') or '—'}\n\n"
|
|
62
|
+
"_No session jsonl available. Run `node scripts/cost/track.mjs` "
|
|
63
|
+
"from a real Claude Code session to populate agents/cost-tracking/sessions.jsonl._\n"
|
|
64
|
+
)
|
|
65
|
+
totals = cost["totals"]
|
|
66
|
+
lines = [
|
|
67
|
+
"## Cost capture",
|
|
68
|
+
"",
|
|
69
|
+
f"- **source:** `{cost['source']}` · sessions scanned: **{cost['sessions_scanned']}**",
|
|
70
|
+
f"- **pricing sourced on:** {cost.get('pricing_sourced_on') or '—'}",
|
|
71
|
+
f"- **total cost:** **${totals['total_cost_usd']:.6f}**",
|
|
72
|
+
"",
|
|
73
|
+
"| tier | messages | cost (USD) |",
|
|
74
|
+
"|---|---:|---:|",
|
|
75
|
+
]
|
|
76
|
+
for tier, slot in cost["per_tier"].items():
|
|
77
|
+
if slot["messages"] == 0 and slot["cost_usd"] == 0.0:
|
|
78
|
+
continue
|
|
79
|
+
lines.append(f"| {tier} | {slot['messages']} | ${slot['cost_usd']:.6f} |")
|
|
80
|
+
lines += [
|
|
81
|
+
"",
|
|
82
|
+
"| metric | value |",
|
|
83
|
+
"|---|---:|",
|
|
84
|
+
f"| input_tokens | {totals['input_tokens']} |",
|
|
85
|
+
f"| output_tokens | {totals['output_tokens']} |",
|
|
86
|
+
f"| cache_read_input_tokens | {totals['cache_read_input_tokens']} |",
|
|
87
|
+
f"| cache_creation_input_tokens | {totals['cache_creation_input_tokens']} |",
|
|
88
|
+
]
|
|
89
|
+
return "\n".join(lines)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _quality_section(quality: dict[str, Any]) -> str:
|
|
93
|
+
if quality["source"] == "not_collected":
|
|
94
|
+
return (
|
|
95
|
+
"## Quality probe\n\n"
|
|
96
|
+
f"- **source:** `not_collected` · assertions declared: "
|
|
97
|
+
f"**{quality['prompts_with_assertion']}**\n"
|
|
98
|
+
"- _Pass `--agent-output <path-to-outputs.json>` (map of `id -> str`) "
|
|
99
|
+
"to score the rubrics. Schema invariant: missing output keeps "
|
|
100
|
+
"`verdict.overall` at `partial`._\n"
|
|
101
|
+
)
|
|
102
|
+
lines = [
|
|
103
|
+
"## Quality probe",
|
|
104
|
+
"",
|
|
105
|
+
f"- **source:** `{quality['source']}` · "
|
|
106
|
+
f"passing **{quality['prompts_passing']} / {quality['prompts_with_assertion']}** · "
|
|
107
|
+
f"score **{quality['quality_score']:.2%}**",
|
|
108
|
+
"",
|
|
109
|
+
"| id | kind | passed | assertion |",
|
|
110
|
+
"|---|---|---|---|",
|
|
111
|
+
]
|
|
112
|
+
for r in quality.get("per_prompt", []):
|
|
113
|
+
mark = "✅" if r["passed"] is True else ("❌" if r["passed"] is False else "—")
|
|
114
|
+
lines.append(f"| `{r['id']}` | {r['assertion_kind']} | {mark} | `{r['assertion']}` |")
|
|
115
|
+
return "\n".join(lines)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def render_markdown(report: dict[str, Any]) -> str:
    """Render the full benchmark report as a Markdown document.

    The document consists of a headline block, the selection, cost and
    quality sections, and a notes block, separated by blank lines and
    terminated by a newline.

    Args:
        report: Report dict with ``corpus``, ``selection``, ``cost``,
            ``quality``, ``verdict``, ``generated_at`` and ``runner`` keys.
    """
    corpus = report["corpus"]
    sel = report["selection"]
    cost = report["cost"]
    qual = report["quality"]
    verdict = report["verdict"]

    headline_lines = [
        f"# Benchmark Report — `{corpus['id']}` · {report['generated_at']}",
        "",
        "## Headline",
        "",
        f"- **selection** {sel['selection_accuracy']:.2%} (target {sel['target']:.2%}) → **{verdict['selection']}**",
    ]
    total_cost = f"${cost['totals']['total_cost_usd']:.6f}"
    # The parenthesised note is the session count, or the literal source
    # marker when no cost data was captured.
    if cost["source"] != "unavailable":
        cost_note = "sessions=" + str(cost["sessions_scanned"])
    else:
        cost_note = cost["source"]
    headline_lines += [
        f"- **cost** {total_cost} ({cost_note})",
        f"- **quality** {qual['quality_score']:.2%} → **{verdict['quality']}**",
        f"- **overall** → **{verdict['overall']}**",
        "",
    ]
    headline = "\n".join(headline_lines)

    notes = "\n".join([
        "## Notes",
        "",
        f"- corpus path: `{corpus['path']}` · prompts: **{corpus['prompt_count']}**",
        "- pricing: `bench/pricing.yaml`",
        f"- baseline collector: `{report['runner']['baseline_collector']}`",
        "",
    ])

    sections = [
        headline,
        _selection_section(sel),
        _cost_section(cost),
        _quality_section(qual),
        notes,
    ]
    return "\n\n".join(sections) + "\n"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def write_markdown(path: Path, report: dict[str, Any]) -> None:
    """Render ``report`` as Markdown and write it to ``path`` (UTF-8).

    Missing parent directories are created before the report is rendered.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    document = render_markdown(report)
    path.write_text(document, encoding="utf-8")
|
package/scripts/agent-config
CHANGED
|
@@ -97,6 +97,10 @@ Tier 1 — power-user (release shape, audit, migration):
|
|
|
97
97
|
Lists missing, modified, and foreign files.
|
|
98
98
|
Exits 1 on drift, 2 on missing lockfile.
|
|
99
99
|
Flags: --json | --project=<path>
|
|
100
|
+
explain Read-only decision-chain trace.
|
|
101
|
+
Usage: explain config | explain rule <name>
|
|
102
|
+
| explain route "<text>"
|
|
103
|
+
Flags: --json | --project=<path>
|
|
100
104
|
migrate One-shot migration off legacy composer / npm install paths
|
|
101
105
|
Flags: --dry-run (detect only)
|
|
102
106
|
first-run Guided first-run setup — cost profile, settings, tooling
|
|
@@ -749,6 +753,14 @@ cmd_versions() {
|
|
|
749
753
|
exec env PYTHONPATH="$PACKAGE_ROOT" python3 -m scripts._cli.cmd_versions "$@"
|
|
750
754
|
}
|
|
751
755
|
|
|
756
|
+
# `agent-config explain <config|rule|route>` — print the decision chain
|
|
757
|
+
# behind a configuration or routing outcome. Read-only diagnostic; never
|
|
758
|
+
# edits state. See scripts/_cli/cmd_explain.py.
|
|
759
|
+
cmd_explain() {
  # Hand off to the Python implementation of `agent-config explain`.
  # PYTHONPATH is pointed at the package root so `scripts._cli.cmd_explain`
  # resolves; `exec` replaces this shell, so the Python exit code becomes
  # the exit code of the command.
  require_python3
  local -a launcher=(env "PYTHONPATH=$PACKAGE_ROOT" python3 -m scripts._cli.cmd_explain)
  exec "${launcher[@]}" "$@"
}
|
|
763
|
+
|
|
752
764
|
main() {
|
|
753
765
|
local cmd="${1-}"
|
|
754
766
|
[[ $# -gt 0 ]] && shift || true
|
|
@@ -801,6 +813,7 @@ main() {
|
|
|
801
813
|
prune) cmd_prune "$@" ;;
|
|
802
814
|
doctor) cmd_doctor "$@" ;;
|
|
803
815
|
versions) cmd_versions "$@" ;;
|
|
816
|
+
explain) cmd_explain "$@" ;;
|
|
804
817
|
help|--help|-h|"")
|
|
805
818
|
# Optional `--tier=0|1|all` filter (default 0).
|
|
806
819
|
local tier_arg="0"
|