@event4u/agent-config 2.18.0 → 2.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/agent-status.md +29 -0
- package/.agent-src/commands/onboard.md +221 -81
- package/.agent-src/commands/refine-ticket.md +3 -0
- package/.agent-src/packs/README.md +49 -0
- package/.agent-src/packs/agency-delivery.yml +63 -0
- package/.agent-src/packs/content-engine.yml +53 -0
- package/.agent-src/packs/founder-mvp.yml +51 -0
- package/.agent-src/personas/README.md +8 -0
- package/.agent-src/presets/README.md +26 -0
- package/.agent-src/presets/balanced.yml +34 -0
- package/.agent-src/presets/fast.yml +31 -0
- package/.agent-src/presets/strict.yml +38 -0
- package/.agent-src/profiles/README.md +29 -0
- package/.agent-src/profiles/agency.yml +27 -0
- package/.agent-src/profiles/content_creator.yml +25 -0
- package/.agent-src/profiles/developer.yml +26 -0
- package/.agent-src/profiles/finance.yml +24 -0
- package/.agent-src/profiles/founder.yml +25 -0
- package/.agent-src/profiles/ops.yml +25 -0
- package/.agent-src/rules/no-cheap-questions.md +25 -17
- package/.agent-src/skills/adr-create/SKILL.md +78 -68
- package/.agent-src/skills/refine-ticket/SKILL.md +3 -0
- package/.agent-src/skills/subagent-orchestration/SKILL.md +33 -0
- package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
- package/.agent-src/templates/skill-archive-note.md +101 -0
- package/.agent-src/user-types/README.md +124 -0
- package/.agent-src/user-types/_template/user-type.md +95 -0
- package/.agent-src/user-types/galabau-field-crew.md +100 -0
- package/.agent-src/user-types/metalworking-shop.md +105 -0
- package/.agent-src/user-types/truck-driver.md +113 -0
- package/.claude-plugin/marketplace.json +1 -1
- package/CHANGELOG.md +91 -30
- package/README.md +68 -72
- package/config/agent-settings.template.yml +22 -0
- package/docs/adrs/caveman/0001-default-off-until-bench.md +93 -0
- package/docs/adrs/caveman/README.md +9 -0
- package/docs/adrs/cost/0001-hard-stop-hook.md +114 -0
- package/docs/adrs/cost/README.md +9 -0
- package/docs/adrs/memory/0001-consumer-side-snapshot.md +111 -0
- package/docs/adrs/memory/README.md +9 -0
- package/docs/adrs/router/0001-three-tier-routing.md +119 -0
- package/docs/adrs/router/README.md +9 -0
- package/docs/adrs/schema/0001-json-schema-frontmatter.md +102 -0
- package/docs/adrs/schema/README.md +9 -0
- package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +99 -0
- package/docs/adrs/smoke/README.md +9 -0
- package/docs/architecture/current-onboard-baseline.md +126 -0
- package/docs/architecture/current-safety-behavior.md +137 -0
- package/docs/archive/CHANGELOG-pre-2.16.0.md +48 -0
- package/docs/contracts/adr-layout.md +108 -0
- package/docs/contracts/adr-mcp-runtime.md +128 -0
- package/docs/contracts/adr-user-types-axis.md +127 -0
- package/docs/contracts/benchmark-corpus-spec.md +97 -0
- package/docs/contracts/benchmark-report-schema.md +111 -0
- package/docs/contracts/command-clusters.md +1 -0
- package/docs/contracts/command-taxonomy.md +137 -0
- package/docs/contracts/compression-default-kill-criterion.md +69 -0
- package/docs/contracts/config-presets.md +144 -0
- package/docs/contracts/cost-dashboard.md +143 -0
- package/docs/contracts/cost-enforcement.md +134 -0
- package/docs/contracts/file-ownership-matrix.json +0 -7
- package/docs/contracts/mcp-tool-inventory.md +53 -0
- package/docs/contracts/measurement-baseline.md +102 -0
- package/docs/contracts/namespace.md +125 -0
- package/docs/contracts/profile-system.md +142 -0
- package/docs/contracts/safety-model.md +129 -0
- package/docs/contracts/smoke-contracts.md +144 -0
- package/docs/contracts/user-type-schema.md +146 -0
- package/docs/contracts/workflow-packs.md +121 -0
- package/docs/decisions/ADR-010-profile-pack-preset-boundary.md +132 -0
- package/docs/decisions/INDEX.md +1 -0
- package/docs/featured-commands.md +27 -0
- package/docs/parity/bench-ruflo.json +58 -0
- package/docs/parity/bench.json +41 -0
- package/docs/parity/ruflo.md +46 -0
- package/docs/profiles.md +91 -0
- package/docs/recruits/_template.md +81 -0
- package/package.json +1 -1
- package/scripts/_cli/cmd_explain.py +250 -0
- package/scripts/_lib/bench_cost.py +138 -0
- package/scripts/_lib/bench_quality.py +118 -0
- package/scripts/_lib/bench_report.py +150 -0
- package/scripts/agent-config +13 -0
- package/scripts/audit_adr_coverage.py +175 -0
- package/scripts/audit_mcp_tools.py +146 -0
- package/scripts/bench_baseline_ready.py +108 -0
- package/scripts/bench_drift_check.py +151 -0
- package/scripts/bench_per_tool.py +216 -0
- package/scripts/bench_run.py +155 -0
- package/scripts/compress.py +48 -2
- package/scripts/config/__init__.py +9 -0
- package/scripts/config/presets.py +206 -0
- package/scripts/config/profiles.py +173 -0
- package/scripts/cost/budget.mjs +73 -12
- package/scripts/cost/preflight.mjs +89 -0
- package/scripts/lint_archived_skills.py +143 -0
- package/scripts/lint_bench_corpus.py +161 -0
- package/scripts/lint_namespace.py +135 -0
- package/scripts/schemas/user-type.schema.json +35 -0
- package/scripts/skill_linter.py +139 -4
- package/scripts/skill_overlap.py +204 -0
- package/scripts/skill_tools/audit_user_type_coverage.py +148 -0
- package/scripts/skill_usage_collect.py +191 -0
- package/scripts/skill_usage_report.py +162 -0
- package/scripts/smoke/kernel.sh +101 -0
- package/scripts/smoke/router.sh +129 -0
- package/scripts/smoke/schema.sh +71 -0
- package/scripts/smoke/skills.sh +101 -0
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Per-tool projection-fidelity bench — step-4 Phase 4.
|
|
3
|
+
|
|
4
|
+
Re-runs the keyword-overlap selection scorer against each projected
|
|
5
|
+
tool surface and computes:
|
|
6
|
+
|
|
7
|
+
fidelity(tool) = selection_accuracy(tool) / selection_accuracy(reference)
|
|
8
|
+
|
|
9
|
+
Reference = Augment projection (most complete per roadmap). Threshold
|
|
10
|
+
for "fit for purpose" is >= 0.85.
|
|
11
|
+
|
|
12
|
+
Surfaces:
|
|
13
|
+
- `.augment/skills/` skill projection automated (reference)
|
|
14
|
+
- `.claude/skills/` skill projection automated
|
|
15
|
+
- `.cursor/rules/` rules-only not_applicable (no skill projection)
|
|
16
|
+
- `.windsurfrules` single concatenated not_applicable
|
|
17
|
+
- `.clinerules/` rules-only not_applicable
|
|
18
|
+
|
|
19
|
+
Usage:
|
|
20
|
+
python3 scripts/bench_per_tool.py --corpus dev
|
|
21
|
+
python3 scripts/bench_per_tool.py --corpus dev --json
|
|
22
|
+
python3 scripts/bench_per_tool.py --corpus dev --threshold 0.85
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import datetime as dt
|
|
29
|
+
import json
|
|
30
|
+
import re
|
|
31
|
+
import sys
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
|
|
34
|
+
try:
|
|
35
|
+
import yaml
|
|
36
|
+
except ImportError:
|
|
37
|
+
sys.stderr.write("error: PyYAML required (pip install pyyaml)\n")
|
|
38
|
+
sys.exit(2)
|
|
39
|
+
|
|
40
|
+
# Reuse tokenization + ranking from the reference runner so the only
|
|
41
|
+
# axis that changes between tools is the skill catalogue on disk.
|
|
42
|
+
from bench_runner import rank_skills # type: ignore # noqa: E402
|
|
43
|
+
|
|
44
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
45
|
+
CORPUS_DIR = REPO_ROOT / "tests" / "eval"
|
|
46
|
+
REPORTS_DIR = REPO_ROOT / "bench" / "reports"
|
|
47
|
+
|
|
48
|
+
# tool_id -> (skills_root, kind). kind = "skills" | "rules_only" | "single_file".
|
|
49
|
+
SURFACES: dict[str, tuple[Path, str]] = {
|
|
50
|
+
"augment": (REPO_ROOT / ".augment" / "skills", "skills"),
|
|
51
|
+
"claude": (REPO_ROOT / ".claude" / "skills", "skills"),
|
|
52
|
+
"cursor": (REPO_ROOT / ".cursor" / "rules", "rules_only"),
|
|
53
|
+
"cline": (REPO_ROOT / ".clinerules", "rules_only"),
|
|
54
|
+
"windsurf":(REPO_ROOT / ".windsurfrules", "single_file"),
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
REFERENCE_TOOL = "augment"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def load_descriptions(root: Path) -> dict[str, str]:
    """Collect the scorer text for every skill projected under ``root``.

    Each immediate child of ``root`` that holds a ``SKILL.md`` starting with
    a ``---`` delimited YAML frontmatter block contributes one entry.

    Args:
        root: Directory expected to contain one sub-directory per skill.

    Returns:
        ``{skill_name: "<name> <description>"}``. ``name`` falls back to the
        directory name when the frontmatter has none. A skill is skipped when
        ``SKILL.md`` is missing, has no frontmatter, the frontmatter is not
        valid YAML or is not a mapping, or the description is empty. An
        empty dict is returned when ``root`` is not a directory.
    """
    out: dict[str, str] = {}
    if not root.is_dir():
        return out
    # sorted() keeps the catalogue order stable across file systems.
    for skill_dir in sorted(root.iterdir()):
        skill_md = skill_dir / "SKILL.md"
        if not skill_md.is_file():
            continue
        text = skill_md.read_text(encoding="utf-8")
        # Without re.MULTILINE "^" only matches at the very start of the
        # file, so a "---" rule further down is never taken for frontmatter.
        m = re.search(r"^---\s*\n(.*?)\n---\s*\n", text, re.DOTALL)
        if not m:
            continue
        try:
            fm = yaml.safe_load(m.group(1)) or {}
        except yaml.YAMLError:
            continue
        if not isinstance(fm, dict):
            # Frontmatter that parses to a list or scalar has no keys to
            # read; skip it instead of failing on ``fm.get``.
            continue
        desc = fm.get("description") or ""
        name = fm.get("name") or skill_dir.name
        if desc:
            out[name] = f"{name} {desc}"
    return out
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def score_corpus(skills: dict[str, str], prompts: list[dict], top_k: int) -> dict:
    """Score every corpus prompt against one skill catalogue.

    A prompt counts as a hit when at least one of its ``expected_skills``
    appears in the ranking returned by ``rank_skills`` for that prompt.

    Args:
        skills: Catalogue passed through to ``rank_skills``.
        prompts: Corpus entries; each needs ``id`` and ``prompt`` and may
            carry ``expected_skills``.
        top_k: Ranking depth passed through to ``rank_skills``.

    Returns:
        A dict with ``prompts_total``, ``prompts_hit``,
        ``selection_accuracy`` (rounded to 4 places, ``0.0`` for an empty
        corpus), ``skill_count`` and the ``per_prompt`` detail rows.
    """
    details = []
    for entry in prompts:
        top = rank_skills(entry["prompt"], skills, top_k)
        wanted = set(entry.get("expected_skills", []))
        matched = not wanted.isdisjoint(top)
        details.append(
            {
                "id": entry["id"],
                "expected": sorted(wanted),
                "ranked": top,
                "hit": matched,
            }
        )

    total = len(prompts)
    hit_count = sum(1 for row in details if row["hit"])
    accuracy = round(hit_count / total, 4) if total else 0.0
    return {
        "prompts_total": total,
        "prompts_hit": hit_count,
        "selection_accuracy": accuracy,
        "skill_count": len(skills),
        "per_prompt": details,
    }
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def evaluate(corpus_path: Path, top_k: int, threshold: float) -> dict:
    """Score every surface in ``SURFACES`` and compute fidelity vs. the reference.

    Surfaces whose kind is not ``"skills"`` are reported as
    ``not_applicable``; skill surfaces with an empty catalogue are reported
    as ``error``. Every remaining surface is scored with ``score_corpus``
    and gets ``fidelity`` (its accuracy divided by the reference accuracy,
    or ``0.0`` when the reference accuracy is zero or unavailable) and
    ``passed_threshold``.

    Args:
        corpus_path: YAML corpus providing ``prompts`` and ``corpus_id``.
        top_k: Ranking depth forwarded to ``score_corpus``.
        threshold: Minimum fidelity for a surface to pass.

    Returns:
        The ``projection-fidelity-v1`` summary dict, including the per-tool
        results and the list of non-reference tools below the threshold.
    """
    corpus = yaml.safe_load(corpus_path.read_text(encoding="utf-8"))
    prompts = corpus["prompts"]

    per_tool: dict[str, dict] = {}
    for tool, (root, kind) in SURFACES.items():
        rel_path = str(root.relative_to(REPO_ROOT))
        if kind != "skills":
            per_tool[tool] = {
                "status": "not_applicable",
                "reason": f"surface is {kind}; no SKILL.md projection",
                "path": rel_path,
            }
            continue

        catalogue = load_descriptions(root)
        if not catalogue:
            per_tool[tool] = {
                "status": "error",
                "reason": "no skills found",
                "path": rel_path,
            }
            continue

        entry = score_corpus(catalogue, prompts, top_k)
        entry["status"] = "ok"
        entry["path"] = rel_path
        per_tool[tool] = entry

    # The reference only contributes an accuracy when it was actually scored.
    reference = per_tool.get(REFERENCE_TOOL, {})
    if reference.get("status") == "ok":
        ref_acc = reference.get("selection_accuracy", 0.0)
    else:
        ref_acc = 0.0

    failing = []
    for tool, entry in per_tool.items():
        if entry.get("status") != "ok":
            continue
        ratio = (entry["selection_accuracy"] / ref_acc) if ref_acc else 0.0
        entry["fidelity"] = round(ratio, 4)
        # The pass decision uses the unrounded ratio.
        entry["passed_threshold"] = ratio >= threshold
        if not entry["passed_threshold"] and tool != REFERENCE_TOOL:
            failing.append(tool)

    return {
        "schema": "projection-fidelity-v1",
        "generated_at": dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "corpus_id": corpus["corpus_id"],
        "top_k": top_k,
        "threshold": threshold,
        "reference_tool": REFERENCE_TOOL,
        "reference_accuracy": ref_acc,
        "tools": per_tool,
        "below_threshold": failing,
    }
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def render_markdown(summary: dict) -> str:
    """Render a projection-fidelity summary as a Markdown report.

    Args:
        summary: Dict with ``corpus_id``, ``generated_at``, ``top_k``,
            ``threshold``, ``reference_tool``, ``tools`` and
            ``below_threshold``.

    Returns:
        The report text: title, metadata line, one table row per tool and a
        closing verdict line, terminated by a newline.
    """
    title = f"# Projection fidelity — {summary['corpus_id']}"
    meta = (
        f"_Generated {summary['generated_at']} · top-K={summary['top_k']} · "
        f"threshold={summary['threshold']:.2f} · reference=`{summary['reference_tool']}`_"
    )
    out = [
        title,
        "",
        meta,
        "",
        "| tool | status | skills | accuracy | fidelity | pass |",
        "|---|---|---:|---:|---:|---|",
    ]

    for tool, entry in summary["tools"].items():
        state = entry.get("status", "?")
        if state == "ok":
            cells = [
                f"`{tool}`",
                "ok",
                f"{entry['skill_count']}",
                f"{entry['selection_accuracy']:.2%}",
                f"{entry['fidelity']:.2f}",
                "✅" if entry["passed_threshold"] else "❌",
            ]
            out.append("| " + " | ".join(cells) + " |")
        else:
            # Tools that were not scored have no metrics to show.
            out.append(f"| `{tool}` | {state} | — | — | — | — |")

    out.append("")
    if summary["below_threshold"]:
        out.append(
            f"**Below threshold:** {', '.join(summary['below_threshold'])} "
            f"→ inspect `scripts/_lib/generate_tools.py` projection mapping."
        )
    else:
        out.append("**All projections fit-for-purpose** (≥ threshold).")
    return "\n".join(out) + "\n"
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def main(argv=None) -> int:
    """Run the projection-fidelity bench from the command line.

    Args:
        argv: Argument list for ``argparse``; ``None`` lets ``argparse``
            read ``sys.argv``.

    Returns:
        ``2`` when the corpus file does not exist, ``1`` when at least one
        scored tool is below the fidelity threshold, ``0`` otherwise.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--corpus", default="dev")
    ap.add_argument("--top-k", type=int, default=3)
    ap.add_argument("--threshold", type=float, default=0.85)
    ap.add_argument("--json", action="store_true")
    ap.add_argument("--write-report", action="store_true",
                    help="emit bench/reports/<ts>-<corpus>-projection.{json,md}")
    args = ap.parse_args(argv)

    corpus_path = CORPUS_DIR / f"corpus-{args.corpus}.yaml"
    if not corpus_path.is_file():
        sys.stderr.write(f"error: corpus not found: {corpus_path}\n")
        return 2

    summary = evaluate(corpus_path, args.top_k, args.threshold)

    if args.write_report:
        REPORTS_DIR.mkdir(parents=True, exist_ok=True)
        # ":" is not allowed in Windows file names, so it is replaced.
        stamp = summary["generated_at"].replace(":", "-")
        base = REPORTS_DIR / f"{stamp}-{args.corpus}-projection"
        # Append the extension rather than using Path.with_suffix(): with a
        # corpus id such as "dev.v2", with_suffix() would treat
        # ".v2-projection" as the suffix and cut it off the file name.
        json_path = base.parent / f"{base.name}.json"
        md_path = base.parent / f"{base.name}.md"
        # The Markdown report contains non-ASCII characters; an explicit
        # encoding keeps the write independent of the platform default.
        json_path.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8")
        md_path.write_text(render_markdown(summary), encoding="utf-8")
        sys.stderr.write(f"wrote {base}.json + {base}.md\n")

    if args.json:
        print(json.dumps(summary, indent=2))
    else:
        print(render_markdown(summary))

    return 1 if summary["below_threshold"] else 0
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
if __name__ == "__main__":
|
|
215
|
+
sys.exit(main())
|
|
216
|
+
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Bench orchestrator — step-4 measurement-and-benchmark Phase 2.
|
|
3
|
+
|
|
4
|
+
Wraps the selection-accuracy baseline collector (`scripts/bench_runner.py`),
|
|
5
|
+
captures token / cost data from `agents/cost-tracking/sessions.jsonl` if
|
|
6
|
+
present (per ruflo pattern, external-findings § 2), runs structural
|
|
7
|
+
quality assertions per prompt, and emits a versioned JSON + Markdown
|
|
8
|
+
report under `bench/reports/` per
|
|
9
|
+
`docs/contracts/benchmark-report-schema.md`.
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python3 scripts/bench_run.py --corpus dev
|
|
13
|
+
python3 scripts/bench_run.py --corpus dev --top-k 3 --quiet
|
|
14
|
+
python3 scripts/bench_run.py --corpus dev --agent-output outputs.json
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import sys
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
23
|
+
sys.path.insert(0, str(REPO_ROOT / "scripts"))
|
|
24
|
+
|
|
25
|
+
from _lib import script_output # type: ignore[import-not-found] # noqa: E402
|
|
26
|
+
from _lib.bench_cost import aggregate_sessions # noqa: E402
|
|
27
|
+
from _lib.bench_quality import score_corpus # noqa: E402
|
|
28
|
+
from _lib.bench_report import ( # noqa: E402
|
|
29
|
+
report_paths,
|
|
30
|
+
render_markdown,
|
|
31
|
+
utc_now_filename_stamp,
|
|
32
|
+
utc_now_iso,
|
|
33
|
+
write_json,
|
|
34
|
+
write_markdown,
|
|
35
|
+
)
|
|
36
|
+
from bench_runner import run_corpus # noqa: E402
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
import yaml
|
|
40
|
+
except ImportError:
|
|
41
|
+
script_output.error("error: PyYAML required (pip install pyyaml)")
|
|
42
|
+
sys.exit(2)
|
|
43
|
+
|
|
44
|
+
BENCH_RUN_VERSION = "0.1.0"
|
|
45
|
+
PRICING_PATH = REPO_ROOT / "bench" / "pricing.yaml"
|
|
46
|
+
SESSIONS_JSONL = REPO_ROOT / "agents" / "cost-tracking" / "sessions.jsonl"
|
|
47
|
+
REPORTS_DIR = REPO_ROOT / "bench" / "reports"
|
|
48
|
+
CORPUS_DIR = REPO_ROOT / "tests" / "eval"
|
|
49
|
+
BASELINE_COLLECTOR = REPO_ROOT / "scripts" / "bench_runner.py"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _baseline_sha_or_mtime() -> str:
    """Return a cheap fingerprint of the baseline collector script.

    Despite the name, only the mtime form is produced here.

    Returns:
        ``"mtime:<whole seconds>"`` taken from ``BASELINE_COLLECTOR``'s
        ``st_mtime``, or ``"unavailable"`` when the file cannot be stat'ed.
    """
    try:
        stat_result = BASELINE_COLLECTOR.stat()
    except OSError:
        return "unavailable"
    return f"mtime:{int(stat_result.st_mtime)}"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _verdict(selection: dict, quality: dict) -> dict[str, str]:
|
|
60
|
+
sel = "pass" if selection["passed"] else "fail"
|
|
61
|
+
if quality["source"] == "not_collected":
|
|
62
|
+
qual = "not_collected"
|
|
63
|
+
overall = "partial"
|
|
64
|
+
else:
|
|
65
|
+
qual = "pass" if quality["quality_score"] >= 0.60 else "fail"
|
|
66
|
+
overall = "pass" if (sel == "pass" and qual == "pass") else "fail"
|
|
67
|
+
return {"selection": sel, "quality": qual, "overall": overall}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def build_report(
    corpus_path: Path,
    top_k: int,
    agent_output: Path | None,
) -> dict:
    """Assemble the full bench report for one corpus.

    Runs the selection baseline (``run_corpus``), aggregates cost data
    (``aggregate_sessions``), scores quality (``score_corpus``) and derives
    the verdict from the selection and quality results.

    Args:
        corpus_path: Corpus YAML file; must live under ``REPO_ROOT`` because
            its path is reported relative to it.
        top_k: Ranking depth forwarded to ``run_corpus``.
        agent_output: Optional path forwarded to the quality scorer.

    Returns:
        The report dict (``schema_version`` 1) with ``corpus``, ``runner``,
        ``selection``, ``cost``, ``quality`` and ``verdict`` sections.
    """
    selection = run_corpus(corpus_path, top_k)
    corpus_doc = yaml.safe_load(corpus_path.read_text(encoding="utf-8"))
    prompts = corpus_doc.get("prompts", [])
    cost = aggregate_sessions(SESSIONS_JSONL, PRICING_PATH)
    quality = score_corpus(prompts, agent_output)
    verdict = _verdict(selection, quality)

    generated_at = utc_now_iso()
    corpus_section = {
        "id": selection["corpus_id"],
        "path": str(corpus_path.relative_to(REPO_ROOT)),
        "prompt_count": len(prompts),
    }
    runner_section = {
        "bench_run_version": BENCH_RUN_VERSION,
        "baseline_collector": str(BASELINE_COLLECTOR.relative_to(REPO_ROOT)),
        "baseline_collector_sha": _baseline_sha_or_mtime(),
    }
    return {
        "schema_version": 1,
        "generated_at": generated_at,
        "corpus": corpus_section,
        "runner": runner_section,
        "selection": selection,
        "cost": cost,
        "quality": quality,
        "verdict": verdict,
    }
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the bench orchestrator from the command line.

    Args:
        argv: Argument list for ``argparse``; ``None`` lets ``argparse``
            read ``sys.argv``.

    Returns:
        ``2`` when the corpus file does not exist, ``0`` when the overall
        verdict is ``pass`` or ``partial``, ``1`` otherwise.
    """
    # ``__doc__`` is None when Python runs with -OO (docstrings stripped);
    # fall back to no description instead of failing on ``None.splitlines``.
    doc_lines = (__doc__ or "").splitlines()
    ap = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    ap.add_argument("--corpus", default="dev", help="corpus id (default: dev)")
    ap.add_argument("--top-k", type=int, default=3)
    ap.add_argument("--agent-output", type=Path, default=None,
                    help="Path to JSON {id: output_text} for quality scoring (Phase 3)")
    ap.add_argument("--quiet", action="store_true",
                    help="Print only the report path + headline")
    ap.add_argument("--stamp", default=None,
                    help="Override timestamp (test hook); defaults to UTC now")
    ap.add_argument("--no-write", action="store_true",
                    help="Compute the report but do not write files (dry run)")
    args = ap.parse_args(argv)

    corpus_path = CORPUS_DIR / f"corpus-{args.corpus}.yaml"
    if not corpus_path.is_file():
        script_output.error(f"error: corpus not found: {corpus_path}")
        return 2

    report = build_report(corpus_path, args.top_k, args.agent_output)
    stamp = args.stamp or utc_now_filename_stamp()
    json_path, md_path = report_paths(REPORTS_DIR, report["corpus"]["id"], stamp)

    if not args.no_write:
        write_json(json_path, report)
        write_markdown(md_path, report)

    verdict = report["verdict"]
    sel = report["selection"]
    qual = report["quality"]
    cost = report["cost"]
    headline = (
        f"bench {report['corpus']['id']} · "
        f"selection {sel['selection_accuracy']:.2%} ({verdict['selection']}) · "
        f"cost ${cost['totals']['total_cost_usd']:.6f} ({cost.get('source', 'n/a')}) · "
        f"quality {qual['quality_score']:.2%} ({verdict['quality']}) · "
        f"overall {verdict['overall']}"
    )

    if args.quiet:
        print(headline)
        if not args.no_write:
            print(f"report: {md_path.relative_to(REPO_ROOT)}")
    else:
        print(render_markdown(report))
        if not args.no_write:
            print(f"\n→ json: {json_path.relative_to(REPO_ROOT)}")
            print(f"→ markdown: {md_path.relative_to(REPO_ROOT)}")

    # Exit zero on overall pass OR partial (partial = quality_not_collected by design).
    return 0 if verdict["overall"] in ("pass", "partial") else 1
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
if __name__ == "__main__":
|
|
155
|
+
sys.exit(main(sys.argv[1:]))
|
package/scripts/compress.py
CHANGED
|
@@ -346,6 +346,7 @@ TOOL_DIRS = {
|
|
|
346
346
|
SKILLS_SOURCE = PROJECT_ROOT / ".agent-src" / "skills"
|
|
347
347
|
COMMANDS_SOURCE = PROJECT_ROOT / ".agent-src" / "commands"
|
|
348
348
|
PERSONAS_SOURCE = PROJECT_ROOT / ".agent-src" / "personas"
|
|
349
|
+
USER_TYPES_SOURCE = PROJECT_ROOT / ".agent-src" / "user-types"
|
|
349
350
|
CLAUDE_SKILLS_DIR = PROJECT_ROOT / ".claude" / "skills"
|
|
350
351
|
|
|
351
352
|
PERSONA_TOOL_DIRS = {
|
|
@@ -353,6 +354,11 @@ PERSONA_TOOL_DIRS = {
|
|
|
353
354
|
".cursor/personas": "../../.agent-src/personas",
|
|
354
355
|
}
|
|
355
356
|
|
|
357
|
+
USER_TYPE_TOOL_DIRS = {
|
|
358
|
+
".claude/user-types": "../../.agent-src/user-types",
|
|
359
|
+
".cursor/user-types": "../../.agent-src/user-types",
|
|
360
|
+
}
|
|
361
|
+
|
|
356
362
|
# Map tool-projection directories to the canonical tool ID used by
|
|
357
363
|
# `.agent-tools.yml`. Directories not in this map are always emitted.
|
|
358
364
|
_DIR_TOOL_ID = {
|
|
@@ -361,6 +367,8 @@ _DIR_TOOL_ID = {
|
|
|
361
367
|
".clinerules": "cline",
|
|
362
368
|
".claude/personas": "claude-code",
|
|
363
369
|
".cursor/personas": "cursor",
|
|
370
|
+
".claude/user-types": "claude-code",
|
|
371
|
+
".cursor/user-types": "cursor",
|
|
364
372
|
}
|
|
365
373
|
|
|
366
374
|
|
|
@@ -901,6 +909,43 @@ def generate_persona_symlinks() -> int:
|
|
|
901
909
|
return total
|
|
902
910
|
|
|
903
911
|
|
|
912
|
+
def generate_user_type_symlinks() -> int:
    """Project user-type files into the tool directories as symlinks.

    Every top-level ``*.md`` file in ``USER_TYPES_SOURCE`` except
    ``README.md`` is linked into each directory that ``_filter_tool_dirs``
    keeps from ``USER_TYPE_TOOL_DIRS``. The glob is not recursive, so
    sub-directories such as ``_template/`` are not projected.

    Returns:
        Number of symlinks created across all tool directories; ``0`` when
        the source directory does not exist.
    """
    if not USER_TYPES_SOURCE.exists():
        print(" ⚠️ .agent-src/user-types/ not found — skipping user-types")
        return 0

    names = sorted(
        entry.name for entry in USER_TYPES_SOURCE.glob("*.md") if entry.stem != "README"
    )
    active_dirs = _filter_tool_dirs(USER_TYPE_TOOL_DIRS)
    created = 0

    for tool_dir, rel_prefix in active_dirs.items():
        dest = PROJECT_ROOT / tool_dir
        dest.mkdir(parents=True, exist_ok=True)

        # Drop symlinks whose user-type no longer exists; README.md and
        # anything that is not a symlink is left alone.
        for existing in dest.iterdir():
            is_current = existing.name in names or existing.name == "README.md"
            if existing.is_symlink() and not is_current:
                existing.unlink()

        for name in names:
            link = dest / name
            # is_symlink() also covers dangling links, for which exists()
            # returns False.
            if link.is_symlink() or link.exists():
                link.unlink()
            link.symlink_to(Path(rel_prefix) / name)
            created += 1

    info(f" ✅ Created {created} user-type symlinks across {len(active_dirs)} tool directories ({len(names)} user-types each)")
    return created
|
|
947
|
+
|
|
948
|
+
|
|
904
949
|
def generate_tools() -> None:
|
|
905
950
|
"""Generate all tool-specific directories and files.
|
|
906
951
|
|
|
@@ -916,13 +961,14 @@ def generate_tools() -> None:
|
|
|
916
961
|
skills = generate_claude_skills() if _tool_active("claude-code") else 0
|
|
917
962
|
commands = generate_claude_commands() if _tool_active("claude-code") else 0
|
|
918
963
|
personas = generate_persona_symlinks()
|
|
964
|
+
user_types = generate_user_type_symlinks()
|
|
919
965
|
cursor_mdc = generate_cursor_mdc_rules() if _tool_active("cursor") else 0
|
|
920
966
|
windsurf_modern = generate_windsurf_modern_rules() if _tool_active("windsurf") else 0
|
|
921
967
|
cursor_cmds = generate_cursor_commands() if _tool_active("cursor") else 0
|
|
922
968
|
windsurf_wf = generate_windsurf_workflows() if _tool_active("windsurf") else 0
|
|
923
969
|
summary = (
|
|
924
970
|
f"✅ generate-tools — rules={rules} skills={skills} "
|
|
925
|
-
f"commands={commands} personas={personas} "
|
|
971
|
+
f"commands={commands} personas={personas} user_types={user_types} "
|
|
926
972
|
f"cursor_mdc={cursor_mdc} windsurf_rules={windsurf_modern} "
|
|
927
973
|
f"cursor_commands={cursor_cmds} windsurf_workflows={windsurf_wf} "
|
|
928
974
|
f"windsurfrules={windsurfrules}"
|
|
@@ -943,7 +989,7 @@ def generate_tools() -> None:
|
|
|
943
989
|
# them to symlinks (everything else is always symlinked).
|
|
944
990
|
|
|
945
991
|
# Subdirectories of .agent-src/ that map into .augment/ as symlinks.
|
|
946
|
-
AUGMENT_SYMLINK_DIRS = ("skills", "commands", "guidelines", "personas", "templates", "contexts", "scripts")
|
|
992
|
+
AUGMENT_SYMLINK_DIRS = ("skills", "commands", "guidelines", "personas", "user-types", "templates", "contexts", "scripts")
|
|
947
993
|
# Top-level files to symlink into .augment/ (README, etc.)
|
|
948
994
|
AUGMENT_SYMLINK_FILES = ("README.md",)
|
|
949
995
|
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Config-layer loaders (profile, preset, pack).
|
|
2
|
+
|
|
3
|
+
Phase 1 of step-15 product refinement. Single home for the audience /
|
|
4
|
+
governance / workflow axes introduced by
|
|
5
|
+
:mod:`docs.contracts.profile-system`,
|
|
6
|
+
:mod:`docs.contracts.config-presets`, and the upcoming workflow-packs
|
|
7
|
+
contract. Loaders here are pure, read-only, lazy-PyYAML; they layer on
|
|
8
|
+
top of :mod:`scripts._lib.agent_settings` for project-root anchoring.
|
|
9
|
+
"""
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""Preset loader — step-15 Phase 1 item 4.
|
|
2
|
+
|
|
3
|
+
Resolves the active ``preset.id`` and merged knob set from the chain
|
|
4
|
+
documented in :mod:`docs.contracts.config-presets`. Pure, read-only,
|
|
5
|
+
lazy-PyYAML.
|
|
6
|
+
|
|
7
|
+
Resolution chain (last writer wins for any single knob):
|
|
8
|
+
|
|
9
|
+
1. ``pack.preset_id`` — set ``preset.id`` (Phase 2; ``None`` until packs
|
|
10
|
+
land).
|
|
11
|
+
2. ``profile.preset_id`` — set ``preset.id`` if not pack-set.
|
|
12
|
+
3. ``preset.<id>.yml`` — fill all knobs from the seed file.
|
|
13
|
+
4. ``.agent-settings.yml`` user keys under ``preset:`` — override per-knob.
|
|
14
|
+
5. Environment variables (``AGENT_CONFIG_PRESET_*``) — override per-knob,
|
|
15
|
+
structured keys mapped from the schema (see :data:`ENV_KNOB_MAP`).
|
|
16
|
+
6. Runtime CLI overrides — caller passes a flat ``runtime_overrides`` map.
|
|
17
|
+
|
|
18
|
+
Profile-aware overlay is **not** done here — callers that need
|
|
19
|
+
profile-specific reads of preset knobs (e.g. ``block_on_risk.code_paths``
|
|
20
|
+
for ``developer`` vs ``block_on_risk.financial_paths`` for ``founder``)
|
|
21
|
+
read the merged knob bag returned by :func:`resolve_preset`.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import logging
|
|
26
|
+
import os
|
|
27
|
+
from copy import deepcopy
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Any
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
PRESET_ID_ENV = "AGENT_CONFIG_PRESET_ID"
|
|
35
|
+
SEED_PRESET_IDS: tuple[str, ...] = ("fast", "balanced", "strict")
|
|
36
|
+
DEFAULT_PRESET_ID = "balanced"
|
|
37
|
+
PRESETS_DIRNAME = ".agent-src.uncompressed/presets"
|
|
38
|
+
|
|
39
|
+
SOURCE_PACK = "pack"
|
|
40
|
+
SOURCE_PROFILE = "profile"
|
|
41
|
+
SOURCE_USER = "user-settings"
|
|
42
|
+
SOURCE_ENV = "env"
|
|
43
|
+
SOURCE_RUNTIME = "runtime"
|
|
44
|
+
SOURCE_DEFAULT = "default"
|
|
45
|
+
|
|
46
|
+
ENV_KNOB_MAP: dict[str, tuple[str, ...]] = {
|
|
47
|
+
"AGENT_CONFIG_PRESET_COST_DAILY_MAX_USD": ("cost", "daily_max_usd"),
|
|
48
|
+
"AGENT_CONFIG_PRESET_COST_WEEKLY_MAX_USD": ("cost", "weekly_max_usd"),
|
|
49
|
+
"AGENT_CONFIG_PRESET_COST_MONTHLY_MAX_USD": ("cost", "monthly_max_usd"),
|
|
50
|
+
"AGENT_CONFIG_PRESET_MCP_PER_CALL_MAX_USD": ("mcp", "per_call_max_usd"),
|
|
51
|
+
"AGENT_CONFIG_PRESET_MCP_PER_SESSION_MAX_USD": ("mcp", "per_session_max_usd"),
|
|
52
|
+
"AGENT_CONFIG_PRESET_COUNCIL_CAP_PER_CONSULT_USD": (
|
|
53
|
+
"council",
|
|
54
|
+
"cap_per_consult_usd",
|
|
55
|
+
),
|
|
56
|
+
"AGENT_CONFIG_PRESET_AUTONOMY_DEFAULT": ("autonomy", "default"),
|
|
57
|
+
"AGENT_CONFIG_PRESET_CONFIDENCE_MIN_BAND": ("confidence", "min_band"),
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass(frozen=True)
class ResolvedPreset:
    """Immutable result produced by :func:`resolve_preset`.

    The instance is frozen, but ``knobs`` is a plain dict and therefore
    still mutable in place. See the config-presets contract for the meaning
    of the individual knobs.
    """

    # Identifier of the preset that was selected.
    id: str
    # Knob mapping after all override layers have been applied.
    knobs: dict[str, Any] = field(default_factory=dict)
    # Layer that chose ``id`` (one of the ``SOURCE_*`` constants).
    source: str = SOURCE_DEFAULT
    # Dotted paths of the knobs that were overridden, in application order.
    overrides: tuple[str, ...] = ()
    # Optional advisory message; defaults to ``None``.
    warning: str | None = None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class PresetError(Exception):
    """Signal that a referenced preset cannot be resolved.

    Raised when the YAML file for the selected preset id is missing or its
    top-level ``preset:`` entry is not a mapping.
    """
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _load_yaml(path: Path) -> dict[str, Any]:
|
|
77
|
+
try:
|
|
78
|
+
import yaml # type: ignore[import-not-found]
|
|
79
|
+
except ImportError:
|
|
80
|
+
logger.info("PyYAML unavailable; preset %s returned empty", path)
|
|
81
|
+
return {}
|
|
82
|
+
try:
|
|
83
|
+
text = path.read_text(encoding="utf-8")
|
|
84
|
+
except OSError as exc:
|
|
85
|
+
logger.warning("preset read failed for %s: %s", path, exc)
|
|
86
|
+
return {}
|
|
87
|
+
try:
|
|
88
|
+
data = yaml.safe_load(text) or {}
|
|
89
|
+
except yaml.YAMLError as exc:
|
|
90
|
+
logger.warning("preset parse failed for %s: %s", path, exc)
|
|
91
|
+
return {}
|
|
92
|
+
return data if isinstance(data, dict) else {}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _preset_file(project_root: Path, preset_id: str) -> Path:
    """Return the seed YAML path for ``preset_id`` below ``project_root``.

    The file is ``<project_root>/<PRESETS_DIRNAME>/<preset_id>.yml``; no
    check is made that it exists.
    """
    filename = f"{preset_id}.yml"
    return project_root.joinpath(PRESETS_DIRNAME, filename)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _coerce_scalar(raw: str) -> Any:
|
|
100
|
+
try:
|
|
101
|
+
return int(raw)
|
|
102
|
+
except ValueError:
|
|
103
|
+
pass
|
|
104
|
+
try:
|
|
105
|
+
return float(raw)
|
|
106
|
+
except ValueError:
|
|
107
|
+
pass
|
|
108
|
+
if raw.lower() in {"true", "false"}:
|
|
109
|
+
return raw.lower() == "true"
|
|
110
|
+
return raw
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> list[str]:
|
|
114
|
+
"""Merge ``override`` into ``base`` in place; return dotted-override paths."""
|
|
115
|
+
paths: list[str] = []
|
|
116
|
+
|
|
117
|
+
def walk(b: dict[str, Any], o: dict[str, Any], prefix: str) -> None:
|
|
118
|
+
for key, value in o.items():
|
|
119
|
+
dotted = f"{prefix}{key}"
|
|
120
|
+
if isinstance(value, dict) and isinstance(b.get(key), dict):
|
|
121
|
+
walk(b[key], value, f"{dotted}.")
|
|
122
|
+
else:
|
|
123
|
+
b[key] = deepcopy(value)
|
|
124
|
+
paths.append(dotted)
|
|
125
|
+
|
|
126
|
+
walk(base, override, "")
|
|
127
|
+
return paths
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _pick_id(
    pack_preset_id: str | None,
    profile_preset_id: str | None,
    user_settings: dict[str, Any],
    runtime_id: str | None,
) -> tuple[str | None, str]:
    """Choose the preset id and report which layer supplied it.

    Precedence, highest first: runtime id, the ``PRESET_ID_ENV``
    environment variable, ``preset.id`` in the user settings, the pack's
    preset id, the profile's preset id. Empty values count as "not set".

    Returns:
        ``(preset_id, source)``; ``(None, SOURCE_DEFAULT)`` when no layer
        provides an id.
    """
    if runtime_id:
        return runtime_id, SOURCE_RUNTIME

    from_env = os.environ.get(PRESET_ID_ENV)
    if from_env:
        return from_env, SOURCE_ENV

    if isinstance(user_settings, dict):
        preset_block = user_settings.get("preset")
        if isinstance(preset_block, dict) and preset_block.get("id"):
            return str(preset_block["id"]), SOURCE_USER

    # Pack wins over profile when both are set.
    fallbacks = (
        (pack_preset_id, SOURCE_PACK),
        (profile_preset_id, SOURCE_PROFILE),
    )
    for candidate, origin in fallbacks:
        if candidate:
            return candidate, origin
    return None, SOURCE_DEFAULT
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _assign_knob(knobs: dict[str, Any], path: tuple[str, ...], value: Any) -> None:
    """Write ``value`` at the nested ``path`` inside ``knobs``.

    Missing intermediate levels are created. An intermediate level that
    holds something other than a dict (for example ``cost: null`` in the
    seed file) is replaced by a fresh dict, as ``_deep_merge`` does when a
    mapping override meets a non-mapping value.
    """
    cursor = knobs
    for part in path[:-1]:
        child = cursor.get(part)
        if not isinstance(child, dict):
            child = {}
            cursor[part] = child
        cursor = child
    cursor[path[-1]] = value


def resolve_preset(
    *,
    project_root: Path,
    user_settings: dict[str, Any] | None = None,
    pack_preset_id: str | None = None,
    profile_preset_id: str | None = None,
    runtime_id: str | None = None,
    runtime_overrides: dict[tuple[str, ...], Any] | None = None,
) -> ResolvedPreset:
    """Return the active :class:`ResolvedPreset` for the current session.

    The preset id is chosen by ``_pick_id`` (falling back to
    ``DEFAULT_PRESET_ID``), the seed knobs are loaded from that preset's
    YAML file, and overrides are layered on top in this order: user
    settings under ``preset:``, environment variables from
    ``ENV_KNOB_MAP``, then ``runtime_overrides``.

    Args:
        project_root: Root directory below which the preset files live.
        user_settings: Parsed user settings; only the ``preset`` mapping is
            read.
        pack_preset_id: Preset id requested by the active pack, if any.
        profile_preset_id: Preset id requested by the active profile, if any.
        runtime_id: Preset id forced by the caller; wins over every other
            source.
        runtime_overrides: ``{path_tuple: value}`` overrides applied last.

    Returns:
        The resolved preset with merged knobs and the dotted paths of all
        overrides in application order.

    Raises:
        PresetError: If the preset's YAML file does not exist or its
            top-level ``preset:`` entry is not a mapping.
    """
    settings = user_settings or {}
    preset_id, source = _pick_id(
        pack_preset_id, profile_preset_id, settings, runtime_id,
    )
    if preset_id is None:
        preset_id = DEFAULT_PRESET_ID
        source = SOURCE_DEFAULT

    yaml_path = _preset_file(project_root, preset_id)
    if not yaml_path.exists():
        raise PresetError(
            f"preset.id={preset_id!r} ({source}) but {yaml_path} not found",
        )
    raw = _load_yaml(yaml_path)
    knobs = raw.get("preset") or {}
    if not isinstance(knobs, dict):
        raise PresetError(f"{yaml_path} has no top-level 'preset:' mapping")
    # Work on a copy so the override layers never touch the parsed seed data.
    knobs = deepcopy(knobs)
    # The id is reported separately on the result, not as a knob.
    knobs.pop("id", None)

    overrides: list[str] = []

    # Layer 1: per-knob overrides from the user's settings file.
    user_block = settings.get("preset")
    if isinstance(user_block, dict):
        user_overrides = {k: v for k, v in user_block.items() if k != "id"}
        if user_overrides:
            overrides.extend(_deep_merge(knobs, user_overrides))

    # Layer 2: environment variables mapped to structured knob paths.
    for env_key, path in ENV_KNOB_MAP.items():
        raw_value = os.environ.get(env_key)
        if raw_value is None:
            continue
        # _assign_knob tolerates non-mapping intermediates, where a bare
        # setdefault() walk would raise TypeError.
        _assign_knob(knobs, path, _coerce_scalar(raw_value))
        overrides.append(".".join(path))

    # Layer 3: caller-supplied runtime overrides; values are used as given.
    if runtime_overrides:
        for path, value in runtime_overrides.items():
            _assign_knob(knobs, path, value)
            overrides.append(".".join(path))

    return ResolvedPreset(
        id=preset_id,
        knobs=knobs,
        source=source,
        overrides=tuple(overrides),
    )
|