akm-cli 0.7.0-rc1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/cli.js +100 -16
- package/dist/src/commands/config-cli.js +42 -0
- package/dist/src/commands/history.js +78 -7
- package/dist/src/commands/registry-search.js +69 -6
- package/dist/src/commands/search.js +30 -3
- package/dist/src/commands/show.js +29 -0
- package/dist/src/commands/source-add.js +5 -1
- package/dist/src/commands/source-manage.js +7 -1
- package/dist/src/core/config.js +28 -0
- package/dist/src/indexer/db-search.js +1 -0
- package/dist/src/indexer/indexer.js +16 -2
- package/dist/src/indexer/matchers.js +1 -1
- package/dist/src/indexer/search-source.js +4 -2
- package/dist/src/integrations/agent/profiles.js +1 -1
- package/dist/src/integrations/agent/spawn.js +67 -16
- package/dist/src/integrations/github.js +9 -3
- package/dist/src/llm/embedders/remote.js +37 -3
- package/dist/src/output/cli-hints.js +15 -2
- package/dist/src/output/renderers.js +3 -1
- package/dist/src/output/shapes.js +8 -1
- package/dist/src/output/text.js +156 -3
- package/dist/src/registry/build-index.js +5 -4
- package/dist/src/registry/providers/static-index.js +3 -1
- package/dist/src/setup/setup.js +9 -0
- package/dist/src/wiki/wiki.js +54 -6
- package/dist/src/workflows/runs.js +37 -3
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
- package/dist/tests/bench/attribution.test.js +24 -23
- package/dist/tests/bench/cleanup.js +31 -0
- package/dist/tests/bench/cli.js +366 -31
- package/dist/tests/bench/cli.test.js +282 -14
- package/dist/tests/bench/corpus.js +3 -0
- package/dist/tests/bench/corpus.test.js +10 -10
- package/dist/tests/bench/doctor.js +525 -0
- package/dist/tests/bench/driver.js +77 -22
- package/dist/tests/bench/driver.test.js +142 -1
- package/dist/tests/bench/environment.js +233 -0
- package/dist/tests/bench/environment.test.js +199 -0
- package/dist/tests/bench/evolve.js +67 -0
- package/dist/tests/bench/evolve.test.js +12 -4
- package/dist/tests/bench/failure-modes.test.js +52 -3
- package/dist/tests/bench/feedback-integrity.test.js +3 -2
- package/dist/tests/bench/leakage.test.js +105 -2
- package/dist/tests/bench/learning-curve.test.js +3 -2
- package/dist/tests/bench/metrics.js +102 -26
- package/dist/tests/bench/metrics.test.js +10 -4
- package/dist/tests/bench/opencode-config.js +194 -0
- package/dist/tests/bench/opencode-config.test.js +370 -0
- package/dist/tests/bench/report.js +73 -9
- package/dist/tests/bench/report.test.js +59 -10
- package/dist/tests/bench/run-config.js +355 -0
- package/dist/tests/bench/run-config.test.js +298 -0
- package/dist/tests/bench/run-curate-test.js +32 -0
- package/dist/tests/bench/run-failing-tasks.js +56 -0
- package/dist/tests/bench/run-full-bench.js +51 -0
- package/dist/tests/bench/run-items36-targeted.js +69 -0
- package/dist/tests/bench/run-nano-quick.js +42 -0
- package/dist/tests/bench/run-waveg-targeted.js +62 -0
- package/dist/tests/bench/runner.js +257 -94
- package/dist/tests/bench/tmp.js +90 -0
- package/dist/tests/bench/trajectory.js +2 -2
- package/dist/tests/bench/verifier.js +6 -1
- package/dist/tests/bench/workflow-spec.js +11 -24
- package/dist/tests/bench/workflow-spec.test.js +1 -1
- package/dist/tests/bench/workflow-trace.js +34 -0
- package/dist/tests/cli-errors.test.js +1 -0
- package/dist/tests/commands/history.test.js +195 -0
- package/dist/tests/config.test.js +25 -0
- package/dist/tests/e2e.test.js +23 -2
- package/dist/tests/fixtures/stashes/load.js +1 -1
- package/dist/tests/fixtures/stashes/load.test.js +11 -2
- package/dist/tests/indexer.test.js +12 -1
- package/dist/tests/output-baseline.test.js +2 -1
- package/dist/tests/output-shapes-unit.test.js +3 -1
- package/dist/tests/registry-build-index.test.js +17 -1
- package/dist/tests/registry-providers/static-index.test.js +34 -0
- package/dist/tests/registry-search.test.js +200 -0
- package/dist/tests/remember-frontmatter.test.js +11 -13
- package/dist/tests/source-qa-fixes.test.js +18 -0
- package/dist/tests/source-registry.test.js +3 -3
- package/dist/tests/source-source.test.js +61 -1
- package/dist/tests/workflow-qa-fixes.test.js +18 -0
- package/package.json +1 -1
|
@@ -26,13 +26,89 @@ import { computeFixtureContentHash, loadFixtureStash } from "../fixtures/stashes
|
|
|
26
26
|
import { registerCleanup } from "./cleanup";
|
|
27
27
|
import { computeTaskCorpusHash, readTaskBody } from "./corpus";
|
|
28
28
|
import { runOne } from "./driver";
|
|
29
|
+
import { validateFixtureCorpus } from "./environment";
|
|
29
30
|
import { aggregateCorpus, aggregateFailureModes, aggregatePerTask, aggregateTrajectory, classifyFailureMode, computeCorpusDelta, computePerAssetAttribution, computePerTaskDelta, computeSearchBridge, extractAssetLoads, extractGoldRanks, } from "./metrics";
|
|
30
31
|
import { resolveGitBranch, resolveGitCommit } from "./report";
|
|
31
|
-
import { benchMkdtemp } from "./tmp";
|
|
32
|
+
import { benchMkdtemp, benchTmpRoot } from "./tmp";
|
|
32
33
|
import { computeTrajectory } from "./trajectory";
|
|
33
34
|
import { evaluateRunAgainstAllSpecs, } from "./workflow-evaluator";
|
|
34
35
|
import { loadAllWorkflowSpecs } from "./workflow-spec";
|
|
35
36
|
import { normalizeRunToTrace } from "./workflow-trace";
|
|
37
|
+
/** Checkpoint write interval: write a partial file every N completed runs. */
|
|
38
|
+
const CHECKPOINT_INTERVAL = 5;
|
|
39
|
+
/** Partial file max age before cleanup: 24 hours in milliseconds. */
|
|
40
|
+
const PARTIAL_MAX_AGE_MS = 24 * 60 * 60 * 1000;
|
|
41
|
+
/**
 * Emit a one-line progress update to stderr after each (task, arm, seed)
 * completes. Goes to stderr even when --json is passed so operators always
 * have a heartbeat signal during long runs.
 *
 * Format: `[<completed>/<total>] <taskId> <arm> <outcome> <wallclockSeconds>s`
 *
 * @param {number} completed - Count of runs finished so far (this one included).
 * @param {number} total - Total number of runs expected for the invocation.
 * @param {object} run - Run result; `taskId`, `arm`, `outcome` and
 *   `wallclockMs` are read from it.
 */
function emitProgress(completed, total, run) {
    // A run that carries no usable wall-clock value (missing or non-finite)
    // is reported as 0s so the heartbeat line never contains "NaNs".
    const elapsedMs = Number.isFinite(run.wallclockMs) ? run.wallclockMs : 0;
    const secs = Math.round(elapsedMs / 1000);
    process.stderr.write(`[${completed}/${total}] ${run.taskId} ${run.arm} ${run.outcome} ${secs}s\n`);
}
|
|
52
|
+
/**
 * Write a partial checkpoint file under `${AKM_CACHE_DIR}/bench/`.
 * The file contains the runs completed so far plus a `partial: true` marker
 * and a `summary.total_runs_completed` counter. Old partial files (>24h)
 * are not cleaned up here — that is done at startup via `cleanupOldPartials`.
 *
 * The envelope is first written to a sibling `.tmp` file and then renamed
 * over the target, so a crash in the middle of a write cannot leave a
 * truncated checkpoint behind. The temp name keeps the `bench-partial-`
 * prefix, so an orphaned temp file is reaped by `cleanupOldPartials` too.
 *
 * Best-effort: every failure is swallowed; nothing is thrown to the caller.
 *
 * @param {Array<object>} runs - Completed run results; `taskId`, `arm`,
 *   `seed`, `model`, `outcome` and `wallclockMs` are copied from each.
 * @param {string} timestamp - Run timestamp; `:` and `.` are replaced with
 *   `-` to form the file name.
 */
function writePartialCheckpoint(runs, timestamp) {
    let tmpPath;
    try {
        const root = benchTmpRoot();
        const filename = `bench-partial-${timestamp.replace(/[:.]/g, "-")}.json`;
        const outPath = path.join(root, filename);
        const envelope = {
            partial: true,
            summary: {
                total_runs_completed: runs.length,
            },
            timestamp,
            runs: runs.map((r) => ({
                task_id: r.taskId,
                arm: r.arm,
                seed: r.seed,
                model: r.model,
                outcome: r.outcome,
                wallclock_ms: r.wallclockMs,
            })),
        };
        tmpPath = `${outPath}.tmp`;
        fs.writeFileSync(tmpPath, JSON.stringify(envelope, null, 2), "utf8");
        // Rename replaces the previous checkpoint in one step.
        fs.renameSync(tmpPath, outPath);
    }
    catch {
        // Checkpoint writes are best-effort — never abort a run for a write failure.
        if (tmpPath !== undefined) {
            try {
                fs.unlinkSync(tmpPath);
            }
            catch {
                /* temp file may not exist; nothing more to do */
            }
        }
    }
}
|
|
84
|
+
/**
 * Delete `bench-partial-*` entries in the bench tmp root whose modification
 * time is more than `PARTIAL_MAX_AGE_MS` in the past.
 *
 * Meant to run once when `runUtility` starts, to reap checkpoints orphaned
 * by earlier crashed runs. Entirely best-effort: a failure on one entry
 * skips that entry, and a failure listing the directory ends the sweep
 * silently.
 */
function cleanupOldPartials() {
    try {
        const benchRoot = benchTmpRoot();
        const sweepStartedAt = Date.now();
        const partialNames = fs
            .readdirSync(benchRoot)
            .filter((name) => name.startsWith("bench-partial-"));
        for (const name of partialNames) {
            const candidate = path.join(benchRoot, name);
            try {
                const ageMs = sweepStartedAt - fs.statSync(candidate).mtimeMs;
                if (ageMs > PARTIAL_MAX_AGE_MS) {
                    fs.unlinkSync(candidate);
                }
            }
            catch {
                /* swallow per-file errors */
            }
        }
    }
    catch {
        /* swallow — cleanup is best-effort */
    }
}
|
|
36
112
|
/**
|
|
37
113
|
* Default workflows directory. Can be overridden by callers (tests) via
|
|
38
114
|
* `RunUtilityOptions.workflowsDir`. Specs in this directory are loaded ONCE
|
|
@@ -40,6 +116,16 @@ import { normalizeRunToTrace } from "./workflow-trace";
|
|
|
40
116
|
* `applies_to` so we don't I/O in the hot loop.
|
|
41
117
|
*/
|
|
42
118
|
const DEFAULT_WORKFLOWS_DIR = path.resolve(__dirname, "..", "fixtures", "bench", "workflows");
|
|
119
|
+
/**
 * Run `items` in batches of `n` concurrently, calling `fn` for each item.
 * Batches are executed sequentially; within each batch all items run with
 * `Promise.all`. This gives bounded concurrency without a full work-queue.
 *
 * A batch size that is not a number >= 1 (0, negative, NaN, undefined) is
 * treated as 1, and fractional sizes are floored. Without this, `n = 0`
 * would never advance the loop and `n = NaN` would return without running
 * any item.
 *
 * @param {Array<*>} items - Work items, processed in array order.
 * @param {number} n - Maximum number of items in flight at once.
 * @param {Function} fn - Invoked via `Array.prototype.map` for each item of
 *   a batch; may return a promise.
 * @returns {Promise<void>} Resolves once every batch has settled; rejects
 *   with the first rejection raised inside a batch.
 */
async function runInBatches(items, n, fn) {
    // `NaN >= 1` and `undefined >= 1` are both false, so they fall back to 1.
    const batchSize = n >= 1 ? Math.floor(n) : 1;
    for (let i = 0; i < items.length; i += batchSize) {
        await Promise.all(items.slice(i, i + batchSize).map(fn));
    }
}
|
|
43
129
|
/**
|
|
44
130
|
* Run K seeds × len(arms) × len(tasks) and return the §13.3 report.
|
|
45
131
|
*
|
|
@@ -47,6 +133,11 @@ const DEFAULT_WORKFLOWS_DIR = path.resolve(__dirname, "..", "fixtures", "bench",
|
|
|
47
133
|
* every failure path into a RunResult, so the runner only has to worry
|
|
48
134
|
* about its own infrastructure (stash materialisation, workspace copy).
|
|
49
135
|
* Those failures are recorded as `harness_error` runs.
|
|
136
|
+
*
|
|
137
|
+
* When `options.parallel > 1`, work items are batched and run concurrently
|
|
138
|
+
* via `runInBatches`. The shared `warnings`, `goldRankRecords`, and
|
|
139
|
+
* `workflowChecks` arrays are updated atomically at the end of each item so
|
|
140
|
+
* no JS-level races occur (Node/Bun is single-threaded).
|
|
50
141
|
*/
|
|
51
142
|
export async function runUtility(options) {
|
|
52
143
|
const seedsPerArm = options.seedsPerArm ?? 5;
|
|
@@ -54,9 +145,39 @@ export async function runUtility(options) {
|
|
|
54
145
|
const budgetWallMs = options.budgetWallMs ?? 120000;
|
|
55
146
|
const slice = options.slice ?? "all";
|
|
56
147
|
const materialiseStash = options.materialiseStash ?? true;
|
|
148
|
+
// Clamp parallel to [1, 8].
|
|
149
|
+
const parallel = Math.min(8, Math.max(1, options.parallel ?? 1));
|
|
150
|
+
if (parallel > 4 && !options.forceParallel) {
|
|
151
|
+
process.stderr.write(`bench: --parallel ${parallel} exceeds 4; high concurrency may overwhelm local providers. ` +
|
|
152
|
+
`Pass --force-parallel to suppress this warning.\n`);
|
|
153
|
+
}
|
|
154
|
+
// Clean up orphaned partial files from prior crashed runs (best-effort).
|
|
155
|
+
cleanupOldPartials();
|
|
57
156
|
const grouped = new Map();
|
|
58
157
|
const warnings = [];
|
|
158
|
+
// Validate all task stash references before starting any work. Missing
|
|
159
|
+
// fixtures produce harness_error at run time; better to surface them loudly
|
|
160
|
+
// at startup with the fixture name than to discover them per-seed mid-run.
|
|
161
|
+
if (materialiseStash && options.arms.includes("akm")) {
|
|
162
|
+
const { missing } = validateFixtureCorpus(options.tasks);
|
|
163
|
+
for (const [fixture, taskIds] of missing) {
|
|
164
|
+
const w = `fixture "${fixture}" missing MANIFEST.json — tasks will harness_error: ${taskIds.join(", ")}`;
|
|
165
|
+
process.stderr.write(`bench: WARNING: ${w}\n`);
|
|
166
|
+
warnings.push(w);
|
|
167
|
+
}
|
|
168
|
+
}
|
|
59
169
|
const goldRankRecords = [];
|
|
170
|
+
// Progress tracking: compute total run count upfront so progress lines show
|
|
171
|
+
// `[7/40]` rather than an unbounded counter.
|
|
172
|
+
const armsForProgress = options.includeSynthetic
|
|
173
|
+
? [...new Set([...options.arms, "synthetic"])]
|
|
174
|
+
: options.arms;
|
|
175
|
+
const totalRuns = options.tasks.length * armsForProgress.length * seedsPerArm;
|
|
176
|
+
let completedRuns = 0;
|
|
177
|
+
// Partial checkpoint accumulator: collects all RunResults as they land so
|
|
178
|
+
// we can write a partial envelope periodically without keeping duplicates.
|
|
179
|
+
const allCompletedRuns = [];
|
|
180
|
+
const runTimestamp = options.timestamp ?? new Date().toISOString();
|
|
60
181
|
// #257: load workflow specs ONCE per runUtility call. Skipped when the
|
|
61
182
|
// caller passes an empty `workflowsDir` string (test escape hatch). Errors
|
|
62
183
|
// are surfaced as warnings — workflow evaluation is best-effort and a
|
|
@@ -89,7 +210,7 @@ export async function runUtility(options) {
|
|
|
89
210
|
let stashError;
|
|
90
211
|
if (options.arms.includes("akm") && materialiseStash && !overrideStashDir) {
|
|
91
212
|
try {
|
|
92
|
-
stash = loadFixtureStash(task.stash
|
|
213
|
+
stash = loadFixtureStash(task.stash);
|
|
93
214
|
}
|
|
94
215
|
catch (err) {
|
|
95
216
|
stashError = err instanceof Error ? err.message : String(err);
|
|
@@ -121,99 +242,130 @@ export async function runUtility(options) {
|
|
|
121
242
|
return options.arms;
|
|
122
243
|
return [...options.arms, "synthetic"];
|
|
123
244
|
})();
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
taskId: run.taskId,
|
|
208
|
-
seed: run.seed,
|
|
209
|
-
outcome: run.outcome,
|
|
210
|
-
};
|
|
211
|
-
const taskMetadata = buildWorkflowTaskMetadata(task, trace);
|
|
212
|
-
const checks = evaluateRunAgainstAllSpecs(trace, workflowSpecs, runCtx, taskMetadata);
|
|
213
|
-
workflowChecks.push(...checks);
|
|
214
|
-
}
|
|
215
|
-
}
|
|
245
|
+
const workItems = [];
|
|
246
|
+
for (const arm of armsForTask) {
|
|
247
|
+
taskRuns.set(arm, []);
|
|
248
|
+
for (let seed = 0; seed < seedsPerArm; seed += 1) {
|
|
249
|
+
workItems.push({ arm, seed });
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
// Per-run worker: resolves stash/prompt, executes runOneIsolated, then
|
|
253
|
+
// splices the result into the shared accumulators. Because Bun/Node is
|
|
254
|
+
// single-threaded these splices are race-free even across concurrent
|
|
255
|
+
// awaits — only one microtask runs at a time between yield points.
|
|
256
|
+
const runItem = async ({ arm, seed }) => {
|
|
257
|
+
// Resolve the stashDir we'll forward to the agent. The akm arm
|
|
258
|
+
// always carries a stashDir so AKM_STASH_DIR is set in the child
|
|
259
|
+
// env — this is how downstream tooling (and the trajectory parser
|
|
260
|
+
// event-stream lookup) distinguishes arms. When the operator opted
|
|
261
|
+
// out of fixture materialisation (tests, dry-run), we still pass a
|
|
262
|
+
// stable placeholder so the env keys are wired correctly.
|
|
263
|
+
let stashDir;
|
|
264
|
+
if (arm === "akm") {
|
|
265
|
+
// Resolution order (must match the issue #251 acceptance criteria):
|
|
266
|
+
// 1. Per-task explicit override (used by `runMaskedCorpus` to
|
|
267
|
+
// point at a tmp stash with one asset removed). Highest
|
|
268
|
+
// priority because attribution correctness depends on this
|
|
269
|
+
// branch never being shadowed by the `__no-stash__`
|
|
270
|
+
// placeholder fallback.
|
|
271
|
+
// 2. Per-(task, arm)-call `stashDirByFixture` override (Phase
|
|
272
|
+
// 3 evolve persistence).
|
|
273
|
+
// 3. Per-task materialised fixture stash from `loadFixtureStash`.
|
|
274
|
+
// 4. `materialiseStash: false` placeholder so AKM_STASH_DIR is
|
|
275
|
+
// still wired into the child env.
|
|
276
|
+
if (task.stashDirOverride)
|
|
277
|
+
stashDir = task.stashDirOverride;
|
|
278
|
+
else if (overrideStashDir)
|
|
279
|
+
stashDir = overrideStashDir;
|
|
280
|
+
else if (stash)
|
|
281
|
+
stashDir = stash.stashDir;
|
|
282
|
+
else if (!materialiseStash)
|
|
283
|
+
stashDir = path.join(task.taskDir, "__no-stash__");
|
|
284
|
+
}
|
|
285
|
+
// Build the prompt-override (#267). The builder is invoked for every
|
|
286
|
+
// (task, arm, seed) work item; seeds are expected to share a prompt. `undefined` keeps the
|
|
287
|
+
// driver's default prompt in play.
|
|
288
|
+
//
|
|
289
|
+
// #261: the synthetic arm has a scratch-notes prompt contract —
|
|
290
|
+
// the model is told no AKM stash is available and instructed to
|
|
291
|
+
// write/use its own procedural notes. When the caller does not
|
|
292
|
+
// supply a `buildPrompt` override for the synthetic arm we fall
|
|
293
|
+
// back to a built-in scratch-notes prompt so the contract is
|
|
294
|
+
// honoured by every utility-track caller, not just `runEvolve`.
|
|
295
|
+
let promptOverride = options.buildPrompt?.(task, arm);
|
|
296
|
+
if (promptOverride === undefined && arm === "synthetic") {
|
|
297
|
+
promptOverride = buildUtilitySyntheticPrompt(task.id);
|
|
298
|
+
}
|
|
299
|
+
// Collect per-run warnings separately and merge after the run so
|
|
300
|
+
// concurrent runs don't interleave partial warning sequences.
|
|
301
|
+
const runWarnings = [];
|
|
302
|
+
const run = await runOneIsolated({
|
|
303
|
+
task,
|
|
304
|
+
arm,
|
|
305
|
+
seed,
|
|
306
|
+
model: options.model,
|
|
307
|
+
stashDir,
|
|
308
|
+
budgetTokens,
|
|
309
|
+
budgetWallMs,
|
|
310
|
+
spawn: options.spawn,
|
|
311
|
+
warnings: runWarnings,
|
|
312
|
+
...(promptOverride !== undefined ? { prompt: promptOverride } : {}),
|
|
313
|
+
...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
|
|
314
|
+
...(stash?.indexCacheHome ? { indexCacheHome: stash.indexCacheHome } : {}),
|
|
315
|
+
});
|
|
316
|
+
// Merge per-run warnings into the shared array.
|
|
317
|
+
if (runWarnings.length > 0)
|
|
318
|
+
warnings.push(...runWarnings);
|
|
319
|
+
taskRuns.get(arm)?.push(run);
|
|
320
|
+
// Emit a compact progress line to stderr (unconditional — even under
|
|
321
|
+
// --json so operators have a heartbeat during long runs).
|
|
322
|
+
completedRuns += 1;
|
|
323
|
+
emitProgress(completedRuns, totalRuns, run);
|
|
324
|
+
// Accumulate for partial checkpointing.
|
|
325
|
+
allCompletedRuns.push(run);
|
|
326
|
+
if (completedRuns % CHECKPOINT_INTERVAL === 0) {
|
|
327
|
+
writePartialCheckpoint(allCompletedRuns, runTimestamp);
|
|
216
328
|
}
|
|
329
|
+
// §6.7 search-pipeline bridge: only the akm arm consults the stash,
|
|
330
|
+
// and we only attribute ranks for tasks with a gold ref. Both
|
|
331
|
+
// guards mean noakm and gold-less runs are silently excluded.
|
|
332
|
+
if (arm === "akm" && task.goldRef) {
|
|
333
|
+
const searches = extractGoldRanks(run, task.goldRef);
|
|
334
|
+
goldRankRecords.push({
|
|
335
|
+
taskId: task.id,
|
|
336
|
+
arm,
|
|
337
|
+
seed,
|
|
338
|
+
outcome: run.outcome,
|
|
339
|
+
goldRef: task.goldRef,
|
|
340
|
+
searches,
|
|
341
|
+
});
|
|
342
|
+
}
|
|
343
|
+
// #257: evaluate the akm-arm run against every workflow spec. The
|
|
344
|
+
// evaluator's `specApplies` filter handles applicability (arm,
|
|
345
|
+
// domain, gold ref, repeated-failures threshold), so we hand it the
|
|
346
|
+
// entire spec list and append whatever it returns. noakm/synthetic
|
|
347
|
+
// arms are not evaluated — workflow specs target the akm arm.
|
|
348
|
+
if (arm === "akm" && workflowSpecs.length > 0) {
|
|
349
|
+
const trace = normalizeRunToTrace(run, {
|
|
350
|
+
warnings: runWarnings,
|
|
351
|
+
harness: {
|
|
352
|
+
agentStartedTs: run.startedAt,
|
|
353
|
+
agentFinishedTs: run.finishedAt,
|
|
354
|
+
},
|
|
355
|
+
});
|
|
356
|
+
const runCtx = {
|
|
357
|
+
arm: run.arm,
|
|
358
|
+
taskId: run.taskId,
|
|
359
|
+
seed: run.seed,
|
|
360
|
+
outcome: run.outcome,
|
|
361
|
+
};
|
|
362
|
+
const taskMetadata = buildWorkflowTaskMetadata(task, trace);
|
|
363
|
+
const checks = evaluateRunAgainstAllSpecs(trace, workflowSpecs, runCtx, taskMetadata);
|
|
364
|
+
workflowChecks.push(...checks);
|
|
365
|
+
}
|
|
366
|
+
};
|
|
367
|
+
try {
|
|
368
|
+
await runInBatches(workItems, parallel, runItem);
|
|
217
369
|
}
|
|
218
370
|
finally {
|
|
219
371
|
// Deregister BEFORE running cleanup so a SIGINT arriving during this
|
|
@@ -289,6 +441,7 @@ async function runOneIsolated(args) {
|
|
|
289
441
|
track: "utility",
|
|
290
442
|
arm: args.arm,
|
|
291
443
|
taskId: args.task.id,
|
|
444
|
+
taskTitle: args.task.title,
|
|
292
445
|
workspace,
|
|
293
446
|
model: args.model,
|
|
294
447
|
seed: args.seed,
|
|
@@ -297,10 +450,13 @@ async function runOneIsolated(args) {
|
|
|
297
450
|
verifier: args.task.verifier,
|
|
298
451
|
taskDir: args.task.taskDir,
|
|
299
452
|
...(args.task.expectedMatch ? { expectedMatch: args.task.expectedMatch } : {}),
|
|
453
|
+
...(args.task.akmKeywords ? { akmKeywords: args.task.akmKeywords } : {}),
|
|
300
454
|
...(args.stashDir ? { stashDir: args.stashDir } : {}),
|
|
301
455
|
...(args.spawn ? { spawn: args.spawn } : {}),
|
|
302
456
|
...(args.prompt !== undefined ? { prompt: args.prompt } : {}),
|
|
303
457
|
warnings: args.warnings,
|
|
458
|
+
...(args.opencodeProviders ? { opencodeProviders: args.opencodeProviders } : {}),
|
|
459
|
+
...(args.indexCacheHome ? { indexCacheHome: args.indexCacheHome } : {}),
|
|
304
460
|
};
|
|
305
461
|
const result = await runOne(runOptions);
|
|
306
462
|
// Splice in the trajectory metric. The driver always returns
|
|
@@ -532,5 +688,12 @@ function buildReport(args) {
|
|
|
532
688
|
// we just collected. This is the §6.5 "free" diagnostic — it runs on every
|
|
533
689
|
// utility invocation, no extra spawns.
|
|
534
690
|
baseReport.perAsset = computePerAssetAttribution(baseReport);
|
|
691
|
+
// Stamp the optional baseline pass-rate map onto the report so the
|
|
692
|
+
// renderer surfaces a `vs base` column in markdown and a
|
|
693
|
+
// `baseline_by_task_id` field in JSON. Additive — when the caller did
|
|
694
|
+
// not pass a baseline the report shape is byte-identical to before.
|
|
695
|
+
if (args.options.baselineByTaskId) {
|
|
696
|
+
baseReport.baselineByTaskId = { ...args.options.baselineByTaskId };
|
|
697
|
+
}
|
|
535
698
|
return baseReport;
|
|
536
699
|
}
|
package/dist/tests/bench/tmp.js
CHANGED
|
@@ -39,3 +39,93 @@ export function benchTmpRoot() {
|
|
|
39
39
|
/**
 * Create a fresh temporary directory inside the bench tmp root.
 *
 * @param {string} prefix - Leading part of the directory name; `mkdtempSync`
 *   appends random characters to it.
 * @returns {string} Path of the directory that was created.
 */
export function benchMkdtemp(prefix) {
    const template = path.join(benchTmpRoot(), prefix);
    return fs.mkdtempSync(template);
}
|
|
42
|
+
// ── PID file ────────────────────────────────────────────────────────────────
|
|
43
|
+
/**
 * Location of the bench PID file: `bench.pid` directly inside the bench tmp
 * root (`${AKM_CACHE_DIR}/bench/bench.pid`).
 *
 * @returns {string} Path to the PID file.
 */
export function benchPidPath() {
    const root = benchTmpRoot();
    return path.join(root, "bench.pid");
}
|
|
47
|
+
/**
 * Record `process.pid` in the bench PID file (`bench.pid`).
 *
 * When a PID file is already present and names a process that is no longer
 * running, a warning is written to stderr and the stale file is removed
 * before the new one is written. Every file operation is best-effort:
 * failures are swallowed because the PID file is diagnostic only.
 *
 * @returns {() => void} Cleanup callback that deletes the PID file, but only
 *   while it still contains this process's PID. Call it from a `finally`
 *   block so the file goes away on clean exit and on exceptions alike.
 */
export function writeBenchPid() {
    const pidFile = benchPidPath();
    if (fs.existsSync(pidFile)) {
        // NaN stands for "could not read the file"; it fails the finiteness
        // check below, so an unreadable file is simply overwritten.
        let previousPid = Number.NaN;
        try {
            previousPid = Number.parseInt(fs.readFileSync(pidFile, "utf8").trim(), 10);
        }
        catch {
            // Unreadable — previousPid stays NaN.
        }
        const isStale = Number.isFinite(previousPid) && !isPidRunning(previousPid);
        if (isStale) {
            process.stderr.write(`bench: removing stale PID file for PID ${previousPid} (process not running)\n`);
            try {
                fs.rmSync(pidFile, { force: true });
            }
            catch {
                /* best-effort */
            }
        }
    }
    try {
        fs.writeFileSync(pidFile, String(process.pid), "utf8");
    }
    catch {
        /* best-effort — PID file is diagnostic, not critical */
    }
    return () => {
        try {
            // Leave the file alone if another process has since claimed it.
            const recorded = fs.readFileSync(pidFile, "utf8").trim();
            if (recorded !== String(process.pid)) {
                return;
            }
            fs.rmSync(pidFile, { force: true });
        }
        catch {
            /* best-effort */
        }
    };
}
|
|
98
|
+
/**
 * Read the PID recorded in `bench.pid`.
 *
 * @returns {number | undefined} The recorded PID when the file exists and
 *   parses (base 10) to a finite number greater than zero; `undefined` when
 *   the file is missing, unreadable, or holds anything else.
 */
export function readBenchPid() {
    const pidFile = benchPidPath();
    if (!fs.existsSync(pidFile)) {
        return undefined;
    }
    let parsed;
    try {
        parsed = Number.parseInt(fs.readFileSync(pidFile, "utf8").trim(), 10);
    }
    catch {
        return undefined;
    }
    if (!Number.isFinite(parsed) || parsed <= 0) {
        return undefined;
    }
    return parsed;
}
|
|
115
|
+
/**
 * Return `true` when the process with the given PID is running on this host.
 * Uses `process.kill(pid, 0)` — signal 0 is a no-op probe that throws ESRCH
 * when the process does not exist and EPERM when it exists but is owned by
 * another user (in which case it IS running).
 *
 * Only positive integer PIDs are probed. PID 0 and negative PIDs address
 * process groups rather than a single process, so probing them would report
 * "running" for a value that was never a real PID (for example a corrupt PID
 * file containing `0` or `-1`). Those, and non-integer input, return `false`.
 *
 * @param {number} pid - Process id to probe.
 * @returns {boolean} Whether a process with that id currently exists.
 */
export function isPidRunning(pid) {
    if (!Number.isInteger(pid) || pid <= 0) {
        return false;
    }
    try {
        process.kill(pid, 0);
        return true;
    }
    catch (err) {
        // EPERM means the process exists but we don't have permission to signal it.
        return err?.code === "EPERM";
    }
}
|
|
@@ -48,8 +48,8 @@ function computeCorrectAssetLoaded(task, runResult, opts) {
|
|
|
48
48
|
return null;
|
|
49
49
|
const ref = task.goldRef;
|
|
50
50
|
// Search the events stream for any tool-call event that carries the ref.
|
|
51
|
-
// akm
|
|
52
|
-
//
|
|
51
|
+
// akm show emits an event to events.jsonl, so this path is the primary
|
|
52
|
+
// detection route when the structured event stream is available.
|
|
53
53
|
for (const event of runResult.events) {
|
|
54
54
|
const refField = event.ref;
|
|
55
55
|
if (typeof refField === "string" && matchesRef(refField, ref))
|
|
@@ -79,7 +79,12 @@ export async function runVerifier(taskDir, workspace, kind, config) {
|
|
|
79
79
|
return runProcess(["bash", script], workspace, resolveSpawn(config));
|
|
80
80
|
}
|
|
81
81
|
if (kind === "pytest") {
|
|
82
|
-
|
|
82
|
+
// Test files live at <taskDir>/tests/, not inside the workspace copy.
|
|
83
|
+
// Pass the absolute path so pytest discovers them while running with
|
|
84
|
+
// cwd=workspace (which lets relative paths like pathlib.Path("file.yml") work).
|
|
85
|
+
const testsDir = path.join(taskDir, "tests");
|
|
86
|
+
const testArgs = fs.existsSync(testsDir) ? [testsDir] : [];
|
|
87
|
+
return runProcess(["pytest", "-q", "--tb=line", ...testArgs], workspace, resolveSpawn(config));
|
|
83
88
|
}
|
|
84
89
|
if (kind === "regex") {
|
|
85
90
|
const pattern = config?.expectedMatch;
|
|
@@ -9,9 +9,8 @@
|
|
|
9
9
|
* - `loadWorkflowSpec(path, root?)` — parses + validates one file
|
|
10
10
|
* - `loadAllWorkflowSpecs(dir)` — walks a workflows directory
|
|
11
11
|
*
|
|
12
|
-
* Event names are validated against
|
|
13
|
-
*
|
|
14
|
-
* from `workflow-trace.ts`. Until then this set is the contract.
|
|
12
|
+
* Event names are validated against `WORKFLOW_TRACE_EVENT_NAMES` imported from
|
|
13
|
+
* `workflow-trace.ts` — single source of truth, no dual-maintenance hazard.
|
|
15
14
|
*
|
|
16
15
|
* Asset refs (e.g. `gold_ref`) are validated via `parseAssetRef` from
|
|
17
16
|
* `src/core/asset-ref.ts` — never reinvent ref validation.
|
|
@@ -20,31 +19,19 @@ import { readdirSync, readFileSync, statSync } from "node:fs";
|
|
|
20
19
|
import path from "node:path";
|
|
21
20
|
import { parse as parseYaml } from "yaml";
|
|
22
21
|
import { parseAssetRef } from "../../src/core/asset-ref";
|
|
23
|
-
|
|
22
|
+
import { WORKFLOW_TRACE_EVENT_NAMES } from "./workflow-trace";
|
|
23
|
+
// ── Event-name set (derived from workflow-trace.ts — single source of truth) ─
|
|
24
24
|
/**
|
|
25
|
-
*
|
|
26
|
-
*
|
|
25
|
+
* Allowlist of known event names, derived from `WORKFLOW_TRACE_EVENT_NAMES` in
|
|
26
|
+
* `workflow-trace.ts`. Using the exported runtime Set eliminates the dual-
|
|
27
|
+
* maintenance hazard: add a new event type once in `workflow-trace.ts` and
|
|
28
|
+
* both the normalizer and the spec validator see it automatically.
|
|
27
29
|
*
|
|
28
30
|
* `first_workspace_write` is a synthetic marker (the first `workspace_write`
|
|
29
31
|
* for a run) and is included so specs can talk about it directly.
|
|
30
32
|
*/
|
|
31
|
-
export const KNOWN_EVENT_NAMES =
|
|
32
|
-
|
|
33
|
-
"akm_search",
|
|
34
|
-
"akm_show",
|
|
35
|
-
"akm_feedback",
|
|
36
|
-
"akm_reflect",
|
|
37
|
-
"akm_distill",
|
|
38
|
-
"akm_propose",
|
|
39
|
-
"akm_proposal_accept",
|
|
40
|
-
"workspace_read",
|
|
41
|
-
"workspace_write",
|
|
42
|
-
"test_run",
|
|
43
|
-
"verifier_run",
|
|
44
|
-
"agent_finished",
|
|
45
|
-
"first_workspace_write",
|
|
46
|
-
]);
|
|
47
|
-
const EVENT_NAME_SET = new Set(KNOWN_EVENT_NAMES);
|
|
33
|
+
export const KNOWN_EVENT_NAMES = WORKFLOW_TRACE_EVENT_NAMES;
|
|
34
|
+
const EVENT_NAME_SET = KNOWN_EVENT_NAMES;
|
|
48
35
|
/**
 * Report whether `name` is a string contained in `EVENT_NAME_SET`.
 *
 * @param {*} name - Candidate event name; non-strings are never known.
 * @returns {boolean} `true` only for strings present in the set.
 */
function isKnownEvent(name) {
    if (typeof name !== "string") {
        return false;
    }
    return EVENT_NAME_SET.has(name);
}
|
|
@@ -96,7 +83,7 @@ function requireNumber(obj, key, specPath) {
|
|
|
96
83
|
}
|
|
97
84
|
function validateEventName(name, specPath, where) {
|
|
98
85
|
if (!isKnownEvent(name)) {
|
|
99
|
-
throw new WorkflowSpecError(`Unknown event name "${String(name)}" in ${where}. ` + `Allowed: ${KNOWN_EVENT_NAMES.join(", ")}`, specPath);
|
|
86
|
+
throw new WorkflowSpecError(`Unknown event name "${String(name)}" in ${where}. ` + `Allowed: ${[...KNOWN_EVENT_NAMES].join(", ")}`, specPath);
|
|
100
87
|
}
|
|
101
88
|
return name;
|
|
102
89
|
}
|
|
@@ -17,7 +17,7 @@ const REQUIRED_SPECS = [
|
|
|
17
17
|
"akm-feedback-after-use",
|
|
18
18
|
"akm-negative-feedback-on-failure",
|
|
19
19
|
"akm-reflect-after-repeated-failure",
|
|
20
|
-
"akm-
|
|
20
|
+
"akm-workflow-followed",
|
|
21
21
|
];
|
|
22
22
|
// ── Scratch directory helpers ──────────────────────────────────────────────
|
|
23
23
|
let scratch;
|