akm-cli 0.7.0-rc1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/dist/src/cli.js +100 -16
  2. package/dist/src/commands/config-cli.js +42 -0
  3. package/dist/src/commands/history.js +78 -7
  4. package/dist/src/commands/registry-search.js +69 -6
  5. package/dist/src/commands/search.js +30 -3
  6. package/dist/src/commands/show.js +29 -0
  7. package/dist/src/commands/source-add.js +5 -1
  8. package/dist/src/commands/source-manage.js +7 -1
  9. package/dist/src/core/config.js +28 -0
  10. package/dist/src/indexer/db-search.js +1 -0
  11. package/dist/src/indexer/indexer.js +16 -2
  12. package/dist/src/indexer/matchers.js +1 -1
  13. package/dist/src/indexer/search-source.js +4 -2
  14. package/dist/src/integrations/agent/profiles.js +1 -1
  15. package/dist/src/integrations/agent/spawn.js +67 -16
  16. package/dist/src/integrations/github.js +9 -3
  17. package/dist/src/llm/embedders/remote.js +37 -3
  18. package/dist/src/output/cli-hints.js +15 -2
  19. package/dist/src/output/renderers.js +3 -1
  20. package/dist/src/output/shapes.js +8 -1
  21. package/dist/src/output/text.js +156 -3
  22. package/dist/src/registry/build-index.js +5 -4
  23. package/dist/src/registry/providers/static-index.js +3 -1
  24. package/dist/src/setup/setup.js +9 -0
  25. package/dist/src/wiki/wiki.js +54 -6
  26. package/dist/src/workflows/runs.js +37 -3
  27. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
  28. package/dist/tests/bench/attribution.test.js +24 -23
  29. package/dist/tests/bench/cleanup.js +31 -0
  30. package/dist/tests/bench/cli.js +366 -31
  31. package/dist/tests/bench/cli.test.js +282 -14
  32. package/dist/tests/bench/corpus.js +3 -0
  33. package/dist/tests/bench/corpus.test.js +10 -10
  34. package/dist/tests/bench/doctor.js +525 -0
  35. package/dist/tests/bench/driver.js +77 -22
  36. package/dist/tests/bench/driver.test.js +142 -1
  37. package/dist/tests/bench/environment.js +233 -0
  38. package/dist/tests/bench/environment.test.js +199 -0
  39. package/dist/tests/bench/evolve.js +67 -0
  40. package/dist/tests/bench/evolve.test.js +12 -4
  41. package/dist/tests/bench/failure-modes.test.js +52 -3
  42. package/dist/tests/bench/feedback-integrity.test.js +3 -2
  43. package/dist/tests/bench/leakage.test.js +105 -2
  44. package/dist/tests/bench/learning-curve.test.js +3 -2
  45. package/dist/tests/bench/metrics.js +102 -26
  46. package/dist/tests/bench/metrics.test.js +10 -4
  47. package/dist/tests/bench/opencode-config.js +194 -0
  48. package/dist/tests/bench/opencode-config.test.js +370 -0
  49. package/dist/tests/bench/report.js +73 -9
  50. package/dist/tests/bench/report.test.js +59 -10
  51. package/dist/tests/bench/run-config.js +355 -0
  52. package/dist/tests/bench/run-config.test.js +298 -0
  53. package/dist/tests/bench/run-curate-test.js +32 -0
  54. package/dist/tests/bench/run-failing-tasks.js +56 -0
  55. package/dist/tests/bench/run-full-bench.js +51 -0
  56. package/dist/tests/bench/run-items36-targeted.js +69 -0
  57. package/dist/tests/bench/run-nano-quick.js +42 -0
  58. package/dist/tests/bench/run-waveg-targeted.js +62 -0
  59. package/dist/tests/bench/runner.js +257 -94
  60. package/dist/tests/bench/tmp.js +90 -0
  61. package/dist/tests/bench/trajectory.js +2 -2
  62. package/dist/tests/bench/verifier.js +6 -1
  63. package/dist/tests/bench/workflow-spec.js +11 -24
  64. package/dist/tests/bench/workflow-spec.test.js +1 -1
  65. package/dist/tests/bench/workflow-trace.js +34 -0
  66. package/dist/tests/cli-errors.test.js +1 -0
  67. package/dist/tests/commands/history.test.js +195 -0
  68. package/dist/tests/config.test.js +25 -0
  69. package/dist/tests/e2e.test.js +23 -2
  70. package/dist/tests/fixtures/stashes/load.js +1 -1
  71. package/dist/tests/fixtures/stashes/load.test.js +11 -2
  72. package/dist/tests/indexer.test.js +12 -1
  73. package/dist/tests/output-baseline.test.js +2 -1
  74. package/dist/tests/output-shapes-unit.test.js +3 -1
  75. package/dist/tests/registry-build-index.test.js +17 -1
  76. package/dist/tests/registry-providers/static-index.test.js +34 -0
  77. package/dist/tests/registry-search.test.js +200 -0
  78. package/dist/tests/remember-frontmatter.test.js +11 -13
  79. package/dist/tests/source-qa-fixes.test.js +18 -0
  80. package/dist/tests/source-registry.test.js +3 -3
  81. package/dist/tests/source-source.test.js +61 -1
  82. package/dist/tests/workflow-qa-fixes.test.js +18 -0
  83. package/package.json +1 -1
@@ -26,13 +26,89 @@ import { computeFixtureContentHash, loadFixtureStash } from "../fixtures/stashes
26
26
  import { registerCleanup } from "./cleanup";
27
27
  import { computeTaskCorpusHash, readTaskBody } from "./corpus";
28
28
  import { runOne } from "./driver";
29
+ import { validateFixtureCorpus } from "./environment";
29
30
  import { aggregateCorpus, aggregateFailureModes, aggregatePerTask, aggregateTrajectory, classifyFailureMode, computeCorpusDelta, computePerAssetAttribution, computePerTaskDelta, computeSearchBridge, extractAssetLoads, extractGoldRanks, } from "./metrics";
30
31
  import { resolveGitBranch, resolveGitCommit } from "./report";
31
- import { benchMkdtemp } from "./tmp";
32
+ import { benchMkdtemp, benchTmpRoot } from "./tmp";
32
33
  import { computeTrajectory } from "./trajectory";
33
34
  import { evaluateRunAgainstAllSpecs, } from "./workflow-evaluator";
34
35
  import { loadAllWorkflowSpecs } from "./workflow-spec";
35
36
  import { normalizeRunToTrace } from "./workflow-trace";
37
+ /** Checkpoint write interval: write a partial file every N completed runs. */
38
+ const CHECKPOINT_INTERVAL = 5;
39
+ /** Partial file max age before cleanup: 24 hours in milliseconds. */
40
+ const PARTIAL_MAX_AGE_MS = 24 * 60 * 60 * 1000;
41
+ /**
42
+ * Emit a one-line progress update to stderr after each (task, arm, seed)
43
+ * completes. Goes to stderr even when --json is passed so operators always
44
+ * have a heartbeat signal during long runs.
45
+ *
46
+ * Format: `[<completed>/<total>] <taskId> <arm> <outcome> <wallclockSeconds>s`
47
+ */
48
+ function emitProgress(completed, total, run) {
49
+ const secs = Math.round(run.wallclockMs / 1000);
50
+ process.stderr.write(`[${completed}/${total}] ${run.taskId} ${run.arm} ${run.outcome} ${secs}s\n`);
51
+ }
52
+ /**
53
+ * Write a partial checkpoint file under `${AKM_CACHE_DIR}/bench/`.
54
+ * The file contains the runs completed so far plus a `partial: true` marker
55
+ * and a `summary.total_runs_completed` counter. Old partial files (>24h)
56
+ * are not cleaned up here — that is done at startup via `cleanupOldPartials`.
57
+ */
58
+ function writePartialCheckpoint(runs, timestamp) {
59
+ try {
60
+ const root = benchTmpRoot();
61
+ const filename = `bench-partial-${timestamp.replace(/[:.]/g, "-")}.json`;
62
+ const outPath = path.join(root, filename);
63
+ const envelope = {
64
+ partial: true,
65
+ summary: {
66
+ total_runs_completed: runs.length,
67
+ },
68
+ timestamp,
69
+ runs: runs.map((r) => ({
70
+ task_id: r.taskId,
71
+ arm: r.arm,
72
+ seed: r.seed,
73
+ model: r.model,
74
+ outcome: r.outcome,
75
+ wallclock_ms: r.wallclockMs,
76
+ })),
77
+ };
78
+ fs.writeFileSync(outPath, JSON.stringify(envelope, null, 2), "utf8");
79
+ }
80
+ catch {
81
+ // Checkpoint writes are best-effort — never abort a run for a write failure.
82
+ }
83
+ }
84
+ /**
85
+ * Remove partial checkpoint files older than 24 hours from the bench tmp root.
86
+ * Called once at the start of `runUtility` to reap orphans from prior crashed runs.
87
+ */
88
+ function cleanupOldPartials() {
89
+ try {
90
+ const root = benchTmpRoot();
91
+ const now = Date.now();
92
+ const entries = fs.readdirSync(root);
93
+ for (const entry of entries) {
94
+ if (!entry.startsWith("bench-partial-"))
95
+ continue;
96
+ const fullPath = path.join(root, entry);
97
+ try {
98
+ const stat = fs.statSync(fullPath);
99
+ if (now - stat.mtimeMs > PARTIAL_MAX_AGE_MS) {
100
+ fs.unlinkSync(fullPath);
101
+ }
102
+ }
103
+ catch {
104
+ /* swallow per-file errors */
105
+ }
106
+ }
107
+ }
108
+ catch {
109
+ /* swallow — cleanup is best-effort */
110
+ }
111
+ }
36
112
  /**
37
113
  * Default workflows directory. Can be overridden by callers (tests) via
38
114
  * `RunUtilityOptions.workflowsDir`. Specs in this directory are loaded ONCE
@@ -40,6 +116,16 @@ import { normalizeRunToTrace } from "./workflow-trace";
40
116
  * `applies_to` so we don't I/O in the hot loop.
41
117
  */
42
118
  const DEFAULT_WORKFLOWS_DIR = path.resolve(__dirname, "..", "fixtures", "bench", "workflows");
119
+ /**
120
+ * Run `items` in batches of `n` concurrently, calling `fn` for each item.
121
+ * Batches are executed sequentially; within each batch all items run with
122
+ * `Promise.all`. This gives bounded concurrency without a full work-queue.
123
+ */
124
+ async function runInBatches(items, n, fn) {
125
+ for (let i = 0; i < items.length; i += n) {
126
+ await Promise.all(items.slice(i, i + n).map(fn));
127
+ }
128
+ }
43
129
  /**
44
130
  * Run K seeds × len(arms) × len(tasks) and return the §13.3 report.
45
131
  *
@@ -47,6 +133,11 @@ const DEFAULT_WORKFLOWS_DIR = path.resolve(__dirname, "..", "fixtures", "bench",
47
133
  * every failure path into a RunResult, so the runner only has to worry
48
134
  * about its own infrastructure (stash materialisation, workspace copy).
49
135
  * Those failures are recorded as `harness_error` runs.
136
+ *
137
+ * When `options.parallel > 1`, work items are batched and run concurrently
138
+ * via `runInBatches`. The shared `warnings`, `goldRankRecords`, and
139
+ * `workflowChecks` arrays are updated atomically at the end of each item so
140
+ * no JS-level races occur (Node/Bun is single-threaded).
50
141
  */
51
142
  export async function runUtility(options) {
52
143
  const seedsPerArm = options.seedsPerArm ?? 5;
@@ -54,9 +145,39 @@ export async function runUtility(options) {
54
145
  const budgetWallMs = options.budgetWallMs ?? 120000;
55
146
  const slice = options.slice ?? "all";
56
147
  const materialiseStash = options.materialiseStash ?? true;
148
+ // Clamp parallel to [1, 8].
149
+ const parallel = Math.min(8, Math.max(1, options.parallel ?? 1));
150
+ if (parallel > 4 && !options.forceParallel) {
151
+ process.stderr.write(`bench: --parallel ${parallel} exceeds 4; high concurrency may overwhelm local providers. ` +
152
+ `Pass --force-parallel to suppress this warning.\n`);
153
+ }
154
+ // Clean up orphaned partial files from prior crashed runs (best-effort).
155
+ cleanupOldPartials();
57
156
  const grouped = new Map();
58
157
  const warnings = [];
158
+ // Validate all task stash references before starting any work. Missing
159
+ // fixtures produce harness_error at run time; better to surface them loudly
160
+ // at startup with the fixture name than to discover them per-seed mid-run.
161
+ if (materialiseStash && options.arms.includes("akm")) {
162
+ const { missing } = validateFixtureCorpus(options.tasks);
163
+ for (const [fixture, taskIds] of missing) {
164
+ const w = `fixture "${fixture}" missing MANIFEST.json — tasks will harness_error: ${taskIds.join(", ")}`;
165
+ process.stderr.write(`bench: WARNING: ${w}\n`);
166
+ warnings.push(w);
167
+ }
168
+ }
59
169
  const goldRankRecords = [];
170
+ // Progress tracking: compute total run count upfront so progress lines show
171
+ // `[7/40]` rather than an unbounded counter.
172
+ const armsForProgress = options.includeSynthetic
173
+ ? [...new Set([...options.arms, "synthetic"])]
174
+ : options.arms;
175
+ const totalRuns = options.tasks.length * armsForProgress.length * seedsPerArm;
176
+ let completedRuns = 0;
177
+ // Partial checkpoint accumulator: collects all RunResults as they land so
178
+ // we can write a partial envelope periodically without keeping duplicates.
179
+ const allCompletedRuns = [];
180
+ const runTimestamp = options.timestamp ?? new Date().toISOString();
60
181
  // #257: load workflow specs ONCE per runUtility call. Skipped when the
61
182
  // caller passes an empty `workflowsDir` string (test escape hatch). Errors
62
183
  // are surfaced as warnings — workflow evaluation is best-effort and a
@@ -89,7 +210,7 @@ export async function runUtility(options) {
89
210
  let stashError;
90
211
  if (options.arms.includes("akm") && materialiseStash && !overrideStashDir) {
91
212
  try {
92
- stash = loadFixtureStash(task.stash, { skipIndex: true });
213
+ stash = loadFixtureStash(task.stash);
93
214
  }
94
215
  catch (err) {
95
216
  stashError = err instanceof Error ? err.message : String(err);
@@ -121,99 +242,130 @@ export async function runUtility(options) {
121
242
  return options.arms;
122
243
  return [...options.arms, "synthetic"];
123
244
  })();
124
- try {
125
- for (const arm of armsForTask) {
126
- const armRuns = [];
127
- taskRuns.set(arm, armRuns);
128
- for (let seed = 0; seed < seedsPerArm; seed += 1) {
129
- // Resolve the stashDir we'll forward to the agent. The akm arm
130
- // always carries a stashDir so AKM_STASH_DIR is set in the child
131
- // env — this is how downstream tooling (and the trajectory parser
132
- // event-stream lookup) distinguishes arms. When the operator opted
133
- // out of fixture materialisation (tests, dry-run), we still pass a
134
- // stable placeholder so the env keys are wired correctly.
135
- let stashDir;
136
- if (arm === "akm") {
137
- // Resolution order (must match the issue #251 acceptance criteria):
138
- // 1. Per-task explicit override (used by `runMaskedCorpus` to
139
- // point at a tmp stash with one asset removed). Highest
140
- // priority because attribution correctness depends on this
141
- // branch never being shadowed by the `__no-stash__`
142
- // placeholder fallback.
143
- // 2. Per-(task, arm)-call `stashDirByFixture` override (Phase
144
- // 3 evolve persistence).
145
- // 3. Per-task materialised fixture stash from `loadFixtureStash`.
146
- // 4. `materialiseStash: false` placeholder so AKM_STASH_DIR is
147
- // still wired into the child env.
148
- if (task.stashDirOverride)
149
- stashDir = task.stashDirOverride;
150
- else if (overrideStashDir)
151
- stashDir = overrideStashDir;
152
- else if (stash)
153
- stashDir = stash.stashDir;
154
- else if (!materialiseStash)
155
- stashDir = path.join(task.taskDir, "__no-stash__");
156
- }
157
- // Build the prompt-override (#267). The builder is invoked once
158
- // per (task, arm) — seeds share a prompt. `undefined` keeps the
159
- // driver's default prompt in play.
160
- //
161
- // #261: the synthetic arm has a scratch-notes prompt contract —
162
- // the model is told no AKM stash is available and instructed to
163
- // write/use its own procedural notes. When the caller does not
164
- // supply a `buildPrompt` override for the synthetic arm we fall
165
- // back to a built-in scratch-notes prompt so the contract is
166
- // honoured by every utility-track caller, not just `runEvolve`.
167
- let promptOverride = options.buildPrompt?.(task, arm);
168
- if (promptOverride === undefined && arm === "synthetic") {
169
- promptOverride = buildUtilitySyntheticPrompt(task.id);
170
- }
171
- const run = await runOneIsolated({
172
- task,
173
- arm,
174
- seed,
175
- model: options.model,
176
- stashDir,
177
- budgetTokens,
178
- budgetWallMs,
179
- spawn: options.spawn,
180
- warnings,
181
- ...(promptOverride !== undefined ? { prompt: promptOverride } : {}),
182
- });
183
- armRuns.push(run);
184
- // §6.7 search-pipeline bridge: only the akm arm consults the stash,
185
- // and we only attribute ranks for tasks with a gold ref. Both
186
- // guards mean noakm and gold-less runs are silently excluded.
187
- if (arm === "akm" && task.goldRef) {
188
- const searches = extractGoldRanks(run, task.goldRef);
189
- goldRankRecords.push({
190
- taskId: task.id,
191
- arm,
192
- seed,
193
- outcome: run.outcome,
194
- goldRef: task.goldRef,
195
- searches,
196
- });
197
- }
198
- // #257: evaluate the akm-arm run against every workflow spec. The
199
- // evaluator's `specApplies` filter handles applicability (arm,
200
- // domain, gold ref, repeated-failures threshold), so we hand it the
201
- // entire spec list and append whatever it returns. noakm/synthetic
202
- // arms are not evaluated — workflow specs target the akm arm.
203
- if (arm === "akm" && workflowSpecs.length > 0) {
204
- const trace = normalizeRunToTrace(run, { warnings });
205
- const runCtx = {
206
- arm: run.arm,
207
- taskId: run.taskId,
208
- seed: run.seed,
209
- outcome: run.outcome,
210
- };
211
- const taskMetadata = buildWorkflowTaskMetadata(task, trace);
212
- const checks = evaluateRunAgainstAllSpecs(trace, workflowSpecs, runCtx, taskMetadata);
213
- workflowChecks.push(...checks);
214
- }
215
- }
245
+ const workItems = [];
246
+ for (const arm of armsForTask) {
247
+ taskRuns.set(arm, []);
248
+ for (let seed = 0; seed < seedsPerArm; seed += 1) {
249
+ workItems.push({ arm, seed });
250
+ }
251
+ }
252
+ // Per-run worker: resolves stash/prompt, executes runOneIsolated, then
253
+ // splices the result into the shared accumulators. Because Bun/Node is
254
+ // single-threaded these splices are race-free even across concurrent
255
+ // awaits — only one microtask runs at a time between yield points.
256
+ const runItem = async ({ arm, seed }) => {
257
+ // Resolve the stashDir we'll forward to the agent. The akm arm
258
+ // always carries a stashDir so AKM_STASH_DIR is set in the child
259
+ // env — this is how downstream tooling (and the trajectory parser
260
+ // event-stream lookup) distinguishes arms. When the operator opted
261
+ // out of fixture materialisation (tests, dry-run), we still pass a
262
+ // stable placeholder so the env keys are wired correctly.
263
+ let stashDir;
264
+ if (arm === "akm") {
265
+ // Resolution order (must match the issue #251 acceptance criteria):
266
+ // 1. Per-task explicit override (used by `runMaskedCorpus` to
267
+ // point at a tmp stash with one asset removed). Highest
268
+ // priority because attribution correctness depends on this
269
+ // branch never being shadowed by the `__no-stash__`
270
+ // placeholder fallback.
271
+ // 2. Per-(task, arm)-call `stashDirByFixture` override (Phase
272
+ // 3 evolve persistence).
273
+ // 3. Per-task materialised fixture stash from `loadFixtureStash`.
274
+ // 4. `materialiseStash: false` placeholder so AKM_STASH_DIR is
275
+ // still wired into the child env.
276
+ if (task.stashDirOverride)
277
+ stashDir = task.stashDirOverride;
278
+ else if (overrideStashDir)
279
+ stashDir = overrideStashDir;
280
+ else if (stash)
281
+ stashDir = stash.stashDir;
282
+ else if (!materialiseStash)
283
+ stashDir = path.join(task.taskDir, "__no-stash__");
284
+ }
285
+ // Build the prompt-override (#267). The builder is invoked once
286
+ // per (task, arm) — seeds share a prompt. `undefined` keeps the
287
+ // driver's default prompt in play.
288
+ //
289
+ // #261: the synthetic arm has a scratch-notes prompt contract —
290
+ // the model is told no AKM stash is available and instructed to
291
+ // write/use its own procedural notes. When the caller does not
292
+ // supply a `buildPrompt` override for the synthetic arm we fall
293
+ // back to a built-in scratch-notes prompt so the contract is
294
+ // honoured by every utility-track caller, not just `runEvolve`.
295
+ let promptOverride = options.buildPrompt?.(task, arm);
296
+ if (promptOverride === undefined && arm === "synthetic") {
297
+ promptOverride = buildUtilitySyntheticPrompt(task.id);
298
+ }
299
+ // Collect per-run warnings separately and merge after the run so
300
+ // concurrent runs don't interleave partial warning sequences.
301
+ const runWarnings = [];
302
+ const run = await runOneIsolated({
303
+ task,
304
+ arm,
305
+ seed,
306
+ model: options.model,
307
+ stashDir,
308
+ budgetTokens,
309
+ budgetWallMs,
310
+ spawn: options.spawn,
311
+ warnings: runWarnings,
312
+ ...(promptOverride !== undefined ? { prompt: promptOverride } : {}),
313
+ ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
314
+ ...(stash?.indexCacheHome ? { indexCacheHome: stash.indexCacheHome } : {}),
315
+ });
316
+ // Merge per-run warnings into the shared array.
317
+ if (runWarnings.length > 0)
318
+ warnings.push(...runWarnings);
319
+ taskRuns.get(arm)?.push(run);
320
+ // Emit a compact progress line to stderr (unconditional — even under
321
+ // --json so operators have a heartbeat during long runs).
322
+ completedRuns += 1;
323
+ emitProgress(completedRuns, totalRuns, run);
324
+ // Accumulate for partial checkpointing.
325
+ allCompletedRuns.push(run);
326
+ if (completedRuns % CHECKPOINT_INTERVAL === 0) {
327
+ writePartialCheckpoint(allCompletedRuns, runTimestamp);
216
328
  }
329
+ // §6.7 search-pipeline bridge: only the akm arm consults the stash,
330
+ // and we only attribute ranks for tasks with a gold ref. Both
331
+ // guards mean noakm and gold-less runs are silently excluded.
332
+ if (arm === "akm" && task.goldRef) {
333
+ const searches = extractGoldRanks(run, task.goldRef);
334
+ goldRankRecords.push({
335
+ taskId: task.id,
336
+ arm,
337
+ seed,
338
+ outcome: run.outcome,
339
+ goldRef: task.goldRef,
340
+ searches,
341
+ });
342
+ }
343
+ // #257: evaluate the akm-arm run against every workflow spec. The
344
+ // evaluator's `specApplies` filter handles applicability (arm,
345
+ // domain, gold ref, repeated-failures threshold), so we hand it the
346
+ // entire spec list and append whatever it returns. noakm/synthetic
347
+ // arms are not evaluated — workflow specs target the akm arm.
348
+ if (arm === "akm" && workflowSpecs.length > 0) {
349
+ const trace = normalizeRunToTrace(run, {
350
+ warnings: runWarnings,
351
+ harness: {
352
+ agentStartedTs: run.startedAt,
353
+ agentFinishedTs: run.finishedAt,
354
+ },
355
+ });
356
+ const runCtx = {
357
+ arm: run.arm,
358
+ taskId: run.taskId,
359
+ seed: run.seed,
360
+ outcome: run.outcome,
361
+ };
362
+ const taskMetadata = buildWorkflowTaskMetadata(task, trace);
363
+ const checks = evaluateRunAgainstAllSpecs(trace, workflowSpecs, runCtx, taskMetadata);
364
+ workflowChecks.push(...checks);
365
+ }
366
+ };
367
+ try {
368
+ await runInBatches(workItems, parallel, runItem);
217
369
  }
218
370
  finally {
219
371
  // Deregister BEFORE running cleanup so a SIGINT arriving during this
@@ -289,6 +441,7 @@ async function runOneIsolated(args) {
289
441
  track: "utility",
290
442
  arm: args.arm,
291
443
  taskId: args.task.id,
444
+ taskTitle: args.task.title,
292
445
  workspace,
293
446
  model: args.model,
294
447
  seed: args.seed,
@@ -297,10 +450,13 @@ async function runOneIsolated(args) {
297
450
  verifier: args.task.verifier,
298
451
  taskDir: args.task.taskDir,
299
452
  ...(args.task.expectedMatch ? { expectedMatch: args.task.expectedMatch } : {}),
453
+ ...(args.task.akmKeywords ? { akmKeywords: args.task.akmKeywords } : {}),
300
454
  ...(args.stashDir ? { stashDir: args.stashDir } : {}),
301
455
  ...(args.spawn ? { spawn: args.spawn } : {}),
302
456
  ...(args.prompt !== undefined ? { prompt: args.prompt } : {}),
303
457
  warnings: args.warnings,
458
+ ...(args.opencodeProviders ? { opencodeProviders: args.opencodeProviders } : {}),
459
+ ...(args.indexCacheHome ? { indexCacheHome: args.indexCacheHome } : {}),
304
460
  };
305
461
  const result = await runOne(runOptions);
306
462
  // Splice in the trajectory metric. The driver always returns
@@ -532,5 +688,12 @@ function buildReport(args) {
532
688
  // we just collected. This is the §6.5 "free" diagnostic — it runs on every
533
689
  // utility invocation, no extra spawns.
534
690
  baseReport.perAsset = computePerAssetAttribution(baseReport);
691
+ // Stamp the optional baseline pass-rate map onto the report so the
692
+ // renderer surfaces a `vs base` column in markdown and a
693
+ // `baseline_by_task_id` field in JSON. Additive — when the caller did
694
+ // not pass a baseline the report shape is byte-identical to before.
695
+ if (args.options.baselineByTaskId) {
696
+ baseReport.baselineByTaskId = { ...args.options.baselineByTaskId };
697
+ }
535
698
  return baseReport;
536
699
  }
@@ -39,3 +39,93 @@ export function benchTmpRoot() {
39
39
  export function benchMkdtemp(prefix) {
40
40
  return fs.mkdtempSync(path.join(benchTmpRoot(), prefix));
41
41
  }
42
+ // ── PID file ────────────────────────────────────────────────────────────────
43
+ /** Absolute path to the bench PID file: `${AKM_CACHE_DIR}/bench/bench.pid`. */
44
+ export function benchPidPath() {
45
+ return path.join(benchTmpRoot(), "bench.pid");
46
+ }
47
+ /**
48
+ * Write `process.pid` to `bench.pid`.
49
+ *
50
+ * If a stale PID file exists and the referenced process is no longer running,
51
+ * it is removed with a warning before writing the new one.
52
+ *
53
+ * Returns a cleanup function that removes the PID file. Call it in a
54
+ * `finally` block so the file is removed on both clean exit and exceptions.
55
+ */
56
+ export function writeBenchPid() {
57
+ const pidPath = benchPidPath();
58
+ // Check for an existing PID file and warn if stale.
59
+ if (fs.existsSync(pidPath)) {
60
+ let existingPid;
61
+ try {
62
+ const raw = fs.readFileSync(pidPath, "utf8").trim();
63
+ existingPid = Number.parseInt(raw, 10);
64
+ }
65
+ catch {
66
+ // Unreadable — treat as stale.
67
+ }
68
+ if (existingPid !== undefined && Number.isFinite(existingPid) && !isPidRunning(existingPid)) {
69
+ // Stale PID — warn and remove.
70
+ process.stderr.write(`bench: removing stale PID file for PID ${existingPid} (process not running)\n`);
71
+ try {
72
+ fs.rmSync(pidPath, { force: true });
73
+ }
74
+ catch {
75
+ /* best-effort */
76
+ }
77
+ }
78
+ }
79
+ try {
80
+ fs.writeFileSync(pidPath, String(process.pid), "utf8");
81
+ }
82
+ catch {
83
+ /* best-effort — PID file is diagnostic, not critical */
84
+ }
85
+ return () => {
86
+ try {
87
+ // Only remove if it still contains our PID (guard against races).
88
+ const current = fs.readFileSync(pidPath, "utf8").trim();
89
+ if (current === String(process.pid)) {
90
+ fs.rmSync(pidPath, { force: true });
91
+ }
92
+ }
93
+ catch {
94
+ /* best-effort */
95
+ }
96
+ };
97
+ }
98
+ /**
99
+ * Read the PID from `bench.pid`. Returns `undefined` when the file does not
100
+ * exist or cannot be parsed.
101
+ */
102
+ export function readBenchPid() {
103
+ const pidPath = benchPidPath();
104
+ if (!fs.existsSync(pidPath))
105
+ return undefined;
106
+ try {
107
+ const raw = fs.readFileSync(pidPath, "utf8").trim();
108
+ const n = Number.parseInt(raw, 10);
109
+ return Number.isFinite(n) && n > 0 ? n : undefined;
110
+ }
111
+ catch {
112
+ return undefined;
113
+ }
114
+ }
115
+ /**
116
+ * Return `true` when the process with the given PID is running on this host.
117
+ * Uses `process.kill(pid, 0)` — signal 0 is a no-op probe that throws ESRCH
118
+ * when the process does not exist and EPERM when it exists but is owned by
119
+ * another user (in which case it IS running).
120
+ */
121
+ export function isPidRunning(pid) {
122
+ try {
123
+ process.kill(pid, 0);
124
+ return true;
125
+ }
126
+ catch (err) {
127
+ const code = err.code;
128
+ // EPERM means the process exists but we don't have permission to signal it.
129
+ return code === "EPERM";
130
+ }
131
+ }
@@ -48,8 +48,8 @@ function computeCorrectAssetLoaded(task, runResult, opts) {
48
48
  return null;
49
49
  const ref = task.goldRef;
50
50
  // Search the events stream for any tool-call event that carries the ref.
51
- // akm itself does not emit an event for `show`, but third parties might,
52
- // and the field is forward-compatible.
51
+ // akm show emits an event to events.jsonl, so this path is the primary
52
+ // detection route when the structured event stream is available.
53
53
  for (const event of runResult.events) {
54
54
  const refField = event.ref;
55
55
  if (typeof refField === "string" && matchesRef(refField, ref))
@@ -79,7 +79,12 @@ export async function runVerifier(taskDir, workspace, kind, config) {
79
79
  return runProcess(["bash", script], workspace, resolveSpawn(config));
80
80
  }
81
81
  if (kind === "pytest") {
82
- return runProcess(["pytest", "-q", "--tb=line"], workspace, resolveSpawn(config));
82
+ // Test files live at <taskDir>/tests/, not inside the workspace copy.
83
+ // Pass the absolute path so pytest discovers them while running with
84
+ // cwd=workspace (which lets relative paths like pathlib.Path("file.yml") work).
85
+ const testsDir = path.join(taskDir, "tests");
86
+ const testArgs = fs.existsSync(testsDir) ? [testsDir] : [];
87
+ return runProcess(["pytest", "-q", "--tb=line", ...testArgs], workspace, resolveSpawn(config));
83
88
  }
84
89
  if (kind === "regex") {
85
90
  const pattern = config?.expectedMatch;
@@ -9,9 +9,8 @@
9
9
  * - `loadWorkflowSpec(path, root?)` — parses + validates one file
10
10
  * - `loadAllWorkflowSpecs(dir)` — walks a workflows directory
11
11
  *
12
- * Event names are validated against a HARDCODED set in this file. Once
13
- * #254 lands, #256 will reconcile by importing the source-of-truth set
14
- * from `workflow-trace.ts`. Until then this set is the contract.
12
+ * Event names are validated against `WORKFLOW_TRACE_EVENT_NAMES` imported from
13
+ * `workflow-trace.ts` — single source of truth, no dual-maintenance hazard.
15
14
  *
16
15
  * Asset refs (e.g. `gold_ref`) are validated via `parseAssetRef` from
17
16
  * `src/core/asset-ref.ts` — never reinvent ref validation.
@@ -20,31 +19,19 @@ import { readdirSync, readFileSync, statSync } from "node:fs";
20
19
  import path from "node:path";
21
20
  import { parse as parseYaml } from "yaml";
22
21
  import { parseAssetRef } from "../../src/core/asset-ref";
23
- // ── Event-name set (hardcoded; reconcile with #254 in wave 3) ──────────────
22
+ import { WORKFLOW_TRACE_EVENT_NAMES } from "./workflow-trace";
23
+ // ── Event-name set (derived from workflow-trace.ts — single source of truth) ─
24
24
  /**
25
- * Hardcoded event-name allowlist. Mirrors the WorkflowTraceEvent.kind set
26
- * specified in the #254 brief.
25
+ * Allowlist of known event names, derived from `WORKFLOW_TRACE_EVENT_NAMES` in
26
+ * `workflow-trace.ts`. Using the exported runtime Set eliminates the dual-
27
+ * maintenance hazard: add a new event type once in `workflow-trace.ts` and
28
+ * both the normalizer and the spec validator see it automatically.
27
29
  *
28
30
  * `first_workspace_write` is a synthetic marker (the first `workspace_write`
29
31
  * for a run) and is included so specs can talk about it directly.
30
32
  */
31
- export const KNOWN_EVENT_NAMES = Object.freeze([
32
- "agent_started",
33
- "akm_search",
34
- "akm_show",
35
- "akm_feedback",
36
- "akm_reflect",
37
- "akm_distill",
38
- "akm_propose",
39
- "akm_proposal_accept",
40
- "workspace_read",
41
- "workspace_write",
42
- "test_run",
43
- "verifier_run",
44
- "agent_finished",
45
- "first_workspace_write",
46
- ]);
47
- const EVENT_NAME_SET = new Set(KNOWN_EVENT_NAMES);
33
+ export const KNOWN_EVENT_NAMES = WORKFLOW_TRACE_EVENT_NAMES;
34
+ const EVENT_NAME_SET = KNOWN_EVENT_NAMES;
48
35
  function isKnownEvent(name) {
49
36
  return typeof name === "string" && EVENT_NAME_SET.has(name);
50
37
  }
@@ -96,7 +83,7 @@ function requireNumber(obj, key, specPath) {
96
83
  }
97
84
  function validateEventName(name, specPath, where) {
98
85
  if (!isKnownEvent(name)) {
99
- throw new WorkflowSpecError(`Unknown event name "${String(name)}" in ${where}. ` + `Allowed: ${KNOWN_EVENT_NAMES.join(", ")}`, specPath);
86
+ throw new WorkflowSpecError(`Unknown event name "${String(name)}" in ${where}. ` + `Allowed: ${[...KNOWN_EVENT_NAMES].join(", ")}`, specPath);
100
87
  }
101
88
  return name;
102
89
  }
@@ -17,7 +17,7 @@ const REQUIRED_SPECS = [
17
17
  "akm-feedback-after-use",
18
18
  "akm-negative-feedback-on-failure",
19
19
  "akm-reflect-after-repeated-failure",
20
- "akm-proposal-review-before-accept",
20
+ "akm-workflow-followed",
21
21
  ];
22
22
  // ── Scratch directory helpers ──────────────────────────────────────────────
23
23
  let scratch;